1
2
3
4
5
6
7 """ Automatic search for quantization bounds
8
9 This uses the expected informational gain to determine where quantization bounds should
10 lie.
11
12 **Notes**:
13
14 - bounds are less than, so if the bounds are [1.,2.],
15 [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2]
16
17 """
18 from __future__ import print_function
19 import numpy
20 from rdkit.ML.InfoTheory import entropy
21 from rdkit.six.moves import zip, map, range
22 try:
23 from rdkit.ML.Data import cQuantize
24 except ImportError:
25 hascQuantize = 0
26 else:
27 hascQuantize = 1
28
29 _float_tol = 1e-8
31 """ floating point equality with a tolerance factor
32
33 **Arguments**
34
35 - v1: a float
36
37 - v2: a float
38
39 - tol: the tolerance for comparison
40
41 **Returns**
42
43 0 or 1
44
45 """
46 return abs(v1-v2) < tol
47
49 """ Uses FindVarMultQuantBounds, only here for historic reasons
50 """
51 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes)
52 return (bounds[0],gain)
53
54
56 """ Primarily intended for internal use
57
58 constructs a variable table for the data passed in
59 The table for a given variable records the number of times each possible value
60 of that variable appears for each possible result of the function.
61
62 **Arguments**
63
64 - vals: a 1D Numeric array with the values of the variables
65
66 - cuts: a list with the indices of the quantization bounds
67 (indices are into _starts_ )
68
69 - starts: a list of potential starting points for quantization bounds
70
71 - results: a 1D Numeric array of integer result codes
72
73 - nPossibleRes: an integer with the number of possible result codes
74
75 **Returns**
76
77 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes
78
79 **Notes**
80
81 - _vals_ should be sorted!
82
83 """
84 nVals = len(cuts)+1
85 varTable = numpy.zeros((nVals,nPossibleRes),'i')
86 idx = 0
87 for i in range(nVals-1):
88 cut = cuts[i]
89 while idx < starts[cut]:
90 varTable[i,results[idx]] += 1
91 idx += 1
92 while idx < len(vals):
93 varTable[-1,results[idx]] += 1
94 idx += 1
95 return varTable
96
98 """ Primarily intended for internal use
99
100 Recursively finds the best quantization boundaries
101
102 **Arguments**
103
104 - vals: a 1D Numeric array with the values of the variables,
105 this should be sorted
106
107 - cuts: a list with the indices of the quantization bounds
108 (indices are into _starts_ )
109
110 - which: an integer indicating which bound is being adjusted here
111 (and index into _cuts_ )
112
113 - starts: a list of potential starting points for quantization bounds
114
115 - results: a 1D Numeric array of integer result codes
116
117 - nPossibleRes: an integer with the number of possible result codes
118
119 **Returns**
120
121 - a 2-tuple containing:
122
123 1) the best information gain found so far
124
125 2) a list of the quantization bound indices ( _cuts_ for the best case)
126
127 **Notes**
128
129 - this is not even remotely efficient, which is why a C replacement
130 was written
131
132 """
133 nBounds = len(cuts)
134 maxGain = -1e6
135 bestCuts = None
136 highestCutHere = len(starts) - nBounds + which
137 if varTable is None:
138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
139 while cuts[which] <= highestCutHere:
140 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
141 gainHere = entropy.InfoGain(varTable)
142 if gainHere > maxGain:
143 maxGain = gainHere
144 bestCuts = cuts[:]
145
146 if which < nBounds-1:
147 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
148 varTable = varTable)
149 if gainHere > maxGain:
150 maxGain = gainHere
151 bestCuts = cutsHere
152
153 cuts[which] += 1
154 for i in range(which+1,nBounds):
155 if cuts[i] == cuts[i-1]:
156 cuts[i] += 1
157
158 return maxGain,bestCuts
159
161 """ Primarily intended for internal use
162
163 Recursively finds the best quantization boundaries
164
165 **Arguments**
166
167 - vals: a 1D Numeric array with the values of the variables,
168 this should be sorted
169
170 - cuts: a list with the indices of the quantization bounds
171 (indices are into _starts_ )
172
173 - which: an integer indicating which bound is being adjusted here
174 (and index into _cuts_ )
175
176 - starts: a list of potential starting points for quantization bounds
177
178 - results: a 1D Numeric array of integer result codes
179
180 - nPossibleRes: an integer with the number of possible result codes
181
182 **Returns**
183
184 - a 2-tuple containing:
185
186 1) the best information gain found so far
187
188 2) a list of the quantization bound indices ( _cuts_ for the best case)
189
190 **Notes**
191
192 - this is not even remotely efficient, which is why a C replacement
193 was written
194
195 """
196 nBounds = len(cuts)
197 maxGain = -1e6
198 bestCuts = None
199 highestCutHere = len(starts) - nBounds + which
200 if varTable is None:
201 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
202 while cuts[which] <= highestCutHere:
203 gainHere = entropy.InfoGain(varTable)
204 if gainHere > maxGain:
205 maxGain = gainHere
206 bestCuts = cuts[:]
207
208 if which < nBounds-1:
209 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
210 varTable = None)
211 if gainHere > maxGain:
212 maxGain = gainHere
213 bestCuts = cutsHere
214
215 oldCut = cuts[which]
216 cuts[which] += 1
217 bot = starts[oldCut]
218 if oldCut+1 < len(starts):
219 top = starts[oldCut+1]
220 else:
221 top = starts[-1]
222 for i in range(bot,top):
223 v = results[i]
224 varTable[which,v] += 1
225 varTable[which+1,v] -= 1
226 for i in range(which+1,nBounds):
227 if cuts[i] == cuts[i-1]:
228 cuts[i] += 1
229
230
231 return maxGain,bestCuts
232
233
234
235
236
237
238
239
240
241
242
243
244
245
247 startNext = []
248 tol = 1e-8
249 blockAct=sortResults[0]
250 lastBlockAct=None
251 lastDiv=None
252 i = 1
253 while i<nData:
254
255 while i<nData and sortVals[i]-sortVals[i-1]<=tol:
256 if sortResults[i] != blockAct:
257
258 blockAct=-1
259 i+=1
260 if lastBlockAct is None:
261
262 lastBlockAct = blockAct
263 lastDiv = i
264 else:
265 if blockAct==-1 or lastBlockAct==-1 or blockAct!=lastBlockAct:
266 startNext.append(lastDiv)
267 lastDiv = i
268 lastBlockAct = blockAct
269 else:
270 lastDiv=i
271 if i<nData:
272 blockAct=sortResults[i]
273 i+=1
274
275 if blockAct != lastBlockAct :
276 startNext.append(lastDiv)
277 return startNext
278
280 """ finds multiple quantization bounds for a single variable
281
282 **Arguments**
283
284 - vals: sequence of variable values (assumed to be floats)
285
286 - nBounds: the number of quantization bounds to find
287
288 - results: a list of result codes (should be integers)
289
290 - nPossibleRes: an integer with the number of possible values of the
291 result variable
292
293 **Returns**
294
295 - a 2-tuple containing:
296
297 1) a list of the quantization bounds (floats)
298
299 2) the information gain associated with this quantization
300
301
302 """
303 assert len(vals) == len(results), 'vals/results length mismatch'
304
305 nData = len(vals)
306 if nData == 0:
307 return [],-1e8
308
309
310 svs = list(zip(vals,results))
311 svs.sort()
312 sortVals,sortResults = zip(*svs)
313 startNext=_FindStartPoints(sortVals,sortResults,nData)
314 if not len(startNext):
315 return [0],0.0
316 if len(startNext)<nBounds:
317 nBounds = len(startNext)-1
318 if nBounds == 0:
319 nBounds=1
320 initCuts = list(range(nBounds))
321 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext,
322 sortResults,nPossibleRes)
323 quantBounds = []
324 nVs = len(sortVals)
325 for cut in bestCuts:
326 idx = startNext[cut]
327 if idx == nVs:
328 quantBounds.append(sortVals[-1])
329 elif idx == 0:
330 quantBounds.append(sortVals[idx])
331 else:
332 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.)
333
334 return quantBounds,maxGain
335
336
337 if hascQuantize:
338 _RecurseOnBounds = cQuantize._RecurseOnBounds
339 _FindStartPoints = cQuantize._FindStartPoints
340 else:
341 _RecurseOnBounds = _NewPyRecurseOnBounds
342 _FindStartPoints = _NewPyFindStartPoints
343
344 if __name__ == '__main__':
345 import sys
346 if 1:
347 d = [(1.,0),
348 (1.1,0),
349 (1.2,0),
350 (1.4,1),
351 (1.4,0),
352 (1.6,1),
353 (2.,1),
354 (2.1,0),
355 (2.1,0),
356 (2.1,0),
357 (2.2,1),
358 (2.3,0)]
359 varValues = list(map(lambda x:x[0],d))
360 resCodes = list(map(lambda x:x[1],d))
361 nPossibleRes = 2
362 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
363 print('RES:',res)
364 target = ([1.3, 2.05],.34707 )
365 else:
366 d = [(1.,0),
367 (1.1,0),
368 (1.2,0),
369 (1.4,1),
370 (1.4,0),
371 (1.6,1),
372 (2.,1),
373 (2.1,0),
374 (2.1,0),
375 (2.1,0),
376 (2.2,1),
377 (2.3,0)]
378 varValues = list(map(lambda x:x[0],d))
379 resCodes = list(map(lambda x:x[1],d))
380 nPossibleRes =2
381 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
382 print(res)
383
384 d = [(1.4,1),
385 (1.4,0)]
386
387 varValues = list(map(lambda x:x[0],d))
388 resCodes = list(map(lambda x:x[1],d))
389 nPossibleRes =2
390 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
391 print(res)
392
393 d = [(1.4,0),
394 (1.4,0),(1.6,1)]
395 varValues = list(map(lambda x:x[0],d))
396 resCodes = list(map(lambda x:x[1],d))
397 nPossibleRes =2
398 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
399 print(res)
400