Package rdkit :: Package ML :: Package Data :: Module Quantize
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Data.Quantize

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2001-2008  Greg Landrum and Rational Discovery LLC 
  4  #   All Rights Reserved 
  5  # 
  6   
  7  """ Automatic search for quantization bounds 
  8   
  9  This uses the expected informational gain to determine where quantization bounds should 
 10  lie. 
 11   
 12  **Notes**: 
 13   
 14    - bounds are less than, so if the bounds are [1.,2.], 
 15      [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2] 
 16   
 17  """ 
 18  from __future__ import print_function 
 19  import numpy 
 20  from rdkit.ML.InfoTheory import entropy 
 21  from rdkit.six.moves import zip, map, range 
 22  try: 
 23    from rdkit.ML.Data import cQuantize 
 24  except ImportError: 
 25    hascQuantize = 0 
 26  else: 
 27    hascQuantize = 1   
 28     
 29  _float_tol = 1e-8 
30 -def feq(v1,v2,tol=_float_tol):
31 """ floating point equality with a tolerance factor 32 33 **Arguments** 34 35 - v1: a float 36 37 - v2: a float 38 39 - tol: the tolerance for comparison 40 41 **Returns** 42 43 0 or 1 44 45 """ 46 return abs(v1-v2) < tol
47
48 -def FindVarQuantBound(vals,results,nPossibleRes):
49 """ Uses FindVarMultQuantBounds, only here for historic reasons 50 """ 51 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes) 52 return (bounds[0],gain)
53 54
55 -def _GenVarTable(vals,cuts,starts,results,nPossibleRes):
56 """ Primarily intended for internal use 57 58 constructs a variable table for the data passed in 59 The table for a given variable records the number of times each possible value 60 of that variable appears for each possible result of the function. 61 62 **Arguments** 63 64 - vals: a 1D Numeric array with the values of the variables 65 66 - cuts: a list with the indices of the quantization bounds 67 (indices are into _starts_ ) 68 69 - starts: a list of potential starting points for quantization bounds 70 71 - results: a 1D Numeric array of integer result codes 72 73 - nPossibleRes: an integer with the number of possible result codes 74 75 **Returns** 76 77 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes 78 79 **Notes** 80 81 - _vals_ should be sorted! 82 83 """ 84 nVals = len(cuts)+1 85 varTable = numpy.zeros((nVals,nPossibleRes),'i') 86 idx = 0 87 for i in range(nVals-1): 88 cut = cuts[i] 89 while idx < starts[cut]: 90 varTable[i,results[idx]] += 1 91 idx += 1 92 while idx < len(vals): 93 varTable[-1,results[idx]] += 1 94 idx += 1 95 return varTable
96
97 -def _PyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
98 """ Primarily intended for internal use 99 100 Recursively finds the best quantization boundaries 101 102 **Arguments** 103 104 - vals: a 1D Numeric array with the values of the variables, 105 this should be sorted 106 107 - cuts: a list with the indices of the quantization bounds 108 (indices are into _starts_ ) 109 110 - which: an integer indicating which bound is being adjusted here 111 (and index into _cuts_ ) 112 113 - starts: a list of potential starting points for quantization bounds 114 115 - results: a 1D Numeric array of integer result codes 116 117 - nPossibleRes: an integer with the number of possible result codes 118 119 **Returns** 120 121 - a 2-tuple containing: 122 123 1) the best information gain found so far 124 125 2) a list of the quantization bound indices ( _cuts_ for the best case) 126 127 **Notes** 128 129 - this is not even remotely efficient, which is why a C replacement 130 was written 131 132 """ 133 nBounds = len(cuts) 134 maxGain = -1e6 135 bestCuts = None 136 highestCutHere = len(starts) - nBounds + which 137 if varTable is None: 138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 139 while cuts[which] <= highestCutHere: 140 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 141 gainHere = entropy.InfoGain(varTable) 142 if gainHere > maxGain: 143 maxGain = gainHere 144 bestCuts = cuts[:] 145 # recurse on the next vars if needed 146 if which < nBounds-1: 147 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes, 148 varTable = varTable) 149 if gainHere > maxGain: 150 maxGain = gainHere 151 bestCuts = cutsHere 152 # update this cut 153 cuts[which] += 1 154 for i in range(which+1,nBounds): 155 if cuts[i] == cuts[i-1]: 156 cuts[i] += 1 157 158 return maxGain,bestCuts
159
160 -def _NewPyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
161 """ Primarily intended for internal use 162 163 Recursively finds the best quantization boundaries 164 165 **Arguments** 166 167 - vals: a 1D Numeric array with the values of the variables, 168 this should be sorted 169 170 - cuts: a list with the indices of the quantization bounds 171 (indices are into _starts_ ) 172 173 - which: an integer indicating which bound is being adjusted here 174 (and index into _cuts_ ) 175 176 - starts: a list of potential starting points for quantization bounds 177 178 - results: a 1D Numeric array of integer result codes 179 180 - nPossibleRes: an integer with the number of possible result codes 181 182 **Returns** 183 184 - a 2-tuple containing: 185 186 1) the best information gain found so far 187 188 2) a list of the quantization bound indices ( _cuts_ for the best case) 189 190 **Notes** 191 192 - this is not even remotely efficient, which is why a C replacement 193 was written 194 195 """ 196 nBounds = len(cuts) 197 maxGain = -1e6 198 bestCuts = None 199 highestCutHere = len(starts) - nBounds + which 200 if varTable is None: 201 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes) 202 while cuts[which] <= highestCutHere: 203 gainHere = entropy.InfoGain(varTable) 204 if gainHere > maxGain: 205 maxGain = gainHere 206 bestCuts = cuts[:] 207 # recurse on the next vars if needed 208 if which < nBounds-1: 209 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes, 210 varTable = None) 211 if gainHere > maxGain: 212 maxGain = gainHere 213 bestCuts = cutsHere 214 # update this cut 215 oldCut = cuts[which] 216 cuts[which] += 1 217 bot = starts[oldCut] 218 if oldCut+1 < len(starts): 219 top = starts[oldCut+1] 220 else: 221 top = starts[-1] 222 for i in range(bot,top): 223 v = results[i] 224 varTable[which,v] += 1 225 varTable[which+1,v] -= 1 226 for i in range(which+1,nBounds): 227 if cuts[i] == cuts[i-1]: 228 cuts[i] += 1 229 230 231 return maxGain,bestCuts
232 233 234 # -------------------------------- 235 # 236 # find all possible dividing points 237 # 238 # There are a couple requirements for a dividing point: 239 # 1) the dependent variable (descriptor) must change across it, 240 # 2) the result score must change across it 241 # 242 # So, in the list [(0,0),(1,0),(1,1),(2,1)]: 243 # we should divide before (1,0) and (2,1) 244 # 245 # --------------------------------
246 -def _NewPyFindStartPoints(sortVals,sortResults,nData):
247 startNext = [] 248 tol = 1e-8 249 blockAct=sortResults[0] 250 lastBlockAct=None 251 lastDiv=None 252 i = 1 253 while i<nData: 254 # move to the end of this block: 255 while i<nData and sortVals[i]-sortVals[i-1]<=tol: 256 if sortResults[i] != blockAct: 257 # this block is heterogeneous 258 blockAct=-1 259 i+=1 260 if lastBlockAct is None: 261 # first time through: 262 lastBlockAct = blockAct 263 lastDiv = i 264 else: 265 if blockAct==-1 or lastBlockAct==-1 or blockAct!=lastBlockAct: 266 startNext.append(lastDiv) 267 lastDiv = i 268 lastBlockAct = blockAct 269 else: 270 lastDiv=i 271 if i<nData: 272 blockAct=sortResults[i] 273 i+=1 274 # catch the case that the last point also sets a bin: 275 if blockAct != lastBlockAct : 276 startNext.append(lastDiv) 277 return startNext
278
279 -def FindVarMultQuantBounds(vals,nBounds,results,nPossibleRes):
280 """ finds multiple quantization bounds for a single variable 281 282 **Arguments** 283 284 - vals: sequence of variable values (assumed to be floats) 285 286 - nBounds: the number of quantization bounds to find 287 288 - results: a list of result codes (should be integers) 289 290 - nPossibleRes: an integer with the number of possible values of the 291 result variable 292 293 **Returns** 294 295 - a 2-tuple containing: 296 297 1) a list of the quantization bounds (floats) 298 299 2) the information gain associated with this quantization 300 301 302 """ 303 assert len(vals) == len(results), 'vals/results length mismatch' 304 305 nData = len(vals) 306 if nData == 0: 307 return [],-1e8 308 309 # sort the variable values: 310 svs = list(zip(vals,results)) 311 svs.sort() 312 sortVals,sortResults = zip(*svs) 313 startNext=_FindStartPoints(sortVals,sortResults,nData) 314 if not len(startNext): 315 return [0],0.0 316 if len(startNext)<nBounds: 317 nBounds = len(startNext)-1 318 if nBounds == 0: 319 nBounds=1 320 initCuts = list(range(nBounds)) 321 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext, 322 sortResults,nPossibleRes) 323 quantBounds = [] 324 nVs = len(sortVals) 325 for cut in bestCuts: 326 idx = startNext[cut] 327 if idx == nVs: 328 quantBounds.append(sortVals[-1]) 329 elif idx == 0: 330 quantBounds.append(sortVals[idx]) 331 else: 332 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.) 333 334 return quantBounds,maxGain
335 336 #hascQuantize=0 337 if hascQuantize: 338 _RecurseOnBounds = cQuantize._RecurseOnBounds 339 _FindStartPoints = cQuantize._FindStartPoints 340 else: 341 _RecurseOnBounds = _NewPyRecurseOnBounds 342 _FindStartPoints = _NewPyFindStartPoints 343 344 if __name__ == '__main__': 345 import sys 346 if 1: 347 d = [(1.,0), 348 (1.1,0), 349 (1.2,0), 350 (1.4,1), 351 (1.4,0), 352 (1.6,1), 353 (2.,1), 354 (2.1,0), 355 (2.1,0), 356 (2.1,0), 357 (2.2,1), 358 (2.3,0)] 359 varValues = list(map(lambda x:x[0],d)) 360 resCodes = list(map(lambda x:x[1],d)) 361 nPossibleRes = 2 362 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes) 363 print('RES:',res) 364 target = ([1.3, 2.05],.34707 ) 365 else: 366 d = [(1.,0), 367 (1.1,0), 368 (1.2,0), 369 (1.4,1), 370 (1.4,0), 371 (1.6,1), 372 (2.,1), 373 (2.1,0), 374 (2.1,0), 375 (2.1,0), 376 (2.2,1), 377 (2.3,0)] 378 varValues = list(map(lambda x:x[0],d)) 379 resCodes = list(map(lambda x:x[1],d)) 380 nPossibleRes =2 381 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes) 382 print(res) 383 #sys.exit(1) 384 d = [(1.4,1), 385 (1.4,0)] 386 387 varValues = list(map(lambda x:x[0],d)) 388 resCodes = list(map(lambda x:x[1],d)) 389 nPossibleRes =2 390 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes) 391 print(res) 392 393 d = [(1.4,0), 394 (1.4,0),(1.6,1)] 395 varValues = list(map(lambda x:x[0],d)) 396 resCodes = list(map(lambda x:x[1],d)) 397 nPossibleRes =2 398 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes) 399 print(res) 400