
Source Code for Module rdkit.ML.Data.DataUtils

## Automatically adapted for numpy.oldnumeric Jun 27, 2008 by -c

#
#  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC
#   All Rights Reserved
#

""" Utilities for data manipulation

**FILE FORMATS:**

 - *.qdat files* contain quantized data suitable for
   feeding to learning algorithms.

   The .qdat file, written by _DecTreeGui_, is structured as follows:

    1) Any number of lines which are ignored.

    2) A line containing the string 'Variable Table', followed by any number
       of variable definitions in the format:

       '# Variable_name [quant_bounds]'

       where '[quant_bounds]' is a list of the boundaries used for quantizing
       that variable.  If the variable is inherently integral (i.e. not
       quantized), this can be an empty list.

    3) A line beginning with '# ----' which signals the end of the variable list

    4) Any number of lines containing data points, in the format:

       'Name_of_point var1 var2 var3 .... varN'

       all variable values should be integers

    Throughout, it is assumed that the last variable (varN) is the result.

 - *.dat files* contain the same information as .qdat files, but the variable
   values can be anything (floats, ints, strings).  **These files should
   still contain quant_bounds!**

 - *.qdat.pkl files* contain a pickled (binary) representation of
   the data read in.  They store, in order:

    1) A python list of the variable names

    2) A python list of lists with the quantization bounds

    3) A python list of the point names

    4) A python list of lists with the data points

"""
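# A minimal sketch of a .qdat file in the layout described above; the variable
# names, bounds, and data points are made up for illustration:
#
#   # Quantized data from DataUtils
#   # ----------
#   # Variable Table
#   # MW [250.0]
#   # NROT []
#   # ACT [0.5]
#   # ----------
#   mol-1 0 3 0
#   mol-2 1 1 1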
from __future__ import print_function

import csv
import random
import re

from rdkit import six
from rdkit.six.moves import cPickle
from rdkit.six.moves import xrange, map
from rdkit import RDConfig
from rdkit.utils import fileutils
from rdkit.ML.Data import MLData
from rdkit.Dbase.DbConnection import DbConnect
from rdkit.DataStructs import BitUtils

def permutation(nToDo):
  # returns a randomly shuffled list of the integers [0, nToDo)
  res = list(xrange(nToDo))
  random.shuffle(res)
  return res

def WriteData(outFile, varNames, qBounds, examples):
  """ writes out a .qdat file

    **Arguments**

      - outFile: a file object

      - varNames: a list of variable names

      - qBounds: the list of quantization bounds (should be the same length
        as _varNames_)

      - examples: the data to be written

  """
  outFile.write('# Quantized data from DataUtils\n')
  outFile.write('# ----------\n')
  outFile.write('# Variable Table\n')
  for i in xrange(len(varNames)):
    outFile.write('# %s %s\n' % (varNames[i], str(qBounds[i])))
  outFile.write('# ----------\n')
  for example in examples:
    outFile.write(' '.join(map(str, example)) + '\n')
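
# Illustrative sketch of calling WriteData; the file name, variable names, and
# quantization bounds used here are hypothetical.
def _exampleWriteQdat(outName='demo.qdat'):
  varNames = ['MW', 'NROT', 'ACT']
  qBounds = [[250.0], [], [0.5]]
  examples = [['mol-1', 0, 3, 0],
              ['mol-2', 1, 1, 1]]
  with open(outName, 'w') as outFile:
    WriteData(outFile, varNames, qBounds, examples)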

def ReadVars(inFile):
  """ reads the variables and quantization bounds from a .qdat or .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qBounds: the list of quantization bounds for each variable

  """
  varNames = []
  qBounds = []
  fileutils.MoveToMatchingLine(inFile, 'Variable Table')
  inLine = inFile.readline()
  while inLine.find('# ----') == -1:
    splitLine = inLine[2:].split('[')
    varNames.append(splitLine[0].strip())
    qBounds.append(splitLine[1][:-2])
    inLine = inFile.readline()
  for i in xrange(len(qBounds)):
    if qBounds[i] != '':
      l = qBounds[i].split(',')
      qBounds[i] = []
      for item in l:
        qBounds[i].append(float(item))
    else:
      qBounds[i] = []
  return varNames, qBounds

def ReadQuantExamples(inFile):
  """ reads the examples from a .qdat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      because this is reading a .qdat file, it is assumed that all variable
      values are integers

  """
  expr1 = re.compile(r'^#')
  # data lines are split on runs of spaces and/or tabs
  expr2 = re.compile(r'[ \t]+')
  examples = []
  names = []
  inLine = inFile.readline()
  while inLine:
    if expr1.search(inLine) is None:
      resArr = expr2.split(inLine)
      if len(resArr) > 1:
        examples.append([int(x) for x in resArr[1:]])
        names.append(resArr[0])
    inLine = inFile.readline()
  return names, examples
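
# Illustrative sketch of the low-level readers; 'demo.qdat' is a hypothetical
# file laid out as described in the module docstring.
def _exampleReadQdat(fileName='demo.qdat'):
  with open(fileName, 'r') as inFile:
    varNames, qBounds = ReadVars(inFile)
    ptNames, examples = ReadQuantExamples(inFile)
  return varNames, qBounds, ptNames, examples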

def ReadGeneralExamples(inFile):
  """ reads the examples from a .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      - this attempts to convert variable values to ints, then floats.
        if both of those fail, they are left as strings

  """
  expr1 = re.compile(r'^#')
  # data lines are split on runs of spaces and/or tabs
  expr2 = re.compile(r'[ \t]+')
  examples = []
  names = []
  inLine = inFile.readline()
  while inLine:
    if expr1.search(inLine) is None:
      resArr = expr2.split(inLine)[:-1]
      if len(resArr) > 1:
        for i in xrange(1, len(resArr)):
          d = resArr[i]
          try:
            resArr[i] = int(d)
          except ValueError:
            try:
              resArr[i] = float(d)
            except ValueError:
              pass
        examples.append(resArr[1:])
        names.append(resArr[0])
    inLine = inFile.readline()
  return names, examples

def BuildQuantDataSet(fileName):
  """ builds a data set from a .qdat file

    **Arguments**

      - fileName: the name of the .qdat file

    **Returns**

      an _MLData.MLQuantDataSet_

  """
  with open(fileName, 'r') as inFile:
    varNames, qBounds = ReadVars(inFile)
    ptNames, examples = ReadQuantExamples(inFile)
    data = MLData.MLQuantDataSet(examples, qBounds=qBounds, varNames=varNames,
                                 ptNames=ptNames)
  return data

def BuildDataSet(fileName):
  """ builds a data set from a .dat file

    **Arguments**

      - fileName: the name of the .dat file

    **Returns**

      an _MLData.MLDataSet_

  """
  with open(fileName, 'r') as inFile:
    varNames, qBounds = ReadVars(inFile)
    ptNames, examples = ReadGeneralExamples(inFile)
    data = MLData.MLDataSet(examples, qBounds=qBounds, varNames=varNames,
                            ptNames=ptNames)
  return data

def CalcNPossibleUsingMap(data, order, qBounds, nQBounds=None):
  """ calculates the number of possible values for each variable in a data set

    **Arguments**

      - data: a list of examples

      - order: the ordering map between the variables in _data_ and _qBounds_

      - qBounds: the quantization bounds for the variables

    **Returns**

      a list with the number of possible values each variable takes on in the data set

    **Notes**

      - variables present in _qBounds_ will have their _nPossible_ number read
        from _qBounds_

      - _nPossible_ for other numeric variables will be calculated

  """
  numericTypes = [int, float]
  if six.PY2:
    numericTypes.append(long)

  print('order:', order, len(order))
  print('qB:', qBounds)
  #print('nQB:',nQBounds, len(nQBounds))
  assert (qBounds and len(order) == len(qBounds)) or (nQBounds and len(order) == len(nQBounds)),\
      'order/qBounds mismatch'
  nVars = len(order)
  nPossible = [-1] * nVars
  cols = list(range(nVars))
  for i in xrange(nVars):
    if nQBounds and nQBounds[i] != 0:
      nPossible[i] = -1
      cols.remove(i)
    elif len(qBounds[i]) > 0:
      nPossible[i] = len(qBounds[i])
      cols.remove(i)

  nPts = len(data)
  for i in xrange(nPts):
    for col in cols[:]:
      d = data[i][order[col]]
      if type(d) in numericTypes:
        if int(d) == d:
          nPossible[col] = max(int(d), nPossible[col])
        else:
          nPossible[col] = -1
          cols.remove(col)
      else:
        print('bye bye col %d: %s' % (col, repr(d)))
        nPossible[col] = -1
        cols.remove(col)

  return [int(x) + 1 for x in nPossible]
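
# Illustrative sketch (hypothetical data): with three variables, where the
# second has a single quantization bound and the other two are integral,
#
#   data = [['mol-1', 0, 1.5, 1],
#           ['mol-2', 2, 0.5, 0]]
#   CalcNPossibleUsingMap(data, order=[1, 2, 3], qBounds=[[], [1.0], []])
#
# should return [3, 2, 2]: the integral columns take on the values 0..2 and
# 0..1, while the single bound splits the middle column into two bins.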


def WritePickledData(outName, data):
  """ writes either a .qdat.pkl or a .dat.pkl file

    **Arguments**

      - outName: the name of the file to be used

      - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_

  """
  varNames = data.GetVarNames()
  qBounds = data.GetQuantBounds()
  ptNames = data.GetPtNames()
  examples = data.GetAllData()
  with open(outName, 'wb+') as outFile:
    cPickle.dump(varNames, outFile)
    cPickle.dump(qBounds, outFile)
    cPickle.dump(ptNames, outFile)
    cPickle.dump(examples, outFile)
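
# Illustrative sketch tying the readers together: build an MLQuantDataSet from
# a (hypothetical) .qdat file and pickle it alongside the original.
def _exampleQdatToPickle(fileName='demo.qdat'):
  data = BuildQuantDataSet(fileName)
  WritePickledData(fileName + '.pkl', data)
  return data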

def TakeEnsemble(vect, ensembleIds, isDataVect=False):
  """

  >>> v = [10,20,30,40,50]
  >>> TakeEnsemble(v,(1,2,3))
  [20, 30, 40]
  >>> v = ['foo',10,20,30,40,50,1]
  >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
  ['foo', 20, 30, 40, 1]

  """
  if isDataVect:
    # data vectors carry the point name in position 0 and the activity at the
    # end; offset the ensemble ids past the name and keep both endpoints
    ensembleIds = [x + 1 for x in ensembleIds]
    vect = [vect[0]] + [vect[x] for x in ensembleIds] + [vect[-1]]
  else:
    vect = [vect[x] for x in ensembleIds]
  return vect


def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1,
             what='*', where='', join='', pickleCol=-1, pickleClass=None,
             ensembleIds=None):
  """ constructs an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: if nonzero specifies which column should be used to recognize
        duplicates.

    **Returns**

      an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

  """
  conn = DbConnect(dbName, tableName, user, password)
  res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol,
                     forceList=1)
  nPts = len(res)
  vals = [None] * nPts
  ptNames = [None] * nPts
  classWorks = True
  for i in range(nPts):
    tmp = list(res[i])
    ptNames[i] = tmp.pop(0)
    if pickleCol >= 0:
      if not pickleClass or not classWorks:
        tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
      else:
        try:
          tmp[pickleCol] = pickleClass(str(tmp[pickleCol]))
        except Exception:
          tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
          classWorks = False
      if ensembleIds:
        tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
    else:
      if ensembleIds:
        tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
    vals[i] = tmp
  varNames = conn.GetColumnNames(join=join, what=what)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data
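
# Illustrative sketch (hypothetical database and table names):
#
#   data = DBToData('data.sqlt', 'actTable')
#
# would pull every row of 'actTable', using the first column as the point
# names and the remaining columns as the variable values.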

def TextToData(reader, ignoreCols=[], onlyCols=None):
  """ constructs an _MLData.MLDataSet_ from a bunch of text
    #DOC
    **Arguments**

      - reader needs to be iterable and return lists of elements
        (like a csv.reader)

    **Returns**

      an _MLData.MLDataSet_

  """
  varNames = next(reader)
  if not onlyCols:
    keepCols = []
    for i, name in enumerate(varNames):
      if name not in ignoreCols:
        keepCols.append(i)
  else:
    keepCols = [-1] * len(onlyCols)
    for i, name in enumerate(varNames):
      if name in onlyCols:
        keepCols[onlyCols.index(name)] = i

  nCols = len(varNames)
  varNames = tuple([varNames[x] for x in keepCols])
  nVars = len(varNames)
  vals = []
  ptNames = []
  for splitLine in reader:
    if len(splitLine):
      if len(splitLine) != nCols:
        raise ValueError('unequal line lengths')
      tmp = [splitLine[x] for x in keepCols]
      ptNames.append(tmp[0])
      pt = [None] * (nVars - 1)
      for j in range(nVars - 1):
        try:
          val = int(tmp[j + 1])
        except ValueError:
          try:
            val = float(tmp[j + 1])
          except ValueError:
            val = str(tmp[j + 1])
        pt[j] = val
      vals.append(pt)
  data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
  return data

def TextFileToData(fName, onlyCols=None):
  """ constructs an _MLData.MLDataSet_ from a delimited text file;
    .csv files are read as comma delimited, anything else as tab delimited
    #DOC

  """
  ext = fName.split('.')[-1]
  with open(fName, 'r') as inF:
    if ext.upper() == 'CSV':
      # CSV module distributed with python2.3 and later
      splitter = csv.reader(inF)
    else:
      splitter = csv.reader(inF, delimiter='\t')
    res = TextToData(splitter, onlyCols=onlyCols)
  return res
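
# Illustrative sketch: build a data set from a hypothetical comma-delimited
# file whose first row holds the variable names and whose first column holds
# the point names.
def _exampleCsvToData(fileName='demo.csv'):
  data = TextFileToData(fileName)
  print(data.GetNPts(), 'points,', len(data.GetVarNames()), 'variables')
  return data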

def InitRandomNumbers(seed):
  """ Seeds the random number generators

    **Arguments**

      - seed: a 2-tuple containing integers to be used as the random number seeds

    **Notes**

      this seeds both the RDRandom generator and the one in the standard
      Python _random_ module

  """
  from rdkit import RDRandom
  # both generators are seeded with the first element of the tuple
  RDRandom.seed(seed[0])
  import random
  random.seed(seed[0])

def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
  """ filters a data set so that a fraction _frac_ of the points kept have the
    value _val_ in column _col_; returns a 2-tuple of (kept, rejected) points
    (or indices if _indicesOnly_ is set)
    #DOC
  """
  if frac < 0 or frac > 1:
    raise ValueError('filter fraction out of bounds')
  try:
    inData[0][col]
  except IndexError:
    raise ValueError('target column index out of range')

  # convert the input data to a list and sort them
  if indicesToUse:
    tmp = [inData[x] for x in indicesToUse]
  else:
    tmp = list(inData)
  nOrig = len(tmp)
  sortOrder = list(xrange(nOrig))
  #sortOrder.sort(lambda x,y,col=col,tmp=tmp:cmp(tmp[x][col],tmp[y][col]))
  # no more cmp in python3, must use a key function
  sortOrder.sort(key=lambda x: tmp[x][col])
  tmp = [tmp[x] for x in sortOrder]

  # find the start of the entries with value val
  start = 0
  while start < nOrig and tmp[start][col] != val:
    start += 1
  if start >= nOrig:
    raise ValueError('target value (%d) not found in data' % (val))

  # find the end of the entries with value val
  finish = start + 1
  while finish < nOrig and tmp[finish][col] == val:
    finish += 1

  # how many entries have the target value?
  nWithVal = finish - start

  # how many don't?
  nOthers = len(tmp) - nWithVal

  currFrac = float(nWithVal) / nOrig
  if currFrac < frac:
    #
    # We're going to keep most of (all) the points with the target value.
    # We need to figure out how many of the other points we'll
    # toss out
    #
    nTgtFinal = nWithVal
    nFinal = int(round(nWithVal / frac))
    nOthersFinal = nFinal - nTgtFinal

    #
    # We may need to reduce the number of targets to keep
    # because it may make it impossible to hit exactly the
    # fraction we're trying for.  Take care of that now
    #
    while float(nTgtFinal) / nFinal > frac:
      nTgtFinal -= 1
      nFinal -= 1

  else:
    #
    # There are too many points with the target value,
    # we'll keep most of (all) the other points and toss a random
    # selection of the target value points
    #
    nOthersFinal = nOthers
    nFinal = int(round(nOthers / (1 - frac)))
    nTgtFinal = nFinal - nOthersFinal

    #
    # We may need to reduce the number of others to keep
    # because it may make it impossible to hit exactly the
    # fraction we're trying for.  Take care of that now
    #
    while float(nTgtFinal) / nFinal < frac:
      nOthersFinal -= 1
      nFinal -= 1

  others = list(xrange(start)) + list(xrange(finish, nOrig))
  othersTake = permutation(nOthers)
  others = [others[x] for x in othersTake[:nOthersFinal]]

  targets = list(xrange(start, finish))
  targetsTake = permutation(nWithVal)
  targets = [targets[x] for x in targetsTake[:nTgtFinal]]

  # these are all the indices we'll be keeping
  indicesToKeep = targets + others
  nToKeep = len(indicesToKeep)
  nRej = nOrig - nToKeep

  res = []
  rej = []
  # now pull the points, but in random order
  if not indicesOnly:
    for i in permutation(nOrig):
      if i in indicesToKeep:
        res.append(tmp[i])
      else:
        rej.append(tmp[i])
  else:
    # EFF: this is slower than it needs to be
    for i in permutation(nOrig):
      if not indicesToUse:
        idx = sortOrder[i]
      else:
        idx = indicesToUse[sortOrder[i]]
      if i in indicesToKeep:
        res.append(idx)
      else:
        rej.append(idx)
  return res, rej
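
# Illustrative sketch (hypothetical data): given six points whose last column
# holds the activity,
#
#   pts = [['a', 1], ['b', 1], ['c', 1], ['d', 1], ['e', 0], ['f', 0]]
#   kept, rejected = FilterData(pts, val=1, frac=0.5)
#
# kept should contain the two activity-0 points plus a randomly chosen pair of
# the activity-1 points, so that half of the kept points have activity 1.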

def CountResults(inData, col=-1, bounds=None):
  """ counts the number of points having each value found in column _col_;
    if _bounds_ is provided, the values are first binned using those boundaries
    #DOC
  """
  counts = {}
  for p in inData:
    if not bounds:
      r = p[col]
    else:
      act = p[col]
      bound = 0
      placed = 0
      while not placed and bound < len(bounds):
        if act < bounds[bound]:
          r = bound
          placed = 1
        else:
          bound += 1
      if not placed:
        r = bound

    counts[r] = counts.get(r, 0) + 1
  return counts
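
# Illustrative sketch (hypothetical data): count how many points fall into each
# activity class, with and without binning on quantization bounds.
def _exampleCountResults():
  pts = [['a', 0.2], ['b', 1.4], ['c', 2.5]]
  print(CountResults(pts))                     # {0.2: 1, 1.4: 1, 2.5: 1}
  print(CountResults(pts, bounds=[1.0, 2.0]))  # {0: 1, 1: 1, 2: 1}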


def RandomizeActivities(dataSet, shuffle=0, runDetails=None):
  """ randomizes the activity values of a dataset

    **Arguments**

      - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized

      - shuffle: an optional toggle. If this is set, the activity values
        will be shuffled (so the number in each class remains constant)

      - runDetails: an optional CompositeRun object

    **Note**

      - the examples in _dataSet_ are modified in place

  """
  nPossible = dataSet.GetNPossibleVals()[-1]
  nPts = dataSet.GetNPts()
  if shuffle:
    if runDetails:
      runDetails.shuffled = 1
    acts = dataSet.GetResults()[:]
    random.shuffle(acts)
  else:
    if runDetails:
      runDetails.randomized = 1
    # activity values are integers in the range [0, nPossible)
    acts = [random.randint(0, nPossible - 1) for _ in range(nPts)]
  for i in range(nPts):
    tmp = dataSet[i]
    tmp[-1] = acts[i]
    dataSet[i] = tmp


#------------------------------------
#
#  doctest boilerplate
#
def _test():
  import doctest, sys
  return doctest.testmod(sys.modules["__main__"])


if __name__ == '__main__':
  import sys
  failed, tried = _test()
  sys.exit(failed)