Package rdkit :: Package Chem :: Module BuildFragmentCatalog
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.BuildFragmentCatalog

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """  command line utility for working with FragmentCatalogs (CASE-type analysis) 
 12   
 13  **Usage** 
 14   
 15    BuildFragmentCatalog [optional args] <filename> 
 16   
 17   filename, the name of a delimited text file containing InData, is required 
 18   for some modes of operation (see below) 
 19   
 20  **Command Line Arguments** 
 21   
 22   - -n *maxNumMols*:  specify the maximum number of molecules to be processed 
 23   
 24   - -b: build the catalog and OnBitLists 
 25      *requires InData* 
 26   
 27   - -s: score compounds 
 28      *requires InData and a Catalog, can use OnBitLists* 
 29   
 30   - -g: calculate info gains 
 31      *requires Scores* 
 32   
 33   - -d: show details about high-ranking fragments 
 34      *requires a Catalog and Gains* 
 35   
 36   - --catalog=*filename*: filename with the pickled catalog. 
 37      If -b is provided, this file will be overwritten. 
 38   
 39   - --onbits=*filename*: filename to hold the pickled OnBitLists. 
 40     If -b is provided, this file will be overwritten 
 41     
 42   - --scores=*filename*: filename to hold the text score data. 
 43     If -s is provided, this file will be overwritten 
 44   
 45   - --gains=*filename*: filename to hold the text gains data. 
 46     If -g is provided, this file will be overwritten 
 47   
 48   - --details=*filename*: filename to hold the text details data. 
 49     If -d is provided, this file will be overwritten. 
 50   
 51   - --minPath=2: specify the minimum length for a path 
 52   
 53   - --maxPath=6: specify the maximum length for a path 
 54   
 55   - --smiCol=1: specify which column in the input data file contains 
 56       SMILES 
 57   
 58   - --actCol=-1: specify which column in the input data file contains 
 59       activities 
 60   
 61   - --nActs=2: specify the number of possible activity values 
 62   
 63   - --nBits=-1: specify the maximum number of bits to show details for 
 64   
 65  """ 
 66  from __future__ import print_function 
 67  import sys,os 
 68  from rdkit.six.moves import cPickle  #@UnresolvedImport #pylint: disable=F0401 
 69  from rdkit.six import next 
 70  from rdkit import Chem 
 71  from rdkit import RDConfig 
 72  from rdkit.Chem import FragmentCatalog 
 73  from rdkit.Dbase.DbConnection import DbConnect 
 74  import numpy 
 75  from rdkit.ML import InfoTheory 
 76  import types 
 77   
 78  _cvsVersion="$Revision$" 
 79  idx1 = _cvsVersion.find(':')+1 
 80  idx2 = _cvsVersion.rfind('$') 
 81  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 82   
83 -def message(msg,dest=sys.stdout):
84 dest.write(msg)
85
86 -def BuildCatalog(suppl,maxPts=-1,groupFileName=None, 87 minPath=2,maxPath=6,reportFreq=10):
88 """ builds a fragment catalog from a set of molecules in a delimited text block 89 90 **Arguments** 91 92 - suppl: a mol supplier 93 94 - maxPts: (optional) if provided, this will set an upper bound on the 95 number of points to be considered 96 97 - groupFileName: (optional) name of the file containing functional group 98 information 99 100 - minPath, maxPath: (optional) names of the minimum and maximum path lengths 101 to be considered 102 103 - reportFreq: (optional) how often to display status information 104 105 **Returns** 106 107 a FragmentCatalog 108 109 """ 110 if groupFileName is None: 111 groupFileName = os.path.join(RDConfig.RDDataDir,"FunctionalGroups.txt") 112 113 fpParams = FragmentCatalog.FragCatParams(minPath,maxPath,groupFileName) 114 catalog = FragmentCatalog.FragCatalog(fpParams) 115 fgen = FragmentCatalog.FragCatGenerator() 116 if maxPts >0: 117 nPts = maxPts 118 else: 119 if hasattr(suppl,'__len__'): 120 nPts = len(suppl) 121 else: 122 nPts = -1 123 for i,mol in enumerate(suppl): 124 if i == nPts: 125 break 126 if i and not i%reportFreq: 127 if nPts>-1: 128 message('Done %d of %d, %d paths\n'%(i,nPts,catalog.GetFPLength())) 129 else: 130 message('Done %d, %d paths\n'%(i,catalog.GetFPLength())) 131 fgen.AddFragsFromMol(mol,catalog) 132 return catalog
133
134 -def ScoreMolecules(suppl,catalog,maxPts=-1,actName='',acts=None, 135 nActs=2,reportFreq=10):
136 """ scores the compounds in a supplier using a catalog 137 138 **Arguments** 139 140 - suppl: a mol supplier 141 142 - catalog: the FragmentCatalog 143 144 - maxPts: (optional) the maximum number of molecules to be 145 considered 146 147 - actName: (optional) the name of the molecule's activity property. 148 If this is not provided, the molecule's last property will be used. 149 150 - acts: (optional) a sequence of activity values (integers). 151 If not provided, the activities will be read from the molecules. 152 153 - nActs: (optional) number of possible activity values 154 155 - reportFreq: (optional) how often to display status information 156 157 **Returns** 158 159 a 2-tuple: 160 161 1) the results table (a 3D array of ints nBits x 2 x nActs) 162 163 2) a list containing the on bit lists for each molecule 164 165 """ 166 nBits = catalog.GetFPLength() 167 resTbl = numpy.zeros((nBits,2,nActs),numpy.int) 168 obls = [] 169 170 if not actName and not acts: 171 actName = suppl[0].GetPropNames()[-1] 172 173 174 fpgen = FragmentCatalog.FragFPGenerator() 175 suppl.reset() 176 i = 1 177 for mol in suppl: 178 if i and not i%reportFreq: 179 message('Done %d.\n'%(i)) 180 if mol: 181 if not acts: 182 act = int(mol.GetProp(actName)) 183 else: 184 act = acts[i-1] 185 fp = fpgen.GetFPForMol(mol,catalog) 186 obls.append([x for x in fp.GetOnBits()]) 187 for j in range(nBits): 188 resTbl[j,0,act] += 1 189 for id in obls[i-1]: 190 resTbl[id-1,0,act] -= 1 191 resTbl[id-1,1,act] += 1 192 else: 193 obls.append([]) 194 i+=1 195 return resTbl,obls
196
197 -def ScoreFromLists(bitLists,suppl,catalog,maxPts=-1,actName='',acts=None, 198 nActs=2,reportFreq=10):
199 """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists 200 for the molecules (this speeds things up a lot) 201 202 203 **Arguments** 204 205 - bitLists: sequence of on bit sequences for the input molecules 206 207 - suppl: the input supplier (we read activities from here) 208 209 - catalog: the FragmentCatalog 210 211 - maxPts: (optional) the maximum number of molecules to be 212 considered 213 214 - actName: (optional) the name of the molecule's activity property. 215 If this is not provided, the molecule's last property will be used. 216 217 - nActs: (optional) number of possible activity values 218 219 - reportFreq: (optional) how often to display status information 220 221 **Returns** 222 223 the results table (a 3D array of ints nBits x 2 x nActs) 224 225 """ 226 nBits = catalog.GetFPLength() 227 if maxPts >0: 228 nPts = maxPts 229 else: 230 nPts = len(bitLists) 231 resTbl = numpy.zeros((nBits,2,nActs),numpy.int) 232 if not actName and not acts: 233 actName = suppl[0].GetPropNames()[-1] 234 suppl.reset() 235 for i in range(1,nPts+1): 236 mol = next(suppl) 237 if not acts: 238 act = int(mol.GetProp(actName)) 239 else: 240 act = acts[i-1] 241 if i and not i%reportFreq: 242 message('Done %d of %d\n'%(i,nPts)) 243 ids = set() 244 for id in bitLists[i-1]: 245 ids.add(id-1) 246 for j in range(nBits): 247 resTbl[j,0,act] += 1 248 for id in ids: 249 resTbl[id,0,act] -= 1 250 resTbl[id,1,act] += 1 251 return resTbl
252
253 -def CalcGains(suppl,catalog,topN=-1,actName='',acts=None, 254 nActs=2,reportFreq=10,biasList=None,collectFps=0):
255 """ calculates info gains by constructing fingerprints 256 *DOC* 257 258 Returns a 2-tuple: 259 1) gains matrix 260 2) list of fingerprints 261 262 """ 263 nBits = catalog.GetFPLength() 264 if topN < 0: 265 topN = nBits 266 if not actName and not acts: 267 actName = suppl[0].GetPropNames()[-1] 268 269 gains = [0]*nBits 270 if hasattr(suppl,'__len__'): 271 nMols = len(suppl) 272 else: 273 nMols = -1 274 fpgen = FragmentCatalog.FragFPGenerator() 275 #ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 276 if biasList: 277 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY) 278 ranker.SetBiasList(biasList) 279 else: 280 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 281 i = 0 282 fps = [] 283 for mol in suppl: 284 if not acts: 285 try: 286 act = int(mol.GetProp(actName)) 287 except KeyError: 288 message('ERROR: Molecule has no property: %s\n'%(actName)) 289 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames()))) 290 raise KeyError(actName) 291 else: 292 act = acts[i] 293 if i and not i%reportFreq: 294 if nMols>0: 295 message('Done %d of %d.\n'%(i,nMols)) 296 else: 297 message('Done %d.\n'%(i)) 298 fp = fpgen.GetFPForMol(mol,catalog) 299 ranker.AccumulateVotes(fp,act) 300 i+=1; 301 if collectFps: 302 fps.append(fp) 303 gains = ranker.GetTopN(topN) 304 return gains,fps
305
306 -def CalcGainsFromFps(suppl,fps,topN=-1,actName='',acts=None, 307 nActs=2,reportFreq=10,biasList=None):
308 """ calculates info gains from a set of fingerprints 309 310 *DOC* 311 312 """ 313 nBits = len(fps[0]) 314 if topN < 0: 315 topN = nBits 316 if not actName and not acts: 317 actName = suppl[0].GetPropNames()[-1] 318 319 gains = [0]*nBits 320 if hasattr(suppl,'__len__'): 321 nMols = len(suppl) 322 else: 323 nMols = -1 324 if biasList: 325 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY) 326 ranker.SetBiasList(biasList) 327 else: 328 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 329 for i,mol in enumerate(suppl): 330 if not acts: 331 try: 332 act = int(mol.GetProp(actName)) 333 except KeyError: 334 message('ERROR: Molecule has no property: %s\n'%(actName)) 335 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames()))) 336 raise KeyError(actName) 337 else: 338 act = acts[i] 339 if i and not i%reportFreq: 340 if nMols>0: 341 message('Done %d of %d.\n'%(i,nMols)) 342 else: 343 message('Done %d.\n'%(i)) 344 fp = fps[i] 345 ranker.AccumulateVotes(fp,act) 346 gains = ranker.GetTopN(topN) 347 return gains
348
349 -def OutputGainsData(outF,gains,cat,nActs=2):
350 actHeaders = ['Act-%d'%(x) for x in range(nActs)] 351 if cat: 352 outF.write('id,Description,Gain,%s\n'%(','.join(actHeaders))) 353 else: 354 outF.write('id,Gain,%s\n'%(','.join(actHeaders))) 355 for entry in gains: 356 id = int(entry[0]) 357 outL = [str(id)] 358 if cat: 359 descr = cat.GetBitDescription(id) 360 outL.append(descr) 361 outL.append('%.6f'%entry[1]) 362 outL += ['%d'%x for x in entry[2:]] 363 outF.write(','.join(outL)) 364 outF.write('\n')
365 366
367 -def ProcessGainsData(inF,delim=',',idCol=0,gainCol=1):
368 """ reads a list of ids and info gains out of an input file 369 370 """ 371 res = [] 372 inL = inF.readline() 373 for line in inF.xreadlines(): 374 splitL = line.strip().split(delim) 375 res.append((splitL[idCol],float(splitL[gainCol]))) 376 return res
377 378
379 -def ShowDetails(catalog,gains,nToDo=-1,outF=sys.stdout,idCol=0,gainCol=1, 380 outDelim=','):
381 """ 382 gains should be a sequence of sequences. The idCol entry of each 383 sub-sequence should be a catalog ID. _ProcessGainsData()_ provides 384 suitable input. 385 386 """ 387 if nToDo < 0: 388 nToDo = len(gains) 389 for i in range(nToDo): 390 id = int(gains[i][idCol]) 391 gain = float(gains[i][gainCol]) 392 descr = catalog.GetFragDescription(id) 393 if descr: 394 outF.write('%s\n'%(outDelim.join((str(id),descr,str(gain)))))
395
396 -def SupplierFromDetails(details):
397 from rdkit.VLib.NodeLib.DbMolSupply import DbMolSupplyNode 398 from rdkit.VLib.NodeLib.SmilesSupply import SmilesSupplyNode 399 400 if details.dbName: 401 conn = DbConnect(details.dbName,details.tableName) 402 suppl = DbMolSupplyNode(conn.GetData()) 403 else: 404 suppl = SmilesSupplyNode(details.inFileName,delim=details.delim, 405 nameColumn=details.nameCol, 406 smilesColumn=details.smiCol, 407 titleLine=details.hasTitle) 408 if type(details.actCol)==types.IntType: 409 suppl.reset() 410 m = next(suppl) 411 actName = m.GetPropNames()[details.actCol] 412 details.actCol = actName 413 if type(details.nameCol)==types.IntType: 414 suppl.reset() 415 m = next(suppl) 416 nameName = m.GetPropNames()[details.nameCol] 417 details.nameCol = nameName 418 suppl.reset() 419 if type(details.actCol)==types.IntType: 420 suppl.reset() 421 m = next(suppl) 422 actName = m.GetPropNames()[details.actCol] 423 details.actCol = actName 424 if type(details.nameCol)==types.IntType: 425 suppl.reset() 426 m = next(suppl) 427 nameName = m.GetPropNames()[details.nameCol] 428 details.nameCol = nameName 429 suppl.reset() 430 return suppl
431 432
433 -def Usage():
434 print("This is BuildFragmentCatalog version %s"%(__VERSION_STRING)) 435 print('usage error') 436 #print(__doc__) 437 sys.exit(-1)
438
439 -class RunDetails(object):
440 numMols=-1 441 doBuild=0 442 doSigs=0 443 doScore=0 444 doGains=0 445 doDetails=0 446 catalogName=None 447 onBitsName=None 448 scoresName=None 449 gainsName=None 450 dbName='' 451 tableName=None 452 detailsName=None 453 inFileName=None 454 fpName=None 455 minPath=2 456 maxPath=6 457 smiCol=1 458 actCol=-1 459 nameCol=-1 460 hasTitle=1 461 nActs = 2 462 nBits=-1 463 delim=',' 464 biasList=None 465 topN=-1
466
467 -def ParseArgs(details):
468 import getopt 469 try: 470 args,extras = getopt.getopt(sys.argv[1:],'n:d:cst', 471 ['catalog=','onbits=', 472 'scoresFile=','gainsFile=','detailsFile=','fpFile=', 473 'minPath=','maxPath=','smiCol=','actCol=','nameCol=','nActs=', 474 'nBits=','biasList=','topN=', 475 'build','sigs','gains','details','score','noTitle']) 476 except Exception: 477 sys.stderr.write('Error parsing command line:\n') 478 import traceback 479 traceback.print_exc() 480 Usage() 481 for arg,val in args: 482 if arg=='-n': 483 details.numMols=int(val) 484 elif arg=='-c': 485 details.delim=',' 486 elif arg=='-s': 487 details.delim=' ' 488 elif arg=='-t': 489 details.delim='\t' 490 elif arg=='-d': 491 details.dbName=val 492 elif arg=='--build': 493 details.doBuild=1 494 elif arg=='--score': 495 details.doScore=1 496 elif arg=='--gains': 497 details.doGains=1 498 elif arg=='--sigs': 499 details.doSigs=1 500 elif arg=='-details': 501 details.doDetails=1 502 elif arg=='--catalog': 503 details.catalogName=val 504 elif arg=='--onbits': 505 details.onBitsName=val 506 elif arg=='--scoresFile': 507 details.scoresName=val 508 elif arg=='--gainsFile': 509 details.gainsName=val 510 elif arg=='--detailsFile': 511 details.detailsName=val 512 elif arg=='--fpFile': 513 details.fpName=val 514 elif arg=='--minPath': 515 details.minPath=int(val) 516 elif arg=='--maxPath': 517 details.maxPath=int(val) 518 elif arg=='--smiCol': 519 try: 520 details.smiCol=int(val) 521 except ValueError: 522 details.smiCol=val 523 elif arg=='--actCol': 524 try: 525 details.actCol=int(val) 526 except ValueError: 527 details.actCol=val 528 elif arg=='--nameCol': 529 try: 530 details.nameCol=int(val) 531 except ValueError: 532 details.nameCol=val 533 elif arg=='--nActs': 534 details.nActs=int(val) 535 elif arg=='--nBits': 536 details.nBits=int(val) 537 elif arg=='--noTitle': 538 details.hasTitle=0 539 elif arg=='--biasList': 540 details.biasList=tuple(eval(val)) 541 elif arg=='--topN': 542 details.topN=int(val) 543 elif arg=='-h': 544 Usage() 545 sys.exit(0) 546 else: 547 Usage() 548 if len(extras): 549 if details.dbName: 550 details.tableName=extras[0] 551 else: 552 details.inFileName = extras[0] 553 else: 554 Usage()
555 if __name__=='__main__': 556 import time 557 details = RunDetails() 558 ParseArgs(details) 559 from io import StringIO 560 suppl = SupplierFromDetails(details) 561 562 cat = None 563 obls = None 564 if details.doBuild: 565 if not suppl: 566 message("We require inData to generate a catalog\n") 567 sys.exit(-2) 568 message("Building catalog\n") 569 t1 = time.time() 570 cat = BuildCatalog(suppl,maxPts=details.numMols, 571 minPath=details.minPath,maxPath=details.maxPath) 572 t2 = time.time() 573 message("\tThat took %.2f seconds.\n"%(t2-t1)) 574 if details.catalogName: 575 message("Dumping catalog data\n") 576 cPickle.dump(cat,open(details.catalogName,'wb+')) 577 elif details.catalogName: 578 message("Loading catalog\n") 579 cat = cPickle.load(open(details.catalogName,'rb')) 580 if details.onBitsName: 581 try: 582 obls = cPickle.load(open(details.onBitsName,'rb')) 583 except Exception: 584 obls = None 585 else: 586 if len(obls)<(inD.count('\n')-1): 587 obls = None 588 scores = None 589 if details.doScore: 590 if not suppl: 591 message("We require inData to score molecules\n") 592 sys.exit(-2) 593 if not cat: 594 message("We require a catalog to score molecules\n") 595 sys.exit(-2) 596 message("Scoring compounds\n") 597 if not obls or len(obls)<details.numMols: 598 scores,obls = ScoreMolecules(suppl,cat,maxPts=details.numMols, 599 actName=details.actCol, 600 nActs=details.nActs) 601 if details.scoresName: 602 cPickle.dump(scores,open(details.scoresName,'wb+')) 603 if details.onBitsName: 604 cPickle.dump(obls,open(details.onBitsName,'wb+')) 605 else: 606 scores = ScoreFromLists(obls,suppl,cat,maxPts=details.numMols, 607 actName=details.actCol, 608 nActs=details.nActs) 609 elif details.scoresName: 610 scores = cPickle.load(open(details.scoresName,'rb')) 611 612 if details.fpName and os.path.exists(details.fpName) and not details.doSigs: 613 message("Reading fingerprints from file.\n") 614 fps = cPickle.load(open(details.fpName,'rb')) 615 else: 616 fps = [] 617 gains = None 618 if details.doGains: 619 if not suppl: 620 message("We require inData to calculate gains\n") 621 sys.exit(-2) 622 if not (cat or fps): 623 message("We require either a catalog or fingerprints to calculate gains\n") 624 sys.exit(-2) 625 message("Calculating Gains\n") 626 t1 = time.time() 627 if details.fpName: 628 collectFps=1 629 else: 630 collectFps=0 631 if not fps: 632 gains,fps = CalcGains(suppl,cat,topN=details.topN,actName=details.actCol, 633 nActs=details.nActs,biasList=details.biasList, 634 collectFps=collectFps) 635 if details.fpName: 636 message("Writing fingerprint file.\n") 637 tmpF=open(details.fpName,'wb+') 638 cPickle.dump(fps,tmpF,1) 639 tmpF.close() 640 else: 641 gains = CalcGainsFromFps(suppl,fps,topN=details.topN,actName=details.actCol, 642 nActs=details.nActs,biasList=details.biasList) 643 t2=time.time() 644 message("\tThat took %.2f seconds.\n"%(t2-t1)) 645 if details.gainsName: 646 outF = open(details.gainsName,'w+') 647 OutputGainsData(outF,gains,cat,nActs=details.nActs) 648 else: 649 if details.gainsName: 650 inF = open(details.gainsName,'r') 651 gains = ProcessGainsData(inF) 652 653 654 if details.doDetails: 655 if not cat: 656 message("We require a catalog to get details\n") 657 sys.exit(-2) 658 if not gains: 659 message("We require gains data to get details\n") 660 sys.exit(-2) 661 io = StringIO() 662 io.write('id,SMILES,gain\n') 663 ShowDetails(cat,gains,nToDo=details.nBits,outF=io) 664 if details.detailsName: 665 open(details.detailsName,'w+').write(io.getvalue()) 666 else: 667 sys.stderr.write(io.getvalue()) 668