Package rdkit :: Package Chem :: Package Fingerprints :: Module FingerprintMols
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.Fingerprints.FingerprintMols

  1  # $Id$ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ utility functionality for fingerprinting sets of molecules 
 12   includes a command line app for working with fingerprints 
 13   and databases 
 14   
 15   
 16  Sample Usage: 
 17   
 18    python FingerprintMols.py  -d data.gdb \ 
 19          -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID"  \ 
 20          --outTable="daylight_sig" 
 21   
 22   
 23  """ 
 24  from __future__ import print_function 
 25  from rdkit import Chem 
 26  from rdkit.Chem import MACCSkeys 
 27  from rdkit.ML.Cluster import Murtagh 
 28  from rdkit import DataStructs 
 29  import sys 
 30  from rdkit.six.moves import cPickle 
 31   
 32  _cvsVersion="$Id$" 
 33  idx1 = _cvsVersion.find(':')+1 
 34  idx2 = _cvsVersion.rfind('$') 
 35  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 36   
 37   
def error(msg):
    """ writes the error text _msg_ to stderr exactly as given
    (no newline is appended)

    """
    stream = sys.stderr
    stream.write(msg)
def message(msg):
    """ writes the status text _msg_ to stderr exactly as given
    (no newline is appended)

    """
    stream = sys.stderr
    stream.write(msg)
42
def GetRDKFingerprint(mol):
    """ uses default parameters

    The defaults are the attributes of a freshly constructed
    _FingerprinterDetails_ instance, forwarded to _FingerprintMol_ as
    keyword arguments.

    """
    details = FingerprinterDetails()
    # apply() does not exist in Python 3; argument unpacking is the
    # documented equivalent of apply(func, args, kwargs).
    return FingerprintMol(mol, **details.__dict__)
47
def FoldFingerprintToTargetDensity(fp,**fpArgs):
    """ folds _fp_ in half repeatedly until its on-bit density reaches
    fpArgs['tgtDensity'] or another fold would bring its length down to
    fpArgs['minSize'] or below

    Returns the (possibly folded) fingerprint; _fp_ itself is returned
    when no folding is required.

    """
    onBits = fp.GetNumOnBits()
    totalBits = fp.GetNumBits()
    # keep folding only while the fingerprint is still too sparse AND
    # halving it would leave it larger than the minimum size
    while (float(onBits) / totalBits < fpArgs['tgtDensity']
           and totalBits / 2 > fpArgs['minSize']):
        fp = DataStructs.FoldFingerprint(fp, 2)
        onBits = fp.GetNumOnBits()
        totalBits = fp.GetNumBits()
    return fp
59
def FingerprintMol(mol,
                   fingerprinter=Chem.RDKFingerprint,
                   **fpArgs):
    """ generates a fingerprint for a single molecule

    If no keyword arguments are supplied, the attributes of a default
    _FingerprinterDetails_ instance are used as the parameters.

    For any fingerprinter other than Chem.RDKFingerprint the parameters
    are passed on as keyword arguments and the result is folded with
    _FoldFingerprintToTargetDensity_; Chem.RDKFingerprint receives its
    parameters positionally and its result is returned unchanged.

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter != Chem.RDKFingerprint:
        rawFp = fingerprinter(mol, **fpArgs)
        return FoldFingerprintToTargetDensity(rawFp, **fpArgs)

    return fingerprinter(mol,
                         fpArgs['minPath'], fpArgs['maxPath'],
                         fpArgs['fpSize'], fpArgs['bitsPerHash'],
                         fpArgs['useHs'], fpArgs['tgtDensity'],
                         fpArgs['minSize'])
76 77
def FingerprintsFromSmiles(dataSource,idCol,smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10,maxMols=-1,
                           **fpArgs):
    """ fingerprints the molecules described by the SMILES found in
    column _smiCol_ of each entry of _dataSource_

    fpArgs are passed as keyword arguments to the fingerprinter

    Entries whose SMILES cannot be parsed are reported via error() and
    skipped.  Progress is reported every _reportFreq_ molecules (when
    positive) and processing stops after _maxMols_ molecules (when
    positive).

    Returns a list of 2-tuples: (id,fp)

    """
    results = []
    count = 0
    for row in dataSource:
        molId = str(row[idCol])
        smi = str(row[smiCol])
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            error('Problems parsing SMILES: %s\n'%smi)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        results.append((molId, fp))
        count += 1
        if reportFreq > 0 and count % reportFreq == 0:
            message('Done %d molecules\n'%(count))
        if maxMols > 0 and count >= maxMols:
            break
    return results
103
def FingerprintsFromMols(mols,
                         fingerprinter=Chem.RDKFingerprint,
                         reportFreq=10,maxMols=-1,
                         **fpArgs):
    """ fingerprints a sequence of already-constructed molecules

    _mols_ is iterated as (id,mol) 2-tuples.

    fpArgs are passed as keyword arguments to the fingerprinter

    Entries whose molecule is false-y (e.g. None) are reported via
    error() and skipped.  Progress is reported every _reportFreq_
    molecules (when positive) and processing stops after _maxMols_
    molecules (when positive).

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for molId, mol in mols:
        if not mol:
            # The original message interpolated an undefined name (smi),
            # so reaching this branch raised NameError instead of
            # reporting the problem; the id is the only label we have.
            error('Problems processing molecule with id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
127
def FingerprintsFromPickles(dataSource,idCol,pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10,maxMols=-1,
                            **fpArgs):
    """ fingerprints the molecules built from the pickles found in
    column _pklCol_ of each entry of _dataSource_

    fpArgs are passed as keyword arguments to the fingerprinter

    Progress is reported every _reportFreq_ molecules (when positive)
    and processing stops after _maxMols_ molecules (when positive).

    Returns a list of 2-tuples: (id,fp)

    """
    results = []
    count = 0
    for row in dataSource:
        molId = str(row[idCol])
        pkl = str(row[pklCol])
        mol = Chem.Mol(pkl)
        if mol is None:
            error('Problems parsing pickle for id: %s\n'%molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        results.append((molId, fp))
        count += 1
        if reportFreq > 0 and count % reportFreq == 0:
            message('Done %d molecules\n'%(count))
        if maxMols > 0 and count >= maxMols:
            break
    return results
153
def FingerprintsFromDetails(details,reportFreq=10):
    """ runs a complete fingerprinting job as described by _details_

    Input is taken from the first of these that applies:

      1. the database table details.dbName / details.tableName
      2. the text file details.inFileName (when details.useSmiles is set)
      3. the SD file details.inFileName (when details.useSD is set)

    The attributes of _details_ are forwarded as keyword arguments to
    FingerprintsFromSmiles / FingerprintsFromPickles /
    FingerprintsFromMols.  If fingerprints were generated they are
    optionally pickled, one (id,fp) entry at a time, to
    details.outFileName and/or inserted into the table
    details.outTableName of details.outDbName (falling back to
    details.dbName).

    _reportFreq_ controls how often progress is reported while reading
    an SD file.

    Returns the list of (id,fp) 2-tuples, or None if no input data was
    available.

    """
    data = None
    # Bound up front so that a failed read leaves a well-defined "no data"
    # state instead of an unbound local further down.
    dataSet = None
    idCol = 0
    smiCol = 1

    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            conn = DbConnect(details.dbName,details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n'%(details.dbName,
                                                                           details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName,details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName,details.tableName,
                                     what='%s,%s'%(details.idName,details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        conn = None
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName,details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n'%(details.inFileName))
            traceback.print_exc()
            dataSet = None
    elif details.inFileName and details.useSD:
        conn = None
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n'%(details.inFileName))
            traceback.print_exc()
        else:
            # plain iteration instead of the Python-2-only supplier.next()
            for m in supplier:
                if m:
                    dataSet.append(m)
                    if reportFreq > 0 and not len(dataSet) % reportFreq:
                        message('Read %d molecules\n'%(len(dataSet)))
                    if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                        break

            # label each molecule with its id property, falling back to
            # the '_Name' property when the id property is absent
            for i, mol in enumerate(dataSet):
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet[i] = (nm, mol)
    else:
        dataSet = None

    fps = None
    # apply() does not exist in Python 3; f(*args, **kwargs) is its
    # documented equivalent.
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
        else:
            fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            # the context manager closes the file even if pickling fails
            with open(details.outFileName,'wb+') as outF:
                for entry in fps:
                    cPickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbInfo,DbUtils,DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            # NOTE(review): _data_ is only populated on the non-SD input
            #  paths, so it is still None here for SD input - confirm
            #  whether SD -> database output is meant to be supported.
            colTypes = DbUtils.TypeFinder(data,len(data),len(data[0]))
            typeStrs = DbUtils.GetTypeStrings([details.idName,details.smilesName],colTypes,
                                              keyCol=details.idName)
            cols = '%s, %s %s'%(typeStrs[0],details.fpColName,DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName,cols)

            #
            # And add the data
            #
            for molId, fp in fps:
                tpl = molId,DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName,tpl)
            conn.Commit()
    return fps
# ------------------------------------------------
#
#  Command line parsing stuff
#
# ------------------------------------------------

class FingerprinterDetails(object):
    """ holds the settings for a fingerprinting run; every setting
    receives a default value when an instance is constructed

    The same object carries the settings used by the fingerprinter,
    the screener and the clusterer.

    """

    def __init__(self):
        for initializer in (self._fingerprinterInit,
                            self._screenerInit,
                            self._clusterInit):
            initializer()

    def _fingerprinterInit(self):
        """ sets the defaults for the fingerprinting settings """
        self.fingerprinter = Chem.RDKFingerprint
        # names of databases, tables, columns and files
        self.fpColName = "AutoFragmentFP"
        self.idName = ''
        self.dbName = ''
        self.outDbName = ''
        self.tableName = ''
        # fingerprint parameters
        self.minSize = 64
        self.fpSize = 2048
        self.tgtDensity = 0.3
        self.minPath = 1
        self.maxPath = 7
        self.discrimHash = 0
        self.useHs = 0
        self.useValence = 0
        self.bitsPerHash = 2
        # input / output handling
        self.smilesName = 'SMILES'
        self.maxMols = -1
        self.outFileName = ''
        self.outTableName = ''
        self.inFileName = ''
        self.replaceTable = True
        self.molPklName = ''
        self.useSmiles = True
        self.useSD = False

    def _screenerInit(self):
        """ sets the defaults for the screening settings """
        self.metric = DataStructs.TanimotoSimilarity
        self.doScreen = ''
        self.topN = 10
        self.screenThresh = 0.75
        self.doThreshold = 0
        self.smilesTableName = ''
        self.probeSmiles = ''
        self.probeMol = None
        self.noPickle = 0

    def _clusterInit(self):
        """ sets the defaults for the clustering settings """
        self.clusterAlgo = Murtagh.WARDS
        self.actTableName = ''
        self.actName = ''

    def GetMetricName(self):
        """ returns the display name of the current metric

        Any other truthy metric is returned as-is; a false-y metric
        yields 'Unknown'.

        """
        namedMetrics = (
            (DataStructs.TanimotoSimilarity, 'Tanimoto'),
            (DataStructs.DiceSimilarity, 'Dice'),
            (DataStructs.CosineSimilarity, 'Cosine'),
        )
        for candidate, label in namedMetrics:
            if self.metric == candidate:
                return label
        if self.metric:
            return self.metric
        return 'Unknown'

    def SetMetricFromName(self,name):
        """ selects the metric by (case-insensitive) name; names that
        are not recognized leave the current metric untouched

        """
        key = name.upper()
        if key == "TANIMOTO":
            chosen = DataStructs.TanimotoSimilarity
        elif key == "DICE":
            chosen = DataStructs.DiceSimilarity
        elif key == "COSINE":
            chosen = DataStructs.CosineSimilarity
        else:
            return
        self.metric = chosen
350
def Usage():
    """ prints the module usage message and terminates the program
    with exit status -1

    """
    print(_usageDoc)
    raise SystemExit(-1)
# Text printed by Usage().  Keep this in step with the options handled in
# ParseArgs(); only the options relevant to fingerprinting are listed.
_usageDoc="""
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>.  Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information.  If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database.  Default is *SMILES*.

    - --useSD:  Assume that the input file is an SD file, not a SMILES
       table.

    - --idName=val: sets the name of the id column in the input
      database.  Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_:  name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints.  If this table already exists, it will be
      replaced.

    - --keepTable: if the output db table already exists, keep it
      and add the new fingerprints to it instead of replacing it.

    - --outDbName: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val:  base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens).  Default is *64*

    - --density=val: target bit density in the fingerprint.  The
      fingerprint will be folded until this density is
      reached.  Default is *0.3*

    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints.  Default is *1*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints.  Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment.  Default is *2*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

    - -h: print this message and exit.

"""
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**:

      - details: (optional) an existing details object to update; a
        new _FingerprinterDetails_ is created when this is None.

    **Returns**: the populated details object.

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - Arguments that cannot be parsed cause a traceback and the usage
        message to be printed, after which the program exits (see Usage).

    """
    import getopt
    import sys
    args = sys.argv[1:]
    try:
        args,extras = getopt.getopt(args,'HVs:d:t:o:h',
                                    [
                                     'minSize=','maxSize=',
                                     'density=',
                                     'minPath=','maxPath=',
                                     # The handler (and the usage text) used
                                     # --nBitsPerHash while only bitsPerHash
                                     # was registered, so neither spelling
                                     # had any effect; accept both.
                                     'bitsPerHash=','nBitsPerHash=',
                                     'smilesName=',
                                     'molPkl=',
                                     'useSD',
                                     'idName=',
                                     'discrim',
                                     'outTable=',
                                     'outDbName=',
                                     'fpColName=',
                                     'maxMols=',
                                     'useMACCS',
                                     'keepTable',
                                     # SCREENING:
                                     'smilesTable=',
                                     'doScreen=',
                                     'topN=',
                                     'thresh=',
                                     'smiles=',
                                     'dice',
                                     'cosine',
                                     # CLUSTERING:
                                     'actTable=',
                                     'actName=',
                                     'SLINK',
                                     'CLINK',
                                     'UPGMA',
                                    ])
    except Exception:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    # the first positional argument, if any, is the input file name
    if len(extras):
        details.inFileName = extras[0]

    for arg,val in args:
        if arg == '-H':
            details.useHs = 1
        elif arg == '-V':
            details.useValence = 1
        elif arg == '-d':
            details.dbName = val
        elif arg == '-t':
            details.tableName = val
        elif arg == '-o':
            details.outFileName = val
        elif arg == '--minSize':
            details.minSize = int(val)
        elif arg == '--maxSize':
            details.fpSize = int(val)
        elif arg == '--density':
            details.tgtDensity = float(val)
        elif arg == '--outTable':
            details.outTableName = val
        elif arg == '--outDbName':
            details.outDbName = val
        elif arg == '--fpColName':
            details.fpColName = val
        elif arg == '--minPath':
            details.minPath = int(val)
        elif arg == '--maxPath':
            details.maxPath = int(val)
        elif arg in ('--bitsPerHash', '--nBitsPerHash'):
            details.bitsPerHash = int(val)
        elif arg == '--discrim':
            details.discrimHash = 1
        elif arg == '--smilesName':
            details.smilesName = val
        elif arg == '--molPkl':
            details.molPklName = val
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--idName':
            details.idName = val
        elif arg == '--maxMols':
            details.maxMols = int(val)
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        elif arg == '--keepTable':
            details.replaceTable = False

        # SCREENER:
        elif arg == '--smilesTable':
            details.smilesTableName = val
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--smiles':
            details.probeSmiles = val
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity

        # CLUSTERS:
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '--actTable':
            details.actTableName = val
        elif arg == '--actName':
            details.actName = val
        elif arg == '-h':
            Usage()
    return details

# Script entry point: announce the version on stderr, read the settings
# from the command line and run the fingerprinting job they describe.
if __name__ == '__main__':
    message("This is FingerprintMols version %s\n\n" % (__VERSION_STRING,))
    details = ParseArgs()
    FingerprintsFromDetails(details)