Package rdkit :: Package Chem :: Package Fingerprints :: Module MolSimilarity
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Fingerprints.MolSimilarity

  1  # $Id$ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ utility functionality for molecular similarity 
 12   includes a command line app for screening databases 
 13   
 14   
 15  Sample Usage: 
 16   
 17    python MolSimilarity.py  -d data.gdb -t daylight_sig --idName="Mol_ID" \ 
 18        --topN=100 --smiles='c1(C=O)ccc(Oc2ccccc2)cc1' --smilesTable=raw_dop_data \ 
 19        --smilesName="structure" -o results.csv  
 20   
 21  """ 
 22  from rdkit import RDConfig 
 23  from rdkit import DataStructs 
 24  from rdkit import Chem 
 25  from rdkit.Dbase.DbConnection import DbConnect 
 26  from rdkit.Dbase import DbModule 
 27  from rdkit.DataStructs.TopNContainer import TopNContainer 
 28  import sys,types 
 29  from rdkit.six.moves import cPickle 
 30  from rdkit.Chem.Fingerprints import FingerprintMols,DbFpSupplier 
 31  try:   
 32    from rdkit.VLib.NodeLib.DbPickleSupplier import _lazyDataSeq as _dataSeq 
 33  except ImportError: 
 34    _dataSeq=None 
 35     
 36   
 37  from rdkit import DataStructs 
 38   
 39  _cvsVersion="$Id$" 
 40  idx1 = _cvsVersion.find(':')+1 
 41  idx2 = _cvsVersion.rfind('$') 
 42  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 43   
 44   
def _ConstructSQL(details,extraFields=''):
  """ builds the text of the SQL select statement used for a screen

    Arguments:
      - details: object providing the attributes tableName, idName,
        smilesTableName, smilesName, actTableName and actName
      - extraFields: (optional) text added verbatim to the end of the
        list of selected fields

    Returns: the select statement as a string

    Notes:
      - the table and column names are interpolated directly into the SQL
        text, so they must come from a trusted source.

  """
  fields = ['%s.%s'%(details.tableName,details.idName)]
  joins = []
  if details.smilesTableName:
    if details.smilesName:
      fields.append(str(details.smilesName))
    joins.append('join %s smi on smi.%s=%s.%s'%(details.smilesTableName,
                                                details.idName,
                                                details.tableName,
                                                details.idName))
  if details.actTableName:
    if details.actName:
      fields.append(str(details.actName))
    joins.append('join %s act on act.%s=%s.%s'%(details.actTableName,
                                                details.idName,
                                                details.tableName,
                                                details.idName))
  if extraFields:
    fields.append(extraFields)
  # the join clauses have to be separated by whitespace; concatenating them
  # directly produces invalid SQL when both tables are present
  return 'select %s from %s %s'%(','.join(fields),details.tableName,' '.join(joins))
67
def ScreenInDb(details,mol):
  """ screens using similarity functions evaluated inside the database

    Arguments:
      - details: the screening parameters.  Its attribute dictionary is
        also passed as keyword arguments to FingerprintMols.FingerprintMol
      - mol: the probe molecule

    Returns: the rows fetched from the database query or, when
      details.metric is not Tanimoto, Dice or Cosine, the result of
      ScreenFingerprints().  An empty list is returned if the probe can
      not be fingerprinted or no database connection is available for the
      in-database query.

  """
  import traceback
  try:
    # apply() does not exist in Python 3; argument unpacking is equivalent
    probeFp = FingerprintMols.FingerprintMol(mol,**details.__dict__)
  except Exception:
    FingerprintMols.error('Error: problems fingerprinting molecule.\n')
    traceback.print_exc()
    return []

  conn = None
  if details.dbName and details.tableName:
    try:
      conn = DbConnect(details.dbName,details.tableName)
      if hasattr(details,'dbUser'):
        conn.user = details.dbUser
      if hasattr(details,'dbPassword'):
        conn.password = details.dbPassword
    except Exception:
      conn = None
      FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n'%(details.dbName,
                                                                                            details.tableName))
      traceback.print_exc()

  # names of the SQL-side similarity functions
  # NOTE(review): presumably these are provided by a database extension;
  # confirm against the database setup.
  dbFuncs = ((DataStructs.TanimotoSimilarity,'rd_tanimoto'),
             (DataStructs.DiceSimilarity,'rd_dice'),
             (DataStructs.CosineSimilarity,'rd_cosine'))
  func = None
  for metric,name in dbFuncs:
    if details.metric == metric:
      func = name
      break

  if func is None:
    # no in-database equivalent for this metric: screen on the client
    data = GetFingerprints(details)
    return ScreenFingerprints(details,data,mol)

  if conn is None:
    # without this guard the query below fails with a NameError
    FingerprintMols.error('Error: no database connection available for in-database screening.\n')
    return []

  pkl = probeFp.ToBitString()
  extraFields="%s(%s,%s) as tani"%(func,DbModule.placeHolder,details.fpColName)
  cmd = _ConstructSQL(details,extraFields=extraFields)

  if details.doThreshold:
    # we need to do a subquery here:
    cmd = "select * from (%s) tmp where tani>%f"%(cmd,details.screenThresh)
  cmd += " order by tani desc"
  if not details.doThreshold and details.topN>0:
    cmd += " limit %d"%details.topN
  curs = conn.GetCursor()
  curs.execute(cmd,(pkl,))
  return curs.fetchall()
119
def GetFingerprints(details):
  """ returns an iterable sequence of fingerprints
  each fingerprint will have a _fieldsFromDb member whose first entry is
  the id.

    Arguments:
      - details: the screening parameters.  If dbName and tableName are
        set the fingerprints come from the database, otherwise they are
        read from the pickle file details.inFileName.

    Returns: the fingerprint sequence.  None is returned when no data
      source is specified or the database connection can not be
      established; an empty list is returned when the input file can not
      be opened.

    Notes:
      - the input file is unpickled, so it must come from a trusted source.

  """
  import traceback
  if details.dbName and details.tableName:
    try:
      conn = DbConnect(details.dbName,details.tableName)
      if hasattr(details,'dbUser'):
        conn.user = details.dbUser
      if hasattr(details,'dbPassword'):
        conn.password = details.dbPassword
    except Exception:
      FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n'%(details.dbName,
                                                                                            details.tableName))
      traceback.print_exc()
      # there is nothing to query without a connection
      return None
    cmd = _ConstructSQL(details,extraFields=details.fpColName)
    curs = conn.GetCursor()
    if _dataSeq:
      suppl = _dataSeq(curs,cmd,depickle=not details.noPickle,klass=DataStructs.ExplicitBitVect)
      # NOTE(review): presumably keeps the connection alive for as long as
      # the lazy sequence is in use; confirm.
      _dataSeq._conn = conn
    else:
      # NOTE(review): `data` is not defined anywhere in this function, so
      # this branch raises a NameError when _dataSeq is unavailable.  It
      # needs a result set built from `conn`; confirm the intended query
      # before fixing.
      suppl = DbFpSupplier.ForwardDbFpSupplier(data,fpColName=details.fpColName)
  elif details.inFileName:
    suppl = []
    try:
      # pickles have to be read in binary mode (required on Python 3)
      inF = open(details.inFileName,'rb')
    except IOError:
      FingerprintMols.error('Error: Problems reading from file %s\n'%(details.inFileName))
      traceback.print_exc()
      return suppl
    try:
      while True:
        try:
          molId,fp = cPickle.load(inF)
        except EOFError:
          # normal termination: no more pickles in the file
          break
        except Exception:
          # keep whatever was read so far, but do not hide the problem
          FingerprintMols.error('Error: Problems unpickling fingerprints from file %s\n'%(details.inFileName))
          traceback.print_exc()
          break
        fp._fieldsFromDb = [molId]
        suppl.append(fp)
    finally:
      inF.close()
  else:
    suppl = None

  return suppl
170
def ScreenFingerprints(details,data,mol=None,probeFp=None):
  """ Returns a list of results

    Arguments:
      - details: the screening parameters (doThreshold, topN, screenThresh,
        noPickle, metric and, optionally, stopAfter are read here)
      - data: iterable of the fingerprints to screen.  Entries are either
        (id,fingerprint) sequences or fingerprints carrying a _fieldsFromDb
        member; when details.noPickle is set they are (id,data) pairs and
        details.metric is called directly on the string form of the data.
      - mol: (optional) the probe molecule, used only if probeFp is not
        provided
      - probeFp: (optional) the probe fingerprint

    Returns: a list of (id,score) tuples.  The list is empty if the probe
      can not be fingerprinted or if data is None.

  """
  if probeFp is None:
    try:
      # apply() does not exist in Python 3; argument unpacking is equivalent
      probeFp = FingerprintMols.FingerprintMol(mol,**details.__dict__)
    except Exception:
      import traceback
      FingerprintMols.error('Error: problems fingerprinting molecule.\n')
      traceback.print_exc()
      return []
  if not probeFp:
    return []
  if data is None:
    # GetFingerprints() returns None when there is nothing to read
    return []

  # track the mode explicitly instead of relying on the container's truth value
  useTopN = not details.doThreshold and details.topN>0
  if useTopN:
    topN = TopNContainer(details.topN)
  else:
    topN = []
  res = []
  count = 0
  for pt in data:
    if not details.noPickle:
      if isinstance(pt,(tuple,list)):
        ptId,fp = pt
      else:
        fp = pt
        ptId = pt._fieldsFromDb[0]
      score = DataStructs.FingerprintSimilarity(probeFp,fp,details.metric)
    else:
      ptId,pkl = pt
      score = details.metric(probeFp,str(pkl))
    if useTopN:
      topN.Insert(score,ptId)
    elif not details.doThreshold or score>=details.screenThresh:
      res.append((ptId,score))
    count += 1
    if hasattr(details,'stopAfter') and count >= details.stopAfter:
      break
  for score,ptId in topN:
    res.append((ptId,score))

  return res
217
def ScreenFromDetails(details,mol=None):
  """ Returns a list of results

    Arguments:
      - details: the screening parameters.  If no molecule is passed in,
        the probe is taken from details.probeMol or built from
        details.probeSmiles.
      - mol: (optional) the probe molecule

    Returns: the screening results.  If details.outFileName is set, the
      results are also written to that file, one comma-separated line per
      result.  None is returned if no probe molecule is available or the
      output file can not be opened.

  """
  if not mol:
    if not details.probeMol:
      smi = details.probeSmiles
      try:
        mol = Chem.MolFromSmiles(smi)
      except Exception:
        import traceback
        FingerprintMols.error('Error: problems generating molecule for smiles: %s\n'%(smi))
        traceback.print_exc()
        return
    else:
      mol = details.probeMol
  if not mol:
    return

  # the output file is opened before screening so that an unwritable
  # destination is reported before any work is done
  outF = None
  if details.outFileName:
    try:
      outF = open(details.outFileName,'w+')
    except IOError:
      FingerprintMols.error("Error: could not open output file %s for writing\n"%(details.outFileName))
      return None

  try:
    if not getattr(details,'useDbSimilarity',False):
      data = GetFingerprints(details)
      res = ScreenFingerprints(details,data,mol)
    else:
      res = ScreenInDb(details,mol)
    if outF:
      for pt in res:
        outF.write(','.join([str(x) for x in pt]))
        outF.write('\n')
  finally:
    # make sure the file is flushed and closed, even if the screen fails
    if outF:
      outF.close()
  return res

# Usage text for the command line application; it is installed on
# FingerprintMols in the __main__ block below before the arguments are parsed.
_usageDoc="""
Usage: MolSimilarity.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the pickled file <fName>. This file should
  contain a series of pickled (id,fingerprint) tuples.

  NOTE: at the moment the user is responsible for ensuring that the
  fingerprint parameters given at run time (used to fingerprint the
  probe molecule) match those used to generate the input fingerprints.

  Command line arguments are:
    - --smiles=val: sets the SMILES for the input molecule. This is
      a required argument.

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --smilesTable=val: sets the name of the database table
      which contains SMILES for the input fingerprints. If this
      information is provided along with smilesName (see below),
      the output file will contain SMILES data

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --topN=val: sets the number of results to return.
      Default is *10*.

    - --thresh=val: sets the similarity threshold.

    - --idName=val: sets the name of the id column in the input
      database. Default is *ID*.

    - -o _outFileName_: name of the output file (output will
      be a CSV file with one line for each of the output molecules

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Command line entry point: print the version banner, install the usage
# text, parse the arguments and run the screen they describe.
if __name__ == '__main__':
  FingerprintMols.message("This is MolSimilarity version %s\n\n"%(__VERSION_STRING))
  FingerprintMols._usageDoc=_usageDoc
  details = FingerprintMols.ParseArgs()
  ScreenFromDetails(details)