1
2
3
4
5
6
7
8
9
10
11 """ utility functionality for fingerprinting sets of molecules
12 includes a command line app for working with fingerprints
13 and databases
14
15
16 Sample Usage:
17
18 python FingerprintMols.py -d data.gdb \
19 -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID" \
20 --outTable="daylight_sig"
21
22
23 """
24 from __future__ import print_function
25 from rdkit import Chem
26 from rdkit.Chem import MACCSkeys
27 from rdkit.ML.Cluster import Murtagh
28 from rdkit import DataStructs
29 import sys
30 from rdkit.six.moves import cPickle
31
# Version string derived from the CVS/SVN "$Id$" keyword.  When the keyword
# has been expanded ("$Id: file rev date ... $") the text between the first
# ':' and the last '$' is kept.  When it has not been expanded there is no
# ':' (find() returns -1, so idx1 is 0) and the slice is simply "$Id".
_cvsVersion = "$Id$"
idx1 = _cvsVersion.find(':') + 1
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2])
36
37
42
47
# NOTE(review): the `def` line for this body is missing from this listing.
# Judging by the call `FoldFingerprintToTargetDensity(fp,**fpArgs)` elsewhere
# in the file, this is presumably that function's body -- confirm.
#
# Repeatedly folds `fp` in half (DataStructs.FoldFingerprint(fp,2)) while its
# on-bit density (on bits / total bits) is below fpArgs['tgtDensity'], and
# returns the resulting fingerprint.  Reads the 'tgtDensity' and 'minSize'
# keys of fpArgs; a fingerprint reporting 0 total bits would make the density
# division fail.
49 nOn = fp.GetNumOnBits()
50 nTot = fp.GetNumBits()
51 while( float(nOn)/nTot < fpArgs['tgtDensity'] ):
# fold only if half of the current size is still strictly above minSize;
# otherwise stop even though the target density has not been reached
52 if nTot / 2 > fpArgs['minSize']:
53 fp = DataStructs.FoldFingerprint(fp,2)
54 nOn = fp.GetNumOnBits()
55 nTot = fp.GetNumBits()
56 else:
57 break
58 return fp
59
# NOTE(review): the `def` line for this body is missing from this listing.
# Call sites use `FingerprintMol(mol,fingerprinter,**fpArgs)`, so this is
# presumably that function's body -- confirm the exact signature/defaults.
#
# Builds and returns a fingerprint for `mol`.
# With no keyword arguments, the attribute dictionary of a default
# FingerprinterDetails instance is used as fpArgs.
63 if not fpArgs:
64 details = FingerprinterDetails()
65 fpArgs = details.__dict__
66
# Any fingerprinter other than Chem.RDKFingerprint receives ALL of fpArgs as
# keyword arguments and its result is folded afterwards.
67 if fingerprinter != Chem.RDKFingerprint:
68 fp = fingerprinter(mol,**fpArgs)
69 fp = FoldFingerprintToTargetDensity(fp,**fpArgs)
70 else:
# Chem.RDKFingerprint is called positionally; tgtDensity and minSize are
# passed to it directly and no separate folding step is applied here
# (presumably it folds internally -- confirm against the RDKit API).
71 fp = fingerprinter(mol,fpArgs['minPath'],fpArgs['maxPath'],
72 fpArgs['fpSize'],fpArgs['bitsPerHash'],
73 fpArgs['useHs'],fpArgs['tgtDensity'],
74 fpArgs['minSize'])
75 return fp
76
77
def FingerprintsFromSmiles(dataSource,idCol,smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10,maxMols=-1,
                           **fpArgs):
    """ fingerprints the molecules described by SMILES in a data source

    **Arguments**

      - dataSource: an iterable of indexable entries (rows)

      - idCol: index, within each entry, of the molecule id

      - smiCol: index, within each entry, of the SMILES string

      - fingerprinter: handed through to FingerprintMol()

      - reportFreq: if > 0, a progress message is emitted after every
        _reportFreq_ successfully fingerprinted molecules

      - maxMols: if > 0, processing stops once this many molecules have
        been fingerprinted

      - fpArgs are passed as keyword arguments to the fingerprinter

    **Returns**

      a list of 2-tuples: (id,fp)

    **Notes**

      - both the id and the SMILES are converted with str()

      - entries whose SMILES cannot be parsed (Chem.MolFromSmiles returns
        None) are reported through error() and skipped; they do not count
        towards _reportFreq_ or _maxMols_

    """
    res = []
    nDone = 0
    for entry in dataSource:
        # molId rather than id: avoid shadowing the builtin
        molId,smi = str(entry[idCol]),str(entry[smiCol])
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            error('Problems parsing SMILES: %s\n'%smi)
            continue
        fp = FingerprintMol(mol,fingerprinter,**fpArgs)
        res.append((molId,fp))
        nDone += 1
        if reportFreq>0 and not nDone % reportFreq:
            message('Done %d molecules\n'%(nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
103
# NOTE(review): the `def` line(s) for this body are missing from this listing.
# The body reads `mols`, `fingerprinter`, `reportFreq`, `maxMols` and `fpArgs`,
# and the file calls `FingerprintsFromMols` with a list of (name,mol) tuples,
# so this is presumably that function -- confirm the exact signature.
#
# Fingerprints each truthy molecule in `mols` (an iterable of (id,mol) pairs)
# with FingerprintMol() and collects (id,fp) tuples.
108 """ fpArgs are passed as keyword arguments to the fingerprinter
109
110 Returns a list of 2-tuples: (id,fp)
111
112 """
113 res = []
114 nDone = 0
115 for id,mol in mols:
116 if mol:
117 fp = FingerprintMol(mol,fingerprinter,**fpArgs)
118 res.append((id,fp))
119 nDone += 1
# progress message every reportFreq molecules (disabled when reportFreq <= 0)
120 if reportFreq>0 and not nDone % reportFreq:
121 message('Done %d molecules\n'%(nDone))
# maxMols <= 0 means "no limit"
122 if maxMols > 0 and nDone >= maxMols:
123 break
124 else:
# NOTE(review): `smi` is not assigned anywhere in this body (the loop binds
# only `id` and `mol`), so reaching this branch would raise NameError unless
# a global of that name exists.  The message also mentions SMILES although
# the inputs here are molecule objects; `id` looks like the intended value.
125 error('Problems parsing SMILES: %s\n'%smi)
126 return res
127
def FingerprintsFromPickles(dataSource,idCol,pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10,maxMols=-1,
                            **fpArgs):
    """ fingerprints the molecules stored as pickles in a data source

    **Arguments**

      - dataSource: an iterable of indexable entries (rows)

      - idCol: index, within each entry, of the molecule id

      - pklCol: index, within each entry, of the molecule pickle

      - fingerprinter: handed through to FingerprintMol()

      - reportFreq: if > 0, a progress message is emitted after every
        _reportFreq_ successfully fingerprinted molecules

      - maxMols: if > 0, processing stops once this many molecules have
        been fingerprinted

      - fpArgs are passed as keyword arguments to the fingerprinter

    **Returns**

      a list of 2-tuples: (id,fp)

    **Notes**

      - both the id and the pickle are converted with str() before use

      - entries for which Chem.Mol() yields None are reported through
        error() and skipped

    """
    res = []
    nDone = 0
    for entry in dataSource:
        # molId rather than id: avoid shadowing the builtin
        molId,pkl = str(entry[idCol]),str(entry[pklCol])
        # NOTE(review): Chem.Mol(pkl) may raise on a malformed pickle instead
        # of returning None, in which case the branch below is never reached
        # -- confirm against the RDKit API.
        mol = Chem.Mol(pkl)
        if mol is None:
            error('Problems parsing pickle for id: %s\n'%molId)
            continue
        fp = FingerprintMol(mol,fingerprinter,**fpArgs)
        res.append((molId,fp))
        nDone += 1
        if reportFreq>0 and not nDone % reportFreq:
            message('Done %d molecules\n'%(nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
153
# NOTE(review): the `def` line for this body is missing from this listing.
# The script entry point calls `FingerprintsFromDetails(details)`, so this is
# presumably that function.  `reportFreq` is read below but never assigned in
# the visible body, so it is presumably a parameter -- confirm.
#
# Overall flow, as far as the visible code shows:
#   1. load input from a database table, a SMILES text file or an SD file,
#      depending on the attributes of `details`
#   2. fingerprint the loaded molecules
#   3. optionally pickle the (id,fp) tuples to details.outFileName and/or
#      store them in a database table
#   4. return the list of fingerprints (None if nothing was loaded)
155 data = None
# --- input from a database table ---
156 if details.dbName and details.tableName:
157 from rdkit.Dbase.DbConnection import DbConnect
158 from rdkit.Dbase import DbInfo
159 from rdkit.ML.Data import DataUtils
160 try:
161 conn = DbConnect(details.dbName,details.tableName)
162 except Exception:
# NOTE(review): the failure is only reported; execution continues afterwards
# with `conn` unbound.
163 import traceback
164 error('Problems establishing connection to database: %s|%s\n'%(details.dbName,
165 details.tableName))
166 traceback.print_exc()
# default the id column to the first column of the table
167 if not details.idName:
168 details.idName=DbInfo.GetColumnNames(details.dbName,details.tableName)[0]
169 dataSet = DataUtils.DBToData(details.dbName,details.tableName,
170 what='%s,%s'%(details.idName,details.smilesName))
171 idCol = 0
172 smiCol = 1
# --- input from a delimited text file containing SMILES ---
173 elif details.inFileName and details.useSmiles:
174 from rdkit.ML.Data import DataUtils
175 conn = None
176 if not details.idName:
177 details.idName='ID'
178 try:
179 dataSet = DataUtils.TextFileToData(details.inFileName,
180 onlyCols=[details.idName,details.smilesName])
181 except IOError:
# NOTE(review): after an IOError `dataSet` is never assigned, yet it is read
# further down.
182 import traceback
183 error('Problems reading from file %s\n'%(details.inFileName))
184 traceback.print_exc()
185
186 idCol = 0
187 smiCol = 1
# --- input from an SD file ---
188 elif details.inFileName and details.useSD:
189 conn = None
# NOTE(review): `dataset` (lower-case s) is a different name from `dataSet`
# and is not read anywhere in the visible body.
190 dataset=None
191 if not details.idName:
192 details.idName='ID'
193 dataSet = []
194 try:
195 s = Chem.SDMolSupplier(details.inFileName)
196 except Exception:
197 import traceback
198 error('Problems reading from file %s\n'%(details.inFileName))
199 traceback.print_exc()
200 else:
201 while 1:
202 try:
# NOTE(review): `.next()` is the Python 2 iterator method name -- confirm the
# supplier still exposes it under Python 3.
203 m = s.next()
204 except StopIteration:
205 break
# molecules that evaluate as false (e.g. unreadable records) are dropped
206 if m:
207 dataSet.append(m)
208 if reportFreq>0 and not len(dataSet) % reportFreq:
209 message('Read %d molecules\n'%(len(dataSet)))
210 if details.maxMols > 0 and len(dataSet) >= details.maxMols:
211 break
212
# replace each molecule with a (name,mol) tuple; the name is the idName
# property when present, else the '_Name' property
213 for i,mol in enumerate(dataSet):
214 if mol.HasProp(details.idName):
215 nm = mol.GetProp(details.idName)
216 else:
217 nm = mol.GetProp('_Name')
218 dataSet[i] = (nm,mol)
219 else:
220 dataSet = None
221
# --- fingerprinting ---
# NOTE(review): `apply` is a Python 2 builtin; it does not exist in Python 3.
# details.__dict__ is passed as the keyword arguments.
222 fps = None
223 if dataSet and not details.useSD:
224 data = dataSet.GetNamedData()
225 if not details.molPklName:
226 fps = apply(FingerprintsFromSmiles,(data,idCol,smiCol),
227 details.__dict__)
228 else:
# the same column index (smiCol) is handed over as the pickle column
229 fps = apply(FingerprintsFromPickles,(data,idCol,smiCol),
230 details.__dict__)
231 elif dataSet and details.useSD:
232 fps = apply(FingerprintsFromMols,(dataSet,),details.__dict__)
233
# --- output ---
234 if fps:
235 if details.outFileName:
# each (id,fp) tuple is pickled as a separate record in the output file
236 outF = open(details.outFileName,'wb+')
237 for i in range(len(fps)):
238 cPickle.dump(fps[i],outF)
239 outF.close()
# the output database defaults to the input database
240 dbName = details.outDbName or details.dbName
241 if details.outTableName and dbName:
242 from rdkit.Dbase.DbConnection import DbConnect
243 from rdkit.Dbase import DbInfo,DbUtils,DbModule
244 conn = DbConnect(dbName)
245
246
247
248
# NOTE(review): `data` is only assigned on the non-SD path above; on the SD
# path it is still None here, so len(data) would fail.
249 colTypes = DbUtils.TypeFinder(data,len(data),len(data[0]))
250 typeStrs = DbUtils.GetTypeStrings([details.idName,details.smilesName],colTypes,
251 keyCol=details.idName)
# output table layout: the id column plus one binary column for the fingerprint
252 cols = '%s, %s %s'%(typeStrs[0],details.fpColName,DbModule.binaryTypeName)
253
254
255
256
257
258
259
260
# (re)create the table when replacement is requested or when no table with
# that name exists (case-insensitive comparison)
261 if details.replaceTable or \
262 details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
263 conn.AddTable(details.outTableName,cols)
264
265
266
267
# store each fingerprint in its binary form
268 for id,fp in fps:
269 tpl = id,DbModule.binaryHolder(fp.ToBinary())
270 conn.InsertData(details.outTableName,tpl)
271 conn.Commit()
272 return fps
273
274
275
276
277
278
# NOTE(review): the `class` line and the `def` lines of the methods that hold
# the assignments below are missing from this listing.  Other parts of the
# file instantiate `FingerprinterDetails()` and read its `__dict__` as keyword
# arguments, so this is presumably that class -- confirm.
280 """ class for storing the details of a fingerprinting run,
281 generates sensible defaults on construction
282
283 """
288
# --- fingerprinting defaults ---
# fingerprinter: callable used to generate the fingerprints
290 self.fingerprinter = Chem.RDKFingerprint
# fpColName: name of the fingerprint column in the output db table
291 self.fpColName="AutoFragmentFP"
# idName: name of the id column/property; empty means "pick a default later"
292 self.idName=''
293 self.dbName=''
294 self.outDbName=''
295 self.tableName=''
# minSize / fpSize / tgtDensity: folding limit, base fingerprint size and
# target bit density
296 self.minSize=64
297 self.fpSize=2048
298 self.tgtDensity=0.3
299 self.minPath=1
300 self.maxPath=7
301 self.discrimHash=0
302 self.useHs=0
303 self.useValence=0
304 self.bitsPerHash=2
305 self.smilesName='SMILES'
# maxMols: values <= 0 disable the limit
306 self.maxMols=-1
307 self.outFileName=''
308 self.outTableName=''
309 self.inFileName=''
310 self.replaceTable=True
311 self.molPklName=''
# useSmiles / useSD select how an input file is interpreted
312 self.useSmiles=True
313 self.useSD=False
314
# --- screening defaults (presumably consumed by the screener application) ---
316 self.metric = DataStructs.TanimotoSimilarity
317 self.doScreen=''
318 self.topN=10
319 self.screenThresh=0.75
320 self.doThreshold=0
321 self.smilesTableName=''
322 self.probeSmiles=''
323 self.probeMol=None
324 self.noPickle=0
325
# --- clustering defaults (presumably consumed by the clustering application) ---
327 self.clusterAlgo = Murtagh.WARDS
328 self.actTableName = ''
329 self.actName = ''
330
350
# NOTE(review): the `def` line for this body is missing from this listing.
# The argument parser calls `Usage()`, so this is presumably that function.
352 """ prints a usage string and exits
353
354 """
# the text comes from the module-level _usageDoc; the process then terminates
# with exit status -1
355 print(_usageDoc)
356 sys.exit(-1)
357
# Help text printed by the usage routine.
# NOTE(review): this text documents `--nBitsPerHash`, but the getopt
# long-option list in the argument parser registers `bitsPerHash=`; one of
# the two needs to change.  The parser also accepts options that are not
# described here (e.g. --molPkl, --keepTable and the screening/clustering
# options), and --keepTable contradicts "it will be replaced" below.
358 _usageDoc="""
359 Usage: FingerprintMols.py [args] <fName>
360
361 If <fName> is provided and no tableName is specified (see below),
362 data will be read from the text file <fName>. Text files delimited
363 with either commas (extension .csv) or tabs (extension .txt) are
364 supported.
365
366 Command line arguments are:
367 - -d _dbName_: set the name of the database from which
368 to pull input molecule information. If output is
369 going to a database, this will also be used for that
370 unless the --outDbName option is used.
371
372 - -t _tableName_: set the name of the database table
373 from which to pull input molecule information
374
375 - --smilesName=val: sets the name of the SMILES column
376 in the input database. Default is *SMILES*.
377
378 - --useSD: Assume that the input file is an SD file, not a SMILES
379 table.
380
381 - --idName=val: sets the name of the id column in the input
382 database. Defaults to be the name of the first db column
383 (or *ID* for text files).
384
385 - -o _outFileName_: name of the output file (output will
386 be a pickle file with one label,fingerprint entry for each
387 molecule).
388
389 - --outTable=val: name of the output db table used to store
390 fingerprints. If this table already exists, it will be
391 replaced.
392
393 - --outDbName: name of output database, if it's being used.
394 Defaults to be the same as the input db.
395
396 - --fpColName=val: name to use for the column which stores
397 fingerprints (in pickled format) in the output db table.
398 Default is *AutoFragmentFP*
399
400 - --maxSize=val: base size of the fingerprints to be generated
401 Default is *2048*
402
403 - --minSize=val: minimum size of the fingerprints to be generated
404 (limits the amount of folding that happens). Default is *64*
405
406 - --density=val: target bit density in the fingerprint. The
407 fingerprint will be folded until this density is
408 reached. Default is *0.3*
409
410 - --minPath=val: minimum path length to be included in
411 fragment-based fingerprints. Default is *1*.
412
413 - --maxPath=val: maximum path length to be included in
414 fragment-based fingerprints. Default is *7*.
415
416 - --nBitsPerHash: number of bits to be set in the output
417 fingerprint for each fragment. Default is *2*.
418
419 - --discrim: use of path-based discriminators to hash bits.
420 Default is *false*.
421
422 - -V: include valence information in the fingerprints
423 Default is *false*.
424
425 - -H: include Hs in the fingerprint
426 Default is *false*.
427
428 - --maxMols=val: sets the maximum number of molecules to be
429 fingerprinted.
430
431 - --useMACCS: use the public MACCS keys to do the fingerprinting
432 (instead of a daylight-type fingerprint)
433
434 """
435
# NOTE(review): the `def` line for this body is missing from this listing.
# The script entry point calls `ParseArgs()` with no arguments and the body
# tests `details is None`, so `details` is presumably an optional parameter
# defaulting to None -- confirm.
437 """ parses the command line arguments and returns a
438 _FingerprinterDetails_ instance with the results.
439
440 **Note**:
441
442 - If you make modifications here, please update the global
443 _usageDoc string so the Usage message is up to date.
444
445 - This routine is used by both the fingerprinter, the clusterer and the
446 screener; not all arguments make sense for all applications.
447
448 """
449 import sys,getopt
450 args = sys.argv[1:]
# NOTE(review): observations about the option tables below:
#   - the short option `-s` (takes a value) and the long option `--doScreen`
#     are accepted by getopt but no branch in the loop further down handles
#     them, so they are silently ignored
#   - the long option is registered as `bitsPerHash=` while the handling
#     branch tests for '--nBitsPerHash'; as written that branch can never
#     match what getopt returns for this table
451 try:
452 args,extras = getopt.getopt(args,'HVs:d:t:o:h',
453 [
454 'minSize=','maxSize=',
455 'density=',
456 'minPath=','maxPath=',
457 'bitsPerHash=',
458 'smilesName=',
459 'molPkl=',
460 'useSD',
461 'idName=',
462 'discrim',
463 'outTable=',
464 'outDbName=',
465 'fpColName=',
466 'maxMols=',
467 'useMACCS',
468 'keepTable',
469
470 'smilesTable=',
471 'doScreen=',
472 'topN=',
473 'thresh=',
474 'smiles=',
475 'dice',
476 'cosine',
477
478 'actTable=',
479 'actName=',
480 'SLINK',
481 'CLINK',
482 'UPGMA',
483
484 ])
485 except Exception:
# any parsing problem: show the traceback, then the usage text
486 import traceback
487 traceback.print_exc()
488 Usage()
489
# start from default settings unless the caller supplied an instance
490 if details is None:
491 details = FingerprinterDetails()
# the first positional argument, if any, is the input file name
492 if len(extras):
493 details.inFileName=extras[0]
494
# --- fingerprinting options ---
495 for arg,val in args:
496 if arg=='-H':
497 details.useHs=1
498 elif arg=='-V':
499 details.useValence=1
500 elif arg=='-d':
501 details.dbName = val
502 elif arg=='-t':
503 details.tableName = val
504 elif arg=='-o':
505 details.outFileName = val
506 elif arg=='--minSize':
507 details.minSize= int(val)
# --maxSize sets the base fingerprint size (stored as fpSize)
508 elif arg=='--maxSize':
509 details.fpSize= int(val)
510 elif arg=='--density':
511 details.tgtDensity = float(val)
512 elif arg=='--outTable':
513 details.outTableName = val
514 elif arg=='--outDbName':
515 details.outDbName = val
516 elif arg=='--fpColName':
517 details.fpColName = val
518 elif arg=='--minPath':
519 details.minPath= int(val)
520 elif arg=='--maxPath':
521 details.maxPath= int(val)
# NOTE(review): does not match the 'bitsPerHash=' entry registered above
522 elif arg=='--nBitsPerHash':
523 details.bitsPerHash= int(val)
524 elif arg=='--discrim':
525 details.discrimHash=1
526 elif arg=='--smilesName':
527 details.smilesName = val
528 elif arg=='--molPkl':
529 details.molPklName = val
# SD input and SMILES input are mutually exclusive
530 elif arg=='--useSD':
531 details.useSmiles=False
532 details.useSD=True
533 elif arg=='--idName':
534 details.idName = val
535 elif arg=='--maxMols':
536 details.maxMols = int(val)
537 elif arg=='--useMACCS':
538 details.fingerprinter = MACCSkeys.GenMACCSKeys
539 elif arg=='--keepTable':
540 details.replaceTable=False
541
542
# --- screening options ---
543 elif arg=='--smilesTable':
544 details.smilesTableName=val;
# --topN and --thresh toggle doThreshold in opposite directions, so the one
# that appears later on the command line wins
545 elif arg=='--topN':
546 details.doThreshold=0
547 details.topN=int(val)
548 elif arg=='--thresh':
549 details.doThreshold=1
550 details.screenThresh=float(val)
551 elif arg=='--smiles':
552 details.probeSmiles=val;
553 elif arg=='--dice':
554 details.metric = DataStructs.DiceSimilarity
555 elif arg=='--cosine':
556 details.metric = DataStructs.CosineSimilarity
557
558
# --- clustering options ---
559 elif arg=='--SLINK':
560 details.clusterAlgo = Murtagh.SLINK
561 elif arg=='--CLINK':
562 details.clusterAlgo = Murtagh.CLINK
563 elif arg=='--UPGMA':
564 details.clusterAlgo = Murtagh.UPGMA
565 elif arg=='--actTable':
566 details.actTableName = val
567 elif arg=='--actName':
568 details.actName = val
# -h: print the usage text
569 elif arg=='-h':
570 Usage()
571 return details
572
# Command-line entry point: announce the version, parse the command line
# into a details object and run the fingerprinting described by it.
if __name__ == '__main__':
    message("This is FingerprintMols version %s\n\n"%(__VERSION_STRING))
    details = ParseArgs()
    FingerprintsFromDetails(details)
577