Package rdkit ::
Package Chem ::
Module BuildFragmentCatalog
|
|
1
2
3
4
5
6
7
8
9
10
11 """ command line utility for working with FragmentCatalogs (CASE-type analysis)
12
13 **Usage**
14
15 BuildFragmentCatalog [optional args] <filename>
16
17 filename, the name of a delimited text file containing InData, is required
18 for some modes of operation (see below)
19
20 **Command Line Arguments**
21
22 - -n *maxNumMols*: specify the maximum number of molecules to be processed
23
24 - -b: build the catalog and OnBitLists
25 *requires InData*
26
27 - -s: score compounds
28 *requires InData and a Catalog, can use OnBitLists*
29
30 - -g: calculate info gains
31 *requires Scores*
32
33 - -d: show details about high-ranking fragments
34 *requires a Catalog and Gains*
35
36 - --catalog=*filename*: filename with the pickled catalog.
37 If -b is provided, this file will be overwritten.
38
39 - --onbits=*filename*: filename to hold the pickled OnBitLists.
40 If -b is provided, this file will be overwritten
41
42 - --scores=*filename*: filename to hold the text score data.
43 If -s is provided, this file will be overwritten
44
45 - --gains=*filename*: filename to hold the text gains data.
46 If -g is provided, this file will be overwritten
47
48 - --details=*filename*: filename to hold the text details data.
49 If -d is provided, this file will be overwritten.
50
51 - --minPath=2: specify the minimum length for a path
52
53 - --maxPath=6: specify the maximum length for a path
54
55 - --smiCol=1: specify which column in the input data file contains
56 SMILES
57
58 - --actCol=-1: specify which column in the input data file contains
59 activities
60
61 - --nActs=2: specify the number of possible activity values
62
63 - --nBits=-1: specify the maximum number of bits to show details for
64
65 """
66 from __future__ import print_function
67 import sys,os
68 from rdkit.six.moves import cPickle
69 from rdkit.six import next
70 from rdkit import Chem
71 from rdkit import RDConfig
72 from rdkit.Chem import FragmentCatalog
73 from rdkit.Dbase.DbConnection import DbConnect
74 import numpy
75 from rdkit.ML import InfoTheory
76 import types
77
78 _cvsVersion="$Revision$"
79 idx1 = _cvsVersion.find(':')+1
80 idx2 = _cvsVersion.rfind('$')
81 __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2])
82
85
86 -def BuildCatalog(suppl,maxPts=-1,groupFileName=None,
87 minPath=2,maxPath=6,reportFreq=10):
88 """ builds a fragment catalog from a set of molecules in a delimited text block
89
90 **Arguments**
91
92 - suppl: a mol supplier
93
94 - maxPts: (optional) if provided, this will set an upper bound on the
95 number of points to be considered
96
97 - groupFileName: (optional) name of the file containing functional group
98 information
99
100 - minPath, maxPath: (optional) names of the minimum and maximum path lengths
101 to be considered
102
103 - reportFreq: (optional) how often to display status information
104
105 **Returns**
106
107 a FragmentCatalog
108
109 """
110 if groupFileName is None:
111 groupFileName = os.path.join(RDConfig.RDDataDir,"FunctionalGroups.txt")
112
113 fpParams = FragmentCatalog.FragCatParams(minPath,maxPath,groupFileName)
114 catalog = FragmentCatalog.FragCatalog(fpParams)
115 fgen = FragmentCatalog.FragCatGenerator()
116 if maxPts >0:
117 nPts = maxPts
118 else:
119 if hasattr(suppl,'__len__'):
120 nPts = len(suppl)
121 else:
122 nPts = -1
123 for i,mol in enumerate(suppl):
124 if i == nPts:
125 break
126 if i and not i%reportFreq:
127 if nPts>-1:
128 message('Done %d of %d, %d paths\n'%(i,nPts,catalog.GetFPLength()))
129 else:
130 message('Done %d, %d paths\n'%(i,catalog.GetFPLength()))
131 fgen.AddFragsFromMol(mol,catalog)
132 return catalog
133
134 -def ScoreMolecules(suppl,catalog,maxPts=-1,actName='',acts=None,
135 nActs=2,reportFreq=10):
136 """ scores the compounds in a supplier using a catalog
137
138 **Arguments**
139
140 - suppl: a mol supplier
141
142 - catalog: the FragmentCatalog
143
144 - maxPts: (optional) the maximum number of molecules to be
145 considered
146
147 - actName: (optional) the name of the molecule's activity property.
148 If this is not provided, the molecule's last property will be used.
149
150 - acts: (optional) a sequence of activity values (integers).
151 If not provided, the activities will be read from the molecules.
152
153 - nActs: (optional) number of possible activity values
154
155 - reportFreq: (optional) how often to display status information
156
157 **Returns**
158
159 a 2-tuple:
160
161 1) the results table (a 3D array of ints nBits x 2 x nActs)
162
163 2) a list containing the on bit lists for each molecule
164
165 """
166 nBits = catalog.GetFPLength()
167 resTbl = numpy.zeros((nBits,2,nActs),numpy.int)
168 obls = []
169
170 if not actName and not acts:
171 actName = suppl[0].GetPropNames()[-1]
172
173
174 fpgen = FragmentCatalog.FragFPGenerator()
175 suppl.reset()
176 i = 1
177 for mol in suppl:
178 if i and not i%reportFreq:
179 message('Done %d.\n'%(i))
180 if mol:
181 if not acts:
182 act = int(mol.GetProp(actName))
183 else:
184 act = acts[i-1]
185 fp = fpgen.GetFPForMol(mol,catalog)
186 obls.append([x for x in fp.GetOnBits()])
187 for j in range(nBits):
188 resTbl[j,0,act] += 1
189 for id in obls[i-1]:
190 resTbl[id-1,0,act] -= 1
191 resTbl[id-1,1,act] += 1
192 else:
193 obls.append([])
194 i+=1
195 return resTbl,obls
196
197 -def ScoreFromLists(bitLists,suppl,catalog,maxPts=-1,actName='',acts=None,
198 nActs=2,reportFreq=10):
199 """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists
200 for the molecules (this speeds things up a lot)
201
202
203 **Arguments**
204
205 - bitLists: sequence of on bit sequences for the input molecules
206
207 - suppl: the input supplier (we read activities from here)
208
209 - catalog: the FragmentCatalog
210
211 - maxPts: (optional) the maximum number of molecules to be
212 considered
213
214 - actName: (optional) the name of the molecule's activity property.
215 If this is not provided, the molecule's last property will be used.
216
217 - nActs: (optional) number of possible activity values
218
219 - reportFreq: (optional) how often to display status information
220
221 **Returns**
222
223 the results table (a 3D array of ints nBits x 2 x nActs)
224
225 """
226 nBits = catalog.GetFPLength()
227 if maxPts >0:
228 nPts = maxPts
229 else:
230 nPts = len(bitLists)
231 resTbl = numpy.zeros((nBits,2,nActs),numpy.int)
232 if not actName and not acts:
233 actName = suppl[0].GetPropNames()[-1]
234 suppl.reset()
235 for i in range(1,nPts+1):
236 mol = next(suppl)
237 if not acts:
238 act = int(mol.GetProp(actName))
239 else:
240 act = acts[i-1]
241 if i and not i%reportFreq:
242 message('Done %d of %d\n'%(i,nPts))
243 ids = set()
244 for id in bitLists[i-1]:
245 ids.add(id-1)
246 for j in range(nBits):
247 resTbl[j,0,act] += 1
248 for id in ids:
249 resTbl[id,0,act] -= 1
250 resTbl[id,1,act] += 1
251 return resTbl
252
253 -def CalcGains(suppl,catalog,topN=-1,actName='',acts=None,
254 nActs=2,reportFreq=10,biasList=None,collectFps=0):
255 """ calculates info gains by constructing fingerprints
256 *DOC*
257
258 Returns a 2-tuple:
259 1) gains matrix
260 2) list of fingerprints
261
262 """
263 nBits = catalog.GetFPLength()
264 if topN < 0:
265 topN = nBits
266 if not actName and not acts:
267 actName = suppl[0].GetPropNames()[-1]
268
269 gains = [0]*nBits
270 if hasattr(suppl,'__len__'):
271 nMols = len(suppl)
272 else:
273 nMols = -1
274 fpgen = FragmentCatalog.FragFPGenerator()
275
276 if biasList:
277 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY)
278 ranker.SetBiasList(biasList)
279 else:
280 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY)
281 i = 0
282 fps = []
283 for mol in suppl:
284 if not acts:
285 try:
286 act = int(mol.GetProp(actName))
287 except KeyError:
288 message('ERROR: Molecule has no property: %s\n'%(actName))
289 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames())))
290 raise KeyError(actName)
291 else:
292 act = acts[i]
293 if i and not i%reportFreq:
294 if nMols>0:
295 message('Done %d of %d.\n'%(i,nMols))
296 else:
297 message('Done %d.\n'%(i))
298 fp = fpgen.GetFPForMol(mol,catalog)
299 ranker.AccumulateVotes(fp,act)
300 i+=1;
301 if collectFps:
302 fps.append(fp)
303 gains = ranker.GetTopN(topN)
304 return gains,fps
305
306 -def CalcGainsFromFps(suppl,fps,topN=-1,actName='',acts=None,
307 nActs=2,reportFreq=10,biasList=None):
308 """ calculates info gains from a set of fingerprints
309
310 *DOC*
311
312 """
313 nBits = len(fps[0])
314 if topN < 0:
315 topN = nBits
316 if not actName and not acts:
317 actName = suppl[0].GetPropNames()[-1]
318
319 gains = [0]*nBits
320 if hasattr(suppl,'__len__'):
321 nMols = len(suppl)
322 else:
323 nMols = -1
324 if biasList:
325 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY)
326 ranker.SetBiasList(biasList)
327 else:
328 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY)
329 for i,mol in enumerate(suppl):
330 if not acts:
331 try:
332 act = int(mol.GetProp(actName))
333 except KeyError:
334 message('ERROR: Molecule has no property: %s\n'%(actName))
335 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames())))
336 raise KeyError(actName)
337 else:
338 act = acts[i]
339 if i and not i%reportFreq:
340 if nMols>0:
341 message('Done %d of %d.\n'%(i,nMols))
342 else:
343 message('Done %d.\n'%(i))
344 fp = fps[i]
345 ranker.AccumulateVotes(fp,act)
346 gains = ranker.GetTopN(topN)
347 return gains
348
350 actHeaders = ['Act-%d'%(x) for x in range(nActs)]
351 if cat:
352 outF.write('id,Description,Gain,%s\n'%(','.join(actHeaders)))
353 else:
354 outF.write('id,Gain,%s\n'%(','.join(actHeaders)))
355 for entry in gains:
356 id = int(entry[0])
357 outL = [str(id)]
358 if cat:
359 descr = cat.GetBitDescription(id)
360 outL.append(descr)
361 outL.append('%.6f'%entry[1])
362 outL += ['%d'%x for x in entry[2:]]
363 outF.write(','.join(outL))
364 outF.write('\n')
365
366
368 """ reads a list of ids and info gains out of an input file
369
370 """
371 res = []
372 inL = inF.readline()
373 for line in inF.xreadlines():
374 splitL = line.strip().split(delim)
375 res.append((splitL[idCol],float(splitL[gainCol])))
376 return res
377
378
379 -def ShowDetails(catalog,gains,nToDo=-1,outF=sys.stdout,idCol=0,gainCol=1,
380 outDelim=','):
381 """
382 gains should be a sequence of sequences. The idCol entry of each
383 sub-sequence should be a catalog ID. _ProcessGainsData()_ provides
384 suitable input.
385
386 """
387 if nToDo < 0:
388 nToDo = len(gains)
389 for i in range(nToDo):
390 id = int(gains[i][idCol])
391 gain = float(gains[i][gainCol])
392 descr = catalog.GetFragDescription(id)
393 if descr:
394 outF.write('%s\n'%(outDelim.join((str(id),descr,str(gain)))))
395
431
432
434 print("This is BuildFragmentCatalog version %s"%(__VERSION_STRING))
435 print('usage error')
436
437 sys.exit(-1)
438
466
468 import getopt
469 try:
470 args,extras = getopt.getopt(sys.argv[1:],'n:d:cst',
471 ['catalog=','onbits=',
472 'scoresFile=','gainsFile=','detailsFile=','fpFile=',
473 'minPath=','maxPath=','smiCol=','actCol=','nameCol=','nActs=',
474 'nBits=','biasList=','topN=',
475 'build','sigs','gains','details','score','noTitle'])
476 except Exception:
477 sys.stderr.write('Error parsing command line:\n')
478 import traceback
479 traceback.print_exc()
480 Usage()
481 for arg,val in args:
482 if arg=='-n':
483 details.numMols=int(val)
484 elif arg=='-c':
485 details.delim=','
486 elif arg=='-s':
487 details.delim=' '
488 elif arg=='-t':
489 details.delim='\t'
490 elif arg=='-d':
491 details.dbName=val
492 elif arg=='--build':
493 details.doBuild=1
494 elif arg=='--score':
495 details.doScore=1
496 elif arg=='--gains':
497 details.doGains=1
498 elif arg=='--sigs':
499 details.doSigs=1
500 elif arg=='-details':
501 details.doDetails=1
502 elif arg=='--catalog':
503 details.catalogName=val
504 elif arg=='--onbits':
505 details.onBitsName=val
506 elif arg=='--scoresFile':
507 details.scoresName=val
508 elif arg=='--gainsFile':
509 details.gainsName=val
510 elif arg=='--detailsFile':
511 details.detailsName=val
512 elif arg=='--fpFile':
513 details.fpName=val
514 elif arg=='--minPath':
515 details.minPath=int(val)
516 elif arg=='--maxPath':
517 details.maxPath=int(val)
518 elif arg=='--smiCol':
519 try:
520 details.smiCol=int(val)
521 except ValueError:
522 details.smiCol=val
523 elif arg=='--actCol':
524 try:
525 details.actCol=int(val)
526 except ValueError:
527 details.actCol=val
528 elif arg=='--nameCol':
529 try:
530 details.nameCol=int(val)
531 except ValueError:
532 details.nameCol=val
533 elif arg=='--nActs':
534 details.nActs=int(val)
535 elif arg=='--nBits':
536 details.nBits=int(val)
537 elif arg=='--noTitle':
538 details.hasTitle=0
539 elif arg=='--biasList':
540 details.biasList=tuple(eval(val))
541 elif arg=='--topN':
542 details.topN=int(val)
543 elif arg=='-h':
544 Usage()
545 sys.exit(0)
546 else:
547 Usage()
548 if len(extras):
549 if details.dbName:
550 details.tableName=extras[0]
551 else:
552 details.inFileName = extras[0]
553 else:
554 Usage()
555 if __name__=='__main__':
556 import time
557 details = RunDetails()
558 ParseArgs(details)
559 from io import StringIO
560 suppl = SupplierFromDetails(details)
561
562 cat = None
563 obls = None
564 if details.doBuild:
565 if not suppl:
566 message("We require inData to generate a catalog\n")
567 sys.exit(-2)
568 message("Building catalog\n")
569 t1 = time.time()
570 cat = BuildCatalog(suppl,maxPts=details.numMols,
571 minPath=details.minPath,maxPath=details.maxPath)
572 t2 = time.time()
573 message("\tThat took %.2f seconds.\n"%(t2-t1))
574 if details.catalogName:
575 message("Dumping catalog data\n")
576 cPickle.dump(cat,open(details.catalogName,'wb+'))
577 elif details.catalogName:
578 message("Loading catalog\n")
579 cat = cPickle.load(open(details.catalogName,'rb'))
580 if details.onBitsName:
581 try:
582 obls = cPickle.load(open(details.onBitsName,'rb'))
583 except Exception:
584 obls = None
585 else:
586 if len(obls)<(inD.count('\n')-1):
587 obls = None
588 scores = None
589 if details.doScore:
590 if not suppl:
591 message("We require inData to score molecules\n")
592 sys.exit(-2)
593 if not cat:
594 message("We require a catalog to score molecules\n")
595 sys.exit(-2)
596 message("Scoring compounds\n")
597 if not obls or len(obls)<details.numMols:
598 scores,obls = ScoreMolecules(suppl,cat,maxPts=details.numMols,
599 actName=details.actCol,
600 nActs=details.nActs)
601 if details.scoresName:
602 cPickle.dump(scores,open(details.scoresName,'wb+'))
603 if details.onBitsName:
604 cPickle.dump(obls,open(details.onBitsName,'wb+'))
605 else:
606 scores = ScoreFromLists(obls,suppl,cat,maxPts=details.numMols,
607 actName=details.actCol,
608 nActs=details.nActs)
609 elif details.scoresName:
610 scores = cPickle.load(open(details.scoresName,'rb'))
611
612 if details.fpName and os.path.exists(details.fpName) and not details.doSigs:
613 message("Reading fingerprints from file.\n")
614 fps = cPickle.load(open(details.fpName,'rb'))
615 else:
616 fps = []
617 gains = None
618 if details.doGains:
619 if not suppl:
620 message("We require inData to calculate gains\n")
621 sys.exit(-2)
622 if not (cat or fps):
623 message("We require either a catalog or fingerprints to calculate gains\n")
624 sys.exit(-2)
625 message("Calculating Gains\n")
626 t1 = time.time()
627 if details.fpName:
628 collectFps=1
629 else:
630 collectFps=0
631 if not fps:
632 gains,fps = CalcGains(suppl,cat,topN=details.topN,actName=details.actCol,
633 nActs=details.nActs,biasList=details.biasList,
634 collectFps=collectFps)
635 if details.fpName:
636 message("Writing fingerprint file.\n")
637 tmpF=open(details.fpName,'wb+')
638 cPickle.dump(fps,tmpF,1)
639 tmpF.close()
640 else:
641 gains = CalcGainsFromFps(suppl,fps,topN=details.topN,actName=details.actCol,
642 nActs=details.nActs,biasList=details.biasList)
643 t2=time.time()
644 message("\tThat took %.2f seconds.\n"%(t2-t1))
645 if details.gainsName:
646 outF = open(details.gainsName,'w+')
647 OutputGainsData(outF,gains,cat,nActs=details.nActs)
648 else:
649 if details.gainsName:
650 inF = open(details.gainsName,'r')
651 gains = ProcessGainsData(inF)
652
653
654 if details.doDetails:
655 if not cat:
656 message("We require a catalog to get details\n")
657 sys.exit(-2)
658 if not gains:
659 message("We require gains data to get details\n")
660 sys.exit(-2)
661 io = StringIO()
662 io.write('id,SMILES,gain\n')
663 ShowDetails(cat,gains,nToDo=details.nBits,outF=io)
664 if details.detailsName:
665 open(details.detailsName,'w+').write(io.getvalue())
666 else:
667 sys.stderr.write(io.getvalue())
668