1
2
3
4
5
6
7
8
9
10
""" Utility functionality for molecular similarity.
Includes a command-line application for screening databases.
13
14
15 Sample Usage:
16
17 python MolSimilarity.py -d data.gdb -t daylight_sig --idName="Mol_ID" \
18 --topN=100 --smiles='c1(C=O)ccc(Oc2ccccc2)cc1' --smilesTable=raw_dop_data \
19 --smilesName="structure" -o results.csv
20
21 """
22 from rdkit import RDConfig
23 from rdkit import DataStructs
24 from rdkit import Chem
25 from rdkit.Dbase.DbConnection import DbConnect
26 from rdkit.Dbase import DbModule
27 from rdkit.DataStructs.TopNContainer import TopNContainer
28 import sys,types
29 from rdkit.six.moves import cPickle
30 from rdkit.Chem.Fingerprints import FingerprintMols,DbFpSupplier
31 try:
32 from rdkit.VLib.NodeLib.DbPickleSupplier import _lazyDataSeq as _dataSeq
33 except ImportError:
34 _dataSeq=None
35
36
37 from rdkit import DataStructs
38
# Version text taken from the version-control "$Id$" keyword: the slice runs
# from just after the first ':' (or from the start when there is none) up to,
# but not including, the last '$'.
_cvsVersion = "$Id$"
idx1 = 1 + _cvsVersion.find(':')
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2],)
43
44
def _ConstructSQL(details, extraFields=''):
    """Build the SQL ``select`` statement used to pull screening data.

    The id column of ``details.tableName`` is always selected.  When
    ``details.smilesTableName`` and/or ``details.actTableName`` is set, that
    table is joined on the id column (aliased ``smi`` / ``act``) and, if
    ``details.smilesName`` / ``details.actName`` is set as well, that column
    is added to the selected fields.

    Arguments:
      - details: object providing ``tableName``, ``idName``,
        ``smilesTableName``, ``smilesName``, ``actTableName`` and ``actName``
      - extraFields: optional text appended verbatim (after a comma) to the
        field list

    Returns:
      the SQL command as a string

    NOTE: table and column names are interpolated directly into the SQL
    text, so they must come from a trusted source.
    """
    fields = ['%s.%s' % (details.tableName, details.idName)]
    joins = []

    if details.smilesTableName:
        if details.smilesName:
            fields.append('%s' % (details.smilesName,))
        joins.append('join %s smi on smi.%s=%s.%s' % (details.smilesTableName,
                                                       details.idName,
                                                       details.tableName,
                                                       details.idName))
    if details.actTableName:
        if details.actName:
            fields.append('%s' % (details.actName,))
        joins.append('join %s act on act.%s=%s.%s' % (details.actTableName,
                                                       details.idName,
                                                       details.tableName,
                                                       details.idName))
    if extraFields:
        fields.append(extraFields)

    # The join clauses have to be separated by whitespace; concatenating them
    # directly fuses two joins into invalid SQL ("...=tbl.idjoin acts act on").
    return 'select %s from %s %s' % (','.join(fields), details.tableName, ' '.join(joins))
67
def ScreenInDb(details, mol):
    """Screen the database for neighbors of ``mol``, scoring inside the database.

    The probe molecule is fingerprinted with
    ``FingerprintMols.FingerprintMol`` (``details.__dict__`` supplies the
    keyword arguments).  If ``details.metric`` is the Tanimoto, Dice or
    Cosine similarity from ``DataStructs``, the score is computed by the
    matching database function (``rd_tanimoto``, ``rd_dice`` or
    ``rd_cosine``) in a single query.  Any other metric falls back to
    ``GetFingerprints`` + ``ScreenFingerprints``.

    Arguments:
      - details: screening options; reads ``metric``, ``dbName``,
        ``tableName``, ``fpColName``, ``doThreshold``, ``screenThresh``,
        ``topN`` and, when present, ``dbUser`` / ``dbPassword``
      - mol: the probe molecule

    Returns:
      the rows fetched from the database ordered by decreasing score (the
      score column is aliased ``tani`` whatever the metric), or the
      ``ScreenFingerprints`` result on the fallback path.  An empty list is
      returned when the probe cannot be fingerprinted or no database
      connection is available.
    """
    try:
        # direct call with keyword expansion; the apply() builtin is gone in Python 3
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []

    # metrics that have a database-side implementation
    dbFuncs = ((DataStructs.TanimotoSimilarity, 'rd_tanimoto'),
               (DataStructs.DiceSimilarity, 'rd_dice'),
               (DataStructs.CosineSimilarity, 'rd_cosine'))
    func = None
    for metric, funcName in dbFuncs:
        if details.metric == metric:
            func = funcName
            break
    if func is None:
        # no database-side implementation: score the fingerprints in Python
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    conn = None
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' %
                                  (details.dbName, details.tableName))
            traceback.print_exc()
            return []
    if conn is None:
        # neither a database nor a table was named, so there is nothing to query
        FingerprintMols.error('Error: no database connection available for screening.\n')
        return []

    pkl = probeFp.ToBitString()
    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder, details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        # wrap the query in a subquery so the "tani" alias can be filtered on
        # NOTE(review): this uses a strict ">" while ScreenFingerprints keeps
        # scores ">=" the threshold -- confirm which one is intended.
        cmd = "select * from (%s) tmp where tani>%f" % (cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN

    curs = conn.GetCursor()
    # the probe fingerprint is bound to the placeholder inside extraFields
    curs.execute(cmd, (pkl,))
    return curs.fetchall()
119
def GetFingerprints(details):
    """Return an iterable sequence of fingerprints.

    The data source is chosen from ``details``:

      - ``dbName`` and ``tableName`` set: fingerprints are pulled from the
        database using the query built by ``_ConstructSQL`` (with
        ``details.fpColName`` as the extra column).
      - otherwise, ``inFileName`` set: the file is read as a series of
        pickled ``(id, fingerprint)`` tuples; each fingerprint gets a
        ``_fieldsFromDb`` member whose first entry is the id, and a list of
        the fingerprints is returned.
      - otherwise ``None`` is returned.

    If the database connection cannot be established or the file cannot be
    opened, the error is reported through ``FingerprintMols.error`` and an
    empty list is returned.
    """
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' %
                                  (details.dbName, details.tableName))
            traceback.print_exc()
            # without a connection there is nothing to read from
            return []
        cmd = _ConstructSQL(details, extraFields=details.fpColName)
        curs = conn.GetCursor()

        if _dataSeq:
            suppl = _dataSeq(curs, cmd, depickle=not details.noPickle,
                             klass=DataStructs.ExplicitBitVect)
            # presumably stored so the connection outlives this function
            # while the lazy sequence is still being consumed -- TODO confirm
            _dataSeq._conn = conn
        else:
            # NOTE(review): `data` is not defined anywhere in this function,
            # so this branch raises NameError whenever _dataSeq is
            # unavailable.  The result set to pass here needs to be confirmed
            # against DbFpSupplier before this can be fixed.
            suppl = DbFpSupplier.ForwardDbFpSupplier(data, fpColName=details.fpColName)
    elif details.inFileName:
        suppl = []
        try:
            # pickles are binary data: text mode breaks loading on Python 3
            inF = open(details.inFileName, 'rb')
        except IOError:
            import traceback
            FingerprintMols.error('Error: Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
            return suppl

        with inF:
            while True:
                try:
                    # NOTE: unpickling can execute arbitrary code; only read
                    # fingerprint files that come from a trusted source.
                    molId, fp = cPickle.load(inF)
                except EOFError:
                    # normal termination: every record has been read
                    break
                except Exception:
                    # damaged record: report it and keep what was read so far
                    import traceback
                    FingerprintMols.error('Error: Problems reading from file %s\n' %
                                          (details.inFileName))
                    traceback.print_exc()
                    break
                fp._fieldsFromDb = [molId]
                suppl.append(fp)
    else:
        suppl = None

    return suppl
170
def ScreenFingerprints(details, data, mol=None, probeFp=None):
    """Score every fingerprint in ``data`` against the probe and return the hits.

    Arguments:
      - details: screening options; reads ``metric``, ``noPickle``,
        ``doThreshold``, ``screenThresh``, ``topN`` and, when present,
        ``stopAfter`` (maximum number of data points to examine)
      - data: iterable of data points.  When ``details.noPickle`` is false
        each point is either an ``(id, fingerprint)`` tuple/list or a
        fingerprint carrying the id in ``_fieldsFromDb[0]``; when it is true
        each point is an ``(id, pickle)`` pair whose pickle is passed to the
        metric as a string.
      - mol: probe molecule, only used to build the probe fingerprint when
        ``probeFp`` is not given
      - probeFp: optional precomputed probe fingerprint

    Returns:
      a list of ``(id, score)`` tuples.  With ``doThreshold`` set, only
      scores >= ``screenThresh`` are kept; otherwise, if ``topN`` > 0, the
      contents of a ``TopNContainer`` of that size are returned; otherwise
      every point is returned.  The list is empty when the probe cannot be
      fingerprinted, the probe fingerprint is empty, or ``data`` is None.
    """
    if data is None:
        # GetFingerprints returns None when no data source is configured
        FingerprintMols.error('Error: no fingerprint data available for screening.\n')
        return []

    if probeFp is None:
        try:
            # direct call with keyword expansion; apply() is gone in Python 3
            probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
        except Exception:
            import traceback
            FingerprintMols.error('Error: problems fingerprinting molecule.\n')
            traceback.print_exc()
            return []
    if not probeFp:
        return []

    # Track top-N mode with an explicit flag so it does not depend on the
    # truthiness of the container.
    useTopN = (not details.doThreshold) and details.topN > 0
    topN = TopNContainer(details.topN) if useTopN else None
    hasLimit = hasattr(details, 'stopAfter')

    res = []
    count = 0
    for pt in data:
        if not details.noPickle:
            if isinstance(pt, (tuple, list)):
                molId, fp = pt
            else:
                fp = pt
                molId = pt._fieldsFromDb[0]
            score = DataStructs.FingerprintSimilarity(probeFp, fp, details.metric)
        else:
            molId, pkl = pt
            score = details.metric(probeFp, str(pkl))

        if useTopN:
            topN.Insert(score, molId)
        elif not details.doThreshold or score >= details.screenThresh:
            res.append((molId, score))

        count += 1
        if hasLimit and count >= details.stopAfter:
            break

    if useTopN:
        # the container yields (score, id); results are reported as (id, score)
        for score, molId in topN:
            res.append((molId, score))

    return res
217
def ScreenFromDetails(details, mol=None):
    """Run a similarity screen as described by ``details`` and return the results.

    The probe molecule is, in order of preference, ``mol``,
    ``details.probeMol``, or the molecule built from ``details.probeSmiles``.
    Screening is done by ``ScreenInDb`` when ``details.useDbSimilarity`` is
    present and true, otherwise by ``GetFingerprints`` +
    ``ScreenFingerprints``.  If ``details.outFileName`` is set, each result
    is also written to that file as one comma-separated line.

    Arguments:
      - details: screening options (see above)
      - mol: optional probe molecule

    Returns:
      the list of results, or None when no probe molecule could be obtained
      or the output file could not be opened.
    """
    if not mol:
        if details.probeMol:
            mol = details.probeMol
        else:
            smi = details.probeSmiles
            try:
                mol = Chem.MolFromSmiles(smi)
            except Exception:
                import traceback
                FingerprintMols.error('Error: problems generating molecule for smiles: %s\n' %
                                      (smi))
                traceback.print_exc()
                return None
    if not mol:
        return None

    # open the output first so an unwritable path fails before any screening
    outF = None
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'w+')
        except IOError:
            FingerprintMols.error("Error: could not open output file %s for writing\n" %
                                  (details.outFileName))
            return None

    try:
        if getattr(details, 'useDbSimilarity', False):
            res = ScreenInDb(details, mol)
        else:
            data = GetFingerprints(details)
            res = ScreenFingerprints(details, data, mol)
        if outF is not None:
            for pt in res:
                outF.write(','.join([str(x) for x in pt]))
                outF.write('\n')
    finally:
        # always release the file so the results are flushed to disk
        if outF is not None:
            outF.close()
    return res
256
257 _usageDoc="""
258 Usage: MolSimilarity.py [args] <fName>
259
260 If <fName> is provided and no tableName is specified (see below),
261 data will be read from the pickled file <fName>. This file should
262 contain a series of pickled (id,fingerprint) tuples.
263
264 NOTE: at the moment the user is responsible for ensuring that the
265 fingerprint parameters given at run time (used to fingerprint the
266 probe molecule) match those used to generate the input fingerprints.
267
268 Command line arguments are:
269 - --smiles=val: sets the SMILES for the input molecule. This is
270 a required argument.
271
272 - -d _dbName_: set the name of the database from which
273 to pull input fingerprint information.
274
275 - -t _tableName_: set the name of the database table
276 from which to pull input fingerprint information
277
278 - --smilesTable=val: sets the name of the database table
279 which contains SMILES for the input fingerprints. If this
280 information is provided along with smilesName (see below),
281 the output file will contain SMILES data
282
283 - --smilesName=val: sets the name of the SMILES column
284 in the input database. Default is *SMILES*.
285
286 - --topN=val: sets the number of results to return.
287 Default is *10*.
288
289 - --thresh=val: sets the similarity threshold.
290
291 - --idName=val: sets the name of the id column in the input
292 database. Default is *ID*.
293
- -o _outFileName_: name of the output file (output will
be a CSV file with one line for each of the output molecules)
296
297 - --dice: use the DICE similarity metric instead of Tanimoto
298
299 - --cosine: use the cosine similarity metric instead of Tanimoto
300
- --fpColName=val: name of the column which stores
fingerprints (in pickled format) in the input db table.
Default is *AutoFragmentFP*
304
305 - --minPath=val: minimum path length to be included in
306 fragment-based fingerprints. Default is *1*.
307
308 - --maxPath=val: maximum path length to be included in
309 fragment-based fingerprints. Default is *7*.
310
- --nBitsPerHash=val: number of bits to be set in the output
fingerprint for each fragment. Default is *4*.
313
314 - --discrim: use of path-based discriminators to hash bits.
315 Default is *false*.
316
317 - -V: include valence information in the fingerprints
318 Default is *false*.
319
320 - -H: include Hs in the fingerprint
321 Default is *false*.
322
323 - --useMACCS: use the public MACCS keys to do the fingerprinting
324 (instead of a daylight-type fingerprint)
325
326
327 """
if __name__ == '__main__':
    # Command-line entry point: print the version banner, hand this module's
    # usage text to FingerprintMols, then parse the arguments and screen.
    FingerprintMols.message("This is MolSimilarity version %s\n\n" % (__VERSION_STRING,))
    FingerprintMols._usageDoc = _usageDoc
    details = FingerprintMols.ParseArgs()
    ScreenFromDetails(details)
333