1
2
3
4
5
6 from __future__ import print_function
7 from rdkit import RDConfig
8 from rdkit import DataStructs
9 from rdkit.DataStructs.TopNContainer import TopNContainer
10 import bisect
11
# Default value for the cached picks (presumably a class-level attribute; the
# enclosing class line is not visible in this view). NOTE(review): the pickers
# further down treat None as "picks not computed yet" -- confirm the same
# convention applies here.
13 _picks = None
# Fails unconditionally: per its message, GenericPicker is a virtual base
# class, so this code path is meant to be overridden by concrete pickers (the
# enclosing def line is not visible in this view).
15 raise NotImplementedError("GenericPicker is a virtual base class")
24
26 """ A class for picking the top N overall best matches across a library
27
28 Connect to a database and build molecules:
29 >>> from rdkit import Chem
30 >>> import os.path
31 >>> from rdkit.Dbase.DbConnection import DbConnect
32 >>> dbName = RDConfig.RDTestDatabase
33 >>> conn = DbConnect(dbName,'simple_mols1')
34 >>> [x.upper() for x in conn.GetColumnNames()]
35 ['SMILES', 'ID']
36 >>> mols = []
37 >>> for smi,id in conn.GetData():
38 ... mol = Chem.MolFromSmiles(str(smi))
39 ... mol.SetProp('_Name',str(id))
40 ... mols.append(mol)
41 >>> len(mols)
42 12
43
44 Calculate fingerprints:
45 >>> probefps = []
46 >>> for mol in mols:
47 ... fp = Chem.RDKFingerprint(mol)
48 ... fp._id = mol.GetProp('_Name')
49 ... probefps.append(fp)
50
51 Start by finding the top matches for a single probe. This ether should pull
52 other ethers from the db:
53 >>> mol = Chem.MolFromSmiles('COC')
54 >>> probeFp = Chem.RDKFingerprint(mol)
55 >>> picker = TopNOverallPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
56 >>> len(picker)
57 2
58 >>> fp,score = picker[0]
59 >>> id = fp._id
60 >>> str(id)
61 'ether-1'
62 >>> score
63 1.0
64
65 The results come back in order:
66 >>> fp,score = picker[1]
67 >>> id = fp._id
68 >>> str(id)
69 'ether-2'
70
71 Now find the top 3 matches for 2 probes. The picks are drawn from both the ethers and the acids:
72 >>> fps = []
73 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
74 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
75 >>> picker = TopNOverallPicker(numToPick=3,probeFps=fps,dataSet=probefps)
76 >>> len(picker)
77 3
78 >>> fp,score = picker[0]
79 >>> id = fp._id
80 >>> str(id)
81 'acid-1'
82 >>> fp,score = picker[1]
83 >>> id = fp._id
84 >>> str(id)
85 'ether-1'
86 >>> score
87 1.0
88 >>> fp,score = picker[2]
89 >>> id = fp._id
90 >>> str(id)
91 'acid-2'
92
93 """
96 """
97
98 dataSet should be a sequence of BitVectors
99
100 """
# Store the search configuration on the instance; no similarity is computed
# at this point.
101 self.numToPick = numToPick
102 self.probes = probeFps
103 self.data = dataSet
104 self.simMetric = simMetric
# None marks the picks as not yet calculated; the pick-building code checks
# this value before doing any work.
105 self._picks = None
106
# Reuse the cached picks unless the caller asked for a forced recalculation.
108 if self._picks is not None and not force:
109 return
# Container constructed with numToPick; it receives one (score, fingerprint)
# entry for every fingerprint in the data set.
110 picks = TopNContainer(self.numToPick)
111 for fp in self.data:
112 origFp = fp
# Each data fingerprint is scored by its best similarity to any of the probes;
# -1.0 is only the starting value for the running maximum.
113 bestScore = -1.0
114 for probeFp in self.probes:
115 score = DataStructs.FingerprintSimilarity(origFp,probeFp,
116 self.simMetric)
117 bestScore = max(score,bestScore)
118 picks.Insert(bestScore,fp)
# Copy the container's (score, item) pairs out as (item, score) tuples, then
# reverse the list. NOTE(review): presumably the container iterates in
# ascending score order, so the reverse puts the best match first -- confirm
# against TopNContainer.
119 self._picks = []
120 for score,pt in picks:
121 self._picks.append((pt,score))
122 self._picks.reverse()
123
125 """ A class for picking the best matches across a library, taking the probes in turn
126
127 Connect to a database:
128 >>> from rdkit import Chem
129 >>> import os.path
130 >>> from rdkit.Dbase.DbConnection import DbConnect
131 >>> dbName = RDConfig.RDTestDatabase
132 >>> conn = DbConnect(dbName,'simple_mols1')
133 >>> [x.upper() for x in conn.GetColumnNames()]
134 ['SMILES', 'ID']
135 >>> mols = []
136 >>> for smi,id in conn.GetData():
137 ... mol = Chem.MolFromSmiles(str(smi))
138 ... mol.SetProp('_Name',str(id))
139 ... mols.append(mol)
140 >>> len(mols)
141 12
142
143 Calculate fingerprints:
144 >>> probefps = []
145 >>> for mol in mols:
146 ... fp = Chem.RDKFingerprint(mol)
147 ... fp._id = mol.GetProp('_Name')
148 ... probefps.append(fp)
149
150 Start by finding the top matches for a single probe. This ether should pull
151 other ethers from the db:
152 >>> mol = Chem.MolFromSmiles('COC')
153 >>> probeFp = Chem.RDKFingerprint(mol)
154 >>> picker = SpreadPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
155 >>> len(picker)
156 2
157 >>> fp,score = picker[0]
158 >>> id = fp._id
159 >>> str(id)
160 'ether-1'
161 >>> score
162 1.0
163
164 The results come back in order:
165 >>> fp,score = picker[1]
166 >>> id = fp._id
167 >>> str(id)
168 'ether-2'
169
170 Now find the top 3 matches for 2 probes. The picks alternate between the two probes, so ethers and acids are interleaved:
171 >>> fps = []
172 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
173 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
174 >>> picker = SpreadPicker(numToPick=3,probeFps=fps,dataSet=probefps)
175 >>> len(picker)
176 3
177 >>> fp,score = picker[0]
178 >>> id = fp._id
179 >>> str(id)
180 'ether-1'
181 >>> score
182 1.0
183 >>> fp,score = picker[1]
184 >>> id = fp._id
185 >>> str(id)
186 'acid-1'
187 >>> score
188 1.0
189 >>> fp,score = picker[2]
190 >>> id = fp._id
191 >>> str(id)
192 'ether-2'
193
194 """
198 """
199
200 dataSet should be a sequence of BitVectors or, if expectPickles
201 is False, a set of strings that can be converted to bit vectors
202
203 """
# Store the search configuration on the instance; no similarity is computed
# at this point.
204 self.numToPick = numToPick
205 self.probes = probeFps
206 self.data = dataSet
207 self.simMetric = simMetric
# NOTE(review): expectPickles is stored here but is never read by the
# pick-building code visible in this view -- confirm whether the string
# conversion mentioned in the docstring is implemented anywhere.
208 self.expectPickles = expectPickles
# When true, the pick-building code records origFp._fieldsFromDb[0] instead of
# the fingerprint itself, for data items that have a _fieldsFromDb attribute.
209 self.onlyNames=onlyNames
210
# None marks the picks as not yet calculated; the pick-building code checks
# this value before doing any work.
211 self._picks = None
212
# Reuse the cached picks unless the caller asked for a forced recalculation.
214 if self._picks is not None and not force:
215 return
216
217
218
# Scoring phase. scores[i] collects (similarity, data index) tuples for probe
# i; bisect.insort keeps each of those lists in ascending order, so the best
# match for a probe sits at the end of its list.
219 nProbes = len(self.probes)
220 scores = [None]*nProbes
221 for i in range(nProbes):
222 scores[i] = []
223 j = 0
224 fps = []
225 for origFp in self.data:
226 for i in range(nProbes):
227 score = DataStructs.FingerprintSimilarity(self.probes[i],origFp,
228 self.simMetric)
229 bisect.insort(scores[i],(score,j))
# NOTE(review): the slice below is applied to the outer `scores` list, not to
# scores[i]. With nProbes <= numToPick it deletes nothing, so the per-probe
# lists are never trimmed; with nProbes > numToPick it would drop per-probe
# rows and a later scores[i] lookup would raise IndexError. It looks like the
# intent was to cap each probe's list -- and because the best scores are at
# the END of the ascending list, a cap would have to drop entries from the
# front. Confirm the intent before changing.
230 if len(scores[i])>=self.numToPick:
231 del scores[self.numToPick:]
# fps is indexed by the data counter j (the index stored in the score tuples):
# it holds either the first _fieldsFromDb entry (onlyNames mode, when the
# attribute exists) or the fingerprint object itself.
232 if self.onlyNames and hasattr(origFp,'_fieldsFromDb'):
233 fps.append(origFp._fieldsFromDb[0])
234 else:
235 fps.append(origFp)
236 j+=1
# Progress message whenever the counter is a multiple of 1000, unless silenced.
237 if not silent and not j%1000:
238 print('scored %d fps'%j)
239
240
241
242
243
244
# Selection phase. The probe row to draw from is nPicked modulo the number of
# rows; the highest-scoring remaining entry of that row is popped, entries
# whose data item was already taken are skipped, and `taken` guarantees that
# no data item is recorded twice.
# NOTE(review): row.pop() raises IndexError once a probe's list is empty (for
# example when numToPick exceeds the number of distinct data items), and the
# modulo raises ZeroDivisionError when there are no probes -- confirm callers
# guarantee enough data and at least one probe.
245 nPicked = 0
246 self._picks = []
247 taken = [0]*len(fps)
248 while nPicked < self.numToPick:
249 rowIdx = nPicked%len(scores)
250 row = scores[rowIdx]
251 score,idx = row.pop()
252
253 while taken[idx] and len(row):
254 score,idx = row.pop()
255 if not taken[idx]:
256 fp = fps[idx]
257 self._picks.append((fp,score))
258 taken[idx]=1
259 nPicked += 1
260
261
262
263
264
# Run the doctests of the module that is executing as __main__ and hand back
# whatever doctest.testmod returns.
266 import doctest,sys
267 return doctest.testmod(sys.modules["__main__"])
268
if __name__ == '__main__':
    # Script entry point: run the module doctests and report the number of
    # failures as the process exit status (0 when nothing failed).
    import sys
    num_failed, _num_tried = _test()
    sys.exit(num_failed)
273