1
2
3
4
5
6
7
8
9
10
""" SMARTS definitions for the publicly available MACCS keys
and a MACCS fingerprinter
13
14 I compared the MACCS fingerprints generated here with those from two
15 other packages (not MDL, unfortunately). Of course there are
16 disagreements between the various fingerprints still, but I think
17 these definitions work pretty well. Some notes:
18
19 1) most of the differences have to do with aromaticity
20 2) there's a discrepancy sometimes because the current RDKit
21 definitions do not require multiple matches to be distinct. e.g. the
22 SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my
23 definition. It's not clear to me what the correct behavior is.
24 3) Some keys are not fully defined in the MDL documentation
25 4) Two keys, 125 and 166, have to be done outside of SMARTS.
26 5) Key 1 (ISOTOPE) isn't defined
27
28 Rev history:
29 2006 (gl): Original open-source release
30 May 2011 (gl): Update some definitions based on feedback from Andrew Dalke
31
32 """
33 from __future__ import print_function
34 from rdkit import Chem
35 from rdkit.Chem import rdMolDescriptors
36 from rdkit import DataStructs
37
# Table of MACCS key definitions.
#
# Maps the 1-based key number to a (SMARTS, count) tuple:
#   - count == 0: the bit is set when the pattern matches at least once
#   - count  > 0: the bit is set when the pattern matches more than `count` times
#   - a SMARTS of '?' marks a key with no SMARTS definition; it is skipped when
#     the patterns are compiled.  Keys 125 and 166 are computed directly by the
#     fingerprinter; key 1 (ISOTOPE) isn't defined.
smartsPatts = {
    1: ('?', 0),

    2: ('[#104]', 0),
    3: ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
    4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    5: ('[Sc,Ti,Y,Zr,Hf]', 0),
    6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    8: ('[!#6;!#1]1~*~*~*~1', 0),
    9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    11: ('*1~*~*~*~1', 0),
    12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    13: ('[#8]~[#7](~[#6])~[#6]', 0),
    14: ('[#16]-[#16]', 0),
    15: ('[#8]~[#6](~[#8])~[#8]', 0),
    16: ('[!#6;!#1]1~*~*~1', 0),
    17: ('[#6]#[#6]', 0),
    18: ('[#5,#13,#31,#49,#81]', 0),
    19: ('*1~*~*~*~*~*~*~1', 0),
    20: ('[#14]', 0),
    21: ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
    22: ('*1~*~*~1', 0),
    23: ('[#7]~[#6](~[#8])~[#8]', 0),
    24: ('[#7]-[#8]', 0),
    25: ('[#7]~[#6](~[#7])~[#7]', 0),
    26: ('[#6]=;@[#6](@*)@*', 0),
    27: ('[I]', 0),
    28: ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
    29: ('[#15]', 0),
    30: ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
    31: ('[!#6;!#1]~[F,Cl,Br,I]', 0),
    32: ('[#6]~[#16]~[#7]', 0),
    33: ('[#7]~[#16]', 0),
    34: ('[CH2]=*', 0),
    35: ('[Li,Na,K,Rb,Cs,Fr]', 0),
    36: ('[#16R]', 0),
    37: ('[#7]~[#6](~[#8])~[#7]', 0),
    38: ('[#7]~[#6](~[#6])~[#7]', 0),
    39: ('[#8]~[#16](~[#8])~[#8]', 0),
    40: ('[#16]-[#8]', 0),
    41: ('[#6]#[#7]', 0),
    42: ('F', 0),
    43: ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
    44: ('[!#1;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#35;!#53]', 0),
    45: ('[#6]=[#6]~[#7]', 0),
    46: ('Br', 0),
    47: ('[#16]~*~[#7]', 0),
    48: ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
    49: ('[!+0]', 0),
    50: ('[#6]=[#6](~[#6])~[#6]', 0),
    51: ('[#6]~[#16]~[#8]', 0),
    52: ('[#7]~[#7]', 0),
    53: ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
    54: ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
    55: ('[#8]~[#16]~[#8]', 0),
    56: ('[#8]~[#7](~[#8])~[#6]', 0),
    57: ('[#8R]', 0),
    58: ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
    59: ('[#16]!:*:*', 0),
    60: ('[#16]=[#8]', 0),
    61: ('*~[#16](~*)~*', 0),
    62: ('*@*!@*@*', 0),
    63: ('[#7]=[#8]', 0),
    64: ('*@*!@[#16]', 0),
    65: ('c:n', 0),
    66: ('[#6]~[#6](~[#6])(~[#6])~*', 0),
    67: ('[!#6;!#1]~[#16]', 0),
    68: ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
    69: ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
    70: ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
    71: ('[#7]~[#8]', 0),
    72: ('[#8]~*~*~[#8]', 0),
    73: ('[#16]=*', 0),
    74: ('[CH3]~*~[CH3]', 0),
    75: ('*!@[#7]@*', 0),
    76: ('[#6]=[#6](~*)~*', 0),
    77: ('[#7]~*~[#7]', 0),
    78: ('[#6]=[#7]', 0),
    79: ('[#7]~*~*~[#7]', 0),
    80: ('[#7]~*~*~*~[#7]', 0),
    81: ('[#16]~*(~*)~*', 0),
    82: ('*~[CH2]~[!#6;!#1;!H0]', 0),
    83: ('[!#6;!#1]1~*~*~*~*~1', 0),
    84: ('[NH2]', 0),
    85: ('[#6]~[#7](~[#6])~[#6]', 0),
    86: ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
    87: ('[F,Cl,Br,I]!@*@*', 0),
    88: ('[#16]', 0),
    89: ('[#8]~*~*~*~[#8]', 0),
    90: ('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]', 0),
    91: ('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]', 0),
    92: ('[#8]~[#6](~[#7])~[#6]', 0),
    93: ('[!#6;!#1]~[CH3]', 0),
    94: ('[!#6;!#1]~[#7]', 0),
    95: ('[#7]~*~*~[#8]', 0),
    96: ('*1~*~*~*~*~1', 0),
    97: ('[#7]~*~*~*~[#8]', 0),
    98: ('[!#6;!#1]1~*~*~*~*~*~1', 0),
    99: ('[#6]=[#6]', 0),
    100: ('*~[CH2]~[#7]', 0),
    101: ('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]', 0),
    102: ('[!#6;!#1]~[#8]', 0),
    103: ('Cl', 0),
    104: ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
    105: ('*@*(@*)@*', 0),
    106: ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
    107: ('[F,Cl,Br,I]~*(~*)~*', 0),
    108: ('[CH3]~*~*~*~[CH2]~*', 0),
    109: ('*~[CH2]~[#8]', 0),
    110: ('[#7]~[#6]~[#8]', 0),
    111: ('[#7]~*~[CH2]~*', 0),
    112: ('*~*(~*)(~*)~*', 0),
    113: ('[#8]!:*:*', 0),
    114: ('[CH3]~[CH2]~*', 0),
    115: ('[CH3]~*~[CH2]~*', 0),
    116: ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
    117: ('[#7]~*~[#8]', 0),
    118: ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
    119: ('[#7]=*', 0),
    120: ('[!#6;R]', 1),
    121: ('[#7;R]', 0),
    122: ('*~[#7](~*)~*', 0),
    123: ('[#8]~[#6]~[#8]', 0),
    124: ('[!#6;!#1]~[!#6;!#1]', 0),
    125: ('?', 0),  # handled in code: more than one aromatic ring
    126: ('*!@[#8]!@*', 0),
    127: ('*@*!@[#8]', 1),
    128: ('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]', 0),
    129: ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]', 0),
    130: ('[!#6;!#1]~[!#6;!#1]', 1),
    131: ('[!#6;!#1;!H0]', 1),
    132: ('[#8]~*~[CH2]~*', 0),
    133: ('*@*!@[#7]', 0),
    134: ('[F,Cl,Br,I]', 0),
    135: ('[#7]!:*:*', 0),
    136: ('[#8]=*', 1),
    137: ('[!C;!c;R]', 0),
    138: ('[!#6;!#1]~[CH2]~*', 1),
    139: ('[O;!H0]', 0),
    140: ('[#8]', 3),
    141: ('[CH3]', 2),
    142: ('[#7]', 1),
    143: ('*@*!@[#8]', 0),
    144: ('*!:*:*!:*', 0),
    145: ('*1~*~*~*~*~*~1', 1),
    146: ('[#8]', 2),
    147: ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
    148: ('*~[!#6;!#1](~*)~*', 0),
    149: ('[C;H3,H4]', 1),
    150: ('*!@*@*!@*', 0),
    151: ('[#7;!H0]', 0),
    152: ('[#8]~[#6](~[#6])~[#6]', 0),
    153: ('[!#6;!#1]~[CH2]~*', 0),
    154: ('[#6]=[#8]', 0),
    155: ('*!@[CH2]!@*', 0),
    156: ('[#7]~*(~*)~*', 0),
    157: ('[#6]-[#8]', 0),
    158: ('[#6]-[#7]', 0),
    159: ('[#8]', 1),
    160: ('[C;H3,H4]', 0),
    161: ('[#7]', 0),
    162: ('a', 0),
    163: ('*1~*~*~*~*~*~1', 0),
    164: ('[#8]', 0),
    165: ('[R]', 0),
    166: ('?', 0),  # handled in code: more than one fragment
}

# Cache of compiled (query, count) pairs, one slot per key; filled in lazily
# the first time a fingerprint is generated.
maccsKeys = None
209
211 """ *Internal Use Only*
212
213 generates SMARTS patterns for the keys, run once
214
215 """
216 assert len(keyList) == len(keyDict.keys()),'length mismatch'
217 for key in keyDict.keys():
218 patt,count = keyDict[key]
219 if patt != '?':
220 sma = Chem.MolFromSmarts(patt)
221 if not sma:
222 print('SMARTS parser error for key #%d: %s'%(key,patt))
223 else:
224 keyList[key-1] = sma,count
225
# NOTE(review): the `def` line of this function was missing from the reviewed
# listing.  The (mol, **kwargs) signature follows from the body; the name is
# reconstructed -- confirm it before merging.
def _pyGenMACCSKeys(mol, **kwargs):
    """ generates the MACCS fingerprint for a molecule

    **Arguments**

      - mol: the molecule to be fingerprinted

      - ctor: (optional, keyword only) callable used to construct the result;
        it is called with the number of bits.
        Defaults to DataStructs.SparseBitVect

      - any other keyword arguments are ignored

    **Returns**

      the fingerprint, as whatever `ctor` constructs
      (a _DataStructs.SparseBitVect_ by default).
      Bit i corresponds to key i; bit 0 is never set.

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = GenMACCSKeys(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)

    """
    global maccsKeys
    if maccsKeys is None:
        # Compile the SMARTS table on first use.  Build it in a local and
        # publish it only once it is complete, so that a failure during
        # initialization cannot leave a half-filled table behind.
        keys = [(None, 0)] * len(smartsPatts)
        _InitKeys(keys, smartsPatts)
        maccsKeys = keys
    ctor = kwargs.get('ctor', DataStructs.SparseBitVect)

    # one extra bit so that bit indices line up with the 1-based key numbers
    res = ctor(len(maccsKeys) + 1)
    for keyId, (patt, count) in enumerate(maccsKeys, 1):
        if patt is not None:
            if count == 0:
                res[keyId] = mol.HasSubstructMatch(patt)
            elif len(mol.GetSubstructMatches(patt)) > count:
                res[keyId] = 1
        elif keyId == 125:
            # special case: more than one ring made up solely of aromatic bonds
            nArom = 0
            res[125] = 0
            for ring in mol.GetRingInfo().BondRings():
                if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic() for bondIdx in ring):
                    nArom += 1
                    if nArom > 1:
                        res[125] = 1
                        break
        elif keyId == 166:
            # special case: more than one fragment
            res[166] = 1 if len(Chem.GetMolFrags(mol)) > 1 else 0

    return res
# Both public entry points are bound to the implementation provided by
# rdMolDescriptors.
GenMACCSKeys = rdMolDescriptors.GetMACCSKeysFingerprint
FingerprintMol = rdMolDescriptors.GetMACCSKeysFingerprint
288
289
290
291
292
294 import doctest,sys
295 return doctest.testmod(sys.modules["__main__"])
296
if __name__ == '__main__':
    import sys
    # the number of failed doctests becomes the process exit status
    failed, tried = _test()
    sys.exit(failed)
301