Package rdkit :: Package Chem :: Module MACCSkeys
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MACCSkeys

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2011 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
""" SMARTS definitions for the publicly available MACCS keys
and a MACCS fingerprinter

I compared the MACCS fingerprints generated here with those from two
other packages (not MDL, unfortunately). Of course there are still
disagreements between the various fingerprints, but I think
these definitions work pretty well. Some notes:

1) most of the differences have to do with aromaticity
2) there's a discrepancy sometimes because the current RDKit
definitions do not require multiple matches to be distinct. e.g. the
SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my
definition. It's not clear to me what the correct behavior is.
3) Some keys are not fully defined in the MDL documentation
4) Two keys, 125 and 166, have to be done outside of SMARTS.
5) Key 1 (ISOTOPE) isn't defined

Rev history:
2006 (gl): Original open-source release
May 2011 (gl): Update some definitions based on feedback from Andrew Dalke

"""
 33  from __future__ import print_function 
 34  from rdkit import Chem 
 35  from rdkit.Chem import rdMolDescriptors 
 36  from rdkit import DataStructs 
 37  # these are SMARTS patterns corresponding to the MDL MACCS keys 
 38  smartsPatts={ 
 39    1:('?',0), # ISOTOPE 
 40    #2:('[#104,#105,#106,#107,#106,#109,#110,#111,#112]',0),  # atomic num >103 Not complete 
 41    2:('[#104]',0),  # limit the above def'n since the RDKit only accepts up to #104 
 42    3:('[#32,#33,#34,#50,#51,#52,#82,#83,#84]',0), # Group IVa,Va,VIa Rows 4-6  
 43    4:('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]',0), # actinide 
 44    5:('[Sc,Ti,Y,Zr,Hf]',0), # Group IIIB,IVB (Sc...)   
 45    6:('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]',0), # Lanthanide 
 46    7:('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]',0), # Group VB,VIB,VIIB 
 47    8:('[!#6;!#1]1~*~*~*~1',0), # QAAA@1 
 48    9:('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]',0), # Group VIII (Fe...) 
 49    10:('[Be,Mg,Ca,Sr,Ba,Ra]',0), # Group IIa (Alkaline earth) 
 50    11:('*1~*~*~*~1',0), # 4M Ring 
 51    12:('[Cu,Zn,Ag,Cd,Au,Hg]',0), # Group IB,IIB (Cu..) 
 52    13:('[#8]~[#7](~[#6])~[#6]',0), # ON(C)C 
 53    14:('[#16]-[#16]',0), # S-S 
 54    15:('[#8]~[#6](~[#8])~[#8]',0), # OC(O)O 
 55    16:('[!#6;!#1]1~*~*~1',0), # QAA@1 
 56    17:('[#6]#[#6]',0), #CTC 
 57    18:('[#5,#13,#31,#49,#81]',0), # Group IIIA (B...)  
 58    19:('*1~*~*~*~*~*~*~1',0), # 7M Ring 
 59    20:('[#14]',0), #Si 
 60    21:('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]',0), # C=C(Q)Q 
 61    22:('*1~*~*~1',0), # 3M Ring 
 62    23:('[#7]~[#6](~[#8])~[#8]',0), # NC(O)O 
 63    24:('[#7]-[#8]',0), # N-O 
 64    25:('[#7]~[#6](~[#7])~[#7]',0), # NC(N)N 
 65    26:('[#6]=;@[#6](@*)@*',0), # C$=C($A)$A 
 66    27:('[I]',0), # I 
 67    28:('[!#6;!#1]~[CH2]~[!#6;!#1]',0), # QCH2Q 
 68    29:('[#15]',0),# P 
 69    30:('[#6]~[!#6;!#1](~[#6])(~[#6])~*',0), # CQ(C)(C)A 
 70    31:('[!#6;!#1]~[F,Cl,Br,I]',0), # QX 
 71    32:('[#6]~[#16]~[#7]',0), # CSN 
 72    33:('[#7]~[#16]',0), # NS 
 73    34:('[CH2]=*',0), # CH2=A 
 74    35:('[Li,Na,K,Rb,Cs,Fr]',0), # Group IA (Alkali Metal) 
 75    36:('[#16R]',0), # S Heterocycle 
 76    37:('[#7]~[#6](~[#8])~[#7]',0), # NC(O)N 
 77    38:('[#7]~[#6](~[#6])~[#7]',0), # NC(C)N 
 78    39:('[#8]~[#16](~[#8])~[#8]',0), # OS(O)O 
 79    40:('[#16]-[#8]',0), # S-O 
 80    41:('[#6]#[#7]',0), # CTN 
 81    42:('F',0), # F 
 82    43:('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]',0), # QHAQH 
 83    44:('[!#1;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#35;!#53]',0), # OTHER 
 84    45:('[#6]=[#6]~[#7]',0), # C=CN 
 85    46:('Br',0), # BR 
 86    47:('[#16]~*~[#7]',0), # SAN 
 87    48:('[#8]~[!#6;!#1](~[#8])(~[#8])',0), # OQ(O)O 
 88    49:('[!+0]',0), # CHARGE   
 89    50:('[#6]=[#6](~[#6])~[#6]',0), # C=C(C)C 
 90    51:('[#6]~[#16]~[#8]',0), # CSO 
 91    52:('[#7]~[#7]',0), # NN 
 92    53:('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]',0), # QHAAAQH 
 93    54:('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]',0), # QHAAQH 
 94    55:('[#8]~[#16]~[#8]',0), #OSO 
 95    56:('[#8]~[#7](~[#8])~[#6]',0), # ON(O)C 
 96    57:('[#8R]',0), # O Heterocycle 
 97    58:('[!#6;!#1]~[#16]~[!#6;!#1]',0), # QSQ 
 98    59:('[#16]!:*:*',0), # Snot%A%A 
 99    60:('[#16]=[#8]',0), # S=O 
100    61:('*~[#16](~*)~*',0), # AS(A)A 
101    62:('*@*!@*@*',0), # A$!A$A 
102    63:('[#7]=[#8]',0), # N=O 
103    64:('*@*!@[#16]',0), # A$A!S 
104    65:('c:n',0), # C%N 
105    66:('[#6]~[#6](~[#6])(~[#6])~*',0), # CC(C)(C)A 
106    67:('[!#6;!#1]~[#16]',0), # QS 
107    68:('[!#6;!#1;!H0]~[!#6;!#1;!H0]',0), # QHQH (&...) SPEC Incomplete 
108    69:('[!#6;!#1]~[!#6;!#1;!H0]',0), # QQH 
109    70:('[!#6;!#1]~[#7]~[!#6;!#1]',0), # QNQ 
110    71:('[#7]~[#8]',0), # NO 
111    72:('[#8]~*~*~[#8]',0), # OAAO 
112    73:('[#16]=*',0), # S=A 
113    74:('[CH3]~*~[CH3]',0), # CH3ACH3 
114    75:('*!@[#7]@*',0), # A!N$A 
115    76:('[#6]=[#6](~*)~*',0), # C=C(A)A 
116    77:('[#7]~*~[#7]',0), # NAN 
117    78:('[#6]=[#7]',0), # C=N 
118    79:('[#7]~*~*~[#7]',0), # NAAN 
119    80:('[#7]~*~*~*~[#7]',0), # NAAAN 
120    81:('[#16]~*(~*)~*',0), # SA(A)A 
121    82:('*~[CH2]~[!#6;!#1;!H0]',0), # ACH2QH 
122    83:('[!#6;!#1]1~*~*~*~*~1',0), # QAAAA@1 
123    84:('[NH2]',0), #NH2 
124    85:('[#6]~[#7](~[#6])~[#6]',0), # CN(C)C 
125    86:('[C;H2,H3][!#6;!#1][C;H2,H3]',0), # CH2QCH2 
126    87:('[F,Cl,Br,I]!@*@*',0), # X!A$A 
127    88:('[#16]',0), # S 
128    89:('[#8]~*~*~*~[#8]',0), # OAAAO 
129    90:('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',0), # QHAACH2A 
130    91:('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',0), # QHAAACH2A 
131    92:('[#8]~[#6](~[#7])~[#6]',0), # OC(N)C 
132    93:('[!#6;!#1]~[CH3]',0), # QCH3 
133    94:('[!#6;!#1]~[#7]',0), # QN 
134    95:('[#7]~*~*~[#8]',0), # NAAO 
135    96:('*1~*~*~*~*~1',0), # 5 M ring 
136    97:('[#7]~*~*~*~[#8]',0), # NAAAO 
137    98:('[!#6;!#1]1~*~*~*~*~*~1',0), # QAAAAA@1 
138    99:('[#6]=[#6]',0), # C=C 
139    100:('*~[CH2]~[#7]',0), # ACH2N 
140    101:('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',0), # 8M Ring or larger. This only handles up to ring sizes of 14 
141    102:('[!#6;!#1]~[#8]',0), # QO 
142    103:('Cl',0), # CL 
143    104:('[!#6;!#1;!H0]~*~[CH2]~*',0), # QHACH2A 
144    105:('*@*(@*)@*',0), # A$A($A)$A 
145    106:('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]',0), # QA(Q)Q 
146    107:('[F,Cl,Br,I]~*(~*)~*',0), # XA(A)A 
147    108:('[CH3]~*~*~*~[CH2]~*',0), # CH3AAACH2A 
148    109:('*~[CH2]~[#8]',0), # ACH2O 
149    110:('[#7]~[#6]~[#8]',0), # NCO 
150    111:('[#7]~*~[CH2]~*',0), # NACH2A 
151    112:('*~*(~*)(~*)~*',0), # AA(A)(A)A 
152    113:('[#8]!:*:*',0), # Onot%A%A 
153    114:('[CH3]~[CH2]~*',0), # CH3CH2A 
154    115:('[CH3]~*~[CH2]~*',0), # CH3ACH2A 
155    116:('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]',0), # CH3AACH2A 
156    117:('[#7]~*~[#8]',0), # NAO 
157    118:('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]',1), # ACH2CH2A > 1 
158    119:('[#7]=*',0), # N=A 
159    120:('[!#6;R]',1), # Heterocyclic atom > 1 (&...) Spec Incomplete 
160    121:('[#7;R]',0), # N Heterocycle 
161    122:('*~[#7](~*)~*',0), # AN(A)A 
162    123:('[#8]~[#6]~[#8]',0), # OCO 
163    124:('[!#6;!#1]~[!#6;!#1]',0), # QQ 
164    125:('?',0), # Aromatic Ring > 1 
165    126:('*!@[#8]!@*',0), # A!O!A 
166    127:('*@*!@[#8]',1), # A$A!O > 1 (&...) Spec Incomplete 
167    128:('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',0), # ACH2AAACH2A 
168    129:('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',0), # ACH2AACH2A 
169    130:('[!#6;!#1]~[!#6;!#1]',1), # QQ > 1 (&...)  Spec Incomplete 
170    131:('[!#6;!#1;!H0]',1), # QH > 1 
171    132:('[#8]~*~[CH2]~*',0), # OACH2A 
172    133:('*@*!@[#7]',0), # A$A!N 
173    134:('[F,Cl,Br,I]',0), # X (HALOGEN) 
174    135:('[#7]!:*:*',0), # Nnot%A%A 
175    136:('[#8]=*',1), # O=A>1  
176    137:('[!C;!c;R]',0), # Heterocycle 
177    138:('[!#6;!#1]~[CH2]~*',1), # QCH2A>1 (&...) Spec Incomplete 
178    139:('[O;!H0]',0), # OH 
179    140:('[#8]',3), # O > 3 (&...) Spec Incomplete 
180    141:('[CH3]',2), # CH3 > 2  (&...) Spec Incomplete 
181    142:('[#7]',1), # N > 1 
182    143:('*@*!@[#8]',0), # A$A!O 
183    144:('*!:*:*!:*',0), # Anot%A%Anot%A 
184    145:('*1~*~*~*~*~*~1',1), # 6M ring > 1 
185    146:('[#8]',2), # O > 2 
186    147:('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]',0), # ACH2CH2A 
187    148:('*~[!#6;!#1](~*)~*',0), # AQ(A)A 
188    149:('[C;H3,H4]',1), # CH3 > 1 
189    150:('*!@*@*!@*',0), # A!A$A!A 
190    151:('[#7;!H0]',0), # NH 
191    152:('[#8]~[#6](~[#6])~[#6]',0), # OC(C)C 
192    153:('[!#6;!#1]~[CH2]~*',0), # QCH2A 
193    154:('[#6]=[#8]',0), # C=O 
194    155:('*!@[CH2]!@*',0), # A!CH2!A 
195    156:('[#7]~*(~*)~*',0), # NA(A)A 
196    157:('[#6]-[#8]',0), # C-O 
197    158:('[#6]-[#7]',0), # C-N 
198    159:('[#8]',1), # O>1 
199    160:('[C;H3,H4]',0), #CH3 
200    161:('[#7]',0), # N 
201    162:('a',0), # Aromatic 
202    163:('*1~*~*~*~*~*~1',0), # 6M Ring 
203    164:('[#8]',0), # O 
204    165:('[R]',0), # Ring 
205    166:('?',0), # Fragments  FIX: this can't be done in SMARTS 
206    } 
207   
208  maccsKeys = None 
209   
210 -def _InitKeys(keyList,keyDict):
211 """ *Internal Use Only* 212 213 generates SMARTS patterns for the keys, run once 214 215 """ 216 assert len(keyList) == len(keyDict.keys()),'length mismatch' 217 for key in keyDict.keys(): 218 patt,count = keyDict[key] 219 if patt != '?': 220 sma = Chem.MolFromSmarts(patt) 221 if not sma: 222 print('SMARTS parser error for key #%d: %s'%(key,patt)) 223 else: 224 keyList[key-1] = sma,count
225
def _pyGenMACCSKeys(mol,**kwargs):
  """ generates the MACCS fingerprint for a molecule

   **Arguments**

     - mol: the molecule to be fingerprinted

     - ctor: (optional, keyword only) the callable used to construct the
       result. It is called with the number of bits (number of keys + 1)
       and defaults to DataStructs.SparseBitVect.

     - any other keyword arguments are ignored

   **Returns**

      a _DataStructs.SparseBitVect_ containing the fingerprint
      (or whatever _ctor_ returns, if it was provided).

   **Notes**

     - key N is stored in bit N, so bit 0 is never set

     - key 1 (ISOTOPE) has no definition and is never set

     - keys 125 and 166 are computed here instead of via SMARTS

   >>> m = Chem.MolFromSmiles('CNO')
   >>> bv = GenMACCSKeys(m)
   >>> tuple(bv.GetOnBits())
   (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
   >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
   >>> tuple(bv.GetOnBits())
   (74, 114, 149, 155, 160)

  """
  global maccsKeys
  if maccsKeys is None:
    # build the table in a local and publish it only once it is complete, so
    # that a failure in _InitKeys cannot leave a half-initialized global behind
    keys = [(None,0)]*len(smartsPatts)
    _InitKeys(keys,smartsPatts)
    maccsKeys = keys
  ctor=kwargs.get('ctor',DataStructs.SparseBitVect)

  res = ctor(len(maccsKeys)+1)
  for keyIdx,(patt,count) in enumerate(maccsKeys,1):
    if patt is not None:
      if count==0:
        # simple presence/absence key
        res[keyIdx] = mol.HasSubstructMatch(patt)
      else:
        # the key requires more than `count` matches
        matches = mol.GetSubstructMatches(patt)
        if len(matches) > count:
          res[keyIdx] = 1
    elif keyIdx==125:
      # special case: num aromatic rings > 1
      ri = mol.GetRingInfo()
      nArom=0
      res[125]=0
      for ring in ri.BondRings():
        # a ring counts as aromatic only if every one of its bonds is aromatic
        if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic() for bondIdx in ring):
          nArom+=1
          if nArom>1:
            res[125]=1
            break
    elif keyIdx==166:
      res[166]=0
      # special case: num frags > 1
      if len(Chem.GetMolFrags(mol))>1:
        res[166]=1

  return res
# The public entry points are bound to rdMolDescriptors.GetMACCSKeysFingerprint,
# not to the pure-Python _pyGenMACCSKeys defined in this module.
GenMACCSKeys = rdMolDescriptors.GetMACCSKeysFingerprint
FingerprintMol = rdMolDescriptors.GetMACCSKeysFingerprint

#------------------------------------
#
#  doctest boilerplate
#
293 -def _test():
294 import doctest,sys 295 return doctest.testmod(sys.modules["__main__"])
if __name__ == '__main__':
  import sys
  # the number of failed doctests becomes the process exit status,
  # so 0 means every doctest passed
  failed = _test()[0]
  sys.exit(failed)