1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35 try:
36 from rdkit.Avalon import pyAvalonTools
37 except ImportError:
38 raise ImportError("This code requires the RDKit to be built with AvalonTools support")
39
40 from rdkit.Chem.MolKey import InchiInfo
41 from rdkit import RDConfig
42 from rdkit import Chem
43 from collections import namedtuple
44 import logging
45 import os,re,uuid,base64,hashlib,tempfile
46
49
52
53 MOL_KEY_VERSION = '1'
54
55 ERROR_DICT = dict(BAD_MOLECULE=1,
56 ALIAS_CONVERSION_FAILED=2,
57 TRANSFORMED=4,
58 FRAGMENTS_FOUND=8,
59 EITHER_WARNING=16,
60 STEREO_ERROR=32,
61 DUBIOUS_STEREO_REMOVED=64,
62 ATOM_CLASH=128,
63 ATOM_CHECK_FAILED=256,
64 SIZE_CHECK_FAILED=512,
65 RECHARGED=1024,
66 STEREO_FORCED_BAD=2048,
67 STEREO_TRANSFORMED=4096,
68 TEMPLATE_TRANSFORMED=8192,
69 INCHI_COMPUTATION_ERROR=65536,
70 RDKIT_CONVERSION_ERROR=131072,
71 INCHI_READWRITE_ERROR=262144,
72 NULL_MOL=524288)
73
74 INCHI_COMPUTATION_ERROR = ERROR_DICT['INCHI_COMPUTATION_ERROR']
75 RDKIT_CONVERSION_ERROR = ERROR_DICT['RDKIT_CONVERSION_ERROR']
76 INCHI_READWRITE_ERROR = ERROR_DICT['INCHI_READWRITE_ERROR']
77 NULL_MOL = ERROR_DICT['NULL_MOL']
78
79 BAD_SET = pyAvalonTools.StruChkResult.bad_set | INCHI_COMPUTATION_ERROR | RDKIT_CONVERSION_ERROR | INCHI_READWRITE_ERROR | NULL_MOL
80
81 GET_STEREO_RE = re.compile(r'^InChI=1S(.*?)/(t.*?)/m\d/s1(.*$)')
82 NULL_SMILES_RE = re.compile(r'^\s*$|^\s*NO_STRUCTURE\s*$', re.IGNORECASE)
83 PATTERN_NULL_MOL = r'^([\s0]+[1-9]+[\s]+V[\w]*)'
84
85 CHIRAL_POS = 12
86
87 T_NULL_MOL = (NULL_MOL, '')
88
89 stereo_code_dict = {}
90 stereo_code_dict['DEFAULT'] = 0
91 stereo_code_dict['S_ACHIR'] = 1
92 stereo_code_dict['S_ABS'] = 2
93 stereo_code_dict['S_REL'] = 3
94 stereo_code_dict['S_PART'] = 4
95 stereo_code_dict['S_UNKN'] = 5
96 stereo_code_dict['S_ABS_ACHIR'] = 6
97 stereo_code_dict['R_ONE'] = 11
98 stereo_code_dict['R_REL'] = 12
99 stereo_code_dict['R_OTHER'] = 13
100 stereo_code_dict['MX_ENANT'] = 21
101 stereo_code_dict['MX_DIAST'] = 22
102 stereo_code_dict['MX_SP2'] = 31
103 stereo_code_dict['MX_DIAST_ABS'] = 32
104 stereo_code_dict['MX_DIAST_REL'] = 33
105 stereo_code_dict['OTHER'] = 100
106 stereo_code_dict['UNDEFINED'] = 200
107
108
109
111 try :
112 new_string = re.sub(pat, sbt, my_string)
113 return new_string
114 except :
115 return None
116
118 pat = '\r\n{0,1}'
119 sbt = '\n'
120 return _fix_all(pat, sbt, my_string)
121
123 pat = '0V2000'
124 sbt = 'V2000'
125 return _fix_all(pat, sbt, my_string)
126
128 ''' look at atom count position (line 4, characters 0:3)
129 Return True if the count is >0, False if 0.
130 Throw BadMoleculeException if there are no characters
131 at the required position or if they cannot be converted
132 to a positive integer
133 '''
134 try:
135 str_a_count = ctab_lines[3][0:3]
136 a_count = int(str_a_count)
137 if a_count < 0:
138 raise BadMoleculeException('Atom count negative')
139 if a_count > 0:
140 rval = True
141 else:
142 rval = False
143 except IndexError:
144 raise BadMoleculeException('Invalid molfile format')
145 except ValueError:
146 raise BadMoleculeException('Expected integer')
147
148 return rval
149
151 ''' read the chiral flag (line 4, characters 12:15)
152 and set it to 0. Return True if it was 1, False if 0.
153 Throw BadMoleculeException if there are no characters
154 at the required position or if they where not 0 or 1
155 '''
156 try:
157 str_a_count = ctab_lines[3][12:15]
158 a_count = int(str_a_count)
159 if a_count == 0:
160 rval = False
161 elif a_count == 1:
162 rval = True
163 orig_line = ctab_lines[3]
164 ctab_lines[3] = orig_line[:CHIRAL_POS] + ' 0' + orig_line[CHIRAL_POS + 3:]
165 else:
166 raise BadMoleculeException('Expected chiral flag 0 or 1')
167 except IndexError:
168 raise BadMoleculeException('Invalid molfile format')
169 except ValueError:
170 raise BadMoleculeException('Expected integer, got {0}'.format(str_a_count))
171
172 return rval
173
174 __initCalled=False
176 global __initCalled
177 if configDir is None:
178 configDir=os.path.join(RDConfig.RDDataDir,'struchk')
179 if configDir[-1]!=os.path.sep:
180 configDir+=os.path.sep
181 if logFile is None:
182 fd = tempfile.NamedTemporaryFile(suffix='.log',delete=False)
183 fd.close()
184 logFile= fd.name
185 struchk_init = '''-tm
186 -ta {0}checkfgs.trn
187 -tm
188 -or
189 -ca {0}checkfgs.chk
190 -cc
191 -cl 3
192 -cs
193 -cn 999
194 -l {1}\n'''.format(configDir, logFile)
195 initRes=pyAvalonTools.InitializeCheckMol(struchk_init)
196 if initRes:
197 raise ValueError('bad result from InitializeCheckMol: '+str(initRes))
198 __initCalled=True
199
228
229 InchiResult = namedtuple('InchiResult',['error','inchi','fixed_ctab'])
231 """
232 >>> from rdkit.Chem.MolKey import MolKey
233 >>> from rdkit.Avalon import pyAvalonTools
234 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1cn[nH]c1C(Cl)Br',True))
235 >>> res.inchi
236 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h8H'
237 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1c[nH]nc1C(Cl)Br',True))
238 >>> res.inchi
239 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h7H'
240 >>>
241 """
242 inchi = None
243 ctab_str = ctab
244 (strucheck_err, fixed_mol) = CheckCTAB(ctab_str, False)
245 if strucheck_err & BAD_SET:
246 return (strucheck_err, None, fixed_mol)
247
248 conversion_err = 0
249 try:
250 r_mol = Chem.MolFromMolBlock(fixed_mol, sanitize=False)
251 if r_mol:
252 inchi = Chem.MolToInchi(r_mol, '/FixedH /SUU')
253 if not inchi:
254 conversion_err = INCHI_COMPUTATION_ERROR
255 else:
256 conversion_err = RDKIT_CONVERSION_ERROR
257 except Chem.InchiReadWriteError:
258 conversion_err = INCHI_READWRITE_ERROR
259
260 return InchiResult(strucheck_err | conversion_err, inchi, fixed_mol)
261
263 """ Normalize the stereo information (t-layer) to one selected isomer. """
264
265
266 new_stereo = '/m0/s3/'
267 stereo_match = GET_STEREO_RE.match(inchi)
268 if stereo_match:
269 inchi = stereo_match.group(1) + new_stereo + stereo_match.group(2)
270 return inchi
271
290
292 key_string = str(uuid.uuid1 ())
293 return key_string
294
296 pieces = []
297 ctab_str = ctab
298 if ctab_str:
299 ctab_str = _fix_line_ends(ctab_str)
300 ctab_str = _fix_chemdraw_header(ctab_str)
301 ctab_str = '\n'.join(ctab_str.split('\n')[3:])
302 pieces.append(ctab_str.replace('\n', r'\n'))
303 else:
304 pass
305 if stereo_category:
306 key_string = 'ST={0}'.format(stereo_category)
307 pieces.append(key_string)
308 if extra_stereo:
309 key_string = 'XTR={0}'.format(extra_stereo)
310 pieces.append(key_string)
311 key_string = '/'.join(pieces)
312 return key_string
313
314 -def _identify(err, ctab, inchi, stereo_category, extra_structure_desc=None):
315 """ Compute the molecule key based on the inchi string,
316 stereo category as well as extra structure
317 information """
318 key_string = _get_identification_string(err, ctab, inchi, stereo_category, extra_structure_desc)
319 if key_string:
320 return "{0}|{1}".format(MOL_KEY_VERSION,
321 base64.b64encode(hashlib.md5(key_string.encode('UTF-8')).digest()).decode())
322 else:
323 return None
324
326 assert n_def >= 0
327 assert n_udf >= 0
328 id_str = 'OTHER'
329
330 if n_def == 0 :
331 if n_udf == 0 :
332 id_str = 'S_ACHIR'
333 elif n_udf == 1 :
334 id_str = 'R_ONE'
335 else:
336 id_str = 'S_UNKN'
337 else:
338 if n_udf == 0 :
339 id_str = 'S_ABS'
340 else:
341 id_str = 'S_PART'
342 return id_str
343
345 " returns a list of error bit descriptions for the error code provided "
346 error_text_list = []
347 for err_dict_key in ERROR_DICT:
348 if (err & ERROR_DICT[err_dict_key]) > 0:
349 error_text_list.append(err_dict_key)
350 return error_text_list
351
352 MolKeyResult=namedtuple('MolKeyResult',['mol_key','error','inchi','fixed_ctab','stereo_code','stereo_comment'])
353 -def GetKeyForCTAB(ctab,stereo_info=None,stereo_comment=None,logger=None):
354 """
355 >>> from rdkit.Chem.MolKey import MolKey
356 >>> from rdkit.Avalon import pyAvalonTools
357 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1C(F)Cl',True))
358 >>> res.mol_key
359 '1|L7676nfGsSIU33wkx//NCg=='
360 >>> res.stereo_code
361 'R_ONE'
362 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@H](F)Cl',True))
363 >>> res.mol_key
364 '1|Aj38EIxf13RuPDQG2A0UMw=='
365 >>> res.stereo_code
366 'S_ABS'
367 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@@H](F)Cl',True))
368 >>> res.mol_key
369 '1|9ypfMrhxn1w0ncRooN5HXw=='
370 >>> res.stereo_code
371 'S_ABS'
372 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1[C@@H](F)Cl',True))
373 >>> res.mol_key
374 '1|c96jMSlbn7O9GW5d5uB9Mw=='
375 >>> res.stereo_code
376 'S_PART'
377 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc([C@H](Br)Cl)c1[C@@H](F)Cl',True))
378 >>> res.mol_key
379 '1|+B+GCEardrJteE8xzYdGLA=='
380 >>> res.stereo_code
381 'S_ABS'
382 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True))
383 >>> res.mol_key
384 '1|5H9R3LvclagMXHp3Clrc/g=='
385 >>> res.stereo_code
386 'S_UNKN'
387 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True),stereo_info='S_REL')
388 >>> res.mol_key
389 '1|cqKWVsUEY6QNpGCbDaDTYA=='
390 >>> res.stereo_code
391 'S_REL'
392 >>> res.inchi
393 'InChI=1/C8H6BrCl2F/c9-7(10)5-3-1-2-4-6(5)8(11)12/h1-4,7-8H/t7?,8?'
394
395 """
396 if logger is None:
397 logger = logging
398 try:
399 err, inchi, fixed_mol = GetInchiForCTAB(ctab)
400 except BadMoleculeException:
401 logger.warn('Corrupt molecule substituting no-struct: --->\n{0}\n<----'.format(ctab))
402 err = NULL_MOL
403 key = _identify(err, '', '', None, None)
404 return MolKeyResult(key, err, '', '', None, None)
405
406
407 stereo_category = None
408 extra_structure_desc = stereo_comment
409 if stereo_info:
410 info_flds = stereo_info.split(' ', 1)
411 code_fld = info_flds[0]
412 if code_fld in stereo_code_dict:
413 stereo_category = code_fld
414 if (not stereo_comment) and len(info_flds) > 1:
415 extra_structure_desc = info_flds[1].strip()
416 else:
417 logger.warn('stereo code {0} not recognized. Using default value for ctab.'.format(code_fld))
418
419 if not (err & BAD_SET):
420 (n_stereo, n_undef_stereo, is_meso, dummy) = InchiInfo.InchiInfo(inchi).get_sp3_stereo()['main']['non-isotopic']
421 if stereo_category == None or stereo_category == 'DEFAULT' :
422 stereo_category = _get_chiral_identification_string(n_stereo - n_undef_stereo,
423 n_undef_stereo)
424 else:
425 raise NotImplementedError("currently cannot generate correct keys for molecules with struchk errors")
426 key = _identify(err, fixed_mol, inchi, stereo_category, extra_structure_desc)
427 return MolKeyResult(key, err, inchi, fixed_mol, stereo_category, extra_structure_desc)
428
429
430
431
432
433
434
436 import doctest,sys
437 return doctest.testmod(sys.modules["__main__"])
438
439
440 if __name__ == '__main__':
441 import sys
442 failed,tried = _test()
443 sys.exit(failed)
444