Package rdkit :: Package Chem :: Package MolKey :: Module MolKey
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MolKey.MolKey

  1  # 
  2  #  Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. 
  3  #  All rights reserved. 
  4  # 
  5  # Redistribution and use in source and binary forms, with or without 
  6  # modification, are permitted provided that the following conditions are 
  7  # met: 
  8  # 
  9  #     * Redistributions of source code must retain the above copyright 
 10  #       notice, this list of conditions and the following disclaimer. 
 11  #     * Redistributions in binary form must reproduce the above 
 12  #       copyright notice, this list of conditions and the following 
 13  #       disclaimer in the documentation and/or other materials provided 
 14  #       with the distribution. 
 15  #     * Neither the name of Novartis Institutes for BioMedical Research Inc. 
 16  #       nor the names of its contributors may be used to endorse or promote 
 17  #       products derived from this software without specific prior written permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
 22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
 23  # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 24  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 25  # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 26  # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 27  # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 28  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 29  # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 30  # 
 31  # Created by Greg Landrum based on code from Thomas Mueller 
 32  # August 2012 
 33  # 
 34  #pylint: disable=C0111,W0311 
 35  try: 
 36      from rdkit.Avalon import pyAvalonTools 
 37  except ImportError: 
 38      raise ImportError("This code requires the RDKit to be built with AvalonTools support") 
 39   
 40  from rdkit.Chem.MolKey import InchiInfo 
 41  from rdkit import RDConfig 
 42  from rdkit import Chem 
 43  from collections import namedtuple 
 44  import logging 
 45  import os,re,uuid,base64,hashlib,tempfile 
 46   
47 -class MolIdentifierException(Exception):
48 pass
49
50 -class BadMoleculeException(Exception):
51 pass
52 53 MOL_KEY_VERSION = '1' 54 55 ERROR_DICT = dict(BAD_MOLECULE=1, 56 ALIAS_CONVERSION_FAILED=2, 57 TRANSFORMED=4, 58 FRAGMENTS_FOUND=8, 59 EITHER_WARNING=16, 60 STEREO_ERROR=32, 61 DUBIOUS_STEREO_REMOVED=64, 62 ATOM_CLASH=128, 63 ATOM_CHECK_FAILED=256, 64 SIZE_CHECK_FAILED=512, 65 RECHARGED=1024, 66 STEREO_FORCED_BAD=2048, 67 STEREO_TRANSFORMED=4096, 68 TEMPLATE_TRANSFORMED=8192, 69 INCHI_COMPUTATION_ERROR=65536, 70 RDKIT_CONVERSION_ERROR=131072, 71 INCHI_READWRITE_ERROR=262144, 72 NULL_MOL=524288) 73 74 INCHI_COMPUTATION_ERROR = ERROR_DICT['INCHI_COMPUTATION_ERROR'] 75 RDKIT_CONVERSION_ERROR = ERROR_DICT['RDKIT_CONVERSION_ERROR'] 76 INCHI_READWRITE_ERROR = ERROR_DICT['INCHI_READWRITE_ERROR'] 77 NULL_MOL = ERROR_DICT['NULL_MOL'] 78 79 BAD_SET = pyAvalonTools.StruChkResult.bad_set | INCHI_COMPUTATION_ERROR | RDKIT_CONVERSION_ERROR | INCHI_READWRITE_ERROR | NULL_MOL 80 81 GET_STEREO_RE = re.compile(r'^InChI=1S(.*?)/(t.*?)/m\d/s1(.*$)') 82 NULL_SMILES_RE = re.compile(r'^\s*$|^\s*NO_STRUCTURE\s*$', re.IGNORECASE) 83 PATTERN_NULL_MOL = r'^([\s0]+[1-9]+[\s]+V[\w]*)' 84 85 CHIRAL_POS = 12 86 87 T_NULL_MOL = (NULL_MOL, '') # the NULL mol result tuple 88 89 stereo_code_dict = {} 90 stereo_code_dict['DEFAULT'] = 0 91 stereo_code_dict['S_ACHIR'] = 1 92 stereo_code_dict['S_ABS'] = 2 93 stereo_code_dict['S_REL'] = 3 94 stereo_code_dict['S_PART'] = 4 95 stereo_code_dict['S_UNKN'] = 5 96 stereo_code_dict['S_ABS_ACHIR'] = 6 97 stereo_code_dict['R_ONE'] = 11 98 stereo_code_dict['R_REL'] = 12 99 stereo_code_dict['R_OTHER'] = 13 100 stereo_code_dict['MX_ENANT'] = 21 101 stereo_code_dict['MX_DIAST'] = 22 102 stereo_code_dict['MX_SP2'] = 31 103 stereo_code_dict['MX_DIAST_ABS'] = 32 104 stereo_code_dict['MX_DIAST_REL'] = 33 105 stereo_code_dict['OTHER'] = 100 106 stereo_code_dict['UNDEFINED'] = 200 107 108 109
110 -def _fix_all(pat, sbt, my_string) :
111 try : 112 new_string = re.sub(pat, sbt, my_string) 113 return new_string 114 except : 115 return None
116
117 -def _fix_line_ends(my_string) :
118 pat = '\r\n{0,1}' 119 sbt = '\n' 120 return _fix_all(pat, sbt, my_string)
121
122 -def _fix_chemdraw_header(my_string) :
123 pat = '0V2000' 124 sbt = 'V2000' 125 return _fix_all(pat, sbt, my_string)
126
127 -def _ctab_has_atoms(ctab_lines):
128 ''' look at atom count position (line 4, characters 0:3) 129 Return True if the count is >0, False if 0. 130 Throw BadMoleculeException if there are no characters 131 at the required position or if they cannot be converted 132 to a positive integer 133 ''' 134 try: 135 str_a_count = ctab_lines[3][0:3] 136 a_count = int(str_a_count) 137 if a_count < 0: 138 raise BadMoleculeException('Atom count negative') 139 if a_count > 0: 140 rval = True 141 else: 142 rval = False 143 except IndexError: 144 raise BadMoleculeException('Invalid molfile format') 145 except ValueError: 146 raise BadMoleculeException('Expected integer') 147 148 return rval
149
150 -def _ctab_remove_chiral_flag(ctab_lines):
151 ''' read the chiral flag (line 4, characters 12:15) 152 and set it to 0. Return True if it was 1, False if 0. 153 Throw BadMoleculeException if there are no characters 154 at the required position or if they where not 0 or 1 155 ''' 156 try: 157 str_a_count = ctab_lines[3][12:15] 158 a_count = int(str_a_count) 159 if a_count == 0: 160 rval = False 161 elif a_count == 1: 162 rval = True 163 orig_line = ctab_lines[3] 164 ctab_lines[3] = orig_line[:CHIRAL_POS] + ' 0' + orig_line[CHIRAL_POS + 3:] 165 else: 166 raise BadMoleculeException('Expected chiral flag 0 or 1') 167 except IndexError: 168 raise BadMoleculeException('Invalid molfile format') 169 except ValueError: 170 raise BadMoleculeException('Expected integer, got {0}'.format(str_a_count)) 171 172 return rval
173 174 __initCalled=False
175 -def initStruchk(configDir=None,logFile=None):
176 global __initCalled 177 if configDir is None: 178 configDir=os.path.join(RDConfig.RDDataDir,'struchk') 179 if configDir[-1]!=os.path.sep: 180 configDir+=os.path.sep 181 if logFile is None: 182 fd = tempfile.NamedTemporaryFile(suffix='.log',delete=False) 183 fd.close() 184 logFile= fd.name 185 struchk_init = '''-tm 186 -ta {0}checkfgs.trn 187 -tm 188 -or 189 -ca {0}checkfgs.chk 190 -cc 191 -cl 3 192 -cs 193 -cn 999 194 -l {1}\n'''.format(configDir, logFile) 195 initRes=pyAvalonTools.InitializeCheckMol(struchk_init) 196 if initRes: 197 raise ValueError('bad result from InitializeCheckMol: '+str(initRes)) 198 __initCalled=True
199
200 -def CheckCTAB(ctab, isSmiles=True):
201 if not __initCalled: 202 initStruchk() 203 mol_str = ctab 204 if not mol_str: 205 raise BadMoleculeException('Unexpected blank or NULL molecule') 206 else: 207 mol_str = _fix_line_ends(mol_str) 208 mol_str = _fix_chemdraw_header(mol_str) 209 210 if isSmiles: # branch for NULL_MOL checks 211 if mol_str and NULL_SMILES_RE.match(mol_str): 212 rval = T_NULL_MOL 213 else: 214 rval = pyAvalonTools.CheckMoleculeString(mol_str, isSmiles) 215 else: 216 # decompose the ctab into lines 217 # the line terminator may be \n or \r\n, or even r'\n' 218 ctab_lines = mol_str.split('\n') 219 if len(ctab_lines) <= 3: 220 raise BadMoleculeException('Not enough lines in CTAB') 221 _ctab_remove_chiral_flag(ctab_lines) 222 if not _ctab_has_atoms(ctab_lines): 223 rval = T_NULL_MOL 224 else: # reassemble the ctab lines into one string. 225 mol_str = '\n'.join(ctab_lines) 226 rval = pyAvalonTools.CheckMoleculeString(mol_str, isSmiles) 227 return rval
228 229 InchiResult = namedtuple('InchiResult',['error','inchi','fixed_ctab'])
230 -def GetInchiForCTAB(ctab):
231 """ 232 >>> from rdkit.Chem.MolKey import MolKey 233 >>> from rdkit.Avalon import pyAvalonTools 234 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1cn[nH]c1C(Cl)Br',True)) 235 >>> res.inchi 236 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h8H' 237 >>> res = MolKey.GetInchiForCTAB(pyAvalonTools.Generate2DCoords('c1c[nH]nc1C(Cl)Br',True)) 238 >>> res.inchi 239 'InChI=1/C4H4BrClN2/c5-4(6)3-1-2-7-8-3/h1-2,4H,(H,7,8)/t4?/f/h7H' 240 >>> 241 """ 242 inchi = None 243 ctab_str = ctab 244 (strucheck_err, fixed_mol) = CheckCTAB(ctab_str, False) 245 if strucheck_err & BAD_SET: 246 return (strucheck_err, None, fixed_mol) 247 248 conversion_err = 0 249 try: 250 r_mol = Chem.MolFromMolBlock(fixed_mol, sanitize=False) 251 if r_mol: 252 inchi = Chem.MolToInchi(r_mol, '/FixedH /SUU') 253 if not inchi: 254 conversion_err = INCHI_COMPUTATION_ERROR 255 else: 256 conversion_err = RDKIT_CONVERSION_ERROR 257 except Chem.InchiReadWriteError: 258 conversion_err = INCHI_READWRITE_ERROR 259 # keep warnings from strucheck 260 return InchiResult(strucheck_err | conversion_err, inchi, fixed_mol)
261
262 -def _make_racemate_inchi(inchi):
263 """ Normalize the stereo information (t-layer) to one selected isomer. """ 264 # set stereo type = 3 (racemate) for consistency 265 # reset inverted flag to m0 - not inverted 266 new_stereo = '/m0/s3/' 267 stereo_match = GET_STEREO_RE.match(inchi) 268 if stereo_match: 269 inchi = stereo_match.group(1) + new_stereo + stereo_match.group(2) 270 return inchi
271
272 -def _get_identification_string(err, ctab, inchi, stereo_category=None, extra_stereo=None):
273 if err & NULL_MOL : 274 return _get_null_mol_identification_string(extra_stereo) 275 elif err & BAD_SET: # bad molecules get special key 276 return _get_bad_mol_identification_string(ctab, stereo_category, extra_stereo) 277 278 # make key string 279 pieces = [] 280 if inchi: 281 pieces.append(inchi) 282 if not stereo_category: 283 raise MolIdentifierException('Stereo category may not be left undefined') 284 else: 285 pieces.append('ST=' + stereo_category) 286 if extra_stereo: 287 pieces.append('XTR=' + extra_stereo) 288 key_string = '/'.join(pieces) 289 return key_string
290
291 -def _get_null_mol_identification_string(extra_stereo) :
292 key_string = str(uuid.uuid1 ()) 293 return key_string
294
295 -def _get_bad_mol_identification_string(ctab, stereo_category, extra_stereo):
296 pieces = [] 297 ctab_str = ctab 298 if ctab_str: # make the ctab part of the key if available 299 ctab_str = _fix_line_ends(ctab_str) 300 ctab_str = _fix_chemdraw_header(ctab_str) 301 ctab_str = '\n'.join(ctab_str.split('\n')[3:]) 302 pieces.append(ctab_str.replace('\n', r'\n')) # make a handy one-line string 303 else: 304 pass 305 if stereo_category: # add xtra info if available 306 key_string = 'ST={0}'.format(stereo_category) 307 pieces.append(key_string) 308 if extra_stereo: # add xtra info if available 309 key_string = 'XTR={0}'.format(extra_stereo) 310 pieces.append(key_string) 311 key_string = '/'.join(pieces) 312 return key_string
313
314 -def _identify(err, ctab, inchi, stereo_category, extra_structure_desc=None):
315 """ Compute the molecule key based on the inchi string, 316 stereo category as well as extra structure 317 information """ 318 key_string = _get_identification_string(err, ctab, inchi, stereo_category, extra_structure_desc) 319 if key_string: 320 return "{0}|{1}".format(MOL_KEY_VERSION, 321 base64.b64encode(hashlib.md5(key_string.encode('UTF-8')).digest()).decode()) #pylint: disable=E1101 322 else: 323 return None
324
325 -def _get_chiral_identification_string(n_def, n_udf) :
326 assert n_def >= 0 327 assert n_udf >= 0 328 id_str = 'OTHER' 329 330 if n_def == 0 : # no defined stereocenters 331 if n_udf == 0 : # no undefined ones either 332 id_str = 'S_ACHIR' # -> achiral 333 elif n_udf == 1 : # one undefined, no defined 334 id_str = 'R_ONE' # -> racemate by convention 335 else: # several undefined, no defined 336 id_str = 'S_UNKN' # -> can't say anything based on the drawing 337 else: # some stereo defined 338 if n_udf == 0 : # fully specified stereo 339 id_str = 'S_ABS' # -> absolute stereo 340 else: # multiple possibilities 341 id_str = 'S_PART' # -> assume single compound (can usually be separated) 342 return id_str
343
344 -def ErrorBitsToText(err):
345 " returns a list of error bit descriptions for the error code provided " 346 error_text_list = [] 347 for err_dict_key in ERROR_DICT: 348 if (err & ERROR_DICT[err_dict_key]) > 0: 349 error_text_list.append(err_dict_key) 350 return error_text_list
351 352 MolKeyResult=namedtuple('MolKeyResult',['mol_key','error','inchi','fixed_ctab','stereo_code','stereo_comment'])
353 -def GetKeyForCTAB(ctab,stereo_info=None,stereo_comment=None,logger=None):
354 """ 355 >>> from rdkit.Chem.MolKey import MolKey 356 >>> from rdkit.Avalon import pyAvalonTools 357 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1C(F)Cl',True)) 358 >>> res.mol_key 359 '1|L7676nfGsSIU33wkx//NCg==' 360 >>> res.stereo_code 361 'R_ONE' 362 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@H](F)Cl',True)) 363 >>> res.mol_key 364 '1|Aj38EIxf13RuPDQG2A0UMw==' 365 >>> res.stereo_code 366 'S_ABS' 367 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1ccccc1[C@@H](F)Cl',True)) 368 >>> res.mol_key 369 '1|9ypfMrhxn1w0ncRooN5HXw==' 370 >>> res.stereo_code 371 'S_ABS' 372 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1[C@@H](F)Cl',True)) 373 >>> res.mol_key 374 '1|c96jMSlbn7O9GW5d5uB9Mw==' 375 >>> res.stereo_code 376 'S_PART' 377 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc([C@H](Br)Cl)c1[C@@H](F)Cl',True)) 378 >>> res.mol_key 379 '1|+B+GCEardrJteE8xzYdGLA==' 380 >>> res.stereo_code 381 'S_ABS' 382 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True)) 383 >>> res.mol_key 384 '1|5H9R3LvclagMXHp3Clrc/g==' 385 >>> res.stereo_code 386 'S_UNKN' 387 >>> res=MolKey.GetKeyForCTAB(pyAvalonTools.Generate2DCoords('c1cccc(C(Br)Cl)c1C(F)Cl',True),stereo_info='S_REL') 388 >>> res.mol_key 389 '1|cqKWVsUEY6QNpGCbDaDTYA==' 390 >>> res.stereo_code 391 'S_REL' 392 >>> res.inchi 393 'InChI=1/C8H6BrCl2F/c9-7(10)5-3-1-2-4-6(5)8(11)12/h1-4,7-8H/t7?,8?' 394 395 """ 396 if logger is None: 397 logger = logging 398 try: 399 err, inchi, fixed_mol = GetInchiForCTAB(ctab) 400 except BadMoleculeException: 401 logger.warn('Corrupt molecule substituting no-struct: --->\n{0}\n<----'.format(ctab)) 402 err = NULL_MOL 403 key = _identify(err, '', '', None, None) 404 return MolKeyResult(key, err, '', '', None, None) 405 406 # read or estimate stereo category and/or extra structure description 407 stereo_category = None 408 extra_structure_desc = stereo_comment 409 if stereo_info: # check stereo_info field for coded stereo category and extra stereo info 410 info_flds = stereo_info.split(' ', 1) 411 code_fld = info_flds[0] 412 if code_fld in stereo_code_dict: 413 stereo_category = code_fld 414 if (not stereo_comment) and len(info_flds) > 1: 415 extra_structure_desc = info_flds[1].strip() 416 else: 417 logger.warn('stereo code {0} not recognized. Using default value for ctab.'.format(code_fld)) 418 419 if not (err & BAD_SET): 420 (n_stereo, n_undef_stereo, is_meso, dummy) = InchiInfo.InchiInfo(inchi).get_sp3_stereo()['main']['non-isotopic'] 421 if stereo_category == None or stereo_category == 'DEFAULT' : # compute if not set 422 stereo_category = _get_chiral_identification_string(n_stereo - n_undef_stereo, 423 n_undef_stereo) 424 else: 425 raise NotImplementedError("currently cannot generate correct keys for molecules with struchk errors") 426 key = _identify(err, fixed_mol, inchi, stereo_category, extra_structure_desc) 427 return MolKeyResult(key, err, inchi, fixed_mol, stereo_category, extra_structure_desc)
428 429 430 431 #------------------------------------ 432 # 433 # doctest boilerplate 434 #
435 -def _test():
436 import doctest,sys 437 return doctest.testmod(sys.modules["__main__"])
438 439 440 if __name__ == '__main__': 441 import sys 442 failed,tried = _test() 443 sys.exit(failed) 444