Package rdkit :: Package Chem :: Package MolKey :: Module InchiInfo
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MolKey.InchiInfo

  1  # 
  2  #  Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. 
  3  #  All rights reserved. 
  4  #  
  5  # Redistribution and use in source and binary forms, with or without 
  6  # modification, are permitted provided that the following conditions are 
  7  # met:  
  8  # 
  9  #     * Redistributions of source code must retain the above copyright  
 10  #       notice, this list of conditions and the following disclaimer. 
 11  #     * Redistributions in binary form must reproduce the above 
 12  #       copyright notice, this list of conditions and the following  
 13  #       disclaimer in the documentation and/or other materials provided  
 14  #       with the distribution. 
 15  #     * Neither the name of Novartis Institutes for BioMedical Research Inc.  
 16  #       nor the names of its contributors may be used to endorse or promote  
 17  #       products derived from this software without specific prior written permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
 22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
 23  # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 24  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 25  # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 26  # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 27  # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 28  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 29  # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 30  # 
 31  # Retrieve stereo and tautomer information from the InChI string 
 32  # Created on Sep 23, 2010 
 33  # Original author: Thomas Muellerk muelleth 
 34  import logging 
 35  import re 
 36  import unittest 
 37  from rdkit import Chem 
 38   
 39  from rdkit.Chem import inchi 
 40  if not inchi.INCHI_AVAILABLE: 
 41      raise ImportError("This code requires the RDKit to be built with InChI suport") 
 42   
def _is_achiral_by_symmetry(INCHI):
    """Return True when RDKit reports no chiral centres for *INCHI*.

    *INCHI* is parsed as given first; if that does not yield a molecule it is
    retried with an ``InChI=1/`` prefix prepended (callers in this module pass
    layer text that has already had its version prefix split off).

    Returns False whenever ``Chem.FindMolChiralCenters`` raises.
    """
    molecule = Chem.MolFromInchi(INCHI) or Chem.MolFromInchi('InChI=1/{0}'.format(INCHI))

    try:
        centres = Chem.FindMolChiralCenters(molecule, True, True)
    except Exception:
        # Best effort: any failure is treated as "not achiral by symmetry".
        return False

    # No centre reported at all means there is no real chiral centre.
    return not centres
# Stream handler created at import time (not attached to any logger in the
# code visible here).
console = logging.StreamHandler()
# Logger for application runtime information.
UPD_APP = logging.getLogger('inchiinfo.application')

# All patterns are raw strings so that backslash escapes such as \/ and \d
# reach the regex engine untouched and do not trigger Python's
# invalid-escape-sequence warning.  The pattern text itself is unchanged.

# --- splitting an InChI into nested layers ---------------------------------
version_re = re.compile(r'(.*?)/(.*)')  # text before the first '/' vs. the rest
reconnected_re = re.compile(r'(.*?)/r(.*)')  # reconnected layer?
fixed_h_re = re.compile(r'(.*?)/f(.*)')  # fixed-H layer?
isotope_re = re.compile(r'(.*?)/i(.*)')  # isotope layer?

# --- sp3 stereo ------------------------------------------------------------
# '/t' segment that is followed by a further '/' segment.
stereo_re = re.compile(r'.*\/t(.*?)\/.*')
# '/t' segment captured up to the next '/' or the end of the text.
stereo_all_re = re.compile(r'.*\/t([^\/]+)')
undef_stereo_re = re.compile(r'(\d+)\?')  # atom numbers marked '?'
all_stereo_re = re.compile(r'(\d+)[?+-]')  # atom numbers marked '?', '+' or '-'
defined_stereo_re = re.compile(r'(\d+)[+-]')  # atom numbers marked '+' or '-'

# --- mobile hydrogens ------------------------------------------------------
# Everything after the last '/h' in the text.
h_layer_re = re.compile(r'.*\/h(.*)\/?')
mobile_h_group_re = re.compile(r'(\(H.+?\))')  # one parenthesised '(H...)' group
mobile_h_atoms_re = re.compile(r',(\d+)')  # numbers preceded by a comma
class InchiInfo(object):
    """Layer-wise view of an InChI string.

    The text after the version prefix is split three times and stored as a
    nested dict in ``parsed_inchi``::

        {connection_layer: {fixed_h_layer: {isotope_layer: text}}}

    * connection layer: ``'id'``, or ``'id_disconnected'`` and
      ``'id_reconnected'`` when an ``/r`` marker is present;
    * fixed-H layer: ``'main'``, plus ``'fixed_h'`` when ``/f`` is present;
    * isotope layer: ``'non-isotopic'``, plus ``'isotopic'`` when ``/i`` is
      present.
    """

    def __init__(self, inchi_str):
        """Split *inchi_str* into layers and store them in ``parsed_inchi``.

        Raises ``AttributeError`` when ``version_re`` does not match the
        input (for example when it contains no ``/``).
        """
        # The version prefix is split off and not kept.
        _version, remainder = version_re.match(inchi_str).groups()

        # Level 1: disconnected vs. reconnected structure.
        reconnected = reconnected_re.match(remainder)
        if reconnected:
            disconnected_text, reconnected_text = reconnected.groups()
            by_connection = {}
            by_connection['id_disconnected'] = disconnected_text
            by_connection['id_reconnected'] = reconnected_text
        else:
            by_connection = {'id': remainder}

        parsed = {}
        for conn_key, conn_text in by_connection.items():
            # Level 2: main vs. fixed-H part.
            fixed_h = fixed_h_re.match(conn_text)
            if fixed_h:
                by_fixed = dict(zip(('main', 'fixed_h'), fixed_h.groups()))
            else:
                by_fixed = {'main': conn_text}

            parsed[conn_key] = {}
            for fixed_key, fixed_text in by_fixed.items():
                # Level 3: non-isotopic vs. isotopic part.
                isotopic = isotope_re.match(fixed_text)
                if isotopic:
                    by_isotope = dict(zip(('non-isotopic', 'isotopic'), isotopic.groups()))
                else:
                    by_isotope = {'non-isotopic': fixed_text}
                parsed[conn_key][fixed_key] = by_isotope

        self.parsed_inchi = parsed

    def get_sp3_stereo(self):
        """Collect sp3 stereo information for every parsed layer.

        Returns a dict ``{fixed_h_layer: {isotope_layer: info}}`` where
        ``info`` is the 4-tuple

        1) number of stereocentres found in the ``/t`` segment (0 if none),
        2) number of those marked undefined (``?``),
        3) meso flag: True when the ``/t`` segment is the last segment of the
           layer text and holds more than one defined centre, or when more
           than one centre is undefined and ``_is_achiral_by_symmetry``
           accepts the layer text,
        4) the raw ``/t`` segment text (``''`` when there is none).

        The result is keyed by fixed-H layer only, so when several connection
        layers exist, entries of one processed later replace those of one
        processed earlier.
        """
        result = {}

        for by_fixed in self.parsed_inchi.values():
            for fixed_key, by_isotope in by_fixed.items():
                per_isotope = result[fixed_key] = {}
                for iso_key, layer_text in by_isotope.items():
                    followed = stereo_re.match(layer_text)
                    terminal = stereo_all_re.match(layer_text)

                    stereo = ''
                    is_meso = False
                    if followed:
                        # '/t' segment followed by further segments.
                        stereo = followed.group(1)
                    elif terminal:
                        # '/t' is the last segment: only undefined stereo, or
                        # the meso case.
                        stereo = terminal.group(1)
                        is_meso = len(defined_stereo_re.findall(stereo)) > 1

                    total = len(all_stereo_re.findall(stereo))
                    undefined = len(undef_stereo_re.findall(stereo))

                    # Meso centres -- VT -- 2011.12.08
                    if not is_meso:
                        is_meso = undefined > 1 and _is_achiral_by_symmetry(layer_text)

                    per_isotope[iso_key] = (total, undefined, is_meso, stereo)

        return result

    def get_mobile_h(self):
        """Collect mobile-H (tautomer) information for every parsed layer.

        Returns a dict ``{fixed_h_layer: {isotope_layer: info}}`` where
        ``info`` is the 2-tuple

        1) number of ``(H...)`` groups found after ``/h`` (0 if none),
        2) those groups joined with commas (``''`` if none).

        As in ``get_sp3_stereo`` the result is keyed by fixed-H layer only.
        """
        result = {}

        for by_fixed in self.parsed_inchi.values():
            for fixed_key, by_isotope in by_fixed.items():
                per_isotope = result[fixed_key] = {}
                for iso_key, layer_text in by_isotope.items():
                    h_layer = h_layer_re.match(layer_text)
                    if h_layer:
                        groups = mobile_h_group_re.findall(h_layer.group(1))
                    else:
                        groups = []
                    per_isotope[iso_key] = (len(groups), ','.join(groups))

        return result

# Test molecules as InChI strings, used as fixtures by the unit tests in this
# module.

# --- tautomers (mobile-H groups) ---
GUANINE='InChI=1S/C5H5N5O/c6-5-9-3-2(4(11)10-5)7-1-8-3/h1H0,(H4,6,7,8,9,10,11)'
# 'N=C(-O)N', '/FixedH /SUU'
UREA1 = 'InChI=1/CH4N2O/c2-1(3)4/h(H4,2,3,4)/f/h2,4H,3H2/b2-1?'
# 'NC(=O)N', '/FixedH /SUU'
UREA2 = 'InChI=1/CH4N2O/c2-1(3)4/h(H4,2,3,4)/f/h2-3H2'
# Urea with an isotopic ('/i') layer.
TRITIATED_UREA='InChI=1S/CH4N2O/c2-1(3)4/h(H4,2,3,4)/i/hT3'
DEUTERATED_UREA='InChI=1S/CH4N2O/c2-1(3)4/h(H4,2,3,4)/i/hD2'
# NOTE(review): despite the names, the formula C3H6O2 and connectivity
# c1-2-3(4)5 in the next two strings describe a three-carbon acid (propanoic
# acid), not acetic acid.  The names are left as-is because the tests refer to
# them.
ACETIC_ACID='InChI=1S/C3H6O2/c1-2-3(4)5/h2H2,1H3,(H,4,5)'
# Same string with a '/p-1' layer appended.
ACETATE='InChI=1S/C3H6O2/c1-2-3(4)5/h2H2,1H3,(H,4,5)/p-1'
# Two mobile-H groups each.
mobile1='InChI=1S/C5H5N3O2/c6-4(9)3-1-7-2-8-5(3)10/h1-2H,(H2,6,9)(H,7,8,10)' # invented
mobile2='InChI=1S/C7H10N4O/c1-4-2-5(3-6(8)12)11-7(9)10-4/h2H,3H2,1H3,(H2,8,12)(H2,9,10,11)'

# --- sp3 stereo ---
sugar1='InChI=1S/C14H20O9/c1-6-11(20-7(2)15)12(21-8(3)16)13(22-9(4)17)14(19-6)23-10(5)18/h6,11-14H,1-5H3/t6-,11-,12+,13+,14?/m0/s1' # L-rhamnopyranose (source: chemspider)
sugar2='InChI=1S/C12H20O6/c1-11(2)14-5-6(16-11)8-7(13)9-10(15-8)18-12(3,4)17-9/h6-10,13H,5H2,1-4H3/t6-,7-,8-,9-,10-/m1/s1' # MFCD00135634 (Diacetone-D-Glucose, source: chemspider)
sp3_unk='InChI=1S/C12H21NO4/c1-8(2)10(12(15)16-3)13-11(14)9-5-4-6-17-7-9/h8-10H,4-7H2,1-3H3,(H,13,14)/t9?,10-/m0/s1' # derived from ChemSpider 34044335

class TestInchiInfo(unittest.TestCase):
    """Checks the stereo and mobile-H counts InchiInfo reports for fixture InChIs."""

    def doTest(self, inchi, numSp3=0, numUndefSp3=0, numMobileHGroups=0, layer='non-isotopic'):
        """Parse *inchi* and compare the counts of the ``'main'``/*layer* entry.

        numSp3           -- expected number of sp3 stereocentres
        numUndefSp3      -- expected number of undefined sp3 stereocentres
        numMobileHGroups -- expected number of mobile-H groups
        layer            -- isotope-layer key to inspect

        The meso flag, the stereo text and the mobile-H group text are
        unpacked but not checked.
        """
        info = InchiInfo(inchi)

        found_sp3, found_undef, _meso, _stereo_text = info.get_sp3_stereo()['main'][layer]
        self.assertEqual(found_sp3, numSp3)
        self.assertEqual(found_undef, numUndefSp3)

        found_groups, _group_text = info.get_mobile_h()['main'][layer]
        self.assertEqual(found_groups, numMobileHGroups)

    # tautomers

    def testGuanine(self):
        self.doTest(GUANINE, numSp3=0, numUndefSp3=0, numMobileHGroups=1)

    def testTritiatedUrea(self):
        self.doTest(TRITIATED_UREA, numSp3=0, numUndefSp3=0, numMobileHGroups=1)

    def testDeuteratedUrea(self):
        self.doTest(DEUTERATED_UREA, numSp3=0, numUndefSp3=0, numMobileHGroups=1)

    def testAceticAcid(self):
        self.doTest(ACETIC_ACID, numSp3=0, numUndefSp3=0, numMobileHGroups=1)

    def testAcetate(self):
        self.doTest(ACETATE, numSp3=0, numUndefSp3=0, numMobileHGroups=1)

    def testMobile1(self):
        self.doTest(mobile1, numSp3=0, numUndefSp3=0, numMobileHGroups=2)

    def testMobile2(self):
        self.doTest(mobile2, numSp3=0, numUndefSp3=0, numMobileHGroups=2)

    # sp3 stereo

    def testSugar1(self):
        self.doTest(sugar1, numSp3=5, numUndefSp3=1, numMobileHGroups=0)

    def testSugar2(self):
        self.doTest(sugar2, numSp3=5, numUndefSp3=0, numMobileHGroups=0)

    def testSP3_unk(self):
        self.doTest(sp3_unk, numSp3=2, numUndefSp3=1, numMobileHGroups=1)


if __name__ == '__main__':
    unittest.main()
