Package rdkit :: Package Chem :: Package MolDb :: Module Loader_orig
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MolDb.Loader_orig

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2007-2008 Greg Landrum 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  from rdkit import Chem 
 11  from rdkit.Chem import AllChem 
 12  from rdkit.Chem import Lipinski,Descriptors,Crippen 
 13  from rdkit.Dbase.DbConnection import DbConnect 
 14  from rdkit.Dbase import DbModule 
 15  import re 
 16   
 17  #set up the logger: 
 18  import rdkit.RDLogger as logging 
 19  logger = logging.logger() 
 20  logger.setLevel(logging.INFO) 
 21   
22 -def ProcessMol(mol,typeConversions,globalProps,nDone,nameProp='_Name',nameCol='compound_id', 23 redraw=False,keepHs=False, 24 skipProps=False,addComputedProps=False, 25 skipSmiles=False, 26 uniqNames=None,namesSeen=None):
27 if not mol: 28 raise ValueError('no molecule') 29 if keepHs: 30 Chem.SanitizeMol(mol) 31 try: 32 nm = mol.GetProp(nameProp) 33 except KeyError: 34 nm = None 35 if not nm: 36 nm = 'Mol_%d'%nDone 37 if uniqNames and nm in namesSeen: 38 logger.error('duplicate compound id (%s) encountered. second instance skipped.'%nm) 39 return None 40 namesSeen.add(nm) 41 row = [nm] 42 if not skipProps: 43 if addComputedProps: 44 nHD=Lipinski.NumHDonors(mol) 45 mol.SetProp('DonorCount',str(nHD)) 46 nHA=Lipinski.NumHAcceptors(mol) 47 mol.SetProp('AcceptorCount',str(nHA)) 48 nRot=Lipinski.NumRotatableBonds(mol) 49 mol.SetProp('RotatableBondCount',str(nRot)) 50 MW=Descriptors.MolWt(mol) 51 mol.SetProp('AMW',str(MW)) 52 logp=Crippen.MolLogP(mol) 53 mol.SetProp('MolLogP',str(logp)) 54 55 pns = list(mol.GetPropNames()) 56 pD={} 57 for pi,pn in enumerate(pns): 58 if pn.lower()==nameCol.lower(): continue 59 pv = mol.GetProp(pn).strip() 60 if pv.find('>')<0 and pv.find('<')<0: 61 colTyp = globalProps.get(pn,2) 62 while colTyp>0: 63 try: 64 tpi = typeConversions[colTyp][1](pv) 65 except Exception: 66 colTyp-=1 67 else: 68 break 69 globalProps[pn]=colTyp 70 pD[pn]=typeConversions[colTyp][1](pv) 71 else: 72 pD[pn]=pv 73 else: 74 pD={} 75 if redraw: 76 AllChem.Compute2DCoords(m) 77 if not skipSmiles: 78 row.append(Chem.MolToSmiles(mol,True)) 79 row.append(DbModule.binaryHolder(mol.ToBinary())) 80 row.append(pD) 81 return row
82
83 -def ConvertRows(rows,globalProps,defaultVal,skipSmiles):
84 for i,row in enumerate(rows): 85 newRow = [row[0],row[1]] 86 pD=row[-1] 87 for pn in globalProps: 88 pv = pD.get(pn,defaultVal) 89 newRow.append(pv) 90 newRow.append(row[2]) 91 if not skipSmiles: 92 newRow.append(row[3]) 93 rows[i] = newRow
94
95 -def LoadDb(suppl,dbName,nameProp='_Name',nameCol='compound_id',silent=False, 96 redraw=False,errorsTo=None,keepHs=False,defaultVal='N/A',skipProps=False, 97 regName='molecules',skipSmiles=False,maxRowsCached=-1, 98 uniqNames=False,addComputedProps=False,lazySupplier=False, 99 startAnew=True):
100 if not lazySupplier: 101 nMols = len(suppl) 102 else: 103 nMols=-1 104 if not silent: 105 logger.info("Generating molecular database in file %s"%dbName) 106 if not lazySupplier: 107 logger.info(" Processing %d molecules"%nMols) 108 rows = [] 109 globalProps = {} 110 namesSeen = set() 111 nDone = 0 112 typeConversions={0:('varchar',str),1:('float',float),2:('int',int)} 113 for m in suppl: 114 nDone +=1 115 if not m: 116 if errorsTo: 117 if hasattr(suppl,'GetItemText'): 118 d = suppl.GetItemText(nDone-1) 119 errorsTo.write(d) 120 else: 121 logger.warning('full error file support not complete') 122 continue 123 124 row=ProcessMol(m,typeConversions,globalProps,nDone,nameProp=nameProp, 125 nameCol=nameCol,redraw=redraw, 126 keepHs=keepHs,skipProps=skipProps, 127 addComputedProps=addComputedProps,skipSmiles=skipSmiles, 128 uniqNames=uniqNames,namesSeen=namesSeen) 129 if row is None: continue 130 rows.append([nDone]+row) 131 if not silent and not nDone%100: 132 logger.info(' done %d'%nDone) 133 if len(rows)==maxRowsCached: 134 break 135 136 nameDef='%s varchar not null'%nameCol 137 if uniqNames: 138 nameDef += ' unique' 139 typs = ['guid integer not null primary key',nameDef] 140 pns = [] 141 for pn,v in globalProps.items(): 142 addNm = re.sub(r'[\W]','_',pn) 143 typs.append('%s %s'%(addNm,typeConversions[v][0])) 144 pns.append(pn.lower()) 145 146 if not skipSmiles: 147 if 'smiles' not in pns: 148 typs.append('smiles varchar') 149 else: 150 typs.append('cansmiles varchar') 151 typs.append('molpkl %s'%(DbModule.binaryTypeName)) 152 conn = DbConnect(dbName) 153 curs = conn.GetCursor() 154 if startAnew: 155 try: 156 curs.execute('drop table %s'%regName) 157 except Exception: 158 pass 159 curs.execute('create table %s (%s)'%(regName,','.join(typs))) 160 else: 161 curs.execute('select * from %s limit 1'%(regName,)) 162 ocolns = set([x[0] for x in curs.description]) 163 ncolns = set([x.split()[0] for x in typs]) 164 if ncolns != ocolns: 165 raise ValueError('Column names do not match: %s != %s'%(ocolns,ncolns)) 166 curs.execute('select max(guid) from %s'%(regName,)) 167 offset = curs.fetchone()[0] 168 for row in rows: 169 row[0] += offset 170 171 qs = ','.join([DbModule.placeHolder for x in typs]) 172 173 174 ConvertRows(rows,globalProps,defaultVal,skipSmiles) 175 curs.executemany('insert into %s values (%s)'%(regName,qs),rows) 176 conn.Commit() 177 178 rows = [] 179 while 1: 180 nDone +=1 181 try: 182 m = next(suppl) 183 except StopIteration: 184 break 185 if not m: 186 if errorsTo: 187 if hasattr(suppl,'GetItemText'): 188 d = suppl.GetItemText(nDone-1) 189 errorsTo.write(d) 190 else: 191 logger.warning('full error file support not complete') 192 continue 193 tmpProps={} 194 row=ProcessMol(m,typeConversions,globalProps,nDone,nameProp=nameProp, 195 nameCol=nameCol,redraw=redraw, 196 keepHs=keepHs,skipProps=skipProps, 197 addComputedProps=addComputedProps,skipSmiles=skipSmiles, 198 uniqNames=uniqNames,namesSeen=namesSeen) 199 if not row: continue 200 rows.append([nDone]+row) 201 if not silent and not nDone%100: 202 logger.info(' done %d'%nDone) 203 if len(rows)==maxRowsCached: 204 ConvertRows(rows,globalProps,defaultVal,skipSmiles) 205 curs.executemany('insert into %s values (%s)'%(regName,qs),rows) 206 conn.Commit() 207 rows = [] 208 if len(rows): 209 ConvertRows(rows,globalProps,defaultVal,skipSmiles) 210 curs.executemany('insert into %s values (%s)'%(regName,qs),rows) 211 conn.Commit()
212