Package rdkit :: Package Chem :: Package Pharm2D :: Module SigFactory
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Pharm2D.SigFactory

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2003-2008 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ contains factory class for producing signatures 
 12   
 13   
 14  """ 
 15  from __future__ import print_function, division 
 16  from rdkit.DataStructs import SparseBitVect,IntSparseIntVect,LongSparseIntVect 
 17  from rdkit.Chem.Pharm2D import Utils 
 18  import copy 
 19  import numpy 
 20   
 21  _verbose = False 
 22   
 23   
class SigFactory(object):
    """Factory for 2D pharmacophore signatures.

    SigFactory's are used by creating one, setting the relevant
    parameters, then calling the GetSignature() method each time a
    signature is required.

    SetBins() (or an explicit call to Init()) must have been made before
    any of the signature-size dependent methods are used, since those
    rely on attributes that only Init() creates.

    """

    def __init__(self, featFactory, useCounts=False, minPointCount=2, maxPointCount=3,
                 shortestPathsOnly=True, includeBondOrder=False, skipFeats=None,
                 trianglePruneBins=True):
        """ stores the signature parameters

          **Arguments**

            - featFactory: object providing GetFeatureFamilies() and
              GetFeaturesForMol()

            - useCounts: if true, Init() selects a count vector class
              instead of a bit vector class for the signatures

            - minPointCount, maxPointCount: smallest and largest number of
              points in the pharmacophores encoded in the signature

            - shortestPathsOnly, includeBondOrder: stored on the instance;
              not otherwise used inside this class

            - skipFeats: (optional) feature family names to leave out

            - trianglePruneBins: forwarded to Utils.GetPossibleScaffolds()
              as useTriangleInequality

        """
        self.featFactory = featFactory
        self.useCounts = useCounts
        self.minPointCount = minPointCount
        self.maxPointCount = maxPointCount
        self.shortestPathsOnly = shortestPathsOnly
        self.includeBondOrder = includeBondOrder
        self.trianglePruneBins = trianglePruneBins
        if skipFeats is None:
            # a fresh list per instance, never a shared default
            self.skipFeats = []
        else:
            self.skipFeats = skipFeats
        self._bins = None
        self.sigKlass = None

    def SetBins(self, bins):
        """ bins should be a list of 2-tuples

        A shallow copy of the list is stored and Init() is re-run so the
        derived signature layout matches the new bins.

        """
        self._bins = copy.copy(bins)
        self.Init()

    def GetBins(self):
        """ returns the stored distance bins (None until SetBins() is called) """
        return self._bins

    def GetNumBins(self):
        """ returns the number of distance bins """
        return len(self._bins)

    def GetSignature(self):
        """ returns a new, empty signature of the class chosen by Init() """
        return self.sigKlass(self._sigSize)

    def _GetBitSummaryData(self, bitIdx):
        """ Internal use only
        Collects the pieces needed to describe a bit.

          **Returns**

            a 5-tuple: (nPts, combo, scaffold, labels, dMat) where the first
            three come from GetBitInfo(), labels are the feature family
            names for combo and dMat is a symmetric nPts x nPts integer
            matrix holding the scaffold's distance-bin indices.

        """
        nPts, combo, scaffold = self.GetBitInfo(bitIdx)
        fams = self.GetFeatFamilies()
        labels = [fams[x] for x in combo]
        # numpy.int was only an alias for the builtin int and has been
        # removed from NumPy, so the builtin is used directly.
        dMat = numpy.zeros((nPts, nPts), int)
        dVect = Utils.nPointDistDict[nPts]
        for idx, (i, j) in enumerate(dVect):
            dMat[i, j] = scaffold[idx]
            dMat[j, i] = scaffold[idx]

        return nPts, combo, scaffold, labels, dMat

    def GetBitDescriptionAsText(self, bitIdx, includeBins=0, fullPage=1):
        """ returns text with a description of the bit

          **Arguments**

            - bitIdx: an integer bit index

            - includeBins: (optional) if nonzero, information about the bins will be
              included as well

            - fullPage: (optional) if nonzero, html headers and footers will
              be included (so as to make the output a complete page)

          **Returns**

            a string with the HTML

        """
        # NOTE(review): the summary data is gathered but no text is ever
        # assembled, so this currently returns None and ignores includeBins
        # and fullPage. Left unchanged; use GetBitDescription() for a
        # working textual description.
        nPts, combo, scaffold, labels, dMat = self._GetBitSummaryData(bitIdx)

    def GetBitDescription(self, bitIdx):
        """ returns a text description of the bit

          **Arguments**

            - bitIdx: an integer bit index

          **Returns**

            a string: the space-separated feature labels followed by the
            rows of the distance-bin matrix, each row introduced by "|"

        """
        nPts, combo, scaffold, labels, dMat = self._GetBitSummaryData(bitIdx)
        rows = ["|" + " ".join([str(x) for x in row]) for row in dMat]
        return " ".join(labels) + " " + "".join(rows) + "|"

    def _findBinIdx(self, dists, bins, scaffolds):
        """ OBSOLETE: this has been rewritten in C++
        Internal use only
        Returns the index of a bin defined by a set of distances.

          **Arguments**

            - dists: a sequence of distances (not binned)

            - bins: a sorted sequence of distance bins (2-tuples)

            - scaffolds: a list of possible scaffolds (bin combinations)

          **Returns**

            an integer bin index, or None if one of the distances does not
            fall inside any bin

          **Raises**

            ValueError if the combination of bins is not in scaffolds

          **Note**

            the value returned here is not an index in the overall
            signature.  It is, rather, an offset of a scaffold in the
            possible combinations of distance bins for a given
            proto-pharmacophore.

        """
        whichBins = [0] * len(dists)

        # This would be a ton easier if we had contiguous bins
        # i.e. if we could maintain the bins as a list of bounds)
        # because then we could use Python's bisect module.
        # Since we can't do that, we've got to do our own binary
        # search here.
        for i, dist in enumerate(dists):
            where = -1

            # do a simple binary search; each bin is treated as the
            # half-open interval [begBin, endBin)
            startP, endP = 0, len(bins)
            while startP < endP:
                midP = (startP + endP) // 2
                begBin, endBin = bins[midP]
                if dist < begBin:
                    endP = midP
                elif dist >= endBin:
                    startP = midP + 1
                else:
                    where = midP
                    break
            if where < 0:
                return None
            whichBins[i] = where
        res = scaffolds.index(tuple(whichBins))
        if _verbose:
            print('----- _fBI -----------')
            print(' scaffolds:', scaffolds)
            print(' bins:', whichBins)
            print(' res:', res)
        return res

    def GetFeatFamilies(self):
        """ returns the sorted list of feature family names, without skipFeats """
        return sorted(fam for fam in self.featFactory.GetFeatureFamilies()
                      if fam not in self.skipFeats)

    def GetMolFeats(self, mol):
        """ returns the feature matches for a molecule

          **Returns**

            a list with one entry per family (in GetFeatFamilies() order);
            each entry is the list of GetAtomIds() results for the features
            of that family found in mol

        """
        return [[feat.GetAtomIds()
                 for feat in self.featFactory.GetFeaturesForMol(mol, includeOnly=fam)]
                for fam in self.GetFeatFamilies()]

    def GetBitIdx(self, featIndices, dists, sortIndices=True):
        """ returns the index for a pharmacophore described using a set of
          feature indices and distances

          **Arguments**

            - featIndices: a sequence of feature indices

            - dists: a sequence of distance between the features, only the
              unique distances should be included, and they should be in the
              order defined in Utils.

            - sortIndices : sort the indices

          **Returns**

            the integer bit index

          **Raises**

            - NotImplementedError for more than 3 points

            - IndexError for a bad number of points, a bad feature index,
              or distances that do not map onto a known bin/scaffold

        """
        nPoints = len(featIndices)
        if nPoints > 3:
            raise NotImplementedError('>3 points not supported')
        if nPoints < self.minPointCount:
            raise IndexError('bad number of points')
        if nPoints > self.maxPointCount:
            raise IndexError('bad number of points')

        # this is the start of the nPoint-point pharmacophores
        startIdx = self._starts[nPoints]

        #
        # now we need to map the pattern indices to an offset from startIdx
        #
        if sortIndices:
            featIndices = sorted(featIndices)

        # min() rather than [0] so that a negative index is caught even
        # when the caller asked us not to sort
        if min(featIndices) < 0:
            raise IndexError('bad feature index')
        if max(featIndices) >= self._nFeats:
            raise IndexError('bad feature index')

        if nPoints == 3:
            featIndices, dists = Utils.OrderTriangle(featIndices, dists)

        offset = Utils.CountUpTo(self._nFeats, nPoints, featIndices)
        if _verbose:
            print('offset for feature %s: %d' % (str(featIndices), offset))
        scaffolds = self._scaffolds[len(dists)]
        offset *= len(scaffolds)

        try:
            if _verbose:
                print('>>>>>>>>>>>>>>>>>>>>>>>')
                print('\tScaffolds:', repr(scaffolds), type(scaffolds))
                print('\tDists:', repr(dists), type(dists))
                print('\tbins:', repr(self._bins), type(self._bins))
            binIdx = self._findBinIdx(dists, self._bins, scaffolds)
        except ValueError:
            # the bin combination is not one of the allowed scaffolds
            binIdx = None

        if binIdx is None:
            # Either the lookup raised ValueError or the pure-Python
            # _findBinIdx returned None because a distance is outside every
            # bin. Both mean "no such bit", so report them the same way
            # instead of failing on the addition below.
            fams = self.GetFeatFamilies()
            fams = [fams[x] for x in featIndices]
            raise IndexError('distance bin not found: feats: %s; dists=%s; bins=%s; scaffolds: %s' %
                             (fams, dists, self._bins, self._scaffolds))

        return startIdx + offset + binIdx

    def GetBitInfo(self, idx):
        """ returns information about the given bit

          **Arguments**

            - idx: the bit index to be considered

          **Returns**

            a 3-tuple:

              1) the number of points in the pharmacophore

              2) the proto-pharmacophore (tuple of pattern indices)

              3) the scaffold (tuple of distance indices)

          **Raises**

            IndexError if idx is negative or not smaller than the
            signature size

        """
        # a negative index would otherwise be decoded, via Python's
        # negative indexing, into data for an unrelated bit
        if idx < 0 or idx >= self._sigSize:
            raise IndexError('bad index (%d) queried. %d is the max' % (idx, self._sigSize))
        # first figure out how many points are in the p'cophore
        nPts = self.minPointCount
        while nPts < self.maxPointCount and self._starts[nPts + 1] <= idx:
            nPts += 1

        # how far are we in from the start point?
        offsetFromStart = idx - self._starts[nPts]
        if _verbose:
            print('\t %d Points, %d offset' % (nPts, offsetFromStart))

        # lookup the number of scaffolds
        nDists = len(Utils.nPointDistDict[nPts])
        scaffolds = self._scaffolds[nDists]

        nScaffolds = len(scaffolds)

        # figure out to which proto-pharmacophore we belong:
        protoIdx = offsetFromStart // nScaffolds
        indexCombos = Utils.GetIndexCombinations(self._nFeats, nPts)
        combo = tuple(indexCombos[protoIdx])
        if _verbose:
            print('\t combo: %s' % (str(combo)))

        # and which scaffold:
        scaffoldIdx = offsetFromStart % nScaffolds
        scaffold = scaffolds[scaffoldIdx]
        if _verbose:
            print('\t scaffold: %s' % (str(scaffold)))
        return nPts, combo, scaffold

    def Init(self):
        """ Initializes internal parameters.  This **must** be called after
          making any changes to the signature parameters

          Sets _scaffolds, _starts, _nFeats, _sigSize and sigKlass.

        """
        accum = 0
        self._scaffolds = [0] * (len(Utils.nPointDistDict[self.maxPointCount + 1]))
        self._starts = {}
        # count exactly the families that GetFeatFamilies() reports, so
        # feature indices and _nFeats cannot drift apart
        self._nFeats = len(self.GetFeatFamilies())
        for nPts in range(self.minPointCount, self.maxPointCount + 1):
            # bits for nPts-point pharmacophores start where the previous
            # point count stopped
            self._starts[nPts] = accum
            nDistsHere = len(Utils.nPointDistDict[nPts])
            scaffoldsHere = Utils.GetPossibleScaffolds(nPts, self._bins,
                                                       useTriangleInequality=self.trianglePruneBins)
            # scaffolds are stored by number of distances, not number of points
            self._scaffolds[nDistsHere] = scaffoldsHere
            accum += Utils.NumCombinations(self._nFeats, nPts) * len(scaffoldsHere)
        self._sigSize = accum
        if not self.useCounts:
            self.sigKlass = SparseBitVect
        elif self._sigSize < 2**31:
            self.sigKlass = IntSparseIntVect
        else:
            self.sigKlass = LongSparseIntVect

    def GetSigSize(self):
        """ returns the total number of bits in the signature (set by Init()) """
        return self._sigSize
# Prefer the compiled implementation of the bin lookup when the C++
# extension module can be imported. If the import fails, the pure-Python
# SigFactory._findBinIdx defined in the class stays in place.
try:
    from rdkit.Chem.Pharmacophores import cUtils
except ImportError:
    pass
else:
    SigFactory._findBinIdx = cUtils.FindBinIdx