Package rdkit :: Package ML :: Package InfoTheory :: Module BitRank
[hide private]
[frames] | [no frames]

Source Code for Module rdkit.ML.InfoTheory.BitRank

  1  # 
  2  #  Copyright (C) 2001,2002,2003  greg Landrum and Rational Discovery LLC 
  3  # 
  4  """ Functionality for ranking bits using info gains 
  5   
  6   **Definitions used in this module** 
  7   
  8      - *sequence*: an object capable of containing other objects which supports 
  9        __getitem__() and __len__().  Examples of these include lists, tuples, and 
 10        Numeric arrays. 
 11   
 12      - *IntVector*: an object containing integers which supports __getitem__() and 
 13         __len__(). Examples include lists, tuples, Numeric Arrays, and BitVects. 
 14   
 15   
 16   **NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment. 
 17     It is perfectly acceptable for them to be read-only, so long as they are 
 18     random-access. 
 19   
 20  """ 
 21  import numpy 
 22  from rdkit.ML.InfoTheory import entropy 
 23   
def FormCounts(bitVects, actVals, whichBit, nPossibleActs, nPossibleBitVals=2):
    """ generates the counts matrix for a particular bit

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence* of integer activity values, one per entry
        in _bitVects_

      - whichBit: an integer, the bit number to use.

      - nPossibleActs: the (integer) number of possible activity values.
        Activity values are used as column indices into the result.

      - nPossibleBitVals: (optional) the number of distinct values a "bit"
        may take.  Bit values are used as row indices into the result, so
        the default of 2 covers ordinary 0/1 bits.

    **Returns**

      a numpy integer array of shape (nPossibleBitVals, nPossibleActs);
      element [v, a] is the number of points whose bit _whichBit_ equals v
      and whose activity equals a.

    **Raises**

      ValueError if _bitVects_ and _actVals_ have different lengths.

    **Notes**

      This is really intended for internal use.

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    # numpy.integer is an abstract scalar type; passing it as a dtype is
    # deprecated.  The builtin int selects numpy's default concrete integer
    # dtype instead.
    res = numpy.zeros((nPossibleBitVals, nPossibleActs), int)
    # Index-based access on purpose: the inputs are only required to support
    # __getitem__() and __len__().
    for i in range(len(bitVects)):
        res[bitVects[i][whichBit], actVals[i]] += 1
    return res
54
def CalcInfoGains(bitVects, actVals, nPossibleActs, nPossibleBitVals=2):
    """ Calculates the information gain for a set of points and activity values

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence* of activity values, one per entry in _bitVects_

      - nPossibleActs: the (integer) number of possible activity values.

      - nPossibleBitVals: (optional) the number of distinct values a "bit"
        may take; it is passed through to _FormCounts()_.

    **Returns**

      a numpy array of floats with one entry per bit (the length of the
      first element of _bitVects_).  An empty array is returned when
      _bitVects_ is empty.

    **Raises**

      ValueError if _bitVects_ and _actVals_ have different lengths.

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    # With no points there is no first vector to read the bit count from.
    if len(bitVects) == 0:
        return numpy.zeros(0, float)
    nBits = len(bitVects[0])
    # The original used the bare name Float (a leftover from Numeric), which
    # is undefined here and raised NameError on every call.
    res = numpy.zeros(nBits, float)

    for bit in range(nBits):
        counts = FormCounts(bitVects, actVals, bit, nPossibleActs,
                            nPossibleBitVals=nPossibleBitVals)
        res[bit] = entropy.InfoGain(counts)
    return res
83
def RankBits(bitVects, actVals, nPossibleBitVals=2,
             metricFunc=CalcInfoGains):
    """ Orders bits from highest to lowest value of a metric function

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence* of activity values

      - nPossibleBitVals: (optional) forwarded unchanged to _metricFunc_ as
        the keyword argument of the same name.

      - metricFunc: (optional) the metric function to be used.  It is called
        as metricFunc(bitVects, actVals, nPossibleActs, nPossibleBitVals=...);
        see _CalcInfoGains()_ for a description of the signature.

    **Returns**

      A 2-tuple containing:

        - the bit indices sorted by decreasing metric (a list of ints)

        - the metric values exactly as returned by _metricFunc_

    """
    # The number of activity classes is taken to be one more than the
    # largest activity value present.
    activityClasses = max(actVals) + 1
    scores = metricFunc(bitVects, actVals, activityClasses,
                        nPossibleBitVals=nPossibleBitVals)
    # argsort is ascending; walk it backwards to put the best bits first
    descending = numpy.argsort(scores)[::-1]
    return list(descending), scores
115 116
def AnalyzeSparseVects(bitVects, actVals):
    """ Calculates an information gain for every bit that is set at least once

    **Arguments**

      - bitVects: a *sequence* containing SBVs (objects providing
        GetSize() and GetOnBits())

      - actVals: a *sequence* of activity values, one per entry in
        _bitVects_; each value is only tested for truth

    **Returns**

      A 2-tuple containing:

        - a list of (bit, gain, nAct, nInact) tuples, one for each bit that
          is on in at least one vector, in increasing bit order.  nAct and
          nInact count the active and inactive points that have the bit set.

        - a list of the gains (floats) in the same order

      Two empty lists are returned when _bitVects_ is empty.

    **Raises**

      ValueError if _bitVects_ and _actVals_ have different lengths.

    **Notes**

      - these need to be bit vects and binary activities

    """
    nPts = len(bitVects)
    if nPts != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    # With no points there is no first vector to read the size from.
    if nPts == 0:
        return [], []
    nBits = bitVects[0].GetSize()

    # numpy.integer is an abstract scalar type; passing it as a dtype is
    # deprecated, so the builtin int is used for the count arrays.
    actives = numpy.zeros(nBits, int)
    inactives = numpy.zeros(nBits, int)
    for i in range(nPts):
        # each point's on-bits are tallied under its activity class
        tally = actives if actVals[i] else inactives
        for bit in bitVects[i].GetOnBits():
            tally[bit] += 1

    # a single 2x2 table is reused (overwritten) for every bit
    resTbl = numpy.zeros((2, 2), int)
    res = []
    gains = []
    for bit in range(nBits):
        nAct, nInact = actives[bit], inactives[bit]
        if nAct or nInact:
            # NOTE(review): the second row is filled with nPts - count, so
            # the table entries sum to 2*nPts.  Per-class totals (number of
            # actives / inactives) may have been intended; left unchanged to
            # preserve existing results -- confirm.
            resTbl[0, 0] = nAct
            resTbl[1, 0] = nPts - nAct
            resTbl[0, 1] = nInact
            resTbl[1, 1] = nPts - nInact
            gain = entropy.InfoGain(resTbl)
            gains.append(gain)
            res.append((bit, gain, nAct, nInact))
    return res, gains
168
def SparseRankBits(bitVects, actVals, metricFunc=AnalyzeSparseVects):
    """ Orders the results of a sparse metric function from best to worst

    **Arguments**

      - bitVects: a *sequence* containing SBVs

      - actVals: a *sequence* of activity values

      - metricFunc: (optional) the metric function to be used.  It is called
        as metricFunc(bitVects, actVals) and must return a 2-tuple of
        (info, metrics); see _AnalyzeSparseVects()_, the default.

    **Returns**

      A 2-tuple containing:

        - the positions in the metric list sorted by decreasing metric
          value (a list of ints)

        - the info object returned by _metricFunc_ (its first result)

    **Notes**

      - these need to be bit vects and binary activities

    """
    details, scores = metricFunc(bitVects, actVals)
    # argsort is ascending; walk it backwards to put the best entries first
    descending = numpy.argsort(scores)[::-1]
    return list(descending), details
198