1
2
3
4 """ Functionality for ranking bits using info gains
5
6 **Definitions used in this module**
7
8 - *sequence*: an object capable of containing other objects which supports
9 __getitem__() and __len__(). Examples of these include lists, tuples, and
10 Numeric arrays.
11
12 - *IntVector*: an object containing integers which supports __getitem__() and
13 __len__(). Examples include lists, tuples, Numeric Arrays, and BitVects.
14
15
16 **NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment.
17 It is perfectly acceptable for them to be read-only, so long as they are
18 random-access.
19
20 """
21 import numpy
22 from rdkit.ML.InfoTheory import entropy
23
54
def CalcInfoGains(bitVects,actVals,nPossibleActs,nPossibleBitVals=2):
    """ Calculates the information gain for a set of points and activity values

      **Arguments**

        - bitVects: a *sequence* containing *IntVectors*

        - actVals: a *sequence*

        - nPossibleActs: the (integer) number of possible activity values.

        - nPossibleBitVals: (optional) if specified, this integer provides the maximum
          value attainable by the (increasingly inaccurately named) bits in _bitVects_.

      **Returns**

        a numpy array of floats holding one information gain per bit

      **Raises**

        ValueError if _bitVects_ and _actVals_ differ in length

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    # the number of bits is taken from the first vector only
    nBits = len(bitVects[0])
    # numpy.float64 replaces the legacy Numeric name `Float`, which is not
    # defined in this module and raised a NameError
    res = numpy.zeros(nBits, numpy.float64)

    for bit in range(nBits):
        counts = FormCounts(bitVects, actVals, bit, nPossibleActs,
                            nPossibleBitVals=nPossibleBitVals)
        res[bit] = entropy.InfoGain(counts)
    return res
83
86 """ Rank a set of bits according to a metric function
87
88 **Arguments**
89
90 - bitVects: a *sequence* containing *IntVectors*
91
92 - actVals: a *sequence*
93
94 - nPossibleBitVals: (optional) if specified, this integer provides the maximum
95 value attainable by the (increasingly inaccurately named) bits in _bitVects_.
96
97 - metricFunc: (optional) the metric function to be used. See _CalcInfoGains()_
98 for a description of the signature of this function.
99
100 **Returns**
101
102 A 2-tuple containing:
103
104 - the relative order of the bits (a list of ints)
105
106 - the metric calculated for each bit (a list of floats)
107
108 """
109 nPossibleActs = max(actVals)+1
110 metrics = metricFunc(bitVects,actVals,nPossibleActs,
111 nPossibleBitVals=nPossibleBitVals)
112 bitOrder = list(numpy.argsort(metrics))
113 bitOrder.reverse()
114 return bitOrder,metrics
115
116
""" Tabulates per-bit active/inactive counts and information gains for sparse bit vectors
119
120 **Arguments**
121
122 - bitVects: a *sequence* containing SBVs
123
124 - actVals: a *sequence*
125
126 **Returns**
127
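A 2-tuple containing:

- a list of (bit, gain, nActive, nInactive) tuples, one for each bit that
is set in at least one of the vectors

- the list of gains (floats) for those same bits, in the same order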
129
130 **Notes**
131
132 - these need to be bit vects and binary activities
133
134 """
135 nPts = len(bitVects)
136 if nPts != len(actVals): raise ValueError('var and activity lists should be the same length')
137 nBits = bitVects[0].GetSize()
138
139 actives = numpy.zeros(nBits,numpy.integer)
140 inactives = numpy.zeros(nBits,numpy.integer)
141 nActives,nInactives = 0,0
142 for i in range(nPts):
143 sig,act = bitVects[i],actVals[i]
144 onBitList = sig.GetOnBits()
145 if act:
146 for bit in onBitList:
147 actives[bit] += 1
148 nActives += 1
149 else:
150 for bit in onBitList:
151 inactives[bit] += 1
152 nInactives += 1
153 resTbl = numpy.zeros((2,2),numpy.integer)
154 res = []
155 gains = []
156 counts = []
157 for bit in range(nBits):
158 nAct,nInact = actives[bit],inactives[bit]
159 if nAct or nInact:
160 resTbl[0,0] = nAct
161 resTbl[1,0] = nPts - nAct
162 resTbl[0,1] = nInact
163 resTbl[1,1] = nPts - nInact
164 gain = entropy.InfoGain(resTbl)
165 gains.append(gain)
166 res.append((bit,gain,nAct,nInact))
167 return res,gains
168
170 """ Rank a set of bits according to a metric function
171
172 **Arguments**
173
174 - bitVects: a *sequence* containing SBVs
175
176 - actVals: a *sequence*
177
178 - metricFunc: (optional) the metric function to be used. See _SparseCalcInfoGains()_
179 for a description of the signature of this function.
180
181 **Returns**
182
183 A 2-tuple containing:
184
- the ordering of the entries, best metric first (a list of ints that index
the values returned by _metricFunc_)

- the first value returned by _metricFunc_ (the per-bit info, not the bare
list of metrics)
188
189 **Notes**
190
191 - these need to be bit vects and binary activities
192
193 """
194 info,metrics = metricFunc(bitVects,actVals)
195 bitOrder = list(numpy.argsort(metrics))
196 bitOrder.reverse()
197 return bitOrder,info
198