1
2
3
4
5
""" Defines the naive Bayesian classification model

Based on the development in Chapter 6 of "Machine Learning" by Tom Mitchell

"""
10 import numpy
11 from rdkit.ML.Data import Quantize
12 from rdkit.six import iteritems
def _getBinId(val, qBounds):
    """Return the index of the quantization bin that `val` falls into.

    `qBounds` is a sequence of bin boundaries (as produced by
    Quantize.FindVarMultQuantBounds); a value strictly greater than k
    boundaries lands in bin k, so the result is in [0, len(qBounds)].

    Arguments:
      val: the raw descriptor value
      qBounds: sequence of boundary values (assumed ascending — TODO confirm
        against Quantize output ordering)

    Returns: integer bin index
    """
    bid = 0
    for bnd in qBounds:
        if val > bnd:
            bid += 1
    return bid
19
20
21
22
23
    """
    _NaiveBayesClassifier_s can save the following pieces of internal state, accessible via
    standard setter/getter functions:

      1) _Examples_: a list of examples which have been predicted

      2) _TrainingExamples_: list of training examples - the descriptor values of these examples
         are quantized based on info gain using ML/Data/Quantize.py if necessary

      3) _TestExamples_: the list of examples used to test the model

      4) _BadExamples_: list of examples that were incorrectly classified

      5) _QBoundVals_: quant bound values for each variable - a list of lists

      6) _QBounds_: number of bounds for each variable

    """
43
def __init__(self, attrs, nPossibleVals, nQuantBounds, mEstimateVal=-1.0, useSigs=False):
    """Constructor.

    Arguments:
      attrs: sequence of descriptor indices used for classification
      nPossibleVals: number of possible values per variable; the LAST entry
        is the number of result classes
      nQuantBounds: number of quantization bounds requested per variable
        (0 means the variable is used as-is / pre-quantized)
      mEstimateVal: if > 0, used as the m-estimate smoothing weight when
        conditional probabilities are computed in trainModel()
      useSigs: if True, examples are bit vectors (signatures) and each
        attribute is a bit index with exactly two states
    """
    self._attrs = attrs
    self._mEstimateVal = mEstimateVal
    self._useSigs = useSigs

    self._classProbs = {}

    self._examples = []
    self._trainingExamples = []
    self._testExamples = []
    self._badExamples = []
    self._QBoundVals = {}
    self._nClasses = nPossibleVals[-1]
    self._qBounds = nQuantBounds
    self._nPosVals = nPossibleVals
    self._needsQuant = 1

    self._name = ""
    # probability associated with the most recent prediction
    self.mprob = -1.0

    # conditional probability table: one entry per class.  For the
    # descriptor case it is a list indexed by attribute id; for the
    # signature case it is a dict keyed by bit index with two states.
    self._condProbs = [None] * self._nClasses
    for i in range(self._nClasses):
        # NOTE: the original guarded this with hasattr(self, '_useSigs'),
        # which is always true here since the attribute was just assigned.
        if not self._useSigs:
            nA = max(self._attrs) + 1
            self._condProbs[i] = [None] * nA
            for j in range(nA):
                nV = self._nPosVals[j]
                if self._qBounds[j]:
                    # quantization can produce qBounds+1 bins; size for the max
                    nV = max(nV, self._qBounds[j] + 1)
                self._condProbs[i][j] = [0.0] * nV
        else:
            self._condProbs[i] = {}
            for idx in self._attrs:
                self._condProbs[i][idx] = [0.0] * 2
87
90
93
def NameModel(self, varNames):
    """Assign the model's default name.

    `varNames` is accepted for interface compatibility with other
    classifiers and is ignored.
    """
    # NOTE(review): 'NaiveBayesCalssifier' is misspelled in the original;
    # kept byte-for-byte since persisted models/callers may depend on it.
    self.SetName('NaiveBayesCalssifier')
96
99
def SetExamples(self, examples):
    """Set the list of examples that have been predicted."""
    self._examples = examples
102
def GetTrainingExamples(self):
    """Return the list of training examples."""
    return self._trainingExamples
105
def SetTrainingExamples(self, examples):
    """Set the list of training examples."""
    self._trainingExamples = examples
108
def GetTestExamples(self):
    """Return the list of examples used to test the model."""
    return self._testExamples
111
def SetTestExamples(self, examples):
    """Set the list of examples used to test the model."""
    self._testExamples = examples
114
def SetBadExamples(self, examples):
    """Set the list of examples that were incorrectly classified."""
    self._badExamples = examples
117
def GetBadExamples(self):
    """Return the list of examples that were incorrectly classified."""
    return self._badExamples
120
def _computeQuantBounds(self):
    """Compute quantization boundaries for every attribute that requested them.

    Copies the training descriptor values into a (nExamples x nAttrs)
    float array, then for each attribute with _qBounds[ai] > 0 searches
    bound counts 1..nbnds and keeps the bound set with the highest info
    gain (via Quantize.FindVarMultQuantBounds).  Results are stored in
    self._QBoundVals keyed by attribute id.
    """
    neg = len(self._trainingExamples)
    natr = len(self._attrs)

    # gather descriptor values: rows are examples, columns follow _attrs order
    allVals = numpy.zeros((neg, natr), 'd')
    res = []  # outcome (class) for each example, taken from the last entry
    for i, eg in enumerate(self._trainingExamples):
        res.append(eg[-1])
        for j, ai in enumerate(self._attrs):
            allVals[i, j] = eg[ai]

    # pick the info-gain-maximizing bound set for each quantized attribute
    for i, ai in enumerate(self._attrs):
        nbnds = self._qBounds[ai]
        if nbnds > 0:
            mbnds = []
            mgain = -1.0
            for j in range(1, nbnds + 1):
                bnds, igain = Quantize.FindVarMultQuantBounds(allVals[:, i], j, res,
                                                              self._nClasses)
                if igain > mgain:
                    mbnds = bnds
                    mgain = igain
            self._QBoundVals[ai] = mbnds
154
def trainModel(self):
    """Train the model; assumes the training examples have already been set.

    Estimates the conditional probability of each (binned) descriptor value
    given an outcome (class), and the prior probability of each class.
    When self._mEstimateVal > 0, counts are smoothed with the m-estimate
    (Mitchell, "Machine Learning", Ch. 6); otherwise plain relative
    frequencies are used.
    """
    n = len(self._trainingExamples)
    for i in range(self._nClasses):
        self._classProbs[i] = 0.0

    # quantize descriptors first if any attribute asked for bounds
    if not self._useSigs and max(self._qBounds) > 0:
        self._computeQuantBounds()

    ncls = {}  # per-class example counts, for normalizing the tallies below

    incr = 1.0 / n
    for eg in self._trainingExamples:
        cls = eg[-1]
        self._classProbs[cls] += incr
        ncls[cls] = ncls.get(cls, 0) + 1
        tmp = self._condProbs[cls]
        if not self._useSigs:
            for ai in self._attrs:
                bid = eg[ai]
                if self._qBounds[ai] > 0:
                    bid = _getBinId(bid, self._QBoundVals[ai])
                tmp[ai][bid] += 1.0
        else:
            # signature case: eg[1] is a bit vector — TODO confirm layout
            for ai in self._attrs:
                if eg[1].GetBit(ai):
                    tmp[ai][1] += 1.0
                else:
                    tmp[ai][0] += 1.0

    # convert the raw counts into (smoothed) conditional probabilities
    for cls in range(self._nClasses):
        if cls not in ncls:
            continue
        tmp = self._condProbs[cls]
        for ai in self._attrs:
            if not self._useSigs:
                nbnds = self._nPosVals[ai]
                if self._qBounds[ai] > 0:
                    # NOTE(review): quantization can yield len(bounds)+1 bins,
                    # which may exceed self._qBounds[ai]; if so the last bin's
                    # count is never normalized here — confirm against
                    # _computeQuantBounds before changing.
                    nbnds = self._qBounds[ai]
            else:
                nbnds = 2
            for bid in range(nbnds):
                if self._mEstimateVal <= 0.0:
                    # plain relative frequency within the class
                    tmp[ai][bid] /= ncls[cls]
                else:
                    # m-estimate: blend the count with a uniform prior pdesc
                    # weighted by mEstimateVal "virtual" examples
                    if self._qBounds[ai] > 0:
                        pdesc = 1.0 / (1 + len(self._QBoundVals[ai]))
                    elif self._nPosVals[ai] > 0:
                        pdesc = 1.0 / (self._nPosVals[ai])
                    else:
                        raise ValueError('Neither Bounds set nor data pre-quantized for attribute ' +
                                         str(ai))
                    tmp[ai][bid] += (self._mEstimateVal) * pdesc
                    tmp[ai][bid] /= (ncls[cls] + self._mEstimateVal)
236
def ClassifyExamples(self, examples, appendExamples=0):
    """Classify a sequence of examples.

    Arguments:
      examples: iterable of examples to classify
      appendExamples: forwarded to ClassifyExample; when true, each example
        is also appended to self._examples

    Returns: list of predicted classes as ints, one per example
    """
    return [int(self.ClassifyExample(eg, appendExamples)) for eg in examples]
243
def GetClassificationDetails(self):
    """Return the probability associated with the most recent prediction."""
    return self.mprob
247
def ClassifyExample(self, example, appendExamples=0):
    """Classify an example by multiplying the conditional probabilities.

    For each class, the class prior is multiplied by the conditional
    probability of each (binned) attribute value; the most likely class is
    the one with the largest product.  The winning probability is stored in
    self.mprob (retrievable via GetClassificationDetails).

    Arguments:
      example: the example to classify (descriptor list, or [id, bitvector,...]
        when signatures are used)
      appendExamples: if true, the example is appended to self._examples

    Returns: the predicted class key
    """
    if appendExamples:
        self._examples.append(example)
    clsProb = {}
    for key, prob in iteritems(self._classProbs):
        clsProb[key] = prob
        tmp = self._condProbs[key]
        for ai in self._attrs:
            # hasattr guard presumably protects models pickled before
            # _useSigs existed — TODO confirm before simplifying
            if not (hasattr(self, '_useSigs') and self._useSigs):
                bid = example[ai]
                if self._qBounds[ai] > 0:
                    bid = _getBinId(bid, self._QBoundVals[ai])
            else:
                if example[1].GetBit(ai):
                    bid = 1
                else:
                    bid = 0
            clsProb[key] *= tmp[ai][bid]

    # pick the class with the largest posterior product
    mkey = -1
    self.mprob = -1.0
    for key, prob in iteritems(clsProb):
        if prob > self.mprob:
            mkey = key
            self.mprob = prob

    return mkey
278