5 """ Informational Entropy functions
6
7 The definitions used are the same as those in Tom Mitchell's
8 book "Machine Learning"
9
10 """
import numpy
import math

from rdkit.six.moves import xrange

try:
  import rdkit.ML.InfoTheory.rdInfoTheory as cEntropy
except ImportError:
  hascEntropy = 0
else:
  hascEntropy = 1


_log2 = math.log(2)

28 """ Calculates the informational entropy of a set of results.
29
30 **Arguments**
31
32 results is a 1D Numeric array containing the number of times a
33 given set hits each possible result.
34 For example, if a function has 3 possible results, and the
35 variable in question hits them 5, 6 and 1 times each,
36 results would be [5,6,1]
37
38 **Returns**
39
40 the informational entropy
41
42 """
  nInstances = float(sum(results))
  if nInstances == 0:
    # an empty set of results carries no information
    return 0
  probs = results / nInstances

  # log(0.0) is undefined, but zero-probability terms contribute zero to the
  # sum anyway, so substitute 1 for the zeros before taking the log
  # (log(1) == 0 keeps those terms harmless).
  t = numpy.choose(numpy.greater(probs, 0.0), (1, probs))
  return sum(-probs * numpy.log(t) / _log2)
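
# Example (illustrative, values computed from the definition above): for the
# counts [5, 6, 1] used in the docstring, the probabilities are
# [5/12, 6/12, 1/12] and the entropy comes out to roughly 1.325 bits; a
# perfectly even three-way split would give log2(3), about 1.585 bits.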


def PyInfoGain(varMat):
  """ Calculates the information gain for a variable.

    **Arguments**

      varMat is a Numeric array with the number of occurrences of each result
        for each possible value of the given variable.

      So, for a variable which adopts 4 possible values and a result which
        has 3 possible values, varMat would be 4x3

    **Returns**

      The expected information gain
  """
  variableRes = numpy.sum(varMat, 1)  # total counts for each value of the variable
  overallRes = numpy.sum(varMat, 0)  # total counts for each result, summed over values

  term2 = 0
  for i in xrange(len(variableRes)):
    term2 = term2 + variableRes[i] * InfoEntropy(varMat[i])
  tSum = sum(overallRes)
  if tSum != 0.0:
    term2 = 1. / tSum * term2
    gain = InfoEntropy(overallRes) - term2
  else:
    gain = 0
  return gain
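
# Example (illustrative, values computed from the definition above): for a
# binary variable and a binary outcome with varMat = [[3, 1], [1, 3]], the
# overall entropy is 1.0 bit, each row has an entropy of about 0.811 bits,
# and the expected gain works out to roughly 0.189 bits.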


if hascEntropy:
  InfoEntropy = cEntropy.InfoEntropy
  InfoGain = cEntropy.InfoGain
else:
  InfoEntropy = PyInfoEntropy
  InfoGain = PyInfoGain
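
# A minimal usage sketch (not part of the original module). It assumes that
# plain integer numpy count arrays are acceptable input, which is how the
# docstrings above describe the arguments; when the compiled rdInfoTheory
# extension is importable it is exercised instead, via the InfoEntropy and
# InfoGain aliases set up above.
if __name__ == '__main__':
  # entropy of a result distribution that was hit 5, 6 and 1 times
  print('entropy: %s' % InfoEntropy(numpy.array([5, 6, 1])))
  # expected information gain for a binary variable crossed with a binary outcome
  print('gain: %s' % InfoGain(numpy.array([[3, 1], [1, 3]])))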