1
2
3
4
5
6
7
8
9
10
11 """ Interface to the C++ Murtagh hierarchic clustering code
12
13 """
14 from rdkit.ML.Cluster import Clusters
15 from rdkit.ML.Cluster.Clustering import MurtaghCluster,MurtaghDistCluster
16 import numpy
17
18
19
20 WARDS=1
21 SLINK=2
22 CLINK=3
23 UPGMA=4
24 MCQUITTY=5
25 GOWER=6
26 CENTROID=7
27
28
29 methods = [
30 ("Ward's Minimum Variance",WARDS,"Ward's Minimum Variance"),
31 ('Average Linkage',UPGMA,'Group Average Linkage (UPGMA)'),
32 ('Single Linkage',SLINK,'Single Linkage (SLINK)'),
33 ('Complete Linkage',CLINK,'Complete Linkage (CLINK)'),
34
35
36 ("Centroid",CENTROID,"Centroid method"),
37 ]
38
40 """ *Internal Use Only*
41
42 returns the distance between points i and j in the symmetric
43 distance matrix _dists_
44
45 """
46 if i==j: return 0.0
47 if i > j: i,j=j,i
48 return dists[j*(j-1)/2+i]
49
50
51
53 """ *Internal Use Only*
54
55 Converts the results of the Murtagh clustering code into
56 a cluster tree, which is returned in a single-entry list
57
58 """
59 cs = [None]*nPts
60 for i in range(nPts):
61 cs[i] = Clusters.Cluster(metric=0.0,data=i,index=(i+1))
62
63 nClus = len(ia)-1
64 for i in range(nClus):
65 idx1 = ia[i]-1
66 idx2 = ib[i]-1
67 c1 = cs[idx1]
68 c2 = cs[idx2]
69 newClust = Clusters.Cluster(metric=crit[i],children=[c1,c2],
70 index=nPts+i+1)
71 cs[idx1] = newClust
72
73 return [newClust]
74
75
77 """ clusters the data points passed in and returns the cluster tree
78
79 **Arguments**
80
81 - data: a list of lists (or array, or whatever) with the input
82 data (see discussion of _isDistData_ argument for the exception)
83
84 - nPts: the number of points to be used
85
86 - method: determines which clustering algorithm should be used.
87 The defined constants for these are:
88 'WARDS, SLINK, CLINK, UPGMA'
89
90 - isDistData: set this toggle when the data passed in is a
91 distance matrix. The distance matrix should be stored
92 symmetrically so that _LookupDist (above) can retrieve
93 the results:
94 for i<j: d_ij = dists[j*(j-1)/2 + i]
95
96
97 **Returns**
98
99 - a single entry list with the cluster tree
100 """
101 data = numpy.array(data)
102 if not isDistData:
103 sz = data.shape[1]
104 ia,ib,crit = MurtaghCluster(data,nPts,sz,method)
105 else:
106 ia,ib,crit = MurtaghDistCluster(data,nPts,method)
107 c = _ToClusters(data,nPts,ia,ib,crit,isDistData=isDistData)
108
109 return c
110