1
2
3
4
5
6
7
8
9
10
11 """utility functions for clustering
12
13 """
14
16 """returns an ordered list of all nodes below cluster
17
18 the ordering is done using the lengths of the child nodes
19
20 **Arguments**
21
22 - cluster: the cluster in question
23
24 **Returns**
25
26 - a list of the leaves below this cluster
27
28 """
29 if len(cluster) == 1:
30 return [cluster]
31 else:
32 children = cluster.GetChildren()
33 children.sort(key=lambda x:len(x), reverse=True)
34 res = []
35 for child in children:
36 res += GetNodeList(child)
37 res += [cluster]
38 return res
39
41 """returns an ordered list of all nodes below cluster
42
43
44 """
45 if hasattr(cluster,'_isCentroid'):
46 cluster._aboveCentroid = 0
47 above = -1
48 else:
49 cluster._aboveCentroid = above
50 if len(cluster) == 1:
51 return [cluster]
52 else:
53 res = []
54 children = cluster.GetChildren()
55 children.sort(lambda x,y:cmp(len(y),len(x)))
56 for child in children:
57 res = res + GetNodesDownToCentroids(child,above)
58 res = res + [cluster]
59 return res
60
62 """ find the point in a cluster which has the smallest summed
63 Euclidean distance to all others
64
65 **Arguments**
66
67 - cluster: the cluster to work with
68
69 - dists: the distance matrix to use for the points
70
71 **Returns**
72
73 - the index of the centroid point
74
75 """
76 children = cluster.GetPoints()
77 pts = [x.GetData() for x in children]
78
79 best = 1e24
80 bestIdx = -1
81 for pt in pts:
82 dAccum = 0.0
83
84 for other in pts:
85 if other != pt:
86 if other > pt: row,col = pt,other
87 else: row,col = other,pt
88 dAccum += dists[col*(col-1)/2+row]
89 if dAccum >= best:
90
91 break
92 if dAccum < best:
93 best = dAccum
94 bestIdx = pt
95 for i in range(len(pts)):
96 pt = pts[i]
97 if pt != bestIdx:
98 if pt > bestIdx: row,col = bestIdx,pt
99 else: row,col = pt,bestIdx
100 children[i]._distToCenter = dists[col*(col-1)/2+row]
101 else:
102 children[i]._distToCenter = 0.0
103 children[i]._clustCenter = bestIdx
104 cluster._clustCenter=bestIdx
105 cluster._distToCenter=0.0
106
107 return bestIdx
108
110 """ *Internal Use Only*
111
112 """
113 if len(cluster)<n:
114 raise ValueError('Cannot split cluster of length %d into %d pieces'%(len(cluster),n))
115 if len(cluster)==n:
116 return cluster.GetPoints()
117 clusters = [cluster]
118 nxtIdx = 0
119 for i in range(n-1):
120 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1:
121 nxtIdx+=1
122 assert nxtIdx<len(clusters)
123
124 children = clusters[nxtIdx].GetChildren()
125 children.sort(key=lambda x: x.GetMetric(), reverse=True)
126 for child in children:
127 clusters.append(child)
128 del clusters[nxtIdx]
129 return clusters
130
132 """ *Internal Use Only*
133
134 """
135 if len(cluster)<n:
136 raise ValueError('Cannot split cluster of length %d into %d pieces'%(len(cluster),n))
137 if len(cluster)==n:
138 return cluster.GetPoints()
139 clusters = [cluster]
140 for i in range(n-1):
141 nxtIdx = 0
142 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1:
143 nxtIdx+=1
144 assert nxtIdx<len(clusters)
145
146 children = clusters[nxtIdx].GetChildren()
147 for child in children:
148 clusters.append(child)
149 del clusters[nxtIdx]
150 clusters.sort(key=lambda x: x.GetMetric(), reverse=True)
151 return clusters
152
153
155 """ splits a cluster tree into a set of branches
156
157 **Arguments**
158
159 - cluster: the root of the cluster tree
160
161 - n: the number of clusters to include in the split
162
163 - breadthFirst: toggles breadth first (vs depth first) cleavage
164 of the cluster tree.
165
166 **Returns**
167
168 - a list of sub clusters
169
170 """
171 if breadthFirst:
172 return _BreadthFirstSplit(cluster,n)
173 else:
174 return _HeightFirstSplit(cluster,n)
175