Package rdkit :: Package ML :: Package Cluster :: Module Murtagh
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.Murtagh

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2008  greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ Interface to the C++ Murtagh hierarchic clustering code 
 12   
 13  """ 
 14  from rdkit.ML.Cluster import Clusters 
 15  from rdkit.ML.Cluster.Clustering import MurtaghCluster,MurtaghDistCluster 
 16  import numpy 
 17   
 18   
 19  # constants to select the clustering algorithm 
 20  WARDS=1 
 21  SLINK=2 
 22  CLINK=3 
 23  UPGMA=4 
 24  MCQUITTY=5 
 25  GOWER=6 
 26  CENTROID=7 
 27   
 28  # descriptions of the methods: 
 29  methods = [ 
 30    ("Ward's Minimum Variance",WARDS,"Ward's Minimum Variance"), 
 31    ('Average Linkage',UPGMA,'Group Average Linkage (UPGMA)'), 
 32    ('Single Linkage',SLINK,'Single Linkage (SLINK)'), 
 33    ('Complete Linkage',CLINK,'Complete Linkage (CLINK)'), 
 34  #  ("McQuitty",MCQUITTY,"McQuitty's method"), 
 35  #  ("Gower",GOWER,"Gower's median method"), 
 36    ("Centroid",CENTROID,"Centroid method"), 
 37    ] 
 38   
39 -def _LookupDist(dists,i,j,n):
40 """ *Internal Use Only* 41 42 returns the distance between points i and j in the symmetric 43 distance matrix _dists_ 44 45 """ 46 if i==j: return 0.0 47 if i > j: i,j=j,i 48 return dists[j*(j-1)/2+i]
49 50 51
52 -def _ToClusters(data,nPts,ia,ib,crit,isDistData=0):
53 """ *Internal Use Only* 54 55 Converts the results of the Murtagh clustering code into 56 a cluster tree, which is returned in a single-entry list 57 58 """ 59 cs = [None]*nPts 60 for i in range(nPts): 61 cs[i] = Clusters.Cluster(metric=0.0,data=i,index=(i+1)) 62 63 nClus = len(ia)-1 64 for i in range(nClus): 65 idx1 = ia[i]-1 66 idx2 = ib[i]-1 67 c1 = cs[idx1] 68 c2 = cs[idx2] 69 newClust = Clusters.Cluster(metric=crit[i],children=[c1,c2], 70 index=nPts+i+1) 71 cs[idx1] = newClust 72 73 return [newClust]
74 75
76 -def ClusterData(data,nPts,method,isDistData=0):
77 """ clusters the data points passed in and returns the cluster tree 78 79 **Arguments** 80 81 - data: a list of lists (or array, or whatever) with the input 82 data (see discussion of _isDistData_ argument for the exception) 83 84 - nPts: the number of points to be used 85 86 - method: determines which clustering algorithm should be used. 87 The defined constants for these are: 88 'WARDS, SLINK, CLINK, UPGMA' 89 90 - isDistData: set this toggle when the data passed in is a 91 distance matrix. The distance matrix should be stored 92 symmetrically so that _LookupDist (above) can retrieve 93 the results: 94 for i<j: d_ij = dists[j*(j-1)/2 + i] 95 96 97 **Returns** 98 99 - a single entry list with the cluster tree 100 """ 101 data = numpy.array(data) 102 if not isDistData: 103 sz = data.shape[1] 104 ia,ib,crit = MurtaghCluster(data,nPts,sz,method) 105 else: 106 ia,ib,crit = MurtaghDistCluster(data,nPts,method) 107 c = _ToClusters(data,nPts,ia,ib,crit,isDistData=isDistData) 108 109 return c
110