Package rdkit :: Package Chem :: Package Fingerprints :: Module ClusterMols
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Fingerprints.ClusterMols

  1  # $Id$ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ utility functionality for clustering molecules using fingerprints 
 12   includes a command line app for clustering 
 13   
 14   
 15  Sample Usage: 
 16    python ClusterMols.py  -d data.gdb -t daylight_sig \ 
 17      --idName="CAS_TF" -o clust1.pkl \ 
 18      --actTable="dop_test" --actName="moa_quant" 
 19   
 20  """ 
 21  from rdkit.Dbase.DbConnection import DbConnect 
 22  from rdkit.Dbase import DbInfo,DbUtils 
 23  from rdkit.ML.Data import DataUtils 
 24  from rdkit.ML.Cluster import Clusters 
 25  from rdkit.ML.Cluster import Murtagh 
 26  import sys 
 27  from rdkit.six.moves import cPickle 
 28  from rdkit.Chem.Fingerprints import FingerprintMols,MolSimilarity 
 29  from rdkit import DataStructs 
 30  import numpy 
# Version bookkeeping.  "$Id$" looks like a revision-control keyword
# placeholder (presumably expanded by CVS/SVN keyword substitution -- confirm).
_cvsVersion="$Id$"
# Take the text between the first ':' and the last '$'.  For the unexpanded
# "$Id$" literal there is no ':', so find() returns -1, idx1 is 0, idx2 is 3
# and the resulting version string is "$Id".
idx1 = _cvsVersion.find(':')+1
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING="%s"%(_cvsVersion[idx1:idx2])

# Re-use the reporting helpers from FingerprintMols under local names.
message=FingerprintMols.message
error=FingerprintMols.error
 38   
def GetDistanceMatrix(data,metric,isSimilarity=1):
  """ data should be a list of tuples with fingerprints in position 1
   (the rest of the elements of the tuple are not important)

    Arguments:

      - data: sequence of tuples; only element 1 (the fingerprint) is read

      - metric: callable taking two fingerprints and returning a number

      - isSimilarity: if true (the default) the value returned by the
        metric is treated as a similarity and 1-value is stored instead

    Returns the symmetric distance matrix
    (see ML.Cluster.Resemblance for layout documentation)

    The result is a 1D float array with nPts*(nPts-1)/2 entries, filled
    in the order (col=1,row=0), (col=2,row=0), (col=2,row=1), ...
    Fewer than two points gives an empty array.

  """
  nPts = len(data)
  # floor division: the array size has to be an int on Python 3 as well
  res = numpy.zeros(nPts*(nPts-1)//2,float)
  nSoFar = 0
  for col in range(1,nPts):
    for row in range(col):
      fp1 = data[col][1]
      fp2 = data[row][1]
      nBits1 = fp1.GetNumBits()
      nBits2 = fp2.GetNumBits()
      # fingerprints of different sizes: fold the longer one by the integer
      # ratio of the two sizes before comparing them
      if nBits1>nBits2:
        fp1 = DataStructs.FoldFingerprint(fp1,nBits1//nBits2)
      elif nBits2>nBits1:
        fp2 = DataStructs.FoldFingerprint(fp2,nBits2//nBits1)
      sim = metric(fp1,fp2)
      if isSimilarity:
        sim = 1.-sim
      res[nSoFar] = sim
      nSoFar += 1
  return res
64
def ClusterPoints(data,metric,algorithmId,haveLabels=False,haveActs=True,returnDistances=False):
  """ Clusters a set of points and returns the resulting cluster tree

    Arguments:

      - data: sequence of tuples; element 0 is used as the point label,
        element 2 (when present) as the activity value.  The sequence is
        handed to GetDistanceMatrix() together with the metric.

      - metric: passed through to GetDistanceMatrix()

      - algorithmId: passed through to Murtagh.ClusterData()

      - haveLabels: if false, labels are built as 'Mol: <id>';
        otherwise element 0 of each tuple is used unchanged

      - haveActs: if true and the first tuple has more than two elements,
        element 2 of every tuple is converted to int and attached to the
        tree and its points

      - returnDistances: if true, a (tree, distance matrix) 2-tuple is
        returned instead of just the tree

  """
  message('Generating distance matrix.\n')
  distances = GetDistanceMatrix(data,metric)
  message('Clustering\n')
  clusterResult = Murtagh.ClusterData(distances,len(data),algorithmId,
                                      isDistData=1)
  tree = clusterResult[0]

  # activities are only picked up when asked for and when the tuples
  # actually carry a third element
  useActs = haveActs and len(data[0])>2
  activities = [int(entry[2]) for entry in data] if useActs else []

  if haveLabels:
    names = [entry[0] for entry in data]
  else:
    names = ['Mol: %s'%str(entry[0]) for entry in data]

  tree._ptLabels = names
  if activities:
    tree._ptValues = activities

  for point in tree.GetPoints():
    # GetIndex() is offset by one relative to the positions in data
    # (presumably the tree numbers its points from 1 -- confirm)
    pos = point.GetIndex()-1
    point.SetName(names[pos])
    if not activities:
      continue
    try:
      point.SetData(int(activities[pos]))
    except Exception:
      # best effort: a point without usable activity data is left as is
      pass

  if returnDistances:
    return tree,distances
  return tree
95
def ClusterFromDetails(details):
  """ Returns the cluster tree

    Arguments:

      - details: object providing the attributes read here:
        maxMols (if > 0 only the first maxMols fingerprints are used),
        outFileName (if set, the tree is pickled to this file),
        metric and clusterAlgo (both passed to ClusterPoints()).
        It is also handed to MolSimilarity.GetFingerprints().

    Returns None if the output file cannot be opened or if there are no
    fingerprints to cluster.

  """
  data = MolSimilarity.GetFingerprints(details)
  if details.maxMols > 0:
    data = data[:details.maxMols]

  # open the output file before the clustering work so that a bad file name
  # is reported immediately
  outF = None
  if details.outFileName:
    try:
      outF = open(details.outFileName,'wb+')
    except IOError:
      error("Error: could not open output file %s for writing\n"%(details.outFileName))
      return None

  try:
    if not data:
      return None

    clustTree = ClusterPoints(data,details.metric,details.clusterAlgo,
                              haveLabels=0,haveActs=1)
    if outF:
      cPickle.dump(clustTree,outF)
    return clustTree
  finally:
    # always release the file handle, including on the early return above
    # and when clustering or pickling raises
    if outF is not None:
      outF.close()
# Usage text for the command-line app.  The __main__ block below installs it
# as FingerprintMols._usageDoc before the arguments are parsed (presumably so
# that FingerprintMols.ParseArgs() can show it -- confirm).
_usageDoc="""
Usage: ClusterMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --idName=val: sets the name of the id column in the input
      database. Default is *ID*.

    - -o _outFileName_: name of the output file (output will
      be a pickle (.pkl) file with the cluster tree)

    - --actTable=val: name of table containing activity values
      (used to color points in the cluster tree).

    - --actName=val: name of column with activities in the activity
      table. The values in this column should either be integers or
      convertible into integers.

    - --SLINK: use the single-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --CLINK: use the complete-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --UPGMA: use the group-average clustering algorithm
      (default is Ward's minimum variance)

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the input db table.
      Default is *AutoFragmentFP*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *2*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Command-line entry point: announce the version, install the usage text,
# parse the arguments and run the clustering.
if __name__ == '__main__':
  message("This is ClusterMols version %s\n\n"%(__VERSION_STRING))
  FingerprintMols._usageDoc=_usageDoc
  details = FingerprintMols.ParseArgs()
  # the returned tree is not used here; ClusterFromDetails pickles it to
  # details.outFileName when that attribute is set
  ClusterFromDetails(details)