Package rdkit :: Package ML :: Package Data :: Module MLData
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Data.MLData

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  #    All Rights Reserved 
  4  # 
  5  """ classes to be used to help work with data sets 
  6   
  7  """ 
  8  from __future__ import print_function 
  9  import numpy 
 10  import math 
 11  import copy,types 
 12  from rdkit import six 
 13  from rdkit.six.moves import xrange 
 14   
 15  numericTypes = [int, float] 
 16  if six.PY2: 
 17    numericTypes.append(long) 
 18   
19 -class MLDataSet(object):
20 """ A data set for holding general data (floats, ints, and strings) 21 22 **Note** 23 this is intended to be a read-only data structure 24 (i.e. after calling the constructor you cannot touch it) 25 """
26 - def __init__(self,data,nVars=None,nPts=None,nPossibleVals=None, 27 qBounds=None,varNames=None,ptNames=None,nResults=1):
28 """ Constructor 29 30 **Arguments** 31 32 - data: a list of lists containing the data. The data are copied, so don't worry 33 about us overwriting them. 34 35 - nVars: the number of variables 36 37 - nPts: the number of points 38 39 - nPossibleVals: an list containing the number of possible values 40 for each variable (should contain 0 when not relevant) 41 This is _nVars_ long 42 43 - qBounds: a list of lists containing quantization bounds for variables 44 which are to be quantized (note, this class does not quantize 45 the variables itself, it merely stores quantization bounds. 46 an empty sublist indicates no quantization for a given variable 47 This is _nVars_ long 48 49 - varNames: a list of the names of the variables. 50 This is _nVars_ long 51 52 - ptNames: the names (labels) of the individual data points 53 This is _nPts_ long 54 55 - nResults: the number of results columns in the data lists. This is usually 56 1, but can be higher. 57 """ 58 self.data = [x[:] for x in data] 59 self.nResults = nResults 60 if nVars is None: 61 nVars = len(self.data[0])-self.nResults 62 self.nVars = nVars 63 if nPts is None: 64 nPts = len(data) 65 self.nPts = nPts 66 if qBounds is None: 67 qBounds = [[]]*len(self.data[0]) 68 self.qBounds = qBounds 69 if nPossibleVals is None: 70 nPossibleVals = self._CalcNPossible(self.data) 71 self.nPossibleVals = nPossibleVals 72 if varNames is None: 73 varNames = ['']*self.nVars 74 self.varNames = varNames 75 if ptNames is None: 76 ptNames = ['']*self.nPts 77 self.ptNames = ptNames
78
79 - def _CalcNPossible(self,data):
80 """calculates the number of possible values of each variable (where possible) 81 82 **Arguments** 83 84 -data: a list of examples to be used 85 86 **Returns** 87 88 a list of nPossible values for each variable 89 90 """ 91 nVars = self.GetNVars()+self.nResults 92 nPossible = [-1]*nVars 93 cols = list(xrange(nVars)) 94 for i,bounds in enumerate(self.qBounds): 95 if len(bounds)>0: 96 nPossible[i] = len(bounds) 97 cols.remove(i) 98 99 nPts = self.GetNPts() 100 for i,pt in enumerate(self.data): 101 for col in cols[:]: 102 d = pt[col] 103 if type(d) in numericTypes: 104 if math.floor(d) == d: 105 nPossible[col] = max(math.floor(d),nPossible[col]) 106 else: 107 nPossible[col] = -1 108 cols.remove(col) 109 else: 110 nPossible[col] = -1 111 cols.remove(col) 112 return [int(x)+1 for x in nPossible]
113
114 - def GetNResults(self):
115 return self.nResults
116 - def GetNVars(self):
117 return self.nVars
118 - def GetNPts(self):
119 return self.nPts
120 - def GetNPossibleVals(self):
121 return self.nPossibleVals
122 - def GetQuantBounds(self):
123 return self.qBounds
124
125 - def __getitem__(self,idx):
126 res = [self.ptNames[idx]]+self.data[idx][:] 127 return res
128 - def __setitem__(self,idx,val):
129 if len(val) != self.GetNVars()+self.GetNResults()+1: 130 raise ValueError('bad value in assignment') 131 self.ptNames[idx] = val[0] 132 self.data[idx] = val[1:] 133 return val
134
135 - def GetNamedData(self):
136 """ returns a list of named examples 137 138 **Note** 139 140 a named example is the result of prepending the example 141 name to the data list 142 143 """ 144 res = [None]*self.nPts 145 for i in xrange(self.nPts): 146 res[i] = [self.ptNames[i]]+self.data[i][:] 147 return res
148
149 - def GetAllData(self):
150 """ returns a *copy* of the data 151 152 """ 153 return copy.deepcopy(self.data)
154 - def GetInputData(self):
155 """ returns the input data 156 157 **Note** 158 159 _inputData_ means the examples without their result fields 160 (the last _NResults_ entries) 161 162 """ 163 v = self.GetNResults() 164 return [x[:-v] for x in self.data]
165
166 - def GetResults(self):
167 """ Returns the result fields from each example 168 169 """ 170 if self.GetNResults()>1: 171 v = self.GetNResults() 172 res = [x[-v:] for x in self.data] 173 else: 174 res = [x[-1] for x in self.data] 175 return res
176
177 - def GetVarNames(self):
178 return self.varNames
179 - def GetPtNames(self):
180 return self.ptNames
181
182 - def AddPoint(self,pt):
183 self.data.append(pt[1:]) 184 self.ptNames.append(pt[0]) 185 self.nPts += 1
186
187 - def AddPoints(self,pts,names):
188 if len(pts)!=len(names): 189 raise ValueError("input length mismatch") 190 self.data += pts 191 self.ptNames += names 192 self.nPts = len(self.data)
193
class MLQuantDataSet(MLDataSet):
  """ a data set for holding quantized data


  **Note**

    this is intended to be a read-only data structure
    (i.e. after calling the constructor you cannot touch it)

  **Big differences to MLDataSet**

    1) data are stored in a numpy array since they are homogenous

    2) results are assumed to be quantized (i.e. no qBounds entry is required)

  """

  def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
               qBounds=None, varNames=None, ptNames=None, nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data. The data are copied into
        a numpy array, so don't worry about us overwriting them.

      - nVars: the number of variables. Defaults to the length of the first
        row minus _nResults_.

      - nPts: the number of points. Defaults to len(data).

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant).
        When omitted it is computed by _CalcNPossible(), which produces one
        entry per column of _data_.

      - qBounds: a list of lists containing quantization bounds for variables
        which are to be quantized (note, this class does not quantize
        the variables itself, it merely stores quantization bounds).
        An empty sublist indicates no quantization for a given variable.
        This is _nVars_ long; when omitted, one independent empty list per
        variable is created.

      - varNames: a list of the names of the variables.
        This is _nVars_ long

      - ptNames: the names (labels) of the individual data points
        This is _nPts_ long. The list is copied because adding points and
        item assignment modify it.

      - nResults: the number of results columns in the data lists. This is usually
        1, but can be higher.

    **Raises**

      IndexError if _data_ is empty and _nVars_ was not provided.

    """
    self.data = numpy.array(data)
    self.nResults = nResults
    if nVars is None:
      nVars = len(data[0]) - self.nResults
    self.nVars = nVars
    if nPts is None:
      nPts = len(data)
    self.nPts = nPts
    if qBounds is None:
      # build a fresh list per variable; [[]]*n would alias a single list
      qBounds = [[] for _ in range(self.nVars)]
    self.qBounds = qBounds
    if nPossibleVals is None:
      nPossibleVals = self._CalcNPossible(data)
    self.nPossibleVals = nPossibleVals
    if varNames is None:
      varNames = [''] * self.nVars
    self.varNames = varNames
    if ptNames is None:
      ptNames = [''] * self.nPts
    # copy: we append to / assign into this list and must not touch the caller's
    self.ptNames = list(ptNames)

  def _CalcNPossible(self, data):
    """calculates the number of possible values of each variable

    **Arguments**

      -data: a list of examples to be used

    **Returns**

      a list of nPossible values for each variable (the column maximum plus
      one), as plain Python numbers; an empty list when _data_ is empty

    """
    arr = numpy.asarray(data)
    if arr.size == 0:
      return []
    # tolist() converts numpy scalars into native Python numbers
    return (arr.max(axis=0) + 1).tolist()

  def __getitem__(self, idx):
    """ returns a new list: the name of point _idx_ followed by its data

    The inherited version adds a list to a numpy row, which numpy treats as
    array arithmetic; converting the row with tolist() avoids that.

    """
    return [self.ptNames[idx]] + self.data[idx].tolist()

  def GetNamedData(self):
    """ returns a list of named examples

    **Note**

      a named example is the result of prepending the example
      name to the data list

    """
    return [[self.ptNames[i]] + self.data[i].tolist() for i in range(self.nPts)]

  def GetAllData(self):
    """ returns a *copy* of the data (as nested lists)

    """
    return self.data.tolist()

  def GetInputData(self):
    """ returns the input data

    **Note**

      _inputData_ means the examples without their result fields
      (the last _NResults_ entries)

    """
    # an explicit end index keeps this correct when nResults is 0
    # (a slice ending at -0 would select nothing)
    nInputs = self.data.shape[1] - self.nResults
    return self.data[:, :nInputs].tolist()

  def GetResults(self):
    """ Returns the result fields from each example

    With a single result column this is a list of the last entry of every
    row; with more than one it is a list of row slices holding the last
    _NResults_ entries. Entries come straight from the numpy array.

    """
    if self.GetNResults() > 1:
      v = self.GetNResults()
      res = [x[-v:] for x in self.data]
    else:
      res = [x[-1] for x in self.data]
    return res

  def _AppendRows(self, rows):
    """ stacks _rows_ (a sequence of data rows) underneath the stored array """
    rows = numpy.array(rows)
    if self.data.size == 0:
      self.data = rows
    else:
      self.data = numpy.concatenate((self.data, rows), axis=0)

  def AddPoint(self, pt):
    """ appends a single named example (name first, then the data values)

    Overridden because the stored numpy array has no append() method.

    """
    self._AppendRows([pt[1:]])
    self.ptNames.append(pt[0])
    self.nPts += 1

  def AddPoints(self, pts, names):
    """ appends several examples along with their names

    Overridden because `+=` on the stored numpy array performs elementwise
    addition rather than appending rows.

    **Arguments**

      - pts: a sequence of data rows (without names)

      - names: a sequence of names, one per row

    **Raises**

      ValueError if the two sequences differ in length

    """
    if len(pts) != len(names):
      raise ValueError("input length mismatch")
    if len(pts):
      self._AppendRows(pts)
    self.ptNames.extend(names)
    self.nPts = len(self.data)
if __name__ == '__main__':
  # Demo script: builds one quantized and one general data set, pickles each
  # under test_data/ and prints what the accessors return.
  # Absolute import, because the implicit relative form `import DataUtils`
  # only resolves on Python 2.
  from rdkit.ML.Data import DataUtils

  def _Summarize(dataSet):
    """ prints the values returned by the accessors of a data set """
    print('nVars:', dataSet.GetNVars())
    print('nPts:', dataSet.GetNPts())
    print('nPoss:', dataSet.GetNPossibleVals())
    print('qBounds:', dataSet.GetQuantBounds())
    print('data:', dataSet.GetAllData())
    print('Input data:', dataSet.GetInputData())
    print('results:', dataSet.GetResults())

    print('nameddata:', dataSet.GetNamedData())

  examples = [[0, 0, 0, 0, 0],
              [0, 0, 0, 1, 0],
              [1, 0, 0, 0, 1],
              [2, 1, 0, 0, 1],
              [2, 2, 1, 0, 1]
              ]
  varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
  ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
  # named quantSet rather than `set` so the builtin is not shadowed
  quantSet = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
  DataUtils.WritePickledData('test_data/test.qdat.pkl', quantSet)
  _Summarize(quantSet)

  examples = [
    ['foo', 1, 1.0, 1, 1.1],
    ['foo', 2, 1.0, 1, 2.1],
    ['foo', 3, 1.2, 1.1, 3.1],
    ['foo', 4, 1.0, 1, 4.1],
    ['foo', 5, 1.1, 1, 5.1],
  ]
  qBounds = [[], [], [], [], [2, 4]]
  # this data set is built without variable or point names, as before
  generalSet = MLDataSet(examples, qBounds=qBounds)
  DataUtils.WritePickledData('test_data/test.dat.pkl', generalSet)
  _Summarize(generalSet)