Package rdkit :: Package ML :: Module files
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.files

  1  # copyright 2000 greg landrum 
  2   
  3  """ Generic file manipulation stuff 
  4   
  5  """ 
  6  from __future__ import print_function 
  7  import numpy 
  8  import string,re 
  9   
class ReFile:
    """Convenience class for dealing with files with comments.

    Blank (all whitespace) lines, and lines beginning with comment
    characters are skipped.

    Anything following a comment character on a line is stripped off.

    The instance can be used as a context manager so that the underlying
    file is closed deterministically::

        with ReFile('data.txt') as inF:
            lines = inF.readlines()
    """

    def __init__(self, fileName, mode='r', comment=r'#', trailer=r'\n'):
        """Open *fileName* and build the comment-splitting pattern.

        **ARGUMENTS:**

          - fileName: path of the file to open

          - mode: mode string handed to ``open``

          - comment: regular expression marking the start of a comment

          - trailer: regular expression for line trailers to be discarded;
            pass None or '' to disable
        """
        if trailer is not None and trailer != '':
            # a single alternation lets one split() drop both the comment
            # text and the line terminator
            comment = comment + r'|' + trailer
        self.regExp = re.compile(comment)
        self.inFile = open(fileName, mode)

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, traceback):
        self.close()
        # never swallow exceptions raised inside the with-block
        return False

    def _clean(self, line):
        """Return *line* with comment/trailer removed and whitespace stripped."""
        # str.strip() replaces the Python 2-only string.strip() function
        return self.regExp.split(line)[0].strip()

    def close(self):
        """Close the underlying file."""
        self.inFile.close()

    def readline(self):
        """Read the next non-blank, non-comment line and return it.

        Returns '' on EOF.
        """
        while True:
            inLine = self.inFile.readline()
            if inLine == '':
                # a truly empty string (no newline) only occurs at EOF
                return ''
            result = self._clean(inLine)
            if result != '':
                return result

    def readlines(self):
        """Return a list of all the lines left in the file.

        Blank and comment-only lines are dropped; returns [] if there
        are none.
        """
        cleaned = (self._clean(line) for line in self.inFile.readlines())
        return [line for line in cleaned if line != '']

    def rewind(self):
        """Rewind the file (seek to the beginning)."""
        self.inFile.seek(0)
58 59
def ReadDataFile(fileName, comment=r'#', depVarCol=0, dataType=float):
    """Read in the data file and return a tuple of two numpy arrays:
    (independent variables, dependent variables).

    **ARGUMENTS:**

      - fileName: the fileName

      - comment: the comment character for the file (handed to ReFile,
        which compiles it as a regular expression)

      - depVarCol: the column number containing the dependent variable

      - dataType: the numpy data type of the returned arrays.  Floating
        point types have their fields converted with ``float``; every
        other type uses ``int``.

    **RETURNS:**

      a tuple of two numpy arrays:

        (independent variables, dependent variables).

      The first has shape (nPts, nIndVars), the second has shape (nPts,).
      A file with no data lines yields two empty arrays.
    """
    # `comment` used to be silently ignored; forward it so that files with
    # a non-default comment marker are parsed as documented.
    inFile = ReFile(fileName, comment=comment)
    try:
        dataLines = inFile.readlines()
    finally:
        # release the handle ReFile opened, even if reading fails
        inFile.inFile.close()

    nPts = len(dataLines)
    if nPts == 0:
        # nothing to parse: the column count is unknown, so report zero
        return numpy.zeros((0, 0), dataType), numpy.zeros(0, dataType)

    # The default was numpy.float, an alias for builtin float that newer
    # numpy releases no longer provide.  issubdtype covers every floating
    # type, not just a fixed list of them.
    if numpy.issubdtype(numpy.dtype(dataType), numpy.floating):
        _convfunc = float
    else:
        _convfunc = int

    # every column except the dependent one is an independent variable
    nIndVars = len(dataLines[0].split()) - 1
    indVarMat = numpy.zeros((nPts, nIndVars), dataType)
    depVarVect = numpy.zeros(nPts, dataType)
    for i, line in enumerate(dataLines):
        splitLine = line.split()
        depVarVect[i] = _convfunc(splitLine[depVarCol])
        del splitLine[depVarCol]
        # build a real list: map() is lazy on Python 3 and cannot be
        # assigned into an array row
        indVarMat[i, :] = [_convfunc(field) for field in splitLine]

    return indVarMat, depVarVect
if __name__ == '__main__':
    # Command-line demo: parse the data file named by the first argument
    # and dump the independent and dependent variable arrays.
    import sys

    indVars, depVars = ReadDataFile(sys.argv[1])
    print('iV:', indVars)
    print('dV:', depVars)