1
2
3
4 """ descriptor calculator for compounds defined by a composition alone
5 (only the composition is required)
6
7 """
8 from __future__ import print_function
9 from rdkit import RDConfig
10 from rdkit.utils import chemutils
11 import os
12 from rdkit.Dbase.DbConnection import DbConnect
13 from rdkit.ML.Descriptors import Parser,Descriptors
14 from rdkit.six.moves import xrange
15
16
# Electron-count pseudo-descriptors as (name, description) pairs.  Code in this
# module appends them to the column names/descriptions read from the database.
countOptions = [
    ('NVAL', 'total number of valence electrons'),
    ('NVAL_NO_FULL_F', 'number of valence electrons neglecting filled f shells'),
    ('NVAL_NO_FULL_D', 'number of valence electrons neglecting filled d shells'),
    ('NVAL_NO_FULL', 'number of valence electrons neglecting filled f and d shells'),
]
21
23 """ gets possible descriptor names from a database
24
25 **Arguments**
26
27 - db: the name of the database to use
28
29 - tbl1: the name of the table to be used for reading descriptor values
30
31 - tbl2: the name of the table to be used for reading notes about the
32 descriptors (*descriptions of the descriptors if you like*)
33
34 - user: the user name for DB access
35
36 - password: the password for DB access
37
38 **Returns**
39
40 a 2-tuple containing:
41
42 1) a list of column names
43
44 2) a list of column descriptors
45
46 **Notes**
47
48 - this uses _Dbase.DbConnection.DbConnect_ for querying the database
49
50 - it is assumed that tbl2 includes 'property' and 'notes' columns
51
52 """
# Open a connection to the database with the supplied credentials.
53 conn = DbConnect(db,user=user,password=password)
54
# Column names of tbl1 form the first element of the returned pair.
55 colNames = conn.GetColumnNames(table=tbl1)
# Build (UPPER-CASED property, notes) pairs from tbl2.
# NOTE(review): on Python 3 map() returns an iterator, so the
# colDesc.append() call below would raise AttributeError -- confirm the
# target interpreter or wrap the map() in list().
56 colDesc = map(lambda x:(x[0].upper(),x[1]),
57 conn.GetColumns('property,notes',table=tbl2))
# Add the electron-count options to both result sequences.
58 for name,desc in countOptions:
59 colNames.append(name)
60 colDesc.append((name,desc))
61 return colNames,colDesc
62
64 """ used for calculating descriptors
65
66 This is the central point for descriptor calculation
67
68 **Notes**
69
70 - There are two kinds of descriptors this cares about:
71
72 1) *Simple Descriptors* can be calculated solely using atomic descriptor
73 values and the composition of the compound. The full list of possible
74 simple descriptors is determined by the types of *Calculator Methods*
75 (see below) and the contents of an atomic database.
76
77 Simple Descriptors can be marked as *nonZeroDescriptors*. These are used
78 to winnow out atom types where particular atomic descriptors are zero
79 (usually indicating that the value is unknown)
80
81 Simple Descriptors are maintained locally in the _simpleList_
82
83 2) *Compound Descriptors* may rely upon more complicated computation schemes
84 and descriptors for the compound as a whole (e.g. structural variables, etc.).
85 The full list of compound descriptors is limitless. They are calculated using
86 the _ML.Descriptors.Parser_ module.
87
88 Compound Descriptors are maintained locally in the _compoundList_
89
90 - This class has some special methods which are labelled as *Calculator Method*.
91 These are used internally to take atomic descriptors and reduce them to a single
92 simple descriptor value for a composition. They are primarily intended for internal use.
93
94 - a *composition vector* is a list of 2-tuples: '[(atom1name,atom1Num),...]'
95 where atom1Num is the contribution of the atom to the stoichiometry of the
96 compound. No assumption is made about the stoichiometries (i.e. they don't
97 have to be either integral or all sum to one).
98
99 """
100
101
102
103
104
def SUM(self, desc, compos):
    """ *Calculator Method*

    Sums the descriptor values across the composition, weighting each
    atom's value by its amount in the composition vector.

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector, a sequence of (atomName, amount) pairs

    **Returns**

      a float (0.0 for an empty composition)

    **Raises**

      KeyError if an atom or the descriptor is missing from self.atomDict

    """
    total = 0.0
    for atom, num in compos:
        total += self.atomDict[atom][desc] * num
    return total
def MEAN(self, desc, compos):
    """ *Calculator Method*

    Averages the descriptor values across the composition, weighting each
    atom's value by its amount in the composition vector.

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector, a sequence of (atomName, amount) pairs

    **Returns**

      a float

    **Raises**

      - KeyError if an atom or the descriptor is missing from self.atomDict

      - ZeroDivisionError if the amounts sum to zero (e.g. empty composition)

    """
    total = 0.0
    weight = 0.0
    for atom, num in compos:
        total += self.atomDict[atom][desc] * num
        weight += num
    return total / weight
def DEV(self, desc, compos):
    """ *Calculator Method*

    Average absolute deviation of the descriptor values from their mean
    (as returned by self.MEAN), weighted by each atom's amount.

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector, a sequence of (atomName, amount) pairs

    **Returns**

      a float

    **Raises**

      - KeyError if an atom or the descriptor is missing from self.atomDict

      - ZeroDivisionError if the amounts sum to zero (e.g. empty composition)

    """
    mean = self.MEAN(desc, compos)
    total = 0.0
    weight = 0.0
    for atom, num in compos:
        total += abs(self.atomDict[atom][desc] - mean) * num
        weight += num
    return total / weight
def MIN(self, desc, compos):
    """ *Calculator Method*

    Minimum of the descriptor values across the composition.  The amounts
    in the composition vector are ignored.

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector; the first item of each entry is
        the atom name

    **Returns**

      the smallest value of self.atomDict[atom][desc] over the composition

    **Raises**

      - KeyError if an atom or the descriptor is missing from self.atomDict

      - ValueError if the composition is empty

    """
    return min(self.atomDict[entry[0]][desc] for entry in compos)
def MAX(self, desc, compos):
    """ *Calculator Method*

    Maximum of the descriptor values across the composition.  The amounts
    in the composition vector are ignored.

    **Arguments**

      - desc: the name of the descriptor

      - compos: the composition vector; the first item of each entry is
        the atom name

    **Returns**

      the largest value of self.atomDict[atom][desc] over the composition

    **Raises**

      - KeyError if an atom or the descriptor is missing from self.atomDict

      - ValueError if the composition is empty

    """
    return max(self.atomDict[entry[0]][desc] for entry in compos)
204
205
206
207
208
210 """ Handles the list of simple descriptors
211
212 This constructs the list of _nonZeroDescriptors_ and _requiredDescriptors_.
213
214 There's some other magic going on that I can't decipher at the moment.
215
216 """
217 global countOptions
218
219 self.nonZeroDescriptors = []
# Iterate over a shallow copy because entries may be removed from
# self.simpleList while looping.
220 lCopy = self.simpleList[:]
# Names of the electron-count options (first item of each countOptions pair).
# NOTE(review): on Python 3 map() returns a one-shot iterator; the repeated
# `in tList` tests and the later `for entry in tList` loop would then see a
# partially or fully consumed iterator -- confirm the target interpreter.
221 tList = map(lambda x:x[0],countOptions)
222 for i in xrange(len(lCopy)):
223 entry = lCopy[i]
224 if 'NONZERO' in entry[1]:
225 if entry[0] not in tList:
# Collected as SQL-style conditions; BuildAtomDict joins them with ' and '.
226 self.nonZeroDescriptors.append('%s != 0'%entry[0])
227 if len(entry[1]) == 1:
228 self.simpleList.remove(entry)
229 else:
230 self.simpleList[self.simpleList.index(entry)][1].remove('NONZERO')
# NOTE(review): on Python 3 this is a map object, which has no remove()
# (used just below) or append() -- confirm, or wrap in list().
231 self.requiredDescriptors = map(lambda x:x[0],self.simpleList)
# The electron-count options are dropped from the required descriptor names.
232 for entry in tList:
233 if entry in self.requiredDescriptors:
234 self.requiredDescriptors.remove(entry)
235
237 """ Adds entries from the _compoundList_ to the list of _requiredDescriptors_
238
239 Each compound descriptor is surveyed. Any atomic descriptors it requires
240 are added to the list of _requiredDescriptors_ to be pulled from the database.
241
242 """
243
244 for entry in self.compoundList:
245 for atomicDesc in entry[1]:
246 if atomicDesc != '' and atomicDesc not in self.requiredDescriptors:
247 self.requiredDescriptors.append(atomicDesc)
248
250 """ builds the local atomic dict
251
252 We don't want to keep around all descriptor values for all atoms, so this
253 method takes care of only pulling out the descriptors in which we are
254 interested.
255
256 **Notes**
257
258 - this uses _chemutils.GetAtomicData_ to actually pull the data
259
260 """
261 self.ProcessSimpleList()
262 self.ProcessCompoundList()
263
264 self.atomDict = {}
265 whereString = ' and '.join(self.nonZeroDescriptors)
266 if whereString != '':
267 whereString = 'where ' + whereString
268 chemutils.GetAtomicData(self.atomDict,self.requiredDescriptors,self.dbName,self.dbTable,
269 whereString,self.dbUser,self.dbPassword,
270 includeElCounts=1)
271
273 """ calculates all simple descriptors for a given composition
274
275 **Arguments**
276
277 - compos: a string representation of the composition
278
279 - composList: a *composVect*
280
281 The client must provide either _compos_ or _composList_. If both are
282 provided, _composList_ takes priority.
283
284 **Returns**
285 the list of descriptor values
286
287 **Notes**
288
289 - when _compos_ is provided, this uses _chemutils.SplitComposition_
290 to split the composition into its individual pieces
291
292 - if problems are encountered because of either an unknown descriptor or
293 atom type, a _KeyError_ will be raised.
294
295 """
296 if composList is None:
297 composList = chemutils.SplitComposition(compos)
298 try:
299 res = []
300 for i in xrange(len(self.simpleList)):
301 descName,targets = self.simpleList[i]
302 for target in targets:
303 try:
304 method = getattr(self,target)
305 except AttributeError:
306 print('Method %s does not exist'%(target))
307 else:
308 res.append(method(descName,composList))
309 except KeyError as msg:
310 print('composition %s caused problems'%composList)
311 raise KeyError(msg)
312 return res
313
316 """ calculates all compound descriptors for a given composition
317
318 **Arguments**
319
320 - compos: a string representation of the composition
321
322 - composList: a *composVect*
323
324 - propDict: a dictionary containing the properties of the composition
325 as a whole (e.g. structural variables, etc.)
326
327 The client must provide either _compos_ or _composList_. If both are
328 provided, _composList_ takes priority.
329
330 **Returns**
331 the list of descriptor values
332
333 **Notes**
334
335 - when _compos_ is provided, this uses _chemutils.SplitComposition_
336 to split the composition into its individual pieces
337
338 """
339 if composList is None:
340 composList = chemutils.SplitComposition(compos)
341 res = []
342 for i in xrange(len(self.compoundList)):
343 val = Parser.CalcSingleCompoundDescriptor(composList,self.compoundList[i][1:],
344 self.atomDict,propDict)
345 res.append(val)
346 return res
347
349 """ calculates all descriptors for a given composition
350
351 **Arguments**
352
353 - compos: a string representation of the composition
354
355 - propDict: a dictionary containing the properties of the composition
356 as a whole (e.g. structural variables, etc.). These are used to
357 generate Compound Descriptors
358
359 **Returns**
360 the list of all descriptor values
361
362 **Notes**
363
364 - this uses _chemutils.SplitComposition_
365 to split the composition into its individual pieces
366
367 """
# The composition string is read from the first element of composVect.
368 composList = chemutils.SplitComposition(composVect[0])
369 try:
370 r1 = self.CalcSimpleDescriptorsForComposition(composList=composList)
371 except KeyError as msg:
# A KeyError from the simple-descriptor pass is swallowed here; the caller
# receives an empty tuple instead of an exception.
372 res = []
373 else:
# Compound descriptors are only computed when the simple pass succeeded;
# their values follow the simple ones in the result.
374 r2 = self.CalcCompoundDescriptorsForComposition(composList=composList,
375 propDict=propDict)
376 res = r1+r2
377
378 return tuple(res)
# Alias under the shorter name CalcDescriptors.
379 CalcDescriptors = CalcDescriptorsForComposition
380
382 """ returns a list of the names of the descriptors this calculator generates
383
384 """
# Return the cached names if they were already computed.
# NOTE(review): this branch returns the cached list object itself, while the
# branch below returns a tuple -- callers see different types on first and
# later calls, and can mutate the cache through the returned list.
385 if self.descriptorNames is not None:
386 return self.descriptorNames
387 else:
388 res = []
389 for i in xrange(len(self.simpleList)):
390 descName,targets = self.simpleList[i]
391 for target in targets:
392 try:
# Only operations that exist as attributes of this object produce a name.
393 method = getattr(self,target)
394 except AttributeError:
395 print('Method %s does not exist'%(target))
396 else:
# Simple descriptor names have the form '<OPERATION>_<DESCRIPTOR>'.
397 res.append('%s_%s'%(target,descName))
# Compound descriptors contribute their own name (first item of each entry).
398 for entry in self.compoundList:
399 res.append(entry[0])
# Cache a copy of the list for later calls.
400 self.descriptorNames = res[:]
401 return tuple(res)
402
def __init__(self, simpleList, compoundList=None,
             dbName=None,
             dbTable='atomic_data', dbUser='sysdba', dbPassword='masterkey'):
    """ Constructor

    **Arguments**

      - simpleList: list of simple descriptors to be calculated
          (see below for format)

      - compoundList: list of compound descriptors to be calculated
          (see below for format); None is treated as an empty list

      - dbName: name of the atomic database to be used; when None,
          _RDConfig.RDDataDatabase_ is used

      - dbTable: name of the table in _dbName_ which has atomic data

      - dbUser: user name for DB access

      - dbPassword: password for DB access

    **Note**

      - format of simpleList:
         a list of 2-tuples containing:

           1) name of the atomic descriptor

           2) a list of operations on that descriptor (e.g. NonZero, Max, etc.)

         Both the descriptor name and the operation names are stored
         upper-cased, so operations are matched case-insensitively against
         the upper-case *Calculator Method* names.

      - format of compoundList:
         a list of 4-tuples containing:

           1) name of the descriptor to be calculated

           2) list of selected atomic descriptor names (define $1, $2, etc.)

           3) list of selected compound descriptor names (define $a, $b, etc.)

           4) text formula defining the calculation (see _Parser_)

    """
    if dbName is None:
        dbName = RDConfig.RDDataDatabase

    Descriptors.DescriptorCalculator.__init__(self)

    # Normalize names to upper case; each entry gets its own operation list.
    self.simpleList = [(x[0].upper(), [y.upper() for y in x[1]])
                       for x in simpleList]
    # Descriptor-name cache; stays None until the names are first computed.
    self.descriptorNames = None
    self.compoundList = compoundList
    if self.compoundList is None:
        self.compoundList = []
    self.dbName = dbName
    self.dbTable = dbTable
    self.dbUser = dbUser
    self.dbPassword = dbPassword
463
464
if __name__ == '__main__':
    # Ad-hoc demo: builds a calculator, loads the atomic data and prints the
    # simple descriptors for a few compositions.
    d = [('DED', ['NonZero', 'Mean', 'Dev']),
         ('M_B_electroneg', ['NonZero']),
         ('Cov_rad', ['Max', 'Min'])]
    o = DescriptorCalculator(d)
    o.BuildAtomDict()
    print('len:', len(o.atomDict))
    # Materialize the keys first: dict views cannot be sliced on Python 3.
    for key in list(o.atomDict)[-4:-1]:
        print(key, o.atomDict[key])

    print('descriptors:', o.GetDescriptorNames())
    composList = ['Nb', 'Nb3', 'NbPt', 'Nb2Pt']
    for compos in composList:
        descs = o.CalcSimpleDescriptorsForComposition(compos)
        print(compos, descs)
480