1 '''
2 Importing pandasTools enables several features that allow for using RDKit molecules as columns of a Pandas dataframe.
If the dataframe contains a column holding a molecule format (e.g. SMILES), as in this example:
4 >>> from rdkit.Chem import PandasTools
5 >>> import pandas as pd
6 >>> import os
7 >>> from rdkit import RDConfig
8 >>> antibiotics = pd.DataFrame(columns=['Name','Smiles'])
9 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C','Name':'Penicilline G'}, ignore_index=True)#Penicilline G
10 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O','Name':'Tetracycline'}, ignore_index=True)#Tetracycline
11 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C','Name':'Ampicilline'}, ignore_index=True)#Ampicilline
12 >>> print([str(x) for x in antibiotics.columns])
13 ['Name', 'Smiles']
14 >>> print(antibiotics)
15 Name Smiles
16 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
17 1 Tetracycline CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...
18 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
19
20 a new column can be created holding the respective RDKit molecule objects. The fingerprint can be included to accelerate substructure searches on the dataframe.
21
22 >>> PandasTools.AddMoleculeColumnToFrame(antibiotics,'Smiles','Molecule',includeFingerprints=True)
23 >>> print([str(x) for x in antibiotics.columns])
24 ['Name', 'Smiles', 'Molecule']
25
26 A substructure filter can be applied on the dataframe using the RDKit molecule column, because the ">=" operator has been modified to work as a substructure check.
Thus, the antibiotics containing the beta-lactam ring "C1C(=O)NC1" can be obtained by
28
29 >>> beta_lactam = Chem.MolFromSmiles('C1C(=O)NC1')
30 >>> beta_lactam_antibiotics = antibiotics[antibiotics['Molecule'] >= beta_lactam]
31 >>> print(beta_lactam_antibiotics[['Name','Smiles']])
32 Name Smiles
33 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
34 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
35
36
It is also possible to load an SDF file into a dataframe.
38
39 >>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
40 >>> frame = PandasTools.LoadSDF(sdfFile,smilesName='SMILES',molColName='Molecule',includeFingerprints=True)
41 >>> frame.info # doctest: +SKIP
42 <bound method DataFrame.info of <class 'pandas.core.frame.DataFrame'>
43 Int64Index: 200 entries, 0 to 199
44 Data columns:
45 AMW 200 non-null values
46 CLOGP 200 non-null values
47 CP 200 non-null values
48 CR 200 non-null values
49 DAYLIGHT.FPG 200 non-null values
50 DAYLIGHT_CLOGP 200 non-null values
51 FP 200 non-null values
52 ID 200 non-null values
53 ISM 200 non-null values
54 LIPINSKI_VIOLATIONS 200 non-null values
55 NUM_HACCEPTORS 200 non-null values
56 NUM_HDONORS 200 non-null values
57 NUM_HETEROATOMS 200 non-null values
58 NUM_LIPINSKIHACCEPTORS 200 non-null values
59 NUM_LIPINSKIHDONORS 200 non-null values
60 NUM_RINGS 200 non-null values
61 NUM_ROTATABLEBONDS 200 non-null values
62 P1 30 non-null values
63 SMILES 200 non-null values
64 Molecule 200 non-null values
65 dtypes: object(20)>
66
67 Conversion to html is quite easy:
68 >>> htm = frame.to_html()
69 >>> str(htm[:36])
70 '<table border="1" class="dataframe">'
71
72 In order to support rendering the molecules as images in the HTML export of the dataframe, the __str__ method is monkey-patched to return a base64 encoded PNG:
73 >>> molX = Chem.MolFromSmiles('Fc1cNc2ccccc12')
74 >>> print(molX) # doctest: +SKIP
75 <img src="data:image/png;base64,..." alt="Mol"/>
76 This can be reverted using the ChangeMoleculeRendering method
77 >>> ChangeMoleculeRendering(renderer='String')
78 >>> print(molX) # doctest: +SKIP
79 <rdkit.Chem.rdchem.Mol object at 0x10d179440>
80 >>> ChangeMoleculeRendering(renderer='PNG')
81 >>> print(molX) # doctest: +SKIP
82 <img src="data:image/png;base64,..." alt="Mol"/>
83
84 '''
85 from __future__ import print_function
86
87 from base64 import b64encode
88 import types,copy
89
90 from rdkit.six import BytesIO, string_types
91 from rdkit import Chem
92 from rdkit.Chem import Draw
93
# sys is needed for the stderr message in the version gate below. It used to be
# imported only inside the generic exception handler, so reaching that message
# raised a NameError instead of printing it.
import sys

# Optional pandas support: on any failure ``pd`` is set to None and the rest of
# the module is expected to check for that.
try:
    import pandas as pd
    try:
        v = pd.__version__.split('.')
    except AttributeError:
        # Fallback location of the version string when ``pd.__version__`` is missing.
        v = pd.version.version.split('.')

    if v[0] == '0' and int(v[1]) < 10:
        print("Pandas version %s not compatible with tests" % v, file=sys.stderr)
        pd = None
    else:
        # Only touch display options that this pandas build actually registers.
        if 'display.width' in pd.core.config._registered_options:
            pd.set_option('display.width', 1000000000)
        if 'display.max_rows' in pd.core.config._registered_options:
            pd.set_option('display.max_rows', 1000000000)
        elif 'display.height' in pd.core.config._registered_options:
            pd.set_option('display.height', 1000000000)
        if 'display.max_colwidth' in pd.core.config._registered_options:
            pd.set_option('display.max_colwidth', 1000000000)

        # Keep a reference to the stock DataFrame.to_html so it can be restored later.
        defPandasRendering = pd.core.frame.DataFrame.to_html
except ImportError:
    import traceback
    traceback.print_exc()
    pd = None
except Exception:
    # Any other failure while probing pandas (e.g. a missing attribute) also
    # disables pandas support; the traceback is printed for diagnosis.
    import traceback
    traceback.print_exc()
    pd = None
127
if pd:
    # The DataFrame formatting module has lived in different packages across
    # pandas releases. Try each known location in turn so that importing this
    # module does not fail with an ImportError on releases that use the
    # ``pandas.io.formats`` layout.
    try:
        from pandas.formats import format as fmt
    except ImportError:
        try:
            from pandas.core import format as fmt
        except ImportError:
            from pandas.io.formats import format as fmt
133
134
# Module-level rendering settings.
# When true, a successful ">=" substructure check stores the matched atom
# indices on the molecule; when false an empty list is stored instead.
highlightSubstructures=True
# NOTE(review): presumably selects the image format used when molecules are
# rendered - the code reading it is not visible here; confirm.
molRepresentation = 'png'
# NOTE(review): presumably the (width, height) of rendered molecule images - confirm.
molSize = (200,200)
138
139
141 '''
142 Patched default escaping of HTML control characters to allow molecule image rendering dataframes
143 '''
144 formatter = fmt.DataFrameFormatter(self,buf=None,columns=None,col_space=None,colSpace=None,header=True,index=True,
145 na_rep='NaN',formatters=None,float_format=None,sparsify=None,index_names=True,
146 justify = None, force_unicode=None,bold_rows=True,classes=None,escape=False)
147 formatter.to_html()
148 html = formatter.buf.getvalue()
149 return html
150
152 '''Ensure inheritance of patched to_html in "head" subframe
153 '''
154 df = self[:n]
155 df.to_html = types.MethodType(patchPandasHTMLrepr,df)
156 df.head = types.MethodType(patchPandasHeadMethod,df)
157 return df
158
160 """displayhook function for PIL Images, rendered as PNG"""
161 import pandas as pd
162 bio = BytesIO()
163 x.save(bio,format='PNG')
164 s = b64encode(bio.getvalue()).decode('ascii')
165 pd.set_option('display.max_columns',len(s)+1000)
166 pd.set_option('display.max_rows',len(s)+1000)
167 if len(s)+100 > pd.get_option("display.max_colwidth"):
168 pd.set_option("display.max_colwidth",len(s)+1000)
169 return s
170
172 """ mol rendered as SVG """
173 from IPython.display import SVG
174 from rdkit.Chem import rdDepictor
175 from rdkit.Chem.Draw import rdMolDraw2D
176 try:
177
178 mol.GetConformer(-1)
179 except ValueError:
180 rdDepictor.Compute2DCoords(mol)
181 drawer = rdMolDraw2D.MolDraw2DSVG(*size)
182 drawer.DrawMolecule(mol,highlightAtoms=highlightAtoms)
183 drawer.FinishDrawing()
184 svg = drawer.GetDrawingText().replace('svg:','')
185 return SVG(svg).data
186
187 from rdkit import DataStructs
188
# Choose the fingerprint function used to pre-screen substructure matches.
# Avalon fingerprints are used when the Avalon toolkit can be imported;
# otherwise RDKit pattern fingerprints are used.
try:
    from rdkit.Avalon import pyAvalonTools as pyAvalonTools

    def _fingerprinter(x, y):
        """Return the Avalon fingerprint of molecule x; y is passed as the isQuery flag."""
        return pyAvalonTools.GetAvalonFP(x, isQuery=y, bitFlags=pyAvalonTools.avalonSSSBits)
except ImportError:

    def _fingerprinter(x, y):
        """Return a 2048-bit pattern fingerprint of molecule x; y is accepted but ignored."""
        return Chem.PatternFingerprint(x, fpSize=2048)
194
196 """Allows for substructure check using the >= operator (X has substructure Y -> X >= Y) by
197 monkey-patching the __ge__ function
198 This has the effect that the pandas/numpy rowfilter can be used for substructure filtering (filtered = dframe[dframe['RDKitColumn'] >= SubstructureMolecule])
199 """
200 if x is None or y is None: return False
201 if hasattr(x,'_substructfp'):
202 if not hasattr(y,'_substructfp'):
203 y._substructfp=_fingerprinter(y,True)
204 if not DataStructs.AllProbeBitsMatch(y._substructfp,x._substructfp):
205 return False
206 match = x.GetSubstructMatch(y)
207 if match:
208 if highlightSubstructures:
209 x.__sssAtoms=list(match)
210 else:
211 x.__sssAtoms=[]
212 return True
213 else:
214 return False
215
216
# Monkey-patch: make "molecule >= query" on RDKit molecules run the substructure check.
Chem.Mol.__ge__ = _molge
218
230
231
234
235
# Monkey-patch: make str(mol) use the image renderer by default so that
# molecules show up as pictures in the HTML export of a dataframe.
Chem.Mol.__str__ = PrintAsBase64PNGString
237
239 '''Precomputes fingerprints and stores results in molecule objects to accelerate substructure matching
240 '''
241
242 if m is not None:
243 m._substructfp=_fingerprinter(m,False)
244 return m
245
247 '''Changes the default dataframe rendering to not escape HTML characters, thus allowing rendered images in all dataframes.
IMPORTANT: THIS IS A GLOBAL CHANGE THAT WILL AFFECT THE COMPLETE PYTHON SESSION. If you want to change the rendering only
249 for a single dataframe use the "ChangeMoleculeRendering" method instead.
250 '''
251 if images:
252 pd.core.frame.DataFrame.to_html = patchPandasHTMLrepr
253 else:
254 pd.core.frame.DataFrame.to_html = defPandasRendering
255
256
258 '''Converts the molecules contains in "smilesCol" to RDKit molecules and appends them to the dataframe "frame" using the specified column name.
259 If desired, a fingerprint can be computed and stored with the molecule objects to accelerate substructure matching
260 '''
261 if not includeFingerprints:
262 frame[molCol]=frame[smilesCol].map(Chem.MolFromSmiles)
263 else:
264 frame[molCol]=frame[smilesCol].map(lambda smiles: _MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
265 RenderImagesInAllDataFrames(images=True)
266
267
268
269
271 '''Allows to change the rendering of the molecules between base64 PNG images and string representations.
272 This serves two purposes: First it allows to avoid the generation of images if this is not desired and, secondly, it allows to enable image rendering for
a newly created dataframe that already contains molecules, without having to rerun the time-consuming AddMoleculeColumnToFrame. Note: this is necessary because some pandas methods, e.g. head(),
return a new dataframe instance that uses the default pandas rendering (thus not drawing images for molecules) instead of the monkey-patched one.
275 '''
276 if renderer == 'String':
277 Chem.Mol.__str__ = PrintDefaultMolRep
278 else:
279 Chem.Mol.__str__ = PrintAsBase64PNGString
280 if frame is not None:
281 frame.to_html = types.MethodType(patchPandasHTMLrepr,frame)
282
def LoadSDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
            isomericSmiles=False, smilesName=None, embedProps=False):
    """Read a file in SDF format and return its contents as a pandas DataFrame.

    Arguments:
      filename: path to an SD file (a name ending in ".gz" is opened with gzip)
        or an already opened file-like object, which is left open for the caller.
      idName: column that receives the molecule title (the "_Name" property).
      molColName: column that receives the RDKit molecule objects.
      includeFingerprints: if true, a substructure fingerprint is precomputed
        and stored on every molecule.
      isomericSmiles: passed on to Chem.MolToSmiles when smilesName is given.
      smilesName: if not None, a column of this name is filled with SMILES.
      embedProps: if true, the SD properties stay on the molecule objects;
        otherwise they are cleared after being copied into the row.

    Returns:
      A DataFrame with one row per readable molecule, indexed by the position
      of the record in the file. Records that cannot be parsed are skipped.
    """
    if isinstance(filename, string_types):
        if filename.lower()[-3:] == ".gz":
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        close = f.close
    else:
        f = filename
        close = None
    records = []
    indices = []
    try:
        for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
            if mol is None:
                continue
            row = dict((k, mol.GetProp(k)) for k in mol.GetPropNames())
            if not embedProps:
                for prop in mol.GetPropNames():
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                row[smilesName] = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
            if not includeFingerprints:
                row[molColName] = mol
            else:
                row[molColName] = _MolPlusFingerprint(mol)
            records.append(row)
            indices.append(i)
    finally:
        # Release a handle opened by this function even when parsing fails.
        if close is not None:
            close()
    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)
317
318 from rdkit.Chem import SDWriter
319
def WriteSDF(df, out, molColName='ROMol', idName=None, properties=None, allNumeric=False):
    '''Write an SD file for the molecules in the dataframe.

    Dataframe columns can be exported as SDF tags if specified in the "properties" list.
    "properties=list(df.columns)" would export all columns.
    The "allNumeric" flag allows to automatically include all numeric columns in the output.
    The user has to make sure that the correct data type is assigned to each column.
    "idName" can be used to select a column to serve as molecule title. It can be set to
    "RowID" to use the dataframe row key as title.

    "out" is either a file name (a name ending in ".gz" is written through gzip) or an
    object that SDWriter accepts directly. The writer, and the gzip stream if one was
    opened here, are closed even if writing a row fails.
    '''
    close = None
    if isinstance(out, string_types):
        if out.lower()[-3:] == ".gz":
            import gzip
            # NOTE(review): the stream is opened in binary mode; confirm that
            # SDWriter can write to it on Python 3.
            out = gzip.open(out, "wb")
            close = out.close

    try:
        writer = SDWriter(out)
        try:
            # Work on a copy so the caller's list is never modified.
            if properties is None:
                properties = []
            else:
                properties = list(properties)
            if allNumeric:
                properties.extend([dt for dt in df.dtypes.keys()
                                   if (np.issubdtype(df.dtypes[dt], float) or
                                       np.issubdtype(df.dtypes[dt], int))])

            # The molecule and title columns are not exported as SD tags.
            if molColName in properties:
                properties.remove(molColName)
            if idName in properties:
                properties.remove(idName)
            writer.SetProps(properties)
            for key, record in df.iterrows():
                # Copy the molecule so the properties set below do not leak
                # into the object stored in the dataframe.
                mol = Chem.Mol(record[molColName])

                if idName is not None:
                    if idName == 'RowID':
                        mol.SetProp('_Name', str(key))
                    else:
                        mol.SetProp('_Name', str(record[idName]))
                for p in properties:
                    cell_value = record[p]
                    if np.issubdtype(type(cell_value), float):
                        # Fixed-point text with trailing zeros stripped, always
                        # keeping one digit after the decimal point.
                        s = '{:f}'.format(cell_value).rstrip("0")
                        if s[-1] == ".":
                            s += "0"
                        mol.SetProp(p, s)
                    else:
                        mol.SetProp(p, str(cell_value))
                writer.write(mol)
        finally:
            writer.close()
    finally:
        if close is not None:
            close()
368
369 _saltRemover = None
379
381 '''
382 Saves smi file. SMILES are generated from column with RDKit molecules. Column with names is optional.
383 '''
384 w = Chem.SmilesWriter(outFile, isomericSmiles=isomericSmiles)
385 if NamesCol != '':
386 for m,n in zip(frame[molCol], map(str,frame[NamesCol])):
387 m.SetProp('_Name',n)
388 w.write(m)
389 w.close()
390 else:
391 for m in frame[molCol]:
392 w.write(m)
393 w.close()
394
395 import numpy as np
396 import os
397 from rdkit.six.moves import cStringIO as StringIO
398
400 """
401 Saves pandas DataFrame as a xlsx file with embedded images.
402 It maps numpy data types to excel cell types:
403 int, float -> number
404 datetime -> datetime
405 object -> string (limited to 32k character - xlsx limitations)
406
407 Cells with compound images are a bit larger than images due to excel.
408 Column width weirdness explained (from xlsxwriter docs):
409 The width corresponds to the column width value that is specified in Excel.
410 It is approximately equal to the length of a string in the default font of Calibri 11.
411 Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
412 This feature is only available at runtime from within Excel.
413 """
414
415 import xlsxwriter
416
417 cols = list(frame.columns)
418 cols.remove(molCol)
419 dataTypes = dict(frame.dtypes)
420
421 workbook = xlsxwriter.Workbook(outFile)
422 worksheet = workbook.add_worksheet()
423 worksheet.set_column('A:A', size[0]/6.)
424
425
426 c2 = 1
427 for x in cols:
428 worksheet.write_string(0, c2, x)
429 c2 += 1
430
431 c = 1
432 for index, row in frame.iterrows():
433 image_data = StringIO()
434 img = Draw.MolToImage(row[molCol], size=size)
435 img.save(image_data, format='PNG')
436
437 worksheet.set_row(c, height=size[1])
438 worksheet.insert_image(c, 0, "f", {'image_data': image_data})
439
440 c2 = 1
441 for x in cols:
442 if str(dataTypes[x]) == "object":
443 worksheet.write_string(c, c2, str(row[x])[:32000])
444 elif ('float' in str(dataTypes[x])) or ('int' in str(dataTypes[x])):
445 if (row[x] != np.nan) or (row[x] != np.inf):
446 worksheet.write_number(c, c2, row[x])
447 elif 'datetime' in str(dataTypes[x]):
448 worksheet.write_datetime(c, c2, row[x])
449 c2 += 1
450 c += 1
451
452 workbook.close()
453 image_data.close()
454
455
457 '''
458 Draw grid image of mols in pandas DataFrame.
459 '''
460 if legendsCol:
461 if legendsCol == frame.index.name:
462 img = Draw.MolsToGridImage(frame[column], legends=list(map(str, list(frame.index))), **kwargs)
463 else:
464 img = Draw.MolsToGridImage(frame[column], legends=list(map(str, list(frame[legendsCol]))), **kwargs)
465 else:
466 img = Draw.MolsToGridImage(frame[column], **kwargs)
467 return img
468
469 from rdkit.Chem.Scaffolds import MurckoScaffold
470
471 -def AddMurckoToFrame(frame, molCol = 'ROMol', MurckoCol = 'Murcko_SMILES', Generic = False):
479
480
481 from rdkit.Chem import AllChem
482
491
493 '''
494 Aligns molecules in molCol to scaffolds in scaffoldCol
495 '''
496 frame[molCol] = frame.apply(lambda x: AlignMol(x[molCol],x[scaffoldCol]), axis=1)
497
498
# Script entry point: run the module doctests when a usable pandas is present.
if __name__ == "__main__":
    import sys
    if pd is not None:
        # Read the version string from whichever attribute this pandas provides.
        try:
            v = pd.__version__.split('.')
        except AttributeError:
            v = pd.version.version.split('.')

        major, minor = v[0], v[1]
        if not (major == '0' and int(minor) < 10):
            import doctest
            flags = doctest.ELLIPSIS + doctest.NORMALIZE_WHITESPACE
            failed, tried = doctest.testmod(optionflags=flags)
            # A non-zero exit status reports the number of failing examples.
            if failed:
                sys.exit(failed)
        else:
            print("pandas installation >=0.10 not found, skipping tests",
                  file=sys.stderr)
    else:
        print("pandas installation not found, skipping tests", file=sys.stderr)
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551