An Introduction to the RDKit for Cheminformatics
Import the necessary Python libraries
from rdkit import Chem # Core RDKit chemistry module
from rdkit.Chem.Draw import IPythonConsole # RDKit drawing integration for IPython notebooks
from rdkit.Chem import Draw # RDKit drawing functions
# A few settings to improve the quality of structures
from rdkit.Chem import rdDepictor
IPythonConsole.ipython_useSVG = True # render molecules as SVG in the notebook
rdDepictor.SetPreferCoordGen(True) # prefer CoordGen when generating depiction coordinates
from rdkit.Chem import PandasTools # Adds support for molecules in a Pandas dataframe
import mols2grid # The mols2grid library provides a convenient way of displaying molecules in a grid
Create a molecule (benzene) from a SMILES string
# Parse the aromatic SMILES for benzene into an RDKit molecule object.
mol = Chem.MolFromSmiles("c1ccccc1")
# Evaluating the bare name as the last expression displays the molecule
# when this is run as a notebook cell.
mol
Get the SMILES for Gleevec from ChEMBL
# Parse the Gleevec SMILES string into an RDKit molecule object.
glvc = Chem.MolFromSmiles("CN1CCN(Cc2ccc(cc2)C(=O)Nc3ccc(C)c(Nc4nccc(n4)c5cccnc5)c3)CC1")
# Evaluating the bare name displays the structure when run as a notebook cell.
glvc
Read a set of molecules from an SD file
# SDMolSupplier yields None for any record RDKit cannot parse. Drop those
# entries so the drawing and grid calls further down only ever receive
# valid molecule objects; for a clean file the resulting list is unchanged.
mols = [m for m in Chem.SDMolSupplier("example_compounds.sdf") if m is not None]
# Evaluating the bare name shows the list of molecule objects in a notebook.
mols
[<rdkit.Chem.rdchem.Mol at 0x15a291940>, <rdkit.Chem.rdchem.Mol at 0x15a291a00>, <rdkit.Chem.rdchem.Mol at 0x15a291a60>, <rdkit.Chem.rdchem.Mol at 0x15a291ac0>, <rdkit.Chem.rdchem.Mol at 0x15a291b20>, <rdkit.Chem.rdchem.Mol at 0x15a291b80>, <rdkit.Chem.rdchem.Mol at 0x15a291be0>, <rdkit.Chem.rdchem.Mol at 0x15a291c40>, <rdkit.Chem.rdchem.Mol at 0x15a291ca0>, <rdkit.Chem.rdchem.Mol at 0x15a291d00>, <rdkit.Chem.rdchem.Mol at 0x10d8251c0>, <rdkit.Chem.rdchem.Mol at 0x15a291d60>, <rdkit.Chem.rdchem.Mol at 0x15a291dc0>, <rdkit.Chem.rdchem.Mol at 0x15a291e20>, <rdkit.Chem.rdchem.Mol at 0x15a291e80>, <rdkit.Chem.rdchem.Mol at 0x15a291ee0>, <rdkit.Chem.rdchem.Mol at 0x15a291f40>, <rdkit.Chem.rdchem.Mol at 0x15a291fa0>, <rdkit.Chem.rdchem.Mol at 0x15a758040>, <rdkit.Chem.rdchem.Mol at 0x15a7580a0>, <rdkit.Chem.rdchem.Mol at 0x15a758100>, <rdkit.Chem.rdchem.Mol at 0x15a758160>, <rdkit.Chem.rdchem.Mol at 0x15a7581c0>, <rdkit.Chem.rdchem.Mol at 0x15a758220>, <rdkit.Chem.rdchem.Mol at 0x15a758280>, <rdkit.Chem.rdchem.Mol at 0x15a7582e0>, <rdkit.Chem.rdchem.Mol at 0x15a758340>, <rdkit.Chem.rdchem.Mol at 0x15a7583a0>, <rdkit.Chem.rdchem.Mol at 0x15a758400>, <rdkit.Chem.rdchem.Mol at 0x15a758460>, <rdkit.Chem.rdchem.Mol at 0x15a7584c0>, <rdkit.Chem.rdchem.Mol at 0x15a758520>, <rdkit.Chem.rdchem.Mol at 0x15a758580>, <rdkit.Chem.rdchem.Mol at 0x15a7585e0>, <rdkit.Chem.rdchem.Mol at 0x15a758640>, <rdkit.Chem.rdchem.Mol at 0x15a7586a0>, <rdkit.Chem.rdchem.Mol at 0x15a758700>, <rdkit.Chem.rdchem.Mol at 0x15a758760>, <rdkit.Chem.rdchem.Mol at 0x15a7587c0>, <rdkit.Chem.rdchem.Mol at 0x15a758820>, <rdkit.Chem.rdchem.Mol at 0x15a758880>, <rdkit.Chem.rdchem.Mol at 0x15a7588e0>, <rdkit.Chem.rdchem.Mol at 0x15a758940>, <rdkit.Chem.rdchem.Mol at 0x15a7589a0>, <rdkit.Chem.rdchem.Mol at 0x15a758a00>, <rdkit.Chem.rdchem.Mol at 0x15a758a60>, <rdkit.Chem.rdchem.Mol at 0x15a758ac0>, <rdkit.Chem.rdchem.Mol at 0x15a758b20>, <rdkit.Chem.rdchem.Mol at 0x15a758b80>, <rdkit.Chem.rdchem.Mol at 0x15a758be0>]
Draw these molecules as a grid
# Lay the molecules out as a static grid image, four structures per row,
# rendered as SVG.
Draw.MolsToGridImage(
    mols,
    molsPerRow=4,
    useSVG=True,
)
We can use the mols2grid library to display molecules in a grid
# Show the molecules in a mols2grid grid.
mols2grid.display(mols)
# Look up the current selection from the grid; the recorded output below is
# an empty dict, i.e. nothing had been selected.
# NOTE(review): newer mols2grid releases appear to expose this as
# mols2grid.get_selection() — confirm against the installed version.
mols2grid.selection
{}
We can also read an SD file into a Pandas dataframe.
# Load the same SD file into a Pandas dataframe; the recorded output below
# shows ID and ROMol columns.
df = PandasTools.LoadSDF("example_compounds.sdf")
# Preview the first rows of the dataframe.
df.head()
ID | ROMol | |
---|---|---|
0 | 168691 CHEMBL318804 | |
1 | 86358 CHEMBL162 | |
2 | 575087 CHEMBL576683 | |
3 | 575065 CHEMBL571484 | |
4 | 575047 CHEMBL568937 |
Let's add columns with molecular weight and LogP to the dataframe.
from rdkit.Chem.Descriptors import MolWt
from rdkit.Chem.Crippen import MolLogP

# Compute one descriptor value per molecule in the ROMol column and store
# each result list as a new dataframe column.
df["MW"] = list(map(MolWt, df["ROMol"]))
df["LogP"] = list(map(MolLogP, df["ROMol"]))

# Preview the dataframe with the two new columns.
df.head()
ID | ROMol | MW | LogP | |
---|---|---|---|---|
0 | 168691 CHEMBL318804 | 565.099 | 5.49788 | |
1 | 86358 CHEMBL162 | 466.541 | 4.35400 | |
2 | 575087 CHEMBL576683 | 527.915 | 4.96202 | |
3 | 575065 CHEMBL571484 | 491.935 | 4.36922 | |
4 | 575047 CHEMBL568937 | 487.991 | 5.12922 |
We can use a boxplot to examine the distribution of molecular weight within the dataframe.
import seaborn as sns

# Draw a horizontal boxplot of the molecular-weight column and keep the
# returned axes object for any further customisation.
ax = sns.boxplot(x=df["MW"])