#!/usr/bin/env python # coding: utf-8 # # parm@frosst-y to SMIRNOFF # # This notebook provides examples/utility functionality to assist with conversion of parm@frosst or relatives to SMIRNOFF format. Particularly, Christopher Bayly is generating modified AMBER `frcmod` files where the first entry for each parameter (i.e. `CT-CT-CT`) is replaced by the relevant SMIRKS pattern, for conversion into SMIRNOFF FFXML format. # # This notebook will: # 1. Convert a specified smirks-frcmod file to SMIRNOFF FFXML # 2. Generate (or take in) a set of molecules in OpenEye oemol format # 3. Use the SMIRNOFF `ForceField` class to determine (a) which parameters are used in which molecules; (b) which molecules contain a specified parameter; and (c) which molecules do NOT contain a specified parameter. # # Bayly has also updates the notebook with visualization for 3(b) and 3(c). # # Bannan added printed current atom types to make looking up references easier # # **Authors**: # * David L. Mobley (UCI) # * Contributions from Christopher I. Bayly (OpenEye Scientific Software Inc.) and Caitlin C. Bannan (UCI) # # In[2]: # Imports from __future__ import print_function from convert_frcmod import * import openeye.oechem as oechem import openeye.oeiupac as oeiupac import openeye.oeomega as oeomega import openeye.oedepict as oedepict from IPython.display import display from openff.toolkit.typing.engines.smirnoff.forcefield import * from openff.toolkit.typing.engines.smirnoff.forcefield_utils import get_molecule_parameterIDs from openff.toolkit.utils import * get_ipython().run_line_magic('', 'matplotlib inline') import matplotlib import numpy as np import pylab as pl import matplotlib.pyplot as plt import time import IPython import pickle import glob # # Relevant methods # # Relevant methods # In[3]: def depictAtomByIdx(mol_copy, atomIdxList, supH = True, width=900, height=500): mol = oechem.OEMol(mol_copy) OEGenerate2DCoordinates(mol) atomBondSet = oechem.OEAtomBondSet() for atom in mol.GetAtoms(): if atom.GetIdx() in atomIdxList: atomBondSet.AddAtom( atom) for bond in atom.GetBonds(): nbrAtom = bond.GetNbr(atom) nbrIdx = nbrAtom.GetIdx() if (nbrIdx in atomIdxList) and nbrIdx>atom.GetIdx(): atomBondSet.AddBond( bond) from IPython.display import Image dopt = oedepict.OEPrepareDepictionOptions() dopt.SetDepictOrientation( oedepict.OEDepictOrientation_Horizontal) dopt.SetSuppressHydrogens(supH) oedepict.OEPrepareDepiction(mol, dopt) opts = oedepict.OE2DMolDisplayOptions(width, height, oedepict.OEScale_AutoScale) disp = oedepict.OE2DMolDisplay(mol, opts) aroStyle = oedepict.OEHighlightStyle_Color aroColor = oechem.OEColor(oechem.OEGrey) oedepict.OEAddHighlighting(disp, aroColor, aroStyle, oechem.OEIsAromaticAtom(), oechem.OEIsAromaticBond() ) hstyle = oedepict.OEHighlightStyle_BallAndStick hcolor = oechem.OEColor(oechem.OELightGreen) oedepict.OEAddHighlighting(disp, hcolor, hstyle, atomBondSet) #ofs = oechem.oeosstream() img = oedepict.OEImage(width, height) oedepict.OERenderMolecule(img, disp) #oedepict.OERenderMolecule(ofs, 'png', disp) #ofs.flush() #return Image(data = "".join(ofs.str())) return Image(oedepict.OEWriteImageToString("png",img)) # In[4]: def getMolParamIDToAtomIndex( oemol, ff): """Take an OEMol and a SMIRNOFF force field object and return a dictionary, keyed by parameter ID, where each entry is a tuple of ( smirks, [[atom1, ... atomN], [atom1, ... atomN]) giving the SMIRKS corresponding to that parameter ID and a list of the atom groups in that molecule that parameter is applied to. Parameters ---------- oemol : OEMol OpenEye OEMol with the molecule to investigate. ff : ForceField SMIRNOFF ForceField object (obtained from an ffxml via ForceField(ffxml)) containing FF of interest. Returns ------- param_usage : dictionary Dictionary, keyed by parameter ID, where each entry is a tuple of ( smirks, [[atom1, ... atomN], [atom1, ... atomN]) giving the SMIRKS corresponding to that parameter ID and a list of the atom groups in that molecule that parameter is applied to. """ labels = ff.labelMolecules([oemol]) param_usage = {} for mol_entry in range(len(labels)): for force in labels[mol_entry].keys(): for (atom_indices, pid, smirks) in labels[mol_entry][force]: if not pid in param_usage: param_usage[pid] = (smirks, [atom_indices]) else: param_usage[pid][1].append( atom_indices ) return param_usage # In[5]: def GetAtomInfo(mol, indices, skip_atoms = []): #print(indices) atoms_by_index = dict() charges_by_index = dict() for atom in mol.GetAtoms(): idx = atom.GetIdx() if idx in indices: atoms_by_index[idx] = atom charge = atom.GetFormalCharge() if charge == 0: charges_by_index[idx] = '' elif charge > -1: charges_by_index[idx] = '+%i' % charge else: charges_by_index[idx] = str(charge) atoms = [(atoms_by_index[idx],charges_by_index[idx]) for idx in indices] types = [atom.GetType() for (atom,charge) in atoms] atom_smarts = ['[#%i%s]' % (atom.GetAtomicNum(),charge) for (atom,charge) in atoms] smarts = '~'.join(atom_smarts) types = '~'.join(types) for (atom, charge) in atoms: if atom.GetAtomicNum() in skip_atoms: return (True, smarts, types) return (False, smarts, types) def DepictMolWithParam(mol, indice_list, supH = False, print_atoms = True, skip_atoms = []): skip_count = 0 for IdxByOccurrence in indice_list: skip_it, approx_smarts, types = GetAtomInfo(mol, IdxByOccurrence, skip_atoms) if skip_it: skip_count += 1 continue if print_atoms: print("Approximate SMARTS: %s" % approx_smarts) print("Current Atom Types: %s" % types) display(depictAtomByIdx(mol, IdxByOccurrence, supH = supH)) if skip_count > 0: skips = ','.join([str(i) for i in skip_atoms]) print("This molecule contains %i fragment(s) with at least one atom in (%s)" % (skip_count, skips)) # In[6]: def make_param_histogram(param_id_counts, param_ids, letter, title): # Graph occurrences of bond parameters parm_ids = [ pid for pid in param_ids if pid[0]==letter] parm_ids.sort() counts_parms = [param_id_counts[parm_id] for parm_id in parm_ids] #print( parm_ids) #print( counts_parms) split = int(len(parm_ids)/2) indices = np.arange(len(parm_ids)) fix, ax = plt.subplots(2,1,figsize=(16,5)) ax[0].set_yscale('log', nonposy='clip') ax[1].set_yscale('log', nonposy='clip') rects2 = ax[0].bar(indices[0:split], counts_parms[0:split] ) ax[0].set_ylabel('Count') ax[0].set_xticks( indices[0:split]) ax[0].set_xticklabels( parm_ids[0:split], rotation=-60, ha='left') ax[0].set_xlim(indices[0], indices[split]) plt.yscale('log',nonposy='clip') rects2 = ax[1].bar(indices[split:], counts_parms[split:]) ax[1].set_ylabel('Count') ax[1].set_xticks( indices[split:]) ax[1].set_xticklabels( parm_ids[split:], rotation=-60, ha='left') ax[1].set_xlim(indices[split], indices[-1]+1) ax[0].set_title(title) plt.show() # In[7]: def check_valence(mol): """ Checks for hypervalency Parameter --------- mol - OEMol() Return ------ boolean - True (no inappropriate valency) False (an atom with atomic number < 10 has > 4 Valence) """ for atom in mol.GetAtoms(): atomNum = atom.GetAtomicNum() # find number of neighbors to this atom valence = atom.GetValence() if atomNum <= 10: # first row elements if valence > 4: print("Found a #%i atom with valence %i in molecule %s" % (atomNum, valence, oechem.OECreateIsoSmiString(mol))) return False return True # ## 1. Convert specified SMIRKS `frcmod` file to SMIRNOFF FFXML # In[8]: # Input and output info #infile = 'example.frcmod' # smirnoffish frcmod file to convert infile = 'smirnoffishFrcmod.parm99Frosst.txt' # smirnoffish frcmod file to convert ffxmlFile = 'smirnoff99FrosstFrcmod.offxml' template = 'template.offxml' # Template FFXML file without parameters (but with remainder of contents) # In[9]: # Convert # Already converted convert_frcmod_to_ffxml( infile, template, ffxmlFile) # In[10]: # Load SMIRNOFF FFXML ff = ForceField(ffxmlFile) # We will use this below to access details of parameters # ## 2. Generate or take in a set of molecules in OpenEye OEMol format # # Here we will take a set of molecules from openff-toolkit (or elsewhere), read in all molecules and then uncomment any filters you want. # # Here are some examples of molecule sets in openff-toolkit (at /openff/toolkit/data/molecules/): # * `AlkEthOH_test_filt1_ff.mol2` - 42 alkanes, ethers, and alcohols with parm@frosst atom types # * `DrugBank_atyped.oeb` - DrugBank database with parm@frosst atom types (including "untypable" atoms) # * `zinc-subset-parm@frosst.mol2.gz` - ZINC parm@frosst subset from CCL # # In[10]: # Un-comment this section if you want to use a local directory with individual mol2 files #DBpath = "path/to/molecules/*.mol2" #DBpath = "/Users/bannanc/gitHub/FreeSolv/mol2files_sybyl/*mol2" #DB_files = glob.glob(DBpath) #molecules = list() #for f in DB_files: # molecules += read_molecules(f, verbose=False) # These are atoms you don't want in your set, in this case metaloids or nobel gases skip_atoms = [2, 5, 14,33, 34, 52, 54] # Molecules file in openforcefield/data/molecules/ # OR any relative/absolute path mol_File = 'DrugBank_atyped.oeb' #mol_File = "zinc-subset-parm@frosst.mol2.gz" #mol_File = "/Users/bannanc/Google Drive/RESEARCH/OFF_draftAndTestSpace/eMolecules_missingParameters/output.mol2" molecules = read_molecules(mol_File) # For use later, generate isomeric SMILES for these so we can easily look up molecules by smiles isosmiles_to_mol = dict() repeat = 0 skipped = 0 for mol in molecules: c_mol = OEMol(mol) oechem.OEAddExplicitHydrogens(c_mol) # Get the smiles string for this molecule smi = oechem.OECreateIsoSmiString(c_mol) # uncomment to skip molecules with > n heavy atoms n=200 if OECount(c_mol, OEIsHeavy()) > n: continue # uncomment to skip molecules with metals #if OECount(c_mol, OEIsMetal()) > 0: #skipped +=1 # continue # uncomment to skip molecules containing the skip_atoms has_skip_atom = False for n in skip_atoms: if OECount(c_mol, OEHasAtomicNum(n)) > 0: has_skip_atom = True #if has_skip_atom: #skipped += 1 # continue # uncomment to skip molecules with 5 bonds to atoms with atomic number < 10 (i.e. pentavalent nitrogen) #if not check_valence(c_mol): #skipped += 1 # continue # uncomment to skip single molecules that contain > 1 molecule #if '.' in smi: # skipped +=1 # continue if smi in isosmiles_to_mol: repeat += 1 isosmiles_to_mol[smi] = c_mol oemols = [mol for smi, mol in isosmiles_to_mol.items()] print("\nAfter filtering %i molecules there were %i repeated SMILES.\nThe final set has %i/%i molecules"\ % (skipped, repeat, len(oemols), len(molecules))) # ## 3. Determine parameter usage in molecules # # Here we will use the SMIRNOFF ForceField class to determine (a) which parameters are used in which molecules; (b) which molecules contain a specified parameter; and (c) which molecules do NOT contain a specified parameter. We begin by just loading the SMIRNOFF force field we generated in section 1. # ### 3(a). Determine which parameters are used in which molecules # # Here we determine which parameters are actually used in which molecules, and make a couple example plots of the frequency of parameter occurrences for some of our example parameters. # In[11]: # Track time init_time = time.time() # label molecules labels = ff.labelMolecules(oemols, verbose = False) elapsed = (time.time() - init_time) / 60.0 print("Assigned labels took %.2f minutes" % (elapsed)) # In[12]: # organize dictionaries to reference information init_time = time.time() parameters_by_molecule = dict() parameters_by_ID = dict() param_ids = set() param_id_counts = dict() for idx, mol_dict in enumerate(labels): smi = OECreateIsoSmiString(oemols[idx]) parameters_by_molecule[smi] = dict() for force_type, label_set in mol_dict.items(): for (indices, pid, smirks) in label_set: if not pid in parameters_by_molecule[smi]: parameters_by_molecule[smi][pid] = list() parameters_by_molecule[smi][pid].append(indices) if not pid in parameters_by_ID: parameters_by_ID[pid] = set() parameters_by_ID[pid].add(smi) param_ids.add(pid) for pid in param_ids: param_id_counts[pid] = 0 for smi, pid_dict in parameters_by_molecule.items(): for pid, ind_list in pid_dict.items(): param_id_counts[pid] += len(ind_list) elapsed = (time.time() - init_time) / 60.0 print("Organizing dictionaries took %.2f minutes" % (elapsed)) # For fun/info, do a quick graph of frequency of occurrence of particular parameters. Here, let's just do bond parameters # In[13]: # Create Hisogram for each type of parameter # VdW make_param_histogram(param_id_counts, param_ids, 'n', "VdW for DrugBank Molecules") # Bonds make_param_histogram(param_id_counts, param_ids, 'b', "Bonds for DrugBank Molecules") # Angles make_param_histogram(param_id_counts, param_ids, 'a', "Angles for DrugBank Molecules") # Torsions make_param_histogram(param_id_counts, param_ids, 't', "Torsions for DrugBank Molecules") #make_param_histogram(param_id_counts, param_ids, 'i', "Impropers for DrugBank Molecules") # ### 3(b)-3(c). Determine which molecules do/do not contain selected parameter # # Determine which molecules do and do not contain a specified parameter; give access to isomeric smiles and OEMol for each molecule in each case. # In[16]: # INPUT: Pick what parameter to look at parameter_id = 'n1' # For info, get details of that parameter params = ff.getParameter(paramID=parameter_id) print("For parameter %s, the relevant parameters are:" % parameter_id) print(params) # Find molecules which do/do not use that parameter mols_with_param = [] mols_wo_param = [] for isosmi in parameters_by_molecule: # Store a tuple of (isomeric smiles, oemol) for each if parameter_id in parameters_by_molecule[isosmi].keys(): mols_with_param.append( (isosmi, isosmiles_to_mol[isosmi] )) else: mols_wo_param.append( (isosmi, isosmiles_to_mol[isosmi] )) print("\nThere are %s molecules containing that parameter and %s which do not, out of %s.\n" % (len(mols_with_param), len(mols_wo_param), len(isosmiles_to_mol))) # Print first 10 molecules not containing parameter not_containing = min(10, len(mols_wo_param)) if not_containing == 0: print("\nAll molecules conatin parameter '%s'" % parameter_id) elif not_containing <= 10: print("\nThe %i molecule(s) that do(es) not contain parameter '%s'" % (not_containing, parameter_id)) else: print("First 10 molecules not containing '%s' parameter:" % parameter_id) for i in range(not_containing): print(" %s" % mols_wo_param[i][0]) # Print first 10 molecules containing parameter containing = min(10,len(mols_with_param)) if containing == 0: print("\nNO molecules contain '%s' parameter" % parameter_id) elif containing <= 10: print("\nThe %i molecule(s) containing '%s' parameter:" % (containing, parameter_id)) else: print("\nFirst 10 molecules containing '%s' parameter:" % parameter_id) for i in range(containing): print(" %s" % mols_with_param[i][0]) # In[1]: lowerlimit = 0 upperlimit = 100 # Skip showing molecule if the highlighted parameter # includes an element we know we don't have parameters for skip_atoms = [2, 5, 14,33, 34, 52] # Correct with list of molecules is less than upper limit if len(mols_with_param) < upperlimit: upperlimit = len(mols_with_param) print("\nDepicting molecules %i to %i with parameter '%s'\n\n" % (lowerlimit,upperlimit-1, parameter_id)) # Show molecules form lower to upper limit highlighting fragment with the parameter_id for idx in range(lowerlimit, upperlimit): smiles = mols_with_param[idx][0] mol = mols_with_param[idx][1] skip_it = False OEAddExplicitHydrogens(mol) indice_list = parameters_by_molecule[smiles][parameter_id] print("looking at molecule %i" % idx) print('Selected smiles is %s' % smiles) print('Selected IUPAC name guessed: %s' % oeiupac.OECreateIUPACName(mol) ) print( 'mol title and NumAtoms: ', mol.GetTitle(), mol.NumAtoms() ) print( "Number of times '%s' appears: %i" % (parameter_id, len(indice_list))) DepictMolWithParam( mol, indice_list, supH = False, skip_atoms=skip_atoms) print() print() print() # In[ ]: