#!/usr/bin/env python
# coding: utf-8

# # Exploring molecular generative models checking out their latent spaces
#
# Here you can find some examples on how to explore the latent spaces of some of the molecular generative models hosted in GT4SD.

# In[1]:


import torch
import mols2grid
import seaborn as sns
from tqdm import tqdm
from rdkit import Chem
# Import the RDKit submodules explicitly: `from rdkit import Chem` alone does
# not guarantee that `Chem.Descriptors` / `Chem.QED` are loaded.
from rdkit.Chem import Descriptors, QED
from typing import Optional, List
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from paccmann_generator.drug_evaluators.scsore import SCScore


def render_latent_points(latent_points: torch.Tensor, values: Optional[List] = None):
    """Scatter-plot a 2D PCA projection of latent points.

    Args:
        latent_points: tensor with one latent point per row.
        values: optional per-point values used to colour the scatter plot
            (passed as ``hue`` to seaborn). Defaults to no colouring.

    Returns:
        The object returned by ``seaborn.scatterplot``.
    """
    # scikit-learn operates on NumPy arrays: detach and move to host memory so
    # that tensors created on an accelerator device can be projected as well.
    points = latent_points.detach().cpu().numpy()
    decomposed_points = PCA(n_components=2).fit_transform(points)
    return sns.scatterplot(
        x=decomposed_points[:, 0], y=decomposed_points[:, 1], hue=values
    )


def keep_valid_molecules(molecules: List, latent_points: torch.Tensor):
    """Drop the entries whose molecule is ``None`` together with their latent points.

    Args:
        molecules: one entry per row of ``latent_points``; ``None`` marks a
            latent point that did not decode to a parsable molecule.
        latent_points: tensor with one latent point per row.

    Returns:
        A tuple ``(valid_molecules, valid_latent_points)`` with matching order.
    """
    valid_indexes = [
        index for index, molecule in enumerate(molecules) if molecule is not None
    ]
    return [molecules[index] for index in valid_indexes], latent_points[valid_indexes]


def drug_likeness(molecule) -> float:
    """Return the QED score of a molecule, or 0.0 when it cannot be computed."""
    try:
        return QED.qed(molecule)
    except Exception:
        # Best effort on purpose: a molecule RDKit cannot score is rendered
        # with the lowest drug likeness instead of aborting the whole plot.
        return 0.0


# ## PolymerBlocks
#
# An algorithm for generating monomers and catalyst for polymer chemistry.

# In[2]:


from gt4sd.algorithms.generation.polymer_blocks.core import PolymerBlocksGenerator, PolymerBlocks

configuration = PolymerBlocksGenerator()
algorithm = PolymerBlocks(configuration=configuration)


# Via the algorithm you can easily inspect the generated molecules interactively:

# In[3]:


molecules = list(algorithm.sample(15))


# In[4]:


mols2grid.display([Chem.MolFromSmiles(molecule) for molecule in molecules], fixedBondLength=200)


# To unpack the model and inspect its latent spaces we can simply conduct an exploration using a random point cloud

# In[5]:


# get the actual generator
generator = configuration.get_conditional_generator(configuration.ensure_artifacts())
# randomly generate points in the latent space
number_of_latent_points = 1024
latent_points = torch.randn(
    # making sure the points are chunked properly
    number_of_latent_points - (number_of_latent_points % generator.batch_size),
    generator.gru_decoder.latent_dim,
    device=generator.device,
)
# keep points related to valid molecules
molecules = []
for latent_points_batch in tqdm(torch.split(latent_points, generator.batch_size)):
    indexes = generator.decode(latent_points_batch)
    molecules.extend(
        Chem.MolFromSmiles(generator.smiles_language.token_indexes_to_smiles(index))
        for index in indexes
    )
molecules, latent_points = keep_valid_molecules(molecules, latent_points)


# In[6]:


get_ipython().run_line_magic('matplotlib', 'inline')
values = [Descriptors.MolWt(molecule) for molecule in molecules]
_ = render_latent_points(latent_points, values=values)
_ = plt.title("Rendering molecular weight in a 2D projection of the latent space")


# ## PaccMannRL
#
# A family of algorithms for generating candidate hit-like molecules either targeting an omic profile [[1]](https://doi.org/10.1016/j.isci.2021.102269) or AA sequences [[2]](https://doi.org/10.1088/2632-2153/abe808).
#
# Let's explore the model for targeted ligand design given a protein sequence.

# In[7]:


from gt4sd.algorithms.conditional_generation.paccmann_rl.core import PaccMannRL, PaccMannRLProteinBasedGenerator

# >sp|Q9BYF1|ACE2_HUMAN Angiotensin-converting enzyme 2 OS=Homo sapiens OX=9606 GN=ACE2 PE=1 SV=2
# the filter strips the whitespace/newlines used to lay out the sequence
target = "".join(filter(str.isalpha, """
MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQ
NMNNAGDKWSAFLKEQSTLAQMYPLQEIQNLTVKLQLQALQQNGSSVLSEDKSKRLNTIL
NTMSTIYSTGKVCNPDNPQECLLLEPGLNEIMANSLDYNERLWAWESWRSEVGKQLRPLY
EEYVVLKNEMARANHYEDYGDYWRGDYEVNGVDGYDYSRGQLIEDVEHTFEEIKPLYEHL
HAYVRAKLMNAYPSYISPIGCLPAHLLGDMWGRFWTNLYSLTVPFGQKPNIDVTDAMVDQ
AWDAQRIFKEAEKFFVSVGLPNMTQGFWENSMLTDPGNVQKAVCHPTAWDLGKGDFRILM
CTKVTMDDFLTAHHEMGHIQYDMAYAAQPFLLRNGANEGFHEAVGEIMSLSAATPKHLKS
IGLLSPDFQEDNETEINFLLKQALTIVGTLPFTYMLEKWRWMVFKGEIPKDQWMKKWWEM
KREIVGVVEPVPHDETYCDPASLFHVSNDYSFIRYYTRTLYQFQFQEALCQAAKHEGPLH
KCDISNSTEAGQKLFNMLRLGKSEPWTLALENVVGAKNMNVRPLLNYFEPLFTWLKDQNK
NSFVGWSTDWSPYADQSIKVRISLKSALGDKAYEWNDNEMYLFRSSVAYAMRQYFLKVKN
QMILFGEEDVRVANLKPRISFNFFVTAPKNVSDIIPRTEVEKAIRMSRSRINDAFRLNDN
SLEFLGIQPTLGPPNQPPVSIWLIVFGVVMGVIVVGIVILIFTGIRDRKKKNKARSGENP
YASIDISKGENNPGFQNTDDVQTSF
"""))
configuration = PaccMannRLProteinBasedGenerator()
algorithm = PaccMannRL(configuration=configuration, target=target)


# Via the algorithm you can easily inspect the generated molecules interactively:

# In[8]:


molecules = list(algorithm.sample(15))


# In[9]:


mols2grid.display([Chem.MolFromSmiles(molecule) for molecule in molecules], fixedBondLength=200)


# To unpack the model and inspect its latent spaces we can simply conduct an exploration using a random point cloud

# In[10]:


# get the actual generator
generator = configuration.get_conditional_generator(configuration.ensure_artifacts())
# randomly generate points in the latent space
number_of_latent_points = 512
latent_points = torch.randn(
    number_of_latent_points, generator.encoder_latent_size, device=generator.device
)
# keep points related to valid molecules
molecules = []
for latent_point in tqdm(torch.unsqueeze(latent_points, 1)):
    smiles_list = generator.get_smiles_from_latent(latent_point)
    # keeping only a molecule per point; when nothing was generated a None
    # placeholder keeps `molecules` aligned with the rows of `latent_points`
    first_smiles = next(iter(smiles_list), None)
    molecules.append(
        Chem.MolFromSmiles(first_smiles) if first_smiles is not None else None
    )
molecules, latent_points = keep_valid_molecules(molecules, latent_points)


# In[11]:


get_ipython().run_line_magic('matplotlib', 'inline')
values = [drug_likeness(molecule) for molecule in molecules]
_ = render_latent_points(latent_points, values=values)
_ = plt.title("Rendering drug likeness in a 2D projection of the latent space")


# ## PaccMannGP
#
# An algorithm for generating molecules using controlled sampling based on Gaussian Processes [[3]](https://doi.org/10.1021/acs.jcim.1c00889).

# In[12]:


from gt4sd.algorithms.controlled_sampling.paccmann_gp.core import PaccMannGP, PaccMannGPGenerator

# maximizing drug likeness and synthesizability
target = {"qed": {"weight": 1.0}, "sa": {"weight": 1.0}}
configuration = PaccMannGPGenerator()
algorithm = PaccMannGP(configuration=configuration, target=target)


# Via the algorithm you can easily inspect the generated molecules interactively:

# In[13]:


molecules = list(algorithm.sample(15))


# In[14]:


mols2grid.display([Chem.MolFromSmiles(molecule) for molecule in molecules], fixedBondLength=200)


# To unpack the model and inspect its latent spaces we can simply conduct an exploration using a random point cloud

# In[15]:


# get the actual generator
generator = configuration.get_conditional_generator(configuration.ensure_artifacts())
# randomly generate points in the latent space
number_of_latent_points = 1024
latent_points = torch.randn(
    # making sure the points are chunked properly
    number_of_latent_points - (number_of_latent_points % generator.batch_size),
    generator.latent_dim,
    device=generator.device,
)
# keep points related to valid molecules
molecules = []
for latent_points_batch in tqdm(torch.split(latent_points, generator.batch_size)):
    smiles_list = generator.smiles_generator.generate_smiles(latent_points_batch.unsqueeze(0))
    molecules.extend(Chem.MolFromSmiles(smiles) for smiles in smiles_list)
molecules, latent_points = keep_valid_molecules(molecules, latent_points)


# In[16]:


get_ipython().run_line_magic('matplotlib', 'inline')
values = [drug_likeness(molecule) for molecule in molecules]
_ = render_latent_points(latent_points, values=values)
_ = plt.title("Rendering drug likeness in a 2D projection of the latent space")


# ## AdvancedManufacturing
#
# A collection of algorithms for generating molecules using controlled sampling leveraging data-driven continuous representations of molecules [[4]](https://doi.org/10.1021/acscentsci.7b00572) extending the autoencoders architecture considered (e.g., VAE-RNNs, VAE-Transformers).

# In[17]:


from gt4sd.algorithms.controlled_sampling.advanced_manufacturing.core import AdvancedManufacturing, CatalystGenerator

# generate catalysts with target binding energy
target = 10.0
configuration = CatalystGenerator()
algorithm = AdvancedManufacturing(configuration=configuration, target=target)


# Via the algorithm you can easily inspect the generated molecules interactively:

# In[23]:


molecules = list(algorithm.sample(15))


# In[24]:


mols2grid.display([Chem.MolFromSmiles(molecule) for molecule in molecules], fixedBondLength=200)


# To unpack the model and inspect its latent spaces we can simply conduct an exploration using a random point cloud

# In[20]:


# get the actual generator
generator = configuration.get_conditional_generator(configuration.ensure_artifacts())


# In[21]:


# the generator built in the previous cell is reused here
# randomly generate points in the latent space
number_of_latent_points = 256
latent_points = torch.randn(number_of_latent_points, generator.vae.z_dimension)
# keep points related to valid molecules
molecules = [
    Chem.MolFromSmiles(generator.vae.decode(latent_point))
    for latent_point in tqdm(latent_points)
]
molecules, latent_points = keep_valid_molecules(molecules, latent_points)


# In[22]:


get_ipython().run_line_magic('matplotlib', 'inline')
scs_scorer = SCScore()
values = [scs_scorer(molecule) for molecule in molecules]
_ = render_latent_points(latent_points, values=values)
_ = plt.title("Rendering SCS score in a 2D projection of the latent space")