#!/usr/bin/env python # coding: utf-8 # # *This notebook contains material from [PyRosetta](https://RosettaCommons.github.io/PyRosetta.notebooks); # content is available [on Github](https://github.com/RosettaCommons/PyRosetta.notebooks.git).* # # < [Part I: Parallelized Global Ligand Docking with `pyrosetta.distributed`](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.05-Ligand-Docking-dask.ipynb) | [Contents](toc.ipynb) | [Index](index.ipynb) | [PyRosettaCluster Tutorial 1B. Reproduce simple protocol](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.07-PyRosettaCluster-Reproduce-simple-protocol.ipynb) >

# Open in Colab

# # PyRosettaCluster Tutorial 1A. Simple protocol
#
# PyRosettaCluster Tutorial 1A is a Jupyter Lab that generates a decoy using
# `PyRosettaCluster`. It is the simplest use case, where one protocol takes one
# input `.pdb` file and returns one output `.pdb` file.
#
# All information needed to reproduce the simulation is included in the output
# `.pdb` file. After completing PyRosettaCluster Tutorial 1A, see
# PyRosettaCluster Tutorial 1B to learn how to reproduce simulations from
# PyRosettaCluster Tutorial 1A.

# *Warning*: This notebook uses `pyrosetta.distributed.viewer` code, which runs
# in `jupyter notebook` and might not run if you're using `jupyterlab`.

# *Note:* This Jupyter notebook uses parallelization and is **not** meant to be
# executed within a Google Colab environment.

# *Note:* This Jupyter notebook requires the PyRosetta distributed layer which
# is obtained by building PyRosetta with the `--serialization` flag or
# installing PyRosetta from the RosettaCommons conda channel
#
# **Please see Chapter 16.00 for setup instructions**

# *Note:* This Jupyter notebook is intended to be run within **Jupyter Lab**,
# but may still be run as a standalone Jupyter notebook.

# ### 1. Import packages

# In[7]:

import bz2
import glob
import logging
import os

import pyrosetta
import pyrosetta.distributed.io as io
import pyrosetta.distributed.viewer as viewer
from pyrosetta.distributed.cluster import PyRosettaCluster

logging.basicConfig(level=logging.INFO)

# ### 2. Initialize a compute cluster using `dask`
#
# 1. Click the "Dask" tab in Jupyter Lab (arrow, left)
# 2. Click the "+ NEW" button to launch a new compute cluster (arrow, lower)
#
# ![title](Media/dask_labextension_1.png)
#
# 3. Once the cluster has started, click the brackets to "inject client code"
#    for the cluster into your notebook
#
# ![title](Media/dask_labextension_2.png)
#
# Inject client code here, then run the cell:

# In[8]:

# This cell is an example of the injected client code. You should delete this
# cell and instantiate your own client with your scheduler's IP/port address.
if not os.getenv("DEBUG"):
    from dask.distributed import Client

    client = Client("tcp://127.0.0.1:40329")
else:
    client = None
client

# Providing a `client` allows you to monitor parallelization diagnostics from
# within this Jupyter Lab Notebook. However, providing a `client` is only
# optional for the `PyRosettaCluster` instance and `reproduce` function. If you
# do not provide a `client`, then `PyRosettaCluster` will instantiate a
# `LocalCluster` object using the `dask` module by default, or an `SGECluster`
# or `SLURMCluster` object using the `dask-jobqueue` module if you provide the
# `scheduler` argument parameter, e.g.:
# ***
# ```
# PyRosettaCluster(
#    ...
#    client=client,  # Monitor diagnostics with existing client (see above)
#    scheduler=None, # Bypasses making a LocalCluster because client is provided
#    ...
# )
# ```
# ***
# ```
# PyRosettaCluster(
#    ...
#    client=None,    # Existing client was not input (default)
#    scheduler=None, # Runs the simulations on a LocalCluster (default)
#    ...
# )
# ```
# ***
# ```
# PyRosettaCluster(
#    ...
#    client=None,      # Existing client was not input (default)
#    scheduler="sge",  # Runs the simulations on the SGE job scheduler
#    ...
# )
# ```
# ***
# ```
# PyRosettaCluster(
#    ...
#    client=None,        # Existing client was not input (default)
#    scheduler="slurm",  # Runs the simulations on the SLURM job scheduler
#    ...
# )
# ```

# ### 3. Define or import the user-provided PyRosetta protocol(s):
#
# Remember, you *must* import `pyrosetta` locally within each user-provided
# PyRosetta protocol. Other libraries may not need to be locally imported
# because they are serializable by the `distributed` module. Although, it is a
# good practice to locally import all of your modules in each user-provided
# PyRosetta protocol.
# In[9]:

if not os.getenv("DEBUG"):
    from additional_scripts.my_protocols import my_protocol

# In[10]:

if not os.getenv("DEBUG"):
    # This sends a local file up to all worker nodes.
    client.upload_file("additional_scripts/my_protocols.py")

# #### Let's look at the definition of the user-provided PyRosetta protocol
# #### `my_protocol` located in `additional_scripts/my_protocols.py`:
# ```
# def my_protocol(input_packed_pose=None, **kwargs):
#     """
#     Relax the input `PackedPose` object.
#
#     Args:
#         input_packed_pose: A `PackedPose` object to be repacked. Optional.
#         **kwargs: PyRosettaCluster task keyword arguments.
#
#     Returns:
#         A `PackedPose` object.
#     """
#     import pyrosetta  # Local import
#     import pyrosetta.distributed.io as io  # Local import
#     import pyrosetta.distributed.tasks.rosetta_scripts as rosetta_scripts  # Local import
#
#     packed_pose = io.pose_from_file(kwargs["s"])
#
#     xml = """
#     <ROSETTASCRIPTS>
#       ...  # NOTE(review): the RosettaScripts XML body was stripped by the
#       ...  # notebook export — see additional_scripts/my_protocols.py for the
#       ...  # actual protocol definition.
#     </ROSETTASCRIPTS>
#     """
#
#     return rosetta_scripts.SingleoutputRosettaScriptsTask(xml)(packed_pose)
# ```

# ### 4. Define the user-provided keyword argument(s) (i.e. `kwargs`):

# Upon PyRosetta initialization on the remote worker, the "`options`" and
# "`extra_options`" `kwargs` get concatenated before initialization. However,
# specifying the "`extra_options`" `kwargs` will override the default
# `-out:levels all:warning` command line flags, and specifying the "`options`"
# `kwargs` will override the default `-ex1 -ex2aro` command line flags.

# In[11]:

def create_kwargs():
    """Yield the task keyword arguments for one PyRosettaCluster simulation.

    Each yielded dict defines one task; `PyRosettaCluster` consumes this
    generator via its `tasks=` parameter. The input structure is passed as a
    path string under "s" (not as a Pose object) so that the pose is
    instantiated on the remote worker under a seed that PyRosettaCluster
    records for reproducibility.
    """
    yield {
        "options": "-ex1",
        "extra_options": "-out:level 300 -multithreading:total_threads 1",  # Used by pyrosetta.init() on distributed workers
        "set_logging_handler": "interactive",  # Used by pyrosetta.init() on distributed workers
        "s": os.path.join(os.getcwd(), "inputs", "1QYS.pdb"),
    }

# Ideally, all pose manipulation is accomplished with the user-provided
# PyRosetta protocols.
If you must manipulate a pose prior to instantiating `PyRosettaCluster`, here are some considerations:
- Avoid passing `Pose` and `PackedPose` objects through `create_kwargs()`. You might notice that the above cell passes the protein structure information to `PyRosettaCluster` as a `str` type locating the `.pdb` file. In this way, the input `PackedPose` object is instantiated from that `str` within `PyRosettaCluster` on the remote workers (using `io.pose_from_file(kwargs["s"])`) using a random seed which is saved by `PyRosettaCluster`. This allows the protocol to be reproduced, and avoids passing redundant large chunks of data over the network.
- It may be tempting to instantiate your pose before `PyRosettaCluster`, and pass a `Pose` or `PackedPose` object into the `create_kwargs()`. However, in this case PyRosetta will be initialized with a random seed outside `PyRosettaCluster`, and that random seed will not be saved by `PyRosettaCluster`. As a consequence, any action taken on the pose (e.g. filling in missing heavy atoms) will not be reproducible.
- If you must instantiate your pose before `PyRosettaCluster`, to ensure reproducibility the user must initialize PyRosetta with the constant seed `1111111` within the Jupyter notebook or standalone python script using:
#
# ```
# import pyrosetta
# pyrosetta.init("-run:constant_seed 1")
# ```
#
# The `-run:constant_seed 1` command line flag defaults to the seed `1111111` ([documentation](https://www.rosettacommons.org/docs/latest/rosetta_basics/options/run-options)). Then, instantiate the pose:
#
# ```
# input_packed_pose = pyrosetta.io.pose_from_sequence("TEST")
# ...Perform any pose manipulation...
# ```
#
# and then instantiate `PyRosettaCluster` with the additional `input_packed_pose` parameter argument, e.g.:
#
# ```
# PyRosettaCluster(
#    ...
#    input_packed_pose=input_packed_pose,
#    ...
# )
# ```
#
# For an initialization example, see Tutorial 4.
# # In summary, the best practice involves giving `create_kwargs` information which will be used by the distributed protocol to create a pose within `PyRosettaCluster`. In edge cases, the user may provide a `Pose` or `PackedPose` object to the `input_packed_pose` argument of `PyRosettaCluster` and set a constant seed of `1111111` outside of `PyRosettaCluster`. # ### 5. Launch the original simulation using the `distribute()` method # # The protocol produces an output decoy, the exact coordinates of which we will reproduce in Tutorial 1B. # If the Jupyter Lab Notebook or standalone PyRosetta script did not yet initialize PyRosetta before instantiating `PyRosettaCluster` (preferred workflow), then `PyRosettaCluster` automatically initializes PyRosetta within the Jupyter Lab Notebook or standalone PyRosetta script with the command line flags `-run:constant_seed 1 -multithreading:total_threads 1 -mute all`. Thus, the master node is initialized with the default constant seed, where the master node acts as the client to the distributed workers. The distributed workers actually run the user-provided PyRosetta protocol(s), and each distributed worker initializes PyRosetta with a random seed, which is the seed saved by PyRosettaCluster for downstream reproducibility. The master node is always initialized with a constant seed as a best practice. 
# To monitor parallelization diagnostics in real-time, in the "Dask" tab,
# click the various diagnostic tools _(arrows)_ to open new tabs:
# ![title](Media/dask_labextension_4.png)
# Arrange the diagnostic tool tabs within Jupyter Lab how you best see fit by
# clicking and dragging them:
# ![title](Media/dask_labextension_3.png)

# In[12]:

if not os.getenv("DEBUG"):
    output_path = os.path.join(os.getcwd(), "outputs_1A")
    PyRosettaCluster(
        tasks=create_kwargs,
        client=client,
        scratch_dir=output_path,
        output_path=output_path,
        nstruct=4,  # Run the first user-provided PyRosetta protocol four times in parallel
    ).distribute(protocols=[my_protocol])

# While jobs are running, you may monitor their progress using the dask
# dashboard diagnostics within Jupyter Lab!

# ### 6. Visualize the resultant decoys

# Gather the output decoys on disk into poses in memory:

# In[13]:

if not os.getenv("DEBUG"):
    results = glob.glob(os.path.join(output_path, "decoys", "*", "*.pdb.bz2"))
    packed_poses = []
    for bz2file in results:
        # Decoys are written bz2-compressed; decompress and parse each PDB
        # string back into a pose in memory.
        with open(bz2file, "rb") as f:
            packed_poses.append(
                io.pose_from_pdbstring(bz2.decompress(f.read()).decode())
            )

# View the poses in memory by clicking and dragging to rotate, and zooming in
# and out with the mouse scroller.

# In[14]:

if not os.getenv("DEBUG"):
    view = viewer.init(packed_poses, window_size=(800, 600))
    view.add(viewer.setStyle())
    view.add(viewer.setStyle(colorscheme="whiteCarbon", radius=0.25))
    view.add(viewer.setHydrogenBonds())
    view.add(viewer.setHydrogens(polar_only=True))
    view.add(viewer.setDisulfides(radius=0.25))
    view()

# Using the `pyrosetta.distributed.viewer` macromolecular visualizer, you can
# visualize your results in real-time as they complete.
# ![title](Media/viewer_1.png)

# ### Congrats!
#
# You have successfully performed a PyRosetta simulation using
# `PyRosettaCluster`! In the next tutorial we will reproduce one of the decoys
# precisely to make our computational science more reproducible.
# # < [Part I: Parallelized Global Ligand Docking with `pyrosetta.distributed`](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.05-Ligand-Docking-dask.ipynb) | [Contents](toc.ipynb) | [Index](index.ipynb) | [PyRosettaCluster Tutorial 1B. Reproduce simple protocol](http://nbviewer.jupyter.org/github/RosettaCommons/PyRosetta.notebooks/blob/master/notebooks/16.07-PyRosettaCluster-Reproduce-simple-protocol.ipynb) >

Open in Colab