This tutorial will walk you through how to design miniproteins in PyRosetta using the PyData stack for analysis and distributed computing.
This Jupyter notebook uses parallelization and is not meant to be executed within a Google Colab environment.
Please see the setup instructions in Chapter 16.00.
Alexander S. Ford, Brian D. Weitzner, Christopher D. Bahl
Documentation for the pyrosetta.distributed
namespace can be found here: https://nbviewer.jupyter.org/github/proteininnovation/Rosetta-PyData_Integration/blob/master/distributed_overview.ipynb
!pip install pyrosettacolabsetup
import pyrosettacolabsetup; pyrosettacolabsetup.install_pyrosetta()
import pyrosetta; pyrosetta.init()
import pyrosetta
import dask.bag
import dask.distributed
import os
import pandas
import seaborn
from matplotlib import pylab
from pyrosetta.distributed.tasks.score import ScorePoseTask
from pyrosetta.distributed.io import pose_from_pdbstring
from pyrosetta.distributed.packed_pose import to_dict
import zipfile
Set up a LocalCluster to fully utilize the current machine.
# When the DEBUG environment variable is set, skip cluster creation and leave
# the client unset; otherwise start a local Dask cluster with one worker and
# one thread per worker and connect a client to it.
if os.getenv("DEBUG"):
    client = None
else:
    cluster = dask.distributed.LocalCluster(n_workers=1, threads_per_worker=1)
    client = dask.distributed.Client(cluster)
client
Load decoys from the batch compute run, adding annotations for the designed sequence, cysteine count, and cysteine locations.
def load_source(decoy):
    """Load one decoy from the zipped library and annotate it with sequence data.

    Reads the member named ``decoy`` from the zip archive at the module-level
    ``library`` path, builds a packed pose from the PDB text, and stores
    annotation values in the pose's scores.

    Args:
        decoy: Name of the PDB member inside the ``library`` zip archive.

    Returns:
        The result of ``update_scores`` on the loaded pose, carrying the
        ``library``, ``decoy``, ``sequence``, ``num_res``, ``num_cys`` and
        ``cys_locations`` entries.
    """
    # Close the archive as soon as the member has been read so that file
    # handles are not leaked when this function is mapped over many decoys.
    with zipfile.ZipFile(library) as archive:
        src_pdb = archive.read(decoy)

    p = pose_from_pdbstring(src_pdb)

    # Fetch the sequence once and reuse it for every annotation below.
    sequence = p.pose.sequence()
    # Positions are 0-based offsets into the sequence string.
    cys_locations = [i for i, c in enumerate(sequence) if c == "C"]

    return p.update_scores(
        library=library,
        decoy=decoy,
        sequence=sequence,
        num_res=len(sequence),
        num_cys=len(cys_locations),
        cys_locations=",".join(map(str, cys_locations)),
    )
if not os.getenv("DEBUG"):
    # Path to the zipped decoy library, also read by load_source().
    library = "inputs/EHEE_library.zip"
    # Open the archive in a context manager so its file handle is released
    # once the PDB member names have been collected.
    with zipfile.ZipFile(library) as archive:
        decoy_names = [name for name in archive.namelist() if name.endswith(".pdb")]
if not os.getenv("DEBUG"):
    # Pipeline: decoy names -> loaded, annotated poses -> scored poses.
    decoys = (
        dask.bag.from_sequence(decoy_names)
        .map(load_source)
        .map(ScorePoseTask())
        .persist()
    )
if not os.getenv("DEBUG"):
    # Convert each scored pose to a dict, gather the results, and tabulate
    # them ordered by total_score.
    result_frame = pandas.DataFrame.from_records(
        decoys.map(to_dict).compute()
    ).sort_values(by="total_score")
We anticipate a distribution of scores, with total scores increasing as more disulfides are inserted.
if not os.getenv("DEBUG"):
    # Distribution of total_score for each value of num_cys.
    seaborn.boxplot(data=result_frame, x="num_cys", y="total_score")
if not os.getenv("DEBUG"):
    # Distribution of total_score for each distinct cys_locations value.
    seaborn.boxplot(data=result_frame, x="cys_locations", y="total_score")
Select the best model by total_score for each inserted disulfide location, allowing us to test a variety of disulfide architectures.
if not os.getenv("DEBUG"):
    # Take the first row of every cys_locations group in result_frame's
    # current row order, renumber the rows, and convert them with to_dict.
    top_rows = result_frame.groupby("cys_locations").head(1)
    best_by_location = to_dict(top_rows.reset_index(drop=True))
    print(len(best_by_location))
if not os.getenv("DEBUG"):
    # Write one FASTA record per selected decoy: a ">" header line with the
    # decoy name followed by a line holding its sequence.
    with open("EHEE.best_by_location.fasta", "w") as fasta:
        for record in best_by_location:
            header = f">{record['decoy']}"
            fasta.write(header + "\n")
            fasta.write(str(record['sequence']) + "\n")
# Show the first lines of the FASTA file under expected_outputs/ via the
# shell. NOTE(review): this path differs from the EHEE.best_by_location.fasta
# written to the working directory by the preceding cell - confirm intended.
if not os.getenv("DEBUG"):
!head expected_outputs/EHEE.best_by_location.fasta
Visualize the selected models using py3Dmol or the PyMOLMover, as you have learned before.
# Optional 3x3 grid viewer, kept disabled inside a string literal.
# The grid row index uses integer division (i//3) so that each model is
# addressed with whole-number (row, column) coordinates under Python 3.
'''
import py3Dmol
view = py3Dmol.view(viewergrid=(3, 3), linked=False, width=900, height=900)
for i in range(9):
    view.addModel( pyrosetta.distributed.io.to_pdbstring(best_by_location[i]), "pdb", viewer=(i//3, i%3),)
view.setStyle({'cartoon':{'color':'spectrum'}})
view.setStyle({"resn": "CYS"}, {'stick': {}, 'cartoon':{'color':'spectrum'}} )
view.zoomTo()
'''