#!/usr/bin/env python # coding: utf-8 # # Rstoolbox IO & Silent Extraction # # @Author: 吴炜坤 # # @email:weikun.wu@xtalpi.com/weikunwu@163.com # rstoolbox是由RosettaCommon社区开发者jaumebonet开发的一款专门用于分析Rosett Silent文件的工具包。他可以以pandas的DataFrame的形式对Silent文件中的数据进行提取,使用其中的一些API可以非常方便地进行分析和作图。 # > github: https://github.com/jaumebonet/RosettaSilentToolbox # # > doc: http://jaumebonet.cat/RosettaSilentToolbox # ### 一、安装 # 命令解决: # ``` # pip install rstoolbox # ``` # 配置环境: 在\$HOME 目录下创建.rstoolbox.cfg文件。 # ``` # cd ~ # vi ~/.rstoolbox.cfg # ``` # .rstoolbox.cfg文件中的内容: # ``` # rosetta: # compilation: macosclangrelease # path: /Volumes/MiniTwo/bin/Rosetta/main/source/bin/ # system: # output: ./ # overwrite: false # ``` # 关键设置: # - compilation: rosetta app的后缀名 # - path: rosetta app所处的bin文件夹 【因此本地环境中需要额外安装好Rosetta】 # In[1]: # 初始化pyrosetta from pyrosetta import * init() # ### 二、SilentFiles文件的获取 # 在第一章节,我们曾经提及过如何从Pose中输出SilentFiles以及如何在Pose中添加额外的信息,同时SimpleMetrics中的信息也会被自动储存到SilentFiles文件中。所以一个标准的流程跑下来SilentFiles应该有着所有我们需要分析的数据。 # 复习一下输出Silent文件以及在Pose中添加额外的信息的方法: # **以下的设计可能会耗时比较久,可以选择性跳过,结果文件已经储存在data文件夹下!** # In[2]: # # 举例使用FastDesign快速设计一些序列和结构: # from pyrosetta import pose_from_pdb, init, create_score_function # from pyrosetta.rosetta.protocols.denovo_design.movers import FastDesign # from pyrosetta.rosetta.core.pack.task import TaskFactory # from pyrosetta.rosetta.core.pose import Pose # from pyrosetta.io import poses_to_silent # init('') # # load pose # starting_pose = pose_from_pdb('./data/EHEE_rd4_0976.pdb') # ref2015 = create_score_function('ref2015') # design_tf = TaskFactory() # # setup FastDesign # fastdesign = FastDesign(ref2015, 1) # fastdesign.set_default_movemap() #使用默认的Movemap() # fastdesign.set_task_factory(design_tf) # # design for 10 times: code for design pose. # for i in range(10): # design_pose = Pose() # design_pose.assign(starting_pose) # assign pose # fastdesign.apply(design_pose) ## apply design # # output to silent file; # poses_to_silent(fastdesign, './data/design_result.silent') # ### 三、使用rstoolbox做数据分析 # 上述我们已经通过分析计算了10条蛋白质的序列和结构,目前我们需要对输出的Silent文件进行分析,从中提取需要的数据并作图展示。 # rstoolbox主要可用于: # - 分析序列profile # - 分析储存在silent文件中的score/其他数据,作图展示 # - 比较不同参数设计的结果性质分布 # - 分析、筛选以及导出满足需求的序列和结构 # #### 3.1 Silent文件数据读取 # rstoolbox读取数据时,第一步要确定load什么数据。load数据的填写格式可以参考: # > http://jaumebonet.cat/RosettaSilentToolbox/tutorials/readrosetta.html#readrosetta # In[3]: # 初始化 import rstoolbox as rs from rstoolbox.io import parse_rosetta_file # In[4]: # 最简单的读取方式(全读取式): rules = {'sequence': 'A'} silent_file = './data/design_result.silent' raw_df = parse_rosetta_file(silent_file, rules) # 打印读取数据: raw_df # 所有记录在silent_file中的数据都会被读取,并储存在pandas的DataFrame对象中。 # 除此以外,parse_rosetta_file支持定义description,通过设定description一些过滤条件,可以预先筛选掉一部分的数据,或保留与分析过程最关心的数据。目前description支持10种语法: # # |definition term |description | # |-----------------|-----------------------------------------------------------------------------------------------------------------------------------------| # |scores |Basic selection of the scores to store. Default is all scores. | # |scores_ignore |Selection of specific scores to ignore. | # |scores_rename |Rename some score names to others. | # |scores_by_residue|Pick score by residue types into a single array value. | # |scores_missing |Names of scores that might be missing in some decoys. | # |naming |Use the decoy identifier’s name to create extra score terms. | # |sequence |Pick sequence data from the silent file. | # |structure |Pick structural data from the silent file. | # |psipred |Pick PSIPRED data from the silent file. | # |dihedrals |Retrieve dihedral data from the silent file. | # |labels |Retrieve residue labels from the silent file. | # |graft_ranges |When using the MotifGraftMover, multi-columns will be created when more than one segment is grafted. Provide here the number of segments.| # # 此处以保留某些score项作为description的条件作为使用案例:(非必要,全部读进来也不会产生负面影响) # In[5]: # 定义description, description是一个字典格式; rules = {'sequence': 'A'} # rules = {'scores_ignore': [''], 'sequence': 'HL'} # 根据rules进行读取: silent_file = './data/design_result.silent' df_ignore = parse_rosetta_file(silent_file, rules) # 打印读取数据: df_ignore # **练习: 尝试上述不同的rules,看看会有些什么效果?** # In[ ]: # #### 3.2 Selection:数据过滤筛选(重要) # 其实rstoolbox的重要的功能之一就是读取silent文件至pandas的DataFrame,之后的过滤筛选都是用的DataFrame的逻辑。 # 此处主要介绍: # - 选择语法 # - 排序语法 # - 截取语法 # **选择语法1**: # df_selection = df[ ( df['metrics'] == value ) ] # In[6]: # 例句1: df_selection1 = raw_df[(raw_df['score'] <= -115)] df_selection1 # **选择语法2: df_selection = df[ (df['metrics1'] == value) & (df['metrics2'] >= value)]** # In[7]: # 例句2: df_selection2 = raw_df[(raw_df['score'] <= -115) & (raw_df['percent_core'] <= 0.25)] df_selection2 # **排序语法** df_selection = df.sort_values('metrics').head(value) # - sort_value: 从小往上排序 # - head: 选取top排名 # - tail: 选取尾部排名 # In[8]: # 例句3--排序: df_selection3 = raw_df.sort_values('score') df_selection3 # In[9]: # 例句4--截断: df_selection4 = raw_df.sort_values('score').head(5) df_selection4 # In[10]: # 例句5--截断: df_selection5 = raw_df.sort_values('score').tail(5) df_selection5 # #### 3.3 Plot模块 # rstoolbox下plot模块进行一些简单的图表分析, 主要设计的API: # # |definition term |description | # |-----------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| # |multiple_distributions(df, fig, grid[, …]) |Automatically plot boxplot distributions for multiple score types of the decoy population. | # |sequence_frequency_plot(df, seqID, ax[, …]) |Makes a heatmap subplot into the provided axis showing the sequence distribution of each residue type for each position. | # |logo_plot(df, seqID[, refseq, key_residues, …])|Generates full figure classic LOGO plots. | # |logo_plot_in_axis(df, seqID, ax[, refseq, …]) |Generates classic LOGO plot in a given axis. | # |positional_sequence_similarity_plot(df, ax) |Generates a plot covering the amount of identities and positives matches from a population of designs to a reference sequence according to a substitution matrix.| # |per_residue_matrix_score_plot(df, seqID, ax) |Plot a linear representation of the scoring obtained by applying a substitution matrix. | # |positional_structural_similarity_plot(df, ax) |Generates a bar plot for positional prevalence of secondary structure elements. | # |plot_fragments(small_frags, large_frags, …) |Plot RMSD quality of a pair of FragmentFrame in two provided axis. | # |plot_fragment_profiles(fig, small_frags, …) |Plots a full summary of the a FragmentFrame quality with sequence and expected secondary structure match. | # |plot_alignment(df, seqID, ax[, line_break, …]) |Make an image representing the alignment of sequences with higlights to mutant positions. | # |plot_ramachandran(df, seqID, fig[, grid, …]) |Generates a ramachandran plot in RAMPAGE style. | # |plot_ramachandran_single(df, seqID, ax[, …]) |Plot only one of the 4 ramachandran plots in RAMPAGE format. | # |plot_dssp_vs_psipred(df, seqID, ax) |Generates a horizontal heatmap showing differences in psipred predictions to dssp assignments. | # # **举例A:数据分布图** # 此处举几个简单的使用案例进行说明: 试想下以下需求,在蛋白设计过程中我们相对一些设计地比较好的序列和结构进行筛选,进行这项工作前,必须先知道整体数据的分布形式,使用multiple_distributions API就可以快速地画出所有选定Score的分布图帮助下一步筛选设定筛选条件。 # In[11]: import rstoolbox as rs import pandas as pd import matplotlib.pyplot as plt import seaborn as sns plt.style.use('ggplot') plt.rcParams['savefig.dpi'] = 300 plt.rcParams['figure.dpi'] = 300 # In[12]: # plot的score选定(此处我们关注4个score) # values = ["score", "hbond_sr_bb", "fa_elec", "hbond_bb_sc", "rama_prepro", "lk_ball"] values = list(raw_df.columns)[:12] # 定义图表 fig = plt.figure(figsize=(12, 6)) grid = [2, 6] # 第一个数值代表行数量,第二个数值代表列数量,共计可放入2x3项score。 # 制图 axes = rs.plot.multiple_distributions(df=raw_df, fig=fig, grid=grid, values=values) # 展示 plt.tight_layout() plt.show() # **举例B:不同组设计的指标比较** # 假设目前我们有两种设计方法,我们想比对这两种方法设计得到的序列或其他特征分布的比较。 # In[13]: # 读取两组不同方法设计得到的计算结果: (这里为了方便,假装不一样吧。。) silent_file = './data/design_result.silent' method1_df = parse_rosetta_file(silent_file, rules) method1_df = rs.utils.add_column(method1_df, 'design_method', 'fastdesign') method2_df = parse_rosetta_file(silent_file, rules) method2_df = rs.utils.add_column(method2_df, 'design_method', 'random') # 合并df; new_df = pd.concat([method1_df, method2_df], ignore_index=False) # 开始作图比较: # In[14]: # 选取需要比较的metrics; values = list(raw_df.columns)[:6] # 定义图表 fig = plt.figure(figsize=(12, 6)) grid = [2, 3] # 第一个数值代表行数量,第二个数值代表列数量,共计可放入2x6项score。 # 制图 axes = rs.plot.multiple_distributions(df=new_df, fig=fig, grid=grid, values=values, x='design_method', hue='design_method',) # 展示 plt.tight_layout() plt.show() # 除了从两个silent文件中读取dataframe,比较分布也可以从不同的df_selection子集中进行添加标签与合并数据: # In[15]: # 从选择语句: df_selection4 df_selection4 = rs.utils.add_column(df_selection4, 'rank', 'worst') df_selection5 df_selection5 = rs.utils.add_column(df_selection5, 'rank', 'best') # 合并df; new_df2 = pd.concat([df_selection4, df_selection5], ignore_index=False) # 选取需要比较的metrics; values = list(raw_df.columns)[:6] # 定义图表 fig = plt.figure(figsize=(12, 6)) grid = [2, 3] # 第一个数值代表行数量,第二个数值代表列数量,共计可放入2x3项score。 # 制图 axes = rs.plot.multiple_distributions(df=new_df2, fig=fig, grid=grid, values=values, x='rank', hue='rank',) # 展示 plt.tight_layout() plt.show() # 更多API参考链接: http://jaumebonet.cat/RosettaSilentToolbox/api.html # #### 3.4 从silent中分离pdb文件 # 通过pyrosetta API,可以结合rstoolbox过滤筛选后的DataFrame信息,分离对应我们需要的PDB结构(无论是做后续的分析还是视觉观察、结构作图等)。 # In[20]: from pyrosetta.io import poses_from_silent # 读取silent; poses = poses_from_silent(silent_file) # 按照全面筛选得到data. df_selection5 # 获取description列信息: selected_pdb_list = list(df_selection5['description']) print(selected_pdb_list) # In[17]: # 分离结构: import os for pose in poses: # 获取pose中的description信息; description = pose.pdb_info().name() if description in selected_pdb_list: pose.dump_pdb(os.path.join('./data', description)) # #### 3.5 从df中直接输出Fasta文件 # In[18]: from rstoolbox.io import read_fasta, write_fasta df_selection5 # In[19]: # 保存至Fasta文件中: content = write_fasta(df_selection5, "A") with open('./data/design_decoy.fasta', 'w') as f: f.write(content) # In[ ]: