#!/usr/bin/env python # coding: utf-8 # # Use Case 10: Reactome Pathways # Enrichment analysis helps to focus on a subset of genes that participate in the same pathway from a larger gene set. Visualizing the pathway can aid further examination. In this use case, we demonstrate how to visualize results on Reactome Pathways using python packages. # ## Step 1: Importing packages # Various Python packages are needed to download data from cptac, conduct statistical analysis to find meaningful protein clusters, and overlay pathway diagrams found on Reactome. The cptac.utils package offers a plethora of functions for these tasks. You can find more details by using help(cptac.utils). # In[1]: import cptac import pandas as pd import cptac.utils as utils import matplotlib.pyplot as plt from scipy import stats import statsmodels as statsmodels from IPython.display import Image # ## Step 2: Acquiring Data # # Even after importing the cptac package, we still need to load specific datasets. Here, we load the Renal Cancer proteomics data. # In[2]: ds = cptac.Ccrcc() df = ds.get_proteomics('umich') df.head() # # # Utilizing utils, we can pull data from a range of large databases. In this example, the Hugo Gene Nomenclature database is queried for a list of proteins in the proteosome. # In[3]: hgnc_lists = utils.get_hgnc_protein_lists() proteasome_proteins = sorted(set(hgnc_lists["Proteasome"])) included_cols = df.columns.get_level_values(0).intersection(proteasome_proteins) included_cols # ## Step 3: Identifying Differences between Tumor and Normal Conditions # # We loop through the list of proteins, separating tumor and normal data from each other. A t-test is then performed on both data lists. For each protein with a p-value less than .05, the protein name, p-value, and the difference between tumor mean and normal mean is stored in separate lists. # In[4]: nameList =[] pVal_list = [] deltaList = [] selected_proteins = df[included_cols] for protein in selected_proteins.columns: data = selected_proteins[protein] normal = data[data.index.str.endswith('.N')] tumor = data[~data.index.str.endswith('.N')] pVal = stats.ttest_ind(normal, tumor, equal_var = False, nan_policy='omit').pvalue if pVal < .05: name = protein[0] delta = tumor.mean() - normal.mean() nameList.append(name) pVal_list.append(pVal) deltaList.append(delta) # We apply the Benjamini & Hochberg/Yekutieli correction to the list of p-values calculated in the previous cell. A dataframe is then constructed using the lists of names, p-values, and differences between tumor mean and normal mean. # In[5]: reject_null, adj_p, alpha_sidak, alpha_bonf = statsmodels.stats.multitest.multipletests(pVal_list, alpha=0.05, method='fdr_bh') pVal_list = adj_p zippedList = list(zip(nameList, pVal_list, deltaList)) dfObj = pd.DataFrame(zippedList, columns = ['Name', 'P-Value', 'Delta']) dfObj = dfObj.set_index('Name') # We then simplify the abundance change (Delta column). An increase is assigned 1, a decrease is assigned -1, and no change is set at 0. # In[6]: dfObj.insert(1, "simplified_change", dfObj["Delta"]) dfObj["simplified_change"][dfObj["Delta"] > 0] = 1 dfObj["simplified_change"][dfObj["Delta"] < 0] = -1 dfObj["simplified_change"][dfObj["Delta"] == 0] = 0 # Using the Benjamini & Hochberg/Yekutieli correction function we are able to also evaluate whether or not to reject the null. This is evaluated and added as a column in the dataframe. # In[7]: dfObj = dfObj.assign(adj_p=adj_p, reject_null=reject_null) sig_res = dfObj[dfObj["reject_null"]] sig_genes = dfObj.index.get_level_values(0).tolist() sig_res # ## Step 4: Display data with Reactome # # Within utils from the cptac package we can use a function called "reactome_pathway_overlay" to query reactome and overlay the proteosome data we have found to be significant. A detailed guide on using this function can be read by calling `help(utils.reactome_pathway_overlay)`. # In this example I have chosen to overlay my data with the R-HSA-1236978 pathway ID. Determining desired pathway ID's in relation to your specific data can be done in a variety of methods such as gene enrichment analysis. # Finally, with the quality parameter set at 10 you can zoom in on areas of the pathway to see further detail. # In[8]: expression_vals, image_path = utils.reactome_pathway_overlay( 'R-HSA-1236978', df=dfObj['simplified_change'], analysis_token=None, open_browser=False, export_path="test.png", image_format='png', display_col_idx=None, diagram_colors='Modern', overlay_colors='Standard', quality=10) Image(image_path)