import pandas as pd, numpy as np from emperor import Emperor, nbinstall from skbio import OrdinationResults from skbio.io.util import open_file from skbio.stats.composition import clr, centralize, closure from scipy.spatial.distance import euclidean from biom import load_table nbinstall() def load_mf(fn, index='#SampleID'): _mf = pd.read_csv(fn, sep='\t', dtype=str, keep_default_na=False, na_values=) _mf.set_index(index, inplace=True) return _mf
In this notebook we are going to showcase how to visualize a biplot using Emperor. To exemplify this, we are going to load data from Reber et al. 2016 (the data was retrieved from study 1634 in Qiita, remember you need to be logged in to access the study). Specifically, here we will reproduce Figure S4.
We start by loading the sample metadata and a BIOM table that has already been rarefied to an even depth of 20,000 sequences per sample (this table was generated using a closed reference protocol).
bt = load_table('ptsd-mice/table.biom') mf = load_mf('ptsd-mice/mapping-file.tsv')
Next we are going to create a table of metadata for the bacteria represented in this table. In this example we are only going to use the taxonomic information, but you could add any additional information that you have access to. Note that we only use the genus level (
'taxonomy_5') as our category to collapse the OTUs.
feature_mf = bt.metadata_to_dataframe('observation') feature_mf = feature_mf.reset_index(drop=True).drop_duplicates(subset=['taxonomy_5']).copy() feature_mf.set_index('taxonomy_5', inplace=True, ) feature_mf.index.name = 'FeatureID'
In the original figure, the authors created the ordination based on a table collapsed at the genus level.
collapse_genus = lambda id_, x: x['taxonomy'] bt = bt.collapse(collapse_genus, norm=False, min_group_size=1, axis='observation')
Lastly, we compute a compositional Principal Components Analysis ordination and select only the 10 most important features (meaning that in the plot we will only see 10 arrows).
table = bt.to_dataframe() mat = clr(centralize(closure(table.T + 1))) u, k, v = np.linalg.svd(mat) N = len(u) DIMENSIONS = 5 _k = k[:DIMENSIONS] # scale U matrix wrt to sqrt of eigenvalues u = u[:,:DIMENSIONS] * np.sqrt(N-1) # scale V matrix wrt to sqrt of eigenvalues v = np.multiply(v[:DIMENSIONS,:],(_k.reshape(DIMENSIONS,1) / np.sqrt(N-1))) axes = ['CPCA %d' % i for i in range(1, DIMENSIONS + 1)] samples = pd.DataFrame(u, index=table.columns, columns=axes) features = pd.DataFrame(v.T, index=table.index, columns=axes) features['importance'] = features.apply(lambda x: euclidean(np.zeros_like(x), x), axis=1) features.sort_values('importance', inplace=True, ascending=False) features.drop(['importance'], inplace=True, axis=1) # only keep the 10 most important features, change this number to see more arrows features = features[:10] res = OrdinationResults( short_method_name='CPCA', long_method_name='Compositional Principal Component Analysis', eigvals=pd.Series(_k, index=axes), samples=samples, features=features, proportion_explained=_k /_k.sum() )
The figure below will display the feature and sample data. You can go to Color, select
taxonomy_1 (this will color the arrows at the phylum level) and then select
collection_day_fixed to color the samples by collection day (we recommend that you use a continuou color mapping, for example Viridis).
Emperor(res, mf, feature_mapping_file=feature_mf, remote=False)
Emperor(res, mf, remote=False)