import sys
import os
import io
if 'google.colab' in str(get_ipython()):
    # On Google Colab: clone the momics-demos repository to use the utils
    # module from there.
    # TODO: eventually utils from momics will be used for that
    try:
        clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        print(f"An error occurred while cloning the repository: {e}")
    else:
        # os.system reports a failed command through its return value instead
        # of raising, so the status has to be inspected explicitly.
        if clone_status == 0:
            print("Repository cloned")
        else:
            print(f"An error occurred while cloning the repository: status {clone_status}")
    sys.path.insert(0, '/content/momics-demos')
else:
    # Outside Colab: make the parent of the current working directory
    # importable.
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from utils import init_setup, get_notebook_environment
init_setup()

# Initialize the environment variable
notebook_environment = 'unknown'

# Determine the notebook environment
env = get_notebook_environment()
print(f"Environment: {env}")
# Connection to MGnify API
import os
# NOTE: jsonapi_client is no longer maintained (or at least less actively
# than jsonapi-requests); consider switching the dependency.
from jsonapi_client import Session as APISession
from jsonapi_client import Modifier
import requests
import panel as pn
# Dataframes and display
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Data transformation
from functools import reduce
# Plots
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly.graph_objects as go
%matplotlib inline
import momics.plotting as pl
# Create signature of MAGs for comparison against database
# import sourmash
# import glob
# import time
# from pathlib import PurePath as pp
# from Bio import SeqIO
# Warning verbosity
import warnings
warnings.filterwarnings(action="ignore")
pn.extension()
## Query and show endpoints ##
##############################
# Button the user presses to request the endpoint list.
button0 = pn.widgets.Button(name='Get', button_type='primary')

# Dropdown of endpoint names; it is created without any options.
select_endpoint = pn.widgets.Select(
    name="MGnify endpoints",
    value="",
    options=[],
    description="Select endpoint to query",
)
def update_endpoints(event):
    """Fetch the MGnify API root and offer its endpoints in ``select_endpoint``.

    Args:
        event: Value handed over by the bound button. A falsy value makes
            the call a no-op.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    if not event:
        return
    # A timeout keeps the callback from blocking forever on a stalled
    # connection.
    response = requests.get("https://www.ebi.ac.uk/metagenomics/api/v1/", timeout=30)
    # Fail on an HTTP error status instead of trying to parse an error page.
    response.raise_for_status()
    # The keys of the 'data' mapping are used as the endpoint names.
    select_endpoint.options = list(response.json()['data'])


# Run the callback whenever the button value changes.
pn.bind(update_endpoints, button0, watch=True)
pn.Row(button0, select_endpoint)
## Query genomes ##
###################
pn.extension('tabulator')

# CSS properties handed to the tab container.
styles = {
    "box-shadow": "rgba(50, 50, 93, 0.25) 0px 6px 12px -2px, rgba(0, 0, 0, 0.3) 0px 3px 7px -3px",
    "border-radius": "4px",
    "padding": "10px",
}

# Input widgets for the query and the table that shows its result.
text_input = pn.widgets.TextInput(
    name='Text Input',
    placeholder='Enter a string here...',
)
button1 = pn.widgets.Button(name='Query', button_type='primary')
atable = pn.widgets.Tabulator(sizing_mode="stretch_both", name="Data View")
def query_genomes(event):
    """Query the selected MGnify endpoint by taxon lineage, show and save it.

    The endpoint is read from ``select_endpoint`` and the lineage filter from
    ``text_input``. The flattened records are placed into ``atable`` and
    written to ``data/<endpoint>_<lineage>.parquet``.

    Args:
        event: Value handed over by the bound button. A falsy value makes
            the call a no-op.
    """
    if not event:
        return

    endpoint = select_endpoint.value
    lineage = text_input.value

    with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        search_filter = Modifier(f"taxon_lineage={lineage}")
        # Materialize the records while the session is still open.
        records = [r.json for r in mgnify.iterate(endpoint, filter=search_filter)]
    resources_df = pd.json_normalize(records)
    print('Queried', lineage, "from", endpoint)

    # update table view
    atable.value = resources_df

    # create data folder if it doesn't exist (no shell involved, so this
    # also works where `mkdir -p` is unavailable)
    os.makedirs("data", exist_ok=True)

    # save to parquet
    resources_df.to_parquet(os.path.join("data", f"{endpoint}_{lineage}.parquet"))

    # this is an alert for the dashboard, TODO: needs to be tested
    pn.pane.Alert('## Data saved to data folder ##', alert_type='success', width=500).servable()


# Run the callback whenever the button value changes.
pn.bind(query_genomes, button1, watch=True)

# Tabbed layout: query controls in the first tab, result table in the second.
tabs = pn.Tabs(
    ('select', pn.Row(button1, text_input)),
    ('view', atable),
    styles=styles,
    sizing_mode="stretch_width",
    height=500,
    margin=10,
)
tabs
# Data transformation
from functools import reduce
def mgn_split_taxonomy(df):
    """Split the ``attributes.taxon-lineage`` column into one column per rank.

    Args:
        df: DataFrame with an ``attributes.taxon-lineage`` column holding
            ';'-separated lineage strings.

    Returns:
        A tuple ``(features, all_genomes_tax_df)``: the list of rank column
        names and a copy of ``df`` with those columns appended. Ranks missing
        from a lineage are left empty; parts beyond the last rank are dropped.
    """
    features = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    # Split the 'attributes.taxon-lineage' column and create new columns
    lineage_split = df['attributes.taxon-lineage'].str.split(';', expand=True)
    # The split yields as many columns as the longest lineage has parts.
    # Force exactly one column per rank so that naming the columns cannot
    # fail with a length mismatch on shorter (or longer) lineages.
    lineage_split = lineage_split.reindex(columns=range(len(features)))
    lineage_split.columns = features

    # Concatenate the original DataFrame with the new columns
    all_genomes_tax_df = pd.concat([df, lineage_split], axis=1)
    return features, all_genomes_tax_df
# Split the lineage of the rows currently held by the table into rank columns
features, taxonomy_df = mgn_split_taxonomy(atable.value)

# Count the rows per unique lineage
sankey_df = taxonomy_df.groupby(features).size().reset_index(name='count')
sankey_df.head()

# The Plotly pane needs Panel to be loaded with the 'plotly' extension,
# otherwise the figure is not rendered in the notebook.
pn.extension('plotly')

# panel plot widget with plotly sankey (uses the first six rank columns)
sankey = pl.get_sankey(
    sankey_df,
    cat_cols=features[0:6],
    value_cols='count',
    title='Taxon Lineage',
)

# panel pane for plotly sankey
sankey_pane = pn.pane.Plotly(sankey, config={'displayModeBar': False})
sankey_pane