Companion notebook for https://blog.luizirber.org/2020/07/23/mag-results/
# Run the Snakemake workflow in the current directory, one job at a time (-j1).
# NOTE(review): presumably this produces results.csv and the files under inputs/
# that are read below -- confirm against the Snakefile.
!snakemake -j1
Building DAG of jobs... Nothing to be done. Complete log: /home/luizirber/work/sourmash-bio/2020-07-22-mag-search/.snakemake/log/2020-07-24T144113.032349.snakemake.log
from ipywidgets import interactive
import ipywidgets as widgets
import re
from IPython.display import HTML
import pandas as pd
import numpy as np
# Load the search results. `names` supplies the column labels, so every line of
# the file (including the first one) is read as data rather than as a header.
data = pd.read_csv(
    "results.csv",
    names=["MAG", "metagenome", "containment"],
    quotechar="'",
    sep=",",
)
# These MAGs come from TARA Oceans, and some of them are in Parks 8k, so runs
# belonging to either dataset are removed from the results.
# Only the "Run" column of each run-info table is needed.
tara_metag = pd.read_csv("inputs/tara_runinfo.csv", header=0, usecols=["Run"])
parks_8k = pd.read_csv("inputs/parks_runinfo.csv", header=0, usecols=["Run"])

# Fix names so it's easier to query.
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
# defaults to literal matching and raises ValueError when given a callable
# replacement without it.
# Strip the single quotes wrapping the MAG identifier.
data['MAG'] = data['MAG'].str.replace(
    r"'(?P<id>.*)'", lambda m: m.group("id"), regex=True
)
# Reduce a signature path to the run accession: drop the leading directories
# and everything from ".sig" onwards. The dot is escaped so that it only
# matches a literal "." and not an arbitrary character.
data['metagenome'] = data['metagenome'].str.replace(
    r".*/(?P<id>.*)\.sig.*", lambda m: m.group("id"), regex=True
)

# Actually remove TARA and Parks
to_keep = (
    set(data['metagenome'].values)
    - set(tara_metag["Run"].values)
    - set(parks_8k["Run"].values)
)
filtered = data[data['metagenome'].isin(to_keep)]
# Preview the remaining hits above 50% containment, lowest containment first.
print(filtered[filtered['containment'] > 0.5].sort_values(by="containment"))

# MAGs metadata (header=1: column labels are on the second row of each sheet),
# indexed by genome identifier so a MAG can be looked up with `.loc`.
taxonomy = pd.read_excel("inputs/tully_mag_taxonomy.xlsx", header=1).set_index("Genome ID")
stats = pd.read_excel("inputs/tully_mag_stats.xlsx", header=1).set_index("Genome ID")
print("unique SRA runs: ", len(filtered['metagenome'].unique()))
MAG metagenome containment 89721 TOBG_NP-28 SRR2103020 0.500144 53860 TOBG_NP-42 SRR7168048 0.500169 28907 TOBG_EAC-55 SRR7479580 0.500182 73862 TOBG_SP-43 SRR7986296 0.500218 27250 TOBG_SAT-1356 SRR6713912 0.500264 ... ... ... ... 60804 TOBG_RS-626 ERR4013358 0.988004 66828 TOBG_SP-208 SRR5868539 0.989301 66807 TOBG_SP-4095 SRR5868539 0.990476 6268 TOBG_EAC-96 SRR8159436 0.992379 66794 TOBG_NP-110 SRR5868539 0.993755 [23644 rows x 3 columns] unique SRA runs: 6398
# Number of distinct MAGs that still have at least one hit after filtering.
len(filtered["MAG"].drop_duplicates())
2291
# Number of distinct MAGs with at least one hit above 0.5 containment.
len(filtered.loc[filtered['containment'] > 0.5, "MAG"].unique())
1407
# Number of distinct metagenomes with at least one hit above 0.5 containment.
len(filtered.loc[filtered['containment'] > 0.5, "metagenome"].unique())
2938
# From here on, keep only the hits whose containment exceeds 0.5.
filtered = filtered.loc[filtered['containment'].gt(0.5)]
def update_candidate(candidate):
    """Show the metadata and the matching metagenomes for one MAG.

    Prints the rows of `taxonomy` and `stats` labelled `candidate`, then
    renders an HTML table with the rows of `filtered` for that MAG whose
    containment is above 0.5, highest containment first. Each metagenome is
    shown as a link to its NCBI SRA run page.

    Parameters
    ----------
    candidate : str
        MAG identifier. It is looked up with `.loc` in both `taxonomy` and
        `stats`, so a label missing from either index raises `KeyError`.
    """
    print(taxonomy.loc[candidate])
    print()
    print(stats.loc[candidate])
    print()

    matches = (
        (filtered["MAG"] == candidate) &
        (filtered['containment'] > 0.5)
    )
    # Select and sort the MAG's rows first, so links are built only for the
    # rows that are displayed instead of copying and rewriting the whole table
    # on every call.
    hits = filtered[matches].sort_values(by="containment", ascending=False)
    hits = hits.assign(
        metagenome=hits["metagenome"].apply(
            lambda x: "<a href='https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}'>{}</a>".format(x, x)
        )
    )
    # escape=False keeps the anchor tags as markup instead of literal text.
    display(HTML(hits.to_html(render_links=True, escape=False,)))
# Interactive browser: a dropdown of the MAGs in `filtered`; changing the
# selection re-runs `update_candidate` for the chosen MAG.
mag_options = list(filtered['MAG'].unique())
default_mag = 'TOBG_NP-110'
if default_mag not in mag_options:
    # Dropdown rejects a value that is not among its options, so fall back to
    # the first available MAG (or no selection when there are none).
    default_mag = mag_options[0] if mag_options else None
candidatepicker = interactive(update_candidate, candidate=widgets.Dropdown(
    options=mag_options,
    value=default_mag,
    description='MAG name',
    disabled=False
))
display(candidatepicker)
interactive(children=(Dropdown(description='MAG name', index=889, options=('TOBG_SP-231', 'TOBG_NAT-188', 'TOB…