import collections
import os
import json
import logging
import string
from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import networkx as nx
if os.getcwd().endswith('notebook'):
os.chdir('..')
from rna_learn.alphabet import ALPHABET_DNA, CODON_REDUNDANCY
sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')
codon_to_aa_table = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
}
possible_codons = sorted(CODON_REDUNDANCY.keys())
len(possible_codons)
64
codon_frequency_df = pd.read_csv(os.path.join(os.getcwd(), 'data/codon_bias.csv'))
print(len(codon_frequency_df))
codon_frequency_df.head()
3282
assembly_accession | species_taxid | in_test_set | n_total_codons | AAA | AAC | AAG | AAT | ACA | ACC | ... | TCG | TCT | TGA | TGC | TGG | TGT | TTA | TTC | TTG | TTT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GCA_000005825.2 | 79885 | False | 1203202 | 50709 | 16004 | 24775 | 32868 | 27117 | 8825 | ... | 6261 | 17452 | 677 | 2894 | 11946 | 5312 | 42464 | 14432 | 13363 | 38272 |
1 | GCA_000005845.2 | 562 | False | 1344061 | 45220 | 28975 | 13763 | 23608 | 9376 | 31511 | ... | 11968 | 11294 | 1244 | 8655 | 20469 | 6903 | 18597 | 22232 | 18329 | 29952 |
2 | GCA_000006175.2 | 2188 | True | 511762 | 41277 | 9506 | 6208 | 28398 | 10533 | 3688 | ... | 1693 | 7111 | 63 | 1841 | 2994 | 4972 | 27806 | 4525 | 7325 | 14409 |
3 | GCA_000006605.1 | 38289 | False | 737577 | 4666 | 18501 | 23297 | 4442 | 3555 | 23745 | ... | 7210 | 4657 | 337 | 4174 | 10432 | 1154 | 1167 | 18563 | 7493 | 4785 |
4 | GCA_000006685.1 | 83560 | False | 323926 | 13771 | 3527 | 5541 | 8332 | 5378 | 2884 | ... | 1987 | 11321 | 136 | 1924 | 3143 | 3316 | 11026 | 5157 | 6629 | 10685 |
5 rows × 68 columns
len(codon_frequency_df['species_taxid'].unique())
2680
len(codon_frequency_df)
3282
temperature_query = """
select s.assembly_accession, t.species_taxid, t.growth_tmp from assembly_source as s
left join (
select species_taxid, growth_tmp from species_traits
) as t
on s.species_taxid = t.species_taxid
"""
temperatures_df = pd.read_sql(temperature_query, engine)
temperatures_df.head()
assembly_accession | species_taxid | growth_tmp | |
---|---|---|---|
0 | GCA_000005825.2 | 79885 | 31.000 |
1 | GCA_000005845.2 | 562 | 37.875 |
2 | GCA_000006175.2 | 2188 | 36.600 |
3 | GCA_000006605.1 | 38289 | 37.000 |
4 | GCA_000006685.1 | 83560 | 37.000 |
species_temperatures_df = temperatures_df[
['species_taxid', 'growth_tmp']].groupby('species_taxid').first().reset_index()
codon_df = pd.merge(
codon_frequency_df,
temperatures_df,
how='left',
on='assembly_accession'
)
len(codon_df)
3282
codon_df.head()
assembly_accession | species_taxid | in_test_set | n_total_codons | AAA | AAC | AAG | AAT | ACA | ACC | ... | TCT | TGA | TGC | TGG | TGT | TTA | TTC | TTG | TTT | growth_tmp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GCA_000005825.2 | 79885 | False | 1203202 | 50709 | 16004 | 24775 | 32868 | 27117 | 8825 | ... | 17452 | 677 | 2894 | 11946 | 5312 | 42464 | 14432 | 13363 | 38272 | 31.000 |
1 | GCA_000005845.2 | 562 | False | 1344061 | 45220 | 28975 | 13763 | 23608 | 9376 | 31511 | ... | 11294 | 1244 | 8655 | 20469 | 6903 | 18597 | 22232 | 18329 | 29952 | 37.875 |
2 | GCA_000006175.2 | 2188 | True | 511762 | 41277 | 9506 | 6208 | 28398 | 10533 | 3688 | ... | 7111 | 63 | 1841 | 2994 | 4972 | 27806 | 4525 | 7325 | 14409 | 36.600 |
3 | GCA_000006605.1 | 38289 | False | 737577 | 4666 | 18501 | 23297 | 4442 | 3555 | 23745 | ... | 4657 | 337 | 4174 | 10432 | 1154 | 1167 | 18563 | 7493 | 4785 | 37.000 |
4 | GCA_000006685.1 | 83560 | False | 323926 | 13771 | 3527 | 5541 | 8332 | 5378 | 2884 | ... | 11321 | 136 | 1924 | 3143 | 3316 | 11026 | 5157 | 6629 | 10685 | 37.000 |
5 rows × 69 columns
def compute_codon_ratio(codon_df):
processed_codons = set()
for codon in sorted(CODON_REDUNDANCY.keys()):
if codon in processed_codons:
continue
elif len(CODON_REDUNDANCY[codon]) <= 1:
processed_codons.add(codon)
continue
synonymous_codons = CODON_REDUNDANCY[codon]
for c, _ in synonymous_codons:
processed_codons.add(c)
codon_df[f'{c}_ratio'] = codon_df[c] / sum((codon_df[cod] for cod, _ in synonymous_codons))
compute_codon_ratio(codon_df)
ratio_columns = [c for c in codon_df.columns if c.endswith('_ratio')]
x_train = codon_df[~codon_df['in_test_set']][ratio_columns].values
y_train = codon_df[~codon_df['in_test_set']]['growth_tmp'].values
x_test = codon_df[codon_df['in_test_set']][ratio_columns].values
y_test = codon_df[codon_df['in_test_set']]['growth_tmp'].values
print(len(x_train), len(x_test))
2222 1060
def fit_linear_regression(y_actual, x):
X = sm.add_constant(x)
model = sm.OLS(y_actual, X)
results = model.fit()
y_fit = results.predict(X)
return results, y_fit
results, y_fit_train = fit_linear_regression(y_train, x_train)
def plot_true_vs_prediction_per_specie(y_true, y_pred, ax=None):
if ax is None:
_, ax = plt.subplots(1, 1, figsize=(8, 8))
palette = sns.color_palette()
x_straight = range(int(np.min(y_true)) - 1, int(np.max(y_true)) + 1)
ax.plot(x_straight, x_straight, '--', color=palette[1], linewidth=2)
ax.scatter(y_true, y_pred, color=palette[0], alpha=0.5, label='Species')
ax.set_xlabel('Temperature °C (truth)')
ax.set_ylabel('Temperature °C (prediction)')
return ax
def compute_and_plot_pattern_against_ogt(y_actual, x, results, ax=None):
palette = sns.color_palette()
X = sm.add_constant(x)
y_fit = results.predict(X)
std, upper, lower = wls_prediction_std(results)
ax = plot_true_vs_prediction_per_specie(y_actual, y_fit, ax=ax)
df_data = [
[y_actual[i], y_fit[i], std[i]]
for i in range(len(y_actual))
]
df = pd.DataFrame(df_data, columns=['growth_tmp_actual', 'growth_tmp_prediction', 'growth_tmp_std'])
pred_mean = df[['growth_tmp_actual', 'growth_tmp_prediction']].groupby('growth_tmp_actual').mean()
std_mean = df[['growth_tmp_actual', 'growth_tmp_std']].groupby('growth_tmp_actual').mean().reset_index()
x_fill = std_mean['growth_tmp_actual'].values
x_base = pred_mean['growth_tmp_prediction'].values
y_fill_1 = x_base - 2 * std_mean['growth_tmp_std'].values
y_fill_2 = x_base + 2 * std_mean['growth_tmp_std'].values
ax.fill_between(x_fill, y_fill_1, y_fill_2, alpha=0.2, color=palette[1], label='Uncertainty');
ax.legend(loc='upper left')
return ax, y_fit
def compute_mae(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred))
ax, y_fit_test = compute_and_plot_pattern_against_ogt(y_test, x_test, results)
ax.set_title((
f'Growth temperature prediction from codon bias\n\n'
f'R-squared = {results.rsquared:.2f} - Mean absolute error = {compute_mae(y_test, y_fit_test):.2f} °C\n'
));
def mae(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred))
mae(y_train, y_fit_train)
6.712866857458104
mae(y_test, y_fit_test)
8.045457980527855
phylogeny_query = """
select species_taxid, species, genus, family, "order", class, phylum, superkingdom
from species_traits
where species_taxid in (
select species_taxid from train_test_split
where in_test_set = 1
)
"""
phylogeny_df = pd.read_sql(phylogeny_query, engine)
phylogeny_df.head()
species_taxid | species | genus | family | order | class | phylum | superkingdom | |
---|---|---|---|---|---|---|---|---|
0 | 7 | Azorhizobium caulinodans | Azorhizobium | Xanthobacteraceae | Rhizobiales | Alphaproteobacteria | Proteobacteria | Bacteria |
1 | 11 | Cellulomonas gilvus | Cellulomonas | Cellulomonadaceae | Micrococcales | Actinobacteria | Actinobacteria | Bacteria |
2 | 14 | Dictyoglomus thermophilum | Dictyoglomus | Dictyoglomaceae | Dictyoglomales | Dictyoglomia | Dictyoglomi | Bacteria |
3 | 19 | Pelobacter carbinolicus | Pelobacter | Desulfuromonadaceae | Desulfuromonadales | Deltaproteobacteria | Proteobacteria | Bacteria |
4 | 24 | Shewanella putrefaciens | Shewanella | Shewanellaceae | Alteromonadales | Gammaproteobacteria | Proteobacteria | Bacteria |
phylogeny_df[['superkingdom', 'species_taxid']].groupby('superkingdom').count()
species_taxid | |
---|---|
superkingdom | |
Archaea | 45 |
Bacteria | 525 |
phylogeny_df[['superkingdom', 'phylum', 'species_taxid']].groupby(['superkingdom', 'phylum']).count()
species_taxid | ||
---|---|---|
superkingdom | phylum | |
Archaea | Crenarchaeota | 13 |
Euryarchaeota | 32 | |
Bacteria | Acidobacteria | 1 |
Actinobacteria | 77 | |
Aquificae | 4 | |
Bacteroidetes | 26 | |
Calditrichaeota | 1 | |
Chlamydiae | 2 | |
Chlorobi | 1 | |
Chloroflexi | 5 | |
Cyanobacteria | 2 | |
Deinococcus-Thermus | 7 | |
Dictyoglomi | 1 | |
Firmicutes | 146 | |
Fusobacteria | 1 | |
Gemmatimonadetes | 1 | |
Nitrospirae | 1 | |
Planctomycetes | 1 | |
Proteobacteria | 215 | |
Spirochaetes | 4 | |
Synergistetes | 2 | |
Tenericutes | 18 | |
Thermotogae | 9 |
phylogeny_df[[
'superkingdom',
'phylum',
'class',
'species_taxid',
]].groupby([
'superkingdom',
'phylum',
'class',
]).count()
species_taxid | |||
---|---|---|---|
superkingdom | phylum | class | |
Archaea | Crenarchaeota | Thermoprotei | 13 |
Euryarchaeota | Archaeoglobi | 2 | |
Halobacteria | 11 | ||
Methanococci | 2 | ||
Methanomicrobia | 5 | ||
Methanopyri | 1 | ||
Thermococci | 10 | ||
Thermoplasmata | 1 | ||
Bacteria | Acidobacteria | Acidobacteriia | 1 |
Actinobacteria | Actinobacteria | 75 | |
Rubrobacteria | 2 | ||
Aquificae | Aquificae | 4 | |
Bacteroidetes | Bacteroidia | 7 | |
Chitinophagia | 2 | ||
Cytophagia | 3 | ||
Flavobacteriia | 12 | ||
Sphingobacteriia | 1 | ||
Calditrichaeota | Calditrichae | 1 | |
Chlamydiae | Chlamydiia | 2 | |
Chlorobi | Chlorobia | 1 | |
Chloroflexi | Anaerolineae | 3 | |
Chloroflexia | 1 | ||
Thermomicrobia | 1 | ||
Deinococcus-Thermus | Deinococci | 7 | |
Dictyoglomi | Dictyoglomia | 1 | |
Firmicutes | Bacilli | 94 | |
Clostridia | 47 | ||
Negativicutes | 3 | ||
Tissierellia | 2 | ||
Fusobacteria | Fusobacteriia | 1 | |
Gemmatimonadetes | Gemmatimonadetes | 1 | |
Nitrospirae | Nitrospira | 1 | |
Planctomycetes | Planctomycetia | 1 | |
Proteobacteria | Acidithiobacillia | 1 | |
Alphaproteobacteria | 46 | ||
Betaproteobacteria | 36 | ||
Deltaproteobacteria | 20 | ||
Epsilonproteobacteria | 12 | ||
Gammaproteobacteria | 98 | ||
Oligoflexia | 2 | ||
Spirochaetes | Spirochaetia | 4 | |
Synergistetes | Synergistia | 2 | |
Tenericutes | Mollicutes | 18 | |
Thermotogae | Thermotogae | 9 |
codon_with_phylogeny_df = pd.merge(
codon_df,
phylogeny_df,
how='left',
on='species_taxid',
)
print(len(codon_with_phylogeny_df))
3282
codon_test = codon_with_phylogeny_df[codon_with_phylogeny_df['in_test_set']].reset_index(drop=True)
def plot_true_vs_prediction_per_segment(df, y_true, y_pred, segmentation_column, n_items_threshold=3):
threshold = 5
possible_values = [
v for v in df[segmentation_column].unique()
if len(df[df[segmentation_column] == v].index) > n_items_threshold
]
n_cols = 2
n_rows = int(np.ceil(len(possible_values) / n_cols))
_, axes = plt.subplots(n_rows, 2, figsize=(8 * n_cols, 8 * n_rows))
axes = axes.flatten()
palette = sns.color_palette()
x_straight = range(int(np.min(y_true)) - 1, int(np.max(y_true)) + 1)
for i in range(len(axes)):
ax = axes[i]
if (i + 1) > len(possible_values):
ax.remove()
continue
possible_value = possible_values[i]
ix = df[df[segmentation_column] == possible_value].index
if len(ix) > n_items_threshold:
ax.plot(x_straight, x_straight, '--', color=palette[1], linewidth=2)
ax.scatter(y_true[ix], y_pred[ix], alpha=0.5, label=possible_value)
ax.set_title(possible_value)
ax.set_xlabel('Temperature °C (truth)')
ax.set_ylabel('Temperature °C (prediction)')
plot_true_vs_prediction_per_segment(
codon_test,
y_test,
y_fit_test,
segmentation_column='superkingdom',
)
plot_true_vs_prediction_per_segment(
codon_test,
y_test,
y_fit_test,
segmentation_column='phylum',
)
results.params
array([ 5.458943 , 3.56613788, 1.89280513, -7.77649539, 13.23543839, -4.48495806, -8.33884914, 4.19664132, 14.08610888, -16.34178529, 19.64366688, -52.4344612 , 9.50061777, -7.53049763, 52.62140248, 5.02500039, 11.96172608, -25.60271295, 16.62765253, -52.09826697, 49.54554392, 10.38126011, -25.16539965, 20.24308254, 5.36971817, 0.08922483, 8.26007106, -2.80112805, -27.13179652, -5.08726006, 19.44018735, 18.23781223, 14.8761475 , -6.28605849, -20.38055256, -17.2483225 , 17.89322537, 16.60450367, -0.35260828, 5.81155129, 3.27831769, 2.18062532, 36.36793316, 3.72197605, -5.86752807, -28.76343814, 8.63712107, -29.17868158, 5.43905798, 20.56144553, -8.70042447, 3.93264387, -11.37909354, 21.60581715, -8.55754133, 4.7555653 , 9.26091904, -3.05072275, 8.50966575, -12.96993379, 18.42887679, 6.84304768, -1.38410467])
feature_importance = np.abs(results.params[1:] / results.bse[1:])
feature_importance
array([ 2.10214631, 1.12424697, 2.6831855 , 4.59094671, 0.88468653, 2.14286898, 1.02893047, 3.40800832, 3.74120518, 5.05321843, 8.21709624, 2.41649738, 2.5678636 , 14.77510659, 0.81312307, 2.47547639, 4.04069409, 3.02230881, 6.70428408, 8.89263781, 4.01927352, 9.34370562, 7.86198486, 2.9963364 , 0.04965069, 3.57916173, 1.21616973, 5.6501089 , 1.25242067, 5.71847886, 5.93631885, 3.15133425, 1.42807991, 2.60706082, 4.58418159, 4.92803329, 4.40031427, 0.22455535, 3.72856248, 1.3218534 , 0.86651521, 6.54727092, 0.78451273, 1.26013672, 6.06032928, 1.96159608, 7.03308547, 1.5169691 , 4.33069267, 2.16770285, 1.03253919, 2.67752474, 6.75281935, 3.43189676, 1.93476195, 6.37077613, 1.31184561, 3.64011414, 6.06731715, 8.7139013 , 3.74224568, 0.76020899])
feature_importance[20:23]
array([4.01927352, 9.34370562, 7.86198486])
ratio_columns[20:23]
['ATT_ratio', 'ATC_ratio', 'ATA_ratio']
cols = [
'AGA_ratio',
'AGG_ratio',
'CGT_ratio',
'CGC_ratio',
'CGA_ratio',
'CGG_ratio',
'ATT_ratio',
'ATC_ratio',
'ATA_ratio',
]
res, f = fit_linear_regression(
y_actual=y_train,
x=codon_df[~codon_df['in_test_set']][cols].values
)
compute_and_plot_pattern_against_ogt(
y_test,
codon_df[codon_df['in_test_set']][cols].values,
res
);
def amino_acid_importance(y, x, results, possible_codons, ratio_columns):
aa_importance = {}
weights_importance = np.abs(results.params[1:] / results.bse[1:])
weights_importance_dict = {
ratio_columns[i].replace('_ratio', ''): weights_importance[i]
for i in range(len(weights_importance))
}
processed_codons = set()
for codon in possible_codons:
if len(CODON_REDUNDANCY[codon]) <= 1:
continue
elif codon in processed_codons:
continue
importance_factors = []
for c, _ in CODON_REDUNDANCY[codon]:
if c in processed_codons:
continue
processed_codons.add(c)
importance_factors.append(
weights_importance_dict[c]
)
aa = codon_to_aa_table[codon]
aa_importance[aa] = np.mean(importance_factors)
return aa_importance
aa_importance = amino_acid_importance(y_test, x_test, results, possible_codons, ratio_columns)
sorted([(k, v) for k, v in aa_importance.items()], key=lambda t: t[1], reverse=True)
[('C', 7.390609225741814), ('I', 7.074987997573898), ('R', 6.128497902148081), ('P', 4.639331818976233), ('S', 4.324754041065607), ('*', 3.9124782796655353), ('G', 3.7105858281695188), ('A', 3.6630624105820284), ('N', 3.637066104298797), ('L', 3.5165006870361193), ('V', 3.1576465337559703), ('Y', 2.4759798750799895), ('H', 2.3976657325528703), ('F', 2.2512273396487776), ('E', 1.976558913808437), ('T', 1.8661235754655932), ('K', 1.6131966375738807), ('Q', 1.5229935480855108), ('D', 1.0941843033942251)]
q = """
select assembly_accession, species_taxid, organism_name
from assembly_source
where organism_name like '%Methanosarcina barkeri%'
"""
pd.read_sql(q, engine)
assembly_accession | species_taxid | organism_name | |
---|---|---|---|
0 | GCA_000970025.1 | 2208 | Methanosarcina barkeri MS |
codon_df[codon_df['assembly_accession'] == 'GCA_000970025.1'][['TAG_ratio', 'growth_tmp']]
TAG_ratio | growth_tmp | |
---|---|---|
1324 | 0.045273 | 37.633 |
codon_df.iloc[0][['TAG_ratio', 'growth_tmp']]
TAG_ratio 0.181084 growth_tmp 31 Name: 0, dtype: object
codon_df.sort_values(['TAG_ratio']).head(30)[[
'assembly_accession',
'species_taxid',
'growth_tmp',
'TAG_ratio',
'TAA_ratio',
'TGA_ratio',
]]
assembly_accession | species_taxid | growth_tmp | TAG_ratio | TAA_ratio | TGA_ratio | |
---|---|---|---|---|---|---|
1795 | GCA_001889405.1 | 2177 | 30.000 | 0.010585 | 0.489415 | 0.500000 |
705 | GCA_000196655.1 | 2322 | 50.000 | 0.010686 | 0.677649 | 0.311665 |
1329 | GCA_000970325.1 | 2226 | 31.400 | 0.011374 | 0.433576 | 0.555050 |
438 | GCA_000025865.1 | 2176 | 37.000 | 0.011575 | 0.478108 | 0.510317 |
170 | GCA_000013725.1 | 29291 | 22.167 | 0.020696 | 0.451783 | 0.527521 |
298 | GCA_000019905.1 | 332410 | 29.000 | 0.033167 | 0.584743 | 0.382090 |
1327 | GCA_000970265.1 | 170861 | 24.100 | 0.035148 | 0.502333 | 0.462519 |
1323 | GCA_000969925.1 | 2210 | 51.000 | 0.037416 | 0.545490 | 0.417093 |
357 | GCA_000023205.1 | 1270 | 29.000 | 0.037567 | 0.019678 | 0.942755 |
1321 | GCA_000969885.1 | 2210 | 51.000 | 0.037588 | 0.553641 | 0.408771 |
1019 | GCA_000367205.1 | 33010 | 37.000 | 0.037913 | 0.039251 | 0.922837 |
1466 | GCA_001304615.2 | 1715806 | 45.000 | 0.038619 | 0.544374 | 0.417007 |
1916 | GCA_002158865.1 | 1082851 | 28.000 | 0.038783 | 0.058174 | 0.903043 |
1444 | GCA_001275345.1 | 71999 | 29.000 | 0.039391 | 0.016115 | 0.944494 |
932 | GCA_000299435.1 | 132920 | 25.000 | 0.039683 | 0.559163 | 0.401154 |
1326 | GCA_000970205.1 | 2209 | 36.500 | 0.039820 | 0.526647 | 0.433533 |
1328 | GCA_000970285.1 | 418008 | 37.000 | 0.039863 | 0.527757 | 0.432380 |
2556 | GCA_004363315.1 | 360241 | 45.000 | 0.040320 | 0.101560 | 0.858121 |
2755 | GCA_008329925.1 | 34103 | 26.000 | 0.042056 | 0.034787 | 0.923157 |
3167 | GCA_900476125.1 | 45363 | 37.000 | 0.042748 | 0.238931 | 0.718321 |
363 | GCA_000023405.1 | 43669 | 29.000 | 0.044980 | 0.008801 | 0.946219 |
2284 | GCA_003355475.1 | 556288 | 28.000 | 0.045198 | 0.009102 | 0.945700 |
1324 | GCA_000970025.1 | 2208 | 37.633 | 0.045273 | 0.580011 | 0.374715 |
3016 | GCA_900129205.1 | 213588 | 50.550 | 0.045616 | 0.151904 | 0.802480 |
1325 | GCA_000970085.1 | 38027 | 36.050 | 0.046418 | 0.495964 | 0.457619 |
295 | GCA_000019785.1 | 34029 | 23.000 | 0.046757 | 0.049049 | 0.904194 |
651 | GCA_000186385.1 | 309887 | 28.000 | 0.046875 | 0.241115 | 0.712010 |
3109 | GCA_900186975.1 | 33011 | 37.000 | 0.047645 | 0.023823 | 0.928532 |
1154 | GCA_000725405.1 | 1274 | 31.500 | 0.048267 | 0.024572 | 0.927161 |
2738 | GCA_008124525.1 | 1189325 | 50.000 | 0.048680 | 0.035310 | 0.916010 |
q = """
select assembly_accession, species_taxid, organism_name
from assembly_source
where species_taxid = 2177
"""
pd.read_sql(q, engine)
assembly_accession | species_taxid | organism_name | |
---|---|---|---|
0 | GCA_001889405.1 | 2177 | Methanohalophilus halophilus |
codon_df[codon_df['TGA'] > 0].sort_values(['TGA_ratio'])[[
'assembly_accession',
'species_taxid',
'growth_tmp',
'TAG_ratio',
'TAA_ratio',
'TGA_ratio',
]]
assembly_accession | species_taxid | growth_tmp | TAG_ratio | TAA_ratio | TGA_ratio | |
---|---|---|---|---|---|---|
2868 | GCA_009730295.1 | 84135 | 37.000 | 0.211838 | 0.769470 | 0.018692 |
2744 | GCA_008151785.1 | 89152 | 37.000 | 0.237606 | 0.741402 | 0.020992 |
2225 | GCA_003181555.1 | 1502 | 40.333 | 0.217361 | 0.753705 | 0.028934 |
2533 | GCA_004329605.1 | 1502 | 40.333 | 0.217566 | 0.751458 | 0.030977 |
2239 | GCA_003181895.1 | 1502 | 40.333 | 0.216109 | 0.752607 | 0.031284 |
... | ... | ... | ... | ... | ... | ... |
732 | GCA_000212695.1 | 1708 | 31.000 | 0.057682 | 0.007443 | 0.934875 |
357 | GCA_000023205.1 | 1270 | 29.000 | 0.037567 | 0.019678 | 0.942755 |
1444 | GCA_001275345.1 | 71999 | 29.000 | 0.039391 | 0.016115 | 0.944494 |
2284 | GCA_003355475.1 | 556288 | 28.000 | 0.045198 | 0.009102 | 0.945700 |
363 | GCA_000023405.1 | 43669 | 29.000 | 0.044980 | 0.008801 | 0.946219 |
3186 rows × 6 columns
test_set_df = codon_with_phylogeny_df[codon_with_phylogeny_df['in_test_set']].reset_index(drop=True)
from sklearn.manifold import TSNE
def plot_embedding(x_embedded, colors, cmap='YlOrRd', cbar=True):
_, ax = plt.subplots(1, 1, figsize=(12, 6))
x = x_embedded[:,0]
y = x_embedded[:,1]
sc = ax.scatter(x, y, c=colors, cmap=cmap, alpha=0.5)
if cbar:
plt.colorbar(sc)
return ax
x_test_embedded = TSNE(n_components=2, perplexity=40).fit_transform(x_test)
x_test_embedded.shape
(1060, 2)
ax = plot_embedding(x_test_embedded, y_test);
xlim = ax.get_xlim()
ylim = ax.get_ylim()
def plot_embedding_per_column(x_embedded, input_df, column, temperature_range=None, min_elements=1):
_, ax = plt.subplots(1, 1, figsize=(12, 6))
df = input_df
if temperature_range is not None:
df = input_df[
input_df['growth_tmp'].between(*temperature_range)
]
df = df.drop_duplicates(['species_taxid'])
categories = df[df[column].notnull()][column].unique()
for cat in sorted(categories):
ix = df[df[column] == cat].index
if len(ix) < min_elements:
continue
x = x_embedded[ix, 0]
y = x_embedded[ix, 1]
ax.scatter(x, y, alpha=.8, label=cat)
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
return ax
ax = plot_embedding_per_column(
x_test_embedded,
test_set_df,
column='phylum',
min_elements=3,
)
ax.set_xlim(xlim);
ax.set_ylim(ylim);
ax = plot_embedding_per_column(
x_test_embedded,
test_set_df,
column='genus',
temperature_range=(60, 120),
min_elements=2,
)
ax.set_xlim(xlim);
ax.set_ylim(ylim);
def make_heatmap_data_raw(df, ratio_columns, grouping_column, temperature_range=None):
if temperature_range is None:
agg_df = df[
[grouping_column] + ratio_columns
].groupby(grouping_column).mean().reset_index()
else:
agg_df = df[df['growth_tmp'].between(*temperature_range)][
[grouping_column] + ratio_columns
].groupby(grouping_column).mean().reset_index()
data = []
for tpl in agg_df.itertuples():
grouping_column_val = getattr(tpl, grouping_column)
for col in ratio_columns:
codon = col.replace('_ratio', '')
row = [
grouping_column_val,
codon_to_aa_table[codon],
codon,
getattr(tpl, col),
]
data.append(row)
output_columns = [
grouping_column,
'aa',
'codon',
'codon_ratio',
]
return pd.DataFrame(data, columns=output_columns)
def make_heatmap(
df,
ratio_columns,
grouping_column,
temperature_range,
amino_acids=None,
ax=None,
figsize=None,
sort_master=None,
):
heatmap_df_raw = make_heatmap_data_raw(df, ratio_columns, grouping_column, temperature_range)
if amino_acids is not None:
heatmap_df_raw = heatmap_df_raw[heatmap_df_raw['aa'].isin(amino_acids)].reset_index(drop=True)
heatmap_df = heatmap_df_raw.pivot(['aa', 'codon'], grouping_column, 'codon_ratio')
if sort_master is not None:
columns = sorted(
heatmap_df.columns.tolist(),
key=lambda s: sort_master.index(s),
)
heatmap_df.columns = columns
if ax is None:
_, ax = plt.subplots(1, 1, figsize=figsize or (25, 20))
sns.heatmap(heatmap_df, ax=ax, cmap='Greys')
return ax, heatmap_df
sorted([(k, v) for k, v in aa_importance.items()], key=lambda t: t[1], reverse=True)
[('C', 7.390609225741814), ('I', 7.074987997573898), ('R', 6.128497902148081), ('P', 4.639331818976233), ('S', 4.324754041065607), ('*', 3.9124782796655353), ('G', 3.7105858281695188), ('A', 3.6630624105820284), ('N', 3.637066104298797), ('L', 3.5165006870361193), ('V', 3.1576465337559703), ('Y', 2.4759798750799895), ('H', 2.3976657325528703), ('F', 2.2512273396487776), ('E', 1.976558913808437), ('T', 1.8661235754655932), ('K', 1.6131966375738807), ('Q', 1.5229935480855108), ('D', 1.0941843033942251)]
def vertical_lines_sep(ax, heatmap_df, dict_map, color, alpha=0.5):
prev = None
for i, species in enumerate(heatmap_df.columns):
v = dict_map[species]
if prev is not None and prev != v:
ax.axvline(i, color=color, alpha=alpha);
prev = v
else:
prev = v
bacteria_species = set(test_set_df[test_set_df['superkingdom'] == 'Bacteria']['species'].unique())
species_to_phylum = {
species: phylum
for phylum, species in test_set_df[['phylum', 'species']].values
}
species_sort = [
t[-1]
for t in test_set_df[
['superkingdom', 'phylum', 'genus', 'species', 'AAA', 'CCC', 'GGG']
].groupby(['superkingdom', 'phylum','genus', 'species']).mean().index.tolist()
]
ax, heatmap_df = make_heatmap(
test_set_df,
ratio_columns,
grouping_column='species',
temperature_range=(60, 120),
amino_acids=['C', 'I', 'R', 'P', 'S', '*'],
sort_master=species_sort,
figsize=(25, 8),
)
prev_aa = None
for i, (aa, codon) in enumerate(heatmap_df.index.tolist()):
if prev_aa is not None and aa != prev_aa:
ax.axhline(i, color='black', alpha=0.4);
prev_aa = aa
else:
prev_aa = aa
vertical_lines_sep(ax, heatmap_df, species_to_phylum, color='black', alpha=0.4);
for i, species in enumerate(heatmap_df.columns):
if species in bacteria_species:
ax.axvline(i, color='black', alpha=0.4, linewidth=3);
break
ax.set_title('Codon bias distribution across species\n');
def kl_divergence(p, q, axis=0):
return entropy(p, q, axis=axis)
def jensen_shannon_divergence(p, q, axis=0):
m = (1/2) * (p + q)
return (
(1/2) * kl_divergence(p, m, axis) +
(1/2) * kl_divergence(q, m, axis)
)
def distance_between_species(df, species_1, species_2, weights_dict=None):
agg_df = df[df['species'].isin([
species_1,
species_2,
])][['species_taxid', 'species'] + ratio_columns].groupby(['species_taxid', 'species']).mean()
measures = []
weights = []
seen_codons = set()
for c in possible_codons:
if c in seen_codons:
continue
redundant_codons = [cod for cod, _ in CODON_REDUNDANCY[c]]
if len(redundant_codons) < 2:
seen_codons.add(c)
continue
aa = codon_to_aa_table[c]
p = np.zeros((len(redundant_codons),))
q = np.zeros((len(redundant_codons),))
for i, codon in enumerate(redundant_codons):
seen_codons.add(codon)
pi, qi = agg_df[f'{codon}_ratio'].values
p[i] = pi
q[i] = qi
if weights_dict is not None:
weights.append(weights_dict[aa])
measures.append(
jensen_shannon_divergence(p, q)
)
if weights_dict is None:
return np.mean(measures)
else:
weights = np.array(weights) / np.sum(weights)
return np.average(measures, weights=weights)
distance_between_species(test_set_df, 'Thermococcus radiotolerans', 'Thermotoga maritima')
0.04378984069228989
distance_between_species(test_set_df, 'Thermosipho melanesiensis', 'Thermotoga maritima')
0.07751900953905895
distance_between_species(test_set_df, 'Azorhizobium caulinodans', 'Thermococcus radiotolerans')
0.054759904766251886
distance_between_species(test_set_df, 'Thermococcus radiotolerans', 'Desulfurococcus mucosus')
0.02250581778325244
test_set_df[test_set_df['species'].isin([
'Thermococcus radiotolerans', 'Thermotoga maritima', 'Desulfurococcus mucosus', 'Thermosipho melanesiensis'
])][['species_taxid', 'species', 'growth_tmp']].drop_duplicates()
species_taxid | species | growth_tmp | |
---|---|---|---|
16 | 2336 | Thermotoga maritima | 80.00 |
56 | 46541 | Thermosipho melanesiensis | 69.95 |
147 | 2275 | Desulfurococcus mucosus | 87.00 |
538 | 187880 | Thermococcus radiotolerans | 88.00 |
species_codon_ratios_path = os.path.join(os.getcwd(), 'data/species_codon_ratios.csv')
q = 'select species_taxid, species, genus, family, "order", class, phylum, superkingdom from species_traits'
species_codon_df = pd.merge(
pd.read_csv(species_codon_ratios_path),
species_temperatures_df,
how='inner',
on='species_taxid',
)
species_codon_df = pd.merge(
species_codon_df,
pd.read_sql(q, engine),
how='inner',
on='species_taxid',
)
species_codon_df.head()
species_taxid | in_test_set | AAA_ratio | AAG_ratio | AAT_ratio | AAC_ratio | ACT_ratio | ACC_ratio | ACA_ratio | ACG_ratio | ... | TTC_ratio | TTT_ratio | growth_tmp | species | genus | family | order | class | phylum | superkingdom | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7 | True | 0.082550 | 0.917450 | 0.327724 | 0.672276 | 0.024511 | 0.575353 | 0.036262 | 0.363875 | ... | 0.897180 | 0.102820 | 30.00 | Azorhizobium caulinodans | Azorhizobium | Xanthobacteraceae | Rhizobiales | Alphaproteobacteria | Proteobacteria | Bacteria |
1 | 9 | False | 0.919627 | 0.080373 | 0.858168 | 0.141832 | 0.454569 | 0.050026 | 0.446018 | 0.049387 | ... | 0.080224 | 0.919776 | 24.00 | Buchnera aphidicola | Buchnera | Erwiniaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria |
2 | 11 | True | 0.007302 | 0.992698 | 0.010730 | 0.989270 | 0.008586 | 0.425527 | 0.015789 | 0.550098 | ... | 0.994594 | 0.005406 | 26.00 | Cellulomonas gilvus | Cellulomonas | Cellulomonadaceae | Micrococcales | Actinobacteria | Actinobacteria | Bacteria |
3 | 14 | True | 0.664866 | 0.335134 | 0.775571 | 0.224429 | 0.458203 | 0.188052 | 0.304183 | 0.049563 | ... | 0.189441 | 0.810559 | 74.15 | Dictyoglomus thermophilum | Dictyoglomus | Dictyoglomaceae | Dictyoglomales | Dictyoglomia | Dictyoglomi | Bacteria |
4 | 19 | True | 0.551810 | 0.448190 | 0.440559 | 0.559441 | 0.099134 | 0.591478 | 0.097796 | 0.211592 | ... | 0.502549 | 0.497451 | 30.00 | Pelobacter carbinolicus | Pelobacter | Desulfuromonadaceae | Desulfuromonadales | Deltaproteobacteria | Proteobacteria | Bacteria |
5 rows × 72 columns
distance_matrix_path = os.path.join(os.getcwd(), 'data/distance_matrix.npy')
distance_matrix = np.load(distance_matrix_path, allow_pickle=True)
distance_matrix.shape
(2680, 2680)
ix = species_codon_df[species_codon_df['species_taxid'].isin((2336, 187880))].index
ix[0], ix[-1], distance_matrix[ix[0], ix[-1]], distance_matrix[ix[-1], ix[0]]
(682, 1769, 0.1956751534857982, 0.1956751534857982)
def compute_appropriate_threshold_distance(species_codon_df, distance_matrix):
output_columns = [
'species_taxid',
'n_neighbours',
'distance_mean',
'distance_std',
'distance_min',
'distance_max',
]
data = []
for ix in range(len(species_codon_df)):
species = species_codon_df.loc[ix]
d = [
v for i, v in enumerate(distance_matrix[ix])
if i != ix
]
data.append([
species['species_taxid'],
len(d),
np.mean(d) if len(d) > 0 else 0.,
np.std(d) if len(d) > 0 else 0.,
np.min(d) if len(d) > 0 else 0.,
np.max(d) if len(d) > 0 else 0.,
])
distance_df = pd.DataFrame(data, columns=output_columns)
return (distance_df['distance_mean'] - 1.1 * distance_df['distance_std']).mean()
threshold_distance = compute_appropriate_threshold_distance(species_codon_df, distance_matrix)
threshold_distance
0.1230565327515076
def get_neighbours(species_codon_df, distance_matrix, species_taxid, max_distance):
index = species_codon_df[species_codon_df['species_taxid'] == species_taxid].index
if len(index) == 0:
raise ValueError(f'No species foudn with id {species_taxid}')
ix = index[0]
#distances = np.delete(distance_matrix[ix], ix)
#mean = distances.mean()
#std = distances.std()
#threshold = mean - n_stds * std
indices = [
i for i, v in enumerate(distance_matrix[ix])
if i != ix and v <= max_distance
]
d = [
v for i, v in enumerate(distance_matrix[ix])
if i != ix and v <= max_distance
]
df = species_codon_df.loc[indices]
df['distance'] = d
return df.sort_values('distance')
get_neighbours(species_codon_df, distance_matrix, species_taxid=2336, max_distance=threshold_distance)
species_taxid | in_test_set | AAA_ratio | AAG_ratio | AAT_ratio | AAC_ratio | ACT_ratio | ACC_ratio | ACA_ratio | ACG_ratio | ... | TTT_ratio | growth_tmp | species | genus | family | order | class | phylum | superkingdom | distance | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1464 | 93930 | False | 0.557737 | 0.442263 | 0.299199 | 0.700801 | 0.165036 | 0.273389 | 0.284350 | 0.277225 | ... | 0.368468 | 77.15 | Thermotoga naphthophila | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.000010 |
1463 | 93929 | False | 0.560668 | 0.439332 | 0.307696 | 0.692304 | 0.165958 | 0.274580 | 0.283657 | 0.275805 | ... | 0.368234 | 77.25 | Thermotoga petrophila | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.000015 |
683 | 2337 | False | 0.530628 | 0.469372 | 0.247403 | 0.752597 | 0.124817 | 0.303189 | 0.283571 | 0.288422 | ... | 0.387126 | 79.00 | Thermotoga neapolitana | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.001007 |
3 rows × 73 columns
len(get_neighbours(species_codon_df, distance_matrix, species_taxid=58290, max_distance=threshold_distance))
24
def get_neighbours_stats(species_codon_df, distance_matrix, max_distance):
output_columns = [
'species_taxid',
'growth_tmp',
'species',
'genus',
'family',
'order',
'class',
'phylum',
'superkingdom',
'n_neighbours',
'distance_mean',
'distance_std',
'distance_min',
'distance_max',
]
data = []
for ix in range(len(species_codon_df)):
species = species_codon_df.loc[ix]
d = [
v for i, v in enumerate(distance_matrix[ix])
if i != ix and v <= max_distance
]
data.append([
species['species_taxid'],
species['growth_tmp'],
species['species'],
species['genus'],
species['family'],
species['order'],
species['class'],
species['phylum'],
species['superkingdom'],
len(d),
np.mean(d) if len(d) > 0 else 0.,
np.std(d) if len(d) > 0 else 0.,
np.min(d) if len(d) > 0 else 0.,
np.max(d) if len(d) > 0 else 0.,
])
return pd.DataFrame(data, columns=output_columns)
stats_df = get_neighbours_stats(species_codon_df, distance_matrix, max_distance=threshold_distance)
stats_df.head()
species_taxid | growth_tmp | species | genus | family | order | class | phylum | superkingdom | n_neighbours | distance_mean | distance_std | distance_min | distance_max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7 | 30.00 | Azorhizobium caulinodans | Azorhizobium | Xanthobacteraceae | Rhizobiales | Alphaproteobacteria | Proteobacteria | Bacteria | 142 | 0.009575 | 0.002239 | 0.001159 | 0.012413 |
1 | 9 | 24.00 | Buchnera aphidicola | Buchnera | Erwiniaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | 53 | 0.010796 | 0.001087 | 0.008200 | 0.012408 |
2 | 11 | 26.00 | Cellulomonas gilvus | Cellulomonas | Cellulomonadaceae | Micrococcales | Actinobacteria | Actinobacteria | Bacteria | 121 | 0.008754 | 0.002545 | 0.000537 | 0.012381 |
3 | 14 | 74.15 | Dictyoglomus thermophilum | Dictyoglomus | Dictyoglomaceae | Dictyoglomales | Dictyoglomia | Dictyoglomi | Bacteria | 104 | 0.008991 | 0.002246 | 0.000310 | 0.012370 |
4 | 19 | 30.00 | Pelobacter carbinolicus | Pelobacter | Desulfuromonadaceae | Desulfuromonadales | Deltaproteobacteria | Proteobacteria | Bacteria | 264 | 0.009170 | 0.002240 | 0.001609 | 0.012411 |
(
stats_df['n_neighbours'].mean(),
stats_df['n_neighbours'].std(),
stats_df['n_neighbours'].min(),
stats_df['n_neighbours'].max(),
)
(159.64328358208957, 104.57317749286152, 0, 428)
stats_df[stats_df['species_taxid'] == 2336]
species_taxid | growth_tmp | species | genus | family | order | class | phylum | superkingdom | n_neighbours | distance_mean | distance_std | distance_min | distance_max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
682 | 2336 | 80.0 | Thermotoga maritima | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 8 | 0.006247 | 0.004722 | 0.00001 | 0.011454 |
len(stats_df[stats_df['n_neighbours'] == 0])
13
neighbour_df = get_neighbours(species_codon_df, distance_matrix, species_taxid=2336, max_distance=threshold_distance)
d = neighbour_df['distance']
d = 1 / (d / np.sum(d))
d = d / np.sum(d)
neighbour_df['p'] = d
neighbour_df['growth_tmp'].mean(), np.average(neighbour_df['growth_tmp'].values, weights=neighbour_df['p'].values)
(67.85, 77.15308270955374)
neighbour_df
species_taxid | in_test_set | AAA_ratio | AAG_ratio | AAT_ratio | AAC_ratio | ACT_ratio | ACC_ratio | ACA_ratio | ACG_ratio | ... | growth_tmp | species | genus | family | order | class | phylum | superkingdom | distance | p | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1464 | 93930 | False | 0.557737 | 0.442263 | 0.299199 | 0.700801 | 0.165036 | 0.273389 | 0.284350 | 0.277225 | ... | 77.15 | Thermotoga naphthophila | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.000010 | 0.585270 |
1463 | 93929 | False | 0.560668 | 0.439332 | 0.307696 | 0.692304 | 0.165958 | 0.274580 | 0.283657 | 0.275805 | ... | 77.25 | Thermotoga petrophila | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.000015 | 0.405651 |
683 | 2337 | False | 0.530628 | 0.469372 | 0.247403 | 0.752597 | 0.124817 | 0.303189 | 0.283571 | 0.288422 | ... | 79.00 | Thermotoga neapolitana | Thermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.001007 | 0.005949 |
1190 | 57487 | False | 0.433576 | 0.566424 | 0.215778 | 0.784222 | 0.148958 | 0.266927 | 0.191408 | 0.392707 | ... | 69.95 | Pseudothermotoga hypogea | Pseudothermotoga | Thermotogaceae | Thermotogales | Thermotogae | Thermotogae | Bacteria | 0.008109 | 0.000739 |
2558 | 1236046 | False | 0.474313 | 0.525687 | 0.349803 | 0.650197 | 0.209594 | 0.330351 | 0.193856 | 0.266199 | ... | 45.00 | Mesotoga infera | Mesotoga | Kosmotogaceae | Kosmotogales | Thermotogae | Thermotogae | Bacteria | 0.008150 | 0.000735 |
2334 | 565033 | True | 0.475462 | 0.524538 | 0.404465 | 0.595535 | 0.179864 | 0.270949 | 0.304756 | 0.244431 | ... | 80.00 | Geoglobus acetivorans | Geoglobus | Archaeoglobaceae | Archaeoglobales | Archaeoglobi | Euryarchaeota | Archaea | 0.009987 | 0.000600 |
2529 | 1184387 | False | 0.412678 | 0.587322 | 0.476407 | 0.523593 | 0.283333 | 0.229309 | 0.264346 | 0.223012 | ... | 37.00 | Mesotoga prima | Mesotoga | Kosmotogaceae | Kosmotogales | Thermotogae | Thermotogae | Bacteria | 0.011247 | 0.000533 |
1201 | 58290 | False | 0.389784 | 0.610216 | 0.327569 | 0.672431 | 0.206981 | 0.202531 | 0.281241 | 0.309247 | ... | 77.45 | Archaeoglobus veneficus | Archaeoglobus | Archaeoglobaceae | Archaeoglobales | Archaeoglobi | Euryarchaeota | Archaea | 0.011454 | 0.000523 |
8 rows × 74 columns
ax, heatmap_df = make_heatmap(
species_codon_df[
species_codon_df['species_taxid'].isin(
neighbour_df['species_taxid'].values.tolist() + [2336]
)
],
ratio_columns,
grouping_column='species',
temperature_range=(0, 120),
figsize=(20, 20),
)
prev_aa = None
for i, (aa, codon) in enumerate(heatmap_df.index.tolist()):
if prev_aa is not None and aa != prev_aa:
ax.axhline(i, color='black', alpha=0.6);
prev_aa = aa
else:
prev_aa = aa
species_to_phylum = {
species: phylum
for phylum, species in species_codon_df[['phylum', 'species']].values
}
vertical_lines_sep(ax, heatmap_df, species_to_phylum, color='black', alpha=0.4);
for i, species in enumerate(heatmap_df.columns):
if species in bacteria_species:
ax.axvline(i, color='black', alpha=0.6, linewidth=3);
break
def most_correlated_aa_from_neighbours(species_codon_df, distance_matrix, ratio_columns, species_taxid, max_distance):
index = species_codon_df[species_codon_df['species_taxid'] == species_taxid].index
if len(index) == 0:
raise ValueError(f'No species foudn with id {species_taxid}')
ix = index[0]
other_ix = [
i for i, v in enumerate(distance_matrix[ix])
if i != ix and v <= max_distance
]
all_measures = []
aa_list = []
for other_index in other_ix:
measures = []
seen_codons = set()
for c in possible_codons:
if c in seen_codons:
continue
redundant_codons = [cod for cod, _ in CODON_REDUNDANCY[c]]
if len(redundant_codons) < 2:
seen_codons.add(c)
continue
aa = codon_to_aa_table[c]
aa_list.append(aa)
p = np.zeros((len(redundant_codons),))
q = np.zeros((len(redundant_codons),))
for i, codon in enumerate(redundant_codons):
seen_codons.add(codon)
pi, qi = species_codon_df.loc[[ix, other_index]][f'{codon}_ratio'].values
p[i] = pi
q[i] = qi
measures.append(
jensen_shannon_divergence(p, q)
)
all_measures.append(measures)
avg_mesures = np.mean(all_measures, axis=0)
return sorted(
[
(aa_list[i], avg_mesures[i])
for i in range(len(avg_mesures))
],
key=lambda t: t[1],
)
most_correlated_aa_from_neighbours(
species_codon_df, distance_matrix, ratio_columns, species_taxid=2336, max_distance=threshold_distance)
[('D', 0.0007799467269615005), ('F', 0.0014119098144310918), ('P', 0.002031620519664627), ('H', 0.003232505721602191), ('N', 0.0038635502388189246), ('T', 0.004283755361816068), ('Y', 0.0046610051719975995), ('Q', 0.004865255733207527), ('K', 0.004946778056355858), ('E', 0.004995800545674859), ('A', 0.00595359012879638), ('I', 0.006942230921520069), ('R', 0.008046217619625016), ('S', 0.008577632074862129), ('L', 0.00890728613602228), ('C', 0.009152544414049193), ('V', 0.010110287603388762), ('G', 0.01240817873146247), ('*', 0.01353179671946442)]
def build_graph(species_codon_df, distance_matrix, max_distance):
graph = nx.Graph()
for ix in range(len(species_codon_df)):
if (ix + 1) % 200 == 0:
logger.info(f'{ix + 1} / {len(species_codon_df)}')
species_series = species_codon_df.loc[ix]
species_taxid = species_series['species_taxid']
if species_taxid not in graph.nodes:
add_node(graph, species_series)
nearby_ix = [
i for i, v in enumerate(distance_matrix[ix])
if i != ix and v <= max_distance
]
for other_ix in nearby_ix:
other_species_series = species_codon_df.loc[other_ix]
other_species_taxid = other_species_series['species_taxid']
if graph.has_edge(species_taxid, other_species_taxid):
continue
if other_species_taxid not in graph.nodes:
add_node(graph, other_species_series)
graph.add_edge(species_taxid, other_species_taxid, weight=distance_matrix[ix, other_ix])
return graph
def add_node(graph, species_series):
attributes = {
'species': species_series['species'],
'growth_tmp': species_series['growth_tmp'],
'genus': species_series['genus'],
'family': species_series['family'],
'order': species_series['order'],
'class': species_series['class'],
'phylum': species_series['phylum'],
'superkingdom': species_series['superkingdom'],
}
graph.add_node(species_series['species_taxid'], **attributes)
%%time
graph = build_graph(species_codon_df, distance_matrix, max_distance=threshold_distance)
2020-11-11 21:05:56,054 (INFO) 100 / 2680 2020-11-11 21:05:58,833 (INFO) 200 / 2680 2020-11-11 21:06:00,807 (INFO) 300 / 2680 2020-11-11 21:06:04,204 (INFO) 400 / 2680 2020-11-11 21:06:06,586 (INFO) 500 / 2680 2020-11-11 21:06:09,686 (INFO) 600 / 2680 2020-11-11 21:06:10,670 (INFO) 700 / 2680 2020-11-11 21:06:12,699 (INFO) 800 / 2680 2020-11-11 21:06:15,298 (INFO) 900 / 2680 2020-11-11 21:06:18,191 (INFO) 1000 / 2680 2020-11-11 21:06:20,433 (INFO) 1100 / 2680 2020-11-11 21:06:22,573 (INFO) 1200 / 2680 2020-11-11 21:06:25,129 (INFO) 1300 / 2680 2020-11-11 21:06:27,699 (INFO) 1400 / 2680 2020-11-11 21:06:29,994 (INFO) 1500 / 2680 2020-11-11 21:06:32,567 (INFO) 1600 / 2680 2020-11-11 21:06:34,589 (INFO) 1700 / 2680 2020-11-11 21:06:36,784 (INFO) 1800 / 2680 2020-11-11 21:06:38,961 (INFO) 1900 / 2680 2020-11-11 21:06:41,607 (INFO) 2000 / 2680 2020-11-11 21:06:43,870 (INFO) 2100 / 2680 2020-11-11 21:06:46,292 (INFO) 2200 / 2680 2020-11-11 21:06:48,587 (INFO) 2300 / 2680 2020-11-11 21:06:50,826 (INFO) 2400 / 2680 2020-11-11 21:06:53,132 (INFO) 2500 / 2680 2020-11-11 21:06:55,161 (INFO) 2600 / 2680
CPU times: user 1min 2s, sys: 218 ms, total: 1min 3s Wall time: 1min 3s
nx.number_connected_components(graph)
16
for component in nx.connected_components(graph):
if len(component) == 2:
print(component)
{200, 824} {45363, 53558}
def predict_temperature_from_neighbours_with_different_phylum(species_taxid):
current_node = graph.nodes[species_taxid]
current_phylum = current_node['phylum']
tmps = []
for key in graph.neighbors(species_taxid):
node = graph.nodes[key]
if key != species_taxid and node['phylum'] != current_phylum:
temp = node['growth_tmp']
tmps.append(temp)
if len(tmps) == 0:
prediction = None
else:
prediction = np.mean(tmps)
return current_node['growth_tmp'], prediction
def predict_temperature_from_neighbours(species_taxid):
current_node = graph.nodes[species_taxid]
tmps = []
for key in graph.neighbors(species_taxid):
node = graph.nodes[key]
if key != species_taxid:
temp = node['growth_tmp']
tmps.append(temp)
if len(tmps) == 0:
prediction = None
else:
prediction = np.mean(tmps)
return current_node['growth_tmp'], prediction
def compute_mae_from_neighbour_predictions(species_taxids, control_for_phylum=True):
actuals = []
predictions = []
for i, species_taxid in enumerate(species_taxids):
if control_for_phylum:
actual, prediction = predict_temperature_from_neighbours_with_different_phylum(species_taxid)
else:
actual, prediction = predict_temperature_from_neighbours(species_taxid)
if prediction is None:
continue
actuals.append(actual)
predictions.append(prediction)
return mae(np.mean(actuals), np.mean(predictions))
predict_temperature_from_neighbours_with_different_phylum(2336)
(80.0, 78.725)
predict_temperature_from_neighbours_with_different_phylum(120)
(30.0, 33.42583333333334)
predict_temperature_from_neighbours_with_different_phylum(29405)
(31.0, 31.8156976744186)
compute_mae_from_neighbour_predictions(species_codon_df['species_taxid'].values)
0.22799978735923787
compute_mae_from_neighbour_predictions(species_codon_df['species_taxid'].values, control_for_phylum=False)
1.0273254520904018
100 * (0.23 - 1.03) / 1.03
-77.66990291262135
def distance_to_species_ratios(species_codon_df, ratio_columns, species_taxid, assembly_accession):
query = """
select metadata_json, sequence from sequences
where assembly_accession = ? and sequence_type = 'CDS'
"""
df = pd.read_sql(query, engine, params=(assembly_accession,))
coding_sequences = df['sequence'].values
descriptions = df['metadata_json'].values
p = species_codon_df[
species_codon_df['species_taxid'] == species_taxid
].iloc[0][ratio_columns].values.astype(float)
aa_indices = get_aa_indices()
distances = []
metadata = []
for i, sequence in enumerate(coding_sequences):
q = compute_codon_ratios(sequence, ratio_columns)
dist = compute_distance_to_target(p, q, aa_indices)
distances.append(dist)
metadata.append(descriptions[i])
return np.array(distances), np.array(metadata)
def compute_distance_to_target(x, y, aa_indices):
distances = []
for ix in aa_indices:
p = x[ix]
q = y[ix]
d = jensen_shannon_divergence(p, q)
distances.append(d)
return np.mean(distances)
def compute_codon_ratios(sequence, ratio_columns):
codon_count = collections.defaultdict(int)
assert len(sequence) % 3 == 0
for pos in range(0, len(sequence), 3):
codon = sequence[pos:pos+3]
if codon not in CODON_REDUNDANCY:
# Ignore ambiguous letters (e.g. N, etc)
continue
codon_count[codon] += 1
codon_ratios = np.zeros((62,))
ordered_codons = [c.replace('_ratio', '') for c in ratio_columns]
processed_codons = set()
for codon in sorted(CODON_REDUNDANCY.keys()):
if codon in processed_codons:
continue
elif len(CODON_REDUNDANCY[codon]) <= 1:
processed_codons.add(codon)
continue
synonymous_codons = CODON_REDUNDANCY[codon]
aa_sum = np.sum([codon_count[cod] for cod, _ in synonymous_codons])
for i, (c, _) in enumerate(synonymous_codons):
processed_codons.add(c)
ix = ordered_codons.index(c)
if aa_sum > 0:
codon_ratios[ix] = codon_count[c] / aa_sum
else:
codon_ratios[ix] = 1 / len(synonymous_codons)
return codon_ratios
def get_aa_indices():
processed_codons = set()
indices = []
i = 0
for codon in sorted(CODON_REDUNDANCY.keys()):
if codon in processed_codons:
continue
elif len(CODON_REDUNDANCY[codon]) <= 1:
processed_codons.add(codon)
continue
row = []
synonymous_codons = CODON_REDUNDANCY[codon]
for c, _ in synonymous_codons:
processed_codons.add(c)
row.append(i)
i += 1
indices.append(row)
return indices
pd.read_sql(
'select assembly_accession from assembly_source where species_taxid = 565033',
engine
)
assembly_accession | |
---|---|
0 | GCA_000789255.1 |
species_taxid = 2336
assembly_accession = 'GCA_000008545.1'
distances, metadata = distance_to_species_ratios(species_codon_df, ratio_columns, species_taxid, assembly_accession)
def proteins_following_codon_bias(species_codon_df, ratio_columns, species_taxid, assembly_accession):
distances, metadata = distance_to_species_ratios(
species_codon_df, ratio_columns, species_taxid, assembly_accession)
sort_ix = np.argsort(distances)
thres = np.mean(distances) - 1.1 * np.std(distances)
_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist(distances, bins=50);
ax.axvline(thres, color='red');
return [json.loads(metadata[ix])['protein'] for ix in sort_ix if distances[ix] <= thres]
proteins_following_codon_bias(
species_codon_df,
ratio_columns,
species_taxid=2336,
assembly_accession='GCA_000008545.1',
)
['DNA-directed RNA polymerase, beta subunit', 'comE protein, putative', 'alpha-mannosidase, putative', 'conserved hypothetical protein', 'alpha-mannosidase-related protein', 'hypothetical protein', 'conserved hypothetical protein', 'hypothetical protein', 'sensor histidine kinase', 'conserved hypothetical protein', 'iron(II) transport protein B', 'DNA-directed DNA polymerase I', 'conserved hypothetical protein', 'DNA gyrase, subunit A', 'cation-transporting ATPase, P-type', 'lipopolysaccharide biosynthesis protein', '(R)-2-hydroxyglutaryl-CoA dehydratase activator-related protein', 'oligopeptide ABC transporter, periplasmic oligopeptide-binding protein', 'DNA mismatch repair protein', 'conserved hypothetical protein', 'pyrimidine-nucleoside phosphorylase', 'pleD-related protein', 'anthranilate synthase component II', 'alpha-amylase', 'conserved hypothetical protein', '5-methyltetrahydrofolate S-homocysteine methyltransferase', 'beta-glucosidase', 'beta-D-galactosidase', 'single stranded DNA-specific exonuclease, putative', 'DNA ligase', 'conserved hypothetical protein', 'ribonucleotide reductase, B12-dependent', 'cellobiose-phosphorylase', '(p)ppGpp synthetase', 'conserved hypothetical protein', 'flagellar hook-associated protein 1', 'preprotein translocase SecA subunit', 'isoleucyl-tRNA synthetase', 'alpha-xylosidase', 'UDP-N-acetylglucosamine 1-carboxyvinyltransferase', 'pyruvate kinase', 'conserved hypothetical protein', '6-phosphogluconate dehydrogenase, decarboxylating', 'hypothetical protein', 'conserved hypothetical protein', 'alcohol dehydrogenase, zinc-containing', 'transcription termination factor Rho', 'DNA topoisomerase', "DNA-directed RNA polymerase, beta' subunit", 'DNA helicase, putative', 'oxidoreductase, putative', 'conserved hypothetical protein', 'conserved hypothetical protein, GGDEF domain', 'conserved hypothetical protein', 'ABC transporter, ATP-binding protein', 'electron transport complex protein, putative', 'propionyl-CoA carboxylase, beta subunit', 'ATP-dependent protease LA', 'penicillin-binding protein, class 1A', 'extracellular polysaccharide biosynthesis-related protein', 'lipopolysaccharide biosynthesis protein, putative', 'glycerol kinase', '5-methyltetrahydropteroyltriglutamate--homocysteine methyltransferase', 'oligopeptide ABC transporter, periplasmic oligopeptide-binding protein', 'glycine dehydrogenase (decarboxylating) subunit 1', 'ABC transporter, ATP-binding protein', 'lepA protein', 'beta-galactosidase', 'conserved hypothetical protein', 'beta-galactosidase', 'glucosamine--fructose-6-phosphate aminotransferase, isomerizing', 'conserved hypothetical protein', 'cytoplasmic axial filament protein, putative', 'cyclomaltodextrinase, putative', 'conserved hypothetical protein', 'thiamin biosynthesis protein ThiI', 'conserved hypothetical protein', 'ATP-dependent Clp protease, ATPase subunit', 'fixC protein', '3-phosphoshikimate-1-carboxyvinyltransferase', 'conserved hypothetical protein', 'phenylalanyl-tRNA synthetase, alpha subunit', 'alpha-glucuronidase', 'protein-export membrane protein SecD, putative', 'hypothetical protein', 'ornithine decarboxylase', 'polynucleotide phosphorylase', 'dehydrogenase', 'oligopeptide ABC transporter, ATP-binding protein', 'ATP-dependent DNA helicase', 'leucyl-tRNA synthetase', 'NADH dehydrogenase, putative', 'conserved hypothetical protein', 'regulatory protein, putative', 'conserved hypothetical protein', 'conserved hypothetical protein']
proteins_following_codon_bias(
species_codon_df,
ratio_columns,
species_taxid=565033,
assembly_accession='GCA_000789255.1',
)
['ATP-dependent helicase', 'heterodisulfide reductase, subunit A', 'Phosphoribosylformylglycinamidine synthase, synthetase subunit', 'Long-chain-fatty-acid--CoA ligase', 'chorismate mutase/prephenate dehydratase', 'carboxypeptidase', 'Protein similar to polyadenylation specificity factor', 'Lead, cadmium, zinc and mercury transporting ATPase', 'hypothetical protein', 'Archaeal DNA polymerase I', 'ATP-dependent helicase', 'Long-chain-fatty-acid--CoA ligase', 'CAP family transcription factor', 'DNA recombination protein RmuC', 'CO dehydrogenase/acetyl-CoA synthase corrinoid activation protein', 'ATP-dependent RNA helicase, EIF-4A family', 'hypothetical protein', 'Radical SAM domain protein', 'transglutaminase-like protein', 'Cobyric acid synthase', "DNA-directed RNA polymerase subunit A'", 'Cell division control protein 6 (cdc6)', 'Phosphoribosylamine--glycine ligase', 'hypothetical protein', 'Radical SAM domain protein', 'pyridine nucleotide-disulfide oxidoreductase', 'Long-chain-fatty-acid--CoA ligase', 'indolepyruvate ferredoxin oxidoreductase alpha subunit', 'Helicase domain protein', 'Medium-chain-fatty-acid--CoA ligase', '[NiFe', 'Carbamoyl-phosphate synthase large chain', 'O-phosphoseryl-tRNA(Cys) synthetase', 'Long-chain-fatty-acid--CoA ligase', 'hypothetical protein', 'Thymidylate synthase', '4-hydroxyphenylacetate 3-monooxygenase', 'archaeosine tRNA-ribosyltransferase type 1', 'F420H2-quinone oxidoreductase, subunit M', 'hypothetical protein', 'acetyl-CoA synthetase', 'radical SAM domain protein, menaquinone biosynthesis', 'Acetolactate synthase large subunit', 'methyltransferase', 'CoB--CoM heterodisulfide reductase subunit A', 'O-succinylbenzoic acid--CoA ligase', 'DNA primase, phage related', '4-hydroxyphenylacetate 3-monooxygenase', 'Transcriptional regulatory protein', 'Cyclohex-1-ene-1-carboxyl-CoA hydratase, BadK', 'PilT protein domain protein', 'DNA mismatch repair protein MutS', 'hypothetical protein', 'hypothetical protein', 'S-layer domain protein', 'hypothetical protein', '2-oxoacid ferredoxin oxidoreductase, subunit beta', 'hypothetical protein', 'oligopeptide ABC transporter, permease protein', 'phytoene dehydrogenase', 'Isoleucyl-tRNA synthetase', 'Phosphoenolpyruvate synthase', 'methylviologen-reducing hydrogenase, subunit alpha (MvhA)', 'DNA double-strand break repair Rad50 ATPase', '3-hydroxyacyl-CoA dehydrogenase', 'Transporter', '[NiFe', 'Hydroxymethylglutaryl-CoA reductase', 'Amidophosphoribosyltransferase', 'hypothetical protein', 'Biotin carboxylase of acetyl-CoA carboxylase', 'hypothetical protein', 'Hydrogenase expression/formation protein', "mRNA 3'-end processing factor", 'bipolar DNA helicase', 'Histidyl-tRNA synthetase', 'hypothetical protein', 'F420H2-quinone oxidoreductase, subunit BC (fused)', 'DNA replication helicase protein MCM', 'carbon monoxide dehydrogenase, catalytic subunit (cooS)', 'cytochrome c family protein', 'glucosamine--fructose-6-phosphate aminotransferase,isomerizing', 'Adenylylsulfate reductase alpha-subunit', 'Trk system potassium uptake protein TrkA', 'acetyl-CoA acetyltransferase', 'heterodisulfide reductase iron-sulfur subunit D', 'Phage integrase', 'DNA topoisomerase VI subunit B', 'signal-transducing histidine kinase', 'Acetyl-CoA synthetase (ADP-forming) alpha and beta chains, putative', 'Deblocking aminopeptidase', 'Serine hydroxymethyltransferase', 'Phosphopantothenoylcysteine decarboxylase', 'Translation elongation factor 2', 'S-adenosylhomocysteine deaminase', 'ABC-type antimicrobial peptide transport system, permease component', 'hypothetical protein', 'Glutamyl-tRNA synthetase', 'Flagella-related protein FlaI', 'peptide ABC transporter, permease protein', 'radical SAM domain protein', 'Lysine 2,3-aminomutase', '2-oxoacid ferredoxin oxidoreductase, subunit alpha', 'radical SAM protein', 'Alanyl-tRNA synthetase', 'Radical SAM domain protein', 'Tyrosyl-tRNA synthetase', 'hypothetical protein', 'Na /H antiporter (napA-1)', 'Replication initiation protein', "RNA 3'-terminal phosphate cyclase", 'Septum site-determining protein MinD', 'indolepyruvate ferredoxin oxidoreductase alpha subunit', 'LSU ribosomal protein L3e (L3p)', 'Glutaconate CoA-transferase subunit B', 'cysteate synthase', 'Cell division control protein 6 (cdc6)', 'Porphobilinogen deaminase', 'hypothetical protein', 'UbiD family decarboxylase', 'Butyryl-CoA dehydrogenase', 'cytochrome c family protein', 'hypothetical protein', 'hypothetical protein', 'hypothetical protein', 'hypothetical protein', 'Replication factor C small subunit', 'hypothetical protein', 'Ferredoxin--nitrite reductase', 'archaeosine tRNA-ribosyltransferase type 2', 'Sodium/proline symporter', 'Phenylalanyl-tRNA synthetase beta chain', 'Archaeal DNA polymerase II large subunit', 'tRNA modifying enzyme', 'hypothetical protein', "DNA-directed RNA polymerase subunit B''", 'hypothetical protein', 'DNA gyrase subunit A', 'ATP-grasp fold domain-containing protein', 'Glucose-1-phosphate thymidylyltransferase', 'FAD/FMN-containing dehydrogenase', 'aldehyde:ferredoxin oxidoreductase', 'Adenosylhomocysteinase', 'Molybdopterin binding motif, CinA N-terminal domain', 'hypothetical protein', 'Flagellin FlaB1', 'ATP-dependent DNA ligase', 'tRNA-splicing ligase RtcB', 'Translation initiation factor 2', 'CBS-domain-containing protein', 'Methionyl-tRNA synthetase', 'Replication factor C large subunit', 'ski2-type helicase', 'Branched-chain amino acid transport system permease protein LivM', 'Putative zinc metalloprotease', 'Dihydrolipoamide dehydrogenase']
def protein_description_word_count(assembly_accession, below_threshold=True, count_type='document'):
if count_type not in ('document', 'term'):
raise ValueError('count type should be one of document or term')
cds_codon_bias_folder = os.path.join(os.getcwd(), 'data/cds_codon_bias')
if below_threshold:
cds_codon_bias_folder = os.path.join(cds_codon_bias_folder, 'below_threshold')
else:
cds_codon_bias_folder = os.path.join(cds_codon_bias_folder, 'all')
cds_codon_bias_path = os.path.join(cds_codon_bias_folder, f'{assembly_accession}_codon_bias.csv')
black_list = [
'',
#'dna',
#'rna',
#'t-rna',
#'trna',
'protein',
'enzyme',
'subunit',
'domain',
'domain-containing',
'hypothetical',
'putative',
'probable',
'conserved',
'factor',
'large',
'small',
'family',
'superfamily',
'outer',
'cell',
'i',
'ii',
'iii',
'iv',
'v',
'vi',
'alpha',
'beta',
'gamma',
'type',
'and',
'or',
'of',
'class',
'assembly',
'acid',
'component',
'response',
'2x',
'1-2',
]
black_list += [c for c in string.ascii_lowercase]
black_list += [c for c in string.digits]
black_list = set(black_list)
df = pd.read_csv(cds_codon_bias_path)
descriptions = df['description'].values
counts = collections.defaultdict(int)
for description in descriptions:
desc = str(description).replace(',', '').replace('.', '').replace('(', '').replace(')', '').strip()
parts_1 = desc.split(' ')
parts = []
for p in parts_1:
parts += [v.lower() for v in p.split('/')]
for word in parts:
if word in black_list:
continue
if count_type == 'document':
counts[word] = 1
else:
counts[word] += 1
return dict(counts)
def compute_word_count(assembly_accessions, below_threshold=True, count_type='document', sorted_output=True):
if count_type not in ('document', 'term'):
raise ValueError('count type should be one of document or term')
total_counts = {}
for i, assembly_accession in enumerate(assembly_accessions):
if (i+1) % 200 == 0:
print(f'{i+1} / {len(assembly_accessions)}')
counts = protein_description_word_count(assembly_accession, below_threshold, count_type)
for k, v in counts.items():
total_counts[k] = total_counts.get(k, 0) + counts[k]
if sorted_output:
word_counts = [
(
k,
np.round(100 * v / len(assembly_accessions), 2)
)
for k, v in total_counts.items()
]
return sorted(word_counts, key=lambda t: t[1], reverse=True)
else:
return total_counts
assembly_accessions = codon_frequency_df['assembly_accession'].values
%%time
word_counts = compute_word_count(assembly_accessions)
200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 CPU times: user 22.5 s, sys: 1.67 s, total: 24.1 s Wall time: 24.2 s
len(word_counts)
23178
word_counts[:100]
[('synthase', 95.86), ('helicase', 92.14), ('kinase', 91.8), ('dehydrogenase', 90.07), ('polymerase', 88.79), ('transporter', 88.73), ('abc', 87.14), ('ligase', 86.72), ('atpase', 86.11), ('reductase', 85.89), ('synthetase', 85.5), ('atp-dependent', 85.47), ('oxidoreductase', 80.65), ('regulator', 74.83), ('atp-binding', 74.71), ('histidine', 74.53), ('hydrolase', 72.82), ('permease', 70.29), ('peptidase', 70.26), ('repair', 67.89), ('transcriptional', 66.76), ('system', 66.58), ('protease', 66.21), ('membrane', 64.2), ('oxidase', 64.14), ('biosynthesis', 63.86), ('sensor', 62.74), ('topoisomerase', 61.4), ('transferase', 59.99), ('aminotransferase', 59.78), ('serine', 58.96), ('excinuclease', 58.56), ('penicillin-binding', 57.1), ('mismatch', 54.24), ('carboxylase', 52.77), ('pyruvate', 52.71), ('dehydratase', 52.65), ('transport', 51.83), ('glycosyl', 49.76), ('decarboxylase', 49.39), ('glutamate', 49.12), ('methyltransferase', 49.06), ('bifunctional', 48.45), ('cyclase', 47.96), ('phosphate', 47.65), ('division', 47.62), ('amino', 47.32), ('exonuclease', 47.2), ('aminopeptidase', 46.07), ('ribonuclease', 45.95), ('p-type', 45.58), ('cytochrome', 45.19), ('translocase', 44.42), ('signal', 42.9), ('peptide', 42.72), ('threonine', 41.8), ('secretion', 41.77), ('multidrug', 41.71), ('gyrase', 41.62), ('efflux', 41.56), ('coupling', 41.41), ('chain', 40.92), ('muts', 40.89), ('transcription-repair', 40.65), ('diguanylate', 39.55), ('aldehyde', 38.79), ('phosphatase', 38.7), ('ftsk', 38.3), ('phosphoribosylformylglycinamidine', 38.3), ('glycosyltransferase', 37.9), ('resistance', 37.42), ('phosphorylase', 37.32), ('acyl-coa', 37.23), ('lyase', 37.17), ('pas', 37.02), ('box', 36.47), ('chemotaxis', 36.44), ('phosphoenolpyruvate', 36.23), ('antiporter', 36.05), ('iron', 35.65), ('radical', 35.13), ('receptor', 34.92), ('sugar', 34.92), ('chaperone', 34.8), ('hydratase', 34.7), ('sam', 34.61), ('ferredoxin', 34.03), ('glycine', 33.91), ('glycogen', 33.82), ('isomerase', 33.76), ('acetyl-coa', 33.76), ('endonuclease', 33.76), ('dead', 33.39), ('mutase', 33.21), ('phosphodiesterase', 32.54), ('chromosome', 32.36), ('binding', 31.96), ('smc', 31.81), ('aspartate', 31.54), ('substrate-binding', 30.9)]
_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist([int(np.round(t[1], 0)) for t in word_counts], bins=100, log=True);
%%time
word_counts = compute_word_count(assembly_accessions, below_threshold=False)
200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 CPU times: user 2min 16s, sys: 14.9 s, total: 2min 31s Wall time: 2min 35s
word_counts[:100]
[('synthase', 99.88), ('kinase', 99.88), ('ribosomal', 99.88), ('atp', 99.76), ('methyltransferase', 99.7), ('polymerase', 99.7), ('dehydrogenase', 99.57), ('isomerase', 99.54), ('transporter', 99.51), ('phosphoribosyltransferase', 99.51), ('ribonuclease', 99.45), ('abc', 99.42), ('initiation', 99.42), ('elongation', 99.3), ('signal', 99.24), ('transcription', 99.21), ('deaminase', 99.21), ('translation', 99.18), ('helicase', 99.12), ('regulator', 99.12), ('reductase', 99.02), ('system', 98.96), ('atp-binding', 98.93), ('dna-binding', 98.93), ('hydrolase', 98.75), ('transcriptional', 98.69), ('chain', 98.6), ('atpase', 98.48), ('transferase', 98.42), ('synthetase', 98.32), ('adenylate', 98.29), ('serine', 98.23), ('rrna', 98.2), ('thioredoxin', 98.2), ('adenylyltransferase', 98.11), ('phosphate', 98.05), ('dna-directed', 98.02), ('acetyltransferase', 97.99), ('endonuclease', 97.99), ('biosynthesis', 97.9), ('peptide', 97.9), ('division', 97.87), ('pyruvate', 97.78), ('oxidoreductase', 97.71), ('atp-dependent', 97.65), ('aldolase', 97.65), ('ligase', 97.62), ('translocase', 97.56), ('amidotransferase', 97.5), ('peptidase', 97.26), ('repair', 97.2), ('protease', 97.2), ('aminopeptidase', 97.2), ('phosphatase', 97.17), ('decarboxylase', 97.11), ('mutase', 97.11), ('oxidase', 96.92), ('dehydratase', 96.92), ('release', 96.89), ('phosphorylase', 96.86), ('aminotransferase', 96.86), ('cyclohydrolase', 96.83), ('lyase', 96.8), ('phosphoglycerate', 96.77), ('permease', 96.68), ('carrier', 96.68), ('membrane', 96.65), ('gyrase', 96.56), ('glutamate', 96.53), ('chaperone', 96.5), ('aspartate', 96.47), ('amino', 96.4), ('replication', 96.37), ('methionine', 96.31), ('riboflavin', 96.19), ('histidine', 96.16), ('pseudouridine', 96.13), ('cysteine', 96.1), ('hydroxymethyltransferase', 96.04), ('ribosome', 96.04), ('recognition', 95.95), ('holliday', 95.83), ('formyltransferase', 95.8), ('carboxylase', 95.73), ('pyrophosphatase', 95.67), ('thymidylate', 95.64), ('adenine', 95.61), ('sugar', 95.61), ('nucleoside', 95.58), ('gmp', 95.46), ('carbamoyltransferase', 95.46), ('channel', 95.43), ('delta', 95.37), ('threonine', 95.28), ('epimerase', 95.16), ('glycosylase', 95.09), ('deacetylase', 95.06), ('adenylosuccinate', 95.03), ('junction', 95.0), ('transport', 94.97)]
_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist([int(np.round(t[1], 0)) for t in word_counts], bins=100, log=True);
def compute_word_scores(assembly_accessions):
top_term_freq = compute_word_count(
assembly_accessions,
below_threshold=True,
count_type='term',
sorted_output=False,
)
all_term_freq = compute_word_count(
assembly_accessions,
below_threshold=False,
count_type='term',
sorted_output=False,
)
top_document_freq = compute_word_count(
assembly_accessions,
below_threshold=True,
count_type='document',
sorted_output=False,
)
all_document_freq = compute_word_count(
assembly_accessions,
below_threshold=False,
count_type='document',
sorted_output=False,
)
all_words = sorted(all_document_freq.keys())
word_scores = []
for word in all_words:
term_freq = top_term_freq.get(word, 0)
document_freq = top_document_freq.get(word, 0)
if term_freq < 100:
continue
elif document_freq < 10:
continue
term_score = term_freq / all_term_freq[word]
document_score = document_freq / all_document_freq[word]
score = (term_score + document_score) / 2
word_scores.append((
word, score, term_score, document_score
))
return (
sorted(word_scores, key=lambda t: t[1], reverse=True),
top_term_freq,
all_term_freq,
top_document_freq,
all_document_freq,
)
%%time
(
word_scores,
top_term_freq,
all_term_freq,
top_document_freq,
all_document_freq,
) = compute_word_scores(assembly_accessions)
print(f'{len(word_scores):,}')
200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 1,376 CPU times: user 5min 22s, sys: 32.3 s, total: 5min 55s Wall time: 6min 3s
word_scores_2 = sorted(word_scores, key=lambda t: t[1], reverse=True)
[
(t[0], np.round(100 * (t[2] * t[3]), 2), np.round(t[1], 2), np.round(t[2], 2), np.round(t[3], 2))
for t in word_scores_2
if t[1] >= 0.5
]
[('helicase-exonuclease', 80.69, 0.9, 0.88, 0.92), ('addab', 78.78, 0.89, 0.86, 0.91), ('adda', 70.19, 0.84, 0.84, 0.84), ('dnae', 65.57, 0.81, 0.8, 0.82), ('addb', 63.92, 0.8, 0.8, 0.8), ('spra', 61.17, 0.78, 0.78, 0.79), ('modules', 53.96, 0.74, 0.64, 0.84), ('polc', 54.84, 0.74, 0.71, 0.77), ('non-ribosomal', 54.3, 0.74, 0.69, 0.78), ('adenylation', 52.51, 0.73, 0.68, 0.77), ('duf490', 48.95, 0.7, 0.69, 0.71), ('alpha-2-macroglobulin', 46.74, 0.68, 0.67, 0.7), ('cobn', 46.35, 0.68, 0.66, 0.71), ('lhr', 45.14, 0.67, 0.65, 0.69), ('transcription-repair', 43.19, 0.66, 0.65, 0.66), ('transcription-repair-coupling', 41.6, 0.65, 0.63, 0.66), ('ca2+-transporting', 40.96, 0.64, 0.62, 0.66), ('polc-type', 40.14, 0.64, 0.58, 0.69), ('hrpa', 40.15, 0.63, 0.61, 0.66), ('alpha-mannosidase', 38.35, 0.62, 0.55, 0.7), ('nonribosomal', 38.69, 0.62, 0.62, 0.63), ('helicase', 26.48, 0.62, 0.28, 0.95), ('coupling', 37.48, 0.61, 0.57, 0.65), ('s-box-containing', 30.21, 0.6, 0.35, 0.86), ('deadenyltransferase', 36.15, 0.6, 0.6, 0.6), ('bara', 35.56, 0.6, 0.55, 0.65), ('glutamine-synthetase', 35.26, 0.59, 0.59, 0.59), ('s-dna-t', 34.86, 0.59, 0.55, 0.64), ('maltooligosyl', 34.75, 0.59, 0.55, 0.63), ('dna', 18.3, 0.59, 0.19, 0.99), ('tigr02099', 34.18, 0.58, 0.58, 0.58), ('pullulanase', 33.85, 0.58, 0.55, 0.62), ('nad-glutamate', 33.9, 0.58, 0.58, 0.58), ('arabinosyltransferase', 33.39, 0.58, 0.52, 0.64), ('5-carboxymethylaminomethyl-2-thiouridine34', 33.38, 0.58, 0.58, 0.58), ('uvra', 33.26, 0.58, 0.55, 0.6), ('glutamate-ammonia-ligase', 33.27, 0.58, 0.57, 0.58), ('uridylyl-removing', 32.56, 0.57, 0.57, 0.57), ('multi-sensor', 31.28, 0.57, 0.46, 0.68), ('alpha-n-acetylglucosaminidase', 32.22, 0.57, 0.52, 0.62), ('atp-dependent', 19.91, 0.57, 0.22, 0.92), ('afsr', 30.31, 0.57, 0.43, 0.7), ('cu+-exporting', 31.83, 0.57, 0.53, 0.6), ('puta', 31.34, 0.56, 0.54, 0.58), ('mnmd', 31.23, 0.56, 0.56, 0.56), ('ftsk', 30.61, 0.56, 0.49, 0.63), ('mcm', 29.92, 0.55, 0.5, 0.6), ('tonb-dependent', 23.87, 0.55, 0.3, 0.8), ('t9ss', 21.07, 0.55, 0.25, 0.85), ('sbcc', 29.86, 0.55, 0.52, 0.57), ('calcium-translocating', 29.63, 0.55, 0.5, 0.59), ('mukb', 29.48, 0.54, 0.54, 0.55), ('excinuclease', 26.13, 0.54, 0.36, 0.73), ('gramicidin', 28.9, 0.54, 0.49, 0.59), ('smc', 28.27, 0.54, 0.46, 0.61), ('sensors', 27.14, 0.53, 0.42, 0.64), ('muts', 27.33, 0.53, 0.44, 0.63), ('exodnase', 28.1, 0.53, 0.52, 0.54), ('pmca-type', 27.57, 0.53, 0.47, 0.59), ('restart', 27.76, 0.53, 0.49, 0.56), ('synthase', 6.97, 0.52, 0.07, 0.98), ('synthetase', 13.62, 0.52, 0.15, 0.89), ('magnesium-translocating', 27.16, 0.52, 0.5, 0.54), ('tamb', 27.06, 0.52, 0.52, 0.52), ('p-type', 24.0, 0.52, 0.35, 0.69), ('ligase', 11.73, 0.52, 0.13, 0.91), ('atpase', 11.23, 0.52, 0.12, 0.91), ('polymerase', 9.11, 0.52, 0.1, 0.94), ('hybrid', 25.27, 0.51, 0.41, 0.62), ('kinase', 8.8, 0.51, 0.09, 0.93), ('penicillin-binding', 21.54, 0.51, 0.3, 0.73), ('amphiphilic', 25.57, 0.51, 0.45, 0.57), ('oligosaccharyl', 25.67, 0.51, 0.48, 0.54), ('cu2+-exporting', 25.57, 0.51, 0.46, 0.55), ('alpha-l-rhamnosidase', 24.82, 0.51, 0.41, 0.6), ('acrd', 24.23, 0.5, 0.4, 0.61), ('uvrabc', 23.65, 0.5, 0.38, 0.63), ('exporter-1', 24.84, 0.5, 0.44, 0.56), ('csm1', 25.15, 0.5, 0.49, 0.52), ('recepter', 18.64, 0.5, 0.25, 0.75)]
_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist([int(np.round(100 * t[2], 0)) for t in word_scores_2], bins=50, log=True);
top_term_freq = compute_word_count(
assembly_accessions,
below_threshold=True,
count_type='term',
sorted_output=False,
)
top_document_freq = compute_word_count(
assembly_accessions,
below_threshold=True,
count_type='document',
sorted_output=False,
)
200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282
top_term_freq_sorted = sorted(list(top_term_freq.items()), key=lambda t: t[1], reverse=True)
top_document_freq_sorted = sorted(list(top_document_freq.items()), key=lambda t: t[1], reverse=True)
len([t[1] for t in top_term_freq_sorted if t[1] >= 20])
6172
len([t[1] for t in top_document_freq_sorted if t[1] > 10])
7767
def retrieve_most_common_protein_from_words(assembly_accessions, words):
cds_codon_bias_folder = os.path.join(os.getcwd(), 'data/cds_codon_bias/below_threshold')
counts = {}
for ix, assembly_accession in enumerate(assembly_accessions):
if (ix + 1) % 200 == 0:
print(f'{ix + 1} / {len(assembly_accessions)}')
cds_codon_bias_path = os.path.join(cds_codon_bias_folder, f'{assembly_accession}_codon_bias.csv')
df = pd.read_csv(cds_codon_bias_path)
descriptions = df['description'].values
for description in descriptions:
protein = str(description)
desc = str(description).replace(',', '').replace('.', '').replace('(', '').replace(')', '').strip()
parts_1 = desc.split(' ')
parts = []
for p in parts_1:
parts += [v.lower() for v in p.split('/')]
parts_set = set(parts)
for word in words:
if word not in parts_set:
continue
if word not in counts:
counts[word] = {}
if protein not in counts[word]:
counts[word][protein] = 1
else:
counts[word][protein] += 1
word_to_most_common_protein = {}
for word in words:
if word not in counts:
continue
l = sorted([(k, v) for k, v in counts[word].items()], key=lambda t: t[1])
word_to_most_common_protein[word] = l[-1][0]
return word_to_most_common_protein
most_common_words = [t[0] for t in word_scores_2 if t[1] >= 0.5]
%%time
word_to_most_common_protein = retrieve_most_common_protein_from_words(
assembly_accessions,
most_common_words,
)
200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 CPU times: user 27.1 s, sys: 2.16 s, total: 29.2 s Wall time: 30.8 s
word_to_most_common_protein['helicase']
'DEAD/DEAH box helicase'
def compute_most_common_word_output(
most_common_words,
word_to_most_common_protein,
top_term_freq,
all_term_freq,
top_document_freq,
all_document_freq,
):
columns = [
'Keyword',
'Most common protein',
'Frequency',
'Frequency in top',
'Presence count',
'Present count in top',
'Term frequency score',
'Presence score',
'Score',
]
data = []
for word in most_common_words:
term_score = top_term_freq[word] / all_term_freq[word]
presence_score = top_document_freq[word] / all_document_freq[word]
score = (term_score + presence_score) / 2
data.append([
word,
word_to_most_common_protein[word],
all_term_freq[word],
top_term_freq[word],
all_document_freq[word],
top_document_freq[word],
np.round(term_score, 2),
np.round(presence_score, 2),
np.round(score, 2),
])
return pd.DataFrame(data, columns=columns)
most_common_word_output = compute_most_common_word_output(
most_common_words,
word_to_most_common_protein,
top_term_freq,
all_term_freq,
top_document_freq,
all_document_freq,
)
most_common_word_output.head(20)
Keyword | Most common protein | Frequency | Frequency in top | Presence count | Present count in top | Term frequency score | Presence score | Score | |
---|---|---|---|---|---|---|---|---|---|
0 | helicase-exonuclease | helicase-exonuclease AddAB subunit AddA | 506 | 445 | 303 | 278 | 0.88 | 0.92 | 0.90 |
1 | addab | helicase-exonuclease AddAB subunit AddA | 519 | 448 | 309 | 282 | 0.86 | 0.91 | 0.89 |
2 | adda | helicase-exonuclease AddAB subunit AddA | 466 | 390 | 465 | 390 | 0.84 | 0.84 | 0.84 |
3 | dnae | DNA polymerase III DnaE | 130 | 104 | 122 | 100 | 0.80 | 0.82 | 0.81 |
4 | addb | helicase-exonuclease AddAB subunit AddB | 394 | 315 | 394 | 315 | 0.80 | 0.80 | 0.80 |
5 | spra | cell surface protein SprA | 169 | 131 | 166 | 131 | 0.78 | 0.79 | 0.78 |
6 | modules | Erythronolide synthase, modules 3 and 4 | 175 | 112 | 51 | 43 | 0.64 | 0.84 | 0.74 |
7 | polc | DNA polymerase III PolC | 262 | 186 | 233 | 180 | 0.71 | 0.77 | 0.74 |
8 | non-ribosomal | non-ribosomal peptide synthetase | 2267 | 1569 | 543 | 426 | 0.69 | 0.78 | 0.74 |
9 | adenylation | amino acid adenylation domain-containing protein | 1889 | 1285 | 513 | 396 | 0.68 | 0.77 | 0.73 |
10 | duf490 | protein of unknown function DUF490 | 317 | 218 | 288 | 205 | 0.69 | 0.71 | 0.70 |
11 | alpha-2-macroglobulin | alpha-2-macroglobulin | 479 | 321 | 423 | 295 | 0.67 | 0.70 | 0.68 |
12 | cobn | cobaltochelatase subunit CobN | 784 | 515 | 659 | 465 | 0.66 | 0.71 | 0.68 |
13 | lhr | ATP dependent helicase, Lhr family | 210 | 137 | 172 | 119 | 0.65 | 0.69 | 0.67 |
14 | transcription-repair | transcription-repair coupling factor | 2499 | 1629 | 2454 | 1626 | 0.65 | 0.66 | 0.66 |
15 | transcription-repair-coupling | Transcription-repair-coupling factor | 372 | 236 | 334 | 219 | 0.63 | 0.66 | 0.65 |
16 | ca2+-transporting | Ca2+-transporting ATPase | 216 | 135 | 148 | 97 | 0.62 | 0.66 | 0.64 |
17 | polc-type | PolC-type DNA polymerase III | 613 | 358 | 518 | 356 | 0.58 | 0.69 | 0.64 |
18 | hrpa | ATP-dependent RNA helicase HrpA | 915 | 557 | 840 | 554 | 0.61 | 0.66 | 0.63 |
19 | alpha-mannosidase | alpha-mannosidase | 1355 | 743 | 662 | 463 | 0.55 | 0.70 | 0.62 |
most_common_word_output.to_excel(
os.path.join(os.getcwd(), 'data/codon_bias_common_proteins.xlsx'),
index=False,
)