import collections
import os
import json
import logging
import string
import itertools
from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import statsmo dels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import networkx as nx
if os.getcwd().endswith('notebook'):
os.chdir('..')
from rna_learn.alphabet import ALPHABET_DNA, CODON_REDUNDANCY
from rna_learn.codon_bias.graph import load_codon_bias
from rna_learn.codon_bias.distance import compute_codon_bias_distance
sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')
possible_codons = sorted(CODON_REDUNDANCY.keys())
nucleotide_bias = load_codon_bias(engine, os.path.join(os.getcwd(), 'data/nucleotide_bias.csv'))
doublet_bias = load_codon_bias(engine, os.path.join(os.getcwd(), 'data/doublet_bias.csv'))
triplet_bias = load_codon_bias(engine, os.path.join(os.getcwd(), 'data/triplet_bias.csv'))
def plot_triplets_distribution(triplet_bias, possible_codons, species_taxids):
y_species = []
for species_taxid in species_taxids:
df = triplet_bias[triplet_bias['species_taxid'] == species_taxid]
ratios = []
codon_count_sum = sum((df[cod] for cod in possible_codons)).sum()
for codon in possible_codons:
ratio = df[codon].sum() / codon_count_sum
ratios.append(ratio)
y_species.append(ratios)
x = list(reversed(sorted(possible_codons)))
_, ax = plt.subplots(1, 1, figsize=(12, 18))
data = {}
for i, species_taxid in enumerate(species_taxids):
species = triplet_bias[triplet_bias['species_taxid'] == species_taxid]['species'].iloc[0]
data[species] = y_species[i]
pd.DataFrame(
data,
index=x,
).plot.barh(ax=ax, alpha=1)
ax.set_xlabel('Probability of occurence')
ax.set_title('Genome-wide nucleotide triplets distribution')
return ax
plot_triplets_distribution(triplet_bias, possible_codons, species_taxids=[562, 2336, 53946]);
def fit_linear_regression(y_actual, x):
X = sm.add_constant(x)
model = sm.OLS(y_actual, X)
results = model.fit()
y_fit = results.predict(X)
return results, y_fit
def plot_true_vs_prediction_per_specie(y_true, y_pred, ax=None):
if ax is None:
_, ax = plt.subplots(1, 1, figsize=(8, 8))
palette = sns.color_palette()
x_straight = range(int(np.min(y_true)) - 1, int(np.max(y_true)) + 1)
ax.plot(x_straight, x_straight, '--', color=palette[1], linewidth=2)
ax.scatter(y_true, y_pred, color=palette[0], alpha=0.5, label='Species')
ax.set_xlabel('Temperature °C (truth)')
ax.set_ylabel('Temperature °C (prediction)')
return ax
def compute_and_plot_pattern_against_ogt(y_actual, x, results, ax=None):
palette = sns.color_palette()
X = sm.add_constant(x)
y_fit = results.predict(X)
std, upper, lower = wls_prediction_std(results)
ax = plot_true_vs_prediction_per_specie(y_actual, y_fit, ax=ax)
df_data = [
[y_actual[i], y_fit[i], std[i]]
for i in range(len(y_actual))
]
df = pd.DataFrame(df_data, columns=['growth_tmp_actual', 'growth_tmp_prediction', 'growth_tmp_std'])
pred_mean = df[['growth_tmp_actual', 'growth_tmp_prediction']].groupby('growth_tmp_actual').mean()
std_mean = df[['growth_tmp_actual', 'growth_tmp_std']].groupby('growth_tmp_actual').mean().reset_index()
x_fill = std_mean['growth_tmp_actual'].values
x_base = pred_mean['growth_tmp_prediction'].values
y_fill_1 = x_base - 2 * std_mean['growth_tmp_std'].values
y_fill_2 = x_base + 2 * std_mean['growth_tmp_std'].values
ax.fill_between(x_fill, y_fill_1, y_fill_2, alpha=0.2, color=palette[1], label='Uncertainty');
ax.legend(loc='upper left')
return ax, y_fit
def compute_mae(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred))
def compute_rmse(y_true, y_pred):
return np.sqrt(np.mean((y_true - y_pred)**2))
def fit_model_and_plot(df):
ratio_columns = [c for c in df.columns if c.endswith('_ratio')]
x_train = df[~df['in_test_set']][ratio_columns].values
y_train = df[~df['in_test_set']]['growth_tmp'].values
x_test = df[df['in_test_set']][ratio_columns].values
y_test = df[df['in_test_set']]['growth_tmp'].values
results, y_fit_train = fit_linear_regression(y_train, x_train)
ax, y_fit_test = compute_and_plot_pattern_against_ogt(y_test, x_test, results)
return compute_mae(y_test, y_fit_test), results.rsquared, ax
mae, r2, ax = fit_model_and_plot(triplet_bias)
ax.set_title((
f'Growth temperature prediction from nucleotide triplet distribution\n\n'
f'R-squared = {r2:.2f} - Mean absolute error = {mae:.2f} °C\n'
));
mae, r2, ax = fit_model_and_plot(doublet_bias)
ax.set_title((
f'Growth temperature prediction from nucleotide doublet distribution\n\n'
f'R-squared = {r2:.2f} - Mean absolute error = {mae:.2f} °C\n'
));
mae, r2, ax = fit_model_and_plot(nucleotide_bias)
ax.set_title((
f'Growth temperature prediction from nucleotide distribution\n\n'
f'R-squared = {r2:.2f} - Mean absolute error = {mae:.2f} °C\n'
));
possible_codons = sorted(CODON_REDUNDANCY.keys())
possible_doublets = [f'{a}{b}' for a, b in sorted(itertools.permutations(ALPHABET_DNA, 2))]
doublet_columns = [f'{c}_ratio' for c in possible_doublets]
triplet_columns = [f'{c}_ratio' for c in possible_codons]
combined_df = pd.concat(
[
nucleotide_bias.set_index('assembly_accession'),
doublet_bias.set_index('assembly_accession')[doublet_columns],
triplet_bias.set_index('assembly_accession')[triplet_columns],
],
axis=1,
).reset_index()
combined_df.head()
assembly_accession | species_taxid | in_test_set | A | C | G | T | A_ratio | C_ratio | G_ratio | ... | TCG_ratio | TCT_ratio | TGA_ratio | TGC_ratio | TGG_ratio | TGT_ratio | TTA_ratio | TTC_ratio | TTG_ratio | TTT_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GCA_000005825.2 | 79885 | False | 1589481 | 987485 | 1104093 | 1514536 | 0.305929 | 0.190062 | 0.212506 | ... | 0.009728 | 0.016952 | 0.023118 | 0.015000 | 0.013974 | 0.015261 | 0.025424 | 0.019694 | 0.021254 | 0.031125 |
1 | GCA_000005845.2 | 562 | False | 1341959 | 1374614 | 1459271 | 1338618 | 0.243353 | 0.249274 | 0.264626 | ... | 0.015260 | 0.012318 | 0.019977 | 0.021149 | 0.022337 | 0.012565 | 0.014898 | 0.016006 | 0.016710 | 0.020713 |
2 | GCA_000006175.2 | 2188 | True | 831476 | 306449 | 362019 | 730062 | 0.372858 | 0.137421 | 0.162340 | ... | 0.006074 | 0.011747 | 0.018214 | 0.010268 | 0.012998 | 0.013879 | 0.040932 | 0.016391 | 0.020027 | 0.044815 |
3 | GCA_000006605.1 | 38289 | False | 586107 | 929593 | 932665 | 561759 | 0.194712 | 0.308822 | 0.309843 | ... | 0.021046 | 0.010342 | 0.013026 | 0.020352 | 0.023503 | 0.009397 | 0.004139 | 0.015074 | 0.010660 | 0.007282 |
4 | GCA_000006685.1 | 83560 | False | 379185 | 255852 | 268679 | 390700 | 0.292939 | 0.197658 | 0.207568 | ... | 0.010135 | 0.024899 | 0.015785 | 0.014669 | 0.014870 | 0.015866 | 0.022905 | 0.024235 | 0.021775 | 0.038979 |
5 rows × 109 columns
mae, r2, ax = fit_model_and_plot(combined_df)
ax.set_title((
f'Growth temperature prediction from combined nucleotides distribution\n\n'
f'R-squared = {r2:.2f} - Mean absolute error = {mae:.2f} °C\n'
));