import argparse
import os
import json
import logging
import string
import collections
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
if os.getcwd().endswith('notebook'):
os.chdir('..')
from rna_learn.alphabet import CODON_REDUNDANCY
from rna_learn.codon_bias.graph import load_codon_bias
sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')
distance_matrix_path = os.path.join(os.getcwd(), 'data/distance_matrix.npy')
distance_matrix = np.load(distance_matrix_path, allow_pickle=True)
distance_matrix.shape
(2680, 2680)
codon_bias_path = os.path.join(os.getcwd(), 'data/species_codon_ratios.csv')
species_codon_df = load_codon_bias(engine, codon_bias_path)
species_codon_df.head()
species_taxid | in_test_set | AAA_ratio | AAG_ratio | AAT_ratio | AAC_ratio | ACT_ratio | ACC_ratio | ACA_ratio | ACG_ratio | ... | motility | range_salinity | cell_shape | isolation_source | doubling_h | genome_size | gc_content | coding_genes | tRNA_genes | rRNA16S_genes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7 | True | 0.082550 | 0.917450 | 0.327724 | 0.672276 | 0.024511 | 0.575353 | 0.036262 | 0.363875 | ... | yes | None | None | host_plant | NaN | 5369771.500 | 67.300 | 4713.667 | 53.000 | 3.0 |
1 | 9 | False | 0.919627 | 0.080373 | 0.858168 | 0.141832 | 0.454569 | 0.050026 | 0.446018 | 0.049387 | ... | yes | None | coccobacillus | host_animal_ectotherm | 35.40 | 601699.243 | 25.469 | 517.549 | 30.485 | 1.0 |
2 | 11 | True | 0.007302 | 0.992698 | 0.010730 | 0.989270 | 0.008586 | 0.425527 | 0.015789 | 0.550098 | ... | yes | None | bacillus | host_animal_endotherm | NaN | 3526440.800 | 73.805 | 3139.333 | 45.000 | 2.0 |
3 | 14 | True | 0.664866 | 0.335134 | 0.775571 | 0.224429 | 0.458203 | 0.188052 | 0.304183 | 0.049563 | ... | no | None | bacillus | water_hotspring | 2.47 | 1959987.600 | 33.700 | 1876.333 | 46.000 | 2.0 |
4 | 19 | True | 0.551810 | 0.448190 | 0.440559 | 0.559441 | 0.099134 | 0.591478 | 0.097796 | 0.211592 | ... | yes | None | bacillus | petroleum | NaN | 3722544.667 | 55.100 | 3222.667 | 54.000 | 2.0 |
5 rows × 86 columns
ratio_columns = [c for c in species_codon_df.columns if c.endswith('_ratio')]
def compute_appropriate_threshold_distance(species_codon_df, distance_matrix, s_stds):
output_columns = [
'species_taxid',
'distance_mean',
'distance_std',
]
data = []
for ix in range(len(species_codon_df)):
species = species_codon_df.loc[ix]
d = [
v for i, v in enumerate(distance_matrix[ix])
if i != ix
]
data.append([
species['species_taxid'],
np.mean(d) if len(d) > 0 else 0.,
np.std(d) if len(d) > 0 else 0.,
])
distance_df = pd.DataFrame(data, columns=output_columns)
return (distance_df['distance_mean'] - s_stds * distance_df['distance_std']).mean()
def compute_neighbour_stats(species_codon_df, distance_matrix, s_stds):
threshold = compute_appropriate_threshold_distance(species_codon_df, distance_matrix, s_stds)
species_taxids = species_codon_df['species_taxid'].values
n_under_threshold = []
for i, species_taxid in enumerate(species_taxids):
distances = np.delete(distance_matrix[i], i)
n_under_threshold.append(len([d for d in distances if d <= threshold]))
n_zeros = len([v for v in n_under_threshold if v == 0])
return threshold, {
'mean': np.round(np.mean(n_under_threshold), 2),
'std': np.round(np.std(n_under_threshold), 2),
'min': np.min(n_under_threshold),
'max': np.max(n_under_threshold),
'n_zeros': n_zeros,
'n_zeros_percent': np.round(100 * n_zeros / len(species_taxids), 2),
}
threshold, stats = compute_neighbour_stats(species_codon_df, distance_matrix, s_stds=1.3)
print(f'Threshold: {threshold:.4f}')
stats
Threshold: 0.0982
{'mean': 186.73, 'std': 113.32, 'min': 0, 'max': 476, 'n_zeros': 6, 'n_zeros_percent': 0.22}
ix = species_codon_df[species_codon_df['species_taxid'] == 2336].index[0]
distances = distance_matrix[ix]
neighbours_ix = [i for i, d in enumerate(distances) if d <= threshold and ix != i]
species_codon_df.loc[neighbours_ix][['species_taxid', 'species', 'phylum', 'superkingdom']]
species_taxid | species | phylum | superkingdom | |
---|---|---|---|---|
683 | 2337 | Thermotoga neapolitana | Thermotogae | Bacteria |
1190 | 57487 | Pseudothermotoga hypogea | Thermotogae | Bacteria |
1201 | 58290 | Archaeoglobus veneficus | Euryarchaeota | Archaea |
1463 | 93929 | Thermotoga petrophila | Thermotogae | Bacteria |
1464 | 93930 | Thermotoga naphthophila | Thermotogae | Bacteria |
2334 | 565033 | Geoglobus acetivorans | Euryarchaeota | Archaea |
2529 | 1184387 | Mesotoga prima | Thermotogae | Bacteria |
2558 | 1236046 | Mesotoga infera | Thermotogae | Bacteria |
%%time
graph_path = os.path.join(os.getcwd(), 'data/codon_bias_graph.gpickle')
graph = nx.read_gpickle(graph_path)
CPU times: user 287 ms, sys: 53.1 ms, total: 340 ms Wall time: 342 ms
%%time
connected_components = list(nx.connected_components(graph))
CPU times: user 38.7 ms, sys: 1.2 ms, total: 39.9 ms Wall time: 39.1 ms
for i, component in enumerate(connected_components):
print(f'Component {i+1}: {len(component):,} species')
Component 1: 2,672 species Component 2: 2 species Component 3: 1 species Component 4: 1 species Component 5: 1 species Component 6: 1 species Component 7: 1 species Component 8: 1 species
from networkx.algorithms import community as nx_community
%%time
communities = list(nx_community.asyn_lpa_communities(graph))
CPU times: user 8.05 s, sys: 13.2 ms, total: 8.07 s Wall time: 8.08 s
for i, community in enumerate(communities):
print(f'Community {i+1}: {len(community):,} species')
if 2336 in community:
print(f' - Species 2336 is in community {i+1}')
if 58290 in community:
print(f' - Species 58290 is in community {i+1}')
if 565033 in community:
print(f' - Species 565033 is in community {i+1}')
Community 1: 1,091 species Community 2: 271 species Community 3: 1,251 species - Species 58290 is in community 3 - Species 565033 is in community 3 Community 4: 2 species Community 5: 2 species Community 6: 38 species Community 7: 1 species Community 8: 1 species Community 9: 1 species Community 10: 4 species Community 11: 1 species Community 12: 5 species - Species 2336 is in community 12 Community 13: 1 species Community 14: 4 species Community 15: 2 species Community 16: 4 species Community 17: 1 species