#!/usr/bin/env python
# coding: utf-8

# Notebook export: explore the species codon-bias distance matrix, derive a
# neighbour-distance threshold, and inspect the connected components and
# label-propagation communities of the codon-bias graph.

# In[70]:

import argparse
import os
import json
import logging
import string
import collections

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

# When executed from the notebook/ folder, move to the project root so the
# relative data paths below resolve.
if os.getcwd().endswith('notebook'):
    os.chdir('..')

from rna_learn.alphabet import CODON_REDUNDANCY
from rna_learn.codon_bias.graph import load_codon_bias

# In[3]:

sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()

logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)

# In[4]:

db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')

# ## Load distance matrix

# In[2]:

distance_matrix_path = os.path.join(os.getcwd(), 'data/distance_matrix.npy')
# NOTE(review): allow_pickle=True is only needed for object arrays and allows
# arbitrary code execution on untrusted files — confirm it is required here.
distance_matrix = np.load(distance_matrix_path, allow_pickle=True)
distance_matrix.shape

# ## Load codon bias

# In[37]:

codon_bias_path = os.path.join(os.getcwd(), 'data/species_codon_ratios.csv')
species_codon_df = load_codon_bias(engine, codon_bias_path)
species_codon_df.head()

# In[52]:

ratio_columns = [c for c in species_codon_df.columns if c.endswith('_ratio')]

# ## Distance Threshold

# In[10]:


def compute_appropriate_threshold_distance(species_codon_df, distance_matrix, s_stds):
    """Derive a single distance threshold from per-species distance statistics.

    For each species (row i of the dataframe, matching row i of the distance
    matrix), compute the mean and standard deviation of its distances to all
    other species (the diagonal self-distance is excluded). The returned
    threshold is the average over species of (mean - s_stds * std).

    :param species_codon_df: dataframe with one row per species, including a
        'species_taxid' column; row order must match `distance_matrix`.
    :param distance_matrix: square array of pairwise species distances.
    :param s_stds: number of standard deviations to subtract from each mean.
    :return: scalar threshold distance.
    """
    output_columns = [
        'species_taxid',
        'distance_mean',
        'distance_std',
    ]
    data = []
    for ix in range(len(species_codon_df)):
        # Positional access: row ix of the dataframe corresponds to row ix of
        # the distance matrix. (.iloc rather than .loc — .loc silently breaks
        # if the dataframe does not carry a default RangeIndex.)
        species = species_codon_df.iloc[ix]
        # Drop the diagonal entry (distance of the species to itself).
        d = np.delete(distance_matrix[ix], ix)
        data.append([
            species['species_taxid'],
            np.mean(d) if len(d) > 0 else 0.,
            np.std(d) if len(d) > 0 else 0.,
        ])
    distance_df = pd.DataFrame(data, columns=output_columns)
    return (distance_df['distance_mean'] - s_stds * distance_df['distance_std']).mean()


# In[28]:


def compute_neighbour_stats(species_codon_df, distance_matrix, s_stds):
    """Summarise how many neighbours each species has under the threshold.

    The threshold is computed with `compute_appropriate_threshold_distance`.
    A neighbour of species i is any other species whose distance to i is at
    or below that threshold.

    :param species_codon_df: dataframe with one row per species, including a
        'species_taxid' column; row order must match `distance_matrix`.
    :param distance_matrix: square array of pairwise species distances.
    :param s_stds: number of standard deviations used for the threshold.
    :return: (threshold, stats) where stats is a dict with the mean, std, min
        and max neighbour counts, plus the number and percentage of species
        with zero neighbours.
    """
    threshold = compute_appropriate_threshold_distance(species_codon_df, distance_matrix, s_stds)

    species_taxids = species_codon_df['species_taxid'].values

    n_under_threshold = []
    for i, species_taxid in enumerate(species_taxids):
        # Exclude the diagonal self-distance before counting neighbours.
        distances = np.delete(distance_matrix[i], i)
        n_under_threshold.append(len([d for d in distances if d <= threshold]))

    n_zeros = len([v for v in n_under_threshold if v == 0])

    return threshold, {
        'mean': np.round(np.mean(n_under_threshold), 2),
        'std': np.round(np.std(n_under_threshold), 2),
        'min': np.min(n_under_threshold),
        'max': np.max(n_under_threshold),
        'n_zeros': n_zeros,
        'n_zeros_percent': np.round(100 * n_zeros / len(species_taxids), 2),
    }


# In[44]:

threshold, stats = compute_neighbour_stats(species_codon_df, distance_matrix, s_stds=1.3)
print(f'Threshold: {threshold:.4f}')
stats

# In[45]:

# Inspect the neighbours of species taxid 2336 under the chosen threshold.
ix = species_codon_df[species_codon_df['species_taxid'] == 2336].index[0]
distances = distance_matrix[ix]
neighbours_ix = [i for i, d in enumerate(distances) if d <= threshold and ix != i]
species_codon_df.loc[neighbours_ix][['species_taxid', 'species', 'phylum', 'superkingdom']]

# ## Load graph

# In[46]:

get_ipython().run_cell_magic('time', '', "graph_path = os.path.join(os.getcwd(), 'data/codon_bias_graph.gpickle')\ngraph = nx.read_gpickle(graph_path)\n")

# In[47]:

get_ipython().run_cell_magic('time', '', 'connected_components = list(nx.connected_components(graph))\n')

# In[55]:

for i, component in enumerate(connected_components):
    print(f'Component {i+1}: {len(component):,} species')

# In[84]:

from networkx.algorithms import community as nx_community

# In[85]:

get_ipython().run_cell_magic('time', '', 'communities = list(nx_community.asyn_lpa_communities(graph))\n')

# In[86]:

# Report community sizes and locate three species of interest.
for i, community in enumerate(communities):
    print(f'Community {i+1}: {len(community):,} species')
    if 2336 in community:
        print(f' - Species 2336 is in community {i+1}')
    if 58290 in community:
        print(f' - Species 58290 is in community {i+1}')
    if 565033 in community:
        print(f' - Species 565033 is in community {i+1}')

# In[ ]: