import argparse
import os
import json
import logging
import string
import collections
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
if os.getcwd().endswith('notebook'):
os.chdir('..')
from rna_learn.alphabet import CODON_REDUNDANCY
from rna_learn.codon_bias.trna_ai import compute_trna_ai_weights
from rna_learn.codon_bias.graph import load_codon_bias
from rna_learn.codon_bias.distance import compute_codon_bias_distance
sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')
codon_to_aa_table = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
}
aa_to_codons = collections.defaultdict(list)
for codon, aa in codon_to_aa_table.items():
aa_to_codons[aa].append(codon)
aa_to_codons = dict(aa_to_codons)
aa_to_codons
{'I': ['ATA', 'ATC', 'ATT'], 'M': ['ATG'], 'T': ['ACA', 'ACC', 'ACG', 'ACT'], 'N': ['AAC', 'AAT'], 'K': ['AAA', 'AAG'], 'S': ['AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'], 'R': ['AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'], 'L': ['CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'], 'P': ['CCA', 'CCC', 'CCG', 'CCT'], 'H': ['CAC', 'CAT'], 'Q': ['CAA', 'CAG'], 'V': ['GTA', 'GTC', 'GTG', 'GTT'], 'A': ['GCA', 'GCC', 'GCG', 'GCT'], 'D': ['GAC', 'GAT'], 'E': ['GAA', 'GAG'], 'G': ['GGA', 'GGC', 'GGG', 'GGT'], 'F': ['TTC', 'TTT'], 'Y': ['TAC', 'TAT'], '*': ['TAA', 'TAG', 'TGA'], 'C': ['TGC', 'TGT'], 'W': ['TGG']}
%%time
graph_path = os.path.join(os.getcwd(), 'data/codon_bias_graph.gpickle')
graph = nx.read_gpickle(graph_path)
CPU times: user 335 ms, sys: 50.1 ms, total: 385 ms Wall time: 387 ms
nx.number_connected_components(graph)
8
len(nx.node_connected_component(graph, 2336))
2672
species_query = """
select species_taxid, species, phylum, superkingdom, growth_tmp from species_traits
"""
species_df = pd.read_sql(species_query, engine)
species_taxids = species_df['species_taxid'].values
species_df[species_df['growth_tmp'] < 20]
species_taxid | species | phylum | superkingdom | growth_tmp | |
---|---|---|---|---|---|
6 | 25 | Shewanella hanedai | Proteobacteria | Bacteria | 16.0 |
20 | 102 | Polaribacter glomeratus | Bacteroidetes | Bacteria | 15.0 |
141 | 531 | Polaribacter irgensii | Bacteroidetes | Bacteria | 4.0 |
184 | 659 | Photobacterium phosphoreum | Proteobacteria | Bacteria | 19.5 |
194 | 688 | Aliivibrio logei | Proteobacteria | Bacteria | 16.0 |
... | ... | ... | ... | ... | ... |
2548 | 1217908 | Octadecabacter antarcticus | Proteobacteria | Bacteria | 4.0 |
2621 | 1424659 | Cryobacterium flavum | Actinobacteria | Bacteria | 10.0 |
2622 | 1424661 | Cryobacterium luteum | Actinobacteria | Bacteria | 10.0 |
2667 | 1706046 | Sulfurirhabdus autotrophica | Proteobacteria | Bacteria | 15.0 |
2677 | 1835254 | Polynucleobacter duraquae | Proteobacteria | Bacteria | 18.0 |
89 rows × 5 columns
def compute_mae(y_true, y_pred):
return np.mean(np.abs(y_true - y_pred))
def predict_numerical_field_from_neighbours(graph, species_taxid, trait, control_for_phylum=True):
current_node = graph.nodes[species_taxid]
if pd.isnull(current_node[trait]):
return None, None, None
current_phylum = current_node['phylum']
values = []
for key in graph.neighbors(species_taxid):
node = graph.nodes[key]
if pd.isnull(node[trait]):
continue
if control_for_phylum and node['phylum'] != current_phylum:
v = node[trait]
values.append(v)
elif not control_for_phylum:
v = node[trait]
values.append(v)
if len(values) == 0:
prediction = None
else:
prediction = np.mean(values)
return current_node[trait], prediction, values
def compute_mae_from_neighbour_predictions(graph, species_taxids, trait, control_for_phylum=True):
actuals = []
predictions = []
for i, species_taxid in enumerate(species_taxids):
actual, prediction, _ = predict_numerical_field_from_neighbours(
graph,
species_taxid,
trait,
control_for_phylum,
)
if actual is None or prediction is None:
continue
actuals.append(actual)
predictions.append(prediction)
return compute_mae(np.array(actuals), np.array(predictions))
def compute_accuracy(y_true, y_pred):
correct = [
1 if y_true[i] == y_pred[i] else 0
for i in range(len(y_pred))
]
return 100 * np.sum(correct) / len(y_pred)
def predict_categorical_field_from_neighbours(graph, species_taxid, trait, control_for_phylum=True):
current_node = graph.nodes[species_taxid]
if pd.isnull(current_node[trait]):
return None, None, None
current_phylum = current_node['phylum']
values = collections.defaultdict(int)
for key in graph.neighbors(species_taxid):
node = graph.nodes[key]
if pd.isnull(node[trait]):
continue
if control_for_phylum and node['phylum'] != current_phylum:
v = node[trait]
values[v] += 1
elif not control_for_phylum:
v = node[trait]
values[v] += 1
if len(values) == 0:
prediction = None
sorted_values = []
else:
sorted_values = sorted(values.items(), key=lambda t: t[1], reverse=True)
prediction = sorted_values[0][0]
return current_node[trait], prediction, sorted_values
def compute_accuracy_from_neighbour_predictions(graph, species_taxids, trait, control_for_phylum=True):
actuals = []
predictions = []
for i, species_taxid in enumerate(species_taxids):
actual, prediction, _ = predict_categorical_field_from_neighbours(
graph,
species_taxid,
trait,
control_for_phylum,
)
if actual is None or prediction is None:
continue
actuals.append(actual)
predictions.append(prediction)
return compute_accuracy(actuals, predictions)
compute_mae_from_neighbour_predictions(graph, species_taxids, 'growth_tmp')
8.644639843926784
compute_mae_from_neighbour_predictions(graph, species_taxids, 'growth_tmp', control_for_phylum=False)
6.363267744129107
compute_mae_from_neighbour_predictions(graph, species_taxids, 'optimum_ph')
0.8045548369656437
compute_mae_from_neighbour_predictions(graph, species_taxids, 'gc_content')
1.5698871501477607
compute_mae_from_neighbour_predictions(graph, species_taxids, 'tRNA_genes')
14.398689358230122
compute_mae_from_neighbour_predictions(graph, species_taxids, 'rRNA16S_genes')
2.167322515728827
compute_mae_from_neighbour_predictions(graph, species_taxids, 'coding_genes')
1183.0628712801658
actual, prediction, values = predict_numerical_field_from_neighbours(graph, 2336, 'growth_tmp')
print(actual, prediction, np.std(values))
values
80.0 78.725 1.2749999999999986
[77.45, 80.0]
compute_accuracy_from_neighbour_predictions(graph, species_taxids, 'isolation_source')
14.698162729658792
for key in graph.neighbors(2336):
node = graph.nodes[key]
print(
node['species_taxid'],
node['species'],
node['superkingdom'],
node['phylum'],
node['growth_tmp'],
node['isolation_source'],
)
2337 Thermotoga neapolitana Bacteria Thermotogae 79.0 sediment_marine_hydrothermal 57487 Pseudothermotoga hypogea Bacteria Thermotogae 69.95 petroleum 58290 Archaeoglobus veneficus Archaea Euryarchaeota 77.45 sediment_marine_hydrothermal 93929 Thermotoga petrophila Bacteria Thermotogae 77.25 wastewater 93930 Thermotoga naphthophila Bacteria Thermotogae 77.15 wastewater 565033 Geoglobus acetivorans Archaea Euryarchaeota 80.0 sediment_marine_hydrothermal 1184387 Mesotoga prima Bacteria Thermotogae 37.0 sediment_marine 1236046 Mesotoga infera Bacteria Thermotogae 45.0 water_marine
graph.nodes[2336]
{'species_taxid': 2336, 'species': 'Thermotoga maritima', 'growth_tmp': 80.0, 'genus': 'Thermotoga', 'family': 'Thermotogaceae', 'order': 'Thermotogales', 'class': 'Thermotogae', 'phylum': 'Thermotogae', 'superkingdom': 'Bacteria', 'optimum_ph': nan, 'gram_stain': 'negative', 'metabolism': 'anaerobic', 'sporulation': 'no', 'motility': 'yes', 'range_salinity': None, 'cell_shape': 'bacillus', 'isolation_source': 'sediment_marine_hydrothermal', 'doubling_h': 1.233, 'genome_size': 1865969.5, 'gc_content': 46.214, 'coding_genes': 1871.167, 'tRNA_genes': 46.0, 'rRNA16S_genes': 1.0, 'n_assemblies': 6, 'assembly_accessions': ('GCA_000008545.1', 'GCA_000230655.3', 'GCA_000390265.1', 'GCA_000978535.1', 'GCA_000978555.1', 'GCA_000978575.1')}
graph.nodes[565033]
{'species_taxid': 565033, 'species': 'Geoglobus acetivorans', 'growth_tmp': 80.0, 'genus': 'Geoglobus', 'family': 'Archaeoglobaceae', 'order': 'Archaeoglobales', 'class': 'Archaeoglobi', 'phylum': 'Euryarchaeota', 'superkingdom': 'Archaea', 'optimum_ph': nan, 'gram_stain': None, 'metabolism': 'anaerobic', 'sporulation': None, 'motility': 'yes', 'range_salinity': None, 'cell_shape': None, 'isolation_source': 'sediment_marine_hydrothermal', 'doubling_h': nan, 'genome_size': 1860816.25, 'gc_content': 46.8, 'coding_genes': 2172.667, 'tRNA_genes': 48.0, 'rRNA16S_genes': 1.0, 'n_assemblies': 1, 'assembly_accessions': ('GCA_000789255.1',)}
graph.nodes[58290]
{'species_taxid': 58290, 'species': 'Archaeoglobus veneficus', 'growth_tmp': 77.45, 'genus': 'Archaeoglobus', 'family': 'Archaeoglobaceae', 'order': 'Archaeoglobales', 'class': 'Archaeoglobi', 'phylum': 'Euryarchaeota', 'superkingdom': 'Archaea', 'optimum_ph': nan, 'gram_stain': 'negative', 'metabolism': 'anaerobic', 'sporulation': None, 'motility': 'yes', 'range_salinity': None, 'cell_shape': 'coccus', 'isolation_source': 'sediment_marine_hydrothermal', 'doubling_h': 1.02, 'genome_size': 1901942.4, 'gc_content': 47.0, 'coding_genes': 2076.0, 'tRNA_genes': 46.0, 'rRNA16S_genes': 1.0, 'n_assemblies': 1, 'assembly_accessions': ('GCA_000194625.1',)}
predict_categorical_field_from_neighbours(graph, 2336, 'isolation_source')
('sediment_marine_hydrothermal', 'sediment_marine_hydrothermal', [('sediment_marine_hydrothermal', 2)])
def protein_description_word_count(assembly_accession, below_threshold=True, count_type='document'):
if count_type not in ('document', 'term'):
raise ValueError('count type should be one of document or term')
cds_codon_bias_folder = os.path.join(os.getcwd(), 'data/cds_codon_bias')
if below_threshold:
cds_codon_bias_folder = os.path.join(cds_codon_bias_folder, 'below_threshold')
else:
cds_codon_bias_folder = os.path.join(cds_codon_bias_folder, 'all')
cds_codon_bias_path = os.path.join(cds_codon_bias_folder, f'{assembly_accession}_codon_bias.csv')
black_list = [
'',
'protein',
'enzyme',
'subunit',
'domain',
'domain-containing',
'hypothetical',
'putative',
'probable',
'conserved',
'factor',
'large',
'small',
'family',
'superfamily',
'outer',
'cell',
'i',
'ii',
'iii',
'iv',
'v',
'vi',
'alpha',
'beta',
'gamma',
'type',
'and',
'or',
'of',
'class',
'assembly',
'acid',
'component',
'response',
'2x',
'1-2',
]
black_list += [c for c in string.ascii_lowercase]
black_list += [c for c in string.digits]
black_list = set(black_list)
df = pd.read_csv(cds_codon_bias_path)
descriptions = df['description'].values
counts = collections.defaultdict(int)
for description in descriptions:
desc = str(description).replace(',', '').replace('.', '').replace('(', '').replace(')', '').strip()
parts_1 = desc.split(' ')
parts = []
for p in parts_1:
parts += [v.lower() for v in p.split('/')]
for word in parts:
if word in black_list:
continue
if count_type == 'document':
counts[word] = 1
else:
counts[word] += 1
return dict(counts)
def compute_word_count(assembly_accessions, below_threshold=True, count_type='document', sorted_output=True):
if count_type not in ('document', 'term'):
raise ValueError('count type should be one of document or term')
total_counts = {}
for i, assembly_accession in enumerate(assembly_accessions):
if (i+1) % 200 == 0:
print(f'{i+1} / {len(assembly_accessions)}')
counts = protein_description_word_count(assembly_accession, below_threshold, count_type)
for k, v in counts.items():
total_counts[k] = total_counts.get(k, 0) + counts[k]
if sorted_output:
word_counts = [
(
k,
np.round(100 * v / len(assembly_accessions), 2)
)
for k, v in total_counts.items()
]
return sorted(word_counts, key=lambda t: t[1], reverse=True)
else:
return total_counts
def compute_word_scores(assembly_accessions, min_term_freq=1):
top_term_freq = compute_word_count(
assembly_accessions,
below_threshold=True,
count_type='term',
sorted_output=False,
)
all_term_freqs = compute_word_count(
assembly_accessions,
below_threshold=False,
count_type='term',
sorted_output=False,
)
all_words = sorted(all_term_freqs.keys())
word_scores = []
for word in all_words:
term_freq = top_term_freq.get(word, 0)
all_term_freq = all_term_freqs[word]
term_freq_norm = term_freq / len(assembly_accessions)
all_term_freq_norm = all_term_freq / len(assembly_accessions)
if term_freq_norm < min_term_freq:
continue
score = term_freq / all_term_freqs[word]
word_scores.append((
word,
score,
np.round(term_freq_norm, 2),
np.round(all_term_freq_norm, 2),
))
return sorted(word_scores, key=lambda t: t[1], reverse=True)
def local_word_scores(graph, species_taxid, control_for_phylum=True, include_neighbours=True):
current_node = graph.nodes[species_taxid]
current_word_scores = compute_word_scores(current_node['assembly_accessions'])
score_dicts = [{t[0]: t for t in current_word_scores}]
word_set = set([t[0] for t in current_word_scores])
neighbours = list(graph.neighbors(species_taxid))
if include_neighbours:
for key in neighbours:
node = graph.nodes[key]
if control_for_phylum and node['phylum'] == current_node['phylum']:
continue
word_scores = compute_word_scores(node['assembly_accessions'])
word_set &= set([t[0] for t in word_scores])
score_dicts.append({t[0]: t for t in word_scores})
final_word_scores = []
for word in sorted(word_set):
scores, term_freqs, all_term_freqs = [], [], []
for score_dict in score_dicts:
_, score, term_freq, all_term_freq = score_dict[word]
scores.append(score)
term_freqs.append(term_freq)
all_term_freqs.append(all_term_freq)
final_word_scores.append((
word,
np.round(np.mean(score), 3),
np.round(np.mean(term_freqs), 2),
np.round(np.mean(all_term_freqs), 2),
))
return sorted(final_word_scores, key=lambda t: t[1], reverse=True)
def retrieve_orginal_proteins(graph, species_taxid, include_neighbours=True):
cds_codon_bias_folder = os.path.join(os.getcwd(), 'data/cds_codon_bias/below_threshold')
species_taxids = [species_taxid]
if include_neighbours:
species_taxids += [key for key in graph.neighbors(species_taxid)]
word_scores = local_word_scores(graph, species_taxid, include_neighbours=include_neighbours)
words = [t[0] for t in word_scores]
protein_dict = collections.defaultdict(list)
for species_taxid in species_taxids:
node = graph.nodes[species_taxid]
assembly_accessions = node['assembly_accessions']
for assembly_accession in assembly_accessions:
cds_codon_bias_path = os.path.join(cds_codon_bias_folder, f'{assembly_accession}_codon_bias.csv')
df = pd.read_csv(cds_codon_bias_path)
descriptions = df['description'].values
distances = df['distance'].values
for word in words:
for i, protein_description in enumerate(descriptions):
protein_desc = str(protein_description)
protein_desc_key = protein_desc.replace(',', '').replace(': ', ' ').replace(':', ' ').lower()
if word.lower() in protein_desc_key:
protein_dict[protein_desc_key].append((protein_desc, distances[i]))
output_proteins = [
(protein_dict[protein_desc][0][0], np.mean([t[1] for t in protein_dict[protein_desc]]))
for protein_desc in protein_dict.keys()
]
return sorted(output_proteins, key=lambda t: t[1])
def retrieve_orginal_proteins_global(graph, species_taxids):
cds_codon_bias_folder = os.path.join(os.getcwd(), 'data/cds_codon_bias/below_threshold')
protein_dict = collections.defaultdict(list)
seen_species = set()
for s_ix, species_taxid in enumerate(species_taxids):
if (s_ix + 1) % 10 == 0:
print(f'{s_ix + 1:,} / {len(species_taxids):,}')
if species_taxid in seen_species:
continue
node = graph.nodes[species_taxid]
assembly_accessions = node['assembly_accessions']
neighbors = list(graph.neighbors(species_taxid))
seen_species.add(species_taxid)
seen_species |= set(neighbors)
word_scores = local_word_scores(graph, species_taxid)
words = [t[0] for t in word_scores]
for assembly_accession in assembly_accessions:
cds_codon_bias_path = os.path.join(cds_codon_bias_folder, f'{assembly_accession}_codon_bias.csv')
df = pd.read_csv(cds_codon_bias_path)
descriptions = df['description'].values
distances = df['distance'].values
for word in words:
for i, protein_description in enumerate(descriptions):
protein_desc = str(protein_description)
protein_desc_key = protein_desc.replace(',', '').replace(': ', ' ').replace(':', ' ').lower()
if word.lower() in protein_desc_key:
protein_dict[protein_desc_key].append((protein_desc, distances[i]))
output_proteins = [
(protein_dict[protein_desc][0][0], np.mean([t[1] for t in protein_dict[protein_desc]]))
for protein_desc in protein_dict.keys()
]
return sorted(output_proteins, key=lambda t: t[1])
original_proteins_2336 = retrieve_orginal_proteins(
graph,
species_taxid=2336,
include_neighbours=False,
)
print(f'{len(original_proteins_2336):,} proteins')
for p in [t[0] for t in original_proteins_2336]:
print(p)
134 proteins permease DNA-directed RNA polymerase subunit beta alpha-mannosidase-related protein alpha-mannosidase alpha-mannosidase, putative DNA-directed RNA polymerase, beta subunit sensor histidine kinase histidine kinase DNA polymerase I DNA-directed DNA polymerase I DNA polymerase Xylose-regulated ABC transporter, substrate-binding component oligopeptide ABC transporter, periplasmic oligopeptide-binding protein iron transporter FeoB DNA gyrase, subunit A pyrimidine-nucleoside phosphorylase thymidine phosphorylase UDP-N-acetylglucosamine 2-epimerase Flagellar hook protein FlgE flagellar hook-basal body protein helicase single stranded DNA-specific exonuclease, putative single-stranded DNA exonuclease Single-stranded-DNA-specific exonuclease RecJ Superfamily II DNA/RNA helicase, SNF2 family CRISPR-associated protein DNA mismatch repair protein DNA mismatch repair protein MutS DNA ligase NAD-dependent DNA ligase LigA CRISPR-associated helicase Cas3 double-stranded DNA repair protein Rad50 Exonuclease SbcC DNA repair protein Rad50 cation-transporting ATPase, P-type ATPase P Lead, cadmium, zinc and mercury transporting ATPase ; Copper-translocating P-type ATPase diguanylate cyclase Phosphoglycerol transferase (R)-2-hydroxyglutaryl-CoA dehydratase activator-related protein 2-hydroxyglutaryl-CoA dehydratase RND transporter membrane protein, putative ATPase AAA MoxR-like ATPase isoleucyl-tRNA synthetase isoleucine--tRNA ligase diguanylate cyclase (GGDEF domain) with PAS/PAC sensor (p)ppGpp synthetase GTP pyrophosphokinase , (p)ppGpp synthetase I 6-phosphogluconate dehydrogenase, decarboxylating 6-phosphogluconate dehydrogenase peptide ABC transporter substrate-binding protein Fe-S oxidoreductase tRNA nucleotidyl transferase-related protein tRNA nucleotidyltransferase beta-galactosidase lipopolysaccharide biosynthesis protein ribonucleotide reductase, B12-dependent ribonucleotide reductase Ribonucleotide reductase of class II (coenzyme B12-dependent) cellobiose-phosphorylase glycosyl transferase Cellobiose phosphorylase 5-methyltetrahydrofolate S-homocysteine methyltransferase 5-methyltetrahydrofolate--homocysteine methyltransferase preprotein translocase subunit SecA preprotein translocase SecA subunit Protein export cytoplasm protein SecA ATPase RNA helicase glycyl-tRNA synthetase, beta subunit glycyl-tRNA synthetase subunit beta Glycyl-tRNA synthetase beta chain lipopolysaccharide biosynthesis protein, putative ABC transporter UDP-N-acetylglucosamine 1-carboxyvinyltransferase CRISPR-associated protein Csm1 CRISPR-associated protein, Csm1 family oligopeptide ABC transporter, ATP-binding protein Xylobiose ABC transport system, ATP-binding protein 1 GTP-binding protein LepA Translation elongation factor LepA ABC transporter, ATP-binding protein Lipid A export ATP-binding/permease protein MsbA multidrug ABC transporter ATP-binding protein DNA topoisomerase DNA topoisomerase I ATP-dependent protease LA, putative ATP-dependent protease ATP-dependent protease La Type II 5-methyltetrahydropteroyltriglutamate--homocysteine methyltransferase alcohol dehydrogenase, zinc-containing L-threonine 3-dehydrogenase alanyl-tRNA synthetase alanyl-tRNA synthase dehydrogenase oxidoreductase Myo-inositol 2-dehydrogenase 1 FAD dependent oxidoreductase putative electron transfer flavoprotein-quinone oxidoreductase FixC flagellar M-ring protein FliF flagellar M-ring protein alpha-glucuronidase lepA protein flagellar hook-associated protein 1 flagellar hook-associated protein FlgK flagellar hook protein FlgK ATPase, AAA family flagellar biosynthesis protein FlhA alcohol dehydrogenase Acetyl-coenzyme A carboxyl transferase alpha chain / Acetyl-coenzyme A carboxyl transferase beta chain ; Propionyl-CoA carboxylase beta chain methylmalonyl-CoA carboxyltransferase protein-export membrane protein SecD, putative Protein-export membrane protein SecD preprotein translocase subunit SecD maltodextrin glycosyltransferase membrane protein transcription termination factor Rho MFS transporter Permease of the major facilitator superfamily 3-phosphoshikimate 1-carboxyvinyltransferase ATP-dependent protease LA ATP-dependent protease La Type I DNA-binding protein DNA-directed RNA polymerase subunit beta' pyruvate kinase Xyloglucan ABC transport system, sugar-binding protein glucosamine--fructose-6-phosphate aminotransferase Glucosamine--fructose-6-phosphate aminotransferase [isomerizing phenylalanyl-tRNA synthase subunit alpha phenylalanine--tRNA ligase glycine dehydrogenase subunit 1 glycine dehydrogenase transcription-repair coupling factor aspartate aminotransferase
#for v in [t[0] for t in original_proteins_2336 if t[1] < 0.1][:51]:
# print(v)
original_proteins_562 = retrieve_orginal_proteins(
graph,
species_taxid=562,
include_neighbours=False,
)
print(f'{len(original_proteins_562):,} proteins')
for p in [t[0] for t in original_proteins_562]:
print(p)
164 proteins aconitate hydratase 1 alpha2-macroglobulin exodeoxyribonuclease V subunit RecB bacteriophage N4 receptor, outer membrane protein putative outer membrane protein, N-terminal fragment ribonucleoside-diphosphate reductase 2 subunit alpha apo-serine activating enzyme alpha-mannosidase peptidase M16 family protein inner membrane protein - inhibits the Rcs signaling pathway biotin sulfoxide reductase putative selenate reductase YnfE multidrug efflux pump RND permease AcrF DUF5107 domain-containing protein YphG 3-hydroxyacyl-CoA dehydrogenase FadJ peptidoglycan DD-transpeptidase FtsI transcription-repair coupling factor protein-tyrosine kinase Etk putative selenate reductase YnfF putative TonB-dependent outer membrane receptor exodeoxyribonuclease V subunit RecC putative xanthine dehydrogenase molybdenum-binding subunit XdhA fused DNA-binding transcriptional repressor/proline dehydrogenase/1-pyrroline-5-carboxylate dehydrogenase PutA DNA mismatch repair protein MutS ATP dependent, structure specific DNA nuclease - SbcC subunit putative transporter YhfK polyphosphate kinase bacteriophage N4 receptor, inner membrane subunit fumarase C soluble pyridine nucleotide transhydrogenase putative ATP-dependent protease YcbZ flagellar biosynthesis protein FlhA regulator of CsrB and CsrC decay oxepin-CoA hydrolase/3-oxo-5,6-dehydrosuberyl-CoA semialdehyde dehydrogenase DUF4132 domain-containing protein YehI multidrug efflux pump RND permease MdtF Type II secretion system protein GspD putative pyruvate-flavodoxin oxidoreductase sensory histidine kinase NarQ intermembrane transport protein YebT N-acetylglutamate synthase AsmA famly protein YicH cellulose phosphoethanolamine transferase carbamoyltransferase HypF beta-galactosidase alpha-D-xyloside xylohydrolase multidrug efflux pump RND permease AcrD beta-glucoside specific PTS enzyme II/BglG kinase/BglG phosphatase 1,4-alpha-glucan branching enzyme phosphoglycerol transferase I negative regulator of MalT activity/cystathionine beta-lyase acyltransferase adenylate cyclase ornithine decarboxylase, degradative kojibiose phosphorylase DNA replication terminus site-binding protein DNA-binding transcriptional activator HyfR osmoregulated periplasmic glucans (OPGs) biosynthesis protein H cell division DNA translocase FtsK putative aminohydrolase DUF3029 domain-containing protein YjjI DNA-binding transcriptional activator FhlA multidrug efflux pump RND permease subunit MdtC sensory histidine kinase BarA RcsD phosphotransferase putative autotransporter adhesin YfaL phosphoethanolamine transferase putative oxidoreductase Zn(2(+))/Cd(2(+))/Pb(2(+)) exporting P-type ATPase Cu(+) exporting P-type ATPase 1-deoxyxylulose-5-phosphate synthase YajO putative K(+):H(+) antiporter sensory histidine kinase ArcB sensory histidine kinase TorS DNA helicase IV fused indole-3-glycerol phosphate synthase/phosphoribosylanthranilate isomerase putative lipoprotein GfcD fused putative oxidoreductase, Fe-S subunit and nucleotide-binding subunit putative transporter YfcC putative NAD(P)-dependent oxidoreductase YbjT putative transporter YccS selenocysteyl-tRNA-specific translation elongation factor ATP-dependent RNA helicase HrpA carnitine-CoA ligase DUF418 domain-containing protein YeiB membrane-bound lytic murein transglycosylase F N-succinylarginine dihydrolase aromatic carboxylic acid efflux pump subunit AaeB putative diguanylate cyclase DgcE toxin YjjJ 3-ketoacyl-CoA thiolase FadI carbamoyl phosphate synthetase subunit alpha DNA-binding transcriptional dual regulator ArgP putative DNA repair helicase RadD dimethyl sulfoxide reductase subunit A multidrug efflux pump RND permease subunit MdtB putative hydratase YbhJ trimethylamine N-oxide reductase 2 DUF1479 domain-containing protein YbiU putative uncharacterized protein YjiV putative diguanylate cyclase DgcQ DNA-binding transcriptional activator/ornithine decarboxylase inhibitor AtoC metal phosphate:H(+) symporter PitB chaperone protein HscC ClpB chaperone Fe-S cluster scaffold complex subunit SufB L-tartrate:succinate antiporter inverse autotransporter adhesin sensory histidine kinase KdpD putative 5' to 3' DNA helicase implicated in DNA repair putative TonB-dependent receptor putative formate acetyltransferase 2 putative oxidoreductase, Fe-S subunit putative PTS enzyme IIC component SgcC glycerol-3-phosphate dehydrogenase putative oxidoreductase YdeP hydrogenase 4 component B putative FAD-linked oxidoreductase fatty acyl-CoA synthetase exopolyphosphatase peptidoglycan DD-transpeptidase MrdA protein ViaA putative c-di-GMP phosphodiesterase PdeK primosome factor N' RNase D poly-N-acetyl-D-glucosamine synthase subunit PgaC DNA-binding transcriptional dual regulator GalR copper-containing amine oxidase putative D,D-dipeptide ABC transporter periplasmic binding protein putative fimbrial usher protein YbgQ putative multidrug efflux pump outer membrane channel phosphatidylserine synthase fermentation-respiration switch protein tail-specific protease exodeoxyribonuclease V subunit RecD DNA-binding transcriptional activator MalT NADP(+)-dependent aldehyde reductase dGTP triphosphohydrolase acetohydroxy acid synthase I subunit IlvB malate synthase A ubiquinone biosynthesis protein UbiB 23S rRNA 5-hydroxycytidine C2501 synthase fused putative xanthine/hypoxanthine oxidase: molybdopterin-binding subunit and Fe-S binding subunit putative major facilitator superfamily transport protein YajR putative xyloside transporter YicJ AsmA2 domain-containing protein aspartate-semialdehyde dehydrogenase peptidoglycan DD-endopeptidase MepM putative iron siderophore outer membrane transporter radical SAM superfamily protein YgiQ fused chorismate mutase/prephenate dehydratase peptidyl-lysine acetyltransferase putative hydrolase YieL fused aspartate kinase/homoserine dehydrogenase 1 DNA topoisomerase III putative c-di-GMP phosphodiesterase PdeD oligopeptide ABC transporter periplasmic binding protein D-amino acid dehydrogenase transaldolase A DNA polymerase III subunit tau phosphoenolpyruvate-protein phosphotransferase PtsP putative transporter YdhK outer membrane channel TolC inorganic triphosphatase
original_proteins_531 = retrieve_orginal_proteins(
graph,
species_taxid=531,
include_neighbours=False,
)
print(f'{len(original_proteins_531):,} proteins')
for p in [t[0] for t in original_proteins_531]:
print(p)
79 proteins FAD linked oxidase-like protein Large subunit of NADH-dependent glutamate synthase HAE1 efflux family protein oxidoreductase alpha (molybdopterin) subunit ATP-dependent DNA helicase DNA polymerase I glycine dehydrogenase probable glycosyl hydrolase probable periplasmic tail-specific proteinase bifunctional GMP synthase/glutamine amidotransferase protein putative DNA polymerase III alpha subunit ATP-dependent Clp protease, ATP-binding subunit ClpC phosphoribosylformylglycinamidine synthase DNA mismatch repair protein isoleucyl-tRNA synthetase TonB-dependent siderophore receptor putative NADP-dependent malic enzyme probable long chain fatty-acid CoA ligase glutamine synthetase AcrB/AcrD family multidrug resistance protein putative chloride channel protein DNA topoisomerase IV subunit A methionyl-tRNA synthetase putative protease ATP-dependent helicase sensory transduction histidine kinase acetoin dehydrogenase (TPP-dependent) beta chain aminoacyl-histidine dipeptidase FixL-related histidine kinase ferric enterobactin receptor penicillin-binding protein 1A GTP-binding protein LepA Secreted protein containing N-terminal zinc-dependent carboxypeptidase related domain primosomal protein N' aspartate kinase III hypothetical protein aminopeptidase M1 family protein Penicillin amidase putative outer membrane protein, probably involved in nutrient binding DNA topoisomerase III phosphoglucomutase phosphomannomutase molybdopterin oxidoreductase, iron-sulfur binding subunit putative outer membrane receptor protein two-component regulatory system, sensor kinase protein ribonuclease R putative FtsK/SpoIIIE-like protein 2-succinyl-6-hydroxy-2,4-cyclohexadiene-1-carboxylic acid synthase/2-oxoglutarate decarboxylase alanyl-tRNA synthetase hypothetical carboxypeptidase putative peptidase/protease family protein phosphate acetyltransferase putative ATP-dependent DNA helicase Alpha-glucosidase, family 31 of glycosyl hydrolase GTP pyrophosphokinase putative DNA ligase pyruvate carboxylase methylmalonyl-CoA mutase aspartyl-tRNA synthetase glutaminyl-tRNA synthetase glutaryl-7-ACA acylase precursor Nitrilase/cyanide hydratase and apolipoprotein N-acyltransferase isocitrate dehydrogenase, NADP histidine kinase sensor protein putative outer membrane protein molecular chaperone DnaK beta-glucanase precursor isoquinoline 1-oxidoreductase, beta subunit glycosyl hydrolase FKBP-type peptidyl-prolyl cis-transisomerase Amidohydrolase family, putative aminopeptidase TonB-dependent receptor, putative phosphoenolpyruvate carboxylase excinuclease ABC subunit A putative capsular polysaccharide biosynthesis protein subtilisin-like serine protease putative auxin-regulated protein 2-isopropylmalate synthase acyl-CoA dehydrogenase
species_codon_ratios_path = os.path.join(os.getcwd(), 'data/species_codon_ratios.csv')
species_codon_df = load_codon_bias(engine, species_codon_ratios_path).set_index('species_taxid')
species_codon_df.head()
in_test_set | AAA_ratio | AAG_ratio | AAT_ratio | AAC_ratio | ACT_ratio | ACC_ratio | ACA_ratio | ACG_ratio | CGT_ratio | ... | motility | range_salinity | cell_shape | isolation_source | doubling_h | genome_size | gc_content | coding_genes | tRNA_genes | rRNA16S_genes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
species_taxid | |||||||||||||||||||||
7 | True | 0.082550 | 0.917450 | 0.327724 | 0.672276 | 0.024511 | 0.575353 | 0.036262 | 0.363875 | 0.078732 | ... | yes | None | None | host_plant | NaN | 5369771.500 | 67.300 | 4713.667 | 53.000 | 3.0 |
9 | False | 0.919627 | 0.080373 | 0.858168 | 0.141832 | 0.454569 | 0.050026 | 0.446018 | 0.049387 | 0.325065 | ... | yes | None | coccobacillus | host_animal_ectotherm | 35.40 | 601699.243 | 25.469 | 517.549 | 30.485 | 1.0 |
11 | True | 0.007302 | 0.992698 | 0.010730 | 0.989270 | 0.008586 | 0.425527 | 0.015789 | 0.550098 | 0.092597 | ... | yes | None | bacillus | host_animal_endotherm | NaN | 3526440.800 | 73.805 | 3139.333 | 45.000 | 2.0 |
14 | True | 0.664866 | 0.335134 | 0.775571 | 0.224429 | 0.458203 | 0.188052 | 0.304183 | 0.049563 | 0.035122 | ... | no | None | bacillus | water_hotspring | 2.47 | 1959987.600 | 33.700 | 1876.333 | 46.000 | 2.0 |
19 | True | 0.551810 | 0.448190 | 0.440559 | 0.559441 | 0.099134 | 0.591478 | 0.097796 | 0.211592 | 0.204394 | ... | yes | None | bacillus | petroleum | NaN | 3722544.667 | 55.100 | 3222.667 | 54.000 | 2.0 |
5 rows × 85 columns
ratio_columns = [c for c in species_codon_df.columns if c.endswith('_ratio')]
def load_trnas(engine, assembly_accession):
trna_query = """
select * from sequences where sequence_type = 'tRNA' and assembly_accession = ?
"""
trna_df = pd.read_sql(trna_query, engine, params=(assembly_accession,))
metadata = [json.loads(v) if v is not None else {} for v in trna_df['metadata_json'].values]
trna_df['codon'] = [v.get('codon') for v in metadata]
trna_df['anticodon'] = [v.get('anticodon') for v in metadata]
trna_df['amino_acid'] = [v.get('amino_acid') for v in metadata]
return trna_df
trna_df = load_trnas(engine, 'GCA_000005845.2')
print(len(trna_df), 'tRNA records')
trna_df.head()
86 tRNA records
assembly_accession | species_taxid | sequence_type | chromosome_id | location_json | strand | length | description | metadata_json | sequence | codon | anticodon | amino_acid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GCA_000005845.2 | 562 | tRNA | U00096.3 | [[225381, 225457]] | + | 77 | lcl|U00096.3_trna_5 [gene=ileV] [locus_tag=b02... | {"amino_acid": "Ile", "anticodon": "GAT", "cod... | AGGCTTGTAGCTCAGGTGGTTAGAGCGCACCCCTGATAAGGGTGAG... | ATC | GAT | Ile |
1 | GCA_000005845.2 | 562 | tRNA | U00096.3 | [[225500, 225575]] | + | 76 | lcl|U00096.3_trna_6 [gene=alaV] [locus_tag=b02... | {"amino_acid": "Ala", "anticodon": "TGC", "cod... | GGGGCTATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGG... | GCA | TGC | Ala |
2 | GCA_000005845.2 | 562 | tRNA | U00096.3 | [[228928, 229004]] | + | 77 | lcl|U00096.3_trna_9 [gene=aspU] [locus_tag=b02... | {"amino_acid": "Asp", "anticodon": "GTC", "cod... | GGAGCGGTAGTTCAGTCGGTTAGAATACCTGCCTGTCACGCAGGGG... | GAC | GTC | Asp |
3 | GCA_000005845.2 | 562 | tRNA | U00096.3 | [[236931, 237007]] | + | 77 | lcl|U00096.3_trna_10 [gene=aspV] [locus_tag=b0... | {"amino_acid": "Asp", "anticodon": "GTC", "cod... | GGAGCGGTAGTTCAGTCGGTTAGAATACCTGCCTGTCACGCAGGGG... | GAC | GTC | Asp |
4 | GCA_000005845.2 | 562 | tRNA | U00096.3 | [[262871, 262946]] | + | 76 | lcl|U00096.3_trna_11 [gene=thrW] [locus_tag=b0... | {"amino_acid": "Thr", "anticodon": "CGT", "cod... | GCCGATATAGCTCAGTTGGTAGAGCAGCGCATTCGTAATGCGAAGG... | ACG | CGT | Thr |
trna_ai_path = os.path.join(os.getcwd(), f'data/trn_adaptation_index/GCA_000005845.2_tai.csv')
trna_ai = pd.read_csv(trna_ai_path)
trna_ai.head()
assembly_accession | species_taxid | row_id | protein_id | protein | gene | adaptation_index | |
---|---|---|---|---|---|---|---|
0 | GCA_000005845.2 | 562 | 255469 | AAC74347.1 | trp operon leader peptide | trpL | 1.000000 |
1 | GCA_000005845.2 | 562 | 258414 | ABP93450.1 | uncharacterized protein YpfM | ypfM | 0.858984 |
2 | GCA_000005845.2 | 562 | 257893 | AAC77015.1 | putative stress response protein | yjbJ | 0.848080 |
3 | GCA_000005845.2 | 562 | 254740 | AAC73573.1 | putative nucleoid-associated protein YbaB | ybaB | 0.812893 |
4 | GCA_000005845.2 | 562 | 257793 | AAC76910.1 | cell division factor ZapB | zapB | 0.775375 |
adaptation_ix = trna_ai['adaptation_index'].values
_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist(adaptation_ix, bins=100);
matching_df = trna_ai[
trna_ai['protein'].isin(np.unique([t[0] for t in original_proteins_562]))
]
adaptation_ix_top_codon = matching_df['adaptation_index'].values
_, ax2 = plt.subplots(1, 1, figsize=(12, 6))
ax2.hist(adaptation_ix_top_codon, bins=20);
ax2.set_xlim(ax.get_xlim());
matching_df[['protein', 'adaptation_index']]
protein | adaptation_index | |
---|---|---|
677 | ClpB chaperone | 0.616203 |
720 | malate synthase A | 0.613474 |
817 | peptidoglycan DD-transpeptidase MrdA | 0.607896 |
819 | tail-specific protease | 0.607810 |
828 | copper-containing amine oxidase | 0.607257 |
... | ... | ... |
3616 | putative transporter YdhK | 0.493055 |
3723 | fused putative oxidoreductase, Fe-S subunit an... | 0.486201 |
3856 | exodeoxyribonuclease V subunit RecD | 0.476493 |
3878 | DNA-binding transcriptional activator HyfR | 0.474149 |
4144 | putative oxidoreductase | 0.440617 |
172 rows × 2 columns
def compute_adaptation_mean_diff(assembly_accessions):
cached_proteins = {}
diff = []
for i, assembly_accession in enumerate(assembly_accessions):
if (i+1) % 100 == 0:
print(f'{i+1:,} / {len(assembly_accessions):,}')
trna_ai_path = os.path.join(os.getcwd(), f'data/trn_adaptation_index/{assembly_accession}_tai.csv')
trna_ai = pd.read_csv(trna_ai_path)
species_taxid = trna_ai['species_taxid'].iloc[0]
if species_taxid in cached_proteins:
proteins = cached_proteins[species_taxid]
else:
proteins = retrieve_orginal_proteins(
graph,
species_taxid=species_taxid,
include_neighbours=False,
)
cached_proteins[species_taxid] = proteins
mean_1 = trna_ai['adaptation_index'].mean()
mean_2 = trna_ai[
trna_ai['protein'].isin(np.unique([t[0] for t in proteins]))
]['adaptation_index'].mean()
if pd.isnull(mean_1) or pd.isnull(mean_2):
print(species_taxid, assembly_accession, mean_1, mean_2)
continue
diff.append(mean_2 - mean_1)
return np.mean(diff), np.std(diff), np.min(diff), np.max(diff)
q = """
select assembly_accession, species_taxid from assembly_source
"""
response_df = pd.read_sql(q, engine)
assembly_accessions = response_df['assembly_accession'].values
%%time
mean, std, min_, max_ = compute_adaptation_mean_diff(assembly_accessions)
mean, std, min_, max_
100 / 3,282 200 / 3,282 680 GCA_000017705.1 0.6611196531617023 nan 300 / 3,282 400 / 3,282 500 / 3,282 600 / 3,282 700 / 3,282 800 / 3,282 900 / 3,282 1,000 / 3,282 1,100 / 3,282 1,200 / 3,282 1,300 / 3,282 1,400 / 3,282 1,500 / 3,282 1,600 / 3,282 1,700 / 3,282 1,800 / 3,282 1,900 / 3,282 2,000 / 3,282 2,100 / 3,282 2,200 / 3,282 2,300 / 3,282 2,400 / 3,282 2,500 / 3,282 2,600 / 3,282 2,700 / 3,282 2,800 / 3,282 2,900 / 3,282 3,000 / 3,282 3,100 / 3,282 3,200 / 3,282 CPU times: user 5min 37s, sys: 22.4 s, total: 5min 59s Wall time: 6min 8s
(-0.004329882242986181, 0.008983127220666189, -0.058914431101381504, 0.030915092541605182)
def compute_mean_adaptation_per_species(assembly_accessions):
cached_proteins = {}
res = collections.defaultdict(list)
for i, assembly_accession in enumerate(assembly_accessions):
if (i+1) % 100 == 0:
print(f'{i+1:,} / {len(assembly_accessions):,}')
trna_ai_path = os.path.join(os.getcwd(), f'data/trn_adaptation_index/{assembly_accession}_tai.csv')
trna_ai = pd.read_csv(trna_ai_path)
species_taxid = trna_ai['species_taxid'].iloc[0]
if species_taxid in cached_proteins:
proteins = cached_proteins[species_taxid]
else:
proteins = retrieve_orginal_proteins(
graph,
species_taxid=species_taxid,
include_neighbours=False,
)
cached_proteins[species_taxid] = proteins
mean = trna_ai['adaptation_index'].mean()
res[species_taxid].append(mean)
output = {}
for key in sorted(res.keys()):
output[key] = np.mean([m for m in res[key] if not pd.isnull(m)])
return output
%%time
mean_adaptation_per_species = compute_mean_adaptation_per_species(assembly_accessions)
100 / 3,282 200 / 3,282 300 / 3,282 400 / 3,282 500 / 3,282 600 / 3,282 700 / 3,282 800 / 3,282 900 / 3,282 1,000 / 3,282 1,100 / 3,282 1,200 / 3,282 1,300 / 3,282 1,400 / 3,282 1,500 / 3,282 1,600 / 3,282 1,700 / 3,282 1,800 / 3,282 1,900 / 3,282 2,000 / 3,282 2,100 / 3,282 2,200 / 3,282 2,300 / 3,282 2,400 / 3,282 2,500 / 3,282 2,600 / 3,282 2,700 / 3,282 2,800 / 3,282 2,900 / 3,282 3,000 / 3,282 3,100 / 3,282 3,200 / 3,282 CPU times: user 5min 53s, sys: 24 s, total: 6min 17s Wall time: 6min 26s
_, ax3 = plt.subplots(1, 1, figsize=(12, 6))
ax3.hist(list(mean_adaptation_per_species.values()), bins=100);
print(np.mean(list(mean_adaptation_per_species.values())))
ax3.set_title('Histogram of mean tRNA adaptation index across all species\n');
ax3.set_xlabel('Mean tRNA adaptation index');
ax3.set_ylabel('Species count');
0.7526456316462177
def compute_distributions_of_length(assembly_accessions):
cds_codon_bias_folder = os.path.join(os.getcwd(), 'data/cds_codon_bias/')
lengths_all = []
lengths_below = []
for i, assembly_accession in enumerate(assembly_accessions):
if (i+1) % 200 == 0:
print(f'{i+1} / {len(assembly_accessions)}')
path_all = os.path.join(cds_codon_bias_folder, f'all/{assembly_accession}_codon_bias.csv')
path_below = os.path.join(cds_codon_bias_folder, f'below_threshold/{assembly_accession}_codon_bias.csv')
df_all = pd.read_csv(path_all)
df_below = pd.read_csv(path_below)
lengths_all.extend(df_all['length'].values.tolist())
lengths_below.extend(df_below['length'].values.tolist())
return np.array(lengths_all), np.array(lengths_below)
%%time
lengths_all, lengths_below = compute_distributions_of_length(assembly_accessions)
200 / 3282 400 / 3282 600 / 3282 800 / 3282 1000 / 3282 1200 / 3282 1400 / 3282 1600 / 3282 1800 / 3282 2000 / 3282 2200 / 3282 2400 / 3282 2600 / 3282 2800 / 3282 3000 / 3282 3200 / 3282 CPU times: user 2min 8s, sys: 17.3 s, total: 2min 25s Wall time: 2min 30s
def plot_distributions(lengths_all, lengths_below):
palette = sns.color_palette()
_, axes = plt.subplots(1, 2, figsize=(20, 6))
ax1 = axes[0]
ax2 = axes[1]
ax1.hist(lengths_all, bins=100, log=True)
ax1.set_title('Length distribution (all)')
xlim = ax1.get_xlim()
ax2.hist(lengths_below, bins=100, color=palette[1], log=True)
ax2.set_title('Length distribution (below threshold)')
ax2.set_xlim(xlim)
return ax1, ax2
plot_distributions(lengths_all, lengths_below);
distance_matrix_path = os.path.join(os.getcwd(), 'data/distance_matrix.npy')
distance_matrix = np.load(distance_matrix_path, allow_pickle=True)
species_codon_df[ratio_columns]
AAA_ratio | AAG_ratio | AAT_ratio | AAC_ratio | ACT_ratio | ACC_ratio | ACA_ratio | ACG_ratio | CGT_ratio | CGC_ratio | ... | GTG_ratio | TAA_ratio | TAG_ratio | TGA_ratio | TAC_ratio | TAT_ratio | TGC_ratio | TGT_ratio | TTC_ratio | TTT_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
species_taxid | |||||||||||||||||||||
7 | 0.082550 | 0.917450 | 0.327724 | 0.672276 | 0.024511 | 0.575353 | 0.036262 | 0.363875 | 0.078732 | 0.615278 | ... | 0.603462 | 0.059996 | 0.093068 | 0.846937 | 0.387338 | 0.612662 | 0.903673 | 0.096327 | 0.897180 | 0.102820 |
9 | 0.919627 | 0.080373 | 0.858168 | 0.141832 | 0.454569 | 0.050026 | 0.446018 | 0.049387 | 0.325065 | 0.043907 | ... | 0.064608 | 0.790826 | 0.100917 | 0.108257 | 0.130608 | 0.869392 | 0.176688 | 0.823312 | 0.080224 | 0.919776 |
11 | 0.007302 | 0.992698 | 0.010730 | 0.989270 | 0.008586 | 0.425527 | 0.015789 | 0.550098 | 0.092597 | 0.559134 | ... | 0.480311 | 0.009482 | 0.085967 | 0.904551 | 0.981660 | 0.018340 | 0.966107 | 0.033893 | 0.994594 | 0.005406 |
14 | 0.664866 | 0.335134 | 0.775571 | 0.224429 | 0.458203 | 0.188052 | 0.304183 | 0.049563 | 0.035122 | 0.008361 | ... | 0.153708 | 0.563059 | 0.206130 | 0.230811 | 0.208150 | 0.791850 | 0.204657 | 0.795343 | 0.189441 | 0.810559 |
19 | 0.551810 | 0.448190 | 0.440559 | 0.559441 | 0.099134 | 0.591478 | 0.097796 | 0.211592 | 0.204394 | 0.366337 | ... | 0.334484 | 0.340503 | 0.164193 | 0.495304 | 0.486740 | 0.513260 | 0.636140 | 0.363860 | 0.502549 | 0.497451 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1813871 | 0.772490 | 0.227510 | 0.681741 | 0.318259 | 0.240711 | 0.192149 | 0.446839 | 0.120302 | 0.243327 | 0.187369 | ... | 0.205557 | 0.680140 | 0.164518 | 0.155342 | 0.300637 | 0.699363 | 0.476391 | 0.523609 | 0.250815 | 0.749185 |
1833852 | 0.589533 | 0.410467 | 0.567693 | 0.432307 | 0.202326 | 0.538744 | 0.183109 | 0.075821 | 0.195981 | 0.173208 | ... | 0.343317 | 0.582863 | 0.286616 | 0.130521 | 0.346503 | 0.653497 | 0.459139 | 0.540861 | 0.237507 | 0.762493 |
1835254 | 0.555758 | 0.444242 | 0.685138 | 0.314862 | 0.374892 | 0.272930 | 0.209890 | 0.142287 | 0.341983 | 0.323291 | ... | 0.254571 | 0.660029 | 0.156387 | 0.183584 | 0.391660 | 0.608340 | 0.547508 | 0.452492 | 0.294507 | 0.705493 |
1945662 | 0.131137 | 0.868863 | 0.229159 | 0.770841 | 0.020880 | 0.561770 | 0.044037 | 0.373313 | 0.050484 | 0.587127 | ... | 0.458268 | 0.073481 | 0.117497 | 0.809021 | 0.314323 | 0.685677 | 0.909613 | 0.090387 | 0.738885 | 0.261115 |
1985254 | 0.163778 | 0.836222 | 0.293950 | 0.706050 | 0.048542 | 0.627622 | 0.044654 | 0.279182 | 0.150186 | 0.649024 | ... | 0.619580 | 0.197077 | 0.142955 | 0.659968 | 0.667897 | 0.332103 | 0.870500 | 0.129500 | 0.803614 | 0.196386 |
2680 rows × 62 columns
def make_uniform_bias(codon_to_aa_table, aa_to_codons, ratio_columns):
uniform_bias = np.zeros((len(ratio_columns),))
for i, col in enumerate(ratio_columns):
codon = col.replace('_ratio', '')
aa = codon_to_aa_table[codon]
synonyms = aa_to_codons[aa]
uniform_bias[i] = 1 / len(synonyms)
return uniform_bias
uniform_bias = make_uniform_bias(codon_to_aa_table, aa_to_codons, ratio_columns)
uniform_bias
array([0.5 , 0.5 , 0.5 , 0.5 , 0.25 , 0.25 , 0.25 , 0.25 , 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.33333333, 0.33333333, 0.33333333, 0.5 , 0.5 , 0.5 , 0.5 , 0.25 , 0.25 , 0.25 , 0.25 , 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.5 , 0.5 , 0.5 , 0.5 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.25 , 0.33333333, 0.33333333, 0.33333333, 0.5 , 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ])
def compute_distance_to_uniform_bias(species_codon_df, uniform_bias, ratio_columns):
X = species_codon_df[ratio_columns].values
return compute_codon_bias_distance(X, uniform_bias)
distance_to_uniform = compute_distance_to_uniform_bias(species_codon_df, uniform_bias, ratio_columns)
species_codon_df['uniform_dist'] = distance_to_uniform
species_codon_df.plot.scatter(
x='growth_tmp',
y='uniform_dist',
figsize=(12, 6),
);
2020-12-04 12:34:41,529 (WARNING) *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2-D array with a single row if you intend to specify the same RGB or RGBA value for all points.
species_codon_df.loc[[562, 2336]][['uniform_dist']]
uniform_dist | |
---|---|
species_taxid | |
562 | 0.145268 |
2336 | 0.133543 |