In [1]:

import collections
import os
import json
import logging
import string
import re

from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import networkx as nx

if os.getcwd().endswith('notebook'):
    os.chdir('..')

from rna_learn.codon_bias.process_domains import compute_protein_domain_scores

In [2]:

sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)

In [3]:

db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')

Match¶

In [4]:

def compute_match_score(engine):
    q = """
    select assembly_accession from assembly_source
    """
    assembly_accessions = pd.read_sql(q, engine)['assembly_accession'].values
    
    count = 0
    for assembly in assembly_accessions:
        protein_domains_path = os.path.join(
            os.getcwd(), 
            f'data/domains/codon_bias/pfam/{assembly}_protein_domains.csv',
        )
        if os.path.isfile(protein_domains_path):
            count += 1
            
    return count, 100 * count / len(assembly_accessions)

In [5]:

compute_match_score(engine)

Out[5]:

(1217, 37.08104814137721)

Thermotoga maritima¶

In [6]:

species_taxid = 2336

In [160]:

q = """
select assembly_accession, species_taxid from assembly_source
where species_taxid = ?
"""
assembly_accession = pd.read_sql(q, engine, params=(species_taxid,))['assembly_accession'].iloc[0]
assembly_accession

Out[160]:

'GCA_000008545.1'

In [161]:

protein_domains_path = os.path.join(
    os.getcwd(), 
    f'data/domains/codon_bias/pfam/{assembly_accession}_protein_domains.csv',
)
thermotoga_domains = pd.read_csv(protein_domains_path)
thermotoga_domains.head(20)

Out[161]:

	assembly_accession	protein_id	record_type	pfam_query	pfam_accession	protein_label	below_threshold
0	GCA_000008545.1	AAD36281.1	pfam	MrpF_PhaF	PF04066.13	conserved-hypothetical-protein	False
1	GCA_000008545.1	AAD36013.1	pfam	Mrr_cat	PF04471.12	conserved-hypothetical-protein	False
2	GCA_000008545.1	AAD35343.1	pfam	Mrr_cat	PF04471.12	conserved-hypothetical-protein	False
3	GCA_000008545.1	AAD35100.1	pfam	MR_MLE_C	PF13378.6	muconate-cycloisomerase	False
4	GCA_000008545.1	AAD35100.1	pfam	MR_MLE_N	PF02746.16	muconate-cycloisomerase	False
5	GCA_000008545.1	AAD36629.1	pfam	MS_channel	PF00924.18	conserved-hypothetical-protein	False
6	GCA_000008545.1	AAD35703.1	pfam	MTES_1575	PF18741.1	conserved-hypothetical-protein	False
7	GCA_000008545.1	AAD35359.1	pfam	MTHFR	PF02219.17	conserved-hypothetical-protein	False
8	GCA_000008545.1	AAD36731.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
9	GCA_000008545.1	AAD36156.1	pfam	MTS	PF05175.14	ribosomal-protein-L11-methyltransferase,-putative	False
10	GCA_000008545.1	AAD35573.1	pfam	MTS	PF05175.14	hemK-protein	False
11	GCA_000008545.1	AAD35773.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
12	GCA_000008545.1	AAD36458.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
13	GCA_000008545.1	AAD36019.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
14	GCA_000008545.1	AAD35835.1	pfam	MTS	PF05175.14	ubiquinone/menaquinone-biosynthesis-methyltran...	False
15	GCA_000008545.1	AAD36764.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
16	GCA_000008545.1	AAD35789.1	pfam	MTS	PF05175.14	glucose-inhibited-division-protein-B	False
17	GCA_000008545.1	AAD35954.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
18	GCA_000008545.1	AAD35829.1	pfam	MTS	PF05175.14	conserved-hypothetical-protein	False
19	GCA_000008545.1	AAD36681.1	pfam	Mt_ATP-synt_B	PF05405.14	ATP-synthase-F0,-subunit-b	False

In [162]:

v = thermotoga_domains[thermotoga_domains['below_threshold']]
100 * len(v) / len(thermotoga_domains)

Out[162]:

6.539035305818001

In [163]:

all_counts = thermotoga_domains[['pfam_query', 'pfam_accession']].groupby('pfam_query').count()
all_counts.columns = ['count_all']
all_counts = all_counts.sort_values('count_all', ascending=False)
all_counts.head()

Out[163]:

	count_all
pfam_query
ABC_tran	65
BPD_transp_1	48
AAA_21	41
HD	36
Radical_SAM	33

In [164]:

below_threshold_counts = thermotoga_domains[
    thermotoga_domains['below_threshold']
][['pfam_query', 'pfam_accession']].groupby('pfam_query').count()
below_threshold_counts.columns = ['count_below']
below_threshold_counts = below_threshold_counts.sort_values('count_below', ascending=False)
below_threshold_counts.head()

Out[164]:

	count_below
pfam_query
Helicase_C	6
DHH	4
DHHA1	4
AAA	4
HD	4

In [165]:

counts = pd.merge(
    all_counts,
    below_threshold_counts,
    how='left',
    on='pfam_query',
)
counts['count_below'] = counts['count_below'].fillna(0).astype(int)
counts.head()

Out[165]:

	count_all	count_below
pfam_query
ABC_tran	65	3
BPD_transp_1	48	0
AAA_21	41	2
HD	36	4
Radical_SAM	33	1

In [166]:

counts['frequency_weight'] = counts['count_below'] / counts['count_all']

In [167]:

counts['species_score'] = np.sqrt(counts['count_below']) * counts['frequency_weight']

In [168]:

counts.sort_values('species_score', ascending=False).head(20)

Out[168]:

	count_all	count_below	frequency_weight	species_score
pfam_query
DHH	4	4	1.0	2.000000
DHHA1	4	4	1.0	2.000000
Glyco_hydro_38N	2	2	1.0	1.414214
Glycos_transf_3	2	2	1.0	1.414214
Glycos_trans_3N	2	2	1.0	1.414214
HD_4	2	2	1.0	1.414214
PAS_8	2	2	1.0	1.414214
GMC_oxred_N	2	2	1.0	1.414214
RNA_pol_Rpb2_45	1	1	1.0	1.000000
MdcE	1	1	1.0	1.000000
LON_substr_bdg	1	1	1.0	1.000000
YscJ_FliF	1	1	1.0	1.000000
DUF1156	1	1	1.0	1.000000
Gate	1	1	1.0	1.000000
DNA_topoisoIV	1	1	1.0	1.000000
PYNP_C	1	1	1.0	1.000000
RelA_SpoT	1	1	1.0	1.000000
Glyco_hydro_35	1	1	1.0	1.000000
FeoB_Cyto	1	1	1.0	1.000000
Glyco_hydro_36	1	1	1.0	1.000000

All species¶

In [16]:

def compute_protein_domain_score(engine, query_type):
    if query_type not in ('pfam', 'tigr'):
        raise ValueError('Query type must be one of (pfam, tigr)')
        
    q = """
    select a.assembly_accession, s.phylum from assembly_source as a
    left join species_traits as s on s.species_taxid = a.species_taxid
    """
    df = pd.read_sql(q, engine)
    assembly_accessions = df['assembly_accession'].values
    phyla = df['phylum'].values
    
    logger.info(f'Counting protein domains for {len(assembly_accessions):,} assemblies')
    
    n_assemblies = len(assembly_accessions)
    n_assemblies_present = 0
    
    domain_to_phyla = collections.defaultdict(set)
    domain_to_score = collections.defaultdict(int)
    domain_count = collections.defaultdict(int)
    domain_count_top = collections.defaultdict(int)
    for i, assembly in enumerate(assembly_accessions):
        if (i+1) % 200 == 0:
            logger.info(f'{i+1:,} / {n_assemblies:,}')
            
        protein_domains_path = os.path.join(
            os.getcwd(), 
            f'data/domains/codon_bias/{query_type}/{assembly}_protein_domains.csv',
        )
        if not os.path.isfile(protein_domains_path):
            continue
        else:
            n_assemblies_present += 1
        
        protein_domains = pd.read_csv(protein_domains_path)
        
        phylum = phyla[i]
        
        all_counts = protein_domains[['pfam_query', 'pfam_accession']].groupby('pfam_query').count()
        all_counts.columns = ['count_all']
        
        below_threshold_counts = protein_domains[
            protein_domains['below_threshold']
        ][['pfam_query', 'pfam_accession']].groupby('pfam_query').count()
        below_threshold_counts.columns = ['count_below']
        
        counts = pd.merge(
            all_counts,
            below_threshold_counts,
            how='left',
            on='pfam_query',
        )
        counts['count_below'] = counts['count_below'].fillna(0).astype(int)
        
        counts['frequency_weight'] = counts['count_below'] / counts['count_all']
        counts['assembly_score'] = np.sqrt(counts['count_below']) * counts['frequency_weight']
        
        for pfam_query in counts.index:
            domain_to_phyla[pfam_query].add(phylum)
            domain_to_score[pfam_query] += counts.loc[pfam_query, 'assembly_score']
            domain_count[pfam_query] += 1
            if counts.loc[pfam_query, 'count_below'] > 0:
                domain_count_top[pfam_query] += 1
        
    query_key = f'{query_type}_query'
    sorted_queries = sorted(domain_to_score.keys())
    data = {
        query_key: sorted_queries,
        'n_phylum': [len(domain_to_phyla[k]) for k in sorted_queries],
        'assembly_score_sum': [domain_to_score[k] for k in sorted_queries],
        'assembly_count': [domain_count[k] for k in sorted_queries],
        'assembly_count_top': [domain_count_top[k] for k in sorted_queries],
        'score': [domain_to_score[k] / n_assemblies_present for k in sorted_queries],
    }
    output_df = pd.DataFrame.from_dict(data).set_index(query_key)
    
    return output_df.sort_values(['score', 'assembly_count'], ascending=False)

In [17]:

def compute_query_to_most_common_label(engine, query_type):
    if query_type not in ('pfam', 'tigr'):
        raise ValueError('Query type must be one of (pfam, tigr)')
        
    q = """
    select assembly_accession from assembly_source
    """
    assembly_accessions = pd.read_sql(q, engine)['assembly_accession'].values
    
    logger.info(f'Finding most common protein labels per query for {len(assembly_accessions):,} assemblies')
    
    query_to_protein_labels = {}
    for i, assembly in enumerate(assembly_accessions):
        if (i+1) % 200 == 0:
            logger.info(f'{i+1:,} / {len(assembly_accessions):,}')
            
        protein_domains_path = os.path.join(
            os.getcwd(), 
            f'data/domains/codon_bias/{query_type}/{assembly}_protein_domains.csv',
        )
        if not os.path.isfile(protein_domains_path):
            continue
        
        protein_domains = pd.read_csv(protein_domains_path)
        
        for tpl in protein_domains.itertuples():
            query, label = tpl.pfam_query, tpl.protein_label
            
            if pd.isnull(label):
                label = 'Unknown'
                
            label = label.strip()
                
            if query not in query_to_protein_labels:
                query_to_protein_labels[query] = {
                    label: 1,
                }
            elif label not in query_to_protein_labels[query]:
                query_to_protein_labels[query][label] = 1
            else:
                query_to_protein_labels[query][label] += 1
    
    query_to_most_common_label = {}
    for query in sorted(query_to_protein_labels.keys()):
        label_counts = [(k, v) for k, v in query_to_protein_labels[query].items()]
        
        sorted_labels = sorted(label_counts, key=lambda t: t[1], reverse=True)
        
        query_to_most_common_label[query] = sorted_labels[0][0]
        
    return query_to_most_common_label

In [18]:

%%time
pfam_counts = compute_protein_domain_score(engine, query_type='pfam')

2020-12-07 14:01:44,412 (INFO) Counting protein domains for 3,282 assemblies
2020-12-07 14:01:52,142 (INFO) 200 / 3,282
2020-12-07 14:02:00,214 (INFO) 400 / 3,282
2020-12-07 14:02:07,345 (INFO) 600 / 3,282
2020-12-07 14:02:15,108 (INFO) 800 / 3,282
2020-12-07 14:02:20,804 (INFO) 1,000 / 3,282
2020-12-07 14:02:26,130 (INFO) 1,200 / 3,282
2020-12-07 14:02:31,222 (INFO) 1,400 / 3,282
2020-12-07 14:02:35,316 (INFO) 1,600 / 3,282
2020-12-07 14:02:39,263 (INFO) 1,800 / 3,282
2020-12-07 14:02:42,761 (INFO) 2,000 / 3,282
2020-12-07 14:02:45,631 (INFO) 2,200 / 3,282
2020-12-07 14:02:48,356 (INFO) 2,400 / 3,282
2020-12-07 14:02:49,304 (INFO) 2,600 / 3,282
2020-12-07 14:02:49,310 (INFO) 2,800 / 3,282
2020-12-07 14:02:51,095 (INFO) 3,000 / 3,282
2020-12-07 14:02:54,820 (INFO) 3,200 / 3,282

CPU times: user 1min 5s, sys: 2.72 s, total: 1min 8s
Wall time: 1min 11s

In [19]:

pfam_query_to_most_common_label = compute_query_to_most_common_label(engine, query_type='pfam')

2020-12-07 14:02:55,655 (INFO) Finding most common protein labels per query for 3,282 assemblies
2020-12-07 14:02:58,728 (INFO) 200 / 3,282
2020-12-07 14:03:02,201 (INFO) 400 / 3,282
2020-12-07 14:03:05,323 (INFO) 600 / 3,282
2020-12-07 14:03:08,473 (INFO) 800 / 3,282
2020-12-07 14:03:11,025 (INFO) 1,000 / 3,282
2020-12-07 14:03:13,288 (INFO) 1,200 / 3,282
2020-12-07 14:03:15,415 (INFO) 1,400 / 3,282
2020-12-07 14:03:17,200 (INFO) 1,600 / 3,282
2020-12-07 14:03:18,968 (INFO) 1,800 / 3,282
2020-12-07 14:03:20,582 (INFO) 2,000 / 3,282
2020-12-07 14:03:21,953 (INFO) 2,200 / 3,282
2020-12-07 14:03:23,217 (INFO) 2,400 / 3,282
2020-12-07 14:03:23,662 (INFO) 2,600 / 3,282
2020-12-07 14:03:23,669 (INFO) 2,800 / 3,282
2020-12-07 14:03:24,465 (INFO) 3,000 / 3,282
2020-12-07 14:03:26,171 (INFO) 3,200 / 3,282

In [20]:

pfam_labels = [pfam_query_to_most_common_label[k] for k in pfam_counts.index]
pfam_counts['label'] = pfam_labels

In [21]:

pfam_counts.head(50)

Out[21]:

	n_phylum	assembly_score_sum	assembly_count	assembly_count_top	score	label
pfam_query
HHH_6	31	1285.177049	1212	921	1.056021	DNA-polymerase-III,-alpha-subunit
Helicase_C	31	1253.404987	1217	1160	1.029914	DEAD/DEAH-box-helicase-domain-protein
DNA_pol3_alpha	31	954.846423	1211	927	0.784590	DNA-polymerase-III,-alpha-subunit
DNA_pol3_finger	31	954.682354	1211	926	0.784456	DNA-polymerase-III,-alpha-subunit
HATPase_c	31	866.584429	1217	1078	0.712066	histidine-kinase
DEAD	31	847.189268	1217	1145	0.696129	DEAD/DEAH-box-helicase-domain-protein
ResIII	31	805.120859	1217	1149	0.661562	ATP-dependent-DNA-helicase-RecG
HisKA	31	772.113047	1184	956	0.634440	histidine-kinase
HD_4	29	739.445267	1181	623	0.607597	GTP-pyrophosphokinase
UvrD_C	31	735.901076	1215	975	0.604685	UvrD/REP-helicase
TRCF	31	735.242641	1153	735	0.604144	transcription-repair-coupling-factor
PDDEXK_1	31	706.002791	1145	803	0.580117	CRISPR-associated-protein-Cas4
HHH_2	30	699.163542	1189	642	0.574498	DNA-ligase,-NAD-dependent
UvrD-helicase	31	696.531793	1214	976	0.572335	UvrD/REP-helicase
UvrA_DNA-bind	31	693.821033	1210	729	0.570108	excinuclease-ABC-subunit-A
UvrD_C_2	31	680.867160	1215	977	0.559464	UvrD/REP-helicase
UvrA_inter	31	680.347130	1209	697	0.559036	excinuclease-ABC-subunit-A
PAS_4	31	676.698027	1013	809	0.556038	PAS/PAC-sensor-signal-transduction-histidine-k...
PAS	31	658.108907	1045	816	0.540763	PAS/PAC-sensor-signal-transduction-histidine-k...
AAA_19	31	648.212791	1216	1035	0.532632	UvrD/REP-helicase
E1-E2_ATPase	31	645.665463	1182	918	0.530539	heavy-metal-translocating-P-type-ATPase
PAS_9	31	642.209875	969	767	0.527699	PAS-domain-S-box-containing-protein
Transpeptidase	30	627.626578	1175	943	0.515716	penicillin-binding-protein
FtsK_alpha	27	623.682083	1133	632	0.512475	cell-division-protein-FtsK
PHP	31	619.833305	1216	927	0.509312	DNA-polymerase-III,-alpha-subunit
PAS_8	31	618.235339	972	737	0.507999	PAS/PAC-sensor-signal-transduction-histidine-k...
FtsK_gamma	26	614.244180	1130	629	0.504720	cell-division-protein-FtsK
PAS_3	30	611.126491	870	667	0.502158	diguanylate-cyclase/phosphodiesterase-with-PAS...
FtsK_SpoIIIE	27	605.743321	1143	694	0.497735	cell-division-protein-FtsK
UvrB_inter	31	589.068602	1214	807	0.484033	transcription-repair-coupling-factor
DNA_pol_A	31	587.270647	1194	621	0.482556	DNA-polymerase-I
CarD_CdnL_TRCF	31	566.578427	1162	737	0.465553	transcription-repair-coupling-factor
MutS_V	31	565.436599	1023	651	0.464615	DNA-mismatch-repair-protein-MutS
GXGXG	28	561.463566	849	422	0.461351	glutamate-synthase
MutS_I	30	550.990188	984	552	0.452745	DNA-mismatch-repair-protein-MutS
5_3_exonuc	31	548.909307	1208	612	0.451035	DNA-polymerase-I
5_3_exonuc_N	31	548.409307	1209	612	0.450624	DNA-polymerase-I
Anticodon_1	31	547.675312	1216	730	0.450021	leucyl-tRNA-synthetase
Hpt	27	546.242679	830	510	0.448844	CheA-signal-transduction-histidine-kinase
Topoisom_bac	31	544.492580	1199	639	0.447406	DNA-topoisomerase-I
MutS_III	30	542.707520	985	559	0.445939	DNA-mismatch-repair-protein-MutS
MutS_IV	30	542.242641	968	543	0.445557	DNA-mismatch-repair-protein-MutS
AAA_16	31	538.848836	1215	909	0.442768	ATP-dependent-chaperone-ClpB
Viral_helicase1	31	536.158880	1196	810	0.440558	UvrD/REP-helicase
ACR_tran	29	534.647959	940	720	0.439316	acriflavin-resistance-protein
PriA_3primeBD	30	531.914214	1138	532	0.437070	primosomal-protein-N'
MutS_II	29	530.685450	948	530	0.436060	DNA-mismatch-repair-protein-MutS
Molybdopterin	27	525.168247	901	674	0.431527	molybdopterin-oxidoreductase
B5	31	514.914214	1201	515	0.423101	phenylalanyl-tRNA-synthetase,-beta-subunit
FDX-ACB	31	513.414214	1176	514	0.421869	phenylalanyl-tRNA-synthetase,-beta-subunit

In [38]:

100 * len(pfam_counts[pfam_counts['score'] >= 0.1]) / len(pfam_counts)

Out[38]:

4.816451700189411

In [50]:

_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist(pfam_counts['score'].values, bins=50, log=True);
pfam_threshold = np.percentile(pfam_counts['score'].values, 95)
ax.axvline(pfam_threshold, color='red')
ax.set_xlabel('Score (%)')
ax.set_ylabel('Pfam entry count')
ax.set_title('Distribution of scores for all Pfam entries');

In [46]:

Out[46]:

0.09701828073063584

In [ ]:

In [24]:

%%time
tigr_counts = compute_protein_domain_score(engine, query_type='tigr')

2020-12-07 14:03:27,845 (INFO) Counting protein domains for 3,282 assemblies
2020-12-07 14:03:31,651 (INFO) 200 / 3,282
2020-12-07 14:03:35,309 (INFO) 400 / 3,282
2020-12-07 14:03:38,640 (INFO) 600 / 3,282
2020-12-07 14:03:41,973 (INFO) 800 / 3,282
2020-12-07 14:03:44,667 (INFO) 1,000 / 3,282
2020-12-07 14:03:47,189 (INFO) 1,200 / 3,282
2020-12-07 14:03:49,559 (INFO) 1,400 / 3,282
2020-12-07 14:03:51,417 (INFO) 1,600 / 3,282
2020-12-07 14:03:53,149 (INFO) 1,800 / 3,282
2020-12-07 14:03:54,717 (INFO) 2,000 / 3,282
2020-12-07 14:03:55,988 (INFO) 2,200 / 3,282
2020-12-07 14:03:57,172 (INFO) 2,400 / 3,282
2020-12-07 14:03:57,568 (INFO) 2,600 / 3,282
2020-12-07 14:03:57,575 (INFO) 2,800 / 3,282
2020-12-07 14:03:58,356 (INFO) 3,000 / 3,282
2020-12-07 14:03:59,967 (INFO) 3,200 / 3,282

CPU times: user 30.7 s, sys: 752 ms, total: 31.4 s
Wall time: 32.4 s

In [37]:

100 * len(tigr_counts[tigr_counts['score'] > 0.1]) / len(tigr_counts)

Out[37]:

3.6013187927973624

In [26]:

tigr_query_to_most_common_label = compute_query_to_most_common_label(engine, query_type='tigr')

2020-12-07 14:04:00,269 (INFO) Finding most common protein labels per query for 3,282 assemblies
2020-12-07 14:04:01,223 (INFO) 200 / 3,282
2020-12-07 14:04:02,186 (INFO) 400 / 3,282
2020-12-07 14:04:03,026 (INFO) 600 / 3,282
2020-12-07 14:04:03,900 (INFO) 800 / 3,282
2020-12-07 14:04:04,585 (INFO) 1,000 / 3,282
2020-12-07 14:04:05,246 (INFO) 1,200 / 3,282
2020-12-07 14:04:05,907 (INFO) 1,400 / 3,282
2020-12-07 14:04:06,389 (INFO) 1,600 / 3,282
2020-12-07 14:04:06,881 (INFO) 1,800 / 3,282
2020-12-07 14:04:07,348 (INFO) 2,000 / 3,282
2020-12-07 14:04:07,736 (INFO) 2,200 / 3,282
2020-12-07 14:04:08,086 (INFO) 2,400 / 3,282
2020-12-07 14:04:08,212 (INFO) 2,600 / 3,282
2020-12-07 14:04:08,219 (INFO) 2,800 / 3,282
2020-12-07 14:04:08,455 (INFO) 3,000 / 3,282
2020-12-07 14:04:08,934 (INFO) 3,200 / 3,282

In [27]:

tigr_labels = [tigr_query_to_most_common_label[k] for k in tigr_counts.index]

In [28]:

tigr_counts['label'] = tigr_labels
tigr_counts.head(20)

Out[28]:

	n_phylum	assembly_score_sum	assembly_count	assembly_count_top	score	label
tigr_query
TIGR00594	31	845.086076	1190	870	0.694401	DNA-polymerase-III,-alpha-subunit
TIGR00580	31	728.000000	1138	728	0.598192	transcription-repair-coupling-factor
TIGR00229	31	704.536214	1033	831	0.578912	PAS-domain-S-box-containing-protein
TIGR00630	31	694.991743	1209	722	0.571070	excinuclease-ABC-subunit-A
TIGR01494	31	644.297111	1182	918	0.529414	heavy-metal-translocating-P-type-ATPase
TIGR00593	31	611.000000	1181	611	0.502054	DNA-polymerase-I
TIGR00691	27	559.426407	1171	608	0.459677	GTP-pyrophosphokinase
TIGR01070	30	542.828427	964	543	0.446038	DNA-mismatch-repair-protein-MutS
TIGR02168	31	527.000000	906	527	0.433032	chromosome-segregation-protein-SMC
TIGR00472	31	514.914214	1179	515	0.423101	phenylalanyl-tRNA-synthetase,-beta-subunit
TIGR00575	29	507.914214	1176	510	0.417349	DNA-ligase,-NAD-dependent
TIGR01525	31	503.028299	1137	719	0.413335	heavy-metal-translocating-P-type-ATPase
TIGR01512	31	500.463291	1130	707	0.411227	heavy-metal-translocating-P-type-ATPase
TIGR01051	29	494.833333	1113	498	0.406601	DNA-topoisomerase-I
TIGR01511	31	494.121129	1115	642	0.406016	heavy-metal-translocating-P-type-ATPase
TIGR00595	28	491.414214	991	491	0.403791	primosomal-protein-N'
TIGR00344	31	485.000000	1209	485	0.398521	alanyl-tRNA-synthetase
TIGR00392	31	462.985281	1206	461	0.380432	isoleucyl-tRNA-synthetase
TIGR00254	31	444.075262	894	696	0.364893	diguanylate-cyclase
TIGR01369	28	430.369216	1076	447	0.353631	carbamoyl-phosphate-synthase-large-subunit

In [49]:

_, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.hist(tigr_counts['score'].values, bins=20, log=True);
tigr_thresold = np.percentile(tigr_counts['score'].values, 95)
ax.axvline(tigr_thresold, color='red')
ax.set_xlabel('Score (%)')
ax.set_ylabel('TIGR entry count')
ax.set_title('Distribution of scores for all TIGR entries');

In [51]:

score_threshold = 0.05
base_path = os.path.join(os.getcwd(), 'data/domains/codon_bias/')
pfam_counts[pfam_counts['score'] >= pfam_threshold].to_excel(os.path.join(base_path, 'pfam_top.xlsx'))
tigr_counts[tigr_counts['score'] >= tigr_thresold].to_excel(os.path.join(base_path, 'tigr_top.xlsx'))

Validation: Protein id match¶

Let's make sure IDs properly match and we are not simply seeing an artefact of proteins that joined

In [31]:

def check_protein_matching(engine, query_type):
    if query_type not in ('pfam', 'tigr'):
        raise ValueError('Query type must be one of (pfam, tigr)')
        
    q = """
    select assembly_accession from assembly_source
    """
    assembly_accessions = pd.read_sql(q, engine)['assembly_accession'].values
    
    logger.info(f'Checking {query_type} protein ID match for {len(assembly_accessions):,} assemblies')
    
    matching_scores = {}
    for i, assembly in enumerate(assembly_accessions):
        if (i+1) % 200 == 0:
            logger.info(f'{i+1:,} / {len(assembly_accessions):,}')
            
        protein_domains_path = os.path.join(
            os.getcwd(), 
            f'data/domains/codon_bias/{query_type}/{assembly}_protein_domains.csv',
        )
        if not os.path.isfile(protein_domains_path):
            continue
        
        protein_domains = pd.read_csv(protein_domains_path)
        
        protein_query = """
        select metadata_json from sequences where sequence_type = 'CDS' and assembly_accession = ?
        """
        cds_metadata_df = pd.read_sql(protein_query, engine, params=(assembly,))
        metadata = [json.loads(v) for v in cds_metadata_df['metadata_json'].values if not pd.isnull(v)]
        
        cds_protein_ids = {
            m['protein_id'].strip() for m in metadata
            if m.get('protein_id') is not None
        }
        query_protein_ids = set([p.strip() for p in protein_domains['protein_id'].values if not pd.isnull(p)])
        
        matching_score = 100 * len(cds_protein_ids & query_protein_ids) / len(query_protein_ids)
        matching_scores[assembly] = matching_score
        
    return matching_scores

In [32]:

pfam_matching_scores = check_protein_matching(engine, query_type='pfam')

2020-12-07 14:04:10,201 (INFO) Checking pfam protein ID match for 3,282 assemblies
2020-12-07 14:04:18,377 (INFO) 200 / 3,282
2020-12-07 14:04:27,256 (INFO) 400 / 3,282
2020-12-07 14:04:35,071 (INFO) 600 / 3,282
2020-12-07 14:04:43,468 (INFO) 800 / 3,282
2020-12-07 14:04:50,303 (INFO) 1,000 / 3,282
2020-12-07 14:04:56,335 (INFO) 1,200 / 3,282
2020-12-07 14:05:02,157 (INFO) 1,400 / 3,282
2020-12-07 14:05:06,842 (INFO) 1,600 / 3,282
2020-12-07 14:05:11,228 (INFO) 1,800 / 3,282
2020-12-07 14:05:15,209 (INFO) 2,000 / 3,282
2020-12-07 14:05:18,400 (INFO) 2,200 / 3,282
2020-12-07 14:05:21,498 (INFO) 2,400 / 3,282
2020-12-07 14:05:22,629 (INFO) 2,600 / 3,282
2020-12-07 14:05:22,635 (INFO) 2,800 / 3,282
2020-12-07 14:05:24,594 (INFO) 3,000 / 3,282
2020-12-07 14:05:28,438 (INFO) 3,200 / 3,282

In [33]:

tigr_matching_scores = check_protein_matching(engine, query_type='tigr')

2020-12-07 14:05:29,092 (INFO) Checking tigr protein ID match for 3,282 assemblies
2020-12-07 14:05:36,206 (INFO) 200 / 3,282
2020-12-07 14:05:44,101 (INFO) 400 / 3,282
2020-12-07 14:05:50,840 (INFO) 600 / 3,282
2020-12-07 14:05:58,058 (INFO) 800 / 3,282
2020-12-07 14:06:03,764 (INFO) 1,000 / 3,282
2020-12-07 14:06:09,050 (INFO) 1,200 / 3,282
2020-12-07 14:06:14,001 (INFO) 1,400 / 3,282
2020-12-07 14:06:18,021 (INFO) 1,600 / 3,282
2020-12-07 14:06:21,759 (INFO) 1,800 / 3,282
2020-12-07 14:06:25,215 (INFO) 2,000 / 3,282
2020-12-07 14:06:27,934 (INFO) 2,200 / 3,282
2020-12-07 14:06:30,548 (INFO) 2,400 / 3,282
2020-12-07 14:06:31,517 (INFO) 2,600 / 3,282
2020-12-07 14:06:31,524 (INFO) 2,800 / 3,282
2020-12-07 14:06:33,182 (INFO) 3,000 / 3,282
2020-12-07 14:06:36,440 (INFO) 3,200 / 3,282

In [34]:

outlier_threshold = 90
outlier_assemblies = {a for a in pfam_matching_scores.keys() if pfam_matching_scores[a] < outlier_threshold}
outlier_assemblies |= {a for a in tigr_matching_scores.keys() if tigr_matching_scores[a] < outlier_threshold}
len(outlier_assemblies)

Out[34]:

In [35]:

sorted(outlier_assemblies)

Out[35]:

['GCA_000011225.1',
 'GCA_000011445.1',
 'GCA_000012765.1',
 'GCA_000063605.1',
 'GCA_000092585.1',
 'GCA_000183365.1',
 'GCA_000183385.1',
 'GCA_000195875.1',
 'GCA_000328725.1',
 'GCA_000400935.1',
 'GCA_000400955.1',
 'GCA_000439435.1',
 'GCA_000439455.1',
 'GCA_000500935.1',
 'GCA_000565175.1',
 'GCA_000565215.1',
 'GCA_001029265.1',
 'GCA_001262715.1',
 'GCA_001267155.1',
 'GCA_001274875.1',
 'GCA_001281045.1',
 'GCA_001715535.1',
 'GCA_001886855.1',
 'GCA_001900245.1',
 'GCA_001998865.1',
 'GCA_002163585.1',
 'GCA_002237575.1',
 'GCA_002803985.1',
 'GCA_002804105.1',
 'GCA_002813555.1',
 'GCA_002865545.1',
 'GCA_003339775.1',
 'GCA_003363775.1']

In [55]:

q = """
select a.assembly_accession, s.species_taxid, s.species, s.phylum from assembly_source as a
left join species_traits as s on s.species_taxid = a.species_taxid
"""
df = pd.read_sql(q, engine, index_col='assembly_accession')
ix = set(df.index.tolist()) - set(outlier_assemblies)
phyla = df.loc[ix]['phylum'].unique()
len(phyla)

Out[55]:

In [56]:

df[df['phylum'].isnull()]

Out[56]:

	species_taxid	species	phylum
assembly_accession
GCA_000025005.1	166501	Thermobaculum terrenum	None

Top Gene Ontology (GO) categories¶

In [107]:

pfam2go_path = os.path.join(os.getcwd(), 'data/domains/Pfam2go.txt')
pfam_results = pd.read_excel(
    os.path.join(os.getcwd(), f'data/domains/codon_bias/pfam_top.xlsx'),
    index_col='pfam_query',
)

In [108]:

def parse_pfam_to_go_file(path):
    line_re = r'^Pfam:([^\s]+) ([^>]+) > GO:([^;]+) ; GO:([0-9]+)$'
    domain_to_go = collections.defaultdict(list)
    with open(path, 'r') as f:
        for line in f:
            if not line.strip() or line.startswith('!'):
                continue
                
            m = re.match(line_re, line)
            if m:
                pfam_id = m[1].strip()
                query =  m[2].strip()
                go_label = m[3].strip()
                go_id = m[4].strip()
                
                domain_to_go[query].append((go_id, go_label))
                
    return dict(domain_to_go)

In [109]:

domain_to_go = parse_pfam_to_go_file(pfam2go_path)

In [9]:

domain_to_go['Helicase_C_2']

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-316f32008b92> in <module>
----> 1 domain_to_go['Helicase_C_2']

NameError: name 'domain_to_go' is not defined

In [111]:

def compute_top_go_categories(pfam_results, domain_to_go):
    data = {
        'go_id': [],
        'go_label': [],
        'count': [],
    }
    matching = 0
    go_id_count = collections.defaultdict(int)
    go_id_to_label = {}
    for domain in pfam_results.index:
        if domain not in domain_to_go:
            continue
        else:
            matching += 1
            
        go_data = domain_to_go[domain]
        
        for go_id, go_label in go_data:
            go_id_count[go_id] += 1
            go_id_to_label[go_id] = go_label
            
    for go_id in sorted(go_id_count.keys()):
        data['go_id'].append(go_id)
        data['go_label'].append(go_id_to_label[go_id])
        data['count'].append(go_id_count[go_id])
            
    print(f'{matching} / {len(pfam_results)} ({100 * matching / len(pfam_results):.0f}%) matching domains with go')
    
    return pd.DataFrame.from_dict(data).set_index('go_id').sort_values('count', ascending=False)

In [112]:

go_df = compute_top_go_categories(pfam_results, domain_to_go)
go_df.head(20)

245 / 553 (44%) matching domains with go

Out[112]:

	go_label	count
go_id
0005524	ATP binding	53
0003677	DNA binding	26
0055114	oxidation-reduction process	26
0005975	carbohydrate metabolic process	21
0016491	oxidoreductase activity	15
0003824	catalytic activity	14
0005515	protein binding	13
0003899	DNA-directed 5'-3' RNA polymerase activity	11
0006351	transcription, DNA-templated	11
0000166	nucleotide binding	10
0016021	integral component of membrane	8
0006260	DNA replication	8
0003723	RNA binding	8
0004553	hydrolase activity, hydrolyzing O-glycosyl com...	8
0003676	nucleic acid binding	7
0006298	mismatch repair	7
0006508	proteolysis	7
0005737	cytoplasm	6
0030983	mismatched DNA binding	6
0006265	DNA topological change	6

In [153]:

go_df.to_excel(os.path.join(os.getcwd(), 'data/domains/codon_bias/go_labels.xlsx'))

tRNA adaptation index¶

In [116]:

trnai = pd.read_csv(os.path.join(os.getcwd(), 'data/trn_adaptation_index/GCA_000005825.2_tai.csv'))
trnai.head()

Out[116]:

	assembly_accession	species_taxid	row_id	protein_id	protein	gene	adaptation_index
0	GCA_000005825.2	79885	253459	ADC51616.1	hypothetical protein	NaN	1.000000
1	GCA_000005825.2	79885	252123	ADC50280.1	hypothetical protein	NaN	0.901509
2	GCA_000005825.2	79885	252052	ADC50209.1	hypothetical protein	NaN	0.898870
3	GCA_000005825.2	79885	253277	ADC51434.1	cold-shock DNA-binding protein family protein	cspC3	0.871788
4	GCA_000005825.2	79885	253078	ADC51235.1	hypothetical protein	NaN	0.842573

In [150]:

def score_fn(trnai):
    mean = trnai['adaptation_index'].mean()
    std = trnai['adaptation_index'].std()
    
    def fn(adaptation_index):
        if adaptation_index > mean + std:
            return 'over expressed'
        elif adaptation_index < mean - std:
            return 'under expressed'
        else:
            return 'normally expressed'
    
    return fn

In [151]:

trnai['expression'] = trnai['adaptation_index'].apply(score_fn(trnai))
trnai.head()

Out[151]:

	assembly_accession	species_taxid	row_id	protein_id	protein	gene	adaptation_index	percentile	expression
0	GCA_000005825.2	79885	253459	ADC51616.1	hypothetical protein	NaN	1.000000	100	over expressed
1	GCA_000005825.2	79885	252123	ADC50280.1	hypothetical protein	NaN	0.901509	100	over expressed
2	GCA_000005825.2	79885	252052	ADC50209.1	hypothetical protein	NaN	0.898870	100	over expressed
3	GCA_000005825.2	79885	253277	ADC51434.1	cold-shock DNA-binding protein family protein	cspC3	0.871788	100	over expressed
4	GCA_000005825.2	79885	253078	ADC51235.1	hypothetical protein	NaN	0.842573	100	over expressed

In [155]:

trnai['expression'].hist();

AAA domains scale in numbers with genome size¶

How does it affect our scoring?

In [4]:

species_traits = pd.read_sql(
    'select species_taxid, species, genome_size from species_traits', 
    engine,
    index_col='species_taxid',
)
species_traits.head()

Out[4]:

	species	genome_size
species_taxid
7	Azorhizobium caulinodans	5369771.500
9	Buchnera aphidicola	601699.243
11	Cellulomonas gilvus	3526440.800
14	Dictyoglomus thermophilum	1959987.600
19	Pelobacter carbinolicus	3722544.667

In [5]:

species_traits.loc[[2336]]

Out[5]:

	species	genome_size
species_taxid
2336	Thermotoga maritima	1865969.5

In [6]:

thermotoga_maritima_domains = compute_protein_domain_scores(engine, ['GCA_000008545.1'], 'pfam')
aaa_domains = sorted([d for d in thermotoga_maritima_domains.index if 'AAA' in d])
thermotoga_maritima_domains.loc[aaa_domains].sort_values('score', ascending=False)

Out[6]:

	assembly_score_sum	assembly_count	assembly_count_top	score
pfam_query
AAA_assoc_2	1.000000	1	1	1.000000
AAA_PrkA	1.000000	1	1	1.000000
AAA_32	1.000000	1	1	1.000000
AAA_27	0.250000	2	1	0.250000
AAA_3	0.111111	3	1	0.111111
AAA_5	0.092239	13	3	0.092239
AAA_15	0.062500	4	1	0.062500
AAA	0.060491	23	4	0.060491
AAA_23	0.040000	5	1	0.040000
AAA_21	0.003365	41	2	0.003365
AAA_19	0.000000	6	0	0.000000
AAA_33	0.000000	2	0	0.000000
AAA_lid_4	0.000000	1	0	0.000000
AAA_lid_3	0.000000	1	0	0.000000
AAA_lid_2	0.000000	1	0	0.000000
AAA_12	0.000000	4	0	0.000000
AAA_14	0.000000	6	0	0.000000
AAA_7	0.000000	1	0	0.000000
AAA_16	0.000000	3	0	0.000000
AAA_31	0.000000	5	0	0.000000
AAA_17	0.000000	3	0	0.000000
AAA_2	0.000000	4	0	0.000000
AAA_30	0.000000	6	0	0.000000
AAA_18	0.000000	4	0	0.000000
AAA_29	0.000000	1	0	0.000000
AAA_11	0.000000	2	0	0.000000
AAA_25	0.000000	3	0	0.000000
AAA_22	0.000000	12	0	0.000000
AAA_lid_9	0.000000	2	0	0.000000

In [ ]:

Count unique Pfam¶

In [5]:

%%time
import pathlib
pfam_folder = '/Users/srom/workspace/rna_learn/data/domains/tri_nucleotide_bias/pfam'

protein_domains = set()
paths = pathlib.Path(pfam_folder).glob('*.csv')
for p in paths:
    with p.open() as f:
        df = pd.read_csv(f)
        protein_domains |= set(df['pfam_query'].unique())
        
print(len(protein_domains))

11087
CPU times: user 12.3 s, sys: 2.11 s, total: 14.4 s
Wall time: 15.8 s

In [6]:

n_pfam_domains = len(protein_domains)

In [12]:

100 * 240 / n_pfam_domains

Out[12]:

2.1646973933435554

In [ ]: