import collections
import os
import json
import logging
import string
import re
import gzip
from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import networkx as nx
from Bio import SeqIO
if os.getcwd().endswith('notebook'):
os.chdir('..')
from rna_learn.codon_bias.graph import load_codon_bias
sns.set(palette='colorblind', font_scale=1.3)
palette = sns.color_palette()
logging.basicConfig(level=logging.INFO, format="%(asctime)s (%(levelname)s) %(message)s")
logger = logging.getLogger(__name__)
db_path = os.path.join(os.getcwd(), 'data/db/seq.db')
engine = create_engine(f'sqlite+pysqlite:///{db_path}')
pfam2go_path = os.path.join(os.getcwd(), 'data/domains/Pfam2go.txt')
def parse_pfam_to_go_file(path):
line_re = r'^Pfam:([^\s]+) ([^>]+) > GO:([^;]+) ; GO:([0-9]+)$'
domain_to_go = collections.defaultdict(list)
with open(path, 'r') as f:
for line in f:
if not line.strip() or line.startswith('!'):
continue
m = re.match(line_re, line)
if m:
pfam_id = m[1].strip()
query = m[2].strip()
go_label = m[3].strip()
go_id = m[4].strip()
domain_to_go[query].append((go_id, go_label))
return dict(domain_to_go)
domain_to_go = parse_pfam_to_go_file(pfam2go_path)
domain_to_go['GATase_3']
[('0003824', 'catalytic activity'), ('0009236', 'cobalamin biosynthetic process')]
pfam_domains_path = os.path.join(
os.getcwd(),
f'data/domains/tri_nucleotide_bias/pfam/GCA_000005825.2_protein_domains.csv',
)
df = pd.read_csv(pfam_domains_path, index_col='protein_id')
df.loc[['ADC49551.1']]
assembly_accession | record_type | pfam_query | pfam_accession | protein_label | below_threshold | |
---|---|---|---|---|---|---|
protein_id | ||||||
ADC49551.1 | GCA_000005825.2 | pfam | NAD_binding_10 | PF13460.6 | hypothetical-protein-BpOF4_07465 | False |
ADC49551.1 | GCA_000005825.2 | pfam | NAD_binding_4 | PF07993.12 | hypothetical-protein-BpOF4_07465 | False |
ADC49551.1 | GCA_000005825.2 | pfam | Semialdhyde_dh | PF01118.24 | hypothetical-protein-BpOF4_07465 | False |
ADC49551.1 | GCA_000005825.2 | pfam | Epimerase | PF01370.21 | hypothetical-protein-BpOF4_07465 | False |
np.log10(0.818195847004645/0.02347481222011315)
1.542255143837179
np.log10(0.11764705882352941/0.022941176470588232)
0.709965388637482