In [11]:

import re
import os
import datetime
import gzip
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
import gffutils

if os.getcwd().endswith('notebook'):
    os.chdir('..')

In [12]:

sns.set(palette='colorblind', font_scale=1.3)

Load traits¶

In [8]:

species_traits_path = os.path.join(os.getcwd(), 'data/condensed_traits/condensed_species_NCBI_with_ogt.csv')
species_traits_all = pd.read_csv(species_traits_path)
len(species_traits_all)

Out[8]:

In [10]:

species_traits_all['species_taxid'] = species_traits_all['species_tax_id']

In [11]:

species_traits_all.head()

Out[11]:

	species_tax_id	species	genus	family	order	class	phylum	superkingdom	gram_stain	metabolism	...	gc_content.stdev	coding_genes.stdev	optimum_tmp.stdev	optimum_ph.stdev	rRNA16S_genes.stdev	tRNA_genes.stdev	data_source	ref_id	species_taxid
0	1416943	Lactobacillus sicerae	Lactobacillus	Lactobacillaceae	Lactobacillales	Bacilli	Firmicutes	Bacteria	NaN	anaerobic	...	NaN	NaN	NaN	NaN	NaN	NaN	campedelli	152	1416943
1	1229250	Ammoniibacillus agariperforans	Ammoniibacillus	Paenibacillaceae	Bacillales	Bacilli	Firmicutes	Bacteria	NaN	aerobic	...	NaN	NaN	NaN	NaN	NaN	NaN	corkrey	181	1229250
2	1081799	Thermomarinilinea lacunifontana	Thermomarinilinea	Anaerolineaceae	Anaerolineales	Anaerolineae	Chloroflexi	Bacteria	NaN	aerobic	...	NaN	NaN	NaN	NaN	NaN	NaN	corkrey	206	1081799
3	1323370	Caloranaerobacter ferrireducens	Caloranaerobacter	Clostridiaceae	Clostridiales	Clostridia	Firmicutes	Bacteria	NaN	anaerobic	...	NaN	NaN	NaN	NaN	NaN	NaN	corkrey	380	1323370
4	192731	Halomicronema excentricum	Halomicronema	Leptolyngbyaceae	Synechococcales	NaN	Cyanobacteria	Bacteria	NaN	aerobic	...	NaN	NaN	NaN	NaN	NaN	NaN	corkrey	406	192731

5 rows × 80 columns

Load NCBI species short-list¶

In [334]:

ncbi_species_metadata = pd.read_csv(
    os.path.join(os.getcwd(), 'data/condensed_traits/ncbi_species_final.csv'),
    parse_dates=['seq_rel_date_processed'],
)
len(ncbi_species_metadata)

Out[334]:

In [335]:

ncbi_species_metadata.head()

Out[335]:

	assembly_accession	bioproject	biosample	wgs_master	refseq_category	taxid	species_taxid	organism_name	infraspecific_name	isolate	...	seq_rel_date	asm_name	submitter	gbrs_paired_asm	paired_asm_comp	ftp_path	excluded_from_refseq	relation_to_type_material	seq_rel_date_processed	download_url_base
0	GCA_000005825.2	PRJNA28811	SAMN02603086	NaN	representative genome	398511	79885	Bacillus pseudofirmus OF4	strain=OF4	NaN	...	15/12/2010	ASM582v2	Center for Genomic Sciences, Allegheny-Singer ...	GCF_000005825.2	identical	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...	NaN	NaN	2010-12-15	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...
1	GCA_000005845.2	PRJNA225	SAMN02604091	NaN	reference genome	511145	562	Escherichia coli str. K-12 substr. MG1655	strain=K-12 substr. MG1655	NaN	...	26/09/2013	ASM584v2	Univ. Wisconsin	GCF_000005845.2	identical	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...	NaN	NaN	2013-09-26	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...
2	GCA_000006175.2	PRJNA20131	SAMN00000040	NaN	representative genome	456320	2188	Methanococcus voltae A3	strain=A3	NaN	...	03/06/2010	ASM617v2	US DOE Joint Genome Institute (JGI-PGF)	GCF_000006175.1	identical	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...	NaN	NaN	2010-06-03	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...
3	GCA_000006605.1	PRJNA13967	SAMEA3283089	NaN	representative genome	306537	38289	Corynebacterium jeikeium K411	NaN	NaN	...	27/06/2005	ASM660v1	Bielefeld Univ	GCF_000006605.1	identical	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...	NaN	NaN	2005-06-27	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...
4	GCA_000019345.1	PRJNA19087	SAMN02604063	NaN	representative genome	505682	134821	Ureaplasma parvum serovar 3 str. ATCC 27815	strain=ATCC 27815	NaN	...	26/02/2008	ASM1934v1	TIGR	GCF_000019345.1	identical	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...	NaN	assembly from type material	2008-02-26	ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...

5 rows × 24 columns

In [338]:

ncbi_species_metadata.columns

Out[338]:

Index(['assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter',
       'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material',
       'seq_rel_date_processed', 'download_url_base'],
      dtype='object')

Keep traits from short-listed species only¶

In [12]:

species_traits = species_traits_all[
    species_traits_all['species_taxid'].isin(ncbi_species_metadata['species_taxid'].values)
].reset_index(drop=True)
len(species_traits)

Out[12]:

In [15]:

species_traits.columns

Out[15]:

Index(['species_tax_id', 'species', 'genus', 'family', 'order', 'class',
       'phylum', 'superkingdom', 'gram_stain', 'metabolism', 'pathways',
       'carbon_substrates', 'sporulation', 'motility', 'range_tmp',
       'range_salinity', 'cell_shape', 'isolation_source', 'd1_lo', 'd1_up',
       'd2_lo', 'd2_up', 'doubling_h', 'genome_size', 'gc_content',
       'coding_genes', 'optimum_tmp', 'optimum_ph', 'growth_tmp',
       'rRNA16S_genes', 'tRNA_genes', 'gram_stain.count', 'metabolism.count',
       'pathways.count', 'carbon_substrates.count', 'sporulation.count',
       'motility.count', 'range_tmp.count', 'range_salinity.count',
       'cell_shape.count', 'isolation_source.count', 'gram_stain.prop',
       'metabolism.prop', 'pathways.prop', 'carbon_substrates.prop',
       'sporulation.prop', 'motility.prop', 'range_tmp.prop',
       'range_salinity.prop', 'cell_shape.prop', 'isolation_source.prop',
       'd1_lo.count', 'd1_up.count', 'd2_lo.count', 'd2_up.count',
       'doubling_h.count', 'genome_size.count', 'gc_content.count',
       'coding_genes.count', 'optimum_tmp.count', 'optimum_ph.count',
       'growth_tmp.count', 'rRNA16S_genes.count', 'tRNA_genes.count',
       'd1_lo.stdev', 'd1_up.stdev', 'd2_lo.stdev', 'd2_up.stdev',
       'doubling_h.stdev', 'genome_size.stdev', 'gc_content.stdev',
       'coding_genes.stdev', 'optimum_tmp.stdev', 'optimum_ph.stdev',
       'growth_tmp.stdev', 'rRNA16S_genes.stdev', 'tRNA_genes.stdev',
       'data_source', 'ref_id', 'species_taxid'],
      dtype='object')

In [18]:

assert np.all(species_traits['growth_tmp'].notnull())

Load CDS¶

In [103]:

cds_path_fmt = os.path.join(os.getcwd(), 'data/condensed_traits/sequences/{taxid}/{taxid}_cds_from_genomic.fna.gz')

In [104]:

idx = 110
specie_taxid = species_traits.iloc[idx]['species_taxid']
species_traits.iloc[[idx]][['species_taxid', 'species', 'phylum', 'growth_tmp', 'genome_size', 'coding_genes']]

Out[104]:

	species_taxid	species	phylum	growth_tmp	genome_size	coding_genes
110	375	Bradyrhizobium japonicum	Proteobacteria	27.0	9075243.179	8530.667

In [105]:

with gzip.open(cds_path_fmt.format(taxid=specie_taxid), mode='rt') as f:
    record_dict = SeqIO.to_dict(SeqIO.parse(f, "fasta"))

In [106]:

r = sorted(record_dict.keys())[0]

In [107]:

cds = record_dict[r]

In [108]:

cds

Out[108]:

SeqRecord(seq=Seq('ATGTTCGCTGATCGCCTCAAAGATTACAACCTTGCCCTAGCGACCGTGCTTCAG...TGA', SingleLetterAlphabet()), id='lcl|AP012206.1_cds_BAL05302.1_1', name='lcl|AP012206.1_cds_BAL05302.1_1', description='lcl|AP012206.1_cds_BAL05302.1_1 [locus_tag=BJ6T_00010] [protein=hypothetical protein] [protein_id=BAL05302.1] [location=123..428] [gbkey=CDS]', dbxrefs=[])

In [109]:

cds.translate()

Out[109]:

SeqRecord(seq=Seq('MFADRLKDYNLALATVLQSVNSFELVGVGLVLDVPTDIASRRIFSNRTSASWSN...CS*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

Load RNA¶

In [112]:

rna_path_fmt = os.path.join(os.getcwd(), 'data/condensed_traits/sequences/{taxid}/{taxid}_rna_from_genomic.fna.gz')

In [113]:

with gzip.open(rna_path_fmt.format(taxid=specie_taxid), mode='rt') as f:
    rna_dict = SeqIO.to_dict(SeqIO.parse(f, "fasta"))

In [118]:

rna_id = sorted(rna_dict.keys())[20]
rna = rna_dict[rna_id]

In [119]:

rna

Out[119]:

SeqRecord(seq=Seq('GGCGGGGTAGCTCAGCTGGTTAGAGCACGGGAATCATAATCCTGGGGTCGGGGG...CCA', SingleLetterAlphabet()), id='lcl|AP012206.1_trna_23', name='lcl|AP012206.1_trna_23', description='lcl|AP012206.1_trna_23 [locus_tag=BJ6T_24130] [product=tRNA-Met] [location=2487544..2487620] [gbkey=tRNA]', dbxrefs=[])

Putting it together¶

In [120]:

def load_specie_sequences(specie_taxid):
    dna_path = os.path.join(
        os.getcwd(), 
        f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_genomic.fna.gz',
    )
    cds_path = os.path.join(
        os.getcwd(), 
        f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_cds_from_genomic.fna.gz',
    )
    rna_path = os.path.join(
        os.getcwd(), 
        f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_rna_from_genomic.fna.gz',
    )
    
    with gzip.open(dna_path, mode='rt') as f:
        dna = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
    
    with gzip.open(cds_path, mode='rt') as f:
        cds = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
        
    with gzip.open(rna_path, mode='rt') as f:
        rna = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
        
    return dna, cds, rna

Parse location information¶

In [ ]:

dna_dict, cds_dict, rna_dict = load_specie_sequences(specie_taxid)

In [161]:

cds_dict['lcl|AP012206.1_cds_BAL05308.1_7']

Out[161]:

SeqRecord(seq=Seq('ATGTCAGTGATCGATCTATCGCGCGCCGATCCGCGCACGTTGATCGGCAAGGGA...TAG', SingleLetterAlphabet()), id='lcl|AP012206.1_cds_BAL05308.1_7', name='lcl|AP012206.1_cds_BAL05308.1_7', description='lcl|AP012206.1_cds_BAL05308.1_7 [locus_tag=BJ6T_00070] [protein=ribonucleoside-diphosphate reductase beta subunit] [protein_id=BAL05308.1] [location=complement(4970..6043)] [gbkey=CDS]', dbxrefs=[])

In [ ]:

description = (
    "lcl|CP042806.1_cds_QEE29787.1_3650 [locus_tag=FTW19_18435] "
    "[protein=peptide chain release factor 2] [exception=ribosomal slippage] "
    "[protein_id=QEE29787.1] [location=complement(join(4615525..4616595,4616597..4616665))] [gbkey=CDS]"
)

In [165]:

gff_path_fmt = os.path.join(os.getcwd(), 'data/condensed_traits/sequences/{taxid}/{taxid}_genomic.gff.gz')

In [301]:

annotations = gffutils.create_db(
    gff_path_fmt.format(taxid=specie_taxid), 
    ':memory:', 
    merge_strategy='replace',
)

In [169]:

for f in annotations.featuretypes():
    print(f)

CDS
exon
gene
rRNA
region
tRNA
tmRNA

In [269]:

c = 0
cds_lengths = []
for i, feature in enumerate(annotations.features_of_type('CDS')):
    start, end = feature.start, feature.end
    assert start < end
    cds_lengths.append((end - start) + 1)
    c += 1
    
print(c)
np.mean(cds_lengths), np.min(cds_lengths), np.max(cds_lengths)

Out[269]:

(885.5548759768943, 120, 17028)

In [302]:

for i, feature in enumerate(annotations.features_of_type('CDS')):
    length = (feature.end - feature.start) + 1
    assert length % 3 == 0, 'nope'

In [268]:

def extract_non_coding_locations(dna_dict, annotations):
    regions = sorted(dna_dict.keys())
    full_length = sum([
        len(dna_dict[region].seq)
        for region in regions
    ])
    
    all_indices = set(range(1, full_length + 1))
    coding_indices = set()
    
    for feature_type in ['CDS', 'exon']:
        for i, feature in enumerate(annotations.features_of_type(feature_type)):
            start, end, strand = feature.start, feature.end, feature.strand
            
            if strand == '-':
                prev_start = start
                start = full_length - (end - 1)
                end = full_length - (prev_start - 1)
                assert start < end
            
            coding_indices |= set(range(start, end + 1))
            
    remaining_indices = sorted(all_indices - coding_indices)
    
    non_coding_segments = []
    start, end = None, None
    for i in remaining_indices:
        if start is None:
            start = i
            end = start
        elif end + 1 != i:
            if end != start:
                non_coding_segments.append([start, end])
            else:
                non_coding_segments.append([start])
                
            start = i
            end = start
        elif i == remaining_indices[-1]:
            if start == i:
                non_coding_segments.append([start])
            else:
                non_coding_segments.append([start, i])
        else:
            end = i
        
    return remaining_indices, full_length, non_coding_segments

In [259]:

remaining_indices, full_length, non_coding_segments = extract_non_coding_locations(dna_dict, annotations)

100 * len(remaining_indices) / full_length

Out[259]:

33.33924163475749

In [260]:

len(non_coding_segments)

Out[260]:

In [283]:

lengths = []
for s in non_coding_segments:
    if len(s) == 1:
        lengths.append(1)
    else:
        assert len(s) == 2
        start, end = s
        assert start < end, f'{start}, {end}'
        lengths.append((end - start) + 1)

In [284]:

np.mean(lengths), np.min(lengths), np.max(lengths)

Out[284]:

(685.3476222371065, 1, 19661)

In [265]:

non_coding_segments[0], non_coding_segments[1]

Out[265]:

([1, 122], [429, 483])

Location field¶

In [364]:

def retrieve_odd_looking_location_fields(species_taxid):
    for specie_taxid in species_taxid:
        _, cds_dict, rna_dict = load_specie_sequences(specie_taxid)
        data = {}
        data.update(cds_dict)
        data.update(rna_dict)

        keys = sorted(data.keys())

        for key in keys:
            description = data[key].description

            m1 = re.match(r'^\s?.*\[location=([0-9]+)..([0-9]+)\].*$', description)
            m2 = re.match(r'^\s?.*\[location=complement\(([0-9]+)..([0-9]+)\)\]\s?.*$', description)

            location = re.match(r'^.*\s?\[location=([^\s]+)\]\s?.*$', description)[1]
            if m1 is None and m2 is None and 'join' not in location:
                print(key, location)

In [366]:

#retrieve_odd_looking_location_fields(ncbi_species_metadata['species_taxid'].values)

In [377]:

def all_cds_have_length_modulo_3(species_taxid):
    for i, specie_taxid in enumerate(species_taxid):
        if (i+1) % 100 == 0:
            print(f'{i+1} / {len(species_taxid)}')
            
        _, cds_dict, _ = load_specie_sequences(specie_taxid)

        keys = sorted(cds_dict.keys())

        c = 0
        for key in keys:
            seq_len = len(cds_dict[key].seq)

            if seq_len % 3 != 0:
                c += 1
                
        if c > 0:
            print(specie_taxid, c, f'{100 * c / len(keys):.2f}%')

In [378]:

all_cds_have_length_modulo_3(sorted(ncbi_species_metadata['species_taxid'].values.tolist()))

24 12 0.30%
56 1 0.01%
69 301 5.91%
72 26 1.09%
122 78 1.33%
134 52 1.21%
140 12 0.96%
159 14 0.53%
162 50 1.63%
193 1 0.02%
235 99 3.00%
246 99 2.19%
254 32 0.84%
293 20 0.58%
296 120 2.58%
337 108 1.83%
346 166 3.26%
358 115 2.09%
359 71 1.37%
381 83 1.17%
382 1 0.02%
100 / 2359
426 67 1.41%
435 38 1.15%
438 34 1.21%
454 39 1.43%
476 70 2.42%
484 70 3.17%
488 98 3.69%
492 13 0.63%
519 1 0.02%
520 2 0.05%
521 1 0.03%
539 38 1.58%
545 20 0.47%
549 185 4.00%
562 30 0.69%
575 50 0.97%
615 8 0.17%
622 1 0.02%
623 141 3.00%
624 325 6.53%
630 1 0.02%
631 38 0.86%
632 62 1.47%
633 137 3.50%
636 25 0.75%
648 81 1.91%
651 60 1.33%
654 10 0.23%
668 4 0.10%
672 36 0.80%
673 12 0.34%
676 17 0.39%
687 21 0.50%
689 40 0.74%
691 7 0.16%
728 65 2.89%
729 24 1.21%
200 / 2359
735 41 2.09%
803 77 5.89%
849 15 0.94%
850 29 1.12%
859 58 2.46%
860 46 1.86%
960 2 0.05%
1017 33 1.24%
1050 33 1.24%
1075 18 0.51%
1084 1 0.03%
1112 7 0.25%
300 / 2359
1246 11 0.66%
1249 12 0.66%
1254 11 0.56%
1275 32 0.90%
1281 132 5.24%
1286 18 0.73%
1290 22 1.02%
1292 4 0.16%
1294 12 0.60%
1296 7 0.26%
1307 2 0.10%
1322 24 0.81%
1329 63 3.03%
1335 8 0.45%
1336 1 0.05%
1341 24 1.21%
1352 1 0.03%
1353 34 1.11%
1366 20 0.91%
1374 26 0.79%
1376 5 0.29%
1377 18 0.91%
1397 96 1.99%
1401 56 0.86%
1421 8 0.25%
1422 103 3.03%
1462 142 3.84%
1464 294 6.70%
1474 22 0.72%
1476 23 0.54%
1479 6 0.16%
1482 12 0.32%
400 / 2359
1497 23 0.56%
1504 127 4.06%
1505 3 0.09%
1520 41 0.74%
1570 7 0.17%
1571 13 0.41%
1583 10 0.51%
1588 42 1.42%
1589 89 2.67%
1590 2 0.06%
1599 53 2.63%
1601 17 0.83%
1602 27 1.20%
1603 18 1.12%
1605 9 0.43%
1610 62 2.21%
1612 15 0.62%
1614 9 0.68%
1618 89 3.61%
1622 25 1.16%
1633 54 3.02%
1640 1 0.04%
1642 14 0.49%
1648 4 0.23%
1660 18 0.88%
1703 9 0.27%
1710 93 2.26%
1719 51 2.47%
1725 30 1.19%
1736 14 0.34%
500 / 2359
1749 25 0.95%
1769 698 25.66%
1823 36 0.53%
1829 65 1.24%
1830 82 1.65%
1885 133 1.80%
1886 642 10.61%
1893 316 3.69%
1894 111 1.81%
1901 21 0.35%
1902 19 0.23%
1906 59 1.02%
1908 295 4.32%
1927 113 1.39%
1930 1 0.01%
1932 311 4.66%
1935 85 1.26%
1950 430 6.06%
1960 120 1.74%
1967 103 1.18%
1969 72 0.83%
2026 18 0.69%
2035 924 24.51%
2055 18 0.36%
2116 2 0.19%
600 / 2359
2130 6 0.91%
2177 3 0.15%
2238 61 1.46%
2246 61 1.49%
2257 12 0.42%
2264 23 1.18%
2271 1 0.05%
2283 91 3.56%
2320 1 0.06%
2434 10 0.24%
2746 4 0.11%
2748 23 0.94%
13689 24 0.65%
13690 74 1.35%
28026 33 1.91%
28038 23 1.16%
28077 21 0.34%
28090 40 1.21%
28104 135 1.95%
700 / 2359
28113 34 1.19%
28119 59 2.15%
28131 43 1.88%
28132 28 1.15%
28141 15 0.33%
28151 22 0.42%
28184 200 4.73%
28189 11 0.48%
28200 7 0.37%
28258 19 0.56%
28447 1 0.03%
28449 50 2.41%
28450 1 0.02%
28894 106 1.59%
29380 17 0.66%
29382 62 2.39%
29384 15 0.59%
29391 16 0.99%
29394 13 0.67%
29429 109 2.12%
29430 45 1.30%
29438 154 2.84%
29449 1 0.02%
29461 86 2.62%
29471 47 1.07%
29484 38 0.84%
29519 33 2.98%
29542 93 3.20%
29555 11 1.67%
29570 63 1.77%
31958 41 0.49%
32002 453 6.95%
800 / 2359
33038 43 1.24%
33070 172 6.98%
33887 47 1.61%
33899 61 0.88%
33932 32 0.67%
33934 76 2.70%
33936 106 2.76%
33938 68 1.92%
33941 40 1.18%
33945 62 1.30%
33959 68 3.72%
33962 34 1.45%
33968 31 1.39%
34018 10 0.25%
34038 1 0.02%
34062 13 0.56%
34103 25 0.64%
35703 37 0.82%
35760 11 0.61%
35787 47 2.88%
35806 21 0.53%
35841 62 1.67%
36805 46 1.42%
36818 79 1.18%
36855 86 2.65%
37482 28 0.88%
37928 41 0.90%
38293 70 1.74%
900 / 2359
38313 26 0.61%
38323 54 3.35%
39777 21 1.15%
39950 8 0.68%
39956 140 2.23%
40215 71 2.25%
40269 4 0.09%
40318 108 1.58%
41276 11 0.33%
41673 159 5.10%
41675 50 2.35%
41977 22 0.54%
43263 2 0.05%
43306 314 4.97%
43657 25 0.50%
43662 237 5.17%
43770 43 1.57%
44935 44 1.10%
45065 7 0.30%
45243 35 1.52%
45398 99 1.27%
45668 34 0.80%
45972 34 1.40%
46126 245 10.50%
46127 17 0.74%
46256 20 1.07%
46257 190 3.01%
47304 13 0.55%
1000 / 2359
47678 42 1.15%
47716 383 5.51%
47763 46 0.65%
47770 47 2.40%
47877 143 2.48%
47878 42 0.66%
47879 25 0.48%
47885 22 0.46%
47886 66 1.20%
47951 34 0.78%
47960 3 0.13%
48003 4 0.52%
48074 2 0.06%
48664 205 5.41%
48665 85 1.25%
49899 15 0.71%
50719 64 1.33%
51049 20 0.94%
51101 138 2.70%
51664 16 0.93%
51665 6 0.34%
53336 16 0.39%
53345 72 2.59%
53408 31 0.46%
53409 69 1.34%
53413 165 3.93%
53437 23 0.42%
53444 26 1.87%
53458 15 0.45%
53952 27 1.34%
54077 11 0.55%
54262 31 1.47%
54291 100 1.66%
54736 45 1.10%
54913 131 2.24%
55208 84 1.88%
55210 63 1.40%
55211 136 2.91%
55213 65 1.85%
1100 / 2359
56448 140 3.14%
56453 113 2.86%
56454 130 2.89%
56455 137 3.16%
56459 256 5.60%
56460 129 2.80%
57706 18 0.40%
57975 298 4.74%
58346 116 1.57%
59277 66 2.79%
59310 72 3.26%
59737 40 1.25%
59814 23 0.55%
60519 26 1.55%
60520 69 2.13%
60548 64 0.96%
60550 146 1.97%
60552 115 1.83%
61645 20 0.45%
61647 33 0.67%
61648 21 0.47%
61858 48 1.13%
64897 1 0.07%
65058 23 1.04%
65959 72 2.40%
66269 118 2.25%
66425 139 2.15%
67258 87 1.34%
67260 75 1.12%
67345 133 1.99%
67581 117 1.79%
67780 126 3.40%
67824 89 1.58%
67825 2 0.04%
67827 34 0.72%
68173 69 1.00%
68203 86 1.27%
68209 61 0.93%
68214 97 1.04%
68249 113 1.54%
68270 88 1.07%
69218 13 0.28%
1200 / 2359
69220 45 1.00%
69527 43 1.15%
69656 102 3.44%
69666 16 0.37%
70255 31 1.25%
70348 55 1.49%
70775 69 1.38%
71433 98 1.45%
71997 15 0.83%
71998 18 0.94%
72361 109 2.49%
72803 9 0.43%
73044 52 0.89%
74969 10 0.56%
75105 98 1.22%
76759 27 0.69%
76761 218 3.32%
76832 30 0.87%
76853 10 0.26%
78344 22 1.09%
79263 64 1.10%
79601 63 2.45%
79880 8 0.19%
79881 93 2.26%
79883 37 0.87%
79967 123 3.24%
80842 25 0.51%
81479 47 1.21%
82367 87 2.04%
82633 19 0.30%
82688 28 1.17%
82983 98 2.14%
83262 15 0.28%
1300 / 2359
83555 34 3.40%
83655 29 0.64%
83656 129 1.84%
83683 85 5.34%
84023 17 0.42%
84032 28 0.70%
84096 77 1.72%
84135 28 1.69%
84292 28 0.94%
85404 11 0.55%
86185 250 5.87%
86664 160 3.95%
87541 25 1.69%
87883 50 0.85%
88724 92 2.04%
89059 1 0.04%
90245 26 1.12%
90270 210 4.12%
90736 26 0.64%
92400 1 0.16%
92644 259 3.00%
93064 23 0.50%
93219 16 0.30%
93221 14 0.28%
93378 27 0.83%
94624 2 0.04%
95486 172 2.32%
96344 57 0.87%
101564 22 0.46%
102684 14 0.78%
103733 115 1.24%
1400 / 2359
104087 87 1.46%
104102 45 1.21%
104268 8 0.27%
104336 15 0.43%
104955 19 1.14%
106592 70 1.04%
106648 46 1.10%
106654 46 1.16%
107401 25 0.87%
109790 29 1.86%
110937 11 0.28%
111015 36 1.43%
112234 14 0.32%
114090 49 2.21%
114527 3 0.18%
115561 8 0.23%
116188 159 1.88%
119206 7 0.40%
128944 8 0.49%
129338 34 1.04%
132132 36 1.41%
132473 106 1.54%
134534 53 1.41%
134536 204 3.16%
135740 64 1.55%
137545 66 1.36%
139021 12 0.34%
139438 5 0.18%
142649 2 0.33%
142650 4 0.59%
142651 1 0.16%
1500 / 2359
146923 25 0.37%
147645 93 2.01%
150056 28 1.21%
150120 51 1.24%
152500 118 2.01%
152682 215 5.63%
153496 33 1.00%
155085 50 2.20%
157779 15 0.46%
157782 25 0.55%
158500 55 0.88%
158836 30 0.68%
160453 47 1.25%
160660 108 3.20%
161360 72 1.98%
162426 14 0.42%
163877 43 1.06%
169176 63 1.56%
169292 1 0.04%
169427 99 1.55%
169430 198 2.00%
170573 17 0.74%
1600 / 2359
172045 38 1.00%
172713 107 2.13%
173366 45 1.25%
176102 31 0.78%
176292 17 0.86%
180282 115 1.89%
182773 14 0.54%
183417 41 1.17%
185007 21 0.66%
187304 18 0.30%
187452 1 0.05%
187493 2 0.05%
187880 5 0.25%
188770 140 2.22%
189381 137 3.03%
190721 13 0.23%
192421 26 0.83%
192812 30 0.98%
193462 39 0.57%
195105 40 1.02%
196024 20 0.46%
197162 16 0.67%
197614 75 3.50%
198620 155 2.20%
199441 133 2.85%
200991 3 0.09%
202954 19 0.59%
202956 29 1.04%
206506 68 2.00%
207340 40 0.89%
208223 95 2.11%
208224 56 1.07%
208479 51 0.84%
1700 / 2359
214473 41 1.54%
216427 3 0.41%
217203 84 1.38%
217204 13 0.22%
218284 204 4.45%
221822 21 0.54%
222805 94 1.59%
223919 33 0.65%
225992 36 1.11%
225999 1 0.13%
227942 134 7.14%
229155 3 0.24%
233316 6 0.15%
236753 5 0.34%
237258 16 0.64%
237610 18 0.37%
238834 22 0.51%
240427 73 2.27%
241244 38 1.15%
242606 8 0.19%
244366 19 0.37%
244566 40 0.73%
244734 168 2.36%
247523 44 1.11%
252514 17 0.53%
252967 35 1.04%
255247 26 0.65%
257438 87 2.62%
257708 23 0.48%
260364 38 1.02%
260552 5 0.14%
260554 390 9.41%
1800 / 2359
264691 29 0.49%
264697 31 0.65%
265883 2 0.08%
265960 66 1.88%
267375 13 0.42%
272774 33 0.91%
274537 27 1.08%
277988 20 0.89%
280236 124 2.33%
283686 51 0.95%
285570 78 1.19%
287094 17 0.44%
290335 53 2.07%
291048 2 0.11%
292800 19 0.52%
293091 126 4.20%
293387 23 0.59%
296842 28 0.69%
299767 20 0.45%
300019 15 0.47%
303541 5 0.33%
304207 76 2.17%
311180 36 0.75%
311230 75 0.83%
312306 17 0.33%
312539 26 1.12%
314275 13 0.35%
314281 42 1.00%
317577 20 0.51%
321983 19 0.31%
321984 133 2.61%
322596 54 1.90%
323284 22 0.57%
323450 19 0.45%
1900 / 2359
327575 13 0.60%
329854 53 1.21%
330922 10 0.39%
332101 29 0.58%
333962 162 3.87%
334542 68 1.09%
336203 319 6.32%
337243 54 1.48%
338644 18 0.57%
351679 78 1.93%
353852 18 2.95%
361111 26 0.88%
371601 71 1.35%
375175 14 0.55%
376489 30 0.78%
379347 15 0.33%
386891 34 1.75%
391905 33 0.79%
399497 78 2.22%
401471 16 0.38%
402297 482 14.05%
402384 22 0.55%
412593 12 0.67%
412690 78 2.37%
2000 / 2359
413503 20 0.49%
414771 143 1.88%
414778 12 0.38%
414996 80 1.76%
416273 34 1.18%
417367 39 1.06%
417368 23 0.95%
419257 45 0.95%
419475 23 0.50%
420953 92 1.39%
421000 56 2.42%
421525 90 3.49%
421767 100 1.91%
430522 6 0.11%
436515 256 3.91%
441209 14 0.38%
442694 106 2.49%
444444 15 0.40%
450367 106 2.03%
453304 8 0.21%
453783 11 0.30%
455432 58 0.73%
456327 168 3.42%
459472 1324 32.38%
463025 16 0.31%
468911 55 2.23%
470933 34 0.78%
470934 21 0.48%
479978 93 1.40%
482957 184 2.22%
487316 17 0.51%
488447 61 0.79%
492670 38 1.00%
526944 26 1.17%
528191 51 0.84%
530584 22 0.34%
544580 34 1.42%
546160 62 1.78%
546367 25 0.62%
549298 95 4.14%
553151 20 0.49%
553239 13 0.32%
553510 100 1.35%
2100 / 2359
556288 22 0.67%
556499 46 1.79%
558152 30 0.78%
561061 2 0.06%
569860 14 0.44%
573737 61 1.08%
582702 86 2.39%
587753 52 0.81%
588932 5 0.18%
622488 84 4.60%
626937 16 0.57%
633807 62 2.84%
639310 11 0.35%
643674 455 17.54%
648995 78 1.65%
651561 24 0.50%
655015 33 0.75%
656178 14 0.29%
656179 40 0.80%
659243 82 1.91%
661488 79 1.30%
664662 78 6.18%
665099 49 0.86%
665913 118 2.97%
665914 125 3.03%
674079 20 0.65%
696485 43 0.76%
701042 39 0.80%
754476 17 0.57%
759811 64 1.33%
859143 112 2.45%
863372 18 0.36%
864828 28 0.46%
871742 20 0.34%
881260 52 1.11%
889453 15 0.41%
904291 15 0.54%
913107 2 0.07%
929813 19 0.40%
930166 16 0.28%
930805 228 5.93%
938155 23 0.89%
945844 112 2.19%
948519 89 2.21%
980427 167 4.48%
985002 38 1.50%
985762 61 2.74%
2200 / 2359
1032623 2 0.11%
1038856 64 2.04%
1049583 60 2.39%
1074311 462 10.30%
1076588 11 0.38%
1082851 44 1.16%
1089444 22 0.54%
1108595 20 0.45%
1109412 17 0.37%
1120045 272 4.94%
1124835 46 0.94%
1134435 12 0.32%
1134687 181 3.04%
1138822 8 0.41%
1158459 13 0.29%
1177574 70 2.28%
1178537 38 1.05%
1190415 31 0.60%
1193713 50 0.91%
1197717 6 0.19%
1208308 59 1.21%
1215089 29 0.89%
1218493 23 1.26%
1218494 9 0.53%
1221500 7 0.17%
1222016 89 3.27%
1263550 27 0.74%
1287055 48 1.72%
1288494 15 0.51%
1296540 35 1.59%
1303590 10 0.59%
1312183 231 3.05%
1323375 42 1.36%
1338368 78 1.69%
1355015 170 2.67%
1387353 22 0.38%
2300 / 2359
1402861 59 1.15%
1411902 21 0.59%
1424653 5 0.25%
1463165 38 0.77%
1484693 10 0.25%
1500843 103 2.23%
1510150 18 0.48%
1535768 225 2.92%
1546149 40 1.09%
1549855 7 0.32%
1563222 79 1.77%
1586287 299 3.20%
1592106 16 0.32%
1610493 59 2.02%
1616788 39 0.84%
1619308 17 0.42%
1642299 68 0.94%
1667168 83 3.92%
1674146 60 4.88%
1715806 69 2.44%
1778540 46 1.06%
1813019 45 3.09%
1813871 13 0.28%
1833852 18 0.58%
1945662 57 2.01%

Percentage of non-coding DNA in species of our dataset¶

In [300]:

%%time
non_coding_percentages = []
for i, taxid in enumerate(species_traits['species_taxid'].values):
    if i == 0 or (i + 1) % 10 == 0:
        print(f'Specie {i+1} / {len(species_traits)}')
    
    dna_dict, _, _ = load_specie_sequences(specie_taxid)
    
    annotations = gffutils.create_db(
        gff_path_fmt.format(taxid=taxid), 
        ':memory:', 
        merge_strategy='replace',
    )
    
    remaining_indices, full_length, _ = extract_non_coding_locations(dna_dict, annotations)
    perc = 100 * len(remaining_indices) / full_length
    non_coding_percentages.append(perc)

Specie 1 / 2359

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<timed exec> in <module>

<ipython-input-268-464c8a066cfa> in extract_non_coding_locations(dna_dict, annotations)
     29             start = i
     30             end = start
---> 31         elif end + 1 != i:
     32             if end != start:
     33                 non_coding_segments.append([start, end])

KeyboardInterrupt:

In [381]:

seq_record = cds_dict['lcl|AP012206.1_cds_BAL05302.1_1']
seq_record

Out[381]:

SeqRecord(seq=Seq('ATGTTCGCTGATCGCCTCAAAGATTACAACCTTGCCCTAGCGACCGTGCTTCAG...TGA', SingleLetterAlphabet()), id='lcl|AP012206.1_cds_BAL05302.1_1', name='lcl|AP012206.1_cds_BAL05302.1_1', description='lcl|AP012206.1_cds_BAL05302.1_1 [locus_tag=BJ6T_00010] [protein=hypothetical protein] [protein_id=BAL05302.1] [location=123..428] [gbkey=CDS]', dbxrefs=[])

In [382]:

seq = seq_record.seq

In [387]:

translated_seq_str = str(seq.translate())

In [388]:

translated_seq_str[-1] == '*'

Out[388]:

True

In [389]:

'*' not in translated_seq_str[:-1]

Out[389]:

True

In [391]:

for specie_taxid in species_traits['species_taxid'].values:
    dna_path = os.path.join(
        os.getcwd(), 
        f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_genomic.fna.gz',
    )
    with gzip.open(dna_path, mode='rt') as f:
        dna = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
    
    if len(dna) > 1:
        print(specie_taxid)
        break

In [392]:

dna_path_1441386 = os.path.join(
    os.getcwd(), 
    f'data/condensed_traits/sequences/1441386/1441386_genomic.fna.gz',
)
with gzip.open(dna_path_1441386, mode='rt') as f:
    dna_1441386 = SeqIO.to_dict(SeqIO.parse(f, "fasta"))

In [393]:

dna_1441386.keys()

Out[393]:

dict_keys(['AP019551.1', 'AP019552.1', 'AP019553.1'])

In [ ]: