import re
import os
import datetime
import gzip
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
import gffutils
if os.getcwd().endswith('notebook'):
os.chdir('..')
sns.set(palette='colorblind', font_scale=1.3)
species_traits_path = os.path.join(os.getcwd(), 'data/condensed_traits/condensed_species_NCBI_with_ogt.csv')
species_traits_all = pd.read_csv(species_traits_path)
len(species_traits_all)
11265
species_traits_all['species_taxid'] = species_traits_all['species_tax_id']
species_traits_all.head()
species_tax_id | species | genus | family | order | class | phylum | superkingdom | gram_stain | metabolism | ... | gc_content.stdev | coding_genes.stdev | optimum_tmp.stdev | optimum_ph.stdev | growth_tmp.stdev | rRNA16S_genes.stdev | tRNA_genes.stdev | data_source | ref_id | species_taxid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1416943 | Lactobacillus sicerae | Lactobacillus | Lactobacillaceae | Lactobacillales | Bacilli | Firmicutes | Bacteria | NaN | anaerobic | ... | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | campedelli | 152 | 1416943 |
1 | 1229250 | Ammoniibacillus agariperforans | Ammoniibacillus | Paenibacillaceae | Bacillales | Bacilli | Firmicutes | Bacteria | NaN | aerobic | ... | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | corkrey | 181 | 1229250 |
2 | 1081799 | Thermomarinilinea lacunifontana | Thermomarinilinea | Anaerolineaceae | Anaerolineales | Anaerolineae | Chloroflexi | Bacteria | NaN | aerobic | ... | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | corkrey | 206 | 1081799 |
3 | 1323370 | Caloranaerobacter ferrireducens | Caloranaerobacter | Clostridiaceae | Clostridiales | Clostridia | Firmicutes | Bacteria | NaN | anaerobic | ... | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | corkrey | 380 | 1323370 |
4 | 192731 | Halomicronema excentricum | Halomicronema | Leptolyngbyaceae | Synechococcales | NaN | Cyanobacteria | Bacteria | NaN | aerobic | ... | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | corkrey | 406 | 192731 |
5 rows × 80 columns
ncbi_species_metadata = pd.read_csv(
os.path.join(os.getcwd(), 'data/condensed_traits/ncbi_species_final.csv'),
parse_dates=['seq_rel_date_processed'],
)
len(ncbi_species_metadata)
2359
ncbi_species_metadata.head()
assembly_accession | bioproject | biosample | wgs_master | refseq_category | taxid | species_taxid | organism_name | infraspecific_name | isolate | ... | seq_rel_date | asm_name | submitter | gbrs_paired_asm | paired_asm_comp | ftp_path | excluded_from_refseq | relation_to_type_material | seq_rel_date_processed | download_url_base | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GCA_000005825.2 | PRJNA28811 | SAMN02603086 | NaN | representative genome | 398511 | 79885 | Bacillus pseudofirmus OF4 | strain=OF4 | NaN | ... | 15/12/2010 | ASM582v2 | Center for Genomic Sciences, Allegheny-Singer ... | GCF_000005825.2 | identical | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... | NaN | NaN | 2010-12-15 | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... |
1 | GCA_000005845.2 | PRJNA225 | SAMN02604091 | NaN | reference genome | 511145 | 562 | Escherichia coli str. K-12 substr. MG1655 | strain=K-12 substr. MG1655 | NaN | ... | 26/09/2013 | ASM584v2 | Univ. Wisconsin | GCF_000005845.2 | identical | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... | NaN | NaN | 2013-09-26 | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... |
2 | GCA_000006175.2 | PRJNA20131 | SAMN00000040 | NaN | representative genome | 456320 | 2188 | Methanococcus voltae A3 | strain=A3 | NaN | ... | 03/06/2010 | ASM617v2 | US DOE Joint Genome Institute (JGI-PGF) | GCF_000006175.1 | identical | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... | NaN | NaN | 2010-06-03 | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... |
3 | GCA_000006605.1 | PRJNA13967 | SAMEA3283089 | NaN | representative genome | 306537 | 38289 | Corynebacterium jeikeium K411 | NaN | NaN | ... | 27/06/2005 | ASM660v1 | Bielefeld Univ | GCF_000006605.1 | identical | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... | NaN | NaN | 2005-06-27 | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... |
4 | GCA_000019345.1 | PRJNA19087 | SAMN02604063 | NaN | representative genome | 505682 | 134821 | Ureaplasma parvum serovar 3 str. ATCC 27815 | strain=ATCC 27815 | NaN | ... | 26/02/2008 | ASM1934v1 | TIGR | GCF_000019345.1 | identical | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... | NaN | assembly from type material | 2008-02-26 | ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000... |
5 rows × 24 columns
ncbi_species_metadata.columns
Index(['assembly_accession', 'bioproject', 'biosample', 'wgs_master', 'refseq_category', 'taxid', 'species_taxid', 'organism_name', 'infraspecific_name', 'isolate', 'version_status', 'assembly_level', 'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'submitter', 'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path', 'excluded_from_refseq', 'relation_to_type_material', 'seq_rel_date_processed', 'download_url_base'], dtype='object')
species_traits = species_traits_all[
species_traits_all['species_taxid'].isin(ncbi_species_metadata['species_taxid'].values)
].reset_index(drop=True)
len(species_traits)
2359
species_traits.columns
Index(['species_tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom', 'gram_stain', 'metabolism', 'pathways', 'carbon_substrates', 'sporulation', 'motility', 'range_tmp', 'range_salinity', 'cell_shape', 'isolation_source', 'd1_lo', 'd1_up', 'd2_lo', 'd2_up', 'doubling_h', 'genome_size', 'gc_content', 'coding_genes', 'optimum_tmp', 'optimum_ph', 'growth_tmp', 'rRNA16S_genes', 'tRNA_genes', 'gram_stain.count', 'metabolism.count', 'pathways.count', 'carbon_substrates.count', 'sporulation.count', 'motility.count', 'range_tmp.count', 'range_salinity.count', 'cell_shape.count', 'isolation_source.count', 'gram_stain.prop', 'metabolism.prop', 'pathways.prop', 'carbon_substrates.prop', 'sporulation.prop', 'motility.prop', 'range_tmp.prop', 'range_salinity.prop', 'cell_shape.prop', 'isolation_source.prop', 'd1_lo.count', 'd1_up.count', 'd2_lo.count', 'd2_up.count', 'doubling_h.count', 'genome_size.count', 'gc_content.count', 'coding_genes.count', 'optimum_tmp.count', 'optimum_ph.count', 'growth_tmp.count', 'rRNA16S_genes.count', 'tRNA_genes.count', 'd1_lo.stdev', 'd1_up.stdev', 'd2_lo.stdev', 'd2_up.stdev', 'doubling_h.stdev', 'genome_size.stdev', 'gc_content.stdev', 'coding_genes.stdev', 'optimum_tmp.stdev', 'optimum_ph.stdev', 'growth_tmp.stdev', 'rRNA16S_genes.stdev', 'tRNA_genes.stdev', 'data_source', 'ref_id', 'species_taxid'], dtype='object')
assert np.all(species_traits['growth_tmp'].notnull())
cds_path_fmt = os.path.join(os.getcwd(), 'data/condensed_traits/sequences/{taxid}/{taxid}_cds_from_genomic.fna.gz')
idx = 110
specie_taxid = species_traits.iloc[idx]['species_taxid']
species_traits.iloc[[idx]][['species_taxid', 'species', 'phylum', 'growth_tmp', 'genome_size', 'coding_genes']]
species_taxid | species | phylum | growth_tmp | genome_size | coding_genes | |
---|---|---|---|---|---|---|
110 | 375 | Bradyrhizobium japonicum | Proteobacteria | 27.0 | 9075243.179 | 8530.667 |
with gzip.open(cds_path_fmt.format(taxid=specie_taxid), mode='rt') as f:
record_dict = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
r = sorted(record_dict.keys())[0]
cds = record_dict[r]
cds
SeqRecord(seq=Seq('ATGTTCGCTGATCGCCTCAAAGATTACAACCTTGCCCTAGCGACCGTGCTTCAG...TGA', SingleLetterAlphabet()), id='lcl|AP012206.1_cds_BAL05302.1_1', name='lcl|AP012206.1_cds_BAL05302.1_1', description='lcl|AP012206.1_cds_BAL05302.1_1 [locus_tag=BJ6T_00010] [protein=hypothetical protein] [protein_id=BAL05302.1] [location=123..428] [gbkey=CDS]', dbxrefs=[])
cds.translate()
SeqRecord(seq=Seq('MFADRLKDYNLALATVLQSVNSFELVGVGLVLDVPTDIASRRIFSNRTSASWSN...CS*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])
rna_path_fmt = os.path.join(os.getcwd(), 'data/condensed_traits/sequences/{taxid}/{taxid}_rna_from_genomic.fna.gz')
with gzip.open(rna_path_fmt.format(taxid=specie_taxid), mode='rt') as f:
rna_dict = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
rna_id = sorted(rna_dict.keys())[20]
rna = rna_dict[rna_id]
rna
SeqRecord(seq=Seq('GGCGGGGTAGCTCAGCTGGTTAGAGCACGGGAATCATAATCCTGGGGTCGGGGG...CCA', SingleLetterAlphabet()), id='lcl|AP012206.1_trna_23', name='lcl|AP012206.1_trna_23', description='lcl|AP012206.1_trna_23 [locus_tag=BJ6T_24130] [product=tRNA-Met] [location=2487544..2487620] [gbkey=tRNA]', dbxrefs=[])
def load_specie_sequences(specie_taxid):
dna_path = os.path.join(
os.getcwd(),
f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_genomic.fna.gz',
)
cds_path = os.path.join(
os.getcwd(),
f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_cds_from_genomic.fna.gz',
)
rna_path = os.path.join(
os.getcwd(),
f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_rna_from_genomic.fna.gz',
)
with gzip.open(dna_path, mode='rt') as f:
dna = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
with gzip.open(cds_path, mode='rt') as f:
cds = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
with gzip.open(rna_path, mode='rt') as f:
rna = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
return dna, cds, rna
dna_dict, cds_dict, rna_dict = load_specie_sequences(specie_taxid)
cds_dict['lcl|AP012206.1_cds_BAL05308.1_7']
SeqRecord(seq=Seq('ATGTCAGTGATCGATCTATCGCGCGCCGATCCGCGCACGTTGATCGGCAAGGGA...TAG', SingleLetterAlphabet()), id='lcl|AP012206.1_cds_BAL05308.1_7', name='lcl|AP012206.1_cds_BAL05308.1_7', description='lcl|AP012206.1_cds_BAL05308.1_7 [locus_tag=BJ6T_00070] [protein=ribonucleoside-diphosphate reductase beta subunit] [protein_id=BAL05308.1] [location=complement(4970..6043)] [gbkey=CDS]', dbxrefs=[])
description = (
"lcl|CP042806.1_cds_QEE29787.1_3650 [locus_tag=FTW19_18435] "
"[protein=peptide chain release factor 2] [exception=ribosomal slippage] "
"[protein_id=QEE29787.1] [location=complement(join(4615525..4616595,4616597..4616665))] [gbkey=CDS]"
)
gff_path_fmt = os.path.join(os.getcwd(), 'data/condensed_traits/sequences/{taxid}/{taxid}_genomic.gff.gz')
annotations = gffutils.create_db(
gff_path_fmt.format(taxid=specie_taxid),
':memory:',
merge_strategy='replace',
)
for f in annotations.featuretypes():
print(f)
CDS exon gene rRNA region tRNA tmRNA
c = 0
cds_lengths = []
for i, feature in enumerate(annotations.features_of_type('CDS')):
start, end = feature.start, feature.end
assert start < end
cds_lengths.append((end - start) + 1)
c += 1
print(c)
np.mean(cds_lengths), np.min(cds_lengths), np.max(cds_lengths)
8829
(885.5548759768943, 120, 17028)
for i, feature in enumerate(annotations.features_of_type('CDS')):
length = (feature.end - feature.start) + 1
assert length % 3 == 0, 'nope'
def extract_non_coding_locations(dna_dict, annotations):
regions = sorted(dna_dict.keys())
full_length = sum([
len(dna_dict[region].seq)
for region in regions
])
all_indices = set(range(1, full_length + 1))
coding_indices = set()
for feature_type in ['CDS', 'exon']:
for i, feature in enumerate(annotations.features_of_type(feature_type)):
start, end, strand = feature.start, feature.end, feature.strand
if strand == '-':
prev_start = start
start = full_length - (end - 1)
end = full_length - (prev_start - 1)
assert start < end
coding_indices |= set(range(start, end + 1))
remaining_indices = sorted(all_indices - coding_indices)
non_coding_segments = []
start, end = None, None
for i in remaining_indices:
if start is None:
start = i
end = start
elif end + 1 != i:
if end != start:
non_coding_segments.append([start, end])
else:
non_coding_segments.append([start])
start = i
end = start
elif i == remaining_indices[-1]:
if start == i:
non_coding_segments.append([start])
else:
non_coding_segments.append([start, i])
else:
end = i
return remaining_indices, full_length, non_coding_segments
remaining_indices, full_length, non_coding_segments = extract_non_coding_locations(dna_dict, annotations)
100 * len(remaining_indices) / full_length
33.33924163475749
len(non_coding_segments)
4479
lengths = []
for s in non_coding_segments:
if len(s) == 1:
lengths.append(1)
else:
assert len(s) == 2
start, end = s
assert start < end, f'{start}, {end}'
lengths.append((end - start) + 1)
np.mean(lengths), np.min(lengths), np.max(lengths)
(685.3476222371065, 1, 19661)
non_coding_segments[0], non_coding_segments[1]
([1, 122], [429, 483])
def retrieve_odd_looking_location_fields(species_taxid):
for specie_taxid in species_taxid:
_, cds_dict, rna_dict = load_specie_sequences(specie_taxid)
data = {}
data.update(cds_dict)
data.update(rna_dict)
keys = sorted(data.keys())
for key in keys:
description = data[key].description
m1 = re.match(r'^\s?.*\[location=([0-9]+)..([0-9]+)\].*$', description)
m2 = re.match(r'^\s?.*\[location=complement\(([0-9]+)..([0-9]+)\)\]\s?.*$', description)
location = re.match(r'^.*\s?\[location=([^\s]+)\]\s?.*$', description)[1]
if m1 is None and m2 is None and 'join' not in location:
print(key, location)
#retrieve_odd_looking_location_fields(ncbi_species_metadata['species_taxid'].values)
def all_cds_have_length_modulo_3(species_taxid):
for i, specie_taxid in enumerate(species_taxid):
if (i+1) % 100 == 0:
print(f'{i+1} / {len(species_taxid)}')
_, cds_dict, _ = load_specie_sequences(specie_taxid)
keys = sorted(cds_dict.keys())
c = 0
for key in keys:
seq_len = len(cds_dict[key].seq)
if seq_len % 3 != 0:
c += 1
if c > 0:
print(specie_taxid, c, f'{100 * c / len(keys):.2f}%')
all_cds_have_length_modulo_3(sorted(ncbi_species_metadata['species_taxid'].values.tolist()))
24 12 0.30% 56 1 0.01% 69 301 5.91% 72 26 1.09% 122 78 1.33% 134 52 1.21% 140 12 0.96% 159 14 0.53% 162 50 1.63% 193 1 0.02% 235 99 3.00% 246 99 2.19% 254 32 0.84% 293 20 0.58% 296 120 2.58% 337 108 1.83% 346 166 3.26% 358 115 2.09% 359 71 1.37% 381 83 1.17% 382 1 0.02% 100 / 2359 426 67 1.41% 435 38 1.15% 438 34 1.21% 454 39 1.43% 476 70 2.42% 484 70 3.17% 488 98 3.69% 492 13 0.63% 519 1 0.02% 520 2 0.05% 521 1 0.03% 539 38 1.58% 545 20 0.47% 549 185 4.00% 562 30 0.69% 575 50 0.97% 615 8 0.17% 622 1 0.02% 623 141 3.00% 624 325 6.53% 630 1 0.02% 631 38 0.86% 632 62 1.47% 633 137 3.50% 636 25 0.75% 648 81 1.91% 651 60 1.33% 654 10 0.23% 668 4 0.10% 672 36 0.80% 673 12 0.34% 676 17 0.39% 687 21 0.50% 689 40 0.74% 691 7 0.16% 728 65 2.89% 729 24 1.21% 200 / 2359 735 41 2.09% 803 77 5.89% 849 15 0.94% 850 29 1.12% 859 58 2.46% 860 46 1.86% 960 2 0.05% 1017 33 1.24% 1050 33 1.24% 1075 18 0.51% 1084 1 0.03% 1112 7 0.25% 300 / 2359 1246 11 0.66% 1249 12 0.66% 1254 11 0.56% 1275 32 0.90% 1281 132 5.24% 1286 18 0.73% 1290 22 1.02% 1292 4 0.16% 1294 12 0.60% 1296 7 0.26% 1307 2 0.10% 1322 24 0.81% 1329 63 3.03% 1335 8 0.45% 1336 1 0.05% 1341 24 1.21% 1352 1 0.03% 1353 34 1.11% 1366 20 0.91% 1374 26 0.79% 1376 5 0.29% 1377 18 0.91% 1397 96 1.99% 1401 56 0.86% 1421 8 0.25% 1422 103 3.03% 1462 142 3.84% 1464 294 6.70% 1474 22 0.72% 1476 23 0.54% 1479 6 0.16% 1482 12 0.32% 400 / 2359 1497 23 0.56% 1504 127 4.06% 1505 3 0.09% 1520 41 0.74% 1570 7 0.17% 1571 13 0.41% 1583 10 0.51% 1588 42 1.42% 1589 89 2.67% 1590 2 0.06% 1599 53 2.63% 1601 17 0.83% 1602 27 1.20% 1603 18 1.12% 1605 9 0.43% 1610 62 2.21% 1612 15 0.62% 1614 9 0.68% 1618 89 3.61% 1622 25 1.16% 1633 54 3.02% 1640 1 0.04% 1642 14 0.49% 1648 4 0.23% 1660 18 0.88% 1703 9 0.27% 1710 93 2.26% 1719 51 2.47% 1725 30 1.19% 1736 14 0.34% 500 / 2359 1749 25 0.95% 1769 698 25.66% 1823 36 0.53% 1829 65 1.24% 1830 82 1.65% 1885 133 1.80% 1886 642 10.61% 1893 316 3.69% 1894 111 1.81% 1901 21 0.35% 1902 19 0.23% 1906 59 1.02% 1908 295 4.32% 1927 113 1.39% 1930 1 0.01% 1932 311 4.66% 1935 85 1.26% 1950 430 6.06% 1960 120 1.74% 1967 103 1.18% 1969 72 0.83% 2026 18 0.69% 2035 924 24.51% 2055 18 0.36% 2116 2 0.19% 600 / 2359 2130 6 0.91% 2177 3 0.15% 2238 61 1.46% 2246 61 1.49% 2257 12 0.42% 2264 23 1.18% 2271 1 0.05% 2283 91 3.56% 2320 1 0.06% 2434 10 0.24% 2746 4 0.11% 2748 23 0.94% 13689 24 0.65% 13690 74 1.35% 28026 33 1.91% 28038 23 1.16% 28077 21 0.34% 28090 40 1.21% 28104 135 1.95% 700 / 2359 28113 34 1.19% 28119 59 2.15% 28131 43 1.88% 28132 28 1.15% 28141 15 0.33% 28151 22 0.42% 28184 200 4.73% 28189 11 0.48% 28200 7 0.37% 28258 19 0.56% 28447 1 0.03% 28449 50 2.41% 28450 1 0.02% 28894 106 1.59% 29380 17 0.66% 29382 62 2.39% 29384 15 0.59% 29391 16 0.99% 29394 13 0.67% 29429 109 2.12% 29430 45 1.30% 29438 154 2.84% 29449 1 0.02% 29461 86 2.62% 29471 47 1.07% 29484 38 0.84% 29519 33 2.98% 29542 93 3.20% 29555 11 1.67% 29570 63 1.77% 31958 41 0.49% 32002 453 6.95% 800 / 2359 33038 43 1.24% 33070 172 6.98% 33887 47 1.61% 33899 61 0.88% 33932 32 0.67% 33934 76 2.70% 33936 106 2.76% 33938 68 1.92% 33941 40 1.18% 33945 62 1.30% 33959 68 3.72% 33962 34 1.45% 33968 31 1.39% 34018 10 0.25% 34038 1 0.02% 34062 13 0.56% 34103 25 0.64% 35703 37 0.82% 35760 11 0.61% 35787 47 2.88% 35806 21 0.53% 35841 62 1.67% 36805 46 1.42% 36818 79 1.18% 36855 86 2.65% 37482 28 0.88% 37928 41 0.90% 38293 70 1.74% 900 / 2359 38313 26 0.61% 38323 54 3.35% 39777 21 1.15% 39950 8 0.68% 39956 140 2.23% 40215 71 2.25% 40269 4 0.09% 40318 108 1.58% 41276 11 0.33% 41673 159 5.10% 41675 50 2.35% 41977 22 0.54% 43263 2 0.05% 43306 314 4.97% 43657 25 0.50% 43662 237 5.17% 43770 43 1.57% 44935 44 1.10% 45065 7 0.30% 45243 35 1.52% 45398 99 1.27% 45668 34 0.80% 45972 34 1.40% 46126 245 10.50% 46127 17 0.74% 46256 20 1.07% 46257 190 3.01% 47304 13 0.55% 1000 / 2359 47678 42 1.15% 47716 383 5.51% 47763 46 0.65% 47770 47 2.40% 47877 143 2.48% 47878 42 0.66% 47879 25 0.48% 47885 22 0.46% 47886 66 1.20% 47951 34 0.78% 47960 3 0.13% 48003 4 0.52% 48074 2 0.06% 48664 205 5.41% 48665 85 1.25% 49899 15 0.71% 50719 64 1.33% 51049 20 0.94% 51101 138 2.70% 51664 16 0.93% 51665 6 0.34% 53336 16 0.39% 53345 72 2.59% 53408 31 0.46% 53409 69 1.34% 53413 165 3.93% 53437 23 0.42% 53444 26 1.87% 53458 15 0.45% 53952 27 1.34% 54077 11 0.55% 54262 31 1.47% 54291 100 1.66% 54736 45 1.10% 54913 131 2.24% 55208 84 1.88% 55210 63 1.40% 55211 136 2.91% 55213 65 1.85% 1100 / 2359 56448 140 3.14% 56453 113 2.86% 56454 130 2.89% 56455 137 3.16% 56459 256 5.60% 56460 129 2.80% 57706 18 0.40% 57975 298 4.74% 58346 116 1.57% 59277 66 2.79% 59310 72 3.26% 59737 40 1.25% 59814 23 0.55% 60519 26 1.55% 60520 69 2.13% 60548 64 0.96% 60550 146 1.97% 60552 115 1.83% 61645 20 0.45% 61647 33 0.67% 61648 21 0.47% 61858 48 1.13% 64897 1 0.07% 65058 23 1.04% 65959 72 2.40% 66269 118 2.25% 66425 139 2.15% 67258 87 1.34% 67260 75 1.12% 67345 133 1.99% 67581 117 1.79% 67780 126 3.40% 67824 89 1.58% 67825 2 0.04% 67827 34 0.72% 68173 69 1.00% 68203 86 1.27% 68209 61 0.93% 68214 97 1.04% 68249 113 1.54% 68270 88 1.07% 69218 13 0.28% 1200 / 2359 69220 45 1.00% 69527 43 1.15% 69656 102 3.44% 69666 16 0.37% 70255 31 1.25% 70348 55 1.49% 70775 69 1.38% 71433 98 1.45% 71997 15 0.83% 71998 18 0.94% 72361 109 2.49% 72803 9 0.43% 73044 52 0.89% 74969 10 0.56% 75105 98 1.22% 76759 27 0.69% 76761 218 3.32% 76832 30 0.87% 76853 10 0.26% 78344 22 1.09% 79263 64 1.10% 79601 63 2.45% 79880 8 0.19% 79881 93 2.26% 79883 37 0.87% 79967 123 3.24% 80842 25 0.51% 81479 47 1.21% 82367 87 2.04% 82633 19 0.30% 82688 28 1.17% 82983 98 2.14% 83262 15 0.28% 1300 / 2359 83555 34 3.40% 83655 29 0.64% 83656 129 1.84% 83683 85 5.34% 84023 17 0.42% 84032 28 0.70% 84096 77 1.72% 84135 28 1.69% 84292 28 0.94% 85404 11 0.55% 86185 250 5.87% 86664 160 3.95% 87541 25 1.69% 87883 50 0.85% 88724 92 2.04% 89059 1 0.04% 90245 26 1.12% 90270 210 4.12% 90736 26 0.64% 92400 1 0.16% 92644 259 3.00% 93064 23 0.50% 93219 16 0.30% 93221 14 0.28% 93378 27 0.83% 94624 2 0.04% 95486 172 2.32% 96344 57 0.87% 101564 22 0.46% 102684 14 0.78% 103733 115 1.24% 1400 / 2359 104087 87 1.46% 104102 45 1.21% 104268 8 0.27% 104336 15 0.43% 104955 19 1.14% 106592 70 1.04% 106648 46 1.10% 106654 46 1.16% 107401 25 0.87% 109790 29 1.86% 110937 11 0.28% 111015 36 1.43% 112234 14 0.32% 114090 49 2.21% 114527 3 0.18% 115561 8 0.23% 116188 159 1.88% 119206 7 0.40% 128944 8 0.49% 129338 34 1.04% 132132 36 1.41% 132473 106 1.54% 134534 53 1.41% 134536 204 3.16% 135740 64 1.55% 137545 66 1.36% 139021 12 0.34% 139438 5 0.18% 142649 2 0.33% 142650 4 0.59% 142651 1 0.16% 1500 / 2359 146923 25 0.37% 147645 93 2.01% 150056 28 1.21% 150120 51 1.24% 152500 118 2.01% 152682 215 5.63% 153496 33 1.00% 155085 50 2.20% 157779 15 0.46% 157782 25 0.55% 158500 55 0.88% 158836 30 0.68% 160453 47 1.25% 160660 108 3.20% 161360 72 1.98% 162426 14 0.42% 163877 43 1.06% 169176 63 1.56% 169292 1 0.04% 169427 99 1.55% 169430 198 2.00% 170573 17 0.74% 1600 / 2359 172045 38 1.00% 172713 107 2.13% 173366 45 1.25% 176102 31 0.78% 176292 17 0.86% 180282 115 1.89% 182773 14 0.54% 183417 41 1.17% 185007 21 0.66% 187304 18 0.30% 187452 1 0.05% 187493 2 0.05% 187880 5 0.25% 188770 140 2.22% 189381 137 3.03% 190721 13 0.23% 192421 26 0.83% 192812 30 0.98% 193462 39 0.57% 195105 40 1.02% 196024 20 0.46% 197162 16 0.67% 197614 75 3.50% 198620 155 2.20% 199441 133 2.85% 200991 3 0.09% 202954 19 0.59% 202956 29 1.04% 206506 68 2.00% 207340 40 0.89% 208223 95 2.11% 208224 56 1.07% 208479 51 0.84% 1700 / 2359 214473 41 1.54% 216427 3 0.41% 217203 84 1.38% 217204 13 0.22% 218284 204 4.45% 221822 21 0.54% 222805 94 1.59% 223919 33 0.65% 225992 36 1.11% 225999 1 0.13% 227942 134 7.14% 229155 3 0.24% 233316 6 0.15% 236753 5 0.34% 237258 16 0.64% 237610 18 0.37% 238834 22 0.51% 240427 73 2.27% 241244 38 1.15% 242606 8 0.19% 244366 19 0.37% 244566 40 0.73% 244734 168 2.36% 247523 44 1.11% 252514 17 0.53% 252967 35 1.04% 255247 26 0.65% 257438 87 2.62% 257708 23 0.48% 260364 38 1.02% 260552 5 0.14% 260554 390 9.41% 1800 / 2359 264691 29 0.49% 264697 31 0.65% 265883 2 0.08% 265960 66 1.88% 267375 13 0.42% 272774 33 0.91% 274537 27 1.08% 277988 20 0.89% 280236 124 2.33% 283686 51 0.95% 285570 78 1.19% 287094 17 0.44% 290335 53 2.07% 291048 2 0.11% 292800 19 0.52% 293091 126 4.20% 293387 23 0.59% 296842 28 0.69% 299767 20 0.45% 300019 15 0.47% 303541 5 0.33% 304207 76 2.17% 311180 36 0.75% 311230 75 0.83% 312306 17 0.33% 312539 26 1.12% 314275 13 0.35% 314281 42 1.00% 317577 20 0.51% 321983 19 0.31% 321984 133 2.61% 322596 54 1.90% 323284 22 0.57% 323450 19 0.45% 1900 / 2359 327575 13 0.60% 329854 53 1.21% 330922 10 0.39% 332101 29 0.58% 333962 162 3.87% 334542 68 1.09% 336203 319 6.32% 337243 54 1.48% 338644 18 0.57% 351679 78 1.93% 353852 18 2.95% 361111 26 0.88% 371601 71 1.35% 375175 14 0.55% 376489 30 0.78% 379347 15 0.33% 386891 34 1.75% 391905 33 0.79% 399497 78 2.22% 401471 16 0.38% 402297 482 14.05% 402384 22 0.55% 412593 12 0.67% 412690 78 2.37% 2000 / 2359 413503 20 0.49% 414771 143 1.88% 414778 12 0.38% 414996 80 1.76% 416273 34 1.18% 417367 39 1.06% 417368 23 0.95% 419257 45 0.95% 419475 23 0.50% 420953 92 1.39% 421000 56 2.42% 421525 90 3.49% 421767 100 1.91% 430522 6 0.11% 436515 256 3.91% 441209 14 0.38% 442694 106 2.49% 444444 15 0.40% 450367 106 2.03% 453304 8 0.21% 453783 11 0.30% 455432 58 0.73% 456327 168 3.42% 459472 1324 32.38% 463025 16 0.31% 468911 55 2.23% 470933 34 0.78% 470934 21 0.48% 479978 93 1.40% 482957 184 2.22% 487316 17 0.51% 488447 61 0.79% 492670 38 1.00% 526944 26 1.17% 528191 51 0.84% 530584 22 0.34% 544580 34 1.42% 546160 62 1.78% 546367 25 0.62% 549298 95 4.14% 553151 20 0.49% 553239 13 0.32% 553510 100 1.35% 2100 / 2359 556288 22 0.67% 556499 46 1.79% 558152 30 0.78% 561061 2 0.06% 569860 14 0.44% 573737 61 1.08% 582702 86 2.39% 587753 52 0.81% 588932 5 0.18% 622488 84 4.60% 626937 16 0.57% 633807 62 2.84% 639310 11 0.35% 643674 455 17.54% 648995 78 1.65% 651561 24 0.50% 655015 33 0.75% 656178 14 0.29% 656179 40 0.80% 659243 82 1.91% 661488 79 1.30% 664662 78 6.18% 665099 49 0.86% 665913 118 2.97% 665914 125 3.03% 674079 20 0.65% 696485 43 0.76% 701042 39 0.80% 754476 17 0.57% 759811 64 1.33% 859143 112 2.45% 863372 18 0.36% 864828 28 0.46% 871742 20 0.34% 881260 52 1.11% 889453 15 0.41% 904291 15 0.54% 913107 2 0.07% 929813 19 0.40% 930166 16 0.28% 930805 228 5.93% 938155 23 0.89% 945844 112 2.19% 948519 89 2.21% 980427 167 4.48% 985002 38 1.50% 985762 61 2.74% 2200 / 2359 1032623 2 0.11% 1038856 64 2.04% 1049583 60 2.39% 1074311 462 10.30% 1076588 11 0.38% 1082851 44 1.16% 1089444 22 0.54% 1108595 20 0.45% 1109412 17 0.37% 1120045 272 4.94% 1124835 46 0.94% 1134435 12 0.32% 1134687 181 3.04% 1138822 8 0.41% 1158459 13 0.29% 1177574 70 2.28% 1178537 38 1.05% 1190415 31 0.60% 1193713 50 0.91% 1197717 6 0.19% 1208308 59 1.21% 1215089 29 0.89% 1218493 23 1.26% 1218494 9 0.53% 1221500 7 0.17% 1222016 89 3.27% 1263550 27 0.74% 1287055 48 1.72% 1288494 15 0.51% 1296540 35 1.59% 1303590 10 0.59% 1312183 231 3.05% 1323375 42 1.36% 1338368 78 1.69% 1355015 170 2.67% 1387353 22 0.38% 2300 / 2359 1402861 59 1.15% 1411902 21 0.59% 1424653 5 0.25% 1463165 38 0.77% 1484693 10 0.25% 1500843 103 2.23% 1510150 18 0.48% 1535768 225 2.92% 1546149 40 1.09% 1549855 7 0.32% 1563222 79 1.77% 1586287 299 3.20% 1592106 16 0.32% 1610493 59 2.02% 1616788 39 0.84% 1619308 17 0.42% 1642299 68 0.94% 1667168 83 3.92% 1674146 60 4.88% 1715806 69 2.44% 1778540 46 1.06% 1813019 45 3.09% 1813871 13 0.28% 1833852 18 0.58% 1945662 57 2.01%
%%time
non_coding_percentages = []
for i, taxid in enumerate(species_traits['species_taxid'].values):
if i == 0 or (i + 1) % 10 == 0:
print(f'Specie {i+1} / {len(species_traits)}')
dna_dict, _, _ = load_specie_sequences(specie_taxid)
annotations = gffutils.create_db(
gff_path_fmt.format(taxid=taxid),
':memory:',
merge_strategy='replace',
)
remaining_indices, full_length, _ = extract_non_coding_locations(dna_dict, annotations)
perc = 100 * len(remaining_indices) / full_length
non_coding_percentages.append(perc)
Specie 1 / 2359
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <timed exec> in <module> <ipython-input-268-464c8a066cfa> in extract_non_coding_locations(dna_dict, annotations) 29 start = i 30 end = start ---> 31 elif end + 1 != i: 32 if end != start: 33 non_coding_segments.append([start, end]) KeyboardInterrupt:
seq_record = cds_dict['lcl|AP012206.1_cds_BAL05302.1_1']
seq_record
SeqRecord(seq=Seq('ATGTTCGCTGATCGCCTCAAAGATTACAACCTTGCCCTAGCGACCGTGCTTCAG...TGA', SingleLetterAlphabet()), id='lcl|AP012206.1_cds_BAL05302.1_1', name='lcl|AP012206.1_cds_BAL05302.1_1', description='lcl|AP012206.1_cds_BAL05302.1_1 [locus_tag=BJ6T_00010] [protein=hypothetical protein] [protein_id=BAL05302.1] [location=123..428] [gbkey=CDS]', dbxrefs=[])
seq = seq_record.seq
translated_seq_str = str(seq.translate())
translated_seq_str[-1] == '*'
True
'*' not in translated_seq_str[:-1]
True
for specie_taxid in species_traits['species_taxid'].values:
dna_path = os.path.join(
os.getcwd(),
f'data/condensed_traits/sequences/{specie_taxid}/{specie_taxid}_genomic.fna.gz',
)
with gzip.open(dna_path, mode='rt') as f:
dna = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
if len(dna) > 1:
print(specie_taxid)
break
1441386
dna_path_1441386 = os.path.join(
os.getcwd(),
f'data/condensed_traits/sequences/1441386/1441386_genomic.fna.gz',
)
with gzip.open(dna_path_1441386, mode='rt') as f:
dna_1441386 = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
dna_1441386.keys()
dict_keys(['AP019551.1', 'AP019552.1', 'AP019553.1'])