EMBLmyGFF3
(needed to avoid weird dependency conflicts - likely specific to my Anaconda installation)
%%bash
# Record when and where this notebook ran: date, OS release, hostname,
# CPU details and memory. Sections are separated by a dashed rule.
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "------------"

# Display operating system info
lsb_release -a
printf '\n%s\n' "------------"

# Label and value are printed on separate lines.
printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "------------"

printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "------------"

printf '%s\n\n' "Memory Specs"
free -mh
TODAY'S DATE: Thu Mar 12 12:31:06 PDT 2020 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.931 BogoMIPS: 5851.96 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 3.2G 56G 408M 10G 66G Swap: 4.7G 245M 4.4G
No LSB modules are available.
`%env` variables are good for passing to bash cells.
# Set working directory
# Defined twice on purpose: the %env copy is exported to the environment so
# %%bash cells can read ${wd}; the plain Python variable is what IPython
# substitutes for {wd} in line magics.
%env wd=/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission
wd="/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission"

# Input output files
%env embl_out=Panopea-generosa-v1.0.a4.embl
%env combined_gff=Panopea-generosa-v1.0.a4.combined.gff
%env fasta=Panopea-generosa-v1.0.fa

# ENA-specific info
%env pid=PRJEB36299
%env locus_tag=PGEN
# %env stores everything after "=" verbatim (spaces included), so the value
# must NOT be quoted: quotes would become part of the species name.
%env species=Panopea generosa
env: wd=/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission env: embl_out=Panopea-generosa-v1.0.a4.embl env: combined_gff=Panopea-generosa-v1.0.a4.combined.gff env: fasta=Panopea-generosa-v1.0.fa env: pid=PRJEB36299 env: locus_tag=PGEN env: species='Panopea generosa'
# Switch to the working directory; IPython replaces {wd} with the value of
# the Python variable wd before running the magic.
%cd {wd}
/home/sam/Downloads/20200311_swoose_pgen_genome_ena_submission
%%bash
# Long listing of the working directory, oldest first, human-readable sizes.
ls -l -t -r -h
total 1.1G -rw-rw-r-- 1 sam sam 914M Mar 3 15:08 Panopea-generosa-v1.0.fa -rw-rw-r-- 1 sam sam 217 Mar 3 15:09 pgen_v074_ena_submission_manifest.tab -rw-rw-r-- 1 sam sam 658 Mar 3 21:08 Panopea-generosa-v1.0.fa.fai -rw-rw-r-- 1 sam sam 53M Mar 4 06:31 Panopea-generosa-v1.0.a4.CDS.gff3 -rw-rw-r-- 1 sam sam 55M Mar 4 06:32 Panopea-generosa-v1.0.a4.exon.gff3 -rw-rw-r-- 1 sam sam 9.1M Mar 4 06:32 Panopea-generosa-v1.0.a4.mRNA.gff3 -rw-rw-r-- 1 sam sam 1.5K Mar 4 06:32 Panopea-generosa-v1.0.a4.rRNA.gff3 -rw-rw-r-- 1 sam sam 2.6M Mar 4 06:32 Panopea-generosa-v1.0.a4.tRNA.gff3 -rw-rw-r-- 1 sam sam 5.4M Mar 4 06:32 Panopea-generosa-v1.0.a4.repeats.DNA.gff3 -rw-rw-r-- 1 sam sam 18M Mar 4 06:33 Panopea-generosa-v1.0.a4.repeats.LINE.gff3 -rw-rw-r-- 1 sam sam 763K Mar 4 06:33 Panopea-generosa-v1.0.a4.repeats.LTR.gff3 -rw-rw-r-- 1 sam sam 143K Mar 4 06:33 Panopea-generosa-v1.0.a4.repeats.RC.gff3 -rw-rw-r-- 1 sam sam 4.6M Mar 4 06:33 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3 -rw-rw-r-- 1 sam sam 11M Mar 4 06:33 Panopea-generosa-v1.0.a4.repeats.SINE.gff3 -rw-rw-r-- 1 sam sam 9.6M Mar 4 14:51 Panopea-generosa-v1.0.a4.gene.gff3
This step also changes the text "notes=" to "note=". This will allow that section of the GFF to be included in the output EMBL flat file. The info in this section includes information on SwissProt, InterProScan, and Pfam accession numbers.
%%bash
#######################################
# Merge every *.gff3 file in the current directory into a single GFF3 and
# report per-file and total feature-line counts.
#
# The header (first N lines) is taken once from a designated file; the
# first N lines of every input are then skipped so headers are not
# duplicated. While copying, the attribute text "notes=" is rewritten to
# "note=".
#
# Arguments:
#   $1 - GFF3 file whose first N lines are used as the combined header
#   $2 - path of the combined output file (overwritten)
#   $3 - number of header lines N (optional, default 3)
# Outputs:
#   Writes the combined GFF3 to $2; prints file names and line counts
#   (excluding header lines) to stdout; errors go to stderr.
# Returns:
#   0 on success, 1 if the output name is empty or the header file is
#   unreadable or cannot be copied.
#######################################
combine_gff3_files() {
  local header_src=$1
  local combined=$2
  local header_lines=${3:-3}
  local gff

  # An empty output name would otherwise surface as an obscure
  # "ambiguous redirect" error further down.
  if [[ -z "${combined}" ]]; then
    echo "ERROR: combined GFF name is empty (is combined_gff set?)" >&2
    return 1
  fi
  if [[ ! -r "${header_src}" ]]; then
    echo "ERROR: cannot read header source: ${header_src}" >&2
    return 1
  fi

  # Copy GFF3 header to new GFF3
  awk -v n="${header_lines}" 'NR <= n' "${header_src}" > "${combined}" \
    || return 1

  echo "GFF line counts:"
  echo ""

  # Concatenate all GFFs
  for gff in *.gff3; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -f "${gff}" ]] || continue
    # Never feed the output file back into itself.
    if [[ "${gff}" -ef "${combined}" ]]; then
      continue
    fi

    # Print file name and line count (excluding header)
    echo "${gff}"
    # Single pass: skip header, substitute "notes=" with "note=", append to
    # the combined file and count the lines that were appended.
    awk -v n="${header_lines}" 'NR > n' "${gff}" \
      | sed 's/notes=/note=/g' \
      | tee -a "${combined}" \
      | wc -l
  done

  echo ""
  echo "-------------------------------"
  echo ""
  echo "${combined} line count:"
  # Re-read the finished file so the total reflects what is actually on disk.
  awk -v n="${header_lines}" 'NR > n' "${combined}" | wc -l
}

# combined_gff comes from the notebook environment (%env).
combine_gff3_files "Panopea-generosa-v1.0.a4.CDS.gff3" "${combined_gff}"
GFF line counts: Panopea-generosa-v1.0.a4.CDS.gff3 236960 Panopea-generosa-v1.0.a4.exon.gff3 236960 Panopea-generosa-v1.0.a4.gene.gff3 34947 Panopea-generosa-v1.0.a4.mRNA.gff3 38326 Panopea-generosa-v1.0.a4.repeat_region.gff3 1676544 Panopea-generosa-v1.0.a4.repeats.DNA.gff3 23195 Panopea-generosa-v1.0.a4.repeats.LINE.gff3 75939 Panopea-generosa-v1.0.a4.repeats.LTR.gff3 3255 Panopea-generosa-v1.0.a4.repeats.RC.gff3 603 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3 19865 Panopea-generosa-v1.0.a4.repeats.SINE.gff3 43129 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3 1510558 Panopea-generosa-v1.0.a4.rRNA.gff3 8 Panopea-generosa-v1.0.a4.tRNA.gff3 16889 ------------------------------- Panopea-generosa-v1.0.a4.combined.gff line count: 3917178
%%bash
# Print the EMBLmyGFF3 usage/help text.
EMBLmyGFF3 --help
usage: EMBLmyGFF3 [-h] [-a] [-c CREATED] [-d {CON,PAT,EST,GSS,HTC,HTG,MGA,WGS,TSA,STS,STD}] [-g ORGANELLE] [-i LOCUS_TAG] [-k KEYWORD [KEYWORD ...]] [-l CLASSIFICATION] [-m {genomic DNA,genomic RNA,mRNA,tRNA,rRNA,other RNA,other DNA,transcribed RNA,viral cRNA,unassigned DNA,unassigned RNA}] [-o OUTPUT] [-p PROJECT_ID] [-q] [-r {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}] [-s SPECIES] [-t {linear,circular}] [-v] [-x {PHG,ENV,FUN,HUM,INV,MAM,VRT,MUS,PLN,PRO,ROD,SYN,TGN,UNC,VRL}] [-z] [--ah {One of the parameters above}] [--de DE] [--ra RA [RA ...]] [--rc RC] [--rg RG] [--rl RL] [--rt RT] [--rx RX] [--email EMAIL] [--expose_translations] [--force_unknown_features] [--force_uncomplete_features] [--interleave_genes] [--keep_duplicates] [--locus_numbering_start LOCUS_NUMBERING_START] [--no_progress] [--no_wrap_qualifier] [--shame] [--translate] [--use_attribute_value_as_locus_tag USE_ATTRIBUTE_VALUE_AS_LOCUS_TAG] [--uncompressed_log] [--version VERSION] [--strain STRAIN] [--environmental_sample] [--isolation_source ISOLATION_SOURCE] [--isolate ISOLATE] gff_file fasta EMBL writer for ENA data submission. Note that this implementation is basically just the documentation at ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/usrman.txt in python form - the implementation could be a lot more efficient! GFF convertion is based on specifications from https://github.com/The-Sequence- Ontology/Specifications/blob/master/gff3.md positional arguments: gff_file Input gff-file. fasta Input fasta sequence. optional arguments: -h, --help show this help message and exit -a, --accession Bolean. Accession number(s) for the entry. Default value: XXX. The proper value is automatically filled up by ENA during the submission by a unique accession number they will assign. The accession number is used to set up the AC line and the first token of the ID line as well. 
Please visit [this page](https://www.ebi.ac.uk/ena/submit/accession- number-formats) and [this one](https://www.ebi.ac.uk/ena/submit/sequence- submission) to learn more about it. Activating the option will set the Accession number with the fasta sequence identifier. -c CREATED, --created CREATED Creation time of the original entry. The default value is the date of the day. -d {CON,PAT,EST,GSS,HTC,HTG,MGA,WGS,TSA,STS,STD}, --data_class {CON,PAT,EST,GSS,HTC,HTG,MGA,WGS,TSA,STS,STD} Data class of the sample. Default value 'XXX'. This option is used to set up the 5th token of the ID line. -g ORGANELLE, --organelle ORGANELLE Sample organelle. No default value. -i LOCUS_TAG, --locus_tag LOCUS_TAG Locus tag prefix used to set up the prefix of the locus_tag qualifier. The locus tag has to be registered at ENA prior any submission. More information [here](https://www.ebi.ac.uk/ena/submit /locus-tags). -k KEYWORD [KEYWORD ...], --keyword KEYWORD [KEYWORD ...] Keywords for the entry. No default value. -l CLASSIFICATION, --classification CLASSIFICATION Organism classification e.g 'Eukaryota; Opisthokonta; Metazoa'. The default value is the classification found in the NCBI taxonomy DB from the species/taxid given as --species parameter. If none is found, 'Life' will be the default value. -m {genomic DNA,genomic RNA,mRNA,tRNA,rRNA,other RNA,other DNA,transcribed RNA,viral cRNA,unassigned DNA,unassigned RNA}, --molecule_type {genomic DNA,genomic RNA,mRNA,tRNA,rRNA,other RNA,other DNA,transcribed RNA,viral cRNA,unassigned DNA,unassigned RNA} Molecule type of the sample. No default value. -o OUTPUT, --output OUTPUT Output filename. -p PROJECT_ID, --project_id PROJECT_ID Project ID. Default is 'XXX' (This is used to set up the PR line). -q, --quiet Decrease verbosity. -r {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}, --transl_table {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25} Translation table. No default. 
(This is used to set up the translation table qualifier transl_table of the CDS features.) Please visit [NCBI genetic code](https: //www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi) for more information. -s SPECIES, --species SPECIES Sample species, formatted as 'Genus species' or taxid. No default. (This is used to set up the OS line.) -t {linear,circular}, --topology {linear,circular} Sequence topology. No default. (This is used to set up the Topology that is the 3rd token of the ID line.) -v, --verbose Increase verbosity. -x {PHG,ENV,FUN,HUM,INV,MAM,VRT,MUS,PLN,PRO,ROD,SYN,TGN,UNC,VRL}, --taxonomy {PHG,ENV,FUN,HUM,INV,MAM,VRT,MUS,PLN,PRO,ROD,SYN,TGN,UNC,VRL} Source taxonomy. Default value 'XXX'. This option is used to set the taxonomic division within ID line (6th token). -z, --gzip Gzip output file. --ah {One of the parameters above}, --advanced_help {One of the parameters above} Display advanced information of the parameter specified or of all parameters if none specified. --de DE Description. Default value 'XXX'. --ra RA [RA ...], --author RA [RA ...] Author for the reference. No default value. --rc RC Reference Comment. No default value. --rg RG Reference Group, the working groups/consortia that produced the record. Default value 'XXX'. --rl RL Reference publishing location. No default value. --rt RT Reference Title. No default value. --rx RX Reference cross-reference. No default value --email EMAIL Email used to fetch information from NCBI taxonomy database. Default value 'EMBLmyGFF3@tool.org'. --expose_translations Copy feature and attribute mapping files to the working directory. They will be used as mapping files instead of the default internal JSON files. You may modify them as it suits you. --force_unknown_features Force to keep feature types not accepted by EMBL. /!\ Option not suitable for submission purpose. --force_uncomplete_features Force to keep features whithout all the mandatory qualifiers. /!\ Option not suitable for submission purpose. 
--interleave_genes Print gene features with interleaved mRNA and CDS features. --keep_duplicates Do not remove duplicate features during the process. /!\ Option not suitable for submission purpose. --locus_numbering_start LOCUS_NUMBERING_START Start locus numbering with the provided value. --no_progress Hide conversion progress counter. --no_wrap_qualifier By default there is a line wrapping at 80 characters. The cut is at the world level. Activating this option will avoid the line-wrapping for the qualifiers. --shame Suppress the shameless plug. --translate Include translation in CDS features. --use_attribute_value_as_locus_tag USE_ATTRIBUTE_VALUE_AS_LOCUS_TAG Use the value of the defined attribute as locus_tag. --uncompressed_log Some logs can be compressed for better lisibility, they won't. --version VERSION Sequence version number. The default value is 1. --strain STRAIN Strain from which sequence was obtained. May be needed when organism belongs to Bacteria. --environmental_sample Bolean. Identifies sequences derived by direct molecular isolation from a bulk environmental DNA sample with no reliable identification of the source organism. May be needed when organism belongs to Bacteria. --isolation_source ISOLATION_SOURCE Describes the physical, environmental and/or local geographical source of the biological sample from which the sequence was derived. Mandatory when environmental_sample option used. --isolate ISOLATE Individual isolate from which the sequence was obtained. May be needed when organism belongs to Bacteria.
%%bash
# Convert the combined GFF3 plus the genome FASTA into an EMBL flat file.
# combined_gff, fasta, locus_tag, pid and embl_out come from the notebook
# environment (%env). Each is quoted and checked with ${var:?}: if one were
# unset and unquoted, the remaining arguments would silently shift position
# (e.g. the FASTA being taken as the GFF), so abort before starting this
# multi-hour run instead.
# All tool output (stdout and stderr) goes to emblymygff3.log; the timing
# summary from the shell's `time` keyword is not part of that redirection.
time \
EMBLmyGFF3 "${combined_gff:?combined_gff is not set}" \
"${fasta:?fasta is not set}" \
--topology linear \
--molecule_type 'genomic DNA' \
--transl_table 1 \
--species 'Panopea generosa' \
--locus_tag "${locus_tag:?locus_tag is not set}" \
--project_id "${pid:?pid is not set}" \
--output "${embl_out:?embl_out is not set}" \
&> emblymygff3.log
real 174m10.906s user 159m20.296s sys 14m43.564s
%%bash
# Preview the first 50 lines of the EMBL flat file.
# Quoted so that an unset/empty embl_out yields a "cannot open" error rather
# than leaving head reading from stdin.
head -n 50 -- "${embl_out}"
ID XXX; XXX; linear; genomic DNA; XXX; XXX; 89643857 BP. XX AC XXX; XX AC * _Scaffold_01 XX PR Project:PRJEB36299; XX DT 12-Mar-2020 (Rel. 133, Created) XX DE XXX XX KW . XX OS Panopea generosa XX OC cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; OC Protostomia; Spiralia; Lophotrochozoa; Mollusca; Bivalvia; Heterodonta; OC Euheterodonta; Myoida; Hiatelloidea; Hiatellidae; Panopea. XX RN [1] RP 1-89643857 RG XXX RT ; RL Submitted (12-MAR-2020) to the INSDC. XX FH Key Location/Qualifiers FH FT source 1..89643857 FT /mol_type="genomic DNA" FT /organism="Panopea generosa" FT gene 2..4719 FT /locus_tag="PGEN_LOCUS1" FT /note="source:GenSAS_5d9637f372b5d-publish" FT /note="ID:PGEN_.00g000010" FT /standard_name="PGEN_.00g000010" FT mRNA join(2..125,1995..2095,3325..3495,4651..4719) FT /locus_tag="PGEN_LOCUS1" FT /note="source:GenSAS_5d9637f372b5d-publish" FT /note="ID:PGEN_.00g000010.m01" FT /standard_name="PGEN_.00g000010.m01" FT CDS join(<2..125,1995..2095,3325..3495,4651..4719) FT /standard_name="PGEN_.00g000010.m01.CDS01" FT /standard_name="PGEN_.00g000010.m01.CDS02" FT /standard_name="PGEN_.00g000010.m01.CDS03" FT /standard_name="PGEN_.00g000010.m01.CDS04" FT /locus_tag="PGEN_LOCUS1" FT /codon_start=1 FT /note="source:GenSAS_5d9637f372b5d-publish" FT /note="ID:PGEN_.00g000010.m01.CDS01"
%%bash
# Long listing of the working directory, oldest first, human-readable sizes.
ls -l -t -r -h
total 5.7G -rw-rw-r-- 1 sam sam 914M Mar 3 15:08 Panopea-generosa-v1.0.fa -rw-rw-r-- 1 sam sam 658 Mar 3 21:08 Panopea-generosa-v1.0.fa.fai -rw-rw-r-- 1 sam sam 215 Mar 12 15:59 pgen_v074_ena_submission_manifest.tab -rw-rw-r-- 1 sam sam 844M Mar 12 17:11 Panopea-generosa-v1.0.a4.combined.gff -rw-rw-r-- 1 sam sam 2.3G Mar 12 22:25 Panopea-generosa-v1.0.a4.embl -rw-rw-r-- 1 sam sam 805M Mar 12 22:25 emblymygff3.log -rw------- 1 sam sam 322M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3 -rw------- 1 sam sam 358M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeat_region.gff3 -rw------- 1 sam sam 53M Mar 12 23:57 Panopea-generosa-v1.0.a4.CDS.gff3 -rw------- 1 sam sam 55M Mar 12 23:57 Panopea-generosa-v1.0.a4.exon.gff3 -rw------- 1 sam sam 9.1M Mar 12 23:57 Panopea-generosa-v1.0.a4.mRNA.gff3 -rw------- 1 sam sam 9.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.gene.gff3 -rw------- 1 sam sam 2.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.tRNA.gff3 -rw------- 1 sam sam 17M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LINE.gff3 -rw------- 1 sam sam 1.4K Mar 12 23:57 Panopea-generosa-v1.0.a4.rRNA.gff3 -rw------- 1 sam sam 9.7M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.SINE.gff3 -rw------- 1 sam sam 4.4M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3 -rw------- 1 sam sam 726K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LTR.gff3 -rw------- 1 sam sam 5.2M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.DNA.gff3 -rw------- 1 sam sam 136K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.RC.gff3
%%bash
# Display the ENA submission manifest.
cat -- pgen_v074_ena_submission_manifest.tab
STUDY PRJEB36299 SAMPLE ERS4366161 ASSEMBLYNAME PGEN-v1.0 ASSEMBLY_TYPE isolate COVERAGE 86 PROGRAM Proximo PLATFORM Hi-C,HiSeq4000 MOLECULETYPE genomic DNA FLATFILE Panopea-generosa-v1.0.a4.embl.gz
%%bash
# Display the ENA submission manifest.
cat -- pgen_v074_ena_submission_manifest.tab
STUDY PRJEB36299 SAMPLE ERS4366161 ASSEMBLYNAME PGEN-v1.0 ASSEMBLY_TYPE isolate COVERAGE 86 PROGRAM Proximo PLATFORM Hi-C,HiSeq4000 MOLECULETYPE genomic DNA FLATFILE Panopea-generosa-v1.0.a4.embl.gz
%%bash
# Compress the EMBL flat file and time the compression.
# Uses the same embl_out variable as the other cells, falling back to the
# original literal name when it is not set. --keep leaves the uncompressed
# file in place next to the new .gz.
embl_file="${embl_out:-Panopea-generosa-v1.0.a4.embl}"

time \
gzip --keep -- "${embl_file}"

# Show the directory afterwards, oldest first, human-readable sizes.
ls -ltrh
total 6.1G -rw-rw-r-- 1 sam sam 914M Mar 3 15:08 Panopea-generosa-v1.0.fa -rw-rw-r-- 1 sam sam 658 Mar 3 21:08 Panopea-generosa-v1.0.fa.fai -rw-rw-r-- 1 sam sam 215 Mar 12 15:59 pgen_v074_ena_submission_manifest.tab -rw-rw-r-- 1 sam sam 844M Mar 12 17:11 Panopea-generosa-v1.0.a4.combined.gff -rw-rw-r-- 1 sam sam 397M Mar 12 22:25 Panopea-generosa-v1.0.a4.embl.gz -rw-rw-r-- 1 sam sam 2.3G Mar 12 22:25 Panopea-generosa-v1.0.a4.embl -rw-rw-r-- 1 sam sam 805M Mar 12 22:25 emblymygff3.log -rw------- 1 sam sam 322M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3 -rw------- 1 sam sam 358M Mar 12 23:56 Panopea-generosa-v1.0.a4.repeat_region.gff3 -rw------- 1 sam sam 53M Mar 12 23:57 Panopea-generosa-v1.0.a4.CDS.gff3 -rw------- 1 sam sam 55M Mar 12 23:57 Panopea-generosa-v1.0.a4.exon.gff3 -rw------- 1 sam sam 9.1M Mar 12 23:57 Panopea-generosa-v1.0.a4.mRNA.gff3 -rw------- 1 sam sam 9.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.gene.gff3 -rw------- 1 sam sam 2.5M Mar 12 23:57 Panopea-generosa-v1.0.a4.tRNA.gff3 -rw------- 1 sam sam 17M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LINE.gff3 -rw------- 1 sam sam 1.4K Mar 12 23:57 Panopea-generosa-v1.0.a4.rRNA.gff3 -rw------- 1 sam sam 9.7M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.SINE.gff3 -rw------- 1 sam sam 4.4M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3 -rw------- 1 sam sam 726K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.LTR.gff3 -rw------- 1 sam sam 5.2M Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.DNA.gff3 -rw------- 1 sam sam 136K Mar 12 23:57 Panopea-generosa-v1.0.a4.repeats.RC.gff3
real 2m36.904s user 2m34.020s sys 0m0.756s