%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Mon Feb 20 12:04:26 PM PST 2023 ------------ Distributor ID: Ubuntu Description: Ubuntu 22.04.1 LTS Release: 22.04 Codename: jammy ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Address sizes: 45 bits physical, 48 bits virtual Byte Order: Little Endian CPU(s): 4 On-line CPU(s) list: 0-3 Vendor ID: GenuineIntel Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz CPU family: 6 Model: 165 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 4 Stepping: 2 BogoMIPS: 4800.01 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities Hypervisor vendor: VMware Virtualization type: full L1d cache: 128 KiB (4 instances) L1i cache: 128 KiB (4 instances) L2 cache: 1 MiB (4 instances) L3 cache: 64 MiB (4 instances) NUMA node(s): 1 NUMA node0 CPU(s): 0-3 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Mitigation; PTE Inversion Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Meltdown: Mitigation; PTI Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Retbleed: Mitigation; IBRS Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS Not affected Vulnerability Srbds: Unknown: Dependent on hypervisor status Vulnerability Tsx async abort: Not affected ------------ Memory Specs total used free shared buff/cache available Mem: 54Gi 4.4Gi 42Gi 198Mi 7.4Gi 49Gi Swap: 2.0Gi 0B 2.0Gi
No LSB modules are available.
%env
indicates a bash variable
without %env
is Python variable
# Set directories, input/output files
%env data_dir=/home/sam/data/M_capitata/genomes
%env analysis_dir=/home/sam/analyses/20230127-pver-gff_to_gtf
analysis_dir="20230127-pver-gff_to_gtf"
# Input files (from NCBI)
%env gff=Pver_genome_assembly_v1.0.gff3
# URL of file directory
%env url=https://owl.fish.washington.edu/halfshell/genomic-databank
# Output file(s)
%env valid_gff=Pver_genome_assembly_v1.0-valid.gff3
%env gtf=Pver_genome_assembly_v1.0-valid.gtf
# Set program locations
%env gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread
# Set some formatting stuff
%env break_line=--------------------------------------------------------------------------
env: data_dir=/home/sam/data/M_capitata/genomes env: analysis_dir=/home/sam/analyses/20230127-pver-gff_to_gtf env: gff=Pver_genome_assembly_v1.0.gff3 env: url=https://owl.fish.washington.edu/halfshell/genomic-databank env: valid_gff=Pver_genome_assembly_v1.0-valid.gff3 env: gtf=Pver_genome_assembly_v1.0-valid.gtf env: gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread env: break_line=--------------------------------------------------------------------------
%%bash
# Make analysis and data directory, if doesn't exist
mkdir --parents "${analysis_dir}"
mkdir --parents "${data_dir}"
%%bash
cd "${data_dir}"
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading fie if it's already been downloaded.
# Use --no-check-certificate to avoid download error from gannet
wget --quiet \
--continue \
--no-check-certificate \
${url}/${gff}
ls -ltrh "${gff}"
-rw-rw-r-- 1 sam sam 71M Mar 23 2020 Pver_genome_assembly_v1.0.gff3
%%bash
head -n 20 "${data_dir}"/"${gff}"
# ORIGINAL: Pver_g1.t2 original gene structure, not modified by PASA Pver_Sc0000000_size2095917 . gene 13766 20466 . + . ID=Pver_gene_g1;Name=Pver_g1.t1 Pver_Sc0000000_size2095917 . mRNA 13766 20466 . + . ID=Pver_g1.t2;Parent=Pver_gene_g1;Name=Pver_g1.t1 Pver_Sc0000000_size2095917 . five_prime_UTR 13766 14013 . + . ID=Pver_g1.t2.utr5p1;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 13766 14098 . + . ID=Pver_g1.t2.exon1;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 14014 14098 . + 0 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 16629 16667 . + . ID=Pver_g1.t2.exon2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 16629 16667 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 17615 17698 . + . ID=Pver_g1.t2.exon3;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 17615 17698 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 18109 18420 . + . ID=Pver_g1.t2.exon4;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 18109 18420 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 18845 19071 . + . ID=Pver_g1.t2.exon5;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 18845 19071 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 19404 19581 . + . ID=Pver_g1.t2.exon6;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 19404 19581 . + 0 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 19848 20466 . + . ID=Pver_g1.t2.exon7;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 19848 19873 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . three_prime_UTR 19874 20466 . + . ID=Pver_g1.t2.utr3p1;Parent=Pver_g1.t2 #PROT Pver_g1.t2 Pver_gene_g1 MLTRYCLGSQKLTPLIGNVTVTFIIPEKELSQPSCILFNFQDFKTHLSKIMMYSPLTFVLFVALTFQSTVAIEYSRIGCYRDTLVKPRPLPELIENFRGGRVDWNNLNNTIAACAEAAKKKGYLYFGLQFYGECWSGPQAQLTYARDGPSKNCSKGVGEERANFVYKIKLLEKENECTTYRVLDSADRSKTNVNTVSQGDKCDHWNSGFVRNAWYRFTGAAGQTMADECVQAGSCQTTMAGWMNGTHPKVFDGIQRRKACFSSESNPYKRQNNNCCERQIYIHVRNCGEFYVYKLPSTPGCFLRYCGSGVSQNKNA*
%%bash
# Capture number of fields (NF) in each row in array.
field_count_array=($(awk -F "\t" '{print NF}' "${data_dir}/${gff}" | sort --unique))
# Check array contents
echo "List of number of fields in ${data_dir}/${gff}:"
echo ""
for field_count in "${field_count_array[@]}"
do
echo "${field_count}"
done
echo ""
echo "${break_line}"
echo ""
# Preview of each line "type" with a given number of fields
# Check array contents
echo ""
for field_count in "${field_count_array[@]}"
do
echo "Preview of lines with ${field_count} field(s):"
echo ""
awk \
-v field_count="${field_count}" \
-F "\t" \
'NF == field_count' \
"${data_dir}/${gff}" \
| head
echo ""
echo "${break_line}"
echo ""
done
List of number of fields in /home/sam/data/M_capitata/genomes/Pver_genome_assembly_v1.0.gff3: 1 10 11 2 9 -------------------------------------------------------------------------- Preview of lines with 1 field(s): # ORIGINAL: Pver_g1.t2 original gene structure, not modified by PASA # PASA_UPDATE: Pver_g2.t1, single gene model update, valid-1, status:[pasa:asmbl_2,status:12], valid-1 # PASA_UPDATE: Pver_g3.t1, single gene model update, valid-1, status:[pasa:asmbl_3,status:8], valid-1 # PASA_UPDATE: Pver_g4.t1, single gene model update, valid-1, status:[pasa:asmbl_4,status:13], valid-1 # ORIGINAL: Pver_g5.t1 original gene structure, not modified by PASA # PASA_UPDATE: Pver_g6.t1, single gene model update, valid-1, status:[pasa:asmbl_6,status:8], valid-1 # ORIGINAL: Pver_g7.t1 original gene structure, not modified by PASA # PASA_UPDATE: Pver_g8.t1, single gene model update, valid-1, status:[pasa:asmbl_8,status:8], valid-1 # PASA_UPDATE: Pver_g9.t1, single gene model update, valid-1, status:[pasa:asmbl_11,status:13], valid-1 # PASA_UPDATE: Pver_g10.t1, single gene model update, valid-1, status:[pasa:asmbl_12,status:13], valid-1 -------------------------------------------------------------------------- Preview of lines with 10 field(s): Pver_Sc0000004_size1560107 . CDS 1538 1660 . - 2 ID=cds.Pver_g535.t1;Parent=Pver_g535.t1 3_prime_partial true Pver_Sc0000005_size1451149 . CDS 6000 6369 . + 0 ID=cds.Pver_g658.t1;Parent=Pver_g658.t1 5_prime_partial true Pver_Sc0000015_size1181740 . CDS 1180243 1180328 . + 0 ID=cds.Pver_g1699.t2;Parent=Pver_g1699.t2 3_prime_partial true Pver_Sc0000020_size1082251 . CDS 991 1066 . + 0 ID=cds.Pver_g1987.t2;Parent=Pver_g1987.t2 5_prime_partial true Pver_Sc0000023_size1082203 . CDS 1 215 . - 2 ID=cds.Pver_g2184.t1;Parent=Pver_g2184.t1 3_prime_partial true Pver_Sc0000034_size904800 . CDS 3640 4620 . - 0 ID=cds.Pver_g2932.t1;Parent=Pver_g2932.t1 3_prime_partial true Pver_Sc0000040_size808600 . CDS 2 248 . + 0 ID=cds.Pver_g3290.t1;Parent=Pver_g3290.t1 5_prime_partial true Pver_Sc0000048_size787391 . CDS 4784 4886 . + 0 ID=cds.Pver_g3688.t1;Parent=Pver_g3688.t1 5_prime_partial true Pver_Sc0000053_size770660 . CDS 769075 769758 . + 2 ID=cds.Pver_g3960.t1;Parent=Pver_g3960.t1 3_prime_partial true Pver_Sc0000055_size736968 . CDS 350 452 . + 0 ID=cds.Pver_g4033.t1;Parent=Pver_g4033.t1 5_prime_partial true -------------------------------------------------------------------------- Preview of lines with 11 field(s): Pver_Sc0002193_size7256 . CDS 1 1630 . - 0 ID=cds.Pver_g20342.t1;Parent=Pver_g20342.t1 5_prime_partial true 3_prime_partial true Pver_Sc0002913_size4362 . CDS 1 1500 . - 0 ID=cds.Pver_g20701.t1;Parent=Pver_g20701.t1 5_prime_partial true 3_prime_partial true Pver_Sc0002951_size4270 . CDS 1321 4270 . + 0 ID=cds.Pver_g20712.t1;Parent=Pver_g20712.t1 5_prime_partial true 3_prime_partial true Pver_Sc0003413_size3396 . CDS 1 3394 . - 0 ID=cds.Pver_g20894.t1;Parent=Pver_g20894.t1 5_prime_partial true 3_prime_partial true Pver_Sc0003758_size2910 . CDS 1308 2908 . - 0 ID=cds.Pver_g21008.t1;Parent=Pver_g21008.t1 5_prime_partial true 3_prime_partial true Pver_Sc0005005_size1901 . CDS 2 1901 . + 0 ID=cds.Pver_g21369.t1;Parent=Pver_g21369.t1 5_prime_partial true 3_prime_partial true Pver_Sc0005269_size1786 . CDS 1 1785 . - 0 ID=cds.Pver_g21442.t1;Parent=Pver_g21442.t1 5_prime_partial true 3_prime_partial true Pver_Sc0006090_size1463 . CDS 2 1463 . + 0 ID=cds.Pver_g21627.t1;Parent=Pver_g21627.t1 5_prime_partial true 3_prime_partial true Pver_Sc0006735_size1286 . CDS 1 1285 . - 0 ID=cds.Pver_g21746.t1;Parent=Pver_g21746.t1 5_prime_partial true 3_prime_partial true Pver_Sc0007030_size1213 . CDS 1 1213 . - 0 ID=cds.Pver_g21783.t1;Parent=Pver_g21783.t1 5_prime_partial true 3_prime_partial true -------------------------------------------------------------------------- Preview of lines with 2 field(s): #PROT Pver_g1.t2 Pver_gene_g1 MLTRYCLGSQKLTPLIGNVTVTFIIPEKELSQPSCILFNFQDFKTHLSKIMMYSPLTFVLFVALTFQSTVAIEYSRIGCYRDTLVKPRPLPELIENFRGGRVDWNNLNNTIAACAEAAKKKGYLYFGLQFYGECWSGPQAQLTYARDGPSKNCSKGVGEERANFVYKIKLLEKENECTTYRVLDSADRSKTNVNTVSQGDKCDHWNSGFVRNAWYRFTGAAGQTMADECVQAGSCQTTMAGWMNGTHPKVFDGIQRRKACFSSESNPYKRQNNNCCERQIYIHVRNCGEFYVYKLPSTPGCFLRYCGSGVSQNKNA* #PROT Pver_g2.t1 Pver_gene_g2 MITPQFLLLQFFLSCLIAAEENEKINLFRDEEDFHYEPIGCFGDKLDVPRPLPLLIRNYRSRPYRVDWNNINNTIQACAKEVKKAGYVYFGLQFYGECWSGPHAHLTYDEDGKSTRCVNGVGKRMANFVYRLVFKECKEYRILQAPDRSIHHPYTPSAPCDISLKPSWYRFEGRAGTAMANSCPSRFKCGTIVPGWLNGPLPSVQDGIVSREICFNQDRDCCFTSTQAKVRNCAGFYVFYLRSAPYCQLRYCGNGRIG* #PROT Pver_g3.t1 Pver_gene_g3 MNLLVLCSVFLLYFGYYTRCADASQYVKVGCFRDKLSPRARALPELLANYRGNINWNNLMEVVEKCAKKAKEKNYMYFAIQFYGECWSGATAPKTYDRYGRSSPCTSNSVVGTALTNVVYRFAGDEQECVKYEELNKVDRSESSRLISGTSPACDKELKPGWYRFQKPAGNLMASKCLPSARCGTVVTGWLSSPHPKMTDGIVNGKVCYSWKSDCCKWDQDIKLRNCGRFYVYKLAGTQACPMGFCGSAASV* #PROT Pver_g4.t1 Pver_gene_g4 MKLKTLAAVAFLCLEYSLSLTDASQFVKVGCFKDKRSPRARALPELIANYRGNIDWNNTMEVVDKCAKRAKEKNYMYFGIQFYGECWSGATAPLTYDRYGRSLHCTSEVGGMHANLVYRFVGEESECLQYLQLSTRDRSVESDPPAGLAAICDNSLSPRWYRFLRPAGDKMASKCVPTQKCGTAVTGWLSSPHPKVSDGIVNGKVCFHWLNECCFRDQVIKVRDCGRFFVYKLNKPEGCPMRFCGSNVE* #PROT Pver_g5.t1 Pver_gene_g5 MRRIMEVLYSLLVIFFMADTCCSQTCKNANWWHTFDREGWSYCDSENQYITGLWRNDNKGSNDGIYLIEHAKCCFAPYGVHAQDIPASCKKANWWKVLDGTNKWATCPDGYFMGGLYRTGKHHWLHNIEEARCCKPVGLPEKYADCYDENVWSSFDGKGLSECKRKGYYMAGIFRGECDKLYCLEKFKCCRMIAFMASSQQRYNIGVKSKALITLSIALVSFSHSQEVAVRDSEPQSKQLTPIPEFRPIGCFVDSGAIPRPIPKLVANFRGNIDWHNLNKTVLDCARRVNSKGFRYFGIQFYGECWSGENAELTYNKQGTSKNCFRGLGERKANFVYAFVVKEINARLANGSSSRSGRVEVFHRGSWGAVCGKEWDIRDADVVCRMLGYPGALNAYQDDRYRSGKGRVRLGDVQCNGNEENLAFCPYKDYSVCSQSGEAGVKCRSVSDSIQPRIPYRLSGGNSRSEGRVEVYHDRKWGTICDRGWDLIDAGIFCRMLAYTGARATPKYGQGSGPIWLSDVKCIGTEDSIFRCENAGWKNVDNCDHSNDAGAECYYD* #PROT Pver_g6.t1 Pver_gene_g6 MKSAIYSLLIILFLADTCCSQTCSNANWWISFDGEGWSYCDHKNQYMTGLWRNDAKGSDDGIYLIEYAKCCVAPYGMKDVPASCKTANWWGVLDSNNKWATCPDGYFMGGLYRTGDDPWLHNIEEARCCKPEGLPDRYADCYIENVWGSFDGKGLSECKREGYYMAGIFRGDCDKLYCLEEFKCCRMIAIEGPPMLADETKSKE* #PROT Pver_g7.t1 Pver_gene_g7 MNIILFKSLKVILERSETILCSPYYPNIHRIEKKRTKDTVNFIMKKFLLFSMATLMMMDQCWSQCRKANWWGSFDKEGWSKCASSVEYLKGFYRNNKNNNDPIYLLEEGRCCKAPPPNQNQASTCKNANWWGVLDKTNRWAFCPTGYFLQGLYRSKNHNIHNIEEGHCCKPNNLPSSYLRCYEHDISSSFDNKGWSECDSDHYLTGVYRGGCDKLQCIEKIKCCMMPDSCKMANWWKAFDKKGWVQCDSTKHYITGLYRNNNWGKNDKIFLLEEAKCCPAPPPYQNTGSTCRDANWWGVLDKTNSWAVCPAGYFVRGFYRNNGAWLHHLEMGKCCKPNGFPDRYEHCYNEDVKSSFDRRGLSKCQREGYYLAGIFRGGCDYLYCIESFKCCKMNVDECRTKNPCQNGAACSDQPGTYKCTCKSGFTGKNCESDINECSKSPCKNGAKCVNLKGSYRCDCKSGYTGKNCESDKNECSANPCKNGATCVNLQGSYRCDCKSGYTGNHCESDKNECSTSPCKNGATCVNLQGSYRCDCKSGYTGKHCDSDVNECSNNPCKHGATCVNLQGGHRCDCKSGYTGSSCESDINECSNSPCKNGATCVNLQGSYRCDCKTGYTGNNCETDVNECSNNPCKNGATCVNIQGGHRCDCKSGYTGISCESDINECSNSPCKNGATCVNLSGSYRCDCKTGYTGNNCETDIDECSNNPCQNGALCANLQGSYRCDCKTGYTGNQCQTDINECAPAPCQNGGTCVDLVGSFRCDCPAEFEGANCENAAENGIEECEM* #PROT Pver_g8.t1 Pver_gene_g8 MKLLLASLVALIISDVFTTLEGKPSTGRSFLKPRILSRGRRAVPDVSSIRIDIRSEGCNDPGKTPNTCGRAYIKVNGNEHSKKKRGHNVVVLDAITGIVEHSVSFDTHGSTAAANQLKDFLNGIAGDKIILIAVQDEGSRYLKPAFDALMKIGAHDLDFLNHRGSYALVGYSREKKPSYVQQVQNKGGKGPSVISTTVPLTKNPFVDIDIRSEGCNDPNKKPDTCGIAYIKVDGKDHSLHRRGHNVVIVERKTGRVLKSEAFDTHGDGSAGTRLKDFLNAQGEDKIVLVAVQDSAASHVGVALDSLRRVGAIDPILVEFRGSYALIGTLDANKPPWVTQDQHPRYKGPSEISIRIPLSDCQKALGMENYGIPNGKVRASSEWDSNHAAIQGRLHYKPPRGKQGAWSARHNNINQWLQVDLGSAFIKVTGVATQGRYNYDQWVTKYKLQYSNDGATFTYYKEPGQTADMVFSGNTDRNTVVYHFLNAPVTARYIRFRPVTWSGHISMRVELYGCSACEKALGMASYAIPNGQVKASSEWDPNHAAIQGRLHYLPPPGKQGGWSAKYNKANQWLQIDLGALFRVTAVATQGRSNYNQWVTKYKLQYSDNGATFTYYMEAGQNVAKEFVANKDRNTVVYHSLNPPKTTHFIRFRPVAWKSHISMRVEVYGCSACGEALGMASYAIPNGQVKASSEWDPNHAAIQGRLHYLPPPGKQGGWSAKHNNANQWLQIDLGALLKVTAVATQGRSNHDQWVTKYKLQYSDDGATFTYYMEAGQSVAKEFVANKDRSSVVYHSLNPPKLTRFIRFRPVSWKSHISMRVEVYGCSARDYCAQNPCKNGATCSNVEEGYQCTCKPGYSGAQCDQDINECTNSPCQNGATCVNLQGSYRCDCKSGYNGNKCENDINECSNNPCKNGATCVNLQGSYSCDCKSGYNGNNCENDINECTNSPCKNGATCVNLQGSYRCDCKSGYDGNNCETDINECSNNPCKNGATCVNLQGSHRCDCKSGYDGNNCENDINECSNSPCKNGATCVNLVGSYRCDCKSGYDGNNCENDVNECTNNPCKNGATCVNLSGGHRCDCAKGYSGSSCETAINYCAPDPCLNGGTCVDLVDGFRCDCAAGWLGIICDEPEVDECEE* #PROT Pver_g9.t1 Pver_gene_g9 MEFAFNINDLLPDLITVVDDKLAPFRSRIKNDRYEFANKQSQLKIVIDHMGEASTRAQGLRAPITSALKLQHSDHRLYVMKDSAANNGQGAVVGILKVGHKKLFLMDMQSVQYEVNPLCVLDFYVHESRQRTGCGKRLFEHMLKMEGRMVYQLAIDRPSHKFVSFLKKYHGFKNAIPQANNFVIHEHFFSDLQAVGHVPRRSYRFANASLSGKPPIHTYRKTSHARDTPPLLLRRQSSSSRQSSRPNSGKEFSHDTAHSGESEPSALQRINNSLQDINLPPVVPRPSSRNSAPGSRGSSAGRRSVTRQMELGTPEATGHVSSETYNASRGLAARATSYSRHSNRGNSADFSKTKKDVVVTGTKVSDNFYRNNSGGRARSLHLTSVENPFLGHGVMEKSSLQRKQERTGTQEQMASTMVPPEEGAQDQNFNITDKKETSETSQQSGITPSGSFTVGTMFKAHFGNRQPFIGSSWNIFGVPTFMNKDSNSAHYAYTRRTQSRHHPF* #PROT Pver_g10.t1 Pver_gene_g10 MIAETVEGWLTILAEDGNEWLSRLCVLFSNEKKLRTFVDDVEYLDCARFLAATAKIKAKLCDSPDLILNPTHEGLSRAHRLSRRSRSENALSAALNGNRSKLSDFFSKKIADGKRNRSLTRSLFLPGTNNYDSGFSSLSPPTIPRWRSHETLESKMKTTSPDSVDLSISGHVTVRPIHPSILSRQHCFQVTTPHETKYFSCRSSQELEKWVVSIKKSIQPTRDVQYRTDIGLTVWIVEAKGLTDKPKRRFYCELFFDKVLYAKTSSKTKSDILFWGENFAFNDLSEVKTVTVQIYKETDGKNKKDKRKPVGSVDVPVDSLEMGVEVEKWYPVTMESNKTNGGEASSIRLRFKYQKVSILPVRSYVELSEYLNRNYMTICKALEPVVSVKQKDEISRVLLRILQACGKATEFLSTIVMDEVNNLEDENLVFRGNSFATKAMDSYMKMIGESYLQDTLGDFVQSVYEFEEDCEVDPSKKSSGNLESNKENLKTFVEKAWNLIMCSACLFPSDLKVTFHEFRRWCEGRSEDLSTKLISGSIFLRFLCPAILSPSLFHLVQEYPDERTSRALTLIAKVIQNLANFTRFGGKEEYMDFMNGFVEKEFVNMRKFLNEISSRTDTTENHYEGIIDIGRELALMFNMLKDQTSKMNQDALETLRPLQFILKSLQEIYHDSERKFAVGNESLVAHRKWASSSDLVSDQDKNNITLTPVSQIRIMKSPNKRTQNPPIRTSSESNLVFNGGTAATTPEHLMTSRYDESILSPISDIDPPSLSPRLRANYDDDLQSSFRRRSYRRAIASDEVSKAIPGAIERERSSSERRSLTRNSDSHRRSQSESPHVISSVDSGSPTEARVIYKGELSNSRSSLKSKESANHRHYRAVYDSRQRDGKDGCLKRRSLPRGVPPGSSPIITDRSRTSSPEPPPPPRPTAPKPLLSSASVDPSHSSPRLNNHDSSRFLNPAVGRVARSPSGTSSSSSGSEESWHSVGSSVEGGGGGGGLARRRARKMPRTPSPEDRRRGSGSNPWERMPPRAEDAFDNGYDVPPPKTVAEYEKELQDLREELLQTKRTLASTHEQLILQESSTHKLVASFKERLAESESSLQQLRQEKDKEMKELLDRLVSVETELRQEQREMQEVIQAKQVIIEAQERRIKSTDSSNAKLIATLNQVGGSRASNKVLNYPSSDL* -------------------------------------------------------------------------- Preview of lines with 9 field(s): Pver_Sc0000000_size2095917 . gene 13766 20466 . + . ID=Pver_gene_g1;Name=Pver_g1.t1 Pver_Sc0000000_size2095917 . mRNA 13766 20466 . + . ID=Pver_g1.t2;Parent=Pver_gene_g1;Name=Pver_g1.t1 Pver_Sc0000000_size2095917 . five_prime_UTR 13766 14013 . + . ID=Pver_g1.t2.utr5p1;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 13766 14098 . + . ID=Pver_g1.t2.exon1;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 14014 14098 . + 0 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 16629 16667 . + . ID=Pver_g1.t2.exon2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 16629 16667 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 17615 17698 . + . ID=Pver_g1.t2.exon3;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . CDS 17615 17698 . + 1 ID=cds.Pver_g1.t2;Parent=Pver_g1.t2 Pver_Sc0000000_size2095917 . exon 18109 18420 . + . ID=Pver_g1.t2.exon4;Parent=Pver_g1.t2 --------------------------------------------------------------------------
In the above results, we can see that there are rows with 10 and 11 fields, due to incorrect tabs entered in Field 9. This creates an invalid GFF3 file.
This will remove tabs in Field 9 and join those extra fields into Field 9 using a semi-colon separator with the Note=
attribute.
%%bash
time \
while read -r line
do
# Count number of fields
number_of_fields=$(echo "${line}" | awk -F "\t" '{print NF}')
# If number of fields is 9, print the entire line
if [[ "${number_of_fields}" -eq 9 ]]; then
echo "${line}"
# If there are 10 fields, cut/capture the first 9 and
# capture the 10th.
# Use printf to join 10th field to 9th via semi-colon
elif [[ "${number_of_fields}" -eq 10 ]]; then
first_nine=$(echo "${line}" | cut -f 1-9)
tenth=$(echo "${line}" | cut -f 10 )
printf "%s;%s\n" "${first_nine}" "Note=${tenth}"
# If there are 11 fields, cut/capture the first 9, followed
# by capturing the 10th and 11th fields.
# Use printf to join 10th and 11th field to 9th via semi-colon.
elif [[ "${number_of_fields}" -eq 11 ]]; then
first_nine=$(echo "${line}" | cut -f 1-9)
tenth=$(echo "${line}" | cut -f 10)
eleventh=$(echo "${line}" | cut -f 11)
printf "%s;%s;%s\n" "${first_nine}" "Note=${tenth}" "Note=${eleventh}"
else
# If the line doesn't match any of the above, just print it.
echo "${line}"
fi
done < "${data_dir}/${gff}" > "${analysis_dir}/${valid_gff}"
real 63m22.307s user 62m10.233s sys 12m37.124s
%%bash
wc -l "${data_dir}/${gff}"
wc -l "${analysis_dir}/${valid_gff}"
575514 /home/sam/data/M_capitata/genomes/Pver_genome_assembly_v1.0.gff3 575514 /home/sam/analyses/20230127-pver-gff_to_gtf/Pver_genome_assembly_v1.0-valid.gff3
%%bash
awk -F "\t" '{print NF}' "${analysis_dir}/${valid_gff}" | sort --unique
1 2 9
%%bash
cd "${data_dir}"
${gffread} -E \
"${analysis_dir}/${valid_gff}" -T \
1> ${analysis_dir}/"${gtf}" \
2> ${analysis_dir}/gffread-gff_to_gtf.stderr
%%bash
head ${analysis_dir}/"${gtf}"
Pver_Sc0000000_size2095917 . transcript 13766 20466 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1" Pver_Sc0000000_size2095917 . exon 13766 14098 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . exon 16629 16667 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . exon 17615 17698 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . exon 18109 18420 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . exon 18845 19071 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . exon 19404 19581 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . exon 19848 20466 . + . transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . CDS 14014 14098 . + 0 transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1"; Pver_Sc0000000_size2095917 . CDS 16629 16667 . + 2 transcript_id "Pver_g1.t2"; gene_id "Pver_gene_g1";
%%bash
cd "${analysis_dir}"
for file in *
do
md5sum "${file}" | tee --append checksums.md5
done
9828841097f80b93fc2c599b76432b7c gffread-gff_to_gtf.stderr 5dd8f21a4faea1f46c48a5ab253749d7 Pver_genome_assembly_v1.0-valid.gff3 c3cc8fb576bcf39dd17b6d229100aa56 Pver_genome_assembly_v1.0-valid.gtf
%%bash
${gffread} -h
gffread v0.12.7. Usage: gffread [-g <genomic_seqs_fasta> | <dir>] [-s <seq_info.fsize>] [-o <outfile>] [-t <trackname>] [-r [<strand>]<chr>:<start>-<end> [-R]] [--jmatch <chr>:<start>-<end>] [--no-pseudo] [-CTVNJMKQAFPGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>] [-j ][--ids <IDs.lst> | --nids <IDs.lst>] [--attrs <attr-list>] [-i <maxintron>] [--stream] [--bed | --gtf | --tlf] [--table <attrlist>] [--sort-by <ref.lst>] [<input_gff>] Filter, convert or cluster GFF/GTF/BED records, extract the sequence of transcripts (exon or CDS) and more. By default (i.e. without -O) only transcripts are processed, discarding any other non-transcript features. Default output is a simplified GFF3 with only the basic attributes. Options: --ids discard records/transcripts if their IDs are not listed in <IDs.lst> --nids discard records/transcripts if their IDs are listed in <IDs.lst> -i discard transcripts having an intron larger than <maxintron> -l discard transcripts shorter than <minlen> bases -r only show transcripts overlapping coordinate range <start>..<end> (on chromosome/contig <chr>, strand <strand> if provided) -R for -r option, discard all transcripts that are not fully contained within the given range --jmatch only output transcripts matching the given junction -U discard single-exon transcripts -C coding only: discard mRNAs that have no CDS features --nc non-coding only: discard mRNAs that have CDS features --ignore-locus : discard locus features and attributes found in the input -A use the description field from <seq_info.fsize> and add it as the value for a 'descr' attribute to the GFF record -s <seq_info.fsize> is a tab-delimited file providing this info for each of the mapped sequences: <seq-name> <seq-length> <seq-description> (useful for -A option with mRNA/EST/protein mappings) Sorting: (by default, chromosomes are kept in the order they were found) --sort-alpha : chromosomes (reference sequences) are sorted alphabetically --sort-by : sort the reference sequences by the order in which their names are given in the <refseq.lst> file Misc options: -F keep all GFF attributes (for non-exon features) --keep-exon-attrs : for -F option, do not attempt to reduce redundant exon/CDS attributes -G do not keep exon attributes, move them to the transcript feature (for GFF3 output) --attrs <attr-list> only output the GTF/GFF attributes listed in <attr-list> which is a comma delimited list of attribute names to --keep-genes : in transcript-only mode (default), also preserve gene records --keep-comments: for GFF3 input/output, try to preserve comments -O process other non-transcript GFF records (by default non-transcript records are ignored) -V discard any mRNAs with CDS having in-frame stop codons (requires -g) -H for -V option, check and adjust the starting CDS phase if the original phase leads to a translation with an in-frame stop codon -B for -V option, single-exon transcripts are also checked on the opposite strand (requires -g) -P add transcript level GFF attributes about the coding status of each transcript, including partialness or in-frame stop codons (requires -g) --add-hasCDS : add a "hasCDS" attribute with value "true" for transcripts that have CDS features --adj-stop stop codon adjustment: enables -P and performs automatic adjustment of the CDS stop coordinate if premature or downstream -N discard multi-exon mRNAs that have any intron with a non-canonical splice site consensus (i.e. not GT-AG, GC-AG or AT-AC) -J discard any mRNAs that either lack initial START codon or the terminal STOP codon, or have an in-frame stop codon (i.e. only print mRNAs with a complete CDS) --no-pseudo: filter out records matching the 'pseudo' keyword --in-bed: input should be parsed as BED format (automatic if the input filename ends with .bed*) --in-tlf: input GFF-like one-line-per-transcript format without exon/CDS features (see --tlf option below); automatic if the input filename ends with .tlf) --stream: fast processing of input GFF/BED transcripts as they are received ((no sorting, exons must be grouped by transcript in the input data) Clustering: -M/--merge : cluster the input transcripts into loci, discarding "redundant" transcripts (those with the same exact introns and fully contained or equal boundaries) -d <dupinfo> : for -M option, write duplication info to file <dupinfo> --cluster-only: same as -M/--merge but without discarding any of the "duplicate" transcripts, only create "locus" features -K for -M option: also discard as redundant the shorter, fully contained transcripts (intron chains matching a part of the container) -Q for -M option, no longer require boundary containment when assessing redundancy (can be combined with -K); only introns have to match for multi-exon transcripts, and >=80% overlap for single-exon transcripts -Y for -M option, enforce -Q but also discard overlapping single-exon transcripts, even on the opposite strand (can be combined with -K) Output options: --force-exons: make sure that the lowest level GFF features are considered "exon" features --gene2exon: for single-line genes not parenting any transcripts, add an exon feature spanning the entire gene (treat it as a transcript) --t-adopt: try to find a parent gene overlapping/containing a transcript that does not have any explicit gene Parent -D decode url encoded characters within attributes -Z merge very close exons into a single exon (when intron size<4) -g full path to a multi-fasta file with the genomic sequences for all input mappings, OR a directory with single-fasta files (one per genomic sequence, with file names matching sequence names) -j output the junctions and the corresponding transcripts -w write a fasta file with spliced exons for each transcript --w-add <N> for the -w option, extract additional <N> bases both upstream and downstream of the transcript boundaries --w-nocds for -w, disable the output of CDS info in the FASTA file -x write a fasta file with spliced CDS for each GFF transcript -y write a protein fasta file with the translation of CDS for each record -W for -w, -x and -y options, write in the FASTA defline all the exon coordinates projected onto the spliced sequence; -S for -y option, use '*' instead of '.' as stop codon translation -L Ensembl GTF to GFF3 conversion, adds version to IDs -m <chr_replace> is a name mapping table for converting reference sequence names, having this 2-column format: <original_ref_ID> <new_ref_ID> -t use <trackname> in the 2nd column of each GFF/GTF output line -o write the output records into <outfile> instead of stdout -T main output will be GTF instead of GFF3 --bed output records in BED format instead of default GFF3 --tlf output "transcript line format" which is like GFF but with exons and CDS related features stored as GFF attributes in the transcript feature line, like this: exoncount=N;exons=<exons>;CDSphase=<N>;CDS=<CDScoords> <exons> is a comma-delimited list of exon_start-exon_end coordinates; <CDScoords> is CDS_start:CDS_end coordinates or a list like <exons> --table output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in <attrlist>; special pseudo-attributes (prefixed by @) are recognized: @id, @geneid, @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen If any of -w/-y/-x FASTA output files are enabled, the same fields (excluding @id) are appended to the definition line of corresponding FASTA records -v,-E expose (warn about) duplicate transcript IDs and other potential problems with the given GFF/GTF records
--------------------------------------------------------------------------- CalledProcessError Traceback (most recent call last) /tmp/ipykernel_1145435/1000630337.py in <module> ----> 1 get_ipython().run_cell_magic('bash', '', '${gffread} -h\n') ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell) 2417 with self.builtin_trap: 2418 args = (magic_arg_s, cell) -> 2419 result = fn(*args, **kwargs) 2420 return result 2421 ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell) 140 else: 141 line = script --> 142 return self.shebang(line, cell) 143 144 # write a basic docstring: ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/decorator.py in fun(*args, **kw) 230 if not kwsyntax: 231 args, kw = fix(args, kw, sig) --> 232 return caller(func, *(extras + args), **kw) 233 fun.__name__ = func.__name__ 234 fun.__doc__ = func.__doc__ ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k) 185 # but it's overkill for just that one bit of state. 186 def magic_deco(arg): --> 187 call = lambda f, *a, **k: f(*a, **k) 188 189 if callable(arg): ~/programs/miniconda3/envs/gffutils_env/lib/python3.9/site-packages/IPython/core/magics/script.py in shebang(self, line, cell) 243 sys.stderr.flush() 244 if args.raise_error and p.returncode!=0: --> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err) 246 247 def _run_script(self, p, cell, to_close): CalledProcessError: Command 'b'${gffread} -h\n'' returned non-zero exit status 1.