This notebook relies on GFFutils to be installed and available in your $PATH
.
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Thu 18 Aug 2022 07:34:19 AM PDT ------------ Distributor ID: Ubuntu Description: Ubuntu 20.04.4 LTS Release: 20.04 Codename: focal ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 45 bits physical, 48 bits virtual CPU(s): 2 On-line CPU(s) list: 0,1 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 165 Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz Stepping: 2 CPU MHz: 2400.007 BogoMIPS: 4800.01 Hypervisor vendor: VMware Virtualization type: full L1d cache: 64 KiB L1i cache: 64 KiB L2 cache: 512 KiB L3 cache: 32 MiB NUMA node0 CPU(s): 0,1 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Mitigation; PTE Inversion Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Meltdown: Mitigation; PTI Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Retbleed: Mitigation; IBRS Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; IBRS, IBPB conditional, RSB filling Vulnerability Srbds: Unknown: Dependent on hypervisor status Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities ------------ Memory Specs total used free shared buff/cache available Mem: 54Gi 31Gi 19Gi 100Mi 4.2Gi 22Gi Swap: 2.0Gi 25Mi 2.0Gi
No LSB modules are available.
%env
indicates a bash variable
without %env
is Python variable
# Set directories, input/output files
%env data_dir=/home/sam/data/S_namaycush/genomes
%env analysis_dir=/home/sam/analyses/20220818-snam-gff_to_bed-genes
analysis_dir="/home/sam/analyses/20220818-snam-gff_to_bed-genes"
# Input GFF (from NCBI)
%env orig_gff=GCF_016432855.1_SaNama_1.0_genomic.gff
%env orig_gff_gz=GCF_016432855.1_SaNama_1.0_genomic.gff.gz
%env orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/432/855/GCF_016432855.1_SaNama_1.0
# GTF extractor output
%env gtf_extractor_output=20220818-snam-GCF_016432855.1_SaNama_1.0_genes.bed
env: data_dir=/home/sam/data/S_namaycush/genomes env: analysis_dir=/home/sam/analyses/20220818-snam-gff_to_bed-genes env: orig_gff=GCF_016432855.1_SaNama_1.0_genomic.gff env: orig_gff_gz=GCF_016432855.1_SaNama_1.0_genomic.gff.gz env: orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/432/855/GCF_016432855.1_SaNama_1.0 env: gtf_extractor_output=20220818-snam-GCF_016432855.1_SaNama_1.0_genes.bed
%%bash
cd "${data_dir}"
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading fie if it's already been downloaded.
wget --quiet \
--continue \
${orig_gff_url}/${orig_gff_gz}
# Unzip download GFF
gunzip "${orig_gff_gz}"
ls -ltrh "${orig_gff}"
-rw-rw-r-- 1 sam sam 373M Jan 13 2021 GCF_016432855.1_SaNama_1.0_genomic.gff
gzip: GCF_016432855.1_SaNama_1.0_genomic.gff already exists; not overwritten
%%bash
head -n 20 "${data_dir}"/"${orig_gff}"
##gff-version 3 #!gff-spec-version 1.21 #!processor NCBI annotwriter #!genome-build SaNama_1.0 #!genome-build-accession NCBI_Assembly:GCF_016432855.1 #!annotation-source NCBI Salvelinus namaycush Annotation Release 100 ##sequence-region NC_052307.1 1 84126519 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=8040 NC_052307.1 RefSeq region 1 84126519 . + . ID=NC_052307.1:1..84126519;Dbxref=taxon:8040;Name=1;chromosome=1;gbkey=Src;genome=chromosome;isolate=Seneca;mol_type=genomic DNA;sex=female;tissue-type=white muscle NC_052307.1 Gnomon gene 13938 48855 . + . ID=gene-LOC120017344;Dbxref=GeneID:120017344;Name=LOC120017344;gbkey=Gene;gene=LOC120017344;gene_biotype=protein_coding NC_052307.1 Gnomon mRNA 13938 48855 . + . ID=rna-XM_038988747.1;Parent=gene-LOC120017344;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;Name=XM_038988747.1;gbkey=mRNA;gene=LOC120017344;model_evidence=Supporting evidence includes similarity to: 6 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 53 samples with support for all annotated introns;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 13938 14098 . + . ID=exon-XM_038988747.1-1;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 15382 15453 . + . ID=exon-XM_038988747.1-2;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 15794 15984 . + . ID=exon-XM_038988747.1-3;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 20383 20506 . + . ID=exon-XM_038988747.1-4;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 22557 22678 . + . ID=exon-XM_038988747.1-5;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 24602 24719 . + . ID=exon-XM_038988747.1-6;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 27866 27930 . + . ID=exon-XM_038988747.1-7;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 28019 28097 . + . ID=exon-XM_038988747.1-8;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1 NC_052307.1 Gnomon exon 30973 31119 . + . ID=exon-XM_038988747.1-9;Parent=rna-XM_038988747.1;Dbxref=GeneID:120017344,Genbank:XM_038988747.1;gbkey=mRNA;gene=LOC120017344;product=threonine--tRNA ligase 1%2C cytoplasmic-like;transcript_id=XM_038988747.1
%%bash
# Make analysis directory, if it doesn't exist
mkdir --parents "${analysis_dir}"
# Extract just gene features
# Extract chrom,start,end,gene=,and strand fields
# "gene=" is the NCBI gene name, in this particular instance
# Specify input as GFF
# Use awk to to insert a "score" column before the strand column ($5)
# and fill new "score" column with arbitrary value of "0"
time \
gtf_extract \
--feature gene \
--fields=chrom,start,end,ID,strand \
--gff ${data_dir}/${orig_gff} \
| awk 'BEGIN{FS=OFS="\t"}{$5 = 0 OFS $5}1' \
> ${analysis_dir}/${gtf_extractor_output}
real 1m12.914s user 1m2.161s sys 0m5.052s
%%bash
cd "${analysis_dir}"
ls -ltrh ${gtf_extractor_output}
echo ""
head ${gtf_extractor_output}
-rw-rw-r-- 1 sam sam 2.3M Aug 18 07:36 20220818-snam-GCF_016432855.1_SaNama_1.0_genes.bed NC_052307.1 13938 48855 gene-LOC120017344 0 + NC_052307.1 243315 251581 gene-LOC120053455 0 + NC_052307.1 265811 273478 gene-LOC120050024 0 + NC_052307.1 289722 315386 gene-LOC120050032 0 - NC_052307.1 428889 450578 gene-LOC120049777 0 - NC_052307.1 606253 621424 gene-LOC120058477 0 + NC_052307.1 629244 678079 gene-pdzd2 0 + NC_052307.1 684892 686121 gene-pmaip1 0 + NC_052307.1 741348 859581 gene-LOC120053472 0 + NC_052307.1 1052957 1054465 gene-LOC120050046 0 +
%%bash
# Count gene features via GFFutils
echo "GFFutils number of extracted genes:"
gtf_extract --feature gene --fields=ID --gff ${data_dir}/${orig_gff} | wc -l
echo ""
# Count gene features via awk
echo "awk number of extracted genes:"
awk '$3 == "gene" { print $0 }' ${data_dir}/${orig_gff} | wc -l
GFFutils number of extracted genes: 46359 awk number of extracted genes: 46359
%%bash
cd "${analysis_dir}"
for file in *
do
md5sum "${file}" | tee --append checksums.md5
done
440d09ac4bd225a6585d69ef623fd812 20220818-snam-GCF_016432855.1_SaNama_1.0_genes.bed