This notebook relies on GFFutils to be installed and available in your $PATH
.
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Thu 09 Dec 2021 01:47:25 PM PST ------------ Distributor ID: Ubuntu Description: Ubuntu 20.04.3 LTS Release: 20.04 Codename: focal ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 45 bits physical, 48 bits virtual CPU(s): 2 On-line CPU(s) list: 0,1 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 165 Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz Stepping: 2 CPU MHz: 2400.007 BogoMIPS: 4800.01 Hypervisor vendor: VMware Virtualization type: full L1d cache: 64 KiB L1i cache: 64 KiB L2 cache: 512 KiB L3 cache: 32 MiB NUMA node0 CPU(s): 0,1 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Not affected Vulnerability Mds: Not affected Vulnerability Meltdown: Not affected Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Full generic retpoline, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities ------------ Memory Specs total used free shared buff/cache available Mem: 54Gi 27Gi 20Gi 109Mi 5.9Gi 26Gi Swap: 2.0Gi 0B 2.0Gi
No LSB modules are available.
%env
indicates a bash variable
without %env
is Python variable
# Set directories, input/output files
%env data_dir=/home/sam/data/C_virginica/igv_tracks
%env analysis_dir=/home/sam/analyses/20211209_cvir_gff-to-bed
analysis_dir="/home/sam/analyses/20211209_cvir_gff-to-bed"
# Input GFF (from NCBI)
%env orig_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff
%env orig_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz
%env orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0
# GTF extractor output
%env gtf_extractor_output=20211209_cvir_GCF_002022765.2_genes.bed
env: data_dir=/home/sam/data/C_virginica/igv_tracks env: analysis_dir=/home/sam/analyses/20211209_cvir_gff-to-bed env: orig_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff env: orig_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz env: orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0 env: gtf_extractor_output=20211209_cvir_GCF_002022765.2_genes.bed
%%bash
cd "${data_dir}"
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading fie if it's already been downloaded.
wget --quiet \
--continue \
${orig_gff_url}/${orig_gff_gz}
# Unzip download GFF
gunzip "${orig_gff_gz}"
ls -ltrh "${orig_gff}"
-rw-rw-r-- 1 sam sam 412M Dec 10 2019 GCF_002022765.2_C_virginica-3.0_genomic.gff
%%bash
head -n 20 "${data_dir}"/"${orig_gff}"
##gff-version 3 #!gff-spec-version 1.21 #!processor NCBI annotwriter #!genome-build C_virginica-3.0 #!genome-build-accession NCBI_Assembly:GCF_002022765.2 #!annotation-source NCBI Crassostrea virginica Annotation Release 100 ##sequence-region NC_035780.1 1 65668440 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6565 NC_035780.1 RefSeq region 1 65668440 . + . ID=NC_035780.1:1..65668440;Dbxref=taxon:6565;Name=1;chromosome=1;collection-date=22-Mar-2015;country=USA;gbkey=Src;genome=chromosome;isolate=RU13XGHG1-28;isolation-source=Rutgers Haskin Shellfish Research Laboratory inbred lines (NJ);mol_type=genomic DNA;tissue-type=whole sample NC_035780.1 Gnomon gene 13578 14594 . + . ID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA NC_035780.1 Gnomon lnc_RNA 13578 14594 . + . ID=rna-XR_002636969.1;Parent=gene-LOC111116054;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;Name=XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 1 sample with support for all annotated introns;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon exon 13578 13603 . + . ID=exon-XR_002636969.1-1;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon exon 14237 14290 . + . ID=exon-XR_002636969.1-2;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon exon 14557 14594 . + . ID=exon-XR_002636969.1-3;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon gene 28961 33324 . + . ID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding NC_035780.1 Gnomon mRNA 28961 33324 . + . ID=rna-XM_022471938.1;Parent=gene-LOC111126949;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;Name=XM_022471938.1;gbkey=mRNA;gene=LOC111126949;model_evidence=Supporting evidence includes similarity to: 3 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 21 samples with support for all annotated introns;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 28961 29073 . + . ID=exon-XM_022471938.1-1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 30524 31557 . + . ID=exon-XM_022471938.1-2;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 31736 31887 . + . ID=exon-XM_022471938.1-3;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 31977 32565 . + . ID=exon-XM_022471938.1-4;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1
%%bash
# Make analysis directory, if it doesn't exist
mkdir --parents "${analysis_dir}"
# Extract just gene features
# Extract chrom,start,end,gene=,and strand fields
# "gene=" is the NCBI gene name, in this particular instance
# Specify input as GFF
# Use awk to to insert a "score" column before the strand column ($5)
# and fill new "score" column with arbitrary value of "0"
time \
gtf_extract \
--feature gene \
--fields=chrom,start,end,ID,strand \
--gff ${data_dir}/${orig_gff} \
| awk 'BEGIN{FS=OFS="\t"}{$5 = 0 OFS $5}1' \
> ${analysis_dir}/${gtf_extractor_output}
real 1m3.584s user 1m2.504s sys 0m0.333s
%%bash
cd "${analysis_dir}"
ls -ltrh ${gtf_extractor_output}
echo ""
head ${gtf_extractor_output}
-rw-rw-r-- 1 sam sam 2.0M Dec 10 07:02 20211209_cvir_GCF_002022765.2_genes.bed NC_035780.1 13578 14594 gene-LOC111116054 0 + NC_035780.1 28961 33324 gene-LOC111126949 0 + NC_035780.1 43111 66897 gene-LOC111110729 0 - NC_035780.1 85606 95254 gene-LOC111112434 0 - NC_035780.1 99840 106460 gene-LOC111120752 0 + NC_035780.1 108305 110077 gene-LOC111128944 0 - NC_035780.1 151859 157536 gene-LOC111128953 0 + NC_035780.1 163809 183798 gene-LOC111105691 0 - NC_035780.1 164820 166793 gene-LOC111105685 0 + NC_035780.1 169468 170178 gene-LOC111105702 0 -
%%bash
# Count gene features via GFFutils
echo "GFFutils number of extracted genes:"
gtf_extract --feature gene --fields=ID --gff ${data_dir}/${orig_gff} | wc -l
echo ""
# Count gene features via awk
echo "awk number of extracted genes:"
awk '$3 == "gene" { print $0 }' ${data_dir}/${orig_gff} | wc -l
GFFutils number of extracted genes: 38838 awk number of extracted genes: 38838
%%bash
cd "${analysis_dir}"
for file in *
do
md5sum "${file}" | tee --append checksums.md5
done
c8f203de591c0608b96f4299c0f847dc 20211209_cvir_GCF_002022765.2_genes.bed