This notebook relies on GFFutils to be installed and available in your $PATH
.
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Mon Sep 26 01:12:01 PM PDT 2022 ------------ Distributor ID: Ubuntu Description: Ubuntu 22.04.1 LTS Release: 22.04 Codename: jammy ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Address sizes: 45 bits physical, 48 bits virtual Byte Order: Little Endian CPU(s): 8 On-line CPU(s) list: 0-7 Vendor ID: GenuineIntel Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz CPU family: 6 Model: 165 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 8 Stepping: 2 BogoMIPS: 4800.01 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat flush_l1d arch_capabilities Hypervisor vendor: VMware Virtualization type: full L1d cache: 256 KiB (8 instances) L1i cache: 256 KiB (8 instances) L2 cache: 2 MiB (8 instances) L3 cache: 128 MiB (8 instances) NUMA node(s): 1 NUMA node0 CPU(s): 0-7 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Mitigation; PTE Inversion Vulnerability Mds: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Meltdown: Mitigation; PTI Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Retbleed: Mitigation; IBRS Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; IBRS, IBPB conditional, RSB filling Vulnerability Srbds: Unknown: Dependent on hypervisor status Vulnerability Tsx async abort: Not affected ------------ Memory Specs total used free shared buff/cache available Mem: 54Gi 6.4Gi 43Gi 201Mi 5.2Gi 47Gi Swap: 2.0Gi 0B 2.0Gi
No LSB modules are available.
%env
indicates a bash variable
without %env
is Python variable
# Set directories, input/output files
%env data_dir=/home/sam/data/C_virginica/igv_tracks
%env analysis_dir=/home/sam/analyses/20220926-cvir-gff-to-bed-genes_and_pseudogenes
analysis_dir="20220926-cvir-gff-to-bed-genes_and_pseudogenes"
# Input GFF (from NCBI)
%env orig_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff
%env orig_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz
%env orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0
# GTF extractor output
%env gtf_extractor_output_genes=20220926-cvir-GCF_002022765.2-genes.bed
%env gtf_extractor_output_pseudogenes=20220926-cvir-GCF_002022765.2-pseudogenes.bed
%env genes_and_psuedogenes=20220926-cvir-GCF_002022765.2-genes-and-pseudogenes.bed
env: data_dir=/home/sam/data/C_virginica/igv_tracks env: analysis_dir=/home/sam/analyses/20220926-cvir-gff-to-bed-genes_and_pseudogenes env: orig_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff env: orig_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz env: orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0 env: gtf_extractor_output_genes=20220926-cvir-GCF_002022765.2-genes.bed env: gtf_extractor_output_pseudogenes=20220926-cvir-GCF_002022765.2-pseudogenes.bed env: genes_and_psuedogenes=20220926-cvir-GCF_002022765.2-genes-and-pseudogenes.bed
%%bash
cd "${data_dir}"
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading fie if it's already been downloaded.
wget --quiet \
--continue \
${orig_gff_url}/${orig_gff_gz}
# Unzip downloaded GFF, if it exists
if [[ -f "${orig_gff_gz}" ]]; then
gunzip "${orig_gff_gz}"
fi
ls -ltrh "${orig_gff}"
-rw-rw-r-- 1 sam sam 412M Dec 10 2019 GCF_002022765.2_C_virginica-3.0_genomic.gff
gzip: GCF_002022765.2_C_virginica-3.0_genomic.gff already exists; not overwritten
%%bash
head -n 20 "${data_dir}"/"${orig_gff}"
##gff-version 3 #!gff-spec-version 1.21 #!processor NCBI annotwriter #!genome-build C_virginica-3.0 #!genome-build-accession NCBI_Assembly:GCF_002022765.2 #!annotation-source NCBI Crassostrea virginica Annotation Release 100 ##sequence-region NC_035780.1 1 65668440 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6565 NC_035780.1 RefSeq region 1 65668440 . + . ID=NC_035780.1:1..65668440;Dbxref=taxon:6565;Name=1;chromosome=1;collection-date=22-Mar-2015;country=USA;gbkey=Src;genome=chromosome;isolate=RU13XGHG1-28;isolation-source=Rutgers Haskin Shellfish Research Laboratory inbred lines (NJ);mol_type=genomic DNA;tissue-type=whole sample NC_035780.1 Gnomon gene 13578 14594 . + . ID=gene-LOC111116054;Dbxref=GeneID:111116054;Name=LOC111116054;gbkey=Gene;gene=LOC111116054;gene_biotype=lncRNA NC_035780.1 Gnomon lnc_RNA 13578 14594 . + . ID=rna-XR_002636969.1;Parent=gene-LOC111116054;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;Name=XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;model_evidence=Supporting evidence includes similarity to: 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 1 sample with support for all annotated introns;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon exon 13578 13603 . + . ID=exon-XR_002636969.1-1;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon exon 14237 14290 . + . ID=exon-XR_002636969.1-2;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon exon 14557 14594 . + . ID=exon-XR_002636969.1-3;Parent=rna-XR_002636969.1;Dbxref=GeneID:111116054,Genbank:XR_002636969.1;gbkey=ncRNA;gene=LOC111116054;product=uncharacterized LOC111116054;transcript_id=XR_002636969.1 NC_035780.1 Gnomon gene 28961 33324 . + . ID=gene-LOC111126949;Dbxref=GeneID:111126949;Name=LOC111126949;gbkey=Gene;gene=LOC111126949;gene_biotype=protein_coding NC_035780.1 Gnomon mRNA 28961 33324 . + . ID=rna-XM_022471938.1;Parent=gene-LOC111126949;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;Name=XM_022471938.1;gbkey=mRNA;gene=LOC111126949;model_evidence=Supporting evidence includes similarity to: 3 Proteins%2C and 100%25 coverage of the annotated genomic feature by RNAseq alignments%2C including 21 samples with support for all annotated introns;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 28961 29073 . + . ID=exon-XM_022471938.1-1;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 30524 31557 . + . ID=exon-XM_022471938.1-2;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 31736 31887 . + . ID=exon-XM_022471938.1-3;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1 NC_035780.1 Gnomon exon 31977 32565 . + . ID=exon-XM_022471938.1-4;Parent=rna-XM_022471938.1;Dbxref=GeneID:111126949,Genbank:XM_022471938.1;gbkey=mRNA;gene=LOC111126949;product=UNC5C-like protein;transcript_id=XM_022471938.1
%%bash
# Make analysis directory, if it doesn't exist
mkdir --parents "${analysis_dir}"
# Extract gene or pseudogene features
# Extract chrom,start,end,gene=,and strand fields
# "gene=" is the NCBI gene name, in this particular instance
# Specify input as GFF
# Use awk to to insert a "score" column before the strand column ($5)
# and fill new "score" column with arbitrary value of "0"
time \
gtf_extract \
--feature gene \
--fields=chrom,start,end,ID,strand \
--gff ${data_dir}/${orig_gff} \
| awk 'BEGIN{FS=OFS="\t"}{$5 = 0 OFS $5}1' \
> ${analysis_dir}/${gtf_extractor_output_genes}
time \
gtf_extract \
--feature pseudogene \
--fields=chrom,start,end,ID,strand \
--gff ${data_dir}/${orig_gff} \
| awk 'BEGIN{FS=OFS="\t"}{$5 = 0 OFS $5}1' \
> ${analysis_dir}/${gtf_extractor_output_pseudogenes}
real 0m59.836s user 0m59.627s sys 0m0.208s real 0m59.366s user 0m59.243s sys 0m0.112s
%%bash
cd "${analysis_dir}"
ls -ltrh *.bed
echo ""
head *.bed
-rw-rw-r-- 1 sam sam 2.0M Sep 26 13:13 20220926-cvir-GCF_002022765.2-genes.bed -rw-rw-r-- 1 sam sam 34K Sep 26 13:14 20220926-cvir-GCF_002022765.2-pseudogenes.bed ==> 20220926-cvir-GCF_002022765.2-genes.bed <== NC_035780.1 13578 14594 gene-LOC111116054 0 + NC_035780.1 28961 33324 gene-LOC111126949 0 + NC_035780.1 43111 66897 gene-LOC111110729 0 - NC_035780.1 85606 95254 gene-LOC111112434 0 - NC_035780.1 99840 106460 gene-LOC111120752 0 + NC_035780.1 108305 110077 gene-LOC111128944 0 - NC_035780.1 151859 157536 gene-LOC111128953 0 + NC_035780.1 163809 183798 gene-LOC111105691 0 - NC_035780.1 164820 166793 gene-LOC111105685 0 + NC_035780.1 169468 170178 gene-LOC111105702 0 - ==> 20220926-cvir-GCF_002022765.2-pseudogenes.bed <== NC_035780.1 2215606 2223571 gene-LOC111129349 0 - NC_035780.1 2997762 3000443 gene-LOC111101388 0 + NC_035780.1 3070311 3078520 gene-LOC111129757 0 + NC_035780.1 3376106 3379058 gene-LOC111129887 0 - NC_035780.1 3379153 3381662 gene-LOC111129920 0 - NC_035780.1 3850672 3852462 gene-LOC111125334 0 + NC_035780.1 4397181 4401310 gene-LOC111130254 0 - NC_035780.1 6721106 6745518 gene-LOC111130754 0 + NC_035780.1 7313912 7327414 gene-LOC111128791 0 + NC_035780.1 8003747 8006376 gene-LOC111120605 0 -
%%bash
# Count gene features via GFFutils
echo "GFFutils number of extracted genes:"
gtf_extract --feature gene --fields=ID --gff ${data_dir}/${orig_gff} | wc -l
echo ""
# Count gene features via awk
echo "awk number of extracted genes:"
awk '$3 == "gene" { print $0 }' ${data_dir}/${orig_gff} | wc -l
GFFutils number of extracted genes: 38838 awk number of extracted genes: 38838
%%bash
# Write to temp file
cat ${analysis_dir}/${gtf_extractor_output_genes} \
${analysis_dir}/${gtf_extractor_output_pseudogenes} \
> ${analysis_dir}/tmp.txt
# Sort the file by chromosome, then
sort --version-sort -k1,1 -k2,2 ${analysis_dir}/tmp.txt \
> ${analysis_dir}/${genes_and_psuedogenes}
#
wc -l ${analysis_dir}/${genes_and_psuedogenes}
# Check out combined file
head ${analysis_dir}/*genes-and-pseudogenes.bed
39505 /home/sam/analyses/20220926-cvir-gff-to-bed-genes_and_pseudogenes/20220926-cvir-GCF_002022765.2-genes-and-pseudogenes.bed NC_007175.2 1 1623 gene-COX1 0 + NC_007175.2 2558 3429 gene-COX3 0 + NC_007175.2 3647 4859 gene-CYTB 0 + NC_007175.2 4897 5589 gene-COX2 0 + NC_007175.2 9518 10192 gene-ATP6 0 + NC_007175.2 10295 11290 gene-ND2 0 + NC_007175.2 11515 12864 gene-ND4 0 + NC_007175.2 13126 14793 gene-ND5 0 + NC_007175.2 14807 15268 gene-ND6 0 + NC_007175.2 15352 15705 gene-ND3 0 +
%%bash
rm ${analysis_dir}/tmp.txt ${analysis_dir}/${gtf_extractor_output_genes} ${analysis_dir}/${gtf_extractor_output_pseudogenes}
ls -ltrh ${analysis_dir}
total 2.0M -rw-rw-r-- 1 sam sam 2.0M Sep 26 13:15 20220926-cvir-GCF_002022765.2-genes-and-pseudogenes.bed
%%bash
cd "${analysis_dir}"
for file in *
do
md5sum "${file}" | tee --append checksums.md5
done
101b49158ac0d7efe103670094118ecc 20220926-cvir-GCF_002022765.2-genes-and-pseudogenes.bed