This notebook relies on gffread.
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Wed Jul 26 07:27:52 PDT 2023 ------------
No LSB modules are available.
Distributor ID: Ubuntu Description: Ubuntu 18.04.6 LTS Release: 18.04 Codename: bionic ------------ HOSTNAME: raven ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 48 On-line CPU(s) list: 0-47 Thread(s) per core: 2 Core(s) per socket: 24 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 85 Model name: Intel(R) Xeon(R) Gold 5220R CPU @ 2.20GHz Stepping: 7 CPU MHz: 1000.168 CPU max MHz: 4000.0000 CPU min MHz: 1000.0000 BogoMIPS: 4400.00 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 1024K L3 cache: 36608K NUMA node0 CPU(s): 0-47 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm mpx rdt_a avx512f avx512dq rdseed adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req pku ospke avx512_vnni md_clear flush_l1d arch_capabilities ------------ Memory Specs total used free shared buff/cache available Mem: 247G 2.0G 202G 940K 42G 243G Swap: 99G 178M 99G
%env
indicates a bash variable
without %env
is Python variable
# Set directories, input/output files
%env data_dir=/home/shared/8TB_HDD_01/sam/data/C_virginica/genomes
%env analysis_dir=/home/shared/8TB_HDD_01/sam/analyses/20230726-cvir-genes_bed-to-fasta
analysis_dir="20230726-cvir-genes_bed-to-fasta"
# Base paths and/or URLs for rsyncing/dowloaoding files
%env genes_bed_url=https://eagle.fish.washington.edu/Cvirg_tracks
%env ncbi_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0
# Input filenames
%env genes_bed=C_virginica-3.0_Gnomon_genes.bed
%env ncbi_fasta_gz=GCF_002022765.2_C_virginica-3.0_genomic.fna.gz
%env ncbi_fasta=GCF_002022765.2_C_virginica-3.0_genomic.fna
%env genes_bed_md5=c8f203de591c0608b96f4299c0f847dc
# Output
%env genes_fasta=GCF_002022765.2_C_virginica-3.0-genes.fasta
# Program(s)
%env gffread=/home/shared/gffread-0.12.7.Linux_x86_64/gffread
%env samtools=/home/shared/samtools-1.12/samtools
env: data_dir=/home/shared/8TB_HDD_01/sam/data/C_virginica/genomes env: analysis_dir=/home/shared/8TB_HDD_01/sam/analyses/20230726-cvir-genes_bed-to-fasta env: genes_bed_url=https://eagle.fish.washington.edu/Cvirg_tracks env: ncbi_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0 env: genes_bed=C_virginica-3.0_Gnomon_genes.bed env: ncbi_fasta_gz=GCF_002022765.2_C_virginica-3.0_genomic.fna.gz env: ncbi_fasta=GCF_002022765.2_C_virginica-3.0_genomic.fna env: genes_bed_md5=c8f203de591c0608b96f4299c0f847dc env: genes_fasta=GCF_002022765.2_C_virginica-3.0-genes.fasta env: gffread=/home/shared/gffread-0.12.7.Linux_x86_64/gffread env: samtools=/home/shared/samtools-1.12/samtools
%%bash
# Make analysis directory, if it doesn't exist
mkdir --parents "${analysis_dir}"
%%bash
cd "${analysis_dir}"
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading fie if it's already been downloaded.
wget --quiet \
--continue \
${ncbi_url}/${ncbi_fasta_gz}
wget --quiet \
--continue \
${genes_bed_url}/${genes_bed}
# Unzip downloaded GFF, if it exists
if [[ -f "${ncbi_fasta_gz}" ]]; then
gunzip "${ncbi_fasta_gz}"
fi
ls -ltrh
total 664M -rw-rw-r-- 1 sam sam 662M Dec 10 2019 GCF_002022765.2_C_virginica-3.0_genomic.fna -rw-rw-r-- 1 sam sam 2.0M Dec 10 2021 C_virginica-3.0_Gnomon_genes.bed
%%bash
cd "${analysis_dir}"
head -n 10 "${genes_bed}"
echo ""
echo "------------------------------------------------------"
echo ""
wc -l "${genes_bed}"
NC_035780.1 13578 14594 gene-LOC111116054 0 + NC_035780.1 28961 33324 gene-LOC111126949 0 + NC_035780.1 43111 66897 gene-LOC111110729 0 - NC_035780.1 85606 95254 gene-LOC111112434 0 - NC_035780.1 99840 106460 gene-LOC111120752 0 + NC_035780.1 108305 110077 gene-LOC111128944 0 - NC_035780.1 151859 157536 gene-LOC111128953 0 + NC_035780.1 163809 183798 gene-LOC111105691 0 - NC_035780.1 164820 166793 gene-LOC111105685 0 + NC_035780.1 169468 170178 gene-LOC111105702 0 - ------------------------------------------------------ 38838 C_virginica-3.0_Gnomon_genes.bed
%%bash
cd "${analysis_dir}"
${samtools} faidx "${ncbi_fasta}"
ls -ltrh
total 664M -rw-rw-r-- 1 sam sam 662M Dec 10 2019 GCF_002022765.2_C_virginica-3.0_genomic.fna -rw-rw-r-- 1 sam sam 2.0M Dec 10 2021 C_virginica-3.0_Gnomon_genes.bed -rw-rw-r-- 1 sam sam 398 Jul 26 07:28 GCF_002022765.2_C_virginica-3.0_genomic.fna.fai
%%bash
cd "${analysis_dir}"
${gffread} \
-w ${genes_fasta} \
-g ${data_dir}/${ncbi_fasta} \
${data_dir}/${genes_bed}
ls -ltrh
total 1.1G -rw-rw-r-- 1 sam sam 662M Dec 10 2019 GCF_002022765.2_C_virginica-3.0_genomic.fna -rw-rw-r-- 1 sam sam 2.0M Dec 10 2021 C_virginica-3.0_Gnomon_genes.bed -rw-rw-r-- 1 sam sam 398 Jul 26 07:28 GCF_002022765.2_C_virginica-3.0_genomic.fna.fai -rw-rw-r-- 1 sam sam 408M Jul 26 07:28 GCF_002022765.2_C_virginica-3.0-genes.fasta
%%bash
cd "${analysis_dir}"
grep "^>" ${genes_fasta} | head -n 10
>gene-LOC111116054 >gene-LOC111126949 >gene-LOC111110729 >gene-LOC111112434 >gene-LOC111120752 >gene-LOC111128944 >gene-LOC111128953 >gene-LOC111105691 >gene-LOC111105685 >gene-LOC111105702
%%bash
cd "${analysis_dir}"
wc -l ${data_dir}/${genes_bed}
echo ""
echo ""
grep --count "^>" ${genes_fasta}
38838 /home/shared/8TB_HDD_01/sam/data/C_virginica/genomes/C_virginica-3.0_Gnomon_genes.bed 38838
%%bash
cd "${analysis_dir}"
${samtools} faidx ${genes_fasta}
ls -ltrh
total 1.1G -rw-rw-r-- 1 sam sam 662M Dec 10 2019 GCF_002022765.2_C_virginica-3.0_genomic.fna -rw-rw-r-- 1 sam sam 2.0M Dec 10 2021 C_virginica-3.0_Gnomon_genes.bed -rw-rw-r-- 1 sam sam 398 Jul 26 07:28 GCF_002022765.2_C_virginica-3.0_genomic.fna.fai -rw-rw-r-- 1 sam sam 408M Jul 26 07:28 GCF_002022765.2_C_virginica-3.0-genes.fasta -rw-rw-r-- 1 sam sam 1.5M Jul 26 07:30 GCF_002022765.2_C_virginica-3.0-genes.fasta.fai
%%bash
cd "${analysis_dir}"
for file in *
do
md5sum "${file}" | tee --append checksums.md5
done
c8f203de591c0608b96f4299c0f847dc C_virginica-3.0_Gnomon_genes.bed a0546fd42642673d80b3071089a6711b GCF_002022765.2_C_virginica-3.0-genes.fasta e69ecc217c2e695a6dab7e599984d592 GCF_002022765.2_C_virginica-3.0-genes.fasta.fai f9135e323583dc77fc726e9df2677a32 GCF_002022765.2_C_virginica-3.0_genomic.fna e564100fa0aba3d09f2de8d70f531b2c GCF_002022765.2_C_virginica-3.0_genomic.fna.fai
%%bash
cd "${analysis_dir}"
rm ${ncbi_fasta} ${genes_bed} ${ncbi_fasta}.fai
ls -ltrh
total 409M -rw-rw-r-- 1 sam sam 408M Jul 26 07:28 GCF_002022765.2_C_virginica-3.0-genes.fasta -rw-rw-r-- 1 sam sam 1.5M Jul 26 07:30 GCF_002022765.2_C_virginica-3.0-genes.fasta.fai -rw-rw-r-- 1 sam sam 387 Jul 26 07:30 checksums.md5