#!/usr/bin/env python # coding: utf-8 # ## Create _C.virginia_ gene BED file # # ### Resulting gene BED file will be used for [_C.virginica_ RNAseq/DML sex/OA project](https://github.com/epigeneticstoocean/2018_L18-adult-methylation) (GitHub repo) # # This notebook relies on [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to be installed and available in your `$PATH`. # ### List computer specs # In[1]: get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n') # ### Set variables # - `%env` indicates a bash variable # # - without `%env` is Python variable # In[1]: # Set directories, input/output files get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/C_virginica/igv_tracks') get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20211209_cvir_gff-to-bed') analysis_dir="/home/sam/analyses/20211209_cvir_gff-to-bed" # Input GFF (from NCBI) get_ipython().run_line_magic('env', 'orig_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff') get_ipython().run_line_magic('env', 'orig_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz') get_ipython().run_line_magic('env', 'orig_gff_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0') # GTF extractor output get_ipython().run_line_magic('env', 'gtf_extractor_output=20211209_cvir_GCF_002022765.2_genes.bed') # ### Download GFF # In[3]: get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Download with wget.\n# Use --quiet option to prevent wget output from printing too many lines to notebook\n# Use --continue to prevent re-downloading fie if it\'s already been downloaded.\nwget --quiet \\\n--continue \\\n${orig_gff_url}/${orig_gff_gz}\n\n# Unzip download GFF\ngunzip "${orig_gff_gz}"\n\nls -ltrh "${orig_gff}"\n') # ### Examine GFF # In[4]: get_ipython().run_cell_magic('bash', '', 'head -n 20 "${data_dir}"/"${orig_gff}"\n') # ### Use [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to extract gene features # In[5]: get_ipython().run_cell_magic('bash', '', '# Make analysis directory, if it doesn\'t exist\nmkdir --parents "${analysis_dir}"\n\n# Extract just gene features\n# Extract chrom,start,end,gene=,and strand fields\n# "gene=" is the NCBI gene name, in this particular instance\n# Specify input as GFF\n# Use awk to to insert a "score" column before the strand column ($5)\n# and fill new "score" column with arbitrary value of "0"\ntime \\\ngtf_extract \\\n--feature gene \\\n--fields=chrom,start,end,ID,strand \\\n--gff ${data_dir}/${orig_gff} \\\n| awk \'BEGIN{FS=OFS="\\t"}{$5 = 0 OFS $5}1\' \\\n> ${analysis_dir}/${gtf_extractor_output}\n') # #### Check results # In[6]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh ${gtf_extractor_output}\n\necho ""\n\nhead ${gtf_extractor_output}\n') # #### Confirm that [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) output seem okay # In[7]: get_ipython().run_cell_magic('bash', '', '# Count gene features via GFFutils\necho "GFFutils number of extracted genes:"\ngtf_extract --feature gene --fields=ID --gff ${data_dir}/${orig_gff} | wc -l\n\necho ""\n\n# Count gene features via awk\necho "awk number of extracted genes:"\nawk \'$3 == "gene" { print $0 }\' ${data_dir}/${orig_gff} | wc -l\n') # ### Generate checksums # In[8]: get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor file in *\ndo\n md5sum "${file}" | tee --append checksums.md5\ndone\n')