#!/usr/bin/env python
# coding: utf-8

# ## Create _C.virginica_ gene BED file
# 
# ### Resulting gene BED file will be used for [_C.virginica_ RNAseq/DML sex/OA project](https://github.com/epigeneticstoocean/2018_L18-adult-methylation) (GitHub repo)
# 
# This notebook requires [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to be installed and available in your `$PATH`.

# ### List computer specs

# In[1]:


# Bash cell that prints the run date, OS release, hostname, CPU details
# (lscpu) and memory details (free) for the machine running this notebook.
get_ipython().run_cell_magic(
    "bash",
    "",
    (
        'echo "TODAY\'S DATE:"\n'
        "date\n"
        'echo "------------"\n'
        'echo ""\n'
        "#Display operating system info\n"
        "lsb_release -a\n"
        'echo ""\n'
        'echo "------------"\n'
        'echo "HOSTNAME: "; hostname \n'
        'echo ""\n'
        'echo "------------"\n'
        'echo "Computer Specs:"\n'
        'echo ""\n'
        "lscpu\n"
        'echo ""\n'
        'echo "------------"\n'
        'echo ""\n'
        'echo "Memory Specs"\n'
        'echo ""\n'
        "free -mh\n"
    ),
)


# ### Set variables
# - `%env` sets an environment variable that the `%%bash` cells can read
# 
# - an assignment without `%env` creates a Python variable

# In[1]:


# Python-side copy of the analysis directory; the same path is also exported
# to the environment below for use by the bash cells.
analysis_dir = "/home/sam/analyses/20211209_cvir_gff-to-bed"

# (name, value) pairs exported with the %env magic, in the order they are set.
_bash_env = (
    # Directories
    ("data_dir", "/home/sam/data/C_virginica/igv_tracks"),
    ("analysis_dir", analysis_dir),
    # Input GFF (from NCBI)
    ("orig_gff", "GCF_002022765.2_C_virginica-3.0_genomic.gff"),
    ("orig_gff_gz", "GCF_002022765.2_C_virginica-3.0_genomic.gff.gz"),
    (
        "orig_gff_url",
        "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0",
    ),
    # GTF extractor output
    ("gtf_extractor_output", "20211209_cvir_GCF_002022765.2_genes.bed"),
)

for _env_name, _env_value in _bash_env:
    get_ipython().run_line_magic("env", _env_name + "=" + _env_value)


# ### Download GFF

# In[3]:


# Bash cell: download the compressed GFF from ${orig_gff_url} into ${data_dir}
# and decompress it. Safe to re-run: the download/decompress step is skipped
# when the uncompressed GFF already exists.
get_ipython().run_cell_magic(
    "bash",
    "",
    (
        "# Stop at the first failing command so a failed cd or download is not silently ignored\n"
        "set -e\n"
        "\n"
        'cd "${data_dir}"\n'
        "\n"
        "# Only download/decompress if the uncompressed GFF is not already present.\n"
        "# gunzip removes the .gz file, so without this check every re-run would\n"
        "# download the file again and gunzip would refuse to overwrite the existing GFF.\n"
        'if [[ ! -f "${orig_gff}" ]]; then\n'
        "  # Download with wget.\n"
        "  # Use --quiet option to prevent wget output from printing too many lines to notebook\n"
        "  # Use --continue to resume a partially downloaded file instead of starting over.\n"
        "  wget --quiet \\\n"
        "  --continue \\\n"
        '  "${orig_gff_url}/${orig_gff_gz}"\n'
        "\n"
        "  # Unzip downloaded GFF\n"
        '  gunzip "${orig_gff_gz}"\n'
        "fi\n"
        "\n"
        'ls -ltrh "${orig_gff}"\n'
    ),
)


# ### Examine GFF

# In[4]:


# Bash cell: print the first 20 lines of the downloaded GFF for a quick look.
get_ipython().run_cell_magic(
    "bash",
    "",
    'head -n 20 "${data_dir}"/"${orig_gff}"\n',
)


# ### Use [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to extract gene features

# In[5]:


# Bash cell: extract gene features from the GFF with gtf_extract and write a
# 6-column BED file (chrom, start, end, ID, score, strand) to the analysis
# directory.
get_ipython().run_cell_magic(
    "bash",
    "",
    (
        "# Stop on errors and report a gtf_extract failure instead of only the\n"
        "# exit status of the last pipeline stage\n"
        "set -eo pipefail\n"
        "\n"
        "# Make analysis directory, if it doesn't exist\n"
        'mkdir --parents "${analysis_dir}"\n'
        "\n"
        "# Extract just gene features\n"
        "# Extract chrom,start,end,ID,and strand fields\n"
        '# "ID" is the GFF ID attribute of each gene feature\n'
        "# Specify input as GFF\n"
        "# Use awk to:\n"
        "# - subtract 1 from the start column ($2): GFF starts are 1-based,\n"
        "#   BED starts are 0-based (BED end is exclusive, so end is unchanged)\n"
        '# - insert a "score" column before the strand column ($5)\n'
        '#   and fill new "score" column with arbitrary value of "0"\n'
        "time \\\n"
        "gtf_extract \\\n"
        "--feature gene \\\n"
        "--fields=chrom,start,end,ID,strand \\\n"
        '--gff "${data_dir}/${orig_gff}" \\\n'
        "| awk 'BEGIN{FS=OFS=\"\\t\"}{$2 = $2 - 1; $5 = 0 OFS $5}1' \\\n"
        '> "${analysis_dir}/${gtf_extractor_output}"\n'
    ),
)


# #### Check results

# In[6]:


# Bash cell: list the generated BED file and print its first lines.
get_ipython().run_cell_magic(
    "bash",
    "",
    (
        'cd "${analysis_dir}"\n'
        "ls -ltrh ${gtf_extractor_output}\n"
        "\n"
        'echo ""\n'
        "\n"
        "head ${gtf_extractor_output}\n"
    ),
)


# #### Confirm that [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) output seems okay

# In[7]:


# Bash cell: sanity check that compares the number of gene features reported
# by gtf_extract with an independent count made by awk on the same GFF.
get_ipython().run_cell_magic(
    "bash",
    "",
    (
        "# Count gene features via GFFutils\n"
        'echo "GFFutils number of extracted genes:"\n'
        'gtf_extract --feature gene --fields=ID --gff "${data_dir}/${orig_gff}" | wc -l\n'
        "\n"
        'echo ""\n'
        "\n"
        "# Count gene features via awk\n"
        "# Split on tabs only: GFF is tab-delimited, and splitting on any whitespace\n"
        "# would shift the feature column ($3) if an earlier column contains a space\n"
        'echo "awk number of extracted genes:"\n'
        "awk 'BEGIN{FS=\"\\t\"} $3 == \"gene\" { print $0 }' \"${data_dir}/${orig_gff}\" | wc -l\n"
    ),
)


# ### Generate checksums

# In[8]:


# Bash cell: write MD5 checksums for every regular file in the analysis
# directory to checksums.md5 (also echoed to the notebook via tee).
get_ipython().run_cell_magic(
    "bash",
    "",
    (
        "# Do not checksum files in the wrong directory if cd fails\n"
        'cd "${analysis_dir}" || exit 1\n'
        "\n"
        "# Start from an empty checksum file so re-running this cell\n"
        "# does not append duplicate or stale entries\n"
        ": > checksums.md5\n"
        "\n"
        "for file in *\n"
        "do\n"
        "  # Skip the checksum file itself and anything that is not a regular file\n"
        '  if [[ "${file}" == "checksums.md5" || ! -f "${file}" ]]; then\n'
        "    continue\n"
        "  fi\n"
        '  md5sum "${file}" | tee --append checksums.md5\n'
        "done\n"
    ),
)