#!/usr/bin/env python
# coding: utf-8

# ## Convert _P.meandrina_ GFF to GTF
# 
# _P.meandrina_ GFF (and genome files) were originally obtained from here: 
# 
# http://cyanophora.rutgers.edu/Pocillopora_meandrina/
# 
# #### Notebook relies on:
# 
# - [GffRead](https://github.com/gpertea/gffread)

# ### List computer specs

# In[1]:


# Record the run date, OS release, hostname, CPU and memory details of the
# machine executing this notebook. The bash script is assembled from one
# string literal per script line.
get_ipython().run_cell_magic(
    "bash",
    "",
    'echo "TODAY\'S DATE:"\n'
    'date\n'
    'echo "------------"\n'
    'echo ""\n'
    '#Display operating system info\n'
    'lsb_release -a\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "HOSTNAME: "; hostname \n'
    'echo ""\n'
    'echo "------------"\n'
    'echo "Computer Specs:"\n'
    'echo ""\n'
    'lscpu\n'
    'echo ""\n'
    'echo "------------"\n'
    'echo ""\n'
    'echo "Memory Specs"\n'
    'echo ""\n'
    'free -mh\n',
)


# ### Set variables
# - `%env` sets an environment variable that is visible to the bash cells
# 
# - an assignment without `%env` sets a Python variable

# In[2]:


# Environment variables consumed by the bash cells in this notebook. Each
# entry is handed to the `%env` magic verbatim as NAME=VALUE, in order.
_env_assignments = (
    # Directories holding the input data and the analysis output
    "data_dir=/home/sam/data/P_meandrina/genomes",
    "analysis_dir=/home/sam/analyses/20230519-pmea-gff_to_gtf",
    # Input GFF3 file name (copied from the `url` location by the download cell)
    "gff=Pocillopora_meandrina_HIv1.genes.gff3",
    # Remote directory used as the rsync source for the input file
    "url=owl:/volume1/web/halfshell/genomic-databank",
    # Output GTF file name
    "gtf=Pocillopora_meandrina_HIv1.genes.gtf",
    # Program locations
    "gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread",
    # Separator line used to format printed output
    "break_line=--------------------------------------------------------------------------",
)
for _assignment in _env_assignments:
    get_ipython().run_line_magic("env", _assignment)

# Python-side variable: holds only the directory name, not the full path that
# the bash `analysis_dir` variable carries.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
analysis_dir = "20230519-pmea-gff_to_gtf"


# ### Create analysis directory

# In[3]:


# Create the analysis and data directories (including any missing parents).
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Make analysis and data directory, if doesn't exist
mkdir --parents "${analysis_dir}"

mkdir --parents "${data_dir}"
''',
)


# ### Download GFF

# In[4]:


# Copy the input GFF3 from the remote `url` location into the data directory,
# then list it so the file size and timestamp are recorded in the notebook.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Stop at the first failing command: a failed `cd` would otherwise leave the
# download in the wrong directory, and a failed `rsync` could be hidden by a
# stale copy of the file satisfying the `ls` below.
set -e

cd "${data_dir}"

rsync "${url}/${gff}" .

ls -ltrh "${gff}"
''',
)


# ### Examine GFF

# In[5]:


# Show the first 20 lines of the downloaded GFF.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''head -n 20 "${data_dir}"/"${gff}"
''',
)


# #### Count unique number of fields
# 
# This identifies if there are rows with >9 fields (which there shouldn't be in a [GFF3](http://gmod.org/wiki/GFF3)).

# In[6]:


# List every distinct tab-delimited field count found in the GFF, then print
# the first lines having each of those counts. The script is kept in a raw
# string so its backslashes (awk's "\t" and the line continuations) reach
# bash unchanged.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Capture number of fields (NF) in each row in array.
field_count_array=($(awk -F "\t" '{print NF}' "${data_dir}/${gff}" | sort --unique))

# Check array contents
echo "List of number of fields in ${data_dir}/${gff}:"
echo ""
for field_count in "${field_count_array[@]}"
do
  echo "${field_count}"
done

echo ""
echo "${break_line}"
echo ""

# Preview of each line "type" with a given number of fields
# Check array contents
echo ""
for field_count in "${field_count_array[@]}"
do
  echo "Preview of lines with ${field_count} field(s):"
  echo ""
  awk \
    -v field_count="${field_count}" \
    -F "\t" \
    'NF == field_count' \
    "${data_dir}/${gff}" \
    | head
  echo ""
  echo "${break_line}"
  echo ""
done
''',
)


# ### Convert GFF to GTF

# In[7]:


# Convert the GFF3 to GTF with GffRead. The GTF goes to the analysis directory
# via stdout; GffRead's stderr is saved alongside it for later inspection.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Abort if the data directory is missing instead of carrying on elsewhere.
cd "${data_dir}" || exit 1

# -E : report duplicate IDs and other potential problems in the input
# -T : write GTF instead of GFF3
# Every expansion is quoted so each path stays a single argument.
"${gffread}" -E \
  "${data_dir}/${gff}" -T \
  1> "${analysis_dir}/${gtf}" \
  2> "${analysis_dir}/gffread-gff_to_gtf.stderr"
''',
)


# ### Inspect GTF

# In[8]:


# Preview the first lines of the converted GTF.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Quote the whole path so it reaches `head` as a single argument.
head "${analysis_dir}/${gtf}"
''',
)


# ### Generate checksum(s)

# In[9]:


# Write an MD5 manifest (checksums.md5) for every regular file in the analysis
# directory, echoing each checksum line to the notebook as it is recorded.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Abort rather than checksum whatever directory the notebook started in.
cd "${analysis_dir}" || exit 1

# Rebuild the manifest from scratch. Removing it before the glob below is
# expanded keeps the manifest out of its own listing (its checksum could never
# verify) and prevents duplicate entries when the cell is re-run.
rm -f checksums.md5

for file in *
do
  # Skip directories, and the literal "*" left by an empty directory.
  [ -f "${file}" ] || continue
  md5sum "${file}" | tee --append checksums.md5
done
''',
)


# ### Document GffRead program options

# In[10]:


# Print GffRead's help text so the available options are recorded in the notebook.
# NOTE(review): if this GffRead build exits non-zero after printing its help,
# the cell will report an error even though the text was shown - confirm.
get_ipython().run_cell_magic(
    "bash",
    "",
    r'''# Quote the program path so it is not word-split.
"${gffread}" -h
''',
)