#!/usr/bin/env python
# coding: utf-8

# ## Validate and Convert _P.verrucosa_ GFF to GTF
#
# ### Notebook relies on:
#
# - [GffRead](https://github.com/gpertea/gffread)

# ### List computer specs

# In[1]:

get_ipython().run_cell_magic('bash', '', r'''
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
# Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
''')

# ### Set variables
# - `%env` indicates a bash variable
#
# - without `%env` is Python variable

# In[14]:

# Set directories, input/output files.
# %env sets os.environ, so these are inherited by every %%bash cell below.
get_ipython().run_line_magic('env', 'data_dir=/home/sam/data/M_capitata/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/sam/analyses/20230127-pver-gff_to_gtf')

# NOTE(review): this Python variable is never read by the bash cells (they see
# the %env value above, which is a different, absolute path) — confirm it is
# intentional before relying on it elsewhere.
analysis_dir = "20230127-pver-gff_to_gtf"

# Input files (from NCBI)
get_ipython().run_line_magic('env', 'gff=Pver_genome_assembly_v1.0.gff3')

# URL of file directory
get_ipython().run_line_magic('env', 'url=https://owl.fish.washington.edu/halfshell/genomic-databank')

# Output file(s)
get_ipython().run_line_magic('env', 'valid_gff=Pver_genome_assembly_v1.0-valid.gff3')
get_ipython().run_line_magic('env', 'gtf=Pver_genome_assembly_v1.0-valid.gtf')

# Set program locations
get_ipython().run_line_magic('env', 'gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread')

# Set some formatting stuff
get_ipython().run_line_magic('env', 'break_line=--------------------------------------------------------------------------')

# ### Create analysis directory

# In[3]:

get_ipython().run_cell_magic('bash', '', r'''
# Make analysis and data directory, if doesn't exist
mkdir --parents "${analysis_dir}"

mkdir --parents "${data_dir}"
''')

# ### Download GFF

# In[4]:

get_ipython().run_cell_magic('bash', '', r'''
cd "${data_dir}"

# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading file if it's already been downloaded.
# Use --no-check-certificate to avoid download error from gannet
wget --quiet \
--continue \
--no-check-certificate \
"${url}/${gff}"

ls -ltrh "${gff}"
''')

# ### Examine GFF

# #### Check first 20 lines

# In[5]:

get_ipython().run_cell_magic('bash', '', r'''
head -n 20 "${data_dir}"/"${gff}"
''')

# #### Count unique number of fields
#
# This identifies if there are rows with >9 fields (which there shouldn't be in a [GFF3](http://gmod.org/wiki/GFF3)).

# In[15]:

get_ipython().run_cell_magic('bash', '', r'''
# Capture number of fields (NF) in each row in array.
field_count_array=($(awk -F "\t" '{print NF}' "${data_dir}/${gff}" | sort --unique))

# Check array contents
echo "List of number of fields in ${data_dir}/${gff}:"
echo ""
for field_count in "${field_count_array[@]}"
do
  echo "${field_count}"
done

echo ""
echo "${break_line}"
echo ""

# Preview of each line "type" with a given number of fields
# Check array contents
echo ""
for field_count in "${field_count_array[@]}"
do
  echo "Preview of lines with ${field_count} field(s):"
  echo ""
  awk \
  -v field_count="${field_count}" \
  -F "\t" \
  'NF == field_count' \
  "${data_dir}/${gff}" \
  | head
  echo ""
  echo "${break_line}"
  echo ""
done
''')

# In the above results, we can see that there are rows with 10 and 11 fields, due to incorrect tabs entered in Field 9. This creates an invalid GFF3 file.

# ### Fix invalid tabs in Field 9
#
# This will remove tabs in Field 9 and join those extra fields into Field 9 using a semi-colon separator with the `Note=` attribute.

# In[19]:

get_ipython().run_cell_magic('bash', '', r'''
# Single awk pass instead of a bash while-read loop.
#
# The previous implementation read the GFF line-by-line in bash, spawning
# echo/awk/cut subprocesses for every row. Besides being orders of magnitude
# slower, `read -r` without `IFS=` strips leading/trailing whitespace —
# including tabs, the GFF field delimiter — which can silently corrupt rows.
# It also only handled rows with exactly 10 or 11 fields; anything with more
# fell through unchanged and remained invalid.
#
# This awk program handles any number of extra fields: rows with <= 9 fields
# pass through untouched; for rows with > 9 fields, every field past the 9th
# is folded into Field 9 as a ";Note=<field>" attribute.
time \
awk '
BEGIN { FS = OFS = "\t" }
NF <= 9 { print; next }
{
  out = $1
  for (i = 2; i <= 9; i++) out = out OFS $i
  for (i = 10; i <= NF; i++) out = out ";Note=" $i
  print out
}
' "${data_dir}/${gff}" > "${analysis_dir}/${valid_gff}"
''')

# #### Compare line numbers between the two GFFs to make sure we didn't lose anything

# In[20]:

get_ipython().run_cell_magic('bash', '', r'''
wc -l "${data_dir}/${gff}"
wc -l "${analysis_dir}/${valid_gff}"
''')

# #### Count fields to make sure no more rows with >9 fields

# In[21]:

get_ipython().run_cell_magic('bash', '', r'''
awk -F "\t" '{print NF}' "${analysis_dir}/${valid_gff}" | sort --unique
''')

# ### Convert GFF to GTF

# In[22]:

get_ipython().run_cell_magic('bash', '', r'''
cd "${data_dir}"

# -E: report potential issues with the input GFF; -T: output GTF format.
"${gffread}" -E \
"${analysis_dir}/${valid_gff}" -T \
1> "${analysis_dir}/${gtf}" \
2> "${analysis_dir}/gffread-gff_to_gtf.stderr"
''')

# ### Inspect GTF

# In[23]:

get_ipython().run_cell_magic('bash', '', r'''
head "${analysis_dir}/${gtf}"
''')

# ### Generate checksum(s)

# In[24]:

get_ipython().run_cell_magic('bash', '', r'''
cd "${analysis_dir}"

# Skip the checksum file itself so re-running this cell doesn't fold
# checksums.md5 into its own checksum list.
for file in *
do
  [ "${file}" = "checksums.md5" ] && continue
  md5sum "${file}" | tee --append checksums.md5
done
''')

# ### Document GffRead program options

# In[25]:

get_ipython().run_cell_magic('bash', '', r'''
"${gffread}" -h
''')