#!/usr/bin/env python
# coding: utf-8

# ## Create _S.salar_ UniProt annotations file for Shelly
# 
# See this [GitHub Issue](https://github.com/RobertsLab/resources/issues/1220)
# 
# This notebook relies on [GFFutils](https://gffutils.readthedocs.io/en/v0.12.0/index.html) to be installed and available in your `$PATH`.

# ### List computer specs

# In[1]:


get_ipython().run_cell_magic('bash', '', 'echo "TODAY\'S DATE:"\ndate\necho "------------"\necho ""\n#Display operating system info\nlsb_release -a\necho ""\necho "------------"\necho "HOSTNAME: "; hostname \necho ""\necho "------------"\necho "Computer Specs:"\necho ""\nlscpu\necho ""\necho "------------"\necho ""\necho "Memory Specs"\necho ""\nfree -mh\n')


# ### Set variables
# - `%env` indicates a bash variable; without `%env` is Python variable

# In[1]:


# Set directories, input/output files
get_ipython().run_line_magic('env', 'data_dir=/home/samb/data/S_salar/genomes')
get_ipython().run_line_magic('env', 'analysis_dir=/home/samb/analyses/20210601_ssal_gff-annotations')
analysis_dir="/home/samb/analyses/20210601_ssal_gff-annotations"
# Input GFF
get_ipython().run_line_magic('env', 'orig_gff=GCF_000233375.1_ICSASG_v2_genomic.gff')
get_ipython().run_line_magic('env', 'orig_gff_url=https://gannet.fish.washington.edu/metacarcinus/Salmo_Calig/GENOMES/v2/RefSeq')

# UniProt batch output
get_ipython().run_line_magic('env', 'perl_output=20210601_ssal_uniprot_batch_results.txt')

# GTF extractor output
get_ipython().run_line_magic('env', 'gtf_extractor_output=20210601_ssal_chrom-start-end-Dbxref.csv')

# Gene name list for UniProt batch submission
get_ipython().run_line_magic('env', 'gene_list=20210601_ssal_gene-list.txt')

# Parsed UniProt
get_ipython().run_line_magic('env', 'parsed_uniprot=20210601_ssal_accession-gene_id-gene-gene_description-go_ids.csv')

# Final output
get_ipython().run_line_magic('env', 'joined_output=20210601_ssal_chrom-gene_id_start-end-acc-gene-gene_description-go_ids.csv')


# ### Download GFF

# In[3]:


get_ipython().run_cell_magic('bash', '', 'cd "${data_dir}"\n\n# Download with wget. Use --no-check-certificate to avoid issues with Gannet certificate\n# Use --quiet option to prevent wget output from printing too many lines to notebook\nwget --no-check-certificate --quiet ${orig_gff_url}/${orig_gff}\n\nls -ltrh "${orig_gff}"\n')


# ### Examine GFF

# In[4]:


get_ipython().run_cell_magic('bash', '', 'head -n 20 "${data_dir}"/"${orig_gff}"\n')


# ### Use GFFutils to extract gene features

# In[7]:


get_ipython().run_cell_magic('bash', '', '# Extract just gene features\n# Extract chromosome name, start, end, and Dbxref fields\n# Dbxref is the NCBI gene name, in this particular instance\n# Specify input as GFF\n# Use awk to format as comma-delimited output to help with downstream parsing/joining\ntime \\\ngtf_extract \\\n--feature gene \\\n--fields=chrom,start,end,Dbxref \\\n--gff ${data_dir}/${orig_gff} \\\n| awk \'BEGIN { OFS = ","; FS="[\\t:]"} {print $1, $2, $3, $5}\' \\\n> ${analysis_dir}/${gtf_extractor_output}\n')


# #### Check results

# In[8]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh ${gtf_extractor_output}\n\necho ""\n\nhead ${gtf_extractor_output}\n')


# #### Confirm that GFFutils output seem okay

# In[10]:


get_ipython().run_cell_magic('bash', '', '# Count gene features via GFFutils\necho "GFFutils number of extracted genes:"\ngtf_extract -f gene --fields=Dbxref --gff ${data_dir}/${orig_gff} | wc -l\n\necho ""\n\n# Count gene features via awk\necho "awk number of extracted genes:"\nawk \'$3 == "gene" { print $0 }\' ${data_dir}/${orig_gff} | wc -l\n')


# ### Extract gene ids for batch submission to UniProt

# In[11]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nawk -F"," \'{print $4}\' "${gtf_extractor_output}" > "${gene_list}"\n')


# #### Check gene list

# In[12]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nhead "${gene_list}"\n')


# ### Batch submission to UniProt
# 
# Perl script obtained from UniProt: https://www.uniprot.org/help/api_batch_retrieval
# 
# Modified to map NCIB gene ID to UniProt accession.

# In[13]:


get_ipython().run_cell_magic('bash', '', '# Print the script for viewing\ncat /home/samb/programs/uniprot_mapping.pl\n')


# In[16]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\ntime \\\nperl /home/samb/programs/uniprot_mapping.pl "${gene_list}" > "${perl_output}"\nls -ltrh\n\necho ""\necho ""\necho "--------------------------------------------------"\necho ""\necho "Line count:"\nwc -l "${perl_output}"\n')


# #### Check mapping output

# In[17]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nhead -n 30 "${perl_output}"\n')


# ### Parse out the stuff we want:
# 
# - UniProt accession
# 
# - Gene ID (NCBI gene ID)
# 
# - Gene name/abbraviation
# 
# - Gene description
# 
# - GO terms

# In[18]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\ntime \\\nwhile read -r line\ndo\n  # Get record line descriptor\n  descriptor=$(echo "${line}" | awk \'{print $1}\' )\n\n  # Capture second field for evaluation\n  go_line=$(echo "${line}" | awk \'{print $2}\')\n\n  # Append GO IDs to array\n  if [[ "${go_line}" == "GO;" ]]; then\n    go_id=$(echo "${line}" | awk \'{print $3}\')\n    go_ids_array+=("${go_id}")\n  elif [[ "${go_line}" == "GeneID;" ]]; then\n    # Uses sed to strip trailing semi-colon\n    gene_id=$(echo "${line}" | awk \'{print $3}\' | sed \'s/;$//\')\n  fi\n\n  # Get gene description\n  if [[ "${descriptor}"  == "DE" ]] && [[ "${go_line}"  == "SubName:" ]]; then\n    # Uses sed to strip trailing spaces at end of line and remove commas\n    gene_description=$(echo "${line}" | awk -F"[={]" \'{print $2}\' | sed \'s/[[:blank:]]*$//\' | sed \'s/,//g\')\n\n  # Get gene name\n  elif [[ "${descriptor}"  == "GN"  ]] && [[ $(echo "${line}" | awk -F "=" \'{print $1}\') == "GN   Name" ]]; then\n    # Uses sed to strip trailing spaces at end of line\n    gene=$(echo "${line}" | awk -F\'Name=|{\' \'{print $2}\' | sed \'s/[[:blank:]]*$//\')\n\n  # Get UniProt accession\n  elif [[ "${descriptor}"  == "AC" ]]; then\n    # Uses sed to strip trailing semi-colon\n    accession=$(echo "${line}" | awk \'{print $2}\' | sed \'s/;$//\')\n\n  # Identify beginning on new record\n  elif [[ "${descriptor}"  == "//" ]]; then\n\n    # Prints other comma-separated variables, then GOID1;GOID2;GOIDn\n    # IFS prevents spaces from being added between GO IDs\n    # sed removes ";" after final GO ID\n    (IFS=; printf "%s,%s,%s,%s,%s\\n" "${accession}" "${gene_id}" "${gene}" "${gene_description}" "${go_ids_array[*]}" | sed \'s/;$//\')\n\n    # Re-initialize variables\n    accession=""  \n    descriptor=""\n    gene=""\n    gene_description\n    gene_id=""\n    go_id=""\n    go_ids_array=()\n  fi\n\n\ndone < "${perl_output}" >> "${parsed_uniprot}"\n')


# Despite notebook error message, if you check the time stamps on the files below, it looks like this took nearly 6.5hrs!!

# #### Check parsed file

# In[20]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh\necho ""\nhead "${parsed_uniprot}"\n\necho ""\necho ""\necho "--------------------------------------------------"\necho ""\necho "Line count:"\nwc -l "${parsed_uniprot}"\n')


# Line count looks reasonable, as I know that some NCBI gene IDs are associated with multiple UniProt accessions, so we should end up with more results than were submitted.

# ### Join parsed UniProt info with chromosome names

# This will sort the both files on the columns with the NCBI gene ID for joining.
# 
# Then, it will replace the commas with tabs and re-order the columns so that the NCBI chromosome is in the first column.

# In[22]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\njoin \\\n-t "," \\\n-1 4 \\\n-2 2 \\\n<(sort -t "," -k 4,4 "${gtf_extractor_output}")  \\\n<(sort -t "," -k2,2 "${parsed_uniprot}") \\\n| awk \'BEGIN {FS=","; OFS="\\t"} {print $2, $1, $3, $4, $5, $6, $7, $8}\' \\\n> "${joined_output}"\n')


# In[23]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\nls -ltrh\n\necho ""\n\necho "Line count:"\nwc -l "${joined_output}"\n')


# Hmmm, does this line count make sense?
# 
# Submitted to UniProt: 79030
# 
# Returned from UniProt: 82393
# 
# Joined: 82570

# Look at the file. Columns will be tab-separated in this order:
# 
# | chromosome | NCBI gene ID | start | end | UniProt accession | gene abbreviation/name | gene description | GO IDs |
# |---|----|----|----|---|----|----|----|

# In[24]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nhead "${joined_output}"\n')


# ### Generate cheksums
# 
# Forgot to do this before closing notebook.

# In[2]:


get_ipython().run_cell_magic('bash', '', 'cd "${analysis_dir}"\n\nfor file in *\ndo\n  md5sum "${file}" | tee --append checksums.md5\ndone\n')


# In[ ]: