#!/usr/bin/env python
# coding: utf-8

# ## Notebook to take `blast` output from transcriptome v 3.1 to GOslim

# ### Set working directory

# In[1]:


pwd


# In[2]:


wd = "../analyses/BLAST-to-GOslim/"


# In[3]:


cd $wd


# ### Need three files in this directory

# 1. `blast` output from transcriptome v 3.1 in format -6
# 2. Uniprot GO annotation file (340M) available here: http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted
# 3. GOslim file available here: http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted

# In[4]:


#`curl` `blast` output
get_ipython().system('curl --insecure https://gannet.fish.washington.edu/Atumefaciens/20200608_cbai_diamond_blastx_v2.1_v3.1/cbai_transcriptome_v3.1.blastx.outfmt6  -o 20200608.C_bairdi.Trinity.blastx.outfmt6')


# In[5]:


get_ipython().system('curl http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted -o uniprot-SP-GO.sorted')


# In[6]:


get_ipython().system('curl http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted -o GO-GOslim.sorted')


# In[7]:


#check that files are in directory
get_ipython().system('ls')


# In[17]:


#need to remove rows that are duplicate Trinity IDs
#Sam's notes on what below code does:
#1.  Sort BLAST output file by Trinity ID (column 1) and then by e-value (column 11), while setting a tab as the field separator (sort defaults to space as field separator).
#2. Pipe to awk. Runs an array where the first field (Trinity ID) is loaded into the array and compared to the value of the first field in the next line. If they do not match, it will print the line.
get_ipython().system("sort -k1,1 -k11,11 --field-separator $'\\t' 20200608.C_bairdi.Trinity.blastx.outfmt6 | awk '!($1 in array){array[$1]; print}' | wc -l")


# In[12]:


get_ipython().system("sort -u -k1,1 --field-separator $'\\t' 20200608.C_bairdi.Trinity.blastx.outfmt6 > blastout")


# In[18]:


get_ipython().system('wc -l blastout')


# In[11]:


#set `blast` output file as a variable
blastout="20200608.C_bairdi.Trinity.blastx.outfmt6"


# In[21]:


get_ipython().system('head -2 blastout')


# In[23]:


#convert pipes to tab
get_ipython().system("tr '|' '\\t' < blastout  > _blast-sep.tab")


# In[24]:


get_ipython().system('head -2 _blast-sep.tab')
#commit _blast-sep.tab to directory --> will be used to get list of Trinity_IDs from GOslim 'stress response'


# In[25]:


#reducing number of columns and sorting 
get_ipython().system("awk -v OFS='\\t' '{print $3, $1, $13}' < _blast-sep.tab | sort  > _blast-sort.tab")


# In[26]:


get_ipython().system('head -2 _blast-sort.tab')


# In[27]:


get_ipython().system('head -2 uniprot-SP-GO.sorted')


# In[28]:


#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms
get_ipython().system("join -t $'\\t'  _blast-sort.tab  uniprot-SP-GO.sorted  | cut -f1,2,14  > _blast-annot.tab")


# In[29]:


get_ipython().system('head -2 _blast-annot.tab')


# #### The following is a script modified by Sam White

# In[30]:


get_ipython().run_cell_magic('bash', '', '\n# This script was originally written to address a specific problem that Rhonda was having\n\n\n\n# input_file is the initial, "problem" file\n# file is an intermediate file that most of the program works upon\n# output_file is the final file produced by the script\ninput_file="_blast-annot.tab"\nfile="_intermediate.file"\noutput_file="_blast-GO-unfolded.tab"\n\n# sed command substitutes the "; " sequence to a tab and writes the new format to a new file.\n# This character sequence is how the GO terms are delimited in their field.\nsed $\'s/; /\\t/g\' "$input_file" > "$file"\n\n# Identify first field containing a GO term.\n# Search file with grep for "GO:" and pipe to awk.\n# Awk sets tab as field delimiter (-F\'\\t\'), runs a for loop that looks for "GO:" (~/GO:/), and then prints the field number).\n# Awk results are piped to sort, which sorts unique by number (-ug).\n# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; "-n1").\nbegin_goterms=$(grep "GO:" "$file" | awk -F\'\\t\' \'{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}\' | sort -ug | head -n1)\n\n# While loop to process each line of the input file.\nwhile read -r line\n\tdo\n\t\n\t# Send contents of the current line to awk.\n\t# Set the field separator as a tab (-F\'\\t\') and print the number of fields in that line.\n\t# Save the results of the echo/awk pipe (i.e. number of fields) to the variable "max_field".\n\tmax_field=$(echo "$line" | awk -F\'\\t\' \'{print NF}\')\n\n\t# Send contents of current line to cut.\n\t# Cut fields (i.e. retain those fields) 1-12.\n\t# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable "fixed_fields"\n\tfixed_fields=$(echo "$line" | cut -f1-2)\n\n\t# Since not all the lines contain the same number of fields (e.g. may not have GO terms),\n\t# evaluate the number of fields in each line to determine how to handle current line.\n\n\t# If the value in max_field is less than the field number where the GO terms begin,\n\t# then just print the current line (%s) followed by a newline (\\n).\n\tif (( "$max_field" < "$begin_goterms" ))\n\t\tthen printf "%s\\n" "$line"\n\t\t\telse\n\n\t\t\t# Send contents of current line (which contains GO terms) to cut.\n\t\t\t# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.\n\t\t\t# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable "goterms".\n\t\t\tgoterms=$(echo "$line" | cut -f"$begin_goterms"-"$max_field")\n\t\t\t\n\t\t\t# Assign values in the variable "goterms" to a new indexed array (called "array"), \n\t\t\t# with tab delimiter (IFS=$\'\\t\')\n\t\t\tIFS=$\'\\t\' read -r -a array <<<"$goterms"\n\t\t\t\n\t\t\t# Iterate through each element of the array.\n\t\t\t# Print the first 12 fields (i.e. the fields stored in "fixed_fields") followed by a tab (%s\\t).\n\t\t\t# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\\n).\n\t\t\tfor element in "${!array[@]}"\t\n\t\t\t\tdo printf "%s\\t%s\\n" "$fixed_fields" "${array[$element]}"\n\t\t\tdone\n\tfi\n\n# Send the input file into the while loop and send the output to a file named "rhonda_fixed.txt".\ndone < "$file" > "$output_file"\n')


# In[31]:


get_ipython().system('head _blast-GO-unfolded.tab')


# In[32]:


#gets rid of lines with no GOIDs
get_ipython().system('sort -k3 _blast-GO-unfolded.tab | grep "GO:" > _blast-GO-unfolded.sorted')


# In[33]:


#joining files to get GOslim for each query (with duplicate GOslim / query removed)
get_ipython().system("join -1 3 -2 1 -t $'\\t'  _blast-GO-unfolded.sorted  GO-GOslim.sorted  | uniq | awk -F'\\t' -v OFS='\\t' '{print $3, $1, $5, $6}'  > Blastquery-GOslim.tab")


# In[34]:


get_ipython().system('head Blastquery-GOslim.tab')


# In[35]:


get_ipython().system('cat Blastquery-GOslim.tab | grep "\tP"  | awk -F $\'\\t\' \'{print $4}\' | sort | uniq -c')


# In[36]:


get_ipython().system('cat Blastquery-GOslim.tab | grep "\tP" | awk -F $\'\\t\' \'{print $3}\' | sort | uniq -c | sort -r')


# In[37]:


get_ipython().system('cat Blastquery-GOslim.tab | grep "\tP" | awk -F $\'\\t\' \'{print $3}\' | sort | uniq -c | sort -r > GOslim-P-pie.txt')


# In[ ]: