#!/usr/bin/env python
# coding: utf-8

# ## A notebook to seamlessly take blast output to GO Slim list

# This is a notebook meant to run in a working directory. Please set working directory as variable in next cell

# In[2]:


wd="/Volumes/Alanine/wd/17-07-20b/"


# In[3]:


cd $wd


# ### In this directory you will need three files
# 1) blastout file in format `-6`    
# 2) Uniprot GO annotation file (340M) available here `http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted`    
# 3) GOslim file available here `http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted`

# In[6]:


#checking if files in directory
get_ipython().system('ls')


# ### Set blastout file as variable

# In[7]:


blastout="giga-uniprot-blast.tab"


# # That should be the last thing you have to Type!

# 

# In[9]:


get_ipython().system('head -2 $blastout')


# In[12]:


#convert pipes to tab
get_ipython().system("tr '|' '\\t' < $blastout  > _blast-sep.tab")


# In[13]:


get_ipython().system('head -2 _blast-sep.tab')


# In[14]:


#reducing number of columns and sorting 
get_ipython().system("awk -v OFS='\\t' '{print $4, $1, $14}' < _blast-sep.tab | sort  > _blast-sort.tab")


# In[15]:


get_ipython().system('head -2 _blast-sort.tab')


# In[19]:


#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms
get_ipython().system("join -t $'\\t'  _blast-sort.tab  uniprot-SP-GO.sorted  | cut -f1,2,14  > _blast-annot.tab")


# In[21]:


get_ipython().system('head -2 _blast-annot.tab')


# ## The following is a script modidified from Sam

# In[24]:


get_ipython().run_cell_magic('bash', '', '\n# This script was originally written to address a specific problem that Rhonda was having\n\n\n\n# input_file is the initial, "problem" file\n# file is an intermediate file that most of the program works upon\n# output_file is the final file produced by the script\ninput_file="_blast-annot.tab"\nfile="_intermediate.file"\noutput_file="_blast-GO-unfolded.tab"\n\n# sed command substitutes the "; " sequence to a tab and writes the new format to a new file.\n# This character sequence is how the GO terms are delimited in their field.\nsed $\'s/; /\\t/g\' "$input_file" > "$file"\n\n# Identify first field containing a GO term.\n# Search file with grep for "GO:" and pipe to awk.\n# Awk sets tab as field delimiter (-F\'\\t\'), runs a for loop that looks for "GO:" (~/GO:/), and then prints the field number).\n# Awk results are piped to sort, which sorts unique by number (-ug).\n# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; "-n1").\nbegin_goterms=$(grep "GO:" "$file" | awk -F\'\\t\' \'{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}\' | sort -ug | head -n1)\n\n# While loop to process each line of the input file.\nwhile read -r line\n\tdo\n\t\n\t# Send contents of the current line to awk.\n\t# Set the field separator as a tab (-F\'\\t\') and print the number of fields in that line.\n\t# Save the results of the echo/awk pipe (i.e. number of fields) to the variable "max_field".\n\tmax_field=$(echo "$line" | awk -F\'\\t\' \'{print NF}\')\n\n\t# Send contents of current line to cut.\n\t# Cut fields (i.e. retain those fields) 1-12.\n\t# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable "fixed_fields"\n\tfixed_fields=$(echo "$line" | cut -f1-2)\n\n\t# Since not all the lines contain the same number of fields (e.g. may not have GO terms),\n\t# evaluate the number of fields in each line to determine how to handle current line.\n\n\t# If the value in max_field is less than the field number where the GO terms begin,\n\t# then just print the current line (%s) followed by a newline (\\n).\n\tif (( "$max_field" < "$begin_goterms" ))\n\t\tthen printf "%s\\n" "$line"\n\t\t\telse\n\n\t\t\t# Send contents of current line (which contains GO terms) to cut.\n\t\t\t# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.\n\t\t\t# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable "goterms".\n\t\t\tgoterms=$(echo "$line" | cut -f"$begin_goterms"-"$max_field")\n\t\t\t\n\t\t\t# Assign values in the variable "goterms" to a new indexed array (called "array"), \n\t\t\t# with tab delimiter (IFS=$\'\\t\')\n\t\t\tIFS=$\'\\t\' read -r -a array <<<"$goterms"\n\t\t\t\n\t\t\t# Iterate through each element of the array.\n\t\t\t# Print the first 12 fields (i.e. the fields stored in "fixed_fields") followed by a tab (%s\\t).\n\t\t\t# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\\n).\n\t\t\tfor element in "${!array[@]}"\t\n\t\t\t\tdo printf "%s\\t%s\\n" "$fixed_fields" "${array[$element]}"\n\t\t\tdone\n\tfi\n\n# Send the input file into the while loop and send the output to a file named "rhonda_fixed.txt".\ndone < "$file" > "$output_file"\n')


# In[25]:


get_ipython().system('head _blast-GO-unfolded.tab')


# In[26]:


#gets rid of lines with no GOIDs
get_ipython().system('sort -k3 _blast-GO-unfolded.tab | grep "GO:" > _blast-GO-unfolded.sorted')


# In[29]:


#joining files to get GOslim for each query (with duplicate GOslim / query removed)
get_ipython().system("join -1 3 -2 1 -t $'\\t'  _blast-GO-unfolded.sorted  GO-GOslim.sorted  | uniq | awk -F'\\t' -v OFS='\\t' '{print $3, $1, $5, $6}'  > Blastquery-GOslim.tab")


# In[30]:


get_ipython().system('head Blastquery-GOslim.tab')


# In[ ]: