This is a notebook meant to run in a working directory. Please set working directory as variable in next cell
wd="/Volumes/Alanine/wd/17-07-20b/"
cd $wd
/Volumes/Alanine/wd/17-07-20b
-6
http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted
http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted
#checking if files in directory
!ls
GO-GOslim.sorted giga-uniprot-blast.tab uniprot-SP-GO.sorted
blastout="giga-uniprot-blast.tab"
!head -2 $blastout
CHOYP_043R.5.5|m.64252 sp|Q06852|SLAP1_CLOTH 57.45 235 61 19 611 816 1467 1691 2e-15 85.1 CHOYP_14332.1.2|m.5643 sp|Q2F637|1433Z_BOMMO 66.03 262 74 2 19 280 1 247 1e-117 344
#convert pipes to tab
!tr '|' '\t' < $blastout \
> _blast-sep.tab
!head -2 _blast-sep.tab
CHOYP_043R.5.5 m.64252 sp Q06852 SLAP1_CLOTH 57.45 235 61 19 611 816 1467 1691 2e-15 85.1 CHOYP_14332.1.2 m.5643 sp Q2F637 1433Z_BOMMO 66.03 262 74 2 19 280 1 247 1e-117 344
#reducing number of columns and sorting
!awk -v OFS='\t' '{print $4, $1, $14}' < _blast-sep.tab | sort \
> _blast-sort.tab
!head -2 _blast-sort.tab
A1BHN5 CHOYP_AAAS.3.3 2.1 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 6e-124
#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms
!join -t $'\t' \
_blast-sort.tab \
uniprot-SP-GO.sorted \
| cut -f1,2,14 \
> _blast-annot.tab
!head -2 _blast-annot.tab
A1BHN5 CHOYP_AAAS.3.3 GO:0005524; GO:0006298; GO:0030983 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0005737; GO:0005783; GO:0005789; GO:0016021; GO:0031625; GO:0061630
%%bash
# This script was originally written to address a specific problem that Rhonda was having
# input_file is the initial, "problem" file
# file is an intermediate file that most of the program works upon
# output_file is the final file produced by the script
input_file="_blast-annot.tab"
file="_intermediate.file"
output_file="_blast-GO-unfolded.tab"
# sed command substitutes the "; " sequence to a tab and writes the new format to a new file.
# This character sequence is how the GO terms are delimited in their field.
sed $'s/; /\t/g' "$input_file" > "$file"
# Identify first field containing a GO term.
# Search file with grep for "GO:" and pipe to awk.
# Awk sets tab as field delimiter (-F'\t'), runs a for loop that looks for "GO:" (~/GO:/), and then prints the field number).
# Awk results are piped to sort, which sorts unique by number (-ug).
# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; "-n1").
begin_goterms=$(grep "GO:" "$file" | awk -F'\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)
# While loop to process each line of the input file.
while read -r line
do
# Send contents of the current line to awk.
# Set the field separator as a tab (-F'\t') and print the number of fields in that line.
# Save the results of the echo/awk pipe (i.e. number of fields) to the variable "max_field".
max_field=$(echo "$line" | awk -F'\t' '{print NF}')
# Send contents of current line to cut.
# Cut fields (i.e. retain those fields) 1-12.
# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable "fixed_fields"
fixed_fields=$(echo "$line" | cut -f1-2)
# Since not all the lines contain the same number of fields (e.g. may not have GO terms),
# evaluate the number of fields in each line to determine how to handle current line.
# If the value in max_field is less than the field number where the GO terms begin,
# then just print the current line (%s) followed by a newline (\n).
if (( "$max_field" < "$begin_goterms" ))
then printf "%s\n" "$line"
else
# Send contents of current line (which contains GO terms) to cut.
# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.
# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable "goterms".
goterms=$(echo "$line" | cut -f"$begin_goterms"-"$max_field")
# Assign values in the variable "goterms" to a new indexed array (called "array"),
# with tab delimiter (IFS=$'\t')
IFS=$'\t' read -r -a array <<<"$goterms"
# Iterate through each element of the array.
# Print the first 12 fields (i.e. the fields stored in "fixed_fields") followed by a tab (%s\t).
# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\n).
for element in "${!array[@]}"
do printf "%s\t%s\n" "$fixed_fields" "${array[$element]}"
done
fi
# Send the input file into the while loop and send the output to a file named "rhonda_fixed.txt".
done < "$file" > "$output_file"
!head _blast-GO-unfolded.tab
A1BHN5 CHOYP_AAAS.3.3 GO:0005524 A1BHN5 CHOYP_AAAS.3.3 GO:0006298 A1BHN5 CHOYP_AAAS.3.3 GO:0030983 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0005737 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0005783 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0005789 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0016021 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0031625 A5PLL7 CHOYP_AAEL_AAEL001208.1.1 GO:0061630 B0S104 CHOYP_AAAT.1.1 GO:0000049
#gets rid of lines with no GOIDs
!sort -k3 _blast-GO-unfolded.tab | grep "GO:" > _blast-GO-unfolded.sorted
#joining files to get GOslim for each query (with duplicate GOslim / query removed)
!join -1 3 -2 1 -t $'\t' \
_blast-GO-unfolded.sorted \
GO-GOslim.sorted \
| uniq | awk -F'\t' -v OFS='\t' '{print $3, $1, $5, $6}' \
> Blastquery-GOslim.tab
!head Blastquery-GOslim.tab
CHOYP_AAAT.1.1 GO:0000049 nucleic acid binding activity F CHOYP_AAEL_AAEL004839.1.1 GO:0000049 nucleic acid binding activity F CHOYP_1433E.2.2 GO:0000077 cell cycle and proliferation P CHOYP_1433E.2.2 GO:0000077 signal transduction P CHOYP_1433E.2.2 GO:0000077 stress response P CHOYP_AAEL_AAEL001593.1.1 GO:0000082 cell cycle and proliferation P CHOYP_4EBP1.1.1 GO:0000082 cell cycle and proliferation P CHOYP_AAEL_AAEL004557.1.1 GO:0000105 other metabolic processes P CHOYP_AAEL_AAEL003539.1.1 GO:0000118 nucleus C CHOYP_AAEL_AAEL003539.1.1 GO:0000122 RNA metabolism P