blast
output from transcriptome v 3.1 to GOslim¶pwd
'/Users/graciecrandall/Documents/GitHub/paper-tanner-crab/notebooks'
wd = "../analyses/BLAST-to-GOslim/"
cd $wd
/Users/graciecrandall/Documents/GitHub/paper-tanner-crab/analyses/BLAST-to-GOslim
blast
output from transcriptome v 3.1 in format -6#`curl` `blast` output
!curl --insecure https://gannet.fish.washington.edu/Atumefaciens/20200608_cbai_diamond_blastx_v2.1_v3.1/cbai_transcriptome_v3.1.blastx.outfmt6 \
-o 20200608.C_bairdi.Trinity.blastx.outfmt6
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 4391k 100 4391k 0 0 2531k 0 0:00:01 0:00:01 --:--:-- 4186k
!curl http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/uniprot-SP-GO.sorted -o uniprot-SP-GO.sorted
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 340M 100 340M 0 0 7996k 0 0:00:43 0:00:43 --:--:-- 14.2M 0 0:00:47 0:00:40 0:00:07 8104k
!curl http://owl.fish.washington.edu/halfshell/bu-alanine-wd/17-07-20/GO-GOslim.sorted -o GO-GOslim.sorted
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 2314k 100 2314k 0 0 1182k 0 0:00:01 0:00:01 --:--:-- 3464k
#check that files are in directory
!ls
20200608.C_bairdi.Trinity.blastx.outfmt6 Blastquery-GOslim.tab GO-GOslim.sorted GOslim-P-pie.txt GOslim-pie-transcriptome-v3.1.pdf _blast-sep.tab allterms.GOslim-pie-transcriptome-v3.1.pdf readme.md transcriptome-stress-genes.tab uniprot-SP-GO.sorted
#need to remove rows that are duplicate Trinity IDs
#Sam's notes on what below code does:
#1. Sort BLAST output file by Trinity ID (column 1) and then by e-value (column 11), while setting a tab as the field separator (sort defaults to space as field separator).
#2. Pipe to awk. Runs an array where the first field (Trinity ID) is loaded into the array and compared to the value of the first field in the next line. If they do not match, it will print the line.
!sort -k1,1 -k11,11 --field-separator $'\t' 20200608.C_bairdi.Trinity.blastx.outfmt6 | awk '!($1 in array){array[$1]; print}' | wc -l
48551
!sort -u -k1,1 --field-separator $'\t' 20200608.C_bairdi.Trinity.blastx.outfmt6 > blastout
!wc -l blastout
48551 blastout
#set `blast` output file as a variable
blastout="20200608.C_bairdi.Trinity.blastx.outfmt6"
!head -2 blastout
TRINITY_DN100026_c0_g1_i1 sp|P60322|NANO2_MOUSE 79.3 29 6 0 319 233 88 116 4.1e-08 58.9 TRINITY_DN100027_c0_g1_i1 sp|Q27597|NCPR_DROME 48.4 62 32 0 547 362 460 521 2.2e-11 70.5
#convert pipes to tab
!tr '|' '\t' < blastout \
> _blast-sep.tab
!head -2 _blast-sep.tab
#commit _blast-sep.tab to directory --> will be used to get list of Trinity_IDs from GOslim 'stress response'
TRINITY_DN100026_c0_g1_i1 sp P60322 NANO2_MOUSE 79.3 29 6 0 319 233 88 116 4.1e-08 58.9 TRINITY_DN100027_c0_g1_i1 sp Q27597 NCPR_DROME 48.4 62 32 0 547 362 460 521 2.2e-11 70.5
#reducing number of columns and sorting
!awk -v OFS='\t' '{print $3, $1, $13}' < _blast-sep.tab | sort \
> _blast-sort.tab
!head -2 _blast-sort.tab
A0A087WPF7 TRINITY_DN5521_c0_g1_i1 7.9e-06 A0A0B4J2F2 TRINITY_DN97262_c0_g1_i1 3.6e-10
!head -2 uniprot-SP-GO.sorted
A0A023GPI8 LECA_CANBL reviewed Lectin alpha chain (CboL) [Cleaved into: Lectin beta chain; Lectin gamma chain] Canavalia boliviana 237 mannose binding [GO:0005537]; metal ion binding [GO:0046872] mannose binding [GO:0005537]; metal ion binding [GO:0046872] GO:0005537; GO:0046872 A0A023GPJ0 CDII_ENTCC reviewed Immunity protein CdiI cdiI ECL_04450.1 Enterobacter cloacae subsp. cloacae (strain ATCC 13047 / DSM 30054 / NBRC 13535 / NCDC 279-56) 145
#joining blast with uniprot annoation file and reducing to three columns UniprotID, Query, All GO terms
!join -t $'\t' \
_blast-sort.tab \
uniprot-SP-GO.sorted \
| cut -f1,2,14 \
> _blast-annot.tab
!head -2 _blast-annot.tab
A0A0F7YZI5 TRINITY_DN115719_c0_g1_i1 GO:0005179; GO:0005576 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0005634; GO:0005829; GO:0006351; GO:0006355; GO:0006364; GO:0006476; GO:0006915; GO:0007517; GO:0007569; GO:0010046; GO:0010460; GO:0010667; GO:0010976; GO:0014858; GO:0016605; GO:0017136; GO:0019899; GO:0030424; GO:0030426; GO:0031667; GO:0032720; GO:0035774; GO:0043392; GO:0043422; GO:0043524; GO:0045471; GO:0045722; GO:0046872; GO:0048511; GO:0060125; GO:0060548; GO:0070301; GO:0070403; GO:0070932; GO:0071236; GO:0071303; GO:0071407; GO:0090312; GO:0097755; GO:1900181; GO:1901984; GO:1902617; GO:1903427; GO:1904373; GO:1904638; GO:1904644; GO:1904646; GO:1904648; GO:2000270; GO:2000505; GO:2000614
%%bash
# This script was originally written to address a specific problem that Rhonda was having
# input_file is the initial, "problem" file
# file is an intermediate file that most of the program works upon
# output_file is the final file produced by the script
input_file="_blast-annot.tab"
file="_intermediate.file"
output_file="_blast-GO-unfolded.tab"
# sed command substitutes the "; " sequence to a tab and writes the new format to a new file.
# This character sequence is how the GO terms are delimited in their field.
sed $'s/; /\t/g' "$input_file" > "$file"
# Identify first field containing a GO term.
# Search file with grep for "GO:" and pipe to awk.
# Awk sets tab as field delimiter (-F'\t'), runs a for loop that looks for "GO:" (~/GO:/), and then prints the field number).
# Awk results are piped to sort, which sorts unique by number (-ug).
# Sort results are piped to head to retrieve the lowest value (i.e. the top of the list; "-n1").
begin_goterms=$(grep "GO:" "$file" | awk -F'\t' '{for (i=1;i<=NF;i++) if($i ~/GO:/) print i}' | sort -ug | head -n1)
# While loop to process each line of the input file.
while read -r line
do
# Send contents of the current line to awk.
# Set the field separator as a tab (-F'\t') and print the number of fields in that line.
# Save the results of the echo/awk pipe (i.e. number of fields) to the variable "max_field".
max_field=$(echo "$line" | awk -F'\t' '{print NF}')
# Send contents of current line to cut.
# Cut fields (i.e. retain those fields) 1-12.
# Save the results of the echo/cut pipe (i.e. fields 1-12) to the variable "fixed_fields"
fixed_fields=$(echo "$line" | cut -f1-2)
# Since not all the lines contain the same number of fields (e.g. may not have GO terms),
# evaluate the number of fields in each line to determine how to handle current line.
# If the value in max_field is less than the field number where the GO terms begin,
# then just print the current line (%s) followed by a newline (\n).
if (( "$max_field" < "$begin_goterms" ))
then printf "%s\n" "$line"
else
# Send contents of current line (which contains GO terms) to cut.
# Cut fields (i.e. retain those fields) 13 to whatever the last field is in the curent line.
# Save the results of the echo/cut pipe (i.e. all the GO terms fields) to the variable "goterms".
goterms=$(echo "$line" | cut -f"$begin_goterms"-"$max_field")
# Assign values in the variable "goterms" to a new indexed array (called "array"),
# with tab delimiter (IFS=$'\t')
IFS=$'\t' read -r -a array <<<"$goterms"
# Iterate through each element of the array.
# Print the first 12 fields (i.e. the fields stored in "fixed_fields") followed by a tab (%s\t).
# Print the current element in the array (i.e. the current GO term) followed by a new line (%s\n).
for element in "${!array[@]}"
do printf "%s\t%s\n" "$fixed_fields" "${array[$element]}"
done
fi
# Send the input file into the while loop and send the output to a file named "rhonda_fixed.txt".
done < "$file" > "$output_file"
!head _blast-GO-unfolded.tab
A0A0F7YZI5 TRINITY_DN115719_c0_g1_i1 GO:0005179 A0A0F7YZI5 TRINITY_DN115719_c0_g1_i1 GO:0005576 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0005634 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0005829 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0006351 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0006355 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0006364 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0006476 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0006915 A0A0G2JZ79 TRINITY_DN4191_c0_g1_i1 GO:0007517
#gets rid of lines with no GOIDs
!sort -k3 _blast-GO-unfolded.tab | grep "GO:" > _blast-GO-unfolded.sorted
#joining files to get GOslim for each query (with duplicate GOslim / query removed)
!join -1 3 -2 1 -t $'\t' \
_blast-GO-unfolded.sorted \
GO-GOslim.sorted \
| uniq | awk -F'\t' -v OFS='\t' '{print $3, $1, $5, $6}' \
> Blastquery-GOslim.tab
!head Blastquery-GOslim.tab
TRINITY_DN15054_c1_g1_i1 GO:0000001 cell organization and biogenesis P TRINITY_DN1054_c1_g1_i1 GO:0000001 cell organization and biogenesis P TRINITY_DN1054_c1_g1_i3 GO:0000001 cell organization and biogenesis P TRINITY_DN1054_c1_g1_i5 GO:0000001 cell organization and biogenesis P TRINITY_DN1054_c1_g1_i7 GO:0000001 cell organization and biogenesis P TRINITY_DN2169_c3_g1_i2 GO:0000002 cell organization and biogenesis P TRINITY_DN2169_c3_g1_i3 GO:0000002 cell organization and biogenesis P TRINITY_DN1742_c0_g1_i19 GO:0000002 cell organization and biogenesis P TRINITY_DN15054_c1_g1_i1 GO:0000002 cell organization and biogenesis P TRINITY_DN23579_c0_g1_i1 GO:0000002 cell organization and biogenesis P
!cat Blastquery-GOslim.tab | grep " P" | awk -F $'\t' '{print $4}' | sort | uniq -c
198745 P
!cat Blastquery-GOslim.tab | grep " P" | awk -F $'\t' '{print $3}' | sort | uniq -c | sort -r
8841 cell cycle and proliferation 6507 DNA metabolism 3858 death 27904 other biological processes 26964 developmental processes 23297 RNA metabolism 23265 cell organization and biogenesis 2129 cell-cell signaling 20834 other metabolic processes 1768 cell adhesion 17143 protein metabolism 15006 transport 10910 stress response 10319 signal transduction
!cat Blastquery-GOslim.tab | grep " P" | awk -F $'\t' '{print $3}' | sort | uniq -c | sort -r > GOslim-P-pie.txt