%%bash
# Record the date, OS release, hostname, CPU and memory details of the
# machine running this notebook.
rule="------------"

printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "${rule}"

# Operating system release information
lsb_release -a
printf '\n%s\n' "${rule}"

printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "${rule}"

printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "${rule}"

printf '%s\n\n' "Memory Specs"
free -mh
TODAY'S DATE: Wed Oct 23 11:08:28 PDT 2019 ------------ Distributor ID: Ubuntu Description: Ubuntu 16.04.6 LTS Release: 16.04 Codename: xenial ------------ HOSTNAME: swoose ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 24 On-line CPU(s) list: 0-23 Thread(s) per core: 2 Core(s) per socket: 6 Socket(s): 2 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 44 Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz Stepping: 2 CPU MHz: 2925.798 BogoMIPS: 5851.95 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 256K L3 cache: 12288K NUMA node0 CPU(s): 0-23 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d ------------ Memory Specs total used free shared buff/cache available Mem: 70G 13G 37G 602M 19G 55G Swap: 4.7G 1.0G 3.6G
No LSB modules are available.
`%env` variables are a convenient way to pass values into `%%bash` cells.
# Analysis working directory, exported to the environment so that %%bash
# cells can read it as ${wd}.
%env wd=/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping
# Same path kept as a Python variable; the later `cd {wd}` line references it
# with brace syntax rather than as a shell variable.
wd="/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping"
# rsync source (host:path) of the BLAST functional annotation table.
%env rsync_gannet=gannet:/volume2/web/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab
# File name of the BLAST table once it has been copied locally.
%env blast_tab=Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab
# Output file derived from the data rows of the BLAST table (header lines skipped).
%env sp_id_list=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id.list
# ID-to-GO mapping table that later cells read; nothing visible here creates it.
# NOTE(review): presumably downloaded from the UniProt ID-mapping page - confirm.
%env sp_id_go_mapping_tab=20191022_SP-ID-GO-mapping.tab
# Output file: BLAST rows joined to their GO IDs on the accession column.
%env blast_go_join=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id-join-go_id.tab
env: wd=/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping env: rsync_gannet=gannet:/volume2/web/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab env: blast_tab=Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab env: sp_id_list=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id.list env: sp_id_go_mapping_tab=20191022_SP-ID-GO-mapping.tab env: blast_go_join=20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id-join-go_id.tab
%%bash
# Create the analysis directory (and any missing parents) unless it is
# already present.
if [[ ! -d "${wd}" ]]; then
  mkdir --parents "${wd}"
fi
# NOTE(review): `{wd}` is not shell syntax; this looks like IPython's `cd` magic
# expanding the Python variable `wd`, and the path echoed afterwards suggests it
# ran as its own notebook cell rather than inside the %%bash cell - confirm.
cd {wd}
/home/sam/analyses/20191022_pgen_Pgen_v074.a4_BLAST-GO-mapping
%%bash
# Copy the BLAST annotation table from the remote host into the current
# directory, then list the directory contents to confirm the file arrived.
rsync_opts=(--archive --verbose)
rsync "${rsync_opts[@]}" "${rsync_gannet}" .

printf '%s\n' "--------------------------------------"
ls -lh
receiving incremental file list Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab sent 30 bytes received 1,533,721 bytes 1,022,500.67 bytes/sec total size is 1,533,389 speedup is 1.00 -------------------------------------- total 1.5M -rw-rw-rw- 1 sam users 1.5M Oct 3 11:18 Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab
%%bash
# Preview the BLAST table: the raw top of the file (header included), then
# the first data rows, then the total number of data rows.
# Data rows are taken to start on line 14; earlier lines are header.
first_data_line=14
divider="----------------------------"

# Emit only the data rows of the BLAST table.
data_rows() {
  tail -n "+${first_data_line}" "${blast_tab}"
}

head -n 25 "${blast_tab}"
printf '\n%s\n' "${divider}"

data_rows | head
printf '\n\n%s\n' "${divider}"

data_rows | wc -l
# # Output is generated by GenSAS 7.x-5.0 # #name : mRNA #start : Start of alignment in subject #end : End of alignment in subject #m_start : Start of alignment in query #m_end : End of alignment in query #al : Alignment length #score : Row score of the match #evalue : E value of the match #identity : Percentage of identical matches mame start end score Accession Match ID m_start m_end E-value identity al 21910-PGEN_.00g000010.m01 121 229 165 Q86IC9 sp|Q86IC9|CAMT1_DICDI 11 122 8.93e-14 35.652 115 21910-PGEN_.00g000020.m01 147 467 968 P04177 sp|P04177|TY3H_RAT 20 339 3.47e-127 55.140 321 21910-PGEN_.00g000050.m01 566 722 182 Q8L840 sp|Q8L840|RQL4A_ARATH 2 167 2.67e-14 35.119 168 21910-PGEN_.00g000080.m01 268 322 152 A1E2V0 sp|A1E2V0|BIRC3_CANLF 163 220 3.91e-10 53.448 58 21910-PGEN_.00g000090.m01 199 327 161 P34456 sp|P34456|YMD2_CAEEL 7 134 7.52e-12 26.357 129 21910-PGEN_.00g000210.m01 18 200 263 O00463 sp|O00463|TRAF5_HUMAN 5 191 2.24e-25 34.921 189 21910-PGEN_.00g000230.m01 48 155 287 Q00945 sp|Q00945|CONO_LYMST 31 134 1.59e-32 50.000 108 21910-PGEN_.00g000240.m01 4 605 1091 Q5SWK7 sp|Q5SWK7|RN145_MOUSE 13 601 2.65e-139 39.607 611 21910-PGEN_.00g000280.m01 4 153 210 Q8ZXT3 sp|Q8ZXT3|Y1111_PYRAE 853 1012 1.10e-17 38.750 160 21910-PGEN_.00g000300.m01 159 347 480 Q5REG4 sp|Q5REG4|DTX3_PONAB 1135 1320 1.20e-51 50.794 189 21910-PGEN_.00g000300.m02 159 347 480 Q5REG4 sp|Q5REG4|DTX3_PONAB 1138 1323 1.18e-51 50.794 189 21910-PGEN_.00g000380.m01 381 508 205 Q8QG60 sp|Q8QG60|CRY2_CHICK 2 145 4.92e-18 36.111 144 ---------------------------- 21910-PGEN_.00g000010.m01 121 229 165 Q86IC9 sp|Q86IC9|CAMT1_DICDI 11 122 8.93e-14 35.652 115 21910-PGEN_.00g000020.m01 147 467 968 P04177 sp|P04177|TY3H_RAT 20 339 3.47e-127 55.140 321 21910-PGEN_.00g000050.m01 566 722 182 Q8L840 sp|Q8L840|RQL4A_ARATH 2 167 2.67e-14 35.119 168 21910-PGEN_.00g000080.m01 268 322 152 A1E2V0 sp|A1E2V0|BIRC3_CANLF 163 220 3.91e-10 53.448 58 21910-PGEN_.00g000090.m01 199 327 161 P34456 sp|P34456|YMD2_CAEEL 7 
134 7.52e-12 26.357 129 21910-PGEN_.00g000210.m01 18 200 263 O00463 sp|O00463|TRAF5_HUMAN 5 191 2.24e-25 34.921 189 21910-PGEN_.00g000230.m01 48 155 287 Q00945 sp|Q00945|CONO_LYMST 31 134 1.59e-32 50.000 108 21910-PGEN_.00g000240.m01 4 605 1091 Q5SWK7 sp|Q5SWK7|RN145_MOUSE 13 601 2.65e-139 39.607 611 21910-PGEN_.00g000280.m01 4 153 210 Q8ZXT3 sp|Q8ZXT3|Y1111_PYRAE 853 1012 1.10e-17 38.750 160 21910-PGEN_.00g000300.m01 159 347 480 Q5REG4 sp|Q5REG4|DTX3_PONAB 1135 1320 1.20e-51 50.794 189 ---------------------------- 16548
https://www.uniprot.org/uploadlists
The UniProt mapping website returns one result per unique ID, so duplicate IDs in the submitted list are collapsed into a single match.
%%bash
# Build the list of SwissProt accessions found in the BLAST table.
#
# Previously the whole BLAST row was written to the "ID list", so the unique
# count below counted unique rows rather than unique accessions. Only the
# accession column is kept now (still one line per BLAST row).

#######################################
# Print the accession of every data row in a BLAST table.
# Arguments: $1 - path to the tab-delimited BLAST table
# Outputs:   one accession (column 5) per data row, in file order
#######################################
extract_sp_ids() {
  # Data rows start on line 14; column 5 is the join key used downstream.
  tail -n +14 "$1" | cut -f 5
}

extract_sp_ids "${blast_tab}" > "${sp_id_list}"

head "${sp_id_list}"
echo ""
# Number of distinct accessions in the list.
sort -u "${sp_id_list}" | wc -l
21910-PGEN_.00g000010.m01 121 229 165 Q86IC9 sp|Q86IC9|CAMT1_DICDI 11 122 8.93e-14 35.652 115 21910-PGEN_.00g000020.m01 147 467 968 P04177 sp|P04177|TY3H_RAT 20 339 3.47e-127 55.140 321 21910-PGEN_.00g000050.m01 566 722 182 Q8L840 sp|Q8L840|RQL4A_ARATH 2 167 2.67e-14 35.119 168 21910-PGEN_.00g000080.m01 268 322 152 A1E2V0 sp|A1E2V0|BIRC3_CANLF 163 220 3.91e-10 53.448 58 21910-PGEN_.00g000090.m01 199 327 161 P34456 sp|P34456|YMD2_CAEEL 7 134 7.52e-12 26.357 129 21910-PGEN_.00g000210.m01 18 200 263 O00463 sp|O00463|TRAF5_HUMAN 5 191 2.24e-25 34.921 189 21910-PGEN_.00g000230.m01 48 155 287 Q00945 sp|Q00945|CONO_LYMST 31 134 1.59e-32 50.000 108 21910-PGEN_.00g000240.m01 4 605 1091 Q5SWK7 sp|Q5SWK7|RN145_MOUSE 13 601 2.65e-139 39.607 611 21910-PGEN_.00g000280.m01 4 153 210 Q8ZXT3 sp|Q8ZXT3|Y1111_PYRAE 853 1012 1.10e-17 38.750 160 21910-PGEN_.00g000300.m01 159 347 480 Q5REG4 sp|Q5REG4|DTX3_PONAB 1135 1320 1.20e-51 50.794 189 16548
The line count below should match the unique-line count
reported by the cell above.
%%bash
# Peek at the ID-to-GO mapping table, then count its rows excluding the
# first line of the file.
head --lines=10 "${sp_id_go_mapping_tab}"
printf '\n'
tail --lines=+2 "${sp_id_go_mapping_tab}" | wc -l
Entry yourlist:M20191022216DA2B77BFBD2E6699CA9B6D1C41EB24D59497 Gene ontology IDs Q86IC9 Q86IC9 GO:0042409; GO:0046872 P04177 P04177 GO:0001666; GO:0001963; GO:0001975; GO:0004497; GO:0004511; GO:0005634; GO:0005737; GO:0005739; GO:0005790; GO:0005829; GO:0006585; GO:0006631; GO:0006665; GO:0007507; GO:0007601; GO:0007605; GO:0007612; GO:0007613; GO:0007617; GO:0007626; GO:0008016; GO:0008021; GO:0008198; GO:0008199; GO:0009414; GO:0009416; GO:0009635; GO:0009651; GO:0009898; GO:0010038; GO:0010043; GO:0010259; GO:0014070; GO:0014823; GO:0015842; GO:0016137; GO:0016597; GO:0017085; GO:0018963; GO:0019825; GO:0019899; GO:0019904; GO:0021987; GO:0030424; GO:0030425; GO:0030659; GO:0031667; GO:0032355; GO:0032496; GO:0033076; GO:0033162; GO:0034617; GO:0035094; GO:0035176; GO:0035240; GO:0035690; GO:0035900; GO:0035902; GO:0042136; GO:0042214; GO:0042418; GO:0042421; GO:0042423; GO:0042462; GO:0042493; GO:0042745; GO:0042755; GO:0043005; GO:0043025; GO:0043195; GO:0043204; GO:0043434; GO:0045471; GO:0045472; GO:0046684; GO:0048545; GO:0048596; GO:0050890; GO:0051289; GO:0051412; GO:0051602; GO:0052314; GO:0070848; GO:0071287; GO:0071312; GO:0071316; GO:0071333; GO:0071363 Q8L840 Q8L840 GO:0000724; GO:0003676; GO:0005524; GO:0005634; GO:0005694; GO:0005737; GO:0006268; GO:0006281; GO:0006310; GO:0006974; GO:0009378; GO:0009506; GO:0032508; GO:0043138; GO:0046872; GO:0051276; GO:0070417; GO:0071215 A1E2V0 A1E2V0 GO:0005634; GO:0005737; GO:0031398; GO:0043027; GO:0043154; GO:0046872; GO:0060546; GO:0061630; GO:1990001 P34456 P34456 GO:0004748; GO:0005829; GO:0005971; GO:0009263 O00463 O00463 GO:0005164; GO:0005813; GO:0005829; GO:0006915; GO:0007165; GO:0008270; GO:0009898; GO:0031625; GO:0031996; GO:0032991; GO:0035631; GO:0042802; GO:0042981; GO:0043123; GO:0051091; GO:0051092 Q00945 Q00945 GO:0005185; GO:0005576 Q5SWK7 Q5SWK7 GO:0005783; GO:0005789; GO:0008270; GO:0012505; GO:0016021; GO:0061630 Q8ZXT3 Q8ZXT3 10157
%%bash
# Attach GO IDs to each BLAST row by joining on the accession.

# BLAST data rows (line 14 onward), sorted on tab-delimited column 5.
blast_rows_sorted() {
  tail -n +14 "${blast_tab}" | sort -t$'\t' -k5,5
}

# Mapping rows without the first line, reordered so column 2 comes first
# (it becomes the join key), then sorted on that key.
go_rows_sorted() {
  awk -F"\t" 'FNR>1 {print $2, $1, $3}' "${sp_id_go_mapping_tab}" | sort -k1,1
}

# Join field 5 of the BLAST rows against field 1 of the reordered mapping rows.
join -1 5 -2 1 <(blast_rows_sorted) <(go_rows_sorted) > "${blast_go_join}"

# Check out new file
head "${blast_go_join}"
printf '\n%s\n\n' "---------------------------------------------"

# Line count of new file (should match $sp_id_list line count)
wc -l "${blast_go_join}"
A0A044RE18 21910-PGEN_.00g298100.m01 195 626 542 sp|A0A044RE18|BLI_ONCVO 99 542 7.89e-57 32.972 461 A0A044RE18 GO:0004252; GO:0005576; GO:0005634; GO:0007635; GO:0031638; GO:0040002; GO:0045887; GO:0046872; GO:0090472; GO:1902075 A0A044RE18 21910-PGEN_.00g298510.m01 195 626 539 sp|A0A044RE18|BLI_ONCVO 135 578 2.66e-56 32.829 463 A0A044RE18 GO:0004252; GO:0005576; GO:0005634; GO:0007635; GO:0031638; GO:0040002; GO:0045887; GO:0046872; GO:0090472; GO:1902075 A0A0A7DNP6 21910-PGEN_.00g047160.m01 5 94 284 sp|A0A0A7DNP6|GRHLP_RUDPH 9 96 1.66e-33 56.667 90 A0A0A7DNP6 GO:0005576; GO:0007218 A0A0B5A7M7 21910-PGEN_.00g059020.m01 26 149 135 sp|A0A0B5A7M7|INS1_CONIM 59 178 5.90e-09 29.927 137 A0A0B5A7M7 GO:0005179; GO:0005576; GO:0006006; GO:0090729 A0A0B5A8P8 21910-PGEN_.00g272560.m01 33 140 164 sp|A0A0B5A8P8|INS2_CONIM 27 140 7.44e-14 31.707 123 A0A0B5A8P8 GO:0005179; GO:0005576; GO:0006006; GO:0090729 A0A0E0RTV6 21910-PGEN_.00g274950.m01 119 292 258 sp|A0A0E0RTV6|ZEB1_GIBZE 126 311 1.75e-22 38.298 188 A0A0E0RTV6 GO:0016491; GO:0071949 A0A0F7YYX3 21910-PGEN_.00g031120.m01 12 193 197 sp|A0A0F7YYX3|CPROH_CONVC 28 213 1.19e-17 31.720 186 A0A0F7YYX3 GO:0005576 A0A0F7YZI5 21910-PGEN_.00g015590.m01 22 135 327 sp|A0A0F7YZI5|CTHB5_CONVC 56 168 2.42e-38 54.783 115 A0A0F7YZI5 GO:0005179; GO:0005576; GO:0090729 A0A0F7Z3J2 21910-PGEN_.00g015600.m01 45 143 368 sp|A0A0F7Z3J2|CTHA2_CONVC 72 170 2.24e-44 68.687 99 A0A0F7Z3J2 GO:0005179; GO:0005576; GO:0090729 A0A0G2K344 21910-PGEN_.00g320520.m01 5 1063 3021 sp|A0A0G2K344|PK3CA_RAT 3 1049 0.0 54.229 1064 A0A0G2K344 GO:0001525; GO:0004674; GO:0005524; GO:0005737; GO:0005829; GO:0005886; GO:0005942; GO:0005943; GO:0006909; GO:0010629; GO:0014065; GO:0014870; GO:0016020; GO:0016303; GO:0035005; GO:0036092; GO:0043201; GO:0046854; GO:0046934; GO:0048015; GO:0048661; GO:0052812; GO:0071548; GO:1903544 --------------------------------------------- 16548 20191022_Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional_sp_id-join-go_id.tab