%%bash
# List the working directory in long format with human-readable sizes.
ls -l -h
total 99G -rw-rw-r-- 1 sam sam 1.3K Apr 24 12:29 20190424_swoose_uniprot_go_goslim.ipynb -rw-rw-r-- 1 sam sam 99G Apr 23 11:19 goa_uniprot_all.gaf -rw-rw-r-- 1 sam sam 129K Apr 18 06:33 goslim_generic.obo
%%bash
# Preview the first 25 lines of the annotation file to inspect its layout.
head --lines=25 goa_uniprot_all.gaf
!gaf-version: 2.1 ! !This file contains all GO annotations and gene product information for proteins in the UniProt KnowledgeBase (UniProtKB), !ComplexPortal protein complexes, and RNAcentral identifiers. ! !Generated: 2019-04-08 11:50 !GO-version: http://purl.obolibrary.org/obo/go/releases/2019-03-29/extensions/go-plus.owl ! UniProtKB A0A000 moeA5 GO:0003824 GO_REF:0000002 IEA InterPro:IPR015421|InterPro:IPR015422 F MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A000 moeA5 GO:0003870 GO_REF:0000002 IEA InterPro:IPR010961 F MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A000 moeA5 GO:0009058 GO_REF:0000002 IEA InterPro:IPR004839 P MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A000 moeA5 GO:0030170 GO_REF:0000002 IEA InterPro:IPR004839|InterPro:IPR010961 F MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A000 moeA5 GO:0033014 GO_REF:0000002 IEA InterPro:IPR010961 P MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0000166 GO_REF:0000038 IEA UniProtKB-KW:KW-0547 F MoeD5 moeD5 protein taxon:67581 20190406 UniProt UniProtKB A0A001 moeD5 GO:0000166 GO_REF:0000104 IEA UniRule:UR000400038 F MoeD5 moeD5 protein taxon:67581 20190406 UniProt UniProtKB A0A001 moeD5 GO:0005524 GO_REF:0000002 IEA InterPro:IPR003439|InterPro:IPR011527|InterPro:IPR017871|InterPro:IPR036640 F MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0005524 GO_REF:0000038 IEA UniProtKB-KW:KW-0067 F MoeD5 moeD5 protein taxon:67581 20190406 UniProt UniProtKB A0A001 moeD5 GO:0005524 GO_REF:0000104 IEA UniRule:UR000400038 F MoeD5 moeD5 protein taxon:67581 20190406 UniProt UniProtKB A0A001 moeD5 GO:0016020 GO_REF:0000038 IEA UniProtKB-KW:KW-0472 C MoeD5 moeD5 protein taxon:67581 20190406 UniProt UniProtKB A0A001 moeD5 GO:0016021 GO_REF:0000002 IEA InterPro:IPR011527|InterPro:IPR036640 C MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0016021 GO_REF:0000038 IEA 
UniProtKB-KW:KW-0812 C MoeD5 moeD5 protein taxon:67581 20190406 UniProt UniProtKB A0A001 moeD5 GO:0016887 GO_REF:0000002 IEA InterPro:IPR003439|InterPro:IPR017871 F MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0042626 GO_REF:0000002 IEA InterPro:IPR011527 F MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0055085 GO_REF:0000002 IEA InterPro:IPR011527 P MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A002 A0A002 GO:0000166 GO_REF:0000038 IEA UniProtKB-KW:KW-0547 F MoeJ5 protein taxon:67581 20190406 UniProt
%%bash
# Preview the first 25 lines of the GOslim ontology file to inspect its layout.
head --lines=25 goslim_generic.obo
format-version: 1.2 subsetdef: gocheck_do_not_annotate "Term not to be used for direct annotation" subsetdef: gocheck_do_not_manually_annotate "Term not to be used for direct manual annotation" subsetdef: goslim_agr "AGR slim" subsetdef: goslim_aspergillus "Aspergillus GO slim" subsetdef: goslim_candida "Candida GO slim" subsetdef: goslim_chembl "ChEMBL protein targets summary" subsetdef: goslim_flybase_ribbon "FlyBase Drosophila GO ribbon slim" subsetdef: goslim_generic "Generic GO slim" subsetdef: goslim_metagenomics "Metagenomics GO slim" subsetdef: goslim_mouse "Mouse GO slim" subsetdef: goslim_pir "PIR GO slim" subsetdef: goslim_plant "Plant GO slim" subsetdef: goslim_pombe "Fission yeast GO slim" subsetdef: goslim_synapse "synapse GO slim" subsetdef: goslim_yeast "Yeast GO slim" synonymtypedef: syngo_official_label "label approved by the SynGO project" synonymtypedef: systematic_synonym "Systematic synonym" EXACT ontology: go/subsets/goslim_generic [Term] id: GO:0000003 name: reproduction namespace: biological_process alt_id: GO:0019952
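Biological process annotations are represented by P
in the "Aspect" column (column 9 of the tab-delimited GAF 2.1 format; it appears as the 8th whitespace-separated field in the records shown above only because their Qualifier column is empty)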
Line count:
%%bash
# Count the lines in the full annotation file and report how long the pass takes.
time wc --lines goa_uniprot_all.gaf
612663769 goa_uniprot_all.gaf
real 15m51.024s user 0m27.220s sys 0m51.784s
Use awk to pull out desired records.
But first, verify awk command does what we want...
%%bash
# Dry run on the first 50 lines: print only records whose Aspect is "P".
# GAF is tab-delimited and columns may be empty (Qualifier, With/From) or
# contain spaces (DB Object Name), so counting whitespace-separated words does
# not give a stable column position. Split on tabs and test column 9 (Aspect).
# Header lines (starting with "!") have no 9th column and are dropped.
head -n 50 goa_uniprot_all.gaf | awk -F'\t' '$9 == "P"'
UniProtKB A0A000 moeA5 GO:0009058 GO_REF:0000002 IEA InterPro:IPR004839 P MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A000 moeA5 GO:0033014 GO_REF:0000002 IEA InterPro:IPR010961 P MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0055085 GO_REF:0000002 IEA InterPro:IPR011527 P MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A002 A0A002 GO:0055085 GO_REF:0000002 IEA InterPro:IPR011527 P MoeJ5 protein taxon:67581 20190406 InterPro UniProtKB A0A004 moeF5 GO:0006529 GO_REF:0000002 IEA InterPro:IPR001962|InterPro:IPR006426 P MoeF5 moeF5 protein taxon:67581 20190406 InterPro UniProtKB A0A004 moeF5 GO:0006529 GO_REF:0000038 IEA UniProtKB-KW:KW-0061 P MoeF5 moeF5 protein taxon:67581 20190406 UniProt UniProtKB A0A004 moeF5 GO:0006541 GO_REF:0000038 IEA UniProtKB-KW:KW-0315 P MoeF5 moeF5 protein taxon:67581 20190406 UniProt UniProtKB A0A004 moeF5 GO:0008652 GO_REF:0000038 IEA UniProtKB-KW:KW-0028 P MoeF5 moeF5 protein taxon:67581 20190406 UniProt UniProtKB A0A009 moeM5 GO:0009058 GO_REF:0000002 IEA InterPro:IPR003696 P MoeM5 moeM5 protein taxon:67581 20190406 InterPro
Now, let's put all of them in a new file
%%bash
# Write every annotation whose Aspect is "P" to goa_uniprot_P.gaf (timed).
# GAF is tab-delimited and columns may be empty (Qualifier, With/From) or
# contain spaces (DB Object Name). Splitting on whitespace shifts the Aspect
# value to a different field number for such records, so they would be missed.
# Split on tabs and test column 9 (Aspect) so every record is checked in the
# same column. Header lines (starting with "!") have no 9th column and are
# not copied to the output.
time \
awk -F'\t' '$9 == "P"' goa_uniprot_all.gaf \
> goa_uniprot_P.gaf
real 22m15.433s user 8m9.976s sys 1m37.932s
Count biological process entries in new file
Line count:
%%bash
# Count the records in the Aspect "P" subset file and time the pass.
time wc --lines goa_uniprot_P.gaf
181626721 goa_uniprot_P.gaf
real 4m17.147s user 0m7.680s sys 0m13.904s
Counts to confirm that parsing strategy will work.
If it works correctly, both grep
commands should yield the same number.
%%bash
# Sanity check for the OBO parsing strategy: count the [Term] stanza headers
# and the "id: GO:" lines in the GOslim file. grep -c prints the number of
# matching lines directly, so no separate wc process is needed.
echo "GOslim Term counts"
# Count lines beginning (^) with [Term].
# The brackets are backslash-escaped so grep matches them literally instead of
# reading them as a bracket expression.
grep -c "^\[Term\]" goslim_generic.obo
echo "----------------------------------"
echo "----------------------------------"
# Count lines beginning (^) with "id: GO:".
echo "GOslim ID counts"
grep -c "^id: GO:" goslim_generic.obo
GOslim Term counts 147 ---------------------------------- ---------------------------------- GOslim ID counts 147
It worked, so let's create a list of GOslim IDs
%%bash
# Build the list of GOslim IDs: for every line starting with "id: GO:",
# keep the second whitespace-separated field (the GO ID itself).
awk '/^id: GO:/ { print $2 }' goslim_generic.obo > goslim_ID_list.txt

# Report how many IDs were written and show the first few.
wc --lines goslim_ID_list.txt
head goslim_ID_list.txt
147 goslim_ID_list.txt GO:0000003 GO:0000228 GO:0000229 GO:0000278 GO:0000902 GO:0002376 GO:0003013 GO:0003674 GO:0003677 GO:0003700
%%bash
# Keep only the annotations whose GO ID (GAF column 5) is in the GOslim ID list.
# The first file named on the command line (the ID list) is loaded into a
# lookup table; each record of the annotation file is then printed only when
# its tab-delimited column 5 is exactly one of those IDs. Matching on the
# column, rather than searching the whole line, avoids false hits from GO IDs
# that appear in other columns such as With/From or Annotation Extension.
time \
awk -F'\t' '
  FILENAME == ARGV[1] { goslim[$1]; next }
  $5 in goslim
' goslim_ID_list.txt goa_uniprot_P.gaf \
> uniprot_goslim_P.txt

# Report how many annotations were kept and show the first few.
wc -l uniprot_goslim_P.txt
head uniprot_goslim_P.txt
28391971 uniprot_goslim_P.txt UniProtKB A0A000 moeA5 GO:0009058 GO_REF:0000002 IEA InterPro:IPR004839 P MoeA5 moeA5 protein taxon:67581 20190406 InterPro UniProtKB A0A001 moeD5 GO:0055085 GO_REF:0000002 IEA InterPro:IPR011527 P MoeD5 moeD5 protein taxon:67581 20190406 InterPro UniProtKB A0A002 A0A002 GO:0055085 GO_REF:0000002 IEA InterPro:IPR011527 P MoeJ5 protein taxon:67581 20190406 InterPro UniProtKB A0A009 moeM5 GO:0009058 GO_REF:0000002 IEA InterPro:IPR003696 P MoeM5 moeM5 protein taxon:67581 20190406 InterPro UniProtKB A0A009DWE1 J504_3685 GO:0055085 GO_REF:0000002 IEA InterPro:IPR001036 P AcrB/AcrD/AcrF family protein J504_3685 protein taxon:1310605 20190406 InterPro UniProtKB A0A009DWJ5 J504_3662 GO:0032196 GO_REF:0000002 IEA InterPro:IPR038965 P Putative transposase J504_3662 protein taxon:1310605 20190406 InterPro UniProtKB A0A009DWL0 J504_3657 GO:0032196 GO_REF:0000002 IEA InterPro:IPR038965 P Putative iSRSO8-transposase orfB protein J504_3657 protein taxon:1310605 20190406 InterPro UniProtKB A0A009E3I5 J504_3523 GO:0032196 GO_REF:0000002 IEA InterPro:IPR038965 P Integrase core domain protein J504_3523 protein taxon:1310605 20190406 InterPro UniProtKB A0A009E5V4 J504_3420 GO:0032196 GO_REF:0000002 IEA InterPro:IPR038965 P Integrase core domain protein J504_3420|J504_3472 protein taxon:1310605 20190406 InterPro UniProtKB A0A009E6I2 J504_3410 GO:0032196 GO_REF:0000002 IEA InterPro:IPR038965 P Integrase core domain protein J504_3410 protein taxon:1310605 20190406 InterPro
real 1m41.021s user 1m27.764s sys 0m10.892s
# Show the size of the filtered annotation file in human-readable units.
ls -l -h uniprot_goslim_P.txt
-rw-rw-r-- 1 sam sam 4.7G Apr 24 14:04 uniprot_goslim_P.txt
%%bash
# Copy the filtered annotation file to the "gannet" host in archive mode,
# listing each transferred file.
rsync --archive --verbose uniprot_goslim_P.txt \
  gannet:/volume2/web/Atumefaciens/20190424_uniprot_go_goslim_P
sending incremental file list uniprot_goslim_P.txt sent 4,944,356,869 bytes received 34 bytes 106,330,255.98 bytes/sec total size is 4,943,149,944 speedup is 1.00