#!/usr/bin/env python
# coding: utf-8

# ## Summarizing annotations to a term and descendants
#
# This notebook demonstrates summarizing annotation counts for a term and its descendants.
#
# An example use of this is a GO annotator exploring refactoring a subtree in GO
#
# Of course, if this were a regular thing we would make a command line or even web interface,
# but keeping as a notebook gives us some flexibility in logic, and anyway is intended largely
# as a demonstration

# ### boilerplate
#
# * importing relevant ontobio libraries
# * set up key objects

# In[1]:

import pandas as pd

## GOLR queries
from ontobio.golr.golr_query import GolrAssociationQuery
## rendering ontologies
from ontobio import GraphRenderer
## Create an ontology factory in order to fetch GO
from ontobio.ontol_factory import OntologyFactory

ofactory = OntologyFactory()

# In[2]:

## Load GO. Note the first time this runs Jupyter will show '*' - be patient
ont = ofactory.create("go")

# ### Finding descendants
#
# Here we are using the in-memory ontology object, no external service calls are executed
#
# Change the value of `term_id` to what you like

# In[3]:

term_id = "GO:0009070"  ## serine family amino acid biosynthetic process
# reflexive=True is passed so that `term_id` itself is part of the result
# (NOTE(review): confirm against the ontobio `descendants` documentation).
descendants = ont.descendants(term_id,
                              reflexive=True,
                              relations=['subClassOf', 'BFO:0000050'])

# In[4]:

# Bare expression: shows the value when run as a notebook cell; it has no
# effect when this file is executed as a plain script.
descendants

# ### rendering subtrees
#
# We use the good-old-fashioned Tree renderer
#
# (this doesn't scale well for latticey-subontologies)

# In[5]:

renderer = GraphRenderer.create('tree')

# In[6]:

print(renderer.render_subgraph(ont, nodes=descendants))

# ### summarizing annotations
#
# We write a short procedure to wrap calling Golr and returning a summary dict
#
# The dict is keyed by taxon label.
# We also include an entry for `ALL`
#
# The code below relies on `pd`, `GolrAssociationQuery`, `ont`, `term_id` and
# `descendants`, which the setup cells earlier in this file import / create.

# In[58]:

DEFAULT_FACET_FIELDS = ['taxon_subset_closure_label', 'evidence_label', 'assigned_by']


def summarize(t: str,
              evidence_closure='ECO:0000269',  ## restrict to experimental
              facet_fields=None) -> dict:
    """
    Summarize the annotations to a single term.

    Runs a zero-row ``GolrAssociationQuery`` for ``t`` (only counts and facets
    are requested) and flattens the facet counts into one dict.

    Args:
        t: ID of the term to query, passed as ``object`` to the query.
        evidence_closure: Kept for interface compatibility. It is currently
            NOT used by the query: the filter is hard-coded to the
            ``'experimental evidence'`` label below.
        facet_fields: Facet field names to request; defaults to
            ``DEFAULT_FACET_FIELDS`` when ``None``.

    Returns:
        A dict whose first entry is ``'ALL'`` (the ``numFound`` total),
        followed by the entries of every requested facet that is present in
        the result's ``facet_counts``. If two facets share a key, the later
        facet in ``facet_fields`` wins.
    """
    if facet_fields is None:
        facet_fields = DEFAULT_FACET_FIELDS
    q = GolrAssociationQuery(object=t,
                             rows=0,
                             object_category='function',
                             fq={'evidence_closure_label': 'experimental evidence'},
                             facet_fields=facet_fields)
    # Debugging aid: print(q.solr_params()) shows the parameters that are sent.
    result = q.exec()
    facet_counts = result['facet_counts']
    item = {'ALL': result['numFound']}  ## make sure this is the first entry
    for field in facet_fields:
        if field in facet_counts:
            item.update(facet_counts[field])
    return item


# In[59]:

print(summarize(term_id))


# In[63]:

def summarize_set(ids, facet_fields=None) -> pd.DataFrame:
    """
    Summarize a set of annotations, return a dataframe.

    Args:
        ids: Iterable of term IDs; ``summarize`` is called once per ID.
        facet_fields: Forwarded unchanged to ``summarize``.

    Returns:
        A DataFrame with one row per ID, containing ``'id'``, ``'name:'`` and
        every key returned by ``summarize``. Missing counts are filled with 0
        and rows are sorted by ``'ALL'`` in descending order. An empty ``ids``
        yields an empty DataFrame.
    """
    items = []
    for term in ids:
        # NOTE(review): the trailing colon in 'name:' looks like a typo, but it
        # is kept so the existing column name does not change.
        item = {'id': term, 'name:': ont.label(term)}
        item.update(summarize(term, facet_fields=facet_fields))
        items.append(item)
    df = pd.DataFrame(items).fillna(0)
    if df.empty:
        # No rows means no 'ALL' column, so sorting by it would raise KeyError.
        return df
    # sort using total number
    return df.sort_values('ALL', axis=0, ascending=False)


# ## Summarize GO term and descendants
#
# More advanced visualizations are easy with plotly etc. We leave as an exercise to the reader...
#
# As an example, for the first query we bundle all facets (species, evidence, assigned by) together

# In[64]:

pd.options.display.max_columns = None
df = summarize_set(descendants)
# Bare expression: shows the frame when run as a notebook cell.
df

# ## Summary by assigned by
#
#

# In[67]:

summarize_set(descendants, facet_fields=['assigned_by'])

# ### Summarize by species
#
# use `taxon_subset_closure_label` facet

# In[69]:

summarize_set(descendants, facet_fields=['taxon_subset_closure_label'])

# In[ ]: