#!/usr/bin/env python
# coding: utf-8

# Notebook export: build a test instruction set for place names in the BHSA
# corpus and then run the Text-Fabric NER tooling on it.
#
# NOTE: `get_ipython()` is injected by IPython/Jupyter; this script is meant
# to be run inside such a session, not with a plain `python` interpreter.

# In[1]:

get_ipython().run_line_magic("load_ext", "autoreload")
get_ipython().run_line_magic("autoreload", "2")


# In[2]:

from tf.app import use


# In[3]:

A = use("ETCBC/bhsa:clone", checkout="clone")


# Before we start, we make a test instruction set.
#
# We pick up all words whose lexeme starts with `BJT_` and that have multiple
# occurrence forms.
# We collect the occurrence forms and use them to populate a spreadsheet with
# instructions.
#
# See the file `ner/sheets/places.xlsx`

# In[4]:

F = A.api.F


# In[5]:

# Map each lexeme to the set of distinct occurrence forms found in the corpus,
# once keyed/valued by the plain features (`lex`, `g_cons`) and once by their
# `_utf8` counterparts.
candidates = {}
candidates_utf8 = {}

for w in F.otype.s("word"):
    lex = F.lex.v(w)

    if not lex.startswith("BJT_"):
        continue

    lex_utf8 = F.lex_utf8.v(w)
    candidates.setdefault(lex, set()).add(F.g_cons.v(w))
    candidates_utf8.setdefault(lex_utf8, set()).add(F.g_cons_utf8.v(w))

# Keep only the lexemes that occur in more than one form.
multiples = {lex: shapes for (lex, shapes) in candidates.items() if len(shapes) > 1}
multiples_utf8 = {
    lex: shapes for (lex, shapes) in candidates_utf8.items() if len(shapes) > 1
}


def show(d):
    """Print each key of *d* followed by a tab-indented line of its values.

    Parameters
    ----------
    d: dict
        Maps a string key to an iterable (here: a set) of strings.

    Keys are printed in sorted order. The values of each key are printed on
    one line, separated by `` ; ``, with spaces inside a value replaced by
    underscores. The values are sorted as well: they come from sets, whose
    iteration order for strings can differ between interpreter runs, and the
    listing should be reproducible.
    """
    for k, vs in sorted(d.items()):
        print(k)
        print("\t" + " ; ".join(v.replace(" ", "_") for v in sorted(vs)))


show(multiples)
show(multiples_utf8)


# Now we start with the entity assignment.

# In[6]:

NE = A.makeNer()


# In[8]:

NE.readInstructions("places", force=True)


# In[9]:

NE.makeInventory()
NE.showInventory()


# In[10]:

NE.setSet("power")


# In[11]:

# NOTE(review): the set "power" is selected above and reset right away here,
# presumably to start from an empty set -- confirm against the NER docs.
NE.resetSet()


# In[12]:

NE.markEntities()


# In[14]:

results = NE.filterContent(anyEnt=True, showStats=None)


# In[15]:

NE.showContent(results)


# In[ ]:


# In[ ]: