#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Notebook-only setup: load IPython's `autoreload` extension and switch it to
# mode 2, so that edited modules are re-imported before each cell executes.
for (magic_name, magic_arg) in (("load_ext", "autoreload"), ("autoreload", "2")):
    get_ipython().run_line_magic(magic_name, magic_arg)


# In[2]:


from tf.app import use


# In[3]:


# Load the BHSA corpus through Text-Fabric; `A` is the app handle used below
# for its `api` attribute and for `makeNer()`.
# NOTE(review): the `:clone` suffix and `checkout="clone"` presumably select a
# local clone of the data/app instead of a downloaded release — confirm against
# the `tf.app.use` documentation.
A = use("ETCBC/bhsa:clone", checkout="clone")


# Before we start, we make a test instruction set.
# 
# We pick all words whose lexeme starts with `BJT_` and that occur in more than one form.
# We collect these occurrence forms and use them to populate a spreadsheet with instructions.
# 
# See the file `ner/sheets/places.xlsx`

# In[4]:


# Shorthand for the feature API of the loaded corpus; used below as
# `F.<feature>.v(node)` to read a feature value and `F.otype.s("word")` to
# iterate over the word nodes.
F = A.api.F


# In[5]:


# For every word whose lexeme starts with "BJT_", gather the distinct
# occurrence forms per lexeme: once from the `lex` / `g_cons` features and
# once from their `_utf8` counterparts.
candidates = {}
candidates_utf8 = {}

for w in F.otype.s("word"):
    lex = F.lex.v(w)
    if lex.startswith("BJT_"):
        lex_utf8 = F.lex_utf8.v(w)

        plain_forms = candidates.setdefault(lex, set())
        plain_forms.add(F.g_cons.v(w))

        utf8_forms = candidates_utf8.setdefault(lex_utf8, set())
        utf8_forms.add(F.g_cons_utf8.v(w))

# Keep only the lexemes that were seen with more than one distinct form.
multiples = dict(entry for entry in candidates.items() if len(entry[1]) > 1)
multiples_utf8 = dict(
    entry for entry in candidates_utf8.items() if len(entry[1]) > 1
)

def show(d):
    """Print a mapping of keys to collections of strings, one entry per key.

    For each key, in sorted key order, the key is printed on its own line,
    followed by a tab-indented line with the associated strings joined by
    " ; ". Spaces inside a string are shown as underscores so that they remain
    visible in the output.

    The strings of each entry are sorted before printing: the values are
    typically sets, whose iteration order is not stable between interpreter
    runs, and sorting keeps the output reproducible.

    Parameters
    ----------
    d: dict
        Maps each key to an iterable of strings.
    """
    for k in sorted(d):
        shown = sorted(v.replace(" ", "_") for v in d[k])
        print(k)
        print("\t" + " ; ".join(shown))
        
# Print the multi-form lexemes: first the table built from `lex` / `g_cons`,
# then the one built from the `_utf8` features.
for form_table in (multiples, multiples_utf8):
    show(form_table)


# Now we start with the entity assignment.

# In[6]:


# Create the NER helper object from the loaded corpus app; all following
# steps are method calls on this object and depend on being run in this order.
NE = A.makeNer()


# In[8]:


# Read the instructions named "places" (presumably the spreadsheet
# `ner/sheets/places.xlsx` mentioned above).
# NOTE(review): `force=True` looks like it forces a fresh read instead of
# reusing earlier results — confirm against the Text-Fabric NER docs.
NE.readInstructions("places", force=True)


# In[9]:


# Build the inventory and display it.
# NOTE(review): presumably the inventory lists where the instructed
# occurrence forms are found in the corpus — confirm.
NE.makeInventory()
NE.showInventory()


# In[10]:


# Switch to the annotation set named "power".
NE.setSet("power")


# In[11]:


# Reset the current set before marking.
# NOTE(review): presumably this clears entities left in the set by earlier
# runs — confirm.
NE.resetSet()


# In[12]:


# Mark the entities.
# NOTE(review): presumably based on the inventory built above and stored in
# the current set — confirm.
NE.markEntities()


# In[14]:


# Select content with `anyEnt=True`; the result is kept for display below.
# NOTE(review): presumably this keeps only content that contains at least one
# entity, and `showStats=None` suppresses the statistics output — confirm.
results = NE.filterContent(anyEnt=True, showStats=None)


# In[15]:


# Display the selected content.
NE.showContent(results)


# In[ ]:


# In[ ]: