#!/usr/bin/env python
# coding: utf-8

# ### Named-entity recognition (NER) tagging with a pre-trained BERT-based model
# #### The model was trained on the CoNLL-2003 dataset

# In[1]:


from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Single source of truth for the checkpoint id, so the tokenizer and the model
# are always loaded from the same pre-trained checkpoint.
MODEL_NAME = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)


# In[2]:


# Sample passage to tag. Adjacent string literals are concatenated by the
# parser, so this is one single-line string with no embedded newlines.
example = (
    "The Kashmir Files is a 2022 Indian Hindi-language drama film, "
    "written and directed by Vivek Agnihotri. Produced by Zee Studios, "
    "the film is based on the exodus of Kashmiri Pandits during the Kashmir Insurgency, "
    "which it portrays as a genocide. "
    "It stars Anupam Kher, Darshan Kumar, Pallavi Joshi and Mithun Chakraborty."
)

# "ner" pipeline built from the model and tokenizer loaded earlier in the file.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Run the pipeline on the passage; the results are consumed below as a
# sequence of dicts exposing at least the 'entity' and 'word' keys.
ner_results = nlp(example)


# #### Result

# In[3]:


# Print one line per result: the entity tag left-aligned in a 10-character
# column, followed by the token text.
for items in ner_results:
    label, token = items['entity'], items['word']
    print(f"{label:<10} {token}")


# #### Cleaner output: merge sub-word pieces back into whole words

# In[5]:


# Merge sub-word pieces back into whole words and print "<word> <entity>".
#
# BERT's WordPiece tokenizer marks a piece that continues the previous word
# with a leading "##". Only that exact two-character prefix is treated as a
# continuation marker, and only the prefix is stripped, so a literal "#" in the
# text is neither glued to the preceding word nor deleted from inside a piece.
# The printed entity tag is the one attached to the first piece of each word.
chunklst = []
index = 0
while index < len(ner_results):
    word = ner_results[index]['word']
    entity = ner_results[index]['entity']
    chunklst = [word]

    # Absorb every directly following continuation piece into the current word.
    while index + 1 < len(ner_results):
        word = ner_results[index + 1]['word']
        if not word.startswith('##'):
            break
        chunklst.append(word[2:])  # drop only the leading "##" marker
        index += 1

    print(''.join(chunklst), entity)
    index += 1