Notebook

NER tagging with a pre-trained BERT-based model¶

The model was trained on the CoNLL-2003 dataset¶

In [1]:

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Single checkpoint identifier shared by the tokenizer and the model so the
# two are always loaded from the same place.
MODEL_NAME = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [2]:

# Build the NER pipeline from the `model` and `tokenizer` objects created in
# the loading cell.
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)

# Sample passage to tag. Adjacent string literals are concatenated by Python
# into one single-line string (each fragment keeps its trailing space).
example = (
    "The Kashmir Files is a 2022 Indian Hindi-language drama film, "
    "written and directed by Vivek Agnihotri. Produced by Zee Studios, "
    "the film is based on the exodus of Kashmiri Pandits during the Kashmir Insurgency, "
    "which it portrays as a genocide. "
    "It stars Anupam Kher, Darshan Kumar, Pallavi Joshi and Mithun Chakraborty."
)

# Run the tagger; the cells below read the 'entity' and 'word' keys of each
# returned item.
ner_results = nlp(example)

Result¶

In [3]:

# Show the raw tagger output, one row per returned item: the label padded to a
# 10-character left-aligned column, followed by the token text.
for token in ner_results:
    label, piece = token['entity'], token['word']
    print(f"{label:<10} {piece}")

B-MISC     Kashmir
I-MISC     Files
B-MISC     Indian
B-MISC     Hindi
B-PER      V
B-PER      ##ive
B-PER      ##k
I-PER      A
I-PER      ##gni
I-PER      ##hot
I-PER      ##ri
B-ORG      Z
I-ORG      ##ee
I-ORG      Studios
B-MISC     Kashmir
I-MISC     ##i
I-MISC     Pan
I-MISC     ##dit
B-MISC     Kashmir
I-MISC     In
I-MISC     ##su
I-MISC     ##cy
B-PER      An
B-PER      ##up
B-PER      ##am
I-PER      K
I-PER      ##her
B-PER      Dar
B-PER      ##shan
I-PER      Kumar
B-PER      Pa
B-PER      ##lla
B-PER      ##vi
I-PER      Josh
I-PER      ##i
B-PER      Mi
B-PER      ##th
B-PER      ##un
I-PER      Cha
I-PER      ##kra
I-PER      ##bor
I-PER      ##ty

Cleaner output: word pieces merged back into whole words¶

In [5]:

def merge_wordpieces(tokens, prefix="##"):
    """Join word-piece continuation tokens back onto the word they belong to.

    Parameters
    ----------
    tokens : sequence of dict
        Tagger output items; each must provide a 'word' and an 'entity' key.
    prefix : str, optional
        Marker that identifies a continuation piece (default "##").

    Returns
    -------
    list of (str, str)
        One ``(word, entity)`` pair per reconstructed word. The entity is the
        label of the word's first piece.

    Notes
    -----
    Only pieces present in ``tokens`` can be joined: if the tagger omitted a
    piece of a word, the reconstructed word will lack that piece.
    """
    words = []  # each entry: ([piece, piece, ...], entity of the first piece)
    for token in tokens:
        piece = token['word']
        if words and piece.startswith(prefix):
            # Continuation: strip only the leading marker. A blanket
            # replace("#", "") would also delete '#' characters that are
            # genuinely part of the text.
            words[-1][0].append(piece[len(prefix):])
        else:
            # Start of a new word. A leading continuation piece with nothing
            # before it is kept verbatim, as there is no word to attach it to.
            words.append(([piece], token['entity']))
    return [(''.join(parts), entity) for parts, entity in words]


# Print each reconstructed word followed by the label of its first piece.
for word, entity in merge_wordpieces(ner_results):
    print(word, entity)

Kashmir B-MISC
Files I-MISC
Indian B-MISC
Hindi B-MISC
Vivek B-PER
Agnihotri I-PER
Zee B-ORG
Studios I-ORG
Kashmiri B-MISC
Pandit I-MISC
Kashmir B-MISC
Insucy I-MISC
Anupam B-PER
Kher I-PER
Darshan B-PER
Kumar I-PER
Pallavi B-PER
Joshi I-PER
Mithun B-PER
Chakraborty I-PER