from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# Load the tokenizer and the token-classification model from the same
# checkpoint, then wrap both in a "ner" pipeline.
CHECKPOINT = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(CHECKPOINT)
nlp = pipeline("ner", tokenizer=tokenizer, model=model)

# Sample passage; the trailing backslashes join the physical lines so the
# resulting string contains no newline characters.
example = """The Kashmir Files is a 2022 Indian Hindi-language drama film, \
written and directed by Vivek Agnihotri. Produced by Zee Studios, \
the film is based on the exodus of Kashmiri Pandits during the Kashmir Insurgency, \
which it portrays as a genocide. \
It stars Anupam Kher, Darshan Kumar, Pallavi Joshi and Mithun Chakraborty."""

# Run the pipeline and print one line per returned token: the entity label
# left-aligned in a 10-character column, followed by the token text.
ner_results = nlp(example)
for token in ner_results:
    label, piece = token["entity"], token["word"]
    print(f"{label:<10} {piece}")
# Output of the loop above (one "<entity> <word>" pair per printed line):
# B-MISC     Kashmir
# I-MISC     Files
# B-MISC     Indian
# B-MISC     Hindi
# B-PER      V
# B-PER      ##ive
# B-PER      ##k
# I-PER      A
# I-PER      ##gni
# I-PER      ##hot
# I-PER      ##ri
# B-ORG      Z
# I-ORG      ##ee
# I-ORG      Studios
# B-MISC     Kashmir
# I-MISC     ##i
# I-MISC     Pan
# I-MISC     ##dit
# B-MISC     Kashmir
# I-MISC     In
# I-MISC     ##su
# I-MISC     ##cy
# B-PER      An
# B-PER      ##up
# B-PER      ##am
# I-PER      K
# I-PER      ##her
# B-PER      Dar
# B-PER      ##shan
# I-PER      Kumar
# B-PER      Pa
# B-PER      ##lla
# B-PER      ##vi
# I-PER      Josh
# I-PER      ##i
# B-PER      Mi
# B-PER      ##th
# B-PER      ##un
# I-PER      Cha
# I-PER      ##kra
# I-PER      ##bor
# I-PER      ##ty
def _merge_wordpieces(tokens, text=None, prefix="##"):
    """Collapse WordPiece fragments into whole words.

    A token whose text starts with ``prefix`` is treated as a continuation
    of the token before it. Only that exact leading marker is stripped, so
    a literal ``#`` token or a ``#`` inside a piece is left alone.

    Args:
        tokens: Iterable of per-token dicts. Each must provide ``"word"``
            and ``"entity"``; ``"start"``/``"end"`` are used when present.
        text: The string the tokens were extracted from. When it is given
            and a word has both offsets, the word is sliced out of ``text``
            instead of being rebuilt from its pieces.
        prefix: Marker that flags a continuation piece.

    Returns:
        A list of ``(word, entity)`` tuples in input order, where
        ``entity`` is the label of the word's first piece.
    """
    spans = []
    for token in tokens:
        piece = token["word"]
        # A continuation with nothing before it cannot be attached to
        # anything, so it starts its own word (text kept verbatim).
        if spans and piece.startswith(prefix):
            current = spans[-1]
            current["pieces"].append(piece[len(prefix):])
            current["end"] = token.get("end")
            continue
        spans.append({
            "pieces": [piece],
            "entity": token["entity"],
            "start": token.get("start"),
            "end": token.get("end"),
        })

    merged = []
    for span in spans:
        start, end = span["start"], span["end"]
        if text is not None and start is not None and end is not None:
            # Joining pieces loses any fragment the model did not tag
            # ("In" + "##su" + "##cy" -> "Insucy"); slicing the source text
            # from the first piece's start to the last piece's end keeps
            # the full surface form.
            # NOTE(review): assumes "start"/"end" are character offsets
            # into `text` -- confirm for the tokenizer in use.
            word = text[start:end]
        else:
            word = "".join(span["pieces"])
        merged.append((word, span["entity"]))
    return merged


# Print each reassembled word followed by the label of its first piece.
for word, entity in _merge_wordpieces(ner_results, example):
    print(word, entity)
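# Output recorded from the original run of the merging loop above
# (one "<word> <entity>" pair per printed line):
# Kashmir B-MISC
# Files I-MISC
# Indian B-MISC
# Hindi B-MISC
# Vivek B-PER
# Agnihotri I-PER
# Zee B-ORG
# Studios I-ORG
# Kashmiri B-MISC
# Pandit I-MISC
# Kashmir B-MISC
# Insucy I-MISC
# Anupam B-PER
# Kher I-PER
# Darshan B-PER
# Kumar I-PER
# Pallavi B-PER
# Joshi I-PER
# Mithun B-PER
# Chakraborty I-PER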