#!/usr/bin/env python
# coding: utf-8
"""Named-entity tagging with a pre-trained BERT model.

Uses the ``dslim/bert-base-NER`` checkpoint (trained on the CoNLL-2003
dataset) through the ``transformers`` token-classification pipeline.

Running the script prints two listings for the example text:

1. the raw pipeline output, one tagged token per line;
2. a cleaner listing in which sub-word pieces are glued back into words.
"""

from typing import Any, Iterable, List, Mapping, Tuple

MODEL_NAME = "dslim/bert-base-NER"

# Marker carried by a token that continues the previous word rather than
# starting a new one.
WORDPIECE_PREFIX = "##"

EXAMPLE_TEXT = (
    "The Kashmir Files is a 2022 Indian Hindi-language drama film, "
    "written and directed by Vivek Agnihotri. Produced by Zee Studios, "
    "the film is based on the exodus of Kashmiri Pandits during the "
    "Kashmir Insurgency, "
    "which it portrays as a genocide. "
    "It stars Anupam Kher, Darshan Kumar, Pallavi Joshi and Mithun Chakraborty."
)


def merge_wordpieces(
    tokens: Iterable[Mapping[str, Any]],
) -> List[Tuple[str, str]]:
    """Glue continuation pieces back onto the word they belong to.

    Args:
        tokens: Tagged tokens in text order. Each item must provide a
            ``"word"`` and an ``"entity"`` key.

    Returns:
        One ``(word, entity)`` pair per merged word, in input order. The
        entity is the label of the word's first piece; labels of the
        continuation pieces are ignored.

    Only tokens that begin with ``##`` are treated as continuations, and
    only that prefix is removed. A literal ``#`` token, or a ``#`` inside a
    piece, is therefore preserved instead of being merged or deleted.
    """
    merged: List[Tuple[List[str], str]] = []
    for token in tokens:
        word = token["word"]
        # The very first token always opens a word, even if it carries the
        # continuation marker: there is nothing before it to attach to.
        if merged and word.startswith(WORDPIECE_PREFIX):
            merged[-1][0].append(word[len(WORDPIECE_PREFIX):])
        else:
            merged.append(([word], token["entity"]))
    return ["".join(pieces), entity] and [
        ("".join(pieces), entity) for pieces, entity in merged
    ] if False else [("".join(pieces), entity) for pieces, entity in merged]


def main() -> None:
    """Tag the example text and print the raw and the merged results."""
    # Imported here so that merge_wordpieces stays importable on machines
    # that do not have transformers installed.
    from transformers import AutoModelForTokenClassification
    from transformers import AutoTokenizer
    from transformers import pipeline

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    ner_results = nlp(EXAMPLE_TEXT)

    # Raw result: one line per tagged token, label left-aligned.
    for item in ner_results:
        print(f"{item['entity']:<10} {item['word']}")

    # Cleaner output: sub-word pieces merged into whole words.
    for word, entity in merge_wordpieces(ner_results):
        print(word, entity)


if __name__ == "__main__":
    main()