import re
# Sample sentence used by every example below.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

# "Paul", a single space, then a capitalised word of at least two characters.
# A bare "Paul" followed by a lowercase word (e.g. "Paul is") does not match.
pattern = r"Paul [A-Z]\w+"

# finditer yields one re.Match per hit, lazily; the iterator is exhausted
# once this loop finishes.
matches = re.finditer(pattern, text)
for match in matches:
    print(match)
# Output:
# <re.Match object; span=(0, 11), match='Paul Newman'>
# <re.Match object; span=(39, 53), match='Paul Hollywood'>
import spacy
from spacy.tokens import Span
# Log output:
# INFO:tensorflow:Enabling eager execution
# INFO:tensorflow:Enabling v2 tensorshape
# INFO:tensorflow:Enabling resource variables
# INFO:tensorflow:Enabling tensor equality
# INFO:tensorflow:Enabling control flow v2
# A blank pipeline only tokenizes, so doc.ents is empty at this point.
nlp = spacy.blank("en")
doc = nlp(text)
print(doc.ents)

# Start from whatever entities the doc already has, then add the regex hits.
original_ents = list(doc.ents)

# One (token_start, token_end, text) tuple per regex hit.
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    # Convert character offsets to a token span; char_span gives None when
    # the offsets do not line up with token boundaries, so skip those hits.
    span = doc.char_span(start, end)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))

# Re-create every hit as a PERSON-labelled span (token indices this time).
for start, end, _ in mwt_ents:
    original_ents.append(Span(doc, start, end, label="PERSON"))
doc.ents = original_ents

for ent in doc.ents:
    print(ent.text, ent.label_)
# Output:
# ()
# Paul Newman PERSON
# Paul Hollywood PERSON
# Show the collected (token_start, token_end, text) tuples.
print(mwt_ents)
# Output: [(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]
from spacy.language import Language
@Language.component("paul_ner")
def paul_ner(doc):
    """Label every "Paul <Capitalised word>" match in ``doc`` as PERSON.

    Entities already present on ``doc`` are kept. Regex hits whose character
    offsets do not align with token boundaries are skipped. The combined
    spans are passed through ``filter_spans`` before assignment, because
    ``doc.ents`` rejects overlapping spans; without it this component fails
    when an earlier component has already tagged the same tokens.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with its ``ents`` updated.
    """
    # Imported locally so the component does not depend on a module-level
    # import that appears after this definition.
    from spacy.util import filter_spans

    pattern = r"Paul [A-Z]\w+"
    candidates = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span maps character offsets to a labelled token span and
        # returns None when the offsets fall inside a token.
        span = doc.char_span(start, end, label="PERSON")
        if span is not None:
            candidates.append(span)
    # On overlap filter_spans keeps the (first) longest span.
    doc.ents = filter_spans(candidates)
    return doc
# Blank English pipeline with the custom "paul_ner" component registered above added to it.
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")
# Output: <function __main__.paul_ner(doc)>
# Run the sample text through the paul_ner pipeline and show its entities.
doc2 = nlp2(text)
print(doc2.ents)
# Output: (Paul Newman, Paul Hollywood)
from spacy.language import Language
from spacy.util import filter_spans
@Language.component("cinema_ner")
def cinema_ner(doc):
    """Label every "Hollywood" match in ``doc`` as CINEMA.

    Entities already present on ``doc`` are kept. Regex hits whose character
    offsets do not align with token boundaries are skipped. The combined
    spans go through ``filter_spans`` so that ``doc.ents`` never receives
    overlapping spans. Since ``filter_spans`` prefers the longest span, a
    "Hollywood" hit that lies inside a longer existing entity (for example
    a PERSON span "Paul Hollywood") is dropped in favour of that entity.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with its ``ents`` updated.
    """
    pattern = r"Hollywood"
    candidates = list(doc.ents)
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span maps character offsets to a labelled token span and
        # returns None when the offsets fall inside a token.
        span = doc.char_span(start, end, label="CINEMA")
        if span is not None:
            candidates.append(span)
    doc.ents = filter_spans(candidates)
    return doc
# Load the "en_core_web_sm" pipeline and add the custom "cinema_ner" component registered above to it.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")
# Output: <function __main__.cinema_ner(doc)>
# Run the sample text through the pipeline that ends with cinema_ner and
# print each resulting entity with its label.
doc3 = nlp3(text)
for ent in doc3.ents:
    print(ent.text, ent.label_)
# Output:
# Paul Newman PERSON
# American NORP
# Paul Hollywood PERSON
# British NORP
# Paul PERSON