# IPython notebook setup: auto-reload edited modules on each cell execution
# and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os

# Pin device enumeration to PCI bus order and restrict visibility to GPU 0.
# These must be set BEFORE the deep-learning framework is imported (ktrain
# imports Keras/TensorFlow below), or the settings are ignored.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import ktrain
from ktrain import text
Using TensorFlow backend.
# File paths for the CoNLL-2003 named-entity-recognition dataset splits.
TDATA = 'data/conll2003/train.txt'
VDATA = 'data/conll2003/valid.txt'
# Load and preprocess the train/validation sets; `preproc` captures the
# token/label encoding and is reused later to build the model and predictor.
(trn, val, preproc) = text.entities_from_conll2003(TDATA, val_filepath=VDATA)
Number of sentences: 14041 Number of words in the dataset: 23623 Tags: ['B-MISC', 'I-MISC', 'I-PER', 'B-ORG', 'I-ORG', 'I-LOC', 'O', 'B-PER', 'B-LOC'] Number of Labels: 9 Longest sentence: 113 words
# Build a bidirectional-LSTM + CRF sequence-tagging model, sized from the
# vocabulary/labels recorded in `preproc`.
model = text.sequence_tagger('bilstm-crf', preproc)
# Wrap the model and datasets in a ktrain Learner to drive training.
learner = ktrain.get_learner(model, train_data=trn, val_data=val)
# Train for 5 epochs at a constant learning rate of 0.001.
learner.fit(0.001, 5)
Epoch 1/5 439/439 [==============================] - 123s 281ms/step - loss: 9.4288 - val_loss: 9.3419 Epoch 2/5 439/439 [==============================] - 120s 274ms/step - loss: 9.0161 - val_loss: 9.2529 Epoch 3/5 439/439 [==============================] - 119s 271ms/step - loss: 8.9478 - val_loss: 9.2348 Epoch 4/5 439/439 [==============================] - 120s 273ms/step - loss: 8.9266 - val_loss: 9.2303 Epoch 5/5 439/439 [==============================] - 120s 273ms/step - loss: 8.9159 - val_loss: 9.2310
<keras.callbacks.History at 0x7fb8cde39390>
# Evaluate on the validation set; prints entity-level precision/recall/F1
# per label class (see output below).
learner.validate()
F1: 87.20 precision recall f1-score support LOC 0.87 0.94 0.91 1837 ORG 0.82 0.80 0.81 1341 MISC 0.88 0.78 0.83 922 PER 0.89 0.91 0.90 1842 micro avg 0.87 0.88 0.87 5942 macro avg 0.87 0.88 0.87 5942
We can use the `view_top_losses` method to inspect the sentences the model gets most wrong. Here, we can see that our model has trouble with movie titles, which is understandable since they are folded into the catch-all miscellaneous (MISC) category.
# Display the n=1 validation example with the highest loss, showing each
# word's true tag alongside the model's (mis)prediction.
learner.view_top_losses(n=1)
total incorrect: 12 Word True : (Pred) ============================== Best :O (B-ORG) known :O (O) for :O (O) appearances :O (O) in :O (O) " :O (O) Ice :B-MISC (O) Cold :I-MISC (O) in :I-MISC (O) Alex :I-MISC (B-PER) " :O (O) , :O (O) " :O (O) Lawrence :B-MISC (B-PER) of :I-MISC (I-MISC) Arabia :I-MISC (I-LOC) " :O (O) and :O (O) , :O (O) as :O (O) Cardinal :O (B-PER) Wolsey :B-PER (I-PER) , :O (O) in :O (O) " :O (O) Anne :B-MISC (B-PER) of :I-MISC (O) a :I-MISC (O) Thousand :I-MISC (I-MISC) Days :I-MISC (I-MISC) " :O (O) . :O (O)
# Package the trained model together with `preproc` into a predictor that
# accepts raw text.
predictor = ktrain.get_predictor(learner.model, preproc)
# Tag a new sentence; returns a list of (token, IOB-tag) pairs.
predictor.predict('As of 2019, Donald Trump is still the President of the United States.')
[('As', 'O'), ('of', 'O'), ('2019', 'O'), (',', 'O'), ('Donald', 'B-PER'), ('Trump', 'I-PER'), ('is', 'O'), ('still', 'O'), ('the', 'O'), ('President', 'O'), ('of', 'O'), ('the', 'O'), ('United', 'B-LOC'), ('States', 'I-LOC'), ('.', 'O')]
# Persist the predictor (model weights + preprocessor) to disk.
predictor.save('/tmp/mypred')
# Reload it from disk to verify the saved artifact works standalone.
reloaded_predictor = ktrain.load_predictor('/tmp/mypred')
reloaded_predictor.predict('Paul Newman is my favorite actor.')
[('Paul', 'B-PER'), ('Newman', 'I-PER'), ('is', 'O'), ('my', 'O'), ('favorite', 'O'), ('actor', 'O'), ('.', 'O')]