# IPython notebook setup: auto-reload edited modules on each cell execution
# and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os

# Pin device enumeration to PCI bus order and restrict visibility to GPU 0.
# These must be set BEFORE the deep-learning framework is imported (ktrain
# imports Keras/TensorFlow below), or the settings are ignored.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import ktrain
from ktrain import text
Using TensorFlow backend.
# File paths for the CoNLL-2003 named-entity-recognition dataset splits.
TDATA = 'data/conll2003/train.txt'
VDATA = 'data/conll2003/valid.txt'
# Load and preprocess the train/validation sets; `preproc` captures the
# token/label encoding and is reused later to build the model and predictor.
(trn, val, preproc) = text.entities_from_conll2003(TDATA, val_filepath=VDATA)
Number of sentences: 14041 Number of words in the dataset: 23623 Tags: ['B-MISC', 'I-MISC', 'I-PER', 'B-ORG', 'I-ORG', 'I-LOC', 'O', 'B-PER', 'B-LOC'] Number of Labels: 9 Longest sentence: 113 words
# Build a bidirectional-LSTM + CRF sequence-tagging model, sized from the
# vocabulary/labels recorded in `preproc`.
model = text.sequence_tagger('bilstm-crf', preproc)
# Wrap the model and datasets in a ktrain Learner to drive training.
learner = ktrain.get_learner(model, train_data=trn, val_data=val)
# Train for 5 epochs at a constant learning rate of 0.001.
learner.fit(0.001, 5)
Epoch 1/5 439/439 [==============================] - 123s 281ms/step - loss: 9.4288 - val_loss: 9.3419 Epoch 2/5 439/439 [==============================] - 120s 274ms/step - loss: 9.0161 - val_loss: 9.2529 Epoch 3/5 439/439 [==============================] - 119s 271ms/step - loss: 8.9478 - val_loss: 9.2348 Epoch 4/5 439/439 [==============================] - 120s 273ms/step - loss: 8.9266 - val_loss: 9.2303 Epoch 5/5 439/439 [==============================] - 120s 273ms/step - loss: 8.9159 - val_loss: 9.2310
<keras.callbacks.History at 0x7fb8cde39390>
# Evaluate on the validation set; prints entity-level precision/recall/F1
# per label class (see output below).
learner.validate()
F1: 87.20 precision recall f1-score support LOC 0.87 0.94 0.91 1837 ORG 0.82 0.80 0.81 1341 MISC 0.88 0.78 0.83 922 PER 0.89 0.91 0.90 1842 micro avg 0.87 0.88 0.87 5942 macro avg 0.87 0.88 0.87 5942
We can use the `view_top_losses` method to inspect the sentences the model gets most wrong. Here, we can see that our model has trouble with movie titles, which is understandable since they are folded into the catch-all miscellaneous (MISC) category.
# Display the n=1 validation example with the highest loss, showing each
# word's true tag alongside the model's (mis)prediction.
learner.view_top_losses(n=1)
total incorrect: 12 Word True : (Pred) ============================== Best :O (B-ORG) known :O (O) for :O (O) appearances :O (O) in :O (O) " :O (O) Ice :B-MISC (O) Cold :I-MISC (O) in :I-MISC (O) Alex :I-MISC (B-PER) " :O (O) , :O (O) " :O (O) Lawrence :B-MISC (B-PER) of :I-MISC (I-MISC) Arabia :I-MISC (I-LOC) " :O (O) and :O (O) , :O (O) as :O (O) Cardinal :O (B-PER) Wolsey :B-PER (I-PER) , :O (O) in :O (O) " :O (O) Anne :B-MISC (B-PER) of :I-MISC (O) a :I-MISC (O) Thousand :I-MISC (I-MISC) Days :I-MISC (I-MISC) " :O (O) . :O (O)
# Package the trained model together with `preproc` into a predictor that
# accepts raw text.
predictor = ktrain.get_predictor(learner.model, preproc)
# Tag a new sentence; returns a list of (token, IOB-tag) pairs.
predictor.predict('As of 2019, Donald Trump is still the President of the United States.')
[('As', 'O'), ('of', 'O'), ('2019', 'O'), (',', 'O'), ('Donald', 'B-PER'), ('Trump', 'I-PER'), ('is', 'O'), ('still', 'O'), ('the', 'O'), ('President', 'O'), ('of', 'O'), ('the', 'O'), ('United', 'B-LOC'), ('States', 'I-LOC'), ('.', 'O')]
# Persist the predictor (model weights + preprocessor) to disk.
predictor.save('/tmp/mypred')
# Reload it from disk to verify the saved artifact works standalone.
reloaded_predictor = ktrain.load_predictor('/tmp/mypred')
reloaded_predictor.predict('Paul Newman is my favorite actor.')
[('Paul', 'B-PER'), ('Newman', 'I-PER'), ('is', 'O'), ('my', 'O'), ('favorite', 'O'), ('actor', 'O'), ('.', 'O')]