#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# ## STEP 1: Load and Preprocess Data
#
# The CoNLL2003 NER dataset can be downloaded from [here](https://github.com/amaiya/ktrain/tree/master/ktrain/tests/conll2003).

# In[3]:


TDATA = 'data/conll2003/train.txt'
VDATA = 'data/conll2003/valid.txt'
(trn, val, preproc) = text.entities_from_conll2003(TDATA, val_filepath=VDATA)


# ## STEP 2: Define a Model

# In[4]:


# a Bidirectional LSTM with a CRF output layer
model = text.sequence_tagger('bilstm-crf', preproc)


# In[5]:


learner = ktrain.get_learner(model, train_data=trn, val_data=val)


# ## STEP 3: Train and Evaluate Model

# In[6]:


# train for 5 epochs at a constant learning rate of 0.001
learner.fit(0.001, 5)


# In[7]:


# evaluate on the validation set
learner.validate()


# We can use the `view_top_losses` method to inspect the sentences the model
# gets most wrong. Here, we can see the model has trouble with movie titles,
# which is understandable since they are lumped into the catch-all
# miscellaneous (MISC) category.

# In[8]:


learner.view_top_losses(n=1)


# ## Make Predictions on New Data

# In[13]:


predictor = ktrain.get_predictor(learner.model, preproc)


# In[14]:


predictor.predict('As of 2019, Donald Trump is still the President of the United States.')


# In[15]:


# save the predictor (model plus preprocessing pipeline) to disk
predictor.save('/tmp/mypred')


# In[16]:


reloaded_predictor = ktrain.load_predictor('/tmp/mypred')


# In[18]:


reloaded_predictor.predict('Paul Newman is my favorite actor.')
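
# The cells above call `predict` for its displayed return value. Assuming the
# predictor returns a list of `(token, tag)` pairs in BIO format (e.g.,
# `('Paul', 'B-PER')`), the sketch below shows one way to group those pairs
# into entity spans. The helper `collect_entities` and the assumed output
# format are illustrative conveniences for this notebook, not part of the
# ktrain API.

# In[19]:


def collect_entities(pairs):
    """Group BIO-tagged (token, tag) pairs into (entity_text, label) spans."""
    entities, current_tokens, current_label = [], [], None
    for token, tag in pairs:
        if tag.startswith('B-') or (tag.startswith('I-') and current_label != tag[2:]):
            # a new entity begins; flush any entity in progress
            if current_tokens:
                entities.append((' '.join(current_tokens), current_label))
            current_tokens, current_label = [token], tag[2:]
        elif tag.startswith('I-'):
            # continuation of the current entity
            current_tokens.append(token)
        else:
            # 'O' tag: outside any entity, so flush the one in progress
            if current_tokens:
                entities.append((' '.join(current_tokens), current_label))
            current_tokens, current_label = [], None
    if current_tokens:
        entities.append((' '.join(current_tokens), current_label))
    return entities

collect_entities(reloaded_predictor.predict('Paul Newman is my favorite actor.'))
# e.g., [('Paul Newman', 'PER')] if the tokens are tagged B-PER, I-PER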