#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# ## STEP 1: Load and Preprocess Data
#
# The CoNLL2003 NER dataset can be downloaded from [here](https://github.com/amaiya/ktrain/tree/master/ktrain/tests/conll2003).

# In[3]:


TDATA = 'data/conll2003/train.txt'
VDATA = 'data/conll2003/valid.txt'
(trn, val, preproc) = text.entities_from_conll2003(TDATA, val_filepath=VDATA)


# ## STEP 2: Define a Model

# In[4]:


# a Bidirectional LSTM with a CRF output layer
model = text.sequence_tagger('bilstm-crf', preproc)


# In[5]:


learner = ktrain.get_learner(model, train_data=trn, val_data=val)


# ## STEP 3: Train and Evaluate Model

# In[6]:


# train for 5 epochs at a constant learning rate of 0.001
learner.fit(0.001, 5)


# In[7]:


# evaluate on the validation set
learner.validate()


# We can use the `view_top_losses` method to inspect the sentences the model
# gets most wrong. Here, we can see the model has trouble with movie titles,
# which is understandable since they are lumped into the catch-all
# miscellaneous (MISC) category.

# In[8]:


learner.view_top_losses(n=1)


# ## Make Predictions on New Data

# In[13]:


predictor = ktrain.get_predictor(learner.model, preproc)


# In[14]:


predictor.predict('As of 2019, Donald Trump is still the President of the United States.')


# In[15]:


# save the predictor (model plus preprocessing pipeline) to disk
predictor.save('/tmp/mypred')


# In[16]:


reloaded_predictor = ktrain.load_predictor('/tmp/mypred')


# In[18]:


reloaded_predictor.predict('Paul Newman is my favorite actor.')
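
# The cells above call `predict` for its displayed return value. Assuming the
# predictor returns a list of `(token, tag)` pairs in BIO format (e.g.,
# `('Paul', 'B-PER')`), the sketch below shows one way to group those pairs
# into entity spans. The helper `collect_entities` and the assumed output
# format are illustrative conveniences for this notebook, not part of the
# ktrain API.

# In[19]:


def collect_entities(pairs):
    """Group BIO-tagged (token, tag) pairs into (entity_text, label) spans."""
    entities, current_tokens, current_label = [], [], None
    for token, tag in pairs:
        if tag.startswith('B-') or (tag.startswith('I-') and current_label != tag[2:]):
            # a new entity begins; flush any entity in progress
            if current_tokens:
                entities.append((' '.join(current_tokens), current_label))
            current_tokens, current_label = [token], tag[2:]
        elif tag.startswith('I-'):
            # continuation of the current entity
            current_tokens.append(token)
        else:
            # 'O' tag: outside any entity, so flush the one in progress
            if current_tokens:
                entities.append((' '.join(current_tokens), current_label))
            current_tokens, current_label = [], None
    if current_tokens:
        entities.append((' '.join(current_tokens), current_label))
    return entities

collect_entities(reloaded_predictor.predict('Paul Newman is my favorite actor.'))
# e.g., [('Paul Newman', 'PER')] if the tokens are tagged B-PER, I-PER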