#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# # Text Classification with Hugging Face Transformers in *ktrain*
#
# As of v0.8.x, *ktrain* includes an easy-to-use, thin wrapper around the Hugging Face transformers library for text classification.

# ## Load Data Into Arrays

# In[2]:


categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
train_b = fetch_20newsgroups(subset='train',
                             categories=categories, shuffle=True, random_state=42)
test_b = fetch_20newsgroups(subset='test',
                            categories=categories, shuffle=True, random_state=42)

print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))

x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target


# ## STEP 1: Preprocess Data and Build a Transformer Model
#
# For `MODEL_NAME`, *ktrain* supports both the "official" built-in models [available here](https://huggingface.co/transformers/pretrained_models.html) and the [community-uploaded models available here](https://huggingface.co/models).

# In[3]:


import ktrain
from ktrain import text
MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)


# Note that `x_train` and `x_test` are the raw texts, which look like this:
# ```python
# x_train = ['I hate this movie.', 'I like this movie.']
# ```
# The labels are arrays in one of the following forms:
# ```python
# # string labels
# y_train = ['negative', 'positive']
# # integer labels
# y_train = [0, 1]  # labels must start from 0 if in integer format
# # multi or one-hot encoded labels
# y_train = [[1,0], [0,1]]
# ```
# In the latter two cases, you must supply a `class_names` argument to the `Transformer` constructor, which tells *ktrain* how indices map to class names. In this case, `class_names=['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']` because 0=alt.atheism, 1=comp.graphics, etc.

# ## STEP 2 [Optional]: Estimate a Good Learning Rate
#
# Learning rates between `2e-5` and `5e-5` tend to work well with transformer models, based on the original papers from Google. However, we will run the learning-rate finder for two epochs to estimate a good learning rate on this particular dataset.
#
# As shown below, our results are consistent with Google's findings.

# In[5]:


learner.lr_find(show_plot=True, max_epochs=2)


# ## STEP 3: Train Model
#
# Train using a [1cycle learning rate schedule](https://arxiv.org/pdf/1803.09820.pdf).
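# The cell below trains with `fit_onecycle` using a peak learning rate of `8e-5` for four epochs. *ktrain* learners also offer other training routines; the snippet below is a minimal sketch (not run in this notebook), assuming the standard `Learner` API in recent *ktrain* versions, where `autofit` uses a triangular learning rate policy and, if no epoch count is supplied, stops training automatically based on validation metrics.
# ```python
# # sketch only: an alternative to the fit_onecycle call below
# # triangular LR policy; early stopping applies when epochs are not specified
# learner.autofit(8e-5)
# ```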
# In[4]:


learner.fit_onecycle(8e-5, 4)


# ## STEP 4: Evaluate/Inspect Model

# In[5]:


learner.validate(class_names=t.get_classes())


# In[6]:


# the one we got most wrong
learner.view_top_losses(n=1, preproc=t)


# In[7]:


# understandable mistake - this sci.med post talks a lot about computer graphics
print(x_test[521])


# ## STEP 5: Make Predictions on New Data in Deployment

# In[8]:


predictor = ktrain.get_predictor(learner.model, preproc=t)


# In[9]:


predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[10]:


predictor.explain('Jesus Christ is the central figure of Christianity.')


# In[11]:


predictor.save('/tmp/my_20newsgroup_predictor')


# In[12]:


reloaded_predictor = ktrain.load_predictor('/tmp/my_20newsgroup_predictor')


# In[13]:


reloaded_predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[14]:


reloaded_predictor.predict_proba('Jesus Christ is the central figure of Christianity.')


# In[15]:


reloaded_predictor.get_classes()


# In[ ]:
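# As a final note, the predictor methods above also accept lists of documents, which is convenient when scoring batches in deployment. The snippet below is a minimal sketch (not run in this notebook); it assumes `predict` accepts a list of texts and that `predict_proba` returns probabilities in the order given by `get_classes()`. Verify this against your installed version of *ktrain*.
# ```python
# docs = ['I have a question about rendering 3D images.',
#         'My doctor prescribed a new medication.']
#
# # predicted class name for each document
# print(reloaded_predictor.predict(docs))
#
# # class probabilities for the first document, keyed by class name
# probs = reloaded_predictor.predict_proba(docs[0])
# print(dict(zip(reloaded_predictor.get_classes(), probs)))
# ```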