#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"] = "0";


# # Text Classification with Hugging Face Transformers in *ktrain*
# 
# As of v0.8.x, *ktrain* includes an easy-to-use, thin wrapper to the Hugging Face transformers library for text classification.

# ## Load Data Into Arrays

# In[2]:


categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
test_b = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))

x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target


# ## STEP 1: Preprocess Data and Build a Transformer Model
# 
# For `MODEL_NAME`, *ktrain* supports both the "official" built-in models [available here](https://huggingface.co/transformers/pretrained_models.html) and the [community-uploaded models available here](https://huggingface.co/models).

# In[3]:


import ktrain
from ktrain import text

MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=500, class_names=train_b.target_names)
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)


# Note that `x_train` and `x_test` are the raw texts, which look like this:
# ```python
# x_train = ['I hate this movie.', 'I like this movie.']
# ```
# The labels are arrays in one of the following forms:
# ```python
# # string labels
# y_train = ['negative', 'positive']
# 
# # integer labels
# y_train = [0, 1]  # labels must start from 0 if in integer format
# 
# # multi or one-hot encoded labels
# y_train = [[1,0], [0,1]]
# ```
# In the latter two cases, you must supply a `class_names` argument to the `Transformer` constructor, which tells *ktrain* how indices map to class names. In this case, `class_names=['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']` because 0=alt.atheism, 1=comp.graphics, etc.

# ## STEP 2 [Optional]: Estimate a Good Learning Rate
# 
# Based on papers from Google, learning rates between `2e-5` and `5e-5` tend to work well with transformer models. However, we will run the learning-rate finder for two epochs to estimate a good learning rate for this particular dataset.
# 
# As shown below, our results are consistent with Google's findings.

# In[5]:


learner.lr_find(show_plot=True, max_epochs=2)


# ## STEP 3: Train Model
# 
# Train using a [1cycle learning rate schedule](https://arxiv.org/pdf/1803.09820.pdf).
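# *ktrain*'s `fit_onecycle` applies this schedule for you. The snippet below is only a rough
# illustrative sketch of the triangular learning-rate shape behind the 1cycle idea (ramp up to a
# peak rate, then back down), not ktrain's internal implementation; the function name and
# parameters (`onecycle_lr`, `total_steps`, `peak_frac`) are hypothetical:
# 
# ```python
# def onecycle_lr(step, total_steps, max_lr=8e-5, peak_frac=0.25):
#     """Rough triangular 1cycle shape: linear warm-up, then linear decay."""
#     peak = max(1, int(total_steps * peak_frac))   # step at which the LR peaks
#     if step <= peak:
#         return max_lr * step / peak                                   # warm-up phase
#     return max_lr * (total_steps - step) / (total_steps - peak)       # decay phase
# 
# # e.g., over 100 steps the LR climbs to 8e-5 by step 25 and decays toward 0 by step 100
# lrs = [onecycle_lr(s, 100) for s in range(1, 101)]
# ```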
# In[4]:


learner.fit_onecycle(8e-5, 4)


# ## STEP 4: Evaluate/Inspect Model

# In[5]:


learner.validate(class_names=t.get_classes())


# In[6]:


# the one we got most wrong
learner.view_top_losses(n=1, preproc=t)


# In[7]:


# understandable mistake - this sci.med post talks a lot about computer graphics
print(x_test[521])


# ## STEP 5: Make Predictions on New Data in Deployment

# In[8]:


predictor = ktrain.get_predictor(learner.model, preproc=t)


# In[9]:


predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[10]:


predictor.explain('Jesus Christ is the central figure of Christianity.')


# In[11]:


predictor.save('/tmp/my_20newsgroup_predictor')


# In[12]:


reloaded_predictor = ktrain.load_predictor('/tmp/my_20newsgroup_predictor')


# In[13]:


reloaded_predictor.predict('Jesus Christ is the central figure of Christianity.')


# In[14]:


reloaded_predictor.predict_proba('Jesus Christ is the central figure of Christianity.')


# In[15]:


reloaded_predictor.get_classes()
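# Since `predict_proba()` returns probabilities in the same index order that `get_classes()` uses
# (the index-to-name mapping described in STEP 1), the two can be paired to inspect the full class
# distribution behind a prediction. This is a minimal illustrative sketch; the variable names are
# arbitrary and not part of the ktrain API:
# 
# ```python
# probs = reloaded_predictor.predict_proba('Jesus Christ is the central figure of Christianity.')
# classes = reloaded_predictor.get_classes()
# 
# # pair each class name with its probability and sort from most to least likely
# ranked = sorted(zip(classes, probs), key=lambda pair: pair[1], reverse=True)
# for name, prob in ranked:
#     print('%-25s %.4f' % (name, prob))
# ```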
# ## Additional Tips and Tricks
# 
# If you have a **transformers** model that has already been trained/fine-tuned, you can easily wrap it in a **ktrain** `Predictor`. The example below loads the pre-fine-tuned [coronabert model](https://huggingface.co/jakelever/coronabert) into **ktrain** to make predictions:
# 
# ```python
# # Import ktrain along with a couple of things from transformers
# import ktrain
# from transformers import TFAutoModelForSequenceClassification
# 
# # Load the model and compile it for ktrain/tf.Keras
# model = TFAutoModelForSequenceClassification.from_pretrained("jakelever/coronabert")
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# 
# # Pull the categories out of the model (or set class_names manually)
# class_names = list(model.config.id2label.values())
# 
# # Set up a ktrain preprocessor (which manages the tokenization) with the labels
# preproc = ktrain.text.Transformer('jakelever/coronabert', maxlen=500, class_names=class_names)
# preproc.preprocess_train_called = True  # needed to suppress warnings about not calling preprocess_train
# 
# # Get the predictor (which takes in the model and tokenizer info)
# predictor = ktrain.get_predictor(model, preproc)
# 
# # Make predictions
# text = ["A genomic region associated with protection against severe COVID-19 is inherited from Neandertals."]
# predictor.predict(text)
# 
# # OUTPUT:
# # [[('Clinical Reports', 0.0003284997),
# #   ('Comment/Editorial', 0.0022700194),
# #   ('Communication', 0.00060458254),
# #   ('Contact Tracing', 0.00027690193),
# #   ('Diagnostics', 0.0003987006),
# #   ('Drug Targets', 0.0008852846),
# #   ('Education', 0.00018228142),
# #   ('Effect on Medical Specialties', 0.00045943243),
# #   ('Forecasting & Modelling', 0.00047854715),
# #   ('Health Policy', 0.00042494797),
# #   ('Healthcare Workers', 6.292213e-05),
# #   ('Imaging', 0.00021008229),
# #   ('Immunology', 0.00072542584),
# #   ('Inequality', 0.0007106358),
# #   ('Infection Reports', 0.00033797201),
# #   ('Long Haul', 0.00034338655),
# #   ('Medical Devices', 0.0002488097),
# #   ('Meta-analysis', 0.00030506376),
# #   ('Misinformation', 0.0012771417),
# #   ('Model Systems & Tools', 0.0020338537),
# #   ('Molecular Biology', 0.9950799),
# #   ('News', 0.00034808667),
# #   ('Non-human', 0.98562455),
# #   ('Non-medical', 0.0005655724),
# #   ('Pediatrics', 0.00042545484),
# #   ('Prevalence', 0.0011711525),
# #   ('Prevention', 0.00043099752),
# #   ('Psychology', 0.00045698017),
# #   ('Recommendations', 0.0004172316),
# #   ('Review', 0.002200645),
# #   ('Risk Factors', 0.00014382145),
# #   ('Surveillance', 0.00081551325),
# #   ('Therapeutics', 0.0010580326),
# #   ('Transmission', 0.0031670583),
# #   ('Vaccines', 0.0011023124)]]
# ```
# 
# Finally, to make predictions with a smaller deployment footprint, you can export the model to ONNX format as described in [this example notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/develop/examples/text/ktrain-ONNX-TFLite-examples.ipynb).

# In[ ]:
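# One more note on the coronabert example above: the model is multi-label (notice that both
# 'Molecular Biology' and 'Non-human' receive high scores), so a simple way to turn the returned
# (label, probability) pairs into final labels is to apply a cutoff. A minimal sketch, assuming the
# output format shown above and a hypothetical threshold of 0.5:
# 
# ```python
# preds = predictor.predict(text)   # one list of (label, probability) pairs per input text
# THRESHOLD = 0.5                   # hypothetical cutoff; tune for your application
# labels = [[name for name, prob in doc if prob >= THRESHOLD] for doc in preds]
# print(labels)                     # e.g. [['Molecular Biology', 'Non-human']]
# ```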