#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os

# Notebook conveniences: (re)load the autoreload extension, switch it to
# mode 2, and select the inline matplotlib backend.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# GPU selection through environment variables: enumerate devices by PCI bus
# id and expose only device "0". Both are set here, before ktrain is imported
# further down the file.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})


# In[2]:


import ktrain
from ktrain import text


# ## Load the Data Into Arrays

# In[3]:


from sklearn.datasets import fetch_20newsgroups

# The four newsgroups used for this classification example.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the train and test subsets with identical shuffling and seed.
train_b = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
test_b = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Quick summary; the test subset is reported as the validation set because
# it is what gets passed as x_test/y_test to preprocessing below.
print('size of training set: %s' % len(train_b.data))
print('size of validation set: %s' % len(test_b.data))
print('classes: %s' % (train_b.target_names,))

# Expose documents and labels under the names the later cells use.
x_train, y_train = train_b.data, train_b.target
x_test, y_test = test_b.data, test_b.target


# ## STEP 1: Preprocess Data

# In[4]:


# Preprocess the raw documents and labels in 'distilbert' mode. The call
# returns three values: the training data, the validation data (built from
# the x_test/y_test arrays), and the preprocessor object reused below.
# maxlen=350 is presumably the per-document sequence-length cap -- confirm
# against the ktrain docs before changing it.
trn, val, preproc = text.texts_from_array(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    class_names=train_b.target_names,
    preprocess_mode='distilbert',
    maxlen=350,
)


# ## STEP 2: Build a Model and Wrap in Learner

# In[5]:


# Informational only: going by its name, this lists the text classifier
# identifiers ktrain offers (one of which, 'distilbert', is used below).
# Its return value is not used.
text.print_text_classifiers()


# In[6]:


# Build the 'distilbert' text classifier from the preprocessed training data
# and the matching preprocessor returned by texts_from_array.
model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)


# In[7]:


# Wrap the model together with the training and validation data in a ktrain
# learner. batch_size=6 is a hand-picked value -- presumably chosen to fit
# GPU memory; adjust for your hardware.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)


# ## STEP 3: Train Model

# In[8]:


# Train with ktrain's one-cycle policy: learning rate 3e-5 for 4 epochs.
# Keyword arguments are used so the two numbers are self-describing.
learner.fit_onecycle(lr=3e-5, epochs=4)


# ## Predict on New Data

# In[9]:


# Bundle the trained model with its preprocessor into a predictor, so raw
# strings can be passed to predict() without manual preprocessing.
p = ktrain.get_predictor(model, preproc)


# In[11]:


# Smoke-test the predictor on a single unseen sentence; as the last
# expression in the cell, the result is displayed as the cell output.
p.predict("There is a problem with my computer monitor's resolution.  Everything is blurry.")


# In[ ]: