#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Notebook setup: reload edited modules automatically and draw plots inline.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import os

# GPU selection: number devices by PCI bus id and expose only device "0".
# These are set here, ahead of the ktrain import further down.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})


# In this example, we will apply ktrain to the dataset employed in the **scikit-learn** [*Working with Text Data*](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html) tutorial.  As in the tutorial, we will select four newsgroups to create a small multiclass text classification dataset.  Let's fetch the [20newsgroups dataset](http://qwone.com/~jason/20Newsgroups/) using **scikit-learn**.

# In[3]:


from sklearn.datasets import fetch_20newsgroups

# The four newsgroups that make up this small multiclass dataset.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the train and test subsets, shuffled with the same fixed seed.
train_b = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
test_b = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Inspect the training bunch: available keys, class names, and a few samples.
print(train_b.keys())
print(train_b.target_names)
print(train_b.filenames[:5])
print(train_b.target[:5])
print(train_b.data[0][:300])  # first 300 characters of the first document
print(train_b.target[0])

# Raw documents and their labels for each split.
x_train, y_train = train_b.data, train_b.target
x_test, y_test = test_b.data, test_b.target


# In[4]:


import ktrain
from ktrain import text


# In[5]:


# Preprocess the raw newsgroup documents with ktrain.
#
# The inputs are read directly from the fetched splits (train_b / test_b)
# instead of from x_train / x_test. This cell rebinds x_train, y_train, x_test
# and y_test to the preprocessed output, so reading those names as inputs would
# feed already-processed arrays back into the preprocessor whenever the cell is
# executed a second time. On a fresh run the values are identical, because
# x_train / y_train / x_test / y_test were bound to these same attributes.
#
# `preproc` is kept because the model and the predictor below both need it.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=train_b.data, y_train=train_b.target,
    x_test=test_b.data, y_test=test_b.target,
    class_names=train_b.target_names,
    ngram_range=1,
    maxlen=350,
    max_features=35000,
)


# In[6]:


# Build the 'nbsvm' text classifier from the preprocessed training data;
# `preproc` is the preprocessor returned by texts_from_array.
model = text.text_classifier(
    'nbsvm',
    train_data=(x_train, y_train),
    preproc=preproc,
)


# In[7]:


# Wrap the model in a ktrain learner.
# Note that the test split is what gets passed as the validation data here.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
)


# In[8]:


# Run the learner's learning-rate search, then plot what it recorded.
# lr_plot() is called right after lr_find(), so it presumably relies on the
# results of that search -- keep the two calls in this order.
learner.lr_find()
learner.lr_plot()


# In[9]:


# Train the model with autofit, passing 0.01 as its first argument.
# NOTE(review): 0.01 is presumably the learning rate read off the lr_plot()
# output above -- confirm against the plot if the data or model changes.
learner.autofit(0.01)


# In[10]:


# Evaluate the trained model. No data is passed here, so this presumably uses
# the val_data given to get_learner (the test split) -- TODO confirm.
learner.validate()


# In[12]:


# Pair the trained model with the preprocessor to obtain a predictor object.
predictor = ktrain.get_predictor(
    learner.model,
    preproc,
)


# In[13]:


# Show the classes known to the predictor.
# NOTE(review): presumably these are the class_names given to texts_from_array.
predictor.get_classes()


# In[14]:


# Predict on the first three test documents. The raw text from test_b.data is
# passed in (not the preprocessed x_test); the predictor was built with
# `preproc`, so it presumably applies the preprocessing itself.
predictor.predict(test_b.data[0:3])


# In[15]:


# Ground-truth labels for the same three test documents, for comparison with
# the predictions above.
# NOTE(review): these are the values stored in test_b.target; presumably they
# index into test_b.target_names -- map through it to compare by class name.
test_b.target[:3]