#!/usr/bin/env python
# coding: utf-8

# Notebook export: multiclass text classification on a 4-category subset of
# the 20 Newsgroups corpus using ktrain's NBSVM text classifier.

# In[1]:

# IPython-only setup: auto-reload edited modules and render plots inline.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import os

# Number CUDA devices in PCI bus order and expose only device 0.
# These are set before ktrain is imported further down.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# In this example, we will apply ktrain to the dataset employed in the
# **scikit-learn** [*Working with Text Data*](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)
# tutorial. As in the tutorial, we will sample 4 newsgroups to create a small
# multiclass text classification dataset. Let's fetch the
# [20newsgroups dataset](http://qwone.com/~jason/20Newsgroups/) using
# **scikit-learn**.

# In[3]:

categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

from sklearn.datasets import fetch_20newsgroups

# Same categories, shuffling and seed for both the train and test subsets.
train_b = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
test_b = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Inspect the fetched training bunch: available keys, class names, and a
# peek at the first few file names, labels and the first document.
print(train_b.keys())
print(train_b['target_names'])
print(train_b['filenames'][:5])
print(train_b['target'][:5])
print(train_b['data'][0][:300])
print(train_b['target'][0])
#print(set(train_b['target']))

# Raw documents and their labels, as returned by scikit-learn.
x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target

# In[4]:

import ktrain
from ktrain import text

# In[5]:

# Preprocess the raw arrays with ktrain; the names x_train/y_train/x_test/
# y_test are rebound to the preprocessed outputs, and `preproc` is kept for
# building the model and the predictor below.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    class_names=train_b.target_names,
    ngram_range=1,
    maxlen=350,
    max_features=35000,
)

# In[6]:

# Build the 'nbsvm' text classifier from the preprocessed training data.
model = text.text_classifier(
    'nbsvm',
    train_data=(x_train, y_train),
    preproc=preproc,
)

# In[7]:

# Wrap model and data in a learner; the test split serves as validation data.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
)

# In[8]:

# Run the learning-rate finder and plot its result.
learner.lr_find()
learner.lr_plot()

# In[9]:

# Train with autofit, passing 0.01 (presumably the learning rate chosen from
# the plot above -- confirm against the ktrain API).
learner.autofit(0.01)

# In[10]:

# Evaluate on the validation data supplied to the learner.
learner.validate()

# In[12]:

# Bundle the trained model with its preprocessor so raw text can be classified.
predictor = ktrain.get_predictor(learner.model, preproc)

# In[13]:

predictor.get_classes()

# In[14]:

# Predict on the first three raw test documents ...
predictor.predict(test_b.data[0:3])

# In[15]:

# ... and show their ground-truth labels for comparison.
test_b.target[:3]