#!/usr/bin/env python
# coding: utf-8

# Notebook export: fine-tune a DistilBERT text classifier with ktrain on four
# categories of the 20 Newsgroups dataset, then classify one new sentence.
# The `# In[n]:` markers are the original notebook cell boundaries.

# In[1]:

# The IPython magics below only exist inside an IPython/Jupyter session.
# Look the shell up once and skip the magics when the file is executed with a
# plain `python` interpreter, where `get_ipython` is not defined.
try:
    _shell = get_ipython()
except NameError:
    _shell = None

if _shell is not None:
    _shell.run_line_magic('reload_ext', 'autoreload')
    _shell.run_line_magic('autoreload', '2')
    _shell.run_line_magic('matplotlib', 'inline')

import os

# Set before ktrain is imported so the GPU selection is already in the
# environment when the deep-learning backend loads.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:

import ktrain
from ktrain import text


# ## Load the Data Into Arrays

# In[3]:

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

from sklearn.datasets import fetch_20newsgroups

train_b = fetch_20newsgroups(subset='train', categories=categories,
                             shuffle=True, random_state=42)
test_b = fetch_20newsgroups(subset='test', categories=categories,
                            shuffle=True, random_state=42)

# The 'test' subset is what gets passed on as validation data further down,
# hence the "validation set" wording in the message.
print(f"size of training set: {len(train_b['data'])}")
print(f"size of validation set: {len(test_b['data'])}")
print(f"classes: {train_b.target_names}")

x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target


# ## STEP 1: Preprocess Data

# In[4]:

# Returns the preprocessed training data, the preprocessed validation data,
# and the preprocessor object that is reused for prediction at the end.
trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                          x_test=x_test, y_test=y_test,
                                          class_names=train_b.target_names,
                                          preprocess_mode='distilbert',
                                          maxlen=350)


# ## STEP 2: Build a Model and Wrap in Learner

# In[5]:

text.print_text_classifiers()


# In[6]:

model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)


# In[7]:

learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                             batch_size=6)


# ## STEP 3: Train Model

# In[8]:

# Arguments are positional: 3e-5 and 4 (learning rate and epoch count per the
# ktrain `fit_onecycle` signature).
learner.fit_onecycle(3e-5, 4)


# ## Predict on New Data

# In[9]:

p = ktrain.get_predictor(model, preproc)


# In[11]:

# In a notebook the returned prediction is displayed as the cell output; in a
# plain script run the return value is not printed.
p.predict("There is a problem with my computer monitor's resolution. Everything is blurry.")


# In[ ]: