#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# Here, we will classify Wikipedia comments into one or more categories of so-called *toxic comments*.
# Categories of toxic online behavior include toxic, severe_toxic, obscene, threat, insult, and identity_hate.
# The dataset can be downloaded from the [Kaggle Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)
# as a CSV file (i.e., download the file ```train.csv```). We will load the data using the ```texts_from_csv``` method,
# which assumes the label_columns are already one-hot-encoded in the spreadsheet.
# Since *val_filepath* is None, 10% of the data will automatically be used as a validation set.

# In[3]:


DATA_PATH = 'data/toxic-comments/train.csv'
NUM_WORDS = 50000
MAXLEN = 150
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(
    DATA_PATH,
    'comment_text',
    label_columns=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
    val_filepath=None,  # if None, 10% of data will be used for validation
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=1)


# In[4]:


model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test))


# In[5]:


learner.lr_find()
learner.lr_plot()


# In[6]:


learner.autofit(0.001)


# In[ ]:
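

# The cell below is a minimal sketch of how the trained model could be applied to new data, assuming
# the training above has completed. `ktrain.get_predictor` wraps the model together with the `preproc`
# object so that raw text can be classified directly; the example comment string and save path are
# made up for illustration.

# wrap model and preprocessor so raw text can be classified directly
predictor = ktrain.get_predictor(learner.model, preproc)

# predict the toxic-comment categories (if any) for a new, unseen comment
predictor.predict(['You are a wonderful person and I appreciate your edits.'])

# probabilities for each of the six categories instead of label names
predictor.predict(['You are a wonderful person and I appreciate your edits.'], return_proba=True)

# the predictor can also be saved and reloaded later for deployment, e.g.:
# predictor.save('/tmp/toxic_predictor')            # illustrative path
# predictor = ktrain.load_predictor('/tmp/toxic_predictor')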