#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# ## Text Classification Example: Sentiment Analysis with IMDb Movie Reviews
#
# We will begin by importing some required modules for performing text classification in *ktrain*.

# In[2]:

import ktrain
from ktrain import text


# Next, we will load and preprocess the text data for training and validation. *ktrain* can load texts and associated labels from a variety of sources:
#
# - `texts_from_folder`: labels are represented as subfolders containing text files [ [example notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/IMDb-BERT.ipynb) ]
# - `texts_from_csv`: texts and associated labels are stored in columns of a CSV file [ [example notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/toxic_comments-fasttext.ipynb) ]
# - `texts_from_df`: texts and associated labels are stored in columns of a *pandas* DataFrame [ [example notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/ArabicHotelReviews-nbsvm.ipynb) ]
# - `texts_from_array`: texts and labels are loaded and preprocessed from arrays [ [example notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/20newsgroup-distilbert.ipynb) ]
#
# For `texts_from_csv` and `texts_from_df`, labels can either be multi- or one-hot-encoded with one column per class, or stored in a single column of integers or strings like this:
# ```python
# # my_training_data.csv
# TEXT,LABEL
# I like this movie,positive
# I hate this movie,negative
# ```
#
# For `texts_from_array`, the labels are arrays in one of the following forms:
# ```python
# # string labels
# y_train = ['negative', 'positive']
# # integer labels
# y_train = [0, 1]  # indices must start from 0
# # multi- or one-hot-encoded labels (used for multi-label problems)
# y_train = [[1,0], [0,1]]
# ```
#
# In the latter two cases, you must supply a `class_names` argument to `texts_from_array`, which tells *ktrain* how indices map to class names. In this case, `class_names=['negative', 'positive']` because 0=negative and 1=positive.
#
# Sample arrays for `texts_from_array` might look like this:
# ```python
# x_train = ['I hate this movie.', 'I like this movie.']
# y_train = ['negative', 'positive']
# x_test = ['I despise this movie.', 'I love this movie.']
# y_test = ['negative', 'positive']
# ```
#
# All of the above methods transform the texts into a sequence of word IDs in one way or another, as expected by neural network models.
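#
# To make this concrete, here is a minimal sketch (not executed in this notebook) of how the sample arrays above might be passed to `texts_from_array`. The keyword arguments shown are assumptions based on the usage described in this tutorial, so consult the *ktrain* documentation for the exact signature:
# ```python
# # hypothetical call using the sample arrays defined above
# trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
#                                           x_test=x_test, y_test=y_test,
#                                           class_names=['negative', 'positive'],
#                                           preprocess_mode='standard',
#                                           maxlen=50)
# ```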
#
# In this first example problem, we use the ```texts_from_folder``` function to load documents as fixed-length sequences of word IDs from a folder of raw documents. This function assumes a directory structure like the following:
#
# ```
# ├── datadir
# │   ├── train
# │   │   ├── class0       # folder containing documents of class 0
# │   │   ├── class1       # folder containing documents of class 1
# │   │   ├── class2       # folder containing documents of class 2
# │   │   └── classN       # folder containing documents of class N
# │   └── test
# │       ├── class0       # folder containing documents of class 0
# │       ├── class1       # folder containing documents of class 1
# │       ├── class2       # folder containing documents of class 2
# │       └── classN       # folder containing documents of class N
# ```
#
# Each subfolder contains documents in plain text format (e.g., `.txt` files) pertaining to the class represented by that subfolder.
#
# For our text classification example, we will again classify IMDb movie reviews as either positive or negative. However, instead of using the pre-processed version of the dataset pre-packaged with Keras, we will use the original (or raw) *aclImdb* dataset. The dataset can be downloaded from [here](http://ai.stanford.edu/~amaas/data/sentiment/). Set the ```DATADIR``` variable to the location of the extracted *aclImdb* folder.
#
# In the cell below, note that we supplied `preprocess_mode='standard'` to the data-loading function (which is the default). For pretrained models like BERT and DistilBERT, the dataset must be preprocessed in a specific way. If you are planning to use BERT for text classification, you should replace this argument with `preprocess_mode='bert'`. Since we will not be using BERT in this example, we leave it as `preprocess_mode='standard'`. See [this notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/IMDb-BERT.ipynb) for an example of how to use BERT for text classification in *ktrain*. There is also a [DistilBERT example notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/20newsgroup-distilbert.ipynb).
#
# **NOTE:** If using `preprocess_mode='bert'` or `preprocess_mode='distilbert'`, an English pretrained model is used for English, a Chinese pretrained model is used for Chinese, and a multilingual pretrained model is used for all other languages. For more flexibility in choosing the model used, you can use the alternative [Transformer API for text classification](https://github.com/amaiya/ktrain/blob/master/tutorials/tutorial-A3-hugging_face_transformers.ipynb) in *ktrain*.
#
# Please also note that, when specifying `preprocess_mode='distilbert'`, the first two return values are `TransformerDataset` objects, not Numpy arrays. So, it is best to always use `trn, val, preproc` on the left-hand side of the expression (instead of `(x_train, y_train), (x_test, y_test), preproc`) to avoid confusion, as shown below.

# In[3]:

# load training and validation data from a folder
DATADIR = 'data/aclImdb'
trn, val, preproc = text.texts_from_folder(DATADIR,
                                           max_features=80000, maxlen=2000,
                                           ngram_range=3,
                                           preprocess_mode='standard',
                                           classes=['pos', 'neg'])
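
# If you did want to use BERT on this same dataset, the call above might instead look like the following minimal sketch (not executed in this notebook). The smaller `maxlen` is an assumption reflecting the limited number of tokens BERT-based models accept per document; see the IMDb-BERT example notebook linked above for the exact settings used there:
# ```python
# trn, val, preproc = text.texts_from_folder(DATADIR,
#                                            maxlen=500,
#                                            preprocess_mode='bert',
#                                            classes=['pos', 'neg'])
# ```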
#
# Having loaded the data, we will now create a text classification model. The `print_text_classifiers` function prints the available models. The model selected should be consistent with the `preprocess_mode` selected above.
#
# (As mentioned above, one can also use the alternative `Transformer` API for text classification in *ktrain* to access an even larger library of Hugging Face Transformer models like RoBERTa and XLNet. See [this tutorial](https://github.com/amaiya/ktrain/blob/master/tutorials/tutorial-A3-hugging_face_transformers.ipynb) for more information on this.)
#
# In this example, the `text_classifier` function will return a [neural implementation of NBSVM](https://medium.com/@asmaiya/a-neural-implementation-of-nbsvm-in-keras-d4ef8c96cb7c), which is a strong baseline that can outperform more complex neural architectures. It may take a few moments to return, as it builds a document-term matrix from the input data we provide it. The ```text_classifier``` function expects `trn` to be a preprocessed training set returned by the `texts_from*` function above. In this case, where we have used `preprocess_mode='standard'`, `trn` holds each document as a fixed-length sequence of word IDs in a numpy array.

# In[4]:

text.print_text_classifiers()


# In[5]:

# load an NBSVM model
model = text.text_classifier('nbsvm', trn, preproc=preproc)


# Next, we instantiate a Learner object and call the ```lr_find``` and ```lr_plot``` methods to help identify a good learning rate.

# In[6]:

learner = ktrain.get_learner(model, train_data=trn, val_data=val)


# In[25]:

learner.lr_find()


# In[26]:

learner.lr_plot()


# Finally, we will fit our model using an [SGDR learning rate schedule](https://github.com/amaiya/ktrain/blob/master/example-02-tuning-learning-rates.ipynb) by invoking the ```fit``` method with the *cycle_len* parameter (along with the *cycle_mult* parameter).

# In[27]:

learner.fit(0.001, 3, cycle_len=1, cycle_mult=2)


# #### As can be seen, our final model yields a validation accuracy of 92.27%.
#
# ### Making Predictions
#
# Let's predict the sentiment of new movie reviews (or comments in this case) using our trained model.
#
# The ```preproc``` object (returned by ```texts_from_folder```) is important here, as it is used to preprocess data in the way our model expects.

# In[8]:

predictor = ktrain.get_predictor(learner.model, preproc)


# In[20]:

# sample comments to classify
data = ['This movie was horrible! The plot was boring. Acting was okay, though.',
        'The film really sucked. I want my money back.',
        'What a beautiful romantic comedy. 10/10 would see again!']


# In[21]:

predictor.predict(data)


# As can be seen, our model returns predictions that appear to be correct. The predictor instance can also be used to return the "probabilities" of our predictions with respect to each class. Let us first print the classes and their order. The class *pos* stands for positive sentiment, and *neg* stands for negative sentiment. Then, we will re-run ```predictor.predict``` with *return_proba=True* to see the probabilities.

# In[22]:

predictor.get_classes()


# In[23]:

predictor.predict(data, return_proba=True)


# For text classifiers, there is also `predictor.predict_proba`, which simply calls `predict` with `return_proba=True`.
#
# Our movie review sentiment predictor can be saved to disk and reloaded/re-used later as part of an application. This is illustrated below:

# In[14]:

predictor.save('/tmp/my_moviereview_predictor')


# In[15]:

predictor = ktrain.load_predictor('/tmp/my_moviereview_predictor')


# In[17]:

predictor.predict(['Groundhog Day is my favorite movie of all time!'])


# Note that both the `load_predictor` and `get_predictor` functions accept an optional `batch_size` argument that is set to 32 by default. The `batch_size` can also be set manually on the `Predictor` instance.
# That is, the `batch_size` used for inference and predictions can be increased in either of the following ways:
# ```python
# # you can set the batch_size as an argument to load_predictor (or get_predictor)
# predictor = ktrain.load_predictor('/tmp/my_moviereview_predictor', batch_size=128)
#
# # you can also set the batch_size used for predictions this way
# predictor.batch_size = 128
# ```
# Larger batch sizes can potentially speed up predictions when `predictor.predict` is supplied with a list of examples.

# ## Multi-Label Text Classification: Identifying Toxic Online Comments
#
# In the previous example, the classes (or categories) were mutually exclusive. By contrast, in multi-label text classification, a document or text snippet can belong to multiple classes. Here, we will classify Wikipedia comments into one or more categories of so-called *toxic comments*. Categories of toxic online behavior include toxic, severe_toxic, obscene, threat, insult, and identity_hate. The dataset can be downloaded from the [Kaggle Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data) as a CSV file (i.e., download the file ```train.csv```). We will load the data using the ```texts_from_csv``` function. This function expects one column to contain the texts of documents and one or more other columns to store the labels. Labels can be in any of the following formats:
#
# ```
# 1. one-hot-encoded arrays representing the classes, with a single 1 in each row:
#
#    Binary classification (two classes):
#       text|positive|negative
#       I like this movie.|1|0
#       I hated this movie.|0|1
#
#    Multiclass classification (more than two classes):
#       text|negative|neutral|positive
#       I hated this movie.|1|0|0    # negative
#       I loved this movie.|0|0|1    # positive
#       I saw the movie.|0|1|0       # neutral
#
# 2. multi-hot-encoded arrays representing the classes, with one or more 1s in each row
#    (used for multi-label classification):
#       text|politics|television|sports
#       I will vote in 2020.|1|0|0               # politics
#       I watched the debate on CNN.|1|1|0       # politics and television
#       Did you watch the game on ESPN?|0|1|1    # sports and television
#       I play basketball.|0|0|1                 # sports
#
# 3. labels in a single column of string or integer values representing class labels
#    (example with label_columns=['label'] and text_column='text'):
#       text|label
#       I like this movie.|positive
#       I hated this movie.|negative
# ```
#
# Since the Toxic Comment Classification Challenge is a multi-label problem, we must use the second format, where labels are already multi-hot-encoded. Luckily, the `train.csv` file for this problem is already multi-hot-encoded, so no extra processing is required.
#
# Since we set `val_filepath=None`, 10% of the data will automatically be used as a validation set.

# In[14]:

DATA_PATH = 'data/toxic-comments/train.csv'
NUM_WORDS = 50000
MAXLEN = 150
trn, val, preproc = text.texts_from_csv(DATA_PATH,
                                        'comment_text',
                                        label_columns=["toxic", "severe_toxic", "obscene",
                                                       "threat", "insult", "identity_hate"],
                                        val_filepath=None,  # if None, 10% of data will be used for validation
                                        max_features=NUM_WORDS, maxlen=MAXLEN,
                                        ngram_range=1)
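
# For reference, loading a CSV that uses the third label format above (a single label column) might look like the following minimal sketch (not executed here); `my_training_data.csv` is the hypothetical file from the beginning of this tutorial, and the remaining keyword arguments simply mirror the call above:
# ```python
# trn, val, preproc = text.texts_from_csv('my_training_data.csv',
#                                         'TEXT',                   # column containing the document texts
#                                         label_columns=['LABEL'],  # single column of string labels
#                                         val_filepath=None,
#                                         max_features=NUM_WORDS, maxlen=MAXLEN)
# ```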
#
# Next, as before, we load a text classification model and wrap the model and data in a Learner object. Instead of using the NBSVM model, we will explicitly request a different model called 'fasttext' using the ```name``` parameter of ```text_classifier```. The fastText architecture was created by [Facebook](https://arxiv.org/abs/1607.01759) in 2016. (You can call ```print_text_classifiers``` to show the available text classification models.)

# In[15]:

text.print_text_classifiers()


# In[20]:

model = text.text_classifier('fasttext', trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val)


# As before, we use our learning rate finder to identify a good learning rate. In this case, a learning rate of 0.0007 appears to be good.

# In[21]:

learner.lr_find()


# In[23]:

learner.lr_plot()


# Finally, we will train our model for 8 epochs using ```autofit``` with a learning rate of 0.0007. Because we explicitly specify the number of epochs, ```autofit``` will automatically employ a triangular learning rate policy. Our final ROC-AUC score is **0.98**.
#
# As shown in [this example notebook](https://github.com/amaiya/ktrain/blob/master/examples/text/toxic_comments-bigru.ipynb) on our GitHub project, even better results can be obtained using a bidirectional GRU with pretrained word vectors (called 'bigru' in *ktrain*).

# In[24]:

learner.autofit(0.0007, 8)


# #### Let's compute the ROC-AUC of our final model for identifying toxic online behavior:

# In[25]:

from sklearn.metrics import roc_auc_score

# val holds the validation set returned by texts_from_csv above
x_val, y_val = val
y_pred = learner.model.predict(x_val, verbose=0)
score = roc_auc_score(y_val, y_pred)
print("\n ROC-AUC score: %.6f \n" % (score))


# ### Making Predictions
#
# As before, let's make some predictions about toxic comments using our model by wrapping it in a Predictor instance.

# In[26]:

predictor = ktrain.get_predictor(learner.model, preproc)


# In[30]:

# correctly predict a toxic comment that includes a threat
predictor.predict(["If you don't stop immediately, I will kill you."])


# In[36]:

# non-toxic comment
predictor.predict(["Okay - I'm calling it a night. See you tomorrow."])


# In[31]:

predictor.save('/tmp/toxic_detector')


# In[33]:

predictor = ktrain.load_predictor('/tmp/toxic_detector')


# In[45]:

# the model works correctly and as expected after reloading from disk
predictor.predict(["You have a really ugly face."])


# ## The `Transformers` API in *ktrain*
#
# If using transformer models like BERT, DistilBERT, or RoBERTa, *ktrain* includes an alternative API for text classification, which allows the use of **any** Hugging Face `transformers` model. This API can be used as follows:
#
# ```python
# import ktrain
# from ktrain import text
# MODEL_NAME = 'bert-base-uncased'
# t = text.Transformer(MODEL_NAME, maxlen=500, classes=label_list)
# trn = t.preprocess_train(x_train, y_train)
# val = t.preprocess_test(x_test, y_test)
# model = t.get_classifier()
# learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
# learner.fit_onecycle(3e-5, 1)
# ```
#
# Note that `x_train` and `x_test` are the raw texts here:
# ```python
# x_train = ['I hate this movie.', 'I like this movie.']
# ```
# Similar to `texts_from_array`, the labels are arrays in one of the following forms:
# ```python
# # string labels
# y_train = ['negative', 'positive']
# # integer labels
# y_train = [0, 1]
# # multi- or one-hot-encoded labels
# y_train = [[1,0], [0,1]]
# ```
# In the latter two cases, you must supply a `class_names` argument to the `Transformer` constructor, which tells *ktrain* how indices map to class names. In this case, `class_names=['negative', 'positive']` because 0=negative and 1=positive.
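#
# Once trained, making and saving predictions works the same way as in the earlier examples. As a minimal sketch (assuming the `learner` and preprocessor `t` from the snippet above, and following the Predictor pattern used elsewhere in this tutorial):
# ```python
# predictor = ktrain.get_predictor(learner.model, preproc=t)
# predictor.predict('I hated this movie.')          # returns the predicted class label
# predictor.save('/tmp/my_transformer_predictor')   # reload later with ktrain.load_predictor
# ```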
#
# For an example, see [this notebook](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/examples/text/ArabicHotelReviews-AraBERT.ipynb), which builds an Arabic sentiment analysis model using [AraBERT](https://huggingface.co/aubmindlab/bert-base-arabert).
#
# For more information, see our tutorial on [text classification with Hugging Face Transformers](https://github.com/amaiya/ktrain/blob/master/tutorials/tutorial-A3-hugging_face_transformers.ipynb).
#
# You may also be interested in some of our blog posts on text classification:
# - [Text Classification With Hugging Face Transformers in TensorFlow 2 (Without Tears)](https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed)
# - [BERT Text Classification in 3 Lines of Code](https://towardsdatascience.com/bert-text-classification-in-3-lines-of-code-using-keras-264db7e7a358)

# In[ ]: