#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os

# restrict training to the first GPU
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# # Building a Chinese-Language Sentiment Analyzer
# 
# In this notebook, we will build a Chinese-language text classification model in 4 simple steps. More specifically, we will build a model that classifies Chinese hotel reviews as either positive or negative.
# 
# The dataset can be downloaded from Chengwei Zhang's GitHub repository [here](https://github.com/Tony607/Chinese_sentiment_analysis/tree/master/data/ChnSentiCorp_htl_ba_6000).
# 
# (**Disclaimer:** I don't speak Chinese. Please forgive any mistakes.)

# ## STEP 1: Load and Preprocess the Data
# 
# First, we use the `texts_from_folder` function to load and preprocess the data. We assume that the data is in the following form:
# ```
# ├── datadir
# │   ├── train
# │   │   ├── class0       # folder containing documents of class 0
# │   │   ├── class1       # folder containing documents of class 1
# │   │   ├── class2       # folder containing documents of class 2
# │   │   └── classN       # folder containing documents of class N
# ```
# We set `val_pct` to 0.1, which automatically samples 10% of the data for validation. We specify `preprocess_mode='standard'` to employ standard text preprocessing. If you are using the BERT model (i.e., 'bert'), you should use `preprocess_mode='bert'` instead.
# 
# **Notice that there is nothing special or extra we need to do here for non-English text.** *ktrain* automatically detects the language and character encoding, then prepares the data and configures the model appropriately (we confirm the detected language below).

# In[3]:


(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder('data/ChnSentiCorp_htl_ba_6000',
                                                                       maxlen=100,
                                                                       max_features=30000,
                                                                       preprocess_mode='standard',
                                                                       train_test_names=['train'],
                                                                       val_pct=0.1,
                                                                       ngram_range=3,
                                                                       classes=['pos', 'neg'])


# ## STEP 2: Create a Model and Wrap in Learner Object

# In[4]:


model = text.text_classifier('nbsvm', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=32)


# ## STEP 3: Estimate the Learning Rate
# 
# We'll use the *ktrain* learning rate finder to find a good learning rate for *nbsvm*. We then select the highest learning rate associated with a still-falling loss.

# In[5]:


learner.lr_find(show_plot=True)


# ## STEP 4: Train the Model
# 
# We will use the `autofit` method, which employs a triangular learning rate policy with EarlyStopping and ReduceLROnPlateau automatically enabled since the `epochs` argument is omitted. We monitor `val_acc`, so the weights from the epoch with the highest validation accuracy are automatically loaded into our model when training completes.
# 
# As shown in the cell below, our final validation accuracy is **92%** with only 7 seconds of training!

# In[6]:


learner.autofit(7e-3, monitor='val_acc')


# In[7]:


learner.validate(class_names=preproc.get_classes())


# ### Inspecting Misclassifications

# In[8]:


learner.view_top_losses(n=1, preproc=preproc)


# Using Google Translate, the above roughly translates to:
# ```
# The hotel environment is not bad, the decoration is also very good. Breakfast is not good, the price is high.
# ```
# 
# This is a mixed review, but it is labeled only as negative. Our classifier is understandably confused and predicts positive for this review.
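# Before moving on to predictions, we can double-check the claim from STEP 1 that *ktrain* auto-detected the language of our documents. This is a minimal sanity check that assumes the preprocessor exposes a `lang` attribute, as recent *ktrain* versions do:

# In[ ]:


# print the language code detected during preprocessing
# (assumes `preproc.lang` is available in this ktrain version)
print(preproc.lang)  # a Chinese language code such as 'zh-cn' is expected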
# ### Making Predictions on New Data

# In[9]:


p = ktrain.get_predictor(learner.model, preproc)


# Predicting the label for the text:
# > "*The view and service of this hotel were terrible and our room was dirty.*"

# In[10]:


p.predict("这家酒店的看法和服务都很糟糕,我们的房间很脏。")


# Predicting the label for:
# > "*I like the service of this hotel.*"

# In[11]:


p.predict('我喜欢这家酒店的服务')
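# Finally, the predictor can be saved to disk and reloaded later, e.g., in a deployment setting. The sketch below uses *ktrain*'s `predictor.save` and `ktrain.load_predictor` calls; the folder path is an arbitrary example:

# In[ ]:


# save the predictor (model weights + preprocessing pipeline) to a folder
p.save('/tmp/chinese_sentiment_predictor')

# reload it (e.g., in a different session) and predict as before
reloaded_predictor = ktrain.load_predictor('/tmp/chinese_sentiment_predictor')
reloaded_predictor.predict('我喜欢这家酒店的服务')

# class probabilities can also be returned if `predict` supports
# the `return_proba` flag (as in recent ktrain versions)
reloaded_predictor.predict('我喜欢这家酒店的服务', return_proba=True)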