#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# # Building a Chinese-Language Sentiment Analyzer
# 
# In this notebook, we will build a Chinese-language text classification model in 3 simple steps. More specifically, we will build a model that classifies Chinese hotel reviews as either positive or negative.
# 
# The dataset can be downloaded from Chengwei Zhang's GitHub repository [here](https://github.com/Tony607/Chinese_sentiment_analysis/tree/master/data/ChnSentiCorp_htl_ba_6000).
# 
# (**Disclaimer:** I don't speak Chinese. Please forgive mistakes.)

# ## STEP 1: Load and Preprocess the Data
# 
# First, we use the `texts_from_folder` function to load and preprocess the data. We assume the data is in the following form:
# ```
# ├── datadir
# │   ├── train
# │   │   ├── class0       # folder containing documents of class 0
# │   │   ├── class1       # folder containing documents of class 1
# │   │   ├── class2       # folder containing documents of class 2
# │   │   └── classN       # folder containing documents of class N
# ```
# We set `val_pct` to 0.1, which automatically samples 10% of the data for validation. Since we will be using a pretrained BERT model for classification, we specify `preprocess_mode='bert'`. If you are using any other model (e.g., `fasttext`), you should either omit this parameter or use `preprocess_mode='standard'`.
# 
# **Notice that there is nothing special or extra we need to do here for non-English text.** *ktrain* automatically detects the language and character encoding, prepares the data, and configures the model appropriately.

# In[3]:


trn, val, preproc = text.texts_from_folder('/home/amaiya/data/ChnSentiCorp_htl_ba_6000',
                                           maxlen=75,
                                           max_features=30000,
                                           preprocess_mode='bert',
                                           train_test_names=['train'],
                                           val_pct=0.1,
                                           classes=['pos', 'neg'])


# ## STEP 2: Create a Model and Wrap in Learner Object

# In[4]:


model = text.text_classifier('bert', trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)


# ## STEP 3: Train the Model
# 
# We will use the `fit_onecycle` method, which employs a [1cycle learning rate policy](https://arxiv.org/pdf/1803.09820.pdf), for four epochs. We save the weights from each epoch using the `checkpoint_folder` argument, so that we can reload the weights from the best epoch in case we overfit.

# In[5]:


learner.fit_onecycle(2e-5, 4, checkpoint_folder='/tmp/saved_weights')


# Although Epoch 03 had the lowest validation loss, the validation accuracy at the end of the final epoch is still the highest (i.e., **93.24%**), so we will leave the model weights as they are this time.
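# If the final weights *had* overfit, we could restore the checkpoint saved after the best epoch before continuing. The cell below is a small sketch that is not part of the original notebook: the checkpoint filename `weights-03.hdf5` is an assumption about how ktrain names files in the `checkpoint_folder` (list the folder to confirm for your version), and `learner.validate` is ktrain's helper for printing a validation report.

# In[ ]:


# Optional recovery step: reload the Epoch 03 checkpoint (lowest validation loss)
# instead of keeping the final-epoch weights. The filename below is an assumption;
# verify the actual filenames in /tmp/saved_weights before enabling this.
RESTORE_BEST_EPOCH = False  # set to True to reload the Epoch 03 checkpoint
if RESTORE_BEST_EPOCH:
    learner.model.load_weights('/tmp/saved_weights/weights-03.hdf5')
    learner.validate(class_names=preproc.get_classes())  # re-check validation performance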
# ### Inspecting Misclassifications

# In[7]:


learner.view_top_losses(n=1, preproc=preproc)


# Using Google Translate, the above roughly translates to:
# ```
# Hotel location is good, access to West Street is more convenient; viewing room is the view of Guanxi Street, although the night is noisy, but it will not affect sleep; the opposite side of the hotel is the Jiujiu car line booking bicycles on Ctrip, easy to pick up. By Ctrip
# ```
# 
# Although there is a minor negative comment embedded in this review about noise, the review appears to be positive overall and was predicted as positive by our classifier. The ground-truth label, however, is negative, which may be a labeling mistake and may explain the high loss.

# ### Making Predictions on New Data

# In[8]:


p = ktrain.get_predictor(learner.model, preproc)


# Predicting the label for the text:
# > "*The view and service of this hotel were terrible and our room was dirty.*"

# In[9]:


p.predict("这家酒店的看法和服务都很糟糕,我们的房间很脏。")


# Predicting the label for the text:
# > "*I like the service of this hotel.*"

# In[10]:


p.predict('我喜欢这家酒店的服务')


# ### Save Predictor for Later Deployment

# In[11]:


p.save('/tmp/mypred')


# In[12]:


p = ktrain.load_predictor('/tmp/mypred')


# In[13]:


# predictions still work after reloading the predictor
p.predict('我喜欢这家酒店的服务')
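# The reloaded predictor also accepts a list of texts, which is convenient for batch scoring at deployment time. The cell below is a small sketch that is not part of the original notebook; it assumes the predictor's `get_classes` and `predict_proba` methods are available, so adjust if your ktrain version differs.

# In[ ]:


# score a small batch of reviews and inspect the per-class probabilities
reviews = ['我喜欢这家酒店的服务',                          # "I like the service of this hotel."
           '这家酒店的看法和服务都很糟糕,我们的房间很脏。']  # the negative review from above
print(p.get_classes())           # class order, e.g. ['neg', 'pos']
print(p.predict(reviews))        # predicted labels for the batch
print(p.predict_proba(reviews))  # per-class probabilities for each review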