#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('reload_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') get_ipython().run_line_magic('matplotlib', 'inline') import os os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"; os.environ["CUDA_VISIBLE_DEVICES"]="0"; # In[2]: import ktrain from ktrain import text # # Building an Arabic Sentiment Analyzer # # In this notebook, we will build a simple, fast, and accurate Arabic-language text classification model in 4 simple steps. More specifically, we will build a model that classifies Arabic hotel reviews as either positive or negative. # # The dataset can be downloaded from Ashraf Elnagar's GitHub repository (https://github.com/elnagara/HARD-Arabic-Dataset). # # Each entry in the dataset includes a review in Arabic and a rating between 1 and 5. We will convert this to a binary classification dataset by assigning reviews with a rating of above 3 a positive label of 1 and assigning reviews with a rating of less than 3 a negative label of 0. # # (**Disclaimer:** I don't speak Arabic. Please forgive mistakes.) # # # In[3]: # convert ratings to a binary format: 1=positive, 0=negative import pandas as pd df = pd.read_csv('data/arabic_hotel_reviews/balanced-reviews.txt', delimiter='\t', encoding='utf-16') df = df[['rating', 'review']] df['rating'] = df['rating'].apply(lambda x: 'neg' if x < 3 else 'pos') df.columns = ['label', 'text'] df = pd.concat([df, df.label.astype('str').str.get_dummies()], axis=1, sort=False) df = df[['text', 'neg', 'pos']] df.head() # ## STEP 1: Load and Preprocess the Data # # First, we use the `texts_from_df` function to load and preprocess the data in to arrays that can be directly fed into a neural network model. # # We set `val_pct` as 0.1, which will automatically sample 10% of the data for validation. We specifiy `preprocess_mode='bert'`, as we will fine-tuning a BERT model in this example. If using a different model, you will select `preprocess_mode='standard'`. # # **Notice that there is nothing speical or extra we need to do here for non-English text.** *ktrain* automatically detects the language and character encoding and prepares the data and configures the model appropriately. # # # In[4]: (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(df, 'text', # name of column containing review text label_columns=['neg', 'pos'], maxlen=75, max_features=100000, preprocess_mode='bert', val_pct=0.1) # ## STEP 2: Create a Model and Wrap in Learner Object # We will employ a neural implementation of the [NBSVM](https://www.aclweb.org/anthology/P12-2018/). # In[5]: model = text.text_classifier('bert', (x_train, y_train) , preproc=preproc) learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=32) # ## STEP 3: Train the Model # # We will use the `fit_onecycle` method that employs a 1cycle learning rate policy and train 1 epoch. # # As shown in the cell below, our final validation accuracy is **95.53%** over a single epoch! 
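# Before committing to a learning rate, we could optionally use *ktrain*'s learning-rate finder to sanity-check the choice of `2e-5`. The cell below is a minimal, optional sketch (it was not run in the original notebook): `lr_find` simulates training across a range of increasing learning rates, and `lr_plot` displays the resulting loss curve so you can pick a rate near the point of steepest decline.

# In[ ]:


# optional: estimate a good learning rate before training
learner.lr_find(show_plot=False)  # simulate training over increasing learning rates
learner.lr_plot()                 # plot loss vs. learning rate to choose a value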
# In[6]:


learner.fit_onecycle(2e-5, 1)


# ### Making Predictions on New Data

# In[9]:


p = ktrain.get_predictor(learner.model, preproc)


# Predicting the label for the text:
# > "*The room was clean, the food excellent, and I loved the view from my room.*"

# In[10]:


p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")


# Predicting the label for the text:
# > "*This hotel was too expensive and the staff is rude.*"

# In[11]:


p.predict('كان هذا الفندق باهظ الثمن والموظفين غير مهذبين.')


# ### Save our Predictor for Later Deployment

# In[12]:


# save the predictor for later use
p.save('/tmp/arabic_predictor')


# In[13]:


# reload the predictor from disk
p = ktrain.load_predictor('/tmp/arabic_predictor')


# In[14]:


# the reloaded predictor still works as expected after restoring from disk
p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")
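# As a final, optional sketch (not part of the original notebook), the predictor can also return class probabilities rather than a hard label, which is useful when deploying the model behind a confidence threshold. Both `get_classes` and the `return_proba` argument are part of the ktrain predictor API.

# In[ ]:


# inspect the label ordering and obtain class probabilities instead of a hard label
print(p.get_classes())  # e.g., ['neg', 'pos']
probs = p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.",
                  return_proba=True)  # probabilities aligned with get_classes()
print(probs)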