#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('reload_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') get_ipython().run_line_magic('matplotlib', 'inline') import os os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"; os.environ["CUDA_VISIBLE_DEVICES"]="0"; # In[2]: import ktrain from ktrain import text # # Building an Arabic Sentiment Analyzer # # In this notebook, we will build a simple, fast, and accurate Arabic-language text classification model in 4 simple steps. More specifically, we will build a model that classifies Arabic hotel reviews as either positive or negative. # # The dataset can be downloaded from Ashraf Elnagar's GitHub repository (https://github.com/elnagara/HARD-Arabic-Dataset). # # Each entry in the dataset includes a review in Arabic and a rating between 1 and 5. We will convert this to a binary classification dataset by assigning reviews with a rating of above 3 a positive label of 1 and assigning reviews with a rating of less than 3 a negative label of 0. # # (**Disclaimer:** I don't speak Arabic. Please forgive mistakes.) # # # In[3]: # convert ratings to a binary format: 1=positive, 0=negative import pandas as pd df = pd.read_csv('data/arabic_hotel_reviews/balanced-reviews.txt', delimiter='\t', encoding='utf-16') df = df[['rating', 'review']] df['rating'] = df['rating'].apply(lambda x: 'neg' if x < 3 else 'pos') df.columns = ['label', 'text'] df = pd.concat([df, df.label.astype('str').str.get_dummies()], axis=1, sort=False) df = df[['text', 'neg', 'pos']] df.head() # ## STEP 1: Load and Preprocess the Data # # First, we use the `texts_from_df` function to load and preprocess the data in to arrays that can be directly fed into a neural network model. # # We set `val_pct` as 0.1, which will automatically sample 10% of the data for validation. We specifiy `preprocess_mode='bert'`, as we will fine-tuning a BERT model in this example. If using a different model, you will select `preprocess_mode='standard'`. # # **Notice that there is nothing speical or extra we need to do here for non-English text.** *ktrain* automatically detects the language and character encoding and prepares the data and configures the model appropriately. # # # In[4]: (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(df, 'text', # name of column containing review text label_columns=['neg', 'pos'], maxlen=75, max_features=100000, preprocess_mode='bert', val_pct=0.1) # ## STEP 2: Create a Model and Wrap in Learner Object # We will employ a neural implementation of the [NBSVM](https://www.aclweb.org/anthology/P12-2018/). # In[5]: model = text.text_classifier('bert', (x_train, y_train) , preproc=preproc) learner = ktrain.get_learner(model, train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=32) # ## STEP 3: Train the Model # # We will use the `fit_onecycle` method that employs a 1cycle learning rate policy and train 1 epoch. # # As shown in the cell below, our final validation accuracy is **95.53%** over a single epoch! 
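# Before committing to a learning rate, we could optionally use *ktrain*'s learning-rate finder to sanity-check the choice of `2e-5`. The cell below is a minimal, optional sketch (it was not run in the original notebook): `lr_find` simulates training across a range of increasing learning rates, and `lr_plot` displays the resulting loss curve so you can pick a rate near the point of steepest decline.

# In[ ]:


# optional: estimate a good learning rate before training
learner.lr_find(show_plot=False)  # simulate training over increasing learning rates
learner.lr_plot()                 # plot loss vs. learning rate to choose a value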
# In[6]:


learner.fit_onecycle(2e-5, 1)


# ### Making Predictions on New Data

# In[9]:


p = ktrain.get_predictor(learner.model, preproc)


# Predicting the label for the text:
# > "*The room was clean, the food excellent, and I loved the view from my room.*"

# In[10]:


p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")


# Predicting the label for the text:
# > "*This hotel was too expensive and the staff is rude.*"

# In[11]:


p.predict('كان هذا الفندق باهظ الثمن والموظفين غير مهذبين.')


# ### Save our Predictor for Later Deployment

# In[12]:


# save the predictor for later use
p.save('/tmp/arabic_predictor')


# In[13]:


# reload the predictor from disk
p = ktrain.load_predictor('/tmp/arabic_predictor')


# In[14]:


# the reloaded predictor still works as expected after restoring from disk
p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")
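# As a final, optional sketch (not part of the original notebook), the predictor can also return class probabilities rather than a hard label, which is useful when deploying the model behind a confidence threshold. Both `get_classes` and the `return_proba` argument are part of the ktrain predictor API.

# In[ ]:


# inspect the label ordering and obtain class probabilities instead of a hard label
print(p.get_classes())  # e.g., ['neg', 'pos']
probs = p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.",
                  return_proba=True)  # probabilities aligned with get_classes()
print(probs)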