#!/usr/bin/env python
# coding: utf-8

# ## Protodash Explanations for Text data
#
# In the example shown in this notebook, we train a text classifier based on the [UCI SMS dataset](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection) to distinguish 'SPAM' and 'HAM' (i.e., non-spam) SMS messages. We then use the ProtodashExplainer to obtain spam and ham prototypes based on the labels assigned by the text classifier.
#
# In order to run this notebook, please:
# 1. Download the [UCI SMS dataset](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection) and place the directory 'smsspamcollection' in the location of this notebook.
# 2. Place the GloVe embedding file "glove.6B.100d.txt" in the location of this notebook. It can be downloaded from [here](https://nlp.stanford.edu/projects/glove/).
# 3. Create 2 folders, "results" and "logs", in the location of this notebook (these are used to store model checkpoints and training logs).
# 4. The models trained in this notebook can also be accessed from [here](https://github.com/IBM/AIX360/tree/master/aix360/models/protodash) if required.

# ### Step 1. Train an LSTM classifier on the SMS dataset

# We train an LSTM model to label messages as spam / ham. The model is based on the following code: https://www.thepythoncode.com/article/build-spam-classifier-keras-python

# #### Import statements

# In[1]:

import warnings
warnings.filterwarnings('ignore')

import tqdm
import numpy as np
import keras_metrics  # for recall and precision metrics
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import model_from_json
from sklearn.model_selection import train_test_split
import time
import pickle
import os.path


# In[2]:

SEQUENCE_LENGTH = 100  # the length of all sequences (number of words per sample)
EMBEDDING_SIZE = 100   # using 100-dimensional GloVe embedding vectors
TEST_SIZE = 0.25       # ratio of the testing set

BATCH_SIZE = 64
EPOCHS = 20  # number of epochs

# to convert labels to integers and vice-versa
label2int = {"ham": 0, "spam": 1}
int2label = {0: "ham", 1: "spam"}


# In[3]:

import pandas as pd
combined_df = pd.read_csv('smsspamcollection/SMSSpamCollection.csv', delimiter='\t', header=None)
combined_df.columns = ['label', 'text']


# In[4]:

# extract the texts and labels as lists
X = combined_df['text'].values.tolist()
y = combined_df['label'].values.tolist()


# In[5]:

# Text tokenization
# vectorizing text, turning each text into a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# convert to sequences of integers
X = tokenizer.texts_to_sequences(X)


# In[6]:

# convert to numpy arrays
X = np.array(X)
y = np.array(y)

# pad sequences at the beginning of each sequence with 0's
# for example if SEQUENCE_LENGTH=4:
# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
# will be transformed to:
# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)


# In[7]:

y = [label2int[label] for label in y]
y = to_categorical(y)


# In[8]:

# split and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)
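# The comment above illustrates how padding works on toy sequences. As an optional sanity check (not part of the original notebook), a padded integer sequence can be mapped back to words with the tokenizer's `index_word` dictionary; the decoded text will differ from the original only in casing and punctuation because of the Tokenizer's default filters. The cell below is a minimal sketch, and `decode_sequence` is a hypothetical helper name.

# In[ ]:

# optional sanity check: decode one padded row back into words
# (decode_sequence is a hypothetical helper, not part of the original notebook)
def decode_sequence(seq, tokenizer):
    # index 0 is the padding value and has no associated word
    return " ".join(tokenizer.index_word[i] for i in seq if i != 0)

print(combined_df['text'].iloc[0])
print(decode_sequence(X[0], tokenizer))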
# #### Use GloVe embeddings

# In[9]:

def get_embedding_vectors(tokenizer, dim=100):
    embedding_index = {}
    with open(f"glove.6B.{dim}d.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found will be 0s
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


# In[10]:

def get_model(tokenizer, lstm_units):
    """
    Constructs the model:
    Embedding vectors => LSTM => 2 output fully-connected neurons with softmax activation
    """
    # get the GloVe embedding vectors
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
                        EMBEDDING_SIZE,
                        weights=[embedding_matrix],
                        trainable=False,
                        input_length=SEQUENCE_LENGTH))
    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile with the rmsprop optimizer
    # as well as with precision and recall metrics
    model.compile(optimizer="rmsprop",
                  loss="categorical_crossentropy",
                  metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall()])
    model.summary()
    return model


# In[11]:

# construct the model with 128 LSTM units
model = get_model(tokenizer=tokenizer, lstm_units=128)


# #### Train model or load trained model from disk

# In[12]:

to_train = False

if (to_train):
    # initialize our ModelCheckpoint and TensorBoard callbacks
    # model checkpoint for saving best weights
    model_checkpoint = ModelCheckpoint("results/spam_classifier_{val_loss:.2f}",
                                       save_best_only=True, verbose=1)
    # for better visualization
    tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")

    # print our data shapes
    print("X_train.shape:", X_train.shape)
    print("X_test.shape:", X_test.shape)
    print("y_train.shape:", y_train.shape)
    print("y_test.shape:", y_test.shape)

    # train the model
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=BATCH_SIZE, epochs=EPOCHS,
              callbacks=[tensorboard, model_checkpoint],
              verbose=1)

    # serialize model to JSON
    model_json = model.to_json()
    with open("sms-lstm-forprotodash.json", "w") as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    model.save_weights("sms-lstm-forprotodash.h5")
    print("Saved model to disk")

else:
    # load json and create model
    json_file = open("sms-lstm-forprotodash.json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)

    # load weights into new model
    model.load_weights("sms-lstm-forprotodash.h5")
    print("Loaded model from disk")

    # print model summary and compile
    model.summary()
    model.compile(optimizer="rmsprop",
                  loss="categorical_crossentropy",
                  metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall()])


# In[13]:

# get the loss and metrics
result = model.evaluate(X_test, y_test)

# extract them
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]

print(f"[+] Accuracy: {accuracy*100:.2f}%")
print(f"[+] Precision: {precision*100:.2f}%")
print(f"[+] Recall: {recall*100:.2f}%")
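# Beyond the aggregate accuracy, precision, and recall reported above, a confusion matrix shows how the errors split between the two classes. The cell below is an optional, minimal sketch (not part of the original notebook) using scikit-learn, which this notebook already depends on.

# In[ ]:

# optional: confusion matrix on the test set
# rows are true labels (ham, spam), columns are predicted labels (ham, spam)
from sklearn.metrics import confusion_matrix

y_test_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(model.predict(X_test), axis=1)
print(confusion_matrix(y_test_labels, y_pred_labels))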
# ### Step 2. Get model predictions for the dataset

# In[14]:

def get_predictions(doclist):
    sequence = tokenizer.texts_to_sequences(doclist)
    # pad the sequences
    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
    # get the predictions as one-hot encoded vectors
    prediction = model.predict(sequence)
    return (prediction)


# In[15]:

text = "Congratulations! you have won 100,000$ this week, click here to claim fast"
pred = get_predictions([text])
print(int2label[np.argmax(pred, axis=1)[0]])


# In[16]:

text = "Hi man, I was wondering if we can meet tomorrow."
pred = get_predictions([text])
print(int2label[np.argmax(pred, axis=1)[0]])


# In[17]:

doclist = combined_df['text'].values.tolist()
one_hot_prediction = get_predictions(doclist)
label_prediction = np.argmax(one_hot_prediction, axis=1)

# 0: ham, 1: spam
idx_ham = (label_prediction == 0)
idx_spam = (label_prediction == 1)


# ### Step 3. Use protodash explainer to compute spam and ham prototypes

# In[18]:

from sklearn.feature_extraction.text import TfidfVectorizer
from aix360.algorithms.protodash import ProtodashExplainer


# #### Convert text to vectors using TF-IDF for use in explainer
#
# We use TF-IDF vectors for scalability reasons, as the original embedding vector for a full sentence can be quite large.

# In[19]:

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocabulary
vectorizer.fit(doclist)
vec = vectorizer.transform(doclist)
docvec = vec.toarray()
print(docvec.shape)


# In[20]:

# separate spam and ham messages and their corresponding vectors
docvec_spam = docvec[idx_spam, :]
docvec_ham = docvec[idx_ham, :]
df_spam = combined_df[idx_spam]['text']
df_ham = combined_df[idx_ham]['text']


# In[21]:

print(df_spam.shape)
print(df_ham.shape)


# #### Compute prototypes for spam and ham datasets

# In[22]:

explainer = ProtodashExplainer()


# In[23]:

m = 10

# call protodash explainer
# S contains indices of the selected prototypes
# W contains importance weights associated with the selected prototypes
(W_spam, S_spam, _) = explainer.explain(docvec_spam, docvec_spam, m=m)
(W_ham, S_ham, _) = explainer.explain(docvec_ham, docvec_ham, m=m)


# In[24]:

# get prototypes from indices
df_spam_prototypes = df_spam.iloc[S_spam].copy()
df_ham_prototypes = df_ham.iloc[S_ham].copy()

# normalize weights
W_spam = np.around(W_spam/np.sum(W_spam), 2)
W_ham = np.around(W_ham/np.sum(W_ham), 2)


# In[25]:

print("SPAM prototypes with weights:")
print("----------------------------")
for i in range(m):
    print(W_spam[i], df_spam_prototypes.iloc[i])


# In[26]:

print("HAM prototypes with weights:")
print("----------------------------")
for i in range(m):
    print(W_ham[i], df_ham_prototypes.iloc[i])


# #### Given a spam message, look for similar messages that are classified as spam by the classifier

# In[27]:

k = 0
sample_text = df_spam.iloc[k]
sample_vec = docvec_spam[k]
sample_vec = sample_vec.reshape(1, sample_vec.shape[0])


# In[28]:

print(sample_text)
print(sample_vec.shape)


# In[29]:

docvec_spam_other = docvec_spam[np.arange(docvec_spam.shape[0]) != k, :]
df_spam_other = df_spam.iloc[np.arange(docvec_spam.shape[0]) != k].copy()


# In[30]:

# take a sample spam text and find samples similar to it
(W1_spam, S1_spam, _) = explainer.explain(sample_vec, docvec_spam_other, m=m)


# In[31]:

# normalize weights
W1_spam = np.around(W1_spam/np.sum(W1_spam), 2)


# In[32]:

S1_spam


# In[33]:

# similar spam prototypes
print("original text")
print("-------------")
print(sample_text)
print("")
print("Similar SPAM prototypes:")
print("------------------------")
m = 10
for i in range(m):
    print(W1_spam[i], df_spam_other.iloc[S1_spam[i]])


# #### Observation
#
# Note that several spam messages repeat in the dataset, as these may have been sent by the same entity to multiple users. As a consequence, the explainer retrieves these near-duplicates. Try a different k above to see prototypes corresponding to other sample messages.
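# To see this repetition concretely, one can count how often each retrieved message appears verbatim in the spam subset. The cell below is an optional, minimal sketch (not part of the original notebook) that uses only variables already defined above.

# In[ ]:

# optional: count exact duplicates of each retrieved similar message in the spam subset
for i in range(m):
    proto_text = df_spam_other.iloc[S1_spam[i]]
    n_copies = (df_spam == proto_text).sum()
    print(f"{n_copies:3d} copies | weight {W1_spam[i]} | {proto_text}")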
# #### Given a ham message, look for similar messages that are classified as ham by the classifier

# In[34]:

k = 3
sample_text = df_ham.iloc[k]
sample_vec = docvec_ham[k]
sample_vec = sample_vec.reshape(1, sample_vec.shape[0])


# In[35]:

print(sample_text)
print(sample_vec.shape)


# In[36]:

docvec_ham_other = docvec_ham[np.arange(docvec_ham.shape[0]) != k, :]
df_ham_other = df_ham.iloc[np.arange(docvec_ham.shape[0]) != k].copy()


# In[37]:

# take a sample ham text and find samples similar to it
(W1_ham, S1_ham, _) = explainer.explain(sample_vec, docvec_ham_other, m=m)


# In[38]:

# normalize weights
W1_ham = np.around(W1_ham/np.sum(W1_ham), 2)


# In[39]:

S1_ham


# In[40]:

# similar ham prototypes
print("original text")
print("-------------")
print(sample_text)
print("")
print("Similar HAM prototypes:")
print("-----------------------")
m = 10
for i in range(m):
    print(W1_ham[i], df_ham_other.iloc[S1_ham[i]])
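# The spam and ham sections above repeat the same steps: pick a sample, remove it from its subset, and call the explainer with the sample as the target and the remaining messages as candidates. As an optional convenience (not part of the original notebook), this can be wrapped in a small helper; `find_similar_messages` is a hypothetical name, not an AIX360 API.

# In[ ]:

# optional helper: retrieve the m messages most representative of a given sample
# (find_similar_messages is a hypothetical helper built on the calls already used above)
def find_similar_messages(k, docvec_subset, df_subset, m=10):
    sample_vec = docvec_subset[k].reshape(1, -1)
    mask = np.arange(docvec_subset.shape[0]) != k
    candidates_vec = docvec_subset[mask, :]
    candidates_df = df_subset.iloc[mask]
    W, S, _ = explainer.explain(sample_vec, candidates_vec, m=m)
    W = np.around(W/np.sum(W), 2)
    return [(W[i], candidates_df.iloc[S[i]]) for i in range(m)]

# example usage on another spam message
for w, txt in find_similar_messages(k=1, docvec_subset=docvec_spam, df_subset=df_spam):
    print(w, txt)


# In[ ]: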