#!/usr/bin/env python
# coding: utf-8

# ## Explaining a classifier
#
# It's good to be able to explain what an algorithm is doing (especially in the public sector).
#
# It boosts confidence in the ML, supports transparency, and is useful when iterating on the model.
#
# This uses the Python package lime (Local Interpretable Model-Agnostic Explanations), the authors' implementation of their paper:
# https://arxiv.org/abs/1602.04938

# In[1]:

get_ipython().system(' pip install lime')

# In[2]:

from __future__ import print_function

import sklearn
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import numpy as np
import lime
import lime.lime_tabular

np.random.seed(2)

# In[3]:

blob_account_name = "parlpublic"
blob_account_key = "xKEIV42ZsO8eL2IPjvbLarR2Xu1brxGucDauvVytPXD1uKhAfYUId7SwbGF82FslfkKebPB/ic6/RcPYnNBO6w=="
container = "trainingdata"
blobname = "5000_edms_justonetopic.csv"
datafile = "output.txt"

import os
import pandas as pd
from azure.storage.blob import BlockBlobService

dirname = os.getcwd()

blob_service = BlockBlobService(account_name=blob_account_name, account_key=blob_account_key)
blob_service.get_blob_to_path(container, blobname, datafile)

edm = pd.read_csv(datafile, header=0)
os.remove(os.path.join(dirname, datafile))

print(edm.shape)
edm.head()

# In[4]:

# load in the tag hierarchy to extract top-level terms
import json

with open("tag_hierarchy.json", 'r') as f:
    data = json.load(f)

# In[5]:

# Cleaning
import re

def clean_text(string):
    string = re.sub(r"\d", "", string)              # remove numbers
    string = re.sub(r"_+", "", string)              # remove consecutive underscores
    string = re.sub(r"<[^>]*>", " ", string)        # remove all html tags
    string = re.sub(r"[^0-9a-zA-Z]+", " ", string)  # remove special chars
    string = string.lower()                         # transform to lower case
    return string.strip()

edm["doc"] = edm.doc.apply(clean_text)

def get_parent(term):
    # walk up the tag hierarchy until a top-level topic is reached
    for i in data['children']:
        for k in i['children']:
            if term == k['name']:
                return i['name']
            if "children" in k:
                for j in k['children']:
                    if term == j['name']:
                        return get_parent(k['name'])

# get_parent("Supported housing")

edm["topic"] = edm.topic.apply(get_parent)
edm.head()

# In[6]:

from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf_vectorizer

# In[7]:

X = tf_vectorizer.fit_transform(edm['doc'])
X

# In[8]:

doc_array = X.toarray()
doc_array = doc_array.astype(float)

# In[9]:

analyze = tf_vectorizer.build_analyzer()
analyze("we shall fight on the beaches")

# In[10]:

feature_names = tf_vectorizer.get_feature_names()

# In[11]:

train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(doc_array, edm.topic, train_size=0.80)

rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, labels_train)

# In[12]:

sklearn.metrics.accuracy_score(labels_test, rf.predict(test))

# In[13]:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(edm.topic)
classes_names = le.classes_

# In[14]:

explainer = lime.lime_tabular.LimeTabularExplainer(train, feature_names=feature_names, class_names=classes_names, discretize_continuous=True)

# In[15]:

i = np.random.randint(0, len(test))
tf_vectorizer.inverse_transform(test[i])

# In[16]:

exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=10, top_labels=3)
exp.show_in_notebook(show_table=True, show_all=False)

# In[18]:

# Example: a piece of text that's classified as health services with 0.95 probability.
# For each class (here the top 3 are shown), you can see which text features contributed to the prediction and by how much.
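
# The cell below is an editor's addition rather than part of the original run: a minimal sketch of reading the same explanation programmatically instead of via the notebook widget, using lime's `available_labels()` and `as_list()` on the `exp` object created above. Names and formatting here are illustrative only.

# In[ ]:

# For each of the top labels requested above, print the feature/weight pairs
# lime assigned to this prediction (positive weights push towards the class,
# negative weights push away from it).
for label in exp.available_labels():
    print(classes_names[label])
    for feature, weight in exp.as_list(label=label):
        print("    {:<30} {: .3f}".format(feature, weight))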
from IPython.display import Image
Image(filename='explainml.png')

# ## Notes
#
# This isn't the model that'd be used in production, just one sklearn random forest implementation. It is, though, possible to plug this Python script into Azure ML Studio (compatible with R and Python) and inspect particular cases as requested. With human-in-the-loop retraining setups, you may want to save historic models if you want an audit trail (a sketch of this appears at the end of the notebook).
#
# Besides transparency, you can also check the predictions made with low confidence to look for improvements (see the sketch at the end of the notebook).
#
# Also, this relies on human-readable text features. If feature hashing is used instead, the hash has to be reversed (or a vocabulary mapping kept) if you want text labels.
#
# To further investigate:
# - how this works with more classes, instead of just the top 20
# - deep learning classifiers
# - integrating with Azure to store the output into blob storage
# - what are good measures to track across retrained models?
# - ...

# In[ ]:
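
# Editor's addition, not in the original notebook: a minimal sketch of the "check low-confidence predictions" idea from the Notes above. It assumes the `rf`, `test` and `tf_vectorizer` objects from the earlier cells and an arbitrary 0.5 threshold chosen purely for illustration.

# In[ ]:

# Flag test documents where the classifier's highest class probability falls
# below a chosen threshold; these are candidates for manual review or relabelling.
probs = rf.predict_proba(test)
confidence = probs.max(axis=1)

threshold = 0.5  # illustrative cut-off
low_conf_idx = np.where(confidence < threshold)[0]

print("{} of {} test documents below {:.0%} confidence".format(len(low_conf_idx), len(test), threshold))
for idx in low_conf_idx[:5]:
    # show the confidence, the predicted class, and the words present in the document
    print(confidence[idx], rf.classes_[probs[idx].argmax()], tf_vectorizer.inverse_transform(test[idx]))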
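
# Editor's addition, not in the original notebook: a minimal sketch of keeping historic models for an audit trail, as mentioned in the Notes. It assumes the standalone `joblib` package is available (older sklearn versions shipped it as `sklearn.externals.joblib`) and that a local `model_history/` directory is acceptable; both are illustrative choices.

# In[ ]:

import datetime
import joblib

# Persist each retrained model under a timestamped filename so a past
# prediction can later be traced back to the exact model that produced it.
model_dir = "model_history"
os.makedirs(model_dir, exist_ok=True)

stamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
model_path = os.path.join(model_dir, "rf_{}.joblib".format(stamp))
joblib.dump(rf, model_path)
print("saved", model_path)

# A saved model can be reloaded later with joblib.load(model_path).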