#!/usr/bin/env python
# coding: utf-8

# ## Explaining a classifier
#
# It's good to be able to explain what an algorithm is doing (especially in the public sector).
#
# It boosts confidence in the ML, supports transparency, and is useful when iterating on the model.
#
# This uses the Python package lime (Local Interpretable Model-Agnostic Explanations), the authors' implementation of their paper:
# https://arxiv.org/abs/1602.04938

# In[1]:

get_ipython().system(' pip install lime')

# In[2]:

from __future__ import print_function

import sklearn
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import numpy as np
import lime
import lime.lime_tabular

np.random.seed(2)

# In[3]:

blob_account_name = "parlpublic"
blob_account_key = "xKEIV42ZsO8eL2IPjvbLarR2Xu1brxGucDauvVytPXD1uKhAfYUId7SwbGF82FslfkKebPB/ic6/RcPYnNBO6w=="
container = "trainingdata"
blobname = "5000_edms_justonetopic.csv"
datafile = "output.txt"

import os
import pandas as pd
from azure.storage.blob import BlockBlobService

dirname = os.getcwd()

blob_service = BlockBlobService(account_name=blob_account_name, account_key=blob_account_key)
blob_service.get_blob_to_path(container, blobname, datafile)

edm = pd.read_csv(datafile, header=0)
os.remove(os.path.join(dirname, datafile))

print(edm.shape)
edm.head()

# In[4]:

# load in the tag hierarchy to extract top-level terms
import json

with open("tag_hierarchy.json", 'r') as f:
    data = json.load(f)

# In[5]:

# Cleaning
import re

def clean_text(string):
    string = re.sub(r"\d", "", string)              # remove numbers
    string = re.sub(r"_+", "", string)              # remove consecutive underscores
    string = re.sub(r"<[^>]*>", " ", string)        # remove all html tags
    string = re.sub(r"[^0-9a-zA-Z]+", " ", string)  # remove special chars
    string = string.lower()                         # transform to lower case
    return string.strip()

edm["doc"] = edm.doc.apply(clean_text)

def get_parent(term):
    # walk up the tag hierarchy until a top-level topic is reached
    for i in data['children']:
        for k in i['children']:
            if term == k['name']:
                return i['name']
            if "children" in k:
                for j in k['children']:
                    if term == j['name']:
                        return get_parent(k['name'])

# get_parent("Supported housing")

edm["topic"] = edm.topic.apply(get_parent)
edm.head()

# In[6]:

from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf_vectorizer

# In[7]:

X = tf_vectorizer.fit_transform(edm['doc'])
X

# In[8]:

doc_array = X.toarray()
doc_array = doc_array.astype(float)

# In[9]:

analyze = tf_vectorizer.build_analyzer()
analyze("we shall fight on the beaches")

# In[10]:

feature_names = tf_vectorizer.get_feature_names()

# In[11]:

train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(doc_array, edm.topic, train_size=0.80)

rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, labels_train)

# In[12]:

sklearn.metrics.accuracy_score(labels_test, rf.predict(test))

# In[13]:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(edm.topic)
classes_names = le.classes_

# In[14]:

explainer = lime.lime_tabular.LimeTabularExplainer(train, feature_names=feature_names, class_names=classes_names, discretize_continuous=True)

# In[15]:

i = np.random.randint(0, len(test))
tf_vectorizer.inverse_transform(test[i])

# In[16]:

exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=10, top_labels=3)
exp.show_in_notebook(show_table=True, show_all=False)

# In[18]:

# Example: a piece of text that's classified as health services with 0.95 probability.
# For each class (here the top 3 are shown), you can see which text features contributed to the prediction and by how much.
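
# The cell below is an editor's addition rather than part of the original run: a minimal sketch of reading the same explanation programmatically instead of via the notebook widget, using lime's `available_labels()` and `as_list()` on the `exp` object created above. Names and formatting here are illustrative only.

# In[ ]:

# For each of the top labels requested above, print the feature/weight pairs
# lime assigned to this prediction (positive weights push towards the class,
# negative weights push away from it).
for label in exp.available_labels():
    print(classes_names[label])
    for feature, weight in exp.as_list(label=label):
        print("    {:<30} {: .3f}".format(feature, weight))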
from IPython.display import Image
Image(filename='explainml.png')

# ## Notes
#
# This isn't the model that'd be used in production, just one sklearn random forest implementation. It is, though, possible to plug this Python script into Azure ML Studio (compatible with R and Python) and inspect particular cases as requested. With human-in-the-loop retraining setups, you may want to save historic models if you want an audit trail (a sketch of this appears at the end of the notebook).
#
# Besides transparency, you can also check the predictions made with low confidence to look for improvements (see the sketch at the end of the notebook).
#
# Also, this relies on human-readable text features. If feature hashing is used instead, the hash has to be reversed (or a vocabulary mapping kept) if you want text labels.
#
# To further investigate:
# - how this works with more classes, instead of just the top 20
# - deep learning classifiers
# - integrating with Azure to store the output into blob storage
# - what are good measures to track across retrained models?
# - ...

# In[ ]:
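
# Editor's addition, not in the original notebook: a minimal sketch of the "check low-confidence predictions" idea from the Notes above. It assumes the `rf`, `test` and `tf_vectorizer` objects from the earlier cells and an arbitrary 0.5 threshold chosen purely for illustration.

# In[ ]:

# Flag test documents where the classifier's highest class probability falls
# below a chosen threshold; these are candidates for manual review or relabelling.
probs = rf.predict_proba(test)
confidence = probs.max(axis=1)

threshold = 0.5  # illustrative cut-off
low_conf_idx = np.where(confidence < threshold)[0]

print("{} of {} test documents below {:.0%} confidence".format(len(low_conf_idx), len(test), threshold))
for idx in low_conf_idx[:5]:
    # show the confidence, the predicted class, and the words present in the document
    print(confidence[idx], rf.classes_[probs[idx].argmax()], tf_vectorizer.inverse_transform(test[idx]))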
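
# Editor's addition, not in the original notebook: a minimal sketch of keeping historic models for an audit trail, as mentioned in the Notes. It assumes the standalone `joblib` package is available (older sklearn versions shipped it as `sklearn.externals.joblib`) and that a local `model_history/` directory is acceptable; both are illustrative choices.

# In[ ]:

import datetime
import joblib

# Persist each retrained model under a timestamped filename so a past
# prediction can later be traced back to the exact model that produced it.
model_dir = "model_history"
os.makedirs(model_dir, exist_ok=True)

stamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
model_path = os.path.join(model_dir, "rf_{}.joblib".format(stamp))
joblib.dump(rf, model_path)
print("saved", model_path)

# A saved model can be reloaded later with joblib.load(model_path).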