#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Imports
import os
import lime
import sklearn
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

print(pd.__version__)
print(sklearn.__version__)
print(np.__version__)


# In[2]:

# List all the classes available in the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
print(newsgroups_train.target_names)


# In[3]:

# We will explore the two hardware groups: "comp.sys.ibm.pc.hardware" and "comp.sys.mac.hardware"
categories = ["comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
newsgroups_train.keys()


# In[4]:

# Sample document and its true class
print(newsgroups_test.data[0])
print("++++++++++++++++++")
print(newsgroups_test.target_names[newsgroups_test.target[0]])


# In[5]:

# Vectorize the text using TF-IDF
# (we will discuss TF-IDF later as part of this challenge)
vectorizer = TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)


# In[6]:

train_vectors.shape, test_vectors.shape


# ## Random Forest

# In[7]:

# Model 1: random forest
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train_vectors, newsgroups_train.target)


# In[8]:

# Predict on the test set and report accuracy
pred = rf.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)


# ## Naive Bayes

# In[9]:

# Model 2: multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_vectors, newsgroups_train.target)


# In[10]:

# Predict on the test set and report accuracy
pred = nb.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)


# ## Logistic Classifier

# In[11]:

# Model 3: logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_vectors, newsgroups_train.target)


# In[12]:

# Predict on the test set and report accuracy
pred = lr.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)


# ## Lime in Action

# In[13]:

# LIME perturbs raw text and needs a single callable that maps text to class
# probabilities, so we wrap the vectorizer and each classifier into a pipeline
from sklearn.pipeline import make_pipeline

crf = make_pipeline(vectorizer, rf)
cnb = make_pipeline(vectorizer, nb)
clr = make_pipeline(vectorizer, lr)


# In[14]:

from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=['ibm', 'mac'])


# In[15]:

# Pick one random example from the test set and look at the top 6 features
# each classifier relies on when predicting the class of that data point
idx = np.random.randint(0, len(newsgroups_test.data))  # index 0 included
exp_crf = explainer.explain_instance(newsgroups_test.data[idx], crf.predict_proba, num_features=6)
exp_clr = explainer.explain_instance(newsgroups_test.data[idx], clr.predict_proba, num_features=6)
exp_cnb = explainer.explain_instance(newsgroups_test.data[idx], cnb.predict_proba, num_features=6)


# In[16]:

exp_crf.show_in_notebook(text=True)


# In[17]:

exp_cnb.show_in_notebook(text=True)


# In[18]:

exp_clr.show_in_notebook(text=True)
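

# In[19]:

# Sanity check (a minimal sketch using only the pipelines defined above):
# LIME calls predict_proba on lists of raw strings, so each pipeline must
# accept text directly and return probabilities of shape (n_samples, 2)
print(crf.predict_proba([newsgroups_test.data[idx]]))
print(cnb.predict_proba([newsgroups_test.data[idx]]))
print(clr.predict_proba([newsgroups_test.data[idx]]))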
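

# ## Explanations Outside the Notebook

# In[20]:

# show_in_notebook renders only inside Jupyter. As a sketch of a notebook-free
# alternative (using lime's Explanation API), as_list() returns the
# (feature, weight) pairs behind each visualization, and save_to_file() writes
# the same interactive view to a standalone HTML file
for name, exp in [('random forest', exp_crf), ('naive bayes', exp_cnb), ('logistic regression', exp_clr)]:
    print(name)
    for feature, weight in exp.as_list():
        print('  %s: %+.4f' % (feature, weight))

exp_crf.save_to_file('rf_explanation.html')  # hypothetical output filename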