#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Imports
import os
import lime
import sklearn
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

print(pd.__version__)
print(sklearn.__version__)
print(np.__version__)


# In[2]:

# List all the classes available in the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
print(newsgroups_train.target_names)


# In[3]:

# We will explore the two hardware groups: "comp.sys.ibm.pc.hardware" and "comp.sys.mac.hardware"
categories = ["comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
newsgroups_train.keys()


# In[4]:

# Sample document and its true class
print(newsgroups_test.data[0])
print("++++++++++++++++++")
print(newsgroups_test.target_names[newsgroups_test.target[0]])


# In[5]:

# Vectorize the text using TF-IDF
# (we will discuss TF-IDF later as part of this challenge)
vectorizer = TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)


# In[6]:

train_vectors.shape, test_vectors.shape


# ## Random Forest

# In[7]:

# Model 1: random forest
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train_vectors, newsgroups_train.target)


# In[8]:

# Predict on the test set and report accuracy
pred = rf.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)


# ## Naive Bayes

# In[9]:

# Model 2: multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_vectors, newsgroups_train.target)


# In[10]:

# Predict on the test set and report accuracy
pred = nb.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)


# ## Logistic Classifier

# In[11]:

# Model 3: logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_vectors, newsgroups_train.target)


# In[12]:

# Predict on the test set and report accuracy
pred = lr.predict(test_vectors)
sklearn.metrics.accuracy_score(newsgroups_test.target, pred)


# ## Lime in Action

# In[13]:

# LIME perturbs raw text and needs a single callable that maps text to class
# probabilities, so we wrap the vectorizer and each classifier into a pipeline
from sklearn.pipeline import make_pipeline

crf = make_pipeline(vectorizer, rf)
cnb = make_pipeline(vectorizer, nb)
clr = make_pipeline(vectorizer, lr)


# In[14]:

from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=['ibm', 'mac'])


# In[15]:

# Pick one random example from the test set and look at the top 6 features
# each classifier relies on when predicting the class of that data point
idx = np.random.randint(0, len(newsgroups_test.data))  # index 0 included
exp_crf = explainer.explain_instance(newsgroups_test.data[idx], crf.predict_proba, num_features=6)
exp_clr = explainer.explain_instance(newsgroups_test.data[idx], clr.predict_proba, num_features=6)
exp_cnb = explainer.explain_instance(newsgroups_test.data[idx], cnb.predict_proba, num_features=6)


# In[16]:

exp_crf.show_in_notebook(text=True)


# In[17]:

exp_cnb.show_in_notebook(text=True)


# In[18]:

exp_clr.show_in_notebook(text=True)
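

# In[19]:

# Sanity check (a minimal sketch using only the pipelines defined above):
# LIME calls predict_proba on lists of raw strings, so each pipeline must
# accept text directly and return probabilities of shape (n_samples, 2)
print(crf.predict_proba([newsgroups_test.data[idx]]))
print(cnb.predict_proba([newsgroups_test.data[idx]]))
print(clr.predict_proba([newsgroups_test.data[idx]]))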
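

# ## Explanations Outside the Notebook

# In[20]:

# show_in_notebook renders only inside Jupyter. As a sketch of a notebook-free
# alternative (using lime's Explanation API), as_list() returns the
# (feature, weight) pairs behind each visualization, and save_to_file() writes
# the same interactive view to a standalone HTML file
for name, exp in [('random forest', exp_crf), ('naive bayes', exp_cnb), ('logistic regression', exp_clr)]:
    print(name)
    for feature, weight in exp.as_list():
        print('  %s: %+.4f' % (feature, weight))

exp_crf.save_to_file('rf_explanation.html')  # hypothetical output filename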