Licensed under the MIT License.
import sys
sys.path.append("../..")
import os
# sklearn
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from interpret_text.experimental.classical import ClassicalTextExplainer
from notebooks.test_utils.utils_mnli import load_mnli_pandas_df
# for testing
from scrapbook.api import glue
working_dir = os.getcwd()
This notebook illustrates how to use interpret-text locally to help interpret a text classification model built from a logistic regression baseline and a bag-of-words encoding. It demonstrates the API calls needed to obtain the feature importances along with a visualization dashboard.
The notebook is built on features made available by scikit-learn and spaCy for easier compatibility with popular toolkits.
DATA_FOLDER = './temp'
TRAIN_SIZE = 0.7
TEST_SIZE = 0.3
df = load_mnli_pandas_df(DATA_FOLDER, "train")
df = df[df["gold_label"] == "neutral"] # get unique sentences
# fetch documents and labels from data frame
X_str = df['sentence1'] # the document we want to analyze
ylabels = df['genre'] # the labels, or answers, we want to test against
# Create explainer object that contains default glassbox classifier and explanation methods
explainer = ClassicalTextExplainer(n_jobs=-1, tol=0.1)
label_encoder = LabelEncoder()
This step splits the data and casts the training data and labels into the correct format; the string genre labels are encoded as integers (a small LabelEncoder sketch follows the split below).
X_train, X_test, y_train, y_test = train_test_split(X_str, ylabels, train_size=TRAIN_SIZE, test_size=TEST_SIZE)
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
print("X_train shape =" + str(X_train.shape))
print("y_train shape =" + str(y_train.shape))
print("X_train data structure = " + str(type(X_train)))
The 1-gram Bag of Words allows a 1:1 mapping from individual words to their respective frequencies in the document-term matrix.
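To make that mapping concrete, here is a small standalone sketch of a 1-gram document-term matrix built with scikit-learn's CountVectorizer (the explainer's internal preprocessor may be configured differently; get_feature_names_out assumes scikit-learn >= 1.0).
from sklearn.feature_extraction.text import CountVectorizer

# Two tiny documents; each column of the matrix counts one 1-gram (single word)
toy_docs = ["the train to the beach", "the train was late"]
vectorizer = CountVectorizer(ngram_range=(1, 1))
dtm = vectorizer.fit_transform(toy_docs)
print(vectorizer.get_feature_names_out())  # vocabulary, one entry per column
print(dtm.toarray())                       # word frequencies per document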
classifier, best_params = explainer.fit(X_train, y_train)
# obtain best classifier and hyper params
print("best classifier: " + str(best_params))
mean_accuracy = classifier.score(X_test, y_test, sample_weight=None)
print("accuracy = " + str(mean_accuracy * 100) + "%")
y_pred = classifier.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average='macro')
Capture metrics for integration testing
glue("accuracy", mean_accuracy)
glue("precision", precision)
glue("recall", recall)
glue("f1", fscore)
print("[precision, recall, fscore, support] = " + str([precision, recall, fscore, support]))
Local importances are the most and least important words for a single document.
# Enter any document or a document and label pair that needs to be interpreted
document = "I travelled to the beach. I took the train. I saw fairies, dragons and elves"
document1 = "The term construction means fabrication, erection, or installation of an affected unit."
document2 = "Demonstrating Product Reliability Indicates the Product Is Ready for Production"
document3 = "and see there\'s no secrecy to that because the bill always comes in and we know how much they pay for it"
document4 = "Had that piquant gipsy face been at the bottom of the crime, or was it 73 the baser mainspring of money?"
document5 = "No, the boy trusted me, and I shan\'t let him down."
# Obtain the top feature ids for the selected class label
explainer.preprocessor.labelEncoder = label_encoder
local_explanation = explainer.explain_local(document)
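To peek at the raw importances without the dashboard, the hedged sketch below pairs tokens with their scores; it assumes the explanation object exposes features and local_importance_values (as interpret-community style explanations typically do), so treat the attribute names and shape handling as assumptions rather than the library's documented API.
import numpy as np

# Assumed attributes: 'features' (parsed tokens) and 'local_importance_values'
# (one score per token); adjust if the explanation object differs.
tokens = getattr(local_explanation, "features", None)
scores = getattr(local_explanation, "local_importance_values", None)
if tokens is not None and scores is not None:
    flat_scores = np.asarray(scores, dtype=float).ravel()[: len(tokens)]
    ranked = sorted(zip(tokens, flat_scores), key=lambda pair: pair[1], reverse=True)
    print("most positive words:", ranked[:5])
    print("most negative words:", ranked[-5:])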
Alternatively, you can pass the predicted label with the document
y = classifier.predict(document1)
predicted_label = label_encoder.inverse_transform(y)
local_explanation = explainer.explain_local(document1, predicted_label)
from interpret_text.experimental.widget import ExplanationDashboard
ExplanationDashboard(local_explanation)