Licensed under the MIT License.
import sys sys.path.append("../..") import os # sklearn from sklearn.metrics import precision_recall_fscore_support from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from interpret_text.experimental.classical import ClassicalTextExplainer from notebooks.test_utils.utils_mnli import load_mnli_pandas_df # for testing from scrapbook.api import glue working_dir = os.getcwd()
This notebook illustrates how to use interpret-text locally to help interpret text classification using a logistic regression baseline and bag-of-words encoding. It demonstrates the API calls needed to obtain the feature importances, along with a visualization dashboard.
The notebook is built on features made available by scikit-learn and spaCy for easier compatibility with popular toolkits.
# Directory passed to the MNLI data loader.
DATA_FOLDER = './temp'

# Train / test fractions (they sum to 1.0).
TRAIN_SIZE, TEST_SIZE = 0.7, 0.3
# Load the MNLI "train" split as a pandas data frame.
df = load_mnli_pandas_df(DATA_FOLDER, "train")

# Keep only the rows whose gold label is "neutral" (the original note calls
# this "get unique sentences").
# NOTE(review): presumably each sentence1 occurs once per gold label, so
# filtering on a single label de-duplicates the sentences -- confirm.
neutral_rows = df["gold_label"] == "neutral"
df = df[neutral_rows]

# `sentence1` holds the text to classify; `genre` is the label to predict.
X_str = df['sentence1']
ylabels = df['genre']
# Encoder used below to turn the string labels into integer ids and back.
label_encoder = LabelEncoder()

# Explainer object that contains the default glass-box classifier together
# with its explanation methods.
explainer = ClassicalTextExplainer(tol=0.1, n_jobs=-1)
This step will split the data into training and test sets and cast the labels into the correct format.
# Split documents and labels using the TRAIN_SIZE / TEST_SIZE fractions
# declared in the notebook's configuration cell rather than a second set of
# hard-coded values, so the configured split is the one actually applied.
X_train, X_test, y_train, y_test = train_test_split(
    X_str,
    ylabels,
    train_size=TRAIN_SIZE,
    test_size=TEST_SIZE,
)

# Fit the encoder on the training labels, then reuse the same mapping for the
# test labels so both sets share one label-to-integer encoding.
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
# Report the shapes of the training documents / labels and the container
# type holding the training documents.
print("X_train shape =", X_train.shape, sep="")
print("y_train shape =", y_train.shape, sep="")
print("X_train data structure = ", type(X_train), sep="")
The 1-gram Bag of Words allows a 1:1 mapping from individual words to their respective frequencies in the document-term matrix.
# Fit the explainer on the training split; the call hands back the
# classifier and the hyper-parameters that were selected for it.
fit_outcome = explainer.fit(X_train, y_train)
classifier, best_params = fit_outcome

# Show the selected hyper-parameters.
print("best classifier: ", best_params, sep="")
# Score the classifier on the held-out split and print it as a percentage.
mean_accuracy = classifier.score(X_test, y_test, sample_weight=None)
accuracy_percent = mean_accuracy * 100
print("accuracy = ", accuracy_percent, "%", sep="")

# Macro-averaged precision / recall / F-score on the held-out split.
# With average='macro' scikit-learn reports `support` as None.
y_pred = classifier.predict(X_test)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test,
    y_pred,
    average='macro',
)
Capture metrics for integration testing
# Record each metric under its scrapbook name so the integration tests can
# read it back from the executed notebook.
for scrap_name, scrap_value in (
    ("accuracy", mean_accuracy),
    ("precision", precision),
    ("recall", recall),
    ("f1", fscore),
):
    glue(scrap_name, scrap_value)

print(
    "[precision, recall, fscore, support] = ",
    [precision, recall, fscore, support],
    sep="",
)
# Sample inputs for the explainer. Replace any of these with the document
# (or document / label pair) you want interpreted.

# Free-form example sentence.
document = "I travelled to the beach. I took the train. I saw fairies, dragons and elves"

# Further example sentences.
document1 = "The term construction means fabrication, erection, or installation of an affected unit."
document2 = "Demonstrating Product Reliability Indicates the Product Is Ready for Production"
document3 = "and see there\'s no secrecy to that because the bill always comes in and we know how much they pay for it"
document4 = "Had that piquant gipsy face been at the bottom of the crime, or was it 73 the baser mainspring of money?"
document5 = "No, the boy trusted me, and I shan\'t let him down."
# Hand the fitted label encoder to the explainer's preprocessor before
# requesting explanations.
# NOTE(review): presumably the explainer uses it to map encoded class ids
# back to label names -- confirm against ClassicalTextExplainer.
text_preprocessor = explainer.preprocessor
text_preprocessor.labelEncoder = label_encoder

# Explain the first sample document; no label is supplied in this call.
local_explanation = explainer.explain_local(document)
Alternatively, you can pass the predicted label with the document
# Predict the encoded class for document1.
# NOTE(review): `predict` receives a bare string here rather than a list of
# documents -- confirm the returned classifier accepts that input.
y = classifier.predict(document1)

# Decode the prediction back to its original label value(s).
predicted_label = label_encoder.inverse_transform(y)

# Explain document1, this time supplying the predicted label explicitly.
local_explanation = explainer.explain_local(
    document1,
    predicted_label,
)
from interpret_text.experimental.widget import ExplanationDashboard