#!/usr/bin/env python
# coding: utf-8

# # Integrated gradients for text classification on the IMDB dataset

# ## Table of Contents

# 1. Overview
# 2. Data
# 3. Model Training
# 4. Model Evaluation
# 5. Summary
# 6. Exercises

# ## 1. Overview

# In this example, we apply the integrated gradients method to a sentiment analysis model
# trained on the IMDB dataset. In text classification models, integrated gradients assign
# an attribution value to each word in the input sentence. The attributions are calculated
# as the integral of the model gradients with respect to the word embedding layer
# along a straight path from a baseline instance $x^\prime$ to the input instance $x$. A
# description of the method can be found
# [here](https://docs.seldon.io/projects/alibi/en/stable/methods/IntegratedGradients.html). Integrated
# gradients was originally proposed in Sundararajan et al.,
# ["Axiomatic Attribution for Deep Networks"](https://arxiv.org/abs/1703.01365).
#
# The IMDB dataset contains 50K movie reviews labelled as positive or negative. We train a
# convolutional neural network classifier with a single 1-d convolutional layer followed by a
# fully connected layer. The reviews in the dataset are truncated at 100 words and each word
# is represented by a 50-dimensional word embedding vector. We calculate attributions for the
# elements of the embedding layer.
#
# Note: To enable support for IntegratedGradients, you may need to run
#
# ```bash
# pip install alibi[tensorflow]
# ```

# In[ ]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Embedding,
                                     GlobalMaxPooling1D, Input)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

from alibi.explainers import IntegratedGradients

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly())  # True

# ## 2. Data

# The reviews in the IMDB dataset vary widely in length, so for our purposes we cap the
# vocabulary at the `max_features` most frequent words and truncate or pad every review
# to `maxlen` tokens.

# In[2]:

max_features = 10000
maxlen = 100

# In[ ]:

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# In[ ]:

x_train.shape, x_train[0][:10]

# In[ ]:

test_labels = y_test.copy()
train_labels = y_train.copy()
train_labels.shape, train_labels[:20]

# In[ ]:

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# In[ ]:

# inspect the source of `to_categorical` (equivalent to `to_categorical??`)
get_ipython().run_line_magic('pinfo2', 'to_categorical')

# In[ ]:

y_train, y_test = to_categorical(y_train), to_categorical(y_test)
y_train[:2]

# In[ ]:

# inspect the source of `pad_sequences` (equivalent to `sequence.pad_sequences??`)
get_ipython().run_line_magic('pinfo2', 'sequence.pad_sequences')

# In[ ]:

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
x_train[0]

# In[ ]:

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# In[ ]:

index = imdb.get_word_index()
reverse_index = {value: key for (key, value) in index.items()}

i = 0
for k, v in reverse_index.items():
    print(f"Key --> {k} represents Value --> {v}")
    print("-" * 10)
    i += 1
    if i == 10:
        break
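# Keras reserves the first few token ids for special tokens (this is the standard
# `keras.datasets.imdb` convention): 0 is the padding token, 1 marks the start of a
# sequence and 2 stands for out-of-vocabulary words, so actual words only start at id 3.
# This is also why the decoding function below shifts every id by 3. A quick illustrative
# check, assuming the conventions above:

# In[ ]:

# id 0 is padding: shorter reviews are left-padded with zeros, while with the
# default truncating='pre' longer reviews lose their beginning instead
print('fraction of padding ids in x_train:', (x_train == 0).mean())
print('first 10 ids of the first test review:', x_test[0][:10])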
# A sample review from the test set. Note that unknown words are replaced with 'UNK'.

# In[28]:

def decode_sentence(x, reverse_index):
    # the `-3` offset is due to the special tokens used by keras
    # see https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
    return " ".join([reverse_index.get(i - 3, 'UNK') for i in x])

# In[ ]:

print(decode_sentence(x_test[1], reverse_index))

# ## 3. Model Training

# The model includes one convolutional layer and reaches a test accuracy of 0.85. If
# `save_model = True`, a local folder `./models/model_imdb` will be created and the trained
# model will be saved in it. If the model was previously saved, it can be loaded by setting
# `load_model = True`.

# In[30]:

batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250

# In[31]:

load_model = False
save_model = True

# In[ ]:

filepath = './models/model_imdb/'  # change to the directory where the model is saved
if load_model:
    model = tf.keras.models.load_model(os.path.join(filepath, 'model.h5'))
else:
    print('Build model...')

    inputs = Input(shape=(maxlen,), dtype=tf.int32)
    embedded_sequences = Embedding(max_features, embedding_dims)(inputs)
    out = Conv1D(
        filters,
        kernel_size,
        padding='valid',
        activation='relu',
        strides=1
    )(embedded_sequences)
    out = Dropout(0.4)(out)
    out = GlobalMaxPooling1D()(out)
    out = Dense(hidden_dims, activation='relu')(out)
    out = Dropout(0.4)(out)
    outputs = Dense(2, activation='softmax')(out)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    model.fit(
        x_train, y_train,
        batch_size=256,
        epochs=3,
        validation_data=(x_test, y_test)
    )
    if save_model:
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        model.save(os.path.join(filepath, 'model.h5'))

# ## 4. Model Evaluation

# The integrated gradients attributions are calculated with respect to the embedding layer for
# 10 samples from the test set. Since the model uses a word embedding with vector
# dimensionality of 50 and a sequence length of 100 words, the shape of the attributions
# is (10, 100, 50). In order to obtain a single attribution value for each word, we sum all
# the attribution values for the 50 elements of each word's vector representation.
#
# The default baseline is used in this example, which is internally defined as a sequence of
# zeros. In this case, it corresponds to a sequence of padding characters (**NB:** in general,
# the numerical value corresponding to a "non-informative" baseline such as the PAD token
# depends on the tokenizer used; make sure that the numerical value of the baseline
# corresponds to your desired token value to avoid surprises). The path integral is defined as
# a straight line from the baseline to the input instance. The path is approximated by
# choosing 50 discrete steps according to the Gauss-Legendre method.
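# As an aside, the quantity just described can be written down in a few lines. The sketch
# below approximates the path integral with a plain Riemann sum rather than the
# Gauss-Legendre quadrature used by alibi; `f`, `x`, `baseline` and `target` are placeholder
# names for a differentiable function of the embeddings, an embedded input, an embedded
# baseline and a target class index. It illustrates the formula only and is not the alibi
# implementation.

# In[ ]:

def ig_sketch(f, x, baseline, target, n_steps=50):
    """Riemann-sum approximation of integrated gradients for a batch of inputs."""
    grads = []
    for alpha in tf.linspace(0.0, 1.0, n_steps):
        # point on the straight path from the baseline x' to the input x
        point = baseline + alpha * (x - baseline)
        with tf.GradientTape() as tape:
            tape.watch(point)
            preds = f(point)[:, target]  # score of the target class
        grads.append(tape.gradient(preds, point))
    # attributions = (x - x') * average gradient along the path
    return (x - baseline) * tf.reduce_mean(tf.stack(grads), axis=0)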
# In[ ]:

layer = model.layers[1]  # the embedding layer
layer

# In[34]:

n_steps = 50
method = "gausslegendre"
internal_batch_size = 100
nb_samples = 10
ig = IntegratedGradients(
    model,
    layer=layer,
    n_steps=n_steps,
    method=method,
    internal_batch_size=internal_batch_size
)

# In[ ]:

x_test_sample = x_test[:nb_samples]
predictions = model(x_test_sample).numpy().argmax(axis=1)
predictions[:4]

# In[41]:

explanation = ig.explain(
    x_test_sample,
    baselines=None,
    target=predictions,
    attribute_to_layer_inputs=False
)

# In[ ]:

# Metadata from the explanation object
explanation.meta

# In[ ]:

# Data fields from the explanation object
explanation.data.keys()

# In[ ]:

# Get attribution values from the explanation object
attrs = explanation.attributions[0]
print('Attributions shape:', attrs.shape)
attrs[0]

# In[ ]:

# Sum over the embedding dimension to get a single attribution value per word
attrs = attrs.sum(axis=2)
print('Attributions shape:', attrs.shape)
attrs[0]

# Time to visualize the attributions of our model. We can do so for a text instance by
# mapping the attribution values onto a matplotlib colormap. Below we define some utility
# functions for doing this.

# In[57]:

from IPython.display import HTML

def hlstr(string, color='white'):
    """
    Return HTML markup highlighting text with the desired color.
    """
    return f"<mark style=background-color:{color}>{string} </mark>"

# In[58]:

import matplotlib as mpl

def colorize(attrs, cmap='PiYG'):
    """
    Compute hex colors based on the attributions for a single instance.
    Uses a diverging colorscale by default and normalizes and scales
    the colormap so that colors are consistent with the attributions.
    """
    cmap_bound = np.abs(attrs).max()
    norm = mpl.colors.Normalize(vmin=-cmap_bound, vmax=cmap_bound)
    cmap = mpl.cm.get_cmap(cmap)

    # now compute hex values of colors
    return list(map(lambda x: mpl.colors.rgb2hex(cmap(norm(x))), attrs))

# Now we can visualize the attribution values highlighted in the text. Words with positive
# attributions are highlighted in shades of green and words with negative attributions in
# shades of pink. Stronger shading corresponds to larger attribution values. Positive
# attributions can be interpreted as increasing the probability of the predicted class
# ("Positive sentiment"), while negative attributions correspond to decreasing the
# probability of the predicted class.

# In[76]:

i = 0
x_i = x_test_sample[i]
attrs_i = attrs[i]
pred = predictions[i]
pred_dict = {1: 'Positive review', 0: 'Negative review'}

# In[ ]:

print('Predicted label = {}: {}'.format(pred, pred_dict[pred]))

# In[ ]:

words = decode_sentence(x_i, reverse_index).split()
words[:10]

# In[ ]:

colors = colorize(attrs_i)
colors[:10]

# In[ ]:

HTML("".join(list(map(hlstr, words, colors))))
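# The same helpers can render all of the explained samples at once. The loop below is just a
# convenience built on `decode_sentence`, `colorize` and `hlstr` from above; it is not part
# of the alibi API.

# In[ ]:

# Render the attribution highlighting for every explained test sample
snippets = []
for j in range(nb_samples):
    words_j = decode_sentence(x_test_sample[j], reverse_index).split()
    colors_j = colorize(attrs[j])
    header = f"<p><b>Sample {j}: {pred_dict[predictions[j]]}</b></p>"
    snippets.append(header + "".join(map(hlstr, words_j, colors_j)))
HTML("<hr>".join(snippets))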
# ## 5. Summary

# - Integrated gradients for NLP helps us evaluate the contribution of each token to the
# model's prediction.
#
# - Integrated gradients can be used with any differentiable model and across different
# data modalities, not only text.

# ## 6. Exercises

# Load the Reuters dataset.

# In[88]:

from tensorflow.keras.____ import ____

# In[115]:

(___, ___), (___, ___) = ____.load_data(num_words=____)

# Evaluate the shape of your data, both training and outcomes.

# In[116]:

# Copy the labels.

# In[117]:

____ = ____.____()
____ = y_train.____()
train_labels.shape, train_labels[:20]

# Convert the classes into categories.

# In[118]:

____, ____ = to_categorical(____), to_categorical(____)
____[:2]

# Pad the sequences.

# In[119]:

x_train = sequence.pad_sequences(____, maxlen=____)
x_test = sequence.pad_sequences(____, maxlen=____)
x_train[0][:10]

# Look at the shape of both again.

# In[120]:

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Get the word index and print a few samples.

# In[121]:

index = reuters.____()
reverse_index = {value: key for (key, value) in index.items()}

for ____, ____ in reverse_index.____():
    ____

# Decode one sentence to see what the documents look like.

# In[122]:

print(decode_sentence(____[1], ____))

# Train your model.

# In[123]:

batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250

# In[127]:

filepath = './models/model_reuters/'  # change to the directory where the model is saved

inputs = Input(shape=(maxlen,), dtype=tf.int32)
embedded_sequences = Embedding(max_features, embedding_dims)(inputs)
out = Conv1D(
    filters,
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1
)(embedded_sequences)
out = Dropout(0.4)(out)
out = GlobalMaxPooling1D()(out)
out = Dense(hidden_dims, activation='relu')(out)
out = Dropout(0.4)(out)
outputs = Dense(46, activation='softmax')(out)
model = Model(inputs=inputs, outputs=_____)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(
    _____, _____,
    batch_size=256,
    epochs=3,
    validation_data=(x_test, y_test)
)

# Save it with a unique name.

# In[ ]:

if not os.path.exists(filepath):
    os.makedirs(filepath)
model.save(os.path.join(filepath, '_____.h5'))

# In[128]:

layer = model.layers[1]
layer

# In[129]:

n_steps = 50
method = "gausslegendre"
internal_batch_size = 100
ig = IntegratedGradients(
    _____,
    layer=_____,
    n_steps=_____,
    method=_____,
    internal_batch_size=_____
)

# Pick a few samples and predict their class.

# In[130]:

nb_samples = _____
_____ = x_test[:nb_samples]
predictions = model(_____).numpy().argmax(axis=1)
predictions[:4]

# Explain the samples above. No baseline needed.

# In[131]:

explanation = ig.explain(
    _____,
    baselines=None,
    target=_____,
    attribute_to_layer_inputs=False
)

# In[132]:

# Metadata from the explanation object
explanation.meta

# In[133]:

# Data fields from the explanation object
explanation.data.keys()

# Get the attribution values from your explainer.

# In[134]:

# Get attribution values from the explanation object
attrs = _____._____[0]
print('Attributions shape:', attrs.shape)
attrs = attrs.sum(axis=2)
print('Attributions shape:', attrs.shape)

# Visualize the attributions of your model.

# In[136]:

i = 0
x_i = x_test_sample[i]
attrs_i = attrs[i]
pred = predictions[i]

# In[138]:

words = decode_sentence(x_i, reverse_index).split()

# In[ ]:

colors = colorize(attrs_i)

# Evaluate different samples as well and see if the highlighted words make sense as the
# main contributors to the prediction.

# In[140]:

HTML("".join(list(map(hlstr, words, colors))))

# In[ ]:

print('Predicted label = {}'.format(pred))

# The predicted class name is not immediately available in the dataset, but you can look up
# the value equivalent to your predicted label in the table below.
# | Index | Class name | Nr of docs (train) | Nr of docs (test) | Mean nr of words (train) |
# |:---:|:---:|:---:|:---:|:---:|
# | 0 | cocoa | 55 | 12 | 225.78 |
# | 1 | grain | 432 | 105 | 188.67 |
# | 2 | veg-oil | 74 | 20 | 184.86 |
# | 3 | earn | 3159 | 813 | 87.67 |
# | 4 | acq | 1949 | 474 | 135.83 |
# | 5 | wheat | 17 | 5 | 213.35 |
# | 6 | copper | 48 | 14 | 154.46 |
# | 7 | housing | 16 | 3 | 180.38 |
# | 8 | money-supply | 139 | 38 | 191.48 |
# | 9 | coffee | 101 | 25 | 225.87 |
# | 10 | sugar | 124 | 30 | 184.73 |
# | 11 | trade | 390 | 83 | 253.80 |
# | 12 | reserves | 49 | 13 | 186.92 |
# | 13 | ship | 172 | 37 | 164.66 |
# | 14 | cotton | 26 | 2 | 142.69 |
# | 15 | carcass | 20 | 9 | 170.45 |
# | 16 | crude | 444 | 99 | 219.79 |
# | 17 | nat-gas | 39 | 12 | 149.82 |
# | 18 | cpi | 66 | 20 | 146.85 |
# | 19 | money-fx | 549 | 133 | 185.34 |
# | 20 | interest | 269 | 70 | 201.00 |
# | 21 | gnp | 100 | 27 | 281.83 |
# | 22 | meal-feed | 15 | 7 | 183.73 |
# | 23 | alum | 41 | 12 | 157.34 |
# | 24 | oilseed | 62 | 19 | 151.24 |
# | 25 | gold | 92 | 31 | 152.38 |
# | 26 | tin | 24 | 8 | 259.88 |
# | 27 | strategic-metal | 15 | 4 | 145.27 |
# | 28 | livestock | 48 | 10 | 177.48 |
# | 29 | retail | 19 | 4 | 258.32 |
# | 30 | ipi | 45 | 12 | 175.78 |
# | 31 | iron-steel | 39 | 13 | 157.51 |
# | 32 | rubber | 32 | 10 | 207.44 |
# | 33 | heat | 11 | 5 | 115.55 |
# | 34 | jobs | 50 | 7 | 152.94 |
# | 35 | lei | 10 | 6 | 142.30 |
# | 36 | bop | 49 | 11 | 228.45 |
# | 37 | zinc | 19 | 2 | 164.74 |
# | 38 | orange | 19 | 3 | 130.21 |
# | 39 | pet-chem | 24 | 5 | 153.96 |
# | 40 | dlr | 36 | 10 | 278.39 |
# | 41 | gas | 30 | 8 | 175.07 |
# | 42 | silver | 13 | 3 | 197.92 |
# | 43 | wpi | 21 | 6 | 152.71 |
# | 44 | hog | 12 | 5 | 90.75 |
# | 45 | lead | 18 | 1 | 159.89 |
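# For convenience, the class names from the table can be collected into a list indexed by
# class id, so the label predicted in the exercise maps straight to a topic name. The names
# below are copied from the table above; `pred` is the predicted label computed earlier in
# the exercise.

# In[ ]:

# Topic names for the 46 Reuters classes, in index order (from the table above)
reuters_classes = [
    'cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper', 'housing',
    'money-supply', 'coffee', 'sugar', 'trade', 'reserves', 'ship', 'cotton',
    'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx', 'interest', 'gnp',
    'meal-feed', 'alum', 'oilseed', 'gold', 'tin', 'strategic-metal',
    'livestock', 'retail', 'ipi', 'iron-steel', 'rubber', 'heat', 'jobs',
    'lei', 'bop', 'zinc', 'orange', 'pet-chem', 'dlr', 'gas', 'silver',
    'wpi', 'hog', 'lead'
]
print('Predicted topic:', reuters_classes[pred])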