This tutorial demonstrates how the library can be used to measure how sensitive an evaluation outcome is to the choice of hyperparameters, e.g., the baseline value used to mask an image, the patch size, or the number of runs. We use a LeNet model and the CIFAR-10 dataset to showcase the library's functionality, and evaluate Faithfulness Correlation by Bhatt et al., 2020.
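To make the question concrete, the snippet below is a minimal sketch of the pattern this tutorial follows: scoring the same batch of explanations twice while varying only the baseline value used for masking. It assumes the model, inputs, labels and attributions (model, x_batch, y_batch, a_batch) are prepared as in the cells below, and that hyperparameters can be passed to quantus.FaithfulnessCorrelation as keyword arguments; the exact constructor signature may differ between Quantus versions.
# Sketch (assumptions as described above): score identical explanations under two baseline choices.
scores = {}
for baseline in ["black", "uniform"]:
    metric = quantus.FaithfulnessCorrelation(perturb_baseline=baseline)
    scores[baseline] = np.mean(metric(model=model,
                                      x_batch=x_batch,
                                      y_batch=y_batch,
                                      a_batch=a_batch,
                                      **{"device": device}))
# If the two scores differ substantially, the evaluation outcome is baseline-sensitive.
print(scores)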
# Mount Google Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Install packages.
from IPython.display import clear_output
!pip install captum opencv-python xmltodict
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html
clear_output()
# General imports.
import sys
import gc
import warnings
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
from torchvision import transforms
import captum
from captum.attr import *
import random
import os
import cv2
# Import package.
path = "/content/drive/MyDrive/Projects"
sys.path.append(f'{path}/quantus')
import quantus
# Collect garbage.
gc.collect()
torch.cuda.empty_cache()
# Notebook settings.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
warnings.filterwarnings("ignore", category=UserWarning)
%load_ext autoreload
%autoreload 2
clear_output()
# Load datasets and make loaders.
test_samples = 200
transformer = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
train_set = torchvision.datasets.CIFAR10(root='./sample_data', train=True, transform=transformer, download=True)
test_set = torchvision.datasets.CIFAR10(root='./sample_data', train=False, transform=transformer, download=True)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True, pin_memory=True) # num_workers=4,
test_loader = torch.utils.data.DataLoader(test_set, batch_size=200, pin_memory=True)
# Specify class labels.
classes = {0: 'plane', 1: 'car', 2: 'bird', 3: 'cat', 4: 'deer', 5: 'dog', 6: 'frog', 7: 'horse', 8: 'ship', 9: 'truck'}
# Load a batch of inputs and outputs to use for evaluation.
x_batch, y_batch = next(iter(test_loader))
x_batch, y_batch = x_batch.to(device), y_batch.to(device)
def plot_images(images) -> None:
"""Plot some images."""
fig = plt.figure(figsize=(10, 10))
img = images / 2 + 0.5
plt.imshow(np.transpose(img.cpu().numpy(), (1, 2, 0)))
plt.axis("off")
plt.show()
# Plot image examples!
plot_images(torchvision.utils.make_grid(x_batch[:6, :, :, :]))
Next, we define and train a simple LeNet-style model (or any other model of choice). The network architecture and training procedure are partly adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py.
class Net(torch.nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv_1 = torch.nn.Conv2d(3, 6, 5)
self.pool_1 = torch.nn.MaxPool2d(2, 2)
self.pool_2 = torch.nn.MaxPool2d(2, 2)
self.conv_2 = torch.nn.Conv2d(6, 16, 5)
self.fc_1 = torch.nn.Linear(16 * 5 * 5, 120)
self.fc_2 = torch.nn.Linear(120, 84)
self.fc_3 = torch.nn.Linear(84, 10)
self.relu_1 = torch.nn.ReLU()
self.relu_2 = torch.nn.ReLU()
self.relu_3 = torch.nn.ReLU()
self.relu_4 = torch.nn.ReLU()
def forward(self, x):
x = self.pool_1(self.relu_1(self.conv_1(x)))
x = self.pool_2(self.relu_2(self.conv_2(x)))
x = x.view(-1, 16 * 5 * 5)
x = self.relu_3(self.fc_1(x))
x = self.relu_4(self.fc_2(x))
x = self.fc_3(x)
return x
# Load model architecture.
model = Net()
print(f"\n Model architecture: {model.eval()}\n")
Model architecture: Net(
  (conv_1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool_1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv_2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc_1): Linear(in_features=400, out_features=120, bias=True)
  (fc_2): Linear(in_features=120, out_features=84, bias=True)
  (fc_3): Linear(in_features=84, out_features=10, bias=True)
  (relu_1): ReLU()
  (relu_2): ReLU()
  (relu_3): ReLU()
  (relu_4): ReLU()
)
def train_model(model,
                train_data: torchvision.datasets,
                test_data: torchvision.datasets,
                device: torch.device,
                epochs: int = 20,
                criterion: torch.nn.Module = torch.nn.CrossEntropyLoss(),
                optimizer: torch.optim.Optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9),
                evaluate: bool = False):
"""Train torch model."""
model.train()
for epoch in range(epochs):
for images, labels in train_data:
images, labels = images.to(device), labels.to(device)
optimizer.zero_grad()
logits = model(images)
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
# Evaluate model!
if evaluate:
predictions, labels = evaluate_model(model, test_data, device)
test_acc = np.mean(np.argmax(predictions.cpu().numpy(), axis=1) == labels.cpu().numpy())
print(f"Epoch {epoch+1}/{epochs} - test accuracy: {(100 * test_acc):.2f}% and CE loss {loss.item():.2f}")
return model
def evaluate_model(model, data, device):
"""Evaluate torch model."""
model.eval()
logits = torch.Tensor().to(device)
targets = torch.LongTensor().to(device)
with torch.no_grad():
for images, labels in data:
images, labels = images.to(device), labels.to(device)
logits = torch.cat([logits, model(images)])
targets = torch.cat([targets, labels])
return torch.nn.functional.softmax(logits, dim=1), targets
path_model_weights = path + "/quantus/tutorials/assets/cifar10"
if pathlib.Path(path_model_weights).is_file():
model.load_state_dict(torch.load(path_model_weights, map_location=device))
else:
# Train and evaluate model.
model = train_model(model=model.to(device),
train_data=train_loader,
test_data=test_loader,
device=device,
epochs=20,
criterion=torch.nn.CrossEntropyLoss().to(device),
optimizer=torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9),
evaluate=True)
# Save model.
torch.save(model.state_dict(), path_model_weights)
# Model to GPU and eval mode.
model.to(device)
model.eval()
# Check test set performance.
predictions, labels = evaluate_model(model, test_loader, device)
test_acc = np.mean(np.argmax(predictions.cpu().numpy(), axis=1) == labels.cpu().numpy())
print(f"Model test accuracy: {(100 * test_acc):.2f}%")
Model test accuracy: 59.51%
# Load some attributions and plot them.
a_batch = quantus.explain(model,
x_batch,
y_batch,
method="IntegratedGradients",
**{"normalize": True})
# Plot exemplary inputs!
nr_images = 3
fig, axes = plt.subplots(nrows=nr_images, ncols=2, figsize=(nr_images*1.5, int(nr_images*3)))
for i in range(nr_images):
axes[i, 0].imshow(np.moveaxis(np.clip(x_batch[i].cpu().numpy(), 0, 1), 0, -1),
vmin=0.0, vmax=1.0)
axes[i, 0].title.set_text(f"CIFAR-10 - {classes[y_batch[i].item()]}")
axes[i, 0].axis("off")
axes[i, 1].imshow(a_batch[i], cmap="seismic")
axes[i, 1].title.set_text(f"IG_norm=[0, 1]")
axes[i, 1].axis("off")
plt.show()
We want to understand how sensitive the evaluation outcome of Faithfulness Correlation (Bhatt et al., 2020) is to its hyperparameters.
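To recap (our paraphrase of Bhatt et al., 2020): Faithfulness Correlation repeatedly draws a random subset $S$ of features, replaces those features with a baseline value $\bar{x}$, and correlates the attribution mass on $S$ with the resulting change in the model output:

$$
\mu_F(f, g; \mathbf{x}) = \underset{S \subseteq \{1, \dots, d\},\; |S| = s}{\mathrm{corr}} \Big( \sum_{i \in S} g(f, \mathbf{x})_i,\; f(\mathbf{x}) - f\big(\mathbf{x}_{[x_S = \bar{x}_S]}\big) \Big)
$$

Here the baseline $\bar{x}$ (perturb_baseline), the subset size $s$ (subset_size), the number of sampled subsets (nr_runs) and the correlation measure (similarity_func) are exactly the hyperparameters we vary below.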
# Let's list the default parameters of the metric.
metric = quantus.FaithfulnessCorrelation()
WARNINGS. The Faithfulness correlation metric is likely to be sensitive to the choice of baseline value 'perturb_baseline', size of subset |S| 'subset_size' and the number of runs (for each input and explanation pair) 'nr_runs'. Go over and select each hyperparameter of the metric carefully to avoid misinterpretation of scores. To view all relevant hyperparameters call .get_params of the metric instance. For further reading: Bhatt, Umang, Adrian Weller, and José MF Moura. 'Evaluating and aggregating feature-based model explanations.' arXiv preprint arXiv:2005.00631 (2020). Normalising attributions may destroy or skew information in the explanation and as a result, affect the overall evaluation outcome.
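Following the pointer in the warning, we can list the metric's current hyperparameter settings. The snippet below assumes get_params is exposed as a property, as the warning text suggests; in other versions it may be a method.
# List the metric's hyperparameters (see the warning above for the relevant ones).
print(metric.get_params)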
# Compute explanations with several attribution methods.
a_batch = Saliency(model).attribute(inputs=x_batch, target=y_batch, abs=True).sum(axis=1).cpu().numpy()
a_batch_occ = Occlusion(model).attribute(inputs=x_batch, target=y_batch, sliding_window_shapes=(1, 4, 4)).sum(axis=1).cpu().numpy()
a_batch_ig = IntegratedGradients(model.to(device)).attribute(inputs=x_batch,
target=y_batch,
baselines=torch.zeros_like(x_batch),
n_steps=10,
method="riemann_trapezoid").sum(axis=1).cpu().numpy()
a_batch_gh = GradientShap(model).attribute(inputs=x_batch,
target=y_batch,
baselines=torch.zeros_like(x_batch),).sum(axis=1).cpu().data.numpy()
# Metric class expects numpy arrays.
x_batch, y_batch = x_batch.cpu().numpy(), y_batch.cpu().numpy()
# Define some parameter settings to evaluate.
baseline_strategies = ["mean", "random", "uniform", "black", "white"]
subset_sizes = np.array([ 2, 52, 102])
iterations = [100, 200]
absolutes = [True, False]
normalisations = [True, False]
sim_funcs = {"pearson": quantus.correlation_pearson, "spearman": quantus.correlation_spearman}
result = {
"Normalise": [],
"Absolute": [],
"Similarity function": [],
"Faithfulness score": [],
"Baseline strategy": [],
"Subset size": [],
"Method": [],
"Iterations": [],
}
methods = {"Saliency": a_batch, "Occlusion": a_batch_occ, "Integrated Gradients": a_batch_ig, "GradShap": a_batch_gh}
path_sensitivity_results = path + "/quantus/tutorials/assets/data/sensitivity_results.csv"
if pathlib.Path(path_sensitivity_results).is_file():
    df = pd.read_csv(path_sensitivity_results)
else:
    # Score explanations!
    for b in baseline_strategies:
        for s in subset_sizes:
            for nr in iterations:
                for a in absolutes:
                    for n in normalisations:
                        for method, attr in methods.items():
                            for sim, sim_func in sim_funcs.items():
                                score = np.mean(quantus.FaithfulnessCorrelation(**{'abs': a,
                                                                                   'normalize': n,
                                                                                   'normalize_func': quantus.normalize_by_max,
                                                                                   'nr_runs': nr,
                                                                                   'perturb_baseline': b,
                                                                                   'perturb_func': quantus.baseline_replacement_by_indices,
                                                                                   'similarity_func': sim_func,
                                                                                   'subset_size': s})(model=model.to(device),
                                                                                                      x_batch=x_batch,
                                                                                                      y_batch=y_batch,
                                                                                                      a_batch=attr,
                                                                                                      **{"device": device}))
                                result["Normalise"].append(n)
                                result["Absolute"].append(a)
                                result["Method"].append(method)
                                result["Baseline strategy"].append(b.capitalize())
                                result["Subset size"].append(s)
                                result["Iterations"].append(nr)
                                result["Faithfulness score"].append(score)
                                result["Similarity function"].append(sim)
    df = pd.DataFrame(result)
    df["Rank"] = df.groupby(['Baseline strategy', 'Subset size', 'Iterations', 'Similarity function'])["Faithfulness score"].rank()
    df.to_csv(path + "/quantus/tutorials/assets/data/sensitivity_results.csv")
# Smaller adjustments.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.replace(to_replace="Integrated Gradients", value="Integrated\nGradients", inplace=True)
df.replace(value="GradShap", to_replace="GS", inplace=True)
df.columns = map(lambda x: str(x).capitalize(), df.columns)
path_sensitivity_results = "sensitivity_results_200_extra_extra.csv" #"drive/MyDrive/Projects/quantus/tutorials/assets/data/sensitivity_results.csv"
if pathlib.Path(path_sensitivity_results).is_file():
df = pd.read_csv(path_sensitivity_results)
# Smaller adjustments.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.replace(to_replace="Integrated Gradients", value="Integrated\nGradients", inplace=True)
df.replace(value="GradShap", to_replace="GS", inplace=True)
df.columns = map(lambda x: str(x).capitalize(), df.columns)
df["Rank_2"] = df.groupby(['Baseline strategy', 'Subset size', 'Iterations', 'Similarity function'])["Faithfulness score"].rank()
df
Normalise | Absolute | Faithfulness score | Baseline strategy | Subset size | Method | Iterations | Similarity function | Rank | Rank_2 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | True | True | 0.029557 | Mean | 2 | Saliency | 100 | pearson | 3.0 | 7.0 |
1 | True | True | 0.018670 | Mean | 2 | Integrated\nGradients | 100 | pearson | 2.0 | 3.0 |
2 | True | True | 0.024692 | Mean | 2 | Saliency | 100 | spearman | 3.0 | 8.0 |
3 | True | False | 0.021572 | Mean | 2 | Integrated\nGradients | 100 | pearson | 3.0 | 5.0 |
4 | True | False | -0.063219 | Mean | 2 | Saliency | 100 | spearman | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2155 | False | False | 0.048248 | White | 142 | Occlusion | 500 | pearson | 3.0 | 9.0 |
2156 | False | False | 0.023836 | White | 142 | Integrated\nGradients | 500 | pearson | 2.0 | 4.0 |
2157 | False | False | -0.019863 | White | 142 | Saliency | 500 | spearman | 1.0 | 1.0 |
2158 | False | False | -0.016180 | White | 142 | Occlusion | 500 | spearman | 2.0 | 2.0 |
2159 | False | False | 0.041108 | White | 142 | Integrated\nGradients | 500 | spearman | 3.0 | 9.0 |
2160 rows × 10 columns
# Inspect the mean scores per hyperparameter configuration.
df.groupby(['Baseline strategy', 'Subset size', 'Iterations', 'Similarity function']).mean()
#df["Rank"] = df.groupby(['Baseline strategy', 'Subset size', 'Iterations', 'Similarity function'])["Faithfulness score"].rank()
# Convert to datafame and rank.
df = pd.DataFrame(result)
df.to_csv(path + "/quantus/tutorials/assets/data/sensitivity_results.csv")
df["Rank"] = df.groupby(['Baseline strategy', 'Subset size', 'Iterations', 'Similarity function']).rank()
# Write to disk and re-open.
df.to_csv(path + "/quantus/tutorials/assets/data/sensitivity_results.csv")
df = pd.read_csv(path + "/quantus/tutorials/assets/data/sensitivity_results.csv")
# Smaller adjustments.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.replace(to_replace="Integrated Gradients", value="Integrated\nGradients", inplace=True)
df.replace(value="GradShap", to_replace="GS", inplace=True)
df.columns = map(lambda x: str(x).capitalize(), df.columns)
# Inspect the final results dataframe.
df
| | Normalise | Absolute | Faithfulness score | Baseline strategy | Subset size | Method | Iterations | Similarity function | Rank |
|---|---|---|---|---|---|---|---|---|---|
| 0 | True | True | 0.029557 | Mean | 2 | Saliency | 100 | pearson | 7.0 |
| 1 | True | True | 0.018670 | Mean | 2 | Integrated\nGradients | 100 | pearson | 3.0 |
| 2 | True | True | 0.024692 | Mean | 2 | Saliency | 100 | spearman | 8.0 |
| 3 | True | False | 0.021572 | Mean | 2 | Integrated\nGradients | 100 | pearson | 5.0 |
| 4 | True | False | -0.063219 | Mean | 2 | Saliency | 100 | spearman | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2155 | False | False | 0.048248 | White | 142 | Occlusion | 500 | pearson | 9.0 |
| 2156 | False | False | 0.023836 | White | 142 | Integrated\nGradients | 500 | pearson | 4.0 |
| 2157 | False | False | -0.019863 | White | 142 | Saliency | 500 | spearman | 1.0 |
| 2158 | False | False | -0.016180 | White | 142 | Occlusion | 500 | spearman | 2.0 |
| 2159 | False | False | 0.041108 | White | 142 | Integrated\nGradients | 500 | spearman | 9.0 |

2160 rows × 9 columns
# Group by rank and compute how often each method obtains each rank (in %).
df_view = df.groupby(["Method"])["Rank"].value_counts(normalize=True).mul(100).reset_index(name='Percentage').round(2)
# Add four hypothetical reference methods A-D that always obtain rank 1-4 (to anchor the plot),
# and reorder the methods for plotting purposes.
df_view_ordered = pd.DataFrame(columns=["Method", "Rank", "Percentage"])
df_view_ordered = df_view_ordered.append({'Method': 'Method A', 'Rank': 1.0, 'Percentage': 100}, ignore_index=True)
df_view_ordered = df_view_ordered.append({'Method': 'Method B', 'Rank': 2.0, 'Percentage': 100}, ignore_index=True)
df_view_ordered = df_view_ordered.append({'Method': 'Method C', 'Rank': 3.0, 'Percentage': 100}, ignore_index=True)
df_view_ordered = df_view_ordered.append({'Method': 'Method D', 'Rank': 4.0, 'Percentage': 100}, ignore_index=True)
df_view_ordered = df_view_ordered.append([df_view.loc[df_view["Method"] == 'Saliency']], ignore_index=True)
df_view_ordered = df_view_ordered.append([df_view.loc[df_view["Method"] == 'Occlusion']], ignore_index=True)
df_view_ordered = df_view_ordered.append([df_view.loc[df_view["Method"] == 'Integrated\nGradients']], ignore_index=True)
df_view_ordered = df_view_ordered.append([df_view.loc[df_view["Method"] == 'GradShap']], ignore_index=True)
df_view_ordered
# Plotting imports and style.
import seaborn as sns
import matplotlib.ticker as mtick
plt.style.use('seaborn-white')
sns.set(font_scale=1.5)
path = "drive/MyDrive/Projects/"
# Plot 1!
fig, ax = plt.subplots(figsize=(6.5,5))
ax = sns.histplot(x='Method', hue='Rank', weights='Percentage', multiple='stack', data=df_view_ordered, shrink=0.6, palette="colorblind", legend=False)
ax.spines["right"].set_visible(False)
ax.spines['top'].set_visible(False)
ax.tick_params(axis='both', which='major', labelsize=16)
ax.set_ylabel('Frequency of rank')
ax.set_xlabel('')
ax.set_xticklabels(["A", "B", "C", "D", "SAL", "OCC", "IG", "GD"])
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=4, fancybox=True, shadow=False, labels=['1st', "2nd", "3rd", "4th"])
plt.axvline(x=2.5, ymax=0.95, color='black', linestyle='-')
plt.tight_layout()
plt.savefig(f'{path}sensitivity_analysis_1.png', dpi = 400)
# Plot 2!
ax = sns.catplot(x="Baseline strategy", y="Rank", hue="Method", kind="bar", estimator=np.mean, hue_order=['SAL', 'OCC', 'IG', "GD"],
data=df, palette=sns.color_palette("husl", 3), legend=False, height=5, aspect=7.5/5.8)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=4, fancybox=True, shadow=False)
ax.set_ylabels('Mean Rank')
ax.set_xlabels('')
plt.tight_layout()
plt.savefig(f'{path}sensitivity_analysis_2.png', dpi = 400)
# Plot 3!
df_subset = df.loc[(df["Iterations"] == 100) & (df["Subset size"] == 102) & (df["Similarity function"] == "spearman")]
ax = sns.catplot(x="Baseline strategy", y="Rank", hue="Method", kind="bar", estimator=np.mean, hue_order=['Saliency', 'Occlusion', 'Integrated\nGradients'],
data=df_subset, palette=sns.color_palette("husl", 3), legend=False)
ax.set_xlabels('')
ax.set_ylabels('')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=False, shadow=False)
plt.tight_layout()
plt.show()