This notebook demonstrates the use of the `responsibleai` API to assess a Hugging Face transformers text classification model trained on the COVID-19 events dataset (see https://huggingface.co/datasets/joelito/covid19_emergency_event for more information about the dataset). It walks through the API calls necessary to create a widget with model analysis insights, then guides a visual analysis of the model.
The following section examines the code necessary to create datasets and a model. It then generates insights using the `responsibleai` API that can be visually analyzed.
The following section can be skipped. It loads a dataset and trains a model for illustrative purposes.
First we import all necessary dependencies
import datasets
import pandas as pd
import numpy as np
import zipfile
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
pipeline)
from raiutils.common.retries import retry_function
from pathlib import Path
try:
from urllib import urlretrieve
except ImportError:
from urllib.request import urlretrieve
Next, we load the COVID-19 events dataset from Hugging Face datasets.
# Sample-count constant; not referenced by the code visible in this notebook.
NUM_TEST_SAMPLES = 100
def load_dataset(split):
    """Load one split of the COVID-19 emergency event dataset as a DataFrame.

    Args:
        split: Name of the split to load (passed through to
            ``datasets.load_dataset``).

    Returns:
        A DataFrame with the ``language`` and ``text`` columns followed by
        the label columns ``event1`` through ``event8``.
    """
    raw_split = datasets.load_dataset(
        "joelito/covid19_emergency_event", split=split
    )
    column_names = ["language", "text"]
    column_names += ["event{}".format(i) for i in range(1, 9)]
    # Insertion order of the mapping fixes the column order of the frame.
    return pd.DataFrame({name: raw_split[name] for name in column_names})
def select_english_subset(dataset, language="en"):
    """Return the rows of ``dataset`` for one language, minus the language column.

    Args:
        dataset: DataFrame containing a ``language`` column.
        language: Language code to keep. Defaults to ``"en"`` so existing
            callers keep selecting the English subset.

    Returns:
        A new DataFrame holding only the matching rows, re-indexed from 0,
        with the ``language`` column dropped. The input is not modified.
    """
    # Bracket access avoids any clash between a column name and a
    # DataFrame attribute.
    matches = dataset["language"] == language
    subset = dataset[matches].reset_index(drop=True)
    return subset.drop(columns="language")
# Load each split and keep only its English rows.
pd_data = select_english_subset(load_dataset("train"))
pd_valid_data = select_english_subset(load_dataset("test"))

# Aliases used by the rest of the notebook.
train_data, test_data = pd_data, pd_valid_data
Fetch a pre-trained Hugging Face model for the COVID-19 events dataset.
# Base name shared by the model archive and the directory it is extracted to.
COVID19_EVENTS_MODEL_NAME = "covid19_events_model"

# Label vocabulary: event1 .. event8.
labels = ["event{}".format(n) for n in range(1, 9)]
NUM_LABELS = len(labels)

# Index <-> label lookups in both directions.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}
class FetchModel(object):
    """Download and unpack the pre-trained COVID-19 events model archive."""

    def __init__(self):
        pass

    def fetch(self):
        """Download the model zip if it is missing, then extract it.

        The archive is extracted into a directory named
        ``COVID19_EVENTS_MODEL_NAME`` in the current working directory.

        Raises:
            zipfile.BadZipFile: If the archive on disk is corrupt. The
                corrupt file is deleted first so that a subsequent call
                downloads it again instead of failing on the same file.
        """
        zip_path = Path(COVID19_EVENTS_MODEL_NAME + '.zip')
        if not zip_path.exists():
            url = ('https://publictestdatasets.blob.core.windows.net/models/' +
                   COVID19_EVENTS_MODEL_NAME + '.zip')
            # Download under a temporary name and rename only on success, so
            # an interrupted transfer never leaves a truncated file that
            # passes the exists() check on the next attempt.
            partial_path = Path(str(zip_path) + '.part')
            urlretrieve(url, str(partial_path))
            partial_path.replace(zip_path)
        try:
            with zipfile.ZipFile(str(zip_path), 'r') as unzip:
                unzip.extractall(COVID19_EVENTS_MODEL_NAME)
        except zipfile.BadZipFile:
            # Remove the unusable archive so a retry re-downloads it.
            zip_path.unlink()
            raise
def retrieve_covid19_events_model():
    """Fetch the model archive with retries, then load the classifier.

    Returns:
        The model returned by
        ``AutoModelForSequenceClassification.from_pretrained`` for
        ``COVID19_EVENTS_MODEL_NAME``, configured for multi-label
        classification with the notebook's label mappings.
    """
    # Up to 4 retries with a delay of 60 between attempts
    # (presumably seconds -- confirm against raiutils' retry_function).
    retry_function(
        FetchModel().fetch,
        "Model download",
        "Failed to download model",
        max_retries=4,
        retry_delay=60,
    )
    return AutoModelForSequenceClassification.from_pretrained(
        COVID19_EVENTS_MODEL_NAME,
        num_labels=NUM_LABELS,
        problem_type="multi_label_classification",
        id2label=id2label,
        label2id=label2id,
    )
# Fetch the model archive if needed and load the classifier.
model = retrieve_covid19_events_model()
Load the tokenizer and build a prediction pipeline around the model.
# Tokenizer from the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# A non-negative device index moves the model to CUDA; the default of -1
# leaves the model where it was loaded.
device = -1
if device >= 0:
    model = model.cuda()

# Prediction pipeline that reports a score for every label.
pred = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=device,
)
from responsibleai_text import RAITextInsights, ModelTask
from raiwidgets import ResponsibleAIDashboard
To use Responsible AI Dashboard, initialize a RAITextInsights object upon which different components can be loaded.
RAITextInsights accepts the model, the test dataset, the classes and the task type as its arguments.
# Build the insights object from the pipeline, the first three test rows,
# the label names, and the multi-label task type.
rai_insights = RAITextInsights(
    pred,
    test_data[:3],
    labels,
    task_type=ModelTask.MULTILABEL_TEXT_CLASSIFICATION,
)
Add the components of the toolbox for model assessment.
# Register the error analysis and explainer components on the insights object.
rai_insights.error_analysis.add()
rai_insights.explainer.add()
Once all the desired components have been loaded, compute insights on the test set.
# Run the computation for the components registered on the insights object.
rai_insights.compute()
Finally, visualize and explore the model insights. Use the resulting widget or follow the link to view this in a new tab.
# Launch the dashboard widget for the computed insights.
ResponsibleAIDashboard(rai_insights)