#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pandas as pd
import numpy as np
import dask.dataframe as dd


# In[ ]:


from fairlearn.metrics import (
    MetricFrame,
    true_positive_rate,
    false_negative_rate,
    false_positive_rate,
    count
)


# In[ ]:


import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()


# In[ ]:


from collections import defaultdict
import logging

logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)


# # Sample Notebook - Face Validation

# This Jupyter notebook walks you through an example of assessing a face validation system for potential fairness-related disparities. You can either use the provided sample CSV file `face_verify_sample_rand_data.csv` or use your own dataset.

# In[ ]:


import zipfile
from raiutils.dataset import fetch_dataset

outdirname = 'responsibleai.12.28.21'
zipfilename = outdirname + '.zip'
fetch_dataset('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)

with zipfile.ZipFile(zipfilename, 'r') as unzip:
    unzip.extractall('.')

results_csv = "face_verify_sample_rand_data.csv"


# In[ ]:


df = pd.read_csv(results_csv, index_col=0)


# In[ ]:


df.head()


# Our fairness assessment can be broken down into three tasks:
#
# 1. Identify harms and which groups may be harmed.
#
# 2. Define fairness metrics to quantify harms.
#
# 3. Compare our quantified harms across the relevant groups.

# ## 1.) Identify which groups may be harmed and how

# The first step of our fairness assessment is understanding which groups are more likely to be *adversely affected* by our face verification system.

# The work of Joy Buolamwini and Timnit Gebru on *Gender Shades* ([Buolamwini and Gebru, 2018](http://proceedings.mlr.press/v81/buolamwini18a/buolamwini18a.pdf)) showed a performance disparity in the accuracy of commercially available facial recognition systems between darker-skinned women and lighter-skinned men. One key takeaway from this work is the importance of intersectionality when conducting a fairness assessment. For this fairness assessment, we will explore performance disparities disaggregated by `race` and `gender`.

# Using the terminology recommended by the [Fairlearn User Guide](https://fairlearn.org/v0.7.0/user_guide/fairness_in_machine_learning.html#fairness-of-ai-systems), we are interested in mitigating **quality-of-service harms**. **Quality-of-service** harms occur when a system fails to achieve the same level of performance for one person as it does for others, even when no opportunities or resources are withheld. The face validation system produces this harm if it fails to validate faces for members of one demographic group at a higher rate than for other demographic groups.

# In[ ]:


sensitive_features = ["race", "gender"]


# In[ ]:


df.groupby(sensitive_features)["golden_label"].mean()


# The `matching_score` represents the probability that the two images show the same face, according to the vision model. We say two faces *match* if the `matching_score` is greater than or equal to a specific threshold, `0.5` by default. Based on your needs, you can increase or decrease this threshold to any value between `0.0` and `1.0`.

# In[ ]:


threshold = 0.5
df.loc[:, "matching_score_binary"] = df["matching_score"] >= threshold
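# As an optional sanity check (not part of the original walkthrough), we can look at how often the thresholded score predicts a match within each demographic group before moving on. This assumes the `df`, `threshold`, and `sensitive_features` objects defined above.

# In[ ]:


# Optional sanity check: share of image pairs predicted to match at the
# current threshold, disaggregated by the sensitive features defined above.
df.groupby(sensitive_features)["matching_score_binary"].mean()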
# ## 2.) Define fairness to quantify harms

# The second step of our fairness assessment is to translate our fairness-related harms into quantifiable metrics. With face validation, there are two harms we should consider:
#
# 1. *False positives* occur when two different faces are considered by the system to be a match. A *false positive* can be extremely dangerous in many cases, such as security authentication: we would not want someone to unlock another person's phone because of a Face ID false positive.
#
# 2. *False negatives* occur when two pictures of the same person are not considered to be a match by the system. A *false negative* may result in an individual being locked out of their account due to a failed facial verification. In many cases, however, *false negatives* are not nearly as harmful as *false positives*.

# To assess fairness-related disparities using the `MetricFrame`, we must first specify our *sensitive features* `A` along with our `fairness_metrics`. In this scenario, we will look at three different *fairness metrics*:
# - `count`: The number of data points in each demographic category.
# - `FNR`: The false negative rate for the group.
# - `FPR`: The false positive rate for the group.

# With our system, we want to keep the *false positive rate* as low as possible while also not allowing too much disparity in the *false negative rate* across groups. For our example, we will look at the system's performance disaggregated by `race` and `gender`.

# In[ ]:


A, Y = df.loc[:, sensitive_features], df.loc[:, "golden_label"]
Y_pred = df.loc[:, "matching_score_binary"]


# In[ ]:


fairness_metrics = {
    "count": count,
    "FNR": false_negative_rate,
    "FPR": false_positive_rate
}


# ## 3.) Compare quantified harms across different groups

# In the final step of our fairness assessment, we instantiate our `MetricFrame` with the following parameters:
#
# - *metrics*: The metrics of interest for our fairness assessment.
# - *y_true*: The ground truth labels for the ML task.
# - *y_pred*: The model's predicted labels for the ML task.
# - *sensitive_features*: The set of feature(s) for our fairness assessment.

# In[ ]:


metricframe = MetricFrame(
    metrics=fairness_metrics,
    y_true=Y,
    y_pred=Y_pred,
    sensitive_features=A
)


# With our `MetricFrame`, we can use the `by_group` property to view our `fairness_metrics` disaggregated by demographic group.

# In[ ]:


metricframe.by_group


# With the `difference` method, we can view the maximal disparity in each metric. We see a maximal `false negative rate difference` between `Black female` and `White male` of `0.0177`.

# In[ ]:


metricframe.difference()
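# Beyond `difference()`, a `MetricFrame` exposes a few other aggregations that can be useful here, such as `group_min`, `group_max`, and `difference(method="to_overall")`. The cell below is an optional sketch, not part of the original walkthrough, that prints these summaries for the `metricframe` built above.

# In[ ]:


# Optional sketch: additional MetricFrame aggregations.
# group_min / group_max report the best and worst value of each metric across groups;
# difference(method="to_overall") compares each group to the overall metric value
# rather than comparing the extreme groups to each other.
print(metricframe.group_min())
print(metricframe.group_max())
print(metricframe.difference(method="to_overall"))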
# ### Applying Different Thresholds

# In the previous section, we used a *threshold* of `0.5` to determine the minimum `matching_score` needed for a successful match. In practice, we could choose any *threshold* between 0.0 and 1.0 to obtain a *false negative rate* and *false positive rate* acceptable for the specific task.
#
# Now we're going to explore how changing the threshold affects the resulting *false positive rate* and *false negative rate*.

# In[ ]:


def update_dictionary_helper(dictionary, results):
    # Append each group's metric value to that group's running list of results.
    for (k, v) in results.items():
        dictionary[k].append(v)
    return dictionary


# The following function iterates through a set of candidate thresholds and computes the resulting model predictions at each one. It then creates a `MetricFrame` to compute the disaggregated metrics at that threshold level.

# In[ ]:


def compute_group_thresholds_dask(dataframe, metric, A, bins=10):
    # Sweep candidate thresholds and record the disaggregated metric at each one.
    thresholds = np.linspace(0, 1, bins + 1)[1:]
    full_dict = defaultdict(list)
    for threshold in thresholds:
        Y_pred_threshold = dataframe.loc[:, "matching_score"] >= threshold
        metricframe_threshold = MetricFrame(
            metrics={metric.__name__: metric},
            y_true=dataframe.loc[:, "golden_label"],
            y_pred=Y_pred_threshold,
            sensitive_features=A
        )
        results = metricframe_threshold.by_group[metric.__name__].to_dict()
        full_dict = update_dictionary_helper(full_dict, results)
    return full_dict


# Using the `plot_thresholds` function, we can visualize the `false_positive_rate` and `false_negative_rate` for the data at each *threshold* level.

# In[ ]:


def plot_thresholds(thresholds, thresholds_dict, metric):
    # Plot one line per demographic group across the swept thresholds.
    plt.figure(figsize=[12, 8])
    for (k, vals) in thresholds_dict.items():
        plt.plot(thresholds, vals, label=f"{k}")
        plt.scatter(thresholds, vals, s=20)
    plt.xlabel("Threshold")
    plt.xticks(thresholds)
    plt.ylabel(f"{metric.__name__}")
    plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    plt.grid(which="both", axis="both", color='gray',
             linestyle='dashdot', linewidth=1)


# In[ ]:


thresholds = np.linspace(0, 1, 11)[1:]
fn_thresholds_dict = compute_group_thresholds_dask(df, false_negative_rate, A)
fp_thresholds_dict = compute_group_thresholds_dask(df, false_positive_rate, A)


# From the visualizations, we see that the *false_negative_rate* for all groups increases as the threshold increases. Furthermore, the maximal `false_negative_rate_difference` occurs between *White Female* and *Black Male* when the `threshold` is set to `0.7`.

# In[ ]:


plot_thresholds(thresholds, fn_thresholds_dict, false_negative_rate)


# In[ ]:


plot_thresholds(thresholds, fp_thresholds_dict, false_positive_rate)


# If it were essential to keep the *false_positive_rate* at 0 for all groups, then according to the plots above, we simply need to choose a *threshold* greater than or equal to 0.5. However, increasing the *threshold* above `0.5` in our data also increases the **absolute false negative rate** across all groups, as well as the *relative false negative rate difference* between groups.
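# To make this trade-off concrete, the following optional sketch (not part of the original walkthrough) reuses `thresholds`, `fp_thresholds_dict`, and `fn_thresholds_dict` from above: it keeps only the thresholds at which every group's false positive rate is 0, and reports the spread of the group false negative rates at each such threshold.

# In[ ]:


# Optional sketch: among thresholds where every group's FPR is 0, report the
# gap (max - min) between group false negative rates at each such threshold.
candidate_idx = [
    i for i in range(len(thresholds))
    if all(vals[i] == 0 for vals in fp_thresholds_dict.values())
]
fnr_gaps = {
    thresholds[i]: (max(vals[i] for vals in fn_thresholds_dict.values())
                    - min(vals[i] for vals in fn_thresholds_dict.values()))
    for i in candidate_idx
}
fnr_gaps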
# ### Comparison to Synthetic Disparity

# In our dataset, there isn't a substantial disparity in the `false_negative_rate` between the different demographic groups. In this section, we will introduce a synthetic `race_synth` group to illustrate what the results would look like if a disparity were present. We generate the `race_synth` rows so that the group is uncorrelated with `gender` and its `matching_score` depends entirely on the `golden_label`.

# If the synthetic `golden_label` is `0`, the synthetic `matching_score` is drawn from `Uniform(0, 0.5)`; if it is `1`, the `matching_score` is drawn from `Uniform(0, 1)`. The function `create_disparity` below creates additional rows for the DataFrame using this process.

# In[ ]:


def create_disparity(dataframe, num_rows=2000):
    # Generate num_rows synthetic image pairs labelled with the "race_synth" group.
    n = dataframe.shape[0]
    synth_ground_truth = np.random.randint(low=0, high=2, size=num_rows)
    synth_gender = np.random.choice(["Male", "Female"], size=num_rows)
    # golden_label == 0 -> Uniform(0, 0.5); golden_label == 1 -> Uniform(0, 1)
    synth_match_score = np.random.random(size=num_rows) / (2.0 - synth_ground_truth)
    new_indices = range(n, n + num_rows)
    src_imgs = [f"Source_Img_{i}" for i in new_indices]
    dst_imgs = [f"Target_Img_{i}" for i in new_indices]
    synth_rows = pd.DataFrame.from_dict({
        "source_image": src_imgs,
        "target_image": dst_imgs,
        "race": ["race_synth" for i in new_indices],
        "gender": synth_gender,
        "golden_label": synth_ground_truth,
        "matching_score": synth_match_score
    })
    return synth_rows


# In[ ]:


disp = create_disparity(df)


# In[ ]:


synth_df = pd.concat([df, disp], axis=0)


# In[ ]:


synth_df.loc[:, "matching_score_binary"] = synth_df["matching_score"] >= threshold


# Now we create another `MetricFrame` with the same parameters as the one above.

# In[ ]:


synth_metricframe = MetricFrame(
    metrics=fairness_metrics,
    y_true=synth_df.loc[:, "golden_label"],
    y_pred=synth_df.loc[:, "matching_score_binary"],
    sensitive_features=synth_df.loc[:, sensitive_features]
)


# Now when we call `by_group` on this new `MetricFrame`, we can easily see the substantial disparity between the `race_synth` groups and the other racial groups.

# In[ ]:


synth_metricframe.by_group


# In[ ]:


synth_metricframe.difference()


# In[ ]:


synth_metricframe.by_group.plot(
    kind="bar",
    y="FNR",
    figsize=[12, 8],
    title="FNR by Race and Gender"
)


# ## Fairness Assessment Dashboard

# With the `raiwidgets` library, we can use the `FairnessDashboard` to visualize the disparities between our different `race` and `gender` demographics. We pass our *sensitive features*, *golden labels*, and *thresholded matching scores* into the dashboard.

# In[ ]:


from raiwidgets import FairnessDashboard


# We instantiate the `FairnessDashboard` by passing in three parameters:
# - `sensitive_features`: The set of sensitive features
# - `y_true`: The ground truth labels
# - `y_pred`: The model's predicted labels
#
# The `FairnessDashboard` can be viewed either within this Jupyter notebook or by going to the separate *localhost* URL.

# In[ ]:


FairnessDashboard(
    sensitive_features=synth_df.loc[:, sensitive_features],
    y_true=synth_df.loc[:, "golden_label"],
    y_pred=synth_df.loc[:, "matching_score_binary"]
)
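# As an optional follow-up (not part of the original notebook), Fairlearn also provides scalar summary metrics such as `equalized_odds_difference`, which reports the larger of the true positive rate difference and the false positive rate difference across groups. The sketch below applies it to the synthetic data; passing the two-column sensitive-features DataFrame is assumed to behave the same way it does for `MetricFrame`.

# In[ ]:


# Optional scalar summary of disparity on the synthetic data.
from fairlearn.metrics import equalized_odds_difference

equalized_odds_difference(
    y_true=synth_df.loc[:, "golden_label"],
    y_pred=synth_df.loc[:, "matching_score_binary"],
    sensitive_features=synth_df.loc[:, sensitive_features]
)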