#!/usr/bin/env python # coding: utf-8 # # Recidivism # # This notebook contains my analysis of data presented in # "[Machine Bias](https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing)", Julia Angwin, Jeff Larson, Surya Mattu and Lauren Kirchner, ProPublica, May 23, 2016. # # I would like to thank the authors of that article for making their data and analysis freely available. They are a model of open science. # # # Copyright 2018 Allen Downey # # The code and text of this notebook are under this license: [Creative Commons Attribution 4.0 International](https://creativecommons.org/licenses/by/4.0) # # The data are from [this repository](https://github.com/propublica/compas-analysis), which contains the data and analysis pipeline described on [this web page](https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm). # # The terms of use for the data [are here](https://www.propublica.org/datastore/terms). In compliance with those terms, I am not distributing the data, but there is a link below that downloads it directly. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('config', "InteractiveShell.ast_node_interactivity='last_expr_or_assign'") import pandas as pd import numpy as np import matplotlib.pyplot as plt from sympy import symbols, Eq, solve from overthink import decorate # ### Metrics # # In this section I start with the [data reported here](https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm) and replicate the analysis there, computing various metrics based on the [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix): prevalence, sensitivity, specificity, false positive rate, false negative rate, positive predictive value, and negative predictive value. # # The following function takes an array and returns a Pandas DataFrame: # In[2]: def make_matrix(a): """Make a confusion matrix from an array. a: array or list of lists returns: DataFrame """ a = np.asarray(a).reshape((2, 2)) index = ['Survived', 'Recidivated'] columns = ['Low', 'High'] return pd.DataFrame(a, index=index, columns=columns) # Make the matrix for all defendants. # In[3]: a = [[2681, 1282], [1216, 2035]] matrix_all = make_matrix(a) # Confirm that the total matches what's reported in the article. # In[4]: np.sum(a) # Compute sensitivity and specificity. # In[5]: def percent(x, y): """Compute the percentage `x/(x+y)`. """ return x / (x+y) * 100 def sens_spec(m): """Compute sensitivity and specificity. m: confusion matrix """ tn, fp, fn, tp = m.values.flatten() sens = percent(tp, fn) spec = percent(tn, fp) return sens, spec # Compute sensitivity and specificity for all defendants. # In[6]: sens, spec = sens_spec(matrix_all) sens, spec # Compute error rates. # In[7]: def error_rates(m): """Compute false positive and false negative rate. m: confusion matrix """ tn, fp, fn, tp = m.values.flatten() fpr = percent(fp, tn) fnr = percent(fn, tp) return fpr, fnr # Compute error rates for all defendants. # In[8]: fpr, fnr = error_rates(matrix_all) fpr, fnr # Compute predictive value. # In[9]: def predictive_value(m): """Compute positive and negatie predictive value. m: confusion matrix """ tn, fp, fn, tp = m.values.flatten() ppv = percent(tp, fp) npv = percent(tn, fn) return ppv, npv # Compute predictive value for all defendants. # In[10]: ppv, npv = predictive_value(matrix_all) ppv, npv # Compute prevalence. # In[11]: def prevalence(df): """Compute prevalence. 
m: confusion matrix """ tn, fp, fn, tp = df.values.flatten() prevalence = percent(tp+fn, tn+fp) return prevalence # Compute prevalence for all defendants. # In[12]: prev = prevalence(matrix_all) prev # Compute all metrics and put them in a DataFrame. # In[13]: def compute_metrics(m, name=''): """Compute all metrics. m: confusion matrix returns: DataFrame """ fpr, fnr = error_rates(m) ppv, npv = predictive_value(m) sens, spec = sens_spec(m) prev = prevalence(m) index = ['FP rate', 'FN rate', 'PPV', 'NPV', 'Sensitivity', 'Specificity', 'Prevalence'] df = pd.DataFrame(index=index, columns=['Percent']) df.Percent = fpr, fnr, ppv, npv, sens, spec, prev df.index.name = name return df # Compute metrics for all defendants. # In[14]: compute_metrics(matrix_all, 'All defendants') # Make the confusion matrix for black defendants. # In[15]: a = [[990, 805], [532, 1369]] matrix_black = make_matrix(a) # Compute metrics for black defendants. # In[16]: black_metrics = compute_metrics(matrix_black, 'Black defendants') # Make the confusion matrix for white defendants. # In[17]: a = [[1139, 349], [461, 505]] matrix_white = make_matrix(a) # Compute metrics for white defendants. # In[18]: white_metrics = compute_metrics(matrix_white, 'White defendants') # So far, all results are consistent with those reported in the article, including the headline results: # # 1. The false positive rate for black defendants is substantially higher than for white defendants (45%, compared to 23%). # # 2. The false negative rate for black defendants is substantially lower (28%, compared to 48%). # # The false positive rate is the fraction of all non-recidivists who were not labeled low risk. # # The false negative rate is the fraction of all recidivists who were labeled low risk. # In[19]: error_rates(matrix_black) # In[20]: error_rates(matrix_white) # ### The constant predictive value model # # An ideal test should have equal predictive value in all groups; that is, two people in the same risk category should have the same probability of recidivism, regardless of what group they are in. # # An ideal test should also have the same error rates for all groups; that is, two non-recidivists should have the same probability of being classified as high risk. # # Unfortunately, these two goals are in conflict: # # * If you design a test to achieve equal predictive value across groups with different prevalence, you will find that error rates depend on prevalence. Specifically, false positive rates will be higher in groups with higher rates of recidivism. # # * If you design a test to achieve equal error rates across groups, you will find that predictive value depends on prevalence. Specifically, positive predictive value will be lower in groups with lower rates of recidivism. # # The next two sections demonstrate these effects. # # A confusion matrix contains four values, but because they are constrained to add up to a fixed total, it only takes three values to determine a confusion matrix. # # For example, if you specify prevalence, PPV, and NPV, that determines a confusion matrix, and then you can compute the error rates. # # Or, if you specify prevalence, FPR, and FNR, that determines a confusion matrix, and then you can compute predictive values. # # The following function takes prevalence, PPV, and NPV and returns a confusion matrix. # In[21]: def constant_predictive_value(prev, ppv, npv): """Make a confusion matrix with given metrics. 
prev: prevalence ppv: positive predictive value npv: negative predictive value returns: confusion matrix """ tn, fp, fn, tp = symbols('tn fp fn tp') eq1 = Eq(percent(tp+fn, tn+fp), prev) eq2 = Eq(percent(tp, fp), ppv) eq3 = Eq(percent(tn, fn), npv) eq4 = Eq(tn+fp+fn+tp, 1) soln = solve([eq1, eq2, eq3, eq4], [tn, fp, fn, tp]) a = list(soln.values()) return make_matrix(a) # To test it, I'll construct a confusion matrix with the actual metrics from all defendants. # In[22]: ppv, npv = predictive_value(matrix_all) prev = prevalence(matrix_all) m = constant_predictive_value(prev, ppv, npv) # If we use it to compute the other metrics, they are consistent with the results we got with the original data. # In[23]: compute_metrics(m) # We can use this function to run the "constant predictive value" model, which asks what happens if we keep PPV and NPV constant, and vary prevalence. # In[80]: ppv, npv = predictive_value(matrix_all) prevalences = np.linspace(32, 60, 31) fp_rates = pd.Series(index=prevalences) fn_rates = pd.Series(index=prevalences) for prev in prevalences: df = constant_predictive_value(prev, ppv, npv) fpr, fnr = error_rates(df) fp_rates[prev] = fpr fn_rates[prev] = fnr # The following figure shows the error rates we would expect from a test with equal predictive value for all groups, regardless of prevalence. # In[25]: def plot_cpv_model(): fp_rates.plot(label='false positive rate', color='C1') fn_rates.plot(label='false negative rate', color='C4') decorate(xlabel='Prevalence', ylabel='Percent', title='Expected error rates, constant predictive value', loc='upper center') plot_cpv_model() # As prevalence increases, false positive rates increase quickly. Note the vertical scale: the difference in error rates between a low-prevalence group and a high-prevalence group is dramatic! # # For the COMPAS test, the effect is not as extreme. The following figure shows the constant predictive value model again, including data points for the white defendants (left), all defendants (middle), and black defendants (right). # In[26]: def plot_fpr_fnr(m): """Plot error rates versus prevalence. m: confusion matrix """ prev = prevalence(m) fpr, fnr = error_rates(m) plt.plot(prev, fpr, 'o', color='C1') plt.plot(prev, fnr, 'o', color='C4') # In[27]: plot_fpr_fnr(matrix_all) plot_fpr_fnr(matrix_black) plot_fpr_fnr(matrix_white) plot_cpv_model() # For black defendants: # # * The actual false positive rate is lower than what we would expect if the test had the same predictive value for all groups. # # * The actual false negative rate is higher than expected. # # For white defendants: # # * The actual false positive rate is higher than what we would expect if the test had the same predictive value for all groups. # # * The actual false negative rate is lower than expected. # # Relative to the CPV model, the COMPAS test is what I will call "tempered", that is, less sensitive to variation in prevalence between groups. # # ### Constant error rate model # # In the previous section we held predictive value constant and computed the effect on error rates. In this section we'll go the other way: if we hold error rates constant for all groups, what effect does that have on predictive value? # # The following function takes prevalence and error rates and returns a confusion matrix. # # # In[24]: def constant_error_rates(prev, fpr, fnr): """Make a confusion matrix with given metrics. 
prev: prevalence fpr: false positive rate fnr: false negative rate returns: confusion matrix """ tn, fp, fn, tp = symbols('tn fp fn tp') eq1 = Eq(percent(tp+fn, tn+fp), prev) eq2 = Eq(percent(fp, tn), fpr) eq3 = Eq(percent(fn, tp), fnr) eq4 = Eq(tn+fp+fn+tp, 1) soln = solve([eq1, eq2, eq3, eq4], [tn, fp, fn, tp]) a = list(soln.values()) return make_matrix(a) # Again, just to test it, we can replicate the observed confusion matrix. # In[25]: fpr, fnr = error_rates(matrix_all) prev = prevalence(matrix_all) m = constant_error_rates(prev, fpr, fnr) # And it has the right metrics # In[26]: compute_metrics(m) # Now we can see how predictive value depends on prevalence (with error rates held constant). # In[27]: fpr, fnr = error_rates(matrix_all) prevalences = np.linspace(20, 70, 31) ppv_rates = pd.Series(index=prevalences) npv_rates = pd.Series(index=prevalences) for prev in prevalences: df = constant_error_rates(prev, fpr, fnr) ppv, npv = predictive_value(df) ppv_rates[prev] = ppv npv_rates[prev] = npv # The following function plots the results. # In[28]: def plot_cer_model(): ppv_rates.plot(label='Positive predictive value', color='C0') npv_rates.plot(label='Negative predictive value', color='C2') decorate(xlabel='Prevalence', ylabel='Percent', title='Expected predictive value, constant error rates', loc='upper center') plot_cer_model() # As prevalence increases, so does positive predictive value. # # For the COMPAS test, the effect is not as extreme. The following figure shows the constant error rate again, including data points for the white defendants (left), all defendants (middle), and black defendants (right). # In[29]: def plot_ppv_npv(m): """Plot predictive values versus prevalence. m: confusion matrix """ prev = prevalence(m) ppv, npv = predictive_value(m) plt.plot(prev, ppv, 'o', color='C0') plt.plot(prev, npv, 'o', color='C2') # In[30]: plot_ppv_npv(matrix_all) plot_ppv_npv(matrix_black) plot_ppv_npv(matrix_white) plot_cer_model() # Again, the test is less sensitive to differences in prevalence between groups than we would expect from the constant error rate model. # ### More data, more details # # In this section I read the detailed dataset available from [this repository](https://github.com/propublica/compas-analysis) and run validation checks. # In[31]: # Uncomment and run this cell once to download the data. # Then comment it again so you don't download it every time you run the notebook. # !wget 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv' # In[32]: cp = pd.read_csv("compas-scores-two-years.csv") cp.head() # In[33]: cp.shape # In[34]: for col in cp.columns: print(col) # The following functions compute value counts and percentages for various variables. # In[35]: def make_dataframe(series, *columns): """Make a Series into a DataFrame with one column. So it looks better in Jupyter. series: Series columns: column name(s) returns: DataFrame """ df = pd.DataFrame(series.values, index=series.index, columns=columns) df.index.name = series.name return df def counts(df, var): """Compute counts for each unique value. df: DataFrame var: variable name """ series = df[var].value_counts() return make_dataframe(series, 'Count') def percentages(df, var): """Compute percentages for each unique value. 
df: DataFrame var: variable name """ series = df[var].value_counts() / len(df) * 100 return make_dataframe(series, 'Percentage') # Breakdown by age # In[36]: counts(cp, 'age_cat') # In[37]: percentages(cp, 'age_cat') # Breakdown by race # In[38]: counts(cp, 'race') # In[39]: percentages(cp, 'race') # Breakdown by sex # In[40]: counts(cp, 'sex') # In[41]: percentages(cp, 'sex') # Breakdown by recidivism # In[42]: counts(cp, 'two_year_recid') # In[43]: percentages(cp, 'two_year_recid') # Breakdown by risk category # In[44]: counts(cp, 'score_text') # In[45]: percentages(cp, 'score_text') # The following function computes cross-tabulations. # In[46]: def crosstab(df, index, columns): """Compute a cross-tabulation. df: DataFrame index: variable(s) that will label the rows columns: variable(s) that will label the columns returns: DataFrame """ xtab = df.pivot_table(index=index, columns=columns, values='id', aggfunc='count') return xtab # Breakdown by sex and race # In[47]: xtab = crosstab(cp, 'sex', 'race') # Breakdown by age and race # In[48]: xtab = crosstab(cp, 'age_cat', 'race') # Breakdown by age and sex # In[49]: xtab = crosstab(cp, 'age_cat', 'sex') # Distribution of decile scores for black defendants. # In[50]: cp.loc[cp.race=='African-American', 'decile_score'].hist() decorate(xlabel='Decile Score', ylabel='Count', title='Black defendant decile scores', ylim=[0, 700]) # Distribution of decile scores for white defendants. # In[51]: cp.loc[cp.race=='Caucasian', 'decile_score'].hist() decorate(xlabel='Decile Score', ylabel='Count', title='White defendant decile scores', ylim=[0, 700]) # Cross tabulation of decile score and race. # In[52]: crosstab(cp, 'decile_score', 'race') # Cross tabulation of decile score and age group. # In[53]: crosstab(cp, 'decile_score', 'age_cat') # Here's the confusion matrix with all three score categories. # In[54]: crosstab(cp, 'two_year_recid', 'score_text') # To make sure I've got the data right, I'll reproduce the confusion matrices from the article. # In[55]: def compute_matrix(df): """Compute a confusion matrix from data. df: DataFrame returns: confusion matrix """ high = df.score_text.isin(['Medium', 'High']).astype(int) return crosstab(df, 'two_year_recid', high) # All defendants. # In[56]: compute_matrix(cp) # And the differences are 0. # In[57]: matrix_all - compute_matrix(cp).values # Black defendants. # In[58]: black = cp[cp.race=='African-American'] compute_matrix(black) # In[59]: matrix_black - compute_matrix(black).values # White defendants. # In[60]: white = cp[cp.race=='Caucasian'] compute_matrix(white) # In[61]: matrix_white - compute_matrix(white).values # ### Calibration # # To check for calibration, I group defendants by decile score and compute prevalence (recidivism rate) in each group. # # This analysis does not take observation time into account, unlike the analysis in the original article. # # The following function groups defendants by decile score and computes prevalence in each group. # In[62]: def calibration_curve(df): """Compute probability of recidivism by decile score. df: DataFrame returns: Series """ return df.groupby('decile_score').two_year_recid.mean() # After a short aside on the uncertainty in each point of the curve, the following figure shows this calibration curve for all defendants and broken down by race. 
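# Before plotting, a short aside: the sketch below is not part of the original
# analysis. It attaches a binomial standard error to each point of the
# calibration curve, which is one way to judge whether apparent differences
# between curves are larger than the noise (a question that comes up below for
# black and white defendants at scores 3 and 4). It assumes the `cp` DataFrame
# and the column names used above; the helper name `calibration_curve_with_error` is mine.

# In[ ]:

def calibration_curve_with_error(df):
    """Compute the recidivism rate and a binomial standard error per decile score.

    df: DataFrame with `decile_score` and `two_year_recid`

    returns: DataFrame with columns `prob_recid`, `std_err`, and `count`
    """
    grouped = df.groupby('decile_score').two_year_recid
    p = grouped.mean()
    n = grouped.count()
    # standard error of a proportion: sqrt(p * (1-p) / n)
    std_err = np.sqrt(p * (1 - p) / n)
    return pd.DataFrame({'prob_recid': p, 'std_err': std_err, 'count': n})

# Applying the same function to a subgroup shows which points of that group's
# curve rest on only a handful of defendants.
calibration_curve_with_error(cp)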
# In[63]: def plot_calibration(df, group_vars, title=''): """Plot calibration curves for all defendants and for each group. df: DataFrame group_vars: variable(s) to group by title: string """ calibration_curve(df).plot(linestyle='dotted', label='All defendants', color='gray') for name, group in df.groupby(group_vars): if len(group) > 1000: calibration_curve(group).plot(label=name) decorate(xlabel='Decile score', ylabel='Prob recidivism', title=title) # In[64]: plot_calibration(cp, 'race') # The test is well calibrated. People with higher scores have higher probabilities of recidivism. In fact, we could use this curve to transform COMPAS scores into probabilities. # # The test is about equally calibrated for black and white defendants, although black defendants with scores 3 and 4 are more likely to recidivate than white defendants with the same scores (that apparent difference might not be statistically significant). # # Here's the breakdown by age group. # In[66]: plot_calibration(cp, 'age_cat') # The test is about equally calibrated for all age groups, which means that people with the same score have about the same probability of recidivism, regardless of what group they are in. # # There are only 4 people in the "Less than 25" group with decile score 1, which is why that data point is so out of line. # # Here's the breakdown by sex. # In[67]: plot_calibration(cp, 'sex') # Here's the first case where the test does not seem well calibrated. At all decile scores, female defendants are substantially less likely to recidivate than male defendants. # # Or, reading this graph the other way, female defendants are given decile scores 1-2 points higher than male defendants with the same actual risk of recidivism. # ### Comparing reality to the CPV model # # In this section I'll compare actual PPV and FPR for a variety of subgroups to the values we would expect based on the CPV model; that is, a model where the predictive values are the same for all groups. # # The following function groups defendants by `group_vars` and returns a table with one row for each group. # In[68]: def make_table(df, group_vars, expected_ppv, expected_npv): """Make a table with one line per group. df: DataFrame group_vars: string or list of string variable names expected_ppv: positive predictive value for the CPV model expected_npv: negative predictive value for the CPV model returns: table """ # make the DataFrame columns = ['count', 'prevalence', 'actual PPV', 'actual NPV', 'actual FNR', 'actual FPR', 'expected FPR', 'difference'] columns = group_vars + columns table = pd.DataFrame(columns=columns) # loop through the groups grouped = df.groupby(group_vars) for i, (name, group) in enumerate(grouped): if not isinstance(name, tuple): name = name, # size of group count = len(group) # compute metrics matrix = compute_matrix(group) prev = prevalence(matrix) actual_ppv, actual_npv = predictive_value(matrix) actual_fpr, actual_fnr = error_rates(matrix) # generate the CPV matrix cpv = constant_predictive_value(prev, expected_ppv, expected_npv) # get the expected error rates expected_fpr, _ = error_rates(cpv * 100) # for very low and high prevalences, it might # not be possible to achieve the given predictive values if expected_fpr < 0: expected_fpr = 0 if expected_fpr > 100: expected_fpr = 100 # difference between actual and expected diff = actual_fpr - expected_fpr # add a row to the table row = name + (count, prev, actual_ppv, actual_npv, actual_fnr, actual_fpr, expected_fpr, diff) table.loc[i] = row # sort the table by prevalence table.sort_values(by='prevalence', inplace=True) return table # In[69]: subset = cp[cp.race.isin(['African-American', 'Caucasian'])] subset.shape # Here's the breakdown by age category, after a short aside on a closed form for the expected FPR. 
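# A short aside before the subgroup tables: the expected FPR in the CPV model,
# which `make_table` obtains from the sympy solver, also has a simple closed
# form. The sketch below is not part of the original analysis and the helper
# name `expected_fpr_closed_form` is mine; it works with metrics expressed as
# percentages, like the rest of the notebook, and can serve as a spot check of
# the `expected FPR` column.

# In[ ]:

def expected_fpr_closed_form(prev, ppv, npv):
    """Expected FPR (percent) for a test with the given PPV and NPV at a given prevalence.

    prev, ppv, npv: percentages

    returns: FPR in percent
    """
    # work with fractions of the whole group
    prev, ppv, npv = prev / 100, ppv / 100, npv / 100
    # fraction of the group labeled high risk
    pos = (prev + npv - 1) / (ppv + npv - 1)
    # false positives and true negatives as fractions of the whole group
    fp = (1 - ppv) * pos
    tn = npv * (1 - pos)
    # like the sympy version, this can leave [0, 100] for extreme prevalences
    return fp / (fp + tn) * 100

# Spot check: at the overall prevalence, this reproduces the FPR for all defendants.
ppv_all, npv_all = predictive_value(matrix_all)
expected_fpr_closed_form(prevalence(matrix_all), ppv_all, npv_all), error_rates(matrix_all)[0]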
# In[70]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat'] table1 = make_table(subset, group_vars, ppv, npv) # Again, the actual behavior of the test is tempered, compare to the CPV model; that is, the results are less extreme than the model expects. # # Here's the breakdown by race. # In[71]: ppv, npv = predictive_value(matrix_all) group_vars = ['race'] table2 = make_table(subset, group_vars, ppv, npv) # The false positive rate for whites is higher than we would expect if predictive value were the same for all groups. # # The false positive rate for blacks is lower than we would expect. # # Here's the breakdown by sex. # In[72]: ppv, npv = predictive_value(matrix_all) group_vars = ['sex'] table3 = make_table(subset, group_vars, ppv, npv) # The false positive rate for women is substantially higher than what we would expect in the CPV model, which is consistent with the calibration results in the previous section. # # Here's the breakdown by age and race. # In[73]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat', 'race'] table4 = make_table(subset, group_vars, ppv, npv) # Breakdown by age and sex. # In[74]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat', 'sex'] table5 = make_table(subset, group_vars, ppv, npv) # Breakdown by race and sex. # In[75]: ppv, npv = predictive_value(matrix_all) group_vars = ['race', 'sex'] table6 = make_table(subset, group_vars, ppv, npv) # Breakdown by age, race, and sex. # In[76]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat', 'race', 'sex'] table7 = make_table(subset, group_vars, ppv, npv) # Those are all the possible subgroups for these three variables. # # Now we can see what the results look like. # In[77]: tables = [table1, table2, table3, table4, table5, table6, table7]; # The following function plots one data point per subgroup showing the given metric versus prevalence. # # Groups with a small number of people are shown with lighter colors. # # In[78]: def plot_table_var(table, var, color): """Plot one data point per row. table: DataFrame var: which metric to plot color: string """ for _, row in table.iterrows(): alpha = 0.8 if row['count'] > 200 else 0.3 plt.plot(row['prevalence'], row[var], 'o', color=color, alpha=alpha) # Here's what the results look like for FPR. # In[81]: fp_rates.plot(label='Expected FPR, constant CPV', color='C1') plt.axhline(fpr, linestyle='dotted', label='Expected FPR, constant FPR', color='gray') for table in tables: plot_table_var(table, 'actual FPR', 'C1') decorate(xlabel='Prevalence', ylabel='Percent', title='False positive rates by subgroup') # In general, groups with higher prevalence have higher false positive rates, but the effect is less extreme than what we would expect from the CPV model. # # Here are the results for positive predictive value. # In[82]: ppv_rates.plot(label='Expected PPV, constant FPR', color='C0') plt.axhline(ppv, linestyle='dotted', label='Expected PPV, constant PPV', color='gray') for table in tables: plot_table_var(table, 'actual PPV', 'C0') decorate(xlabel='Prevalence', ylabel='Rate', title='Positive predictive value by subgroup') # Groups with higher prevalence have higher PPV, but the effect is less extreme than we would expect from the CPV model. # # Here are the results for false negative rate. 
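# Before the false negative rates, here is a rough way to quantify "tempered"
# (a sketch, not part of the original analysis): fit a straight line to the
# actual subgroup false positive rates as a function of prevalence and compare
# its slope to the slope of the CPV curve. Pooling the subgroup tables counts
# some defendants more than once, so this is only a rough comparison; the
# helper name `fit_slope` is mine.

# In[ ]:

def fit_slope(xs, ys):
    """Fit a straight line by least squares and return its slope."""
    slope, _ = np.polyfit(np.asarray(xs, dtype=float), np.asarray(ys, dtype=float), 1)
    return slope

# slope of the CPV model curve (percentage points of FPR per point of prevalence)
cpv_slope = fit_slope(fp_rates.index, fp_rates.values)

# slope of the actual subgroup results, pooling all the tables
pooled = pd.concat(tables, sort=False)
actual_slope = fit_slope(pooled['prevalence'], pooled['actual FPR'])

cpv_slope, actual_slope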
# In[83]: fn_rates.plot(label='Expected FNR, constant CPV', color='C4') plt.axhline(fnr, linestyle='dotted', label='Expected FNR, constant FNR', color='gray') for table in tables: plot_table_var(table, 'actual FNR', 'C4') decorate(xlabel='Prevalence', ylabel='Percent', title='False negative rates by subgroup') # Groups with higher prevalence have lower FNR, but the effect is less extreme than we would expect from the CPV model. # # Here are the results for negative predictive value. # In[84]: npv_rates.plot(label='Expected NPV, constant FPR', color='C2') plt.axhline(npv, linestyle='dotted', label='Expected NPV, constant NPV', color='gray') for table in tables: plot_table_var(table, 'actual NPV', 'C2') decorate(xlabel='Prevalence', ylabel='Percent', title='Negative predictive value by subgroup') # Groups with higher prevalence have lower NPV. In this case, the effect is almost exactly what we would expect from the CPV model. # ### Individual FPR # In[85]: from scipy.interpolate import interp1d def crossing(series, value, **options): """Find where a function crosses a value. series: Series value: number options: passed to interp1d (default is linear interp) returns: number """ interp = interp1d(series.values, series.index, **options) return interp(value) def interpolate(series, value, **options): """Evaluate a function at a value. series: Series value: number options: passed to interp1d (default is linear interp) returns: number """ interp = interp1d(series.index, series.values, **options) return interp(value) # In[86]: cal_all = calibration_curve(cp) cal_all.plot() decorate(ylabel='Probability of recidivism') # In[87]: crossing(cal_all, 0.4) # In[88]: crossing(cal_all, 0.7) # In[89]: interpolate(cal_all, 3.4) # In[90]: interpolate(cal_all, 9) # In[91]: def make_error_dist(std_dev): """Make a discrete Gaussian distribution. std_dev: standard deviation returns: Series that maps errors to probabilities """ errors = np.linspace(-3, 3, 21) prob_error = np.exp(-(errors/std_dev)**2) prob_error /= np.sum(prob_error) error_dist = pd.Series(prob_error, index=errors) return error_dist # In[92]: error_dist = make_error_dist(std_dev=2) error_dist.plot(label='') decorate(xlabel='Error (score)', ylabel='Probability') # In[93]: def individual_fpr(actual_prob_recid, cal, thresh, std_dev): """Compute an individual FPR. actual_prob_recid: actual probability of recidivism cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: individual FPR """ # look up actual_prob_recid to get correct score correct_score = crossing(cal, actual_prob_recid) # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # hypothetical score score = correct_score+error score = max(score, 1) score = min(score, 10) # probability of being classified 'not low' | error prob_positive = 0 if score < thresh else 1 # probability of being a false positive | error prob_fp = prob_positive * (1-actual_prob_recid) total_prob += prob_error * prob_fp return total_prob # In[94]: individual_fpr(0.3, cal_all, 4.5, 2) # In[95]: individual_fpr(0.5, cal_all, 4.5, 2) # In[96]: individual_fpr(0.7, cal_all, 4.5, 2) # In[97]: def compute_fpr_vs_prob_recid(cal, thresh, std_dev): """Computes FPR as a function of probability of recidivism. 
cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: Series """ prob_recid_array = np.linspace(min(cal), max(cal), 21) prob_fpr_series = pd.Series(index=prob_recid_array) for prob_recid in prob_recid_array: fpr = individual_fpr(prob_recid, cal, thresh, std_dev) prob_fpr_series[prob_recid] = fpr return prob_fpr_series # In[98]: s = compute_fpr_vs_prob_recid(cal_all, thresh=4.5, std_dev=2) s.plot(label='FPR, std_dev=2') s = compute_fpr_vs_prob_recid(cal_all, thresh=4.5, std_dev=1) s.plot(label='FPR, std_dev=1') decorate(xlabel='Actual probability of recidivism', ylabel='Probability of false positive') # In[99]: def individual_fpr_given_score(actual_score, cal, thresh, std_dev): """Compute an individual FPR. actual_score: score assigned cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function returns: individual FPR """ # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # correct score correct_score = actual_score-error correct_score = max(correct_score, 1) correct_score = min(correct_score, 10) # map from correct score to probability of recidivism. # if calibration curves are different for different # groups, this one should be group specific. correct_prob_recid = interpolate(cal, correct_score) cond_ifpr = individual_fpr(correct_prob_recid, cal, thresh, std_dev) total_prob += prob_error * cond_ifpr return total_prob # In[100]: individual_fpr_given_score(6, cal_all, thresh=4.5, std_dev=2) # In[131]: def make_ifpr_series(cal, thresh, std_dev): scores = np.arange(1, 11) t = [individual_fpr_given_score(score, cal, thresh, std_dev) for score in scores] ifpr_series = pd.Series(t, scores) return ifpr_series # In[132]: thresh = 4.5 s = make_ifpr_series(cal_all, thresh, std_dev=2) s.plot(label='FPR, std_dev=2') s = make_ifpr_series(cal_all, thresh, std_dev=1) s.plot(label='FPR, std_dev=1') decorate(xlabel='Score', ylabel='Probability of false positive') # In[133]: def assign_individual_fpr(df, cal, thresh, std_dev): """Assign individual FPRs to defendants. 
df: DataFrame cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function """ # compute the map from score to FPR ifpr_series = make_ifpr_series(cal, thresh, std_dev) # assign FPR to each defendant df['ifpr'] = [ifpr_series[score] for score in df.decile_score] # In[134]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=2) # In[135]: def make_cdf(series): """Make a CDF.""" counts = series.value_counts().sort_index() counts /= counts.sum() return counts.cumsum() def plot_cdf(cdf, **options): """Plot a CDF as a step function.""" plt.step(cdf.index, cdf.values, where='post', **options) # In[136]: cdf_ifpr = make_cdf(cp.ifpr) # In[137]: plot_cdf(cdf_ifpr, label='All') decorate(xlabel='Individual probability of false positive', ylabel='CDF', ylim=[0,1]) # In[138]: black = cp[cp.race=='African-American'] white = cp[cp.race=='Caucasian'] thresh = 4.5 std_dev = 2 cal_black = calibration_curve(black) s = make_ifpr_series(cal_black, thresh, std_dev) s.plot(label='FPR, black') cal_white = calibration_curve(white) s = make_ifpr_series(cal_white, thresh, std_dev) s.plot(label='FPR, white') decorate(xlabel='Score', ylabel='Probability of false positive') # In[139]: male = cp[cp.sex=='Male'] female = cp[cp.sex=='Female'] thresh = 4.5 std_dev = 2 cal_male = calibration_curve(male) s = make_ifpr_series(cal_male, thresh, std_dev) s.plot(label='FPR, male') cal_female = calibration_curve(female) s = make_ifpr_series(cal_female, thresh, std_dev) s.plot(label='FPR, female') decorate(xlabel='Score', ylabel='Probability of false positive') # In[176]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=2) cp.ifpr.mean() # In[177]: cp.groupby('race').ifpr.mean() # In[178]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=1) cp.ifpr.mean() # In[179]: cp.groupby('race').ifpr.mean() # In[180]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=0.01) cp.ifpr.mean() # In[181]: cp.groupby('race').ifpr.mean() # ### Individual FNR # In[141]: cal_all = calibration_curve(cp) cal_all.plot() decorate(ylabel='Probability of recidivism') # In[142]: def individual_fnr(actual_prob_recid, cal, thresh, std_dev): """Compute an individual FNR. actual_prob_recid: actual probability of recidivism cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: individual FNR """ # look up actual_prob_recid to get correct score correct_score = crossing(cal, actual_prob_recid) # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # hypothetical score score = correct_score+error score = max(score, 1) score = min(score, 10) # probability of being classified 'low' | error prob_negative = 0 if score >= thresh else 1 # probability of being a false negative | error prob_fp = prob_negative * actual_prob_recid total_prob += prob_error * prob_fp return total_prob # In[143]: individual_fnr(0.3, cal_all, 4.5, 2) # In[144]: individual_fnr(0.5, cal_all, 4.5, 2) # In[145]: individual_fnr(0.7, cal_all, 4.5, 2) # In[146]: def compute_fnr_vs_prob_recid(cal, thresh, std_dev): """Computes FNR as a function of probability of recidivism. 
cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: Series """ prob_recid_array = np.linspace(min(cal), max(cal), 21) prob_fnr_series = pd.Series(index=prob_recid_array) for prob_recid in prob_recid_array: fnr = individual_fnr(prob_recid, cal, thresh, std_dev) prob_fnr_series[prob_recid] = fnr return prob_fnr_series # In[147]: s = compute_fnr_vs_prob_recid(cal_all, thresh=4.5, std_dev=2) s.plot(label='FNR, std_dev=2') s = compute_fnr_vs_prob_recid(cal_all, thresh=4.5, std_dev=1) s.plot(label='FNR, std_dev=1') decorate(xlabel='Actual probability of recidivism', ylabel='Probability of false negative') # In[148]: def individual_fnr_given_score(actual_score, cal, thresh, std_dev): """Compute an individual FNR. actual_score: score assigned cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function returns: individual FNR """ # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # correct score correct_score = actual_score-error correct_score = max(correct_score, 1) correct_score = min(correct_score, 10) # map from correct score to probability of recidivism. # if calibration curves are different for different # groups, this one should be group specific. correct_prob_recid = interpolate(cal, correct_score) cond_ifnr = individual_fnr(correct_prob_recid, cal, thresh, std_dev) total_prob += prob_error * cond_ifnr return total_prob # In[149]: individual_fnr_given_score(6, cal_all, thresh=4.5, std_dev=2) # In[150]: def make_ifnr_series(cal, thresh, std_dev): scores = np.arange(1, 11) t = [individual_fnr_given_score(score, cal, thresh, std_dev) for score in scores] ifnr_series = pd.Series(t, scores) return ifnr_series # In[151]: thresh = 4.5 s = make_ifnr_series(cal_all, thresh, std_dev=2) s.plot(label='FNR, std_dev=2') s = make_ifnr_series(cal_all, thresh, std_dev=1) s.plot(label='FNR, std_dev=1') decorate(xlabel='Score', ylabel='Probability of false negative') # In[152]: def assign_individual_fnr(df, cal, thresh, std_dev): """Assign individual FNRs to defendants. 
df: DataFrame cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function """ # compute the map from score to FPR ifnr_series = make_ifnr_series(cal, thresh, std_dev) # assign FPR to each defendant df['ifnr'] = [ifnr_series[score] for score in df.decile_score] # In[153]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=2) # In[154]: cdf_ifnr = make_cdf(cp.ifnr) # In[155]: plot_cdf(cdf_ifnr, label='All') decorate(xlabel='Individual probability of false negative', ylabel='CDF', ylim=[0,1]) # In[159]: thresh = 4.5 std_dev = 2 cal_black = calibration_curve(black) s = make_ifnr_series(cal_black, thresh, std_dev) s.plot(label='FNR, black') cal_white = calibration_curve(white) s = make_ifnr_series(cal_white, thresh, std_dev) s.plot(label='FNR, white') decorate(xlabel='Score', ylabel='Probability of false negative') # In[160]: thresh = 4.5 std_dev = 2 cal_male = calibration_curve(male) s = make_ifnr_series(cal_male, thresh, std_dev) s.plot(label='FNR, male') cal_female = calibration_curve(female) s = make_ifnr_series(cal_female, thresh, std_dev) s.plot(label='FNR, female') decorate(xlabel='Score', ylabel='Probability of false negative') # In[182]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=2) cp.ifnr.mean() # In[183]: cp.groupby('race').ifnr.mean() # In[184]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=1) cp.ifnr.mean() # In[185]: cp.groupby('race').ifnr.mean() # In[186]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=0.01) cp.ifnr.mean() # In[187]: cp.groupby('race').ifnr.mean() # In[ ]: # In[ ]: # In[ ]: # ### What would it take? # # In this section I explore what it would take to make a test with the same false positive rate for all groups. 
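# The key quantity in this section is the false positive rate as a function of
# the score threshold. As a minimal sketch (not part of the original analysis;
# the helper name `fpr_direct` is mine), it can be computed straight from the
# definition: among the defendants in a group who did not recidivate, what
# fraction have a decile score at or above the threshold? The cells below
# compute the same thing with the crosstab machinery and then search for the
# threshold that hits a target FPR in each group.

# In[ ]:

def fpr_direct(df, thresh):
    """False positive rate (percent) at a given score threshold.

    df: DataFrame
    thresh: decile score at or above which a defendant counts as high risk

    returns: percent of non-recidivists labeled high risk
    """
    non_recid = df[df.two_year_recid == 0]
    return (non_recid.decile_score >= thresh).mean() * 100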
# In[ ]: def fpr_thresh(df, thresh): df = df.copy() df['high'] = df.decile_score >= thresh matrix_all = crosstab(df, 'two_year_recid', 'high') fpr, fnr = error_rates(matrix_all) return fpr # # In[ ]: fpr_thresh(cp, 5) # # In[ ]: fpr_thresh(black, 5) # # In[ ]: fpr_thresh(white, 5) # # In[ ]: def sweep_thresh(df): threshes = range(2,10) sweep = pd.Series(index=threshes) for thresh in threshes: sweep[thresh] = fpr_thresh(df, thresh) return sweep # # In[ ]: plt.axhline(32.25, color='gray') sweep_thresh(cp).plot(label='All') sweep_thresh(black).plot(label='Black') sweep_thresh(white).plot(label='White') decorate(xlabel='Threshold', ylabel='False positive rate') # # In[ ]: # # In[ ]: def find_threshold(group, fpr): series = sweep_thresh(group) xs = crossing(series.dropna(), fpr) return xs # # In[ ]: all_thresh = find_threshold(cp, 32.35) # # In[ ]: black_thresh = find_threshold(black, 32.35) # # In[ ]: white_thresh = find_threshold(white, 32.35) # # In[ ]: # # In[ ]: interpolate(calibration_curve(cp), all_thresh) # # In[ ]: interpolate(calibration_curve(cp), black_thresh) # # In[ ]: interpolate(calibration_curve(black), black_thresh) # # In[ ]: interpolate(calibration_curve(cp), white_thresh) # # In[ ]: interpolate(calibration_curve(white), white_thresh) # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: black_male = black[black.sex=='Male'] black_male.shape # # In[ ]: black_female = black[black.sex=='Female'] black_female.shape # # In[ ]: old_black_female = black_female[black_female.age_cat=='Greater than 45'] old_black_female.shape # # In[ ]: old_white_female = cp[(cp.age_cat=='Greater than 45') & (cp.sex=='Female') & (cp.race=='Caucasian')] old_white_female.shape # # In[ ]: young_black_male = cp[(cp.age_cat=='Less than 25') & (cp.sex=='Male') & (cp.race=='African-American')] young_black_male.shape # # In[ ]: fpr_thresh(cp, 5) # # In[ ]: fpr_thresh(black, 5) # # In[ ]: fpr_thresh(black_female, 5) # # In[ ]: fpr_thresh(old_black_female, 5) # # In[ ]: fpr_thresh(black_male, 5) # # In[ ]: fpr_thresh(young_black_male, 5) # # In[ ]: plt.axhline(32.25, color='gray') sweep_thresh(black).plot(label='Black', color='gray') sweep_thresh(black_male).plot(label='Black male') sweep_thresh(young_black_male).plot(label='Young black male') decorate(xlabel='Threshold', ylabel='False positive rate') # # In[ ]: plt.axhline(32.25, color='gray') sweep_thresh(black).plot(label='Black', color='gray') sweep_thresh(black_female).plot(label='Black female') sweep_thresh(old_black_female).plot(label='Old black female') decorate(xlabel='Threshold', ylabel='False positive rate') # # In[ ]: ybm_thresh = find_threshold(young_black_male, 32.35) # # In[ ]: obf_thresh = find_threshold(old_black_female, 32.35) # # In[ ]: interpolate(calibration_curve(cp), ybm_thresh) # # In[ ]: interpolate(calibration_curve(cp), obf_thresh) # # In[ ]:
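# As a summary of this section, the sketch below (not part of the original
# analysis; the helper name `threshold_summary` is mine) collects the
# group-specific thresholds needed to hit a common false positive rate, along
# with the probability of recidivism at that threshold implied by the overall
# calibration curve. The target FPR of 32.35 is the value used in the cells above.

# In[ ]:

def threshold_summary(groups, target_fpr=32.35):
    """Make a table of equal-FPR thresholds and the implied risk at each threshold.

    groups: dict mapping a group name to a DataFrame of defendants
    target_fpr: false positive rate (percent) that every group should match

    returns: DataFrame with one row per group
    """
    cal = calibration_curve(cp)
    rows = {}
    for name, group in groups.items():
        thresh = find_threshold(group, target_fpr)
        rows[name] = dict(threshold=float(thresh),
                          prob_recid_at_threshold=float(interpolate(cal, thresh)))
    return pd.DataFrame(rows).T

threshold_summary({'All defendants': cp, 'Black defendants': black, 'White defendants': white})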