#!/usr/bin/env python # coding: utf-8 # # Recidivism # # This notebook contains my analysis of data presented in # "[Machine Bias](https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing)", Julia Angwin, Jeff Larson, Surya Mattu and Lauren Kirchner, ProPublica, May 23, 2016. # # I would like to thank the authors of that article for making their data and analysis freely available. They are a model of open science. # # # Copyright 2018 Allen Downey # # The code and text of this notebook are under this license: [Creative Commons Attribution 4.0 International](https://creativecommons.org/licenses/by/4.0) # # The data are from [this repository](https://github.com/propublica/compas-analysis), which contains the data and analysis pipeline described on [this web page](https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm). # # The terms of use for the data [are here](https://www.propublica.org/datastore/terms). In compliance with those terms, I am not distributing the data, but there is a link below that downloads it directly. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('config', "InteractiveShell.ast_node_interactivity='last_expr_or_assign'") import pandas as pd import numpy as np import matplotlib.pyplot as plt from sympy import symbols, Eq, solve from overthink import decorate # ### Metrics # # In this section I start with the [data reported here](https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm) and replicate the analysis there, computing various metrics based on the [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix): prevalence, sensitivity, specificity, false positive rate, false negative rate, positive predictive value, and negative predictive value. # # The following function takes an array and returns a Pandas DataFrame: # In[2]: def make_matrix(a): """Make a confusion matrix from an array. a: array or list of lists returns: DataFrame """ a = np.asarray(a).reshape((2, 2)) index = ['Survived', 'Recidivated'] columns = ['Low', 'High'] return pd.DataFrame(a, index=index, columns=columns) # Make the matrix for all defendants. # In[3]: a = [[2681, 1282], [1216, 2035]] matrix_all = make_matrix(a) # Confirm that the total matches what's reported in the article. # In[4]: np.sum(a) # Compute sensitivity and specificity. # In[5]: def percent(x, y): """Compute the percentage `x/(x+y)`. """ return x / (x+y) * 100 def sens_spec(m): """Compute sensitivity and specificity. m: confusion matrix """ tn, fp, fn, tp = m.values.flatten() sens = percent(tp, fn) spec = percent(tn, fp) return sens, spec # Compute sensitivity and specificity for all defendants. # In[6]: sens, spec = sens_spec(matrix_all) sens, spec # Compute error rates. # In[7]: def error_rates(m): """Compute false positive and false negative rate. m: confusion matrix """ tn, fp, fn, tp = m.values.flatten() fpr = percent(fp, tn) fnr = percent(fn, tp) return fpr, fnr # Compute error rates for all defendants. # In[8]: fpr, fnr = error_rates(matrix_all) fpr, fnr # Compute predictive value. # In[9]: def predictive_value(m): """Compute positive and negatie predictive value. m: confusion matrix """ tn, fp, fn, tp = m.values.flatten() ppv = percent(tp, fp) npv = percent(tn, fn) return ppv, npv # Compute predictive value for all defendants. # In[10]: ppv, npv = predictive_value(matrix_all) ppv, npv # Compute prevalence. # In[11]: def prevalence(df): """Compute prevalence. 
m: confusion matrix """ tn, fp, fn, tp = df.values.flatten() prevalence = percent(tp+fn, tn+fp) return prevalence # Compute prevalence for all defendants. # In[12]: prev = prevalence(matrix_all) prev # Compute all metrics and put them in a DataFrame. # In[13]: def compute_metrics(m, name=''): """Compute all metrics. m: confusion matrix returns: DataFrame """ fpr, fnr = error_rates(m) ppv, npv = predictive_value(m) sens, spec = sens_spec(m) prev = prevalence(m) index = ['FP rate', 'FN rate', 'PPV', 'NPV', 'Sensitivity', 'Specificity', 'Prevalence'] df = pd.DataFrame(index=index, columns=['Percent']) df.Percent = fpr, fnr, ppv, npv, sens, spec, prev df.index.name = name return df # Compute metrics for all defendants. # In[14]: compute_metrics(matrix_all, 'All defendants') # Make the confusion matrix for black defendants. # In[15]: a = [[990, 805], [532, 1369]] matrix_black = make_matrix(a) # Compute metrics for black defendants. # In[16]: black_metrics = compute_metrics(matrix_black, 'Black defendants') # Make the confusion matrix for white defendants. # In[17]: a = [[1139, 349], [461, 505]] matrix_white = make_matrix(a) # Compute metrics for white defendants. # In[18]: white_metrics = compute_metrics(matrix_white, 'White defendants') # So far, all results are consistent with those reported in the article, including the headline results: # # 1. The false positive rate for black defendants is substantially higher than for white defendants (45%, compared to 23%). # # 2. The false negative rate for black defendants is substantially lower (28%, compared to 48%). # # The false positive rate is the fraction of all non-recidivists who were not labeled low risk. # # The false negative rate is the fraction of all recidivists who were labeled low risk. # In[19]: error_rates(matrix_black) # In[20]: error_rates(matrix_white) # ### The constant predictive value model # # An ideal test should have equal predictive value in all groups; that is, two people in the same risk category should have the same probability of recidivism, regardless of what group they are in. # # An ideal test should also have the same error rates for all groups; that is, two non-recidivists should have the same probability of being classified as high risk. # # Unfortunately, these two goals are in conflict: # # * If you design a test to achieve equal predictive value across groups with different prevalence, you will find that error rates depend on prevalence. Specifically, false positive rates will be higher in groups with higher rates of recidivism. # # * If you design a test to achieve equal error rates across groups, you will find that predictive value depends on prevalence. Specifically, positive predictive value will be lower in groups with lower rates of recidivism. # # The next two sections demonstrate these effects. # # A confusion matrix contains four values, but because they are constrained to add up to a fixed total, it only takes three values to determine a confusion matrix. # # For example, if you specify prevalence, PPV, and NPV, that determines a confusion matrix, and then you can compute the error rates. # # Or, if you specify prevalence, FPR, and FNR, that determines a confusion matrix, and then you can compute predictive values. # # The following function takes prevalence, PPV, and NPV and returns a confusion matrix. # In[21]: def constant_predictive_value(prev, ppv, npv): """Make a confusion matrix with given metrics. 
prev: prevalence ppv: positive predictive value npv: negative predictive value returns: confusion matrix """ tn, fp, fn, tp = symbols('tn fp fn tp') eq1 = Eq(percent(tp+fn, tn+fp), prev) eq2 = Eq(percent(tp, fp), ppv) eq3 = Eq(percent(tn, fn), npv) eq4 = Eq(tn+fp+fn+tp, 1) soln = solve([eq1, eq2, eq3, eq4], [tn, fp, fn, tp]) a = list(soln.values()) return make_matrix(a) # To test it, I'll construct a confusion matrix with the actual metrics from all defendants. # In[22]: ppv, npv = predictive_value(matrix_all) prev = prevalence(matrix_all) m = constant_predictive_value(prev, ppv, npv) # If we use it to compute the other metrics, they are consistent with the results we got with the original data. # In[23]: compute_metrics(m) # We can use this function to run the "constant predictive value" model, which asks what happens if we keep PPV and NPV constant, and vary prevalence. # In[80]: ppv, npv = predictive_value(matrix_all) prevalences = np.linspace(32, 60, 31) fp_rates = pd.Series(index=prevalences) fn_rates = pd.Series(index=prevalences) for prev in prevalences: df = constant_predictive_value(prev, ppv, npv) fpr, fnr = error_rates(df) fp_rates[prev] = fpr fn_rates[prev] = fnr # The following figure shows the error rates we would expect from a test with equal predictive value for all groups, regardless of prevalence. # In[25]: def plot_cpv_model(): fp_rates.plot(label='false positive rate', color='C1') fn_rates.plot(label='false negative rate', color='C4') decorate(xlabel='Prevalence', ylabel='Percent', title='Expected error rates, constant predictive value', loc='upper center') plot_cpv_model() # As prevalence increases, false positive rates increase quickly. Note the vertical scale: the difference in error rates between a low-prevalence group and a high-prevalence group is dramatic! # # For the COMPAS test, the effect is not as extreme. The following figure shows the constant predictive value model again, including data points for the white defendants (left), all defendants (middle), and black defendants (right). # In[26]: def plot_fpr_fnr(m): """Plot error rates versus prevalence. m: confusion matrix """ prev = prevalence(m) fpr, fnr = error_rates(m) plt.plot(prev, fpr, 'o', color='C1') plt.plot(prev, fnr, 'o', color='C4') # In[27]: plot_fpr_fnr(matrix_all) plot_fpr_fnr(matrix_black) plot_fpr_fnr(matrix_white) plot_cpv_model() # For black defendants: # # * The actual false positive rate is lower than what we would expect if the test had the same predictive value for all groups. # # * The actual false negative rate is higher than expected. # # For white defendants: # # * The actual false positive rate is higher than what we would expect if the test had the same predictive value for all groups. # # * The actual false negative rate is lower than expected. # # Relative to the CPV model, the COMPAS test is what I will call "tempered", that is, less sensitive to variation in prevalence between groups. # # ### Constant error rate model # # In the previous section we held predictive value constant and computed the effect on error rates. In this section we'll go the other way: if we hold error rates constant for all groups, what effect does that have on predictive value? # # The following function takes prevalence and error rates and returns a confusion matrix. # # # In[24]: def constant_error_rates(prev, fpr, fnr): """Make a confusion matrix with given metrics. 
prev: prevalence fpr: false positive rate fnr: false negative rate returns: confusion matrix """ tn, fp, fn, tp = symbols('tn fp fn tp') eq1 = Eq(percent(tp+fn, tn+fp), prev) eq2 = Eq(percent(fp, tn), fpr) eq3 = Eq(percent(fn, tp), fnr) eq4 = Eq(tn+fp+fn+tp, 1) soln = solve([eq1, eq2, eq3, eq4], [tn, fp, fn, tp]) a = list(soln.values()) return make_matrix(a) # Again, just to test it, we can replicate the observed confusion matrix. # In[25]: fpr, fnr = error_rates(matrix_all) prev = prevalence(matrix_all) m = constant_error_rates(prev, fpr, fnr) # And it has the right metrics # In[26]: compute_metrics(m) # Now we can see how predictive value depends on prevalence (with error rates held constant). # In[27]: fpr, fnr = error_rates(matrix_all) prevalences = np.linspace(20, 70, 31) ppv_rates = pd.Series(index=prevalences) npv_rates = pd.Series(index=prevalences) for prev in prevalences: df = constant_error_rates(prev, fpr, fnr) ppv, npv = predictive_value(df) ppv_rates[prev] = ppv npv_rates[prev] = npv # The following function plots the results. # In[28]: def plot_cer_model(): ppv_rates.plot(label='Positive predictive value', color='C0') npv_rates.plot(label='Negative predictive value', color='C2') decorate(xlabel='Prevalence', ylabel='Percent', title='Expected predictive value, constant error rates', loc='upper center') plot_cer_model() # As prevalence increases, so does positive predictive value. # # For the COMPAS test, the effect is not as extreme. The following figure shows the constant error rate again, including data points for the white defendants (left), all defendants (middle), and black defendants (right). # In[29]: def plot_ppv_npv(m): """Plot predictive values versus prevalence. m: confusion matrix """ prev = prevalence(m) ppv, npv = predictive_value(m) plt.plot(prev, ppv, 'o', color='C0') plt.plot(prev, npv, 'o', color='C2') # In[30]: plot_ppv_npv(matrix_all) plot_ppv_npv(matrix_black) plot_ppv_npv(matrix_white) plot_cer_model() # Again, the test is less sensitive to differences in prevalence between groups than we would expect from the constant error rate model. # ### More data, more details # # In this section I read the detailed dataset available from [this repository](https://github.com/propublica/compas-analysis) and run validation checks. # In[31]: # Uncomment and run this cell once to download the data. # Then comment it again so you don't download it every time you run the notebook. # !wget 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv' # In[32]: cp = pd.read_csv("compas-scores-two-years.csv") cp.head() # In[33]: cp.shape # In[34]: for col in cp.columns: print(col) # The following functions compute value counts and percentages for various variables. # In[35]: def make_dataframe(series, *columns): """Make a Series into a DataFrame with one column. So it looks better in Jupyter. series: Series columns: column name(s) returns: DataFrame """ df = pd.DataFrame(series.values, index=series.index, columns=columns) df.index.name = series.name return df def counts(df, var): """Compute counts for each unique value. df: DataFrame var: variable name """ series = df[var].value_counts() return make_dataframe(series, 'Count') def percentages(df, var): """Compute percentages for each unique value. 
df: DataFrame var: variable name """ series = df[var].value_counts() / len(df) * 100 return make_dataframe(series, 'Percentage') # Breakdown by age # In[36]: counts(cp, 'age_cat') # In[37]: percentages(cp, 'age_cat') # Breakdown by race # In[38]: counts(cp, 'race') # In[39]: percentages(cp, 'race') # Breakdown by sex # In[40]: counts(cp, 'sex') # In[41]: percentages(cp, 'sex') # Breakdown by recidivism # In[42]: counts(cp, 'two_year_recid') # In[43]: percentages(cp, 'two_year_recid') # Breakdown by risk category # In[44]: counts(cp, 'score_text') # In[45]: percentages(cp, 'score_text') # The following function computes cross-tabulations. # In[46]: def crosstab(df, index, columns): """Compute a cross-tabulation. df: DataFrame index: variable(s) that will label the rows columns: variable(s) that will label the columns returns: DataFrame """ xtab = df.pivot_table(index=index, columns=columns, values='id', aggfunc='count') return xtab # Breakdown by sex and race # In[47]: xtab = crosstab(cp, 'sex', 'race') # Breakdown by age and race # In[48]: xtab = crosstab(cp, 'age_cat', 'race') # Breakdown by age and sex # In[49]: xtab = crosstab(cp, 'age_cat', 'sex') # Distribution of decile scores for black defendants. # In[50]: cp.loc[cp.race=='African-American', 'decile_score'].hist() decorate(xlabel='Decile Score', ylabel='Count', title='Black defendant decile scores', ylim=[0, 700]) # Distribution of decile scores for white defendants. # In[51]: cp.loc[cp.race=='Caucasian', 'decile_score'].hist() decorate(xlabel='Decile Score', ylabel='Count', title='White defendant decile scores', ylim=[0, 700]) # Cross tabulation of decile score and race. # In[52]: crosstab(cp, 'decile_score', 'race') # Cross tabulation of decile score and age group. # In[53]: crosstab(cp, 'decile_score', 'age_cat') # Here's the confusion matrix with all three score categories. # In[54]: crosstab(cp, 'two_year_recid', 'score_text') # To make sure I've got the data right, I'll reproduce the confusion matrices from the article. # In[55]: def compute_matrix(df): """Compute a confusion matrix from data. df: DataFrame returns: confusion matrix """ high = df.score_text.isin(['Medium', 'High']).astype(int) return crosstab(df, 'two_year_recid', high) # All defendants. # In[56]: compute_matrix(cp) # And the differences are 0. # In[57]: matrix_all - compute_matrix(cp).values # Black defendants. # In[58]: black = cp[cp.race=='African-American'] compute_matrix(black) # In[59]: matrix_black - compute_matrix(black).values # White defendants. # In[60]: white = cp[cp.race=='Caucasian'] compute_matrix(white) # In[61]: matrix_white - compute_matrix(white).values # ### Calibration # # To check for calibration, I group defendants by decile score and compute prevalence (recidivism rate) in each group. # # This analysis does not take observation time into account, unlike the analysis in the original article. # # The following function groups defendants by decile score and computes prevalence in each group. # In[62]: def calibration_curve(df): """Compute probability of recidivism by decile score. df: DataFrame returns: Series """ return df.groupby('decile_score').two_year_recid.mean() # After a short aside on the uncertainty in each point of the curve, the following figure shows this calibration curve for all defendants and broken down by race. 
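# Before plotting, a short aside: the sketch below is not part of the original
# analysis. It attaches a binomial standard error to each point of the
# calibration curve, which is one way to judge whether apparent differences
# between curves are larger than the noise (a question that comes up below for
# black and white defendants at scores 3 and 4). It assumes the `cp` DataFrame
# and the column names used above; the helper name `calibration_curve_with_error` is mine.

# In[ ]:

def calibration_curve_with_error(df):
    """Compute the recidivism rate and a binomial standard error per decile score.

    df: DataFrame with `decile_score` and `two_year_recid`

    returns: DataFrame with columns `prob_recid`, `std_err`, and `count`
    """
    grouped = df.groupby('decile_score').two_year_recid
    p = grouped.mean()
    n = grouped.count()
    # standard error of a proportion: sqrt(p * (1-p) / n)
    std_err = np.sqrt(p * (1 - p) / n)
    return pd.DataFrame({'prob_recid': p, 'std_err': std_err, 'count': n})

# Applying the same function to a subgroup shows which points of that group's
# curve rest on only a handful of defendants.
calibration_curve_with_error(cp)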
# In[63]: def plot_calibration(df, group_vars, title=''): """Plot calibration curves for all defendants and for each group. df: DataFrame group_vars: variable(s) to group by title: string """ calibration_curve(df).plot(linestyle='dotted', label='All defendants', color='gray') for name, group in df.groupby(group_vars): if len(group) > 1000: calibration_curve(group).plot(label=name) decorate(xlabel='Decile score', ylabel='Prob recidivism', title=title) # In[64]: plot_calibration(cp, 'race') # The test is well calibrated. People with higher scores have higher probabilities of recidivism. In fact, we could use this curve to transform COMPAS scores into probabilities. # # The test is about equally calibrated for black and white defendants, although black defendants with scores 3 and 4 are more likely to recidivate than white defendants with the same scores (that apparent difference might not be statistically significant). # # Here's the breakdown by age group. # In[66]: plot_calibration(cp, 'age_cat') # The test is about equally calibrated for all age groups, which means that people with the same score have about the same probability of recidivism, regardless of what group they are in. # # There are only 4 people in the "Less than 25" group with decile score 1, which is why that data point is so out of line. # # Here's the breakdown by sex. # In[67]: plot_calibration(cp, 'sex') # Here's the first case where the test does not seem well calibrated. At all decile scores, female defendants are substantially less likely to recidivate than male defendants. # # Or, reading this graph the other way, female defendants are given decile scores 1-2 points higher than male defendants with the same actual risk of recidivism. # ### Comparing reality to the CPV model # # In this section I'll compare actual PPV and FPR for a variety of subgroups to the values we would expect based on the CPV model; that is, a model where the predictive values are the same for all groups. # # The following function groups defendants by `group_vars` and returns a table with one row for each group. # In[68]: def make_table(df, group_vars, expected_ppv, expected_npv): """Make a table with one line per group. df: DataFrame group_vars: string or list of string variable names expected_ppv: positive predictive value for the CPV model expected_npv: negative predictive value for the CPV model returns: table """ # make the DataFrame columns = ['count', 'prevalence', 'actual PPV', 'actual NPV', 'actual FNR', 'actual FPR', 'expected FPR', 'difference'] columns = group_vars + columns table = pd.DataFrame(columns=columns) # loop through the groups grouped = df.groupby(group_vars) for i, (name, group) in enumerate(grouped): if not isinstance(name, tuple): name = name, # size of group count = len(group) # compute metrics matrix = compute_matrix(group) prev = prevalence(matrix) actual_ppv, actual_npv = predictive_value(matrix) actual_fpr, actual_fnr = error_rates(matrix) # generate the CPV matrix cpv = constant_predictive_value(prev, expected_ppv, expected_npv) # get the expected error rates expected_fpr, _ = error_rates(cpv * 100) # for very low and high prevalences, it might # not be possible to achieve the given predictive values if expected_fpr < 0: expected_fpr = 0 if expected_fpr > 100: expected_fpr = 100 # difference between actual and expected diff = actual_fpr - expected_fpr # add a row to the table row = name + (count, prev, actual_ppv, actual_npv, actual_fnr, actual_fpr, expected_fpr, diff) table.loc[i] = row # sort the table by prevalence table.sort_values(by='prevalence', inplace=True) return table # In[69]: subset = cp[cp.race.isin(['African-American', 'Caucasian'])] subset.shape # Here's the breakdown by age category, after a short aside on a closed form for the expected FPR. 
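# A short aside before the subgroup tables: the expected FPR in the CPV model,
# which `make_table` obtains from the sympy solver, also has a simple closed
# form. The sketch below is not part of the original analysis and the helper
# name `expected_fpr_closed_form` is mine; it works with metrics expressed as
# percentages, like the rest of the notebook, and can serve as a spot check of
# the `expected FPR` column.

# In[ ]:

def expected_fpr_closed_form(prev, ppv, npv):
    """Expected FPR (percent) for a test with the given PPV and NPV at a given prevalence.

    prev, ppv, npv: percentages

    returns: FPR in percent
    """
    # work with fractions of the whole group
    prev, ppv, npv = prev / 100, ppv / 100, npv / 100
    # fraction of the group labeled high risk
    pos = (prev + npv - 1) / (ppv + npv - 1)
    # false positives and true negatives as fractions of the whole group
    fp = (1 - ppv) * pos
    tn = npv * (1 - pos)
    # like the sympy version, this can leave [0, 100] for extreme prevalences
    return fp / (fp + tn) * 100

# Spot check: at the overall prevalence, this reproduces the FPR for all defendants.
ppv_all, npv_all = predictive_value(matrix_all)
expected_fpr_closed_form(prevalence(matrix_all), ppv_all, npv_all), error_rates(matrix_all)[0]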
# In[70]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat'] table1 = make_table(subset, group_vars, ppv, npv) # Again, the actual behavior of the test is tempered, compare to the CPV model; that is, the results are less extreme than the model expects. # # Here's the breakdown by race. # In[71]: ppv, npv = predictive_value(matrix_all) group_vars = ['race'] table2 = make_table(subset, group_vars, ppv, npv) # The false positive rate for whites is higher than we would expect if predictive value were the same for all groups. # # The false positive rate for blacks is lower than we would expect. # # Here's the breakdown by sex. # In[72]: ppv, npv = predictive_value(matrix_all) group_vars = ['sex'] table3 = make_table(subset, group_vars, ppv, npv) # The false positive rate for women is substantially higher than what we would expect in the CPV model, which is consistent with the calibration results in the previous section. # # Here's the breakdown by age and race. # In[73]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat', 'race'] table4 = make_table(subset, group_vars, ppv, npv) # Breakdown by age and sex. # In[74]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat', 'sex'] table5 = make_table(subset, group_vars, ppv, npv) # Breakdown by race and sex. # In[75]: ppv, npv = predictive_value(matrix_all) group_vars = ['race', 'sex'] table6 = make_table(subset, group_vars, ppv, npv) # Breakdown by age, race, and sex. # In[76]: ppv, npv = predictive_value(matrix_all) group_vars = ['age_cat', 'race', 'sex'] table7 = make_table(subset, group_vars, ppv, npv) # Those are all the possible subgroups for these three variables. # # Now we can see what the results look like. # In[77]: tables = [table1, table2, table3, table4, table5, table6, table7]; # The following function plots one data point per subgroup showing the given metric versus prevalence. # # Groups with a small number of people are shown with lighter colors. # # In[78]: def plot_table_var(table, var, color): """Plot one data point per row. table: DataFrame var: which metric to plot color: string """ for _, row in table.iterrows(): alpha = 0.8 if row['count'] > 200 else 0.3 plt.plot(row['prevalence'], row[var], 'o', color=color, alpha=alpha) # Here's what the results look like for FPR. # In[81]: fp_rates.plot(label='Expected FPR, constant CPV', color='C1') plt.axhline(fpr, linestyle='dotted', label='Expected FPR, constant FPR', color='gray') for table in tables: plot_table_var(table, 'actual FPR', 'C1') decorate(xlabel='Prevalence', ylabel='Percent', title='False positive rates by subgroup') # In general, groups with higher prevalence have higher false positive rates, but the effect is less extreme than what we would expect from the CPV model. # # Here are the results for positive predictive value. # In[82]: ppv_rates.plot(label='Expected PPV, constant FPR', color='C0') plt.axhline(ppv, linestyle='dotted', label='Expected PPV, constant PPV', color='gray') for table in tables: plot_table_var(table, 'actual PPV', 'C0') decorate(xlabel='Prevalence', ylabel='Rate', title='Positive predictive value by subgroup') # Groups with higher prevalence have higher PPV, but the effect is less extreme than we would expect from the CPV model. # # Here are the results for false negative rate. 
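# Before the false negative rates, here is a rough way to quantify "tempered"
# (a sketch, not part of the original analysis): fit a straight line to the
# actual subgroup false positive rates as a function of prevalence and compare
# its slope to the slope of the CPV curve. Pooling the subgroup tables counts
# some defendants more than once, so this is only a rough comparison; the
# helper name `fit_slope` is mine.

# In[ ]:

def fit_slope(xs, ys):
    """Fit a straight line by least squares and return its slope."""
    slope, _ = np.polyfit(np.asarray(xs, dtype=float), np.asarray(ys, dtype=float), 1)
    return slope

# slope of the CPV model curve (percentage points of FPR per point of prevalence)
cpv_slope = fit_slope(fp_rates.index, fp_rates.values)

# slope of the actual subgroup results, pooling all the tables
pooled = pd.concat(tables, sort=False)
actual_slope = fit_slope(pooled['prevalence'], pooled['actual FPR'])

cpv_slope, actual_slope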
# In[83]: fn_rates.plot(label='Expected FNR, constant CPV', color='C4') plt.axhline(fnr, linestyle='dotted', label='Expected FNR, constant FNR', color='gray') for table in tables: plot_table_var(table, 'actual FNR', 'C4') decorate(xlabel='Prevalence', ylabel='Percent', title='False negative rates by subgroup') # Groups with higher prevalence have lower FNR, but the effect is less extreme than we would expect from the CPV model. # # Here are the results for negative predictive value. # In[84]: npv_rates.plot(label='Expected NPV, constant FPR', color='C2') plt.axhline(npv, linestyle='dotted', label='Expected NPV, constant NPV', color='gray') for table in tables: plot_table_var(table, 'actual NPV', 'C2') decorate(xlabel='Prevalence', ylabel='Percent', title='Negative predictive value by subgroup') # Groups with higher prevalence have lower NPV. In this case, the effect is almost exactly what we would expect from the CPV model. # ### Individual FPR # In[85]: from scipy.interpolate import interp1d def crossing(series, value, **options): """Find where a function crosses a value. series: Series value: number options: passed to interp1d (default is linear interp) returns: number """ interp = interp1d(series.values, series.index, **options) return interp(value) def interpolate(series, value, **options): """Evaluate a function at a value. series: Series value: number options: passed to interp1d (default is linear interp) returns: number """ interp = interp1d(series.index, series.values, **options) return interp(value) # In[86]: cal_all = calibration_curve(cp) cal_all.plot() decorate(ylabel='Probability of recidivism') # In[87]: crossing(cal_all, 0.4) # In[88]: crossing(cal_all, 0.7) # In[89]: interpolate(cal_all, 3.4) # In[90]: interpolate(cal_all, 9) # In[91]: def make_error_dist(std_dev): """Make a discrete Gaussian distribution. std_dev: standard deviation returns: Series that maps errors to probabilities """ errors = np.linspace(-3, 3, 21) prob_error = np.exp(-(errors/std_dev)**2) prob_error /= np.sum(prob_error) error_dist = pd.Series(prob_error, index=errors) return error_dist # In[92]: error_dist = make_error_dist(std_dev=2) error_dist.plot(label='') decorate(xlabel='Error (score)', ylabel='Probability') # In[93]: def individual_fpr(actual_prob_recid, cal, thresh, std_dev): """Compute an individual FPR. actual_prob_recid: actual probability of recidivism cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: individual FPR """ # look up actual_prob_recid to get correct score correct_score = crossing(cal, actual_prob_recid) # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # hypothetical score score = correct_score+error score = max(score, 1) score = min(score, 10) # probability of being classified 'not low' | error prob_positive = 0 if score < thresh else 1 # probability of being a false positive | error prob_fp = prob_positive * (1-actual_prob_recid) total_prob += prob_error * prob_fp return total_prob # In[94]: individual_fpr(0.3, cal_all, 4.5, 2) # In[95]: individual_fpr(0.5, cal_all, 4.5, 2) # In[96]: individual_fpr(0.7, cal_all, 4.5, 2) # In[97]: def compute_fpr_vs_prob_recid(cal, thresh, std_dev): """Computes FPR as a function of probability of recidivism. 
cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: Series """ prob_recid_array = np.linspace(min(cal), max(cal), 21) prob_fpr_series = pd.Series(index=prob_recid_array) for prob_recid in prob_recid_array: fpr = individual_fpr(prob_recid, cal, thresh, std_dev) prob_fpr_series[prob_recid] = fpr return prob_fpr_series # In[98]: s = compute_fpr_vs_prob_recid(cal_all, thresh=4.5, std_dev=2) s.plot(label='FPR, std_dev=2') s = compute_fpr_vs_prob_recid(cal_all, thresh=4.5, std_dev=1) s.plot(label='FPR, std_dev=1') decorate(xlabel='Actual probability of recidivism', ylabel='Probability of false positive') # In[99]: def individual_fpr_given_score(actual_score, cal, thresh, std_dev): """Compute an individual FPR. actual_score: score assigned cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function returns: individual FPR """ # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # correct score correct_score = actual_score-error correct_score = max(correct_score, 1) correct_score = min(correct_score, 10) # map from correct score to probability of recidivism. # if calibration curves are different for different # groups, this one should be group specific. correct_prob_recid = interpolate(cal, correct_score) cond_ifpr = individual_fpr(correct_prob_recid, cal, thresh, std_dev) total_prob += prob_error * cond_ifpr return total_prob # In[100]: individual_fpr_given_score(6, cal_all, thresh=4.5, std_dev=2) # In[131]: def make_ifpr_series(cal, thresh, std_dev): scores = np.arange(1, 11) t = [individual_fpr_given_score(score, cal, thresh, std_dev) for score in scores] ifpr_series = pd.Series(t, scores) return ifpr_series # In[132]: thresh = 4.5 s = make_ifpr_series(cal_all, thresh, std_dev=2) s.plot(label='FPR, std_dev=2') s = make_ifpr_series(cal_all, thresh, std_dev=1) s.plot(label='FPR, std_dev=1') decorate(xlabel='Score', ylabel='Probability of false positive') # In[133]: def assign_individual_fpr(df, cal, thresh, std_dev): """Assign individual FPRs to defendants. 
df: DataFrame cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function """ # compute the map from score to FPR ifpr_series = make_ifpr_series(cal, thresh, std_dev) # assign FPR to each defendant df['ifpr'] = [ifpr_series[score] for score in df.decile_score] # In[134]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=2) # In[135]: def make_cdf(series): """Make a CDF.""" counts = series.value_counts().sort_index() counts /= counts.sum() return counts.cumsum() def plot_cdf(cdf, **options): """Plot a CDF as a step function.""" plt.step(cdf.index, cdf.values, where='post', **options) # In[136]: cdf_ifpr = make_cdf(cp.ifpr) # In[137]: plot_cdf(cdf_ifpr, label='All') decorate(xlabel='Individual probability of false positive', ylabel='CDF', ylim=[0,1]) # In[138]: black = cp[cp.race=='African-American'] white = cp[cp.race=='Caucasian'] thresh = 4.5 std_dev = 2 cal_black = calibration_curve(black) s = make_ifpr_series(cal_black, thresh, std_dev) s.plot(label='FPR, black') cal_white = calibration_curve(white) s = make_ifpr_series(cal_white, thresh, std_dev) s.plot(label='FPR, white') decorate(xlabel='Score', ylabel='Probability of false positive') # In[139]: male = cp[cp.sex=='Male'] female = cp[cp.sex=='Female'] thresh = 4.5 std_dev = 2 cal_male = calibration_curve(male) s = make_ifpr_series(cal_male, thresh, std_dev) s.plot(label='FPR, male') cal_female = calibration_curve(female) s = make_ifpr_series(cal_female, thresh, std_dev) s.plot(label='FPR, female') decorate(xlabel='Score', ylabel='Probability of false positive') # In[176]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=2) cp.ifpr.mean() # In[177]: cp.groupby('race').ifpr.mean() # In[178]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=1) cp.ifpr.mean() # In[179]: cp.groupby('race').ifpr.mean() # In[180]: assign_individual_fpr(cp, cal_all, thresh=4.5, std_dev=0.01) cp.ifpr.mean() # In[181]: cp.groupby('race').ifpr.mean() # ### Individual FNR # In[141]: cal_all = calibration_curve(cp) cal_all.plot() decorate(ylabel='Probability of recidivism') # In[142]: def individual_fnr(actual_prob_recid, cal, thresh, std_dev): """Compute an individual FNR. actual_prob_recid: actual probability of recidivism cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: individual FNR """ # look up actual_prob_recid to get correct score correct_score = crossing(cal, actual_prob_recid) # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # hypothetical score score = correct_score+error score = max(score, 1) score = min(score, 10) # probability of being classified 'low' | error prob_negative = 0 if score >= thresh else 1 # probability of being a false negative | error prob_fp = prob_negative * actual_prob_recid total_prob += prob_error * prob_fp return total_prob # In[143]: individual_fnr(0.3, cal_all, 4.5, 2) # In[144]: individual_fnr(0.5, cal_all, 4.5, 2) # In[145]: individual_fnr(0.7, cal_all, 4.5, 2) # In[146]: def compute_fnr_vs_prob_recid(cal, thresh, std_dev): """Computes FNR as a function of probability of recidivism. 
cal: calibration curve, map from score to prob_recid thresh: threshold between low and not low risk std_dev: standard deviation of the error function returns: Series """ prob_recid_array = np.linspace(min(cal), max(cal), 21) prob_fnr_series = pd.Series(index=prob_recid_array) for prob_recid in prob_recid_array: fnr = individual_fnr(prob_recid, cal, thresh, std_dev) prob_fnr_series[prob_recid] = fnr return prob_fnr_series # In[147]: s = compute_fnr_vs_prob_recid(cal_all, thresh=4.5, std_dev=2) s.plot(label='FNR, std_dev=2') s = compute_fnr_vs_prob_recid(cal_all, thresh=4.5, std_dev=1) s.plot(label='FNR, std_dev=1') decorate(xlabel='Actual probability of recidivism', ylabel='Probability of false negative') # In[148]: def individual_fnr_given_score(actual_score, cal, thresh, std_dev): """Compute an individual FNR. actual_score: score assigned cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function returns: individual FNR """ # make the error distribution error_dist = make_error_dist(std_dev) # loop through possible errors total_prob = 0 for error, prob_error in error_dist.iteritems(): # correct score correct_score = actual_score-error correct_score = max(correct_score, 1) correct_score = min(correct_score, 10) # map from correct score to probability of recidivism. # if calibration curves are different for different # groups, this one should be group specific. correct_prob_recid = interpolate(cal, correct_score) cond_ifnr = individual_fnr(correct_prob_recid, cal, thresh, std_dev) total_prob += prob_error * cond_ifnr return total_prob # In[149]: individual_fnr_given_score(6, cal_all, thresh=4.5, std_dev=2) # In[150]: def make_ifnr_series(cal, thresh, std_dev): scores = np.arange(1, 11) t = [individual_fnr_given_score(score, cal, thresh, std_dev) for score in scores] ifnr_series = pd.Series(t, scores) return ifnr_series # In[151]: thresh = 4.5 s = make_ifnr_series(cal_all, thresh, std_dev=2) s.plot(label='FNR, std_dev=2') s = make_ifnr_series(cal_all, thresh, std_dev=1) s.plot(label='FNR, std_dev=1') decorate(xlabel='Score', ylabel='Probability of false negative') # In[152]: def assign_individual_fnr(df, cal, thresh, std_dev): """Assign individual FNRs to defendants. 
df: DataFrame cal: calibration curve, map from score to prob_recid thresh: threshold between low and high risk std_dev: standard deviation of the error function """ # compute the map from score to FPR ifnr_series = make_ifnr_series(cal, thresh, std_dev) # assign FPR to each defendant df['ifnr'] = [ifnr_series[score] for score in df.decile_score] # In[153]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=2) # In[154]: cdf_ifnr = make_cdf(cp.ifnr) # In[155]: plot_cdf(cdf_ifnr, label='All') decorate(xlabel='Individual probability of false negative', ylabel='CDF', ylim=[0,1]) # In[159]: thresh = 4.5 std_dev = 2 cal_black = calibration_curve(black) s = make_ifnr_series(cal_black, thresh, std_dev) s.plot(label='FNR, black') cal_white = calibration_curve(white) s = make_ifnr_series(cal_white, thresh, std_dev) s.plot(label='FNR, white') decorate(xlabel='Score', ylabel='Probability of false negative') # In[160]: thresh = 4.5 std_dev = 2 cal_male = calibration_curve(male) s = make_ifnr_series(cal_male, thresh, std_dev) s.plot(label='FNR, male') cal_female = calibration_curve(female) s = make_ifnr_series(cal_female, thresh, std_dev) s.plot(label='FNR, female') decorate(xlabel='Score', ylabel='Probability of false negative') # In[182]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=2) cp.ifnr.mean() # In[183]: cp.groupby('race').ifnr.mean() # In[184]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=1) cp.ifnr.mean() # In[185]: cp.groupby('race').ifnr.mean() # In[186]: assign_individual_fnr(cp, cal_all, thresh=4.5, std_dev=0.01) cp.ifnr.mean() # In[187]: cp.groupby('race').ifnr.mean() # In[ ]: # In[ ]: # In[ ]: # ### What would it take? # # In this section I explore what it would take to make a test with the same false positive rate for all groups. 
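# The key quantity in this section is the false positive rate as a function of
# the score threshold. As a minimal sketch (not part of the original analysis;
# the helper name `fpr_direct` is mine), it can be computed straight from the
# definition: among the defendants in a group who did not recidivate, what
# fraction have a decile score at or above the threshold? The cells below
# compute the same thing with the crosstab machinery and then search for the
# threshold that hits a target FPR in each group.

# In[ ]:

def fpr_direct(df, thresh):
    """False positive rate (percent) at a given score threshold.

    df: DataFrame
    thresh: decile score at or above which a defendant counts as high risk

    returns: percent of non-recidivists labeled high risk
    """
    non_recid = df[df.two_year_recid == 0]
    return (non_recid.decile_score >= thresh).mean() * 100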
# In[ ]: def fpr_thresh(df, thresh): df = df.copy() df['high'] = df.decile_score >= thresh matrix_all = crosstab(df, 'two_year_recid', 'high') fpr, fnr = error_rates(matrix_all) return fpr # # In[ ]: fpr_thresh(cp, 5) # # In[ ]: fpr_thresh(black, 5) # # In[ ]: fpr_thresh(white, 5) # # In[ ]: def sweep_thresh(df): threshes = range(2,10) sweep = pd.Series(index=threshes) for thresh in threshes: sweep[thresh] = fpr_thresh(df, thresh) return sweep # # In[ ]: plt.axhline(32.25, color='gray') sweep_thresh(cp).plot(label='All') sweep_thresh(black).plot(label='Black') sweep_thresh(white).plot(label='White') decorate(xlabel='Threshold', ylabel='False positive rate') # # In[ ]: # # In[ ]: def find_threshold(group, fpr): series = sweep_thresh(group) xs = crossing(series.dropna(), fpr) return xs # # In[ ]: all_thresh = find_threshold(cp, 32.35) # # In[ ]: black_thresh = find_threshold(black, 32.35) # # In[ ]: white_thresh = find_threshold(white, 32.35) # # In[ ]: # # In[ ]: interpolate(calibration_curve(cp), all_thresh) # # In[ ]: interpolate(calibration_curve(cp), black_thresh) # # In[ ]: interpolate(calibration_curve(black), black_thresh) # # In[ ]: interpolate(calibration_curve(cp), white_thresh) # # In[ ]: interpolate(calibration_curve(white), white_thresh) # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: # # In[ ]: black_male = black[black.sex=='Male'] black_male.shape # # In[ ]: black_female = black[black.sex=='Female'] black_female.shape # # In[ ]: old_black_female = black_female[black_female.age_cat=='Greater than 45'] old_black_female.shape # # In[ ]: old_white_female = cp[(cp.age_cat=='Greater than 45') & (cp.sex=='Female') & (cp.race=='Caucasian')] old_white_female.shape # # In[ ]: young_black_male = cp[(cp.age_cat=='Less than 25') & (cp.sex=='Male') & (cp.race=='African-American')] young_black_male.shape # # In[ ]: fpr_thresh(cp, 5) # # In[ ]: fpr_thresh(black, 5) # # In[ ]: fpr_thresh(black_female, 5) # # In[ ]: fpr_thresh(old_black_female, 5) # # In[ ]: fpr_thresh(black_male, 5) # # In[ ]: fpr_thresh(young_black_male, 5) # # In[ ]: plt.axhline(32.25, color='gray') sweep_thresh(black).plot(label='Black', color='gray') sweep_thresh(black_male).plot(label='Black male') sweep_thresh(young_black_male).plot(label='Young black male') decorate(xlabel='Threshold', ylabel='False positive rate') # # In[ ]: plt.axhline(32.25, color='gray') sweep_thresh(black).plot(label='Black', color='gray') sweep_thresh(black_female).plot(label='Black female') sweep_thresh(old_black_female).plot(label='Old black female') decorate(xlabel='Threshold', ylabel='False positive rate') # # In[ ]: ybm_thresh = find_threshold(young_black_male, 32.35) # # In[ ]: obf_thresh = find_threshold(old_black_female, 32.35) # # In[ ]: interpolate(calibration_curve(cp), ybm_thresh) # # In[ ]: interpolate(calibration_curve(cp), obf_thresh) # # In[ ]:
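# As a summary of this section, the sketch below (not part of the original
# analysis; the helper name `threshold_summary` is mine) collects the
# group-specific thresholds needed to hit a common false positive rate, along
# with the probability of recidivism at that threshold implied by the overall
# calibration curve. The target FPR of 32.35 is the value used in the cells above.

# In[ ]:

def threshold_summary(groups, target_fpr=32.35):
    """Make a table of equal-FPR thresholds and the implied risk at each threshold.

    groups: dict mapping a group name to a DataFrame of defendants
    target_fpr: false positive rate (percent) that every group should match

    returns: DataFrame with one row per group
    """
    cal = calibration_curve(cp)
    rows = {}
    for name, group in groups.items():
        thresh = find_threshold(group, target_fpr)
        rows[name] = dict(threshold=float(thresh),
                          prob_recid_at_threshold=float(interpolate(cal, thresh)))
    return pd.DataFrame(rows).T

threshold_summary({'All defendants': cp, 'Black defendants': black, 'White defendants': white})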