#!/usr/bin/env python
# coding: utf-8

# In[1]:


# use svg graphics, display inline
get_ipython().run_line_magic('matplotlib', 'inline')
# %config InlineBackend.figure_format = 'svg'

import glob
import re
import copy
import sys
from datetime import datetime

# basic scientific computing imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sparklines import sparklines

# hex colors for plotting
SOFT_PURPLE = '#8565C4'
SOFT_RED = '#C23F38'
SOFT_GREEN = '#56B000'
NEUTRAL_GREY = '#A9A9A9'

# display config
pd.set_option('display.float_format', lambda x: '%.3f' % x)
plt.style.use('ggplot')
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 6.4, 4.8
np.set_printoptions(suppress=True)
np.random.seed(42)

print(sys.version)


# # Fetch and load the data

# In[2]:


url = 'https://data.texas.gov/api/views/y6p5-x3b9/rows.csv?accessType=DOWNLOAD'
date_cols = [
    'Projected Release', 
    'Maximum Sentence Date', 
    'Parole Eligibility Date',
    'Sentence Date', 
    'Offense Date', 
    'Next Parole Review Date'
]

df = pd.read_csv(url, parse_dates=date_cols)
df.sample(3)


# Some of the date columns didn't get parsed as `datetime` columns. This is because some of the release dates were coded as 9999/01/01, meaning that the person is never getting out. I'll fix it if I end up needing the columns for something.

# # Clean and format
# - format the column names
#     - e.g. 'First Name' -> 'first_name'
# - convert race codes to full words
#     - e.g. 'W' -> 'white'
# - Assign new columns

# In[3]:


df = (
    df
    .rename(
        columns=lambda x: x.lower().replace(' ', '_').replace('(', '').replace(')', '')
    )
    .replace({
        'race': {
            'W': 'white',
            'H': 'hispanic',
            'B': 'black',
            'O': 'other',
            'A': 'asian',
            'I': 'indian'
        }
    })
    .assign(
        is_female=lambda df_: df_.gender.str.match('F'),
        sentence_year=lambda df_: df_.sentence_date.dt.year,
        is_life_sentence=lambda df_: df_.sentence_years.str.lower().str.contains('life'),
        is_murder=lambda df_: df_.tdcj_offense.str.contains('MURDER'),
        is_drug_possession=lambda df_: df_.tdcj_offense.str.contains('POSS'),
        is_sex_crime=lambda df_: df_.tdcj_offense.str.contains('INDEC|SEX'),
        is_robbery=lambda df_: df_.tdcj_offense.str.contains('ROBBERY'),
        is_assault=lambda df_: df_.tdcj_offense.str.contains('ASLT'),
        age_at_sentencing=lambda df_: df_.sentence_year - (datetime.now().year - df_.age),
        last_parole_denied=lambda df_: df_.last_parole_decision.str.contains('Denied'),
        last_parole_approved=lambda df_: df_.last_parole_decision.str.contains('Approved')
    )
)

df.sample(3)


# In[4]:


(df
    .query('sentence_year > 1980')
    .pivot_table(
        index='sentence_year',
        columns='is_murder',
        values='is_life_sentence'
    )
    .multiply(100)
    .plot.line(title='Life sentence rate for murder and non-murder convictions')
);


# In[5]:


(df
    .query('is_life_sentence == True')
    .tdcj_offense
    .value_counts()
    .head(15)
    .sort_values()
    .plot.barh(title='Top offenses for inmates serving life sentence', color=SOFT_PURPLE)
);


# In[6]:


(df
    .gender
    .value_counts(normalize=True)
    .multiply(100)
    .plot.pie(title='Gender breakdown of inmates')
);


# In[7]:


(df
    .query('1990 <= sentence_year <= 2020')
    .pivot_table(index='sentence_year', columns='is_life_sentence', values='is_female')
    .multiply(100)
    .plot.line(title='% female inmates, life sentences and overall', ylim=(0, None))
);


# In[8]:


(df
    .race
    .value_counts(normalize=True)
    .sort_values()
    .multiply(100)
    .plot.barh(
        title='Racial breakdown of inmates', 
        xlim=(0, 40), 
        color=SOFT_PURPLE
    )
);


# In[9]:


(df
    .query('1990 <= sentence_year <= 2020')
    .groupby('sentence_year')
    .race
    .value_counts(normalize=True)
    .unstack('race')
    .multiply(100)
    .plot.line(title='Racial breakdown of inmates')
);


# In[10]:


(df
    .query('is_life_sentence == True')
    .current_facility
    .value_counts()
    .head(10)
    .sort_values()
    .plot.barh(
        title='Facilities with the highest number of life sentences',
        color=SOFT_PURPLE
    )
);


# In[11]:


(df
    .groupby('current_facility')
    .is_life_sentence
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .multiply(100)
    .sort_values()
    .plot.barh(
        title='Facilities with the percentage of life sentences',
        color=SOFT_PURPLE
    )
);


# In[12]:


(df
    .groupby('tdcj_offense')
    .agg(
        perc_female=('is_female', np.mean),
        total=('is_female', len)
    )
    .query('total >= 20')   # semi arbitrary threshold
    .head(10)
    .sort_values('perc_female')
    .perc_female
    .multiply(100)
    .plot.barh(
        title='Offenses disproportionately committed by women',
        color=SOFT_PURPLE
    )
);


# In[13]:


(df
    .query('1980 <= sentence_year <= 2020')
    .groupby('sentence_year')
    .is_female
    .mean()
    .multiply(100)
    .plot.line(
        title='% female imates by year', 
        ylim=(0, None),
        color=SOFT_PURPLE
    )
);


# In[14]:


(df
    .query('1980 <= sentence_year <= 2020')
    .groupby('sentence_year')
    .tdcj_number
    .nunique()
    .plot.line(
        title='Total inmates sentenced by year', 
        color=SOFT_PURPLE
    )
);


# In[15]:


(df
    .query('1980 <= sentence_year <= 2020')
    .groupby('sentence_year')
    .is_murder
    .mean()
    .multiply(100)
    .plot.line(
        title='% of sentencings for murder',
        color=SOFT_PURPLE
    )
);


# In[16]:


(df
    .query('1980 <= sentence_year <= 2021')
    .groupby('sentence_year')
    .is_murder
    .sum()
    .plot.line(
        title='Number of sentencings for murder',
        color=SOFT_PURPLE
    )
);


# In[17]:


(df
    .tdcj_offense
    .value_counts()
    .head(10)
    .sort_values(ascending=True)
    .plot.barh(title='Most common offenses', color=SOFT_PURPLE)
);


# In[18]:


(df
    [[
        'sentence_year', 
        'is_murder', 
        'is_drug_possession', 
        'is_sex_crime', 
        'is_robbery', 
        'is_assault'
     ]]
    .query('1980 <= sentence_year <= 2020')
    .groupby('sentence_year')
    .mean()
    .multiply(100)
    .plot.line()
);


# In[19]:


(df
    .query('1980 <= sentence_year <= 2021')
    .groupby('sentence_year')
    .agg(
        mean=('age_at_sentencing', np.mean),
        median=('age_at_sentencing', np.median),
    )
    .plot.line(
        title='Age at sentencing', 
        ylim=(0, 40),
        ylabel='age (years)'
    )
);


# In[20]:


(df
    .query('sentence_year == 2000')
    .age_at_sentencing
    .plot.hist(
        title='Age at sentencing',
        color=SOFT_PURPLE
    )
);


# In[21]:


(df
    .query('1980 <= sentence_year <= 2021')
    .groupby('sentence_year')
    .age_at_sentencing
    .agg(lambda x: x.gt(35).mean())
    .multiply(100)
    .plot.line(
        title='% of sentences over age 35', 
        ylim=(-2, 50), 
        ylabel='% > 35', 
        color=SOFT_PURPLE
    )
);


# In[22]:


(df
    .query('sentence_year == 2000')
    .age_at_sentencing
    .std()
)


# In[23]:


(df
    .query('1980 <= sentence_year <= 2021')
    .pivot_table(
        values='age_at_sentencing',
        index='sentence_year',
        columns='gender',
        aggfunc=lambda x: x.gt(35).mean() * 100
    )
    .plot.line(title='% of sentences over age 35, by gender', ylim=(-2, 50))
);


# The effect is present in both genders

# In[24]:


(df
    .groupby('tdcj_offense').filter(lambda df_: len(df_) > 20)
    .groupby('tdcj_offense')
    .aggregate(mean_age=('age', np.mean))
    .sort_values('mean_age', ascending=True)
    .mean_age
    .head(15)
    .sort_values(ascending=False)
    .plot.barh(
        title='Offenses with youngest average age', 
        xlim=(25, None), 
        color=SOFT_PURPLE
    )
);


# In[25]:


(df
    .groupby('tdcj_offense')
    .agg(
        mean_age=('age', np.mean),
        total=('age', len)
    )
    .query('total > 20')
    .mean_age
    .sort_values()
    .tail(20)
    .plot.barh(
        title='Offenses with oldest average age', 
        xlim=(40, None), 
        color=SOFT_PURPLE
    )
);


# In[26]:


df.sample(3).T