import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from constants import *
%matplotlib inline
# Location of the CRDC extract that also carries latitude/longitude columns.
DATA_FILE = 'data (download CSVs here)/crdc-data-with-lat-long.csv'

# Load only the columns named in COLS_WITH_NEEDED_DATA (pulled in by `from constants import *`).
crdc_data = pd.read_csv(DATA_FILE, usecols=COLS_WITH_NEEDED_DATA, low_memory=False, encoding="ISO-8859-1")

# Restrict the analysis to the rows whose state name is North Carolina.
df = crdc_data[crdc_data['LEA_STATE_NAME'].eq('NORTH CAROLINA')]
df
LEA_STATE_NAME | LEAID | LEA_NAME | SCH_NAME | SCH_ENR_HI_M | SCH_ENR_HI_F | SCH_ENR_BL_M | SCH_ENR_BL_F | SCH_ENR_WH_M | SCH_ENR_WH_F | ... | TOT_DISCWODIS_EXPZT_M | TOT_DISCWODIS_EXPZT_F | SCH_FTESECURITY_LEO | SCH_FTESECURITY_GUA | SCH_FTESERVICES_NUR | SCH_FTESERVICES_PSY | SCH_FTESERVICES_SOC | SCH_JJTYPE | LAT1516 | LON1516 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
61931 | NORTH CAROLINA | 3700001 | NC Health and Human Services | Riverbend School | 0 | 2 | 2 | 2 | 2 | 2 | ... | 0 | 0 | 1.00 | 0.0 | 13.00 | 2.00 | 3.00 | -9 | 35.766864 | -78.656315 |
61932 | NORTH CAROLINA | 3700001 | NC Health and Human Services | The Whitaker School PRTF | 0 | 2 | 2 | 2 | 5 | 2 | ... | 0 | 0 | -9.00 | 0.0 | 11.00 | 2.00 | 3.00 | -9 | 35.766864 | -78.656315 |
61933 | NORTH CAROLINA | 3700001 | NC Health and Human Services | Caswell Center | 0 | 0 | 5 | 0 | 2 | 0 | ... | 0 | 0 | -9.00 | 0.0 | 2.00 | 1.00 | 1.00 | -9 | 35.766864 | -78.656315 |
61934 | NORTH CAROLINA | 3700001 | NC Health and Human Services | Enola School | 2 | 0 | 5 | 2 | 5 | 5 | ... | 0 | 0 | 17.00 | 0.0 | 24.00 | 2.00 | 2.00 | -9 | 35.766864 | -78.656315 |
61935 | NORTH CAROLINA | 3700001 | NC Health and Human Services | Pine Valley and Bowling Green Schools | 2 | 2 | 8 | 5 | 8 | 5 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 1.00 | 9.00 | -9 | 35.766864 | -78.656315 |
61936 | NORTH CAROLINA | 3700001 | NC Health and Human Services | Bonnie Springer School | 2 | 0 | 8 | 2 | 11 | 8 | ... | 0 | 0 | -9.00 | 0.0 | 12.00 | 4.00 | 3.00 | -9 | 35.766864 | -78.656315 |
61937 | NORTH CAROLINA | 3700001 | NC Health and Human Services | The Wright School | 0 | 0 | 2 | 2 | 11 | 5 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.766864 | -78.656315 |
61938 | NORTH CAROLINA | 3700002 | NC School of Science and Math | NC School of Science and Math | 17 | 17 | 23 | 35 | 173 | 176 | ... | 0 | 0 | 6.00 | 8.0 | 1.00 | 0.00 | 0.00 | -9 | 36.019200 | -78.920700 |
61939 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Ashley Elementary | 8 | 5 | 47 | 53 | 41 | 38 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.91 | 1.00 | -9 | 35.038152 | -78.906834 |
61940 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Beaver Dam Elementary | 5 | 8 | 2 | 2 | 38 | 41 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 34.894480 | -78.580797 |
61941 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Lillian Black Elementary | 20 | 23 | 56 | 71 | 11 | 17 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.167600 | -78.971236 |
61942 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Brentwood Elementary | 41 | 38 | 185 | 179 | 35 | 32 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.038114 | -78.983271 |
61943 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Douglas Byrd Middle | 53 | 50 | 212 | 170 | 74 | 62 | ... | 0 | 0 | 0.70 | 0.0 | 0.00 | 1.00 | 1.00 | -9 | 35.032889 | -78.948687 |
61944 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Douglas Byrd High | 89 | 68 | 320 | 281 | 131 | 110 | ... | 0 | 0 | 1.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.030827 | -78.948512 |
61945 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Cape Fear High | 56 | 65 | 227 | 188 | 449 | 407 | ... | 0 | 0 | 1.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.034044 | -78.762384 |
61946 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Elizabeth M Cashwell Elementary | 53 | 50 | 203 | 209 | 86 | 62 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 1.00 | 1.00 | -9 | 35.006686 | -78.914821 |
61947 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Eastover-Central Elementary | 14 | 20 | 35 | 32 | 137 | 137 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.132837 | -78.750063 |
61948 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Anne Chesnutt Middle | 38 | 23 | 131 | 113 | 23 | 35 | ... | 0 | 0 | 1.00 | 0.0 | 0.00 | 0.99 | 1.00 | -9 | 35.050781 | -78.972985 |
61949 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Cliffdale Elementary | 50 | 53 | 209 | 203 | 41 | 32 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 1.00 | 1.00 | -9 | 35.059376 | -78.995347 |
61950 | NORTH CAROLINA | 3700011 | Cumberland County Schools | College Lakes Elementary | 29 | 17 | 128 | 113 | 62 | 38 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.127892 | -78.899248 |
61951 | NORTH CAROLINA | 3700011 | Cumberland County Schools | J W Coon Elementary | 17 | 17 | 47 | 41 | 41 | 35 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.032548 | -78.959857 |
61952 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Cumberland Mills Elementary | 62 | 53 | 161 | 140 | 71 | 83 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 1.00 | 1.00 | -9 | 35.002905 | -78.968089 |
61953 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Cumberland Road Elementary | 38 | 29 | 95 | 107 | 44 | 44 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.021270 | -78.910509 |
61954 | NORTH CAROLINA | 3700011 | Cumberland County Schools | District No 7 Elementary | 17 | 14 | 20 | 11 | 95 | 65 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.144980 | -78.710200 |
61955 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Alderman Road Elementary | 50 | 44 | 119 | 101 | 179 | 155 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 34.895249 | -78.860336 |
61956 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Hope Mills Middle | 53 | 32 | 95 | 104 | 137 | 122 | ... | 0 | 0 | 1.00 | 0.0 | 0.00 | 0.00 | 1.50 | -9 | 34.965736 | -78.939631 |
61957 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Ed V Baldwin Elementary | 44 | 35 | 119 | 131 | 131 | 122 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 34.978253 | -78.934382 |
61958 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Lewis Chapel Middle | 38 | 23 | 233 | 176 | 29 | 14 | ... | 0 | 0 | 1.00 | 0.0 | 0.00 | 1.00 | 1.00 | -9 | 35.048145 | -78.976740 |
61959 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Long Hill Elementary | 29 | 29 | 47 | 50 | 119 | 122 | ... | 0 | 0 | -9.00 | 0.0 | 0.00 | 0.00 | 0.50 | -9 | 35.162099 | -78.865417 |
61960 | NORTH CAROLINA | 3700011 | Cumberland County Schools | Massey Hill Classical High | 26 | 23 | 32 | 56 | 68 | 95 | ... | 0 | 0 | 0.77 | 0.0 | 0.00 | 0.00 | 1.00 | -9 | 35.031037 | -78.895077 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
64519 | NORTH CAROLINA | 3705020 | Wilson County Schools | Charles H Darden Middle | 32 | 35 | 131 | 119 | 8 | 8 | ... | 0 | 0 | 1.00 | 0.0 | 0.25 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64520 | NORTH CAROLINA | 3705020 | Wilson County Schools | Forest Hills Middle | 56 | 35 | 158 | 191 | 65 | 59 | ... | 0 | 0 | 1.00 | 0.0 | 0.25 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64521 | NORTH CAROLINA | 3705020 | Wilson County Schools | Toisnot Middle | 35 | 35 | 122 | 134 | 56 | 44 | ... | 0 | 0 | 1.00 | 0.0 | 0.25 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64522 | NORTH CAROLINA | 3705020 | Wilson County Schools | Milton M Daniels Learning Cntr | 11 | 2 | 53 | 32 | 11 | 0 | ... | 0 | 0 | 1.00 | 0.0 | 0.13 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64523 | NORTH CAROLINA | 3705020 | Wilson County Schools | Vick Elementary | 47 | 44 | 122 | 122 | 8 | 2 | ... | 0 | 0 | -9.00 | 0.0 | 0.25 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64524 | NORTH CAROLINA | 3705020 | Wilson County Schools | John W Jones Elementary | 41 | 35 | 167 | 161 | 110 | 77 | ... | 0 | 0 | -9.00 | 0.0 | 0.25 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64525 | NORTH CAROLINA | 3705020 | Wilson County Schools | Wilson Early College Academy | 20 | 38 | 29 | 86 | 38 | 65 | ... | 0 | 0 | 1.00 | 0.0 | 0.12 | 0.22 | 0.12 | -9 | 35.725200 | -77.911600 |
64526 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Boonville Elementary | 29 | 32 | 8 | 8 | 137 | 110 | ... | 0 | 0 | -9.00 | 0.0 | 0.33 | 0.10 | 0.00 | -9 | 36.136680 | -80.669441 |
64527 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Courtney Elementary | 26 | 32 | 5 | 2 | 110 | 107 | ... | 0 | 0 | -9.00 | 0.0 | 1.40 | 0.20 | 0.33 | -9 | 36.136680 | -80.669441 |
64528 | NORTH CAROLINA | 3705040 | Yadkin County Schools | East Bend Elementary | 26 | 29 | 2 | 2 | 101 | 107 | ... | 0 | 0 | -9.00 | 0.0 | 0.40 | 0.05 | 0.10 | -9 | 36.136680 | -80.669441 |
64529 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Fall Creek Elementary | 29 | 17 | 0 | 2 | 131 | 125 | ... | 0 | 0 | -9.00 | 0.0 | 0.50 | 0.44 | 0.10 | -9 | 36.136680 | -80.669441 |
64530 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Forbush High | 95 | 68 | 11 | 11 | 377 | 338 | ... | 0 | 0 | 0.50 | 0.0 | 0.50 | 0.15 | 0.10 | -9 | 36.136680 | -80.669441 |
64531 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Jonesville Elementary | 41 | 38 | 20 | 14 | 98 | 110 | ... | 0 | 0 | -9.00 | 0.0 | 0.50 | 0.50 | 0.89 | -9 | 36.136680 | -80.669441 |
64532 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Starmount High | 62 | 62 | 14 | 8 | 251 | 221 | ... | 0 | 0 | 0.50 | 0.0 | 0.67 | 0.18 | 0.03 | -9 | 36.136680 | -80.669441 |
64533 | NORTH CAROLINA | 3705040 | Yadkin County Schools | West Yadkin Elementary | 107 | 101 | 2 | 2 | 191 | 143 | ... | 0 | 0 | -9.00 | 0.0 | 0.50 | 0.16 | 0.03 | -9 | 36.136680 | -80.669441 |
64534 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Yadkinville Elementary | 134 | 107 | 11 | 11 | 167 | 149 | ... | 0 | 0 | -9.00 | 0.0 | 0.60 | 0.33 | 0.50 | -9 | 36.136680 | -80.669441 |
64535 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Forbush Elementary | 20 | 8 | 2 | 2 | 119 | 113 | ... | 0 | 0 | -9.00 | 0.0 | 0.40 | 0.05 | 0.10 | -9 | 36.136680 | -80.669441 |
64536 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Yadkin Success Academy | 5 | 2 | 2 | 2 | 17 | 14 | ... | 0 | 0 | 0.00 | 0.0 | 0.10 | 0.05 | 1.20 | -9 | 36.136680 | -80.669441 |
64537 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Yadkin Early College | 26 | 47 | 0 | 2 | 65 | 71 | ... | 0 | 0 | 0.50 | 0.0 | 0.10 | 0.00 | 0.00 | -9 | 36.136680 | -80.669441 |
64538 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Forbush Middle | 50 | 44 | 5 | 5 | 164 | 167 | ... | 0 | 0 | 0.50 | 0.0 | 0.50 | 0.15 | 0.10 | -9 | 36.136680 | -80.669441 |
64539 | NORTH CAROLINA | 3705040 | Yadkin County Schools | Starmount Middle | 35 | 44 | 8 | 5 | 131 | 101 | ... | 0 | 0 | 0.50 | 0.0 | 0.50 | 0.13 | 0.03 | -9 | 36.136680 | -80.669441 |
64540 | NORTH CAROLINA | 3705070 | Yancey County Schools | Bald Creek Elementary | 11 | 8 | 0 | 0 | 74 | 65 | ... | 0 | 0 | 0.10 | 0.0 | 0.50 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64541 | NORTH CAROLINA | 3705070 | Yancey County Schools | Bee Log Elementary | 0 | 0 | 0 | 0 | 26 | 26 | ... | 0 | 0 | 0.10 | 0.0 | 0.10 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64542 | NORTH CAROLINA | 3705070 | Yancey County Schools | Burnsville Elementary | 71 | 44 | 2 | 2 | 128 | 122 | ... | 0 | 0 | 0.20 | 0.0 | 1.00 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64543 | NORTH CAROLINA | 3705070 | Yancey County Schools | Cane River Middle | 8 | 11 | 0 | 2 | 131 | 95 | ... | 0 | 0 | 0.70 | 0.0 | 1.00 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64544 | NORTH CAROLINA | 3705070 | Yancey County Schools | Clearmont Elementary | 2 | 2 | 0 | 0 | 59 | 53 | ... | 0 | 0 | 0.10 | 0.0 | 0.50 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64545 | NORTH CAROLINA | 3705070 | Yancey County Schools | East Yancey Middle | 20 | 29 | 2 | 2 | 122 | 92 | ... | 0 | 0 | 0.70 | 0.0 | 1.00 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64546 | NORTH CAROLINA | 3705070 | Yancey County Schools | Micaville Elementary | 8 | 11 | 2 | 0 | 74 | 71 | ... | 0 | 0 | 0.10 | 0.0 | 0.53 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64547 | NORTH CAROLINA | 3705070 | Yancey County Schools | Mountain Heritage High | 29 | 17 | 2 | 2 | 329 | 311 | ... | 0 | 0 | 1.00 | 1.0 | 1.00 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
64548 | NORTH CAROLINA | 3705070 | Yancey County Schools | South Toe Elementary | 8 | 5 | 0 | 0 | 53 | 41 | ... | 0 | 0 | 0.10 | 0.0 | 0.47 | 0.11 | 0.11 | -9 | 35.922069 | -82.294133 |
2618 rows × 52 columns
# Race and sex codes used as column-name suffixes (e.g. 'SCH_ENR_BL_M').
# NOTE(review): presumably BL = Black, WH = White, HI = Hispanic — confirm against the CRDC data dictionary.
RACES = ['BL', 'WH', 'HI']
SEXES = ['M', 'F']
# Default threshold used by calculate_likelyhood_comparisons when filtering schools by population.
POP_LOWER_BOUND = 20 # Remove populations (e.g. white male) smaller than this threshold
# 1. Plotting
def plot_measure_accross_all_demographics(df, calculation, measure, bounds=(0, 1)):
    """Draw one longitude/latitude scatter per (sex, race) pair, coloured by a computed column.

    Args:
        df: Frame holding 'LON1516', 'LAT1516' and, for every race in RACES and
            sex in SEXES, a column named f'{calculation}_{measure}_{race}_{sex}'.
        calculation: Column-name prefix of the computed value to plot.
        measure: Measure part of the column name.
        bounds: (low, high) colour limits applied to every subplot.
    """
    plt.figure(figsize=(20, 6))
    figure_num = 0
    for sex in SEXES:
        for race in RACES:
            figure_num += 1
            likelyhood = f'{calculation}_{measure}_{race}_{sex}'
            # Mask with the frame that was passed in; the previous code indexed a
            # module-level `data` frame here, which need not match `df`.
            curr_dem_data = df[pd.notnull(df[likelyhood])]
            values = curr_dem_data[likelyhood]
            plt.subplot(len(SEXES), len(RACES), figure_num)
            plt.scatter(x=curr_dem_data['LON1516'], y=curr_dem_data['LAT1516'], c=values, s=1, alpha=1, cmap='coolwarm')
            plt.title(f'{race}_{sex}, avg: {round(values.mean(), 2)}, n: {values.count()}')
            plt.colorbar()
            plt.clim(*bounds)
            plt.axis('off')
    plt.subplots_adjust(wspace=0.8, hspace=0.6)
    plt.show()
# 2. Calculations
# Iterative function which appends likelihood columns to the df for all demographics.
# The optional parameter 'comparison_race' lets you compare how many times as likely the first race
# is to be affected as the second race.
def calculate_likelyhood_comparisons(df, measure, comparison_race=None, races=RACES, sexes=SEXES, lower_bound=POP_LOWER_BOUND):
    """Append one computed column per (sex, race) pair to a population-filtered copy of `df`.

    Args:
        df: School-level frame to analyse.
        measure: Column prefix of the measure (e.g. 'SCH_DISCWODIS_MULTOOS').
        comparison_race: When given, each group's rate is divided by the rate of
            this race (same sex) instead of being reported on its own.
        races: Race codes to iterate over.
        sexes: Sex codes to iterate over.
        lower_bound: Minimum value every DEMOGRAPHIC_COUNT_COLS column must reach
            for a school to be kept.

    Returns:
        The filtered frame with the new columns merged in.
    """
    # Filter the frame that was passed in rather than the module-level `df`.
    df = remove_schools_with_pop_less_than(lower_bound, df)
    for sex in sexes:
        for race in races:
            df = calculate_likelyhood_comparison(df, measure, race, sex, comparison_race, sex)
    return df


def remove_schools_with_pop_less_than(lower_bound, schools_df=None):
    """Keep only rows whose DEMOGRAPHIC_COUNT_COLS values are all >= `lower_bound`.

    Args:
        lower_bound: Minimum allowed value for every demographic count column.
        schools_df: Frame to filter. Falls back to the module-level `df` when
            omitted, which is what the original one-argument form used.

    Returns:
        The surviving rows in their original order with a fresh 0..n-1 index.
    """
    if schools_df is None:
        schools_df = df
    counts = schools_df[DEMOGRAPHIC_COUNT_COLS]
    # A row-wise mask replaces the earlier merge on the count columns, which
    # duplicated rows whenever two schools shared identical counts.
    # Missing counts compare False and are dropped, as before.
    keep = (counts >= lower_bound).all(axis=1)
    return schools_df[keep].reset_index(drop=True)
def calculate_likelyhood_comparison(df, measure, race, sex, comparison_race, comparison_sex):
    """Left-merge one computed rate (or rate ratio) column onto `df` by index.

    Without `comparison_race` the column is 'PERCENT_AFFECTED_{measure}_{race}_{sex}'
    and holds the group's affected/enrolled ratio. With it, the ratio is divided by
    the comparison group's ratio and stored as 'LH_COMPARED_TO_WH_FOR_{measure}_{race}_{sex}'.
    Rows whose value is NaN, +inf or not strictly positive are left as NaN.

    NOTE(review): the ratio column name always says "WH", whatever `comparison_race` is.
    """
    rate = get_percentage_affected(df, measure, race, sex)
    if comparison_race:
        baseline = get_percentage_affected(df, measure, comparison_race, comparison_sex)
        rate = rate / baseline
        new_column = f'LH_COMPARED_TO_WH_FOR_{measure}_{race}_{sex}'
    else:
        new_column = f'PERCENT_AFFECTED_{measure}_{race}_{sex}'

    # Keep only finite, strictly positive values.
    usable = pd.notnull(rate) & (rate != np.inf) & (rate > 0)
    kept = rate[usable].to_frame(new_column)
    return df.merge(kept, how='left', left_index=True, right_index=True)
def get_percentage_affected(df, measure, race, sex):
    """Return, per row, the '{measure}_{race}_{sex}' count divided by 'SCH_ENR_{race}_{sex}'."""
    demographic = f'{race}_{sex}'
    numerator = df[f'{measure}_{demographic}']      # e.g. 'SCH_DISCWODIS_MULTOOS_BL_M'
    denominator = df[f'SCH_ENR_{demographic}']      # e.g. 'SCH_ENR_BL_M'
    return numerator / denominator
# Column prefix of the discipline measure analysed in the rest of the notebook.
MULTY_DAY_SUSPENSION = 'SCH_DISCWODIS_MULTOOS'
# Population-filtered frame with one PERCENT_AFFECTED_* column per (sex, race) pair.
data = calculate_likelyhood_comparisons(df, MULTY_DAY_SUSPENSION) # "more than one out of school suspension"
# Names of the computed columns that the plots below read.
measure = MULTY_DAY_SUSPENSION
race = 'BL'
sex = 'M'
col = f'PERCENT_AFFECTED_{measure}_{race}_{sex}'
race2 = 'BL'
sex2 = 'F'
col2 = f'PERCENT_AFFECTED_{measure}_{race2}_{sex2}'
race3 = 'WH'
sex3 = 'M'
col3_wh_m = f'PERCENT_AFFECTED_{measure}_{race3}_{sex3}'
# Row counts per district, largest first, to see which districts have the most rows.
data.groupby('LEA_NAME').agg('count').sort_values(by=['LEAID'], ascending=False).head(20)
# Districts kept for the per-district plots; the commented-out names are
# further candidates that are currently excluded.
biggest_districts = [
    'Wake County Schools',
    'Charlotte-Mecklenburg Schools',
    'Cumberland County Schools',
    'Guilford County Schools',
    'Winston Salem/Forsyth County Schools',
    # 'Gaston County Schools',
    # 'Cabarrus County Schools',
    # 'Onslow County Schools',
    # 'Johnston County Schools',
    # 'Alamance-Burlington Schools',
    # 'Harnett County Schools',
    # 'Pitt County Schools',
    # 'Union County Public Schools',
    # 'Wayne County Public Schools',
    # 'Rowan-Salisbury Schools',
    # 'Iredell-Statesville Schools',
    # 'New Hanover County Schools',
    # 'Durham Public Schools',
    # 'Franklin County Schools',
    # 'Rockingham County Schools'
]
# Rows of `data` whose district name is in the list above.
biggest_districts_df = data[data['LEA_NAME'].isin(biggest_districts)]
Pros: Clearly shows basic stats, illuminating large number of high suspension extremes/outliers.
Cons: Unclear if outliers are significant compared to overall distribution density.
# Box plot of the per-school rate in `col`. The series is passed as `x=` so it is
# the x variable regardless of how the installed seaborn treats positional arguments.
sns.boxplot(x=data[col].dropna())
<matplotlib.axes._subplots.AxesSubplot at 0x12efe0eb8>
Pros: Clearly shows distributions, and where majority of schools fall.
Cons: Unweighted by population, the plot fails to show how many students receive suspensions, rather than just where schools of varying populations fall. Also, doesn't give a clear picture of the outliers.
# Distribution plot of the per-school rate stored in `col`.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the installed version.
sns.distplot(data[col])
<matplotlib.axes._subplots.AxesSubplot at 0x12d7955f8>
Pros: Shows every school in data sets as a distinct hair.
Cons: Difficult to differentiate schools in dense regions, still unclear the pop of each school.
from matplotlib import colors as mcolors  # NOTE(review): `mcolors` is not referenced in the visible cells
# Rug plot of the per-school rate in `col`, with the tick height set to 1.
sns.rugplot(data[col], height=1)
<matplotlib.axes._subplots.AxesSubplot at 0x1323cff60>
Pros: ?
Cons: Just a redundant version of the curve approximating the histogram distribution. Histogram is preferable.
# Violin plot of the per-school rate in `col`. The series is passed as `x=` so it is
# the x variable regardless of how the installed seaborn treats positional arguments.
sns.violinplot(x=data[col])
<matplotlib.axes._subplots.AxesSubplot at 0x12b688cf8>
Pros: Quick, smoothed, high-level comparison of distributions and mean.
Cons: Redundant across y-axis.
# One violin per district in biggest_districts_df, showing the rate in `col`.
ax = sns.violinplot(x="LEA_NAME", y=col, data=biggest_districts_df, inner="quart")
# Rotate the district names so the long labels stay readable.
for label in ax.get_xticklabels():
    label.set_rotation(80)
Pros: invites more curiosity on specific data groups -- ie who are the "spear-head" schools in the extremes?
# Same per-district violins as the previous cell, this time passing bw=0.1.
ax = sns.violinplot(x="LEA_NAME", y=col, data=biggest_districts_df, inner="quart", bw=0.1)
# Rotate the district names so the long labels stay readable.
for label in ax.get_xticklabels():
    label.set_rotation(80)
Pros: ... Cons: This is just an inferior version of the violin plots, as it conveys just summary stats, and nothing about the distribution.
# Switch seaborn to the "ticks" style with the pastel palette for the following plots.
sns.set(style="ticks", palette="pastel")
# One box per district; hue repeats the x variable, so each district gets its own colour.
ax = sns.boxplot(x="LEA_NAME", y=col,
                 hue="LEA_NAME",
                 data=biggest_districts_df)
sns.despine(offset=10, trim=True)
# Rotate the district names so the long labels stay readable.
for label in ax.get_xticklabels():
    label.set_rotation(80)
# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
<matplotlib.legend.Legend at 0x12f1e30f0>
Pros: Shows large density of easily comparable information via Tufte's "small multiples", grouped by school.
Cons: We've split each school's data across 3 variables, and we have no way of differentiating each school from another. Interactivity could help here
# Dot plot of three demographic rates per district, one column of axes per rate.
# (The leftover `sns.load_dataset("car_crashes")` call was removed: its result was
# never used here and it requires fetching an example dataset.)

# Make the PairGrid
g = sns.PairGrid(biggest_districts_df,
                 x_vars=[col, col3_wh_m, col2], y_vars=['LEA_NAME'],
                 height=10, aspect=.25)
# Draw a dot plot using the stripplot function
g.map(sns.stripplot, size=10, orient="h",
      palette="ch:s=1,r=-.1,h=1_r", linewidth=1, edgecolor="w")
# Use the same x axis limits on all columns and add better labels
# NOTE(review): the plotted columns are affected/enrolled ratios, not values scaled to 0-100.
g.set(xlim=(0, 0.2), xlabel="% pop affected")
# Use semantically meaningful titles for the columns (same order as x_vars)
titles = ["Black Males", "White Males", "Black Females"]
for ax, title in zip(g.axes.flat, titles):
    # Set a different title for each axes
    ax.set(title=title)
    # Show horizontal grid lines only
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
sns.despine(left=True, bottom=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-06cd1ddc6675> in <module> 1 # Load the dataset ----> 2 crashes = sns.load_dataset("car_crashes") 3 4 # Make the PairGrid 5 g = sns.PairGrid(biggest_districts_df, NameError: name 'sns' is not defined
Pros: descriptive stats typical of boxplot, with a little more window into where each school sits.
Cons: a hybrid of violin/boxplot, would work best if the swarmplots communicated something useful, like school pop size via size, or some factor via their color.
sns.set(style="ticks")
# Initialize the figure (the logarithmic x axis is left disabled)
f, ax = plt.subplots(figsize=(7, 6))
# ax.set_xscale("log")
# (The leftover `sns.load_dataset("planets")` call was removed: its result was
# never used here and it requires fetching an example dataset.)
# Plot the rate in `col` with one horizontal box per district
sns.boxplot(x=col, y="LEA_NAME", data=biggest_districts_df,
            whis="range", palette="vlag")
# Add in points to show each observation
sns.swarmplot(x=col, y="LEA_NAME", data=biggest_districts_df,
              size=2, color=".3", linewidth=0)
# Tweak the visual presentation
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)
Pros: shows the distribution of white and black males.
Cons: This is the improper use of this sort of graph. Density/overlap isn't meaningful.
sns.set(style="ticks")
# Hex-binned joint plot of the `col` rate (x) against the `col3_wh_m` rate (y).
# x and y are passed by keyword so the call does not depend on positional-argument
# handling in the installed seaborn version.
sns.jointplot(x=data[col], y=data[col3_wh_m], kind="hex", color="#4CB391")
<seaborn.axisgrid.JointGrid at 0x12f2e82e8>
Pros: Shows every single school in all districts, with trend lines, as well as expanding uncertainty bars. Shows a clear skew towards black male higher suspension rates, but also some outliers where white male suspension is much higher.
Cons: Too cluttered, difficult to read.
# sns.set()
# Plot the `col3_wh_m` rate as a function of the `col` rate, one regression per district.
# Rows missing either rate are dropped first.
data_cleaned = biggest_districts_df.dropna(subset=[col, col3_wh_m])
g = sns.lmplot(x=col, y=col3_wh_m, hue="LEA_NAME",
               truncate=True, height=5, data=data_cleaned)
# Use more informative axis labels than are provided by default
g.set_axis_labels("Black male suspensions", "White Male Suspensions")
# Use the same limits on both axes
g.set(xlim=[0, 0.3], ylim=[0, 0.3])
# plt.axis('equal')
<seaborn.axisgrid.FacetGrid at 0x13035aba8>
# Long-format frames: one row per (district, rate column, value).
# Only rows missing one of the two melted rates are dropped; a bare dropna()
# would also discard schools with a gap in any unrelated column.
melted_df = pd.melt(biggest_districts_df.dropna(subset=[col, col3_wh_m]), id_vars=['LEA_NAME'], value_vars=[col, col3_wh_m])
melted_df
melted_df_all_schools = pd.melt(data.dropna(subset=[col, col3_wh_m]), id_vars=['LEA_NAME'], value_vars=[col, col3_wh_m])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-c8bb940ba309> in <module> ----> 1 melted_df = pd.melt(biggest_districts_df.dropna(), id_vars=['LEA_NAME'], value_vars=[col, col3_wh_m]) 2 melted_df 3 4 # melted_df_all_schools = pd.melt(data.dropna(subset=[col, col3_wh_m]), id_vars=['LEA_NAME'], value_vars=[col, col3_wh_m]) 5 NameError: name 'pd' is not defined
Pros: Clearly communicates contrasting means between white and black male students, with a secondary semi-transparent layer of individual schools communicating distribution. Allows comparison within and across schools, and gives a big picture view.
Cons: This plot should really be sorted from highest to lowest average (I just can't quite figure out how to... :P)
sns.set(rc={'figure.figsize':(11.7,20)})
# Initialize the figure
f, ax = plt.subplots()
sns.despine(bottom=True, left=True)
# Show each observation with a scatterplot
sns.stripplot(x="value", y="LEA_NAME", hue="variable",
              data=melted_df_all_schools, dodge=True, jitter=False,
              alpha=.4, zorder=1)
# Show the means
sns.pointplot(x="value", y="LEA_NAME", hue="variable",
              data=melted_df_all_schools, dodge=.532, join=False,
              palette="dark", markers="d", scale=.75, ci=None)
# Build a single titled legend and place it outside the figure. A separate
# plt.legend(...) call afterwards would replace this legend and drop its title.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title="Suspensions",
          handletextpad=0,
          frameon=True,
          bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x132a813c8>
Pros: displays and facilitates comparison between black and white male suspension rates. Notably, it's clear how white suspension rates nearly entirely distribute at the bottom of the plot.
Cons: Currently, the average/std could be better displayed to enable comparison.
sns.set(style="whitegrid", rc={'figure.figsize': (10, 7)})
# Split violins per district: the two halves are the two rate columns held in
# melted_df's "variable" column.
ax = sns.violinplot(x="LEA_NAME", y="value", hue="variable",
                    data=melted_df, palette="Set2", split=True,
                    scale="count", inner="quart",
                    scale_hue=False, bw=.2)
# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
# Tilt the district names slightly so they do not overlap.
for label in ax.get_xticklabels():
    label.set_rotation(20)
Pros: Comparison and distribution
Cons: It's unclear what the plot is attempting to show. Other plots in this assignment much more effectively facilitate comparison.
# Use JointGrid directly to draw a custom plot
grid = sns.JointGrid(biggest_districts_df[col], biggest_districts_df[col3_wh_m], space=0, height=6, ratio=50)
grid.plot_joint(plt.scatter, color="g")
grid.plot_marginals(sns.rugplot, height=1, color="g")
<seaborn.axisgrid.JointGrid at 0x132ee6400>
Pros: Gives quick, clear high-level comparison of suspension rates, as well as range of values.
Cons: You lose all finer grain details with individual schools, and again this isn't weighted by student population, which may change view.
# Set up a grid to plot the melted "value" column against district and against rate column
g = sns.PairGrid(melted_df, y_vars="value",
                 x_vars=["LEA_NAME", "variable"],
                 height=5, aspect=.5)
# Draw a seaborn pointplot onto each Axes
g.map(sns.pointplot, scale=1, errwidth=4, color="xkcd:plum")
g.set(ylabel="Average Percent Suspended")
sns.despine(fig=g.fig, left=True)
# Rotate the tick labels on every axes so the long names stay readable.
for ax in g.axes.flat:
    for label in ax.get_xticklabels():
        label.set_rotation(70)