import pandas as pd
import numpy as np
# Let pandas display up to 500 rows of a DataFrame before truncating the output.
pd.set_option('display.max_rows', 500)
The data for this analysis comes from the American Community Survey's 2012 5-year data, via American Fact Finder. Specifically, it comes from tables DP03 (employment), DP04 (housing), DP05 (race), and S1701 (poverty).
def read_acs_file(path, coldict):
    """Load one ACS table export and index it by geography.

    The first line of the file is skipped, so the second line supplies the
    column headers. Cells containing "N" or "(X)" are read as missing values.

    Parameters
    ----------
    path : file path or buffer, passed straight to ``pandas.read_csv``.
    coldict : dict mapping original column headers to the names to use.

    Returns
    -------
    pandas.DataFrame with columns renamed per ``coldict`` and a
    ("Geography", "Id") MultiIndex.
    """
    raw_table = pd.read_csv(
        path,
        skiprows=1,
        na_values=["N", "(X)"],
        low_memory=False,
    )
    renamed_table = raw_table.rename(columns=coldict)
    return renamed_table.set_index(["Geography", "Id"])
# Each dict maps the verbose column headers of one ACS table to the short
# names used throughout the analysis. Only the columns listed here are kept
# when the tables are joined.

# DP03 (employment): civilian labor force and the employed part of it.
dp03_cols = {
    "Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force": "n_labor_force",
    "Estimate; EMPLOYMENT STATUS - In labor force - Civilian labor force - Employed": "n_employed",
}

# DP04 (housing): unit counts overall and by tenure.
dp04_cols = {
    "Estimate; HOUSING OCCUPANCY - Total housing units": "n_housing_units",
    "Estimate; HOUSING TENURE - Occupied housing units": "n_occupied_units",
    "Estimate; HOUSING TENURE - Owner-occupied": "n_owner_occupied_units",
    "Estimate; HOUSING TENURE - Renter-occupied": "n_renter_occupied_units",
}

# DP05 (race): total population plus the single-race white and black counts.
# NOTE(review): the header mapped to "white_pop" carries no Hispanic-origin
# qualifier -- confirm it is the population the analysis means to measure.
dp05_cols = {
    "Estimate; SEX AND AGE - Total population": "total_pop",
    "Estimate; RACE - One race - White": "white_pop",
    "Estimate; RACE - One race - Black or African American": "black_pop",
}

# S1701 (poverty): the poverty universe and the part of it below the poverty level.
s1701_cols = {
    "Total; Estimate; Population for whom poverty status is determined": "n_poverty_universe",
    "Below poverty level; Estimate; Population for whom poverty status is determined": "n_below_poverty_level",
}
# Load the four ACS tables; read_acs_file indexes each one by
# ("Geography", "Id"), which is what the joins below align on.
dp03 = read_acs_file("../data/all-tracts/ACS_12_5YR_DP03_with_ann.csv", dp03_cols)
dp04 = read_acs_file("../data/all-tracts/ACS_12_5YR_DP04_with_ann.csv", dp04_cols)
dp05 = read_acs_file("../data/all-tracts/ACS_12_5YR_DP05_with_ann.csv", dp05_cols)
s1701 = read_acs_file("../data/all-tracts/ACS_12_5YR_S1701_with_ann.csv", s1701_cols)

# Keep only the renamed columns of each table and combine them into a single
# frame. list(...) is required because a dict view is not accepted as a list
# of column labels on Python 3. The expression is parenthesised rather than
# backslash-continued: a trailing backslash after the last join would splice
# the next statement onto this one.
joined = (
    dp05[list(dp05_cols.values())]
    .join(dp04[list(dp04_cols.values())])
    .join(dp03[list(dp03_cols.values())])
    .join(s1701[list(s1701_cols.values())])
)

# Drop the first comma-separated component of every Geography label and keep
# the remainder, which is the value compared against
# "St. Louis County, Missouri" below.
joined["county"] = [
    ", ".join(geography.split(", ")[1:])
    for geography in joined.index.get_level_values("Geography")
]

# Rows belonging to St. Louis County. A boolean mask with .loc replaces the
# removed .ix indexer.
is_stl_county = joined["county"] == "St. Louis County, Missouri"
stl_county_ids = joined.index[is_stl_county].values
stl = joined.loc[is_stl_county]
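For this analysis, we're using the index of dissimilarity to measure segregation. For a set of tracts, the index is half the sum, over all tracts, of the absolute difference between the tract's share of the area's black population and its share of the area's white population; it ranges from 0 (both groups are spread across tracts in the same proportions) to 1 (no tract contains members of both groups). Because we're particularly interested in segregation between black residents (people who identify as "Black or African American" as their only race) and white residents (people who identify only as non-Hispanic "White"), our calculations focus on just those residents.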
def get_bwdi(tracts):
    """Return the black-white index of dissimilarity for ``tracts``.

    Each row's "black_pop" and "white_pop" are converted to that row's share
    of the respective column total; the result is half the sum of the
    absolute differences between the two shares.

    Shares that come out infinite or NaN (for example when a column total is
    zero) are treated as 0.
    """
    def share_of_total(column):
        # Per-row fraction of the column total, with undefined values zeroed.
        counts = tracts[column]
        shares = counts / counts.sum()
        return shares.replace([np.inf, -np.inf], np.nan).fillna(0)

    black_share = share_of_total("black_pop")
    white_share = share_of_total("white_pop")
    return (black_share - white_share).abs().sum() / 2
def get_grouped_bwdi(tracts, grouper):
    """Summarise dissimilarity and population for each group of tracts.

    Parameters
    ----------
    tracts : DataFrame with "total_pop", "black_pop" and "white_pop" columns.
    grouper : anything accepted by ``DataFrame.groupby``.

    Returns
    -------
    DataFrame indexed by group with columns:
      "bwdi"        -- get_bwdi applied to the group's tracts
      "pop"         -- sum of "total_pop"
      "black_pop"   -- sum of "black_pop"
      "p_black_pop" -- "black_pop" divided by "pop"
    """
    by_group = tracts.groupby(grouper)
    group_total = by_group["total_pop"].sum()
    group_black = by_group["black_pop"].sum()
    return pd.DataFrame({
        "bwdi": by_group.apply(get_bwdi),
        "pop": group_total,
        "black_pop": group_black,
        "p_black_pop": group_black * 1.0 / group_total,
    })
# One row per value of the "county" column: dissimilarity index plus population totals.
dissim = get_grouped_bwdi(tracts=joined, grouper=joined["county"])
We also want to ignore counties with small populations (less than 20,000 people), or relatively small proportions of black residents (less than 20% black).
# Keep only the rows that clear both cut-offs: at least 20% black and at
# least 20,000 residents.
large_black_populations = dissim[
    (dissim["p_black_pop"] >= 0.2) & (dissim["pop"] >= 20000)
]
Given these parameters, we can construct a ranking, by black–white dissimilarity index, of the most segregated counties in the country:
# Rank 1 is the highest dissimilarity index. assign() builds a new frame
# instead of writing a column into a filtered slice of another frame, which
# is what triggers pandas' SettingWithCopyWarning.
large_black_populations = large_black_populations.assign(
    rank=large_black_populations["bwdi"].rank(ascending=False)
)
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
large_black_populations.sort_values("rank")[["rank", "bwdi", "pop", "p_black_pop"]]
def get_p_tracts_within_x_of_average(df, band=0.1):
    """Return the fraction of rows whose black share is near the overall share.

    The overall share is sum("black_pop") / sum("total_pop") over all rows.
    A row counts as "near" when the absolute difference between its own
    black_pop / total_pop and the overall share is at most ``band``.

    Every row counts toward the denominator, including rows whose own share
    is undefined; those never satisfy the comparison.
    """
    overall_share = df["black_pop"].sum() * 1. / df["total_pop"].sum()
    row_share = df["black_pop"] * 1. / df["total_pop"]
    is_near_average = (overall_share - row_share).abs() <= band
    return len(df[is_near_average]) * 1.0 / len(df)
Proportion of St. Louis County residents that are black:
# Black share of the whole `stl` population, rounded to three decimals.
round(stl["black_pop"].sum() / float(stl["total_pop"].sum()), 3)
Proportion of St. Louis County tracts that fall within 10 percentage points of that average:
# The band is passed by keyword so the 10-point window is explicit at the call site.
round(get_p_tracts_within_x_of_average(stl, band=0.1), 3)
def get_p_black_pop_in_segregated(tracts, threshold=0.8):
    """Return the share of the black population living in high-share rows.

    A row qualifies when its black_pop / total_pop is at least ``threshold``.
    The result is the sum of "black_pop" over qualifying rows divided by the
    sum of "black_pop" over all rows.
    """
    black_counts = tracts["black_pop"]
    row_share = black_counts * 1.0 / tracts["total_pop"]
    qualifying_black_total = black_counts[row_share >= threshold].sum()
    return qualifying_black_total * 1.0 / black_counts.sum()
Proportion of black population in St. Louis County that lives in tracts that are at least 80% black:
# The 0.8 threshold (the function's default) is spelled out at the call site.
round(get_p_black_pop_in_segregated(stl, threshold=0.8), 3)
Proportion of black population, nationwide, that lives in tracts that are at least 80% black:
# Same measure over every row of `joined`, with the 0.8 default written out.
round(get_p_black_pop_in_segregated(joined, threshold=0.8), 3)
def get_demography(df, grouper):
    """Summarise housing tenure, unemployment and poverty for each group.

    Parameters
    ----------
    df : DataFrame with the columns "n_occupied_units",
        "n_owner_occupied_units", "n_renter_occupied_units", "n_employed",
        "n_labor_force", "n_below_poverty_level" and "n_poverty_universe".
    grouper : anything accepted by ``DataFrame.groupby``.

    Returns
    -------
    DataFrame indexed by group with columns:
      "n_occupied_units"  -- sum of occupied units
      "p_owner_occupied"  -- owner-occupied units / occupied units
      "p_renter_occupied" -- renter-occupied units / occupied units
      "p_unemployed"      -- 1 - employed / labor force
      "p_poverty"         -- below poverty level / poverty universe
    """
    grouped = df.groupby(grouper)
    occupied_units = grouped["n_occupied_units"].sum()
    return pd.DataFrame({
        "n_occupied_units": occupied_units,
        "p_owner_occupied":
            grouped["n_owner_occupied_units"].sum() * 1.0 / occupied_units,
        # The numerator must be the per-group sum. Using the whole-frame
        # total here divided every group's occupied units into the same
        # overall renter count.
        "p_renter_occupied":
            grouped["n_renter_occupied_units"].sum() * 1.0 / occupied_units,
        "p_unemployed":
            1 - (grouped["n_employed"].sum() * 1.0 / grouped["n_labor_force"].sum()),
        "p_poverty":
            grouped["n_below_poverty_level"].sum() * 1.0
            / grouped["n_poverty_universe"].sum(),
    })
Comparing several economic indicators between tracts that are at least 80% black ("True") and other tracts ("False"):
# Group the `stl` rows by whether black_pop / total_pop is at least 0.8
# (True) or not (False).
demography = get_demography(stl, (stl["black_pop"] * 1.0 / stl["total_pop"]) >= 0.8)
# DataFrame.round does the same three-decimal rounding as the element-wise
# applymap(round) call it replaces; applymap is deprecated in current pandas.
demography.round(3)
import matplotlib as mpl
import mplstyle
import mplstyle.styles.simple
# IPython magic (not plain Python): selects matplotlib's inline backend for the notebook.
%matplotlib inline
# Apply the "simple" style shipped with mplstyle, then set the figure size.
# NOTE(review): mplstyle is a third-party package; presumably set() updates
# matplotlib's rcParams -- confirm against its documentation.
mplstyle.set(mplstyle.styles.simple)
mplstyle.set({
"figure.figsize": (8, 6)
})
# X-axis labels that make_chart passes to style_axis: first bar, then second bar.
x_tick_labels = [ "Highly Segregated Tracts", "Other Tracts" ]
def style_axis(ax, labels):
    """Apply the shared bar-chart styling to ``ax``.

    Turns off the x-axis grid, puts one tick per entry of ``labels`` at
    positions 0..len(labels)-1, shows y values as whole-number percentages
    (the value multiplied by 100), and tightens the layout of the figure
    that owns ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to style (modified in place).
    labels : sequence of x tick labels, in bar order.
    """
    # Imported locally so the function does not depend on the submodule
    # already having been loaded as an attribute of the `matplotlib` package.
    from matplotlib import ticker

    ax.xaxis.grid(False)
    ax.set_xticks(list(range(len(labels))))
    ax.set_xticklabels(labels, rotation=0, fontsize="xx-large")
    # A formatter computes each label from the tick's actual value at draw
    # time. Freezing a list of label strings taken from the current ticks
    # can leave labels attached to the wrong positions if the ticks are
    # recomputed later (for example after the layout changes).
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda value, _pos: "{0:.0f}%".format(value * 100))
    )
    # Tighten the figure this axes belongs to rather than whichever figure
    # happens to be current.
    ax.figure.tight_layout()
def make_chart(variable, title):
    """Draw a two-bar chart of ``variable[True]`` against ``variable[False]``.

    Parameters
    ----------
    variable : mapping or Series indexable by ``True`` and ``False``; the
        ``True`` value is drawn first, matching the order of x_tick_labels.
    title : chart title.

    Returns
    -------
    The matplotlib Axes the chart was drawn on.
    """
    # Draw on an explicitly created figure/axes. Calling pyplot.bar() and
    # then pyplot.axes() relies on axes() handing back the axes that already
    # holds the bars, which current matplotlib does not do: it adds a new,
    # empty axes. A fresh figure per call also keeps successive charts from
    # piling up on one figure.
    _fig, ax = mpl.pyplot.subplots()
    # Bar positions are passed positionally: the keyword for them was
    # renamed from `left` to `x`, and the positional form works with both.
    ax.bar(
        [0, 1],
        (variable[True], variable[False]),
        align="center",
        width=0.6,
        color="red",
        alpha=0.8,
    )
    style_axis(ax, x_tick_labels)
    ax.set_title(title, fontsize="xx-large")
    return ax
# One chart per indicator from the `demography` table.
ax = make_chart(demography["p_poverty"], "Population Below Poverty Level (St. Louis County, 2012)\n")
ax = make_chart(demography["p_unemployed"], "Percent Unemployment (St. Louis County, 2012)\n")
# The closing parenthesis in this title was doubled.
ax = make_chart(demography["p_owner_occupied"], "Households Owning Home (St. Louis County, 2012)\n")
# Draw on an explicitly created axes: pyplot.axes() after pyplot.bar() adds a
# new, empty axes in current matplotlib instead of returning the one that
# holds the bars.
ax = mpl.pyplot.subplots()[1]
# Positions are positional because the `left` keyword was renamed to `x`.
# Bar order matches the labels below: `joined` first, then `stl`.
ax.bar(
    [0, 1],
    (get_p_black_pop_in_segregated(joined), get_p_black_pop_in_segregated(stl)),
    align="center",
    width=0.6,
    color="red",
    alpha=0.8,
)
style_axis(ax, [ "Nationwide", "St. Louis County" ])
ax.set_title("Percentage of Black Residents Who Live In \nCensus Tracts That Are At Least 80% Black\n", fontsize="xx-large")
# Presumably here so the notebook cell does not echo set_title's return value.
pass