This notebook contains analysis related to a paper on marriage patterns in the U.S., based on data from the National Survey of Family Growth (NSFG).
It is based on Chapter 13 of Think Stats, 2nd Edition, by Allen Downey, available from thinkstats2.com
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import math
import shelve
import matplotlib.pyplot as plt
from matplotlib import pylab
from scipy.interpolate import interp1d
from scipy.misc import derivative
import thinkstats2
import thinkplot
from thinkstats2 import Cdf
import survival
import marriage
%time df = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')
df.shape
CPU times: user 47.7 ms, sys: 194 ms, total: 242 ms Wall time: 1.42 s
(70183, 42)
Make a table showing the number of respondents in each cycle:
df.cycle.value_counts().sort_index()
cycle 3 7969 4 8450 5 10847 6 7643 7 12279 8 5601 9 5699 10 5554 11 6141 Name: count, dtype: int64
SAVE_PRED = False
if SAVE_PRED:
df = df[df['cycle'] < 10]
df.cycle.value_counts().sort_index()
cycle 3 7969 4 8450 5 10847 6 7643 7 12279 8 5601 9 5699 10 5554 11 6141 Name: count, dtype: int64
70183 - 5554 - 6141
58488
def format_date_range(array):
a, b = array.astype(int)
return '%d--%d' % (a, b)
def SummarizeCycle(cycle, df):
ages = df.age.min(), df.age.max()
ages= np.array(ages)
intvws = df.cmintvw.min(), df.cmintvw.max()
intvws = np.array(intvws) / 12 + 1900
births = df.cmbirth.min(), df.cmbirth.max()
births = np.array(births) / 12 + 1900
intvw_dates = format_date_range(intvws)
birth_dates = format_date_range(births)
print(cycle, ' & ', intvw_dates, '&', len(df), '&', birth_dates, r'\\')
for cycle, group in df.groupby('cycle'):
SummarizeCycle(cycle, group)
3 & 1982--1983 & 7969 & 1937--1968 \\ 4 & 1988--1988 & 8450 & 1943--1973 \\ 5 & 1995--1995 & 10847 & 1950--1980 \\ 6 & 2002--2003 & 7643 & 1957--1988 \\ 7 & 2006--2010 & 12279 & 1961--1995 \\ 8 & 2011--2013 & 5601 & 1966--1998 \\ 9 & 2013--2015 & 5699 & 1968--2000 \\ 10 & 2015--2017 & 5554 & 1966--2002 \\ 11 & 2017--2019 & 6141 & 1968--2004 \\
Check for missing values in agemarry
:
def CheckAgeVars(df):
print(sum(df[df.evrmarry].agemarry.isnull()))
for cycle, group in df.groupby('cycle'):
CheckAgeVars(group)
0 22 0 37 16 17 11 0 0
Generate a table with the number of respondents in each cohort:
marriage.DigitizeResp(df)
grouped = df.groupby('birth_index')
for name, group in iter(grouped):
age_range = '%d--%d' % (int(group.age.min()), int(group.age_index.max()))
print(name, '&', len(group), '&', age_range,
'&', len(group[group.evrmarry]), '&', sum(group.missing), r'\\')
30 & 325 & 42--45 & 310 & 0 \\ 40 & 3608 & 32--45 & 3287 & 9 \\ 50 & 10631 & 22--45 & 8667 & 18 \\ 60 & 15064 & 15--50 & 8902 & 33 \\ 70 & 16466 & 14--49 & 9185 & 30 \\ 80 & 14318 & 14--39 & 5331 & 13 \\ 90 & 8539 & 15--29 & 915 & 0 \\ 100 & 1232 & 15--19 & 2 & 0 \\
def ComputeCutoffs(df):
grouped = df.groupby('birth_index')
cutoffs = {}
for name, group in sorted(grouped):
cutoffs[name] = int(group.age.max())
return cutoffs
cutoffs = ComputeCutoffs(df)
cutoffs
{30: 45, 40: 45, 50: 45, 60: 50, 70: 49, 80: 39, 90: 29, 100: 19}
Estimate the hazard function for the 80s cohort (curious to see what's going on during the "marriage strike")
cohort = grouped.get_group(80)
missing = (cohort.evrmarry & cohort.agemarry.isnull())
cohort = cohort[~missing]
complete = cohort[cohort.evrmarry].agemarry_index
ongoing = cohort[~cohort.evrmarry].age_index
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
10 14305 1 0 7e-05 11 14304 1 0 7e-05 12 14303 2 0 0.00014 13 14301 12 0 0.00084 14 14289 16 14 0.0011 15 14259 60 260 0.0042 16 13939 131 242 0.0094 17 13566 183 338 0.013 18 13045 412 460 0.032 19 12173 514 569 0.042 20 11090 511 579 0.046 21 10000 505 572 0.051 22 8923 556 572 0.062 23 7795 491 454 0.063 24 6850 450 533 0.066 25 5867 398 576 0.068 26 4893 307 554 0.063 27 4032 243 506 0.06 28 3283 185 435 0.056 29 2663 122 461 0.046 30 2080 84 401 0.04 31 1595 67 360 0.042 32 1168 31 297 0.027 33 840 18 228 0.021 34 594 9 176 0.015 35 409 4 143 0.0098 36 262 2 120 0.0076 37 140 2 74 0.014 38 64 1 49 0.016 39 14 0 14 0
Run the same analysis for the 70s cohort (to extract $\lambda(33)$).
cohort = grouped.get_group(70)
missing = (cohort.evrmarry & cohort.agemarry.isnull())
cohort = cohort[~missing]
complete = cohort[cohort.evrmarry].agemarry_index
ongoing = cohort[~cohort.evrmarry].age_index
hf = survival.EstimateHazardFunction(complete, ongoing, verbose=True)
10 16436 2 0 0.00012 11 16434 1 0 6.1e-05 12 16433 8 0 0.00049 13 16425 13 0 0.00079 14 16412 46 3 0.0028 15 16363 101 388 0.0062 16 15874 222 518 0.014 17 15134 323 540 0.021 18 14271 636 340 0.045 19 13295 708 268 0.053 20 12319 706 242 0.057 21 11371 755 220 0.066 22 10396 761 264 0.073 23 9371 735 347 0.078 24 8289 675 345 0.081 25 7269 595 205 0.082 26 6469 534 149 0.083 27 5786 415 164 0.072 28 5207 419 198 0.08 29 4590 348 215 0.076 30 4027 278 315 0.069 31 3434 202 278 0.059 32 2954 175 212 0.059 33 2567 114 155 0.044 34 2298 103 173 0.045 35 2022 79 216 0.039 36 1727 52 203 0.03 37 1472 41 186 0.028 38 1245 27 179 0.022 39 1039 22 161 0.021 40 856 17 148 0.02 41 691 15 141 0.022 42 535 9 122 0.017 43 404 6 114 0.015 44 284 7 87 0.025 45 190 3 56 0.016 46 131 1 47 0.0076 47 83 1 51 0.012 48 31 0 24 0 49 7 0 7 0
Use the 30s cohort to demonstrate the simple way to do survival analysis, by computing the survival function directly.
cohort = grouped.get_group(30)
sf = survival.MakeSurvivalFromSeq(cohort.agemarry_index.fillna(np.inf))
ts, ss = sf.Render()
print(ss)
thinkplot.Plot(ts, ss)
thinkplot.Config(xlim=[12, 42])
plt.ylabel('Survival Function')
[ 9.93846154e-01 9.69230769e-01 9.47692308e-01 9.01538462e-01 8.18461538e-01 6.86153846e-01 5.60000000e-01 4.64615385e-01 3.63076923e-01 2.95384615e-01 2.61538462e-01 2.33846154e-01 2.12307692e-01 1.75384615e-01 1.47692308e-01 1.32307692e-01 1.10769231e-01 1.01538462e-01 8.92307692e-02 7.69230769e-02 7.38461538e-02 6.46153846e-02 6.15384615e-02 5.84615385e-02 5.53846154e-02 4.92307692e-02 4.61538462e-02 -2.22044605e-16]
Text(0, 0.5, 'Survival Function')
Then use the SurvivalFunction to compute the HazardFunction:
hf = sf.MakeHazardFunction()
ts, lams = hf.Render()
print(lams)
thinkplot.Plot(ts, lams)
thinkplot.Config(xlim=[12, 42])
plt.ylabel('Hazard Function')
plt.xlabel('Age (years)')
[0.00615385 0.0247678 0.02222222 0.0487013 0.09215017 0.16165414 0.1838565 0.17032967 0.21854305 0.18644068 0.11458333 0.10588235 0.09210526 0.17391304 0.15789474 0.10416667 0.1627907 0.08333333 0.12121212 0.13793103 0.04 0.125 0.04761905 0.05 0.05263158 0.11111111 0.0625 1. ]
Text(0.5, 0, 'Age (years)')
Make the first figure, showing sf and hf for the 30s cohort:
options = dict(formats=['pdf', 'png'], clf=False)
thinkplot.PrePlot(rows=2)
thinkplot.Plot(sf, label='survival')
thinkplot.Config(xlim=[13, 41], ylim=[0, 1.05])
plt.ylabel('Survival Function')
thinkplot.SubPlot(2)
thinkplot.Plot(hf, label='hazard')
thinkplot.Config(xlabel='age(years)', ylabel='Hazard function', xlim=[13, 41])
plt.ylabel('Hazard Function')
plt.xlabel('Age (years)')
thinkplot.Save(root='figs/marriage1', **options)
Writing figs/marriage1.pdf Writing figs/marriage1.png
thinkplot.Plot(sf, label='30s')
thinkplot.Config(xlim=[13, 41], ylim=[0, 1.05])
plt.xlabel('Age (years)', fontsize=14)
plt.ylabel('Survival function', fontsize=14)
thinkplot.Save(root='figs/marriage2', **options)
Writing figs/marriage2.pdf Writing figs/marriage2.png
thinkplot.Plot(hf, label='30s')
thinkplot.Config(xlim=[13, 41])
plt.xlabel('Age (years)', fontsize=14)
plt.ylabel('Hazard function', fontsize=14)
thinkplot.Save(root='figs/marriage3', **options)
Writing figs/marriage3.pdf Writing figs/marriage3.png
Make some pivot tables, just to see where the data are:
pt = df.pivot_table(index='birth_index', columns='age_index', values='age', aggfunc=len, fill_value=0)
pt
age_index | 14.0 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 | 20.0 | 21.0 | 22.0 | 23.0 | ... | 41.0 | 42.0 | 43.0 | 44.0 | 45.0 | 46.0 | 47.0 | 48.0 | 49.0 | 50.0 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
birth_index | |||||||||||||||||||||
30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 19 | 146 | 148 | 12 | 0 | 0 | 0 | 0 | 0 |
40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 414 | 384 | 194 | 198 | 25 | 0 | 0 | 0 | 0 | 0 |
50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 42 | 301 | ... | 371 | 482 | 633 | 533 | 38 | 0 | 0 | 0 | 0 | 0 |
60 | 0 | 269 | 300 | 370 | 665 | 722 | 609 | 513 | 520 | 242 | ... | 562 | 537 | 512 | 517 | 33 | 18 | 83 | 173 | 228 | 4 |
70 | 3 | 389 | 523 | 549 | 362 | 293 | 283 | 285 | 365 | 572 | ... | 581 | 491 | 483 | 383 | 258 | 272 | 202 | 122 | 48 | 0 |
80 | 14 | 260 | 242 | 340 | 483 | 589 | 658 | 683 | 705 | 602 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
90 | 0 | 749 | 829 | 895 | 909 | 873 | 583 | 652 | 635 | 566 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
100 | 0 | 347 | 368 | 286 | 171 | 60 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 rows × 37 columns
The following pivot table is not as helpful as it could be, since it doesn't show the number at risk.
df.pivot_table(index='birth_index', columns='agemarry_index', values='age', aggfunc=len, fill_value=0)
agemarry_index | 10.0 | 11.0 | 12.0 | 13.0 | 14.0 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 | ... | 38.0 | 39.0 | 40.0 | 41.0 | 42.0 | 43.0 | 44.0 | 45.0 | 46.0 | 47.0 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
birth_index | |||||||||||||||||||||
30 | 0 | 0 | 0 | 2 | 8 | 7 | 15 | 27 | 43 | 41 | ... | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
40 | 1 | 0 | 0 | 5 | 31 | 87 | 163 | 218 | 425 | 467 | ... | 3 | 9 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
50 | 0 | 0 | 1 | 10 | 50 | 125 | 384 | 642 | 1111 | 1057 | ... | 12 | 11 | 7 | 4 | 4 | 4 | 0 | 0 | 0 | 0 |
60 | 0 | 1 | 3 | 8 | 41 | 130 | 303 | 455 | 868 | 841 | ... | 25 | 24 | 18 | 15 | 11 | 6 | 2 | 3 | 1 | 1 |
70 | 2 | 1 | 8 | 13 | 46 | 101 | 222 | 324 | 636 | 708 | ... | 27 | 22 | 17 | 15 | 9 | 6 | 7 | 3 | 1 | 1 |
80 | 1 | 1 | 2 | 12 | 16 | 60 | 131 | 183 | 412 | 514 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
90 | 0 | 0 | 0 | 1 | 6 | 13 | 24 | 88 | 128 | 148 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
100 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 rows × 38 columns
Estimate the survival curve for each cohort:
df['complete'] = df.evrmarry
df['complete_var'] = df.agemarry_index
df['ongoing_var'] = df.age_index
df['complete_missing'] = df.complete & df.complete_var.isnull()
df['ongoing_missing'] = ~df.complete & df.ongoing_var.isnull()
# for some marriages, we don't have the date of marriage
for cycle, group in df.groupby('cycle'):
print(cycle, sum(group.complete_missing), sum(group.ongoing_missing))
3 0 0 4 22 0 5 0 0 6 37 0 7 16 0 8 17 0 9 11 0 10 0 0 11 0 0
resps = [group for cycle, group in df.groupby('cycle')]
iters = 101
%time sf_map = marriage.EstimateSurvivalByCohort(resps, iters=iters, cutoffs=cutoffs)
CPU times: user 6.31 s, sys: 0 ns, total: 6.31 s Wall time: 6.31 s
del sf_map[30]
try:
del sf_map[100]
except KeyError:
pass
Check a sample:
for sf in sf_map[90]:
print(sf.ss)
print(sf.Prob(34))
break
13.0 0.999507 14.0 0.998890 15.0 0.998274 16.0 0.995719 17.0 0.984416 18.0 0.965172 19.0 0.939227 20.0 0.908974 21.0 0.877881 22.0 0.839327 23.0 0.803716 24.0 0.764404 25.0 0.741132 26.0 0.715760 27.0 0.697753 28.0 0.693281 dtype: float64 0.6932805494378153
for sf in sf_map[80]:
print(sf.ss)
print(sf.Prob(34))
break
10.0 0.999927 11.0 0.999780 12.0 0.999559 13.0 0.999192 14.0 0.998163 15.0 0.993160 16.0 0.984599 17.0 0.969712 18.0 0.936817 19.0 0.896370 20.0 0.857985 21.0 0.807654 22.0 0.745007 23.0 0.690961 24.0 0.643513 25.0 0.591521 26.0 0.534436 27.0 0.487923 28.0 0.458039 29.0 0.428954 30.0 0.405191 31.0 0.383190 32.0 0.371897 33.0 0.352755 34.0 0.349753 35.0 0.343544 36.0 0.336504 37.0 0.325736 38.0 0.308888 dtype: float64 0.349752784342326
Make the figure showing estimated survival curves:
def PlotSurvivalFunctions(root, sf_map, sf_map_pred=None, **options):
if sf_map_pred:
marriage.PlotSurvivalFunctions(sf_map_pred, predict_flag=True)
marriage.PlotSurvivalFunctions(sf_map)
thinkplot.config(xlabel='Age (years)',
ylabel='Percentage never married',
xlim=[13, 50],
ylim=[-5, 105],
loc='upper right',
frameon=False,
**options)
plt.tight_layout()
thinkplot.save(root=root, formats=['pdf', 'png'])
def set_palette(*args, **kwds):
"""Set the matplotlib color cycler.
args, kwds: same as for sns.color_palette
Also takes a boolean kwd, `reverse`, to indicate
whether the order of the palette should be reversed.
returns: list of colors
"""
reverse = kwds.pop('reverse', False)
palette = sns.color_palette(*args, **kwds)
palette = list(palette)
if reverse:
palette.reverse()
cycler = plt.cycler(color=palette)
plt.gca().set_prop_cycle(cycler)
return palette
def draw_age_lines(ages):
for age in ages:
plt.axvline(age, color='gray', linestyle='dotted', alpha=0.3)
palette = set_palette('hls', 6)
ages = [28, 38, 48]
draw_age_lines(ages)
options_w = dict(title='Women in the U.S. by decade of birth')
PlotSurvivalFunctions('figs/marriage4', sf_map, None, **options_w)
Writing figs/marriage4.pdf Writing figs/marriage4.png
with shelve.open('sf_map') as d:
d['sf_map'] = sf_map
with shelve.open('predictions') as d:
sf_map_pred = d['sf_map_pred']
options_w = dict(title='Women in the U.S. by decade of birth')
marriage.PlotSurvivalFunctions(sf_map_pred, predict_flag=True)
palette = set_palette('hls', 6)
draw_age_lines(ages)
options_w = dict(title='Women in the U.S. by decade of birth')
PlotSurvivalFunctions('figs/marriage4', sf_map, None, **options_w)
Writing figs/marriage4.pdf Writing figs/marriage4.png
Make a table of marriage rates for each cohort at each age:
def MakeTable(sf_map, ages):
t = []
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
t.append((name, vals))
return t
def MakePercentageTable(sf_map, ages):
"""Prints percentage unmarried for each cohort at each age.
"""
t = MakeTable(sf_map, ages)
for name, sf_seq in sorted(sf_map.items()):
ts, ss = marriage.MakeSurvivalCI(sf_seq, [50])
ss = ss[0]
vals = [np.interp(age, ts, ss, right=np.nan) for age in ages]
print(name, '&', ' & '.join('%0.0f' % (val*100) for val in vals), r'\\')
MakePercentageTable(sf_map, ages)
40 & 1187 & 726 & nan \\ 50 & 1915 & 1139 & nan \\ 60 & 3009 & 1455 & 1113 \\ 70 & 3684 & 2029 & 1821 \\ 80 & 4514 & 3108 & nan \\ 90 & 6727 & nan & nan \\
Generate projections:
%time sf_map_pred = marriage.EstimateSurvivalByCohort(resps, iters=iters, cutoffs=cutoffs, predict_flag=True)
del sf_map_pred[30]
del sf_map_pred[100]
CPU times: user 9.21 s, sys: 243 ms, total: 9.45 s Wall time: 9.33 s
import shelve
if SAVE_PRED:
with shelve.open('predictions') as d:
d['sf_map_pred'] = sf_map_pred
for cohort, seq in sf_map_pred.items():
if cohort > 90:
break
medians = [sf.MakeCdf().Value(0.5) for sf in seq]
print(cohort, np.median(medians))
40 20.0 50 21.0 60 24.0 70 25.0 80 27.0 90 37.0
And make the figure showing projections:
palette = set_palette('hls', 6)
draw_age_lines(ages)
PlotSurvivalFunctions('figs/marriage5', sf_map, sf_map_pred, **options_w)
Writing figs/marriage5.pdf Writing figs/marriage5.png
Make the table again with the projections filled in.
MakePercentageTable(sf_map_pred, ages)
40 & 1209 & 731 & nan \\ 50 & 2031 & 1192 & nan \\ 60 & 3025 & 1482 & 1252 \\ 70 & 3614 & 1959 & 1687 \\ 80 & 4495 & 3241 & 2792 \\ 90 & 7114 & 5130 & 4418 \\
def PlotFractions(sf_map, ages, label_flag=False, **options):
t = MakeTable(sf_map, ages)
cohorts, cols = zip(*t)
rows = zip(*cols)
thinkplot.PrePlot(3)
t = list(zip(ages, rows))
for age, row in reversed(t):
label = 'at age %d' % age if label_flag else ''
thinkplot.Plot(cohorts, row, label=label, **options)
PlotFractions(sf_map_pred, ages, color='gray', linestyle='dashed', linewidth=2)
PlotFractions(sf_map, ages, label_flag=True, alpha=1)
#fontsize=12
#thinkplot.Text(36, 0.26, '24', fontsize=fontsize)
#thinkplot.Text(37, 0.13, '9', fontsize=fontsize)
#thinkplot.Text(37, 0.07, '7', fontsize=fontsize)
#thinkplot.Text(90, 0.85, '80', fontsize=fontsize)
#thinkplot.Text(90, 0.56, '51', fontsize=fontsize)
#thinkplot.Text(89.5, 0.47, '42', fontsize=fontsize)
#thinkplot.Text(80, 0.42, '35', fontsize=fontsize)
#thinkplot.Text(70, 0.18, '18', fontsize=fontsize)
thinkplot.Config(xlim=[34, 97], ylim=[0, 1], legend=True, loc='lower left',
xlabel='cohort (decade)', ylabel='Fraction ever married',
title='Women in the U.S.')
thinkplot.Save(root='figs/marriage6', **options)
Writing figs/marriage6.pdf Writing figs/marriage6.png
/home/downey/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance. In a future version, a new instance will always be created and returned. Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance. warnings.warn(message, mplDeprecation, stacklevel=1)
%time df2 = pd.read_hdf('MaleMarriageData.hdf', 'MaleMarriageData')
df2.shape
CPU times: user 19.9 ms, sys: 0 ns, total: 19.9 ms Wall time: 18.7 ms
(34398, 38)
for cycle, group in df2.groupby('cycle'):
SummarizeCycle(cycle, group)
6 & 2002--2003 & 4928 & 1957--1988 \\ 7 & 2006--2010 & 10403 & 1961--1995 \\ 8 & 2011--2013 & 4815 & 1966--1998 \\ 9 & 2013--2015 & 4506 & 1968--2000 \\ 10 & 2015--2017 & 4540 & 1965--2002 \\ 11 & 2017--2019 & 5206 & 1968--2004 \\
sum(df2.missing)
0
marriage.DigitizeResp(df2)
grouped = df2.groupby('birth_index')
for name, group in iter(grouped):
age_range = '%d--%d' % (int(group.age.min()), int(group.age_index.max()))
print(name, '&', len(group), '&', age_range,
'&', len(group[group.evrmarry]), '&', sum(group.missing), r'\\')
50 & 322 & 42--45 & 224 & 0 \\ 60 & 4071 & 32--50 & 2827 & 0 \\ 70 & 9476 & 22--49 & 5743 & 0 \\ 80 & 11421 & 15--39 & 3259 & 0 \\ 90 & 7856 & 15--29 & 457 & 0 \\ 100 & 1252 & 15--19 & 1 & 0 \\
cutoffs2 = ComputeCutoffs(df2)
cutoffs2
{50: 45, 60: 50, 70: 49, 80: 39, 90: 29, 100: 19}
resps2 = [group for cycle, group in df2.groupby('cycle')]
%time sf_map_male = marriage.EstimateSurvivalByCohort(resps2, iters=iters, cutoffs=cutoffs2)
del sf_map_male[100]
CPU times: user 5.15 s, sys: 4.31 ms, total: 5.16 s Wall time: 5.1 s
palette = set_palette('hls', 6)
draw_age_lines(ages)
options_m = dict(title='Men in the U.S. by decade of birth')
PlotSurvivalFunctions('figs/marriage7', sf_map_male, None, **options_m)
Writing figs/marriage7.pdf Writing figs/marriage7.png
%time sf_map_male_pred = marriage.EstimateSurvivalByCohort(resps2, iters=iters, cutoffs=cutoffs2, predict_flag=True)
del sf_map_male_pred[100]
CPU times: user 5.41 s, sys: 325 µs, total: 5.41 s Wall time: 5.35 s
for cohort, seq in sf_map_male_pred.items():
if cohort > 90:
break
medians = [sf.MakeCdf().Value(0.5) for sf in seq]
print(cohort, np.median(medians))
50 26.0 60 27.0 70 28.0 80 30.0 90 36.0
palette = set_palette('hls', 5)
draw_age_lines(ages)
PlotSurvivalFunctions('figs/marriage8', sf_map_male, sf_map_male_pred, **options_m)
Writing figs/marriage8.pdf Writing figs/marriage8.png
MakePercentageTable(sf_map_male, ages)
50 & 3625 & 2075 & nan \\ 60 & 4360 & 2134 & 1581 \\ 70 & 4735 & 2178 & 1786 \\ 80 & 5549 & 3732 & nan \\ 90 & 7330 & nan & nan \\
MakePercentageTable(sf_map_male_pred, ages)
50 & 3575 & 1899 & nan \\ 60 & 4274 & 2044 & 1492 \\ 70 & 4693 & 2166 & 1718 \\ 80 & 5576 & 3610 & 2862 \\ 90 & 7471 & 4837 & 3835 \\
PlotFractions(sf_map_male_pred, ages, color='gray', linestyle='dashed', linewidth=2)
PlotFractions(sf_map_male, ages, label_flag=True, alpha=1)
fontsize=12
thinkplot.Text(46, 0.69, '68', fontsize=fontsize)
thinkplot.Text(46, 0.30, '26', fontsize=fontsize)
thinkplot.Text(46, 0.20, '18', fontsize=fontsize)
thinkplot.Text(70, 0.18, '19', fontsize=fontsize)
thinkplot.Text(80, 0.43, '43', fontsize=fontsize)
thinkplot.Text(90, 0.89, '86', fontsize=fontsize)
thinkplot.Text(90, 0.56, '52', fontsize=fontsize)
thinkplot.Text(90, 0.40, '38', fontsize=fontsize)
thinkplot.Config(xlim=[34, 97], ylim=[0, 1], legend=True, loc='upper left',
xlabel='cohort (decade)', ylabel='Fraction unmarried',
title='Men in the U.S.')
thinkplot.Save(root='figs/marriage9', **options)
Writing figs/marriage9.pdf Writing figs/marriage9.png
/home/downey/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance. In a future version, a new instance will always be created and returned. Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance. warnings.warn(message, mplDeprecation, stacklevel=1)