Comparing results from NSFG using my original code and lifelines.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shelve
from lifelines import KaplanMeierFitter
from utils import decorate
from utils import values
from utils import resample_rows_weighted
from utils import round_into_bins
from empiricaldist import Pmf, Cdf
random_seed = 12
def set_palette(*args, **kwds):
"""Set the matplotlib color cycler.
args, kwds: same as for sns.color_palette
Also takes a boolean kwd, `reverse`, to indicate
whether the order of the palette should be reversed.
returns: list of colors
"""
reverse = kwds.pop('reverse', False)
palette = sns.color_palette(*args, **kwds)
palette = list(palette)
if reverse:
palette.reverse()
cycler = plt.cycler(color=palette)
plt.gca().set_prop_cycle(cycler)
return palette
def plot_kmf_map(ax, kmf_map, cohorts, **options):
"""
"""
label = options.pop('label', None)
palette = set_palette('hls', len(cohorts))
for cohort in cohorts:
series = kmf_map[cohort]
label = label if label is not None else f'{cohort+1900}s'
series.plot(ax=ax, label=label, **options)
def PercentileRow(array, p):
"""Selects the row from a sorted array that maps to percentile p.
p: float 0--100
returns: NumPy array (one row)
"""
rows, cols = array.shape
index = int(rows * p / 100)
return array[index,]
def PercentileRows(ys_seq, percents):
"""Given a collection of lines, selects percentiles along vertical axis.
For example, if ys_seq contains simulation results like ys as a
function of time, and percents contains (5, 95), the result would
be a 90% CI for each vertical slice of the simulation results.
ys_seq: sequence of lines (y values)
percents: list of percentiles (0-100) to select
returns: list of NumPy arrays, one for each percentile
"""
nrows = len(ys_seq)
ncols = len(ys_seq[0])
array = np.zeros((nrows, ncols))
for i, ys in enumerate(ys_seq):
array[i,] = ys
array = np.sort(array, axis=0)
rows = [PercentileRow(array, p) for p in percents]
return rows
def MakeSurvivalCI(sf_seq, percents, flip=False):
"""Makes confidence intervals from a list of survival functions.
sf_seq: list of SurvivalFunction
percents: list of percentiles to select, like [5, 95]
returns: (ts, rows) where ts is a sequence of times and
rows contains one row of values for each percent
"""
# find the union of all ts where the sfs are evaluated
ts = set()
for sf in sf_seq:
ts |= set(sf.ts)
ts = list(ts)
ts.sort()
# evaluate each sf at all times
if flip:
ys = 1-sf.Probs(ts)
else:
ys = sf.Probs(ts)
ss_seq = [100*ys for sf in sf_seq if len(sf) > 0]
# return the requested percentiles from each column
rows = PercentileRows(ss_seq, percents)
return ts, rows
def PlotSurvivalFunctions(sf_map):
"""Plot estimated survival functions.
sf_map: map from group name to sequence of survival functions
predict_flag: whether the lines are predicted or actual
colormap: map from group name to color
"""
for name, sf_seq in sorted(sf_map.items(), reverse=False):
if len(sf_seq) == 0:
continue
sf = sf_seq[0]
if len(sf) == 0:
continue
ts, rows = MakeSurvivalCI(sf_seq, [50])
plt.plot(ts, 100-rows[0], label='19%ds'%name, alpha=0.8)
Get the curves computed in marriage.ipynb
with shelve.open('sf_map') as d:
sf_map = d['sf_map']
palette = set_palette('hls', 6)
PlotSurvivalFunctions(sf_map)
decorate()
Load the data cleaned in clean_nsfg.ipynb and run the analysis with lifelines
%time unweighted = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')
unweighted.shape
CPU times: user 46.6 ms, sys: 5.19 ms, total: 51.8 ms Wall time: 49.1 ms
(70183, 45)
evrmarry = unweighted['evrmarry']
values(evrmarry)
evrmarry False 33584 True 36599 Name: count, dtype: int64
for name, group in unweighted.groupby('birth_index'):
missing = group['evrmarry'] & group['agemarry'].isna()
print(name, missing.sum())
30 0 40 9 50 18 60 33 70 30 80 13 90 0 100 0
def resample_by_cycle(unweighted):
dfs = []
for name, group in unweighted.groupby('cycle'):
df = resample_rows_weighted(group, 'finalwgt')
dfs.append(df)
return pd.concat(dfs).reset_index(drop=True)
sample = resample_by_cycle(unweighted)
values(sample['birth_index'])
birth_index 30 323 40 4014 50 10381 60 15638 70 17170 80 13793 90 7835 100 1029 Name: count, dtype: int64
def fill_missing(df):
for cohort in range(30, 110, 10):
# find people in this cohort who are married by have no agemarry
missing = (df['birth_index'] == cohort) & df['evrmarry'] & df['agemarry'].isna()
n = missing.sum()
if n == 0:
continue
# find people in this cohort who have been married once
donors = (df['birth_index'] == cohort) & df['evrmarry'] & df['agemarry'].notna()
# select their ages when married
agemarry = df.loc[donors, 'agemarry']
low, high = np.percentile(agemarry, [1, 99])
index = agemarry.between(low, high)
# fill missing data with a random sample
sample = np.random.choice(agemarry[index], size=n, replace=True)
df.loc[missing, 'agemarry'] = sample
fill_missing(sample)
missing = sample['evrmarry'] & sample['agemarry'].isna()
missing.sum()
0
cdf = Cdf.from_seq(sample['age'], name='age')
cdf.plot()
decorate()
cdf = Cdf.from_seq(sample['agemarry'], name='agemarry')
cdf.plot()
decorate()
def make_kmf_map(grouped):
"""Dictionary that maps from cohort name to survival function
"""
kmf_map = {}
for cohort, group in grouped:
kmf = KaplanMeierFitter()
kmf.fit(group['duration'], group['observed'])
series = (1 - kmf.survival_function_['KM_estimate']) * 100
series.name = cohort
kmf_map[cohort] = series
return kmf_map
evrmarry = sample['evrmarry']
sample['observed'] = evrmarry
sample.loc[evrmarry, 'duration'] = sample['agemarry']
sample.loc[~evrmarry, 'duration'] = sample['age']
grouped = sample.groupby('birth_index')
nsfg_map = make_kmf_map(grouped)
def decorate_plot(**options):
decorate(xlabel='Age (years)',
ylabel='Percent ever married',
xlim=[13, 55],
**options,
)
ax.legend(title='Decade of birth', loc='lower right')
cohorts = range(40, 100, 10)
fig, ax = plt.subplots()
plot_kmf_map(ax, nsfg_map, cohorts, title='U.S. Women, NSFG')
decorate_plot()
fig, ax = plt.subplots()
plot_kmf_map(ax, nsfg_map, cohorts, label='')
PlotSurvivalFunctions(sf_map)
decorate_plot(title='U.S. Women, NSFG')
fig, ax = plt.subplots()
for i in range(11):
sample = resample_by_cycle(unweighted)
fill_missing(sample)
evrmarry = sample['evrmarry']
sample['observed'] = evrmarry
sample.loc[evrmarry, 'duration'] = sample['agemarry']
sample.loc[~evrmarry, 'duration'] = sample['age']
grouped = sample.groupby('birth_index')
nsfg_map = make_kmf_map(grouped)
plot_kmf_map(ax, nsfg_map, cohorts, label='', alpha=0.1)
PlotSurvivalFunctions(sf_map)
decorate_plot()
values(sample['cycle'])
cycle 3 7969 4 8450 5 10847 6 7643 7 12279 8 5601 9 5699 10 5554 11 6141 Name: count, dtype: int64
In the last two cycles, the mardat
variables are in years rather than century-months.
subset = sample.query('cycle <= 9')
subset['mardat01'].notna().sum()
16635
subset['mardat02'].notna().sum()
902
subset['mardat03'].notna().sum()
91
subset['mardat04'].notna().sum()
19
time_between_two = (subset['mardat02'] - subset['mardat01']) / 12
pmf_two = Pmf.from_seq(time_between_two, name='first and second')
pmf_two.make_cdf().plot()
decorate(xlabel='Time between first two marriages')
time_between_three = (subset['mardat03'] - subset['mardat01']) / 12
pmf_three = Pmf.from_seq(time_between_three, name='first and third')
pmf_three.make_cdf().plot()
decorate(xlabel='Time between first three marriages')
from scipy.stats import gaussian_kde
kde_two = gaussian_kde(time_between_two.dropna())
kde_three = gaussian_kde(time_between_three.dropna())
qs = np.linspace(0, 25)
ps = kde_two(qs)
pmf = Pmf(ps, qs)
pmf.normalize()
pmf.make_cdf().plot(color='gray')
ps = kde_three(qs)
pmf = Pmf(ps, qs)
pmf.normalize()
pmf.make_cdf().plot(color='gray')
pmf_two.make_cdf().plot()
pmf_three.make_cdf().plot()
decorate(xlabel='Time between first two marriages')
np.percentile(time_between_two.dropna(), [50])
array([7.91666667])
np.percentile(time_between_three.dropna(), [50])
array([12.66666667])