Age at first marriage¶

Comparing results from NSFG using my original code and lifelines.

In [4]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import shelve

from lifelines import KaplanMeierFitter

from utils import decorate
from utils import values
from utils import resample_rows_weighted
from utils import round_into_bins

from empiricaldist import Pmf, Cdf

random_seed = 12

In [5]:

def set_palette(*args, **kwds):
    """Set the matplotlib color cycler.
    
    args, kwds: same as for sns.color_palette
    
    Also takes a boolean kwd, `reverse`, to indicate
    whether the order of the palette should be reversed.
    
    returns: list of colors
    """
    reverse = kwds.pop('reverse', False)
    palette = sns.color_palette(*args, **kwds)
    
    palette = list(palette)
    if reverse:
        palette.reverse()
        
    cycler = plt.cycler(color=palette)
    plt.gca().set_prop_cycle(cycler)
    return palette

In [103]:

def plot_kmf_map(ax, kmf_map, cohorts, **options):
    """
    """
    label = options.pop('label', None)

    palette = set_palette('hls', len(cohorts))
    for cohort in cohorts:
        series = kmf_map[cohort]
        label = label if label is not None else f'{cohort+1900}s'
        series.plot(ax=ax, label=label, **options)

In [6]:

In [7]:

def PercentileRow(array, p):
    """Selects the row from a sorted array that maps to percentile p.

    p: float 0--100

    returns: NumPy array (one row)
    """
    rows, cols = array.shape
    index = int(rows * p / 100)
    return array[index,]

In [8]:

def PercentileRows(ys_seq, percents):
    """Given a collection of lines, selects percentiles along vertical axis.

    For example, if ys_seq contains simulation results like ys as a
    function of time, and percents contains (5, 95), the result would
    be a 90% CI for each vertical slice of the simulation results.

    ys_seq: sequence of lines (y values)
    percents: list of percentiles (0-100) to select

    returns: list of NumPy arrays, one for each percentile
    """
    nrows = len(ys_seq)
    ncols = len(ys_seq[0])
    array = np.zeros((nrows, ncols))

    for i, ys in enumerate(ys_seq):
        array[i,] = ys

    array = np.sort(array, axis=0)

    rows = [PercentileRow(array, p) for p in percents]
    return rows

In [9]:

def MakeSurvivalCI(sf_seq, percents, flip=False):
    """Makes confidence intervals from a list of survival functions.

    sf_seq: list of SurvivalFunction
    percents: list of percentiles to select, like [5, 95]

    returns: (ts, rows) where ts is a sequence of times and
             rows contains one row of values for each percent
    """
    # find the union of all ts where the sfs are evaluated
    ts = set()
    for sf in sf_seq:
        ts |= set(sf.ts)

    ts = list(ts)
    ts.sort()

    # evaluate each sf at all times
    if flip:
        ys = 1-sf.Probs(ts)
    else:
        ys = sf.Probs(ts)
    ss_seq = [100*ys for sf in sf_seq if len(sf) > 0]

    # return the requested percentiles from each column
    rows = PercentileRows(ss_seq, percents)
    return ts, rows

In [10]:

def PlotSurvivalFunctions(sf_map):
    """Plot estimated survival functions.

    sf_map: map from group name to sequence of survival functions
    predict_flag: whether the lines are predicted or actual
    colormap: map from group name to color
    """
    for name, sf_seq in sorted(sf_map.items(), reverse=False):
        if len(sf_seq) == 0:
            continue

        sf = sf_seq[0]
        if len(sf) == 0:
            continue

        ts, rows = MakeSurvivalCI(sf_seq, [50])
        plt.plot(ts, 100-rows[0], label='19%ds'%name, alpha=0.8)

NSFG original analysis¶

Get the curves computed in marriage.ipynb

In [11]:

with shelve.open('sf_map') as d:
    sf_map = d['sf_map']

In [12]:

palette = set_palette('hls', 6)
PlotSurvivalFunctions(sf_map)
decorate()

NSFG corrected analysis¶

Load the data cleaned in clean_nsfg.ipynb and run the analysis with lifelines

In [30]:

%time unweighted = pd.read_hdf('FemMarriageData.hdf', 'FemMarriageData')
unweighted.shape

CPU times: user 46.6 ms, sys: 5.19 ms, total: 51.8 ms
Wall time: 49.1 ms

Out[30]:

(70183, 45)

In [31]:

evrmarry = unweighted['evrmarry']
values(evrmarry)

Out[31]:

evrmarry
False    33584
True     36599
Name: count, dtype: int64

In [70]:

for name, group in unweighted.groupby('birth_index'):
    missing = group['evrmarry'] & group['agemarry'].isna()
    print(name, missing.sum())

In [109]:

def resample_by_cycle(unweighted):
    dfs = []

    for name, group in unweighted.groupby('cycle'):
        df = resample_rows_weighted(group, 'finalwgt')
        dfs.append(df)

    return pd.concat(dfs).reset_index(drop=True)

In [72]:

sample = resample_by_cycle(unweighted)

In [73]:

values(sample['birth_index'])

Out[73]:

birth_index
30       323
40      4014
50     10381
60     15638
70     17170
80     13793
90      7835
100     1029
Name: count, dtype: int64

In [74]:

def fill_missing(df):
    for cohort in range(30, 110, 10):
        # find people in this cohort who are married by have no agemarry
        missing = (df['birth_index'] == cohort) & df['evrmarry'] & df['agemarry'].isna()
        n = missing.sum()
        if n == 0:
            continue
        
        # find people in this cohort who have been married once
        donors = (df['birth_index'] == cohort) & df['evrmarry'] & df['agemarry'].notna()

        # select their ages when married
        agemarry = df.loc[donors, 'agemarry']
        low, high = np.percentile(agemarry, [1, 99])
        index = agemarry.between(low, high)

        # fill missing data with a random sample
        sample = np.random.choice(agemarry[index], size=n, replace=True)
        df.loc[missing, 'agemarry'] = sample

In [75]:

fill_missing(sample)
missing = sample['evrmarry'] & sample['agemarry'].isna()
missing.sum()

Out[75]:

In [62]:

cdf = Cdf.from_seq(sample['age'], name='age')
cdf.plot()
decorate()

In [63]:

cdf = Cdf.from_seq(sample['agemarry'], name='agemarry')
cdf.plot()
decorate()

In [64]:

def make_kmf_map(grouped):
    """Dictionary that maps from cohort name to survival function
    """
    kmf_map = {}

    for cohort, group in grouped:
        kmf = KaplanMeierFitter()
        kmf.fit(group['duration'], group['observed'])
        series = (1 - kmf.survival_function_['KM_estimate']) * 100
        series.name = cohort
        kmf_map[cohort] = series
        
    return kmf_map

In [77]:

evrmarry = sample['evrmarry']
sample['observed'] = evrmarry
sample.loc[evrmarry, 'duration'] = sample['agemarry']
sample.loc[~evrmarry, 'duration'] = sample['age']

In [78]:

grouped = sample.groupby('birth_index')
nsfg_map = make_kmf_map(grouped)

In [85]:

def decorate_plot(**options):
    decorate(xlabel='Age (years)',
             ylabel='Percent ever married',
             xlim=[13, 55], 
             **options,
             )

    ax.legend(title='Decade of birth', loc='lower right')

In [104]:

cohorts = range(40, 100, 10)

fig, ax = plt.subplots()
plot_kmf_map(ax, nsfg_map, cohorts, title='U.S. Women, NSFG')
decorate_plot()

In [105]:

fig, ax = plt.subplots()
plot_kmf_map(ax, nsfg_map, cohorts, label='')
PlotSurvivalFunctions(sf_map)
decorate_plot(title='U.S. Women, NSFG')

In [112]:

fig, ax = plt.subplots()

for i in range(11):
    sample = resample_by_cycle(unweighted)
    fill_missing(sample)
    evrmarry = sample['evrmarry']
    sample['observed'] = evrmarry
    sample.loc[evrmarry, 'duration'] = sample['agemarry']
    sample.loc[~evrmarry, 'duration'] = sample['age']
    grouped = sample.groupby('birth_index')
    nsfg_map = make_kmf_map(grouped)
    plot_kmf_map(ax, nsfg_map, cohorts, label='', alpha=0.1)

PlotSurvivalFunctions(sf_map)
decorate_plot()

NSFG time between marriages¶

In [21]:

values(sample['cycle'])

Out[21]:

cycle
3      7969
4      8450
5     10847
6      7643
7     12279
8      5601
9      5699
10     5554
11     6141
Name: count, dtype: int64

In the last two cycles, the mardat variables are in years rather than century-months.

In [22]:

subset = sample.query('cycle <= 9')

In [23]:

subset['mardat01'].notna().sum()

Out[23]:

In [24]:

subset['mardat02'].notna().sum()

Out[24]:

In [25]:

subset['mardat03'].notna().sum()

Out[25]:

In [26]:

subset['mardat04'].notna().sum()

Out[26]:

In [27]:

time_between_two = (subset['mardat02'] - subset['mardat01']) / 12
pmf_two = Pmf.from_seq(time_between_two, name='first and second')
pmf_two.make_cdf().plot()
decorate(xlabel='Time between first two marriages')

In [28]:

time_between_three = (subset['mardat03'] - subset['mardat01']) / 12
pmf_three = Pmf.from_seq(time_between_three, name='first and third')
pmf_three.make_cdf().plot()
decorate(xlabel='Time between first three marriages')

In [29]:

from scipy.stats import gaussian_kde

kde_two = gaussian_kde(time_between_two.dropna())
kde_three = gaussian_kde(time_between_three.dropna())

qs = np.linspace(0, 25)
ps = kde_two(qs)
pmf = Pmf(ps, qs)
pmf.normalize()
pmf.make_cdf().plot(color='gray')

ps = kde_three(qs)
pmf = Pmf(ps, qs)
pmf.normalize()
pmf.make_cdf().plot(color='gray')

pmf_two.make_cdf().plot()
pmf_three.make_cdf().plot()
decorate(xlabel='Time between first two marriages')

In [30]:

np.percentile(time_between_two.dropna(), [50])

Out[30]:

array([7.91666667])

In [31]:

np.percentile(time_between_three.dropna(), [50])

Out[31]:

array([12.66666667])

In [ ]: