Copyright 2015 Allen Downey
from __future__ import print_function, division
import marriage
import thinkstats2
import thinkplot
import math
import pandas as pd
import numpy as np
%matplotlib inline
Load the data:
def ReadFemPreg(dct_file, dat_file, usecols):
"""Reads the NSFG pregnancy data.
dct_file: string file name
dat_file: string file name
returns: DataFrame
"""
dct = thinkstats2.ReadStataDct(dct_file, encoding='ISO-8859-1')
df = dct.ReadFixedWidth(dat_file, compression='gzip', usecols=usecols)
return df
usecols = ['outcome', 'prglngth', 'birthord', 'finalwgt']
preg6 = ReadFemPreg('2002FemPreg.dct', '2002FemPreg.dat.gz', usecols)
print(preg6.shape)
preg6 = preg6[preg6.outcome == 1]
(13593, 4)
preg6.birthord.value_counts().sort_index()
1 4413 2 2874 3 1234 4 421 5 126 6 50 7 20 8 7 9 2 10 1 dtype: int64
preg6.prglngth.value_counts().sort_index()
0 1 4 1 9 1 13 1 17 2 18 1 19 1 20 1 21 2 22 7 23 1 24 13 25 3 26 35 27 3 28 32 29 21 30 138 31 27 32 115 33 49 34 60 35 311 36 321 37 455 38 607 39 4693 40 1116 41 587 42 328 43 148 44 46 45 10 46 1 47 1 48 7 50 2 dtype: int64
Make a list of DataFrames, one for each cycle:
sum(preg6.prglngth >= 27)
9078
preg6.finalwgt.describe()
count 9148.000000 mean 8404.364417 std 9142.507490 min 118.656790 25% 3995.837136 50% 6415.602101 75% 9555.442649 max 261879.953864 Name: finalwgt, dtype: float64
usecols = ['outcome', 'prglngth', 'birthord', 'wgtq1q16']
preg7 = ReadFemPreg('2006_2010_FemPregSetup.dct', '2006_2010_FemPreg.dat.gz', usecols)
print(preg7.shape)
preg7 = preg7[preg7.outcome == 1]
preg7['finalwgt'] = preg7.wgtq1q16
preg7.shape
(20492, 4)
(14292, 5)
preg7.birthord.value_counts().sort_index()
1 6683 2 4415 3 2030 4 734 5 269 6 102 7 36 8 13 9 7 10 2 11 1 dtype: int64
preg7.prglngth.value_counts().sort_index()
0 1 7 1 17 1 19 2 20 2 21 4 22 16 23 4 24 15 25 14 26 65 27 21 28 37 29 26 30 184 31 40 32 156 33 59 34 155 35 509 36 622 37 835 38 1304 39 6263 40 2280 41 907 42 553 43 185 44 19 45 3 46 4 48 3 52 1 57 1 dtype: int64
sum(preg7.prglngth >= 27)
14167
# note: min and max should be 38.700642694-49735.071265
preg7.finalwgt.describe()
count 14292.000000 mean 5306.068571 std 5967.788429 min 44.023984 25% 1509.629839 50% 3078.497086 75% 6594.905896 max 30226.354508 Name: finalwgt, dtype: float64
usecols = ['outcome', 'prglngth', 'birthord', 'wgt2011_2013']
preg8 = ReadFemPreg('2011_2013_FemPregSetup.dct', '2011_2013_FemPregData.dat.gz', usecols)
print(preg7.shape)
preg8 = preg8[preg8.outcome == 1]
preg8['finalwgt'] = preg8.wgt2011_2013
preg8.shape
(14292, 5)
(6670, 5)
preg8.birthord.value_counts().sort_index()
1 3141 2 2076 3 957 4 327 5 113 6 34 7 16 8 5 9 1 dtype: int64
preg8.prglngth.value_counts().sort_index()
17 1 20 2 21 2 22 8 23 3 24 12 25 6 26 21 27 7 28 22 29 19 30 61 31 24 32 90 33 30 34 97 35 219 36 283 37 406 38 706 39 2566 40 1306 41 426 42 277 43 63 44 8 45 2 46 1 47 2 dtype: int64
sum(preg8.prglngth >= 27)
6615
preg8.finalwgt.describe()
count 6670.000000 mean 11173.091142 std 12939.068547 min 1714.541000 25% 3985.078597 50% 6834.593716 75% 12984.239442 max 85207.950000 Name: finalwgt, dtype: float64
#resp8 = marriage.ReadFemResp2013()
#marriage.Validate2013(resp8)
#resp8 = resp8[resp8.parity >= 1]
#resp7 = marriage.ReadFemResp2010()
#marriage.Validate2010(resp7)
#resp7 = resp7[resp7.parity >= 1]
#resp6 = marriage.ReadFemResp2002()
#marriage.Validate2002(resp6)
#resp6 = resp6[resp6.parity >= 1]
#resps = [resp6, resp7, resp8]
pregs = [preg6, preg7, preg8]
for preg in pregs:
print(len(preg))
9148 14292 6670
def SummarizeCycle(df):
ages = df.age.min(), df.age.max()
ages= np.array(ages)
intvws = df.cmintvw.min(), df.cmintvw.max()
intvws = np.array(intvws) / 12 + 1900
births = df.cmbirth.min(), df.cmbirth.max()
births = np.array(births) / 12 + 1900
print('# & ', intvws.astype(int), '&', len(df), '&', births.astype(int), r'\\')
#for resp in reversed(resps):
# SummarizeCycle(resp)
def ResampleAndSelect(resp, preg):
sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
print('1')
dfs = [preg[preg.caseid == caseid] for caseid in sample.caseid]
print('2')
rows = pd.concat(dfs, ignore_index=True)
print('3')
return rows
#%time sample6 = ResampleAndSelect(resp6, preg6)
#sample6.shape
#grouped6 = preg6.groupby('caseid')
def ResampleAndSelect(resp, grouped):
sample = thinkstats2.ResampleRowsWeighted(resp, column='finalwgt')
print('1')
dfs = [grouped.get_group(caseid) for caseid in sample.caseid]
print('2', len(dfs))
rows = pd.concat(dfs, ignore_index=True)
print('3')
return rows
#%time sample6 = ResampleAndSelect(resp6, grouped6)
#sample6.shape
def ResamplePreg(preg):
sample = thinkstats2.ResampleRowsWeighted(preg, column='finalwgt')
return sample
#%time sample7 = ResamplePreg(preg7)
#ample7.shape
def Resample(pregs):
samples = [thinkstats2.ResampleRowsWeighted(preg, column='finalwgt')
for preg in pregs]
sample = pd.concat(samples)
return sample
sample = Resample(pregs)
firsts = sample[sample.birthord == 1]
firsts.shape
(13864, 6)
others = sample[sample.birthord > 1]
others.shape
(16246, 6)
sum(firsts.prglngth.isnull())
0
sum(others.prglngth.isnull())
0
firsts.prglngth.value_counts().sort_index()
0 7 17 5 20 1 21 4 22 14 23 1 24 13 25 17 26 51 27 12 28 76 29 23 30 204 31 42 32 164 33 79 34 150 35 434 36 570 37 688 38 1110 39 5728 40 2218 41 1234 42 700 43 255 44 54 45 5 46 2 47 1 48 2 dtype: int64
others.prglngth.value_counts().sort_index()
9 1 18 1 19 12 20 7 21 7 22 6 23 3 24 8 25 8 26 32 27 22 28 50 29 30 30 168 31 25 32 156 33 79 34 179 35 474 36 657 37 990 38 1600 39 7732 40 2604 41 796 42 422 43 128 44 37 45 4 46 1 48 5 50 2 dtype: int64
len(firsts.prglngth), len(others.prglngth)
(13864, 16246)
firsts.prglngth.mean(), others.prglngth.mean()
(38.6130986728217, 38.518342976732733)
diff = firsts.prglngth.mean() - others.prglngth.mean()
diff, diff * 7, diff * 7 * 24
(0.094755696088967056, 0.66328987262276939, 15.918956942946465)
firsts.prglngth.std(), others.prglngth.std()
(2.8703583448993912, 2.3963914827683821)
se1 = firsts.prglngth.std() / math.sqrt(len(firsts.prglngth))
se1
0.024377650370258221
se2 = others.prglngth.std() / math.sqrt(len(others.prglngth))
se2
0.01880115556593805
diff / se1, diff / se2
(3.8869905282000872, 5.0398868174164484)
cdf_firsts = thinkstats2.Cdf(firsts.prglngth, label='firsts')
thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc='upper left')
cdf_others = thinkstats2.Cdf(others.prglngth, label='others')
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc='upper left')
jitter = 0.5
length_firsts = thinkstats2.Jitter(firsts.prglngth, jitter)
cdf_firsts = thinkstats2.Cdf(length_firsts, label='firsts')
length_others = thinkstats2.Jitter(others.prglngth, jitter)
cdf_others = thinkstats2.Cdf(length_others, label='others')
thinkplot.PrePlot(2)
thinkplot.Cdf(cdf_firsts)
thinkplot.Cdf(cdf_others)
thinkplot.Config(xlabel='pregnancy length (weeks)',
xlim=[30, 45],
ylabel='CDF',
loc='upper left')
def RemainingDurationPmf(pmf, weeks):
"""Returns PMF of remaining duration conditioned on weeks.
pmf: PMF of pregnancy length
weeks: current weeks of pregnancy
"""
new = thinkstats2.Pmf(label=pmf.label)
for x, p in pmf.Items():
if x >= weeks:
new[x - weeks] = p
new.Normalize()
return new
pmf_firsts = thinkstats2.Pmf(length_firsts, label='firsts')
pmf_others = thinkstats2.Pmf(length_others, label='others')
def PlotRemaining(pmfs, weeks):
"""Plot CDF of remaining weeks.
pmfs: list of PMF
weeks: current weeks of pregnancy
"""
cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
for pmf in pmfs]
thinkplot.PrePlot(len(cdfs))
thinkplot.Cdfs(cdfs)
loc = 'lower right'
PlotRemaining([pmf_firsts, pmf_others], 30)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
PlotRemaining([pmf_firsts, pmf_others], 32)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
PlotRemaining([pmf_firsts, pmf_others], 34)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
PlotRemaining([pmf_firsts, pmf_others], 36)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
PlotRemaining([pmf_firsts, pmf_others], 38)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
PlotRemaining([pmf_firsts, pmf_others], 39)
thinkplot.Config(xlabel='pregnancy length (weeks)',
ylabel='CDF',
loc=loc)
def PercentilesRemaining(pmfs, weeks):
cdfs = [RemainingDurationPmf(pmf, weeks).MakeCdf()
for pmf in pmfs]
for cdf in cdfs:
print([cdf.Percentile(p) * 7 for p in [25, 50, 75]])
PercentilesRemaining([pmf_firsts, pmf_others], 39)
[2.7143548802028761, 6.4367781041855068, 13.206703169606556] [2.1736408667971858, 4.9317939465359615, 9.3739933152324184]
percentiles = [50, 75, 25]
def PercentilesRemaining(pmf, ts):
array = np.zeros((len(ts), len(percentiles)))
for i, t in enumerate(ts):
cdf = RemainingDurationPmf(pmf, t).MakeCdf()
ps = [cdf.Percentile(p) * 7 for p in percentiles]
#print(t, ps)
array[i, :] = ps
return array
ts = np.arange(36, 44)
ps_firsts = PercentilesRemaining(pmf_firsts, ts)
ps_firsts
array([[ 22.64578082, 29.14642029, 17.95618349], [ 16.10550278, 22.63422464, 11.78280693], [ 9.97194385, 16.53705261, 5.9922938 ], [ 6.4367781 , 13.20670317, 2.71435488], [ 6.76981776, 12.44368589, 2.86739126], [ 5.59852858, 10.38567513, 2.37625612], [ 4.38732448, 7.85479168, 2.06292504], [ 2.57863214, 6.10710894, 1.19931778]])
ps_others = PercentilesRemaining(pmf_others, ts)
ps_others
array([[ 21.60271001, 26.40089751, 17.18171224], [ 15.00369008, 19.79528516, 11.06307321], [ 8.87411349, 13.48416292, 5.44552693], [ 4.93179395, 9.37399332, 2.17364087], [ 4.65239838, 9.90509012, 1.87870862], [ 5.5198786 , 9.65339663, 2.22971555], [ 4.32478024, 7.90697449, 1.72204552], [ 4.01518618, 7.66307581, 1.71902067]])
nrows, ncols = ps_firsts.shape
thinkplot.PrePlot(3)
for i in range(ncols):
thinkplot.Plot(ts, ps_firsts[:, i], label=percentiles[i])
thinkplot.PrePlot(3)
for i in range(ncols):
thinkplot.Plot(ts, ps_others[:, i], linestyle='dashed')
thinkplot.Config(xlabel='t (weeks)',
ylabel='remaining time (days)',
legend=True,
loc='upper right')
sample = Resample(pregs)
sample.shape
(30110, 6)
firsts = sample[sample.birthord == 1]
others = sample[sample.birthord > 1]
jitter = 0.5
lengths = thinkstats2.Jitter(firsts.prglngth, jitter)
pmf_firsts = thinkstats2.Pmf(lengths, label='firsts')
lengths = thinkstats2.Jitter(others.prglngth, jitter)
pmf_others = thinkstats2.Pmf(lengths, label='others')
def SelectFirsts(preg):
return preg[preg.birthord == 1]
def SelectOthers(preg):
return preg[preg.birthord > 1]
def MeanRemaining(pmf, t):
rem_pmf = RemainingDurationPmf(pmf, t)
mean = rem_pmf.Mean()
return mean
MeanRemaining(pmf_firsts, 36)
3.4346722690706315
def MedianRemaining(pmf, t):
rem_pmf = RemainingDurationPmf(pmf, t)
rem_cdf = rem_pmf.MakeCdf()
median = rem_cdf.Percentile(50)
return median
MedianRemaining(pmf_firsts, 36)
3.2662877239065011
def ProbRemainingLess(pmf, t, weeks=1):
rem_pmf = RemainingDurationPmf(pmf, t)
rem_cdf = rem_pmf.MakeCdf()
p = rem_cdf[weeks]
return p
ProbRemainingLess(pmf_firsts, 36)
0.048448823385729728
def Remaining(preg, iters, ts, select_func, stat_func):
"""Computes remaining lifetime versus weeks of pregnancy.
preg: DataFrame of pregnancy data
iters: int, how many resampling iterations to run
ts: sequence of float, times to evaluate, in weeks
select_func: function that selects a subset of pregnancies
stat_func: function that computes summary statistic
returns: array with one row for each percentile, one column
for each item in ts
"""
# make an array to hold one row per iteration,
# one col per element of ts
array = np.zeros((iters, len(ts)))
for i in range(iters):
sample = Resample(pregs)
group = select_func(sample)
lengths = thinkstats2.Jitter(group.prglngth, jitter=0.5)
pmf = thinkstats2.Pmf(lengths)
stats = [stat_func(pmf, t) for t in ts]
array[i, :] = stats
array = np.sort(array, axis=0)
rows = [thinkstats2.PercentileRow(array, p) for p in [10, 50, 90]]
return rows
def PlotStats(stat_func, iters, ts):
"""Plots a summary statistic versus weeks of pregnancy.
stat_func: function that computes summary statistic
iters: int, how many resampling iterations to run
ts: sequence of float, times to evaluate, in weeks
"""
# list of labels and select_funcs
labels = [('firsts', SelectFirsts),
('others', SelectOthers)]
thinkplot.PrePlot(2)
for label, select_func in labels:
rows = Remaining(pmf_firsts, iters, ts, select_func, stat_func)
thinkplot.FillBetween(ts, rows[0], rows[2], color='gray')
thinkplot.Plot(ts, rows[1], label=label)
print(label)
for t, stat in zip(ts, rows[1]):
print(t, stat)
iters = 101
ts = np.arange(36, 43.5, 0.5)
PlotStats(MeanRemaining, iters, ts)
thinkplot.Config(xlabel='week of pregnancy',
ylabel='remaining time (weeks)',
legend=True,
loc='upper right')
thinkplot.Save(root='first1', formats=['pdf', 'png'])
firsts 36.0 3.39754156441 36.5 2.97508980334 37.0 2.55456121193 37.5 2.13886585341 38.0 1.74537235835 38.5 1.42681323973 39.0 1.26628280514 39.5 1.23078085688 40.0 1.17889460956 40.5 1.09181678181 41.0 0.980629908801 41.5 0.862167177764 42.0 0.751218876485 42.5 0.68828895529 43.0 0.629491972579 others 36.0 3.13540208072 36.5 2.70930346011 37.0 2.29039746926 37.5 1.88055601698 38.0 1.49280931068 38.5 1.17409914359 39.0 1.0078923987 39.5 0.985300112973 40.0 0.998127077118 40.5 1.01971112502 41.0 1.01329044103 41.5 0.95862350461 42.0 0.911055084446 42.5 0.911300869622 43.0 0.98479776888 Writing first1.pdf Writing first1.png
<matplotlib.figure.Figure at 0x7f576f2e0c50>
PlotStats(MedianRemaining, iters, ts)
thinkplot.Config(xlabel='t (weeks)',
ylabel='week of pregnancy',
legend=True,
loc='upper right')
thinkplot.Save(root='first2', formats=['pdf', 'png'])
firsts 36.0 3.241368552 36.5 2.77492320164 37.0 2.31279655389 37.5 1.86185466683 38.0 1.43469000445 38.5 1.08499873418 39.0 0.928918281925 39.5 0.942092187176 40.0 0.944034901894 40.5 0.893216546978 41.0 0.794109392844 41.5 0.680121224128 42.0 0.569214418639 42.5 0.506499558605 43.0 0.443257768096 others 36.0 3.08268446578 36.5 2.61050684262 37.0 2.14660335686 37.5 1.69113799843 38.0 1.26087655638 38.5 0.90146729939 39.0 0.707699287694 39.5 0.669730616568 40.0 0.675815864934 40.5 0.735595662149 41.0 0.753391859416 41.5 0.699055415717 42.0 0.623727458023 42.5 0.565583628248 43.0 0.515788745635 Writing first2.pdf Writing first2.png
<matplotlib.figure.Figure at 0x7f57700b1490>
PlotStats(ProbRemainingLess, iters, ts)
thinkplot.Config(xlabel='week of pregnancy',
ylabel='prob(birth within 1 week)',
legend=True,
loc='upper left')
thinkplot.Save(root='first3', formats=['pdf', 'png'])
firsts 36.0 0.0518651832461 36.5 0.0631412127784 37.0 0.087215320911 37.5 0.157749488115 38.0 0.311731315043 38.5 0.466408539138 39.0 0.524979746152 39.5 0.521584861029 40.0 0.519954389966 40.5 0.54363256785 41.0 0.592375366569 41.5 0.665464382326 42.0 0.723489932886 42.5 0.774373259053 43.0 0.815533980583 others 36.0 0.0570483391844 36.5 0.0735008328706 37.0 0.10406256789 37.5 0.18672074508 38.0 0.364137483787 38.5 0.547933884298 39.0 0.631398013751 39.5 0.644706857261 40.0 0.631614654003 40.5 0.607536231884 41.0 0.608486017358 41.5 0.648367952522 42.0 0.687192118227 42.5 0.721030042918 43.0 0.739837398374 Writing first3.pdf Writing first3.png
<matplotlib.figure.Figure at 0x7f579d60d610>