from __future__ import print_function, division
import survival
import thinkstats2
import thinkplot
import gzip
import pandas
import numpy as np
%matplotlib inline
# Peek at the raw data file: read the first 3553 characters and print them.
# 3553 is the same chunk size add_newlines() uses per record, so this is
# presumably exactly one fixed-width record.
filename = '1988FemRespData.dat'
# The context manager closes the handle even if the read fails.
with open(filename, 'r') as fin:
    line = fin.read(3553)
print(line)
--------------------------------------------------------------------------- IOError Traceback (most recent call last) <ipython-input-2-2fcebc07c2d8> in <module>() 1 filename = '1988FemRespData.dat' ----> 2 fin = open(filename, 'r') 3 line = fin.read(3553) 4 print(line) IOError: [Errno 2] No such file or directory: '1988FemRespData.dat'
def add_newlines(filename='1988FemRespData.dat',
                 outname='1988FemRespDataLines.dat',
                 record_len=3553,
                 num_records=8450):
    """Rewrite a file of back-to-back fixed-width records as one per line.

    Reads up to `num_records` chunks of `record_len` characters from
    `filename` and writes each chunk followed by a newline to `outname`.
    The defaults reproduce the original hard-coded behaviour.

    filename: path of the input file (records with no separators)
    outname: path of the output file (overwritten if it exists)
    record_len: number of characters per record
    num_records: maximum number of records to copy

    Raises IOError/OSError if either file cannot be opened.
    """
    # Both handles are closed on exit, including when a read/write fails.
    with open(filename, 'r') as fin, open(outname, 'w') as fout:
        for _ in range(num_records):
            record = fin.read(record_len)
            if not record:
                # Input ran out early: stop instead of padding the output
                # with empty lines.
                break
            fout.write(record + '\n')
# Eyeball the gzipped, newline-delimited file: print an 80-character ruler
# of repeating digits, then the first two records underneath it.
filename = '1988FemRespDataLines.dat.gz'
# The context manager closes the gzip handle once the preview is done.
with gzip.open(filename, 'r') as fp:
    s = '0123456789'
    print(s*8)
    for i, line in enumerate(fp):
        print(line)
        # i is 0-based, so this stops after the second line.
        if i > 0:
            break
01234567890123456789012345678901234567890123456789012345678901234567890123456789 0000100 201061 6 00723 0232821111 1 1 1 1 161 00000 0000011 01044 2 1 213318 11 0000000000 2 000022 2 0000000000 0000000000 0000000000 2 111111111111111195060207241200000 00000 00000000000000000000000000000000 00000 00000 00000 00000 00000 00000 00000 00000 00000 01061222 00000 00000 00000 00000 00000 1 00000 2 2 1021 969602 2 01103536 00000 00000 00000 00000 2 0000000000 2222 2 21000003000000 2 51000003000000 51000200040506000000152 222222 1 2048180125 440004 2 000200000000000000000000000000002 5 00000 00000 00000 00000 0000000000000000000000000 0000000000000000000000000 00000000000000000000 00000 00000 00000 00000 00000 1010000000000070020600163017452210104810000055061200 1 11222222210606011999999999999999999999999999999 01061 628279931161 2600000113181100000000000000000 00002 30 00 42 5 6 9696 0201 33333 2 1 51 5100020004050600000015 00 1 62114701312 200 2812 2 2 2053249105343590534359057514407137921000000000000000000000000000 00000 00000000000000000000000 0000000000000000 0 000000 0000000000000000 0000000000000000000000000 0 00 00000000000000000000000000000000 00000000 1262234 0 0 1547411 1625872 0 0 1563075 0 1394346 1455117 0 1530365 0 0 1530459 0 1384579 1584067 0 0 1390994 0 1449819 1492664 0 1616948 0 0 1275054 0 1302225 1289356 0 0 1483580 1440527 0 0 1325360 0 1740542 1504652 0 1466335 0 1373721 0 1494901 0 0 1492668 0 1530022 0 1449765 0 1661727 1370854 0 0 0 1463082 0 1522949 0 1538348 0 1457197 0 1327488 0 1491723 0 1621816 0 1478120 0 0 1485925 0 1413752 0 1503169 0 1532063 0 1370681 1200511 0 0 1307784 1440308 0 0 1430987 0 1619480 1500545 0 0000200 201059 2600614 0053732222 1 1 1 1 112 00000 000002 200843 2 2307198217 222 0100000102 1 0201212 2 0000000000 0000000000 0000000000 2 1121121112112112 060102432 00784 00000 302010530105400000000000000000000122 00000 00000 00000 00000 00000 00000 00000 00000 222 1 2 00000 1 
22222222222222220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000050005000000000000000000000006000600060006000600001059221 00000 00000 00000 00000 00000 211 010000000000002910541 02 1 221 00000 2 2222222222221 190826 101000000000000000000000700020000000000785 00000 13222 0000000000 2221 2 21010000000500 1 52 100020004050600000015100005 222222 30000010062 96222 1014336121 110030 2 010000000000000000000000000000002 3 0 00000 00000 00000 00000 0000000000000000000000000 0000000000000000000000000 00000000000000000000 01014 100964030 2 00000 00681 101010030 2 010000000000000000000000000000002 12 2 12 1908261 010000000000070020900170014537620102710000040031200 400000005000000 453762032222221222100501100 00 01059 237370553103 190000020417120202010100000102000793 8882669962751408007841492100793 4 00011995995 222222222222222222200002333300 0078414322422 5 3 01019960002 14 33 221221 000000020000000033333 2 1 52 1000200040506000000153 275 00 0096403024 62121601211 1002917316737011416 21491001 2023575002357500235750028185303670223000000000000000000000000000 00000 00000000000000000000000 0000000000000000 0 000000 0000000000000000 0000000000000000000000000 0 00 00000000000000000000000000001000 000000001 0 697085 1108855 0 0 553996 0 799192 777176 0 672263 0 0 669647 642954 0 472556 0 0 955138 0 540935 735102 0 835328 0 0 836293 732392 0 0 690691 520363 0 1202312 0 0 516871 556760 0 0 752986 845990 0 561170 0 767332 0 547626 0 518231 0 0 595288 0 1506157 0 422700 0 544607 0 643258 0 1045080 0 763247 0 626630 538157 896762 0 562327 584254 0 820451 0 496369 0 863794 0 0 599524 0 620449 0 705749 0 977984 598224 0 576808 0 0 655070 485218 0 0 868572 542184 0
# Load six fixed-width fields from the gzipped, one-record-per-line file.
filename = '1988FemRespDataLines.dat.gz'

names = ['finalwgt', 'ageint', 'currentcm', 'firstcm', 'cmintvw', 'cmbirth']

# read_fwf takes half-open [from, to) character offsets. Each start below is
# the 1-based first column minus one; the end is the 1-based last column.
colspecs = [
    (2567, 2574),   # finalwgt:  columns 2568-2574
    (35, 37),       # ageint:    columns 36-37
    (1520, 1525),   # currentcm: columns 1521-1525
    (1537, 1542),   # firstcm:   columns 1538-1542
    (11, 16),       # cmintvw:   columns 12-16
    (25, 30),       # cmbirth:   columns 26-30
]

# header=None: the file has no header row, so names supplies the labels.
df = pandas.read_fwf(
    filename,
    colspecs=colspecs,
    names=names,
    header=None,
    compression='gzip',
)
# Row-count check: add_newlines() writes 8450 records, so the frame should
# have the same number of rows.
len(df) # should be 8450
8450
# Frequency table of ageint (presumably age at interview), ordered by value,
# as a quick check that the column offsets picked up plausible numbers.
df.ageint.value_counts().sort_index()
14 3 15 210 16 240 17 263 18 258 19 260 20 259 21 238 22 280 23 240 24 292 25 308 26 322 27 330 28 292 29 355 30 342 31 335 32 370 33 328 34 314 35 296 36 305 37 314 38 251 39 270 40 263 41 257 42 247 43 194 44 197 45 17 dtype: int64
# Frequency table of currentcm, ordered by value. The recorded output shows
# 0, 99999 and a handful of values above 90000 next to the ordinary range.
df.currentcm.value_counts().sort_index()
0 4419 685 1 705 1 722 1 723 1 726 2 728 1 729 2 730 3 731 2 732 1 734 1 735 1 736 1 738 3 ... 1056 26 1057 14 1058 11 1059 6 1060 6 1061 3 1062 1 90781 1 90834 1 90978 1 90990 1 91018 1 91026 1 91038 2 99999 16 Length: 344, dtype: int64
# Treat 0 and 99999 as missing (presumably "not applicable"/"unknown" codes).
# Assign the result back rather than using inplace=True on df.currentcm:
# in-place edits through attribute access are chained assignment, which
# recent pandas versions warn about and may not propagate to df.
df['currentcm'] = df.currentcm.replace([0, 99999], np.nan)
# Remaining values above 90000 are shifted down by 90000 (presumably a flag
# added to an otherwise ordinary value -- TODO confirm against the codebook).
df.loc[df.currentcm > 90000, 'currentcm'] -= 90000
# Frequency table of firstcm, ordered by value. The recorded output shows
# 0, 99999 and values above 90000 here as well.
df.firstcm.value_counts().sort_index()
0 6452 706 1 708 2 710 2 711 1 712 1 714 2 720 2 722 3 724 1 726 5 727 1 728 1 729 1 730 3 ... 90886 1 90894 6 90901 1 90906 1 90907 1 90918 2 90922 1 90930 1 90942 1 90943 1 90978 2 90979 1 90990 1 91002 1 99999 19 Length: 355, dtype: int64
# Treat 0 and 99999 as missing (presumably "not applicable"/"unknown" codes).
# Assign the result back rather than using inplace=True on df.firstcm:
# in-place edits through attribute access are chained assignment, which
# recent pandas versions warn about and may not propagate to df.
df['firstcm'] = df.firstcm.replace([0, 99999], np.nan)
# Remaining values above 90000 are shifted down by 90000 (presumably a flag
# added to an otherwise ordinary value -- TODO confirm against the codebook).
df.loc[df.firstcm > 90000, 'firstcm'] -= 90000
# Build cmmarrhx from currentcm, falling back to firstcm where currentcm is
# missing. Series.fillna returns a new Series, so the result must be assigned;
# the earlier bare `df.cmmarrhx.fillna(df.firstcm)` call discarded it and left
# the column identical to currentcm (the recorded null count of 4435 below
# predates this fix and reflects that).
# NOTE(review): currentcm takes precedence over firstcm when both are present;
# confirm that is the intended priority.
df['cmmarrhx'] = df.currentcm.fillna(df.firstcm)
# Number of rows that still have no marriage date after the fallback.
df.cmmarrhx.isnull().sum()
4435
# Frequency table of cmintvw, ordered by value. The recorded output contains
# only the eight consecutive values 1057-1064.
df.cmintvw.value_counts().sort_index()
1057 545 1058 2034 1059 2288 1060 1701 1061 874 1062 682 1063 191 1064 135 dtype: int64
# Frequency table of cmbirth, ordered by value. The recorded output runs
# from 519 to 879 with no sentinel-looking outliers.
df.cmbirth.value_counts().sort_index()
519 10 520 14 521 20 522 17 523 17 524 18 525 16 526 20 527 13 528 16 529 25 530 14 531 19 532 11 533 16 ... 865 31 866 13 867 15 868 15 869 22 870 18 871 23 872 20 873 28 874 15 875 17 876 22 877 22 878 11 879 7 Length: 361, dtype: int64
# Hand the frame to survival.CleanData; presumably it adds the derived
# columns (agemarry, age, decade, fives) that appear in the table below.
survival.CleanData(df)

# evrmarry is True exactly where a marriage date (cmmarrhx) is present.
df['evrmarry'] = df.cmmarrhx.notnull()

# Display the resulting frame.
df
finalwgt | ageint | currentcm | firstcm | cmintvw | cmbirth | cmmarrhx | agemarry | age | decade | fives | evrmarry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 713792 | 28 | NaN | NaN | 1061 | 723 | NaN | NaN | 28.166667 | 6 | 12 | False |
1 | 367022 | 37 | NaN | NaN | 1059 | 614 | NaN | NaN | 37.083333 | 5 | 10 | False |
2 | 975924 | 21 | NaN | NaN | 1057 | 796 | NaN | NaN | 21.750000 | 6 | 13 | False |
3 | 587796 | 39 | NaN | 838 | 1057 | 581 | NaN | NaN | 39.666667 | 4 | 9 | False |
4 | 719633 | 31 | 974 | 882 | 1062 | 683 | 974 | 24.250000 | 31.583333 | 5 | 11 | True |
5 | 730622 | 17 | NaN | NaN | 1060 | 844 | NaN | NaN | 18.000000 | 7 | 14 | False |
6 | 608474 | 39 | 848 | 809 | 1057 | 578 | 848 | 22.500000 | 39.916667 | 4 | 9 | True |
7 | 777787 | 30 | NaN | 924 | 1058 | 696 | NaN | NaN | 30.166667 | 5 | 11 | False |
8 | 1030290 | 22 | NaN | NaN | 1059 | 791 | NaN | NaN | 22.333333 | 6 | 13 | False |
9 | 639364 | 18 | NaN | NaN | 1059 | 836 | NaN | NaN | 18.583333 | 6 | 13 | False |
10 | 820707 | 34 | 947 | NaN | 1058 | 642 | 947 | 25.416667 | 34.666667 | 5 | 10 | True |
11 | 766942 | 29 | NaN | 957 | 1058 | 699 | NaN | NaN | 29.916667 | 5 | 11 | False |
12 | 712940 | 30 | 960 | NaN | 1061 | 696 | 960 | 22.000000 | 30.416667 | 5 | 11 | True |
13 | 748807 | 38 | 848 | NaN | 1060 | 595 | 848 | 21.083333 | 38.750000 | 4 | 9 | True |
14 | 682754 | 33 | NaN | 937 | 1058 | 652 | NaN | NaN | 33.833333 | 5 | 10 | False |
15 | 773938 | 38 | NaN | 894 | 1057 | 592 | NaN | NaN | 38.750000 | 4 | 9 | False |
16 | 787966 | 33 | NaN | NaN | 1060 | 659 | NaN | NaN | 33.416667 | 5 | 10 | False |
17 | 773938 | 35 | 942 | NaN | 1060 | 629 | 942 | 26.083333 | 35.916667 | 5 | 10 | True |
18 | 642272 | 30 | 977 | NaN | 1060 | 688 | 977 | 24.083333 | 31.000000 | 5 | 11 | True |
19 | 733060 | 30 | 918 | NaN | 1060 | 693 | 918 | 18.750000 | 30.583333 | 5 | 11 | True |
20 | 734133 | 37 | 835 | NaN | 1060 | 610 | 835 | 18.750000 | 37.500000 | 5 | 10 | True |
21 | 745297 | 25 | 1031 | NaN | 1060 | 754 | 1031 | 23.083333 | 25.500000 | 6 | 12 | True |
22 | 799872 | 35 | 1039 | 870 | 1060 | 635 | 1039 | 33.666667 | 35.416667 | 5 | 10 | True |
23 | 807851 | 20 | NaN | NaN | 1060 | 811 | NaN | NaN | 20.750000 | 6 | 13 | False |
24 | 807851 | 20 | NaN | NaN | 1060 | 809 | NaN | NaN | 20.916667 | 6 | 13 | False |
25 | 929923 | 43 | 824 | NaN | 1060 | 532 | 824 | 24.333333 | 44.000000 | 4 | 8 | True |
26 | 705780 | 35 | 846 | NaN | 1060 | 629 | 846 | 18.083333 | 35.916667 | 5 | 10 | True |
27 | 724044 | 38 | 945 | 848 | 1060 | 596 | 945 | 29.083333 | 38.666667 | 4 | 9 | True |
28 | 690662 | 33 | 875 | NaN | 1060 | 655 | 875 | 18.333333 | 33.750000 | 5 | 10 | True |
29 | 2183253 | 15 | NaN | NaN | 1058 | 866 | NaN | NaN | 16.000000 | 7 | 14 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8420 | 2036322 | 39 | NaN | 825 | 1058 | 585 | NaN | NaN | 39.416667 | 4 | 9 | False |
8421 | 2478173 | 30 | 992 | NaN | 1059 | 693 | 992 | 24.916667 | 30.500000 | 5 | 11 | True |
8422 | 2174687 | 29 | NaN | 946 | 1058 | 701 | NaN | NaN | 29.750000 | 5 | 11 | False |
8423 | 2301277 | 40 | 821 | NaN | 1058 | 570 | 821 | 20.916667 | 40.666667 | 4 | 9 | True |
8424 | 2594495 | 25 | NaN | NaN | 1059 | 756 | NaN | NaN | 25.250000 | 6 | 12 | False |
8425 | 1963379 | 43 | 765 | NaN | 1058 | 536 | 765 | 19.083333 | 43.500000 | 4 | 8 | True |
8426 | 2276609 | 34 | 936 | NaN | 1058 | 646 | 936 | 24.166667 | 34.333333 | 5 | 10 | True |
8427 | 2056998 | 37 | 896 | NaN | 1058 | 604 | 896 | 24.333333 | 37.833333 | 5 | 10 | True |
8428 | 2592015 | 24 | NaN | NaN | 1058 | 763 | NaN | NaN | 24.583333 | 6 | 12 | False |
8429 | 2486216 | 36 | 961 | NaN | 1059 | 619 | 961 | 28.500000 | 36.666667 | 5 | 10 | True |
8430 | 2624510 | 32 | NaN | NaN | 1058 | 668 | NaN | NaN | 32.500000 | 5 | 11 | False |
8431 | 2460840 | 25 | 1052 | NaN | 1060 | 752 | 1052 | 25.000000 | 25.666667 | 6 | 12 | True |
8432 | 2384200 | 25 | NaN | NaN | 1058 | 748 | NaN | NaN | 25.833333 | 6 | 12 | False |
8433 | 2384200 | 27 | NaN | NaN | 1058 | 723 | NaN | NaN | 27.916667 | 6 | 12 | False |
8434 | 2126824 | 23 | 1044 | NaN | 1058 | 779 | 1044 | 22.083333 | 23.250000 | 6 | 12 | True |
8435 | 1925357 | 27 | 1044 | NaN | 1059 | 728 | 1044 | 26.333333 | 27.583333 | 6 | 12 | True |
8436 | 2105497 | 26 | NaN | NaN | 1058 | 745 | NaN | NaN | 26.083333 | 6 | 12 | False |
8437 | 2518126 | 23 | NaN | NaN | 1058 | 776 | NaN | NaN | 23.500000 | 6 | 12 | False |
8438 | 2384200 | 26 | NaN | NaN | 1059 | 746 | NaN | NaN | 26.083333 | 6 | 12 | False |
8439 | 2518126 | 23 | NaN | NaN | 1059 | 772 | NaN | NaN | 23.916667 | 6 | 12 | False |
8440 | 2549695 | 33 | NaN | NaN | 1059 | 656 | NaN | NaN | 33.583333 | 5 | 10 | False |
8441 | 2518126 | 24 | NaN | NaN | 1058 | 764 | NaN | NaN | 24.500000 | 6 | 12 | False |
8442 | 645391 | 31 | 929 | NaN | 1059 | 679 | 929 | 20.833333 | 31.666667 | 5 | 11 | True |
8443 | 2986139 | 26 | 997 | NaN | 1058 | 740 | 997 | 21.416667 | 26.500000 | 6 | 12 | True |
8444 | 2092079 | 34 | 978 | NaN | 1058 | 642 | 978 | 28.000000 | 34.666667 | 5 | 10 | True |
8445 | 2251351 | 26 | NaN | NaN | 1059 | 740 | NaN | NaN | 26.583333 | 6 | 12 | False |
8446 | 2251351 | 26 | NaN | NaN | 1058 | 736 | NaN | NaN | 26.833333 | 6 | 12 | False |
8447 | 2384200 | 26 | NaN | NaN | 1058 | 741 | NaN | NaN | 26.416667 | 6 | 12 | False |
8448 | 1469892 | 38 | 931 | 839 | 1063 | 606 | 931 | 27.083333 | 38.083333 | 5 | 10 | True |
8449 | 2620612 | 30 | 1014 | NaN | 1063 | 693 | 1014 | 26.750000 | 30.833333 | 5 | 11 | True |
8450 rows × 12 columns
# Plot the CDF of the gap between the age column and the reported integer
# ageint. (df.age is not one of the columns read from the file; presumably
# it is computed by survival.CleanData.)
age_gap = df.age - df.ageint
cdf = thinkstats2.Cdf(age_gap)
thinkplot.Cdf(cdf)
{'xscale': 'linear', 'yscale': 'linear'}
# Plot the CDF of agemarry, then report how many rows have a value for it.
marriage_ages = df.agemarry
cdf = thinkstats2.Cdf(marriage_ages)
thinkplot.Cdf(cdf)

# Count of non-missing agemarry values.
observed = marriage_ages.dropna()
len(observed)
4015