import pandas as pd
pd.set_option("display.precision", 3)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
import warnings
import janitor
import numpy as np
import pingouin as pg
warnings.filterwarnings('ignore')
_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"
drop_var = ['case', 'leis1', 'leis2', 'leis3']
df = (pd.read_csv(_url, index_col=0)
.drop(drop_var, axis=1)
# .assign(lspsepay=lambda df: np.log1p(df.spsepay))
)
df.head(3)
age | black | clerical | construc | educ | earns74 | gdhlth | inlf | smsa | lhrwage | ... | spwrk75 | totwrk | union | worknrm | workscnd | exper | yngkid | yrsmarr | hrwage | agesq | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 32 | 0 | 0.00000 | 0.00000 | 12 | 0 | 0 | 1 | 0 | 1.95586 | ... | 0 | 3438 | 0 | 3438 | 0 | 14 | 0 | 13 | 7.07000 | 1024 |
2 | 31 | 0 | 0.00000 | 0.00000 | 14 | 9500 | 1 | 1 | 0 | 0.35767 | ... | 0 | 5020 | 0 | 5020 | 0 | 11 | 0 | 0 | 1.43000 | 961 |
3 | 44 | 0 | 0.00000 | 0.00000 | 17 | 42500 | 1 | 1 | 1 | 3.02189 | ... | 1 | 2815 | 0 | 2815 | 0 | 21 | 0 | 0 | 20.53000 | 1936 |
3 rows × 30 columns
# Prep variable lablels (fold cell)
# varlabels: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
df_label = (pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
.assign(label=lambda df: df['des'].str.encode('ascii', 'ignore').str.decode('ascii'))
.drop(['des'], axis=1)
.set_index('var')
.drop(drop_var)
.reset_index()
)
df_label.head(3)
var | group | label | |
---|---|---|---|
0 | age | age | in years |
1 | black | other factors | =1 if black |
2 | clerical | occupation | =1 if clerical worker |
stdopts = {'relimp': False, 'remove_na': True}
x = [
"age",
"black",
"clerical",
"construc",
"educ",
"gdhlth",
"inlf",
"smsa",
"lhrwage",
"prot",
"selfe",
"south",
"spsepay",
"totwrk",
"yrsmarr",
]
lm_all = (pg.linear_regression(df[x], df['sleep'], **stdopts)
.assign(model='all')
)
lm_male = (pg.linear_regression(df.query('male==1')[x], df.query('male==1')['sleep'], **stdopts)
.assign(model='men')
)
lm_female = (pg.linear_regression(df.query('male==0')[x], df.query('male==0')['sleep'], **stdopts)
.assign(model='women')
)
lm_kids = (pg.linear_regression(df.query('yngkid==1')[x], df.query('yngkid==1')['sleep'], **stdopts)
.assign(model='young kids')
)
df_results = (pd.concat([lm_all, lm_female, lm_male, lm_kids])
.query('names!="Intercept"')
.reset_index(drop=True)
# Get labels
.rename_column('names', 'var')
.merge(df_label, how='left', on='var', validate='m:1')
.sort_values(['var', 'model', 'group'])
.reset_index(drop=True)
# Tidy up columns
.rename_column("CI[2.5%]", "ll")
.rename_column("CI[97.5%]", "hl")
)
df_results
var | coef | se | T | pval | r2 | adj_r2 | ll | hl | model | group | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | age | 0.99489 | 1.96925 | 0.50521 | 0.61362 | 0.12729 | 0.10366 | -2.87382 | 4.86360 | all | age | in years |
1 | age | 1.48289 | 2.88546 | 0.51392 | 0.60772 | 0.18038 | 0.13911 | -4.19724 | 7.16302 | men | age | in years |
2 | age | 0.95320 | 2.94321 | 0.32386 | 0.74634 | 0.10911 | 0.05342 | -4.84672 | 6.75312 | women | age | in years |
3 | age | 22.63402 | 15.49532 | 1.46070 | 0.14931 | 0.17815 | -0.01362 | -8.36124 | 53.62927 | young kids | age | in years |
4 | black | -84.79661 | 82.15012 | -1.03222 | 0.30245 | 0.12729 | 0.10366 | -246.18572 | 76.59249 | all | other factors | =1 if black |
5 | black | -132.96711 | 132.58117 | -1.00291 | 0.31678 | 0.18038 | 0.13911 | -393.95764 | 128.02342 | men | other factors | =1 if black |
6 | black | -68.35063 | 108.90175 | -0.62764 | 0.53088 | 0.10911 | 0.05342 | -282.95362 | 146.25236 | women | other factors | =1 if black |
7 | black | -119.74104 | 296.43295 | -0.40394 | 0.68769 | 0.17815 | -0.01362 | -712.69523 | 473.21315 | young kids | other factors | =1 if black |
8 | clerical | 22.50583 | 48.96326 | 0.45965 | 0.64596 | 0.12729 | 0.10366 | -73.68559 | 118.69725 | all | occupation | =1 if clerical worker |
9 | clerical | -229.27596 | 102.88822 | -2.22840 | 0.02665 | 0.18038 | 0.13911 | -431.81492 | -26.73700 | men | occupation | =1 if clerical worker |
10 | clerical | 106.50726 | 61.12732 | 1.74238 | 0.08281 | 0.10911 | 0.05342 | -13.95091 | 226.96543 | women | occupation | =1 if clerical worker |
11 | clerical | -239.90257 | 211.76020 | -1.13290 | 0.26176 | 0.17815 | -0.01362 | -663.48604 | 183.68089 | young kids | occupation | =1 if clerical worker |
12 | construc | 113.92956 | 105.04514 | 1.08458 | 0.27861 | 0.12729 | 0.10366 | -92.43824 | 320.29736 | all | occupation | =1 if construction worker |
13 | construc | 65.69606 | 107.88921 | 0.60892 | 0.54307 | 0.18038 | 0.13911 | -146.68751 | 278.07963 | men | occupation | =1 if construction worker |
14 | construc | 361.75643 | 428.26483 | 0.84470 | 0.39918 | 0.10911 | 0.05342 | -482.18695 | 1205.69980 | women | occupation | =1 if construction worker |
15 | construc | -186.41520 | 358.22350 | -0.52039 | 0.60471 | 0.17815 | -0.01362 | -902.96889 | 530.13849 | young kids | occupation | =1 if construction worker |
16 | educ | -7.21056 | 7.51674 | -0.95927 | 0.33787 | 0.12729 | 0.10366 | -21.97766 | 7.55654 | all | labor factors | years of schooling |
17 | educ | -7.54542 | 9.72034 | -0.77625 | 0.43826 | 0.18038 | 0.13911 | -26.68024 | 11.58940 | men | labor factors | years of schooling |
18 | educ | -6.27840 | 11.86988 | -0.52894 | 0.59737 | 0.10911 | 0.05342 | -29.66932 | 17.11252 | women | labor factors | years of schooling |
19 | educ | -20.97212 | 26.97403 | -0.77749 | 0.43992 | 0.17815 | -0.01362 | -74.92821 | 32.98398 | young kids | labor factors | years of schooling |
20 | gdhlth | -82.58043 | 58.20472 | -1.41879 | 0.15656 | 0.12729 | 0.10366 | -196.92727 | 31.76641 | all | health factors | =1 if in good or excel. health |
21 | gdhlth | -188.28451 | 80.47654 | -2.33962 | 0.02001 | 0.18038 | 0.13911 | -346.70531 | -29.86371 | men | health factors | =1 if in good or excel. health |
22 | gdhlth | -24.49158 | 87.71687 | -0.27921 | 0.78034 | 0.10911 | 0.05342 | -197.34740 | 148.36424 | women | health factors | =1 if in good or excel. health |
23 | gdhlth | -141.81498 | 203.21302 | -0.69786 | 0.48796 | 0.17815 | -0.01362 | -548.30154 | 264.67159 | young kids | health factors | =1 if in good or excel. health |
24 | lhrwage | 21.60570 | 31.98930 | 0.67540 | 0.49972 | 0.12729 | 0.10366 | -41.23930 | 84.45070 | all | labor factors | log hourly wage |
25 | lhrwage | 3.54475 | 47.04592 | 0.07535 | 0.93999 | 0.18038 | 0.13911 | -89.06675 | 96.15625 | men | labor factors | log hourly wage |
26 | lhrwage | -22.58048 | 53.78232 | -0.41985 | 0.67500 | 0.10911 | 0.05342 | -128.56451 | 83.40355 | women | labor factors | log hourly wage |
27 | lhrwage | 35.60932 | 122.32374 | 0.29111 | 0.77197 | 0.17815 | -0.01362 | -209.07459 | 280.29322 | young kids | labor factors | log hourly wage |
28 | prot | -9.24943 | 39.77146 | -0.23256 | 0.81619 | 0.12729 | 0.10366 | -87.38298 | 68.88412 | all | other factors | =1 if Protestant |
29 | prot | -20.96733 | 52.20106 | -0.40166 | 0.68824 | 0.18038 | 0.13911 | -123.72689 | 81.79223 | men | other factors | =1 if Protestant |
30 | prot | 1.06664 | 61.89805 | 0.01723 | 0.98627 | 0.10911 | 0.05342 | -120.91033 | 123.04361 | women | other factors | =1 if Protestant |
31 | prot | -24.17312 | 127.56650 | -0.18949 | 0.85035 | 0.17815 | -0.01362 | -279.34411 | 230.99786 | young kids | other factors | =1 if Protestant |
32 | selfe | -21.27418 | 63.30392 | -0.33606 | 0.73696 | 0.12729 | 0.10366 | -145.63872 | 103.09036 | all | labor factors | =1 if self employed |
33 | selfe | 3.48409 | 77.58571 | 0.04491 | 0.96421 | 0.18038 | 0.13911 | -149.24600 | 156.21419 | men | labor factors | =1 if self employed |
34 | selfe | -90.68594 | 112.55650 | -0.80569 | 0.42127 | 0.10911 | 0.05342 | -312.49100 | 131.11913 | women | labor factors | =1 if self employed |
35 | selfe | -370.95651 | 241.15347 | -1.53826 | 0.12924 | 0.17815 | -0.01362 | -853.33528 | 111.42226 | young kids | labor factors | =1 if self employed |
36 | smsa | -40.65356 | 39.50772 | -1.02900 | 0.30396 | 0.12729 | 0.10366 | -118.26897 | 36.96186 | all | area of residence | =1 if live in smsa |
37 | smsa | -27.63143 | 52.41859 | -0.52713 | 0.59852 | 0.18038 | 0.13911 | -130.81921 | 75.55635 | men | area of residence | =1 if live in smsa |
38 | smsa | -57.14640 | 61.52103 | -0.92889 | 0.35394 | 0.10911 | 0.05342 | -178.38042 | 64.08762 | women | area of residence | =1 if live in smsa |
39 | smsa | 43.42957 | 138.68309 | 0.31316 | 0.75525 | 0.17815 | -0.01362 | -233.97791 | 320.83705 | young kids | area of residence | =1 if live in smsa |
40 | south | 82.48698 | 46.37757 | 1.77860 | 0.07589 | 0.12729 | 0.10366 | -8.62469 | 173.59864 | all | area of residence | =1 if live in south |
41 | south | 79.40169 | 64.43983 | 1.23218 | 0.21892 | 0.18038 | 0.13911 | -47.45031 | 206.25369 | men | area of residence | =1 if live in south |
42 | south | 114.62293 | 69.30049 | 1.65400 | 0.09953 | 0.10911 | 0.05342 | -21.94138 | 251.18725 | women | area of residence | =1 if live in south |
43 | south | 71.85138 | 136.01011 | 0.52828 | 0.59925 | 0.17815 | -0.01362 | -200.20935 | 343.91211 | young kids | area of residence | =1 if live in south |
44 | spsepay | -0.00101 | 0.00265 | -0.37974 | 0.70430 | 0.12729 | 0.10366 | -0.00621 | 0.00420 | all | other factors | spousal wage income |
45 | spsepay | 0.00307 | 0.00492 | 0.62334 | 0.53357 | 0.18038 | 0.13911 | -0.00662 | 0.01276 | men | other factors | spousal wage income |
46 | spsepay | -0.00051 | 0.00362 | -0.14032 | 0.88853 | 0.10911 | 0.05342 | -0.00764 | 0.00663 | women | other factors | spousal wage income |
47 | spsepay | -0.00470 | 0.01144 | -0.41085 | 0.68265 | 0.17815 | -0.01362 | -0.02759 | 0.01819 | young kids | other factors | spousal wage income |
48 | totwrk | -0.15192 | 0.02037 | -7.45943 | 0.00000 | 0.12729 | 0.10366 | -0.19193 | -0.11191 | all | labor factors | mins worked per week |
49 | totwrk | -0.19101 | 0.02977 | -6.41715 | 0.00000 | 0.18038 | 0.13911 | -0.24960 | -0.13241 | men | labor factors | mins worked per week |
50 | totwrk | -0.12551 | 0.03271 | -3.83678 | 0.00016 | 0.10911 | 0.05342 | -0.18998 | -0.06105 | women | labor factors | mins worked per week |
51 | totwrk | -0.11299 | 0.07636 | -1.47971 | 0.14418 | 0.17815 | -0.01362 | -0.26574 | 0.03975 | young kids | labor factors | mins worked per week |
52 | yrsmarr | -0.07596 | 2.00936 | -0.03780 | 0.96986 | 0.12729 | 0.10366 | -4.02346 | 3.87155 | all | family factors | years married |
53 | yrsmarr | 0.22412 | 2.93479 | 0.07637 | 0.93918 | 0.18038 | 0.13911 | -5.55312 | 6.00135 | men | family factors | years married |
54 | yrsmarr | -1.41414 | 2.93198 | -0.48231 | 0.63005 | 0.10911 | 0.05342 | -7.19194 | 4.36366 | women | family factors | years married |
55 | yrsmarr | -31.21114 | 16.91401 | -1.84528 | 0.06993 | 0.17815 | -0.01362 | -65.04420 | 2.62192 | young kids | family factors | years married |
df_results.to_csv('../examples/data/sleep-mmodel.csv', index=False)
# _cols = ['var', 'label', 'coef', 'model', 'group', 'pval', 'll', 'hl']
# df_results[_cols].head(6).to_markdown()