Notes: This notebook prepares the example sleep data - sleep.csv.

The resulting output csv file (sleep.csv) that indicates how certain individual characteristics correlates to the amount of sleep an one gets per week. Rows are the variables correlating with sleep. Columns included the computed pearson correlation coefficient, sample size, p-value, confidence interval (95%), etc. The pingouin is used to compute correlations.

Raw src:

sleep75.csv (/wooldridge/sleep75) from https://vincentarelbundock.github.io/Rdatasets/articles/data.html
See https://rdrr.io/cran/wooldridge/man/sleep75.html for variable labels to the variables in sleep75.csv.

Requirements: Mainly pingouin. See first cell of imports for requirements

In [1]:

import pandas as pd
import numpy as np
import pingouin as pg
import warnings
warnings.filterwarnings('ignore')

_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"
drop_var = ['case', 'leis1', 'leis2', 'leis3']
df = (pd.read_csv(_url, index_col=0)
      .drop(drop_var, axis=1)
     )
df.head(3)

Out[1]:

	age	educ	earns74	gdhlth	inlf	smsa	lhrwage	...	spwrk75	totwrk	worknrm	exper	yrsmarr	hrwage	agesq
1	32	12	0	0	1	0	1.955861	...	0	3438	3438	14	13	7.070004	1024
2	31	14	9500	1	1	0	0.357674	...	0	5020	5020	11	0	1.429999	961
3	44	17	42500	1	1	1	3.021887	...	1	2815	2815	21	0	20.529997	1936

3 rows × 30 columns

In [2]:

# Prep variable lablels (fold cell)
# varlabels: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
df_label = (pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
            .assign(label=lambda df: df['des'].str.encode('ascii', 'ignore').str.decode('ascii'))
            .drop(['des'], axis=1)
            .set_index('var')
            .drop(drop_var)
            .reset_index()
           )

df_label.head(3)

Out[2]:

	var	group	label
0	age	age	in years
1	black	other factors	=1 if black
2	clerical	occupation	=1 if clerical worker

In [3]:

# Compute correlations
df_corr = (pg.pairwise_corr(df)
           .rename(columns={'p-unc': 'p-val'})
           .query('Y=="sleep"|X=="sleep"')
           .assign(var=lambda df: df['X'])
           .assign(var=lambda df: np.where(df['var']=="sleep", df['Y'], df['var']))
           .drop(["Y", "X", "method", "alternative"], axis=1)
           .assign(
               hl=lambda df: [float(ci[1]) for ci in df['CI95%']],
               ll=lambda df: [float(ci[0]) for ci in df['CI95%']],
               moerror=lambda df: df['hl'] - df['r'],
               power=lambda df: df.power.round(decimals=2),
               n=lambda df: df.n.map(str)
           )
           # Get labels
           .merge(df_label, how='left', on='var', validate='1:1')
           .reset_index(drop=True)
          )
df_corr

Out[3]:

	n	r	CI95%	p-val	BF10	power	var	hl	ll	moerror	group	label
0	706	0.090373	[0.02, 0.16]	1.630887e-02	0.839	0.67	age	0.16	0.02	0.069627	age	in years
1	706	-0.027057	[-0.1, 0.05]	4.728889e-01	0.061	0.11	black	0.05	-0.10	0.077057	other factors	=1 if black
2	706	0.048081	[-0.03, 0.12]	2.019484e-01	0.106	0.25	clerical	0.12	-0.03	0.071919	occupation	=1 if clerical worker
3	706	0.041229	[-0.03, 0.11]	2.739475e-01	0.086	0.19	construc	0.11	-0.03	0.068771	occupation	=1 if construction worker
4	706	-0.095004	[-0.17, -0.02]	1.155151e-02	1.137	0.72	educ	-0.02	-0.17	0.075004	labor factors	years of schooling
5	706	-0.076890	[-0.15, -0.0]	4.110934e-02	0.378	0.53	earns74	-0.00	-0.15	0.076890	labor factors	total earnings, 1974
6	706	-0.102825	[-0.18, -0.03]	6.246660e-03	1.967	0.78	gdhlth	-0.03	-0.18	0.072825	health factors	=1 if in good or excel. health
7	706	-0.027126	[-0.1, 0.05]	4.717698e-01	0.061	0.11	inlf	0.05	-0.10	0.077126	labor factors	=1 if in labor force
8	706	-0.066997	[-0.14, 0.01]	7.524015e-02	0.229	0.43	smsa	0.01	-0.14	0.076997	area of residence	=1 if live in smsa
9	532	-0.067197	[-0.15, 0.02]	1.216222e-01	0.179	0.34	lhrwage	0.02	-0.15	0.087197	labor factors	log hourly wage
10	706	0.036661	[-0.04, 0.11]	3.306971e-01	0.076	0.16	lothinc	0.11	-0.04	0.073339	labor factors	log othinc, unless othinc < 0
11	706	-0.035909	[-0.11, 0.04]	3.407214e-01	0.074	0.16	male	0.04	-0.11	0.075909	other factors	=1 if male
12	706	0.053757	[-0.02, 0.13]	1.536188e-01	0.13	0.30	marr	0.13	-0.02	0.076243	family factors	=1 if married
13	706	0.027147	[-0.05, 0.1]	4.714176e-01	0.061	0.11	prot	0.10	-0.05	0.072853	other factors	=1 if Protestant
14	706	0.867744	[0.85, 0.88]	6.051022e-216	6.697e+211	1.00	rlxall	0.88	0.85	0.012256	other sleep factors	slpnaps + personal activs
15	706	0.001782	[-0.07, 0.08]	9.623058e-01	0.047	0.05	selfe	0.08	-0.07	0.078218	labor factors	=1 if self employed
16	706	0.893043	[0.88, 0.91]	2.339108e-246	1.38e+242	1.00	slpnaps	0.91	0.88	0.016957	other sleep factors	minutes sleep, inc. naps
17	706	0.078600	[0.0, 0.15]	3.679946e-02	0.415	0.55	south	0.15	0.00	0.071400	area of residence	=1 if live in south
18	706	0.007881	[-0.07, 0.08]	8.344125e-01	0.048	0.06	spsepay	0.08	-0.07	0.072119	other factors	spousal wage income
19	706	0.007868	[-0.07, 0.08]	8.346888e-01	0.048	0.05	spwrk75	0.08	-0.07	0.072132	other factors	=1 if spouse works
20	706	-0.321384	[-0.39, -0.25]	1.994095e-18	1.961e+15	1.00	totwrk	-0.25	-0.39	0.071384	labor factors	mins worked per week
21	706	0.009965	[-0.06, 0.08]	7.915440e-01	0.049	0.06	union	0.08	-0.06	0.070035	labor factors	=1 if belong to union
22	706	-0.322300	[-0.39, -0.25]	1.577335e-18	2.471e+15	1.00	worknrm	-0.25	-0.39	0.072300	labor factors	mins work main job
23	706	0.001139	[-0.07, 0.07]	9.759034e-01	0.047	0.05	workscnd	0.07	-0.07	0.068861	labor factors	mins work second job
24	706	0.104191	[0.03, 0.18]	5.587422e-03	2.175	0.79	exper	0.18	0.03	0.075809	labor factors	age - educ - 6
25	706	-0.013262	[-0.09, 0.06]	7.250012e-01	0.05	0.06	yngkid	0.06	-0.09	0.073262	family factors	=1 if children < 3 present
26	706	0.063997	[-0.01, 0.14]	8.928507e-02	0.199	0.40	yrsmarr	0.14	-0.01	0.076003	family factors	years married
27	532	-0.049450	[-0.13, 0.04]	2.548774e-01	0.104	0.21	hrwage	0.04	-0.13	0.089450	labor factors	hourly wage
28	706	0.099722	[0.03, 0.17]	8.010946e-03	1.574	0.76	agesq	0.17	0.03	0.070278	age	age^2

In [4]:

df_corr.to_csv('data/sleep-untruncated.csv', index=False)

_drop = ['earns74', 'inlf', 'lothinc', 'workscnd', 'lhrwage', 'worknrm', 
         'spwrk75', 'marr', 'black', 'agesq', 'union', 'exper', 'rlxall', 'slpnaps']
df_corr.query('var not in @_drop').to_csv('data/sleep.csv', index=False)

In [5]:

_cols = ['var', 'r', 'moerror', 'label', 'group', 'll', 'hl', 'n', 'power', 'p-val']
print(df_corr[_cols].head(3).to_markdown())

|    | var      |          r |   moerror | label                 | group         |    ll |   hl |   n |   power |     p-val |
|---:|:---------|-----------:|----------:|:----------------------|:--------------|------:|-----:|----:|--------:|----------:|
|  0 | age      |  0.0903729 | 0.0696271 | in years              | age           |  0.02 | 0.16 | 706 |    0.67 | 0.0163089 |
|  1 | black    | -0.0270573 | 0.0770573 | =1 if black           | other factors | -0.1  | 0.05 | 706 |    0.11 | 0.472889  |
|  2 | clerical |  0.0480811 | 0.0719189 | =1 if clerical worker | occupation    | -0.03 | 0.12 | 706 |    0.25 | 0.201948  |

In [ ]: