Notes: This notebook prepares the example sleep data - sleep.csv.

The resulting output csv file (sleep.csv) that indicates how certain individual characteristics correlates to the amount of sleep an one gets per week. Rows are the variables correlating with sleep. Columns included the computed pearson correlation coefficient, sample size, p-value, confidence interval (95%), etc. The pingouin is used to compute correlations.

Raw src:

Requirements: Mainly pingouin. See first cell of imports for requirements

In [1]:
import pandas as pd
import numpy as np
import pingouin as pg
import warnings
warnings.filterwarnings('ignore')

_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"
drop_var = ['case', 'leis1', 'leis2', 'leis3']
df = (pd.read_csv(_url, index_col=0)
      .drop(drop_var, axis=1)
     )
df.head(3)
Out[1]:
age black clerical construc educ earns74 gdhlth inlf smsa lhrwage ... spwrk75 totwrk union worknrm workscnd exper yngkid yrsmarr hrwage agesq
1 32 0 0.0 0.0 12 0 0 1 0 1.955861 ... 0 3438 0 3438 0 14 0 13 7.070004 1024
2 31 0 0.0 0.0 14 9500 1 1 0 0.357674 ... 0 5020 0 5020 0 11 0 0 1.429999 961
3 44 0 0.0 0.0 17 42500 1 1 1 3.021887 ... 1 2815 0 2815 0 21 0 0 20.529997 1936

3 rows × 30 columns

In [2]:
# Prep variable lablels (fold cell)
# varlabels: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
df_label = (pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
            .assign(label=lambda df: df['des'].str.encode('ascii', 'ignore').str.decode('ascii'))
            .drop(['des'], axis=1)
            .set_index('var')
            .drop(drop_var)
            .reset_index()
           )

df_label.head(3)
Out[2]:
var group label
0 age age in years
1 black other factors =1 if black
2 clerical occupation =1 if clerical worker
In [3]:
# Compute correlations
df_corr = (pg.pairwise_corr(df)
           .rename(columns={'p-unc': 'p-val'})
           .query('Y=="sleep"|X=="sleep"')
           .assign(var=lambda df: df['X'])
           .assign(var=lambda df: np.where(df['var']=="sleep", df['Y'], df['var']))
           .drop(["Y", "X", "method", "alternative"], axis=1)
           .assign(
               hl=lambda df: [float(ci[1]) for ci in df['CI95%']],
               ll=lambda df: [float(ci[0]) for ci in df['CI95%']],
               moerror=lambda df: df['hl'] - df['r'],
               power=lambda df: df.power.round(decimals=2),
               n=lambda df: df.n.map(str)
           )
           # Get labels
           .merge(df_label, how='left', on='var', validate='1:1')
           .reset_index(drop=True)
          )
df_corr
Out[3]:
n r CI95% p-val BF10 power var hl ll moerror group label
0 706 0.090373 [0.02, 0.16] 1.630887e-02 0.839 0.67 age 0.16 0.02 0.069627 age in years
1 706 -0.027057 [-0.1, 0.05] 4.728889e-01 0.061 0.11 black 0.05 -0.10 0.077057 other factors =1 if black
2 706 0.048081 [-0.03, 0.12] 2.019484e-01 0.106 0.25 clerical 0.12 -0.03 0.071919 occupation =1 if clerical worker
3 706 0.041229 [-0.03, 0.11] 2.739475e-01 0.086 0.19 construc 0.11 -0.03 0.068771 occupation =1 if construction worker
4 706 -0.095004 [-0.17, -0.02] 1.155151e-02 1.137 0.72 educ -0.02 -0.17 0.075004 labor factors years of schooling
5 706 -0.076890 [-0.15, -0.0] 4.110934e-02 0.378 0.53 earns74 -0.00 -0.15 0.076890 labor factors total earnings, 1974
6 706 -0.102825 [-0.18, -0.03] 6.246660e-03 1.967 0.78 gdhlth -0.03 -0.18 0.072825 health factors =1 if in good or excel. health
7 706 -0.027126 [-0.1, 0.05] 4.717698e-01 0.061 0.11 inlf 0.05 -0.10 0.077126 labor factors =1 if in labor force
8 706 -0.066997 [-0.14, 0.01] 7.524015e-02 0.229 0.43 smsa 0.01 -0.14 0.076997 area of residence =1 if live in smsa
9 532 -0.067197 [-0.15, 0.02] 1.216222e-01 0.179 0.34 lhrwage 0.02 -0.15 0.087197 labor factors log hourly wage
10 706 0.036661 [-0.04, 0.11] 3.306971e-01 0.076 0.16 lothinc 0.11 -0.04 0.073339 labor factors log othinc, unless othinc < 0
11 706 -0.035909 [-0.11, 0.04] 3.407214e-01 0.074 0.16 male 0.04 -0.11 0.075909 other factors =1 if male
12 706 0.053757 [-0.02, 0.13] 1.536188e-01 0.13 0.30 marr 0.13 -0.02 0.076243 family factors =1 if married
13 706 0.027147 [-0.05, 0.1] 4.714176e-01 0.061 0.11 prot 0.10 -0.05 0.072853 other factors =1 if Protestant
14 706 0.867744 [0.85, 0.88] 6.051022e-216 6.697e+211 1.00 rlxall 0.88 0.85 0.012256 other sleep factors slpnaps + personal activs
15 706 0.001782 [-0.07, 0.08] 9.623058e-01 0.047 0.05 selfe 0.08 -0.07 0.078218 labor factors =1 if self employed
16 706 0.893043 [0.88, 0.91] 2.339108e-246 1.38e+242 1.00 slpnaps 0.91 0.88 0.016957 other sleep factors minutes sleep, inc. naps
17 706 0.078600 [0.0, 0.15] 3.679946e-02 0.415 0.55 south 0.15 0.00 0.071400 area of residence =1 if live in south
18 706 0.007881 [-0.07, 0.08] 8.344125e-01 0.048 0.06 spsepay 0.08 -0.07 0.072119 other factors spousal wage income
19 706 0.007868 [-0.07, 0.08] 8.346888e-01 0.048 0.05 spwrk75 0.08 -0.07 0.072132 other factors =1 if spouse works
20 706 -0.321384 [-0.39, -0.25] 1.994095e-18 1.961e+15 1.00 totwrk -0.25 -0.39 0.071384 labor factors mins worked per week
21 706 0.009965 [-0.06, 0.08] 7.915440e-01 0.049 0.06 union 0.08 -0.06 0.070035 labor factors =1 if belong to union
22 706 -0.322300 [-0.39, -0.25] 1.577335e-18 2.471e+15 1.00 worknrm -0.25 -0.39 0.072300 labor factors mins work main job
23 706 0.001139 [-0.07, 0.07] 9.759034e-01 0.047 0.05 workscnd 0.07 -0.07 0.068861 labor factors mins work second job
24 706 0.104191 [0.03, 0.18] 5.587422e-03 2.175 0.79 exper 0.18 0.03 0.075809 labor factors age - educ - 6
25 706 -0.013262 [-0.09, 0.06] 7.250012e-01 0.05 0.06 yngkid 0.06 -0.09 0.073262 family factors =1 if children < 3 present
26 706 0.063997 [-0.01, 0.14] 8.928507e-02 0.199 0.40 yrsmarr 0.14 -0.01 0.076003 family factors years married
27 532 -0.049450 [-0.13, 0.04] 2.548774e-01 0.104 0.21 hrwage 0.04 -0.13 0.089450 labor factors hourly wage
28 706 0.099722 [0.03, 0.17] 8.010946e-03 1.574 0.76 agesq 0.17 0.03 0.070278 age age^2
In [4]:
df_corr.to_csv('data/sleep-untruncated.csv', index=False)

_drop = ['earns74', 'inlf', 'lothinc', 'workscnd', 'lhrwage', 'worknrm', 
         'spwrk75', 'marr', 'black', 'agesq', 'union', 'exper', 'rlxall', 'slpnaps']
df_corr.query('var not in @_drop').to_csv('data/sleep.csv', index=False)
In [5]:
_cols = ['var', 'r', 'moerror', 'label', 'group', 'll', 'hl', 'n', 'power', 'p-val']
print(df_corr[_cols].head(3).to_markdown())
|    | var      |          r |   moerror | label                 | group         |    ll |   hl |   n |   power |     p-val |
|---:|:---------|-----------:|----------:|:----------------------|:--------------|------:|-----:|----:|--------:|----------:|
|  0 | age      |  0.0903729 | 0.0696271 | in years              | age           |  0.02 | 0.16 | 706 |    0.67 | 0.0163089 |
|  1 | black    | -0.0270573 | 0.0770573 | =1 if black           | other factors | -0.1  | 0.05 | 706 |    0.11 | 0.472889  |
|  2 | clerical |  0.0480811 | 0.0719189 | =1 if clerical worker | occupation    | -0.03 | 0.12 | 706 |    0.25 | 0.201948  |
In [ ]: