#!/usr/bin/env python
# coding: utf-8

# **Notes**:
# This notebook prepares the [example sleep data - sleep.csv](https://github.com/LSYS/pyforestplot/blob/main/examples/data/sleep.csv).
#
# The resulting output csv file ([sleep.csv](https://github.com/LSYS/pyforestplot/blob/main/examples/data/sleep.csv))
# indicates how certain individual characteristics correlate with the amount of sleep one gets per week.
# Rows are the variables correlating with sleep. Columns include the computed Pearson correlation
# coefficient, sample size, p-value, confidence interval (95%), etc.
# `pingouin` is used to compute the correlations.
#
# **Raw src**:
# * `sleep75.csv` (/wooldridge/sleep75) from https://vincentarelbundock.github.io/Rdatasets/articles/data.html
# * See https://rdrr.io/cran/wooldridge/man/sleep75.html for variable labels to the variables in `sleep75.csv`.
#
# **Requirements**: Mainly `pingouin`. See first cell of imports for requirements

# In[1]:

import warnings

import numpy as np
import pandas as pd
import pingouin as pg

# NOTE: silences every warning for the rest of the session (kept from the
# original notebook); remove this line when debugging unexpected results.
warnings.filterwarnings('ignore')

_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"

# Columns excluded from the analysis; the same names are also removed from the
# label table below so that both tables describe the same set of variables.
drop_var = ['case', 'leis1', 'leis2', 'leis3']

# The first CSV column is used as the index rather than as a data column.
df = (
    pd.read_csv(_url, index_col=0)
    .drop(drop_var, axis=1)
)
df.head(3)  # notebook preview; has no effect when run as a plain script

# In[2]:

# Prep variable labels (fold cell)
# varlabels: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
df_label = (
    pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
    # Round-trip through ASCII with errors ignored to strip non-ASCII characters.
    .assign(label=lambda d: d['des'].str.encode('ascii', 'ignore').str.decode('ascii'))
    .drop(['des'], axis=1)
    # Index by variable name so the excluded variables can be dropped as rows.
    .set_index('var')
    .drop(drop_var)
    .reset_index()
)
df_label.head(3)  # notebook preview

# In[3]:

# Compute correlations
df_corr = (
    pg.pairwise_corr(df)
    .rename(columns={'p-unc': 'p-val'})
    # Keep only the pairs in which one side is `sleep`.
    .query('Y=="sleep"|X=="sleep"')
    # `var` is the *other* variable of each pair: X, unless X itself is
    # `sleep`, in which case Y.
    .assign(var=lambda d: np.where(d['X'] == "sleep", d['Y'], d['X']))
    .drop(["Y", "X", "method", "alternative"], axis=1)
    .assign(
        # `CI95%` holds a (lower, upper) pair per row; split it into columns.
        hl=lambda d: [float(ci[1]) for ci in d['CI95%']],
        ll=lambda d: [float(ci[0]) for ci in d['CI95%']],
        # Distance from the estimate to the upper bound; relies on `hl` having
        # been assigned just above (keyword callables are evaluated in order).
        moerror=lambda d: d['hl'] - d['r'],
        power=lambda d: d.power.round(decimals=2),
        # Sample size is stored as a string.
        n=lambda d: d.n.map(str),
    )
    # Get labels; `validate` raises if `var` is duplicated on either side.
    .merge(df_label, how='left', on='var', validate='1:1')
    .reset_index(drop=True)
)
df_corr  # notebook preview

# In[4]:

df_corr.to_csv('data/sleep-untruncated.csv', index=False)

# Variables left out of the shorter example file.
_drop = [
    'earns74', 'inlf', 'lothinc', 'workscnd', 'lhrwage', 'worknrm', 'spwrk75',
    'marr', 'black', 'agesq', 'union', 'exper', 'rlxall', 'slpnaps',
]
df_corr.query('var not in @_drop').to_csv('data/sleep.csv', index=False)

# In[5]:

_cols = ['var', 'r', 'moerror', 'label', 'group', 'll', 'hl', 'n', 'power', 'p-val']
print(df_corr[_cols].head(3).to_markdown())

# In[ ]: