#!/usr/bin/env python
# coding: utf-8

# **Notes**:
# This notebook prepares the [example sleep data - sleep.csv](https://github.com/LSYS/pyforestplot/blob/main/examples/data/sleep.csv).
# 
# The resulting output csv file ([sleep.csv](https://github.com/LSYS/pyforestplot/blob/main/examples/data/sleep.csv)) indicates how certain individual characteristics correlate with the amount of sleep one gets per week.
# Rows are the variables correlated with sleep. Columns include the computed Pearson correlation coefficient, sample size, p-value, confidence interval (95%), etc.
# The `pingouin` package is used to compute the correlations.
# 
# **Raw src**:
# * `sleep75.csv` (/wooldridge/sleep75) from https://vincentarelbundock.github.io/Rdatasets/articles/data.html
# * See https://rdrr.io/cran/wooldridge/man/sleep75.html for variable labels to the variables in `sleep75.csv`.
# 
# 
# 
# **Requirements**: Mainly `pingouin`. See first cell of imports for requirements

# In[1]:


import pandas as pd
import numpy as np
import pingouin as pg
import warnings
warnings.filterwarnings('ignore')

# Location of the raw sleep75 csv.
_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"

# Columns removed from the raw data; the same names are later dropped
# from the variable-label table as well.
drop_var = ['case', 'leis1', 'leis2', 'leis3']

# The first csv column is used as the index, then the unwanted columns go.
df = pd.read_csv(_url, index_col=0)
df = df.drop(columns=drop_var)

# Notebook preview of the first rows (no effect when run as a script).
df.head(3)


# In[2]:


# Prep variable labels (fold cell)
# varlabels: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
#
# Builds a lookup table with one row per variable:
#   * `label` is the `des` text with every non-ASCII character removed,
#   * the original `des` column is discarded,
#   * rows for the variables listed in `drop_var` are removed.
df_label = (
    pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
    .assign(label=lambda d: d['des'].str.encode('ascii', 'ignore').str.decode('ascii'))
    .drop(columns='des')
    # Index by variable name so the unwanted variables can be dropped by label.
    .set_index('var')
    .drop(index=drop_var)
    .reset_index()
)

# Notebook preview of the first rows (no effect when run as a script).
df_label.head(3)


# In[3]:


# Compute correlations
#
# Pairwise correlations are computed for all columns of `df`; only the pairs
# that involve "sleep" are kept, and each kept row is keyed by the *other*
# variable of the pair (`var`).
df_corr = (
    pg.pairwise_corr(df)
    .rename(columns={'p-unc': 'p-val'})
    # Keep pairs where either side is "sleep".
    .loc[lambda d: (d['Y'] == "sleep") | (d['X'] == "sleep")]
    # `var` is whichever member of the pair is not "sleep".
    .assign(var=lambda d: np.where(d['X'] == "sleep", d['Y'], d['X']))
    .drop(columns=["Y", "X", "method", "alternative"])
    .assign(
        # Upper / lower bounds taken from the two-element `CI95%` entries.
        hl=lambda d: d['CI95%'].map(lambda ci: float(ci[1])),
        ll=lambda d: d['CI95%'].map(lambda ci: float(ci[0])),
        # Margin of error: distance from the estimate to the upper bound.
        moerror=lambda d: d['hl'] - d['r'],
        power=lambda d: d['power'].round(2),
        # Sample size is stored as text.
        n=lambda d: d['n'].map(str),
    )
    # Get labels (each variable must match at most one label row).
    .merge(df_label, how='left', on='var', validate='1:1')
    .reset_index(drop=True)
)

# Notebook display of the result (no effect when run as a script).
df_corr


# In[4]:


# Write the complete correlation table.
df_corr.to_csv('data/sleep-untruncated.csv', index=False)

# Variables excluded from the shorter `sleep.csv` file.
_drop = [
    'earns74', 'inlf', 'lothinc', 'workscnd', 'lhrwage', 'worknrm',
    'spwrk75', 'marr', 'black', 'agesq', 'union', 'exper', 'rlxall', 'slpnaps',
]

# Write the truncated table: every row whose `var` is not in `_drop`.
_keep_rows = ~df_corr['var'].isin(_drop)
df_corr.loc[_keep_rows].to_csv('data/sleep.csv', index=False)


# In[5]:


# Print a markdown preview (first three rows) of selected columns.
# NOTE(review): 'group' is not created anywhere in this script; presumably it
# comes from data/sleep75-des.csv via the label merge -- confirm.
_cols = [
    'var', 'r', 'moerror', 'label', 'group',
    'll', 'hl', 'n', 'power', 'p-val',
]
print(df_corr.loc[:, _cols].head(3).to_markdown())


# In[ ]: