#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd

# Display options are set right after importing pandas and before the other
# imports, so the order of import-time side effects stays as it was.
pd.set_option("display.precision", 3)
pd.set_option('display.float_format', lambda value: '%.5f' % value)

import warnings

import janitor  # registers DataFrame methods such as ``rename_column``
import numpy as np
import pingouin as pg

warnings.filterwarnings('ignore')

_url = "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/sleep75.csv"

# Columns removed from both the data and the label table.
drop_var = ['case', 'leis1', 'leis2', 'leis3']

df = (
    pd.read_csv(_url, index_col=0)
    .drop(drop_var, axis=1)
    # .assign(lspsepay=lambda df: np.log1p(df.spsepay))
)
df.head(3)


# In[2]:

# Prep variable labels (fold cell)
# varlabels: http://fmwww.bc.edu/ec-p/data/wooldridge/sleep75.des
df_label = (
    pd.read_csv('data/sleep75-des.csv', encoding="ISO-8859-1")
    # Keep only the ASCII characters of each description.
    .assign(
        label=lambda d: d['des'].str.encode('ascii', 'ignore').str.decode('ascii')
    )
    .drop(['des'], axis=1)
    # Drop the rows for the excluded variables by indexing on ``var``.
    .set_index('var')
    .drop(drop_var)
    .reset_index()
)
df_label.head(3)


# In[3]:

# Keyword options shared by every regression call.
stdopts = dict(relimp=False, remove_na=True)

# Predictor columns used in every model.
x = [
    "age",
    "black",
    "clerical",
    "construc",
    "educ",
    "gdhlth",
    "inlf",
    "smsa",
    "lhrwage",
    "prot",
    "selfe",
    "south",
    "spsepay",
    "totwrk",
    "yrsmarr",
]


def _fit_sleep_model(frame, tag):
    """Regress ``sleep`` on the predictors in ``x`` for the rows of *frame*.

    Calls ``pg.linear_regression`` with the shared ``stdopts`` and returns
    its result with an extra ``model`` column set to *tag*.
    """
    fitted = pg.linear_regression(frame[x], frame['sleep'], **stdopts)
    return fitted.assign(model=tag)


# One fit on the full sample, then one per sub-sample.
lm_all = _fit_sleep_model(df, 'all')
lm_male = _fit_sleep_model(df.query('male==1'), 'men')
lm_female = _fit_sleep_model(df.query('male==0'), 'women')
lm_kids = _fit_sleep_model(df.query('yngkid==1'), 'young kids')

df_results = (
    pd.concat([lm_all, lm_female, lm_male, lm_kids])
    .query('names!="Intercept"')
    .reset_index(drop=True)
    # Get labels
    .rename_column('names', 'var')
    .merge(df_label, how='left', on='var', validate='m:1')
    .sort_values(['var', 'model', 'group'])
    .reset_index(drop=True)
    # Tidy up columns
    .rename_column("CI[2.5%]", "ll")
    .rename_column("CI[97.5%]", "hl")
)
df_results


# In[4]:

df_results.to_csv('../examples/data/sleep-mmodel.csv', index=False)


# In[5]:

# _cols = ['var', 'label', 'coef', 'model', 'group', 'pval', 'll', 'hl']
# df_results[_cols].head(6).to_markdown()


# In[ ]: