import pandas as pd
import seaborn as sns
# Relative path to the source workbook (resolved against the current working directory).
wh_file = 'WHR20_DataForTable2.1.xls'
# Load the workbook into a DataFrame; with no sheet_name given, read_excel reads the first sheet.
wh_all_df = pd.read_excel(wh_file)
# Bare expression: the notebook renders the frame when this is the last line of a cell.
wh_all_df
Country name | year | Life Ladder | Log GDP per capita | Social support | Healthy life expectancy at birth | Freedom to make life choices | Generosity | Perceptions of corruption | Positive affect | ... | GINI index (World Bank estimate) | GINI index (World Bank estimate), average 2000-2017, unbalanced panel | gini of household income reported in Gallup, by wp5-year | Most people can be trusted, Gallup | Most people can be trusted, WVS round 1981-1984 | Most people can be trusted, WVS round 1989-1993 | Most people can be trusted, WVS round 1994-1998 | Most people can be trusted, WVS round 1999-2004 | Most people can be trusted, WVS round 2005-2009 | Most people can be trusted, WVS round 2010-2014 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 2008 | 3.723590 | 7.144916 | 0.450662 | 50.799999 | 0.718114 | 0.178993 | 0.881686 | 0.517637 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | Afghanistan | 2009 | 4.401778 | 7.314788 | 0.552308 | 51.200001 | 0.678896 | 0.201228 | 0.850035 | 0.583926 | ... | NaN | NaN | 0.441906 | 0.286315 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Afghanistan | 2010 | 4.758381 | 7.421525 | 0.539075 | 51.599998 | 0.600127 | 0.131578 | 0.706766 | 0.618265 | ... | NaN | NaN | 0.327318 | 0.275833 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Afghanistan | 2011 | 3.831719 | 7.394349 | 0.521104 | 51.919998 | 0.495901 | 0.173452 | 0.731109 | 0.611387 | ... | NaN | NaN | 0.336764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | Afghanistan | 2012 | 3.782938 | 7.480296 | 0.520637 | 52.240002 | 0.530935 | 0.246943 | 0.775620 | 0.710385 | ... | NaN | NaN | 0.344540 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1843 | Zimbabwe | 2015 | 3.703191 | 7.827643 | 0.735800 | 53.799999 | 0.667193 | -0.117035 | 0.810457 | 0.715079 | ... | NaN | 0.432 | 0.655137 | NaN | NaN | NaN | NaN | 0.116683 | NaN | 0.082942 |
1844 | Zimbabwe | 2016 | 3.735400 | 7.819675 | 0.768425 | 54.400002 | 0.732971 | -0.088488 | 0.723612 | 0.737636 | ... | NaN | 0.432 | 0.596690 | NaN | NaN | NaN | NaN | 0.116683 | NaN | 0.082942 |
1845 | Zimbabwe | 2017 | 3.638300 | 7.851042 | 0.754147 | 55.000000 | 0.752826 | -0.091540 | 0.751208 | 0.806428 | ... | NaN | 0.432 | 0.581484 | NaN | NaN | NaN | NaN | 0.116683 | NaN | 0.082942 |
1846 | Zimbabwe | 2018 | 3.616480 | 7.896704 | 0.775388 | 55.599998 | 0.762675 | -0.063282 | 0.844209 | 0.710119 | ... | NaN | 0.432 | 0.541772 | NaN | NaN | NaN | NaN | 0.116683 | NaN | 0.082942 |
1847 | Zimbabwe | 2019 | 2.693523 | 7.850442 | 0.759162 | 56.200001 | 0.631908 | -0.062325 | 0.830652 | 0.716004 | ... | NaN | 0.432 | 0.699588 | NaN | NaN | NaN | NaN | 0.116683 | NaN | 0.082942 |
1848 rows × 26 columns
# Show every column label of the loaded frame (the table rendering elides the middle columns).
wh_all_df.columns
Index(['Country name', 'year', 'Life Ladder', 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Positive affect', 'Negative affect', 'Confidence in national government', 'Democratic Quality', 'Delivery Quality', 'Standard deviation of ladder by country-year', 'Standard deviation/Mean of ladder by country-year', 'GINI index (World Bank estimate)', 'GINI index (World Bank estimate), average 2000-2017, unbalanced panel', 'gini of household income reported in Gallup, by wp5-year', 'Most people can be trusted, Gallup', 'Most people can be trusted, WVS round 1981-1984', 'Most people can be trusted, WVS round 1989-1993', 'Most people can be trusted, WVS round 1994-1998', 'Most people can be trusted, WVS round 1999-2004', 'Most people can be trusted, WVS round 2005-2009', 'Most people can be trusted, WVS round 2010-2014'], dtype='object')
# Positional slice: columns 1..8, i.e. 'year' through 'Perceptions of corruption';
# 'Country name' (position 0) and everything after position 8 are left out.
# NOTE(review): this relies on the workbook's column order — selecting by label would be more robust.
cols_selec = wh_all_df.columns[1:9]
cols_selec
Index(['year', 'Life Ladder', 'Log GDP per capita', 'Social support', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'], dtype='object')
# Keep only the selected columns; rows with missing values are still included here.
wh_withna_df = wh_all_df[cols_selec]
# Scatter-plot matrix over every pair of selected columns.
# The trailing ';' stops the notebook from echoing the returned object's repr.
sns.pairplot(wh_withna_df);
# Pairwise correlation matrix of the selected columns (DataFrame.corr defaults to Pearson
# and ignores missing values pair by pair).
wh_corrmat = wh_withna_df.corr()
wh_corrmat
year | Life Ladder | Log GDP per capita | Social support | Healthy life expectancy at birth | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|
year | 1.000000 | 0.005903 | 0.048507 | -0.032555 | 0.133141 | 0.193448 | -0.045407 | -0.071487 |
Life Ladder | 0.005903 | 1.000000 | 0.776676 | 0.704046 | 0.741604 | 0.523349 | 0.196849 | -0.422653 |
Log GDP per capita | 0.048507 | 0.776676 | 1.000000 | 0.678557 | 0.829849 | 0.361548 | -0.000773 | -0.342296 |
Social support | -0.032555 | 0.704046 | 0.678557 | 1.000000 | 0.612411 | 0.409955 | 0.066356 | -0.214311 |
Healthy life expectancy at birth | 0.133141 | 0.741604 | 0.829849 | 0.612411 | 1.000000 | 0.378726 | 0.030108 | -0.315927 |
Freedom to make life choices | 0.193448 | 0.523349 | 0.361548 | 0.409955 | 0.378726 | 1.000000 | 0.334219 | -0.490592 |
Generosity | -0.045407 | 0.196849 | -0.000773 | 0.066356 | 0.030108 | 0.334219 | 1.000000 | -0.299612 |
Perceptions of corruption | -0.071487 | -0.422653 | -0.342296 | -0.214311 | -0.315927 | -0.490592 | -0.299612 | 1.000000 |
# Annotated heatmap of the correlation matrix; vmin/vmax pin the colour scale to [-1, 1]
# so the diverging 'coolwarm' palette is centred on zero correlation.
sns.heatmap(wh_corrmat, cmap='coolwarm', vmin=-1, vmax=1, annot=True);
# Number of missing values in each selected column.
wh_withna_df.isna().sum()
year 0 Life Ladder 0 Log GDP per capita 29 Social support 13 Healthy life expectancy at birth 52 Freedom to make life choices 31 Generosity 83 Perceptions of corruption 103 dtype: int64
# Drop every row that has a missing value in at least one selected column.
wh_df = wh_withna_df.dropna()
n_rows_before = wh_withna_df.shape[0]
n_drops = n_rows_before - wh_df.shape[0]
# Report the loss as a share of the ORIGINAL row count. Dividing by the number
# of surviving rows (as was done previously) overstates the fraction removed.
# The guard avoids a ZeroDivisionError on an empty input frame.
n_drops_pc = n_drops / n_rows_before * 100 if n_rows_before else 0.0
print(f"Number of rows removed: {n_drops}, that is {n_drops_pc:.2f}%")
Number of rows removed: 221, that is 13.58%
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
from happypred import whdata_to_csv
# Split the workbook into train/test feature and target CSV files.
# Reuse wh_file (set when the workbook was first loaded) instead of repeating
# the filename literal, so both steps always read the same file.
# NOTE(review): the four returned values are treated as CSV locations readable
# by pd.read_csv — confirm against happypred.whdata_to_csv.
X_train_csv, y_train_csv, X_test_csv, y_test_csv = whdata_to_csv(wh_file)
# Load the four CSVs back, in the same order as the names on the left.
X_train_df, y_train_df, X_test_df, y_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (X_train_csv, y_train_csv, X_test_csv, y_test_csv)
)
X_train_df
Log GDP per capita | Social support | Healthy life expectancy at birth | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|
0 | 7.144916 | 0.450662 | 50.799999 | 0.718114 | 0.178993 | 0.881686 |
1 | 7.314788 | 0.552308 | 51.200001 | 0.678896 | 0.201228 | 0.850035 |
2 | 7.421525 | 0.539075 | 51.599998 | 0.600127 | 0.131578 | 0.706766 |
3 | 7.394349 | 0.521104 | 51.919998 | 0.495901 | 0.173452 | 0.731109 |
4 | 7.480296 | 0.520637 | 52.240002 | 0.530935 | 0.246943 | 0.775620 |
... | ... | ... | ... | ... | ... | ... |
1504 | 7.826639 | 0.765839 | 52.380001 | 0.642034 | -0.067743 | 0.820217 |
1505 | 7.827643 | 0.735800 | 53.799999 | 0.667193 | -0.117035 | 0.810457 |
1506 | 7.819675 | 0.768425 | 54.400002 | 0.732971 | -0.088488 | 0.723612 |
1507 | 7.851042 | 0.754147 | 55.000000 | 0.752826 | -0.091540 | 0.751208 |
1508 | 7.896704 | 0.775388 | 55.599998 | 0.762675 | -0.063282 | 0.844209 |
1509 rows × 6 columns
# Display the training targets loaded from y_train_csv.
y_train_df
Life Ladder | |
---|---|
0 | 3.723590 |
1 | 4.401778 |
2 | 4.758381 |
3 | 3.831719 |
4 | 3.782938 |
... | ... |
1504 | 4.184451 |
1505 | 3.703191 |
1506 | 3.735400 |
1507 | 3.638300 |
1508 | 3.616480 |
1509 rows × 1 columns
# Display the test features loaded from X_test_csv.
X_test_df
Log GDP per capita | Social support | Healthy life expectancy at birth | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|
0 | 7.458469 | 0.419973 | 52.400002 | 0.393656 | -0.096549 | 0.923849 |
1 | 9.456569 | 0.686365 | 69.000000 | 0.777351 | -0.100784 | 0.914284 |
2 | 9.537369 | 0.803259 | 66.099998 | 0.385083 | -0.017092 | 0.740609 |
3 | 9.772140 | 0.896371 | 69.000000 | 0.817053 | -0.202615 | 0.830460 |
4 | 10.725492 | 0.942774 | 73.900002 | 0.917537 | 0.117622 | 0.430209 |
... | ... | ... | ... | ... | ... | ... |
113 | 9.959532 | 0.933471 | 69.099998 | 0.902679 | -0.102370 | 0.599400 |
114 | 8.773669 | 0.915276 | 65.400002 | 0.970295 | 0.303127 | 0.511197 |
115 | 8.850640 | 0.847592 | 68.099998 | 0.952469 | -0.122386 | 0.787889 |
116 | 8.224403 | 0.637894 | 55.799999 | 0.811040 | 0.066340 | 0.831956 |
117 | 7.850442 | 0.759162 | 56.200001 | 0.631908 | -0.062325 | 0.830652 |
118 rows × 6 columns
# Display the test targets loaded from y_test_csv.
y_test_df
Life Ladder | |
---|---|
0 | 2.375092 |
1 | 4.995318 |
2 | 4.744627 |
3 | 6.085561 |
4 | 7.233995 |
... | ... |
113 | 6.600337 |
114 | 6.154049 |
115 | 5.467451 |
116 | 3.306797 |
117 | 2.693523 |
118 rows × 1 columns
No changes
from happypred import hwtrain, hwpredict
# Train on the training CSVs. 'lm' presumably selects a linear model and the return value
# is, by its name, where the fitted model was pickled — confirm both in happypred.hwtrain.
pickled_model_path = hwtrain(X_train_csv, y_train_csv, model='lm')
# Spot check: the six arguments are the feature values of training row 0, given in
# X_train column order (assumes hwpredict expects that order — TODO confirm).
# NOTE(review): pickled_model_path is not passed to hwpredict, so it presumably loads
# the model from a default location — verify.
y_pred1 = hwpredict(7.144916, 0.450662, 50.799999, 0.718114, 0.178993, 0.881686)
y_pred1
3.5262099333794374
# Spot check on the feature values of training row 1, given in X_train column order
# (assumes hwpredict expects that order — TODO confirm).
y_pred2 = hwpredict(7.314788, 0.552308, 51.200001, 0.678896, 0.201228, 0.850035)
y_pred2
3.833786823757028