# Core Python libraries:
import pandas as pd
import numpy as np
from typing import Union
import re
import os
import certifi
# Visualization libraries:
import mercury as mr
import pygwalker as pyg
from matplotlib import pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
import scienceplots
import seaborn as sns
from lets_plot import *
from lets_plot.bistro import *
from lets_plot.geo_data import *
# Machine Learning and Numerical Processing libraries:
from tqdm import tqdm
from xgboost import XGBRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).
# Run "$mercury run" in terminal to view this notebook as an interactive web application!
app = mr.App(title="Uncovering Trends and Patterns in Raw World Happiness Data for Strategic Insights!", description="""This notebook aims
to analyze and understand the factors influencing the happiness levels of countries worldwide, as reported in the World Happiness Report.
Through a detailed examination of indicators and behaviors associated with happiness, we seek to unveil trends and patterns.
The insights and visualizations derived from this data exploration and mining are designed to guide strategic decision-making for
governmental and non-governmental leaders interested in enhancing societal well-being. By leveraging this information, leadership
can enact policies and initiatives that directly target areas with potential for improvement.""", show_code=False)
pd.set_option('display.max_columns', None) # Display all the columns of a dataframe
pd.set_option('expand_frame_repr', False) # Display the dataframe's records on the same line
np.set_printoptions(linewidth=np.inf, threshold=np.inf) # Display the numerically processed dataframe's records on the same line
os.environ['SSL_CERT_FILE'] = certifi.where()
separator = "\n" * 2 + "#" * 150 + "\n"
LetsPlot.setup_html()
plt.style.use(['ieee', 'science', 'notebook'])
plt.rcParams["figure.autolayout"] = True
set_matplotlib_formats('svg')
dfs = {"current_df": pd.read_csv("world_happiness_data/world-happiness-report-2021.csv"),
"historic_df": pd.read_csv("world_happiness_data/world-happiness-report.csv")}
for df in dfs:
    print(f"*** {df} ***\n")
    print(dfs[df].head(), separator)
    print(dfs[df].tail(), separator)
    dfs[df].info()  # info() prints its summary directly and returns None
    print(separator)
*** current_df ***
Output (abridged): 149 rows × 20 columns (Country name, Regional indicator, Ladder score, Standard error of ladder score, upperwhisker, lowerwhisker, Logged GDP per capita, Social support, Healthy life expectancy, Freedom to make life choices, Generosity, Perceptions of corruption, Ladder score in Dystopia, the six "Explained by: ..." columns, Dystopia + residual); all 149 entries are non-null and, apart from the two name columns, every column is float64.
*** historic_df ***
Output (abridged): 1949 rows × 11 columns (Country name, year, Life Ladder, Log GDP per capita, Social support, Healthy life expectancy at birth, Freedom to make life choices, Generosity, Perceptions of corruption, Positive affect, Negative affect); every numeric column except "year" and "Life Ladder" contains missing values (e.g. Log GDP per capita: 1913 non-null, Perceptions of corruption: 1839 non-null).
def clean_columns(column, elements_to_remove: Union[list, tuple]=(",", ".")):
column = re.sub(r'\([^)]*\)', '', column) # Remove all parentheses and their content
for e in elements_to_remove:
column = column.replace(e, "") # Remove all commas, periods, etc.
    column = column.strip().replace(" ", "_") # Strip leading/trailing whitespace and replace the remaining spaces with underscores
return column.title()
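# Illustrative examples of the cleaning (the expected outputs match the rename performed below):
# clean_columns("Explained by: Log GDP per capita") -> "Explained_By:_Log_Gdp_Per_Capita"
# clean_columns("Healthy life expectancy at birth") -> "Healthy_Life_Expectancy_At_Birth"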
for df in dfs:
print(f"*** {df} ***\n")
print(f"Old column names: {dfs[df].columns.values}")
dfs[df] = dfs[df].rename(columns=lambda col: clean_columns(col, elements_to_remove=("", "?")))
print(f"New column names: {dfs[df].columns.values}", separator)
*** current_df ***
Output (abridged): the 20 column names are cleaned, e.g. "Country name" -> "Country_Name", "upperwhisker" -> "Upperwhisker", "Explained by: Log GDP per capita" -> "Explained_By:_Log_Gdp_Per_Capita", "Dystopia + residual" -> "Dystopia_+_Residual".
*** historic_df ***
Output (abridged): the 11 column names are cleaned, e.g. "year" -> "Year", "Life Ladder" -> "Life_Ladder", "Healthy life expectancy at birth" -> "Healthy_Life_Expectancy_At_Birth".
# Check if we have any rows that are identical on every column in both data frames:
for df in dfs:
print(dfs[df].duplicated().any())
print(separator)
# Check if we have any rows that are identical on the key columns ("Country_Name" and "Year") in "historic_df":
print(dfs["historic_df"].duplicated(subset=("Country_Name", "Year")).any())
Output: False, False (no fully duplicated rows in either data frame) and False (no duplicates on "Country_Name"/"Year" in "historic_df").
# Join the two data frames so that "historic_df" also carries each country's "Regional_Indicator":
dfs["historic_df"] = dfs["current_df"][["Country_Name", "Regional_Indicator"]].merge(dfs["historic_df"], on="Country_Name", how="inner")
# Create and modify columns:
dfs["current_df"].insert(loc=2, column="Year", value=2021)
dfs["current_df"] = dfs["current_df"].rename(columns={"Ladder_Score": "Happiness_Index"})
columns = dfs["current_df"].columns
key_columns = np.concatenate((columns[:4], columns[7: 13])) # Grab the desired columns
dfs["current_df"] = dfs["current_df"][key_columns]
dfs["historic_df"] = dfs["historic_df"].iloc[:, :-2] # Remove the undesired columns
dfs["historic_df"].columns = key_columns # Rename columns to match in both data frames
# Combine the two data frames:
happiness_df = (pd.concat([dfs["current_df"], dfs["historic_df"]], ignore_index=True)
.sort_values(["Country_Name", "Year"])
.reset_index(drop=True))
print(happiness_df)
Output (abridged): the combined happiness_df has 2035 rows × 10 columns (Country_Name, Regional_Indicator, Year, Happiness_Index, Logged_Gdp_Per_Capita, Social_Support, Healthy_Life_Expectancy, Freedom_To_Make_Life_Choices, Generosity, Perceptions_Of_Corruption), sorted by country and year from Afghanistan (2008) through Zimbabwe (2021).
def is_int(x):
    try:
        if pd.isna(x):  # Keep NaNs: they can be represented later via "pd.Int32Dtype()"
            return True
        int(x)
        return True
    except (TypeError, ValueError):
        return False
def is_float(x):
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        return False
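# A few illustrative sanity checks (not part of the original pipeline):
# is_int("12") -> True, is_int(np.nan) -> True (NaNs are kept for "pd.Int32Dtype()"), is_int("3.7") -> False
# is_float("3.7") -> True, is_float("abc") -> False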
def fix_columns(df):
for column in df.columns:
if any(word in column.lower() for word in ('date', 'time')):
df[column] = df[column].astype('datetime64[ns]')
print(f'Changed column "{column}"`s datatype to "datetime"')
elif df[column].dtype in (object, str):
df[column] = df[column].str.strip()
if np.prod(df[column].value_counts().values) == 1:
print(f'Column "{column}" contains meta data!')
elif all(df[column].apply(is_int)):
df[column] = df[column].astype(pd.Int32Dtype()) # Cannot convert "nan" values into "int", need "pd.Int"
print(f'Changed column "{column}`s" datatype to "int"')
else:
temp = df[column].str.replace(',', '.')
if all(temp.apply(is_float)):
df[column] = temp.astype(float)
print(f'Changed column "{column}`s" datatype to "float"')
else:
df[column] = df[column].astype('category') # More memory and performance efficient
print(f'Changed column "{column}`s" datatype to "category"')
fix_columns(happiness_df)
Changed column "Country_Name`s" datatype to "category"
Changed column "Regional_Indicator`s" datatype to "category"
print(happiness_df.info(), separator)
print(happiness_df.describe())
Output (abridged): happiness_df has 2035 entries and 10 columns (2 category, 1 int64 for Year, 7 float64), using about 139 KB of memory, with missing values in the six indicator columns (Logged_Gdp_Per_Capita: 2011 non-null, Social_Support: 2026, Healthy_Life_Expectancy: 1984, Freedom_To_Make_Life_Choices: 2005, Generosity: 1959, Perceptions_Of_Corruption: 1931). The describe() summary shows Year spanning 2005 to 2021, Happiness_Index ranging from 2.375 to 8.019 (mean ≈ 5.49), and Healthy_Life_Expectancy ranging from 32.3 to 77.1 years.
missing_count = happiness_df.isna().sum()
missing_count = missing_count.reset_index(name="Count").rename(columns={"index": "Column Name"})
missing_count = missing_count[missing_count["Count"] > 0].sort_values("Count", ascending=False)
(ggplot(missing_count)
+ geom_bar(aes(x="Column Name", y="Count", fill="Column Name", color="Column Name"), color="black", alpha=0.9, stat="identity")
+ ggtitle("Number of missing values in each column")
+ ggsize(800, 600)
+ theme(legend_title=element_text(size=15), legend_text=element_text(size=13), axis_text_x=element_text(angle=70), panel_grid_major_x='blank')
)
# Notice that every column containing missing values is numeric!
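# A quick check of that claim (illustrative, not part of the original analysis):
print(happiness_df.dtypes[happiness_df.isna().any()])  # Every listed column should be float64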
missing_corruption = (happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True)
.agg({
"Perceptions_Of_Corruption": lambda x: x.isna().sum(),
})
).reset_index()
missing_corruption = (missing_corruption[missing_corruption["Perceptions_Of_Corruption"] > 0]
.rename(columns={"Perceptions_Of_Corruption": "Missing_Corruption"})
.sort_values("Missing_Corruption", ascending=False)
.reset_index()
)
(ggplot(missing_corruption)
+ geom_bar(aes(x="Country_Name", y="Missing_Corruption", fill="Regional_Indicator"), color="black", stat="identity",
tooltips=layer_tooltips().line("@Country_Name").line("Number of missing values|= ^y").format("^y", ".1s"))
+ggtitle('Countries with the highest number of missing "Corruption" values grouped by region')
+ggsize(900, 500)
+ theme(legend_title=element_text(size=15), legend_text=element_text(size=13), axis_text_x=element_text(angle=85))
)
missing_per_country = (happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True)
.apply(lambda x: x.isna().sum().sum())
.reset_index(name="Count"))
missing_per_country = missing_per_country[missing_per_country['Count'] > 0].sort_values('Count', ascending=False)
(ggplot(missing_per_country.head(10))
+ geom_bar(aes(x='Count', y='Country_Name', fill="Regional_Indicator"), stat='identity', orientation='y', color='black', tooltips=layer_tooltips().line("@Country_Name").line("Number of missing values|= ^x").format("^x", ".1s"))
+ ggtitle("Countries with the highest number of missing values grouped by region")
+ ggsize(900, 500)
+theme(panel_grid_major_y='blank')
)
# A function that drops missing values from a data frame in a smart way by accounting for "information lost"
def smart_dropna(df: pd.DataFrame):
missing = df.isna().any()
if missing.any():
n_cols = len(df.columns) # Represents the fragments/fractions of information in a sample/record
for col in df.columns[missing]: # Get the columns that contain missing values
info_lost_by_col_rmv = df[col].count()/n_cols
info_lost_by_rows_rmv = df[df[col].isna()].notna().sum(axis=1).sum()/n_cols
if info_lost_by_col_rmv < info_lost_by_rows_rmv:
df = df.drop(col, axis=1) # Drop the column
else:
df = df[df[col].notna()].reset_index(drop=True) # Drop all the rows containing missing values across "col"
return df
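# Toy illustration of the trade-off (hypothetical 5-row frame, not part of the original analysis).
# With 3 columns, each cell carries 1/3 of a record's information: if "c" is missing in 4 of 5 rows,
# dropping the column loses 1/3 while dropping those 4 rows loses 8/3, so the column goes;
# if "c" were missing in a single row, the comparison would be 4/3 vs 2/3 and the row would go instead.
toy = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5], "c": [1, None, None, None, None]})
print(smart_dropna(toy).columns.values)  # -> ['a' 'b']: the mostly-empty column is dropped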
class LinearStochasticRegressor(LinearRegression):
def __init__(self, target: str, train_df: pd.DataFrame, target_df: pd.DataFrame):
super().__init__()
self.target_df = target_df
self.train_df = smart_dropna(train_df.copy())
self.target = self.train_df.pop(target).values
extra_cols = np.setdiff1d(self.target_df.columns, self.train_df.columns)
self.target_df = self.target_df.drop(extra_cols, axis=1).values
self.train_df = self.train_df.values
def fit_data(self):
self.fit(self.train_df, self.target)
def get_r2_score(self):
return self.score(self.train_df, self.target)
def add_random_error(self):
        # Mean squared residual (an estimate of the residual variance):
        mse = mean_squared_error(self.target, self.predict(self.train_df))
        std_dev = np.sqrt(mse)
return np.random.normal(0, std_dev, size=len(self.target_df))
def predict_missing(self):
return self.predict(self.target_df) + self.add_random_error()
class TreeStochasticRegressor(HistGradientBoostingRegressor):
def __init__(self, target: str, train_df: pd.DataFrame, target_df: pd.DataFrame):
super().__init__(learning_rate=0.1, max_depth=5,)
        self.target_df = target_df.values
        self.train_df = train_df.copy()  # Copy so that popping the target column below does not mutate the caller's frame
        self.target = self.train_df.pop(target).values
        self.train_df = self.train_df.values
def fit_data(self):
self.fit(self.train_df, self.target)
def get_r2_score(self):
return self.score(self.train_df, self.target)
def add_random_error(self):
        # Mean squared residual (an estimate of the residual variance):
        mse = mean_squared_error(self.target, self.predict(self.train_df))
        std_dev = np.sqrt(mse)
return np.random.normal(0, std_dev, size=len(self.target_df))
def predict_missing(self):
return self.predict(self.target_df) + self.add_random_error()
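# Note: in both stochastic regressors an imputed value is the model's point prediction plus noise drawn
# from N(0, sqrt(MSE)), i.e. perturbed by the estimated residual standard deviation, so that imputed values
# keep roughly the same spread as the observed data instead of collapsing onto the regression surface.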
# First, we assume that the data statistics are most accurate at the "Country_Name" granularity level.
# Hence, we'll use the low-level, in-distribution "Country_Name" statistics for imputation when applicable;
# otherwise, we'll fall back to the high-level, out-of-distribution statistics at the "Regional_Indicator" and "Year" granularity levels.
hopeless_TH = 0.9 # Threshold at which we can't sensibly fill in the missing values in a meaningful way without introducing bias
guess_TH = 0.7 # Threshold at which the missing values are hard (inaccurate) to predict, so we rely on high-level statistics
# In between these two thresholds, the missing values can be predicted using fairly complex predictive models
estimate_TH = 0.25 # Threshold below which the missing values are simple to estimate/approximate (linearly, using low-level statistics)
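# Worked example of the thresholds (hypothetical counts): for a country with 15 yearly records,
# 2 missing values in a column give a missing ratio of 2/15 ≈ 0.13 <= estimate_TH -> interpolate over the years;
# 6 missing give 0.40, between estimate_TH and guess_TH -> train a predictive model on the remaining rows;
# 12 missing give 0.80 >= guess_TH -> fall back to the regional per-year averages;
# 14 missing give ≈ 0.93 >= hopeless_TH -> a warning is printed first, then the regional fallback still applies.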
grouped_df = happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True)
order = ('hopeless', 'estimate', 'guess', 'predict') # Order in which the imputation mechanisms are executed
for step in order:
for (country_name, country_region), country_group in grouped_df:
country_group = country_group.select_dtypes('number')
missing = country_group.isna().sum()
if missing.any():
missing_ratio = missing[missing > 0]/len(country_group)
region_df = happiness_df[happiness_df["Regional_Indicator"] == country_region].select_dtypes('number')
if step == 'hopeless':
hopeless_cols = missing_ratio[missing_ratio >= hopeless_TH].index
for col in hopeless_cols:
print(f'Column "{col}"`s values are almost completely missing for country: "{country_name}"')
elif step == 'estimate':
estimate_cols = missing_ratio[missing_ratio <= estimate_TH].index
for col in estimate_cols:
happiness_df.loc[happiness_df["Country_Name"] == country_name, col] = np.round(country_group
.set_index("Year")[col]
.interpolate(method="index")
.bfill()
.values, 3)
elif step == 'guess':
guess_cols = missing_ratio[missing_ratio >= guess_TH].index
for col in guess_cols:
missing = country_group[col].isna() # Boolean mask where rows contain missing values on "col"
missing_years = country_group.loc[missing, "Year"]
region_by_year = region_df.loc[region_df["Year"].isin(missing_years), ["Year", col]].groupby("Year")
happiness_df.loc[(happiness_df["Country_Name"] == country_name) & missing, col] = np.round(region_by_year
.mean()
.values, 3) # Or median
elif step == 'predict':
# Train a predictive hypothesis function (model) to predict missing values:
predict_cols = missing_ratio[(estimate_TH < missing_ratio) & (missing_ratio < guess_TH)].index
for col in predict_cols:
missing = country_group[col].isna() # Boolean mask where rows contain missing values on "col"
target_df = country_group[missing].drop(col, axis=1)
train_df = country_group[~missing]
if len(smart_dropna(train_df)) < 5: # Not enough statistics to use in-distribution data
missing_years = country_group.loc[missing, "Year"]
region_by_year_df = region_df[region_df["Year"].isin(missing_years) & region_df[col].notna()]
params = dict(target=col, train_df=region_by_year_df, target_df=target_df)
else:
params = dict(target=col, train_df=train_df, target_df=target_df)
if target_df.isna().any().any():
model_name = "TSR"
model = TreeStochasticRegressor(**params)
else:
model_name = "LSR"
model = LinearStochasticRegressor(**params)
model.fit_data()
if model_name == "LSR":
# If no robust linear correlation or if overfitting:
if model.get_r2_score() < 0.8 or model.get_r2_score() == 1:
model = TreeStochasticRegressor(**params)
model.fit_data()
happiness_df.loc[(happiness_df["Country_Name"] == country_name) & missing, col] = np.round(model.predict_missing(), 3)
# Make sure we no longer have any missing values in our data frame:
print(separator)
print(f'Does our data frame contain any missing values? {happiness_df.isna().any().any()}')
Column "Perceptions_Of_Corruption"`s values are almost completely missing for country: "China" Column "Healthy_Life_Expectancy"`s values are almost completely missing for country: "Hong Kong S.A.R. of China" Column "Healthy_Life_Expectancy"`s values are almost completely missing for country: "Kosovo" Column "Perceptions_Of_Corruption"`s values are almost completely missing for country: "Turkmenistan" ###################################################################################################################################################### Does our data frame contain any missing values? False
# Cluster our samples based on the "Happiness_Index" column:
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=15, max_iter=500)
cluster_name = ['Sad', 'Somewhat Sad', 'Neutral', 'Somewhat Happy', 'Happy']
clusters = kmeans.fit_predict(happiness_df[["Happiness_Index"]])
cluster_order = np.argsort(kmeans.cluster_centers_[:, 0])
cluster_mapping = {c_order: c_name for (c_order, c_name) in zip(cluster_order, cluster_name)}
happiness_df['Happiness_Level'] = pd.Series([cluster_mapping[c] for c in clusters]).astype('category')
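# Optional sanity check (illustrative): confirm that the labels follow the ordering of the cluster centres.
for c in cluster_order:
    print(f"{cluster_mapping[c]}: centre = {kmeans.cluster_centers_[c, 0]:.3f}")  # Centres increase from "Sad" to "Happy"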
pyg.walk(happiness_df, hideDataSourceConfig=True)
column_select = mr.Select(value="Perceptions_Of_Corruption", choices=happiness_df.columns, label="Select a column to investigte")
kde_plot = sns.kdeplot(x=column_select.value, data=happiness_df, ax=None)
plt.close()
y_values = kde_plot.lines[0].get_ydata()
x_values = kde_plot.lines[0].get_xdata()
max_indx = np.argmax(y_values)
max_prob = x_values[max_indx]
mean = happiness_df[column_select.value].mean()
diff = max_prob - mean  # Distance between the distribution's mode (density peak) and its mean
if abs(diff) < happiness_df[column_select.value].std()/2.5:  # Mode and mean are close -> roughly symmetric
    dist = "Normal"
elif diff > 0:  # Mode sits above the mean -> the long tail stretches to the left
    dist = "Left-Skewed"
else:  # Mode sits below the mean -> the long tail stretches to the right
    dist = "Right-Skewed"
(ggplot(happiness_df)
+ geom_density(aes(x=column_select.value, y="..density..", fill=column_select.value), stat="density", quantile_lines=True, color="gray")
+ggtitle(f'Historic distribution ({dist}) of {column_select.value} across all countries')
+geom_vline(
xintercept=mean,
color="red", linetype="dashed", size=1)
+ geom_text(
x=mean,
y=np.max(y_values)/50,
label='(mean)',
color="#ff7f0e",
size=9,
)
+theme(panel_grid_major_x='blank')
+ggsize(900, 500)
)
(ggplot(happiness_df)
+geom_density(aes(x=column_select.value, y="..density..", fill="Happiness_Level"), color="black", position="dodge", stat="density")
+labs(y="Density", title=f"Distribution of {column_select.value} grouped by Happiness Levels")
+ggsize(900, 500)
+theme(panel_grid_major_x='blank')
)
(ggplot(happiness_df, aes(x='Happiness_Level', y=column_select.value))
+ geom_violin(aes(color='Happiness_Level', fill='Happiness_Level'), size=2, alpha=.5, scale='width',
tooltips=layer_tooltips().line(f"{column_select.value}:|^y"))
+ geom_boxplot(aes(fill='Happiness_Level'), width=.2)
+ ggsize(900, 500))
(ggplot(happiness_df)
+ geom_bar(aes(x="Happiness_Level", y="..count..", fill="Regional_Indicator"), color="black",
tooltips=layer_tooltips().line("^y"))
+ ggtitle("Chart")
+ ggsize(900, 500)
+theme(panel_grid_major_x='blank')
+labs(title="Number of countries in each happiness level grouped by region", y="Count"))
column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select a column to investigte")
means = happiness_df.groupby("Regional_Indicator", observed=True)[column_select.value].mean().reset_index()
(ggplot()
+geom_point(aes(x="Regional_Indicator", y=column_select.value, color="Happiness_Level"), data=happiness_df, shape=1, size=2.5)
+geom_point(aes(x="Regional_Indicator", y=column_select.value, size=column_select.value,), fill="orange", color="black", data=means, shape=23)
+ggsize(900, 500)
+ scale_size(range=[3, 7])
+theme(axis_text_x=element_text(angle=85))
+ ggtitle(f"Historic distribution of the {column_select.value} in each region (yellow diamond: mean)")
)
means = happiness_df.groupby("Year", observed=True)[column_select.value].mean().reset_index()
years = np.sort(happiness_df["Year"].unique())
(ggplot()
+geom_point(aes(x="Year", y=column_select.value, fill=column_select.value), shape=21, color="black", data=happiness_df)
+geom_point(aes(x="Year", y=column_select.value, size=column_select.value,), fill="blue", color="black", data=means, shape=23)
+ggsize(900, 500)
+ scale_fill_gradient2(midpoint=means[column_select.value].mean(), low='red', mid='yellow', high='green', guide=guide_colorbar(nbin=3, barwidth=10))
+ scale_x_continuous(breaks=years)
+ scale_size(range=[3, 10], guide='none')
+theme(axis_text_x=element_text(angle=30))
+ggtitle(f"Distribution of the overall {column_select.value} across the years (blue diamond: mean)") )
years = [int(y) for y in years]
year_select = mr.Select(value=2021, choices=years, label="Select year of interest")
happiness_sorted = happiness_df.loc[happiness_df["Year"] == year_select.value, ["Happiness_Index", "Country_Name", "Regional_Indicator"]].sort_values("Happiness_Index", ascending=False)
happiest = happiness_sorted.head(5).copy()
happiest["Feeling"] = "Happiest"
saddest = happiness_sorted.tail(5).copy()
saddest["Feeling"] = "Saddest"
combined_df = pd.concat([happiest, saddest], ignore_index=True)
(ggplot(combined_df)
+ geom_bar(aes(x="Country_Name", y="Happiness_Index", fill='Regional_Indicator', color="Feeling"), size=1.5, stat='identity', width=0.7, tooltips=layer_tooltips().line("Feeling:|@Feeling"))
+ggsize(900, 500)
+ggtitle(f"Top 5 happiest & saddest countries in {year_select.value} grouped by region")
+theme(panel_grid_major_x='blank')
+scale_color_manual(["green", "red"])
)
column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column of interest")
col_change = (happiness_df[["Year", "Country_Name", column_select.value, "Regional_Indicator"]]
.groupby(["Country_Name", "Regional_Indicator"], observed=True)[column_select.value]
.pct_change() * 100)
col_change = col_change.rename("Percent_Change")
col_change = pd.concat([happiness_df, col_change], axis=1)
col_change_by_year = col_change.sort_values("Year").groupby("Year", observed=True)
max_col_change_by_year = col_change_by_year["Percent_Change"].idxmax()
max_country_by_year = col_change.loc[max_col_change_by_year.dropna()]
max_country_by_year["Year_Country"] = max_country_by_year["Year"].astype(str) + ":\n" + max_country_by_year["Country_Name"].astype(str)
(
ggplot(max_country_by_year)
+ geom_bar(aes(x="Year_Country", y="Percent_Change", fill="Percent_Change"), stat='identity', color="black",
tooltips=layer_tooltips().line("@Country_Name").line(f"Percent Change in {column_select.value}|= ^y%"))
+ scale_fill_gradient2(low='red', mid='white', high='darkgreen', midpoint=0)
+ ylim(max_country_by_year["Percent_Change"].min()-0.5, max_country_by_year["Percent_Change"].max()+0.5)
+geom_hline(
yintercept=0,
color="pink", size=1)
+ ggsize(900, 500)
+ ggtitle(f"YoY maximum percent change in {column_select.value} observed in each year")
+theme(axis_text_x=element_text(angle=70), panel_grid_major_x='blank')
)
(corr_plot(happiness_df.select_dtypes('number'))
.points(type="full")
.labels(type="full").build()
+ggsize(900, 600)
+ggtitle("Overall Correlation Heatmap"))
# Very surprising to see that there is no overall correlation at all between "Generosity" and "Logged_Gdp_Per_Capita"!
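# To put a number on that observation (illustrative check, not part of the original notebook):
print(round(happiness_df["Generosity"].corr(happiness_df["Logged_Gdp_Per_Capita"]), 3))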
country_select = mr.Select(value="United Arab Emirates", choices=happiness_df["Country_Name"].unique(), label="Select country of interest")
(corr_plot(happiness_df[happiness_df["Country_Name"] == country_select.value].select_dtypes('number'))
.points(type="full")
.labels(type="full").palette_BrBG().build()
+ggsize(900, 600)
+ggtitle(f"{country_select.value}'s Correlation Heatmap"))
columns_select = mr.MultiSelect(label="Select two features to investigate",
value=["Logged_Gdp_Per_Capita", "Happiness_Index"],
choices=happiness_df.columns)
columns_pair = columns_select.value
assert len(columns_pair) == 2, f"Please select exactly two columns, you've selected {len(columns_pair)}!"
(joint_plot(happiness_df, x=columns_pair[0], y=columns_pair[1], color_by="Happiness_Level")
+ facet_wrap("Regional_Indicator", nrow=5, scales="free")
+ geom_smooth(color="gray")
+ ggsize(1000, 1200)
+ ggtitle(f"Overall Trend in {columns_pair[1]} Against {columns_pair[0]} Across Every Region Grouped by Happiness Levels")
+ labs(caption=f"This figure illustrates how the {columns_pair[1]} of the different happiness groups within each region reacts to changes in {columns_pair[0]}")
+theme(legend_position="top", axis_title=element_text(margin=margin(t=20, b=30,l=30)))
)
columns_select = mr.MultiSelect(label="Select two features to investigate",
value=["Perceptions_Of_Corruption", "Happiness_Index"],
choices=happiness_df.columns)
columns_pair = columns_select.value
assert len(columns_pair) == 2, f"Please select exactly two columns, you've selected {len(columns_pair)}!"
(ggplot(happiness_df)
+ geom_smooth(aes(x=columns_pair[0], y=columns_pair[1], color="Happiness_Level"), size=1.3)
+ ggsize(900, 500)
+ ggtitle(f"How The {columns_pair[1]} varies against {columns_pair[0]} grouped by Happiness Levels worldwide")
)
column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column to view trend over time")
region_happiness = happiness_df.groupby(["Regional_Indicator", "Year"]).agg({
column_select.value: "mean"
}).reset_index()
(ggplot(region_happiness)
+geom_line(aes(x="Year", y=column_select.value, color="Regional_Indicator"), size=3,
tooltips=layer_tooltips().line(f"{column_select.value}: ^y"))
+ facet_wrap("Regional_Indicator", nrow=5, scales="free")
+ ggtitle(f"Trend of average {column_select.value} over the years accross every region")
+theme(legend_position="bottom")
+scale_color_discrete(guide=guide_legend(ncol=2))
+ scale_x_continuous(breaks=list(range(2005, 2022, 5)) + [2021])
+ ggsize(1000, 1000)
)
countries_of_interest = ["United Arab Emirates", "Egypt", "Finland", "United States", "Russia"]
countries_select = mr.MultiSelect(value=countries_of_interest, choices=happiness_df["Country_Name"].unique(), label="Select countries of interest")
column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column of interest")
countries_df = happiness_df[happiness_df["Country_Name"].isin(countries_select.value)]
(
ggplot(countries_df)
+ geom_point(aes(x="Year", y="Country_Name", fill=column_select.value, size=column_select.value), shape=25, color="black")
+ scale_fill_gradient2(midpoint=happiness_df[column_select.value].mean(), low='#d7191c', mid='yellow', high='#2b83ba', guide=guide_colorbar(nbin=5, barwidth=10))
+ scale_size(range=[3, 10], guide='none')
+ ggsize(900, 500)
+ scale_x_continuous(breaks=years)
+theme(axis_text_x=element_text(angle=70))
)
column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column of interest to view worldwide")
happiness_by_country = happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True).agg({
column_select.value: "mean",
}).reset_index()
happiness_by_country = happiness_by_country[happiness_by_country["Country_Name"] != "Hong Kong S.A.R. of China"]
happiness_by_country["Country_Name"] = happiness_by_country["Country_Name"].astype('object')
happiness_by_country["Country_Name"] = happiness_by_country["Country_Name"].replace({"North Cyprus": "Cyprus", "Palestinian Territories": "Palestine", "Taiwan Province of China": "Taiwan"})
countries_gcoder = geocode_countries(happiness_by_country["Country_Name"])
(ggplot()
+ geom_livemap(location=[53, 24],
zoom=4)
+ geom_polygon(aes(fill=column_select.value),
data=happiness_by_country,
map=countries_gcoder.get_boundaries(),
map_join=[["Country_Name"], ["country"]],
alpha=0.7,
color="gray",
size=0.5,
tooltips=layer_tooltips().line('@Country_Name').line(f'{column_select.value}:| @{column_select.value}'))
+ theme(legend_position='top')
+ ggsize(900, 600)
+ scale_fill_gradient2(midpoint=happiness_by_country[column_select.value].mean(), low='red', mid='yellow', high='green', guide=guide_colorbar(nbin=5))
)
def process_df_features(df: pd.DataFrame, ignore: Union[list, tuple]=(), drop: Union[list, tuple]=()):
df = df.copy()
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
for col in df.columns:
if col in drop:
df = df.drop(col, axis=1)
elif col in ignore:
continue
elif df[col].dtype in (int, 'category', bool):
unique_vals = np.sort(df[col].unique())
print(f'Column "{col}`s" unique values are: {unique_vals}')
df[col] = df[col].astype('category')
onehot_enc = categorical_preprocessor.fit_transform(df[[col]])
onehot_df = pd.DataFrame(onehot_enc, columns=[f"{col}_{val}" for val in unique_vals])
df = pd.concat([df.drop(col, axis=1), onehot_df], axis=1)
elif df[col].dtype == float:
df[col] = numerical_preprocessor.fit_transform(df[[col]])
return df
processed_df = process_df_features(happiness_df, drop=["Country_Name", "Happiness_Level"])
target = processed_df["Happiness_Index"].values
data = processed_df.drop('Happiness_Index', axis=1).values
x_train, x_test, y_train, y_test = train_test_split(data, target, train_size=0.85)
# If necessary, we can conduct a grid search on the model's hyperparameters:
"""
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=False, scoring='r2', n_jobs = -1)
grid_search.fit(x_train, y_train)
grid_search.best_params_
"""
params = dict(learning_rate=0.05, max_depth=10)
models = dict(linear=LinearRegression(), XGBoost=XGBRegressor(), hist_boosting=HistGradientBoostingRegressor(**params), random_forest=RandomForestRegressor(n_estimators=100))
scores = ['r2', 'neg_mean_absolute_error']
print('\nStart training our models:\n')
for (model_name, model) in tqdm(models.items()):
print(f"Model: {model_name}")
model.fit(x_train, y_train)
cv_results = cross_validate(model, data, target, cv=5, scoring=scores)
print(f"\tModel's CV average R2 score: {cv_results['test_r2'].mean()*100:.4}%")
print(f"\tModel's CV average MSE loss value: {abs(cv_results['test_neg_mean_absolute_error']).mean():.4}")
print(f"\tModel's test R2 score: {model.score(x_test, y_test)*100:.4}%\n")
Column "Regional_Indicator`s" unique values are: ['Central and Eastern Europe' 'Commonwealth of Independent States' 'East Asia' 'Latin America and Caribbean' 'Middle East and North Africa' 'North America and ANZ' 'South Asia' 'Southeast Asia' 'Sub-Saharan Africa' 'Western Europe'] Column "Year`s" unique values are: [2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021] Start training our models:
0%| | 0/4 [00:00<?, ?it/s]
Model: linear
25%|██████████████████████████████████ | 1/4 [00:01<00:03, 1.25s/it]
Model's CV average R2 score: 73.81% Model's CV average MSE loss value: 0.3849 Model's test R2 score: 74.94% Model: XGBoost
50%|████████████████████████████████████████████████████████████████████ | 2/4 [00:02<00:02, 1.06s/it]
Model's CV average R2 score: 74.63% Model's CV average MSE loss value: 0.3862 Model's test R2 score: 85.86% Model: hist_boosting
75%|██████████████████████████████████████████████████████████████████████████████████████████████████████ | 3/4 [00:05<00:01, 1.91s/it]
Model's CV average R2 score: 75.89% Model's CV average MSE loss value: 0.3801 Model's test R2 score: 85.42% Model: random_forest
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:08<00:00, 2.02s/it]
Model's CV average R2 score: 75.63% Model's CV average MSE loss value: 0.384 Model's test R2 score: 87.34%