#!/usr/bin/env python # coding: utf-8 # # Uncovering Trends and Patterns in Raw World Happiness Data for Strategic Insights # # Table of Contents # 1. [Import Relevant Packages](#import) # 2. [Setup Notebook Configuration](#setup) # 3. [Load the Data Frames](#load) # 4. [Perform EDA (Exploratory Data Analysis)](#eda) # 5. [Cluster Our Data Frame](#cluster) # 6. [Feature (Column) Understanding](#feature) # 7. [Model Training and Evaluation](#model) # # ## Import Relevant Packages: # In[1]: # Core Python libraries: import pandas as pd import numpy as np from typing import Union import re import os import certifi # Visiualization libraries: import mercury as mr import pygwalker as pyg from matplotlib import pyplot as plt from matplotlib_inline.backend_inline import set_matplotlib_formats import scienceplots import seaborn as sns from lets_plot import * from lets_plot.bistro import * from lets_plot.geo_data import * # Machine Learning and Numerical Processing libraries: from tqdm import tqdm from xgboost import XGBRegressor from sklearn.cluster import KMeans from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler from sklearn.metrics import mean_squared_error, r2_score from sklearn.impute import SimpleImputer from sklearn.linear_model import LinearRegression from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor, RandomForestRegressor from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV # # ## Setup Notebook Configuration: # In[2]: # Run "$mercury run" in terminal to view this notebook as an interactive web application! app = mr.App(title="Uncovering Trends and Patterns in Raw World Happiness Data for Strategic Insights!", description="""This notebook aims to analyze and understand the factors influencing the happiness levels of countries worldwide, as reported in the World Happiness Report. Through a detailed examination of indicators and behaviors associated with happiness, we seek to unveil trends and patterns. The insights and visualizations derived from this data exploration and mining are designed to guide strategic decision-making for governmental and non-governmental leaders interested in enhancing societal well-being. By leveraging this information, leadership can enact policies and initiatives that directly target areas with potential for improvement.""", show_code=False) pd.set_option('display.max_columns', None) # Display all the columns of a dataframe pd.set_option('expand_frame_repr', False) # Display the dataframe's records on the same line np.set_printoptions(linewidth=np.inf, threshold=np.inf) # Display the numerically processed dataframe's records on the same line os.environ['SSL_CERT_FILE'] = certifi.where() separator = "\n" * 2 + "#" * 150 + "\n" LetsPlot.setup_html() plt.style.use(['ieee', 'science', 'notebook']) plt.rcParams["figure.autolayout"] = True set_matplotlib_formats('svg') # # ## Load the Data Frames: # In[3]: dfs = {"current_df": pd.read_csv("world_happiness_data/world-happiness-report-2021.csv"), "historic_df": pd.read_csv("world_happiness_data/world-happiness-report.csv")} # # ## Perform EDA (Exploratory Data Analysis) # ### Peek into the data: # In[4]: for df in dfs: print(f"*** {df} ***\n") print(dfs[df].head(), separator) print(dfs[df].tail(), separator) print(dfs[df].info(), separator) # ### Clean column names as necessary: # In[5]: def clean_columns(column, elements_to_remove: Union[list, tuple]=(",", ".")): column = re.sub(r'\([^)]*\)', '', column) # Remove all parentheses and their content for e in elements_to_remove: column = column.replace(e, "") # Remove all commas, periods, etc. column = column.strip().replace(" ", "_") # Remove any leading and trailing whitespaces return column.title() for df in dfs: print(f"*** {df} ***\n") print(f"Old column names: {dfs[df].columns.values}") dfs[df] = dfs[df].rename(columns=lambda col: clean_columns(col, elements_to_remove=("", "?"))) print(f"New column names: {dfs[df].columns.values}", separator) # ### Check for duplicated records (rows) in our data frames (if any): # In[6]: # Check if we have any rows that are identical on every column in both data frames: for df in dfs: print(dfs[df].duplicated().any()) print(separator) # Check if we have any rows that are identical on the key columns ("Country_Name" and "Year") in "historic_df": print(dfs["historic_df"].duplicated(subset=("Country_Name", "Year")).any()) # ### Enrich our dataset in a meaningful way by appropriately combining the two datasets into one: # In[7]: # Join the two data frames to include additional columns: dfs["historic_df"] = dfs["current_df"][["Country_Name", "Regional_Indicator"]].merge(dfs["historic_df"], on="Country_Name", how="inner") # Create and modify columns: dfs["current_df"].insert(loc=2, column="Year", value=2021) dfs["current_df"] = dfs["current_df"].rename(columns={"Ladder_Score": "Happiness_Index"}) columns = dfs["current_df"].columns key_columns = np.concatenate((columns[:4], columns[7: 13])) # Grab the desired columns dfs["current_df"] = dfs["current_df"][key_columns] dfs["historic_df"] = dfs["historic_df"].iloc[:, :-2] # Remove the undesired columns dfs["historic_df"].columns = key_columns # Rename columns to match in both data frames # Combine the two data frames: happiness_df = (pd.concat([dfs["current_df"], dfs["historic_df"]], ignore_index=True) .sort_values(["Country_Name", "Year"]) .reset_index(drop=True)) print(happiness_df) # ### Fix data frame's column datatypes as necessary: # In[8]: def is_int(x): try: if np.isnan(x) or int(x): return True except: return False def is_float(x): try: float(x) return True except: return False def fix_columns(df): for column in df.columns: if any(word in column.lower() for word in ('date', 'time')): df[column] = df[column].astype('datetime64[ns]') print(f'Changed column "{column}"`s datatype to "datetime"') elif df[column].dtype in (object, str): df[column] = df[column].str.strip() if np.prod(df[column].value_counts().values) == 1: print(f'Column "{column}" contains meta data!') elif all(df[column].apply(is_int)): df[column] = df[column].astype(pd.Int32Dtype()) # Cannot convert "nan" values into "int", need "pd.Int" print(f'Changed column "{column}`s" datatype to "int"') else: temp = df[column].str.replace(',', '.') if all(temp.apply(is_float)): df[column] = temp.astype(float) print(f'Changed column "{column}`s" datatype to "float"') else: df[column] = df[column].astype('category') # More memory and performance efficient print(f'Changed column "{column}`s" datatype to "category"') fix_columns(happiness_df) # ### Take a quick glance at the modified data's statistics: # In[9]: print(happiness_df.info(), separator) print(happiness_df.describe()) # ### Visualize the number of missing values in each column of our data frame: # In[10]: missing_count = happiness_df.isna().sum() missing_count = missing_count.reset_index(name="Count").rename(columns={"index": "Column Name"}) missing_count = missing_count[missing_count["Count"] > 0].sort_values("Count", ascending=False) (ggplot(missing_count) + geom_bar(aes(x="Column Name", y="Count", fill="Column Name", color="Column Name"), color="black", alpha=0.9, stat="identity") + ggtitle("Number of missing values in each column") + ggsize(800, 600) + theme(legend_title=element_text(size=15), legend_text=element_text(size=13), axis_text_x=element_text(angle=70), panel_grid_major_x='blank') ) # Notice that the columns of all the missing values are of "numeric" type! # ### Given that "Perceptions_Of_Corruption" has the highest number of missing values, visualize which countries most contributed to that: # In[11]: missing_corruption = (happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True) .agg({ "Perceptions_Of_Corruption": lambda x: x.isna().sum(), }) ).reset_index() missing_corruption = (missing_corruption[missing_corruption["Perceptions_Of_Corruption"] > 0] .rename(columns={"Perceptions_Of_Corruption": "Missing_Corruption"}) .sort_values("Missing_Corruption", ascending=False) .reset_index() ) (ggplot(missing_corruption) + geom_bar(aes(x="Country_Name", y="Missing_Corruption", fill="Regional_Indicator"), color="black", stat="identity", tooltips=layer_tooltips().line("@Country_Name").line("Number of missing values|= ^y").format("^y", ".1s")) +ggtitle('Countries with the highest number of missing "Corruption" values grouped by region') +ggsize(900, 500) + theme(legend_title=element_text(size=15), legend_text=element_text(size=13), axis_text_x=element_text(angle=85)) ) # ### Visualize which countries had the highest number of missing values grouped by region: # In[12]: missing_per_country = (happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True) .apply(lambda x: x.isna().sum().sum()) .reset_index(name="Count")) missing_per_country = missing_per_country[missing_per_country['Count'] > 0].sort_values('Count', ascending=False) (ggplot(missing_per_country.head(10)) + geom_bar(aes(x='Count', y='Country_Name', fill="Regional_Indicator"), stat='identity', orientation='y', color='black', tooltips=layer_tooltips().line("@Country_Name").line("Number of missing values|= ^x").format("^x", ".1s")) + ggtitle("Countries with the highest number of missing values grouped by region") + ggsize(900, 500) +theme(panel_grid_major_y='blank') ) # ### Imputation: Fill in the missing values for each column using the appropriate techniques: # In[13]: # A function that drops missing values from a data frame in a smart way by accounting for "information lost" def smart_dropna(df: pd.DataFrame): missing = df.isna().any() if missing.any(): n_cols = len(df.columns) # Represents the fragments/fractions of information in a sample/record for col in df.columns[missing]: # Get the columns that contain missing values info_lost_by_col_rmv = df[col].count()/n_cols info_lost_by_rows_rmv = df[df[col].isna()].notna().sum(axis=1).sum()/n_cols if info_lost_by_col_rmv < info_lost_by_rows_rmv: df = df.drop(col, axis=1) # Drop the column else: df = df[df[col].notna()].reset_index(drop=True) # Drop all the rows containing missing values across "col" return df # In[14]: class LinearStochasticRegressor(LinearRegression): def __init__(self, target: str, train_df: pd.DataFrame, target_df: pd.DataFrame): super().__init__() self.target_df = target_df self.train_df = smart_dropna(train_df.copy()) self.target = self.train_df.pop(target).values extra_cols = np.setdiff1d(self.target_df.columns, self.train_df.columns) self.target_df = self.target_df.drop(extra_cols, axis=1).values self.train_df = self.train_df.values def fit_data(self): self.fit(self.train_df, self.target) def get_r2_score(self): return self.score(self.train_df, self.target) def add_random_error(self): # Residual sum of squares (Residual Variance): rrs = mean_squared_error(self.target, self.predict(self.train_df)) std_dev = np.sqrt(rrs) return np.random.normal(0, std_dev, size=len(self.target_df)) def predict_missing(self): return self.predict(self.target_df) + self.add_random_error() class TreeStochasticRegressor(HistGradientBoostingRegressor): def __init__(self, target: str, train_df: pd.DataFrame, target_df: pd.DataFrame): super().__init__(learning_rate=0.1, max_depth=5,) self.target_df = target_df.values self.train_df = train_df self.target = self.train_df.pop(target).values self.train_df = self.train_df.values def fit_data(self): self.fit(self.train_df, self.target) def get_r2_score(self): return self.score(self.train_df, self.target) def add_random_error(self): # Residual sum of squares (Residual Variance): rrs = mean_squared_error(self.target, self.predict(self.train_df)) std_dev = np.sqrt(rrs) return np.random.normal(0, std_dev, size=len(self.target_df)) def predict_missing(self): return self.predict(self.target_df) + self.add_random_error() # In[15]: # First, we assume that the data statistics are most accurate at the "Country_Name" granularity level # Hence, we'll utilize the low-level, in-distribution "Country_Name" data statistics for imputation when applicable; otherwise, we'll rely on the high-level, out-of-distribution data statistics based on the "Regional_Indicator" and "Year" granularity levels hopeless_TH = 0.9 # Threshold at which we can't sensibly fill in the missing values in a meaningful way without introducing any bias guess_TH = 0.7 # Threshold at which the missing values are hard (inaccurate) to predict (rely on high-level statistics) # In between are the thresholds at which the missing values can be predicted using farily complex predictive models estimate_TH = 0.25 # Threshold at which the missing values are simple to estimate/approximate (linearlly: utilize low-level statistics) grouped_df = happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True) order = ('hopeless', 'estimate', 'guess', 'predict') # Order at which the imputing mechanism is executed for step in order: for (country_name, country_region), country_group in grouped_df: country_group = country_group.select_dtypes('number') missing = country_group.isna().sum() if missing.any(): missing_ratio = missing[missing > 0]/len(country_group) region_df = happiness_df[happiness_df["Regional_Indicator"] == country_region].select_dtypes('number') if step == 'hopeless': hopeless_cols = missing_ratio[missing_ratio >= hopeless_TH].index for col in hopeless_cols: print(f'Column "{col}"`s values are almost completely missing for country: "{country_name}"') elif step == 'estimate': estimate_cols = missing_ratio[missing_ratio <= estimate_TH].index for col in estimate_cols: happiness_df.loc[happiness_df["Country_Name"] == country_name, col] = np.round(country_group .set_index("Year")[col] .interpolate(method="index") .bfill() .values, 3) elif step == 'guess': guess_cols = missing_ratio[missing_ratio >= guess_TH].index for col in guess_cols: missing = country_group[col].isna() # Boolean mask where rows contain missing values on "col" missing_years = country_group.loc[missing, "Year"] region_by_year = region_df.loc[region_df["Year"].isin(missing_years), ["Year", col]].groupby("Year") happiness_df.loc[(happiness_df["Country_Name"] == country_name) & missing, col] = np.round(region_by_year .mean() .values, 3) # Or median elif step == 'predict': # Train a predictive hypothesis function (model) to predict missing values: predict_cols = missing_ratio[(estimate_TH < missing_ratio) & (missing_ratio < guess_TH)].index for col in predict_cols: missing = country_group[col].isna() # Boolean mask where rows contain missing values on "col" target_df = country_group[missing].drop(col, axis=1) train_df = country_group[~missing] if len(smart_dropna(train_df)) < 5: # Not enough statistics to use in-distribution data missing_years = country_group.loc[missing, "Year"] region_by_year_df = region_df[region_df["Year"].isin(missing_years) & region_df[col].notna()] params = dict(target=col, train_df=region_by_year_df, target_df=target_df) else: params = dict(target=col, train_df=train_df, target_df=target_df) if target_df.isna().any().any(): model_name = "TSR" model = TreeStochasticRegressor(**params) else: model_name = "LSR" model = LinearStochasticRegressor(**params) model.fit_data() if model_name == "LSR": # If no robust linear correlation or if overfitting: if model.get_r2_score() < 0.8 or model.get_r2_score() == 1: model = TreeStochasticRegressor(**params) model.fit_data() happiness_df.loc[(happiness_df["Country_Name"] == country_name) & missing, col] = np.round(model.predict_missing(), 3) # Make sure we no longer have any missing values in our data frame: print(separator) print(f'Does our data frame contain any missing values? {happiness_df.isna().any().any()}') # # ## Cluster Our Data Frame Based on The Happiness_Index to Discover Underlying (latent) Happiness Levels (clusters) # In[16]: # Cluster our samples based on the "Happiness_Index" column: kmeans = KMeans(n_clusters=5, init='k-means++', n_init=15, max_iter=500) cluster_name = ['Sad', 'Somewhat Sad', 'Neurtal', 'Somewhat Happy', 'Happy'] clusters = kmeans.fit_predict(happiness_df[["Happiness_Index"]]) cluster_order = np.argsort(kmeans.cluster_centers_[:, 0]) cluster_mapping = {c_order: c_name for (c_order, c_name) in zip(cluster_order, cluster_name)} happiness_df['Happiness_Level'] = pd.Series([cluster_mapping[c] for c in clusters]).astype('category') # # ## Feature (Column) Understanding # ### Generate custom interactive visualizations # In[48]: pyg.walk(happiness_df, hideDataSourceConfig=True) # ### Perform univariate feature analysis: # In[17]: column_select = mr.Select(value="Perceptions_Of_Corruption", choices=happiness_df.columns, label="Select a column to investigte") # In[18]: kde_plot = sns.kdeplot(x=column_select.value, data=happiness_df, ax=None) plt.close() y_values = kde_plot.lines[0].get_ydata() x_values = kde_plot.lines[0].get_xdata() max_indx = np.argmax(y_values) max_prob = x_values[max_indx] mean = happiness_df[column_select.value].mean() diff = max_prob - mean if abs(diff) < happiness_df[column_select.value].std()/2.5: dist = "Nomarl" elif diff > 0: dist = "Left-Skewed" else: dist = "Right-Skewed" (ggplot(happiness_df) + geom_density(aes(x=column_select.value, y="..density..", fill=column_select.value), stat="density", quantile_lines=True, color="gray") +ggtitle(f'Historic distribution ({dist}) of {column_select.value} across all countries') +geom_vline( xintercept=mean, color="red", linetype="dashed", size=1) + geom_text( x=mean, y=np.max(y_values)/50, label='(mean)', color="#ff7f0e", size=9, ) +theme(panel_grid_major_x='blank') +ggsize(900, 500) ) # ### Breakdown how the countries from different happiness levels make up that column's distribution: # In[19]: (ggplot(happiness_df) +geom_density(aes(x=column_select.value, y="..density..", fill="Happiness_Level"), color="black", position="dodge", stat="density") +labs(y="Density", title=f"Distribution of {column_select.value} grouped by Happines Levels") +ggsize(900, 500) +theme(panel_grid_major_x='blank') ) # ### Further deepdive into that column's statistics: # In[20]: (ggplot(happiness_df, aes(x='Happiness_Level', y=column_select.value)) + geom_violin(aes(color='Happiness_Level', fill='Happiness_Level'), size=2, alpha=.5, scale='width', tooltips=layer_tooltips().line(f"{column_select.value}:|^y")) + geom_boxplot(aes(fill='Happiness_Level'), width=.2) + ggsize(900, 500)) # In[21]: (ggplot(happiness_df) + geom_bar(aes(x="Happiness_Level", y="..count..", fill="Regional_Indicator"), color="black", tooltips=layer_tooltips().line("^y")) + ggtitle("Chart") + ggsize(900, 500) +theme(panel_grid_major_x='blank') +labs(title="Number of countries in each happiness level grouped by region", y="Count")) # In[22]: column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select a column to investigte") # In[23]: means = happiness_df.groupby("Regional_Indicator", observed=True)[column_select.value].mean().reset_index() (ggplot() +geom_point(aes(x="Regional_Indicator", y=column_select.value, color="Happiness_Level"), data=happiness_df, shape=1, size=2.5) +geom_point(aes(x="Regional_Indicator", y=column_select.value, size=column_select.value,), fill="orange", color="black", data=means, shape=23) +ggsize(900, 500) + scale_size(range=[3, 7]) +theme(axis_text_x=element_text(angle=85)) + ggtitle(f"Historic distribution of the {column_select.value} in each region (yellow diamond: mean)") ) # In[24]: means = happiness_df.groupby("Year", observed=True)[column_select.value].mean().reset_index() years = np.sort(happiness_df["Year"].unique()) (ggplot() +geom_point(aes(x="Year", y=column_select.value, fill=column_select.value), shape=21, color="black", data=happiness_df) +geom_point(aes(x="Year", y=column_select.value, size=column_select.value,), fill="blue", color="black", data=means, shape=23) +ggsize(900, 500) + scale_fill_gradient2(midpoint=means[column_select.value].mean(), low='red', mid='yellow', high='green', guide=guide_colorbar(nbin=3, barwidth=10)) + scale_x_continuous(breaks=years) + scale_size(range=[3, 10], guide='none') +theme(axis_text_x=element_text(angle=30)) +ggtitle(f"Distribution of the overall {column_select.value} across the years (blue diamond: mean)") ) # In[25]: years = [int(y) for y in years] year_select = mr.Select(value=2021, choices=years, label="Select year of interest") # In[26]: happiness_sorted = happiness_df.loc[happiness_df["Year"] == year_select.value, ["Happiness_Index", "Country_Name", "Regional_Indicator"]].sort_values("Happiness_Index", ascending=False) happiest = happiness_sorted.head(5).copy() happiest["Feeling"] = "Happiest" saddest = happiness_sorted.tail(5).copy() saddest["Feeling"] = "Saddest" combined_df = pd.concat([happiest, saddest], ignore_index=True) (ggplot(combined_df) + geom_bar(aes(x="Country_Name", y="Happiness_Index", fill='Regional_Indicator', color="Feeling"), size=1.5, stat='identity', width=0.7, tooltips=layer_tooltips().line("Feeling:|@Feeling")) +ggsize(900, 500) +ggtitle(f"Top 5 happiest & saddest countries in {year_select.value} grouped by region") +theme(panel_grid_major_x='blank') +scale_color_manual(["green", "red"]) ) # In[27]: column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column of interest") # In[28]: col_change = (happiness_df[["Year", "Country_Name", column_select.value, "Regional_Indicator"]] .groupby(["Country_Name", "Regional_Indicator"], observed=True)[column_select.value] .pct_change() * 100) col_change = col_change.rename("Percent_Change") col_change = pd.concat([happiness_df, col_change], axis=1) col_change_by_year = col_change.sort_values("Year").groupby("Year", observed=True) max_col_change_by_year = col_change_by_year["Percent_Change"].idxmax() max_country_by_year = col_change.loc[max_col_change_by_year.dropna()] max_country_by_year["Year_Country"] = max_country_by_year["Year"].astype(str) + ":\n" + max_country_by_year["Country_Name"].astype(str) ( ggplot(max_country_by_year) + geom_bar(aes(x="Year_Country", y="Percent_Change", fill="Percent_Change"), stat='identity', color="black", tooltips=layer_tooltips().line("@Country_Name").line(f"Percent Change in {column_select.value}|= ^y%")) + scale_fill_gradient2(low='red', mid='white', high='darkgreen', midpoint=0) + ylim(max_country_by_year["Percent_Change"].min()-0.5, max_country_by_year["Percent_Change"].max()+0.5) +geom_hline( yintercept=0, color="pink", size=1) + ggsize(900, 500) + ggtitle(f"YoY maximum percent change in {column_select.value} observed in each year") +theme(axis_text_x=element_text(angle=70), panel_grid_major_x='blank') ) # ### Perform Bivariate feature analysis: # In[29]: (corr_plot(happiness_df.select_dtypes('number')) .points(type="full") .labels(type="full").build() + ggtitle("Tiles, points and labels") +ggsize(900, 600) +ggtitle("Overall Correlation Heatmap")) # Very surprising to see that there is no overall correlation at all between "Generosity" and "Gdp_Per_Caipta"! # In[30]: country_select = mr.Select(value="United Arab Emirates", choices=happiness_df["Country_Name"].unique(), label="Select country of interest") # In[31]: (corr_plot(happiness_df[happiness_df["Country_Name"] == country_select.value].select_dtypes('number')) .points(type="full") .labels(type="full").palette_BrBG().build() + ggtitle("Tiles, points and labels") +ggsize(900, 600) +ggtitle(f"{country_select.value}'s Correlation Heatmap")) # In[32]: columns_select = mr.MultiSelect(label="Select two features to investigate", value=["Logged_Gdp_Per_Capita", "Happiness_Index"], choices=happiness_df.columns) # In[33]: columns_pair = columns_select.value assert len(columns_pair) == 2, f"Please select exactly two columns, you've selected {len(columns_pair)}!" (joint_plot(happiness_df, x=columns_pair[0], y=columns_pair[1], color_by="Happiness_Level") + facet_wrap("Regional_Indicator", nrow=5, scales="free") + geom_smooth(color="gray") + ggsize(1000, 1200) + ggtitle(f"Overall Trend in {columns_pair[1]} Against {columns_pair[0]} Across Every Region Grouped by Happiness Levels") + labs(caption=f"This figure illustrates how the {columns_pair[1]} of the different happiness groups within each region reacts to changes in {columns_pair[0]}") +theme(legend_position="top", axis_title=element_text(margin=margin(t=20, b=30,l=30))) ) # In[34]: columns_select = mr.MultiSelect(label="Select two features to investigate", value=["Perceptions_Of_Corruption", "Happiness_Index"], choices=happiness_df.columns) # In[35]: columns_pair = columns_select.value assert len(columns_pair) == 2, f"Please select exactly two columns, you've selected {len(columns_pair)}!" (ggplot(happiness_df) + geom_smooth(aes(x=columns_pair[0], y=columns_pair[1], color="Happiness_Level"), size=1.3) + ggsize(900, 500) + ggtitle(f"How The {columns_pair[1]} varies against {columns_pair[0]} grouped by Happiness Levels worldwide") ) # In[36]: column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column to view trend overtime") # In[37]: region_happiness = happiness_df.groupby(["Regional_Indicator", "Year"]).agg({ column_select.value: "mean" }).reset_index() (ggplot(region_happiness) +geom_line(aes(x="Year", y=column_select.value, color="Regional_Indicator"), size=3, tooltips=layer_tooltips().line(f"{column_select.value}: ^y")) + facet_wrap("Regional_Indicator", nrow=5, scales="free") + ggtitle(f"Trend of average {column_select.value} over the years accross every region") +theme(legend_position="bottom") +scale_color_discrete(guide=guide_legend(ncol=2)) + scale_x_continuous(breaks=list(range(2005, 2022, 5)) + [2021]) + ggsize(1000, 1000) ) # In[38]: countries_of_interest = ["United Arab Emirates", "Egypt", "Finland", "United States", "Russia"] countries_select = mr.MultiSelect(value=countries_of_interest, choices=happiness_df["Country_Name"].unique(), label="Select countries of interest") column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column of interest") # In[39]: countries_df = happiness_df[happiness_df["Country_Name"].isin(countries_select.value)] ( ggplot(countries_df) + geom_point(aes(x="Year", y="Country_Name", fill=column_select.value, size=column_select.value), shape=25, color="black") + scale_fill_gradient2(midpoint=happiness_df[column_select.value].mean(), low='#d7191c', mid='yellow', high='#2b83ba', guide=guide_colorbar(nbin=5, barwidth=10)) + scale_size(range=[3, 10], guide='none') + ggsize(900, 500) + scale_x_continuous(breaks=years) +theme(axis_text_x=element_text(angle=70)) ) # In[40]: column_select = mr.Select(value="Happiness_Index", choices=happiness_df.columns, label="Select column of interest to view worldwide") # In[41]: happiness_by_country = happiness_df.groupby(["Country_Name", "Regional_Indicator"], observed=True).agg({ column_select.value: "mean", }).reset_index() happiness_by_country = happiness_by_country[happiness_by_country["Country_Name"] != "Hong Kong S.A.R. of China"] happiness_by_country["Country_Name"] = happiness_by_country["Country_Name"].astype('object') happiness_by_country["Country_Name"] = happiness_by_country["Country_Name"].replace({"North Cyprus": "Cyprus", "Palestinian Territories": "Palestine", "Taiwan Province of China": "Taiwan"}) countries_gcoder = geocode_countries(happiness_by_country["Country_Name"]) (ggplot() + geom_livemap(location=[53, 24], zoom=4) + geom_polygon(aes(fill=column_select.value), data=happiness_by_country, map=countries_gcoder.get_boundaries(), map_join=[["Country_Name"], ["country"]], alpha=0.7, color="gray", size=0.5, tooltips=layer_tooltips().line('@Country_Name').line(f'{column_select.value}:| @{column_select.value}')) + theme(legend_position='top') + ggsize(900, 600) + scale_fill_gradient2(midpoint=happiness_by_country[column_select.value].mean(), low='red', mid='yellow', high='green', guide=guide_colorbar(nbin=5)) ) # # ## Model Training and Evaluation # ### Feature Engineering: Prepare data to be digestible appropriately by machine learning models for training: # In[42]: def process_df_features(df: pd.DataFrame, ignore: Union[list, tuple]=(), drop: Union[list, tuple]=()): df = df.copy() numerical_preprocessor = StandardScaler() categorical_preprocessor = OneHotEncoder(sparse_output=False, handle_unknown="ignore") for col in df.columns: if col in drop: df = df.drop(col, axis=1) elif col in ignore: continue elif df[col].dtype in (int, 'category', bool): unique_vals = np.sort(df[col].unique()) print(f'Column "{col}`s" unique values are: {unique_vals}') df[col] = df[col].astype('category') onehot_enc = categorical_preprocessor.fit_transform(df[[col]]) onehot_df = pd.DataFrame(onehot_enc, columns=[f"{col}_{val}" for val in unique_vals]) df = pd.concat([df.drop(col, axis=1), onehot_df], axis=1) elif df[col].dtype == float: df[col] = numerical_preprocessor.fit_transform(df[[col]]) return df # In[43]: processed_df = process_df_features(happiness_df, drop=["Country_Name", "Happiness_Level"]) target = processed_df["Happiness_Index"].values data = processed_df.drop('Happiness_Index', axis=1).values x_train, x_test, y_train, y_test = train_test_split(data, target, train_size=0.85) # If necessary, we can conduct a grid search on the model's hyperparameters: """ grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=False, scoring='r2', n_jobs = -1) grid_search.fit(x_train, y_train) grid_search.best_params_ """ params = dict(learning_rate=0.05, max_depth=10) models = dict(linear=LinearRegression(), XGBoost=XGBRegressor(), hist_boosting=HistGradientBoostingRegressor(**params), random_forest=RandomForestRegressor(n_estimators=100)) scores = ['r2', 'neg_mean_absolute_error'] print('\nStart training our models:\n') for (model_name, model) in tqdm(models.items()): print(f"Model: {model_name}") model.fit(x_train, y_train) cv_results = cross_validate(model, data, target, cv=5, scoring=scores) print(f"\tModel's CV average R2 score: {cv_results['test_r2'].mean()*100:.4}%") print(f"\tModel's CV average MSE loss value: {abs(cv_results['test_neg_mean_absolute_error']).mean():.4}") print(f"\tModel's test R2 score: {model.score(x_test, y_test)*100:.4}%\n")