Malnutrition in the World

The data is loaded from CSV files in the lets-plot-docs repository (see the read_csv calls below).

In [1]:
import pandas as pd
import geopandas as gpd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from lets_plot import *
from lets_plot.bistro.corr import *
LetsPlot.setup_html()
In [2]:
cwa_df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/"
                     "master/data/malnutrition/country_wise_average.csv")
In [3]:
me_df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/"
                    "master/data/malnutrition/malnutrition_estimates.csv")
# Keep only the most recent record for each country.
me_df = me_df.sort_values(by='Year', ascending=False).drop_duplicates(subset='Country')
me_df = me_df[['Country', 'ISO code', 'LDC', 'LIFD', 'LLDC or SID2']]
In [4]:
# Merge the two datasets and normalize the case of the country names.
df = cwa_df.merge(me_df, on='Country')
df.Country = df.Country.apply(lambda country_name: country_name.capitalize())
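To get a quick look at the merged data, it may help to preview a few rows (this cell is not part of the original notebook):

df.head()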

A Bit of Correlation Analysis

Let's look at the pairwise correlation coefficients of the numeric columns of the dataframe.

In [5]:
corr_plot(df).tiles().palette_RdBu().build() + ggsize(600, 600)
Out[5]:
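The tiled matrix is just one of the renderings offered by the corr_plot builder. As far as I know, the same correlations can also be shown as points or as value labels, for example:

corr_plot(df).points().labels().palette_RdBu().build() + ggsize(600, 600)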

Choropleth Maps

On each map below, the countries are split into two groups so that the total number of children affected by the given disorder in the red countries roughly equals that in the blue countries (the split itself is illustrated with a toy example after the plotting function). Some of the indicator values are missing, so let's first fill them in with a random forest regressor.

In [6]:
cat_cols = ['Income Classification', 'LDC', 'LIFD', 'LLDC or SID2']
num_cols = ['U5 Population (\'000s)']
features = cat_cols + num_cols
targets = ['Severe Wasting', 'Wasting', 'Overweight', 'Stunting', 'Underweight']
# Impute the targets in order of increasing number of missing values,
# reusing every already-imputed column as a feature for the next one.
targets = list(df[targets].isnull().sum().sort_values().keys())

for target in targets:
    df_train = df[~df[target].isnull()]
    df_test = df[df[target].isnull()]
    X, y = df_train[features], df_train[target]
    X_test = df_test[features]

    # Random forest with a small grid search over the number of trees and their depth.
    parameters = dict(n_estimators=[10, 20, 30, 60], max_depth=[1, 2, 3, 4, 5, 6, 7])
    model = GridSearchCV(RandomForestRegressor(), parameters)
    model.fit(X, y)
    y_pred = model.predict(X_test)

    # Fill the missing values of the current target with the predictions.
    df[target] = df[target].fillna(pd.Series(y_pred, index=X_test.index))

    features = features + [target]
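As a quick sanity check (not in the original notebook), we can confirm that the loop above left no missing values in the indicator columns:

df[targets].isnull().sum()  # all zeros are expected here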
In [7]:
def plot_world_decomposition(df, target):
    target_abs = target + ' ABS'
    target_bh = target + ' BH'

    df_copy = df.copy()
    # Absolute number of affected children (in thousands) per country.
    df_copy[target_abs] = df_copy[target] * df_copy['U5 Population (\'000s)'] / 100
    df_copy = df_copy.sort_values(by=target_abs, ascending=False)
    # The 'bigger half': countries whose cumulative total stays below half of the world total,
    # plus the country that crosses the halfway point.
    df_copy[target_bh] = df_copy[target_abs].cumsum() < df_copy[target_abs].sum() / 2
    df_copy.loc[df_copy[target_bh].ne(True).idxmax(), target_bh] = True
    df_copy[target_bh] = df_copy[target_bh].apply(lambda r: 'Bigger half' if r else 'Smaller half')

    sifted_df = df_copy[['ISO code', target_abs, target_bh]]
    # NB: gpd.datasets is deprecated and removed in newer GeoPandas releases;
    # this call assumes an older version that still ships 'naturalearth_lowres'.
    world_gdf = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))[['iso_a3', 'geometry', 'name']]
    merged_df = gpd.GeoDataFrame(sifted_df.merge(world_gdf, how='right', left_on='ISO code', right_on='iso_a3'))
    merged_df[target_abs] = merged_df[target_abs].fillna(0)
    merged_df[target_bh] = merged_df[target_bh].fillna('Not in statistics')

    return ggplot() + \
        geom_polygon(aes(fill=target_bh, size='name', alpha=target_abs), \
                     data=merged_df, color='black', \
                     tooltips=layer_tooltips().title('@name')\
                                              .line("{0}|@{{{1}}}".format(target.lower(), target_abs))) + \
        scale_size(name='Country', range=[.3, .3]) + \
        scale_alpha(name=target, range=[.6, 1]) + \
        scale_fill_manual(name=target, values=['#0571b0', '#bababa', '#ca0020']) + \
        ggtitle('%s in the World' % target) + \
        ggsize(600, 450) + \
        theme_classic() + theme(axis='blank', legend_position='none')
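To see how the 'bigger half' split inside this function works, here is a minimal sketch on a made-up series of absolute counts (the numbers are purely illustrative):

counts = pd.Series([30, 25, 20, 15, 10], index=list('ABCDE'))  # already sorted in descending order
bigger_half = counts.cumsum() < counts.sum() / 2   # countries strictly before the halfway point
bigger_half[bigger_half.ne(True).idxmax()] = True  # plus the country that crosses it
print(counts[bigger_half].sum(), counts[~bigger_half].sum())  # 55 45 -- the two groups are roughly equal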
In [8]:
plot_world_decomposition(df, 'Severe Wasting')
Out[8]:
In [9]:
plot_world_decomposition(df, 'Wasting')
Out[9]:
In [10]:
plot_world_decomposition(df, 'Overweight')
Out[10]: