In this notebook we use the new Sunburst plot by plotly to illustrate how the World population is split among regions and countries. The data set illustrated here originates from the World Bank. This notebook is also a quick demo of the world_bank_data Python package.
import pandas as pd
import plotly
import plotly.offline as offline
import world_bank_data as wb
def version_to_int_list(version):
    """Return the numeric release part of a dotted version string as a list of ints.

    Only the leading digits of each dot-separated component are used, and
    parsing stops at the first component that carries a non-numeric suffix
    (or has no leading digits at all). Pre-release, development and local
    version strings therefore no longer raise ``ValueError``:

    >>> version_to_int_list('3.8.0')
    [3, 8, 0]
    >>> version_to_int_list('4.0.0rc1')
    [4, 0, 0]
    >>> version_to_int_list('4.1.0.dev0')
    [4, 1, 0]

    Note that a pre-release compares equal to its final release here, which
    is good enough for a minimum-version check.

    :param version: version string such as ``plotly.__version__``
    :return: list of integers, suitable for lexicographic comparison
    """
    import re

    numbers = []
    for component in version.split('.'):
        component = component.strip()
        leading_digits = re.match(r'[0-9]+', component)
        if leading_digits is None:
            # e.g. the 'dev0' of '4.1.0.dev0': the numeric release part is over
            break
        numbers.append(int(leading_digits.group()))
        if leading_digits.end() != len(component):
            # A suffix such as 'rc1' or '+local' ends the release part
            break
    return numbers
# Keep the DataFrame previews in this notebook short
pd.set_option('display.max_rows', 12)

# Stop right away on a plotly release that is older than the minimum
# stated in the assertion message
assert version_to_int_list('3.8.0') <= version_to_int_list(plotly.__version__), \
    'Sunburst plots require Plotly >= 3.8.0'

# Initialise plotly's offline mode for notebooks
offline.init_notebook_mode()
# Countries and associated regions
# The table displayed below is indexed by the three-letter code (`id`). It also
# contains aggregate entries (e.g. AFR), whose `region` is 'Aggregates'.
countries = wb.get_countries()
countries
iso2Code | name | region | adminregion | incomeLevel | lendingType | capitalCity | longitude | latitude | |
---|---|---|---|---|---|---|---|---|---|
id | |||||||||
ABW | AW | Aruba | Latin America & Caribbean | High income | Not classified | Oranjestad | -70.0167 | 12.51670 | |
AFG | AF | Afghanistan | South Asia | South Asia | Low income | IDA | Kabul | 69.1761 | 34.52280 |
AFR | A9 | Africa | Aggregates | Aggregates | Aggregates | NaN | NaN | ||
AGO | AO | Angola | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IBRD | Luanda | 13.2420 | -8.81155 |
ALB | AL | Albania | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IBRD | Tirane | 19.8172 | 41.33170 |
AND | AD | Andorra | Europe & Central Asia | High income | Not classified | Andorra la Vella | 1.5218 | 42.50750 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
XKX | XK | Kosovo | Europe & Central Asia | Europe & Central Asia (excluding high income) | Upper middle income | IDA | Pristina | 20.9260 | 42.56500 |
XZN | A5 | Sub-Saharan Africa excluding South Africa and ... | Aggregates | Aggregates | Aggregates | NaN | NaN | ||
YEM | YE | Yemen, Rep. | Middle East & North Africa | Middle East & North Africa (excluding high inc... | Low income | IDA | Sana'a | 44.2075 | 15.35200 |
ZAF | ZA | South Africa | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Upper middle income | IBRD | Pretoria | 28.1871 | -25.74600 |
ZMB | ZM | Zambia | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | IDA | Lusaka | 28.2937 | -15.39820 |
ZWE | ZW | Zimbabwe | Sub-Saharan Africa | Sub-Saharan Africa (excluding high income) | Lower middle income | Blend | Harare | 31.0672 | -17.83120 |
304 rows × 9 columns
# Population dataset, by the World Bank (most recent value)
# With these arguments the series displayed below is indexed by
# Country, Series and Year, using full names rather than codes.
population = wb.get_series('SP.POP.TOTL', mrv=1)
population
Country Series Year Arab World Population, total 2018 4.197906e+08 Caribbean small states Population, total 2018 7.358965e+06 Central Europe and the Baltics Population, total 2018 1.025119e+08 Early-demographic dividend Population, total 2018 3.249141e+09 East Asia & Pacific Population, total 2018 2.328221e+09 East Asia & Pacific (excluding high income) Population, total 2018 2.081652e+09 ... Vietnam Population, total 2018 9.554040e+07 Virgin Islands (U.S.) Population, total 2018 1.069770e+05 West Bank and Gaza Population, total 2018 4.569087e+06 Yemen, Rep. Population, total 2018 2.849869e+07 Zambia Population, total 2018 1.735182e+07 Zimbabwe Population, total 2018 1.443902e+07 Name: SP.POP.TOTL, Length: 264, dtype: float64
# Same data set, indexed with the country code
# The series displayed below has a single index level holding three-letter
# codes (no Series or Year level). This replaces the name-indexed series
# fetched previously.
population = wb.get_series('SP.POP.TOTL', id_or_value='id', simplify_index=True, mrv=1)
population
Country ARB 4.197906e+08 CSS 7.358965e+06 CEB 1.025119e+08 EAR 3.249141e+09 EAS 2.328221e+09 EAP 2.081652e+09 ... VNM 9.554040e+07 VIR 1.069770e+05 PSE 4.569087e+06 YEM 2.849869e+07 ZMB 1.735182e+07 ZWE 1.443902e+07 Name: SP.POP.TOTL, Length: 264, dtype: float64
# Aggregate region, country and population
# Entries whose region is 'Aggregates' are not countries, so they are dropped first.
# The filter is applied before the rename so that `df` is the new frame returned
# by `rename`, not a filtered slice of another frame: adding a column to such a
# slice is what triggers pandas' SettingWithCopyWarning.
is_country = countries.region != 'Aggregates'
df = countries.loc[is_country, ['region', 'name']].rename(columns={'name': 'country'})

# Both objects are indexed by the three-letter country code, so the assignment
# aligns on that index; countries without a population figure get NaN.
df['population'] = population
df
region | country | population | |
---|---|---|---|
id | |||
ABW | Latin America & Caribbean | Aruba | 105845.0 |
AFG | South Asia | Afghanistan | 37172386.0 |
AGO | Sub-Saharan Africa | Angola | 30809762.0 |
ALB | Europe & Central Asia | Albania | 2866376.0 |
AND | Europe & Central Asia | Andorra | 77006.0 |
ARE | Middle East & North Africa | United Arab Emirates | 9630959.0 |
... | ... | ... | ... |
WSM | East Asia & Pacific | Samoa | 196130.0 |
XKX | Europe & Central Asia | Kosovo | 1845300.0 |
YEM | Middle East & North Africa | Yemen, Rep. | 28498687.0 |
ZAF | Sub-Saharan Africa | South Africa | 57779622.0 |
ZMB | Sub-Saharan Africa | Zambia | 17351822.0 |
ZWE | Sub-Saharan Africa | Zimbabwe | 14439018.0 |
218 rows × 3 columns
# The sunburst plot is fed with one row per node: its parent (a region, 'World',
# or '' for the root), its label, and its weight (values). A fourth column, text,
# holds the population formatted with thousands separators.
columns = ['parents', 'labels', 'values']

# Countries: (region, country, population) maps directly onto (parents, labels, values)
level1 = df.copy()
level1.columns = columns
level1['text'] = level1['values'].map('{:,.0f}'.format)

# Regions: one row per region, attached to 'World'
level2 = df.groupby('region', as_index=False).population.sum()
level2 = level2.rename(columns={'region': 'labels', 'population': 'values'})
level2.insert(0, 'parents', 'World')
# The regional total is kept as text only; the numeric value is reset to zero
level2['text'] = level2['values'].map('{:,.0f}'.format)
level2['values'] = 0

# Root: a single 'World' row with no parent, labelled with the World Bank's own total
level3 = pd.DataFrame(
    [['', 'World', 0.0, '{:,.0f}'.format(population.loc['WLD'])]],
    columns=columns + ['text'])

# Stack the three levels and renumber the rows from zero
all_levels = pd.concat([level1, level2, level3], ignore_index=True)
all_levels
parents | labels | values | text | |
---|---|---|---|---|
0 | Latin America & Caribbean | Aruba | 105845.0 | 105,845 |
1 | South Asia | Afghanistan | 37172386.0 | 37,172,386 |
2 | Sub-Saharan Africa | Angola | 30809762.0 | 30,809,762 |
3 | Europe & Central Asia | Albania | 2866376.0 | 2,866,376 |
4 | Europe & Central Asia | Andorra | 77006.0 | 77,006 |
5 | Middle East & North Africa | United Arab Emirates | 9630959.0 | 9,630,959 |
... | ... | ... | ... | ... |
220 | World | Latin America & Caribbean | 0.0 | 641,357,515 |
221 | World | Middle East & North Africa | 0.0 | 448,912,859 |
222 | World | North America | 0.0 | 364,290,258 |
223 | World | South Asia | 0.0 | 1,814,388,744 |
224 | World | Sub-Saharan Africa | 0.0 | 1,074,853,734 |
225 | World | 0.0 | 7,594,270,356 |
226 rows × 4 columns
# And now we can plot the World Population
# The series was requested with mrv=1 (most recent value), and the values
# displayed earlier in this notebook are for 2018, so the title states 2018
# (it previously said 2017). Update the year when re-running on newer data.
offline.iplot(
    dict(
        # Each column of all_levels (parents, labels, values, text) is passed
        # as a keyword of the sunburst trace
        data=[dict(type='sunburst', hoverinfo='text', **all_levels)],
        layout=dict(
            title='World Population (World Bank, 2018)<br>Click on a region to zoom',
            width=800,
            height=800,
        ),
    ),
    validate=False,
)