from sys import executable
!{executable} -m pip install colorcet
Requirement already satisfied: colorcet in /home/asmirnov/Applications/miniconda3/envs/lets-plot-docs/lib/python3.10/site-packages (3.1.0)
import numpy as np
import pandas as pd
import colorcet as cc
from lets_plot import *
from lets_plot.geo_data import *
The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).
# One-time lets-plot setup for HTML output (presumably required before plots render
# in the notebook -- confirm against the lets-plot documentation).
LetsPlot.setup_html()
def continuous_color_scale(name=None):
    """Build the brewer scale used for continuous values on the `paint_a` aesthetic.

    Calls `scale_brewer` with type 'seq' and the 'Blues' palette.

    :param name: scale name forwarded unchanged to `scale_brewer` (None by default).
    :return: whatever `scale_brewer` returns.
    """
    return scale_brewer('paint_a', name=name, type='seq', palette='Blues')
def discrete_color_scale(name=None):
    """Build the brewer scale used for categorical values on the `paint_a` aesthetic.

    Calls `scale_brewer` with type 'qual' and the 'Set2' palette.

    :param name: scale name forwarded unchanged to `scale_brewer` (None by default).
    :return: whatever `scale_brewer` returns.
    """
    return scale_brewer('paint_a', name=name, type='qual', palette='Set2')
def get_counts_df(local_df, *, column, column_name=None):
    """Count how often each value of `column` occurs in `local_df`.

    :param local_df: DataFrame to count in.
    :param column: name of the column whose values are counted.
    :param column_name: name for the value column in the result; falls back to
        `column` when omitted (or otherwise falsy).
    :return: DataFrame with two columns -- the counted values and 'count' -- in
        the order produced by `Series.value_counts`.
    """
    vc_df = local_df[column].value_counts().to_frame('count')
    vc_df.index.name = column_name if column_name else column
    # Turn the counted values from the index into an ordinary column.
    vc_df = vc_df.reset_index()
    return vc_df
# Load the Nobel laureates dataset from the lets-plot-docs repository.
df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/nobel.csv")

# Every distinct, non-missing value found in the three country columns.
# De-duplication keeps first-appearance order: going through set() would make the
# order depend on per-process string hashing, and the colour assigned to each country
# later in this file is derived from the order of the geocoding result.
# NOTE(review): assumes the geocoder returns rows in request order -- confirm.
all_countries = pd.concat(
    [df.born_country_code, df.died_country_code, df.country_of_university],
    ignore_index=True,
).dropna().drop_duplicates()

# ignore_not_found(): going by its name, unresolved values are skipped rather than raising.
geocoded_countries_df = geocode_countries(all_countries).ignore_not_found().get_geocodes()
# Lookup table: requested value ('country' column) -> name the geocoder found for it.
geocoded_countries_dict = geocoded_countries_df.set_index('country')['found name'].to_dict()
# Replace the raw country values with the names found by the geocoder. Series.map
# yields NaN for every value missing from the mapping, which is what the explicit
# "unresolved value -> NaN" dictionaries used to spell out for each column.
for country_column in ('born_country_code', 'died_country_code', 'country_of_university'):
    df[country_column] = df[country_column].map(geocoded_countries_dict)

df = df.rename(columns={
    'born_country_code': 'born_country',
    'died_country_code': 'died_country',
    'share': 'prize_share',
})
# Start year of the decade; integer floor division avoids a float round-trip.
df['decade'] = df.year // 10 * 10
df['fullname'] = df.firstname + ' ' + df.surname
df.head()
firstname | surname | born_country | died_country | gender | year | category | prize_share | name_of_university | city_of_university | country_of_university | born_month | age | age_get_prize | decade | fullname | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Wilhelm Conrad | Röntgen | Germany | Germany | male | 1901 | physics | 1 | Munich University | Munich | Germany | Mar | 78 | 56 | 1900 | Wilhelm Conrad Röntgen |
1 | Hendrik A. | Lorentz | Netherlands | Netherlands | male | 1902 | physics | 2 | Leiden University | Leiden | NaN | Jul | 75 | 49 | 1900 | Hendrik A. Lorentz |
2 | Pieter | Zeeman | Netherlands | Netherlands | male | 1902 | physics | 2 | Amsterdam University | Amsterdam | NaN | May | 78 | 37 | 1900 | Pieter Zeeman |
3 | Henri | Becquerel | France | France | male | 1903 | physics | 2 | Ăcole Polytechnique | Paris | France | Dec | 56 | 51 | 1900 | Henri Becquerel |
4 | Pierre | Curie | France | France | male | 1903 | physics | 4 | Ăcole municipale de physique et de chimie indu... | Paris | France | May | 47 | 44 | 1900 | Pierre Curie |
# Rows that have a university country, reduced to one row per
# (country_of_university, year, category) combination.
country_prizes_df = (
    df[df.country_of_university.notna()]
    .drop_duplicates(subset=['country_of_university', 'year', 'category'])
    .rename(columns={'country_of_university': 'country'})
)

# One row per distinct full name.
laureates_df = df.drop_duplicates(subset=['fullname'])

# Split the rows with a known country of death by whether it equals the country of birth.
died_country_known = df.died_country.notna()
born_where_died = df.born_country == df.died_country
migration_key = ['born_country', 'died_country', 'fullname']
not_migrated_laureates_df = df[died_country_known & born_where_died].drop_duplicates(subset=migration_key)
migrated_laureates_df = df[died_country_known & ~born_where_died].drop_duplicates(subset=migration_key)

# Ascending list of the decades present in the data (used as axis breaks below).
decades = sorted(df.decade.unique())
N = 10
# One fixed colour per geocoded country name, so a country keeps its colour in every chart.
countries_colors = {country: cc.palette['glasbey_dark'][i]
                    for i, country in enumerate(geocoded_countries_dict.values())}

# (source frame, column to count, wording used in the tooltip and the title)
chart_specs = [
    (country_prizes_df, 'country', 'Nobel prizes'),
    (not_migrated_laureates_df, 'born_country', 'non migrated laureates'),
    (migrated_laureates_df, 'died_country', 'immigrated laureates'),
    (migrated_laureates_df, 'born_country', 'emigrated laureates'),
]

plots = []
for d, column, counted_name in chart_specs:
    local_df = get_counts_df(d, column=column, column_name='country')
    # Series.map also works on an empty frame and leaves a country without a colour
    # as NaN; np.vectorize(dict.get) raises on empty input and casts None to 'None'.
    local_df['color'] = local_df.country.map(countries_colors)
    plots.append(
        ggplot(local_df)
        + geom_bar(
            aes(x='country', y='count', color='color', fill='color'),
            stat='identity',
            sampling=sampling_pick(N),
            alpha=.75,
            show_legend=False,
            tooltips=layer_tooltips().line('@country').line('{0} number|@count'.format(counted_name)),
        )
        # The 'color' column already holds colour values, so use them as they are.
        + scale_color_identity() + scale_fill_identity()
        + ggtitle('Top {0} Countries by {1}'.format(N, counted_name.title()))
        + theme(axis_text_x='blank', axis_ticks_x='blank')
    )

# Lay the four charts out on a 2 x 2 grid of w x h cells.
w, h = 400, 300
bunch = GGBunch()
bunch.add_plot(plots[0], 0, 0, w, h)
bunch.add_plot(plots[1], w, 0, w, h)
bunch.add_plot(plots[2], 0, h, w, h)
bunch.add_plot(plots[3], w, h, w, h)
bunch.show()
Obviously, the US is the absolute champion in the Nobel race. A great deal of its success is due to immigrant scientists.
Also here we see that many Nobel laureates have left Poland, Germany and the UK. However, the US, the UK and Germany take top positions regardless of brain drain.
# Label each move as "<born country> -> <died country>". The separator had degraded to
# mojibake; the arrow (U+2192) is written as an escape so it survives re-encoding.
migrated_laureates_df['migration'] = (
    migrated_laureates_df.born_country + ' \u2192 ' + migrated_laureates_df.died_country
)
migration_df = get_counts_df(migrated_laureates_df, column='migration')

# Only directions that occur more than once are plotted.
(
    ggplot(migration_df[migration_df['count'] > 1])
    + geom_bar(
        aes(x='migration', y='count', paint_a='count'),
        stat='identity',
        color='#08306b',
        fill_by='paint_a',
        show_legend=False,
        tooltips=layer_tooltips().line('@migration').line('migrated laureates number|@count'),
    )
    + continuous_color_scale()
    + ggtitle('Popular Migration Directions for Nobel Laureates')
    + theme(axis_text_x='blank', axis_ticks_x='blank')
)
The most popular direction of migration is from the UK to the US.
Apart from moves to the US, another popular migration route for scientists is from Poland to Germany.
countries_df = get_counts_df(country_prizes_df, column='country')

# Countries, in value_counts order, whose running prize total stays below
# three quarters of all rows in country_prizes_df.
prize_threshold = 3 * country_prizes_df.shape[0] / 4
below_threshold = countries_df['count'].cumsum() < prize_threshold
top_countries = countries_df.loc[below_threshold, 'country'].values

# 'half' keeps the country name for the top countries and lumps the rest into 'Other';
# the categorical fixes the order: top countries first, 'Other' last.
country_prizes_df['half'] = pd.Categorical(
    np.where(country_prizes_df.country.isin(top_countries), country_prizes_df.country, 'Other'),
    list(top_countries) + ['Other'],
)

(
    ggplot(country_prizes_df.sort_values(by="half"))
    + geom_bar(aes("decade", paint_a="half"), position='fill', fill_by='paint_a')
    + scale_x_continuous(breaks=decades, labels=list(map(str, decades)))
    + ylab('proportion of prizes')
    + discrete_color_scale(name='country')
    + ggtitle('Prize Proportion between Top Countries and Others')
)
3/4 of all the Nobel prizes ever awarded belong to the US, the UK and Germany. But the situation changes over time, mostly in favor of the US and not in favor of Germany.
country_boundaries_gdf = geocode_countries().get_boundaries()

# Per birth country: the non-null count of the first remaining column, as 'count'.
born_country_counts = (
    laureates_df.groupby('born_country').count().iloc[:, 0].to_frame('count').reset_index()
)

(
    ggplot()
    + geom_map(
        aes(paint_a='count'),
        data=born_country_counts,
        map=country_boundaries_gdf,
        # Join the data column 'born_country' to the map column 'country'.
        map_join=('born_country', 'country'),
        fill_by='paint_a',
        tooltips=layer_tooltips().line('@born_country').line('laureates number|@count'),
    )
    + continuous_color_scale()
    + ggtitle('Distribution of Nobel Laureates in the World')
    + theme_void()
)
Here we see that the Nobel committee prefers to acknowledge the achievements of Western science and ignore almost the whole of Africa.
N = 10
# The N most frequent university names in country_prizes_df.
top_universities = country_prizes_df.name_of_university.value_counts().index[:N]
in_top_universities = country_prizes_df.name_of_university.isin(top_universities)

(
    ggplot(country_prizes_df[in_top_universities])
    + geom_bar(
        aes(x='name_of_university', group='category', paint_a='category'),
        fill_by='paint_a',
        tooltips=layer_tooltips().line('^x').line('@|@category').line('prizes number|^y'),
    )
    + discrete_color_scale()
    + xlab('university')
    + ggtitle('Top {0} Universities by Prize Number'.format(N))
    + theme(axis_text_x='blank', axis_ticks_x='blank')
)
Most top universities pay attention to a wide range of scientific disciplines, but some specialize in particular areas.
# Three bar charts over laureates_df (one row per distinct full name):
# overall, by category, and by decade -- each filled by gender.
p1 = (
    ggplot(laureates_df)
    + geom_bar(aes(x='gender', paint_a='gender'), fill_by='paint_a')
    + discrete_color_scale()
    + ggtitle('Gender Ratio')
)
p2 = (
    ggplot(laureates_df)
    + geom_bar(aes(x='category', group='gender', paint_a='gender'), fill_by='paint_a')
    + discrete_color_scale()
    + ggtitle('Gender Ratio by Category')
)
p3 = (
    ggplot(laureates_df)
    + geom_bar(aes(x='decade', group='gender', paint_a='gender'), fill_by='paint_a')
    + discrete_color_scale()
    + scale_x_discrete(labels=df.decade.unique().astype(str))
    + ggtitle('Gender Ratio by Decade')
)

# Stack the three charts vertically, each w x h.
w, h = 600, 300
bunch = GGBunch()
for row, plot in enumerate((p1, p2, p3)):
    bunch.add_plot(plot, 0, row * h, w, h)
bunch.show()
We see not only inequality in gender but also a slow change of this trend through the years, except the 1910s and 1950s.
The best possible female/male ratio is seen in peace and literature.
# Number of rows of df per prize category.
(
    ggplot(df)
    + geom_bar(aes(x='category', paint_a='category'), fill_by='paint_a')
    + discrete_color_scale()
    + ggtitle('Nobel Prizes by Categories')
)
Not all categories feature the same number of laureates, mostly due to prize sharing in collective research.
# Axis breaks are the distinct prize_share values; a share of b is labelled '1/b',
# except that 1 is labelled plainly as '1'.
breaks = sorted(df.prize_share.unique())
labels = []
for b in breaks:
    labels.append('1' if b == 1 else '1/{0}'.format(b))

(
    ggplot(df)
    + geom_bar(aes(x='prize_share', group='category', paint_a='category'), fill_by='paint_a')
    + scale_x_continuous(name='prize share', breaks=breaks, labels=labels)
    + discrete_color_scale()
    + ggtitle('Sharing Prizes')
)
In most cases the winner gets the full prize or half of it. For peace and especially for literature, it is unusual to share your prize with someone.
# Per (year, category): the 'decade' column holds the row count and
# 'age_get_prize' holds the mean prize age.
year_category_df = (
    df.groupby(['year', 'category'])
    .agg({'decade': 'count', 'age_get_prize': 'mean'})
    .reset_index()
)

(
    ggplot(year_category_df)
    + geom_point(
        aes(x='year', y='category', size='decade', paint_a='age_get_prize'),
        color_by='paint_a',
        shape=15,
        tooltips=(
            layer_tooltips()
            .line('laureates number|^size')
            .line('laureates mean age|^paint_a')
            .line('@|@year')
            .line('@|@category')
        ),
    )
    + scale_x_continuous(breaks=decades, labels=list(map(str, decades)))
    + scale_size(range=[1, 2], guide='none')
    + continuous_color_scale(name='age get prize')
    + ggtitle('Nobel Prizes by Year and Category')
    + ggsize(900, 200)
)
Throughout the years we see gaps in Nobel prizes awarded for some categories, especially peace. Also there is one big common gap during World War II.
Finally we see that the Nobel prize for economics was first awarded in the 1970s.
# Density of the 'age' column over laureates_df.
(
    ggplot(laureates_df)
    + geom_density(aes(x='age'))
    + ggtitle('Death Age Distribution of Nobel Laureates')
)
The mean age of death for Nobel laureates is 85 years.
It wouldn't be too bad to achieve the same life span!
# Histogram of 'age_get_prize' with a bin width of 5 and a bin boundary at 22.
(
    ggplot(df)
    + geom_histogram(aes(x='age_get_prize'), binwidth=5, boundary=22)
    + ggtitle('Distribution of Nobel Prize Winners Age')
)
The mean age of winning the Nobel prize is 60 years.
# 2D bins of 'age' against 'age_get_prize', filled by bin count, one panel per gender.
(
    ggplot(df, aes(x='age', y='age_get_prize'))
    + geom_bin2d(
        aes(paint_a='..count..'),
        size=.2,
        color='#08306b',
        fill_by='paint_a',
        binwidth=[5, 5],
    )
    + continuous_color_scale()
    + facet_grid(x='gender')
    + ggtitle('Common Distribution of Death Age and Getting the Prize Age')
)
This graph confirms our previous conclusions.
# Box plot of 'age_get_prize' per category, one panel per gender.
(
    ggplot(df)
    + geom_boxplot(aes(x='category', y='age_get_prize', paint_a='category'), fill_by='paint_a')
    + facet_grid(x='gender')
    + discrete_color_scale()
    + ggtitle('Aggregated Information About Age by Category and Gender')
)
If we consider the categories, the mean age differs. In many cases, physicists were relatively young when they achieved success, whereas economists tended to win the prize at a slightly older age. For women, the wider scatter in the data could be explained by the smaller sample size.
# 'age_get_prize' against 'year': points coloured by gender plus a black loess curve.
(
    ggplot(df, aes(x='year', y='age_get_prize'))
    + geom_point(aes(paint_a='gender'), color_by='paint_a')
    + geom_smooth(method='loess', color='black')
    + scale_x_continuous(breaks=decades, labels=list(map(str, decades)))
    + discrete_color_scale()
    + ggtitle('Distribution of Ages by Years')
)
Here we also see that the mean age when Nobel laureates receive their prize is rising over time.
# Same points and loess curve as the previous chart, with one facet row per category.
(
    ggplot(df, aes(x='year', y='age_get_prize'))
    + geom_point(aes(paint_a='gender'), color_by='paint_a')
    + geom_smooth(method='loess', color='black')
    + scale_x_continuous(breaks=decades, labels=list(map(str, decades)))
    + facet_grid(y='category')
    + discrete_color_scale()
    + ggtitle('Category Wise Distribution of Ages by Years')
)
Taking categories into account, we realize that the mean age doesn't always increase. It could even decrease in the case of the peace prize. For literature or economics, things don't change.
N = 20
# Sort once, oldest first, and reuse the result for both charts. Rows with no prize
# age are dropped first: sort_values places NaN last, so they would otherwise be
# picked up by the tail slice as the "youngest" laureates.
by_prize_age = df.dropna(subset=['age_get_prize']).sort_values(by='age_get_prize', ascending=False)

p1 = (
    ggplot(by_prize_age[:N])
    + geom_bar(aes(x='fullname', y='age_get_prize', paint_a='gender'), fill_by='paint_a', stat='identity')
    + discrete_color_scale()
    + ggtitle('Top {0} Oldest Nobel Prize Laureates'.format(N))
)
p2 = (
    ggplot(by_prize_age[-N:])
    + geom_bar(aes(x='fullname', y='age_get_prize', paint_a='gender'), fill_by='paint_a', stat='identity')
    + discrete_color_scale()
    + ggtitle('Top {0} Youngest Nobel Prize Laureates'.format(N))
)

# Place the two charts side by side.
bunch = GGBunch()
bunch.add_plot(p1, 0, 0, 400, 300)
bunch.add_plot(p2, 400, 0, 400, 300)
bunch.show()
Finally we take a look at the oldest and youngest people who got the prize.
# Full names that occur in more than one row of df, in value_counts order.
fullname_counts = df.fullname.value_counts()
multiple_laureates = fullname_counts[fullname_counts > 1].index.tolist()

(
    ggplot(df[df.fullname.isin(multiple_laureates)])
    + geom_point(
        aes(x='year', y='fullname', paint_a='category',
            shape='gender', size='age_get_prize'),
        color_by='paint_a',
        fill_by='paint_a',
        alpha=.5,
        tooltips=(
            layer_tooltips()
            .line('@fullname')
            .line('year get prize|@year')
            .line('prize category|@category')
            .line('prize share|1/@prize_share')
            .line('university|@name_of_university')
            .line('@|@gender')
            .line('prize winning age|@age_get_prize')
            .line('age at death|@age')
            .line('born country|@born_country')
            .line('died country|@died_country')
        ),
    )
    + scale_x_continuous(breaks=decades, labels=[str(d) for d in decades])
    + discrete_color_scale()
    + scale_shape_manual(values=[24, 25])
    + scale_size(name='prize winning age', range=[4, 8])
    + ggsize(600, 400)
    + ggtitle('Laureates Who Won Nobel Prize More Than Once')
    + theme(legend_position='bottom', axis_title_y='blank', axis_tooltip='blank')
)
To date, four people have received the prize more than once. One of them is a woman, and she is also the only one who moved to a different country. Two of them won their prizes in two different categories.
The first case was in 1903 and the last one in 1980.
Look at the graph and you will find out even more fascinating details about these people.