import numpy as np
import pandas as pd
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as pat
from matplotlib import cm
# TOTAL IMMIGRATION DATA
# https://ec.europa.eu/eurostat/cache/metadata/en/migr_immi_esms.htm
df_imm_total = pd.read_excel('migr_imm2ctz.xlsx', sheet_name='total')
df_imm_total
df_imm_total.info()
df_imm_total.keys()[0]
# Set index
df_imm_total.set_index('GEO/TIME', inplace=True)
# Replace colons with missing value NaNs and set datatype
df_imm_total = df_imm_total.apply(pd.to_numeric, errors='coerce')
# Drop rows with no data
df_imm_total.dropna(how='all', inplace=True)
# Explore remaining missing values
df_imm_total[df_imm_total.isnull().sum(axis=1) > 0]
# Backfill missing values
df_imm_total.bfill(axis=1, inplace=True)
# Rename Germany
df_imm_total.index = [s.replace('Germany (until 1990 former territory of the FRG)', 'Germany') for s in df_imm_total.index]
# Strip column names
df_imm_total.columns = [col.strip() for col in df_imm_total.columns]
df_imm_total.head()
# Eurostat dataset cleaning function
countries = df_imm_total.index.tolist()
def clean_eurostat_excel(file_name, sheet):
data = pd.read_excel(file_name, sheet_name = sheet, index_col=0)
# rename Germany
data.index = [str(s).replace('Germany (until 1990 former territory of the FRG)', 'Germany') for s in data.index]
# replace ':' with NaNs, set datatype, and impute missing data with backfill/frontfill
data = data.apply(pd.to_numeric, errors='coerce').bfill(axis=1).ffill(axis=1)
# drop rows with no data
data.dropna(how='all', inplace=True)
# strip column names of extraneous spaces
data.columns = [col.strip() for col in data.columns]
# reduce rows to country list
data = data.loc[countries]
return data;
df_imm_total = clean_eurostat_excel('migr_imm2ctz.xlsx', 'total')
df_imm_total.head()
# TOTAL EMIGRATION DATA
# https://ec.europa.eu/eurostat/cache/metadata/en/migr_immi_esms.htm
df_emi_total = clean_eurostat_excel('migr_emi1ctz.xlsx', 'total')
df_emi_total.head()
# Pivot transformation to long-form for visual analysis
def df_to_longform(df, data_col_name):
df = pd.melt(df.reset_index(), id_vars='index')
df.columns=('country','year', data_col_name)
df.sort_values(by=['country','year'], inplace=True)
df['year'] = df['year'].astype(str)
return df;
dfg_imm_total = df_to_longform(df_imm_total, 'immigrants')
dfg_imm_total.head()
dfg_imm_total.info()
# Dataframe to long-form for graphing
dfg_emi_total = df_to_longform(df_emi_total, 'emigrants')
dfg_emi_total.head()
# Combining long-form dataframes to graph
dfg = pd.merge(dfg_imm_total, dfg_emi_total, on=['country', 'year'])
dfg.head()
# Style
sns.set_style('whitegrid')
# Pairplot by country
pal = sns.cubehelix_palette(10, start=0.3, rot=-0.8)
g = sns.pairplot(dfg, vars=dfg.iloc[:,2:4],
height=5, palette=pal, kind='reg', hue='country')
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Immigration & Emigration by Country', size=16, weight='demi');
# Pairplot by years
pal = sns.cubehelix_palette(10, start=0.3, rot=-0.8)
g = sns.pairplot(dfg, vars=dfg.iloc[:,2:4],
height=5, palette=pal, kind='reg', hue='year')
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Immigration & Emigration by Year', size=16, weight='demi');
# Initialize FacetGrid object
g = sns.FacetGrid(dfg_imm_total, col='country', col_wrap=4,
height=2,aspect=2).set(xticks=np.arange(0,10,3))
# Create plot
g.map(plt.plot, 'year', 'immigrants', color='darkorange')
g.map(plt.fill_between, 'year', 'immigrants', color='darkorange', alpha=0.5)
g.map(plt.axhline, y=0, lw=2, color='darkorange')
# Facet titles
for ax in g.axes:
g.set_titles("{col_name}")
# Formatting axes
g.set(yticks=[])
g.despine(bottom=True, left=True)
# Legend
color_key = {'Immigrants': 'darkorange'}
patches = [pat.Patch(color=v, label=k) for k,v in color_key.items()]
g.fig.legend(handles=patches, bbox_to_anchor =
[0.49, 0.93]).get_frame().set_edgecolor('1.0')
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Total Immigration', size=16, weight='demi');
# FIRST GENERATION IMMIGRANTS REASONS DATA 2014
# https://ec.europa.eu/eurostat/cache/metadata/en/lfso_14_esms.htm
df_imm_fgen_bythous = clean_eurostat_excel('lfso_14b1dr.xlsx', 'Reason first gen imm 2014')
df_imm_fgen_bythous[df_imm_fgen_bythous.isnull().sum(axis=1) > 0]
# Drop countries with no data
df_imm_fgen_bythous.dropna(how='all', inplace=True)
# Order dataframe by total first generation immigrants
df_imm_fgen_bythous.sort_values(by=['Total'], inplace=True, ascending=False)
fgen_order = df_imm_fgen_bythous.index
# Drop total - only graphing reasons
df_imm_fgen_bythous.drop('Total', axis=1, inplace=True)
# Dataframe to long-form for graphing
dfg_imm_fgen_bythous = df_to_longform(df_imm_fgen_bythous, 'x')
dfg_imm_fgen_bythous.columns = ['country', 'Reason', 'thousands']
# Set plots and style
sns.set(rc={'figure.figsize':(15,30)}, style='whitegrid')
# Plot
ax = sns.barplot(data=dfg_imm_fgen_bythous, x='thousands', y='country', hue='Reason',
palette=sns.color_palette('deep', 7), order=fgen_order,
hue_order=['Family reasons', 'Work, no job found before migrating',
'Work, job found before migrating',
'International protection or asylum', 'Education reasons',
'Other', 'No response']) #
# Legend
plt.legend(loc='center right')
# Title
ax.set_title('First Generation Immigrants: Reasons for Migration (2014)', fontsize='large', fontweight='demi', y=1.02)
plt.figtext(0.5,0.893, 'Countries in descending order of total immigrants in 2014', ha="center", va="top", fontsize=12, color='grey');
# Create boolean criteria for subplots
df_imm_fgen_bythous['Fam500k+'] = df_imm_fgen_bythous['Family reasons'] > 500
df_imm_fgen_bythous['Fam100k+'] = df_imm_fgen_bythous['Family reasons'] > 100
df_imm_fgen_bythous.head()
# Creating sub-dataframes for zoomed-in subplots
df_imm_fgen_bythous_lar = df_imm_fgen_bythous.loc[df_imm_fgen_bythous['Fam500k+'] == True].drop(
['Fam500k+', 'Fam100k+'], axis=1)
df_imm_fgen_bythous_med = df_imm_fgen_bythous.loc[(df_imm_fgen_bythous['Fam500k+'] == False) & (
df_imm_fgen_bythous['Fam100k+'] == True)].drop(
['Fam500k+', 'Fam100k+'], axis=1)
df_imm_fgen_bythous_sma = df_imm_fgen_bythous.loc[df_imm_fgen_bythous['Fam100k+'] == False].drop(
['Fam500k+', 'Fam100k+'], axis=1)
df_imm_fgen_bythous_lar.head()
# Data to long-form for graphing
dfg_imm_fgen_bythous_lar = df_to_longform(df_imm_fgen_bythous_lar, 'x')
dfg_imm_fgen_bythous_med = df_to_longform(df_imm_fgen_bythous_med, 'x')
dfg_imm_fgen_bythous_sma = df_to_longform(df_imm_fgen_bythous_sma, 'x')
dfg_imm_fgen_bythous_lar.columns = ['country', 'Reason', 'thousands']
dfg_imm_fgen_bythous_med.columns = ['country', 'Reason', 'thousands']
dfg_imm_fgen_bythous_sma.columns = ['country', 'Reason', 'thousands']
dfg_imm_fgen_bythous_lar.head()
# Sort for graphing order
dfg_imm_fgen_bythous_lar.sort_values(by=['thousands'], ascending=False, inplace=True)
dfg_imm_fgen_bythous_med.sort_values(by=['thousands'], ascending=False, inplace=True)
dfg_imm_fgen_bythous_sma.sort_values(by=['thousands'], ascending=False, inplace=True)
# Set plots and style
fig, axs = plt.subplots(3, figsize=(10, 20), gridspec_kw={'height_ratios': [3, 2, 4]})
# Set style
sns.set_style('whitegrid')
# First subplot (> 500k family reasons)
p0 = sns.barplot(data=dfg_imm_fgen_bythous_lar, x='thousands', y='country', hue='Reason',
palette=sns.color_palette('deep', 7),
hue_order=['Family reasons', 'Work, no job found before migrating',
'Work, job found before migrating',
'International protection or asylum', 'Education reasons',
'Other', 'No response'], ax=axs[0])
# Second subplot (200k > 500k family reasons)
p1 = sns.barplot(data=dfg_imm_fgen_bythous_med, x='thousands', y='country', hue='Reason',
palette=sns.color_palette('deep', 7),
hue_order=['Family reasons', 'Work, no job found before migrating',
'Work, job found before migrating',
'International protection or asylum', 'Education reasons',
'Other', 'No response'], ax=axs[1])
# Third subplot (< 200k family reasons)
p2 = sns.barplot(data=dfg_imm_fgen_bythous_sma, x='thousands', y='country', hue='Reason',
palette=sns.color_palette('deep', 7),
hue_order=['Family reasons', 'Work, no job found before migrating',
'Work, job found before migrating',
'International protection or asylum', 'Education reasons',
'Other', 'No response'], ax=axs[2])
# Formatting
fig.tight_layout()
axs[0].set(xlabel='', ylabel='')
axs[1].set(xlabel='', ylabel='')
axs[2].set(xlabel='First generation immigrants', ylabel='')
# Legend
handles, labels = axs[2].get_legend_handles_labels()
p0.legend(loc='lower right')
p1.legend(handles[:0], labels[:0])
p2.legend(handles[:0], labels[:0])
# Title
plt.subplots_adjust(top=0.98)
fig.suptitle('First Generation Immigrants: Reasons for Migration (2014)', fontsize='large', fontweight='demi', y=1.01);
dfi_imm_fgen_bythous = df_imm_fgen_bythous.reset_index()
dfi_imm_fgen_bythous.head()
# Set up PairGrid
g = sns.PairGrid(data=dfi_imm_fgen_bythous,
x_vars=dfi_imm_fgen_bythous.columns[1:6], y_vars=['index'],
height=10, aspect=0.3)
# Create stripplot
g.map(sns.stripplot, size=10, orient='h', palette=sns.cubehelix_palette(32, start=0.5, rot=-0.8, reverse=True),
linewidth=1, edgecolor='w')
# Set x-axis limits on all columns
g.set(xlim=(-150, 4000), xlabel="Immigrants in thousands", ylabel="")
# Column titles
titles = ['Family', 'Education', 'Work (job prev.)', 'Work (no job prev.)', 'Asylum']
for ax, title in zip(g.axes.flat, titles):
# Set a different title for each axes
ax.set(title=title)
# Make the grid horizontal instead of vertical
ax.xaxis.grid(False)
ax.yaxis.grid(True)
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('First Generation Immigrants: Reasons for Migration (2014)', size=16, weight='demi')
plt.figtext(0.5,0.95, 'Countries in descending order of total immigrants in 2014', ha="center", va="top", fontsize=14, color='grey')
sns.despine(left=True, bottom=True);
# Initialize FacetGrid object
g = sns.FacetGrid(dfg, col='country', col_wrap=4,
height=2,aspect=2).set(xticks=np.arange(0,10,3))
# Create immigration plot - set for immigrants
g.map(plt.plot, 'year', 'immigrants', color='darkorange')
g.map(plt.fill_between, 'year', 'immigrants', color='darkorange', alpha=0.5)
g.map(plt.plot, 'year', 'emigrants', color='darkgrey')
g.map(plt.fill_between, 'year', 'emigrants', color='darkgrey', alpha=0.5)
# Facet titles
for ax in g.axes:
g.set_titles("{col_name}")
g.set_axis_labels(y_var= '')
# Formatting axes
g.set(yticks=[])
g.despine(left=True)
# Legend
color_key = {'Immigrants': 'darkorange',
'Emigrants' : 'darkgrey'}
patches = [pat.Patch(color=v, label=k) for k,v in color_key.items()]
g.fig.legend(handles=patches, bbox_to_anchor =
[0.525, 0.93], ncol=2).get_frame().set_edgecolor('1.0')
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Total Immigration & Emigration', size=16, weight='demi');
# REPORTING COUNTRY EMIGRATION DATA
# https://ec.europa.eu/eurostat/cache/metadata/en/migr_immi_esms.htm
df_emi_reporting = clean_eurostat_excel('migr_emi1ctz.xlsx', 'reporting_country')
# Pivot transformation to long-form for visual analysis
dfg_emi_reporting = df_to_longform(df_emi_reporting, 'reporting_country_emi')
# Merge for graphing
dfg = pd.merge(dfg, dfg_emi_reporting, on=['country', 'year'])
dfg.isnull().sum()
# Initialize FacetGrid object
g = sns.FacetGrid(dfg, col='country', col_wrap=4,
height=2,aspect=2).set(xticks=np.arange(0,10,3))
# Create immigration plot - set for immigrants
g.map(plt.plot, 'year', 'emigrants', color='darkgrey')
g.map(plt.fill_between, 'year', 'emigrants', color='darkgrey', alpha=0.5)
g.map(plt.plot, 'year', 'reporting_country_emi', color='cadetblue')
g.map(plt.fill_between, 'year', 'reporting_country_emi', color='cadetblue', alpha=0.5)
# Facet titles
for ax in g.axes:
g.set_titles("{col_name}")
g.set_axis_labels(y_var= '')
# Formatting axes
g.set(yticks=[])
g.despine(left=True)
# Legend
color_key = {'Total emigrants': 'darkgrey',
'Reporting country emigrants' : 'cadetblue'}
patches = [pat.Patch(color=v, label=k) for k,v in color_key.items()]
g.fig.legend(handles=patches, bbox_to_anchor =
[0.58, 0.93], ncol=2).get_frame().set_edgecolor('1.0')
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Foreign vs. Reporting Country Emigration', size=16, weight='demi');
# WORLD POPULATION DATA (IN THOUSANDS)
# https://population.un.org/wpp/Download/Standard/Population/
df_pop = pd.read_excel('world_population.xlsx')
df_pop
# Select subset of dataframe
df_pop = df_pop.iloc[15:, 2:]
df_pop.head()
# Create new column header
df_pop.columns = df_pop.iloc[0]
df_pop = df_pop[1:]
df_pop.head()
# Cleaning up index
df_pop.set_index('Region, subregion, country or area *',inplace=True)
df_pop.drop(['Notes','Country code','Type','Parent code'],axis=1, inplace=True)
df_pop.head()
# Get rid of 14 in corner and index name
del df_pop.columns.name
del df_pop.index.name
df_pop.head()
# Reduce dataset to select countries and years
df_pop = df_pop.loc[countries].astype('int')
df_pop = df_pop.loc[:, '2008': '2017']
# Replace ':' with NaNs and set datatype
df_pop = df_pop.replace(':', np.nan).bfill(axis=1).ffill(axis=1)
df_pop.head()
# Push index into column for graphing
dfi_pop = df_pop.reset_index()
# Drop columns for graphing
dfi_pop = dfi_pop.drop(['2009','2010','2012','2013', '2015', '2016'],axis=1)
dfi_pop.head()
# Set up PairGrid
g = sns.PairGrid(data=dfi_pop.sort_values('2008', ascending=False),
x_vars=dfi_pop.columns[1:], y_vars=['index'],
height=10, aspect=0.3)
# Create stripplot
g.map(sns.stripplot, size=10, orient='h', palette=sns.cubehelix_palette(32, start=0.5, rot=-0.8, reverse=True),
linewidth=1, edgecolor='w')
# Set x-axis limits on all columns
g.set(xlim=(-10000, 100000), xlabel="Population in thousands", ylabel="")
# Column titles
titles = ['2008', '2011', '2014', '2017']
for ax, title in zip(g.axes.flat, titles):
# Set a different title for each axes
ax.set(title=title)
# Make the grid horizontal instead of vertical
ax.xaxis.grid(False)
ax.yaxis.grid(True)
# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Population by Country', size=16, weight='demi')
sns.despine(left=True, bottom=True);
# Pivot data to long-form for visual analysis
dfg_pop = df_to_longform(df_pop, 'pop_in_thous')
dfg = pd.merge(dfg, dfg_pop, on=['country', 'year'])
dfg.info()
# Create immigrants/emigrants per capita data
dfg['immigrants_per_capita'] = dfg['immigrants']/dfg['pop_in_thous']
dfg['emigrants_per_capita'] = dfg['emigrants']/dfg['pop_in_thous']
dfg.head()