In this project we will cover:
For the sake of simplicity we will use a simple dataset: the Canadian immigration dataset, which reports immigration data from 1980 to 2013.
# import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches # needed for waffle Charts
from PIL import Image # converting images into arrays
# use the inline backend to generate the plots within the browser
%matplotlib inline
# ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the 'Canada by Citizenship' sheet of the remote workbook, skipping the
# first 20 rows and the last 2 rows of the sheet.
df = pd.read_excel(
    'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/'
    'CognitiveClass/DV0101EN/labs/Data_Files/Canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)
# quick look at the first rows
df.head()
Type | Coverage | OdName | AREA | AreaName | REG | RegName | DEV | DevName | 1980 | ... | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Immigrants | Foreigners | Afghanistan | 935 | Asia | 5501 | Southern Asia | 902 | Developing regions | 16 | ... | 2978 | 3436 | 3009 | 2652 | 2111 | 1746 | 1758 | 2203 | 2635 | 2004 |
1 | Immigrants | Foreigners | Albania | 908 | Europe | 925 | Southern Europe | 901 | Developed regions | 1 | ... | 1450 | 1223 | 856 | 702 | 560 | 716 | 561 | 539 | 620 | 603 |
2 | Immigrants | Foreigners | Algeria | 903 | Africa | 912 | Northern Africa | 902 | Developing regions | 80 | ... | 3616 | 3626 | 4807 | 3623 | 4005 | 5393 | 4752 | 4325 | 3774 | 4331 |
3 | Immigrants | Foreigners | American Samoa | 909 | Oceania | 957 | Polynesia | 902 | Developing regions | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | Immigrants | Foreigners | Andorra | 908 | Europe | 925 | Southern Europe | 901 | Developed regions | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5 rows × 43 columns
# Drop the code/identifier columns that carry no information for the plots.
df.drop(columns=['AREA', 'REG', 'DEV', 'Type', 'Coverage'], inplace=True)
# sanity check: these five columns should be gone
df.head()
OdName | AreaName | RegName | DevName | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | ... | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | Asia | Southern Asia | Developing regions | 16 | 39 | 39 | 47 | 71 | 340 | ... | 2978 | 3436 | 3009 | 2652 | 2111 | 1746 | 1758 | 2203 | 2635 | 2004 |
1 | Albania | Europe | Southern Europe | Developed regions | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1450 | 1223 | 856 | 702 | 560 | 716 | 561 | 539 | 620 | 603 |
2 | Algeria | Africa | Northern Africa | Developing regions | 80 | 67 | 71 | 69 | 63 | 44 | ... | 3616 | 3626 | 4807 | 3623 | 4005 | 5393 | 4752 | 4325 | 3774 | 4331 |
3 | American Samoa | Oceania | Polynesia | Developing regions | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | Andorra | Europe | Southern Europe | Developed regions | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5 rows × 38 columns
# Give the descriptive columns friendlier names.
df.rename(
    columns={
        'OdName': 'Country',
        'AreaName': 'Continent',
        'RegName': 'Region',
    },
    inplace=True,
)
# sanity check: the new column names should show up
df.head()
Country | Continent | Region | DevName | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | ... | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | Asia | Southern Asia | Developing regions | 16 | 39 | 39 | 47 | 71 | 340 | ... | 2978 | 3436 | 3009 | 2652 | 2111 | 1746 | 1758 | 2203 | 2635 | 2004 |
1 | Albania | Europe | Southern Europe | Developed regions | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1450 | 1223 | 856 | 702 | 560 | 716 | 561 | 539 | 620 | 603 |
2 | Algeria | Africa | Northern Africa | Developing regions | 80 | 67 | 71 | 69 | 63 | 44 | ... | 3616 | 3626 | 4807 | 3623 | 4005 | 5393 | 4752 | 4325 | 3774 | 4331 |
3 | American Samoa | Oceania | Polynesia | Developing regions | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | Andorra | Europe | Southern Europe | Developed regions | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
5 rows × 38 columns
# Index the frame by country so single countries can be selected with .loc
df.set_index('Country', inplace=True)
# Add a per-country total over all years. numeric_only=True restricts the sum
# to the numeric (year) columns; without it recent pandas versions try to add
# the text columns (Continent, Region, DevName) as well and raise a TypeError.
df['Total'] = df.sum(axis=1, numeric_only=True)
# check
df.head()
Continent | Region | DevName | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | 1986 | ... | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country | |||||||||||||||||||||
Afghanistan | Asia | Southern Asia | Developing regions | 16 | 39 | 39 | 47 | 71 | 340 | 496 | ... | 3436 | 3009 | 2652 | 2111 | 1746 | 1758 | 2203 | 2635 | 2004 | 58639 |
Albania | Europe | Southern Europe | Developed regions | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 1223 | 856 | 702 | 560 | 716 | 561 | 539 | 620 | 603 | 15699 |
Algeria | Africa | Northern Africa | Developing regions | 80 | 67 | 71 | 69 | 63 | 44 | 69 | ... | 3626 | 4807 | 3623 | 4005 | 5393 | 4752 | 4325 | 3774 | 4331 | 69439 |
American Samoa | Oceania | Polynesia | Developing regions | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
Andorra | Europe | Southern Europe | Developed regions | 0 | 0 | 0 | 0 | 0 | 0 | 2 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 15 |
5 rows × 38 columns
# Order the countries from largest to smallest total, in place, so that the
# first five rows are the top 5 countries.
df.sort_values(by='Total', ascending=False, inplace=True)
df.head()
Continent | Region | DevName | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | 1986 | ... | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country | |||||||||||||||||||||
India | Asia | Southern Asia | Developing regions | 8880 | 8670 | 8147 | 7338 | 5704 | 4211 | 7150 | ... | 36210 | 33848 | 28742 | 28261 | 29456 | 34235 | 27509 | 30933 | 33087 | 691904 |
China | Asia | Eastern Asia | Developing regions | 5123 | 6682 | 3308 | 1863 | 1527 | 1816 | 1960 | ... | 42584 | 33518 | 27642 | 30037 | 29622 | 30391 | 28502 | 33024 | 34129 | 659962 |
United Kingdom of Great Britain and Northern Ireland | Europe | Northern Europe | Developed regions | 22045 | 24796 | 20620 | 10015 | 10170 | 9564 | 9470 | ... | 7258 | 7140 | 8216 | 8979 | 8876 | 8724 | 6204 | 6195 | 5827 | 551500 |
Philippines | Asia | South-Eastern Asia | Developing regions | 6051 | 5921 | 5249 | 4562 | 3801 | 3150 | 4166 | ... | 18139 | 18400 | 19837 | 24887 | 28573 | 38617 | 36765 | 34315 | 29544 | 511391 |
Pakistan | Asia | Southern Asia | Developing regions | 978 | 972 | 1201 | 900 | 668 | 514 | 691 | ... | 14314 | 13127 | 10124 | 8994 | 7217 | 6811 | 7468 | 11227 | 12603 | 241600 |
5 rows × 38 columns
# The year columns are labelled with integers, so build the matching list of
# labels once: 1980 through 2013 (range() excludes its stop value).
years = list(range(1980, 2013 + 1))
# Keep only the year columns of the first five rows and flip the result so
# that years become the index and countries the columns.
df_top5 = df.head(5).loc[:, years].T
# check
df_top5.head()
Country | India | China | United Kingdom of Great Britain and Northern Ireland | Philippines | Pakistan |
---|---|---|---|---|---|
1980 | 8880 | 5123 | 22045 | 6051 | 978 |
1981 | 8670 | 6682 | 24796 | 5921 | 972 |
1982 | 8147 | 3308 | 20620 | 5249 | 1201 |
1983 | 7338 | 1863 | 10015 | 4562 | 900 |
1984 | 5704 | 1527 | 10170 | 3801 | 668 |
# Make sure the year index holds plain integers before plotting.
df_top5.index = df_top5.index.map(int)
# Unstacked, semi-transparent area chart; figsize is a (width, height) tuple.
df_top5.plot.area(stacked=False, alpha=0.25, figsize=(20, 10))
plt.title('Immigration Trend of Top 5 Countries')
plt.xlabel('Years')
plt.ylabel('Number of Immigrants')
plt.show()
# Distribution of the 1999 immigration counts across countries, in 25 bins.
df[1999].plot(kind='hist', bins=25, figsize=(8, 5))
# The plotted column is 1999, so the title has to name 1999 as well
# (it previously claimed 2013).
plt.title('Histogram of Immigration from 195 Countries in 1999')  # add a title to the histogram
plt.ylabel('Number of Countries')  # add y-label
plt.xlabel('Number of Immigrants')  # add x-label
plt.show()
# Iceland's row, restricted to the year labels: a Series indexed by year.
df_iceland = df.loc['Iceland'].loc[years]
df_iceland.head(5)
1980 17 1981 33 1982 10 1983 9 1984 13 Name: Iceland, dtype: object
# One bar per year for Iceland.
df_iceland.plot.bar(figsize=(10, 6))
plt.title('Icelandic immigrants to Canada from 1980 to 2013')  # title of the plot
plt.xlabel('Year')  # x-axis label
plt.ylabel('Number of immigrants')  # y-axis label
plt.show()
# Group by continent and sum all immigrants.
# numeric_only=True keeps the aggregation to the year/Total columns; without
# it recent pandas versions also "sum" (concatenate) the text columns Region
# and DevName. The deprecated axis=0 argument of groupby() is dropped since
# grouping rows is the default.
df_cont = df.groupby('Continent').sum(numeric_only=True)
# check
df_cont.head()
1980 | 1981 | 1982 | 1983 | 1984 | 1985 | 1986 | 1987 | 1988 | 1989 | ... | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Continent | |||||||||||||||||||||
Africa | 3951 | 4363 | 3819 | 2671 | 2639 | 2650 | 3782 | 7494 | 7552 | 9894 | ... | 27523 | 29188 | 28284 | 29890 | 34534 | 40892 | 35441 | 38083 | 38543 | 618948 |
Asia | 31025 | 34314 | 30214 | 24696 | 27274 | 23850 | 28739 | 43203 | 47454 | 60256 | ... | 159253 | 149054 | 133459 | 139894 | 141434 | 163845 | 146894 | 152218 | 155075 | 3317794 |
Europe | 39760 | 44802 | 42720 | 24638 | 22287 | 20844 | 24370 | 46698 | 54726 | 60893 | ... | 35955 | 33053 | 33495 | 34692 | 35078 | 33425 | 26778 | 29177 | 28691 | 1410947 |
Latin America and the Caribbean | 13081 | 15215 | 16769 | 15427 | 13678 | 15171 | 21179 | 28471 | 21924 | 25060 | ... | 24747 | 24676 | 26011 | 26547 | 26867 | 28818 | 27856 | 27173 | 24950 | 765148 |
Northern America | 9378 | 10030 | 9074 | 7100 | 6661 | 6543 | 7074 | 7705 | 6469 | 6790 | ... | 8394 | 9613 | 9463 | 10190 | 8995 | 8142 | 7677 | 7892 | 8503 | 241142 |
5 rows × 35 columns
# Pie chart of the continent totals: one wedge per continent.
df_cont['Total'].plot.pie(figsize=(10, 10))
plt.title('Immigration to Canada by Continet (1980-2013)')
plt.show()
# Select Japan as a one-row DataFrame (list selector keeps it 2-D), limited to
# the year columns, then flip it so the years form the index.
df_japan = df.loc[['Japan'], years].T
# check
df_japan.head()
Country | Japan |
---|---|
1980 | 701 |
1981 | 756 |
1982 | 598 |
1983 | 309 |
1984 | 246 |
# Box plot of Japan's yearly immigration counts.
df_japan.plot.box()
plt.title('Box Plot of Japanese Imigration from 1980 to 2013')
plt.ylabel('Number of Immigrants')
plt.show()
# Total immigration per year over all countries, reshaped into a two-column
# frame ('years', 'Total') that can be fed to a scatter plot.
df_total = (
    df[years]
    .sum()                      # Series: one total per year label
    .to_frame()                 # single column named 0
    .reset_index()              # year labels move into an 'index' column
    .rename(columns={'index': 'years', 0: 'Total'})
)
# check
df_total.head()
years | Total | |
---|---|---|
0 | 1980 | 99137 |
1 | 1981 | 110563 |
2 | 1982 | 104271 |
3 | 1983 | 75550 |
4 | 1984 | 73417 |
# Scatter of yearly totals: year on the x-axis, total on the y-axis.
df_total.plot.scatter(x='years', y='Total')
plt.title(' Total Immigrant population to Canada from 1980 to 2013')
plt.ylabel('Number of Immigrants')
plt.xlabel('Year')
plt.show()
A Waffle Chart can be used to easily visualize data in relation to a whole, or to highlight progress against a given threshold.
# Subset used for the waffle chart: the rows for three Scandinavian countries,
# all columns kept.
df_sub = df.loc[['Denmark', 'Norway', 'Sweden']]
df_sub
Continent | Region | DevName | 1980 | 1981 | 1982 | 1983 | 1984 | 1985 | 1986 | ... | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | Total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Country | |||||||||||||||||||||
Denmark | Europe | Northern Europe | Developed regions | 272 | 293 | 299 | 106 | 93 | 73 | 93 | ... | 62 | 101 | 97 | 108 | 81 | 92 | 93 | 94 | 81 | 3901 |
Norway | Europe | Northern Europe | Developed regions | 116 | 77 | 106 | 51 | 31 | 54 | 56 | ... | 57 | 53 | 73 | 66 | 75 | 46 | 49 | 53 | 59 | 2327 |
Sweden | Europe | Northern Europe | Developed regions | 281 | 308 | 222 | 176 | 128 | 158 | 187 | ... | 205 | 139 | 193 | 165 | 167 | 159 | 134 | 140 | 140 | 5866 |
3 rows × 38 columns
# Step 1: work out each category's share of the overall total
# overall total across the selected countries
tot_val = sum(df_sub['Total'])
# share of every category, as a fraction of the total
cat_pro = [float(value) / tot_val for value in df_sub['Total']]
# print the shares, rounded to three decimals
for i, pro in enumerate(cat_pro):
    print(f"{df_sub.index.values[i]}: {round(pro, 3)}")
Denmark: 0.323 Norway: 0.192 Sweden: 0.485
# Step 2: fix the overall size of the waffle chart (in tiles)
width, height = 40, 10
# the grid holds width * height tiles in total
tot_tiles = width * height
# Step 3: turn each category's share into a whole number of tiles
tiles_per_cat = [round(pro * tot_tiles) for pro in cat_pro]
# print the tile count of every category
for i, tiles in enumerate(tiles_per_cat):
    print(f"{df_sub.index.values[i]}: {tiles}")
Denmark: 129 Norway: 77 Sweden: 194
# Step 4: build the matrix behind the waffle chart and fill it in
# start from an all-zero (height x width) matrix
waffle_chart = np.zeros((height, width))
# running category number and running tile count
cat_index = 0
tile_index = 0
# walk the grid column by column, top to bottom within a column
for col, row in np.ndindex(width, height):
    tile_index += 1
    # tiles_per_cat[:cat_index] covers the categories reached so far; once
    # the running tile count exceeds their combined allocation, move on
    if sum(tiles_per_cat[:cat_index]) < tile_index:
        cat_index += 1
    # store the category number (1, 2, ...) in the tile
    waffle_chart[row, col] = cat_index
# Step 5: render the matrix as an image
colormap = plt.cm.coolwarm
fig = plt.figure()
# matshow draws the matrix as coloured tiles; the colorbar shows which colour
# belongs to which category number
plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()
plt.show()
<Figure size 432x288 with 0 Axes>
# Step 6: polish the waffle chart
colormap = plt.cm.coolwarm
# start a new figure object
fig = plt.figure()
# draw the matrix as coloured tiles
plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()
# grab the axes that matshow drew on
ax = plt.gca()
# put minor ticks on every tile boundary (-0.5, 0.5, 1.5, ...)
ax.set_xticks(np.arange(width + 1) - 0.5, minor=True)
ax.set_yticks(np.arange(height + 1) - 0.5, minor=True)
# white gridlines on the minor ticks separate the tiles
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
# remove unnecessary text: no major ticks or tick labels
ax.set_xticks([])
ax.set_yticks([])
plt.show()
<Figure size 432x288 with 0 Axes>
# Step 7: create a legend for the chart
# instantiate a new figure object
fig = plt.figure()
# use matshow to display the waffle chart; keep the returned image so the
# legend can ask it for the exact colour of every category
colormap = plt.cm.coolwarm
waffle_image = plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()
# get the axis
ax = plt.gca()
# set minor ticks on the tile boundaries
ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
ax.set_yticks(np.arange(-.5, (height), 1), minor=True)
# add gridlines based on minor ticks
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
plt.xticks([])
plt.yticks([])
# create legend
# Tiles store the category number 1, 2, ... and the image maps those numbers
# to colours through its own normalisation, so the legend colour is looked up
# with the same number. (Colouring by cumulative share, as before, gave the
# first category a different shade than its tiles.) Iterating with zip()
# also avoids integer keys on the country-indexed 'Total' Series.
legend_handles = []
for i, (category, total) in enumerate(zip(df_sub.index.values, df_sub['Total'])):
    label_str = category + ' (' + str(total) + ')'
    color_val = waffle_image.to_rgba(i + 1)
    legend_handles.append(mpatches.Patch(color=color_val, label=label_str))
# add legend to chart
plt.legend(handles=legend_handles,
           loc='lower center',
           ncol=len(df_sub.index.values),
           bbox_to_anchor=(0., -0.2, 0.95, .1)
           )
plt.show()
<Figure size 432x288 with 0 Axes>
# Step 8: pack everything into a function
def create_waffle_chart(dataset, categories, values, height, width, colormap, value_sign=''):
    """Draw a waffle chart (grid of coloured tiles) with a matching legend.

    Every category gets a number of tiles proportional to its share of the
    sum of ``values``. Tiles are laid out column by column, and the tile count
    of each category is printed before the chart is drawn.

    Args:
        dataset: Accepted for backward compatibility and not read. Category
            names are taken from ``categories`` so the printed tile counts
            and the legend always use the same names.
        categories: Category names (strings), one per entry of ``values``.
        values: Numbers, one per category; a list, NumPy array or pandas
            Series all work.
        height: Number of tile rows.
        width: Number of tile columns.
        colormap: Matplotlib colormap applied to both tiles and legend.
        value_sign: Optional unit shown with each value in the legend. ``'%'``
            is appended after the value; any other sign is put in front.

    Raises:
        ValueError: If ``categories`` and ``values`` differ in length or the
            values do not sum to a positive number.
    """
    # Plain lists make positional access independent of any pandas index
    # (integer keys on a label-indexed Series are deprecated).
    categories = list(categories)
    values = list(values)
    if len(categories) != len(values):
        raise ValueError('categories and values must have the same length')

    # compute the proportion of each category with respect to the total
    total_values = sum(values)
    if total_values <= 0:
        raise ValueError('values must sum to a positive number')
    category_proportions = [float(value) / total_values for value in values]

    # compute the total number of tiles
    total_num_tiles = width * height
    print('Total number of tiles is', total_num_tiles)

    # compute the number of tiles for each category
    tiles_per_category = [
        int(round(proportion * total_num_tiles))
        for proportion in category_proportions
    ]

    # print out number of tiles per category
    for category, tiles in zip(categories, tiles_per_category):
        print(f'{category}: {tiles}')

    # One entry per tile holding its category number (1, 2, ...).
    category_ids = np.repeat(np.arange(1, len(values) + 1), tiles_per_category)
    # Rounding can make the counts add up to slightly fewer or more tiles than
    # the grid holds: pad with the last category or cut off the surplus so no
    # tile ends up in a category that does not exist.
    shortfall = total_num_tiles - category_ids.size
    if shortfall > 0:
        category_ids = np.append(category_ids, np.full(shortfall, len(values)))
    category_ids = category_ids[:total_num_tiles]
    # order='F' fills the (height, width) grid column by column.
    waffle_chart = category_ids.astype(float).reshape((height, width), order='F')

    # NOTE(review): matshow() below opens a figure of its own, so this call
    # only yields an empty figure; kept so the notebook output is unchanged.
    plt.figure()

    # use matshow to display the waffle chart with the caller's colormap
    # (previously the argument was overwritten with plt.cm.coolwarm)
    waffle_image = plt.matshow(waffle_chart, cmap=colormap)
    plt.colorbar()

    # get the axis
    ax = plt.gca()

    # set minor ticks on the tile boundaries
    ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
    ax.set_yticks(np.arange(-.5, (height), 1), minor=True)

    # add gridlines based on minor ticks
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
    plt.xticks([])
    plt.yticks([])

    # create legend: look every colour up through the image itself so each
    # legend patch has exactly the colour of its category's tiles
    legend_handles = []
    for category_id, (category, value) in enumerate(zip(categories, values), start=1):
        if value_sign == '%':
            label_str = f'{category} ({value}{value_sign})'
        else:
            label_str = f'{category} ({value_sign}{value})'
        color_val = waffle_image.to_rgba(category_id)
        legend_handles.append(mpatches.Patch(color=color_val, label=label_str))

    # add legend to chart
    plt.legend(
        handles=legend_handles,
        loc='lower center',
        ncol=len(categories),
        bbox_to_anchor=(0., -0.2, 0.95, .1)
    )
# Inputs for the packaged waffle chart function.
width, height = 40, 10            # chart size in tiles (columns, rows)
categories = df_sub.index.values  # category names
values = df_sub['Total']          # value of each category, same order
colormap = plt.cm.coolwarm        # colormap to draw with
create_waffle_chart(
    dataset=df_sub,
    categories=categories,
    values=values,
    height=height,
    width=width,
    colormap=colormap,
)
Total number of tiles is 400 Denmark: 129 Norway: 77 Sweden: 194
<Figure size 432x288 with 0 Axes>