Before we begin, we will change a few settings to make the notebook look a bit prettier:
%%html
<style> body {font-family: "Calibri", cursive, sans-serif;} </style>
In this notebook, I will generate different visualizations of the Favorite Pokémon Survey results. The data was collected and made public by Reddit user mamamia1001. Full credit goes to them. I will also include a few comments on the results and how I interpret them. If you want a simpler version (i.e., only the plots, no code), you can find it here. For more info, take a look at the README file.
Alright, let's get started.
First, let's import all the relevant packages, configure some plotting options, and define some basic (path) variables.
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import pathlib
from PIL import Image
import requests
from io import BytesIO
import pokefunctions
# Configure the seaborn theme in a single call. Each sns.set() call re-applies
# every theme parameter (unspecified ones fall back to their defaults), and
# sns.set_style() re-applies the style's own font settings, so spreading the
# options over several calls silently discards the earlier ones.
# font_scale is left at its default because the text sizes are set explicitly
# below.
sns.set(style='ticks', font='Calibri')

# Apply the matplotlib font settings after the theme so that they are kept.
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Calibri'

# Hide the top and right axes spines.
plt.rc('axes.spines', top=False, right=False)

# Control font sizes.
plt.rc('font', size=12)  # Default text size
plt.rc('axes', titlesize=16)  # Axes title
plt.rc('axes', labelsize=16)  # x and y labels
plt.rc('xtick', labelsize=12)  # x tick labels
plt.rc('ytick', labelsize=12)  # y tick labels
plt.rc('legend', fontsize=12)  # Legend
plt.rc('figure', titlesize=16)  # Figure title
# Locations of the input data and of the exported figures, both relative to
# the current working directory.
_PARENT_DIR = pathlib.Path('..')
PATH_DATA = _PARENT_DIR / 'data'
PATH_IMAGES = _PARENT_DIR / 'images'
I downloaded a copy of the original results as an Excel file. Let's read just the relevant parts:
# Read the survey responses from the Excel file through the project helper
# and preview the first three rows.
responses_file = PATH_DATA / 'responses.xlsx'
df = pokefunctions.read_raw_data(responses_file)
df.head(n=3)
name | votes | types | generation | family | |
---|---|---|---|---|---|
1 | Bulbasaur | 710.0 | Grass Poison | 1 | Bulbasaur |
2 | Ivysaur | 83.0 | Grass Poison | 1 | Bulbasaur |
3 | Venusaur | 127.0 | Grass Poison | 1 | Bulbasaur |
From the aggregated results:
# Total number of votes cast across all Pokemon.
vote_counts = df['votes']
n_votes = vote_counts.sum()
print(n_votes)
52725.0
Original result: 65
# Average number of votes per distinct Pokemon name.
unique_names = df['name'].unique()
n_pokemon = len(unique_names)
average_votes = n_votes / n_pokemon
print(average_votes)
65.17305315203956
# Average number of votes per generation.
# Only the numeric 'votes' column is selected before aggregating: the frame
# also holds text columns (name, types, family), and pandas >= 2.0 raises a
# TypeError when asked to average non-numeric columns.
df_votes_generation = df.groupby('generation')[['votes']].mean()
print(df_votes_generation)
votes generation 1 121.119205 2 91.150000 3 70.140741 4 70.383178 5 28.147436 6 29.916667 7 20.181818
# Line plot of the average number of votes per generation.
fig, ax = plt.subplots(1, 1, figsize=[5, 5])

# Reuse the per-generation averages that were already computed instead of
# running the same group-by aggregation a second time.
ax.plot(df_votes_generation.index, df_votes_generation['votes'], linewidth=5)

ax.set_xticks(df_votes_generation.index)
ax.set_ylim((0, 130))
ax.set_title("Average votes per generation", fontdict={'weight': 'bold'})
ax.set_xlabel("Generation")
ax.set_ylabel("Average votes")

# Export the figure, then display it.
fig.savefig(PATH_IMAGES/'generation_average.png', dpi=1000, bbox_inches='tight')
plt.show()
Generation colors were taken from Bulbapedia.
# Colors used for the generations, provided by the project helper module.
generation_palette = pokefunctions.generation_palette()

# Box plot of the votes received by the Pokemon of each generation.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[7, 7])
sns.boxplot(data=df, x='generation', y='votes', palette=generation_palette, ax=ax)
ax.set_title("Vote dispersion per generation", fontdict={'weight': 'bold'})
ax.set(xlabel="Generation", ylabel="Votes")

# Export the figure, then display it.
boxplot_file = PATH_IMAGES / 'generation_boxplot.png'
fig.savefig(boxplot_file, dpi=1000, bbox_inches='tight')
plt.show()
The outliers show us Pokémon with a high number of votes. As we will see in a second, it is clear that no Pokémon from generations 5, 6, or 7 cracks the top spots.
Sort Pokémon from most to least popular:
# Order the rows by vote count, highest first.
df_sorted = df.sort_values(by='votes', ascending=False)
The top 10 (most popular) Pokémon:
# Show the first ten rows of the sorted table.
df_sorted.head(n=10)
name | votes | types | generation | family | |
---|---|---|---|---|---|
6 | Charizard | 1107.0 | Fire Flying Dragon | 1 | Charmander |
94 | Gengar | 1056.0 | Ghost Poison | 1 | Gastly |
59 | Arcanine | 923.0 | Fire | 1 | Growlithe |
1 | Bulbasaur | 710.0 | Grass Poison | 1 | Bulbasaur |
257 | Blaziken | 613.0 | Fire Fighting | 3 | Torchic |
197 | Umbreon | 607.0 | Dark | 2 | Eevee |
448 | Lucario | 604.0 | Fighting Steel | 4 | Riolu |
282 | Gardevoir | 585.0 | Psychic Fairy | 3 | Ralts |
133 | Eevee | 581.0 | Normal | 1 | Eevee |
149 | Dragonite | 551.0 | Dragon Flying | 1 | Dratini |
# Horizontal bar chart of the ten most voted Pokemon, colored by generation.
# The top-10 slice is taken once and shared by every argument that needs it.
df_top = df_sorted.head(10)

fig, ax = plt.subplots(1, 1, figsize=[7, 7])
sns.barplot(x='votes', y='name', hue='generation', palette=generation_palette, dodge=False, data=df_top, ax=ax)

# Add the sprite of the most voted Pokemon as an inset image.
# NOTE(review): get_sprite presumably returns None when no sprite could be
# obtained -- confirm in pokefunctions. The identity test is used because
# `!= None` is evaluated element-wise on array-like images.
pokemon_image = pokefunctions.get_sprite(df_sorted.index[0])
if pokemon_image is not None:
    ax_ins = inset_axes(ax, width=2, height=2, loc='lower right')
    ax_ins.imshow(pokemon_image)
    ax_ins.axis('off')

ax.set_title("Most popular Pokemon", fontdict={'weight': 'bold'})
ax.set_xlabel("Votes")
ax.set_ylabel("")
ax.yaxis.set_tick_params(labelsize=16)

# Place the legend outside the axes, to the right of the bars.
ax.legend(frameon=False, loc='upper right', bbox_to_anchor=(1.25, 1)).set_title("Generation")

# Export the figure, then display it.
fig.savefig(PATH_IMAGES/'popular.png', dpi=1000, bbox_inches='tight')
plt.show()
Now we know who those outliers from earlier were ;) .
The top 10 (most popular) Pokémon families:
# Sum the votes of every family and keep the ten families with the highest
# totals, ordered from most to least voted.
family_totals = df[['votes', 'family']].groupby(['family']).sum()
family_ranking = family_totals.sort_values('votes', ascending=False)
df_families = family_ranking.head(10)
df_families
votes | |
---|---|
family | |
Eevee | 2865.0 |
Charmander | 1551.0 |
Gastly | 1384.0 |
Growlithe | 1199.0 |
Cyndaquil | 1092.0 |
Squirtle | 1066.0 |
Ralts | 924.0 |
Bulbasaur | 920.0 |
Scyther | 818.0 |
Mudkip | 806.0 |
# Horizontal bar chart of the total votes of the ten most voted families.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[7, 7])
sns.barplot(data=df_families, x=df_families['votes'], y=df_families.index, ax=ax)

ax.set_title("Most popular Pokemon families", fontdict={'weight': 'bold'})
ax.set(xlabel="Votes", ylabel="Pokemon Family")
ax.yaxis.set_tick_params(labelsize=16)

# Export the figure, then display it.
family_plot_file = PATH_IMAGES / 'popular_family.png'
fig.savefig(family_plot_file, dpi=1000, bbox_inches='tight')
plt.show()
The most popular Pokémon of each generation:
# Draw one bar chart per generation with its ten most voted Pokemon, preceded
# by an 'all' chart holding the most voted Pokemon of every generation.
generations = list(df['generation'].unique().astype(int))
generations.insert(0, 'all')

for generation in generations:

    if generation == 'all':
        # Keep the rows whose vote count equals the maximum of their
        # generation. The aggregation is given by name because passing the
        # builtin `max` to transform is deprecated in recent pandas versions.
        idx = df.groupby('generation')['votes'].transform('max') == df['votes']
        df_generation = df[idx]
        title_str = "each generation"
        file_name = "all_generations"
    else:
        # The query compares the generation column against the generation
        # number written as a quoted string.
        df_temp = df.query('generation == "' + str(generation) + '"')
        df_generation = df_temp.sort_values('votes', ascending=False).head(10)
        title_str = "generation " + str(generation)
        file_name = "generation" + str(generation)

    fig, ax = plt.subplots(1, 1, figsize=[7, 7])
    sns.barplot(x='votes', y='name', hue='generation', palette=generation_palette, dodge=False, data=df_generation, ax=ax)

    if generation == 'all':
        ax.legend(frameon=False, loc='lower right').set_title("Generation")
    else:
        # Remove legend.
        ax.get_legend().remove()

        # Add sprite of the most popular Pokemon of the generation.
        # NOTE(review): get_sprite presumably returns None when no sprite
        # could be obtained -- confirm in pokefunctions. The identity test is
        # used because `!= None` is evaluated element-wise on array-like
        # images.
        pokemon_image = pokefunctions.get_sprite(df_generation.index[0])
        if pokemon_image is not None:
            ax_ins = inset_axes(ax, width=2, height=2, loc='lower right')
            ax_ins.imshow(pokemon_image)
            ax_ins.axis('off')

    ax.set_title("Most popular Pokemon of " + title_str, fontdict={'weight': 'bold'})
    ax.set_xlabel("Votes")
    ax.set_ylabel("")
    ax.yaxis.set_tick_params(labelsize=16)

    # Export the figure, then display it.
    fig.savefig((PATH_IMAGES/(file_name + '.png')), dpi=1000, bbox_inches='tight')
    plt.show()

pass  # Suppress plot output.