#!/usr/bin/env python
# coding: utf-8

# In[1]:

try:
    # nb_black only reformats notebook cells; the magic is available solely
    # when this file is executed by IPython/Jupyter.
    get_ipython().run_line_magic("load_ext", "nb_black")
except NameError:
    # Plain `python` execution: `get_ipython` is not defined and nothing
    # below depends on the extension.
    pass

import pandas as pd
import numpy as np
from os import listdir, path

# Directory holding the per-season input workbooks and receiving the
# summary workbooks written below.
output_path = "./output"

# Load every workbook whose file name starts with "season_", drop the
# "Unnamed: 0" column, discard rows with any missing value and order the
# result by season and rank.
players_df = (
    pd.concat(
        [
            pd.read_excel(path.join(output_path, f))
            for f in sorted(listdir(output_path))
            if f.startswith("season_")
        ]
    )
    .drop(columns=["Unnamed: 0"])
    .dropna()
).sort_values(["season", "rank"])
players_df


# In[2]:

players_df[players_df["name"] == "sepro"]


# In[3]:


def sdi(data) -> float:
    """
    Given a list of counts, calculate the Shannon Diversity Index

    Zero counts are removed first, so they never reach the logarithm.
    The index is ``-sum(p * ln(p))`` where ``p`` is each remaining count
    divided by the sum of all counts.

    :param data: iterable of counts (a list of integers, a pandas Series, ...)
    :return: Shannon Diversity Index
    :raises ZeroDivisionError: when the counts sum to zero (this includes
        empty input)
    """
    counts = np.array([d for d in data if d != 0], dtype="float64")
    total = np.sum(counts)

    if total == 0:
        raise ZeroDivisionError(
            "Cannot calculate Shannon Diversity Index when the sum of all observations is zero"
        )

    proportions = counts / total
    return 0 - np.sum(proportions * np.log(proportions))


def dominance(data) -> float:
    """
    Calculate the dominance of a set of counts: the sum of the squared
    proportions, each proportion being a count divided by the total.

    :param data: iterable of counts (a list of integers, a pandas Series, ...)
    :return: sum of squared proportions, or NaN when the total is not
        greater than zero
    """
    # Convert up front so plain lists can be divided element-wise as well;
    # dividing a Python list by a float raises TypeError.
    counts = np.array(data, dtype="float64")
    total = np.sum(counts)

    if total > 0:
        return np.sum(np.square(counts / total))
    return np.nan


def simpson(data) -> float:
    """
    Calculate one minus the dominance of a set of counts.

    :param data: iterable of counts, passed unchanged to ``dominance``
    :return: ``1 - dominance(data)``, or NaN when the dominance is NaN
    """
    d = dominance(data)
    if np.isnan(d):
        return np.nan
    return 1 - d


# In[4]:

# Number of players per (season, country), restricted to rows with
# rank <= 2000.
season_country_df = (
    players_df[players_df["rank"] <= 2000]
    .groupby(["season", "country"])
    .agg(count=pd.NamedAgg("name", "count"))
    .reset_index()
)
season_country_df


# In[5]:

# One row per season: number of countries present (richness) plus the three
# diversity measures computed over the per-country player counts.
season_diversity_df = (
    season_country_df.groupby(["season"])
    .agg(
        richness=pd.NamedAgg("count", "count"),
        shannon_diversity=pd.NamedAgg("count", sdi),
        dominance=pd.NamedAgg("count", dominance),
        simpson=pd.NamedAgg("count", simpson),
    )
    .reset_index()
)
season_diversity_df.set_index("season").to_excel(
    path.join(output_path, "seasonal_diversity_of_nations.xlsx")
)
season_diversity_df


# In[6]:

# Per season: how many player rows were loaded and the highest rank seen.
# NOTE(review): the highest rank is used as the total number of players;
# presumably ranks run contiguously from 1 - confirm.
summary_df = (
    players_df.groupby(["season"])
    .agg(
        players_found=pd.NamedAgg("name", "count"),
        players_total=pd.NamedAgg("rank", "max"),
    )
    .reset_index()
)
# Percentage of players_total that is present in the loaded data.
summary_df["players_captured"] = (
    summary_df["players_found"] * 100 / summary_df["players_total"]
)
# NOTE(review): 2860 is subtracted from both the found and the total player
# counts before taking the percentage; the meaning of this constant is not
# visible in this file - presumably a baseline number of players that is
# always captured. Confirm before changing it.
summary_df["extra_players_captured"] = (
    (summary_df["players_found"] - 2860) * 100 / (summary_df["players_total"] - 2860)
)
summary_df.set_index("season").to_excel("./output/seasonal_player_counts.xlsx")
summary_df


# In[7]:

season_df = pd.read_excel("./output/seasonal_stats.xlsx").drop(
    columns=["Unnamed: 0", "min_mmr", "max_mmr", "num_matches"]
)
estimates_df = pd.read_excel("./output/player_estimates.xlsx").drop(
    columns=["Unnamed: 0"]
)

# Left-join everything onto the per-season summary so each season appears
# even when one of the other tables has no row for it.
merged_df = pd.merge(summary_df, season_diversity_df, how="left", on="season")
merged_df = pd.merge(merged_df, season_df, how="left", on="season")
merged_df = pd.merge(merged_df, estimates_df, how="left", on="season")
merged_df


# In[8]:

# The series label is the part of the season identifier before the first "_".
merged_df["series"] = merged_df.season.apply(lambda x: x.split("_")[0])


# In[9]:

import seaborn as sns
import matplotlib.pyplot as plt


def _plot_against_total_players(data, y, title, ylabel):
    """
    Show one figure of column ``y`` against ``players_total``: a regression
    line over all rows with the individual points drawn on top, coloured by
    the ``series`` column.

    :param data: DataFrame with the columns ``players_total``, ``series``
        and ``y``
    :param y: name of the column plotted on the vertical axis
    :param title: figure title
    :param ylabel: label for the vertical axis
    """
    # Regression line only (scatter=False); the points come from the
    # scatterplot call so they can be coloured per series.
    sns.regplot(data=data, x="players_total", y=y, scatter=False, color=".25")
    sns.scatterplot(data=data, x="players_total", y=y, hue="series")
    plt.title(title)
    plt.xlabel("Number of Players")
    plt.ylabel(ylabel)
    plt.xlim(2500, 23000)
    plt.show()


_plot_against_total_players(
    merged_df,
    y="mean_estimate",
    title="Actual Players vs Estimated",
    ylabel="Estimated Players",
)


# In[10]:

_plot_against_total_players(
    merged_df,
    y="shannon_diversity",
    title="Actual Players vs Diversity",
    ylabel="Shannon Diversity",
)