#!/usr/bin/env python
# coding: utf-8
"""Rate SPADL actions with VAEP and rank players by total and per-90 value.

Exported from a Jupyter notebook; the ``# In[n]:`` markers delimit the
original cells. The script reads the SPADL store and the per-game
predictions from ``../data-fifa``, values every action with the VAEP formula,
aggregates the values per player, and finally plots Belgium's most valuable
non-shot actions.
"""

# In[1]:

try:
    _ipython = get_ipython()
except NameError:
    # Plain ``python`` run: the autoreload magics only exist inside IPython.
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('load_ext', 'autoreload')
    _ipython.run_line_magic('autoreload', '2')

import os
import warnings

import tqdm
import pandas as pd

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import socceraction.vaep.formula as vaepformula


# In[2]:

## Configure file and folder names
datafolder = "../data-fifa"
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
predictions_h5 = os.path.join(datafolder, "predictions.h5")


# In[3]:

with pd.HDFStore(spadl_h5) as spadlstore:
    # Attach competition info and the home/away team columns to every game.
    games = (
        spadlstore["games"]
        .merge(spadlstore["competitions"], how='left')
        .merge(spadlstore["teams"].add_prefix('home_'), how='left')
        .merge(spadlstore["teams"].add_prefix('away_'), how='left')
    )
    players = spadlstore["players"]
    teams = spadlstore["teams"]
    actiontypes = spadlstore["actiontypes"]
    bodyparts = spadlstore["bodyparts"]
    results = spadlstore["results"]

print("nb of games:", len(games))


# In[4]:

rated_games = []
# ``total`` gives tqdm the length without materialising the tuples in a list.
for game in tqdm.tqdm(games.itertuples(), total=len(games), desc="Rating actions"):
    actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
    # Join the lookup tables onto the actions, then put them in a fixed order
    # with a fresh 0..n-1 index before the column-wise concat below.
    actions = (
        actions
        .merge(actiontypes, how="left")
        .merge(bodyparts, how="left")
        .merge(players, how="left")
        .merge(teams, how="left")
        .merge(results, how="left")
        .sort_values(["game_id", "period_id", "action_id"])
        .reset_index(drop=True)
    )
    preds = pd.read_hdf(predictions_h5, f"game_{game.game_id}")
    values = vaepformula.value(actions, preds.scores, preds.concedes)
    rated_games.append(pd.concat([actions, preds, values], axis=1))

# One frame with every rated action. The reset index is a 0..n-1 range, so an
# index label doubles as a row position (relied on when plotting below).
A = (
    pd.concat(rated_games)
    .sort_values(["game_id", "period_id", "time_seconds", "timestamp"])
    .reset_index(drop=True)
)
A.columns


# ### Most valuable players

# In[5]:

def _display_name(frame):
    """Return each row's nickname, or its player_name when there is none.

    A nickname counts as missing when it is null (None/NaN) or an empty
    string. Columns are addressed by label, which avoids positional integer
    keys on a label-indexed Series and keeps a NaN nickname from being
    treated as a real name.
    """
    nickname = frame["nickname"]
    has_nickname = nickname.notna() & (nickname != "")
    return nickname.where(has_nickname, frame["player_name"])


# Helper column: summing it per player yields that player's number of actions.
A["count"] = 1

playersR = (
    A[["player_id", "vaep_value", "offensive_value", "defensive_value", "count"]]
    .groupby(["player_id"])
    .sum()
    .reset_index()
)
playersR = playersR.merge(
    players[["player_id", "nickname", "player_name"]], how="left"
)
playersR["player_name"] = _display_name(playersR)
playersR = playersR[
    ["player_id", "player_name", "vaep_value", "offensive_value", "defensive_value", "count"]
]
playersR.sort_values("vaep_value", ascending=False)[:10]


# In[6]:

# Normalize for minutes played
pg = pd.read_hdf(spadl_h5, "player_games")
pg = pg[pg.game_id.isin(games.game_id)]
mp = pg[["player_id", "minutes_played"]].groupby("player_id").sum().reset_index()

stats = playersR.merge(mp)
# Keep players with more than 180 minutes (two full games). ``.copy()`` makes
# the filtered frame independent so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
stats = stats[stats.minutes_played > 180].copy()
stats["vaep_rating"] = stats.vaep_value * 90 / stats.minutes_played
stats["offensive_rating"] = stats.offensive_value * 90 / stats.minutes_played
stats.sort_values("vaep_rating", ascending=False)[:10]


# ### (optional) inspect Belgium's most valuable non-shot actions

# In[7]:

import matplotsoccer

N_TOP_ACTIONS = 50   # number of highest-valued actions to print and plot
CONTEXT_BEFORE = 3   # actions shown before the highlighted one
CONTEXT_AFTER = 1    # actions shown after the highlighted one

sorted_A = A.sort_values("vaep_value", ascending=False)
sorted_A = sorted_A[sorted_A.team_name == "Belgium"]  # view only actions from Belgium
sorted_A = sorted_A[~sorted_A.type_name.str.contains("shot")]  # eliminate shots


def get_time(period_id, time_seconds):
    """Format a (period, seconds-into-period) pair as ``"<m>m<s>s"``.

    Every period before ``period_id`` is counted as 45 minutes; whole-number
    seconds are printed without a decimal part.
    NOTE(review): the 45-minute assumption presumably only holds for the two
    regular halves -- confirm how extra-time periods should be displayed.
    """
    m = int((period_id - 1) * 45 + time_seconds // 60)
    s = time_seconds % 60
    if s == int(s):
        s = int(s)
    return f"{m}m{s}s"


cols = ["time", "type_name", "player_name", "team_name", "scores", "vaep_value"]

# ``head`` yields fewer rows when fewer qualifying actions exist, so the loop
# cannot index past the end of ``sorted_A``.
for row in sorted_A.head(N_TOP_ACTIONS).itertuples():
    i = row.Index
    # Clamp the window start: a negative slice start would count from the end
    # of ``A`` and produce an empty or unrelated window for the first rows.
    a = A.iloc[max(i - CONTEXT_BEFORE, 0): i + CONTEXT_AFTER + 1]
    # Neighbouring rows can belong to another match; keep this game's only.
    a = a[a.game_id == row.game_id].copy()
    a["player_name"] = _display_name(a)

    g = next(games[games.game_id == row.game_id].itertuples())
    game_info = f"{g.game_date} {g.home_team_name} {g.home_score}-{g.away_score} {g.away_team_name}"
    minute = int((row.period_id - 1) * 45 + row.time_seconds // 60)
    print(f"{game_info} {minute}' {row.type_name} {row.player_name}")

    # Pre-format the numeric columns as strings for the plot's label table.
    a["scores"] = a.scores.apply(lambda x: "%.3f" % x)
    a["vaep_value"] = a.vaep_value.apply(lambda x: "%.3f" % x)
    a["time"] = [get_time(p, t) for p, t in zip(a.period_id, a.time_seconds)]

    matplotsoccer.actions(
        a[["start_x", "start_y", "end_x", "end_y"]],
        a.type_name,
        team=a.team_name,
        result=a.result_name == "success",
        label=a[cols],
        labeltitle=cols,
        zoom=False,
    )


# In[ ]: