import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, make_scorer, brier_score_loss
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from joblib import dump, load
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
# monkey patch for BayesSearchCV (https://github.com/scikit-optimize/scikit-optimize/issues/902)
from numpy.ma import MaskedArray
import sklearn.utils.fixes
sklearn.utils.fixes.MaskedArray = MaskedArray
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')
Random state
seed = 42
Set up metrics (see: http://business-analytic.co.uk/blog/evaluating-expected-goals-models/)
# define McFadden's pseudo R-squared
def mcfadden_r2(y, y_pred):
    ll = log_loss(y, y_pred)
    ll_null = log_loss(y, np.full(len(y), y.mean()))
    return 1 - (ll / ll_null)
pseudo_r2_scorer = make_scorer(mcfadden_r2, needs_proba=True, greater_is_better=True)
scoring = {'roc_auc': 'roc_auc', 'mcfaddens_r2': pseudo_r2_scorer}
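For reference, McFadden's pseudo R-squared is $1 - \frac{\ln L_{model}}{\ln L_{null}}$, where the null model always predicts the overall goal rate. As log_loss is the negative mean log-likelihood, the ratio of the two losses gives the same value.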
Set up folder for storing models
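A minimal sketch, assuming the relative '../models' path used by the dump calls later (the '../figures' and '../data/modelling' folders used for saving can be created the same way):
# make sure the models folder exists before saving
os.makedirs(os.path.join('..', 'models'), exist_ok=True)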
Load the data
df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))
df.drop(['match_id', 'statsbomb_id', 'statsbomb_team_id', 'player_id_statsbomb', 'competition_gender', 'team_name',
'player_id', 'firstName', 'middleName', 'lastName', 'Name', 'dataset', 'wyscout_id', 'wyscout_team_id', 'team_id',
'player_id_wyscout'], axis=1, inplace=True)
X = df.drop('goal', axis=1)
y = df.goal
Split into train and test datasets (calibration is handled later inside CalibratedClassifierCV's cross-validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=seed, stratify=y)
print('Shots train', len(y_train), ';Number goals', y_train.sum(),
';Goals %: ', round(y_train.mean()*100, 1))
print('Shots test', len(y_test), ';Number goals', y_test.sum(),
';Goals %: ', round(y_test.mean()*100, 1))
Shots train 51335 ;Number goals 5443 ;Goals %: 10.6
Shots test 12834 ;Number goals 1361 ;Goals %: 10.6
Load and split fake data
df_fake = pd.read_parquet(os.path.join('..', 'data', 'fake_shots.parquet'))
# prefix the index with 'a' so the fake shots do not clash with the real shot ids
df_fake.index = ['a'+str(idx) for idx in df_fake.index]
y_fake = df_fake.goal
X_fake = df_fake.drop('goal', axis=1)
print('Shots fake', len(y_fake), ';Goals %: ', round(y_fake.mean()*100, 1))
Shots fake 1000 ;Goals %: 3.4
Subset dataset for logistic regression
# columns to drop for the logistic regression models
logistic_drop_cols = ['x', 'y',  # logistic regression does not deal well with dependent features;
                      # the distance/angle features capture these locations instead.
                      # The features below have lots of missing values as they come from StatsBomb data only.
                      # It's not fair to impute these as they are not missing at random,
                      # and logistic regression does not allow missing values, so I removed them.
                      'pass_end_y', 'pass_end_x',  # <- note these were in Wyscout, but often were just the shot location
                      'eventSec', 'period', 'player_id_goalkeeper',
                      'goalkeeper_x', 'goalkeeper_y', 'carry_length', 'shot_one_on_one', 'shot_open_goal',
                      'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'smart_pass']
X_train_logistic = X_train.drop(logistic_drop_cols, axis=1).copy()
X_test_logistic = X_test.drop(logistic_drop_cols, axis=1).copy()
Split dataset for logistic regression into passes / other assists
def split(X, y):
    """Split shots into those assisted by a pass and all other shots."""
    mask = X.assist_type == 'pass'
    X_pass = X[mask].drop('assist_type', axis=1).copy()
    y_pass = y[mask]
    # drop columns that are entirely missing for non-pass assists (pass-specific features)
    X_other = X[~mask].dropna(axis=1, how='all').copy()
    y_other = y[~mask]
    return X_pass, y_pass, X_other, y_other
X_train_pass, y_train_pass, X_train_other, y_train_other = split(X_train_logistic, y_train)
X_test_pass, y_test_pass, X_test_other, y_test_other = split(X_test_logistic, y_test)
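A quick optional check (a sketch using the variables above) of which pass-specific columns drop out of the other-assist subset:
# pass-specific features are entirely missing for other assists, so dropna(axis=1, how='all') removes them
print(set(X_train_pass.columns) - set(X_train_other.columns))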
Pipeline for cleaning pass assists
cols = ['shot_type_name', 'body_part_name', 'pass_technique_name', 'pass_height_name']
cats = [['open_play', 'free_kick', 'corner', 'throw_in'],
['Right Foot', 'Left Foot', 'Other'],
['other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging'],
['Ground/ Low Pass', 'High Pass']]
pass_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')
pipe_pass = Pipeline([('one_hot', pass_one_hot),
('impute', SimpleImputer()),
('scale', StandardScaler()),
('lr', LogisticRegression(random_state=seed))])
Column names of transformed pass data
original_cols_remain = [col for col in X_train_pass.columns if col not in cols]
new_cols_pass = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]
new_cols_pass.extend(original_cols_remain)
print(new_cols_pass)
['free_kick', 'corner', 'throw_in', 'Left Foot', 'Other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging', 'High Pass', 'counter_attack', 'fast_break', 'strong_foot', 'pass_switch', 'pass_cross', 'pass_cut_back', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']
Pipeline for cleaning other assists
# set 'direct' assists to 'recovery' so they are not encoded twice (direct is also covered by shot_type_name == 'direct_set_piece')
X_train_other.loc[X_train_other.assist_type == 'direct', 'assist_type'] = 'recovery'
X_test_other.loc[X_test_other.assist_type == 'direct', 'assist_type'] = 'recovery'
cols = ['shot_type_name', 'body_part_name', 'assist_type']
cats = [['open_play', 'free_kick', 'corner', 'throw_in', 'direct_set_piece'],
['Right Foot', 'Left Foot', 'Other'],
['recovery', 'clearance', 'rebound']]
other_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')
pipe_other = Pipeline([('one_hot', other_one_hot),
('impute', SimpleImputer()),
('scale', StandardScaler()),
('lr', LogisticRegression(random_state=seed))])
Column names of the transformed other-assist data
original_cols_remain = [col for col in X_train_other.columns if col not in cols]
new_cols_other = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]
new_cols_other.extend(original_cols_remain)
print(new_cols_other)
['free_kick', 'corner', 'throw_in', 'direct_set_piece', 'Left Foot', 'Other', 'clearance', 'rebound', 'counter_attack', 'fast_break', 'strong_foot', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']
Search parameters for GridSearchCV
param_grid = {'lr__C': np.logspace(-3, 0.1, 100)}
Fit the inner grid search for shots assisted by passes
clf_pass = GridSearchCV(estimator=pipe_pass, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_pass.fit(X_train_pass, y_train_pass)
print('C:', clf_pass.best_estimator_.named_steps.lr.C)
C: 0.04248961816344005
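As an optional sketch, the fitted coefficients can be lined up with the transformed column names from above (assuming the imputer keeps every column, which it does unless a column is entirely missing):
# coefficients are on the standardised scale because of the StandardScaler step
coef_pass = pd.Series(clf_pass.best_estimator_.named_steps.lr.coef_[0], index=new_cols_pass)
print(coef_pass.sort_values())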
Fit the inner grid search for shots assisted other than passes
clf_other = GridSearchCV(estimator=pipe_other, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_other.fit(X_train_other, y_train_other)
print('C:', clf_other.best_estimator_.named_steps.lr.C)
C: 0.42687726983178853
Outer loops for unbiased estimates of the model accuracy
nested_score_pass = cross_validate(clf_pass, X=X_train_pass, y=y_train_pass, scoring=scoring, n_jobs=-1)
print('ROC AUC for shots assisted by passes:', nested_score_pass['test_roc_auc'].mean())
print("McFadden's Pseudo R-squared shots assisted by passes:", nested_score_pass['test_mcfaddens_r2'].mean())
ROC AUC for shots assisted by passes: 0.7817401551132565
McFadden's Pseudo R-squared shots assisted by passes: 0.15855492654518005
nested_score_other = cross_validate(clf_other, X=X_train_other, y=y_train_other, scoring=scoring, n_jobs=-1)
print('ROC AUC for other model:', nested_score_other['test_roc_auc'].mean())
print("McFadden's Pseudo R-squared for other model:", nested_score_other['test_mcfaddens_r2'].mean())
ROC AUC for other model: 0.8033677417704617
McFadden's Pseudo R-squared for other model: 0.19094339212253655
Add fake training data. I am not adding this to the test data as I want the test set to stay representative of real data.
X_train = pd.concat([X_train, X_fake])
y_train = pd.concat([y_train, y_fake])
Clean the data: convert categories to numbers and drop the distance/angle measures, as I just want the raw locations for this model.
def clean_lightgbm(df):
    df = df.copy()
    # replace categorical columns with integer codes
    shot_type_cat = {'free_kick': 0, 'corner': 1, 'throw_in': 2, 'direct_set_piece': 3, 'open_play': 4}
    body_type_cat = {'Right Foot': 0, 'Left Foot': 1, 'Other': 2}
    assist_type_cat = {'pass': 0, 'recovery': 1, 'clearance': 2, 'direct': 3, 'rebound': 4}
    pass_height_cat = {'High Pass': 0, 'Ground/ Low Pass': 1}
    pass_technique_cat = {'Through Ball': 0, 'Straight': 1, 'Inswinging': 2, 'Outswinging': 3, 'other': 4}
    df['shot_type_name'] = df['shot_type_name'].replace(shot_type_cat)
    df['body_part_name'] = df['body_part_name'].replace(body_type_cat)
    df['assist_type'] = df['assist_type'].replace(assist_type_cat)
    df['pass_height_name'] = df['pass_height_name'].replace(pass_height_cat)
    df['pass_technique_name'] = df['pass_technique_name'].replace(pass_technique_cat)
    # cast boolean-like columns to float (they contain NaNs, so cannot be true booleans)
    for col in ['pass_switch', 'pass_cross', 'pass_cut_back', 'shot_one_on_one',
                'shot_open_goal', 'under_pressure', 'smart_pass']:
        df[col] = df[col].astype(np.float32)
    # drop some distance/angle columns
    drop_cols = ['visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle',
                 'log_distance_to_goal', 'eventSec', 'period', 'player_id_goalkeeper']
    df.drop(drop_cols, axis=1, inplace=True)
    return df
X_train = clean_lightgbm(X_train)
X_test = clean_lightgbm(X_test)
print(X_train.columns)
Index(['shot_type_name', 'x', 'y', 'counter_attack', 'fast_break', 'strong_foot', 'body_part_name', 'assist_type', 'pass_end_y', 'pass_end_x', 'pass_switch', 'pass_cross', 'pass_cut_back', 'pass_height_name', 'pass_technique_name', 'carry_length', 'shot_one_on_one', 'shot_open_goal', 'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'goalkeeper_x', 'goalkeeper_y', 'smart_pass'], dtype='object')
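An optional dtype check confirms every remaining column is numeric, which is what LightGBM expects (it handles the NaNs natively):
print(X_train.dtypes.value_counts())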
Fit the nested 5-fold cross validation using Bayesian optimisation.
#lgbm = LGBMClassifier(random_state=42)
lgbm = CalibratedClassifierCV(LGBMClassifier(random_state=42), method='isotonic', cv=3)
lgbm_param_grid = {'base_estimator__min_child_samples': Integer(0, 200),
'base_estimator__num_leaves': Integer(2, 500),
'base_estimator__reg_lambda': Real(0, 1),
'base_estimator__reg_alpha': Real(0, 1),
'base_estimator__max_depth': Integer(0, 500)}
# Nested resampling using skopt. see: https://github.com/scikit-optimize/scikit-optimize/issues/725
searchcv = BayesSearchCV(estimator=lgbm,
n_iter=100,
search_spaces=lgbm_param_grid,
cv=5,
n_jobs=-1)
searchcv.fit(X_train, y_train)
BayesSearchCV(cv=5, estimator=CalibratedClassifierCV(base_estimator=LGBMClassifier(random_state=42), cv=3, method='isotonic'), n_iter=100, n_jobs=-1, search_spaces={'base_estimator__max_depth': Integer(low=0, high=500, prior='uniform', transform='identity'), 'base_estimator__min_child_samples': Integer(low=0, high=200, prior='uniform', transform='identity'), 'base_estimator__num_leaves': Integer(low=2, high=500, prior='uniform', transform='identity'), 'base_estimator__reg_alpha': Real(low=0, high=1, prior='uniform', transform='identity'), 'base_estimator__reg_lambda': Real(low=0, high=1, prior='uniform', transform='identity')})
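The winning hyperparameters and the inner cross-validation score can be inspected (a short optional sketch):
print('Best parameters:', searchcv.best_params_)
print('Best inner-CV score:', searchcv.best_score_)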
Permutation importance
# note: X_train still includes the fake shots added earlier, so they contribute to the permutation importance
perm_result = permutation_importance(searchcv.best_estimator_, X_train, y_train, n_repeats=10, random_state=seed)
df_perm_importance = pd.DataFrame({'Feature':X_train.columns,
'importance': perm_result.importances.mean(axis=1),
'std_dev': perm_result.importances.std(axis=1)})
df_perm_importance.sort_values('importance', ascending=False, inplace=True)
df_perm_importance.reset_index(drop=True, inplace=True)
df_perm_importance
| | Feature | importance | std_dev |
|---|---|---|---|
0 | x | 1.435942e-02 | 0.000504 |
1 | y | 1.153148e-02 | 0.000537 |
2 | goalkeeper_x | 1.199962e-03 | 0.000086 |
3 | n_angle | 9.439190e-04 | 0.000156 |
4 | shot_type_name | 8.350053e-04 | 0.000145 |
5 | goalkeeper_y | 7.413777e-04 | 0.000124 |
6 | body_part_name | 5.254610e-04 | 0.000176 |
7 | carry_length | 5.216394e-04 | 0.000117 |
8 | pass_technique_name | 4.681380e-04 | 0.000101 |
9 | shot_open_goal | 4.337441e-04 | 0.000046 |
10 | area_shot | 3.993503e-04 | 0.000115 |
11 | pass_height_name | 3.917073e-04 | 0.000154 |
12 | pass_end_x | 2.770612e-04 | 0.000081 |
13 | assist_type | 2.541320e-04 | 0.000090 |
14 | strong_foot | 2.197382e-04 | 0.000116 |
15 | pass_end_y | 2.159167e-04 | 0.000110 |
16 | area_goal | 1.948983e-04 | 0.000166 |
17 | pass_cross | 1.318429e-04 | 0.000153 |
18 | counter_attack | 9.362759e-05 | 0.000065 |
19 | pass_switch | 5.159071e-05 | 0.000059 |
20 | fast_break | 2.101844e-05 | 0.000039 |
21 | under_pressure | 1.110223e-17 | 0.000052 |
22 | smart_pass | -5.732302e-06 | 0.000028 |
23 | shot_one_on_one | -1.146460e-05 | 0.000018 |
24 | pass_cut_back | -2.101844e-05 | 0.000025 |
fig, ax = plt.subplots(figsize=(16, 9))
sorted_idx = perm_result.importances_mean.argsort()
bar_plot = ax.boxplot(perm_result.importances[sorted_idx].T, vert=False, labels=X_train.columns[sorted_idx])
Calculate calibration curve on test data
y_pred_lgbm_calibrated = searchcv.best_estimator_.predict_proba(X_test)[:, 1]
fraction_of_positives_lgbm, mean_predicted_value_lgbm = calibration_curve(y_test, y_pred_lgbm_calibrated, n_bins=10)
# logistic regression
y_pred_lr_pass = clf_pass.predict_proba(X_test_pass)[:, 1]
y_pred_lr_other = clf_other.predict_proba(X_test_other)[:, 1]
y_pred_lr = np.concatenate([y_pred_lr_pass, y_pred_lr_other])
y_true_test = np.concatenate([y_test_pass, y_test_other])
fraction_of_positives_lr, mean_predicted_value_lr = calibration_curve(y_true_test, y_pred_lr, n_bins=10)
Plot calibration curve on test data
plt.style.use('dark_background')
fig = plt.figure(constrained_layout=True, figsize=(10, 15))
gs = fig.add_gridspec(ncols=1, nrows=2, height_ratios=(2/3, 1/3))
ax1 = fig.add_subplot(gs[0])
ax1.plot(mean_predicted_value_lgbm, fraction_of_positives_lgbm, "-o", color='#aabced', label='Calibrated Light GBM')
ax1.plot(mean_predicted_value_lr, fraction_of_positives_lr, "-o", color='#dbdf4a', label='Logistic regression')
ax1.plot([0, 1], [0, 1], "--", color='#e7aeca', label="Perfectly calibrated")
ax1.set_xlabel('Mean predicted value', fontsize=15)
ax1.set_ylabel('Fraction of positives', fontsize=15)
ax1.set_title('Calibration curve', fontsize=20, pad=10)
ax1.legend(fontsize=15)
ax1.tick_params(labelsize=15)
ax2 = fig.add_subplot(gs[1])
# histograms of the predicted probabilities for the two models
sns.histplot(y_pred_lr, color='#4fe4e4', label='Logistic regression', ax=ax2)
sns.histplot(y_pred_lgbm_calibrated, color='#aabced', label='Calibrated Light GBM', ax=ax2)
ax2.set_xlabel('Predicted value', fontsize=15)
ax2.set_ylabel('Count', fontsize=15)
ax2.tick_params(labelsize=15)
ax2.legend(fontsize=15)
ax2.set_title('Distribution of predictions', fontsize=20, pad=10);
fig.savefig(os.path.join('..', 'figures', '22_calibration_curve.png'), bbox_inches='tight', pad_inches=0.2)
From scikit-learn docs: "The smaller the Brier score, the better, hence the naming with “loss”. Across all items in a set of N predictions, the Brier score measures the mean squared difference between (1) the predicted probability assigned to the possible outcomes for item i, and (2) the actual outcome."
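In symbols, $BS = \frac{1}{N}\sum_{i=1}^{N}(p_i - o_i)^2$, where $p_i$ is the predicted probability for shot $i$ and $o_i \in \{0, 1\}$ is the actual outcome.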
print('Brier score, Light GBM:', brier_score_loss(y_test, y_pred_lgbm_calibrated, pos_label=y_test.max()))
print('ROC AUC, Light GBM:', roc_auc_score(y_test, y_pred_lgbm_calibrated))
print('Pseudo R-squared, Light GBM:', mcfadden_r2(y_test, y_pred_lgbm_calibrated))
Brier score, Light GBM: 0.08044014131802528
ROC AUC, Light GBM: 0.7851386121829785
Pseudo R-squared, Light GBM: 0.16991224608139832
print('Brier score, logistic regression:', brier_score_loss(y_true_test, y_pred_lr, pos_label=y_true_test.max()))
print('ROC AUC, logistic regression:', roc_auc_score(y_true_test, y_pred_lr))
print('Pseudo R-squared, logistic regression:', mcfadden_r2(y_true_test, y_pred_lr))
Brier score, logistic regression: 0.08150355125036453
ROC AUC, logistic regression: 0.786687948249966
Pseudo R-squared, logistic regression: 0.16477712349951212
dump(searchcv.best_estimator_, os.path.join('..', 'models', 'lgbm_model.joblib'))
['..\\models\\lgbm_model.joblib']
dump(clf_pass.best_estimator_, os.path.join('..', 'models', 'lr_pass.joblib'))
['..\\models\\lr_pass.joblib']
dump(clf_other.best_estimator_, os.path.join('..', 'models', 'lr_other.joblib'))
['..\\models\\lr_other.joblib']
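The saved models can be reloaded with joblib's load (imported above); a minimal sketch:
# reload a saved model, e.g. for scoring new shots
lgbm_reloaded = load(os.path.join('..', 'models', 'lgbm_model.joblib'))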
# reload shot dataset for ids
df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))
df = df[['match_id', 'wyscout_id', 'statsbomb_id']].copy()
X_train_other['goal'] = y_train_other
X_train_other['split'] = 'train'
X_test_other['goal'] = y_test_other
X_test_other['split'] = 'test'
df_other = pd.concat([X_train_other, X_test_other])
df_other = df_other.merge(df, left_index=True, right_index=True, validate='1:1', how='left')
df_other.reset_index(drop=True, inplace=True)
df_other.to_parquet(os.path.join('..', 'data', 'modelling', 'lr_other.parquet'))
X_train_pass['goal'] = y_train_pass
X_train_pass['split'] = 'train'
X_test_pass['goal'] = y_test_pass
X_test_pass['split'] = 'test'
df_pass = pd.concat([X_train_pass, X_test_pass])
df_pass = df_pass.merge(df, left_index=True, right_index=True, validate='1:1', how='left')
df_pass.reset_index(drop=True, inplace=True)
df_pass.to_parquet(os.path.join('..', 'data', 'modelling', 'lr_pass.parquet'))
X_train['goal'] = y_train
X_train['split'] = 'train'
X_test['goal'] = y_test
X_test['split'] = 'test'
df_lgbm = pd.concat([X_train, X_test])
# exclude fake shots (their 'a'-prefixed index is not in the real shot dataframe)
df_lgbm = df_lgbm[df_lgbm.index.isin(df.index)].copy()
df_lgbm = df_lgbm.merge(df, how='left', left_index=True, right_index=True, validate='1:1')
df_lgbm.to_parquet(os.path.join('..', 'data', 'modelling', 'lgbm.parquet'))