import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, make_scorer, brier_score_loss
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from joblib import dump, load
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
# monkey patch for BayesSearchCV (https://github.com/scikit-optimize/scikit-optimize/issues/902)
from numpy.ma import MaskedArray
import sklearn.utils.fixes
sklearn.utils.fixes.MaskedArray = MaskedArray
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import warnings
warnings.filterwarnings('ignore')
Random state
seed = 42
Set up metrics (see: http://business-analytic.co.uk/blog/evaluating-expected-goals-models/)
# define McFadden's pseudo R-squared
def mcfadden_r2(y, y_pred):
    ll = log_loss(y, y_pred)
    ll_null = log_loss(y, np.full(len(y), y.mean()))
    return 1 - (ll / ll_null)
pseudo_r2_scorer = make_scorer(mcfadden_r2, needs_proba=True, greater_is_better=True)
scoring = {'roc_auc': 'roc_auc', 'mcfaddens_r2': pseudo_r2_scorer}
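For reference, McFadden's pseudo R-squared is $1 - \frac{\ln L_{model}}{\ln L_{null}}$, where the null model always predicts the overall goal rate. As log_loss is the negative mean log-likelihood, the ratio of the two losses gives the same value.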
Set up folder for storing models
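A minimal sketch, assuming the relative '../models' path used by the dump calls later (the '../figures' and '../data/modelling' folders used for saving can be created the same way):
# make sure the models folder exists before saving
os.makedirs(os.path.join('..', 'models'), exist_ok=True)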
Load the data
df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))
df.drop(['match_id', 'statsbomb_id', 'statsbomb_team_id', 'player_id_statsbomb', 'competition_gender', 'team_name',
'player_id', 'firstName', 'middleName', 'lastName', 'Name', 'dataset', 'wyscout_id', 'wyscout_team_id', 'team_id',
'player_id_wyscout'], axis=1, inplace=True)
X = df.drop('goal', axis=1)
y = df.goal
Split into train and test datasets (calibration is handled later inside CalibratedClassifierCV's cross-validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=seed, stratify=y)
print('Shots train', len(y_train), ';Number goals', y_train.sum(),
';Goals %: ', round(y_train.mean()*100, 1))
print('Shots test', len(y_test), ';Number goals', y_test.sum(),
';Goals %: ', round(y_test.mean()*100, 1))
Shots train 51335 ;Number goals 5443 ;Goals %: 10.6
Shots test 12834 ;Number goals 1361 ;Goals %: 10.6
Load and split fake data
df_fake = pd.read_parquet(os.path.join('..', 'data', 'fake_shots.parquet'))
# prefix the index with 'a' so the fake shots do not clash with the real shot ids
df_fake.index = ['a'+str(idx) for idx in df_fake.index]
y_fake = df_fake.goal
X_fake = df_fake.drop('goal', axis=1)
print('Shots fake', len(y_fake), ';Goals %: ', round(y_fake.mean()*100, 1))
Shots fake 1000 ;Goals %: 3.4
Subset dataset for logistic regression
# columns to drop for the logistic regression models
logistic_drop_cols = ['x', 'y',  # logistic regression does not deal well with dependent features;
                      # the distance/angle features capture these locations instead.
                      # The features below have lots of missing values as they come from StatsBomb data only.
                      # It's not fair to impute these as they are not missing at random,
                      # and logistic regression does not allow missing values, so I removed them.
                      'pass_end_y', 'pass_end_x',  # <- note these were in Wyscout, but often were just the shot location
                      'eventSec', 'period', 'player_id_goalkeeper',
                      'goalkeeper_x', 'goalkeeper_y', 'carry_length', 'shot_one_on_one', 'shot_open_goal',
                      'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'smart_pass']
X_train_logistic = X_train.drop(logistic_drop_cols, axis=1).copy()
X_test_logistic = X_test.drop(logistic_drop_cols, axis=1).copy()
Split dataset for logistic regression into passes / other assists
def split(X, y):
    """Split shots into those assisted by a pass and all other shots."""
    mask = X.assist_type == 'pass'
    X_pass = X[mask].drop('assist_type', axis=1).copy()
    y_pass = y[mask]
    # drop columns that are entirely missing for non-pass assists (pass-specific features)
    X_other = X[~mask].dropna(axis=1, how='all').copy()
    y_other = y[~mask]
    return X_pass, y_pass, X_other, y_other
X_train_pass, y_train_pass, X_train_other, y_train_other = split(X_train_logistic, y_train)
X_test_pass, y_test_pass, X_test_other, y_test_other = split(X_test_logistic, y_test)
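A quick optional check (a sketch using the variables above) of which pass-specific columns drop out of the other-assist subset:
# pass-specific features are entirely missing for other assists, so dropna(axis=1, how='all') removes them
print(set(X_train_pass.columns) - set(X_train_other.columns))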
Pipeline for cleaning pass assists
cols = ['shot_type_name', 'body_part_name', 'pass_technique_name', 'pass_height_name']
cats = [['open_play', 'free_kick', 'corner', 'throw_in'],
['Right Foot', 'Left Foot', 'Other'],
['other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging'],
['Ground/ Low Pass', 'High Pass']]
pass_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')
pipe_pass = Pipeline([('one_hot', pass_one_hot),
('impute', SimpleImputer()),
('scale', StandardScaler()),
('lr', LogisticRegression(random_state=seed))])
Column names of transformed pass data
original_cols_remain = [col for col in X_train_pass.columns if col not in cols]
new_cols_pass = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]
new_cols_pass.extend(original_cols_remain)
print(new_cols_pass)
['free_kick', 'corner', 'throw_in', 'Left Foot', 'Other', 'Through Ball', 'Straight', 'Inswinging', 'Outswinging', 'High Pass', 'counter_attack', 'fast_break', 'strong_foot', 'pass_switch', 'pass_cross', 'pass_cut_back', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']
Pipeline for cleaning other assists
# set 'direct' assists to 'recovery' so they are not encoded twice (direct is also covered by shot_type_name == 'direct_set_piece')
X_train_other.loc[X_train_other.assist_type == 'direct', 'assist_type'] = 'recovery'
X_test_other.loc[X_test_other.assist_type == 'direct', 'assist_type'] = 'recovery'
cols = ['shot_type_name', 'body_part_name', 'assist_type']
cats = [['open_play', 'free_kick', 'corner', 'throw_in', 'direct_set_piece'],
['Right Foot', 'Left Foot', 'Other'],
['recovery', 'clearance', 'rebound']]
other_one_hot = ColumnTransformer([('encoder', OneHotEncoder(drop='first', categories=cats), cols)], remainder='passthrough')
pipe_other = Pipeline([('one_hot', other_one_hot),
('impute', SimpleImputer()),
('scale', StandardScaler()),
('lr', LogisticRegression(random_state=seed))])
Column names of the transformed other-assist data
original_cols_remain = [col for col in X_train_other.columns if col not in cols]
new_cols_other = [item for sublist in cats for i, item in enumerate(sublist) if (i>0)]
new_cols_other.extend(original_cols_remain)
print(new_cols_other)
['free_kick', 'corner', 'throw_in', 'direct_set_piece', 'Left Foot', 'Other', 'clearance', 'rebound', 'counter_attack', 'fast_break', 'strong_foot', 'visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle', 'log_distance_to_goal']
Search parameters for GridSearchCV
param_grid = {'lr__C': np.logspace(-3, 0.1, 100)}
Fit the inner grid search for shots assisted by passes
clf_pass = GridSearchCV(estimator=pipe_pass, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_pass.fit(X_train_pass, y_train_pass)
print('C:', clf_pass.best_estimator_.named_steps.lr.C)
C: 0.04248961816344005
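As an optional sketch, the fitted coefficients can be lined up with the transformed column names from above (assuming the imputer keeps every column, which it does unless a column is entirely missing):
# coefficients are on the standardised scale because of the StandardScaler step
coef_pass = pd.Series(clf_pass.best_estimator_.named_steps.lr.coef_[0], index=new_cols_pass)
print(coef_pass.sort_values())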
Fit the inner grid search for shots assisted other than passes
clf_other = GridSearchCV(estimator=pipe_other, param_grid=param_grid, scoring='neg_log_loss', n_jobs=-1)
clf_other.fit(X_train_other, y_train_other)
print('C:', clf_other.best_estimator_.named_steps.lr.C)
C: 0.42687726983178853
Outer loops for unbiased estimates of the model accuracy
nested_score_pass = cross_validate(clf_pass, X=X_train_pass, y=y_train_pass, scoring=scoring, n_jobs=-1)
print('ROC AUC for shots assisted by passes:', nested_score_pass['test_roc_auc'].mean())
print("McFadden's Pseudo R-squared shots assisted by passes:", nested_score_pass['test_mcfaddens_r2'].mean())
ROC AUC for shots assisted by passes: 0.7817401551132565
McFadden's Pseudo R-squared shots assisted by passes: 0.15855492654518005
nested_score_other = cross_validate(clf_other, X=X_train_other, y=y_train_other, scoring=scoring, n_jobs=-1)
print('ROC AUC for other model:', nested_score_other['test_roc_auc'].mean())
print("McFadden's Pseudo R-squared for other model:", nested_score_other['test_mcfaddens_r2'].mean())
ROC AUC for other model: 0.8033677417704617
McFadden's Pseudo R-squared for other model: 0.19094339212253655
Add fake training data. I am not adding this to the test data as I want the test set to stay representative of real data.
X_train = pd.concat([X_train, X_fake])
y_train = pd.concat([y_train, y_fake])
Clean the data: convert categories to numbers and drop the distance/angle measures, as I just want the raw locations for this model.
def clean_lightgbm(df):
    df = df.copy()
    # replace categorical columns with integer codes
    shot_type_cat = {'free_kick': 0, 'corner': 1, 'throw_in': 2, 'direct_set_piece': 3, 'open_play': 4}
    body_type_cat = {'Right Foot': 0, 'Left Foot': 1, 'Other': 2}
    assist_type_cat = {'pass': 0, 'recovery': 1, 'clearance': 2, 'direct': 3, 'rebound': 4}
    pass_height_cat = {'High Pass': 0, 'Ground/ Low Pass': 1}
    pass_technique_cat = {'Through Ball': 0, 'Straight': 1, 'Inswinging': 2, 'Outswinging': 3, 'other': 4}
    df['shot_type_name'] = df['shot_type_name'].replace(shot_type_cat)
    df['body_part_name'] = df['body_part_name'].replace(body_type_cat)
    df['assist_type'] = df['assist_type'].replace(assist_type_cat)
    df['pass_height_name'] = df['pass_height_name'].replace(pass_height_cat)
    df['pass_technique_name'] = df['pass_technique_name'].replace(pass_technique_cat)
    # cast boolean-like columns to float (they contain NaNs, so cannot be true booleans)
    for col in ['pass_switch', 'pass_cross', 'pass_cut_back', 'shot_one_on_one',
                'shot_open_goal', 'under_pressure', 'smart_pass']:
        df[col] = df[col].astype(np.float32)
    # drop some distance/angle columns
    drop_cols = ['visible_angle', 'middle_angle', 'distance_to_goal', 'distance_visible_angle',
                 'log_distance_to_goal', 'eventSec', 'period', 'player_id_goalkeeper']
    df.drop(drop_cols, axis=1, inplace=True)
    return df
X_train = clean_lightgbm(X_train)
X_test = clean_lightgbm(X_test)
print(X_train.columns)
Index(['shot_type_name', 'x', 'y', 'counter_attack', 'fast_break', 'strong_foot', 'body_part_name', 'assist_type', 'pass_end_y', 'pass_end_x', 'pass_switch', 'pass_cross', 'pass_cut_back', 'pass_height_name', 'pass_technique_name', 'carry_length', 'shot_one_on_one', 'shot_open_goal', 'under_pressure', 'area_shot', 'area_goal', 'n_angle', 'goalkeeper_x', 'goalkeeper_y', 'smart_pass'], dtype='object')
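An optional dtype check confirms every remaining column is numeric, which is what LightGBM expects (it handles the NaNs natively):
print(X_train.dtypes.value_counts())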
Fit the nested 5-fold cross validation using Bayesian optimisation.
#lgbm = LGBMClassifier(random_state=42)
lgbm = CalibratedClassifierCV(LGBMClassifier(random_state=42), method='isotonic', cv=3)
lgbm_param_grid = {'base_estimator__min_child_samples': Integer(0, 200),
'base_estimator__num_leaves': Integer(2, 500),
'base_estimator__reg_lambda': Real(0, 1),
'base_estimator__reg_alpha': Real(0, 1),
'base_estimator__max_depth': Integer(0, 500)}
# Nested resampling using skopt. see: https://github.com/scikit-optimize/scikit-optimize/issues/725
searchcv = BayesSearchCV(estimator=lgbm,
n_iter=100,
search_spaces=lgbm_param_grid,
cv=5,
n_jobs=-1)
searchcv.fit(X_train, y_train)
BayesSearchCV(cv=5, estimator=CalibratedClassifierCV(base_estimator=LGBMClassifier(random_state=42), cv=3, method='isotonic'), n_iter=100, n_jobs=-1, search_spaces={'base_estimator__max_depth': Integer(low=0, high=500, prior='uniform', transform='identity'), 'base_estimator__min_child_samples': Integer(low=0, high=200, prior='uniform', transform='identity'), 'base_estimator__num_leaves': Integer(low=2, high=500, prior='uniform', transform='identity'), 'base_estimator__reg_alpha': Real(low=0, high=1, prior='uniform', transform='identity'), 'base_estimator__reg_lambda': Real(low=0, high=1, prior='uniform', transform='identity')})
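The winning hyperparameters and the inner cross-validation score can be inspected (a short optional sketch):
print('Best parameters:', searchcv.best_params_)
print('Best inner-CV score:', searchcv.best_score_)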
Permutation importance
# note: X_train still includes the fake shots added earlier, so they contribute to the permutation importance
perm_result = permutation_importance(searchcv.best_estimator_, X_train, y_train, n_repeats=10, random_state=seed)
df_perm_importance = pd.DataFrame({'Feature':X_train.columns,
'importance': perm_result.importances.mean(axis=1),
'std_dev': perm_result.importances.std(axis=1)})
df_perm_importance.sort_values('importance', ascending=False, inplace=True)
df_perm_importance.reset_index(drop=True, inplace=True)
df_perm_importance
| | Feature | importance | std_dev |
|---|---|---|---|
0 | x | 1.435942e-02 | 0.000504 |
1 | y | 1.153148e-02 | 0.000537 |
2 | goalkeeper_x | 1.199962e-03 | 0.000086 |
3 | n_angle | 9.439190e-04 | 0.000156 |
4 | shot_type_name | 8.350053e-04 | 0.000145 |
5 | goalkeeper_y | 7.413777e-04 | 0.000124 |
6 | body_part_name | 5.254610e-04 | 0.000176 |
7 | carry_length | 5.216394e-04 | 0.000117 |
8 | pass_technique_name | 4.681380e-04 | 0.000101 |
9 | shot_open_goal | 4.337441e-04 | 0.000046 |
10 | area_shot | 3.993503e-04 | 0.000115 |
11 | pass_height_name | 3.917073e-04 | 0.000154 |
12 | pass_end_x | 2.770612e-04 | 0.000081 |
13 | assist_type | 2.541320e-04 | 0.000090 |
14 | strong_foot | 2.197382e-04 | 0.000116 |
15 | pass_end_y | 2.159167e-04 | 0.000110 |
16 | area_goal | 1.948983e-04 | 0.000166 |
17 | pass_cross | 1.318429e-04 | 0.000153 |
18 | counter_attack | 9.362759e-05 | 0.000065 |
19 | pass_switch | 5.159071e-05 | 0.000059 |
20 | fast_break | 2.101844e-05 | 0.000039 |
21 | under_pressure | 1.110223e-17 | 0.000052 |
22 | smart_pass | -5.732302e-06 | 0.000028 |
23 | shot_one_on_one | -1.146460e-05 | 0.000018 |
24 | pass_cut_back | -2.101844e-05 | 0.000025 |
fig, ax = plt.subplots(figsize=(16, 9))
sorted_idx = perm_result.importances_mean.argsort()
bar_plot = ax.boxplot(perm_result.importances[sorted_idx].T, vert=False, labels=X_train.columns[sorted_idx])
Calculate calibration curve on test data
y_pred_lgbm_calibrated = searchcv.best_estimator_.predict_proba(X_test)[:, 1]
fraction_of_positives_lgbm, mean_predicted_value_lgbm = calibration_curve(y_test, y_pred_lgbm_calibrated, n_bins=10)
# logistic regression
y_pred_lr_pass = clf_pass.predict_proba(X_test_pass)[:, 1]
y_pred_lr_other = clf_other.predict_proba(X_test_other)[:, 1]
y_pred_lr = np.concatenate([y_pred_lr_pass, y_pred_lr_other])
y_true_test = np.concatenate([y_test_pass, y_test_other])
fraction_of_positives_lr, mean_predicted_value_lr = calibration_curve(y_true_test, y_pred_lr, n_bins=10)
Plot calibration curve on test data
plt.style.use('dark_background')
fig = plt.figure(constrained_layout=True, figsize=(10, 15))
gs = fig.add_gridspec(ncols=1, nrows=2, height_ratios=(2/3, 1/3))
ax1 = fig.add_subplot(gs[0])
ax1.plot(mean_predicted_value_lgbm, fraction_of_positives_lgbm, "-o", color='#aabced', label='Calibrated Light GBM')
ax1.plot(mean_predicted_value_lr, fraction_of_positives_lr, "-o", color='#dbdf4a', label='Logistic regression')
ax1.plot([0, 1], [0, 1], "--", color='#e7aeca', label="Perfectly calibrated")
ax1.set_xlabel('Mean predicted value', fontsize=15)
ax1.set_ylabel('Fraction of positives', fontsize=15)
ax1.set_title('Calibration curve', fontsize=20, pad=10)
ax1.legend(fontsize=15)
ax1.tick_params(labelsize=15)
ax2 = fig.add_subplot(gs[1])
# histograms of the predicted probabilities for the two models
sns.histplot(y_pred_lr, color='#4fe4e4', label='Logistic regression', ax=ax2)
sns.histplot(y_pred_lgbm_calibrated, color='#aabced', label='Calibrated Light GBM', ax=ax2)
ax2.set_xlabel('Predicted value', fontsize=15)
ax2.set_ylabel('Count', fontsize=15)
ax2.tick_params(labelsize=15)
ax2.legend(fontsize=15)
ax2.set_title('Distribution of predictions', fontsize=20, pad=10);
fig.savefig(os.path.join('..', 'figures', '22_calibration_curve.png'), bbox_inches='tight', pad_inches=0.2)
From scikit-learn docs: "The smaller the Brier score, the better, hence the naming with “loss”. Across all items in a set of N predictions, the Brier score measures the mean squared difference between (1) the predicted probability assigned to the possible outcomes for item i, and (2) the actual outcome."
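In symbols, $BS = \frac{1}{N}\sum_{i=1}^{N}(p_i - o_i)^2$, where $p_i$ is the predicted probability for shot $i$ and $o_i \in \{0, 1\}$ is the actual outcome.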
print('Brier score, Light GBM:', brier_score_loss(y_test, y_pred_lgbm_calibrated, pos_label=y_test.max()))
print('ROC AUC, Light GBM:', roc_auc_score(y_test, y_pred_lgbm_calibrated))
print('Pseudo R-squared, Light GBM:', mcfadden_r2(y_test, y_pred_lgbm_calibrated))
Brier score, Light GBM: 0.08044014131802528
ROC AUC, Light GBM: 0.7851386121829785
Pseudo R-squared, Light GBM: 0.16991224608139832
print('Brier score, logistic regression:', brier_score_loss(y_true_test, y_pred_lr, pos_label=y_true_test.max()))
print('ROC AUC, logistic regression:', roc_auc_score(y_true_test, y_pred_lr))
print('Pseudo R-squared, logistic regression:', mcfadden_r2(y_true_test, y_pred_lr))
Brier score, logistic regression: 0.08150355125036453
ROC AUC, logistic regression: 0.786687948249966
Pseudo R-squared, logistic regression: 0.16477712349951212
dump(searchcv.best_estimator_, os.path.join('..', 'models', 'lgbm_model.joblib'))
['..\\models\\lgbm_model.joblib']
dump(clf_pass.best_estimator_, os.path.join('..', 'models', 'lr_pass.joblib'))
['..\\models\\lr_pass.joblib']
dump(clf_other.best_estimator_, os.path.join('..', 'models', 'lr_other.joblib'))
['..\\models\\lr_other.joblib']
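The saved models can be reloaded with joblib's load (imported above); a minimal sketch:
# reload a saved model, e.g. for scoring new shots
lgbm_reloaded = load(os.path.join('..', 'models', 'lgbm_model.joblib'))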
# reload shot dataset for ids
df = pd.read_parquet(os.path.join('..', 'data', 'shots.parquet'))
df = df[['match_id', 'wyscout_id', 'statsbomb_id']].copy()
X_train_other['goal'] = y_train_other
X_train_other['split'] = 'train'
X_test_other['goal'] = y_test_other
X_test_other['split'] = 'test'
df_other = pd.concat([X_train_other, X_test_other])
df_other = df_other.merge(df, left_index=True, right_index=True, validate='1:1', how='left')
df_other.reset_index(drop=True, inplace=True)
df_other.to_parquet(os.path.join('..', 'data', 'modelling', 'lr_other.parquet'))
X_train_pass['goal'] = y_train_pass
X_train_pass['split'] = 'train'
X_test_pass['goal'] = y_test_pass
X_test_pass['split'] = 'test'
df_pass = pd.concat([X_train_pass, X_test_pass])
df_pass = df_pass.merge(df, left_index=True, right_index=True, validate='1:1', how='left')
df_pass.reset_index(drop=True, inplace=True)
df_pass.to_parquet(os.path.join('..', 'data', 'modelling', 'lr_pass.parquet'))
X_train['goal'] = y_train
X_train['split'] = 'train'
X_test['goal'] = y_test
X_test['split'] = 'test'
df_lgbm = pd.concat([X_train, X_test])
# exclude fake shots (their 'a'-prefixed index is not in the real shot dataframe)
df_lgbm = df_lgbm[df_lgbm.index.isin(df.index)].copy()
df_lgbm = df_lgbm.merge(df, how='left', left_index=True, right_index=True, validate='1:1')
df_lgbm.to_parquet(os.path.join('..', 'data', 'modelling', 'lgbm.parquet'))