import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from xgboost import XGBClassifier
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from scikitplot.metrics import plot_calibration_curve
import warnings
warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning)
import os
# Data directory, resolved against the current working directory. The trailing
# slash is kept because file names are appended by plain string concatenation.
data_dir = os.getcwd() + '/data/wy_scout/'

# Read the table stored under the 'games' key of the SPADL HDF5 store.
df_games = pd.read_hdf(data_dir + 'spadl.h5', key='games')

# Preview the last ten rows.
df_games.tail(10)
| | game_id | competition_id | season_id | game_date | home_team_id | away_team_id |
---|---|---|---|---|---|---|
54 | 2057984 | 28 | 10078 | 2018-06-17 15:00:00 | 3148 | 15473 |
55 | 2057979 | 28 | 10078 | 2018-06-17 12:00:00 | 16871 | 17322 |
56 | 2057973 | 28 | 10078 | 2018-06-16 19:00:00 | 9598 | 16823 |
57 | 2057967 | 28 | 10078 | 2018-06-16 16:00:00 | 15594 | 7712 |
58 | 2057972 | 28 | 10078 | 2018-06-16 13:00:00 | 12274 | 7839 |
59 | 2057966 | 28 | 10078 | 2018-06-16 10:00:00 | 4418 | 8493 |
60 | 2057960 | 28 | 10078 | 2018-06-15 18:00:00 | 9905 | 1598 |
61 | 2057961 | 28 | 10078 | 2018-06-15 15:00:00 | 16216 | 10840 |
62 | 2057955 | 28 | 10078 | 2018-06-15 12:00:00 | 16129 | 15670 |
63 | 2057954 | 28 | 10078 | 2018-06-14 15:00:00 | 14358 | 16521 |
# Read one feature table per game from features.h5 (key 'game_<game_id>'),
# tag every row with the game it belongs to, and stack all tables into a
# single DataFrame.
dfs_features = []

# Only the game id is needed, so iterate over that column directly instead of
# building a full row Series per game with iterrows().
for game_id in tqdm(df_games['game_id'], total=len(df_games)):
    # A distinct name keeps the per-game frame from shadowing the final,
    # concatenated df_features below.
    df_game_features = pd.read_hdf(data_dir + 'features.h5', key=f'game_{game_id}')
    df_game_features['game_id'] = game_id
    dfs_features.append(df_game_features)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
df_features = pd.concat(dfs_features, ignore_index=True)

# Preview the last ten rows.
df_features.tail(10)
HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))
| | type_pass_a0 | type_cross_a0 | type_throw_in_a0 | type_freekick_crossed_a0 | type_freekick_short_a0 | type_corner_crossed_a0 | type_corner_short_a0 | type_take_on_a0 | type_foul_a0 | type_tackle_a0 | ... | end_angle_to_goal_a0 | end_dist_to_goal_a1 | end_angle_to_goal_a1 | end_dist_to_goal_a2 | end_angle_to_goal_a2 | team_1 | team_2 | time_delta_1 | time_delta_2 | game_id |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
80969 | False | False | False | False | False | False | False | False | False | True | ... | 0.088083 | 46.801607 | 0.160512 | 38.019553 | 0.107520 | True | True | 3.724954 | 5.269374 | 2057954 |
80970 | True | False | False | False | False | False | False | False | False | False | ... | 0.313155 | 46.379806 | 0.088083 | 46.801607 | 0.160512 | True | True | 1.175848 | 4.900802 | 2057954 |
80971 | True | False | False | False | False | False | False | False | False | False | ... | 0.163867 | 37.524973 | 0.313155 | 46.379806 | 0.088083 | True | True | 1.412850 | 2.588698 | 2057954 |
80972 | False | False | False | False | False | False | False | False | False | False | ... | 0.284537 | 50.020077 | 0.163867 | 37.524973 | 0.313155 | True | True | 1.416485 | 2.829335 | 2057954 |
80973 | True | False | False | False | False | False | False | False | False | False | ... | 0.485553 | 33.913609 | 0.284537 | 50.020077 | 0.163867 | True | True | 1.416484 | 2.832969 | 2057954 |
80974 | True | False | False | False | False | False | False | False | False | False | ... | 0.532102 | 32.055017 | 0.485553 | 33.913609 | 0.284537 | True | True | 2.531683 | 3.948167 | 2057954 |
80975 | False | False | False | False | False | False | False | False | False | True | ... | 0.108906 | 83.021503 | 0.164555 | 78.096249 | 0.192750 | False | False | 2.483783 | 5.015466 | 2057954 |
80976 | False | False | False | False | False | False | False | False | True | False | ... | 0.108906 | 81.331839 | 0.108906 | 83.021503 | 0.164555 | True | False | 1.862786 | 4.346569 | 2057954 |
80977 | False | False | False | False | False | False | False | False | False | False | ... | 1.570796 | 25.717078 | 0.350897 | 25.717078 | 0.350897 | False | False | 66.594979 | 68.457765 | 2057954 |
80978 | True | False | False | False | False | False | False | False | False | False | ... | 1.570796 | 105.055033 | 0.032370 | 81.331839 | 0.108906 | False | True | 43.943937 | 110.538916 | 2057954 |
10 rows × 143 columns
# Empty accumulator; presumably meant to collect per-game label tables, but
# nothing in this cell appends to it.
dfs_labels = []
# NOTE(review): the loop below is disabled by wrapping it in a string literal,
# so it never runs, and its pd.read_hdf() call has no arguments. It looks like
# an unfinished draft of the label-loading step.
# TODO: confirm the intended file and key before enabling it.
"""
for _, game in tqdm(df_games.iterrows(), total=len(df_games)):
game_id = game['game_id']
df_labels = pd.read_hdf()
"""
"\nfor _, game in tqdm(df_games.iterrows(), total=len(df_games)):\n game_id = game['game_id']\n df_labels = pd.read_hdf()\n"