#!/usr/bin/env python # coding: utf-8 # In[1]: # Python ≥3.5 (ideally) import platform import sys, getopt assert sys.version_info >= (3, 5) import csv # Import Dependencies get_ipython().run_line_magic('matplotlib', 'inline') # Math Operations import numpy as np from math import pi # Datetime import datetime from datetime import date import time # Data Preprocessing import pandas as pd # version 1.0.3 import os # used to read the csv filenames import re import random from io import BytesIO from pathlib import Path # Reading directories import glob import os # Working with JSON import json from pandas.io.json import json_normalize from ast import literal_eval # Data Visualisation import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns plt.style.use('seaborn-whitegrid') import missingno as msno # visually display missing data # Display in Jupyter from IPython.display import Image, YouTubeVideo from IPython.core.display import HTML # Ignore Warnings import warnings warnings.filterwarnings(action="ignore", message="^internal gelsd") print('Setup Complete') # In[2]: # Set up initial paths to subfolders base_dir = os.path.join('..', '..', ) data_dir = os.path.join(base_dir, 'data') data_dir_fbref = os.path.join(base_dir, 'data', 'fbref') data_dir_tm = os.path.join(base_dir, 'data', 'tm') data_dir_wyscout = os.path.join(base_dir, 'data', 'wyscout') img_dir = os.path.join(base_dir, 'img') fig_dir = os.path.join(base_dir, 'img', 'fig') video_dir = os.path.join(base_dir, 'video') # In[3]: # Show files in directory print(glob.glob(data_dir_wyscout + '/engineered/combined/*.csv')) # In[4]: df_wyscout = pd.read_csv(data_dir_wyscout + '/engineered/combined/wyscout_events_big5_1718.csv') # In[5]: pd.set_option('display.max_columns', None) # In[6]: # Display the first 5 rows of the raw DataFrame, df_wyscout df_wyscout.head() # In[7]: # Print the shape of the raw DataFrame, ddf_wyscout_raw print(df_wyscout.shape) # In[8]: # Display all column names list(df_wyscout.columns) # In[9]: # Data types of the features of the raw DataFrame, df_raw with pd.option_context('display.max_rows', None, 'display.max_columns', None): print(df_wyscout.dtypes) # ### 4.8. Aggregate Data # #### 4.8.1. Fixture Level # In[ ]: # In[10]: # Select columns of interest ## Define columns cols = ['date_time_timestamp', 'league_name', 'fixture', 'team_home', 'team_away', 'teamName', 'goals_home', 'goals_away', 'eventName', 'subEventName' ] ## df_wyscout_select = df_wyscout[cols] ## df_wyscout_select['Opponent'] = np.where(df_wyscout_select['team_home'] == df_wyscout_select['teamName'], df_wyscout_select['team_away'], df_wyscout_select['team_home']) # In[16]: # ## df_wyscout_fixture_grouped = (df_wyscout_select #.groupby(['full_fixture_date']) .groupby(['date_time_timestamp', 'league_name', 'fixture', 'teamName', 'Opponent', 'goals_home', 'goals_away', 'eventName', 'subEventName']) .agg({'eventName': ['count']}, {'subEventName': ['count']} ) ) ## df_wyscout_fixture_grouped.columns = df_wyscout_fixture_grouped.columns.droplevel(level=0) ## df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.reset_index() ## df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.rename(columns={'date_time_timestamp': 'Date', 'league_name': 'League_Name', 'fixture': 'Fixture', 'teamName': 'Team', 'Opponent': 'Opponent', 'goals_home': 'Goals_Home', 'goals_away': 'Goals_Away', 'eventName': 'Event', 'subEventName': 'SubEvent', 'count': 'Team_Value' } ) ## df_wyscout_fixture_grouped.head() # In[17]: # List unique values in the df_wyscout_fixture_grouped['Event_Name'] column df_wyscout_fixture_grouped.Event.unique() # In[18]: # List unique values in the df_wyscout_fixture_grouped['SubEvent_Name'] column df_wyscout_fixture_grouped.SubEvent.unique() # In[36]: # Select columns of interest ## Define columns cols = ['Date', 'League_Name', 'Fixture', 'Team', 'Opponent', 'Event', 'SubEvent_Name', 'Team_Value' ] ## df_wyscout_fixture_grouped_select = df_wyscout_fixture_grouped[cols] # In[37]: # Join DataFrame to itself on 'Date', 'Fixture', 'Team'/'Opponent', and 'Event', to join Team and Opponent together df_wyscout_fixture_grouped = pd.merge(df_wyscout_fixture_grouped, df_wyscout_fixture_grouped, how='left', left_on=['Date', 'League_Name', 'Fixture', 'Opponent', 'Event'], right_on = ['Date', 'League_Name', 'Fixture', 'Team', 'Event']) # In[38]: df_wyscout_fixture_grouped # In[39]: # Clean Data ## Drop columns df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.drop(columns=['Team_y', 'Opponent_y', 'Goals_Home_y', 'Goals_Away_y']) ## Rename columns df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.rename(columns={'Team_x': 'Team', 'Opponent_x': 'Opponent', 'Goals_Home_x': 'Goals_Home', 'Goals_Away_x': 'Goals_Away', 'Team_Value_x': 'Team_Value', 'Team_Value_y': 'Opponent_Value', } ) ## Replace null values with zeros df_wyscout_fixture_grouped['Team_Value'] = df_wyscout_fixture_grouped['Team_Value'].replace(np.nan, 0) df_wyscout_fixture_grouped['Opponent_Value'] = df_wyscout_fixture_grouped['Opponent_Value'].replace(np.nan, 0) ## Convert Opponent_Value' from Float64 to Int64 type df_wyscout_fixture_grouped['Opponent_Value'] = df_wyscout_fixture_grouped['Opponent_Value'].astype('Int64') ## Display DataFrame df_wyscout_fixture_grouped.head() # In[40]: # ## df_fixture_gw = (df_wyscout_fixture_grouped .groupby(['Date', 'Team']) .agg({'Team': ['nunique']}) ) ## df_fixture_gw.columns = df_fixture_gw.columns.droplevel(level=0) ## df_fixture_gw = df_fixture_gw.reset_index() ## df_fixture_gw = df_fixture_gw.rename(columns={'Date': 'Date', 'nunique': 'Gameweek', } ) ## Groupby. See: https://stackoverflow.com/questions/18554920/pandas-aggregate-count-distinct df_fixture_gw = (df_fixture_gw.groupby(['Team', 'Date']).sum() .groupby(level=0).cumsum().reset_index() ) ## Display DataFrame df_fixture_gw.head() # In[41]: # Join DataFrame df_wyscout_fixture_grouped = pd.merge(df_wyscout_fixture_grouped, df_fixture_gw, how='left', left_on=['Date', 'Team'], right_on = ['Date', 'Team']) # Display DataFrame df_wyscout_fixture_grouped.head(50) # In[42]: ## Display DataFrame df_wyscout_fixture_grouped.tail(50) # In[43]: df_wyscout_fixture_grouped.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_aggregated_fixtures_big5_1718.csv', index=None, header=True) # #### 4.8.2. Team Level # In[44]: # Group DataFrame by Team ## df_wyscout_team_grouped = (df_wyscout_fixture_grouped .groupby(['League_Name', 'Team', 'Event']) .agg({'Team_Value': ['sum'], 'Opponent_Value': ['sum'] } ) ) ## df_wyscout_team_grouped.columns = df_wyscout_team_grouped.columns.droplevel(level=0) ## df_wyscout_team_grouped = df_wyscout_team_grouped.reset_index() ## Rename columns df_wyscout_team_grouped.columns = ['League_Name', 'Team', 'Event', 'Team_Value', 'Opponent_Value'] ## Display columns df_wyscout_team_grouped.head() # In[45]: df_wyscout_team_grouped.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_aggregated_team_big5_1718.csv', index=None, header=True)