#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Python ≥3.5 (ideally)
import platform
import sys, getopt
assert sys.version_info >= (3, 5)
import csv
# Import Dependencies
get_ipython().run_line_magic('matplotlib', 'inline')
# Math Operations
import numpy as np
from math import pi
# Datetime
import datetime
from datetime import date
import time
# Data Preprocessing
import pandas as pd # version 1.0.3
import os # used to read the csv filenames
import re
import random
from io import BytesIO
from pathlib import Path
# Reading directories
import glob
import os
# Working with JSON
import json
from pandas.io.json import json_normalize
from ast import literal_eval
# Data Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
import missingno as msno # visually display missing data
# Display in Jupyter
from IPython.display import Image, YouTubeVideo
from IPython.core.display import HTML
# Ignore Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
print('Setup Complete')
# In[2]:
# Set up initial paths to subfolders
base_dir = os.path.join('..', '..', )
data_dir = os.path.join(base_dir, 'data')
data_dir_fbref = os.path.join(base_dir, 'data', 'fbref')
data_dir_tm = os.path.join(base_dir, 'data', 'tm')
data_dir_wyscout = os.path.join(base_dir, 'data', 'wyscout')
img_dir = os.path.join(base_dir, 'img')
fig_dir = os.path.join(base_dir, 'img', 'fig')
video_dir = os.path.join(base_dir, 'video')
# In[3]:
# Show files in directory
print(glob.glob(data_dir_wyscout + '/engineered/combined/*.csv'))
# In[4]:
df_wyscout = pd.read_csv(data_dir_wyscout + '/engineered/combined/wyscout_events_big5_1718.csv')
# In[5]:
pd.set_option('display.max_columns', None)
# In[6]:
# Display the first 5 rows of the raw DataFrame, df_wyscout
df_wyscout.head()
# In[7]:
# Print the shape of the raw DataFrame, ddf_wyscout_raw
print(df_wyscout.shape)
# In[8]:
# Display all column names
list(df_wyscout.columns)
# In[9]:
# Data types of the features of the raw DataFrame, df_raw
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df_wyscout.dtypes)
# ### 4.8. Aggregate Data
# #### 4.8.1. Fixture Level
# In[ ]:
# In[10]:
# Select columns of interest
## Define columns
cols = ['date_time_timestamp',
'league_name',
'fixture',
'team_home',
'team_away',
'teamName',
'goals_home',
'goals_away',
'eventName',
'subEventName'
]
##
df_wyscout_select = df_wyscout[cols]
##
df_wyscout_select['Opponent'] = np.where(df_wyscout_select['team_home'] == df_wyscout_select['teamName'], df_wyscout_select['team_away'], df_wyscout_select['team_home'])
# In[16]:
#
##
df_wyscout_fixture_grouped = (df_wyscout_select
#.groupby(['full_fixture_date'])
.groupby(['date_time_timestamp',
'league_name',
'fixture',
'teamName',
'Opponent',
'goals_home',
'goals_away',
'eventName',
'subEventName'])
.agg({'eventName': ['count']},
{'subEventName': ['count']}
)
)
##
df_wyscout_fixture_grouped.columns = df_wyscout_fixture_grouped.columns.droplevel(level=0)
##
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.reset_index()
##
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.rename(columns={'date_time_timestamp': 'Date',
'league_name': 'League_Name',
'fixture': 'Fixture',
'teamName': 'Team',
'Opponent': 'Opponent',
'goals_home': 'Goals_Home',
'goals_away': 'Goals_Away',
'eventName': 'Event',
'subEventName': 'SubEvent',
'count': 'Team_Value'
}
)
##
df_wyscout_fixture_grouped.head()
# In[17]:
# List unique values in the df_wyscout_fixture_grouped['Event_Name'] column
df_wyscout_fixture_grouped.Event.unique()
# In[18]:
# List unique values in the df_wyscout_fixture_grouped['SubEvent_Name'] column
df_wyscout_fixture_grouped.SubEvent.unique()
# In[36]:
# Select columns of interest
## Define columns
cols = ['Date',
'League_Name',
'Fixture',
'Team',
'Opponent',
'Event',
'SubEvent_Name',
'Team_Value'
]
##
df_wyscout_fixture_grouped_select = df_wyscout_fixture_grouped[cols]
# In[37]:
# Join DataFrame to itself on 'Date', 'Fixture', 'Team'/'Opponent', and 'Event', to join Team and Opponent together
df_wyscout_fixture_grouped = pd.merge(df_wyscout_fixture_grouped, df_wyscout_fixture_grouped, how='left', left_on=['Date', 'League_Name', 'Fixture', 'Opponent', 'Event'], right_on = ['Date', 'League_Name', 'Fixture', 'Team', 'Event'])
# In[38]:
df_wyscout_fixture_grouped
# In[39]:
# Clean Data
## Drop columns
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.drop(columns=['Team_y', 'Opponent_y', 'Goals_Home_y', 'Goals_Away_y'])
## Rename columns
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.rename(columns={'Team_x': 'Team',
'Opponent_x': 'Opponent',
'Goals_Home_x': 'Goals_Home',
'Goals_Away_x': 'Goals_Away',
'Team_Value_x': 'Team_Value',
'Team_Value_y': 'Opponent_Value',
}
)
## Replace null values with zeros
df_wyscout_fixture_grouped['Team_Value'] = df_wyscout_fixture_grouped['Team_Value'].replace(np.nan, 0)
df_wyscout_fixture_grouped['Opponent_Value'] = df_wyscout_fixture_grouped['Opponent_Value'].replace(np.nan, 0)
## Convert Opponent_Value' from Float64 to Int64 type
df_wyscout_fixture_grouped['Opponent_Value'] = df_wyscout_fixture_grouped['Opponent_Value'].astype('Int64')
## Display DataFrame
df_wyscout_fixture_grouped.head()
# In[40]:
#
##
df_fixture_gw = (df_wyscout_fixture_grouped
.groupby(['Date', 'Team'])
.agg({'Team': ['nunique']})
)
##
df_fixture_gw.columns = df_fixture_gw.columns.droplevel(level=0)
##
df_fixture_gw = df_fixture_gw.reset_index()
##
df_fixture_gw = df_fixture_gw.rename(columns={'Date': 'Date',
'nunique': 'Gameweek',
}
)
## Groupby. See: https://stackoverflow.com/questions/18554920/pandas-aggregate-count-distinct
df_fixture_gw = (df_fixture_gw.groupby(['Team', 'Date']).sum()
.groupby(level=0).cumsum().reset_index()
)
## Display DataFrame
df_fixture_gw.head()
# In[41]:
# Join DataFrame
df_wyscout_fixture_grouped = pd.merge(df_wyscout_fixture_grouped, df_fixture_gw, how='left', left_on=['Date', 'Team'], right_on = ['Date', 'Team'])
# Display DataFrame
df_wyscout_fixture_grouped.head(50)
# In[42]:
## Display DataFrame
df_wyscout_fixture_grouped.tail(50)
# In[43]:
df_wyscout_fixture_grouped.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_aggregated_fixtures_big5_1718.csv', index=None, header=True)
# #### 4.8.2. Team Level
# In[44]:
# Group DataFrame by Team
##
df_wyscout_team_grouped = (df_wyscout_fixture_grouped
.groupby(['League_Name', 'Team', 'Event'])
.agg({'Team_Value': ['sum'],
'Opponent_Value': ['sum']
}
)
)
##
df_wyscout_team_grouped.columns = df_wyscout_team_grouped.columns.droplevel(level=0)
##
df_wyscout_team_grouped = df_wyscout_team_grouped.reset_index()
## Rename columns
df_wyscout_team_grouped.columns = ['League_Name', 'Team', 'Event', 'Team_Value', 'Opponent_Value']
## Display columns
df_wyscout_team_grouped.head()
# In[45]:
df_wyscout_team_grouped.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_aggregated_team_big5_1718.csv', index=None, header=True)