#!/usr/bin/env python
# coding: utf-8
#
# # Wyscout Data Engineering
# ##### Notebook to engineer previously parsed event data from [Wyscout](https://wyscout.com/).
#
# ### By [Edd Webster](https://www.twitter.com/eddwebster)
# Notebook first written: 26/01/2021
# Notebook last updated: 12/02/2021
#
# ![title](../../img/wyscout_logo.png)
# ---
#
# ## Introduction
# This notebook engineers previously parsed, publicly available [Wyscout](https://wyscout.com/) football match event data for the Big 5 European leagues for the 17/18 season, using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames.
#
# For more information about this notebook and the author, I'm available through all the following channels:
# * [eddwebster.com](https://www.eddwebster.com/);
# * edd.j.webster@gmail.com;
# * [@eddwebster](https://www.twitter.com/eddwebster);
# * [linkedin.com/in/eddwebster](https://www.linkedin.com/in/eddwebster/);
# * [github/eddwebster](https://github.com/eddwebster/);
# * [public.tableau.com/profile/edd.webster](https://public.tableau.com/profile/edd.webster);
# * [kaggle.com/eddwebster](https://www.kaggle.com/eddwebster); and
# * [hackerrank.com/eddwebster](https://www.hackerrank.com/eddwebster).
#
# ![title](../../img/edd_webster/fifa21eddwebsterbanner.png)
#
# The accompanying GitHub repository for this notebook can be found [here](https://github.com/eddwebster/football_analytics) and a static version of this notebook can be found [here](https://nbviewer.jupyter.org/github/eddwebster/football_analytics/blob/master/notebooks/3_data_engineering/Wyscout%20Data%20Engineering.ipynb).
# ___
#
# ## Notebook Contents
# 1. [Notebook Dependencies](#section1)
# 2. [Project Brief](#section2)
# 3. [Data Sources](#section3)
#    1. [Introduction](#section3.1)
#    2. [Read in Data](#section3.2)
#    3. [Initial Data Handling](#section3.3)
# 4. [Data Engineering](#section4)
# 5. [Export Data](#section5)
# 6. [Summary](#section6)
# 7. [Next Steps](#section7)
# 8. [References](#section8)
# ---
#
# ## 1. Notebook Dependencies
# This notebook was written using [Python 3](https://docs.python.org/3.7/) and requires the following libraries:
# * [`Jupyter notebooks`](https://jupyter.org/) for the notebook environment in which this project is presented;
# * [`NumPy`](http://www.numpy.org/) for multidimensional array computing;
# * [`pandas`](http://pandas.pydata.org/) for data analysis and manipulation; and
# * [`matplotlib`](https://matplotlib.org/), [`seaborn`](https://seaborn.pydata.org/) and [`missingno`](https://github.com/ResidentMario/missingno) for data visualisation.
#
# All packages used for this notebook can be obtained by downloading and installing the [Conda](https://anaconda.org/anaconda/conda) distribution, available on all platforms (Windows, Linux and Mac OSX). Step-by-step guides on how to install Anaconda can be found for Windows [here](https://medium.com/@GalarnykMichael/install-python-on-windows-anaconda-c63c7c3d1444) and Mac [here](https://medium.com/@GalarnykMichael/install-python-on-mac-anaconda-ccd9f2014072), as well as in the Anaconda documentation itself [here](https://docs.anaconda.com/anaconda/install/).
# ### Import Libraries and Modules
# In[1]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# Python ≥3.5 (ideally)
import platform
import sys, getopt
assert sys.version_info >= (3, 5)
import csv
# Import Dependencies
get_ipython().run_line_magic('matplotlib', 'inline')
# Math Operations
import numpy as np
import math
from math import pi
# Datetime
import datetime
from datetime import date
import time
# Data Preprocessing
import pandas as pd
import os
import re
import random
from io import BytesIO
from pathlib import Path
# Reading directories
import glob
from os.path import basename
# Working with JSON
import json
from pandas import json_normalize  # pandas >= 1.0; previously pandas.io.json.json_normalize
# Data Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import patches
from matplotlib.patches import Arc
import seaborn as sns
plt.style.use('seaborn-whitegrid')
import missingno as msno
# Downloading data sources
from urllib.parse import urlparse
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile, is_zipfile
from tqdm import tqdm # Progress Bar
# Display in Jupyter
from IPython.display import Image, Video, YouTubeVideo
from IPython.core.display import HTML
# Ignore Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
print('Setup Complete')
# In[2]:
# Python / module versions used here for reference
print('Python: {}'.format(platform.python_version()))
print('NumPy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
print('matplotlib: {}'.format(mpl.__version__))
print('Seaborn: {}'.format(sns.__version__))
# ### Defined Variables
# In[3]:
# Define today's date as a ddmmyyyy string
today = datetime.datetime.now().strftime('%d%m%Y')
# ### Defined Filepaths
# In[4]:
# Set up initial paths to subfolders
base_dir = os.path.join('..', '..',)
data_dir = os.path.join(base_dir, 'data')
data_dir_wyscout = os.path.join(base_dir, 'data', 'wyscout')
scripts_dir = os.path.join(base_dir, 'scripts')
scripts_dir_wyscout = os.path.join(base_dir, 'scripts', 'wyscout')
img_dir = os.path.join(base_dir, 'img')
fig_dir = os.path.join(base_dir, 'img', 'fig')
fig_dir_wyscout = os.path.join(base_dir, 'img', 'fig', 'wyscout')
video_dir = os.path.join(base_dir, 'video')
# ### Notebook Settings
# In[5]:
# Display all DataFrame columns
pd.set_option('display.max_columns', None)
# ---
#
# ## 2. Project Brief
# This Jupyter notebook explores how to engineer previously parsed, publicly available [Wyscout](https://wyscout.com/) JSON data of football matches for the Big 5 European leagues for the 17/18 season, using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames.
#
# The resulting engineered DataFrames are exported to CSV files. This data can be further analysed in Python, joined to other datasets, or explored using Tableau, Power BI, or Microsoft Excel.
#
# **Notebook Conventions**:
# * Variables that refer to a `DataFrame` object are prefixed with `df_`.
# * Variables that refer to a collection of `DataFrame` objects (e.g., a list, a set or a dict) are prefixed with `dfs_`.
#
# **References**:
# This notebook uses Wyscout data, made publicly available in the following paper by Luca Pappalardo, Paolo Cintia, Alessio Rossi, Emanuele Massucco, Paolo Ferragina, Dino Pedreschi, and Fosca Giannotti. **[A Public Data Set of Spatio-Temporal Match Events in Soccer Competitions](https://www.nature.com/articles/s41597-019-0247-7)**. In *Scientific Data 6*, no. 1 (2019): 1-15.
# ---
#
# ## 3. Data Sources
# ### 3.1. Introduction
# #### 3.1.1. About Wyscout
# [Wyscout](https://wyscout.com/) is an Italian company that supports football scouting, match analysis and transfer dynamics. The company was founded in Genoa, Italy in 2004 and provides video analysis tools and digital databases regarding performances and matches for coaches, teams and players dealing with football business.
#
# ![title](../../img/wyscout_logo.png)
#
# The purpose is to give them a detailed view of a large number of athletes: individual performances, patterns of play and tactical strategy.
#
# This notebook explores a complete dataset of event data for the Big 5 European leagues during the 17/18 season.
# #### 3.1.2. About the Wyscout publicly available data
# A detailed description of the data can be found in the following paper:
# - Pappalardo, L., Cintia, P., Rossi, A. et al. **A public data set of spatio-temporal match events in soccer competitions**. Scientific Data 6, 236 (2019) doi:10.1038/s41597-019-0247-7, https://www.nature.com/articles/s41597-019-0247-7
#
# From the figshare repository accompanying this paper, we import the *matches*, *events*, *players*, *playerank*, *referees*, *coaches*, and *competitions* data sets. Data are stored in the `JSON` format.
#
# The complete data set contains:
# - 1,941 matches
# - 3,251,294 events
# - 4,299 players.
#
# The datasets we will be using are:
# - competitions;
# - events;
# - matches;
# - players; and
# - teams.
#
# The data needs to be imported as a DataFrame in the Data Sources section [Section 3](#section3) and cleaned in the Data Engineering section [Section 4](#section4).
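# As a rough sketch of how a raw `JSON` event record flattens into a DataFrame, the following applies `pd.json_normalize` to a single hypothetical record. The field names follow the events data dictionary in Section 3.3.1; all values are invented for illustration.

```python
import pandas as pd

# A hypothetical record shaped like a Wyscout event; values are invented
sample_events = [
    {"eventId": 8, "eventName": "Pass", "subEventName": "Simple pass",
     "subEventId": 85, "playerId": 25413, "teamId": 1609, "matchId": 2499719,
     "matchPeriod": "1H", "eventSec": 2.76, "id": 177959171,
     "tags": [{"id": 1801}],
     "positions": [{"x": 49, "y": 49}, {"x": 31, "y": 78}]},
]

# json_normalize flattens the top-level fields into columns; nested lists
# such as 'tags' and 'positions' remain as Python objects in their cells
df_events = pd.json_normalize(sample_events)
print(df_events[["eventName", "subEventName", "eventSec"]])
```

# Note that `pd.json_normalize` leaves list-valued fields such as `tags` untouched; flattening those requires a further pass (e.g. `explode` or the `record_path` argument).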
# ### 3.2. Read in Data
# The following cells read in the CSV data prepared in the [Data Parsing](https://nbviewer.jupyter.org/github/eddwebster/football_analytics/blob/master/notebooks/2_data_parsing/Wyscout%20Parsing.ipynb) notebook.
# #### 3.3.1. Data Dictionary
#
# The [Wyscout](https://wyscout.com/) Events dataset has twelve features (columns) with the following definitions and data types:
#
# | Feature | Data type |
# |------|-----|
# | `eventId` | int64 |
# | `subEventName` | object |
# | `tags` | object |
# | `playerId` | int64 |
# | `positions` | object |
# | `matchId` | int64 |
# | `eventName` | object |
# | `teamId` | int64 |
# | `matchPeriod` | object |
# | `eventSec` | float64 |
# | `subEventId` | object |
# | `id` | int64 |
#
# Refer to the Wyscout [API docs](https://apidocs.wyscout.com/) and [Events Manual](https://footballdata.wyscout.com/wp-content/uploads/2018/03/Wyscout-Events-Manual.pdf) for further information about event and subevents.
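# As a quick illustration of the event/sub-event hierarchy described in the manual, sub-events can be tallied within each event type using `groupby`. The frame below is a toy stand-in for the real events data (labels illustrative only):

```python
import pandas as pd

# Toy stand-in for the events data; only the two name columns are populated
df_toy = pd.DataFrame({
    "eventName": ["Pass", "Pass", "Pass", "Duel", "Shot"],
    "subEventName": ["Simple pass", "Simple pass", "High pass",
                     "Ground attacking duel", "Shot"],
})

# Count occurrences of each (event, sub-event) pair
event_counts = df_toy.groupby(["eventName", "subEventName"]).size()
print(event_counts)
```

# Run on the full events DataFrame, the same two lines give a complete inventory of sub-events per event type.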
# #### 3.3.2. Read in CSV as pandas DataFrame
# In[6]:
# Read in preparsed Wyscout Events DataFrame
df_wyscout_raw = pd.read_csv(os.path.join(data_dir_wyscout, 'raw', 'csv', 'combined', 'wyscout_big5_combined.csv'))
# ### 3.3. Initial Data Handling
# Let's assess the quality of the dataset by looking at the first and last rows in pandas using the [head()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [tail()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods.
# In[7]:
# Display the first 5 rows of the raw DataFrame, df_wyscout_raw
df_wyscout_raw.head()
# In[8]:
# Display the last 5 rows of the raw DataFrame, df_wyscout_raw
df_wyscout_raw.tail()
# In[9]:
# Print the shape of the raw DataFrame, df_wyscout_raw
print(df_wyscout_raw.shape)
# In[10]:
# Print the column names of the raw DataFrame, df_wyscout_raw
print(df_wyscout_raw.columns)
# The joined dataset has forty features (columns). Full details of these attributes can be found in the [Data Dictionary](#section3.3.1).
# In[11]:
# Data types of the features of the raw DataFrame, df_wyscout_raw
df_wyscout_raw.dtypes
# Full details of these attributes and their data types can be found in the [Data Dictionary](#section3.3.1).
# In[12]:
# Info for the raw DataFrame, df_wyscout_raw
df_wyscout_raw.info()
# In[13]:
# Description of the raw DataFrame, df_wyscout_raw, showing some summary statistics for each numerical column in the DataFrame
df_wyscout_raw.describe()
# In[14]:
# Plot visualisation of the missing values for each feature of the raw DataFrame, df_wyscout_raw
msno.matrix(df_wyscout_raw, figsize = (30, 7))
# In[15]:
# Counts of missing values
null_value_stats = df_wyscout_raw.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]
# The visualisation and counts above show which features, if any, contain missing values.
# ---
#
# ## 4. Data Engineering
# Before any Feature Engineering or Data Visualisation, we first need to clean and wrangle the datasets to a form that meet our needs.
# ### 4.1. Assign Raw DataFrame to Engineered DataFrame
# In[13]:
# Assign the raw DataFrame to the engineered DataFrame (copy to avoid mutating the raw data)
df_wyscout = df_wyscout_raw.copy()
# ### 4.2. String Cleaning
# ##### Split `label` column into separate `fixture` and `score` columns
# In[14]:
# Break down 'Label' column into constituent parts - Fixtures, Score, Date, Home Goals, Away Goals, etc.
df_wyscout['fixture'] = df_wyscout['label'].str.split(', ').str[0]
df_wyscout['score_home_away'] = df_wyscout['label'].str.split(', ').str[1]
df_wyscout['team_home'] = df_wyscout['fixture'].str.split(' - ').str[0]
df_wyscout['team_away'] = df_wyscout['fixture'].str.split(' - ').str[1]
df_wyscout['goals_home'] = df_wyscout['score_home_away'].str.split(' - ').str[0]
df_wyscout['goals_away'] = df_wyscout['score_home_away'].str.split(' - ').str[1]
# ##### Split `date` column into separate `date_isolated` and `time_isolated` columns
# In[15]:
df_wyscout['date_isolated'] = df_wyscout['date'].str.split(' at ').str[0]
df_wyscout['time_isolated'] = df_wyscout['date'].str.split(' at ').str[1]
df_wyscout['date_time_isolated'] = df_wyscout['date'].str.split(' GMT').str[0].str.replace(' at ', ' ', regex=False)
# ### 4.3. Rename Columns
# In[16]:
df_wyscout = df_wyscout.rename(columns={'name': 'teamName'})
# ### 4.4. Convert Data Types
# In[17]:
df_wyscout['date_time_timestamp'] = pd.to_datetime(df_wyscout['dateutc'])
df_wyscout['date_date'] = df_wyscout['date_time_timestamp'].dt.strftime('%d-%m-%Y')
df_wyscout['time_time'] = df_wyscout['date_time_timestamp'].dt.time
# In[19]:
df_wyscout['full_fixture_date'] = df_wyscout['date_date'].astype(str) + ' ' + df_wyscout['team_home'].astype(str) + ' ' + df_wyscout['goals_home'].astype(str) + ' v ' + df_wyscout['goals_away'].astype(str) + ' ' + df_wyscout['team_away'].astype(str)
# ### 4.5. Reorder DataFrame
# Order all rows in the DataFrame by date, time, competition, fixture, half, and time in the match. This is important when looking at an event together with the event that follows it, e.g. is possession retained? Which player receives the pass?
# In[20]:
df_wyscout = df_wyscout.sort_values(['date_date', 'time_time', 'competitionId', 'full_fixture_date', 'matchPeriod', 'eventSec'], ascending=[True, True, True, True, True, True])
# ### 4.6. Create Features
# ##### Create `competition_name` column derived from the `competitionId` column
# In[21]:
#
## Define dictionary of competition names per competition ID
dict_competition_name_wyscout = {28: 'FIFA World Cup',
                                 102: 'UEFA EURO 2016',
                                 364: 'Premier League',
                                 412: 'Ligue 1',
                                 426: 'Bundesliga',
                                 524: 'Serie A',
                                 795: 'La Liga'
                                }
## Apply dictionary of competition names per competition ID
df_wyscout['competition_name'] = df_wyscout['competitionId'].map(dict_competition_name_wyscout)
# ##### Create `season` attribute
# In[22]:
#
## Define dictionary of seasons per competition ID
dict_season_wyscout = {28: '2018',
102: '2016',
364: '17/18',
412: '17/18',
426: '17/18',
524: '17/18',
795: '17/18'
}
## Apply dictionary of seasons per competition ID
df_wyscout['season'] = df_wyscout['competitionId'].map(dict_season_wyscout)
# ##### Create `fullName` attribute
# Join together the `firstName` and `lastName` attribute to create a `fullName` attribute.
# In[23]:
df_wyscout['fullName'] = df_wyscout['firstName'].astype(str) + ' ' + df_wyscout['lastName'].astype(str)
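# One caveat with `astype(str)`: a missing name becomes the literal string 'nan', so a player with no recorded `lastName` would come out as e.g. 'Neymar nan'. A sketch of a safer concatenation, using invented names:

```python
import pandas as pd

# Toy names frame; the missing lastName mimics a mononymous player
df_names = pd.DataFrame({"firstName": ["Harry", "Neymar"],
                         "lastName": ["Kane", None]})

# Fill missing parts with empty strings before joining, then strip
df_names["fullName"] = (df_names["firstName"].fillna("") + " "
                        + df_names["lastName"].fillna("")).str.strip()
print(df_names["fullName"].tolist())
```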
# ##### Create `previous_event` attribute
# In[24]:
df_wyscout['previous_event'] = df_wyscout['subEventName'].shift(1)
# ##### Create following-event columns
# `teamIdNext`, `teamNameNext`, and `fullNameNext` hold the corresponding values of the following event.
# In[25]:
df_wyscout['teamIdNext'] = df_wyscout['teamId'].shift(-1)
df_wyscout['teamNameNext'] = df_wyscout['teamName'].shift(-1)
df_wyscout['fullNameNext'] = df_wyscout['fullName'].shift(-1)
# ##### Create `player2player` column
# In[26]:
df_wyscout['player2player'] = df_wyscout['fullName'] + ' - ' + df_wyscout['fullNameNext']
# ##### Create `isPossessionRetained` column
# When a `teamId` is not followed by the same `teamId` in the next row, possession has been lost. We want to create a column that states this.
# In[27]:
df_wyscout['isPossessionRetained'] = np.where(df_wyscout['teamId'] == df_wyscout['teamIdNext'], True, False)
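# The shift-based logic above can be sanity-checked on a toy sequence of events (team IDs invented):

```python
import numpy as np
import pandas as pd

# Toy sequence of events in match order
df_poss = pd.DataFrame({"teamId": [1609, 1609, 1625, 1625, 1609]})

# Compare each event's team with the next event's team; the final row has
# no successor, so shift(-1) yields NaN and the comparison is False
df_poss["teamIdNext"] = df_poss["teamId"].shift(-1)
df_poss["isPossessionRetained"] = np.where(
    df_poss["teamId"] == df_poss["teamIdNext"], True, False)
print(df_poss["isPossessionRetained"].tolist())
```

# Note that the last event of the dataset (and, strictly, of each match) has no successor, so its comparison is always False.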
# ### 4.7. Drop columns
# As this is a large dataset with over 3 million rows, we remove every column that is not required at this stage.
# In[28]:
df_wyscout = df_wyscout.drop(['wyId_x', 'wyId_y', 'wyId_x.1'], axis=1)
# ### 4.8. Aggregate Data
# Aggregated data used for Tableau dashboarding
# #### 4.8.1. Fixture Level
# In[29]:
# Select columns of interest
## Define columns
cols = ['season',
'date_time_timestamp',
'fixture',
'team_home',
'team_away',
'teamName',
'goals_home',
'goals_away',
'eventName',
'subEventName'
]
## Streamline DataFrame with columns of interest (copy to avoid SettingWithCopyWarning)
df_wyscout_select = df_wyscout[cols].copy()
## Derive the opponent from the home and away teams
df_wyscout_select['Opponent'] = np.where(df_wyscout_select['team_home'] == df_wyscout_select['teamName'], df_wyscout_select['team_away'], df_wyscout_select['team_home'])
# In[30]:
#
## Group DataFrame and Aggregate on 'eventName'
df_wyscout_fixture_grouped = (df_wyscout_select
.groupby(['season', 'date_time_timestamp', 'fixture', 'teamName', 'Opponent', 'goals_home', 'goals_away', 'eventName'])
.agg({'eventName': ['count']})
)
## Drop level
df_wyscout_fixture_grouped.columns = df_wyscout_fixture_grouped.columns.droplevel(level=0)
## Reset index
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.reset_index()
## Rename columns
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.rename(columns={'season': 'Season',
'date_time_timestamp': 'Date',
'fixture': 'Fixture',
'teamName': 'Team',
'Opponent': 'Opponent',
'goals_home': 'Goals_Home',
'goals_away': 'Goals_Away',
'eventName': 'Event',
'count': 'Team_Value'
}
)
## Display DataFrame
df_wyscout_fixture_grouped.head()
# In[31]:
# Select columns of interest
## Define columns
cols = ['Season',
'Date',
'Fixture',
'Team',
'Opponent',
'Event',
'Team_Value'
]
## Streamline DataFrame with columns of interest
df_wyscout_fixture_grouped_select = df_wyscout_fixture_grouped[cols]
# In[32]:
# Join DataFrame to itself on 'Date', 'Fixture', 'Team'/'Opponent', and 'Event', to join Team and Opponent together
df_wyscout_fixture_grouped = pd.merge(df_wyscout_fixture_grouped, df_wyscout_fixture_grouped, how='left', left_on=['Season', 'Date', 'Fixture', 'Opponent', 'Event'], right_on = ['Season', 'Date', 'Fixture', 'Team', 'Event'])
# In[33]:
# Clean Data
## Drop columns
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.drop(columns=['Team_y', 'Opponent_y', 'Goals_Home_y', 'Goals_Away_y'])
## Rename columns
df_wyscout_fixture_grouped = df_wyscout_fixture_grouped.rename(columns={'Season_x': 'Season',
'Team_x': 'Team',
'Opponent_x': 'Opponent',
'Goals_Home_x': 'Goals_Home',
'Goals_Away_x': 'Goals_Away',
'Team_Value_x': 'Team_Value',
'Team_Value_y': 'Opponent_Value',
}
)
## Replace null values with zeros
df_wyscout_fixture_grouped['Team_Value'] = df_wyscout_fixture_grouped['Team_Value'].replace(np.nan, 0)
df_wyscout_fixture_grouped['Opponent_Value'] = df_wyscout_fixture_grouped['Opponent_Value'].replace(np.nan, 0)
## Convert 'Opponent_Value' from float64 to Int64 type
df_wyscout_fixture_grouped['Opponent_Value'] = df_wyscout_fixture_grouped['Opponent_Value'].astype('Int64')
## Display DataFrame
df_wyscout_fixture_grouped.head()
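# The self-join above is the crux of this cell, so here is the same pattern on a toy fixture (team names and counts invented): each row looks up the row where its `Opponent` is the `Team`, for the same fixture and event, and so picks up the opposing side's count.

```python
import pandas as pd

# Toy per-team, per-event counts for a single fixture
df_fx = pd.DataFrame({
    "Fixture": ["A - B"] * 4,
    "Team": ["A", "B", "A", "B"],
    "Opponent": ["B", "A", "B", "A"],
    "Event": ["Pass", "Pass", "Shot", "Shot"],
    "Team_Value": [500, 400, 12, 9],
})

# Join the frame to itself: the left side's 'Opponent' matches the right
# side's 'Team', so each row gains the opposing team's count for the same
# fixture and event
df_fx_merged = pd.merge(
    df_fx, df_fx, how="left",
    left_on=["Fixture", "Opponent", "Event"],
    right_on=["Fixture", "Team", "Event"],
    suffixes=("", "_opp"),
).rename(columns={"Team_Value_opp": "Opponent_Value"})
print(df_fx_merged[["Team", "Event", "Team_Value", "Opponent_Value"]])
```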
# In[34]:
# Derive a 'Gameweek' counter for each team
## One row per (Date, Team) appearance, each counting as one gameweek
df_fixture_gw = (df_wyscout_fixture_grouped[['Date', 'Team']]
                     .drop_duplicates()
                     .assign(Gameweek=1)
                )
## Groupby. See: https://stackoverflow.com/questions/18554920/pandas-aggregate-count-distinct
df_fixture_gw = (df_fixture_gw.groupby(['Team', 'Date']).sum()
.groupby(level=0).cumsum().reset_index()
)
## Display DataFrame
df_fixture_gw.head()
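# The two-step groupby above (a sum per team and date, then a cumulative sum within each team) is what turns per-date appearances into a running gameweek number. A toy check, with invented dates:

```python
import pandas as pd

# Toy appearances: one row per (Team, Date), each worth one gameweek
df_app = pd.DataFrame({
    "Team": ["A", "A", "A", "B", "B"],
    "Date": ["2017-08-12", "2017-08-19", "2017-08-26",
             "2017-08-12", "2017-08-19"],
    "Gameweek": [1, 1, 1, 1, 1],
})

# Sum per (Team, Date), then cumulatively sum within each team (level 0),
# so each team's fixtures are numbered 1, 2, 3, ... in date order
df_gw = (df_app.groupby(["Team", "Date"]).sum()
               .groupby(level=0).cumsum().reset_index())
print(df_gw)
```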
# In[35]:
# Join DataFrame
df_wyscout_fixture_grouped = pd.merge(df_wyscout_fixture_grouped, df_fixture_gw, how='left', left_on=['Date', 'Team'], right_on = ['Date', 'Team'])
# Display DataFrame
df_wyscout_fixture_grouped.head(50)
# #### 4.8.2. Team Level
# In[36]:
# Group DataFrame by Team
##
df_wyscout_team_grouped = (df_wyscout_fixture_grouped
.groupby(['Team', 'Event'])
.agg({'Team_Value': ['sum'],
'Opponent_Value': ['sum']
}
)
)
##
df_wyscout_team_grouped.columns = df_wyscout_team_grouped.columns.droplevel(level=0)
##
df_wyscout_team_grouped = df_wyscout_team_grouped.reset_index()
## Rename columns
df_wyscout_team_grouped.columns = ['Team', 'Event', 'Team_Value', 'Opponent_Value']
## Display columns
df_wyscout_team_grouped.head()
# ### 4.9. Filter Final DataFrames for 'Big 5' European Leagues Only
# In[37]:
lst_big5_leagues = [364, 412, 426, 524, 795]
# ##### Events DataFrame
# In[38]:
df_wyscout_big5 = df_wyscout[df_wyscout['competitionId'].isin(lst_big5_leagues)]
# ##### Aggregated DataFrame at fixture level
# In[62]:
# The aggregated DataFrames no longer carry 'competitionId', so filter on the teams present in the Big 5 events DataFrame
lst_big5_teams = df_wyscout_big5['teamName'].unique()
df_wyscout_fixture_grouped_big5 = df_wyscout_fixture_grouped[df_wyscout_fixture_grouped['Team'].isin(lst_big5_teams)]
# ##### Aggregated DataFrame at team level
# In[ ]:
df_wyscout_team_grouped_big5 = df_wyscout_team_grouped[df_wyscout_team_grouped['Team'].isin(lst_big5_teams)]
# ---
#
# ## 5. Export Data
# Export Data ready for building the Expected Goals models in the subsequent notebooks.
# ##### Events DataFrames
# In[40]:
# Export Events DataFrame as CSV
if not os.path.exists(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_events_all_1718.csv')):
df_wyscout.to_csv(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_events_all_1718.csv'), index=None, header=True)
else:
pass
# In[41]:
# Export Events DataFrame as CSV
if not os.path.exists(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_events_big5_1718.csv')):
df_wyscout_big5.to_csv(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_events_big5_1718.csv'), index=None, header=True)
else:
pass
# ##### Aggregated DataFrames at fixture level
# In[ ]:
# Export Aggregated DataFrame at the fixture Level as CSV
if not os.path.exists(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_fixtures_all_1718.csv')):
df_wyscout_fixture_grouped.to_csv(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_fixtures_all_1718.csv'), index=None, header=True)
else:
pass
# In[ ]:
# Export Aggregated DataFrame at the fixture Level as CSV
if not os.path.exists(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_fixtures_big5_1718.csv')):
df_wyscout_fixture_grouped_big5.to_csv(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_fixtures_big5_1718.csv'), index=None, header=True)
else:
pass
# ##### Aggregated DataFrames at team level
# In[ ]:
# Export Aggregated DataFrame at the team Level as CSV
if not os.path.exists(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_team_all_1718.csv')):
df_wyscout_team_grouped.to_csv(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_team_all_1718.csv'), index=None, header=True)
else:
pass
# In[ ]:
# Export Aggregated DataFrame at the team Level as CSV
if not os.path.exists(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_team_big5_1718.csv')):
df_wyscout_team_grouped_big5.to_csv(os.path.join(data_dir_wyscout, 'engineered', 'combined', 'wyscout_aggregated_team_big5_1718.csv'), index=None, header=True)
else:
pass
# ---
#
# ## 6. Summary
# This notebook engineered previously parsed [Wyscout](https://wyscout.com/) data using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames, exporting the resulting datasets as CSV files.
# ---
#
# ## 7. Next Steps
# The next step is to take the dataset created in this notebook and ...
# ---
#
# ## 8. References
# * Data Parsing notebook: https://nbviewer.jupyter.org/github/eddwebster/football_analytics/blob/master/notebooks/2_data_parsing/Wyscout%20Parsing.ipynb
# * Wyscout: https://wyscout.com/
# * Wyscout Events data manual: https://footballdata.wyscout.com/events-manual/
# * Pappalardo, Luca; Massucco, Emanuele (2019): Soccer match event dataset. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.4415000.v5
# * Pappalardo, L., Cintia, P., Rossi, A. et al. **A public data set of spatio-temporal match events in soccer competitions**. Scientific Data 6, 236 (2019) doi:10.1038/s41597-019-0247-7, https://www.nature.com/articles/s41597-019-0247-7
# * Custom function to flatten pandas DataFrames with nested JSON column: https://stackoverflow.com/questions/39899005/how-to-flatten-a-pandas-dataframe-with-some-columns-as-json
# ---
#
# ## 9. Further Reading
# ---
#
# ***Visit my website [EddWebster.com](https://www.eddwebster.com) or my [GitHub Repository](https://github.com/eddwebster) for more projects. If you'd like to get in contact, my Twitter handle is [@eddwebster](http://www.twitter.com/eddwebster) and my email is: edd.j.webster@gmail.com.***
# [Back to the top](#top)