#!/usr/bin/env python # coding: utf-8 # # # Data Engineering of StrataBet Data # ##### Notebook to engineer the event data provided by [StrataBet]( http://www.stratagem.co/) # # ### By [Edd Webster](https://www.twitter.com/eddwebster) # Notebook first written: 13/12/2020
# Notebook last updated: 26/12/2020 # ![title](../../img/stratabet_logo.jpg) # Click [here](#section5) to jump straight to the Exploratory Data Analysis section and skip the [Task Brief](#section2), [Data Sources](#section3), and [Data Engineering](#section4) sections. Or click [here](#section6) to jump straight to the Conclusion. # # This article was written with the aid of StrataData, which is property of [Stratagem Technologies](http://www.stratagem.co/). StrataData powers the [StrataBet Sports Trading Platform](http://www.stratabet.com/), in addition to [StrataBet Premium Recommendations](http://app.stratabet.com/recommendations). # ___ # # # ## Introduction # This notebook engineers [StrataBet]( http://www.stratagem.co/) data for football matches in a variety of European leagues during the 16/17 and 17/18 seasons, using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames. # # For more information about this notebook and the author, I'm available through all the following channels: # * [eddwebster.com](https://www.eddwebster.com/); # * edd.j.webster@gmail.com; # * [@eddwebster](https://www.twitter.com/eddwebster); # * [linkedin.com/in/eddwebster](https://www.linkedin.com/in/eddwebster/); # * [github/eddwebster](https://github.com/eddwebster/); # * [public.tableau.com/profile/edd.webster](https://public.tableau.com/profile/edd.webster); # * [kaggle.com/eddwebster](https://www.kaggle.com/eddwebster); and # * [hackerrank.com/eddwebster](https://www.hackerrank.com/eddwebster). # # ![title](../../img/fifa21eddwebsterbanner.png) # # The accompanying GitHub repository for this notebook can be found [here](https://github.com/eddwebster/fifa-league) and a static version of this notebook can be found [here](https://nbviewer.jupyter.org/github/eddwebster/football_analytics/blob/master/notebooks/B%29%20Data%20Engineering/Opta%20%23mcfcanalytics%20PL%202011-2012.ipynb). # ___ # # # ## Notebook Contents # 1. [Notebook Dependencies](#section1)
# 2. [Project Brief](#section2)
# 3. [Data Sources](#section3)
# 1. [Introduction](#section3.1)
# 2. [Data Dictionary](#section3.2)
# 3. [Creating the DataFrame](#section3.3)
# 4. [Initial Data Handling](#section3.4)
# 5. [Export the Raw DataFrame](#section3.5)
# 4. [Data Engineering](#section4)
# 1. [Introduction](#section4.1)
# 2. [Columns of Interest](#section4.2)
# 3. [String Cleaning](#section4.3)
# 4. [Converting Data Types](#section4.4)
# 5. [Export the Engineered DataFrame](#section4.5)
# 5. [Exploratory Data Analysis (EDA)](#section5)
# 1. [...](#section5.1)
# 2. [...](#section5.2)
# 3. [...](#section5.3)
# 6. [Summary](#section6)
# 7. [Next Steps](#section7)
# 8. [Bibliography](#section8)
# ___ # # # ## 1. Notebook Dependencies # This notebook was written using [Python 3](https://docs.python.org/3.7/) and requires the following libraries: # * [`Jupyter notebooks`](https://jupyter.org/) for this notebook environment with which this project is presented; # * [`NumPy`](http://www.numpy.org/) for multidimensional array computing; # * [`pandas`](http://pandas.pydata.org/) for data analysis and manipulation; and # * [`matplotlib`](https://matplotlib.org/contents.html?v=20200411155018) for data visualisations; # # All packages used for this notebook except for BeautifulSoup can be obtained by downloading and installing the [Conda](https://anaconda.org/anaconda/conda) distribution, available on all platforms (Windows, Linux and Mac OSX). Step-by-step guides on how to install Anaconda can be found for Windows [here](https://medium.com/@GalarnykMichael/install-python-on-windows-anaconda-c63c7c3d1444) and Mac [here](https://medium.com/@GalarnykMichael/install-python-on-mac-anaconda-ccd9f2014072), as well as in the Anaconda documentation itself [here](https://docs.anaconda.com/anaconda/install/). # ### Import Libraries and Modules # In[2]: # Python ≥3.5 (ideally) import platform import sys, getopt assert sys.version_info >= (3, 5) import csv # Import Dependencies get_ipython().run_line_magic('matplotlib', 'inline') # Math Operations import numpy as np from math import pi # Datetime import datetime from datetime import date import time # Data Preprocessing import pandas as pd # version 1.0.3 import os # used to read the csv filenames import re import random from io import BytesIO from pathlib import Path # Reading directories import glob import os # Working with JSON import json from pandas.io.json import json_normalize from ast import literal_eval # Data Visualisation import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns plt.style.use('seaborn-whitegrid') import missingno as msno # visually display missing data # Display in Jupyter from IPython.display import Image, YouTubeVideo from IPython.core.display import HTML # Ignore Warnings import warnings warnings.filterwarnings(action="ignore", message="^internal gelsd") print('Setup Complete') # In[3]: # Python / module versions used here for reference print('Python: {}'.format(platform.python_version())) print('NumPy: {}'.format(np.__version__)) print('pandas: {}'.format(pd.__version__)) print('matplotlib: {}'.format(mpl.__version__)) print('Seaborn: {}'.format(sns.__version__)) # ### Defined Filepaths # In[4]: # Set up initial paths to subfolders base_dir = os.path.join('..', '..', ) data_dir = os.path.join(base_dir, 'data') data_dir_fbref = os.path.join(base_dir, 'data', 'fbref') data_dir_stratabet = os.path.join(base_dir, 'data', 'stratabet') img_dir = os.path.join(base_dir, 'img') fig_dir = os.path.join(base_dir, 'img', 'fig') video_dir = os.path.join(base_dir, 'video') # ### Notebook Settings # In[5]: pd.set_option('display.max_columns', None) # --- # # # ## 2. Project Brief # This Jupyter notebook explores how to engineer [StrataBet]( http://www.stratagem.co/) Events data using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames. # # The resulting engineered Datarame is exported as a CSV file. This data can be further analysed in Python, joined to other datasets, or explored using Tableau, PowerBI, Microsoft Excel. # --- # # # ## 3. Data Sources # ### 3.1. Introduction # This StrataData has been made availble by [Stratagem Technologies](http://www.stratagem.co/). StrataData powers the [StrataBet Sports Trading Platform](http://www.stratabet.com/), in addition to [StrataBet Premium Recommendations](http://app.stratabet.com/recommendations). # # Before conducting our EDA, the data needs to be imported as a DataFrame in the Data Sources section [Section 3](#section3) and cleaned in the Data Engineering section [Section 4](#section4). # # We'll be using the [pandas](http://pandas.pydata.org/) library to import our data to this workbook as a DataFrame. # ### 3.2. Chances # #### 3.2.1. Data Dictionary # The [StrataBet]( http://www.stratagem.co/) Events dataset has twelve features (columns) with the following definitions and data types: # # | Feature | Data type | # |------|-----| # | `eventId` | int64 | # | `subEventName` | object | # | `tags` | object | # | `playerId` | int64 | # | `positions` | object | # | `matchId` | int64 | # | `eventName` | object | # | `teamId` | int64 | # | `matchPeriod` | object | # | `eventSec` | float64 | # | `subEventId` | object | # | `id` | int64 | # #### 3.2.2. Import Data # In[6]: # Combine individual csv files to form one DataFrame, df_raw, using glob lst_files_chances = glob.glob(data_dir_stratabet + '/raw/chances/individual_competitions' + "/*.csv") # Creates a list of all csv files li = [] # pd.concat takes a list of DataFrames as an argument for filename in lst_files_chances: df_raw_temp = pd.read_csv(filename, index_col=None, header=0) li.append(df_raw_temp) df_stratabet_chances_raw = pd.concat(li, axis=0, ignore_index=True) # ignore_index=True as we don't want pandas to try an align row indexes # #### 3.2.3. Initial Data Handling # Let's quality of the dataset by looking first and last rows in pandas using the [head()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [tail()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods. # In[7]: # Display the first 5 rows of the raw DataFrame, df_stratabet_chances_raw df_stratabet_chances_raw.head() # In[8]: # Display the last 5 rows of the raw DataFrame, df_stratabet_chances_raw df_stratabet_chances_raw.tail() # In[9]: # Print the shape of the raw DataFrame, df_stratabet_chances_raw print(df_stratabet_chances_raw.shape) # In[10]: # Print the column names of the raw DataFrame, df_stratabet_chances_raw print(df_stratabet_chances_raw.columns) # The dataset has six features (columns). Full details of these attributes can be found in the [Data Dictionary](section3.3.1). # In[11]: # Data types of the features of the raw DataFrame, df_stratabet_chances_raw df_stratabet_chances_raw .dtypes # All six of the columns have the object data type. Full details of these attributes and their data types can be found in the [Data Dictionary](section3.3.1). # In[12]: # Info for the raw DataFrame, df_stratabet_chances_raw df_stratabet_chances_raw.info() # In[13]: # Description of the raw DataFrame, df_stratabet_raw, showing some summary statistics for each numberical column in the DataFrame df_stratabet_chances_raw.describe() # In[14]: # Plot visualisation of the missing values for each feature of the raw DataFrame, df_stratabet_chances_raw msno.matrix(df_stratabet_chances_raw, figsize = (30, 7)) # In[15]: # Counts of missing values tm_null_value_stats = df_stratabet_chances_raw.isnull().sum(axis=0) tm_null_value_stats[tm_null_value_stats != 0] # The visualisation shows us very quickly that there a few missing values in the `shotQuality` column, but otherwise the dataset is complete. This data is now ready for Data Engineering. # #### 3.2.4. Export Complete DataFrame # In[16]: df_stratabet_chances_raw.to_csv(data_dir_stratabet + '/raw/chances/' + 'stratabet_chances_all.csv', index=None, header=True) # ### 3.3. Key Entries # #### 3.3.1. Data Dictionary # The [StrataBet]( http://www.stratagem.co/) Events dataset has twelve features (columns) with the following definitions and data types: # # | Feature | Data type | # |------|-----| # | `eventId` | int64 | # | `subEventName` | object | # | `tags` | object | # | `playerId` | int64 | # | `positions` | object | # | `matchId` | int64 | # | `eventName` | object | # | `teamId` | int64 | # | `matchPeriod` | object | # | `eventSec` | float64 | # | `subEventId` | object | # | `id` | int64 | # #### 3.3.2. Import Data # In[17]: # Combine individual csv files to form one DataFrame, df_raw, using glob lst_files_key_entries = glob.glob(data_dir_stratabet + '/raw/key_entries/individual_competitions' + "/*.csv") # Creates a list of all csv files li = [] # pd.concat takes a list of DataFrames as an argument for filename in lst_files_key_entries: df_raw_temp = pd.read_csv(filename, index_col=None, header=0) li.append(df_raw_temp) df_stratabet_key_entries_raw = pd.concat(li, axis=0, ignore_index=True) # ignore_index=True as we don't want pandas to try an align row indexes # #### 3.3.3. Initial Data Handling # Let's quality of the dataset by looking first and last rows in pandas using the [head()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [tail()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods. # In[18]: # Display the first 5 rows of the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.head() # In[19]: # Display the last 5 rows of the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.tail() # In[20]: # Print the shape of the raw DataFrame, df_stratabet_key_entries_raw print(df_stratabet_key_entries_raw.shape) # In[21]: # Print the column names of the raw DataFrame, df_stratabet_key_entries_raw print(df_stratabet_key_entries_raw.columns) # The dataset has six features (columns). Full details of these attributes can be found in the [Data Dictionary](section3.3.1). # In[22]: # Info for the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.info() # In[23]: # Description of the raw DataFrame, df_stratabet_key_entries_raw, showing some summary statistics for each numberical column in the DataFrame df_stratabet_key_entries_raw.describe() # In[24]: # Plot visualisation of the missing values for each feature of the raw DataFrame, df_stratabet_key_entries_raw msno.matrix(df_stratabet_key_entries_raw, figsize = (30, 7)) # In[25]: # Counts of missing values tm_null_value_stats = df_stratabet_key_entries_raw.isnull().sum(axis=0) tm_null_value_stats[tm_null_value_stats != 0] # The visualisation shows us very quickly that there a few missing values in the `shotQuality` column, but otherwise the dataset is complete. This data is now ready for Data Engineering. # #### 3.3.4. Export Complete DataFrame # In[26]: df_stratabet_key_entries_raw.to_csv(data_dir_stratabet + '/raw/key_entries/' + 'stratabet_key_entries_all.csv', index=None, header=True) # ### 3.4. Match Info # #### 3.4.1. Data Dictionary # The [StrataBet]( http://www.stratagem.co/) Events dataset has twelve features (columns) with the following definitions and data types: # # | Feature | Data type | # |------|-----| # | `eventId` | int64 | # | `subEventName` | object | # | `tags` | object | # | `playerId` | int64 | # | `positions` | object | # | `matchId` | int64 | # | `eventName` | object | # | `teamId` | int64 | # | `matchPeriod` | object | # | `eventSec` | float64 | # | `subEventId` | object | # | `id` | int64 | # #### 3.4.2. Import Data # In[27]: # Combine individual csv files to form one DataFrame, df_raw, using glob lst_files_key_entries = glob.glob(data_dir_stratabet + '/raw/key_entries/individual_competitions' + "/*.csv") # Creates a list of all csv files li = [] # pd.concat takes a list of DataFrames as an argument for filename in lst_files_key_entries: df_raw_temp = pd.read_csv(filename, index_col=None, header=0) li.append(df_raw_temp) df_stratabet_key_entries_raw = pd.concat(li, axis=0, ignore_index=True) # ignore_index=True as we don't want pandas to try an align row indexes # #### 3.4.3. Initial Data Handling # Let's quality of the dataset by looking first and last rows in pandas using the [head()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [tail()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods. # In[28]: # Display the first 5 rows of the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.head() # In[29]: # Display the last 5 rows of the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.tail() # In[30]: # Print the shape of the raw DataFrame, df_stratabet_key_entries_raw print(df_stratabet_key_entries_raw.shape) # In[31]: # Print the column names of the raw DataFrame, df_stratabet_key_entries_raw print(df_stratabet_key_entries_raw.columns) # The dataset has six features (columns). Full details of these attributes can be found in the [Data Dictionary](section3.3.1). # In[32]: # Info for the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.info() # In[33]: # Description of the raw DataFrame, df_stratabet_key_entries_raw, showing some summary statistics for each numberical column in the DataFrame df_stratabet_key_entries_raw.describe() # In[34]: # Plot visualisation of the missing values for each feature of the raw DataFrame, df_stratabet_key_entries_raw msno.matrix(df_stratabet_key_entries_raw, figsize = (30, 7)) # In[35]: # Counts of missing values tm_null_value_stats = df_stratabet_key_entries_raw.isnull().sum(axis=0) tm_null_value_stats[tm_null_value_stats != 0] # The visualisation shows us very quickly that there a few missing values in the `shotQuality` column, but otherwise the dataset is complete. This data is now ready for Data Engineering. # #### 3.2.4. Export Complete DataFrame # In[36]: df_stratabet_key_entries_raw.to_csv(data_dir_stratabet + '/raw/key_entries/' + 'stratabet_key_entries_all.csv', index=None, header=True) # ### 3.5. Minutes Played # #### 3.5.1. Data Dictionary # The [StrataBet]( http://www.stratagem.co/) Events dataset has twelve features (columns) with the following definitions and data types: # # | Feature | Data type | # |------|-----| # | `eventId` | int64 | # | `subEventName` | object | # | `tags` | object | # | `playerId` | int64 | # | `positions` | object | # | `matchId` | int64 | # | `eventName` | object | # | `teamId` | int64 | # | `matchPeriod` | object | # | `eventSec` | float64 | # | `subEventId` | object | # | `id` | int64 | # #### 3.5.2. Import Data # In[37]: # Combine individual csv files to form one DataFrame, df_raw, using glob lst_files_key_entries = glob.glob(data_dir_stratabet + '/raw/key_entries/individual_competitions' + "/*.csv") # Creates a list of all csv files li = [] # pd.concat takes a list of DataFrames as an argument for filename in lst_files_key_entries: df_raw_temp = pd.read_csv(filename, index_col=None, header=0) li.append(df_raw_temp) df_stratabet_key_entries_raw = pd.concat(li, axis=0, ignore_index=True) # ignore_index=True as we don't want pandas to try an align row indexes # #### 3.5.3. Initial Data Handling # Let's quality of the dataset by looking first and last rows in pandas using the [head()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [tail()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods. # In[38]: # Display the first 5 rows of the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.head() # In[39]: # Display the last 5 rows of the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.tail() # In[40]: # Print the shape of the raw DataFrame, df_stratabet_key_entries_raw print(df_stratabet_key_entries_raw.shape) # In[41]: # Print the column names of the raw DataFrame, df_stratabet_key_entries_raw print(df_stratabet_key_entries_raw.columns) # The dataset has six features (columns). Full details of these attributes can be found in the [Data Dictionary](section3.3.1). # In[42]: # Info for the raw DataFrame, df_stratabet_key_entries_raw df_stratabet_key_entries_raw.info() # In[43]: # Description of the raw DataFrame, df_stratabet_key_entries_raw, showing some summary statistics for each numberical column in the DataFrame df_stratabet_key_entries_raw.describe() # In[44]: # Plot visualisation of the missing values for each feature of the raw DataFrame, df_stratabet_key_entries_raw msno.matrix(df_stratabet_key_entries_raw, figsize = (30, 7)) # In[45]: # Counts of missing values tm_null_value_stats = df_stratabet_key_entries_raw.isnull().sum(axis=0) tm_null_value_stats[tm_null_value_stats != 0] # The visualisation shows us very quickly that there a few missing values in the `shotQuality` column, but otherwise the dataset is complete. This data is now ready for Data Engineering. # #### 3.5.4. Export Complete DataFrame # In[46]: df_stratabet_key_entries_raw.to_csv(data_dir_stratabet + '/raw/key_entries/' + 'stratabet_key_entries_all.csv', index=None, header=True) # --- # ## 4. Data Engineering # ### 4.1. Chances # #### 4.1.1. Assign Raw DataFrame to Engineered DataFrame # In[47]: # Assign Raw DataFrame to Engineered DataFrame df_stratabet_chances = df_stratabet_chances_raw # #### 4.1.2. Create `Full_Fixture_Date` Attribute # In[48]: df_stratabet_chances.head() # In[49]: df_stratabet_chances['Full_Fixture_Date'] = df_stratabet_chances['kickoffDate'].astype(str) + ' ' + df_stratabet_chances['hometeam_team1'].astype(str) + ' vs. ' + df_stratabet_chances['awayteam_team2'].astype(str) # #### 4.1.3. Convert Data Types # In[50]: df_stratabet_chances['location_x'] = df_stratabet_chances['location_x'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['location_y'] = df_stratabet_chances['location_y'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['primaryLocation_x'] = df_stratabet_chances['primaryLocation_x'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['primaryLocation_y'] = df_stratabet_chances['primaryLocation_y'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['shotQuality'] = df_stratabet_chances['shotQuality'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['defPressure'] = df_stratabet_chances['defPressure'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['numDefPlayers'] = df_stratabet_chances['numDefPlayers'].apply(pd.to_numeric, errors='coerce') df_stratabet_chances['numAttPlayers'] = df_stratabet_chances['numAttPlayers'].apply(pd.to_numeric, errors='coerce') # #### 4.1.4. Convert X, Y Coordinates to Standardised Coordiantes # # From the documenation, the XY coordinates in addition to grid locations, with (0,0) representing the absolute centre of the defended goal line. The pitch length runs from 0 to 420, while the width runs from 136 to -136 (left to right). # # Some key reference points of note: # * Left Goalpost (15, 0) # * Right Goalpost (-15, 0) # * 6-Yard Box Left Corner: (37, 22) # * 6-Yard Box Right Corner: (-37, 22) # * Penalty Spot: (0, 44) # * 18-Yard Box Left Corner: (81, 66) # * 18-Yard Box Right Corner: (-81, 66) # * Centre Spot: (0, 210) # In[51]: df_stratabet_chances['location_y_120'] = ((df_stratabet_chances['location_y'] / 480) * 120).round(2) df_stratabet_chances['location_x_80'] = (((df_stratabet_chances['location_x'] + 136) / 272) * 80).round(2) df_stratabet_chances['primaryLocation_y_120'] = ((df_stratabet_chances['primaryLocation_y'] / 480) * 120).round(2) df_stratabet_chances['primaryLocation_x_80'] = (((df_stratabet_chances['primaryLocation_x'] + 136) / 272) * 80).round(2) # In[52]: df_stratabet_chances.head() # In[53]: df_stratabet_chances['location_y_120_inv'] = 120 - ((df_stratabet_chances['location_y'] / 480) * 120).round(2) df_stratabet_chances['location_x_80_inv'] = 80 - (((df_stratabet_chances['location_x'] + 136) / 272) * 80).round(2) df_stratabet_chances['primaryLocation_y_120_inv'] = 120 - ((df_stratabet_chances['primaryLocation_y'] / 480) * 120).round(2) df_stratabet_chances['primaryLocation_x_80_inv'] = 80 - (((df_stratabet_chances['primaryLocation_x'] + 136) / 272) * 80).round(2) # #### 4.1.5. Renaming # In[54]: df_stratabet_chances = df_stratabet_chances.rename(columns = {'index':'id'}) # #### 4.1.6. Assign New Attributes # ##### Season # In[55]: df_stratabet_chances['Season'] = 'TO ADD' # In[56]: # Write code here to add seasons for each match per 'competition' and 'kickoffDate' - varies per league # #### 4.1.7. Create DataFrame of Teams and Leagues # In[68]: # Create DataFrame of Teams and Leagues ## Select columns of interest cols = ['competition', 'team'] df_stratabet_teams_leagues = df_stratabet_chances[cols] ## Drop duplicates df_stratabet_teams_leagues = df_stratabet_teams_leagues.drop_duplicates() ## Order columns by league and team df_stratabet_teams_leagues = df_stratabet_teams_leagues.sort_values(['competition', 'team'], ascending=[True, True]) ## Display DataFrame df_stratabet_teams_leagues.head() # In[69]: # Export DataFrame df_stratabet_teams_leagues.to_csv(data_dir_stratabet + '/reference/teams_leagues.csv', index=None, header=True) # #### 4.1.8. Export DataFrame # In[64]: df_stratabet_chances.to_csv(data_dir_stratabet + '/engineered/chances/stratabet_chances_all.csv', index=None, header=True) # In[65]: df_stratabet_chances.to_csv(data_dir + '/export/stratabet_events_chances.csv', index=None, header=True) # In[66]: df_stratabet_chances.head() # In[67]: df_stratabet_chances.shape # In[ ]: # All code below here is old and needs to be sorted # ### 4.2. Key Entries # In[ ]: # Add code here # ### 4.3. Match Info # In[ ]: # Add code here # ### 4.4. Minutes Played # In[ ]: # Add code here # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # Old code from here # ### 4.3. String Cleaning # ##### Split `Label` column into seperate `Fixture` and `Score` columns # In[ ]: # Break down 'Label' column into constituent parts - Fixtures, Score, Date, Home Goals, Away Goals, etc. df_stratabet['fixture'] = df_stratabet['label'].str.split(', ').str[0] df_stratabet['score_home_away'] = df_stratabet['label'].str.split(', ').str[1] df_stratabet['team_home'] = df_stratabet['fixture'].str.split(' - ').str[0] df_stratabet['team_away'] = df_stratabet['fixture'].str.split(' - ').str[1] df_stratabet['goals_home'] = df_stratabet['score_home_away'].str.split(' - ').str[0] df_stratabet['goals_away'] = df_stratabet['score_home_away'].str.split(' - ').str[1] # ##### Split `date` column into seperate `date_isolated` and `time_isolated` columns # In[ ]: df_stratabet['date_isolated'] = df_stratabet['date'].str.split(' at').str[0] df_stratabet['time_isolated'] = df_stratabet['date'].str.split(' at ').str[1] df_stratabet['date_time_isolated'] = df_stratabet['date'].str.split(' GMT').str[0].str.replace(' at ', ' ', regex=True) # ### 4.4. Drop columns # As this is a large dataset with >3mil rows, we will remove every column that is not required at this stage. # In[ ]: # Display columns df_stratabet.columns # In[ ]: # df_stratabet = df_stratabet.drop(['tags', 'dateutc', 'wyId_x', 'label', 'date', 'referees', 'wyId_y', 'date_isolated', 'time_isolated', 'date_time_isolated']) # ### 4.5. Create New Attributes # ##### Create full fixture data from broken down attributes created in section 4.2. # In[ ]: df_stratabet['full_fixture_date'] = df_stratabet['date_date'].astype(str) + ' ' + df_stratabet['team_home'].astype(str) + ' ' + df_stratabet['goals_home'].astype(str) + ' ' + ' v ' + ' ' + df_stratabet['goals_away'].astype(str) + ' ' + df_stratabet['team_away'].astype(str) # ##### Create 'season' attribute # In[ ]: df_stratabet['season'] = '17/18' # ### 4.5. Reorder DataFrame # Correctly order all the rows in the DataFrame by date, time, country, fixture, half, and time in the match. Important when looking at events and the following event e.g. is possession retains? Which player receives the pass, etc. # In[ ]: df_stratabet = df_stratabet.sort_values(['date_date', 'time_time', 'country', 'league_name', 'full_fixture_date', 'matchPeriod', 'eventSec'], ascending=[True, True, True, True, True, True, True]) # ### 4.6. Create New Attributes # ##### Create `followingPossession` column # 'teamIdNext' = following 'teamId' # In[ ]: df_stratabet['teamIdNext'] = df_stratabet['teamId'].shift(-1) df_stratabet['teamNameNext'] = df_stratabet['teamName'].shift(-1) df_stratabet['fullNameNext'] = df_stratabet['fullName'].shift(-1) # In[ ]: df_stratabet.head() # ##### Create `player2player` column # In[ ]: df_stratabet['player2player'] = df_stratabet['fullName'] + ' - ' + df_stratabet['fullNameNext'] # ##### Create `isPossessionRetained` column # When `teamId` is not followed by the same `teamId` in the following row, possession is lost. We want to creat a column that stats this. # In[ ]: df_stratabet['isPossessionRetained'] = np.where(df_stratabet['teamId'] == df_stratabet['teamIdNext'], True, False) # ##### Clean Positions data # In[ ]: # CODE HERE # ### 4.7. Export DataFrame # ##### Break down data into individual matches # In[ ]: lst_results = list(df_stratabet['full_fixture_date'].unique()) # In[ ]: for i, g in df_stratabet.groupby('full_fixture_date'): g.to_csv(data_dir_wyscout + '/engineered/individual_matches/{}.csv'.format(i), header=True, index_label=True) # ##### Complete dataset # In[ ]: df_stratabet.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_events_big5_1718.csv', index=None, header=True) # ### 4.8. Aggregate Data # #### 4.8.1. Fixture Level # In[ ]: # Select columns of interest ## Define columns cols = ['season', 'date_time_timestamp', 'fixture', 'team_home', 'team_away', 'teamName', 'goals_home', 'goals_away', 'eventName', 'subEventName' ] ## Streamline DataFrame with columns of interest df_stratabet_select = df_stratabet[cols] ## df_stratabet_select['Opponent'] = np.where(df_stratabet_select['team_home'] == df_stratabet_select['teamName'], df_stratabet_select['team_away'], df_stratabet_select['team_home']) # In[ ]: # ## Group DataFrame and Aggregate on 'eventName' df_stratabet_fixture_grouped = (df_stratabet_select .groupby(['season', 'date_time_timestamp', 'fixture', 'teamName', 'Opponent', 'goals_home', 'goals_away', 'eventName']) .agg({'eventName': ['count']}) ) ## Drop level df_stratabet_fixture_grouped.columns = df_stratabet_fixture_grouped.columns.droplevel(level=0) ## Reset index df_stratabet_fixture_grouped = df_stratabet_fixture_grouped.reset_index() ## Rename columns df_stratabet_fixture_grouped = df_stratabet_fixture_grouped.rename(columns={'season': 'Season', 'date_time_timestamp': 'Date', 'fixture': 'Fixture', 'teamName': 'Team', 'Opponent': 'Opponent', 'goals_home': 'Goals_Home', 'goals_away': 'Goals_Away', 'eventName': 'Event', 'count': 'Team_Value' } ) ## Display DataFrame df_stratabet_fixture_grouped.head() # In[ ]: # Select columns of interest ## Define columns cols = ['Season', 'Date', 'Fixture', 'Team', 'Opponent', 'Event', 'Team_Value' ] ## Streamline DataFrame with columns of interest df_stratabet_fixture_grouped_select = df_stratabet_fixture_grouped[cols] # In[ ]: # Join DataFrame to itself on 'Date', 'Fixture', 'Team'/'Opponent', and 'Event', to join Team and Opponent together df_stratabet_fixture_grouped = pd.merge(df_stratabet_fixture_grouped, df_stratabet_fixture_grouped, how='left', left_on=['Season', 'Date', 'Fixture', 'Opponent', 'Event'], right_on = ['Season', 'Date', 'Fixture', 'Team', 'Event']) # In[ ]: # Clean Data ## Drop columns df_stratabet_fixture_grouped = df_stratabet_fixture_grouped.drop(columns=['Team_y', 'Opponent_y', 'Goals_Home_y', 'Goals_Away_y']) ## Rename columns df_stratabet_fixture_grouped = df_stratabet_fixture_grouped.rename(columns={'Season_x': 'Season', 'Team_x': 'Team', 'Opponent_x': 'Opponent', 'Goals_Home_x': 'Goals_Home', 'Goals_Away_x': 'Goals_Away', 'Team_Value_x': 'Team_Value', 'Team_Value_y': 'Opponent_Value', } ) ## Replace null values with zeros df_stratabet_fixture_grouped['Team_Value'] = df_stratabet_fixture_grouped['Team_Value'].replace(np.nan, 0) df_stratabet_fixture_grouped['Opponent_Value'] = df_stratabet_fixture_grouped['Opponent_Value'].replace(np.nan, 0) ## Convert Opponent_Value' from Float64 to Int64 type df_stratabet_fixture_grouped['Opponent_Value'] = df_stratabet_fixture_grouped['Opponent_Value'].astype('Int64') ## Display DataFrame df_stratabet_fixture_grouped.head() # In[ ]: # ## df_fixture_gw = (df_stratabet_fixture_grouped .groupby(['Date', 'Team']) .agg({'Team': ['nunique']}) ) ## df_fixture_gw.columns = df_fixture_gw.columns.droplevel(level=0) ## df_fixture_gw = df_fixture_gw.reset_index() ## df_fixture_gw = df_fixture_gw.rename(columns={'Date': 'Date', 'nunique': 'Gameweek', } ) ## Groupby. See: https://stackoverflow.com/questions/18554920/pandas-aggregate-count-distinct df_fixture_gw = (df_fixture_gw.groupby(['Team', 'Date']).sum() .groupby(level=0).cumsum().reset_index() ) ## Display DataFrame df_fixture_gw.head() # In[ ]: # Join DataFrame df_stratabet_fixture_grouped = pd.merge(df_stratabet_fixture_grouped, df_fixture_gw, how='left', left_on=['Date', 'Team'], right_on = ['Date', 'Team']) # Display DataFrame df_stratabet_fixture_grouped.head(50) # ##### Export DataFrame # In[ ]: df_stratabet_fixture_grouped.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_aggregated_fixtures_big5_1718.csv', index=None, header=True) # #### 4.8.2. Team Level # In[ ]: # Group DataFrame by Team ## df_stratabet_team_grouped = (df_stratabet_fixture_grouped .groupby(['Team', 'Event']) .agg({'Team_Value': ['sum'], 'Opponent_Value': ['sum'] } ) ) ## df_stratabet_team_grouped.columns = df_stratabet_team_grouped.columns.droplevel(level=0) ## df_stratabet_team_grouped = df_stratabet_team_grouped.reset_index() ## Rename columns df_stratabet_team_grouped.columns = ['Team', 'Event', 'Team_Value', 'Opponent_Value'] ## Display columns df_stratabet_team_grouped.head() # ##### Export DataFrame # In[ ]: df_stratabet_team_grouped.to_csv(data_dir_wyscout + '/engineered/combined/wyscout_aggregated_team_big5_1718.csv', index=None, header=True) # --- # ## 5. Exploratory Data Analysis # ... # --- # ## 6. Summary # This notebook scrapes data for player valuations using [Beautifulsoup](https://pypi.org/project/beautifulsoup4/) from [TransferMarkt](https://www.transfermarkt.co.uk/) using [pandas](http://pandas.pydata.org/) for data maniuplation through DataFrames and [Beautifulsoup](https://pypi.org/project/beautifulsoup4/) for webscraping. # --- # ## 7. Next Steps # ... # --- # ## 8. References # ... # --- # This article was written with the aid of StrataData, which is property of [Stratagem Technologies](http://www.stratagem.co/). StrataData powers the [StrataBet Sports Trading Platform](http://www.stratabet.com/), in addition to [StrataBet Premium Recommendations](http://app.stratabet.com/recommendations). # # ***Visit my website [EddWebster.com](https://www.eddwebster.com) or my [GitHub Repository](https://github.com/eddwebster) for more projects. If you'd like to get in contact, my Twitter handle is [@eddwebster](http://www.twitter.com/eddwebster) and my email is: edd.j.webster@gmail.com.*** # [Back to the top](#top)