#!/usr/bin/env python
# coding: utf-8

# <a id='top'></a>
# 
# # Physical Data Engineering
# ##### Notebook to engineer physical data using [pandas](http://pandas.pydata.org/).
# 
# ### By [Edd Webster](https://www.twitter.com/eddwebster)
# Notebook first written: 20/01/2022
# Notebook last updated: 01/02/2022
# 
# ![Watford F.C.](../../img/club_badges/premier_league/watford_fc_logo_small.png)
# 
# Click [here](#section4) to jump straight into the Data Engineering section and skip the [Notebook Brief](#section2) and [Data Sources](#section3) sections.
# ___
# 
# ## Introduction
# This notebook engineers an anonymised dataset of physical data provided by [Watford F.C.](https://www.watfordfc.com/), using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames.
# 
# For more information about this notebook and the author, I am available through the following channels:
# *    [eddwebster.com](https://www.eddwebster.com/);
# *    edd.j.webster@gmail.com;
# *    [@eddwebster](https://www.twitter.com/eddwebster);
# *    [linkedin.com/in/eddwebster](https://www.linkedin.com/in/eddwebster/);
# *    [github/eddwebster](https://github.com/eddwebster/); and
# *    [public.tableau.com/profile/edd.webster](https://public.tableau.com/profile/edd.webster).
# 
# A static version of this notebook can be found [here](https://nbviewer.org/github/eddwebster/watford/blob/main/notebooks/2_data_engineering/Opta%20Data%20Engineering.ipynb). This notebook has an accompanying [`watford`](https://github.com/eddwebster/watford) GitHub repository and for my full repository of football analysis, see my [`football_analytics`](https://github.com/eddwebster/football_analytics) GitHub repository.
# ___
# 
# ## Notebook Contents
# 1.    [Notebook Dependencies](#section1)
# 2. [Notebook Brief](#section2)
# 3. [Data Sources](#section3)
#     1. [Introduction](#section3.1)
#     2. [Read in the Datasets](#section3.2)
#     3. [Initial Data Handling](#section3.3)
# 4. [Data Engineering](#section4)
#     1. [Assign Raw DataFrame to Engineered DataFrame](#section4.1)
#     2. [Create Columns for 'Non-High Intensity Distance' per 5 Minutes](#section4.2)
#     3. [Pivot Data](#section4.3)
# 5. [Export Final DataFrames](#section5)
# 6. [Summary](#section6)
# 7. [Next Steps](#section7)
# 8. [References](#section8)
# ___
# 
# <a id='section1'></a>
# 
# ## 1. Notebook Dependencies
# 
# This notebook was written using [Python 3](https://docs.python.org/3.7/) and requires the following libraries:
# *    [`Jupyter notebooks`](https://jupyter.org/) for the notebook environment in which this project is presented;
# *    [`NumPy`](http://www.numpy.org/) for multidimensional array computing; and
# *    [`pandas`](http://pandas.pydata.org/) for data analysis and manipulation.
# 
# All packages used for this notebook can be obtained by downloading and installing the [Conda](https://anaconda.org/anaconda/conda) distribution, available on all platforms (Windows, Linux and Mac OSX). Step-by-step guides on how to install Anaconda can be found for Windows [here](https://medium.com/@GalarnykMichael/install-python-on-windows-anaconda-c63c7c3d1444) and Mac [here](https://medium.com/@GalarnykMichael/install-python-on-mac-anaconda-ccd9f2014072), as well as in the Anaconda documentation itself [here](https://docs.anaconda.com/anaconda/install/).

# ### Import Libraries and Modules

# In[2]:


# Python ≥3.5 (ideally)
import platform
import sys, getopt
assert sys.version_info >= (3, 5)
import csv

# Import Dependencies
get_ipython().run_line_magic('matplotlib', 'inline')

# Math Operations
import numpy as np
from math import pi

# Datetime
import datetime
from datetime import date
import time

# Data Preprocessing
import pandas as pd
import pandas_profiling as pp
import os
import re
import chardet
import random
from io import BytesIO
from pathlib import Path

# Reading Directories
import glob

# Working with JSON
import json
from pandas import json_normalize

# Data Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Requests and downloads
import tqdm
import requests

# Display in Jupyter
from IPython.display import Image, YouTubeVideo
from IPython.core.display import HTML

# Ignore Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Print message
print("Setup Complete")


# In[3]:


# Python / module versions used here for reference
print('Python: {}'.format(platform.python_version()))
print('NumPy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
print('matplotlib: {}'.format(mpl.__version__))


# ### Defined Filepaths

# In[4]:


# Set up initial paths to subfolders
base_dir = os.path.join('..', '..')
data_dir = os.path.join(base_dir, 'data')
data_dir_physical = os.path.join(base_dir, 'data', 'physical')
scripts_dir = os.path.join(base_dir, 'scripts')
models_dir = os.path.join(base_dir, 'models')
img_dir = os.path.join(base_dir, 'img')
fig_dir = os.path.join(base_dir, 'img', 'fig')


# ### Notebook Settings

# In[5]:


# Display all columns of displayed pandas DataFrames
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

# Suppress the SettingWithCopyWarning when assigning to DataFrame slices
pd.options.mode.chained_assignment = None


# ---
# 
# <a id='section2'></a>
# 
# ## 2. Notebook Brief
# This notebook parses and engineers an anonymised dataset of physical data, provided by [Watford F.C.](https://www.watfordfc.com/), using [pandas](http://pandas.pydata.org/).
# 
# **Notebook Conventions**:
# *    Variables that refer to a `DataFrame` object are prefixed with `df_`.
# *    Variables that refer to a collection of `DataFrame` objects (e.g., a list, a set or a dict) are prefixed with `dfs_`.
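# Before reading or writing any files, it can help to check that the expected data folders are in place. The cell below is a minimal sketch, not part of the original workflow, that creates the `raw` and `engineered` subdirectories of the physical data folder if they do not already exist; the folder names are assumed from the filepaths used elsewhere in this notebook.

# In[ ]:


# Ensure the assumed physical data subdirectories exist
# ('raw' for inputs, 'engineered' for outputs)
for sub_dir in ['raw', 'engineered']:
    os.makedirs(os.path.join(data_dir_physical, sub_dir), exist_ok=True)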
# ---
# 
# <a id='section3'></a>
# 
# ## 3. Data Sources
# 
# <a id='section3.1'></a>
# 
# ### 3.1. Introduction
# The physical data...
# 
# <a id='section3.2'></a>
# 
# ### 3.2. Read in the Datasets
# The provided `CSV` file is read in as a [pandas](https://pandas.pydata.org/) DataFrame.

# In[6]:


# Show files in the raw physical data directory
print(glob.glob(os.path.join(data_dir_physical, 'raw', '*.csv')))


# In[7]:


# Import CSV file as a pandas DataFrame
df_physical_raw = pd.read_csv(os.path.join(data_dir_physical, 'raw', 'Physical Output.csv'))


# <a id='section3.3'></a>
# 
# ### 3.3. Initial Data Handling
# First, check the quality of the dataset by looking at the first and last rows in pandas using the [`head()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [`tail()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods.

# In[8]:


# Display the first five rows of the DataFrame, df_physical_raw
df_physical_raw.head()


# In[10]:


# Display the last five rows of the DataFrame, df_physical_raw
df_physical_raw.tail()


# In[11]:


# Print the shape of the DataFrame, df_physical_raw
print(df_physical_raw.shape)


# In[12]:


# Print the column names of the DataFrame, df_physical_raw
print(df_physical_raw.columns)


# In[13]:


# Data types of the features of the raw DataFrame, df_physical_raw
df_physical_raw.dtypes


# Full details of these attributes and their data types are discussed further in the [Data Dictionary](#section3.2.2).

# In[14]:


# Display the data types of all columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_physical_raw.dtypes)


# In[16]:


# Info for the raw DataFrame, df_physical_raw
df_physical_raw.info()


# In[17]:


# Plot visualisation of the missing values for each feature of the raw DataFrame, df_physical_raw
msno.matrix(df_physical_raw, figsize = (30, 7))


# In[18]:


# Counts of missing values, showing only columns that contain nulls
null_value_stats = df_physical_raw.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]
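# For a fuller data-quality picture than `head()`, `info()` and the missing-value counts above, the `pandas_profiling` library, imported earlier as `pp` but not otherwise used in this notebook, can generate an exploratory report. The cell below is a minimal sketch assuming the `ProfileReport` API of pandas_profiling 2.x/3.x; the output filename is illustrative, and the cell is left commented out as profiling can be slow on wide DataFrames.

# In[ ]:


# A sketch: generate an exploratory data-quality report with pandas_profiling
# (imported above as pp). The output filename is illustrative, not original.
#profile = pp.ProfileReport(df_physical_raw, title='Physical Output - Data Profile', minimal=True)
#profile.to_file(os.path.join(fig_dir, 'physical_output_profile.html'))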
# ---
# 
# <a id='section4'></a>
# 
# ## 4. Data Engineering
# The next step is to wrangle the dataset into a format that's suitable for analysis.
# 
# This section is broken down into the following subsections:
# 
# 4.1.    [Assign Raw DataFrame to Engineered DataFrame](#section4.1)
# 4.2.    [Create Columns for 'Non-High Intensity Distance' per 5 Minutes](#section4.2)
# 4.3.    [Pivot Data](#section4.3)

# <a id='section4.1'></a>
# 
# ### 4.1. Assign Raw DataFrame to Engineered DataFrame

# In[19]:


# Assign Raw DataFrame to Engineered DataFrame
df_physical = df_physical_raw.copy()


# In[20]:


# Display the first five rows of the DataFrame, df_physical
df_physical.head()


# <a id='section4.2'></a>
# 
# ### 4.2. Create Columns for 'Non-High Intensity Distance' per 5 Minutes
# Non-High Intensity Distance (NHID) for each five-minute period is defined as that period's Total Distance (TD) minus its High Intensity Distance (HID).

# In[42]:


# Create a 'Non-High Intensity Distance' (NHID) column for each five-minute period

## Define the five-minute periods, including first- and second-half stoppage time
lst_periods = ['1-5', '6-10', '11-15', '16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '45+',
               '46-50', '51-55', '56-60', '61-65', '66-70', '71-75', '76-80', '81-85', '86-90', '90+']

## Create the NHID columns with a loop rather than twenty separate assignments
for period in lst_periods:
    df_physical[period + 'NHID'] = df_physical[period + 'TD'] - df_physical[period + 'HID']


# <a id='section4.3'></a>
# 
# ### 4.3. Pivot Data

# In[43]:


# Select columns of interest

## Define the identifying columns as a list
lst_id_cols = ['Match Date', 'Match', 'Home/Away', 'Team', 'Player Name', 'Minutes Played']

## Define the metric columns as a list: one TD, HID and NHID column per five-minute period
lst_metric_cols = [period + metric for metric in ['TD', 'HID', 'NHID'] for period in lst_periods]

## Filter DataFrame for just the columns of interest
df_physical_select = df_physical[lst_id_cols + lst_metric_cols]


# In[44]:


# Pivot the DataFrame from wide to long format, one row per player per period-metric
## NB: the value column is named 'Total Distance' but also holds the HID and NHID values
df_physical_pvt = pd.melt(df_physical_select,
                          id_vars=lst_id_cols,
                          var_name='Time Period',
                          value_name='Total Distance'
                         )


# In[45]:


# Display the first five rows of the DataFrame, df_physical_pvt
df_physical_pvt.head()


# In[37]:


# Print the shape of the DataFrame, df_physical_pvt
df_physical_pvt.shape
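# In the pivoted DataFrame, the `Time Period` column combines the five-minute period and the metric type in a single label (e.g. `1-5TD`, `45+HID`, `90+NHID`). The cell below is a hedged sketch, not part of the original workflow, that splits these labels into separate period and metric columns, which can make filtering and colouring easier in Tableau; the column names `Period` and `Metric` are illustrative.

# In[ ]:


# A sketch: split the combined 'Time Period' labels into a five-minute
# 'Period' and a 'Metric' type (TD, HID or NHID). NHID is matched first
# so that its 'HID' suffix is not captured by mistake.
df_physical_pvt_split = df_physical_pvt.copy()
df_physical_pvt_split['Metric'] = df_physical_pvt_split['Time Period'].str.extract(r'(NHID|HID|TD)$', expand=False)
df_physical_pvt_split['Period'] = df_physical_pvt_split['Time Period'].str.replace(r'(NHID|HID|TD)$', '', regex=True)
df_physical_pvt_split.head()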
# <a id='section4.4'></a>
# 
# ### 4.4. ...

# In[ ]:




# ---
# 
# <a id='section5'></a>
# 
# ## 5. Export Final DataFrames

# In[46]:


# Export DataFrames
#df_physical.to_csv(os.path.join(data_dir_physical, 'engineered', 'Physical Output.csv'), index=None, header=True)
#df_physical_pvt.to_csv(os.path.join(data_dir_physical, 'engineered', 'Physical Output Pivoted.csv'), index=None, header=True)


# ---
# 
# <a id='section6'></a>
# 
# ## 6. Summary
# This notebook engineered a dataset of physical data provided by [Watford F.C.](https://www.watfordfc.com/) using [pandas](http://pandas.pydata.org/).

# ---
# 
# <a id='section7'></a>
# 
# ## 7. Next Steps
# The next stage is to visualise this data in Tableau and analyse the findings, to be presented in a deck.

# ---
# 
# <a id='section8'></a>
# 
# ## 8. References
# *    ...
# *    ...
# *    ...

# ---
# 
# ***Visit my website [eddwebster.com](https://www.eddwebster.com) or my [GitHub Repository](https://github.com/eddwebster) for more projects. If you'd like to get in contact, my Twitter handle is [@eddwebster](http://www.twitter.com/eddwebster) and my email is: edd.j.webster@gmail.com.***
# 
# [Back to the top](#top)