Notebook first written: 02/09/2021
Notebook last updated: 04/09/2021
This notebook engineers transfer data data from Club ELO, and manipulates this landed data as DataFrames using pandas and matplotlib for visualisation.
For more information about this notebook and the author, I'm available through all the following channels:
The accompanying GitHub repository for this notebook can be found here and a static version of this notebook can be found here.
This notebook was written using Python 3 and requires the following libraries:
Jupyter notebooks
for this notebook environment with which this project is presented;NumPy
for multidimensional array computing;pandas
for data analysis and manipulation; andmatplotlib
for data visualisations.All packages used for this notebook except for BeautifulSoup can be obtained by downloading and installing the Conda distribution, available on all platforms (Windows, Linux and Mac OSX). Step-by-step guides on how to install Anaconda can be found for Windows here and Mac here, as well as in the Anaconda documentation itself here.
# Python ≥3.5 (ideally)
import platform
import sys, getopt
assert sys.version_info >= (3, 5)
import csv
# Import Dependencies
%matplotlib inline
# Math Operations
import numpy as np
import math
from math import pi
# Datetime
import datetime
from datetime import date, timedelta
import time
# Data Preprocessing
import pandas as pd
#import pandas_profiling as pp
import os
import re
import random
from io import BytesIO, StringIO
from pathlib import Path
# Reading directories
import glob
import os
from os.path import basename
# Flatten lists
from functools import reduce
# Working with JSON
import json
from pandas.io.json import json_normalize
# Web Scraping
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import re
# Currency Convert
from forex_python.converter import CurrencyRates
# Data Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
import missingno as msno
# Progress Bar
from tqdm import tqdm
# Display in Jupyter
from IPython.display import Image, YouTubeVideo
from IPython.core.display import HTML
# Ignore Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
print('Setup Complete')
Setup Complete
# Python / module versions used here for reference
print('Python: {}'.format(platform.python_version()))
print('NumPy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
print('matplotlib: {}'.format(mpl.__version__))
Python: 3.7.6 NumPy: 1.20.3 pandas: 1.3.2 matplotlib: 3.4.2
# Set up initial paths to subfolders
base_dir = os.path.join('..', '..', )
data_dir = os.path.join(base_dir, 'data')
data_dir_elo = os.path.join(base_dir, 'data', 'elo')
img_dir = os.path.join(base_dir, 'img')
fig_dir = os.path.join(base_dir, 'img', 'fig')
video_dir = os.path.join(base_dir, 'video')
# Defined Variables
## Define today's date
today = datetime.datetime.now().strftime('%d/%m/%Y').replace('/', '')
# Defined Dictionaries
## Define seasons
dict_seasons = {1975: '1975/1976',
1976: '1976/1977',
1977: '1977/1978',
1978: '1978/1979',
1979: '1979/1980',
1980: '1980/1981',
1981: '1981/1982',
1982: '1982/1983',
1983: '1983/1984',
1984: '1984/1985',
1985: '1985/1986',
1986: '1986/1987',
1987: '1987/1988',
1988: '1988/1989',
1989: '1989/1990',
1990: '1990/1991',
1991: '1991/1992',
1992: '1992/1993',
1993: '1993/1994',
1994: '1994/1995',
1995: '1995/1996',
1996: '1996/1997',
1997: '1997/1998',
1998: '1998/1999',
1999: '1999/2000',
2000: '2000/2001',
2001: '2001/2002',
2002: '2002/2003',
2003: '2003/2004',
2004: '2004/2005',
2005: '2005/2006',
2006: '2006/2007',
2007: '2007/2008',
2008: '2008/2009',
2009: '2009/2010',
2010: '2010/2011',
2011: '2011/2012',
2012: '2012/2013',
2013: '2013/2014',
2014: '2014/2015',
2015: '2015/2016',
2016: '2016/2017',
2017: '2017/2018',
2018: '2018/2019',
2019: '2019/2020',
2020: '2020/2021',
2021: '2021/2022',
2022: '2022/2023',
2023: '2023/2024',
}
# Defined Lists
## Define list of folders
lst_folders = ['raw', 'engineered']
## Define list of data types
lst_data_types = ['ratings_by_date', 'ratings_by_team']
## Define lists of Teams and Countries
df_elo_teams_leagues = pd.read_csv(data_dir + '/reference/teams/elo_team_leagues_latest.csv')
### Countries
lst_countries = list(df_elo_teams_leagues['Country'].unique())
### 'Big 5' European League Countries
lst_countries_big5 = ['ENG', 'ESP', 'FRA', 'GER', 'ITA']
### Teams
lst_teams = list(df_elo_teams_leagues['Club'].unique())
# Define retry requests session function - taken from: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
def requests_retry_session(
retries=3,
backoff_factor=0.3,
status_forcelist=(500, 502, 504),
session=None,
):
session = session or requests.Session()
retry = Retry(
total=retries,
read=retries,
connect=retries,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
return session
# Define function for scraping the Elo rating, team-by-team
def get_elo_team_ratings():
"""
Function to scrape the Elo ratings of teams, team-by-team.
"""
## Start timer
tic = datetime.datetime.now()
## Print time scraping started
print(f'Scraping started at: {tic}')
## Define list of countries
lst_countries =list(df_elo_teams_leagues['Country'].unique())
## Loop through countries
for country in lst_countries:
### Print statement
print(f'Scraping started for Team Elo data for {country}...')
df_temp = df_elo_teams_leagues[df_elo_teams_leagues['Country']==country]
lst_teams = list(df_temp['Club'].unique())
### Loop through teams
for team in lst_teams:
#### Define team name used when saving to folder (Snailcase)
team_folder = team.replace(' ', '_')
#### Define team name for use with URL (lowercase, no spaces)
team_stripped = team.lower().strip().replace(' ', '')
#### Save Player URL List (if not already saved)
if not os.path.exists(os.path.join(data_dir_elo + f'/raw/ratings_by_team/{country}/{team_folder}_{country}_elo_rating_latest.csv')):
##### Scraping
###### Print statement
print(f'Scraping started for Team Elo data {team} in {country}...')
###### Define URL
url = f'http://api.clubelo.com/{team_stripped}'
###### Call requests_retry_session function, defined in previous command
response = requests_retry_session().get(url)
#print(response.status_code)
######
s = requests.Session()
s.auth = ('user', 'pass')
s.headers.update({'x-test': 'true'})
#####
response = requests_retry_session(session=s).get(url)
######
data = StringIO(response .text)
df = pd.read_csv(data, sep=',')
###### Data Engineering
####### Convert 'From' and 'To' to datatime
df['From'] = pd.to_datetime(df['From'])
df['To'] = pd.to_datetime(df['To'])
####### Drop duplicate rows
df = df.drop_duplicates()
##### Save DataFrame
###### Save latest version
df.to_csv(data_dir_elo + f'/raw/ratings_by_team/{country}/{team_folder}_{country}_elo_rating_latest.csv', index=None, header=True)
###### Export a copy to the 'archive' subfolder, including the date
df.to_csv(data_dir_elo + f'/raw/ratings_by_team/{country}/archive/{team_folder}_{country}_elo_rating_last_updated_{today}.csv', index=None, header=True)
##### Print statement for team
print(f'Team Elo rating data for {team} scraped and saved.')
##### If Team Elo rating already saved
else:
###### Print statement
print(f'Team Elo rating data for {team} already saved as a CSV file.')
## End timer
toc = datetime.datetime.now()
## Print time scraping ended
print(f'Scraping ended at: {toc}')
## Calculate time take
total_time = (toc-tic).total_seconds()
print(f'Time taken to scrape the Team Elo for {len(teams)} teams in {len(countries)} countries is: {total_time/60:0.2f} minutes.')
## Unify individual CSV files as a single DataFrame
### Show files in directory
all_files = glob.glob(os.path.join(data_dir_elo + f'/raw/ratings_by_team/*/*_elo_rating_latest.csv'))
### Create an empty list of Players URLs
lst_team_elo_ratings_all = []
### Loop through list of files and read into temporary DataFrames
for filename in all_files:
df_temp = pd.read_csv(filename, index_col=None, header=0)
lst_team_elo_ratings_all.append(df_temp)
### Concatenate the files into a single DataFrame
df_elo_teams_all = pd.concat(lst_team_elo_ratings_all, axis=0, ignore_index=True)
### Reset index
df_elo_teams_all = df_elo_teams_all.reset_index(drop=True)
### Sort DataFrame
df_elo_teams_all = df_elo_teams_all.sort_values(['Country', 'Club', 'To'], ascending=[True, True, True])
## Export DataFrame
### Save latest version
df_elo_teams_all.to_csv(data_dir_elo + f'/raw/ratings_by_team/elo_team_ratings_combined_latest.csv', index=None, header=True)
### Save a copy to archive folder (dated)
df_elo_teams_all.to_csv(data_dir_elo + f'/raw/ratings_by_team/archive/elo_team_ratings_combined_last_updated_{today}.csv', index=None, header=True)
## Distinct number of clubs and countries
total_countries = df_elo_teams_all['Country'].nunique()
total_clubs = df_elo_teams_all['Club'].nunique()
## Print statement
print(f'Combined Elo DataFrame contains {total_countries} countries and {total_clubs} clubs.')
## Return final list of Player URLs
return(df_elo_teams_all)
Create folders and subfolders for data, if not already created.
# Make the data directory structure
for folder in lst_folders:
path = os.path.join(data_dir_elo, folder)
if not os.path.exists(path):
os.mkdir(path)
for data_types in lst_data_types:
path = os.path.join(data_dir_elo, folder, data_types)
if not os.path.exists(path):
os.mkdir(path)
for country in lst_countries:
path = os.path.join(data_dir_elo, folder, data_types, country)
if not os.path.exists(path):
os.mkdir(path)
os.mkdir(os.path.join(path, 'archive'))
# Display all columns of pandas DataFrames
pd.set_option('display.max_columns', None)
This Jupyter notebook is part of a series of notebooks to scrape, parse, engineer, unify, and the model, culminating in a an Expected Transfer (xTransfer) player performance vs. valuation model. This model aims to determine the under- and over-performing players based on their on-the-pitch output against transfer fee and wages.
This particular notebook is one of several data parsing notebooks, that cleans team rating data from Club ELO using pandas.
This notebook, along with the other notebooks in this project workflow are shown in the following diagram:
Links to these notebooks in the football_analytics
GitHub repository can be found at the following:
Before conducting our EDA, the data needs to be imported as a DataFrame in the Data Sources section Section 3 and cleaned in the Data Engineering section Section 4.
We'll be using the pandas library to import our data to this workbook as a DataFrame.
The Club ELO dataset has eight features (columns) with the following definitions and data types:
Feature | Data type |
---|---|
Rank |
object |
Club |
object |
Country |
object |
Level |
object |
Elo |
object |
From |
object |
To |
object |
Potential methodology to approach this problem
## Unify individual CSV files as a single DataFrame
### Show files in directory
all_files = glob.glob(os.path.join(data_dir_elo + '/raw/ratings_by_date/*/*.csv'))
### Create an empty list of Players URLs
lst_team_elo_rating_all = []
### Loop through list of files and read into temporary DataFrames
for filename in all_files:
df_temp = pd.read_csv(filename, index_col=None, header=0)
lst_team_elo_rating_all.append(df_temp)
### Concatenate the files into a single DataFrame
df_elo_team_ratings_all = pd.concat(lst_team_elo_rating_all, axis=0, ignore_index=True)
## Reset index
df_elo_team_ratings_all = df_elo_team_ratings_all.reset_index(drop=True)
## Sort DataFrame
df_elo_team_ratings_all = df_elo_team_ratings_all.sort_values(['Country', 'Club'], ascending=[True, True])
## Export DataFrame
### Save latest version
df_elo_team_ratings_all.to_csv(data_dir_elo + '/raw/ratings_by_date/elo_team_rating_by_date_combined_latest.csv', index=None, header=True)
### Save a copy to archive folder (dated)
df_elo_team_ratings_all.to_csv(data_dir_elo + f'/raw/ratings_by_date/archive/elo_team_rating_by_date_combined_last_updated_{today}.csv', index=None, header=True)
## Distinct number of players
total_teams = df_elo_team_ratings_all['Club'].nunique()
## Print statement
print(f'The Team ELO Rating by Date DataFrame contains {total_teams:,} teams.')
The Team ELO Rating by Date DataFrame contains 1,497 teams.
df_elo_team_ratings_all
Rank | Club | Country | Level | Elo | From | To | |
---|---|---|---|---|---|---|---|
5696 | None | Besa Kavaje | ALB | 0 | 1142.927246 | 2007-08-31 | 2008-07-01 |
5697 | None | Besa Kavaje | ALB | 0 | 1137.801514 | 2007-07-26 | 2007-08-02 |
5698 | None | Besa Kavaje | ALB | 0 | 1099.695801 | 2010-07-23 | 2011-06-30 |
5699 | None | Besa Kavaje | ALB | 0 | 1099.695801 | 2010-07-23 | 2011-06-30 |
5700 | None | Besa Kavaje | ALB | 0 | 1142.927246 | 2007-08-31 | 2008-07-01 |
... | ... | ... | ... | ... | ... | ... | ... |
133286 | None | The New Saints | WAL | 0 | 1088.250854 | 2001-07-26 | 2001-08-01 |
133885 | None | The New Saints | WAL | 0 | 1064.529297 | 2002-08-30 | 2003-07-16 |
134476 | None | The New Saints | WAL | 0 | 1068.116089 | 2002-07-25 | 2002-08-15 |
135076 | None | The New Saints | WAL | 0 | 1055.741821 | 2003-08-29 | 2004-07-14 |
135672 | None | The New Saints | WAL | 0 | 1056.635864 | 2003-07-24 | 2003-08-14 |
135700 rows × 7 columns
# Create DataFrame of unique clubs and countries for reference
## Define columns of interest
lst_cols = ['Club', 'Country']
## Select columns of interest
df_elo_teams_leagues = df_elo_team_ratings_all[lst_cols]
# Drop duplicates
df_elo_teams_leagues = df_elo_teams_leagues.drop_duplicates()
# Sort DataFrame by Country/Club
df_elo_teams_leagues = df_elo_teams_leagues.sort_values(['Country', 'Club'], ascending=[True, True])
# Reset index
df_elo_teams_leagues = df_elo_teams_leagues.reset_index(drop=True)
# Export DataFrame
if not os.path.exists(os.path.join(data_dir + f'/reference/teams/archive/elo_team_leagues_last_updated_{today}.csv')):
### Save latest version
df_elo_teams_leagues.to_csv(data_dir + '/reference/teams/elo_team_leagues_latest.csv', index=None, header=True)
### Save a copy to archive folder (dated)
df_elo_teams_leagues.to_csv(data_dir + f'/reference/teams/archive/elo_team_leagues_last_updated_{today}.csv', index=None, header=True)
else:
df_elo_teams_leagues = pd.read_csv(data_dir + '/reference/teams/elo_team_leagues_latest.csv')
print('Data already saved previously')
# Display DataFrame
df_elo_teams_leagues.head()
Club | Country | |
---|---|---|
0 | Besa Kavaje | ALB |
1 | Bylis Ballsh | ALB |
2 | Dinamo Tirana | ALB |
3 | Elbasani | ALB |
4 | Flamurtari | ALB |
df_elo_teams_leagues.shape
## All unique Teams
lst_teams = list(df_elo_teams_leagues['Club_Trimmed'].unique())
len(lst_teams)
1497
## All unique Countries
lst_countries = list(df_elo_teams_leagues['Country'].unique())
len(lst_countries)
56
Format of scraping data: 'api.clubelo.com/YYYY-MM-DD' e.g. https://api.clubelo.com/2021-09-02.
# Create list of dates for the last number of defined years
## Define today's date
today = datetime.datetime.now().strftime('%d/%m/%Y').replace('/', '')
## Create an empty list for scraped player bio data to be appended
lst_dates = []
## Define start and end date
sdate = date(2016, 8, 1) # start date
edate = date(2021, 9, 2) # end date ## CHANGE TO TODAY'S DATE
## Determine the difference between the two dates
delta = edate - sdate # as timedelta
##
for i in range(delta.days + 1):
day = sdate + timedelta(days=i)
day = day.strftime('%Y/%m/%d').replace('/', '')
lst_dates.append(day)
len(lst_dates)
1859
Data scraped for individual teams and then saved before being combined to a unified DataFrame, ready for Data Engineering, using the get_elo_team_ratings()
defined in the Custom Functions section of the notebook. This function incorporates ClubElo's API [link] which has a 'One club's rating history' feature, that provides the ranking history for an individual club, going back to the mid forties (note however that Club Elo states that values before 1960 should be considered provisional).
The format for obtaining a team's ranking is: api.clubelo.com/CLUBNAME i.e. api.clubelo.com/mancity.
df_elo_team_ratings_all_raw = get_elo_team_ratings()
Scraping started at: 2021-09-04 18:21:20.001197 Scraping started for Team Elo data for ALB... Team Elo rating data for Besa Kavaje already saved as a CSV file. Team Elo rating data for Bylis Ballsh already saved as a CSV file. Team Elo rating data for Dinamo Tirana already saved as a CSV file. Team Elo rating data for Elbasani already saved as a CSV file. Team Elo rating data for Flamurtari already saved as a CSV file. Team Elo rating data for Kukesi already saved as a CSV file. Team Elo rating data for Laci already saved as a CSV file. Team Elo rating data for Luftetari already saved as a CSV file. Team Elo rating data for Partizani Tirana already saved as a CSV file. Team Elo rating data for Skenderbeu already saved as a CSV file. Team Elo rating data for Teuta Durres already saved as a CSV file. Team Elo rating data for Tirana already saved as a CSV file. Team Elo rating data for Tomori Berat already saved as a CSV file. Team Elo rating data for Vllaznia already saved as a CSV file. Scraping started for Team Elo data for AND... Team Elo rating data for CE Principat already saved as a CSV file. Team Elo rating data for Constelacio Esportiva already saved as a CSV file. Team Elo rating data for Encamp already saved as a CSV file. Team Elo rating data for Engordany already saved as a CSV file. Team Elo rating data for Escaldes already saved as a CSV file. Team Elo rating data for Escale already saved as a CSV file. Team Elo rating data for FC Rangers already saved as a CSV file. Team Elo rating data for Lusitanos already saved as a CSV file. Team Elo rating data for Matecosa Sant Julia already saved as a CSV file. Team Elo rating data for Santa Coloma already saved as a CSV file. Scraping started for Team Elo data for ARM... Team Elo rating data for Alashkert already saved as a CSV file. Team Elo rating data for Ararat already saved as a CSV file. Team Elo rating data for Ararat-Armenia already saved as a CSV file. Team Elo rating data for Banants already saved as a CSV file. Team Elo rating data for FC Yerevan already saved as a CSV file. Team Elo rating data for Gandzasar Kapan already saved as a CSV file. Team Elo rating data for MIKA already saved as a CSV file. Team Elo rating data for Noah already saved as a CSV file. Team Elo rating data for Pyunik already saved as a CSV file. Team Elo rating data for Shirak already saved as a CSV file. Team Elo rating data for Spartak Yerevan already saved as a CSV file. Team Elo rating data for Ulisses already saved as a CSV file. Team Elo rating data for Zvartnots already saved as a CSV file. Scraping started for Team Elo data for AUT... Team Elo rating data for Admira already saved as a CSV file. Team Elo rating data for Altach already saved as a CSV file. Team Elo rating data for Austria Kaernten already saved as a CSV file. Team Elo rating data for Austria Wien already saved as a CSV file. Team Elo rating data for Bregenz already saved as a CSV file. Team Elo rating data for FCK Kaernten already saved as a CSV file. Team Elo rating data for GAK already saved as a CSV file. Team Elo rating data for Groedig already saved as a CSV file. Team Elo rating data for Hartberg already saved as a CSV file. Team Elo rating data for Kapfenberg already saved as a CSV file. Team Elo rating data for Klagenfurt already saved as a CSV file. Team Elo rating data for LASK already saved as a CSV file. Team Elo rating data for Lustenau already saved as a CSV file. Team Elo rating data for Mattersburg already saved as a CSV file. Team Elo rating data for Pasching already saved as a CSV file. Team Elo rating data for Rapid Wien already saved as a CSV file. Team Elo rating data for Ried already saved as a CSV file. Team Elo rating data for Salzburg already saved as a CSV file. Team Elo rating data for St Poelten already saved as a CSV file. Team Elo rating data for Sturm Graz already saved as a CSV file. Team Elo rating data for Wacker Innsbruck already saved as a CSV file. Team Elo rating data for Wattens already saved as a CSV file. Team Elo rating data for Wiener Neustadt already saved as a CSV file. Team Elo rating data for Wolfsberg already saved as a CSV file. Scraping started for Team Elo data for AZE... Team Elo rating data for AZAL Baku already saved as a CSV file. Team Elo rating data for FK Baku already saved as a CSV file. Team Elo rating data for FK Karvan Evlakh already saved as a CSV file. Team Elo rating data for Gabala already saved as a CSV file. Team Elo rating data for Inter Baku already saved as a CSV file. Team Elo rating data for Kapaz Ganca already saved as a CSV file. Team Elo rating data for Karabakh Agdam already saved as a CSV file. Team Elo rating data for Khazar already saved as a CSV file. Team Elo rating data for MKT Araz Imishli already saved as a CSV file. Team Elo rating data for Neftchi already saved as a CSV file. Team Elo rating data for Sabail already saved as a CSV file. Team Elo rating data for Shafa already saved as a CSV file. Team Elo rating data for Shamkir already saved as a CSV file. Team Elo rating data for Simurq Zaqatala already saved as a CSV file. Team Elo rating data for Sumqayit already saved as a CSV file. Team Elo rating data for Zira already saved as a CSV file. Scraping started for Team Elo data for BEL... Team Elo rating data for Aalst already saved as a CSV file. Team Elo rating data for Anderlecht already saved as a CSV file. Team Elo rating data for Antwerp already saved as a CSV file. Team Elo rating data for Beerschot AC already saved as a CSV file. Team Elo rating data for Bergen already saved as a CSV file. Team Elo rating data for Beveren already saved as a CSV file. Team Elo rating data for Brugge already saved as a CSV file. Team Elo rating data for Cercle Brugge already saved as a CSV file. Team Elo rating data for Charleroi already saved as a CSV file. Team Elo rating data for Dender already saved as a CSV file. Team Elo rating data for Eupen already saved as a CSV file. Team Elo rating data for FC Brussels already saved as a CSV file. Team Elo rating data for Geel already saved as a CSV file. Team Elo rating data for Genk already saved as a CSV file. Team Elo rating data for Gent already saved as a CSV file. Team Elo rating data for Harelbeke already saved as a CSV file. Team Elo rating data for Heusden Zolder already saved as a CSV file. Team Elo rating data for Kortrijk already saved as a CSV file. Team Elo rating data for Leuven already saved as a CSV file. Team Elo rating data for Lierse already saved as a CSV file. Team Elo rating data for Lokeren already saved as a CSV file. Team Elo rating data for Lommel already saved as a CSV file. Team Elo rating data for Louvieroise already saved as a CSV file. Team Elo rating data for Mechelen already saved as a CSV file. Team Elo rating data for Molenbeek already saved as a CSV file. Team Elo rating data for Mouscron already saved as a CSV file. Team Elo rating data for Oostende already saved as a CSV file. Team Elo rating data for Roeselare already saved as a CSV file. Team Elo rating data for Seraing already saved as a CSV file. Team Elo rating data for St Gillis already saved as a CSV file. Team Elo rating data for St Truiden already saved as a CSV file. Team Elo rating data for Standard already saved as a CSV file. Team Elo rating data for Tubize already saved as a CSV file. Team Elo rating data for Westerlo already saved as a CSV file. Team Elo rating data for Zulte Waregem already saved as a CSV file. Scraping started for Team Elo data for BHZ... Team Elo rating data for Borac Banja Luka already saved as a CSV file. Team Elo rating data for Brotnjo Citluk already saved as a CSV file. Team Elo rating data for Buduchnost Banovici already saved as a CSV file. Team Elo rating data for FK Sarajevo already saved as a CSV file. Team Elo rating data for Leotar Trebinje already saved as a CSV file. Team Elo rating data for Modrica Maksima already saved as a CSV file. Team Elo rating data for Olimpik Sarajevo already saved as a CSV file. Team Elo rating data for Orasje already saved as a CSV file. Team Elo rating data for Radnik Bijeljina already saved as a CSV file. Team Elo rating data for Siroki Brijeg already saved as a CSV file. Team Elo rating data for Slavija Istocno already saved as a CSV file. Team Elo rating data for Sloboda Tuzla already saved as a CSV file. Team Elo rating data for Velez Mostar already saved as a CSV file. Team Elo rating data for Zeljeznicar already saved as a CSV file. Team Elo rating data for Zepce Limorad already saved as a CSV file. Team Elo rating data for Zrinjski Mostar already saved as a CSV file. Scraping started for Team Elo data for BLR... Team Elo rating data for BATE already saved as a CSV file. Team Elo rating data for Bobruisk already saved as a CSV file. Team Elo rating data for Darida already saved as a CSV file. Team Elo rating data for Dinamo Brest already saved as a CSV file. Team Elo rating data for Dinamo Minsk already saved as a CSV file. Team Elo rating data for Dnepr Mogilev already saved as a CSV file. Team Elo rating data for FK Minsk already saved as a CSV file. Team Elo rating data for FK Vitebsk already saved as a CSV file. Team Elo rating data for Gomel already saved as a CSV file. Team Elo rating data for Gorodeya already saved as a CSV file. Team Elo rating data for Granit Mikashevichy already saved as a CSV file. Team Elo rating data for Isloch already saved as a CSV file. Team Elo rating data for Kamunalnik already saved as a CSV file. Team Elo rating data for Krumkachy already saved as a CSV file. Team Elo rating data for Lida already saved as a CSV file. Team Elo rating data for Lok Minsk already saved as a CSV file. Team Elo rating data for Luch Minsk already saved as a CSV file. Team Elo rating data for MTZ-RIPO already saved as a CSV file. Team Elo rating data for Maladzyechna already saved as a CSV file. Team Elo rating data for Neman Grodno already saved as a CSV file. Team Elo rating data for Novopolotsk already saved as a CSV file. Team Elo rating data for Rukh Brest already saved as a CSV file. Team Elo rating data for Savit already saved as a CSV file. Team Elo rating data for Slavia Mozyr already saved as a CSV file. Team Elo rating data for Slutsk already saved as a CSV file. Team Elo rating data for Smolevichi already saved as a CSV file. Team Elo rating data for Smorgon already saved as a CSV file. Team Elo rating data for Soligorsk already saved as a CSV file. Team Elo rating data for Sputnik already saved as a CSV file. Team Elo rating data for Svisloch-Krovlya already saved as a CSV file. Team Elo rating data for Torpedo Minsk already saved as a CSV file. Team Elo rating data for Torpedo Mogilev already saved as a CSV file. Team Elo rating data for Torpedo Zhodino already saved as a CSV file. Team Elo rating data for Vedrych already saved as a CSV file. Team Elo rating data for Zvyazda BDU already saved as a CSV file. Scraping started for Team Elo data for BUL... Team Elo rating data for Akademik Sofia already saved as a CSV file. Team Elo rating data for Arda already saved as a CSV file. Team Elo rating data for Belasitsa Petrich already saved as a CSV file. Team Elo rating data for Beroe Stara Zagora already saved as a CSV file. Team Elo rating data for Blagoevgrad already saved as a CSV file. Team Elo rating data for Botev Plovdiv already saved as a CSV file. Team Elo rating data for Botev Vratsa already saved as a CSV file. Team Elo rating data for CSKA 1948 Sofia already saved as a CSV file. Team Elo rating data for CSKA Sofia already saved as a CSV file. Team Elo rating data for Cherno More already saved as a CSV file. Team Elo rating data for Chernomorets Burgas already saved as a CSV file. Team Elo rating data for Dobrudzha already saved as a CSV file. Team Elo rating data for Dunav Ruse already saved as a CSV file. Team Elo rating data for Etar already saved as a CSV file. Team Elo rating data for Gotse Delchev already saved as a CSV file. Team Elo rating data for Haskovo already saved as a CSV file. Team Elo rating data for Hebar already saved as a CSV file. Team Elo rating data for Kaliakra Kavarna already saved as a CSV file. Team Elo rating data for Levski already saved as a CSV file. Team Elo rating data for Litex already saved as a CSV file. Team Elo rating data for Lok Gorna already saved as a CSV file. Team Elo rating data for Lok Plovdiv already saved as a CSV file. Team Elo rating data for Lok Sofia already saved as a CSV file. Team Elo rating data for Lyubimets already saved as a CSV file. Team Elo rating data for Marek Dupnitza already saved as a CSV file. Team Elo rating data for Mezdra already saved as a CSV file. Team Elo rating data for Minyor Pernik already saved as a CSV file. Team Elo rating data for Montana 1921 already saved as a CSV file. Team Elo rating data for N Burgas already saved as a CSV file. Team Elo rating data for Nesebar already saved as a CSV file. Team Elo rating data for Pirin 1922 already saved as a CSV file. Team Elo rating data for Razgrad already saved as a CSV file. Team Elo rating data for Rodopa Smolian already saved as a CSV file. Team Elo rating data for Samakov already saved as a CSV file. Team Elo rating data for Septemvri Sofia already saved as a CSV file. Team Elo rating data for Sevlievo already saved as a CSV file. Team Elo rating data for Shumen already saved as a CSV file. Team Elo rating data for Slavia Sofia already saved as a CSV file. Team Elo rating data for Sliven already saved as a CSV file. Team Elo rating data for Spartak Pleven already saved as a CSV file. Team Elo rating data for Spartak Varna already saved as a CSV file. Team Elo rating data for Svetkavitsa already saved as a CSV file. Team Elo rating data for Svoge already saved as a CSV file. Team Elo rating data for Tsarsko Selo already saved as a CSV file. Team Elo rating data for Velbazhd already saved as a CSV file. Team Elo rating data for Vereya already saved as a CSV file. Team Elo rating data for Vihren Sandanski already saved as a CSV file. Team Elo rating data for Vitosha Bistritsa already saved as a CSV file. Scraping started for Team Elo data for CRO... Team Elo rating data for Cakovec already saved as a CSV file. Team Elo rating data for Cibalia Vinkovci already saved as a CSV file. Team Elo rating data for Croatia Sesvete already saved as a CSV file. Team Elo rating data for Dinamo Zagreb already saved as a CSV file. Team Elo rating data for Dragovoljac already saved as a CSV file. Team Elo rating data for HNK Gorica already saved as a CSV file. Team Elo rating data for Hajduk already saved as a CSV file. Team Elo rating data for Inter Zapresic already saved as a CSV file. Team Elo rating data for Istra already saved as a CSV file. Team Elo rating data for Istra 1961 already saved as a CSV file. Team Elo rating data for Kamen Ingrad already saved as a CSV file. Team Elo rating data for Karlovac already saved as a CSV file. Team Elo rating data for Lok Zagreb already saved as a CSV file. Team Elo rating data for Lucko already saved as a CSV file. Team Elo rating data for Marsonia already saved as a CSV file. Team Elo rating data for Medjimurje already saved as a CSV file. Team Elo rating data for NK Zagreb already saved as a CSV file. Team Elo rating data for Osijek already saved as a CSV file. Team Elo rating data for Pomorac already saved as a CSV file. Team Elo rating data for Rijeka already saved as a CSV file. Team Elo rating data for Rudes Zagreb already saved as a CSV file. Team Elo rating data for Sibenik already saved as a CSV file. Team Elo rating data for Slaven Belupo already saved as a CSV file. Team Elo rating data for Split 1912 already saved as a CSV file. Team Elo rating data for Topolovac already saved as a CSV file. Team Elo rating data for Varazdin already saved as a CSV file. Team Elo rating data for Varteks already saved as a CSV file. Team Elo rating data for Vukovar 91 already saved as a CSV file. Team Elo rating data for Zadar already saved as a CSV file. Scraping started for Team Elo data for CSR... Scraping started for Team Elo data Artmedia in CSR...
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) /var/folders/d7/wvbp_b411h75p3zvbmrvql080000gn/T/ipykernel_81883/3656556382.py in <module> ----> 1 df_elo_team_ratings_all_raw = get_elo_team_ratings() /var/folders/d7/wvbp_b411h75p3zvbmrvql080000gn/T/ipykernel_81883/2911065457.py in get_elo_team_ratings() 75 76 ###### Save latest version ---> 77 df.to_csv(data_dir_elo + f'/raw/ratings_by_team/{country}/{team_folder}_{country}_elo_rating_latest.csv', index=None, header=True) 78 79 ###### Export a copy to the 'archive' subfolder, including the date /opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, decimal, errors, storage_options) 3480 doublequote=doublequote, 3481 escapechar=escapechar, -> 3482 storage_options=storage_options, 3483 ) 3484 /opt/anaconda3/lib/python3.7/site-packages/pandas/io/formats/format.py in to_csv(self, path_or_buf, encoding, sep, columns, index_label, mode, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, errors, storage_options) 1103 formatter=self.fmt, 1104 ) -> 1105 csv_formatter.save() 1106 1107 if created_buffer: /opt/anaconda3/lib/python3.7/site-packages/pandas/io/formats/csvs.py in save(self) 241 errors=self.errors, 242 compression=self.compression, --> 243 storage_options=self.storage_options, 244 ) as handles: 245 /opt/anaconda3/lib/python3.7/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 704 encoding=ioargs.encoding, 705 errors=errors, --> 706 newline="", 707 ) 708 else: FileNotFoundError: [Errno 2] No such file or directory: '../../data/elo/raw/ratings_by_team/CSR/Artmedia_CSR_elo_rating_latest.csv'
Let's quality of the dataset by looking first and last rows in pandas using the head() and tail() methods.
Initial step of the data handling and Exploratory Data Analysis (EDA) is to create a quick summary report of the dataset using pandas Profiling Report.
# Summary of the data using pandas Profiling Report
#pp.ProfileReport(df_elo_team_ratings_all_raw)
The following commands go into more bespoke summary of the dataset. Some of the commands include content covered in the pandas Profiling summary above, but using the standard pandas functions and methods that most peoplem will be more familiar with.
First check the quality of the dataset by looking first and last rows in pandas using the head() and tail() methods.
# Display the first five rows of the raw DataFrame, df_elo_team_ratings_all_raw
df_elo_team_ratings_all_raw.head()
Rank | Club | Country | Level | Elo | From | To | |
---|---|---|---|---|---|---|---|
1203829 | None | Besa Kavaje | ALB | 0 | 1325.261230 | 1972-07-01 | 1972-09-13 |
1203830 | None | Besa Kavaje | ALB | 0 | 1337.468506 | 1972-09-14 | 1972-09-27 |
1203831 | None | Besa Kavaje | ALB | 0 | 1338.146362 | 1972-09-28 | 1972-10-25 |
1203832 | None | Besa Kavaje | ALB | 0 | 1335.007324 | 1972-10-26 | 1972-11-08 |
1203833 | None | Besa Kavaje | ALB | 0 | 1334.845581 | 1972-11-09 | 1978-07-01 |
# Display the last five rows of the raw DataFrame, df_elo_team_ratings_all_raw
df_elo_team_ratings_all_raw.tail()
Rank | Club | Country | Level | Elo | From | To | |
---|---|---|---|---|---|---|---|
1804701 | None | Zlatibor | YUG | 1 | 1306.602783 | 1992-04-30 | 1992-05-03 |
1804702 | None | Zlatibor | YUG | 1 | 1312.926880 | 1992-05-04 | 1992-05-17 |
1804703 | None | Zlatibor | YUG | 1 | 1312.077393 | 1992-05-18 | 1992-05-24 |
1804704 | None | Zlatibor | YUG | 1 | 1331.334717 | 1992-05-25 | 1992-06-29 |
1804705 | None | Zlatibor | YUG | 1 | 1331.334717 | 1992-06-30 | 1995-07-01 |
# Print the shape of the raw DataFrame, df_elo_team_ratings_all_raw
print(df_elo_team_ratings_all_raw.shape)
(1854899, 7)
# Print the column names of the raw DataFrame, df_elo_team_ratings_all_raw
print(df_elo_team_ratings_all_raw.columns)
Index(['Rank', 'Club', 'Country', 'Level', 'Elo', 'From', 'To'], dtype='object')
The dataset has twelve features (columns).
# Data types of the features of the raw DataFrame, df_elo_team_ratings_all_raw
df_elo_team_ratings_all_raw.dtypes
Rank object Club object Country object Level int64 Elo float64 From object To object dtype: object
# Info for the raw DataFrame, df_elo_team_ratings_all_raw
df_elo_team_ratings_all_raw.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1854899 entries, 1203829 to 1804705 Data columns (total 7 columns): # Column Dtype --- ------ ----- 0 Rank object 1 Club object 2 Country object 3 Level int64 4 Elo float64 5 From object 6 To object dtypes: float64(1), int64(1), object(5) memory usage: 113.2+ MB
# Description of the raw DataFrame, df_elo_team_ratings_all_raw, showing some summary statistics for each numberical column in the DataFrame
df_elo_team_ratings_all_raw.describe()
Level | Elo | |
---|---|---|
count | 1.854899e+06 | 1.854899e+06 |
mean | 1.189897e+00 | 1.519541e+03 |
std | 4.280279e-01 | 1.660081e+02 |
min | 0.000000e+00 | 6.492441e+02 |
25% | 1.000000e+00 | 1.408781e+03 |
50% | 1.000000e+00 | 1.524152e+03 |
75% | 1.000000e+00 | 1.642937e+03 |
max | 2.000000e+00 | 2.107799e+03 |
# Plot visualisation of the missing values for each feature of the raw DataFrame, df_elo_team_ratings_all_raw
msno.matrix(df_elo_team_ratings_all_raw, figsize = (30, 7))
<AxesSubplot:>
# Counts of missing values
null_value_stats = df_elo_team_ratings_all_raw.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]
Series([], dtype: int64)
The visualisation shows us very quickly that there are no missing values.
Before we answer the questions in the brief through Exploratory Data Analysis (EDA), we'll first need to clean and wrangle the datasets to a form that meet our needs.
# Assign Raw DataFrames to new Engineered DataFrames
df_elo_team_ratings_all = df_elo_team_ratings_all_raw.copy()
df_elo_team_ratings_all.head()
Rank | Club | Country | Level | Elo | From | To | |
---|---|---|---|---|---|---|---|
1203829 | None | Besa Kavaje | ALB | 0 | 1325.261230 | 1972-07-01 | 1972-09-13 |
1203830 | None | Besa Kavaje | ALB | 0 | 1337.468506 | 1972-09-14 | 1972-09-27 |
1203831 | None | Besa Kavaje | ALB | 0 | 1338.146362 | 1972-09-28 | 1972-10-25 |
1203832 | None | Besa Kavaje | ALB | 0 | 1335.007324 | 1972-10-26 | 1972-11-08 |
1203833 | None | Besa Kavaje | ALB | 0 | 1334.845581 | 1972-11-09 | 1978-07-01 |
df_elo_team_ratings_all['From'] = pd.to_datetime(df_elo_team_ratings_all['From'])
df_elo_team_ratings_all['To'] = pd.to_datetime(df_elo_team_ratings_all['To'])
df_elo_team_ratings_all.dtypes
Rank object Club object Country object Level int64 Elo float64 From datetime64[ns] To datetime64[ns] dtype: object
df_elo_team_ratings_all['Days'] = df_elo_team_ratings_all['To'] - df_elo_team_ratings_all['From']
df_elo_team_ratings_all['Year_From'] = pd. DatetimeIndex(df_elo_team_ratings_all['From']).year
df_elo_team_ratings_all = df_elo_team_ratings_all.rename(columns={'Level': 'Domestic Division'})
Two methodologies:
Take the last value per year as that year's valuation. Ideally, this would be the date closest to September (start of the season), but for now, the current code is fine.
# Take the last Elo value per Team, per Year
df_elo_team_ratings_all_grouped = (df_elo_team_ratings_all
.loc[df_elo_team_ratings_all.groupby(['Club', 'Country', 'Year_From']).From.idxmax(axis=0, skipna=True)]
.reset_index(drop=True)
)
# Drop unnecessary columns
df_elo_team_ratings_all_grouped = (df_elo_team_ratings_all_grouped
.drop(['From', 'To', 'Days'], axis=1)
.rename(columns={'Year_From': 'Year'})
.drop_duplicates()
)
# Diplay DataFrame
df_elo_team_ratings_all_grouped.head(10)
Rank | Club | Country | Domestic Division | Elo | Year | |
---|---|---|---|---|---|---|
0 | None | AEK | GRE | 1 | 1351.622559 | 1959 |
1 | None | AEK | GRE | 1 | 1385.260620 | 1960 |
2 | None | AEK | GRE | 1 | 1433.593506 | 1961 |
3 | None | AEK | GRE | 1 | 1361.075439 | 1962 |
4 | None | AEK | GRE | 1 | 1370.932983 | 1963 |
5 | None | AEK | GRE | 1 | 1363.367676 | 1964 |
6 | None | AEK | GRE | 1 | 1384.022217 | 1965 |
7 | None | AEK | GRE | 1 | 1325.130493 | 1966 |
8 | None | AEK | GRE | 1 | 1359.587036 | 1967 |
9 | None | AEK | GRE | 1 | 1396.072632 | 1968 |
df_elo_team_ratings_all_grouped.shape
(37334, 6)
Take the average Elo value per year using pandas resample function.
THIS IS CURRENTLY NOT WORKING, BUT IS THE BETTER SOLUTION.
#df_elo_team_ratings_all_grouped = df_elo_team_ratings_all['Elo'].resample('w').mean()
No MLS data in the dataset currently, so those teams cannot be included.
# Filter list of 'Big 5' European Leagues and MLS from DataFrame
df_elo_team_ratings_all_grouped_big5 = df_elo_team_ratings_all_grouped[df_elo_team_ratings_all_grouped['Country'].isin(lst_countries_big5)]
# Sort DataFrame by Country/Club
df_elo_team_ratings_all_grouped_big5 = df_elo_team_ratings_all_grouped_big5.sort_values(['Country', 'Club', 'Year'], ascending=[True, True, True])
# Drop duplicates
df_elo_team_ratings_all_grouped_big5 = df_elo_team_ratings_all_grouped_big5.drop_duplicates()
df_elo_team_ratings_all_grouped_big5.shape
(12323, 6)
Separate DataFrame created for transfers from 2015/2016 onwards as this is when the other data sources such as FBref start.
# Filter plays in the Big 5 European Leagues
## Define list of countries
lst_seasons_1516_2122 = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
## Filter list of Big 5 European League countries from DataFrame
df_elo_team_ratings_all_grouped_big5_1516_present = df_elo_team_ratings_all_grouped_big5[df_elo_team_ratings_all_grouped_big5['Year'].isin(lst_seasons_1516_2122)]
df_elo_team_ratings_all_grouped_big5_1516_present.shape
(1536, 6)
# Map season to DataFrame
df_elo_team_ratings_all_grouped_big5_1516_present['Season'] = df_elo_team_ratings_all_grouped_big5_1516_present['Year'].map(dict_seasons)
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_elo_team_ratings_all_grouped_big5_1516_present
Rank | Club | Country | Domestic Division | Elo | Year | Season | |
---|---|---|---|---|---|---|---|
1987 | 8 | Arsenal | ENG | 1 | 1844.670654 | 2015 | 2015/2016 |
1988 | 8 | Arsenal | ENG | 1 | 1868.986694 | 2016 | 2016/2017 |
1989 | 14 | Arsenal | ENG | 1 | 1843.742798 | 2017 | 2017/2018 |
1990 | 14 | Arsenal | ENG | 1 | 1834.147827 | 2018 | 2018/2019 |
1991 | 18 | Arsenal | ENG | 1 | 1791.202881 | 2019 | 2019/2020 |
... | ... | ... | ... | ... | ... | ... | ... |
35189 | None | Vicenza | ITA | 2 | 1341.564331 | 2017 | 2017/2018 |
35190 | None | Vicenza | ITA | 2 | 1384.246826 | 2020 | 2020/2021 |
35191 | None | Vicenza | ITA | 2 | 1389.424927 | 2021 | 2021/2022 |
35448 | None | Virtus Lanciano | ITA | 2 | 1377.387451 | 2015 | 2015/2016 |
35449 | None | Virtus Lanciano | ITA | 2 | 1355.020996 | 2016 | 2016/2017 |
1536 rows × 7 columns
# Export DataFrame
## Save latest version
df_elo_team_ratings_all.to_csv(data_dir_elo + '/raw/ratings_by_team/elo_team_ratings_all_combined_latest.csv', index=None, header=True)
## Save a copy to archive folder (dated)
df_elo_team_ratings_all.to_csv(data_dir_elo + f'/raw/ratings_by_team/archive/elo_team_ratings_all_combined_last_updated_{today}.csv', index=None, header=True)
# Export DataFrame
## Save latest version
df_elo_team_ratings_all_grouped.to_csv(data_dir_elo + '/engineered/ratings_by_team/elo_team_rating_per_year_all_latest.csv', index=None, header=True)
## Save a copy to archive folder (dated)
df_elo_team_ratings_all_grouped.to_csv(data_dir_elo + f'/engineered/ratings_by_team/archive/elo_team_rating_per_year_all_last_updated_{today}.csv', index=None, header=True)
# Export DataFrame
## Save latest version
df_elo_team_ratings_all_grouped_big5.to_csv(data_dir_elo + '/engineered/ratings_by_team/elo_team_rating_per_year_big5_latest.csv', index=None, header=True)
## Save a copy to archive folder (dated)
df_elo_team_ratings_all_grouped_big5.to_csv(data_dir_elo + f'/engineered/ratings_by_team/archive/elo_team_rating_per_year_big5_last_updated_{today}.csv', index=None, header=True)
# Export DataFrame
## Save latest version
df_elo_team_ratings_all_grouped_big5_1516_present.to_csv(data_dir_elo + '/engineered/ratings_by_team/elo_team_rating_per_year_big5_1516_2122_latest.csv', index=None, header=True)
## Save a copy to archive folder (dated)
df_elo_team_ratings_all_grouped_big5_1516_present.to_csv(data_dir_elo + f'/engineered/ratings_by_team/archive/elo_team_rating_per_year_big5_1516_2122_last_updated_{today}.csv', index=None, header=True)
This Jupyter notebook engineered scraped football data from Club ELO using pandas for data manipulation through DataFrames.
*Visit my website eddwebster.com or my GitHub Repository for more projects. If you'd like to get in contact, my Twitter handle is @eddwebster and my email is: edd.j.webster@gmail.com.*