#!/usr/bin/env python # coding: utf-8 # # ... and in the end, the most experienced team wins # # Author: Frédéric DITH | Creation date: 2024-03-11 | Last updated: 2024-03-14 # # `python`, `pandas`, `plotly-express`, `data-analytics`,`statistics`,`football`,`championsleague` # # ## Context # # This self-assigned project is a simple exploration of football games dataset, namely, all the games played in the Champions League between 2016-17 and 2022-23 seasons. # # ## Findings # # ### [1] The data suggests that there's an "experience advantage" # # The correlation coefficient between the number of games played and the overall win rate is +0.777 (strongly correlated, positive). While it can't imply causality on its own, this value suggests that experience plays a significant role in a team's success in the competition. Another way to read into this value could be: *in any matchup, the team with more experience in the Champions League can be expected to win*. # # ### [2] The data suggests that there's a "home advantage" # # - **Overall**: across the entire set of 744 games, the proportion of home team wins (43.95%) is slightly higher than the proportion of away team wins (34.67%), suggesting a potential home advantage. # - **Group stage**: in the subset of 576 group stage games, the proportion of home team wins (44.09%) remains slightly higher than the proportion of away team wins (32.63%), consistent with the trend observed in the entire dataset. The proportion of draws (23.26%) is higher compared to the entire set, indicating that group stage games might be more evenly matched leading to more draws. # - **Knockout Games**: in the subset of 168 knockout games, the proportion of home team wins (43.45%) is still higher than the proportion of away team wins (41.66%), but the difference is smaller compared to the group stage games and the entire dataset. The proportion of draws (14.88%) is the lowest among the subsets, indicating that knockout games tend to have clearer outcomes with fewer draws. # - **Excluding draws**: in the subset of 585 games that did not end in a draw, 55% of games were won by the home team, compared to 45% won by the away team. # # Our analysis suggests that while there might be a home advantage overall and in the group stage games, the advantage might diminish or become less significant in knockout games. # # # # ## Methodology # # ### Initial data source # https://www.kaggle.com/datasets/cbxkgl/uefa-champions-league-2016-2022-data/data: 744 games, 74 teams # # ### added data # the initial dataset lists all games (home team, away team, home goals, away goals), and we added to that # - `home` and `away`: a value for each team that reflects if the game played at home or away # - `win`, `loss` or `draw`: the result of the game, for the team # - `group` or `knockout`: for that I simply used the date of the game as a proxy: if the game took place between september and december, it is a group stage game. Otherwise it is a knockout game. # # As a consequence, each team in each game is given one of these twelve new distinct values: # - Homewin_group # - Homewin_knockout # - Homeloss_group # - Homeloss_knockout # - Homedraw_group # - Homedraw_knockout # - Awaywin_group # - Awaywin_knockout # - Awayloss_group # - Awayloss_knockout # - Awaydraw_group # - Awaydraw_knockout # # In[1]: import os import pandas as pd import numpy as np import plotly.express as px import requests df = pd.read_csv('matches.csv') # ADD group stage or knockout # if game is between september and december, then group, otherwise knockout df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], format='%d-%b-%y %I.%M.%S.%f %p') df['Month'] = df['DATE_TIME'].dt.strftime('%b') def map_month_to_group(month): if month in ['Sep', 'Oct', 'Nov', 'Dec']: return 'group' else: return 'knockout' df['stage'] = df['Month'].apply(map_month_to_group) df['Result_hometeam_stage'] = df['Result_hometeam'].str.cat(df['stage'], sep='_') df['Result_awayteam_stage'] = df['Result_awayteam'].str.cat(df['stage'], sep='_') # count the number of games played by each team teams = pd.concat([df['HOME_TEAM'], df['AWAY_TEAM']]).unique() games_played = {team: 0 for team in teams} for index, row in df.iterrows(): games_played[row['HOME_TEAM']] += 1 games_played[row['AWAY_TEAM']] += 1 games_played_df = pd.DataFrame.from_dict(games_played, orient='index', columns=['Games Played']) games_played_df.index.name = 'Team' games_played_df = games_played_df.sort_values(by='Games Played', ascending=False) # store the counts for each team and result in each stage homewin_group = {} homewin_knockout = {} homeloss_group = {} homeloss_knockout = {} homedraw_group = {} homedraw_knockout = {} awaywin_group = {} awaywin_knockout = {} awayloss_group = {} awayloss_knockout = {} awaydraw_group = {} awaydraw_knockout = {} # Iterate through each row of the DataFrame for index, row in df.iterrows(): # Update counts based on the stage stage = row['stage'] home_result = row['Result_hometeam'] away_result = row['Result_awayteam'] # Decide which dictionary to use based on stage and result if stage == 'group': home_dict = homewin_group if home_result == 'homewin' else (homeloss_group if home_result == 'homeloss' else homedraw_group) away_dict = awaywin_group if away_result == 'awaywin' else (awayloss_group if away_result == 'awayloss' else awaydraw_group) else: # Knockout stage home_dict = homewin_knockout if home_result == 'homewin' else (homeloss_knockout if home_result == 'homeloss' else homedraw_knockout) away_dict = awaywin_knockout if away_result == 'awaywin' else (awayloss_knockout if away_result == 'awayloss' else awaydraw_knockout) # Update counts for home team home_team = row['HOME_TEAM'] if home_team not in home_dict: home_dict[home_team] = 0 home_dict[home_team] += 1 # Update counts for away team away_team = row['AWAY_TEAM'] if away_team not in away_dict: away_dict[away_team] = 0 away_dict[away_team] += 1 # Create DataFrames to store the counts for each team results_group_df = pd.DataFrame({'Homewin_group': homewin_group, 'Homeloss_group': homeloss_group, 'Homedraw_group': homedraw_group, 'Awaywin_group': awaywin_group, 'Awayloss_group': awayloss_group, 'Awaydraw_group': awaydraw_group}) results_group_df.index.name = 'Team' results_knockout_df = pd.DataFrame({'Homewin_knockout': homewin_knockout, 'Homeloss_knockout': homeloss_knockout, 'Homedraw_knockout': homedraw_knockout, 'Awaywin_knockout': awaywin_knockout, 'Awayloss_knockout': awayloss_knockout, 'Awaydraw_knockout': awaydraw_knockout}) results_knockout_df.index.name = 'Team' merged_results_df = pd.merge(results_group_df, results_knockout_df, on='Team', how='outer') # merge results and games played merged_df = pd.merge(merged_results_df, games_played_df, left_index=True, right_index=True) merged_df.insert(0, 'Games Played', merged_df.pop('Games Played')) merged_df['Games Played'] = merged_df['Games Played'].astype(float) merged_df = merged_df.sort_values(by='Games Played', ascending=False) # ADD overall win, draw and loss rate merged_df['ovr_win_count'] = merged_df[['Homewin_group', 'Awaywin_group', 'Homewin_knockout', 'Awaywin_knockout']].sum(axis=1) merged_df['ovr_draw_count'] = merged_df[['Homedraw_group', 'Awaydraw_group', 'Homedraw_knockout', 'Awaydraw_knockout']].sum(axis=1) merged_df['ovr_loss_count'] = merged_df[['Homeloss_group', 'Awayloss_group', 'Homeloss_knockout', 'Awayloss_knockout']].sum(axis=1) merged_df['ovr_win'] = merged_df['ovr_win_count'] / merged_df['Games Played'] merged_df['ovr_draw'] = merged_df['ovr_draw_count'] / merged_df['Games Played'] merged_df['ovr_loss'] = merged_df['ovr_loss_count'] / merged_df['Games Played'] merged_df.insert(1, 'ovr_win', merged_df.pop('ovr_win')) merged_df.insert(2, 'ovr_draw', merged_df.pop('ovr_draw')) merged_df.insert(3, 'ovr_loss', merged_df.pop('ovr_loss')) # ADD count games played in group and games played in knockout merged_df['Games_Played_Group'] = merged_df[['Homewin_group', 'Homeloss_group', 'Homedraw_group', 'Awaywin_group', 'Awayloss_group', 'Awaydraw_group']].sum(axis=1) merged_df['Games_Played_Knockout'] = merged_df[['Homewin_knockout', 'Homeloss_knockout', 'Homedraw_knockout', 'Awaywin_knockout', 'Awayloss_knockout', 'Awaydraw_knockout']].sum(axis=1) # ADD specific results for home stage, away stage, home ko, away ko merged_df['Winrate_home_stage'] = merged_df['Homewin_group'] / merged_df['Games_Played_Group'] merged_df['Winrate_away_stage'] = merged_df['Awaywin_group'] / merged_df['Games_Played_Group'] merged_df['Winrate_home_knockout'] = merged_df['Homewin_knockout'] / merged_df['Games_Played_Knockout'] merged_df['Winrate_away_knockout'] = merged_df['Awaywin_knockout'] / merged_df['Games_Played_Knockout'] merged_df['Winrate_group'] = (merged_df['Homewin_group'] + merged_df['Awaywin_group']) / merged_df['Games_Played_Group'] merged_df['Winrate_knockout'] = (merged_df['Homewin_knockout'] + merged_df['Awaywin_knockout']) / merged_df['Games_Played_Knockout'] #replace NaN with zeros for further calculation merged_df.fillna(0, inplace=True) #merged_df.head(20) #merged_df.describe() # # The experience advantage # # # ### How to read this chart # - A team further to the right of the chart has played more games, or has more experience # - A team higher up has won more of these games played # - The trendline (method: Ordinary Least Squares) represents **the level of performance expected, according to the number of games played in the competition**. While there are many other parameters that should be accounted for, it gives us an insight at how a team can be expected to perform, according to their experience. # # In[2]: fig = px.scatter(merged_df, x='Games Played', y='ovr_win', text=merged_df.index, title='Games Played vs Overall win rate', labels={'Games Played': 'Games Played', 'ovr_win': 'Overall win rate'}, trendline='ols') fig.update_layout(height=1000) fig.update_traces(textposition='bottom right') fig.show() # ### Correlation between Games played and Overall win rate # In[3]: correlation = merged_df['Games Played'].corr(merged_df['ovr_win']) print("Correlation coefficient between games played and overall win rate:", correlation) # +0.777 suggests a strong positive correlation between the number of games played and the overall win rate for the teams. # # The home advantage # # Instead of looking at team level data, we will go back to the main dataset and look at the outcome of all games. # # ### Overall games distribution by outcomes # In[4]: all_outcomes = pd.DataFrame(merged_results_df.sum(axis=0)) all_outcomes = all_outcomes.reset_index() all_outcomes.columns = ['outcome','count'] all_outcomes['pct'] = all_outcomes['count'] / 744 all_outcomes # ### Games distribution by outcomes, and split by conditions (home/away, group/knockout) # In[5]: home_total_wins = merged_df[['Homewin_group','Homewin_knockout']].sum() home_total_loss = merged_df[['Homeloss_group','Homeloss_knockout']].sum() home_total_draw = merged_df[['Homedraw_group','Homedraw_knockout']].sum() away_total_wins = merged_df[['Awaywin_group','Awaywin_knockout']].sum() away_total_loss = merged_df[['Awayloss_group','Awayloss_knockout']].sum() away_total_draw = merged_df[['Awaydraw_group','Awaydraw_knockout']].sum() home_stats = pd.DataFrame({ 'Total Wins': [home_total_wins['Homewin_group'], home_total_wins['Homewin_knockout']], 'Total Losses': [home_total_loss['Homeloss_group'], home_total_loss['Homeloss_knockout']], 'Total Draws': [home_total_draw['Homedraw_group'], home_total_draw['Homedraw_knockout']] }, index=['Group Stage', 'Knockout']) away_stats = pd.DataFrame({ 'Total Wins': [away_total_wins['Awaywin_group'], away_total_wins['Awaywin_knockout']], 'Total Losses': [away_total_loss['Awayloss_group'], away_total_loss['Awayloss_knockout']], 'Total Draws': [away_total_draw['Awaydraw_group'], away_total_draw['Awaydraw_knockout']] }, index=['Group Stage', 'Knockout']) total_stats = pd.concat([home_stats, away_stats], keys=['Home', 'Away']) total_stats # As expected, the numbers between 'home' and 'away' subsets mirror each other: # - 254 Home wins in group stage must equal to the same number of Away losses in group stages # - 188 Home losses in group stage must equal the same number of Away wins in group stages # - Draw games will also be similar in all cases (when a game draws, both teams get a draw) # - etc # # Instead of looking at games ending with "a winner and a loser" (or both teams getting a draw), we can try the following approach, that is slightly different: # # **Any game results in one of three exclusive outcomes**: # - the home team winning, or # - the away team winning, or # - both teams get a draw # # We can now look at different numbers, for each of these outcomes # # **Entire set** (744 games): # - 43.95% ended with a win for the home team ((254+73)/744) # - 34.67% ended with a win for the away team ((188+70)/744) # - 21.37% ended with a draw ((134+25)/744) # # **Subset: group stage games** (576 games): # - 44.09% ended with a win for the home team (254/576) # - 32.63% ended with a win for the away team (188/576) # - 23.26% ended with a draw (134/576) # # **Subset: knockout games** (168 games): # - 43.45% ended with a win for the home team (73/168) # - 41.66% ended with a win for the away team (70/168) # - 14.88% ended with a draw (25/168) # # This suggests that while there might be a home advantage overall and in the group stage games, the advantage might diminish or become less significant in knockout games. # ### Exclude all draws # # Let's now look at all the games that did not end in a draw. We know from the previous table that 159 games ended with a draw (134+25). This means that 585 games ended with one of the teams winning the game. # # For this subset of 585 games, how many were won by the home team, and how many were won by the away team? # In[6]: result_counts = df['Result'].value_counts() result_counts_df = result_counts.to_frame().reset_index() result_counts_df.columns = ['result', 'count'] result_counts_df = result_counts_df.drop(2).reset_index(drop=True) result_counts_df['pct'] = result_counts_df['count'] / 585 result_counts_df