#!/usr/bin/env python
# coding: utf-8

# # ... and in the end, the most experienced team wins
# 
# Author: Frédéric DITH  |  Creation date: 2024-03-11  |  Last updated: 2024-03-14
# 
# `python`, `pandas`, `plotly-express`, `data-analytics`,`statistics`,`football`,`championsleague`
# 
# ## Context
# 
# This self-assigned project is a simple exploration of football games dataset, namely, all the games played in the Champions League between 2016-17 and 2022-23 seasons.
# 
# ## Findings
# 
# ### [1] The data suggests that there's an "experience advantage"
# 
# The correlation coefficient between the number of games played and the overall win rate is +0.777 (strongly correlated, positive). While it can't imply causality on its own, this value suggests that experience plays a significant role in a team's success in the competition. Another way to read into this value could be: *in any matchup, the team with more experience in the Champions League can be expected to win*.
# 
# ### [2] The data suggests that there's a "home advantage"
# 
# - **Overall**: across the entire set of 744 games, the proportion of home team wins (43.95%) is slightly higher than the proportion of away team wins (34.67%), suggesting a potential home advantage.
# - **Group stage**: in the subset of 576 group stage games, the proportion of home team wins (44.09%) remains slightly higher than the proportion of away team wins (32.63%), consistent with the trend observed in the entire dataset. The proportion of draws (23.26%) is higher compared to the entire set, indicating that group stage games might be more evenly matched leading to more draws.
# - **Knockout Games**: in the subset of 168 knockout games, the proportion of home team wins (43.45%) is still higher than the proportion of away team wins (41.66%), but the difference is smaller compared to the group stage games and the entire dataset. The proportion of draws (14.88%) is the lowest among the subsets, indicating that knockout games tend to have clearer outcomes with fewer draws.
# - **Excluding draws**: in the subset of 585 games that did not end in a draw, 55% of games were won by the home team, compared to 45% won by the away team.
# 
# Our analysis suggests that while there might be a home advantage overall and in the group stage games, the advantage might diminish or become less significant in knockout games.
# 
# 
# 
# ## Methodology
# 
# ### Initial data source
# https://www.kaggle.com/datasets/cbxkgl/uefa-champions-league-2016-2022-data/data: 744 games, 74 teams
# 
# ### added data
# the initial dataset lists all games (home team, away team, home goals, away goals), and we added to that
# - `home` and `away`: a value for each team that reflects if the game played at home or away
# - `win`, `loss` or `draw`: the result of the game, for the team
# - `group` or `knockout`: for that I simply used the date of the game as a proxy: if the game took place between september and december, it is a group stage game. Otherwise it is a knockout game.
# 
# As a consequence, each team in each game is given one of these twelve new distinct values:
# - Homewin_group
# - Homewin_knockout
# - Homeloss_group
# - Homeloss_knockout
# - Homedraw_group
# - Homedraw_knockout
# - Awaywin_group
# - Awaywin_knockout
# - Awayloss_group
# - Awayloss_knockout
# - Awaydraw_group
# - Awaydraw_knockout
# 

# In[1]:


import os
import pandas as pd
import numpy as np
import plotly.express as px
import requests

df = pd.read_csv('matches.csv')

# ADD group stage or knockout
# if game is between september and december, then group, otherwise knockout
df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], format='%d-%b-%y %I.%M.%S.%f %p')
df['Month'] = df['DATE_TIME'].dt.strftime('%b')
def map_month_to_group(month):
    if month in ['Sep', 'Oct', 'Nov', 'Dec']:
        return 'group'
    else:
        return 'knockout'
df['stage'] = df['Month'].apply(map_month_to_group)  
df['Result_hometeam_stage'] = df['Result_hometeam'].str.cat(df['stage'], sep='_')
df['Result_awayteam_stage'] = df['Result_awayteam'].str.cat(df['stage'], sep='_')

# count the number of games played by each team
teams = pd.concat([df['HOME_TEAM'], df['AWAY_TEAM']]).unique()
games_played = {team: 0 for team in teams}

for index, row in df.iterrows():
    games_played[row['HOME_TEAM']] += 1
    games_played[row['AWAY_TEAM']] += 1
games_played_df = pd.DataFrame.from_dict(games_played, orient='index', columns=['Games Played'])
games_played_df.index.name = 'Team'
games_played_df = games_played_df.sort_values(by='Games Played', ascending=False)


# store the counts for each team and result in each stage
homewin_group = {}
homewin_knockout = {}
homeloss_group = {}
homeloss_knockout = {}
homedraw_group = {}
homedraw_knockout = {}
awaywin_group = {}
awaywin_knockout = {}
awayloss_group = {}
awayloss_knockout = {}
awaydraw_group = {}
awaydraw_knockout = {}

# Iterate through each row of the DataFrame
for index, row in df.iterrows():
    # Update counts based on the stage
    stage = row['stage']
    home_result = row['Result_hometeam']
    away_result = row['Result_awayteam']
    
    # Decide which dictionary to use based on stage and result
    if stage == 'group':
        home_dict = homewin_group if home_result == 'homewin' else (homeloss_group if home_result == 'homeloss' else homedraw_group)
        away_dict = awaywin_group if away_result == 'awaywin' else (awayloss_group if away_result == 'awayloss' else awaydraw_group)
    else:  # Knockout stage
        home_dict = homewin_knockout if home_result == 'homewin' else (homeloss_knockout if home_result == 'homeloss' else homedraw_knockout)
        away_dict = awaywin_knockout if away_result == 'awaywin' else (awayloss_knockout if away_result == 'awayloss' else awaydraw_knockout)
    
    # Update counts for home team
    home_team = row['HOME_TEAM']
    if home_team not in home_dict:
        home_dict[home_team] = 0
    home_dict[home_team] += 1
        
    # Update counts for away team
    away_team = row['AWAY_TEAM']
    if away_team not in away_dict:
        away_dict[away_team] = 0
    away_dict[away_team] += 1

# Create DataFrames to store the counts for each team
results_group_df = pd.DataFrame({'Homewin_group': homewin_group, 'Homeloss_group': homeloss_group, 'Homedraw_group': homedraw_group, 
                                 'Awaywin_group': awaywin_group, 'Awayloss_group': awayloss_group, 'Awaydraw_group': awaydraw_group})
results_group_df.index.name = 'Team'

results_knockout_df = pd.DataFrame({'Homewin_knockout': homewin_knockout, 'Homeloss_knockout': homeloss_knockout, 'Homedraw_knockout': homedraw_knockout, 
                                     'Awaywin_knockout': awaywin_knockout, 'Awayloss_knockout': awayloss_knockout, 'Awaydraw_knockout': awaydraw_knockout})
results_knockout_df.index.name = 'Team'

merged_results_df = pd.merge(results_group_df, results_knockout_df, on='Team', how='outer')

# merge results and games played
merged_df = pd.merge(merged_results_df, games_played_df, left_index=True, right_index=True)
merged_df.insert(0, 'Games Played', merged_df.pop('Games Played'))
merged_df['Games Played'] = merged_df['Games Played'].astype(float)
merged_df = merged_df.sort_values(by='Games Played', ascending=False)

# ADD overall win, draw and loss rate
merged_df['ovr_win_count'] = merged_df[['Homewin_group', 'Awaywin_group', 'Homewin_knockout', 'Awaywin_knockout']].sum(axis=1)
merged_df['ovr_draw_count'] = merged_df[['Homedraw_group', 'Awaydraw_group', 'Homedraw_knockout', 'Awaydraw_knockout']].sum(axis=1)
merged_df['ovr_loss_count'] = merged_df[['Homeloss_group', 'Awayloss_group', 'Homeloss_knockout', 'Awayloss_knockout']].sum(axis=1)

merged_df['ovr_win'] = merged_df['ovr_win_count'] / merged_df['Games Played']
merged_df['ovr_draw'] = merged_df['ovr_draw_count'] / merged_df['Games Played']
merged_df['ovr_loss'] = merged_df['ovr_loss_count'] / merged_df['Games Played']

merged_df.insert(1, 'ovr_win', merged_df.pop('ovr_win'))
merged_df.insert(2, 'ovr_draw', merged_df.pop('ovr_draw'))
merged_df.insert(3, 'ovr_loss', merged_df.pop('ovr_loss'))

# ADD count games played in group and games played in knockout
merged_df['Games_Played_Group'] = merged_df[['Homewin_group', 'Homeloss_group', 'Homedraw_group',
                                                             'Awaywin_group', 'Awayloss_group', 'Awaydraw_group']].sum(axis=1)

merged_df['Games_Played_Knockout'] = merged_df[['Homewin_knockout', 'Homeloss_knockout', 'Homedraw_knockout',
                                                                'Awaywin_knockout', 'Awayloss_knockout', 'Awaydraw_knockout']].sum(axis=1)

# ADD specific results for home stage, away stage, home ko, away ko
merged_df['Winrate_home_stage'] = merged_df['Homewin_group'] / merged_df['Games_Played_Group']
merged_df['Winrate_away_stage'] = merged_df['Awaywin_group'] / merged_df['Games_Played_Group']
merged_df['Winrate_home_knockout'] = merged_df['Homewin_knockout'] / merged_df['Games_Played_Knockout']
merged_df['Winrate_away_knockout'] = merged_df['Awaywin_knockout'] / merged_df['Games_Played_Knockout']

merged_df['Winrate_group'] = (merged_df['Homewin_group'] + merged_df['Awaywin_group']) / merged_df['Games_Played_Group']
merged_df['Winrate_knockout'] = (merged_df['Homewin_knockout'] + merged_df['Awaywin_knockout']) / merged_df['Games_Played_Knockout']

#replace NaN with zeros for further calculation
merged_df.fillna(0, inplace=True)
#merged_df.head(20)
#merged_df.describe()


# # The experience advantage
# 
# 
# ### How to read this chart
# - A team further to the right of the chart has played more games, or has more experience
# - A team higher up has won more of these games played
# - The trendline (method: Ordinary Least Squares) represents **the level of performance expected, according to the number of games played in the competition**. While there are many other parameters that should be accounted for, it gives us an insight at how a team can be expected to perform, according to their experience.
# 

# In[2]:


fig = px.scatter(merged_df, x='Games Played', y='ovr_win',
                 text=merged_df.index,
                 title='Games Played vs Overall win rate',
                 labels={'Games Played': 'Games Played', 'ovr_win': 'Overall win rate'},
                 trendline='ols')
fig.update_layout(height=1000)
fig.update_traces(textposition='bottom right')
fig.show()


# ### Correlation between Games played and Overall win rate

# In[3]:


correlation = merged_df['Games Played'].corr(merged_df['ovr_win'])
print("Correlation coefficient between games played and overall win rate:", correlation)


# +0.777 suggests a strong positive correlation between the number of games played and the overall win rate for the teams.

# # The home advantage
# 
# Instead of looking at team level data, we will go back to the main dataset and look at the outcome of all games.
# 
# ### Overall games distribution by outcomes

# In[4]:


all_outcomes = pd.DataFrame(merged_results_df.sum(axis=0))
all_outcomes = all_outcomes.reset_index()
all_outcomes.columns = ['outcome','count']
all_outcomes['pct'] = all_outcomes['count'] / 744
all_outcomes


# ### Games distribution by outcomes, and split by conditions (home/away, group/knockout)

# In[5]:


home_total_wins = merged_df[['Homewin_group','Homewin_knockout']].sum()
home_total_loss = merged_df[['Homeloss_group','Homeloss_knockout']].sum()
home_total_draw = merged_df[['Homedraw_group','Homedraw_knockout']].sum()
away_total_wins = merged_df[['Awaywin_group','Awaywin_knockout']].sum()
away_total_loss = merged_df[['Awayloss_group','Awayloss_knockout']].sum()
away_total_draw = merged_df[['Awaydraw_group','Awaydraw_knockout']].sum()

home_stats = pd.DataFrame({
    'Total Wins': [home_total_wins['Homewin_group'], home_total_wins['Homewin_knockout']],
    'Total Losses': [home_total_loss['Homeloss_group'], home_total_loss['Homeloss_knockout']],
    'Total Draws': [home_total_draw['Homedraw_group'], home_total_draw['Homedraw_knockout']]
}, index=['Group Stage', 'Knockout'])

away_stats = pd.DataFrame({
    'Total Wins': [away_total_wins['Awaywin_group'], away_total_wins['Awaywin_knockout']],
    'Total Losses': [away_total_loss['Awayloss_group'], away_total_loss['Awayloss_knockout']],
    'Total Draws': [away_total_draw['Awaydraw_group'], away_total_draw['Awaydraw_knockout']]
}, index=['Group Stage', 'Knockout'])
total_stats = pd.concat([home_stats, away_stats], keys=['Home', 'Away'])
total_stats


# As expected, the numbers between 'home' and 'away' subsets mirror each other:
# - 254 Home wins in group stage must equal to the same number of Away losses in group stages
# - 188 Home losses in group stage must equal the same number of Away wins in group stages
# - Draw games will also be similar in all cases (when a game draws, both teams get a draw)
# - etc
# 
# Instead of looking at games ending with "a winner and a loser" (or both teams getting a draw), we can try the following approach, that is slightly different:
# 
# **Any game results in one of three exclusive outcomes**:
# - the home team winning, or
# - the away team winning, or
# - both teams get a draw
# 
# We can now look at different numbers, for each of these outcomes
# 
# **Entire set** (744 games):
# - 43.95% ended with a win for the home team ((254+73)/744)
# - 34.67% ended with a win for the away team ((188+70)/744)
# - 21.37% ended with a draw ((134+25)/744)
# 
# **Subset: group stage games** (576 games):
# - 44.09% ended with a win for the home team (254/576)
# - 32.63% ended with a win for the away team (188/576)
# - 23.26% ended with a draw (134/576)
# 
# **Subset: knockout games** (168 games):
# - 43.45% ended with a win for the home team (73/168)
# - 41.66% ended with a win for the away team (70/168)
# - 14.88% ended with a draw (25/168)
# 
# This suggests that while there might be a home advantage overall and in the group stage games, the advantage might diminish or become less significant in knockout games.

# ### Exclude all draws
# 
# Let's now look at all the games that did not end in a draw. We know from the previous table that 159 games ended with a draw (134+25). This means that 585 games ended with one of the teams winning the game.
# 
# For this subset of 585 games, how many were won by the home team, and how many were won by the away team?

# In[6]:


result_counts = df['Result'].value_counts()
result_counts_df = result_counts.to_frame().reset_index()
result_counts_df.columns = ['result', 'count']
result_counts_df = result_counts_df.drop(2).reset_index(drop=True)
result_counts_df['pct'] = result_counts_df['count'] / 585
result_counts_df