# RUN ALL THE CELLS TO RUN THE APP
class color:
    # ANSI escape sequences for styling terminal/notebook text output.
    # Concatenate one of these before a string and END after it.

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set above.
    END = '\033[0m'
# Intro banner: credits, links and usage examples shown above the widgets.
# Each argument is printed on its own line (sep='\n'); the arguments that end
# in '\n' are followed by a blank line.
#print(color.BOLD + 'PCA-based model to find similar players' + color.END)
print(
    'by Parth Athale (@ParthAthale)\n',
    'Data credits to FBref/StatsBomb',
    'Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/',
    'Read code here: https://github.com/parth1902/PCA_Player_Finder\n',
    'Some examples as a guide to do this:',
    'If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall',
    'If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall',
    'If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work',
    sep='\n',
)
# Output:
# by Parth Athale (@ParthAthale)
#
# Data credits to FBref/StatsBomb
# Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/
# Read code here: https://github.com/parth1902/PCA_Player_Finder
#
# Some examples as a guide to do this:
# If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall
# If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall
# If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pandas as pd
import numpy as np
# pandas display options for the printed result tables: do not wrap a wide
# frame's repr, and cap the number of columns shown at 10.
pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_columns', 10)
from IPython.display import HTML, display
# Inject CSS that gives elements with the `widget-label` class a minimum
# width of 20ex (presumably so the ipywidgets labels are not cut off - confirm
# against the rendered notebook).
display(HTML('''<style>
.widget-label { min-width: 20ex !important; }
</style>'''))
import pandas as pd
# Load the games table; later code reads its 'result' and 'for' columns.
url = 'https://raw.githubusercontent.com/parth1902/test/master/games.csv'
df = pd.read_csv(url, sep=',')

# Load the players table and keep only players with more than 500 minutes
# whose position is not 'GK'.
url = 'https://raw.githubusercontent.com/parth1902/test/master/players.csv'
df_players = pd.read_csv(url, sep=',')
df_players = df_players[
    (df_players['minutes'] > 500) & (df_players['position'] != 'GK')
]
# When a player name appears more than once, keep only the last row for it.
df_players = df_players.drop_duplicates(subset=['player'], keep='last')
def f(df):
    """Return the points for one row: 3 if its 'result' is 'W', 1 if 'D', else 0.

    `df` is a single row (anything indexable by the key 'result'); it is
    applied row-wise to the games table.
    """
    outcome = df['result']
    if outcome == 'W':
        return 3
    if outcome == 'D':
        return 1
    return 0
# Add a 'Points taken' column by applying f to every row of the games table.
df['Points taken'] = df.apply(f, axis='columns')
# Player stat columns that are converted to per-90-minute values below
# ('minutes' itself is listed but skipped by that conversion).
arr = [
    'minutes',
    # goals, shots and expected goals
    'goals', 'assists', 'pens_made', 'pens_att',
    'xg', 'npxg', 'xa',
    'shots_total', 'shots_on_target', 'shots_free_kicks',
    'xg_net', 'npxg_net',
    # passing volume and distance
    'passes_completed', 'passes',
    'passes_total_distance', 'passes_progressive_distance',
    'passes_completed_short', 'passes_short',
    'passes_completed_medium', 'passes_medium',
    'passes_completed_long', 'passes_long',
    'assisted_shots',
    'passes_into_final_third', 'passes_into_penalty_area',
    'crosses_into_penalty_area', 'progressive_passes',
    # pass types
    'passes_live', 'passes_dead', 'passes_free_kicks',
    'through_balls', 'passes_pressure', 'passes_switches',
    'crosses',
    'corner_kicks', 'corner_kicks_in', 'corner_kicks_out',
    'corner_kicks_straight',
    'passes_ground', 'passes_low', 'passes_high',
    'passes_left_foot', 'passes_right_foot', 'passes_head',
    'throw_ins', 'passes_other_body',
    'passes_offsides', 'passes_oob',
    'passes_intercepted', 'passes_blocked',
    # shot- and goal-creating actions
    'sca', 'sca_passes_live', 'sca_passes_dead',
    'sca_dribbles', 'sca_shots', 'sca_fouled',
    'gca', 'gca_passes_live', 'gca_passes_dead',
    'gca_dribbles', 'gca_shots', 'gca_fouled', 'gca_og_for',
    # defensive actions
    'tackles', 'tackles_won',
    'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',
    'dribble_tackles', 'dribbles_vs', 'dribbled_past',
    'pressures', 'pressure_regains',
    'pressures_def_3rd', 'pressures_mid_3rd', 'pressures_att_3rd',
    'blocks', 'blocked_shots', 'blocked_shots_saves', 'blocked_passes',
    'interceptions', 'clearances', 'errors',
    # possession
    'touches', 'touches_def_pen_area',
    'touches_def_3rd', 'touches_mid_3rd', 'touches_att_3rd',
    'touches_att_pen_area', 'touches_live_ball',
    'dribbles_completed', 'dribbles', 'players_dribbled_past', 'nutmegs',
    'carries', 'carry_distance', 'carry_progressive_distance',
    'pass_targets', 'miscontrols', 'dispossessed',
]
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Convert every stat listed in `arr` (except 'minutes' itself) to a per-90
# value: stat / minutes * 90. This is done as one vectorised division over
# all the columns instead of one column assignment per stat.
per90_columns = [column for column in arr if column != 'minutes']
df_players[per90_columns] = (
    df_players[per90_columns].div(df_players['minutes'], axis=0) * 90
)
# Remove columns that are not used as similarity features: the source's own
# per-90 columns, card counts, and assorted other counts.
df_players = df_players.drop(columns=[
    'goals_per90',
    'cards_yellow',
    'cards_red',
    'assists_per90',
    'goals_assists_per90',
    'goals_pens_per90',
    'goals_assists_pens_per90',
    'xg_per90',
    'xa_per90',
    'xg_xa_per90',
    'npxg_per90',
    'npxg_xa_per90',
    'minutes_90s',
    'shots_total_per90',
    'shots_on_target_per90',
    'xa_net',
    'sca_per90',
    'gca_per90',
    'passes_received',
    'cards_yellow_red',
    'fouls',
    'fouled',
    'offsides',
    'pens_won',
    'pens_conceded',
    'own_goals',
    'ball_recoveries',
    'aerials_won',
    'aerials_lost',
    'aerials_won_pct',
])
# Feature-only copy of the players table: the identity/bookkeeping columns
# and 'minutes' are removed, every remaining column is kept.
df_playersnew = df_players.drop(columns=[
    'player',
    'nationality',
    'position',
    'squad',
    'age',
    'birth_year',
    'games',
    'games_starts',
    'minutes',
])
arr2 = list(df_playersnew.columns.values)
# Min-max scale every feature column. MinMaxScaler rescales each column
# independently of the others, so one fit over the whole frame gives the
# same values as fitting the scaler again for every single column.
df_playersnew[arr2] = scaler.fit_transform(df_playersnew[arr2])
# Rename the games table's 'xg_for' to 'xg', the name the player features use.
df.rename(columns={'xg_for': 'xg'}, inplace=True)
# Working copy; find/find2 rebuild it from df_playersnew on every call.
df_playersnew1 = df_playersnew.copy()
def shoot(df_playersnew1):
    """Return only the shooting columns of `df_playersnew1`."""
    shooting_columns = [
        'goals', 'xg', 'npxg',
        'shots_total', 'shots_on_target', 'shots_free_kicks',
        'shots_on_target_pct',
        'goals_per_shot', 'goals_per_shot_on_target', 'npxg_per_shot',
        'xg_net', 'npxg_net',
    ]
    return df_playersnew1[shooting_columns]
def create(df_playersnew1):
    """Return only the chance-creation columns of `df_playersnew1`."""
    creating_columns = [
        'sca', 'sca_passes_live', 'sca_passes_dead',
        'sca_dribbles', 'sca_shots', 'sca_fouled',
        'assisted_shots', 'through_balls',
        'gca', 'gca_passes_live', 'gca_passes_dead',
        'gca_dribbles', 'gca_shots', 'gca_fouled', 'gca_og_for',
        'assists', 'xa',
    ]
    return df_playersnew1[creating_columns]
def passs(df_playersnew1):
    """Return only the passing columns of `df_playersnew1`."""
    passing_columns = [
        # totals and distances
        'passes_completed', 'passes', 'passes_pct',
        'passes_total_distance', 'passes_progressive_distance',
        # by pass length
        'passes_completed_short', 'passes_short', 'passes_pct_short',
        'passes_completed_medium', 'passes_medium', 'passes_pct_medium',
        'passes_completed_long', 'passes_long', 'passes_pct_long',
        # into advanced areas
        'passes_into_final_third', 'passes_into_penalty_area',
        'crosses_into_penalty_area', 'progressive_passes',
        # by pass type
        'passes_live', 'passes_dead', 'passes_free_kicks',
        'passes_pressure', 'passes_switches', 'crosses',
        'corner_kicks', 'corner_kicks_in', 'corner_kicks_out',
        'corner_kicks_straight',
        # by height and body part
        'passes_ground', 'passes_low', 'passes_high',
        'passes_left_foot', 'passes_right_foot', 'passes_head',
        'throw_ins', 'passes_other_body',
        # by outcome
        'passes_offsides', 'passes_oob',
        'passes_intercepted', 'passes_blocked',
    ]
    return df_playersnew1[passing_columns]
def deff(df_playersnew1):
    """Return only the defensive-work columns of `df_playersnew1`."""
    defensive_columns = [
        # tackles
        'tackles', 'tackles_won',
        'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',
        'dribble_tackles', 'dribbles_vs', 'dribble_tackles_pct',
        'dribbled_past',
        # pressures
        'pressures', 'pressure_regains', 'pressure_regain_pct',
        'pressures_def_3rd', 'pressures_mid_3rd', 'pressures_att_3rd',
        # blocks and the rest
        'blocks', 'blocked_shots', 'blocked_shots_saves', 'blocked_passes',
        'interceptions', 'clearances', 'errors',
    ]
    return df_playersnew1[defensive_columns]
def poss(df_playersnew1):
    """Return only the possession columns of `df_playersnew1`."""
    possession_columns = [
        # touches
        'touches', 'touches_def_pen_area',
        'touches_def_3rd', 'touches_mid_3rd', 'touches_att_3rd',
        'touches_att_pen_area', 'touches_live_ball',
        # dribbles
        'dribbles_completed', 'dribbles', 'dribbles_completed_pct',
        'players_dribbled_past', 'nutmegs',
        # carries
        'carries', 'carry_distance', 'carry_progressive_distance',
        # receiving and losing the ball
        'pass_targets', 'passes_received_pct',
        'miscontrols', 'dispossessed',
    ]
    return df_playersnew1[possession_columns]
# Option values for the widgets: every player name, and every squad name
# in sorted order with 'Overall' placed first.
pl = np.array(df_players['player'])
te = np.append('Overall', sorted(np.array(df_players['squad'].unique())))
# ANSI escape codes used to print the results heading in bold.
_BOLD = '\033[1m'
_END = '\033[0m'

# Identity columns placed next to the principal components in finalDf.
_ID_COLUMNS = ['player', 'squad', 'position', 'age']


def _weighted_stats(team):
    """Return the scaled player features weighted by their link to points.

    Every column of the global ``df_playersnew`` is multiplied by the
    correlation between the column of the same name in the games table
    ``df`` and its 'Points taken' column. With ``team == "Overall"`` all
    games are used; otherwise only the rows whose 'for' column equals
    ``team``.

    Side effects: rebinds the global ``df`` to a copy without duplicated
    columns and stores the weighted frame in the global ``df_playersnew1``.

    Raises:
        KeyError: if a feature column has no correlation entry, i.e. it is
            missing from ``df`` or is not numeric there.
    """
    global df
    global df_playersnew1

    # Drop columns whose values repeat an earlier column, then columns whose
    # name repeats an earlier one, so each label below selects one column.
    df = df.loc[:, ~df.T.duplicated(keep='first')]
    df = df.loc[:, ~df.columns.duplicated()]

    games = df if team == "Overall" else df[df['for'] == team]

    # Correlate numeric/boolean columns only: older pandas skipped text
    # columns silently, pandas 2.x raises on them unless they are excluded.
    numeric_games = games.select_dtypes(include=['number', 'bool'])
    corr_with_points = numeric_games.corr()['Points taken']

    stat_columns = list(df_playersnew.columns)
    # .loc with a list raises KeyError for a missing label rather than
    # silently producing NaN weights.
    weights = corr_with_points.loc[stat_columns]
    df_playersnew1 = df_playersnew.mul(weights, axis='columns')
    return df_playersnew1


def _report_similar_players(player_name, features, number_of_results):
    """Run PCA on ``features`` and print the players closest to one player.

    ``features`` holds one row per row of the global ``df_players``, in the
    same order. NaNs are replaced by 0, PCA keeps enough components to
    explain 90% of the variance, and players are ranked by Euclidean
    distance to ``player_name`` in component space. '% match' is
    100 - distance / (95th percentile of all distances) * 100, so it is
    negative for players beyond that percentile.

    Side effects: prints the component count and the result table, resets
    the index of the global ``df_players``, and stores ``features``, the
    component frame and the result frame in the globals ``df_playersnew2``,
    ``principalDf`` and ``finalDf``.
    """
    global df_players
    global df_playersnew2
    global principalDf
    global finalDf

    from sklearn.decomposition import PCA

    df_playersnew2 = features
    x = np.nan_to_num(features.values)

    pca = PCA(.90)
    principalComponents = pca.fit_transform(x)
    print('Number of PCA components:', pca.n_components_)
    print('\n')

    principalDf = pd.DataFrame(data=principalComponents)
    # Give the players a 0..n-1 index so they line up with principalDf.
    df_players = df_players.reset_index(drop=True)
    finalDf = pd.concat([principalDf, df_players[_ID_COLUMNS]], axis=1)

    component_columns = list(principalDf.columns)
    is_target = finalDf['player'] == player_name
    if not is_target.any():
        print('Player not found: {}'.format(player_name))
        return
    target = finalDf.loc[is_target, component_columns].to_numpy()[0]

    finalDf['distance'] = (
        (finalDf[component_columns] - target).pow(2).sum(axis=1).pow(0.5)
    )
    dist2 = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100 - (finalDf['distance'] / dist2) * 100

    # Exclude the queried player by name, not by assuming they sort first:
    # another player at distance 0 could otherwise take their place.
    final = (
        finalDf[~is_target]
        .sort_values('distance')
        .head(number_of_results)
        .reset_index(drop=True)
    )

    print(_BOLD + 'List of similar players:' + _END)
    print('\n')
    print(final[['player', 'squad', 'position', 'age', '% match']])


def find(player_name, team, skill, number_of_results):
    """Print the players most similar to ``player_name`` for one skill group.

    Args:
        player_name: value from the 'player' column of ``df_players``.
        team: squad whose games supply the feature weights, or "Overall"
            to use every game.
        skill: 'Overall' (all features) or one of 'Possession', 'Shooting',
            'Passing', 'Creating', 'Defensive work'.
        number_of_results: how many similar players to list.

    Raises:
        ValueError: if ``skill`` is not one of the names above.
    """
    selectors = {
        'Overall': lambda frame: frame,
        'Possession': poss,
        'Shooting': shoot,
        'Passing': passs,
        'Creating': create,
        'Defensive work': deff,
    }
    if skill not in selectors:
        # Fail loudly instead of silently reusing the previous call's features.
        raise ValueError('Unknown skill: {!r}'.format(skill))

    weighted = _weighted_stats(team)
    features = selectors[skill](weighted)
    _report_similar_players(player_name, features, number_of_results)


def find2(player_name, team, stats, number_of_results):
    """Print the players most similar to ``player_name`` on chosen stats.

    Args:
        player_name: value from the 'player' column of ``df_players``.
        team: squad whose games supply the feature weights, or "Overall"
            to use every game.
        stats: iterable of ``df_playersnew`` column names to compare on.
            When it is empty a hint is printed and nothing else is done.
        number_of_results: how many similar players to list.
    """
    print('Stats selected:', stats)
    print('\n')
    if not stats:
        # Checked before any weighting so an empty selection costs nothing.
        print("Choose at least one stat to see output")
        return

    weighted = _weighted_stats(team)
    features = weighted[list(stats)]
    _report_similar_players(player_name, features, number_of_results)
# Widget UI for find(): player, team and skill choices, plus a result count
# from 20 to 99. The trailing ';' suppresses the cell's return-value output.
interact(
    find,
    player_name=sorted(pl),
    team=te,
    skill=['Overall', 'Passing', 'Creating', 'Shooting', 'Defensive work', 'Possession'],
    number_of_results=range(20, 100),
);
# Output: interactive(children=(Dropdown(description='player_name', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaron…
# Widget UI for find2(): the same player/team/count controls, with a
# multi-select list (10 visible rows) of every feature column.
# Select multiple stats by holding the Ctrl/Command key.
interact(
    find2,
    player_name=sorted(pl),
    team=te,
    stats=widgets.SelectMultiple(
        options=list(df_playersnew.columns.values),
        rows=10,
    ),
    number_of_results=range(20, 100),
);
# Output: interactive(children=(Dropdown(description='player_name', options=('Aaron Connolly', 'Aaron Cresswell', 'Aaron…