#!/usr/bin/env python
# coding: utf-8
"""PCA-based model to find similar players.

Notebook export.  Run every cell (or the whole script) to start the app:
the last two cells build ipywidgets forms around ``find`` and ``find2``.

Pipeline, as implemented below:

1. Load a per-match table (``df``) and a per-player table (``df_players``).
2. Convert the counting stats listed in ``arr`` to per-90-minute values and
   min-max scale every remaining player stat to ``[0, 1]``.
3. Multiply each player stat by the correlation between the same-named
   match stat and ``'Points taken'`` (for one team, or for all matches).
4. Run PCA keeping 90% of the variance and rank players by Euclidean
   distance to the selected player in component space.
"""
# The __future__ import has to be the first statement of the module;
# anywhere later it is a SyntaxError when this file is run as a script.
from __future__ import print_function

import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import HTML, display
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler


# In[2]:


class color:
    """ANSI escape sequences used to style printed output."""

    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'


print('by Parth Athale (@ParthAthale)\n')
print('Data credits to FBref/StatsBomb')
print('Read methodology here: https://xgpershot.wordpress.com/2020/08/06/pca-based-model-to-identify-similar-players/')
print('Read code here: https://github.com/parth1902/PCA_Player_Finder\n')
print('Some examples as a guide to do this:')
print('If you want to find out a Pierre-Emile Højbjerg replacement for Southampton, choose Højbjerg, Southampton, Overall')
print('If you want to find out a player like Leroy Sané for Barcelona, choose Sané, Barcelona, Overall')
print('If you want to find out a player with the defensive ability of Wilfried Ndidi without any team constraint, choose Ndidi, Overall, Defensive work')


# In[3]:

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_columns', 10)
# The HTML payload is an empty string in this export.
display(HTML(''''''))


# In[4]:

GAMES_URL = 'https://raw.githubusercontent.com/parth1902/test/master/games.csv'
PLAYERS_URL = 'https://raw.githubusercontent.com/parth1902/test/master/players.csv'

df = pd.read_csv(GAMES_URL, sep=',')
df_players = pd.read_csv(PLAYERS_URL, sep=',')

# Keep players with more than 500 minutes whose position is not 'GK',
# one row per player name (the last occurrence wins).
df_players = df_players[df_players['minutes'] > 500]
df_players = df_players[df_players['position'] != 'GK']
df_players = df_players.drop_duplicates(subset=['player'], keep='last')
# A clean 0..n-1 index lets the PCA output (which has a fresh RangeIndex)
# be concatenated row-for-row with the player metadata later on.
df_players = df_players.reset_index(drop=True)


def f(df):
    """Return the points for one match row: 3 for 'W', 1 for 'D', else 0.

    ``df`` is a single row of the match table (it is called through
    ``DataFrame.apply(..., axis=1)``); only its ``'result'`` field is read.
    """
    result = df['result']
    if result == 'W':
        return 3
    if result == 'D':
        return 1
    return 0


df['Points taken'] = df.apply(f, axis=1)


# In[5]:

# Counting stats that are converted to per-90-minute values
# ('minutes' itself is the denominator and is left untouched).
arr = [
    'minutes', 'goals', 'assists', 'pens_made', 'pens_att', 'xg', 'npxg',
    'xa', 'shots_total', 'shots_on_target', 'shots_free_kicks', 'xg_net',
    'npxg_net', 'passes_completed', 'passes', 'passes_total_distance',
    'passes_progressive_distance', 'passes_completed_short', 'passes_short',
    'passes_completed_medium', 'passes_medium', 'passes_completed_long',
    'passes_long', 'assisted_shots', 'passes_into_final_third',
    'passes_into_penalty_area', 'crosses_into_penalty_area',
    'progressive_passes', 'passes_live', 'passes_dead', 'passes_free_kicks',
    'through_balls', 'passes_pressure', 'passes_switches', 'crosses',
    'corner_kicks', 'corner_kicks_in', 'corner_kicks_out',
    'corner_kicks_straight', 'passes_ground', 'passes_low', 'passes_high',
    'passes_left_foot', 'passes_right_foot', 'passes_head', 'throw_ins',
    'passes_other_body', 'passes_offsides', 'passes_oob',
    'passes_intercepted', 'passes_blocked', 'sca', 'sca_passes_live',
    'sca_passes_dead', 'sca_dribbles', 'sca_shots', 'sca_fouled', 'gca',
    'gca_passes_live', 'gca_passes_dead', 'gca_dribbles', 'gca_shots',
    'gca_fouled', 'gca_og_for', 'tackles', 'tackles_won', 'tackles_def_3rd',
    'tackles_mid_3rd', 'tackles_att_3rd', 'dribble_tackles', 'dribbles_vs',
    'dribbled_past', 'pressures', 'pressure_regains', 'pressures_def_3rd',
    'pressures_mid_3rd', 'pressures_att_3rd', 'blocks', 'blocked_shots',
    'blocked_shots_saves', 'blocked_passes', 'interceptions', 'clearances',
    'errors', 'touches', 'touches_def_pen_area', 'touches_def_3rd',
    'touches_mid_3rd', 'touches_att_3rd', 'touches_att_pen_area',
    'touches_live_ball', 'dribbles_completed', 'dribbles',
    'players_dribbled_past', 'nutmegs', 'carries', 'carry_distance',
    'carry_progressive_distance', 'pass_targets', 'miscontrols',
    'dispossessed',
]

scaler = MinMaxScaler()

for stat in arr:
    if stat != 'minutes':
        df_players[stat] = (df_players[stat] / df_players['minutes']) * 90

# Player columns that take no part in the similarity model.
df_players = df_players.drop(
    ['goals_per90', 'cards_yellow', 'cards_red', 'assists_per90',
     'goals_assists_per90', 'goals_pens_per90', 'goals_assists_pens_per90',
     'xg_per90', 'xa_per90', 'xg_xa_per90', 'npxg_per90', 'npxg_xa_per90',
     'minutes_90s', 'shots_total_per90', 'shots_on_target_per90', 'xa_net',
     'sca_per90', 'gca_per90', 'passes_received', 'cards_yellow_red',
     'fouls', 'fouled', 'offsides', 'pens_won', 'pens_conceded', 'own_goals',
     'ball_recoveries', 'aerials_won', 'aerials_lost', 'aerials_won_pct'],
    axis=1)

# Feature matrix: everything except identity / playing-time columns.
df_playersnew = df_players.drop(
    ['player', 'nationality', 'position', 'squad', 'age', 'birth_year',
     'games', 'games_starts', 'minutes'],
    axis=1)
arr2 = list(df_playersnew.columns.values)

# MinMaxScaler scales every column independently, so one fit over the whole
# frame yields the same values as fitting column by column.
df_playersnew = pd.DataFrame(
    scaler.fit_transform(df_playersnew),
    index=df_playersnew.index,
    columns=arr2)

df.rename(columns={'xg_for': 'xg'}, inplace=True)
# Done once at load time instead of on every widget callback: first drop
# match columns whose values repeat an earlier column, then columns whose
# name repeats an earlier one.
df = df.loc[:, ~df.T.duplicated(keep='first')]
df = df.loc[:, ~df.columns.duplicated()]

df_playersnew1 = df_playersnew.copy()


# In[6]:

def shoot(df_playersnew1):
    """Return the shooting-related columns of ``df_playersnew1``."""
    return df_playersnew1[[
        'goals', 'xg', 'npxg', 'shots_total', 'shots_on_target',
        'shots_free_kicks', 'shots_on_target_pct', 'goals_per_shot',
        'goals_per_shot_on_target', 'npxg_per_shot', 'xg_net', 'npxg_net']]


def create(df_playersnew1):
    """Return the chance-creation columns of ``df_playersnew1``."""
    return df_playersnew1[[
        'sca', 'sca_passes_live', 'sca_passes_dead', 'sca_dribbles',
        'sca_shots', 'sca_fouled', 'assisted_shots', 'through_balls', 'gca',
        'gca_passes_live', 'gca_passes_dead', 'gca_dribbles', 'gca_shots',
        'gca_fouled', 'gca_og_for', 'assists', 'xa']]


def passs(df_playersnew1):
    """Return the passing columns of ``df_playersnew1``."""
    return df_playersnew1[[
        'passes_completed', 'passes', 'passes_pct', 'passes_total_distance',
        'passes_progressive_distance', 'passes_completed_short',
        'passes_short', 'passes_pct_short', 'passes_completed_medium',
        'passes_medium', 'passes_pct_medium', 'passes_completed_long',
        'passes_long', 'passes_pct_long', 'passes_into_final_third',
        'passes_into_penalty_area', 'crosses_into_penalty_area',
        'progressive_passes', 'passes_live', 'passes_dead',
        'passes_free_kicks', 'passes_pressure', 'passes_switches', 'crosses',
        'corner_kicks', 'corner_kicks_in', 'corner_kicks_out',
        'corner_kicks_straight', 'passes_ground', 'passes_low', 'passes_high',
        'passes_left_foot', 'passes_right_foot', 'passes_head', 'throw_ins',
        'passes_other_body', 'passes_offsides', 'passes_oob',
        'passes_intercepted', 'passes_blocked']]


def deff(df_playersnew1):
    """Return the defensive-work columns of ``df_playersnew1``."""
    return df_playersnew1[[
        'tackles', 'tackles_won', 'tackles_def_3rd', 'tackles_mid_3rd',
        'tackles_att_3rd', 'dribble_tackles', 'dribbles_vs',
        'dribble_tackles_pct', 'dribbled_past', 'pressures',
        'pressure_regains', 'pressure_regain_pct', 'pressures_def_3rd',
        'pressures_mid_3rd', 'pressures_att_3rd', 'blocks', 'blocked_shots',
        'blocked_shots_saves', 'blocked_passes', 'interceptions',
        'clearances', 'errors']]


def poss(df_playersnew1):
    """Return the possession columns of ``df_playersnew1``."""
    return df_playersnew1[[
        'touches', 'touches_def_pen_area', 'touches_def_3rd',
        'touches_mid_3rd', 'touches_att_3rd', 'touches_att_pen_area',
        'touches_live_ball', 'dribbles_completed', 'dribbles',
        'dribbles_completed_pct', 'players_dribbled_past', 'nutmegs',
        'carries', 'carry_distance', 'carry_progressive_distance',
        'pass_targets', 'passes_received_pct', 'miscontrols', 'dispossessed']]


# Column selector for every skill except 'Overall', which keeps all columns.
_SKILL_SELECTORS = {
    'Passing': passs,
    'Creating': create,
    'Shooting': shoot,
    'Defensive work': deff,
    'Possession': poss,
}


# In[7]:

pl = np.array(df_players['player'])
te = np.array(df_players['squad'].unique())
te = np.append('Overall', sorted(te))


# In[8]:

def _weighted_player_stats(team):
    """Return the scaled player stats weighted by correlation with points.

    Each column of ``df_playersnew`` is multiplied by the correlation between
    the same-named column of the match table and ``'Points taken'``.  With
    ``team == "Overall"`` every match is used, otherwise only the rows whose
    ``'for'`` column equals ``team``.

    Raises:
        KeyError: if a player stat has no same-named column in the
            correlation matrix.
    """
    games = df if team == "Overall" else df[df['for'] == team]
    # Restrict to numeric/bool columns explicitly: DataFrame.corr() no longer
    # drops string columns such as 'result' by itself on pandas >= 2.0.
    corr_matrix = games.select_dtypes(include=['number', 'bool']).corr()
    weights = corr_matrix['Points taken'].loc[list(df_playersnew.columns)]
    # DataFrame * Series aligns the Series index with the frame's columns.
    return df_playersnew * weights


def _report_similar_players(features, player_name, number_of_results):
    """Run PCA on ``features`` and print the players closest to ``player_name``.

    ``features`` must have one row per row of ``df_players``, in the same
    order.  The intermediate frames are published as the module globals
    ``principalDf`` and ``finalDf``.

    Raises:
        ValueError: if ``player_name`` is not in ``df_players['player']``.
    """
    global principalDf
    global finalDf

    # Missing values become 0.  Standardisation was disabled in the original
    # code; presumably it would cancel the correlation weighting - TODO confirm.
    x = np.nan_to_num(features.values)

    # A float n_components keeps enough components to explain 90% of variance.
    pca = PCA(.90)
    principalDf = pd.DataFrame(data=pca.fit_transform(x))
    print('Number of PCA components:', pca.n_components_)
    print('\n')

    finalDf = pd.concat(
        [principalDf, df_players[['player', 'squad', 'position', 'age']]],
        axis=1)

    component_cols = list(principalDf.columns)
    is_target = finalDf['player'] == player_name
    if not is_target.any():
        raise ValueError('Unknown player: %r' % (player_name,))
    target = finalDf.loc[is_target, component_cols].values

    # Euclidean distance to the selected player in component space.
    finalDf['distance'] = (
        (finalDf[component_cols] - target).pow(2).sum(1).pow(0.5))
    # Scores are relative to the 95th-percentile distance, so the most
    # distant 5% of players get a negative '% match'.
    reference = finalDf['distance'].quantile(0.95)
    finalDf['% match'] = 100 - (finalDf['distance'] / reference) * 100

    # Exclude the selected player by name rather than by assuming it sorts
    # first: other players can tie at distance 0 (e.g. one sparse stat).
    others = finalDf[~is_target]
    final = others.sort_values(['distance'], ascending=[True])[:number_of_results]
    final = final.reset_index(drop=True)

    print(color.BOLD + 'List of similar players:' + color.END)
    print('\n')
    print(final[['player', 'squad', 'position', 'age', '% match']])


def find(player_name, team, skill, number_of_results):
    """Print the players most similar to ``player_name`` for one skill group.

    Args:
        player_name: value from ``df_players['player']``.
        team: a squad name to weight stats by that team's matches, or
            ``"Overall"`` to use all matches.
        skill: ``'Overall'`` (all stats) or one of ``'Passing'``,
            ``'Creating'``, ``'Shooting'``, ``'Defensive work'``,
            ``'Possession'``.
        number_of_results: how many players to list.

    Raises:
        ValueError: if ``skill`` or ``player_name`` is not recognised.
    """
    global df_playersnew1
    global df_playersnew2

    df_playersnew1 = _weighted_player_stats(team)

    if skill == 'Overall':
        df_playersnew2 = df_playersnew1
    elif skill in _SKILL_SELECTORS:
        df_playersnew2 = _SKILL_SELECTORS[skill](df_playersnew1)
    else:
        # Previously an unknown skill silently reused a stale selection.
        raise ValueError('Unknown skill: %r' % (skill,))

    _report_similar_players(df_playersnew2, player_name, number_of_results)


# In[9]:

def find2(player_name, team, stats, number_of_results):
    """Print the players most similar to ``player_name`` on chosen stats.

    Args:
        player_name: value from ``df_players['player']``.
        team: a squad name, or ``"Overall"`` to use all matches.
        stats: iterable of column names of ``df_playersnew``; when empty a
            hint is printed and nothing else happens.
        number_of_results: how many players to list.

    Raises:
        ValueError: if ``player_name`` is not recognised.
    """
    global df_playersnew1
    global df_playersnew2

    print('Stats selected:', stats)
    print('\n')

    if not stats:
        print("Choose at least one stat to see output")
        return

    df_playersnew1 = _weighted_player_stats(team)
    df_playersnew2 = df_playersnew1[list(stats)]

    _report_similar_players(df_playersnew2, player_name, number_of_results)


# In[10]:

interact(
    find,
    player_name=sorted(pl),
    team=te,
    skill=['Overall', 'Passing', 'Creating', 'Shooting', 'Defensive work',
           'Possession'],
    number_of_results=range(20, 100))


# In[11]:

# Select multiple stats with the ctrl/command key.
interact(
    find2,
    player_name=sorted(pl),
    team=te,
    stats=widgets.SelectMultiple(
        options=list(df_playersnew.columns.values), rows=10),
    number_of_results=range(20, 100))