#Importing libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#Loading,reading and storing dataset of FIFA21 players
df = pd.read_csv('fifa21_male2.csv')
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\2069660345.py:2: DtypeWarning: Columns (78) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv('fifa21_male2.csv')
#Review of data fields
df. head(5)
ID | Name | Age | OVA | Nationality | Club | BOV | BP | Position | Player Photo | Club Logo | Flag Photo | POT | Team & Contract | Height | Weight | foot | Growth | Joined | Loan Date End | Value | Wage | Release Clause | Contract | Attacking | Crossing | Finishing | Heading Accuracy | Short Passing | Volleys | Skill | Dribbling | Curve | FK Accuracy | Long Passing | Ball Control | Movement | Acceleration | Sprint Speed | Agility | Reactions | Balance | Power | Shot Power | Jumping | Stamina | Strength | Long Shots | Mentality | Aggression | Interceptions | Positioning | Vision | Penalties | Composure | Defending | Marking | Standing Tackle | Sliding Tackle | Goalkeeping | GK Diving | GK Handling | GK Kicking | GK Positioning | GK Reflexes | Total Stats | Base Stats | W/F | SM | A/W | D/W | IR | PAC | SHO | PAS | DRI | DEF | PHY | Hits | LS | ST | RS | LW | LF | CF | RF | RW | LAM | CAM | RAM | LM | LCM | CM | RCM | RM | LWB | LDM | CDM | RDM | RWB | LB | LCB | CB | RCB | RB | GK | Gender | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | G. Pasquale | 33 | 69 | Italy | Udinese | 71 | LWB | LM | https://cdn.sofifa.com/players/000/002/16_120.png | https://cdn.sofifa.com/teams/55/light_60.png | https://cdn.sofifa.com/flags/it.png | 69 | Udinese 2008 ~ 2016 | 6'0" | 181lbs | Left | 0 | Jul 1, 2008 | NaN | €625K | €7K | €0 | 2008 ~ 2016 | 313 | 75 | 50 | 59 | 71 | 58.0 | 338 | 73 | 65.0 | 60 | 69 | 71 | 347 | 68 | 74 | 68.0 | 69 | 68.0 | 347 | 74 | 68.0 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.0 | 50 | NaN | 208 | 70 | 69 | 69.0 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 ★ | 2★ | Medium | High | 2 ★ | 71 | 59 | 70 | 71 | 68 | 69 | 4 | 65+0 | 65+0 | 65+0 | 68+0 | 67+0 | 67+0 | 67+0 | 68+0 | 68+0 | 68+0 | 68+0 | 69+0 | 69+0 | 69+0 | 69+0 | 69+0 | 71+-2 | 70+-1 | 70+-1 | 70+-1 | 71+-2 | 70+-1 | 69+0 | 69+0 | 69+0 | 70+-1 | 17+0 | Male |
1 | 16 | Luis García | 37 | 71 | Spain | KAS Eupen | 70 | CM | CM CAM CDM | https://cdn.sofifa.com/players/000/016/19_120.png | https://cdn.sofifa.com/teams/2013/light_60.png | https://cdn.sofifa.com/flags/es.png | 71 | KAS Eupen 2014 ~ 2019 | 5'10" | 143lbs | Right | 0 | Jul 19, 2014 | NaN | €600K | €7K | €1.1M | 2014 ~ 2019 | 337 | 68 | 64 | 61 | 76 | 68.0 | 369 | 69 | 79.0 | 79 | 71 | 71 | 305 | 56 | 50 | 62.0 | 65 | 72.0 | 324 | 75 | 54.0 | 64 | 60 | 71 | 362 | 71 | 71.0 | 72.0 | 73.0 | 75 | 79.0 | 153 | 70 | 43 | 40.0 | 56 | 9 | 12 | 13 | 11 | 11 | 1906 | 385 | 4 ★ | 3★ | Medium | Medium | 1 ★ | 53 | 69 | 73 | 69 | 58 | 63 | 4 | 67+1 | 67+1 | 67+1 | 67+0 | 68+0 | 68+0 | 68+0 | 67+0 | 70+1 | 70+1 | 70+1 | 68+1 | 70+1 | 70+1 | 70+1 | 68+1 | 62+1 | 66+1 | 66+1 | 66+1 | 62+1 | 60+1 | 60+1 | 60+1 | 60+1 | 60+1 | 17+1 | Male |
2 | 27 | J. Cole | 33 | 71 | England | Coventry City | 71 | CAM | CAM RM RW LM | https://cdn.sofifa.com/players/000/027/16_120.png | https://cdn.sofifa.com/teams/1800/light_60.png | https://cdn.sofifa.com/flags/gb-eng.png | 71 | Coventry City 2016 ~ 2020 | 5'9" | 161lbs | Right | 0 | Jan 7, 2016 | NaN | €1.1M | €15K | €0 | 2016 ~ 2020 | 337 | 80 | 64 | 41 | 77 | 75.0 | 387 | 79 | 84.0 | 77 | 69 | 78 | 295 | 48 | 42 | 71.0 | 59 | 75.0 | 284 | 72 | 58.0 | 29 | 56 | 69 | 317 | 69 | 39.0 | 69.0 | 74.0 | 66 | NaN | 99 | 35 | 34 | 30.0 | 51 | 9 | 6 | 13 | 16 | 7 | 1770 | 354 | 4 ★ | 4★ | Medium | Low | 2 ★ | 45 | 68 | 76 | 77 | 36 | 52 | 11 | 64+0 | 64+0 | 64+0 | 70+0 | 69+0 | 69+0 | 69+0 | 70+0 | 71+0 | 71+0 | 71+0 | 68+0 | 66+0 | 66+0 | 66+0 | 68+0 | 52+0 | 54+0 | 54+0 | 54+0 | 52+0 | 47+0 | 46+0 | 46+0 | 46+0 | 47+0 | 15+0 | Male |
3 | 36 | D. Yorke | 36 | 68 | Trinidad & Tobago | Sunderland | 70 | ST | NaN | https://cdn.sofifa.com/players/000/036/09_120.png | https://cdn.sofifa.com/teams/106/light_60.png | https://cdn.sofifa.com/flags/tt.png | 82 | Sunderland 2009 | 5'11" | 165lbs | Right | 14 | NaN | NaN | €0 | €0 | €0 | 2009 | 264 | 54 | 70 | 60 | 80 | NaN | 255 | 68 | NaN | 46 | 64 | 77 | 176 | 59 | 62 | NaN | 55 | NaN | 239 | 63 | NaN | 51 | 66 | 59 | 271 | 59 | 70.0 | 72.0 | NaN | 70 | NaN | 75 | 34 | 41 | NaN | 68 | 5 | 21 | 64 | 21 | 21 | 1348 | 369 | 3 ★ | 1★ | NaN | NaN | 1 ★ | 61 | 66 | 66 | 69 | 47 | 60 | 3 | 67+0 | 67+0 | 67+0 | 66+0 | 67+0 | 67+0 | 67+0 | 66+0 | 70+0 | 70+0 | 70+0 | 66+0 | 68+0 | 68+0 | 68+0 | 66+0 | 56+0 | 65+0 | 65+0 | 65+0 | 56+0 | 57+0 | 51+0 | 51+0 | 51+0 | 57+0 | 22+0 | Male |
4 | 41 | Iniesta | 36 | 81 | Spain | Vissel Kobe | 82 | CAM | CM CAM | https://cdn.sofifa.com/players/000/041/20_120.png | https://cdn.sofifa.com/teams/101146/light_60.png | https://cdn.sofifa.com/flags/es.png | 81 | Vissel Kobe 2018 ~ 2021 | 5'7" | 150lbs | Right | 0 | Jul 16, 2018 | NaN | €5.5M | €12K | €7.2M | 2018 ~ 2021 | 367 | 75 | 69 | 54 | 90 | 79.0 | 408 | 85 | 80.0 | 70 | 83 | 90 | 346 | 61 | 56 | 79.0 | 75 | 75.0 | 297 | 67 | 40.0 | 58 | 62 | 70 | 370 | 58 | 70.0 | 78.0 | 93.0 | 71 | 89.0 | 181 | 68 | 57 | 56.0 | 45 | 6 | 13 | 6 | 13 | 7 | 2014 | 420 | 4 ★ | 4★ | High | Medium | 4 ★ | 58 | 70 | 85 | 85 | 63 | 59 | 149 | 72+3 | 72+3 | 72+3 | 79+0 | 79+0 | 79+0 | 79+0 | 79+0 | 82+-1 | 82+-1 | 82+-1 | 79+2 | 81+0 | 81+0 | 81+0 | 79+2 | 70+3 | 73+3 | 73+3 | 73+3 | 70+3 | 67+3 | 64+3 | 64+3 | 64+3 | 67+3 | 17+3 | Male |
df = df.loc[df['Club'] != 'Colombia']
#Show the DataFrame's shape
df.shape
(17115, 107)
#Printing all columns in the dataset for better understanding
def col_data():
l=df.columns.tolist()
for i in l:
print(i)
col_data()
ID Name Age OVA Nationality Club BOV BP Position Player Photo Club Logo Flag Photo POT Team & Contract Height Weight foot Growth Joined Loan Date End Value Wage Release Clause Contract Attacking Crossing Finishing Heading Accuracy Short Passing Volleys Skill Dribbling Curve FK Accuracy Long Passing Ball Control Movement Acceleration Sprint Speed Agility Reactions Balance Power Shot Power Jumping Stamina Strength Long Shots Mentality Aggression Interceptions Positioning Vision Penalties Composure Defending Marking Standing Tackle Sliding Tackle Goalkeeping GK Diving GK Handling GK Kicking GK Positioning GK Reflexes Total Stats Base Stats W/F SM A/W D/W IR PAC SHO PAS DRI DEF PHY Hits LS ST RS LW LF CF RF RW LAM CAM RAM LM LCM CM RCM RM LWB LDM CDM RDM RWB LB LCB CB RCB RB GK Gender
# I make sure there are no duplicate columns
if df.columns.duplicated().any():
print("Hay columnas repetidas.")
else:
print("No hay columnas repetidas.")
No hay columnas repetidas.
# Standardize column names
df.columns=[e.lower().replace(' ', '_') for e in df.columns]
df.columns
Index(['id', 'name', 'age', 'ova', 'nationality', 'club', 'bov', 'bp', 'position', 'player_photo', ... 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'gender'], dtype='object', length=107)
# Deleting irrelevant columns for the analysis, teAm& contrat same oinfo that club and contract
df=df.drop(columns=['player_photo', 'club_logo', 'flag_photo', 'gender', 'team_&_contract', 'contract', 'id', 'loan_date_end'],axis=1)
df
name | age | ova | nationality | club | bov | bp | position | pot | height | weight | foot | growth | joined | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | a/w | d/w | ir | pac | sho | pas | dri | def | phy | hits | ls | st | rs | lw | lf | cf | rf | rw | lam | cam | ram | lm | lcm | cm | rcm | rm | lwb | ldm | cdm | rdm | rwb | lb | lcb | cb | rcb | rb | gk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G. Pasquale | 33 | 69 | Italy | Udinese | 71 | LWB | LM | 69 | 6'0" | 181lbs | Left | 0 | Jul 1, 2008 | €625K | €7K | €0 | 313 | 75 | 50 | 59 | 71 | 58.0 | 338 | 73 | 65.0 | 60 | 69 | 71 | 347 | 68 | 74 | 68.0 | 69 | 68.0 | 347 | 74 | 68.0 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.0 | 50 | NaN | 208 | 70 | 69 | 69.0 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 ★ | 2★ | Medium | High | 2 ★ | 71 | 59 | 70 | 71 | 68 | 69 | 4 | 65+0 | 65+0 | 65+0 | 68+0 | 67+0 | 67+0 | 67+0 | 68+0 | 68+0 | 68+0 | 68+0 | 69+0 | 69+0 | 69+0 | 69+0 | 69+0 | 71+-2 | 70+-1 | 70+-1 | 70+-1 | 71+-2 | 70+-1 | 69+0 | 69+0 | 69+0 | 70+-1 | 17+0 |
1 | Luis García | 37 | 71 | Spain | KAS Eupen | 70 | CM | CM CAM CDM | 71 | 5'10" | 143lbs | Right | 0 | Jul 19, 2014 | €600K | €7K | €1.1M | 337 | 68 | 64 | 61 | 76 | 68.0 | 369 | 69 | 79.0 | 79 | 71 | 71 | 305 | 56 | 50 | 62.0 | 65 | 72.0 | 324 | 75 | 54.0 | 64 | 60 | 71 | 362 | 71 | 71.0 | 72.0 | 73.0 | 75 | 79.0 | 153 | 70 | 43 | 40.0 | 56 | 9 | 12 | 13 | 11 | 11 | 1906 | 385 | 4 ★ | 3★ | Medium | Medium | 1 ★ | 53 | 69 | 73 | 69 | 58 | 63 | 4 | 67+1 | 67+1 | 67+1 | 67+0 | 68+0 | 68+0 | 68+0 | 67+0 | 70+1 | 70+1 | 70+1 | 68+1 | 70+1 | 70+1 | 70+1 | 68+1 | 62+1 | 66+1 | 66+1 | 66+1 | 62+1 | 60+1 | 60+1 | 60+1 | 60+1 | 60+1 | 17+1 |
2 | J. Cole | 33 | 71 | England | Coventry City | 71 | CAM | CAM RM RW LM | 71 | 5'9" | 161lbs | Right | 0 | Jan 7, 2016 | €1.1M | €15K | €0 | 337 | 80 | 64 | 41 | 77 | 75.0 | 387 | 79 | 84.0 | 77 | 69 | 78 | 295 | 48 | 42 | 71.0 | 59 | 75.0 | 284 | 72 | 58.0 | 29 | 56 | 69 | 317 | 69 | 39.0 | 69.0 | 74.0 | 66 | NaN | 99 | 35 | 34 | 30.0 | 51 | 9 | 6 | 13 | 16 | 7 | 1770 | 354 | 4 ★ | 4★ | Medium | Low | 2 ★ | 45 | 68 | 76 | 77 | 36 | 52 | 11 | 64+0 | 64+0 | 64+0 | 70+0 | 69+0 | 69+0 | 69+0 | 70+0 | 71+0 | 71+0 | 71+0 | 68+0 | 66+0 | 66+0 | 66+0 | 68+0 | 52+0 | 54+0 | 54+0 | 54+0 | 52+0 | 47+0 | 46+0 | 46+0 | 46+0 | 47+0 | 15+0 |
3 | D. Yorke | 36 | 68 | Trinidad & Tobago | Sunderland | 70 | ST | NaN | 82 | 5'11" | 165lbs | Right | 14 | NaN | €0 | €0 | €0 | 264 | 54 | 70 | 60 | 80 | NaN | 255 | 68 | NaN | 46 | 64 | 77 | 176 | 59 | 62 | NaN | 55 | NaN | 239 | 63 | NaN | 51 | 66 | 59 | 271 | 59 | 70.0 | 72.0 | NaN | 70 | NaN | 75 | 34 | 41 | NaN | 68 | 5 | 21 | 64 | 21 | 21 | 1348 | 369 | 3 ★ | 1★ | NaN | NaN | 1 ★ | 61 | 66 | 66 | 69 | 47 | 60 | 3 | 67+0 | 67+0 | 67+0 | 66+0 | 67+0 | 67+0 | 67+0 | 66+0 | 70+0 | 70+0 | 70+0 | 66+0 | 68+0 | 68+0 | 68+0 | 66+0 | 56+0 | 65+0 | 65+0 | 65+0 | 56+0 | 57+0 | 51+0 | 51+0 | 51+0 | 57+0 | 22+0 |
4 | Iniesta | 36 | 81 | Spain | Vissel Kobe | 82 | CAM | CM CAM | 81 | 5'7" | 150lbs | Right | 0 | Jul 16, 2018 | €5.5M | €12K | €7.2M | 367 | 75 | 69 | 54 | 90 | 79.0 | 408 | 85 | 80.0 | 70 | 83 | 90 | 346 | 61 | 56 | 79.0 | 75 | 75.0 | 297 | 67 | 40.0 | 58 | 62 | 70 | 370 | 58 | 70.0 | 78.0 | 93.0 | 71 | 89.0 | 181 | 68 | 57 | 56.0 | 45 | 6 | 13 | 6 | 13 | 7 | 2014 | 420 | 4 ★ | 4★ | High | Medium | 4 ★ | 58 | 70 | 85 | 85 | 63 | 59 | 149 | 72+3 | 72+3 | 72+3 | 79+0 | 79+0 | 79+0 | 79+0 | 79+0 | 82+-1 | 82+-1 | 82+-1 | 79+2 | 81+0 | 81+0 | 81+0 | 79+2 | 70+3 | 73+3 | 73+3 | 73+3 | 70+3 | 67+3 | 64+3 | 64+3 | 64+3 | 67+3 | 17+3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | A. Medioub | 22 | 64 | France | CD Tondela | 66 | CB | CB | 69 | 6'6" | 165lbs | Right | 5 | Sep 29, 2020 | €550K | €2K | €0 | 197 | 35 | 23 | 66 | 47 | 26.0 | 193 | 44 | 26.0 | 33 | 34 | 56 | 243 | 52 | 55 | 46.0 | 48 | 42.0 | 279 | 48 | 70.0 | 59 | 74 | 28 | 218 | 69 | 58.0 | 23.0 | 29.0 | 39 | 46.0 | 198 | 61 | 71 | 66.0 | 39 | 11 | 8 | 8 | 5 | 7 | 1367 | 302 | 2 ★ | 2★ | Medium | Medium | 1 ★ | 54 | 30 | 37 | 48 | 64 | 69 | 5 | 43+2 | 43+2 | 43+2 | 41+0 | 41+0 | 41+0 | 41+0 | 41+0 | 41+2 | 41+2 | 41+2 | 43+2 | 44+2 | 44+2 | 44+2 | 43+2 | 54+2 | 55+2 | 55+2 | 55+2 | 54+2 | 57+2 | 64+2 | 64+2 | 64+2 | 57+2 | 12+2 |
17121 | W. Rickard | 18 | 56 | Wales | Burnley | 58 | CAM | CAM CM | 66 | 5'10" | 170lbs | Right | 10 | Aug 20, 2020 | €130K | €3K | €273K | 248 | 50 | 43 | 50 | 64 | 41.0 | 264 | 57 | 42.0 | 50 | 61 | 54 | 298 | 61 | 63 | 52.0 | 54 | 68.0 | 271 | 51 | 62.0 | 53 | 57 | 48 | 259 | 51 | 50.0 | 55.0 | 53.0 | 50 | 46.0 | 132 | 34 | 42 | 56.0 | 57 | 12 | 15 | 7 | 12 | 11 | 1529 | 319 | 3 ★ | 2★ | Medium | Medium | 1 ★ | 62 | 46 | 57 | 56 | 43 | 55 | 5 | 53+2 | 53+2 | 53+2 | 55+0 | 54+0 | 54+0 | 54+0 | 55+0 | 56+2 | 56+2 | 56+2 | 56+2 | 55+2 | 55+2 | 55+2 | 56+2 | 53+2 | 52+2 | 52+2 | 52+2 | 53+2 | 52+2 | 49+2 | 49+2 | 49+2 | 52+2 | 17+2 |
17122 | C. Barrett | 18 | 56 | England | Burnley | 56 | RB | RB | 67 | 5'9" | 154lbs | Right | 11 | Aug 20, 2020 | €130K | €2K | €273K | 218 | 55 | 34 | 53 | 42 | 34.0 | 212 | 52 | 43.0 | 34 | 36 | 47 | 308 | 66 | 61 | 54.0 | 59 | 68.0 | 233 | 28 | 61.0 | 53 | 54 | 37 | 256 | 58 | 55.0 | 49.0 | 45.0 | 49 | 43.0 | 163 | 55 | 54 | 54.0 | 40 | 8 | 7 | 5 | 12 | 8 | 1430 | 303 | 3 ★ | 2★ | Medium | Medium | 1 ★ | 63 | 35 | 44 | 52 | 54 | 55 | 3 | 46+2 | 46+2 | 46+2 | 50+0 | 47+0 | 47+0 | 47+0 | 50+0 | 47+2 | 47+2 | 47+2 | 50+2 | 46+2 | 46+2 | 46+2 | 50+2 | 54+2 | 50+2 | 50+2 | 50+2 | 54+2 | 54+2 | 54+2 | 54+2 | 54+2 | 54+2 | 14+2 |
17123 | J. Gazibegović | 20 | 62 | Bosnia Herzegovina | SK Sturm Graz | 62 | RB | RB LB | 73 | 5'9" | 148lbs | Right | 11 | Sep 30, 2020 | €475K | €3K | €784K | 223 | 62 | 26 | 52 | 60 | 23.0 | 258 | 55 | 43.0 | 46 | 57 | 57 | 349 | 72 | 70 | 74.0 | 56 | 77.0 | 261 | 51 | 72.0 | 57 | 48 | 33 | 257 | 61 | 58.0 | 46.0 | 56.0 | 36 | 54.0 | 180 | 57 | 62 | 61.0 | 51 | 15 | 13 | 8 | 6 | 9 | 1579 | 335 | 4 ★ | 2★ | High | Medium | 1 ★ | 71 | 34 | 58 | 59 | 59 | 54 | 5 | 48+2 | 48+2 | 48+2 | 55+0 | 52+0 | 52+0 | 52+0 | 55+0 | 54+2 | 54+2 | 54+2 | 56+2 | 55+2 | 55+2 | 55+2 | 56+2 | 60+2 | 58+2 | 58+2 | 58+2 | 60+2 | 60+2 | 58+2 | 58+2 | 58+2 | 60+2 | 16+2 |
17124 | Pablo Rodríguez | 18 | 59 | Spain | Lecce | 61 | ST | ST | 74 | 5'10" | 154lbs | Right | 15 | Sep 30, 2020 | €325K | €2K | €788K | 254 | 32 | 64 | 59 | 45 | 54.0 | 235 | 57 | 45.0 | 48 | 30 | 55 | 317 | 68 | 72 | 59.0 | 53 | 65.0 | 288 | 58 | 62.0 | 55 | 52 | 61 | 224 | 32 | 11.0 | 64.0 | 50.0 | 67 | 57.0 | 35 | 12 | 12 | 11.0 | 48 | 5 | 10 | 9 | 15 | 9 | 1401 | 295 | 3 ★ | 3★ | High | Medium | 1 ★ | 70 | 62 | 41 | 57 | 16 | 49 | 4 | 59+2 | 59+2 | 59+2 | 56+0 | 58+0 | 58+0 | 58+0 | 56+0 | 55+2 | 55+2 | 55+2 | 53+2 | 46+2 | 46+2 | 46+2 | 53+2 | 36+2 | 32+2 | 32+2 | 32+2 | 36+2 | 34+2 | 30+2 | 30+2 | 30+2 | 34+2 | 14+2 |
17115 rows × 99 columns
#Cheking data type
for col_name, dtype in df.dtypes.items():
print(col_name, "is", dtype)
name is object age is int64 ova is int64 nationality is object club is object bov is int64 bp is object position is object pot is int64 height is object weight is object foot is object growth is int64 joined is object value is object wage is object release_clause is object attacking is int64 crossing is int64 finishing is int64 heading_accuracy is int64 short_passing is int64 volleys is float64 skill is int64 dribbling is int64 curve is float64 fk_accuracy is int64 long_passing is int64 ball_control is int64 movement is int64 acceleration is int64 sprint_speed is int64 agility is float64 reactions is int64 balance is float64 power is int64 shot_power is int64 jumping is float64 stamina is int64 strength is int64 long_shots is int64 mentality is int64 aggression is int64 interceptions is float64 positioning is float64 vision is float64 penalties is int64 composure is float64 defending is int64 marking is int64 standing_tackle is int64 sliding_tackle is float64 goalkeeping is int64 gk_diving is int64 gk_handling is int64 gk_kicking is int64 gk_positioning is int64 gk_reflexes is int64 total_stats is int64 base_stats is int64 w/f is object sm is object a/w is object d/w is object ir is object pac is int64 sho is int64 pas is int64 dri is int64 def is int64 phy is int64 hits is object ls is object st is object rs is object lw is object lf is object cf is object rf is object rw is object lam is object cam is object ram is object lm is object lcm is object cm is object rcm is object rm is object lwb is object ldm is object cdm is object rdm is object rwb is object lb is object lcb is object cb is object rcb is object rb is object gk is object
#Converting weight, hits, height into numerical type
df['weight'] = df['weight'].str.extract('(\d+)').astype(int)
df['weight']
0 181 1 143 2 161 3 165 4 150 ... 17120 165 17121 170 17122 154 17123 148 17124 154 Name: weight, Length: 17115, dtype: int32
df['height'] = df['height'].str.extract('(\d+)').astype(int)
df['height']
0 6 1 5 2 5 3 5 4 5 .. 17120 6 17121 5 17122 5 17123 5 17124 5 Name: height, Length: 17115, dtype: int32
valores_unicos = df['hits'].unique()
print(valores_unicos)
['4' '11' '3' '149' '7' '9' '16' '10' '5' '8' '154' '6' '18' '22' '34' '19' '15' '14' '27' '21' '309' '32' '151' '750' '13' '39' '83' '20' '442' '92' '23' '29' '30' '12' '31' '297' '36' '90' '88' '24' '171' '40' '25' '60' '103' '38' '82' '57' '17' '26' '44' '67' '571' '61' '68' '37' '231' '58' '62' '28' '51' '45' '80' '162' '59' '138' '220' '93' '47' '281' '33' '46' '35' '830' '43' '64' '49' '48' '54' '259' '161' '74' '434' '42' '131' '222' '229' '125' '142' '81' '128' '78' '109' '72' '75' '111' '63' '123' '136' '89' '55' '663' '97' '41' '143' '73' '464' '232' '164' '212' '129' '79' '107' '69' '117' '71' '101' '202' '140' '110' '66' '6.3K' '122' '95' '189' '157' '215' '52' '248' '178' '238' '208' '214' '407' '1.2K' '349' '180' '224' '155' '147' '98' '99' '152' '148' '120' '167' '158' '70' '328' '183' '56' '141' '173' '132' '87' '65' '313' '223' '487' '401' '96' '647' '76' '439' '177' '112' '100' '303' '86' '134' '233' '347' '195' '85' '546' '53' '91' '221' '213' '367' '166' '118' '50' '243' '406' '179' '153' '363' '113' '174' '193' '446' '472' '364' '116' '160' '529' '217' '4.1K' '292' '84' '176' '199' '197' '594' '508' '105' '511' '146' '102' '362' '1.9K' '226' '156' '182' '270' '307' '196' '235' '310' '119' '108' '420' '200' '211' '608' '341' '302' '413' '172' '204' '321' '127' '77' '279' '181' '2.3K' '201' '135' '389' '249' '227' '206' '284' '359' '163' '447' '115' '277' '219' '595' '477' '504' '253' '170' '114' '165' '257' '417' '293' '324' '264' '274' '124' '190' '391' '374' '186' '386' '256' '382' '515' '326' '553' '655' '453' '175' '247' '478' '185' '192' '144' '104' '139' '433' '312' '168' '334' '258' '488' '203' '188' '632' '261' '393' '509' '280' '777' '1.1K' '228' '299' '344' '191' '241' '1K' '262' '294' '205' '282' '234' '298' '456' '342' '230' '1.3K' '240' '337' '210' '94' '236' '377' '357' '338' '378' '395' '314' '405' '2.9K' '242' '287' '278' '441' '187' '516' '431' '1.6K' '829' '169' '184' '661' '465' '239' '130' '304' '296' '557' '133' '519' '250' '225' '106' '569' '255' '271' '436' '365' '772' '137' '868' '290' '343' '479' '662' '435' '332' '437' '412' '150' '273' '660' '491' '370' '565' '305' '457' '252' '246' '354' '245' '322' '574' '300' '674' '598' '455' '260' '350' '556' '316' '194' '353' '418' '679' '572' '410' '730' '371' '358' '145' '770' '462' '751' '466' '672' '484' '330' '1.5K' '468' '4.9K' '121' '823' '159' '1.7K' '198' '263' '600' '2K' '315' '545' '266' '340' '555' '416' '301' '854' '524' '3.6K' '403' '552' '295' '380' '743' '825' '520' '562' '311' '388' '209' '817' '460' '448' '761' '345' '526' '579' '538' '887' '3.8K' '959' '498' '360' '275' '396' '792' '366' '254' '858' '440' '764' '860' '573' '604' '318' '361' '352' '535' '928' '268' '724' '683' '542' '265' '306' '308' '430' '709' '383' '932' '390' '821' '402' '461' '583' '601' '593' '3K' '722' '689' '2.7K' '715' '381' '5K' '771' '346' '901' '2.6K' '2.2K' '335' '670' '812' '1.4K' '368' '2.1K' '974' '649' '495' '776' '684' '957' '961' '850' '473' '748' '445' '611' '3.3K' '289' '615' '2.8K' '596' '9.2K' '272' '934' '784' '1.8K' '738' '291' 3 70 4 165 5 8 7 16 11 324 6 44 15 10 34 96 111 27 14 12 28 30 9 162 22 26 117 127 29 93 43 18 123 239 20 36 45 17 151 23 54 83 262 35 32 19 21 720 13 76 42 78 69 274 67 60 52 25 55 227 249 150 59 37 79]
def clean_hits(x):
x= str(x)
if x.endswith('K'):
x=float(x.replace('K',''))
x=x*1000
return x
x= '1.8K'
test= clean_hits(x)
test
1800.0
df['hits'] = df['hits'].apply(clean_hits)
df['hits'] = pd.to_numeric(df['hits'])
df.hits.dtypes
dtype('float64')
df.head(5)
name | age | ova | nationality | club | bov | bp | position | pot | height | weight | foot | growth | joined | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | a/w | d/w | ir | pac | sho | pas | dri | def | phy | hits | ls | st | rs | lw | lf | cf | rf | rw | lam | cam | ram | lm | lcm | cm | rcm | rm | lwb | ldm | cdm | rdm | rwb | lb | lcb | cb | rcb | rb | gk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G. Pasquale | 33 | 69 | Italy | Udinese | 71 | LWB | LM | 69 | 6 | 181 | Left | 0 | Jul 1, 2008 | €625K | €7K | €0 | 313 | 75 | 50 | 59 | 71 | 58.0 | 338 | 73 | 65.0 | 60 | 69 | 71 | 347 | 68 | 74 | 68.0 | 69 | 68.0 | 347 | 74 | 68.0 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.0 | 50 | NaN | 208 | 70 | 69 | 69.0 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 ★ | 2★ | Medium | High | 2 ★ | 71 | 59 | 70 | 71 | 68 | 69 | 4.0 | 65+0 | 65+0 | 65+0 | 68+0 | 67+0 | 67+0 | 67+0 | 68+0 | 68+0 | 68+0 | 68+0 | 69+0 | 69+0 | 69+0 | 69+0 | 69+0 | 71+-2 | 70+-1 | 70+-1 | 70+-1 | 71+-2 | 70+-1 | 69+0 | 69+0 | 69+0 | 70+-1 | 17+0 |
1 | Luis García | 37 | 71 | Spain | KAS Eupen | 70 | CM | CM CAM CDM | 71 | 5 | 143 | Right | 0 | Jul 19, 2014 | €600K | €7K | €1.1M | 337 | 68 | 64 | 61 | 76 | 68.0 | 369 | 69 | 79.0 | 79 | 71 | 71 | 305 | 56 | 50 | 62.0 | 65 | 72.0 | 324 | 75 | 54.0 | 64 | 60 | 71 | 362 | 71 | 71.0 | 72.0 | 73.0 | 75 | 79.0 | 153 | 70 | 43 | 40.0 | 56 | 9 | 12 | 13 | 11 | 11 | 1906 | 385 | 4 ★ | 3★ | Medium | Medium | 1 ★ | 53 | 69 | 73 | 69 | 58 | 63 | 4.0 | 67+1 | 67+1 | 67+1 | 67+0 | 68+0 | 68+0 | 68+0 | 67+0 | 70+1 | 70+1 | 70+1 | 68+1 | 70+1 | 70+1 | 70+1 | 68+1 | 62+1 | 66+1 | 66+1 | 66+1 | 62+1 | 60+1 | 60+1 | 60+1 | 60+1 | 60+1 | 17+1 |
2 | J. Cole | 33 | 71 | England | Coventry City | 71 | CAM | CAM RM RW LM | 71 | 5 | 161 | Right | 0 | Jan 7, 2016 | €1.1M | €15K | €0 | 337 | 80 | 64 | 41 | 77 | 75.0 | 387 | 79 | 84.0 | 77 | 69 | 78 | 295 | 48 | 42 | 71.0 | 59 | 75.0 | 284 | 72 | 58.0 | 29 | 56 | 69 | 317 | 69 | 39.0 | 69.0 | 74.0 | 66 | NaN | 99 | 35 | 34 | 30.0 | 51 | 9 | 6 | 13 | 16 | 7 | 1770 | 354 | 4 ★ | 4★ | Medium | Low | 2 ★ | 45 | 68 | 76 | 77 | 36 | 52 | 11.0 | 64+0 | 64+0 | 64+0 | 70+0 | 69+0 | 69+0 | 69+0 | 70+0 | 71+0 | 71+0 | 71+0 | 68+0 | 66+0 | 66+0 | 66+0 | 68+0 | 52+0 | 54+0 | 54+0 | 54+0 | 52+0 | 47+0 | 46+0 | 46+0 | 46+0 | 47+0 | 15+0 |
3 | D. Yorke | 36 | 68 | Trinidad & Tobago | Sunderland | 70 | ST | NaN | 82 | 5 | 165 | Right | 14 | NaN | €0 | €0 | €0 | 264 | 54 | 70 | 60 | 80 | NaN | 255 | 68 | NaN | 46 | 64 | 77 | 176 | 59 | 62 | NaN | 55 | NaN | 239 | 63 | NaN | 51 | 66 | 59 | 271 | 59 | 70.0 | 72.0 | NaN | 70 | NaN | 75 | 34 | 41 | NaN | 68 | 5 | 21 | 64 | 21 | 21 | 1348 | 369 | 3 ★ | 1★ | NaN | NaN | 1 ★ | 61 | 66 | 66 | 69 | 47 | 60 | 3.0 | 67+0 | 67+0 | 67+0 | 66+0 | 67+0 | 67+0 | 67+0 | 66+0 | 70+0 | 70+0 | 70+0 | 66+0 | 68+0 | 68+0 | 68+0 | 66+0 | 56+0 | 65+0 | 65+0 | 65+0 | 56+0 | 57+0 | 51+0 | 51+0 | 51+0 | 57+0 | 22+0 |
4 | Iniesta | 36 | 81 | Spain | Vissel Kobe | 82 | CAM | CM CAM | 81 | 5 | 150 | Right | 0 | Jul 16, 2018 | €5.5M | €12K | €7.2M | 367 | 75 | 69 | 54 | 90 | 79.0 | 408 | 85 | 80.0 | 70 | 83 | 90 | 346 | 61 | 56 | 79.0 | 75 | 75.0 | 297 | 67 | 40.0 | 58 | 62 | 70 | 370 | 58 | 70.0 | 78.0 | 93.0 | 71 | 89.0 | 181 | 68 | 57 | 56.0 | 45 | 6 | 13 | 6 | 13 | 7 | 2014 | 420 | 4 ★ | 4★ | High | Medium | 4 ★ | 58 | 70 | 85 | 85 | 63 | 59 | 149.0 | 72+3 | 72+3 | 72+3 | 79+0 | 79+0 | 79+0 | 79+0 | 79+0 | 82+-1 | 82+-1 | 82+-1 | 79+2 | 81+0 | 81+0 | 81+0 | 79+2 | 70+3 | 73+3 | 73+3 | 73+3 | 70+3 | 67+3 | 64+3 | 64+3 | 64+3 | 67+3 | 17+3 |
# Numerical Data Cleaning
financials = ['value', 'wage', 'release_clause']
def clean_value(x):
x= str(x).replace('€','')
if x.endswith('K'):
x=float(x.replace('K','000').replace('€',''))
elif x.endswith('M'):
x = float(x.replace('M','').replace('€',''))
x= x*1000000
return x
df['value']= df['value'].apply(clean_value)
df['wage']= df['wage'].apply(clean_value)
df['release_clause']= df['release_clause'].apply(clean_value)
type(x)
str
df['value'] = pd.to_numeric(df['value'])
df['wage'] = pd.to_numeric(df['wage'])
df['release_clause'] = pd.to_numeric(df['release_clause'])
df.head(1)
name | age | ova | nationality | club | bov | bp | position | pot | height | weight | foot | growth | joined | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | a/w | d/w | ir | pac | sho | pas | dri | def | phy | hits | ls | st | rs | lw | lf | cf | rf | rw | lam | cam | ram | lm | lcm | cm | rcm | rm | lwb | ldm | cdm | rdm | rwb | lb | lcb | cb | rcb | rb | gk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G. Pasquale | 33 | 69 | Italy | Udinese | 71 | LWB | LM | 69 | 6 | 181 | Left | 0 | Jul 1, 2008 | 625000.0 | 7000.0 | 0.0 | 313 | 75 | 50 | 59 | 71 | 58.0 | 338 | 73 | 65.0 | 60 | 69 | 71 | 347 | 68 | 74 | 68.0 | 69 | 68.0 | 347 | 74 | 68.0 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.0 | 50 | NaN | 208 | 70 | 69 | 69.0 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 ★ | 2★ | Medium | High | 2 ★ | 71 | 59 | 70 | 71 | 68 | 69 | 4.0 | 65+0 | 65+0 | 65+0 | 68+0 | 67+0 | 67+0 | 67+0 | 68+0 | 68+0 | 68+0 | 68+0 | 69+0 | 69+0 | 69+0 | 69+0 | 69+0 | 71+-2 | 70+-1 | 70+-1 | 70+-1 | 71+-2 | 70+-1 | 69+0 | 69+0 | 69+0 | 70+-1 | 17+0 |
#Remover estrellas
star_columns = ['w/f', 'sm', 'ir']
# Check unique values
for column in star_columns:
print(df[column].unique())
['3 ★' '4 ★' '2 ★' '1 ★' '5 ★'] ['2★' '3★' '4★' '1★' '5★'] ['2 ★' '1 ★' '4 ★' '3 ★' '5 ★']
# Select only the first character from the string, then convert the data type to integer
# and check the operation was successful
for column in star_columns:
df[column] = df[column].str[0]
df[column] = pd.to_numeric(df[column], errors='raise')
print(df[column].dtypes)
int64 int64 int64
df.head(1)
name | age | ova | nationality | club | bov | bp | position | pot | height | weight | foot | growth | joined | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | a/w | d/w | ir | pac | sho | pas | dri | def | phy | hits | ls | st | rs | lw | lf | cf | rf | rw | lam | cam | ram | lm | lcm | cm | rcm | rm | lwb | ldm | cdm | rdm | rwb | lb | lcb | cb | rcb | rb | gk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G. Pasquale | 33 | 69 | Italy | Udinese | 71 | LWB | LM | 69 | 6 | 181 | Left | 0 | Jul 1, 2008 | 625000.0 | 7000.0 | 0.0 | 313 | 75 | 50 | 59 | 71 | 58.0 | 338 | 73 | 65.0 | 60 | 69 | 71 | 347 | 68 | 74 | 68.0 | 69 | 68.0 | 347 | 74 | 68.0 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.0 | 50 | NaN | 208 | 70 | 69 | 69.0 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 | 2 | Medium | High | 2 | 71 | 59 | 70 | 71 | 68 | 69 | 4.0 | 65+0 | 65+0 | 65+0 | 68+0 | 67+0 | 67+0 | 67+0 | 68+0 | 68+0 | 68+0 | 68+0 | 69+0 | 69+0 | 69+0 | 69+0 | 69+0 | 71+-2 | 70+-1 | 70+-1 | 70+-1 | 71+-2 | 70+-1 | 69+0 | 69+0 | 69+0 | 70+-1 | 17+0 |
#Checking for duplicate rows in the data to remove if any
df.duplicated().sum()
0
#Checking null values, I will remove columns have more than 75%
print("Nulls")
for i in df.columns:
print(i, "contains", 100*sum(df[i].isnull()) / len(df[i]) ,"%")
Nulls name contains 0.0 % age contains 0.0 % ova contains 0.0 % nationality contains 0.0 % club contains 0.13438504236050247 % bov contains 0.0 % bp contains 0.0 % position contains 2.40724510663161 % pot contains 0.0 % height contains 0.0 % weight contains 0.0 % foot contains 0.0 % growth contains 0.0 % joined contains 0.33888401986561495 % value contains 0.0 % wage contains 0.0 % release_clause contains 0.0 % attacking contains 0.0 % crossing contains 0.0 % finishing contains 0.0 % heading_accuracy contains 0.0 % short_passing contains 0.0 % volleys contains 0.33888401986561495 % skill contains 0.0 % dribbling contains 0.0 % curve contains 0.33888401986561495 % fk_accuracy contains 0.0 % long_passing contains 0.0 % ball_control contains 0.0 % movement contains 0.0 % acceleration contains 0.0 % sprint_speed contains 0.0 % agility contains 0.33888401986561495 % reactions contains 0.0 % balance contains 0.33888401986561495 % power contains 0.0 % shot_power contains 0.0 % jumping contains 0.33888401986561495 % stamina contains 0.0 % strength contains 0.0 % long_shots contains 0.0 % mentality contains 0.0 % aggression contains 0.0 % interceptions contains 0.0408997955010225 % positioning contains 0.0408997955010225 % vision contains 0.33888401986561495 % penalties contains 0.0 % composure contains 2.4656733859187847 % defending contains 0.0 % marking contains 0.0 % standing_tackle contains 0.0 % sliding_tackle contains 0.33888401986561495 % goalkeeping contains 0.0 % gk_diving contains 0.0 % gk_handling contains 0.0 % gk_kicking contains 0.0 % gk_positioning contains 0.0 % gk_reflexes contains 0.0 % total_stats contains 0.0 % base_stats contains 0.0 % w/f contains 0.0 % sm contains 0.0 % a/w contains 0.5200116856558574 % d/w contains 0.5200116856558574 % ir contains 0.0 % pac contains 0.0 % sho contains 0.0 % pas contains 0.0 % dri contains 0.0 % def contains 0.0 % phy contains 0.0 % hits contains 0.0 % ls contains 0.0 % st contains 0.0 % rs contains 0.0 % lw contains 0.0 % lf contains 0.0 % cf contains 0.0 % rf contains 0.0 % rw contains 0.0 % lam contains 0.0 % cam contains 0.0 % ram contains 0.0 % lm contains 0.0 % lcm contains 0.0 % cm contains 0.0 % rcm contains 0.0 % rm contains 0.0 % lwb contains 0.0 % ldm contains 0.0 % cdm contains 0.0 % rdm contains 0.0 % rwb contains 0.0 % lb contains 0.0 % lcb contains 0.0 % cb contains 0.0 % rcb contains 0.0 % rb contains 0.0 % gk contains 0.0 %
#Split data into categorical and numerical to deal with nulls and check unique values
df_cat=df.select_dtypes(include=['object'])
df_cat.head(5)
name | nationality | club | bp | position | foot | joined | a/w | d/w | ls | st | rs | lw | lf | cf | rf | rw | lam | cam | ram | lm | lcm | cm | rcm | rm | lwb | ldm | cdm | rdm | rwb | lb | lcb | cb | rcb | rb | gk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | G. Pasquale | Italy | Udinese | LWB | LM | Left | Jul 1, 2008 | Medium | High | 65+0 | 65+0 | 65+0 | 68+0 | 67+0 | 67+0 | 67+0 | 68+0 | 68+0 | 68+0 | 68+0 | 69+0 | 69+0 | 69+0 | 69+0 | 69+0 | 71+-2 | 70+-1 | 70+-1 | 70+-1 | 71+-2 | 70+-1 | 69+0 | 69+0 | 69+0 | 70+-1 | 17+0 |
1 | Luis García | Spain | KAS Eupen | CM | CM CAM CDM | Right | Jul 19, 2014 | Medium | Medium | 67+1 | 67+1 | 67+1 | 67+0 | 68+0 | 68+0 | 68+0 | 67+0 | 70+1 | 70+1 | 70+1 | 68+1 | 70+1 | 70+1 | 70+1 | 68+1 | 62+1 | 66+1 | 66+1 | 66+1 | 62+1 | 60+1 | 60+1 | 60+1 | 60+1 | 60+1 | 17+1 |
2 | J. Cole | England | Coventry City | CAM | CAM RM RW LM | Right | Jan 7, 2016 | Medium | Low | 64+0 | 64+0 | 64+0 | 70+0 | 69+0 | 69+0 | 69+0 | 70+0 | 71+0 | 71+0 | 71+0 | 68+0 | 66+0 | 66+0 | 66+0 | 68+0 | 52+0 | 54+0 | 54+0 | 54+0 | 52+0 | 47+0 | 46+0 | 46+0 | 46+0 | 47+0 | 15+0 |
3 | D. Yorke | Trinidad & Tobago | Sunderland | ST | NaN | Right | NaN | NaN | NaN | 67+0 | 67+0 | 67+0 | 66+0 | 67+0 | 67+0 | 67+0 | 66+0 | 70+0 | 70+0 | 70+0 | 66+0 | 68+0 | 68+0 | 68+0 | 66+0 | 56+0 | 65+0 | 65+0 | 65+0 | 56+0 | 57+0 | 51+0 | 51+0 | 51+0 | 57+0 | 22+0 |
4 | Iniesta | Spain | Vissel Kobe | CAM | CM CAM | Right | Jul 16, 2018 | High | Medium | 72+3 | 72+3 | 72+3 | 79+0 | 79+0 | 79+0 | 79+0 | 79+0 | 82+-1 | 82+-1 | 82+-1 | 79+2 | 81+0 | 81+0 | 81+0 | 79+2 | 70+3 | 73+3 | 73+3 | 73+3 | 70+3 | 67+3 | 64+3 | 64+3 | 64+3 | 67+3 | 17+3 |
df_cat.columns
Index(['name', 'nationality', 'club', 'bp', 'position', 'foot', 'joined', 'a/w', 'd/w', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk'], dtype='object')
#Delete column with numeric values and name because is not relevant for the analysis
df_cat=df_cat.drop(columns=['name', 'joined',
'st', 'rs',
'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm',
'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
'rcb', 'rb', 'gk','ls'], axis=1)
df_cat
nationality | club | bp | position | foot | a/w | d/w | |
---|---|---|---|---|---|---|---|
0 | Italy | Udinese | LWB | LM | Left | Medium | High |
1 | Spain | KAS Eupen | CM | CM CAM CDM | Right | Medium | Medium |
2 | England | Coventry City | CAM | CAM RM RW LM | Right | Medium | Low |
3 | Trinidad & Tobago | Sunderland | ST | NaN | Right | NaN | NaN |
4 | Spain | Vissel Kobe | CAM | CM CAM | Right | High | Medium |
... | ... | ... | ... | ... | ... | ... | ... |
17120 | France | CD Tondela | CB | CB | Right | Medium | Medium |
17121 | Wales | Burnley | CAM | CAM CM | Right | Medium | Medium |
17122 | England | Burnley | RB | RB | Right | Medium | Medium |
17123 | Bosnia Herzegovina | SK Sturm Graz | RB | RB LB | Right | High | Medium |
17124 | Spain | Lecce | ST | ST | Right | High | Medium |
17115 rows × 7 columns
#Checking unique values
df_cat.value_counts()
nationality club bp position foot a/w d/w Germany Türkgücü München CB CB Right Medium Medium 4 Norway Rosenborg BK GK GK Right Medium Medium 4 Germany Hertha BSC GK GK Right Medium Medium 4 Japan Vissel Kobe GK GK Right Medium Medium 4 Germany Fortuna Düsseldorf GK GK Right Medium Medium 4 .. England Stoke City CB CB Right Medium High 1 CDM CDM CM Right Low High 1 RB Right Medium Medium 1 CM CM CAM CDM Right Medium Medium 1 Zimbabwe Yeni Malatyaspor CB CB Left Medium Medium 1 Length: 15764, dtype: int64
#chequeamso si hay valor unico para alguna columna, no lo hay porque ya elimamos gender
def check_unique_values(df_cat):
single_value_columns = []
for column in df:
if len(df[column].unique()) == 1:
single_value_columns.append(column)
return single_value_columns
check_unique_values(df_cat)
[]
#we check null for categorical
df_cat.isnull().sum()
nationality 0 club 23 bp 0 position 412 foot 0 a/w 89 d/w 89 dtype: int64
#Fill null categorical values with mode
df_cat['club'].fillna(df_cat['club'].mode()[0], inplace=True)
df_cat['position'].fillna(df_cat['position'].mode()[0], inplace=True)
df_cat['a/w'].fillna(df_cat['a/w'].mode()[0], inplace=True)
df_cat['d/w'].fillna(df_cat['d/w'].mode()[0], inplace=True)
df_cat.isnull().sum()
nationality 0 club 0 bp 0 position 0 foot 0 a/w 0 d/w 0 dtype: int64
#Save numerical data
df_num= df.select_dtypes(exclude=["object"])
df_num
age | ova | bov | pot | height | weight | growth | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | ir | pac | sho | pas | dri | def | phy | hits | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | 69 | 71 | 69 | 6 | 181 | 0 | 625000.0 | 7000.0 | 0.0 | 313 | 75 | 50 | 59 | 71 | 58.0 | 338 | 73 | 65.0 | 60 | 69 | 71 | 347 | 68 | 74 | 68.0 | 69 | 68.0 | 347 | 74 | 68.0 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.0 | 50 | NaN | 208 | 70 | 69 | 69.0 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 | 2 | 2 | 71 | 59 | 70 | 71 | 68 | 69 | 4.0 |
1 | 37 | 71 | 70 | 71 | 5 | 143 | 0 | 600000.0 | 7000.0 | 1100000.0 | 337 | 68 | 64 | 61 | 76 | 68.0 | 369 | 69 | 79.0 | 79 | 71 | 71 | 305 | 56 | 50 | 62.0 | 65 | 72.0 | 324 | 75 | 54.0 | 64 | 60 | 71 | 362 | 71 | 71.0 | 72.0 | 73.0 | 75 | 79.0 | 153 | 70 | 43 | 40.0 | 56 | 9 | 12 | 13 | 11 | 11 | 1906 | 385 | 4 | 3 | 1 | 53 | 69 | 73 | 69 | 58 | 63 | 4.0 |
2 | 33 | 71 | 71 | 71 | 5 | 161 | 0 | 1100000.0 | 15000.0 | 0.0 | 337 | 80 | 64 | 41 | 77 | 75.0 | 387 | 79 | 84.0 | 77 | 69 | 78 | 295 | 48 | 42 | 71.0 | 59 | 75.0 | 284 | 72 | 58.0 | 29 | 56 | 69 | 317 | 69 | 39.0 | 69.0 | 74.0 | 66 | NaN | 99 | 35 | 34 | 30.0 | 51 | 9 | 6 | 13 | 16 | 7 | 1770 | 354 | 4 | 4 | 2 | 45 | 68 | 76 | 77 | 36 | 52 | 11.0 |
3 | 36 | 68 | 70 | 82 | 5 | 165 | 14 | 0.0 | 0.0 | 0.0 | 264 | 54 | 70 | 60 | 80 | NaN | 255 | 68 | NaN | 46 | 64 | 77 | 176 | 59 | 62 | NaN | 55 | NaN | 239 | 63 | NaN | 51 | 66 | 59 | 271 | 59 | 70.0 | 72.0 | NaN | 70 | NaN | 75 | 34 | 41 | NaN | 68 | 5 | 21 | 64 | 21 | 21 | 1348 | 369 | 3 | 1 | 1 | 61 | 66 | 66 | 69 | 47 | 60 | 3.0 |
4 | 36 | 81 | 82 | 81 | 5 | 150 | 0 | 5500000.0 | 12000.0 | 7200000.0 | 367 | 75 | 69 | 54 | 90 | 79.0 | 408 | 85 | 80.0 | 70 | 83 | 90 | 346 | 61 | 56 | 79.0 | 75 | 75.0 | 297 | 67 | 40.0 | 58 | 62 | 70 | 370 | 58 | 70.0 | 78.0 | 93.0 | 71 | 89.0 | 181 | 68 | 57 | 56.0 | 45 | 6 | 13 | 6 | 13 | 7 | 2014 | 420 | 4 | 4 | 4 | 58 | 70 | 85 | 85 | 63 | 59 | 149.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22 | 64 | 66 | 69 | 6 | 165 | 5 | 550000.0 | 2000.0 | 0.0 | 197 | 35 | 23 | 66 | 47 | 26.0 | 193 | 44 | 26.0 | 33 | 34 | 56 | 243 | 52 | 55 | 46.0 | 48 | 42.0 | 279 | 48 | 70.0 | 59 | 74 | 28 | 218 | 69 | 58.0 | 23.0 | 29.0 | 39 | 46.0 | 198 | 61 | 71 | 66.0 | 39 | 11 | 8 | 8 | 5 | 7 | 1367 | 302 | 2 | 2 | 1 | 54 | 30 | 37 | 48 | 64 | 69 | 5.0 |
17121 | 18 | 56 | 58 | 66 | 5 | 170 | 10 | 130000.0 | 3000.0 | 273000.0 | 248 | 50 | 43 | 50 | 64 | 41.0 | 264 | 57 | 42.0 | 50 | 61 | 54 | 298 | 61 | 63 | 52.0 | 54 | 68.0 | 271 | 51 | 62.0 | 53 | 57 | 48 | 259 | 51 | 50.0 | 55.0 | 53.0 | 50 | 46.0 | 132 | 34 | 42 | 56.0 | 57 | 12 | 15 | 7 | 12 | 11 | 1529 | 319 | 3 | 2 | 1 | 62 | 46 | 57 | 56 | 43 | 55 | 5.0 |
17122 | 18 | 56 | 56 | 67 | 5 | 154 | 11 | 130000.0 | 2000.0 | 273000.0 | 218 | 55 | 34 | 53 | 42 | 34.0 | 212 | 52 | 43.0 | 34 | 36 | 47 | 308 | 66 | 61 | 54.0 | 59 | 68.0 | 233 | 28 | 61.0 | 53 | 54 | 37 | 256 | 58 | 55.0 | 49.0 | 45.0 | 49 | 43.0 | 163 | 55 | 54 | 54.0 | 40 | 8 | 7 | 5 | 12 | 8 | 1430 | 303 | 3 | 2 | 1 | 63 | 35 | 44 | 52 | 54 | 55 | 3.0 |
17123 | 20 | 62 | 62 | 73 | 5 | 148 | 11 | 475000.0 | 3000.0 | 784000.0 | 223 | 62 | 26 | 52 | 60 | 23.0 | 258 | 55 | 43.0 | 46 | 57 | 57 | 349 | 72 | 70 | 74.0 | 56 | 77.0 | 261 | 51 | 72.0 | 57 | 48 | 33 | 257 | 61 | 58.0 | 46.0 | 56.0 | 36 | 54.0 | 180 | 57 | 62 | 61.0 | 51 | 15 | 13 | 8 | 6 | 9 | 1579 | 335 | 4 | 2 | 1 | 71 | 34 | 58 | 59 | 59 | 54 | 5.0 |
17124 | 18 | 59 | 61 | 74 | 5 | 154 | 15 | 325000.0 | 2000.0 | 788000.0 | 254 | 32 | 64 | 59 | 45 | 54.0 | 235 | 57 | 45.0 | 48 | 30 | 55 | 317 | 68 | 72 | 59.0 | 53 | 65.0 | 288 | 58 | 62.0 | 55 | 52 | 61 | 224 | 32 | 11.0 | 64.0 | 50.0 | 67 | 57.0 | 35 | 12 | 12 | 11.0 | 48 | 5 | 10 | 9 | 15 | 9 | 1401 | 295 | 3 | 3 | 1 | 70 | 62 | 41 | 57 | 16 | 49 | 4.0 |
17115 rows × 63 columns
#check null values DESC order
df_num.isnull().sum().sort_values(ascending=False)
composure 422 jumping 58 vision 58 volleys 58 balance 58 ... reactions 0 power 0 shot_power 0 ova 0 hits 0 Length: 63, dtype: int64
#Fill nules with media
df_num['composure']=df_num['composure'].fillna(np.mean(df_num['composure']))
df_num['curve'] = df_num['curve'].fillna(np.mean(df_num['curve']))
df_num['jumping']=df_num['jumping'].fillna(np.mean(df_num['jumping']))
df_num['vision']=df_num['vision'].fillna(np.mean(df_num['vision']))
df_num['balance']=df_num['balance'].fillna(np.mean(df_num['balance']))
df_num['agility']= df_num['agility'].fillna(np.mean(df_num['agility']))
df_num['volleys']= df_num['volleys'].fillna(np.mean(df_num['volleys']))
df_num['sliding_tackle']=df_num['sliding_tackle'].fillna(np.mean(df_num['sliding_tackle']))
df_num['interceptions']=df_num['interceptions'].fillna(np.mean(df_num['interceptions']))
df_num['positioning']=df_num['positioning'].fillna(np.mean(df_num['positioning']))
df_num
age | ova | bov | pot | height | weight | growth | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | ir | pac | sho | pas | dri | def | phy | hits | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | 69 | 71 | 69 | 6 | 181 | 0 | 625000.0 | 7000.0 | 0.0 | 313 | 75 | 50 | 59 | 71 | 58.000000 | 338 | 73 | 65.000000 | 60 | 69 | 71 | 347 | 68 | 74 | 68.000000 | 69 | 68.000000 | 347 | 74 | 68.000000 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.000000 | 50 | 59.937998 | 208 | 70 | 69 | 69.000000 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 | 2 | 2 | 71 | 59 | 70 | 71 | 68 | 69 | 4.0 |
1 | 37 | 71 | 70 | 71 | 5 | 143 | 0 | 600000.0 | 7000.0 | 1100000.0 | 337 | 68 | 64 | 61 | 76 | 68.000000 | 369 | 69 | 79.000000 | 79 | 71 | 71 | 305 | 56 | 50 | 62.000000 | 65 | 72.000000 | 324 | 75 | 54.000000 | 64 | 60 | 71 | 362 | 71 | 71.0 | 72.0 | 73.000000 | 75 | 79.000000 | 153 | 70 | 43 | 40.000000 | 56 | 9 | 12 | 13 | 11 | 11 | 1906 | 385 | 4 | 3 | 1 | 53 | 69 | 73 | 69 | 58 | 63 | 4.0 |
2 | 33 | 71 | 71 | 71 | 5 | 161 | 0 | 1100000.0 | 15000.0 | 0.0 | 337 | 80 | 64 | 41 | 77 | 75.000000 | 387 | 79 | 84.000000 | 77 | 69 | 78 | 295 | 48 | 42 | 71.000000 | 59 | 75.000000 | 284 | 72 | 58.000000 | 29 | 56 | 69 | 317 | 69 | 39.0 | 69.0 | 74.000000 | 66 | 59.937998 | 99 | 35 | 34 | 30.000000 | 51 | 9 | 6 | 13 | 16 | 7 | 1770 | 354 | 4 | 4 | 2 | 45 | 68 | 76 | 77 | 36 | 52 | 11.0 |
3 | 36 | 68 | 70 | 82 | 5 | 165 | 14 | 0.0 | 0.0 | 0.0 | 264 | 54 | 70 | 60 | 80 | 45.010494 | 255 | 68 | 49.576713 | 46 | 64 | 77 | 176 | 59 | 62 | 64.604503 | 55 | 64.720408 | 239 | 63 | 65.173067 | 51 | 66 | 59 | 271 | 59 | 70.0 | 72.0 | 55.442927 | 70 | 59.937998 | 75 | 34 | 41 | 46.099431 | 68 | 5 | 21 | 64 | 21 | 21 | 1348 | 369 | 3 | 1 | 1 | 61 | 66 | 66 | 69 | 47 | 60 | 3.0 |
4 | 36 | 81 | 82 | 81 | 5 | 150 | 0 | 5500000.0 | 12000.0 | 7200000.0 | 367 | 75 | 69 | 54 | 90 | 79.000000 | 408 | 85 | 80.000000 | 70 | 83 | 90 | 346 | 61 | 56 | 79.000000 | 75 | 75.000000 | 297 | 67 | 40.000000 | 58 | 62 | 70 | 370 | 58 | 70.0 | 78.0 | 93.000000 | 71 | 89.000000 | 181 | 68 | 57 | 56.000000 | 45 | 6 | 13 | 6 | 13 | 7 | 2014 | 420 | 4 | 4 | 4 | 58 | 70 | 85 | 85 | 63 | 59 | 149.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22 | 64 | 66 | 69 | 6 | 165 | 5 | 550000.0 | 2000.0 | 0.0 | 197 | 35 | 23 | 66 | 47 | 26.000000 | 193 | 44 | 26.000000 | 33 | 34 | 56 | 243 | 52 | 55 | 46.000000 | 48 | 42.000000 | 279 | 48 | 70.000000 | 59 | 74 | 28 | 218 | 69 | 58.0 | 23.0 | 29.000000 | 39 | 46.000000 | 198 | 61 | 71 | 66.000000 | 39 | 11 | 8 | 8 | 5 | 7 | 1367 | 302 | 2 | 2 | 1 | 54 | 30 | 37 | 48 | 64 | 69 | 5.0 |
17121 | 18 | 56 | 58 | 66 | 5 | 170 | 10 | 130000.0 | 3000.0 | 273000.0 | 248 | 50 | 43 | 50 | 64 | 41.000000 | 264 | 57 | 42.000000 | 50 | 61 | 54 | 298 | 61 | 63 | 52.000000 | 54 | 68.000000 | 271 | 51 | 62.000000 | 53 | 57 | 48 | 259 | 51 | 50.0 | 55.0 | 53.000000 | 50 | 46.000000 | 132 | 34 | 42 | 56.000000 | 57 | 12 | 15 | 7 | 12 | 11 | 1529 | 319 | 3 | 2 | 1 | 62 | 46 | 57 | 56 | 43 | 55 | 5.0 |
17122 | 18 | 56 | 56 | 67 | 5 | 154 | 11 | 130000.0 | 2000.0 | 273000.0 | 218 | 55 | 34 | 53 | 42 | 34.000000 | 212 | 52 | 43.000000 | 34 | 36 | 47 | 308 | 66 | 61 | 54.000000 | 59 | 68.000000 | 233 | 28 | 61.000000 | 53 | 54 | 37 | 256 | 58 | 55.0 | 49.0 | 45.000000 | 49 | 43.000000 | 163 | 55 | 54 | 54.000000 | 40 | 8 | 7 | 5 | 12 | 8 | 1430 | 303 | 3 | 2 | 1 | 63 | 35 | 44 | 52 | 54 | 55 | 3.0 |
17123 | 20 | 62 | 62 | 73 | 5 | 148 | 11 | 475000.0 | 3000.0 | 784000.0 | 223 | 62 | 26 | 52 | 60 | 23.000000 | 258 | 55 | 43.000000 | 46 | 57 | 57 | 349 | 72 | 70 | 74.000000 | 56 | 77.000000 | 261 | 51 | 72.000000 | 57 | 48 | 33 | 257 | 61 | 58.0 | 46.0 | 56.000000 | 36 | 54.000000 | 180 | 57 | 62 | 61.000000 | 51 | 15 | 13 | 8 | 6 | 9 | 1579 | 335 | 4 | 2 | 1 | 71 | 34 | 58 | 59 | 59 | 54 | 5.0 |
17124 | 18 | 59 | 61 | 74 | 5 | 154 | 15 | 325000.0 | 2000.0 | 788000.0 | 254 | 32 | 64 | 59 | 45 | 54.000000 | 235 | 57 | 45.000000 | 48 | 30 | 55 | 317 | 68 | 72 | 59.000000 | 53 | 65.000000 | 288 | 58 | 62.000000 | 55 | 52 | 61 | 224 | 32 | 11.0 | 64.0 | 50.000000 | 67 | 57.000000 | 35 | 12 | 12 | 11.000000 | 48 | 5 | 10 | 9 | 15 | 9 | 1401 | 295 | 3 | 3 | 1 | 70 | 62 | 41 | 57 | 16 | 49 | 4.0 |
17115 rows × 63 columns
df_num.isnull().sum()
age 0 ova 0 bov 0 pot 0 height 0 .. pas 0 dri 0 def 0 phy 0 hits 0 Length: 63, dtype: int64
# Save "cleaner" data to csv
df_num.shape
(17115, 63)
df_cat.shape
(17115, 7)
df_concat = pd.concat([df_num, df_cat], axis=1)
df_concat.to_csv('df_concat.csv')
df_concat
age | ova | bov | pot | height | weight | growth | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | ir | pac | sho | pas | dri | def | phy | hits | nationality | club | bp | position | foot | a/w | d/w | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | 69 | 71 | 69 | 6 | 181 | 0 | 625000.0 | 7000.0 | 0.0 | 313 | 75 | 50 | 59 | 71 | 58.000000 | 338 | 73 | 65.000000 | 60 | 69 | 71 | 347 | 68 | 74 | 68.000000 | 69 | 68.000000 | 347 | 74 | 68.000000 | 69 | 68 | 68 | 320 | 72 | 69.0 | 63.0 | 66.000000 | 50 | 59.937998 | 208 | 70 | 69 | 69.000000 | 56 | 14 | 5 | 15 | 10 | 12 | 1929 | 408 | 3 | 2 | 2 | 71 | 59 | 70 | 71 | 68 | 69 | 4.0 | Italy | Udinese | LWB | LM | Left | Medium | High |
1 | 37 | 71 | 70 | 71 | 5 | 143 | 0 | 600000.0 | 7000.0 | 1100000.0 | 337 | 68 | 64 | 61 | 76 | 68.000000 | 369 | 69 | 79.000000 | 79 | 71 | 71 | 305 | 56 | 50 | 62.000000 | 65 | 72.000000 | 324 | 75 | 54.000000 | 64 | 60 | 71 | 362 | 71 | 71.0 | 72.0 | 73.000000 | 75 | 79.000000 | 153 | 70 | 43 | 40.000000 | 56 | 9 | 12 | 13 | 11 | 11 | 1906 | 385 | 4 | 3 | 1 | 53 | 69 | 73 | 69 | 58 | 63 | 4.0 | Spain | KAS Eupen | CM | CM CAM CDM | Right | Medium | Medium |
2 | 33 | 71 | 71 | 71 | 5 | 161 | 0 | 1100000.0 | 15000.0 | 0.0 | 337 | 80 | 64 | 41 | 77 | 75.000000 | 387 | 79 | 84.000000 | 77 | 69 | 78 | 295 | 48 | 42 | 71.000000 | 59 | 75.000000 | 284 | 72 | 58.000000 | 29 | 56 | 69 | 317 | 69 | 39.0 | 69.0 | 74.000000 | 66 | 59.937998 | 99 | 35 | 34 | 30.000000 | 51 | 9 | 6 | 13 | 16 | 7 | 1770 | 354 | 4 | 4 | 2 | 45 | 68 | 76 | 77 | 36 | 52 | 11.0 | England | Coventry City | CAM | CAM RM RW LM | Right | Medium | Low |
3 | 36 | 68 | 70 | 82 | 5 | 165 | 14 | 0.0 | 0.0 | 0.0 | 264 | 54 | 70 | 60 | 80 | 45.010494 | 255 | 68 | 49.576713 | 46 | 64 | 77 | 176 | 59 | 62 | 64.604503 | 55 | 64.720408 | 239 | 63 | 65.173067 | 51 | 66 | 59 | 271 | 59 | 70.0 | 72.0 | 55.442927 | 70 | 59.937998 | 75 | 34 | 41 | 46.099431 | 68 | 5 | 21 | 64 | 21 | 21 | 1348 | 369 | 3 | 1 | 1 | 61 | 66 | 66 | 69 | 47 | 60 | 3.0 | Trinidad & Tobago | Sunderland | ST | CB | Right | Medium | Medium |
4 | 36 | 81 | 82 | 81 | 5 | 150 | 0 | 5500000.0 | 12000.0 | 7200000.0 | 367 | 75 | 69 | 54 | 90 | 79.000000 | 408 | 85 | 80.000000 | 70 | 83 | 90 | 346 | 61 | 56 | 79.000000 | 75 | 75.000000 | 297 | 67 | 40.000000 | 58 | 62 | 70 | 370 | 58 | 70.0 | 78.0 | 93.000000 | 71 | 89.000000 | 181 | 68 | 57 | 56.000000 | 45 | 6 | 13 | 6 | 13 | 7 | 2014 | 420 | 4 | 4 | 4 | 58 | 70 | 85 | 85 | 63 | 59 | 149.0 | Spain | Vissel Kobe | CAM | CM CAM | Right | High | Medium |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22 | 64 | 66 | 69 | 6 | 165 | 5 | 550000.0 | 2000.0 | 0.0 | 197 | 35 | 23 | 66 | 47 | 26.000000 | 193 | 44 | 26.000000 | 33 | 34 | 56 | 243 | 52 | 55 | 46.000000 | 48 | 42.000000 | 279 | 48 | 70.000000 | 59 | 74 | 28 | 218 | 69 | 58.0 | 23.0 | 29.000000 | 39 | 46.000000 | 198 | 61 | 71 | 66.000000 | 39 | 11 | 8 | 8 | 5 | 7 | 1367 | 302 | 2 | 2 | 1 | 54 | 30 | 37 | 48 | 64 | 69 | 5.0 | France | CD Tondela | CB | CB | Right | Medium | Medium |
17121 | 18 | 56 | 58 | 66 | 5 | 170 | 10 | 130000.0 | 3000.0 | 273000.0 | 248 | 50 | 43 | 50 | 64 | 41.000000 | 264 | 57 | 42.000000 | 50 | 61 | 54 | 298 | 61 | 63 | 52.000000 | 54 | 68.000000 | 271 | 51 | 62.000000 | 53 | 57 | 48 | 259 | 51 | 50.0 | 55.0 | 53.000000 | 50 | 46.000000 | 132 | 34 | 42 | 56.000000 | 57 | 12 | 15 | 7 | 12 | 11 | 1529 | 319 | 3 | 2 | 1 | 62 | 46 | 57 | 56 | 43 | 55 | 5.0 | Wales | Burnley | CAM | CAM CM | Right | Medium | Medium |
17122 | 18 | 56 | 56 | 67 | 5 | 154 | 11 | 130000.0 | 2000.0 | 273000.0 | 218 | 55 | 34 | 53 | 42 | 34.000000 | 212 | 52 | 43.000000 | 34 | 36 | 47 | 308 | 66 | 61 | 54.000000 | 59 | 68.000000 | 233 | 28 | 61.000000 | 53 | 54 | 37 | 256 | 58 | 55.0 | 49.0 | 45.000000 | 49 | 43.000000 | 163 | 55 | 54 | 54.000000 | 40 | 8 | 7 | 5 | 12 | 8 | 1430 | 303 | 3 | 2 | 1 | 63 | 35 | 44 | 52 | 54 | 55 | 3.0 | England | Burnley | RB | RB | Right | Medium | Medium |
17123 | 20 | 62 | 62 | 73 | 5 | 148 | 11 | 475000.0 | 3000.0 | 784000.0 | 223 | 62 | 26 | 52 | 60 | 23.000000 | 258 | 55 | 43.000000 | 46 | 57 | 57 | 349 | 72 | 70 | 74.000000 | 56 | 77.000000 | 261 | 51 | 72.000000 | 57 | 48 | 33 | 257 | 61 | 58.0 | 46.0 | 56.000000 | 36 | 54.000000 | 180 | 57 | 62 | 61.000000 | 51 | 15 | 13 | 8 | 6 | 9 | 1579 | 335 | 4 | 2 | 1 | 71 | 34 | 58 | 59 | 59 | 54 | 5.0 | Bosnia Herzegovina | SK Sturm Graz | RB | RB LB | Right | High | Medium |
17124 | 18 | 59 | 61 | 74 | 5 | 154 | 15 | 325000.0 | 2000.0 | 788000.0 | 254 | 32 | 64 | 59 | 45 | 54.000000 | 235 | 57 | 45.000000 | 48 | 30 | 55 | 317 | 68 | 72 | 59.000000 | 53 | 65.000000 | 288 | 58 | 62.000000 | 55 | 52 | 61 | 224 | 32 | 11.0 | 64.0 | 50.000000 | 67 | 57.000000 | 35 | 12 | 12 | 11.000000 | 48 | 5 | 10 | 9 | 15 | 9 | 1401 | 295 | 3 | 3 | 1 | 70 | 62 | 41 | 57 | 16 | 49 | 4.0 | Spain | Lecce | ST | ST | Right | High | Medium |
17115 rows × 70 columns
df_concat.describe()
age | ova | bov | pot | height | weight | growth | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | ir | pac | sho | pas | dri | def | phy | hits | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 1.711500e+04 | 17115.000000 | 1.711500e+04 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 | 17115.000000 |
mean | 25.271224 | 66.960678 | 67.896290 | 72.487058 | 5.487233 | 165.654397 | 5.526380 | 2.566070e+06 | 10532.494888 | 4.571955e+06 | 258.549343 | 51.622027 | 47.958808 | 53.601461 | 60.509086 | 45.010494 | 266.617236 | 57.848379 | 49.576713 | 44.390768 | 54.324569 | 60.644814 | 322.700555 | 65.448554 | 65.445223 | 64.604503 | 62.920129 | 64.720408 | 302.413555 | 59.711656 | 65.173067 | 63.308443 | 65.304119 | 49.137131 | 261.924686 | 56.998247 | 47.083879 | 52.373217 | 55.442927 | 50.254981 | 59.937998 | 141.465031 | 47.243471 | 48.278352 | 46.099431 | 77.571312 | 15.593222 | 15.476132 | 15.465381 | 15.506339 | 15.730762 | 1631.241718 | 361.352323 | 2.986036 | 2.452469 | 1.152206 | 68.089454 | 54.963541 | 58.922933 | 64.206193 | 50.263921 | 64.906281 | 36.341221 |
std | 4.942151 | 6.862719 | 6.636054 | 5.768851 | 0.499852 | 15.815447 | 5.788402 | 5.372423e+06 | 21149.787090 | 1.033772e+07 | 72.229423 | 17.870984 | 19.387890 | 16.942617 | 13.951713 | 17.696243 | 76.783214 | 18.040415 | 18.153273 | 17.501659 | 14.852943 | 15.866080 | 54.902066 | 14.650507 | 14.430442 | 14.339520 | 9.036167 | 14.002595 | 49.364165 | 13.416099 | 11.890838 | 15.303155 | 12.626659 | 19.041017 | 62.885247 | 17.018331 | 20.838234 | 19.086164 | 13.701898 | 15.693373 | 11.598950 | 61.585705 | 20.179825 | 21.479753 | 21.146096 | 81.090508 | 16.819999 | 16.133871 | 15.991460 | 16.394128 | 17.121768 | 260.245811 | 40.244265 | 0.672402 | 0.781045 | 0.461343 | 11.146475 | 13.876627 | 10.186296 | 9.831689 | 16.850151 | 9.658489 | 166.925195 |
min | 16.000000 | 38.000000 | 42.000000 | 47.000000 | 5.000000 | 110.000000 | -1.000000 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 33.000000 | 6.000000 | 3.000000 | 5.000000 | 7.000000 | 4.000000 | 43.000000 | 5.000000 | 4.000000 | 5.000000 | 9.000000 | 5.000000 | 113.000000 | 12.000000 | 11.000000 | 14.000000 | 24.000000 | 17.000000 | 128.000000 | 12.000000 | 22.000000 | 11.000000 | 16.000000 | 4.000000 | 50.000000 | 9.000000 | 4.000000 | 2.000000 | 10.000000 | 7.000000 | 12.000000 | 17.000000 | 3.000000 | 5.000000 | 6.000000 | 5.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 731.000000 | 228.000000 | 1.000000 | 1.000000 | 1.000000 | 25.000000 | 16.000000 | 25.000000 | 28.000000 | 12.000000 | 27.000000 | 3.000000 |
25% | 21.000000 | 62.000000 | 64.000000 | 69.000000 | 5.000000 | 154.000000 | 0.000000 | 3.750000e+05 | 1000.000000 | 5.080000e+05 | 232.000000 | 41.000000 | 33.000000 | 46.000000 | 56.000000 | 32.000000 | 232.000000 | 53.000000 | 37.000000 | 32.000000 | 45.000000 | 57.000000 | 294.500000 | 58.000000 | 59.000000 | 57.000000 | 57.000000 | 57.000000 | 272.000000 | 50.000000 | 58.000000 | 56.000000 | 58.000000 | 35.000000 | 235.000000 | 45.000000 | 26.000000 | 43.000000 | 47.000000 | 40.000000 | 53.000000 | 84.000000 | 29.000000 | 28.000000 | 25.000000 | 48.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 8.000000 | 1492.000000 | 333.000000 | 3.000000 | 2.000000 | 1.000000 | 62.000000 | 46.000000 | 52.000000 | 59.000000 | 35.000000 | 59.000000 | 4.000000 |
50% | 25.000000 | 67.000000 | 68.000000 | 72.000000 | 5.000000 | 165.000000 | 4.000000 | 8.000000e+05 | 3000.000000 | 1.300000e+06 | 271.000000 | 56.000000 | 52.000000 | 57.000000 | 63.000000 | 46.000000 | 279.000000 | 63.000000 | 51.000000 | 43.000000 | 57.000000 | 64.000000 | 331.000000 | 68.000000 | 68.000000 | 67.000000 | 63.000000 | 67.000000 | 308.000000 | 61.000000 | 66.000000 | 66.000000 | 67.000000 | 53.000000 | 269.000000 | 60.000000 | 53.000000 | 57.000000 | 57.000000 | 51.000000 | 60.000000 | 158.000000 | 52.000000 | 55.000000 | 52.000000 | 53.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 11.000000 | 1659.000000 | 362.000000 | 3.000000 | 2.000000 | 1.000000 | 69.000000 | 58.000000 | 60.000000 | 65.000000 | 53.000000 | 66.000000 | 7.000000 |
75% | 29.000000 | 72.000000 | 72.000000 | 76.000000 | 6.000000 | 176.000000 | 9.000000 | 2.400000e+06 | 11000.000000 | 3.900000e+06 | 306.000000 | 65.000000 | 64.000000 | 65.000000 | 69.000000 | 59.000000 | 319.000000 | 70.000000 | 64.000000 | 58.000000 | 65.000000 | 70.000000 | 360.000000 | 75.000000 | 75.000000 | 75.000000 | 69.000000 | 75.000000 | 339.000000 | 70.000000 | 73.000000 | 73.000000 | 74.000000 | 64.000000 | 304.000000 | 70.000000 | 65.000000 | 66.000000 | 65.000000 | 62.000000 | 68.000000 | 194.000000 | 64.000000 | 66.000000 | 64.000000 | 59.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 14.000000 | 1812.000000 | 390.000000 | 3.000000 | 3.000000 | 1.000000 | 75.000000 | 65.000000 | 66.000000 | 71.000000 | 64.000000 | 72.000000 | 18.000000 |
max | 53.000000 | 93.000000 | 93.000000 | 95.000000 | 6.000000 | 243.000000 | 26.000000 | 1.055000e+08 | 560000.000000 | 2.031000e+08 | 437.000000 | 94.000000 | 95.000000 | 93.000000 | 94.000000 | 90.000000 | 470.000000 | 96.000000 | 94.000000 | 94.000000 | 93.000000 | 96.000000 | 464.000000 | 97.000000 | 96.000000 | 96.000000 | 96.000000 | 97.000000 | 444.000000 | 95.000000 | 95.000000 | 97.000000 | 97.000000 | 94.000000 | 421.000000 | 96.000000 | 95.000000 | 96.000000 | 95.000000 | 94.000000 | 96.000000 | 272.000000 | 94.000000 | 93.000000 | 95.000000 | 440.000000 | 90.000000 | 92.000000 | 93.000000 | 93.000000 | 90.000000 | 2316.000000 | 498.000000 | 5.000000 | 5.000000 | 5.000000 | 96.000000 | 93.000000 | 93.000000 | 95.000000 | 91.000000 | 93.000000 | 9200.000000 |
top_players = df.sort_values(['ova', 'base_stats'],ascending=False).head(10)
top_players.reset_index(drop=True, inplace=True)
top_players.index = top_players.index + 1
top_players.index.name = 'Rank'
top_players
name | age | ova | nationality | club | bov | bp | position | pot | height | weight | foot | growth | joined | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | a/w | d/w | ir | pac | sho | pas | dri | def | phy | hits | ls | st | rs | lw | lf | cf | rf | rw | lam | cam | ram | lm | lcm | cm | rcm | rm | lwb | ldm | cdm | rdm | rwb | lb | lcb | cb | rcb | rb | gk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Rank | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1 | L. Messi | 33 | 93 | Argentina | FC Barcelona | 93 | RW | RW ST CF | 93 | 5 | 159 | Left | 0 | Jul 1, 2004 | 67500000.0 | 560000.0 | 138400000.0 | 429 | 85 | 95 | 70 | 91 | 88.0 | 470 | 96 | 93.0 | 94 | 91 | 96 | 451 | 91 | 80 | 91.0 | 94 | 95.0 | 389 | 86 | 68.0 | 72 | 69 | 94 | 347 | 44 | 40.0 | 93.0 | 95.0 | 75 | 96.0 | 91 | 32 | 35 | 24.0 | 54 | 6 | 11 | 15 | 14 | 8 | 2231 | 466 | 4 | 4 | Medium | Low | 5 | 85 | 92 | 91 | 95 | 38 | 65 | 830.0 | 89+3 | 89+3 | 89+3 | 92+0 | 93+0 | 93+0 | 93+0 | 92+0 | 93+0 | 93+0 | 93+0 | 91+2 | 87+3 | 87+3 | 87+3 | 91+2 | 66+3 | 65+3 | 65+3 | 65+3 | 66+3 | 62+3 | 52+3 | 52+3 | 52+3 | 62+3 | 19+3 |
2 | Cristiano Ronaldo | 35 | 92 | Portugal | Juventus | 92 | ST | ST LW | 92 | 6 | 183 | Right | 0 | Jul 10, 2018 | 46000000.0 | 220000.0 | 75900000.0 | 437 | 84 | 95 | 90 | 82 | 86.0 | 414 | 88 | 81.0 | 76 | 77 | 92 | 431 | 87 | 91 | 87.0 | 95 | 71.0 | 444 | 94 | 95.0 | 84 | 78 | 93 | 353 | 63 | 29.0 | 95.0 | 82.0 | 84 | 95.0 | 84 | 28 | 32 | 24.0 | 58 | 7 | 11 | 15 | 14 | 11 | 2221 | 464 | 4 | 5 | High | Low | 5 | 89 | 93 | 81 | 89 | 35 | 77 | 750.0 | 91+1 | 91+1 | 91+1 | 89+0 | 91+0 | 91+0 | 91+0 | 89+0 | 88+3 | 88+3 | 88+3 | 88+3 | 81+3 | 81+3 | 81+3 | 88+3 | 65+3 | 61+3 | 61+3 | 61+3 | 65+3 | 61+3 | 54+3 | 54+3 | 54+3 | 61+3 | 20+3 |
3 | J. Oblak | 27 | 91 | Slovenia | Atlético Madrid | 91 | GK | GK | 93 | 6 | 192 | Right | 2 | Jul 16, 2014 | 75000000.0 | 125000.0 | 159400000.0 | 95 | 13 | 11 | 15 | 43 | 13.0 | 109 | 12 | 13.0 | 14 | 40 | 30 | 307 | 43 | 60 | 67.0 | 88 | 49.0 | 268 | 59 | 78.0 | 41 | 78 | 12 | 140 | 34 | 19.0 | 11.0 | 65.0 | 11 | 68.0 | 57 | 27 | 12 | 18.0 | 437 | 87 | 92 | 78 | 90 | 90 | 1413 | 489 | 3 | 1 | Medium | Medium | 3 | 87 | 92 | 78 | 90 | 52 | 90 | 220.0 | 33+3 | 33+3 | 33+3 | 32+0 | 35+0 | 35+0 | 35+0 | 32+0 | 38+3 | 38+3 | 38+3 | 35+3 | 38+3 | 38+3 | 38+3 | 35+3 | 32+3 | 36+3 | 36+3 | 36+3 | 32+3 | 32+3 | 33+3 | 33+3 | 33+3 | 32+3 | 89+3 |
4 | K. De Bruyne | 29 | 91 | Belgium | Manchester City | 91 | CAM | CAM CM | 91 | 5 | 154 | Right | 0 | Aug 30, 2015 | 87000000.0 | 370000.0 | 161000000.0 | 407 | 94 | 82 | 55 | 94 | 82.0 | 441 | 88 | 85.0 | 83 | 93 | 92 | 398 | 77 | 76 | 78.0 | 91 | 76.0 | 408 | 91 | 63.0 | 89 | 74 | 91 | 408 | 76 | 66.0 | 88.0 | 94.0 | 84 | 91.0 | 186 | 68 | 65 | 53.0 | 56 | 15 | 13 | 5 | 10 | 13 | 2304 | 485 | 5 | 4 | High | High | 4 | 76 | 86 | 93 | 88 | 64 | 78 | 406.0 | 83+3 | 83+3 | 83+3 | 88+0 | 88+0 | 88+0 | 88+0 | 88+0 | 89+2 | 89+2 | 89+2 | 89+2 | 89+2 | 89+2 | 89+2 | 89+2 | 79+3 | 80+3 | 80+3 | 80+3 | 79+3 | 75+3 | 69+3 | 69+3 | 69+3 | 75+3 | 21+3 |
5 | R. Lewandowski | 31 | 91 | Poland | FC Bayern München | 91 | ST | ST | 91 | 6 | 176 | Right | 0 | Jul 1, 2014 | 80000000.0 | 240000.0 | 132000000.0 | 423 | 71 | 94 | 85 | 84 | 89.0 | 407 | 85 | 79.0 | 85 | 70 | 88 | 407 | 77 | 78 | 77.0 | 93 | 82.0 | 420 | 89 | 84.0 | 76 | 86 | 85 | 391 | 81 | 49.0 | 94.0 | 79.0 | 88 | 88.0 | 96 | 35 | 42 | 19.0 | 51 | 15 | 6 | 12 | 8 | 10 | 2195 | 457 | 4 | 4 | High | Medium | 4 | 78 | 91 | 78 | 85 | 43 | 82 | 487.0 | 89+2 | 89+2 | 89+2 | 85+0 | 87+0 | 87+0 | 87+0 | 85+0 | 85+3 | 85+3 | 85+3 | 83+3 | 79+3 | 79+3 | 79+3 | 83+3 | 64+3 | 65+3 | 65+3 | 65+3 | 64+3 | 61+3 | 60+3 | 60+3 | 60+3 | 61+3 | 19+3 |
6 | Neymar Jr | 28 | 91 | Brazil | Paris Saint-Germain | 91 | LW | LW CAM | 91 | 5 | 150 | Right | 0 | Aug 3, 2017 | 90000000.0 | 270000.0 | 166500000.0 | 408 | 85 | 87 | 62 | 87 | 87.0 | 448 | 95 | 88.0 | 89 | 81 | 95 | 453 | 94 | 89 | 96.0 | 91 | 83.0 | 357 | 80 | 62.0 | 81 | 50 | 84 | 356 | 51 | 36.0 | 87.0 | 90.0 | 92 | 93.0 | 94 | 35 | 30 | 29.0 | 59 | 9 | 9 | 15 | 15 | 11 | 2175 | 451 | 5 | 5 | High | Medium | 5 | 91 | 85 | 86 | 94 | 36 | 59 | 546.0 | 84+3 | 84+3 | 84+3 | 90+0 | 89+0 | 89+0 | 89+0 | 90+0 | 90+1 | 90+1 | 90+1 | 90+1 | 83+3 | 83+3 | 83+3 | 90+1 | 67+3 | 62+3 | 62+3 | 62+3 | 67+3 | 62+3 | 49+3 | 49+3 | 49+3 | 62+3 | 20+3 |
7 | Alisson | 27 | 90 | Brazil | Liverpool | 90 | GK | GK | 91 | 6 | 201 | Right | 1 | Jul 19, 2018 | 62500000.0 | 160000.0 | 120300000.0 | 114 | 17 | 13 | 19 | 45 | 20.0 | 138 | 27 | 19.0 | 18 | 44 | 30 | 268 | 56 | 47 | 40.0 | 88 | 37.0 | 240 | 64 | 52.0 | 32 | 78 | 14 | 140 | 27 | 11.0 | 13.0 | 66.0 | 23 | 65.0 | 50 | 15 | 19 | 16.0 | 439 | 86 | 88 | 85 | 91 | 89 | 1389 | 490 | 3 | 1 | Medium | Medium | 3 | 86 | 88 | 85 | 89 | 51 | 91 | 228.0 | 36+3 | 36+3 | 36+3 | 35+0 | 38+0 | 38+0 | 38+0 | 35+0 | 40+3 | 40+3 | 40+3 | 38+3 | 40+3 | 40+3 | 40+3 | 38+3 | 31+3 | 35+3 | 35+3 | 35+3 | 31+3 | 30+3 | 30+3 | 30+3 | 30+3 | 30+3 | 88+3 |
8 | M. ter Stegen | 28 | 90 | Germany | FC Barcelona | 90 | GK | GK | 93 | 6 | 187 | Right | 3 | Jul 1, 2014 | 69500000.0 | 260000.0 | 147700000.0 | 118 | 18 | 14 | 11 | 61 | 14.0 | 144 | 21 | 18.0 | 12 | 63 | 30 | 254 | 38 | 50 | 37.0 | 86 | 43.0 | 268 | 66 | 79.0 | 35 | 78 | 10 | 171 | 43 | 22.0 | 11.0 | 70.0 | 25 | 70.0 | 48 | 25 | 13 | 10.0 | 439 | 88 | 85 | 88 | 88 | 90 | 1442 | 484 | 4 | 1 | Medium | Medium | 3 | 88 | 85 | 88 | 90 | 45 | 88 | 213.0 | 35+3 | 35+3 | 35+3 | 34+0 | 38+0 | 38+0 | 38+0 | 34+0 | 42+3 | 42+3 | 42+3 | 39+3 | 45+3 | 45+3 | 45+3 | 39+3 | 33+3 | 41+3 | 41+3 | 41+3 | 33+3 | 31+3 | 33+3 | 33+3 | 33+3 | 31+3 | 88+3 |
9 | M. Salah | 28 | 90 | Egypt | Liverpool | 90 | RW | RW | 90 | 5 | 157 | Left | 0 | Jul 1, 2017 | 78000000.0 | 250000.0 | 144300000.0 | 392 | 79 | 91 | 59 | 84 | 79.0 | 406 | 90 | 83.0 | 69 | 75 | 89 | 460 | 94 | 92 | 91.0 | 92 | 91.0 | 393 | 80 | 69.0 | 85 | 75 | 84 | 376 | 63 | 55.0 | 91.0 | 84.0 | 83 | 90.0 | 122 | 38 | 43 | 41.0 | 62 | 14 | 14 | 9 | 11 | 14 | 2211 | 470 | 3 | 4 | High | Medium | 3 | 93 | 86 | 81 | 90 | 45 | 75 | 382.0 | 85+3 | 85+3 | 85+3 | 88+0 | 88+0 | 88+0 | 88+0 | 88+0 | 88+2 | 88+2 | 88+2 | 87+3 | 82+3 | 82+3 | 82+3 | 87+3 | 71+3 | 68+3 | 68+3 | 68+3 | 71+3 | 67+3 | 58+3 | 58+3 | 58+3 | 67+3 | 22+3 |
10 | S. Mané | 28 | 90 | Senegal | Liverpool | 90 | LW | LW | 90 | 5 | 152 | Right | 0 | Jul 1, 2016 | 78000000.0 | 250000.0 | 144300000.0 | 410 | 76 | 90 | 84 | 85 | 75.0 | 391 | 91 | 76.0 | 64 | 71 | 89 | 460 | 95 | 93 | 93.0 | 93 | 86.0 | 406 | 84 | 86.0 | 88 | 70 | 78 | 358 | 75 | 35.0 | 92.0 | 85.0 | 71 | 84.0 | 122 | 42 | 42 | 38.0 | 56 | 10 | 10 | 15 | 7 | 14 | 2203 | 469 | 4 | 4 | High | Medium | 3 | 94 | 85 | 80 | 90 | 44 | 76 | 374.0 | 88+2 | 88+2 | 88+2 | 89+0 | 89+0 | 89+0 | 89+0 | 89+0 | 88+2 | 88+2 | 88+2 | 88+2 | 81+3 | 81+3 | 81+3 | 88+2 | 69+3 | 65+3 | 65+3 | 65+3 | 69+3 | 66+3 | 59+3 | 59+3 | 59+3 | 66+3 | 20+3 |
## Crear un gráfico de barras horizontal con los 10 mejores jugadores según OVA
sns.set(style="whitegrid")
plt.figure(figsize=(10,6))
sns.barplot(x="name", y="ova", data=top_players, palette="Blues_r")
# Agregar títulos y etiquetas de los ejes
plt.title("TOP 10 PLAYERS BY OVA", fontsize=18)
plt.xticks(rotation=45, ha='right', fontsize=12)
# Mostrar el gráfico
plt.show()
#La función "counter" toma una columna de posiciones (por ejemplo, 'ls' que se refiere a la calificación de disparo con la pierna izquierda) y la convierte en un número entero sumando los primeros dos
#y los últimos dígitos de cada valor en esa columna. Esto se hace para normalizar las calificaciones y convertirlas en un solo número para cada posición.
def counter(df):
front = df.str[:2].astype(int)
back = df.str[-1].astype(int)
return front+back
# Seleccionar columnas relevantes
positions = df[['name','ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']]
#se eliminan todas las filas con valores faltantes
positions = positions.dropna()
#Se crea un nuevo conjunto de datos ("Position") sin la columna 'name' y se aplica la función "counter" a cada columna para convertir las calificaciones en un solo número.
Position = positions.drop('name',axis=1)
for x in Position.columns:
Position[x] = counter(Position[x])
#En un bucle "for", para cada columna de la posición (por ejemplo, 'ls', 'st', etc.) se encuentra la calificación máxima y se obtiene la fila correspondiente utilizando el índice de esa calificación máxima en la columna.
#Luego, el nombre del jugador en esa fila se utiliza para imprimir un mensaje que muestra el mejor jugador en esa posición y su calificación máxima.
for x in Position.columns:
rating = Position[x].max()
current_position = list(Position[x])
player = list(positions['name'])[current_position.index(rating)]
best_player = f'Best {x}, is {player} with {rating} rating'
print(best_player)
Best ls, is Cristiano Ronaldo with 92 rating Best st, is Cristiano Ronaldo with 92 rating Best rs, is Cristiano Ronaldo with 92 rating Best lw, is L. Messi with 92 rating Best lf, is L. Messi with 93 rating Best cf, is L. Messi with 93 rating Best rf, is L. Messi with 93 rating Best rw, is L. Messi with 92 rating Best lam, is L. Messi with 93 rating Best cam, is L. Messi with 93 rating Best ram, is L. Messi with 93 rating Best lm, is L. Messi with 93 rating Best lcm, is K. De Bruyne with 91 rating Best cm, is K. De Bruyne with 91 rating Best rcm, is K. De Bruyne with 91 rating Best rm, is L. Messi with 93 rating Best lwb, is P. Lahm with 88 rating Best ldm, is Casemiro with 89 rating Best cdm, is Casemiro with 89 rating Best rdm, is Casemiro with 89 rating Best rwb, is P. Lahm with 88 rating Best lb, is P. Lahm with 88 rating Best lcb, is V. van Dijk with 91 rating Best cb, is V. van Dijk with 91 rating Best rcb, is V. van Dijk with 91 rating Best rb, is P. Lahm with 88 rating
young_players = df[df['age'] <= 25]
young_players= pd.DataFrame(young_players , columns=['name', 'age', 'ova'])
young_players = young_players.sort_values(by=['ova'], ascending=False)
young_players.reset_index(drop=True, inplace=True)
young_players=young_players.head(5)
young_players
name | age | ova | |
---|---|---|---|
0 | K. Mbappé | 21 | 90 |
1 | R. Sterling | 25 | 88 |
2 | J. Kimmich | 25 | 88 |
3 | T. Alexander-Arnold | 21 | 87 |
4 | Bruno Fernandes | 25 | 87 |
sns.set_style('whitegrid')
sns.barplot(data=young_players, x='name', y='ova', palette='Dark2')
plt.title('TOP 5 PLAYERS WITH HIGHEST POTENCIAL')
plt.xlabel('Players')
plt.ylabel('Overall Rating')
plt.xticks(rotation=45)
plt.figure(figsize=(8, 6))
plt.show()
<Figure size 800x600 with 0 Axes>
#Players with the highest release clauses and their Ratings.
release_clause = top_players.loc[:, ['name', 'release_clause', 'ova']]
release_clause = release_clause.dropna(subset=['release_clause'])
release_clause['release_clause'] = release_clause['release_clause']
release_clause = pd.DataFrame(release_clause.sort_values(by='release_clause', ascending=False)[:10])
release_clause = release_clause.set_index('name')
release_clause
release_clause | ova | |
---|---|---|
name | ||
Neymar Jr | 166500000.0 | 91 |
K. De Bruyne | 161000000.0 | 91 |
J. Oblak | 159400000.0 | 91 |
M. ter Stegen | 147700000.0 | 90 |
M. Salah | 144300000.0 | 90 |
S. Mané | 144300000.0 | 90 |
L. Messi | 138400000.0 | 93 |
R. Lewandowski | 132000000.0 | 91 |
Alisson | 120300000.0 | 90 |
Cristiano Ronaldo | 75900000.0 | 92 |
Top10_values= top_players[['name', 'value']]
Top10_values
name | value | |
---|---|---|
Rank | ||
1 | L. Messi | 67500000.0 |
2 | Cristiano Ronaldo | 46000000.0 |
3 | J. Oblak | 75000000.0 |
4 | K. De Bruyne | 87000000.0 |
5 | R. Lewandowski | 80000000.0 |
6 | Neymar Jr | 90000000.0 |
7 | Alisson | 62500000.0 |
8 | M. ter Stegen | 69500000.0 |
9 | M. Salah | 78000000.0 |
10 | S. Mané | 78000000.0 |
Top10_values_by_value= Top10_values.sort_values(by='value',ascending=False)
Top10_values_by_value
name | value | |
---|---|---|
Rank | ||
6 | Neymar Jr | 90000000.0 |
4 | K. De Bruyne | 87000000.0 |
5 | R. Lewandowski | 80000000.0 |
9 | M. Salah | 78000000.0 |
10 | S. Mané | 78000000.0 |
3 | J. Oblak | 75000000.0 |
8 | M. ter Stegen | 69500000.0 |
1 | L. Messi | 67500000.0 |
7 | Alisson | 62500000.0 |
2 | Cristiano Ronaldo | 46000000.0 |
#Graficamos algunso datos más del dataset
df_cat.columns
Index(['nationality', 'club', 'bp', 'position', 'foot', 'a/w', 'd/w'], dtype='object')
for c in df_cat.columns:
plt.figure(figsize=(30,15))
df[c].value_counts().plot(kind='bar')
plt.xticks(rotation = 90)
plt.title(c)
plt.show()
from ipywidgets import interact
@interact
def plot_bar(col = df_cat.columns):
plt.figure(figsize = (10,5))
plt.bar(df[col].unique(), df[col].value_counts())
plt.title(col)
plt.show()
interactive(children=(Dropdown(description='col', options=('nationality', 'club', 'bp', 'position', 'foot', 'a…
df_num.columns
Index(['age', 'ova', 'bov', 'pot', 'height', 'weight', 'growth', 'value', 'wage', 'release_clause', 'attacking', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'skill', 'dribbling', 'curve', 'fk_accuracy', 'long_passing', 'ball_control', 'movement', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'power', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'mentality', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'defending', 'marking', 'standing_tackle', 'sliding_tackle', 'goalkeeping', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes', 'total_stats', 'base_stats', 'w/f', 'sm', 'ir', 'pac', 'sho', 'pas', 'dri', 'def', 'phy', 'hits'], dtype='object')
for c in df_num.columns:
plt.figure(figsize=(10,4))
sns.histplot(df[c], kde= True)
plt.title(c)
plt.show()
We can see that many of the attributes are normally distributed, except for the wage, value, release_clause, hits,ir which are heavily skewed towards lower values. We'll also represent the data using boxplots, to get an image of the amount of outliers present:
for col in df_num.columns:
# Crea un nuevo subplot para cada gráfico
fig, ax = plt.subplots(figsize=(8, 6))
# Crea el gráfico de boxplot utilizando seaborn
sns.boxplot(x=df[col], ax=ax)
# Establece el título de cada gráfico
ax.set_title('Boxplot de {}'.format(col))
# Muestra cada gráfico
plt.show()
#Chequemaos correlacion
corr_matrix=df_num.corr()
corr_matrix
age | ova | bov | pot | height | weight | growth | value | wage | release_clause | attacking | crossing | finishing | heading_accuracy | short_passing | volleys | skill | dribbling | curve | fk_accuracy | long_passing | ball_control | movement | acceleration | sprint_speed | agility | reactions | balance | power | shot_power | jumping | stamina | strength | long_shots | mentality | aggression | interceptions | positioning | vision | penalties | composure | defending | marking | standing_tackle | sliding_tackle | goalkeeping | gk_diving | gk_handling | gk_kicking | gk_positioning | gk_reflexes | total_stats | base_stats | w/f | sm | ir | pac | sho | pas | dri | def | phy | hits | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
age | 1.000000 | 0.517223 | 0.456834 | -0.237895 | 0.084141 | 0.258366 | -0.850310 | 0.096086 | 0.200819 | 0.075504 | 0.184477 | 0.151364 | 0.105610 | 0.173262 | 0.177456 | 0.187459 | 0.169375 | 0.039957 | 0.180656 | 0.224292 | 0.223638 | 0.121653 | -0.055297 | -0.186298 | -0.180441 | -0.040919 | 0.501100 | -0.085273 | 0.311122 | 0.305535 | 0.197277 | 0.070264 | 0.350352 | 0.191225 | 0.273518 | 0.276475 | 0.193127 | 0.122845 | 0.235878 | 0.199260 | 0.408941 | 0.114977 | 0.155685 | 0.106370 | 0.085823 | 0.125354 | 0.120364 | 0.124586 | 0.130300 | 0.133735 | 0.121317 | 0.280882 | 0.414942 | 0.082551 | 0.071353 | 0.403032 | -0.177484 | 0.309302 | 0.396137 | 0.240992 | 0.223493 | 0.436392 | -0.082649 |
ova | 0.517223 | 1.000000 | 0.985867 | 0.591954 | 0.025873 | 0.152011 | -0.595643 | 0.631755 | 0.593202 | 0.597689 | 0.431323 | 0.386224 | 0.303978 | 0.305618 | 0.492470 | 0.361198 | 0.441757 | 0.350154 | 0.397389 | 0.369485 | 0.482662 | 0.431334 | 0.302111 | 0.143673 | 0.153216 | 0.225918 | 0.870037 | 0.100355 | 0.546445 | 0.534966 | 0.258798 | 0.334899 | 0.338343 | 0.390844 | 0.495116 | 0.382727 | 0.305015 | 0.339495 | 0.505164 | 0.317799 | 0.693862 | 0.248063 | 0.289858 | 0.237300 | 0.208776 | 0.038297 | 0.037970 | 0.037986 | 0.038716 | 0.044774 | 0.037369 | 0.607707 | 0.832557 | 0.201508 | 0.345226 | 0.474741 | 0.201023 | 0.487163 | 0.699167 | 0.643380 | 0.344806 | 0.543286 | 0.235200 |
bov | 0.456834 | 0.985867 | 1.000000 | 0.628036 | 0.018353 | 0.132776 | -0.542927 | 0.640904 | 0.592728 | 0.608461 | 0.464387 | 0.399890 | 0.336435 | 0.335755 | 0.533126 | 0.386622 | 0.473926 | 0.388041 | 0.417370 | 0.387457 | 0.515419 | 0.471489 | 0.334812 | 0.179258 | 0.189207 | 0.254519 | 0.870112 | 0.127582 | 0.572286 | 0.551037 | 0.263099 | 0.368599 | 0.335486 | 0.419627 | 0.523533 | 0.399927 | 0.319748 | 0.367871 | 0.527509 | 0.340417 | 0.709548 | 0.267606 | 0.309485 | 0.257425 | 0.227150 | -0.008622 | -0.008729 | -0.008311 | -0.006047 | -0.000807 | -0.008727 | 0.635047 | 0.830057 | 0.212681 | 0.373372 | 0.456196 | 0.212571 | 0.478665 | 0.689881 | 0.640672 | 0.349823 | 0.535546 | 0.250681 |
pot | -0.237895 | 0.591954 | 0.628036 | 1.000000 | -0.013345 | -0.039662 | 0.294802 | 0.563538 | 0.462483 | 0.549148 | 0.233610 | 0.210619 | 0.173054 | 0.136951 | 0.318243 | 0.176683 | 0.265268 | 0.264348 | 0.219986 | 0.166189 | 0.283335 | 0.292118 | 0.257898 | 0.197518 | 0.197032 | 0.188955 | 0.470805 | 0.131348 | 0.221107 | 0.266652 | 0.075809 | 0.169500 | 0.030772 | 0.191450 | 0.243294 | 0.139021 | 0.143494 | 0.191500 | 0.304210 | 0.147026 | 0.362103 | 0.138447 | 0.146998 | 0.138475 | 0.128692 | -0.006983 | -0.005541 | -0.004993 | -0.002455 | -0.003682 | -0.004937 | 0.328825 | 0.474514 | 0.137703 | 0.240578 | 0.293654 | 0.259089 | 0.244525 | 0.395609 | 0.444933 | 0.170302 | 0.159601 | 0.346830 |
height | 0.084141 | 0.025873 | 0.018353 | -0.013345 | 1.000000 | 0.632377 | -0.043975 | -0.002568 | 0.015622 | -0.001914 | -0.304313 | -0.435424 | -0.314788 | 0.074498 | -0.301504 | -0.290888 | -0.396163 | -0.422571 | -0.387024 | -0.351453 | -0.271910 | -0.351415 | -0.535194 | -0.471418 | -0.389322 | -0.544103 | 0.003558 | -0.648268 | -0.103150 | -0.144514 | 0.058541 | -0.236598 | 0.471317 | -0.324249 | -0.259826 | -0.002565 | -0.020992 | -0.376252 | -0.328146 | -0.264914 | -0.119236 | -0.032204 | -0.028126 | -0.030031 | -0.036255 | 0.288919 | 0.284657 | 0.285941 | 0.277593 | 0.284796 | 0.286870 | -0.314196 | -0.115465 | -0.138482 | -0.369253 | 0.041240 | -0.336416 | -0.096161 | -0.197713 | -0.287159 | 0.094074 | 0.381994 | -0.008828 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
pas | 0.396137 | 0.699167 | 0.689881 | 0.395609 | -0.197713 | -0.099482 | -0.434658 | 0.444515 | 0.419678 | 0.418112 | 0.373366 | 0.505355 | 0.329919 | -0.095020 | 0.466031 | 0.378515 | 0.517155 | 0.365967 | 0.525418 | 0.530682 | 0.575122 | 0.365153 | 0.307945 | 0.131469 | 0.081767 | 0.326037 | 0.625028 | 0.258612 | 0.360128 | 0.559982 | -0.022134 | 0.172448 | -0.055188 | 0.454703 | 0.409153 | 0.106873 | 0.115881 | 0.365342 | 0.722518 | 0.298322 | 0.529697 | 0.055048 | 0.076765 | 0.054589 | 0.034033 | 0.172291 | 0.167602 | 0.169040 | 0.181897 | 0.170666 | 0.165650 | 0.555060 | 0.850975 | 0.232971 | 0.367549 | 0.364659 | 0.247871 | 0.658522 | 1.000000 | 0.823140 | 0.142316 | 0.172772 | 0.159131 |
dri | 0.240992 | 0.643380 | 0.640672 | 0.444933 | -0.287159 | -0.207678 | -0.319361 | 0.442153 | 0.389981 | 0.416317 | 0.381955 | 0.447821 | 0.471792 | -0.149090 | 0.351753 | 0.457835 | 0.488648 | 0.495460 | 0.505516 | 0.440825 | 0.353345 | 0.409132 | 0.497752 | 0.349299 | 0.291321 | 0.516035 | 0.570681 | 0.398483 | 0.336232 | 0.557358 | -0.013006 | 0.153495 | -0.184856 | 0.489760 | 0.308786 | -0.053089 | -0.153777 | 0.470663 | 0.658930 | 0.354872 | 0.469896 | -0.206362 | -0.171497 | -0.207825 | -0.224053 | 0.161108 | 0.159477 | 0.157509 | 0.157102 | 0.160650 | 0.162615 | 0.494944 | 0.802689 | 0.261559 | 0.459192 | 0.289118 | 0.521425 | 0.765655 | 0.823140 | 1.000000 | -0.153468 | 0.024475 | 0.192896 |
def | 0.223493 | 0.344806 | 0.349823 | 0.170302 | 0.094074 | 0.104965 | -0.239075 | 0.172786 | 0.191587 | 0.167370 | 0.106597 | 0.219339 | -0.262519 | 0.386276 | 0.362391 | -0.152453 | 0.173530 | 0.053520 | 0.058725 | 0.083998 | 0.467352 | 0.183893 | 0.009001 | -0.051903 | -0.027629 | -0.065466 | 0.335480 | -0.024536 | 0.251774 | -0.051679 | 0.286337 | 0.396438 | 0.358428 | -0.043436 | 0.433623 | 0.646856 | 0.922766 | -0.077602 | 0.033557 | -0.121423 | 0.301769 | 0.929085 | 0.903514 | 0.918880 | 0.911869 | -0.203468 | -0.199423 | -0.200868 | -0.196966 | -0.197508 | -0.201805 | 0.391683 | 0.326425 | -0.066739 | -0.048083 | 0.145465 | -0.253501 | -0.416894 | 0.142316 | -0.153468 | 1.000000 | 0.513174 | 0.036156 |
phy | 0.436392 | 0.543286 | 0.535546 | 0.159601 | 0.381994 | 0.536951 | -0.485056 | 0.266445 | 0.261013 | 0.250743 | 0.104349 | -0.005811 | -0.066168 | 0.401144 | 0.165380 | -0.008472 | 0.029158 | -0.047824 | -0.026846 | -0.015198 | 0.190425 | 0.066788 | -0.114555 | -0.161604 | -0.083294 | -0.192765 | 0.494374 | -0.309688 | 0.515616 | 0.206863 | 0.450773 | 0.390083 | 0.846496 | 0.036931 | 0.312208 | 0.595739 | 0.439572 | -0.009948 | 0.046146 | -0.003776 | 0.353777 | 0.393358 | 0.408030 | 0.386811 | 0.364771 | 0.061681 | 0.058604 | 0.060486 | 0.056270 | 0.071367 | 0.059496 | 0.298947 | 0.487461 | -0.009838 | -0.088722 | 0.194618 | -0.126583 | 0.052057 | 0.172772 | 0.024475 | 0.513174 | 1.000000 | 0.067001 |
hits | -0.082649 | 0.235200 | 0.250681 | 0.346830 | -0.008828 | -0.020785 | 0.066806 | 0.399269 | 0.298920 | 0.409185 | 0.129491 | 0.110428 | 0.115907 | 0.062675 | 0.143922 | 0.114907 | 0.137937 | 0.143446 | 0.130948 | 0.091110 | 0.113754 | 0.145596 | 0.148820 | 0.116406 | 0.119475 | 0.116406 | 0.205684 | 0.080669 | 0.125609 | 0.130877 | 0.035938 | 0.100304 | 0.025609 | 0.111177 | 0.123708 | 0.064863 | 0.039662 | 0.120782 | 0.153298 | 0.089180 | 0.182412 | 0.035767 | 0.044287 | 0.034954 | 0.024989 | -0.031966 | -0.031803 | -0.031488 | -0.031399 | -0.033033 | -0.031759 | 0.160254 | 0.196433 | 0.070428 | 0.157448 | 0.128261 | 0.131126 | 0.120339 | 0.159131 | 0.192896 | 0.036156 | 0.067001 | 1.000000 |
63 rows × 63 columns
corr = df_num.corr()
ones = np.ones_like(corr, dtype="bool")
mask = np.triu(ones)
mask = mask[1:, :-1]
corr = corr.iloc[1:,:-1].copy()
fig, ax = plt.subplots(figsize=(40,30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6, cbar_kws={"shrink":.9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")
Text(0.0, 1.0, 'CORRELATIONS HEATMAP FOR NUMERICAL DATA')
fig, ax = plt.subplots(figsize=(40,30) )
heatmap = sns.heatmap(df_num.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
#Alta correlacion:.85 a 1
#Media correlacion:.60 a .85
#Baja correlacion: .40 a .60
#Vemos que nuestro target a predecir value no tiene ninguna correlacion alta
#Multicolinealidad:chequeamos la relacion entre la data numerica de las variables seleccionadas para la predcción
df_num = df[['age','ova','bov','pot','height','weight','release_clause', 'attacking', 'sm', 'movement', 'power', 'mentality', 'defending', 'value','wage']]
for i in df_num.corr().items():
for k in i:
if isinstance(k, pd.Series):
for x in k:
if (x >= .8) and (x!= 1):
print("High variance ", x , " in", i[0], "&", k[k == x].index[0])
High variance 0.9858673856271536 in ova & bov High variance 0.9858673856271536 in bov & ova High variance 0.9792952988065718 in release_clause & value High variance 0.8236885822739604 in attacking & power High variance 0.8877783366592406 in attacking & mentality High variance 0.8236885822739604 in power & attacking High variance 0.8228478768023183 in power & mentality High variance 0.8877783366592406 in mentality & attacking High variance 0.8228478768023183 in mentality & power High variance 0.9792952988065718 in value & release_clause High variance 0.822917796975212 in value & wage High variance 0.822917796975212 in wage & value
df_num = df[['age','ova','pot','height','weight','wage','release_clause','power','movement', 'sm', 'defending','value' ]]
for i in df_num.corr().items():
for k in i:
if isinstance(k, pd.Series):
for x in k:
if (x >= .8) and (x!= 1):
print("High variance ", x , " in", i[0], "&", k[k == x].index[0])
High variance 0.822917796975212 in wage & value High variance 0.9792952988065718 in release_clause & value High variance 0.822917796975212 in value & wage High variance 0.9792952988065718 in value & release_clause
fig, ax = plt.subplots(figsize=(10,5) )
heatmap = sns.heatmap(df_num.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
df_num
age | ova | pot | height | weight | wage | release_clause | power | movement | sm | defending | value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | 69 | 69 | 6 | 181 | 7000.0 | 0.0 | 347 | 347 | 2 | 208 | 625000.0 |
1 | 37 | 71 | 71 | 5 | 143 | 7000.0 | 1100000.0 | 324 | 305 | 3 | 153 | 600000.0 |
2 | 33 | 71 | 71 | 5 | 161 | 15000.0 | 0.0 | 284 | 295 | 4 | 99 | 1100000.0 |
3 | 36 | 68 | 82 | 5 | 165 | 0.0 | 0.0 | 239 | 176 | 1 | 75 | 0.0 |
4 | 36 | 81 | 81 | 5 | 150 | 12000.0 | 7200000.0 | 297 | 346 | 4 | 181 | 5500000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22 | 64 | 69 | 6 | 165 | 2000.0 | 0.0 | 279 | 243 | 2 | 198 | 550000.0 |
17121 | 18 | 56 | 66 | 5 | 170 | 3000.0 | 273000.0 | 271 | 298 | 2 | 132 | 130000.0 |
17122 | 18 | 56 | 67 | 5 | 154 | 2000.0 | 273000.0 | 233 | 308 | 2 | 163 | 130000.0 |
17123 | 20 | 62 | 73 | 5 | 148 | 3000.0 | 784000.0 | 261 | 349 | 2 | 180 | 475000.0 |
17124 | 18 | 59 | 74 | 5 | 154 | 2000.0 | 788000.0 | 288 | 317 | 3 | 35 | 325000.0 |
17115 rows × 12 columns
for i in df_num:
sns.distplot(df_num[i])
plt.show()
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Real Madrid. F.C. needs to create a model to predict the market value ofplayers to make wise budget decisions for next season.
df_cat.head(2)
nationality | club | bp | position | foot | a/w | d/w | |
---|---|---|---|---|---|---|---|
0 | Italy | Udinese | LWB | LM | Left | Medium | High |
1 | Spain | KAS Eupen | CM | CM CAM CDM | Right | Medium | Medium |
df_num.head(2)
age | ova | pot | height | weight | wage | release_clause | power | movement | sm | defending | value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | 69 | 69 | 6 | 181 | 7000.0 | 0.0 | 347 | 347 | 2 | 208 | 625000.0 |
1 | 37 | 71 | 71 | 5 | 143 | 7000.0 | 1100000.0 | 324 | 305 | 3 | 153 | 600000.0 |
#comprobar correlacion lineal
import scipy.stats as stats
stats.pearsonr(df_num['age'],df_num['value'])
PearsonRResult(statistic=0.09608639405363559, pvalue=2.1471715973255374e-36)
stats.pearsonr(df_num['ova'],df_num['value'])
PearsonRResult(statistic=0.6317548284541643, pvalue=0.0)
stats.pearsonr(df_num['pot'],df_num['value'])
PearsonRResult(statistic=0.5635382324319885, pvalue=0.0)
stats.pearsonr(df_num['height'],df_num['value'])
PearsonRResult(statistic=-0.002567655203660095, pvalue=0.736955523425748)
stats.pearsonr(df_num['weight'],df_num['value'])
PearsonRResult(statistic=0.04033459418983587, pvalue=1.3024002177396503e-07)
stats.pearsonr(df_num['wage'],df_num['value'])
PearsonRResult(statistic=0.822917796975202, pvalue=0.0)
stats.pearsonr(df_num['release_clause'],df_num['value'])
PearsonRResult(statistic=0.9792952988065707, pvalue=0.0)
stats.pearsonr(df_num['power'],df_num['value'])
PearsonRResult(statistic=0.32965512547351916, pvalue=0.0)
stats.pearsonr(df_num['movement'],df_num['value'])
PearsonRResult(statistic=0.2635545928114223, pvalue=6.940617688152622e-270)
stats.pearsonr(df_num['sm'],df_num['value'])
PearsonRResult(statistic=0.2778089215289008, pvalue=7.476338728701286e-301)
stats.pearsonr(df_num['defending'],df_num['value'])
PearsonRResult(statistic=0.1304075078448339, pvalue=8.502770319182378e-66)
#X/Y SPLIT
#X-y split
y=df_concat['value']
x_x=df_concat.drop('value',axis=1)
X_num= x_x.select_dtypes(include=np.number)
X_cat= x_x.select_dtypes(include=np.object)
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\1467385707.py:2: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations X_cat= x_x.select_dtypes(include=np.object)
#Normalizer
from sklearn.preprocessing import StandardScaler
transformer=StandardScaler().fit(X_num)
X_norm=transformer.transform(X_num)
X_norm
array([[ 1.56389432, 0.2971682 , 0.46771783, ..., 1.05260762, 0.4238591 , -0.19375245], [ 2.37328216, 0.58860639, 0.31702147, ..., 0.4591238 , -0.19737422, -0.19375245], [ 1.56389432, 0.58860639, 0.46771783, ..., -0.84654061, -1.33630197, -0.15181628], ..., [-1.47131009, -1.59718008, -1.79272756, ..., 0.22173027, -1.02568531, -0.19974333], [-1.06661617, -0.72286549, -0.88854941, ..., 0.51847218, -1.1292242 , -0.18776157], [-1.47131009, -1.16002279, -1.03924577, ..., -2.03350826, -1.64691863, -0.19375245]])
sns.distplot(X_norm)
C:\Users\danie\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:ylabel='Density'>
#Encodign categorical data
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(drop='first').fit(X_cat)
encoder=encoder.transform(X_cat).toarray()
encoder
array([[0., 0., 0., ..., 1., 0., 0.], [0., 0., 0., ..., 1., 0., 1.], [0., 0., 0., ..., 1., 1., 0.], ..., [0., 0., 0., ..., 1., 0., 1.], [0., 0., 0., ..., 0., 0., 1.], [0., 0., 0., ..., 0., 0., 1.]])
X_em=np.concatenate((X_norm, encoder),axis=1)
X_em
array([[ 1.56389432, 0.2971682 , 0.46771783, ..., 1. , 0. , 0. ], [ 2.37328216, 0.58860639, 0.31702147, ..., 1. , 0. , 1. ], [ 1.56389432, 0.58860639, 0.46771783, ..., 1. , 1. , 0. ], ..., [-1.47131009, -1.59718008, -1.79272756, ..., 1. , 0. , 1. ], [-1.06661617, -0.72286549, -0.88854941, ..., 0. , 0. , 1. ], [-1.47131009, -1.16002279, -1.03924577, ..., 0. , 0. , 1. ]])
#1. Running linear regression model
##Splitting into train set and test set
from sklearn.model_selection import train_test_split
X_em_train, X_em_test, y_train, y_test = train_test_split(X_em, y, test_size=0.3, random_state=42)
X_em_train.shape
(11980, 1783)
X_em_test.shape
(5135, 1783)
## Running linear regression model
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
lm=linear_model.LinearRegression()
model=lm.fit(X_em_train,y_train)
model
LinearRegression()
model.coef_
array([-2.77446479e+05, 5.79399389e+16, -2.21765574e+05, ..., 2.21120000e+04, 1.50082500e+04, -2.50503750e+04])
model.intercept_
2895000.7368530883
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
train_predictions=lm.predict(X_em_train)
r2_score(y_train, train_predictions)
0.97901212553766
mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)
print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:',rmse_new)
Mean Squared Error: 608462032800.9713 Mean absolute Error: 380202.62010170677 Root Mean Squared Error: 780039.7636024534
#solo con numerico
y=df_num['value']
X= df_num.drop(['value'], axis =1)
##Splitting into train set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape
(11980, 11)
X_test.shape
(5135, 11)
lm=linear_model.LinearRegression()
model=lm.fit(X_train,y_train)
model
LinearRegression()
model.coef_
array([-3.32194939e+04, 5.82848642e+04, -1.16569835e+04, 2.44829957e+04, 1.44961926e+03, 2.35886089e+01, 4.51790640e-01, -1.40559122e+02, 7.11839455e+02, 6.86199306e+04, -8.93609832e+02])
model.intercept_
-2578413.4014343964
train_predictions=lm.predict(X_train)
r2_score(y_train, train_predictions)
0.9701210029938898
mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)
print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:',rmse_new)
Mean Squared Error: 866225652769.8446 Mean absolute Error: 428295.5255877738 Root Mean Squared Error: 930712.4436526271
#Elimino outliers pasándolos a nulos y rellenándolos con la media
df_num
age | ova | pot | height | weight | wage | release_clause | power | movement | sm | defending | value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33 | 69 | 69 | 6 | 181 | 7000.0 | 0.0 | 347 | 347 | 2 | 208 | 625000.0 |
1 | 37 | 71 | 71 | 5 | 143 | 7000.0 | 1100000.0 | 324 | 305 | 3 | 153 | 600000.0 |
2 | 33 | 71 | 71 | 5 | 161 | 15000.0 | 0.0 | 284 | 295 | 4 | 99 | 1100000.0 |
3 | 36 | 68 | 82 | 5 | 165 | 0.0 | 0.0 | 239 | 176 | 1 | 75 | 0.0 |
4 | 36 | 81 | 81 | 5 | 150 | 12000.0 | 7200000.0 | 297 | 346 | 4 | 181 | 5500000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22 | 64 | 69 | 6 | 165 | 2000.0 | 0.0 | 279 | 243 | 2 | 198 | 550000.0 |
17121 | 18 | 56 | 66 | 5 | 170 | 3000.0 | 273000.0 | 271 | 298 | 2 | 132 | 130000.0 |
17122 | 18 | 56 | 67 | 5 | 154 | 2000.0 | 273000.0 | 233 | 308 | 2 | 163 | 130000.0 |
17123 | 20 | 62 | 73 | 5 | 148 | 3000.0 | 784000.0 | 261 | 349 | 2 | 180 | 475000.0 |
17124 | 18 | 59 | 74 | 5 | 154 | 2000.0 | 788000.0 | 288 | 317 | 3 | 35 | 325000.0 |
17115 rows × 12 columns
for x in ['age']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\4277607439.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['ova']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\4056601922.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['pot']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\4146485044.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['height']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\3984880386.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['weight']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\88507277.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['value']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
for x in ['wage']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
for x in ['release_clause']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
for x in ['power']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\3738272923.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['movement']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\944381429.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['sm']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\1202935039.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
for x in ['defending']:
q75,q25 = np.percentile(df_num.loc[:,x],[75,25])
intr_qr = q75-q25
max = q75+(1.5*intr_qr)
min = q25-(1.5*intr_qr)
df_num.loc[df_num[x] < min,x] = np.nan
df_num.loc[df_num[x] > max,x] = np.nan
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\1469301347.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num.loc[df_num[x] < min,x] = np.nan
df_num.isnull().sum()
age 12 ova 32 pot 298 height 0 weight 82 wage 1750 release_clause 2334 power 173 movement 568 sm 67 defending 0 value 2300 dtype: int64
df_num['age']= df_num['age'].fillna(np.mean(df_num['age']))
df_num['ova']= df_num['ova'].fillna(np.mean(df_num['ova']))
df_num['pot']= df_num['pot'].fillna(np.mean(df_num['pot']))
df_num['height']= df_num['height'].fillna(np.mean(df_num['height']))
df_num['weight']= df_num['weight'].fillna(np.mean(df_num['weight']))
df_num['value']= df_num['value'].fillna(np.mean(df_num['value']))
df_num['wage']= df_num['wage'].fillna(np.mean(df_num['wage']))
df_num['release_clause']= df_num['release_clause'].fillna(np.mean(df_num['release_clause']))
df_num['power']= df_num['power'].fillna(np.mean(df_num['power']))
df_num['movement']= df_num['movement'].fillna(np.mean(df_num['movement']))
df_num['sm']= df_num['sm'].fillna(np.mean(df_num['sm']))
df_num['defending']= df_num['defending'].fillna(np.mean(df_num['defending']))
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['age']= df_num['age'].fillna(np.mean(df_num['age'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['ova']= df_num['ova'].fillna(np.mean(df_num['ova'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['pot']= df_num['pot'].fillna(np.mean(df_num['pot'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['height']= df_num['height'].fillna(np.mean(df_num['height'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['weight']= df_num['weight'].fillna(np.mean(df_num['weight'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['value']= df_num['value'].fillna(np.mean(df_num['value'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['wage']= df_num['wage'].fillna(np.mean(df_num['wage'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['release_clause']= df_num['release_clause'].fillna(np.mean(df_num['release_clause'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['power']= df_num['power'].fillna(np.mean(df_num['power'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['movement']= df_num['movement'].fillna(np.mean(df_num['movement'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['sm']= df_num['sm'].fillna(np.mean(df_num['sm'])) C:\Users\danie\AppData\Local\Temp\ipykernel_19324\346574514.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_num['defending']= df_num['defending'].fillna(np.mean(df_num['defending']))
df_num.isnull().sum()
age 0 ova 0 pot 0 height 0 weight 0 wage 0 release_clause 0 power 0 movement 0 sm 0 defending 0 value 0 dtype: int64
#Split
X=df_num.drop(columns=['value'], axis=1)
y=df_num.value
#Normalizer
from sklearn.preprocessing import StandardScaler
transformer=StandardScaler().fit(X)
X_norm_x=transformer.transform(X)
X_norm_x
array([[ 1.57424308, 0.30520747, -0.6364717 , ..., 0.40659771, -0.57873807, 1.08039541], [ 2.38764085, 0.59968429, -0.2626271 , ..., -0.48454693, 0.72927288, 0.18730493], [ 1.57424308, 0.59968429, -0.2626271 , ..., -0.69672423, 2.03728383, -0.68954754], ..., [-1.47599854, -1.60889183, -1.01031629, ..., -0.42089374, -0.57873807, 0.34968502], [-1.06929966, -0.72546138, 0.11121749, ..., 0.44903317, -0.57873807, 0.62573116], [-1.47599854, -1.16717661, 0.29813978, ..., -0.22993418, 0.72927288, -1.72878009]])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape
(11980, 11)
X_test.shape
(5135, 11)
lm=linear_model.LinearRegression()
model=lm.fit(X_train,y_train)
model
LinearRegression()
train_predictions=lm.predict(X_train)
#R2+
train_score=lm.score(X_train, y_train) #R2
test_score=lm.score(X_test, y_test)
print (train_score, test_score)
0.6982558910881252 0.6544734864168147
mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)
print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:',rmse_new)
Mean Squared Error: 334912734051.11316 Mean absolute Error: 291899.0036340945 Root Mean Squared Error: 578716.4539315546
##Elimando dos columnas de numericas
df_num
age | ova | pot | height | weight | wage | release_clause | power | movement | sm | defending | value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 33.0 | 69.0 | 69.0 | 6.0 | 181.0 | 7000.0 | 0.0 | 347.0 | 347.000000 | 2.0 | 208.0 | 6.250000e+05 |
1 | 37.0 | 71.0 | 71.0 | 5.0 | 143.0 | 7000.0 | 1100000.0 | 324.0 | 305.000000 | 3.0 | 153.0 | 6.000000e+05 |
2 | 33.0 | 71.0 | 71.0 | 5.0 | 161.0 | 15000.0 | 0.0 | 284.0 | 295.000000 | 4.0 | 99.0 | 1.100000e+06 |
3 | 36.0 | 68.0 | 82.0 | 5.0 | 165.0 | 0.0 | 0.0 | 239.0 | 327.836889 | 1.0 | 75.0 | 0.000000e+00 |
4 | 36.0 | 81.0 | 81.0 | 5.0 | 150.0 | 12000.0 | 7200000.0 | 297.0 | 346.000000 | 4.0 | 181.0 | 1.076023e+06 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22.0 | 64.0 | 69.0 | 6.0 | 165.0 | 2000.0 | 0.0 | 279.0 | 243.000000 | 2.0 | 198.0 | 5.500000e+05 |
17121 | 18.0 | 56.0 | 66.0 | 5.0 | 170.0 | 3000.0 | 273000.0 | 271.0 | 298.000000 | 2.0 | 132.0 | 1.300000e+05 |
17122 | 18.0 | 56.0 | 67.0 | 5.0 | 154.0 | 2000.0 | 273000.0 | 233.0 | 308.000000 | 2.0 | 163.0 | 1.300000e+05 |
17123 | 20.0 | 62.0 | 73.0 | 5.0 | 148.0 | 3000.0 | 784000.0 | 261.0 | 349.000000 | 2.0 | 180.0 | 4.750000e+05 |
17124 | 18.0 | 59.0 | 74.0 | 5.0 | 154.0 | 2000.0 | 788000.0 | 288.0 | 317.000000 | 3.0 | 35.0 | 3.250000e+05 |
17115 rows × 12 columns
df_num = df_num.drop(['release_clause','height'], axis=1)
df_num
age | ova | pot | weight | wage | power | movement | sm | defending | value | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 33.0 | 69.0 | 69.0 | 181.0 | 7000.0 | 347.0 | 347.000000 | 2.0 | 208.0 | 6.250000e+05 |
1 | 37.0 | 71.0 | 71.0 | 143.0 | 7000.0 | 324.0 | 305.000000 | 3.0 | 153.0 | 6.000000e+05 |
2 | 33.0 | 71.0 | 71.0 | 161.0 | 15000.0 | 284.0 | 295.000000 | 4.0 | 99.0 | 1.100000e+06 |
3 | 36.0 | 68.0 | 82.0 | 165.0 | 0.0 | 239.0 | 327.836889 | 1.0 | 75.0 | 0.000000e+00 |
4 | 36.0 | 81.0 | 81.0 | 150.0 | 12000.0 | 297.0 | 346.000000 | 4.0 | 181.0 | 1.076023e+06 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17120 | 22.0 | 64.0 | 69.0 | 165.0 | 2000.0 | 279.0 | 243.000000 | 2.0 | 198.0 | 5.500000e+05 |
17121 | 18.0 | 56.0 | 66.0 | 170.0 | 3000.0 | 271.0 | 298.000000 | 2.0 | 132.0 | 1.300000e+05 |
17122 | 18.0 | 56.0 | 67.0 | 154.0 | 2000.0 | 233.0 | 308.000000 | 2.0 | 163.0 | 1.300000e+05 |
17123 | 20.0 | 62.0 | 73.0 | 148.0 | 3000.0 | 261.0 | 349.000000 | 2.0 | 180.0 | 4.750000e+05 |
17124 | 18.0 | 59.0 | 74.0 | 154.0 | 2000.0 | 288.0 | 317.000000 | 3.0 | 35.0 | 3.250000e+05 |
17115 rows × 10 columns
df_cat
nationality | club | bp | position | foot | a/w | d/w | |
---|---|---|---|---|---|---|---|
0 | Italy | Udinese | LWB | LM | Left | Medium | High |
1 | Spain | KAS Eupen | CM | CM CAM CDM | Right | Medium | Medium |
2 | England | Coventry City | CAM | CAM RM RW LM | Right | Medium | Low |
3 | Trinidad & Tobago | Sunderland | ST | CB | Right | Medium | Medium |
4 | Spain | Vissel Kobe | CAM | CM CAM | Right | High | Medium |
... | ... | ... | ... | ... | ... | ... | ... |
17120 | France | CD Tondela | CB | CB | Right | Medium | Medium |
17121 | Wales | Burnley | CAM | CAM CM | Right | Medium | Medium |
17122 | England | Burnley | RB | RB | Right | Medium | Medium |
17123 | Bosnia Herzegovina | SK Sturm Graz | RB | RB LB | Right | High | Medium |
17124 | Spain | Lecce | ST | ST | Right | High | Medium |
17115 rows × 7 columns
df_concat_model = pd.concat([df_num, df_cat], axis=1)
#X-y split
y=df_concat_model['value']
x_m=df_concat_model.drop('value',axis=1)
X_num_m= x_x.select_dtypes(include=np.number)
X_cat_c= x_x.select_dtypes(include=np.object)
C:\Users\danie\AppData\Local\Temp\ipykernel_19324\508701457.py:2: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations X_cat_c= x_x.select_dtypes(include=np.object)
from sklearn.preprocessing import Normalizer
transformer=Normalizer().fit(X_num_m)
x_norm_m=transformer.transform(X_num_m)
x_norm_m
array([[4.50341174e-03, 9.41622455e-03, 9.68915860e-03, ..., 9.27975753e-03, 9.41622455e-03, 5.45868090e-04], [3.36356189e-05, 6.45440255e-05, 6.36349547e-05, ..., 5.27261054e-05, 5.72714593e-05, 3.63628313e-06], [2.18077548e-03, 4.69197149e-03, 4.69197149e-03, ..., 2.37902780e-03, 3.43637349e-03, 7.26925161e-04], ..., [6.59311481e-05, 2.05119127e-04, 2.05119127e-04, ..., 1.97793444e-04, 2.01456286e-04, 1.09885247e-05], [2.55099518e-05, 7.90808504e-05, 7.90808504e-05, ..., 7.52543577e-05, 6.88768697e-05, 6.37748794e-06], [2.28425193e-05, 7.48727022e-05, 7.74107600e-05, ..., 2.03044616e-05, 6.21824137e-05, 5.07611541e-06]])
from sklearn import preprocessing
labels=preprocessing.OrdinalEncoder().fit(X_cat_c).transform(X_cat_c)
labels
array([[ 77., 837., 9., ..., 0., 2., 0.], [143., 466., 4., ..., 1., 2., 2.], [ 48., 228., 0., ..., 1., 2., 1.], ..., [ 48., 135., 10., ..., 1., 2., 2.], [ 19., 705., 10., ..., 1., 0., 2.], [143., 499., 14., ..., 1., 0., 2.]])
X=np.concatenate((x_norm_m,labels),axis=1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
lm=linear_model.LinearRegression()
model=lm.fit(X_train,y_train)
predictions=lm.predict(X_train)
r2_score(y_train,predictions)
0.07250571063361666
mse_new = mean_squared_error(y_train, predictions)
mae_new = mean_absolute_error(y_train, predictions)
rmse_new = sqrt(mse_new)
print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:',rmse_new)
Mean Squared Error: 1029447267052.3954 Mean absolute Error: 708737.3275927786 Root Mean Squared Error: 1014616.8079883141
R2:0.97901212553766
Mean Squared Error: 608462032800.9713
Mean absolute Error: 380202.62010170677
Root Mean Squared Error: 780039.7636024534
R2:0.9701210029938898
Mean Squared Error: 866225652769.8446
Mean absolute Error: 428295.5255877738
Root Mean Squared Error: 930712.4436526271
R2: 0.6982558910881252
Mean Squared Error: 334912734051.11316
Mean absolute Error: 291899.0036340945
Root Mean Squared Error: 578716.453931554
R2:0.07250571063361666
Mean Squared Error: 1029447267052.3954
Mean absolute Error: 708737.3275927786
Root Mean Squared Error: 1014616.8079883141