import pandas as pd
import matplotlib.pyplot as plt
# Leer el dataset
url='https://raw.githubusercontent.com/JJTorresDS/ds-data-sources/main/income_per_person_gdppercapita_ppp_inflation_adjusted.csv'
df= pd.read_csv(url,sep=',')
df
country | 1799 | 1800 | 1801 | 1802 | 1803 | 1804 | 1805 | 1806 | 1807 | ... | 2040 | 2041 | 2042 | 2043 | 2044 | 2045 | 2046 | 2047 | 2048 | 2049 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 674 | 674 | 674 | 674 | 674 | 674 | 674 | 674 | 674 | ... | 2880 | 2940 | 3000 | 3070 | 3130 | 3200 | 3270 | 3340 | 3410 | 3480 |
1 | Angola | 691 | 693 | 697 | 700 | 702 | 705 | 709 | 712 | 716 | ... | 8040 | 8220 | 8390 | 8570 | 8750 | 8940 | 9120 | 9320 | 9520 | 9720 |
2 | Albania | 746 | 746 | 746 | 746 | 746 | 747 | 747 | 747 | 747 | ... | 24.5k | 25k | 25.5k | 26.1k | 26.6k | 27.2k | 27.8k | 28.3k | 28.9k | 29.6k |
3 | Andorra | 1340 | 1340 | 1340 | 1350 | 1350 | 1350 | 1350 | 1360 | 1360 | ... | 108k | 111k | 113k | 116k | 118k | 121k | 123k | 126k | 128k | 131k |
4 | United Arab Emirates | 1120 | 1120 | 1120 | 1130 | 1130 | 1140 | 1140 | 1150 | 1150 | ... | 74.5k | 76.1k | 77.7k | 79.3k | 81k | 82.7k | 84.5k | 86.3k | 88.1k | 90k |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
190 | Samoa | 1570 | 1570 | 1570 | 1570 | 1570 | 1570 | 1570 | 1570 | 1570 | ... | 8230 | 8410 | 8590 | 8770 | 8960 | 9150 | 9340 | 9540 | 9740 | 9950 |
191 | Yemen | 981 | 983 | 986 | 988 | 992 | 994 | 997 | 1000 | 1000 | ... | 3980 | 4060 | 4150 | 4240 | 4330 | 4420 | 4510 | 4610 | 4710 | 4810 |
192 | South Africa | 1760 | 1740 | 1720 | 1700 | 1680 | 1590 | 1590 | 1730 | 1510 | ... | 14.5k | 14.8k | 15.2k | 15.5k | 15.8k | 16.1k | 16.5k | 16.8k | 17.2k | 17.6k |
193 | Zambia | 741 | 743 | 746 | 747 | 749 | 750 | 752 | 755 | 756 | ... | 3530 | 3610 | 3680 | 3760 | 3840 | 3920 | 4010 | 4090 | 4180 | 4270 |
194 | Zimbabwe | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 981 | ... | 3490 | 3570 | 3640 | 3720 | 3800 | 3880 | 3960 | 4050 | 4130 | 4220 |
195 rows × 252 columns
sum(df.isnull().sum())
0
print(df.shape)
df= df.drop_duplicates()
print(df.shape)
(195, 252) (195, 252)
# Parte 1
df_pais=df[df['country'] == 'Argentina'].iloc[:,217:224]
lista=df_pais.values[0]
lista
array(['23.2k', '23.6k', '22.8k', '22.1k', '19.7k', '20.6k', '20.9k'], dtype=object)
def promedio_pais(fila):
suma = cantidad = 0
for col in fila:
if str(col)[-1] == "k":
col = col[:-1]
suma += float(col)
cantidad += 1
return suma/cantidad
import numpy as np
df_pais=df_pais.apply(lambda x: x.replace("k", ""))
df_pais
2015 | 2016 | 2017 | 2018 | 2019 | 2020 | 2021 | promedio | |
---|---|---|---|---|---|---|---|---|
5 | 23.2k | 23.6k | 22.8k | 22.1k | 19.7k | 20.6k | 20.9k | NaN |
lista_nueva = []
for string in lista:
x = float(string.replace("k", ""))
lista_nueva.append(x)
import statistics
statistics.mean(lista_nueva)*1000
21842.85714285714
# Parte 2 (Una forma de hacerlo) uso de filtros
df_paises=df[df['country'].isin(['Argentina','Chile','Colombia','Bolivia','Peru','Brasil','Uruguay','Venezuela','Paraguay','Ecuador'])].iloc[:,217:224]
lista=df_paises.values
lista
array([['23.2k', '23.6k', '22.8k', '22.1k', '19.7k', '20.6k', '20.9k'], ['8200', '8420', '8660', '8720', '7940', '8250', '8470'], ['24.6k', '24.5k', '25.2k', '25.1k', '23.2k', '24.3k', '25k'], ['14.2k', '14.2k', '14.4k', '14.6k', '13.5k', '14k', '14.4k'], ['11.6k', '11.6k', '11.6k', '11.4k', '10.4k', '10.5k', '10.5k'], ['12.4k', '12.5k', '12.8k', '12.8k', '11.3k', '12.1k', '12.6k'], ['12.2k', '12.6k', '12.8k', '12.7k', '12.4k', '12.7k', '13k'], ['20.9k', '21.3k', '21.6k', '21.6k', '20.3k', '20.8k', '21.4k'], ['17.5k', '14.8k', '12.3k', '8320', '5800', '5290', '5150']], dtype=object)
lista_nueva = []
for string in lista:
for j in string:
if 'k' in j:
#print(j)
x = float(j.replace("k", ""))*1000
lista_nueva.append(x)
#print(x)
else:
x= float(j)
lista_nueva.append(x)
#print('---')
#print(x)
import statistics
statistics.mean(lista_nueva)
15116.190476190477
df.country.unique()
array(['Afghanistan', 'Angola', 'Albania', 'Andorra', 'United Arab Emirates', 'Argentina', 'Armenia', 'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh', 'Bulgaria', 'Bahrain', 'Bahamas', 'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bolivia', 'Brazil', 'Barbados', 'Brunei', 'Bhutan', 'Botswana', 'Central African Republic', 'Canada', 'Switzerland', 'Chile', 'China', "Cote d'Ivoire", 'Cameroon', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Colombia', 'Comoros', 'Cape Verde', 'Costa Rica', 'Cuba', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica', 'Denmark', 'Dominican Republic', 'Algeria', 'Ecuador', 'Egypt', 'Eritrea', 'Spain', 'Estonia', 'Ethiopia', 'Finland', 'Fiji', 'France', 'Micronesia, Fed. Sts.', 'Gabon', 'United Kingdom', 'Georgia', 'Ghana', 'Guinea', 'Gambia', 'Guinea-Bissau', 'Equatorial Guinea', 'Greece', 'Grenada', 'Guatemala', 'Guyana', 'Hong Kong, China', 'Honduras', 'Croatia', 'Haiti', 'Hungary', 'Indonesia', 'India', 'Ireland', 'Iran', 'Iraq', 'Iceland', 'Israel', 'Italy', 'Jamaica', 'Jordan', 'Japan', 'Kazakhstan', 'Kenya', 'Kyrgyz Republic', 'Cambodia', 'Kiribati', 'St. Kitts and Nevis', 'South Korea', 'Kuwait', 'Lao', 'Lebanon', 'Liberia', 'Libya', 'St. Lucia', 'Sri Lanka', 'Lesotho', 'Lithuania', 'Luxembourg', 'Latvia', 'Morocco', 'Monaco', 'Moldova', 'Madagascar', 'Maldives', 'Mexico', 'Marshall Islands', 'North Macedonia', 'Mali', 'Malta', 'Myanmar', 'Montenegro', 'Mongolia', 'Mozambique', 'Mauritania', 'Mauritius', 'Malawi', 'Malaysia', 'Namibia', 'Niger', 'Nigeria', 'Nicaragua', 'Netherlands', 'Norway', 'Nepal', 'Nauru', 'New Zealand', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Palau', 'Papua New Guinea', 'Poland', 'North Korea', 'Portugal', 'Paraguay', 'Palestine', 'Qatar', 'Romania', 'Russia', 'Rwanda', 'Saudi Arabia', 'Sudan', 'Senegal', 'Singapore', 'Solomon Islands', 'Sierra Leone', 'El Salvador', 'San Marino', 'Somalia', 'Serbia', 'South Sudan', 'Sao Tome and Principe', 'Suriname', 'Slovak Republic', 'Slovenia', 'Sweden', 'Eswatini', 'Seychelles', 'Syria', 'Chad', 'Togo', 'Thailand', 'Tajikistan', 'Turkmenistan', 'Timor-Leste', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Tuvalu', 'Taiwan', 'Tanzania', 'Uganda', 'Ukraine', 'Uruguay', 'United States', 'Uzbekistan', 'St. Vincent and the Grenadines', 'Venezuela', 'Vietnam', 'Vanuatu', 'Samoa', 'Yemen', 'South Africa', 'Zambia', 'Zimbabwe'], dtype=object)
df_paises=df[df['country'].isin(['United States','Canada','Mexico','Costa Rica','Nicaragua'])].iloc[:,217:224]
lista=df_paises.values
lista
array([['47.6k', '48.5k', '48.8k', '48.9k', '45.7k', '47.6k', '49.3k'], ['19.2k', '19.7k', '20.1k', '20.3k', '19.1k', '19.4k', '19.9k'], ['19.6k', '19.8k', '20k', '19.8k', '18k', '18.7k', '19.1k'], ['5810', '6000', '5700', '5410', '5270', '5240', '5330'], ['59k', '60.1k', '61.5k', '62.5k', '60k', '63.5k', '65.4k']], dtype=object)
lista_nueva = []
for string in lista:
for j in string:
if 'k' in j:
#print(j)
x = float(j.replace("k", ""))*1000
lista_nueva.append(x)
#print(x)
else:
x= float(j)
lista_nueva.append(x)
#print('---')
#print(x)
import statistics
statistics.mean(lista_nueva)
30853.14285714286
from google.colab import drive
import os
drive.mount('/content/gdrive')
# Establecer ruta de acceso en drive
import os
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")
Mounted at /content/gdrive /content
import pandas as pd
import sqlite3
sql= "SELECT name FROM sqlite_master WHERE type='table'"
# Leer el archivo sqllite y convertirlo a dataframe
con =sqlite3.connect("nba_salary.sqlite")
df=pd.read_sql_query(sql,con)
# Verificar el resultado en un dataframe
print(df)
con.close()
name 0 NBA_season1718_salary 1 Seasons_Stats
con =sqlite3.connect("nba_salary.sqlite")
df=pd.read_sql_query('SELECT * FROM NBA_season1718_salary',con)
df.head()
X1 | Player | Tm | season17_18 | |
---|---|---|---|---|
0 | 1.0 | Stephen Curry | GSW | 34682550.0 |
1 | 2.0 | LeBron James | CLE | 33285709.0 |
2 | 3.0 | Paul Millsap | DEN | 31269231.0 |
3 | 4.0 | Gordon Hayward | BOS | 29727900.0 |
4 | 5.0 | Blake Griffin | DET | 29512900.0 |
import pandas as pd
con =sqlite3.connect("nba_salary.sqlite")
df1=pd.read_sql_query('SELECT * From Seasons_Stats',con)
df1
X1 | Year | Player | Pos | Age | Tm | G | GS | MP | PER | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1950.0 | Curly Armstrong | G-F | 31.0 | FTW | 63.0 | NaN | NaN | NaN | ... | 0.705 | NaN | NaN | NaN | 176.0 | NaN | NaN | NaN | 217.0 | 458.0 |
1 | 1.0 | 1950.0 | Cliff Barker | SG | 29.0 | INO | 49.0 | NaN | NaN | NaN | ... | 0.708 | NaN | NaN | NaN | 109.0 | NaN | NaN | NaN | 99.0 | 279.0 |
2 | 2.0 | 1950.0 | Leo Barnhorst | SF | 25.0 | CHS | 67.0 | NaN | NaN | NaN | ... | 0.698 | NaN | NaN | NaN | 140.0 | NaN | NaN | NaN | 192.0 | 438.0 |
3 | 3.0 | 1950.0 | Ed Bartels | F | 24.0 | TOT | 15.0 | NaN | NaN | NaN | ... | 0.559 | NaN | NaN | NaN | 20.0 | NaN | NaN | NaN | 29.0 | 63.0 |
4 | 4.0 | 1950.0 | Ed Bartels | F | 24.0 | DNN | 13.0 | NaN | NaN | NaN | ... | 0.548 | NaN | NaN | NaN | 20.0 | NaN | NaN | NaN | 27.0 | 59.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
24686 | 24686.0 | 2017.0 | Cody Zeller | PF | 24.0 | CHO | 62.0 | NaN | 1725.0 | 16.7 | ... | 0.679 | NaN | NaN | 405.0 | 99.0 | NaN | NaN | NaN | 189.0 | 639.0 |
24687 | 24687.0 | 2017.0 | Tyler Zeller | C | 27.0 | BOS | 51.0 | NaN | 525.0 | 13.0 | ... | 0.564 | NaN | NaN | 124.0 | 42.0 | NaN | NaN | NaN | 61.0 | 178.0 |
24688 | 24688.0 | 2017.0 | Stephen Zimmerman | C | 20.0 | ORL | 19.0 | 0.0 | 108.0 | 7.3 | ... | 0.600 | NaN | NaN | 35.0 | 4.0 | NaN | NaN | NaN | 17.0 | 23.0 |
24689 | 24689.0 | 2017.0 | Paul Zipser | SF | 22.0 | CHI | 44.0 | NaN | 843.0 | 6.9 | ... | 0.775 | NaN | NaN | 125.0 | 36.0 | NaN | NaN | NaN | 78.0 | 240.0 |
24690 | 24690.0 | 2017.0 | Ivica Zubac | C | 19.0 | LAL | 38.0 | NaN | 609.0 | 17.0 | ... | 0.653 | NaN | NaN | 159.0 | 30.0 | NaN | NaN | NaN | 66.0 | 284.0 |
24691 rows × 53 columns
df.isnull().values.any()
False
df.isnull().sum()
X1 0 Player 0 Tm 0 season17_18 0 dtype: int64
df1.isnull().values.any()
True
df1.isnull().sum()
X1 0 Year 67 Player 67 Pos 67 Age 75 Tm 67 G 67 GS 18444 MP 553 PER 590 TS% 153 3PAr 21239 FTr 166 ORB% 23714 DRB% 24320 TRB% 24436 AST% 23926 STL% 22672 BLK% 21652 TOV% 24241 USG% 24633 blanl 24691 OWS 106 DWS 106 WS 106 WS/48 590 blank2 24691 OBPM 24191 DBPM 23975 BPM 24279 VORP 21604 FG 67 FGA 67 FG% 166 3P 16021 3PA 19516 3P% 20893 2P 67 2PA 67 2P% 195 eFG% 166 FT 67 FTA 67 FT% 925 ORB 23158 DRB 23956 TRB 379 AST 67 STL 22794 BLK 21000 TOV 23761 PF 67 PTS 67 dtype: int64
print(df1.shape)
df1_final=df1.dropna(axis=0)# Filas con columas con Nas seran removidas
print(df1_final.shape)
df1_final.head(10)
# ! Esto no funciona!
(24691, 53) (0, 53)
X1 | Year | Player | Pos | Age | Tm | G | GS | MP | PER | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS |
---|
0 rows × 53 columns
print(df1.shape)
df1_final=df1.dropna(axis=1)# Filas con columas con Nas seran removidas
print(df1_final.shape)
df1_final.head(10)
# ! Esto NO funciona!
(24691, 53) (24691, 1)
X1 | |
---|---|
0 | 0.0 |
1 | 1.0 |
2 | 2.0 |
3 | 3.0 |
4 | 4.0 |
5 | 5.0 |
6 | 6.0 |
7 | 7.0 |
8 | 8.0 |
9 | 9.0 |
Que Columnas removeriamos
percent_missing = df1.isnull().sum() * 100 / len(df1)
missing_value_df = pd.DataFrame({'Columnas': df1.columns,
'Porcentaje_Missing': percent_missing})
missing_value_df
Columnas | Porcentaje_Missing | |
---|---|---|
X1 | X1 | 0.000000 |
Year | Year | 0.271354 |
Player | Player | 0.271354 |
Pos | Pos | 0.271354 |
Age | Age | 0.303754 |
Tm | Tm | 0.271354 |
G | G | 0.271354 |
GS | GS | 74.699283 |
MP | MP | 2.239682 |
PER | PER | 2.389535 |
TS% | TS% | 0.619659 |
3PAr | 3PAr | 86.019197 |
FTr | FTr | 0.672310 |
ORB% | ORB% | 96.043093 |
DRB% | DRB% | 98.497428 |
TRB% | TRB% | 98.967235 |
AST% | AST% | 96.901705 |
STL% | STL% | 91.822931 |
BLK% | BLK% | 87.691872 |
TOV% | TOV% | 98.177474 |
USG% | USG% | 99.765097 |
blanl | blanl | 100.000000 |
OWS | OWS | 0.429306 |
DWS | DWS | 0.429306 |
WS | WS | 0.429306 |
WS/48 | WS/48 | 2.389535 |
blank2 | blank2 | 100.000000 |
OBPM | OBPM | 97.974971 |
DBPM | DBPM | 97.100158 |
BPM | BPM | 98.331376 |
VORP | VORP | 87.497469 |
FG | FG | 0.271354 |
FGA | FGA | 0.271354 |
FG% | FG% | 0.672310 |
3P | 3P | 64.885991 |
3PA | 3PA | 79.040946 |
3P% | 3P% | 84.617877 |
2P | 2P | 0.271354 |
2PA | 2PA | 0.271354 |
2P% | 2P% | 0.789761 |
eFG% | eFG% | 0.672310 |
FT | FT | 0.271354 |
FTA | FTA | 0.271354 |
FT% | FT% | 3.746304 |
ORB | ORB | 93.791260 |
DRB | DRB | 97.023207 |
TRB | TRB | 1.534972 |
AST | AST | 0.271354 |
STL | STL | 92.317039 |
BLK | BLK | 85.051233 |
TOV | TOV | 96.233445 |
PF | PF | 0.271354 |
PTS | PTS | 0.271354 |
missing_value_df_f=missing_value_df[missing_value_df['Porcentaje_Missing']<=50]
missing_value_df_f
Columnas | Porcentaje_Missing | |
---|---|---|
X1 | X1 | 0.000000 |
Year | Year | 0.271354 |
Player | Player | 0.271354 |
Pos | Pos | 0.271354 |
Age | Age | 0.303754 |
Tm | Tm | 0.271354 |
G | G | 0.271354 |
MP | MP | 2.239682 |
PER | PER | 2.389535 |
TS% | TS% | 0.619659 |
FTr | FTr | 0.672310 |
OWS | OWS | 0.429306 |
DWS | DWS | 0.429306 |
WS | WS | 0.429306 |
WS/48 | WS/48 | 2.389535 |
FG | FG | 0.271354 |
FGA | FGA | 0.271354 |
FG% | FG% | 0.672310 |
2P | 2P | 0.271354 |
2PA | 2PA | 0.271354 |
2P% | 2P% | 0.789761 |
eFG% | eFG% | 0.672310 |
FT | FT | 0.271354 |
FTA | FTA | 0.271354 |
FT% | FT% | 3.746304 |
TRB | TRB | 1.534972 |
AST | AST | 0.271354 |
PF | PF | 0.271354 |
PTS | PTS | 0.271354 |
lista_variables=list(missing_value_df_f.Columnas)
print(type(lista_variables))
print(len(lista_variables))
lista_variables
<class 'list'> 29
['X1', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', 'FTr', 'OWS', 'DWS', 'WS', 'WS/48', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'TRB', 'AST', 'PF', 'PTS']
df1_x=df1.loc[:, df1.columns.isin(lista_variables)]
df1_x
X1 | Year | Player | Pos | Age | Tm | G | MP | PER | TS% | ... | 2PA | 2P% | eFG% | FT | FTA | FT% | TRB | AST | PF | PTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1950.0 | Curly Armstrong | G-F | 31.0 | FTW | 63.0 | NaN | NaN | 0.368 | ... | 516.0 | 0.279 | 0.279 | 170.0 | 241.0 | 0.705 | NaN | 176.0 | 217.0 | 458.0 |
1 | 1.0 | 1950.0 | Cliff Barker | SG | 29.0 | INO | 49.0 | NaN | NaN | 0.435 | ... | 274.0 | 0.372 | 0.372 | 75.0 | 106.0 | 0.708 | NaN | 109.0 | 99.0 | 279.0 |
2 | 2.0 | 1950.0 | Leo Barnhorst | SF | 25.0 | CHS | 67.0 | NaN | NaN | 0.394 | ... | 499.0 | 0.349 | 0.349 | 90.0 | 129.0 | 0.698 | NaN | 140.0 | 192.0 | 438.0 |
3 | 3.0 | 1950.0 | Ed Bartels | F | 24.0 | TOT | 15.0 | NaN | NaN | 0.312 | ... | 86.0 | 0.256 | 0.256 | 19.0 | 34.0 | 0.559 | NaN | 20.0 | 29.0 | 63.0 |
4 | 4.0 | 1950.0 | Ed Bartels | F | 24.0 | DNN | 13.0 | NaN | NaN | 0.308 | ... | 82.0 | 0.256 | 0.256 | 17.0 | 31.0 | 0.548 | NaN | 20.0 | 27.0 | 59.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
24686 | 24686.0 | 2017.0 | Cody Zeller | PF | 24.0 | CHO | 62.0 | 1725.0 | 16.7 | 0.604 | ... | 442.0 | 0.572 | 0.571 | 133.0 | 196.0 | 0.679 | 405.0 | 99.0 | 189.0 | 639.0 |
24687 | 24687.0 | 2017.0 | Tyler Zeller | C | 27.0 | BOS | 51.0 | 525.0 | 13.0 | 0.508 | ... | 157.0 | 0.497 | 0.494 | 22.0 | 39.0 | 0.564 | 124.0 | 42.0 | 61.0 | 178.0 |
24688 | 24688.0 | 2017.0 | Stephen Zimmerman | C | 20.0 | ORL | 19.0 | 108.0 | 7.3 | 0.346 | ... | 31.0 | 0.323 | 0.323 | 3.0 | 5.0 | 0.600 | 35.0 | 4.0 | 17.0 | 23.0 |
24689 | 24689.0 | 2017.0 | Paul Zipser | SF | 22.0 | CHI | 44.0 | 843.0 | 6.9 | 0.503 | ... | 122.0 | 0.451 | 0.473 | 31.0 | 40.0 | 0.775 | 125.0 | 36.0 | 78.0 | 240.0 |
24690 | 24690.0 | 2017.0 | Ivica Zubac | C | 19.0 | LAL | 38.0 | 609.0 | 17.0 | 0.547 | ... | 235.0 | 0.536 | 0.529 | 32.0 | 49.0 | 0.653 | 159.0 | 30.0 | 66.0 | 284.0 |
24691 rows × 29 columns
df1_y= df1_x.interpolate()
percent_missing_y = df1_y.isnull().sum() * 100 / len(df1_y)
percent_missing_y
X1 0.000000 Year 0.000000 Player 0.271354 Pos 0.271354 Age 0.000000 Tm 0.271354 G 0.000000 MP 1.976429 PER 1.976429 TS% 0.000000 FTr 0.000000 OWS 0.000000 DWS 0.000000 WS 0.000000 WS/48 1.976429 FG 0.000000 FGA 0.000000 FG% 0.000000 2P 0.000000 2PA 0.000000 2P% 0.000000 eFG% 0.000000 FT 0.000000 FTA 0.000000 FT% 0.000000 TRB 1.267668 AST 0.000000 PF 0.000000 PTS 0.000000 dtype: float64
df1_y
X1 | Year | Player | Pos | Age | Tm | G | MP | PER | TS% | ... | 2PA | 2P% | eFG% | FT | FTA | FT% | TRB | AST | PF | PTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 1950.0 | Curly Armstrong | G-F | 31.0 | FTW | 63.0 | NaN | NaN | 0.368 | ... | 516.0 | 0.279 | 0.279 | 170.0 | 241.0 | 0.705 | NaN | 176.0 | 217.0 | 458.0 |
1 | 1.0 | 1950.0 | Cliff Barker | SG | 29.0 | INO | 49.0 | NaN | NaN | 0.435 | ... | 274.0 | 0.372 | 0.372 | 75.0 | 106.0 | 0.708 | NaN | 109.0 | 99.0 | 279.0 |
2 | 2.0 | 1950.0 | Leo Barnhorst | SF | 25.0 | CHS | 67.0 | NaN | NaN | 0.394 | ... | 499.0 | 0.349 | 0.349 | 90.0 | 129.0 | 0.698 | NaN | 140.0 | 192.0 | 438.0 |
3 | 3.0 | 1950.0 | Ed Bartels | F | 24.0 | TOT | 15.0 | NaN | NaN | 0.312 | ... | 86.0 | 0.256 | 0.256 | 19.0 | 34.0 | 0.559 | NaN | 20.0 | 29.0 | 63.0 |
4 | 4.0 | 1950.0 | Ed Bartels | F | 24.0 | DNN | 13.0 | NaN | NaN | 0.308 | ... | 82.0 | 0.256 | 0.256 | 17.0 | 31.0 | 0.548 | NaN | 20.0 | 27.0 | 59.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
24686 | 24686.0 | 2017.0 | Cody Zeller | PF | 24.0 | CHO | 62.0 | 1725.0 | 16.7 | 0.604 | ... | 442.0 | 0.572 | 0.571 | 133.0 | 196.0 | 0.679 | 405.0 | 99.0 | 189.0 | 639.0 |
24687 | 24687.0 | 2017.0 | Tyler Zeller | C | 27.0 | BOS | 51.0 | 525.0 | 13.0 | 0.508 | ... | 157.0 | 0.497 | 0.494 | 22.0 | 39.0 | 0.564 | 124.0 | 42.0 | 61.0 | 178.0 |
24688 | 24688.0 | 2017.0 | Stephen Zimmerman | C | 20.0 | ORL | 19.0 | 108.0 | 7.3 | 0.346 | ... | 31.0 | 0.323 | 0.323 | 3.0 | 5.0 | 0.600 | 35.0 | 4.0 | 17.0 | 23.0 |
24689 | 24689.0 | 2017.0 | Paul Zipser | SF | 22.0 | CHI | 44.0 | 843.0 | 6.9 | 0.503 | ... | 122.0 | 0.451 | 0.473 | 31.0 | 40.0 | 0.775 | 125.0 | 36.0 | 78.0 | 240.0 |
24690 | 24690.0 | 2017.0 | Ivica Zubac | C | 19.0 | LAL | 38.0 | 609.0 | 17.0 | 0.547 | ... | 235.0 | 0.536 | 0.529 | 32.0 | 49.0 | 0.653 | 159.0 | 30.0 | 66.0 | 284.0 |
24691 rows × 29 columns
con =sqlite3.connect("nba_salary.sqlite")
df=pd.read_sql_query('SELECT * FROM NBA_season1718_salary',con)
df.head()
X1 | Player | Tm | season17_18 | |
---|---|---|---|---|
0 | 1.0 | Stephen Curry | GSW | 34682550.0 |
1 | 2.0 | LeBron James | CLE | 33285709.0 |
2 | 3.0 | Paul Millsap | DEN | 31269231.0 |
3 | 4.0 | Gordon Hayward | BOS | 29727900.0 |
4 | 5.0 | Blake Griffin | DET | 29512900.0 |
import pandas as pd
con =sqlite3.connect("nba_salary.sqlite")
df1=pd.read_sql_query('SELECT * From Seasons_Stats',con)
df1=df1[df1.Year == 2017]
df1
X1 | Year | Player | Pos | Age | Tm | G | GS | MP | PER | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
24096 | 24096.0 | 2017.0 | Alex Abrines | SG | 23.0 | OKC | 68.0 | NaN | 1055.0 | 10.1 | ... | 0.898 | NaN | NaN | 86.0 | 40.0 | NaN | NaN | NaN | 114.0 | 406.0 |
24097 | 24097.0 | 2017.0 | Quincy Acy | PF | 26.0 | TOT | 38.0 | 1.0 | 558.0 | 11.8 | ... | 0.750 | NaN | NaN | 115.0 | 18.0 | NaN | NaN | NaN | 67.0 | 222.0 |
24098 | 24098.0 | 2017.0 | Quincy Acy | PF | 26.0 | DAL | 6.0 | 0.0 | 48.0 | -1.4 | ... | 0.667 | NaN | NaN | 8.0 | 0.0 | 0.0 | 0.0 | NaN | 9.0 | 13.0 |
24099 | 24099.0 | 2017.0 | Quincy Acy | PF | 26.0 | BRK | 32.0 | 1.0 | 510.0 | 13.1 | ... | 0.754 | NaN | NaN | 107.0 | 18.0 | NaN | NaN | NaN | 58.0 | 209.0 |
24100 | 24100.0 | 2017.0 | Steven Adams | C | 23.0 | OKC | 80.0 | NaN | 2389.0 | 16.5 | ... | 0.611 | NaN | NaN | 615.0 | 86.0 | NaN | NaN | NaN | 195.0 | 905.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
24686 | 24686.0 | 2017.0 | Cody Zeller | PF | 24.0 | CHO | 62.0 | NaN | 1725.0 | 16.7 | ... | 0.679 | NaN | NaN | 405.0 | 99.0 | NaN | NaN | NaN | 189.0 | 639.0 |
24687 | 24687.0 | 2017.0 | Tyler Zeller | C | 27.0 | BOS | 51.0 | NaN | 525.0 | 13.0 | ... | 0.564 | NaN | NaN | 124.0 | 42.0 | NaN | NaN | NaN | 61.0 | 178.0 |
24688 | 24688.0 | 2017.0 | Stephen Zimmerman | C | 20.0 | ORL | 19.0 | 0.0 | 108.0 | 7.3 | ... | 0.600 | NaN | NaN | 35.0 | 4.0 | NaN | NaN | NaN | 17.0 | 23.0 |
24689 | 24689.0 | 2017.0 | Paul Zipser | SF | 22.0 | CHI | 44.0 | NaN | 843.0 | 6.9 | ... | 0.775 | NaN | NaN | 125.0 | 36.0 | NaN | NaN | NaN | 78.0 | 240.0 |
24690 | 24690.0 | 2017.0 | Ivica Zubac | C | 19.0 | LAL | 38.0 | NaN | 609.0 | 17.0 | ... | 0.653 | NaN | NaN | 159.0 | 30.0 | NaN | NaN | NaN | 66.0 | 284.0 |
595 rows × 53 columns
df.Player.duplicated()
0 False 1 False 2 False 3 False 4 False ... 568 True 569 False 570 False 571 True 572 True Name: Player, Length: 573, dtype: bool
df.Player.duplicated().sum()
38
df.loc[df.Player.duplicated(),:].sort_values(by='Player')
X1 | Player | Tm | season17_18 | |
---|---|---|---|---|
331 | 332.0 | Anthony Tolliver | SAC | 2000000.0 |
555 | 556.0 | Antonius Cleveland | DAL | 50000.0 |
377 | 378.0 | Arron Afflalo | SAC | 1500000.0 |
552 | 553.0 | Briante Weber | HOU | 50000.0 |
550 | 551.0 | Briante Weber | LAL | 50000.0 |
503 | 504.0 | DeAndre Liggins | NOP | 172238.0 |
566 | 567.0 | DeAndre Liggins | ATL | 26773.0 |
528 | 529.0 | Demetrius Jackson | HOU | 74159.0 |
535 | 536.0 | Demetrius Jackson | HOU | 50000.0 |
556 | 557.0 | Diamond Stone | CHI | 50000.0 |
297 | 298.0 | Dwyane Wade | MIA | 2328652.0 |
487 | 488.0 | Ersan Ilyasova | PHI | 502328.0 |
529 | 530.0 | Georgios Papagiannis | POR | 74159.0 |
201 | 202.0 | Greg Monroe | BOS | 5000000.0 |
501 | 502.0 | Isaiah Canaan | CHI | 200000.0 |
532 | 533.0 | Isaiah Canaan | HOU | 53465.0 |
216 | 217.0 | Jamal Crawford | MIN | 4328000.0 |
305 | 306.0 | Jameer Nelson | DET | 2262871.0 |
526 | 527.0 | Jarell Eddie | CHI | 83129.0 |
572 | 573.0 | Jarell Eddie | CHI | 17224.0 |
481 | 482.0 | Joe Johnson | HOU | 749905.0 |
571 | 572.0 | Joel Bolomboy | MIL | 22248.0 |
311 | 312.0 | Josh Smith | NOP | 2183934.0 |
534 | 535.0 | Kyle Collinsworth | DAL | 50000.0 |
480 | 481.0 | Marco Belinelli | PHI | 776217.0 |
539 | 540.0 | Naz Mitrou-Long | UTA | 50000.0 |
559 | 560.0 | Nigel Hayes | LAL | 46080.0 |
560 | 561.0 | Nigel Hayes | TOR | 46080.0 |
568 | 569.0 | Quinn Cook | NOP | 25000.0 |
260 | 261.0 | Rajon Rondo | CHI | 3000000.0 |
497 | 498.0 | Ramon Sessions | WAS | 263124.0 |
505 | 506.0 | Rashad Vaughn | ORL | 166258.0 |
299 | 300.0 | Richard Jefferson | DEN | 2328652.0 |
477 | 478.0 | Sean Kilpatrick | MIL | 789725.0 |
504 | 505.0 | Sean Kilpatrick | LAC | 172238.0 |
506 | 507.0 | Troy Williams | NYK | 148318.0 |
533 | 534.0 | Tyler Cavanaugh | ATL | 50000.0 |
500 | 501.0 | Wade Baldwin | POR | 222476.0 |
df1.Player.duplicated()
24096 False 24097 False 24098 True 24099 True 24100 False ... 24686 False 24687 False 24688 False 24689 False 24690 False Name: Player, Length: 595, dtype: bool
df1.Player.duplicated().sum()
109
df1.loc[df1.Player.duplicated(),:].sort_values(by='Player')
X1 | Year | Player | Pos | Age | Tm | G | GS | MP | PER | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
24155 | 24155.0 | 2017.0 | Andrew Bogut | C | 32.0 | DAL | 26.0 | NaN | 582.0 | 9.4 | ... | 0.273 | NaN | NaN | 218.0 | 49.0 | NaN | NaN | NaN | 84.0 | 79.0 |
24156 | 24156.0 | 2017.0 | Andrew Bogut | C | 32.0 | CLE | 1.0 | 0.0 | 1.0 | -35.3 | ... | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 |
24492 | 24492.0 | 2017.0 | Andrew Nicholson | PF | 27.0 | BRK | 10.0 | 0.0 | 111.0 | 5.0 | ... | 1.000 | NaN | NaN | 27.0 | 3.0 | NaN | 0.0 | NaN | 18.0 | 30.0 |
24491 | 24491.0 | 2017.0 | Andrew Nicholson | PF | 27.0 | WAS | 28.0 | 0.0 | 231.0 | 6.3 | ... | 0.583 | NaN | NaN | 34.0 | 7.0 | NaN | NaN | NaN | 37.0 | 70.0 |
24167 | 24167.0 | 2017.0 | Anthony Brown | SF | 24.0 | NOP | 9.0 | 0.0 | 143.0 | 5.9 | ... | NaN | NaN | NaN | 26.0 | 6.0 | NaN | 1.0 | NaN | 13.0 | 34.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
24250 | 24250.0 | 2017.0 | Tyreke Evans | SF | 27.0 | NOP | 26.0 | 0.0 | 474.0 | 15.8 | ... | 0.776 | NaN | NaN | 86.0 | 91.0 | NaN | NaN | NaN | 41.0 | 248.0 |
24581 | 24581.0 | 2017.0 | Wayne Selden | SG | 22.0 | MEM | 11.0 | NaN | 189.0 | 6.2 | ... | 0.667 | 1.0 | NaN | 11.0 | 12.0 | NaN | 1.0 | NaN | 12.0 | 55.0 |
24580 | 24580.0 | 2017.0 | Wayne Selden | SG | 22.0 | NOP | 3.0 | NaN | 47.0 | 10.0 | ... | 0.500 | 0.0 | NaN | 5.0 | 1.0 | NaN | 0.0 | NaN | 6.0 | 16.0 |
24261 | 24261.0 | 2017.0 | Yogi Ferrell | PG | 23.0 | DAL | 36.0 | NaN | 1046.0 | 14.1 | ... | 0.877 | NaN | NaN | 99.0 | 155.0 | NaN | NaN | NaN | 79.0 | 408.0 |
24260 | 24260.0 | 2017.0 | Yogi Ferrell | PG | 23.0 | BRK | 10.0 | 0.0 | 151.0 | 6.5 | ... | 0.625 | NaN | NaN | 12.0 | 17.0 | NaN | NaN | NaN | 11.0 | 54.0 |
109 rows × 53 columns
Parece ser que los jugadores dentro de la misma temporada pueden cambiar de Equipo y eso hace que aparezcan con diferentes registros
print(df.columns)
print(df.columns)
print(df1.columns)
Index(['X1', 'Player', 'Tm', 'season17_18'], dtype='object') Index(['X1', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'], dtype='object')
lista=df.columns
lista1=df1.columns
list(set(lista1).intersection(lista))
['Player', 'X1', 'Tm']
pd.merge(df1,df, on =['Player'],how= 'inner')
X1_x | Year | Player | Pos | Age | Tm_x | G | GS | MP | PER | ... | TRB | AST | STL | BLK | TOV | PF | PTS | X1_y | Tm_y | season17_18 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 24096.0 | 2017.0 | Alex Abrines | SG | 23.0 | OKC | 68.0 | NaN | 1055.0 | 10.1 | ... | 86.0 | 40.0 | NaN | NaN | NaN | 114.0 | 406.0 | 185.0 | OKC | 5725000.0 |
1 | 24097.0 | 2017.0 | Quincy Acy | PF | 26.0 | TOT | 38.0 | 1.0 | 558.0 | 11.8 | ... | 115.0 | 18.0 | NaN | NaN | NaN | 67.0 | 222.0 | 350.0 | BRK | 1709538.0 |
2 | 24098.0 | 2017.0 | Quincy Acy | PF | 26.0 | DAL | 6.0 | 0.0 | 48.0 | -1.4 | ... | 8.0 | 0.0 | 0.0 | 0.0 | NaN | 9.0 | 13.0 | 350.0 | BRK | 1709538.0 |
3 | 24099.0 | 2017.0 | Quincy Acy | PF | 26.0 | BRK | 32.0 | 1.0 | 510.0 | 13.1 | ... | 107.0 | 18.0 | NaN | NaN | NaN | 58.0 | 209.0 | 350.0 | BRK | 1709538.0 |
4 | 24100.0 | 2017.0 | Steven Adams | C | 23.0 | OKC | 80.0 | NaN | 2389.0 | 16.5 | ... | 615.0 | 86.0 | NaN | NaN | NaN | 195.0 | 905.0 | 32.0 | OKC | 22471910.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
547 | 24686.0 | 2017.0 | Cody Zeller | PF | 24.0 | CHO | 62.0 | NaN | 1725.0 | 16.7 | ... | 405.0 | 99.0 | NaN | NaN | NaN | 189.0 | 639.0 | 95.0 | CHO | 12584270.0 |
548 | 24687.0 | 2017.0 | Tyler Zeller | C | 27.0 | BOS | 51.0 | NaN | 525.0 | 13.0 | ... | 124.0 | 42.0 | NaN | NaN | NaN | 61.0 | 178.0 | 349.0 | MIL | 1709538.0 |
549 | 24688.0 | 2017.0 | Stephen Zimmerman | C | 20.0 | ORL | 19.0 | 0.0 | 108.0 | 7.3 | ... | 35.0 | 4.0 | NaN | NaN | NaN | 17.0 | 23.0 | 443.0 | LAL | 1312611.0 |
550 | 24689.0 | 2017.0 | Paul Zipser | SF | 22.0 | CHI | 44.0 | NaN | 843.0 | 6.9 | ... | 125.0 | 36.0 | NaN | NaN | NaN | 78.0 | 240.0 | 432.0 | CHI | 1312611.0 |
551 | 24690.0 | 2017.0 | Ivica Zubac | C | 19.0 | LAL | 38.0 | NaN | 609.0 | 17.0 | ... | 159.0 | 30.0 | NaN | NaN | NaN | 66.0 | 284.0 | 426.0 | LAL | 1312611.0 |
552 rows × 56 columns
pd.merge(df1,df, on =['Player'],how= 'inner')[['Player','PTS','season17_18']]
Player | PTS | season17_18 | |
---|---|---|---|
0 | Alex Abrines | 406.0 | 5725000.0 |
1 | Quincy Acy | 222.0 | 1709538.0 |
2 | Quincy Acy | 13.0 | 1709538.0 |
3 | Quincy Acy | 209.0 | 1709538.0 |
4 | Steven Adams | 905.0 | 22471910.0 |
... | ... | ... | ... |
547 | Cody Zeller | 639.0 | 12584270.0 |
548 | Tyler Zeller | 178.0 | 1709538.0 |
549 | Stephen Zimmerman | 23.0 | 1312611.0 |
550 | Paul Zipser | 240.0 | 1312611.0 |
551 | Ivica Zubac | 284.0 | 1312611.0 |
552 rows × 3 columns
f = {'season17_18':['sum'], 'PTS':['mean']}
pd.merge(df1,df, on =['Player'],how= 'inner')[['Player','PTS','season17_18']].groupby('Player').agg(f)
season17_18 | PTS | |
---|---|---|
sum | mean | |
Player | ||
A.J. Hammons | 1312611.0 | 48.0 |
Aaron Brooks | 2116955.0 | 322.0 |
Aaron Gordon | 5504420.0 | 1019.0 |
Al Horford | 27734405.0 | 952.0 |
Al Jefferson | 9769821.0 | 535.0 |
... | ... | ... |
Wilson Chandler | 12016854.0 | 1117.0 |
Yogi Ferrell | 3937833.0 | 308.0 |
Zach LaVine | 3202217.0 | 889.0 |
Zach Randolph | 12307692.0 | 1028.0 |
Zaza Pachulia | 3477600.0 | 426.0 |
411 rows × 2 columns