import pandas as pd import matplotlib.pyplot as plt # Leer el dataset url='https://raw.githubusercontent.com/JJTorresDS/ds-data-sources/main/income_per_person_gdppercapita_ppp_inflation_adjusted.csv' df= pd.read_csv(url,sep=',') df sum(df.isnull().sum()) print(df.shape) df= df.drop_duplicates() print(df.shape) # Parte 1 df_pais=df[df['country'] == 'Argentina'].iloc[:,217:224] lista=df_pais.values[0] lista def promedio_pais(fila): suma = cantidad = 0 for col in fila: if str(col)[-1] == "k": col = col[:-1] suma += float(col) cantidad += 1 return suma/cantidad import numpy as np df_pais=df_pais.apply(lambda x: x.replace("k", "")) df_pais lista_nueva = [] for string in lista: x = float(string.replace("k", "")) lista_nueva.append(x) import statistics statistics.mean(lista_nueva)*1000 # Parte 2 (Una forma de hacerlo) uso de filtros df_paises=df[df['country'].isin(['Argentina','Chile','Colombia','Bolivia','Peru','Brasil','Uruguay','Venezuela','Paraguay','Ecuador'])].iloc[:,217:224] lista=df_paises.values lista lista_nueva = [] for string in lista: for j in string: if 'k' in j: #print(j) x = float(j.replace("k", ""))*1000 lista_nueva.append(x) #print(x) else: x= float(j) lista_nueva.append(x) #print('---') #print(x) import statistics statistics.mean(lista_nueva) df.country.unique() df_paises=df[df['country'].isin(['United States','Canada','Mexico','Costa Rica','Nicaragua'])].iloc[:,217:224] lista=df_paises.values lista lista_nueva = [] for string in lista: for j in string: if 'k' in j: #print(j) x = float(j.replace("k", ""))*1000 lista_nueva.append(x) #print(x) else: x= float(j) lista_nueva.append(x) #print('---') #print(x) import statistics statistics.mean(lista_nueva) from google.colab import drive import os drive.mount('/content/gdrive') # Establecer ruta de acceso en drive import os print(os.getcwd()) os.chdir("/content/gdrive/My Drive") import pandas as pd import sqlite3 sql= "SELECT name FROM sqlite_master WHERE type='table'" # Leer el archivo sqllite y convertirlo a dataframe con =sqlite3.connect("nba_salary.sqlite") df=pd.read_sql_query(sql,con) # Verificar el resultado en un dataframe print(df) con.close() con =sqlite3.connect("nba_salary.sqlite") df=pd.read_sql_query('SELECT * FROM NBA_season1718_salary',con) df.head() import pandas as pd con =sqlite3.connect("nba_salary.sqlite") df1=pd.read_sql_query('SELECT * From Seasons_Stats',con) df1 df.isnull().values.any() df.isnull().sum() df1.isnull().values.any() df1.isnull().sum() print(df1.shape) df1_final=df1.dropna(axis=0)# Filas con columas con Nas seran removidas print(df1_final.shape) df1_final.head(10) # ! Esto no funciona! print(df1.shape) df1_final=df1.dropna(axis=1)# Filas con columas con Nas seran removidas print(df1_final.shape) df1_final.head(10) # ! Esto NO funciona! percent_missing = df1.isnull().sum() * 100 / len(df1) missing_value_df = pd.DataFrame({'Columnas': df1.columns, 'Porcentaje_Missing': percent_missing}) missing_value_df missing_value_df_f=missing_value_df[missing_value_df['Porcentaje_Missing']<=50] missing_value_df_f lista_variables=list(missing_value_df_f.Columnas) print(type(lista_variables)) print(len(lista_variables)) lista_variables df1_x=df1.loc[:, df1.columns.isin(lista_variables)] df1_x df1_y= df1_x.interpolate() percent_missing_y = df1_y.isnull().sum() * 100 / len(df1_y) percent_missing_y df1_y con =sqlite3.connect("nba_salary.sqlite") df=pd.read_sql_query('SELECT * FROM NBA_season1718_salary',con) df.head() import pandas as pd con =sqlite3.connect("nba_salary.sqlite") df1=pd.read_sql_query('SELECT * From Seasons_Stats',con) df1=df1[df1.Year == 2017] df1 df.Player.duplicated() df.Player.duplicated().sum() df.loc[df.Player.duplicated(),:].sort_values(by='Player') df1.Player.duplicated() df1.Player.duplicated().sum() df1.loc[df1.Player.duplicated(),:].sort_values(by='Player') print(df.columns) print(df1.columns) lista=df.columns lista1=df1.columns list(set(lista1).intersection(lista)) pd.merge(df1,df, on =['Player'],how= 'inner') pd.merge(df1,df, on =['Player'],how= 'inner')[['Player','PTS','season17_18']] f = {'season17_18':['sum'], 'PTS':['mean']} pd.merge(df1,df, on =['Player'],how= 'inner')[['Player','PTS','season17_18']].groupby('Player').agg(f)