import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
from google.colab import drive

drive.mount('/content/drive')

# Set the working directory on Drive
print(os.getcwd())
os.chdir("/content/drive/My Drive")
print(os.getcwd())

train = pd.read_csv('train_datathon.csv', sep=',')
train.head()

test = pd.read_csv('test_datathon.csv', sep=',')
test.head()

print(train.shape)
print(test.shape)

train.info()

pd.set_option('display.max_rows', None)  # or 1000

# Percentage of missing values per column, largest first
serie = (train.isnull().sum() / train.shape[0]) * 100
serie = serie.sort_values(ascending=False)
serie = serie[serie > 0]
serie

plt.figure(figsize=(10, 6))
serie.plot(kind='bar')

!pip install missingno
import missingno as msno

msno.matrix(train)
msno.heatmap(train)
msno.dendrogram(train)

train.PoolQC.value_counts()        # Decision: drop
train.MiscFeature.value_counts()   # Decision: drop
train.Alley.value_counts()         # Decision: drop
train.Fence.value_counts()         # Decision: drop
train.FireplaceQu.value_counts()   # Decision: this one could take a different technique instead of dropping the column
train.LotFrontage.isnull().sum()   # Decision: impute (median is used below)
train.GarageFinish.value_counts()  # Decision: some replacement technique
train.GarageQual.value_counts()    # Decision: some replacement technique
train.GarageCond.value_counts()    # Decision: some replacement technique
train.GarageType.value_counts()    # Decision: some replacement technique
train.BsmtExposure.value_counts()  # Decision: replacement technique
train.GarageYrBlt.isnull().sum()   # Decision: impute (median is used below)
train.BsmtFinType2.value_counts()  # Decision: replacement technique
train.BsmtFinType1.value_counts()  # Decision: replacement technique
train.BsmtCond.value_counts()      # Decision: replace with the mode
train.BsmtQual.value_counts()      # Decision: replace with the mode
train.MasVnrArea.isnull().sum()
train.MasVnrType.value_counts()
train.Electrical.value_counts()

train.shape

col_del = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Id']  # uninformative columns to drop
col_inter = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea']     # numeric columns to impute with the median
col_cat = ['FireplaceQu', 'GarageFinish', 'GarageCond', 'GarageType', 'BsmtExposure', 'BsmtFinType2',
           'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType', 'Electrical']  # categorical columns

# Drop the uninformative columns
train_x = train.drop(labels=col_del, axis=1)
train_x.columns  # check the remaining columns

# Replace missing numeric values with the column median
for i in col_inter:
    train_x[i] = train_x[i].fillna(train_x[i].median())
train_x.MasVnrArea.isnull().sum()  # verify it worked

# Replace missing categorical values with 'Desconocido' (unknown)
for i in col_cat:
    train_x[i] = train_x[i].fillna('Desconocido')
train_x.Electrical.isnull().sum()  # verify it worked
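# Hedged aside, not part of the original pipeline: the fillna loops above could also
# be expressed with sklearn's SimpleImputer, which learns the medians on train and
# can reapply the exact same values to the test set later. The names num_imputer and
# cat_imputer are illustrative; col_inter / col_cat are the lists defined above.
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='median').fit(train[col_inter])
cat_imputer = SimpleImputer(strategy='constant', fill_value='Desconocido').fit(train[col_cat])
# train[col_inter] would become num_imputer.transform(train[col_inter]), and the same
# transform (with the train medians) would later be applied to the test set.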
train_x.SaleCondition.dtype
train_x.SaleCondition.value_counts()

alldata = []    # list to hold the One-Hot-Encoding results
alldata1 = []   # list to hold the Label-Encoding results
variables = []  # keep track of the recoded variables

for i in train_x.columns:                       # iterate over every column
    if train_x[i].dtype == 'O':                 # keep only object (string) columns
        if len(train_x[i].unique()) == 2:       # exactly two categories: One Hot Encoding
            print('Variable 2 categorias:', i)
            y = pd.get_dummies(train_x[i])      # apply One Hot Encoding
            alldata.append(y)                   # collect the resulting columns in alldata
            variables.append(i)
        print('--------------------')
        if len(train_x[i].unique()) > 2:        # more than two categories: Label Encoder
            print('Variable >2 categorias:', i)
            z = pd.DataFrame()
            z[i] = train_x[i].astype('category').cat.codes  # apply Label Encoding
            alldata1.append(z)                  # collect in alldata1
            variables.append(i)
    if train_x[i].dtype == 'float64':           # numeric column: z-score normalization
        train_x[i] = (train_x[i] - train_x[i].mean()) / train_x[i].std()

data_y = pd.concat(alldata, axis=1)   # concatenate the One-Hot-Encoded columns
data_z = pd.concat(alldata1, axis=1)  # concatenate the Label-Encoded columns
train_y = pd.concat([data_y, data_z, train_x], axis=1)  # join One Hot Encoding, Label Encoder and z-scored data
train_y = train_y.drop(labels=variables, axis=1)        # drop the already-recoded original columns
train_y.head()  # show what is left
train_y.info()

# Bring in the models
from sklearn.model_selection import train_test_split
from sklearn import metrics

X = train_y.drop(columns=['SalePrice'])  # design matrix
y = train_y.SalePrice                    # response vector
X.head()

# Train/test split: 80% training and 20% test (a separate datathon test set exists as well)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(n_estimators=12, learning_rate=0.1, loss='linear', random_state=42)

# Fit the model
model = ada.fit(X_train, y_train)
X_train.head()
X_test.head()

# Predict on the hold-out split
y_pred = model.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('r2: ', r2)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('MSE: ', mse)
mae = median_absolute_error(y_true=y_test, y_pred=y_pred)
print('MedAE: ', mae)

# Now compare against GBM
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(loss='squared_error',  # 'ls' in scikit-learn < 1.0
                                 learning_rate=0.1, n_estimators=50, subsample=0.8,
                                 max_depth=4, criterion='friedman_mse', random_state=42)

# Fit the model
model1 = gbrt.fit(X_train, y_train)
y_pred = model1.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('r2: ', r2)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('MSE: ', mse)
mae = median_absolute_error(y_true=y_test, y_pred=y_pred)
print('MedAE: ', mae)

# LightGBM
import lightgbm as lgb  # !pip install lightgbm
clf = lgb.LGBMRegressor(boosting_type='gbdt', max_depth=4, num_leaves=20,
                        learning_rate=0.01, n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('r2: ', r2)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('MSE: ', mse)
mae = median_absolute_error(y_true=y_test, y_pred=y_pred)
print('MedAE: ', mae)

# XGBoost
import xgboost as xgb

regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

# Fit
regressor.fit(X_train, y_train)

# Predict
y_pred = regressor.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('r2: ', r2)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('MSE: ', mse)
mae = median_absolute_error(y_true=y_test, y_pred=y_pred)
print('MedAE: ', mae)
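# Hedged aside, not in the original notebook: a single 80/20 split gives a noisy
# estimate, so a quick k-fold cross-validation is a sketch of a more stable way to
# compare the four boosters. All names (ada, gbrt, clf, regressor, X, y) come from
# the cells above.
from sklearn.model_selection import cross_val_score

for name, est in [('AdaBoost', ada), ('GradientBoosting', gbrt),
                  ('LightGBM', clf), ('XGBoost', regressor)]:
    scores = cross_val_score(est, X, y, cv=5, scoring='r2')  # 5-fold R^2
    print(name, 'R2: %.3f +/- %.3f' % (scores.mean(), scores.std()))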
test.head()

col_del = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Id']
col_inter = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
             'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
col_cat = ['FireplaceQu', 'GarageFinish', 'GarageCond', 'GarageType', 'BsmtExposure', 'BsmtFinType2',
           'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType', 'Electrical']

test_x = test.drop(labels=col_del, axis=1)

for i in col_inter:
    test_x[i] = test_x[i].fillna(test_x[i].median())

for i in col_cat:
    test_x[i] = test_x[i].fillna('Desconocido')

# Repeat the same recoding pipeline on the test set
alldata = []
alldata1 = []
variables = []
for i in test_x.columns:
    if test_x[i].dtype == 'O':
        if len(test_x[i].unique()) == 2:
            print('Variable 2 categorias:', i)
            y = pd.get_dummies(test_x[i])
            alldata.append(y)
            variables.append(i)
        print('--------------------')
        if len(test_x[i].unique()) > 2:
            print('Variable >2 categorias:', i)
            z = pd.DataFrame()
            z[i] = test_x[i].astype('category').cat.codes
            alldata1.append(z)
            variables.append(i)
    if test_x[i].dtype == 'float64':
        test_x[i] = (test_x[i] - test_x[i].mean()) / test_x[i].std()

data_y = pd.concat(alldata, axis=1)
data_z = pd.concat(alldata1, axis=1)
test_y = pd.concat([data_y, data_z, test_x], axis=1)  # join all the recoded pieces
test_y = test_y.drop(labels=variables, axis=1)        # drop the already-recoded columns
test_y.head()

test_y.isna().sum()
len(test_y.columns)
len(X_test.columns)
test_y.columns
X_test.columns

X_test.NoSeWa.value_counts()  # all zeros, so we can create an identical column
test_y['NoSeWa'] = 0
test_y.head()

interseccion = list(set(test_y.columns) & set(X_test.columns))
len(interseccion)
union = list(set(test_y.columns) | set(X_test.columns))
set(test_y.columns) ^ set(X_test.columns)  # this column is problematic and changes the dimension of the original dataset

# Predict with AdaBoost (model was fitted above)
y_pred = model.predict(test_y)
y_pred

df = pd.DataFrame()
df['Id'] = test_y.index
df['Prediccion_Adaboost'] = y_pred

y_pred1 = gbrt.predict(test_y)
df['Prediccion_GradientBoosting'] = y_pred1

y_pred2 = clf.predict(test_y)
df['Prediccion_LightGBM'] = y_pred2

df.head()
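# Hedged sketch, not in the original flow: instead of patching missing columns like
# NoSeWa by hand, reindex can align the test matrix to the exact columns (and column
# order) the fitted models saw during training, filling anything absent with 0.
# test_aligned is an illustrative name; X_test.columns is the training reference.
test_aligned = test_y.reindex(columns=X_test.columns, fill_value=0)
model.predict(test_aligned)  # same call as above, but with guaranteed column alignment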