#!/usr/bin/env python
# coding: utf-8
"""Baseline random-forest regression on the Ames house-price data.

Steps performed when the script runs:

1. Load ``ames/train.csv`` and keep a small, arbitrarily chosen set of
   numeric columns as features, with ``SalePrice`` as the target.
2. Hold out 30% of the rows, fit a ``RandomForestRegressor`` on the rest and
   print the log-RMSE on both the held-out and the training rows.
3. Predict ``SalePrice`` for ``ames/test.csv`` and write the predictions to
   ``submission_01.csv`` with the columns ``Id`` and ``SalePrice``.
"""

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# One seed shared by the train/test split and the forest so that repeated runs
# print the same scores and write the same submission file.
RANDOM_STATE = 2


# Scoring metric.
def log_rmse(yhat, ytrue):
    """Return the root mean squared error between the logs of two arrays.

    Args:
        yhat: Predicted values (array-like of strictly positive numbers).
        ytrue: Observed values (array-like of strictly positive numbers).

    Returns:
        ``sqrt(mean((log(ytrue) - log(yhat)) ** 2))`` as a float. The measure
        is symmetric in its two arguments.
    """
    return np.sqrt(mean_squared_error(np.log(ytrue), np.log(yhat)))


# Load the training data.
df = pd.read_csv('ames/train.csv')

# Arbitrary selection of feature columns.
cols = ['MoSold', 'YrSold', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr']

# Check for NaN: print, per column, the shape of the sub-frame of rows where
# that column is null (a leading 0 means the column has no missing values).
for col in cols:
    print("{}: {}".format(col, df[df[col].isnull()].shape))

X = df[cols]
y = df['SalePrice']

# Split into train and held-out test parts (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Train the regressor. Seeding the forest keeps the reported scores and the
# submission reproducible from one run to the next.
clf = RandomForestRegressor(
    max_depth=5, n_estimators=300, random_state=RANDOM_STATE
)
clf.fit(X_train, y_train)
print(clf)

yhat_test = clf.predict(X_test)
yhat_train = clf.predict(X_train)

# Arguments are passed as (prediction, truth) to match log_rmse's signature.
print("\n -- RMSE test {:.4f}".format(log_rmse(yhat_test, y_test)))
print("\n -- RMSE train {:.4f}".format(log_rmse(yhat_train, y_train)))

# Apply the model to the Kaggle test file.

# Load the validation dataset.
vdf = pd.read_csv('ames/test.csv')

# Any transformation applied to the training features must be applied to
# X_valid here as well (none are applied at the moment).
X_valid = vdf[cols]
y_valid = clf.predict(X_valid)

# Build the results dataframe.
# Prefer the identifiers shipped in the file when an 'Id' column is present,
# so the submission stays correct even if rows are reordered or dropped.
# Otherwise fall back to positional numbering starting at 1461, which is the
# first id Kaggle expects for this test set.
if 'Id' in vdf.columns:
    submission_ids = vdf['Id'].values
else:
    submission_ids = X_valid.index + 1461

results = pd.DataFrame(
    {'Id': submission_ids, 'SalePrice': y_valid},
    columns=['Id', 'SalePrice'],
)

# Write the results to the CSV file.
results.to_csv("submission_01.csv", index=False)