import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# scoring metric
def log_rmse(yhat, ytrue):
    """Root mean squared error computed in log space.

    Equivalent to ``np.sqrt(mean_squared_error(np.log(yhat), np.log(ytrue)))``
    but expressed directly in numpy. Symmetric in its two arguments.

    Args:
        yhat: predicted values (array-like of strictly positive numbers —
            ``np.log`` is applied without any offset).
        ytrue: ground-truth values (same constraints as ``yhat``).

    Returns:
        float: sqrt of the mean squared difference of the logs.
    """
    yhat = np.asarray(yhat, dtype=float)
    ytrue = np.asarray(ytrue, dtype=float)
    return float(np.sqrt(np.mean((np.log(yhat) - np.log(ytrue)) ** 2)))
# load the Ames housing training set
df = pd.read_csv('ames/train.csv')
# quick visual check of the first rows (notebook cell output below)
df.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# arbitrary selection of numeric columns as features
cols = ['MoSold', 'YrSold', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr']

# check for NaN: print, for each column, the shape of the subframe of
# rows where that column is null — (0, ...) means no missing values
for col in cols:
    print("{}: {}".format(col, df[df[col].isnull()].shape))
# no null values in these columns

X = df[cols]
y = df['SalePrice']
MoSold: (0, 81) YrSold: (0, 81) 1stFlrSF: (0, 81) 2ndFlrSF: (0, 81) BedroomAbvGr: (0, 81)
# train/test split: 70% train, 30% test, fixed seed for reproducibility
# (train_test_split is already imported at the top of the file)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)
# train the regressor: shallow trees (max_depth=5) to limit overfitting
clf = RandomForestRegressor(max_depth=5, n_estimators=300)
clf.fit(X_train, y_train)
print(clf)

yhat_test = clf.predict(X_test)
yhat_train = clf.predict(X_train)

# log_rmse is symmetric, but pass (yhat, ytrue) to match its signature
print("\n -- RMSE test {:.4f}".format(log_rmse(yhat_test, y_test)))
print("\n -- RMSE train {:.4f}".format(log_rmse(yhat_train, y_train)))
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False) -- RMSE test 0.2365 -- RMSE train 0.2117
# load the validation (Kaggle test) dataset
vdf = pd.read_csv('ames/test.csv')

# apply the same transformations / column selection used for training
X_valid = vdf[cols]
y_valid = clf.predict(X_valid)

# build the results dataframe. Kaggle expects Ids starting at 1461;
# take them from the file's own 'Id' column rather than recomputing
# them from the row index, which breaks if the index is not the
# default RangeIndex
results = pd.DataFrame({'Id': vdf['Id'], 'SalePrice': y_valid})

# write the results to the submission csv (no index column)
results.to_csv("submission_01.csv", index=False)