import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# scoring metric
def log_rmse(yhat, ytrue):
    """Root mean squared error computed in log space.

    Equivalent to ``np.sqrt(mean_squared_error(np.log(yhat), np.log(ytrue)))``
    but expressed directly in numpy. Symmetric in its two arguments.

    Args:
        yhat: predicted values (array-like of strictly positive numbers —
            ``np.log`` is applied without any offset).
        ytrue: ground-truth values (same constraints as ``yhat``).

    Returns:
        float: sqrt of the mean squared difference of the logs.
    """
    yhat = np.asarray(yhat, dtype=float)
    ytrue = np.asarray(ytrue, dtype=float)
    return float(np.sqrt(np.mean((np.log(yhat) - np.log(ytrue)) ** 2)))
# load the Ames housing training set
df = pd.read_csv('ames/train.csv')
# quick visual check of the first rows (notebook cell output below)
df.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# arbitrary selection of numeric columns as features
cols = ['MoSold', 'YrSold', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr']

# check for NaN: print, for each column, the shape of the subframe of
# rows where that column is null — (0, ...) means no missing values
for col in cols:
    print("{}: {}".format(col, df[df[col].isnull()].shape))
# no null values in these columns

X = df[cols]
y = df['SalePrice']
MoSold: (0, 81) YrSold: (0, 81) 1stFlrSF: (0, 81) 2ndFlrSF: (0, 81) BedroomAbvGr: (0, 81)
# train/test split: 70% train, 30% test, fixed seed for reproducibility
# (train_test_split is already imported at the top of the file)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)
# train the regressor: shallow trees (max_depth=5) to limit overfitting
clf = RandomForestRegressor(max_depth=5, n_estimators=300)
clf.fit(X_train, y_train)
print(clf)

yhat_test = clf.predict(X_test)
yhat_train = clf.predict(X_train)

# log_rmse is symmetric, but pass (yhat, ytrue) to match its signature
print("\n -- RMSE test {:.4f}".format(log_rmse(yhat_test, y_test)))
print("\n -- RMSE train {:.4f}".format(log_rmse(yhat_train, y_train)))
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False) -- RMSE test 0.2365 -- RMSE train 0.2117
# load the validation (Kaggle test) dataset
vdf = pd.read_csv('ames/test.csv')

# apply the same transformations / column selection used for training
X_valid = vdf[cols]
y_valid = clf.predict(X_valid)

# build the results dataframe. Kaggle expects Ids starting at 1461;
# take them from the file's own 'Id' column rather than recomputing
# them from the row index, which breaks if the index is not the
# default RangeIndex
results = pd.DataFrame({'Id': vdf['Id'], 'SalePrice': y_valid})

# write the results to the submission csv (no index column)
results.to_csv("submission_01.csv", index=False)