This notebook was created using PyCaret 2.0. Last updated: 04-08-2020
House Prices prediction dataset from Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
The train dataset consists of 1460 samples with 81 features, including the target SalePrice.
The test dataset consists of 1459 samples with 80 features.
# Mount Google Drive
# Skip this step if running on local hardware
from google.colab import drive
drive.mount('/content/gdrive')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
# Install PyCaret 2.0 if it is not already available
#!pip install pycaret==2.0
from pycaret.regression import *
import pandas as pd
# check version
from pycaret.utils import version
version()
2.0
# Change the paths as per your file structure
# Remove root_path if running on local hardware
root_path = 'gdrive/My Drive/Colab Notebooks/'
data = pd.read_csv(root_path + 'HousePrice/train.csv')
test_data = pd.read_csv(root_path + 'HousePrice/test.csv')
print(data.shape, test_data.shape)
(1460, 81) (1459, 80)
data.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | ... | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | ... | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | ... | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | ... | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | ... | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | ... | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
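Before ignoring the high-null columns in `setup()` below, it is worth checking how sparse they actually are. A minimal sketch with a toy frame standing in for the Kaggle data (the real check would run on `data` itself):

```python
import pandas as pd

# Toy frame standing in for the Kaggle data: 'Alley' is mostly missing,
# which is why columns like it are ignored in setup() below.
df = pd.DataFrame({
    "Alley": [None, None, None, "Grvl"],
    "LotArea": [8450, 9600, 11250, 9550],
})

# Fraction of missing values per column, highest first.
null_frac = df.isnull().mean().sort_values(ascending=False)
high_null = null_frac[null_frac > 0.5].index.tolist()
print(high_null)  # columns that are more than half empty
```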
# Ignoring features with high null values
demo = setup(data = data, target = 'SalePrice',
ignore_features = ['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'],normalize = True,
transformation= True, transformation_method = 'yeo-johnson',
transform_target = True, remove_outliers= True,
remove_multicollinearity = True,
ignore_low_variance = True, combine_rare_levels = True)
Setup Successfully Completed.
Description | Value | |
---|---|---|
0 | session_id | 2553 |
1 | Transform Target | True |
2 | Transform Target Method | box-cox |
3 | Original Data | (1460, 81) |
4 | Missing Values | True |
5 | Numeric Features | 21 |
6 | Categorical Features | 59 |
7 | Ordinal Features | False |
8 | High Cardinality Features | False |
9 | High Cardinality Method | None |
10 | Sampled Data | (1387, 81) |
11 | Transformed Train Set | (970, 244) |
12 | Transformed Test Set | (417, 244) |
13 | Numeric Imputer | mean |
14 | Categorical Imputer | constant |
15 | Normalize | True |
16 | Normalize Method | zscore |
17 | Transformation | True |
18 | Transformation Method | yeo-johnson |
19 | PCA | False |
20 | PCA Method | None |
21 | PCA Components | None |
22 | Ignore Low Variance | True |
23 | Combine Rare Levels | True |
24 | Rare Level Threshold | 0.100000 |
25 | Numeric Binning | False |
26 | Remove Outliers | True |
27 | Outliers Threshold | 0.050000 |
28 | Remove Multicollinearity | True |
29 | Multicollinearity Threshold | 0.900000 |
30 | Clustering | False |
31 | Clustering Iteration | None |
32 | Polynomial Features | False |
33 | Polynomial Degree | None |
34 | Trignometry Features | False |
35 | Polynomial Threshold | None |
36 | Group Features | False |
37 | Feature Selection | False |
38 | Features Selection Threshold | None |
39 | Feature Interaction | False |
40 | Feature Ratio | False |
41 | Interaction Threshold | None |
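The `Normalize Method` reported above is `zscore`: each numeric feature is centered to mean 0 and scaled to unit variance before modeling. A minimal pure-Python sketch of that transformation:

```python
def zscore(values):
    """Center values to mean 0 and scale to unit (population) variance."""
    n = len(values)
    mean = sum(values) / n
    std = (sum((v - mean) ** 2 for v in values) / n) ** 0.5
    return [(v - mean) / std for v in values]

scaled = zscore([2.0, 4.0, 6.0])
print(scaled)  # symmetric around 0
```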
# Blacklist the Theil-Sen Regressor
# Results are automatically sorted on R2
compare_models(blacklist = ['tr'])
Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) | |
---|---|---|---|---|---|---|---|---|
0 | CatBoost Regressor | 15313.6607 | 776434230.5449 | 25574.6520 | 0.8966 | 0.1216 | 0.0854 | 6.0545 |
1 | Huber Regressor | 14604.9714 | 912236241.9322 | 25213.0988 | 0.8939 | 0.1190 | 0.0833 | 0.1802 |
2 | Support Vector Machine | 15387.0710 | 795925878.7054 | 26206.2707 | 0.8893 | 0.1252 | 0.0865 | 0.2447 |
3 | Bayesian Ridge | 15381.1023 | 920602470.6363 | 26383.7903 | 0.8880 | 0.1212 | 0.0855 | 0.0722 |
4 | Light Gradient Boosting Machine | 16799.0399 | 837267455.9643 | 27326.6238 | 0.8817 | 0.1303 | 0.0925 | 0.2839 |
5 | Gradient Boosting Regressor | 16988.6879 | 900162869.8805 | 27414.7745 | 0.8814 | 0.1312 | 0.0943 | 0.7129 |
6 | Extreme Gradient Boosting | 17203.8522 | 890529209.3684 | 27723.6269 | 0.8793 | 0.1338 | 0.0964 | 0.4334 |
7 | Ridge Regression | 16198.3462 | 996008412.4994 | 27658.9915 | 0.8776 | 0.1255 | 0.0893 | 0.0130 |
8 | Random Forest | 17884.1543 | 918488672.4057 | 28876.3710 | 0.8683 | 0.1407 | 0.0996 | 1.5661 |
9 | Orthogonal Matching Pursuit | 17525.4920 | 1180290533.4058 | 30383.1262 | 0.8544 | 0.1384 | 0.0986 | 0.0142 |
10 | K Neighbors Regressor | 20935.8666 | 1263777102.8334 | 33958.8645 | 0.8201 | 0.1578 | 0.1129 | 0.0197 |
11 | Linear Regression | 18579.5821 | 1412999991.7642 | 34303.1362 | 0.8149 | 0.8875 | 0.1016 | 0.0375 |
12 | Extra Trees Regressor | 21077.7765 | 1450475526.5097 | 37113.8801 | 0.7864 | 0.1648 | 0.1114 | 1.5526 |
13 | Passive Aggressive Regressor | 23476.6379 | 1455925010.0713 | 34983.9284 | 0.7832 | 0.1663 | 0.1258 | 0.0310 |
14 | AdaBoost Regressor | 24871.3695 | 1596338786.3069 | 38377.7193 | 0.7702 | 0.1783 | 0.1330 | 0.4744 |
15 | Random Sample Consensus | 20436.4470 | 2024540158.8414 | 42271.7509 | 0.7199 | 1.8876 | 0.1166 | 2.4091 |
16 | Decision Tree | 28313.9691 | 2362204385.5155 | 46922.6999 | 0.6505 | 0.2193 | 0.1556 | 0.0383 |
17 | Elastic Net | 48540.7327 | 5644074263.3008 | 73662.8744 | 0.1642 | 0.3367 | 0.2630 | 0.0100 |
18 | Lasso Regression | 55943.9350 | 6918562280.5182 | 81883.4356 | -0.0385 | 0.3877 | 0.3102 | 0.0097 |
19 | Lasso Least Angle Regression | 55943.9350 | 6918562280.5182 | 81883.4356 | -0.0385 | 0.3877 | 0.3102 | 0.0123 |
20 | Least Angle Regression | 5993932134656973907951616.0000 | 34849194485222397251019231724335227216317654899884032.0000 | 59033380261709037822803968.0000 | -5421370651881711978246612473249811579011072.0000 | 9.7590 | 31547047712188772352.0000 | 0.1863 |
<catboost.core.CatBoostRegressor at 0x7f4ec3e38748>
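In PyCaret 2.0, `compare_models` returns the single best model after sorting the leaderboard on R2, which is why the cell above evaluates to a `CatBoostRegressor`. Conceptually, using a few mean R2 values taken from the table:

```python
# (name, mean R2) pairs reproduced from the leaderboard above.
results = [
    ("Huber Regressor", 0.8939),
    ("CatBoost Regressor", 0.8966),
    ("Bayesian Ridge", 0.8880),
]

# Sort descending on R2, as compare_models does by default.
leaderboard = sorted(results, key=lambda row: row[1], reverse=True)
print(leaderboard[0][0])
```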
# Create models for the three best estimators
huber = create_model('huber')
bayesian_ridge = create_model('br')
cat_boost = create_model('catboost')
MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
---|---|---|---|---|---|---|
0 | 14464.0462 | 796631140.1868 | 28224.6548 | 0.8678 | 0.1055 | 0.0744 |
1 | 12317.9481 | 486929512.7377 | 22066.4794 | 0.8870 | 0.1081 | 0.0685 |
2 | 15872.2935 | 582156173.3834 | 24127.9127 | 0.9094 | 0.1512 | 0.1058 |
3 | 16292.3554 | 623678424.8092 | 24973.5545 | 0.9091 | 0.1106 | 0.0862 |
4 | 25111.9159 | 3283563129.2471 | 57302.3833 | 0.7460 | 0.1700 | 0.1107 |
5 | 12310.8000 | 301712699.9034 | 17369.8791 | 0.9566 | 0.0936 | 0.0705 |
6 | 14194.3649 | 427569376.3631 | 20677.7508 | 0.9369 | 0.1294 | 0.0855 |
7 | 15042.6148 | 463198310.0888 | 21522.0424 | 0.8863 | 0.1489 | 0.1021 |
8 | 15624.3454 | 522717385.2126 | 22863.0135 | 0.9212 | 0.1124 | 0.0843 |
9 | 11905.9229 | 276186153.5170 | 16618.8493 | 0.9455 | 0.0864 | 0.0654 |
Mean | 15313.6607 | 776434230.5449 | 25574.6520 | 0.8966 | 0.1216 | 0.0854 |
SD | 3591.9983 | 847985780.9750 | 11062.1610 | 0.0568 | 0.0259 | 0.0154 |
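The Mean and SD rows summarize the 10 fold scores above. For example, for R2 (fold values reproduced from the CatBoost table), the population mean and standard deviation recover the reported 0.8966 and 0.0568:

```python
# R2 scores of the 10 CV folds, from the table above.
r2_folds = [0.8678, 0.8870, 0.9094, 0.9091, 0.7460,
            0.9566, 0.9369, 0.8863, 0.9212, 0.9455]

mean = sum(r2_folds) / len(r2_folds)
sd = (sum((r - mean) ** 2 for r in r2_folds) / len(r2_folds)) ** 0.5
print(round(mean, 4), round(sd, 4))
```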
# Tuning the created models
huber = tune_model(huber)
bayesian_ridge = tune_model(bayesian_ridge)
cat_boost = tune_model(cat_boost)
MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
---|---|---|---|---|---|---|
0 | 15222.1797 | 670566500.0937 | 25895.2988 | 0.8887 | 0.1090 | 0.0828 |
1 | 12700.3444 | 471383401.7861 | 21711.3657 | 0.8906 | 0.1074 | 0.0704 |
2 | 16694.3988 | 633569183.1174 | 25170.8002 | 0.9014 | 0.1566 | 0.1098 |
3 | 17282.1157 | 742530620.7051 | 27249.4151 | 0.8918 | 0.1192 | 0.0909 |
4 | 27664.4349 | 3328893846.1072 | 57696.5670 | 0.7424 | 0.1762 | 0.1211 |
5 | 13631.0586 | 342263029.3862 | 18500.3521 | 0.9508 | 0.1028 | 0.0786 |
6 | 14448.9475 | 421609729.3052 | 20533.1373 | 0.9378 | 0.1256 | 0.0870 |
7 | 16189.5874 | 533125793.0337 | 23089.5170 | 0.8691 | 0.1575 | 0.1096 |
8 | 16572.7477 | 600491213.8336 | 24504.9222 | 0.9094 | 0.1163 | 0.0884 |
9 | 12919.5620 | 300757106.7155 | 17342.3501 | 0.9406 | 0.0914 | 0.0709 |
Mean | 16332.5377 | 804519042.4084 | 26169.3726 | 0.8923 | 0.1262 | 0.0909 |
SD | 4079.7390 | 852280183.3400 | 10939.9718 | 0.0559 | 0.0264 | 0.0164 |
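`tune_model` searches a predefined hyperparameter grid, scoring each candidate by 10-fold cross-validation (R2 by default) and keeping the best. A toy sketch of the idea, with a hypothetical scoring function standing in for cross-validation (exhaustive search shown for simplicity; `tune_model` samples the grid randomly):

```python
# Hypothetical stand-in for the 10-fold CV score of a candidate.
def cv_score(depth, learning_rate):
    return 0.9 - abs(depth - 6) * 0.01 - abs(learning_rate - 0.1)

grid = {"depth": [4, 6, 8], "learning_rate": [0.01, 0.1, 0.3]}

# Pick the candidate with the highest CV score.
best = max(
    ((d, lr) for d in grid["depth"] for lr in grid["learning_rate"]),
    key=lambda params: cv_score(*params),
)
print(best)
```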
# Blending models
blender = blend_models(estimator_list = [huber, bayesian_ridge, cat_boost])
MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
---|---|---|---|---|---|---|
0 | 13233.5963 | 463761320.6194 | 21535.1183 | 0.9230 | 0.0971 | 0.0713 |
1 | 11504.8842 | 413827989.5399 | 20342.7626 | 0.9039 | 0.1018 | 0.0655 |
2 | 14194.1861 | 464560920.4938 | 21553.6753 | 0.9277 | 0.1431 | 0.0960 |
3 | 14860.3248 | 579523794.9158 | 24073.3005 | 0.9156 | 0.1046 | 0.0793 |
4 | 25506.1659 | 4270880912.6006 | 65351.9771 | 0.6696 | 0.1825 | 0.1149 |
5 | 12604.3339 | 295248634.9563 | 17182.8006 | 0.9576 | 0.0916 | 0.0708 |
6 | 12364.6512 | 352687304.7260 | 18779.9708 | 0.9479 | 0.1155 | 0.0750 |
7 | 14421.4822 | 438085842.0711 | 20930.5003 | 0.8924 | 0.1395 | 0.0943 |
8 | 13692.8294 | 351155335.6424 | 18739.1391 | 0.9470 | 0.1016 | 0.0777 |
9 | 10979.4809 | 232445546.8403 | 15246.1650 | 0.9541 | 0.0821 | 0.0619 |
Mean | 14336.1935 | 786217760.2406 | 24373.5410 | 0.9039 | 0.1160 | 0.0807 |
SD | 3909.8622 | 1165240457.9300 | 13861.7553 | 0.0808 | 0.0289 | 0.0155 |
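For regression, `blend_models` builds a voting ensemble: its prediction is the per-row average of the member models' predictions, which lets the members' errors partially cancel (blended mean R2 0.9039 above). A minimal sketch with made-up predictions for two houses:

```python
# Made-up predictions from the three tuned members for two houses.
pred_huber = [200000.0, 150000.0]
pred_bayesian_ridge = [210000.0, 140000.0]
pred_catboost = [205000.0, 145000.0]

# The blender's output is the element-wise mean of the members.
blended = [sum(p) / 3 for p in zip(pred_huber, pred_bayesian_ridge, pred_catboost)]
print(blended)
```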
# Finalizing the model for predictions
model = finalize_model(blender)
predictions = predict_model(model, data = test_data)
# Generating the CSV for Kaggle submission
sub = pd.DataFrame({
"Id": predictions['Id'],
"SalePrice": predictions['Label']
})
sub.to_csv('gdrive/My Drive/Colab Notebooks/HousePrice/submission.csv', index=False)
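`predict_model` appends its predictions in a `Label` column, so the submission maps each `Id` to that column. A quick self-contained check of the resulting file layout, with toy values in place of the real predictions:

```python
import io
import pandas as pd

# Toy frame mimicking the Id/Label columns of predict_model's output.
predictions = pd.DataFrame({"Id": [1461, 1462], "Label": [169000.0, 187000.0]})

sub = pd.DataFrame({"Id": predictions["Id"], "SalePrice": predictions["Label"]})
buf = io.StringIO()
sub.to_csv(buf, index=False)
print(buf.getvalue().splitlines()[0])  # the header row Kaggle expects
```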