In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0"; 

import urllib.request
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
In [2]:
import ktrain
from ktrain import tabular

Predicting House Prices

In this notebook, we will predict house sale prices from various house attributes. The dataset comes from the Kaggle House Prices: Advanced Regression Techniques competition.
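If you have the Kaggle command-line tool installed and configured with an API token, one way to fetch the files into the folder used below is the following sketch (the kaggle CLI is an extra dependency, not something this notebook installs):

# requires `pip install kaggle` and credentials in ~/.kaggle/kaggle.json
!kaggle competitions download -c house-prices-advanced-regression-techniques -p data/housing_price
!unzip -o data/housing_price/house-prices-advanced-regression-techniques.zip -d data/housing_price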

STEP 1: Load and Preprocess Data

In [3]:
train_df = pd.read_csv('data/housing_price/train.csv', index_col=0)
In [4]:
train_df.head()
Out[4]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NaN Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 NaN NaN NaN 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 NaN NaN NaN 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 NaN NaN NaN 0 12 2008 WD Normal 250000
In [5]:
train_df.drop(['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'], axis=1, inplace=True)
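The columns dropped above are the ones that are largely missing in this dataset (Alley, PoolQC, MiscFeature, Fence, FireplaceQu), plus the nearly constant Utilities. A quick way to verify this (a minimal sketch; it re-reads the CSV because the columns were just removed from train_df):

# fraction of missing values per column, highest first
raw = pd.read_csv('data/housing_price/train.csv', index_col=0)
print(raw.isnull().mean().sort_values(ascending=False).head(10))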
In [6]:
train_df.head()
Out[6]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave Reg Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave Reg Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave IR1 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave IR1 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave IR1 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 0 12 2008 WD Normal 250000
In [7]:
trn, val, preproc = tabular.tabular_from_df(train_df, is_regression=True, 
                                             label_columns='SalePrice', random_state=42)
processing train: 1309 rows x 74 columns

The following integer column(s) are being treated as categorical variables:
['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', '3SsnPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
To treat any of these column(s) as numerical, cast the column to float in DataFrame or CSV
 and re-run tabular_from* function.

processing test: 151 rows x 74 columns
/home/amaiya/projects/ghub/ktrain/ktrain/utils.py:556: UserWarning: Task is being treated as REGRESSION because either class_names argument was not supplied or is_regression=True. If this is incorrect, change accordingly.
  'If this is incorrect, change accordingly.')
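As the log above notes, integer-valued columns are treated as categorical by default. If you instead wanted, say, YrSold handled as a numerical feature (a hypothetical choice, not made in this notebook), you would cast it to float and re-run the preprocessing:

# hypothetical: treat YrSold as numerical rather than categorical
train_df['YrSold'] = train_df['YrSold'].astype(float)
trn, val, preproc = tabular.tabular_from_df(train_df, is_regression=True,
                                            label_columns='SalePrice', random_state=42)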

STEP 2: Create Model and Wrap in Learner

In [8]:
model = tabular.tabular_regression_model('mlp', trn)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
done.
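The learner wraps a standard tf.keras model, so the architecture of the 'mlp' network can be inspected directly:

# print the layer structure of the underlying Keras model
learner.model.summary()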

STEP 3: Estimate LR

In [9]:
learner.lr_find(show_plot=True, max_epochs=16)
simulating training for different learning rates... this may take a few moments...
Train for 10 steps
Epoch 1/16
10/10 [==============================] - 5s 526ms/step - loss: 39023478485.7307 - mae: 181231.9375
Epoch 2/16
10/10 [==============================] - 1s 97ms/step - loss: 39033418674.8315 - mae: 181204.8594
Epoch 3/16
10/10 [==============================] - 1s 99ms/step - loss: 38418801680.4742 - mae: 179555.7969
Epoch 4/16
10/10 [==============================] - 1s 100ms/step - loss: 38186333255.9661 - mae: 179885.9062
Epoch 5/16
10/10 [==============================] - 1s 96ms/step - loss: 39033367996.8027 - mae: 181204.7500
Epoch 6/16
10/10 [==============================] - 1s 98ms/step - loss: 39178636118.9229 - mae: 181407.5469
Epoch 7/16
10/10 [==============================] - 1s 97ms/step - loss: 39022837652.4843 - mae: 181230.2969
Epoch 8/16
10/10 [==============================] - 1s 96ms/step - loss: 38555598667.6511 - mae: 180373.7500
Epoch 9/16
10/10 [==============================] - 1s 96ms/step - loss: 38548896255.5665 - mae: 180083.1719
Epoch 10/16
10/10 [==============================] - 1s 97ms/step - loss: 35094031275.8950 - mae: 170316.9062
Epoch 11/16
10/10 [==============================] - 1s 101ms/step - loss: 10132749122.6554 - mae: 77930.0859
Epoch 12/16
10/10 [==============================] - 1s 96ms/step - loss: 7250272029.6969 - mae: 73190.8281
Epoch 13/16
10/10 [==============================] - 1s 98ms/step - loss: 7010621260.5182 - mae: 70700.7656
Epoch 14/16
10/10 [==============================] - 1s 96ms/step - loss: 25998046822.4000 - mae: 135040.7500
Epoch 15/16
 7/10 [====================>.........] - ETA: 0s - loss: 874942046208.0000 - mae: 330034.8438

done.
Visually inspect the loss plot and select the maximal learning rate associated with a still-falling loss. Based on the plot, a learning rate of 1e-1 is chosen for training in the next step.
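If the plot needs to be redisplayed after the cell output is cleared, learner.lr_plot() re-renders the loss-vs-learning-rate curve from the simulation above:

# re-display the loss vs. learning rate plot from the last lr_find run
learner.lr_plot()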

STEP 4: Train

In [10]:
learner.autofit(1e-1)
early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.1...
Train for 11 steps, validate for 5 steps
Epoch 1/1024
11/11 [==============================] - 6s 541ms/step - loss: 33410952382.0443 - mae: 156290.1250 - val_loss: 15485848576.0000 - val_mae: 114282.9531
Epoch 2/1024
11/11 [==============================] - 1s 132ms/step - loss: 24494850035.0924 - mae: 145351.7188 - val_loss: 25421583155.2000 - val_mae: 153021.7188
Epoch 3/1024
11/11 [==============================] - 1s 136ms/step - loss: 18668215047.6272 - mae: 125661.6016 - val_loss: 12586017996.8000 - val_mae: 91371.8438
Epoch 4/1024
11/11 [==============================] - 1s 133ms/step - loss: 12545563277.9832 - mae: 84823.7500 - val_loss: 8312404070.4000 - val_mae: 86080.5156
Epoch 5/1024
11/11 [==============================] - 2s 140ms/step - loss: 5367907446.2460 - mae: 61690.7188 - val_loss: 3880212326.4000 - val_mae: 54640.9609
Epoch 6/1024
11/11 [==============================] - 2s 137ms/step - loss: 2749079195.2330 - mae: 35548.7422 - val_loss: 994916563.2000 - val_mae: 22115.1758
Epoch 7/1024
11/11 [==============================] - 2s 137ms/step - loss: 1747315035.4652 - mae: 28340.7891 - val_loss: 942788454.4000 - val_mae: 20258.2383
Epoch 8/1024
11/11 [==============================] - 1s 135ms/step - loss: 1317610653.1398 - mae: 24707.8145 - val_loss: 795084512.0000 - val_mae: 18270.5781
Epoch 9/1024
11/11 [==============================] - 2s 137ms/step - loss: 1345168384.1711 - mae: 23964.7910 - val_loss: 751457507.2000 - val_mae: 17889.5137
Epoch 10/1024
11/11 [==============================] - 2s 138ms/step - loss: 1204894672.8556 - mae: 22083.0527 - val_loss: 729885836.8000 - val_mae: 17330.7344
Epoch 11/1024
11/11 [==============================] - 1s 130ms/step - loss: 1293521195.8075 - mae: 24240.8613 - val_loss: 845886457.6000 - val_mae: 19439.5332
Epoch 12/1024
 8/11 [====================>.........] - ETA: 0s - loss: 1224515284.0000 - mae: 22132.0742
Epoch 00012: Reducing Max LR on Plateau: new max lr will be 0.05 (if not early_stopping).
11/11 [==============================] - 1s 134ms/step - loss: 1196409911.1933 - mae: 22769.3926 - val_loss: 997325734.4000 - val_mae: 21638.4668
Epoch 13/1024
11/11 [==============================] - 1s 136ms/step - loss: 1081636823.2361 - mae: 22792.1152 - val_loss: 709030249.6000 - val_mae: 18628.7402
Epoch 14/1024
11/11 [==============================] - 1s 136ms/step - loss: 984812624.6539 - mae: 20191.5820 - val_loss: 662907520.0000 - val_mae: 16497.9941
Epoch 15/1024
11/11 [==============================] - 1s 135ms/step - loss: 984294369.7418 - mae: 19897.7480 - val_loss: 666114873.6000 - val_mae: 16434.1055
Epoch 16/1024
 9/11 [=======================>......] - ETA: 0s - loss: 895732711.1111 - mae: 19749.8887
Epoch 00016: Reducing Max LR on Plateau: new max lr will be 0.025 (if not early_stopping).
11/11 [==============================] - 1s 135ms/step - loss: 957869730.4446 - mae: 19990.8848 - val_loss: 708456806.4000 - val_mae: 18026.2402
Epoch 17/1024
11/11 [==============================] - 1s 133ms/step - loss: 860801337.9251 - mae: 19515.2520 - val_loss: 695209459.2000 - val_mae: 16676.5195
Epoch 18/1024
11/11 [==============================] - 1s 136ms/step - loss: 824453914.3285 - mae: 19024.5078 - val_loss: 661850604.8000 - val_mae: 16811.2227
Epoch 19/1024
11/11 [==============================] - 2s 138ms/step - loss: 801468495.2299 - mae: 18976.1191 - val_loss: 660384323.2000 - val_mae: 16322.7549
Epoch 20/1024
11/11 [==============================] - 1s 131ms/step - loss: 753795500.6753 - mae: 18709.6406 - val_loss: 663768908.8000 - val_mae: 16474.5703
Epoch 21/1024
10/11 [==========================>...] - ETA: 0s - loss: 783667638.4000 - mae: 18711.5293
Epoch 00021: Reducing Max LR on Plateau: new max lr will be 0.0125 (if not early_stopping).
11/11 [==============================] - 1s 133ms/step - loss: 786054059.7830 - mae: 18731.3438 - val_loss: 664781280.0000 - val_mae: 16376.9863
Epoch 22/1024
11/11 [==============================] - 1s 133ms/step - loss: 825439074.7869 - mae: 19253.4219 - val_loss: 668607993.6000 - val_mae: 16350.0859
Epoch 23/1024
 8/11 [====================>.........] - ETA: 0s - loss: 824802156.0000 - mae: 18940.5098
Epoch 00023: Reducing Max LR on Plateau: new max lr will be 0.00625 (if not early_stopping).
11/11 [==============================] - 1s 134ms/step - loss: 790809524.3453 - mae: 18444.7891 - val_loss: 669713436.8000 - val_mae: 16343.8193
Epoch 24/1024
 9/11 [=======================>......] - ETA: 0s - loss: 714549009.7778 - mae: 18092.3867Restoring model weights from the end of the best epoch.
11/11 [==============================] - 1s 136ms/step - loss: 736000784.4584 - mae: 18215.8066 - val_loss: 666175027.2000 - val_mae: 16389.6992
Epoch 00024: early stopping
Weights from best epoch have been loaded into model.
Out[10]:
<tensorflow.python.keras.callbacks.History at 0x7f7c482316d8>
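The training history is also stored on the learner, so the loss curves can be plotted after training (learner.plot is part of ktrain's Learner API):

# plot training and validation loss across epochs
learner.plot('loss')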

Evaluate Model

In [11]:
learner.evaluate(test_data=val)
Out[11]:
[('mae', 16322.754966887418)]
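To predict prices for unseen rows (e.g., Kaggle's test.csv), the trained model and the preproc object can be bundled into a Predictor. A minimal sketch, assuming test.csv has been downloaded alongside train.csv:

# wrap the model and preprocessing pipeline for inference on raw DataFrames
predictor = ktrain.get_predictor(learner.model, preproc)
test_df = pd.read_csv('data/housing_price/test.csv', index_col=0)
# mirror the training-time column drops
test_df.drop(['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'],
             axis=1, inplace=True)
preds = predictor.predict(test_df)
# predictor.save('house_predictor') would persist it for later reuse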