In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0"; 

import urllib.request
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
In [2]:
import ktrain
from ktrain import tabular

Predicting House Prices

In this notebook, we will predict house sale prices from various house attributes. The dataset comes from the Kaggle House Prices: Advanced Regression Techniques competition.
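If you have the Kaggle command-line tool installed and configured with an API token, one way to fetch the files into the folder used below is the following sketch (the kaggle CLI is an extra dependency, not something this notebook installs):

# requires `pip install kaggle` and credentials in ~/.kaggle/kaggle.json
!kaggle competitions download -c house-prices-advanced-regression-techniques -p data/housing_price
!unzip -o data/housing_price/house-prices-advanced-regression-techniques.zip -d data/housing_price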

STEP 1: Load and Preprocess Data

In [3]:
train_df = pd.read_csv('data/housing_price/train.csv', index_col=0)
In [4]:
train_df.head()
Out[4]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NaN Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 NaN NaN NaN 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 NaN NaN NaN 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 NaN NaN NaN 0 12 2008 WD Normal 250000
In [5]:
train_df.drop(['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'], axis=1, inplace=True)
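The columns dropped above are the ones that are largely missing in this dataset (Alley, PoolQC, MiscFeature, Fence, FireplaceQu), plus the nearly constant Utilities. A quick way to verify this (a minimal sketch; it re-reads the CSV because the columns were just removed from train_df):

# fraction of missing values per column, highest first
raw = pd.read_csv('data/housing_price/train.csv', index_col=0)
print(raw.isnull().mean().sort_values(ascending=False).head(10))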
In [6]:
train_df.head()
Out[6]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave Reg Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave Reg Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave IR1 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave IR1 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave IR1 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 0 12 2008 WD Normal 250000
In [7]:
trn, val, preproc = tabular.tabular_from_df(train_df, is_regression=True, 
                                             label_columns='SalePrice', random_state=42)
processing train: 1309 rows x 74 columns

The following integer column(s) are being treated as categorical variables:
['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', '3SsnPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
To treat any of these column(s) as numerical, cast the column to float in DataFrame or CSV
 and re-run tabular_from* function.

processing test: 151 rows x 74 columns
/home/amaiya/projects/ghub/ktrain/ktrain/utils.py:556: UserWarning: Task is being treated as REGRESSION because either class_names argument was not supplied or is_regression=True. If this is incorrect, change accordingly.
  'If this is incorrect, change accordingly.')
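As the log above notes, integer-valued columns are treated as categorical by default. If you instead wanted, say, YrSold handled as a numerical feature (a hypothetical choice, not made in this notebook), you would cast it to float and re-run the preprocessing:

# hypothetical: treat YrSold as numerical rather than categorical
train_df['YrSold'] = train_df['YrSold'].astype(float)
trn, val, preproc = tabular.tabular_from_df(train_df, is_regression=True,
                                            label_columns='SalePrice', random_state=42)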

STEP 2: Create Model and Wrap in Learner

In [8]:
model = tabular.tabular_regression_model('mlp', trn)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
done.
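The learner wraps a standard tf.keras model, so the architecture of the 'mlp' network can be inspected directly:

# print the layer structure of the underlying Keras model
learner.model.summary()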

STEP 3: Estimate LR

In [9]:
learner.lr_find(show_plot=True, max_epochs=16)
simulating training for different learning rates... this may take a few moments...
Train for 10 steps
Epoch 1/16
10/10 [==============================] - 5s 526ms/step - loss: 39023478485.7307 - mae: 181231.9375
Epoch 2/16
10/10 [==============================] - 1s 97ms/step - loss: 39033418674.8315 - mae: 181204.8594
Epoch 3/16
10/10 [==============================] - 1s 99ms/step - loss: 38418801680.4742 - mae: 179555.7969
Epoch 4/16
10/10 [==============================] - 1s 100ms/step - loss: 38186333255.9661 - mae: 179885.9062
Epoch 5/16
10/10 [==============================] - 1s 96ms/step - loss: 39033367996.8027 - mae: 181204.7500
Epoch 6/16
10/10 [==============================] - 1s 98ms/step - loss: 39178636118.9229 - mae: 181407.5469
Epoch 7/16
10/10 [==============================] - 1s 97ms/step - loss: 39022837652.4843 - mae: 181230.2969
Epoch 8/16
10/10 [==============================] - 1s 96ms/step - loss: 38555598667.6511 - mae: 180373.7500
Epoch 9/16
10/10 [==============================] - 1s 96ms/step - loss: 38548896255.5665 - mae: 180083.1719
Epoch 10/16
10/10 [==============================] - 1s 97ms/step - loss: 35094031275.8950 - mae: 170316.9062
Epoch 11/16
10/10 [==============================] - 1s 101ms/step - loss: 10132749122.6554 - mae: 77930.0859
Epoch 12/16
10/10 [==============================] - 1s 96ms/step - loss: 7250272029.6969 - mae: 73190.8281
Epoch 13/16
10/10 [==============================] - 1s 98ms/step - loss: 7010621260.5182 - mae: 70700.7656
Epoch 14/16
10/10 [==============================] - 1s 96ms/step - loss: 25998046822.4000 - mae: 135040.7500
Epoch 15/16
 7/10 [====================>.........] - ETA: 0s - loss: 874942046208.0000 - mae: 330034.8438

done.
Visually inspect the loss plot and select the maximal learning rate associated with a still-falling loss. Based on the plot, a learning rate of 1e-1 is chosen for training in the next step.
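If the plot needs to be redisplayed after the cell output is cleared, learner.lr_plot() re-renders the loss-vs-learning-rate curve from the simulation above:

# re-display the loss vs. learning rate plot from the last lr_find run
learner.lr_plot()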

STEP 4: Train

In [10]:
learner.autofit(1e-1)
early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 0.1...
Train for 11 steps, validate for 5 steps
Epoch 1/1024
11/11 [==============================] - 6s 541ms/step - loss: 33410952382.0443 - mae: 156290.1250 - val_loss: 15485848576.0000 - val_mae: 114282.9531
Epoch 2/1024
11/11 [==============================] - 1s 132ms/step - loss: 24494850035.0924 - mae: 145351.7188 - val_loss: 25421583155.2000 - val_mae: 153021.7188
Epoch 3/1024
11/11 [==============================] - 1s 136ms/step - loss: 18668215047.6272 - mae: 125661.6016 - val_loss: 12586017996.8000 - val_mae: 91371.8438
Epoch 4/1024
11/11 [==============================] - 1s 133ms/step - loss: 12545563277.9832 - mae: 84823.7500 - val_loss: 8312404070.4000 - val_mae: 86080.5156
Epoch 5/1024
11/11 [==============================] - 2s 140ms/step - loss: 5367907446.2460 - mae: 61690.7188 - val_loss: 3880212326.4000 - val_mae: 54640.9609
Epoch 6/1024
11/11 [==============================] - 2s 137ms/step - loss: 2749079195.2330 - mae: 35548.7422 - val_loss: 994916563.2000 - val_mae: 22115.1758
Epoch 7/1024
11/11 [==============================] - 2s 137ms/step - loss: 1747315035.4652 - mae: 28340.7891 - val_loss: 942788454.4000 - val_mae: 20258.2383
Epoch 8/1024
11/11 [==============================] - 1s 135ms/step - loss: 1317610653.1398 - mae: 24707.8145 - val_loss: 795084512.0000 - val_mae: 18270.5781
Epoch 9/1024
11/11 [==============================] - 2s 137ms/step - loss: 1345168384.1711 - mae: 23964.7910 - val_loss: 751457507.2000 - val_mae: 17889.5137
Epoch 10/1024
11/11 [==============================] - 2s 138ms/step - loss: 1204894672.8556 - mae: 22083.0527 - val_loss: 729885836.8000 - val_mae: 17330.7344
Epoch 11/1024
11/11 [==============================] - 1s 130ms/step - loss: 1293521195.8075 - mae: 24240.8613 - val_loss: 845886457.6000 - val_mae: 19439.5332
Epoch 12/1024
 8/11 [====================>.........] - ETA: 0s - loss: 1224515284.0000 - mae: 22132.0742
Epoch 00012: Reducing Max LR on Plateau: new max lr will be 0.05 (if not early_stopping).
11/11 [==============================] - 1s 134ms/step - loss: 1196409911.1933 - mae: 22769.3926 - val_loss: 997325734.4000 - val_mae: 21638.4668
Epoch 13/1024
11/11 [==============================] - 1s 136ms/step - loss: 1081636823.2361 - mae: 22792.1152 - val_loss: 709030249.6000 - val_mae: 18628.7402
Epoch 14/1024
11/11 [==============================] - 1s 136ms/step - loss: 984812624.6539 - mae: 20191.5820 - val_loss: 662907520.0000 - val_mae: 16497.9941
Epoch 15/1024
11/11 [==============================] - 1s 135ms/step - loss: 984294369.7418 - mae: 19897.7480 - val_loss: 666114873.6000 - val_mae: 16434.1055
Epoch 16/1024
 9/11 [=======================>......] - ETA: 0s - loss: 895732711.1111 - mae: 19749.8887
Epoch 00016: Reducing Max LR on Plateau: new max lr will be 0.025 (if not early_stopping).
11/11 [==============================] - 1s 135ms/step - loss: 957869730.4446 - mae: 19990.8848 - val_loss: 708456806.4000 - val_mae: 18026.2402
Epoch 17/1024
11/11 [==============================] - 1s 133ms/step - loss: 860801337.9251 - mae: 19515.2520 - val_loss: 695209459.2000 - val_mae: 16676.5195
Epoch 18/1024
11/11 [==============================] - 1s 136ms/step - loss: 824453914.3285 - mae: 19024.5078 - val_loss: 661850604.8000 - val_mae: 16811.2227
Epoch 19/1024
11/11 [==============================] - 2s 138ms/step - loss: 801468495.2299 - mae: 18976.1191 - val_loss: 660384323.2000 - val_mae: 16322.7549
Epoch 20/1024
11/11 [==============================] - 1s 131ms/step - loss: 753795500.6753 - mae: 18709.6406 - val_loss: 663768908.8000 - val_mae: 16474.5703
Epoch 21/1024
10/11 [==========================>...] - ETA: 0s - loss: 783667638.4000 - mae: 18711.5293
Epoch 00021: Reducing Max LR on Plateau: new max lr will be 0.0125 (if not early_stopping).
11/11 [==============================] - 1s 133ms/step - loss: 786054059.7830 - mae: 18731.3438 - val_loss: 664781280.0000 - val_mae: 16376.9863
Epoch 22/1024
11/11 [==============================] - 1s 133ms/step - loss: 825439074.7869 - mae: 19253.4219 - val_loss: 668607993.6000 - val_mae: 16350.0859
Epoch 23/1024
 8/11 [====================>.........] - ETA: 0s - loss: 824802156.0000 - mae: 18940.5098
Epoch 00023: Reducing Max LR on Plateau: new max lr will be 0.00625 (if not early_stopping).
11/11 [==============================] - 1s 134ms/step - loss: 790809524.3453 - mae: 18444.7891 - val_loss: 669713436.8000 - val_mae: 16343.8193
Epoch 24/1024
 9/11 [=======================>......] - ETA: 0s - loss: 714549009.7778 - mae: 18092.3867Restoring model weights from the end of the best epoch.
11/11 [==============================] - 1s 136ms/step - loss: 736000784.4584 - mae: 18215.8066 - val_loss: 666175027.2000 - val_mae: 16389.6992
Epoch 00024: early stopping
Weights from best epoch have been loaded into model.
Out[10]:
<tensorflow.python.keras.callbacks.History at 0x7f7c482316d8>
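The training history is also stored on the learner, so the loss curves can be plotted after training (learner.plot is part of ktrain's Learner API):

# plot training and validation loss across epochs
learner.plot('loss')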

Evaluate Model

In [11]:
learner.evaluate(test_data=val)
Out[11]:
[('mae', 16322.754966887418)]
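To predict prices for unseen rows (e.g., Kaggle's test.csv), the trained model and the preproc object can be bundled into a Predictor. A minimal sketch, assuming test.csv has been downloaded alongside train.csv:

# wrap the model and preprocessing pipeline for inference on raw DataFrames
predictor = ktrain.get_predictor(learner.model, preproc)
test_df = pd.read_csv('data/housing_price/test.csv', index_col=0)
# mirror the training-time column drops
test_df.drop(['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','Utilities'],
             axis=1, inplace=True)
preds = predictor.predict(test_df)
# predictor.save('house_predictor') would persist it for later reuse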