This data set describes the sale of individual residential property in Ames, Iowa from 2006 to 2010
The data set contains 2930 observations and a large number of explanatory variables (23 nominal, 23 ordinal, 14 discrete, and 20 continuous) involved in assessing home values. Here is a txt file. containing information on variables involved.
Data Docmentary can be found here Data can also be downloaded from here For in detail information, click here
In this project, we'll build a Linear Regression model and exploring ways to improve it by
Lets import the necessary tools and libraries
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# Display Settings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
data = pd.read_table('AmesHousing.tsv', delimiter="\t")
data.head()
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.0 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.0 | Unf | 0.0 | 441.0 | 1080.0 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Gd | Attchd | 1960.0 | Fin | 2.0 | 528.0 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.0 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.0 | Unf | 0.0 | 1045.0 | 2110.0 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | TA | Attchd | 1968.0 | Fin | 2.0 | 522.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 82 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 2930 non-null int64 1 PID 2930 non-null int64 2 MS SubClass 2930 non-null int64 3 MS Zoning 2930 non-null object 4 Lot Frontage 2440 non-null float64 5 Lot Area 2930 non-null int64 6 Street 2930 non-null object 7 Alley 198 non-null object 8 Lot Shape 2930 non-null object 9 Land Contour 2930 non-null object 10 Utilities 2930 non-null object 11 Lot Config 2930 non-null object 12 Land Slope 2930 non-null object 13 Neighborhood 2930 non-null object 14 Condition 1 2930 non-null object 15 Condition 2 2930 non-null object 16 Bldg Type 2930 non-null object 17 House Style 2930 non-null object 18 Overall Qual 2930 non-null int64 19 Overall Cond 2930 non-null int64 20 Year Built 2930 non-null int64 21 Year Remod/Add 2930 non-null int64 22 Roof Style 2930 non-null object 23 Roof Matl 2930 non-null object 24 Exterior 1st 2930 non-null object 25 Exterior 2nd 2930 non-null object 26 Mas Vnr Type 2907 non-null object 27 Mas Vnr Area 2907 non-null float64 28 Exter Qual 2930 non-null object 29 Exter Cond 2930 non-null object 30 Foundation 2930 non-null object 31 Bsmt Qual 2850 non-null object 32 Bsmt Cond 2850 non-null object 33 Bsmt Exposure 2847 non-null object 34 BsmtFin Type 1 2850 non-null object 35 BsmtFin SF 1 2929 non-null float64 36 BsmtFin Type 2 2849 non-null object 37 BsmtFin SF 2 2929 non-null float64 38 Bsmt Unf SF 2929 non-null float64 39 Total Bsmt SF 2929 non-null float64 40 Heating 2930 non-null object 41 Heating QC 2930 non-null object 42 Central Air 2930 non-null object 43 Electrical 2929 non-null object 44 1st Flr SF 2930 non-null int64 45 2nd Flr SF 2930 non-null int64 46 Low Qual Fin SF 2930 non-null int64 47 Gr Liv Area 2930 non-null int64 48 Bsmt Full Bath 2928 non-null float64 49 Bsmt Half Bath 2928 non-null float64 50 Full Bath 2930 non-null int64 51 Half Bath 2930 non-null int64 52 Bedroom AbvGr 2930 non-null int64 53 Kitchen AbvGr 2930 non-null int64 54 Kitchen Qual 2930 non-null object 55 TotRms AbvGrd 2930 non-null int64 56 Functional 2930 non-null object 57 Fireplaces 2930 non-null int64 58 Fireplace Qu 1508 non-null object 59 Garage Type 2773 non-null object 60 Garage Yr Blt 2771 non-null float64 61 Garage Finish 2771 non-null object 62 Garage Cars 2929 non-null float64 63 Garage Area 2929 non-null float64 64 Garage Qual 2771 non-null object 65 Garage Cond 2771 non-null object 66 Paved Drive 2930 non-null object 67 Wood Deck SF 2930 non-null int64 68 Open Porch SF 2930 non-null int64 69 Enclosed Porch 2930 non-null int64 70 3Ssn Porch 2930 non-null int64 71 Screen Porch 2930 non-null int64 72 Pool Area 2930 non-null int64 73 Pool QC 13 non-null object 74 Fence 572 non-null object 75 Misc Feature 106 non-null object 76 Misc Val 2930 non-null int64 77 Mo Sold 2930 non-null int64 78 Yr Sold 2930 non-null int64 79 Sale Type 2930 non-null object 80 Sale Condition 2930 non-null object 81 SalePrice 2930 non-null int64 dtypes: float64(11), int64(28), object(43) memory usage: 1.8+ MB
data['SalePrice'].tolist()
[215000, 105000, 172000, 244000, 189900, 195500, 213500, 191500, 236500, 189000, 175900, 185000, 180400, 171500, 212000, 538000, 164000, 394432, 141000, 210000, 190000, 170000, 216000, 149000, 149900, 142000, 126000, 115000, 184000, 96000, 105500, 88000, 127500, 149900, 120000, 146000, 376162, 306000, 395192, 290941, 220000, 275000, 259000, 214000, 611657, 224000, 500000, 320000, 319900, 205000, 175500, 199500, 160000, 192000, 184500, 216500, 185088, 180000, 222500, 333168, 355000, 260400, 325000, 290000, 221000, 410000, 221500, 204500, 215200, 262500, 254900, 271500, 233000, 181000, 205000, 143000, 189000, 99500, 125000, 194500, 152000, 171000, 67500, 112000, 148000, 138500, 122000, 133000, 127000, 169000, 190000, 362500, 285000, 260000, 190000, 155000, 151000, 149500, 152000, 222000, 177500, 177000, 155000, 147110, 267916, 254000, 155000, 206000, 130500, 230000, 218500, 243500, 205000, 212500, 196500, 197500, 171000, 142250, 143000, 128950, 159000, 178900, 136300, 180500, 137500, 84900, 142125, 197600, 172500, 116500, 76500, 128000, 153000, 132000, 178000, 154300, 180000, 190000, 135000, 214000, 136000, 165500, 145000, 148000, 142000, 167500, 108538, 159500, 108000, 135000, 122500, 119000, 109000, 105000, 107500, 144900, 129000, 97500, 144000, 162000, 242000, 132000, 154000, 166000, 134800, 160000, 148000, 192000, 155000, 80400, 96500, 109500, 115000, 143000, 107400, 80000, 119000, 130000, 119000, 129000, 100000, 12789, 105900, 150000, 139000, 240000, 76500, 149700, 125500, 122500, 140750, 128500, 209500, 87000, 134000, 128000, 132000, 139900, 123900, 138400, 109500, 140000, 149500, 159900, 122000, 110000, 55000, 140000, 244400, 173000, 107500, 100000, 95000, 93369, 114900, 94000, 136000, 136500, 131500, 121500, 125000, 154000, 137900, 158000, 137250, 160250, 163000, 158900, 328000, 270000, 85000, 128000, 260000, 230000, 124000, 83000, 144500, 129000, 127000, 128000, 186000, 308030, 114000, 84900, 178000, 270000, 218000, 236000, 147000, 245350, 206000, 198900, 187000, 320000, 138500, 155000, 159000, 191000, 200500, 150000, 161750, 128200, 127000, 318000, 272000, 237000, 240000, 224900, 143750, 143000, 232000, 213000, 185500, 84900, 155891, 100000, 144000, 64000, 125200, 107000, 90000, 140000, 113000, 80000, 144500, 104000, 128000, 58500, 127000, 126000, 160000, 100000, 169000, 257500, 215000, 266500, 335000, 203135, 185000, 162500, 289000, 125500, 82000, 110000, 68400, 102776, 55993, 50138, 246000, 254900, 190000, 201000, 169900, 170000, 160000, 220000, 179781, 174000, 269500, 214900, 202900, 378500, 169000, 173500, 139000, 166500, 83500, 119500, 85000, 76000, 75500, 88250, 85500, 130000, 157900, 149900, 159000, 136000, 161000, 285000, 231000, 124500, 157000, 345000, 189500, 270000, 189000, 377500, 168500, 375000, 278000, 240000, 239500, 177500, 185000, 191000, 178000, 185000, 181316, 166000, 178000, 174000, 173000, 225000, 180500, 187500, 501837, 372500, 260000, 185000, 260000, 181000, 82500, 215000, 154000, 200000, 249000, 187500, 184000, 278000, 157000, 152000, 197500, 240900, 263435, 220000, 235000, 213000, 167900, 158000, 165000, 158000, 136000, 148500, 156000, 128000, 143000, 76500, 120500, 124500, 97000, 130000, 111000, 125000, 112000, 97000, 118000, 119500, 143750, 146000, 148500, 123000, 147000, 137900, 147000, 148500, 138000, 128500, 100000, 148800, 337500, 462000, 485000, 555000, 325000, 256300, 253293, 398800, 335000, 404000, 402861, 451950, 610000, 582933, 360000, 296000, 409900, 255500, 335000, 274900, 300000, 324000, 350000, 280000, 284000, 269500, 233170, 386250, 445000, 290000, 255900, 213000, 196000, 184500, 212500, 230000, 552000, 382500, 320000, 248500, 286500, 254000, 173000, 173000, 184000, 167800, 174000, 174000, 174000, 175900, 192500, 181000, 180000, 160200, 188500, 200000, 170000, 189500, 184100, 195500, 192000, 178000, 207500, 236000, 257500, 244000, 167000, 179000, 190000, 156000, 245000, 181000, 214000, 168000, 337000, 403000, 327000, 340000, 336000, 265000, 315000, 260000, 260000, 263550, 402000, 248000, 244600, 275000, 257500, 287090, 275500, 245000, 253000, 468000, 252678, 210000, 208300, 225000, 229456, 229800, 250000, 370878, 238500, 310000, 270000, 252000, 241000, 264500, 291000, 263000, 185000, 234500, 209000, 159000, 152000, 143500, 193000, 203000, 184900, 159000, 142000, 153000, 224243, 220000, 257000, 189000, 171500, 120000, 145000, 184000, 162000, 160000, 82000, 76000, 110000, 135000, 141000, 122000, 124100, 129000, 131400, 62383, 123000, 275000, 235000, 280750, 164500, 173733, 222000, 195000, 172500, 180000, 156000, 172500, 318750, 211500, 241600, 180500, 150000, 154000, 185000, 185750, 200000, 206000, 162000, 256900, 197900, 163000, 113000, 230000, 167900, 213250, 227000, 130000, 143000, 117500, 168500, 172500, 161500, 141500, 118000, 127500, 140000, 177625, 110000, 167000, 153000, 145100, 154000, 177500, 158000, 124500, 174500, 122000, 82500, 110000, 149500, 175000, 167000, 128900, 140000, 147000, 124000, 187500, 159000, 256000, 205000, 193500, 110000, 104900, 150000, 156500, 176000, 149500, 139000, 155000, 120000, 153000, 144000, 176000, 153000, 135000, 131000, 123000, 126000, 115000, 164900, 113000, 145500, 102900, 95000, 152500, 129900, 132000, 99900, 135000, 149000, 114000, 109500, 125000, 142900, 156500, 59000, 105000, 106000, 78500, 190000, 154000, 163000, 200000, 143500, 135000, 153000, 157500, 113500, 133000, 92900, 128500, 90000, 138000, 128000, 139000, 118900, 138000, 132500, 133500, 135000, 144750, 145000, 127000, 109500, 115000, 110000, 128900, 103500, 66500, 130000, 129000, 150000, 107500, 94550, 124500, 135000, 103000, 93000, 129500, 93000, 80000, 45000, 37900, 91300, 99500, 113000, 87500, 110000, 106000, 265979, 160000, 119000, 168000, 58500, 143000, 85000, 124900, 119000, 146500, 34900, 44000, 223500, 149000, 205000, 137000, 121000, 128000, 134900, 117000, 132500, 93000, 119000, 100000, 141500, 133000, 60000, 105000, 115000, 150000, 126500, 214500, 167500, 155000, 155000, 179900, 104000, 62500, 149000, 103000, 123000, 97500, 135000, 70000, 116000, 88750, 179000, 179000, 159900, 61000, 103600, 63000, 175000, 139000, 172500, 113500, 130000, 149900, 134900, 137000, 139000, 165000, 148000, 165000, 63900, 161500, 143000, 135000, 82500, 190000, 139600, 122000, 127500, 121500, 60000, 154400, 113000, 125000, 84000, 139500, 131000, 105000, 108000, 162000, 156500, 316600, 271900, 213000, 239900, 239500, 131000, 118964, 153337, 147983, 118858, 118858, 142953, 148325, 113722, 269500, 269500, 269500, 323262, 297000, 295493, 332000, 239900, 212000, 272500, 239000, 220000, 200000, 221800, 194500, 237000, 173000, 275000, 233500, 185000, 152000, 143000, 231500, 212000, 207000, 145000, 138000, 197900, 204000, 192000, 200000, 195000, 227000, 230000, 187100, 203000, 201000, 124000, 140000, 134900, 150500, 136500, 143500, 133500, 123000, 133900, 133000, 250000, 254750, 236500, 261500, 313000, 198500, 211000, 220000, 203000, 279500, 210000, 219500, 191000, 178000, 213000, 144000, 140000, 293200, 190000, 190000, 173500, 170000, 108000, 167000, 100000, 198500, 159500, 127000, 125000, 320000, 145500, 118000, 85000, 120000, 98000, 99900, 82000, 119900, 103500, 112000, 110000, 122000, 117000, 160000, 159434, 60000, 139500, 197000, 105000, 135000, 147500, 137450, 155000, 177000, 145000, 137000, 234000, 205000, 177500, 160000, 155000, 163500, 154900, 205000, 144800, 174500, 193000, 158000, 158500, 224000, 115000, 137000, 121000, 124000, 208000, 315000, 222000, 147000, 123000, 78000, 85000, 75000, 230000, 57625, 251000, 239686, 240000, 215000, 156450, 173000, 126000, 173000, 152000, 137500, 375000, 315500, 224500, 410000, 316500, 201000, 175000, 204000, 213500, 220000, 170000, 315000, 425000, 139500, 115000, 162000, 165000, 91000, 130000, 86000, 160000, 80000, 97000, 88000, 131900, 131250, 165500, 112000, 149000, 143000, 130000, 196000, 173000, 176500, 237500, 206900, 187500, 165000, 195500, 192000, 173000, 177900, 175000, 286000, ...]
data.describe()
Order | PID | MS SubClass | Lot Frontage | Lot Area | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Yr Blt | Garage Cars | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2930.00000 | 2.930000e+03 | 2930.000000 | 2440.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2907.000000 | 2929.000000 | 2929.000000 | 2929.000000 | 2929.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2928.000000 | 2928.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2771.000000 | 2929.000000 | 2929.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 | 2930.000000 |
mean | 1465.50000 | 7.144645e+08 | 57.387372 | 69.224590 | 10147.921843 | 6.094881 | 5.563140 | 1971.356314 | 1984.266553 | 101.896801 | 442.629566 | 49.722431 | 559.262547 | 1051.614544 | 1159.557679 | 335.455973 | 4.676792 | 1499.690444 | 0.431352 | 0.061134 | 1.566553 | 0.379522 | 2.854266 | 1.044369 | 6.443003 | 0.599317 | 1978.132443 | 1.766815 | 472.819734 | 93.751877 | 47.533447 | 23.011604 | 2.592491 | 16.002048 | 2.243345 | 50.635154 | 6.216041 | 2007.790444 | 180796.060068 |
std | 845.96247 | 1.887308e+08 | 42.638025 | 23.365335 | 7880.017759 | 1.411026 | 1.111537 | 30.245361 | 20.860286 | 179.112611 | 455.590839 | 169.168476 | 439.494153 | 440.615067 | 391.890885 | 428.395715 | 46.310510 | 505.508887 | 0.524820 | 0.245254 | 0.552941 | 0.502629 | 0.827731 | 0.214076 | 1.572964 | 0.647921 | 25.528411 | 0.760566 | 215.046549 | 126.361562 | 67.483400 | 64.139059 | 25.141331 | 56.087370 | 35.597181 | 566.344288 | 2.714492 | 1.316613 | 79886.692357 |
min | 1.00000 | 5.263011e+08 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1895.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 12789.000000 |
25% | 733.25000 | 5.284770e+08 | 20.000000 | 58.000000 | 7440.250000 | 5.000000 | 5.000000 | 1954.000000 | 1965.000000 | 0.000000 | 0.000000 | 0.000000 | 219.000000 | 793.000000 | 876.250000 | 0.000000 | 0.000000 | 1126.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1960.000000 | 1.000000 | 320.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2007.000000 | 129500.000000 |
50% | 1465.50000 | 5.354536e+08 | 50.000000 | 68.000000 | 9436.500000 | 6.000000 | 5.000000 | 1973.000000 | 1993.000000 | 0.000000 | 370.000000 | 0.000000 | 466.000000 | 990.000000 | 1084.000000 | 0.000000 | 0.000000 | 1442.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1979.000000 | 2.000000 | 480.000000 | 0.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 160000.000000 |
75% | 2197.75000 | 9.071811e+08 | 70.000000 | 80.000000 | 11555.250000 | 7.000000 | 6.000000 | 2001.000000 | 2004.000000 | 164.000000 | 734.000000 | 0.000000 | 802.000000 | 1302.000000 | 1384.000000 | 703.750000 | 0.000000 | 1742.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 213500.000000 |
max | 2930.00000 | 1.007100e+09 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1526.000000 | 2336.000000 | 6110.000000 | 5095.000000 | 2065.000000 | 1064.000000 | 5642.000000 | 3.000000 | 2.000000 | 4.000000 | 2.000000 | 8.000000 | 3.000000 | 15.000000 | 4.000000 | 2207.000000 | 5.000000 | 1488.000000 | 1424.000000 | 742.000000 | 1012.000000 | 508.000000 | 576.000000 | 800.000000 | 17000.000000 | 12.000000 | 2010.000000 | 755000.000000 |
Lets start by setting up a pipeline of functions that will let us quickly iterate on different models
Functions that transform features, select features and train-and-test features
We will build our functions and make the pipeline gradually and will improve in rounds. So Lets begin with the round 1, where we are just writing up minimal code. As mentioned, We will keep improving on them through out this project.
# For now, just return DataFrame
def transform_features(data):
'''
under developement
'''
return data
# For now, it just returns the DataFrame's two columns
def select_features(data):
'''
under developement
'''
return data[['Gr Liv Area', 'SalePrice']]
def train_and_test(data):
'''
under developement
'''
train = data.iloc[:1460]
test = data.iloc[1460:]
numeric_train = train.select_dtypes(include=['integer', 'float'])
numeric_test = test.select_dtypes(include=['integer', 'float'])
target = 'SalePrice'
features = numeric_train.columns.drop(target)
lr = LinearRegression()
lr.fit(train[features], train[target])
predictions = lr.predict(test[features])
mse = mean_squared_error(test[target], predictions)
rmse = np.sqrt(mse)
return rmse
# Features Transformation
round1_transformed_data = transform_features(data)
# Features Selection
round1_selected_data = select_features(round1_transformed_data)
# Training and Testing
round1_rmse_data = train_and_test(round1_selected_data)
print('So in this first and lazy attempt, we have got an RMSE of', round1_rmse_data)
So in this first and lazy attempt, we have got an RMSE of 57088.25161263909
Let's now start removing features with missing values, diving deeper into potential categorical features, transforming text and numerical columns
Lets update the transform_features()
function to achieve following goals
remove the features we don't want to use in the model.
transform features into the proper format like
Create new features by combining existing features
data.select_dtypes(include=['integer', 'float']).info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 2930 non-null int64 1 PID 2930 non-null int64 2 MS SubClass 2930 non-null int64 3 Lot Frontage 2440 non-null float64 4 Lot Area 2930 non-null int64 5 Overall Qual 2930 non-null int64 6 Overall Cond 2930 non-null int64 7 Year Built 2930 non-null int64 8 Year Remod/Add 2930 non-null int64 9 Mas Vnr Area 2907 non-null float64 10 BsmtFin SF 1 2929 non-null float64 11 BsmtFin SF 2 2929 non-null float64 12 Bsmt Unf SF 2929 non-null float64 13 Total Bsmt SF 2929 non-null float64 14 1st Flr SF 2930 non-null int64 15 2nd Flr SF 2930 non-null int64 16 Low Qual Fin SF 2930 non-null int64 17 Gr Liv Area 2930 non-null int64 18 Bsmt Full Bath 2928 non-null float64 19 Bsmt Half Bath 2928 non-null float64 20 Full Bath 2930 non-null int64 21 Half Bath 2930 non-null int64 22 Bedroom AbvGr 2930 non-null int64 23 Kitchen AbvGr 2930 non-null int64 24 TotRms AbvGrd 2930 non-null int64 25 Fireplaces 2930 non-null int64 26 Garage Yr Blt 2771 non-null float64 27 Garage Cars 2929 non-null float64 28 Garage Area 2929 non-null float64 29 Wood Deck SF 2930 non-null int64 30 Open Porch SF 2930 non-null int64 31 Enclosed Porch 2930 non-null int64 32 3Ssn Porch 2930 non-null int64 33 Screen Porch 2930 non-null int64 34 Pool Area 2930 non-null int64 35 Misc Val 2930 non-null int64 36 Mo Sold 2930 non-null int64 37 Yr Sold 2930 non-null int64 38 SalePrice 2930 non-null int64 dtypes: float64(11), int64(28) memory usage: 892.9 KB
data.select_dtypes(include=['object']).info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MS Zoning 2930 non-null object 1 Street 2930 non-null object 2 Alley 198 non-null object 3 Lot Shape 2930 non-null object 4 Land Contour 2930 non-null object 5 Utilities 2930 non-null object 6 Lot Config 2930 non-null object 7 Land Slope 2930 non-null object 8 Neighborhood 2930 non-null object 9 Condition 1 2930 non-null object 10 Condition 2 2930 non-null object 11 Bldg Type 2930 non-null object 12 House Style 2930 non-null object 13 Roof Style 2930 non-null object 14 Roof Matl 2930 non-null object 15 Exterior 1st 2930 non-null object 16 Exterior 2nd 2930 non-null object 17 Mas Vnr Type 2907 non-null object 18 Exter Qual 2930 non-null object 19 Exter Cond 2930 non-null object 20 Foundation 2930 non-null object 21 Bsmt Qual 2850 non-null object 22 Bsmt Cond 2850 non-null object 23 Bsmt Exposure 2847 non-null object 24 BsmtFin Type 1 2850 non-null object 25 BsmtFin Type 2 2849 non-null object 26 Heating 2930 non-null object 27 Heating QC 2930 non-null object 28 Central Air 2930 non-null object 29 Electrical 2929 non-null object 30 Kitchen Qual 2930 non-null object 31 Functional 2930 non-null object 32 Fireplace Qu 1508 non-null object 33 Garage Type 2773 non-null object 34 Garage Finish 2771 non-null object 35 Garage Qual 2771 non-null object 36 Garage Cond 2771 non-null object 37 Paved Drive 2930 non-null object 38 Pool QC 13 non-null object 39 Fence 572 non-null object 40 Misc Feature 106 non-null object 41 Sale Type 2930 non-null object 42 Sale Condition 2930 non-null object dtypes: object(43) memory usage: 984.4+ KB
Lets use this approach for now. We can always change thresholds and approaches to improve our score.
And for the Cleaning part, we can do following ops:
SalePrice
Since we are working out to make a standard function, our working will involve a lot of experimentation. So its a better idea to retain the original dataset and do experimentation on a copy of it.
df = data.copy()
df.select_dtypes('object').head()
MS Zoning | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin Type 2 | Heating | Heating QC | Central Air | Electrical | Kitchen Qual | Functional | Fireplace Qu | Garage Type | Garage Finish | Garage Qual | Garage Cond | Paved Drive | Pool QC | Fence | Misc Feature | Sale Type | Sale Condition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RL | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | BrkFace | Plywood | Stone | TA | TA | CBlock | TA | Gd | Gd | BLQ | Unf | GasA | Fa | Y | SBrkr | TA | Typ | Gd | Attchd | Fin | TA | TA | P | NaN | NaN | NaN | WD | Normal |
1 | RH | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | Gable | CompShg | VinylSd | VinylSd | None | TA | TA | CBlock | TA | TA | No | Rec | LwQ | GasA | TA | Y | SBrkr | TA | Typ | NaN | Attchd | Unf | TA | TA | Y | NaN | MnPrv | NaN | WD | Normal |
2 | RL | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | TA | TA | CBlock | TA | TA | No | ALQ | Unf | GasA | TA | Y | SBrkr | Gd | Typ | NaN | Attchd | Unf | TA | TA | Y | NaN | NaN | Gar2 | WD | Normal |
3 | RL | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | BrkFace | BrkFace | None | Gd | TA | CBlock | TA | TA | No | ALQ | Unf | GasA | Ex | Y | SBrkr | Ex | Typ | TA | Attchd | Fin | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
4 | RL | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | None | TA | TA | PConc | Gd | TA | No | GLQ | Unf | GasA | Gd | Y | SBrkr | TA | Typ | TA | Attchd | Fin | TA | TA | Y | NaN | MnPrv | NaN | WD | Normal |
Lets check columns of Integer and Float Dtypes.
df.select_dtypes(['integer', 'float']).head()
Order | PID | MS SubClass | Lot Frontage | Lot Area | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Yr Blt | Garage Cars | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | 141.0 | 31770 | 6 | 5 | 1960 | 1960 | 112.0 | 639.0 | 0.0 | 441.0 | 1080.0 | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | 7 | 2 | 1960.0 | 2.0 | 528.0 | 210 | 62 | 0 | 0 | 0 | 0 | 0 | 5 | 2010 | 215000 |
1 | 2 | 526350040 | 20 | 80.0 | 11622 | 5 | 6 | 1961 | 1961 | 0.0 | 468.0 | 144.0 | 270.0 | 882.0 | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | 5 | 0 | 1961.0 | 1.0 | 730.0 | 140 | 0 | 0 | 0 | 120 | 0 | 0 | 6 | 2010 | 105000 |
2 | 3 | 526351010 | 20 | 81.0 | 14267 | 6 | 6 | 1958 | 1958 | 108.0 | 923.0 | 0.0 | 406.0 | 1329.0 | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | 6 | 0 | 1958.0 | 1.0 | 312.0 | 393 | 36 | 0 | 0 | 0 | 0 | 12500 | 6 | 2010 | 172000 |
3 | 4 | 526353030 | 20 | 93.0 | 11160 | 7 | 5 | 1968 | 1968 | 0.0 | 1065.0 | 0.0 | 1045.0 | 2110.0 | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | 8 | 2 | 1968.0 | 2.0 | 522.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | 244000 |
4 | 5 | 527105010 | 60 | 74.0 | 13830 | 5 | 5 | 1997 | 1998 | 0.0 | 791.0 | 0.0 | 137.0 | 928.0 | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | 6 | 1 | 1997.0 | 2.0 | 482.0 | 212 | 34 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | 189900 |
By looking closely at numerical features, three of them should catch our attention. The features are Year Built
, Year Remod/Add
and Yr Sold
.
Year Built
is about the year property was built. Year Remod/Add
tells us about last modification if there is any. Whereas Yr Sold
tells us about the year of sale of the property.
There features might become far more impactful if we combine them to calculate "Age of house", and "Years since alteration"
Lets do it for now by creating a new features
age
that will be formed by subtracting Year Built
from Yr Sold
andyears_remod
that will be formed by subtracting Year Remod/Add
from Yr Sold
df['age'] = df['Yr Sold'] - df['Year Built']
df['years_remod'] = df['Yr Sold'] - df['Year Remod/Add']
We rarely get to have a perfect data! On running value_counts() on both of our new columns, we found out some negative values.
# capture these rows
del_us = df[(df['age'] < 0) | (df['years_remod'] < 0)]
del_us
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | age | years_remod | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1702 | 1703 | 528120010 | 60 | RL | 134.0 | 16659 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NridgHt | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2007 | 2008 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | Gd | TA | PConc | Gd | TA | No | Unf | 0.0 | Unf | 0.0 | 1582.0 | 1582.0 | GasA | Ex | Y | SBrkr | 1582 | 570 | 0 | 2152 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 2007.0 | Unf | 2.0 | 728.0 | TA | TA | Y | 0 | 368 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2007 | New | Partial | 260116 | 0 | -1 |
2180 | 2181 | 908154195 | 20 | RL | 128.0 | 39290 | Pave | NaN | IR1 | Bnk | AllPub | Inside | Gtl | Edwards | Norm | Norm | 1Fam | 1Story | 10 | 5 | 2008 | 2009 | Hip | CompShg | CemntBd | CmentBd | Stone | 1224.0 | Ex | TA | PConc | Ex | TA | Gd | GLQ | 4010.0 | Unf | 0.0 | 1085.0 | 5095.0 | GasA | Ex | Y | SBrkr | 5095 | 0 | 0 | 5095 | 1.0 | 1.0 | 2 | 1 | 2 | 1 | Ex | 15 | Typ | 2 | Gd | Attchd | 2008.0 | Fin | 3.0 | 1154.0 | TA | TA | Y | 546 | 484 | 0 | 0 | 0 | 0 | NaN | NaN | Elev | 17000 | 10 | 2007 | New | Partial | 183850 | -1 | -2 |
2181 | 2182 | 908154205 | 60 | RL | 130.0 | 40094 | Pave | NaN | IR1 | Bnk | AllPub | Inside | Gtl | Edwards | PosN | PosN | 1Fam | 2Story | 10 | 5 | 2007 | 2008 | Hip | CompShg | CemntBd | CmentBd | Stone | 762.0 | Ex | TA | PConc | Ex | TA | Gd | GLQ | 2260.0 | Unf | 0.0 | 878.0 | 3138.0 | GasA | Ex | Y | SBrkr | 3138 | 1538 | 0 | 4676 | 1.0 | 0.0 | 3 | 1 | 3 | 1 | Ex | 11 | Typ | 1 | Gd | BuiltIn | 2007.0 | Fin | 3.0 | 884.0 | TA | TA | Y | 208 | 406 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 10 | 2007 | New | Partial | 184750 | 0 | -1 |
# Lets remove these rows.
df.drop(del_us.index, axis=0, inplace=True)
Lets check the columns with Null Values less than 5% and delete them after consideration
# Series of Columns with percentages of Null values in them
null_series = pd.Series(df.isnull().sum()/len(df) * 100)
# Filtering out the columns with more than 5% Null values
null_series[null_series > 5.0].sort_values()
Garage Type 5.363854 Garage Yr Blt 5.432183 Garage Finish 5.432183 Garage Qual 5.432183 Garage Cond 5.432183 Lot Frontage 16.740690 Fireplace Qu 48.582166 Fence 80.457807 Alley 93.235395 Misc Feature 96.412709 Pool QC 99.555859 dtype: float64
We have an interesting situation here. Columns like Pool QC
, Misc Feature
, Alley
, Fence
, Fireplace QU
will be deleted straight away because they contain a lot of Null values.
However, the columns that contain Null Values just above 5% are all related to Garage. By reading Data Documentation, we can see that most of them are text columns of ordinal and nominal variables. And those columns already have NA option provided. So we can not take Null Value as NA either. Lets Delete all for them for now
# Cutoff at 5%. (1-0.05 = 0.95)
df = df.dropna(thresh=df.shape[0]*0.95, axis=1)
In text columns, lets remove every column that contains any Null value for now.
text_df = df.select_dtypes(include='object')
text_df.shape
(2927, 34)
# Select Object dtype, and drop column (axis=1) with any missing values values (how='all')
text_df = text_df.dropna(how='any', axis=1)
text_df.shape
(2927, 27)
numeric_df = df.select_dtypes(include=['integer','float'])
numeric_df.head()
Order | PID | MS SubClass | Lot Area | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Cars | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | Mo Sold | Yr Sold | SalePrice | age | years_remod | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | 31770 | 6 | 5 | 1960 | 1960 | 112.0 | 639.0 | 0.0 | 441.0 | 1080.0 | 1656 | 0 | 0 | 1656 | 1.0 | 0.0 | 1 | 0 | 3 | 1 | 7 | 2 | 2.0 | 528.0 | 210 | 62 | 0 | 0 | 0 | 0 | 0 | 5 | 2010 | 215000 | 50 | 50 |
1 | 2 | 526350040 | 20 | 11622 | 5 | 6 | 1961 | 1961 | 0.0 | 468.0 | 144.0 | 270.0 | 882.0 | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | 5 | 0 | 1.0 | 730.0 | 140 | 0 | 0 | 0 | 120 | 0 | 0 | 6 | 2010 | 105000 | 49 | 49 |
2 | 3 | 526351010 | 20 | 14267 | 6 | 6 | 1958 | 1958 | 108.0 | 923.0 | 0.0 | 406.0 | 1329.0 | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | 6 | 0 | 1.0 | 312.0 | 393 | 36 | 0 | 0 | 0 | 0 | 12500 | 6 | 2010 | 172000 | 52 | 52 |
3 | 4 | 526353030 | 20 | 11160 | 7 | 5 | 1968 | 1968 | 0.0 | 1065.0 | 0.0 | 1045.0 | 2110.0 | 2110 | 0 | 0 | 2110 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | 8 | 2 | 2.0 | 522.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 2010 | 244000 | 42 | 42 |
4 | 5 | 527105010 | 60 | 13830 | 5 | 5 | 1997 | 1998 | 0.0 | 791.0 | 0.0 | 137.0 | 928.0 | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | 6 | 1 | 2.0 | 482.0 | 212 | 34 | 0 | 0 | 0 | 0 | 0 | 3 | 2010 | 189900 | 13 | 12 |
# Take out the columns using isna() and any() and Segregate their names in a list.
drop_these = numeric_df.columns[numeric_df.isna().any()].tolist()
# Drop the columns
numeric_df = numeric_df.drop(drop_these, axis=1)
Lets delete Order
and PID
columns
numeric_df = numeric_df.drop(['Order', 'PID'], axis=1)
Following Columns can potentially Leak Data to our target variable SalePrice
:
Mo Sold
, Sale Condition
, Sale Type
, Yr Sold
. For more information, check out Data Documentation
Lets remove them as well. We will remove them from our Sub-DataFrames. We will concatenate out Sub-DataFrames later
numeric_df = numeric_df.drop(['Mo Sold', 'Yr Sold'], axis=1)
text_df = text_df.drop(['Sale Type', 'Sale Condition'], axis=1)
Lets keep the deletion of rows with extremities after concatenating both DataFrames, that we will do in the function.
def transform_features(data):
'''
Transforms the DataFrame. It replicates
the work done in preceding code.
Args:
data: The Dataframe
Returns:
transformed DataFrame
'''
# Create a Copy
df = data.copy()
# Create new features and Clean
df['age'] = df['Yr Sold'] - df['Year Built']
df['years_remod'] = df['Yr Sold'] - df['Year Remod/Add']
del_us = df[(df['age'] < 0) | (df['years_remod'] < 0)]
df.drop(del_us.index, axis=0, inplace=True)
df = df.dropna(thresh=df.shape[0]*0.95, axis=1)
# Break the DataFrame WRT Dtype and Clean
text_df = df.select_dtypes(include='object')
text_df = text_df.dropna(how='any', axis=1)
numeric_df = df.select_dtypes(include=['integer','float'])
drop_these = numeric_df.columns[numeric_df.isna().any()].tolist()
numeric_df = numeric_df.drop(drop_these, axis=1)
numeric_df = numeric_df.drop(['Order', 'PID', 'Mo Sold', 'Yr Sold'], axis=1)
text_df = text_df.drop(['Sale Type', 'Sale Condition'], axis=1)
# Concatenate the DataFrame pieces
df = pd.concat([numeric_df, text_df], axis=1)
# Do more Cleaning
extreme_rows = df[df['Gr Liv Area'] > 4000]
df = df.drop(extreme_rows.index, axis = 0)
# Return
return df
The functions select_features()
and train_test()
are still as they are
# We updated transform_features()
round2_transformed_data = transform_features(data)
# We have NOT updated select_features(), So it is still taking only 2 columns
round2_selected_data = select_features(round2_transformed_data)
# We have NOT updated test_and_train() either!
round2_rmse_data = train_and_test(round2_selected_data)
print(round2_rmse_data)
53051.95365648405
print('Our RMSE for this attempt is around {:.2f} and\nWith an impovement in RMSE of around USD {:.2f}\nThat is almost {:.1f}% improvement than the previous results'.
format(round2_rmse_data, round1_rmse_data-round2_rmse_data, (round1_rmse_data-round2_rmse_data)/round1_rmse_data*100))
Our RMSE for this attempt is around 53051.95 and With an impovement in RMSE of around USD 4036.30 That is almost 7.1% improvement than the previous results
Its time to move on to Feature Selection for numerical columns
SalePrice
?In the end, we will update the logic to our functions
# Lets Plot a Correlation Matrix
numeric_df = round2_transformed_data.select_dtypes(['integer', 'float'])
plt.figure(figsize=(20,20))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.3g', vmin=-1, vmax=1,
center= 0, cmap= 'coolwarm',cbar_kws= {'orientation': 'horizontal'})
<matplotlib.axes._subplots.AxesSubplot at 0x1e2dbc40b80>
Following are the most Correlated Features with SalePrice
Overall quality
Gr Liv Area
Exter Qual
Kitchen Qual
Total Bsmt SF
Garage Cars
Garage Area
1st Flr SF
Years Before Sale
Years Since Remod
Within these features, there is a high correlation present between them that could be collinearity as well
Total Bsmt SF
and 1st Flr SF
Overall quality
, Exter Qual
, and Kitchen Qual
Garage Cars
and Garage Area
Since the columns age
and years_remod
were engineered from columns Year Built
and Year Remod/Add
, a very high correlation. though negative, can be seen among them
# Taking Correlation Coeffients
abs_corr_coeffs = numeric_df.corr()['SalePrice'].abs().sort_values(ascending=False)
# Displaying head only
abs_corr_coeffs.head()
SalePrice 1.000000 Overall Qual 0.805160 Gr Liv Area 0.719345 1st Flr SF 0.642763 age 0.565314 Name: SalePrice, dtype: float64
Lets keep the columns with correlation coeffients above 0.4
We will change it to get better results later.
abs_corr_coeffs[abs_corr_coeffs > 0.4]
SalePrice 1.000000 Overall Qual 0.805160 Gr Liv Area 0.719345 1st Flr SF 0.642763 age 0.565314 Year Built 0.564935 Full Bath 0.544366 years_remod 0.541941 Year Remod/Add 0.540110 TotRms AbvGrd 0.498450 Fireplaces 0.474777 Name: SalePrice, dtype: float64
# Drop any columns that are less correlant that 0.4.
round2_transformed_data = round2_transformed_data.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)
All nominal variables are the candidates of being converted to Categorical Dtype. Let's check the nominal variables for
We also have to look into columns that are currently in numerical encoding but could do much better with Categorical encoding
# according to the Data Documentary, following are the nominal variables
nominal_features_all = [ 'PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood',
'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st',
'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type',
'Misc Feature', 'Sale Type', 'Sale Condition']
# We need to retrieve the columns that exist in our functioning DataFrame, i-e, round2_transformed_data
nominal_features = []
for name in nominal_features_all:
if name in round2_transformed_data.columns:
nominal_features.append(name)
nominal_features
['MS Zoning', 'Street', 'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Foundation', 'Heating', 'Central Air']
# Lets check the frequency tables with value_counts
for col in nominal_features:
print(col,'\n',round2_transformed_data[col].value_counts(), '\n', 'NUNIQUE:',
round2_transformed_data[col].nunique(), '\n', '-'*20)
MS Zoning RL 2267 RM 462 FV 139 RH 27 C (all) 25 I (all) 2 A (agr) 2 Name: MS Zoning, dtype: int64 NUNIQUE: 7 -------------------- Street Pave 2912 Grvl 12 Name: Street, dtype: int64 NUNIQUE: 2 -------------------- Land Contour Lvl 2630 HLS 120 Bnk 114 Low 60 Name: Land Contour, dtype: int64 NUNIQUE: 4 -------------------- Lot Config Inside 2138 Corner 507 CulDSac 180 FR2 85 FR3 14 Name: Lot Config, dtype: int64 NUNIQUE: 5 -------------------- Neighborhood NAmes 443 CollgCr 267 OldTown 239 Edwards 191 Somerst 182 NridgHt 165 Gilbert 165 Sawyer 151 NWAmes 131 SawyerW 125 Mitchel 114 BrkSide 108 Crawfor 103 IDOTRR 93 Timber 72 NoRidge 69 StoneBr 51 SWISU 48 ClearCr 44 MeadowV 37 BrDale 30 Blmngtn 28 Veenker 24 NPkVill 23 Blueste 10 Greens 8 GrnHill 2 Landmrk 1 Name: Neighborhood, dtype: int64 NUNIQUE: 28 -------------------- Condition 1 Norm 2518 Feedr 163 Artery 92 RRAn 50 PosN 38 RRAe 28 PosA 20 RRNn 9 RRNe 6 Name: Condition 1, dtype: int64 NUNIQUE: 9 -------------------- Condition 2 Norm 2895 Feedr 13 Artery 5 PosA 4 PosN 3 RRNn 2 RRAn 1 RRAe 1 Name: Condition 2, dtype: int64 NUNIQUE: 8 -------------------- Bldg Type 1Fam 2419 TwnhsE 233 Duplex 109 Twnhs 101 2fmCon 62 Name: Bldg Type, dtype: int64 NUNIQUE: 5 -------------------- House Style 1Story 1480 2Story 868 1.5Fin 314 SLvl 128 SFoyer 83 2.5Unf 24 1.5Unf 19 2.5Fin 8 Name: House Style, dtype: int64 NUNIQUE: 8 -------------------- Roof Style Gable 2319 Hip 547 Gambrel 22 Flat 20 Mansard 11 Shed 5 Name: Roof Style, dtype: int64 NUNIQUE: 6 -------------------- Roof Matl CompShg 2883 Tar&Grv 23 WdShake 9 WdShngl 6 Roll 1 Membran 1 Metal 1 Name: Roof Matl, dtype: int64 NUNIQUE: 7 -------------------- Exterior 1st VinylSd 1025 MetalSd 450 HdBoard 441 Wd Sdng 419 Plywood 221 CemntBd 124 BrkFace 88 WdShing 56 AsbShng 44 Stucco 42 BrkComm 6 CBlock 2 Stone 2 AsphShn 2 ImStucc 1 PreCast 1 Name: Exterior 1st, dtype: int64 NUNIQUE: 16 -------------------- Exterior 2nd VinylSd 1014 MetalSd 447 HdBoard 405 Wd Sdng 397 Plywood 274 CmentBd 124 Wd Shng 81 BrkFace 47 Stucco 46 AsbShng 38 Brk Cmn 22 ImStucc 14 Stone 6 AsphShn 4 CBlock 3 PreCast 1 Other 1 Name: Exterior 2nd, dtype: int64 NUNIQUE: 17 -------------------- Foundation PConc 1304 CBlock 1244 BrkTil 311 Slab 49 Stone 11 Wood 5 Name: Foundation, dtype: int64 NUNIQUE: 6 -------------------- Heating GasA 2879 GasW 27 Grav 9 Wall 6 OthW 2 Floor 1 Name: Heating, dtype: int64 NUNIQUE: 6 -------------------- Central Air Y 2728 N 196 Name: Central Air, dtype: int64 NUNIQUE: 2 --------------------
Heating
, Roof Matl
, Condition 2
, Land Contour
, Street
, Central Air
have one value each that has an overwhelming presence.Exterior 1st
, Exterior 2nd
and Neighborhood
contain relatively larger number of unique values.# This is the frequency table showing how much the MOST FREQUENT value is occupying of the total column.
for col in nominal_features:
print(col, round2_transformed_data[col].value_counts(normalize = True, ascending = False)[0])
MS Zoning 0.7753077975376197 Street 0.9958960328317373 Land Contour 0.899452804377565 Lot Config 0.7311901504787962 Neighborhood 0.1515047879616963 Condition 1 0.8611491108071135 Condition 2 0.9900820793433652 Bldg Type 0.8272913816689467 House Style 0.506155950752394 Roof Style 0.7930916552667578 Roof Matl 0.9859781121751026 Exterior 1st 0.350547195622435 Exterior 2nd 0.34678522571819426 Foundation 0.44596443228454175 Heating 0.9846101231190151 Central Air 0.93296853625171
# Adjust an arbitrary cutoff at 0.85
myraid_cutoff = 0.85
# Because of some unusual errors, I introduced the dictionary rather than pulling out the columns directly
myraid_dict = {}
for col in nominal_features:
myraid = round2_transformed_data[col].value_counts(normalize = True, ascending = False)[0]
myraid_dict[col] = myraid
for key, val in myraid_dict.items():
if val > myraid_cutoff:
round2_transformed_data.drop(key, axis=1, inplace=True)
nominal_features.remove(key)
# Adjust an arbitrary cutoff at 10
nunique_thresh = 10
nunique_dict = {}
for col in nominal_features:
nunique_dict[col] = round2_transformed_data[col].nunique()
for key, val in nunique_dict.items():
if val > nunique_thresh:
round2_transformed_data.drop(key, axis=1, inplace=True)
nominal_features.remove(key)
Lets look into the numerical variables that can possibly do better being categorical
for col in round2_transformed_data.select_dtypes(['integer', 'float']).columns:
print(col, '\n', round2_transformed_data[col].value_counts(), '\n')
Overall Qual 5 825 6 732 7 602 8 349 4 226 9 107 3 40 10 26 2 13 1 4 Name: Overall Qual, dtype: int64 Year Built 2005 142 2006 138 2007 107 2004 99 2003 88 1977 57 1920 57 1976 54 1999 52 1958 48 2000 48 2002 47 1998 47 2008 47 1968 45 1959 43 1954 43 1910 43 1978 42 1970 42 1967 41 1972 40 1993 40 1956 39 1971 39 1950 38 1960 37 1994 36 1940 36 1966 35 2001 35 1957 35 1962 35 1963 35 1997 35 1955 34 1965 34 1925 34 1961 34 1964 33 1996 33 1995 31 1900 29 1969 28 1980 27 1992 27 1948 27 1930 26 2009 25 1975 25 1953 24 1915 24 1974 23 1941 23 1973 21 1979 21 1939 20 1926 19 1990 19 1984 19 1951 18 1952 18 1949 18 1923 17 1922 16 1924 16 1945 15 1946 15 1988 15 1938 13 1935 13 1991 12 1936 11 1986 11 1921 11 1947 11 1981 10 1918 10 1916 10 1928 9 1937 9 1927 9 1989 8 1983 8 1929 8 1914 8 1987 8 1931 7 1890 7 1982 7 1985 7 1942 6 1919 5 1880 5 1934 5 1932 5 1912 5 1905 3 1895 3 2010 3 1917 3 1901 2 1885 2 1892 2 1908 2 1882 1 1893 1 1896 1 1898 1 1904 1 1902 1 1906 1 1879 1 1875 1 1907 1 1911 1 1913 1 1872 1 Name: Year Built, dtype: int64 Year Remod/Add 1950 361 2006 202 2007 164 2005 141 2004 111 2000 104 2003 99 2002 82 1998 79 2008 78 1999 60 1996 58 1995 55 1994 54 1997 49 2001 49 1976 48 1977 46 1970 44 1993 43 1968 39 1978 38 1972 35 1958 34 1967 34 2009 33 1992 32 1971 31 1975 30 1959 30 1980 30 1963 30 1956 30 1990 29 1960 29 1991 29 1954 28 1965 28 1966 27 1962 26 1964 26 1969 26 1955 25 1979 24 1961 24 1973 21 1953 20 1957 20 1984 19 1974 19 1989 18 1987 16 1988 15 1952 15 1985 14 1951 14 1981 13 1986 13 2010 13 1983 11 1982 9 Name: Year Remod/Add, dtype: int64 1st Flr SF 864 46 1040 28 912 19 960 18 816 18 .. 2076 1 2064 1 2048 1 2034 1 2053 1 Name: 1st Flr SF, Length: 1078, dtype: int64 Gr Liv Area 864 41 1092 26 1040 25 1456 20 1200 18 .. 2402 1 2400 1 2398 1 2380 1 2049 1 Name: Gr Liv Area, Length: 1286, dtype: int64 Full Bath 2 1529 1 1318 3 61 0 12 4 4 Name: Full Bath, dtype: int64 TotRms AbvGrd 6 844 7 648 5 586 8 347 4 203 9 143 10 78 11 31 3 26 12 15 13 1 14 1 2 1 Name: TotRms AbvGrd, dtype: int64 Fireplaces 0 1422 1 1272 2 218 3 11 4 1 Name: Fireplaces, dtype: int64 SalePrice 135000 34 140000 33 130000 29 155000 28 145000 26 .. 274725 1 332200 1 57625 1 125200 1 169985 1 Name: SalePrice, Length: 1027, dtype: int64 age 1 218 0 113 2 90 4 76 5 66 6 66 3 60 31 53 7 51 9 50 39 48 11 46 51 45 32 45 49 44 8 44 30 41 42 41 10 40 43 39 52 39 37 38 44 38 13 37 53 37 36 36 47 36 50 36 48 35 54 33 12 33 38 33 40 33 46 32 33 32 41 32 15 31 34 30 45 30 29 30 55 27 68 27 35 27 14 27 17 25 56 25 59 24 81 23 88 23 87 22 58 22 16 22 84 22 86 21 70 21 69 21 60 20 82 20 89 20 57 20 28 19 83 17 26 17 62 17 19 16 61 16 18 14 77 14 67 14 93 13 78 13 99 13 85 13 71 12 80 12 96 12 94 12 22 12 90 12 23 12 97 11 73 11 25 11 76 10 106 10 98 10 65 10 91 9 20 9 27 9 21 9 66 8 24 8 72 7 74 7 108 7 63 7 92 6 107 5 100 5 110 5 95 5 64 5 109 5 79 4 75 3 103 3 117 3 127 2 119 2 128 2 118 2 114 2 112 2 129 2 135 1 102 1 104 1 120 1 122 1 126 1 101 1 111 1 113 1 115 1 125 1 136 1 Name: age, dtype: int64 years_remod 0 237 1 193 56 112 4 111 2 104 3 97 5 91 8 85 6 83 57 79 59 78 58 76 7 75 9 71 10 66 11 61 12 58 13 57 31 50 39 47 14 46 60 42 30 40 16 38 36 37 32 37 15 37 44 34 17 33 42 33 51 31 34 30 53 30 40 30 37 30 52 29 33 29 47 28 49 27 38 27 43 27 48 26 41 26 29 26 46 25 55 23 50 23 35 22 28 22 45 22 19 21 54 21 20 18 18 18 23 18 27 18 26 17 21 15 25 14 22 13 24 10 Name: years_remod, dtype: int64
Following Columns that are currently numerical in our round2_transformed_data's numerical Dtypes:
Overall Qual
Year Built
, Year Remod/Add
, Full Bath
, TotRmsAbvGrd
, Fireplaces
, years_remod
, age
1st Flr SF
, Gr Liv Area
, SalePrice
So far, its difficult to see any 'plain' nominal variable in above-given columns. We can't assume Ordinal Variables as to be Nominal ones.
Let's move on for now
how to work with ordinal variables with keeping in mind that we are going to do linear regression. Secondly, do we need to scale? if yes then how. There is a page opened in this browser. Consult it too. And lastly, there are columns that show year. what to do with them
# Nominal vs Ordinal vs Ratio vs Interval round2_transformed_data[col]
# Select just the remaining text columns and convert to categorical
text_cols = round2_transformed_data.select_dtypes(include=['object'])
for col in text_cols:
round2_transformed_data[col] = round2_transformed_data[col].astype('category')
# Create dummy columns and add back to the dataframe!
dummies = pd.get_dummies(round2_transformed_data.select_dtypes(include=['category']))
# Concatenate back
round2_transformed_data = pd.concat([round2_transformed_data, dummies], axis=1)
# Drop the original ones
round2_transformed_data = round2_transformed_data.drop(text_cols, axis=1)
# Check
round2_transformed_data.shape
(2924, 88)
def select_features(df, coef_cutoff, myraid_cutoff, nunique_thresh):
'''
The function replicates the code done in prior cells.
It selects the features we define to be appropriate
Args:
df: DataFrame
coef_cutoff: Selects the minimum Coefficient of Correlation as Cut off
myraid_cutoff: Selects the maximum acceptable presence of the Mode value in its column
nunique_thresh: Selects the maximum number of unique values for a column to be acceptable
Return:
DataFrame with requested changes and Dummy Variables for Categorical Variables
'''
abs_corr_coeffs = df.select_dtypes(['integer', 'float']).corr()['SalePrice'].abs().sort_values(ascending=False)
coef_cutoff = coef_cutoff
df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coef_cutoff].index, axis=1)
nominal_features = df.select_dtypes(include='object').columns.tolist()
myraid_cutoff = myraid_cutoff
myraid_dict = {}
for col in nominal_features:
myraid = df[col].value_counts(normalize = True, ascending = False)[0]
myraid_dict[col] = myraid
for key, val in myraid_dict.items():
if val > myraid_cutoff:
df.drop(key, axis=1, inplace=True)
nominal_features.remove(key)
nunique_thresh = nunique_thresh
nunique_dict = {}
for col in nominal_features:
nunique_dict[col] = df[col].nunique()
for key, val in nunique_dict.items():
if val > nunique_thresh:
df.drop(key, axis=1, inplace=True)
nominal_features.remove(key)
text_cols = df.select_dtypes(include=['object'])
for col in text_cols:
df[col] = df[col].astype('category')
df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols, axis=1)
return df
# We updated transform_features()
round3_transformed_data = transform_features(data)
# We have updated select_features()
round3_selected_data = select_features(round3_transformed_data, 0.4, 0.85, 10)
round3_rmse_data = train_and_test(round3_selected_data)
print(round3_rmse_data)
29742.097652564822
print('Our RMSE for this attempt is around {:.2f} and\nWith an impovement in RMSE of around USD {:.2f}\nThat is almost {:.1f}% improvement than the previous results'.
format(round3_rmse_data, round2_rmse_data-round3_rmse_data, (round2_rmse_data-round3_rmse_data)/round2_rmse_data*100))
Our RMSE for this attempt is around 29742.10 and With an impovement in RMSE of around USD 23309.86 That is almost 43.9% improvement than the previous results
Having given some effort to improve our Feature transformation and selection, let's move on to the training and testing
Now for the final part of the pipeline, training and testing. When iterating on different features, using simple validation is a good idea. Let's add a parameter named k, that controls the type of cross-validation that occurs
def train_and_test(data, k=0):
'''
Trains and Tests the Linear Regression Model for multiple
values of n_splits in case n_splits > 1
Args:
data: The DataFrame
k: n_splits
Returns:
RMSEs on given number of n_splits
'''
# Take numerical variables and separate features and target
numeric_df = data.select_dtypes(include=['integer', 'float'])
features = numeric_df.columns.drop('SalePrice')
target = 'SalePrice'
# Instantiate LinearRegression() model
lr = linear_model.LinearRegression()
# HoldOut Validation
if k == 0:
train = data.iloc[:1460]
test = data.iloc[1460:]
lr = LinearRegression()
lr.fit(train[features], train[target])
predictions = lr.predict(test[features])
mse = mean_squared_error(test[target], predictions)
rmse = np.sqrt(mse)
return rmse
# Simple Cross Validation
elif k == 1:
# Shuffle the rows and
df_shuffled = data.sample(frac=1,)
train = data.iloc[:1460]
test = data.iloc[1460:]
# Case 1
lr.fit(train[features], train[target])
predictions_1 = lr.predict(test[features])
mse_1 = mean_squared_error(test[target], predictions_1)
rmse_1 = np.sqrt(mse_1)
# Case 2
lr.fit(test[features], test[target])
predictions_2 = lr.predict(train[features])
mse_2 = mean_squared_error(train[target], predictions_2)
rmse_2 = np.sqrt(mse_2)
# Print
print('Set no.1 RMSE:',rmse_1)
print('Set no.2 RMSE:',rmse_2)
# Average
avg_rmse = np.mean([rmse_1, rmse_2])
return avg_rmse
# K-Fold Cross Validation
else:
rmses = []
kf = KFold(n_splits=k, shuffle=True)
mses = cross_val_score(lr, data[features], data[target], scoring = "neg_mean_squared_error", cv =kf)
for n in mses:
root = abs(n)**0.5
rmses.append(root)
avg_rmse = np.mean(rmses)
return avg_rmse
# We updated transform_features()
round4_transformed_data = transform_features(data)
round4_selected_data = select_features(round4_transformed_data, 0.4, 0.85, 10)
round4_rmse_data = train_and_test(round4_selected_data,7)
print(round4_rmse_data)
28752.484245961794
for i in range(20):
round4_rmse_data = train_and_test(round4_selected_data,i)
print(round4_rmse_data)
29742.097652564822 Set no.1 RMSE: 29742.097652564822 Set no.2 RMSE: 29034.741307186 29388.41947987541 29119.791101046714 151560637.66965345 28886.530704883993 28798.881034148355 28833.705225561687 28918.549361720805 103225689.18178892 28781.4512117181 20077690.54605773 28666.736253769082 28855.167047586987 28742.954918946703 28823.21479456193 28802.41937028274 28743.75558920432 28743.755969617643 28647.47567215801 28712.830226751026