This project uses Machine Learning (specifically, Linear Regression) models to predict house prices from other features of the houses.
Information about the columns in the dataset can be found here, while the dataset itself can be downloaded here.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from datetime import date
pd.set_option('display.max_columns', None)
# Read the dataset
data = pd.read_csv("AmesHousing.tsv", sep="\t")
# Display basic info
print("Shape of the dataset:", data.shape)
data.info()
display(data.head())
We'll define the functions we need in the first few cells of the notebook. This way, we can add cells at the end of the notebook to run experiments, and then come back and update the functions in these cells. We'll start with minimal placeholder versions of transform_features(), select_features(), and train_and_test():
def transform_features():
    return data

def select_features():
    return data[["MS SubClass", "Gr Liv Area", "SalePrice"]]

def train_and_test():
    train = data[:1460].copy()
    test = data[1460:].copy()
    features = list(select_features().columns.drop("SalePrice"))
    lr = LinearRegression()
    lr.fit(train[features], train["SalePrice"])
    predictions = lr.predict(test[features])
    # Note: ** binds tighter than /, so the square root needs (1/2), not 1/2
    rmse = mean_squared_error(test["SalePrice"], predictions) ** (1/2)
    return rmse
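As a first reference point, we can already run this baseline pipeline end to end (illustrative usage; the resulting RMSE value depends on the data split):

# Baseline RMSE using only "MS SubClass" and "Gr Liv Area" as predictors
baseline_rmse = train_and_test()
print("Baseline RMSE:", baseline_rmse)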
In general, the goal of this function is to apply the five transformations described in its docstring:
def transform_features():
    """
    Takes the initial dataframe and returns a transformed version with the following changes:
    1. Drop columns with more than 25% null values
    2. Fill null values in the remaining numerical columns with the column mean
    3. Scale numerical columns to a 0-1 range
    4. Create a column with the number of years between construction and remodeling
    5. Convert categorical columns to the proper dtype

    Args:
        none
    Returns:
        df: Modified dataframe
    """
    df = data.copy()

    # 1. Drop columns with more than 25% null values
    # Compute the proportion of null values for each column that has any
    na_props = df.isna().sum()[df.isna().sum() > 0] / len(df)
    # Drop columns whose null proportion exceeds 25%
    drop_list = list(na_props[na_props > .25].index)
    df.drop(drop_list, axis=1, inplace=True)

    # 2. Fill null values in the remaining numerical columns with the column mean
    # Recompute the null proportions after the drop above
    na_props = df.isna().sum()[df.isna().sum() > 0] / len(df)
    # From the columns that still have nulls, keep only the numerical ones
    na_cols = list(na_props[na_props > 0].index)
    avg_list = list(df[na_cols].select_dtypes(include=["int", "float"]).columns)
    # Replace null values in those columns with their means
    df[avg_list] = df[avg_list].fillna(df[avg_list].mean())

    # 3. Scale numerical columns to a 0-1 range (min-max scaling)
    num_cols = ['Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF',
                'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
                'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd',
                'Fireplaces', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch',
                '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val']
    df[num_cols] = (df[num_cols] - df[num_cols].min()) / (df[num_cols].max() - df[num_cols].min())

    # 4. Create a column with the number of years between construction and remodeling
    # If the house was never remodeled (difference of 0), use its current age instead
    df["years_until_remod"] = df["Year Remod/Add"] - df["Year Built"]
    df["years_until_remod"] = np.where(df["years_until_remod"] == 0,
                                       date.today().year - df["Year Built"],
                                       df["years_until_remod"])

    # 5. Convert categorical columns to the proper dtype
    categorical_cols = ["Order", "PID", "Overall Qual", "Overall Cond", "Year Remod/Add", "Year Built", "Garage Yr Blt",
                        "MS SubClass", "MS Zoning", "Street", "Lot Shape", "Land Contour", "Utilities", "Lot Config", "Mo Sold",
                        "Yr Sold", "Land Slope", "Neighborhood", "Condition 1", "Condition 2", "Bldg Type", "House Style",
                        "Roof Style", "Roof Matl", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond",
                        "Foundation", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
                        "Heating", "Heating QC", "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type",
                        "Garage Finish", "Garage Qual", "Garage Cond", "Paved Drive", "Sale Type", "Sale Condition"]
    df[categorical_cols] = df[categorical_cols].astype("category")

    # Move the SalePrice column to the end of the dataframe, for visualization purposes
    cols = list(df.columns.values)
    cols.pop(cols.index("SalePrice"))
    df = df[cols + ["SalePrice"]]
    return df
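As a quick sanity check (an illustrative sketch, assuming the standard Ames file; not part of the pipeline), we can verify that the transformations behaved as expected:

# Sanity checks for transform_features()
tf = transform_features()
# No remaining column should exceed the 25% null threshold
assert tf.isna().mean().max() <= 0.25
# Min-max scaled columns should live in the 0-1 range
assert tf["Gr Liv Area"].between(0, 1).all()
print(tf["years_until_remod"].describe())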
To improve the function that we'll use to select interesting feature columns, we'll first take a look at the correlations between the numerical columns:
df = transform_features()
sns.set(rc={'figure.figsize':(12,9)})
sns.heatmap(df.select_dtypes(include=["int", "float"]).corr())
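To complement the heatmap, we can also rank the numerical columns by their absolute correlation with SalePrice (a small illustrative sketch):

# Rank numerical columns by absolute correlation with SalePrice
corr_with_price = (df.select_dtypes(include=["int", "float"])
                     .corr()["SalePrice"]
                     .drop("SalePrice")
                     .abs()
                     .sort_values(ascending=False))
print(corr_with_price.head(10))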
It seems that the variables with the highest correlation with the SalePrice column are Total Bsmt SF, 1st Flr SF, Gr Liv Area, and Garage Cars. Therefore, let's look at their individual r values:
# Correlations of interesting features from the heatmap with SalePrice
df[["Total Bsmt SF", "1st Flr SF", "Gr Liv Area", "Garage Cars", "SalePrice"]].corr()["SalePrice"]
We'll use these as numerical predictors in our model.
Now, let's see how the categorical variables distribute their values. We'll keep only those that carry interesting information and don't concentrate more than 95% of their values in a single category:
# Loop to display normalized value counts for each of the categorical columns
for var in ["Order", "PID", "Overall Qual", "Overall Cond", "Year Remod/Add", "Year Built", "Garage Yr Blt",
            "MS SubClass", "MS Zoning", "Street", "Lot Shape", "Land Contour", "Utilities", "Lot Config", "Mo Sold",
            "Yr Sold", "Land Slope", "Neighborhood", "Condition 1", "Condition 2", "Bldg Type", "House Style",
            "Roof Style", "Roof Matl", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond",
            "Foundation", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2",
            "Heating", "Heating QC", "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type",
            "Garage Finish", "Garage Qual", "Garage Cond", "Paved Drive", "Sale Type", "Sale Condition"]:
    display(df[var].value_counts(normalize=True))
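The selection below was made by inspecting those value counts manually; as a sketch, the 95% concentration rule could also be checked programmatically:

# Flag categorical columns where a single category holds more than 95% of the values
for var in df.select_dtypes(include="category").columns:
    top_share = df[var].value_counts(normalize=True).iloc[0]
    if top_share > 0.95:
        print(f"{var}: {top_share:.1%} of values in a single category")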
From all the categorical variables, we'll only keep the following:
"Overall Qual", "Overall Cond", "MS SubClass", "MS Zoning", "Lot Shape", "Land Contour", "Lot Config", "Neighborhood", "Condition 1", "Bldg Type", "House Style", "Roof Style", "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond", "Foundation", "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2", "Heating QC", "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type", "Garage Finish", "Garage Qual", "Paved Drive", "Sale Type", "Sale Condition".
We'll now update the select_features() function to keep only the numerical variables of interest and to convert the selected categorical variables into dummy variables:
def select_features():
    """
    Takes the transformed dataframe from transform_features(), and returns a version with the following changes:
    1. Keep only the numerical variables of interest
    2. Convert the selected categorical variables to dummy (0-1) variables
    3. Append the SalePrice variable at the end of the dataframe

    Args:
        none
    Returns:
        df: Modified dataframe with the selected attributes
    """
    dataframe = transform_features().copy()
    df = pd.DataFrame()
    numeric_cols = ["Total Bsmt SF", "1st Flr SF", "Gr Liv Area", "Garage Cars"]
    categorical_cols = ["Overall Qual", "Overall Cond", "MS SubClass", "MS Zoning", "Lot Shape", "Land Contour",
                        "Lot Config", "Neighborhood", "Condition 1", "Bldg Type", "House Style", "Roof Style",
                        "Exterior 1st", "Exterior 2nd", "Mas Vnr Type", "Exter Qual", "Exter Cond", "Foundation",
                        "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "BsmtFin Type 1", "BsmtFin Type 2", "Heating QC",
                        "Central Air", "Electrical", "Kitchen Qual", "Functional", "Garage Type", "Garage Finish",
                        "Garage Qual", "Paved Drive", "Sale Type", "Sale Condition"]

    # 1. Keep only the numerical variables of interest
    df[numeric_cols] = dataframe[numeric_cols]

    # 2. Convert the selected categorical variables to dummy (0-1) variables
    dummy_cols = pd.get_dummies(dataframe[categorical_cols])
    df = pd.concat([df, dummy_cols], axis=1)

    # 3. Append the SalePrice variable at the end of the dataframe
    df["SalePrice"] = dataframe["SalePrice"]
    return df
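As a quick illustration of what this produces (the exact column count depends on how many levels the dummy encoding generates):

# Inspect the engineered feature matrix
selected = select_features()
print("Selected feature matrix shape:", selected.shape)
print("First columns:", selected.columns[:8].tolist())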
Finally, we'll update the train_and_test() function to apply holdout validation, simple (shuffled) cross validation, or k-fold cross validation depending on a parameter k.
def train_and_test(k):
    """
    Takes the modified dataframe from select_features(), and applies a Machine Learning -Linear Regression- model to return
    the Root Mean Squared Error (RMSE) of the model.

    Args:
        k: Number of folds for the cross-validation. If 0, applies holdout validation;
           if 1, a single shuffled train/test split; if >1, k-fold cross validation.
    Returns:
        rmse: RMSE value for k=0 or k=1
        avg_rmse: Average RMSE value for k>1
    """
    df = select_features()
    features = list(df.columns.drop("SalePrice"))

    # Apply holdout validation if k=0
    if k == 0:
        train = df[:1460].copy()
        test = df[1460:].copy()
        lr = LinearRegression()
        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        rmse = mean_squared_error(test["SalePrice"], predictions) ** (1/2)
        return rmse

    # Apply simple (shuffled) cross validation if k=1
    elif k == 1:
        shuf_df = df.loc[np.random.permutation(len(df))]
        train = shuf_df[:1460].copy()
        test = shuf_df[1460:].copy()
        lr = LinearRegression()
        lr.fit(train[features], train["SalePrice"])
        predictions = lr.predict(test[features])
        rmse = mean_squared_error(test["SalePrice"], predictions) ** (1/2)
        return rmse

    # Apply k-fold cross validation if k>1
    elif k > 1:
        rmse_vals = []
        # KFold already shuffles the rows, so no manual permutation is needed here
        kf = KFold(k, shuffle=True, random_state=1)
        for train_index, test_index in kf.split(df):
            train = df.iloc[train_index].copy()
            test = df.iloc[test_index].copy()
            lr = LinearRegression()
            lr.fit(train[features], train["SalePrice"])
            predictions = lr.predict(test[features])
            rmse = mean_squared_error(test["SalePrice"], predictions) ** (1/2)
            rmse_vals.append(rmse)
        avg_rmse = np.mean(rmse_vals)
        print("RMSE values:", rmse_vals)
        return avg_rmse
train_and_test(4)
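To compare the three validation strategies, we can run the function for several values of k (an illustrative sketch; the exact RMSE values depend on the random splits):

# Compare holdout, single shuffled split, and k-fold cross validation
for k in [0, 1, 4, 10]:
    print(f"k={k} -> RMSE: {train_and_test(k):.2f}")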