The goal of this project is to predict house prices using Linear Regression and Ridge regression (linear least squares with L2 regularization) from scikit-learn.
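For reference, Ridge regression minimizes the penalized least-squares objective ||y - Xw||^2 + alpha * ||w||^2, where alpha controls the strength of the L2 penalty on the coefficient vector w; with alpha = 0 it reduces to ordinary Linear Regression.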
In this project I use the original data from the article Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project by Dean De Cock: the dataset AmesHousing.txt and the dictionary of attribute descriptions DataDocumentation.txt; you can download them from here and here.
I will not add much commentary in the text; explanations of each step are given as comments inside the code cells.
Caution: the cross-validation used to determine the optimal alpha for Ridge regression takes a very long time.
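To show what that alpha search looks like, here is a minimal sketch run on synthetic data from make_regression rather than the Ames set; the alpha grid and cv=5 are illustrative assumptions, not the values used later in the project.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV

# Synthetic stand-in data; the real project uses the Ames features
X, y = make_regression(n_samples=200, n_features=10, noise=10.0, random_state=0)
# Candidate regularization strengths on a log scale (illustrative grid)
alphas = np.logspace(-3, 3, 50)
# Every candidate alpha is scored with 5-fold cross-validation,
# which is what makes the search slow on large grids and datasets
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X, y)
print(f"Best alpha: {ridge_cv.alpha_:.4f}")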
Load the required modules and explore the dataset. Print the names of columns that contain 5% or more NaN values, drop those columns, and then drop the remaining rows that still contain NaN values.
from IPython.display import display
from IPython.display import HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
# Inspect dataset
# Load the tab-separated data file
ames = pd.read_csv("data/AmesHousing.txt", delimiter="\t")
# Count NaN values in each column
ames_nan_count = ames.isnull().sum()
# Report the number of rows and columns
display("ames contains {:,} and {:} columns.\n".format(
ames.shape[0],
ames.shape[1]))
# Print names of columns with 5% or more NaN values and collect them for dropping
drop_columns = []
for i, feature in enumerate(ames_nan_count.index):
    nan = ames_nan_count[feature]
    nan_perc = nan / ames.shape[0]
    if nan_perc >= 0.05:
        drop_columns.append(feature)
        print("column No", i + 1, ", column name", feature,
              f", NaN = {nan:,} or {nan_perc:.2%} of {ames.shape[0]:,} values.")
# Drop the collected columns
ames.drop(columns=drop_columns, inplace=True)
# Check remaining NaN counts
display(ames.isnull().sum().sort_values(ascending=False).head(17))
# Drop remaining rows with NaN values and reset index
ames.dropna(axis=0, inplace=True)
ames.reset_index(drop=True, inplace=True)
# Display numeric and object columns
number_columns = list(ames.select_dtypes(include='number'))
display(f"Number of numeric columns = {len(number_columns)}")
display(number_columns)
object_columns = list(ames.select_dtypes(include='object'))
display(f"Number of object columns = {len(object_columns)}")
display(object_columns)
# Compute absolute correlations with the SalePrice column
# (numeric_only=True keeps object columns out; required on pandas >= 2.0)
sale_price_corr = ames.corr(numeric_only=True)["SalePrice"].\
    abs().sort_values(ascending=False)
# Display the 15 strongest correlations
print("Correlation with the SalePrice column:")
display(sale_price_corr.head(15))
# List numeric columns with correlation below 0.4, to be dropped
drop_numeric_col = list(sale_price_corr[sale_price_corr < 0.4].index)
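To preview where this cleaning leads, here is a hypothetical baseline sketch using the imports above: fit LinearRegression on the numeric features that survive the corr < 0.4 cut and score a held-out split with RMSE and R^2. The feature selection and split parameters are illustrative assumptions, not the exact pipeline used later in the project.

# Hypothetical baseline: numeric features that survived the corr < 0.4 cut
features = [col for col in number_columns
            if col not in drop_numeric_col and col != "SalePrice"]
X = ames[features]
y = ames["SalePrice"]
# Hold out 20% of the rows for testing (illustrative split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print(f"RMSE = {np.sqrt(mean_squared_error(y_test, pred)):,.0f}")
print(f"R^2 = {r2_score(y_test, pred):.3f}")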