#!/usr/bin/env python
# coding: utf-8

# # Predicting House Sale Prices in Ames, IA using Linear Regression

# ![housing](housing.jpg)

# ## Introduction
# This data set describes the sale of individual residential properties in Ames, Iowa from 2006 to 2010.
#
# ## Data Dictionary
# The data set contains 2930 observations and a large number of explanatory variables (23 nominal, 23 ordinal, 14 discrete, and 20 continuous) involved in assessing home values. [Here](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt) is a txt file containing information on the variables involved.
#
# ### Resources
# The data documentation can be found [here](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt)
# The data can also be downloaded from [here](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)
# For more detailed information, click [here](https://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627)
#
# ## Goal
# In this project, we'll build a linear regression model and explore ways to improve it by
# - Determining which of the 79 available features work best as predictors of house sale price, and using them to build the most accurate linear regression model we can.
# - Measuring how well the resulting model fits through root mean squared error (RMSE).

# ---
# Let's import the necessary tools and libraries

# In[1]:

# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# In[2]:

# Display Settings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# ---
# ## Load Dataset

# In[3]:

data = pd.read_table('AmesHousing.tsv', delimiter="\t")

# In[4]:

data.head()

# In[5]:

data.info()

# In[6]:

data['SalePrice'].tolist()

# In[7]:

data.describe()

# Let's start by setting up a pipeline of functions that will let us quickly iterate on different models.

# ---
# ## Setting up a Pipeline of Functions
# Functions that transform features, select features, and train and test the model

# ![pipeline](my_pipeline.PNG)

# We will build these functions gradually and improve them in rounds.
# So let's begin with round 1, where we write only minimal code. As mentioned, we will keep improving on it throughout this project.
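# Throughout the rounds we compare models by RMSE. The pipeline below computes it with
# sklearn's `mean_squared_error` followed by `np.sqrt`, but for reference, here is a small
# self-contained version of the metric (illustrative only, not a cell from the original notebook):

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of the mean of the squared prediction errors."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

# Tiny example: predictions off by 10 and 20 give an RMSE of roughly 15.81
print(rmse([100, 200], [110, 180]))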
#
# ## Round 1

# In[8]:

# For now, just return the DataFrame unchanged
def transform_features(data):
    '''
    under development
    '''
    return data

# In[9]:

# For now, it just returns two of the DataFrame's columns
def select_features(data):
    '''
    under development
    '''
    return data[['Gr Liv Area', 'SalePrice']]

# In[10]:

def train_and_test(data):
    '''
    under development
    '''
    train = data.iloc[:1460]
    test = data.iloc[1460:]

    numeric_train = train.select_dtypes(include=['integer', 'float'])
    numeric_test = test.select_dtypes(include=['integer', 'float'])

    target = 'SalePrice'
    features = numeric_train.columns.drop(target)

    lr = LinearRegression()
    lr.fit(train[features], train[target])
    predictions = lr.predict(test[features])

    mse = mean_squared_error(test[target], predictions)
    rmse = np.sqrt(mse)
    return rmse

# In[11]:

# Feature transformation
round1_transformed_data = transform_features(data)

# Feature selection
round1_selected_data = select_features(round1_transformed_data)

# Training and testing
round1_rmse_data = train_and_test(round1_selected_data)

print('So in this first and lazy attempt, we have got an RMSE of', round1_rmse_data)

# Let's now start removing features with missing values, diving deeper into potential categorical features, and transforming text and numerical columns.
#
# ---
# ## Round 2
# Let's update the `transform_features()` function to achieve the following goals:
# 1. Remove the features we don't want to use in the model
#    - based on the number of missing values
#    - because they could cause data leakage
#
#
# 2. Transform features into the proper format, for example
#    - numerical to categorical
#    - scaling numerical features
#    - filling in missing values
#    - etc.
#
#
# 3. Create new features by combining existing features

# #### Let's explore the data and look for possible manoeuvring

# In[12]:

data.select_dtypes(include=['integer', 'float']).info()

# In[13]:

data.select_dtypes(include=['object']).info()

# #### Approach for Round 2:
# Let's use this approach **for now**. We can always change thresholds and approaches to improve our score.
# - Check whether we can create new features from the existing ones, if the new ones capture the details better
#
# And for the cleaning part, we can do the following:
# - For numerical columns:
#   - Impute the values with the mean/median/mode for columns that have less than 5% null values
# - For text columns:
#   - Delete columns with any null values
# - For all columns:
#   - Delete columns that are not useful for machine learning
#   - Delete columns that could potentially leak data about `SalePrice`
#   - Delete columns that have more than 5% null values
# - We will also delete some rows with extreme values, as suggested in the [Data Documentation](DataDocumentary.txt)
#
# Since we are working towards a reusable function, our work will involve a lot of experimentation. So it is a better idea to retain the original dataset and experiment on a copy of it.

# In[14]:

df = data.copy()

# In[15]:

df.select_dtypes('object').head()

# - No numerical columns are misrepresented as strings.
#
# Let's check the columns of integer and float dtypes.

# In[16]:

df.select_dtypes(['integer', 'float']).head()

# Looking closely at the numerical features, three of them should catch our attention: `Year Built`, `Year Remod/Add` and `Yr Sold`.
# `Year Built` is the year the property was built. `Year Remod/Add` tells us about the last modification, if there was any, whereas `Yr Sold` is the year the property was sold.
# These features might become far more impactful if we combine them to calculate the "age of the house" and the "years since alteration".
#
# Let's do that now by creating two new features:
# - `age`, formed by subtracting `Year Built` from `Yr Sold`, and
# - `years_remod`, formed by subtracting `Year Remod/Add` from `Yr Sold`

# In[17]:

df['age'] = df['Yr Sold'] - df['Year Built']

# In[18]:

df['years_remod'] = df['Yr Sold'] - df['Year Remod/Add']

# We rarely get perfect data!
# On running `value_counts()` on both of our new columns, **we found some negative values.**

# In[19]:

# Capture these rows
del_us = df[(df['age'] < 0) | (df['years_remod'] < 0)]
del_us

# In[20]:

# Let's remove these rows.
df.drop(del_us.index, axis=0, inplace=True)

# #### Data Cleaning
# Let's check which columns have more than 5% null values and delete them after consideration.

# In[21]:

# Series of columns with the percentage of null values in each
null_series = pd.Series(df.isnull().sum()/len(df) * 100)

# Filtering out the columns with more than 5% null values
null_series[null_series > 5.0].sort_values()

# We have an interesting situation here. Columns like `Pool QC`, `Misc Feature`, `Alley`, `Fence` and `Fireplace Qu` will be deleted straight away because they contain a lot of null values.
#
# However, the columns whose share of null values is just above 5% are all related to the garage.
# Reading the [Data Documentation](DataDocumentation.txt), we can see that most of them are text columns of ordinal and nominal variables, and those columns already have an **NA** option provided, so we cannot treat a null value as **NA** either.
# Let's delete all of them for now.

# In[22]:

# Cutoff at 5% (1 - 0.05 = 0.95)
df = df.dropna(thresh=df.shape[0]*0.95, axis=1)

# - Columns with more than 5% null values removed successfully
#
# Among the text columns, let's remove every column that contains any null value, for now.

# In[23]:

text_df = df.select_dtypes(include='object')
text_df.shape

# In[24]:

# Select object dtype and drop columns (axis=1) with any missing values (how='any')
text_df = text_df.dropna(how='any', axis=1)
text_df.shape

# In[25]:

numeric_df = df.select_dtypes(include=['integer','float'])
numeric_df.head()

# In[26]:

# Find the columns with missing values using isna() and any(), and collect their names in a list.
drop_these = numeric_df.columns[numeric_df.isna().any()].tolist()

# In[27]:

# Drop the columns
numeric_df = numeric_df.drop(drop_these, axis=1)

# Let's delete the `Order` and `PID` columns

# In[28]:

numeric_df = numeric_df.drop(['Order', 'PID'], axis=1)

# The following columns can potentially leak data about our target variable `SalePrice`:
#
# `Mo Sold`, `Sale Condition`, `Sale Type`, `Yr Sold`. For more information, check out the [Data Documentation](DataDocumentation.txt).
#
# Let's remove them as well. We will remove them from our sub-DataFrames and concatenate the sub-DataFrames later.

# In[29]:

numeric_df = numeric_df.drop(['Mo Sold', 'Yr Sold'], axis=1)

# In[30]:

text_df = text_df.drop(['Sale Type', 'Sale Condition'], axis=1)

# We will leave the deletion of rows with extreme values until after concatenating both DataFrames, which we will do inside the function.
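# The approach above also mentions imputing numeric columns that have under 5% missing values
# rather than dropping them. The function below simply drops them, but a minimal sketch of the
# imputation alternative (not applied anywhere in this notebook, shown on a copy so it does not
# affect the pipeline) could look like this:

imputed = df.copy()
numeric_cols = imputed.select_dtypes(include=['integer', 'float'])

# Columns where between 0% and 5% of the values are missing
few_missing = numeric_cols.columns[(numeric_cols.isnull().mean() > 0) &
                                   (numeric_cols.isnull().mean() < 0.05)]

# Fill each with the column's most frequent value
for col in few_missing:
    imputed[col] = imputed[col].fillna(imputed[col].mode()[0])

imputed[few_missing].isnull().sum()  # should now be all zeros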
# ### Updating Our Function

# In[31]:

def transform_features(data):
    '''
    Transforms the DataFrame. It replicates the work done in the preceding code.

    Args:
        data: the DataFrame

    Returns:
        the transformed DataFrame
    '''
    # Create a copy
    df = data.copy()

    # Create new features and clean
    df['age'] = df['Yr Sold'] - df['Year Built']
    df['years_remod'] = df['Yr Sold'] - df['Year Remod/Add']
    del_us = df[(df['age'] < 0) | (df['years_remod'] < 0)]
    df.drop(del_us.index, axis=0, inplace=True)
    df = df.dropna(thresh=df.shape[0]*0.95, axis=1)

    # Split the DataFrame by dtype and clean
    text_df = df.select_dtypes(include='object')
    text_df = text_df.dropna(how='any', axis=1)
    numeric_df = df.select_dtypes(include=['integer','float'])
    drop_these = numeric_df.columns[numeric_df.isna().any()].tolist()
    numeric_df = numeric_df.drop(drop_these, axis=1)
    numeric_df = numeric_df.drop(['Order', 'PID', 'Mo Sold', 'Yr Sold'], axis=1)
    text_df = text_df.drop(['Sale Type', 'Sale Condition'], axis=1)

    # Concatenate the DataFrame pieces
    df = pd.concat([numeric_df, text_df], axis=1)

    # Do more cleaning: drop the rows with extreme living areas flagged in the data documentation
    extreme_rows = df[df['Gr Liv Area'] > 4000]
    df = df.drop(extreme_rows.index, axis=0)

    return df

# The functions `select_features()` and `train_and_test()` are still as they were.

# In[32]:

# We updated transform_features()
round2_transformed_data = transform_features(data)

# We have NOT updated select_features(), so it still takes only 2 columns
round2_selected_data = select_features(round2_transformed_data)

# We have NOT updated train_and_test() either!
round2_rmse_data = train_and_test(round2_selected_data)

print(round2_rmse_data)

# In[33]:

print('Our RMSE for this attempt is around {:.2f} and\nWith an improvement in RMSE of around USD {:.2f}\nThat is almost {:.1f}% improvement over the previous results'.
      format(round2_rmse_data,
             round1_rmse_data - round2_rmse_data,
             (round1_rmse_data - round2_rmse_data) / round1_rmse_data * 100))

# ---
# ## Round 3
# It is time to move on to **feature selection** for numerical columns.
# - Which features correlate strongly with our target variable `SalePrice`?
# - What are the correlation coefficients of the features that correlate well?
# - Which columns in the DataFrame should be converted to the categorical dtype? (All the columns that can be categorized as nominal variables are candidates for being converted to categorical.)
# - Which columns are currently numerical but need to be encoded as categorical instead?
#
# In the end, we will update the logic in our functions.

# In[34]:

# Let's plot a correlation matrix
numeric_df = round2_transformed_data.select_dtypes(['integer', 'float'])
plt.figure(figsize=(20,20))
sns.heatmap(numeric_df.corr(),
            annot=True,
            fmt='.3g',
            vmin=-1, vmax=1, center=0,
            cmap='coolwarm',
            cbar_kws={'orientation': 'horizontal'})

# The following are the **most correlated features** with `SalePrice`:
# - `Overall Qual`
# - `Gr Liv Area`
# - `Exter Qual`
# - `Kitchen Qual`
# - `Total Bsmt SF`
# - `Garage Cars`
# - `Garage Area`
# - `1st Flr SF`
# - `age`
# - `years_remod`
#
#
# Within these features, there is also **high correlation between some of the features themselves, which could indicate collinearity** (a quick way to check one of these pairs is sketched below):
# - `Total Bsmt SF` and `1st Flr SF`
# - `Overall Qual`, `Exter Qual`, and `Kitchen Qual`
# - `Garage Cars` and `Garage Area`
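# To inspect one of these suspected collinear pairs directly, we can look at its pairwise correlation.
# This is an illustrative check only, not an original cell; the guard is there because the column pair
# may or may not have survived the earlier null-based column drops.

pair = ['Garage Cars', 'Garage Area']  # one of the suspected collinear pairs
if set(pair).issubset(numeric_df.columns):
    # Values close to 1 (or -1) suggest the two columns carry largely the same information
    print(numeric_df[pair].corr())
else:
    print('Pair not present in numeric_df after the earlier cleaning steps')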
# And since the columns `age` and `years_remod` were engineered from `Year Built` and `Year Remod/Add`, a very high correlation, though negative, can be seen between each engineered column and its source column.

# In[35]:

# Taking the absolute correlation coefficients with SalePrice
abs_corr_coeffs = numeric_df.corr()['SalePrice'].abs().sort_values(ascending=False)

# Displaying the head only
abs_corr_coeffs.head()

# Let's keep the columns with a correlation coefficient above 0.4.
#
# We can change this cutoff later to get better results.

# In[36]:

abs_corr_coeffs[abs_corr_coeffs > 0.4]

# In[37]:

# Drop any columns whose correlation with SalePrice is below 0.4
round2_transformed_data = round2_transformed_data.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)

# #### Moving on to Categorical Columns
# All nominal variables are candidates for being converted to the categorical dtype. Let's check the nominal variables for:
# - The number of unique values in them. We have to be wary of columns that contain too many unique values.
# - Columns where one value makes up an overwhelming share of the column. They are akin to low-variance numerical features (there is little variability in the data for the model to capture), for example a column with a few unique values where more than 95% of the values belong to a single category.
#
# We also have to look into columns that are currently numerically encoded but would do much better with a categorical encoding.

# In[38]:

# According to the data documentation, the following are the nominal variables
nominal_features_all = [
    'PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour',
    'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
    'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',
    'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type',
    'Misc Feature', 'Sale Type', 'Sale Condition']

# We only need the nominal features that still exist in our working DataFrame, i.e. round2_transformed_data
nominal_features = []
for name in nominal_features_all:
    if name in round2_transformed_data.columns:
        nominal_features.append(name)

nominal_features

# In[39]:

# Let's check the frequency tables with value_counts
for col in nominal_features:
    print(col, '\n',
          round2_transformed_data[col].value_counts(), '\n',
          'NUNIQUE:', round2_transformed_data[col].nunique(), '\n',
          '-'*20)

# - The columns `Heating`, `Roof Matl`, `Condition 2`, `Land Contour`, `Street` and `Central Air` each have one value with an overwhelming presence (see the small sketch below for why that is a problem for the model).
# - The columns `Exterior 1st`, `Exterior 2nd` and `Neighborhood` contain a relatively large number of unique values.
# - We have to decide whether to take them all, leave them all, or define a threshold that could be modified later.
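# To see why a column dominated by a single value adds so little signal, here is a tiny illustrative
# example (not from the original notebook): a dummy column that is 1 for 99% of rows has almost no
# variance for the regression to use, compared with an evenly split one.

dominated = np.array([1] * 99 + [0])       # 99% of rows share the same value
balanced = np.array([1] * 50 + [0] * 50)   # a 50/50 split for comparison

print(dominated.var())  # ~0.0099: barely any variability
print(balanced.var())   # 0.25: far more information for the model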
# In[40]:

# This frequency table shows what share of each column is taken up by its MOST FREQUENT value.
for col in nominal_features:
    print(col, round2_transformed_data[col].value_counts(normalize=True, ascending=False).iloc[0])

# In[41]:

# Set an arbitrary cutoff at 0.85
myraid_cutoff = 0.85

# Collect the shares in a dictionary first and drop afterwards, so we are not modifying
# nominal_features while iterating over it
myraid_dict = {}

for col in nominal_features:
    myraid = round2_transformed_data[col].value_counts(normalize=True, ascending=False).iloc[0]
    myraid_dict[col] = myraid

for key, val in myraid_dict.items():
    if val > myraid_cutoff:
        round2_transformed_data.drop(key, axis=1, inplace=True)
        nominal_features.remove(key)

# In[42]:

# Set an arbitrary cutoff at 10 unique values
nunique_thresh = 10

nunique_dict = {}

for col in nominal_features:
    nunique_dict[col] = round2_transformed_data[col].nunique()

for key, val in nunique_dict.items():
    if val > nunique_thresh:
        round2_transformed_data.drop(key, axis=1, inplace=True)
        nominal_features.remove(key)

# Let's look into the numerical variables that could possibly do better as categorical ones.

# In[43]:

for col in round2_transformed_data.select_dtypes(['integer', 'float']).columns:
    print(col, '\n', round2_transformed_data[col].value_counts(), '\n')

# The columns that are currently numerical in round2_transformed_data fall into these groups:
# - Ordinal variables:
#   - `Overall Qual`
# - Discrete variables:
#   - `Year Built`, `Year Remod/Add`, `Full Bath`, `TotRms AbvGrd`, `Fireplaces`, `years_remod`, `age`
# - Continuous variables:
#   - `1st Flr SF`, `Gr Liv Area`, `SalePrice`
#
# So far, it is difficult to see any plainly nominal variable among the columns above, and we cannot simply treat ordinal variables as nominal ones.
#
# Let's move on for now. A few questions (on nominal vs ordinal vs interval vs ratio variables) remain open for later iterations:
# - How should ordinal variables be handled, keeping in mind that we are fitting a linear regression?
# - Do the numerical features need scaling, and if so, how?
# - What should be done with the columns that represent years?

# In[45]:

# Select just the remaining text columns and convert them to categorical
text_cols = round2_transformed_data.select_dtypes(include=['object'])

for col in text_cols:
    round2_transformed_data[col] = round2_transformed_data[col].astype('category')

# Create dummy columns from the categorical columns
dummies = pd.get_dummies(round2_transformed_data.select_dtypes(include=['category']))

# Concatenate them back onto the DataFrame
round2_transformed_data = pd.concat([round2_transformed_data, dummies], axis=1)

# Drop the original text columns
round2_transformed_data = round2_transformed_data.drop(text_cols.columns, axis=1)

# Check
round2_transformed_data.shape
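# One refinement worth noting (not applied in this notebook): the dummy columns produced for a single
# categorical variable always sum to 1, which duplicates the regression intercept. `pd.get_dummies`
# can drop one level per variable to avoid that redundancy. A tiny toy illustration:

toy = pd.DataFrame({'quality': ['a', 'b', 'c', 'a']})
print(pd.get_dummies(toy))                   # three dummy columns, one per level
print(pd.get_dummies(toy, drop_first=True))  # two dummy columns, the redundant level dropped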
# ### Updating Our Function

# In[46]:

def select_features(df, coef_cutoff, myraid_cutoff, nunique_thresh):
    '''
    Replicates the feature selection done in the prior cells.
    It selects the features we define to be appropriate.

    Args:
        df: the DataFrame
        coef_cutoff: the minimum acceptable correlation coefficient with SalePrice
        myraid_cutoff: the maximum acceptable share of a column taken up by its most frequent value
        nunique_thresh: the maximum acceptable number of unique values in a column

    Returns:
        DataFrame with the requested changes and dummy variables for the categorical variables
    '''
    abs_corr_coeffs = df.select_dtypes(['integer', 'float']).corr()['SalePrice'].abs().sort_values(ascending=False)
    df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coef_cutoff].index, axis=1)

    nominal_features = df.select_dtypes(include='object').columns.tolist()

    myraid_dict = {}
    for col in nominal_features:
        myraid = df[col].value_counts(normalize=True, ascending=False).iloc[0]
        myraid_dict[col] = myraid
    for key, val in myraid_dict.items():
        if val > myraid_cutoff:
            df.drop(key, axis=1, inplace=True)
            nominal_features.remove(key)

    nunique_dict = {}
    for col in nominal_features:
        nunique_dict[col] = df[col].nunique()
    for key, val in nunique_dict.items():
        if val > nunique_thresh:
            df.drop(key, axis=1, inplace=True)
            nominal_features.remove(key)

    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols:
        df[col] = df[col].astype('category')
    df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols.columns, axis=1)

    return df

# In[47]:

# We updated transform_features()
round3_transformed_data = transform_features(data)

# We have updated select_features()
round3_selected_data = select_features(round3_transformed_data, 0.4, 0.85, 10)

round3_rmse_data = train_and_test(round3_selected_data)
print(round3_rmse_data)

# In[48]:

print('Our RMSE for this attempt is around {:.2f} and\nWith an improvement in RMSE of around USD {:.2f}\nThat is almost {:.1f}% improvement over the previous results'.
      format(round3_rmse_data,
             round2_rmse_data - round3_rmse_data,
             (round2_rmse_data - round3_rmse_data) / round2_rmse_data * 100))

# Having put some effort into improving our feature transformation and selection, let's move on to the training and testing.
#
# ---
# ## Round 4
# Now for the final part of the pipeline: training and testing. When iterating on different features, using simple holdout validation is a good idea.
# Let's add a parameter named k that controls the type of cross-validation that occurs.

# In[49]:

def train_and_test(data, k=0):
    '''
    Trains and tests the linear regression model, with k controlling the validation scheme.

    Args:
        data: the DataFrame
        k: 0 for holdout validation, 1 for simple two-fold cross-validation,
           otherwise the number of splits for k-fold cross-validation

    Returns:
        the RMSE (averaged over folds when cross-validating)
    '''
    # Take the numerical variables and separate features and target
    numeric_df = data.select_dtypes(include=['integer', 'float'])
    features = numeric_df.columns.drop('SalePrice')
    target = 'SalePrice'

    # Instantiate the LinearRegression() model
    lr = linear_model.LinearRegression()

    # Holdout validation
    if k == 0:
        train = data.iloc[:1460]
        test = data.iloc[1460:]

        lr.fit(train[features], train[target])
        predictions = lr.predict(test[features])

        mse = mean_squared_error(test[target], predictions)
        rmse = np.sqrt(mse)
        return rmse

    # Simple cross-validation
    elif k == 1:
        # Shuffle the rows and split them into two sets
        df_shuffled = data.sample(frac=1)
        train = df_shuffled.iloc[:1460]
        test = df_shuffled.iloc[1460:]

        # Fold 1: train on the first set, test on the second
        lr.fit(train[features], train[target])
        predictions_1 = lr.predict(test[features])
        mse_1 = mean_squared_error(test[target], predictions_1)
        rmse_1 = np.sqrt(mse_1)

        # Fold 2: train on the second set, test on the first
        lr.fit(test[features], test[target])
        predictions_2 = lr.predict(train[features])
        mse_2 = mean_squared_error(train[target], predictions_2)
        rmse_2 = np.sqrt(mse_2)

        # Print
        print('Set no.1 RMSE:', rmse_1)
        print('Set no.2 RMSE:', rmse_2)

        # Average
        avg_rmse = np.mean([rmse_1, rmse_2])
        return avg_rmse

    # K-fold cross-validation
    else:
        rmses = []
        kf = KFold(n_splits=k, shuffle=True)
        mses = cross_val_score(lr, data[features], data[target],
                               scoring="neg_mean_squared_error", cv=kf)
        for n in mses:
            root = abs(n)**0.5
            rmses.append(root)
        avg_rmse = np.mean(rmses)
        return avg_rmse

# In[59]:

# We updated train_and_test()
round4_transformed_data = transform_features(data)
round4_selected_data = select_features(round4_transformed_data, 0.4, 0.85, 10)
round4_rmse_data = train_and_test(round4_selected_data, 7)

print(round4_rmse_data)

# In[60]:

for i in range(20):
    round4_rmse_data = train_and_test(round4_selected_data, i)
    print(round4_rmse_data)

# #### ... in progress

# In[ ]:
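# A possible next step, sketched here only as an illustration (the cutoff values are arbitrary):
# since select_features() exposes its thresholds, we can sweep the correlation cutoff and compare
# cross-validated RMSEs to see whether the 0.4 value used above is actually the best choice.

# Illustrative sweep over the correlation cutoff, holding the other thresholds fixed
for cutoff in [0.3, 0.4, 0.5, 0.6]:
    selected = select_features(round4_transformed_data, cutoff, 0.85, 10)
    sweep_rmse = train_and_test(selected, 7)
    print('coef_cutoff =', cutoff, '-> RMSE:', sweep_rmse)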