#!/usr/bin/env python
# coding: utf-8

# # **Multiple Regression**

# ## Reading the data

# Let's look at a dataset pertaining to housing. It contains house sale prices in King County for houses sold between May 2014 and May 2015. In addition to the price of each house, the dataset also contains a number of attributes, or features, of these houses.
# In this case study, we will use **Multiple Regression** to predict the prices of houses given the feature information.

# In[1]:

import pandas as pd
import numpy as np

df = pd.read_csv('housing_data.csv')
df.head()

# In[2]:

df.shape

# As we see from the data, in addition to the price information we have 11 features for each of the 21613 houses. Examples of features are the number of bedrooms, the number of bathrooms, the size of the living area, the size of the lot, the condition of the property, and so on.

# ## Simple Linear Regression

# Let's start by using just one feature and implementing a Simple Linear Regression model.
# We know that the size of the living area can be a potential indicator of the price. First, let's visualize how the living-area size relates to the price.

# In[3]:

ax = df.plot.scatter(x='sqft_living', y='price', title='Price vs Living-Area size (in sqft)')

# We're going to be doing a lot of regressions, so why not make a function?

# In[4]:

from sklearn.linear_model import LinearRegression

def do_regression(X, y):
    # Linear regression using the entire dataset
    lin_model = LinearRegression()
    lin_model.fit(X, y)
    # Print the intercept, coefficients, and R^2 value
    print("Intercept:", lin_model.intercept_)
    print("Coefficients:", lin_model.coef_)
    print("R-squared value:", lin_model.score(X, y))
    return lin_model

# Regression using a single feature

# In[5]:

# Fitting a simple linear regression model on the "whole" dataset
X = df[['sqft_living']]
y = df['price']
lin_model = do_regression(X, y)

# This implies that, based on just the one feature "sqft_living", the fitted model is
#
#     price = -43580.74 + 280.62 * sqft_living
#
# i.e., a baseline of -43580.74 plus about $280.62 per square foot of living area.
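# As a quick sanity check of that formula, we can compute a prediction by hand from the intercept and coefficient and compare it with the model's output. (A minimal sketch: the 2000 sq. ft. input below is just an illustrative value, not taken from the dataset.)

# In[ ]:

sqft = 2000  # hypothetical living-area size, for illustration only
manual_price = lin_model.intercept_ + lin_model.coef_[0] * sqft
model_price = lin_model.predict(pd.DataFrame({'sqft_living': [sqft]}))[0]
print(manual_price, model_price)  # the two values should agree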
# However, we see that the R-squared is 0.49, indicating that this feature alone may not be able to give us a model that fits our data well.
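# For intuition about that number: R-squared can be computed by hand as 1 - RSS/TSS, where RSS is the model's residual sum of squares and TSS is the total sum of squares around the mean price. A minimal sketch, reusing X, y, and lin_model from above:

# In[ ]:

rss = ((y - lin_model.predict(X)) ** 2).sum()  # residual sum of squares
tss = ((y - y.mean()) ** 2).sum()              # total sum of squares
print("R-squared (by hand):", 1 - rss / tss)   # should match lin_model.score(X, y)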
# Let's visualize how well our model fits the data!

# In[6]:

y_pred = lin_model.predict(X)
df_pred = pd.DataFrame({'sqft_living': X.sqft_living, 'orig_price': y, 'pred_price': y_pred})

ax = df_pred.plot.scatter(x='sqft_living', y='orig_price');
df_pred.plot.scatter(x='sqft_living', y='pred_price', color='red', ax=ax);
ax.set_ylabel("price");

# Let's also calculate the error on the training data (a.k.a. the residuals) and see how it is distributed. Note that we are using the whole dataset for training here. We'll compute the residuals and then plot a histogram.

# In[7]:

residuals = y_pred - y
residuals.plot.hist(bins=100, xlim=(-1e6,1e6), figsize=(10,5), title='distribution of the residuals');

# **Remember:** By definition, regression picks the line whose residuals have zero mean and whose "sum of squares", or RSS, is as small as possible. Since the residuals have zero mean, their variance is just the RSS divided by the number of points, so minimizing the RSS also minimizes the standard deviation of the residuals. Let's verify this:

# In[8]:

residuals.describe()

# Using a simple linear regression model to predict the price of houses with 'sqft_living' as the only feature doesn't seem to be doing too well. Let's use more features and see if we get a better fit to our data.

# ## Multiple Regression

# We will now feed more features into a regression model and see if our R-squared improves.
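# As a first, rough screen of candidate features, we can check each one's linear correlation with the price. (A minimal sketch; correlation only captures linear relationships, so treat it as a guide rather than a selection rule. The target's correlation with itself is dropped.)

# In[ ]:

# numeric_only requires pandas >= 1.5; on older versions, plain df.corr() behaves similarly
df.corr(numeric_only=True)['price'].drop('price').sort_values(ascending=False)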
# But how do we select our features? Let's look at what each of our features looks like when plotted against the price.

# In[9]:

# Looking at what other features we have
df.columns

# In[10]:

# Price is our target variable, and the other columns are our features
target = 'price'
features = df.columns[1:]

for feat in features:
    df.plot.scatter(x=feat, y=target);

# Let's try using four features.

# In[11]:

X = df[['sqft_living', 'grade', 'bathrooms', 'bedrooms']]
y = df['price']
do_regression(X, y);

# We see that our R-squared value improved, but can we do even better? Let's use all the features and see.

# In[12]:

X = df.iloc[:,1:]
y = df['price']
lin_model = do_regression(X, y);

# Let's make a histogram of the residuals like we did before.

# In[13]:

y_pred = lin_model.predict(X)
residuals2 = y_pred - y

ax = residuals.plot.hist(bins=100, xlim=(-1e6,1e6), figsize=(10,5), title='distribution of the residuals', alpha=0.5);
residuals2.plot.hist(bins=100, xlim=(-1e6,1e6), figsize=(10,5), title='distribution of the residuals', ax=ax, alpha=0.5);
ax.legend(['using one feature','using all features']);

residuals2.describe()

# We see that in this case, using all the features gives us a better Multiple Regression fit to the data.

# ---

# # **In-Class Quiz**

# ## Question 1

# > Import the `housing_data.csv` and fit a linear regression model to the data using just the `sqft_living` feature, and the `price` as target. Predict the price of the house when `sqft_living=2750`.

# In[ ]:

# Code provided already
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

df = pd.read_csv('housing_data.csv')
X = df[['sqft_living']]
y = df['price']
X_test = [[2750]]

# In[ ]:

# Your code here

# ## Question 2

# > Import the `housing_data.csv`. First fit a Multiple Regression model with `grade`, `bathrooms`, and `bedrooms` as features and `price` as the target. Find the R-squared value of the fit using the data you trained with.

# In[ ]:

# Your code here