#!/usr/bin/env python
# coding: utf-8

# # Boston house price prediction
#
# Modified from: https://www.kaggle.com/shreayan98c/boston-house-price-prediction

# The problem we are going to solve: given a set of features that describe
# housing in a Boston town or suburb, our machine learning model must predict
# the house price. To train the model we use scikit-learn's Boston housing
# dataset.
#
# In this dataset, each row describes a Boston town or suburb. There are 506
# rows and 13 attributes (features) with a target column (price).
# https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names

# ## Loading the data

# In[1]:

# Importing the libraries
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# `get_ipython` only exists inside an IPython/Jupyter session. Guard the magic
# so the exported script also runs under a plain Python interpreter, where the
# default matplotlib backend is simply kept.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# In[2]:

# Importing the Boston Housing dataset.
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2.
# On newer releases the import raises ImportError, so fall back to rebuilding
# the same arrays from the original source, following the recipe given in
# scikit-learn's deprecation notice (requires network access).
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    _BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    # Each record is spread over two physical rows in the raw file: the first
    # row holds 11 features, the second holds 2 features and the target.
    _raw = pd.read_csv(_BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([_raw.values[::2, :], _raw.values[1::2, :2]]),
        target=_raw.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
        DESCR=(
            "Boston house prices dataset, loaded from " + _BOSTON_URL + ".\n"
            "Attribute descriptions: https://archive.ics.uci.edu/ml/"
            "machine-learning-databases/housing/housing.names"
        ),
    )
else:
    boston = load_boston()

# ## Getting a "feeling of the data"

# In[3]:

print(boston.keys())  # print keys

# In[4]:

# Initializing the dataframe
data = pd.DataFrame(boston.data)

# In[5]:

print(boston.DESCR)

# In[6]:

# See head of the dataset
data.head()

# In[7]:

# Adding the feature names to the dataframe
data.columns = boston.feature_names
data.head()

# Each record in the database describes a Boston suburb or town.
# In[8]:

# Adding target variable to dataframe
data['PRICE'] = boston.target  # Median value of owner-occupied homes in $1000s

# In[9]:

# Check the shape of dataframe
data.shape

# In[10]:

data.columns

# In[11]:

data.dtypes

# In[12]:

# Identifying the number of unique values in each column
data.nunique()

# In[13]:

# Check for missing values
data.isnull().sum()

# In[14]:

# See rows with missing values
data[data.isnull().any(axis=1)]

# In[15]:

# Viewing the data statistics
print(data.describe())

# In[16]:

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# One box plot per column: 13 features + PRICE = 14 panels on a 2 x 7 grid.
fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, column in zip(axs, data.columns):
    sns.boxplot(y=column, data=data, ax=ax)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

# Columns like CRIM, ZN, RM and B seem to have outliers. Let's see the
# percentage of outliers in every column.

# In[17]:


def _outlier_percentage(values):
    """Return the percentage of entries of *values* outside the 1.5*IQR fences.

    The comparison is strict: a value lying exactly on a fence is not an
    outlier. This matters for columns whose interquartile range is zero
    (both fences collapse onto the quartile), where a non-strict test would
    flag every row.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
    return is_outlier.sum() * 100.0 / len(values)


for column, values in data.items():
    print(f"Column {column} outliers = {_outlier_percentage(values):.2f}%")

# Let's see what the feature distributions look like

# In[52]:

fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, column in zip(axs, data.columns):
    sns.histplot(data[column], kde=True, ax=ax)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

# In[20]:

# Finding out the (absolute) correlation between the features
corr = data.corr().abs()
corr.shape

# In[21]:

# Plotting the heatmap of correlation between features
plt.figure(figsize=(20, 20))
sns.heatmap(corr, cbar=True, square=True, fmt='.1f', annot=True,
            annot_kws={'size': 15})

# Plot columns with a correlation score above 0.5 with PRICE against PRICE

# In[22]:

from sklearn import preprocessing

# Let's scale the columns to [0, 1] before plotting them against PRICE
min_max_scaler = preprocessing.MinMaxScaler()
column_sels = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
x = data.loc[:, column_sels]
y = data['PRICE']
x = pd.DataFrame(data=min_max_scaler.fit_transform(x), columns=column_sels)
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, column in zip(axs, column_sels):
    sns.regplot(y=y, x=x[column], ax=ax)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

# ## Prediction using the data (a first example to machine learning applications)

# In[23]:

# Splitting target variable and independent variables
X = data.drop(['PRICE'], axis=1)
y = data['PRICE']

# In[24]:

# Splitting to training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4)


def _print_regression_metrics(y_true, y_hat, n_features):
    """Print R^2, adjusted R^2, MAE, MSE and RMSE; return the R^2 score.

    *n_features* is the number of predictors used by the model; it is only
    needed for the adjusted R^2 correction.
    """
    r2 = metrics.r2_score(y_true, y_hat)
    mse = metrics.mean_squared_error(y_true, y_hat)
    n_samples = len(y_true)
    print('R^2:', r2)
    print('Adjusted R^2:',
          1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1))
    print('MAE:', metrics.mean_absolute_error(y_true, y_hat))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    return r2


def _plot_fit_diagnostics(y_true, y_hat):
    """Show actual-vs-predicted prices, then predicted-vs-residuals."""
    # Visualizing the differences between actual prices and predicted values
    plt.scatter(y_true, y_hat)
    plt.xlabel("Prices")
    plt.ylabel("Predicted prices")
    plt.title("Prices vs Predicted prices")
    plt.show()

    # Checking residuals
    plt.scatter(y_hat, y_true - y_hat)
    plt.title("Predicted vs residuals")
    plt.xlabel("Predicted")
    plt.ylabel("Residuals")
    plt.show()


# ### Linear regression

# #### Training the model

# In[25]:

# Import library for Linear Regression
from sklearn.linear_model import LinearRegression

# Create a Linear regressor
lm = LinearRegression()

# Train the model using the training sets
lm.fit(X_train, y_train)

# In[26]:

# Value of y intercept
lm.intercept_

# In[27]:

# Collecting the coefficient values in a dataframe, one row per feature
coeffs = pd.DataFrame({'Attribute': X_train.columns,
                       'Coefficients': lm.coef_})
coeffs

# #### Model Evaluation

# In[28]:

# Model prediction on train data
y_pred = lm.predict(X_train)

# In[29]:

# Model Evaluation
_print_regression_metrics(y_train, y_pred, X_train.shape[1])

# R^2: It is a measure of the linear relationship between X and Y. It is
# interpreted as the proportion of the variance in the dependent variable that
# is predictable from the independent variables.
#
# Adjusted R^2: The adjusted R-squared compares the explanatory power of
# regression models that contain different numbers of predictors.
#
# MAE: It is the mean of the absolute value of the errors. It measures the
# difference between two continuous variables, here actual and predicted
# values of y.
#
# MSE: The mean square error (MSE) is just like the MAE, but squares the
# differences before averaging them instead of using the absolute value.
#
# RMSE: The root mean square error (RMSE) is the square root of the MSE, which
# brings the error back to the units of the target (here $1000s).

# In[30]:
# In[31]:

_plot_fit_diagnostics(y_train, y_pred)

# There is no pattern visible in the residual plot and values are distributed
# equally around zero. So the linearity assumption is satisfied.

# In[53]:

# Checking Normality of errors
sns.histplot(y_train - y_pred, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

# Here the residuals are normally distributed. So the normality assumption is
# satisfied.

# #### For test data

# In[33]:

# Predicting Test data with the model
y_test_pred = lm.predict(X_test)

# In[34]:

# Model Evaluation
acc_linreg = _print_regression_metrics(y_test, y_test_pred, X_test.shape[1])

# Here the model evaluation scores almost match those of the train data. So
# the model is not overfitting.

# ### Random Forest Regressor

# #### Train the model

# In[35]:

# Import Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor. The forest is seeded (same seed as the
# train/test split) so that the reported scores are reproducible between runs.
reg = RandomForestRegressor(random_state=4)

# Train the model using the training sets
reg.fit(X_train, y_train)

# ### Model Evaluation

# In[36]:

# Model prediction on train data
y_pred = reg.predict(X_train)

# In[37]:

# Model Evaluation
_print_regression_metrics(y_train, y_pred, X_train.shape[1])

# In[38]:
# In[39]:

_plot_fit_diagnostics(y_train, y_pred)

# #### For test data

# In[40]:

# Predicting Test data with the model
y_test_pred = reg.predict(X_test)

# In[41]:

# Model Evaluation
acc_rf = _print_regression_metrics(y_test, y_test_pred, X_test.shape[1])

# ### SVM Regressor

# In[42]:

# Creating scaled set to be used in model to improve our results.
# The scaler is fitted on the training set only and then applied to the test
# set. NOTE: X_train / X_test are rebound to the scaled arrays from here on.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# #### Train the model

# In[43]:

# Import SVM Regressor
from sklearn import svm

# Create a SVM Regressor
reg = svm.SVR()

# In[44]:

# Train the model using the training sets
reg.fit(X_train, y_train)

# C : float, optional (default=1.0): Regularization parameter. The strength of
# the regularization is inversely proportional to C; it controls the trade-off
# between a smooth fitted function and fitting the training points closely.
#
# kernel : string, optional (default='rbf'): Selects the kernel type used by
# the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid',
# 'precomputed' or a callable.
#
# degree : int, optional (default=3): Degree of the polynomial kernel function
# ('poly'). Ignored by all other kernels.
#
# gamma : {'scale', 'auto'} or float: Kernel coefficient for the non-linear
# kernels. The higher the gamma value, the more closely the model tries to fit
# the training data set. Since scikit-learn 0.22 the default is 'scale', which
# uses 1 / (n_features * X.var()); 'auto' uses 1 / n_features.
#
# coef0 : float, optional (default=0.0): Independent term in kernel function.
# It is only significant in 'poly' and 'sigmoid'.
#
# shrinking : boolean, optional (default=True): Whether to use the shrinking
# heuristic.
# #### Model Evaluation

# In[45]:

# Predict on the (scaled) training data with the fitted SVR
y_pred = reg.predict(X_train)

# In[46]:

# Training-set metrics; R^2 and MSE are computed once and reused
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)
train_report = (
    ('R^2:', train_r2),
    ('Adjusted R^2:',
     1 - (1 - train_r2) * (len(y_train) - 1)
     / (len(y_train) - X_train.shape[1] - 1)),
    ('MAE:', metrics.mean_absolute_error(y_train, y_pred)),
    ('MSE:', train_mse),
    ('RMSE:', np.sqrt(train_mse)),
)
for label, value in train_report:
    print(label, value)

# In[47]:

# Actual prices against the prices predicted by the model
plt.scatter(y_train, y_pred)
plt.title("Prices vs Predicted prices")
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.show()

# In[48]:

# Residuals (actual minus predicted) against the predictions
train_residuals = y_train - y_pred
plt.scatter(y_pred, train_residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Predicted vs residuals")
plt.show()

# #### For test data

# In[49]:

# Predict on the held-out test data
y_test_pred = reg.predict(X_test)

# In[50]:

# Test-set metrics; the R^2 score is kept for the final comparison table
acc_svm = metrics.r2_score(y_test, y_test_pred)
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
test_report = (
    ('R^2:', acc_svm),
    ('Adjusted R^2:',
     1 - (1 - acc_svm) * (len(y_test) - 1)
     / (len(y_test) - X_test.shape[1] - 1)),
    ('MAE:', metrics.mean_absolute_error(y_test, y_test_pred)),
    ('MSE:', test_mse),
    ('RMSE:', np.sqrt(test_mse)),
)
for label, value in test_report:
    print(label, value)

# # Evaluation and comparison of all the models

# In[51]:

# Test-set R^2 of every model, expressed in percent and ranked best-first
r2_by_model = {
    'Linear Regression': acc_linreg,
    'Random Forest': acc_rf,
    'Support Vector Machines': acc_svm,
}
models = pd.DataFrame({
    'Model': list(r2_by_model),
    'R-squared Score': [score * 100 for score in r2_by_model.values()],
})
models.sort_values(by='R-squared Score', ascending=False)