#!/usr/bin/env python
# coding: utf-8

# # Boston house price prediction
# 
# Modified from: https://www.kaggle.com/shreayan98c/boston-house-price-prediction

# The problem that we are going to solve here is that given a set of features that describe a house in Boston, our machine learning model must predict the house price. To train our machine learning model with boston housing data, we will be using scikit-learn’s boston dataset.
# 
# In this dataset, each row describes a boston town or suburb. There are 506 rows and 13 attributes (features) with a target column (price).
# https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names

# ## Loading the data

# In[1]:


# Importing the libraries 
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
# `%matplotlib inline` only makes sense inside IPython/Jupyter, and
# `get_ipython` is not imported or defined anywhere in this file, so it is
# only resolvable when the interpreter session provides it.  Look it up
# defensively so the exported script also runs under plain `python`.
try:
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('matplotlib', 'inline')


# In[2]:


# Importing the Boston Housing dataset
# NOTE(review): `load_boston` has reportedly been deprecated (scikit-learn 1.0)
# and removed (scikit-learn 1.2) -- confirm that the installed scikit-learn
# version still provides it before running this script.
from sklearn.datasets import load_boston
# The rest of the script reads `keys()`, `data`, `DESCR`, `feature_names`
# and `target` from the object returned here.
boston = load_boston()


# ## Getting a "feeling of the data"

# In[3]:


# List the entries available on the loaded dataset object
dataset_keys = boston.keys()
print(dataset_keys)


# In[4]:


# Build the dataframe from the raw feature matrix (default column labels for now)
data = pd.DataFrame(data=boston.data)


# In[5]:


# Print the textual description that ships with the dataset
print(boston.DESCR)


# In[6]:


# Preview the first five rows
data.head(n=5)


# In[7]:


# Label the columns with the dataset's feature names, then preview again
data.columns = boston.feature_names
data.head(n=5)


# Each record in the database describes a Boston suburb or town.

# In[8]:


# Append the target as an extra column.
# PRICE is the median value of owner-occupied homes in $1000s.
data['PRICE'] = boston.target


# In[9]:


# Dimensions of the dataframe as (rows, columns)
data.shape


# In[10]:


# Column labels, now including PRICE
data.columns


# In[11]:


# Data type of every column
data.dtypes


# In[12]:


# Number of distinct values per column
data.nunique()


# In[13]:


# Count of missing values per column
data.isna().sum()


# In[14]:


# Show only the rows that contain at least one missing value
rows_with_gaps = data.isna().any(axis="columns")
data.loc[rows_with_gaps]


# In[15]:


# Summary statistics for every column
summary = data.describe()
print(summary)


# In[16]:


import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# One box plot per dataframe column, laid out on a 2 x 7 grid of axes.
fig, axs = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
axs = axs.flatten()
for slot, column in enumerate(data.columns):
    sns.boxplot(y=column, data=data, ax=axs[slot])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)


# Columns such as CRIM, ZN, RM and B seem to have outliers. Let's look at the percentage of outliers in every column.

# In[17]:


# Percentage of outliers per column, using the 1.5 * IQR rule.
for k, v in data.items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    iqr = q3 - q1
    # Only values strictly beyond a fence count as outliers.  With inclusive
    # comparisons a value lying exactly on a fence would be flagged, and a
    # column whose IQR is 0 (q1 == q3) would report every row as an outlier.
    is_outlier = (v < q1 - 1.5 * iqr) | (v > q3 + 1.5 * iqr)
    perc = is_outlier.sum() * 100.0 / len(v)
    print("Column %s outliers = %.2f%%" % (k, perc))


# Let's see what the feature distributions look like

# In[52]:


#import warnings
#warnings.filterwarnings('ignore')

# One histogram with a KDE overlay per dataframe column, on a 2 x 7 grid of axes.
fig, axs = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
axs = axs.flatten()
for slot, (column, values) in enumerate(data.items()):
    sns.histplot(values, kde=True, ax=axs[slot])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)


# In[20]:


# Pairwise correlations between all columns, with the sign discarded
signed_corr = data.corr()
corr = signed_corr.abs()
corr.shape


# In[21]:


# Draw the absolute correlations as an annotated heatmap
heatmap_options = dict(
    annot=True,
    fmt='.1f',
    annot_kws={'size': 15},
    square=True,
    cbar=True,
)
plt.figure(figsize=(20, 20))
sns.heatmap(corr, **heatmap_options)
#sns.heatmap(corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='Greens')


# Plot columns with a correlation score above 0.5 with PRICE against PRICE

# In[22]:


from sklearn import preprocessing
# Min-max scale the selected columns before plotting them against PRICE,
# so that all eight panels share a comparable x range.
min_max_scaler = preprocessing.MinMaxScaler()
column_sels = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
y = data['PRICE']
x = pd.DataFrame(data=min_max_scaler.fit_transform(data[column_sels]), columns=column_sels)
# 2 x 4 grid: one regression plot (scaled feature vs. PRICE) per selected column
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, k in zip(axs, column_sels):
    sns.regplot(y=y, x=x[k], ax=ax)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)


# ## Prediction using the data (a first example to machine learning applications)

# In[23]:


# Separate the target (PRICE) from the predictors (every other column)
y = data['PRICE']
X = data.drop(columns=['PRICE'])


# In[24]:


# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable

from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.3, random_state=4)
X_train, X_test, y_train, y_test = split_parts


# ### Linear regression

# #### Training the model

# In[25]:


# Import library for Linear Regression
from sklearn.linear_model import LinearRegression

# Create a Linear regressor
lm = LinearRegression()

# Train the model using the training sets
lm.fit(X_train, y_train)


# In[26]:


# Value of y intercept
lm.intercept_


# In[27]:


# Pair every feature name with its fitted coefficient.
# Building the frame column-by-column from a dict keeps 'Coefficients' numeric;
# stacking names and numbers as two rows and transposing them would leave
# both columns with the generic object dtype.
coeffs = pd.DataFrame({'Attribute': X_train.columns, 'Coefficients': lm.coef_})
coeffs


# #### Model Evaluation

# In[28]:


# Predictions of the fitted linear model on the rows it was trained on
y_pred = lm.predict(X_train)


# In[29]:


# Training-set metrics; R^2 and MSE are computed once and reused
n_rows = len(y_train)
n_features = X_train.shape[1]
r2 = metrics.r2_score(y_train, y_pred)
mse = metrics.mean_squared_error(y_train, y_pred)
print('R^2:', r2)
print('Adjusted R^2:', 1 - (1 - r2) * (n_rows - 1) / (n_rows - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))


# 𝑅^2 : It is a measure of the linear relationship between X and Y. It is interpreted as the proportion of the variance in the dependent variable that is predictable from the independent variable.
# 
# Adjusted 𝑅^2 :The adjusted R-squared compares the explanatory power of regression models that contain different numbers of predictors.
# 
# MAE : It is the mean of the absolute value of the errors. It measures the difference between two continuous variables, here actual and predicted values of y. 
# 
# MSE: The mean square error (MSE) is just like the MAE, but squares the difference before summing them all instead of using the absolute value. 
# 
# RMSE: The root mean square error (RMSE) is the square root of the MSE, which expresses the error in the same units as the target variable.
# 
# 
# 
# 
# 

# In[30]:


# Scatter of actual training prices against the model's predictions
plt.scatter(y_train, y_pred)
plt.title("Prices vs Predicted prices")
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.show()


# In[31]:


# Residuals (actual minus predicted) plotted against the predicted values
residuals = y_train - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Predicted vs residuals")
plt.show()


# There is no visible pattern in this plot and the values are distributed evenly around zero, so the linearity assumption is satisfied.

# In[53]:


# Distribution of the training residuals, with a KDE overlay
residuals = y_train - y_pred
sns.histplot(residuals, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Histogram of Residuals")
plt.show()


# Here the residuals look approximately normally distributed, so the normality assumption is reasonably satisfied.

# #### For test data

# In[33]:


# Predictions of the linear model on the held-out test rows
y_test_pred = lm.predict(X_test)


# In[34]:


# Test-set metrics; the R^2 score is kept in acc_linreg for the final comparison
acc_linreg = metrics.r2_score(y_test, y_test_pred)
n_test = len(y_test)
test_dof = n_test - X_test.shape[1] - 1
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
print('R^2:', acc_linreg)
print('Adjusted R^2:', 1 - (1 - acc_linreg) * (n_test - 1) / test_dof)
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))


# Here the model evaluations scores are almost matching with that of train data. So the model is not overfitting.

# ### Random Forest Regressor 

# #### Train the model

# In[35]:


# Import Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor.
# A fixed random_state makes the fitted forest, and therefore every metric
# printed for it, repeatable from run to run -- in line with the seeded
# train/test split earlier in this script.
reg = RandomForestRegressor(random_state=4)

# Train the model using the training sets
reg.fit(X_train, y_train)


# ### Model Evaluation

# In[36]:


# Predictions of the fitted random forest on its own training rows
y_pred = reg.predict(X_train)


# In[37]:


# Training-set metrics for the random forest
sample_count = len(y_train)
feature_count = X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)
print('R^2:', train_r2)
print('Adjusted R^2:', 1 - (1 - train_r2) * (sample_count - 1) / (sample_count - feature_count - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))


# In[38]:


# Scatter of actual training prices against the random forest predictions
plt.scatter(y_train, y_pred)
plt.title("Prices vs Predicted prices")
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.show()


# In[39]:


# Residuals (actual minus predicted) plotted against the predicted values
residuals = y_train - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Predicted vs residuals")
plt.show()


# #### For test data

# In[40]:


# Predictions of the random forest on the held-out test rows
y_test_pred = reg.predict(X_test)


# In[41]:


# Test-set metrics; the R^2 score is kept in acc_rf for the final comparison
acc_rf = metrics.r2_score(y_test, y_test_pred)
n_test = len(y_test)
test_dof = n_test - X_test.shape[1] - 1
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
print('R^2:', acc_rf)
print('Adjusted R^2:', 1 - (1 - acc_rf) * (n_test - 1) / test_dof)
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))


# ### SVM Regressor

# In[42]:


# Standardise the features for the SVM: the scaler is fitted on the training
# rows only and then applied to both splits.  Note that X_train and X_test
# are overwritten with the scaled versions from here on.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


# #### Train the model

# In[43]:


# Import SVM Regressor
from sklearn import svm

# Support vector regressor with the library's default settings
reg = svm.SVR()


# In[44]:


# Fit the regressor on the scaled training data
reg.fit(X_train, y_train)


# C : float, optional (default=1.0): The penalty parameter of the error term. For regression it controls the trade-off between a smooth (flat) fitted function and fitting the training points closely.
# 
# kernel : string, optional (default='rbf’): kernel parameters selects the type of hyperplane used to separate the data. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed’ or a callable.
# 
# degree : int, optional (default=3): Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
# 
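# gamma : float or {'scale', 'auto'}, optional: Kernel coefficient for non-linear kernels. The higher the gamma value, the more closely the model tries to fit the training data set. 'auto' uses 1 / n_features. NOTE: recent scikit-learn releases (0.22 and later) default to 'scale' rather than 'auto' -- check the documentation of the installed version.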
# 
# coef0 : float, optional (default=0.0): Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.
# 
# shrinking : boolean, optional (default=True): Whether to use the shrinking heuristic.

# #### Model Evaluation

# In[45]:


# Predictions of the fitted SVR on the (scaled) training rows
y_pred = reg.predict(X_train)


# In[46]:


# Training-set metrics for the SVR
sample_count = len(y_train)
feature_count = X_train.shape[1]
train_r2 = metrics.r2_score(y_train, y_pred)
train_mse = metrics.mean_squared_error(y_train, y_pred)
print('R^2:', train_r2)
print('Adjusted R^2:', 1 - (1 - train_r2) * (sample_count - 1) / (sample_count - feature_count - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))


# In[47]:


# Scatter of actual training prices against the SVR predictions
plt.scatter(y_train, y_pred)
plt.title("Prices vs Predicted prices")
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.show()


# In[48]:


# Residuals (actual minus predicted) plotted against the predicted values
residuals = y_train - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Predicted vs residuals")
plt.show()


# #### For test data

# In[49]:


# Predictions of the SVR on the (scaled) held-out test rows
y_test_pred = reg.predict(X_test)


# In[50]:


# Test-set metrics; the R^2 score is kept in acc_svm for the final comparison
acc_svm = metrics.r2_score(y_test, y_test_pred)
n_test = len(y_test)
test_dof = n_test - X_test.shape[1] - 1
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
print('R^2:', acc_svm)
print('Adjusted R^2:', 1 - (1 - acc_svm) * (n_test - 1) / test_dof)
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))


# # Evaluation and comparison of all the models

# In[51]:


# Collect the test-set R^2 of each model (as a percentage) and rank them, best first
model_names = ['Linear Regression', 'Random Forest', 'Support Vector Machines']
r2_percentages = [score * 100 for score in (acc_linreg, acc_rf, acc_svm)]
models = pd.DataFrame({'Model': model_names, 'R-squared Score': r2_percentages})
models.sort_values('R-squared Score', ascending=False)