! pip install plotly==5.3.1
! pip install numpy
! pip install pandas
! pip install pathlib
! pip install wget
! pip install scikit-learn
Requirement already satisfied: plotly==5.3.1 in c:\users\alilavaee\anaconda3\lib\site-packages (5.3.1) Requirement already satisfied: tenacity>=6.2.0 in c:\users\alilavaee\anaconda3\lib\site-packages (from plotly==5.3.1) (8.0.1) Requirement already satisfied: six in c:\users\alilavaee\anaconda3\lib\site-packages (from plotly==5.3.1) (1.15.0) Requirement already satisfied: numpy in c:\users\alilavaee\anaconda3\lib\site-packages (1.20.1) Requirement already satisfied: pandas in c:\users\alilavaee\anaconda3\lib\site-packages (1.2.4) Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\alilavaee\anaconda3\lib\site-packages (from pandas) (2.8.1) Requirement already satisfied: numpy>=1.16.5 in c:\users\alilavaee\anaconda3\lib\site-packages (from pandas) (1.20.1) Requirement already satisfied: pytz>=2017.3 in c:\users\alilavaee\anaconda3\lib\site-packages (from pandas) (2021.1) Requirement already satisfied: six>=1.5 in c:\users\alilavaee\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0) Requirement already satisfied: pathlib in c:\users\alilavaee\anaconda3\lib\site-packages (1.0.1) Requirement already satisfied: wget in c:\users\alilavaee\anaconda3\lib\site-packages (3.2) Requirement already satisfied: scikit-learn in c:\users\alilavaee\anaconda3\lib\site-packages (0.24.1) Requirement already satisfied: scipy>=0.19.1 in c:\users\alilavaee\anaconda3\lib\site-packages (from scikit-learn) (1.6.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\alilavaee\anaconda3\lib\site-packages (from scikit-learn) (2.1.0) Requirement already satisfied: joblib>=0.11 in c:\users\alilavaee\anaconda3\lib\site-packages (from scikit-learn) (1.0.1) Requirement already satisfied: numpy>=1.13.3 in c:\users\alilavaee\anaconda3\lib\site-packages (from scikit-learn) (1.20.1)
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from pathlib import Path
import wget
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# remote location of the dataset and the local file it is cached in
url = 'https://raw.githubusercontent.com/lavaman131/Linear-Regression-Tutorial/main/real_estate_costs.csv'
csv_path = Path('real_estate_costs.csv')
# download only when no local copy exists yet
if not csv_path.is_file():
    filename = wget.download(url)
# load dataset
realEstate_prices = pd.read_csv(csv_path)
x = 'X2 house age' # explanatory variable
y = 'Y house price of unit area' # response variable
# keep only the two columns used by the regression
realEstate_prices = realEstate_prices[[x, y]]
# preview the first rows (displayed as the cell's result)
realEstate_prices.head()
| | X2 house age | Y house price of unit area |
---|---|---|
0 | 32.0 | 37.9 |
1 | 19.5 | 42.2 |
2 | 13.3 | 47.3 |
3 | 13.3 | 54.8 |
4 | 5.0 | 43.1 |
# scatter trace of the raw observations: explanatory column on x, response column on y
data_points = go.Scatter(
    x=realEstate_prices[x],
    y=realEstate_prices[y],
    mode='markers',
    marker={'color': 'blue'},
    name='data',
)
# start from an empty figure and attach the scatter trace to it
fig = go.Figure()
fig.add_trace(data_points)
# label both axes
fig.update_layout(xaxis_title="house age", yaxis_title="house price")
# correlation coefficient between the two columns: np.corrcoef returns a
# 2x2 matrix, and the off-diagonal entry [0, 1] is the x-vs-y correlation
corr_matrix = np.corrcoef(realEstate_prices[x], realEstate_prices[y])
r = corr_matrix[0, 1]
print(f'r: {r}')
r: -0.21056704627721692
# design matrix: a column of ones (intercept term) followed by the house-age column
house_age = np.array(realEstate_prices['X2 house age']).reshape(-1,1)
X = np.hstack([np.ones(house_age.shape), house_age])
# response vector (house price of unit area)
y = np.array(realEstate_prices['Y house price of unit area'])
# split data into 80-20 train-test split; the fixed seed makes the split,
# and therefore the score and plots derived from it, reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# build the linear regression model and fit it on the training split
# (`fit` returns the estimator itself, so both steps chain into one statement)
reg = LinearRegression().fit(X_train, y_train)
# coefficient of determination (r^2) of the fitted model on the held-out test split
reg.score(X_test, y_test)
0.0809438039240935
# model predictions for the held-out test rows
y_pred = reg.predict(X_test)
# column 1 of X_test is plotted on the x-axis against the predictions
test_age = X_test[:, 1]
best_fit = go.Scatter(
    x=test_age,
    y=y_pred,
    line_shape='linear',
    marker={'color': 'red'},
    name='best fit line',
)
# overlay the fitted line on the existing scatter figure
fig.add_trace(best_fit)
# residual = observed value minus model prediction, for each test row
residual = y_test - y_pred
# column 1 of X_test is the x-axis for the residual plot
test_age = X_test[:, 1]
fig2 = go.Figure()
# residual of every test row against its x value
fig2.add_trace(go.Scatter(x=test_age, y=residual, mode='markers',
                          marker=dict(color='blue'), name='residual'))
# horizontal zero reference line, extended 5 units past the data on both sides;
# y is sized from x so both arrays always have the same length
zero_x = np.arange(test_age.min()-5, test_age.max()+5)
fig2.add_trace(go.Scatter(x=zero_x, y=np.zeros_like(zero_x), line_shape='linear',
                          marker=dict(color='red'), name='residual line'))
# pad the y-range by 20 on each side; the y-axis shows residuals, not raw prices
fig2.update_layout(yaxis_range=[residual.min()-20,residual.max()+20],
                   xaxis_title="house age", yaxis_title="residual")