Notebook

Mathematical equation for the multiple linear regression

In [ ]:

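# y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# where y is the predicted value, b0 is the intercept, and b1..bn are the
# coefficients of the input features x1..xn.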

The code for linear regression

In [7]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Read the CSV into a DataFrame. Every column except the last is used as a
# predictor; the last column is the regression target.
df = pd.read_csv('/Users/alksnk/Downloads/winequality-red.csv')
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Ordinary least-squares model, fitted on the training rows only.
regressor = LinearRegression().fit(X_train, y_train)

# Score the held-out rows. Despite its name, `accuracy` holds the R^2 score
# returned by r2_score, not a classification accuracy.
y_pred = regressor.predict(X_test)
accuracy = r2_score(y_test, y_pred)

# Actual vs. predicted scatter. The dashed red diagonal spans the range of the
# actual test values and marks where prediction == actual.
fig, ax = plt.subplots()
diag_min, diag_max = y_test.min(), y_test.max()
ax.scatter(y_test, y_pred)
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', lw=2)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Linear Regression Results (Accuracy: {:.2f}%)'.format(accuracy * 100))
plt.show()

Application of Ridge regression to improve the accuracy

In [14]:

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Read the CSV into a DataFrame. Every column except the last is used as a
# predictor; the last column is the regression target.
df = pd.read_csv('/Users/alksnk/Downloads/winequality-red.csv')
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Same 80/20 split and seed as the plain linear-regression cell, so both models
# are evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Ridge regression: alpha sets the regularization strength; a larger alpha
# constrains the coefficients more strongly.
# NOTE(review): the features are passed in unscaled; the penalty is sensitive to
# feature scale, so standardizing first may be worth trying - confirm.
regressor = Ridge(alpha=0.4).fit(X_train, y_train)

# Score the held-out rows. Despite its name, `accuracy` holds the R^2 score
# returned by r2_score, not a classification accuracy.
y_pred = regressor.predict(X_test)
accuracy = r2_score(y_test, y_pred)

# Actual vs. predicted scatter. The dashed red diagonal spans the range of the
# actual test values and marks where prediction == actual.
fig, ax = plt.subplots()
diag_min, diag_max = y_test.min(), y_test.max()
ax.scatter(y_test, y_pred)
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--', lw=2)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Ridge Regression Results (Accuracy: {:.2f}%)'.format(accuracy * 100))
plt.show()

Application of feature engineering to improve the accuracy

In [26]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Load the dataset from a CSV file
df = pd.read_csv('/Users/alksnk/Downloads/winequality-red.csv')

# The regression target is the last column of the raw file, exactly as in the
# linear and Ridge cells above. Its name is captured *before* new columns are
# appended, because the engineered features end up after it in the frame.
# (Previously this cell predicted 'citric acid' from two acidity columns, which
# is a different task, so its score could not be compared with the baselines.)
target_column = df.columns[-1]

# Create new features by transforming existing ones.
# np.log assumes 'fixed acidity' is strictly positive - TODO confirm for this data.
df['volatile acidity_squared'] = df['volatile acidity'] ** 2
df['fixed acidity_log'] = np.log(df['fixed acidity'])

# Split the dataset into training and testing sets.
# Predictors = every original feature plus the engineered ones; only the target
# column is excluded. Same test size and seed as the baseline cells.
X = df.drop(columns=[target_column]).values
y = df[target_column].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the linear regression model on the training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the test set results and score them. Despite its name, `accuracy`
# holds the R^2 score returned by r2_score, not a classification accuracy.
y_pred = regressor.predict(X_test)
accuracy = r2_score(y_test, y_pred)

# Plot the actual vs predicted values; the dashed red diagonal marks where
# prediction == actual.
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Linear Regression Results with Feature Engineering (Accuracy: {:.2f}%)'.format(accuracy * 100))
plt.show()