## 1.1 Import Data and Required Packages
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
# Load the students-performance dataset from the working directory.
# Each row is one student: demographic columns plus three exam scores.
df = pd.read_csv('studentsperformance.csv')
df.head(1)
 | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score
---|---|---|---|---|---|---|---|---
0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
# Target is the math score; every remaining column is a feature.
y = df['math score']
X = df.drop(columns=['math score'])
X.head(1)
 | gender | race/ethnicity | parental level of education | lunch | test preparation course | reading score | writing score
---|---|---|---|---|---|---|---
0 | female | group B | bachelor's degree | standard | none | 72 | 74 |
y.head(1)
0 72 Name: math score, dtype: int64
# Create a ColumnTransformer with two transformers (one per dtype group).
# Split feature columns by dtype: numeric vs. categorical ("object").
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# Scale numeric columns to zero mean / unit variance; one-hot encode categoricals.
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)
# NOTE(review): fit_transform runs on the full X before the train/test split below,
# so the scaler sees test-set statistics (mild data leakage). Ideally fit the
# preprocessor on X_train only and transform X_test — confirm this is acceptable here.
X = preprocessor.fit_transform(X)
# separate dataset into train and test
from sklearn.model_selection import train_test_split
# 80/20 hold-out split; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape
((800, 19), (200, 19))
# Create an Evaluate Function to give all metrics after model Training
def evaluate_model(true, predicted):
    """Return (MAE, RMSE, R2) comparing predictions against ground truth."""
    abs_error = mean_absolute_error(true, predicted)
    root_mse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return abs_error, root_mse, r2
# Candidate regressors, keyed by display name.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list = []
# Fit every candidate and print train/test metrics.
# Iterating models.items() replaces the original
# `for i in range(len(list(models)))` loop, which rebuilt
# list(models.keys()) / list(models.values()) on every access.
for name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    print(name)
    model_list.append(name)
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print('----------------------------------')
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Keep the held-out R2 for the ranking table built later.
    r2_list.append(model_test_r2)
    print('='*35)
    print('\n')
Linear Regression Model performance for Training set - Root Mean Squared Error: 5.3243 - Mean Absolute Error: 4.2671 - R2 Score: 0.8743 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 5.3960 - Mean Absolute Error: 4.2158 - R2 Score: 0.8803 =================================== Lasso Model performance for Training set - Root Mean Squared Error: 6.5938 - Mean Absolute Error: 5.2063 - R2 Score: 0.8071 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 6.5197 - Mean Absolute Error: 5.1579 - R2 Score: 0.8253 =================================== Ridge Model performance for Training set - Root Mean Squared Error: 5.3233 - Mean Absolute Error: 4.2650 - R2 Score: 0.8743 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 5.3904 - Mean Absolute Error: 4.2111 - R2 Score: 0.8806 =================================== K-Neighbors Regressor Model performance for Training set - Root Mean Squared Error: 5.7077 - Mean Absolute Error: 4.5167 - R2 Score: 0.8555 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 7.2530 - Mean Absolute Error: 5.6210 - R2 Score: 0.7838 =================================== Decision Tree Model performance for Training set - Root Mean Squared Error: 0.2795 - Mean Absolute Error: 0.0187 - R2 Score: 0.9997 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 7.9066 - Mean Absolute Error: 6.3550 - R2 Score: 0.7431 =================================== Random Forest Regressor Model performance for Training set - Root Mean Squared Error: 2.2808 - Mean Absolute Error: 1.8341 - R2 Score: 0.9769 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 5.9168 - Mean Absolute Error: 4.5796 - R2 Score: 0.8561 =================================== XGBRegressor Model performance for Training set - Root Mean Squared Error: 0.9087 - Mean 
Absolute Error: 0.6148 - R2 Score: 0.9963 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 6.5889 - Mean Absolute Error: 5.0844 - R2 Score: 0.8216 =================================== CatBoosting Regressor Model performance for Training set - Root Mean Squared Error: 3.0427 - Mean Absolute Error: 2.4054 - R2 Score: 0.9589 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 6.0086 - Mean Absolute Error: 4.6125 - R2 Score: 0.8516 =================================== AdaBoost Regressor Model performance for Training set - Root Mean Squared Error: 5.7813 - Mean Absolute Error: 4.7602 - R2 Score: 0.8517 ---------------------------------- Model performance for Test set - Root Mean Squared Error: 6.0781 - Mean Absolute Error: 4.7711 - R2 Score: 0.8482 ===================================
# Rank the models by held-out R2, best first.
results = pd.DataFrame({'Model Name': model_list, 'R2_Score': r2_list})
results.sort_values(by=["R2_Score"], ascending=False)
 | Model Name | R2_Score
---|---|---
2 | Ridge | 0.880593 |
0 | Linear Regression | 0.880345 |
5 | Random Forest Regressor | 0.856131 |
7 | CatBoosting Regressor | 0.851632 |
8 | AdaBoost Regressor | 0.848181 |
1 | Lasso | 0.825320 |
6 | XGBRegressor | 0.821589 |
3 | K-Neighbors Regressor | 0.783813 |
4 | Decision Tree | 0.743094 |
## Linear Regression
# Refit plain linear regression as the final model and report test-set R2 as a percentage.
lin_model = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % score)
Accuracy of the model is 88.03
# Plot y_pred and y_test
# Scatter of actual vs. predicted math scores; points near the diagonal are good fits.
plt.scatter(y_test,y_pred);
plt.xlabel('Actual');
plt.ylabel('Predicted');
# Overlay a fitted regression line (red) with no confidence band (ci=None).
sns.regplot(x=y_test,y=y_pred,ci=None,color ='red');
# Per-row comparison of actual vs. predicted scores, plus the residual.
pred_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred})
pred_df['Difference'] = pred_df['Actual Value'] - pred_df['Predicted Value']
pred_df
 | Actual Value | Predicted Value | Difference
---|---|---|---
521 | 91 | 76.507812 | 14.492188 |
737 | 53 | 58.953125 | -5.953125 |
740 | 80 | 76.960938 | 3.039062 |
660 | 74 | 76.757812 | -2.757812 |
411 | 84 | 87.539062 | -3.539062 |
... | ... | ... | ... |
408 | 52 | 43.546875 | 8.453125 |
332 | 62 | 62.031250 | -0.031250 |
208 | 74 | 67.976562 | 6.023438 |
613 | 65 | 67.132812 | -2.132812 |
78 | 61 | 62.492188 | -1.492188 |
200 rows × 3 columns