#!/usr/bin/env python
# coding: utf-8

# ### Model Training

# 1.1 Import Data and Required Packages

# In[1]:


# Basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# In[2]:


# Modelling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


# In[3]:


df = pd.read_csv('studentsperformance.csv')


# In[4]:


df.head(1)


# In[5]:


# 'math score' is the regression target; every other column is a feature
X = df.drop('math score', axis=1)
y = df['math score']


# In[6]:


X.head(1)


# In[7]:


y.head(1)


# ### Pipeline

# In[8]:


# Create a ColumnTransformer with two transformers:
# one-hot encoding for categorical features, standard scaling for numeric ones
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


# In[9]:


X = preprocessor.fit_transform(X)


# In[10]:


# Separate the dataset into train and test splits
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape


# In[11]:


# Evaluation helper that returns all metrics after model training
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square


# In[12]:


models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate on the train and test sets
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('=' * 35)
    print('\n')


# In[13]:


pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"], ascending=False)
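
# RandomizedSearchCV is imported above but never exercised. The cell below is a
# minimal sketch of how it could tune the Random Forest from the comparison
# above; the parameter grid `rf_params`, `n_iter`, and `cv` are illustrative
# assumptions, not settings from the original run.

# In[ ]:


# Hypothetical search space -- an illustrative grid, not tuned values
rf_params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
}

# Sample 10 random combinations, scored by 3-fold cross-validated R2
rf_search = RandomizedSearchCV(
    RandomForestRegressor(),
    param_distributions=rf_params,
    n_iter=10,
    cv=3,
    scoring="r2",
    random_state=42,
)
rf_search.fit(X_train, y_train)
print(rf_search.best_params_, rf_search.best_score_)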
# ### Linear Regression

# In[14]:


lin_model = LinearRegression(fit_intercept=True)
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test, y_pred) * 100
print("R2 score of the model is %.2f" % score)


# In[15]:


# Plot predicted against actual values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')


# In[16]:


sns.regplot(x=y_test, y=y_pred, ci=None, color='red')


# In[17]:


pred_df = pd.DataFrame({'Actual Value': y_test,
                        'Predicted Value': y_pred,
                        'Difference': y_test - y_pred})
pred_df
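
# To score a brand-new student, the raw row must pass through the fitted
# `preprocessor` before it reaches `lin_model`. The record below is a minimal
# sketch: its column names assume the usual StudentsPerformance.csv schema and
# its values are made up, so adjust both to match the actual CSV.

# In[ ]:


# Hypothetical new student -- columns must match the training CSV, and the
# categorical values must have been seen during fit (the default OneHotEncoder
# raises on unseen categories)
new_student = pd.DataFrame([{
    'gender': 'female',
    'race/ethnicity': 'group B',
    'parental level of education': "bachelor's degree",
    'lunch': 'standard',
    'test preparation course': 'none',
    'reading score': 72,
    'writing score': 74,
}])

# Apply the already-fitted transformers, then predict the math score
new_student_transformed = preprocessor.transform(new_student)
print(lin_model.predict(new_student_transformed))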