#!/usr/bin/env python # coding: utf-8 # # Imports # In[1]: import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib.pyplot import subplots from ISLP import load_data import sklearn.model_selection as skm from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import OneHotEncoder from sklearn.pipeline import Pipeline from ISLP.models import \ (ModelSpec as MS, Stepwise, sklearn_selected) from statsmodels.api import OLS from sklearn.neighbors import RadiusNeighborsRegressor import sklearn.linear_model as skl from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score # # Q1 # In[2]: # Load data knn_data = pd.read_csv('KNN.csv') X_knn, Y_knn = knn_data.loc[:, 'X'].values.reshape(-1,1), knn_data.loc[:, ' Y'] # In[3]: # Pre-processing to scale X values scaler = StandardScaler(with_mean=True, with_std=True) # 5-fold CV with shuffle kfold = skm.KFold(n_splits=10, random_state=0, shuffle=True) # Pipeline entails scaling then fitting nearest neighbors averaging knn_avg = RadiusNeighborsRegressor() pipe = Pipeline([('scaler', scaler), ('knn_avg', knn_avg)]) # Grid of tau values to check tau_values = np.linspace(0.1, 10, num=100) param_grid = {'knn_avg__radius':tau_values} # Grid search grid = skm.GridSearchCV(pipe, param_grid, cv=kfold, scoring="neg_mean_squared_error") grid.fit(X_knn, Y_knn) # In[4]: # Plot results plt.figure(figsize=(10, 6)) plt.errorbar(tau_values, -grid.cv_results_['mean_test_score'], yerr=-grid.cv_results_['std_test_score'], fmt='o-', ecolor='r', capsize=2) plt.xlabel(r'Neighborhood radius ($\tau$)') plt.ylabel('Estimated Test Error (MSE)') plt.title(r'Test Error of Nearest Neighbors Averaging vs. $\tau$') plt.grid(True) plt.show() # # Q4 # ## (a) # In[5]: college_data = load_data('college') # One-hot encoding manually for categorical variable college_data['Private'] = college_data['Private'].apply(lambda x: 1 if x == 'Yes' else 0) design = MS(college_data.columns.drop('Apps')).fit(college_data) X_college, y_college = design.transform(college_data), college_data['Apps'] X_train, X_test, y_train, y_test = skm.train_test_split(X_college, y_college, test_size=0.30, random_state=42) # ## (b) # In[6]: # Function for getting training and test error from fitted model def fitted_eval(model, X_train, y_train, X_test, y_test): y_train_pred = model.predict(X_train) y_test_pred = model.predict(X_test) rmse_train = root_mean_squared_error(y_train, y_train_pred) rmse_test = root_mean_squared_error(y_test, y_test_pred) mae_train = mean_absolute_error(y_train, y_train_pred) mae_test = mean_absolute_error(y_test, y_test_pred) r2_train = r2_score(y_train, y_train_pred) r2_test = r2_score(y_test, y_test_pred) return {'rmse_train': rmse_train, 'rmse_test': rmse_test, 'mae_train': mae_train, 'mae_test': mae_test, 'r2_train': r2_train, 'r2_test': r2_test} # In[7]: # Fit linear model using least squares and get test/train error linreg = OLS(y_train, X_train).fit() linreg_res = fitted_eval(linreg, X_train, y_train, X_test, y_test) linreg_res # ## (c) # In[8]: strategy = Stepwise.first_peak(design, direction='forward', max_terms=len(design.terms)) # In[9]: forward_mse = sklearn_selected(OLS, strategy) forward_mse.fit(X_train, y_train) forward_mse_res = fitted_eval(forward_mse, X_train, y_train, X_test, y_test) forward_mse_res # In[10]: forward_mse.selected_state_ # In[24]: X_train.columns # In[11]: # Scoring function for (negative) AIC for linear regression def negAIC(estimator, X, Y): "Negative AIC" n, p = X.shape Yhat = estimator.predict(X) MSE = np.mean((Y - Yhat)**2) return n + n * np.log(MSE) + 2 * (p + 1) # In[12]: forward_aic = sklearn_selected(OLS, strategy, scoring=negAIC) forward_aic.fit(X_train, y_train) forward_aic_res = fitted_eval(forward_aic, X_train, y_train, X_test, y_test) forward_aic_res # In[13]: forward_aic.selected_state_ # In[14]: # Scoring function for (negative) BIC for linear regression def negBIC(estimator, X, Y): "Negative AIC" n, p = X.shape Yhat = estimator.predict(X) MSE = np.mean((Y - Yhat)**2) return - (p + 1) * np.log(n) - n * np.log(MSE / n) # In[25]: forward_bic = sklearn_selected(OLS, strategy, scoring=negBIC) forward_bic.fit(X_train, y_train) forward_bic_res = fitted_eval(forward_bic, X_train, y_train, X_test, y_test) forward_bic_res # In[16]: forward_bic.selected_state_ # ## (d) # In[17]: lambdas = 10**np.linspace(8, -2, 100) / y_train.std() ridgeCV = skl.ElasticNetCV(alphas=lambdas, l1_ratio=0, cv=kfold) piperidgeCV = Pipeline(steps=[('scaler', scaler), ('ridge', ridgeCV)]) piperidgeCV.fit(X_train, y_train) fitted_eval(piperidgeCV, X_train, y_train, X_test, y_test) # In[18]: X_train.columns[piperidgeCV.named_steps['ridge'].coef_ != 0] # ## (e) # In[19]: lassoCV = skl.ElasticNetCV(alphas=lambdas, l1_ratio=1, cv=kfold) pipelassoCV = Pipeline(steps=[('scaler', scaler), ('lasso', lassoCV)]) pipelassoCV.fit(X_train, y_train) fitted_eval(pipelassoCV, X_train, y_train, X_test, y_test) # In[20]: X_train.columns[pipelassoCV.named_steps['lasso'].coef_ != 0]