url = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/Sleuth3/state_SAT.csv' SAT.df = read.csv(url, sep=',', row.names=1, header=TRUE) head(SAT.df) url = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/Sleuth3/discrim_employment.csv' salary = read.table(url, sep=',', header=TRUE) head(salary) # you may need to install `leaps` library # install.packages('leaps', repos='http://cloud.r-project.org') library(leaps) SAT.lm = lm(SAT ~ ., SAT.df) X_SAT = model.matrix(SAT.lm)[,-1] Y_SAT = SAT.df$SAT SAT.leaps = leaps(X_SAT, Y_SAT, nbest=5, method='r2') r2_idx = which((SAT.leaps$r2 == max(SAT.leaps$r2))) best.model.r2 = colnames(X_SAT)[SAT.leaps$which[r2_idx,]] best.model.r2 plot(SAT.leaps$size, SAT.leaps$r2, pch=23, bg='orange', cex=2, ylab='R^2', xlab='Model size') points(SAT.leaps$size[r2_idx], SAT.leaps$r2[r2_idx], pch=24, bg='red', cex=3) SAT.leaps = leaps(X_SAT, Y_SAT, nbest=5, method='adjr2') adjr2_idx = which((SAT.leaps$adjr2 == max(SAT.leaps$adjr2))) best.model.adjr2 = colnames(X_SAT)[SAT.leaps$which[adjr2_idx,]] best.model.adjr2 plot(SAT.leaps$size, SAT.leaps$adjr2, pch=23, bg='orange', cex=2, ylab='Adjusted R^2', xlab='Model size') points(SAT.leaps$size[adjr2_idx], SAT.leaps$adjr2[adjr2_idx], pch=24, bg='red', cex=3) SAT.leaps = leaps(X_SAT, Y_SAT, nbest=3, method='Cp') Cp_idx = which((SAT.leaps$Cp == min(SAT.leaps$Cp))) best.model.Cp = colnames(X_SAT)[SAT.leaps$which[Cp_idx,]] best.model.Cp plot(SAT.leaps$size, SAT.leaps$Cp, pch=23, bg='orange', cex=2, ylab='C_p', xlab='Model size') points(SAT.leaps$size[Cp_idx], SAT.leaps$Cp[Cp_idx], pch=24, bg='red', cex=3) n = nrow(X_SAT) p = length(coef(SAT.lm)) + 1 c(n * log(2*pi*sum(resid(SAT.lm)^2)/n) + n + 2*p, AIC(SAT.lm)) SAT.step.forward = step(lm(SAT ~ 1, SAT.df), list(upper = SAT.lm), direction='forward', trace=FALSE) names(coef(SAT.step.forward))[-1] SAT2.lm = lm(SAT ~ .^2, SAT.df) SAT2.step.forward = step(lm(SAT ~ 1, SAT.df), list(upper = SAT2.lm), direction='forward', trace=FALSE) names(coef(SAT2.step.forward))[-1] SAT.step.forward.BIC = step(lm(SAT ~ 1, SAT.df), list(upper=SAT.lm), direction='forward', k=log(nrow(SAT.df)), trace=FALSE) names(coef(SAT.step.forward.BIC))[-1] names(coef(SAT.step.forward))[-1] SAT.step.backward = step(lm(SAT ~ ., SAT.df), list(lower = ~ 1), direction='backward', trace=FALSE) names(coef(SAT.step.backward))[-1] names(coef(SAT.step.forward))[-1] D = with(salary, cbind(Senior, Age, Educ, Exper)) base.lm = lm(Bsal ~ poly(D, 2), data=salary) summary(base.lm)$coef X_sal = model.matrix(base.lm)[,-1] Y_sal = log(salary$Bsal) salary.leaps = leaps(X_sal, Y_sal, nbest=10, method='Cp') Cp_idx = which((salary.leaps$Cp == min(salary.leaps$Cp))) best.model.Cp = salary.leaps$which[Cp_idx,] colnames(X_sal)[best.model.Cp] plot(salary.leaps$size, salary.leaps$Cp, pch=23, bg='orange', cex=2, ylim=c(0, 15)) points(salary.leaps$size[Cp_idx], salary.leaps$Cp[Cp_idx], pch=24, bg='red', cex=3) summary(lm(log(Bsal) ~ Sex + X_sal[,best.model.Cp], data=salary))$coef['SexMale',] summary(lm(log(Bsal) ~ Sex + X_sal[,best.model.Cp][,-4], data=salary))$coef['SexMale',] step_df = with(salary, data.frame(Bsal, X_sal)) step_lm = lm(log(Bsal) ~ ., step_df) step_sel = step(lm(log(Bsal) ~ 1, data=step_df), list(upper=step_lm), trace=FALSE) names(coef(step_sel))[-1] url_X = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/NRTI_X.csv' url_Y = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/NRTI_Y.txt' X_HIV = read.table(url_X, header=FALSE, sep=',') Y_HIV = read.table(url_Y, header=FALSE, sep=',') Y_HIV = as.matrix(Y_HIV)[,1] X_HIV = as.matrix(X_HIV) dim(X_HIV) D = data.frame(X_HIV, Y_HIV) M = lm(Y_HIV ~ ., data=D) M0 = lm(Y_HIV ~ 1, data=D) system.time(MF <- step(M0, list(lower=M0, upper=M), trace=FALSE, direction='both')) names(coef(MF)) system.time(MB <- step(M, list(lower=M0, upper=M), trace=FALSE, direction='both')) names(coef(MB))