url = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/Sleuth3/state_SAT.csv'
SAT.df = read.csv(url, sep=',', row.names=1, header=TRUE)
head(SAT.df)

url = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/Sleuth3/discrim_employment.csv'
salary = read.table(url, sep=',', header=TRUE)
head(salary)

# you may need to install `leaps` library
# install.packages('leaps', repos='http://cloud.r-project.org')
library(leaps) 

SAT.lm = lm(SAT ~ ., SAT.df)
X_SAT = model.matrix(SAT.lm)[,-1]
Y_SAT = SAT.df$SAT

SAT.leaps = leaps(X_SAT, Y_SAT, nbest=5, method='r2')
r2_idx = which((SAT.leaps$r2 == max(SAT.leaps$r2)))
best.model.r2 = colnames(X_SAT)[SAT.leaps$which[r2_idx,]]
best.model.r2

plot(SAT.leaps$size, 
     SAT.leaps$r2, 
     pch=23, 
     bg='orange', 
     cex=2,
     ylab='R^2',
     xlab='Model size')
points(SAT.leaps$size[r2_idx],
       SAT.leaps$r2[r2_idx],
       pch=24,
       bg='red', 
       cex=3)

SAT.leaps = leaps(X_SAT, Y_SAT, nbest=5, method='adjr2')
adjr2_idx = which((SAT.leaps$adjr2 == max(SAT.leaps$adjr2)))
best.model.adjr2 = colnames(X_SAT)[SAT.leaps$which[adjr2_idx,]]
best.model.adjr2

plot(SAT.leaps$size, 
     SAT.leaps$adjr2, 
     pch=23, bg='orange', 
     cex=2,
     ylab='Adjusted R^2',
     xlab='Model size')
points(SAT.leaps$size[adjr2_idx],
       SAT.leaps$adjr2[adjr2_idx],
       pch=24, bg='red', cex=3)

SAT.leaps = leaps(X_SAT, Y_SAT, nbest=3, method='Cp')
Cp_idx = which((SAT.leaps$Cp == min(SAT.leaps$Cp)))
best.model.Cp = colnames(X_SAT)[SAT.leaps$which[Cp_idx,]]
best.model.Cp

plot(SAT.leaps$size,
     SAT.leaps$Cp,
     pch=23,
     bg='orange',
     cex=2,
     ylab='C_p',
     xlab='Model size')
points(SAT.leaps$size[Cp_idx],
       SAT.leaps$Cp[Cp_idx],
       pch=24, 
       bg='red', 
       cex=3)

n = nrow(X_SAT)
p = length(coef(SAT.lm)) + 1
c(n * log(2*pi*sum(resid(SAT.lm)^2)/n) + n + 2*p, AIC(SAT.lm))

SAT.step.forward = step(lm(SAT ~ 1, SAT.df), 
                        list(upper = SAT.lm),
                        direction='forward', 
                        trace=FALSE)
names(coef(SAT.step.forward))[-1]

SAT2.lm = lm(SAT ~ .^2, SAT.df)
SAT2.step.forward = step(lm(SAT ~ 1, SAT.df), 
                        list(upper = SAT2.lm),
                        direction='forward', 
                        trace=FALSE)
names(coef(SAT2.step.forward))[-1]

SAT.step.forward.BIC = step(lm(SAT ~ 1, SAT.df), 
                            list(upper=SAT.lm),
                            direction='forward', 
                            k=log(nrow(SAT.df)),
                            trace=FALSE)
names(coef(SAT.step.forward.BIC))[-1]

names(coef(SAT.step.forward))[-1]

SAT.step.backward = step(lm(SAT ~ ., SAT.df), 
                        list(lower = ~ 1),
                        direction='backward', 
                        trace=FALSE)
names(coef(SAT.step.backward))[-1]

names(coef(SAT.step.forward))[-1]

D = with(salary, cbind(Senior, Age, Educ, Exper))
base.lm = lm(Bsal ~ poly(D, 2), data=salary)
summary(base.lm)$coef

X_sal = model.matrix(base.lm)[,-1]
Y_sal = log(salary$Bsal)
salary.leaps = leaps(X_sal, Y_sal, nbest=10, method='Cp')
Cp_idx = which((salary.leaps$Cp == min(salary.leaps$Cp)))
best.model.Cp = salary.leaps$which[Cp_idx,]
colnames(X_sal)[best.model.Cp]

plot(salary.leaps$size,
     salary.leaps$Cp,
     pch=23,
     bg='orange',
     cex=2, 
     ylim=c(0, 15))
points(salary.leaps$size[Cp_idx],
       salary.leaps$Cp[Cp_idx],
       pch=24,
       bg='red',
       cex=3)

summary(lm(log(Bsal) ~ Sex + X_sal[,best.model.Cp], data=salary))$coef['SexMale',]

summary(lm(log(Bsal) ~ Sex + X_sal[,best.model.Cp][,-4], data=salary))$coef['SexMale',]

step_df = with(salary, data.frame(Bsal, X_sal))
step_lm = lm(log(Bsal) ~ ., step_df)
step_sel = step(lm(log(Bsal) ~ 1, data=step_df),
                list(upper=step_lm),
		trace=FALSE)
names(coef(step_sel))[-1]

url_X = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/NRTI_X.csv'
url_Y = 'https://raw.githubusercontent.com/StanfordStatistics/stats191-data/main/NRTI_Y.txt'
X_HIV = read.table(url_X, header=FALSE, sep=',')
Y_HIV = read.table(url_Y, header=FALSE, sep=',')
Y_HIV = as.matrix(Y_HIV)[,1]
X_HIV = as.matrix(X_HIV)
dim(X_HIV)

D = data.frame(X_HIV, Y_HIV)
M = lm(Y_HIV ~ ., data=D)
M0 = lm(Y_HIV ~ 1, data=D)
system.time(MF <- step(M0,
                 list(lower=M0,
		      upper=M), 
                 trace=FALSE,
		 direction='both'))
names(coef(MF))

system.time(MB <- step(M,
                 list(lower=M0,
		      upper=M), 
                 trace=FALSE,
		 direction='both'))
names(coef(MB))