# Run this first so it's ready by the time you need it
install.packages("supernova")
install.packages("dplyr")
install.packages("ggformula")
library(supernova)
library(dplyr)
library(ggformula)
GSS <- read.csv("https://raw.githubusercontent.com/smburns47/Psyc158/main/GSS.csv")

#smaller dataset, more similar in size to most psychology studies
set.seed(10)
GSS_subset <- slice_sample(GSS, n=100)

model_obj <- lm(highest_year_of_school_completed ~ highest_year_school_completed_mother +
                highest_year_school_completed_father, data = GSS_subset)
summary(model_obj)

model_summary <- summary(model_obj)
model_summary$r.squared

supernova(model_obj)

#creating an empty vector of 1000 spots
null_R2 <- vector(length=1000)

#generate 1000 unique samples, saving each R2
for (i in 1:1000) {
    GSS_subset$shuffled_mother <- sample(x=GSS_subset$highest_year_school_completed_mother, 
                                         size=length(GSS_subset$highest_year_school_completed_mother), 
                                         replace=FALSE)
    GSS_subset$shuffled_father <- sample(x=GSS_subset$highest_year_school_completed_father, 
                                         size=length(GSS_subset$highest_year_school_completed_father), 
                                         replace=FALSE)
    model <- lm(highest_year_of_school_completed ~ shuffled_mother + shuffled_father, data=GSS_subset)
    null_R2[i] <- summary(model)$r.squared
}

R2_df <- data.frame(null_R2)
gf_histogram( ~ null_R2, data=R2_df)

#creating an empty vector of 1000 spots
null_b1 <- vector(length=1000)

#generate 1000 unique samples, saving each b1
for (i in 1:1000) {
    GSS_subset$shuffled_mother <- sample(x=GSS_subset$highest_year_school_completed_mother, 
                                         size=length(GSS_subset$highest_year_school_completed_mother), 
                                         replace=FALSE)
    model <- lm(highest_year_of_school_completed ~ shuffled_mother, data=GSS_subset)
    null_b1[i] <- model$coefficients[[2]]
}

b1_df <- data.frame(null_b1)

#cut-off values for extremeness
b1s_sd <- sd(b1_df$null_b1)
high_cutoff <- sd(b1_df$null_b1)*1.96
low_cutoff <- sd(b1_df$null_b1)*-1.96

#marking something as extreme if it is greater than 1.96*sd or less than -1.96*sd
b1_df$extreme <- b1_df$null_b1 > high_cutoff | b1_df$null_b1 < low_cutoff

gf_histogram(~ null_b1, data = b1_df, fill = ~extreme)

simple_model <- lm(highest_year_of_school_completed ~ highest_year_school_completed_mother, data=GSS_subset)
b1 <- simple_model$coefficients[[2]]

quantile(null_b1, 0.025)
quantile(null_b1, 0.975)
b1
b1 > quantile(null_b1, 0.975) | b1 < quantile(null_b1, 0.025)

full_model <- lm(highest_year_of_school_completed ~ highest_year_school_completed_mother +
                highest_year_school_completed_father, data = GSS_subset)
R2 <- summary(full_model)$r.squared

quantile(null_R2, 0.95)
R2 
R2 > quantile(null_R2, 0.95)

summary(model_obj)

#Fit a linear model predicting highest_year_of_school_completed with just highest_year_school_completed_father
#in GSS_subset
father_model <- #YOUR CODE HERE

summary(father_model)


#Fit a linear model predicting highest_year_of_school_completed with highest_year_school_completed_father
# and respondents_sex
full_model <- #YOUR CODE HERE

summary(full_model)


#R-squared
model_summary$r.squared

#adjusted R-quares
model_summary$adj.r.squared

install.packages('performance')
library(performance)

test_performance(father_model, full_model)