# Multivariate-model walkthrough on the GSS survey data.
#
# The script draws a 100-row sample of GSS, fits single-predictor and
# two-predictor linear models of years of schooling, plots them with
# ggformula, and prints ANOVA tables with supernova(). A second part
# re-draws the sample and looks at categorical predictors.
#
# NOTE: the "#YOUR CODE HERE" placeholders whose results are used later in
# the script (father_model, sibling_model, father_resid, full_model) have
# been filled in so that the script parses and runs top to bottom. Exercise
# prompts that nothing else depends on are left open.

# Run this first so it's ready by the time you need it
install.packages("dplyr")
install.packages("supernova")
install.packages("ggformula")
library(dplyr)
library(supernova)
library(ggformula)

GSS <- read.csv(
  "https://raw.githubusercontent.com/smburns47/Psyc158/main/GSS.csv"
)
str(GSS)

set.seed(10)
# args are dataframe to sample from, and number of observations to sample
GSS_subset <- sample_n(GSS, 100)

# removing NA observations
# does it make sense what this filtering code is doing?
# Check out chapter 4 for a refresher
GSS_subset <- filter(
  GSS_subset,
  !is.na(highest_year_of_school_completed) &
    !is.na(highest_year_school_completed_father) &
    !is.na(number_of_brothers_and_sisters)
)

# Make a model that predicts highest years of schooling from father's years
# of education.
# Look at the variable list above to get exact variable names and use
# GSS_subset as the data.
father_model <- lm(
  highest_year_of_school_completed ~ highest_year_school_completed_father,
  data = GSS_subset
)

# Make a model that predicts highest years of schooling from number of
# siblings
sibling_model <- lm(
  highest_year_of_school_completed ~ number_of_brothers_and_sisters,
  data = GSS_subset
)

# Plotting father model
gf_jitter(
  highest_year_of_school_completed ~ highest_year_school_completed_father,
  data = GSS_subset
) %>%
  gf_lm()

# Plotting sibling_model
gf_jitter(
  highest_year_of_school_completed ~ number_of_brothers_and_sisters,
  data = GSS_subset
) %>%
  gf_lm()

# Generate the ANOVA tables for both father_model and sibling_model

# Plotting father model
gf_jitter(
  highest_year_of_school_completed ~ highest_year_school_completed_father,
  data = GSS_subset,
  size = 4
)

# Add in the color argument to the gf_jitter() function
gf_jitter(
  highest_year_of_school_completed ~ highest_year_school_completed_father,
  data = GSS_subset,
  size = 4
) %>%
  gf_lm()

# Save the residuals of father_model to a residual variable
# (the rows with NA in the model variables were filtered out above, so there
# is one residual per row of GSS_subset)
GSS_subset$father_resid <- resid(father_model)

# plotting these residuals against number of siblings
gf_jitter(
  father_resid ~ number_of_brothers_and_sisters,
  data = GSS_subset,
  size = 4
)

# use lm() to find the best fitting coefficients for our multivariate model
# and output the coefficient values
full_model <- lm(
  highest_year_of_school_completed ~
    highest_year_school_completed_father + number_of_brothers_and_sisters,
  data = GSS_subset
)
full_model

GSS_subset$education_predicted <- predict(full_model)
# keep only respondents with 0 or 7 siblings for the prediction plot
subsetsubset <- filter(
  GSS_subset,
  number_of_brothers_and_sisters == 0 | number_of_brothers_and_sisters == 7
)
gf_point(
  education_predicted ~ highest_year_school_completed_father,
  data = subsetsubset,
  color = "red",
  size = 5
)

# generate the ANOVA table for full_model

182.376 + 388.286

father_model
sibling_model

gf_jitter(
  highest_year_school_completed_father ~ number_of_brothers_and_sisters,
  data = GSS_subset,
  size = 4
)

cor(
  GSS_subset$highest_year_school_completed_father,
  GSS_subset$number_of_brothers_and_sisters
)

supernova(full_model)

supernova(full_model)

# resetting GSS_subset, since we deleted some rows earlier
# (the models fitted above are not refitted; they keep the earlier data)
set.seed(10)
GSS_subset <- sample_n(GSS, 100)

# check variable type of respondents_sex
str(GSS_subset$respondents_sex)

# check variable type of born_in_us
str(GSS_subset$born_in_us)

# table of respondents_sex values
table(GSS_subset$respondents_sex)

# table of born_in_us values
table(GSS_subset$born_in_us)

# Write some code that will output the results of a model with two
# categorical predictors

# table of race_of_respondent values

# model predicting education years with race of respondent

# number_of_children was saved as a character type because one response
# option is "8 or more" -
# always check your data types before modeling!
# as.numeric() turns entries that are not plain numbers (such as
# "8 or more") into NA and emits a coercion warning.
GSS_subset$number_of_children <- as.numeric(GSS_subset$number_of_children)

lm(number_of_children ~ general_happiness, data = GSS_subset)

gf_boxplot(
  number_of_children ~ general_happiness,
  data = GSS_subset,
  color = ~general_happiness
)

# converting general_happiness to new labels, and as numeric
GSS_subset$happiness_num <- as.numeric(
  recode(
    GSS_subset$general_happiness,
    "Not too happy" = "0",
    "Pretty happy" = "1",
    "Very happy" = "2"
  )
)

lm(number_of_children ~ happiness_num, data = GSS_subset)

gf_jitter(
  number_of_children ~ happiness_num,
  data = GSS_subset,
  width = 0.2,
  height = 0.2
) %>%
  gf_lm()

categ_model <- lm(number_of_children ~ general_happiness, data = GSS_subset)
continuous_model <- lm(number_of_children ~ happiness_num, data = GSS_subset)

supernova(categ_model)
supernova(continuous_model)

lm(
  highest_year_of_school_completed ~
    highest_year_school_completed_father + born_in_us,
  data = GSS_subset
)

supernova(father_model)

# use = "complete.obs": dealing with NAs
cor(
  GSS_subset$highest_year_school_completed_father,
  GSS_subset$highest_year_school_completed_mother,
  use = "complete.obs"
)

cor(
  GSS_subset$highest_year_of_school_completed,
  GSS_subset$highest_year_school_completed_mother,
  use = "complete.obs"
)

bothparents_model <- lm(
  highest_year_of_school_completed ~
    highest_year_school_completed_father +
    highest_year_school_completed_mother,
  data = GSS_subset
)
bothparents_model

supernova(bothparents_model)