library(IRdisplay) display_html( '
' ) shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr))) shhh({ library(tidyverse) # Modeling library(brms) library(lme4) library(tidybayes) set.seed(5) # Tables: library(gt) library(gtsummary) }) options(repr.plot.width = 15, repr.plot.height = 10) # collect all new topic tool attempts and saves # Add a join with talk_page_edit to ensure all saved edits are included due to bug https://phabricator.wikimedia.org/T305541 query <- " --find all edit attempts WITH edit_attempts AS ( SELECT wiki AS wiki, event.user_id as user_id, event.editing_session_id as edit_attempt_id, event.bucket AS experiment_group, event.is_oversample AS is_oversample, event.integration AS editing_method, If(event.integration == 'discussiontools', 1, 0) AS new_topic_tool_used, CASE WHEN event.init_type = 'section' AND event.integration == 'discussiontools' THEN 'new_topic_tool' WHEN event.init_type = 'section' AND event.integration == 'page' AND event.init_mechanism IN ('url-new', 'new') THEN 'new_section_link' ELSE 'NA' -- check to make sure all edit types accounted for in above list END AS section_edit_type, event.user_editcount AS experience_level FROM event.editattemptstep WHERE -- only in participating wikis wiki IN ('amwiki', 'bnwiki', 'zhwiki', 'nlwiki', 'arzwiki', 'frwiki', 'hewiki', 'hiwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki', 'omwiki', 'fawiki', 'plwiki', 'ptwiki', 'eswiki', 'thwiki', 'ukwiki', 'viwiki') -- since deployment AND year = 2022 AND ((month = 01 and day >= 27) OR (month = 02) OR (month = 03 and day <= 25)) -- remove bots AND useragent.is_bot = false -- look at only desktop events AND event.platform = 'desktop' -- review all talk namespaces AND event.page_ns % 2 = 1 -- only users in AB test AND event.bucket IN ('test', 'control') -- only registered user AND event.user_id != 0 AND event.action = 'init' -- discard VE/Wikieditor edits to create new page or reply tool edits AND NOT ( -- not a reply tool edit (event.init_type = 'page' AND event.integration = 'discussiontools') OR -- not an wikitext edit to create a new page (event.init_type = 'page' AND event.init_mechanism IN ('url-new', 'new') AND event.integration = 'page') OR -- not a corrective edit to an existing section (event.init_type = 'section' AND event.init_mechanism IN ('click', 'url') AND event.integration == 'page') OR -- not a full page edit (event.init_type = 'page' AND event.init_mechanism IN ('click', 'url') AND event.integration = 'page') )), -- find all published comments published_dt_new_topics AS ( SELECT performer.user_id AS user_id, session_id AS edit_save_id, `database` AS wiki FROM event.mediawiki_talk_page_edit WHERE year = 2022 AND ((month = 01 and day >= 27) OR (month = 02) OR (month = 03 and day <= 25)) -- only in participating wikis AND `database` IN ('amwiki', 'bnwiki', 'zhwiki', 'nlwiki', 'arzwiki', 'frwiki', 'hewiki', 'hiwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki', 'omwiki', 'fawiki', 'plwiki', 'ptwiki', 'eswiki', 'thwiki', 'ukwiki', 'viwiki') ), published_section_link_new_topics AS ( SELECT event.user_id as user_id, event.editing_session_id AS edit_save_id, wiki AS wiki FROM event.editattemptstep WHERE -- only in participating wikis wiki IN ('amwiki', 'bnwiki', 'zhwiki', 'nlwiki', 'arzwiki', 'frwiki', 'hewiki', 'hiwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki', 'omwiki', 'fawiki', 'plwiki', 'ptwiki', 'eswiki', 'thwiki', 'ukwiki', 'viwiki') AND year = 2022 AND ((month = 01 and day >= 27) OR (month = 02) OR (month = 03 and day <= 25)) AND event.action = 'saveSuccess' ) -- main query SELECT eas.wiki, eas.user_id, edit_attempt_id, experiment_group, is_oversample, editing_method, new_topic_tool_used, section_edit_type, -- was saved in either talk page edit or editattemptstep IF ((section_edit_type = 'new_topic_tool' AND (tpe_save.edit_save_id IS NOT NULL OR eas_save.edit_save_id IS NOT NULL)) OR (section_edit_type = 'new_section_link' AND (tpe_save.edit_save_id IS NOT NULL OR eas_save.edit_save_id IS NOT NULL)), 1, 0) AS edit_success, experience_level FROM edit_attempts eas LEFT JOIN published_dt_new_topics tpe_save ON eas.edit_attempt_id = tpe_save.edit_save_id AND eas.wiki = tpe_save.wiki LEFT JOIN published_section_link_new_topics eas_save ON eas.edit_attempt_id = eas_save.edit_save_id AND eas.wiki = eas_save.wiki " new_topic_attempts <- wmfdata::query_hive(query) # data reformatting and cleanup #set factor levels with correct baselines new_topic_attempts$section_edit_type <- factor( new_topic_attempts$section_edit_type, levels = c("NA", "full_page_edits", "new_section_link", "new_topic_tool"), labels = c("NA", "Full page edits", "Existing add new section link", "New topic tool") ) new_topic_attempts$edit_success <- factor( new_topic_attempts$edit_success, levels = c(0, 1), labels = c("Not Complete", "Complete") ) # reformat user-id and adjust to include wiki to account for duplicate user id instances. # Users do not have the smae user_id on different wikis new_topic_attempts$user_id <- as.character(paste(new_topic_attempts$user_id, new_topic_attempts$wiki, sep ="-")) #clarfiy wiki names new_topic_attempts <- new_topic_attempts %>% mutate( wiki = case_when( #clarfiy participating project names wiki == 'amwiki' ~ "Amharic Wikipedia", wiki == 'bnwiki' ~ "Bengali Wikipedia", wiki == 'zhwiki' ~ "Chinese Wikipedia", wiki == 'nlwiki' ~ 'Dutch Wikipedia', wiki == 'arzwiki' ~ 'Egyptian Wikipedia', wiki == 'frwiki' ~ 'French Wikipedia', wiki == 'hewiki' ~ 'Hebrew Wikipedia', wiki == 'hiwiki' ~ 'Hindi Wikipedia', wiki == 'idwiki' ~ 'Indonesian Wikipedia', wiki == 'itwiki' ~ 'Italian Wikipedia', wiki == 'jawiki' ~ 'Japanese Wikipedia', wiki == 'kowiki' ~ 'Korean Wikipedia', wiki == 'omwiki' ~ 'Oromo Wikipedia', wiki == 'fawiki' ~ 'Persian Wikipedia', wiki == 'plwiki' ~ 'Polish Wikipedia', wiki == 'ptwiki' ~ 'Portuguese Wikipedia', wiki == 'eswiki' ~ 'Spanish Wikipedia', wiki == 'thwiki' ~ 'Thai Wikipedia', wiki == 'ukwiki' ~ 'Ukrainian Wikipedia', wiki == 'viwiki' ~ 'Vietnamese Wikipedia' ) ) # Create new column to identify Junior and Non-Junior Contributors new_topic_attempts <- new_topic_attempts %>% mutate( is_junior = case_when( #clarfiy participating project names experience_level < 100 ~ "Junior Contributor", experience_level >= 100 ~ "Non-Junior Contributor" ), is_junior = factor(is_junior, levels = c("Non-Junior Contributor", "Junior Contributor") )) new_topic_attempts_bygroup <- new_topic_attempts %>% filter(is_oversample == 'false') %>% #All Discussion Tool events are oversampled - removing to check balance. group_by(experiment_group) %>% summarise(users = n_distinct(user_id), attempts = n_distinct(edit_attempt_id), .groups = 'drop') new_topic_attempts_bygroup # Completion Rate By Session new_topic_attempts_jc_bysession <- new_topic_attempts %>% filter(is_junior == 'Junior Contributor') %>% #only jc group_by (wiki, section_edit_type) %>% summarise(n_attempts = n_distinct(edit_attempt_id), n_completions = n_distinct(edit_attempt_id[edit_success == 'Complete']), #n_completions = sum(edit_success == 'Complete'), new_topic_tool_used = as.integer(ifelse(sum(section_edit_type== 'New topic tool'), 1, 0)), .groups = 'drop') # By session new_topic_attempts_jc_bysession_all <- new_topic_attempts_jc_bysession %>% group_by(section_edit_type) %>% summarise(n_attempts = sum(n_attempts), n_attempts_completed = sum(n_completions), completion_rate = paste0(round(n_attempts_completed / n_attempts *100, 1), "%"), .groups = 'drop' ) new_topic_attempts_jc_bysession_all # Completion Rate By Junior Contributors new_topic_attempts_jc <- new_topic_attempts %>% filter(is_junior == 'Junior Contributor') %>% #only jc group_by (wiki, section_edit_type, user_id) %>% summarise(n_attempts = n_distinct(edit_attempt_id), n_completions = n_distinct(edit_attempt_id[edit_success == 'Complete']), edit_success = ifelse(sum(n_completions >= 1), 'Complete', 'Not Complete'), #redefine edit success as user completed at least 1 edit attempt new_topic_tool_used = as.integer(ifelse(sum(section_edit_type== 'New topic tool'), 1, 0)), .groups = 'drop') # By Contributor # Review edit completion rate by editing method new_topic_attempts_jc_all <- new_topic_attempts_jc %>% group_by(section_edit_type) %>% summarise(n_users = n_distinct(user_id), n_users_completed = sum(n_completions >= 1), #user completed at least 1 edit completion_rate = paste0(round(n_users_completed / n_users *100, 1), "%"), .groups = 'drop' ) # Create table of completion rate new_topic_attempts_jc_all_table <- new_topic_attempts_jc_all %>% gt() %>% tab_header( title = "Junior contributors new topic completion rate", subtitle = "across all participating Wikipedias" ) %>% cols_label( section_edit_type = "Editing method", n_users = "Number of users attempted", n_users_completed = "Number of users completed", completion_rate = "New topic completion rate" ) %>% tab_footnote( footnote = "Defined as percent of contributors that attempted and published at least 1 new topic", locations = cells_column_labels( columns = 'completion_rate' ) ) %>% tab_footnote( footnote = "Sampling rate for Non-New Topic Tool events is 6.25%", locations = cells_body( columns = 'section_edit_type', rows = 1) ) %>% tab_footnote( footnote = "Sampling rate for New Topic Tool events is 100%", locations = cells_body( columns = 'section_edit_type', rows = 2)) %>% gtsave( "new_topic_attempts_jc_all_table.html", inline_css = TRUE) IRdisplay::display_html(data = new_topic_attempts_jc_all_table, file = "new_topic_attempts_jc_all_table.html") p <- new_topic_attempts_jc_all %>% ggplot(aes(x= section_edit_type, y = n_users_completed / n_users, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_text(aes(label = paste(completion_rate), fontface=2), vjust=1.2, size = 8, color = "white") + scale_y_continuous(labels = scales::percent) + scale_x_discrete(labels = c("Existing add new section link", "New topic tool")) + labs (y = "Percent of junior contributors ", x = "Editing method", title = "Junior contributors new topic completion rate \n across all participating Wikipedias", caption = "Defined as percent of contributors that make a new topic attempt and publish at least 1 new topic") + scale_fill_manual(values= c("#999999", "steelblue2")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position= "none", axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_attempts_jc_all.png", p, width = 16, height = 8, units = "in", dpi = 300) # Review edit attempts by editing method and wiki new_topic_attempts_jc_bywiki <- new_topic_attempts_jc %>% group_by(wiki, section_edit_type) %>% summarise(n_users = n_distinct(user_id), n_users_completed = sum(n_completions >=1), #user completed at least 1 edit completion_rate = paste0(round(n_users_completed / n_users *100, 1), "%"), .groups = 'drop') %>% #determine credible intervals cbind(as.data.frame(binom:::binom.bayes(x = .$n_users_completed, n = .$n_users, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_attempts_jc_bywiki_tbl <- new_topic_attempts_jc_bywiki %>% select(c(1,2,3,4,5,12,13)) %>% #remove unneeded rows gt() %>% tab_header( title = "Junior contributors new topic completion rate by participating Wikipedia" ) %>% cols_label( wiki = "Wikipedia", section_edit_type= "Editing method", n_users = "Number of users attempted", n_users_completed = "Number of users completed", completion_rate = "Completion rate", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Defined as percent of contributors that make a new topic attempt and publish at least 1 new topic", locations = cells_column_labels( columns = 'completion_rate' ) ) %>% tab_footnote( footnote = "Sampling rate for Non-New Topic Tool events is 6.25%", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "Sampling rate for New Topic Tool events is 100%", locations = cells_column_labels( columns = 'section_edit_type' )) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') )) %>% gtsave( "new_topic_attempts_jc_bywiki_tbl .html", inline_css = TRUE) IRdisplay::display_html(data = new_topic_attempts_jc_bywiki_tbl, file = "new_topic_attempts_jc_bywiki_tbl .html") new_topic_attempts_jc_bywiki_tbl # Plot edit completion rates for each wiki dodge <- position_dodge(width=0.9) p <- new_topic_attempts_jc_bywiki %>% filter(!(wiki %in% c('Amharic Wikipedia', 'Egyptian Wikipedia', 'Thai Wikipedia'))) %>% # remove wikis where there are under 10 events as we ggplot(aes(x= section_edit_type, y = n_users_completed / n_users, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_errorbar(aes(ymin = lower, ymax = upper), color = 'red', size = 1, alpha = 0.5, position = dodge, width = 0.25) + geom_text(aes(label = paste(completion_rate), fontface=2), vjust=1.2, size = 5, color = "white") + facet_wrap(~ wiki) + scale_y_continuous(labels = scales::percent) + labs (y = "Percent of junior contributors ", title = "Junior contributors new topic completion rate by participating Wikipedia", caption = "Amharic, Egyptian, Thai, and Oromo Wikipedias removed from analysis due to insufficient events \n Red error bars: Reflect 95% credible intervals") + scale_fill_manual(values= c("#999999", "steelblue2"), name = "Editing Method", labels = c("Existing add new section link", "New topic tool")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.text.x = element_blank(), axis.title.x=element_blank(), axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_attempts_jc_bywiki_plot.png", p, width = 16, height = 8, units = "in", dpi = 300) #redefine edit success as factor for use in the model new_topic_attempts_jc$edit_success <- factor( new_topic_attempts_jc$edit_success, levels = c("Not Complete", "Complete") ) priors <- c( set_prior(prior = "std_normal()", class = "b"), set_prior("cauchy(0, 5)", class = "sd") ) fit_jc <- brm( edit_success ~ section_edit_type + (1 | wiki/user_id), family = bernoulli(link = "logit"), data = new_topic_attempts_jc, prior = priors, chains = 4, cores = 4 ) fit_jc_tbl <- fit_jc %>% spread_draws(b_section_edit_typeNewtopictool, b_Intercept) %>% mutate( exp_b = exp(b_section_edit_typeNewtopictool), b4 = b_section_edit_typeNewtopictool/ 4, avg_lift = plogis(b_Intercept + b_section_edit_typeNewtopictool) - plogis(b_Intercept) ) %>% pivot_longer( b_section_edit_typeNewtopictool:avg_lift, names_to = "param", values_to = "val" ) %>% group_by(param) %>% summarize( ps = c(0.025, 0.5, 0.975), qs = quantile(val, probs = ps), .groups = "drop" ) %>% mutate( quantity = ifelse( param %in% c("b_Intercept", "b_section_edit_typeNewtopictool"), "Parameter", "Function of parameter(s)" ), param = factor( param, c("b_Intercept", "b_section_edit_typeNewtopictool", "exp_b", "b4", "avg_lift"), c("(Intercept)", "Using new topic tool", "Multiplicative effect on odds", "Maximum Lift", "Average lift") ), ps = factor(ps, c(0.025, 0.5, 0.975), c("lower", "median", "upper")), ) %>% pivot_wider(names_from = "ps", values_from = "qs") %>% arrange(quantity, param) fit_jc_tbl%>% gt(rowname_col = "param", groupname_col = "quantity") %>% row_group_order(c("Parameter", "Function of parameter(s)")) %>% fmt_number(vars(lower, median, upper), decimals = 3) %>% fmt_percent(columns = vars(median, lower, upper), rows = 2:3, decimals = 1) %>% cols_align("center", vars(median, lower, upper)) %>% cols_merge(vars(lower, upper), pattern = "({1}, {2})") %>% cols_move_to_end(vars(lower)) %>% cols_label(median = "Point Estimate", lower = "95% CI") %>% tab_style(cell_text(weight = "bold"), cells_row_groups()) %>% tab_footnote("CI: Credible Interval", cells_column_labels(vars(lower))) %>% tab_footnote( html("Average lift = Pr(Success|New Topic Tool) - Pr(Success|New Section Link Editing) = logit-10 + β1) - logit-10)"), cells_body(vars(median), 3) ) %>% tab_footnote( html("Maximum lift calculated using the divide-by-4-rule"), cells_body(vars(median), 2) ) %>% tab_header("Junior Contributor Completion Rate: Posterior summary of model parameters") %>% gtsave( "fit_jc_tbl.html", inline_css = TRUE) IRdisplay::display_html(file = "fit_jc_tbl.html") # Create histogram exp_histogram <- new_topic_attempts %>% filter(experience_level < 5000) %>% #remove significant outliers to help more clearly see distribution of majority group_by(user_id) %>% summarize(experience_level = min(experience_level), .groups = 'drop') %>% #experience level at first recorded attempt in AB test ggplot(aes(x=experience_level)) + geom_histogram(binwidth = 100, fill = 'steelblue2') + scale_x_continuous(labels = scales::comma, breaks=seq(0,5000,500)) + labs (title = "Distribution of AB Test Users Edit Count ", y = "Number of users") + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.line = element_line(colour = "black")) exp_histogram # divide experiene level groups new_topic_attempts_exp <- new_topic_attempts %>% mutate(experience_group = cut(as.numeric(experience_level), breaks = c(0, 100, 500, 1000, 1500, 2000, 2500, 3000, Inf), labels = c('0-100 edits', '101-500 edits', '501-1000 edits', '1001-1500 edits', '1501-2000 edits', '2001-2500 edits', '2501-3000 edits', 'over 3000 edits'), include.lowest = TRUE)) #aggregate data to show contributors that completed at least 1 edit by experience level new_topic_attempts_byexp <- new_topic_attempts_exp %>% group_by (wiki, section_edit_type, user_id, experience_group) %>% summarise(n_attempts = n_distinct(edit_attempt_id), n_completions = sum(edit_success == 'Complete'), edit_success = ifelse(sum(n_completions >= 1), 'Complete', 'Not Complete'), #redefine edit success as user completed at least 1 edit new_topic_tool_used = as.integer(ifelse(sum(section_edit_type== 'New topic tool'), 1, 0)), .groups = 'drop') #convert edit success as factor for use in the model new_topic_attempts_byexp$edit_success <- factor( new_topic_attempts_byexp$edit_success, levels = c("Not Complete", "Complete"), labels = c("Not Complete", "Complete") ) # Review edit completion rate by section edit type across all experience levels new_topic_completes_allexp <- new_topic_attempts_byexp %>% group_by(section_edit_type) %>% summarise(n_users = n_distinct(user_id), n_users_completed = sum(n_completions >= 1), #user completed at least 1 edit completion_rate = paste0(round(n_users_completed / n_users *100, 1), "%"), .groups = 'drop' ) new_topic_completes_allexp_table <- new_topic_completes_allexp %>% gt() %>% tab_header( title = "Contributors new topic completion rate", subtitle = "Across all experience levels and participating Wikipedias" ) %>% cols_label( section_edit_type = "Editing method", n_users = "Number of users attempted", n_users_completed = "Number of users completed", completion_rate = "Completion rate" ) %>% tab_footnote( footnote = "Defined as percent of contributors that make a new topic attempt and publish at least 1 comment.", locations = cells_column_labels( columns = 'completion_rate' ) ) %>% tab_footnote( footnote = "Sampling rate for Non-New Topic Tool events is 6.25%", locations = cells_body( columns = 'section_edit_type', rows = 1) ) %>% tab_footnote( footnote = "Sampling rate for New Topic Tool events is 100%", locations = cells_body( columns = 'section_edit_type', rows = 2) ) %>% gtsave( "new_topic_completes_allexp_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_completes_allexp_table.html") # Plot edit completion rates across all wikis and experience levels p <- new_topic_completes_allexp %>% ggplot(aes(x= section_edit_type, y = n_users_completed / n_users, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_text(aes(label = paste(completion_rate), fontface=2), vjust=1.2, size = 8, color = "white") + scale_y_continuous(labels = scales::percent) + scale_x_discrete(labels = c("Existing add new section link", "New topic tool")) + labs (y = "Percent of contributors ", x = "Editing method", title = "Contributors comment completion rate \n across all experience levels and participating Wikipedias", caption = "Defined as percent of contributors that make a new topic attempt and publish at least 1 new topic") + scale_fill_manual(values= c("#999999", "steelblue2")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position= "none", axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_completes_allexp.png", p, width = 16, height = 8, units = "in", dpi = 300) # Completion Rate By Session new_topic_attempts_byexp_bysession <- new_topic_attempts_exp %>% group_by (experience_group, section_edit_type) %>% summarise(n_attempts = n_distinct(edit_attempt_id), n_completions = n_distinct(edit_attempt_id[edit_success == 'Complete']), completion_rate = paste0(round(n_completions / n_attempts *100, 1), "%"), new_topic_tool_used = as.integer(ifelse(sum(section_edit_type== 'New topic tool'), 1, 0)), .groups = 'drop') new_topic_attempts_byexp_bysession # Review edit completion rate by contributors and experience level new_topic_completes_byexp <- new_topic_attempts_byexp %>% group_by(experience_group, section_edit_type) %>% summarise(n_users = n_distinct(user_id), n_users_completed = n_distinct(user_id[n_completions >= 1]), #user completed at least 1 edit completion_rate = paste0(round(n_users_completed / n_users *100, 1), "%"), .groups = 'drop' ) %>% #determine credible intervals cbind(as.data.frame(binom:::binom.bayes(x = .$n_users_completed, n = .$n_users, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_completes_byexp_table <- new_topic_completes_byexp %>% select(c(1,2,3,4,5,12,13)) %>% #remove unneeded rows gt() %>% tab_header( title = "Contributors new topic completion rate by experience level", subtitle = "Across all participating Wikipedias" ) %>% cols_label( experience_group = "Experience level group", section_edit_type = "Editing experience", n_users = "Number of users attempted", n_users_completed = "Number of users completed", completion_rate = "Completion rate", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Defined as percent of contributors that made a new topic attempt and publish at least 1 comment.", locations = cells_column_labels( columns = 'completion_rate' )) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') )) %>% gtsave( "new_topic_completes_byexp_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_completes_byexp_table.html") new_topic_completes_byexp_table # Plot edit completion rates for each user on each wiki p <- new_topic_completes_byexp %>% ggplot(aes(x= section_edit_type, y = n_users_completed / n_users, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_text(aes(label = paste(completion_rate), fontface=2), vjust=1.2, size = 8, color = "white") + geom_errorbar(aes(ymin = lower, ymax = upper), color = 'red', size = 1, alpha = 0.5, position = dodge, width = 0.25) + facet_wrap(~ experience_group) + scale_y_continuous(labels = scales::percent) + labs (y = "Percent of contributors ", title = "Contributors new topic completion rate by experience level \n across all participating Wikipedias", caption = "Red error bars: 95% credible intervals" )+ scale_fill_manual(values= c("#999999", "steelblue2"), name = "Editing Method", labels = c("Existing add new section link", "New topic tool")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.text.x = element_blank(), axis.title.x=element_blank(), axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_completes_byexp.png", p, width = 16, height = 8, units = "in", dpi = 300) fit_all <- brm( edit_success ~ section_edit_type * is_junior + (1 | wiki/user_id), family = bernoulli(link = "logit"), data = new_topic_attempts_byexp, prior = priors, chains = 4, cores = 4 ) conditional_effects <- conditional_effects(fit_all, effects = "is_junior:section_edit_type") p <- plot(conditional_effects, plot = FALSE)[[1]] + labs (y = "Probability of new topic completion", title = "Effects of experience level and editor type \n on new topic completion probability") + theme_bw() + scale_color_manual(values= c("#999999", "steelblue2")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.line = element_line(colour = "black")) p ggsave("Figures/conditional_effects_exp_editor.png", p, width = 16, height = 8, units = "in", dpi = 300) new_topic_reverts <- read.csv( file = 'Data/new_topic_reverts.csv', header = TRUE, sep = ",", stringsAsFactors = FALSE ) # loads all revert data #clarfiy levels and lables for factor variables new_topic_reverts$section_edit_type <- factor( new_topic_reverts$section_edit_type, levels = c("non-new-topic-tool", "new-topic-tool"), labels = c("Existing add new section link", "New topic tool") ) new_topic_reverts$is_reverted <- factor(new_topic_reverts$is_reverted, levels = c("reverted", "not-reverted"), labels = c("Reverted", "Not reverted")) #reformat user-id and adjust to include wiki to account for duplicate user id instances. # Users do not have the smae user_id on different wikis new_topic_reverts$user_id <- as.character(paste(new_topic_reverts$user_id,new_topic_reverts$wiki,sep ="-" )) #clarfiy wiki names new_topic_reverts <- new_topic_reverts %>% mutate( wiki = case_when( #clarfiy participating project names wiki == 'amwiki' ~ "Amharic Wikipedia", wiki == 'bnwiki' ~ "Bengali Wikipedia", wiki == 'zhwiki' ~ "Chinese Wikipedia", wiki == 'nlwiki' ~ 'Dutch Wikipedia', wiki == 'arzwiki' ~ 'Egyptian Wikipedia', wiki == 'frwiki' ~ 'French Wikipedia', wiki == 'hewiki' ~ 'Hebrew Wikipedia', wiki == 'hiwiki' ~ 'Hindi Wikipedia', wiki == 'idwiki' ~ 'Indonesian Wikipedia', wiki == 'itwiki' ~ 'Italian Wikipedia', wiki == 'jawiki' ~ 'Japanese Wikipedia', wiki == 'kowiki' ~ 'Korean Wikipedia', wiki == 'omwiki' ~ 'Oromo Wikipedia', wiki == 'fawiki' ~ 'Persian Wikipedia', wiki == 'plwiki' ~ 'Polish Wikipedia', wiki == 'ptwiki' ~ 'Portuguese Wikipedia', wiki == 'eswiki' ~ 'Spanish Wikipedia', wiki == 'thwiki' ~ 'Thai Wikipedia', wiki == 'ukwiki' ~ 'Ukrainian Wikipedia', wiki == 'viwiki' ~ 'Vietnamese Wikipedia', ) ) # Create new column to identify Junior and Non-Junior Contributors new_topic_reverts <- new_topic_reverts %>% mutate( is_junior = case_when( #clarfiy participating project names experience_level < 100 ~ "Junior Contributor", experience_level >= 100 ~ "Non-Junior Contributor" ), is_junior = factor(is_junior, levels = c("Non-Junior Contributor", "Junior Contributor") )) # filter date to only look at junior contributor edits reverted new_topic_reverts_jc <- new_topic_reverts %>% filter(is_junior == 'Junior Contributor') # aggregrate based on editing experience type new_topic_reverts_jc_all <- new_topic_reverts_jc %>% group_by(section_edit_type) %>% summarise(total_reverts = n_distinct(revision_id[is_reverted == "Reverted"]), total_comments = n_distinct(revision_id), revert_rate =paste(round(total_reverts/total_comments * 100, 2), '%'), .groups = 'drop') %>% cbind(as.data.frame(binom:::binom.bayes(x = .$total_reverts, n = .$total_comments, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_reverts_jc_all_table <- new_topic_reverts_jc_all %>% select(c(1,2,3,4,11,12)) %>% #remove unneeded rows gt() %>% tab_header( title = "Junior contributors new topic revert rate across all participating Wikipedias", subtitle = "Across all participating Wikipedias" ) %>% cols_label( section_edit_type = "Editing experience", total_reverts = "Number of new topics reverted", total_comments = "Number of new topics published", revert_rate = "Revert rate", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Defined as percent of new topics reverted within 48 hours.", locations = cells_column_labels( columns = 'revert_rate' ) ) %>% tab_footnote( footnote = "Sampling rate is 100% for new topic tool events and 6.25% for non-new topic tool events", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') ) ) %>% gtsave( "new_topic_reverts_jc_all_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_reverts_jc_all_table.html") # Plot edit completion rates for each user on each wiki dodge <- position_dodge(width=0.9) p <- new_topic_reverts_jc_all %>% ggplot(aes(x= section_edit_type, y = total_reverts/ total_comments, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_errorbar(aes(ymin = lower, ymax = upper), color = 'red', size = 1, alpha = 0.5, position = dodge, width = 0.25) + geom_text(aes(label = paste(revert_rate), fontface=2), vjust=1.2, size = 8, color = "white") + scale_y_continuous(labels = scales::percent) + scale_x_discrete(labels = c("Existing add new section link", "New topic tool")) + labs (y = "Percent of new topics reverted ", x = "Editing Method", title = "Junior contributors new topic revert rate across \n all participating Wikipedias", caption = "Revert rate defined as percent of published new topics reverted within 48 hours \n Red error bars: 95% credible intervals") + scale_fill_manual(values= c("#999999", "steelblue2")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position= "none", axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_reverts_jc_all .png", p, width = 16, height = 8, units = "in", dpi = 300) # aggregrate data by wiki and editing interface new_topic_reverts_jc_bywiki <- new_topic_reverts_jc %>% group_by(wiki, section_edit_type) %>% summarise(total_reverts = n_distinct(revision_id[is_reverted == "Reverted"]), total_comments = n_distinct(revision_id), revert_rate =paste(round(total_reverts/total_comments * 100, 2), '%'), .groups = 'drop') %>% cbind(as.data.frame(binom:::binom.bayes(x = .$total_reverts, n = .$total_comments, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_reverts_jc_bywiki_table <- new_topic_reverts_jc_bywiki %>% select(c(1,2,3,4,5,12,13)) %>% #remove unneeded rows gt() %>% tab_header( title = "Junior Contributors new topic revert rate by participating Wikipedia" ) %>% cols_label( wiki = "Wikipedia", section_edit_type = "Editing method", total_reverts = "Number of new topics reverted", total_comments = "Number of new topics published", revert_rate = "Revert rate", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Defined as percent of new topics reverted within 48 hours.", locations = cells_column_labels( columns = 'revert_rate' ) ) %>% tab_footnote( footnote = "Sampling rate is 100% for new topic tool events and 6.25% for non-new topic tool events", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') ) ) %>% gtsave( "new_topic_reverts_jc_bywiki_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_reverts_jc_bywiki_table.html") # divide experience level groups new_topic_reverts_exp <- new_topic_reverts %>% mutate(experience_group = cut(as.numeric(experience_level), breaks = c(0, 100, 500, 1000, 1500, 2000, 2500, 3000, Inf), labels = c('0-100 edits', '101-500 edits', '501-1000 edits', '1001-1500 edits', '1501-2000 edits', '2001-2500 edits', '2501-3000 edits', 'over 3000 edits'), include.lowest = TRUE)) # aggregate data based on editor and experience level new_topic_reverts_byexp <- new_topic_reverts_exp %>% group_by(experience_group, section_edit_type) %>% summarise(total_reverts = n_distinct(revision_id[is_reverted == "Reverted"]), total_comments = n_distinct(revision_id), revert_rate =paste(round(total_reverts/total_comments * 100, 2), '%'), .groups = 'drop') %>% ungroup() %>% cbind(as.data.frame(binom:::binom.bayes(x = .$total_reverts, n = .$total_comments, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_reverts_byexp_table <- new_topic_reverts_byexp %>% select(c(1,2,3,4,5,12,13)) %>% #remove unneeded rows gt() %>% tab_header( title = "Contributors new topic revert rate by experience level" ) %>% cols_label( experience_group = "Experience level", section_edit_type = "Editing method", total_reverts = "Number of new topics reverted", total_comments = "Number of new topics published", revert_rate = "Revert rate", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Defined as percent of new topics reverted within 48 hours.", locations = cells_column_labels( columns = 'revert_rate' ) )%>% tab_footnote( footnote = "Sampling rate is 100% for new topic tool events and 6.25% for non-new topic tool events", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "Junior contributor defined as having under cumulative 100 edits. Non-Junior Contributor is defined as having over 100 cumulative edits", locations = cells_column_labels( columns = 'experience_group' ) ) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') ) ) %>% gtsave( "new_topic_reverts_byexp_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_reverts_byexp_table.html") # plot reverts by experience p <- new_topic_reverts_byexp %>% ggplot(aes(x= section_edit_type , y = total_reverts/total_comments, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_errorbar(aes(ymin = lower, ymax = upper), color = 'red', alpha = 0.5, size = 1, position = dodge, width = 0.25) + geom_text(aes(label = paste(revert_rate),fontface=2), vjust=1.2, size = 8, color = "white") + scale_y_continuous(labels = scales::percent) + facet_wrap(~ experience_group, scale = "free_y") + labs (y = "Percent of new topics reverted", title = "Contributor new topic revert rate by experience level \n across all participating Wikipedias", caption = "Red error bars: 95% credible intervals") + scale_fill_manual(values= c("#999999", "steelblue2"), name = "Editing Method", labels = c("Existing add new section link", "New topic tool")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.text.x = element_blank(), axis.title.x=element_blank(), axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_reverts_byexp.png", p, width = 16, height = 8, units = "in", dpi = 300) new_topic_blocks <- read.csv( file = 'Data/new_topic_blocks.csv', header = TRUE, sep = ",", stringsAsFactors = FALSE ) # loads all revert data #clarfiy levels and lables for factor variables new_topic_blocks$section_edit_type <- factor( new_topic_blocks$section_edit_type, levels = c("non-new-topic-tool", "new-topic-tool"), labels = c("Existing add new section link", "New topic tool") ) #clarfiy wiki names new_topic_blocks<- new_topic_blocks%>% mutate( wiki = case_when( #clarfiy participating project names wiki == 'amwiki' ~ "Amharic Wikipedia", wiki == 'bnwiki' ~ "Bengali Wikipedia", wiki == 'zhwiki' ~ "Chinese Wikipedia", wiki == 'nlwiki' ~ 'Dutch Wikipedia', wiki == 'arzwiki' ~ 'Egyptian Wikipedia', wiki == 'frwiki' ~ 'French Wikipedia', wiki == 'hewiki' ~ 'Hebrew Wikipedia', wiki == 'hiwiki' ~ 'Hindi Wikipedia', wiki == 'idwiki' ~ 'Indonesian Wikipedia', wiki == 'itwiki' ~ 'Italian Wikipedia', wiki == 'jawiki' ~ 'Japanese Wikipedia', wiki == 'kowiki' ~ 'Korean Wikipedia', wiki == 'omwiki' ~ 'Oromo Wikipedia', wiki == 'fawiki' ~ 'Persian Wikipedia', wiki == 'plwiki' ~ 'Polish Wikipedia', wiki == 'ptwiki' ~ 'Portuguese Wikipedia', wiki == 'eswiki' ~ 'Spanish Wikipedia', wiki == 'thwiki' ~ 'Thai Wikipedia', wiki == 'ukwiki' ~ 'Ukrainian Wikipedia', wiki == 'viwiki' ~ 'Vietnamese Wikipedia', ) ) # Create new column to identify Junior and Non-Junior Contributors new_topic_blocks <- new_topic_blocks %>% mutate( is_junior = case_when( #clarfiy participating project names experience_level < 100 ~ "Junior Contributor", experience_level >= 100 ~ "Non-Junior Contributor" ), is_junior = factor(is_junior, levels = c("Non-Junior Contributor", "Junior Contributor") )) # filter date to only look at junior contributors blocked new_topic_blocks_jc <- new_topic_blocks %>% filter(is_junior == 'Junior Contributor') # blocks for JCs across all wikis new_topic_blocks_jc_all <- new_topic_blocks_jc %>% group_by(section_edit_type) %>% summarise(total_blocked_users = sum(blocked_user), total_users = sum(all_users), pct_blocked = paste(round(total_blocked_users/total_users * 100, 2), "%"), .groups = 'drop') %>% ungroup() %>% cbind(as.data.frame(binom:::binom.bayes(x = .$total_blocked_users, n = .$total_users, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_blocks_jc_all_table <- new_topic_blocks_jc_all %>% select(1,2,3,4, 11, 12) %>% gt() %>% tab_header( title = "Junior contributors blocked after publishing a new topic " ) %>% cols_label( section_edit_type = "Editing method", total_blocked_users = "Number of users blocked", total_users = "Number of users that published a new topic", pct_blocked = "Percent of users blocked", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Sampling rate is 100% for new topic tool events and 6.25% for non-new topic tool events", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "Percent of junior contributors blocked after posting a new topic during the AB test", locations = cells_column_labels( columns = 'pct_blocked' ) ) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') ) ) %>% gtsave( "new_topic_blocks_jc_all_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_blocks_jc_all_table.html") # blocked users by wiki new_topic_blocks_jc_bywiki <- new_topic_blocks_jc %>% group_by(wiki, section_edit_type) %>% summarise(total_blocked_users = sum(blocked_user), total_users = sum(all_users), pct_blocked = paste(round(total_blocked_users/total_users * 100, 2), "%"), .groups = 'drop') %>% ungroup() %>% cbind(as.data.frame(binom:::binom.bayes(x = .$total_blocked_users, n = .$total_users, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_blocks_jc_bywiki_table <- new_topic_blocks_jc_bywiki %>% select(1,2,3,4,5,12,13) %>% gt() %>% tab_header( title = "Junior Contributors blocked after publishing a new topic by participating Wikipedia " ) %>% cols_label( wiki = "Wikipedia", section_edit_type= "Editing method", total_blocked_users = "Number of users blocked", total_users = "Number of users that published a new topic", pct_blocked = "Percent of users blocked", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Sampling rate is 100% for new topic tool events and 6.25% for non-new topic tool events", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "Percent of junior contributors blocked after posting a comment during the AB test", locations = cells_column_labels( columns = 'pct_blocked' ) ) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') ) ) %>% gtsave( "new_topic_blocks_jc_bywiki_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_blocks_jc_bywiki_table.html") # divide experience level groups new_topic_blocks_exp <- new_topic_blocks %>% mutate(experience_group = cut(as.numeric(experience_level), breaks = c(0, 100, 500, 1000, 1500, 2000, 2500, 3000, Inf), labels = c('0-100 edits', '101-500 edits', '501-1000 edits', '1001-1500 edits', '1501-2000 edits', '2001-2500 edits', '2501-3000 edits', 'over 3000 edits'), include.lowest = TRUE)) new_topic_blocks_byexp <- new_topic_blocks_exp %>% group_by(section_edit_type, experience_group) %>% summarise(total_blocked_users = sum(blocked_user), total_users = sum(all_users), pct_blocked = paste(round(total_blocked_users/total_users * 100, 2), "%") , .groups = 'drop') %>% ungroup() %>% cbind(as.data.frame(binom:::binom.bayes(x = .$total_blocked_users, n = .$total_users, conf.level = 0.95, tol = 1e-10))) %>% mutate(lower = round(lower,2), upper = round(upper, 2)) new_topic_blocks_byexp_table <- new_topic_blocks_byexp %>% select(1,2,3,4,5, 12,13) %>% gt() %>% tab_header( title = "Contributors blocked after publishing a new topic by experience level", ) %>% cols_label( section_edit_type = "Editing method", experience_group = "Experience level", total_blocked_users = "Number of users blocked", total_users = "Number of users that saved a new topic", pct_blocked = "Percent of users blocked", lower = "CI (Lower Bound)", upper = "CI (Upper Bound)" ) %>% tab_footnote( footnote = "Sampling rate is 100% for new topic tool events and 6.25% for non-new topic tool event", locations = cells_column_labels( columns = 'section_edit_type' ) ) %>% tab_footnote( footnote = "Percent of contributors blocked after posting a new topic during the AB test", locations = cells_column_labels( columns = 'pct_blocked' ) ) %>% tab_footnote( footnote = "95% credible intervals. There is a 95% probability that the parameter lies in this interval", locations = cells_column_labels( columns = c('lower', 'upper') ) ) %>% gtsave( "new_topic_blocks_byexp_table.html", inline_css = TRUE) IRdisplay::display_html(file = "new_topic_blocks_byexp_table.html") # plot proportion of blocks by experience levels p <- new_topic_blocks_byexp %>% ggplot(aes(x= section_edit_type, y = total_blocked_users/total_users, fill = section_edit_type,)) + geom_col(position = 'dodge') + geom_errorbar(aes(ymin = lower, ymax = upper), color = 'red', alpha = 0.5, size = 1, position = dodge, width = 0.25) + geom_text(aes(label = paste(pct_blocked), fontface=2), vjust=1.2, size = 8, color = "white") + scale_y_continuous(labels = scales::percent) + facet_wrap(~ experience_group, scale = "free_y") + labs (y = "Percent of blocked users", x = "Editing experience", title = "Contributors blocked after posting a comment by experience level", caption = "Red error bars: 95% credible intervals") + scale_fill_manual(values= c("#999999", "steelblue2"), name = "Editing Method", labels = c("Existing add new section link", "New topic tool")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.text.x = element_blank(), axis.title.x=element_blank(), axis.line = element_line(colour = "black")) p ggsave("Figures/new_topic_blocks_byexp.png", p, width = 16, height = 8, units = "in", dpi = 300) num_jc_editors <- new_topic_attempts %>% filter(is_junior == 'Junior Contributor', is_oversample == 'false' ) %>% #remove oversampled events group_by(experiment_group, section_edit_type) %>% summarise(total_users_attempt = n_distinct(user_id), total_users_complete = n_distinct(user_id[edit_success == 'Complete']), .groups = 'drop') num_jc_editors_table <- num_jc_editors %>% gt() %>% tab_header( title = "Number of Junior Contributors that made a new topic attempt during the AB test by test group and section edit type" ) %>% cols_label( experiment_group = "Test group", section_edit_type = "Editing method", total_users_attempt = "Number of users that attempted a new topic", total_users_complete = "Number of users that published a new topic" ) %>% tab_row_group( rows = experiment_group == 'control' ) %>% tab_row_group( rows = experiment_group == 'test' ) %>% tab_footnote( footnote = "Based on a sampling rate of 6.25% for all events. Any oversampled events were removed so data for the two editor types could be directly compared", locations = cells_title( ) ) %>% gtsave( "num_jc_editors_table.html", inline_css = TRUE) IRdisplay::display_html(file = "num_jc_editors_table.html") p <-num_jc_editors %>% group_by(experiment_group) %>% summarise(total_user_complete = sum(total_users_complete), .groups = 'drop') %>% ggplot(aes(x= experiment_group, y = total_user_complete, fill = experiment_group)) + geom_col(position = 'dodge') + geom_text(aes(label = paste(total_user_complete),fontface=2), vjust=1.2, size = 8, color = "white") + labs (y = "Number of Junior Contributors", x = "Experiment group", title = "Number of Junior Contributors that completed a new topic by test group \n across all participating Wikipedias") + scale_fill_manual(values= c("#999999", "steelblue2"), name = "Test Group", labels = c("Control", "Test")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.text.x = element_blank(), axis.title.x=element_blank(), axis.line = element_line(colour = "black")) p ggsave("Figures/num_jc_editors_bygroup.png", p, width = 16, height = 8, units = "in", dpi = 300) query <- " WITH first_edits AS ( -- users that made an attempt during the AB Test SELECT event_user_text as user_name, wiki_db AS wiki, min(event_timestamp) as first_edit_time, CASE WHEN min(event_user_revision_count) is NULL THEN 'undefined' WHEN min(event_user_revision_count) < 100 THEN 'junior' ELSE 'non-junior' END AS experience_level, IF(ARRAY_CONTAINS(revision_tags, 'discussiontools-newtopic'), 'new-topic-tool', 'non-newtopic-tool') AS section_edit_type FROM wmf.mediawiki_history mwh JOIN event.mediawiki_talk_page_edit tpe ON mwh.revision_id = tpe.revision_id AND mwh.wiki_db = tpe.`database` WHERE snapshot = '2022-03' AND event_timestamp >= '2022-01-27' AND event_timestamp <= '2022-03-04' AND ((month = 01 and day >= 27) OR (month = 02) OR (month = 03 and day <= 25)) AND wiki_db IN ('amwiki', 'bnwiki', 'zhwiki', 'nlwiki', 'arzwiki', 'frwiki', 'hewiki', 'hiwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki', 'omwiki', 'fawiki', 'plwiki', 'ptwiki', 'eswiki', 'thwiki', 'ukwiki', 'viwiki') -- do not include new discussion tool talk page edits AND NOT (ARRAY_CONTAINS(revision_tags, 'discussiontools-reply')) -- include only desktop edits AND NOT array_contains(revision_tags, 'iOS') AND NOT array_contains(revision_tags, 'Android') AND NOT array_contains(revision_tags, 'Mobile Web') -- first edit not reverted within 48 hours AND NOT (revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800) -- 48 hours -- find all edits on talk pages AND page_namespace_historical % 2 = 1 AND event_entity = 'revision' AND event_type = 'create' -- user is not a bot and not anonymous AND SIZE(event_user_is_bot_by_historical) = 0 AND SIZE(event_user_is_bot_by) = 0 AND event_user_is_anonymous = FALSE GROUP BY event_user_text, IF(ARRAY_CONTAINS(revision_tags, 'discussiontools-newtopic'), 'new-topic-tool', 'non-newtopic-tool'), wiki_db ) SELECT first_edits.experience_level, first_edits.section_edit_type, (count(first_week.user_name)/count(*)) as first_week_retention_rate FROM first_edits LEFT JOIN ( SELECT event_user_text as user_name, first_edits.first_edit_time, min(event_timestamp) as return_time FROM wmf.mediawiki_history mh INNER JOIN first_edits ON mh.event_user_text = first_edits.user_name WHERE snapshot = '2022-03' -- include only desktop edits AND NOT array_contains(revision_tags, 'iOS') AND NOT array_contains(revision_tags, 'Android') AND NOT array_contains(revision_tags, 'Mobile Web') -- find all edits on talk pages AND page_namespace_historical % 2 = 1 AND event_entity = 'revision' AND event_type = 'create' -- on all participating wikis AND wiki_db IN ('frwiki', 'eswiki', 'itwiki', 'jawiki', 'fawiki', 'plwiki', 'hewiki', 'nlwiki', 'hiwiki', 'kowiki', 'viwiki', 'thwiki', 'ptwiki', 'bnwiki', 'arzwiki', 'swwiki', 'zhwiki', 'ukwiki', 'idwiki', 'amwiki', 'omwiki', 'afwiki') -- return edit not reverted within 48 hours AND NOT (revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800) -- 48 hours -- user is not a bot and not anonymous AND SIZE(event_user_is_bot_by_historical) = 0 AND SIZE(event_user_is_bot_by) = 0 AND event_user_is_anonymous = FALSE AND first_edits.first_edit_time >= '2022-01-27' AND first_edits.first_edit_time <= '2022-03-04' -- second revision is between two and 8 days AND unix_timestamp(event_timestamp, 'yyyy-MM-dd HH:mm:ss.0') >= (unix_timestamp(first_edits.first_edit_time, 'yyyy-MM-dd HH:mm:ss.0') + (2*24*60*60)) AND unix_timestamp(event_timestamp, 'yyyy-MM-dd HH:mm:ss.0') <= (unix_timestamp(first_edits.first_edit_time, 'yyyy-MM-dd HH:mm:ss.0') + (8*24*60*60)) GROUP BY event_user_text, first_edits.first_edit_time ) AS first_week ON (first_edits.user_name = first_week.user_name and first_edits.first_edit_time = first_week.first_edit_time ) GROUP BY first_edits.experience_level, first_edits.section_edit_type; " week_one_retention <- wmfdata::query_hive(query) # find week 2 retention query <- " WITH first_edits AS ( -- users that made an attempt during the AB Test SELECT event_user_text as user_name, wiki_db AS wiki, min(event_timestamp) as first_edit_time, CASE WHEN min(event_user_revision_count) is NULL THEN 'undefined' WHEN min(event_user_revision_count) < 100 THEN 'junior' ELSE 'non-junior' END AS experience_level, IF(ARRAY_CONTAINS(revision_tags, 'discussiontools-newtopic'), 'new-topic-tool', 'non-newtopic-tool') AS section_edit_type FROM wmf.mediawiki_history mwh JOIN event.mediawiki_talk_page_edit tpe ON mwh.revision_id = tpe.revision_id AND mwh.wiki_db = tpe.`database` WHERE snapshot = '2022-03' AND event_timestamp >= '2022-01-27' AND event_timestamp <= '2022-03-04' AND ((month = 01 and day >= 27) OR (month = 02) OR (month = 03 and day <= 25)) AND wiki_db IN ('amwiki', 'bnwiki', 'zhwiki', 'nlwiki', 'arzwiki', 'frwiki', 'hewiki', 'hiwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki', 'omwiki', 'fawiki', 'plwiki', 'ptwiki', 'eswiki', 'thwiki', 'ukwiki', 'viwiki') -- do not include new discussion tool talk page edits AND NOT (ARRAY_CONTAINS(revision_tags, 'discussiontools-reply')) -- include only desktop edits AND NOT array_contains(revision_tags, 'iOS') AND NOT array_contains(revision_tags, 'Android') AND NOT array_contains(revision_tags, 'Mobile Web') -- first edit not reverted within 48 hours AND NOT (revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800) -- 48 hours -- find all edits on talk pages AND page_namespace_historical % 2 = 1 AND event_entity = 'revision' AND event_type = 'create' -- user is not a bot and not anonymous AND SIZE(event_user_is_bot_by_historical) = 0 AND SIZE(event_user_is_bot_by) = 0 AND event_user_is_anonymous = FALSE GROUP BY event_user_text, IF(ARRAY_CONTAINS(revision_tags, 'discussiontools-newtopic'), 'new-topic-tool', 'non-newtopic-tool'), wiki_db ) SELECT first_edits.experience_level, first_edits.section_edit_type, (count(first_week.user_name)/count(*)) as two_week_retention_rate FROM first_edits LEFT JOIN ( SELECT event_user_text as user_name, first_edits.first_edit_time, min(event_timestamp) as return_time FROM wmf.mediawiki_history mh INNER JOIN first_edits ON mh.event_user_text = first_edits.user_name WHERE snapshot = '2022-03' -- include only desktop edits AND NOT array_contains(revision_tags, 'iOS') AND NOT array_contains(revision_tags, 'Android') AND NOT array_contains(revision_tags, 'Mobile Web') -- find all edits on talk pages AND page_namespace_historical % 2 = 1 AND event_entity = 'revision' AND event_type = 'create' -- on all participating wikis AND wiki_db IN ('frwiki', 'eswiki', 'itwiki', 'jawiki', 'fawiki', 'plwiki', 'hewiki', 'nlwiki', 'hiwiki', 'kowiki', 'viwiki', 'thwiki', 'ptwiki', 'bnwiki', 'arzwiki', 'swwiki', 'zhwiki', 'ukwiki', 'idwiki', 'amwiki', 'omwiki', 'afwiki') -- return edit not reverted within 48 hours AND NOT (revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800) -- 48 hours -- user is not a bot and not anonymous AND SIZE(event_user_is_bot_by_historical) = 0 AND SIZE(event_user_is_bot_by) = 0 AND event_user_is_anonymous = FALSE AND first_edits.first_edit_time >= '2022-01-27' AND first_edits.first_edit_time <= '2022-03-04' -- second revision is between two and 8 days AND unix_timestamp(event_timestamp, 'yyyy-MM-dd HH:mm:ss.0') >= (unix_timestamp(first_edits.first_edit_time, 'yyyy-MM-dd HH:mm:ss.0') + (9*24*60*60)) AND unix_timestamp(event_timestamp, 'yyyy-MM-dd HH:mm:ss.0') <= (unix_timestamp(first_edits.first_edit_time, 'yyyy-MM-dd HH:mm:ss.0') + (15*24*60*60)) GROUP BY event_user_text, first_edits.first_edit_time ) AS first_week ON (first_edits.user_name = first_week.user_name and first_edits.first_edit_time = first_week.first_edit_time ) GROUP BY first_edits.experience_level, first_edits.section_edit_type; " week_two_retention <- wmfdata::query_hive(query) week_two_retention # find week 3 retention query <- " WITH first_edits AS ( -- users that made an attempt during the AB Test SELECT event_user_text as user_name, wiki_db AS wiki, min(event_timestamp) as first_edit_time, CASE WHEN min(event_user_revision_count) is NULL THEN 'undefined' WHEN min(event_user_revision_count) < 100 THEN 'junior' ELSE 'non-junior' END AS experience_level, IF(ARRAY_CONTAINS(revision_tags, 'discussiontools-newtopic'), 'new-topic-tool', 'non-newtopic-tool') AS section_edit_type FROM wmf.mediawiki_history mwh JOIN event.mediawiki_talk_page_edit tpe ON mwh.revision_id = tpe.revision_id AND mwh.wiki_db = tpe.`database` WHERE snapshot = '2022-03' AND event_timestamp >= '2022-01-27' AND event_timestamp <= '2022-03-04' AND ((month = 01 and day >= 27) OR (month = 02) OR (month = 03 and day <= 25)) AND wiki_db IN ('amwiki', 'bnwiki', 'zhwiki', 'nlwiki', 'arzwiki', 'frwiki', 'hewiki', 'hiwiki', 'idwiki', 'itwiki', 'jawiki', 'kowiki', 'omwiki', 'fawiki', 'plwiki', 'ptwiki', 'eswiki', 'thwiki', 'ukwiki', 'viwiki') -- do not include new discussion tool talk page edits AND NOT (ARRAY_CONTAINS(revision_tags, 'discussiontools-reply')) -- include only desktop edits AND NOT array_contains(revision_tags, 'iOS') AND NOT array_contains(revision_tags, 'Android') AND NOT array_contains(revision_tags, 'Mobile Web') -- first edit not reverted within 48 hours AND NOT (revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800) -- 48 hours -- find all edits on talk pages AND page_namespace_historical % 2 = 1 AND event_entity = 'revision' AND event_type = 'create' -- user is not a bot and not anonymous AND SIZE(event_user_is_bot_by_historical) = 0 AND SIZE(event_user_is_bot_by) = 0 AND event_user_is_anonymous = FALSE GROUP BY event_user_text, IF(ARRAY_CONTAINS(revision_tags, 'discussiontools-newtopic'), 'new-topic-tool', 'non-newtopic-tool'), wiki_db ) SELECT first_edits.experience_level, first_edits.section_edit_type, (count(first_week.user_name)/count(*)) as third_week_retention_rate FROM first_edits LEFT JOIN ( SELECT event_user_text as user_name, first_edits.first_edit_time, min(event_timestamp) as return_time FROM wmf.mediawiki_history mh INNER JOIN first_edits ON mh.event_user_text = first_edits.user_name WHERE snapshot = '2022-03' -- include only desktop edits AND NOT array_contains(revision_tags, 'iOS') AND NOT array_contains(revision_tags, 'Android') AND NOT array_contains(revision_tags, 'Mobile Web') -- find all edits on talk pages AND page_namespace_historical % 2 = 1 AND event_entity = 'revision' AND event_type = 'create' -- on all participating wikis AND wiki_db IN ('frwiki', 'eswiki', 'itwiki', 'jawiki', 'fawiki', 'plwiki', 'hewiki', 'nlwiki', 'hiwiki', 'kowiki', 'viwiki', 'thwiki', 'ptwiki', 'bnwiki', 'arzwiki', 'swwiki', 'zhwiki', 'ukwiki', 'idwiki', 'amwiki', 'omwiki', 'afwiki') -- return edit not reverted within 48 hours AND NOT (revision_is_identity_reverted AND revision_seconds_to_identity_revert <= 172800) -- 48 hours -- user is not a bot and not anonymous AND SIZE(event_user_is_bot_by_historical) = 0 AND SIZE(event_user_is_bot_by) = 0 AND event_user_is_anonymous = FALSE AND first_edits.first_edit_time >= '2022-01-27' AND first_edits.first_edit_time <= '2022-03-04' -- second revision is between two and 8 days AND unix_timestamp(event_timestamp, 'yyyy-MM-dd HH:mm:ss.0') >= (unix_timestamp(first_edits.first_edit_time, 'yyyy-MM-dd HH:mm:ss.0') + (16*24*60*60)) AND unix_timestamp(event_timestamp, 'yyyy-MM-dd HH:mm:ss.0') <= (unix_timestamp(first_edits.first_edit_time, 'yyyy-MM-dd HH:mm:ss.0') + (22*24*60*60)) GROUP BY event_user_text, first_edits.first_edit_time ) AS first_week ON (first_edits.user_name = first_week.user_name and first_edits.first_edit_time = first_week.first_edit_time ) GROUP BY first_edits.experience_level, first_edits.section_edit_type; " week_three_retention <- wmfdata::query_hive(query) # Join all the data retention_rates_all <- merge(week_one_retention, week_two_retention) %>% merge(week_three_retention) #clarify levels and labels for factor variables retention_rates_all$section_edit_type <- factor( retention_rates_all$section_edit_type, levels = c("non-newtopic-tool", "new-topic-tool"), labels = c("Previous add new section link", "New topic tool") ) retention_rates_all$experience_level <- factor(retention_rates_all$experience_level, levels = c( "non-junior", "junior"), labels = c("Non-Junior Contributor", "Junior Contributor") ) # Restrict to just Junior Contributors retention_rates_jc_table <- retention_rates_all %>% filter(experience_level == 'Junior Contributor') %>% group_by(section_edit_type) %>% summarise(week_1 = paste(round((first_week_retention_rate * 100), 2), "%"), week_2= paste(round((two_week_retention_rate * 100), 2), "%"), week_3 = paste(round((third_week_retention_rate * 100), 2), "%")) %>% gt() %>% tab_header( title = "Junior contributors retention rate" ) %>% cols_label( section_edit_type = "Editing method", week_1 = "Week 1 (2-8 days)", week_2 = "Week 2 (9-15 days)", week_3 = "Week 3 (16-22 days)", ) %>% tab_footnote( footnote = "Defined as percent of Junior contributors that completed a new topic edit during the AB test and returned to make another edit", locations = cells_title( ) ) %>% tab_footnote( footnote = "Defined as days since first new topic edit during the AB test.", locations = cells_column_labels( columns = c('week_1', 'week_2', 'week_3')) ) %>% gtsave( "retention_rates_jc_table.html", inline_css = TRUE) IRdisplay::display_html(file = "retention_rates_jc_table.html") # Plot retention rates p <- retention_rates_all %>% filter(experience_level == 'Junior Contributor') %>% gather("week", "retention_rate", 3:5) %>% mutate(week = factor(week, levels = c("first_week_retention_rate", "two_week_retention_rate", "third_week_retention_rate"), labels = c("week 1", "week 2", "week_3"))) %>% ggplot(aes(x= section_edit_type, y = retention_rate*100, fill = section_edit_type)) + geom_col(position = 'dodge') + geom_text(aes(label = paste(round(retention_rate*100, 2), "%"), fontface=2), vjust=1.2, size = 8, color = "white") + facet_wrap(~ week) + scale_y_continuous() + labs (y = "Percent of Junior Contributors ", title = "Junior contributors new topic tool retention rate", caption = "Defined as the percent of Junior contributors that made at least one new topic edit \n during the time of the AB test and returned to made another edit on a talk namespace.") + theme_bw() + scale_fill_manual(values= c("#999999", "#000099"), name = "Editing Method", labels = c("Previous add new section link", "New topic tool")) + theme( panel.grid.minor = element_blank(), panel.background = element_blank(), plot.title = element_text(hjust = 0.5), text = element_text(size=16), legend.position="bottom", axis.text.x = element_blank(), axis.title.x=element_blank(), axis.line = element_line(colour = "black")) p ggsave("Figures/jc_retention_rate.png", p, width = 16, height = 8, units = "in", dpi = 300)