shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr))) shhh({ library(tidyverse); # Tables: library(gt); library(gtsummary); }) # Q4: April through June 2021 query <- " -- find both cx and non-cx created articles WITH created_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx, COUNT(*) AS created_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2021-04-01' and '2021-06-30' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' AND event_type = 'create' GROUP BY wiki_db ), --find all deleted articles that were created with cx deleted_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx, COUNT(*) AS deleted_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2021-04-01' and '2021-06-30' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' -- find revisions moved to the archive table AND event_type = 'create' AND revision_is_deleted_by_page_deletion = TRUE -- remove all bots AND SIZE(event_user_is_bot_by_historical) = 0 -- not a bot GROUP BY wiki_db ) -- main query to aggregate and join sources above SELECT created_articles.wiki, created_cx, (created_total - created_cx) AS created_non_cx, deleted_cx, (deleted_total - deleted_cx) AS deleted_non_cx FROM created_articles JOIN deleted_articles ON created_articles.wiki = deleted_articles.wiki " cx_deletion_ratio_q4 <- wmfdata::query_hive(query) cx_deletion_ratio_q4_overall <- cx_deletion_ratio_q4 %>% summarise(deleted_cx_pct = paste0(round(sum(deleted_cx)/sum(created_cx) * 100, 2), "%"), deleted_non_cx_pct = paste0(round(sum(deleted_non_cx)/sum(created_non_cx) * 100, 2), "%"), deletion_pct_diff = paste0(round((sum(deleted_non_cx)/sum(created_non_cx)*100)-((sum(deleted_cx)/sum(created_cx))*100), 2),"%") ) cx_deletion_ratio_q4_overall # Add columns with calculated deletion ratio cx_deletion_ratio_q4_bywiki <- cx_deletion_ratio_q4 %>% #filter(wiki == 'arwiki') %>% # use to find ratios for single wiki filter(created_cx > 15) %>% # remove wikis with 15 or fewer articles created using cx mutate(deleted_cx_ratio = deleted_cx/created_cx, deleted_non_cx_ratio = deleted_non_cx/created_non_cx, deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx) )) cx_deletion_higher_q4 <- cx_deletion_ratio_q4_bywiki %>% filter(deletion_ratio_diff < 0) %>% #find wikis with higher cx deletion ratio summarise(total_wikis = n()) print(paste0("Across all wikis where more than 15 articles have been created with content translation in Q4, there were ", cx_deletion_higher_q4[1], " wikis where articles created with content translation were deleted more than articles created without cx")) cx_deletion_higher_list_q4 <- cx_deletion_ratio_q4_bywiki %>% filter(deletion_ratio_diff < 0)%>% # only wikis where cx deletion ratio is higher arrange(deletion_ratio_diff) #sort by highest deletion ratio difference # reformat into table cx_deletion_higher_list_q4_tbl <- cx_deletion_higher_list_q4 %>% gt() %>% tab_header( title = "Wikis with higher deletion ratios for articles created with Content Translation", subtitle = "Reviewed Time Period: April 2021 through June 2021 (Q4)") %>% fmt_percent( columns = 6:8 ) %>% cols_label(wiki = "Wiki project", created_cx = "Created CX Articles", created_non_cx = "Created non-CX Articles", deleted_cx = "Deleted CX Articles", deleted_non_cx = "Deleted non-CX Articles", deleted_cx_ratio = "CX Articles Deletion Ratio", deleted_non_cx_ratio = "Non-CX Articles Deletion Ratio", deletion_ratio_diff = "Deletion Ratio Difference") %>% tab_spanner("Created Articles", 2:3) %>% tab_spanner("Deleted Articles", 4:5) %>% tab_spanner("Deletion Ratios", 6:8) %>% tab_footnote( footnote = "Excludes wikis with 15 or fewer articles created with Content Translation during the reviewed time period", locations = cells_column_labels( columns = 'wiki' )) %>% gtsave( "cx_deletion_higher_wikis_q4.html", inline_css = TRUE) IRdisplay::display_html(data = cx_deletion_higher_list_q4_tbl, file = "cx_deletion_higher_wikis_q4.html") cx_deletion_ration_highest_q4 <- cx_deletion_ratio_q4_bywiki %>% arrange(desc(deleted_cx_ratio)) %>% #sort by highest to lowest cx deletion ratio mutate(deleted_cx_ratio = paste0(round(deleted_cx_ratio *100,2),"%") , deleted_non_cx_ratio = paste0(round(deleted_non_cx_ratio *100,2),"%") , deletion_ratio_diff = paste0(round(deletion_ratio_diff * 100,2),"%") ) head(cx_deletion_ration_highest_q4, 5) # Deletion ratios from Q1 query <- " -- find all created articles WITH created_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx, COUNT(*) AS created_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2021-01-01' and '2021-03-31' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' AND event_type = 'create' -- remove bots AND SIZE(event_user_is_bot_by_historical) = 0 GROUP BY wiki_db ), --find all deleted articles deleted_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx, COUNT(*) AS deleted_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2021-01-01' and '2021-03-31' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' -- find revisions moved to the archive table AND event_type = 'create' AND revision_is_deleted_by_page_deletion = TRUE -- remove bots AND SIZE(event_user_is_bot_by_historical) = 0 GROUP BY wiki_db ) -- main query SELECT created_articles.wiki, created_cx, (created_total - created_cx) AS created_non_cx, deleted_cx, (deleted_total - deleted_cx) AS deleted_non_cx FROM created_articles JOIN deleted_articles ON created_articles.wiki = deleted_articles.wiki " cx_deletion_ratio_q1 <- wmfdata::query_hive(query) cx_deletion_ratio_q1_bywiki <- cx_deletion_ratio_q1 %>% #filter(wiki == 'idwiki') %>% filter(created_cx > 15) %>% mutate(deleted_cx_ratio = deleted_cx/created_cx, deleted_non_cx_ratio = deleted_non_cx/created_non_cx, deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx) )) cx_deletion_higher_q1 <- cx_deletion_ratio_q1_bywiki %>% filter(deletion_ratio_diff < 0) %>% summarise(total_wikis = n()) print(paste0("Across all wikis where more than 15 articles have been created with content translation in Q1, there were ", cx_deletion_higher_q1[1], " wikis where articles created with content translation were deleted more than articles created without cx")) cx_deletion_higher_list_q1 <- cx_deletion_ratio_q1_bywiki %>% filter(deletion_ratio_diff < 0) %>% arrange(deletion_ratio_diff) cx_deletion_higher_list_q1 intersect(cx_deletion_higher_list_q1[1], cx_deletion_higher_list_q4[1]) # Current 6 Months # Jan - June 2021 query <- " -- find both cx and non-cx created articles WITH created_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx, COUNT(*) AS created_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2021-01-01' and '2021-06-30' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' AND event_type = 'create' -- rremove bots AND SIZE(event_user_is_bot_by_historical) = 0 GROUP BY wiki_db ), --find all deleted articles that were created with cx deleted_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx, COUNT(*) AS deleted_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2021-01-01' and '2021-06-30' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' AND event_type = 'create' -- find revisions moved to the archive table AND revision_is_deleted_by_page_deletion = TRUE -- remove bots AND SIZE(event_user_is_bot_by_historical) = 0 GROUP BY wiki_db ) -- main query to aggregate and join sources above SELECT created_articles.wiki, created_cx, (created_total - created_cx) AS created_non_cx, deleted_cx, (deleted_total - deleted_cx) AS deleted_non_cx FROM created_articles JOIN deleted_articles ON created_articles.wiki = deleted_articles.wiki " cx_deletion_ratio_current_6mo <- wmfdata::query_hive(query) cx_deletion_ratio_6cur_overall <- cx_deletion_ratio_current_6mo %>% summarise(deleted_cx_pct = paste0(round(sum(deleted_cx)/sum(created_cx) * 100, 2), "%"), deleted_non_cx_pct = paste0(round(sum(deleted_non_cx)/sum(created_non_cx) * 100, 2), "%"), deletion_pct_diff = paste0(round((sum(deleted_non_cx)/sum(created_non_cx)*100)-((sum(deleted_cx)/sum(created_cx))*100), 2),"%") ) cx_deletion_ratio_6cur_overall cx_deletion_ratio_current_bywiki <- cx_deletion_ratio_current_6mo %>% #filter(wiki == 'idwiki') %>% filter(created_cx > 15) %>% # only review wikis with more than 15 cx articles mutate(deleted_cx_ratio = deleted_cx/created_cx, deleted_non_cx_ratio = deleted_non_cx/created_non_cx, deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx) )) cx_deletion_higher_current_6mo <- cx_deletion_ratio_current_bywiki %>% filter(deletion_ratio_diff < 0) %>% summarise(total_wikis = n()) cx_deletion_higher_current_6mo print(paste0("Across all wikis where more than 15 articles have been created with content translation from Jan 2021 - June 2021, there were ", cx_deletion_higher_current_6mo[1], " wikis where articles created with content translation were deleted more than articles created without cx")) cx_deletion_higher_list_current <- cx_deletion_ratio_current_bywiki %>% filter(deletion_ratio_diff < 0)%>% #only wikis with higher cx deletion ratios arrange(deletion_ratio_diff) # reformat into table cx_deletion_higher_list_6mo_tbl <- cx_deletion_higher_list_current %>% gt() %>% tab_header( title = "Wikis with higher deletion ratios for articles created with Content Translation", subtitle = "Reviewed Time Period: January 2021 through June 2021") %>% fmt_percent( columns = 6:8 ) %>% cols_label(wiki = "Wiki project", created_cx = "Created CX Articles", created_non_cx = "Created non-CX Articles", deleted_cx = "Deleted CX Articles", deleted_non_cx = "Deleted non-CX Articles", deleted_cx_ratio = "CX Articles Deletion Ratio", deleted_non_cx_ratio = "Non-CX Articles Deletion Ratio", deletion_ratio_diff = "Deletion Ratio Difference") %>% tab_spanner("Created Articles", 2:3) %>% tab_spanner("Deleted Articles", 4:5) %>% tab_spanner("Deletion Ratios", 6:8) %>% tab_footnote( footnote = "Excludes wikis with 15 or fewer articles created with Content Translation during the reviewed time period", locations = cells_column_labels( columns = 'wiki' )) %>% gtsave( "cx_deletion_higher_wikis_6mo.html", inline_css = TRUE) IRdisplay::display_html(data = cx_deletion_higher_list_6mo_tbl, file = "cx_deletion_higher_wikis_6mo.html") cx_deletion_ration_highest_current <- cx_deletion_ratio_current_bywiki %>% arrange(desc(deleted_cx_ratio)) %>% mutate(deleted_cx_ratio = paste0(round(deleted_cx_ratio *100,2),"%") , deleted_non_cx_ratio = paste0(round(deleted_non_cx_ratio *100,2),"%") , deletion_ratio_diff = paste0(round(deletion_ratio_diff * 100,2),"%") ) head(cx_deletion_ration_highest_current, 5) # Previous 6 Months # July 2020 - December 2020 query <- " -- find both cx and non-cx created articles WITH created_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx, COUNT(*) AS created_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2020-07-01' and '2020-12-31' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' AND event_type = 'create' -- remove bots AND SIZE(event_user_is_bot_by_historical) = 0 GROUP BY wiki_db ), --find all deleted articles that were created with cx deleted_articles AS ( SELECT wiki_db AS wiki, SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx, COUNT(*) AS deleted_total FROM wmf.mediawiki_history WHERE snapshot = '2021-08' AND event_timestamp BETWEEN '2020-07-01' and '2020-12-31' -- interested in main page namespaces AND page_namespace = 0 -- only look at new page creations AND revision_parent_id = 0 AND event_entity = 'revision' -- find revisions moved to the archive table AND event_type = 'create' AND revision_is_deleted_by_page_deletion = TRUE -- remove bots AND SIZE(event_user_is_bot_by_historical) = 0 GROUP BY wiki_db ) -- main query to aggregate and join sources above SELECT created_articles.wiki, created_cx, (created_total - created_cx) AS created_non_cx, deleted_cx, (deleted_total - deleted_cx) AS deleted_non_cx FROM created_articles JOIN deleted_articles ON created_articles.wiki = deleted_articles.wiki " cx_deletion_ratio_previous_6mo <- wmfdata::query_hive(query) cx_deletion_ratio_bywiki_previous <- cx_deletion_ratio_previous_6mo %>% #filter(wiki == 'idwiki') %>% filter(created_cx > 15) %>% # only wikis with at leat 15 created articles mutate(deleted_cx_ratio = deleted_cx/created_cx, deleted_non_cx_ratio = deleted_non_cx/created_non_cx, deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx) )) cx_deletion_higher_previous <- cx_deletion_ratio_bywiki_previous %>% filter(deletion_ratio_diff < 0) %>% summarise(total_wikis = n()) cx_deletion_higher_previous print(paste0("Across all wikis where more than 15 articles have been created with content translation between July 2020 and December 2020, there were ", cx_deletion_higher_previous[1], " wikis where articles created with content translation were deleted more than articles created without cx")) cx_deletion_higher_list_previous <- cx_deletion_ratio_bywiki_previous %>% filter(deletion_ratio_diff < 0) %>% arrange(deletion_ratio_diff) cx_deletion_higher_list_previous intersect(cx_deletion_higher_list_current[1], cx_deletion_higher_list_previous[1])