shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr))) shhh({ library(tidyverse); }) test <- missing_talk_pages) query <- " --find all subject namespace page WITH subject_pages AS ( SELECT -- address duplicate page title instances in mediawiki_page_history DISTINCT page_title AS subject_title, wiki_db AS wiki, CASE WHEN page_namespace = 0 THEN 'Main/Article' WHEN page_namespace = 2 THEN 'User' WHEN page_namespace = 4 THEN 'Wikipedia' WHEN page_namespace = 6 THEN 'File' WHEN page_namespace = 8 THEN 'MediaWiki' WHEN page_namespace = 10 THEN 'Template' WHEN page_namespace = 12 THEN 'Help' WHEN page_namespace = 14 THEN 'Category' WHEN page_namespace = 100 THEN 'Portal' WHEN page_namespace = 118 THEN 'Draft' WHEN page_namespace = 710 THEN 'TimedText' WHEN page_namespace = 828 THEN 'Module' END AS subject_namespace FROM wmf.mediawiki_page_history INNER JOIN canonical_data.wikis ON wiki_db = database_code and database_group == 'wikipedia' WHERE -- review all primary subject namespaces page_namespace IN (0,2,4,6,8,10,12,14,100,118, 710, 828) --Remove redirects AND page_is_redirect = FALSE --Remove archived articles AND page_is_deleted = FALSE AND snapshot = '2020-12' ), --find all talk namespace pages talk_pages AS ( SELECT DISTINCT page_title AS talk_title, wiki_db AS wiki, CASE WHEN page_namespace = 1 THEN 'Talk' WHEN page_namespace = 3 THEN 'User talk' WHEN page_namespace = 5 THEN 'Wikipedia talk' WHEN page_namespace = 7 THEN 'File talk' WHEN page_namespace = 9 THEN 'MediaWiki talk' WHEN page_namespace = 11 THEN 'Template talk' WHEN page_namespace = 13 THEN 'Help talk' WHEN page_namespace = 15 THEN 'Category talk' WHEN page_namespace = 101 THEN 'Portal talk' WHEN page_namespace = 119 THEN 'Draft talk' WHEN page_namespace = 711 THEN 'TimedText talk' WHEN page_namespace = 829 THEN 'Module talk' END AS talk_namespace FROM wmf.mediawiki_page_history INNER JOIN canonical_data.wikis ON wiki_db = database_code and database_group == 'wikipedia' WHERE -- review all primary talk namespaces page_namespace IN (1,3,5,7,9,11,13,15, 101, 119, 711, 829) --Remove redirects AND page_is_redirect = FALSE --Remove archived articles AND page_is_deleted = FALSE AND snapshot = '2020-12' ) -- MAIN QUERY -- SELECT sp.wiki, sp.subject_namespace, COUNT(*) AS num_subject_pages, SUM(CAST(talk_title IS NOT NULL AS int)) AS num_talk_pages_created, SUM(CAST(talk_title IS NULL AS int)) AS num_talk_pages_not_yet_created FROM subject_pages AS sp LEFT JOIN talk_pages ON sp.subject_title = talk_pages.talk_title AND sp.wiki = talk_pages.wiki GROUP BY sp.wiki, sp.subject_namespace ; " missing_talk_pages <- wmfdata::query_hive(query) missing_talk_bynamespace <- missing_talk_pages %>% group_by(subject_namespace) %>% summarise(total_talk_missing = sum(num_talk_pages_not_yet_created), total_subject = sum(num_subject_pages), prop_talk_missing = round(sum(num_talk_pages_not_yet_created)/sum(num_subject_pages) * 100,2), .groups = 'drop') missing_talk_bynamespace missing_talk_bynamespace <- missing_talk_pages %>% group_by(wiki, subject_namespace) %>% summarise(total_talk_missing = sum(num_talk_pages_not_yet_created), total_subject = sum(num_subject_pages), prop_talk_missing = round(sum(num_talk_pages_not_yet_created)/sum(num_subject_pages) * 100,2), .groups = 'drop') missing_talk_bynamespace