This task is about uncovering the percentage of talk pages, across namespaces and Wikipedias, that have not yet been created. nowing the percentage of talk pages, across namespaces and Wikipedias, that have not yet been created will help us decide how highly we should prioritize work on designing the empty state experience (See T252902.
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(tidyverse);
})
test <- missing_talk_pages)
wiki | subject_namespace | num_subject_pages | num_talk_pages | num_talk_pages_not_yet_created | prop_talk_pages_not_yet_created | |
---|---|---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> | <dbl> | |
1 | aawiki | 0 | 2 | 1 | 1 | 0.5000000 |
2 | aawiki | 2 | 170 | 54 | 116 | 0.6823529 |
3 | aawiki | 4 | 7 | 0 | 7 | 1.0000000 |
4 | aawiki | 8 | 107 | 0 | 107 | 1.0000000 |
5 | aawiki | 10 | 24 | 0 | 24 | 1.0000000 |
6 | aawiki | 14 | 29 | 0 | 29 | 1.0000000 |
query <-
"
--find all subject namespace page
WITH subject_pages AS (
SELECT
-- address duplicate page title instances in mediawiki_page_history
DISTINCT page_title AS subject_title,
wiki_db AS wiki,
CASE
WHEN page_namespace = 0 THEN 'Main/Article'
WHEN page_namespace = 2 THEN 'User'
WHEN page_namespace = 4 THEN 'Wikipedia'
WHEN page_namespace = 6 THEN 'File'
WHEN page_namespace = 8 THEN 'MediaWiki'
WHEN page_namespace = 10 THEN 'Template'
WHEN page_namespace = 12 THEN 'Help'
WHEN page_namespace = 14 THEN 'Category'
WHEN page_namespace = 100 THEN 'Portal'
WHEN page_namespace = 118 THEN 'Draft'
WHEN page_namespace = 710 THEN 'TimedText'
WHEN page_namespace = 828 THEN 'Module'
END AS subject_namespace
FROM wmf.mediawiki_page_history
INNER JOIN canonical_data.wikis
ON
wiki_db = database_code and
database_group == 'wikipedia'
WHERE
-- review all primary subject namespaces
page_namespace IN (0,2,4,6,8,10,12,14,100,118, 710, 828)
--Remove redirects
AND page_is_redirect = FALSE
--Remove archived articles
AND page_is_deleted = FALSE
AND snapshot = '2020-12'
),
--find all talk namespace pages
talk_pages AS (
SELECT
DISTINCT page_title AS talk_title,
wiki_db AS wiki,
CASE
WHEN page_namespace = 1 THEN 'Talk'
WHEN page_namespace = 3 THEN 'User talk'
WHEN page_namespace = 5 THEN 'Wikipedia talk'
WHEN page_namespace = 7 THEN 'File talk'
WHEN page_namespace = 9 THEN 'MediaWiki talk'
WHEN page_namespace = 11 THEN 'Template talk'
WHEN page_namespace = 13 THEN 'Help talk'
WHEN page_namespace = 15 THEN 'Category talk'
WHEN page_namespace = 101 THEN 'Portal talk'
WHEN page_namespace = 119 THEN 'Draft talk'
WHEN page_namespace = 711 THEN 'TimedText talk'
WHEN page_namespace = 829 THEN 'Module talk'
END AS talk_namespace
FROM wmf.mediawiki_page_history
INNER JOIN canonical_data.wikis
ON
wiki_db = database_code and
database_group == 'wikipedia'
WHERE
-- review all primary talk namespaces
page_namespace IN (1,3,5,7,9,11,13,15, 101, 119, 711, 829)
--Remove redirects
AND page_is_redirect = FALSE
--Remove archived articles
AND page_is_deleted = FALSE
AND snapshot = '2020-12'
)
-- MAIN QUERY --
SELECT
sp.wiki,
sp.subject_namespace,
COUNT(*) AS num_subject_pages,
SUM(CAST(talk_title IS NOT NULL AS int)) AS num_talk_pages_created,
SUM(CAST(talk_title IS NULL AS int)) AS num_talk_pages_not_yet_created
FROM
subject_pages AS sp
LEFT JOIN talk_pages ON
sp.subject_title = talk_pages.talk_title AND
sp.wiki = talk_pages.wiki
GROUP BY
sp.wiki,
sp.subject_namespace
;
"
missing_talk_pages <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
missing_talk_bynamespace <- missing_talk_pages %>%
group_by(subject_namespace) %>%
summarise(total_talk_missing = sum(num_talk_pages_not_yet_created),
total_subject = sum(num_subject_pages),
prop_talk_missing = round(sum(num_talk_pages_not_yet_created)/sum(num_subject_pages) * 100,2), .groups = 'drop')
missing_talk_bynamespace
subject_namespace | total_talk_missing | total_subject | prop_talk_missing |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
Category | 9803368 | 13485972 | 72.69 |
Draft | 50342 | 66563 | 75.63 |
File | 2116032 | 2699583 | 78.38 |
Help | 18252 | 22739 | 80.27 |
Main/Article | 32239028 | 56107661 | 57.46 |
MediaWiki | 106603 | 115030 | 92.67 |
Module | 148977 | 157656 | 94.49 |
Portal | 471198 | 523353 | 90.03 |
Template | 4398181 | 5219632 | 84.26 |
TimedText | 1148 | 1266 | 90.68 |
User | 4064202 | 7034793 | 57.77 |
Wikipedia | 2411805 | 2559861 | 94.22 |
missing_talk_bynamespace <- missing_talk_pages %>%
group_by(wiki, subject_namespace) %>%
summarise(total_talk_missing = sum(num_talk_pages_not_yet_created),
total_subject = sum(num_subject_pages),
prop_talk_missing = round(sum(num_talk_pages_not_yet_created)/sum(num_subject_pages) * 100,2), .groups = 'drop')
missing_talk_bynamespace
wiki | subject_namespace | total_talk_missing | total_subject | prop_talk_missing |
---|---|---|---|---|
<chr> | <chr> | <int> | <int> | <dbl> |
aawiki | Category | 29 | 29 | 100.00 |
aawiki | Main/Article | 1 | 2 | 50.00 |
aawiki | MediaWiki | 107 | 107 | 100.00 |
aawiki | Template | 24 | 24 | 100.00 |
aawiki | User | 116 | 170 | 68.24 |
aawiki | Wikipedia | 7 | 7 | 100.00 |
abwiki | Category | 7046 | 7086 | 99.44 |
abwiki | File | 10 | 10 | 100.00 |
abwiki | Help | 2 | 2 | 100.00 |
abwiki | Main/Article | 6125 | 6203 | 98.74 |
abwiki | MediaWiki | 66 | 71 | 92.96 |
abwiki | Module | 82 | 82 | 100.00 |
abwiki | Template | 1199 | 1210 | 99.09 |
abwiki | User | 791 | 1007 | 78.55 |
abwiki | Wikipedia | 35 | 39 | 89.74 |
acewiki | Category | 1052 | 1073 | 98.04 |
acewiki | Help | 4 | 5 | 80.00 |
acewiki | Main/Article | 10406 | 10486 | 99.24 |
acewiki | MediaWiki | 107 | 112 | 95.54 |
acewiki | Module | 54 | 56 | 96.43 |
acewiki | Template | 1685 | 1704 | 98.88 |
acewiki | User | 583 | 906 | 64.35 |
acewiki | Wikipedia | 62 | 68 | 91.18 |
adywiki | Category | 36 | 38 | 94.74 |
adywiki | Help | 1 | 1 | 100.00 |
adywiki | Main/Article | 555 | 564 | 98.40 |
adywiki | MediaWiki | 25 | 28 | 89.29 |
adywiki | Module | 8 | 8 | 100.00 |
adywiki | Template | 1318 | 1321 | 99.77 |
adywiki | User | 44 | 62 | 70.97 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
zh_min_nanwiki | Wikipedia | 570 | 1706 | 33.41 |
zh_yuewiki | Category | 22565 | 23564 | 95.76 |
zh_yuewiki | File | 2037 | 2048 | 99.46 |
zh_yuewiki | Help | 20 | 27 | 74.07 |
zh_yuewiki | Main/Article | 97554 | 105811 | 92.20 |
zh_yuewiki | MediaWiki | 1706 | 1785 | 95.57 |
zh_yuewiki | Module | 312 | 334 | 93.41 |
zh_yuewiki | Portal | 413 | 459 | 89.98 |
zh_yuewiki | Template | 7994 | 8526 | 93.76 |
zh_yuewiki | User | 2045 | 3037 | 67.34 |
zh_yuewiki | Wikipedia | 2219 | 2433 | 91.20 |
zhwiki | Category | 295498 | 389965 | 75.78 |
zhwiki | Draft | 675 | 892 | 75.67 |
zhwiki | File | 50111 | 55899 | 89.65 |
zhwiki | Help | 298 | 427 | 69.79 |
zhwiki | Main/Article | 543143 | 1190778 | 45.61 |
zhwiki | MediaWiki | 8116 | 8451 | 96.04 |
zhwiki | Module | 3115 | 3509 | 88.77 |
zhwiki | Portal | 8374 | 9902 | 84.57 |
zhwiki | Template | 907022 | 936618 | 96.84 |
zhwiki | User | 68774 | 144267 | 47.67 |
zhwiki | Wikipedia | 47756 | 52642 | 90.72 |
zuwiki | Category | 1047 | 1083 | 96.68 |
zuwiki | Help | 1 | 1 | 100.00 |
zuwiki | Main/Article | 6339 | 6412 | 98.86 |
zuwiki | MediaWiki | 14 | 16 | 87.50 |
zuwiki | Module | 26 | 26 | 100.00 |
zuwiki | Template | 865 | 869 | 99.54 |
zuwiki | User | 1323 | 1456 | 90.87 |
zuwiki | Wikipedia | 20 | 25 | 80.00 |