From task description:
"Across all languages, Wikipedia articles created with Content Translation are deleted less often than those created from scratch. For example, in 2020, 3% of new translations were deleted, compared to 12% of other new articles. However, this is not the case for all Wikipedias and some specific wikis have a higher deletion rate for translations. For example, for Indonesian (T219851#5914691) and Telugu (T244769) the deletion ratios for Content Translation were higher compared to other articles created in these wikis."
The purpose of this analysis is to identify and list the number of wikis where the deletion rate of articles created with content translation is higher than the deletion rate for articles created with other tools. Specifically, we want to answer the following questions:
This analysis will be used as a baseline to assess the evolution of deletion rates as improvements are made.
Results are updated quarterly and documented on wiki.
Data comes from the mediawiki_history table and reflects the deletion ratios of main namespace articles that were created using Content Translation compared to the deletion ratio for main namespace articles created without the tool. Bots were excluded.
This data is collected quarterly (every three months) to assess the evolution of deletion rates as improvements are made. This timespan was selected to caputre a sufficient time for editors to review content and avoid seasonalilty effects
Wiki size threshold: We removed wikis where 15 or fewer articles were created with content translation during the reviewed period to reduce noise in the data and focus on wikis with more representative data.
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(tidyverse);
# Tables:
library(gt);
library(gtsummary);
})
#FIXME: Update with parameters
#FIXME: Investigate ability to add time contraint for when the page was deleted
# Update with time period you wish to review
# Q2 October - December 2021
mw_snapshot <- '2022-03'
start_dt <- '2022-01-01'
end_dt <- '2022-03-31'
query <-
"
-- find both cx and non-cx created articles
WITH created_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx,
COUNT(*) AS created_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2022-03'
AND event_timestamp BETWEEN '2022-01-01' and '2022-03-31'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
AND event_type = 'create'
GROUP BY
wiki_db
),
--find all deleted articles that were created with cx
deleted_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx,
COUNT(*) AS deleted_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2022-03'
AND event_timestamp BETWEEN '2022-01-01' and '2022-03-31'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
-- find revisions moved to the archive table
AND event_type = 'create'
AND revision_is_deleted_by_page_deletion = TRUE
-- remove all bots
AND SIZE(event_user_is_bot_by_historical) = 0 -- not a bot
GROUP BY
wiki_db
)
-- main query to aggregate and join sources above
SELECT
created_articles.wiki,
created_cx,
(created_total - created_cx) AS created_non_cx,
deleted_cx,
(deleted_total - deleted_cx) AS deleted_non_cx
FROM created_articles
JOIN deleted_articles ON
created_articles.wiki = deleted_articles.wiki
"
cx_deletion_ratio <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
cx_deletion_ratio_overall <- cx_deletion_ratio %>%
#filter(created_cx > 15) %>% # remove wikis with 15 or fewer articles created using cx
summarise(deleted_cx_pct = paste0(round(sum(deleted_cx)/sum(created_cx) * 100, 2), "%"),
deleted_non_cx_pct = paste0(round(sum(deleted_non_cx)/sum(created_non_cx) * 100, 2), "%"),
deletion_pct_diff = paste0(round((sum(deleted_non_cx)/sum(created_non_cx)*100)-((sum(deleted_cx)/sum(created_cx))*100), 2),"%")
)
cx_deletion_ratio_overall
deleted_cx_pct | deleted_non_cx_pct | deletion_pct_diff |
---|---|---|
<chr> | <chr> | <chr> |
3.08% | 4.97% | 1.89% |
# Add columns with calculated deletion ratio
cx_deletion_ratio_bywiki <- cx_deletion_ratio %>%
#filter(wiki == 'arwiki') %>% # use to find ratios for single wiki
filter(created_cx > 15) %>% # remove wikis with 15 or fewer articles created using cx
mutate(deleted_cx_ratio = deleted_cx/created_cx,
deleted_non_cx_ratio = deleted_non_cx/created_non_cx,
deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx)
))
cx_deletion_higher <- cx_deletion_ratio_bywiki %>%
filter(deletion_ratio_diff < 0) %>% #find wikis with higher cx deletion ratio
summarise(total_wikis = n())
print(paste0("Across all wikis where more than 15 articles have been created with content translation in Q3, there were ",
cx_deletion_higher[1],
" wikis where articles created with content translation were deleted more than articles created without cx"))
[1] "Across all wikis where more than 15 articles have been created with content translation in Q3, there were 13 wikis where articles created with content translation were deleted more than articles created without cx"
cx_deletion_higher_list <- cx_deletion_ratio_bywiki %>%
filter(deletion_ratio_diff < 0)%>% # only wikis where cx deletion ratio is higher
arrange(deletion_ratio_diff) #sort by highest deletion ratio difference
# reformat into table
cx_deletion_higher_list_tbl <- cx_deletion_higher_list %>%
gt() %>%
tab_header(
title = "Wikis with higher deletion ratios for articles created with Content Translation",
subtitle = "Reviewed Time Period: January 2022 through March 2022 (Q3)") %>%
fmt_percent(
columns = 6:8
) %>%
cols_label(wiki = "Wiki project",
created_cx = "Created CX Articles",
created_non_cx = "Created non-CX Articles",
deleted_cx = "Deleted CX Articles",
deleted_non_cx = "Deleted non-CX Articles",
deleted_cx_ratio = "CX Articles Deletion Ratio",
deleted_non_cx_ratio = "Non-CX Articles Deletion Ratio",
deletion_ratio_diff = "Deletion Ratio Difference") %>%
tab_spanner("Created Articles", 2:3) %>%
tab_spanner("Deleted Articles", 4:5) %>%
tab_spanner("Deletion Ratios", 6:8) %>%
tab_footnote(
footnote = "Excludes wikis with 15 or fewer articles created with Content Translation
during the reviewed time period",
locations = cells_column_labels(
columns = 'wiki'
)) %>%
gtsave(
"cx_deletion_higher_wikis_current.html", inline_css = TRUE)
IRdisplay::display_html(data = cx_deletion_higher_list_tbl, file = "cx_deletion_higher_wikis_current.html")
Wikis with higher deletion ratios for articles created with Content Translation | |||||||
---|---|---|---|---|---|---|---|
Reviewed Time Period: January 2022 through March 2022 (Q3) | |||||||
1
Excludes wikis with 15 or fewer articles created with Content Translation
during the reviewed time period
|
cx_deletion_ration_highest <- cx_deletion_ratio_bywiki %>%
arrange(desc(deleted_cx_ratio)) %>% #sort by highest to lowest cx deletion ratio
mutate(deleted_cx_ratio = paste0(round(deleted_cx_ratio *100,2),"%") ,
deleted_non_cx_ratio = paste0(round(deleted_non_cx_ratio *100,2),"%") ,
deletion_ratio_diff = paste0(round(deletion_ratio_diff * 100,2),"%") )
head(cx_deletion_ration_highest, 5)
wiki | created_cx | created_non_cx | deleted_cx | deleted_non_cx | deleted_cx_ratio | deleted_non_cx_ratio | deletion_ratio_diff | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | |
1 | mnwiki | 17 | 378 | 11 | 214 | 64.71% | 56.61% | -8.09% |
2 | ltwiki | 31 | 2996 | 10 | 754 | 32.26% | 25.17% | -7.09% |
3 | etwiki | 17 | 4503 | 5 | 528 | 29.41% | 11.73% | -17.69% |
4 | skwiki | 29 | 2587 | 8 | 424 | 27.59% | 16.39% | -11.2% |
5 | guwiki | 16 | 162 | 3 | 66 | 18.75% | 40.74% | 21.99% |
# Deletion ratios from Q4
query <-
"
-- find all created articles
WITH created_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx,
COUNT(*) AS created_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2022-03'
AND event_timestamp BETWEEN '2021-10-01' and '2021-12-31'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
AND event_type = 'create'
-- remove bots
AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
wiki_db
),
--find all deleted articles
deleted_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx,
COUNT(*) AS deleted_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2022-03'
AND event_timestamp BETWEEN '2021-10-01' and '2021-12-31'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
-- find revisions moved to the archive table
AND event_type = 'create'
AND revision_is_deleted_by_page_deletion = TRUE
-- remove bots
AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
wiki_db
)
-- main query
SELECT
created_articles.wiki,
created_cx,
(created_total - created_cx) AS created_non_cx,
deleted_cx,
(deleted_total - deleted_cx) AS deleted_non_cx
FROM created_articles
JOIN deleted_articles ON
created_articles.wiki = deleted_articles.wiki
"
cx_deletion_ratio_previous <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
cx_deletion_ratio_overall_previous <- cx_deletion_ratio_previous %>%
#filter(created_cx > 15) %>% # remove wikis with 15 or fewer articles created using cx
summarise(deleted_cx_pct = paste0(round(sum(deleted_cx)/sum(created_cx) * 100, 2), "%"),
deleted_non_cx_pct = paste0(round(sum(deleted_non_cx)/sum(created_non_cx) * 100, 2), "%"),
deletion_pct_diff = paste0(round((sum(deleted_non_cx)/sum(created_non_cx)*100)-((sum(deleted_cx)/sum(created_cx))*100), 2),"%")
)
cx_deletion_ratio_overall_previous
deleted_cx_pct | deleted_non_cx_pct | deletion_pct_diff |
---|---|---|
<chr> | <chr> | <chr> |
3.27% | 6% | 2.73% |
cx_deletion_ratio_previous_bywiki <- cx_deletion_ratio_previous %>%
#filter(wiki == 'idwiki') %>%
filter(created_cx > 15) %>%
mutate(deleted_cx_ratio = deleted_cx/created_cx,
deleted_non_cx_ratio = deleted_non_cx/created_non_cx,
deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx)
))
cx_deletion_higher_previous <- cx_deletion_ratio_previous_bywiki %>%
filter(deletion_ratio_diff < 0) %>%
summarise(total_wikis = n())
print(paste0("Across all wikis where more than 15 articles have been created with content translation in the previous quarter, there were ",
cx_deletion_higher_previous[1],
" wikis where articles created with content translation were deleted more than articles created without cx"))
[1] "Across all wikis where more than 15 articles have been created with content translation in the previous quarter, there were 15 wikis where articles created with content translation were deleted more than articles created without cx"
cx_deletion_higher_list_previous <- cx_deletion_ratio_previous_bywiki %>%
filter(deletion_ratio_diff < 0) %>%
arrange(deletion_ratio_diff)
cx_deletion_higher_list_previous
wiki | created_cx | created_non_cx | deleted_cx | deleted_non_cx | deleted_cx_ratio | deleted_non_cx_ratio | deletion_ratio_diff |
---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <dbl> |
ltwiki | 38 | 4426 | 18 | 690 | 0.47368421 | 0.155896972 | -0.3177872381 |
lawiki | 21 | 1084 | 4 | 51 | 0.19047619 | 0.047047970 | -0.1434282200 |
hywiki | 43 | 2326 | 10 | 356 | 0.23255814 | 0.153052451 | -0.0795056890 |
arzwiki | 62 | 170059 | 4 | 222 | 0.06451613 | 0.001305429 | -0.0632106997 |
ocwiki | 20 | 405 | 2 | 15 | 0.10000000 | 0.037037037 | -0.0629629630 |
mrwiki | 98 | 3421 | 7 | 70 | 0.07142857 | 0.020461853 | -0.0509667182 |
fiwiki | 37 | 8219 | 5 | 730 | 0.13513514 | 0.088818591 | -0.0463165441 |
swwiki | 71 | 1673 | 4 | 24 | 0.05633803 | 0.014345487 | -0.0419925410 |
thwiki | 70 | 7320 | 4 | 177 | 0.05714286 | 0.024180328 | -0.0329625293 |
twwiki | 82 | 275 | 6 | 14 | 0.07317073 | 0.050909091 | -0.0222616408 |
iswiki | 20 | 706 | 2 | 55 | 0.10000000 | 0.077903683 | -0.0220963173 |
nlwiki | 237 | 14644 | 41 | 2275 | 0.17299578 | 0.155353728 | -0.0176420521 |
afwiki | 82 | 1309 | 5 | 64 | 0.06097561 | 0.048892284 | -0.0120833256 |
kmwiki | 43 | 557 | 7 | 90 | 0.16279070 | 0.161579892 | -0.0012108054 |
bewiki | 154 | 5133 | 3 | 99 | 0.01948052 | 0.019286967 | -0.0001935528 |
# reformat into table
cx_deletion_higher_list_tbl_previous <- cx_deletion_higher_list_previous %>%
gt() %>%
tab_header(
title = "Wikis with higher deletion ratios for articles created with Content Translation",
subtitle = "Reviewed Time Period: July 2021 through September 2021 (Q1)") %>%
fmt_percent(
columns = 6:8
) %>%
cols_label(wiki = "Wiki project",
created_cx = "Created CX Articles",
created_non_cx = "Created non-CX Articles",
deleted_cx = "Deleted CX Articles",
deleted_non_cx = "Deleted non-CX Articles",
deleted_cx_ratio = "CX Articles Deletion Ratio",
deleted_non_cx_ratio = "Non-CX Articles Deletion Ratio",
deletion_ratio_diff = "Deletion Ratio Difference") %>%
tab_spanner("Created Articles", 2:3) %>%
tab_spanner("Deleted Articles", 4:5) %>%
tab_spanner("Deletion Ratios", 6:8) %>%
tab_footnote(
footnote = "Excludes wikis with 15 or fewer articles created with Content Translation
during the reviewed time period",
locations = cells_column_labels(
columns = 'wiki'
)) %>%
gtsave(
"cx_deletion_higher_wikis_previous.html", inline_css = TRUE)
IRdisplay::display_html(data = cx_deletion_higher_list_tbl_previous, file = "cx_deletion_higher_wikis_previous.html")
Wikis with higher deletion ratios for articles created with Content Translation | |||||||
---|---|---|---|---|---|---|---|
Reviewed Time Period: July 2021 through September 2021 (Q1) | |||||||
1
Excludes wikis with 15 or fewer articles created with Content Translation
during the reviewed time period
|
intersect(cx_deletion_higher_list_previous[1], cx_deletion_higher_list[1])
wiki |
---|
<chr> |
ltwiki |
arzwiki |
bewiki |
This was done in the analysis conducted as part of https://phabricator.wikimedia.org/T286636#7345479 to assess very review timeframes. The team decided to proceed with quarterly updates but leaving this prior analysis here for reference.
Results have not been updated since June 2021.
# Current 6 Months
# Jan - June 2021
query <-
"
-- find both cx and non-cx created articles
WITH created_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx,
COUNT(*) AS created_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2021-08'
AND event_timestamp BETWEEN '2021-01-01' and '2021-06-30'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
AND event_type = 'create'
-- rremove bots
AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
wiki_db
),
--find all deleted articles that were created with cx
deleted_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx,
COUNT(*) AS deleted_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2021-08'
AND event_timestamp BETWEEN '2021-01-01' and '2021-06-30'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
AND event_type = 'create'
-- find revisions moved to the archive table
AND revision_is_deleted_by_page_deletion = TRUE
-- remove bots
AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
wiki_db
)
-- main query to aggregate and join sources above
SELECT
created_articles.wiki,
created_cx,
(created_total - created_cx) AS created_non_cx,
deleted_cx,
(deleted_total - deleted_cx) AS deleted_non_cx
FROM created_articles
JOIN deleted_articles ON
created_articles.wiki = deleted_articles.wiki
"
cx_deletion_ratio_current_6mo <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
cx_deletion_ratio_6cur_overall <- cx_deletion_ratio_current_6mo %>%
summarise(deleted_cx_pct = paste0(round(sum(deleted_cx)/sum(created_cx) * 100, 2), "%"),
deleted_non_cx_pct = paste0(round(sum(deleted_non_cx)/sum(created_non_cx) * 100, 2), "%"),
deletion_pct_diff = paste0(round((sum(deleted_non_cx)/sum(created_non_cx)*100)-((sum(deleted_cx)/sum(created_cx))*100), 2),"%")
)
cx_deletion_ratio_6cur_overall
deleted_cx_pct | deleted_non_cx_pct | deletion_pct_diff |
---|---|---|
<chr> | <chr> | <chr> |
3.6% | 8.47% | 4.87% |
cx_deletion_ratio_current_bywiki <- cx_deletion_ratio_current_6mo %>%
#filter(wiki == 'idwiki') %>%
filter(created_cx > 15) %>% # only review wikis with more than 15 cx articles
mutate(deleted_cx_ratio = deleted_cx/created_cx,
deleted_non_cx_ratio = deleted_non_cx/created_non_cx,
deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx)
))
cx_deletion_higher_current_6mo <- cx_deletion_ratio_current_bywiki %>%
filter(deletion_ratio_diff < 0) %>%
summarise(total_wikis = n())
cx_deletion_higher_current_6mo
total_wikis |
---|
<int> |
20 |
print(paste0("Across all wikis where more than 15 articles have been created with content translation from Jan 2021 - June 2021, there were ",
cx_deletion_higher_current_6mo[1],
" wikis where articles created with content translation were deleted more than articles created without cx"))
[1] "Across all wikis where more than 15 articles have been created with content translation from Jan 2021 - June 2021, there were 20 wikis where articles created with content translation were deleted more than articles created without cx"
cx_deletion_higher_list_current <- cx_deletion_ratio_current_bywiki %>%
filter(deletion_ratio_diff < 0)%>% #only wikis with higher cx deletion ratios
arrange(deletion_ratio_diff)
# reformat into table
cx_deletion_higher_list_6mo_tbl <- cx_deletion_higher_list_current %>%
gt() %>%
tab_header(
title = "Wikis with higher deletion ratios for articles created with Content Translation",
subtitle = "Reviewed Time Period: January 2021 through June 2021") %>%
fmt_percent(
columns = 6:8
) %>%
cols_label(wiki = "Wiki project",
created_cx = "Created CX Articles",
created_non_cx = "Created non-CX Articles",
deleted_cx = "Deleted CX Articles",
deleted_non_cx = "Deleted non-CX Articles",
deleted_cx_ratio = "CX Articles Deletion Ratio",
deleted_non_cx_ratio = "Non-CX Articles Deletion Ratio",
deletion_ratio_diff = "Deletion Ratio Difference") %>%
tab_spanner("Created Articles", 2:3) %>%
tab_spanner("Deleted Articles", 4:5) %>%
tab_spanner("Deletion Ratios", 6:8) %>%
tab_footnote(
footnote = "Excludes wikis with 15 or fewer articles created with Content Translation
during the reviewed time period",
locations = cells_column_labels(
columns = 'wiki'
)) %>%
gtsave(
"cx_deletion_higher_wikis_6mo.html", inline_css = TRUE)
IRdisplay::display_html(data = cx_deletion_higher_list_6mo_tbl, file = "cx_deletion_higher_wikis_6mo.html")
Wikis with higher deletion ratios for articles created with Content Translation | |||||||
---|---|---|---|---|---|---|---|
Reviewed Time Period: January 2021 through June 2021 | |||||||
1
Excludes wikis with 15 or fewer articles created with Content Translation
during the reviewed time period
|
cx_deletion_ration_highest_current <- cx_deletion_ratio_current_bywiki %>%
arrange(desc(deleted_cx_ratio)) %>%
mutate(deleted_cx_ratio = paste0(round(deleted_cx_ratio *100,2),"%") ,
deleted_non_cx_ratio = paste0(round(deleted_non_cx_ratio *100,2),"%") ,
deletion_ratio_diff = paste0(round(deletion_ratio_diff * 100,2),"%") )
head(cx_deletion_ration_highest_current, 5)
wiki | created_cx | created_non_cx | deleted_cx | deleted_non_cx | deleted_cx_ratio | deleted_non_cx_ratio | deletion_ratio_diff | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | |
1 | ltwiki | 45 | 4254 | 17 | 2041 | 37.78% | 47.98% | 10.2% |
2 | hawwiki | 68 | 128 | 25 | 25 | 36.76% | 19.53% | -17.23% |
3 | mnwiki | 30 | 1265 | 10 | 542 | 33.33% | 42.85% | 9.51% |
4 | iswiki | 30 | 2157 | 7 | 140 | 23.33% | 6.49% | -16.84% |
5 | kawiki | 170 | 10010 | 33 | 1415 | 19.41% | 14.14% | -5.28% |
Lithuanian Wikipedia had the highest deletion ratio for articles created with content translation. 37.8% of all articles created with content translation rate were deleted; however, this was still less than the percent of non content translated article deletion ratio (47.9%).
The Wiki that had the highest different in deletion ratios was Hawaiian Wikipedia. 36.8% of all articles created with cx were deleted during the reviewed time period comparted to 19.5% of articles created without content translation.
# Previous 6 Months
# July 2020 - December 2020
query <-
"
-- find both cx and non-cx created articles
WITH created_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS created_cx,
COUNT(*) AS created_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2021-08'
AND event_timestamp BETWEEN '2020-07-01' and '2020-12-31'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
AND event_type = 'create'
-- remove bots
AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
wiki_db
),
--find all deleted articles that were created with cx
deleted_articles AS (
SELECT
wiki_db AS wiki,
SUM(CAST(ARRAY_CONTAINS(revision_tags, 'contenttranslation') AS INT)) AS deleted_cx,
COUNT(*) AS deleted_total
FROM wmf.mediawiki_history
WHERE
snapshot = '2021-08'
AND event_timestamp BETWEEN '2020-07-01' and '2020-12-31'
-- interested in main page namespaces
AND page_namespace = 0
-- only look at new page creations
AND revision_parent_id = 0
AND event_entity = 'revision'
-- find revisions moved to the archive table
AND event_type = 'create'
AND revision_is_deleted_by_page_deletion = TRUE
-- remove bots
AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
wiki_db
)
-- main query to aggregate and join sources above
SELECT
created_articles.wiki,
created_cx,
(created_total - created_cx) AS created_non_cx,
deleted_cx,
(deleted_total - deleted_cx) AS deleted_non_cx
FROM created_articles
JOIN deleted_articles ON
created_articles.wiki = deleted_articles.wiki
"
cx_deletion_ratio_previous_6mo <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
cx_deletion_ratio_bywiki_previous <- cx_deletion_ratio_previous_6mo %>%
#filter(wiki == 'idwiki') %>%
filter(created_cx > 15) %>% # only wikis with at leat 15 created articles
mutate(deleted_cx_ratio = deleted_cx/created_cx,
deleted_non_cx_ratio = deleted_non_cx/created_non_cx,
deletion_ratio_diff = ((deleted_non_cx/created_non_cx)-(deleted_cx/created_cx)
))
cx_deletion_higher_previous <- cx_deletion_ratio_bywiki_previous %>%
filter(deletion_ratio_diff < 0) %>%
summarise(total_wikis = n())
cx_deletion_higher_previous
total_wikis |
---|
<int> |
21 |
print(paste0("Across all wikis where more than 15 articles have been created with content translation between July 2020 and December 2020, there were ",
cx_deletion_higher_previous[1],
" wikis where articles created with content translation were deleted more than articles created without cx"))
[1] "Across all wikis where more than 15 articles have been created with content translation between July 2020 and December 2020, there were 21 wikis where articles created with content translation were deleted more than articles created without cx"
The number of wikis with higher content translation deletion ratios decreased by 1 from July 2020 to December 2020 to January 2021 to June 2021.
We next compared the two lists of wikis to confirm if most of the wikis with higher deletion rates were the same across each quarter.
cx_deletion_higher_list_previous <- cx_deletion_ratio_bywiki_previous %>%
filter(deletion_ratio_diff < 0) %>%
arrange(deletion_ratio_diff)
cx_deletion_higher_list_previous
wiki | created_cx | created_non_cx | deleted_cx | deleted_non_cx | deleted_cx_ratio | deleted_non_cx_ratio | deletion_ratio_diff |
---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <dbl> |
fywiki | 17 | 1755 | 14 | 65 | 0.82352941 | 0.037037037 | -0.786492375 |
hawwiki | 42 | 132 | 31 | 24 | 0.73809524 | 0.181818182 | -0.556277056 |
ltwiki | 59 | 3337 | 28 | 644 | 0.47457627 | 0.192987714 | -0.281588558 |
iswiki | 26 | 2000 | 7 | 155 | 0.26923077 | 0.077500000 | -0.191730769 |
lawiki | 48 | 2979 | 9 | 158 | 0.18750000 | 0.053037932 | -0.134462068 |
hywiki | 159 | 33338 | 22 | 1080 | 0.13836478 | 0.032395465 | -0.105969315 |
azwiki | 206 | 29671 | 29 | 1885 | 0.14077670 | 0.063530046 | -0.077246653 |
arywiki | 63 | 2443 | 5 | 50 | 0.07936508 | 0.020466639 | -0.058898440 |
mywiki | 313 | 6698 | 37 | 439 | 0.11821086 | 0.065541953 | -0.052668910 |
cywiki | 122 | 1451 | 13 | 85 | 0.10655738 | 0.058580289 | -0.047977088 |
vecwiki | 20 | 10293 | 1 | 46 | 0.05000000 | 0.004469057 | -0.045530943 |
arzwiki | 133 | 355316 | 4 | 730 | 0.03007519 | 0.002054509 | -0.028020679 |
eowiki | 277 | 10800 | 8 | 90 | 0.02888087 | 0.008333333 | -0.020547533 |
zhwiki | 1512 | 80866 | 137 | 5753 | 0.09060847 | 0.071142384 | -0.019466082 |
dewiki | 505 | 119158 | 87 | 18351 | 0.17227723 | 0.154005606 | -0.018271622 |
zh_yuewiki | 35 | 23696 | 1 | 267 | 0.02857143 | 0.011267725 | -0.017303704 |
ckbwiki | 64 | 2901 | 5 | 183 | 0.07812500 | 0.063081696 | -0.015043304 |
kuwiki | 402 | 5291 | 13 | 133 | 0.03233831 | 0.025137025 | -0.007201283 |
fiwiki | 138 | 20467 | 12 | 1680 | 0.08695652 | 0.082083354 | -0.004873168 |
etwiki | 55 | 8239 | 6 | 865 | 0.10909091 | 0.104988469 | -0.004102440 |
bswiki | 62 | 2677 | 6 | 254 | 0.09677419 | 0.094882331 | -0.001891863 |
intersect(cx_deletion_higher_list_current[1], cx_deletion_higher_list_previous[1])
wiki |
---|
<chr> |
hawwiki |
iswiki |
kuwiki |
arywiki |
arzwiki |
fiwiki |
lawiki |
eowiki |
There were 8 wikis that had higher deletion ratios for content translated articles both quarters.