For the 2022-2023 fiscal year, the Editing Team is considering centering the work we will do around people A) who are contributing to a Wikipedia from sub-Saharan Africa and B) who have made fewer than 100 cumulative edits to Wikipedia.
Before moving forward with defining the audience in the ways "A)" and "B)" describe, we would like to understand:
Knowing the above will help us determine what projects we will consider partnering most closely with in the 2022-2023 fiscal year.
Knowing the above will help us determine how long we can expect the quantitative analyses we have planned to reach statistic significance.
wmf.editors_daily
and wmf.mediawiki_history
table.wmf.editors_daily
is only retained for the last two calendar months.shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(tidyverse); library(glue); library(lubridate); library(scales)
})
query <-
"
SELECT
mwh.event_user_id as user_id,
CASE
WHEN min(event_user_revision_count) = 1 THEN 'first_time_editor'
WHEN min(event_user_revision_count) < 100 THEN 'under 100'
WHEN (min(event_user_revision_count) >=100 AND min(event_user_revision_count <= 500)) THEN '100-500'
ELSE 'over 500'
END AS edit_count_group,
mwh.wiki_db AS project
FROM
wmf.mediawiki_history mwh
INNER JOIN canonical_data.wikis
ON wiki_db = database_code and
database_group == 'wikipedia'
INNER JOIN
wmf.editors_daily ed
ON mwh.event_user_id = CAST(ed.user_fingerprint_or_id AS BIGINT)
AND mwh.wiki_db = ed.wiki_db
WHERE
-- from sug-saharan Africa region
ed.country_code IN ('AO','BJ', 'BW','BF','BI','CM','CV','TD','KM','CI', 'DJ', 'GQ', 'ER', 'ET', 'GA','GM','GH',
'GN','GW','KE','LS','LR','MG','MW','ML','MU', 'MZ', 'NA', 'NE', 'NG', 'CD', 'RW', 'ST', 'SN', 'SC', 'SL', 'SO', 'ZA',
'SD', 'TZ','CF', 'CD', 'TG', 'UG', 'ZM','ZW')
AND ed.month = '2022-05'
AND mwh.snapshot = '2022-05'
AND mwh.event_timestamp >= '2022-05-01'
AND mwh.event_timestamp <= '2022-06-01'
AND ed.user_is_anonymous = FALSE
AND mwh.event_user_is_anonymous = FALSE
GROUP BY
mwh.event_user_id,
mwh.wiki_db
"
first_time_editors_may <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
first_time_editors_total_may <- first_time_editors_may %>%
#filter(edit_count_group == 'first_time_editor') %>%
filter(project == 'enwiki') %>%
summarise(n_users = n_distinct(user_id))
first_time_editors_total_may
n_users |
---|
<int> |
10534 |
query <-
"
SELECT
mwh.event_user_id as user_id,
CASE
WHEN min(event_user_revision_count) = 1 THEN 'first_time_editor'
WHEN min(event_user_revision_count) < 100 THEN 'under 100'
WHEN (min(event_user_revision_count) >=100 AND min(event_user_revision_count <= 500)) THEN '100-500'
ELSE 'over 500'
END AS edit_count_group,
mwh.wiki_db AS project
FROM
wmf.mediawiki_history mwh
INNER JOIN canonical_data.wikis
ON wiki_db = database_code and
database_group == 'wikipedia'
INNER JOIN
wmf.editors_daily ed
ON mwh.event_user_id = CAST(ed.user_fingerprint_or_id AS BIGINT)
AND mwh.wiki_db = ed.wiki_db
WHERE
ed.country_code IN ('AO','BJ', 'BW','BF','BI','CM','CV','TD','KM','CI', 'DJ', 'GQ', 'ER', 'ET', 'GA','GM','GH',
'GN','GW','KE','LS','LR','MG','MW','ML','MU', 'MZ', 'NA', 'NE', 'NG', 'CD', 'RW', 'ST', 'SN', 'SC', 'SL', 'SO', 'ZA',
'SD', 'TZ','CF', 'CD', 'TG', 'UG', 'ZM','ZW')
AND ed.month = '2022-04'
AND mwh.snapshot = '2022-05'
AND mwh.event_timestamp >= '2022-04-01'
AND mwh.event_timestamp <= '2022-05-01'
AND ed.user_is_anonymous = FALSE
AND mwh.event_user_is_anonymous = FALSE
GROUP BY
mwh.event_user_id,
mwh.wiki_db
"
first_time_editors_april <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
first_time_editors_total_april <- first_time_editors_april %>%
filter(edit_count_group == 'first_time_editor') %>%
summarise(n_users = n_distinct(user_id))
first_time_editors_total_april
n_users |
---|
<int> |
3072 |
Note: We'll use May to get a more recent depication.
first_time_editors_byproject <- first_time_editors_may %>%
filter(edit_count_group %in% c('first_time_editor', 'under 100')) %>%
group_by(project) %>%
summarise(n_users = n_distinct(user_id)) %>%
mutate(pct_users = paste0(round(prop.table(n_users) * 100, 2), "%")) %>%
arrange(desc(n_users))
head(first_time_editors_byproject, 15)
`summarise()` ungrouping output (override with `.groups` argument)
project | n_users | pct_users |
---|---|---|
<chr> | <int> | <chr> |
enwiki | 3017 | 68.17% |
frwiki | 711 | 16.06% |
ptwiki | 143 | 3.23% |
arwiki | 97 | 2.19% |
swwiki | 67 | 1.51% |
igwiki | 41 | 0.93% |
hawiki | 37 | 0.84% |
simplewiki | 32 | 0.72% |
yowiki | 22 | 0.5% |
dewiki | 21 | 0.47% |
rwwiki | 21 | 0.47% |
afwiki | 18 | 0.41% |
sowiki | 14 | 0.32% |
twwiki | 13 | 0.29% |
eswiki | 12 | 0.27% |