shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(tidyverse); library(glue); library(lubridate); library(scales)
})
# Collect all events from new instrumentation to review
query <-
"SELECT
date_format(dt, 'yyyy-MM-dd') AS block_time,
block_type,
block_expiry,
block_id,
block_scope,
interface,
country_code,
platform,
`database`,
page_id,
page_namespace,
rev_id,
performer.user_id,
performer.user_edit_count,
http.client_ip
FROM
event.mediawiki_editattempt_block
WHERE
YEAR = 2022
"
edits_blocked_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
edits_blocked_bytype <- edits_blocked_events %>%
group_by(block_type) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_users = round(n_users / sum(n_users), 3))
edits_blocked_bytype
`summarise()` ungrouping output (override with `.groups` argument)
block_type | n_events | n_users | pct_users |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
autoblock | 1520 | 124 | 0.017 |
ip | 1466221 | 1177 | 0.162 |
range | 6598136 | 3169 | 0.437 |
user | 10860 | 2782 | 0.384 |
edits_blocked_byplatform <- edits_blocked_events %>%
group_by(platform) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_events = round(n_events / sum(n_events), 3))
edits_blocked_byplatform
`summarise()` ungrouping output (override with `.groups` argument)
platform | n_events | n_users | pct_events |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
desktop | 7402183 | 5691 | 0.916 |
mobile | 674554 | 1601 | 0.084 |
edits_blocked_byinterface <- edits_blocked_events %>%
group_by(interface) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_users = round(n_users / sum(n_users), 3))
edits_blocked_byinterface
`summarise()` ungrouping output (override with `.groups` argument)
interface | n_events | n_users | pct_users |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
discussiontools | 7880 | 504 | 0.062 |
mobilefrontend | 673580 | 1583 | 0.193 |
visualeditor | 213207 | 2094 | 0.256 |
wikieditor | 7182070 | 4009 | 0.489 |
edits_blocked_byanon <- edits_blocked_events %>%
mutate(isanon = ifelse(user_id == 0, "true", "false")) %>%
group_by(isanon) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id))
edits_blocked_byanon
`summarise()` ungrouping output (override with `.groups` argument)
isanon | n_events | n_users |
---|---|---|
<chr> | <int> | <int> |
false | 21455 | 6993 |
true | 8055282 | 1 |
Confirmed both blocks for anon and registered users are included. All anon users are correctly tagged with a user id of 0.
edits_blocked_byscope <- edits_blocked_events %>%
group_by(block_scope) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_events = round(n_events / sum(n_events), 3))
edits_blocked_byscope
`summarise()` ungrouping output (override with `.groups` argument)
block_scope | n_events | n_users | pct_events |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
global | 2878909 | 1197 | 0.356 |
local | 5197828 | 5893 | 0.644 |
edits_blocked_byexpiry <- edits_blocked_events %>%
mutate(is_infinite = ifelse(block_expiry == 'infinity', "true", "false")) %>%
group_by(is_infinite) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_users = round(n_users / sum(n_users), 3))
edits_blocked_byexpiry
`summarise()` ungrouping output (override with `.groups` argument)
is_infinite | n_events | n_users | pct_users |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
false | 7828840 | 4386 | 0.622 |
true | 247897 | 2661 | 0.378 |
Both infinite and non-infinite blocks have been logged.
#quick check of block expiration dates
edits_blocked_exp_date <- edits_blocked_events %>%
group_by(block_expiry) %>%
summarise(n_events = n()) %>%
arrange(block_expiry)
head(edits_blocked_exp_date)
`summarise()` ungrouping output (override with `.groups` argument)
block_expiry | n_events |
---|---|
<chr> | <int> |
2022-10-12T22:41:35Z | 10 |
2022-10-12T23:08:11Z | 1 |
2022-10-12T23:10:04Z | 11 |
2022-10-13T00:06:06Z | 1 |
2022-10-13T00:13:35Z | 2 |
2022-10-13T00:19:09Z | 9 |
The earliest block expirated date logged is on 12 October 2022, which is the date we started logging events. Let's make sure that also the earliest time logged for when the blocks occurred.
#quick check of block expiration dates
edits_blocked_date <- edits_blocked_events %>%
group_by(block_time) %>%
summarise(n_events = n()) %>%
arrange(block_time)
head(edits_blocked_date)
`summarise()` ungrouping output (override with `.groups` argument)
block_time | n_events |
---|---|
<chr> | <int> |
2022-10-12 | 95976 |
2022-10-13 | 1116715 |
2022-10-14 | 1074355 |
2022-10-15 | 1018007 |
2022-10-16 | 1078969 |
2022-10-17 | 1148724 |
We first start logging events on `2022-10-12` as expected.
#quick check of block expiration dates
edits_blocked_bycountry <- edits_blocked_events %>%
group_by(country_code) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_users = round(n_users / sum(n_users), 3)) %>%
arrange(desc(n_events))
head(edits_blocked_bycountry)
`summarise()` ungrouping output (override with `.groups` argument)
country_code | n_events | n_users | pct_users |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
US | 2506830 | 1410 | 0.185 |
CA | 1390084 | 200 | 0.026 |
DE | 719657 | 343 | 0.045 |
RU | 653564 | 391 | 0.051 |
HK | 309201 | 189 | 0.025 |
BE | 303395 | 38 | 0.005 |
There's some instances where the country code field is logged as a long string vs a country code. Taking a close look at those:
edits_blocked_bycountry_errors <- edits_blocked_events %>%
group_by(country_code) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_users = round(n_users / sum(n_users), 3)) %>%
filter(str_length(country_code) > 2) #find values that are are not 2 digit country code
head (edits_blocked_bycountry_errors)
`summarise()` ungrouping output (override with `.groups` argument)
country_code | n_events | n_users | pct_users |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
'"()&%<acx><ScRiPt >1pVo(9336)</ScRiPt> | 1 | 1 | 0 |
'"()&%<acx><ScRiPt >1TBU(9672)</ScRiPt> | 1 | 1 | 0 |
'"()&%<acx><ScRiPt >23hz(9646)</ScRiPt> | 1 | 1 | 0 |
'"()&%<acx><ScRiPt >26PX(9583)</ScRiPt> | 1 | 1 | 0 |
'"()&%<acx><ScRiPt >2asY(9963)</ScRiPt> | 1 | 1 | 0 |
'"()&%<acx><ScRiPt >2NsB(9331)</ScRiPt> | 1 | 1 | 0 |
# find percent of these occurences
edits_blocked_bycountry_errors <- edits_blocked_events %>%
mutate(error = ifelse(str_length(country_code) > 2, "country_code_error", "normal")) %>%
group_by(error) %>%
summarise(n_events = n(),
n_users = n_distinct(user_id)) %>%
mutate(pct_events = round(n_events / sum(n_events), 3))
edits_blocked_bycountry_errors
`summarise()` ungrouping output (override with `.groups` argument)
error | n_events | n_users | pct_events |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
country_code_error | 90518 | 7 | 0.011 |
normal | 7986215 | 6990 | 0.989 |
NA | 4 | 1 | 0.000 |
#quick check of block expiration dates
edits_blocked_bynp <- edits_blocked_events %>%
group_by(page_namespace) %>%
summarise(n_events = n())
edits_blocked_bynp
`summarise()` ungrouping output (override with `.groups` argument)
page_namespace | n_events |
---|---|
<int> | <int> |
0 | 6979785 |
1 | 86777 |
2 | 109794 |
3 | 124554 |
4 | 178533 |
5 | 17867 |
6 | 38155 |
7 | 697 |
8 | 5739 |
9 | 311 |
10 | 316506 |
11 | 23257 |
12 | 12286 |
13 | 1317 |
14 | 107361 |
15 | 1419 |
90 | 1707 |
92 | 6 |
100 | 21575 |
101 | 489 |
102 | 9854 |
103 | 2117 |
104 | 5711 |
105 | 105 |
106 | 1423 |
107 | 45 |
108 | 152 |
109 | 2 |
110 | 145 |
111 | 1 |
⋮ | ⋮ |
121 | 107 |
124 | 1 |
128 | 149 |
129 | 2 |
130 | 3 |
132 | 80 |
133 | 1 |
134 | 47 |
136 | 9 |
200 | 3904 |
201 | 508 |
202 | 1264 |
203 | 131 |
206 | 31 |
207 | 1 |
250 | 118 |
252 | 18 |
460 | 3 |
470 | 24 |
471 | 19 |
482 | 2 |
486 | 26 |
487 | 1 |
710 | 4 |
828 | 18375 |
829 | 229 |
866 | 22 |
867 | 1 |
1198 | 964 |
1199 | 4 |
edits_blocked_byanon_count <- edits_blocked_events %>%
mutate(isanon = ifelse(user_id == 0, "true", "false")) %>%
filter(isanon == 'true') %>%
group_by(isanon, user_edit_count) %>%
summarise(n_events = n())
edits_blocked_byanon_count
`summarise()` regrouping output by 'isanon' (override with `.groups` argument)
isanon | user_edit_count | n_events |
---|---|---|
<chr> | <int> | <int> |
true | 0 | 8055282 |
#quick check of block expiration dates
edits_blocked_bydatabase <- edits_blocked_events %>%
group_by(database) %>%
summarise(n_events = n()) %>%
arrange(desc(n_events))
head(edits_blocked_bydatabase)
`summarise()` ungrouping output (override with `.groups` argument)
database | n_events |
---|---|
<chr> | <int> |
enwiki | 4638043 |
zhwiki | 264465 |
frwiki | 263387 |
ruwiki | 189636 |
dewiki | 166379 |
ptwiki | 130881 |
Most blocks occur on enwiki.