shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(tidyverse); library(glue); library(lubridate); library(scales)
})
# Collect talk page edit events
query <-
"SELECT
component_type,
topic_id,
comment_parent_id,
comment_id,
page_namespace,
performer.user_id As `user`,
performer.user_is_anonymous As is_anon,
revision_id,
session_id,
integration,
action,
performer.user_edit_count As edit_count,
performer.user_edit_count_bucket As edit_count_bucket,
`database`,
meta.dt
FROM
event.mediawiki_talk_page_edit
WHERE
year = 2021
AND month >= 11
"
collect_talk_events <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
collect_talk_events$dt <- as.Date(collect_talk_events$dt, format = "%Y-%m-%d")
daily_events <- collect_talk_events %>%
group_by(dt) %>%
count() %>%
arrange(dt)
daily_events
dt | n |
---|---|
<date> | <int> |
2021-11-03 | 403 |
2021-11-04 | 4796 |
2021-11-05 | 13888 |
2021-11-06 | 14753 |
2021-11-07 | 16021 |
2021-11-08 | 19874 |
2021-11-09 | 17606 |
2021-11-10 | 16684 |
2021-11-11 | 21075 |
2021-11-12 | 15505 |
2021-11-13 | 14588 |
2021-11-14 | 19743 |
2021-11-15 | 15532 |
2021-11-16 | 16373 |
2021-11-17 | 16449 |
2021-11-18 | 15766 |
2021-11-19 | 15598 |
2021-11-20 | 13264 |
2021-11-21 | 13743 |
2021-11-22 | 17581 |
2021-11-23 | 16711 |
2021-11-24 | 16896 |
2021-11-25 | 16384 |
2021-11-26 | 15226 |
2021-11-27 | 17894 |
2021-11-28 | 14562 |
2021-11-29 | 15343 |
2021-11-30 | 17714 |
2021-12-01 | 16902 |
2021-12-02 | 16040 |
2021-12-03 | 16878 |
2021-12-04 | 13760 |
2021-12-05 | 16483 |
2021-12-06 | 15296 |
2021-12-07 | 16903 |
2021-12-08 | 16238 |
2021-12-09 | 11157 |
2021-12-10 | 1411 |
We start recording events on November 3rd, which is when the change was deployed.
talk_events_total <- collect_talk_events %>%
summarise(total_events = n())
talk_events_total
total_events |
---|
<int> |
571040 |
# Number of logged in users
talk_users_unique <- collect_talk_events %>%
filter(is_anon == 'false') %>%
summarise(unique_users = n_distinct(user))
talk_users_unique
unique_users |
---|
<int> |
40146 |
talk_users_anon <- collect_talk_events %>%
filter(is_anon == 'true'| user == 0 ) %>%
summarise( n_events = n())
talk_users_anon
n_events |
---|
<int> |
93 |
We are now logging logged out events
talk_events_byeditcount <- collect_talk_events %>%
group_by(edit_count_bucket) %>%
summarise(total_events = n())
talk_events_byeditcount
`summarise()` ungrouping output (override with `.groups` argument)
edit_count_bucket | total_events |
---|---|
<chr> | <int> |
1-4 edits | 3378 |
100-999 edits | 38773 |
1000+ edits | 500849 |
5-99 edits | 27947 |
N/A | 93 |
ISSUE: We do not appear to be logging any edits in the 1-4 edit bucket group. UPDATE: We are now logging edit count bucket events correctly. All logged out users are assigned the 'NA' value.
talk_events_byintergration <- collect_talk_events %>%
group_by(integration) %>%
summarise(total_events = n())
talk_events_byintergration
`summarise()` ungrouping output (override with `.groups` argument)
integration | total_events |
---|---|
<chr> | <int> |
discussiontools | 81125 |
page | 489915 |
We've recorded talk page events using discussion tools and on the page.
talk_events_bywiki <- collect_talk_events %>%
group_by(database) %>%
summarise(total_events = n())
talk_events_bywiki
`summarise()` ungrouping output (override with `.groups` argument)
database | total_events |
---|---|
<chr> | <int> |
afwiki | 69 |
alswiki | 171 |
amwiki | 2 |
amwiktionary | 1 |
arwiki | 1163 |
arwikiquote | 8 |
arwikisource | 1 |
arywiki | 4 |
arzwiki | 155 |
astwiki | 11 |
aswiki | 2 |
azbwiki | 24 |
azwiki | 293 |
banwiki | 14 |
bat_smgwiki | 1 |
bawiki | 66 |
bclwiktionary | 4 |
be_x_oldwiki | 18 |
bewiki | 129 |
bewikiquote | 1 |
bewikisource | 1 |
bewiktionary | 69 |
bgwiki | 356 |
bnwiki | 459 |
bnwikibooks | 5 |
bnwikisource | 18 |
bnwiktionary | 57 |
brwiki | 11 |
bswiki | 52 |
cawiki | 498 |
⋮ | ⋮ |
trwikiquote | 1 |
trwikisource | 23 |
trwikivoyage | 2 |
trwiktionary | 78 |
ttwiki | 13 |
twwiki | 2 |
uawikimedia | 67 |
ukwiki | 1551 |
ukwikibooks | 2 |
ukwikisource | 25 |
uzwiki | 63 |
vecwiki | 41 |
viwiki | 1658 |
viwikisource | 1 |
viwikivoyage | 3 |
viwiktionary | 22 |
warwiki | 2 |
wikidatawiki | 1603 |
wikimaniawiki | 3 |
yowiki | 3 |
zh_classicalwiki | 2 |
zh_yuewiki | 84 |
zhwiki | 4635 |
zhwikibooks | 3 |
zhwikinews | 51 |
zhwikiquote | 6 |
zhwikisource | 375 |
zhwikiversity | 8 |
zhwikivoyage | 23 |
zhwiktionary | 16 |
talk_events_bycomponent <- collect_talk_events %>%
group_by(component_type) %>%
summarise(total_events = n())
talk_events_bycomponent
`summarise()` ungrouping output (override with `.groups` argument)
component_type | total_events |
---|---|
<chr> | <int> |
comment | 253440 |
response | 317600 |
There are more responses than comments. We are only recording comment and response.
talk_events_byaction <- collect_talk_events %>%
group_by(action) %>%
summarise(total_events = n())
talk_events_byaction
`summarise()` ungrouping output (override with `.groups` argument)
action | total_events |
---|---|
<chr> | <int> |
publish | 571040 |
All recorded events have been logged as publish events as expected.
talk_events_bynamespace <- collect_talk_events %>%
group_by(page_namespace) %>%
summarise(total_events = n()) %>%
arrange(desc(total_events))
talk_events_bynamespace
`summarise()` ungrouping output (override with `.groups` argument)
page_namespace | total_events |
---|---|
<int> | <int> |
3 | 224801 |
4 | 146651 |
1 | 137068 |
5 | 35063 |
103 | 7905 |
11 | 6896 |
101 | 1615 |
0 | 1283 |
119 | 1182 |
15 | 1156 |
102 | 1023 |
13 | 1013 |
105 | 861 |
7 | 837 |
107 | 666 |
12 | 617 |
829 | 577 |
9 | 573 |
104 | 452 |
121 | 249 |
201 | 208 |
100 | 152 |
106 | 87 |
109 | 28 |
111 | 19 |
113 | 17 |
253 | 13 |
2 | 12 |
115 | 5 |
133 | 4 |
487 | 3 |
147 | 2 |
203 | 1 |
251 | 1 |
The most events have occured on User Talk pages (38.9%), followed by project talk (27%) and then article talk pages (24%). There are some events that occur on non talk pages. 364 events were recored on article (main) namespace pages. - Do we know what might cause this.
Per current set-up, comment_id
will change with every event -- it's the ID of the parent that was responded to. For the very first comment in a thread that'll be the ID of the topic (but the component_type will be comment
rather than response
and comment_id
will equal topic_id
, so this is easy to distinguish).
#find any instances where the comment id is equal to the comment parent id
comment_id_check <- collect_talk_events %>%
filter(collect_talk_events$comment_id
== collect_talk_events$comment_parent_id)
comment_id_check
component_type | topic_id | comment_parent_id | comment_id | page_namespace | user | is_anon | revision_id | session_id | integration | action | edit_count | edit_count_bucket | database | dt |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <int> | <int> | <chr> | <int> | <chr> | <chr> | <chr> | <int> | <chr> | <chr> | <date> |
No instances where the comment_id is equal to comment_parent_id as expected
# find top_level comments where the comment id is equal to the topic id. Should all be lableld comment
top_level_commment <- collect_talk_events %>%
filter(collect_talk_events$comment_parent_id
== collect_talk_events$topic_id & component_type == 'comment')
There are several top level comments, indicated by a comment_parent_id that matches a topic_id. All of these are labeled as comments as expected. Next we'll check that all responses are labled correctly.
# make sure responses are labeled correctly
response_check <- collect_talk_events %>%
filter(collect_talk_events$comment_parent_id
== collect_talk_events$topic_id & component_type == 'response')
response_check
component_type | topic_id | comment_parent_id | comment_id | page_namespace | user | is_anon | revision_id | session_id | integration | action | edit_count | edit_count_bucket | database | dt |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <int> | <int> | <chr> | <int> | <chr> | <chr> | <chr> | <int> | <chr> | <chr> | <date> |
Confirmed there are no instances where a response event has the same comment_parent_id and topic_id.
non_top_level_comments <- collect_talk_events %>%
filter(collect_talk_events$comment_parent_id
!= collect_talk_events$topic_id & component_type == 'comment')
There are component_type = comment events where the comment_parent_id does not match the topic_id. Is this expected? What's the difference between this and a response?
Check that I can perform the joins necessary to calculate the KPIs identifed for this data:
If there's a response, then it should be listed as a parent_id
- You can find comments and topics without a response by looking for new comment_ids that are not also labeled as comment_parent_ids
comments_with_response <- collect_talk_events %>%
filter(comment_id %in% comment_parent_id) # confirms that the comment recieved a response at some point
Steps would be to find all sessions
# Check to see if you can find comment_parent_id in the comment_id list
comments_without_response <- collect_talk_events %>%
filter(comment_parent_id %in% comment_id) # confirms that the comment recieved a response at some point
initial_comment_data <- collect_talk_events %>%
filter(comment_parent_id == 'c-Barter84-2021-11-09T16:17:00.000Z-Djhuty-2021-11-08T15:14:00.000Z')
initial_comment_data
response_comment_data <- collect_talk_events %>%
filter(comment_id == 'c-Barter84-2021-11-09T16:17:00.000Z-Djhuty-2021-11-08T15:14:00.000Z')
response_comment_data
# Check if there are duplicate comment_id events
duplicate_comments <- collect_talk_events %>%
group_by(comment_id) %>%
filter(n()>1)
Potential Issues: There's s number of events with the same comment_id logged. Different revision and sessions ids. Why would this happen?
query <-
"SELECT
tpe.session_id,
eas.event.editing_session_id,
tpe.performer.user_id,
eas.event.user_id
FROM
event.mediawiki_talk_page_edit tpe
LEFT JOIN
event.editattemptstep eas
ON session_id = eas.event.editing_session_id
AND eas.year = 2021 and eas.MOnth >= 11
WHERE
tpe.year = 2021
and tpe.month = 11
AND tpe.integration = 'discussiontools'
"
collect_talk_events_weditor <- wmfdata::query_hive(query)
Don't forget to authenticate with Kerberos using kinit
collect_talk_events_weditor %>%
filter(editing_session_id == 'NULL')
There's some sessions in editattemptstep but not all. All discussiontools related sessions are in editattemptstep but not all page related sessions. This is likely due to sampling differences. Need to see if we can match sampling rates. That's where we get platform data so we'll need to apply the same sampling rates as possible.