In [1]:
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(tidyverse); 
library(lubridate); 
library(scales);
library(magrittr); 
library(dplyr);
# Modeling 
library(effsize);
})
In [112]:
library(IRdisplay)

display_html(
'<script>  
code_show=true; 
function code_toggle() {
  if (code_show){
    $(\'div.input\').hide();
  } else {
    $(\'div.input\').show();
  }
  code_show = !code_show
}  
$( document ).ready(code_toggle);
</script>
  <form action="javascript:code_toggle()">
    <input type="submit" value="Click here to toggle on/off the raw code.">
 </form>'
)
In [3]:
options(repr.plot.width = 15, repr.plot.height = 10)

Language Switching A/B Test Analysis

Jennifer Wang, Staff Data Scientist, Wikimedia Foundation

8 October 2021

TICKET

Introduction

As a part of the Desktop Improvements project, The Wikimedia Foundation's Web team is introducing a new way to switch to a different language wiki. The original language list appears on the sidebar. The new solution replaces the original one by providing 1) a language option button on the top of the page, 2) Selecting the language button will open the list of languages that will contain the suggested languages for each user, as well as a full list of all available languages for that article. You can find more information on this change and other feature deployments on the Language switching project page.

For the logged-in users, an AB test was performed on the early adopter wikis except fawiki between June 22, 2021 and July 20, 2021. On fawiki, the AB test was performed between June 28 and July 20, 2021. This report details the analysis and results for the language switch AB test.

For the logged-out users, another report details the analysis and results for the deployment of language switch on logged-out users.

Purpose

The primary goal of the AB Test was to test the hypothesis that the group with the new language button and links will get more clicks as it is to discover.

As part of this analysis, we were also interested in identifying any interesting trends in user behavior.

Methodology

The AB test was run on a per wiki basis on logged-in users. Users included in the test were randomly assigned to either the control (old language switch) or treatment (new language switch) based on their user ID.

We compared the total numbers of clicks between control and treatment groups. There are three use scenarios: clicks on language links, clicks on input language links, and clicks on interface language links. We reviewed each use scenario separately.

We also reviewed the difference in percentage between the treatment group and the control group. Ran one-sample t-test to determine if the difference is statistically significant.

Language Link Clicks

New feature -- language link clicks after new button clicks

In [4]:
query_language_clicks_new <- 
"
-- sessions where new lang button was selected
WITH new_button AS (
SELECT
    MIN(TO_DATE(dt)) as button_date,
    event.web_session_id as session_id,
    event.context as open_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon = false
    AND event.action = 'compact-language-links-open'
    AND event.context = 'header'
    AND event.skinVersion = 'latest'
GROUP BY 
    event.web_session_id,
    event.context,
    event.isanon,
    wiki
),

lang_switches AS (
  SELECT
    TO_DATE(dt) as switch_date,
    event.web_session_id as session_id,
    event.context as switch_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon = false
    AND event.action = 'language-change'
    AND event.context='content-language-switcher'
    AND event.skinVersion = 'latest'
)

SELECT
    new_button.button_date,
    lang_switches.switch_date,
    new_button.session_id,
    new_button.wiki,
    new_button.open_context,
-- sessions with lang switch that occured after button clicks
    IF(lang_switches.session_id IS NOT NULL AND switch_date >= button_date, 1, 0) AS language_switch,
    lang_switches.switch_context
FROM new_button
LEFT JOIN lang_switches ON
    new_button.session_id = lang_switches.session_id AND
    new_button.wiki = lang_switches.wiki 
"
In [113]:
#new_language_link_clicks <-  wmfdata::query_hive(query_language_clicks_new)
In [7]:
#write_csv(new_language_link_clicks, file = 'Data_login/new_language_link_clicks.csv')
In [8]:
#new_language_link_clicks <- read_csv('Data_login/new_language_link_clicks.csv')
In [9]:
new_lang_link_clicks_by_date <- new_language_link_clicks  %>%
    filter(language_switch==1) %>%
    group_by( wiki, switch_date) %>%
    summarize(n_events_new_language_change = n(), .groups = 'drop')

new_lang_link_clicks_by_date$switch_date <- as.Date(new_lang_link_clicks_by_date$switch_date)

Old feature -- language link clicks on sidebar

In [11]:
query_old_language_link <- 
"
SELECT
    TO_DATE(dt) AS `date`,
    wiki,
    event.web_session_id,
    event.usereditbucket,
    event.timetochangelanguage,
    event.interfacelanguage,
    event.contentlanguage,
    event.selectedinterfacelanguage,
    Count(*) AS n_events
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND event.context = 'languages-list' and event.action = 'language-change' 
    AND event.skinVersion = 'latest' 
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon =false
GROUP BY
    TO_DATE(dt),
    wiki,
    event.web_session_id,
    event.usereditbucket,
    event.timetochangelanguage,
    event.interfacelanguage,
    event.contentlanguage,
    event.selectedinterfacelanguage
"
In [114]:
#old_lang_link_clicks <-  wmfdata::query_hive(query_old_language_link)
In [13]:
#write_csv(old_lang_link_clicks, file = 'Data_login/old_lang_link_clicks.csv')
In [14]:
#old_lang_link_clicks <- read_csv('Data_login/old_lang_link_clicks.csv')
In [15]:
old_lang_link_clicks_by_date <- old_lang_link_clicks %>%
    group_by( wiki, date) %>%
    summarize(n_events_language_link = sum(n_events),.groups='drop')
In [17]:
old_lang_link_clicks_by_date$date <- as.Date(old_lang_link_clicks_by_date$date)

Old feature -- language link clicks after N-more button clicks on sidebar

In [18]:
query_n_more_lang_switch <- 
"
-- sessions where N-more lang button was selected
WITH button AS (
SELECT
    MIN(TO_DATE(dt)) as button_date,
    event.web_session_id as session_id,
    event.context as open_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon =false
    AND event.action = 'compact-language-links-open'
    AND event.context = 'other'
    AND event.skinVersion = 'latest'
GROUP BY 
    event.web_session_id,
    event.context,
    wiki
),

lang_switches AS (
  SELECT
    TO_DATE(dt) as switch_date,
    event.web_session_id as session_id,
    event.isanon,
    event.context as switch_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon =false
    AND event.action = 'language-change'
    AND event.context='content-language-switcher'
    AND event.skinVersion = 'latest'
)

SELECT
    button.button_date,
    lang_switches.switch_date,
    button.session_id,
    button.wiki,
    button.open_context,
-- sessions with lang switch that occured after button clicks
    IF(lang_switches.session_id IS NOT NULL AND switch_date >= button_date , 1, 0) AS language_switch,
    lang_switches.switch_context
FROM button
LEFT JOIN lang_switches ON
    button.session_id = lang_switches.session_id AND
    button.wiki = lang_switches.wiki
"
In [115]:
#n_more_lang_clicks <-  wmfdata::query_hive(query_n_more_lang_switch)
In [20]:
#write_csv(n_more_lang_clicks, file = 'Data_login/n_more_switch_lang_clicks.csv')
In [21]:
#n_more_lang_clicks <- read_csv('Data_login/n_more_switch_lang_clicks.csv')
In [23]:
n_more_lang_clicks_by_date <- n_more_lang_clicks %>%
    filter(language_switch==1) %>%
    group_by( wiki, switch_date) %>%
    summarize(n_events_n_more_language_switch = sum(language_switch),.groups='drop')

n_more_lang_clicks_by_date$switch_date <- as.Date(n_more_lang_clicks_by_date$switch_date)
In [25]:
date_seq <- seq(as.Date('2021-06-22'), as.Date('2021-07-20'), by = 'days')
In [26]:
wiki_seq <- c('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
In [27]:
df_lang_clicks= expand.grid(date=date_seq, wiki=wiki_seq )
In [28]:
df_lang_clicks<-  merge(df_lang_clicks , new_lang_link_clicks_by_date, by.x=c("wiki", "date"), by.y=c("wiki","switch_date"), all.x = TRUE)
df_lang_clicks <-  merge(df_lang_clicks , old_lang_link_clicks_by_date, by.x=c("wiki", "date"), by.y=c("wiki","date"), all.x = TRUE)
df_lang_clicks <-  merge(df_lang_clicks , n_more_lang_clicks_by_date, by.x=c("wiki", "date"), by.y=c("wiki","switch_date"),  all.x = TRUE)
In [29]:
df_lang_clicks[is.na(df_lang_clicks)] <- 0
In [31]:
df_lang_clicks <- mutate(df_lang_clicks, n_events_control=n_events_language_link+n_events_n_more_language_switch)
In [32]:
df_lang_clicks <- df_lang_clicks %>%
mutate(
wiki_name = case_when( #clarfiy Wiki project names
        wiki == 'frwiktionary' ~ "French Wikitionary",
        wiki == 'hewiki' ~ 'Hebrew Wikipedia',
        wiki == 'ptwikiversity' ~ 'Portuguese Wikiversity',
        wiki == 'frwiki'~ "French Wikipedia",
        wiki == 'euwiki' ~ "Basque Wikipedia",
        wiki == 'fawiki' ~ 'Persian Wikipedia',
        wiki == 'ptwiki' ~ 'Portuguese Wikipedia',
        wiki == 'kowiki' ~ 'Korean Wikipedia',
        wiki == 'trwiki' ~ 'Turkish Wikipedia',
        wiki == 'srwiki' ~ 'Serbian Wikipedia',
        wiki == 'bnwiki' ~ 'Bengali Wikipedia',
        wiki == 'dewikivoyage' ~ 'German Wikivoyage',
        wiki == 'vecwiki' ~ 'Venetian Wikipedia'),
)
In [156]:
df_lang_clicks_g <- ggplot(data=df_lang_clicks, 
                                mapping=aes(x=date)) +
geom_line(aes(y=n_events_new_language_change , color="c2") , size = 1.5) +
geom_line(aes(y=n_events_control, color="c1") , size = 1.5) +
facet_wrap(~wiki_name,nrow=4,scale = 'free_y')+
scale_color_manual(values= c("c1"="#666666", "c2"="#000099"), name = "group", labels = c("Control", "Treatment"))  +
labs(title = 'Language Link Clicks - logged-in user',
     x = 'Date',
     y = 'Language Link Clicks',
    caption = "User type: logged-in User
Control: clicks on language links on both sidebar and N-more suggestion window
Treatment: clicks on language links on new language suggestion window"
    ) + 
theme_light(base_size=18) +
theme(legend.position = "bottom", 
     plot.caption = element_text(hjust = 0, face= "italic"),
     strip.text = element_text(size = 14, colour='black'),
     axis.text.x = element_text( size = 10 ),
     plot.title = element_text(size=20))
ggsave("Graphs_login/lang_link_clicks.png",
    plot = df_lang_clicks_g, width = 60, height = 30, units = "cm", dpi = "screen");
df_lang_clicks_g
In [35]:
# add column classifying events in AB testing
df_lang_clicks_AB <- df_lang_clicks %>%
    filter((date >= '2021-06-22' & date <= '2021-07-20' & wiki !='fawiki')| 
           date >= '2021-06-28' & date <= '2021-07-20' & wiki =='fawiki') %>%
    group_by(wiki_name) %>%
    summarize(control = sum(n_events_control), treatment= sum(n_events_new_language_change),  .groups = 'drop')
In [36]:
df_lang_clicks_AB
A tibble: 13 × 3
wiki_namecontroltreatment
<chr><dbl><dbl>
Basque Wikipedia 824 1354
Bengali Wikipedia 1216 713
French Wikipedia 5511926224
French Wikitionary 1011 711
German Wikivoyage 39 7
Hebrew Wikipedia 12675 5735
Korean Wikipedia 6401 3425
Persian Wikipedia 6099 3982
Portuguese Wikipedia 14698 8140
Portuguese Wikiversity 0 0
Serbian Wikipedia 3504 921
Turkish Wikipedia 5436 2964
Venetian Wikipedia 383 23
In [37]:
df_lang_clicks_AB_long <- pivot_longer(df_lang_clicks_AB, cols = c('control','treatment' ), names_to="group", values_to="lang_link_clicks")
In [177]:
lang_clicks_barchart <- df_lang_clicks_AB_long %>%
        ggplot(aes(x = group, y= lang_link_clicks, fill=group)) +
        geom_bar(stat="identity", position = 'dodge') +
        geom_text(aes(label = paste(lang_link_clicks)), color = "white", position = position_dodge(0.9), vjust = 1.5, size = 3) +
        facet_wrap(~wiki_name, scale = 'free_y') +
        labs (
              y = "Number of language link clicks",
             title = "Number of language link clicks in AB test",
             caption = "User type: logged-in User
    Control: clicks on language links on sidebar and N-more suggestion window
    Treatment: clicks on language links on new language suggestion window") +
    scale_fill_manual(values= c("control"="#666666", "treatment"="#000099"), name = "")  +
         theme(
            panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
            panel.background = element_blank(),
            plot.caption = element_text(hjust = 0, face= "italic"),
            strip.background =element_rect(fill="white"),
            plot.title = element_text(hjust = 0.5),
            text = element_text(size=16),
            axis.line = element_line(colour = "black"),
            legend.position = "bottom")
lang_clicks_barchart

Determine the percentage of changes

In [178]:
ggsave("Graphs_login/lang_link_clicks_bar.png",
    plot = lang_clicks_barchart, width = 60, height = 30, units = "cm", dpi = "screen");
In [40]:
df_lang_clicks_AB <- df_lang_clicks_AB %>%
  mutate(
        pct_change= case_when(
        control==0&treatment>0 ~ 100,
        control==0&treatment==0 ~ 0,
        TRUE ~ round((treatment-control)/control * 100 , 2)
        ),
        pct= case_when(
        control==0&treatment>0 ~ 1,
        control==0&treatment==0 ~ 0,
        TRUE ~ round((treatment-control)/control , 4)
        )
 )
In [41]:
df_lang_clicks_AB
A tibble: 13 × 5
wiki_namecontroltreatmentpct_changepct
<chr><dbl><dbl><dbl><dbl>
Basque Wikipedia 824 1354 64.32 0.6432
Bengali Wikipedia 1216 713-41.37-0.4137
French Wikipedia 5511926224-52.42-0.5242
French Wikitionary 1011 711-29.67-0.2967
German Wikivoyage 39 7-82.05-0.8205
Hebrew Wikipedia 12675 5735-54.75-0.5475
Korean Wikipedia 6401 3425-46.49-0.4649
Persian Wikipedia 6099 3982-34.71-0.3471
Portuguese Wikipedia 14698 8140-44.62-0.4462
Portuguese Wikiversity 0 0 0.00 0.0000
Serbian Wikipedia 3504 921-73.72-0.7372
Turkish Wikipedia 5436 2964-45.47-0.4547
Venetian Wikipedia 383 23-93.99-0.9399
In [42]:
pct_g <- df_lang_clicks_AB %>%
        ggplot() +
        geom_hline(yintercept = 0, linetype = "dashed") +
        geom_pointrange(
            aes(x = wiki_name, ymin = 0, ymax = pct_change, y = pct_change)
        ) +
        geom_text(
            aes(
                y = pct_change + ifelse(pct_change < 0, -4, 4),
                x = wiki_name, label = wiki_name,
                hjust = ifelse(pct_change < 0, "right", "left")
            ),
            size = 5
        ) +
        scale_x_discrete(breaks = NULL) +
        coord_flip(ylim = c(-300, 300)) +
         labs (x = "Wiki Project",
                y = "Percent Change",
             title = "Percent change in language link clicks in the AB test for the new language switching button") +
        theme_bw() +
        theme(
            panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
            panel.background = element_blank(),
            plot.title = element_text(hjust = 0.5),
            text = element_text(size=16)) 
      
pct_g
In [159]:
ggsave("Graphs_login/lang_link_clicks_pcg.png",
    plot = pct_g, width = 60, height = 30, units = "cm", dpi = "screen");

Average percent change

In [43]:
mean(filter(df_lang_clicks_AB, is.numeric(pct_change)&control>0)$pct_change)
-44.5783333333333

Modeling the impact

In [44]:
df_lang_clicks_AB
A tibble: 13 × 5
wiki_namecontroltreatmentpct_changepct
<chr><dbl><dbl><dbl><dbl>
Basque Wikipedia 824 1354 64.32 0.6432
Bengali Wikipedia 1216 713-41.37-0.4137
French Wikipedia 5511926224-52.42-0.5242
French Wikitionary 1011 711-29.67-0.2967
German Wikivoyage 39 7-82.05-0.8205
Hebrew Wikipedia 12675 5735-54.75-0.5475
Korean Wikipedia 6401 3425-46.49-0.4649
Persian Wikipedia 6099 3982-34.71-0.3471
Portuguese Wikipedia 14698 8140-44.62-0.4462
Portuguese Wikiversity 0 0 0.00 0.0000
Serbian Wikipedia 3504 921-73.72-0.7372
Turkish Wikipedia 5436 2964-45.47-0.4547
Venetian Wikipedia 383 23-93.99-0.9399

Paired T-test of clicks in two groups

We considered to use paired t-test to determine if there is statistical difference in the language link clicks between the two groups. However, we observed the data violated the prerequisite of normal distribution.

In [45]:
# Represent it
options(repr.plot.width = 10, repr.plot.height = 6)
p <- df_lang_clicks_AB_long %>%
  ggplot( aes(x=lang_link_clicks, fill=group)) +
    geom_histogram( color="#e9ecef", alpha=0.4, position = 'identity', bins=30) +
    scale_fill_manual(values=c("#69b3a2", "#404080")) +
    labs ( title = "Check distribution normality -- number of clicks",
         fill="") +
    theme_light(base_size=18)
p

One sample T-test of pecentage change in treatment group over control group

We also reviewed the change percentage of clicks in the treatment group over clicks in the control group. Ran one-sample t-test to determine if the change percentage is less than 0. In this model, each wiki represents an observational unit.

In [47]:
options(repr.plot.width = 10, repr.plot.height = 6)
p <- df_lang_clicks_AB %>%
    ggplot( aes(x=pct_change)) +
    geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity', binwidth=10) +
    labs ( title = "Check distribution normality -- change percentage") +
    theme_light(base_size=18)
p

Normality assumption holds.

We use one-tailed t-test to test the null hypothesis: the change percentage of clicks in the treatment group over clicks in the control group is zero.
Alternative hypothesis is: the change percentage of clicks in the treatment group over clicks in the control group is less than zero.

In [48]:
with(df_lang_clicks_AB, t.test(x=pct_change, y=NULL,mu=0, alternative = "less", paired = FALSE, 
       conf.level = 0.95 ))
	One Sample t-test

data:  pct_change
t = -3.7397, df = 12, p-value = 0.001412
alternative hypothesis: true mean is less than 0
95 percent confidence interval:
      -Inf -21.53813
sample estimates:
mean of x 
-41.14923 

Conclusion:

As the p-value < 0.05 and confidence interval is on the left side of 0, the alternative hypothesis holds. The change percentage, i.e. (treatment-control)*100/control, is significantly less than 0. We have sufficient statistical evidence to conclude that the clicks from the treatment group are less than the control group.

Summary:

There was an average 44.57% decrease1 in total clicks on language links by logged-in users on the early adopter wikis in treatment group compared with control group. The decrease is contradicted with our hypothesis. It might be due to the lags in new feature adoption. Further investigation may be needed to identify possible reasons for this decrease.


  1. Calculated by taking the average of the percent changes observed on each early adopter wiki.

Input Language Link Clicks

New feature -- input language link clicks after new button clicks

In [49]:
query_input_lang_clicks_new <- 
"
-- sessions where new lang button was selected
WITH new_button AS (
SELECT
    MIN(TO_DATE(dt)) as button_date,
    event.web_session_id as session_id,
    event.context as open_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon =false
    AND event.action = 'compact-language-links-open'
    AND event.context = 'header'
    AND event.skinVersion = 'latest'
GROUP BY 
    event.web_session_id,
    event.context,
    wiki
),

input_lang_switches AS (
  SELECT
    TO_DATE(dt) as switch_date,
    event.web_session_id as session_id,
    event.context as switch_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7,8) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only anonymous users
    AND event.isanon = false
    AND event.action='ime-change'
    AND event.skinVersion = 'latest'
)

SELECT
    new_button.button_date,
    input_lang_switches.switch_date,
    new_button.session_id,
    new_button.wiki,
    new_button.open_context,
-- sessions with lang switch that occured after button clicks
    IF(input_lang_switches.session_id IS NOT NULL AND switch_date >= button_date, 1, 0) AS language_switch,
    input_lang_switches.switch_context
FROM new_button
LEFT JOIN input_lang_switches ON
    new_button.session_id = input_lang_switches.session_id AND
    new_button.wiki = input_lang_switches.wiki
"
In [116]:
#new_input_lang_clicks <-  wmfdata::query_hive(query_input_lang_clicks_new)
In [52]:
new_input_lang_clicks_by_date <- new_input_lang_clicks %>%
    filter(language_switch==1) %>%
    group_by( wiki, switch_date) %>%
    summarize(new_input_language_clicks = sum(language_switch), .groups='drop')

new_input_lang_clicks$switch_date <- as.Date(new_input_lang_clicks$switch_date, format = "%Y-%m-%d")

Old feature -- input langauage link clicks after setting button clicks on sidebar

In [53]:
query_input_language_old <- 
"
-- sessions where setting button was selected
WITH setting_button AS (
SELECT
    MIN(TO_DATE(dt)) as button_date,
    event.web_session_id as session_id,
    event.context as open_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only logged-in users
    AND event.isanon =false
    AND event.action = 'settings-open'
    AND event.context = 'interlanguage'
    AND event.skinVersion = 'latest'
GROUP BY 
    event.web_session_id,
    event.context,
    wiki
),

input_lang_switches AS (
  SELECT
    TO_DATE(dt) as switch_date,
    event.web_session_id as session_id,
    event.context as switch_context,
    wiki as wiki
FROM event_sanitized.universallanguageselector
WHERE
    year = 2021 and month in (6,7) 
    AND (CONCAT(year,LPAD(month,2,'0'),LPAD(day,2,'0')) between '20210622' and '20210720')
    AND wiki in ('frwiktionary', 'hewiki', 'ptwikiversity', 'frwiki', 
    'euwiki', 'fawiki', 'ptwiki', 'kowiki', 'trwiki', 'srwiki', 'bnwiki', 'dewikivoyage', 'vecwiki')
--  useragent is sanitized 
--  AND useragent.is_bot = false
-- only anonymous users
    AND event.isanon = false
    AND event.action='ime-change' 
    AND event.skinVersion = 'latest'
)

SELECT
    setting_button.button_date,
    input_lang_switches.switch_date,
    setting_button.session_id,
    setting_button.wiki,
    setting_button.open_context,
-- sessions with lang switch that occured after button clicks
    IF(input_lang_switches.session_id IS NOT NULL AND switch_date >= button_date, 1, 0) AS language_switch,
    input_lang_switches.switch_context
FROM setting_button
LEFT JOIN input_lang_switches ON
    setting_button.session_id = input_lang_switches.session_id AND
    setting_button.wiki = input_lang_switches.wiki
"
In [117]:
#old_input_lang_clicks <-  wmfdata::query_hive(query_input_language_old)
In [55]:
old_input_lang_clicks_by_date <- old_input_lang_clicks %>%
    filter(language_switch==1) %>%
    group_by( wiki, switch_date) %>%
    summarize(old_input_language_clicks = sum(language_switch), .groups='drop')

old_input_lang_clicks_by_date$switch_date <- as.Date(old_input_lang_clicks_by_date$switch_date, format = "%Y-%m-%d")
In [56]:
df_input_clicks= expand.grid(date=date_seq, wiki=wiki_seq )

df_input_clicks <-  merge(df_input_clicks , new_input_lang_clicks_by_date, by.x=c("wiki", "date"), by.y=c("wiki","switch_date"), all.x = TRUE)

df_input_clicks <-  merge(df_input_clicks , old_input_lang_clicks_by_date, by.x=c("wiki", "date"), by.y=c("wiki","switch_date"), all.x = TRUE)

df_input_clicks[is.na(df_input_clicks)] <- 0
In [57]:
df_input_clicks <- df_input_clicks %>%
mutate(
wiki_name = case_when( #clarfiy Wiki project names
        wiki == 'frwiktionary' ~ "French Wikitionary",
        wiki == 'hewiki' ~ 'Hebrew Wikipedia',
        wiki == 'ptwikiversity' ~ 'Portuguese Wikiversity',
        wiki == 'frwiki'~ "French Wikipedia",
        wiki == 'euwiki' ~ "Basque Wikipedia",
        wiki == 'fawiki' ~ 'Persian Wikipedia',
        wiki == 'ptwiki' ~ 'Portuguese Wikipedia',
        wiki == 'kowiki' ~ 'Korean Wikipedia',
        wiki == 'trwiki' ~ 'Turkish Wikipedia',
        wiki == 'srwiki' ~ 'Serbian Wikipedia',
        wiki == 'bnwiki' ~ 'Bengali Wikipedia',
        wiki == 'dewikivoyage' ~ 'German Wikivoyage',
        wiki == 'vecwiki' ~ 'Venetian Wikipedia'),
)
In [59]:
options(repr.plot.width = 15, repr.plot.height = 10)
In [164]:
input_lang_click_g <- ggplot(data=filter( df_input_clicks,  (date >= '2021-06-22')&(date <= '2021-07-20')), 
                                mapping=aes(x=date)) +
geom_line(aes(y=new_input_language_clicks , color="c2") , size = 1.5) +
geom_line(aes(y=old_input_language_clicks, color="c1") , size = 1.5) +
facet_wrap(~wiki_name,nrow=3,scale = 'free_y')+
scale_color_manual(values= c("c1"="#666666", "c2"="#000099"), name = "group", labels = c("Control", "Treatment"))  +
labs(title = 'Input Language Link Clicks - logged-in user',
     x = 'Date',
     y = 'Input Link Clicks') + 
theme_light(base_size = 16) +
theme(legend.position = "bottom", 
     strip.text = element_text(size = 14, colour='black'),
     axis.text.x = element_text( size = 10 ),
     plot.title = element_text(size=20));
ggsave("Graphs_login/input_lang.png",
    plot = input_lang_click_g , width = 60, height = 30, units = "cm", dpi = "screen");
input_lang_click_g