Purpose: We would like to begin to develop an understanding of the usage/behavior of various elements on the desktop site as we start to think about improvements we can make to the desktop experience.
library(IRdisplay)
display_html(
'<script>
code_show=true;
function code_toggle() {
if (code_show){
$(\'div.input\').hide();
} else {
$(\'div.input\').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>'
)
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(magrittr); library(zeallot); library(glue); library(tidyverse); library(glue); library(lubridate)
library(scales)
})
Data questions:
Audiences: logged-in users, logged-out users
Why this is relevant: currently search is a relatively small element on the page, and is only accessible from the top of the page (i.e. you can't perform a search if you're scrolled down, reading an article). We're curious if people have difficulty finding search, or perhaps just don't think of searching the site because they don't notice it.
Notes: Data from searchsatisfaction eventlogging table and webrequest. Data is from August 2019 and for desktop users across all projects. Search events include both full text and autocomplete searches.
Notes: Unique users defined by client_ip and user agent.
# collect number of distinct users that used search
query <-
"SELECT
date,
COUNT(DISTINCT CONCAT(client_ip, user_agent, query)) AS n_search,
COUNT(DISTINCT CONCAT(client_ip, user_agent)) AS n_user,
logged_in
FROM (
SELECT
CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS date,
access_method,
client_ip,
user_agent,
IF(x_analytics_map['loggedIn'] = '1', true, false) as logged_in,
PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search') AS query,
PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken') AS searchToken
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE year = 2019 AND month = 08
AND webrequest_source = 'text'
AND access_method = 'desktop'
AND agent_type = 'user'
AND is_pageview
-- flag for pageviews that are search results pages
AND page_id IS NULL
AND (
uri_path = '/wiki/Special:Search'
OR (
uri_path = '/w/index.php'
AND (
LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'search')) > 0
OR LENGTH(PARSE_URL(CONCAT('http://', uri_host, uri_path, uri_query), 'QUERY', 'searchToken')) > 0
)
)
)
) AS serp
GROUP BY date, logged_in;"
search_events_byuser <- wmf::query_hive(query)
search_events_byuser$date <- as.Date(search_events_byuser$date, format = "%Y-%m-%d")
# Plot daily total search and sessions for August 2019
search_events_byuser_daily <- search_events_byuser %>%
gather(type, count, n_search:n_user) %>%
group_by(date, type) %>%
summarise(count = sum(count)) %>%
ggplot(aes(x=date, y= count, color= type)) +
geom_line() +
scale_x_date(name = "Date") +
scale_y_continuous(labels = polloi::compress, name = "Daily Count") +
labs(title = "Daily number of desktop searches") +
wmf::theme_min()
search_events_byuser_daily
#total number of logged in and logged out users from sample
search_events_byuser_total <- search_events_byuser %>%
group_by(logged_in) %>%
summarise(total_users = sum(n_user))
search_events_byuser_total
logged_in | total_users |
---|---|
<chr> | <int> |
false | 59139 |
true | 2458 |
# Number of overall users (apporximated by client ip and user client)
query <-
"SELECT COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user,
logged_in
FROM (
SELECT
client_ip,
user_agent,
IF(x_analytics_map['loggedIn'] = '1', true, false) as logged_in
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE year = 2019 and month = 8
AND agent_type = 'user'
AND access_method = 'desktop'
AND webrequest_source = 'text'
AND is_pageview
) as dataset
GROUP BY logged_in"
total_users_August <- wmf::query_hive(query)
About 1.4% of logged out desktop users and about 2.7% of logged in desktop users use search based on sample from August 2019.
# query for average search sessions.
query <- "
SELECT
CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) as date,
event.searchSessionId as session_id,
sum(if(event.action='searchResultPage',1,0)) as n_search
FROM event.SearchSatisfaction
WHERE year = 2019 and month = 08 and day > 05 -- when events were logged to new schema.
AND useragent.is_bot = FALSE
AND event.action = 'searchResultPage'
group by year, month, day, event.searchSessionId
"
search_events_bysession <- wmf::query_hive(query)
search_events_bysession$date <- as.Date(search_events_bysession$date, format = "%Y-%m-%d")
# Find average and remove sessions with over 40 events
search_events_bysession_average <- search_events_bysession %>%
filter(n_search < 40) %>%
group_by(date) %>%
summarise(average_search_bysession = mean(n_search))
head(search_events_bysession_average)
date | average_search_bysession |
---|---|
<date> | <dbl> |
2019-08-06 | 6.789426 |
2019-08-07 | 6.801644 |
2019-08-08 | 6.779381 |
2019-08-09 | 6.781527 |
2019-08-10 | 6.968016 |
2019-08-11 | 6.958233 |
There are an average of 6.8 searches per search session on desktop looking at all wikipedias. Note. I filtered out sessions with over 50 searches as these are likely bots.
# Plot averaege daily sessions by search since 2019
search_events_bysession_daily_plot <- search_events_bysession_average %>%
ggplot(aes(x=date, y= average_search_bysession)) +
geom_line() +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Average number of searches") +
labs(title = "Average daily number of desktop searches by session") +
wmf::theme_min()
search_events_bysession_daily_plot
Data questions:
Audiences: logged-in users, logged-out users
Why this is relevant: we're curious about a more modular design, wherein the main menu is collapsible. It could even act different for different audiences. Knowing how often it gets used, and by whom, will inform our thinking.
Data Notes:
TODO: Look into possible ways to track clicks to store and dontate links
#overall pageviews for both logged in and logged out users
query <-
"SELECT
SUM(IF((pageview_info['page_title'] = 'Main_Page'), 1, null)) AS main_page,
SUM(IF((pageview_info['page_title'] = 'Portal:Contents'), 1, null)) AS contents,
SUM(IF((pageview_info['page_title'] = 'Portal:Featured_content'), 1, null)) AS featured_content,
SUM(IF((pageview_info['page_title'] = 'Portal:Current_events'), 1, null)) AS current_events,
SUM(IF((pageview_info['page_title'] = 'Special:Random'), 1, null)) AS random_article,
SUM(IF((pageview_info['page_title'] = 'Help:Contents'), 1, null)) AS help_views,
SUM(IF((pageview_info['page_title'] = 'Wikipedia:About'), 1, null)) AS about_wikipedia,
SUM(IF((pageview_info['page_title'] = 'Wikipedia:Community_portal'), 1, null)) AS community_portal,
SUM(IF((pageview_info['page_title'] = 'Special:RecentChanges'), 1, null)) AS recent_changes,
SUM(IF((pageview_info['page_title']= 'Wikipedia:Contact_us'), 1, null)) AS contact_page,
SUM(IF((pageview_info['page_title'] LIKE 'Special:WhatLinksHere%'), 1, null)) AS what_links_here,
SUM(IF((pageview_info['page_title'] LIKE 'Special:RecentChangesLinked%'), 1, null)) AS related_changes,
SUM(IF((pageview_info['page_title'] = 'Wikipedia:File_Upload_Wizard'), 1, null)) AS file_upload,
SUM(IF((pageview_info['page_title'] = 'Special:SpecialPages'), 1, null)) AS special_pages,
SUM(IF((uri_query LIKE '%&action=info%'), 1, null)) AS page_info,
SUM(IF((pageview_info['page_title'] = 'Special:CiteThisPage'), 1, null)) AS cite_this_page,
SUM(IF((pageview_info['page_title'] = 'Special:Book'), 1, null)) AS create_book,
SUM(IF((pageview_info['page_title'] = 'Special:ElectronPdf'), 1, null)) AS download_as_pdf
FROM wmf.webrequest
WHERE year = 2019 and month = 7
AND agent_type = 'user'
AND pageview_info['project'] = 'en.wikipedia'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND is_pageview"
sidebar_views_overall_raw <- read.csv(file="Data/sidebar_views_overall.tsv", header=TRUE, sep="\t")
head(sidebar_views_overall_raw)
main_page | contents | featured_content | current_events | random_article | help_views | about_wikipedia | community_portal | recent_changes | contact_page | what_links_here | related_changes | file_upload | special_pages | page_info | cite_this_page | create_book | download_as_pdf |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <fct> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> |
11068094 | 424594 | 124742 | 741406 | NULL | 91465 | 125960 | 237826 | 476123 | 226885 | 843500 | 579563 | 95757 | 45432 | 123188 | 242384 | 258355 | 685032 |
#proportion of sidebar views
sidebar_views_overall <- sidebar_views_overall_raw %>%
gather(sidebar_link, n_views) %>%
mutate(proportion = (n_views / sum(n_views))) %>%
arrange(desc(n_views))
sidebar_views_overall
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <int> | <dbl> |
main_page | 11068094 | 0.675282939 |
what_links_here | 843500 | 0.051463347 |
current_events | 741406 | 0.045234421 |
download_as_pdf | 685032 | 0.041794949 |
related_changes | 579563 | 0.035360109 |
recent_changes | 476123 | 0.029049061 |
contents | 424594 | 0.025905191 |
create_book | 258355 | 0.015762671 |
cite_this_page | 242384 | 0.014788253 |
community_portal | 237826 | 0.014510162 |
contact_page | 226885 | 0.013842634 |
about_wikipedia | 125960 | 0.007685030 |
featured_content | 124742 | 0.007610718 |
page_info | 123188 | 0.007515906 |
file_upload | 95757 | 0.005842295 |
help_views | 91465 | 0.005580433 |
special_pages | 45432 | 0.002771882 |
random_article | 0 | 0.000000000 |
#plot bar chart showing usage
p <- ggplot(sidebar_views_overall, aes(x= sidebar_link, y= proportion, fill = sidebar_link)) %>%
+ geom_col(position = 'dodge') +
scale_y_continuous("Proportion of sidebar views", labels = scales::percent) +
labs(title = "Desktop views to links in sidebar on English Wikipedia") +
ggthemes::theme_tufte(base_size = 10, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid = element_line("gray70"),
legend.position= "none")
ggsave("Figures/proportion_sidebar_views.png", p, width = 18, height = 9, units = "in", dpi = 150)
p
The main page is by far the most viewed sidebar linked page on desktop (about 68% of all users views to a sidebar linked page are to the main page) followed by the what_links_here, current_events pages and download as pdf. This is for both logged in and logged out users.
Note: Random Article is recording very few events. This may be due to a bug in how it's recorded. I'll need to investigate further
#estimate of all unique users on desktop in Jul 2019 (both logged in and logged out)
query <-
"SELECT COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user,
logged_in
FROM (
SELECT
client_ip,
user_agent,
IF(x_analytics_map['loggedIn'] = '1', true, false) as logged_in
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND is_pageview
AND webrequest_source = 'text'
AND pageview_info['project'] = 'en.wikipedia'
) as dataset
GROUP BY logged_in"
unique_users_count <- read.csv(file="Data/unique_users_count.tsv", header=TRUE, sep="\t")
## users who view sidebar linked page broken down by logged in status
query <-
"SELECT COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user_sidebar,
logged_in
FROM (
SELECT
client_ip,
user_agent,
IF(x_analytics_map['loggedIn'] = '1', true, false) as logged_in
FROM wmf.webrequest
WHERE year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND is_pageview
AND pageview_info['project'] = 'en.wikipedia'
AND (pageview_info['page_title'] IN ('Main_Page', 'Portal:Contents', 'Portal:Featured_content', 'Portal:Current_events',
'Special:Random', 'Help:Contents', 'Wikipedia:About', 'Wikipedia:Community_portal', 'Special:RecentChanges', 'Wikipedia:Contact_us',
'Wikipedia:File_Upload_Wizard', 'Special:SpecialPages', 'Special:CiteThisPage', 'Special:Book', 'Special:ElectronPdf') OR
pageview_info['page_title'] LIKE '%Special:WhatLinksHere%' OR
pageview_info['page_title'] LIKE '%Special:RecentChangesLinked%' OR
uri_query LIKE '%&action=info%')
) as dataset
GROUP BY logged_in"
sidebar_views_user_overall_raw <- read.csv(file="Data/sidebar_views_user_overall.tsv", header=TRUE, sep="\t")
sidebar_views_user_overall <- merge(sidebar_views_user_overall_raw, unique_users_count, by = "logged_in", all = TRUE)
sidebar_views_byusertype <- sidebar_views_user_overall %>%
mutate(user_pct = (n_user_sidebar/n_user_all) * 100)
sidebar_views_byusertype
logged_in | n_user_sidebar | n_user_all | user_pct |
---|---|---|---|
<fct> | <int> | <int> | <dbl> |
NA | NA | NA | |
false | 11732 | 2038498 | 0.5755218 |
true | 731 | 44345 | 1.6484384 |
Looking at sample data from July 2019, about 0.5% of logged out users and 1.6% of logged in desktop users clicked on one of the pages located in the sidebar.
# Obtain user clicks to each sidebar linked page
#ran in terminal due to issues running over notebook.
query <-
"
SELECT
COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user,
logged_in,
sidebar_name
FROM (
SELECT
client_ip,
user_agent,
IF(x_analytics_map['loggedIn'] = '1', true, false) as logged_in,
(CASE
WHEN uri_query LIKE '%&action=info%' THEN 'PageInformation'
WHEN pageview_info['page_title'] LIKE '%Special:WhatLinksHere%' THEN 'WhatLinksHere'
WHEN pageview_info['page_title'] LIKE '%Special:RecentChangesLinked%' THEN 'RelatedChanges'
ELSE pageview_info['page_title']
END) as sidebar_name
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE
year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND is_pageview
AND webrequest_source = 'text'
AND pageview_info['project'] = 'en.wikipedia'
AND (pageview_info['page_title'] IN ('Main_Page', 'Portal:Contents', 'Portal:Featured_content', 'Portal:Current_events',
'Special:Random', 'Help:Contents', 'Wikipedia:About', 'Wikipedia:Community_portal', 'Special:RecentChanges', 'Wikipedia:Contact_us',
'Wikipedia:File_Upload_Wizard', 'Special:SpecialPages', 'Special:CiteThisPage', 'Special:Book', 'Special:ElectronPdf') OR
pageview_info['page_title'] LIKE '%Special:WhatLinksHere%' OR
pageview_info['page_title'] LIKE '%Special:RecentChangesLinked%' OR
pageview_info['page_title'] LIKE '%Special:RecentChangesLinked%' OR
uri_query LIKE '%&action=info%')
) AS page_info
Group BY sidebar_name, logged_in
"
sidebar_views_user_byfeature_raw <- read.csv(file="Data/sidebar_views_users_byfeature.tsv", header=TRUE, sep= "\t")
#Table showing percent of all users in each group (logged in or logged out)
#that view each page linked in the sidebar
sidebar_views_user_byfeature <- sidebar_views_user_byfeature_raw %>%
mutate(percent_all_users = if_else(logged_in == 'false', as.numeric(n_user_sidebar)/2038498 *100, as.numeric(n_user_sidebar)/44345 * 100)) %>%
arrange(desc(percent_all_users))
sidebar_views_user_byfeature
n_user_sidebar | logged_in | sidebar_name | percent_all_users |
---|---|---|---|
<int> | <fct> | <fct> | <dbl> |
442 | true | Main_Page | 0.996730184 |
9312 | false | Main_Page | 0.456806924 |
81 | true | WhatLinksHere | 0.182658699 |
51 | true | Portal:Current_events | 0.115007329 |
44 | true | Special:RecentChanges | 0.099222009 |
29 | true | Wikipedia:File_Upload_Wizard | 0.065396324 |
17 | true | PageInformation | 0.038335776 |
16 | true | Special:ElectronPdf | 0.036080731 |
12 | true | RelatedChanges | 0.027060548 |
12 | true | Portal:Contents | 0.027060548 |
541 | false | Special:ElectronPdf | 0.026539148 |
527 | false | Portal:Current_events | 0.025852368 |
11 | true | Wikipedia:Community_portal | 0.024805502 |
314 | false | WhatLinksHere | 0.015403498 |
6 | true | Special:Book | 0.013530274 |
238 | false | Special:RecentChanges | 0.011675263 |
234 | false | RelatedChanges | 0.011479040 |
5 | true | Special:SpecialPages | 0.011275228 |
159 | false | Portal:Contents | 0.007799860 |
3 | true | Special:CiteThisPage | 0.006765137 |
3 | true | Wikipedia:About | 0.006765137 |
131 | false | Special:CiteThisPage | 0.006426300 |
122 | false | Special:Book | 0.005984799 |
96 | false | Wikipedia:Contact_us | 0.004709350 |
2 | true | Portal:Featured_content | 0.004510091 |
2 | true | Help:Contents | 0.004510091 |
91 | false | Portal:Featured_content | 0.004464071 |
87 | false | PageInformation | 0.004267848 |
86 | false | Wikipedia:About | 0.004218792 |
82 | false | Wikipedia:Community_portal | 0.004022570 |
56 | false | Wikipedia:File_Upload_Wizard | 0.002747121 |
49 | false | Help:Contents | 0.002403731 |
1 | true | Wikipedia:Contact_us | 0.002255046 |
29 | false | Special:SpecialPages | 0.001422616 |
Do the the language links (as a group) get clicked more than the other links in the sidebar?
The data presented below shows the percent of all sidebar link clicks on desktop by users for each sidebar link in June 2019.
Notes:
#Percent language links clicks out of all internal referred clicks
#By project looking by the source of request (not target)
query <-
"
SELECT
navigation_count.project as project,
navigation_count_total as languages,
view_count_total,
(navigation_count_total/view_count_total) *100 as clicks_as_percent_of_total
FROM (
SELECT
concat(previous_project, '.', project_family) as project,
sum(navigation_count) as navigation_count_total
from wmf.interlanguage_navigation
where date > '2019-05-31'
AND date < '2019-07-01'
AND previous_project IN ('en', 'fr', 'de', 'it', 'scn', 'cy', 'ps')
group by project_family, previous_project
) as navigation_count
INNER JOIN
(SELECT project,
sum(view_count) as view_count_total
FROM wmf.projectview_hourly
WHERE year = 2019
and month = 06
and project IN ('en.wikipedia', 'fr.wikipedia', 'de.wikipedia', 'it.wikipedia', 'scn.wikipedia', 'cy.wikipedia', 'ps.wikipedia')
and agent_type = 'user'
and access_method = 'desktop'
and referer_class = 'internal'
group by project
) as projectview
on navigation_count.project = projectview.project"
language_link_views <- wmf::query_hive(query)
language_link_views
project | languages | view_count_total | clicks_as_percent_of_total |
---|---|---|---|
<chr> | <int> | <int> | <dbl> |
cy.wikipedia | 40011 | 157335 | 25.4304509 |
de.wikipedia | 1528152 | 156633851 | 0.9756205 |
en.wikipedia | 5467305 | 1088768166 | 0.5021551 |
fr.wikipedia | 784399 | 82525574 | 0.9504920 |
it.wikipedia | 428082 | 48007596 | 0.8916964 |
ps.wikipedia | 10317 | 47712 | 21.6234909 |
scn.wikipedia | 25627 | 72999 | 35.1059604 |
#non-language sidebar views for enwiki
query <-
"SELECT
SUM(IF((page_title = 'Main_Page'), 1, null)) AS main_page,
SUM(IF((page_title = 'Portal:Contents'), 1, null)) AS contents,
SUM(IF((page_title = 'Portal:Featured_content'), 1, null)) AS featured_content,
SUM(IF((page_title = 'Portal:Current_events'), 1, null)) AS current_events,
SUM(IF((page_title = 'Help:Contents'), 1, null)) AS help_views,
SUM(IF((page_title = 'Wikipedia:About'), 1, null)) AS about_wikipedia,
SUM(IF((page_title = 'Wikipedia:Community_portal'), 1, null)) AS community_portal,
SUM(IF((page_title = 'Special:RecentChanges'), 1, null)) AS recent_changes,
SUM(IF((page_title= 'Wikipedia:Contact_us'), 1, null)) AS contact_page,
SUM(IF((page_title LIKE 'Special:WhatLinksHere%'), 1, null)) AS what_links_here,
SUM(IF((page_title LIKE 'Special:RecentChangesLinked%'), 1, null)) AS related_changes,
SUM(IF((page_title = 'Wikipedia:File_Upload_Wizard'), 1, null)) AS file_upload,
SUM(IF((page_title = 'Special:SpecialPages'), 1, null)) AS special_pages,
SUM(IF((page_title = 'Special:CiteThisPage'), 1, null)) AS cite_this_page,
SUM(IF((page_title = 'Special:Book'), 1, null)) AS create_book,
SUM(IF((page_title = 'Special:ElectronPdf'), 1, null)) AS download_as_pdf,
sum(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'en.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia "
enwiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
enwiki_sidebar_view_wlang <- enwiki_sidebar_views%>%
mutate(languages = 5467305) %>% #add language clicks
gather(sidebar_link, n_views) %>% #remove columns not needed for calculation
filter(sidebar_link != "view_count_total") %>%
mutate(percent_views = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(percent_views))
enwiki_sidebar_view_wlang
sidebar_link | n_views | percent_views |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 5467305 | 39.0831352 |
main_page | 5199829 | 37.1710779 |
what_links_here | 1106416 | 7.9092361 |
related_changes | 770540 | 5.5082200 |
current_events | 440947 | 3.1521181 |
download_as_pdf | 271868 | 1.9434536 |
contents | 114028 | 0.8151314 |
recent_changes | 94601 | 0.6762571 |
featured_content | 76561 | 0.5472978 |
about_wikipedia | 68634 | 0.4906315 |
contact_page | 67793 | 0.4846196 |
cite_this_page | 66971 | 0.4787435 |
create_book | 60714 | 0.4340152 |
community_portal | 54568 | 0.3900804 |
file_upload | 47320 | 0.3382679 |
special_pages | 41861 | 0.2992442 |
help_views | 38955 | 0.2784706 |
#sidebar views for fr.wikipedia (excluding language lnks)
query <-
"SELECT
SUM(IF((page_title = 'Wikipédia:Accueil_principal'), 1, null)) AS main_page,
SUM(IF((page_title = 'Portail:Accueil'), 1, null)) AS thematic_portals,
SUM(IF((page_title= 'Wikipédia:Contact'), 1, null)) AS contact_page,
SUM(IF((page_title = 'Aide:Débuter'), 1, null)) AS getting_started,
SUM(IF((page_title = 'Aide:Accueil'), 1, null)) AS help,
SUM(IF((page_title = 'Wikipédia:Accueil_de_la_communauté'), 1, null)) AS community_home,
SUM(IF((page_title = 'Spécial:Modifications_récentes'), 1, null)) AS recent_changes,
SUM(IF((page_title LIKE 'Spécial:Pages_liées%'), 1, null)) AS linked_pages,
SUM(IF((page_title LIKE 'Spécial:Suivi_des_liens%'), 1, null)) AS tracking_related_changes,
SUM(IF((page_title = 'Aide:Importer_un_fichier'), 1, null)) AS file_upload,
SUM(IF((page_title = 'Spécial:Pages_spéciales'), 1, null)) AS special_pages,
SUM(IF((page_title = 'Spécial:Livre'), 1, null)) AS create_book,
SUM(IF((page_title = 'Spécial:ElectronPdf'), 1, null)) AS create_pdf,
SUM(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'fr.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal'
"
frwiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
frwiki_sidebar_view_wlang <- frwiki_sidebar_views%>%
mutate(languages = 784399) %>% #add language clicks
gather(sidebar_link, n_views) %>%
filter(sidebar_link != "view_count_total") %>%
mutate(proportion = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(proportion))
frwiki_sidebar_view_wlang
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 784399 | 52.9211943 |
main_page | 422536 | 28.5073155 |
linked_pages | 66163 | 4.4638315 |
create_pdf | 56973 | 3.8438081 |
tracking_related_changes | 38309 | 2.5846005 |
thematic_portals | 24560 | 1.6569941 |
recent_changes | 23431 | 1.5808237 |
community_home | 15461 | 1.0431102 |
contact_page | 14086 | 0.9503428 |
getting_started | 12764 | 0.8611512 |
file_upload | 7188 | 0.4849541 |
create_book | 6002 | 0.4049381 |
help | 5272 | 0.3556870 |
special_pages | 5058 | 0.3412490 |
#sidebar views for de.wikipedia (excluding language lnks)
query <-
"SELECT
SUM(IF((page_title = 'Wikipedia:Hauptseite'), 1, null)) AS main_page,
SUM(IF((page_title = 'Portal:Wikipedia_nach_Themen'), 1, null)) AS theme_portals,
SUM(IF((page_title= 'Wikipedia:Beteiligen'), 1, null)) AS improve_articles,
SUM(IF((page_title= 'Hilfe:Neuen_Artikel_anlegen'), 1, null)) AS create_articles,
SUM(IF((page_title= 'Wikipedia:Autorenportal'), 1, null)) AS author_portal,
SUM(IF((page_title= 'Hilfe:Übersicht'), 1, null)) AS help,
SUM(IF((page_title LIKE 'Spezial:Letzte_Änderungen%'), 1, null)) AS last_changes,
SUM(IF((page_title= 'Wikipedia:Kontakt'), 1, null)) AS contact_page,
SUM(IF((page_title LIKE 'Spezial:Linkliste%'), 1, null)) AS linked_pages,
SUM(IF((page_title LIKE 'Spezial:Änderungen_an_verlinkten_Seiten%'), 1, null)) AS linked_page_changes,
SUM(IF((page_title = 'Spezial:Hochladen'), 1, null)) AS upload_file,
SUM(IF((page_title = 'Spezial:Spezialseiten'), 1, null)) AS special_pages,
SUM(IF((page_title = 'Spezial:Buch'), 1, null)) AS create_book,
SUM(IF((page_title = 'Spezial:ElectronPdf'), 1, null)) AS create_pdf,
SUM(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'de.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal'
"
dewiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
dewiki_sidebar_view_wlang <- dewiki_sidebar_views%>%
mutate(languages = 1528152) %>% #add language clicks
gather(sidebar_link, n_views) %>%
filter(sidebar_link != "view_count_total") %>%
mutate(proportion = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(proportion))
dewiki_sidebar_view_wlang
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 1528152 | 49.32816429 |
main_page | 1146075 | 36.99486431 |
create_pdf | 96030 | 3.09981181 |
linked_pages | 86723 | 2.79938540 |
last_changes | 50767 | 1.63873942 |
author_portal | 35203 | 1.13633943 |
linked_page_changes | 33391 | 1.07784876 |
theme_portals | 32271 | 1.04169558 |
contact_page | 19754 | 0.63765159 |
create_articles | 17749 | 0.57293096 |
improve_articles | 14328 | 0.46250238 |
special_pages | 12992 | 0.41937681 |
create_book | 11474 | 0.37037635 |
help | 11398 | 0.36792310 |
upload_file | 1623 | 0.05238982 |
#sidebar views for it.wikipedia (excluding language lnks)
query <-
"SELECT
SUM(IF((page_title = 'Pagina_principale'), 1, null)) AS main_page,
SUM(IF((page_title = 'Speciale:UltimeModifiche'), 1, null)) AS recent_changes,
SUM(IF((page_title = 'Speciale:NelleVicinanze'), 1, null)) AS nearby,
SUM(IF((page_title = 'Wikipedia:Vetrina'), 1, null)) AS showcase,
SUM(IF((page_title = 'Aiuto:Aiuto'), 1, null)) AS help,
SUM(IF((page_title = 'Aiuto:Sportello_informazioni'), 1, null)) AS information_desk,
SUM(IF((page_title = 'Portale:Comunità'), 1, null)) AS community_portal,
SUM(IF((page_title = 'Wikipedia:Bar'), 1, null)) AS wikipedia_cafe,
SUM(IF((page_title = 'Wikipedia:Wikipediano'), 1, null)) AS wikipedian_page,
SUM(IF((page_title = 'Wikipedia:Contatti'), 1, null)) AS contact_page,
SUM(IF((page_title LIKE 'Speciale:PuntanoQui%'), 1, null)) AS linked_pages,
SUM(IF((page_title LIKE 'Speciale:ModificheCorrelate%'), 1, null)) AS related_changes,
SUM(IF((page_title = 'Speciale:PagineSpeciali'), 1, null)) AS special_pages,
SUM(IF((page_title = 'Speciale:Libro'), 1, null)) AS create_book,
SUM(IF((page_title = 'Speciale:ElectronPdf'), 1, null)) AS create_pdf,
SUM(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'it.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal'
"
itwiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
itwiki_sidebar_view_wlang <- itwiki_sidebar_views%>%
mutate(languages = 428082) %>% #add language clicks
gather(sidebar_link, n_views) %>%
filter(sidebar_link != "view_count_total") %>%
mutate(proportion = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(proportion))
itwiki_sidebar_view_wlang
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 428082 | 50.8350552 |
main_page | 240922 | 28.6096663 |
linked_pages | 33965 | 4.0333690 |
recent_changes | 28815 | 3.4218026 |
create_pdf | 24586 | 2.9196057 |
related_changes | 12444 | 1.4777342 |
wikipedia_cafe | 11336 | 1.3461584 |
information_desk | 9167 | 1.0885881 |
wikipedian_page | 8593 | 1.0204251 |
showcase | 8442 | 1.0024938 |
nearby | 7289 | 0.8655742 |
contact_page | 6971 | 0.8278114 |
help | 6529 | 0.7753236 |
create_book | 5652 | 0.6711792 |
special_pages | 5543 | 0.6582354 |
community_portal | 3764 | 0.4469778 |
#sidebar views for cy.wikipedia (Welsh) (excluding language lnks)
query <-
"SELECT
SUM(IF((page_title = 'Hafan'), 1, null)) AS main_page,
SUM(IF((page_title = 'Wicipedia:Porth_y_Gymuned'), 1, null)) AS community_gateway,
SUM(IF((page_title = 'Wicipedia:Y_Caffi'), 1, null)) AS wikipedia_cafe,
SUM(IF((page_title = 'Categori:Materion_cyfoes'), 1, null)) AS current_affairs,
SUM(IF((page_title = 'Arbennig:RecentChanges'), 1, null)) AS recent_changes,
SUM(IF((page_title = 'Wicipedia:Cymorth'), 1, null)) AS help_page,
SUM(IF((page_title = 'Arbennig:Book'), 1, null)) AS make_a_book,
SUM(IF((page_title = 'Arbennig:ElectronPdf'), 1, null)) AS download_pdf,
SUM(IF((page_title LIKE 'Arbennig:WhatLinksHere%'), 1, null)) AS what_links_here,
SUM(IF((page_title LIKE 'Arbennig:RecentChangesLinked%'), 1, null)) AS related_changes,
SUM(IF((page_title LIKE 'Arbennig:SpecialPages%'), 1, null)) AS special_pages,
SUM(IF((page_title = 'Arbennig:Book'), 1, null)) AS create_book,
SUM(IF((page_title = 'Arbennig:ElectronPdf'), 1, null)) AS create_pdf,
SUM(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'cy.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal'
"
cywiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
cywiki_sidebar_view_wlang <- cywiki_sidebar_views%>%
mutate(languages = 40011) %>% #add language clicks
gather(sidebar_link, n_views) %>%
filter(sidebar_link != "view_count_total") %>%
mutate(proportion = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(proportion))
cywiki_sidebar_view_wlang
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 40011 | 82.0890010 |
main_page | 3196 | 6.5571080 |
recent_changes | 1018 | 2.0885907 |
help_page | 779 | 1.5982438 |
what_links_here | 776 | 1.5920888 |
community_gateway | 631 | 1.2945980 |
wikipedia_cafe | 446 | 0.9150407 |
download_pdf | 343 | 0.7037197 |
create_pdf | 343 | 0.7037197 |
make_a_book | 306 | 0.6278082 |
create_book | 306 | 0.6278082 |
related_changes | 213 | 0.4370038 |
current_affairs | 193 | 0.3959705 |
special_pages | 180 | 0.3692989 |
## sidebar views for scn.wikipedia (Sicilian) (excluding language lnks)
query <-
"SELECT
SUM(IF((page_title = 'Pàggina_principali'), 1, null)) AS main_page,
SUM(IF((page_title = 'Wikipedia:Porta_dâ_Cumunitati'), 1, null)) AS community_portal,
SUM(IF((page_title = 'Archiviu:Nutizzi'), 1, null)) AS news,
SUM(IF((page_title LIKE 'Spiciali:UltimeModifiche%'), 1, null)) AS recent_changes,
SUM(IF((page_title = 'Wikipedia:Circulu'), 1, null)) AS the_circle,
SUM(IF((page_title = 'Wikipedia:Articulu_n_vitrina'), 1, null)) AS article_of_week,
SUM(IF((page_title = 'Wikipedia:Chat_IRC_n_sicilianu'), 1, null)) AS IRC_chat,
SUM(IF((page_title = 'Wikipedia:Grammàtica'), 1, null)) AS grammar,
SUM(IF((page_title = 'Wikipedia:Cumpenniu_Stilìsticu'), 1, null)) AS style_guide,
SUM(IF((page_title LIKE 'Spiciali:Libro%'), 1, null)) AS create_book,
SUM(IF((page_title LIKE 'Spiciali:ElectronPdf%'), 1, null)) AS create_pdf,
SUM(IF((page_title LIKE 'Spiciali:PuntanoQui%'), 1, null)) AS linked_pages,
SUM(IF((page_title LIKE 'Spiciali:ModificheCorrelate%'), 1, null)) AS related_changes,
SUM(IF((page_title = 'Spiciali:PagineSpeciali'), 1, null)) AS special_pages,
SUM(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'scn.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal'
"
scnwiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
scnwiki_sidebar_view_wlang <- scnwiki_sidebar_views%>%
mutate(languages = 25627) %>% #add language clicks
gather(sidebar_link, n_views) %>%
filter(sidebar_link != "view_count_total") %>%
mutate(proportion = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(proportion))
scnwiki_sidebar_view_wlang
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 25627 | 84.22176942 |
main_page | 2402 | 7.89404496 |
community_portal | 508 | 1.66951492 |
news | 408 | 1.34087025 |
linked_pages | 314 | 1.03194426 |
the_circle | 276 | 0.90705929 |
create_pdf | 152 | 0.49953990 |
article_of_week | 137 | 0.45024320 |
create_book | 136 | 0.44695675 |
recent_changes | 131 | 0.43052452 |
irc_chat | 118 | 0.38780071 |
grammar | 110 | 0.36150914 |
special_pages | 70 | 0.23005127 |
style_guide | 25 | 0.08216117 |
related_changes | 14 | 0.04601025 |
## sidebar views for ps.wikipedia Pashto (excluding language lnks)
query <-
"SELECT
SUM(IF((page_title = 'لومړی_مخ'), 1, null)) AS main_page,
SUM(IF((page_title = 'ويکيپېډيا:د_ټولنې_تانبه'), 1, null)) AS community_portal,
SUM(IF((page_title = 'ويکيپېډيا:تازه_پېښې'), 1, null)) AS current_events,
SUM(IF((page_title = 'ځانگړی:اوسني_بدلونونه'), 1, null)) AS recent_changes,
SUM(IF((page_title = 'ځانگړی:Book'), 1, null)) AS create_a_book,
SUM(IF((page_title = 'ځانگړی:ElectronPdf'), 1, null)) AS download_as_pdf,
SUM(IF((page_title LIKE 'ځانگړی:WhatLinksHere%'), 1, null)) AS what_links_here,
SUM(IF((page_title LIKE 'ځانگړی:RecentChangesLinked%'), 1, null)) AS related_changes,
SUM(IF((page_title = 'ځانگړی:ځانګړي_مخونه'),1,null)) AS special_pages,
SUM(view_count) as view_count_total
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 6
AND project = 'ps.wikipedia'
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal'
"
pswiki_sidebar_views <- wmf::query_hive(query)
#proportion of sidebar views with language clicks added
pswiki_sidebar_view_wlang <- pswiki_sidebar_views%>%
mutate(languages = 10317) %>% #add language clicks
gather(sidebar_link, n_views) %>%
filter(sidebar_link != "view_count_total") %>%
mutate(proportion = (as.numeric(n_views) / sum(as.numeric(n_views)))*100) %>%
arrange(desc(proportion))
pswiki_sidebar_view_wlang
sidebar_link | n_views | proportion |
---|---|---|
<chr> | <dbl> | <dbl> |
languages | 10317 | 76.95807847 |
main_page | 1815 | 13.53871401 |
community_portal | 561 | 4.18469342 |
download_as_pdf | 177 | 1.32030434 |
what_links_here | 152 | 1.13382068 |
create_a_book | 127 | 0.94733701 |
recent_changes | 111 | 0.82798747 |
current_events | 77 | 0.57436969 |
special_pages | 58 | 0.43264210 |
related_changes | 11 | 0.08205281 |
In summary, the language links as a group are clicked more than the other links in the sidebar for all wikis reviewed. Clicks to the main page account for the second most viewed sidebar link. On the larger sized wikis reviewed, clicks to language links represent around half (39-52%) of all clicks to links in the sidebar. On the smaller-sized wikis reviewed, language link clicks represent an even larger portion (77-84%) of all clicks to links in the sidebar.
Questions: what percentage of users use the user links in the header? What is the usage for all of the various links?
Audiences: logged-in users, logged-out users
Why this is relevant: currently the user links take up quite a bit of space in the header. As we think of ways to tidy up the site it would be helpful to understand what the usage is like there.
Data Notes: Sample data from webrequest and pageview_hourly table for en.wiki. Data is from July 2019 and for desktop users on en.wiki. Pages available in header vary for logged-in and logged-out users.
# Overall views to various header links (both logged in and logged out users)
#ran in terminal due to issues running over notebook.
query <-
"SELECT
SUM(IF(page_title = 'Special:MyTalk', 1, null)) AS MyTalk,
SUM(IF(page_title = 'Special:MyContributions', 1, null)) AS MyContributions,
SUM(IF(page_title = 'Special:CreateAccount', 1, null)) AS CreateAccount,
SUM(IF(page_title = 'Special:UserLogin', 1, null)) AS UserLogin,
SUM(IF(page_title LIKE 'User:%', 1, null)) AS UserPage,
SUM(IF(page_title LIKE 'User_talk:%', 1, null)) AS User_talk,
SUM(IF(page_title LIKE 'User:%/sandbox', 1, null)) AS Sandbox,
SUM(IF(page_title = 'Special:Preferences', 1, null)) AS Preferences,
SUM(IF(page_title = 'Special:Watchlist', 1, null)) AS Watchlist,
SUM(IF(page_title LIKE 'Special:Contributions%', 1, null)) AS Contributions,
SUM(IF(page_title = 'Special:UserLogout', 1, null)) AS UserLogout
FROM wmf.pageview_hourly
WHERE year = 2019 and month = 7
AND agent_type = 'user'
AND project = 'en.wikipedia'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
"
header_views_overall_raw <- read.csv(file="Data/header_views_overall.tsv", header=TRUE, sep="\t")
#isolate to links viewed by logged out users
header_views_overall_loggedout <- header_views_overall_raw %>%
gather(header_link, n_views) %>%
filter(header_link %in% c('mytalk', 'mycontributions','createaccount','userlogin')) %>%
mutate(proportion = (n_views / sum(n_views))) %>%
arrange(desc(n_views))
header_views_overall_loggedout
header_link | n_views | proportion |
---|---|---|
<chr> | <int> | <dbl> |
createaccount | 219392 | 0.52479846908 |
mycontributions | 102760 | 0.24580791771 |
mytalk | 95880 | 0.22935055615 |
userlogin | 18 | 0.00004305705 |
#plot bar chart showing usage
p <- ggplot(header_views_overall_loggedout, aes(x= header_link, y= proportion, fill = header_link)) %>%
+ geom_col(position = 'dodge') +
scale_y_continuous("Proportion of header views", labels = scales::percent) +
labs(title = "Desktop pageviews from header links for logged out users") +
ggthemes::theme_tufte(base_size = 10, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid = element_line("gray70"),
legend.position= "none")
ggsave("Figures/proportion_header_views_loggedout.png", p, width = 18, height = 9, units = "in", dpi = 150)
p
he create account link is the most viewed page in the header for logged out users. It accounts for about 52% of all desktop pageviews to links in a header for logged out users. Note: There were very few views to then UserLogin page compared to the other pages in the header bar, which seems unlikely. I spent a little time digging into the data but have not yet determined if this is real behavior or a bug.
#isolate to links viewed by logged in users
header_views_overall_loggedin <- header_views_overall_raw %>%
gather(header_link, n_views) %>%
filter(header_link %in% c('userpage','user_talk', 'sandbox','preferences','watchlist', 'contributions', 'userlogout' )) %>%
mutate(proportion = (n_views / sum(n_views))) %>%
arrange(desc(n_views))
header_views_overall_loggedin
header_link | n_views | proportion |
---|---|---|
<chr> | <int> | <dbl> |
userpage | 1280851 | 0.332278703 |
user_talk | 1023740 | 0.265578900 |
contributions | 870312 | 0.225776568 |
watchlist | 511455 | 0.132681791 |
sandbox | 83960 | 0.021780925 |
userlogout | 55291 | 0.014343606 |
preferences | 29140 | 0.007559506 |
#plot bar chart showing usage
p <- ggplot(header_views_overall_loggedin, aes(x= header_link, y= proportion, fill = header_link)) %>%
+ geom_col(position = 'dodge') +
scale_y_continuous("Proportion of header views", labels = scales::percent) +
labs(title = "Desktop pageviews from header links for logged in users") +
ggthemes::theme_tufte(base_size = 10, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid = element_line("gray70"),
legend.position= "none")
ggsave("Figures/proportion_header_views_loggedin.png", p, width = 18, height = 9, units = "in", dpi = 150)
p
The user page, user talk page and contributions pages are the most viewed pages in the header for logged in users.
## Overall for logged out users. Obtaining sample from the webrequest table.
query <-
"SELECT COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user
FROM (
SELECT
client_ip,
user_agent
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND webrequest_source = 'text'
AND is_pageview
AND x_analytics_map['loggedIn'] is NULL
AND pageview_info['project'] = 'en.wikipedia'
AND pageview_info['page_title'] IN ('Special:MyTalk', 'Special:MyContributions','Special:CreateAccount','Special:UserLogin')
) as page_info"
Logged Out Users
## Overall for logged in users
query <-
"SELECT COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user
FROM (
SELECT
client_ip,
user_agent,
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND webrequest_source = 'text'
AND is_pageview
AND x_analytics_map['loggedIn'] is NOT NULL
AND pageview_info['project'] = 'en.wikipedia'
AND (pageview_info['page_title'] IN ('Special:Preferences', 'Special:Watchlist','Special:UserLogout')
OR
pageview_info['page_title'] LIKE 'User:%' OR
pageview_info['page_title'] LIKE 'User_talk:%' OR
pageview_info['page_title'] LIKE 'User:%/sandbox' OR
pageview_info['page_title'] LIKE 'Special:Contributions%')
)as page_info"
Logged In Users
A higher percentage of logged in users click on a link in the header compared to logged out.
## By Feature for Logged Out Users
query <-
"SELECT
COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user,
header_name
FROM (
SELECT
client_ip,
user_agent,
pageview_info['page_title'] as sidebar_name
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE
year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND is_pageview
AND x_analytics_map['loggedIn'] is NULL
AND webrequest_source = 'text'
AND pageview_info['project'] = 'en.wikipedia'
AND pageview_info['page_title'] IN ('Special:MyTalk', 'Special:MyContributions','Special:CreateAccount','Special:UserLogin')
) AS page_info
Group BY sidebar_name
"
header_views_loggedout_bylink_raw <- read.csv(file="Data/header_views_loggedout_bylink.tsv", header=TRUE, sep= "\t")
header_views_loggedout_bylink <- header_views_loggedout_bylink_raw %>%
mutate(user_pct = as.numeric(n_user)/2038498 *100) %>%
arrange(desc(user_pct))
header_views_loggedout_bylink
n_user | sidebar_name | user_pct |
---|---|---|
<int> | <fct> | <dbl> |
564 | Special:CreateAccount | 0.02766742965 |
252 | Special:MyTalk | 0.01236204303 |
125 | Special:MyContributions | 0.00613196579 |
1 | Special:UserLogin | 0.00004905573 |
## By Feature for Logged In Users
query <-
"SELECT
COUNT(DISTINCT CONCAT(client_ip, user_agent)) as n_user,
header_name
FROM (
SELECT
client_ip,
user_agent,
(CASE
WHEN pageview_info['page_title'] LIKE 'User:%' THEN 'User'
WHEN pageview_info['page_title'] LIKE 'User_talk:%' THEN 'User_talk'
WHEN pageview_info['page_title'] LIKE 'User:%/sandbox' THEN 'Sandbox'
WHEN pageview_info['page_title'] LIKE 'Special:Contributions%' THEN 'Special:Contributions'
ELSE pageview_info['page_title']
END) as header_name
FROM wmf.webrequest TABLESAMPLE(BUCKET 1 OUT OF 1024 ON hostname, sequence)
WHERE
year = 2019 and month = 7
AND agent_type = 'user'
AND access_method = 'desktop'
AND referer_class = 'internal' --isolate to only views to these pages from within wikipedia.
AND is_pageview
AND x_analytics_map['loggedIn'] is NOT NULL
AND webrequest_source = 'text'
AND pageview_info['project'] = 'en.wikipedia'
AND (pageview_info['page_title'] IN ('Special:Preferences', 'Special:Watchlist','Special:UserLogout')
OR
pageview_info['page_title'] LIKE 'User:%' OR
pageview_info['page_title'] LIKE 'User_talk:%' OR
pageview_info['page_title'] LIKE 'User:%/sandbox' OR
pageview_info['page_title'] LIKE 'Special:Contributions%')
) AS page_info
Group BY header_name
"
header_views_loggedin_bylink_raw <- read.csv(file="Data/header_views_loggedin_bylink.tsv", header=TRUE, sep= "\t")
header_views_loggedin_bylink <- header_views_loggedin_bylink_raw %>%
mutate(user_pct = as.numeric(n_user)/44345 *100) %>%
arrange(desc(user_pct))
header_views_loggedin_bylink
n_user | header_name | user_pct |
---|---|---|
<int> | <fct> | <dbl> |
975 | User | 2.198669523 |
866 | Special:Contributions | 1.952869546 |
814 | Special:Watchlist | 1.835607171 |
767 | User_talk | 1.729620025 |
51 | Special:Preferences | 0.115007329 |
3 | Special:UserLogout | 0.006765137 |
Data questions: what percentage of users use language links?
Audiences: logged-in users, logged-out users
Why this is relevant: currently you have to scroll in order to find the language links. We're wondering if a more prominent location, perhaps closer to the article context itself, would make it easier for users to find.
Data Notes: Data from wmf.interlaguage_navigation. Includes data for all projects and from July 2019.
Data below reflects the percent of all requests in July 2019 that were clicks to langauge pages.
#By project looking by the source of request (not target)
query <-
"
SELECT
navigation_count.project as project,
(navigation_count_total / view_count_total) *100 as percent_interlanguage_navigation
FROM (
SELECT
concat(previous_project, '.', project_family) as project,
sum(navigation_count) as navigation_count_total
from wmf.interlanguage_navigation
where date > '2019-06-31'
AND date < '2019-08-01'
group by project_family, previous_project
) as navigation_count
INNER JOIN
(SELECT project,
sum(view_count) as view_count_total
FROM wmf.projectview_hourly
WHERE concat(year, '-', lpad(month, 2, '0'), '-', lpad(day, 2, '0')) between '2019-06-31' and '2019-08-01'
and access_method = 'desktop'
and agent_type = 'user'
group by project
) as projectview
on navigation_count.project = projectview.project
ORDER by percent_interlanguage_navigation DESC LIMIT 5000"
interlanguage_clicks_byproject <- wmf::query_hive(query)
head(interlanguage_clicks_byproject, 50)
project | percent_interlanguage_navigation |
---|---|
<chr> | <dbl> |
kg.wikipedia | 10.363773 |
cv.wikipedia | 9.953629 |
gd.wikipedia | 9.470999 |
pa.wikipedia | 9.375662 |
zh-min-nan.wikipedia | 9.265733 |
arc.wikipedia | 9.254850 |
ga.wikipedia | 9.221888 |
nap.wikipedia | 9.097672 |
ce.wikipedia | 9.080891 |
nv.wikipedia | 9.037040 |
koi.wikipedia | 8.779625 |
pms.wikipedia | 8.743561 |
ba.wikipedia | 8.636493 |
ace.wikipedia | 8.499482 |
stq.wikipedia | 8.476840 |
ckb.wikipedia | 8.421348 |
ia.wikipedia | 8.302479 |
io.wikipedia | 8.299008 |
bxr.wikipedia | 8.293610 |
oc.wikipedia | 8.229620 |
ilo.wikipedia | 8.136071 |
cdo.wikipedia | 8.135914 |
br.wikipedia | 8.112895 |
yo.wikipedia | 8.033573 |
cy.wikipedia | 7.996634 |
wuu.wikipedia | 7.987096 |
tum.wikipedia | 7.949838 |
jv.wikipedia | 7.903932 |
am.wikipedia | 7.862119 |
gv.wikipedia | 7.814249 |
ts.wikipedia | 7.781456 |
pnb.wikipedia | 7.775095 |
sco.wikipedia | 7.768593 |
szl.wikipedia | 7.753842 |
hak.wikipedia | 7.712114 |
ps.wikipedia | 7.707871 |
iu.wikipedia | 7.689288 |
is.wikipedia | 7.670631 |
st.wikipedia | 7.669688 |
qu.wikipedia | 7.638240 |
fo.wikipedia | 7.545703 |
sah.wikipedia | 7.506009 |
myv.wikipedia | 7.491589 |
ht.wikipedia | 7.481700 |
frp.wikipedia | 7.469399 |
an.wikipedia | 7.449329 |
xmf.wikipedia | 7.414009 |
bh.wikipedia | 7.376527 |
ku.wikipedia | 7.334941 |
ext.wikipedia | 7.326225 |
#Views for top sized_wikis
interlanguage_clicks_byproject_topwikis <- interlanguage_clicks_byproject %>%
filter(project %in% c('en.wikipedia', 'es.wikipedia', 'de.wikipedia', 'ja.wikipedia',
'fr.wikipedia', 'ru.wikipedia', 'it.wikipedia', 'zh.wikipedia')) %>%
arrange(desc(percent_interlanguage_navigation))
interlanguage_clicks_byproject_topwikis
project | percent_interlanguage_navigation |
---|---|
<chr> | <dbl> |
fr.wikipedia | 0.3858005 |
de.wikipedia | 0.3707331 |
it.wikipedia | 0.3147767 |
ja.wikipedia | 0.2740005 |
zh.wikipedia | 0.2383240 |
en.wikipedia | 0.2179289 |
es.wikipedia | 0.2024851 |
ru.wikipedia | 0.1820905 |
#overall across all projects
query <-
"
SELECT
SUM(navigation_count_total) as navigation_count_total,
SUM(view_count_total) as view_count_total
FROM (
SELECT
concat(previous_project, '.', project_family) as project,
sum(navigation_count) as navigation_count_total
FROM wmf.interlanguage_navigation
WHERE date >= '2019-07-01'
AND date <= '2019-07-31'
group by project_family, previous_project
) as navigation_count
INNER JOIN
(
SELECT project,
sum(view_count) as view_count_total
FROM wmf.projectview_hourly
WHERE
concat(year, '-', lpad(month, 2, '0'), '-', lpad(day, 2, '0')) between '2019-07-01' and '2019-07-31'
and access_method = 'desktop'
and agent_type = 'user'
group by project
) as projectview
on navigation_count.project = projectview.project"
interlanguage_clicks_overall <- wmf::query_hive(query)
interlanguage_clicks_overall
navigation_count_total | view_count_total |
---|---|
<int> | <dbl> |
23948360 | 6166994793 |
About 0.39% of all pageviews from desktop are interlanguage navigation.
Table of Contents clicks are not recorded by the client or server side. I'll do a little more asking with the team to confirm if there is a way to use the uri_path.