Pending resolution of identified Prefupdate bugs, I reviewed the mediawiki user_properties table to determine the total number of users with each vector skin preference set for each of the test wiki. Note: Unlike PrefUpdate, this does not record every change in users preference but stores any current non-default state of the user's preference. As result, this data may included users that enabled and disabled their vectorskin version preference multiple times.
This reflects all current nondefault user preferences. If a user has not made any changes to their vector skin version then there will be no record for that user in this table and skin preference is default (i.e. Modern for logged-in users on test wikis)
VectorSkinVersion
set to 0 means.import pandas as pd
import numpy as np
import datetime as dt
from wmfdata import hive, mariadb
query = """
SELECT
up_value as skin,
COUNT(*) as users
FROM user_properties
WHERE
up_property = 'VectorSkinVersion'
GROUP BY up_value """
#define list of target wikis
wikis = ['euwiki','frwiktionary', 'ptwikiversity', 'fawiki','hewiki', 'frwiki']
up_skin=list()
for wiki in wikis:
prefs = mariadb.run(
query.format(),
wiki
)
up_skin.append(prefs)
skin= pd.concat(up_skin)
skin_users = skin['users'].sum()
print('Total number of users for whom we have vector skin preferences set in the user_properties table:' , skin_users)
Total number of users for whom we have vector skin preferences set in the user_properties table: 226605
skin_aliases = {
"0":"unknown",
"1":"legacy",
"2":"modern"
}
skin= skin.replace({"skin": skin_aliases})
user_skin=skin.groupby('skin').sum()
user_skin
users | |
---|---|
skin | |
legacy | 128242 |
modern | 98334 |
unknown | 29 |
#List of wikis to correspond to data values
wikis_list = ['euwiki','euwiki', 'frwiktionary', 'frwiktionary', 'frwiktionary', 'ptwikiversity', 'ptwikiversity','fawiki', 'fawiki', 'fawiki','hewiki', 'hewiki', 'hewiki', 'frwiki', 'frwiki', 'frwiki']
skin['wiki'] = wikis_list
user_skin_bywiki=pd.pivot_table(skin, index=['wiki','skin'],values=['users'],aggfunc=np.sum)
user_skin_bywiki
users | ||
---|---|---|
wiki | skin | |
euwiki | legacy | 2067 |
modern | 1852 | |
fawiki | legacy | 21442 |
modern | 22046 | |
unknown | 2 | |
frwiki | legacy | 71535 |
modern | 49185 | |
unknown | 19 | |
frwiktionary | legacy | 4684 |
modern | 3899 | |
unknown | 6 | |
hewiki | legacy | 28034 |
modern | 20890 | |
unknown | 2 | |
ptwikiversity | legacy | 480 |
modern | 462 |
We can use the total number of users on test wikis using the mediawiki user table to estimate the opt-out rate.
# collect total number of users on each wiki
query = """
SELECT
COUNT(DISTINCT user_id) AS num_users
FROM user"""
user_count = mariadb.run(commands = query, dbs = wikis, format="pandas")
user_count['wiki'] = wikis
user_count
num_users | wiki | |
---|---|---|
0 | 115786 | euwiki |
1 | 291868 | frwiktionary |
2 | 29455 | ptwikiversity |
3 | 963101 | fawiki |
4 | 685306 | hewiki |
5 | 3905482 | frwiki |
The opt-out rate was calculated by dividing the total number of users with their vector version preference changed to 'legacy' by the total number of all registered users on the wiki.
# Create list of legacy users - these are all users that opt-out assuming modern is the default
legacy_users = skin[skin['skin']=='legacy']
#rename colums
legacy_users.columns = ['skin', 'num_legacy_users', 'wiki']
# join to user_count table to obtain opt-out rate for each wiki
opt_out_rate = legacy_users.merge(user_count, left_on = 'wiki', right_on = 'wiki')
# Calculate opt-out rate
opt_out_rate['pct_opt_out_rate'] = opt_out_rate['num_legacy_users']/ opt_out_rate['num_users'] * 100
opt_out_rate
skin | num_legacy_users | wiki | num_users | opt_out_rate | pct_opt_out_rate | |
---|---|---|---|---|---|---|
0 | legacy | 2067 | euwiki | 115786 | 0.017852 | 1.785190 |
1 | legacy | 4684 | frwiktionary | 291868 | 0.016048 | 1.604835 |
2 | legacy | 480 | ptwikiversity | 29455 | 0.016296 | 1.629604 |
3 | legacy | 21442 | fawiki | 963101 | 0.022264 | 2.226350 |
4 | legacy | 28034 | hewiki | 685306 | 0.040907 | 4.090727 |
5 | legacy | 71535 | frwiki | 3905482 | 0.018317 | 1.831656 |
I reviewed the opt-out rate among active editors (users that had 5 or more content edits overall in the last year from September 2019 to September 2020). This was calculated by finding the percent of active editors for each wiki (obtained using data from mediawiki history table that have VectorSkinVersion
preference set to legacy in the user properties table.
Since the modern vector version was deployed as default to all of the test wikis in this analysis, it was assumed that any users with a non-default preference recorded as legacy have opt-d out.
HIVE_SNAPSHOT = "2020-09"
START_OF_DATA = "2019-09-01"
END_OF_DATA = "2020-10-01"
#all active editors from the past one year
active_editor_query = """
WITH yr_proj_edits as (
select
event_user_text as user,
event_user_id as user_id,
wiki_db as proj,
sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as content_edits,
max(event_timestamp) as latest_edit
from wmf.mediawiki_history
where
-- review target wikis
wiki_db IN ('euwiki','frwiktionary', 'ptwikiversity', 'fawiki','hewiki', 'frwiki') and
-- REGISTERED
event_user_is_anonymous = false and
-- NON-BOT
size(event_user_is_bot_by) = 0 and
not array_contains(event_user_groups, "bot") and
-- CONTENT EDITS
event_entity = "revision" and
event_type = "create" and
page_namespace_is_content = true and
-- FROM THE LAST YEAR
event_timestamp >= "{START_OF_DATA}" and event_timestamp < "{END_OF_DATA}" and
-- FROM THE LATEST SNAPSHOT
snapshot = "{hive_snapshot}"
-- PER USER, PER WIKI
group by event_user_text, event_user_id, wiki_db
)
-- FINAL SELECT OF
select
user as user_name,
user_id as user_id,
proj as wiki,
global_edits
from
-- JOINED TO THEIR HOME WIKI AND GLOBAL EDITS
(
select
user,
user_id,
proj,
-- in the unlikely event that wikis are tied by edit count and latest edit,
-- row_number() will break it somehow
row_number() over (partition by user order by content_edits desc, latest_edit desc) as rank,
sum(content_edits) over (partition by user) as global_edits
from yr_proj_edits
) yr_edits
where
rank = 1
and global_edits>= 5
"""
active_editor = hive.run(
active_editor_query.format(
hive_snapshot = HIVE_SNAPSHOT,
START_OF_DATA= START_OF_DATA,
END_OF_DATA=END_OF_DATA
)
)
#Total_active_ed = active_editor['user_id'].count()
Total_active_ed = active_editor.groupby(['wiki'])[['user_id']].count()
print('Total number of editors for whom we will be checking vector skin preferences:' , Total_active_ed)
Total number of editors for whom we will be checking vector skin preferences: user_id wiki euwiki 788 fawiki 12421 frwiki 35442 frwiktionary 446 hewiki 5958 ptwikiversity 81
#Querying user_properties for getting the skin preferences set by the active editors we got in the above query
query='''
SELECT
up_value AS skin,
COUNT(*) AS users
FROM user_properties
WHERE up_user in ({users})
AND up_property = "VectorSkinVersion"
GROUP BY up_value
'''
#define list of target wikis
wikis = ['euwiki','frwiktionary', 'ptwikiversity', 'fawiki','hewiki', 'frwiki']
# Looping through each wiki for the list of users for each skin
up_skin=list()
for wiki in wikis:
user_ids = active_editor[active_editor['wiki'] == wiki]["user_id"]
user_list = ','.join([str(u) for u in user_ids])
prefs = mariadb.run(
query.format(users=user_list),
wiki
)
up_skin.append(prefs)
skin= pd.concat(up_skin)
#List of wikis to correspond to data values
wikis_list = ['euwiki','euwiki', 'frwiktionary', 'frwiktionary', 'ptwikiversity', 'ptwikiversity','fawiki', 'fawiki', 'fawiki','hewiki', 'hewiki', 'frwiki', 'frwiki', 'frwiki']
# add wiki column
skin['wiki'] = wikis_list
# skin_users = skin['users'].sum()
skin_users = skin.groupby(['wiki']).sum()
print('Total number of users for whom we have vector skin preferences set in the user_properties table:' , skin_users)
Total number of users for whom we have vector skin preferences set in the user_properties table: users wiki euwiki 67 fawiki 3316 frwiki 6399 frwiktionary 93 hewiki 1439 ptwikiversity 11
Note: There are a large number of users who do not have data for vector skin preference in the user_preference table indicating that they are set to the default 'Modern' skin OR due to being deleted from the user_preference table. For the analysis below, let's default them to 'Modern'.
modern_users=np.subtract(Total_active_ed,skin_users)
modern_users
user_id | |
---|---|
wiki | |
euwiki | 721 |
fawiki | 9105 |
frwiki | 29043 |
frwiktionary | 353 |
hewiki | 4519 |
ptwikiversity | 70 |
#crate data frame of modern users
modern_users_df = pd.DataFrame([['modern', 721, 'euwiki'], ['modern', 9105, 'fawiki'],
['modern', 29043, 'frwiki'],['modern', 353, 'frwiktionary'], ['modern', 4519, 'hewiki'],
['modern', 70, 'ptwikiversity']],
columns=['skin','users','wiki'])
modern_users_df
skin | users | wiki | |
---|---|---|---|
0 | modern | 721 | euwiki |
1 | modern | 9105 | fawiki |
2 | modern | 29043 | frwiki |
3 | modern | 353 | frwiktionary |
4 | modern | 4519 | hewiki |
5 | modern | 70 | ptwikiversity |
# Define skin type for each property values
skin_aliases = {
"0":"unknown",
"1":"legacy",
"2":"modern"
}
skin= skin.replace({"skin": skin_aliases})
# combine modern_users with skin table
skin = skin.append(modern_users_df,ignore_index=True)
skin
skin | users | wiki | |
---|---|---|---|
0 | legacy | 40 | euwiki |
1 | modern | 27 | euwiki |
2 | legacy | 52 | frwiktionary |
3 | modern | 41 | frwiktionary |
4 | legacy | 7 | ptwikiversity |
5 | modern | 4 | ptwikiversity |
6 | unknown | 1 | fawiki |
7 | legacy | 1874 | fawiki |
8 | modern | 1441 | fawiki |
9 | legacy | 863 | hewiki |
10 | modern | 576 | hewiki |
11 | unknown | 9 | frwiki |
12 | legacy | 4299 | frwiki |
13 | modern | 2091 | frwiki |
14 | modern | 721 | euwiki |
15 | modern | 9105 | fawiki |
16 | modern | 29043 | frwiki |
17 | modern | 353 | frwiktionary |
18 | modern | 4519 | hewiki |
19 | modern | 70 | ptwikiversity |
user_skin=skin.groupby('skin').sum()
user_skin
users | |
---|---|
skin | |
legacy | 7135 |
modern | 47991 |
unknown | 10 |
user_skin_bywiki=pd.pivot_table(skin, index=['wiki','skin'],values=['users'],aggfunc=np.sum)
user_skin_bywiki
users | ||
---|---|---|
wiki | skin | |
euwiki | legacy | 40 |
modern | 748 | |
fawiki | legacy | 1874 |
modern | 10546 | |
unknown | 1 | |
frwiki | legacy | 4299 |
modern | 31134 | |
unknown | 9 | |
frwiktionary | legacy | 52 |
modern | 394 | |
hewiki | legacy | 863 |
modern | 5095 | |
ptwikiversity | legacy | 7 |
modern | 74 |
# overall
pct_user_skin=(100. * user_skin / user_skin.sum()).round(1).astype(str) + '%'
pct_user_skin.sort_values(by=['users'],ascending=False)
users | |
---|---|
skin | |
modern | 87.0% |
legacy | 12.9% |
unknown | 0.0% |
# by target wiki
pct_user_skin_bywiki = user_skin_bywiki.groupby(['wiki', 'skin']).agg({'users': 'sum'})
wiki = user_skin_bywiki.groupby(['wiki']).agg({'users': 'sum'})
pct_user_skin_bywiki.div(wiki, level='wiki') * 100
users | ||
---|---|---|
wiki | skin | |
euwiki | legacy | 5.076142 |
modern | 94.923858 | |
fawiki | legacy | 15.087352 |
modern | 84.904597 | |
unknown | 0.008051 | |
frwiki | legacy | 12.129677 |
modern | 87.844930 | |
unknown | 0.025394 | |
frwiktionary | legacy | 11.659193 |
modern | 88.340807 | |
hewiki | legacy | 14.484726 |
modern | 85.515274 | |
ptwikiversity | legacy | 8.641975 |
modern | 91.358025 |
The percentage of legacy users listed above for each wiki reflects the opt-out rate as the modern vector was presented as default for all these wikis.
The opt out rates among active editors for each target wiki are still well below 40%. Persian Wikipedia (fawiki) currently has the highest opt out rate (15.1%) among active editors while Basque Wikipedia (euwiki) has the lowest (5.1%).