#!/usr/bin/env python # coding: utf-8 # # Desktop Vector Skin Version User Preferences # [Task](https://phabricator.wikimedia.org/T260149) # # Pending resolution of identified Prefupdate bugs, I reviewed the mediawiki [user_properties table](https://www.mediawiki.org/wiki/Manual:User_properties_table) to determine the total number of users with each vector skin preference set for each of the test wiki. Note: Unlike PrefUpdate, this does not record every change in users preference but stores any current non-default state of the user's preference. As result, this data may included users that enabled and disabled their vectorskin version preference multiple times. # # # Contents # 1. [Calculate opt out rate among registered users](#Calculate-opt-out-rate-among-all-registered-users) # 2. [Calculate opt out rate among active editors](#Calculate-opt-out-rate-among-active-editors) # ## Data Notes: # # This reflects all current nondefault user preferences. If a user has not made any changes to their vector skin version then there will be no record for that user in this table and skin preference is default (i.e. Modern for logged-in users on test wikis) # * Only accounts for logged-in users. # * Data reflects the current state and does not account for users that opt in and opt out multiple times since deployment. # * Based on the context that the new vector skin was deployed as the default setting for all logged-in users on the wikis below, we can assume each of the values mean the following: # * Legacy: These are users that have currently opt'd out of the modern version. # * Modern: These are users that opt'd out and then opt'd back in to the modern version. (Note: This number does not reflect the total number of users that are using the modern skin; only the users that made changes to their default preferences) # * Unknown: There were a few VectorSkinVersion values set to 0 (instead of 1 [legacy] or 2[modern]). I need to further investigate what those values indicate. # ## Remaining To Dos: # - Investigate what `VectorSkinVersion` set to 0 means. # - Look into options of putting on superset dashboard. # - Pending resolution of PrefUpdate bugs or new schema, show opt-out rate over time. # In[1]: import pandas as pd import numpy as np import datetime as dt from wmfdata import hive, mariadb # # Calculate opt out rate among all registered users # In[139]: query = """ SELECT up_value as skin, COUNT(*) as users FROM user_properties WHERE up_property = 'VectorSkinVersion' GROUP BY up_value """ # In[150]: #define list of target wikis wikis = ['euwiki','frwiktionary', 'ptwikiversity', 'fawiki','hewiki', 'frwiki'] # In[141]: up_skin=list() for wiki in wikis: prefs = mariadb.run( query.format(), wiki ) up_skin.append(prefs) skin= pd.concat(up_skin) # In[142]: skin_users = skin['users'].sum() print('Total number of users for whom we have vector skin preferences set in the user_properties table:' , skin_users) # In[143]: skin_aliases = { "0":"unknown", "1":"legacy", "2":"modern" } skin= skin.replace({"skin": skin_aliases}) # ## Number of users for each skin type overall # In[144]: user_skin=skin.groupby('skin').sum() user_skin # ## Number of users for each skin type by wiki # In[145]: #List of wikis to correspond to data values wikis_list = ['euwiki','euwiki', 'frwiktionary', 'frwiktionary', 'frwiktionary', 'ptwikiversity', 'ptwikiversity','fawiki', 'fawiki', 'fawiki','hewiki', 'hewiki', 'hewiki', 'frwiki', 'frwiki', 'frwiki'] # In[146]: skin['wiki'] = wikis_list # In[148]: user_skin_bywiki=pd.pivot_table(skin, index=['wiki','skin'],values=['users'],aggfunc=np.sum) # In[149]: user_skin_bywiki # ## Total Number of Registered Users on Test Wikis # We can use the total number of users on test wikis using the [mediawiki user table](https://www.mediawiki.org/wiki/Manual:User_table) to estimate the opt-out rate. # In[155]: # collect total number of users on each wiki query = """ SELECT COUNT(DISTINCT user_id) AS num_users FROM user""" # In[156]: user_count = mariadb.run(commands = query, dbs = wikis, format="pandas") # In[157]: user_count['wiki'] = wikis # In[158]: user_count # ## Opt Out Rate for Registered Users # # The opt-out rate was calculated by dividing the total number of users with their vector version preference changed to 'legacy' by the total number of all registered users on the wiki. # In[166]: # Create list of legacy users - these are all users that opt-out assuming modern is the default legacy_users = skin[skin['skin']=='legacy'] #rename colums legacy_users.columns = ['skin', 'num_legacy_users', 'wiki'] # In[ ]: # join to user_count table to obtain opt-out rate for each wiki opt_out_rate = legacy_users.merge(user_count, left_on = 'wiki', right_on = 'wiki') # In[174]: # Calculate opt-out rate opt_out_rate['pct_opt_out_rate'] = opt_out_rate['num_legacy_users']/ opt_out_rate['num_users'] * 100 opt_out_rate # # Calculate opt out rate among active editors # I reviewed the opt-out rate among active editors (users that had 5 or more content edits overall in the last year from September 2019 to September 2020). This was calculated by finding the percent of active editors for each wiki (obtained using data from [mediawiki history table](https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/MediaWiki_history) that have `VectorSkinVersion` preference set to legacy in the [user properties table](https://www.mediawiki.org/wiki/Manual:User_properties_table/en). # # Since the modern vector version was deployed as default to all of the test wikis in this analysis, it was assumed that any users with a non-default preference recorded as legacy have opt-d out. # # In[172]: HIVE_SNAPSHOT = "2020-09" START_OF_DATA = "2019-09-01" END_OF_DATA = "2020-10-01" # ## Collect number of active users # In[173]: #all active editors from the past one year active_editor_query = """ WITH yr_proj_edits as ( select event_user_text as user, event_user_id as user_id, wiki_db as proj, sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as content_edits, max(event_timestamp) as latest_edit from wmf.mediawiki_history where -- review target wikis wiki_db IN ('euwiki','frwiktionary', 'ptwikiversity', 'fawiki','hewiki', 'frwiki') and -- REGISTERED event_user_is_anonymous = false and -- NON-BOT size(event_user_is_bot_by) = 0 and not array_contains(event_user_groups, "bot") and -- CONTENT EDITS event_entity = "revision" and event_type = "create" and page_namespace_is_content = true and -- FROM THE LAST YEAR event_timestamp >= "{START_OF_DATA}" and event_timestamp < "{END_OF_DATA}" and -- FROM THE LATEST SNAPSHOT snapshot = "{hive_snapshot}" -- PER USER, PER WIKI group by event_user_text, event_user_id, wiki_db ) -- FINAL SELECT OF select user as user_name, user_id as user_id, proj as wiki, global_edits from -- JOINED TO THEIR HOME WIKI AND GLOBAL EDITS ( select user, user_id, proj, -- in the unlikely event that wikis are tied by edit count and latest edit, -- row_number() will break it somehow row_number() over (partition by user order by content_edits desc, latest_edit desc) as rank, sum(content_edits) over (partition by user) as global_edits from yr_proj_edits ) yr_edits where rank = 1 and global_edits>= 5 """ # In[174]: active_editor = hive.run( active_editor_query.format( hive_snapshot = HIVE_SNAPSHOT, START_OF_DATA= START_OF_DATA, END_OF_DATA=END_OF_DATA ) ) # In[176]: #Total_active_ed = active_editor['user_id'].count() Total_active_ed = active_editor.groupby(['wiki'])[['user_id']].count() print('Total number of editors for whom we will be checking vector skin preferences:' , Total_active_ed) # ## Vector Skin Preferences By Active Users # In[177]: #Querying user_properties for getting the skin preferences set by the active editors we got in the above query query=''' SELECT up_value AS skin, COUNT(*) AS users FROM user_properties WHERE up_user in ({users}) AND up_property = "VectorSkinVersion" GROUP BY up_value ''' # In[178]: #define list of target wikis wikis = ['euwiki','frwiktionary', 'ptwikiversity', 'fawiki','hewiki', 'frwiki'] # In[179]: # Looping through each wiki for the list of users for each skin up_skin=list() for wiki in wikis: user_ids = active_editor[active_editor['wiki'] == wiki]["user_id"] user_list = ','.join([str(u) for u in user_ids]) prefs = mariadb.run( query.format(users=user_list), wiki ) up_skin.append(prefs) skin= pd.concat(up_skin) # In[180]: #List of wikis to correspond to data values wikis_list = ['euwiki','euwiki', 'frwiktionary', 'frwiktionary', 'ptwikiversity', 'ptwikiversity','fawiki', 'fawiki', 'fawiki','hewiki', 'hewiki', 'frwiki', 'frwiki', 'frwiki'] # In[181]: # add wiki column skin['wiki'] = wikis_list # In[182]: # skin_users = skin['users'].sum() skin_users = skin.groupby(['wiki']).sum() print('Total number of users for whom we have vector skin preferences set in the user_properties table:' , skin_users) # Note: There are a large number of users who do not have data for vector skin preference in the user_preference table indicating that they are set to the default 'Modern' skin OR due to being deleted from the user_preference table. For the analysis below, let's default them to 'Modern'. # In[183]: modern_users=np.subtract(Total_active_ed,skin_users) modern_users # In[184]: #crate data frame of modern users modern_users_df = pd.DataFrame([['modern', 721, 'euwiki'], ['modern', 9105, 'fawiki'], ['modern', 29043, 'frwiki'],['modern', 353, 'frwiktionary'], ['modern', 4519, 'hewiki'], ['modern', 70, 'ptwikiversity']], columns=['skin','users','wiki']) modern_users_df # In[185]: # Define skin type for each property values skin_aliases = { "0":"unknown", "1":"legacy", "2":"modern" } skin= skin.replace({"skin": skin_aliases}) # In[186]: # combine modern_users with skin table skin = skin.append(modern_users_df,ignore_index=True) skin # ## Number of Active Editors for Skin Type # In[187]: user_skin=skin.groupby('skin').sum() user_skin # In[188]: user_skin_bywiki=pd.pivot_table(skin, index=['wiki','skin'],values=['users'],aggfunc=np.sum) user_skin_bywiki # ## Perentage of Active Editors for Each Skin Type # In[189]: # overall pct_user_skin=(100. * user_skin / user_skin.sum()).round(1).astype(str) + '%' pct_user_skin.sort_values(by=['users'],ascending=False) # In[190]: # by target wiki pct_user_skin_bywiki = user_skin_bywiki.groupby(['wiki', 'skin']).agg({'users': 'sum'}) wiki = user_skin_bywiki.groupby(['wiki']).agg({'users': 'sum'}) pct_user_skin_bywiki.div(wiki, level='wiki') * 100 # The percentage of legacy users listed above for each wiki reflects the opt-out rate as the modern vector was presented as default for all these wikis. # # The opt out rates among active editors for each target wiki are still well below 40%. Persian Wikipedia (fawiki) currently has the highest opt out rate (15.1%) among active editors while Basque Wikipedia (euwiki) has the lowest (5.1%).