#!/usr/bin/env python # coding: utf-8 # # AHT team IP info feature analysis # __Jennifer Wang, Staff Data Scientist, Wikimedia Foundation__ # # # September 2022 # [Analysis ticket](https://phabricator.wikimedia.org/T313340) | [Instrument spec]( # https://docs.google.com/spreadsheets/d/16S5fqAEDz3A-Lu0rvtkiIgyeP_00ZX8qjz6GhH9fIlk/edit?usp=sharing) # # Table of Contents # 1. [Introduction](#Introduction) # 2. [Is the information surfaced in IP Info useful?](#Is-the-information-surfaced-in-IP-Info-useful?) # 3. [What is the usage of the popup on the different pages where it is exposed?](#What-is-the-usage-of-the-popup-on-the-different-pages-where-it-is-exposed?) # 4. [Do people understand the information being shown?](#Do-people-understand-the-information-being-shown?) # 5. [Who is the information shown most to?](#Who-is-the-information-shown-most-to?) # 6. [Opt-in / opt-out](#Opt-in-/-opt-out) # 7. [Other interesting findings](#Other-interesting-findings) # # Introduction # # AHT team launched the IP info feature. This feature is to make it easier for admins, anti-vandal fighters and power users to access information about IP addresses. More details can be found on the [project page](https://meta.wikimedia.org/wiki/IP_Editing:_Privacy_Enhancement_and_Abuse_Mitigation/IP_Info_feature). # # Analysis measured the enrollment and the usage of IP info on all wiki projects to answer the question listed in [T313340](https://phabricator.wikimedia.org/T313340) # In[269]: import wmfdata wmfdata.utils.insert_code_toggle() # In[270]: from wmfdata import hive import pandas as pd # In[19]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt # In[39]: import numpy as np # # Is the information surfaced in IP Info useful? # # Measurement is only for popups because we cannot measure accordion. Reason: Accordion is expanded or collapsed based on its previous status. Users can access IP info without clicking the accordion. Meanwhile, users visit the "User contributions" page for two possible purposes: to view the IP info, or to view the contributions. We cannot measure the accordion by the number of visits to the "User contributions" page. # # __IP info popup__ # # __IP info accordion__ # # ## Is the tool being used/accessed for the popup? # In[96]: query = ''' SELECT TO_DATE(meta.dt) AS y_m_d , count(1) AS ipinfo_popup FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ="open_popup" AND event_context ="page" AND normalized_host.project NOT IN ('test', 'test2') GROUP BY TO_DATE(meta.dt) ORDER BY y_m_d LIMIT 100000 ''' # In[97]: df = hive.run(query) # In[98]: df.loc[:, 'y_m_d']=pd.to_datetime(df['y_m_d']).copy() # In[100]: df['rolling_avg'] = df['ipinfo_popup'].rolling(window=7).mean().round(0) # In[65]: xlabel='Date' ylabel='Number of IP info window popups' title="Daily IP info window popups on page" fig = plt.figure(facecolor='w', figsize=(16, 6)) ax = fig.add_subplot(111) #fcst_t = df['y_m_d'].dt.to_pydatetime() ax.plot(df['y_m_d'].dt.to_pydatetime(), df['ipinfo_popup'], 'k-', label='Actual') ax.plot(df['y_m_d'].dt.to_pydatetime(), df['rolling_avg'],'r-', label='Rolling average') ax.grid(True, which='major', c='gray', ls='-', lw=1, alpha=0.2) ax.set_xlabel(xlabel, fontsize = 16, family='serif') ax.set_ylabel(ylabel, fontsize = 16, family='serif') ax.set_title(title,fontweight="bold",fontsize = 20, family='serif', y=1.08) ax.legend(loc='upper right') fig.tight_layout() # In[ ]: plt.savefig("Figures_out/1_popup.png", facecolor='white', transparent=False) # __Average__ # In[42]: df['ipinfo_popup'].mean().round(2) # __Median__. # In[40]: np.percentile(df['ipinfo_popup'], 50) # __Note__ # Between 2022-05-25 and 2022-08-22, IP info popup was used 105 daily in average cross all wikis # ### Top 5 wikis # In[102]: query_popup_wiki =''' SELECT concat(normalized_host.project, '.', normalized_host.project_class), count(1) AS popups, min(meta.dt) AS start_date, max(meta.dt) AS end_date FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ='open_popup' AND event_context ='page' AND normalized_host.project NOT IN ('test', 'test2') GROUP BY concat(normalized_host.project, '.', normalized_host.project_class) ORDER BY popups DESC LIMIT 100000 ''' # In[103]: df_wiki = hive.run(query_popup_wiki ) # In[104]: df_wiki # __Note__ # - Between 2022-05-27 and 2022-08-25, IP info popup has been used on 95 wiki projects. # - Top 5 wikis which mostly used IP info are: en.wikipedia, ru.wikipedia, zh.wikipedia, fr.wikipedia, es.wikipedia # # In[ ]: # ## How often is the accordion expanded/collapsed? # # __IP info accordion collapsed__ # # __IP info accordion expanded__ # # In[107]: query_accordion =''' SELECT TO_DATE(meta.dt) AS y_m_d ,event_action , count(1) AS counts FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_context ="infobox" AND event_action IN ('collapse','expand') AND normalized_host.project NOT IN ('test', 'test2') GROUP BY TO_DATE(meta.dt), event_action ORDER BY y_m_d LIMIT 100000 ''' # In[108]: df_accordion =hive.run(query_accordion) # In[109]: df_accordion # In[110]: df_accordion.loc[:, 'y_m_d']=pd.to_datetime(df_accordion['y_m_d']).copy() # In[125]: xlabel='Date' ylabel='Number of IP info accordion expansions and collapses' title="Daily IP info accordion expansions and collapses" df_e=df_accordion[df_accordion.event_action=='expand'][:-1] df_c=df_accordion[df_accordion.event_action=='collapse'][:-1] fig = plt.figure(facecolor='w', figsize=(16, 6)) ax = fig.add_subplot(111) #fcst_t = df['y_m_d'].dt.to_pydatetime() ax.plot(df_e['y_m_d'].dt.to_pydatetime(), df_e['counts'], 'g-', label='Expansions') ax.plot(df_c['y_m_d'].dt.to_pydatetime(), df_c['counts'],'b-', label='Collapses') ax.grid(True, which='major', c='gray', ls='-', lw=1, alpha=0.2) ax.set_xlabel(xlabel, fontsize = 16, family='serif') ax.set_ylabel(ylabel, fontsize = 16, family='serif') ax.set_title(title,fontweight="bold",fontsize = 20, family='serif', y=1.08) ax.legend(loc='upper right') fig.tight_layout() # __Note__. # # - The number of clicks to expand is relatively higher than the number of clicks to collapse. # - The trends of expansions and collapses are correlated well. # In[ ]: # ## How often is the accordion opened *after* the popup was opened? # # User flow: Click on any place of the popup leads to contribution page with the accordion # # Click rate of the link on popup to open the accordion(Special:Contribs page) = # of clicks on the link on popup to open the accordion/ # of clicks on "i" icon by the IP to popup # # In[112]: query_click_rate = ''' WITH t_i AS (SELECT concat(normalized_host.project, '.', normalized_host.project_class) AS wiki, count(1) AS ipinfo_popups FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ="open_popup" AND event_context ="page" AND normalized_host.project NOT IN ('test', 'test2') GROUP BY concat(normalized_host.project, '.', normalized_host.project_class) ), t_c AS ( SELECT concat(normalized_host.project, '.', normalized_host.project_class) AS wiki, count(1) AS clicks_on_popup FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ="open_infobox" AND event_context ="popup" AND normalized_host.project NOT IN ('test', 'test2') GROUP BY concat(normalized_host.project, '.', normalized_host.project_class) ) SELECT t_i.wiki, t_i.ipinfo_popups, t_c.clicks_on_popup FROM t_i LEFT JOIN t_c ON t_i.wiki=t_c.wiki ORDER BY t_i.ipinfo_popups DESC LIMIT 100000 ''' # In[113]: df_click_rate=hive.run(query_click_rate) # In[114]: df_click_rate.head(10) # In[ ]: df_click_rate[np.isnan(df_click_rate.clicks_on_popup)] = 0 # Cilcks/Popups # In[117]: df_click_rate.clicks_on_popup.sum()/df_click_rate.ipinfo_popups.sum() # __Note__ # # 2.3% of IP info popups lead to a click for the accordion. To check with PM. # # This feature is not obvious to a new user. # ## How often is limited view shown? How often is full view shown? # In[118]: query_version = ''' SELECT TO_DATE(meta.dt) AS y_m_d , event_ipinfo_version, count(1) AS ipinfo_popup FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ="open_popup" AND event_context ="page" AND normalized_host.project NOT IN ('test', 'test2') GROUP BY TO_DATE(meta.dt), event_ipinfo_version ORDER BY y_m_d LIMIT 100000 ''' # In[119]: df_version=hive.run(query_version) # In[120]: df_version # __Total popups__ # In[121]: df_version.ipinfo_popup.sum() # __Popups in full verion/total popups__ # In[122]: df_version[df_version.event_ipinfo_version=='full'].ipinfo_popup.sum()/9468 # __Popups in basic version/total popups__ # In[123]: df_version[df_version.event_ipinfo_version=='basic'].ipinfo_popup.sum()/9468 # __Note__ # # Between 2022-05-27 and 2022-08-25, 44% popups show full version, 56% popups show basic version # # # What is the usage of the popup on the different pages where it is exposed? # IP info feature was enabled on following four types of pages: # __1.revision action history page__ # #
# __2.Special:Log page__ # #
# __3.Special:RecentChanges page__ # #
# __4.Special:watchlist page__ # #
# Note: Special:Contributions : do not have show popups. It's expected. # # In[126]: query_source = ''' SELECT TO_DATE(meta.dt) AS y_m_d ,event_source, count(1) AS ipinfo_popup FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ="open_popup" AND event_context ="page" AND normalized_host.project NOT IN ('test', 'test2') GROUP BY TO_DATE(meta.dt),event_source ORDER BY y_m_d LIMIT 100000 ''' # In[127]: df_source =hive.run(query_source) # In[132]: df_source.loc[:, 'y_m_d']=pd.to_datetime(df_source['y_m_d']).copy() df_source # In[144]: df_source[df_source.event_source=='special_contributions'][:-1] # In[134]: xlabel='Date' ylabel='Number of IP info popups' title="Daily IP info popups by page types" df_l=df_source[df_source.event_source=='special_log'][:-1] df_r=df_source[df_source.event_source=='special_recentchanges'][:-1] df_h=df_source[df_source.event_source=='action_history'][:-1] df_w=df_source[df_source.event_source=='special_watchlist'][:-1] fig = plt.figure(facecolor='w', figsize=(16, 6)) ax = fig.add_subplot(111) #fcst_t = df['y_m_d'].dt.to_pydatetime() ax.plot(df_l['y_m_d'].dt.to_pydatetime(), df_l['ipinfo_popup'], 'g-', label='special:log') ax.plot(df_r['y_m_d'].dt.to_pydatetime(), df_r['ipinfo_popup'],'b-', label='special:recent changes') ax.plot(df_h['y_m_d'].dt.to_pydatetime(), df_h['ipinfo_popup'], 'y-', label='action history') ax.plot(df_w['y_m_d'].dt.to_pydatetime(), df_w['ipinfo_popup'],'r-', label='special:watchlist') ax.grid(True, which='major', c='gray', ls='-', lw=1, alpha=0.2) ax.set_xlabel(xlabel, fontsize = 16, family='serif') ax.set_ylabel(ylabel, fontsize = 16, family='serif') ax.set_title(title,fontweight="bold",fontsize = 20, family='serif', y=1.08) ax.legend(loc='upper right') fig.tight_layout() # In[139]: df_sum=df_source.groupby('event_source').sum() # In[140]: df_sum # In[141]: df_source.ipinfo_popup.sum() # In[142]: df_sum['share_in_pct']=round(df_sum['ipinfo_popup']/df_source.ipinfo_popup.sum(),4) *100 # In[143]: df_sum # __Note__ # # Between 2022-05-27 and 2022-08-25, # - 51.57% popups are from action_history page. # - 24.75% popups are from special:recentchanges pages. # - 20.96% popups are from special:watchlist pages. # - Only 2.71% popups are from special:log pages. # # Do people understand the information being shown? # Is the "help" for the information accessed by users? # # In[148]: query_help = ''' SELECT TO_DATE(meta.dt) AS y_m_d , event_action, count(1) AS clicks FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action LIKE 'click_help%' AND event_context = 'infobox' AND normalized_host.project NOT IN ('test', 'test2') GROUP BY TO_DATE(meta.dt), event_action ORDER BY y_m_d LIMIT 100000 ''' # In[149]: df_help = hive.run(query_help) # In[152]: df_help.loc[:, 'y_m_d']=pd.to_datetime(df_help['y_m_d']).copy() df_help # In[153]: df_help # In[155]: xlabel='Date' ylabel='Number of help clicks' title="Daily help clicks by types of informatin" df_p=df_help[df_help.event_action=='click_help_proxy'][:-1] df_m=df_help[df_help.event_action=='click_help_connection_method'][:-1] df_o=df_help[df_help.event_action=='click_help_connection_owner'][:-1] fig = plt.figure(facecolor='w', figsize=(16, 6)) ax = fig.add_subplot(111) #fcst_t = df['y_m_d'].dt.to_pydatetime() ax.plot(df_p['y_m_d'].dt.to_pydatetime(), df_p['clicks'], 'g-', label='help of proxy') ax.plot(df_m['y_m_d'].dt.to_pydatetime(), df_m['clicks'],'b-', label='help of connection method') ax.plot(df_o['y_m_d'].dt.to_pydatetime(), df_o['clicks'], 'r-', label='help of connection owner') ax.grid(True, which='major', c='gray', ls='-', lw=1, alpha=0.2) ax.set_xlabel(xlabel, fontsize = 16, family='serif') ax.set_ylabel(ylabel, fontsize = 16, family='serif') ax.set_title(title,fontweight="bold",fontsize = 20, family='serif', y=1.08) ax.legend(loc='upper right') fig.tight_layout() # In[156]: df_help_sum=df_help.groupby('event_action').sum() # __Total clicks on help 'i'__ # In[ ]: df_help_sum['share_in_pct']=round(df_help_sum['clicks']/df_help.clicks.sum(),4) *100 # In[159]: df_help_sum # __Note__. # Between 2022-5-28 and 2022-8-26, # - Help of connection_owner is mostly clicked. 38.3% of chicks on help are for the info pf connection owner # - Connection method and proxy are equally popular. # - Help of ASN was not tracked in current instrumentation. To check with AHT team # # In[ ]: # # Who is the information shown most to? # This can help us assess if we can limit the exposure of the tool. We can measure the edit count bucket for users who are using this tool (while anonymizing their ID). The account age was not captured in data instrumentation. # ## IP info popup # In[179]: query_popup_edit_bucket = ''' SELECT user_edit_bucket, count(1) AS ipinfo_popup FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action ="open_popup" AND event_context ="page" AND normalized_host.project NOT IN ('test', 'test2') GROUP BY user_edit_bucket ORDER BY ipinfo_popup DESC LIMIT 100000 ''' # In[180]: df_popup_edit_bucket =hive.run(query_popup_edit_bucket) # In[181]: df_popup_edit_bucket # ## IP info accordion # In[165]: query_accordion_edit_bucket =''' SELECT user_edit_bucket, count(1) AS counts FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_context ="infobox" AND event_action IN ('collapse','expand') AND normalized_host.project NOT IN ('test', 'test2') GROUP BY user_edit_bucket ORDER BY counts DESC LIMIT 100000 ''' # In[166]: df_accordion_edit_bucket =hive.run(query_accordion_edit_bucket) # In[167]: df_accordion_edit_bucket # __Note:__ # # IP info is mostly used by experienced editors who had more than 1000 edits. # # Opt-in / opt-out # __Opt-in on user contributions page__ # #
# __Opt-in or opt-out on preference page__ # #
# ## Activation/Deactivation rate for the feature # # ### Activation rate for the feature # In[257]: query_enable = ''' SELECT event_source,event_action , count(1) AS events, min(meta.dt) AS start_date, max(meta.dt) AS end_date FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action IN ("init_disclaimer" ,"accept_disclaimer","close_disclaimer", "uncheck_iagree", "enable_ipinfo" ,"disable_ipinfo" ) AND normalized_host.project NOT IN ('test', 'test2') GROUP BY event_source,event_action ''' # In[258]: df_enable=hive.run(query_enable) # In[259]: df_enable # ### Activate rate # #### Definition1: number of times users clicked "enable IP info tool"/ number of times users see the disclaimer # In[261]: 2225/2620 # __Note__. # Activation rate is 84.9% # # Need to check with PM and engineer about what kind of events are recorded with key `event_source=special_preferences & event_action=enable_ipinfo` , from contribution page or from the preference page (above picture), or from both. If it's `both`, the activation rate using definition1 is over-counted and not accurate. # #### Definition2: number of times users accept disclaimer / number of times the disclaimer shown # In[262]: 523/2620 # __Note__ # Activation rate is 20% under definition2 # ### Deactivate rate # Definition: number of times users disabled "IP info tool" / number of times users enabled "IP info tool" # In[235]: 379/2391 # __Note__. # # Deactivate rate is 15.6% # ## How many times do people see the disclaimer? How many times do people accept the terms of the disclaimer? How many times do people churn away? # In[245]: query_disclaimer = ''' SELECT event_source,event_action , count(1) AS events, min(meta.dt) AS start_date, max(meta.dt) AS end_date FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND event_action IN ("init_disclaimer" ,"accept_disclaimer","close_disclaimer" ) AND normalized_host.project NOT IN ('test', 'test2') GROUP BY event_source,event_action ''' # In[211]: df_disclaimer =hive.run(query_disclaimer) # In[213]: df_disclaimer # __Acceptance rate__ # In[233]: #Accept rate 607/2756 # __Churn rate__ # In[234]: #churn rate 2194/2756 # accept_disclaimer + close_disclaimer # In[230]: 607+2194 # ? On special contribution page # # accept_disclaimer + close_disclaimer > init_disclaimer # __Note__ # # Between 2022-05-31 and 2022-09-01 # __On Special contribution page:__ # The disclaimer showed 2753 times. # The disclaimer was accepted 607 times, acceptance rate 22%. # 2194 times, people churn away by closing the disclaimer without any clicks, churn rate 80%. # # On special contribution page, accept_disclaimer + close_disclaimer > init_disclaimer. Possible reason:1)User might click accept disclaimer multiple times after one initialization ; 2) Current calculation is an estimation. For the users who accept disclaimer, if their initial events are beyond 90 days, the initial events are not available in schema any more. Same for accepting events. For the users who have seen the disclaimer and have not accepted yet, only initial_disclaimer events are available in the schema. We don't track user ID or session ID, therefore, we cannot connect the user's journey by any identity. (see below exploration) # # __On preference page:__ # The disclaimer was accepted 2387 times. # # To check with the PM and engineer about no initial events on the preference page. # # __Exploration__ # Below query doesn't return any events because meta.id is not session id. We don't record user ID or session id in schema. Therefore, we don’t have any identity to connect the user's journey. # In[254]: query_disclaimer_2 = ''' SELECT t1.meta.id, t1.meta.dt AS init_ds, t2.event_action, t2.meta.dt AS action_ds FROM event.mediawiki_ipinfo_interaction AS t1 LEFT JOIN event.mediawiki_ipinfo_interaction AS t2 ON t1.meta.id=t2.meta.id AND t1.normalized_host.project=t2.normalized_host.project AND t1.normalized_host.project_class=t2.normalized_host.project_class --AND t1.meta.dt < t2.meta.dt AND t1.year=t2.year WHERE t1.year=2022 AND t1.event_action="init_disclaimer" AND t2.event_action IN ("accept_disclaimer","close_disclaimer") AND t1.normalized_host.project NOT IN ('test', 'test2') ''' # In[255]: df_disclaimer_2=hive.run(query_disclaimer_2) # In[256]: df_disclaimer_2 # # Other interesting findings # ## How often the IP address was copied? # In[240]: query_copy= ''' SELECT event_action, event_context, event_source, count(1) AS events FROM event.mediawiki_ipinfo_interaction WHERE year=2022 AND normalized_host.project NOT IN ('test', 'test2') AND event_action='copy' AND event_context='ip_address' GROUP BY event_action, event_context, event_source ''' # In[241]: df_copy = hive.run(query_copy) # In[242]: df_copy # In[249]: df_copy['share_in_pct']=round(df_copy['events']/df_copy.events.sum(),4) *100 # In[250]: df_copy # __Note__. # # Bewteen 2022-05-31 and 2022-09-01 # # 84% of IP copy events are from special contributions pages. # 8.74% of IP copy events are from revision action history pages. # 5.23% of IP copy events are from special recentchanges pages. # 1.32% of IP copy events are from special watchlist pages. # Only 0.75% of IP copy events are from specal log pages. # In[ ]: