#!/usr/bin/env python # coding: utf-8 # # Event Data: Infobox and Section # In[1]: # basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data get_ipython().run_line_magic('run', "-i 'data-defaults.py'") # ## InfoBox and Main section events for WP:M pages with external links # In[2]: # Total count of events (by all event types) in InfoBoxes for WP:M pages only # Total count of events (by all event types) in the Main section for WP:M pages only pm_infobox_events_query = """ SELECT in_infobox, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count FROM citationusage WHERE wiki = 'enwiki' AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks) {} AND to_date(event_time) >= '{}' AND to_date(event_time) <= '{}' AND useragent_is_bot = FALSE GROUP BY in_infobox, action ORDER BY in_infobox, action """ pm_infobox_events = spark.sql( pm_infobox_events_query.format( days_in_study, event_exclusion_sql, start_date_string, end_date_string )) pm_infobox_events.toPandas() # ### InfoBox and Main section events for W pages with external links # In[3]: # Total count of events (by all event types) in InfoBoxes for W pages # Total count of events (by all event types) in the Main section for W pages w_infobox_events_query = """ SELECT in_infobox, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count FROM citationusage WHERE wiki = 'enwiki' AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks) {} AND to_date(event_time) >= '{}' AND to_date(event_time) <= '{}' AND useragent_is_bot = FALSE GROUP BY in_infobox, action ORDER BY in_infobox, action """ w_infobox_events = spark.sql( w_infobox_events_query.format( days_in_study, event_exclusion_sql, start_date_string, end_date_string )) w_infobox_events.toPandas() # ### Infobox clicks that occurred under a section heading (e.g. External links) # - no page limits # # In[4]: # Infobox clicks that occurred under a section heading (e.g. External links) # no limits infobox_section_events_query = """ SELECT section_id, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count FROM citationusage WHERE wiki = 'enwiki' AND in_infobox = TRUE {} AND useragent_is_bot = FALSE GROUP BY section_id, action ORDER BY total_event_count desc """ infobox_section_events = spark.sql(infobox_section_events_query.format(days_in_study,event_exclusion_sql)) infobox_section_events.show() # ### Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only # - where the event also occurred in an InfoBox # In[5]: # Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only # where the event also occurred in an InfoBox pm_section_events_query = """ SELECT population_wpm_sections.section_h2, action, count(*) count FROM citationusage LEFT JOIN ryanmax.population_wpm_sections ON population_wpm_sections.page_id = citationusage.page_id AND population_wpm_sections.section_id = citationusage.section_id WHERE wiki = 'enwiki' AND in_infobox = TRUE AND citationusage.page_id IN (SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks ) {} AND to_date(citationusage.event_time) >= '{}' AND to_date(citationusage.event_time) <= '{}' AND useragent_is_bot = FALSE GROUP BY population_wpm_sections.section_h2, action ORDER BY count desc """ pm_section_events = spark.sql( pm_section_events_query.format( event_exclusion_sql, start_date_string, end_date_string )) pm_section_events_rdd = pm_section_events.rdd pm_section_events_df = sqlContext.createDataFrame(pm_section_events_rdd) pm_section_events_pandas = pm_section_events_df.toPandas() # ### Count of Infobox events (by all event types) occurring under each top-level (H2) section ID # ** Limits: WP:M pages and >= 5 events ** # In[6]: section_pda = pm_section_events_pandas.copy() # replace 'NaN' section_h2 with 'missing' section_pda.section_h2.fillna(value='-- Infobox event outside of a section --', inplace=True) # limit to counts of 1K or more section_pda['count'] = section_pda['count'].astype(int) df_filtered = section_pda.query('count>=5').copy() # set precision before pivot df_filtered['count'] = df_filtered['count'].map(lambda x: '{0:.0f}'.format(x)) df_filtered.pivot(index='section_h2', columns='action', values='count') # In[ ]: