#!/usr/bin/env python
# coding: utf-8

# # Section ID Count and Event Data
# - limited to WP:M pages
#
# Section ID data was not limited to top-level (H2) sections during data capture,
# requiring post-capture processing for section ID click data.
#
# Example of capture issue: https://en.wikipedia.org/wiki/Hepatitis#Signs_and_symptoms.
# Clicks on links under "Acute hepatitis" were captured with section_id
# Acute_hepatitis, not Signs_and_symptoms.
#
# Post-capture data augmentation: click event section_ids were mapped to parent
# H2 section headings. See [populate-section-table.ipynb](populate-section-table.ipynb)
# for extraction details.

# In[1]:


# basic defaults, including study dates, common SQL exclusions and parquet files
# for anonymized data (defines spark, event_exclusion_sql, start_date_string,
# end_date_string, days_in_study used below)
get_ipython().run_line_magic('run', "-i 'data-defaults.py'")


# ## Raw section data from captured events
# - Raw total count of events (by all event types) for each section ID for WP:M pages only.
# - Limited to >= 3000 events.
# - shows the extent of the capture issue described above

# In[2]:


# "raw" section data from captured events to show extent of the capture issue
# described above
pm_section_events_raw_query = """
SELECT section_id, action, count(*) count
FROM citationusage
WHERE page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
AND wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY section_id, action
ORDER BY count desc
LIMIT 100
"""

pm_section_events_raw = spark.sql(
    pm_section_events_raw_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_raw_pandas = pm_section_events_raw.toPandas()


# In[3]:


section_pda_raw = pm_section_events_raw_pandas.copy()
# replace 'NaN' section_id with 'missing'
# (explicit column assignment instead of attribute-access + inplace fillna:
# the chained/inplace form triggers chained-assignment warnings and does not
# reliably write back under pandas copy-on-write)
section_pda_raw['section_id'] = section_pda_raw['section_id'].fillna('-- missing --')
# limit to more than 3000 events (the original comment said "1K or more",
# which contradicted this filter and the section header above)
df_filtered_raw = section_pda_raw.query('count>3000').copy()
# set precision before pivot
df_filtered_raw['count'] = df_filtered_raw['count'].map(lambda x: '{0:.0f}'.format(x))
df_filtered_raw.pivot(index='section_id', columns='action', values='count')


# ## Count of WP:M pages by top-level (H2) section ID

# In[4]:


# count of top-level (H2) section IDs for WP:M pages only
pm_sections_query = """
SELECT section_h2, count(distinct page_id) count
FROM ryanmax.population_wpm_sections
GROUP BY section_h2
ORDER BY count desc, section_h2
"""

pm_sections = spark.sql(pm_sections_query)
pm_sections.toPandas().head(50)


# ## Total count of events (all types) for each top-level (H2) section ID for WP:M pages only
# - Limited to sections with more than 1K fnClick events
# - **missing** values are largely because section IDs were not recorded "if the section is the Main Section" as per [Schema:CitationUsage](https://meta.wikimedia.org/wiki/Schema:CitationUsage).

# In[5]:


# Total count of events (by all event types) for each top-level (H2) section ID
# for WP:M pages only
pm_section_events_query = """
SELECT population_wpm_sections.section_h2, action, count(*) count, count(*)/{} AS daily_average
FROM citationusage
LEFT JOIN ryanmax.population_wpm_sections
    ON population_wpm_sections.page_id = citationusage.page_id
    AND population_wpm_sections.section_id = citationusage.section_id
WHERE wiki = 'enwiki'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action
ORDER BY count desc
"""

pm_section_events = spark.sql(
    pm_section_events_query.format(
        days_in_study, event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_pandas = pm_section_events.toPandas()


# In[6]:


section_pda = pm_section_events_pandas.copy()
# replace 'NaN' section_h2 with 'missing' (assignment form; see note in In[3])
section_pda['section_h2'] = section_pda['section_h2'].fillna('-- missing --')
pivot = section_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')
# remember the surviving section headings so the daily-average view below
# can be restricted to the same rows
section_h2s = pivot.index
pivot


# #### Daily averages of above

# In[7]:


section_pda.pivot(index='section_h2', columns='action', values='daily_average').loc[section_h2s]


# In[8]:


# count of events by event type and access mode for each top-level (H2) section ID
# for WP:M pages only
pm_section_events_mode_query = """
SELECT population_wpm_sections.section_h2, action, mode, count(*) count
FROM citationusage
LEFT JOIN ryanmax.population_wpm_sections
    ON population_wpm_sections.page_id = citationusage.page_id
    AND population_wpm_sections.section_id = citationusage.section_id
WHERE wiki = 'enwiki'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action, mode
ORDER BY count desc
"""

pm_section_events_mode = spark.sql(
    pm_section_events_mode_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_mode_pandas = pm_section_events_mode.toPandas()


# ## Desktop event counts for each top-level (H2) section ID for WP:M pages only
# - limits above apply here as well

# In[9]:


desktop_pda = pm_section_events_mode_pandas.query('mode == "desktop"').copy()
# replace 'NaN' section_h2 with 'missing' (assignment form; see note in In[3])
desktop_pda['section_h2'] = desktop_pda['section_h2'].fillna('-- missing --')
desktop_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')


# ## Mobile event counts for each top-level (H2) section ID for WP:M pages only
# - limits above apply here as well

# In[10]:


mobile_pda = pm_section_events_mode_pandas.query('mode == "mobile"').copy()
# replace 'NaN' section_h2 with 'missing' (assignment form; see note in In[3])
mobile_pda['section_h2'] = mobile_pda['section_h2'].fillna('-- missing --')
mobile_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')


# ## Total event counts for Adverse_effects section H2s for WP:M pages
# - 20 pages with most events

# In[11]:


# event counts for pages with Adverse_effects section H2s
adverse_events_query = """
SELECT citationusage.page_id, population_page_titles_20190420.page_title, count(*) AS event_count
FROM citationusage
LEFT JOIN ryanmax.population_wpm_sections
    ON population_wpm_sections.page_id = citationusage.page_id
    AND population_wpm_sections.section_id = citationusage.section_id
LEFT JOIN ryanmax.population_page_titles_20190420
    ON population_page_titles_20190420.page_id = citationusage.page_id
WHERE wiki = 'enwiki'
AND population_wpm_sections.section_h2 = 'Adverse_effects'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY citationusage.page_id, population_page_titles_20190420.page_title
ORDER BY event_count desc
"""

adverse_events = spark.sql(
    adverse_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
adverse_events_pandas = adverse_events.toPandas()


# In[12]:


print('total events for Adverse_effects: ', adverse_events_pandas['event_count'].sum())
adverse_events_pandas.head(20)