#!/usr/bin/env python
# coding: utf-8

# # Section ID Count and Event Data
# - limited to WP:M pages
#
# Section ID data was not limited to top-level (H2) sections during data capture,
# requiring post-capture processing for section ID click data.
#
# Example of capture issue: https://en.wikipedia.org/wiki/Hepatitis#Signs_and_symptoms.
# Clicks on links under "Acute hepatitis" were captured with section_id
# Acute_hepatitis, not Signs_and_symptoms.
#
# Post-capture data augmentation: click event section_ids were mapped to parent
# H2 section headings. See [populate-section-table.ipynb](populate-section-table.ipynb)
# for extraction details.

# In[1]:


# basic defaults, including study dates, common SQL exclusions and parquet files
# for anonymized data (defines spark, event_exclusion_sql, start_date_string,
# end_date_string, days_in_study used below)
get_ipython().run_line_magic('run', "-i 'data-defaults.py'")


# ## Raw section data from captured events
# - Raw total count of events (by all event types) for each section ID for WP:M pages only.
# - Limited to >= 3000 events.
# - shows the extent of the capture issue described above

# In[2]:


# "raw" section data from captured events to show extent of the capture issue
# described above
pm_section_events_raw_query = """
SELECT section_id, action, count(*) count
FROM citationusage
WHERE page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
AND wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY section_id, action
ORDER BY count desc
LIMIT 100
"""

pm_section_events_raw = spark.sql(
    pm_section_events_raw_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_raw_pandas = pm_section_events_raw.toPandas()


# In[3]:


section_pda_raw = pm_section_events_raw_pandas.copy()
# replace 'NaN' section_id with 'missing'
# (explicit column assignment instead of attribute-access + inplace fillna:
# the chained/inplace form triggers chained-assignment warnings and does not
# reliably write back under pandas copy-on-write)
section_pda_raw['section_id'] = section_pda_raw['section_id'].fillna('-- missing --')
# limit to more than 3000 events (the original comment said "1K or more",
# which contradicted this filter and the section header above)
df_filtered_raw = section_pda_raw.query('count>3000').copy()
# set precision before pivot
df_filtered_raw['count'] = df_filtered_raw['count'].map(lambda x: '{0:.0f}'.format(x))
df_filtered_raw.pivot(index='section_id', columns='action', values='count')


# ## Count of WP:M pages by top-level (H2) section ID

# In[4]:


# count of top-level (H2) section IDs for WP:M pages only
pm_sections_query = """
SELECT section_h2, count(distinct page_id) count
FROM ryanmax.population_wpm_sections
GROUP BY section_h2
ORDER BY count desc, section_h2
"""

pm_sections = spark.sql(pm_sections_query)
pm_sections.toPandas().head(50)


# ## Total count of events (all types) for each top-level (H2) section ID for WP:M pages only
# - Limited to sections with more than 1K fnClick events
# - **missing** values are largely because section IDs were not recorded "if the section is the Main Section" as per [Schema:CitationUsage](https://meta.wikimedia.org/wiki/Schema:CitationUsage).

# In[5]:


# Total count of events (by all event types) for each top-level (H2) section ID
# for WP:M pages only
pm_section_events_query = """
SELECT population_wpm_sections.section_h2, action, count(*) count, count(*)/{} AS daily_average
FROM citationusage
LEFT JOIN ryanmax.population_wpm_sections
    ON population_wpm_sections.page_id = citationusage.page_id
    AND population_wpm_sections.section_id = citationusage.section_id
WHERE wiki = 'enwiki'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action
ORDER BY count desc
"""

pm_section_events = spark.sql(
    pm_section_events_query.format(
        days_in_study, event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_pandas = pm_section_events.toPandas()


# In[6]:


section_pda = pm_section_events_pandas.copy()
# replace 'NaN' section_h2 with 'missing' (assignment form; see note in In[3])
section_pda['section_h2'] = section_pda['section_h2'].fillna('-- missing --')
pivot = section_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')
# remember the surviving section headings so the daily-average view below
# can be restricted to the same rows
section_h2s = pivot.index
pivot


# #### Daily averages of above

# In[7]:


section_pda.pivot(index='section_h2', columns='action', values='daily_average').loc[section_h2s]


# In[8]:


# count of events by event type and access mode for each top-level (H2) section ID
# for WP:M pages only
pm_section_events_mode_query = """
SELECT population_wpm_sections.section_h2, action, mode, count(*) count
FROM citationusage
LEFT JOIN ryanmax.population_wpm_sections
    ON population_wpm_sections.page_id = citationusage.page_id
    AND population_wpm_sections.section_id = citationusage.section_id
WHERE wiki = 'enwiki'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action, mode
ORDER BY count desc
"""

pm_section_events_mode = spark.sql(
    pm_section_events_mode_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_mode_pandas = pm_section_events_mode.toPandas()


# ## Desktop event counts for each top-level (H2) section ID for WP:M pages only
# - limits above apply here as well

# In[9]:


desktop_pda = pm_section_events_mode_pandas.query('mode == "desktop"').copy()
# replace 'NaN' section_h2 with 'missing' (assignment form; see note in In[3])
desktop_pda['section_h2'] = desktop_pda['section_h2'].fillna('-- missing --')
desktop_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')


# ## Mobile event counts for each top-level (H2) section ID for WP:M pages only
# - limits above apply here as well

# In[10]:


mobile_pda = pm_section_events_mode_pandas.query('mode == "mobile"').copy()
# replace 'NaN' section_h2 with 'missing' (assignment form; see note in In[3])
mobile_pda['section_h2'] = mobile_pda['section_h2'].fillna('-- missing --')
mobile_pda.pivot(index='section_h2', columns='action', values='count').query('fnClick > 1000')


# ## Total event counts for Adverse_effects section H2s for WP:M pages
# - 20 pages with most events

# In[11]:


# event counts for pages with Adverse_effects section H2s
adverse_events_query = """
SELECT citationusage.page_id, population_page_titles_20190420.page_title, count(*) AS event_count
FROM citationusage
LEFT JOIN ryanmax.population_wpm_sections
    ON population_wpm_sections.page_id = citationusage.page_id
    AND population_wpm_sections.section_id = citationusage.section_id
LEFT JOIN ryanmax.population_page_titles_20190420
    ON population_page_titles_20190420.page_id = citationusage.page_id
WHERE wiki = 'enwiki'
AND population_wpm_sections.section_h2 = 'Adverse_effects'
AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(citationusage.event_time) >= '{}'
AND to_date(citationusage.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY citationusage.page_id, population_page_titles_20190420.page_title
ORDER BY event_count desc
"""

adverse_events = spark.sql(
    adverse_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
adverse_events_pandas = adverse_events.toPandas()


# In[12]:


print('total events for Adverse_effects: ', adverse_events_pandas['event_count'].sum())
adverse_events_pandas.head(20)