#!/usr/bin/env python
# coding: utf-8

# # Event Data: Infobox and Section

# In[1]:


# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
get_ipython().run_line_magic('run', "-i 'data-defaults.py'")


# ## InfoBox and Main section events for WP:M pages with external links

# In[2]:


# Total count of events (by all event types) in InfoBoxes for WP:M pages only
# Total count of events (by all event types) in the Main section for WP:M pages only
pm_infobox_events_query = """
SELECT in_infobox, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
GROUP BY in_infobox, action
ORDER BY in_infobox, action
"""

pm_infobox_events = spark.sql(
    pm_infobox_events_query.format(
        days_in_study, event_exclusion_sql, start_date_string, end_date_string
    ))
pm_infobox_events.toPandas()


# ### InfoBox and Main section events for W pages with external links

# In[3]:


# Total count of events (by all event types) in InfoBoxes for W pages
# Total count of events (by all event types) in the Main section for W pages
w_infobox_events_query = """
SELECT in_infobox, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
GROUP BY in_infobox, action
ORDER BY in_infobox, action
"""

w_infobox_events = spark.sql(
    w_infobox_events_query.format(
        days_in_study, event_exclusion_sql, start_date_string, end_date_string
    ))
w_infobox_events.toPandas()


# ### Infobox clicks that occurred under a section heading (e.g. External links)
# - no page limits
# 

# In[4]:


# Infobox clicks that occurred under a section heading (e.g. External links)
# no limits
infobox_section_events_query = """
SELECT section_id, action, count(*) AS total_event_count, count(*)/{} AS daily_average_event_count
FROM
    citationusage
WHERE
    wiki = 'enwiki'
    AND in_infobox = TRUE
    {}
    AND useragent_is_bot = FALSE
GROUP BY section_id, action
ORDER BY total_event_count desc
"""

infobox_section_events = spark.sql(infobox_section_events_query.format(days_in_study,event_exclusion_sql))
infobox_section_events.show()


# ### Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only
#  - where the event also occurred in an InfoBox

# In[5]:


# Total count of events (by all event types) for each top-level (H2) section ID for WP:M pages only
# where the event also occurred in an InfoBox
pm_section_events_query = """
SELECT population_wpm_sections.section_h2, action, count(*) count
FROM 
    citationusage
    LEFT JOIN ryanmax.population_wpm_sections 
        ON 
        population_wpm_sections.page_id = citationusage.page_id 
        AND population_wpm_sections.section_id = citationusage.section_id
WHERE
    wiki = 'enwiki'
    AND in_infobox = TRUE
    AND citationusage.page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.population_wpm_pages_with_extlinks
        )
    {}
    AND to_date(citationusage.event_time) >= '{}'
    AND to_date(citationusage.event_time) <= '{}'
    AND useragent_is_bot = FALSE
GROUP BY population_wpm_sections.section_h2, action
ORDER BY count desc
"""

pm_section_events = spark.sql(
    pm_section_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
pm_section_events_rdd = pm_section_events.rdd
pm_section_events_df = sqlContext.createDataFrame(pm_section_events_rdd)
pm_section_events_pandas = pm_section_events_df.toPandas()


# ### Count of Infobox events (by all event types) occurring under each top-level (H2) section ID
# ** Limits: WP:M pages and >= 5 events **

# In[6]:


section_pda = pm_section_events_pandas.copy()
# replace 'NaN' section_h2 with 'missing'
section_pda.section_h2.fillna(value='-- Infobox event outside of a section --', inplace=True)
# limit to counts of 1K or more
section_pda['count'] = section_pda['count'].astype(int)
df_filtered = section_pda.query('count>=5').copy()
# set precision before pivot
df_filtered['count'] = df_filtered['count'].map(lambda x: '{0:.0f}'.format(x))
df_filtered.pivot(index='section_h2', columns='action', values='count')


# In[ ]: