#!/usr/bin/env python # coding: utf-8 # # WP:M Page Classes/Categories # In[1]: # basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data get_ipython().run_line_magic('run', "-i 'data-defaults.py'") # pandas float formatting pd.options.display.float_format = '{:.4f}'.format # ## Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type # In[2]: # Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type pm_category_events_query = """ SELECT projmed_categories.category, action, count(*) count FROM citationusage, (SELECT DISTINCT page_id, category FROM ryanmax.projmed_categories WHERE projmed_categories.category LIKE '%Class_medicine_articles%' AND to_date(projmed_categories.dt) = '2019-04-20' ) AS projmed_categories WHERE citationusage.page_id = projmed_categories.page_id AND wiki = 'enwiki' AND citationusage.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks ) {} AND to_date(event_time) >= '{}' AND to_date(event_time) <= '{}' AND useragent_is_bot = FALSE GROUP BY projmed_categories.category, action ORDER BY projmed_categories.category, action """ pm_category_events = spark.sql( pm_category_events_query.format( event_exclusion_sql, start_date_string, end_date_string )) pm_category_events_rdd = pm_category_events.rdd pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd) pm_category_events_pandas = pm_category_events_df.toPandas() pm_category_events_pandas.pivot(index='category', columns='action', values='count') # ## Daily average event counts for each WP:M class (FA, A, GA, B, C, Start, Stub) # ### Totals # In[28]: pm_category_events_pandas.groupby(['category']).agg( [ ('total_events','sum'), ('daily_average', lambda x: sum(x)/days_in_study) ]) # ### By event type # In[3]: pm_category_events_pandas['average'] = pm_category_events_pandas['count'].map(lambda x: x/days_in_study) pm_category_events_pandas.pivot(index='category', columns='action', values='average') # ## Count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub) # **numbers will not match [1] because we're limiting to namespace 0 pages with external links** # # [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics # In[4]: # count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub) # numbers will not match [1] because we're limiting to namespace 0 pages with external links # [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics pm_category_pages = """ SELECT category, COUNT(DISTINCT page_id) AS pages_w_links FROM ryanmax.projmed_categories WHERE category LIKE '%Class_medicine_articles%' AND to_date(projmed_categories.dt) = '2019-04-20' AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks ) GROUP BY category ORDER BY COUNT(*) DESC """ pm_cat_counts = spark.sql(pm_category_pages) cats = sqlContext.createDataFrame(pm_cat_counts.rdd) cats.toPandas() # ## Pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub) # **numbers will be higher than overall WP:M pageloads since one page may have more than one category** # In[5]: # pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub) # numbers will be higher than overall WP:M pageloads since one page may have more than one category pm_category_pageloads_query = """ SELECT projmed_categories.category, sum(view_count) AS total_pageloads, sum(view_count)/{} AS daily_average FROM wmf.pageview_hourly, (SELECT DISTINCT page_id, category FROM ryanmax.projmed_categories WHERE projmed_categories.category LIKE '%Class_medicine_articles%' AND to_date(projmed_categories.dt) = '2019-04-20' ) AS projmed_categories WHERE pageview_hourly.page_id = projmed_categories.page_id AND pageview_hourly.project = 'en.wikipedia' AND pageview_hourly.agent_type = 'user' AND pageview_hourly.page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks ) AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}' AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}' GROUP BY projmed_categories.category ORDER BY projmed_categories.category """ pm_category_pageloads = spark.sql( pm_category_pageloads_query.format( days_in_study, start_date_string, end_date_string )) pm_category_pageloads.toPandas() # ## Count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub) # In[10]: # count of external links for each WP:M class (FA, A, GA, B, C, Start, Stub) pm_category_links_query = """ SELECT projmed_categories.category, COUNT(DISTINCT el_from, el_to) AS num_extlinks, COUNT(DISTINCT el_from, el_to)/COUNT(DISTINCT el_from) as avg_extlinks_per_page FROM ryanmax.population_externallinks, (SELECT DISTINCT page_id, category FROM ryanmax.projmed_categories WHERE projmed_categories.category LIKE '%Class_medicine_articles%' AND to_date(projmed_categories.dt) = '2019-04-20' ) AS projmed_categories WHERE population_externallinks.el_from = projmed_categories.page_id AND population_externallinks.el_from IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks) GROUP BY projmed_categories.category ORDER BY projmed_categories.category """ pm_category_links = spark.sql(pm_category_links_query) pm_category_links.toPandas() # In[ ]: