#!/usr/bin/env python # coding: utf-8 # # Infobox Counts from XML Dumps # - extract infobox data from dump files # - create and populate `population_infobox` table # - report infobox counts # In[1]: # basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data get_ipython().run_line_magic('run', "-i 'data-defaults.py'") # In[2]: WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2', 'enwiki-20190420-pages-articles-multistream.xml.bz2'] INFOBOX_REGEX = re.compile(r'\{ *infobox ', re.IGNORECASE) def count_infobox(entity, date): page_text = entity.revision.text._VALUE count = len(INFOBOX_REGEX.findall(page_text)) return Row(page_id=entity.id, infobox_count=count, dt=date) infobox_rdd = sc.emptyRDD() for file in WIKIPEDIA_XML_DUMPS: wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file) dump_date = re.search(r'.*(\d{8}).*',file).group(1) articles = wikipedia\ .filter("ns = '0'")\ .filter("redirect._title is null") \ .filter("revision.text._VALUE is not null") \ .filter("length(revision.text._VALUE) > 0") daily_counts = sqlContext.createDataFrame(articles.rdd.map(lambda entity: count_infobox(entity, dump_date))) infobox_rdd = infobox_rdd.union(daily_counts.rdd) infobox_merged = sqlContext.createDataFrame(infobox_rdd) infobox = infobox_merged.toPandas() # In[3]: infobox # In[4]: # write infobox counts data to a table for later use infobox_merged.createOrReplaceTempView("temp_infobox_count") sqlContext.sql("DROP TABLE IF EXISTS ryanmax.infobox_count") sqlContext.sql("CREATE TABLE ryanmax.infobox_count AS SELECT * FROM temp_infobox_count") # In[5]: infobox.query('infobox_count>25') # ### Rate of change between dump days # In[6]: # calculate rate of change q = """ SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date FROM ryanmax.infobox_count GROUP BY extract_date ORDER BY extract_date """ ibs = spark.sql(q) ibs_pd = sqlContext.createDataFrame(ibs.rdd).toPandas() print('Rate of change: ',(ibs_pd['pages_w_infobox'][1] - ibs_pd['pages_w_infobox'][0])/ibs_pd['pages_w_infobox'][0]) ibs_pd # In[7]: # write data for 2019-04-20 and report out link count ibs = spark.sql("SELECT * FROM ryanmax.infobox_count WHERE dt = '20190420'") ibs_df = sqlContext.createDataFrame(ibs.rdd) ibs_df.createOrReplaceTempView("temp_infobox") sqlContext.sql("DROP TABLE IF EXISTS ryanmax.population_infobox") sqlContext.sql("CREATE TABLE ryanmax.population_infobox AS SELECT * FROM temp_infobox") spark.sql("select COUNT(DISTINCT page_id) AS pages_w_infobox from ryanmax.population_infobox").show() # ## Infobox Counts # ### Count of pages with infoboxes in W pages with external links # In[8]: # Count of pages with infoboxes in W pages with external links w_infobox_query = """ SELECT COUNT(DISTINCT page_id) AS pages_w_infobox FROM ryanmax.population_infobox WHERE page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks) """ w_infobox = spark.sql(w_infobox_query) w_infobox.show() # ### Count of pages with infoboxes in WP:M pages with external links # In[9]: # Count of pages with infoboxes in WP:M pages with external links pm_infobox_query = """ SELECT COUNT(DISTINCT page_id) AS pages_w_infobox FROM ryanmax.population_infobox WHERE page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks) """ pm_infobox = spark.sql(pm_infobox_query) pm_infobox.show() # ### Number of infoboxes per page in W pages with external links # In[10]: # Number of infoboxes per page in W pages with external links w_infobox_cnts_query = """ SELECT MIN(infobox_count) as min, MAX(infobox_count) as max, CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count, PERCENTILE(infobox_count,0.5) AS median_infobox_count, STDDEV(infobox_count) as stddev_infobox_count, (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count FROM ryanmax.population_infobox WHERE page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks) """ w_infobox_cnts = spark.sql(w_infobox_cnts_query) w_infobox_cnts.show() # ### Number of infoboxes per page in WP:M pages with external links # In[11]: # Number of infoboxes per page in WP:M pages with external links pm_infobox_cnts_query = """ SELECT MIN(infobox_count) as min, MAX(infobox_count) as max, CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count, PERCENTILE(infobox_count,0.5) AS median_infobox_count, STDDEV(infobox_count) as stddev_infobox_count, (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count FROM ryanmax.population_infobox WHERE page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks) """ pm_infobox_cnts = spark.sql(pm_infobox_cnts_query) pm_infobox_cnts.show() # In[ ]: