#!/usr/bin/env python # coding: utf-8 # # Apache Drill - Hansard Demo # # Download and install Apache Drill. # Start Apache Drill in the Apache Drill directory: `bin/drill-embedded` # # Tweak the settings as per [Querying Large CSV Files With Apache Drill](https://blog.ouseful.info/2017/06/03/querying-large-csv-files-with-apache-drill/) so you can query against column names. # In[ ]: #Download data file get_ipython().system('wget -P /Users/ajh59/Documents/parlidata/ https://zenodo.org/record/579712/files/senti_post_v2.csv') # In[114]: #Install some dependencies get_ipython().system('pip3 install pydrill') get_ipython().system('pip3 install pandas') get_ipython().system('pip3 install matplotlib') # In[139]: #Import necessary packages import pandas as pd from pydrill.client import PyDrill #Set the notebooks up for inline plotting get_ipython().run_line_magic('matplotlib', 'inline') # In[6]: #Get a connection to the Apache Drill server drill = PyDrill(host='localhost', port=8047) # ## Make things faster # # We can get a speed up on querying the CSV file by converting it to the `parquet` format. # # In the Apache Drill terminal, run something like the following (change the path to the CSV file as required): # # ``CREATE TABLE dfs.tmp.`/senti_post_v2.parquet` AS SELECT * FROM dfs.`/Users/ajh59/Documents/parlidata/senti_post_v2.csv`;`` # # (Running the command from the notebook suffers a timeout?) # In[10]: #Test the setup drill.query(''' SELECT * from dfs.tmp.`/senti_post_v2.parquet` LIMIT 3''').to_dataframe() # The Hansard data gives the date of each speech but not the session. To search for speeches within a particular session, we need the session dates. We can get these from the Parliament data API. 
# In[45]:

#Get Parliament session dates from Parliament API
psd=pd.read_csv('http://lda.data.parliament.uk/sessions.csv?_view=Sessions&_pageSize=50')
psd

# In[58]:

def getParliamentDate(session):
    """Return the (start date, end date) pair for a named Parliament session.

    session: a session 'display name' as published by the Parliament API,
             e.g. '2015-2016'.
    Returns: (start, end) date strings taken from the psd lookup table.
    Raises IndexError if the session name is not found in psd.
    """
    #Filter the session table once and read both dates from the matching row
    #(the original filtered psd twice, once per column)
    row=psd[psd['display name']==session].iloc[0]
    return row['start date'], row['end date']

getParliamentDate('2015-2016')

# In[140]:

#Check the columns in the Hansard dataset, along with example values
df=drill.query(''' SELECT * from dfs.tmp.`/senti_post_v2.parquet` LIMIT 1''').to_dataframe()
print(df.columns.tolist())
df.iloc[0]

# In[17]:

# Example of count of speeches by person in the dataset as a whole
q=''' SELECT proper_name, COUNT(*) AS number FROM dfs.tmp.`/senti_post_v2.parquet` GROUP BY proper_name '''
df=drill.query(q).to_dataframe()
df.head()

# In[35]:

# Example of count of speeches by gender in the dataset as a whole
q="SELECT gender, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` GROUP BY gender"
drill.query(q).to_dataframe()

# In[71]:

#Query within session
session='2015-2016'
start,end=getParliamentDate(session)
q=''' SELECT '{session}' AS session, gender, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY gender '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()

# In[85]:

#Count number of speeches per person
start,end=getParliamentDate(session)
q=''' SELECT '{session}' AS session, gender, mnis_id, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe().head()

# In[143]:

# Example of finding the average number of speeches per person by gender in a particular session
q=''' SELECT AVG(gcount) AS average, gender, session FROM (SELECT '{session}' AS session, gender, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender) GROUP BY gender, session '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()
#Note - the average is returned as a string not a numeric

# In[129]:

#We can package that query up in a Python function
def avBySession(session):
    """Average number of speeches per person, by gender, for one session.

    session: a session 'display name', e.g. '2015-2016'.
    Returns: a dataframe with columns average (float), gender, session.
    """
    start,end=getParliamentDate(session)
    q='''SELECT AVG(gcount) AS average, gender, session FROM (SELECT '{session}' AS session, gender, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender) GROUP BY gender, session '''.format(session=session, start=start, end=end)
    dq=drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average']=dq['average'].astype(float)
    return dq

avBySession(session)

# In[146]:

#Loop through sessions and create a dataframe containing gender based averages for each one
#Fix: collect the per-session frames in a list and concatenate once - calling
#pd.concat inside the loop recopies the accumulated frame every iteration (quadratic).
#A for loop (not a comprehension) is kept so the module-level `session` name is
#rebound, as the original code did; later cells reuse it.
frames=[]
for session in psd['display name']:
    frames.append(avBySession(session))
overall=pd.concat(frames) if frames else pd.DataFrame()

#Tidy up the index
overall=overall.reset_index(drop=True)
overall.head()

# The data is currently in a long (tidy) format. To make it easier to plot, we can reshape it (unmelt it) by casting it into a wide format, with one row per session and the gender averages arranged by column.

# In[147]:

#Reshape the dataset
overall_wide = overall.pivot(index='session', columns='gender')
#Flatten the column names
overall_wide.columns = overall_wide.columns.get_level_values(1)
overall_wide

# Now we can plot the data - the session axis should sort in an appropriate way (alphanumerically).

# In[137]:

overall_wide.plot(kind='barh');

# In[138]:

overall_wide.plot();

# We can generalise the approach to look at a count split by party.

# In[150]:

# Example of finding the average number of speeches per person by party in a particular session
# Simply tweak the query we used for gender...
q=''' SELECT AVG(gcount) AS average, party, session FROM (SELECT '{session}' AS session, party, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, party) GROUP BY party, session '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()

# Make a function out of that, as we did before.

# In[157]:

def avByType(session,typ):
    """Average number of speeches per person, grouped by an arbitrary column.

    session: a session 'display name', e.g. '2015-2016'.
    typ:     the name of a single grouping column in the Hansard table,
             e.g. 'party' or 'party_group'.
    Returns: a dataframe with columns average (float), <typ>, session.
    """
    start,end=getParliamentDate(session)
    #NOTE(review): typ is interpolated straight into the SQL text - only pass
    #trusted, code-supplied column names here, never user input
    q='''SELECT AVG(gcount) AS average, {typ}, session FROM (SELECT '{session}' AS session, {typ}, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, {typ}) GROUP BY {typ}, session '''.format(session=session, start=start, end=end, typ=typ)
    dq=drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average']=dq['average'].astype(float)
    return dq

def avByParty(session):
    """Convenience wrapper: per-person speech averages grouped by party."""
    return avByType(session,'party')

avByParty(session)

# In[172]:

# Create a function to loop through sessions and create a dataframe containing specified averages for each one
# Note that this just generalises and packages up the code we had previously

def pivotAndFlatten(overall,typ):
    """Pivot long-format averages to one row per session, one column per typ value."""
    #Tidy up the index
    overall=overall.reset_index(drop=True)
    overall_wide = overall.pivot(index='session', columns=typ)
    #Flatten the column names
    overall_wide.columns = overall_wide.columns.get_level_values(1)
    return overall_wide

def getOverall(typ):
    """Collect per-session averages for grouping column typ across all sessions.

    Fix: build a list of frames and concatenate once - pd.concat inside the
    loop recopies the accumulated frame each iteration (quadratic).
    """
    frames=[avByType(session,typ) for session in psd['display name']]
    overall=pd.concat(frames) if frames else pd.DataFrame()
    return pivotAndFlatten(overall,typ)

overallParty=getOverall('party')
overallParty.head()

# In[236]:

#Note that the function means it's now just as easy to query on another single column
getOverall('party_group')

# In[166]:

overallParty.plot(kind='barh', figsize=(20,20));

# In[169]:

parties=['Conservative','Labour']
overallParty[parties].plot();

# We can write another query to look by gender and party.
# In[175]:

def avByGenderAndParty(session):
    """Average number of speeches per person, by gender AND party, for one session.

    session: a session 'display name', e.g. '2015-2016'.
    Returns: a dataframe with columns average (float), gender, party, session.
    """
    start,end=getParliamentDate(session)
    q='''SELECT AVG(gcount) AS average, gender, party, session FROM (SELECT '{session}' AS session, gender, party, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender, party) GROUP BY gender, party, session '''.format(session=session, start=start, end=end)
    dq=drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average']=dq['average'].astype(float)
    return dq

gp=avByGenderAndParty(session)
gp

# In[206]:

#Fix: accumulate per-session frames and concatenate once - pd.concat inside the
#loop recopies the accumulated frame every iteration (quadratic). The for loop
#is kept so the module-level `session` name is rebound, as before.
frames=[]
for session in psd['display name']:
    frames.append(avByGenderAndParty(session))
gp_overall=pd.concat(frames) if frames else pd.DataFrame()

#Pivot table is more robust than pivot - missing entries handled with NA
#Also limit what parties we are interested in
gp_wide = gp_overall[gp_overall['party'].isin(parties)].pivot_table(index='session', columns=['party','gender'])
#Flatten column names
gp_wide.columns = gp_wide.columns.droplevel(0)
gp_wide

# In[208]:

gp_wide.plot(figsize=(20,10));

# In[209]:

gp_wide.plot(kind='barh', figsize=(20,10));

# ## Automating insight...
#
# We can automate some of the observations we might want to make, such as years when M speak more, on average, than F, within a party.
# In[225]:

# Go back to the full dataset, not filtered by party
gp_wide = gp_overall.pivot_table(index='session', columns=['party','gender'])
#Flatten column names
gp_wide.columns = gp_wide.columns.droplevel(0)
gp_wide.head()

# In[232]:

#Melt back to long form then pivot so each (session, party) row carries the
#gender averages side by side
sp_wide = gp_wide.reset_index().melt(id_vars=['session']).pivot_table(index=['session','party'], columns=['gender'])
#Flatten column names
sp_wide.columns = sp_wide.columns.droplevel(0)
sp_wide#.dropna(how='all')

# In[235]:

#Sessions when F spoke more, on average, than M
#Recall, this data has been previously filtered to limit data to Con and Lab

#Tweak the precision of the display
#Fix: the bare 'precision' option alias was deprecated and removed in pandas 2.0;
#the fully qualified option name works on both old and new pandas
pd.set_option('display.precision', 3)
sp_wide[sp_wide['Female'].fillna(0) > sp_wide['Male'].fillna(0) ]

# In[ ]: