#!/usr/bin/env python # coding: utf-8 # # Apache Drill - Hansard Demo # # Download and install Apache Drill. # Start Apache Drill in the Apache Drill directory: `bin/drill-embedded` # # Tweak the settings as per [Querying Large CSV Files With Apache Drill](https://blog.ouseful.info/2017/06/03/querying-large-csv-files-with-apache-drill/) so you can query against column names. # In[ ]: #Download data file get_ipython().system('wget -P /Users/ajh59/Documents/parlidata/ https://zenodo.org/record/579712/files/senti_post_v2.csv') # In[114]: #Install some dependencies get_ipython().system('pip3 install pydrill') get_ipython().system('pip3 install pandas') get_ipython().system('pip3 install matplotlib') # In[139]: #Import necessary packages import pandas as pd from pydrill.client import PyDrill #Set the notebooks up for inline plotting get_ipython().run_line_magic('matplotlib', 'inline') # In[6]: #Get a connection to the Apache Drill server drill = PyDrill(host='localhost', port=8047) # ## Make things faster # # We can get a speed up on querying the CSV file by converting it to the `parquet` format. # # In the Apache Drill terminal, run something like the following (change the path to the CSV file as required): # # ``CREATE TABLE dfs.tmp.`/senti_post_v2.parquet` AS SELECT * FROM dfs.`/Users/ajh59/Documents/parlidata/senti_post_v2.csv`;`` # # (Running the command from the notebook suffers a timeout?) # In[10]: #Test the setup drill.query(''' SELECT * from dfs.tmp.`/senti_post_v2.parquet` LIMIT 3''').to_dataframe() # The Hansard data gives the date of each speech but not the session. To search for speeches within a particular session, we need the session dates. We can get these from the Parliament data API. 
# In[45]:

#Get Parliament session dates from Parliament API
psd=pd.read_csv('http://lda.data.parliament.uk/sessions.csv?_view=Sessions&_pageSize=50')
psd

# In[58]:

def getParliamentDate(session):
    """Return the (start date, end date) pair for a named Parliament session.

    session: a session 'display name' as published by the Parliament API,
             e.g. '2015-2016'.
    Returns: (start, end) date strings taken from the psd lookup table.
    Raises IndexError if the session name is not found in psd.
    """
    #Filter the session table once and read both dates from the matching row
    #(the original filtered psd twice, once per column)
    row=psd[psd['display name']==session].iloc[0]
    return row['start date'], row['end date']

getParliamentDate('2015-2016')

# In[140]:

#Check the columns in the Hansard dataset, along with example values
df=drill.query(''' SELECT * from dfs.tmp.`/senti_post_v2.parquet` LIMIT 1''').to_dataframe()
print(df.columns.tolist())
df.iloc[0]

# In[17]:

# Example of count of speeches by person in the dataset as a whole
q=''' SELECT proper_name, COUNT(*) AS number FROM dfs.tmp.`/senti_post_v2.parquet` GROUP BY proper_name '''
df=drill.query(q).to_dataframe()
df.head()

# In[35]:

# Example of count of speeches by gender in the dataset as a whole
q="SELECT gender, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` GROUP BY gender"
drill.query(q).to_dataframe()

# In[71]:

#Query within session
session='2015-2016'
start,end=getParliamentDate(session)
q=''' SELECT '{session}' AS session, gender, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY gender '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()

# In[85]:

#Count number of speeches per person
start,end=getParliamentDate(session)
q=''' SELECT '{session}' AS session, gender, mnis_id, count(*) AS `Number of Speeches` FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe().head()

# In[143]:

# Example of finding the average number of speeches per person by gender in a particular session
q=''' SELECT AVG(gcount) AS average, gender, session FROM (SELECT '{session}' AS session, gender, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender) GROUP BY gender, session '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()
#Note - the average is returned as a string not a numeric

# In[129]:

#We can package that query up in a Python function
def avBySession(session):
    """Average number of speeches per person, by gender, for one session.

    session: a session 'display name', e.g. '2015-2016'.
    Returns: a dataframe with columns average (float), gender, session.
    """
    start,end=getParliamentDate(session)
    q='''SELECT AVG(gcount) AS average, gender, session FROM (SELECT '{session}' AS session, gender, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender) GROUP BY gender, session '''.format(session=session, start=start, end=end)
    dq=drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average']=dq['average'].astype(float)
    return dq

avBySession(session)

# In[146]:

#Loop through sessions and create a dataframe containing gender based averages for each one
#Fix: collect the per-session frames in a list and concatenate once - calling
#pd.concat inside the loop recopies the accumulated frame every iteration (quadratic).
#A for loop (not a comprehension) is kept so the module-level `session` name is
#rebound, as the original code did; later cells reuse it.
frames=[]
for session in psd['display name']:
    frames.append(avBySession(session))
overall=pd.concat(frames) if frames else pd.DataFrame()

#Tidy up the index
overall=overall.reset_index(drop=True)
overall.head()

# The data is currently in a long (tidy) format. To make it easier to plot, we can reshape it (unmelt it) by casting it into a wide format, with one row per session and the gender averages arranged by column.

# In[147]:

#Reshape the dataset
overall_wide = overall.pivot(index='session', columns='gender')
#Flatten the column names
overall_wide.columns = overall_wide.columns.get_level_values(1)
overall_wide

# Now we can plot the data - the session axis should sort in an appropriate way (alphanumerically).

# In[137]:

overall_wide.plot(kind='barh');

# In[138]:

overall_wide.plot();

# We can generalise the approach to look at a count split by party.

# In[150]:

# Example of finding the average number of speeches per person by party in a particular session
# Simply tweak the query we used for gender...
q=''' SELECT AVG(gcount) AS average, party, session FROM (SELECT '{session}' AS session, party, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, party) GROUP BY party, session '''.format(session=session, start=start, end=end)
drill.query(q).to_dataframe()

# Make a function out of that, as we did before.

# In[157]:

def avByType(session,typ):
    """Average number of speeches per person, grouped by an arbitrary column.

    session: a session 'display name', e.g. '2015-2016'.
    typ:     the name of a single grouping column in the Hansard table,
             e.g. 'party' or 'party_group'.
    Returns: a dataframe with columns average (float), <typ>, session.
    """
    start,end=getParliamentDate(session)
    #NOTE(review): typ is interpolated straight into the SQL text - only pass
    #trusted, code-supplied column names here, never user input
    q='''SELECT AVG(gcount) AS average, {typ}, session FROM (SELECT '{session}' AS session, {typ}, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, {typ}) GROUP BY {typ}, session '''.format(session=session, start=start, end=end, typ=typ)
    dq=drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average']=dq['average'].astype(float)
    return dq

def avByParty(session):
    """Convenience wrapper: per-person speech averages grouped by party."""
    return avByType(session,'party')

avByParty(session)

# In[172]:

# Create a function to loop through sessions and create a dataframe containing specified averages for each one
# Note that this just generalises and packages up the code we had previously

def pivotAndFlatten(overall,typ):
    """Pivot long-format averages to one row per session, one column per typ value."""
    #Tidy up the index
    overall=overall.reset_index(drop=True)
    overall_wide = overall.pivot(index='session', columns=typ)
    #Flatten the column names
    overall_wide.columns = overall_wide.columns.get_level_values(1)
    return overall_wide

def getOverall(typ):
    """Collect per-session averages for grouping column typ across all sessions.

    Fix: build a list of frames and concatenate once - pd.concat inside the
    loop recopies the accumulated frame each iteration (quadratic).
    """
    frames=[avByType(session,typ) for session in psd['display name']]
    overall=pd.concat(frames) if frames else pd.DataFrame()
    return pivotAndFlatten(overall,typ)

overallParty=getOverall('party')
overallParty.head()

# In[236]:

#Note that the function means it's now just as easy to query on another single column
getOverall('party_group')

# In[166]:

overallParty.plot(kind='barh', figsize=(20,20));

# In[169]:

parties=['Conservative','Labour']
overallParty[parties].plot();

# We can write another query to look by gender and party.
# In[175]:

def avByGenderAndParty(session):
    """Average number of speeches per person, by gender AND party, for one session.

    session: a session 'display name', e.g. '2015-2016'.
    Returns: a dataframe with columns average (float), gender, party, session.
    """
    start,end=getParliamentDate(session)
    q='''SELECT AVG(gcount) AS average, gender, party, session FROM (SELECT '{session}' AS session, gender, party, mnis_id, count(*) AS gcount FROM dfs.tmp.`/senti_post_v2.parquet` WHERE speech_date>='{start}' AND speech_date<='{end}' GROUP BY mnis_id, gender, party) GROUP BY gender, party, session '''.format(session=session, start=start, end=end)
    dq=drill.query(q).to_dataframe()
    #Make the average a numeric type... (Drill returns it as a string)
    dq['average']=dq['average'].astype(float)
    return dq

gp=avByGenderAndParty(session)
gp

# In[206]:

#Fix: accumulate per-session frames and concatenate once - pd.concat inside the
#loop recopies the accumulated frame every iteration (quadratic). The for loop
#is kept so the module-level `session` name is rebound, as before.
frames=[]
for session in psd['display name']:
    frames.append(avByGenderAndParty(session))
gp_overall=pd.concat(frames) if frames else pd.DataFrame()

#Pivot table is more robust than pivot - missing entries handled with NA
#Also limit what parties we are interested in
gp_wide = gp_overall[gp_overall['party'].isin(parties)].pivot_table(index='session', columns=['party','gender'])
#Flatten column names
gp_wide.columns = gp_wide.columns.droplevel(0)
gp_wide

# In[208]:

gp_wide.plot(figsize=(20,10));

# In[209]:

gp_wide.plot(kind='barh', figsize=(20,10));

# ## Automating insight...
#
# We can automate some of the observations we might want to make, such as years when M speak more, on average, than F, within a party.
# In[225]:

# Go back to the full dataset, not filtered by party
gp_wide = gp_overall.pivot_table(index='session', columns=['party','gender'])
#Flatten column names
gp_wide.columns = gp_wide.columns.droplevel(0)
gp_wide.head()

# In[232]:

#Melt back to long form then pivot so each (session, party) row carries the
#gender averages side by side
sp_wide = gp_wide.reset_index().melt(id_vars=['session']).pivot_table(index=['session','party'], columns=['gender'])
#Flatten column names
sp_wide.columns = sp_wide.columns.droplevel(0)
sp_wide#.dropna(how='all')

# In[235]:

#Sessions when F spoke more, on average, than M
#Recall, this data has been previously filtered to limit data to Con and Lab

#Tweak the precision of the display
#Fix: the bare 'precision' option alias was deprecated and removed in pandas 2.0;
#the fully qualified option name works on both old and new pandas
pd.set_option('display.precision', 3)
sp_wide[sp_wide['Female'].fillna(0) > sp_wide['Male'].fillna(0) ]

# In[ ]: