#Import some necessary programming libraries that we'll use for the analysis
import pandas as pd
from ggplot import *

#And some housekeeping
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)

#See what data files we have available
!ls data/iw_parkingMeterData/

#I'm going to start by just looking at data from the period Dec 2012 to March 2013.
#Read in a single spreadsheet and tidy it
df=pd.read_excel("data/iw_parkingMeterData/4_5_Transaction Report RR Dec 2012 March 2013.xls",skiprows=6)
#We need to clean the data a little, dropping empty columns, identifying timestamps as such
df.dropna(how='all',axis=1,inplace=True)
df.Date=pd.to_datetime(df.Date,  format="%Y-%m-%d %H:%M:%S",coerce=True)
df.dropna(subset=["Date"],inplace=True)

#So what does the data look like?
df[:5]

#What are the separate tariff bands?
df['Description.1'].unique()

#It's possibly easier to work with the Tariff code, so what code applies to which description?
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
dfx=df[["Tariff","Description.1"]].copy()
dfx.rename(columns=lambda x: x.replace('.','_'), inplace=True)
q="SELECT DISTINCT Tariff, Description_1 FROM dfx"
pysqldf(q)

#We can use this information to generate a mapping from the description or tariff to the tariff price
#[Really should automate the extraction of the amount from the description]
tariffMap={'01A':0.6, '01B':1,'01C':1.9, '01D':3.4,'01E':4.5,'01F':6.6,'01G':3,'01H':10}

df["Tariff_val"]=df['Tariff'].apply(lambda x: tariffMap[x])
df[:3]

#How much cash was taken over this period in total?
df[['Cash']].sum()

#If people paid exactly the tariff price, how much would have been taken?
df[['Tariff_val']].sum()

#So for this one car park, over four off season months, how much was overpaid?
round(float(df[['Cash']].sum())-float(df[['Tariff_val']].sum()),2)

#How much cash was taken over this period for each machine?
df[['Machine','Cash']].groupby('Machine').sum()

#How much cash was taken over this period for each machine and tariff?
df[['Machine','Tariff','Cash']].groupby(['Machine','Tariff']).sum()

#So how many tickets of each tariff type were issued by each machine?
df[["Tariff","Machine"]].groupby(['Tariff',"Machine"]).agg(len).sort_index()

#Can you show me that graphically?
p = ggplot(aes(x='Tariff'), data=df)
p + geom_bar() + ggtitle("Number of Tickets per Tariff")  + labs("Tariff Code", "Count") + facet_wrap('Machine',scales='fixed')

#Here's the same question asked another way
p = ggplot(aes(x='Tariff',fill="Machine"), data=df)
p + geom_bar() + ggtitle("Number of Tickets per Tariff")  + labs("Tariff Code", "Count")
#Ideally these bars would be "dodged" - placed side-by-side, but the charting library doesn't support that at the moment

#We can get a designation different time components as follows
# /via http://pandas-docs.github.io/pandas-docs-travis/timeseries.html
# minute: the minutes of the datetime
# weekday OR dayofweek: the day of the week with Monday=0, Sunday=6
# week: the week ordinal of the year
df['weekday']=df['Date'].apply(lambda x: x.dayofweek)
# hour: the hour of the datetime
df['hour']=df['Date'].apply(lambda x: x.hour)
#Let's just check that's worked:
df[:3]

ggplot(df, aes(x='factor(weekday)'))+geom_bar()

# this, or similar, should be supported at some point? +scale_x_discrete(labels=["Mon","Tues","Weds","Thurs","Fri","Sat","Sun"])

#How many transactions occured by hour of day?
ggplot(df, aes(x='hour'))+geom_bar(binwidth=1)

#Can we split that up to see whether it's different across days of the week?
ggplot(df, aes(x='hour'))+geom_bar(binwidth=1)+facet_wrap('weekday',scales='fixed')

#Can we probe that distribution a little further, perhaps seeing how the hourly counts are made up from different tariff counts?
ggplot(df, aes(x='hour',fill='Tariff'))+geom_bar(binwidth=1)+facet_wrap('weekday',scales='fixed')

#Let's try to dig into that a little more. For a given day of the week, how do the tariff bands get used over the day?
ggplot(df[df['weekday']==2], aes(x='hour'))+geom_bar(binwidth=1)+facet_grid('Tariff')+ggtitle('Wednesday')

#Let's see what activity for Saturday looks like:
ggplot(df[df['weekday']==5], aes(x='hour'))+geom_bar(binwidth=1)+facet_grid('Tariff')+ggtitle('Saturday')

#Let's try to look over all the data to see how the tariff bands compare by hour of day
ggplot(df[(df['Tariff']!='01H') & (df['Tariff']!='01G') ], aes(x='hour'))+geom_bar(binwidth=1)+facet_wrap('Tariff',scales='fixed')

#Let's plot a count of cash payments using bins that ar 5 pence wide
p = ggplot(aes(x='Cash'), data=df)
p + geom_histogram(binwidth=0.05) 

#The Overpayment column is a boolean that specifies whether there was an overpayment or not
df["Overpayment"]=(df["Cash"]!=df["Tariff_val"])
#The OverpaymentVal identifies how much, if anything, was overpaid
df["OverpaymentVal"]=df["Cash"]-df["Tariff_val"]

df[1220:1223]

#So how common are overpayents by tariff type?
df[["Tariff","Overpayment"]].groupby(['Tariff',"Overpayment"]).agg(len)

#Let's order the data by timestamp, then add up the cumulative revenue
df.sort(['Date'],inplace=True)
df['Cash_cumul'] = df.Cash.cumsum()
df[:3]

#How does it look?
g = ggplot(aes(x="Date",y="Cash_cumul"), data=df )+ geom_line()
g

#We can also calculate the accumulated amount within each tariff band

#Let's group=df[['Tariff','Cash']].groupby('Tariff')
#For group of rows, apply the transformation to each row in the group
#The number of rows in the response will be the same as the number of rows in the original data frame
df['Cash_cumul2']=group.transform(cumsum)['Cash']

#Here's how it looks:
ggplot(df,aes(x="Date",y="Cash_cumul2",colour="Tariff"))+geom_line()

#We can also split the amounts out into separate charts
ggplot(df, aes(x="Date",y="Cash_cumul2")) + geom_line() \
                                   + ggtitle("Payments made over time") \
                                   + labs("Transaction Date", "Transaction amount (£)") \
                                   + facet_wrap("Tariff",scales = "fixed")

dfx=pd.DataFrame()
for fn in ['4_10_River Road Transaction Report April 2012.xls', 
           '4_11_Transaction Report RR Aug 2012.xls',
           '4_5_Transaction Report RR Dec 2012 March 2013.xls',
           '4_6_Transaction Report RR July 2012.xls',
           '4_7_Transaction Report RR June 2012.xls',
           '4_8_Transaction Report RR May 2012.xls',
           '4_9_Transaction Report RR Sept Nov 2012.xls']:
    dfx=pd.concat([dfx,pd.read_excel('data/iw_parkingMeterData/'+fn,skiprows=6)])
dfx.dropna(how='all',axis=1,inplace=True)
dfx.Date=pd.to_datetime(dfx.Date,  format="%Y-%m-%d %H:%M:%S",coerce=True)
dfx.dropna(subset=["Date"],inplace=True)

dfx['weekday']=dfx['Date'].apply(lambda x: x.dayofweek)
ggplot(dfx, aes(x='factor(weekday)'))+geom_bar()
#0-Mon 6-Sun

dfx['month']=dfx['Date'].apply(lambda x: x.month)

dfx['week']=dfx['Date'].apply(lambda x: x.week)

dfx['hour']=dfx['Date'].apply(lambda x: x.hour)

#How much activity is there by week of year? Note: the data was collected from a financial year,
#so the scale actually runs Jan-Mar 13, then Apr-Dec 12.
ggplot(dfx, aes(x='week'))+geom_bar()+facet_wrap('Tariff',scales='free_y')

#Is there any evidence of folk paying just what they need as it gets closer to free parking time at 18.00?
ggplot(dfx[(dfx['Tariff']=='01A') | (dfx['Tariff']=='01B')|(dfx['Tariff']=='01C')], aes(x='hour')) \
    +geom_bar(binwidth=1)+facet_wrap('Tariff',scales='free_y')
#Note that the bin widths are 1 hour wide for doing setting the bars.

tariffMap2={'01A':0.6, '01B':1,'01C':1.9, '01D':3.4,'01E':4.5,'01F':6.6,'01G':3,'01H':10,'01I':13,
            '02A':0.6, '02B':1,'02C':1.9, '02D':3.4,'02E':4.5,'02F':6.6,}

dfx["Tariff_val"]=dfx['Tariff'].apply(lambda x: tariffMap2[x])

#Set a boolean to say whether or not a line item was an overpayment
dfx["Overpayment"]=(dfx["Cash"]!=dfx["Tariff_val"])

#Calculate amount of overpayment (if any) for each transaction
dfx["OverpaymentVal"]=dfx["Cash"]-dfx["Tariff_val"]

#What's the total amount of overpayment?
dfx["OverpaymentVal"].sum()

#How much was overpaid at the 01C/ £1.90 tariff level?
dfx[dfx['Tariff']=='01C']["OverpaymentVal"].sum()
#Note - I think 02C is the same level and there were also overpayments at that level.

#Total revenue over the year:
dfx["Cash"].sum()

#How many people paid £2 on the 01C tariff?
dfx[(dfx['Tariff']=='01C') & (dfx['Cash']==2)]["OverpaymentVal"].count()

dfx['OverpaymentValRounded']=dfx['OverpaymentVal'].apply(lambda  x: round(x,2))

#This crosstab counts the occurrences of one column value or index value with respect to another
#So we can get count of the number of overpayments of a particular size by Tariff
pd.crosstab(dfx['OverpaymentValRounded'],dfx['Tariff'], margins=True)