#Import some necessary programming libraries that we'll use for the analysis import pandas as pd from ggplot import * #And some housekeeping import warnings warnings.simplefilter(action = "ignore", category = FutureWarning) #See what data files we have available !ls data/iw_parkingMeterData/ #I'm going to start by just looking at data from the period Dec 2012 to March 2013. #Read in a single spreadsheet and tidy it df=pd.read_excel("data/iw_parkingMeterData/4_5_Transaction Report RR Dec 2012 March 2013.xls",skiprows=6) #We need to clean the data a little, dropping empty columns, identifying timestamps as such df.dropna(how='all',axis=1,inplace=True) df.Date=pd.to_datetime(df.Date, format="%Y-%m-%d %H:%M:%S",coerce=True) df.dropna(subset=["Date"],inplace=True) #So what does the data look like? df[:5] #What are the separate tariff bands? df['Description.1'].unique() #It's possibly easier to work with the Tariff code, so what code applies to which description? from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) dfx=df[["Tariff","Description.1"]].copy() dfx.rename(columns=lambda x: x.replace('.','_'), inplace=True) q="SELECT DISTINCT Tariff, Description_1 FROM dfx" pysqldf(q) #We can use this information to generate a mapping from the description or tariff to the tariff price #[Really should automate the extraction of the amount from the description] tariffMap={'01A':0.6, '01B':1,'01C':1.9, '01D':3.4,'01E':4.5,'01F':6.6,'01G':3,'01H':10} df["Tariff_val"]=df['Tariff'].apply(lambda x: tariffMap[x]) df[:3] #How much cash was taken over this period in total? df[['Cash']].sum() #If people paid exactly the tariff price, how much would have been taken? df[['Tariff_val']].sum() #So for this one car park, over four off season months, how much was overpaid? round(float(df[['Cash']].sum())-float(df[['Tariff_val']].sum()),2) #How much cash was taken over this period for each machine? df[['Machine','Cash']].groupby('Machine').sum() #How much cash was taken over this period for each machine and tariff? df[['Machine','Tariff','Cash']].groupby(['Machine','Tariff']).sum() #So how many tickets of each tariff type were issued by each machine? df[["Tariff","Machine"]].groupby(['Tariff',"Machine"]).agg(len).sort_index() #Can you show me that graphically? p = ggplot(aes(x='Tariff'), data=df) p + geom_bar() + ggtitle("Number of Tickets per Tariff") + labs("Tariff Code", "Count") + facet_wrap('Machine',scales='fixed') #Here's the same question asked another way p = ggplot(aes(x='Tariff',fill="Machine"), data=df) p + geom_bar() + ggtitle("Number of Tickets per Tariff") + labs("Tariff Code", "Count") #Ideally these bars would be "dodged" - placed side-by-side, but the charting library doesn't support that at the moment #We can get a designation different time components as follows # /via http://pandas-docs.github.io/pandas-docs-travis/timeseries.html # minute: the minutes of the datetime # weekday OR dayofweek: the day of the week with Monday=0, Sunday=6 # week: the week ordinal of the year df['weekday']=df['Date'].apply(lambda x: x.dayofweek) # hour: the hour of the datetime df['hour']=df['Date'].apply(lambda x: x.hour) #Let's just check that's worked: df[:3] ggplot(df, aes(x='factor(weekday)'))+geom_bar() # this, or similar, should be supported at some point? +scale_x_discrete(labels=["Mon","Tues","Weds","Thurs","Fri","Sat","Sun"]) #How many transactions occured by hour of day? ggplot(df, aes(x='hour'))+geom_bar(binwidth=1) #Can we split that up to see whether it's different across days of the week? ggplot(df, aes(x='hour'))+geom_bar(binwidth=1)+facet_wrap('weekday',scales='fixed') #Can we probe that distribution a little further, perhaps seeing how the hourly counts are made up from different tariff counts? ggplot(df, aes(x='hour',fill='Tariff'))+geom_bar(binwidth=1)+facet_wrap('weekday',scales='fixed') #Let's try to dig into that a little more. For a given day of the week, how do the tariff bands get used over the day? ggplot(df[df['weekday']==2], aes(x='hour'))+geom_bar(binwidth=1)+facet_grid('Tariff')+ggtitle('Wednesday') #Let's see what activity for Saturday looks like: ggplot(df[df['weekday']==5], aes(x='hour'))+geom_bar(binwidth=1)+facet_grid('Tariff')+ggtitle('Saturday') #Let's try to look over all the data to see how the tariff bands compare by hour of day ggplot(df[(df['Tariff']!='01H') & (df['Tariff']!='01G') ], aes(x='hour'))+geom_bar(binwidth=1)+facet_wrap('Tariff',scales='fixed') #Let's plot a count of cash payments using bins that ar 5 pence wide p = ggplot(aes(x='Cash'), data=df) p + geom_histogram(binwidth=0.05) #The Overpayment column is a boolean that specifies whether there was an overpayment or not df["Overpayment"]=(df["Cash"]!=df["Tariff_val"]) #The OverpaymentVal identifies how much, if anything, was overpaid df["OverpaymentVal"]=df["Cash"]-df["Tariff_val"] df[1220:1223] #So how common are overpayents by tariff type? df[["Tariff","Overpayment"]].groupby(['Tariff',"Overpayment"]).agg(len) #Let's order the data by timestamp, then add up the cumulative revenue df.sort(['Date'],inplace=True) df['Cash_cumul'] = df.Cash.cumsum() df[:3] #How does it look? g = ggplot(aes(x="Date",y="Cash_cumul"), data=df )+ geom_line() g #We can also calculate the accumulated amount within each tariff band #Let's group=df[['Tariff','Cash']].groupby('Tariff') #For group of rows, apply the transformation to each row in the group #The number of rows in the response will be the same as the number of rows in the original data frame df['Cash_cumul2']=group.transform(cumsum)['Cash'] #Here's how it looks: ggplot(df,aes(x="Date",y="Cash_cumul2",colour="Tariff"))+geom_line() #We can also split the amounts out into separate charts ggplot(df, aes(x="Date",y="Cash_cumul2")) + geom_line() \ + ggtitle("Payments made over time") \ + labs("Transaction Date", "Transaction amount (£)") \ + facet_wrap("Tariff",scales = "fixed") dfx=pd.DataFrame() for fn in ['4_10_River Road Transaction Report April 2012.xls', '4_11_Transaction Report RR Aug 2012.xls', '4_5_Transaction Report RR Dec 2012 March 2013.xls', '4_6_Transaction Report RR July 2012.xls', '4_7_Transaction Report RR June 2012.xls', '4_8_Transaction Report RR May 2012.xls', '4_9_Transaction Report RR Sept Nov 2012.xls']: dfx=pd.concat([dfx,pd.read_excel('data/iw_parkingMeterData/'+fn,skiprows=6)]) dfx.dropna(how='all',axis=1,inplace=True) dfx.Date=pd.to_datetime(dfx.Date, format="%Y-%m-%d %H:%M:%S",coerce=True) dfx.dropna(subset=["Date"],inplace=True) dfx['weekday']=dfx['Date'].apply(lambda x: x.dayofweek) ggplot(dfx, aes(x='factor(weekday)'))+geom_bar() #0-Mon 6-Sun dfx['month']=dfx['Date'].apply(lambda x: x.month) dfx['week']=dfx['Date'].apply(lambda x: x.week) dfx['hour']=dfx['Date'].apply(lambda x: x.hour) #How much activity is there by week of year? Note: the data was collected from a financial year, #so the scale actually runs Jan-Mar 13, then Apr-Dec 12. ggplot(dfx, aes(x='week'))+geom_bar()+facet_wrap('Tariff',scales='free_y') #Is there any evidence of folk paying just what they need as it gets closer to free parking time at 18.00? ggplot(dfx[(dfx['Tariff']=='01A') | (dfx['Tariff']=='01B')|(dfx['Tariff']=='01C')], aes(x='hour')) \ +geom_bar(binwidth=1)+facet_wrap('Tariff',scales='free_y') #Note that the bin widths are 1 hour wide for doing setting the bars. tariffMap2={'01A':0.6, '01B':1,'01C':1.9, '01D':3.4,'01E':4.5,'01F':6.6,'01G':3,'01H':10,'01I':13, '02A':0.6, '02B':1,'02C':1.9, '02D':3.4,'02E':4.5,'02F':6.6,} dfx["Tariff_val"]=dfx['Tariff'].apply(lambda x: tariffMap2[x]) #Set a boolean to say whether or not a line item was an overpayment dfx["Overpayment"]=(dfx["Cash"]!=dfx["Tariff_val"]) #Calculate amount of overpayment (if any) for each transaction dfx["OverpaymentVal"]=dfx["Cash"]-dfx["Tariff_val"] #What's the total amount of overpayment? dfx["OverpaymentVal"].sum() #How much was overpaid at the 01C/ £1.90 tariff level? dfx[dfx['Tariff']=='01C']["OverpaymentVal"].sum() #Note - I think 02C is the same level and there were also overpayments at that level. #Total revenue over the year: dfx["Cash"].sum() #How many people paid £2 on the 01C tariff? dfx[(dfx['Tariff']=='01C') & (dfx['Cash']==2)]["OverpaymentVal"].count() dfx['OverpaymentValRounded']=dfx['OverpaymentVal'].apply(lambda x: round(x,2)) #This crosstab counts the occurrences of one column value or index value with respect to another #So we can get count of the number of overpayments of a particular size by Tariff pd.crosstab(dfx['OverpaymentValRounded'],dfx['Tariff'], margins=True)