# Harvest and clean Kirklees council spending-transaction data.
# (Originally a Jupyter/IPython notebook; shell magics converted to os calls.)
#
# NOTE(review): `BeautifulSoup` and `harvester` are used but not defined in
# this file — presumably provided by an earlier notebook cell. TODO confirm.

import os

import requests

# Routine to detect file encoding
# derived from /usr/local/bin/chardetect.py
from chardet.universaldetector import UniversalDetector

# Routines to identify filetype
# https://pypi.python.org/pypi/filemagic/1.6
import magic

import pandas as pd

# http://niemeyer.net/python-dateutil
from dateutil.parser import parse


def linkgrabber_kirklees(response):
    '''
    Download spending transaction CSV files from the Kirklees website.

    response: a requests.Response for the expenditure-data page.
    Returns a list of (absolute_url, filename) tuples, one per CSV link.
    '''
    # Explicit parser keeps bs4 from guessing (and emitting a warning)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get the list of results
    resultsblock = soup.find('div', {'id': 'pageContent'}).find('section')
    # For each result, parse out the dataset URL and its filename
    # regex via http://stackoverflow.com/a/13794740/454773
    results = []
    for result in resultsblock.findAll('a', {'class': 'csvLink'}):
        href = result.attrs['href']
        results.append(('http://www.kirklees.gov.uk' + href,
                        href.split('/')[-1]))
    return results


# --- Download all the spending files into ./kirkleesdata --------------------
os.makedirs('kirkleesdata', exist_ok=True)  # was IPython magic: !mkdir kirkleesdata
url = 'http://www.kirklees.gov.uk/you-kmc/information/expenditureData.aspx'
html2 = requests.get(url)
cc = linkgrabber_kirklees(html2)
harvester(cc, 'kirkleesdata')  # harvester() defined in an earlier cell


def sniffer(fname):
    '''
    Try to detect the file encoding for a text (CSV) file.

    Returns (encoding, confidence) or None if chardet could not decide.
    '''
    detector = UniversalDetector()
    # 'rb' because chardet wants raw bytes; with-block guarantees the file
    # is closed (the original leaked the handle)
    with open(fname, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:  # stop early once chardet is confident
                break
    detector.close()
    result = detector.result
    if result['encoding']:
        return (result['encoding'], result['confidence'])
    return None


# for fname in os.listdir('kirkleesdata'):
#     if fname.endswith('csv'):
#         print(fname, sniffer('kirkleesdata/' + fname))

os.makedirs('kirkleescleandata', exist_ok=True)  # was: !mkdir kirkleescleandata

# MIME types the council publishes spreadsheet (Excel) files under
_EXCEL_TYPES = ('application/vnd.ms-office',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

# Columns that must be kept as strings (codes with leading zeros etc.)
_STR_COLS = {'Unique Transaction Id': object, 'Classification Code': object,
             'Expense Type': object, 'Vat Number': object}

# Standard column ordering, used to patch files published with no header row
_COLUMN_NAMES = ['Body', 'Body Name', 'Payment Date', 'Unique Transaction Id',
                 'Invoice Number', 'Amount Excluding Vat', 'Payee',
                 'Supplier Ref', 'Vat Number', 'Expense Area',
                 'Expense Area Description', 'Expense Type',
                 'Expense Type Description', 'Classification Code',
                 'Classification Description', 'Extended Information',
                 'BVACOP Code', 'BVACOP Description']

df2 = pd.DataFrame()


def reader(fname, typ, enc):
    '''
    Load a data file from kirkleesdata/ as a pandas DataFrame, trying to
    parse the 'Payment Date' column as dates.

    fname: bare filename inside kirkleesdata/
    typ:   MIME type as reported by libmagic
    enc:   character encoding (CSV files only)
    Returns the DataFrame, or None if the file could not be parsed
    (the original raised UnboundLocalError on any failure).
    '''
    dfx = None
    if typ == 'text/plain':
        try:
            # parse_dates takes a list of column names (the original passed
            # a bare string)
            dfx = pd.read_csv('kirkleesdata/' + fname,
                              parse_dates=['Payment Date'],
                              infer_datetime_format=True, dayfirst=True,
                              dtype=_STR_COLS, encoding=enc)
        except Exception as e:
            print('Error', fname, typ, e)
    elif typ in _EXCEL_TYPES:
        try:
            dfx = pd.read_excel('kirkleesdata/' + fname,
                                parse_dates=['Payment Date'],
                                infer_datetime_format=True, dayfirst=True)
        except Exception as e:
            print('Error', fname, typ, e)
    return dfx


def patchreader(fname, typ, enc):
    '''
    Re-read a file whose column names are missing, adding them in under the
    assumption that the standard column order is used.

    Same parameters and return convention as reader().
    '''
    dfx = None
    if typ == 'text/plain':
        try:
            # NOTE(review): windows-1252 is hardwired here (not `enc`) in the
            # original — presumably the sniffed encoding misfires on these
            # headerless files; kept as-is to preserve behaviour.
            dfx = pd.read_csv('kirkleesdata/' + fname,
                              names=_COLUMN_NAMES, header=None,
                              parse_dates=['Payment Date'],
                              infer_datetime_format=True, dayfirst=True,
                              dtype=_STR_COLS, encoding='windows-1252')
        except Exception as e:
            print('Error', fname, typ, e)
    elif typ in _EXCEL_TYPES:
        try:
            dfx = pd.read_excel('kirkleesdata/' + fname,
                                names=_COLUMN_NAMES, header=None,
                                parse_dates=['Payment Date'],
                                infer_datetime_format=True, dayfirst=True)
        except Exception as e:
            print('Error', fname, typ, e)
    return dfx


def loader(fname, typ, enc=None):
    '''
    Try to read a data file in; cast dates and amounts not already
    identified as such to proper types.
    '''
    dfx = reader(fname, typ, enc)
    # Headerless files come back without a 'Body' column; re-read them with
    # the standard column names supplied explicitly
    if 'Body' not in dfx.columns:
        dfx = patchreader(fname, typ, enc)
    # If we haven't converted to date type for some reason, use dateutil's
    # parse() for now... (original compared against `dtype('O')` with
    # `dtype` undefined — a NameError)
    if dfx.dtypes['Payment Date'] == object:
        dfx['Payment Date'] = dfx['Payment Date'].apply(parse)
    # Some expenditure columns may not be recognised as numbers because of
    # pound signs or commas used to separate thousands
    if dfx.dtypes['Amount Excluding Vat'] == object:
        dfx['Amount Excluding Vat'] = (dfx['Amount Excluding Vat']
                                       .str.replace('£', '')
                                       .str.replace(',', '')
                                       .astype(float))
    return dfx


# --- Detect the type of each downloaded CSV, load it, clean it, and write ---
# --- the cleaned copy into ./kirkleescleandata ------------------------------
for fname in os.listdir('kirkleesdata/'):
    # if a file is a CSV file, process it
    if fname.endswith('csv') and fname != 'alldata.csv':
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            mt = m.id_filename('kirkleesdata/' + fname)
        enc = None
        if mt == 'text/plain':
            # sniffer() may return None; the original crashed unpacking it
            sniffed = sniffer('kirkleesdata/' + fname)
            if sniffed:
                enc, conf = sniffed
        dfx = loader(fname, mt, enc)
        print(fname, '[', mt, enc, ']')
        # Trim stray whitespace from every string column
        for c in dfx.columns:
            if dfx.dtypes[c] == object:
                dfx[c] = dfx[c].str.strip()
        dfx.to_csv('kirkleescleandata/' + fname, index=False)
        # df2 = pd.concat([df2, dfx])


# --- Combine the cleaned files into one big dataframe -----------------------
dd = pd.DataFrame()
for fname in os.listdir('kirkleescleandata/'):
    dd = pd.concat([dd, pd.read_csv('kirkleescleandata/' + fname,
                                    dtype={'Classification Code': str})])
dd['Payment Date'] = dd['Payment Date'].apply(parse)
# Drop the float-cast artefact '1234.0' -> '1234'. (The original used
# .str.rstrip('.0'), which strips a *character set* and so corrupts codes
# that legitimately end in zero, e.g. '1200' -> '12'.)
dd['Classification Code'] = dd['Classification Code'].str.replace(
    r'\.0$', '', regex=True)
dd.dtypes
dd[:2]

from ggplot import *

ggplot(dd, aes(x='Payment Date', y='Amount Excluding Vat')) + geom_point()

# DataFrame.sort() was removed from pandas; sort_values() is the replacement
dd.sort_values('Payment Date', inplace=True)
dd['Cumul Amount Excluding Vat'] = dd['Amount Excluding Vat'].cumsum()
ggplot(dd, aes(x='Payment Date', y='Cumul Amount Excluding Vat')) + geom_line()

# Quick peek at spends by classification description
group = dd[['Classification Description',
            'Amount Excluding Vat']].groupby('Classification Description')
group.aggregate(sum).sort_values('Amount Excluding Vat', ascending=False)[:15]

group = dd[['Payee', 'Amount Excluding Vat']].groupby('Payee')
group.aggregate(sum).sort_values('Amount Excluding Vat', ascending=False)[:30]