Brief review of open spending data from Kirklees Council
Expenditure Data Web Page: http://www.kirklees.gov.uk/you-kmc/information/expenditureData.aspx Ease of discovery:
The following code parses the Kirklees council expenditure data webpage, identifies the URLs of the CSV data files, and downloads them.
import requests
def linkgrabber_kirklees(response):
''' Download spending transaction CSV files from the Kirklees website '''
soup=BeautifulSoup(response.content)
results=[]
#Get the list of results
resultsblock=soup.find('div',{'id':'pageContent'})
resultsblock=resultsblock.find('section')
#For each result, parse out the name of the dataset, the datamart ID and the data filter ID
# regex via http://stackoverflow.com/a/13794740/454773
for result in resultsblock.findAll('a',{'class':'csvLink'}):
csvurl='http://www.kirklees.gov.uk'+result.attrs['href']
fn=result.attrs['href'].split('/')[-1]
results.append((csvurl,fn))
return results
#!mkdir kirkleesdata
url='http://www.kirklees.gov.uk/you-kmc/information/expenditureData.aspx'
html2= requests.get(url)
cc=linkgrabber_kirklees(html2)
harvester(cc,'kirkleesdata')
Having downloaded the data files, we then try to identify exactly what sort of files they are.
This includes:
.xls
or .xlsx
to .csv
does not make it a CSV file.import os
#Routine to detect file encoding
#derived from /usr/local/bin/chardetect.py
from chardet.universaldetector import UniversalDetector
def sniffer(fname):
''' Try to detect the file encoding for a text (CSV) file '''
file=open(fname, 'rb')
u = UniversalDetector()
for line in file:
u.feed(line)
u.close()
result = u.result
if result['encoding']:
return (result['encoding'], result['confidence'])
#for fname in os.listdir('kirkleesdata'):
# print(fname)
# if fname.endswith('csv'):
# print(fname,sniffer('kirkleesdata/'+fname))
!mkdir kirkleescleandata
#Routines to identify filetype
#https://pypi.python.org/pypi/filemagic/1.6
import magic
import pandas as pd
df2=pd.DataFrame()
def reader(fname,typ,enc):
''' Load in the data file as a pandas dataframe; try to parse dates as such '''
if typ=='text/plain':
try:
dfx=pd.read_csv('kirkleesdata/'+fname,
parse_dates='Payment Date',
infer_datetime_format=True, dayfirst=True,
dtype={'Unique Transaction Id':object,'Classification Code':object,'Expense Type':object,'Vat Number':object},
encoding=enc)
except:
print('Error',fname,typ)
elif typ in ['application/vnd.ms-office','application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
try:
dfx=pd.read_excel('kirkleesdata/'+fname,
parse_dates='Payment Date',
infer_datetime_format=True,dayfirst=True)
except:
print('Error',fname,typ)
return dfx
def patchreader(fname,typ,enc):
''' If column names are missing, add them in under assumption the standard column order is used '''
names=['Body', 'Body Name', 'Payment Date', 'Unique Transaction Id', 'Invoice Number', 'Amount Excluding Vat', 'Payee', 'Supplier Ref', 'Vat Number', 'Expense Area', 'Expense Area Description', 'Expense Type', 'Expense Type Description', 'Classification Code', 'Classification Description', 'Extended Information', 'BVACOP Code', 'BVACOP Description']
if typ=='text/plain':
try:
dfx=pd.read_csv('kirkleesdata/'+fname, names=names,header=None,
parse_dates='Payment Date',
infer_datetime_format=True, dayfirst=True,
dtype={'Unique Transaction Id':object,'Classification Code':object,'Expense Type':object,'Vat Number':object},
encoding='windows-1252')
except:
print('Error',fname,typ)
elif typ in ['application/vnd.ms-office','application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
try:
dfx=pd.read_excel('kirkleesdata/'+fname,names=names,header=None,
parse_dates='Payment Date',
infer_datetime_format=True,dayfirst=True)
except:
print('Error',fname,typ)
return dfx
#http://niemeyer.net/python-dateutil
from dateutil.parser import *
def loader(fname,typ,enc=None):
''' Try to read each data file in; cast dates not identified as such to dates '''
dfx=reader(fname,typ,enc)
if 'Body' not in dfx.columns:
dfx=patchreader(fname,typ,enc)
#If we haven't converted to date type for some reason, use dateutil parse() for now...
if dfx.dtypes['Payment Date']==dtype('O'):
dfx['Payment Date']=dfx['Payment Date'].apply(parse)
#Some expenditure columns may not be recognised as numbers because of pound signs or commas used to separate thousands
if dfx.dtypes['Amount Excluding Vat']==dtype('O'):
dfx['Amount Excluding Vat']=dfx['Amount Excluding Vat'].str.replace('£','').str.replace(',','').astype(float)
return dfx
#Get a list of all the downloaded CSV files, try to detect what sort of file they are and load them in
for fname in os.listdir('kirkleesdata/'):
#if a file is a CSV file, process it
if fname.endswith('csv') and fname!='alldata.csv':
#print(fname)
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
mt= m.id_filename('kirkleesdata/'+fname)
enc=None
if mt=='text/plain':
enc,conf=sniffer('kirkleesdata/'+fname)
dfx=loader(fname,mt,enc)
#print(fname,dfx.columns,mt) #df.dtypes
print(fname,'[',mt,enc,']') #df.dtypes
for c in dfx.columns:
if dfx.dtypes[c]==dtype('O'):
dfx[c]=dfx[c].str.strip()
dfx.to_csv('kirkleescleandata/'+fname,index=False)
#df2=pd.concat([df2,dfx])
expenditure250_April2013.csv [ application/vnd.ms-office None ] expenditure250_April2014.csv [ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet None ] expenditure250_August2012.csv [ text/plain windows-1252 ] expenditure250_August2013.csv [ text/plain windows-1252 ] expenditure250_December2012.csv [ text/plain ISO-8859-5 ] expenditure250_December2013.csv [ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet None ] expenditure250_February2013.csv [ application/vnd.ms-office None ] expenditure250_February2014.csv [ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet None ] expenditure250_January2013.csv [ text/plain windows-1252 ] expenditure250_January2014.csv [ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet None ] expenditure250_July2013.csv [ text/plain windows-1252 ] expenditure250_July2014.csv [ text/plain windows-1252 ] expenditure250_June2013.csv [ text/plain windows-1252 ] expenditure250_June2014.csv [ text/plain windows-1252 ] expenditure250_March2013.csv [ text/plain windows-1252 ] expenditure250_March2014.csv [ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet None ] expenditure250_May2013.csv [ application/vnd.ms-office None ] expenditure250_May2014.csv [ text/plain windows-1252 ] expenditure250_November2012.csv [ text/plain windows-1252 ] expenditure250_November2013.csv [ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet None ] expenditure250_October2012.csv [ application/vnd.ms-office None ] expenditure250_October2013.csv [ text/plain windows-1252 ] expenditure250_September2012.csv [ text/plain windows-1252 ] expenditure250_September2013.csv [ text/plain windows-1252 ]
The CSV files are the text/plain
files; the file encoding used for these files is also shown. The application files are actually spreadsheet application files.
dd=pd.DataFrame()
for fname in os.listdir('kirkleescleandata/'):
dd=pd.concat([dd,pd.read_csv('kirkleescleandata/'+fname,dtype={'Classification Code':str})])
dd['Payment Date']=dd['Payment Date'].apply(parse)
dd['Classification Code']=dd['Classification Code'].str.rstrip('.0')
dd.dtypes
Body object Body Name object Payment Date datetime64[ns] Unique Transaction Id object Invoice Number object Amount Excluding Vat float64 Payee object Supplier Ref object Vat Number object Expense Area object Expense Area Description object Expense Type object Expense Type Description object Classification Code object Classification Description object Extended Information object BVACOP Code object BVACOP Description object dtype: object
dd[:2]
Body | Body Name | Payment Date | Unique Transaction Id | Invoice Number | Amount Excluding Vat | Payee | Supplier Ref | Vat Number | Expense Area | Expense Area Description | Expense Type | Expense Type Description | Classification Code | Classification Description | Extended Information | BVACOP Code | BVACOP Description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 00CZ | Kirklees Council | 2013-04-02 | 5931631 | J46329 | -2575.47 | ADEN HOUSE LTD T/A ADEN HOUSE | ADE0110 | 0 | YT1403 | NK NURSING PLACEMTS | 86444 | GROSS FEES | 30222402 | NURSING HOMES | REDACTED - SOCIAL CARE CLIENT | YYSE22 | Nursing home placements |
1 | 00CZ | Kirklees Council | 2013-04-02 | 5931683 | J46329 | 126.32 | ADEN HOUSE LTD T/A ADEN HOUSE | ADE0110 | 0 | YT1411 | NK RES-RESPITE | 62547 | PRIV SECT PROVIDERS | 30222402 | NURSING HOMES | REDACTED - SOCIAL CARE CLIENT | YYSE24 | Residential care home placements |
from ggplot import *
ggplot(dd,aes(x='Payment Date',y='Amount Excluding Vat'))+geom_point()
<ggplot: (-9223363284016582151)>
dd.sort('Payment Date',inplace=True)
dd['Cumul Amount Excluding Vat'] = dd['Amount Excluding Vat'].cumsum()
ggplot(dd,aes(x='Payment Date',y='Cumul Amount Excluding Vat'))+geom_line()
<ggplot: (8752839562858)>
#Quick peek at spends by classification description
group=dd[['Classification Description','Amount Excluding Vat']].groupby('Classification Description')
group.aggregate(sum).sort('Amount Excluding Vat',ascending=False)[:15]
Amount Excluding Vat | |
---|---|
Classification Description | |
SCHOOLS - LOCAL AUTHORITY | 78105646.93 |
TAX COLLECTION SERVICES | 46998745.70 |
RESIDENTIAL HOMES | 38642927.73 |
PENSIONS | 29598451.20 |
NURSING HOMES | 22176758.12 |
EDUCATIONAL FACILITIES | 17617340.48 |
STAFF RECRUITING SERVICES | 15428748.20 |
WASTE DISPOSAL SERVICES | 14223936.01 |
FINANCIAL ASSISTANCE | 10468042.36 |
FOSTER HOME CARE SERVICES | 7527632.86 |
PLAYGROUPS & PRE-SCHOOL EDUCAT | 7250276.34 |
BUILDERS | 7043481.32 |
LOCAL GOVERNMENT | 6835798.19 |
INFANT OR CHILD DAYCARE SERVIC | 6533532.69 |
CREDIT UNIONS | 6527106.15 |
Duplicates in classification suggest we need to clean some of the data - eg check things like whitespace at the very least.
group=dd[['Payee','Amount Excluding Vat']].groupby('Payee')
group.aggregate(sum).sort('Amount Excluding Vat',ascending=False)[:30]
Amount Excluding Vat | |
---|---|
Payee | |
Hmrc Cumbernauld | 46998745.700000 |
REDACTED - SOCIAL CARE CLIENT | 41699061.729996 |
REDACTED - INDIVIDUALS NAME | 30087074.250000 |
TEACHERS PENSIONS | 17840106.900000 |
KIRKLEES SCHOOL SERVICES LTD | 14933581.530000 |
SITA KIRKLEES LTD | 10974026.820000 |
MATRIX SCM | 8470585.940000 |
Direct Debit Payment | 7351203.900000 |
REGENTER EXCELLENT HOMES | 6557239.140000 |
CASTLE & MINSTER CREDIT UNION | 6527106.150000 |
Kirklees Neighbourhood Housing | 6108284.250000 |
Matrix Scm | 6029300.860000 |
LOCALA COMMUNITY PARTNERSHIPS | 4396075.940000 |
BAM CONSTRUCTION LTD | 3770992.790000 |
CALDERDALE & KIRKLEES CAREERS | 3476105.180000 |
KIRKLEES MC | 3154920.660000 |
KIRKLEES ACTIVE LEISURE | 3112832.780000 |
NPOWER LTD | 3042823.740000 |
Sita Kirklees Ltd | 3006179.450000 |
CARLTON FUELS | 2970762.990000 |
FOSTERING SOLUTIONS LTD | 2960867.940000 |
ACTION FOR CHILDREN SERV LTD | 2948492.750000 |
LOGICA UK LIMITED | 2905261.000000 |
REDACTED - TEMP STAFF NAME | 2882357.080000 |
SANMET LTD | 2854561.460000 |
THE BRIDGEWOOD TRUST | 2824150.660000 |
QED (KMC) LTD | 2665664.340000 |
Npower Ltd (Acc G317****) | 2645074.930000 |
WESTBOROUGH HIGH SCHOOL | 2592773.970000 |
GALLAGHER BASSETT INT LTD | 2516284.000000 |