# Harvest and clean Kirklees council spending-transaction data.
# (Originally a Jupyter/IPython notebook; shell magics converted to os calls.)
#
# NOTE(review): `BeautifulSoup` and `harvester` are used but not defined in
# this file — presumably provided by an earlier notebook cell. TODO confirm.

import os

import requests

# Routine to detect file encoding
# derived from /usr/local/bin/chardetect.py
from chardet.universaldetector import UniversalDetector

# Routines to identify filetype
# https://pypi.python.org/pypi/filemagic/1.6
import magic

import pandas as pd

# http://niemeyer.net/python-dateutil
from dateutil.parser import parse


def linkgrabber_kirklees(response):
    '''
    Download spending transaction CSV files from the Kirklees website.

    response: a requests.Response for the expenditure-data page.
    Returns a list of (absolute_url, filename) tuples, one per CSV link.
    '''
    # Explicit parser keeps bs4 from guessing (and emitting a warning)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get the list of results
    resultsblock = soup.find('div', {'id': 'pageContent'}).find('section')
    # For each result, parse out the dataset URL and its filename
    # regex via http://stackoverflow.com/a/13794740/454773
    results = []
    for result in resultsblock.findAll('a', {'class': 'csvLink'}):
        href = result.attrs['href']
        results.append(('http://www.kirklees.gov.uk' + href,
                        href.split('/')[-1]))
    return results


# --- Download all the spending files into ./kirkleesdata --------------------
os.makedirs('kirkleesdata', exist_ok=True)  # was IPython magic: !mkdir kirkleesdata
url = 'http://www.kirklees.gov.uk/you-kmc/information/expenditureData.aspx'
html2 = requests.get(url)
cc = linkgrabber_kirklees(html2)
harvester(cc, 'kirkleesdata')  # harvester() defined in an earlier cell


def sniffer(fname):
    '''
    Try to detect the file encoding for a text (CSV) file.

    Returns (encoding, confidence) or None if chardet could not decide.
    '''
    detector = UniversalDetector()
    # 'rb' because chardet wants raw bytes; with-block guarantees the file
    # is closed (the original leaked the handle)
    with open(fname, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:  # stop early once chardet is confident
                break
    detector.close()
    result = detector.result
    if result['encoding']:
        return (result['encoding'], result['confidence'])
    return None


# for fname in os.listdir('kirkleesdata'):
#     if fname.endswith('csv'):
#         print(fname, sniffer('kirkleesdata/' + fname))

os.makedirs('kirkleescleandata', exist_ok=True)  # was: !mkdir kirkleescleandata

# MIME types the council publishes spreadsheet (Excel) files under
_EXCEL_TYPES = ('application/vnd.ms-office',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

# Columns that must be kept as strings (codes with leading zeros etc.)
_STR_COLS = {'Unique Transaction Id': object, 'Classification Code': object,
             'Expense Type': object, 'Vat Number': object}

# Standard column ordering, used to patch files published with no header row
_COLUMN_NAMES = ['Body', 'Body Name', 'Payment Date', 'Unique Transaction Id',
                 'Invoice Number', 'Amount Excluding Vat', 'Payee',
                 'Supplier Ref', 'Vat Number', 'Expense Area',
                 'Expense Area Description', 'Expense Type',
                 'Expense Type Description', 'Classification Code',
                 'Classification Description', 'Extended Information',
                 'BVACOP Code', 'BVACOP Description']

df2 = pd.DataFrame()


def reader(fname, typ, enc):
    '''
    Load a data file from kirkleesdata/ as a pandas DataFrame, trying to
    parse the 'Payment Date' column as dates.

    fname: bare filename inside kirkleesdata/
    typ:   MIME type as reported by libmagic
    enc:   character encoding (CSV files only)
    Returns the DataFrame, or None if the file could not be parsed
    (the original raised UnboundLocalError on any failure).
    '''
    dfx = None
    if typ == 'text/plain':
        try:
            # parse_dates takes a list of column names (the original passed
            # a bare string)
            dfx = pd.read_csv('kirkleesdata/' + fname,
                              parse_dates=['Payment Date'],
                              infer_datetime_format=True, dayfirst=True,
                              dtype=_STR_COLS, encoding=enc)
        except Exception as e:
            print('Error', fname, typ, e)
    elif typ in _EXCEL_TYPES:
        try:
            dfx = pd.read_excel('kirkleesdata/' + fname,
                                parse_dates=['Payment Date'],
                                infer_datetime_format=True, dayfirst=True)
        except Exception as e:
            print('Error', fname, typ, e)
    return dfx


def patchreader(fname, typ, enc):
    '''
    Re-read a file whose column names are missing, adding them in under the
    assumption that the standard column order is used.

    Same parameters and return convention as reader().
    '''
    dfx = None
    if typ == 'text/plain':
        try:
            # NOTE(review): windows-1252 is hardwired here (not `enc`) in the
            # original — presumably the sniffed encoding misfires on these
            # headerless files; kept as-is to preserve behaviour.
            dfx = pd.read_csv('kirkleesdata/' + fname,
                              names=_COLUMN_NAMES, header=None,
                              parse_dates=['Payment Date'],
                              infer_datetime_format=True, dayfirst=True,
                              dtype=_STR_COLS, encoding='windows-1252')
        except Exception as e:
            print('Error', fname, typ, e)
    elif typ in _EXCEL_TYPES:
        try:
            dfx = pd.read_excel('kirkleesdata/' + fname,
                                names=_COLUMN_NAMES, header=None,
                                parse_dates=['Payment Date'],
                                infer_datetime_format=True, dayfirst=True)
        except Exception as e:
            print('Error', fname, typ, e)
    return dfx


def loader(fname, typ, enc=None):
    '''
    Try to read a data file in; cast dates and amounts not already
    identified as such to proper types.
    '''
    dfx = reader(fname, typ, enc)
    # Headerless files come back without a 'Body' column; re-read them with
    # the standard column names supplied explicitly
    if 'Body' not in dfx.columns:
        dfx = patchreader(fname, typ, enc)
    # If we haven't converted to date type for some reason, use dateutil's
    # parse() for now... (original compared against `dtype('O')` with
    # `dtype` undefined — a NameError)
    if dfx.dtypes['Payment Date'] == object:
        dfx['Payment Date'] = dfx['Payment Date'].apply(parse)
    # Some expenditure columns may not be recognised as numbers because of
    # pound signs or commas used to separate thousands
    if dfx.dtypes['Amount Excluding Vat'] == object:
        dfx['Amount Excluding Vat'] = (dfx['Amount Excluding Vat']
                                       .str.replace('£', '')
                                       .str.replace(',', '')
                                       .astype(float))
    return dfx


# --- Detect the type of each downloaded CSV, load it, clean it, and write ---
# --- the cleaned copy into ./kirkleescleandata ------------------------------
for fname in os.listdir('kirkleesdata/'):
    # if a file is a CSV file, process it
    if fname.endswith('csv') and fname != 'alldata.csv':
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            mt = m.id_filename('kirkleesdata/' + fname)
        enc = None
        if mt == 'text/plain':
            # sniffer() may return None; the original crashed unpacking it
            sniffed = sniffer('kirkleesdata/' + fname)
            if sniffed:
                enc, conf = sniffed
        dfx = loader(fname, mt, enc)
        print(fname, '[', mt, enc, ']')
        # Trim stray whitespace from every string column
        for c in dfx.columns:
            if dfx.dtypes[c] == object:
                dfx[c] = dfx[c].str.strip()
        dfx.to_csv('kirkleescleandata/' + fname, index=False)
        # df2 = pd.concat([df2, dfx])


# --- Combine the cleaned files into one big dataframe -----------------------
dd = pd.DataFrame()
for fname in os.listdir('kirkleescleandata/'):
    dd = pd.concat([dd, pd.read_csv('kirkleescleandata/' + fname,
                                    dtype={'Classification Code': str})])
dd['Payment Date'] = dd['Payment Date'].apply(parse)
# Drop the float-cast artefact '1234.0' -> '1234'. (The original used
# .str.rstrip('.0'), which strips a *character set* and so corrupts codes
# that legitimately end in zero, e.g. '1200' -> '12'.)
dd['Classification Code'] = dd['Classification Code'].str.replace(
    r'\.0$', '', regex=True)
dd.dtypes
dd[:2]

from ggplot import *

ggplot(dd, aes(x='Payment Date', y='Amount Excluding Vat')) + geom_point()

# DataFrame.sort() was removed from pandas; sort_values() is the replacement
dd.sort_values('Payment Date', inplace=True)
dd['Cumul Amount Excluding Vat'] = dd['Amount Excluding Vat'].cumsum()
ggplot(dd, aes(x='Payment Date', y='Cumul Amount Excluding Vat')) + geom_line()

# Quick peek at spends by classification description
group = dd[['Classification Description',
            'Amount Excluding Vat']].groupby('Classification Description')
group.aggregate(sum).sort_values('Amount Excluding Vat', ascending=False)[:15]

group = dd[['Payee', 'Amount Excluding Vat']].groupby('Payee')
group.aggregate(sum).sort_values('Amount Excluding Vat', ascending=False)[:30]