!wget http://lin-360giving.aptivate.org/static/uploads/all_grants.csv.gz -P data
!gunzip "data/all_grants.csv.gz" -d data

import pandas as pd

!ls data

df=pd.read_csv('data/all_grants.csv', dtype={'title':object},
               parse_dates=['applicationdate_startdate', 'awarddate_startdate', 'planneddates_startdate','planneddates_enddate'] )

df[:3]

df['title'].unique()

#Who are the funders?
funders=df['fundingorganization_name'].unique().tolist()
funders

df[df['fundingorganization_name'].str.contains('Rock')]

#Which organisations are the most 20 successful recipents in terms of number of grants awarded?
df.groupby('recipientorganization_name').size().order(ascending=False)[:20]

#Strip any whitespace around the name
df['recipientorganization_name'] = df['recipientorganization_name'].str.strip()
#Generate a new column with names in uppercase
df['recipientorganization_name_upper'] = df['recipientorganization_name'].str.upper()

#So which organisations are actually the most 20 successful recipents in terms of number of grants awarded?
df.groupby('recipientorganization_name_upper').size().order(ascending=False)[:20]

#Now let's see which are the most 20 successful organisations in terms of the total amounts awarded?
orgs=df[['recipientorganization_name_upper','totalamountawarded']].groupby('recipientorganization_name_upper')
orgs.sum().sort('totalamountawarded',ascending=False)[:10]

#So the universities are getting lots of grants and lots of income. How about if we try to separate out "winners" by funder?
#First in terms of number of grants received
df.groupby(['fundingorganization_name','recipientorganization_name_upper']).size().order(ascending=False)[:30]

#Which universities have received the most awards?
unis=df[df['recipientorganization_name_upper'].str.contains('UNIVERSITY')]
unis.groupby(['fundingorganization_name','recipientorganization_name_upper']).size().order(ascending=False)[:20]

#How much cash have these universities received?
uniFunds=unis[['fundingorganization_name','recipientorganization_name_upper','totalamountawarded']]
uniFunds.groupby(['fundingorganization_name','recipientorganization_name_upper']).sum().sort('totalamountawarded',ascending=False)[:10]

#Bored of the universities... how about limited companies?
ltd=df[df['recipientorganization_name_upper'].str.contains('LTD') | df['recipientorganization_name_upper'].str.contains('LIMITED')]
ltd.groupby(['fundingorganization_name','recipientorganization_name_upper']).size().order(ascending=False)[:20]

#And in terms of the cash received?
ltdFunds=ltd[['fundingorganization_name','recipientorganization_name_upper','totalamountawarded']]
ltdFunds.groupby(['fundingorganization_name','recipientorganization_name_upper']).sum().sort('totalamountawarded',ascending=False)[:15]

#Who's most successful in terms of total number of grants received apart from the universities?
nunis=df[~df['recipientorganization_name_upper'].str.contains('UNIVERSITY')]
nunis.groupby(['fundingorganization_name','recipientorganization_name_upper']).size().order(ascending=False)[:20]

#Hmmm, councils... which have had most grants?
ccls=df[df['recipientorganization_name_upper'].str.contains('COUNCIL')]
ccls.groupby(['fundingorganization_name','recipientorganization_name_upper']).size().order(ascending=False)[:20]

#How about council success in terms of cash...?
cclFunds=ccls[['fundingorganization_name','recipientorganization_name_upper','totalamountawarded']]
cclFunds.groupby(['fundingorganization_name','recipientorganization_name_upper']).sum().sort('totalamountawarded',ascending=False)[:15]

#psql lets us return tables as dataframes
from pandas import read_sql_query as psql
#psqlx is a convenience fucntion for executing raw queries where we aren't so bothered about the response...
#Errors should still be thrown...
from pandas.io.sql import execute as psqlx

import pandas as pd
from sqlalchemy import create_engine

#Create the database connection for pandas dataframe operations
engine = create_engine('postgresql://test:test@localhost:5432/tm351test')

#This ia a helper routine to help find the maximum length of each field - cast as a string  - in the datafile
def colMaxLen(df,maxlens):
    for col in df.columns:
        cml=df[col].astype(str).map(len).max()
        if cml > maxlens[col]: maxlens[col]=cml

#Let's detect the maximum field lengths in the dataset so we can use it to help configue our database table
fname='data/all_grants.csv'
maxlens={}
for name in df.columns: maxlens[name]=0
            
chunks=pd.read_csv(fname,chunksize=10000)
for chunk in chunks:
    colMaxLen(chunk,maxlens)
maxlens

#If the table we're going to use already exists, get rid of it - we're going for a fresh start...
psqlx("DROP TABLE IF EXISTS charities360data;",engine)

#Here's where I create the table - gonna be lazy and make lots of things varchar for a quick start...
psqlx("CREATE TABLE charities360data ( planneddates_enddate VARCHAR (30), awarddate_startdate VARCHAR (30), currency VARCHAR (10), \
        applicationdate_startdate VARCHAR (30), recipientorganization_charitynumber VARCHAR (10), \
        recipientorganization_companynumber VARCHAR (10), fundingorganization_id VARCHAR (50), \
        id VARCHAR (75) PRIMARY KEY, totalamountawarded FLOAT, fundingorganization_name VARCHAR (30), \
        recipientorganization_id VARCHAR(100), description VARCHAR(1000), recipientorganization_name VARCHAR (150), \
        recipientorganization_name_upper VARCHAR (150), \
        planneddates_startdate VARCHAR (30), totalamountappliedfor FLOAT, title VARCHAR (100) );",engine)

#Test the table
psql("SELECT * FROM charities360data LIMIT 1;", engine)

#This routine will load data in from a pandas dataframe, itself created from 10k rows at a time read in from the source CSV file
dateFields=[]#['applicationdate_startdate', 'awarddate_startdate', 'planneddates_startdate','planneddates_enddate']
chunks=pd.read_csv(fname,parse_dates=dateFields,chunksize=10000)
for chunk in chunks:
    #Just in case, strip of any whotespace around names
    for col in ['recipientorganization_name','fundingorganization_name']:
        chunk[col]=chunk[col].str.strip()
    #Another normalisation step: introduce a new column corresponding to organisation name in all upper case characters
    chunk['recipientorganization_name_upper']=chunk['recipientorganization_name'].str.upper()
    chunk.to_sql('charities360data', engine, index=False, if_exists='append')

#Quick query to see which organisation names appear the most
psql("SELECT  recipientorganization_name, COUNT(*) AS tally FROM charities360data \
        WHERE recipientorganization_name ILIKE '%%UNIVERSITY%%' \
        GROUP BY recipientorganization_name HAVING COUNT(*)>3  \
        ORDER BY tally DESC LIMIT 20;", engine)

#We note that the data appears to have the same organisation names in a variety of capitalisations...
#That's why we normalise them...
psql("SELECT  recipientorganization_name_upper, COUNT(*) AS tally, to_char(SUM(totalamountawarded),'99999999999D99') FROM charities360data \
        WHERE recipientorganization_name_upper ILIKE '%%UNIVERSITY%%' AND totalamountawarded != 'NaN' AND totalamountawarded>10000000 \
        GROUP BY recipientorganization_name_upper HAVING COUNT(*)>3  \
        ORDER BY tally DESC;", engine)

#Quick test query around the total amounts awarded to orgs by normalised name
psql("SELECT  recipientorganization_name_upper ,totalamountawarded FROM charities360data \
        WHERE  totalamountawarded >10000000 AND totalamountawarded != 'NaN' \
        AND  recipientorganization_name_upper ILIKE '%%UNIVERSITY%%' ORDER BY totalamountawarded DESC; ",engine)

#By the by, how many unique upper case names are there?
psql("SELECT COUNT(*) FROM (SELECT DISTINCT recipientorganization_name_upper FROM charities360data) tmp",engine)

#One of the other tables I have in the same database is a list of company names and company numbers from Companies House
#Here's what each row looks like in that table...
psql("SELECT * from companydata limit 1",engine)

#Let's see how many names we an trivially join on between the 360Charities data and the Companies House data...
cnum=psql("SELECT DISTINCT ch.recipientorganization_name_upper, co.companynumber FROM charities360data ch JOIN companydata co  \
        ON ch.recipientorganization_name_upper = co.companyname ;",engine)
cnum[:10]

#How many matches have we got?
len(cnum)

#We can bring in additional data about a company from it's company number using the OpenCorporates API

import json, requests
  
def openCorporatesLookupCompanyNumber(cnum):
    ''' Grab company data from OpenCorporates '''
    url='https://api.opencorporates.com/v0.3/companies/gb/{0}?format=json'.format(cnum)
    jsondata = json.loads(requests.get(url).text)
    return jsondata['results']

    
corpdata=openCorporatesLookupCompanyNumber('04200429')

#We can then do things like look the the list of officers
officers = pd.DataFrame( [ d['officer'] for d in corpdata['company']['officers'] ] )
officers

#We can also search OpenCorporates officers by name
def openCorporatesOfficerSearch(officername):
    ''' Search for officers by exact match name '''
    url='https://api.opencorporates.com/v0.3/officers/search'
    params={'q':'"'+officername+'"'}
    jsondata = json.loads(requests.get(url,params=params).text)
    data=[]
    for d in jsondata['results']['officers']:
        tmp=d['officer']
        for i in ['jurisdiction_code','company_number']:
            tmp[i]=tmp['company'][i]
        tmp['company_name']=tmp['company']['name']
        tmp.pop('company',None)
        data.append(tmp)
    df= pd.DataFrame( data )
    return df


openCorporatesOfficerSearch("JONATHAN MASHE THOMAS")

#If we use the join to get a list of company numbers for companies in the 360 charities data,
#we can then look these up in the Companies House datatable to see what postcodes they correspond to,
#and then group and count on this basis.
#That is, for compnay matched charities, what are the most popular postcodes?
pcc=psql("SELECT regaddress_postcode, COUNT(*) FROM companydata \
            WHERE companynumber IN (SELECT DISTINCT co.companynumber  \
            FROM charities360data ch JOIN companydata co  \
            ON ch.recipientorganization_name_upper = co.companyname) \
            AND regaddress_postcode !='NaN' \
            GROUP BY regaddress_postcode HAVING COUNT(*)> 5 ORDER BY COUNT(*) DESC;",engine)
pcc

#We can tunnel in on a particular postcode
mpc=psql("SELECT DISTINCT ch.fundingorganization_name , ch.recipientorganization_name_upper, co.companynumber, regaddress_addressline1 \
        FROM charities360data ch JOIN companydata co  \
        ON ch.recipientorganization_name_upper = co.companyname \
        WHERE co.regaddress_postcode ='HU19 2DL';",engine)
mpc

#Let's make it easier to run this sort of query...

def searchByCompanyPostcode(pc):
    mpc=psql("SELECT DISTINCT ch.fundingorganization_name, ch.recipientorganization_name_upper, co.companynumber, regaddress_addressline1 \
        FROM charities360data ch JOIN companydata co  \
        ON ch.recipientorganization_name_upper = co.companyname \
        WHERE co.regaddress_postcode ='"+pc+"';",engine)
    return mpc

searchByCompanyPostcode('KT15 2NX')

searchByCompanyPostcode('N9 0PZ')

#Grab the data file
!wget http://OpenCharities.org/charities.csv.zip -P data
    
#Uncompress it
!unzip data/charities.csv.zip -d data

!head -n 3 data/charities.csv

#Find out how wide each daata field is to help us set up the database table
fname='data/charities.csv'

df=pd.read_csv(fname)

maxlens={}
for name in df.columns: maxlens[name]=0
            
chunks=pd.read_csv(fname,chunksize=10000)
for chunk in chunks:
    colMaxLen(chunk,maxlens)
maxlens

#If the table we're going to use already exists, get rid of it - we're going for a fresh start...
psqlx("DROP TABLE IF EXISTS charitiescommission",engine)

#Postgresql defaults to lower caps - we can force capitalisation on a table name by double quoting it
#psqlx('DROP TABLE IF EXISTS "charitiesCommission"',engine)

#Create the table using fields a bit bigger than the max width
sql="CREATE TABLE charitiescommission ("
colstr='charity_name_upper'+" VARCHAR ("+str(maxlens['title']+5)+"), pcode_guess VARCHAR (10)"
for i in maxlens:
    tmp=i+" VARCHAR ("+str(maxlens[i]+5)+")"
    colstr=", ".join([colstr,tmp])
        
psqlx( sql +colstr +");",engine)


#Get the data into the postgresql database, via a pandas dataframe
chunks=pd.read_csv(fname,chunksize=10000, dtype ={'charity_number':str})
for chunk in chunks:
    for col in ['title']:
        chunk[col]=chunk[col].str.strip()
    #I'm going to add another column which is a guess at the postcode extracted from the address
    chunk['pcode_guess']=chunk['address'].str.extract('([A-Z]{1,2}[0-9]{1,2} [0-9][A-Z]{2})')
    #Let's also get a normalised, upper case variant of the charity name
    chunk['charity_name_upper']=chunk['title'].str.upper()
    chunk.to_sql('charitiescommission', engine, index=False, if_exists='append')

#A quick test query to see what we can join with the 360 charities data using charity name...
chcnum=psql("SELECT DISTINCT ch.recipientorganization_name_upper, cc.charity_number, cc.company_number, cc.pcode_guess \
        FROM charities360data ch JOIN charitiescommission cc  \
        ON ch.recipientorganization_name_upper = cc.charity_name_upper ;",engine)
chcnum[:10]

#How many matches were there?
len(chcnum)

#Are there any popular postcodes?
chcnum[['pcode_guess']].groupby(['pcode_guess']).size().order(ascending=False)[:20]

#As with populat postcodes for company registered addresses, there may be several reasons for popular charity address postcodes

#Let's have a quick look at charities registered to one of those postcodes
psql("SELECT DISTINCT charity_name_upper,address FROM charitiescommission WHERE pcode_guess='NE46 3NP'",engine)

#Although the Charity Commission doesn't publish an API, Open Charities does... 
import json, requests

def opencharitiesLookup(id):
    url = 'http://opencharities.org/charities/{0}.json'.format(id)
    jsondata = json.loads(requests.get(url).text)
    return jsondata


#Lets see what we can back...
charityNum='202918'

jdata = opencharitiesLookup(charityNum)
jdata

#We can start to pull out specifics from this data...
chdata = jdata['charity']
fdata = chdata['financial_breakdown']

data={}
#The data will include employee and volunteer numbers..
for tmp in ['volunteers','employees']:
    data[tmp] = chdata[tmp]
 
#...as well as financial information
for tmp in ['assets','spending','income']:
    if fdata != None and tmp in fdata:
        for tmp2 in fdata[tmp]:
            data[tmp+'_'+tmp2] = fdata[tmp][tmp2]
data

#Or how about get a list of trustees?
pd.DataFrame(jdata['charity']['trustees'])

#A search on the Charities Commission site - http://www.charitycommission.gov.uk/find-charities/ - for a particular charity
#  includes a list of trustees. Clicking a trustee name shows you other charities for which they are a trustee.
#Unfortunately, OpenCharities doesnlt seem to support this level of search - to implement it ourselves, we'd have to scrape the
# data ourselves...

#Simple function to look for trustees for a charity using OpenCharities by Charity Number
def lookupTrusteesForCharity(charitynumber):
    url = 'http://opencharities.org/charities/{0}.json'.format(charitynumber)
    jsondata = json.loads(requests.get(url).text)
    df = pd.DataFrame(jsondata['charity']['trustees'])
    df['charityname']=jsondata['charity']['title']
    df['charitynumber']=jsondata['charity']['charity_number']
    return df

lookupTrusteesForCharity('516163')

#Display charity classification areas

def lookupCharityClassification(charitynumber):
    url = 'http://opencharities.org/charities/{0}.json'.format(charitynumber)
    jsondata = json.loads(requests.get(url).text)
    df = pd.DataFrame(jsondata['charity']['classifications'])
    
    df['charityname']=jsondata['charity']['title']
    df['charitynumber']=jsondata['charity']['charity_number']
    return df

lookupCharityClassification('516163')


#Notes on sqlalchemy

#There's something odd goes on - sqlalchemy tablenames may include case by default.
#To drop a tablename with cased characters, enclose the name in "" in the DROP statement.
#I also seem to have a table that is listed but that doesn't exist?

#import sqlalchemy
#meta = sqlalchemy.MetaData(engine, schema='tm351test')
#meta.reflect()
#pdsql = pd.io.sql.PandasSQLAlchemy(engine, meta=meta)

#from sqlalchemy import inspect
#eng=inspect(engine)
#eng.get_columns('charitiescommission')

#dir(eng)

#eng.get_table_names()

#dir(chC)

#psql("SELECT table_schema,table_name FROM information_schema.tables \
#    ORDER BY table_schema,table_name;", engine)