!wget http://lin-360giving.aptivate.org/static/uploads/all_grants.csv.gz -P data
!gunzip data/all_grants.csv.gz

import pandas as pd

!ls data

df = pd.read_csv('data/all_grants.csv',
                 dtype={'title': object},
                 parse_dates=['applicationdate_startdate', 'awarddate_startdate',
                              'planneddates_startdate', 'planneddates_enddate'])
df[:3]

df['title'].unique()

#Who are the funders?
funders = df['fundingorganization_name'].unique().tolist()
funders

#(str.contains with na=False treats missing names as non-matches)
df[df['fundingorganization_name'].str.contains('Rock', na=False)]

#Which organisations are the 20 most successful recipients in terms of number of grants awarded?
df.groupby('recipientorganization_name').size().sort_values(ascending=False)[:20]

#Strip any whitespace around the name
df['recipientorganization_name'] = df['recipientorganization_name'].str.strip()
#Generate a new column with names in uppercase
df['recipientorganization_name_upper'] = df['recipientorganization_name'].str.upper()

#So which organisations are actually the 20 most successful recipients in terms of number of grants awarded?
df.groupby('recipientorganization_name_upper').size().sort_values(ascending=False)[:20]

#Now let's see which organisations are the most successful in terms of the total amounts awarded.
orgs = df[['recipientorganization_name_upper', 'totalamountawarded']].groupby('recipientorganization_name_upper')
orgs.sum().sort_values('totalamountawarded', ascending=False)[:10]

#So the universities are getting lots of grants and lots of income. How about if we try to separate out "winners" by funder?
#First, in terms of number of grants received
df.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).size().sort_values(ascending=False)[:30]

#Which universities have received the most awards?
unis = df[df['recipientorganization_name_upper'].str.contains('UNIVERSITY', na=False)]
unis.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).size().sort_values(ascending=False)[:20]

#How much cash have these universities received?
uniFunds = unis[['fundingorganization_name', 'recipientorganization_name_upper', 'totalamountawarded']]
uniFunds.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).sum().sort_values('totalamountawarded', ascending=False)[:10]

#Bored of the universities... how about limited companies?
ltd = df[df['recipientorganization_name_upper'].str.contains('LTD', na=False) |
         df['recipientorganization_name_upper'].str.contains('LIMITED', na=False)]
ltd.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).size().sort_values(ascending=False)[:20]

#And in terms of the cash received?
ltdFunds = ltd[['fundingorganization_name', 'recipientorganization_name_upper', 'totalamountawarded']]
ltdFunds.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).sum().sort_values('totalamountawarded', ascending=False)[:15]

#Who's most successful in terms of total number of grants received, apart from the universities?
nunis = df[~df['recipientorganization_name_upper'].str.contains('UNIVERSITY', na=False)]
nunis.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).size().sort_values(ascending=False)[:20]

#Hmmm, councils... which have had the most grants?
ccls = df[df['recipientorganization_name_upper'].str.contains('COUNCIL', na=False)]
ccls.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).size().sort_values(ascending=False)[:20]
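#The groupby/count/sort pattern above recurs throughout this exploration, so it can be
#worth wrapping in a couple of helpers. A minimal sketch, assuming the df loaded above;
#topNByCount and topNByFunds are hypothetical names, not part of the original analysis.
def topNByCount(frame, cols, n=20):
    ''' Return the n groups with the most grants, grouping on cols. '''
    return frame.groupby(cols).size().sort_values(ascending=False)[:n]

def topNByFunds(frame, cols, n=20):
    ''' Return the n groups with the largest total amounts awarded, grouping on cols. '''
    return frame.groupby(cols)['totalamountawarded'].sum().sort_values(ascending=False)[:n]

#e.g. topNByCount(ccls, ['fundingorganization_name', 'recipientorganization_name_upper'])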
#How about council success in terms of cash...?
cclFunds = ccls[['fundingorganization_name', 'recipientorganization_name_upper', 'totalamountawarded']]
cclFunds.groupby(['fundingorganization_name', 'recipientorganization_name_upper']).sum().sort_values('totalamountawarded', ascending=False)[:15]

#psql lets us return tables as dataframes
from pandas import read_sql_query as psql
#psqlx is a convenience function for executing raw queries where we aren't so bothered about the response...
#Errors should still be thrown...
from pandas.io.sql import execute as psqlx

import pandas as pd
from sqlalchemy import create_engine

#Create the database connection for pandas dataframe operations
engine = create_engine('postgresql://test:test@localhost:5432/tm351test')

#This is a helper routine to find the maximum length of each field - cast as a string - in the datafile
def colMaxLen(df, maxlens):
    for col in df.columns:
        cml = df[col].astype(str).map(len).max()
        if cml > maxlens[col]:
            maxlens[col] = cml

#Let's detect the maximum field lengths in the dataset so we can use them to help configure our database table
fname = 'data/all_grants.csv'
maxlens = {}
for name in df.columns:
    maxlens[name] = 0
chunks = pd.read_csv(fname, chunksize=10000)
for chunk in chunks:
    colMaxLen(chunk, maxlens)
maxlens

#If the table we're going to use already exists, get rid of it - we're going for a fresh start...
psqlx("DROP TABLE IF EXISTS charities360data;", engine)

#Here's where I create the table - gonna be lazy and make lots of things varchar for a quick start...
psqlx("CREATE TABLE charities360data ( planneddates_enddate VARCHAR (30), awarddate_startdate VARCHAR (30), currency VARCHAR (10), \
       applicationdate_startdate VARCHAR (30), recipientorganization_charitynumber VARCHAR (10), \
       recipientorganization_companynumber VARCHAR (10), fundingorganization_id VARCHAR (50), \
       id VARCHAR (75) PRIMARY KEY, totalamountawarded FLOAT, fundingorganization_name VARCHAR (30), \
       recipientorganization_id VARCHAR (100), description VARCHAR (1000), recipientorganization_name VARCHAR (150), \
       recipientorganization_name_upper VARCHAR (150), \
       planneddates_startdate VARCHAR (30), totalamountappliedfor FLOAT, title VARCHAR (100) );", engine)

#Test the table
psql("SELECT * FROM charities360data LIMIT 1;", engine)

#This routine will load the data in via a pandas dataframe, itself created from 10k rows at a time read in from the source CSV file
dateFields = []  #['applicationdate_startdate', 'awarddate_startdate', 'planneddates_startdate', 'planneddates_enddate']
chunks = pd.read_csv(fname, parse_dates=dateFields, chunksize=10000)
for chunk in chunks:
    #Just in case, strip off any whitespace around names
    for col in ['recipientorganization_name', 'fundingorganization_name']:
        chunk[col] = chunk[col].str.strip()
    #Another normalisation step: introduce a new column giving the organisation name in all upper case characters
    chunk['recipientorganization_name_upper'] = chunk['recipientorganization_name'].str.upper()
    chunk.to_sql('charities360data', engine, index=False, if_exists='append')

#Quick query to see which organisation names appear the most
psql("SELECT recipientorganization_name, COUNT(*) AS tally FROM charities360data \
      WHERE recipientorganization_name ILIKE '%%UNIVERSITY%%' \
      GROUP BY recipientorganization_name HAVING COUNT(*)>3 \
      ORDER BY tally DESC LIMIT 20;", engine)

#We note that the data appears to have the same organisation names in a variety of capitalisations...
#That's why we normalise them...
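#We can see the capitalisation problem directly by counting how many distinct raw spellings
#map onto each normalised name - a minimal sketch of a diagnostic query, assuming the
#charities360data table loaded above.
psql("SELECT recipientorganization_name_upper, COUNT(DISTINCT recipientorganization_name) AS variants \
      FROM charities360data \
      GROUP BY recipientorganization_name_upper \
      HAVING COUNT(DISTINCT recipientorganization_name) > 1 \
      ORDER BY variants DESC LIMIT 10;", engine)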
psql("SELECT recipientorganization_name_upper, COUNT(*) AS tally, to_char(SUM(totalamountawarded),'99999999999D99') FROM charities360data \ WHERE recipientorganization_name_upper ILIKE '%%UNIVERSITY%%' AND totalamountawarded != 'NaN' AND totalamountawarded>10000000 \ GROUP BY recipientorganization_name_upper HAVING COUNT(*)>3 \ ORDER BY tally DESC;", engine) #Quick test query around the total amounts awarded to orgs by normalised name psql("SELECT recipientorganization_name_upper ,totalamountawarded FROM charities360data \ WHERE totalamountawarded >10000000 AND totalamountawarded != 'NaN' \ AND recipientorganization_name_upper ILIKE '%%UNIVERSITY%%' ORDER BY totalamountawarded DESC; ",engine) #By the by, how many unique upper case names are there? psql("SELECT COUNT(*) FROM (SELECT DISTINCT recipientorganization_name_upper FROM charities360data) tmp",engine) #One of the other tables I have in the same database is a list of company names and company numbers from Companies House #Here's what each row looks like in that table... psql("SELECT * from companydata limit 1",engine) #Let's see how many names we an trivially join on between the 360Charities data and the Companies House data... cnum=psql("SELECT DISTINCT ch.recipientorganization_name_upper, co.companynumber FROM charities360data ch JOIN companydata co \ ON ch.recipientorganization_name_upper = co.companyname ;",engine) cnum[:10] #How many matches have we got? len(cnum) #We can bring in additional data about a company from it's company number using the OpenCorporates API import json, requests def openCorporatesLookupCompanyNumber(cnum): ''' Grab company data from OpenCorporates ''' url='https://api.opencorporates.com/v0.3/companies/gb/{0}?format=json'.format(cnum) jsondata = json.loads(requests.get(url).text) return jsondata['results'] corpdata=openCorporatesLookupCompanyNumber('04200429') #We can then do things like look the the list of officers officers = pd.DataFrame( [ d['officer'] for d in corpdata['company']['officers'] ] ) officers #We can also search OpenCorporates officers by name def openCorporatesOfficerSearch(officername): ''' Search for officers by exact match name ''' url='https://api.opencorporates.com/v0.3/officers/search' params={'q':'"'+officername+'"'} jsondata = json.loads(requests.get(url,params=params).text) data=[] for d in jsondata['results']['officers']: tmp=d['officer'] for i in ['jurisdiction_code','company_number']: tmp[i]=tmp['company'][i] tmp['company_name']=tmp['company']['name'] tmp.pop('company',None) data.append(tmp) df= pd.DataFrame( data ) return df openCorporatesOfficerSearch("JONATHAN MASHE THOMAS") #If we use the join to get a list of company numbers for companies in the 360 charities data, #we can then look these up in the Companies House datatable to see what postcodes they correspond to, #and then group and count on this basis. #That is, for compnay matched charities, what are the most popular postcodes? 
#If we use the join to get a list of company numbers for companies in the 360Giving data,
#we can then look these up in the Companies House table to see what postcodes they correspond to,
#and then group and count on this basis.
#That is, for company-matched charities, what are the most popular postcodes?
pcc = psql("SELECT regaddress_postcode, COUNT(*) FROM companydata \
            WHERE companynumber IN (SELECT DISTINCT co.companynumber \
                                    FROM charities360data ch JOIN companydata co \
                                    ON ch.recipientorganization_name_upper = co.companyname) \
            AND regaddress_postcode != 'NaN' \
            GROUP BY regaddress_postcode HAVING COUNT(*) > 5 ORDER BY COUNT(*) DESC;", engine)
pcc

#We can tunnel in on a particular postcode
mpc = psql("SELECT DISTINCT ch.fundingorganization_name, ch.recipientorganization_name_upper, co.companynumber, regaddress_addressline1 \
            FROM charities360data ch JOIN companydata co \
            ON ch.recipientorganization_name_upper = co.companyname \
            WHERE co.regaddress_postcode = 'HU19 2DL';", engine)
mpc

#Let's make it easier to run this sort of query...
#(Concatenating pc into the query string is fine for our own postcodes, but don't do this with untrusted input)
def searchByCompanyPostcode(pc):
    mpc = psql("SELECT DISTINCT ch.fundingorganization_name, ch.recipientorganization_name_upper, co.companynumber, regaddress_addressline1 \
                FROM charities360data ch JOIN companydata co \
                ON ch.recipientorganization_name_upper = co.companyname \
                WHERE co.regaddress_postcode = '"+pc+"';", engine)
    return mpc

searchByCompanyPostcode('KT15 2NX')
searchByCompanyPostcode('N9 0PZ')

#Grab the data file
!wget http://OpenCharities.org/charities.csv.zip -P data
#Uncompress it
!unzip data/charities.csv.zip -d data

!head -n 3 data/charities.csv

#Find out how wide each data field is to help us set up the database table
fname = 'data/charities.csv'
df = pd.read_csv(fname)
maxlens = {}
for name in df.columns:
    maxlens[name] = 0
chunks = pd.read_csv(fname, chunksize=10000)
for chunk in chunks:
    colMaxLen(chunk, maxlens)
maxlens

#If the table we're going to use already exists, get rid of it - we're going for a fresh start...
psqlx("DROP TABLE IF EXISTS charitiescommission", engine)
#PostgreSQL folds unquoted table names to lower case - we can force capitalisation on a table name by double quoting it
#psqlx('DROP TABLE IF EXISTS "charitiesCommission"', engine)

#Create the table using fields a bit bigger than the max width
sql = "CREATE TABLE charitiescommission ("
colstr = 'charity_name_upper' + " VARCHAR (" + str(maxlens['title']+5) + "), pcode_guess VARCHAR (10)"
for i in maxlens:
    tmp = i + " VARCHAR (" + str(maxlens[i]+5) + ")"
    colstr = ", ".join([colstr, tmp])
psqlx(sql + colstr + ");", engine)

#Get the data into the PostgreSQL database, via a pandas dataframe
chunks = pd.read_csv(fname, chunksize=10000, dtype={'charity_number': str})
for chunk in chunks:
    for col in ['title']:
        chunk[col] = chunk[col].str.strip()
    #I'm going to add another column which is a guess at the postcode extracted from the address
    chunk['pcode_guess'] = chunk['address'].str.extract('([A-Z]{1,2}[0-9]{1,2} [0-9][A-Z]{2})', expand=False)
    #Let's also get a normalised, upper case variant of the charity name
    chunk['charity_name_upper'] = chunk['title'].str.upper()
    chunk.to_sql('charitiescommission', engine, index=False, if_exists='append')

#A quick test query to see what we can join with the 360Giving data using charity name...
chcnum = psql("SELECT DISTINCT ch.recipientorganization_name_upper, cc.charity_number, cc.company_number, cc.pcode_guess \
               FROM charities360data ch JOIN charitiescommission cc \
               ON ch.recipientorganization_name_upper = cc.charity_name_upper;", engine)
chcnum[:10]

#How many matches were there?
len(chcnum)
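#As well as counting the matches, it can be useful to know how many normalised recipient
#names don't appear in the Charity Commission list at all. A minimal sketch of such a
#check, assuming the two tables loaded above (NOT EXISTS is used rather than NOT IN so
#that NULL charity names don't swallow the result).
psql("SELECT COUNT(DISTINCT ch.recipientorganization_name_upper) AS unmatched \
      FROM charities360data ch \
      WHERE NOT EXISTS (SELECT 1 FROM charitiescommission cc \
                        WHERE cc.charity_name_upper = ch.recipientorganization_name_upper);", engine)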
#Are there any popular postcodes?
chcnum[['pcode_guess']].groupby(['pcode_guess']).size().sort_values(ascending=False)[:20]

#As with popular postcodes for company registered addresses, there may be several reasons for popular charity address postcodes
#Let's have a quick look at charities registered to one of those postcodes
psql("SELECT DISTINCT charity_name_upper, address FROM charitiescommission WHERE pcode_guess='NE46 3NP'", engine)

#Although the Charity Commission doesn't publish an API, Open Charities does...
import json, requests

def opencharitiesLookup(charitynumber):
    url = 'http://opencharities.org/charities/{0}.json'.format(charitynumber)
    jsondata = json.loads(requests.get(url).text)
    return jsondata

#Let's see what we can get back...
charityNum = '202918'
jdata = opencharitiesLookup(charityNum)
jdata

#We can start to pull out specifics from this data...
chdata = jdata['charity']
fdata = chdata['financial_breakdown']
data = {}
#The data will include employee and volunteer numbers...
for tmp in ['volunteers', 'employees']:
    data[tmp] = chdata[tmp]
#...as well as financial information
for tmp in ['assets', 'spending', 'income']:
    if fdata is not None and tmp in fdata:
        for tmp2 in fdata[tmp]:
            data[tmp+'_'+tmp2] = fdata[tmp][tmp2]
data

#Or how about getting a list of trustees?
pd.DataFrame(jdata['charity']['trustees'])

#A search on the Charity Commission site - http://www.charitycommission.gov.uk/find-charities/ - for a particular charity
# includes a list of trustees. Clicking a trustee name shows you other charities for which they are a trustee.
#Unfortunately, OpenCharities doesn't seem to support this level of search - to implement it
# ourselves, we'd have to scrape the data...

#Simple function to look up the trustees of a charity using OpenCharities, by charity number
def lookupTrusteesForCharity(charitynumber):
    url = 'http://opencharities.org/charities/{0}.json'.format(charitynumber)
    jsondata = json.loads(requests.get(url).text)
    df = pd.DataFrame(jsondata['charity']['trustees'])
    df['charityname'] = jsondata['charity']['title']
    df['charitynumber'] = jsondata['charity']['charity_number']
    return df

lookupTrusteesForCharity('516163')

#Display charity classification areas
def lookupCharityClassification(charitynumber):
    url = 'http://opencharities.org/charities/{0}.json'.format(charitynumber)
    jsondata = json.loads(requests.get(url).text)
    df = pd.DataFrame(jsondata['charity']['classifications'])
    df['charityname'] = jsondata['charity']['title']
    df['charitynumber'] = jsondata['charity']['charity_number']
    return df

lookupCharityClassification('516163')

#Notes on sqlalchemy
#There's something odd going on - sqlalchemy table names may include case by default.
#To drop a table name with cased characters, enclose the name in "" in the DROP statement.
#I also seem to have a table that is listed but that doesn't exist?
#import sqlalchemy
#meta = sqlalchemy.MetaData(engine, schema='tm351test')
#meta.reflect()
#pdsql = pd.io.sql.PandasSQLAlchemy(engine, meta=meta)
#from sqlalchemy import inspect
#eng = inspect(engine)
#eng.get_columns('charitiescommission')
#dir(eng)
#eng.get_table_names()
#dir(chC)
#psql("SELECT table_schema, table_name FROM information_schema.tables \
#      ORDER BY table_schema, table_name;", engine)
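#One last convenience: the trustee and classification lookups above each re-fetch the same
#JSON document from OpenCharities, so a simple cache avoids hitting the API twice for the
#same charity number. A minimal sketch - cachedCharityLookup and _charityCache are
#hypothetical names, not part of the OpenCharities API.
_charityCache = {}

def cachedCharityLookup(charitynumber):
    ''' Fetch and cache the OpenCharities JSON record for a charity number. '''
    if charitynumber not in _charityCache:
        _charityCache[charitynumber] = opencharitiesLookup(charitynumber)
    return _charityCache[charitynumber]

#e.g. pd.DataFrame(cachedCharityLookup('516163')['charity']['trustees'])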