%load_ext sql %sql postgresql://ipython:ipython@localhost/okf_gw # For some reason, the following query (which should work?) returns an empty set for me? # %sql select schema_name from information_schema.schemata; %sql SELECT * from pg_namespace ; %sql SELECT table_name FROM information_schema.tables WHERE table_schema = 'ukcompanies'; %sql SELECT table_name, column_name FROM information_schema.columns WHERE table_schema='ukcompanies'; result = %sql SELECT companyname,companynumber,regaddressaddressline1,regaddresspostcode, \ companycategory,companystatus,dissolutiondate,incorporationdate FROM ukcompanies.basic LIMIT 2; result #import pandas df=result.DataFrame() df from google.refine import refine, facet server = refine.RefineServer() orefine = refine.Refine(server) import matplotlib.pyplot as plt from ggplot import * top50pc=%sql SELECT ra_postal_code, COUNT(*) FROM ukcompanies.company_export \ GROUP BY ra_postal_code ORDER BY COUNT(*) DESC; top50pc.DataFrame()[:5] %sql SELECT DISTINCT current_status FROM ukcompanies.company_export postcode='B18 6EW' addr = %sql SELECT * FROM (\ SELECT ra_street_address, COUNT(*) FROM ukcompanies.company_export\ WHERE ra_postal_code = :postcode GROUP BY ra_street_address \ ORDER BY COUNT(*) DESC ) AS tmp WHERE count >1; addr csv_file='test.csv' addr.DataFrame().to_csv(csv_file,index=False) p=orefine.new_project(project_file=csv_file) fr=p.compute_facets(facet.TextFacet('ra_street_address')) facets = fr.facets[0] for k in sorted(facets.choices, key=lambda k: facets.choices[k].count, reverse=True): print facets.choices[k].count, k pr=p.get_rows(limit=10) pr.rows[0].row cluster_response = p.compute_clusters('ra_street_address',function='metaphone3') for cluster in cluster_response[:5]: print '\n' for line in cluster: print line p.mass_edit('ra_street_address', [{ 'from': ['69 GREAT HAMPTON STREET, BIRMINGHAM, WEST MIDLANDS', '69 GREAT HAMPTON STREET, BIRMINGHAM'], 'to': '69 GREAT HAMPTON STREET' }]) pr=p.get_rows(limit=10) for r in pr.rows: print r.row os = %sql SELECT name, COUNT(*) FROM ukcompanies.officer_export AS o WHERE o.company_id IN (\ SELECT id FROM ukcompanies.company_export\ WHERE ra_postal_code = :postcode ) AND position LIKE '%secretary%' AND inactive !='true' \ GROUP BY name ORDER BY COUNT(*) DESC LIMIT 5 os secretary='GO AHEAD SERVICE L%' oa= %sql SELECT c.ra_street_address, COUNT(*) FROM ukcompanies.company_export AS c WHERE id IN \ (SELECT company_id AS id FROM ukcompanies.officer_export \ WHERE name LIKE :secretary AND position LIKE '%secretary%' AND inactive !='true' ) \ GROUP BY c.ra_street_address ORDER BY COUNT(*) DESC LIMIT 10 oa occid='116469' officersByCompany = %sql SELECT * FROM ukcompanies.officer_export WHERE company_id= :occid ; officersByCompany.DataFrame()[:3] companyNumber ='00804867' officersByCompanyID = %sql SELECT o.name, o.id AS "OfficerID", c.name, c.company_number, c.id AS "OCID" \ FROM ukcompanies.officer_export AS o JOIN ukcompanies.company_export AS c ON c.id=o.company_id \ WHERE c.company_number = :companyNumber officersByCompanyID.DataFrame()[:10] gps=%sql SELECT c.company_number, c.name, o.company_id, COUNT(*) \ FROM ukcompanies.officer_export AS o JOIN ukcompanies.company_export AS c \ ON o.company_id=c.id \ GROUP BY c.company_number, c.name, o.company_id ORDER BY count DESC LIMIT 10; gps gps=%sql SELECT company_id, name, COUNT(*) OVER (PARTITION BY company_id) \ FROM ukcompanies.officer_export ORDER BY count DESC LIMIT 10; gps gps=%sql SELECT o.company_id, c.company_number, c.name, o.name, o.count FROM ( \ SELECT company_id, name, COUNT(*) OVER (PARTITION BY company_id) \ FROM ukcompanies.officer_export ) AS o \ JOIN ukcompanies.company_export AS c ON c.id = o.company_id ORDER BY o.count DESC LIMIT 10 gps gps=%sql SELECT company_id, name, count FROM (\ SELECT company_id, name, COUNT(*) OVER (PARTITION BY company_id) \ FROM ukcompanies.officer_export WHERE end_date!='' LIMIT 100 \ ) AS multidir WHERE count>8 ORDER BY count DESC LIMIT 20; gps test=%sql SELECT company_id,COUNT(*) FROM ukcompanies.officer_export WHERE name IN \ ( SELECT name FROM ukcompanies.officer_export WHERE company_id='100025') \ GROUP BY company_id HAVING COUNT(*)>1; test cid='100025' %sql SELECT c.name, o.company_id, COUNT(*) FROM ukcompanies.officer_export o JOIN ukcompanies.company_export c \ ON c.id=o.company_id WHERE o.name IN \ ( SELECT name FROM ukcompanies.officer_export WHERE company_id= :cid) \ GROUP BY c.name, o.company_id HAVING COUNT(*)>1; ts1=%sql select DISTINCT name FROM ukcompanies.officer_export where company_id='75047'\ AND name IN (SELECT DISTINCT name FROM ukcompanies.officer_export where company_id='100025') ORDER BY name; ts1 test=%sql SELECT company_id,COUNT(*) FROM ukcompanies.officer_export WHERE name IN \ ( SELECT name FROM ukcompanies.officer_export WHERE company_id='100025' AND end_date!='') \ GROUP BY company_id HAVING COUNT(*)>1; test test=%sql SELECT company_id,count(*) FROM ukcompanies.officer_export WHERE name IN \ ( SELECT name FROM ukcompanies.officer_export WHERE company_id='100025' AND end_date!='') \ AND end_date !='' GROUP BY company_id HAVING count(*)>1; test eos= %sql SELECT name,position, COUNT(*) FROM ukcompanies.officer_export \ GROUP BY name, position ORDER BY COUNT(*) DESC LIMIT 10 eos ods= %sql SELECT o.name AS "officer_name", o.start_date, o.end_date, o.inactive, c.name AS "company_name", c.company_number \ FROM ukcompanies.officer_export o JOIN ukcompanies.company_export c ON o.company_id=c.id \ WHERE o.name='YOMTOV ELIEZER JACOBS' \ AND o.position='director' ods.DataFrame()[:10] ods_df=ods.DataFrame() ods_df['start_date'] = ods_df['start_date'].astype('datetime64[ns]') ods_df['end_date'] = ods_df['end_date'].astype('datetime64[ns]') ods_df=ods_df.sort(['start_date','end_date']) ods_df['acc']=range(1,len(ods_df['company_number'])+1) ods_df[:3] g= ggplot(ods_df, aes('start_date','acc')) + \ geom_line(color='black') g fig=plt.figure() ax1 = fig.add_subplot(111) ax1.scatter(ods_df['start_date'],ods_df['acc'],c='r',marker='x',s=1) tmp=ods_df[ods_df['end_date'].notnull()] ax1.scatter(tmp['end_date'],tmp['acc'],c='b',marker='+',s=1) ax1.figure.show() ods_df['duration']=ods_df['end_date']-ods_df['start_date'] #The difference is provided in nanoseconds. Not sure how to convert this to days? ods_df[:5] ods_df.groupby('duration').size() ods_df.groupby('duration').size().plot() %sql DROP VIEW officerdates %sql CREATE VIEW officerdates AS \ SELECT *,to_date(start_date,'YYYY-MM-DD') startdate, to_date(end_date,'YYYY-MM-DD') enddate, \ to_date(end_date,'YYYY-MM-DD')-to_date(start_date,'YYYY-MM-DD') diffdate \ FROM ukcompanies.officer_export WHERE start_date ~ '[0-9]*-[0-9]*-[0-9]*' AND end_date~ '[0-9]*-[0-9]*-[0-9]*' %sql DROP VIEW companydates %sql CREATE VIEW companydates AS \ SELECT *,to_date(incorporation_date,'YYYY-MM-DD') startdate, to_date(dissolution_date,'YYYY-MM-DD') enddate,\ to_date(dissolution_date,'YYYY-MM-DD')-to_date(incorporation_date,'YYYY-MM-DD') diffdate \ FROM ukcompanies.company_export WHERE incorporation_date ~ '[0-9]*-[0-9]*-[0-9]*' \ AND dissolution_date ~ '[0-9]*-[0-9]*-[0-9]*' %sql SELECT * FROM companydates LIMIT 3 %sql SELECT name, COUNT(*) FROM officerdates WHERE diffdate = 0 GROUP BY name ORDER BY count DESC LIMIT 10 %sql SELECT * FROM companydates WHERE diffdate < 150 ORDER BY diffdate LIMIT 10 officerType= %sql SELECT DISTINCT position FROM ukcompanies.officer_export ORDER BY position; officerType tmp = %sql SELECT * FROM ukcompanies.officer_export \ WHERE position LIKE 'nominated%' AND inactive !='true' LIMIT 3; tmp %sql DROP VIEW currentOfficers %sql CREATE OR REPLACE VIEW currentOfficers AS \ SELECT * FROM (\ SELECT *, COUNT(*) OVER (PARTITION BY company_id) \ FROM ukcompanies.officer_export WHERE inactive !='true' \ ) AS multidir; %sql ALTER TABLE currentOfficers RENAME COLUMN count TO activeOfficers; tmp = %sql SELECT * FROM currentOfficers LIMIT 5; tmp %sql SELECT * FROM ( \ SELECT *, COUNT(*) OVER (PARTITION BY company_id) FROM currentOfficers \ WHERE position LIKE 'nominated%' ) as tmp \ WHERE count=activeofficers ORDER BY company_id; shells = %sql SELECT DISTINCT o.company_id, c.name, c.current_status FROM ( \ SELECT *, COUNT(*) OVER (PARTITION BY company_id) FROM currentOfficers \ WHERE position LIKE 'nominated%' ) AS o JOIN ukcompanies.company_export c\ on c.id = o.company_id WHERE count=activeofficers ORDER BY company_id LIMIT 10; shells %sql SELECT * FROM ukcompanies.officer_export WHERE company_id='80927' %sql CREATE OR REPLACE VIEW officers2 AS \ SELECT * \ FROM ( SELECT *, reverse(split_part(reverse(name),' ',1)) as lastname \ FROM ukcompanies.officer_export) as tmp; cln=%sql SELECT lastname, COUNT(*) FROM officers2 GROUP BY lastname ORDER BY COUNT(*) DESC LIMIT 10; cln %sql SELECT lastname, COUNT(*) FROM companies2 GROUP BY lastname HAVING COUNT(*) > 20 ORDER BY count DESC; %sql CREATE OR REPLACE VIEW companies2 AS \ SELECT * FROM \ ( SELECT *, reverse(split_part(reverse(name),' ',1)) AS lastname \ FROM ukcompanies.company_export ) AS tmp; %sql SELECT company_type, lastname, COUNT(*) FROM companies2 \ GROUP BY company_type,lastname HAVING COUNT(*) > 20 ORDER BY company_type; %sql CREATE OR REPLACE VIEW officers4 AS \ SELECT * FROM officers2 WHERE lastname IN (SELECT lastname FROM ( SELECT company_type, lastname, COUNT(*) FROM companies2 \ GROUP BY company_type,lastname HAVING COUNT(*) > 20 ORDER BY company_type ) AS tmp) %sql SELECT name,position, COUNT(*) FROM officers2 \ WHERE lastname NOT IN (SELECT lastname FROM companies2 GROUP BY lastname ORDER BY COUNT(*) DESC LIMIT 10) \ GROUP BY name, position ORDER BY COUNT(*) DESC LIMIT 10 %sql SELECT name,position, COUNT(*) FROM officers2 \ WHERE lastname NOT IN (SELECT lastname FROM companies2 GROUP BY lastname ORDER BY COUNT(*) DESC LIMIT 10) \ AND position NOT ILIKE '%nominated%' GROUP BY name, position ORDER BY COUNT(*) DESC LIMIT 10 %sql SELECT name,position, COUNT(*) FROM officers2 \ WHERE lastname NOT IN (SELECT lastname FROM companies2 GROUP BY lastname ORDER BY COUNT(*) DESC LIMIT 10) \ AND inactive !='true' AND position NOT ILIKE '%nominated%' \ GROUP BY name, position ORDER BY COUNT(*) DESC LIMIT 10 %sql SELECT c.name, c.ra_postal_code, o.name FROM ukcompanies.company_export c, ukcompanies.officer_export o \ WHERE o.name = 'ROSALYN SHARON SCHOFIELD' AND o.company_id = c.id AND \ position ='secretary' AND inactive !='true' %sql SELECT c.ra_postal_code, o.name, COUNT(*) FROM ukcompanies.company_export c, ukcompanies.officer_export o \ WHERE o.name = 'ROSALYN SHARON SCHOFIELD' AND o.company_id = c.id AND \ position ='secretary' AND inactive !='true' \ GROUP BY c.ra_postal_code, o.name ORDER BY count DESC %sql SELECT name FROM ukcompanies.company_export WHERE ra_postal_code ='W1K 4QY' AND name ILIKE '%PLC%' %sql SELECT name,date_of_birth, COUNT(*) FROM ukcompanies.officer_export \ WHERE date_of_birth !='' GROUP BY name, date_of_birth ORDER BY count DESC LIMIT 10; cname='JOHN SMITH' %sql SELECT DISTINCT name FROM ukcompanies.officer_export WHERE name @@ :cname ORDER BY name LIMIT 5; cbdate='1932-09-18' %sql SELECT * FROM (SELECT *, COUNT(*) OVER (PARTITION BY date_of_birth) FROM (SELECT DISTINCT name, date_of_birth \ FROM ukcompanies.officer_export WHERE name @@ :cname AND date_of_birth!='' ) AS tmp ) AS tmp2\ WHERE count > 1 ORDER BY count DESC; %sql SELECT * FROM (SELECT *, COUNT(*) OVER (PARTITION BY address) FROM (SELECT DISTINCT name, address \ FROM ukcompanies.officer_export WHERE name @@ :cname ) AS tmp ) AS tmp2\ WHERE count > 1 ORDER BY count DESC; %sql SELECT * FROM ukcompanies.officer_export WHERE name='JOHN JAYCOT SMITH'; %sql SELECT * FROM (SELECT *, COUNT(*) OVER (PARTITION BY address) FROM (SELECT DISTINCT name, address \ FROM ukcompanies.officer_export WHERE name % :cname ) AS tmp ) AS tmp2\ WHERE count > 1 ORDER BY count DESC LIMIT 20; %sql SELECT * FROM (SELECT *, COUNT(*) OVER (PARTITION BY address) FROM (SELECT DISTINCT name, address \ FROM ukcompanies.officer_export WHERE name % :cname ) AS tmp ) AS tmp2\ WHERE count > 2 ORDER BY address; pc=%sql SELECT DISTINCT pc[1] FROM \ (SELECT regexp_matches(address, ' ([A_Z][A-Z]?[0-9]+[A-Z]? [0-9][A-Z]+)', 'g') pc FROM ukcompanies.officer_export) \ AS tmp LIMIT 5; pc %sql CREATE OR REPLACE VIEW officers3 AS \ SELECT id,company_id,name,position,start_date,end_date,date_of_birth,inactive,occupation,nationality,address,lastname,pc[1] \ FROM ( SELECT *, reverse(split_part(reverse(name),' ',1)) as lastname, \ regexp_matches(address, ' ([A_Z][A-Z]?[0-9]+[A-Z]? [0-9][A-Z]+)', 'g') as pc \ FROM ukcompanies.officer_export) as tmp; %sql SELECT address,lastname,pc FROM officers3 LIMIT 3; %sql SELECT lastname, COUNT(*) FROM officers3 GROUP BY lastname ORDER BY count DESC LIMIT 10; %sql SELECT ra_country, COUNT(*) FROM ukcompanies.company_export GROUP BY ra_country ORDER BY count DESC %sql SELECT * FROM ukcompanies.company_export WHERE ra_country ILIKE '%virgin%' LIMIT 3 %sql SELECT name, count(*) FROM ukcompanies.officer_export WHERE address ILIKE '%virgin%' \ GROUP BY name ORDER BY count DESC limit 10 %sql SELECT name, COUNT(*) FROM ukcompanies.officer_export WHERE\ company_id IN ( SELECT id FROM ukcompanies.company_export WHERE ra_country ILIKE '%virgin%' )\ GROUP BY name ORDER BY count DESC LIMIT 10 sql SELECT * FROM ukcompanies.officer_export WHERE name='SARAH FEZER' sql SELECT * FROM ukcompanies.company_export WHERE \ id IN (SELECT DISTINCT company_id FROM ukcompanies.officer_export WHERE name='SARAH FEZER') sql SELECT * FROM ukcompanies.company_export WHERE \ id IN (SELECT DISTINCT company_id FROM ukcompanies.officer_export WHERE name='CHARLES PETER BITHELL')