This example makes use of the Google/OpenRefine client that can be found at https://github.com/PaulMakepeace/refine-client-py
The example is inspired by, and draws heavily from, Trevor Muñoz's tutorial posted at http://nbviewer.ipython.org/gist/trevormunoz/6265360
First we need to import some necessary libraries...
!~/anaconda/bin/pip install git+https://github.com/PaulMakepeace/refine-client-py.git
Collecting git+https://github.com/PaulMakepeace/refine-client-py.git
Collecting urllib2-file (from refine-client==0.2.1)
Using cached urllib2_file-0.2.1.tar.gz
Building wheels for collected packages: urllib2-file
Running setup.py bdist_wheel for urllib2-file ... - \ done
Successfully built urllib2-file
Installing collected packages: urllib2-file, refine-client
Running setup.py install for refine-client ... - done
Successfully installed refine-client-0.2.1 urllib2-file-0.2.1
You are using pip version 7.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
!~/anaconda/bin/pip install refine-client
You are using pip version 7.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
from google.refine import refine, facet
Make sure you have OpenRefine running locally before going any further! (You can download and install it from OpenRefine.org.)
Once you've started OpenRefine in the normal way, get a link to the server...
server = refine.RefineServer()
orefine = refine.Refine(server)
Let's check everything's working by previewing three examples from the list of any projects that have previously been created...
orefine.list_projects().items()[:3]
[(u'1526469065323', {u'created': u'2015-02-01T13:29:38Z', u'customMetadata': {}, u'modified': u'2015-02-01T13:29:38Z', u'name': u'UK MPs'}), (u'2104805151035', {u'created': u'2015-09-10T23:45:39Z', u'customMetadata': {}, u'modified': u'2015-09-10T23:45:39Z', u'name': u'localweblist csv'}), (u'2132091692801', {u'created': u'2015-09-04T10:40:12Z', u'customMetadata': {}, u'modified': u'2015-09-04T10:40:12Z', u'name': u'Data on Written Complaints in the NHS 2014 15 KO41b csv csv'})]
If we want to work with one of these projects, we can open it directly:
project_file = 'smalltest.csv'
p=orefine.new_project(project_file=project_file)
pr=p.get_rows(limit=10)
dir(pr.rows[0])
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'flagged', 'index', 'row', 'starred']
pr.rows[0].row
[u'! LTD', 8209948, None, None, u'METROHOUSE 57 PEPPER ROAD', u'HUNSLET', u'LEEDS', u'YORKSHIRE', u'UNITED KINGDOM', u'LS10 2RU', u'Private Limited Company', u'Active', u'United Kingdom', None, u'11/09/2012', 30, 9, u'11/06/2014', None, u'NO ACCOUNTS FILED', u'09/10/2014', u'11/09/2013', 0, 0, 0, 0, u'99999 - Dormant Company', None, None, None, 0, 0, u'http://business.data.gov.uk/id/company/08209948', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
f=facet.TextFacet('CompanyName')
dir(f)
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'as_dict', 'column_name', 'exclude', 'expression', 'include', 'invert', 'name', 'omit_blank', 'omit_error', 'reset', 'select_blank', 'select_error', 'selection', 'type']
fr=p.compute_facets(facet.TextFacet('CompanyName'))
facets = fr.facets[0]
for k in sorted(facets.choices, key=lambda k: facets.choices[k].count, reverse=True):
print facets.choices[k].count, k
1 !YOZO FASS LIMITED 1 !OBAC UK LIMITED 1 !NFERNO LTD. 1 !ST MEDIA SOUTHAMPTON LTD 1 !OBAC INSTALLATIONS LIMITED 1 !NSPIRED LTD 1 !OBAC MAGNETIC FLOOR TECHNOLOGY 1 ! LTD 1 !BIG IMPACT GRAPHICS LIMITED
dir(pr.rows.rows_response)
pr.rows.rows_response
[{u'cells': [{u'v': u'! LTD'}, {u'v': 8209948}, None, None, {u'v': u'METROHOUSE 57 PEPPER ROAD'}, {u'v': u'HUNSLET'}, {u'v': u'LEEDS'}, {u'v': u'YORKSHIRE'}, {u'v': u'UNITED KINGDOM'}, {u'v': u'LS10 2RU'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'11/09/2012'}, {u'v': 30}, {u'v': 9}, {u'v': u'11/06/2014'}, None, {u'v': u'NO ACCOUNTS FILED'}, {u'v': u'09/10/2014'}, {u'v': u'11/09/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'99999 - Dormant Company'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/08209948'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 0, u'starred': False}, {u'cells': [{u'v': u'!BIG IMPACT GRAPHICS LIMITED'}, {u'v': 7382019}, None, None, {u'v': u'335 ROSDEN HOUSE'}, {u'v': u'372 OLD STREET'}, {u'v': u'LONDON'}, None, None, {u'v': u'EC1V 9AV'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'21/09/2010'}, {u'v': 30}, {u'v': 9}, {u'v': u'30/06/2014'}, {u'v': u'30/09/2012'}, {u'v': u'DORMANT'}, {u'v': u'19/10/2014'}, {u'v': u'21/09/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'59112 - Video production activities'}, {u'v': u'59113 - Television programme production activities'}, {u'v': u'74100 - specialised design activities'}, {u'v': u'74202 - Other specialist photography'}, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/07382019'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 1, u'starred': False}, {u'cells': [{u'v': u'!NFERNO LTD.'}, {u'v': 4753368}, None, None, {u'v': u'FIRST FLOOR THAVIES INN HOUSE 3-4'}, {u'v': u'HOLBORN CIRCUS'}, {u'v': u'LONDON'}, None, None, {u'v': u'EC1N 2HA'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'03/05/2003'}, {u'v': 31}, {u'v': 10}, {u'v': u'31/07/2014'}, {u'v': u'31/10/2012'}, {u'v': u'TOTAL EXEMPTION FULL'}, {u'v': u'31/05/2014'}, {u'v': u'03/05/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'82990 - Other business support service activities n.e.c.'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/04753368'}, {u'v': u'27/01/2009'}, {u'v': u'DISTINCTIVE IMPRINT WORLDWIDE LIMITED'}, {u'v': u'02/09/2003'}, {u'v': u'DISTINCTIVE IMPRINT CONSULTANTS LIMITED'}, {u'v': u'03/07/2003'}, {u'v': u'AMEROOLA CONSULTANTS LIMITED'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 2, u'starred': False}, {u'cells': [{u'v': u'!NSPIRED LTD'}, {u'v': u'SC421617'}, None, None, {u'v': u'12 BON ACCORD SQUARE'}, None, {u'v': u'ABERDEEN'}, None, {u'v': u'SCOTLAND'}, {u'v': u'AB11 6DJ'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'11/04/2012'}, {u'v': 31}, {u'v': 3}, {u'v': u'31/12/2013'}, None, {u'v': u'NO ACCOUNTS FILED'}, {u'v': u'09/05/2014'}, {u'v': u'11/04/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'70229 - Management consultancy activities other than financial management'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/SC421617'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 3, u'starred': False}, {u'cells': [{u'v': u'!OBAC INSTALLATIONS LIMITED'}, {u'v': 7527820}, None, None, {u'v': u'DEVONSHIRE HOUSE'}, {u'v': u'60 GOSWELL ROAD'}, {u'v': u'LONDON'}, None, {u'v': u'UNITED KINGDOM'}, {u'v': u'EC1M 7AD'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'14/02/2011'}, {u'v': 31}, {u'v': 12}, {u'v': u'30/09/2014'}, {u'v': u'31/12/2012'}, {u'v': u'DORMANT'}, {u'v': u'14/03/2014'}, {u'v': u'14/02/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'43330 - Floor and wall covering'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/07527820'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 4, u'starred': False}, {u'cells': [{u'v': u'!OBAC MAGNETIC FLOOR TECHNOLOGY'}, {u'v': u'FC031362'}, None, None, {u'v': u'1ST AND 2ND FLOORS ELIZABETH HOUSE'}, {u'v': u'LES RUETIES BRAYES'}, {u'v': u'ST PETER PORT'}, {u'v': u'GY1 1EW'}, {u'v': u'GUERNSEY'}, None, {u'v': u'Other company type'}, {u'v': u'Active'}, {u'v': u'CHANNEL ISLANDS'}, None, {u'v': u'30/11/2012'}, {u'v': 31}, {u'v': 12}, None, None, {u'v': u'NO ACCOUNTS FILED'}, None, None, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'None Supplied'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/FC031362'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 5, u'starred': False}, {u'cells': [{u'v': u'!OBAC UK LIMITED'}, {u'v': 7687209}, None, None, {u'v': u'DEVONSHIRE HOUSE'}, {u'v': u'60 GOSWELL ROAD'}, {u'v': u'LONDON'}, None, {u'v': u'UNITED KINGDOM'}, {u'v': u'EC1M 7AD'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'29/06/2011'}, {u'v': 29}, {u'v': 12}, {u'v': u'29/09/2014'}, {u'v': u'29/12/2012'}, {u'v': u'TOTAL EXEMPTION FULL'}, {u'v': u'27/07/2014'}, {u'v': u'29/06/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'70229 - Management consultancy activities other than financial management'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/07687209'}, {u'v': u'15/07/2011'}, {u'v': u'!OBAC FITTINGS LIMITED'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 6, u'starred': False}, {u'cells': [{u'v': u'!ST MEDIA SOUTHAMPTON LTD'}, {u'v': 7904170}, None, None, {u'v': u'10 NORTHBROOK HOUSE'}, {u'v': u'FREE STREET, BISHOPS WALTHAM'}, {u'v': u'SOUTHAMPTON'}, None, {u'v': u'ENGLAND'}, {u'v': u'SO32 1NP'}, {u'v': u'Private Limited Company'}, {u'v': u'Active - Proposal to Strike off'}, {u'v': u'United Kingdom'}, None, {u'v': u'10/01/2012'}, {u'v': 31}, {u'v': 1}, {u'v': u'10/10/2013'}, None, {u'v': u'NO ACCOUNTS FILED'}, {u'v': u'07/02/2013'}, None, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'None Supplied'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/07904170'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 7, u'starred': False}, {u'cells': [{u'v': u'!YOZO FASS LIMITED'}, {u'v': 2714021}, None, None, {u'v': u'1 VERONICA HOUSE'}, {u'v': u'WICKHAM ROAD'}, {u'v': u'BROCKLEY'}, None, None, {u'v': u'SE4 1NQ'}, {u'v': u'Private Limited Company'}, {u'v': u'Active'}, {u'v': u'United Kingdom'}, None, {u'v': u'12/05/1992'}, {u'v': 31}, {u'v': 3}, {u'v': u'31/12/2013'}, {u'v': u'31/03/2012'}, {u'v': u'DORMANT'}, {u'v': u'09/06/2014'}, {u'v': u'12/05/2013'}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': 0}, {u'v': u'90010 - Performing arts'}, None, None, None, {u'v': 0}, {u'v': 0}, {u'v': u'http://business.data.gov.uk/id/company/02714021'}, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], u'flagged': False, u'i': 8, u'starred': False}]
dir(pr)
['RefineRows', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'filtered', 'limit', 'mode', 'rows', 'start', 'total']
dir(p)
['__doc__', '__init__', '__module__', 'add_column', 'annotate_one_row', 'apply_operations', 'blank_down', 'clusterer_defaults', 'column_order', 'columns', 'compute_clusters', 'compute_facets', 'delete', 'do_json', 'do_raw', 'edit', 'engine', 'export', 'export_rows', 'fill_down', 'flag_row', 'get_models', 'get_preference', 'get_reconciliation_service_by_name_or_url', 'get_reconciliation_services', 'get_rows', 'guess_types_of_column', 'has_records', 'history_entry', 'key_column', 'mass_edit', 'move_column', 'project_id', 'project_name', 'project_url', 'recon_services', 'reconcile', 'remove_rows', 'rename_column', 'reorder_columns', 'reorder_rows', 'rows_response_factory', 'server', 'sorting', 'split_column', 'star_row', 'text_transform', 'transpose_columns_into_rows', 'transpose_rows_into_columns', 'wait_until_idle']
p.columns
[u'CompanyName', u'CompanyNumber', u'RegAddress.CareOf', u'RegAddress.POBox', u'RegAddress.AddressLine1', u'RegAddress.AddressLine2', u'RegAddress.PostTown', u'RegAddress.County', u'RegAddress.Country', u'RegAddress.PostCode', u'CompanyCategory', u'CompanyStatus', u'CountryOfOrigin', u'DissolutionDate', u'IncorporationDate', u'Accounts.AccountRefDay', u'Accounts.AccountRefMonth', u'Accounts.NextDueDate', u'Accounts.LastMadeUpDate', u'Accounts.AccountCategory', u'Returns.NextDueDate', u'Returns.LastMadeUpDate', u'Mortgages.NumMortCharges', u'Mortgages.NumMortOutstanding', u'Mortgages.NumMortPartSatisfied', u'Mortgages.NumMortSatisfied', u'SICCode.SicText_1', u'SICCode.SicText_2', u'SICCode.SicText_3', u'SICCode.SicText_4', u'LimitedPartnerships.NumGenPartners', u'LimitedPartnerships.NumLimPartners', u'URI', u'PreviousName_1.CONDATE', u'PreviousName_1.CompanyName', u'PreviousName_2.CONDATE', u'PreviousName_2.CompanyName', u'PreviousName_3.CONDATE', u'PreviousName_3.CompanyName', u'PreviousName_4.CONDATE', u'PreviousName_4.CompanyName', u'PreviousName_5.CONDATE', u'PreviousName_5.CompanyName', u'PreviousName_6.CONDATE', u'PreviousName_6.CompanyName', u'PreviousName_7.CONDATE', u'PreviousName_7.CompanyName', u'PreviousName_8.CONDATE', u'PreviousName_8.CompanyName', u'PreviousName_9.CONDATE', u'PreviousName_9.CompanyName', u'PreviousName_10.CONDATE', u'PreviousName_10.CompanyName']