# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
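Then, read the (sample) input table. The same file is loaded twice (as `A` and `B`) so that each data exploration tool below works on its own copy.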
# Locate the 'datasets' directory bundled with the py_entitymatching install.
# os.path.join is used instead of manual os.sep concatenation so separators
# are handled consistently.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the path of the sample input table
path = os.path.join(datasets_dir, 'dblp_demo.csv')
# Read the CSV file and set 'id' as the key attribute. The same file is read
# twice so that each exploration tool below gets its own copy of the table.
A = em.read_csv_metadata(path, key='id')
B = em.read_csv_metadata(path, key='id')
# Preview the first rows of the freshly loaded table
A.head()
Metadata file is not present in the given path; proceeding to read the csv file. Metadata file is not present in the given path; proceeding to read the csv file.
| | id | title | authors | venue | year |
---|---|---|---|---|---|
0 | l0 | Paradise: A Database System for GIS Applications | Paradise Team | SIGMOD Conference | 1995 |
1 | l1 | A Query Language and Optimization Techniques for Unstructured Data | Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu | SIGMOD Conference | 1996 |
2 | l2 | Turbo-charging Vertical Mining of Large Databases | Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia | SIGMOD Conference | 2000 |
3 | l3 | Maintenance of Data Cubes and Summary Tables in a Warehouse | Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick | SIGMOD Conference | 1997 |
4 | l4 | On Relational Support for XML Publishing: Beyond Sorting and Tagging | Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri | SIGMOD Conference | 2003 |
This notebook demonstrates two different data exploration tools: OpenRefine and pandastable. OpenRefine is supported on Python 2.7 and 3.5, whereas pandastable is supported only on Python 3.5.
# Invoke the OpenRefine GUI for data exploration on table A; the returned
# project handle `p` is used below to pull the data back into a dataframe
p = em.data_explore_openrefine(A, name='Table')
# Save the project back to our dataframe.
# After calling export_pandas_frame, the OpenRefine project will be deleted automatically
A = p.export_pandas_frame()
# Preview the first rows of the exported table
A.head()
| | id | title | authors | venue | year |
---|---|---|---|---|---|
0 | l0 | You can modify data if necessary using OpenRefine | Paradise Team | SIGMOD Conference | 1995 |
1 | l1 | A Query Language and Optimization Techniques for Unstructured Data | Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu | SIGMOD Conference | 1996 |
2 | l2 | Turbo-charging Vertical Mining of Large Databases | Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia | SIGMOD Conference | 2000 |
3 | l3 | Maintenance of Data Cubes and Summary Tables in a Warehouse | Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick | SIGMOD Conference | 1997 |
4 | l4 | On Relational Support for XML Publishing: Beyond Sorting and Tagging | Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri | SIGMOD Conference | 2003 |
# Invoke the pandastable GUI for data exploration on table B.
# The process will be blocked until the GUI is closed.
em.data_explore_pandastable(B)
# Preview the first rows of B after the GUI session. The return value of the
# call above is not used — presumably edits made in the GUI are applied to B
# in place (TODO confirm).
B.head()
| | id | title | authors | venue | year |
---|---|---|---|---|---|
0 | l0 | You can modify data if necessary using pandastable | Paradise Team | SIGMOD Conference | 1995 |
1 | l1 | A Query Language and Optimization Techniques for Unstructured Data | Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu | SIGMOD Conference | 1996 |
2 | l2 | Turbo-charging Vertical Mining of Large Databases | Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia | SIGMOD Conference | 2000 |
3 | l3 | Maintenance of Data Cubes and Summary Tables in a Warehouse | Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick | SIGMOD Conference | 1997 |
4 | l4 | On Relational Support for XML Publishing: Beyond Sorting and Tagging | Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri | SIGMOD Conference | 2003 |