#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# Notebook export: loads the sample table 'dblp_demo.csv' from the
# py_entitymatching datasets directory and opens it in two data exploration
# GUIs (OpenRefine and pandastable).

# In[1]:

import os

import pandas as pd
# Import py_entitymatching package
import py_entitymatching as em

# Then, read the (sample) input tables

# In[2]:

# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the path of the input table
path = os.path.join(datasets_dir, 'dblp_demo.csv')

# In[3]:

# Read the CSV file and set 'id' as the key attribute.
# The same file is read twice so that each exploration tool below gets its
# own table object.
A = em.read_csv_metadata(path, key='id')
B = em.read_csv_metadata(path, key='id')
A.head()

# # Data Exploration
#
# This notebook demonstrates two different data exploration tools.
# OpenRefine is supported on Python 2.7 and 3.5; pandastable is supported
# only on Python 3.5.

# ## OpenRefine

# In[4]:

# Invoke the OpenRefine GUI for data exploration
p = em.data_explore_openrefine(A, name='Table')

# In[5]:

# Save the project back to our dataframe.
# After calling export_pandas_frame, the OpenRefine project will be deleted
# automatically.
A = p.export_pandas_frame()

# In[6]:

A.head()

# ## Pandastable

# In[7]:

# Invoke the pandastable GUI for data exploration.
# The process will be blocked until the GUI is closed.
em.data_explore_pandastable(B)

# In[8]:

B.head()

# In[ ]: