#!/usr/bin/env python
# coding: utf-8

# # Introduction

# This IPython notebook illustrates how to sample and label a table (candidate set).
# First, we need to import the py_entitymatching package and other libraries as follows:

# In[1]:


# Import the py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


# In[2]:


# Locate the 'datasets' directory under the py_entitymatching install path and
# build the paths of the three CSV files used below. os.path.join is used
# instead of manual os.sep concatenation so the separator handling is left to
# the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

path_A = os.path.join(datasets_dir, 'DBLP.csv')
path_B = os.path.join(datasets_dir, 'ACM.csv')
path_C = os.path.join(datasets_dir, 'tableC.csv')


# In[3]:


# Read the two input tables; both use 'id' as their key column.
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Read the candidate set. Its key is '_id', and 'ltable_id' / 'rtable_id' are
# declared as foreign keys referring to tables A and B respectively.
C = em.read_csv_metadata(path_C, key='_id',
                         fk_ltable='ltable_id', fk_rtable='rtable_id',
                         ltable=A, rtable=B)


# In[4]:


C.head()


# In[5]:


len(C)


# # Sample Candidate Set

# From the candidate set, a sample (for labeling purposes) can be obtained like this:

# In[6]:


# Request a sample of size 450 from the candidate set.
S = em.sample_table(C, 450)


# # Label the Sampled Set

# In[7]:


# Label the sampled set
# Specify the name for the label column
G = em.label_table(S, 'gold_label')


# The user must specify 0 for non-match and 1 for match. Typically, the sampling
# and labeling steps are done in iterations (until we get a sufficient density
# of matches). Once labeled, the labeled data set will look like this:

# In[8]:


# Assume that we have labeled the data and stored it in
# labeled_data_demo.csv
# NOTE: this rebinds G, replacing the result of the label_table call above
# with the pre-labeled demo data read from disk.
path_labeled_data = os.path.join(datasets_dir, 'labeled_data_demo.csv')
G = em.read_csv_metadata(path_labeled_data, key='_id',
                         fk_ltable='ltable_id', fk_rtable='rtable_id',
                         ltable=A, rtable=B)


# In[9]:


G.head()