#!/usr/bin/env python
# coding: utf-8

# # Introduction

# This IPython notebook illustrates how to sample and label a table (candidate set).
# First, we need to import the py_entitymatching package and other libraries as follows:

# In[1]:


# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


# In[2]:


# Get the datasets directory, located under the path reported by
# em.get_install_path().
# Paths are assembled with os.path.join instead of manual os.sep
# concatenation so separator handling is left to the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# CSV files for the two input tables (A, B) and the candidate set (C).
path_A = os.path.join(datasets_dir, 'DBLP.csv')
path_B = os.path.join(datasets_dir, 'ACM.csv')
path_C = os.path.join(datasets_dir, 'tableC.csv')


# In[3]:


# Read the two input tables; both declare 'id' as their key column.
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Read the candidate set. Its key column is '_id', and it is linked to
# A (as ltable) and B (as rtable) through the 'ltable_id' and 'rtable_id'
# foreign-key columns.
C = em.read_csv_metadata(
    path_C,
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_id',
    fk_rtable='rtable_id',
)


# In[4]:


# Preview the candidate set C. In a notebook the value of this trailing
# expression is rendered as the cell output; when run as a plain script the
# value is simply discarded.
C.head()


# In[5]:


# Length of the candidate set C (shown as cell output in a notebook; the
# value is discarded when run as a plain script).
len(C)


# # Sample Candidate Set

# From the candidate set, a sample (for labeling purposes) can be obtained like this:

# In[6]:


# Draw a sample S from the candidate set C for labeling.
# NOTE(review): 450 is presumably the number of rows to sample -- confirm
# against the em.sample_table documentation.
S = em.sample_table(C, 450)


# # Label the Sampled Set

# In[7]:


# Label the sampled set S.
# The second argument specifies the name for the label column.
# NOTE: G is re-bound further below to the labeled data read from
# labeled_data_demo.csv, so the result of this call is only used until then.
G = em.label_table(S, 'gold_label')


# The user must specify 0 for a non-match and 1 for a match. Typically, the sampling and labeling steps are done iteratively (until we get a sufficient density of matches). Once labeled, the data set will look like this:

# In[8]:


# Assume that we have labeled the data and stored it in
# labeled_data_demo.csv in the datasets directory.

# Build the path with os.path.join rather than manual os.sep concatenation.
path_labeled_data = os.path.join(datasets_dir, 'labeled_data_demo.csv')

# Read the stored labeled set into G (replacing any earlier value of G).
# Like the candidate set, it is keyed on '_id' and linked to A (ltable) and
# B (rtable) through the 'ltable_id' and 'rtable_id' foreign-key columns.
G = em.read_csv_metadata(path_labeled_data, key='_id',
                         fk_ltable='ltable_id', fk_rtable='rtable_id',
                         ltable=A, rtable=B)


# In[9]:


# Preview the labeled set G. In a notebook the value of this trailing
# expression is rendered as the cell output; when run as a plain script the
# value is simply discarded.
G.head()