#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to generate features for blocking/matching manually.
#
# First, we need to import the *py_entitymatching* package and other libraries as follows:

# In[22]:


# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


# Then, read the (sample) input tables for blocking purposes.

# In[23]:


# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

# Get the paths of the input tables
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'


# In[24]:


# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')


# # Generating Features Manually
#
# ## Getting Attribute Types

# In[25]:


# Infer the type of each attribute in A and B (e.g., numeric, short string)
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)


# In[26]:


atypes1.keys()


# In[27]:


atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']


# In[28]:


atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']


# ## Getting Attribute Correspondences

# In[29]:


# Get the attribute correspondences (pairs of attributes to be compared) between A and B
block_c = em.get_attr_corres(A, B)


# In[30]:


block_c.keys()


# In[31]:


# 'ltable' and 'rtable' in the correspondence dictionary point to the same objects as A and B
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])


# In[32]:


block_c['corres']


# ## Getting Tokenizers

# In[33]:


# Tokenizers for blocking
tok = em.get_tokenizers_for_blocking()
# For matching, use:
# tok = em.get_tokenizers_for_matching()


# In[34]:


tok


# ## Getting Similarity Functions

# In[35]:


# Similarity functions for blocking
sim = em.get_sim_funs_for_blocking()
# For matching, use:
# sim = em.get_sim_funs_for_matching()


# In[36]:


sim


# ## Getting Features

# In[38]:


# Combine the attribute types, correspondences, tokenizers, and similarity
# functions to generate a table of features
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)


# In[41]:


feature_table.head()


# In[40]:


type(feature_table)
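# ## Adding a Custom Feature (Optional Sketch)
#
# The feature table returned by `get_features` can also be extended with hand-written
# features. The cell below is a minimal sketch (not part of the original walkthrough):
# it declares a Jaccard feature over 3-gram tokens of the `name` attribute and appends
# it to `feature_table`. It assumes the `em.get_feature_fn` and `em.add_feature` helpers
# as documented in py_entitymatching; verify the exact signatures against your installed
# version before relying on this.

# In[ ]:


# Declare a feature as a string over ltuple/rtuple and compile it with the
# tokenizers and similarity functions obtained above (assumed API; verify locally)
feature_str = 'jaccard(qgm_3(ltuple["name"]), qgm_3(rtuple["name"]))'
custom_feature = em.get_feature_fn(feature_str, tok, sim)

# Append the compiled feature to the feature table under a descriptive name
em.add_feature(feature_table, 'name_name_jac_qgm3_qgm3_custom', custom_feature)
feature_table.tail()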
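# ## Using the Feature Table (Optional Sketch)
#
# Finally, the generated features are typically applied to a candidate set of tuple
# pairs. The cell below is a minimal sketch (not part of the original walkthrough):
# it builds a small candidate set with an overlap blocker on `name` and computes one
# feature vector per candidate pair with `em.extract_feature_vecs`. The output
# attributes assume the sample person tables shipped with py_entitymatching.

# In[ ]:


# Block on word-level overlap of 'name' to get a candidate set of tuple pairs
ob = em.OverlapBlocker()
C = ob.block_tables(A, B, 'name', 'name',
                    word_level=True, overlap_size=1,
                    l_output_attrs=['name', 'birth_year'],
                    r_output_attrs=['name', 'birth_year'],
                    show_progress=False)

# Compute a feature vector for each candidate pair using the feature table
feature_vectors = em.extract_feature_vecs(C, feature_table=feature_table,
                                          show_progress=False)
feature_vectors.head()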