This IPython notebook illustrates how to generate features for blocking/matching manually.
First, we need to import the py_entitymatching package and other libraries as follows:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'
# Get the paths of the input tables
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'
# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
Next, infer the types of the attributes in the two tables. These types (numeric, or string buckets based on typical word count) determine which tokenizers and similarity functions are applicable to each attribute.
# Get the attribute types of tables A and B
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)
atypes1.keys()
dict_keys(['ID', '_table', 'birth_year', 'hourly_wage', 'address', 'name', 'zipcode'])
atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']
('numeric', 'numeric', 'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')
atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']
('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')
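The string types above bucket attributes by how many words their values typically contain (for example, str_bt_1w_5w means roughly "between 1 and 5 words"). As a rough illustration only, the sketch below shows this kind of inference in pure Python; the bucket boundaries and averaging logic here are assumptions, not py_entitymatching's actual implementation (which is em.get_attr_types).

```python
def infer_attr_type(values):
    """Classify a column as numeric or a word-count bucket.

    A simplified sketch: the bucket names mirror those reported by
    py_entitymatching, but the boundary logic is an assumption made
    for illustration.
    """
    non_null = [v for v in values if v is not None]
    if all(isinstance(v, (int, float)) for v in non_null):
        return 'numeric'
    # Average number of whitespace-separated words across the values
    avg_words = sum(len(str(v).split()) for v in non_null) / len(non_null)
    if avg_words == 1:
        return 'str_eq_1w'
    if avg_words <= 5:
        return 'str_bt_1w_5w'
    if avg_words <= 10:
        return 'str_bt_5w_10w'
    return 'str_gt_10w'

print(infer_attr_type([1975, 1982, 1990]))                 # numeric
print(infer_attr_type(['Kevin Smith', 'Alice M. Jones']))  # str_bt_1w_5w
```

This explains why `address` lands in different buckets for A and B above: the addresses in B simply average more words than those in A.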
Then, get the attribute correspondences between the two tables. By default, attributes with the same name in A and B are paired with each other.
# Get the attribute correspondences between A and B
block_c = em.get_attr_corres(A, B)
block_c.keys()
# 'ltable' and 'rtable' refer to the same objects in memory as A and B
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])
(4635705184, 4635705184, 4635959984, 4635959984)
block_c['corres']
[('ID', 'ID'), ('name', 'name'), ('birth_year', 'birth_year'), ('hourly_wage', 'hourly_wage'), ('address', 'address'), ('zipcode', 'zipcode')]
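Since block_c['corres'] is a plain Python list of (left, right) attribute pairs, it can be trimmed before feature generation to restrict which attribute pairs get features. A small sketch of that pattern (the pair names come from the tables above; in the real workflow you would assign the trimmed list back to block_c['corres']):

```python
# Attribute correspondences as produced above (a plain Python list of pairs)
corres = [('ID', 'ID'), ('name', 'name'), ('birth_year', 'birth_year'),
          ('hourly_wage', 'hourly_wage'), ('address', 'address'),
          ('zipcode', 'zipcode')]

# Keep only the pairs we want features for, e.g. drop the key and wage columns
keep = {'name', 'birth_year', 'address', 'zipcode'}
trimmed = [(l, r) for (l, r) in corres if l in keep]
print(trimmed)
```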
Next, get the tokenizers that can be used, here for blocking:
# for blocking
tok = em.get_tokenizers_for_blocking()
# for matching
# tok = em.get_tokenizers_for_matching()
tok
{'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic>,
 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric>,
 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim>,
 'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace>}
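Each entry maps a tokenizer name to a function that turns a string into a list of tokens. To make the idea concrete, here is a minimal pure-Python sketch of what a q-gram tokenizer (like qgm_3) and a whitespace tokenizer (like wspace) do; py_entitymatching delegates to py_stringmatching, whose exact behavior (e.g. padding of short strings) may differ from this simplification:

```python
def qgrams(s, q=3):
    """Return the overlapping character q-grams of s (no padding)."""
    s = str(s)
    if len(s) < q:
        return [s]
    return [s[i:i + q] for i in range(len(s) - q + 1)]

def whitespace_tokens(s):
    """Split on runs of whitespace, mirroring a 'wspace'-style tokenizer."""
    return str(s).split()

print(qgrams('Kevin'))                   # ['Kev', 'evi', 'vin']
print(whitespace_tokens('Kevin Smith'))  # ['Kevin', 'Smith']
```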
Similarly, get the similarity functions that can be used, here for blocking:
# for blocking
sim = em.get_sim_funs_for_blocking()
# for matching
# sim = em.get_sim_funs_for_matching()
sim
{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,
 'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,
 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}
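Token-based similarity functions such as jaccard take two token lists and return a score, typically in [0, 1]. As an illustration, here is a minimal sketch of Jaccard similarity (the real simfunctions module has its own handling of edge cases such as missing values):

```python
def jaccard(tokens1, tokens2):
    """Jaccard similarity of two token lists: |A & B| / |A | B|."""
    s1, s2 = set(tokens1), set(tokens2)
    if not s1 and not s2:
        return 1.0  # assumption: two empty token sets count as identical
    return len(s1 & s2) / len(s1 | s2)

# One shared token out of three distinct tokens -> 1/3
print(jaccard(['Kevin', 'Smith'], ['Kevin', 'Smyth']))
```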
Finally, generate the features from the attribute types, attribute correspondences, tokenizers, and similarity functions:
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)
feature_table.head()
| | feature_name | left_attribute | right_attribute | left_attr_tokenizer | right_attr_tokenizer | simfunction | function | function_source | is_auto_generated |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ID_ID_lev_dist | ID | ID | None | None | lev_dist | <function ID_ID_lev_dist at 0x11452b378> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
| 1 | ID_ID_lev_sim | ID | ID | None | None | lev_sim | <function ID_ID_lev_sim at 0x114515d08> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
| 2 | ID_ID_jar | ID | ID | None | None | jaro | <function ID_ID_jar at 0x11452b158> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
| 3 | ID_ID_jwn | ID | ID | None | None | jaro_winkler | <function ID_ID_jwn at 0x11452b048> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
| 4 | ID_ID_exm | ID | ID | None | None | exact_match | <function ID_ID_exm at 0x11452b400> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
type(feature_table)
pandas.core.frame.DataFrame
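Each row of feature_table packages a tokenizer and a similarity function into a callable that takes a pair of tuples and returns a score. Conceptually, a feature named along the lines of name_name_jac_qgm_3_qgm_3 (a representative auto-generated name; the exact names in your table may differ) behaves like this pure-Python sketch:

```python
def qgrams(s, q=3):
    """Overlapping character q-grams (no padding; a simplification)."""
    s = str(s)
    return [s[i:i + q] for i in range(max(len(s) - q + 1, 1))]

def jaccard(t1, t2):
    """Jaccard similarity of two token lists."""
    s1, s2 = set(t1), set(t2)
    return len(s1 & s2) / len(s1 | s2) if (s1 or s2) else 1.0

def name_name_jac_qgm_3_qgm_3(ltuple, rtuple):
    """Sketch of an auto-generated feature: tokenize both 'name' values
    into 3-grams, then compare the token sets with Jaccard."""
    return jaccard(qgrams(ltuple['name']), qgrams(rtuple['name']))

a_row = {'name': 'Kevin Smith'}
b_row = {'name': 'Kevin Smith'}
print(name_name_jac_qgm_3_qgm_3(a_row, b_row))  # 1.0
```

The actual functions stored in the table's function column work the same way on pandas rows, which is what lets later steps apply the whole feature table to candidate tuple pairs.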