This IPython notebook illustrates how to debug blocker output.
First, we need to import the py_entitymatching package and other libraries as follows:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
# Locate the 'datasets' directory under the package install path.
# os.path.join is used instead of manual os.sep concatenation so the
# separator handling is left to the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
Next, block the tables using a rule-based blocker.
# Generate the features that blocking rules can refer to, without
# validating the inferred attribute types
feature_table = em.get_features_for_blocking(
    A,
    B,
    validate_inferred_attr_types=False,
)
# Instantiate a rule-based blocker
rb = em.RuleBasedBlocker()
# Register the rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.8
rb.add_rule(
    ['name_name_lev_sim(ltuple, rtuple) < 0.8'],
    feature_table,
)
'_rule_0'
# Apply the blocker to A and B; keep the 'name' attribute of both tables
# in the resulting candidate set
E = rb.block_tables(
    A,
    B,
    l_output_attrs=['name'],
    r_output_attrs=['name'],
)
0% 100% [##############################] | ETA: 00:00:00 Total time elapsed: 00:00:00
# Display the candidate set produced by the blocker
E
| | _id | ltable_ID | rtable_ID | ltable_name | rtable_name |
---|---|---|---|---|---|
0 | 0 | a5 | b5 | Alphonse Kemper | Alfons Kemper |
# Run the blocker debugger on candidate set E against the input tables,
# asking for at most 5 tuple pairs in the report
dbg = em.debug_blocker(
    E,
    A,
    B,
    output_size=5,
)
# Display the debugger report (pairs listed with a similarity score)
dbg
| | _id | similarity | ltable_ID | rtable_ID | ltable_name | ltable_address | rtable_name | rtable_address |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0.750000 | a2 | b3 | Michael Franklin | 1652 Stockton St, San Francisco | Mike Franklin | 1652 Stockton St, San Francisco |
1 | 1 | 0.750000 | a3 | b2 | William Bridge | 3131 Webster St, San Francisco | Bill Bridge | 3131 Webster St, San Francisco |
2 | 2 | 0.272727 | a4 | b2 | Binto George | 423 Powell St, San Francisco | Bill Bridge | 3131 Webster St, San Francisco |
3 | 3 | 0.272727 | a4 | b3 | Binto George | 423 Powell St, San Francisco | Mike Franklin | 1652 Stockton St, San Francisco |
4 | 4 | 0.272727 | a5 | b6 | Alphonse Kemper | 1702 Post Street, San Francisco | Michael Brodie | 133 Clement Street, San Francisco |
# NOTE: a brand-new blocker is created here, so the earlier 0.8 rule is
# not carried over
rb = em.RuleBasedBlocker()
# Register the rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(
    ['name_name_lev_sim(ltuple, rtuple) < 0.4'],
    feature_table,
)
'_rule_0'
# Re-run blocking on A and B with the new blocker; keep the 'name'
# attribute of both tables in the resulting candidate set
E = rb.block_tables(
    A,
    B,
    l_output_attrs=['name'],
    r_output_attrs=['name'],
)
0% 100% [##############################] | ETA: 00:00:00 Total time elapsed: 00:00:00
# Display the candidate set produced by the new blocker
E
| | _id | ltable_ID | rtable_ID | ltable_name | rtable_name |
---|---|---|---|---|---|
0 | 0 | a2 | b3 | Michael Franklin | Mike Franklin |
1 | 1 | a2 | b6 | Michael Franklin | Michael Brodie |
2 | 2 | a3 | b2 | William Bridge | Bill Bridge |
3 | 3 | a3 | b6 | William Bridge | Michael Brodie |
4 | 4 | a4 | b2 | Binto George | Bill Bridge |
5 | 5 | a5 | b5 | Alphonse Kemper | Alfons Kemper |
# Run the blocker debugger again on the new candidate set E, asking for
# at most 5 tuple pairs in the report
dbg = em.debug_blocker(
    E,
    A,
    B,
    output_size=5,
)
# Display the debugger report (pairs listed with a similarity score)
dbg
| | _id | similarity | ltable_ID | rtable_ID | ltable_name | ltable_address | rtable_name | rtable_address |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0.272727 | a3 | b1 | William Bridge | 3131 Webster St, San Francisco | Mark Levene | 108 Clement St, San Francisco |
1 | 1 | 0.272727 | a3 | b3 | William Bridge | 3131 Webster St, San Francisco | Mike Franklin | 1652 Stockton St, San Francisco |
2 | 2 | 0.272727 | a5 | b6 | Alphonse Kemper | 1702 Post Street, San Francisco | Michael Brodie | 133 Clement Street, San Francisco |
3 | 3 | 0.272727 | a4 | b1 | Binto George | 423 Powell St, San Francisco | Mark Levene | 108 Clement St, San Francisco |
4 | 4 | 0.272727 | a4 | b3 | Binto George | 423 Powell St, San Francisco | Mike Franklin | 1652 Stockton St, San Francisco |