This IPython notebook illustrates how to use multiple blockers and combine the results.
First, we need to import the py_entitymatching package and the other libraries, as follows:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
# Locate the 'datasets' directory under the py_entitymatching install path
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the paths of the two input tables (os.path.join inserts the
# platform separator, so no manual os.sep concatenation is needed)
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
#Blocking plan :
#A, B --overlap blocking--> candset --attr-equiv-block--> candset |
# |
#A, B ------------rule-based-blocking--------------------> candset |----union--->candset
# |
#A, B -----------black-box-blocking----------------------> candset |
# Overlap blocking over input tables A and B, using the 'name' attribute
ob = em.OverlapBlocker()
# Word-level tokens with an overlap size of 1; carry 'name' and
# 'birth_year' from both tables into the candidate set
overlap_options = dict(
    word_level=True,
    overlap_size=1,
    l_output_attrs=['name', 'birth_year'],
    r_output_attrs=['name', 'birth_year'],
    show_progress=False,
)
C = ob.block_tables(A, B, 'name', 'name', **overlap_options)
C
_id | ltable_ID | rtable_ID | ltable_name | ltable_birth_year | rtable_name | rtable_birth_year | |
---|---|---|---|---|---|---|---|
0 | 0 | a3 | b2 | William Bridge | 1986 | Bill Bridge | 1986 |
1 | 1 | a2 | b3 | Michael Franklin | 1988 | Mike Franklin | 1988 |
2 | 2 | a5 | b5 | Alphonse Kemper | 1984 | Alfons Kemper | 1984 |
3 | 3 | a2 | b6 | Michael Franklin | 1988 | Michael Brodie | 1987 |
# Overlap blocking over input tables A and B, using the 'name' attribute.
# NOTE: this cell repeats the previous overlap-blocking cell with identical
# arguments, so C is simply recomputed.
ob = em.OverlapBlocker()
# Word-level tokens with an overlap size of 1; carry 'name' and
# 'birth_year' from both tables into the candidate set
overlap_options = dict(
    word_level=True,
    overlap_size=1,
    l_output_attrs=['name', 'birth_year'],
    r_output_attrs=['name', 'birth_year'],
    show_progress=False,
)
C = ob.block_tables(A, B, 'name', 'name', **overlap_options)
C
_id | ltable_ID | rtable_ID | ltable_name | ltable_birth_year | rtable_name | rtable_birth_year | |
---|---|---|---|---|---|---|---|
0 | 0 | a3 | b2 | William Bridge | 1986 | Bill Bridge | 1986 |
1 | 1 | a2 | b3 | Michael Franklin | 1988 | Mike Franklin | 1988 |
2 | 2 | a5 | b5 | Alphonse Kemper | 1984 | Alfons Kemper | 1984 |
3 | 3 | a2 | b6 | Michael Franklin | 1988 | Michael Brodie | 1987 |
# Attribute-equivalence blocking on the candidate set C: the same
# attribute, birth_year, is used for the left and the right tuples
ab = em.AttrEquivalenceBlocker()
block_attr = 'birth_year'
D = ab.block_candset(C, block_attr, block_attr, show_progress=False)
D
_id | ltable_ID | rtable_ID | ltable_name | ltable_birth_year | rtable_name | rtable_birth_year | |
---|---|---|---|---|---|---|---|
0 | 0 | a3 | b2 | William Bridge | 1986 | Bill Bridge | 1986 |
1 | 1 | a2 | b3 | Michael Franklin | 1988 | Mike Franklin | 1988 |
2 | 2 | a5 | b5 | Alphonse Kemper | 1984 | Alfons Kemper | 1984 |
# Rule-based blocking over input tables
# First, get the features that can be used in blocking rules
feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)
# Create rule-based blocker
rb = em.RuleBasedBlocker()
# Add rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
# (the feature name in the rule string must match a feature in feature_table)
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)
'_rule_0'
# Run the rule-based blocker on A and B, keeping 'name' from both tables
E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'], show_progress=False)
# Apply black box blocker
# Create black box blocker
bb = em.BlackBoxBlocker()
# Define a black box function.
# The blocker function should drop tuple pairs whose last names do not match.
# The function has to do the following steps:
# 1) Get the name attribute from each of the tuples
# 2) Split the name attribute to get the last name
# 3) If the last names do not match, return True
def my_function(x, y):
    """Black-box blocking predicate: drop pairs whose last names differ.

    Args:
        x: Tuple from the left table; must support ``x['name']`` (the
            blocker passes a pandas Series).
        y: Tuple from the right table, accessed the same way.

    Returns:
        True if the pair should be dropped (last names differ), False if
        the pair should be kept (last names match).
    """
    # Use the final whitespace-separated token as the last name. Indexing
    # with [1] after split(' ') raises IndexError on single-word names,
    # picks the middle name when there are three or more words, and yields
    # an empty string when words are separated by repeated spaces.
    x_tokens = x['name'].split()
    y_tokens = y['name'].split()
    # A blank name has no tokens; treat its last name as the empty string
    x_last = x_tokens[-1] if x_tokens else ''
    y_last = y_tokens[-1] if y_tokens else ''
    return x_last != y_last
# Register the predicate with the black box blocker, then block A and B
# with it, keeping 'name' from both tables
bb.set_black_box_function(my_function)
black_box_options = dict(
    l_output_attrs=['name'],
    r_output_attrs=['name'],
    show_progress=False,
)
F = bb.block_tables(A, B, **black_box_options)
F
_id | ltable_ID | rtable_ID | ltable_name | rtable_name | |
---|---|---|---|---|---|
0 | 0 | a2 | b3 | Michael Franklin | Mike Franklin |
1 | 1 | a3 | b2 | William Bridge | Bill Bridge |
2 | 2 | a5 | b5 | Alphonse Kemper | Alfons Kemper |
# Combine the outputs of all the blockers (candidate sets D, E and F)
# into a single candidate set via union
blocker_outputs = [D, E, F]
G = em.combine_blocker_outputs_via_union(blocker_outputs)
G
_id | ltable_ID | rtable_ID | ltable_name | ltable_birth_year | rtable_name | rtable_birth_year | |
---|---|---|---|---|---|---|---|
0 | 0 | a2 | b3 | Michael Franklin | 1988 | Mike Franklin | 1988 |
1 | 1 | a2 | b6 | Michael Franklin | 1988 | Michael Brodie | 1987 |
2 | 2 | a3 | b2 | William Bridge | 1986 | Bill Bridge | 1986 |
3 | 3 | a3 | b6 | William Bridge | 1986 | Michael Brodie | 1987 |
4 | 4 | a4 | b2 | Binto George | 1987 | Bill Bridge | 1986 |
5 | 5 | a5 | b5 | Alphonse Kemper | 1984 | Alfons Kemper | 1984 |
# Show the metadata properties recorded for the combined candidate set G
em.show_properties(G)
id: 4547208976 rtable(obj.id): 4546836464 key: _id fk_rtable: rtable_ID fk_ltable: ltable_ID ltable(obj.id): 4546835680