#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to perform blocking using a
# black box blocker, i.e. a blocker driven by a user-supplied Python function.
#
# First, we need to import the *py_entitymatching* package and other
# libraries as follows:

# In[1]:

# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Then, read the (sample) input tables for blocking purposes.

# In[5]:

# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

# In[6]:

# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

# # Different Ways to Block Using Black Box Based Blocker
#
# There are three different ways to do black box blocking:
#
# 1. Block two tables to produce a `candidate set` of tuple pairs.
# 2. Block a `candidate set` of tuple pairs to typically produce a reduced
#    candidate set of tuple pairs.
# 3. Block two tuples to check if a tuple pair would get blocked.

# ## Block Tables to Produce a Candidate Set of Tuple Pairs

# First, define a black box function

# In[18]:


def address_address_function(x, y):
    """Return True when the two tuples' addresses end in different cities.

    Args:
        x: Tuple from the left table (a pandas Series or any mapping with
            an 'address' entry holding a comma-separated string).
        y: Tuple from the right table, same layout as ``x``.

    Returns:
        bool: True if the text after the last comma of each address differs
        (the pair should be blocked), False otherwise.
    """
    # The city is taken to be whatever follows the last comma; surrounding
    # whitespace is stripped so "St,Madison" and "St, Madison" agree.
    x_city = x['address'].rsplit(',', 1)[-1].strip()
    y_city = y['address'].rsplit(',', 1)[-1].strip()
    return x_city != y_city


# In[22]:

# Instantiate black box blocker
bb = em.BlackBoxBlocker()
# Set the black box function
bb.set_black_box_function(address_address_function)

# In[23]:

C = bb.block_tables(A, B,
                    l_output_attrs=['name', 'address'],
                    r_output_attrs=['name', 'address'])

# In[24]:

# Bare expression: rendered as cell output when run inside a notebook.
C

# ## Block Candidate Set
#
# First, define a black box function

# In[25]:


def name_name_function(x, y):
    """Return True when the two tuples' last names differ.

    Args:
        x: Tuple from the left table (a pandas Series or any mapping with
            a 'name' entry holding a whitespace-separated string).
        y: Tuple from the right table, same layout as ``x``.

    Returns:
        bool: True if the final whitespace-separated token of each name
        differs (the pair should be blocked), False otherwise.
    """
    # Use the final token so single-word names do not raise IndexError and
    # middle names are not mistaken for the last name.
    x_tokens = x['name'].split()
    y_tokens = y['name'].split()
    x_last = x_tokens[-1] if x_tokens else ''
    y_last = y_tokens[-1] if y_tokens else ''
    return x_last != y_last


# In[29]:

# Instantiate black box blocker
bb = em.BlackBoxBlocker()
# Set the black box function
bb.set_black_box_function(name_name_function)

# In[30]:

D = bb.block_candset(C)

# In[31]:

# Bare expression: rendered as cell output when run inside a notebook.
D

# ## Block Two Tuples To Check If a Tuple Pair Would Get Blocked

# The black box function used here is address_address_function, already
# defined earlier in this file; it is reused rather than redefined.

# In[34]:

# Instantiate black box blocker
bb = em.BlackBoxBlocker()
# Set the black box function
bb.set_black_box_function(address_address_function)

# In[35]:

# `.ix` was removed from pandas; `.loc` is the label-based replacement.
A.loc[[0]]

# In[36]:

B.loc[[0]]

# In[38]:

status = bb.block_tuples(A.loc[0], B.loc[0])
print(status)