#!/usr/bin/env python
# coding: utf-8

# # Introduction

# This IPython notebook illustrates how to perform blocking using the
# rule-based blocker.
#
# First, we need to import the *py_entitymatching* package and other
# libraries as follows:

# In[1]:


# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


# Then, read the (sample) input tables for blocking purposes.

# In[2]:


# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')


# In[3]:


# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')


# NOTE: the bare expressions below (e.g. `A.head()`) are notebook cell
# outputs; when this file is run as a plain script they are evaluated but
# nothing is displayed.

# In[4]:


A.head()


# In[5]:


B.head()


# # Generating Features for Blocking

# In[6]:


block_f = em.get_features_for_blocking(A, B)


# In[22]:


block_f


# NOTE(review): the next three cells read underscore-prefixed (non-public)
# attributes of the package; presumably they are populated as a side effect
# of get_features_for_blocking() -- confirm before relying on them.

# In[10]:


em._block_c['corres']


# In[31]:


(em._atypes1['birth_year'], em._atypes1['hourly_wage'],
 em._atypes1['name'], em._atypes1['zipcode'])


# In[32]:


(em._atypes2['birth_year'], em._atypes2['hourly_wage'],
 em._atypes2['name'], em._atypes2['zipcode'])


# # Different Ways to Block Using Rule Based Blocker

# There are three different ways to do rule-based blocking:
#
# 1. Block two tables to produce a `candidate set` of tuple pairs.
# 2. Block a `candidate set` of tuple pairs to typically produce a reduced
#    candidate set of tuple pairs.
# 3. Block two tuples to check if a tuple pair would get blocked.

# ## Block Tables to Produce a Candidate Set of Tuple Pairs

# In[16]:


rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)


# In[20]:


C = rb.block_tables(
    A,
    B,
    l_output_attrs=['name', 'address'],
    r_output_attrs=['name', 'address'],
    show_progress=False,
)


# In[21]:


C.head()


# ## Block Candidate Set

# In[28]:


# A fresh blocker is created so that only the birth-year rule is applied
# to the candidate set C produced above.
rb = em.RuleBasedBlocker()
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)


# In[29]:


D = rb.block_candset(C, show_progress=False)


# In[30]:


D.head()


# ## Block Two Tuples to Check If a Tuple Pair Would Get Blocked

# `DataFrame.ix` was removed in pandas 1.0; `.loc` is used instead.
# This assumes A and B carry the default integer index, so label 0 is also
# the first row -- TODO confirm against read_csv_metadata.

# In[33]:


A.loc[[0]]


# In[34]:


# NOTE(review): row 1 of B is displayed here, but row 0 of B is the one
# passed to block_tuples() below -- confirm which row was intended.
B.loc[[1]]


# In[36]:


rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
# Add rule : block tuples if birth_year_birth_year_exm(ltuple, rtuple) == 0
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)


# In[38]:


# block_tuples() takes one row from each table (as Series) and reports
# whether the pair is blocked by the rules registered on `rb`.
status = rb.block_tuples(A.loc[0], B.loc[0])
print(status)