#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to use multiple blockers and combine
# the results.
#
# First, we need to import the *py_entitymatching* package and other
# libraries as follows:

# In[1]:

# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Then, read the (sample) input tables for blocking purposes.

# In[2]:

# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

# In[3]:

# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

# # Combining Multiple Blockers

# In[5]:

# Blocking plan:
#
# A, B --overlap blocking--> candset --attr-equiv-block--> candset |
#                                                                   |
# A, B ------------rule-based-blocking--------------------> candset |----union---> candset
#                                                                   |
# A, B -----------black-box-blocking----------------------> candset |

# In[6]:

# Overlap blocking over input tables
ob = em.OverlapBlocker()

# Block using name: the call asks for word-level overlap of size 1 on 'name'
C = ob.block_tables(
    A, B, 'name', 'name',
    word_level=True, overlap_size=1,
    l_output_attrs=['name', 'birth_year'],
    r_output_attrs=['name', 'birth_year'],
    show_progress=False,
)
# Bare expression: displays C when this is run as a notebook cell
C

# In[8]:

# Attribute equivalence blocking: block C using birth_year
ab = em.AttrEquivalenceBlocker()
D = ab.block_candset(C, 'birth_year', 'birth_year', show_progress=False)

# In[9]:

D

# In[10]:

# Rule-based blocking over input tables
# First get features that can be used
feature_table = em.get_features_for_blocking(
    A, B, validate_inferred_attr_types=False)

# In[11]:

# Create rule-based blocker
rb = em.RuleBasedBlocker()
# Add rule: block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)

# In[12]:

E = rb.block_tables(
    A, B,
    l_output_attrs=['name'],
    r_output_attrs=['name'],
    show_progress=False,
)

# In[13]:

# Apply black box blocker
# Create black box blocker
bb = em.BlackBoxBlocker()

# In[14]:

# Define a black box function.
# The blocker function should drop tuple pairs whose last names do not match.
# The function has to do the following steps:
#  1) Get the name attribute from each of the tuples
#  2) Split the name attribute to get the last name
#  3) If the last names do not match, return True

# In[15]:


def my_function(x, y):
    """Return True when the tuple pair (x, y) should be dropped.

    A pair is dropped when the last names of the two tuples differ.

    Args:
        x: Tuple from the left table (a pandas Series with a 'name' entry).
        y: Tuple from the right table (a pandas Series with a 'name' entry).

    Returns:
        bool: True if the last names do not match, False otherwise.
    """
    # Take the final whitespace-separated token as the last name. Indexing
    # with [-1] (rather than [1]) keeps this correct for names that have a
    # middle name and avoids an IndexError on single-word names.
    x_tokens = x['name'].split()
    y_tokens = y['name'].split()

    # A name with no tokens at all is treated as having an empty last name.
    x_last = x_tokens[-1] if x_tokens else ''
    y_last = y_tokens[-1] if y_tokens else ''

    return x_last != y_last


# In[16]:

bb.set_black_box_function(my_function)

# In[17]:

F = bb.block_tables(
    A, B,
    l_output_attrs=['name'],
    r_output_attrs=['name'],
    show_progress=False,
)

# In[18]:

F

# In[19]:

# Combine all the blocker outputs
G = em.combine_blocker_outputs_via_union([D, E, F])

# In[20]:

G

# In[21]:

em.show_properties(G)

# In[ ]: