#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to generate features for blocking/matching manually.
#
# First, we need to import the *py_entitymatching* package and other libraries as follows:

# In[22]:


# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


# Then, read the (sample) input tables for blocking purposes.

# In[23]:


# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

# Get the paths of the input tables
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'


# In[24]:


# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')


# # Generating Features Manually
#
# ## Getting Attribute Types

# In[25]:


# Infer the type of each attribute in A and B (e.g., numeric, short string)
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)


# In[26]:


atypes1.keys()


# In[27]:


atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']


# In[28]:


atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']


# ## Getting Attribute Correspondences

# In[29]:


# Get the attribute correspondences (pairs of attributes to be compared) between A and B
block_c = em.get_attr_corres(A, B)


# In[30]:


block_c.keys()


# In[31]:


# 'ltable' and 'rtable' in the correspondence dictionary point to the same objects as A and B
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])


# In[32]:


block_c['corres']


# ## Getting Tokenizers

# In[33]:


# Tokenizers for blocking
tok = em.get_tokenizers_for_blocking()
# For matching, use:
# tok = em.get_tokenizers_for_matching()


# In[34]:


tok


# ## Getting Similarity Functions

# In[35]:


# Similarity functions for blocking
sim = em.get_sim_funs_for_blocking()
# For matching, use:
# sim = em.get_sim_funs_for_matching()


# In[36]:


sim


# ## Getting Features

# In[38]:


# Combine the attribute types, correspondences, tokenizers, and similarity
# functions to generate a table of features
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)


# In[41]:


feature_table.head()


# In[40]:


type(feature_table)
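# ## Adding a Custom Feature (Optional Sketch)
#
# The feature table returned by `get_features` can also be extended with hand-written
# features. The cell below is a minimal sketch (not part of the original walkthrough):
# it declares a Jaccard feature over 3-gram tokens of the `name` attribute and appends
# it to `feature_table`. It assumes the `em.get_feature_fn` and `em.add_feature` helpers
# as documented in py_entitymatching; verify the exact signatures against your installed
# version before relying on this.

# In[ ]:


# Declare a feature as a string over ltuple/rtuple and compile it with the
# tokenizers and similarity functions obtained above (assumed API; verify locally)
feature_str = 'jaccard(qgm_3(ltuple["name"]), qgm_3(rtuple["name"]))'
custom_feature = em.get_feature_fn(feature_str, tok, sim)

# Append the compiled feature to the feature table under a descriptive name
em.add_feature(feature_table, 'name_name_jac_qgm3_qgm3_custom', custom_feature)
feature_table.tail()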
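# ## Using the Feature Table (Optional Sketch)
#
# Finally, the generated features are typically applied to a candidate set of tuple
# pairs. The cell below is a minimal sketch (not part of the original walkthrough):
# it builds a small candidate set with an overlap blocker on `name` and computes one
# feature vector per candidate pair with `em.extract_feature_vecs`. The output
# attributes assume the sample person tables shipped with py_entitymatching.

# In[ ]:


# Block on word-level overlap of 'name' to get a candidate set of tuple pairs
ob = em.OverlapBlocker()
C = ob.block_tables(A, B, 'name', 'name',
                    word_level=True, overlap_size=1,
                    l_output_attrs=['name', 'birth_year'],
                    r_output_attrs=['name', 'birth_year'],
                    show_progress=False)

# Compute a feature vector for each candidate pair using the feature table
feature_vectors = em.extract_feature_vecs(C, feature_table=feature_table,
                                          show_progress=False)
feature_vectors.head()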