#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to update attribute types and
# generate features for blocking/matching manually.
#
# NOTE: this file is a notebook export. Bare expressions such as `tok` or
# `atypes1.keys()` display their value when run as notebook cells; when the
# file is run as a plain script they are evaluated and discarded.
#
# First, we need to import the *py_entitymatching* package and other
# libraries as follows:

# In[1]:

import os

import pandas as pd
# Import py_entitymatching package
import py_entitymatching as em

# Then, read the (sample) input tables for blocking purposes.

# In[2]:

# Get the datasets directory. os.path.join is used instead of manual
# `+ os.sep +` concatenation so that a trailing separator on the install
# path cannot produce a doubled separator.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

# In[3]:

# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

# ## Getting Attribute Types

# In[4]:

# One attribute-type mapping per input table.
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)

# In[5]:

atypes1.keys()

# In[6]:

# Inspect the types inferred for table A.
atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']

# In[7]:

# Inspect the types inferred for table B.
atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']

# ## Updating Attribute Types

# In[8]:

# Show the 'address' type of both tables before overriding it.
atypes1['address'], atypes2['address']

# In[9]:

# Manually override the 'address' type in both tables so they agree.
# NOTE(review): 'str_bt_1w_5w' is presumably py_entitymatching's label for
# short strings of between one and five words -- confirm against the
# library's attribute-type documentation.
atypes1['address'] = 'str_bt_1w_5w'
atypes2['address'] = 'str_bt_1w_5w'

# ## Getting Attribute Correspondences

# In[10]:

block_c = em.get_attr_corres(A, B)

# In[11]:

block_c.keys()

# In[12]:

# Show object identities so A and B can be compared with the tables stored
# under block_c['ltable'] and block_c['rtable'].
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])

# In[13]:

block_c['corres']

# ## Updating Attribute Correspondences

# In[14]:

# Replace the correspondences with an explicit list of
# (table A attribute, table B attribute) pairs.
block_c['corres'] = [
    ('name', 'name'),
    ('birth_year', 'birth_year'),
    ('hourly_wage', 'hourly_wage'),
    ('address', 'address'),
    ('zipcode', 'zipcode'),
]

# ## Getting Tokenizers

# In[23]:

# for blocking
tok = em.get_tokenizers_for_blocking()
# for matching
#tok = em.get_tokenizers_for_matching()

# In[16]:

tok

# ## Getting Similarity Functions

# In[22]:

# for blocking
sim = em.get_sim_funs_for_blocking()
# for matching
#sim = em.get_sim_funs_for_matching()

# In[18]:

sim

# ## Getting Features

# In[19]:

# Build the feature table from the (manually adjusted) attribute types,
# attribute correspondences, tokenizers and similarity functions.
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)

# In[20]:

# Show only the features whose left attribute is 'address'.
feature_table[feature_table.left_attribute == 'address']

# In[21]:

type(feature_table)