This IPython notebook illustrates how to manually update the inferred attribute types and attribute correspondences, and then generate features for blocking/matching.
First, we need to import py_entitymatching package and other libraries as follows:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
# Locate the sample datasets bundled with py_entitymatching.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Full paths to the two sample person tables.
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
# Load each table as a DataFrame, registering 'ID' as the key
# attribute in py_entitymatching's metadata catalog.
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
# Infer the type of every attribute in A and B. Each result is a dict
# mapping attribute name -> inferred type tag (plus a '_table' entry).
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)
atypes1.keys()
dict_keys(['ID', 'zipcode', '_table', 'name', 'hourly_wage', 'address', 'birth_year'])
atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']
('numeric', 'numeric', 'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')
atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']
('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')
# The two tables disagree on the inferred type of 'address'
# ('str_bt_1w_5w' in A vs. 'str_bt_5w_10w' in B).
atypes1['address'], atypes2['address']
('str_bt_1w_5w', 'str_bt_5w_10w')
# Manually align the two types so 'address' is treated consistently.
# NOTE(review): presumably feature generation only covers attribute pairs
# whose types agree — confirm against em.get_features documentation.
# The first assignment is a no-op (atypes1['address'] is already
# 'str_bt_1w_5w'); only atypes2 actually changes.
atypes1['address'] = 'str_bt_1w_5w'
atypes2['address'] = 'str_bt_1w_5w'
# Get the default attribute correspondences between A and B: a dict with
# the list of corresponding attribute pairs plus references to both tables.
block_c = em.get_attr_corres(A, B)
block_c.keys()
dict_keys(['corres', 'rtable', 'ltable'])
# 'ltable'/'rtable' hold the very same objects as A and B (identical
# id() values below), not copies.
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])
(4509225032, 4509225032, 4509225816, 4509225816)
block_c['corres']
[('ID', 'ID'), ('name', 'name'), ('birth_year', 'birth_year'), ('hourly_wage', 'hourly_wage'), ('address', 'address'), ('zipcode', 'zipcode')]
# Manually override the correspondence list, dropping the ('ID', 'ID')
# pair present in the default list so that no features are generated
# over the key attribute.
block_c['corres'] = [('name', 'name'),
('birth_year', 'birth_year'),
('hourly_wage', 'hourly_wage'),
('address', 'address'),
('zipcode', 'zipcode')]
# Fetch the dict of tokenizer functions (name -> callable) to use when
# generating features. Uncomment the matching variant instead when the
# features are intended for the matching step rather than blocking.
# for blocking
tok = em.get_tokenizers_for_blocking()
# for matching
#tok = em.get_tokenizers_for_matching()
tok
{'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic>, 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric>, 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim>, 'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>, 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>, 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace>}
# Fetch the dict of similarity functions (name -> callable) to use when
# generating features. As with the tokenizers above, switch to the
# matching variant when building features for the matching step.
#for blocking
sim = em.get_sim_funs_for_blocking()
#for matching
#sim = em.get_sim_funs_for_matching()
sim
{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>, 'affine': <function py_entitymatching.feature.simfunctions.affine>, 'cosine': <function py_entitymatching.feature.simfunctions.cosine>, 'dice': <function py_entitymatching.feature.simfunctions.dice>, 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>, 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>, 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>, 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>, 'jaro': <function py_entitymatching.feature.simfunctions.jaro>, 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>, 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>, 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>, 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>, 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>, 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>, 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>, 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}
# Generate the feature table from the (manually adjusted) attribute
# types and correspondences, using the chosen tokenizers and similarity
# functions.
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)
# Inspect only the features generated for the 'address' attribute pair.
feature_table[feature_table.left_attribute == 'address']
feature_name | left_attribute | right_attribute | left_attr_tokenizer | right_attr_tokenizer | simfunction | function | function_source | is_auto_generated | |
---|---|---|---|---|---|---|---|---|---|
16 | address_address_jac_qgm_3_qgm_3 | address | address | qgm_3 | qgm_3 | jaccard | <function address_address_jac_qgm_3_qgm_3 at 0x10f959c80> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
17 | address_address_cos_dlm_dc0_dlm_dc0 | address | address | dlm_dc0 | dlm_dc0 | cosine | <function address_address_cos_dlm_dc0_dlm_dc0 at 0x10f959d08> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
18 | address_address_jac_dlm_dc0_dlm_dc0 | address | address | dlm_dc0 | dlm_dc0 | jaccard | <function address_address_jac_dlm_dc0_dlm_dc0 at 0x10f959d90> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
19 | address_address_mel | address | address | None | None | monge_elkan | <function address_address_mel at 0x10f959e18> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
20 | address_address_lev_dist | address | address | None | None | lev_dist | <function address_address_lev_dist at 0x10f959ea0> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
21 | address_address_lev_sim | address | address | None | None | lev_sim | <function address_address_lev_sim at 0x10f959f28> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
22 | address_address_nmw | address | address | None | None | needleman_wunsch | <function address_address_nmw at 0x10f9bb048> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
23 | address_address_sw | address | address | None | None | smith_waterman | <function address_address_sw at 0x10f9bb0d0> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
# The generated feature table is a regular pandas DataFrame.
type(feature_table)
pandas.core.frame.DataFrame