# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Next, read the sample input tables that will be used for blocking.
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join builds the paths portably instead of gluing pieces with os.sep.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Get the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
# Read the CSV files and set 'ID' as the key attribute of each table
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
# Get features (for blocking)
feature_table = em.get_features_for_blocking(A, B)
# Get features (for matching)
# feature_table = em.get_features_for_matching(A, B)
# Add a feature that computes Jaccard similarity over the whitespace tokens of
# the lower-cased name + address of each tuple, and add it to the feature table.
# The feature is created declaratively, from a string expression over
# ltuple / rtuple.
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = (
    "jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()),\n"
    "wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"
)
feature = em.get_feature_fn(feature_string, sim, tok)
# Add the feature to the feature table under the name 'jac_ws_name_address'
em.add_feature(feature_table, 'jac_ws_name_address', feature)
True
feature_table.feature_name
0 ID_ID_lev_dist 1 ID_ID_lev_sim 2 ID_ID_jar 3 ID_ID_jwn 4 ID_ID_exm 5 ID_ID_jac_qgm_3_qgm_3 6 name_name_jac_qgm_3_qgm_3 7 name_name_cos_dlm_dc0_dlm_dc0 8 name_name_jac_dlm_dc0_dlm_dc0 9 name_name_mel 10 name_name_lev_dist 11 name_name_lev_sim 12 name_name_nmw 13 name_name_sw 14 birth_year_birth_year_exm 15 birth_year_birth_year_anm 16 birth_year_birth_year_lev_dist 17 birth_year_birth_year_lev_sim 18 hourly_wage_hourly_wage_exm 19 hourly_wage_hourly_wage_anm 20 hourly_wage_hourly_wage_lev_dist 21 hourly_wage_hourly_wage_lev_sim 22 zipcode_zipcode_exm 23 zipcode_zipcode_anm 24 zipcode_zipcode_lev_dist 25 zipcode_zipcode_lev_sim 26 jac_ws_name_address Name: feature_name, dtype: object
# fuzzywuzzy's StringMatcher module provides the `ratio` function used below
import fuzzywuzzy.StringMatcher as fz
# Try the function on two three-letter strings that differ in one character
fz.ratio('xyz', 'ayz')
0.6666666666666666
# Create a feature declaratively, using a user-supplied similarity function
# Start from the similarity functions provided by py_entitymatching
sim = em.get_sim_funs_for_matching()
# Register fz.ratio under the name 'fz_ratio' so a feature string can call it
sim['fz_ratio'] = fz.ratio
# Display the dictionary to confirm that 'fz_ratio' is now present
sim
{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>, 'affine': <function py_entitymatching.feature.simfunctions.affine>, 'cosine': <function py_entitymatching.feature.simfunctions.cosine>, 'dice': <function py_entitymatching.feature.simfunctions.dice>, 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>, 'fz_ratio': <function Levenshtein._levenshtein.ratio>, 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>, 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>, 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>, 'jaro': <function py_entitymatching.feature.simfunctions.jaro>, 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>, 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>, 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>, 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>, 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>, 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>, 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>, 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}
# Declare a feature that applies the fz_ratio similarity function to the
# lower-cased name + address of each tuple
feature_string = (
    "fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(),\n"
    "(rtuple['name'] + ' ' + rtuple['address']).lower())"
)
feature = em.get_feature_fn(feature_string, sim, tok)
# Add the feature to the feature table under the name 'fzratio_name_address'
em.add_feature(feature_table, 'fzratio_name_address', feature)
True
feature_table.feature_name
0 ID_ID_lev_dist 1 ID_ID_lev_sim 2 ID_ID_jar 3 ID_ID_jwn 4 ID_ID_exm 5 ID_ID_jac_qgm_3_qgm_3 6 name_name_jac_qgm_3_qgm_3 7 name_name_cos_dlm_dc0_dlm_dc0 8 name_name_jac_dlm_dc0_dlm_dc0 9 name_name_mel 10 name_name_lev_dist 11 name_name_lev_sim 12 name_name_nmw 13 name_name_sw 14 birth_year_birth_year_exm 15 birth_year_birth_year_anm 16 birth_year_birth_year_lev_dist 17 birth_year_birth_year_lev_sim 18 hourly_wage_hourly_wage_exm 19 hourly_wage_hourly_wage_anm 20 hourly_wage_hourly_wage_lev_dist 21 hourly_wage_hourly_wage_lev_sim 22 zipcode_zipcode_exm 23 zipcode_zipcode_anm 24 zipcode_zipcode_lev_dist 25 zipcode_zipcode_lev_sim 26 jac_ws_name_address 27 fzratio_name_address Name: feature_name, dtype: object
import fuzzywuzzy.StringMatcher as fz
def my_feature(ltuple, rtuple):
    """Return the fz.ratio score between the 'name' values of two tuples.

    Intended for use as a black box feature function: it receives one tuple
    from each input table and produces a single value for the pair.

    Args:
        ltuple: Tuple (row) from the left table; its 'name' field is read.
        rtuple: Tuple (row) from the right table; its 'name' field is read.

    Returns:
        The value of ``fz.ratio`` for the two names (for example,
        ``fz.ratio('xyz', 'ayz')`` evaluates to 0.666...).
    """
    # Score the two names; the raw (name, name) pair is not a usable
    # feature value.
    # NOTE(review): assumes both 'name' values are strings -- confirm how
    # missing values should be handled.
    return fz.ratio(ltuple['name'], rtuple['name'])
# Rebuild the blocking feature table from A and B; the custom features added
# earlier are not part of this fresh table
feature_table = em.get_features_for_blocking(A, B)
# Show the documentation of add_blackbox_feature
help(em.add_blackbox_feature)
Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures: add_blackbox_feature(feature_table, feature_name, feature_function) Adds a black box feature to the feature table. Args: feature_table (DataFrame): The input DataFrame (typically a feature table) to which the feature must be added. feature_name (string): The name that should be given to the feature. feature_function (Python function): A Python function for the black box feature. Returns: A Boolean value of True is returned if the addition was successful. Raises: AssertionError: If the input `feature_table` is not of type DataFrame. AssertionError: If the input `feature_name` is not of type string. AssertionError: If the `feature_table` does not have necessary columns such as 'feature_name', 'left_attribute', 'right_attribute', 'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction', 'function', and 'function_source' in the DataFrame. AssertionError: If the `feature_name` is already present in the feature table. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> block_f = em.get_features_for_blocking(A, B) >>> def age_diff(ltuple, rtuple): >>> # assume that the tuples have age attribute and values are valid numbers. >>> return ltuple['age'] - rtuple['age'] >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)
# Add my_feature to the feature table as a black box feature named
# 'blackbox_fz_ratio_name'; True is returned when the addition succeeds
em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)
True
feature_table.feature_name
0 ID_ID_lev_dist 1 ID_ID_lev_sim 2 ID_ID_jar 3 ID_ID_jwn 4 ID_ID_exm 5 ID_ID_jac_qgm_3_qgm_3 6 name_name_jac_qgm_3_qgm_3 7 name_name_cos_dlm_dc0_dlm_dc0 8 name_name_jac_dlm_dc0_dlm_dc0 9 name_name_mel 10 name_name_lev_dist 11 name_name_lev_sim 12 name_name_nmw 13 name_name_sw 14 birth_year_birth_year_exm 15 birth_year_birth_year_anm 16 birth_year_birth_year_lev_dist 17 birth_year_birth_year_lev_sim 18 hourly_wage_hourly_wage_exm 19 hourly_wage_hourly_wage_anm 20 hourly_wage_hourly_wage_lev_dist 21 hourly_wage_hourly_wage_lev_sim 22 zipcode_zipcode_exm 23 zipcode_zipcode_anm 24 zipcode_zipcode_lev_dist 25 zipcode_zipcode_lev_sim 26 blackbox_fz_ratio_name Name: feature_name, dtype: object