# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Next, read the sample input tables that will be used for blocking.
# Locate the sample datasets directory inside the py_entitymatching installation
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the full path of each input table
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
# Load both CSV files, declaring 'ID' as the key attribute of each table
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
# Calling get_features_for_blocking or get_features_for_matching without setting the
# validate_inferred_attr_types flag to False triggers a validation step that shows the
# user a table of all inferred attribute correspondences and their inferred types.
# Get features (for blocking)
feature_table = em.get_features_for_blocking(A, B)
# Get features (for matching)
# feature_table = em.get_features_for_matching(A, B)
The table shows the corresponding attributes along with their respective types. Please confirm that the information has been correctly inferred. If you would like to skip this validation process in the future, please set the flag validate_inferred_attr_types equal to false.
Left Attribute | Right Attribute | Left Attribute Type | Right Attribute Type | Example Features | |
---|---|---|---|---|---|
0 | ID | ID | short string (1 word) | short string (1 word) | Levenshtein Distance; Levenshtein Similarity |
1 | name | name | short string (1 word to 5 words) | short string (1 word to 5 words) | Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter] |
2 | birth_year | birth_year | numeric | numeric | Exact Match; Absolute Norm |
3 | hourly_wage | hourly_wage | numeric | numeric | Exact Match; Absolute Norm |
4 | address | address | short string (1 word to 5 words) | medium string (5 words to 10 words) | Not Applicable: Types do not match |
5 | zipcode | zipcode | numeric | numeric | Exact Match; Absolute Norm |
Do you want to proceed? (y/n):y
# Add a feature to do Jaccard on name + address and add it to the feature table
# Create a feature declaratively: the feature is written as a string expression
# and passed to em.get_feature_fn together with the similarity functions and
# tokenizers it may reference.
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
# Expression: jaccard over the wspace-tokenized, lower-cased "name address" string
# of the left tuple (ltuple) and of the right tuple (rtuple).
# NOTE(review): `jaccard` is presumably resolved from `sim` and `wspace` from `tok` -- confirm.
feature_string = """jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()),
wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)
# Add the feature to feature_table under the name 'jac_ws_name_address'
em.add_feature(feature_table, 'jac_ws_name_address', feature)
True
feature_table.feature_name
0 ID_ID_lev_dist 1 ID_ID_lev_sim 2 ID_ID_jar 3 ID_ID_jwn 4 ID_ID_exm 5 ID_ID_jac_qgm_3_qgm_3 6 name_name_jac_qgm_3_qgm_3 7 name_name_cos_dlm_dc0_dlm_dc0 8 name_name_jac_dlm_dc0_dlm_dc0 9 name_name_mel 10 name_name_lev_dist 11 name_name_lev_sim 12 name_name_nmw 13 name_name_sw 14 birth_year_birth_year_exm 15 birth_year_birth_year_anm 16 birth_year_birth_year_lev_dist 17 birth_year_birth_year_lev_sim 18 hourly_wage_hourly_wage_exm 19 hourly_wage_hourly_wage_anm 20 hourly_wage_hourly_wage_lev_dist 21 hourly_wage_hourly_wage_lev_sim 22 zipcode_zipcode_exm 23 zipcode_zipcode_anm 24 zipcode_zipcode_lev_dist 25 zipcode_zipcode_lev_sim 26 jac_ws_name_address Name: feature_name, dtype: object
import fuzzywuzzy.StringMatcher as fz
fz.ratio('xyz', 'ayz')
0.6666666666666666
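# Create a feature declaratively, this time with a custom similarity function:
# register fz.ratio in the similarity-function dictionary under the key 'fz_ratio'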
sim = em.get_sim_funs_for_matching()
sim['fz_ratio'] = fz.ratio
sim
{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>, 'affine': <function py_entitymatching.feature.simfunctions.affine>, 'cosine': <function py_entitymatching.feature.simfunctions.cosine>, 'dice': <function py_entitymatching.feature.simfunctions.dice>, 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>, 'fz_ratio': <function Levenshtein._levenshtein.ratio>, 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>, 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>, 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>, 'jaro': <function py_entitymatching.feature.simfunctions.jaro>, 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>, 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>, 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>, 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>, 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>, 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>, 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>, 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}
# Expression: apply the 'fz_ratio' similarity function (the fz.ratio entry added to
# `sim`) to the lower-cased "name address" string of the left and right tuples.
# No tokenizer is applied here; the two strings are passed to fz_ratio directly.
feature_string = """fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(),
(rtuple['name'] + ' ' + rtuple['address']).lower())"""
# `tok` is the tokenizer lookup obtained earlier from em.get_tokenizers_for_matching()
feature = em.get_feature_fn(feature_string, sim, tok)
# Add the feature to feature_table under the name 'fzratio_name_address'
em.add_feature(feature_table, 'fzratio_name_address', feature)
True
feature_table.feature_name
0 ID_ID_lev_dist 1 ID_ID_lev_sim 2 ID_ID_jar 3 ID_ID_jwn 4 ID_ID_exm 5 ID_ID_jac_qgm_3_qgm_3 6 name_name_jac_qgm_3_qgm_3 7 name_name_cos_dlm_dc0_dlm_dc0 8 name_name_jac_dlm_dc0_dlm_dc0 9 name_name_mel 10 name_name_lev_dist 11 name_name_lev_sim 12 name_name_nmw 13 name_name_sw 14 birth_year_birth_year_exm 15 birth_year_birth_year_anm 16 birth_year_birth_year_lev_dist 17 birth_year_birth_year_lev_sim 18 hourly_wage_hourly_wage_exm 19 hourly_wage_hourly_wage_anm 20 hourly_wage_hourly_wage_lev_dist 21 hourly_wage_hourly_wage_lev_sim 22 zipcode_zipcode_exm 23 zipcode_zipcode_anm 24 zipcode_zipcode_lev_dist 25 zipcode_zipcode_lev_sim 26 jac_ws_name_address 27 fzratio_name_address Name: feature_name, dtype: object
import fuzzywuzzy.StringMatcher as fz
def my_feature(ltuple, rtuple):
    """Black-box feature: fuzzy ratio between the 'name' values of two tuples.

    Args:
        ltuple: Tuple (row) from the left table; must provide a 'name' entry.
        rtuple: Tuple (row) from the right table; must provide a 'name' entry.

    Returns:
        The score computed by ``fz.ratio`` for the two names
        (e.g. ``fz.ratio('xyz', 'ayz')`` gives 0.6666666666666666).
    """
    # Return a single similarity score rather than the raw (left, right) name
    # pair, so the feature registered as 'blackbox_fz_ratio_name' yields a value
    # that can be used as a feature.
    return fz.ratio(ltuple['name'], rtuple['name'])
feature_table = em.get_features_for_blocking(A, B)
help(em.add_blackbox_feature)
Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures: add_blackbox_feature(feature_table, feature_name, feature_function) Adds a black box feature to the feature table. Args: feature_table (DataFrame): The input DataFrame (typically a feature table) to which the feature must be added. feature_name (string): The name that should be given to the feature. feature_function (Python function): A Python function for the black box feature. Returns: A Boolean value of True is returned if the addition was successful. Raises: AssertionError: If the input `feature_table` is not of type DataFrame. AssertionError: If the input `feature_name` is not of type string. AssertionError: If the `feature_table` does not have necessary columns such as 'feature_name', 'left_attribute', 'right_attribute', 'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction', 'function', and 'function_source' in the DataFrame. AssertionError: If the `feature_name` is already present in the feature table. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> block_f = em.get_features_for_blocking(A, B) >>> def age_diff(ltuple, rtuple): >>> # assume that the tuples have age attribute and values are valid numbers. >>> return ltuple['age'] - rtuple['age'] >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)
em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)
True
feature_table.feature_name
0 ID_ID_lev_dist 1 ID_ID_lev_sim 2 ID_ID_jar 3 ID_ID_jwn 4 ID_ID_exm 5 ID_ID_jac_qgm_3_qgm_3 6 name_name_jac_qgm_3_qgm_3 7 name_name_cos_dlm_dc0_dlm_dc0 8 name_name_jac_dlm_dc0_dlm_dc0 9 name_name_mel 10 name_name_lev_dist 11 name_name_lev_sim 12 name_name_nmw 13 name_name_sw 14 birth_year_birth_year_exm 15 birth_year_birth_year_anm 16 birth_year_birth_year_lev_dist 17 birth_year_birth_year_lev_sim 18 hourly_wage_hourly_wage_exm 19 hourly_wage_hourly_wage_anm 20 hourly_wage_hourly_wage_lev_dist 21 hourly_wage_hourly_wage_lev_sim 22 zipcode_zipcode_exm 23 zipcode_zipcode_anm 24 zipcode_zipcode_lev_dist 25 zipcode_zipcode_lev_sim 26 blackbox_fz_ratio_name Name: feature_name, dtype: object