Notebook

Introduction¶

In [1]:

# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes

In [2]:

# Locate the 'datasets' folder under the py_entitymatching install path
datasets_dir = os.sep.join([em.get_install_path(), 'datasets'])

# Build the full paths of the two input tables
path_A = os.sep.join([datasets_dir, 'person_table_A.csv'])
path_B = os.sep.join([datasets_dir, 'person_table_B.csv'])

In [3]:

# Load table A, then table B, from CSV with 'ID' set as the key attribute of each
A, B = (
    em.read_csv_metadata(table_path, key='ID')
    for table_path in (path_A, path_B)
)

In [4]:

# Get the automatically generated features (for blocking); this is the table
# that the cells below extend with custom features.
feature_table = em.get_features_for_blocking(A, B)

# Alternative: get the automatically generated features (for matching) instead
# feature_table = em.get_features_for_matching(A, B)

Adding Features Declaratively¶

In [5]:

# Add a feature that computes Jaccard on name + address and add it to the feature table

# Create a feature declaratively: the feature string is an expression over
# ltuple/rtuple that may call the similarity functions in `sim` and the
# tokenizers in `tok`. Here both sides are lowercased and passed through `wspace`.
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), 
                            wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the feature to the feature table (returns True on success)
em.add_feature(feature_table, 'jac_ws_name_address', feature)

Out[5]:

True

In [6]:

# List the names of all features currently in the feature table
feature_table['feature_name']

Out[6]:

0                       ID_ID_lev_dist
1                        ID_ID_lev_sim
2                            ID_ID_jar
3                            ID_ID_jwn
4                            ID_ID_exm
5                ID_ID_jac_qgm_3_qgm_3
6            name_name_jac_qgm_3_qgm_3
7        name_name_cos_dlm_dc0_dlm_dc0
8        name_name_jac_dlm_dc0_dlm_dc0
9                        name_name_mel
10                  name_name_lev_dist
11                   name_name_lev_sim
12                       name_name_nmw
13                        name_name_sw
14           birth_year_birth_year_exm
15           birth_year_birth_year_anm
16      birth_year_birth_year_lev_dist
17       birth_year_birth_year_lev_sim
18         hourly_wage_hourly_wage_exm
19         hourly_wage_hourly_wage_anm
20    hourly_wage_hourly_wage_lev_dist
21     hourly_wage_hourly_wage_lev_sim
22                 zipcode_zipcode_exm
23                 zipcode_zipcode_anm
24            zipcode_zipcode_lev_dist
25             zipcode_zipcode_lev_sim
26                 jac_ws_name_address
Name: feature_name, dtype: object

In [7]:

import fuzzywuzzy.StringMatcher as fz

In [8]:

# Try the ratio function on two short strings that differ in one character
fz.ratio('xyz', 'ayz')

Out[8]:

0.6666666666666666

In [9]:

# Get a fresh dictionary of the built-in similarity functions; a custom
# function is registered in it in the next cell.
sim = em.get_sim_funs_for_matching()

In [10]:

# Register the ratio function under the key 'fz_ratio' so that a declarative
# feature string can refer to it by that name
sim['fz_ratio'] = fz.ratio

In [11]:

# Display the similarity-function dictionary to confirm 'fz_ratio' is present
sim

Out[11]:

{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,
 'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,
 'fz_ratio': <function Levenshtein._levenshtein.ratio>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,
 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}

In [12]:

# Create a feature declaratively using the custom 'fz_ratio' function.
# The lowercased name + address strings are passed to it directly, without a
# tokenizer. `tok` is the tokenizer dictionary obtained in an earlier cell.
feature_string = """fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), 
                            (rtuple['name'] + ' ' + rtuple['address']).lower())"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the feature to the feature table (returns True on success)
em.add_feature(feature_table, 'fzratio_name_address', feature)

Out[12]:

True

In [13]:

# List the feature names again; 'fzratio_name_address' should now be the last entry
feature_table['feature_name']

Out[13]:

0                       ID_ID_lev_dist
1                        ID_ID_lev_sim
2                            ID_ID_jar
3                            ID_ID_jwn
4                            ID_ID_exm
5                ID_ID_jac_qgm_3_qgm_3
6            name_name_jac_qgm_3_qgm_3
7        name_name_cos_dlm_dc0_dlm_dc0
8        name_name_jac_dlm_dc0_dlm_dc0
9                        name_name_mel
10                  name_name_lev_dist
11                   name_name_lev_sim
12                       name_name_nmw
13                        name_name_sw
14           birth_year_birth_year_exm
15           birth_year_birth_year_anm
16      birth_year_birth_year_lev_dist
17       birth_year_birth_year_lev_sim
18         hourly_wage_hourly_wage_exm
19         hourly_wage_hourly_wage_anm
20    hourly_wage_hourly_wage_lev_dist
21     hourly_wage_hourly_wage_lev_sim
22                 zipcode_zipcode_exm
23                 zipcode_zipcode_anm
24            zipcode_zipcode_lev_dist
25             zipcode_zipcode_lev_sim
26                 jac_ws_name_address
27                fzratio_name_address
Name: feature_name, dtype: object

Adding Blackbox Features¶

In [14]:

import fuzzywuzzy.StringMatcher as fz

In [15]:

def my_feature(ltuple, rtuple):
    """Black box feature: similarity ratio between the 'name' values of two tuples.

    Args:
        ltuple: A row from the left table; must support ``ltuple['name']``.
        rtuple: A row from the right table; must support ``rtuple['name']``.

    Returns:
        The score computed by ``fz.ratio`` for the two names.
    """
    # Return the similarity score itself rather than the pair of raw names,
    # so the feature produces a single value per tuple pair.
    return fz.ratio(ltuple['name'], rtuple['name'])

In [16]:

# Regenerate the blocking feature table from scratch; the features added
# declaratively in the previous section are not part of this new table.
feature_table = em.get_features_for_blocking(A, B)

In [17]:

# Show the documentation of add_blackbox_feature (arguments, return value, errors)
help(em.add_blackbox_feature)

Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures:

add_blackbox_feature(feature_table, feature_name, feature_function)
    Adds a black box feature to the feature table.
    
    Args:
        feature_table (DataFrame): The input DataFrame (typically a feature
            table) to which the feature must be added.
        feature_name (string): The name that should be given to the feature.
        feature_function (Python function): A Python function for the black box
            feature.
    
    Returns:
        A Boolean value of True is returned if the addition was successful.
    
    Raises:
        AssertionError: If the input `feature_table` is not of type
            DataFrame.
        AssertionError: If the input `feature_name` is not of type
            string.
        AssertionError: If the `feature_table` does not have necessary columns
            such as 'feature_name', 'left_attribute', 'right_attribute',
            'left_attr_tokenizer',
            'right_attr_tokenizer', 'simfunction', 'function', and
            'function_source' in the DataFrame.
        AssertionError: If the `feature_name` is already present in the
            feature table.
    
    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> def age_diff(ltuple, rtuple):
        >>>     # assume that the tuples have age attribute and values are valid numbers.
        >>>   return ltuple['age'] - rtuple['age']
        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)

In [18]:

# Add my_feature to the feature table as a black box feature named
# 'blackbox_fz_ratio_name' (returns True on success)
em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)

Out[18]:

True

In [19]:

# List the feature names; 'blackbox_fz_ratio_name' should now be the last entry
feature_table['feature_name']

Out[19]:

0                       ID_ID_lev_dist
1                        ID_ID_lev_sim
2                            ID_ID_jar
3                            ID_ID_jwn
4                            ID_ID_exm
5                ID_ID_jac_qgm_3_qgm_3
6            name_name_jac_qgm_3_qgm_3
7        name_name_cos_dlm_dc0_dlm_dc0
8        name_name_jac_dlm_dc0_dlm_dc0
9                        name_name_mel
10                  name_name_lev_dist
11                   name_name_lev_sim
12                       name_name_nmw
13                        name_name_sw
14           birth_year_birth_year_exm
15           birth_year_birth_year_anm
16      birth_year_birth_year_lev_dist
17       birth_year_birth_year_lev_sim
18         hourly_wage_hourly_wage_exm
19         hourly_wage_hourly_wage_anm
20    hourly_wage_hourly_wage_lev_dist
21     hourly_wage_hourly_wage_lev_sim
22                 zipcode_zipcode_exm
23                 zipcode_zipcode_anm
24            zipcode_zipcode_lev_dist
25             zipcode_zipcode_lev_sim
26              blackbox_fz_ratio_name
Name: feature_name, dtype: object