Introduction¶

In [1]:

# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes

In [2]:

# Locate the sample datasets directory inside the py_entitymatching installation
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables. os.path.join inserts the
# platform-specific separator, so no manual os.sep concatenation is needed.
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

In [3]:

# Load both CSV files (A first, then B), registering 'ID' as the key attribute of each
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))

In [4]:

# Calling get_features_for_blocking or get_features_for_matching without setting the flag
# validate_inferred_attr_types to False triggers a validation step that shows the
# user a table of the inferred attribute correspondences and the inferred types.

# Get features (for blocking)
feature_table = em.get_features_for_blocking(A, B)

# Get features (for matching)
# feature_table = em.get_features_for_matching(A, B)

The table shows the corresponding attributes along with their respective types. Please confirm that the information  has been correctly inferred. If you would like to skip this validation process in the future, please set the flag    validate_inferred_attr_types equal to false.

	Left Attribute	Right Attribute	Left Attribute Type	Right Attribute Type	Example Features
0	ID	ID	short string (1 word)	short string (1 word)	Levenshtein Distance; Levenshtein Similarity
1	name	name	short string (1 word to 5 words)	short string (1 word to 5 words)	Jaccard Similarity [3-grams, 3-grams]; Cosine Similarity [Space Delimiter, Space Delimiter]
2	birth_year	birth_year	numeric	numeric	Exact Match; Absolute Norm
3	hourly_wage	hourly_wage	numeric	numeric	Exact Match; Absolute Norm
4	address	address	short string (1 word to 5 words)	medium string (5 words to 10 words)	Not Applicable: Types do not match
5	zipcode	zipcode	numeric	numeric	Exact Match; Absolute Norm

Do you want to proceed? (y/n):y

Adding Features Declaratively¶

In [5]:

# Add a feature that computes Jaccard similarity over name + address (lower-cased,
# whitespace-tokenized) and add it to the feature table

# Create a feature declaratively: the feature string is an expression over
# ltuple/rtuple that refers to similarity functions and tokenizers by name
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), 
                            wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the feature to the feature table under the name 'jac_ws_name_address'
em.add_feature(feature_table, 'jac_ws_name_address', feature)

Out[5]:

True

In [6]:

# List the feature names; the newly added 'jac_ws_name_address' should appear last
feature_table.feature_name

Out[6]:

0                       ID_ID_lev_dist
1                        ID_ID_lev_sim
2                            ID_ID_jar
3                            ID_ID_jwn
4                            ID_ID_exm
5                ID_ID_jac_qgm_3_qgm_3
6            name_name_jac_qgm_3_qgm_3
7        name_name_cos_dlm_dc0_dlm_dc0
8        name_name_jac_dlm_dc0_dlm_dc0
9                        name_name_mel
10                  name_name_lev_dist
11                   name_name_lev_sim
12                       name_name_nmw
13                        name_name_sw
14           birth_year_birth_year_exm
15           birth_year_birth_year_anm
16      birth_year_birth_year_lev_dist
17       birth_year_birth_year_lev_sim
18         hourly_wage_hourly_wage_exm
19         hourly_wage_hourly_wage_anm
20    hourly_wage_hourly_wage_lev_dist
21     hourly_wage_hourly_wage_lev_sim
22                 zipcode_zipcode_exm
23                 zipcode_zipcode_anm
24            zipcode_zipcode_lev_dist
25             zipcode_zipcode_lev_sim
26                 jac_ws_name_address
Name: feature_name, dtype: object

In [7]:

import fuzzywuzzy.StringMatcher as fz

In [8]:

# Quick check of the ratio function on two 3-letter strings that differ in one letter
fz.ratio('xyz', 'ayz')

Out[8]:

0.6666666666666666

In [9]:

# Get the dictionary of similarity functions that a feature string can refer to by name
sim = em.get_sim_funs_for_matching()

In [10]:

# Register the ratio function under the name 'fz_ratio' so that a feature string can call it
sim['fz_ratio'] = fz.ratio

In [11]:

# Inspect the similarity functions; the custom 'fz_ratio' entry should now be listed
sim

Out[11]:

{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,
 'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,
 'fz_ratio': <function Levenshtein._levenshtein.ratio>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,
 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}

In [12]:

# Declare a feature that applies the custom 'fz_ratio' function to the lower-cased
# concatenation of name and address. No tokenizer is used inside the expression:
# the function receives the two strings directly.
feature_string = """fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), 
                            (rtuple['name'] + ' ' + rtuple['address']).lower())"""
# `sim` must be the dictionary that contains the 'fz_ratio' entry
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the feature to the feature table under the name 'fzratio_name_address'
em.add_feature(feature_table, 'fzratio_name_address', feature)

Out[12]:

True

In [13]:

# List the feature names again; 'fzratio_name_address' should now appear last
feature_table.feature_name

Out[13]:

0                       ID_ID_lev_dist
1                        ID_ID_lev_sim
2                            ID_ID_jar
3                            ID_ID_jwn
4                            ID_ID_exm
5                ID_ID_jac_qgm_3_qgm_3
6            name_name_jac_qgm_3_qgm_3
7        name_name_cos_dlm_dc0_dlm_dc0
8        name_name_jac_dlm_dc0_dlm_dc0
9                        name_name_mel
10                  name_name_lev_dist
11                   name_name_lev_sim
12                       name_name_nmw
13                        name_name_sw
14           birth_year_birth_year_exm
15           birth_year_birth_year_anm
16      birth_year_birth_year_lev_dist
17       birth_year_birth_year_lev_sim
18         hourly_wage_hourly_wage_exm
19         hourly_wage_hourly_wage_anm
20    hourly_wage_hourly_wage_lev_dist
21     hourly_wage_hourly_wage_lev_sim
22                 zipcode_zipcode_exm
23                 zipcode_zipcode_anm
24            zipcode_zipcode_lev_dist
25             zipcode_zipcode_lev_sim
26                 jac_ws_name_address
27                fzratio_name_address
Name: feature_name, dtype: object

Adding Blackbox Features¶

In [14]:

import fuzzywuzzy.StringMatcher as fz

In [15]:

def my_feature(ltuple, rtuple):
    """Blackbox feature: fuzzy-match ratio between the 'name' values of two tuples.

    Args:
        ltuple: A row from the left table; must provide a 'name' entry.
        rtuple: A row from the right table; must provide a 'name' entry.

    Returns:
        The result of fz.ratio applied to the left and right names.
    """
    # A feature function has to yield a single similarity score, so the two
    # names are passed to fz.ratio rather than being returned as a raw pair.
    return fz.ratio(ltuple['name'], rtuple['name'])

In [16]:

# Rebuild the feature table from scratch; this discards the features that were
# added declaratively to the previous feature_table
feature_table = em.get_features_for_blocking(A, B)

In [17]:

# Show the documentation of add_blackbox_feature (arguments, return value, errors)
help(em.add_blackbox_feature)

Help on function add_blackbox_feature in module py_entitymatching.feature.addfeatures:

add_blackbox_feature(feature_table, feature_name, feature_function)
    Adds a black box feature to the feature table.
    
    Args:
        feature_table (DataFrame): The input DataFrame (typically a feature
            table) to which the feature must be added.
        feature_name (string): The name that should be given to the feature.
        feature_function (Python function): A Python function for the black box
            feature.
    
    Returns:
        A Boolean value of True is returned if the addition was successful.
    
    Raises:
        AssertionError: If the input `feature_table` is not of type
            DataFrame.
        AssertionError: If the input `feature_name` is not of type
            string.
        AssertionError: If the `feature_table` does not have necessary columns
            such as 'feature_name', 'left_attribute', 'right_attribute',
            'left_attr_tokenizer',
            'right_attr_tokenizer', 'simfunction', 'function', and
            'function_source' in the DataFrame.
        AssertionError: If the `feature_name` is already present in the
            feature table.
    
    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> def age_diff(ltuple, rtuple):
        >>>     # assume that the tuples have age attribute and values are valid numbers.
        >>>   return ltuple['age'] - rtuple['age']
        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)

In [18]:

# Register my_feature as a blackbox feature named 'blackbox_fz_ratio_name'
em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)

Out[18]:

True

In [19]:

# List the feature names; 'blackbox_fz_ratio_name' should appear last
feature_table.feature_name

Out[19]:

0                       ID_ID_lev_dist
1                        ID_ID_lev_sim
2                            ID_ID_jar
3                            ID_ID_jwn
4                            ID_ID_exm
5                ID_ID_jac_qgm_3_qgm_3
6            name_name_jac_qgm_3_qgm_3
7        name_name_cos_dlm_dc0_dlm_dc0
8        name_name_jac_dlm_dc0_dlm_dc0
9                        name_name_mel
10                  name_name_lev_dist
11                   name_name_lev_sim
12                       name_name_nmw
13                        name_name_sw
14           birth_year_birth_year_exm
15           birth_year_birth_year_anm
16      birth_year_birth_year_lev_dist
17       birth_year_birth_year_lev_sim
18         hourly_wage_hourly_wage_exm
19         hourly_wage_hourly_wage_anm
20    hourly_wage_hourly_wage_lev_dist
21     hourly_wage_hourly_wage_lev_sim
22                 zipcode_zipcode_exm
23                 zipcode_zipcode_anm
24            zipcode_zipcode_lev_dist
25             zipcode_zipcode_lev_sim
26              blackbox_fz_ratio_name
Name: feature_name, dtype: object