This IPython notebook illustrates how to remove features from a feature table. First, we need to import the py_entitymatching package and other libraries as follows:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join is used instead of manual os.sep concatenation so that
# separators are handled by the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

# Read the CSV files and set 'ID' as the key attribute of each table
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
# Generate the feature table for blocking from tables A and B
# (validation of the inferred attribute types is switched off)
feature_table = em.get_features_for_blocking(
    A,
    B,
    validate_inferred_attr_types=False,
)

# Alternative: generate the feature table for matching instead
# feature_table = em.get_features_for_matching(A, B)

# Show the Python type of the feature table
type(feature_table)
pandas.core.frame.DataFrame
# Preview the first five rows of the feature table
feature_table.head(n=5)
| | feature_name | left_attribute | right_attribute | left_attr_tokenizer | right_attr_tokenizer | simfunction | function | function_source | is_auto_generated |
---|---|---|---|---|---|---|---|---|---|
0 | ID_ID_lev_dist | ID | ID | None | None | lev_dist | <function ID_ID_lev_dist at 0x10b5987b8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
1 | ID_ID_lev_sim | ID | ID | None | None | lev_sim | <function ID_ID_lev_sim at 0x10f9b0620> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
2 | ID_ID_jar | ID | ID | None | None | jaro | <function ID_ID_jar at 0x10f9b0950> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
3 | ID_ID_jwn | ID | ID | None | None | jaro_winkler | <function ID_ID_jwn at 0x10f9b09d8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
4 | ID_ID_exm | ID | ID | None | None | exact_match | <function ID_ID_exm at 0x10f9b08c8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
# Remove the feature whose row label is 0 (the first row of the table)
feature_table = feature_table.drop(0, axis=0)
# Preview the first five remaining rows
feature_table.head(n=5)
| | feature_name | left_attribute | right_attribute | left_attr_tokenizer | right_attr_tokenizer | simfunction | function | function_source | is_auto_generated |
---|---|---|---|---|---|---|---|---|---|
1 | ID_ID_lev_sim | ID | ID | None | None | lev_sim | <function ID_ID_lev_sim at 0x10f9b0620> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
2 | ID_ID_jar | ID | ID | None | None | jaro | <function ID_ID_jar at 0x10f9b0950> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
3 | ID_ID_jwn | ID | ID | None | None | jaro_winkler | <function ID_ID_jwn at 0x10f9b09d8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
4 | ID_ID_exm | ID | ID | None | None | exact_match | <function ID_ID_exm at 0x10f9b08c8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
5 | ID_ID_jac_qgm_3_qgm_3 | ID | ID | qgm_3 | qgm_3 | jaccard | <function ID_ID_jac_qgm_3_qgm_3 at 0x10f9b0a60> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
# Keep only the features whose left attribute is 'name'; every other
# feature is removed from the table
feature_table = feature_table.loc[feature_table['left_attribute'] == 'name']
feature_table
| | feature_name | left_attribute | right_attribute | left_attr_tokenizer | right_attr_tokenizer | simfunction | function | function_source | is_auto_generated |
---|---|---|---|---|---|---|---|---|---|
6 | name_name_jac_qgm_3_qgm_3 | name | name | qgm_3 | qgm_3 | jaccard | <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
7 | name_name_cos_dlm_dc0_dlm_dc0 | name | name | dlm_dc0 | dlm_dc0 | cosine | <function name_name_cos_dlm_dc0_dlm_dc0 at 0x10f9b0b70> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
8 | name_name_jac_dlm_dc0_dlm_dc0 | name | name | dlm_dc0 | dlm_dc0 | jaccard | <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
9 | name_name_mel | name | name | None | None | monge_elkan | <function name_name_mel at 0x10f9b0c80> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
10 | name_name_lev_dist | name | name | None | None | lev_dist | <function name_name_lev_dist at 0x10f9b0d08> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
11 | name_name_lev_sim | name | name | None | None | lev_sim | <function name_name_lev_sim at 0x10f9b0d90> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
12 | name_name_nmw | name | name | None | None | needleman_wunsch | <function name_name_nmw at 0x10f9b0e18> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
13 | name_name_sw | name | name | None | None | smith_waterman | <function name_name_sw at 0x10f9b0ea0> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
# Keep only the features whose similarity function is 'jaccard'; every
# other feature is removed from the table
feature_table = feature_table.loc[feature_table['simfunction'] == 'jaccard']
feature_table
| | feature_name | left_attribute | right_attribute | left_attr_tokenizer | right_attr_tokenizer | simfunction | function | function_source | is_auto_generated |
---|---|---|---|---|---|---|---|---|---|
6 | name_name_jac_qgm_3_qgm_3 | name | name | qgm_3 | qgm_3 | jaccard | <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |
8 | name_name_jac_dlm_dc0_dlm_dc0 | name | name | dlm_dc0 | dlm_dc0 | jaccard | <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8> | from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... | True |