#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# Notebook export showing how to extend a py_entitymatching feature table,
# first with declaratively specified features and then with a blackbox
# (plain Python function) feature.

# In[1]:

# Import py_entitymatching package
import os

import fuzzywuzzy.StringMatcher as fz
import pandas as pd
import py_entitymatching as em

# Then, read the (sample) input tables for blocking purposes

# In[2]:

# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

# In[3]:

# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

# In[4]:

# Get features (for blocking)
feature_table = em.get_features_for_blocking(A, B)

# NOTE: for the matching stage, generate the table with
# em.get_features_for_matching(A, B) instead.

# # Adding Features Declaratively

# In[5]:

# Add a feature that computes Jaccard similarity over the whitespace tokens
# of the lower-cased "name + address" string of each tuple.

# Create a feature declaratively
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()

feature_string = (
    "jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()), "
    "wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"
)
feature = em.get_feature_fn(feature_string, sim, tok)

# Add feature to F
em.add_feature(feature_table, 'jac_ws_name_address', feature)

# In[6]:

# Bare expression: only displays a value when run as a notebook cell.
feature_table.feature_name

# In[7]:

# (fuzzywuzzy.StringMatcher is imported as `fz` at the top of the file.)

# In[8]:

# Quick sanity check of the similarity function on two short strings.
fz.ratio('xyz', 'ayz')

# In[9]:

# Create a feature declaratively
sim = em.get_sim_funs_for_matching()

# In[10]:

# Register the external function under the name used in the feature string.
sim['fz_ratio'] = fz.ratio

# In[11]:

sim

# In[12]:

feature_string = (
    "fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(), "
    "(rtuple['name'] + ' ' + rtuple['address']).lower())"
)
feature = em.get_feature_fn(feature_string, sim, tok)

# Add feature to F
em.add_feature(feature_table, 'fzratio_name_address', feature)

# In[13]:

feature_table.feature_name

# # Adding Blackbox Features

# In[14]:

# (fuzzywuzzy.StringMatcher is imported as `fz` at the top of the file.)

# In[15]:


def my_feature(ltuple, rtuple):
    """Return the fz.ratio similarity of the two tuples' 'name' values.

    Args:
        ltuple: Row from the left table; must support ``ltuple['name']``.
        rtuple: Row from the right table; must support ``rtuple['name']``.

    Returns:
        The value of ``fz.ratio`` applied to the two names.
    """
    # A blackbox feature has to yield a score for the pair; the previous
    # version returned the raw (left name, right name) tuple instead.
    # NOTE(review): assumes both names are non-missing strings - confirm
    # how missing values should be scored.
    return fz.ratio(ltuple['name'], rtuple['name'])


# In[16]:

# Start again from a fresh, automatically generated feature table.
feature_table = em.get_features_for_blocking(A, B)

# In[17]:

help(em.add_blackbox_feature)

# In[18]:

em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)

# In[19]:

feature_table.feature_name