#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to refine the results of matching using triggers.
#
# First, we need to import the py_entitymatching package and other libraries as follows:

# In[2]:

# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Then, read the (sample) input tables for matching purposes.

# In[3]:

# Get the datasets directory shipped with py_entitymatching
datasets_dir = em.get_install_path() + os.sep + 'datasets'

path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

# In[5]:

A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Load the pre-labeled data, wiring up its foreign keys to tables A and B
S = em.read_csv_metadata(path_labeled_data,
                         key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_id', fk_rtable='rtable_id')
S.head()

# # Use a ML Matcher to get Predictions
#
# Here we will purposely create a decision tree matcher that does not take the several
# features into account to show later how triggers can be used to refine the model.

# In[6]:

# Split S into I and J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

# In[7]:

# Create a Decision Tree Matcher
dt = em.DTMatcher(name='DecisionTree', random_state=0)

# In[8]:

# Generate a set of features
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
feature_table

# In[9]:

# We will remove many of the features here to purposely create a poor model.
# This will make it easier to demonstrate triggers later.
F = feature_table.drop([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
F

# In[10]:

# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)
H.head()

# In[11]:

# Impute feature vectors with the mean of the column values.
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# In[12]:

# Fit the decision tree to the feature vectors
dt.fit(table=H,
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
       target_attr='label')

# In[13]:

# Use the decision tree matcher to predict if tuple pairs match
dt.predict(table=H,
           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
           target_attr='predicted_labels',
           return_probs=True,
           probs_attr='proba',
           append=True,
           inplace=True)
H.head()

# # Debug the ML Matcher
#
# Now we will use the debugger to determine what problems exist with our decision tree matcher.

# In[14]:

# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']

# In[15]:

# Debug the decision tree matcher using the GUI
em.vis_debug_dt(dt, P, Q,
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                target_attr='label')

# We see with the debugger that the false negatives have completely different values in
# the Title attribute. This is most likely because we removed all of the features that
# compare the Title attribute from each table earlier.

# In[16]:

# We can see which tuples are not predicted correctly
H[H['label'] != H['predicted_labels']]

# # Using Triggers to Improve Results
#
# This typically involves the following steps:
# 1. Creating the match trigger
# 2. Adding rules
# 3. Adding a condition status and action
# 4. Using the trigger to improve results
#
# ## Creating the Match Trigger

# In[17]:

# Use the constructor to create a trigger
mt = em.MatchTrigger()

# ## Adding Rules
#
# Before we can use the rule-based matcher, we need to create rules to evaluate tuple
# pairs. Each rule is a list of strings. Each string specifies a conjunction of
# predicates. Each predicate has three parts: (1) an expression, (2) a comparison
# operator, and (3) a value. The expression is evaluated over a tuple pair, producing
# a numeric value.
# In[18]:

# Add two rules to the rule-based matcher.
# Since we removed all of the features comparing Title earlier, we now add a rule
# that compares Titles.
mt.add_cond_rule(['title_title_lev_sim(ltuple, rtuple) > 0.7'], feature_table)
# The second rule has two predicates, one comparing the titles and the other looking
# for an exact match of the years.
mt.add_cond_rule(['title_title_lev_sim(ltuple, rtuple) > 0.4',
                  'year_year_exm(ltuple, rtuple) == 1'],
                 feature_table)

mt.get_rule_names()

# In[19]:

# Rules can also be deleted from the rule-based matcher
mt.delete_rule('_rule_1')

# ## Adding a Condition Status and Action
#
# Next, we need to add a condition status and an action to the trigger. Triggers apply
# the rules added to each tuple pair. If the result is the same value as the condition
# status, then the action will be carried out.

# In[20]:

# Since we are using the trigger to fix a problem related to false negatives, we want
# the condition to be True and the action to be 1. This way, the trigger will set a
# prediction to 1 when the rule returns True.
mt.add_cond_status(True)
mt.add_action(1)

# ## Using the Trigger to Improve Results
#
# Now that we have added rules, a condition status, and an action, we can execute the
# trigger to improve results.

# In[21]:

preds = mt.execute(input_table=H, label_column='predicted_labels', inplace=False)
preds.head()

# In[22]:

# We were able to significantly reduce the number of incorrectly labeled tuple pairs
preds[preds['label'] != preds['predicted_labels']]

# In[23]:

# We can see that the two tuples that are still labeled incorrectly are due to the
# title and authors being in the wrong column for one of the tuples.
pd.concat([S[S['_id'] == 11], S[S['_id'] == 267]])

# In[ ]: