#!/usr/bin/env python
# coding: utf-8

# # Introduction
#
# This IPython notebook illustrates how to evaluate a learning-based matcher
# (a decision tree) on held-out labeled data. First, we need to import the
# py_entitymatching package and other libraries as follows:

# In[2]:

# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Set the seed value; it is reused for every random_state below so the
# train/test split and the matcher are reproducible from a single setting.
seed = 0

# In[3]:

# Get the datasets directory
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

path_A = os.path.join(datasets_dir, 'dblp_demo.csv')
path_B = os.path.join(datasets_dir, 'acm_demo.csv')
path_labeled_data = os.path.join(datasets_dir, 'labeled_data_demo.csv')

# In[4]:

A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Load the pre-labeled data
S = em.read_csv_metadata(path_labeled_data,
                         key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

# Then, split the labeled data into a development set and an evaluation set
# and convert them into feature vectors

# In[5]:

# Split S into I and J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=seed)
I = IJ['train']
J = IJ['test']

# In[6]:

# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# In[7]:

# Convert I into feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# # Compute accuracy of X (Decision Tree) on J

# It involves the following steps:
#
# 1. Train X using H
# 2. Convert J into a set of feature vectors (L)
# 3. Predict on L using X
# 4. Evaluate the predictions

# In[8]:

# Instantiate the matcher to evaluate.
dt = em.DTMatcher(name='DecisionTree', random_state=seed)

# In[9]:

# Columns that must not be used as features: the key columns and the label.
# The same list is passed to both fit() and predict() so the two calls
# always agree on which columns are excluded.
non_feature_attrs = ['_id', 'ltable_id', 'rtable_id', 'label']

# Train using feature vectors from I
dt.fit(table=H,
       exclude_attrs=non_feature_attrs,
       target_attr='label')

# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# Predict on L
predictions = dt.predict(table=L,
                         exclude_attrs=non_feature_attrs,
                         append=True,
                         target_attr='predicted',
                         inplace=False,
                         return_probs=True,
                         probs_attr='proba')

# In[10]:

# NOTE: this bare expression is rendered as cell output in a notebook; when
# the file is run as a plain script it has no visible effect.
predictions[['_id', 'ltable_id', 'rtable_id', 'predicted', 'proba']].head()

# In[11]:

# Evaluate the predictions by comparing the 'predicted' column with 'label'
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)