#!/usr/bin/env python
# coding: utf-8

# # Introduction
# 
# This IPython notebook illustrates how to select the best learning-based matcher. First, we need to import the py_entitymatching package and other libraries as follows:

# In[1]:

# Import the py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Set the seed value
seed = 0

# In[2]:

# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

# In[3]:

# List the contents of the datasets directory
get_ipython().system('ls $datasets_dir')

# In[4]:

A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Load the pre-labeled data
S = em.read_csv_metadata(path_labeled_data,
                         key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

# Then, we split the labeled data into a development set (I) and an evaluation set (J). We use the development set to select the best learning-based matcher.

# In[5]:

# Split S into I and J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

# # Selecting the best learning-based matcher
# 
# This typically involves the following steps:
# 1. Creating a set of learning-based matchers
# 2. Creating features
# 3. Extracting feature vectors
# 4. Selecting the best learning-based matcher using k-fold cross-validation
# 5. Debugging the matcher (and possibly repeating the above steps)

# ## Creating a set of learning-based matchers
# 
# First, we need to create a set of learning-based matchers. The following matchers are supported in Magellan: (1) decision tree, (2) random forest, (3) Naive Bayes, (4) SVM, (5) logistic regression, and (6) linear regression.

# In[6]:

# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

# ## Creating features
# 
# Next, we need to create a set of features for the development set. Magellan provides a way to automatically generate features based on the attributes in the input tables. For the purposes of this guide, we use the automatically generated features.

# In[7]:

# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# We observe that 20 features were generated. We can list their names as follows:

# In[8]:

F.feature_name

# ## Extracting feature vectors
# 
# In this step, we extract feature vectors using the development set and the created features.

# In[9]:

# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# In[10]:

# Display the first few rows
H.head()

# In[11]:

# Check whether the feature vectors contain missing values;
# a return value of True means that there are missing values
pd.isnull(H).values.any()

# We observe that the extracted feature vectors contain missing values. We have to impute the missing values for the learning-based matchers to fit the model correctly.
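# Before imputing, it can be useful to see which columns contain missing values and how many. The cell below is a minimal sketch using plain pandas (it is not part of the original guide); `isnull().sum()` counts the missing entries per column of H.

# In[12]:

# Count missing values per feature column (plain pandas)
H.isnull().sum()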
# For the purposes of this guide, we impute the missing values in a column with the mean of the values in that column.

# In[13]:

# Impute feature vectors with the mean of the column values
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# ## Selecting the best matcher using cross-validation
# 
# Now, we select the best matcher using k-fold cross-validation. For the purposes of this guide, we use five-fold cross-validation and the 'f1' metric to select the best matcher.

# In[14]:

# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           k=5,
                           target_attr='label',
                           metric_to_select_matcher='f1',
                           random_state=0)
result['cv_stats']

# In[15]:

result['drill_down_cv_stats']['precision']

# In[16]:

result['drill_down_cv_stats']['recall']

# In[17]:

result['drill_down_cv_stats']['f1']

# ### Debugging the matcher (random forest)
# 
# Next, we debug the matcher (here, the random forest) to see whether we can improve it.

# In[18]:

# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']

# In[19]:

# Debug the RF matcher using the GUI
em.vis_debug_rf(rf, P, Q,
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                target_attr='label')

# In[20]:

# Add a feature that computes Jaccard similarity over title + authors, and add it to F

# Create a feature declaratively
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['title'] + ' ' + ltuple['authors']).lower()),
                            wspace((rtuple['title'] + ' ' + rtuple['authors']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the feature to F
em.add_feature(F, 'jac_ws_title_authors', feature)

# In[21]:

# Convert I into feature vectors using the updated F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# Impute the missing values again, since H was re-extracted
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# In[22]:

# Check whether the updated F improves the random forest matcher
result = em.select_matcher([rf], table=H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           k=5,
                           target_attr='label',
                           metric_to_select_matcher='f1',
                           random_state=0)
result['drill_down_cv_stats']['f1']

# In[23]:

# Select the best matcher again using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           k=5,
                           target_attr='label',
                           metric_to_select_matcher='f1',
                           random_state=0)
result['cv_stats']

# In[24]:

result['drill_down_cv_stats']['f1']
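# ## Applying the selected matcher to the evaluation set
# 
# As a natural next step, we can train the selected matcher on the development set and evaluate it on the held-out evaluation set J. The cell below is a minimal sketch, assuming that `select_matcher` exposes the winning matcher under the `selected_matcher` key (as in recent py_entitymatching versions); it uses the `fit`, `predict`, and `eval_matches` APIs in the same way as the other Magellan guides.

# In[25]:

# Convert the evaluation set J into feature vectors using F, and
# impute missing values the same way as for the development set
L = em.extract_feature_vecs(J,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)
L = em.impute_table(L,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# Train the selected matcher on H and predict on L
best = result['selected_matcher']  # assumed key; see note above
best.fit(table=H,
         exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
         target_attr='label')
predictions = best.predict(table=L,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           append=True,
                           target_attr='predicted',
                           inplace=False)

# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)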