#!/usr/bin/env python
# coding: utf-8

# # Introduction
# 
# This IPython notebook illustrates how to select the best learning-based matcher. First, we need to import the py_entitymatching package and other libraries as follows:

# In[1]:

# Import the py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

# Set the seed value
seed = 0

# In[2]:

# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

# In[3]:

# List the contents of the datasets directory
get_ipython().system('ls $datasets_dir')

# In[4]:

A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Load the pre-labeled data
S = em.read_csv_metadata(path_labeled_data,
                         key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

# Then, we split the labeled data into a development set (I) and an evaluation set (J). We use the development set to select the best learning-based matcher.

# In[5]:

# Split S into I and J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

# # Selecting the best learning-based matcher
# 
# This typically involves the following steps:
# 1. Creating a set of learning-based matchers
# 2. Creating features
# 3. Extracting feature vectors
# 4. Selecting the best learning-based matcher using k-fold cross-validation
# 5. Debugging the matcher (and possibly repeating the above steps)

# ## Creating a set of learning-based matchers
# 
# First, we need to create a set of learning-based matchers. The following matchers are supported in Magellan: (1) decision tree, (2) random forest, (3) Naive Bayes, (4) SVM, (5) logistic regression, and (6) linear regression.

# In[6]:

# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')

# ## Creating features
# 
# Next, we need to create a set of features for the development set. Magellan provides a way to automatically generate features based on the attributes in the input tables. For the purposes of this guide, we use the automatically generated features.

# In[7]:

# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# We observe that 20 features were generated. We can list their names as follows:

# In[8]:

F.feature_name

# ## Extracting feature vectors
# 
# In this step, we extract feature vectors using the development set and the created features.

# In[9]:

# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# In[10]:

# Display the first few rows
H.head()

# In[11]:

# Check whether the feature vectors contain missing values;
# a return value of True means that there are missing values
pd.isnull(H).values.any()

# We observe that the extracted feature vectors contain missing values. We have to impute the missing values for the learning-based matchers to fit the model correctly.
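# Before imputing, it can be useful to see which columns contain missing values and how many. The cell below is a minimal sketch using plain pandas (it is not part of the original guide); `isnull().sum()` counts the missing entries per column of H.

# In[12]:

# Count missing values per feature column (plain pandas)
H.isnull().sum()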
# For the purposes of this guide, we impute the missing values in a column with the mean of the values in that column.

# In[13]:

# Impute feature vectors with the mean of the column values
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# ## Selecting the best matcher using cross-validation
# 
# Now, we select the best matcher using k-fold cross-validation. For the purposes of this guide, we use five-fold cross-validation and the 'f1' metric to select the best matcher.

# In[14]:

# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           k=5,
                           target_attr='label',
                           metric_to_select_matcher='f1',
                           random_state=0)
result['cv_stats']

# In[15]:

result['drill_down_cv_stats']['precision']

# In[16]:

result['drill_down_cv_stats']['recall']

# In[17]:

result['drill_down_cv_stats']['f1']

# ### Debugging the matcher (random forest)
# 
# Next, we debug the matcher (here, the random forest) to see whether we can improve it.

# In[18]:

# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']

# In[19]:

# Debug the RF matcher using the GUI
em.vis_debug_rf(rf, P, Q,
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                target_attr='label')

# In[20]:

# Add a feature that computes Jaccard similarity over title + authors, and add it to F

# Create a feature declaratively
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['title'] + ' ' + ltuple['authors']).lower()),
                            wspace((rtuple['title'] + ' ' + rtuple['authors']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add the feature to F
em.add_feature(F, 'jac_ws_title_authors', feature)

# In[21]:

# Convert I into feature vectors using the updated F
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)

# Impute the missing values again, since H was re-extracted
H = em.impute_table(H,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# In[22]:

# Check whether the updated F improves the random forest matcher
result = em.select_matcher([rf], table=H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           k=5,
                           target_attr='label',
                           metric_to_select_matcher='f1',
                           random_state=0)
result['drill_down_cv_stats']['f1']

# In[23]:

# Select the best matcher again using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           k=5,
                           target_attr='label',
                           metric_to_select_matcher='f1',
                           random_state=0)
result['cv_stats']

# In[24]:

result['drill_down_cv_stats']['f1']
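# ## Applying the selected matcher to the evaluation set
# 
# As a natural next step, we can train the selected matcher on the development set and evaluate it on the held-out evaluation set J. The cell below is a minimal sketch, assuming that `select_matcher` exposes the winning matcher under the `selected_matcher` key (as in recent py_entitymatching versions); it uses the `fit`, `predict`, and `eval_matches` APIs in the same way as the other Magellan guides.

# In[25]:

# Convert the evaluation set J into feature vectors using F, and
# impute missing values the same way as for the development set
L = em.extract_feature_vecs(J,
                            feature_table=F,
                            attrs_after='label',
                            show_progress=False)
L = em.impute_table(L,
                    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                    strategy='mean')

# Train the selected matcher on H and predict on L
best = result['selected_matcher']  # assumed key; see note above
best.fit(table=H,
         exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
         target_attr='label')
predictions = best.predict(table=L,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                           append=True,
                           target_attr='predicted',
                           inplace=False)

# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)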