#!/usr/bin/env python
# coding: utf-8
# # Get to know pyJedAI
#
# In this notebook we present the pyJedAI approach. pyJedAI is an upcoming, end-to-end Python framework for Entity Resolution that aims to serve as a reference implementation for the field. It is easy to use and highly optimized, as it builds on established Python libraries (e.g. pandas, NetworkX), allowing it to outperform other state-of-the-art ER frameworks.
# # Install
#
# pyJedAI is an open-source library that can be installed from PyPI.
# In[ ]:
# Notebook cell magics exported to script form: install the latest pyjedai
# from PyPI, then show the installed version for reproducibility.
get_ipython().run_line_magic('pip', 'install pyjedai -U')
# In[ ]:
get_ipython().run_line_magic('pip', 'show pyjedai')
# # Reading the dataset - Clean-Clean ER example
#
# To operate, pyJedAI only needs the initial data to be transformed into a pandas DataFrame. Hence, pyJedAI can work with any structured or semi-structured data. In this case the Abt-Buy dataset is provided as .csv files.
#
#
#

#
#
#
# ## pyjedai module
#
# Data module offers a number of options
# - Selecting the parameters (columns) of the dataframe, in D1 (and in D2)
# - Prints a detailed text analysis
# - Stores a hidden mapping of the ids, creating it if it does not exist.
#
# In[14]:
import pandas as pd
from pyjedai.datamodel import Data

# Every CSV of the Abt-Buy dataset is pipe-delimited; use the python engine
# so the '|' separator is handled uniformly across files.
_csv_options = dict(sep='|', engine='python')

# Entity records: keep empty fields as "" (na_filter=False) and force every
# column to string dtype, since matching works on textual attributes.
d1 = pd.read_csv("./../data/ccer/D2/abt.csv", na_filter=False, **_csv_options).astype(str)
d2 = pd.read_csv("./../data/ccer/D2/buy.csv", na_filter=False, **_csv_options).astype(str)

# Ground-truth file of known matching pairs between the two sources.
gt = pd.read_csv("./../data/ccer/D2/gt.csv", **_csv_options)

# Bundle both sources and the ground truth into pyJedAI's Data container
# (Clean-Clean ER: two duplicate-free sources matched against each other).
data = Data(
    dataset_1=d1,
    attributes_1=['id', 'name', 'description'],
    id_column_name_1='id',
    dataset_2=d2,
    attributes_2=['id', 'name', 'description'],
    id_column_name_2='id',
    ground_truth=gt,
)
# In[15]:
# Print a textual summary of both datasets and the ground truth.
data.print_specs()
# In[16]:
# Peek at the first records of each source (all values were cast to str on load).
data.dataset_1.head(2)
# In[17]:
data.dataset_2.head(2)
# In[18]:
# Each ground-truth row links an id from D1 with its matching id in D2.
data.ground_truth.head(2)
# # Creating workflow using pyJedAI methods
#
# Multiple algorithms, techniques and features have already been implemented. This way, we can import the method and proceed to the workflow architecture.
#
# For example, we demonstrate a variety of algorithms for each step, as shown in the cell below.
# In[19]:
from pyjedai.workflow import BlockingBasedWorkFlow, compare_workflows
from pyjedai.block_building import (
StandardBlocking, QGramsBlocking, ExtendedQGramsBlocking,
SuffixArraysBlocking, ExtendedSuffixArraysBlocking
)
from pyjedai.block_cleaning import BlockFiltering, BlockPurging
from pyjedai.comparison_cleaning import (
WeightedEdgePruning, WeightedNodePruning, CardinalityEdgePruning,
CardinalityNodePruning, BLAST, ReciprocalCardinalityNodePruning,
ReciprocalWeightedNodePruning, ComparisonPropagation
)
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering
# ## Building a simple WorkFlow
#
# The main workflow that pyjedai supports, consists of 8 steps:
#
# - __Data Reading:__ Raw data in pandas format are transformed into the pyJedAI data model.
# - __Block Building__: In this step we create blocks of entities based on some tokenization techniques like QGrams, SuffixArray, etc.
# - __Block Filtering, Block Purging, Cardinality Edge Pruning__: These methods reduce the number of comparisons by removing entities from the blocks or producing a much denser index.
# - __Entity Matching__: The similarity-checking phase. Each block is now tested for similarity, meaning that all entities contained in the block are measured for similarity using a metric like Jaccard.
# - __Connected Components Clustering__: Creates clusters of similarity based on the graph produced from the entity matching step.
# - __Results__: Finally, pyJedAI produces a set of pairs that appear to be duplicates, along with scores and visualizations that help users understand the workflow's performance.
#
# For this demo, we created a simple architecture as shown below:
#
# 
# In[20]:
# Declarative spec of a single blocking-based ER workflow; the steps are
# executed in order by w.run(data).
w = BlockingBasedWorkFlow(
    # Step 1 — Block Building: group entities into blocks keyed on 3-grams.
    block_building = dict(
        method=QGramsBlocking,
        params=dict(qgrams=3)
    ),
    # Step 2 — Block Cleaning: purge oversized blocks first, then filter
    # blocks per entity (ratio=0.8 — see pyJedAI docs for exact semantics).
    block_cleaning = [
        dict(
            method=BlockPurging,
            params=dict(smoothing_factor=1.025)
        ),
        dict(
            method=BlockFiltering,
            params=dict(ratio=0.8)
        )
    ],
    # Step 3 — Comparison Cleaning: prune low-value comparisons.
    comparison_cleaning = dict(method=CardinalityEdgePruning),
    # Step 4 — Entity Matching: Dice similarity over the listed attributes;
    # pairs scoring below 0.5 are discarded.
    entity_matching = dict(
        method=EntityMatching,
        metric='dice',
        similarity_threshold=0.5,
        attributes = ['description', 'name']
    ),
    # Step 5 — Clustering: connected components of the similarity graph.
    clustering = dict(method=ConnectedComponentsClustering),
    # Fixed typo: was "Worflow-QGramsBlocking"; now matches the
    # "Workflow-" prefix used by the multi-workflow comparison below.
    name="Workflow-QGramsBlocking"
)
# # Evaluation and detailed reporting
# In[21]:
# Execute every step of the workflow on the loaded data; the tqdm flag shows
# progress bars, verbose=False suppresses per-step logging.
w.run(data, workflow_tqdm_enable=True, verbose=False)
# In[22]:
# Detailed evaluation report of the executed workflow (tabular form).
w.to_df()
# # Visualization
# In[23]:
# Plot the workflow's performance in a single figure.
w.visualize()
# In[24]:
# Same visualization, split into separate figures.
w.visualize(separate=True)
# # Multiple workflows
#
# pyJedAI provides methods for comparing multiple workflows. For example, we can test the above example with all the Block Building methods provided.
#
# In[25]:
# Build and run one workflow per block-building method, so the comparison in
# the next cell isolates the effect of the blocking step.
block_building_methods = [StandardBlocking, QGramsBlocking, ExtendedQGramsBlocking, SuffixArraysBlocking, ExtendedSuffixArraysBlocking]
workflows = []
# NOTE: the loop body below had lost its indentation in the notebook export,
# which made the script invalid; restored so each workflow is built AND run
# once per iteration (compare_workflows needs all of them executed).
for bbm in block_building_methods:
    workflows.append(BlockingBasedWorkFlow(
        block_building=dict(
            method=bbm,  # each method is used with its default parameters
        ),
        # NOTE(review): cleaning order (Filtering before Purging) and the
        # metric ('sorensen_dice' vs 'dice') differ from the single-workflow
        # example above — confirm this is intentional before comparing.
        block_cleaning=[
            dict(
                method=BlockFiltering,
                params=dict(ratio=0.8)
            ),
            dict(
                method=BlockPurging,
                params=dict(smoothing_factor=1.025)
            )
        ],
        comparison_cleaning=dict(method=CardinalityEdgePruning),
        entity_matching=dict(
            method=EntityMatching,
            metric='sorensen_dice',
            similarity_threshold=0.5,
            attributes=['description', 'name']
        ),
        clustering=dict(method=ConnectedComponentsClustering),
        # __name__ is already a str; no str() wrapper needed.
        name="Workflow-" + bbm.__name__
    ))
    # Run each workflow as soon as it is built.
    workflows[-1].run(data, workflow_tqdm_enable=True)
# In[26]:
# Side-by-side evaluation of all executed workflows, with plots.
compare_workflows(workflows, with_visualization=True)
#
#
# K. Nikoletos, J. Maciejewski, G. Papadakis & M. Koubarakis
#
#