#!/usr/bin/env python
# coding: utf-8

# # User-Friendly Wrapper
#
# In this notebook we present a __user-friendly approach__ on the well-known
# ABT-BUY dataset. This is a simple approach, specially developed for novice
# users in ER (Entity Resolution).
#
# # How to install?
#
# pyJedAI is an open-source library that can be installed from PyPI.

# In[ ]:
# NOTE: `get_ipython()` only exists when this script runs under IPython/Jupyter;
# these pip magics install/inspect pyjedai from within the notebook kernel.
get_ipython().run_line_magic('pip', 'install pyjedai -U')

# In[ ]:
get_ipython().run_line_magic('pip', 'show pyjedai')

# Imports

# In[1]:
import os
import sys

import pandas as pd

# ## Data Reading

# In[2]:
from pyjedai.datamodel import Data

# Read both ABT and BUY tables as strings (na_filter=False keeps empty cells
# as "" instead of NaN), plus the ground-truth match pairs.
data = Data(
    dataset_1=pd.read_csv("./../data/ccer/D2/abt.csv",
                          sep='|', engine='python', na_filter=False).astype(str),
    attributes_1=['id', 'name', 'description'],
    id_column_name_1='id',
    dataset_2=pd.read_csv("./../data/ccer/D2/buy.csv",
                          sep='|', engine='python', na_filter=False).astype(str),
    attributes_2=['id', 'name', 'description'],
    id_column_name_2='id',
    ground_truth=pd.read_csv("./../data/ccer/D2/gt.csv",
                             sep='|', engine='python'),
)

# ## WorkFlow

# In[3]:
from pyjedai.workflow import (
    BlockingBasedWorkFlow,
    EmbeddingsNNWorkFlow,
    compare_workflows,
)
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)
from pyjedai.block_cleaning import BlockFiltering, BlockPurging
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation,
)
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

# In[4]:
# A complete blocking-based ER workflow: block building -> block cleaning
# (purging then filtering) -> comparison cleaning -> matching -> clustering.
w = BlockingBasedWorkFlow(
    block_building=dict(
        method=QGramsBlocking,
        params=dict(qgrams=3),
        attributes_1=['name'],
        attributes_2=['name'],
    ),
    block_cleaning=[
        dict(method=BlockPurging, params=dict(smoothing_factor=1.025)),
        dict(method=BlockFiltering, params=dict(ratio=0.8)),
    ],
    comparison_cleaning=dict(method=CardinalityEdgePruning),
    entity_matching=dict(
        method=EntityMatching,
        metric='sorensen_dice',
        similarity_threshold=0.5,
        attributes=['description', 'name'],
    ),
    clustering=dict(method=ConnectedComponentsClustering),
    name="Workflow-Test",  # fixed typo: was "Worflow-Test"
)

# In[5]:
w.run(data, verbose=True)

# In[6]:
w.to_df()

# In[7]:
w.visualize()

# In[8]:
w.visualize(separate=True)

# ## Multiple workflows - Comparison

# In[9]:
# A second blocking-based workflow with different q-gram size and the
# block-cleaning steps in the opposite order, for comparison against `w`.
w1 = BlockingBasedWorkFlow(
    block_building=dict(
        method=QGramsBlocking,
        params=dict(qgrams=4),
        attributes_1=['name'],
        attributes_2=['name'],
    ),
    block_cleaning=[
        dict(method=BlockFiltering, params=dict(ratio=0.6)),
        dict(method=BlockPurging, params=dict(smoothing_factor=1.025)),
    ],
    comparison_cleaning=dict(method=CardinalityEdgePruning),
    entity_matching=dict(
        method=EntityMatching,
        metric='sorensen_dice',
        similarity_threshold=0.5,
        attributes=['description', 'name'],
    ),
    clustering=dict(method=ConnectedComponentsClustering),
)
w1.run(data, verbose=False, workflow_tqdm_enable=True)

# ## Workflow based on pyTorch Embeddings
#
# - `block_building`:
#   - `method`: EmbeddingsNNBlockBuilding
#   - `params`: Constructor parameters
#   - `exec_params`: `build_blocks` parameters
# - `clustering`:
#   - `method`: UniqueMappingClustering
#   - `params`: Constructor parameters (i.e. similarity threshold)

# In[10]:
w2 = EmbeddingsNNWorkFlow(
    block_building=dict(
        method=EmbeddingsNNBlockBuilding,
        params=dict(vectorizer='sminilm', similarity_search='faiss'),
        exec_params=dict(top_k=5,
                         similarity_distance='euclidean',
                         load_embeddings_if_exist=False,
                         save_embeddings=False),
    ),
    clustering=dict(method=UniqueMappingClustering),
    name="EmbeddingsNNWorkFlow-Test",
)
w2.run(data, verbose=True)

# In[11]:
compare_workflows([w, w1, w2], with_visualization=True)

# # Predefined workflows (best & default)

# In[12]:
w = BlockingBasedWorkFlow()
w.best_blocking_workflow_ccer()
w.run(data, verbose=True)