#!/usr/bin/env python
# coding: utf-8
#
# # Similarity Joins Tutorial
#
# In this notebook we present the pyJedAI approach on the well-known Cora dataset using a Similarity Join workflow.
#
#
# 
#
# ## How to install?
#
# pyJedAI is an open-source library that can be installed from PyPI.
#
# For more: [pypi.org/project/pyjedai/](https://pypi.org/project/pyjedai/)
# In[1]:
# Environment setup. These cells only run inside a Jupyter/IPython session,
# where `get_ipython()` is defined; they shell out to check the Python
# version and install/inspect the pyjedai package.
get_ipython().system('python --version')
# In[ ]:
get_ipython().system('pip install pyjedai -U')
# In[3]:
get_ipython().system('pip show pyjedai')
# Imports
# In[4]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
from pyjedai.utils import print_clusters, print_blocks, print_candidate_pairs
from pyjedai.evaluation import Evaluation
# ## Reading the dataset
#
# pyJedAI, in order to perform, needs only the transformation of the initial data into a pandas DataFrame. Hence, pyJedAI can function on any structured or semi-structured data. In this case the Cora dataset is provided as .csv files.
#
#
#

#
#
#
# ### pyjedai module
#
# Data module offers a number of options
# - Selecting the parameters (columns) of the dataframe, in D1 (and in D2)
# - Prints a detailed text analysis
# - Stores a hidden mapping of the ids, and creates it if not exists.
# In[5]:
from pyjedai.datamodel import Data
# Load the Cora dataset (pipe-delimited CSV). Note that the code reads Cora,
# not Abt-Buy; the workflow itself is dataset-agnostic.
d1 = pd.read_csv("./../data/der/cora/cora.csv", sep='|')
# Ground truth file: rows of matching entity-id pairs, no header row.
gt = pd.read_csv("./../data/der/cora/cora_gt.csv", sep='|', header=None)
# Columns of d1 that the workflow will use as entity attributes.
attr = ['Entity Id','author', 'title']
# Data is the connecting module of all steps of the workflow
# In[6]:
# Wrap the DataFrame, its id column, the attribute list, and the ground truth
# into pyJedAI's Data object. Only dataset_1 is given, i.e. a single-dataset
# (Dirty ER) setting rather than a two-dataset (Clean-Clean ER) one.
data = Data(
dataset_1=d1,
id_column_name_1='Entity Id',
ground_truth=gt,
attributes_1=attr
)
# ## Similarity Joins
#
# __Available algorithms:__
#
# - EJoin
# - TopKJoin
# In[7]:
from pyjedai.joins import EJoin, TopKJoin
# In[8]:
# Similarity join: tokenize records into multisets of 2-grams and keep pairs
# whose Jaccard similarity reaches the 0.5 threshold.
join = EJoin(similarity_threshold = 0.5,
metric = 'jaccard',
tokenization = 'qgrams_multiset',
qgrams = 2)
# fit() runs the join over `data` and returns a similarity graph
# (a networkx graph, judging by the draw(g) call below).
g = join.fit(data)
# In[9]:
# Score the join output against the ground truth; the return value is
# discarded — evaluate() presumably prints/records the metrics itself.
_ = join.evaluate(g)
# In[10]:
# Visualize the similarity graph with networkx's default drawing.
draw(g)
# ## Entity Clustering
#
# It takes as input the similarity graph produced by Entity Matching and partitions it into a set of equivalence clusters, with every cluster corresponding to a distinct real-world object.
# In[11]:
from pyjedai.clustering import ConnectedComponentsClustering
# In[12]:
# Partition the similarity graph into entity clusters via connected
# components. NOTE(review): similarity_threshold=0.3 presumably prunes
# low-weight edges before computing components — confirm in pyjedai docs.
ec = ConnectedComponentsClustering()
clusters = ec.process(g, data, similarity_threshold=0.3)
# In[13]:
# Score the final clusters against the ground truth; return value discarded.
_ = ec.evaluate(clusters)
#
#
# K. Nikoletos, J. Maciejewski, G. Papadakis & M. Koubarakis
#
#
# In[ ]: