#!/usr/bin/env python
# coding: utf-8
#
# # Similarity Joins Tutorial
#
# In this notebook we present the pyJedAI approach on the well-known Cora dataset using a Similarity Join workflow.
#
#
# 
#
# ## How to install?
#
# pyJedAI is an open-source library that can be installed from PyPI.
#
# For more: [pypi.org/project/pyjedai/](https://pypi.org/project/pyjedai/)
# In[1]:
# Environment setup. These cells only run inside a Jupyter/IPython session,
# where `get_ipython()` is defined; they shell out to check the Python
# version and install/inspect the pyjedai package.
get_ipython().system('python --version')
# In[ ]:
get_ipython().system('pip install pyjedai -U')
# In[3]:
get_ipython().system('pip show pyjedai')
# Imports
# In[4]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
from pyjedai.utils import print_clusters, print_blocks, print_candidate_pairs
from pyjedai.evaluation import Evaluation
# ## Reading the dataset
#
# pyJedAI, in order to perform, needs only the transformation of the initial data into a pandas DataFrame. Hence, pyJedAI can function on any structured or semi-structured data. In this case the Cora dataset is provided as .csv files.
#
#
#

#
#
#
# ### pyjedai module
#
# Data module offers a number of options
# - Selecting the parameters (columns) of the dataframe, in D1 (and in D2)
# - Prints a detailed text analysis
# - Stores a hidden mapping of the ids, and creates it if not exists.
# In[5]:
from pyjedai.datamodel import Data
# Load the Cora dataset (pipe-delimited CSV). Note that the code reads Cora,
# not Abt-Buy; the workflow itself is dataset-agnostic.
d1 = pd.read_csv("./../data/der/cora/cora.csv", sep='|')
# Ground truth file: rows of matching entity-id pairs, no header row.
gt = pd.read_csv("./../data/der/cora/cora_gt.csv", sep='|', header=None)
# Columns of d1 that the workflow will use as entity attributes.
attr = ['Entity Id','author', 'title']
# Data is the connecting module of all steps of the workflow
# In[6]:
# Wrap the DataFrame, its id column, the attribute list, and the ground truth
# into pyJedAI's Data object. Only dataset_1 is given, i.e. a single-dataset
# (Dirty ER) setting rather than a two-dataset (Clean-Clean ER) one.
data = Data(
dataset_1=d1,
id_column_name_1='Entity Id',
ground_truth=gt,
attributes_1=attr
)
# ## Similarity Joins
#
# __Available algorithms:__
#
# - EJoin
# - TopKJoin
# In[7]:
from pyjedai.joins import EJoin, TopKJoin
# In[8]:
# Similarity join: tokenize records into multisets of 2-grams and keep pairs
# whose Jaccard similarity reaches the 0.5 threshold.
join = EJoin(similarity_threshold = 0.5,
metric = 'jaccard',
tokenization = 'qgrams_multiset',
qgrams = 2)
# fit() runs the join over `data` and returns a similarity graph
# (a networkx graph, judging by the draw(g) call below).
g = join.fit(data)
# In[9]:
# Score the join output against the ground truth; the return value is
# discarded — evaluate() presumably prints/records the metrics itself.
_ = join.evaluate(g)
# In[10]:
# Visualize the similarity graph with networkx's default drawing.
draw(g)
# ## Entity Clustering
#
# It takes as input the similarity graph produced by Entity Matching and partitions it into a set of equivalence clusters, with every cluster corresponding to a distinct real-world object.
# In[11]:
from pyjedai.clustering import ConnectedComponentsClustering
# In[12]:
# Partition the similarity graph into entity clusters via connected
# components. NOTE(review): similarity_threshold=0.3 presumably prunes
# low-weight edges before computing components — confirm in pyjedai docs.
ec = ConnectedComponentsClustering()
clusters = ec.process(g, data, similarity_threshold=0.3)
# In[13]:
# Score the final clusters against the ground truth; return value discarded.
_ = ec.evaluate(clusters)
#
#
# K. Nikoletos, J. Maciejewski, G. Papadakis & M. Koubarakis
#
#
# In[ ]: