pyJedAI is an open-source library that can be installed from PyPI.
!pip install pyjedai -U
!pip show pyjedai
Name: pyjedai Version: 0.1.8 Summary: An open-source library that builds powerful end-to-end Entity Resolution workflows. Home-page: Author: Author-email: Konstantinos Nikoletos <nikoletos.kon@gmail.com>, George Papadakis <gpapadis84@gmail.com>, Jakub Maciejewski <jacobb.maciejewski@gmail.com>, Manolis Koubarakis <koubarak@di.uoa.gr> License: Apache Software License 2.0 Location: c:\users\nikol\miniconda3\envs\pyjedai\lib\site-packages Requires: faiss-cpu, gensim, matplotlib, networkx, nltk, numpy, ordered-set, pandas, py-stringmatching, scipy, seaborn, sentence-transformers, shapely, tqdm, transformers, valentine Required-by:
Imports
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data
# Load both product tables as all-string frames. na_filter=False stops pandas
# from turning empty cells into NaN, so after astype(str) they stay "" rather
# than becoming the literal text "nan".
d1 = pd.read_csv(
    "../data/ccer/D2/abt.csv",
    sep='|',
    engine='python',
    na_filter=False,
).astype(str)
d2 = pd.read_csv(
    "../data/ccer/D2/buy.csv",
    sep='|',
    engine='python',
    na_filter=False,
).astype(str)

# Ground-truth file for the two tables (read with pandas' default NA handling).
gt = pd.read_csv("../data/ccer/D2/gt.csv", sep='|', engine='python')

# Every column except the first one is handed to pyJedAI as an attribute.
attr1 = list(d1.columns[1:])
attr2 = list(d2.columns[1:])

# Bundle both tables, their attribute lists, id columns and the ground truth
# into the single Data object that the pyJedAI steps below receive.
data = Data(
    dataset_1=d1,
    attributes_1=attr1,
    id_column_name_1='id',
    dataset_2=d2,
    attributes_2=attr2,
    id_column_name_2='id',
    ground_truth=gt,
)
Available embeddings:
{ 'fasttext', 'glove', 'word2vec'}
{'smpnet','st5','sdistilroberta','sminilm','sent_glove'}
{'bert', 'distilbert', 'roberta', 'xlnet', 'albert'}
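faiss.IndexIVFFlat is an implementation of an inverted file index with coarse quantization. This index is used to efficiently search for the nearest neighbors of a query vector in a large dataset of vectors. Here is a brief explanation of the parameters used in this index: `quantizer` is the coarse quantizer (another index, e.g. `IndexFlatL2`) that assigns each vector to a cell; `d` is the dimensionality of the vectors; `nlist` is the number of cells (inverted lists) the vector space is partitioned into; `metric` is the distance used for the search (`METRIC_L2` or `METRIC_INNER_PRODUCT`); and `nprobe`, set at search time, is the number of cells visited per query.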
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding
c:\Users\nikol\miniconda3\envs\pyjedai\lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Created embeddings directory at: c:\Users\nikol\Desktop\GitHub\pyJedAI-Dev\notebooks\.embs
# Configure the block builder: 'sminilm' vectorizer, FAISS similarity search.
emb = EmbeddingsNNBlockBuilding(
    vectorizer='sminilm',
    similarity_search='faiss',
)

# build_blocks returns the candidate blocks plus `g`, which is later handed to
# the clustering step. Embeddings are neither loaded from nor saved to disk
# here, so they are computed on every run.
blocks, g = emb.build_blocks(
    data,
    top_k=5,
    similarity_distance='euclidean',
    load_embeddings_if_exist=False,
    save_embeddings=False,
    with_entity_matching=True,
)
Embeddings-NN Block Building [sminilm, faiss, cpu]: 100%|██████████| 2152/2152 [02:37<00:00, 13.64it/s]
# Print the block-building report (precision / recall / F1, classification
# report and statistics); the call's return value is the cell's output.
emb.evaluate(
    blocks,
    with_classification_report=True,
    with_stats=True,
)
*************************************************************************************************************************** Method: Embeddings-NN Block Building *************************************************************************************************************************** Method name: Embeddings-NN Block Building Parameters: Vectorizer: sminilm Similarity-Search: faiss Top-K: 5 Vector size: 384 Runtime: 157.7577 seconds ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Performance: Precision: 18.75% Recall: 93.77% F1-score: 31.26% ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Classification report: True positives: 1009 False positives: 4371 True negatives: 1156633 False negatives: 67 Total comparisons: 5380 ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Statistics: FAISS: Indices shape returned after search: (1076, 5) ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
{'Precision %': 18.7546468401487, 'Recall %': 93.77323420074349, 'F1 %': 31.257744733581166, 'True Positives': 1009, 'False Positives': 4371, 'True Negatives': 1156633, 'False Negatives': 67}
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
# NOTE: despite its name, `ccc` holds a UniqueMappingClustering instance, not
# a ConnectedComponentsClustering one.
ccc = UniqueMappingClustering()

# Cluster the graph produced by block building, using 0.63 as the similarity
# threshold.
clusters = ccc.process(
    g,
    data,
    similarity_threshold=0.63,
)

# Print the clustering report; the call's return value is the cell's output.
ccc.evaluate(
    clusters,
    with_classification_report=True,
)
*************************************************************************************************************************** Method: Unique Mapping Clustering *************************************************************************************************************************** Method name: Unique Mapping Clustering Parameters: Similarity Threshold: 0.63 Runtime: 0.1158 seconds ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Performance: Precision: 83.18% Recall: 67.10% F1-score: 74.28% ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Classification report: True positives: 722 False positives: 146 True negatives: 1156346 False negatives: 354 Total comparisons: 868 ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
{'Precision %': 83.17972350230414, 'Recall %': 67.1003717472119, 'F1 %': 74.2798353909465, 'True Positives': 722, 'False Positives': 146, 'True Negatives': 1156346, 'False Negatives': 354}
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score
# Stack the rows of the first dataset on top of the second and renumber the
# index 0..n-1, so row positions follow the same first-then-second order as
# the concatenated embedding matrix built later.
concatenated_df = pd.concat(
    [d1, d2],
    axis=0,
    ignore_index=True,
)
concatenated_df
id | name | description | price | |
---|---|---|---|---|
0 | 0 | Sony Turntable - PSLX350H | Sony Turntable - PSLX350H/ Belt Drive System/ ... | |
1 | 1 | Bose Acoustimass 5 Series III Speaker System -... | Bose Acoustimass 5 Series III Speaker System -... | 399 |
2 | 2 | Sony Switcher - SBV40S | Sony Switcher - SBV40S/ Eliminates Disconnecti... | 49 |
3 | 3 | Sony 5 Disc CD Player - CDPCE375 | Sony 5 Disc CD Player- CDPCE375/ 5 Disc Change... | |
4 | 4 | Bose 27028 161 Bookshelf Pair Speakers In Whit... | Bose 161 Bookshelf Speakers In White - 161WH/ ... | 158 |
... | ... | ... | ... | ... |
2147 | 1071 | Sony VAIO FW378J/B Notebook - VGNFW378J/B | Intel Centrino 2 Core 2 Duo P8600 2.4GHz - 16.... | |
2148 | 1072 | Sennheiser CX380 Sennheiser CX 380 Sport II Gr... | ||
2149 | 1073 | IWORK 09 RETAIL-INT DVD - MB942Z/A | ||
2150 | 1074 | IWORK 09 FAMILY PACK-INT DVD - MB943Z/A | ||
2151 | 1075 | CASE MATE CARBON FIBER IPHONE 3G CASE BLACK - ... | 28.08 |
2152 rows × 4 columns
# Embedding matrices computed by the block builder for each dataset, stacked
# row-wise: first all rows of the first dataset, then those of the second.
X1, X2 = emb.vectors_1, emb.vectors_2
X = np.concatenate([X1, X2], axis=0)
def tranform_input_clusters_into_id(clusters, data):
    """Map every cluster member from its internal id to its dataset id.

    Members smaller than ``data.dataset_limit`` are looked up in
    ``data._gt_to_ids_reversed_1``; all other members are looked up in
    ``data._gt_to_ids_reversed_2``.

    Args:
        clusters: Iterable of clusters, each an iterable of internal ids.
        data: Object exposing ``dataset_limit``, ``_gt_to_ids_reversed_1``
            and ``_gt_to_ids_reversed_2``.

    Returns:
        A list with one ``set`` of translated ids per input cluster, in the
        same order as ``clusters``. Because each cluster is a set, two members
        that translate to the same id collapse into a single element.

    Raises:
        KeyError: If a member is missing from the lookup table it is routed to.
    """

    def to_dataset_id(member):
        # The dataset limit separates first-dataset ids from second-dataset ids.
        if member < data.dataset_limit:
            return data._gt_to_ids_reversed_1[member]
        return data._gt_to_ids_reversed_2[member]

    return [
        {to_dataset_id(member) for member in cluster}
        for cluster in clusters
    ]
# Translate the clusters to the datasets' own ids. This is used for reporting
# only: both datasets number their records from 0, so these ids are ambiguous
# as row positions and must not be used to index X.
real_clusters = tranform_input_clusters_into_id(clusters=clusters, data=data)
print("Number of clusters: ", len(real_clusters))

# One label per row of X. Start from -1 so that entities belonging to no
# cluster get a well-defined "unclustered" label instead of whatever
# uninitialised memory np.empty would leave behind.
labels = np.full(X.shape[0], -1, dtype=int)

# Label rows through the *internal* ids held in `clusters`: ids below
# data.dataset_limit belong to the first dataset and the rest to the second,
# which is the same first-then-second layout as X and concatenated_df.
# NOTE(review): assumes emb.vectors_1 / emb.vectors_2 are ordered by internal
# id -- confirm against the block builder.
for label, cluster in enumerate(clusters):
    for entity_id in cluster:
        labels[int(entity_id)] = label

# Silhouette per sample and on average. The -1 group is treated by
# scikit-learn like any other label, i.e. as one additional cluster.
silhouette_vals = silhouette_samples(X, labels)
mean_silhouette_score = silhouette_score(X, labels)

concatenated_df['Cluster'] = labels
Number of clusters: 868
# Reduce the embeddings to two dimensions, first with PCA and then with t-SNE.
# No random_state is set, so the t-SNE layout differs between runs.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
tsne = TSNE(n_components=2, learning_rate='auto', init='random')
X_tsne = tsne.fit_transform(X)

# One frame per projection, carrying the cluster label and the product name
# that is shown when hovering over a point.
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(
    Cluster=labels,
    Description=concatenated_df['name'],
)
df_tsne = pd.DataFrame(X_tsne, columns=['Dimension 1', 'Dimension 2']).assign(
    Cluster=labels,
    Description=concatenated_df['name'],
)

# PCA scatter plot: coordinates are hidden from the hover box, the cluster and
# the description are shown.
fig_pca = px.scatter(
    df_pca,
    x='PC1',
    y='PC2',
    color='Cluster',
    title='PCA Cluster Plot',
    hover_data={'PC1': False, 'PC2': False, 'Cluster': True, 'Description': True},
)
fig_pca.update_traces(
    marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
)
fig_pca.update_layout(legend_title_text='Cluster')
fig_pca.show()

# t-SNE scatter plot with the same styling.
fig_tsne = px.scatter(
    df_tsne,
    x='Dimension 1',
    y='Dimension 2',
    color='Cluster',
    title='t-SNE Cluster Plot',
    hover_data={'Dimension 1': False, 'Dimension 2': False, 'Cluster': True, 'Description': True},
)
fig_tsne.update_traces(
    marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
)
fig_tsne.update_layout(legend_title_text='Cluster')
fig_tsne.show()
c:\Users\nikol\miniconda3\envs\pyjedai\lib\site-packages\threadpoolctl.py:1223: RuntimeWarning: Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at the same time. Both libraries are known to be incompatible and this can cause random crashes or deadlocks on Linux when loaded in the same Python program. Using threadpoolctl may cause crashes or deadlocks. For more information and possible workarounds, please see https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md warnings.warn(msg, RuntimeWarning)
# Reduce the embeddings to three dimensions, first with PCA and then with
# t-SNE. No random_state is set, so the t-SNE layout differs between runs.
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X)
tsne_3d = TSNE(n_components=3, learning_rate='auto', init='random')
X_tsne_3d = tsne_3d.fit_transform(X)

# One frame per projection, carrying the cluster label and the product name
# that is shown when hovering over a point.
df_pca_3d = pd.DataFrame(X_pca_3d, columns=['PC1', 'PC2', 'PC3']).assign(
    Cluster=labels,
    Description=concatenated_df['name'],
)
df_tsne_3d = pd.DataFrame(
    X_tsne_3d,
    columns=['Dimension 1', 'Dimension 2', 'Dimension 3'],
).assign(
    Cluster=labels,
    Description=concatenated_df['name'],
)

# 3-D PCA scatter plot: coordinates are hidden from the hover box, the cluster
# and the description are shown.
fig_pca_3d = px.scatter_3d(
    df_pca_3d,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Cluster',
    title='3D PCA Cluster Plot',
    hover_data={'PC1': False, 'PC2': False, 'PC3': False, 'Cluster': True, 'Description': True},
)
fig_pca_3d.update_traces(
    marker=dict(size=5, line=dict(width=1, color='DarkSlateGrey')),
)
fig_pca_3d.update_layout(legend_title_text='Cluster')
fig_pca_3d.show()

# 3-D t-SNE scatter plot with the same styling.
fig_tsne_3d = px.scatter_3d(
    df_tsne_3d,
    x='Dimension 1',
    y='Dimension 2',
    z='Dimension 3',
    color='Cluster',
    title='3D t-SNE Cluster Plot',
    hover_data={'Dimension 1': False, 'Dimension 2': False, 'Dimension 3': False, 'Cluster': True, 'Description': True},
)
fig_tsne_3d.update_traces(
    marker=dict(size=5, line=dict(width=1, color='DarkSlateGrey')),
)
fig_tsne_3d.update_layout(legend_title_text='Cluster')
fig_tsne_3d.show()