#!/usr/bin/env python
# coding: utf-8
#
#
# # PyTorch and FAISS workflow
#
#
#

#
#
# ## How to install?
#
# pyJedAI is an open-source library that can be installed from PyPI.
#
# In[ ]:
get_ipython().system('pip install pyjedai -U')
# In[2]:
get_ipython().system('pip show pyjedai')
# Imports
# In[3]:
import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph
from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data
# In[4]:
# Read the two Abt-Buy datasets and their ground truth (pipe-separated CSVs)
d1 = pd.read_csv("../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False).astype(str)
d2 = pd.read_csv("../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False).astype(str)
gt = pd.read_csv("../data/ccer/D2/gt.csv", sep='|', engine='python')

# Use every column except the id as a matching attribute
attr1 = d1.columns[1:].to_list()
attr2 = d2.columns[1:].to_list()

data = Data(dataset_1=d1,
            attributes_1=attr1,
            id_column_name_1='id',
            dataset_2=d2,
            attributes_2=attr2,
            id_column_name_2='id',
            ground_truth=gt)
# # Block Building
#
# ## Pre-trained PyTorch & Gensim embeddings
#
# Available embeddings:
#
# - Gensim: `{'fasttext', 'glove', 'word2vec'}`
# - PyTorch sentence transformers: `{'smpnet', 'st5', 'sdistilroberta', 'sminilm', 'sent_glove'}`
# - PyTorch word transformers: `{'bert', 'distilbert', 'roberta', 'xlnet', 'albert'}`
#
# Custom word or sentence embedding models can also be used by passing a file path or Hugging Face identifier as the vectorizer, together with the corresponding argument to `emb.build_blocks` (see the sketch below):
#
# - Custom sentence transformers: `vectorizer='model_name'` and `emb.build_blocks(..., custom_pretrained_model='sentence')`
# - Custom word transformers: `vectorizer='model_name'` and `emb.build_blocks(..., custom_pretrained_model='word')`
#
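#
# For example, a custom Sentence-Transformers model could be plugged in roughly like this
# (a sketch, not executed in this notebook; the model name `'sentence-transformers/all-MiniLM-L12-v2'` is only illustrative):
#
# ```python
# emb = EmbeddingsNNBlockBuilding(vectorizer='sentence-transformers/all-MiniLM-L12-v2',
#                                 similarity_search='faiss')
# blocks, g = emb.build_blocks(data, custom_pretrained_model='sentence')
# ```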
# ## FAISS
#
# `faiss.IndexIVFFlat` is an inverted-file index with coarse quantization. It partitions the vector space into cells learned by a coarse quantizer and, at query time, searches only the cells closest to the query vector, which makes nearest-neighbour search over large vector collections much faster than an exhaustive scan. Its main parameters are:
#
# - `quantizer`: the index used to assign vectors to cells (typically a flat index over the cell centroids)
# - `d`: the dimensionality of the vectors
# - `nlist`: the number of cells (coarse clusters)
# - `nprobe`: the number of cells visited per query, set at search time (a speed/recall trade-off)
#
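#
# A minimal, standalone FAISS sketch (independent of pyJedAI; the dimensionality and `nlist` values are made up) showing how these parameters fit together:
#
# ```python
# import faiss
# import numpy as np
#
# d, nlist = 384, 100                               # vector dimensionality, number of cells
# quantizer = faiss.IndexFlatL2(d)                  # coarse quantizer over the cell centroids
# index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
#
# xb = np.random.random((10_000, d)).astype('float32')
# index.train(xb)                                   # learn the nlist centroids
# index.add(xb)                                     # add the database vectors
#
# index.nprobe = 10                                 # cells visited per query
# D, I = index.search(xb[:5], k=5)                  # distances and ids of the 5 nearest neighbours
# ```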
# In[5]:
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding
# In[6]:
emb = EmbeddingsNNBlockBuilding(vectorizer='sminilm',
                                similarity_search='faiss')

blocks, g = emb.build_blocks(data,
                             top_k=5,
                             similarity_distance='euclidean',
                             load_embeddings_if_exist=False,
                             save_embeddings=False,
                             with_entity_matching=True)
# In[7]:
emb.evaluate(blocks, with_classification_report=True, with_stats=True)
# # Entity Clustering
# In[8]:
from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering
# In[9]:
ccc = UniqueMappingClustering()
clusters = ccc.process(g, data, similarity_threshold=0.63)
# In[10]:
ccc.evaluate(clusters, with_classification_report=True)
# ## Clusters Visualization
# In[11]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score
# In[12]:
concatenated_df = pd.concat([d1, d2], ignore_index=True)
concatenated_df
# In[13]:
# Embedding matrices produced during block building (one row per record of each dataset)
X1 = emb.vectors_1
X2 = emb.vectors_2
X = np.concatenate((X1, X2), axis=0)
# In[14]:
def transform_input_clusters_into_id(clusters, data):
    """Map pyJedAI's internal (unified) entity ids back to the original dataset ids."""
    new_clusters = []
    for cluster in clusters:
        new_cluster = set()
        for internal_id in cluster:
            original_id = data._gt_to_ids_reversed_1[internal_id] \
                if internal_id < data.dataset_limit \
                else data._gt_to_ids_reversed_2[internal_id]
            new_cluster.add(original_id)
        new_clusters.append(new_cluster)
    return new_clusters

real_clusters = transform_input_clusters_into_id(clusters=clusters, data=data)
print("Number of clusters: ", len(real_clusters))

# Assign a cluster label to every embedded record; -1 marks records left unmatched
labels = np.full(X.shape[0], -1, dtype=int)
for label, cluster in enumerate(real_clusters):
    for entity_id in cluster:
        labels[int(entity_id)] = label

silhouette_vals = silhouette_samples(X, labels)
mean_silhouette_score = silhouette_score(X, labels)

concatenated_df['Cluster'] = labels
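# Quick sanity check on cluster separation in the embedding space (a small addition;
# silhouette lies in [-1, 1], higher means more compact, better-separated clusters)
print(f"Mean silhouette score: {mean_silhouette_score:.3f}")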
# In[15]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
tsne = TSNE(n_components=2, learning_rate='auto', init='random')
X_tsne = tsne.fit_transform(X)
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df_tsne = pd.DataFrame(X_tsne, columns=['Dimension 1', 'Dimension 2'])
df_pca['Cluster'] = labels
df_tsne['Cluster'] = labels
df_pca['Description'] = concatenated_df['name']
df_tsne['Description'] = concatenated_df['name']
fig_pca = px.scatter(df_pca, x='PC1', y='PC2', color='Cluster', title='PCA Cluster Plot',
                     hover_data={'PC1': False, 'PC2': False, 'Cluster': True, 'Description': True})
fig_pca.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')))
fig_pca.update_layout(legend_title_text='Cluster')
fig_pca.show()

fig_tsne = px.scatter(df_tsne, x='Dimension 1', y='Dimension 2', color='Cluster', title='t-SNE Cluster Plot',
                      hover_data={'Dimension 1': False, 'Dimension 2': False, 'Cluster': True, 'Description': True})
fig_tsne.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')))
fig_tsne.update_layout(legend_title_text='Cluster')
fig_tsne.show()
# In[16]:
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X)
tsne_3d = TSNE(n_components=3, learning_rate='auto', init='random')
X_tsne_3d = tsne_3d.fit_transform(X)
df_pca_3d = pd.DataFrame(X_pca_3d, columns=['PC1', 'PC2', 'PC3'])
df_tsne_3d = pd.DataFrame(X_tsne_3d, columns=['Dimension 1', 'Dimension 2', 'Dimension 3'])
df_pca_3d['Cluster'] = labels
df_tsne_3d['Cluster'] = labels
df_pca_3d['Description'] = concatenated_df['name']
df_tsne_3d['Description'] = concatenated_df['name']
fig_pca_3d = px.scatter_3d(df_pca_3d, x='PC1', y='PC2', z='PC3', color='Cluster', title='3D PCA Cluster Plot',
                           hover_data={'PC1': False, 'PC2': False, 'PC3': False, 'Cluster': True, 'Description': True})
fig_pca_3d.update_traces(marker=dict(size=5, line=dict(width=1, color='DarkSlateGrey')))
fig_pca_3d.update_layout(legend_title_text='Cluster')
fig_pca_3d.show()

fig_tsne_3d = px.scatter_3d(df_tsne_3d, x='Dimension 1', y='Dimension 2', z='Dimension 3', color='Cluster', title='3D t-SNE Cluster Plot',
                            hover_data={'Dimension 1': False, 'Dimension 2': False, 'Dimension 3': False, 'Cluster': True, 'Description': True})
fig_tsne_3d.update_traces(marker=dict(size=5, line=dict(width=1, color='DarkSlateGrey')))
fig_tsne_3d.update_layout(legend_title_text='Cluster')
fig_tsne_3d.show()
#
#
# K. Nikoletos, G. Papadakis & M. Koubarakis
#
#