#!/usr/bin/env python
# coding: utf-8

# # pyTorch and FAISS workflow

# ## How to install?
#
# pyJedAI is an open-source library that can be installed from PyPI.

# In[ ]:

# NOTE: get_ipython() is only defined when this file is executed by
# IPython/Jupyter; under a plain `python` interpreter these two lines
# raise NameError.
get_ipython().system('pip install pyjedai -U')

# In[2]:

get_ipython().system('pip show pyjedai')

# Imports

# In[3]:

import os
import sys
import pandas as pd
import networkx
from networkx import draw, Graph

from pyjedai.evaluation import Evaluation
from pyjedai.datamodel import Data

# In[4]:

# Both entity collections are read as pipe-separated files; na_filter=False
# together with astype(str) keeps every cell as a string (empty cells stay ""
# instead of becoming NaN).
d1 = pd.read_csv("../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False).astype(str)
d2 = pd.read_csv("../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False).astype(str)
gt = pd.read_csv("../data/ccer/D2/gt.csv", sep='|', engine='python')

# Every column except the first one is used as an attribute.
attr1 = d1.columns[1:].to_list()
attr2 = d2.columns[1:].to_list()

data = Data(dataset_1=d1,
            attributes_1=attr1,
            id_column_name_1='id',
            dataset_2=d2,
            attributes_2=attr2,
            id_column_name_2='id',
            ground_truth=gt)

# # Block Building
#
# ## Pre-trained pyTorch & GENSIM embeddings
#
# Available embeddings:
#
# - Gensim: `{ 'fasttext', 'glove', 'word2vec'}`
# - pyTorch Sentence transformers : `{'smpnet','st5','sdistilroberta','sminilm','sent_glove'}`
# - pyTorch Word transformers :`{'bert', 'distilbert', 'roberta', 'xlnet', 'albert'}`
#
# Custom Word or Sentence embedding models can be specified using a file path or HuggingFace identifier and a corresponding argument to `emb.build_blocks`
# - Custom Sentence transformers: `vectorizer='model_name'` and `emb.build_blocks(..., custom_pretrained_model='sentence')`
# - Custom Word transformers: `vectorizer='model_name'` and `emb.build_blocks(..., custom_pretrained_model='word')`
#
# ## FAISS
#
# faiss.IndexIVFFlat is an implementation of an inverted file index with coarse quantization. This index is used to efficiently search for nearest neighbors of a query vector in a large dataset of vectors.
# Here's a brief explanation of the parameters used in this index:

# In[5]:

from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

# In[6]:

emb = EmbeddingsNNBlockBuilding(vectorizer='sminilm',
                                similarity_search='faiss')

blocks, g = emb.build_blocks(data,
                             top_k=5,
                             similarity_distance='euclidean',
                             load_embeddings_if_exist=False,
                             save_embeddings=False,
                             with_entity_matching=True)

# In[7]:

emb.evaluate(blocks, with_classification_report=True, with_stats=True)

# # Entity Clustering

# In[8]:

from pyjedai.clustering import ConnectedComponentsClustering, UniqueMappingClustering

# In[9]:

ccc = UniqueMappingClustering()
clusters = ccc.process(g, data, similarity_threshold=0.63)

# In[10]:

ccc.evaluate(clusters, with_classification_report=True)

# ## Clusters Visualization

# In[11]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score

# In[12]:

concatenated_df = pd.concat([d1, d2], ignore_index=True)
concatenated_df

# In[13]:

# Stack the embeddings of both datasets: rows of dataset 1 first, then
# rows of dataset 2.
X1 = emb.vectors_1
X2 = emb.vectors_2
X = np.concatenate((X1, X2), axis=0)

# In[14]:


def tranform_input_clusters_into_id(clusters, data):
    """Map every entity of every cluster through the reverse id tables of *data*.

    An entity below ``data.dataset_limit`` is looked up in
    ``data._gt_to_ids_reversed_1``; any other entity is looked up in
    ``data._gt_to_ids_reversed_2``.

    Args:
        clusters: Iterable of clusters, each an iterable of entity ids that
            can be compared with ``data.dataset_limit``.
        data: Object exposing ``dataset_limit``, ``_gt_to_ids_reversed_1``
            and ``_gt_to_ids_reversed_2``.

    Returns:
        A list with one set of mapped ids per input cluster, in input order.
    """
    new_clusters = []
    for cluster in clusters:
        new_cluster = set()
        for entity in cluster:
            if entity < data.dataset_limit:
                new_cluster.add(data._gt_to_ids_reversed_1[entity])
            else:
                new_cluster.add(data._gt_to_ids_reversed_2[entity])
        new_clusters.append(new_cluster)
    return new_clusters


real_clusters = tranform_input_clusters_into_id(clusters=clusters, data=data)
print("Number of clusters: ", len(real_clusters))

# Start from a -1 sentinel instead of np.empty: rows that are not covered by
# any cluster would otherwise keep arbitrary uninitialised values, which then
# leak into the silhouette computation and the plot colours.
labels = np.full(X.shape[0], -1, dtype=int)
for label, cluster in enumerate(real_clusters):
    for entity_id in cluster:
        # NOTE(review): this indexes rows of X with the mapped id; it assumes
        # those ids coincide with row positions in the concatenated matrix --
        # TODO confirm against pyjedai's Data id mappings.
        labels[int(entity_id)] = label

silhouette_vals = silhouette_samples(X, labels)
mean_silhouette_score = silhouette_score(X, labels)
concatenated_df['Cluster'] = labels

# In[15]:


def _style_and_show(fig, marker_size):
    """Apply the shared marker outline and legend title to *fig*, then show it."""
    fig.update_traces(marker=dict(size=marker_size,
                                  line=dict(width=1, color='DarkSlateGrey')))
    fig.update_layout(legend_title_text='Cluster')
    fig.show()


# 2D projections of the embedding matrix X.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

tsne = TSNE(n_components=2, learning_rate='auto', init='random')
X_tsne = tsne.fit_transform(X)

df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df_tsne = pd.DataFrame(X_tsne, columns=['Dimension 1', 'Dimension 2'])

df_pca['Cluster'] = labels
df_tsne['Cluster'] = labels

# The 'name' column is shown as the hover description of each point.
df_pca['Description'] = concatenated_df['name']
df_tsne['Description'] = concatenated_df['name']

fig_pca = px.scatter(df_pca, x='PC1', y='PC2', color='Cluster',
                     title='PCA Cluster Plot',
                     hover_data={'PC1':False, 'PC2':False, 'Cluster':True, 'Description':True})
_style_and_show(fig_pca, marker_size=8)

fig_tsne = px.scatter(df_tsne, x='Dimension 1', y='Dimension 2', color='Cluster',
                      title='t-SNE Cluster Plot',
                      hover_data={'Dimension 1':False, 'Dimension 2':False, 'Cluster':True, 'Description':True})
_style_and_show(fig_tsne, marker_size=8)

# In[16]:

# 3D projections of the same matrix.
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X)

tsne_3d = TSNE(n_components=3, learning_rate='auto', init='random')
X_tsne_3d = tsne_3d.fit_transform(X)

df_pca_3d = pd.DataFrame(X_pca_3d, columns=['PC1', 'PC2', 'PC3'])
df_tsne_3d = pd.DataFrame(X_tsne_3d, columns=['Dimension 1', 'Dimension 2', 'Dimension 3'])

df_pca_3d['Cluster'] = labels
df_tsne_3d['Cluster'] = labels

df_pca_3d['Description'] = concatenated_df['name']
df_tsne_3d['Description'] = concatenated_df['name']

fig_pca_3d = px.scatter_3d(df_pca_3d, x='PC1', y='PC2', z='PC3', color='Cluster',
                           title='3D PCA Cluster Plot',
                           hover_data={'PC1':False, 'PC2':False, 'PC3':False, 'Cluster':True, 'Description':True})
_style_and_show(fig_pca_3d, marker_size=5)

fig_tsne_3d = px.scatter_3d(df_tsne_3d, x='Dimension 1', y='Dimension 2', z='Dimension 3', color='Cluster',
                            title='3D t-SNE Cluster Plot',
                            hover_data={'Dimension 1':False, 'Dimension 2':False, 'Dimension 3':False, 'Cluster':True, 'Description':True})
_style_and_show(fig_tsne_3d, marker_size=5)

# K. Nikoletos, G. Papadakis & M. Koubarakis #

# Apache License 2.0 #