Hyper-parameter optimization and fine-tuning using a novel framework named Optuna.
pyJedAI is an open-source library that can be installed from PyPI.
%pip install pyjedai -U
%pip show pyjedai
Imports
import plotly.express as px
import logging
import sys
import optuna
import plotly
import os
import sys
import pandas as pd
from optuna.visualization import *
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"
from pyjedai.datamodel import Data
# Load the two entity collections (abt / buy) and the ground-truth pairs.
# All three files are pipe-separated. The entity tables are read with
# na_filter=False and cast to str, so no cell is turned into NaN; the
# ground-truth file is read with pandas' default NA handling.
abt_df = pd.read_csv(
    "./../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False
).astype(str)
buy_df = pd.read_csv(
    "./../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False
).astype(str)
ground_truth_df = pd.read_csv("./../data/ccer/D2/gt.csv", sep='|', engine='python')

# Wrap everything in the pyJedAI data model; both sides expose the same
# three attributes and use 'id' as their identifier column.
data = Data(
    dataset_1=abt_df,
    attributes_1=['id', 'name', 'description'],
    id_column_name_1='id',
    dataset_2=buy_df,
    attributes_2=['id', 'name', 'description'],
    id_column_name_2='id',
    ground_truth=ground_truth_df,
)
from pyjedai.workflow import WorkFlow, compare_workflows
from pyjedai.block_building import StandardBlocking, QGramsBlocking, ExtendedQGramsBlocking, SuffixArraysBlocking, ExtendedSuffixArraysBlocking
from pyjedai.block_cleaning import BlockFiltering, BlockPurging
from pyjedai.comparison_cleaning import WeightedEdgePruning, WeightedNodePruning, CardinalityEdgePruning, CardinalityNodePruning, BLAST, ReciprocalCardinalityNodePruning, ReciprocalWeightedNodePruning, ComparisonPropagation
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering
# Optuna persistence settings: the study is stored in a local SQLite file
# named after `db_name`, and `title` doubles as the study's name.
db_name = "pyjedai"
title = "Test"
storage_name = f"sqlite:///{db_name}.db"
study_name = title  # Unique identifier of the study.
In the cell below, we define which parameters we want to fine-tune and the boundaries we suggest for each of them. We also set the F1-score as the objective to be maximized.
'''
OPTUNA objective function
'''
def objective(trial):
    """Run one pyJedAI workflow with trial-sampled settings and return its F1.

    Three hyper-parameters are sampled from ``trial``:

    * ``qgrams`` -- integer in [3, 10], passed to ``QGramsBlocking``.
    * ``ratio`` -- float in [0.7, 0.95], passed to ``BlockFiltering``.
    * ``similarity_threshold`` -- float in [0.05, 0.9], passed to
      ``EntityMatching``.

    Every other workflow setting is fixed. The workflow is executed on the
    module-level ``data`` object.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.

    Returns:
        The first of the three values returned by
        ``WorkFlow.get_final_scores()``, unpacked here as the F1 score.
    """
    # Sample in the same order the workflow steps are declared.
    qgrams = trial.suggest_int("qgrams", 3, 10)
    filtering_ratio = trial.suggest_float("ratio", 0.7, 0.95)
    threshold = trial.suggest_float("similarity_threshold", 0.05, 0.9)

    blocking_step = {
        "method": QGramsBlocking,
        "params": {"qgrams": qgrams},
        "attributes_1": ['name'],
        "attributes_2": ['name'],
    }
    cleaning_steps = [
        {"method": BlockPurging, "params": {"smoothing_factor": 1.025}},
        {"method": BlockFiltering, "params": {"ratio": filtering_ratio}},
    ]
    matching_step = {
        "method": EntityMatching,
        "metric": 'sorensen_dice',
        "similarity_threshold": threshold,
        "attributes": ['description', 'name'],
    }

    workflow = WorkFlow(
        block_building=blocking_step,
        block_cleaning=cleaning_steps,
        comparison_cleaning={"method": CardinalityEdgePruning},
        entity_matching=matching_step,
        clustering={"method": ConnectedComponentsClustering},
        name="Worflow-Test",
    )
    workflow.run(data, workflow_step_tqdm_disable=True, verbose=False)

    # Only the F1 score drives the optimization; the other two are unused.
    f1_score, _precision, _recall = workflow.get_final_scores()
    return f1_score
# Run settings: how many trials to execute and under which study name.
num_of_trials = 30
study_name = title  # Unique identifier of the study.

# Create the study in the configured storage, or load it if a study with
# this name already exists there. The objective value is maximized.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    directions=["maximize"],
    load_if_exists=True,
)

print("Optuna trials starting")
study.optimize(objective, n_trials=num_of_trials, show_progress_bar=True)
print("Optuna trials finished")
[I 2022-09-26 17:11:56,515] A new study created in RDB with name: Test
Optuna trials starting
C:\Users\nikol\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\progress_bar.py:49: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future. self._init_valid()
0%| | 0/30 [00:00<?, ?it/s]
[I 2022-09-26 17:12:08,614] Trial 0 finished with value: 0.30337436666113177 and parameters: {'qgrams': 8, 'ratio': 0.8380947452182991, 'similarity_threshold': 0.34701140984689427}. Best is trial 0 with value: 0.30337436666113177. [I 2022-09-26 17:12:30,648] Trial 1 finished with value: 0.20307681243216183 and parameters: {'qgrams': 5, 'ratio': 0.7929630924927731, 'similarity_threshold': 0.3138589895822442}. Best is trial 0 with value: 0.30337436666113177. [I 2022-09-26 17:12:53,415] Trial 2 finished with value: 0.19103604207409036 and parameters: {'qgrams': 4, 'ratio': 0.8038691888459086, 'similarity_threshold': 0.1331382386125572}. Best is trial 0 with value: 0.30337436666113177. [I 2022-09-26 17:12:58,756] Trial 3 finished with value: 0.28333512688101153 and parameters: {'qgrams': 7, 'ratio': 0.7144467000567123, 'similarity_threshold': 0.38959392590704467}. Best is trial 0 with value: 0.30337436666113177. [I 2022-09-26 17:13:03,935] Trial 4 finished with value: 0.4633111426794054 and parameters: {'qgrams': 10, 'ratio': 0.8517624194151302, 'similarity_threshold': 0.1658021229910926}. Best is trial 4 with value: 0.4633111426794054. [I 2022-09-26 17:13:26,462] Trial 5 finished with value: 0.18531875170393172 and parameters: {'qgrams': 3, 'ratio': 0.9174470432736699, 'similarity_threshold': 0.8777133320453102}. Best is trial 4 with value: 0.4633111426794054. [I 2022-09-26 17:13:38,772] Trial 6 finished with value: 0.1907552827778649 and parameters: {'qgrams': 4, 'ratio': 0.7808721328696897, 'similarity_threshold': 0.10683080190597335}. Best is trial 4 with value: 0.4633111426794054. [I 2022-09-26 17:13:43,365] Trial 7 finished with value: 0.33330840997266736 and parameters: {'qgrams': 9, 'ratio': 0.800827464931477, 'similarity_threshold': 0.40948711496314116}. Best is trial 4 with value: 0.4633111426794054. 
[I 2022-09-26 17:13:47,298] Trial 8 finished with value: 0.39784787794552795 and parameters: {'qgrams': 9, 'ratio': 0.7395142458665667, 'similarity_threshold': 0.830695162394687}. Best is trial 4 with value: 0.4633111426794054. [I 2022-09-26 17:14:04,391] Trial 9 finished with value: 0.1862693152521443 and parameters: {'qgrams': 3, 'ratio': 0.8596838078185782, 'similarity_threshold': 0.056875572246384}. Best is trial 4 with value: 0.4633111426794054. [I 2022-09-26 17:14:08,972] Trial 10 finished with value: 0.6482633708392243 and parameters: {'qgrams': 10, 'ratio': 0.8957263433574094, 'similarity_threshold': 0.630359849425508}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:13,289] Trial 11 finished with value: 0.5274133516352982 and parameters: {'qgrams': 10, 'ratio': 0.9045625386867897, 'similarity_threshold': 0.6392297322807924}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:18,068] Trial 12 finished with value: 0.42621654591235925 and parameters: {'qgrams': 10, 'ratio': 0.9483018131894073, 'similarity_threshold': 0.669716499745008}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:23,897] Trial 13 finished with value: 0.22112929238626436 and parameters: {'qgrams': 7, 'ratio': 0.8961116303610029, 'similarity_threshold': 0.6036793964995847}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:28,293] Trial 14 finished with value: 0.4070833316320489 and parameters: {'qgrams': 9, 'ratio': 0.8983635553727757, 'similarity_threshold': 0.5808143891387648}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:33,109] Trial 15 finished with value: 0.42621654591235925 and parameters: {'qgrams': 10, 'ratio': 0.9488029595469605, 'similarity_threshold': 0.7454617001392468}. Best is trial 10 with value: 0.6482633708392243. 
[I 2022-09-26 17:14:40,257] Trial 16 finished with value: 0.2123846817548284 and parameters: {'qgrams': 6, 'ratio': 0.8777239752940389, 'similarity_threshold': 0.5114877692227677}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:45,396] Trial 17 finished with value: 0.3258435026582389 and parameters: {'qgrams': 8, 'ratio': 0.9204153410113451, 'similarity_threshold': 0.7435281257384602}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:50,167] Trial 18 finished with value: 0.3480318083295157 and parameters: {'qgrams': 8, 'ratio': 0.875183707875409, 'similarity_threshold': 0.5035164496571632}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:14:54,146] Trial 19 finished with value: 0.36391860524716446 and parameters: {'qgrams': 10, 'ratio': 0.8294161917860436, 'similarity_threshold': 0.6808222011656786}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:01,379] Trial 20 finished with value: 0.20993148042255388 and parameters: {'qgrams': 6, 'ratio': 0.9196762030928591, 'similarity_threshold': 0.5795471650862246}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:05,584] Trial 21 finished with value: 0.46330684097155167 and parameters: {'qgrams': 10, 'ratio': 0.8563838420348091, 'similarity_threshold': 0.23323189289645876}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:09,902] Trial 22 finished with value: 0.4070833316320489 and parameters: {'qgrams': 9, 'ratio': 0.8948541483847852, 'similarity_threshold': 0.2644175908762265}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:14,091] Trial 23 finished with value: 0.42745701777773953 and parameters: {'qgrams': 10, 'ratio': 0.8429055971815752, 'similarity_threshold': 0.4726717116974519}. Best is trial 10 with value: 0.6482633708392243. 
[I 2022-09-26 17:15:18,489] Trial 24 finished with value: 0.40750648259180683 and parameters: {'qgrams': 9, 'ratio': 0.876132015812083, 'similarity_threshold': 0.6726766450815258}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:23,589] Trial 25 finished with value: 0.3258414249026382 and parameters: {'qgrams': 8, 'ratio': 0.9275690010126776, 'similarity_threshold': 0.16506027753408292}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:27,909] Trial 26 finished with value: 0.46330684097155167 and parameters: {'qgrams': 10, 'ratio': 0.8601034741266624, 'similarity_threshold': 0.7695051840189288}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:31,804] Trial 27 finished with value: 0.4037641499510466 and parameters: {'qgrams': 9, 'ratio': 0.7662074140900922, 'similarity_threshold': 0.44798780615040656}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:36,144] Trial 28 finished with value: 0.3639219361818585 and parameters: {'qgrams': 10, 'ratio': 0.8174855005418883, 'similarity_threshold': 0.6110684353074003}. Best is trial 10 with value: 0.6482633708392243. [I 2022-09-26 17:15:43,271] Trial 29 finished with value: 0.24080227825457184 and parameters: {'qgrams': 7, 'ratio': 0.8389446905059951, 'similarity_threshold': 0.35743796412615136}. Best is trial 10 with value: 0.6482633708392243. Optuna trials finished
# Tabulate every trial with its number, objective value, sampled parameters and state.
study.trials_dataframe(attrs=("number", "value", "params", "state"))
| | number | value | params_qgrams | params_ratio | params_similarity_threshold | state |
---|---|---|---|---|---|---|
0 | 0 | 0.303374 | 8 | 0.838095 | 0.347011 | COMPLETE |
1 | 1 | 0.203077 | 5 | 0.792963 | 0.313859 | COMPLETE |
2 | 2 | 0.191036 | 4 | 0.803869 | 0.133138 | COMPLETE |
3 | 3 | 0.283335 | 7 | 0.714447 | 0.389594 | COMPLETE |
4 | 4 | 0.463311 | 10 | 0.851762 | 0.165802 | COMPLETE |
5 | 5 | 0.185319 | 3 | 0.917447 | 0.877713 | COMPLETE |
6 | 6 | 0.190755 | 4 | 0.780872 | 0.106831 | COMPLETE |
7 | 7 | 0.333308 | 9 | 0.800827 | 0.409487 | COMPLETE |
8 | 8 | 0.397848 | 9 | 0.739514 | 0.830695 | COMPLETE |
9 | 9 | 0.186269 | 3 | 0.859684 | 0.056876 | COMPLETE |
10 | 10 | 0.648263 | 10 | 0.895726 | 0.630360 | COMPLETE |
11 | 11 | 0.527413 | 10 | 0.904563 | 0.639230 | COMPLETE |
12 | 12 | 0.426217 | 10 | 0.948302 | 0.669716 | COMPLETE |
13 | 13 | 0.221129 | 7 | 0.896112 | 0.603679 | COMPLETE |
14 | 14 | 0.407083 | 9 | 0.898364 | 0.580814 | COMPLETE |
15 | 15 | 0.426217 | 10 | 0.948803 | 0.745462 | COMPLETE |
16 | 16 | 0.212385 | 6 | 0.877724 | 0.511488 | COMPLETE |
17 | 17 | 0.325844 | 8 | 0.920415 | 0.743528 | COMPLETE |
18 | 18 | 0.348032 | 8 | 0.875184 | 0.503516 | COMPLETE |
19 | 19 | 0.363919 | 10 | 0.829416 | 0.680822 | COMPLETE |
20 | 20 | 0.209931 | 6 | 0.919676 | 0.579547 | COMPLETE |
21 | 21 | 0.463307 | 10 | 0.856384 | 0.233232 | COMPLETE |
22 | 22 | 0.407083 | 9 | 0.894854 | 0.264418 | COMPLETE |
23 | 23 | 0.427457 | 10 | 0.842906 | 0.472672 | COMPLETE |
24 | 24 | 0.407506 | 9 | 0.876132 | 0.672677 | COMPLETE |
25 | 25 | 0.325841 | 8 | 0.927569 | 0.165060 | COMPLETE |
26 | 26 | 0.463307 | 10 | 0.860103 | 0.769505 | COMPLETE |
27 | 27 | 0.403764 | 9 | 0.766207 | 0.447988 | COMPLETE |
28 | 28 | 0.363922 | 10 | 0.817486 | 0.611068 | COMPLETE |
29 | 29 | 0.240802 | 7 | 0.838945 | 0.357438 | COMPLETE |
# Objective value of each trial over the course of the study.
fig = plot_optimization_history(study)
fig.show()

# Parallel-coordinate view: first across all parameters, then "qgrams" only.
fig = plot_parallel_coordinate(study)
fig.show()
fig = plot_parallel_coordinate(study, params=["qgrams"])
fig.show()

# Contour view: first across all parameters, then "qgrams" vs "ratio" only.
fig = plot_contour(study)
fig.show()
fig = plot_contour(study, params=["qgrams", "ratio"])
fig.show()

# Slice view: first across all parameters, then "qgrams" and "ratio" only.
# (The first call previously repeated the restricted plot, rendering the
# same figure twice; it now follows the all-then-subset pattern used above.)
fig = plot_slice(study)
fig.show()
fig = plot_slice(study, params=["qgrams", "ratio"])
fig.show()

# Importance of each hyper-parameter with respect to the objective value.
fig = plot_param_importances(study)
fig.show()

# Empirical distribution function of the objective values.
fig = plot_edf(study)
fig.show()

# Importance of each hyper-parameter with respect to trial duration
# (in seconds) instead of the objective value.
fig = optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)
fig.show()