#!/usr/bin/env python
# coding: utf-8
#
# # Hyper-Parameter Tuning
#
# #
# Optimization and fine-tuning for the hyper-parameters using a novel
# framework named Optuna.

# # Install
#
# pyJedAI is an open-source library that can be installed from PyPI.

# In[ ]:

# The `%pip` magics only exist inside IPython/Jupyter. When this file is
# executed with a plain Python interpreter `get_ipython` is undefined, so
# the magics are skipped instead of aborting the script with a NameError.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None

if _ipython is not None:
    _ipython.run_line_magic('pip', 'install pyjedai -U')


# In[ ]:

if _ipython is not None:
    _ipython.run_line_magic('pip', 'show pyjedai')


# Imports

# In[1]:

import logging
import os
import sys

import optuna
import pandas as pd
import plotly
import plotly.express as px
import plotly.io as pio
# Star import kept from the notebook; the plot_* helpers used in the
# visualization section below come from here.
from optuna.visualization import *

pio.templates.default = "plotly_white"


# ## Data Reading

# In[3]:

from pyjedai.datamodel import Data

data = Data(
    dataset_1=pd.read_csv(
        "./../data/ccer/D2/abt.csv", sep='|', engine='python', na_filter=False
    ).astype(str),
    attributes_1=['id', 'name', 'description'],
    id_column_name_1='id',
    dataset_2=pd.read_csv(
        "./../data/ccer/D2/buy.csv", sep='|', engine='python', na_filter=False
    ).astype(str),
    attributes_2=['id', 'name', 'description'],
    id_column_name_2='id',
    ground_truth=pd.read_csv("./../data/ccer/D2/gt.csv", sep='|', engine='python'),
)


# ## WorkFlow

# In[4]:

from pyjedai.workflow import WorkFlow, compare_workflows
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)
from pyjedai.block_cleaning import BlockFiltering, BlockPurging
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning,
    WeightedNodePruning,
    CardinalityEdgePruning,
    CardinalityNodePruning,
    BLAST,
    ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning,
    ComparisonPropagation,
)
from pyjedai.matching import EntityMatching
from pyjedai.clustering import ConnectedComponentsClustering


# In[6]:

db_name = "pyjedai"
title = "Test"
storage_name = "sqlite:///{}.db".format(db_name)
study_name = title  # Unique identifier of the study.


# ## Objective function
#
# In the below cell, we define which parameters we want to be fine-tuned and
# the boundaries that we suggest. Also we set as the goal score to be
# maximized the F1-Score.

# In[7]:

def objective(trial):
    """OPTUNA objective function.

    Builds a pyJedAI ``WorkFlow`` whose tunable parameters are sampled from
    ``trial``, runs it on the module-level ``data`` and returns the score to
    be maximized.

    Sampled parameters and their bounds:
        * ``qgrams`` -- integer in [3, 10]
        * ``ratio`` -- float in [0.7, 0.95]
        * ``similarity_threshold`` -- float in [0.05, 0.9]

    Args:
        trial: The Optuna trial used to suggest parameter values.

    Returns:
        The first of the three values returned by
        ``WorkFlow.get_final_scores()``, which this script treats as F1.
    """
    w = WorkFlow(
        block_building=dict(
            method=QGramsBlocking,
            params=dict(qgrams=trial.suggest_int("qgrams", 3, 10)),
            attributes_1=['name'],
            attributes_2=['name'],
        ),
        block_cleaning=[
            dict(
                method=BlockPurging,
                params=dict(smoothing_factor=1.025),
            ),
            dict(
                method=BlockFiltering,
                params=dict(
                    ratio=trial.suggest_float("ratio", 0.7, 0.95),
                ),
            ),
        ],
        comparison_cleaning=dict(method=CardinalityEdgePruning),
        entity_matching=dict(
            method=EntityMatching,
            metric='sorensen_dice',
            similarity_threshold=trial.suggest_float(
                "similarity_threshold", 0.05, 0.9
            ),
            attributes=['description', 'name'],
        ),
        clustering=dict(method=ConnectedComponentsClustering),
        name="Worflow-Test",
    )
    w.run(data, workflow_step_tqdm_disable=True, verbose=False)
    # Only the first score drives the optimization; the other two are unused.
    f1, _precision, _recall = w.get_final_scores()
    return f1


# In[8]:

num_of_trials = 30
# `load_if_exists=True` reuses a study with the same name in the storage
# instead of failing when the script is run again.
study = optuna.create_study(
    directions=["maximize"],
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)
print("Optuna trials starting")
study.optimize(
    objective,
    n_trials=num_of_trials,
    show_progress_bar=True,
)
print("Optuna trials finished")


# # Optuna Visualizations

# In[9]:

study.trials_dataframe(attrs=("number", "value", "params", "state"))


# In[10]:

fig = plot_optimization_history(study)
fig.show()


# In[11]:

fig = plot_parallel_coordinate(study)
fig.show()


# In[12]:

fig = plot_parallel_coordinate(study, params=["qgrams"])
fig.show()


# In[13]:

fig = plot_contour(study)
fig.show()


# In[14]:

fig = plot_contour(study, params=["qgrams", "ratio"])
fig.show()


# In[15]:

fig = plot_slice(study, params=["qgrams", "ratio"])
fig.show()


# In[16]:

# NOTE(review): identical to cell In[15]; kept so the script shows the same
# figures as the notebook -- confirm whether other params were intended.
fig = plot_slice(study, params=["qgrams", "ratio"])
fig.show()


# In[17]:

fig = plot_param_importances(study)
fig.show()


# In[18]:

fig = plot_edf(study)
fig.show()


# In[19]:

# Parameter importances computed against trial duration instead of the
# objective value.
fig = optuna.visualization.plot_param_importances(
    study,
    target=lambda t: t.duration.total_seconds(),
    target_name="duration",
)
fig.show()

#
# K. Nikoletos, J. Maciejewski, G. Papadakis & M. Koubarakis #
#
# Apache License 2.0 #