#!/usr/bin/env python
# coding: utf-8

# # Query projects & experiments with ``RubiconJSON``
#
# Users can utilize the ``RubiconJSON`` class to query ``rubicon-ml`` logs in a JSONPath-like manner.
#
# ``RubiconJSON`` takes in top-level ``Rubicon`` objects, ``Projects``, and/or ``Experiments`` and
# composes a JSON representation of them. Then, with the ``search`` method, users can query their
# logged data using JSONPath syntax.
#
# ``RubiconJSON`` relies on [the ``jsonpath_ng`` library](https://github.com/h2non/jsonpath-ng) for
# query parsing. More information on the allowed syntax can be found
# [here in their documentation](https://github.com/h2non/jsonpath-ng#jsonpath-syntax).

# In[1]:


from rubicon_ml import Rubicon
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.model_selection import ParameterGrid, train_test_split


# ### Train some models, log some experiments
#
# We'll start off by loading a dataset and creating our ``rubicon-ml`` project.

# In[2]:


X, y = load_wine(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)


# In[3]:


rubicon = Rubicon(persistence="memory", auto_git_enabled=True)
project = rubicon.get_or_create_project(name="jsonpath querying")


# Now, let's train and evaluate some models and log their metadata to ``rubicon-ml``.

# In[4]:


for parameters in ParameterGrid({
    "n_estimators": [5, 50, 500],
    "min_samples_leaf": [1, 10, 100],
}):
    rfc = RandomForestClassifier(random_state=0, **parameters)

    tags = ["large"] if parameters["n_estimators"] > 10 else []
    experiment = project.log_experiment(model_name=rfc.__class__.__name__, tags=tags)

    for name, value in parameters.items():
        experiment.log_parameter(name=name, value=value)
    for name in X_train.columns:
        experiment.log_feature(name=name)

    rfc.fit(X_train, y_train)

    precision_scorer = make_scorer(precision_score, average="weighted", zero_division=0.0)
    precision = precision_scorer(rfc, X_test, y_test)
    recall_scorer = make_scorer(recall_score, average="weighted")
    recall = recall_scorer(rfc, X_test, y_test)

    experiment.log_metric(name="precision", value=precision)
    experiment.log_metric(name="recall", value=recall)

    experiment.log_artifact(data_object=rfc, name=rfc.__class__.__name__, tags=["trained"])


# ### Load experiments into the ``RubiconJSON`` class
#
# The ``RubiconJSON`` class accepts ``Projects``, ``Experiments``, and top-level ``Rubicon`` objects
# as input. Once instantiated, the ``RubiconJSON`` class has a ``json`` property detailing each
# project and experiment. Let's take a look at the representation of one of our experiments:

# In[5]:


from rubicon_ml import RubiconJSON

rubicon_json = RubiconJSON(experiments=project.experiments())
rubicon_json.json["experiment"][0]


# ### Query experiments with ``RubiconJSON.search``
#
# Once created, we can use the ``RubiconJSON`` class to query our experiment metadata. We'll start
# by getting each experiment that was tagged "large" during training.

# In[6]:


experiment_query = "$..experiment[?(@.tags[*]=='large')]"

for match in rubicon_json.search(experiment_query):
    print(match.value)
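# Since ``search`` relies on ``jsonpath_ng`` for parsing, the same query can also be parsed and
# evaluated by hand against the ``json`` property. A minimal sketch of that equivalent direct usage
# follows; it assumes ``jsonpath_ng``'s extended parser (the one that understands filter
# expressions like ``[?(...)]``) and should yield the same matches as ``search``:

# In[ ]:


from jsonpath_ng.ext import parse

# parse the tag-filter query, then evaluate it against the raw JSON representation
parsed_query = parse(experiment_query)
for match in parsed_query.find(rubicon_json.json):
    print(match.value)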
# We can access any attribute of the queried objects within the query as well.
# Let's just get the IDs of those experiments from the last cell.

# In[7]:


experiment_query += ".id"

for match in rubicon_json.search(experiment_query):
    print(match.value)


# Now, let's get _all_ the metrics from _every_ experiment:

# In[8]:


metric_query = "$..experiment[*].metric"

for match in rubicon_json.search(metric_query):
    print(match.value)


# Some of those precision scores are a lot better than others - let's grab only the ones at or
# above 0.96.

# In[9]:


best_metric_query = "$..experiment[*].metric[?(@.name=='precision' & @.value>=0.96)]"

for match in rubicon_json.search(best_metric_query):
    print(match.value)


# We can retrieve the IDs of the experiments those metrics belong to for further exploration.

# In[10]:


best_experiment_query = "$..experiment[?(@.metric[?(@.name=='precision' & @.value>=0.96)])].id"

for match in rubicon_json.search(best_experiment_query):
    print(match.value)


# We can use the IDs to retrieve ``rubicon-ml`` experiments and dig deeper into the metadata.

# In[11]:


for match in rubicon_json.search(best_experiment_query):
    experiment = project.experiment(id=match.value)

    print(experiment.artifact(name="RandomForestClassifier").get_data(unpickle=True))
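# To compare those top experiments side by side, one option is to flatten each one into a row of a
# ``pandas`` DataFrame. A minimal sketch, assuming the ``parameters()`` and ``metrics()`` accessors
# return the values logged above:

# In[ ]:


import pandas as pd

rows = []
for match in rubicon_json.search(best_experiment_query):
    experiment = project.experiment(id=match.value)

    # flatten each experiment's logged parameters and metrics into a single row
    parameters = {parameter.name: parameter.value for parameter in experiment.parameters()}
    metrics = {metric.name: metric.value for metric in experiment.metrics()}
    rows.append({"id": experiment.id, **parameters, **metrics})

pd.DataFrame(rows)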