#!/usr/bin/env python
# coding: utf-8

# # Query projects & experiments with ``RubiconJSON``
#
# Users can utilize the ``RubiconJSON`` class to query ``rubicon-ml`` logs in a JSONPath-like manner.
#
# ``RubiconJSON`` takes in top-level ``Rubicon`` objects, ``Projects``, and/or ``Experiments`` and
# composes a JSON representation of them. Then, with the ``search`` method, users can query their
# logged data using JSONPath syntax.
#
# ``RubiconJSON`` relies on [the ``jsonpath_ng`` library](https://github.com/h2non/jsonpath-ng) for
# query parsing. More information on the allowed syntax can be found
# [here in their documentation](https://github.com/h2non/jsonpath-ng#jsonpath-syntax).

# In[1]:


from rubicon_ml import Rubicon
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.model_selection import ParameterGrid, train_test_split


# ### Train some models, log some experiments
#
# We'll start off by loading a dataset and creating our ``rubicon-ml`` project.

# In[2]:


X, y = load_wine(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)


# In[3]:


rubicon = Rubicon(persistence="memory", auto_git_enabled=True)
project = rubicon.get_or_create_project(name="jsonpath querying")


# Now, let's train and evaluate some models and log their metadata to ``rubicon-ml``.

# In[4]:


for parameters in ParameterGrid({
    "n_estimators": [5, 50, 500],
    "min_samples_leaf": [1, 10, 100],
}):
    rfc = RandomForestClassifier(random_state=0, **parameters)

    tags = ["large"] if parameters["n_estimators"] > 10 else []
    experiment = project.log_experiment(model_name=rfc.__class__.__name__, tags=tags)

    for name, value in parameters.items():
        experiment.log_parameter(name=name, value=value)
    for name in X_train.columns:
        experiment.log_feature(name=name)

    rfc.fit(X_train, y_train)

    precision_scorer = make_scorer(precision_score, average="weighted", zero_division=0.0)
    precision = precision_scorer(rfc, X_test, y_test)
    recall_scorer = make_scorer(recall_score, average="weighted")
    recall = recall_scorer(rfc, X_test, y_test)

    experiment.log_metric(name="precision", value=precision)
    experiment.log_metric(name="recall", value=recall)

    experiment.log_artifact(data_object=rfc, name=rfc.__class__.__name__, tags=["trained"])


# ### Load experiments into the ``RubiconJSON`` class
#
# The ``RubiconJSON`` class accepts ``Projects``, ``Experiments``, and top-level ``Rubicon`` objects
# as input. Once instantiated, the ``RubiconJSON`` class has a ``json`` property detailing each
# project and experiment. Let's take a look at the representation of one of our experiments:

# In[5]:


from rubicon_ml import RubiconJSON

rubicon_json = RubiconJSON(experiments=project.experiments())
rubicon_json.json["experiment"][0]


# ### Query experiments with ``RubiconJSON.search``
#
# Once created, we can use the ``RubiconJSON`` class to query our experiment metadata. We'll start
# by getting each experiment that was tagged "large" during training.

# In[6]:


experiment_query = "$..experiment[?(@.tags[*]=='large')]"

for match in rubicon_json.search(experiment_query):
    print(match.value)
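# Since ``search`` relies on ``jsonpath_ng`` for parsing, the same query can also be parsed and
# evaluated by hand against the ``json`` property. A minimal sketch of that equivalent direct usage
# follows; it assumes ``jsonpath_ng``'s extended parser (the one that understands filter
# expressions like ``[?(...)]``) and should yield the same matches as ``search``:

# In[ ]:


from jsonpath_ng.ext import parse

# parse the tag-filter query, then evaluate it against the raw JSON representation
parsed_query = parse(experiment_query)
for match in parsed_query.find(rubicon_json.json):
    print(match.value)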
# We can access any attribute of the queried objects within the query as well.
# Let's just get the IDs of those experiments from the last cell.

# In[7]:


experiment_query += ".id"

for match in rubicon_json.search(experiment_query):
    print(match.value)


# Now, let's get _all_ the metrics from _every_ experiment:

# In[8]:


metric_query = "$..experiment[*].metric"

for match in rubicon_json.search(metric_query):
    print(match.value)


# Some of those precision scores are a lot better than others - let's grab only the ones at or
# above 0.96.

# In[9]:


best_metric_query = "$..experiment[*].metric[?(@.name=='precision' & @.value>=0.96)]"

for match in rubicon_json.search(best_metric_query):
    print(match.value)


# We can retrieve the IDs of the experiments those metrics belong to for further exploration.

# In[10]:


best_experiment_query = "$..experiment[?(@.metric[?(@.name=='precision' & @.value>=0.96)])].id"

for match in rubicon_json.search(best_experiment_query):
    print(match.value)


# We can use the IDs to retrieve ``rubicon-ml`` experiments and dig deeper into the metadata.

# In[11]:


for match in rubicon_json.search(best_experiment_query):
    experiment = project.experiment(id=match.value)

    print(experiment.artifact(name="RandomForestClassifier").get_data(unpickle=True))
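# To compare those top experiments side by side, one option is to flatten each one into a row of a
# ``pandas`` DataFrame. A minimal sketch, assuming the ``parameters()`` and ``metrics()`` accessors
# return the values logged above:

# In[ ]:


import pandas as pd

rows = []
for match in rubicon_json.search(best_experiment_query):
    experiment = project.experiment(id=match.value)

    # flatten each experiment's logged parameters and metrics into a single row
    parameters = {parameter.name: parameter.value for parameter in experiment.parameters()}
    metrics = {metric.name: metric.value for metric in experiment.metrics()}
    rows.append({"id": experiment.id, **parameters, **metrics})

pd.DataFrame(rows)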