#!/usr/bin/env python
# coding: utf-8

# >### 🚩 *Create a free WhyLabs account to get more value out of whylogs!*
# >*Did you know you can store, visualize, and monitor whylogs profiles with the [WhyLabs Observability Platform](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Metric_Constraints)? Sign up for a [free WhyLabs account](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Metric_Constraints) to leverage the power of whylogs and WhyLabs together!*
#
# # Data Validation with Metric Constraints
#
# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs/blob/mainline/python/examples/advanced/Metric_Constraints.ipynb)
#
# > This is an example for whylogs versions 1.0.0 and above. If you're interested in constraints for versions <1.0.0, please see these examples: [Constraints Suite](https://github.com/whylabs/whylogs/blob/maintenance/0.7.x/examples/Constraints_Suite.ipynb), [Constraints-Distributional Measures](https://github.com/whylabs/whylogs/blob/maintenance/0.7.x/examples/Constraints_Distributional_Measures.ipynb), and [Creating Customized Constraints](https://github.com/whylabs/whylogs/blob/maintenance/0.7.x/examples/Creating_Customized_Constraints.ipynb)

# In[ ]:


# Note: you may need to restart the kernel to use updated packages.
# NOTE(review): `get_ipython` is only defined when this script runs under
# IPython/Jupyter; it is the exported form of the `%pip install` cell magic.
get_ipython().run_line_magic('pip', "install 'whylogs[viz]'")


# Starting with the basic pandas dataframe logging, consider the following input.
# We will generate a whylogs profile view from this

# In[2]:


import pandas as pd
import whylogs as why

data = {
    "animal": ["cat", "hawk", "snake", "cat", "mosquito"],
    "legs": [4, 2, 0, 4, 6],
    "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
}

results = why.log(pd.DataFrame(data))
profile_view = results.view()


# The profile view can be displayed as a pandas dataframe where the columns are metric/component paths

# In[3]:


profile_view.to_pandas()


# In the above output notice that we have metrics on the number of legs these animals have in the "legs" column.
# Let's say we want to define some constraints on the number of "legs" we expect for animals.

# In[4]:


from whylogs.core.constraints import Constraints, ConstraintsBuilder, MetricsSelector, MetricConstraint

column_view = profile_view.get_column("legs")

# constraint session bound to profile_view
builder = ConstraintsBuilder(profile_view)

# A constraint builder lets you generate a set of constraints using the passed in profile_view's list of columns and metrics.
# lets explore what kind of column profiles and metrics we have available in the profile view

# We can specify a metric by selecting a (column_name, metric_name)
# lets look at the column names again:
column_names = profile_view.get_columns().keys()
print(f"columns: {column_names}")

# And here are the metric names on the "legs" column
# (reuses the column view fetched above rather than looking the column up again)
metric_names = column_view.get_metric_names()
print(f"metric names: {metric_names}")

# If you want the full set of possibilities you can ask the builder for all MetricSelectors
# which covers the unique combinations of (column_name, metric_name)
selectors = builder.get_metric_selectors()
# arbitrary index, used only to show what a single selector looks like
i = 6
print(f"here is selector at index {i}: {selectors[i]} there are a total of {len(selectors)}")


# In[5]:


# Lets say we're interested in defining a constraint on the number of "legs".
# From output above we see
# that there are the following metrics on column "legs": [counts, types, distribution, ints, cardinality, frequent_items]
# lets look at what the distribution metric contains:
distribution_values = profile_view.get_column("legs").get_metric("distribution").to_summary_dict()
distribution_values


# Ok, let's come back to how to use the ConstraintsBuilder to add a couple constraints

# In[6]:


# the constraints builder add_constraint() takes in a MetricConstraint, which requires three things to define it:
#   1. A metric selector, this is a way of selecting which metric and on which column you want to apply a constraint.
#      let's choose MetricsSelector(metric_name='distribution', column_name='legs', metrics_resolver=None)
#   2. an expression on the selected metric, for distribution, we have numeric properties such as max, min, stddev
#      and others we can reference. For this we'll require animal legs < 12 (sorry centipedes)!
#   3. a name for this constraint, let's go with "legs < 12"

distribution_legs = MetricsSelector(metric_name='distribution', column_name='legs')


# this condition takes in a distribution metric, which has convenience properties on this metric for max/min,
# but we could also call to_summary_dict() and use any of the keys we saw in 'distribution_values' above
def legs_under_12(x):
    """Return True when the selected metric's ``max`` is strictly below 12."""
    return x.max < 12


constraint_name = "legs < 12"

legs_constraint = MetricConstraint(
    name=constraint_name,
    condition=legs_under_12,
    metric_selector=distribution_legs,
)


# In[7]:


# now that we have a legs_constraint defined we can add it to the builder:
builder.add_constraint(legs_constraint)


# we could add more constraints using this pattern to the builder, maybe we realize negative values are invalid
def not_negative(x):
    """Return True when the selected metric's ``min`` is zero or greater."""
    return x.min >= 0


builder.add_constraint(MetricConstraint(
    name="legs >= 0",
    condition=not_negative,
    metric_selector=distribution_legs,
))

# ok lets build these constraints
constraints: Constraints = builder.build()

# A Constraints object contains a
# collection of constraints and can call validate to get a pass/fail
# or report for display
constraints_valid = constraints.validate()
print(f"Constraints valid: {constraints_valid}")

# And a simple report of the [constraint name, pass, fail, summary] can be generated like this:
constraints_report = constraints.generate_constraints_report()
print(f"Constraints report [constraint name, pass, fail, summary]: {constraints_report}")


# Ok lets add a few more! and rebuild the constraints

# In[8]:


def stddev_below_3(x):
    """Return True when the selected metric's ``stddev`` is strictly below 3.0."""
    return x.stddev < 3.0


builder.add_constraint(MetricConstraint(
    name="legs stddev < 3.0",
    condition=stddev_below_3,
    metric_selector=distribution_legs,
))

distribution_weight = MetricsSelector(metric_name='distribution', column_name='weight')
builder.add_constraint(MetricConstraint(
    name="weight >= 0",
    condition=not_negative,
    metric_selector=distribution_weight,
))

# snapshot of the constraints before the deliberately failing one is added below
reasonable_constraints = builder.build()

builder.add_constraint(MetricConstraint(
    name="animal count >= 1000",
    # inclusive comparison so the check agrees with the constraint's name
    condition=lambda x: x.n.value >= 1000,
    metric_selector=MetricsSelector(metric_name='counts', column_name='animal'),
))
reasonable_constraints_over_1000_rows = builder.build()


# In[9]:


from whylogs.viz import NotebookProfileVisualizer

# You can also pass the constraints to the NotebookProfileVisualizer and generate a report
visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)


# If you hover on the `Passed/Fail` icons, you'll be able to check the summary of the metric that was used to build the constraints. In this case, `legs<12` passed because the `max` metric component is __6__, which is below the number __12__.
#
# Similarly, `legs >= 0` passed, because `min` is __0__, which is above or equal to __0__.

# In[10]:


# a slightly more interesting report
visualization.constraints_report(reasonable_constraints, cell_height=400)


# In[11]:


# a failing report (because we don't have enough animals!)
visualization.constraints_report(reasonable_constraints_over_1000_rows, cell_height=400)