#!/usr/bin/env python
# coding: utf-8

# >### 🚩 *Create a free WhyLabs account to get more value out of whylogs!*
# >*Did you know you can store, visualize, and monitor whylogs profiles with the [WhyLabs Observability Platform](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Constraints_Suite)? Sign up for a [free WhyLabs account](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Constraints_Suite) to leverage the power of whylogs and WhyLabs together!*

# # Simple Constraints - Examples and Usage

# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs/blob/mainline/python/examples/basic/Constraints_Suite.ipynb)

# > This is a `whylogs v1` example. For the analogous feature in `v0`, please refer to [this example](https://github.com/whylabs/whylogs/blob/maintenance/0.7.x/examples/Constraints_Suite.ipynb)

# In this example, we'll show how to define a number of simple constraints and examples on how to use them. For the basics on how to build your own set of constraints, see the example - [Data Validation with Metric Constraints](https://whylogs.readthedocs.io/en/stable/examples/advanced/Metric_Constraints.html).
#
# The constraints are listed according to the metric namespace used when defining them. For each category, we will create helper functions for simple and popular constraints. Each helper function has a brief explanation in its docstring. After defining the helper functions, we'll show a simple example on how to build the constraints out of the functions and visualize them as a report with the visualization module.

# > Note: The constraints shown here are still experimental and subject to further changes. Stay tuned for upgrades!
# ## Completeness Constraints
#
# | constraint | parameters | semantic | metric |
# |------------------------------|---------------------|-------------------------------------------------------|--------|
# | no_missing_values | column name | Checks that there are no missing values in the column | Counts |
# | null_values_below_number | column name, number | Number of null values must be below given number. | Counts |
# | null_percentage_below_number | column name, number | Percentage of null values must be below given number. | Counts |

# ## Consistency Constraints
#
# | constraint | parameters | semantic | metric |
# |-----------------------------------|----------------------------|----------------------------------------------------------------------------------------|----------------|
# | greater_than_number | column name, number | Minimum value of given column must be above defined number. | Distribution |
# | smaller_than_number | column name, number | Maximum value of given column must be below defined number. | Distribution |
# | is_in_range | column name, lower, upper | Checks that all of column's values are in defined range (inclusive). | Distribution |
# | is_non_negative | column name | Checks if a column is non negative. | Distribution |
# | n_most_common_items_in_set | column name, n, reference set | Checks if the top n most common items appear in the dataset. | Frequent Items |
# | frequent_strings_in_reference_set | column name, reference set | Checks if a set of variables appear in the frequent strings for a string column. | Frequent Items |
# | count_below_number | column name, number | Checks if elements in a column are below given number. | Counts |
# | distinct_number_in_range | column name, lower, upper | Checks if number of distinct categories is between lower and upper values (inclusive). | Cardinality |
# | column_is_nullable_integral | column name | Check if column contains only records of specific datatype. | Types |
# | column_is_nullable_boolean | column name | Check if column contains only records of specific datatype. | Types |
# | column_is_nullable_fractional | column name | Check if column contains only records of specific datatype. | Types |
# | column_is_nullable_object | column name | Check if column contains only records of specific datatype. | Types |
# | column_is_nullable_string | column name | Check if column contains only records of specific datatype. | Types |

# ## Condition Count Constraints
#
# Please refer to the example [Metric Constraints with Condition Count Metrics](https://github.com/whylabs/whylogs/blob/mainline/python/examples/advanced/Metric_Constraints_with_Condition_Count_Metrics.ipynb) for examples on how to use these constraints.
#
# | constraint | parameters | semantic | metric |
# |------------------------|-------------------------------------|--------------------------------------------------------------------------------------|--------------|
# | condition_meets | column name, condition_name | Fails if condition not met at least once. | Condition Count |
# | condition_never_meets | column name, condition_name | Fails if condition is met at least once | Condition Count |
# | condition_count_below | column name, condition_name, max_count | Fails if condition is met more than max count | Condition Count |
#
# ## Statistics Constraints
#
# | constraint | parameters | semantic | metric |
# |------------------------|-------------------------------------|--------------------------------------------------------------------------------------|--------------|
# | mean_between_range | column name, lower, upper | Mean must be between range defined by lower and upper bounds. | Distribution |
# | stddev_between_range | column name, lower, upper | Standard deviation must be between range defined by lower and upper bounds. | Distribution |
# | quantile_between_range | column name, quantile, lower, upper | Q-th quantile value must be within the range defined by lower and upper boundaries. | Distribution |

# ## Table of Contents
#
# - [Installing and Importing Modules](#pre)
# - [Distribution Metrics Constraints](#distribution)
# - [Frequent Items/Frequent Strings Metrics Constraints](#frequent)
# - [Counters Constraints](#counts)
# - [Cardinality Constraints](#card)
# - [Types Constraints](#types)
# - [Combined Metrics Constraints](#comb)
#

# ## Installing whylogs and importing modules
#
# If you haven't already, install whylogs:

# In[1]:


# Note: you may need to restart the kernel to use updated packages.
get_ipython().run_line_magic('pip', "install 'whylogs[viz]'")


# Then, let's import the helper functions needed to define the constraints:

# In[2]:


from whylogs.core.constraints import ConstraintsBuilder
from whylogs.core.constraints.factories import (
    greater_than_number,
    is_in_range,
    is_non_negative,
    mean_between_range,
    quantile_between_range,
    smaller_than_number,
    stddev_between_range,
)


# ### Examples - Distribution Metrics Constraints

# In[3]:


import pandas as pd
import whylogs as why

# Small toy dataset, profiled once; the constraints below run against the profile view.
frame = pd.DataFrame(
    {
        "animal": ["cat", "hawk", "snake", "cat", "mosquito"],
        "legs": [4, 2, 0, 4, 6],
        "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
    }
)
profile_view = why.log(frame).view()


# In[4]:


builder = ConstraintsBuilder(dataset_profile_view=profile_view)

distribution_constraints = (
    greater_than_number(column_name="weight", number=0.14),
    mean_between_range(column_name="weight", lower=2, upper=3),
    smaller_than_number(column_name="weight", number=20.5),
    stddev_between_range(column_name="weight", lower=1, upper=3),
    quantile_between_range(column_name="weight", quantile=0.5, lower=1.5, upper=2.0),
    is_in_range(column_name="weight", lower=1.1, upper=3.2),
    is_in_range(column_name="legs", lower=0, upper=6),
    is_non_negative(column_name="legs"),
    # `animal` has missing distribution metrics: this constraint will pass
    # if skip_missing=True and fail otherwise.
    quantile_between_range(
        column_name="animal",
        quantile=0.5,
        lower=1.5,
        upper=2.0,
        skip_missing=False,
    ),
)
for constraint in distribution_constraints:
    builder.add_constraint(constraint)

constraints = builder.build()

from whylogs.viz import NotebookProfileVisualizer

visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)


# ## Frequent Items/Frequent Strings Constraints

# In[5]:


from whylogs.core.constraints.factories import (
    frequent_strings_in_reference_set,
    n_most_common_items_in_set,
)


# ### Examples - Frequent Items/Frequent Strings Constraints

# In[6]:


import pandas as pd
import whylogs as why

frame = pd.DataFrame(
    {
        "animal": ["cat", "snake", "snake", "cat", "mosquito"],
        "legs": [0, 1, 2, 3, 4],
        "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
    }
)
profile_view = why.log(frame).view()


# In[7]:


builder = ConstraintsBuilder(dataset_profile_view=profile_view)

reference_set = {"cat", "snake"}
frequent_items_constraints = (
    frequent_strings_in_reference_set(column_name="animal", reference_set=reference_set),
    n_most_common_items_in_set(column_name="animal", n=2, reference_set=reference_set),
)
for constraint in frequent_items_constraints:
    builder.add_constraint(constraint)

constraints = builder.build()

from whylogs.viz import NotebookProfileVisualizer

visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)


# ## Counters Constraints

# In[8]:


from whylogs.core.constraints.factories import (
    count_below_number,
    no_missing_values,
    null_percentage_below_number,
    null_values_below_number,
)


# ### Examples - Counters Constraints

# In[9]:


import pandas as pd
import whylogs as why

frame = pd.DataFrame(
    {
        "animal": ["cat", "snake", "snake", "cat", "mosquito"],
        "legs": [4, 2, 0, None, 6],
        "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
    }
)
profile_view = why.log(frame).view()


# In[10]:


builder = ConstraintsBuilder(dataset_profile_view=profile_view)

counters_constraints = (
    count_below_number(column_name="legs", number=10),
    null_percentage_below_number(column_name="legs", number=0.05),
    null_values_below_number(column_name="legs", number=1),
    no_missing_values(column_name="legs"),
    no_missing_values(column_name="animal"),
)
for constraint in counters_constraints:
    builder.add_constraint(constraint)

constraints = builder.build()

from whylogs.viz import NotebookProfileVisualizer

visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)


# ## Cardinality Constraints

# In[11]:


from whylogs.core.constraints.factories import distinct_number_in_range


# ### Examples - Cardinality Constraints

# In[12]:


import pandas as pd
import whylogs as why

frame = pd.DataFrame(
    {
        "animal": ["cat", "snake", "snake", "cat", "mosquito"],
        "legs": [4, 2, 0, None, 6],
        "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
    }
)
profile_view = why.log(frame).view()


# In[13]:


builder = ConstraintsBuilder(dataset_profile_view=profile_view)
builder.add_constraint(
    distinct_number_in_range(column_name="animal", lower=3, upper=6)
)
constraints = builder.build()

from whylogs.viz import NotebookProfileVisualizer

visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)


# ## Types Metrics

# ### Examples - Types Metrics

# In[14]:


import pandas as pd
import whylogs as why

frame = pd.DataFrame(
    {
        "animal": ["cat", "snake", "snake", "cat", "mosquito"],
        "legs": [4, 2, 0, None, 6],
        "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
        "flies": [False, False, "False", False, True],
        "obj": [{"a": 1}, None, {"a": 1}, {"a": 1}, {"a": 1}],
    }
)
profile_view = why.log(frame).view()


# #### Check Nullable Types

# In[15]:


from whylogs.core.constraints.factories import (
    column_is_nullable_boolean,
    column_is_nullable_fractional,
    column_is_nullable_integral,
    column_is_nullable_object,
    column_is_nullable_string,
)
from whylogs.core.constraints import ConstraintsBuilder

builder = ConstraintsBuilder(dataset_profile_view=profile_view)

# One nullable-type constraint per column, added in this order.
nullable_type_checks = (
    (column_is_nullable_string, "animal"),
    (column_is_nullable_integral, "legs"),
    (column_is_nullable_fractional, "weight"),
    (column_is_nullable_boolean, "flies"),
    (column_is_nullable_object, "obj"),
)
for type_check, column in nullable_type_checks:
    builder.add_constraint(type_check(column_name=column))

constraints = builder.build()

from whylogs.viz import NotebookProfileVisualizer

visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)


# The constraints above will pass if all values are of a given type. Null values are accepted.
#
# Note that for `legs`, the constraint failed. That is because whylogs leverages __pandas' dtypes__ when they are available, and when a `None` is present, the column is considered to be `fractional`, even though the remaining values were originally integers.

# ## Combined Constraints

# ### Examples - Combined Metrics

# To create a constraint that checks for a non-nullable type, we combine two separate constraints:
#
# - `column is nullable datatype`
# - `null values below 1`

# In[16]:


import pandas as pd
import whylogs as why

frame = pd.DataFrame(
    {
        "animal": ["cat", "snake", "snake", "cat", "mosquito"],
        "legs": [4, 2, 0, None, 6],
        "weight": [4.3, 1.8, 1.3, 4.1, 5.5e-6],
        "flies": [False, False, "False", False, True],
        "obj": [{"a": 1}, None, {"a": 1}, {"a": 1}, {"a": 1}],
    }
)
profile_view = why.log(frame).view()


# #### Check Non-nullable Types

# In[17]:


from whylogs.core.constraints.factories import (
    column_is_nullable_boolean,
    column_is_nullable_fractional,
    column_is_nullable_integral,
    column_is_nullable_object,
    column_is_nullable_string,
    null_values_below_number,
)
from whylogs.core.constraints import ConstraintsBuilder

builder = ConstraintsBuilder(dataset_profile_view=profile_view)

# For each column, the nullable-type constraint is paired with
# `null_values_below_number(..., number=1)`; the combination of the two makes
# a check of the corresponding non-nullable type (string, integral,
# fractional, boolean, object).
non_nullable_type_checks = (
    (column_is_nullable_string, "animal"),
    (column_is_nullable_integral, "legs"),
    (column_is_nullable_fractional, "weight"),
    (column_is_nullable_boolean, "flies"),
    (column_is_nullable_object, "obj"),
)
for type_check, column in non_nullable_type_checks:
    builder.add_constraint(type_check(column_name=column))
    builder.add_constraint(null_values_below_number(column_name=column, number=1))

constraints = builder.build()

from whylogs.viz import NotebookProfileVisualizer

visualization = NotebookProfileVisualizer()
visualization.constraints_report(constraints, cell_height=300)