#!/usr/bin/env python # coding: utf-8 # >### 🚩 *Create a free WhyLabs account to get more value out of whylogs!*
# >*Did you know you can store, visualize, and monitor whylogs profiles with the [WhyLabs Observability Platform](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Schema_Configuration)? Sign up for a [free WhyLabs account](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Schema_Configuration) to leverage the power of whylogs and WhyLabs together!*

# # Schema Configuration for Tracking Metrics

# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs/blob/mainline/python/examples/basic/Schema_Configuration.ipynb)

# When logging data, whylogs outputs certain metrics according to the column
# type. While whylogs provides a default behavior, you can configure it in order
# to only track metrics that are important to you.
#
# In this example, we'll see how you can configure the Schema for a dataset
# level to control which metrics you want to calculate. We'll see how to specify
# metrics:
#
# 1. Per data type
#
# 2. Per column name
#
# But first, let's talk briefly about whylogs' data types and basic metrics.

# ## Installing whylogs

# In[1]:

# Note: you may need to restart the kernel to use updated packages.
get_ipython().run_line_magic('pip', 'install whylogs')

# ## whylogs DataTypes

# whylogs maps different data types, like numpy arrays, list, integers, etc. to
# specific whylogs data types. The three most important whylogs data types are:
#
# - Integral
# - Fractional
# - String

# Anything that doesn't end up matching the above types will have an `AnyType`
# type.
#
# To check which type a certain Python type is mapped to in whylogs, you can use
# the StandardTypeMapper:

# In[2]:

from whylogs.core.datatypes import StandardTypeMapper

mapper = StandardTypeMapper()
mapper(list)

# ## Basic Metrics

# The standard metrics available in whylogs are grouped in __namespaces__.
# They are:
#
# - __counts__: Counters, such as number of samples and null values
# - __types__: Inferred types, such as boolean, string or fractional
# - __ints__: Max and Min Values
# - __distribution__: min, max, median, quantile values
# - __cardinality__: Number of different values
# - __frequent_items__: Most common values
# - __unicode_range__: Count of characters used in string values
# - __condition_count__: Count how often values meet specified conditions

# ## Configuring Metrics in the Dataset Schema

# Now, let's see how we can control which metrics are tracked according to the
# column's type or column name.

# ### Metrics per Type

# Let's assume you're not interested in every metric listed above, and you have
# a performance-critical application, so you'd like to do as few calculations as
# possible.
#
# For example, you might only be interested in:
#
# - Counts/Types metrics for every data type
# - Distribution metrics for Fractional
# - Frequent Items for Integral
#
# Let's see how we can configure our Schema to track only the above metrics for
# the related types.

# Let's create a sample dataframe to illustrate:

# In[ ]:

# Install pandas if you don't have it already
get_ipython().run_line_magic('pip', 'install pandas')

# In[4]:

import pandas as pd

sample_data = {
    "col1": [1, 2, 3],
    "col2": [3.0, 4.0, 5.0],
    "col3": ["a", "b", "c"],
    "col4": [3.0, 4.0, 5.0],
}
df = pd.DataFrame(data=sample_data)

# whylogs uses `Resolvers` in order to define how a column name or data type
# gets mapped to different metrics.
#
# We will create a custom Resolver class in order to customize it.
# In[5]:

from whylogs.core.resolvers import Resolver
from whylogs.core.datatypes import DataType, Fractional, Integral
from typing import Dict, List
from whylogs.core.metrics import StandardMetric
from whylogs.core.metrics.metrics import Metric


class MyCustomResolver(Resolver):
    """Resolver that keeps distribution metrics for Fractional and frequent
    items for Integral, and counters and types metrics for all data types."""

    def resolve(self, name: str, why_type: DataType, column_schema) -> Dict[str, Metric]:
        # Every column tracks counts and types, regardless of its data type.
        selected: List[StandardMetric] = [StandardMetric.counts, StandardMetric.types]
        # Numeric types each add one extra namespace.
        if isinstance(why_type, Fractional):
            selected.append(StandardMetric.distribution)
        if isinstance(why_type, Integral):
            selected.append(StandardMetric.frequent_items)
        # Instantiate an empty ("zero") metric for each selected namespace.
        return {metric.name: metric.zero(column_schema.cfg) for metric in selected}


# In the case above, the `name` parameter is not being used, as the column name
# is not relevant to map the metrics, only the `why_type`.
#
# We basically initialize the metric list with metrics of both `counts` and
# `types` namespaces regardless of the data type. Then, we check for the whylogs
# data type in order to add the desired metric namespace (`distribution` for
# __Fractional__ columns and `frequent_items` for __Integral__ columns)

# Now we can proceed with the normal process of logging a dataframe. Resolvers
# are passed to whylogs through a `Dataset Schema`, so we can pass a
# `DatasetSchema` object to log's `schema` parameter as follows:

# In[6]:

import whylogs as why
from whylogs.core import DatasetSchema

result = why.log(df, schema=DatasetSchema(resolvers=MyCustomResolver()))
prof = result.profile()
prof_view = prof.view()
pd.set_option("display.max_columns", None)
prof_view.to_pandas()

# Notice we have `counts` and `types` metrics for every type, `distribution`
# metrics only for `col2` and `col4` (floats) and `frequent_items` only for
# `col1` (ints).
#
# That's precisely what we wanted.
# ### Metrics per Column

# Now, suppose we don't want to specify the tracked metrics per data type, and
# rather by each specific columns.
#
# For example, we might want to track:
# - Count metrics for `col1`
# - Distribution Metrics for `col2`
# - Cardinality for `col3`
# - Distribution Metrics + Cardinality for `col4`
#
# The process is similar to the previous case. We only need to select the
# metrics based on the `name` instead of the `why_type`, like this:

# In[7]:

from whylogs.core.resolvers import Resolver
from whylogs.core.datatypes import DataType, Fractional, Integral
from typing import Dict, List
from whylogs.core.metrics import StandardMetric
from whylogs.core.metrics.metrics import Metric


class MyCustomResolver(Resolver):
    """Resolver that selects metrics per column *name*: counts for col1,
    distribution for col2, cardinality for col3, and distribution plus
    cardinality for col4. Columns not listed here get no metrics at all.
    """

    # Metric namespaces to track for each known column name. A dispatch table
    # is clearer (and easier to extend) than a chain of if-statements.
    _METRICS_BY_COLUMN: Dict[str, List[StandardMetric]] = {
        "col1": [StandardMetric.counts],
        "col2": [StandardMetric.distribution],
        "col3": [StandardMetric.cardinality],
        "col4": [StandardMetric.distribution, StandardMetric.cardinality],
    }

    def resolve(self, name: str, why_type: DataType, column_schema) -> Dict[str, Metric]:
        # Columns absent from the table resolve to an empty metric set, so
        # nothing is tracked for them (they show up as NaN in the profile).
        metrics = self._METRICS_BY_COLUMN.get(name, [])
        result: Dict[str, Metric] = {}
        for m in metrics:
            # m.zero(...) creates an empty metric ready to track this column.
            result[m.name] = m.zero(column_schema.cfg)
        return result


# Since there are no metrics common to all columns, any column whose name is not
# in the table simply gets an empty metric list.
#
# Now, we create a custom schema, just like before:

# In[8]:

import whylogs as why
from whylogs.core import DatasetSchema

df['col5'] = 0
result = why.log(df, schema=DatasetSchema(resolvers=MyCustomResolver()))
prof = result.profile()
prof_view = prof.view()
pd.set_option("display.max_columns", None)
prof_view.to_pandas()

# Note that existing columns that are not specified in your custom resolver
# won't have any metrics tracked.
# In the example above, we added a `col5` column, but since we didn't link any
# metrics to it, all of the metrics are `NaN`s.

# ## Declarative Schema

# In the previous section, we created subclasses of `Resolver` and implemented
# its `resolve()` method using control flow. The `DeclarativeSchema` allows us
# to customize the metrics present in a column by simply listing the metrics we
# want by data type or column name without implementing a `Resolver` subclass.

# ### Declarative Schema Specification

# A `ResolverSpec` specifies a list of metrics to use for columns that match it.
# We can match columns by name or by type. The column name takes precedence if
# both are given. Each `ResolverSpec` has a list of `MetricSpec` that specify
# the `Metric`s (and optionally custom configurations) to apply to matching
# metrics. For example:

# In[9]:

from whylogs.core.metrics.condition_count_metric import (
    Condition,
    ConditionCountConfig,
    ConditionCountMetric,
)
from whylogs.core.relations import Predicate
from whylogs.core.resolvers import COLUMN_METRICS, MetricSpec, ResolverSpec
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.datatypes import AnyType, DataType, Fractional, Integral, String

X = Predicate()

# For col1 specifically: a distribution metric plus condition counts that
# compare each entry against 42.
col1_spec = ResolverSpec(
    column_name="col1",
    metrics=[
        MetricSpec(StandardMetric.distribution.value),
        MetricSpec(
            ConditionCountMetric,
            ConditionCountConfig(
                conditions={
                    "below 42": Condition(lambda x: x < 42),
                    "above 42": Condition(lambda x: x > 42),
                }
            ),
        ),
    ],
)

# For every String-typed column: frequent items plus condition counts that
# classify entries as alphabetic or numeric.
string_spec = ResolverSpec(
    column_type=String,
    metrics=[
        MetricSpec(StandardMetric.frequent_items.value),
        MetricSpec(
            ConditionCountMetric,
            ConditionCountConfig(
                conditions={
                    "alpha": Condition(X.matches("[a-zA-Z]+")),
                    "digit": Condition(X.matches("[0-9]+")),
                }
            ),
        ),
    ],
)

schema = DeclarativeSchema([col1_spec, string_spec])

d = {"col1": [1, 2, 3], "col2": [3.0, 4.0, 5.0], "col3": ["a", "b", "c"], "col4": [3.0, 4.0, 5.0]}
df = pd.DataFrame(data=d)

result = why.log(df, schema=schema)
prof_view = result.profile().view()
prof_view.to_pandas()

# We can now pass `schema` to `why.log()` to log data according to the schema.
# Note that we pass the `Metric` class to the `MetricSpec` constructor, not an
# instance. In this example, `col1` will have a `ConditionCountMetric` that
# tracks how often the column entries are above or below 42. Any string column
# will track how many entries are alphabetic and how many are numeric.
#
# `whylogs.core.resolvers.COLUMN_METRICS` is a list of `MetricSpec`s for the
# metrics WhyLabs expects in each column. There are also some predefined
# `ResolverSpec` lists to cover common use cases. For example,
# `STANDARD_RESOLVER` specifies the same metrics as the `StandardResolver`:

# In[10]:


def _column_spec(col_type, extra_metrics):
    # One ResolverSpec per data type: the WhyLabs column metrics plus any
    # type-specific extras.
    return ResolverSpec(
        column_type=col_type,
        metrics=COLUMN_METRICS + [MetricSpec(m.value) for m in extra_metrics],
    )


STANDARD_RESOLVER = [
    _column_spec(
        Integral,
        [
            StandardMetric.distribution,
            StandardMetric.ints,
            StandardMetric.cardinality,
            StandardMetric.frequent_items,
        ],
    ),
    _column_spec(
        Fractional,
        [
            StandardMetric.distribution,
            StandardMetric.cardinality,
        ],
    ),
    _column_spec(
        String,
        [
            StandardMetric.unicode_range,
            StandardMetric.distribution,
            StandardMetric.cardinality,
            StandardMetric.frequent_items,
        ],
    ),
    _column_spec(AnyType, []),
]

# There are also declarations for
# * `LIMITED_TRACKING_RESOLVER` just tracks the metrics required by WhyLabs,
#   plus the distribution metric for numeric columns.
# * `NO_FI_RESOLVER` is the same as `STANDARD_RESOLVER` but omits the frequent
#   item metrics.
# * `HISTOGRAM_COUNTING_TRACKING_RESOLVER` tracks only the distribution metric
#   for each column.
# These provide handy starting places if we just want to add one or two metrics
# to one of these standard schema using the `add_resolver()` method:

# In[11]:

from whylogs.core.resolvers import STANDARD_RESOLVER

schema = DeclarativeSchema(STANDARD_RESOLVER)

# One extra ResolverSpec for col1: condition counts (above/below 42) on top of
# the standard metrics.
extra_metric = ResolverSpec(
    column_name="col1",
    metrics=[
        MetricSpec(StandardMetric.distribution.value),
        MetricSpec(
            ConditionCountMetric,
            ConditionCountConfig(
                conditions={
                    "below 42": Condition(lambda x: x < 42),
                    "above 42": Condition(lambda x: x > 42),
                }
            ),
        ),
    ],
)
schema.add_resolver(extra_metric)

result = why.log(df, schema=schema)
prof_view = result.profile().view()
prof_view.to_pandas()

# This example adds a condition count metric to `col1` in addition to the usual
# default metrics.

# ### Default Resolver

# If you instantiate a `DeclarativeResolver` without passing it a list of
# `ResolverSpec`s, it will use the value of the variable
# `whylogs.core.resolvers.DEFAULT_RESOLVER`. Initially this has the value of
# `STANDARD_RESOLVER`, which matches whylogs' default behavior. You can set the
# value to one of the other pre-defined resolver lists or your own custom
# resolver list to customize the default resolving behavior.
#
# Similarly, there is a
# `whylogs.experimental.core.metrics.udf_metric.DEFAULT_UDF_RESOLVER` variable
# that specifies the default resolvers for the submetrics in a `UdfMetric`.

# ## Excluding Metrics

# The `ResolverSpec` has an `exclude` field. If this is set to true, the metrics
# listed in the `ResolverSpec` are excluded from columns that match it.
# This can be handy for preventing sensitive information from "leaking" via a
# frequent items metric:

# In[15]:

from whylogs.core.resolvers import DEFAULT_RESOLVER

data = pd.DataFrame({"Sensitive": ["private", "secret"], "Boring": ["normal", "stuff"]})

# exclude=True removes the listed metrics (here, frequent items) from any
# column that matches this spec — only the "Sensitive" column, by name.
no_fi_for_sensitive = ResolverSpec(
    column_name="Sensitive",
    metrics=[MetricSpec(StandardMetric.frequent_items.value)],
    exclude=True,
)
schema = DeclarativeSchema(DEFAULT_RESOLVER + [no_fi_for_sensitive])

result = why.log(data, schema=schema)
result.profile().view().to_pandas()["frequent_items/frequent_strings"]

# The frequent items metrics has been excluded from the `Sensitive` column
# without affecting the `DEFAULT_RESOLVER`'s treatment of other columns.