#!/usr/bin/env python
# coding: utf-8

# # Schema Configuration for Tracking Metrics

# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs-v1/blob/mainline/python/examples/basic/Schema_Configuration.ipynb)

# When logging data, whylogs outputs certain metrics according to the column type. While whylogs provides a default behaviour, you can configure it to track only the metrics that are important to you.
#
# In this example, we'll see how you can configure the dataset-level Schema to control which metrics are calculated.
# We'll see how to specify metrics:
#
# 1. Per data type
#
# 2. Per column name
#
# But first, let's talk briefly about whylogs' data types and basic metrics.

# ## whylogs DataTypes

# whylogs maps different Python data types, like numpy arrays, lists, integers, etc., to specific whylogs data types. The three most important whylogs data types are:
#
# - Integral
# - Fractional
# - String

# Anything that doesn't match the above types will have an `AnyType` type.
#
# If you want to check which whylogs type a given Python type is mapped to, you can use the `StandardTypeMapper`:

# In[1]:


from whylogs.core.datatypes import StandardTypeMapper

type_mapper = StandardTypeMapper()

type_mapper(list)


# ## Basic Metrics

# The standard metrics available in whylogs are grouped in __namespaces__. They are:
#
# - __counts__: counters, such as the number of samples and null values
# - __types__: inferred types, such as boolean, string, or fractional
# - __ints__: max and min values
# - __distribution__: min, max, median, and quantile values
# - __cardinality__: estimated number of unique values
# - __frequent_items__: most frequently occurring values

# ## Configuring Metrics in the Dataset Schema

# Now, let's see how we can control which metrics are tracked according to the column's type or column name.

# ### Metrics per Type

# Let's assume you're not interested in every metric listed above, and you have a performance-critical application, so you'd like to do as few calculations as possible.
#
# For example, you might only be interested in:
#
# - Counts/Types metrics for every data type
# - Distribution metrics for Fractional columns
# - Frequent Items for Integral columns
#
# Let's see how we can configure our Schema to track only the above metrics for the related types.

# Let's create a sample dataframe to illustrate:

# In[2]:


import pandas as pd

d = {"col1": [1, 2, 3], "col2": [3.0, 4.0, 5.0], "col3": ["a", "b", "c"], "col4": [3.0, 4.0, 5.0]}
df = pd.DataFrame(data=d)


# whylogs uses `Resolvers` to define how a column name or data type gets mapped to different metrics.
#
# We will need to create a custom Resolver class in order to customize it.
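# Before writing that custom resolver, it can be useful to log the dataframe with the default schema, so we have a baseline to compare against. This is an optional check (it reuses the `df` created above, and the exact set of output columns depends on your whylogs version):

# In[ ]:


import whylogs as why

default_result = why.log(df)
default_result.profile().view().to_pandas()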
# In[3]:


from whylogs.core.resolvers import Resolver
from whylogs.core.datatypes import DataType, Fractional, Integral
from typing import Dict, List
from whylogs.core.metrics import StandardMetric
from whylogs.core.metrics.metrics import Metric


class MyCustomResolver(Resolver):
    """Resolver that keeps distribution metrics for Fractional columns, frequent items for Integral columns, and counters and types metrics for all data types."""

    def resolve(self, name: str, why_type: DataType, column_schema) -> Dict[str, Metric]:
        # Every column gets counts and types metrics
        metrics: List[StandardMetric] = [StandardMetric.counts, StandardMetric.types]
        # Type-dependent metrics
        if isinstance(why_type, Fractional):
            metrics.append(StandardMetric.distribution)
        if isinstance(why_type, Integral):
            metrics.append(StandardMetric.frequent_items)

        result: Dict[str, Metric] = {}
        for m in metrics:
            result[m.name] = m.zero(column_schema)
        return result


# In the case above, the `name` parameter is not used, since the column name is not relevant for mapping the metrics; only the `why_type` matters.
#
# We initialize `metrics` with the `counts` and `types` namespaces regardless of the data type. Then, we check the whylogs data type in order to add the desired metric namespace (`distribution` for __Fractional__ columns and `frequent_items` for __Integral__ columns).

# Resolvers are passed to whylogs through a `DatasetSchema`, so we'll have to create a custom Schema as well.
#
# In this case, since we're only interested in the resolvers, we can create a custom schema as follows:

# In[4]:


from whylogs.core import DatasetSchema


# In[5]:


class MyCustomSchema(DatasetSchema):
    resolvers = MyCustomResolver()


# Now we can proceed with the usual process of logging a dataframe, remembering to pass our schema when making the `log` call:

# In[6]:


import whylogs as why

result = why.log(df, schema=MyCustomSchema())
prof = result.profile()
prof_view = prof.view()
pd.set_option("display.max_columns", None)
prof_view.to_pandas()


# Notice we have `counts` and `types` metrics for every column, `distribution` metrics only for `col2` and `col4` (floats), and `frequent_items` only for `col1` (ints).
#
# That's precisely what we wanted.
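# If you'd rather verify this programmatically than by scanning the dataframe above, you can inspect the profile view directly. This is a small optional sketch; it assumes the `get_column` and `get_metric` accessors of the whylogs v1 profile view, where `get_metric` returns `None` for a namespace that wasn't tracked:

# In[ ]:


for col in ["col1", "col2", "col3", "col4"]:
    col_view = prof_view.get_column(col)
    has_distribution = col_view.get_metric("distribution") is not None
    has_frequent_items = col_view.get_metric("frequent_items") is not None
    print(f"{col}: distribution={has_distribution}, frequent_items={has_frequent_items}")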
# ### Metrics per Column

# Now, suppose we want to specify the tracked metrics per specific column, rather than per data type.
#
# For example, we might want to track:
# - Count metrics for `col1`
# - Distribution metrics for `col2`
# - Cardinality for `col3`
# - Distribution metrics + Cardinality for `col4`
#
# The process is similar to the previous case. We only need to change the `if` clauses to check `name` instead of `why_type`, like this:

# In[7]:


from whylogs.core.resolvers import Resolver
from whylogs.core.datatypes import DataType, Fractional, Integral
from typing import Dict, List
from whylogs.core.metrics import StandardMetric
from whylogs.core.metrics.metrics import Metric


class MyCustomResolver(Resolver):
    """Resolver that keeps counts metrics for col1, distribution metrics for col2, cardinality for col3, and distribution + cardinality for col4."""

    def resolve(self, name: str, why_type: DataType, column_schema) -> Dict[str, Metric]:
        # No metrics are shared by all columns, so start with an empty list
        metrics: List[StandardMetric] = []
        if name == "col1":
            metrics.append(StandardMetric.counts)
        if name == "col2":
            metrics.append(StandardMetric.distribution)
        if name == "col3":
            metrics.append(StandardMetric.cardinality)
        if name == "col4":
            metrics.append(StandardMetric.distribution)
            metrics.append(StandardMetric.cardinality)

        result: Dict[str, Metric] = {}
        for m in metrics:
            result[m.name] = m.zero(column_schema)
        return result


# Since there are no metrics common to all columns, we initialize `metrics` as an empty list and then append the relevant metrics for each column.
#
# Now, we create a custom schema, just like before:

# In[8]:


class MyCustomSchema(DatasetSchema):
    resolvers = MyCustomResolver()


# In[9]:


import whylogs as why

df['col5'] = 0

result = why.log(df, schema=MyCustomSchema())
prof = result.profile()
prof_view = prof.view()
pd.set_option("display.max_columns", None)
prof_view.to_pandas()


# Note that existing columns that are not specified in your custom resolver won't have any metrics tracked. In the example above, we added a `col5` column, but since we didn't link any metrics to it, all of its metrics are `NaN`s.
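# If you do want unlisted columns to still get some baseline coverage, the resolver can fall back to a default set of metrics for any column name it doesn't recognize. The sketch below is one way to express that idea; the class names and the counts/types fallback are just illustrative choices, reusing the imports from the cells above:

# In[ ]:


class MyCustomResolverWithFallback(Resolver):
    """Per-column resolver that falls back to counts and types metrics for any column not explicitly listed."""

    def resolve(self, name: str, why_type: DataType, column_schema) -> Dict[str, Metric]:
        per_column = {
            "col1": [StandardMetric.counts],
            "col2": [StandardMetric.distribution],
            "col3": [StandardMetric.cardinality],
            "col4": [StandardMetric.distribution, StandardMetric.cardinality],
        }
        # Columns not listed above (such as col5) fall back to counts and types
        metrics = per_column.get(name, [StandardMetric.counts, StandardMetric.types])
        return {m.name: m.zero(column_schema) for m in metrics}


class MySchemaWithFallback(DatasetSchema):
    resolvers = MyCustomResolverWithFallback()


result = why.log(df, schema=MySchemaWithFallback())
result.profile().view().to_pandas()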