#!/usr/bin/env python # coding: utf-8 # >### 🚩 *Create a free WhyLabs account to get more value out of whylogs!*
# >*Did you know you can store, visualize, and monitor whylogs profiles with the [WhyLabs Observability Platform](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Condition_Count_Metrics)? Sign up for a [free WhyLabs account](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Condition_Count_Metrics) to leverage the power of whylogs and WhyLabs together!* # # Condition Count Metrics # [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs/blob/mainline/python/examples/advanced/Condition_Count_Metrics.ipynb) # By default, whylogs tracks several metrics, such as type counts, distribution metrics, cardinality and frequent items. Those are general metrics that are useful for a lot of use cases, but often we need metrics tailored for our application. # # __Condition Count Metrics__ gives you the flexibility to define your own customized metrics. It will return the results as counters, which is the number of times the condition was met for a given column. With it, you can define conditions such as regex matching for strings, equalities or inequalities for numerical features, and even define your own function to check for any given condition. # # In this example, we will cover: # # 1. Create metrics for __regex matching__ # - Examples: contains email/credit card number (String features) # 2. Create metrics for __(in)equalities__ # - Examples: equal, less, greater, less than, greater than (Numerical features) # 3. Combining metrics with __logic operators__ (and, or, not) # - Examples: Between range, outside of range, not equal (Numerical features) # 4. Creating metrics with __custom functions__ # - Examples: is even number, is text preprocessed (Any type) # 5. Going Further: Combining this example with other whylogs' features # 6. 
# (APPENDIX) Complete code snippets - The complete code snippets (to make it easier to copy and paste)

# ## Installing whylogs and importing modules

# In[1]:

# Note: you may need to restart the kernel to use updated packages.
get_ipython().run_line_magic('pip', 'install whylogs')

# Let's import all the dependencies for this example upfront:

# In[2]:

import pandas as pd
from typing import Any

import whylogs as why
from whylogs.core.resolvers import STANDARD_RESOLVER
from whylogs.core.specialized_resolvers import ConditionCountMetricSpec
from whylogs.core.datatypes import Fractional, Integral
from whylogs.core.metrics.condition_count_metric import Condition
from whylogs.core.relations import Not, Predicate
from whylogs.core.schema import DeclarativeSchema

# ## 1. Regex Matching

# Suppose we have textual columns in our data in which we want to make sure certain elements are present / not present.
#
# For example, for privacy and security issues, we might be interested in tracking the number of times a credit card number appears on a given column, or if we have sensitive email information in another column.
#
# With whylogs, we can define metrics that will count the number of occurrences a certain regex pattern is met for a given column.

# ### Creating sample dataframe

# Let's create a simple dataframe.
#
# In this scenario, the `emails` column should have only a valid email, nothing else. As for the `transcriptions` column, we want to make sure any existing credit card number was properly masked or removed.

# In[3]:

data = {
    "emails": ["my email is my_email_1989@gmail.com","invalidEmail@xyz.toolong","this.is.ok@hotmail.com","not an email"],
    "transcriptions": ["Bob's credit card number is 4000000000000", "Alice's credit card is XXXXXXXXXXXXX", "Hi, my name is Bob", "Hi, I'm Alice"],
}
df = pd.DataFrame(data=data)

# The conditions are defined through a whylogs' `Condition` object. There are several different ways of assembling a condition.
# In the following example, we will define two different regex patterns, one for each column. Since we can define multiple conditions for a single column, we'll assemble the conditions into dictionaries, where the key is the condition name. Each dictionary will be later attached to the relevant column.

# In[4]:

# Raw strings (r"...") keep the backslashes literal, avoiding Python's
# invalid-escape-sequence SyntaxWarning for sequences such as \w and \.
# (the pattern's value is unchanged).
emails_conditions = {
    "containsEmail": Condition(Predicate().fullmatch(r"[\w.]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}")),
}
transcriptions_conditions = {
    "containsCreditCard": Condition(Predicate().matches(".*4[0-9]{12}(?:[0-9]{3})?")),
}

# whylogs must be aware of those conditions while profiling the data. We can do that by creating a Standard Schema, and then simply adding the conditions to the schema with `add_resolver_spec`. That way, we can pass our enhanced schema when calling `why.log()` later.

# In[5]:

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="emails", metrics=[ConditionCountMetricSpec(emails_conditions)])
schema.add_resolver_spec(column_name="transcriptions", metrics=[ConditionCountMetricSpec(transcriptions_conditions)])

# > Note: The regex expressions are for demonstration purposes only. These expressions are not general - there will be emails and credit cards whose patterns will not be met by the expression.

# Now, we only need to pass our schema when logging our data. Let's also take a look at the metrics, to make sure everything was tracked correctly:

# In[6]:

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/containsEmail', 'condition_count/containsCreditCard', 'condition_count/total']]

# Let's check the numbers:
#
# For `emails` feature, only one occurrence was met for `containsEmail`. That is expected, because the only valid row is the third one ("this.is.ok@hotmail.com"). Others either don't contain an email, are invalid emails or have extra text that are not an email (note we're using `fullmatch` as the predicate for the email condition).
# #
# For `transcriptions` column, we also have only one match. That is well, since only the first row has a match with the given pattern, and others either don't have a credit card number or are properly "hidden". Note that in this case we want to check for the pattern inside a broader text, so we're using `.*` before the pattern, so the text doesn't have to start with the pattern (whylogs' `Predicate.matches` uses python's `re.compile().match()` under the hood.)
#
# The available relations for regex matching are the ones used in this example:
#
# - `matches`
# - `fullmatch`

# ## 2. Numerical Equalities and Inequalities

# For this one, let's create integer and floats columns:

# In[7]:

data = {
    "ints_column": [1,12,42,4],
    "floats_column": [1.2, 12.3, 42.2, 4.8]
}
df = pd.DataFrame(data=data)

# As before, we will create our set of conditions for each column and pass both to our schema:

# In[9]:

ints_conditions = {
    "equals42": Condition(Predicate().equals(42)),
    "lessthan5": Condition(Predicate().less_than(5)),
    "morethan40": Condition(Predicate().greater_than(40)),
}
floats_conditions = {
    "equals42.2": Condition(Predicate().equals(42.2)),
    "lessthan5": Condition(Predicate().less_than(5)),
    "morethan40": Condition(Predicate().greater_than(40)),
}

# Attach the conditions by column *type* this time, rather than by name.
schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_type=Integral, metrics=[ConditionCountMetricSpec(ints_conditions)])
schema.add_resolver_spec(column_type=Fractional, metrics=[ConditionCountMetricSpec(floats_conditions)])

# Let's log and check the metrics:

# In[10]:

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['types/fractional','types/integral','condition_count/lessthan5', 'condition_count/morethan40','condition_count/equals42','condition_count/equals42.2', 'condition_count/total']]

# We can simply check the original data to verify that the metrics are correct.
# We used `equals`, `less_than` and `greater_than` in this example, but here's the complete list of available relations:
#
# - `equals` - equal to
# - `less_than` - less than
# - `less_or_equals` - less than or equal to
# - `greater_than` - greater than
# - `greater_or_equals` - greater than or equal to
# - `not_equal` - not equal to

# ## 3. Combining metrics with logical operators - AND, OR, NOT

# You can also combine relations with logical operators such as __AND__, __OR__ and __NOT__.
#
# Let's stick with the numerical features to show how you can combine relations to assemble conditions such as:
#
# - Value is between a certain range
# - Value is outside a certain range
# - Value is NOT a certain number

# In[11]:

conditions = {
    "between10and50": Condition(Predicate().greater_than(10).and_(Predicate().less_than(50))),
    "outside10and50": Condition(Predicate().less_than(10).or_(Predicate().greater_than(50))),
    "not_42": Condition(Not(Predicate().equals(42))),  # could also use X.not_equal(42) or X.not_.equals(42)
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="ints_column", metrics=[ConditionCountMetricSpec(conditions)])
schema.add_resolver_spec(column_name="floats_column", metrics=[ConditionCountMetricSpec(conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/between10and50', 'condition_count/outside10and50', 'condition_count/not_42', 'condition_count/total']]

# Available logical operators are:
# - `and_`
# - `or_`
# - `not_`
# - `Not`
#
# Note that `and_`, `or_`, and `not_` are methods called on a `Predicate` and passed another `Predicate`, while `Not` is a function that takes a single `Predicate` argument.

# Even though we showed these operators with numerical features, this also works with regex matching conditions shown previously.

# ## 4.
# Custom Condition with User-defined functions

# If none of the previous conditions are suited to your use case, you are free to define your own custom function to create your own metrics.
#
# Let's see a simple example: suppose we want to check if a certain number is even.
#
# We can define an `even` predicate function, as simple as:

# In[12]:

def even(x: Any) -> bool:
    """Return True if `x` is an even number."""
    return x % 2 == 0

# And then we proceed as usual, defining our condition and adding it to the schema:
#
# We only have to pass the name of the function to `conditions` as a `Condition` object, like below:

# In[13]:

conditions = {
    "isEven": Condition(Predicate().is_(even)),
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="ints_column", metrics=[ConditionCountMetricSpec(conditions)])
schema.add_resolver_spec(column_name="floats_column", metrics=[ConditionCountMetricSpec(conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/isEven', 'condition_count/total']]

# ### NLP example

# For user-defined functions, the sky's the limit for what you can do.
#
# Let's think of another simple scenario for NLP. Suppose our model assumes text to be a certain way.
# Maybe it was trained and expects:
#
# - lowercased characters
# - no digits
# - no stopwords

# Let's check these conditions for the data below:

# In[14]:

data = {
    "transcriptions": ["I AM BOB AND I LIKE TO SCREAM","i am bob","am alice and am xx years old","am bob and am 42 years old"],
    "ints": [0,1,2,3],
}
df = pd.DataFrame(data=data)

# Once again, let's define our function:

# In[15]:

def preprocessed(x: Any) -> bool:
    """Return True if `x` is a lowercase, digit-free string containing no stopwords."""
    stopwords = ["i", "me", "myself"]
    if not isinstance(x, str):
        return False
    # should have only lowercase letters and space (no digits)
    if not all(c.islower() or c.isspace() for c in x):
        return False
    # should not contain any words in our stopwords list
    if any(c in stopwords for c in x.split()):
        return False
    return True

# > Since this is an example, our `stopwords` list is only a placeholder for the real thing.

# The rest is the same as before:

# In[16]:

conditions = {
    "isPreprocessed": Condition(Predicate().is_(preprocessed)),
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="transcriptions", metrics=[ConditionCountMetricSpec(conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/isPreprocessed', 'condition_count/total']]

# For the `transcriptions` feature, we can see that only the third row is properly preprocessed ("am alice and am xx years old"). The first one contained uppercase characters, the second contained a stopword ("i") and the last one contained digits. Note that the `isPreprocessed` condition is attached only to the `transcriptions` column, so the `ints` column is not checked at all (and non-string values would fail the `isinstance` check anyway).

# ## 5. Going Further

# You can combine this example with other whylogs' features to cover even more scenarios.
#
# Here are some pointers for some possible use cases:
#
# - I want to track other types of metrics, not only Condition Counts!
#     - Check the [Schema Configuration](https://whylogs.readthedocs.io/en/stable/examples/basic/Schema_Configuration.html) example!
# - Ok, I have counters. What now?
# - You can set constraints (such as `containsCreditCardNumber` should always be 0). Check the [Metric Constraints with Condition Count Metrics](https://whylogs.readthedocs.io/en/stable/examples/advanced/Metric_Constraints_with_Condition_Count_Metrics.html) example!
# - You can store it locally or on S3 for future inspection - Check the [Writing Profiles](https://whylogs.readthedocs.io/en/stable/examples/integrations/writers/Writing_Profiles.html) example!
# - You can send your profiles to a monitoring platform, such as WhyLabs - Check the [Writing Profiles to WhyLabs](https://whylogs.readthedocs.io/en/stable/examples/integrations/writers/Writing_to_WhyLabs.html) example!

# ## Appendix - Complete Code Snippets

# Here are the complete code snippets - just to make it easier to copy/paste!

# ### Regex example

# In[17]:

import pandas as pd
import whylogs as why
from whylogs.core.resolvers import STANDARD_RESOLVER
from whylogs.core.specialized_resolvers import ConditionCountMetricSpec
from whylogs.core.metrics.condition_count_metric import Condition
from whylogs.core.relations import Predicate
from whylogs.core.schema import DeclarativeSchema

data = {
    "emails": ["my email is my_email_1989@gmail.com","invalidEmail@xyz.toolong","this.is.ok@hotmail.com","not an email"],
    "transcriptions": ["Bob's credit card number is 4000000000000", "Alice's credit card is XXXXXXXXXXXXX", "Hi, my name is Bob", "Hi, I'm Alice"],
}
df = pd.DataFrame(data=data)

# Raw strings avoid invalid-escape-sequence warnings for \w and \. — the
# pattern value is unchanged.
emails_conditions = {
    "containsEmail": Condition(Predicate().fullmatch(r"[\w.]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}")),
}
transcriptions_conditions = {
    "containsCreditCard": Condition(Predicate().matches(".*4[0-9]{12}(?:[0-9]{3})?")),
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="emails", metrics=[ConditionCountMetricSpec(emails_conditions)])
schema.add_resolver_spec(column_name="transcriptions", metrics=[ConditionCountMetricSpec(transcriptions_conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/containsEmail', 'condition_count/containsCreditCard', 'condition_count/total']]

# ### Equalities Example

# In[18]:

import pandas as pd
import whylogs as why
from whylogs.core.resolvers import STANDARD_RESOLVER
from whylogs.core.specialized_resolvers import ConditionCountMetricSpec
from whylogs.core.datatypes import Fractional, Integral
from whylogs.core.metrics.condition_count_metric import Condition
from whylogs.core.relations import Predicate
from whylogs.core.schema import DeclarativeSchema

data = {
    "ints_column": [1,12,42,4],
    "floats_column": [1.2, 12.3, 42.2, 4.8]
}
df = pd.DataFrame(data=data)

ints_conditions = {
    "equals42": Condition(Predicate().equals(42)),
    "lessthan5": Condition(Predicate().less_than(5)),
    "morethan40": Condition(Predicate().greater_than(40)),
}
floats_conditions = {
    "equals42.2": Condition(Predicate().equals(42.2)),
    "lessthan5": Condition(Predicate().less_than(5)),
    "morethan40": Condition(Predicate().greater_than(40)),
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_type=Integral, metrics=[ConditionCountMetricSpec(ints_conditions)])
schema.add_resolver_spec(column_type=Fractional, metrics=[ConditionCountMetricSpec(floats_conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['types/fractional','types/integral','condition_count/lessthan5', 'condition_count/morethan40','condition_count/equals42','condition_count/equals42.2', 'condition_count/total']]

# ### Logical Operators Example

# In[19]:

import pandas as pd
import whylogs as why
from whylogs.core.resolvers import STANDARD_RESOLVER
from whylogs.core.specialized_resolvers import ConditionCountMetricSpec
from whylogs.core.metrics.condition_count_metric import Condition
from whylogs.core.relations import Not, Predicate
from whylogs.core.schema import DeclarativeSchema

data = {
    "ints_column": [1,12,42,4],
    "floats_column": [1.2, 12.3, 42.2, 4.8]
}
df = pd.DataFrame(data=data)

conditions = {
    "between10and50": Condition(Predicate().greater_than(10).and_(Predicate().less_than(50))),
    "outside10and50": Condition(Predicate().less_than(10).or_(Predicate().greater_than(50))),
    "not_42": Condition(Not(Predicate().equals(42))),  # could also use X.not_equal(42) or X.not_.equals(42)
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="ints_column", metrics=[ConditionCountMetricSpec(conditions)])
schema.add_resolver_spec(column_name="floats_column", metrics=[ConditionCountMetricSpec(conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/between10and50', 'condition_count/outside10and50', 'condition_count/not_42', 'condition_count/total']]

# ### User-defined function - even

# In[20]:

import pandas as pd
from typing import Any
import whylogs as why
from whylogs.core.resolvers import STANDARD_RESOLVER
from whylogs.core.specialized_resolvers import ConditionCountMetricSpec
from whylogs.core.metrics.condition_count_metric import Condition
from whylogs.core.relations import Predicate
from whylogs.core.schema import DeclarativeSchema

def even(x: Any) -> bool:
    """Return True if `x` is an even number."""
    return x % 2 == 0

def preprocessed(x: Any) -> bool:
    """Return True if `x` is a lowercase, digit-free string containing no stopwords."""
    stopwords = ["i", "me", "myself"]
    if not isinstance(x, str):
        return False
    # should have only lowercase letters and space (no digits)
    if not all(c.islower() or c.isspace() for c in x):
        return False
    # should not contain any words in our stopwords list
    if any(c in stopwords for c in x.split()):
        return False
    return True

data = {
    "transcriptions": ["I AM BOB AND I LIKE TO SCREAM","i am bob","am alice and am xx years old","am bob and am 42 years old"],
    "ints_column": [1,12,42,4],
    "floats_column": [1.2, 12.3, 42.2, 4.8]
}
df = pd.DataFrame(data=data)

transcriptions_conditions = {
    "isPreprocessed": Condition(Predicate().is_(preprocessed)),
}
numerical_conditions = {
    "isEven": Condition(Predicate().is_(even)),
}

schema = DeclarativeSchema(STANDARD_RESOLVER)
schema.add_resolver_spec(column_name="ints_column", metrics=[ConditionCountMetricSpec(numerical_conditions)])
schema.add_resolver_spec(column_name="floats_column", metrics=[ConditionCountMetricSpec(numerical_conditions)])
schema.add_resolver_spec(column_name="transcriptions", metrics=[ConditionCountMetricSpec(transcriptions_conditions)])

prof_view = why.log(df, schema=schema).profile().view()
prof_view.to_pandas()[['condition_count/isPreprocessed','condition_count/isEven', 'condition_count/total']]