#!/usr/bin/env python
# coding: utf-8

# >### 🚩 *Create a free WhyLabs account to get more value out of whylogs!*
# >*Did you know you can store, visualize, and monitor whylogs profiles with the [WhyLabs Observability Platform](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Condition_Validators)? Sign up for a [free WhyLabs account](https://whylabs.ai/whylogs-free-signup?utm_source=whylogs-Github&utm_medium=whylogs-example&utm_campaign=Condition_Validators) to leverage the power of whylogs and WhyLabs together!*

# # Real-time Data Validation with Condition Validators

# [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/whylabs/whylogs/blob/mainline/python/examples/advanced/Condition_Validators.ipynb)

# With __Condition Validators__, the user is able to evaluate conditions on individual values in real-time scenarios. These checks are done while data is being logged, and can trigger one or multiple actions when these conditions fail to be met. With __Condition Validators__, you are able to define actions where an immediate response is required, such as emitting an alert to key stakeholders, logging specific failures or throwing exceptions. Validators are designed with flexibility in mind, so you are free to customize your actions as well as the conditions that trigger those actions.
#
# In this example, we will cover how to:
#
# - Define conditions for data validations
# - Define actions to be triggered when conditions fail
# - Assemble a Condition Validator with defined conditions and actions
# - Bind validators to Columns
# - Apply validators to your data
# - Debug failed conditions
#
# Showing the different types of conditions is NOT the focus of this example. If you wish to see the different types of conditions you can define, please refer to [Condition Count Metrics](https://whylogs.readthedocs.io/en/stable/examples/advanced/Condition_Count_Metrics.html).
#
# Unlike metrics, validators will not log properties into profiles.
# They are meant only to evaluate conditions and trigger actions while logging is under way.
#
#

# ## Installing whylogs and importing modules

# In[1]:


# Note: you may need to restart the kernel to use updated packages.
# NOTE(review): `get_ipython()` is only available when this runs under
# IPython/Jupyter; as a plain script, install whylogs beforehand instead.
get_ipython().run_line_magic('pip', 'install whylogs')


# ## Use-case: Validating email and credit card columns

# In this simple scenario, we want to make sure two things happen:
#
# - Single emails in the `emails` column (nothing else)
# - No credit card information in the `transcriptions` column
#
# We'll use the following sample dataframe to evaluate on:

# In[2]:


import pandas as pd

text_data = {
    "emails": [
        "my email is my_email_1989@gmail.com",
        "invalidEmail@xyz.toolong",
        "this.is.ok@hotmail.com",
        "not an email",
    ],
    "transcriptions": [
        "Bob's credit card number is 4000000000000",
        "Alice's credit card is XXXXXXXXXXXXX",
        "Hi, my name is Bob",
        "Hi, I'm Alice",
    ],
}

df = pd.DataFrame(data=text_data)


# ## Defining the Conditions

# Let's translate the mentioned conditions into regex expressions:
# - A negation of a credit card pattern matching ("No Credit Cards")
# - A Full Match for an email pattern matching:

# > Our conditions are usually expected to evaluate to `True`. When something goes wrong, the condition should evaluate to `False`, triggering a certain action in the process. This is why we negate the first condition (because matching the pattern is bad) and do a match for the second one (because not finding an email is bad).

# In[3]:


from whylogs.core.relations import Not, Predicate

X = Predicate()

# Condition name -> relation; the condition holds when NO credit-card-like
# number is matched in the value.
credit_card_conditions = {"noCreditCard": Not(X.matches(".*4[0-9]{12}(?:[0-9]{3})?"))}

# Raw string: `\w` and `\.` are not valid Python string escapes, so a plain
# literal triggers an invalid-escape warning. The pattern text is unchanged.
email_conditions = {"hasEmail": X.fullmatch(r"[\w.]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}")}


# > Note: The regex expressions are for demonstration purposes only. These expressions are not general - there will be emails and credit cards whose patterns will not be met by the expression.
# ## Defining the actions

# The action to be triggered when a condition fails is created by simply defining a regular function.
#
# We should just remember to define the arguments: `validator_name`, `condition_name` and `value`. You can use these values to help with logging and debugging the failures.

# In[4]:


from typing import Any


def do_something_important(validator_name: str, condition_name: str, value: Any) -> None:
    """Report a failed condition by printing it.

    Used as an entry in a validator's ``actions`` list below.

    Args:
        validator_name: Name of the validator whose condition failed.
        condition_name: Name of the condition that failed.
        value: The value that failed the condition.
    """
    print(f"Validator: {validator_name}\n Condition name {condition_name} failed for value {value}")


# ## Creating the Validators

# To create a Condition Validator, we need a name, a set of conditions, and a list of actions.
#
# Let's make a Validator for the credit card column and another Validator for the email column. Each validator has a single condition to be evaluated, and also a single action.
#
# > Note that for a single validator, we could have multiple conditions defined and also multiple actions to be triggered.

# In[5]:


from whylogs.core.validators import ConditionValidator


credit_card_validator = ConditionValidator(
    name="no_credit_cards",
    conditions=credit_card_conditions,
    actions=[do_something_important],
)

email_validator = ConditionValidator(
    name="has_emails",
    conditions=email_conditions,
    actions=[do_something_important],
)


# ## Bind the Validators to specific Columns

# Each validator instance should be mapped to a single column, but each column can have multiple validators attached to it.
#
# Assigning an instance to multiple columns will lead to undefined behavior.
#
# In our case, we have only one validator for each of the columns:

# In[6]:


# Column name -> list of validators applied to that column.
validators = {
    "emails": [email_validator],
    "transcriptions": [credit_card_validator],
}


# ## Apply Validators to Data

# Now, we only need to pass our set of validators to our DatasetSchema.
#
# This causes the validators to be applied while data is being logged. The actions will be triggered immediately when the conditions fail, and not only when the logging is done.
# In[7]:


from whylogs.core.schema import DatasetSchema
import whylogs as why

# Logging with a schema that carries the validators runs them during logging.
schema = DatasetSchema(validators=validators)
profile = why.log(df, schema=schema).profile()


# We can see in the results above that our `has_emails` validator failed three times. The first time, the value has extra text, the second has an invalid email address and the third does not contain an email.
#
# The `no_credit_cards` validator failed once, because the pattern was found once.

# We can also access a simple summary with the total number of evaluations, the number of total failures and the number of failures per condition present in the validator:

# In[8]:


# Bare expression: shown as cell output in a notebook; prints nothing when run as a script.
email_validator.to_summary_dict()


# In[9]:


credit_card_validator.to_summary_dict()


# ## Debugging Failed Conditions

# The validator retains contextual information about the data that failed the conditions. You can access it by using the `get_samples` method of the validator.

# In[10]:


email_validator.get_samples()


# Note that the samples are stored in the validator instance, but they are not logged into the profile.

# By default, the `ConditionValidator` will sample 10 rows that failed the condition by using a [Reservoir Sampler](https://en.wikipedia.org/wiki/Reservoir_sampling). You can change this by setting the `validator_sample_size` in the `ConditionValidatorConfig`.
#
# If you want, you can also assign an `identity_column` to the validator. You can use the identity column for two purposes:
# - Make the identity row visible to your action by specifying it as a fourth argument in your action function. That way, your callable will also receive the value of the identity column for the row that failed the condition.
# - Sample the ids of the rows that failed the condition.
#
# Let's see how this works. First, let's create a dataframe again.
# This time, we have a column that contains the ids for each row:

# In[11]:


import pandas as pd

text_data = {
    "emails": [
        "my email is my_email_1989@gmail.com",
        "invalidEmail@xyz.toolong",
        "this.is.ok@hotmail.com",
        "not an email",
    ],
    "ids": [
        "id_0",
        "id_1",
        "id_2",
        "id_3",
    ],
}

df = pd.DataFrame(data=text_data)


# We will only use the email validator for this example.
#
# Notice that now we are defining a column that contains our ids. We want to access those values in both our actions and in our sampling.
#
# Let's define the validator again, but now with an identity column.
#
# In the following block, there are three main differences:
#
# - we're specifying a fourth argument in our action function. This argument will receive the value of the identity column for the row that failed the condition.
# - we're setting `enable_sampling=True` when instantiating the validator. This is by default True, but we're setting it explicitly for demonstration purposes. If you set this to `False`, the validator won't sample the failed rows.
# - we're setting our sample size to 2. It's a small value, but we're picking a value that is smaller than the actual number of failed conditions. This way, we make it clear that they are indeed samples, and it's not the complete set of failures.

# In[12]:


from typing import Any
from whylogs.core.validators import ConditionValidator


def do_something_important(validator_name: str, condition_name: str, value: Any, row_id: Any = None) -> None:
    """Report a failed condition, including the id of the offending row.

    Redefines the earlier three-argument action with an optional fourth
    parameter.

    Args:
        validator_name: Name of the validator whose condition failed.
        condition_name: Name of the condition that failed.
        value: The value that failed the condition.
        row_id: Value of the identity column for the failing row; stays
            ``None`` when no id is supplied.
    """
    print(
        f"Validator: {validator_name}\n Condition name {condition_name} "
        f"failed for value {value} and row id {row_id}"
    )


email_validator = ConditionValidator(
    name="has_emails",
    conditions=email_conditions,
    actions=[do_something_important],
    enable_sampling=True,
    sample_size=2,  # deliberately smaller than the number of failing rows
)

validators = {
    "emails": [email_validator],
}


# Now, we need to let whylogs know which column is our identity column.
# We do this by setting the `identity_column` in our `MetricConfig`:

# In[13]:


from whylogs.core.schema import DatasetSchema
import whylogs as why
from whylogs.core.metrics import MetricConfig

# "ids" is the dataframe column holding the row identifiers.
condition_count_config = MetricConfig(identity_column="ids")

schema = DatasetSchema(validators=validators, default_configs=condition_count_config)
profile = why.log(df, schema=schema).profile()

# Samples are read from the validator instance after logging.
samples = email_validator.get_samples()
print(f"Samples of failed rows: {samples}")