#!/usr/bin/env python
# coding: utf-8

# # Install basic requirements
#
# NOTE(fix): the original notebook export ran `pip install -U whylogs pandas`
# as a bare statement, which is a SyntaxError in a plain Python script.
# Install the dependencies from your shell instead:
#
#     pip install -U whylogs pandas

import datetime
import getpass
import os

import pandas as pd

import whylogs
from whylogs.app import Session
from whylogs.app.writers import WhyLabsWriter

# # Load example data batches
#
# The example data is prepared from our public S3 bucket. You can use your own
# data if you want if you have multiple batches of data.
pdfs = []
for i in range(1, 8):
    path = f"https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_{i}.csv"
    print(f"Loading data from {path}")
    df = pd.read_csv(path)
    pdfs.append(df)

# Quick sanity check on the first batch (in a script this value is discarded;
# it only renders when run interactively).
pdfs[0].describe()

# # Configure whylogs
#
# `whylogs`, by default, does not send statistics to WhyLabs.
#
# There are a few small steps you need to set up. If you haven't got the
# access key, please onboard with WhyLabs.
#
# **WhyLabs only requires whylogs API - your raw data never leaves your premise.**

# set your org-id here
print("Enter your WhyLabs Org ID")
os.environ["WHYLABS_DEFAULT_ORG_ID"] = input()

# set your API key here; getpass avoids echoing the secret to the terminal
print("Enter your WhyLabs API key")
os.environ["WHYLABS_API_KEY"] = getpass.getpass()
print("Using API Key ID: ", os.environ["WHYLABS_API_KEY"][0:10])

# ## Creating session
#
# Once the environments are set, let's create a whylogs session with a
# WhyLabs writer. Note that you can add your local writer or S3 writer if you
# want here. Check out the API docs for more information.

# create WhyLabs session
writer = WhyLabsWriter("", formats=[])
session = Session(project="demo-project", pipeline="demo-pipeline", writers=[writer])

# ## Logging to WhyLabs
#
# Ensure you have a **model ID** (also called **dataset ID**) before you start!
# ### Dataset Timestamp
# * To avoid confusion, it's recommended that you use UTC.
# * If you don't set the `dataset_timestamp` parameter, it'll default to UTC now.
# * WhyLabs supports real time visualization when the timestamp is **within the
#   last 7 days**. Anything older than that will be picked up when we run our
#   batch processing.
# * **If you log two profiles for the same day with different timestamps
#   (12:00 vs 12:01), they are merged to the same batch.**
#
# ### Logging Different Batches of Data
# * We'll give the profiles different **dates**.
# * A new logger is created per date; the logger needs to be closed (done here
#   by the `with` block) to flush out the data.

print("Enter your model ID from WhyLabs:")
model_id = input()

for offset, batch in enumerate(pdfs):
    # Walk backwards one day per batch: each dataset must map to a distinct
    # date to show up as a different batch in WhyLabs.
    batch_ts = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=offset)
    # A fresh logger per date; leaving the `with` block flushes the profile.
    with session.logger(tags={"datasetId": model_id}, dataset_timestamp=batch_ts) as batch_logger:
        print("Log data frame for ", batch_ts)
        batch_logger.log_dataframe(batch)

# Ensure everything is flushed
session.close()

# ## Voila
#
# * Now check the application to see if your **statistics** are in!!
# * Also, run the above cell again for the same model ID, do you see the
#   statistics changes in WhyLabs? Especially the counters?