#!/usr/bin/env python
# coding: utf-8

# # Install basic requirements
#
# NOTE(fix): the original notebook export ran `pip install -U whylogs pandas`
# as a bare statement, which is a SyntaxError in a plain Python script.
# Install the dependencies from your shell instead:
#
#     pip install -U whylogs pandas

import datetime
import getpass
import os

import pandas as pd

import whylogs
from whylogs.app import Session
from whylogs.app.writers import WhyLabsWriter

# # Load example data batches
#
# The example data is prepared from our public S3 bucket. You can use your own
# data if you want if you have multiple batches of data.
pdfs = []
for i in range(1, 8):
    path = f"https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_{i}.csv"
    print(f"Loading data from {path}")
    df = pd.read_csv(path)
    pdfs.append(df)

# Quick sanity check on the first batch (in a script this value is discarded;
# it only renders when run interactively).
pdfs[0].describe()

# # Configure whylogs
#
# `whylogs`, by default, does not send statistics to WhyLabs.
#
# There are a few small steps you need to set up. If you haven't got the
# access key, please onboard with WhyLabs.
#
# **WhyLabs only requires whylogs API - your raw data never leaves your premise.**

# set your org-id here
print("Enter your WhyLabs Org ID")
os.environ["WHYLABS_DEFAULT_ORG_ID"] = input()

# set your API key here; getpass avoids echoing the secret to the terminal
print("Enter your WhyLabs API key")
os.environ["WHYLABS_API_KEY"] = getpass.getpass()
print("Using API Key ID: ", os.environ["WHYLABS_API_KEY"][0:10])

# ## Creating session
#
# Once the environments are set, let's create a whylogs session with a
# WhyLabs writer. Note that you can add your local writer or S3 writer if you
# want here. Check out the API docs for more information.

# create WhyLabs session
writer = WhyLabsWriter("", formats=[])
session = Session(project="demo-project", pipeline="demo-pipeline", writers=[writer])

# ## Logging to WhyLabs
#
# Ensure you have a **model ID** (also called **dataset ID**) before you start!
# ### Dataset Timestamp
# * To avoid confusion, it's recommended that you use UTC.
# * If you don't set the `dataset_timestamp` parameter, it'll default to UTC now.
# * WhyLabs supports real time visualization when the timestamp is **within the
#   last 7 days**. Anything older than that will be picked up when we run our
#   batch processing.
# * **If you log two profiles for the same day with different timestamps
#   (12:00 vs 12:01), they are merged to the same batch.**
#
# ### Logging Different Batches of Data
# * We'll give the profiles different **dates**.
# * A new logger is created per date; the logger needs to be closed (done here
#   by the `with` block) to flush out the data.

print("Enter your model ID from WhyLabs:")
model_id = input()

for offset, batch in enumerate(pdfs):
    # Walk backwards one day per batch: each dataset must map to a distinct
    # date to show up as a different batch in WhyLabs.
    batch_ts = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(days=offset)
    # A fresh logger per date; leaving the `with` block flushes the profile.
    with session.logger(tags={"datasetId": model_id}, dataset_timestamp=batch_ts) as batch_logger:
        print("Log data frame for ", batch_ts)
        batch_logger.log_dataframe(batch)

# Ensure everything is flushed
session.close()

# ## Voila
#
# * Now check the application to see if your **statistics** are in!!
# * Also, run the above cell again for the same model ID, do you see the
#   statistics changes in WhyLabs? Especially the counters?