#!/usr/bin/env python
# coding: utf-8

# # Detecting Dataset Drift with whylogs
#
# We will be using data from Kaggle (https://www.kaggle.com/yugagrawal95/sample-media-spends-data) that is packaged with this notebook.

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import datetime
import math

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from whylogs import get_or_create_session


# In[2]:


# Read our Media Spend dataset as a Pandas dataframe
data = pd.read_csv("MediaSpendDataset.csv", parse_dates=["Calendar_Week"], infer_datetime_format=True)


# In[3]:


data


# As we can see, the dataset contains weekly advertising spend, media impressions, and views for a number of marketing campaigns run by some unknown company, along with the sales generated against those spends.
#
# ## Exploratory Data Analysis
#
# Let's now explore the dataset; we have very little metadata or context.

# In[4]:


data.groupby("Calendar_Week").count().T


# In[5]:


data.groupby("Division").count().T


# We see that the *Z* division has twice as many entries as the other divisions.

# In[6]:


fig, ax = plt.subplots(figsize=(10, 3))
sns.lineplot(x="Calendar_Week", y="Sales", data=data, ax=ax)


# In[7]:


fig, ax = plt.subplots(figsize=(10, 3))
sns.scatterplot(x="Google_Impressions", y="Sales", data=data, ax=ax)


# Let's compare the data from the first month to the last month, which happens to capture differences in transactions prior to and during the COVID-19 global pandemic.

# ## Profiling with whylogs

# In[8]:


model_date = datetime.datetime(2020, 1, 1)
training_data = data[data["Calendar_Week"] < model_date]
test_data = data[data["Calendar_Week"] >= model_date]


# In[9]:


session = get_or_create_session()


# In[10]:


profiles = []
profiles.append(session.log_dataframe(training_data, dataset_timestamp=model_date))
profiles.append(session.log_dataframe(test_data, dataset_timestamp=datetime.datetime.now()))


# In[11]:


profiles


# We can compare the data we'll use for training with the data from early 2020.

# In[12]:


# Training data profile summary
training_summary = profiles[0].flat_summary()["summary"]
training_summary


# In[13]:


# Test data profile summary
test_summary = profiles[1].flat_summary()["summary"]
test_summary


# ## Dataset Drift in whylogs Data
#
# We need to understand how the data changes between the training and test datasets. To do so, let's first view one of the many objects in the dataset profile provided by whylogs: a histogram for each tracked feature. We can then inspect the **Overall_Views** feature.

# In[14]:


training_histograms = profiles[0].flat_summary()["hist"]
test_histograms = profiles[1].flat_summary()["hist"]
test_histograms["Overall_Views"]
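# The histograms in the flat summary come with bin edges chosen by whylogs. For custom bins we can go one level deeper: each column's number tracker wraps a streaming histogram sketch whose `get_pmf` method estimates the probability mass between any split points we supply. Here is a minimal sketch of that call on its own; the split-point values below are arbitrary and chosen only for illustration.

# In[ ]:


# Estimated share of the Overall_Views mass below, between, and above
# three arbitrary split points (get_pmf returns len(splits) + 1 masses)
splits = [100000, 1000000, 10000000]
profiles[0].columns["Overall_Views"].number_tracker.histogram.get_pmf(splits)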
# While we plan to integrate a convenient dataset shift visualization and analysis API soon, you can always access the attributes you need directly.
#
# We will first define a custom range and bins, then use our access to the data sketches' probability mass function. We then visualize these values using Seaborn.

# In[15]:


def get_custom_histogram_info(variable, n_bins):
    # Span the full range observed across both profiles
    min_range = min(training_summary[training_summary["column"] == variable]["min"].values[0],
                    test_summary[test_summary["column"] == variable]["min"].values[0])
    max_range = max(training_summary[training_summary["column"] == variable]["max"].values[0],
                    test_summary[test_summary["column"] == variable]["max"].values[0])
    bins = range(int(min_range), int(max_range), int((max_range - min_range) / n_bins))
    training_counts = np.array(
        profiles[0].columns[variable].number_tracker.histogram.get_pmf(bins[:-1]))
    test_counts = np.array(
        profiles[1].columns[variable].number_tracker.histogram.get_pmf(bins[:-1]))
    return bins, training_counts, test_counts


def plot_distribution_shift(variable, n_bins):
    """Visualize the distribution shift between training and test data."""
    bins, training_counts, test_counts = get_custom_histogram_info(variable, n_bins)
    fig, ax = plt.subplots(figsize=(10, 3))
    sns.histplot(x=bins, weights=training_counts, bins=n_bins,
                 label="Training data", color="teal", alpha=0.7, ax=ax)
    sns.histplot(x=bins, weights=test_counts, bins=n_bins,
                 label="Test data", color="gold", alpha=0.7, ax=ax)
    ax.legend()
    plt.show()


# In[16]:


plot_distribution_shift("Overall_Views", n_bins=60)


# While it is quite clear that the distribution in this case differs between the training and test datasets, we will often need a quantitative measure. You can use whylogs histogram metrics to calculate dataset shift with a number of metrics: the Population Stability Index (PSI), the Kolmogorov-Smirnov statistic, the Kullback-Leibler divergence (or other f-divergences), and histogram intersection.
#
# ## Kullback-Leibler divergence
#
# This score, often shortened to K-L divergence, is a measure of how one probability distribution differs from a second, reference probability distribution. For discrete distributions it is defined as D(P || Q) = Σ_x P(x) log(P(x) / Q(x)), and it can be interpreted as the average difference in the number of bits required to encode samples of one distribution (*P*) using a code optimized for another (*Q*) rather than one optimized for *P*. The K-L divergence is not a true statistical metric of spread, as it is not symmetric and does not satisfy the triangle inequality.
#
# However, this value has become quite popular and is easy to calculate in Python. We'll use the implementation in `scipy`: given two distributions, `scipy.stats.entropy` returns the K-L divergence between them.

# In[17]:


from scipy.stats import entropy


def calculate_kl_divergence(variable, n_bins):
    _, training_counts, test_counts = get_custom_histogram_info(variable, n_bins)
    # entropy(p, q) computes D(p || q); a small epsilon keeps empty
    # test bins from producing an infinite divergence
    return entropy(training_counts + 1e-10, test_counts + 1e-10)


# In[18]:


calculate_kl_divergence("Overall_Views", n_bins=60)


# ## Histogram intersection metric
#
# Our second metric is the histogram intersection score, an intuitive metric that measures the area of overlap between two probability distributions. A histogram intersection score of 0.0 represents no overlap, while a score of 1.0 represents identical distributions. This score requires discretized probability distributions and depends heavily on the choice of bin size and scale.

# In[19]:


def calculate_histogram_intersection(variable, n_bins):
    _, training_counts, test_counts = get_custom_histogram_info(variable, n_bins)
    # The rounded bin step can produce slightly more than n_bins bins,
    # so take the overlap across the counts actually returned
    return np.minimum(training_counts, test_counts).sum()


# In[20]:


calculate_histogram_intersection("Overall_Views", n_bins=60)


# In[21]:


calculate_histogram_intersection("Sales", n_bins=60)
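# The same binned counts can drive the other metrics listed above. Below is a minimal sketch of the Population Stability Index, reusing `get_custom_histogram_info`; the function `calculate_psi` and the `eps` guard against taking the log of zero are our additions, not part of whylogs.

# In[ ]:


def calculate_psi(variable, n_bins, eps=1e-6):
    """Population Stability Index between the training and test distributions."""
    _, training_counts, test_counts = get_custom_histogram_info(variable, n_bins)
    p = training_counts + eps  # expected (training) proportions
    q = test_counts + eps      # actual (test) proportions
    return float(np.sum((q - p) * np.log(q / p)))


# A common heuristic reads a PSI above roughly 0.25 as a significant shift
calculate_psi("Overall_Views", n_bins=60)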