This notebook shows how to generate reports on incremental datasets
The incremental data will either have a proper time-axis, or will be batches of data without a specific time-axis.
The histograms of these datasets will be stitched together, and we generate a (consistent) report on the stitched dataset.
Note that we always generate the report on the full stitched histograms, because algorithms like trend detection and comparison with reference histograms rely on having the historical histograms in place.
Install popmon (if not installed yet)
import sys
!"{sys.executable}" -m pip install -q popmon
import pandas as pd
import popmon
from popmon import get_bin_specs, resources, stability_report, stitch_histograms
# load the example dataset shipped with popmon; the "date" column is parsed as datetimes
csv_path = resources.data("test.csv.gz")
df = pd.read_csv(csv_path, parse_dates=["date"])
Add month column, so we can make data batches per month.
def to_month(x):
    """Return a string label that is unique per calendar month.

    The label is the month count ``12 * year + month`` of the given
    date-like value, rendered as a string.
    """
    timestamp = pd.to_datetime(x)
    month_count = timestamp.year * 12 + timestamp.month
    return str(month_count)
# label every row with its month, then collect the distinct month labels
df["month"] = df["date"].map(to_month)
months = df["month"].unique()
# every feature is histogrammed against the "date" time-axis
features = ["date:" + column for column in ("isActive", "eyeColor", "latitude", "age")]

# weekly time bins; weeks start on a Monday
hists = df.pm_make_histograms(
    time_axis="date",
    time_width="1w",
    time_offset="2015-1-5",
    features=features,
)

# keep the bin specifications, so every monthly batch below is binned the same way
bin_specs = get_bin_specs(hists)
# one set of histograms per month; every batch reuses the same (weekly) binning specifications
hists_list = [
    df[df["month"] == month].pm_make_histograms(features=features, bin_specs=bin_specs)
    for month in months
]

# add up all the monthly histogram sets along the "date" time-axis
hists2 = stitch_histograms(hists_list=hists_list, time_axis="date", mode="add")
# the two sets of histograms have consistent binning
# NOTE(review): these are bare expressions; whether each one is displayed depends on
# the notebook cell boundaries, which are not visible here.
hists
hists2
# stability report on the histograms that were made directly from the full dataframe;
# the result is kept in `rep` so it can be shown again further below
rep = popmon.stability_report(hists)
rep
# stability report on the histograms stitched together from the monthly batches
popmon.stability_report(hists2)
Scan through the two reports above and you see that the outputs are identical!
# Now let's assume we already have a set of stitched histograms,
# and we want to stitch another new batch onto it.
# hists_basis: the set of existing histograms
# hists_delta: the new set of histograms (here: the last monthly batch)
hists_basis, hists_delta = hists, hists_list[-1]

# By default, the stitcher will recognize the existing time-axis "date" in both histogram sets.
# Remember that binning along the "date" time-axis is in weeks.
# When adding hists_delta, one can either "add" histograms to existing weeks,
# or "replace" existing weeks. The default is to add them:
hists4 = stitch_histograms(
    mode="add",
    hists_basis=hists_basis,
    hists_delta=hists_delta,
)

# ... or "replace" histograms found in existing weeks with those in hists_delta:
hists4 = stitch_histograms(
    mode="replace",
    hists_basis=hists_basis,
    hists_delta=hists_delta,
)
Next, we ignore the date information when creating the histograms. Here every batch of data corresponds to one week, although a batch can of course be any subset of the data.
# same features as before, but without the "date" time-axis
features = [
    "isActive",
    "eyeColor",
    "latitude",
    "age",
]

# reuse the earlier bin specifications for every weekly batch below, skipping the date axis
bin_specs = get_bin_specs(hists, skip_first_axis=True)
def to_week(x):
    """Return an integer index that is unique per ISO week and increases with time.

    The index is ``53 * iso_year + iso_week``, with both parts taken from the
    ISO calendar of the given date-like value.

    The earlier ``52 * date.year + date.week`` combined the *calendar* year
    with the *ISO* week number. That went wrong around New Year: ISO week 53
    of one year got the same index as week 1 of the next year, and the first
    days of January that still belong to the previous ISO year were placed a
    full year too late. Index values therefore differ from that formula.

    Missing dates (NaT) yield NaN, as before.
    """
    date = pd.to_datetime(x)
    if pd.isna(date):
        # NaT has no ISO calendar; keep returning NaN for missing dates
        return float("nan")
    iso_year, iso_week, _ = date.isocalendar()
    # an ISO year has at most 53 weeks, so a multiplier of 53 keeps indices unique
    return 53 * iso_year + iso_week
# label every row with its week index, then collect the distinct indices as a plain list
df["week"] = df["date"].map(to_week)
weeks = df["week"].unique().tolist()
# one set of histograms per week; every batch reuses the same binning specifications
hists_list = [
    df[df["week"] == week].pm_make_histograms(features=features, bin_specs=bin_specs)
    for week in weeks
]

# None of these histograms has a time-axis, so the stitching creates one (called "batch").
# Each batch of histograms is inserted at its own time-bin index, given by `weeks`.
hists3 = stitch_histograms(
    time_axis="batch",
    time_bin_idx=weeks,
    hists_list=hists_list,
)
# stability report on the stitched weekly batches, which use the artificial "batch" time-axis
popmon.stability_report(hists3)
# the earlier report `rep` (built on the histograms with the "date" time-axis), for comparison
rep
# again the two reports are identical, except that the first one uses the batch-id as artificial time-axis.
# Now let's assume we already have a set of stitched histograms (hists3),
# and we want to stitch another new batch onto it.
# hists_basis: the set of existing histograms
# hists_delta: the new set of histograms (here: the last weekly batch)
hists_basis, hists_delta = hists3, hists_list[-1]

# By default, the stitcher will insert the batch right after the last batch found.
hists4 = stitch_histograms(
    time_axis="batch",
    hists_basis=hists_basis,
    hists_delta=hists_delta,
)

# One can also insert the new batch at a chosen new or existing time-bin index:
hists4 = stitch_histograms(
    time_axis="batch",
    time_bin_idx=200000,
    hists_basis=hists_basis,
    hists_delta=hists_delta,
)
# When inserting at an existing time-bin index, one can either "add" to that index
# or "replace" the existing histograms. The default setting is to "add" the histograms.
# The index is taken from `weeks` (the indices used to build the stitched basis) instead of
# being hard-coded, so it exists in hists_basis whatever week encoding or dataset is used.
existing_idx = weeks[-1]
mode = "add"  # "replace"
hists4 = popmon.stitch_histograms(
    hists_basis=hists_basis,
    hists_delta=hists_delta,
    time_axis="batch",
    time_bin_idx=existing_idx,
    mode=mode,
)
%%script false --no-raise-error
# Optional: the histograms can be stored as JSON and read back in.
# NOTE(review): the `%%script false` cell magic presumably keeps this cell from running — confirm.
import json
from popmon.hist.histogram import dumper

# store; `dumper` is handed to json as the `default` hook for objects it cannot serialize itself
with open('histograms.json', 'w') as json_out:
    json.dump(hists, json_out, default=dumper)

# and load again
with open('histograms.json') as json_in:
    hists = json.load(json_in)