pip install -U whylogs pandas
Defaulting to user installation because normal site-packages is not writeable WARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages) WARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages) Requirement already satisfied: whylogs in /home/jamie/.local/lib/python3.8/site-packages (0.6.25.dev0) Requirement already satisfied: pandas in /home/jamie/.local/lib/python3.8/site-packages (1.4.0) Requirement already satisfied: puremagic<2.0,>=1.10 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.11) Requirement already satisfied: jsonschema>=3.2.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (4.4.0) Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.8/dist-packages/python_dateutil-2.8.2-py3.8.egg (from whylogs) (2.8.2) Requirement already satisfied: boto3>=1.14.1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.18.9) Requirement already satisfied: botocore>=1.17.44 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.21.9) Requirement already satisfied: requests>=2.22.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (2.27.1) Requirement already satisfied: whylabs-client<0.2.0,>=0.1.1.dev0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (0.1.1.dev0) Requirement already satisfied: numpy>=1.18.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.22.0) Requirement already satisfied: scipy<2.0.0,>=1.5.4 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (1.6.3) Requirement already satisfied: whylabs-datasketches>=2.2.0b1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (2.2.0b1) Requirement already satisfied: smart-open>=4.1.2 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (4.1.2) Requirement already satisfied: matplotlib<4.0.0,>=3.0.3 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) 
(3.3.3) Requirement already satisfied: marshmallow>=3.7.1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (3.14.1) Requirement already satisfied: click>=7.1.2 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (7.1.2) Requirement already satisfied: tqdm<5.0.0,>=4.60.0 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (4.62.3) Requirement already satisfied: pyyaml>=5.3.1 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (6.0) Requirement already satisfied: protobuf>=3.15.5 in /home/jamie/.local/lib/python3.8/site-packages (from whylogs) (3.19.3) Requirement already satisfied: pytz>=2020.1 in /home/jamie/.local/lib/python3.8/site-packages (from pandas) (2021.3) Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/jamie/.local/lib/python3.8/site-packages (from boto3>=1.14.1->whylogs) (0.5.0) Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/jamie/.local/lib/python3.8/site-packages (from boto3>=1.14.1->whylogs) (0.10.0) Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/lib/python3/dist-packages (from botocore>=1.17.44->whylogs) (1.25.8) Requirement already satisfied: attrs>=17.4.0 in /home/jamie/.local/lib/python3.8/site-packages (from jsonschema>=3.2.0->whylogs) (21.2.0) Requirement already satisfied: importlib-resources>=1.4.0 in /home/jamie/.local/lib/python3.8/site-packages (from jsonschema>=3.2.0->whylogs) (5.4.0) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/lib/python3/dist-packages (from jsonschema>=3.2.0->whylogs) (0.15.5) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/lib/python3/dist-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (2.4.6) Requirement already satisfied: kiwisolver>=1.0.1 in /home/jamie/.local/lib/python3.8/site-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (1.3.1) Requirement already satisfied: cycler>=0.10 in /home/jamie/.local/lib/python3.8/site-packages (from 
matplotlib<4.0.0,>=3.0.3->whylogs) (0.10.0) Requirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib<4.0.0,>=3.0.3->whylogs) (7.0.0) Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->whylogs) (1.14.0) Requirement already satisfied: charset-normalizer~=2.0.0 in /home/jamie/.local/lib/python3.8/site-packages (from requests>=2.22.0->whylogs) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.22.0->whylogs) (2019.11.28) Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.22.0->whylogs) (2.8) Requirement already satisfied: zipp>=3.1.0 in /home/jamie/.local/lib/python3.8/site-packages (from importlib-resources>=1.4.0->jsonschema>=3.2.0->whylogs) (3.7.0) WARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages) WARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages) WARNING: Ignoring invalid distribution -andas (/home/jamie/.local/lib/python3.8/site-packages) Note: you may need to restart the kernel to use updated packages.
import whylogs
import pandas as pd
The example data is prepared from our public S3 bucket. You can use your own data instead, as long as you have multiple batches of data.
# Download the demo batches (numbered 1 through 7) and collect the resulting
# DataFrames, in batch order, in `pdfs`.
pdfs = []
for batch_number in range(1, 8):
    batch_url = f"https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_{batch_number}.csv"
    print(f"Loading data from {batch_url}")
    pdfs.append(pd.read_csv(batch_url))
Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_1.csv Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_2.csv Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_3.csv Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_4.csv Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_5.csv Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_6.csv Loading data from https://whylabs-public.s3.us-west-2.amazonaws.com/demo_batches/input_batch_7.csv
# Summary statistics for the first batch, as a quick look at the data.
first_batch = pdfs[0]
first_batch.describe()
Unnamed: 0 | id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | int_rate | installment | annual_inc | desc | ... | hardship_loan_status | orig_projected_additional_accrued_interest | hardship_payoff_balance_amount | hardship_last_payment_amount | debt_settlement_flag_date | settlement_status | settlement_date | settlement_amount | settlement_percentage | settlement_term | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 407.000000 | 4.070000e+02 | 0.0 | 407.000000 | 407.000000 | 407.000000 | 407.000000 | 407.000000 | 407.000000 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
mean | 12548.717445 | 1.158631e+08 | NaN | 14203.746929 | 14203.746929 | 14202.948403 | 13.514054 | 418.020344 | 78818.956069 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
std | 125.354772 | 1.207642e+06 | NaN | 9351.142374 | 9351.142374 | 9350.997874 | 5.446881 | 271.096531 | 55864.939403 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
min | 12325.000000 | 1.121538e+08 | NaN | 1000.000000 | 1000.000000 | 1000.000000 | 5.320000 | 34.220000 | 0.000000 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25% | 12442.500000 | 1.150769e+08 | NaN | 7000.000000 | 7000.000000 | 7000.000000 | 9.930000 | 235.580000 | 43325.000000 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
50% | 12550.000000 | 1.157004e+08 | NaN | 12000.000000 | 12000.000000 | 12000.000000 | 12.620000 | 357.250000 | 63300.000000 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
75% | 12653.500000 | 1.168245e+08 | NaN | 20000.000000 | 20000.000000 | 20000.000000 | 16.020000 | 553.515000 | 95000.000000 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
max | 12862.000000 | 1.181592e+08 | NaN | 40000.000000 | 40000.000000 | 40000.000000 | 30.990000 | 1417.710000 | 495000.000000 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 rows × 126 columns
whylogs, by default, does not send statistics to WhyLabs.
There are a few small steps you need to set up. If you haven't got the access key, please onboard with WhyLabs here: https://hub.whylabsapp.com.
WhyLabs only requires the whylogs API - your raw data never leaves your premises.
from whylogs.app import Session
from whylogs.app.writers import WhyLabsWriter
import os
import datetime
import getpass
# The WhyLabs credentials are stored in environment variables for the steps
# that follow.

# Organization ID: read from standard input.
print("Enter your WhyLabs Org ID")
org_id_entered = input()
os.environ["WHYLABS_DEFAULT_ORG_ID"] = org_id_entered

# API key: read with getpass so the key is not echoed while it is typed.
print("Enter your WhyLabs API key")
api_key_entered = getpass.getpass()
os.environ["WHYLABS_API_KEY"] = api_key_entered

# Print only the first 10 characters of the key as a confirmation.
print("Using API Key ID: ", api_key_entered[:10])
Once the environment variables are set, let's create a whylogs session with a WhyLabs writer.
Note that you can add your local writer or S3 writer if you want here. Check out the API docs for more information.
# Create a whylogs session whose only writer is the WhyLabs writer.
writer = WhyLabsWriter()
session_writers = [writer]
session = Session(writers=session_writers)
Ensure you have a model ID (also called dataset ID) before you start!
If you don't specify the dataset_timestamp parameter, it'll default to UTC now.
The session must be closed to flush out the data.
print("Enter your model ID from WhyLabs:")
# Model (dataset) ID typed by the user; it is attached as the "datasetId" tag
# to every profile logged below.
model_id = input()
reference_profile = None

# Take "now" once so that every batch is offset from the same instant.
# Re-reading the clock on each iteration lets the anchor drift, and a run that
# crosses midnight UTC could place two batches on the same calendar day.
run_started_at = datetime.datetime.now(tz=datetime.timezone.utc)

for i, df in enumerate(pdfs):
    # Walk backwards one day per batch: each dataset has to map to a different
    # date to show up as a separate batch in WhyLabs.
    dt = run_started_at - datetime.timedelta(days=i)
    # Create a new logger for this date.
    with session.logger(tags={"datasetId": model_id}, dataset_timestamp=dt) as ylog:
        print("Log data frame for ", dt)
        ylog.log_dataframe(df)
        # Keep a reference to the first profile to use as a baseline for monitoring.
        if i == 0:
            reference_profile = ylog.profile

# Ensure everything is flushed
session.close()
We still have a reference to the first profile. For this demo, we will use this dataframe's profile and upload it as a reference profile for monitoring on WhyLabs.
# Alias for the reference profile. It can be renamed; this is the name that
# shows up when choosing a baseline on the monitoring settings page of Whylabs.
reference_profile_alias = "demo-reference-profile"

# Display a summary of the reference profile.
profile_summary = reference_profile.to_summary()
profile_summary
The reference profile can be uploaded using a whylabs_client directly. First, we need to reference the profile as a file on disk, so write it out.
import tempfile
# Write the reference profile out as a protobuf file inside a fresh temporary
# directory, so that it can be uploaded from disk.
tmp_dir = tempfile.mkdtemp()
profile_filename = "reference-profile.bin"
profile_path = os.path.join(tmp_dir, profile_filename)
reference_profile.write_protobuf(profile_path)
print(
    f"Reference profile written to temporary file in preparation to upload to Whylabs as a reference profile: {profile_path}"
)
The whylabs_client will construct a request to upload this as a reference profile, using the org-id, model-id and api-key entered above.
import requests
import whylabs_client
from whylabs_client.api.log_api import LogApi
from whylabs_client.model.log_reference_request import LogReferenceRequest
# Set up the inputs required to upload the reference profile to Whylabs using
# the whylabs_client.
whylabs_api_endpoint = "https://api.whylabsapp.com"
api_key = os.environ["WHYLABS_API_KEY"]
# Timeout (seconds) for the upload request. Without one, requests waits
# indefinitely if the server stops responding.
upload_timeout_seconds = 60
print(f"Using API key ID: {api_key[:10]} and endpoint {whylabs_api_endpoint}")

config = whylabs_client.Configuration(host=whylabs_api_endpoint, api_key={"ApiKeyAuth": api_key}, discard_unknown_keys=True)
api_log_client = whylabs_client.ApiClient(config)
log_api = LogApi(api_log_client)

# Prefer the IDs recorded in the profile's tags; fall back to the environment.
org_id = reference_profile.tags.get("orgId", os.environ.get("WHYLABS_DEFAULT_ORG_ID"))
dataset_id = reference_profile.tags.get("datasetId", os.environ.get("WHYLABS_DEFAULT_DATASET_ID"))
# The request takes the dataset timestamp as integer milliseconds.
dataset_timestamp = int(reference_profile.dataset_timestamp.timestamp() * 1000)
alias = reference_profile_alias

try:
    with open(profile_path, "rb") as f:
        # Step 1: ask the API for a URL to upload the reference profile to.
        request = LogReferenceRequest(dataset_timestamp=dataset_timestamp, alias=alias)
        print(f"Making initial call to log_reference to get upload url for {alias} and in [{org_id}] for [{dataset_id}] using request: {request}")
        async_result = log_api.log_reference(org_id=org_id, model_id=dataset_id, log_reference_request=request, async_req=True)
        result = async_result.get()
        upload_url = result["upload_url"]
        # Only a prefix of the upload URL is printed.
        print(f"got async_result from log_reference, upload url is: {upload_url[:140]}")

        # Step 2: PUT the serialized profile to the returned URL.
        print("About to upload reference profile...")
        http_response = requests.put(upload_url, data=f.read(), timeout=upload_timeout_seconds)
        if http_response.ok:
            print(f"Done uploading reference profile with alias: {alias} to: {upload_url[:140]} with API token ID: {api_key[:10]}")
        else:
            print(
                f"Failed to upload reference profile with alias: {alias} to: {upload_url[:140]} with API token ID: {api_key[:10]} to "
                + f"{whylabs_api_endpoint}: unexpected HTTP status {http_response.status_code} {http_response.reason}"
            )
except Exception as e:
    # Demo-level handling: report the failure (including a timeout) instead of
    # raising out of the notebook cell.
    print(f"Failed to upload reference profile: {e}.")
from IPython.display import display, Markdown
# Render a pointer to the monitor settings page for the dataset used above.
url = f"https://hub.whylabsapp.com/models/{dataset_id}/monitor-settings"
markdown_text = f"url here: {url}"
content = Markdown(markdown_text)
display(content)