This notebook provides an example of how you can use whylogs to profile unstructured data such as images.
from PIL import Image
import numpy as np
import os
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
import seaborn as sns
# Open the sample image, render it inline, and report its dimensions.
# Everything that touches the pixel data stays inside the `with` block so the
# underlying file handle is still open while the image is being read.
with open("flower2.jpg", "rb") as img_f:
    img = Image.open(img_f)
    imshow(np.asarray(img))
    w, h = img.size
    total_num_pixels = w * h
    print(
        "withd :\t{}\nheight:\t{}\nnumber of pixels:{}".format(
            w, h, total_num_pixels
        )
    )
withd : 300 height: 225 number of pixels:67500
We can create a logger and use it to build a profile sketch of the image data.
from whylogs import get_or_create_session
# `_session` is cleared here but is not read by any code visible in this notebook.
_session=None
# Obtain a whylogs session and a logger; "image_dataset2" is the name passed to the logger.
session = get_or_create_session()
logger=session.logger("image_dataset2")
# Log the sample image by file path, then keep a handle on the logger's profile.
logger.log_image("flower2.jpg")
profile=logger.profile
You can obtain the histogram sketch of the image data features, e.g. Saturation below:
# The "hist" entry of the flat summary is indexed by feature name; print the
# histogram recorded for the "Saturation" feature.
imageProfiles = profile.flat_summary()["hist"]
print(imageProfiles["Saturation"])
{'bin_edges': [0.0, 8.50000085, 17.0000017, 25.500002549999998, 34.0000034, 42.500004249999996, 51.000005099999996, 59.500005949999995, 68.0000068, 76.50000764999999, 85.00000849999999, 93.50000935, 102.00001019999999, 110.50001104999998, 119.00001189999999, 127.50001275, 136.0000136, 144.50001444999998, 153.00001529999997, 161.50001615, 170.00001699999999, 178.50001784999998, 187.0000187, 195.50001955, 204.00002039999998, 212.50002124999997, 221.00002209999997, 229.50002295, 238.00002379999998, 246.50002464999997, 255.0000255], 'counts': [64, 512, 576, 128, 512, 664, 1024, 896, 1240, 2550, 4156, 5940, 5594, 6010, 4836, 4636, 3712, 4232, 2972, 3506, 2506, 2484, 1258, 1286, 884, 1002, 416, 624, 704, 2576]}
The profile also contains all the metadata collected from the image:
# List the names in the "column" field of the profile's summary table.
print(profile.flat_summary()["summary"]["column"].values)
['Orientation' 'Software' 'Y-Resolution' 'Model' 'BrightnessValue' 'Flash' 'ImageLength' 'PhotometricInterpretationSamplesPerPixel' 'ImageWidth' 'BitsPerSample' 'Saturation' 'Compression' 'ExposureTime' 'RowsPerStrip' 'ResolutionUnit' 'X-Resolution' 'Quality' 'Hue' 'Brightness']
One can also create custom functions to profile image-specific features. Of the two examples below, the first gets the average of the image pixels per column, while the second simply allows you to create a distribution sketch of the blue values. In addition, the ComposeTransforms function allows you to mix and match functions to create new features to monitor.
class AvgValue:
    """Callable feature transform that reduces its input to a single mean.

    The input is converted to a NumPy array, averaged over all of its
    elements, and returned as a ``(1, 1)`` array.
    """

    def __call__(self, x):
        pixels = np.array(x)
        return np.reshape(np.mean(pixels), (-1, 1))

    def __repr__(self):
        # The class name doubles as the printable name of the transform.
        return type(self).__name__
class MyBlue:
    """Callable feature transform that extracts the third band of an image.

    ``x.split()`` must yield exactly three bands; the last one is returned
    as a column vector of shape ``(n, 1)``.
    """

    def __call__(self, x):
        _red, _green, blue = x.split()
        blue_values = np.array(blue)
        return blue_values.reshape(-1, 1)

    def __repr__(self):
        # The class name doubles as the printable name of the transform.
        return type(self).__name__
from whylogs.features.transforms import ComposeTransforms, Brightness,Saturation
# Clear the session handles, then obtain a session for the custom-feature run.
_session = None
session = None
session = get_or_create_session()

# Log the same image again, this time with the custom transforms: each one on
# its own, plus MyBlue followed by AvgValue chained through ComposeTransforms.
logger2 = session.logger("image_dataset_custom_functions")
logger2.log_image(
    "flower2.jpg",
    feature_transforms=[
        AvgValue(),
        MyBlue(),
        ComposeTransforms([MyBlue(), AvgValue()]),
    ],
)
profile2 = logger2.profile
print(profile2.flat_summary()["summary"]["column"].values)
We can obtain the individual histograms for the features.
# Binary ufuncs built from Python's own min/max so they can operate on
# object-dtype arrays (e.g. via ``.accumulate`` for running extrema).
minnpf = np.frompyfunc(min, 2, 1)
maxnpf = np.frompyfunc(max, 2, 1)
def get_custom_histogram_info(profiles, variable, n_bins):
    """Build shared bin edges and per-profile PMFs for one profiled feature.

    Args:
        profiles: Non-empty sequence of dataset profiles. Each must provide
            ``flat_summary()["summary"]`` (a table with ``column``, ``min``
            and ``max`` fields) and
            ``columns[variable].number_tracker.histogram``.
        variable: Name of the feature (column) to inspect.
        n_bins: Divisor applied to the value range; the number of generated
            edges is ``int((max - min) / n_bins)``.

    Returns:
        A ``(bins, counts)`` tuple. ``bins`` is an array of evenly spaced
        edges running from the smallest minimum to the largest maximum seen
        across all profiles (both truncated to ``int``). ``counts`` holds,
        for each profile in order, the result of
        ``histogram.get_pmf(bins[:-1])``.

    Raises:
        ValueError: If ``profiles`` is empty.
    """
    summaries = [p.flat_summary()["summary"] for p in profiles]
    if not summaries:
        raise ValueError("profiles must contain at least one profile")

    def _stat(summary, field):
        # Value of `field` ("min" or "max") on the row describing `variable`.
        return summary[summary["column"] == variable][field].values[0]

    # Reduce to a single overall min/max with the builtins. The previous
    # implementation relied on the `np.object` / `np.int` aliases (removed in
    # NumPy 1.24) and on `ufunc.accumulate`, which yields one running value
    # per profile and therefore could not be converted with `int()` as soon
    # as more than one profile was supplied.
    lo = int(min(_stat(s, "min") for s in summaries))
    hi = int(max(_stat(s, "max") for s in summaries))

    bins = np.linspace(lo, hi, int((hi - lo) / n_bins))
    counts = [
        p.columns[variable].number_tracker.histogram.get_pmf(bins[:-1])
        for p in profiles
    ]
    return bins, counts
def plot_distribution_shift(profiles, variable, n_bins):
    """Overlay one weighted histogram per profile for a single feature.

    The shared edges and per-profile weights come from
    ``get_custom_histogram_info``; each profile is drawn on the same axes
    and labelled with its ``name``, then the figure is shown.
    """
    edges, weights_per_profile = get_custom_histogram_info(profiles, variable, n_bins)
    _, axis = plt.subplots(figsize=(10, 3))

    for current, weights in zip(profiles, weights_per_profile):
        sns.histplot(
            x=edges,
            weights=weights,
            bins=n_bins,
            label=current.name,
            alpha=0.7,
            ax=axis,
        )

    axis.legend()
    plt.show()
# Plot the custom "MyBlue" feature from profile2 and the "Saturation" feature
# from the first profile; 10 is passed as n_bins in both calls.
plot_distribution_shift([profile2],"MyBlue",10)
plot_distribution_shift([profile],"Saturation",10)