Metrics like Hit Rate provide only a partial picture of the expected behavior of recommender systems (RSs) in the wild: two models with very similar accuracy can have very different behavior on, say, the long tail, or model A can be better than model B overall, but at the expense of providing disastrous performance on a set of inputs that are particularly important in production. Metrics such as coverage, serendipity, and bias have therefore been proposed to capture other aspects of the behavior of RSs, but they still fall short of what is needed to debug RSs in production, or to provide any guarantee that a model will be reliable when released.
Dataset - This tutorial shows how easy it is to run behavioral tests on a target dataset, in this case a wrapper around a large e-commerce dataset (the Coveo Data Challenge dataset).
Use case - Complementary items. We are targeting a "complementary items" use case, such as a cart recommender: if a shopper added item X to the cart, what is she likely to add next?
Model - We train a simple yet effective prod2vec baseline model, re-using for convenience a "training embedding" function. The Word2vec algorithm is used to learn embeddings over sequences of product SKUs.
Sample workflow for behavioral tests. Starting with shopping data (left), the dataset split and model training mimic the usual training loop. We also create a latent space, which is used to measure the relationships between inputs, ground truths and predictions, such as how far misses are from ground truths. Since a session can be viewed as a sequence of items or features (brands), we can use the same method to create embeddings for different tests.
Inputs:
RecModel
RecDataset
RecTask
RecTest
RecList
Outputs:
!pip install gensim==4.0.1
!pip install jinja2==3.0.2
!pip install algoliasearch==2.6.0
!pip install appdirs==1.4.4
!pip install wget==3.2
!pip install pytest==6.2.5
!pip install requests==2.22.0
!pip install tqdm==4.62.3
!pip install matplotlib==3.4.3
!pip install numpy==1.21.2
!pip install pathos==0.2.8
!pip install flask==2.0.2
!pip install networkx==2.6.3
!pip install python-Levenshtein==0.12.2
import gensim
from abc import ABC, abstractmethod
import ast
from datetime import datetime
import inspect
import os
from abc import ABC, abstractmethod
from functools import wraps
from pathlib import Path
import time
import json
import tempfile
import zipfile
import random
import requests
from tqdm import tqdm
from enum import Enum
from typing import List
import itertools
import numpy as np
import collections
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
from statistics import mean
import json
import networkx as nx
from networkx.algorithms.shortest_paths.generic import shortest_path
from collections import Counter, defaultdict
import math
# Location of the zipped Coveo dataset archive; CoveoDataset.load downloads it from here
COVEO_INTERACTION_DATASET_S3_URL = 'https://reclist-datasets-6d3c836d-6djh887d.s3.us-west-2.amazonaws.com/coveo_sigir.zip'
def download_with_progress(url, destination):
    """
    Downloads a file with a progress bar

    The payload is streamed in 4096-byte chunks instead of being loaded in memory at
    once. Both the HTTP response and the output file are handled by context managers,
    so they are closed deterministically even if the transfer fails midway.

    :param url: url from which to download from
    :param destination: file path for saving data
    :raises SystemExit: if the request fails or the server answers with an error status
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    # the progress bar label is the last path component of the url; the total comes
    # from the content-length header (0, i.e. unknown, when the header is absent)
    with response, open(destination, "wb") as raw_file:
        with tqdm.wrapattr(raw_file, "write",
                           miniters=1, desc=url.split('/')[-1],
                           total=int(response.headers.get('content-length', 0))) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)
def get_cache_directory():
    """
    Return the folder used to cache downloaded datasets, creating it (and any
    missing parent) when it does not exist yet
    """
    cache_dir = '/content/coveo_reclist'
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir
class Dataset(Enum):
    """
    Identifiers of the datasets known to this module
    """
    COVEO = 'coveo'
    COVEO_INTERNAL = 'coveo-internal'
class RecDataset(ABC):
    """
    Base class for datasets: holds the train / test splits and the product catalog

    Subclasses implement `load`, which is invoked once at the end of construction
    and is expected to fill the private slots exposed by the read-only properties.
    """

    def __init__(self, force_download=False):
        """
        :param force_download: allows to force the download of the dataset in case it is needed.
        :type: force_download: bool, optional
        """
        # stored before load() runs, so that loaders can consult the flag
        self.force_download = force_download
        # every slot starts empty; load() is responsible for filling them
        self._x_train = self._y_train = None
        self._x_test = self._y_test = None
        self._catalog = None
        self.load()

    @abstractmethod
    def load(self):
        """
        Abstract method that should implement dataset loading
        @return:
        """
        return None

    @property
    def x_train(self):
        """Training inputs, as set by load()"""
        return self._x_train

    @property
    def y_train(self):
        """Training targets, as set by load()"""
        return self._y_train

    @property
    def x_test(self):
        """Test inputs, as set by load()"""
        return self._x_test

    @property
    def y_test(self):
        """Test targets, as set by load()"""
        return self._y_test

    @property
    def catalog(self):
        """Product catalog, as set by load()"""
        return self._catalog
class CoveoDataset(RecDataset):
    """
    Coveo SIGIR data challenge dataset
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def load(self):
        """
        Fetch the dataset archive if needed and populate the splits and the catalog

        The archive is cached in the folder returned by get_cache_directory() and is
        downloaded only when it is missing or when force_download is set.
        `dataset.json` is parsed straight from the archive: unpacking the whole zip
        into a temporary folder first would only add a full extra copy on disk.
        """
        cache_directory = get_cache_directory()
        filename = os.path.join(cache_directory, "coveo_sigir.zip")  # TODO: make var somewhere
        if not os.path.exists(filename) or self.force_download:
            download_with_progress(COVEO_INTERACTION_DATASET_S3_URL, filename)

        with zipfile.ZipFile(filename, 'r') as zip_ref:
            # json.load accepts the binary stream returned by ZipFile.open
            with zip_ref.open('dataset.json') as f:
                data = json.load(f)

        self._x_train = data["x_train"]
        # no training targets are read from the file
        self._y_train = None
        self._x_test = data["x_test"]
        self._y_test = data["y_test"]
        self._catalog = data["catalog"]
def train_embeddings(sessions: list,
                     min_c: int = 3,
                     size: int = 48,
                     window: int = 5,
                     iterations: int = 15,
                     ns_exponent: float = 0.75,
                     is_debug: bool = True):
    """
    Fit a gensim Word2Vec model over sessions to obtain product embeddings, with
    sensible defaults (https://arxiv.org/abs/2007.14906).

    Only the options listed below are passed explicitly; everything else is left to
    gensim's own defaults.

    :param sessions: list of lists, as user sessions are list of interactions
    :param min_c: minimum frequency of an event for it to be calculated for product embeddings
    :param size: output dimension
    :param window: window parameter for gensim word2vec
    :param iterations: number of training iterations
    :param ns_exponent: ns_exponent parameter for gensim word2vec
    :param is_debug: if true, be more verbose when training
    :return: the keyed vectors (`.wv`) of the trained model
    """
    w2v_options = dict(min_count=min_c,
                       vector_size=size,
                       window=window,
                       epochs=iterations,
                       ns_exponent=ns_exponent)
    keyed_vectors = gensim.models.Word2Vec(sentences=sessions, **w2v_options).wv
    if is_debug:
        print("# products in the space: {}".format(len(keyed_vectors.index_to_key)))
    return keyed_vectors
class RecModel(ABC):
    """
    Abstract class for recommendation model
    """

    def __init__(self, model=None):
        """
        :param model: a model that can be used in the predict function
        """
        self._model = model

    @abstractmethod
    def predict(self, prediction_input: list, *args, **kwargs):
        """
        The predict function should implement the behaviour of the model at inference time.

        :param prediction_input: the input that is used to to do the prediction
        :param args:
        :param kwargs:
        :return:
        :raises NotImplementedError: when a subclass delegates here without providing
            its own implementation
        """
        # raising (rather than returning the exception class) makes a missing
        # implementation fail loudly instead of handing back a truthy object
        raise NotImplementedError

    @property
    def model(self):
        """The wrapped model object given at construction time"""
        return self._model
class P2VRecModel(RecModel):
    """
    Implement of the prod2vec model through the standard RecModel interface.

    Since init is ok, we just need to overwrite the prediction methods to get predictions
    out of it.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    model_name = "prod2vec"

    def predict(self, prediction_input: list, *args, **kwargs):
        """
        Implement the abstract method, accepting a list of lists, each list being
        the content of a cart: the predictions returned by the model are the top K
        items suggested to complete the cart.

        :param prediction_input: list of carts, each a list of dicts with a 'product_sku' key
        :return: one list per cart with up to 10 dicts of the form {'product_sku': sku};
            the list is empty when the cart is empty or its first product is unknown
            to the model
        """
        predictions = []
        for _x in prediction_input:
            # an empty cart offers no product to anchor the lookup on
            if not _x:
                predictions.append([])
                continue
            # we assume here that every X is a list of one-element, the product already in the cart
            # i.e. our prediction_input list is [[sku_1], [sku_3], ...]
            key_item = _x[0]['product_sku']
            nn_products = self._model.most_similar(key_item, topn=10) if key_item in self._model else None
            if nn_products:
                # each neighbour comes with extra info after the key; only the key is kept
                predictions.append([{'product_sku': neighbour[0]} for neighbour in nn_products])
            else:
                predictions.append([])
        return predictions

    def get_vector(self, product_sku):
        """
        Return the embedding of a product as a plain list

        :param product_sku: product identifier to look up
        :return: the vector as a list, or an empty list when the lookup fails
        """
        try:
            return list(self._model.get_vector(product_sku))
        except Exception:
            # deliberate best-effort: an empty (falsy) result stands for "no embedding"
            return []
def statistics(x_train, y_train, x_test, y_test, y_pred):
    """
    Summarise the size of the splits and how many predictions are non-empty

    :param x_train: training inputs (only their number is used)
    :param y_train: unused
    :param x_test: test inputs (only their number is used)
    :param y_test: unused
    :param y_pred: predictions; truthy entries are counted as non-null
    :return: dict with the three counters
    """
    non_empty_predictions = sum(1 for prediction in y_pred if prediction)
    return {
        'training_set__size': len(x_train),
        'test_set_size': len(x_test),
        'num_non_null_predictions': non_empty_predictions,
    }
def sample_hits_at_k(y_preds, y_test, x_test=None, k=3, size=3):
    """
    Collect the cases whose first label appears in the top-k predictions

    :param y_preds: predictions, as lists of lists
    :param y_test: target data, as lists of lists; only the first element is used
    :param x_test: optional inputs; when given, the first element of each is reported
    :param k: top-k
    :param size: number of hits to sample at random; -1 returns all of them
    :return: list of dicts with keys 'Y_TEST', 'Y_PRED' and, optionally, 'X_TEST'
    """
    hits = []
    for position, (prediction, target) in enumerate(zip(y_preds, y_test)):
        label = target[0]
        top_k = prediction[:k]
        if label not in top_k:
            continue
        record = {'Y_TEST': [label], 'Y_PRED': top_k}
        if x_test:
            record['X_TEST'] = [x_test[position][0]]
        hits.append(record)
    # nothing to sample when everything is requested or too few hits are available
    if size == -1 or len(hits) < size:
        return hits
    return random.sample(hits, k=size)
def sample_misses_at_k(y_preds, y_test, x_test=None, k=3, size=3):
    """
    Collect the cases whose first label is absent from the top-k predictions

    :param y_preds: predictions, as lists of lists
    :param y_test: target data, as lists of lists; only the first element is used
    :param x_test: optional inputs; when given, the first element of each is reported
    :param k: top-k
    :param size: number of misses to sample at random; -1 returns all of them
    :return: list of dicts with keys 'Y_TEST', 'Y_PRED' and, optionally, 'X_TEST'
    """
    misses = []
    for position, (prediction, target) in enumerate(zip(y_preds, y_test)):
        label = target[0]
        top_k = prediction[:k]
        if label in top_k:
            continue
        record = {'Y_TEST': [label], 'Y_PRED': top_k}
        if x_test:
            record['X_TEST'] = [x_test[position][0]]
        misses.append(record)
    # nothing to sample when everything is requested or too few misses are available
    if size == -1 or len(misses) < size:
        return misses
    return random.sample(misses, k=size)
def hit_rate_at_k(y_preds, y_test, k=3):
    """
    Compute the share of test cases whose first label is among the top-k predictions

    :param y_preds: predictions, as lists of lists
    :param y_test: target data, as lists of lists; only the first element is used
    :param k: top-k
    :return: hits divided by the number of test cases; 0.0 when there are no test cases
    """
    num_cases = len(y_test)
    # an empty evaluation set (e.g. a data slice matching nothing) has no hits by definition
    if num_cases == 0:
        return 0.0
    hits = sum(1 for _p, _y in zip(y_preds, y_test) if _y[0] in _p[:k])
    return hits / num_cases
def mrr_at_k(y_preds, y_test, k=3):
    """
    Computes MRR

    Only the first k predictions are inspected, consistently with hit_rate_at_k: a
    label found at position k+1 or later contributes a reciprocal rank of 0.

    :param y_preds: predictions, as lists of lists
    :param y_test: target data, as lists of lists (eventually [[sku1], [sku2],...]
    :param k: top-k
    :return: mean reciprocal rank over the test cases; 0.0 when there are none
    """
    rr = []
    for _p, _y in zip(y_preds, y_test):
        target = _y[0]
        top_k = _p[:k]
        if target in top_k:
            # ranks are 1-based
            rr.append(1 / (top_k.index(target) + 1))
        else:
            rr.append(0)
    if not rr:
        return 0.0
    return np.mean(rr)
def coverage_at_k(y_preds, product_data, k=3):
    """
    Compute the share of catalog products that show up in the top-k of any prediction

    :param y_preds: predictions, as lists of lists of skus
    :param product_data: catalog, keyed by sku
    :param k: top-k, applied to every single prediction list
    :return: number of distinct recommended catalog skus over the catalog size;
        0.0 when the catalog is empty
    """
    all_skus = set(product_data.keys())
    if not all_skus:
        return 0.0
    # truncate each prediction to its top-k, rather than keeping only the first k sessions
    pred_skus = set(itertools.chain.from_iterable(p[:k] for p in y_preds))
    nb_overlap_skus = len(pred_skus.intersection(all_skus))
    return nb_overlap_skus / len(all_skus)
def popularity_bias_at_k(y_preds, x_train, k=3):
    """
    Compute the average popularity of the top-k recommended items

    Popularity of an item is its share of all the interactions in the training data;
    items never seen in training count as 0.

    :param y_preds: predictions, as lists of lists
    :param x_train: training sessions, as lists of lists of hashable events
    :param k: top-k
    :return: mean over predictions of the mean popularity of their top-k items
        (an empty prediction contributes 0); 0.0 when there are no predictions
    """
    if len(y_preds) == 0:
        return 0.0
    # estimate popularity from training data
    interaction_counts = collections.Counter(event for session in x_train for event in session)
    num_interactions = sum(interaction_counts.values())
    # normalize popularity
    pop_map = {item: count / num_interactions for item, count in interaction_counts.items()}
    all_popularity = []
    for p in y_preds:
        top_k = p[:k]
        # average over the items actually considered, not over the full prediction list
        average_pop = sum(pop_map.get(_, 0.0) for _ in top_k) / len(top_k) if top_k else 0
        all_popularity.append(average_pop)
    return sum(all_popularity) / len(y_preds)
def error_by_cosine_distance(model, y_test, y_preds, k=3, bins=25, debug=False):
    """
    Measure, for every miss at k, the cosine distance between label and first prediction

    :param model: model whose class must expose a callable `get_vector`
    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param k: top-k used to decide what counts as a miss
    :param bins: bins parameter forwarded to the histogram
    :param debug: if true, plot the distribution of the distances
    :return: dict with the mean distance and the histogram as (counts, edges) lists;
        if the model cannot provide vectors, an error message string instead
    """
    model_cls = model.__class__
    if not callable(getattr(model_cls, 'get_vector', None)):
        error_msg = "Error : Model {} does not support retrieval of vector embeddings".format(model_cls)
        print(error_msg)
        return error_msg

    cos_distances = []
    for miss in sample_misses_at_k(y_preds, y_test, k=k, size=-1):
        if not miss['Y_PRED']:
            continue
        vector_test = model.get_vector(miss['Y_TEST'][0])
        vector_pred = model.get_vector(miss['Y_PRED'][0])
        # a falsy vector means no embedding is available for that item
        if vector_pred and vector_test:
            cos_distances.append(cosine(vector_pred, vector_test))

    counts, edges = np.histogram(cos_distances, bins=bins, density=False)
    # debug / viz
    if debug:
        plt.hist(cos_distances, bins=bins)
        plt.title('dist over cosine distance prod space')
        plt.show()
    # cast to list
    return {'mean': np.mean(cos_distances), 'histogram': (counts.tolist(), edges.tolist())}
def distance_to_query(model, x_test, y_test, y_preds, k=3, bins=25, debug=False):
    """
    For every miss at k, compare how far the label and the predictions are from the input

    :param model: model whose class must expose a callable `get_vector`
    :param x_test: inputs, as lists of lists; only the first element is used
    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param k: top-k used to decide what counts as a miss
    :param bins: bins parameter forwarded to the histograms
    :param debug: if true, plot the two distributions on top of each other
    :return: dict with histograms, as (counts, edges) lists, and raw cosine distances
        for input-to-label and input-to-prediction; if the model cannot provide
        vectors, an error message string instead
    """
    if not(hasattr(model.__class__, 'get_vector') and callable(getattr(model.__class__, 'get_vector'))):
        error_msg = "Error : Model {} does not support retrieval of vector embeddings".format(model.__class__)
        print(error_msg)
        return error_msg
    misses = sample_misses_at_k(y_preds, y_test, x_test=x_test, k=k, size=-1)
    x_to_y_cos = []
    x_to_p_cos = []
    for m in misses:
        if not m['Y_PRED']:
            continue
        vector_x = model.get_vector(m['X_TEST'][0])
        vector_y = model.get_vector(m['Y_TEST'][0])
        # both the input and the label need an embedding for the comparison to make sense
        if not vector_x or not vector_y:
            continue
        x_to_y_cos.append(cosine(vector_x, vector_y))
        c_dists = []
        for v_p in (model.get_vector(_) for _ in m['Y_PRED']):
            if not v_p:
                continue
            cos_dist = cosine(v_p, vector_x)
            # distances that evaluate to zero are left out of the average
            if cos_dist:
                c_dists.append(cos_dist)
        if c_dists:
            x_to_p_cos.append(mean(c_dists))
    h_xy = np.histogram(x_to_y_cos, bins=bins, density=False)
    h_xp = np.histogram(x_to_p_cos, bins=bins, density=False)
    # cast to list
    h_xy = (h_xy[0].tolist(), h_xy[1].tolist())
    h_xp = (h_xp[0].tolist(), h_xp[1].tolist())
    # debug / viz
    if debug:
        plt.hist(x_to_y_cos, bins=bins, alpha=0.5)
        plt.hist(x_to_p_cos, bins=bins, alpha=0.5)
        plt.title('distribution of distance to input')
        # legend entries follow the order of the two hist calls above
        plt.legend(['Distance from Input to Label',
                    'Distance from Input to Prediction'],
                   loc='upper right')
        plt.show()
    return {
        'histogram_x_to_y': h_xy,
        'histogram_x_to_p': h_xp,
        'raw_distances_x_to_y': x_to_y_cos,
        'raw_distances_x_to_p': x_to_p_cos,
    }
def generic_cosine_distance(embeddings: dict,
                            type_fn,
                            y_test,
                            y_preds,
                            k=10,
                            bins=25,
                            debug=False):
    """
    Measure, for every miss at k, the cosine distance between label and first
    prediction in a space of item properties (e.g. brands) rather than items

    :param embeddings: mapping from a property value to its vector
    :param type_fn: function mapping an item to the property value to look up
    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param k: top-k used to decide what counts as a miss
    :param bins: bins parameter forwarded to the histogram
    :param debug: if true, plot the distribution of the distances
    :return: dict with the mean distance and the histogram as (counts, edges) lists
    """
    cos_distances = []
    for miss in sample_misses_at_k(y_preds, y_test, k=k, size=-1):
        labels, predictions = miss['Y_TEST'], miss['Y_PRED']
        # both sides must be present and map to a truthy property value
        if not (labels and predictions and type_fn(labels[0]) and type_fn(predictions[0])):
            continue
        vector_test = embeddings.get(type_fn(labels[0]), None)
        vector_pred = embeddings.get(type_fn(predictions[0]), None)
        if vector_pred and vector_test:
            cos_distances.append(cosine(vector_pred, vector_test))

    # TODO: Maybe sample some examples from the bins
    counts, edges = np.histogram(cos_distances, bins=bins, density=False)
    # debug / viz
    if debug:
        plt.hist(cos_distances, bins=bins)
        plt.title('cosine distance misses')
        plt.show()
    # cast to list
    return {'mean': np.mean(cos_distances), 'histogram': (counts.tolist(), edges.tolist())}
def shortest_path_length():
    """
    Placeholder without an implementation: calling it does nothing and returns None
    """
def get_nodes(nodes, ancestors=""):
    """
    Turn a path of names into the list of its cumulative, underscore-joined prefixes

    Each entry is the previous one (starting from `ancestors`) joined with the next
    name by '_', e.g. ['a', 'b'] gives ['_a', '_a_b'] with the default `ancestors`.

    :param nodes: sequence of names, from the root downwards
    :param ancestors: prefix the first name is joined to
    :return: list with one cumulative path per name; empty for an empty input
    """
    paths = []
    if not nodes:
        return paths
    prefix = ancestors
    for node in nodes:
        prefix = '_'.join([prefix, node])
        paths.append(prefix)
    return paths
def graph_distance_test(y_test, y_preds, product_data, k=3):
    """
    Measure, for every miss at k, how far label and first prediction are in the
    category taxonomy

    For each pair a small undirected graph is built out of the category paths of the
    two products, and the shortest path between any of their leaf categories is taken
    as the distance. Pairs are skipped when a product is not in the catalog, has no
    categories, or when no path connects the two sets of leaves.

    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param product_data: catalog, keyed by sku; 'CATEGORIES' is read as a JSON string
        holding a list of category paths
    :param k: top-k used to decide what counts as a miss
    :return: dict with the mean path length and the histogram as (counts, edges)
        lists, one unit-wide bin per length; when no pair could be measured the mean
        is None and the histogram is empty
    """
    path_lengths = []
    misses = sample_misses_at_k(y_preds, y_test, k=k, size=-1)
    for miss in misses:
        _y, _y_p = miss['Y_TEST'], miss['Y_PRED']
        if not _y_p:
            continue
        _y_sku = _y[0]
        _y_p_sku = _y_p[0]
        if _y_sku not in product_data or _y_p_sku not in product_data:
            continue
        if not product_data[_y_sku]['CATEGORIES'] or not product_data[_y_p_sku]['CATEGORIES']:
            continue
        # extract graph information; empty category paths carry no node and are dropped
        catA = json.loads(product_data[_y_sku]['CATEGORIES'])
        catB = json.loads(product_data[_y_p_sku]['CATEGORIES'])
        catA_nodes = [get_nodes(c) for c in catA if c]
        catB_nodes = [get_nodes(c) for c in catB if c]
        if not catA_nodes or not catB_nodes:
            continue
        # build graph: every category path contributes a chain of parent-child edges
        G = nx.Graph()
        for chain in catA_nodes + catB_nodes:
            G.add_nodes_from(chain)
            G.add_edges_from(zip(chain[:-1], chain[1:]))
        # get leaves
        cat1_leaves = [c[-1] for c in catA_nodes]
        cat2_leaves = [c[-1] for c in catB_nodes]
        all_paths = []
        for c1_l in cat1_leaves:
            for c2_l in cat2_leaves:
                try:
                    all_paths.append(shortest_path(G, c1_l, c2_l))
                except nx.NetworkXNoPath:
                    # the two leaves live in unrelated branches of the taxonomy
                    continue
        if not all_paths:
            continue
        s_path = min(all_paths, key=len)
        # a path of n nodes spans n - 1 edges
        path_lengths.append(len(s_path) - 1)

    if not path_lengths:
        return {'mean': None, 'hist': ([], [])}
    # edges 0, 1, ..., max + 1 give every observed length, the largest included, its own bin
    histogram = np.histogram(path_lengths, bins=np.arange(0, max(path_lengths) + 2))
    histogram = (histogram[0].tolist(), histogram[1].tolist())
    return {'mean': mean(path_lengths), 'hist': histogram}
def roundup(x: int):
    """
    Round x up to a multiple of the power of ten that has one more digit than x

    The power of ten is 10 ** len(str(x)), e.g. 1000 for 123; the result is a float.

    :param x: value to round
    :return: the rounded value
    """
    num_chars = len(str(x))
    magnitude = 10.0 ** num_chars
    multiples = int(math.ceil(x / magnitude))
    return multiples * magnitude
def hits_distribution(x_train, x_test, y_test, y_preds, k=3, debug=False):
    """
    Break the hit rate at k down by how often the input product occurs in training

    Test cases are grouped by the training frequency of their first input item, and
    the groups are merged into logarithmically spaced frequency bins.

    :param x_train: training sessions, as lists of lists of hashable items
    :param x_test: inputs, as lists of lists; only the first element is used
    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param k: top-k
    :param debug: if true, plot the hit rate per bin
    :return: dict with, per upper bin edge, the mean hit rate ('histogram') and the
        number of test cases ('counts')
    """
    # get product interaction frequency
    prod_interaction_cnt = Counter(itertools.chain.from_iterable(x_train))
    hit_per_interaction_cnt = defaultdict(list)
    for session, label, prediction in zip(x_test, y_test, y_preds):
        frequency = prod_interaction_cnt[session[0]]
        # TODO: allow for generic metric
        hit_per_interaction_cnt[frequency].append(hit_rate_at_k([prediction], [label], k=k))
    # get max product frequency, rounded up to nearest place
    max_cnt = int(roundup(prod_interaction_cnt.most_common(1)[0][1]))
    # generate log-bins
    indices = np.logspace(1, np.log10(max_cnt), num=int(np.log10(max_cnt))).astype(np.int64)
    indices = np.concatenate(([0], indices))
    upper_edges = indices[1:]
    counts_per_bin = []
    for low, high in zip(indices[:-1], upper_edges):
        bucket = []
        for frequency in range(low, high):
            bucket.extend(hit_per_interaction_cnt.get(frequency, ()))
        counts_per_bin.append(bucket)

    histogram = [np.mean(bucket) if bucket else 0 for bucket in counts_per_bin]
    count = [len(bucket) for bucket in counts_per_bin]

    if debug:
        # debug / visualization
        plt.bar(upper_edges, histogram, width=-np.diff(indices)/1.05, align='edge')
        plt.xscale('log', base=10)
        plt.title('HIT Distribution Across Product Frequency')
        plt.show()

    return {
        'histogram': {int(edge): value for edge, value in zip(upper_edges, histogram)},
        'counts': {int(edge): value for edge, value in zip(upper_edges, count)}
    }
def hits_distribution_by_slice(slice_fns: dict,
                               y_test,
                               y_preds,
                               product_data,
                               k=3,
                               sample_size=3,
                               debug=False):
    """
    Compute the hit rate at k, with sampled hits and misses, on slices of the test set

    A test case belongs to a slice when its first label is in the catalog and the
    slice's filter accepts the catalog entry of that label.

    :param slice_fns: mapping from slice name to a filter over catalog entries
    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param product_data: catalog, keyed by sku
    :param k: top-k, used for the hit rate as well as for the sampled hits / misses
    :param sample_size: how many hits and misses to sample per slice
    :param debug: if true, plot the hit rate per slice
    :return: dict from slice name to {'hit_rate', 'hits', 'misses'}; a slice
        matching no test case gets a hit rate of 0.0
    """
    hit_rate_per_slice = {}
    for slice_name, filter_fn in slice_fns.items():
        # get indices for slice
        slice_idx = [idx for idx, _y in enumerate(y_test)
                     if _y[0] in product_data and filter_fn(product_data[_y[0]])]
        # get predictions for slice
        slice_y_preds = [y_preds[_] for _ in slice_idx]
        # get labels for slice
        slice_y_test = [y_test[_] for _ in slice_idx]
        # TODO: We may want to allow for generic metric to be used here
        # an empty slice is reported as 0.0 rather than dividing by zero
        slice_hr = hit_rate_at_k(slice_y_preds, slice_y_test, k=k) if slice_idx else 0.0
        # store results
        hit_rate_per_slice[slice_name] = {
            'hit_rate': slice_hr,
            'hits': sample_hits_at_k(slice_y_preds, slice_y_test, k=k, size=sample_size),
            'misses': sample_misses_at_k(slice_y_preds, slice_y_test, k=k, size=sample_size),
        }

    # debug / visualization
    if debug:
        x_tick_names = list(hit_rate_per_slice.keys())
        x_tick_idx = list(range(len(x_tick_names)))
        # bar heights are the hit rates, not the whole per-slice result dicts
        heights = [slice_result['hit_rate'] for slice_result in hit_rate_per_slice.values()]
        plt.bar(x_tick_idx, heights, align='center')
        plt.xticks(x_tick_idx, x_tick_names)
        plt.show()

    return hit_rate_per_slice
# TODO: We might need to enforce some standardization of how CATEGORY is represented
def get_item_with_category(product_data: dict, category: set, to_ignore=None):
    """
    Pick at random a product whose 'category_hash' equals the given category

    :param product_data: catalog, keyed by sku
    :param category: value the 'category_hash' of a product is compared with
    :param to_ignore: skus that must not be returned
    :return: a matching sku, or an empty list when there is none
    """
    excluded = to_ignore if to_ignore is not None else []
    candidates = []
    for sku in product_data:
        if product_data[sku]['category_hash'] == category and sku not in excluded:
            candidates.append(sku)
    return random.choice(candidates) if candidates else []
def perturb_session(session, product_data):
    """
    Swap the last item of a session with another product of the same category

    :param session: list of skus
    :param product_data: catalog, keyed by sku
    :return: a new session with the last item replaced, or an empty list when the
        last item is not in the catalog or no replacement is available
    """
    last_item = session[-1]
    if last_item in product_data:
        replacement = get_item_with_category(product_data,
                                             product_data[last_item]['category_hash'],
                                             to_ignore=[last_item])
        if replacement:
            return session[:-1] + [replacement]
    return []
def session_perturbation_test(model, x_test, y_preds, product_data):
    """
    Check how much predictions change when the last item of a session is swapped
    with another product of the same category

    Session events are expected to be strings whose part before the first '_' is
    the sku; perturbed sessions are rebuilt with an 'add' suffix before predicting.

    :param model: object exposing `predict` over a list of sessions
    :param x_test: sessions, as lists of strings
    :param y_preds: original predictions, aligned with x_test
    :param product_data: catalog, keyed by sku
    :return: mean, over the sessions with non-empty original and perturbed predictions,
        of the share of original predictions that are predicted again
    """
    kept_predictions = []
    perturbed_sessions = []
    # generate a batch of perturbations
    for session, prediction in tqdm(zip(x_test, y_preds)):
        skus = [event.split('_')[0] for event in session]
        # perturb last item in session
        perturbed = perturb_session(skus, product_data)
        if not perturbed:
            continue
        perturbed_sessions.append(['_'.join([sku, 'add']) for sku in perturbed])
        kept_predictions.append(prediction)

    y_perturbs = model.predict(perturbed_sessions)

    # compute prediction intersection
    overlap_ratios = [
        len(set(perturbed_pred).intersection(original_pred)) / len(original_pred)
        for original_pred, perturbed_pred in zip(kept_predictions, y_perturbs)
        if original_pred and perturbed_pred
    ]
    return np.mean(overlap_ratios)
def price_homogeneity_test(y_test, y_preds, product_data, bins=25, debug=True, key='PRICE'):
    """
    Compare the price of the label with the price of the first prediction

    The comparison is the absolute difference of the base-10 logarithms of the two
    prices. Pairs are skipped when there is no prediction, when either product is
    missing from the catalog, or when either price is missing or not strictly positive.

    :param y_test: target data, as lists of lists
    :param y_preds: predictions, as lists of lists
    :param product_data: catalog, keyed by sku
    :param bins: bins parameter used for the histogram and for the debug plot
    :param debug: if true, plot the distribution of the differences
    :param key: catalog field holding the price
    :return: dict with the mean difference and the histogram as (counts, edges) lists
    """
    abs_log_price_diff = []
    for _y, _y_pred in zip(y_test, y_preds):
        # need >=1 predictions
        if not _y_pred:
            continue
        # item must be in product data
        if _y[0] not in product_data or _y_pred[0] not in product_data:
            continue
        if product_data[_y[0]][key] and product_data[_y_pred[0]][key]:
            pred_item_price = float(product_data[_y_pred[0]][key])
            y_item_price = float(product_data[_y[0]][key])
            # the logarithm is only defined for strictly positive prices
            if pred_item_price > 0 and y_item_price > 0:
                abs_log_price_diff.append(np.abs(np.log10(pred_item_price) - np.log10(y_item_price)))
    histogram = np.histogram(abs_log_price_diff, bins=bins, density=False)
    # cast to list
    histogram = (histogram[0].tolist(), histogram[1].tolist())
    if debug:
        # debug / viz
        plt.hist(abs_log_price_diff, bins=bins)
        plt.show()
    return {
        'mean': np.mean(abs_log_price_diff),
        'histogram': histogram
    }
def rec_test(test_type: str):
    """
    Rec test decorator

    The decorated callable keeps its behaviour and gains the attributes used to
    discover and report tests: `is_test` (always True), `test_type`, `test_desc`
    (the stripped docstring, or an empty string without one) and `name`.

    :param test_type: label stored on the decorated callable as `test_type`
    :return: the decorator to apply to the test function
    """
    def decorator(f):
        @wraps(f)
        def w(*args, **kwargs):
            return f(*args, **kwargs)

        # add attributes to f
        w.is_test = True
        w.test_type = test_type
        # a function without a docstring has __doc__ set to None
        w.test_desc = (f.__doc__ or "").strip()
        # w is a plain function, so __name__ is always available
        w.name = w.__name__
        return w
    return decorator
class RecList(ABC):
    """
    Collection of behavioral tests bound to a model and a dataset

    Subclasses add tests as methods decorated with `rec_test`; calling the instance
    runs every test, collects the results and writes a report under META_DATA_FOLDER.
    """

    META_DATA_FOLDER = '.reclist'

    def __init__(self, model: RecModel, dataset: RecDataset, y_preds: list = None):
        """
        :param model: model under test; used to predict over the test inputs when
            y_preds is not provided (or empty)
        :param dataset: dataset providing train / test splits and the catalog
        :param y_preds: optional pre-computed predictions, aligned with the test set
        """
        self.name = type(self).__name__
        # test discovery happens first, before the data attributes below exist
        self._rec_tests = self.get_tests()
        self._x_train = dataset.x_train
        self._y_train = dataset.y_train
        self._x_test = dataset.x_test
        self._y_test = dataset.y_test
        self._y_preds = y_preds if y_preds else model.predict(dataset.x_test)
        self.rec_model = model
        self.product_data = dataset.catalog
        self._test_results = []
        self._test_data = {}
        self._dense_repr = {}
        # one prediction per test case
        assert len(self._y_test) == len(self._y_preds)

    def train_dense_repr(self, type_name: str, type_fn):
        """
        Train a dense representation over a type of meta-data & store into object

        :param type_name: key under which the representation is stored
        :param type_fn: given a SKU returns some type i.e. brand; falsy results are dropped
        """
        x_train_transformed = []
        for session in self._x_train:
            x_train_transformed.append([type_fn(e) for e in session if type_fn(e)])
        wv = train_embeddings(x_train_transformed)
        # store a dict from each key of the space to its vector as a plain list
        self._dense_repr[type_name] = {word: list(wv.get_vector(word)) for word in wv.key_to_index}

    def get_tests(self):
        """
        Helper to extract methods decorated with rec_test

        :return: dict from test name to the bound test method
        """
        discovered = {}
        for attribute_name in self.__dir__():
            # attributes that cannot be read yet are not tests
            if hasattr(self, attribute_name):
                candidate = getattr(self, attribute_name)
                if hasattr(candidate, 'is_test'):
                    discovered[candidate.name] = candidate
        return discovered

    def __call__(self, verbose=True, *args, **kwargs):
        """
        Run every test, record the results and generate the report

        :param verbose: if true, print each result and the report time
        :param args: forwarded to every test
        :param kwargs: forwarded to every test
        """
        run_epoch_time_ms = round(time.time() * 1000)
        # iterate through tests
        for test in self._rec_tests.values():
            test_result = test(*args, **kwargs)
            # we could store the results in the test function itself
            # test.__func__.test_result = test_result
            outcome = {
                'test_name': test.test_type,
                'description': test.test_desc,
                'test_result': test_result,
            }
            self._test_results.append(outcome)
            if verbose:
                print("============= TEST RESULTS ===============")
                print("Test Type : {}".format(test.test_type))
                print("Test Description : {}".format(test.test_desc))
                print("Test Result : {}\n".format(test_result))
        # at the end, we dump it locally
        if verbose:
            print("Generating reports at {}".format(datetime.utcnow()))
        self.generate_report(run_epoch_time_ms)

    def generate_report(self, epoch_time_ms: int):
        """
        Write results and artifacts for a run

        :param epoch_time_ms: run time in epoch milliseconds, used as the run folder name
        """
        # create path first: META_DATA_FOLDER / RecList / Model / Run Time
        path_parts = [
            self.META_DATA_FOLDER,
            self.name,
            self.rec_model.__class__.__name__,
            str(epoch_time_ms),
        ]
        report_path = os.path.join(*path_parts)
        # now, dump results
        self.dump_results_to_json(self._test_results, report_path, epoch_time_ms)
        # now, store artifacts
        self.store_artifacts(report_path)

    def store_artifacts(self, report_path: str):
        """
        Store test inputs, labels and predictions as JSON under <report_path>/artifacts

        :param report_path: folder of the current run
        """
        target_path = os.path.join(report_path, 'artifacts')
        # make sure the folder is there, with all intermediate parents
        Path(target_path).mkdir(parents=True, exist_ok=True)
        # store predictions
        payload = {
            'x_test': self._x_test,
            'y_test': self._y_test,
            'y_preds': self._y_preds
        }
        with open(os.path.join(target_path, 'model_predictions.json'), 'w') as f:
            json.dump(payload, f)

    def dump_results_to_json(self, test_results: list, report_path: str, epoch_time_ms: int):
        """
        Store run metadata and test results as JSON under <report_path>/results

        :param test_results: results to store in the 'data' field of the report
        :param report_path: folder of the current run
        :param epoch_time_ms: run time in epoch milliseconds
        """
        target_path = os.path.join(report_path, 'results')
        # make sure the folder is there, with all intermediate parents
        Path(target_path).mkdir(parents=True, exist_ok=True)
        metadata = {
            'run_time': epoch_time_ms,
            'model_name': self.rec_model.__class__.__name__,
            'reclist': self.name,
            'tests': list(self._rec_tests.keys())
        }
        report = {'metadata': metadata, 'data': test_results}
        with open(os.path.join(target_path, 'report.json'), 'w') as f:
            json.dump(report, f)

    @property
    def test_results(self):
        """Results collected so far, one dict per executed test"""
        return self._test_results

    @property
    def test_data(self):
        """Dict of test data held by the instance"""
        return self._test_data

    @property
    def rec_tests(self):
        """Dict from test name to test method, as discovered at construction time"""
        return self._rec_tests
class CoveoCartRecList(RecList):
    # RecList for the cart use case: sessions, labels and predictions are lists of
    # event dicts, reduced to their 'product_sku' before any metric is computed.
    # NOTE: the docstrings of the tests below are read at runtime by rec_test and end
    # up in the reports, so they double as test descriptions.

    @rec_test(test_type='stats')
    def basic_stats(self):
        """
        Basic statistics on training, test and prediction data
        """
        splits = (self._x_train, self._y_train, self._x_test, self._y_test)
        return statistics(*splits, self._y_preds)

    @rec_test(test_type='Coverage@10')
    def coverage_at_k(self):
        """
        Coverage is the proportion of all possible products which the RS
        recommends based on a set of sessions
        """
        predicted_skus = self.sku_only(self._y_preds)
        return coverage_at_k(predicted_skus, self.product_data, k=10)

    @rec_test(test_type='HR@10')
    def hit_rate_at_k(self):
        """
        Compute the rate in which the top-k predictions contain the item to be predicted
        """
        predicted_skus = self.sku_only(self._y_preds)
        label_skus = self.sku_only(self._y_test)
        return hit_rate_at_k(predicted_skus, label_skus, k=10)

    @rec_test(test_type='hits_distribution')
    def hits_distribution(self):
        """
        Compute the distribution of hit-rate across product frequency in training data
        """
        train_skus = self.sku_only(self._x_train)
        input_skus = self.sku_only(self._x_test)
        label_skus = self.sku_only(self._y_test)
        predicted_skus = self.sku_only(self._y_preds)
        return hits_distribution(train_skus, input_skus, label_skus, predicted_skus,
                                 k=10, debug=True)

    @rec_test(test_type='distance_to_query')
    def dist_to_query(self):
        """
        Compute the distribution of distance from query to label and query to prediction
        """
        input_skus = self.sku_only(self._x_test)
        label_skus = self.sku_only(self._y_test)
        predicted_skus = self.sku_only(self._y_preds)
        return distance_to_query(self.rec_model, input_skus, label_skus, predicted_skus,
                                 k=10, bins=25, debug=True)

    def sku_only(self, l: List[List]):
        # keep only the 'product_sku' of every event, preserving the nesting
        skus_per_session = []
        for session in l:
            skus_per_session.append([event['product_sku'] for event in session])
        return skus_per_session
Let's download the data and load it. If the dataset has already been downloaded, the download step is skipped automatically.
# Build the dataset wrapper; the progress bar shown after this cell indicates
# that instantiation downloads the archive when needed.
coveo_dataset = CoveoDataset()
coveo_sigir.zip: 100%|██████████| 1.91G/1.91G [02:04<00:00, 16.5MB/s]
Let's see what's inside the dataset; there is a lot of information here. We are looking at a list of sessions: each element in the list is a user session, and for each event we know what the user was doing (e.g., "detail") and which product they were looking at.
# Display the first training session (a list of event dicts, as the output shows).
coveo_dataset.x_train[0]
[{'event_type': 'event_product', 'hashed_url': '803f6c2d4202e39d6d7fdb232d69366b86bc843869c809f1e1954465bfc6e17f', 'product_action': 'detail', 'product_sku': '624bc145579b67b608e6a7b0d0516cc75e0ec4cbe44ec42c6ac53cc83925bc3e', 'server_timestamp_epoch_ms': '1547528580651', 'session_id': '0f1416c8c68bb9209c1bbc4576386df5480e9757f55ce9cb0d4d4017cf14fc1c'}, {'event_type': 'event_product', 'hashed_url': '80ee4de500233dc623dde6a2730d4bb61e651e5ae3fe33abd985598551de253e', 'product_action': 'detail', 'product_sku': '3b59c7591dd71a2ade8df707e4b1d4e65ef6d1b165aabf54191feccf056b93df', 'server_timestamp_epoch_ms': '1547528590750', 'session_id': '0f1416c8c68bb9209c1bbc4576386df5480e9757f55ce9cb0d4d4017cf14fc1c'}, {'event_type': 'event_product', 'hashed_url': 'ca21b70cf35319e289af06659c222a0fa24b6e9414f1d3d32a3ec86a1ecc0a44', 'product_action': 'detail', 'product_sku': '983d92391f7db7d45005e0d0c61387dea6a05116ae5c5260b90c82a55dc15121', 'server_timestamp_epoch_ms': '1547528595759', 'session_id': '0f1416c8c68bb9209c1bbc4576386df5480e9757f55ce9cb0d4d4017cf14fc1c'}]
# Reduce every training session to the sequence of SKUs of its events.
x_train_skus = [
    [event['product_sku'] for event in session]
    for session in coveo_dataset.x_train
]
# Preview the first three SKU sequences.
x_train_skus[:3]
[['624bc145579b67b608e6a7b0d0516cc75e0ec4cbe44ec42c6ac53cc83925bc3e', '3b59c7591dd71a2ade8df707e4b1d4e65ef6d1b165aabf54191feccf056b93df', '983d92391f7db7d45005e0d0c61387dea6a05116ae5c5260b90c82a55dc15121'], ['d6392b60c1bc118a06cdb12caacbc45513cd5d9d9d4aef0826c4b9015cf70d1d', '605d79343281f976747693e6bbb5757947d9842e02d21bcaa6a02edee3ecd9a9', 'd6392b60c1bc118a06cdb12caacbc45513cd5d9d9d4aef0826c4b9015cf70d1d'], ['c3b0916415e119da78c0746cfaeaa6dc806f3076718c9c703337d75d9d61eb4f', 'c17daa37dc5a0199940118e7edb402526fbca70929086f2e7ba042c5af2299b3', '859e3db8924990d6fb811e67a2f88f4711b6b9407b271c39f6a16838dcf22dcf']]
Let's build a vanilla recommender with a KNN product-based model!
# Train product embeddings on the SKU sequences of the training sessions.
embeddings = train_embeddings(sessions=x_train_skus)
# products in the space: 27562
# Wrap the trained embeddings in the recommender model used by the tests below.
model = P2VRecModel(model=embeddings)
Let's check that our model is working: we will try a simple prediction starting from a single product.
# A batch with one session holding a single "detail" event; the event has the
# same fields as the events of the training sessions.
test = [[{'event_type': 'event_product',
'hashed_url': '803f6c2d4202e39d6d7fdb232d69366b86bc843869c809f1e1954465bfc6e17f',
'product_action': 'detail',
'product_sku': '624bc145579b67b608e6a7b0d0516cc75e0ec4cbe44ec42c6ac53cc83925bc3e',
'server_timestamp_epoch_ms': '1547528580651',
'session_id': '0f1416c8c68bb9209c1bbc4576386df5480e9757f55ce9cb0d4d4017cf14fc1c'}]]
# Ask the model for recommendations; the output holds one list of
# {'product_sku': ...} dicts per input session.
model.predict(test)
[[{'product_sku': '064ccca34647e8c65f7421a22cc520e40646657322b2196218a99b9dc54417d9'}, {'product_sku': '52c7818bf0ce31793a43319fac4136404c7df939152f3d91e9b193e86d20ef64'}, {'product_sku': '39cd17df8567532b9db06983e6d5312c70f61b1d9276deef0343514cd83809ab'}, {'product_sku': '2a112f0436d6d8b8cbecb61acca3fa88383ff76b05053d975caa40482e9736ed'}, {'product_sku': '704aac67713b93c893912c3d10d9751af53de51c5f05f24afb702080545212b5'}, {'product_sku': '1e7a9c74d8769294783c0198d649ecba4caea98c53de129c51a38f7566d1b09c'}, {'product_sku': 'b42787bd0decd82b42906b2488a8dfb99ba9bbd717113c3c5d808eeb0265bec2'}, {'product_sku': 'e5a80ede4427db027b53a73ed6943c1e9f378ddbbfe49e523dfaefaca660785e'}, {'product_sku': 'eb083190e2fd4e8ea272786338092674430d8e4b51621e102816ad757aecee5f'}, {'product_sku': '962122e7a9ded48c578e7f0f167d61c94d5bfc7e077c32ee533b42b7b4fb5833'}]]
Instantiate the rec_list object, which comes prepared with standard quantitative tests and sensible behavioral tests. Then invoke rec_list to run the tests.
# Instantiate the rec list, binding the model under test to the dataset
# that provides the train/test data.
rec_list = CoveoCartRecList(
model=model,
dataset=coveo_dataset
)
# Calling the instance runs the tests; the output below lists the type,
# description and result of each one.
rec_list(verbose=True)
============= TEST RESULTS =============== Test Type : stats Test Description : Basic statistics on training, test and prediction data Test Result : {'training_set__size': 927357, 'test_set_size': 790, 'num_non_null_predictions': 762} ============= TEST RESULTS =============== Test Type : Coverage@10 Test Description : Coverage is the proportion of all possible products which the RS recommends based on a set of sessions Test Result : 0.0015063416985508992 ============= TEST RESULTS =============== Test Type : HR@10 Test Description : Compute the rate in which the top-k predictions contain the item to be predicted Test Result : 0.12151898734177215
============= TEST RESULTS =============== Test Type : hits_distribution Test Description : Compute the distribution of hit-rate across product frequency in training data Test Result : {'histogram': {10: 0.02702702702702703, 100: 0.14410480349344978, 1000: 0.12534818941504178, 10000: 0.125, 100000: 0}, 'counts': {10: 74, 100: 229, 1000: 359, 10000: 128, 100000: 0}}
============= TEST RESULTS =============== Test Type : distance_to_query Test Description : Compute the distribution of distance from query to label and query to prediction Test Result : {'histogram_x_to_y': ([10, 21, 20, 22, 22, 36, 39, 39, 35, 58, 39, 53, 41, 35, 35, 21, 31, 26, 23, 12, 6, 7, 3, 4, 1], [0.027085185050964355, 0.07621452689170838, 0.1253438687324524, 0.1744732105731964, 0.22360255241394042, 0.27273189425468447, 0.32186123609542844, 0.37099057793617246, 0.4201199197769165, 0.4692492616176605, 0.5183786034584046, 0.5675079452991485, 0.6166372871398925, 0.6657666289806365, 0.7148959708213806, 0.7640253126621246, 0.8131546545028686, 0.8622839963436126, 0.9114133381843567, 0.9605426800251007, 1.0096720218658448, 1.0588013637065887, 1.1079307055473326, 1.1570600473880768, 1.2061893892288207, 1.2553187310695648]), 'histogram_x_to_p': ([4, 11, 21, 20, 29, 28, 35, 33, 24, 34, 33, 30, 28, 37, 33, 32, 34, 33, 20, 61, 17, 20, 13, 4, 5], [0.012268495559692384, 0.025094011306762698, 0.03791952705383301, 0.05074504280090332, 0.06357055854797364, 0.07639607429504394, 0.08922159004211426, 0.10204710578918456, 0.11487262153625488, 0.1276981372833252, 0.14052365303039552, 0.15334916877746582, 0.16617468452453615, 0.17900020027160646, 0.19182571601867676, 0.2046512317657471, 0.2174767475128174, 0.2303022632598877, 0.24312777900695803, 0.25595329475402834, 0.26877881050109864, 0.28160432624816895, 0.29442984199523925, 0.3072553577423096, 0.3200808734893799, 0.3329063892364502]), 'raw_distances_x_to_y': [0.9254719093441963, 0.462533175945282, 0.21085858345031738, 0.47345417737960815, 0.8189148157835007, 0.6212723255157471, 0.1114012598991394, 0.4277552366256714, 0.5421712100505829, 0.3438114523887634, 0.6834848821163177, 0.12182509899139404, 0.5115478038787842, 0.2190667986869812, 0.7817028611898422, 0.7468498349189758, 0.49416977167129517, 0.2725176215171814, 0.5640368163585663, 0.10806989669799805, 0.5597535669803619, 0.26346278190612793, 0.2727212905883789, 
0.31810325384140015, 0.7297700345516205, 0.5911677479743958, 0.5790992975234985, 1.0076064178720117, 0.430014431476593, 0.2532806396484375, 0.4981846809387207, 0.6517561078071594, 0.08466029167175293, 0.15237146615982056, 0.781215712428093, 1.0076064178720117, 0.23168128728866577, 0.2835843563079834, 0.34047961235046387, 0.7128640711307526, 0.6921305954456329, 0.4090891480445862, 0.45411020517349243, 0.308832049369812, 0.8578503280878067, 0.8280059248209, 0.09990346431732178, 0.36499106884002686, 0.47029995918273926, 0.9402659200131893, 0.26203757524490356, 0.6552892327308655, 0.40001380443573, 0.27151793241500854, 0.8445519506931305, 0.28339993953704834, 0.17017924785614014, 0.38485950231552124, 0.9698969088494778, 0.2542409896850586, 0.30253440141677856, 0.5792183578014374, 0.5336647927761078, 0.6775735020637512, 0.6186142265796661, 0.7454920709133148, 0.076199471950531, 0.2535223960876465, 0.5798035562038422, 0.6477236449718475, 0.7253351509571075, 0.36245912313461304, 0.5802899301052094, 0.6831025183200836, 0.3974495530128479, 0.4377900958061218, 0.874511793255806, 0.09468638896942139, 0.30603235960006714, 0.6420265138149261, 0.8917598351836205, 0.8514892011880875, 0.28252047300338745, 0.3410959243774414, 0.7218420505523682, 0.49533963203430176, 0.7329410910606384, 0.4549601674079895, 0.8185496628284454, 0.5602165162563324, 0.935164324939251, 0.6192016303539276, 0.6965530812740326, 0.506960391998291, 0.6797125935554504, 0.972739702090621, 0.5587949752807617, 0.9648317396640778, 0.7918451577425003, 0.10904902219772339, 0.26039278507232666, 0.48779743909835815, 0.32050520181655884, 0.9108971506357193, 0.17316967248916626, 0.1562144160270691, 0.5707772672176361, 0.7172189950942993, 0.4743512272834778, 0.5884027183055878, 0.39823293685913086, 0.37635380029678345, 0.48903268575668335, 0.2427654266357422, 0.740830808877945, 0.3790507912635803, 0.48203784227371216, 0.26257091760635376, 0.714726060628891, 0.3886526823043823, 0.506866067647934, 0.38689154386520386, 
0.8665386885404587, 0.4252302050590515, 0.3790507912635803, 0.2864288091659546, 0.5251165926456451, 0.2038726806640625, 0.6813859939575195, 0.6797683835029602, 0.13988149166107178, 0.4257371425628662, 0.6832488477230072, 0.4939197897911072, 0.7525335550308228, 0.6219100654125214, 0.15661191940307617, 0.639555811882019, 0.6219100654125214, 0.6514264643192291, 0.35110509395599365, 0.6271197497844696, 0.462327778339386, 0.6513311266899109, 0.10752040147781372, 0.5508485436439514, 0.3116947412490845, 0.7269882261753082, 0.6206513047218323, 0.5326894521713257, 0.8309376835823059, 0.2881903052330017, 0.3581002354621887, 0.16245216131210327, 0.5602153539657593, 0.5890812277793884, 0.1208798885345459, 0.4772169589996338, 0.44260478019714355, 0.6125787794589996, 0.09781920909881592, 0.7173270881175995, 0.9057733789086342, 0.21883291006088257, 0.2283496856689453, 0.3400246500968933, 0.6160574853420258, 0.46860283613204956, 0.4285128712654114, 0.4254186749458313, 0.4115585684776306, 0.8667544275522232, 0.06605154275894165, 0.44577592611312866, 0.4536110758781433, 1.0924406200647354, 0.5168035328388214, 0.5836308300495148, 0.4283052682876587, 0.22816848754882812, 0.3787039518356323, 0.07026445865631104, 0.8491666465997696, 0.3641747832298279, 1.1299540549516678, 0.44231313467025757, 1.167671948671341, 0.918891005218029, 0.7797763794660568, 0.8064161539077759, 0.7030901312828064, 0.1738971471786499, 0.11099112033843994, 0.4285128712654114, 0.6690216064453125, 0.5043551623821259, 0.5595879852771759, 0.40289217233657837, 0.06950515508651733, 0.46268099546432495, 0.39849114418029785, 0.4093968868255615, 0.7422437965869904, 0.5640649199485779, 0.5044651627540588, 0.3718870282173157, 0.4949328303337097, 0.3384582996368408, 0.9164132475852966, 0.8826794549822807, 0.4781320095062256, 0.39250558614730835, 0.47071176767349243, 0.2293713092803955, 0.6476076543331146, 0.8838279992341995, 0.27570104598999023, 0.19004637002944946, 0.6612381637096405, 0.6677241921424866, 0.19149482250213623, 
0.7657949030399323, 0.5640649199485779, 0.3024331331253052, 0.1256752610206604, 0.36499106884002686, 0.4845578074455261, 0.5405918657779694, 0.5955899655818939, 0.9245310798287392, 0.9297516494989395, 0.21759074926376343, 0.3373871445655823, 0.3699283003807068, 0.6013206541538239, 0.9102961793541908, 0.843553215265274, 0.5168035328388214, 0.3229179382324219, 0.8645953387022018, 0.572632223367691, 0.4692692160606384, 0.9606809169054031, 0.5453396439552307, 0.5945159792900085, 0.6038006544113159, 0.85569629073143, 0.565295547246933, 0.9472744166851044, 0.49883782863616943, 0.32196158170700073, 0.6548276245594025, 0.38367241621017456, 0.3617027997970581, 0.6122760474681854, 1.0495793037116528, 0.12808352708816528, 0.7142658829689026, 0.8562994003295898, 0.2721226215362549, 0.3397049307823181, 0.027085185050964355, 0.6539434194564819, 0.5633589923381805, 0.3535528779029846, 0.18204158544540405, 0.2661483883857727, 0.8781896978616714, 0.6539434194564819, 0.3516688942909241, 0.9167362451553345, 0.12582182884216309, 0.19273149967193604, 0.6312002241611481, 0.49662697315216064, 0.7021820545196533, 0.2805701494216919, 0.39922797679901123, 0.6158891320228577, 0.07739633321762085, 0.617974579334259, 0.32775402069091797, 0.5247384607791901, 0.2149120569229126, 0.4774479866027832, 0.5334309637546539, 0.7717662453651428, 0.6299411654472351, 0.9769610203802586, 0.3056491017341614, 0.5658897757530212, 0.4421624541282654, 0.5553690791130066, 0.9007061719894409, 0.4115585684776306, 0.6690648198127747, 0.3697749972343445, 0.17395007610321045, 0.4646461009979248, 1.0299112629145384, 0.600534588098526, 0.4646461009979248, 0.9810393769294024, 0.8647336810827255, 0.34212249517440796, 0.5590666830539703, 0.3464195132255554, 0.05532979965209961, 0.20188939571380615, 0.4105381965637207, 0.4939197897911072, 0.2629045248031616, 0.28857362270355225, 0.29965150356292725, 0.4188130497932434, 0.2244529128074646, 0.7472360134124756, 0.6975664496421814, 0.49244630336761475, 0.08120405673980713, 
0.48203784227371216, 0.4968187212944031, 0.6568638682365417, 0.5499407947063446, 0.4152105450630188, 0.8767637833952904, 0.6606419384479523, 0.5982445478439331, 0.8854007720947266, 0.451798677444458, 0.7668242305517197, 0.29245537519454956, 0.5700768232345581, 0.27996474504470825, 0.39198386669158936, 0.2561491131782532, 0.2906454801559448, 0.8172992914915085, 0.36499106884002686, 0.839078962802887, 0.871750995516777, 0.7232131659984589, 0.3438114523887634, 0.3022156357765198, 0.6184073686599731, 0.5854481160640717, 1.01153430249542, 0.7308022677898407, 0.21456676721572876, 0.7636695057153702, 0.8693548738956451, 0.42182499170303345, 0.7092864513397217, 1.2553187310695648, 0.5536989271640778, 0.8955832570791245, 0.4093968868255615, 0.5419392883777618, 0.505172461271286, 0.3697749972343445, 0.9452919512987137, 0.763588935136795, 0.6174618601799011, 0.5677748918533325, 0.5108366012573242, 0.2942226529121399, 0.8166680783033371, 0.6539860665798187, 0.6094558238983154, 0.3326358199119568, 0.30736833810806274, 0.4509674310684204, 0.40113013982772827, 0.5168035328388214, 0.20486664772033691, 0.4109269976615906, 0.03789621591567993, 0.7438116669654846, 0.7488087117671967, 1.0868622660636902, 0.7574658691883087, 0.5947867333889008, 0.20400893688201904, 0.5677686929702759, 0.49960315227508545, 0.4939197897911072, 0.6324397325515747, 0.4222393035888672, 0.8293663859367371, 0.9296053647994995, 0.5602153539657593, 0.11834901571273804, 0.7321200668811798, 1.1602952182292938, 0.5965181291103363, 0.9149350449442863, 0.506960391998291, 0.5186827778816223, 0.6399234533309937, 0.4691590666770935, 0.7098327279090881, 0.1822097897529602, 0.5572097897529602, 0.40265655517578125, 0.5968579947948456, 0.48597395420074463, 0.8904133811593056, 0.13772159814834595, 0.7772166877985001, 0.40333008766174316, 0.08821582794189453, 0.21322739124298096, 0.8235199153423309, 0.3984646201133728, 0.46459639072418213, 0.5436182022094727, 0.9407618939876556, 0.5690764486789703, 0.8153036832809448, 
0.5076813995838165, 0.8492564111948013, 0.15788805484771729, 0.11221820116043091, 0.5724060535430908, 0.7176460325717926, 0.8440407365560532, 0.7636695057153702, 0.5076813995838165, 0.6323907971382141, 1.0598904006183147, 0.6086966693401337, 0.5732938051223755, 0.6028461754322052, 0.4580917954444885, 0.7809314131736755, 0.7321200668811798, 0.7066938281059265, 0.36259204149246216, 0.9492336809635162, 1.0921175256371498, 0.6050473153591156, 0.3663545846939087, 0.4949040412902832, 0.15298473834991455, 0.9291635751724243, 0.6140021979808807, 0.2852240204811096, 0.4260064363479614, 0.1934911012649536, 0.7881776839494705, 0.35110509395599365, 0.3110712170600891, 0.9965950401965529, 0.5273209810256958, 0.08015191555023193, 0.4719691276550293, 0.9257553964853287, 0.08657711744308472, 1.061875719577074, 0.7328509390354156, 0.41553622484207153, 0.4125799536705017, 0.6805701553821564, 0.4923962950706482, 0.06813162565231323, 0.5629296898841858, 0.4759763479232788, 0.5833230018615723, 0.36499106884002686, 0.9472270794212818, 0.5061180889606476, 0.9592425897717476, 0.5083636045455933, 0.14995676279067993, 0.5998384058475494, 1.0139076802879572, 0.49391037225723267, 0.7350744903087616, 0.8142646700143814, 0.6496772468090057, 0.7049342393875122, 0.7604426890611649, 0.7480517625808716, 0.7623411267995834, 0.6347159445285797, 0.5602153539657593, 0.1489863395690918, 0.18149614334106445, 0.3795756697654724, 0.6545434892177582, 0.5564406514167786, 0.6680121421813965, 0.9342424422502518, 0.657894492149353, 0.7992750257253647, 0.09565281867980957, 0.8649559319019318, 0.6687523424625397, 0.2839009165763855, 0.6356145441532135, 0.5763238072395325, 0.41780197620391846, 0.33846670389175415, 0.8244993835687637, 0.2919861078262329, 0.5979242324829102, 0.16055721044540405, 0.44577592611312866, 0.43739110231399536, 0.8500117361545563, 0.8017996996641159, 0.22006440162658691, 0.8103550374507904, 0.2716165781021118, 0.4894411563873291, 0.4743186831474304, 1.1532704830169678, 0.6796385049819946, 
0.5788311660289764, 0.5738734304904938, 1.0360383354127407, 0.8758786469697952, 0.7802753150463104, 0.6680121421813965, 0.32175880670547485, 0.7403334379196167, 0.1184920072555542, 0.985954599454999, 0.5621178150177002, 0.05862969160079956, 0.9962523609865457, 0.3015354871749878, 0.6796385049819946, 0.8925864547491074, 0.28270334005355835, 0.6035843193531036, 0.5169978737831116, 0.2768561840057373, 0.4702553153038025, 0.5790992975234985, 1.1766748428344727, 0.47123968601226807, 0.5027961134910583, 0.8790005818009377, 0.7171919941902161, 0.6880250871181488, 0.8500117361545563, 1.069704793393612, 0.49865972995758057, 0.6096971333026886, 0.8759694769978523, 0.44577592611312866, 0.2793188691139221, 0.715606302022934, 0.3277524709701538, 0.8630160391330719, 0.6029638350009918, 0.672376275062561, 0.7672243714332581, 0.6264006495475769, 0.477161705493927, 0.8390759825706482, 0.41296738386154175, 1.1259145587682724, 0.595622718334198, 0.41994398832321167, 0.8203410059213638, 0.5836825668811798, 0.9287489503622055, 0.6963227391242981, 0.5261703729629517, 0.5353045463562012, 0.5869246125221252, 0.5790992975234985, 0.5850421190261841, 0.36374711990356445, 0.5985998511314392, 0.6088151931762695, 0.747675210237503, 0.06854182481765747, 0.6931126117706299, 0.645240068435669, 0.7922555953264236, 0.5883113443851471, 1.0752593576908112, 0.285663902759552, 0.6201569736003876, 0.56195929646492, 0.4090891480445862, 0.8667544275522232, 0.2510431408882141, 0.5058979690074921, 0.4431813359260559, 0.8103550374507904, 0.3243250250816345, 0.30453455448150635, 0.8527398705482483, 0.3313218951225281, 0.19052201509475708, 0.5382198095321655, 0.7340660691261292, 0.20157545804977417, 0.8104113787412643, 0.39328664541244507, 0.4090891480445862, 0.8347247093915939, 1.1893530488014221, 0.4659144878387451, 0.3405866026878357, 0.504460483789444, 0.3602169156074524, 0.6858391165733337, 1.0038859038613737, 0.5214316546916962, 0.8468800038099289, 0.47100013494491577, 0.8355625718832016, 
0.9142830222845078, 0.7842075973749161, 0.48353803157806396, 0.6460709273815155, 0.9214013367891312, 0.16748452186584473, 0.7851828187704086, 0.9456217810511589, 1.0546553991734982, 0.6465227603912354, 0.7309760749340057, 0.7143962383270264, 0.6868995428085327, 0.29914140701293945, 0.8468800038099289, 0.1323196291923523, 0.6218132972717285, 0.36877548694610596, 0.8355625718832016, 0.29121387004852295, 0.22311073541641235, 0.10930991172790527], 'raw_distances_x_to_p': [0.07491215467453002, 0.1258436381816864, 0.07867926359176636, 0.2686879813671112, 0.2801799297332764, 0.0717882513999939, 0.0725929856300354, 0.23633030652999878, 0.10049613714218139, 0.2687165439128876, 0.21981453895568848, 0.060776901245117185, 0.20193864703178405, 0.17324100732803344, 0.2267237663269043, 0.2679001986980438, 0.2943339288234711, 0.09887873530387878, 0.18990106582641603, 0.013069415092468261, 0.22808496952056884, 0.177504962682724, 0.19701056480407714, 0.10048171281814575, 0.22985101938247682, 0.18766284584999085, 0.2594358205795288, 0.22143737077713013, 0.09927035570144653, 0.06417487263679504, 0.2943339288234711, 0.1856810450553894, 0.048224085569381715, 0.07560800909996032, 0.1954294800758362, 0.22143737077713013, 0.11648967862129211, 0.1531326115131378, 0.2031690001487732, 0.14105207920074464, 0.20138368010520935, 0.26318231225013733, 0.09289433360099793, 0.16757089495658875, 0.18792030811309815, 0.20214383006095887, 0.054546850919723514, 0.21114518642425537, 0.10422782301902771, 0.1210227906703949, 0.1002508282661438, 0.19361129999160767, 0.3329063892364502, 0.17842383980751036, 0.26788932681083677, 0.20689859390258789, 0.1385556697845459, 0.09504943490028381, 0.0713767647743225, 0.09155805706977845, 0.20662015676498413, 0.09082162380218506, 0.2263636529445648, 0.2943339288234711, 0.12961583137512206, 0.32856560349464414, 0.049546295404434205, 0.09032933712005616, 0.30195106863975524, 0.24248597025871277, 0.25832881331443786, 0.15774248242378236, 0.26330329179763795, 
0.10972844958305358, 0.2620315611362457, 0.12930552363395692, 0.23038687109947203, 0.026624709367752075, 0.14155877828598024, 0.20760102868080138, 0.18119154572486879, 0.09862520694732665, 0.18418713212013244, 0.031426644325256346, 0.1537289261817932, 0.23018404245376586, 0.1488548696041107, 0.14751260876655578, 0.24583064913749694, 0.2943339288234711, 0.21902262568473815, 0.14626285433769226, 0.15618699193000793, 0.21385310888290404, 0.07923253774642944, 0.05288533568382263, 0.1357314705848694, 0.08316851258277894, 0.17194377779960632, 0.051333832740783694, 0.16315961480140687, 0.11301332712173462, 0.13821958899497985, 0.04227041006088257, 0.13023970127105713, 0.04402931928634644, 0.0550550639629364, 0.30488539934158326, 0.12046017050743103, 0.20294120907783508, 0.2607896029949188, 0.24519628882408143, 0.2092129111289978, 0.14207690358161926, 0.2014531373977661, 0.1417696475982666, 0.23628079891204834, 0.15885791778564454, 0.05424953699111938, 0.15618699193000793, 0.026917821168899535, 0.12801412343978882, 0.19133716821670532, 0.19039098024368287, 0.12497220635414123, 0.08388009071350097, 0.2687165439128876, 0.15618699193000793, 0.1058310627937317, 0.18474913239479065, 0.06041879653930664, 0.2943339288234711, 0.13736398816108703, 0.2687165439128876, 0.050558412075042726, 0.17581526637077333, 0.09195072650909424, 0.19391862154006959, 0.17581526637077333, 0.2424494206905365, 0.30195106863975524, 0.14327287077903747, 0.22033972144126893, 0.14627864956855774, 0.028694087266922, 0.31550863981246946, 0.18408670425415039, 0.2152866005897522, 0.10920886993408203, 0.1462748944759369, 0.265926456451416, 0.041348469257354734, 0.18667010664939881, 0.05466875433921814, 0.06425246000289916, 0.26812600493431094, 0.09431666135787964, 0.1754552900791168, 0.19701056480407714, 0.27325833439826963, 0.0812505841255188, 0.08923358917236328, 0.08142293095588685, 0.08049794435501098, 0.08388690948486328, 0.11004357933998107, 0.16105413436889648, 0.20935619473457337, 0.2286216974258423, 
0.2687165439128876, 0.2943339288234711, 0.29934104084968566, 0.04505055546760559, 0.2687165439128876, 0.2687165439128876, 0.165482234954834, 0.2505683660507202, 0.2572839021682739, 0.13336470127105712, 0.10528640747070313, 0.18572738766670227, 0.02762399911880493, 0.28054004311561587, 0.2687165439128876, 0.2402168333530426, 0.21964013576507568, 0.20647518634796141, 0.12037063241004944, 0.23799507617950438, 0.18990106582641603, 0.23378283381462098, 0.0682909369468689, 0.09713414311408997, 0.2286216974258423, 0.313013356924057, 0.2574847638607025, 0.09879412055015564, 0.16634994745254517, 0.051359379291534425, 0.2650827825069427, 0.19282710552215576, 0.2687165439128876, 0.22142802476882933, 0.27086228132247925, 0.10801171064376831, 0.1440779447555542, 0.20884934067726135, 0.13752301335334777, 0.2809159100055695, 0.07321414351463318, 0.12935298681259155, 0.09392074346542359, 0.09345549941062928, 0.1448982834815979, 0.2368825912475586, 0.20223453640937805, 0.1077257513999939, 0.06970394849777221, 0.24059501886367798, 0.1531326115131378, 0.07047860026359558, 0.05411232113838196, 0.21195716857910157, 0.1774573504924774, 0.07035811543464661, 0.21114518642425537, 0.2943339288234711, 0.24689266681671143, 0.13023970127105713, 0.1622855007648468, 0.18095166683197023, 0.12736454010009765, 0.2687165439128876, 0.11606188416481018, 0.07301830649375915, 0.20024213194847107, 0.1914510667324066, 0.2687165439128876, 0.14800957441329957, 0.07394496202468873, 0.2943339288234711, 0.3224632441997528, 0.10174304246902466, 0.24306625127792358, 0.06830078363418579, 0.2943339288234711, 0.10397533774375915, 0.09609824419021606, 0.2799139261245728, 0.30195106863975524, 0.24489012360572815, 0.2943339288234711, 0.2669981956481934, 0.19226622581481934, 0.23796676993370056, 0.12403554916381836, 0.07518824338912963, 0.24027977585792543, 0.2129749596118927, 0.19538822770118713, 0.07956457734107972, 0.021501171588897704, 0.2117869734764099, 0.1697279155254364, 0.13646966218948364, 
0.04563586115837097, 0.13630935549736023, 0.2325596272945404, 0.27604236602783205, 0.0848526418209076, 0.12355437874794006, 0.08359813094139099, 0.07769800424575805, 0.19930691719055177, 0.017387312650680543, 0.25326520800590513, 0.14934924840927125, 0.05194996595382691, 0.22447794675827026, 0.050048929452896115, 0.1993529200553894, 0.10217436552047729, 0.11040244102478028, 0.13426822423934937, 0.24725902676582337, 0.0681905210018158, 0.22447794675827026, 0.2687165439128876, 0.1155886709690094, 0.2221108555793762, 0.29718998670578, 0.14218377470970153, 0.14780757427215577, 0.24868152141571045, 0.15012385249137877, 0.2957450091838837, 0.23570191264152526, 0.07039734721183777, 0.20243200063705444, 0.2450391411781311, 0.2610181331634521, 0.2943339288234711, 0.21168712973594667, 0.23322834372520446, 0.22336766123771667, 0.14155877828598024, 0.03834769129753113, 0.04359009861946106, 0.17089332342147828, 0.23494293093681334, 0.22307827472686767, 0.10335022807121277, 0.0901472270488739, 0.1275405168533325, 0.2651544511318207, 0.10194430351257325, 0.0823241114616394, 0.2671786665916443, 0.05228369235992432, 0.05673014521598816, 0.23628079891204834, 0.137525075674057, 0.13061318397521973, 0.24059501886367798, 0.1940344214439392, 0.18790475130081177, 0.0804525375366211, 0.1972542643547058, 0.13630935549736023, 0.04387810826301575, 0.12807478308677672, 0.16763864159584047, 0.27681214213371275, 0.16889337301254273, 0.10969911217689514, 0.1264793336391449, 0.2058880627155304, 0.13305887579917908, 0.21114518642425537, 0.11689011454582214, 0.13438963890075684, 0.25695239901542666, 0.1478006660938263, 0.10499387383460998, 0.18095166683197023, 0.07256574034690857, 0.2998500466346741, 0.0661353349685669, 0.15724987983703614, 0.16255935430526733, 0.10478451251983642, 0.16101192235946654, 0.2666783809661865, 0.20674362182617187, 0.2074684500694275, 0.2637452960014343, 0.2687165439128876, 0.2943339288234711, 0.10662007927894593, 0.15771008729934693, 0.2045162320137024, 
0.13224366307258606, 0.2799139261245728, 0.15469701290130616, 0.05253932476043701, 0.08710484504699707, 0.15890161395072938, 0.2323542058467865, 0.14316514134407043, 0.16188710927963257, 0.2687165439128876, 0.05047763586044311, 0.16098361611366271, 0.2687165439128876, 0.11001648306846619, 0.03917397260665893, 0.012268495559692384, 0.16453346014022827, 0.21224514842033387, 0.22161090970039368, 0.14690780639648438, 0.10857421159744263, 0.08916388154029846, 0.2687165439128876, 0.20792827010154724, 0.22307827472686767, 0.14967355728149415, 0.25636852383613584, 0.13464637994766235, 0.10542458891868592, 0.06425246000289916, 0.0630178153514862, 0.19448707699775697, 0.2340227782726288, 0.2687165439128876, 0.24868152141571045, 0.21385310888290404, 0.25854902267456054, 0.1818527340888977, 0.05183754563331604, 0.28751025199890134, 0.12577881813049316, 0.05676699876785278, 0.1897931694984436, 0.17065418362617493, 0.182553231716156, 0.028694087266922, 0.041348469257354734, 0.25440205335617067, 0.07136039137840271, 0.032577937841415404, 0.12833439707756042, 0.11628175973892212, 0.16251060962677003, 0.2943339288234711, 0.2687165439128876, 0.2554169952869415, 0.2056412637233734, 0.16429876685142517, 0.18444325923919677, 0.13646966218948364, 0.11304506659507751, 0.04368300437927246, 0.13392633199691772, 0.29578139781951907, 0.16492363810539246, 0.16255935430526733, 0.18444325923919677, 0.2693161189556122, 0.1712073028087616, 0.20294120907783508, 0.19065025448799133, 0.17955344915390015, 0.2943339288234711, 0.18558666110038757, 0.20904414653778075, 0.13144407868385316, 0.1312577486038208, 0.20102456212043762, 0.22140125632286073, 0.2943339288234711, 0.2687165439128876, 0.2687165439128876, 0.05089426636695862, 0.2081002414226532, 0.09939942359924317, 0.17711467146873475, 0.2813377916812897, 0.1275405168533325, 0.044525742530822754, 0.30195106863975524, 0.04257720708847046, 0.14626285433769226, 0.21185248494148254, 0.026778030395507812, 0.21441231966018676, 0.1540526568889618, 
0.03564638495445251, 0.13489956259727479, 0.19494746923446654, 0.1498823344707489, 0.1713216245174408, 0.2957450091838837, 0.1910253345966339, 0.047457408905029294, 0.2687165439128876, 0.0846400499343872, 0.24712690114974975, 0.21114518642425537, 0.22140125632286073, 0.18413348197937013, 0.17220293879508972, 0.08344451785087585, 0.10791766047477722, 0.2943339288234711, 0.11980748176574707, 0.17051582932472228, 0.29050675630569456, 0.07664370536804199, 0.14097718000411988, 0.1090990126132965, 0.20662015676498413, 0.1486163854598999, 0.24198567271232604, 0.22140125632286073, 0.06425246000289916, 0.06679590940475463, 0.08926873207092285, 0.20193864703178405, 0.165316379070282, 0.2243583619594574, 0.23749199509620667, 0.10233506560325623, 0.22188621163368225, 0.24299496412277222, 0.07004004716873169, 0.11663140654563904, 0.2263636529445648, 0.11847202777862549, 0.27681214213371275, 0.2652188241481781, 0.1246204912662506, 0.24221355319023133, 0.18734511733055115, 0.0993520438671112, 0.18324695229530336, 0.08314414024353027, 0.2687165439128876, 0.3097450017929077, 0.17438538670539855, 0.15710639357566833, 0.16711366772651673, 0.2575950860977173, 0.1641591429710388, 0.2686879813671112, 0.2435591459274292, 0.30788477063179015, 0.2328833520412445, 0.2943339288234711, 0.20351722836494446, 0.1339774787425995, 0.1500688910484314, 0.1537289261817932, 0.2687165439128876, 0.19867417812347413, 0.24798166155815124, 0.03163349628448486, 0.1450836956501007, 0.2454463243484497, 0.043240517377853394, 0.27681214213371275, 0.07196561098098755, 0.2328833520412445, 0.2183497130870819, 0.2235489785671234, 0.0924670159816742, 0.18189479112625123, 0.09548647999763489, 0.27252782583236695, 0.2687165439128876, 0.17735087275505065, 0.17673290371894837, 0.23877571821212767, 0.22149975895881652, 0.06357990503311158, 0.2586200892925262, 0.17438538670539855, 0.12340834140777587, 0.19728720784187317, 0.09735530018806457, 0.2269219696521759, 0.2687165439128876, 0.15445377230644225, 
0.22122191190719603, 0.07904585003852845, 0.265926456451416, 0.10367761254310608, 0.07003435492515564, 0.20137229561805725, 0.1289765179157257, 0.111567223072052, 0.0603985071182251, 0.2092129111289978, 0.25832881331443786, 0.2558863937854767, 0.10555083751678467, 0.2173720419406891, 0.0378653347492218, 0.11084820032119751, 0.20214383006095887, 0.08438714146614075, 0.2687165439128876, 0.11780235171318054, 0.2687165439128876, 0.19899271726608275, 0.09077228307723999, 0.1399840772151947, 0.14687377214431763, 0.105167555809021, 0.04305406212806702, 0.1743747115135193, 0.09195072650909424, 0.24299496412277222, 0.10525094270706177, 0.2801799297332764, 0.09630033373832703, 0.29934104084968566, 0.22234933376312255, 0.2687165439128876, 0.29934104084968566, 0.11002368330955506, 0.13696004152297975, 0.2943339288234711, 0.2575950860977173, 0.10142452120780945, 0.18189479112625123, 0.3224554419517517, 0.17229073643684387, 0.0851608693599701, 0.19139978289604187, 0.25695239901542666, 0.1909913182258606, 0.3245618581771851, 0.17286092638969422, 0.2687165439128876, 0.12159132361412048, 0.23253061175346373, 0.24027977585792543, 0.06982409954071045, 0.09837161302566529, 0.08640698194503785, 0.24048447608947754, 0.13646966218948364, 0.08163856863975524, 0.2243583619594574, 0.2687165439128876, 0.2476891040802002, 0.25287011861801145, 0.15283629298210144, 0.18016136884689332, 0.27681214213371275, 0.1574077844619751, 0.09922032356262207, 0.18095166683197023, 0.09856984615325928, 0.1839461386203766, 0.07881054282188416, 0.08569443821907044, 0.1804223418235779, 0.2763674437999725, 0.20935619473457337, 0.15054181814193726, 0.10751851201057434, 0.05906050801277161, 0.2687165439128876, 0.2476891040802002, 0.23040683269500734, 0.10335022807121277, 0.07769800424575805]} Generating reports at 2021-11-22 15:35:29.586247
# Install the `tree` utility in the notebook VM.
!apt-get install tree
# List the working directory (hidden entries included) with sizes, three levels deep,
# to inspect the downloaded dataset and the generated .reclist report folder.
!tree -a --du -h . -L 3
. ├── [ 16K] .config │ ├── [ 7] active_config │ ├── [ 0] config_sentinel │ ├── [4.1K] configurations │ │ └── [ 94] config_default │ ├── [ 5] gce │ ├── [ 3] .last_opt_in_prompt.yaml │ ├── [ 37] .last_survey_prompt.yaml │ ├── [ 135] .last_update_check.json │ └── [8.0K] logs │ └── [4.0K] 2021.11.18 ├── [1.9G] coveo_reclist │ └── [1.9G] coveo_sigir.zip ├── [ 19M] coveo_sigir.zip └── [ 12K] .reclist └── [8.0K] CoveoCartRecList └── [4.0K] P2VRecModel 1.9G used in 8 directories, 9 files
# Install and (re)load the watermark IPython extension.
!pip install -q watermark
%reload_ext watermark
# Print author, last-updated timestamp, machine info and the versions of the
# imported packages, for reproducibility.
%watermark -a "Sparsh A." -m -iv -u -t -d
Author: Sparsh A. Last updated: 2021-11-22 16:35:08 Compiler : GCC 7.5.0 OS : Linux Release : 5.4.104+ Machine : x86_64 Processor : x86_64 CPU cores : 2 Architecture: 64bit json : 2.0.9 numpy : 1.21.2 IPython : 5.5.0 gensim : 4.0.1 matplotlib: 3.4.3 networkx : 2.6.3 requests : 2.22.0
END