from collections import defaultdict
import numpy as np
import scipy as sp
import pandas as pd
from ipypb import track
from polara.evaluation import evaluation_engine as ee
from polara.evaluation.pipelines import random_grid, find_optimal_config
from lce import LCEModel, LCEModelItemColdStart
from data_preprocessing import (get_yahoo_music_data,
get_similarity_data,
prepare_data_model,
prepare_cold_start_data_model)
from utils import (report_results, save_results,
apply_config, print_data_stats,
save_training_time, save_cv_training_time)
# IPython notebook magic: render matplotlib figures inline.
%matplotlib inline
from polara.recommender import defaults
defaults.memory_hard_limit = 15 # allowed memory usage (Gb) during recommendations generation
defaults.max_test_workers = 6 # use this many parallel threads for evaluation, each using up to {memory_hard_limit} Gb of RAM
seed = 42
experiment_name = 'lce'
data_labels = ['YaMus']
# Shared hyperparameters used to initialize LCE in both scenarios.
init_config = dict(seed = seed,
                   max_iterations = 75,
                   alpha = 0.1,
                   beta = 0.05,
                   max_neighbours=10,
                   )
# FIX: the original used `dict.fromkeys(data_labels, {...})`, which makes every
# label share ONE inner dict -- mutating the config for one dataset would
# silently mutate it for all. A dict comprehension gives each label its own
# mapping. `init_config` itself remains deliberately shared between the
# standard ('LCE') and cold-start ('LCE(cs)') entries, exactly as written
# in the original.
lce_init_config = {label: {'LCE': init_config,      # standard scenario
                           'LCE(cs)': init_config}  # cold start
                   for label in data_labels}
# Stage-1 grid: explore regularization with rank fixed.
params = {
    'regularization': [1, 3, 10, 30],
    'rank': [100] # for initial tuning (exploration)
}
# Stage-2 grid: model coefficients tuned after regularization.
coeffs = {
    'alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
    'beta': [0, 0.05, 0.1, 0.3]
}
# Stage-3 grid: rank values swept once everything else is fixed.
ranks_grid = [1, 50, 100, 500, 750, 1000, 1500, 2000, 2500, 3000]
lce_ranks = {'YaMus': ranks_grid}
topk_values = [1, 3, 10, 20, 30]  # top-n cutoffs used in the final evaluation
target_metric = 'mrr'  # metric optimized during tuning
# Per-dataset holders, filled by the data-loading cell below.
data_dict = dict.fromkeys(data_labels)
meta_dict = dict.fromkeys(data_labels)
similarities = dict.fromkeys(data_labels)
feature_idx = dict.fromkeys(data_labels)
sim_indices = dict.fromkeys(data_labels)
all_data = [data_dict, similarities, sim_indices, meta_dict]
# Load the Yahoo! Music dataset for the standard scenario (single label).
lbl = 'YaMus'
data_dict[lbl], meta_dict[lbl] = get_yahoo_music_data('/data/recsys/yahoo_music/yamus_train0_rating5.gz',
meta_path='/data/recsys/yahoo_music/yamus_attrs.gz',
implicit=True,
pcore=5,
filter_data={'genreid': [0]}, # filter unknown genre
filter_no_meta=True)
# Build item similarity data from the metadata attributes.
similarities[lbl], sim_indices[lbl], feature_idx[lbl] = get_similarity_data(meta_dict[lbl])
# Sanity check: fraction of items with entirely empty metadata (cell output 0.0 below).
(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()
0.0
print_data_stats(data_labels, all_data)
YaMus {'userid': 183003, 'songid': 134059} density 0.09740952587383789 similarity matrix density 0.4576464914574314
def prepare_recommender_models(data_label, data_models, config):
    """Build the standard-scenario recommender list (a single LCE model)
    for `data_label` and apply the supplied configuration to it."""
    model = LCEModel(data_models[data_label],
                     item_features=meta_dict[data_label])
    model.method = 'LCE'
    recommenders = [model]
    apply_config(recommenders, config, data_label)
    return recommenders
def fine_tune_lce(model, params, label, ntrials=60, record_time_as=None):
    """Run a (randomized) grid search for `model` over `params`.

    Parameters
    ----------
    model : recommender with a `.method` attribute
    params : dict mapping parameter name -> list of candidate values
    label : dataset label, used for progress bars and time logging
    ntrials : number of random grid points to sample (0 appears to mean
        the full grid -- see `random_grid`; TODO confirm)
    record_time_as : optional tag; when given, per-config training times
        are saved under '{experiment_name}_{record_time_as}'

    Returns
    -------
    (model_config, model_scores) : dicts keyed by `model.method` with the
        best found configuration and the full score table, respectively.

    FIX: the original wrapped the time-recording call in
    `try: ... finally: return ...`, and a `return` inside `finally`
    silently discards *any* in-flight exception (even KeyboardInterrupt).
    The best-effort save is now explicit: failures are reported but do not
    prevent returning the tuning results.
    """
    param_grid, param_names = random_grid(params, n=ntrials)
    best_lce_config, lce_scores = find_optimal_config(model, param_grid, param_names,
                                                      target_metric,
                                                      return_scores=True,
                                                      force_build=True,
                                                      iterator=lambda x: track(x, label=label))
    model_config = {model.method: dict(zip(param_names, best_lce_config))}
    model_scores = {model.method: lce_scores}
    if record_time_as:
        try:
            save_training_time(f'{experiment_name}_{record_time_as}', model, lce_scores.index, label)
        except Exception as err:  # best effort: never lose the tuning results
            print(f'Failed to save training time for {label}: {err}')
    return model_config, model_scores
# Containers for tuned configurations, scores and data models (standard scenario).
config = {}
scores = {}
data_models = {}
# Inspect the initial LCE configuration (notebook cell output on the next line).
lce_init_config['YaMus']['LCE']
{'seed': 42, 'max_iterations': 75, 'alpha': 0.1, 'beta': 0.05, 'max_neighbours': 10}
# Stage 1: tune regularization (rank fixed at 100) for the standard scenario.
# The resulting `_config` is reused later by the cold-start pipeline.
_config = {}
_scores = {}
for label in track(data_labels):
data_models[label] = prepare_data_model(label, *all_data, seed)
model, = prepare_recommender_models(label, data_models, lce_init_config)
_config[label], _ = fine_tune_lce(model, params, label)
del model
_config # will also reuse it in coldstart
{'YaMus': {'LCE': {'regularization': 10, 'rank': 40}}}
# Stage 2: tune alpha/beta on top of the stage-1 regularization result.
for label in track(data_labels):
    data_models[label] = prepare_data_model(label, *all_data, seed)
    (model,) = prepare_recommender_models(label, data_models,
                                          [lce_init_config, _config])
    config[label], scores[label] = fine_tune_lce(model, coeffs, label)
    # merge the stage-1 parameters so the saved config is complete
    config[label][model.method].update(_config[label][model.method])
    del model
report_results('tuning', scores);
config
save_results(f'{experiment_name}_param', config=config, tuning=scores)
# Stage 3: sweep the full rank grid with all other hyperparameters fixed.
rank_config = {}
rank_scores = {}
for label in track(data_labels):
    (model,) = prepare_recommender_models(label, data_models,
                                          [lce_init_config, config])  # start from the optimal config
    rank_config[label], rank_scores[label] = fine_tune_lce(
        model, {'rank': lce_ranks[label]}, label,
        ntrials=0,  # exhaustive sweep of the rank grid
        record_time_as='rank')
    del model
report_results('rank', {lbl: v.sort_index() for lbl, scr in rank_scores.items() for k, v in scr.items()});
rank_config
save_results(f'{experiment_name}_rank', config=rank_config, tuning=rank_scores)
# Final evaluation (standard scenario): cross-validated top-n experiment.
result = {}
for label in track(data_labels):
    models = prepare_recommender_models(label, data_models,
                                        [lce_init_config, config, rank_config])
    result[label] = ee.run_cv_experiment(models,
                                         fold_experiment=ee.topk_test,
                                         topk_list=topk_values,
                                         ignore_feedback=True,
                                         iterator=lambda x: track(x, label=label))
    save_cv_training_time(experiment_name, models, label)
report_results('topn', result, target_metric);
pd.concat({lbl: res.mean(level='top-n').loc[10, :'ranking'] for lbl, res in result.items()}, axis=1)
save_results(experiment_name, cv=result)
def prepare_cold_start_recommender_models(data_label, data_models, config):
    """Build the item cold-start recommender list (a single LCE model)
    for `data_label` and apply the supplied configuration to it."""
    model = LCEModelItemColdStart(data_models[data_label],
                                  item_features=meta_dict[data_label])
    model.method = 'LCE(cs)'
    recommenders = [model]
    apply_config(recommenders, config, data_label)
    return recommenders
# Cold-start scenario: containers for tuned configurations and scores.
config_cold = {}
scores_cold = {}
data_models_cold = {}
# Inspect the initial cold-start configuration (notebook cell output on the next line).
lce_init_config['YaMus']['LCE(cs)']
{'seed': 42, 'max_iterations': 75, 'alpha': 0.1, 'beta': 0.05, 'max_neighbours': 10}
_config_cold = {}
# Cold-start tuning: reuse the stage-1 regularization (renamed to the
# 'LCE(cs)' method key), then tune alpha/beta on the cold-start data model.
for label in track(data_labels):
# reuse regularization param from standard scenario
_config_cold[label] = {f'{k}(cs)' if k=='LCE' else k: v for k, v in _config[label].items()}
data_models_cold[label] = prepare_cold_start_data_model(label, *all_data, seed)
model, = prepare_cold_start_recommender_models(label, data_models_cold, [lce_init_config, _config_cold])
config_cold[label], scores_cold[label] = fine_tune_lce(model, coeffs, label, record_time_as=None)
# make sure to save all parameters
config_cold[label][model.method].update(_config_cold[label][model.method])
del model
report_results('tuning', scores_cold);
/opt/conda/envs/py36/lib/python3.6/site-packages/pandas/plotting/_core.py:1001: UserWarning: Attempted to set non-positive left xlim on a log-scaled axis. Invalid limit will be ignored. ax.set_xlim(left, right)
config_cold
{'YaMus': {'LCE(cs)': {'alpha': 0.1, 'beta': 0.1, 'regularization': 10, 'rank': 40}}}
save_results(f'{experiment_name}_coldstart_param', config=config_cold, tuning=scores_cold)
# Cold-start rank sweep with the other hyperparameters fixed.
rank_config_cold = {}
rank_scores_cold = {}
for label in track(data_labels):
    (model,) = prepare_cold_start_recommender_models(label, data_models_cold,
                                                     [lce_init_config, config_cold])  # start from the optimal config
    rank_config_cold[label], rank_scores_cold[label] = fine_tune_lce(
        model, {'rank': lce_ranks[label]}, label,
        ntrials=0)  # exhaustive sweep of the rank grid
    del model
report_results('rank', {lbl: v.sort_index() for lbl, scr in rank_scores_cold.items() for k, v in scr.items()});
rank_config_cold
save_results(f'{experiment_name}_coldstart_rank', config=rank_config_cold, tuning=rank_scores_cold)
# Final evaluation (cold-start scenario): cross-validated top-n experiment.
result_cold = {}
for label in track(data_labels):
    models_cold = prepare_cold_start_recommender_models(label, data_models_cold,
                                                        [lce_init_config, config_cold, rank_config_cold])
    result_cold[label] = ee.run_cv_experiment(models_cold,
                                              fold_experiment=ee.topk_test,
                                              topk_list=topk_values,
                                              ignore_feedback=True,
                                              iterator=lambda x: track(x, label=label))
report_results('topn', result_cold, target_metric);
report_results('topn', result_cold, 'coverage');
save_results(f'{experiment_name}_coldstart', cv=result_cold)
1
1
1