%env MKL_NUM_THREADS=16
%env OMP_NUM_THREADS=16
env: MKL_NUM_THREADS=16 env: OMP_NUM_THREADS=16
from collections import defaultdict
import numpy as np
import scipy as sp
import pandas as pd
from ipypb import track
from polara.evaluation import evaluation_engine as ee
from polara.evaluation.pipelines import random_grid, find_optimal_config
from polara.recommender.coldstart.models import ItemColdStartEvaluationMixin
from polara.recommender.external.turi.turiwrapper import (TuriFactorizationRecommender,
ColdStartRecommendationsMixin)
from data_preprocessing import (get_movielens_data,
get_bookcrossing_data,
get_similarity_data,
prepare_data_model,
prepare_cold_start_data_model)
from utils import (report_results, save_results,
apply_config, print_data_stats,
save_training_time, save_cv_training_time)
%matplotlib inline
seed = 42
experiment_name = 'fm'
data_labels = ['ML1M', 'ML10M', 'BX']
# according to https://apple.github.io/turicreate/docs/api/generated/turicreate.recommender.ranking_factorization_recommender.RankingFactorizationRecommender.html
init_config = dict(with_data_feedback = False, # implicit case
                   ranking_optimization = True,
                   solver = 'adagrad',
                   sgd_step_size = 0, # let Turi autotune it
                   seed = seed,
                   max_iterations = 25,
                   other_tc_params = {}
                  )
# One *independent* copy of the base config per dataset and per scenario.
# The original used dict.fromkeys(data_labels, {...}), which binds a single
# shared inner dict (and the single init_config object) to every label, so
# any in-place update would silently leak across datasets and scenarios.
fm_init_config = {label: {'FM': dict(init_config),      # standard scenario
                          'FM(cs)': dict(init_config)}  # cold start
                  for label in data_labels}
# hyper-parameter search space for the randomized grid search
params = {
    'regularization': [1e-10, 3e-10, 1e-9, 3e-9, 1e-8, 3e-8, 1e-7, 3e-7, 1e-6, 3e-6],
    'linear_regularization': [1e-10, 3e-10, 1e-9, 3e-9, 1e-8, 3e-8, 1e-7, 3e-7, 1e-6, 3e-6],
    'rank': [40] # for initial tuning (exploration)
}
if init_config['solver'] == 'adagrad':
    # momentum weighting is only meaningful for the adagrad solver
    params.update({
        'adagrad_momentum_weighting': [0.9, 0.95, 0.99]
    })
ranks_grid = [1, 5, 10, 15, 20, 30, 50, 60, 75, 100, 125, 150, 200, 250, 300, 350, 400,
              500, 750, 1000, 1250, 1500, 1750, 2000, 2500, 3000]
# per-dataset caps on the rank grid explored during rank tuning
fm_ranks = {'ML1M': [r for r in ranks_grid if r <= 1000],
            'ML10M': [r for r in ranks_grid if r <= 1000],
            'BX': [r for r in ranks_grid if r <= 2000]}
topk_values = [1, 3, 10, 20, 30]
target_metric = 'mrr'
# per-dataset containers, populated by the data-loading cells below
data_dict = dict.fromkeys(data_labels)
meta_dict = dict.fromkeys(data_labels)
similarities = dict.fromkeys(data_labels)
sim_indices = dict.fromkeys(data_labels)
all_data = [data_dict, similarities, sim_indices, meta_dict]
# --- MovieLens-1M: load implicit-feedback interactions and item metadata ---
lbl = 'ML1M'
data_dict[lbl], meta_dict[lbl] = get_movielens_data('/mnt/bulky/datasets/recsys/movielens/ml-1m.zip',
                                                    meta_path='data/meta_info_ml1m.csv',
                                                    implicit=True,
                                                    filter_no_meta=True)
# not used actually, simply to conform with general pipeline
# (identity "similarity" matrix stands in for real item similarities)
itemid = meta_dict[lbl].index.name
sim_indices[lbl] = {itemid: meta_dict[lbl].index}
similarities[lbl] = {itemid: sp.sparse.eye(len(meta_dict[lbl].index))}
# --- MovieLens-10M: same loading recipe as ML1M ---
lbl = 'ML10M'
data_dict[lbl], meta_dict[lbl] = get_movielens_data('/mnt/bulky/datasets/recsys/movielens/ml-10m.zip',
                                                    meta_path='data/meta_info_ml10m.csv',
                                                    implicit=True,
                                                    filter_no_meta=True)
# not used actually, simply to conform with general pipeline
itemid = meta_dict[lbl].index.name
sim_indices[lbl] = {itemid: meta_dict[lbl].index}
similarities[lbl] = {itemid: sp.sparse.eye(len(meta_dict[lbl].index))}
# sanity check: fraction of items with completely empty metadata
# (should be 0.0 after filter_no_meta=True; captured output below)
(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()
0.0
# --- BookCrossing: 5-core filtered, implicit feedback, with book metadata ---
lbl = 'BX'
data_dict[lbl], meta_dict[lbl] = get_bookcrossing_data('/mnt/bulky/datasets/recsys/bookcrossing/BX-CSV-Dump.zip',
                                                       get_books_meta=True,
                                                       implicit=True,
                                                       pcore=5,
                                                       filter_no_meta=True)
/home/evfro/miniconda3/envs/polara_dev/lib/python3.6/site-packages/pandas/io/parsers.py:1990: DeprecationWarning: invalid escape sequence '\8' data = self._reader.read(nrows) /home/evfro/miniconda3/envs/polara_dev/lib/python3.6/site-packages/pandas/io/parsers.py:1990: DeprecationWarning: invalid escape sequence '\9' data = self._reader.read(nrows)
# not used actually, simply to conform with general pipeline
itemid = meta_dict[lbl].index.name
sim_indices[lbl] = {itemid: meta_dict[lbl].index}
similarities[lbl] = {itemid: sp.sparse.eye(len(meta_dict[lbl].index))}
# sanity check: fraction of items with completely empty metadata (expect 0.0)
(meta_dict[lbl].applymap(len).sum(axis=1)==0).mean()
0.0
# summary statistics for all three datasets (captured output below)
print_data_stats(data_labels, all_data)
ML1M {'userid': 6038, 'movieid': 3522} density 2.699052132255699 similarity matrix density 0.028392958546280524 ML10M {'userid': 69797, 'movieid': 10258} density 0.6991397242349022 similarity matrix density 0.009748488984207448 BX {'userid': 7160, 'isbn': 16273} density 0.18925598044812894 similarity matrix density 0.0005841769822585451
def prepare_recommender_models(data_label, data_models, config):
    """Build and configure the standard-scenario FM recommender for one dataset.

    Returns a single-element list so the result plugs into the same
    evaluation pipeline as multi-model experiments.
    """
    recommender = TuriFactorizationRecommender(data_models[data_label],
                                               item_side_info=meta_dict[data_label])
    recommender.method = 'FM'
    configured = [recommender]
    apply_config(configured, config, data_label)
    return configured
def fine_tune_fm(model, params, label, ntrials=60, record_time_as=None):
    """Run a randomized hyper-parameter search for `model` on dataset `label`.

    Parameters
    ----------
    model : recommender instance with a `.method` attribute
    params : dict of parameter-name -> list of candidate values
    label : dataset label, used for progress display and time bookkeeping
    ntrials : number of random configurations to sample (0 presumably means
        the full grid — confirm against `random_grid`)
    record_time_as : optional suffix for persisting per-config training times

    Returns
    -------
    (model_config, model_scores) : dicts keyed by `model.method` holding the
        best found configuration and the full score table, respectively.
    """
    param_grid, param_names = random_grid(params, n=ntrials)
    best_fm_config, fm_scores = find_optimal_config(model, param_grid, param_names,
                                                    target_metric,
                                                    return_scores=True,
                                                    force_build=True,
                                                    iterator=lambda x: track(x, label=label))
    model_config = {model.method: dict(zip(param_names, best_fm_config))}
    model_scores = {model.method: fm_scores}
    if record_time_as:
        # Best-effort bookkeeping: failing to persist timings must not discard
        # the tuning results. The original used `return` inside `finally`,
        # which silently swallows *any* exception raised in the try block,
        # including KeyboardInterrupt/SystemExit; catch only Exception.
        try:
            save_training_time(f'{experiment_name}_{record_time_as}',
                               model, fm_scores.index, label)
        except Exception:
            pass
    return model_config, model_scores
# per-dataset containers for the standard-scenario experiment
config = {}       # best hyper-parameter configuration per dataset
scores = {}       # full tuning score tables per dataset
times = {}        # training-time bookkeeping
data_models = {}  # polara data models per dataset
# inspect the shared base config (captured output below)
fm_init_config['ML1M']['FM']
{'with_data_feedback': False, 'ranking_optimization': True, 'solver': 'adagrad', 'sgd_step_size': 0, 'seed': 42, 'max_iterations': 25, 'other_tc_params': {}}
# --- stage 1: regularization/momentum tuning at fixed rank=40 ---
for label in track(data_labels):
    data_models[label] = prepare_data_model(label, *all_data, seed)
    model, = prepare_recommender_models(label, data_models, fm_init_config)
    config[label], scores[label] = fine_tune_fm(model, params, label, ntrials=60, record_time_as='param')
    del model  # free the trained model before moving to the next dataset
report_results('tuning', scores);
/home/evfro/miniconda3/envs/polara_dev/lib/python3.6/site-packages/pandas/plotting/_core.py:998: UserWarning: Attempted to set non-positive left xlim on a log-scaled axis. Invalid limit will be ignored. ax.set_xlim(left, right)
config
{'ML1M': {'FM': {'regularization': 3e-08, 'linear_regularization': 1e-10, 'rank': 40, 'adagrad_momentum_weighting': 0.99}}, 'ML10M': {'FM': {'regularization': 1e-06, 'linear_regularization': 3e-07, 'rank': 40, 'adagrad_momentum_weighting': 0.99}}, 'BX': {'FM': {'regularization': 1e-06, 'linear_regularization': 3e-08, 'rank': 40, 'adagrad_momentum_weighting': 0.99}}}
save_results(f'{experiment_name}_param', config=config, tuning=scores)
# --- stage 2: rank tuning, starting from the stage-1 optimal configs ---
rank_config = {}
rank_scores = {}
for label in track(data_labels):
    model, = prepare_recommender_models(label, data_models,
                                        [fm_init_config, config]) # initiate with optimal config
    # ntrials=0 presumably makes random_grid return the full rank grid — confirm
    rank_config[label], rank_scores[label] = fine_tune_fm(model, {'rank': fm_ranks[label]},
                                                          label, ntrials=0, record_time_as='rank')
    del model
# each scr dict has a single key ('FM'), so lbl maps to its sorted score table
report_results('rank', {lbl: v.sort_index() for lbl, scr in rank_scores.items() for k, v in scr.items()});
rank_config
{'ML1M': {'FM': {'rank': 750}}, 'ML10M': {'FM': {'rank': 750}}, 'BX': {'FM': {'rank': 750}}}
save_results(f'{experiment_name}_rank', config=rank_config, tuning=rank_scores)
# --- stage 3: final cross-validated top-n evaluation with tuned configs ---
result = {}
for label in track(data_labels):
    models = prepare_recommender_models(label, data_models, [fm_init_config, config, rank_config])
    result[label] = ee.run_cv_experiment(models,
                                         fold_experiment=ee.topk_test,
                                         topk_list=topk_values,
                                         ignore_feedback=True,
                                         iterator=lambda x: track(x, label=label))
    save_cv_training_time(experiment_name, models, label)
report_results('topn', result, target_metric);
# average fold scores at top-10, relevance/ranking metrics only (output below)
pd.concat({lbl: res.mean(level='top-n').loc[10, :'ranking'] for lbl, res in result.items()}, axis=1)
BX | ML10M | ML1M | ||
---|---|---|---|---|
type | metric | |||
relevance | hr | 0.061034 | 0.257876 | 0.190189 |
ranking | mrr | 0.024515 | 0.109986 | 0.082559 |
save_results(experiment_name, cv=result)
class TuriRecommenderColdStart(ItemColdStartEvaluationMixin,
                               ColdStartRecommendationsMixin,
                               TuriFactorizationRecommender):
    """FM recommender composed with item cold-start evaluation and
    cold-start recommendation generation; no new behavior of its own."""
def prepare_cold_start_recommender_models(data_label, data_models, config):
    """Build and configure the cold-start FM recommender for one dataset.

    Mirrors `prepare_recommender_models`, but uses the cold-start model
    class and the 'FM(cs)' method label.
    """
    recommender = TuriRecommenderColdStart(data_models[data_label],
                                           item_side_info=meta_dict[data_label])
    recommender.method = 'FM(cs)'
    configured = [recommender]
    apply_config(configured, config, data_label)
    return configured
# --- cold-start stage 1: hyper-parameter tuning at fixed rank=40 ---
config_cold = {}
scores_cold = {}
data_models_cold = {}
for label in track(data_labels):
    data_models_cold[label] = prepare_cold_start_data_model(label, *all_data, seed)
    model, = prepare_cold_start_recommender_models(label, data_models_cold, fm_init_config)
    config_cold[label], scores_cold[label] = fine_tune_fm(model, params, label, ntrials=60)
    del model
report_results('tuning', scores_cold);
config_cold
{'ML1M': {'FM(cs)': {'regularization': 1e-10, 'linear_regularization': 1e-06, 'rank': 40, 'adagrad_momentum_weighting': 0.99}}, 'ML10M': {'FM(cs)': {'regularization': 3e-06, 'linear_regularization': 1e-06, 'rank': 40, 'adagrad_momentum_weighting': 0.95}}, 'BX': {'FM(cs)': {'regularization': 3e-06, 'linear_regularization': 3e-07, 'rank': 40, 'adagrad_momentum_weighting': 0.95}}}
save_results(f'{experiment_name}_coldstart_param', config=config_cold, tuning=scores_cold)
# --- cold-start stage 2: rank tuning from stage-1 optimal configs ---
rank_config_cold = {}
rank_scores_cold = {}
for label in track(data_labels):
    model, = prepare_cold_start_recommender_models(label, data_models_cold,
                                                   [fm_init_config, config_cold]) # initiate with optimal config
    # ntrials=0 presumably yields the full rank grid — confirm against random_grid
    rank_config_cold[label], rank_scores_cold[label] = fine_tune_fm(model, {'rank': fm_ranks[label]},
                                                                   label, ntrials=0)
    del model
# each scr dict has a single key ('FM(cs)'), so lbl maps to its sorted score table
report_results('rank', {lbl: v.sort_index() for lbl, scr in rank_scores_cold.items() for k, v in scr.items()});
rank_config_cold
{'ML1M': {'FM(cs)': {'rank': 5}}, 'ML10M': {'FM(cs)': {'rank': 75}}, 'BX': {'FM(cs)': {'rank': 150}}}
save_results(f'{experiment_name}_coldstart_rank', config=rank_config_cold, tuning=rank_scores_cold)
# --- cold-start stage 3: final cross-validated top-n evaluation ---
result_cold = {}
for label in track(data_labels):
    models_cold = prepare_cold_start_recommender_models(label, data_models_cold,
                                                        [fm_init_config, config_cold, rank_config_cold])
    result_cold[label] = ee.run_cv_experiment(models_cold,
                                              fold_experiment=ee.topk_test,
                                              topk_list=topk_values,
                                              ignore_feedback=True,
                                              iterator=lambda x: track(x, label=label))
report_results('topn', result_cold, target_metric);
# coverage matters in cold start: how many distinct items get recommended
report_results('topn', result_cold, 'coverage');
save_results(f'{experiment_name}_coldstart', cv=result_cold)