#!/usr/bin/env python
# coding: utf-8

# # RePlay Tutorial
# This notebook is designed to familiarize you with the RePlay library, including
# - data preprocessing
# - data splitting
# - model training and inference
# - model optimization
# - model saving and loading
# - model comparison

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

get_ipython().run_line_magic('config', 'Completer.use_jedi = False')

# In[3]:

import warnings

from optuna.exceptions import ExperimentalWarning

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ExperimentalWarning)

# In[4]:

import pandas as pd
from pyspark.sql.functions import rand

from replay.data_preparator import DataPreparator
from replay.experiment import Experiment
from replay.metrics import Coverage, HitRate, NDCG, MAP
from replay.model_handler import save, load
from replay.models import ALSWrap, KNN, SLIM
from replay.session_handler import State
from replay.splitters import UserSplitter
from replay.utils import convert2spark

# In[5]:

K = 5
SEED = 1234

# ## 0. Data preprocessing
# We will use MovieLens 1M as an example.

# In[6]:

df = pd.read_csv(
    "data/ml1m_ratings.dat",
    sep="\t",
    names=["user_id", "item_id", "relevance", "timestamp"],
)
users = pd.read_csv(
    "data/ml1m_users.dat",
    sep="\t",
    names=["user_id", "gender", "age", "occupation", "zip_code"],
)

# ### 0.1. DataPreparator
# RePlay's internal data format is a Spark dataframe.
# You can pass either a Spark or a pandas dataframe as input. The ``user_id`` and ``item_id`` columns are required for the interaction matrix;
# the ``relevance`` and interaction ``timestamp`` columns are optional.
#
# The ``DataPreparator`` class converts dataframes to Spark format and preprocesses the data, including renaming/creating the required and optional interaction matrix columns, checking for nulls and parsing dates.
#
# To convert a pandas dataframe to Spark as is, use the ``convert2spark`` function from ``replay.utils``.

# In[7]:

preparator = DataPreparator()
log, _, _ = preparator(df)

# In[8]:

log.show(3)

# In[9]:

users = convert2spark(users)
users.show(3)

# ### 0.2. Split
# RePlay provides data splitters that reproduce validation schemas widely used in recommender systems.
#
# ``UserSplitter`` takes ``item_test_size`` items for each of ``user_test_size`` users into the test dataset.

# In[10]:

splitter = UserSplitter(
    drop_cold_items=True,
    drop_cold_users=True,
    item_test_size=K,
    user_test_size=500,
    seed=SEED,
    shuffle=True,
)
train, test = splitter.split(log)
print(train.count(), test.count())

# ## 1. Model training

# #### SLIM

# In[11]:

slim = SLIM(seed=SEED)

# In[12]:

get_ipython().run_cell_magic('time', '', '\nslim.fit(log=train)\n')

# In[13]:

get_ipython().run_cell_magic('time', '', "\nrecs = slim.predict(\n    k=K,\n    users=test.select('user_idx').distinct(),\n    log=train,\n    filter_seen_items=True\n)\n")

# In[14]:

recs.show(2)

# ## 2. Model evaluation
# RePlay implements popular recommendation quality metrics. Use the metrics on their own (a direct metric call is sketched below), or calculate a set of chosen metrics and compare models with the ``Experiment`` class.

# In[15]:

metrics = Experiment(test, {NDCG(): K,
                            MAP(): K,
                            HitRate(): [1, K],
                            Coverage(train): K})

# In[16]:

get_ipython().run_cell_magic('time', '', 'metrics.add_result("SLIM", recs)\nmetrics.results\n')
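# A minimal sketch of the "pure metric" usage mentioned above: metric objects are callable on a recommendations dataframe and a ground-truth dataframe. This assumes the ``metric(recommendations, ground_truth, k)`` call signature; check it against the RePlay version you use.

# In[ ]:

# hit rate at 1 and at K for the SLIM recommendations against the test set
HitRate()(recs, test, [1, K])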
# ## 3. Hyperparameters optimization

# #### 3.1 Search

# In[17]:

# data split for hyperparameters optimization
train_opt, val_opt = splitter.split(train)

# In[18]:

get_ipython().run_cell_magic('time', '', 'best_params = slim.optimize(train_opt, val_opt, criterion=NDCG(), k=K, budget=15)\n')

# In[19]:

best_params

# #### 3.2 Compare with previous

# In[20]:

def fit_predict_evaluate(model, experiment, name):
    """Fit the model on train, predict top-K items for test users and log metrics under `name`."""
    model.fit(log=train)
    recs = model.predict(
        k=K,
        users=test.select('user_idx').distinct(),
        log=train,
        filter_seen_items=True
    )
    experiment.add_result(name, recs)
    return recs

# In[21]:

get_ipython().run_cell_magic('time', '', "recs = fit_predict_evaluate(SLIM(**best_params, seed=SEED), metrics, 'SLIM_optimized')\nmetrics.results.sort_values('NDCG@5', ascending=False)\n")

# ### Convert to pandas

# In[22]:

recs_pd = recs.toPandas()
recs_pd.head(2)

# ## 4. Save and load
# RePlay allows you to save and load fitted models with the `save` and `load` functions of the `model_handler` module. A model is saved as a folder with all the necessary parameters and data.

# In[23]:

save(slim, path='./slim_best_params')
slim_loaded = load('./slim_best_params')

# In[24]:

get_ipython().run_cell_magic('time', '', "pred_from_loaded = slim_loaded.predict(k=K,\n                                       users=test.select('user_idx').distinct(),\n                                       log=train,\n                                       filter_seen_items=True)\npred_from_loaded.show(2)\n")

# In[25]:

# the optimized hyperparameters survive the save/load round trip
slim_loaded.beta, slim_loaded.lambda_

# ## 5. Other RePlay models

# #### ALS
# A commonly used matrix factorization algorithm.

# In[26]:

get_ipython().run_cell_magic('time', '', "recs = fit_predict_evaluate(ALSWrap(rank=100, seed=SEED), metrics, 'ALS')\nmetrics.results.sort_values('NDCG@5', ascending=False)\n")

# #### KNN
# A commonly used item-based recommender.

# In[27]:

get_ipython().run_cell_magic('time', '', "recs = fit_predict_evaluate(KNN(num_neighbours=100), metrics, 'KNN')\nmetrics.results.sort_values('NDCG@5', ascending=False)\n")

# ## 6. Compare RePlay models with others
# To evaluate recommendations obtained from other sources, read them and pass them to ``Experiment``; a sketch with external recommendations follows the demo below.

# In[29]:

# as a demo, re-register the latest RePlay recommendations under a new name
metrics.add_result("my_model", recs)
metrics.results.sort_values("NDCG@5", ascending=False)
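# A minimal sketch of evaluating truly external recommendations: load them into pandas, convert with ``convert2spark``, and add them to the same ``Experiment``. The file name ``external_recs.csv`` is hypothetical; the dataframe is assumed to reference the same ``user_idx``/``item_idx`` ids as ``test`` and to carry a ``relevance`` column.

# In[ ]:

# hypothetical file with columns: user_idx, item_idx, relevance
external_pd = pd.read_csv("external_recs.csv")
external_recs = convert2spark(external_pd)
metrics.add_result("external_model", external_recs)
metrics.results.sort_values("NDCG@5", ascending=False)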