# IPython setup: auto-reload edited modules and disable jedi tab-completion.
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False
import warnings
from optuna.exceptions import ExperimentalWarning
# Silence noisy warning categories raised during training/optimization.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ExperimentalWarning)
# K: length of each recommendation list; SEED: RNG seed for reproducibility.
K=10
SEED=1234
We will use the MovieLens 10M dataset from the rs_datasets package, which provides a collection of recommendation datasets.
from rs_datasets import MovieLens
# Load the MovieLens 10M dataset (ratings, items, and tags tables).
data = MovieLens("10m")
data.info()
ratings
user_id | item_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 122 | 5.0 | 838985046 |
1 | 1 | 185 | 5.0 | 838983525 |
2 | 1 | 231 | 5.0 | 838983392 |
items
item_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
tags
user_id | item_id | tag | timestamp | |
---|---|---|---|---|
0 | 15 | 4973 | excellent! | 1215184630 |
1 | 20 | 1747 | politics | 1188263867 |
2 | 20 | 1747 | satire | 1188263867 |
from replay.data_preparator import DataPreparator

# Map the raw ratings columns onto RePlay's canonical interaction-log schema
# (user_id / item_id / relevance / timestamp).
column_mapping = {
    "user_id": "user_id",
    "item_id": "item_id",
    "relevance": "rating",
    "timestamp": "timestamp",
}
log = DataPreparator().transform(data=data.ratings, columns_names=column_mapping)
from replay.splitters import UserSplitter

# Hold out K random interactions for each of 500 randomly chosen users;
# drop cold users/items that would only appear in the test part.
splitter_params = dict(
    item_test_size=K,
    user_test_size=500,
    drop_cold_items=True,
    drop_cold_users=True,
    shuffle=True,
    seed=SEED,
)
user_random_splitter = UserSplitter(**splitter_params)
train, test = user_random_splitter.split(log)
train.count(), test.count()
(9995054, 5000)
# Carve a validation split out of the training data for hyperparameter tuning.
train_opt, val_opt = user_random_splitter.split(train)
train_opt.count(), val_opt.count()
(9990054, 5000)
%%time
# Convert the items table into a Spark DataFrame with a canonical item_id column;
# the remaining columns (title, genres) are carried through unchanged.
item_features = DataPreparator().transform(
    data=data.items,
    columns_names={
        "item_id": "item_id"
    }
)
CPU times: user 37.6 ms, sys: 4.1 ms, total: 41.7 ms Wall time: 215 ms
# Peek at the prepared item-features table.
item_features.show(2)
+-------+--------------------+----------------+ |item_id| genres| title| +-------+--------------------+----------------+ | 1|Adventure|Animati...|Toy Story (1995)| | 2|Adventure|Childre...| Jumanji (1995)| +-------+--------------------+----------------+ only showing top 2 rows
from pyspark.sql import functions as sf
from pyspark.sql.types import IntegerType

# Extract the release year from titles shaped like "Toy Story (1995)".
# An anchored regexp is more robust than a fixed-offset substring(-5, 4):
# it tolerates trailing whitespace and returns an empty match (null after the
# int cast) for titles without a "(YYYY)" suffix, instead of garbage offsets.
year = (
    item_features
    .withColumn(
        'year',
        sf.regexp_extract(sf.col('title'), r'\((\d{4})\)\s*$', 1).astype(IntegerType()),
    )
    .select('item_id', 'year')
)
year.show(2)
+-------+----+ |item_id|year| +-------+----+ | 1|1995| | 2|1995| +-------+----+ only showing top 2 rows
from replay.session_handler import State
from pyspark.sql.functions import split

# Turn the pipe-separated genre string into an array column.
# The split pattern is a regex, so '|' must be escaped — and the escape must
# live in a raw string: "\|" is an invalid string escape sequence in Python
# (DeprecationWarning, and a SyntaxWarning from Python 3.12).
genres = (
    State().session.createDataFrame(data.items[["item_id", "genres"]])
    .select(
        "item_id",
        split("genres", r"\|").alias("genres")
    )
)
genres.show()
+-------+--------------------+ |item_id| genres| +-------+--------------------+ | 1|[Adventure, Anima...| | 2|[Adventure, Child...| | 3| [Comedy, Romance]| | 4|[Comedy, Drama, R...| | 5| [Comedy]| | 6|[Action, Crime, T...| | 7| [Comedy, Romance]| | 8|[Adventure, Child...| | 9| [Action]| | 10|[Action, Adventur...| | 11|[Comedy, Drama, R...| | 12| [Comedy, Horror]| | 13|[Animation, Child...| | 14| [Drama]| | 15|[Action, Adventur...| | 16| [Crime, Drama]| | 17|[Comedy, Drama, R...| | 18|[Comedy, Drama, T...| | 19| [Comedy]| | 20|[Action, Comedy, ...| +-------+--------------------+ only showing top 20 rows
from pyspark.sql.functions import explode

# Collect the set of distinct genre labels, excluding the placeholder entry.
exploded = genres.select(explode("genres").alias("genre"))
unique_genres = exploded.distinct().filter('genre <> "(no genres listed)"')
genres_list = unique_genres.toPandas()["genre"].tolist()
genres_list
['Documentary', 'IMAX', 'Adventure', 'Animation', 'Comedy', 'Thriller', 'Sci-Fi', 'Musical', 'Horror', 'Action', 'Fantasy', 'War', 'Mystery', 'Drama', 'Film-Noir', 'Crime', 'Western', 'Romance', 'Children']
from pyspark.sql.functions import col, lit, array_contains
from pyspark.sql.types import IntegerType

# One-hot encode genres: one 0/1 integer column per genre label.
item_features = genres
for g in genres_list:
    flag = array_contains(col("genres"), g).astype(IntegerType())
    item_features = item_features.withColumn(g, flag)
item_features = item_features.drop("genres").cache()
item_features.count()
10681
# Inspect the one-hot encoded genre columns.
item_features.show(2)
+-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+ |item_id|Documentary|IMAX|Adventure|Animation|Comedy|Thriller|Sci-Fi|Musical|Horror|Action|Fantasy|War|Mystery|Drama|Film-Noir|Crime|Western|Romance|Children| +-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+ | 1| 0| 0| 1| 1| 1| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| | 2| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| +-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+ only showing top 2 rows
# Attach the release year to the genre features and cache the final table.
item_features = item_features.join(year, on='item_id', how='inner').cache()
item_features.count()
10681
from replay.models import LightFMWrap
# LightFM model with WARP loss and 128 latent components, seeded for reproducibility.
model_feat = LightFMWrap(random_state=SEED, loss='warp', no_components=128)
%%time
# Train on the full training log, using the genre/year item features.
model_feat.fit(train, item_features=item_features)
CPU times: user 13h 43min 26s, sys: 59.3 s, total: 13h 44min 25s Wall time: 18min 55s
%%time
# Generate top-K recommendations for the test users, excluding items each
# user has already interacted with in the training log.
recs = model_feat.predict(
    k=K,
    users=test.select('user_id').distinct(),
    log=train,
    filter_seen_items=True,
    item_features=item_features
)
CPU times: user 8.44 s, sys: 2.99 s, total: 11.4 s Wall time: 1min
from replay.metrics import HitRate, NDCG, MAP, Coverage
from replay.experiment import Experiment

# Evaluate the recommendations at cutoff K (plus HitRate@1 and Coverage on train).
metric_config = {
    NDCG(): K,
    MAP(): K,
    HitRate(): [1, K],
    Coverage(train): K,
}
metrics = Experiment(test, metric_config)
metrics.add_result("LightFM_item_features", recs)
metrics.results
Coverage@10 | HitRate@1 | HitRate@10 | MAP@10 | NDCG@10 | |
---|---|---|---|---|---|
LightFM_item_features | 0.07193 | 0.348 | 0.796 | 0.113104 | 0.221282 |