# IPython setup: auto-reload edited modules and disable jedi tab-completion.
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False
import warnings
from optuna.exceptions import ExperimentalWarning
# Silence noisy warning categories raised during training/optimization.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ExperimentalWarning)
# K: length of each recommendation list; SEED: RNG seed for reproducibility.
K=10
SEED=1234
We will use the MovieLens 10M dataset from the rs_datasets package, which provides a collection of recommendation datasets.
from rs_datasets import MovieLens
# Load the MovieLens 10M dataset (ratings, items, and tags tables).
data = MovieLens("10m")
data.info()
ratings
user_id | item_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 122 | 5.0 | 838985046 |
1 | 1 | 185 | 5.0 | 838983525 |
2 | 1 | 231 | 5.0 | 838983392 |
items
item_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
tags
user_id | item_id | tag | timestamp | |
---|---|---|---|---|
0 | 15 | 4973 | excellent! | 1215184630 |
1 | 20 | 1747 | politics | 1188263867 |
2 | 20 | 1747 | satire | 1188263867 |
from replay.data_preparator import DataPreparator

# Map the raw ratings columns onto RePlay's canonical interaction-log schema
# (user_id / item_id / relevance / timestamp).
column_mapping = {
    "user_id": "user_id",
    "item_id": "item_id",
    "relevance": "rating",
    "timestamp": "timestamp",
}
log = DataPreparator().transform(data=data.ratings, columns_names=column_mapping)
from replay.splitters import UserSplitter

# Hold out K random interactions for each of 500 randomly chosen users;
# drop cold users/items that would only appear in the test part.
splitter_params = dict(
    item_test_size=K,
    user_test_size=500,
    drop_cold_items=True,
    drop_cold_users=True,
    shuffle=True,
    seed=SEED,
)
user_random_splitter = UserSplitter(**splitter_params)
train, test = user_random_splitter.split(log)
train.count(), test.count()
(9995054, 5000)
# Carve a validation split out of the training data for hyperparameter tuning.
train_opt, val_opt = user_random_splitter.split(train)
train_opt.count(), val_opt.count()
(9990054, 5000)
%%time
# Convert the items table into a Spark DataFrame with a canonical item_id column;
# the remaining columns (title, genres) are carried through unchanged.
item_features = DataPreparator().transform(
    data=data.items,
    columns_names={
        "item_id": "item_id"
    }
)
CPU times: user 37.6 ms, sys: 4.1 ms, total: 41.7 ms Wall time: 215 ms
# Peek at the prepared item-features table.
item_features.show(2)
+-------+--------------------+----------------+ |item_id| genres| title| +-------+--------------------+----------------+ | 1|Adventure|Animati...|Toy Story (1995)| | 2|Adventure|Childre...| Jumanji (1995)| +-------+--------------------+----------------+ only showing top 2 rows
from pyspark.sql import functions as sf
from pyspark.sql.types import IntegerType

# Extract the release year from titles shaped like "Toy Story (1995)".
# An anchored regexp is more robust than a fixed-offset substring(-5, 4):
# it tolerates trailing whitespace and returns an empty match (null after the
# int cast) for titles without a "(YYYY)" suffix, instead of garbage offsets.
year = (
    item_features
    .withColumn(
        'year',
        sf.regexp_extract(sf.col('title'), r'\((\d{4})\)\s*$', 1).astype(IntegerType()),
    )
    .select('item_id', 'year')
)
year.show(2)
+-------+----+ |item_id|year| +-------+----+ | 1|1995| | 2|1995| +-------+----+ only showing top 2 rows
from replay.session_handler import State
from pyspark.sql.functions import split

# Turn the pipe-separated genre string into an array column.
# The split pattern is a regex, so '|' must be escaped — and the escape must
# live in a raw string: "\|" is an invalid string escape sequence in Python
# (DeprecationWarning, and a SyntaxWarning from Python 3.12).
genres = (
    State().session.createDataFrame(data.items[["item_id", "genres"]])
    .select(
        "item_id",
        split("genres", r"\|").alias("genres")
    )
)
genres.show()
+-------+--------------------+ |item_id| genres| +-------+--------------------+ | 1|[Adventure, Anima...| | 2|[Adventure, Child...| | 3| [Comedy, Romance]| | 4|[Comedy, Drama, R...| | 5| [Comedy]| | 6|[Action, Crime, T...| | 7| [Comedy, Romance]| | 8|[Adventure, Child...| | 9| [Action]| | 10|[Action, Adventur...| | 11|[Comedy, Drama, R...| | 12| [Comedy, Horror]| | 13|[Animation, Child...| | 14| [Drama]| | 15|[Action, Adventur...| | 16| [Crime, Drama]| | 17|[Comedy, Drama, R...| | 18|[Comedy, Drama, T...| | 19| [Comedy]| | 20|[Action, Comedy, ...| +-------+--------------------+ only showing top 20 rows
from pyspark.sql.functions import explode

# Collect the set of distinct genre labels, excluding the placeholder entry.
exploded = genres.select(explode("genres").alias("genre"))
unique_genres = exploded.distinct().filter('genre <> "(no genres listed)"')
genres_list = unique_genres.toPandas()["genre"].tolist()
genres_list
['Documentary', 'IMAX', 'Adventure', 'Animation', 'Comedy', 'Thriller', 'Sci-Fi', 'Musical', 'Horror', 'Action', 'Fantasy', 'War', 'Mystery', 'Drama', 'Film-Noir', 'Crime', 'Western', 'Romance', 'Children']
from pyspark.sql.functions import col, lit, array_contains
from pyspark.sql.types import IntegerType

# One-hot encode genres: one 0/1 integer column per genre label.
item_features = genres
for g in genres_list:
    flag = array_contains(col("genres"), g).astype(IntegerType())
    item_features = item_features.withColumn(g, flag)
item_features = item_features.drop("genres").cache()
item_features.count()
10681
# Inspect the one-hot encoded genre columns.
item_features.show(2)
+-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+ |item_id|Documentary|IMAX|Adventure|Animation|Comedy|Thriller|Sci-Fi|Musical|Horror|Action|Fantasy|War|Mystery|Drama|Film-Noir|Crime|Western|Romance|Children| +-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+ | 1| 0| 0| 1| 1| 1| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| | 2| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| 0| 0| 0| 0| 0| 0| 0| 1| +-------+-----------+----+---------+---------+------+--------+------+-------+------+------+-------+---+-------+-----+---------+-----+-------+-------+--------+ only showing top 2 rows
# Attach the release year to the genre features and cache the final table.
item_features = item_features.join(year, on='item_id', how='inner').cache()
item_features.count()
10681
from replay.models import LightFMWrap
# LightFM model with WARP loss and 128 latent components, seeded for reproducibility.
model_feat = LightFMWrap(random_state=SEED, loss='warp', no_components=128)
%%time
# Train on the full training log, using the genre/year item features.
model_feat.fit(train, item_features=item_features)
CPU times: user 13h 43min 26s, sys: 59.3 s, total: 13h 44min 25s Wall time: 18min 55s
%%time
# Generate top-K recommendations for the test users, excluding items each
# user has already interacted with in the training log.
recs = model_feat.predict(
    k=K,
    users=test.select('user_id').distinct(),
    log=train,
    filter_seen_items=True,
    item_features=item_features
)
CPU times: user 8.44 s, sys: 2.99 s, total: 11.4 s Wall time: 1min
from replay.metrics import HitRate, NDCG, MAP, Coverage
from replay.experiment import Experiment

# Evaluate the recommendations at cutoff K (plus HitRate@1 and Coverage on train).
metric_config = {
    NDCG(): K,
    MAP(): K,
    HitRate(): [1, K],
    Coverage(train): K,
}
metrics = Experiment(test, metric_config)
metrics.add_result("LightFM_item_features", recs)
metrics.results
Coverage@10 | HitRate@1 | HitRate@10 | MAP@10 | NDCG@10 | |
---|---|---|---|---|---|
LightFM_item_features | 0.07193 | 0.348 | 0.796 | 0.113104 | 0.221282 |