!pip install --upgrade pip setuptools wheel
!git clone https://github.com/benfred/implicit
!cd implicit && pip install .
!pip install -q catboost
!pip install recohut
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
import random
import datetime
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from implicit import nearest_neighbours as NN
from implicit.nearest_neighbours import TFIDFRecommender
from catboost import CatBoostClassifier
from recohut.datasets.mts import MTSDataset
from recohut.utils.common_utils import get_coo_matrix
from recohut.transforms.splitting import TimeRangeSplit
from recohut.models.itempop import ItemPop as PopularRecommender
ds = MTSDataset(data_dir='/content/data', sample_frac=0.1)
users_df = pd.read_csv(os.path.join(ds.processed_dir, 'users_processed.csv'))
items_df = pd.read_csv(os.path.join(ds.processed_dir, 'items_processed.csv'))
interactions_df = pd.read_csv(os.path.join(ds.processed_dir, 'interactions_processed.csv'))
interactions_df['last_watch_dt'] = pd.to_datetime(interactions_df['last_watch_dt'])
interactions_df.sort_values(by='last_watch_dt', inplace=True)
This solution is a two-stage model. Item-item collaborative filtering from the implicit library generates candidates with their scores, and a CatBoost classifier with a classification objective predicts the final ranks. Recommendations for cold users are made with popular items.
The implicit model parameters were chosen with sliding-time-window cross-validation. The best scores were achieved by the Cosine recommender, using only the last 20 interactions per user. For each user, 100 candidates with their scores were generated, filtering out all items the user had already interacted with.
Implicit candidates were calculated for the last 14 days of interactions. The CatBoost model was then trained on the positive interactions from the candidate lists over those 14 days, with random negative sampling.
For the final submission, implicit candidates and CatBoost predictions were recalculated on the whole dataset.
Ref: Daria
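As a quick sanity check of the first stage, here is a toy run of the implicit Cosine recommender on a tiny hand-made matrix. This is illustrative only; it mirrors the fit/recommend calls used below (the exact return format of recommend depends on the implicit version).
# Toy stage-one check: fit CosineRecommender on a 3-user x 4-item matrix
# and request 2 candidates for user 0 (illustrative, not the competition data)
import scipy.sparse as sp
from implicit.nearest_neighbours import CosineRecommender

toy_mat = sp.csr_matrix([[1, 1, 0, 0],
                         [0, 1, 1, 0],
                         [1, 0, 1, 1]], dtype='float32')
toy_model = CosineRecommender(K=2)
toy_model.fit(toy_mat.T)                     # the 0.4.x API expects an item-user matrix
print(toy_model.recommend(0, toy_mat, N=2))  # candidate items with cosine scores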
# Creating items and users mapping
users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}
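The contiguous 0..N-1 mappings are needed because scipy sparse matrices are indexed by position, while the raw user and item ids are large and non-contiguous. For illustration, a minimal sketch of what a helper like get_coo_matrix presumably builds (the real implementation lives in recohut.utils.common_utils):
def coo_matrix_sketch(df, users_mapping, items_mapping, weight_col=None):
    # Interaction weights default to 1.0, i.e. binary implicit feedback
    weights = df[weight_col].values if weight_col else np.ones(len(df), dtype=np.float32)
    rows = df['user_id'].map(users_mapping)
    cols = df['item_id'].map(items_mapping)
    return sp.coo_matrix((weights, (rows, cols)),
                         shape=(len(users_mapping), len(items_mapping)))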
# Preparing data
last_date_df = interactions_df['last_watch_dt'].max()
boosting_split_date = last_date_df - pd.Timedelta(days=14)
boosting_data = interactions_df[(interactions_df['last_watch_dt'] >
boosting_split_date)].copy()
boost_idx = boosting_data['user_id'].unique()
before_boosting = interactions_df[(interactions_df['last_watch_dt'] <=
boosting_split_date)].copy()
before_boosting_known_items = before_boosting.groupby(
    'user_id')['item_id'].apply(list).to_dict()
before_boosting_known_items_mapped = {}
for user, recommend in before_boosting_known_items.items():
    before_boosting_known_items_mapped[user] = list(
        map(lambda x: items_mapping[x], recommend))
before_boosting['order_from_recent'] = before_boosting.sort_values(
by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1
boost_warm_idx = np.intersect1d(before_boosting['user_id'].unique(),
boosting_data['user_id'].unique())
This block calculates top candidates from the implicit model along with their scores. The implicit parameters were chosen with time-range-split cross-validation. The history offset means taking only the last X items from a user's history; the day offset means taking items from the last X days of the user's history.
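The history offset relies on a cumcount trick that is easiest to see on toy data first (illustrative only):
# Rank each user's interactions from newest (1) to oldest, then keep the newest ones
toy = pd.DataFrame({'user_id': [1, 1, 1, 2],
                    'last_watch_dt': pd.to_datetime(['2021-08-01', '2021-08-05',
                                                     '2021-08-10', '2021-08-03'])})
toy['order_from_recent'] = toy.sort_values(
    by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1
toy[toy['order_from_recent'] < 3]  # user 1 keeps only its two newest rows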
k_neighbours = 200
day_offset = 170
history_offset = 20
distance = 'Cosine'
num_candidates = 100
before_boosting['order_from_recent'] = before_boosting.sort_values(
by=['last_watch_dt'], ascending=False).groupby('user_id').cumcount() + 1
train = before_boosting.copy()
date_window = train['last_watch_dt'].max() - pd.DateOffset(days=day_offset)
train = train[train['last_watch_dt'] >= date_window]
if history_offset:
    train = train[train['order_from_recent'] < history_offset]
if distance == 'Cosine':
    model = NN.CosineRecommender(K=k_neighbours)
    weights = None
else:
    model = NN.TFIDFRecommender(K=k_neighbours)
    weights = None
train_mat = get_coo_matrix(
train,
users_mapping=users_mapping,
items_mapping=items_mapping,
weight_col=weights
).tocsr()
model.fit(train_mat.T, show_progress=True)
def generate_implicit_recs_mapper(
        model,
        train_matrix,
        top_N,
        user_mapping,
        item_inv_mapping,
        filter_already_liked_items,
        known_items=None,
        filter_items=None,
        return_scores=False
):
    # Returns a closure that maps a raw user_id to its top-N candidates
    def _recs_mapper(user):
        user_id = user_mapping[user]
        # Combine the user's already-known items with any globally filtered items
        if filter_items:
            if user in known_items:
                filtering = set(known_items[user]).union(set(filter_items))
            else:
                filtering = filter_items
        else:
            if known_items and user in known_items:
                filtering = known_items[user]
            else:
                filtering = None
        recs = model.recommend(user_id,
                               train_matrix,
                               N=top_N,
                               filter_already_liked_items=filter_already_liked_items,
                               filter_items=filtering)
        # With return_scores the caller gets items and scores, otherwise items only
        if return_scores:
            return recs
        return recs[0]
    return _recs_mapper
mapper = generate_implicit_recs_mapper(
model,
train_mat,
num_candidates,
users_mapping,
items_inv_mapping,
filter_already_liked_items=False,
known_items=before_boosting_known_items_mapped,
filter_items=None,
return_scores=True
)
recs = pd.DataFrame({'user_id': boost_warm_idx})
recs['item_id_score'] = recs['user_id'].map(mapper)
# Unpack the (items, scores) pair returned by the mapper into one row per (user, item, score)
recs['item_id'] = recs['item_id_score'].apply(lambda x: x[0])
recs['implicit_score'] = recs['item_id_score'].apply(lambda x: x[1])
recs['tmp'] = recs.apply(lambda row: list(zip(row['item_id'], row['implicit_score'])), axis=1)
recs = recs.explode('tmp')
recs[['item_id', 'implicit_score']] = pd.DataFrame(recs['tmp'].tolist(), index=recs.index)
recs.drop(columns=['tmp', 'item_id_score'], inplace=True)
recs
 | user_id | item_id | implicit_score
---|---|---|---|
0 | 30 | 199262.0 | 0.707107 |
0 | 30 | 203105.0 | 0.707107 |
0 | 30 | 199886.0 | 0.707107 |
0 | 30 | 219904.0 | 0.707107 |
0 | 30 | 203206.0 | 0.707107 |
... | ... | ... | ... |
22231 | 1097544 | 263721.0 | 0.577350 |
22231 | 1097544 | 227113.0 | 0.577350 |
22231 | 1097544 | 239830.0 | 0.577350 |
22231 | 1097544 | 139002.0 | 0.577350 |
22231 | 1097544 | 243127.0 | 0.577350 |
2109153 rows × 3 columns
recs.to_csv(os.path.join(ds.processed_dir, 'impl_scores.csv'), index=False)
# taking candidates from implicit model and generating positive samples
candidates = pd.read_csv(os.path.join(ds.processed_dir, 'impl_scores.csv'))
candidates['item_id'] = candidates['item_id'].fillna(0.).astype('int64')
candidates['id'] = candidates.index
pos = candidates.merge(boosting_data[['user_id', 'item_id']],
on=['user_id', 'item_id'], how='inner')
pos['target'] = 1
pos
 | user_id | item_id | implicit_score | id | target
---|---|---|---|---|---|
0 | 109925 | 5543 | 1.000000 | 211288 | 1 |
1 | 126087 | 5518 | 1.000000 | 240448 | 1 |
2 | 131803 | 7807 | 0.707107 | 250989 | 1 |
3 | 140179 | 5011 | 0.707107 | 264967 | 1 |
4 | 223763 | 2780 | 1.000000 | 425032 | 1 |
5 | 316074 | 7033 | 1.000000 | 604543 | 1 |
6 | 419536 | 10267 | 1.000000 | 806723 | 1 |
7 | 482854 | 13237 | 1.000000 | 923066 | 1 |
8 | 484834 | 7558 | 0.500000 | 927130 | 1 |
9 | 487160 | 3784 | 1.000000 | 931333 | 1 |
10 | 522481 | 13787 | 0.516398 | 995099 | 1 |
11 | 616140 | 8254 | 0.169031 | 1176238 | 1 |
12 | 626147 | 5216 | 1.000000 | 1193879 | 1 |
13 | 779743 | 10971 | 0.353553 | 1494276 | 1 |
14 | 860928 | 14431 | 0.500000 | 1650890 | 1 |
15 | 928023 | 9113 | 0.500000 | 1784750 | 1 |
16 | 947916 | 1173 | 1.000000 | 1822962 | 1 |
17 | 1030860 | 657 | 0.333333 | 1983602 | 1 |
18 | 1043861 | 15384 | 1.000000 | 2006821 | 1 |
19 | 1093253 | 11769 | 1.000000 | 2101280 | 1 |
# Generating negative samples: for each user's positives, sample num_negatives
# non-interacted candidates per positive (capped at 25 draws per user, well
# below the 100 candidates generated per user)
num_negatives = 3
pos_group = pos.groupby('user_id')['item_id'].count()
neg = candidates[~candidates['id'].isin(pos['id'])].copy()
neg_sampling = pd.DataFrame(neg.groupby('user_id')['id'].apply(list)).join(
    pos_group, on='user_id', rsuffix='p', how='right')
neg_sampling['num_choices'] = np.clip(neg_sampling['item_id'] * num_negatives,
                                      a_min=0, a_max=25)
func = lambda row: np.random.choice(row['id'],
                                    size=row['num_choices'],
                                    replace=False)
neg_sampling['sample_idx'] = neg_sampling.apply(func, axis=1)
idx_chosen = neg_sampling['sample_idx'].explode().values
neg = neg[neg['id'].isin(idx_chosen)]
neg['target'] = 0
neg
 | user_id | item_id | implicit_score | id | target
---|---|---|---|---|---|
211232 | 109925 | 12948 | 1.000000 | 211232 | 0 |
211234 | 109925 | 31205 | 1.000000 | 211234 | 0 |
211287 | 109925 | 251132 | 1.000000 | 211287 | 0 |
240482 | 126087 | 38859 | 1.000000 | 240482 | 0 |
240493 | 126087 | 65257 | 1.000000 | 240493 | 0 |
240494 | 126087 | 41067 | 1.000000 | 240494 | 0 |
250980 | 131803 | 207587 | 0.577350 | 250980 | 0 |
250988 | 131803 | 6113 | 0.707107 | 250988 | 0 |
251041 | 131803 | 107381 | 1.000000 | 251041 | 0 |
265003 | 140179 | 30433 | 1.000000 | 265003 | 0 |
265014 | 140179 | 21064 | 1.000000 | 265014 | 0 |
265031 | 140179 | 16373 | 1.000000 | 265031 | 0 |
425049 | 223763 | 77169 | 1.000000 | 425049 | 0 |
425052 | 223763 | 12948 | 1.000000 | 425052 | 0 |
425074 | 223763 | 109280 | 1.000000 | 425074 | 0 |
604542 | 316074 | 7107 | 1.000000 | 604542 | 0 |
604554 | 316074 | 11829 | 1.000000 | 604554 | 0 |
604556 | 316074 | 73997 | 1.000000 | 604556 | 0 |
806662 | 419536 | 12854 | 1.000000 | 806662 | 0 |
806668 | 419536 | 13076 | 1.000000 | 806668 | 0 |
806742 | 419536 | 9204 | 1.000000 | 806742 | 0 |
923002 | 482854 | 34763 | 1.000000 | 923002 | 0 |
923018 | 482854 | 11361 | 1.000000 | 923018 | 0 |
923061 | 482854 | 12965 | 1.000000 | 923061 | 0 |
927151 | 484834 | 30217 | 0.707107 | 927151 | 0 |
927180 | 484834 | 12652 | 0.500000 | 927180 | 0 |
927201 | 484834 | 65037 | 0.707107 | 927201 | 0 |
931310 | 487160 | 7616 | 0.707107 | 931310 | 0 |
931343 | 487160 | 7107 | 1.000000 | 931343 | 0 |
931370 | 487160 | 21317 | 1.000000 | 931370 | 0 |
995170 | 522481 | 120210 | 0.707107 | 995170 | 0 |
995173 | 522481 | 33260 | 1.000000 | 995173 | 0 |
995183 | 522481 | 176089 | 1.000000 | 995183 | 0 |
1176201 | 616140 | 40776 | 0.133631 | 1176201 | 0 |
1176203 | 616140 | 35411 | 0.133631 | 1176203 | 0 |
1176252 | 616140 | 75552 | 0.267261 | 1176252 | 0 |
1193820 | 626147 | 118355 | 0.707107 | 1193820 | 0 |
1193836 | 626147 | 245945 | 0.707107 | 1193836 | 0 |
1193877 | 626147 | 2239 | 1.000000 | 1193877 | 0 |
1494296 | 779743 | 73398 | 0.353553 | 1494296 | 0 |
1494302 | 779743 | 2209 | 0.500000 | 1494302 | 0 |
1494338 | 779743 | 88020 | 0.500000 | 1494338 | 0 |
1650910 | 860928 | 22021 | 0.577350 | 1650910 | 0 |
1650945 | 860928 | 43326 | 1.000000 | 1650945 | 0 |
1650951 | 860928 | 38736 | 0.577350 | 1650951 | 0 |
1784731 | 928023 | 11357 | 0.447214 | 1784731 | 0 |
1784770 | 928023 | 220290 | 1.000000 | 1784770 | 0 |
1784781 | 928023 | 30716 | 0.324443 | 1784781 | 0 |
1822946 | 947916 | 8637 | 1.000000 | 1822946 | 0 |
1822954 | 947916 | 226402 | 0.707107 | 1822954 | 0 |
1822957 | 947916 | 21772 | 0.707107 | 1822957 | 0 |
1983609 | 1030860 | 7223 | 0.500000 | 1983609 | 0 |
1983645 | 1030860 | 110861 | 0.707107 | 1983645 | 0 |
1983663 | 1030860 | 16007 | 0.408248 | 1983663 | 0 |
2006811 | 1043861 | 108934 | 0.707107 | 2006811 | 0 |
2006872 | 1043861 | 38242 | 1.000000 | 2006872 | 0 |
2006889 | 1043861 | 57489 | 1.000000 | 2006889 | 0 |
2101213 | 1093253 | 194512 | 0.707107 | 2101213 | 0 |
2101240 | 1093253 | 150161 | 0.707107 | 2101240 | 0 |
2101261 | 1093253 | 225008 | 0.707107 | 2101261 | 0 |
# Creating training data sample and early stopping data sample
boost_idx_train = np.intersect1d(boost_idx, pos['user_id'].unique())
boost_train_users, boost_eval_users = train_test_split(boost_idx_train,
test_size=0.1,
random_state=345)
select_col = ['user_id', 'item_id', 'implicit_score', 'target']
boost_train = shuffle(
pd.concat([
pos[pos['user_id'].isin(boost_train_users)],
neg[neg['user_id'].isin(boost_train_users)]
])[select_col]
)
boost_eval = shuffle(
pd.concat([
pos[pos['user_id'].isin(boost_eval_users)],
neg[neg['user_id'].isin(boost_eval_users)]
])[select_col]
)
user_col = ['user_id','age','income','sex','kids_flg','boost_user_watch_cnt_all',
'boost_user_watch_cnt_last_14']
item_col = ['item_id','content_type','countries_max','for_kids','age_rating',
'studios_max','genres_max','genres_min','genres_med','release_novelty']
item_stats_col = ['item_id','watched_in_7_days','watch_ts_std','trend_slope',
'watch_ts_quantile_95_diff','watch_ts_median_diff',
'watched_in_all_time','male_watchers_fraction',
'female_watchers_fraction','younger_35_fraction','older_35_fraction']
cat_col = ['age','income','sex','content_type']
train_feat = boost_train.merge(users_df[user_col],
on=['user_id'],
how='left')\
.merge(items_df[item_col],
on=['item_id'],
how='left')
eval_feat = boost_eval.merge(users_df[user_col],
on=['user_id'],
how='left') \
.merge(items_df[item_col],
on=['item_id'],
how='left')
eval_feat
 | user_id | item_id | implicit_score | target | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | content_type | countries_max | for_kids | age_rating | studios_max | genres_max | genres_min | genres_med | release_novelty
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 316074 | 7033 | 1.000000 | 1 | age_18_24 | income_20_40 | F | False | 4.0 | 0.0 | series | 4340.0 | False | 16.0 | 14898.0 | 3858.0 | 2778.0 | 3318.0 | 5.0 |
1 | 131803 | 6113 | 0.707107 | 0 | age_35_44 | income_20_40 | M | False | 0.0 | 0.0 | film | 5065.0 | False | 12.0 | 14898.0 | 3503.0 | 1820.0 | 1877.0 | 1.0 |
2 | 316074 | 11829 | 1.000000 | 0 | age_18_24 | income_20_40 | F | False | 4.0 | 0.0 | film | 5065.0 | False | 18.0 | 14898.0 | 1820.0 | 1033.0 | 1426.5 | 6.0 |
3 | 131803 | 207587 | 0.577350 | 0 | age_35_44 | income_20_40 | M | False | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 316074 | 7107 | 1.000000 | 0 | age_18_24 | income_20_40 | F | False | 4.0 | 0.0 | series | 4340.0 | False | 12.0 | 14898.0 | 5431.0 | 626.0 | 1877.0 | 6.0 |
5 | 131803 | 7807 | 0.707107 | 1 | age_35_44 | income_20_40 | M | False | 0.0 | 0.0 | film | 4340.0 | False | 16.0 | 14898.0 | 3858.0 | 3858.0 | 3858.0 | 5.0 |
6 | 316074 | 73997 | 1.000000 | 0 | age_18_24 | income_20_40 | F | False | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | 131803 | 107381 | 1.000000 | 0 | age_35_44 | income_20_40 | M | False | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
item_stats = pd.read_csv(os.path.join(ds.processed_dir, 'item_stats.csv'))
item_stats = item_stats[item_stats_col]
train_feat = train_feat.join(item_stats.set_index('item_id'),
on='item_id', how='left')
eval_feat = eval_feat.join(item_stats.set_index('item_id'),
on='item_id', how='left')
drop_col = ['user_id', 'item_id']
target_col = ['target']
X_train = train_feat.drop(drop_col + target_col, axis=1)
y_train = train_feat[target_col]
X_val = eval_feat.drop(drop_col + target_col, axis=1)
y_val = eval_feat[target_col]
X_train.fillna('None', inplace=True)
X_val.fillna('None', inplace=True)
X_train[cat_col] = X_train[cat_col].astype('category')
X_val[cat_col] = X_val[cat_col].astype('category')
X_train
 | implicit_score | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | content_type | countries_max | for_kids | age_rating | studios_max | genres_max | genres_min | genres_med | release_novelty | watched_in_7_days | watch_ts_std | trend_slope | watch_ts_quantile_95_diff | watch_ts_median_diff | watched_in_all_time | male_watchers_fraction | female_watchers_fraction | younger_35_fraction | older_35_fraction
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | age_35_44 | income_20_40 | F | False | 2 | 1 | film | 5065 | False | 16 | 14898 | 2418 | 1820 | 2119 | 3 | 46 | 0.787585 | 0.195783 | 0 | 1 | 46 | 0.422222 | 0.355556 | 0.311111 | 0.466667 |
1 | 1.000000 | age_unknown | income_unknown | sex_unknown | False | 1 | 1 | series | 4340 | False | 12 | 14898 | 1339 | 1339 | 1339 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0.500000 | age_18_24 | income_20_40 | M | False | 2 | 1 | film | 5065 | False | 18 | 14898 | 5431 | 1224 | 2418 | 5 | 5 | 46.3813 | -0.0692771 | 5 | 74 | 89 | 0.431818 | 0.409091 | 0.420455 | 0.420455 |
3 | 1.000000 | age_25_34 | income_20_40 | M | True | 1 | 1 | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
4 | 0.707107 | age_18_24 | income_20_40 | M | False | 3 | 1 | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
67 | 0.707107 | age_35_44 | income_20_40 | F | False | 2 | 1 | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
68 | 0.447214 | age_18_24 | income_20_40 | M | False | 2 | 1 | film | 295 | False | 18 | 14898 | 3858 | 31 | 3140.5 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
69 | 0.500000 | age_25_34 | income_40_60 | M | True | 1 | 1 | film | 1272 | False | 18 | 14898 | 5431 | 254 | 3503 | 5 | 0 | 0 | 0 | 68 | 68 | 1 | 0 | 0 | 0 | 0 |
70 | 1.000000 | age_45_54 | income_40_60 | M | True | 2 | 0 | film | 5065 | False | 16 | 14898 | 3858 | 2778 | 3503 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
71 | 0.707107 | age_65_inf | income_20_40 | F | False | 5 | 5 | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
72 rows × 26 columns
# Training CatBoost classifier with parameters previously chosen on cross validation
params = {
'subsample': 0.97,
'max_depth': 9,
'n_estimators': 2000,
'learning_rate': 0.03,
'scale_pos_weight': num_negatives,
'l2_leaf_reg': 27,
'thread_count': -1,
'verbose': 200,
'task_type': "CPU",
'devices': '0:1',
# 'bootstrap_type': 'Poisson'
}
boost_model = CatBoostClassifier(**params)
boost_model.fit(X_train,
y_train,
eval_set=(X_val, y_val),
early_stopping_rounds=200,
cat_features=cat_col,
plot=False)
0:   learn: 0.6814278   test: 0.6853672   best: 0.6853672 (0)     total: 57.5ms   remaining: 1m 54s
200: learn: 0.1793975   test: 0.5471784   best: 0.5422113 (146)   total: 1.19s    remaining: 10.7s
Stopped by overfitting detector (200 iterations wait)
bestTest = 0.5422113159
bestIteration = 146
Shrink model to first 147 iterations.
<catboost.core.CatBoostClassifier at 0x7f157243ac90>
with open("catboost_trained.pkl", 'wb') as f:
    pickle.dump(boost_model, f)
# with open("catboost_trained.pkl", 'rb') as f:
#     boost_model = pickle.load(f)
boost_model
<catboost.core.CatBoostClassifier at 0x7f157243ac90>
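Pickle works here, but CatBoost also has native serialization via save_model/load_model, which is generally safer across library upgrades:
# Native CatBoost serialization (alternative to pickle)
boost_model.save_model('catboost_trained.cbm')
# boost_model = CatBoostClassifier()
# boost_model.load_model('catboost_trained.cbm')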
random_users = list(np.random.choice(interactions_df['user_id'], size=5, replace=False))
cold_users = [10000, 20000]
random_users.extend(cold_users)
warm_idx = np.intersect1d(random_users, interactions_df['user_id'].unique())
warm_idx
array([ 20000, 133452, 332832, 341075, 622570, 728808])
_candidates = candidates.copy()
_candidates.dropna(subset=['item_id'], axis=0, inplace=True)
submit_feat = _candidates.merge(users_df[user_col],
on=['user_id'],
how='left') \
.merge(items_df[item_col],
on=['item_id'],
how='left')
submit_feat
 | user_id | item_id | implicit_score | id | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | content_type | countries_max | for_kids | age_rating | studios_max | genres_max | genres_min | genres_med | release_novelty
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | 199262 | 0.707107 | 0 | age_unknown | income_unknown | sex_unknown | False | 2.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 30 | 203105 | 0.707107 | 1 | age_unknown | income_unknown | sex_unknown | False | 2.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 30 | 199886 | 0.707107 | 2 | age_unknown | income_unknown | sex_unknown | False | 2.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 30 | 219904 | 0.707107 | 3 | age_unknown | income_unknown | sex_unknown | False | 2.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 30 | 203206 | 0.707107 | 4 | age_unknown | income_unknown | sex_unknown | False | 2.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2109148 | 1097544 | 263721 | 0.577350 | 2109148 | age_25_34 | income_20_40 | F | True | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109149 | 1097544 | 227113 | 0.577350 | 2109149 | age_25_34 | income_20_40 | F | True | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109150 | 1097544 | 239830 | 0.577350 | 2109150 | age_25_34 | income_20_40 | F | True | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109151 | 1097544 | 139002 | 0.577350 | 2109151 | age_25_34 | income_20_40 | F | True | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109152 | 1097544 | 243127 | 0.577350 | 2109152 | age_25_34 | income_20_40 | F | True | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109153 rows × 19 columns
full_train = submit_feat.fillna('None')
full_train[cat_col] = full_train[cat_col].astype('category')
# item_stats = pd.read_csv('data/item_stats_for_submit.csv')
full_train = full_train.join(item_stats.set_index('item_id'),
on='item_id', how='left')
full_train
 | user_id | item_id | implicit_score | id | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | content_type | countries_max | for_kids | age_rating | studios_max | genres_max | genres_min | genres_med | release_novelty | watched_in_7_days | watch_ts_std | trend_slope | watch_ts_quantile_95_diff | watch_ts_median_diff | watched_in_all_time | male_watchers_fraction | female_watchers_fraction | younger_35_fraction | older_35_fraction
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | 199262 | 0.707107 | 0 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 30 | 203105 | 0.707107 | 1 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 30 | 199886 | 0.707107 | 2 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 30 | 219904 | 0.707107 | 3 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 30 | 203206 | 0.707107 | 4 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2109148 | 1097544 | 263721 | 0.57735 | 2109148 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109149 | 1097544 | 227113 | 0.57735 | 2109149 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109150 | 1097544 | 239830 | 0.57735 | 2109150 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109151 | 1097544 | 139002 | 0.57735 | 2109151 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109152 | 1097544 | 243127 | 0.57735 | 2109152 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109153 rows × 29 columns
cols
['user_id', 'item_id', 'implicit_score', 'age', 'income', 'sex', 'kids_flg', 'user_watch_cnt_all', 'user_watch_cnt_last_14', 'content_type', 'countries_max', 'for_kids', 'age_rating', 'studios_max', 'genres_max', 'genres_min', 'genres_med', 'release_novelty', 'watched_in_7_days', 'watch_ts_std', 'trend_slope', 'watch_ts_quantile_95_diff', 'watch_ts_median_diff', 'watched_in_all_time', 'male_watchers_fraction', 'female_watchers_fraction', 'younger_35_fraction', 'older_35_fraction']
# Renaming columns to match the classifier's feature names: positions 7 and 8
# (the user watch-count features) get the boost_ prefix used at training time
cols = ['user_id', 'item_id']
cols.extend(boost_model.feature_names_)
cols = cols[:7] + ['boost_user_watch_cnt_all', 'boost_user_watch_cnt_last_14'] + cols[9:]
full_train = full_train[cols]
full_train_new_names = ['user_id', 'item_id'] + boost_model.feature_names_
full_train.columns = full_train_new_names
full_train
 | user_id | item_id | implicit_score | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | content_type | countries_max | for_kids | age_rating | studios_max | genres_max | genres_min | genres_med | release_novelty | watched_in_7_days | watch_ts_std | trend_slope | watch_ts_quantile_95_diff | watch_ts_median_diff | watched_in_all_time | male_watchers_fraction | female_watchers_fraction | younger_35_fraction | older_35_fraction
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | 199262 | 0.707107 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 30 | 203105 | 0.707107 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 30 | 199886 | 0.707107 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 30 | 219904 | 0.707107 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 30 | 203206 | 0.707107 | age_unknown | income_unknown | sex_unknown | False | 2 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2109148 | 1097544 | 263721 | 0.57735 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109149 | 1097544 | 227113 | 0.57735 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109150 | 1097544 | 239830 | 0.57735 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109151 | 1097544 | 139002 | 0.57735 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109152 | 1097544 | 243127 | 0.57735 | age_25_34 | income_20_40 | F | True | 1 | 1 | None | None | None | None | None | None | None | None | None | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2109153 rows × 28 columns
# Making predictions for warm users
y_pred_all = boost_model.predict_proba(full_train.drop(
['user_id', 'item_id'], axis=1))
full_train['boost_pred'] = y_pred_all[:, 1]
full_train = full_train[['user_id', 'item_id', 'boost_pred']]
full_train = full_train.sort_values(by=['user_id', 'boost_pred'],
ascending=[True, False])
full_train['rank'] = full_train.groupby('user_id').cumcount() + 1
full_train = full_train[full_train['rank'] <= 10].drop('boost_pred', axis=1)
full_train['item_id'] = full_train['item_id'].astype('int64')
boost_recs = full_train.groupby('user_id')['item_id'].apply(list)
boost_recs = pd.DataFrame(boost_recs)
boost_recs.reset_index(inplace=True)
boost_recs
 | user_id | item_id
---|---|---|
0 | 30 | [16986, 199262, 203105, 199886, 219904, 203206... |
1 | 55 | [12232, 7634, 6489, 15987, 14556, 5573, 15058,... |
2 | 106 | [8821, 10700, 10497, 3399, 9154, 3629, 12189, ... |
3 | 144 | [79668, 85771, 79780, 100360, 87071, 80158, 14... |
4 | 155 | [10747, 2236, 67784, 78954, 139975, 137705, 22... |
... | ... | ... |
22227 | 1097444 | [7300, 16181, 110702, 114582, 113097, 86716, 1... |
22228 | 1097459 | [68578, 71663, 68642, 74552, 71682, 68811, 777... |
22229 | 1097470 | [196242, 201115, 196364, 201461, 203105, 19904... |
22230 | 1097508 | [207809, 210545, 208388, 212164, 213627, 21296... |
22231 | 1097544 | [71485, 75317, 72714, 94880, 75852, 72851, 112... |
22232 rows × 2 columns
# Making predictions for cold users with Popular Recommender
idx_for_popular = list(set(pd.Series(random_users).unique()).difference(
    set(boost_recs['user_id'].unique())))
idx_for_popular
[20000, 728808, 622570, 133452, 10000, 341075]
interactions_df
 | user_id | item_id | last_watch_dt | total_dur | watched_pct
---|---|---|---|---|---|
0 | 917575 | 10353 | 2021-03-13 | 11131 | 58 |
1060 | 275080 | 15574 | 2021-03-13 | 670 | 11 |
1059 | 120517 | 9550 | 2021-03-13 | 32456 | 100 |
1058 | 15045 | 6115 | 2021-03-13 | 22830 | 100 |
1057 | 92904 | 10135 | 2021-03-13 | 3709 | 71 |
... | ... | ... | ... | ... | ... |
542914 | 484870 | 9157 | 2021-08-22 | 9435 | 6 |
542913 | 8428 | 5732 | 2021-08-22 | 6570 | 100 |
542912 | 818134 | 11505 | 2021-08-22 | 60 | 0 |
542923 | 314358 | 14111 | 2021-08-22 | 2590 | 35 |
547624 | 755517 | 5693 | 2021-08-22 | 6174 | 88 |
547625 rows × 5 columns
pop_model = PopularRecommender(days=30, dt_column='last_watch_dt',
with_filter=True)
pop_model.fit(interactions_df)
recs_popular = pop_model.recommend_with_filter(interactions_df, idx_for_popular, top_K=10)
recs_popular
 | user_id | item_id
---|---|---|
4 | 10000 | [10440, 9728, 15297, 13865, 3734, 12192, 4151,... |
0 | 20000 | [10440, 9728, 15297, 13865, 3734, 12192, 4151,... |
1 | 728808 | [10440, 9728, 15297, 13865, 12192, 4151, 11863... |
2 | 622570 | [10440, 9728, 15297, 13865, 12192, 4151, 11863... |
3 | 133452 | [10440, 9728, 15297, 13865, 3734, 12192, 4151,... |
5 | 341075 | [10440, 9728, 15297, 13865, 3734, 12192, 4151,... |
all_recs = pd.concat([boost_recs, recs_popular], axis=0)
def fill_with_popular(recs, pop_model_fitted, interactions_df, top_K=10):
    """
    Fills missing recommendations with the popular recommender.
    Truncates to the first top_K recommendations if a list exceeds top_K.
    """
    recs['len'] = recs['item_id'].apply(len)
    # Lists that are long enough: truncate to top_K
    recs_good = recs[recs['len'] >= top_K].copy()
    recs_good.loc[recs_good['len'] > top_K, 'item_id'] = recs_good.loc[
        recs_good['len'] > top_K, 'item_id'].apply(lambda x: x[:top_K])
    # Lists that are too short: top them up with popular items
    recs_bad = recs[recs['len'] < top_K].copy()
    recs_bad['num_popular'] = top_K - recs_bad['len']
    idx_for_filling = recs_bad['user_id'].unique()
    filling_recs = pop_model_fitted.recommend_with_filter(
        interactions_df, idx_for_filling, top_K=top_K)
    recs_bad = recs_bad.join(filling_recs.set_index('user_id'),
                             on='user_id', how='left', rsuffix='1')
    recs_bad.loc[recs_bad['len'] > 0, 'item_id'] = \
        recs_bad.loc[recs_bad['len'] > 0, 'item_id'] + \
        recs_bad.loc[recs_bad['len'] > 0, 'item_id1']
    recs_bad.loc[recs_bad['len'] == 0, 'item_id'] = recs_bad.loc[
        recs_bad['len'] == 0, 'item_id1']
    recs_bad['item_id'] = recs_bad['item_id'].apply(lambda x: x[:top_K])
    total_recs = pd.concat([recs_good[['user_id', 'item_id']],
                            recs_bad[['user_id', 'item_id']]], axis=0)
    return total_recs
# Filling short recommendation lists with popular items
all_recs = fill_with_popular(all_recs, pop_model, interactions_df)
all_recs
 | user_id | item_id
---|---|---|
0 | 30 | [16986, 199262, 203105, 199886, 219904, 203206... |
1 | 55 | [12232, 7634, 6489, 15987, 14556, 5573, 15058,... |
2 | 106 | [8821, 10700, 10497, 3399, 9154, 3629, 12189, ... |
3 | 144 | [79668, 85771, 79780, 100360, 87071, 80158, 14... |
4 | 155 | [10747, 2236, 67784, 78954, 139975, 137705, 22... |
... | ... | ... |
22054 | 1087746 | [366, 4784, 33316, 63977, 10440, 9728, 15297, ... |
22137 | 1092833 | [15355, 198132, 191636, 50599, 177761, 10440, ... |
22159 | 1093784 | [296, 124311, 20002, 219743, 10440, 9728, 1529... |
22160 | 1093836 | [1343, 11710, 3254, 1967, 3356, 5292, 70331, 2... |
22171 | 1094683 | [15355, 198132, 191636, 50599, 177761, 10440, ... |
22238 rows × 2 columns
def calculate_novelty(train_interactions, recommendations, top_n):
    users = recommendations['user_id'].unique()
    n_users = train_interactions['user_id'].nunique()
    n_users_per_item = train_interactions.groupby('item_id')['user_id'].nunique()
    recommendations = recommendations.loc[recommendations['rank'] <= top_n].copy()
    recommendations['n_users_per_item'] = recommendations['item_id'].map(n_users_per_item)
    recommendations['n_users_per_item'] = recommendations['n_users_per_item'].fillna(1)
    recommendations['item_novelty'] = -np.log2(recommendations['n_users_per_item'] / n_users)
    item_novelties = recommendations[['user_id', 'rank', 'item_novelty']]
    miuf_at_k = item_novelties.loc[item_novelties['rank'] <= top_n, ['user_id', 'item_novelty']]
    miuf_at_k = miuf_at_k.groupby('user_id').agg('mean').squeeze()
    return miuf_at_k.reindex(users).mean()
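Here novelty is the mean inverse user frequency (self-information): the fewer train users have watched an item, the more novel it is. With $u_i$ the number of train users who watched item $i$ and $n$ the total number of train users ($u_i$ is set to 1 for items unseen in train):

$$\mathrm{novelty}(i) = -\log_2\frac{u_i}{n}, \qquad \mathrm{Novelty@}K = \mathop{\mathrm{mean}}_{u \in U}\; \mathop{\mathrm{mean}}_{\substack{i \in \mathrm{recs}_u \\ \mathrm{rank}(i) \le K}} \mathrm{novelty}(i)$$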
def compute_metrics(train, test, recs, top_N):
    result = {}
    test_recs = test.set_index(['user_id', 'item_id']).join(recs.set_index(['user_id', 'item_id']))
    test_recs = test_recs.sort_values(by=['user_id', 'rank'])
    test_recs['users_item_count'] = test_recs.groupby(level='user_id')['rank'].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']
    users_count = test_recs.index.get_level_values('user_id').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs['rank'] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count
    result[f'MAP@{top_N}'] = (test_recs['cumulative_rank'] / test_recs['users_item_count']).sum() / users_count
    result[f'Novelty@{top_N}'] = calculate_novelty(train, recs, top_N)
    return pd.Series(result)
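A note on the cumulative_rank trick above: for a test item hit at rank $r$, cumulative_rank equals the number of that user's hits at ranks $\le r$ divided by $r$, which is exactly $\mathrm{Prec@}r$. Summing over a user's hits and dividing by the user's test-item count $|T_u|$ gives average precision, so

$$\mathrm{MAP@}N = \frac{1}{|U|}\sum_{u \in U}\frac{1}{|T_u|}\sum_{k=1}^{N}\mathrm{Prec@}k(u)\cdot\mathbb{1}\big[\text{item at rank }k \in T_u\big]$$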
test = interactions_df[interactions_df['last_watch_dt'] == interactions_df['last_watch_dt'].max()]
train = interactions_df[interactions_df['last_watch_dt'] < interactions_df['last_watch_dt'].max()]
pop_model = PopularRecommender(days=7, dt_column='last_watch_dt')
pop_model.fit(train)
top10_recs = pop_model.recommend()
top10_recs
array([ 9728, 15297, 10440, 13865, 12360, 14488, 12192, 512, 341, 3734])
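For intuition, a popularity model of this kind just counts interactions over a trailing window. A minimal sketch (illustrative only, not recohut's actual ItemPop implementation):
def top_popular_sketch(df, days=7, n=10, dt_col='last_watch_dt'):
    # Count interactions per item inside the trailing window and take the top n
    window_start = df[dt_col].max() - pd.Timedelta(days=days)
    return (df[df[dt_col] >= window_start]['item_id']
            .value_counts().head(n).index.values)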
item_titles = pd.Series(items_df['title'].values, index=items_df['item_id']).to_dict()
list(map(item_titles.get, top10_recs))
['гнев человеческий', 'клиника счастья', 'хрустальный', 'девятаев', 'круэлла', 'мастер меча', 'фемида видит', 'рядовой чээрин', 'лето - это море', 'прабабушка легкого поведения']
recs = pd.DataFrame({'user_id': test['user_id'].unique()})
top_N = 10
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs.head()
 | user_id | item_id
---|---|---|
0 | 936370 | [9728, 15297, 10440, 13865, 12360, 14488, 1219... |
1 | 279776 | [9728, 15297, 10440, 13865, 12360, 14488, 1219... |
2 | 321739 | [9728, 15297, 10440, 13865, 12360, 14488, 1219... |
3 | 98693 | [9728, 15297, 10440, 13865, 12360, 14488, 1219... |
4 | 267998 | [9728, 15297, 10440, 13865, 12360, 14488, 1219... |
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(top_N + 2)
 | user_id | item_id | rank
---|---|---|---|
0 | 936370 | 9728 | 1 |
0 | 936370 | 15297 | 2 |
0 | 936370 | 10440 | 3 |
0 | 936370 | 13865 | 4 |
0 | 936370 | 12360 | 5 |
0 | 936370 | 14488 | 6 |
0 | 936370 | 12192 | 7 |
0 | 936370 | 512 | 8 |
0 | 936370 | 341 | 9 |
0 | 936370 | 3734 | 10 |
1 | 279776 | 9728 | 1 |
1 | 279776 | 15297 | 2 |
compute_metrics(train, test, recs, 10)
Precision@1     0.034862
Recall@1        0.033231
Precision@2     0.033945
Recall@2        0.065418
Precision@3     0.032875
Recall@3        0.095387
Precision@4     0.029128
Recall@4        0.112564
Precision@5     0.023425
Recall@5        0.113175
Precision@6     0.022273
Recall@6        0.128721
Precision@7     0.021669
Recall@7        0.145846
Precision@8     0.019897
Recall@8        0.152727
Precision@9     0.018926
Recall@9        0.163532
Precision@10    0.018211
Recall@10       0.174618
MAP@10          0.071974
Novelty@10      6.242784
dtype: float64
Let's take the last 3 weeks of our data and test on them sequentially (one test fold = one week). Don't forget about the cold-start problem.
last_date = interactions_df['last_watch_dt'].max().normalize()
folds = 3
start_date = last_date - pd.Timedelta(days=folds*7)
start_date, last_date
(Timestamp('2021-08-01 00:00:00'), Timestamp('2021-08-22 00:00:00'))
cv = TimeRangeSplit(start_date=start_date, periods=folds+1, freq='W')
cv.max_n_splits, cv.get_n_splits(interactions_df, datetime_column='last_watch_dt')
(3, 3)
cv.date_range
DatetimeIndex(['2021-08-01', '2021-08-08', '2021-08-15', '2021-08-22'], dtype='datetime64[ns]', freq='W-SUN')
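Conceptually, each fold trains on everything before a weekly boundary and tests on the following week, dropping test users and items unseen in train (the "Already seen number" printed below counts test interactions that already appear in train; here it is zero). An illustrative re-implementation of the split logic, not recohut's actual TimeRangeSplit:
def time_range_folds_sketch(df, boundaries, dt_col='last_watch_dt'):
    for start, end in zip(boundaries[:-1], boundaries[1:]):
        train = df[df[dt_col] < start]
        test = df[(df[dt_col] >= start) & (df[dt_col] < end)]
        # Drop cold users/items so the test fold only contains entities seen in train
        test = test[test['user_id'].isin(train['user_id'].unique())
                    & test['item_id'].isin(train['item_id'].unique())]
        yield train.index, test.index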
folds_with_stats = list(cv.split(
interactions_df,
user_column='user_id',
item_column='item_id',
datetime_column='last_watch_dt',
fold_stats=True
))
folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])
Already seen number: 0
Already seen number: 0
Already seen number: 0
folds_info_with_stats
 | Start date | End date | Train | New users | New users interactions | New items | New items interactions | Known interactions | Test
---|---|---|---|---|---|---|---|---|---|
0 | 2021-08-01 | 2021-08-08 | 420915 | 19360 | 22608 | 166 | 907 | 0 | 14717 |
1 | 2021-08-08 | 2021-08-15 | 459147 | 19615 | 22955 | 136 | 609 | 0 | 15979 |
2 | 2021-08-15 | 2021-08-22 | 498690 | 20501 | 24032 | 99 | 476 | 0 | 17371 |
top_N = 10
last_n_days = 7
final_results = []
validation_results = pd.DataFrame()
for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')
    pop_model.fit(train)
    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1
    fold_result = compute_metrics(train, test, recs, top_N)
    validation_results = validation_results.append(fold_result, ignore_index=True)
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})
MAP@10 0.039814 Novelty@10 5.778481 dtype: float64
Let's see whether it makes sense to recommend popular items per social-demographic group.
train_idx, test_idx, info = folds_with_stats[0]
train = interactions_df.loc[train_idx]
test = interactions_df.loc[test_idx]
date_window_for_popular = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)
train_slice = pd.merge(train[train['last_watch_dt'] >= date_window_for_popular], users_df, on='user_id', how='left')
We have users without features, so we need to define padding (default values) for them.
train_slice.head()
 | user_id | item_id | last_watch_dt | total_dur | watched_pct | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | user_watch_cnt_all | user_watch_cnt_last_14
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 689871 | 6404 | 2021-07-24 | 905 | 16 | age_45_54 | income_20_40 | M | False | 1.0 | 0.0 | 1.0 | 0.0 |
1 | 482718 | 2624 | 2021-07-24 | 1898 | 25 | age_18_24 | income_40_60 | F | False | 1.0 | 0.0 | 4.0 | 3.0 |
2 | 183195 | 11239 | 2021-07-24 | 1037 | 14 | age_35_44 | income_20_40 | F | True | 5.0 | 0.0 | 5.0 | 0.0 |
3 | 1077534 | 4457 | 2021-07-24 | 151 | 2 | age_25_34 | income_20_40 | M | False | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 274241 | 16228 | 2021-07-24 | 19306 | 18 | age_65_inf | income_20_40 | F | False | 4.0 | 0.0 | 4.0 | 0.0 |
train_slice.fillna({'age':'age_unknown',
'sex':'sex_unknown',
'income': 'income_unknown',
'kids_flg': False
}, inplace=True)
For example, you can compute popularity by age, gender, and presence of children.
train_slice.head()
 | user_id | item_id | last_watch_dt | total_dur | watched_pct | age | income | sex | kids_flg | boost_user_watch_cnt_all | boost_user_watch_cnt_last_14 | user_watch_cnt_all | user_watch_cnt_last_14
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 689871 | 6404 | 2021-07-24 | 905 | 16 | age_45_54 | income_20_40 | M | False | 1.0 | 0.0 | 1.0 | 0.0 |
1 | 482718 | 2624 | 2021-07-24 | 1898 | 25 | age_18_24 | income_40_60 | F | False | 1.0 | 0.0 | 4.0 | 3.0 |
2 | 183195 | 11239 | 2021-07-24 | 1037 | 14 | age_35_44 | income_20_40 | F | True | 5.0 | 0.0 | 5.0 | 0.0 |
3 | 1077534 | 4457 | 2021-07-24 | 151 | 2 | age_25_34 | income_20_40 | M | False | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 274241 | 16228 | 2021-07-24 | 19306 | 18 | age_65_inf | income_20_40 | F | False | 4.0 | 0.0 | 4.0 | 0.0 |
soc_dem_recommendations = train_slice.groupby(
['age', 'sex', 'income', 'item_id']
).size().to_frame().reset_index()
soc_dem_recommendations
 | age | sex | income | item_id | 0
---|---|---|---|---|---|
0 | age_18_24 | F | income_0_20 | 14 | 1 |
1 | age_18_24 | F | income_0_20 | 111 | 1 |
2 | age_18_24 | F | income_0_20 | 162 | 1 |
3 | age_18_24 | F | income_0_20 | 288 | 1 |
4 | age_18_24 | F | income_0_20 | 334 | 1 |
... | ... | ... | ... | ... | ... |
18651 | age_unknown | sex_unknown | income_unknown | 16488 | 1 |
18652 | age_unknown | sex_unknown | income_unknown | 16498 | 1 |
18653 | age_unknown | sex_unknown | income_unknown | 16499 | 3 |
18654 | age_unknown | sex_unknown | income_unknown | 16509 | 21 |
18655 | age_unknown | sex_unknown | income_unknown | 16516 | 1 |
18656 rows × 5 columns
Now we just need to select, for each user, the top_N most popular items within their group. We can check this option on the folds.
validation_results = pd.DataFrame()
for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    test = interactions_df.loc[test_idx]
    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=last_n_days)
    train_slice = pd.merge(train[train['last_watch_dt'] >= date_window],
                           users_df, on='user_id', how='left')
    train_slice.fillna({'age': 'age_unknown',
                        'sex': 'sex_unknown',
                        'income': 'income_unknown',
                        'kids_flg': False}, inplace=True)
    soc_dem_recommendations = train_slice.groupby(
        ['age', 'sex', 'income', 'item_id']
    ).size().to_frame().reset_index()
    top_soc_dem = []
    for age in soc_dem_recommendations.age.unique():
        for income in soc_dem_recommendations.income.unique():
            for sex in soc_dem_recommendations.sex.unique():
                top_items = soc_dem_recommendations[
                    (soc_dem_recommendations.age == age)
                    & (soc_dem_recommendations.income == income)
                    & (soc_dem_recommendations.sex == sex)
                ].sort_values(0, ascending=False).head(10).item_id.values
                top_soc_dem.append([age, income, sex, top_items])
    top_soc_dem = pd.DataFrame(top_soc_dem, columns=['age', 'income', 'sex', 'item_id'])
    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs = pd.merge(recs[['user_id']], users_df, on='user_id', how='left')
    recs.fillna({'age': 'age_unknown',
                 'sex': 'sex_unknown',
                 'income': 'income_unknown',
                 'kids_flg': False}, inplace=True)
    recs = pd.merge(recs, top_soc_dem, on=['age', 'sex', 'income'], how='left')
    recs = recs.drop(columns=['age', 'sex', 'income'])
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1
    fold_result = compute_metrics(train, test, recs, top_N)
    validation_results = validation_results.append(fold_result, ignore_index=True)
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean'})
MAP@10 0.040677 Novelty@10 6.050588 dtype: float64
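As an aside, the triple nested loop over (age, income, sex) above can be replaced by a single groupby. A behavior-equivalent sketch (modulo demographic combinations absent from the slice, which the loop emits as empty arrays):
top_soc_dem_fast = (soc_dem_recommendations
                    .sort_values(0, ascending=False)
                    .groupby(['age', 'income', 'sex'])['item_id']
                    .apply(lambda s: s.head(10).values)
                    .reset_index())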
In this approach, the tunable choices are which demographic features to group by and how many days of history to use when computing popularity.
users_inv_mapping = dict(enumerate(interactions_df['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
items_inv_mapping = dict(enumerate(interactions_df['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}
validation_results = pd.DataFrame()
for train_idx, test_idx, info in folds_with_stats:
    train = interactions_df.loc[train_idx]
    date_window = train['last_watch_dt'].max() - pd.DateOffset(days=60)
    train = train[train['last_watch_dt'] >= date_window]
    test = interactions_df.loc[test_idx]
    train_mat = get_coo_matrix(
        train,
        users_mapping=users_mapping,
        items_mapping=items_mapping,
    ).tocsr()
    model = TFIDFRecommender(K=top_N)
    model.fit(train_mat.T, show_progress=False)
    mapper = generate_implicit_recs_mapper(
        model,
        train_mat,
        top_N,
        users_mapping,
        items_inv_mapping,
        filter_already_liked_items=True
    )
    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = recs['user_id'].map(mapper)
    recs = recs.explode('item_id')
    recs['rank'] = recs.groupby('user_id').cumcount() + 1
    fold_result = compute_metrics(train, test, recs, top_N)
    validation_results = validation_results.append(fold_result, ignore_index=True)
validation_results.agg({'MAP@10':'mean', 'Novelty@10':'mean',})
MAP@10 0.698575 Novelty@10 17.440547 dtype: float64
Simply reusing the code above for a submission won't work because of cold users; we have to decide how to handle them.
random_users = list(np.random.choice(interactions_df['user_id'], size=5, replace=False))
cold_users = [10000, 20000]
random_users.extend(cold_users)
random_users
[754950, 758416, 83485, 636568, 669127, 10000, 20000]
train = interactions_df
test = random_users
pop_model = PopularRecommender(days=last_n_days, dt_column='last_watch_dt')
pop_model.fit(train)
recs = pd.DataFrame({'user_id': pd.Series(test).unique()})
recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
recs = recs.explode('item_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs = recs.groupby('user_id').agg({'item_id': list}).reset_index()
recs.head()
 | user_id | item_id
---|---|---|
0 | 10000 | [9728, 15297, 10440, 14488, 13865, 12192, 341,... |
1 | 20000 | [9728, 15297, 10440, 14488, 13865, 12192, 341,... |
2 | 83485 | [9728, 15297, 10440, 14488, 13865, 12192, 341,... |
3 | 636568 | [9728, 15297, 10440, 14488, 13865, 12192, 341,... |
4 | 669127 | [9728, 15297, 10440, 14488, 13865, 12192, 341,... |
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p implicit,catboost,recohut
Sparsh A., last updated: 2022-01-14 19:35:09
numpy    : 1.19.5
pandas   : 1.1.5
implicit : 0.4.8
catboost : 1.0.4
recohut  : 0.0.11
compiler    : GCC 7.5.0
system      : Linux
release     : 5.4.144+
machine     : x86_64
processor   : x86_64
CPU cores   : 2
interpreter : 64bit