Building a TV-show and movie recommender system using LightFM and LightGBM models on sweet.tv's streaming dataset
sweet.tv is a streaming service that offers access to TV-channels, Ukrainian films, world-class movies, cartoons, and the best series
!pip install lightfm
import ast
import tqdm
import scipy
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
%matplotlib inline
sns.set_style('whitegrid')
!pip install -q watermark
%reload_ext watermark
%watermark -m -iv
Compiler : GCC 7.5.0 OS : Linux Release : 5.4.104+ Machine : x86_64 Processor : x86_64 CPU cores : 2 Architecture: 64bit lightgbm : 2.2.3 IPython : 5.5.0 scipy : 1.4.1 hyperopt : 0.1.2 numpy : 1.19.5 seaborn : 0.11.1 matplotlib: 3.2.2 tqdm : 4.41.1 pandas : 1.1.5
!pip install -q -U kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c sweettv-tv-program-recommender
!unzip -qqo /content/sweettv-tv-program-recommender.zip
dataset11-30week.csv - the training set. The vsetv_id column is the same as channel_id in the export_arh_*.csv files.
# Load the raw viewing sessions and preview the first rows.
# NOTE(review): the description names the file 'dataset11-30week.csv' while this reads
# 'dataset11-30.csv' - confirm which name is correct.
df = pd.read_csv('dataset11-30.csv')
df.head()
user_id | vsetv_id | start_time | stop_time | duraton | |
---|---|---|---|---|---|
0 | 3398461054087191302 | 6 | 2020-03-09 00:00:02 | 2020-03-09 00:01:53 | 111 |
1 | 17226860011138219284 | 353 | 2020-03-09 00:00:02 | 2020-03-09 00:03:26 | 204 |
2 | 18417531283109304442 | 332 | 2020-03-09 00:00:05 | 2020-03-09 00:01:26 | 81 |
3 | 3677642666679759206 | 7 | 2020-03-09 00:00:08 | 2020-03-09 01:07:41 | 4053 |
4 | 15105343836194894915 | 108 | 2020-03-09 00:00:10 | 2020-03-09 00:02:34 | 144 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5821987 entries, 0 to 5821986 Data columns (total 5 columns): # Column Dtype --- ------ ----- 0 user_id uint64 1 vsetv_id int64 2 start_time object 3 stop_time object 4 duraton int64 dtypes: int64(2), object(2), uint64(1) memory usage: 222.1+ MB
# Turn the textual session boundaries into timestamps and keep the calendar day separately
df['start_time'] = pd.to_datetime(df['start_time'])
df['stop_time'] = pd.to_datetime(df['stop_time'])
df['date'] = df.start_time.dt.date
# Transposed summary of every column (datetimes summarised like numbers)
df.describe(include='all', datetime_is_numeric=True).T
count | unique | top | freq | mean | min | 25% | 50% | 75% | max | std | |
---|---|---|---|---|---|---|---|---|---|---|---|
user_id | 5.82199e+06 | NaN | NaN | NaN | 9.29473e+18 | 2.24447e+15 | 4.92875e+18 | 9.24975e+18 | 1.38644e+19 | 1.84438e+19 | 5.2006e+18 |
vsetv_id | 5.82199e+06 | NaN | NaN | NaN | 498.091 | 3 | 6 | 175 | 935 | 1531 | 572.814 |
start_time | 5821987 | NaN | NaN | NaN | 2020-05-05 03:15:37.239014656 | 2020-03-09 00:00:02 | 2020-04-01 00:05:17 | 2020-04-25 22:00:58 | 2020-06-02 21:35:18 | 2020-07-26 23:58:42 | NaN |
stop_time | 5821987 | NaN | NaN | NaN | 2020-05-05 03:40:17.770906624 | 2020-03-09 00:01:26 | 2020-04-01 00:23:54.500000 | 2020-04-25 22:17:34 | 2020-06-02 22:10:13.500000 | 2020-07-26 23:59:58 | NaN |
duraton | 5.82199e+06 | NaN | NaN | NaN | 1480.46 | 61 | 133 | 396 | 1452 | 342145 | 3123.74 |
date | 5821987 | 139 | 2020-03-22 | 88255 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
export_arh_*.csv files - supplemental information about the training data (TV program schedule). They contain tv_show_id; for every user_id in the submission we have to predict 5 of these.
# Stack both schedule exports. pd.concat replaces DataFrame.append, which is deprecated
# (removed in pandas 2.0) and produces the same frame.
df_info = pd.concat([pd.read_csv('export_arh_11-20-final.csv'),
                     pd.read_csv('export_arh_21-30-final.csv')])
# Broadcast start is stored as day.month.year text; the end is start + duration (seconds)
df_info.start_time = pd.to_datetime(df_info.start_time, format='%d.%m.%Y %H:%M:%S')
df_info['stop_time'] = df_info['start_time'] + pd.to_timedelta(df_info['duration'], unit='s')
# Drop schedule rows with tv_show_id == 0 and rebuild a unique 0..n-1 index
df_info = df_info[df_info.tv_show_id != 0].copy().reset_index(drop=True)
df_info.head()
channel_id | channel_title | start_time | duration | tv_show_title | tv_show_id | tv_show_category | tv_show_genre_1 | tv_show_genre_2 | tv_show_genre_3 | year_of_production | director | actors | stop_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 1+1 | 2020-03-09 07:40:00 | 5400 | 'ТСН. Неделя'. | 2400480 | Инфо | общество и политика | NaN | NaN | 1997 | NaN | NaN | 2020-03-09 09:10:00 |
1 | 3 | 1+1 | 2020-03-09 09:10:00 | 8400 | 'Голос страны', 10 сезон, 8 эп. 'Бои'. | 700475 | Развлечения | музыка | NaN | NaN | 2010 | NaN | NaN | 2020-03-09 11:30:00 |
2 | 3 | 1+1 | 2020-03-09 19:30:00 | 2700 | ТСН. | 2400480 | Инфо | общество и политика | NaN | NaN | 1997 | NaN | NaN | 2020-03-09 20:15:00 |
3 | 3 | 1+1 | 2020-03-09 20:15:00 | 14100 | Премьера. Х/ф 'Любовь без памяти', 1-4 с. | 90082508 | Фильмы | мелодрама | NaN | NaN | 2019 | Андрей Комаров | Алина Гросу, Артем Алексеев, Ирина Мак | 2020-03-10 00:10:00 |
4 | 3 | 1+1 | 2020-03-10 00:10:00 | 6900 | Х/ф 'Сделано в Америке'. | 90080911 | Фильмы | боевик | биографический | комедия | 2017 | Даг Лаймэн | Том Круз, Донал Глисон, Сара Райт | 2020-03-10 02:05:00 |
df_info.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 237118 entries, 0 to 237117 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 channel_id 237118 non-null int64 1 channel_title 237118 non-null object 2 start_time 237118 non-null datetime64[ns] 3 duration 237118 non-null int64 4 tv_show_title 237118 non-null object 5 tv_show_id 237118 non-null int64 6 tv_show_category 237118 non-null object 7 tv_show_genre_1 226331 non-null object 8 tv_show_genre_2 141357 non-null object 9 tv_show_genre_3 43902 non-null object 10 year_of_production 234747 non-null object 11 director 205092 non-null object 12 actors 205934 non-null object 13 stop_time 237118 non-null datetime64[ns] dtypes: datetime64[ns](2), int64(3), object(9) memory usage: 25.3+ MB
# Basic size of the data: distinct users and channels in the sessions, distinct shows in the
# schedule, and the time span covered by each table
print('No. of users:', df.user_id.nunique())
print('No. of channels:', df.vsetv_id.nunique())
print('No. of TV shows:', df_info.tv_show_id.nunique())
print('Data dates:', df.start_time.min(), df.start_time.max())
print('Data dates info:', df_info.start_time.min(), df_info.start_time.max())
No. of users: 4914 No. of channels: 202 No. of TV shows: 8234 Data dates: 2020-03-09 00:00:02 2020-07-26 23:58:42 Data dates info: 2020-03-08 23:00:00 2020-07-27 06:00:00
# Number of viewing sessions per calendar day
ax = df.groupby(['date']).size().plot(figsize = (20, 7));
ax.set_title('Views by time', fontsize = 15);
The TV program titles contain many markers - film, season, episode, premiere
# Find schedule rows whose title contains one of the usual markers (episode, animated series,
# feature film, documentary, cartoon, TV series, premiere).
# regex=False: the markers are literal text; as a regular expression the '.' in 'эп.' would
# match any character.
# NOTE(review): c_index is overwritten on every iteration and not used in the visible code.
for c in ['эп.', 'м/с', 'х/ф', 'д/ф', 'м/ф', 'т/с', 'премьера']:
    c_index = df_info[df_info['tv_show_title'].str.lower().str.contains(c, regex=False)].index
# Rows whose whole (lower-cased) title is exactly 'c.' or 'c..'
for c in ['c.', 'c..']:
    c_index = df_info[df_info['tv_show_title'].str.lower()==c].index
# Distinct (channel_id, channel_title) pairs, then the channel ids that appear under more
# than one title
channel_info_df = df_info[['channel_id', 'channel_title']].drop_duplicates()
channel_info_df[channel_info_df.duplicated(['channel_id'], keep = False)].sort_values(['channel_id'])
channel_id | channel_title | |
---|---|---|
27421 | 75 | Киев |
27425 | 75 | Київ |
40540 | 288 | Русский экстрим |
158230 | 288 | Russian Extreme (HD REX) |
99803 | 1249 | UA:Перший (спутник) |
217053 | 1249 | UA:Перший |
# Distinct descriptive attributes per show, then the shows whose attributes are not
# consistent across schedule rows (same tv_show_id, different metadata)
show_info_df = df_info[['tv_show_id', 'tv_show_category', 'tv_show_genre_1',
'tv_show_genre_2', 'tv_show_genre_3', 'year_of_production',
'director', 'actors']].drop_duplicates()
show_info_df[show_info_df.duplicated(['tv_show_id'], keep = False)].sort_values(['tv_show_id']).head(3)
tv_show_id | tv_show_category | tv_show_genre_1 | tv_show_genre_2 | tv_show_genre_3 | year_of_production | director | actors | |
---|---|---|---|---|---|---|---|---|
100228 | 900954 | Фильмы | драма | криминал | NaN | 1995 | Тим Роббинс | Сюзан Сарандон, Шон Пенн, Роберт Проски |
87489 | 900954 | Фильмы | драма | NaN | NaN | 1995 | Тим Роббинс | Сюзан Сарандон, Шон Пенн, Роберт Проски |
92727 | 2000444 | Спорт | спорт | хоккей | NaN | 2018-2018 | NaN | NaN |
# Shows that are scheduled with more than one distinct duration
show_duration_df = df_info[['tv_show_id', 'duration']].drop_duplicates()
show_duration_df[show_duration_df.duplicated(['tv_show_id'], keep = False)].sort_values(['tv_show_id']).head(3)
tv_show_id | duration | |
---|---|---|
37204 | 20088 | 10500 |
7 | 20088 | 1800 |
9 | 20088 | 3000 |
# The 25 channel titles with the most schedule rows
ax = df_info['channel_title'].value_counts()[:25].plot(figsize = (20, 7), kind = 'bar');
ax.set_title('Top channels by transmission');
# Shows that appear under more than one title (e.g. one tv_show_id for many episodes)
tv_show_id_df = df_info[['tv_show_id', 'tv_show_title']].drop_duplicates()
tv_show_id_df[tv_show_id_df.duplicated(['tv_show_id'], keep = False)].sort_values(['tv_show_id']).head()
tv_show_id | tv_show_title | |
---|---|---|
7321 | 30066 | 'Наша Russia'. Дайджест, 1 эп. |
7336 | 30066 | 'Наша Russia'. Дайджест, 16 эп. |
7335 | 30066 | 'Наша Russia'. Дайджест, 15 эп. |
7334 | 30066 | 'Наша Russia'. Дайджест, 14 эп. |
7333 | 30066 | 'Наша Russia'. Дайджест, 13 эп. |
Algorithm for collecting data on TV program views
show_lines = []
# Group the schedule by channel once, so each session is compared only with the broadcasts
# of the channel it was watched on instead of rescanning the whole schedule.
# Row order inside every group follows df_info, so the output order is unchanged.
schedule_by_channel = {
    channel_id: channel_shows[['tv_show_id', 'start_time', 'stop_time']]
    for channel_id, channel_shows in df_info.groupby('channel_id')
}
sessions = df[['user_id', 'vsetv_id', 'start_time', 'stop_time']]
for session in tqdm(sessions.itertuples(index = False), total = df.shape[0]):
    channel_shows = schedule_by_channel.get(session.vsetv_id)
    if channel_shows is None:
        # no schedule known for this channel
        continue
    # For each user session, find the programs that were on air on that channel while he watched
    # (the broadcast interval overlaps the session interval)
    shows_remember_df = channel_shows[(channel_shows.start_time < session.stop_time)
                                      & (channel_shows.stop_time > session.start_time)]
    # Remember which programs the user has watched
    for show in shows_remember_df.itertuples(index = False):
        show_lines.append([session.user_id, session.vsetv_id, session.start_time, session.stop_time,
                           show.tv_show_id, show.start_time, show.stop_time])
show_lines_df = pd.DataFrame(show_lines, columns = ['user_id', 'vsetv_id', 'start_time', 'stop_time', 'tv_show_id', 'show_start_time', 'show_stop_time'])
show_lines_df.to_csv('dataset_with_shows.csv', index = False)
def _rank_within_user(frame, sort_col):
    '''
    Return each row's 0-based position among its user's rows ordered by sort_col descending
    '''
    ordered = frame.sort_values(['user_id', sort_col], ascending = False)
    # float to match the dtype the column had when it was filled group by group
    return ordered.groupby('user_id').cumcount().astype('float64')


def _first_row_per_show(info, columns):
    '''
    Return one row per tv_show_id (the first combination met) with the requested columns
    '''
    per_show = info[['tv_show_id'] + columns].drop_duplicates()
    return per_show[~per_show.duplicated(['tv_show_id'], keep = 'first')].copy()


def _merge_show_freq(frame, features, column):
    '''
    Left-join a get_features result under the given column name; pairs without a match get 0
    '''
    features = features.rename(columns = {'user_show_freq': column})
    frame = frame.merge(features, on = ['user_id', 'tv_show_id'], how = 'left')
    frame[column] = frame[column].fillna(0)
    return frame


def prepare_features(train, train_info, train_info_future, train_base):
    '''
    Function to enrich data by adding features

    train             - one row per (user_id, tv_show_id) candidate; must contain
                        user_show_freq and lightfm_score
    train_info        - schedule of the training period (tv_show_id, channel_id, category and
                        genre columns, duration, start_time)
    train_info_future - schedule of the following period (tv_show_id, duration, start_time)
    train_base        - watch records with user_id, tv_show_id, user_watch_perc, user_watch_time,
                        show_duration, start_time, show_start_time, show_stop_time

    Returns a new frame with the same rows as train plus the feature columns.
    The frame passed in is not modified.
    '''
    # Copy with a clean 0..n-1 index: the rank columns below are assigned by index alignment
    train = train.reset_index(drop = True)
    final_shape = train.shape[0]
    # rank by number of watches; candidates that were never watched get the fixed rank 50
    train['user_show_rank'] = _rank_within_user(train, 'user_show_freq')
    train.loc[train.user_show_freq == 0, 'user_show_rank'] = 50
    # rank by LightFM score; candidates without a score get the fixed rank 50
    train['user_lfm_rank'] = _rank_within_user(train, 'lightfm_score')
    train.loc[pd.isnull(train.lightfm_score), 'user_lfm_rank'] = 50
    # mean rank
    train['combined_rank'] = (train['user_show_rank'] + train['user_lfm_rank']) / 2
    # tv show channel
    train = train.merge(_first_row_per_show(train_info, ['channel_id']), on = ['tv_show_id'], how = 'left')
    assert final_shape == train.shape[0]
    # add cat ids
    category_columns = ['tv_show_category', 'tv_show_genre_1', 'tv_show_genre_2', 'tv_show_genre_3']
    train = train.merge(_first_row_per_show(train_info, category_columns), on = ['tv_show_id'], how = 'left')
    assert final_shape == train.shape[0]
    # number of watches with diff thresholds
    for threshold in [0.3, 0.8]:
        c_new = f'user_show_freq_{threshold}'
        train = _merge_show_freq(train, get_features(train_base, threshold=threshold), c_new)
        assert final_shape == train.shape[0]
    # alternative base: watch time summed per user and broadcast, so several sessions of the
    # same broadcast count as one view; shares above 100% are discarded
    total_user_show_watch_df = train_base.groupby(['tv_show_id', 'show_start_time', 'show_stop_time', 'user_id', 'show_duration'], as_index = False).user_watch_time.sum()
    total_user_show_watch_df['user_watch_perc'] = total_user_show_watch_df['user_watch_time'] / total_user_show_watch_df['show_duration']
    total_user_show_watch_df = total_user_show_watch_df[total_user_show_watch_df.user_watch_perc <= 1].copy()
    # number of watches with diff thresholds
    for threshold in [0.3, 0.5, 0.8]:
        c_new = f'alt_user_show_freq_{threshold}'
        train = _merge_show_freq(train, get_features(total_user_show_watch_df, threshold=threshold), c_new)
        assert final_shape == train.shape[0]
    # number of watches with more recent time splits
    for weeks_prior in [1]:
        c_new = f'user_show_freq_week_{weeks_prior}'
        split_date = train_base.start_time.max() - datetime.timedelta(days = weeks_prior * 7)
        features_split = get_features(train_base[train_base.start_time >= split_date], 200)
        train = _merge_show_freq(train, features_split, c_new)
        assert final_shape == train.shape[0]
        # share of the watches that happened before the recent window
        # (NaN / inf when user_show_freq is 0)
        train[f'user_show_freq_dif_week_{weeks_prior}'] = (train['user_show_freq'] - train[c_new]) / train['user_show_freq']
        # new ranks
        rank_col = f'user_show_rank_week_{weeks_prior}'
        train[rank_col] = _rank_within_user(train, c_new)
        train[f'user_show_rank_mean_week_{weeks_prior}'] = train[['user_show_rank', rank_col]].mean(axis = 1)
        assert final_shape == train.shape[0]
        train = train.drop(columns = c_new)
    # number of watches with more recent time splits (alternative base)
    for weeks_prior in [1]:
        c_new = f'alt_user_show_freq_week_{weeks_prior}'
        split_date = total_user_show_watch_df.show_start_time.max() - datetime.timedelta(days = weeks_prior * 7)
        features_split = get_features(total_user_show_watch_df[total_user_show_watch_df.show_start_time >= split_date], 200)
        train = _merge_show_freq(train, features_split, c_new)
        train[f'alt_user_show_freq_dif_week_{weeks_prior}'] = (train['user_show_freq'] - train[c_new]) / train['user_show_freq']
        # new ranks; kept local because writing them to user_show_rank_week_* would overwrite
        # the feature computed in the previous loop
        alt_rank = _rank_within_user(train, c_new)
        train[f'alt_user_show_rank_mean_week_{weeks_prior}'] = (train['user_show_rank'] + alt_rank) / 2
        train = train.drop(columns = c_new)
    # How often the user's candidates fall on this channel
    user_channel_counts = train.groupby(['user_id', 'channel_id']).size().reset_index().rename(columns = {0:'user_channel_count'})
    train = train.merge(user_channel_counts, on = ['user_id', 'channel_id'], how = 'left')
    # user's mean watch count and each candidate's count relative to it
    user_mean_freq = train.groupby(['user_id'], as_index = False)['user_show_freq'].mean().rename(columns = {'user_show_freq':'group_user_show_freq'})
    train = train.merge(user_mean_freq, on = ['user_id'], how = 'left')
    train['user_show_freq_rel_group'] = train['user_show_freq'] / train['group_user_show_freq']
    assert final_shape == train.shape[0]
    # show air time per day in the training period, in the future period, and their ratio
    _temp_1 = train_info.groupby(['tv_show_id'], as_index = False)['duration'].sum().rename(columns = {'duration':'tot_show_duration'})
    num_days = (train_info.start_time.max() - train_info.start_time.min()).days
    _temp_1['tot_show_duration'] /= num_days
    _temp_2 = train_info_future.groupby(['tv_show_id'], as_index = False)['duration'].sum().rename(columns = {'duration':'tot_show_duration_future'})
    num_days = (train_info_future.start_time.max() - train_info_future.start_time.min()).days
    _temp_2['tot_show_duration_future'] /= num_days
    train = train.merge(_temp_1, on = ['tv_show_id'], how = 'left')
    train = train.merge(_temp_2, on = ['tv_show_id'], how = 'left')
    train['tot_show_duration'] = train['tot_show_duration'].fillna(0)
    train['tot_show_duration_future'] = train['tot_show_duration_future'].fillna(0)
    train['popularity_drop'] = train['tot_show_duration_future'] / train['tot_show_duration']
    assert final_shape == train.shape[0]
    return train
def apk(actual, predicted, k=5):
    '''
    Average Precision at K for one user

    actual    - collection of relevant items
    predicted - ordered recommendations, best first; only the first k are scored
    A repeated recommendation is counted once, at its first position.
    Returns 0.0 when actual is empty.
    '''
    shortlist = predicted[:k] if len(predicted) > k else predicted
    hits = 0.0
    precision_sum = 0.0
    for position, candidate in enumerate(shortlist, start=1):
        is_relevant = candidate in actual
        is_repeat = candidate in shortlist[:position - 1]
        if is_relevant and not is_repeat:
            hits += 1.0
            # precision at this cut-off
            precision_sum += hits / position
    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)
def mapk(actual, predicted, k=5):
    '''
    Mean Average Precision at K: the mean of apk over paired entries of actual and predicted
    '''
    per_user_scores = []
    for relevant, recommended in zip(actual, predicted):
        per_user_scores.append(apk(relevant, recommended, k))
    return np.mean(per_user_scores)
# Column names read by df_to_sparse_interaction_matrix for the user and item ids
user_col = 'cont_user_id'
movie_col = 'cont_tv_show_id'
def get_features(df, get_n = 5, threshold = 0.5):
    '''
    Per-user most watched shows, counting only views above a watch-share threshold

    df        - rows with user_id, tv_show_id and user_watch_perc
    get_n     - how many shows to keep per user
    threshold - a row counts as a view when user_watch_perc is strictly greater than this
    Returns user_id, tv_show_id, user_show_freq for each user's get_n most viewed shows.
    '''
    # rows that qualify as a view
    viewed = df.loc[df['user_watch_perc'] > threshold, ['user_id', 'tv_show_id']]
    # how many times each user viewed each show
    view_counts = viewed.groupby(['user_id', 'tv_show_id']).size().rename('user_show_freq').reset_index()
    # most viewed first inside every user, then cut to get_n rows per user
    ordered = view_counts.sort_values(['user_id', 'user_show_freq'], ascending=False)
    return ordered.groupby(['user_id']).head(get_n)
def get_target(df, get_n = 5):
    '''
    Per-user most watched shows, where a view means at least 80% of the show was watched

    df    - rows with user_id, tv_show_id and user_watch_perc
    get_n - how many shows to keep per user
    Returns user_id, tv_show_id, user_show_freq for each user's get_n most viewed shows.
    '''
    # rows that qualify as a view (watch share >= 0.8)
    viewed = df.loc[df['user_watch_perc'] >= 0.8, ['user_id', 'tv_show_id']]
    # how many times each user viewed each show
    view_counts = viewed.groupby(['user_id', 'tv_show_id']).size().rename('user_show_freq').reset_index()
    # most viewed first inside every user, then cut to get_n rows per user
    ordered = view_counts.sort_values(['user_id', 'user_show_freq'], ascending=False)
    return ordered.groupby(['user_id']).head(get_n)
def df_to_sparse_interaction_matrix(x, has_seen = True):
    '''
    Pandas dataframe to LightFM format

    x        - frame with the columns named by user_col and movie_col
    has_seen - unused, kept for compatibility with existing calls
    Returns a CSR matrix with one row per distinct user id and one column per distinct item id
    present in x (both in ascending id order); a cell is 1.0 where the pair occurs in x.
    The matrix is built directly in sparse form, without a dense users x items pivot.
    '''
    # `import scipy` on its own does not guarantee that the sparse submodule is loaded
    import scipy.sparse
    pairs = x[[user_col, movie_col]].drop_duplicates()
    # positions follow the sorted distinct ids, the same layout a pivot table would give
    user_pos, users = pd.factorize(pairs[user_col], sort = True)
    item_pos, items = pd.factorize(pairs[movie_col], sort = True)
    seen = np.ones(len(pairs), dtype = 'float64')
    return scipy.sparse.csr_matrix((seen, (user_pos, item_pos)), shape = (len(users), len(items)))
export_arh_31-42-final.csv - supplemental information about test data (TV program schedule)
# schedule of the test period
df_info_future = pd.read_csv('export_arh_31-42-final.csv', low_memory=False)
# sessions joined with the shows that were on air (file written by the collection step)
df_show = pd.read_csv('dataset_with_shows.csv')
for c in ['start_time', 'show_start_time', 'stop_time','show_stop_time']:
    df_show[c] = pd.to_datetime(df_show[c])
# the watched part of a show is the overlap of the session and the broadcast:
# it starts at the later of the two starts and ends at the earlier of the two ends
df_show['start_show_user_time'] = df_show[['show_start_time','start_time']].max(axis=1)
df_show['stop_show_user_time'] = df_show[['stop_time','show_stop_time']].min(axis=1)
# seconds watched, show length in seconds, and the share of the show that was watched
df_show['user_watch_time'] = df_show['stop_show_user_time'].sub(df_show['start_show_user_time']).dt.total_seconds()
df_show['show_duration'] = df_show['show_stop_time'].sub(df_show['show_start_time']).dt.total_seconds()
df_show['user_watch_perc'] = df_show['user_watch_time'].div(df_show['show_duration'])
# hold out the last 9 weeks (by session start) for validation
train_start_date = df_show.start_time.min()
split_date = df_show.start_time.max() - datetime.timedelta(weeks = 9)
train = df_show[(df_show.start_time >= train_start_date) & (df_show.start_time <= split_date)].copy()
val = df_show[df_show.start_time > split_date].copy()
# features from train views (only shows that also occur in val) and target from val views
shows_in_val = val['tv_show_id'].unique()
train_top_views = get_features(train[train['tv_show_id'].isin(shows_in_val)])
val_top_views = get_target(val)
# the five shows with the largest total view count, most popular first
show_popularity = train_top_views.groupby(['tv_show_id'], as_index = False)['user_show_freq'].sum().sort_values(['user_show_freq'])
overall_top_movies = show_popularity['tv_show_id'][-5:].values[::-1]
# checking the accuracy of the solution
watched_in_val = {user: list(rows['tv_show_id'].values[:5])
                  for user, rows in val_top_views.groupby('user_id', sort = False)}
preds = []
trues = []
for user, rows in train_top_views.groupby('user_id', sort = False):
    predicted = list(rows['tv_show_id'].values[:5])
    if len(predicted) < 5:
        # pad short lists with the overall most popular shows
        predicted = predicted + list(overall_top_movies[:5 - len(predicted)])
    preds.append(predicted)
    trues.append(watched_in_val.get(user, []))
score = np.round(mapk(trues, preds, k = 5), 5)
print(f'MAP@{5} = {score}')
MAP@5 = 0.32872
# view data: per-user most watched shows among those scheduled in the test period
df_top_views = get_features(df_show[df_show['tv_show_id'].isin(df_info_future['tv_show_id'].unique())])
# every show ordered from most to least popular; the first five are the overall top
popular_ranked = df_top_views.groupby(['tv_show_id'], as_index = False)['user_show_freq'].sum().sort_values(['user_show_freq'])['tv_show_id'].values[::-1]
overall_top_movies = popular_ranked[:5]
# forecast for the future
submission_df = pd.read_csv('submission.csv')
user_top_shows = {user: list(rows['tv_show_id'].values[:5])
                  for user, rows in df_top_views.groupby('user_id', sort = False)}
predictions = []
# Iterate the user_id column directly: iterrows() converts each row to one common dtype and
# could turn the 64-bit ids into floats if the other column is numeric.
for submission_user in submission_df['user_id']:
    pred_n_movies = list(user_top_shows.get(submission_user, []))
    if len(pred_n_movies) < 5:
        # pad with popular shows the user does not already have, so no show is repeated
        fillers = [show for show in popular_ranked if show not in pred_n_movies]
        pred_n_movies = pred_n_movies + fillers[:5 - len(pred_n_movies)]
    predictions.append(' '.join([str(int(x)) for x in pred_n_movies]))
submission_df['tv_show_id'] = predictions
submission_df.to_csv('baseline_submission.csv', index = False)
# data from the past; pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0)
df_info = pd.concat([pd.read_csv('export_arh_11-20-final.csv'),
                     pd.read_csv('export_arh_21-30-final.csv')])
df_info.start_time = pd.to_datetime(df_info.start_time, format='%d.%m.%Y %H:%M:%S')
df_info['stop_time'] = df_info['start_time'] + pd.to_timedelta(df_info['duration'], unit='s')
# stacking two files leaves repeated index labels; reset them as in the first load of df_info
df_info = df_info[df_info.tv_show_id != 0].copy().reset_index(drop=True)
# data for the future
df_info_future = pd.read_csv('export_arh_31-42-final.csv', low_memory=False)
df_info_future.start_time = pd.to_datetime(df_info_future.start_time, format='%d.%m.%Y %H:%M:%S')
df_info_future['stop_time'] = df_info_future['start_time'] + pd.to_timedelta(df_info_future['duration'], unit='s')
df_info_future = df_info_future[df_info_future.tv_show_id != 0].copy()
# categorical features -> integer codes; missing values become their own 'Nope' class
for c in ['tv_show_category', 'tv_show_genre_1', 'tv_show_genre_2', 'tv_show_genre_3']:
    filled = df_info[c].fillna('Nope')
    df_info[c] = LabelEncoder().fit_transform(filled)
# consecutive 0..n-1 ids for users and shows, numbered in order of first appearance in df_show
for c in ['user_id', 'tv_show_id']:
    _temp = df_show[[c]].drop_duplicates().reset_index(drop = True)
    _temp[f"cont_{c}"] = np.arange(len(_temp))
    df_show = df_show.merge(_temp, how = 'left', on = [c])
user_col = 'cont_user_id'
movie_col = 'cont_tv_show_id'
# reverse lookups: consecutive id -> original id
_temp = df_show[['tv_show_id', movie_col]].drop_duplicates()
movies_dict = dict(zip(_temp[movie_col].values, _temp['tv_show_id'].values))
_temp = df_show[['user_id', user_col]].drop_duplicates()
users_dict = dict(zip(_temp[user_col].values, _temp['user_id'].values))
Split data to train / val
# chronological split: sessions up to split_date train, the last 9 weeks validate
train_start_date = df_show.start_time.min()
split_date = df_show.start_time.max() - datetime.timedelta(weeks = 9) # 9 weeks
train = df_show[(df_show.start_time >= train_start_date) & (df_show.start_time <= split_date)].copy()
val = df_show[df_show.start_time > split_date].copy()
# the schedule is cut at the same dates
df_info_train = df_info[(df_info.start_time >= train_start_date) & (df_info.start_time <= split_date)].copy()
df_info_val = df_info[df_info.start_time > split_date].copy()
# We remember which TV programs can be validated (those scheduled in the validation period)
possible_movies_in_val = df_info_val.tv_show_id.unique()
print(train.shape, val.shape)
(2537448, 14) (1310248, 14)
Train LightFm
# user x show interaction matrix for lightfm
train_sparse = df_to_sparse_interaction_matrix(train)
# fit lightfm with the WARP loss
model = LightFM(loss = 'warp', random_state = 42)
model.fit(train_sparse, epochs = 15, num_threads = 4, verbose = False);
# collect user biases; matrix rows follow the sorted consecutive user ids present in train,
# which are then translated back to the original user ids
user_bias_df = pd.DataFrame({'user_bias_lfm': model.user_biases})
user_bias_df['user_id'] = sorted(train[user_col].unique())
user_bias_df['user_id'] = user_bias_df['user_id'].map(users_dict.get)
# collect item biases the same way for the matrix columns
show_bias_df = pd.DataFrame({'show_bias_lfm': model.item_biases})
show_bias_df['tv_show_id'] = sorted(train[movie_col].unique())
show_bias_df['tv_show_id'] = show_bias_df['tv_show_id'].map(movies_dict.get)
# predict the top N current programs for each user from train
train_users = sorted(train[user_col].unique())
# we can only predict shows that are scheduled in the validation period
train_movies = sorted(train[train['tv_show_id'].isin(possible_movies_in_val)][movie_col].unique())
# The model was fitted on a matrix whose rows / columns follow the sorted distinct ids present
# in train. Translate the consecutive ids into matrix positions explicitly instead of assuming
# that the ids present in train are exactly 0..n-1.
user_position = {cont_id: pos for pos, cont_id in enumerate(train_users)}
movie_position = {cont_id: pos for pos, cont_id in enumerate(sorted(train[movie_col].unique()))}
movie_model_ids = np.array([movie_position[cont_id] for cont_id in train_movies], dtype = 'int32')
get_n = 200
user_predictions = []
for train_user in train_users:
    all_movies_df = pd.DataFrame(train_movies, columns = [movie_col])
    all_movies_df[user_col] = train_user
    all_movies_df = all_movies_df.astype('int32')
    user_model_ids = np.full(len(train_movies), user_position[train_user], dtype = 'int32')
    all_movies_df['lightfm_score'] = model.predict(user_model_ids, movie_model_ids)
    # best score first; lightfm_rank is the position in that order
    all_movies_df.sort_values('lightfm_score', ascending = False, inplace = True)
    all_movies_df['lightfm_rank'] = np.arange(all_movies_df.shape[0])
    user_predictions.append(all_movies_df.head(get_n))
# a single concat instead of DataFrame.append inside the loop (quadratic copying, and
# append is removed in pandas 2.0)
lightfm_predictions_df = pd.concat(user_predictions)
# back to the original ids
lightfm_predictions_df['user_id'] = lightfm_predictions_df[user_col].map(lambda x: users_dict.get(x))
lightfm_predictions_df['tv_show_id'] = lightfm_predictions_df[movie_col].map(lambda x: movies_dict.get(x))
lightfm_predictions_df.head(1)
cont_tv_show_id | cont_user_id | lightfm_score | lightfm_rank | user_id | tv_show_id | |
---|---|---|---|---|---|---|
19 | 20 | 0 | -0.982242 | 0 | 7981779138282146703 | 2400480 |
Choice of N movies for user based on views and LightFm results
# up to get_n candidate shows per user
get_n = 200
# train views restricted to shows that are scheduled in the validation period
train_base = train[train['tv_show_id'].isin(possible_movies_in_val)].copy()
val_base = val.copy()
# from here on train / val hold per-user show counts instead of raw watch records
train = get_features(train_base, get_n)
val = get_target(val_base, get_n)
# target without a practical per-user limit (up to 10000 shows per user)
overal_val = get_target(val_base, 10000)
print(train.shape, val.shape)
(165997, 3) (120146, 3)
Add LightFM recommendations as additional rows
# if incomplete recommendations (user watched <get_n) then add recommendations from lightfm
# LightFM shows per user, in the order they are stored (best score first)
lfm_shows_by_user = {user: rows['tv_show_id'].values
                     for user, rows in lightfm_predictions_df.groupby('user_id', sort = False)}
candidate_parts = [train]
for user, group in train.groupby('user_id'):
    need_to_add = get_n - group.shape[0]
    if need_to_add <= 0:
        continue
    lfm_shows = lfm_shows_by_user.get(user)
    if lfm_shows is None:
        continue
    # LightFM shows the user does not already have, best first
    new_shows = lfm_shows[~np.isin(lfm_shows, group.tv_show_id.unique())][:need_to_add]
    add_df = pd.DataFrame({'user_id': np.full(len(new_shows), user, dtype = 'uint64'),
                           'tv_show_id': new_shows,
                           'user_show_freq': 0})
    candidate_parts.append(add_df)
# a single concat instead of DataFrame.append inside the loop (removed in pandas 2.0)
train_w_lfm = pd.concat(candidate_parts)
# every user is kept and ends up with at least get_n candidates
assert train.user_id.nunique() == train_w_lfm.user_id.nunique()
assert train_w_lfm.groupby(['user_id']).size().min() == get_n
# attach the LightFM score where one exists for the pair
train_w_lfm = train_w_lfm.merge(lightfm_predictions_df[['user_id', 'tv_show_id', 'lightfm_score']],
                                on = ['user_id', 'tv_show_id'], how = 'left')
# combined train
train = train_w_lfm.copy()
train.reset_index(inplace = True, drop = True)
Add target
# create target from tv programs that user actually saw in the next 9 weeks
target_col = 'seen'
# drop(columns=...) instead of the positional axis argument, which newer pandas rejects
seen_pairs = overal_val.drop(columns = ['user_show_freq']).assign(seen=1)
train = train.merge(seen_pairs, on = ['user_id', 'tv_show_id'], how = 'left')
# pairs missing from the target were not seen; assign the result instead of filling a
# column selection in place
train[target_col] = train[target_col].fillna(0)
print('Target distribution:')
train[target_col].value_counts()
Target distribution:
0.0 907186 1.0 50414 Name: seen, dtype: int64
Add features
# Keep only TV programs with more than min_watch qualifying views in the training window.
# A qualifying view is a watch record with user_watch_perc >= 0.8; records are counted,
# not distinct users.
shows_watches = train_base[train_base['user_watch_perc'] >= 0.8].groupby(['tv_show_id']).size().reset_index().rename(columns = {0:'show_watched'})
min_watch = 10
shows_watches = shows_watches[shows_watches['show_watched'] > min_watch].copy()
possible_movies_that_matter = shows_watches.tv_show_id.unique()
train = train[train.tv_show_id.isin(possible_movies_that_matter)].copy()
train.reset_index(inplace = True, drop = True)
# collecting features, then attaching the LightFM user and show biases
train = prepare_features(train, df_info_train, df_info_val, train_base)
train = train.merge(user_bias_df, on = ['user_id'], how = 'left')
train = train.merge(show_bias_df, on = ['tv_show_id'], how = 'left')
Model validation
# columns passed to LightGBM as categorical features
cat_columns = ['tv_show_category', 'tv_show_genre_1', 'tv_show_genre_2', 'tv_show_genre_3']
# split by user, so all candidates of one user land on the same side
users_train, users_test = train_test_split(train.user_id.unique(), random_state = 42)
X_train = train[train.user_id.isin(users_train)].copy()
X_test = train[train.user_id.isin(users_test)].copy()
# separate the target from the features
y_train = X_train.pop(target_col)
y_test = X_test.pop(target_col)
# move the identifiers into the index so they are not used as features
X_train = X_train.set_index(["user_id", "tv_show_id"])
X_test = X_test.set_index(["user_id", "tv_show_id"])
print(X_train.shape, X_test.shape)
print('Features:', list(X_train.columns))
(676824, 28) (225161, 28) Features: ['user_show_freq', 'lightfm_score', 'user_show_rank', 'user_lfm_rank', 'combined_rank', 'channel_id', 'tv_show_category', 'tv_show_genre_1', 'tv_show_genre_2', 'tv_show_genre_3', 'user_show_freq_0.3', 'user_show_freq_0.8', 'alt_user_show_freq_0.3', 'alt_user_show_freq_0.5', 'alt_user_show_freq_0.8', 'user_show_freq_dif_week_1', 'user_show_rank_week_1', 'user_show_rank_mean_week_1', 'alt_user_show_freq_dif_week_1', 'alt_user_show_rank_mean_week_1', 'user_channel_count', 'group_user_show_freq', 'user_show_freq_rel_group', 'tot_show_duration', 'tot_show_duration_future', 'popularity_drop', 'user_bias_lfm', 'show_bias_lfm']
train_data = lgb.Dataset(X_train, y_train)
test_data = lgb.Dataset(X_test, y_test)
# Binary classifier evaluated by AUC.
# 'metric' used to be listed twice, and 'colsample_bytree' (an alias of 'feature_fraction')
# was also given, as the string '0.6'. LightGBM gives the primary parameter name precedence
# over its alias, so only feature_fraction is kept.
param = {
    'task': 'train',
    'objective': 'binary',
    'metric': 'auc',
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'feature_fraction': 0.75,
    'learning_rate': 0.01,
    'min_data_in_leaf': 20,
    'num_leaves': 150,
    'num_threads': 4,
    'reg_alpha': 0.4,
    'reg_lambda': 0.32,
    'seed': 42,
}
# evaluation history per validation set is collected here
res = {}
# stop when the validation AUC has not improved for 15 rounds
bst = lgb.train(
    param, train_data,
    valid_sets=[train_data, test_data],
    valid_names=["train", "valid"],
    categorical_feature = cat_columns,
    num_boost_round=10000, evals_result=res,
    verbose_eval=100, early_stopping_rounds=15)
/usr/local/lib/python3.7/dist-packages/lightgbm/basic.py:1209: UserWarning: categorical_feature in Dataset is overridden. New categorical_feature is ['tv_show_category', 'tv_show_genre_1', 'tv_show_genre_2', 'tv_show_genre_3'] 'New categorical_feature is {}'.format(sorted(list(categorical_feature)))) /usr/local/lib/python3.7/dist-packages/lightgbm/basic.py:762: UserWarning: categorical_feature in param dict is overridden. warnings.warn('categorical_feature in param dict is overridden.')
Training until validation scores don't improve for 15 rounds. [100] train's auc: 0.899065 valid's auc: 0.890476 [200] train's auc: 0.908111 valid's auc: 0.896364 [300] train's auc: 0.914384 valid's auc: 0.899388 [400] train's auc: 0.919506 valid's auc: 0.901092 [500] train's auc: 0.923837 valid's auc: 0.902236 [600] train's auc: 0.928072 valid's auc: 0.902798 Early stopping, best iteration is: [587] train's auc: 0.927361 valid's auc: 0.90281
# score the held-out users' candidates and order them from best to worst
X_test["lgb_score"] = bst.predict(X_test, num_iteration=bst.best_iteration)
scored = X_test.reset_index(level=[0, 1])
lgb_res = scored.loc[:, ["user_id", "tv_show_id", "lgb_score"]].sort_values(by="lgb_score", ascending=False)
# first five shows per user: the validation target, and the candidate list in its stored order
val_shows_by_user = {user: list(rows['tv_show_id'].values[:5])
                     for user, rows in val.groupby('user_id', sort=False)}
train_shows_by_user = {user: list(rows['tv_show_id'].values[:5])
                       for user, rows in train.groupby('user_id', sort=False)}
preds = []
trues = []
for user, group in lgb_res.groupby("user_id"):
    preds.append(list(group.tv_show_id)[:5])
    trues.append(val_shows_by_user.get(user, []))
score = np.round(mapk(trues, preds, k = 5), 5)
print(f'Model MAP@{5} = {score}')
# reference (baseline): the same users, recommending the first five candidates as stored
preds = []
trues = []
for user, _ in lgb_res.groupby("user_id"):
    preds.append(train_shows_by_user.get(user, []))
    trues.append(val_shows_by_user.get(user, []))
score = np.round(mapk(trues, preds, k = 5), 5)
print(f'Baseline MAP@{5} = {score}')
Model MAP@5 = 0.35283 Baseline MAP@5 = 0.31984
lgb.plot_importance(bst, figsize = (15, 10));
# Full Train: refit on all users (train and test side of the split together)
X_train = train.copy()
y_train = X_train.pop(target_col)
X_train = X_train.set_index(["user_id", "tv_show_id"])
train_data = lgb.Dataset(X_train, y_train)
# no validation set here, so the number of rounds is fixed: the best iteration found with
# early stopping plus 100 extra rounds
final_model = lgb.train(
param, train_data,
categorical_feature = cat_columns,
num_boost_round=bst.best_iteration+100, verbose_eval=False)
final_model.save_model('tuned_.txt')
/usr/local/lib/python3.7/dist-packages/lightgbm/basic.py:1209: UserWarning: categorical_feature in Dataset is overridden. New categorical_feature is ['tv_show_category', 'tv_show_genre_1', 'tv_show_genre_2', 'tv_show_genre_3'] 'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
<lightgbm.basic.Booster at 0x7fa7752e3150>
Retrain the LightFM model, this time on all of the data
# build the sparse interaction matrix from the full show log and fit LightFM
df_sparse = df_to_sparse_interaction_matrix(df_show)
lfm_final_model = LightFM(loss='warp', random_state=42)
lfm_final_model.fit(df_sparse, verbose=False, num_threads=4, epochs=15);

# users we will produce LightFM recommendations for (sorted ids from df_show)
train_users = sorted(df_show[user_col].unique())
# we can only predict films that will definitely be in the target
is_future_show = df_show.tv_show_id.isin(df_info_future['tv_show_id'].unique())
train_movies = sorted(df_show.loc[is_future_show, movie_col].unique())

# user biases: one row per sorted user id, then mapped through users_dict
# NOTE(review): assumes the bias vector is ordered like the sorted ids - confirm
df_user_bias_df = pd.DataFrame(lfm_final_model.user_biases, columns=['user_bias_lfm'])
df_user_bias_df['user_id'] = train_users
df_user_bias_df['user_id'] = df_user_bias_df['user_id'].map(users_dict.get)

# item biases: same idea, over every show id in df_show (not only future ones)
all_show_ids = sorted(df_show[movie_col].unique())
df_show_bias_df = pd.DataFrame(lfm_final_model.item_biases, columns=['show_bias_lfm'])
df_show_bias_df['tv_show_id'] = all_show_ids
df_show_bias_df['tv_show_id'] = df_show_bias_df['tv_show_id'].map(movies_dict.get)
# collect Lightfm recommendations for all users in the train
get_n = 200
# Per-user top-N frames are gathered in a list and concatenated once.
# Calling DataFrame.append inside the loop re-copied the accumulated frame on
# every iteration, and the method no longer exists in pandas >= 2.0.
per_user_top = []
for train_user in train_users:
    # candidate (show, user) pairs for this user, cast to int32 before predict
    all_movies_df = pd.DataFrame(train_movies, columns=[movie_col])
    all_movies_df[user_col] = train_user
    all_movies_df = all_movies_df.astype('int32')
    all_movies_df['lightfm_score'] = lfm_final_model.predict(
        all_movies_df[user_col].values,
        all_movies_df[movie_col].values,
    )
    # rank 0 is the best-scored show for the user
    all_movies_df = all_movies_df.sort_values('lightfm_score', ascending=False)
    all_movies_df['lightfm_rank'] = np.arange(all_movies_df.shape[0])
    per_user_top.append(all_movies_df.head(get_n))
df_lightfm_predictions_df = pd.concat(per_user_top, sort=False)

# translate the ids used for LightFM back through users_dict / movies_dict
df_lightfm_predictions_df['user_id'] = df_lightfm_predictions_df[user_col].map(lambda x: users_dict.get(x))
df_lightfm_predictions_df['tv_show_id'] = df_lightfm_predictions_df[movie_col].map(lambda x: movies_dict.get(x))
# sanity check: every train user received recommendations
assert df_lightfm_predictions_df.user_id.nunique() == len(train_users)
df_lightfm_predictions_df.head(1)
cont_tv_show_id | cont_user_id | lightfm_score | lightfm_rank | user_id | tv_show_id | |
---|---|---|---|---|---|---|
20 | 20 | 0 | -1.957515 | 0 | 7981779138282146703 | 2400480 |
Collecting basic features, enriching with recommendations from LightFM
# Basic recommendations - according to the top views
print(df_show.shape)
get_n = 200

# restrict the log to shows that are listed in df_info_future
possible_test_movies = df_info_future['tv_show_id'].unique()
listed_in_future = df_show['tv_show_id'].isin(possible_test_movies)
df_show_base = df_show.loc[listed_in_future].copy()
df_show_train = get_features(df_show_base, get_n)

# sum user_show_freq per show, sort ascending, then take the last five ids
# and reverse them so the largest total comes first
show_freq_totals = df_show_train.groupby(['tv_show_id'], as_index=False)['user_show_freq'].sum()
show_ids_by_freq = show_freq_totals.sort_values(['user_show_freq'])['tv_show_id']
overall_top_movies = show_ids_by_freq[-5:].values[::-1]
(3847696, 14)
# if incomplete recommendations (user watched <get_n) then add recommendations from lightfm

# LightFM show ids per user, in the order they appear in the predictions frame.
# Built once so the loop does not scan the whole predictions frame per user.
lfm_shows_by_user = {
    user: recs['tv_show_id']
    for user, recs in df_lightfm_predictions_df.groupby('user_id', sort=False)
}
# used for users that have no LightFM rows at all (keeps the column dtype)
no_lfm_shows = df_lightfm_predictions_df['tv_show_id'].iloc[:0]

# Collect the top-up frames and concatenate once; DataFrame.append in a loop
# re-copies the growing frame each time and was removed in pandas 2.0.
frames = [df_show_train]
# Group by the plain column name: grouping by a one-element list yields
# 1-tuples as keys on newer pandas, which would break the per-user lookup.
for user, group in df_show_train.groupby('user_id'):
    need_to_add = get_n - group.shape[0]
    if need_to_add <= 0:
        continue
    candidates = lfm_shows_by_user.get(user, no_lfm_shows)
    # skip shows the user already has among the basic recommendations
    unseen = candidates[~candidates.isin(group.tv_show_id.unique())]
    add_tv_show_ids_from_lightfm = unseen.values[:need_to_add]
    add_df = pd.DataFrame(add_tv_show_ids_from_lightfm, columns=['tv_show_id'])
    add_df['user_id'] = user
    # LightFM-only rows carry a zero view frequency
    add_df['user_show_freq'] = 0
    add_df['user_id'] = add_df['user_id'].astype('uint64')
    frames.append(add_df[['user_id', 'tv_show_id', 'user_show_freq']])
df_train_w_lfm = pd.concat(frames, sort=False)

# sanity checks: no user lost, and every user now has at least get_n rows
assert df_show_train.user_id.nunique() == df_train_w_lfm.user_id.nunique()
assert df_train_w_lfm.groupby(['user_id']).size().min() == get_n

# attach the LightFM score where one exists (NaN otherwise, left join)
df_train_w_lfm = df_train_w_lfm.merge(
    df_lightfm_predictions_df[['user_id', 'tv_show_id', 'lightfm_score']],
    on=['user_id', 'tv_show_id'], how='left')
# combined train
df_show_train = df_train_w_lfm.copy()
df_show_train.reset_index(inplace=True, drop=True)
Post-processing and adding features
# remove low-count movies
# A row counts as a "watch" when user_watch_perc is at least 0.8; shows need
# strictly more than min_watch such rows to be kept.
min_watch = 10
shows_watches = (
    df_show_base[df_show_base['user_watch_perc'] >= 0.8]
    .groupby(['tv_show_id'])
    .size()
    .reset_index()
    .rename(columns={0: 'show_watched'})
)
# use the named threshold (it was defined but the filter hard-coded 10)
shows_watches = shows_watches[shows_watches['show_watched'] > min_watch].copy()
possible_movies_that_matter = shows_watches.tv_show_id.unique()
df_show_train = df_show_train[df_show_train.tv_show_id.isin(possible_movies_that_matter)].copy()
df_show_train.reset_index(inplace=True, drop=True)

# collect features
df_show_train = prepare_features(df_show_train, df_info, df_info_future, df_show_base)
# add the LightFM user / show bias columns (left joins keep every candidate row)
df_show_train = df_show_train.merge(df_user_bias_df, on=['user_id'], how='left')
df_show_train = df_show_train.merge(df_show_bias_df, on=['tv_show_id'], how='left')
print(df_show_train.shape)
(946395, 30)
Prediction submission
# predict rank score
X_predict = df_show_train.copy()
# same feature columns, in the same order, as the frame final_model was fit on
X_predict = X_predict.set_index(["user_id", "tv_show_id"])[X_train.columns]
# NOTE(review): final_model was trained for bst.best_iteration + 100 rounds,
# but num_iteration=bst.best_iteration scores with only the first
# best_iteration rounds - confirm that dropping the extra 100 is intended.
X_predict["lgb_score"] = final_model.predict(X_predict, num_iteration=bst.best_iteration)
pred_res = (
    X_predict.reset_index([0, 1])[["user_id", "tv_show_id", "lgb_score"]]
    .sort_values("lgb_score", ascending=False)
)

# number of shows written per user in the submission
top_k = 5

# Best-scored shows per user, looked up once. pred_res is already sorted by
# score and groupby keeps that order inside each group. This replaces a full
# scan of pred_res for every submission row.
top_shows_by_user = {
    user: rows['tv_show_id'].values[:top_k]
    for user, rows in pred_res.groupby('user_id', sort=False)
}


def _recommend_for(user_id):
    """Return the space-separated top_k show ids for one submission user.

    Users with fewer than top_k predictions are padded from
    overall_top_movies, skipping shows that are already in the list so the
    same show is never recommended twice.
    """
    pred_n_movies = list(top_shows_by_user.get(user_id, []))
    for fallback_show in overall_top_movies:
        if len(pred_n_movies) >= top_k:
            break
        if fallback_show not in pred_n_movies:
            pred_n_movies.append(fallback_show)
    return ' '.join(str(int(x)) for x in pred_n_movies)


# create submission
submission_df = pd.read_csv('submission.csv')
submission_df['tv_show_id'] = submission_df['user_id'].map(_recommend_for)
submission_df.to_csv('lfm_lgb6_submission.csv', index=False)
submission_df.head()
user_id | tv_show_id | |
---|---|---|
0 | 8377619604347126107 | 240081 2400480 2400508 20088 2400467 |
1 | 8381667675275833309 | 2400480 240081 6700482 12001682 2400467 |
2 | 8387147770138767246 | 12001682 12002856 12001732 12002355 12002955 |
3 | 8397181578236218580 | 240081 2400480 20088 2400467 6700482 |
4 | 8404698046253197367 | 2400480 240081 20088 6200371 200432 |