import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os,sys,inspect
import gc
from tqdm.notebook import tqdm
import random
import heapq
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        print(e)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
Num GPUs Available: 0
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
ml-1m.zip           100%[===================>]   5.64M  25.0MB/s    in 0.2s

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat
  inflating: ml-1m/ratings.dat
  inflating: ml-1m/README
  inflating: ml-1m/users.dat
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)
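Neither activation is used further below, but a quick sanity check (a small sketch on toy values) confirms they behave as expected:

x_demo = tf.constant([-1.0, 0.0, 1.0])
print(mish(x_demo).numpy())       # ≈ [-0.303, 0.0, 0.865]
print(leakyrelu(x_demo).numpy())  # [-0.2, 0.0, 1.0]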
def load_data(filepath, threshold=0):
    df = pd.read_csv(filepath,
                     sep="::",
                     header=None,
                     engine='python',
                     names=['userId', 'movieId', 'rating', 'time'])
    df = df.drop('time', axis=1)

    df['userId'] = df['userId'].astype(int)
    df['movieId'] = df['movieId'].astype(int)
    df['rating'] = df['rating'].astype(float)
    df = df[['userId', 'movieId', 'rating']]

    if threshold > 0:
        df['rating'] = np.where(df['rating']>threshold, 1, 0)
    else:
        df['rating'] = 1.

    m_codes = df['movieId'].astype('category').cat.codes
    u_codes = df['userId'].astype('category').cat.codes
    df['movieId'] = m_codes
    df['userId'] = u_codes

    return df
def add_negative(df, uiid, times=4):
    df_ = df.copy()
    user_id = df_['userId'].unique()
    item_id = df_['movieId'].unique()

    for i in tqdm(user_id):
        cnt = 0
        n = len(df_[df_['userId']==i])
        n_negative = min(n*times, len(item_id)-n-1)
        available_negative = list(set(uiid) - set(df[df['userId']==i]['movieId'].values))

        new = np.random.choice(available_negative, n_negative, replace=False)
        new = [[i, j, 0] for j in new]
        df_ = df_.append(pd.DataFrame(new, columns=df.columns), ignore_index=True)

    return df_
def extract_from_df(df, n_positive, n_negative):
    df_ = df.copy()
    rtd = []

    user_id = df['userId'].unique()

    for i in tqdm(user_id):
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==1]['movieId'].index, n_positive, replace=False))
        rtd += list(np.random.choice(df[df['userId']==i][df['rating']==0]['movieId'].index, n_negative, replace=False))

    return rtd
def eval_NDCG(true, pred):
    top_k = pred

    for i, item in enumerate(top_k, 1):
        if item == true:
            return 1 / np.log2(i+1)
    return 0
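With a single held-out item, NDCG reduces to 1/log2(rank+1) at the hit position and 0 on a miss. A quick check of eval_NDCG on toy inputs:

print(eval_NDCG(5, [3, 5, 7]))   # hit at rank 2 -> 1/log2(3) ≈ 0.63
print(eval_NDCG(9, [3, 5, 7]))   # miss -> 0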
df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 0 | 1104 | 1 |
| 1 | 0 | 639 | 0 |
| 2 | 0 | 853 | 0 |
| 3 | 0 | 3177 | 1 |
| 4 | 0 | 2162 | 1 |
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(np.where(cnt >= 10)[0])].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:,:] = 0
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
train = df.loc[list(set(df.index)-set(test_idx)),:]
test = df.loc[test_idx, :]
df
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 0 | 1104 | 1 |
| 1 | 0 | 3177 | 1 |
| 2 | 0 | 2162 | 1 |
| 3 | 0 | 1195 | 1 |
| 4 | 0 | 2599 | 1 |
| ... | ... | ... | ... |
| 570512 | 6037 | 346 | 1 |
| 570513 | 6037 | 1120 | 1 |
| 570514 | 6037 | 1133 | 1 |
| 570515 | 6037 | 1204 | 1 |
| 570516 | 6037 | 1007 | 1 |
570517 rows × 3 columns
df.shape, train.shape, test.shape
((570517, 3), (564569, 3), (5948, 3))
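As a quick sanity check on the leave-one-out split (a sketch run against the frames built above), each retained user should contribute exactly one held-out interaction:

print(test.groupby('userId').size().max())     # 1: one held-out positive per user
print(len(test) == test['userId'].nunique())   # True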
for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy()
train
[Preview of the binary user–item interaction matrix: 5948 rows × 3533 columns, with 1.0 where a user has a retained positive interaction and 0.0 elsewhere.]
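The pivot above keeps the full 5948 × 3533 interaction matrix dense. As a side note (a sketch, not what the training code below consumes), the same data could be held sparse with the csr_matrix already imported at the top:

sparse_train = csr_matrix(train.values)
print(sparse_train.shape, sparse_train.nnz)  # same matrix, but only the nonzero entries are stored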
class CDAE(tf.keras.models.Model):
    def __init__(self, input_dim, latent_dim, n_user, lamda=1e-4):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.lamda = lamda
        self.n_user = n_user
        self.embedding = Embedding(n_user, latent_dim)

        self.model = self.build()

    def compile(self, optimizer, loss_fn=None):
        super().compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def build(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

        rating = Input(shape=(self.input_dim, ), name='rating_input')
        user_id = Input(shape=(1, ), name='user_input')

        emb = self.embedding(user_id)
        emb = tf.squeeze(emb, 1)
        enc = self.encoder(rating) + emb
        enc = tf.nn.tanh(enc)
        outputs = self.decoder(enc)

        return Model([rating, user_id], outputs)

    def build_encoder(self):
        inputs = Input(shape=(self.input_dim, ))

        encoder = Sequential()
        encoder.add(Dropout(0.2))
        encoder.add(Dense(self.latent_dim, activation='tanh'))

        outputs = encoder(inputs)
        return Model(inputs, outputs)

    def build_decoder(self):
        inputs = Input(shape=(self.latent_dim, ))

        decoder = Sequential()
        decoder.add(Dense(self.input_dim, activation='sigmoid'))

        outputs = decoder(inputs)
        return Model(inputs, outputs)

    def train_step(self, data):
        x = data['rating']
        user_ids = data['id']

        with tf.GradientTape() as tape:
            pred = self.model([x, user_ids])
            rec_loss = tf.losses.binary_crossentropy(x, pred)
            loss = rec_loss

        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return {'loss': loss}
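The decoder reconstructs the full item vector from the encoded ratings plus a per-user embedding, so the output has the same width as the input. A quick shape check on a toy instance (the sizes here are made up, not the MovieLens dimensions):

demo_cdae = CDAE(input_dim=100, latent_dim=8, n_user=16)
demo_ratings = np.random.binomial(1, 0.1, size=(4, 100)).astype(np.float32)
demo_users = np.arange(4).reshape(-1, 1)
print(demo_cdae.model([demo_ratings, demo_users]).shape)  # (4, 100): one score per item, conditioned on the user embedding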
loader = tf.data.Dataset.from_tensor_slices({'rating': train.values, 'id': np.arange(len(train))})
loader = loader.batch(32, drop_remainder=True).shuffle(len(train))
model = CDAE(train.shape[1], 200, len(train))
model.compile(optimizer=tf.optimizers.Adam())
model.fit(loader, epochs=25)
Epoch 1/25 185/185 [==============================] - 7s 31ms/step - loss: 0.1558 Epoch 2/25 185/185 [==============================] - 6s 31ms/step - loss: 0.1036 Epoch 3/25 185/185 [==============================] - 6s 31ms/step - loss: 0.1007 Epoch 4/25 185/185 [==============================] - 6s 29ms/step - loss: 0.0972 Epoch 5/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0922 Epoch 6/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0884 Epoch 7/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0857 Epoch 8/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0833 Epoch 9/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0810 Epoch 10/25 185/185 [==============================] - 5s 29ms/step - loss: 0.0793 Epoch 11/25 185/185 [==============================] - 6s 29ms/step - loss: 0.0776 Epoch 12/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0763 Epoch 13/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0747 Epoch 14/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0735 Epoch 15/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0722 Epoch 16/25 185/185 [==============================] - 6s 31ms/step - loss: 0.0709 Epoch 17/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0699 Epoch 18/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0687 Epoch 19/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0678 Epoch 20/25 185/185 [==============================] - 6s 31ms/step - loss: 0.0668 Epoch 21/25 185/185 [==============================] - 6s 31ms/step - loss: 0.0660 Epoch 22/25 185/185 [==============================] - 6s 31ms/step - loss: 0.0649 Epoch 23/25 185/185 [==============================] - 6s 31ms/step - loss: 0.0640 Epoch 24/25 185/185 [==============================] - 6s 30ms/step - loss: 0.0633 Epoch 25/25 185/185 [==============================] - 6s 31ms/step - loss: 0.0624
<keras.callbacks.History at 0x7f12cdede850>
top_k = 10
np.random.seed(42)
scores = []
for idx, i in tqdm(enumerate(np.random.choice(train.index, 100))):
    item_to_pred = {item: pred for item, pred in zip(train.columns, model.model.predict([train.values, np.arange(len(train))])[idx])}

    test_ = test[(test['userId']==i) & (test['rating']==1)]['movieId'].values
    items = list(np.random.choice(list(filter(lambda x: x not in np.argwhere(train.values[idx]).flatten(), item_to_pred.keys())), 100)) + list(test_)

    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)

    score = eval_NDCG(test_, top_k_items)
    scores.append(score)
np.mean(scores)
0.2810483955912976
df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 0 | 1104 | 1 |
| 1 | 0 | 639 | 0 |
| 2 | 0 | 853 | 0 |
| 3 | 0 | 3177 | 1 |
| 4 | 0 | 2162 | 1 |
test_idx = []
for i in df['userId'].unique():
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))

train = df.iloc[list(set(df.index)-set(test_idx)),:]
test = df.iloc[test_idx, :]
df.shape, train.shape, test.shape
((570517, 3), (564569, 3), (5948, 3))
class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'userId'])
        items = self.item_enc.fit_transform(df.loc[:, 'movieId'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns userId, movieId and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        df = pd.DataFrame()
        items = self.item_enc.transform(items)
        dd = train.loc[train['userId'].isin(users)]
        dd['ci'] = self.item_enc.transform(dd['movieId'])
        dd['cu'] = self.user_enc.transform(dd['userId'])

        g = dd.groupby('userId')
        for user, group in tqdm(g):
            watched = set(group['ci'])
            candidates = [item for item in items if item not in watched]
            u = group['cu'].iloc[0]
            pred = np.take(self.pred[u, :], candidates)
            res = np.argpartition(pred, -k)[-k:]
            r = pd.DataFrame({
                "userId": [user] * len(res),
                "movieId": np.take(candidates, res),
                "score": np.take(pred, res)
            }).sort_values('score', ascending=False)
            df = df.append(r, ignore_index=True)

        df['movieId'] = self.item_enc.inverse_transform(df['movieId'])
        return df
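The fit method is the EASE closed form: with P = (XᵀX + λI)⁻¹, the item-item weight matrix is B = P / (-diag(P)) with its diagonal zeroed so an item cannot recommend itself, and user scores are simply X·B. A minimal numeric sketch on toy data (not the MovieLens frame) to see those properties:

toy = pd.DataFrame({'userId': [0, 0, 1, 1, 2], 'movieId': [0, 1, 1, 2, 0], 'rating': [1, 1, 1, 1, 1]})
toy_ease = EASE()
toy_ease.fit(toy, lambda_=0.5)
print(np.diag(toy_ease.B))   # all zeros by construction
print(toy_ease.pred.shape)   # (3, 3): one score per user-item pair, computed as X @ B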
ease = EASE()
ease.fit(train)
uid = 0
ease.user_enc.inverse_transform([0])[0]
0
ease.item_enc.inverse_transform(np.argsort(ease.pred[0]))
array([2845, 1204, 1502, ..., 957, 574, 581], dtype=int16)
np.argsort(-ease.pred[0])
array([ 565, 559, 903, ..., 1420, 1138, 2716])
ease.pred[0][np.argsort(-ease.pred[0])]
array([ 0.88894744, 0.86783598, 0.76730558, ..., -0.26904345, -0.29024257, -0.29286189])
np.unique(train[train['userId']==0]['movieId'])
array([ 0, 47, 144, 253, 513, 517, 574, 580, 581, 593, 740, 858, 877, 957, 963, 964, 970, 1025, 1104, 1117, 1154, 1178, 1195, 1421, 1439, 1574, 1658, 1727, 1781, 1782, 1838, 1848, 2102, 2162, 2205, 2488, 2557, 2586, 2592, 2599, 2710, 2889, 2969, 3177], dtype=int16)
pred = ease.predict(train, train['userId'].unique(), train['movieId'].unique(), 100)
pred
| | userId | movieId | score |
|---|---|---|---|
| 0 | 0 | 354 | 0.659144 |
| 1 | 0 | 2511 | 0.420217 |
| 2 | 0 | 2058 | 0.397685 |
| 3 | 0 | 853 | 0.382166 |
| 4 | 0 | 892 | 0.325232 |
| ... | ... | ... | ... |
| 594695 | 6037 | 1729 | 0.105471 |
| 594696 | 6037 | 1978 | 0.104400 |
| 594697 | 6037 | 1172 | 0.104144 |
| 594698 | 6037 | 27 | 0.103118 |
| 594699 | 6037 | 2128 | 0.102213 |

594700 rows × 3 columns
uid = 1
df[(df['userId']==uid) & (df['movieId'].isin(pred[pred['userId']==uid]['movieId']))]
| userId | movieId | rating |
|---|---|---|

(0 rows)
train[(train['userId']==uid) & (train['movieId'].isin(pred[pred['userId']==uid]['movieId']))]
| userId | movieId | rating |
|---|---|---|

(0 rows)
for uid in range(942):
    pdf = df[(df['userId']==uid) & (df['movieId'].isin(pred[pred['userId']==uid]['movieId']))]
ease.pred.shape
(5947, 3532)
train['userId'].unique().shape, train['movieId'].unique().shape,
((5947,), (3532,))
df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 0 | 1104 | 1 |
| 1 | 0 | 639 | 0 |
| 2 | 0 | 853 | 0 |
| 3 | 0 | 3177 | 1 |
| 4 | 0 | 2162 | 1 |
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(np.where(cnt >= 10)[0])].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:,:] = 0
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
train = df.iloc[list(set(df.index)-set(test_idx)),:]
test = df.iloc[test_idx, :]
for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy()
def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.random.normal(shape=(batch, dim), stddev=0.01)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon
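sampling implements the reparameterization trick: the draw is written as mu + exp(0.5*log_var)*eps so gradients flow through mu and log_var (note this notebook uses a 0.01 noise stddev rather than 1). A minimal sketch on toy tensors:

demo_mu = tf.zeros((2, 3))
demo_logvar = tf.math.log(4.0 * tf.ones((2, 3)))   # variance 4, i.e. sigma 2
print(sampling([demo_mu, demo_logvar]).shape)      # (2, 3): samples spread by sigma times the 0.01-stddev noise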
df.shape, train.shape, test.shape
((570517, 3), (5948, 3533), (5948, 3))
class MultVAE(tf.keras.models.Model):
    def __init__(self, input_dim, latent_dim, lamda=1e-4):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.anneal = 0.

        self.model = self.build()

    def compile(self, optimizer, loss_fn=None):
        super().compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def build(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

        inputs = self.encoder.input
        mu, log_var = self.encoder(inputs)
        h = sampling([mu, log_var])
        outputs = self.decoder(h)

        return Model(inputs, outputs)

    def build_encoder(self):
        inputs = Input(shape=(self.input_dim, ))
        h = Dropout(0.2)(inputs)
        mu = Dense(self.latent_dim)(h)
        log_var = Dense(self.latent_dim)(h)
        return Model(inputs, [mu, log_var])

    def build_decoder(self):
        inputs = Input(shape=(self.latent_dim, ))
        outputs = Dense(self.input_dim, activation='sigmoid')(inputs)
        return Model(inputs, outputs)

    def train_step(self, data):
        x = data
        with tf.GradientTape() as tape:
            mu, log_var = self.encoder(x)
            pred = self.model(x)

            kl_loss = tf.reduce_mean(tf.reduce_sum(0.5*(log_var + tf.exp(log_var) + tf.pow(mu, 2)-1), 1, keepdims=True))
            ce_loss = -tf.reduce_mean(tf.reduce_sum(tf.nn.log_softmax(pred) * x, -1))
            loss = ce_loss + kl_loss*self.anneal

        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return {'loss': loss}

    def predict(self, data):
        mu, log_var = self.encoder(data)
        return self.decoder(mu)
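The reconstruction term in train_step is the multinomial log-likelihood: each user's row is scored by -sum(x * log_softmax(pred)), which rewards putting probability mass on the items the user actually interacted with. A tiny illustration with toy numbers (the same expression as above, not the trained model):

x_demo = tf.constant([[1., 0., 1., 0.]])
logits_demo = tf.constant([[2.0, -1.0, 1.5, -0.5]])
ce_demo = -tf.reduce_sum(tf.nn.log_softmax(logits_demo) * x_demo, -1)
print(ce_demo.numpy())  # smaller when the large logits line up with the 1s in x_demo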
loader = tf.data.Dataset.from_tensor_slices(train.values.astype(np.float32))
loader = loader.batch(8, drop_remainder=True).shuffle(len(train))
model = MultVAE(train.shape[1], 200)
model.compile(optimizer=tf.optimizers.Adam())
class AnnealCallback(callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.anneal_cap = 0.3

    def on_train_batch_end(self, batch, logs=None):
        self.model.anneal = min(self.anneal_cap, self.model.anneal+1e-4)
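The callback raises the KL weight by 1e-4 after every batch until it reaches the 0.3 cap, so with 743 batches per epoch the cap is hit early in epoch 5 (about 3000 batches in). A quick sketch of that schedule:

print([min(0.3, step * 1e-4) for step in range(0, 6000, 1000)])  # 0.0, 0.1, 0.2, then capped at 0.3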
model.fit(loader, epochs=25, callbacks=[AnnealCallback()])
Epoch 1/25 743/743 [==============================] - 13s 16ms/step - loss: 730.4190 Epoch 2/25 743/743 [==============================] - 12s 16ms/step - loss: 722.0552 Epoch 3/25 743/743 [==============================] - 12s 16ms/step - loss: 719.0447 Epoch 4/25 743/743 [==============================] - 12s 16ms/step - loss: 717.2124 Epoch 5/25 743/743 [==============================] - 12s 17ms/step - loss: 715.4193 Epoch 6/25 743/743 [==============================] - 12s 16ms/step - loss: 713.9064 Epoch 7/25 743/743 [==============================] - 12s 17ms/step - loss: 711.3033 Epoch 8/25 743/743 [==============================] - 12s 16ms/step - loss: 709.6291 Epoch 9/25 743/743 [==============================] - 12s 16ms/step - loss: 707.6846 Epoch 10/25 743/743 [==============================] - 12s 16ms/step - loss: 707.0621 Epoch 11/25 743/743 [==============================] - 12s 16ms/step - loss: 705.8719 Epoch 12/25 743/743 [==============================] - 12s 16ms/step - loss: 704.4416 Epoch 13/25 743/743 [==============================] - 12s 16ms/step - loss: 703.5001 Epoch 14/25 743/743 [==============================] - 12s 16ms/step - loss: 703.4260 Epoch 15/25 743/743 [==============================] - 12s 16ms/step - loss: 703.5530 Epoch 16/25 743/743 [==============================] - 12s 16ms/step - loss: 701.2676 Epoch 17/25 743/743 [==============================] - 12s 16ms/step - loss: 700.5692 Epoch 18/25 743/743 [==============================] - 12s 16ms/step - loss: 700.5253 Epoch 19/25 743/743 [==============================] - 12s 16ms/step - loss: 699.8253 Epoch 20/25 743/743 [==============================] - 12s 16ms/step - loss: 700.0319 Epoch 21/25 743/743 [==============================] - 12s 16ms/step - loss: 699.0198 Epoch 22/25 743/743 [==============================] - 12s 16ms/step - loss: 699.0835 Epoch 23/25 743/743 [==============================] - 12s 16ms/step - loss: 698.7805 Epoch 24/25 743/743 [==============================] - 12s 16ms/step - loss: 698.1454 Epoch 25/25 743/743 [==============================] - 12s 16ms/step - loss: 698.6210
<keras.callbacks.History at 0x7fe770aee110>
top_k = 10
np.random.seed(42)
scores = []
for idx, i in tqdm(enumerate(np.random.choice(train.index, 100))):
    item_to_pred = {item: pred for item, pred in zip(train.columns, model.model.predict(train.values)[idx])}

    test_ = test[(test['userId']==i) & (test['rating']==1)]['movieId'].values
    items = list(np.random.choice(list(filter(lambda x: x not in np.argwhere(train.values[idx]).flatten(), item_to_pred.keys())), 100)) + list(test_)

    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)

    score = eval_NDCG(test_, top_k_items)
    scores.append(score)
np.mean(scores)
0.22758533052845625
df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 0 | 1104 | 1 |
| 1 | 0 | 639 | 0 |
| 2 | 0 | 853 | 0 |
| 3 | 0 | 3177 | 1 |
| 4 | 0 | 2162 | 1 |
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(np.where(cnt >= 10)[0])].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:,:] = 0
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
train = df.loc[list(set(df.index)-set(test_idx)),:]
test = df.loc[test_idx, :]
df
userId | movieId | rating | |
---|---|---|---|
0 | 0 | 1104 | 1 |
1 | 0 | 3177 | 1 |
2 | 0 | 2162 | 1 |
3 | 0 | 1195 | 1 |
4 | 0 | 2599 | 1 |
... | ... | ... | ... |
570512 | 6037 | 346 | 1 |
570513 | 6037 | 1120 | 1 |
570514 | 6037 | 1133 | 1 |
570515 | 6037 | 1204 | 1 |
570516 | 6037 | 1007 | 1 |
570517 rows × 3 columns
df.shape, train.shape, test.shape
((570517, 3), (564569, 3), (5948, 3))
for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy()
train
[Preview of the binary user–item interaction matrix: 5948 rows × 3533 columns, with 1.0 where a user has a retained positive interaction and 0.0 elsewhere.]
class DAE(tf.keras.models.Model):
    def __init__(self, input_dim, latent_dim, lamda=1e-4):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.lamda = lamda

        self.model = self.build()

    def compile(self, optimizer, loss_fn=None):
        super().compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def build(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

        inputs = self.encoder.input
        outputs = self.decoder(self.encoder(inputs))
        return Model(inputs, outputs)

    def build_encoder(self):
        inputs = Input(shape=(self.input_dim, ))

        encoder = Sequential()
        encoder.add(Dropout(0.2))
        encoder.add(Dense(self.latent_dim, activation='tanh'))

        outputs = encoder(inputs)
        return Model(inputs, outputs)

    def build_decoder(self):
        inputs = Input(shape=(self.latent_dim, ))

        decoder = Sequential()
        decoder.add(Dense(self.input_dim, activation='sigmoid'))

        outputs = decoder(inputs)
        return Model(inputs, outputs)

    def train_step(self, x):
        with tf.GradientTape() as tape:
            pred = self.model(x)
            rec_loss = tf.losses.binary_crossentropy(x, pred)
            loss = rec_loss

        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return {'loss': loss}
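DAE is the same denoising autoencoder as CDAE minus the user-embedding input: a single rating matrix goes in and a same-width reconstruction comes out. A quick toy shape check (dimensions made up):

demo_dae = DAE(input_dim=100, latent_dim=8)
demo_x = np.random.binomial(1, 0.1, size=(4, 100)).astype(np.float32)
print(demo_dae.model(demo_x).shape)  # (4, 100)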
loader = tf.data.Dataset.from_tensor_slices(train.values)
loader = loader.batch(32, drop_remainder=True).shuffle(len(df))
model = DAE(train.shape[1], 200)
model.compile(optimizer=tf.optimizers.Adam())
model.fit(loader, epochs = 25)
Epoch 1/25 185/185 [==============================] - 4s 16ms/step - loss: 0.1585 Epoch 2/25 185/185 [==============================] - 3s 16ms/step - loss: 0.1032 Epoch 3/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0940 Epoch 4/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0876 Epoch 5/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0823 Epoch 6/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0785 Epoch 7/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0758 Epoch 8/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0737 Epoch 9/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0719 Epoch 10/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0705 Epoch 11/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0693 Epoch 12/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0684 Epoch 13/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0675 Epoch 14/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0668 Epoch 15/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0661 Epoch 16/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0654 Epoch 17/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0649 Epoch 18/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0640 Epoch 19/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0635 Epoch 20/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0628 Epoch 21/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0621 Epoch 22/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0613 Epoch 23/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0606 Epoch 24/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0601 Epoch 25/25 185/185 [==============================] - 3s 16ms/step - loss: 0.0595
<keras.callbacks.History at 0x7f9bd8942850>
top_k = 10
np.random.seed(42)
scores = []
for idx, i in tqdm(enumerate(np.random.choice(train.index, 100))):
    item_to_pred = {item: pred for item, pred in zip(train.columns, model.model.predict(train.values)[idx])}

    test_ = test[(test['userId']==i) & (test['rating']==1)]['movieId'].values
    items = list(np.random.choice(list(filter(lambda x: x not in np.argwhere(train.values[idx]).flatten(), item_to_pred.keys())), 100)) + list(test_)

    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)

    score = eval_NDCG(test_, top_k_items)
    scores.append(score)
np.mean(scores)
0.2853871661973964
df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()
| | userId | movieId | rating |
|---|---|---|---|
| 0 | 0 | 1104 | 1 |
| 1 | 0 | 639 | 0 |
| 2 | 0 | 853 | 0 |
| 3 | 0 | 3177 | 1 |
| 4 | 0 | 2162 | 1 |
df = df[df['rating']==1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(np.where(cnt >= 10)[0])].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:,:] = 0
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId']==i].index, 1))
train = df.iloc[list(set(df.index)-set(test_idx)),:]
test = df.iloc[test_idx, :]
for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy().astype(np.float32)
loader = tf.data.Dataset.from_tensor_slices(train.values.astype(np.float32))
loader = loader.batch(8, drop_remainder=True).shuffle(len(train))
def log_norm_pdf(x, mu, logvar):
    return -0.5*(logvar + tf.math.log(2 * np.pi) + tf.pow((x - mu), 2) / tf.exp(logvar))
def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.random.normal(shape=(batch, dim), stddev=0.01)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon
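log_norm_pdf returns the element-wise Gaussian log-density. At x = mu with unit variance (logvar = 0) it should equal -0.5*log(2*pi) ≈ -0.919; a quick check:

print(log_norm_pdf(tf.constant(0.0), tf.constant(0.0), tf.constant(0.0)).numpy())  # ≈ -0.9189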
class CompositePrior(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim, mixture_weights=[3/20, 15/20, 2/20]):
        super().__init__()
        self.encoder_old = Encoder(x_dim, latent_dim, dropout_rate=0)
        self.latent_dim = latent_dim
        self.mixture_weights = mixture_weights

        self.mu_prior = self.add_weight(shape=(self.latent_dim, ), initializer=tf.zeros_initializer(), trainable=False)
        self.logvar_prior = self.add_weight(shape=(self.latent_dim, ), initializer=tf.zeros_initializer(), trainable=False)
        self.logvar_unif_prior = self.add_weight(shape=(self.latent_dim, ), initializer=tf.constant_initializer(10), trainable=False)

    def call(self, x, z):
        post_mu, post_logvar = self.encoder_old(x)

        stnd_prior = log_norm_pdf(z, self.mu_prior, self.logvar_prior)
        post_prior = log_norm_pdf(z, post_mu, post_logvar)
        unif_prior = log_norm_pdf(z, self.mu_prior, self.logvar_unif_prior)

        gaussians = [stnd_prior, post_prior, unif_prior]
        gaussians = [g+tf.math.log(w) for g, w in zip(gaussians, self.mixture_weights)]

        density = tf.stack(gaussians, -1)
        return tf.math.log(tf.reduce_sum(tf.exp(density), -1))  # logsumexp
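The last line combines the three mixture components with a hand-rolled logsumexp. A numerically safer equivalent (an alternative, not what the class above uses) is tf.reduce_logsumexp, which avoids underflow for very negative log-densities:

density_demo = tf.constant([[-1000.0, -1001.0]])
print(tf.reduce_logsumexp(density_demo, -1).numpy())                 # ≈ -999.69, stays finite
print(tf.math.log(tf.reduce_sum(tf.exp(density_demo), -1)).numpy())  # -inf: exp underflows before the log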
class Encoder(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim, dropout_rate=0.1):
        super().__init__()
        self.latent_dim = latent_dim
        self.x_dim = x_dim
        self.dropout_rate = dropout_rate

        self.model = self.build_model()

    def build_model(self):  # now just a shallow net
        x_in = Input(shape=(self.x_dim, ))
        h = Dense(1024, activation='relu')(x_in)
        mu = Dense(self.latent_dim)(h)
        logvar = Dense(self.latent_dim)(h)
        return Model(x_in, [mu, logvar])

    def call(self, x):
        norm = tf.sqrt(tf.reduce_sum(tf.pow(x, 2), -1, keepdims=True))
        x = x/norm

        if self.dropout_rate > 0:
            x = Dropout(self.dropout_rate)(x)

        return self.model(x)
class RecVAE(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim):
        super().__init__()
        self.encoder = Encoder(x_dim, latent_dim)
        self.decoder = Dense(x_dim)
        self.prior = CompositePrior(x_dim, latent_dim)

    def call(self, data):
        mu, logvar = self.encoder(data)
        z = sampling([mu, logvar])
        recon = self.decoder(z)
        return mu, logvar, z, recon

    def predict(self, data):
        mu, logvar = self.encoder(data)
        z = sampling([mu, logvar])
        recon = self.decoder(z)
        return recon

    def update_prior(self):
        self.prior.encoder_old.set_weights(self.encoder.get_weights())
def tf_train(model, loader, optimizer, target, gamma=1.):
    total_loss = 0.
    for x in loader:
        norm = tf.reduce_sum(x, -1, keepdims=True)
        kl_weight = gamma*norm

        with tf.GradientTape() as tape:
            mu, logvar, z, pred = model(x)
            # kl_loss = tf.reduce_mean(tf.reduce_sum(0.5*(logvar + tf.exp(logvar) + tf.pow(mu, 2)-1), 1, keepdims=True))
            kl_loss = tf.reduce_mean(log_norm_pdf(z, mu, logvar) - tf.multiply(model.prior(x, z), kl_weight))
            ce_loss = -tf.reduce_mean(tf.reduce_sum(tf.nn.log_softmax(pred) * x, -1))
            loss = ce_loss + kl_loss*kl_weight

        if target == 'encoder':
            grads = tape.gradient(loss, model.encoder.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.encoder.trainable_weights))
        else:
            grads = tape.gradient(loss, model.decoder.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.decoder.trainable_weights))

        total_loss += tf.reduce_sum(loss)
    return total_loss
epochs = 25
model = RecVAE(train.shape[1], 200)
enc_opt = optimizers.Adam()
dec_opt = optimizers.Adam()
for e in range(epochs):
    # alternating optimization
    ## train step
    tf_train(model, loader, enc_opt, 'encoder')
    model.update_prior()
    tf_train(model, loader, dec_opt, 'decoder')

    ## eval step
    top_k = 10
    np.random.seed(42)
    scores = []
    for idx, i in tqdm(enumerate(np.random.choice(train.index, 100))):
        item_to_pred = {item: pred.numpy() for item, pred in zip(train.columns, model.predict(train.values)[idx])}
        test_ = test[(test['userId']==i) & (test['rating']==1)]['movieId'].values
        items = list(np.random.choice(list(filter(lambda x: x not in np.argwhere(train.values[idx]).flatten(), item_to_pred.keys())), 100)) + list(test_)
        top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
        score = eval_NDCG(test_, top_k_items)
        scores.append(score)
    # break

np.mean(scores)
0.0031546487678572877
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d
Author: Sparsh A.

Last updated: 2021-12-17 04:46:00

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython   : 5.5.0
matplotlib: 3.2.2
sys       : 3.7.12 (default, Sep 10 2021, 00:21:48) [GCC 7.5.0]
tensorflow: 2.7.0
pandas    : 1.1.5
numpy     : 1.19.5
seaborn   : 0.11.2
keras     : 2.7.0
google    : 2.0.3
END