import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os, sys, inspect
import gc
from tqdm.notebook import tqdm
import random
import heapq

from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply
from tensorflow.keras.models import Model, Sequential, load_model

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        print(e)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

def mish(x):
    return x * tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor * x)

def load_data(filepath, threshold=0):
    df = pd.read_csv(filepath, sep="::", header=None, engine='python',
                     names=['userId', 'movieId', 'rating', 'time'])
    df = df.drop('time', axis=1)
    df['userId'] = df['userId'].astype(int)
    df['movieId'] = df['movieId'].astype(int)
    df['rating'] = df['rating'].astype(float)
    df = df[['userId', 'movieId', 'rating']]
    # Binarize: ratings above the threshold become positive interactions.
    if threshold > 0:
        df['rating'] = np.where(df['rating'] > threshold, 1, 0)
    else:
        df['rating'] = 1.
    # Re-encode user/item ids as contiguous category codes.
    m_codes = df['movieId'].astype('category').cat.codes
    u_codes = df['userId'].astype('category').cat.codes
    df['movieId'] = m_codes
    df['userId'] = u_codes
    return df

def add_negative(df, uiid, times=4):
    df_ = df.copy()
    user_id = df_['userId'].unique()
    item_id = df_['movieId'].unique()
    rows = []
    for i in tqdm(user_id):
        n = len(df_[df_['userId'] == i])
        n_negative = min(n * times, len(item_id) - n - 1)
        available_negative = list(set(uiid) - set(df[df['userId'] == i]['movieId'].values))
        new = np.random.choice(available_negative, n_negative, replace=False)
        rows += [[i, j, 0] for j in new]
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df_ = pd.concat([df_, pd.DataFrame(rows, columns=df.columns)], ignore_index=True)
    return df_

def extract_from_df(df, n_positive, n_negative):
    rtd = []
    user_id = df['userId'].unique()
    for i in tqdm(user_id):
        # Combine masks with & rather than chained indexing, which misaligns
        # when the two boolean Series have different indices.
        rtd += list(np.random.choice(df[(df['userId'] == i) & (df['rating'] == 1)]['movieId'].index,
                                     n_positive, replace=False))
        rtd += list(np.random.choice(df[(df['userId'] == i) & (df['rating'] == 0)]['movieId'].index,
                                     n_negative, replace=False))
    return rtd

def eval_NDCG(true, pred):
    # Leave-one-out NDCG: with a single relevant item, DCG reduces to
    # 1/log2(rank+1) if the held-out item appears in the top-k, else 0.
    for i, item in enumerate(pred, 1):
        if item == true:
            return 1 / np.log2(i + 1)
    return 0

df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()

df = df[df['rating'] == 1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
# Keep only users with at least 10 positive interactions
# (index labels, not positions, to stay robust to gaps in user ids).
df = df[df['userId'].isin(cnt[cnt >= 10].index)].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:, :] = 0

# Leave-one-out split: hold out one interaction per user for testing.
test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId'] == i].index, 1))
train = df.loc[list(set(df.index) - set(test_idx)), :]
test = df.loc[test_idx, :]

df
df.shape, train.shape, test.shape

for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy()
train
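# --- Added sanity checks (not in the original notebook) ---
# Quick illustration of the leave-one-out split and of eval_NDCG's behavior:
# with a single relevant item, NDCG is 1/log2(rank+1) for a hit and 0 for a miss.
assert len(test_idx) == tdf.shape[0]                         # one held-out interaction per user
assert eval_NDCG(5, [5, 1, 2]) == 1.0                        # hit at rank 1 -> 1/log2(2) = 1
assert abs(eval_NDCG(5, [1, 5, 2]) - 1 / np.log2(3)) < 1e-9  # hit at rank 2 -> 1/log2(3)
assert eval_NDCG(5, [1, 2, 3]) == 0                          # no hit -> 0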
class CDAE(tf.keras.models.Model):
    def __init__(self, input_dim, latent_dim, n_user, lamda=1e-4):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.lamda = lamda
        self.n_user = n_user
        # Per-user embedding added to the encoded ratings (the "collaborative" part of CDAE).
        self.embedding = Embedding(n_user, latent_dim)
        self.model = self.build()

    def compile(self, optimizer, loss_fn=None):
        super().compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def build(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

        rating = Input(shape=(self.input_dim,), name='rating_input')
        user_id = Input(shape=(1,), name='user_input')

        emb = self.embedding(user_id)
        emb = tf.squeeze(emb, 1)
        enc = self.encoder(rating) + emb
        enc = tf.nn.tanh(enc)
        outputs = self.decoder(enc)
        return Model([rating, user_id], outputs)

    def build_encoder(self):
        inputs = Input(shape=(self.input_dim,))
        encoder = Sequential()
        encoder.add(Dropout(0.2))  # input corruption (denoising)
        encoder.add(Dense(self.latent_dim, activation='tanh'))
        outputs = encoder(inputs)
        return Model(inputs, outputs)

    def build_decoder(self):
        inputs = Input(shape=(self.latent_dim,))
        decoder = Sequential()
        decoder.add(Dense(self.input_dim, activation='sigmoid'))
        outputs = decoder(inputs)
        return Model(inputs, outputs)

    def train_step(self, data):
        x = data['rating']
        user_ids = data['id']
        with tf.GradientTape() as tape:
            pred = self.model([x, user_ids])
            rec_loss = tf.losses.binary_crossentropy(x, pred)
            loss = rec_loss
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return {'loss': loss}

loader = tf.data.Dataset.from_tensor_slices({'rating': train.values, 'id': np.arange(len(train))})
# Shuffle before batching so individual examples (not whole batches) are reshuffled.
loader = loader.shuffle(len(train)).batch(32, drop_remainder=True)

model = CDAE(train.shape[1], 200, len(train))
model.compile(optimizer=tf.optimizers.Adam())
model.fit(loader, epochs=25)

top_k = 10
np.random.seed(42)

# Compute predictions once for all users, then score 100 sampled users.
preds = model.model.predict([train.values, np.arange(len(train))])
scores = []
for i in tqdm(np.random.choice(train.index, 100)):
    # Map the sampled userId back to its row position in the matrix;
    # the enumeration order of the sample does not match the row order.
    row = train.index.get_loc(i)
    item_to_pred = {item: pred for item, pred in zip(train.columns, preds[row])}
    test_ = test[(test['userId'] == i) & (test['rating'] == 1)]['movieId'].values
    # Compare against column labels (movieIds), not positional indices.
    watched = set(train.columns[train.values[row] > 0])
    items = list(np.random.choice([c for c in train.columns if c not in watched], 100)) + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
    score = eval_NDCG(test_, top_k_items)
    scores.append(score)
np.mean(scores)

df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()

test_idx = []
for i in df['userId'].unique():
    test_idx += list(np.random.choice(df[df['userId'] == i].index, 1))
train = df.iloc[list(set(df.index) - set(test_idx)), :]
test = df.iloc[test_idx, :]
df.shape, train.shape, test.shape
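# --- Added note (not in the original notebook) ---
# EASE (Steck, 2019) has a closed-form solution: with G = X^T X + lambda*I and
# P = G^{-1}, the item-item weight matrix is B = P / (-diag(P)) with its
# diagonal zeroed, minimizing ||X - XB||^2_F + lambda*||B||^2_F subject to
# diag(B) = 0. A toy sketch of that algebra on a 3-item interaction matrix:
X_toy = np.array([[1., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 1.]])
G_toy = X_toy.T @ X_toy + 0.5 * np.eye(3)
P_toy = np.linalg.inv(G_toy)
B_toy = P_toy / (-np.diag(P_toy))
np.fill_diagonal(B_toy, 0.)
print(X_toy @ B_toy)  # scores for all user-item pairs; the zero diagonal of B prevents self-recommendation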
class EASE:
    def __init__(self):
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        users = self.user_enc.fit_transform(df.loc[:, 'userId'])
        items = self.item_enc.fit_transform(df.loc[:, 'movieId'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term
        implicit: if True, ratings are ignored and taken as 1, else normalized ratings are used
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        X = csr_matrix((values, (users, items)))
        self.X = X

        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        B = P / (-np.diag(P))
        B[diagIndices] = 0
        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        items = self.item_enc.transform(items)
        # Work on a copy to avoid SettingWithCopy warnings on the slice.
        dd = train.loc[train['userId'].isin(users)].copy()
        dd['ci'] = self.item_enc.transform(dd['movieId'])
        dd['cu'] = self.user_enc.transform(dd['userId'])
        g = dd.groupby('userId')
        results = []
        for user, group in tqdm(g):
            watched = set(group['ci'])
            candidates = [item for item in items if item not in watched]
            u = group['cu'].iloc[0]
            pred = np.take(self.pred[u, :], candidates)
            res = np.argpartition(pred, -k)[-k:]
            r = pd.DataFrame({
                "userId": [user] * len(res),
                "movieId": np.take(candidates, res),
                "score": np.take(pred, res)
            }).sort_values('score', ascending=False)
            results.append(r)
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        df = pd.concat(results, ignore_index=True)
        df['movieId'] = self.item_enc.inverse_transform(df['movieId'])
        return df

ease = EASE()
ease.fit(train)

uid = 0
ease.user_enc.inverse_transform([0])[0]
ease.item_enc.inverse_transform(np.argsort(ease.pred[0]))
np.argsort(-ease.pred[0])
ease.pred[0][np.argsort(-ease.pred[0])]
np.unique(train[train['userId'] == 0]['movieId'])

pred = ease.predict(train, train['userId'].unique(), train['movieId'].unique(), 100)
pred

uid = 1
df[(df['userId'] == uid) & (df['movieId'].isin(pred[pred['userId'] == uid]['movieId']))]
train[(train['userId'] == uid) & (train['movieId'].isin(pred[pred['userId'] == uid]['movieId']))]

# Spot-check the overlap between recommendations and known interactions per user.
for uid in range(942):
    pdf = df[(df['userId'] == uid) & (df['movieId'].isin(pred[pred['userId'] == uid]['movieId']))]

ease.pred.shape
train['userId'].unique().shape, train['movieId'].unique().shape

df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()

df = df[df['rating'] == 1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(cnt[cnt >= 10].index)].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:, :] = 0

test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId'] == i].index, 1))
train = df.iloc[list(set(df.index) - set(test_idx)), :]
test = df.iloc[test_idx, :]

for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy()

def sampling(args):
    # Reparameterization trick: z = mu + sigma * eps, with eps ~ N(0, 0.01^2).
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.random.normal(shape=(batch, dim), stddev=0.01)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

df.shape, train.shape, test.shape
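# --- Added note (not in the original notebook) ---
# sampling() above implements the reparameterization trick, so gradients can
# flow through mu and log_var even though z is random. A quick shape check:
mu_demo = tf.zeros((2, 4))
log_var_demo = tf.zeros((2, 4))
z_demo = sampling([mu_demo, log_var_demo])
print(z_demo.shape)  # (2, 4); entries are near 0 because stddev=0.01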
class MultVAE(tf.keras.models.Model):
    def __init__(self, input_dim, latent_dim, lamda=1e-4):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.anneal = 0.  # KL weight, annealed upward during training
        self.model = self.build()

    def compile(self, optimizer, loss_fn=None):
        super().compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def build(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

        inputs = self.encoder.input
        mu, log_var = self.encoder(inputs)
        h = sampling([mu, log_var])
        outputs = self.decoder(h)
        return Model(inputs, outputs)

    def build_encoder(self):
        inputs = Input(shape=(self.input_dim,))
        h = Dropout(0.2)(inputs)
        mu = Dense(self.latent_dim)(h)
        log_var = Dense(self.latent_dim)(h)
        return Model(inputs, [mu, log_var])

    def build_decoder(self):
        inputs = Input(shape=(self.latent_dim,))
        outputs = Dense(self.input_dim, activation='sigmoid')(inputs)
        return Model(inputs, outputs)

    def train_step(self, data):
        x = data
        with tf.GradientTape() as tape:
            mu, log_var = self.encoder(x)
            pred = self.model(x)
            # Analytic KL between N(mu, sigma^2) and the standard normal prior.
            kl_loss = tf.reduce_mean(tf.reduce_sum(
                0.5 * (log_var + tf.exp(log_var) + tf.pow(mu, 2) - 1), 1, keepdims=True))
            # Multinomial log-likelihood over the user's item distribution.
            ce_loss = -tf.reduce_mean(tf.reduce_sum(tf.nn.log_softmax(pred) * x, -1))
            loss = ce_loss + kl_loss * self.anneal
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return {'loss': loss}

    def predict(self, data):
        # Use the posterior mean (no sampling) at inference time.
        mu, log_var = self.encoder(data)
        return self.decoder(mu)

loader = tf.data.Dataset.from_tensor_slices(train.values.astype(np.float32))
loader = loader.shuffle(len(train)).batch(8, drop_remainder=True)

model = MultVAE(train.shape[1], 200)
model.compile(optimizer=tf.optimizers.Adam())

class AnnealCallback(callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.anneal_cap = 0.3

    def on_train_batch_end(self, batch, logs=None):
        # Linearly increase the KL weight by 1e-4 per batch up to the cap.
        self.model.anneal = min(self.anneal_cap, self.model.anneal + 1e-4)

model.fit(loader, epochs=25, callbacks=[AnnealCallback()])

top_k = 10
np.random.seed(42)

preds = model.model.predict(train.values)
scores = []
for i in tqdm(np.random.choice(train.index, 100)):
    row = train.index.get_loc(i)  # map userId to its row position
    item_to_pred = {item: pred for item, pred in zip(train.columns, preds[row])}
    test_ = test[(test['userId'] == i) & (test['rating'] == 1)]['movieId'].values
    watched = set(train.columns[train.values[row] > 0])
    items = list(np.random.choice([c for c in train.columns if c not in watched], 100)) + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
    scores.append(eval_NDCG(test_, top_k_items))
np.mean(scores)

df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()

df = df[df['rating'] == 1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(cnt[cnt >= 10].index)].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:, :] = 0

test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId'] == i].index, 1))
train = df.loc[list(set(df.index) - set(test_idx)), :]
test = df.loc[test_idx, :]

df
df.shape, train.shape, test.shape

for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy()
train
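# --- Added note (not in the original notebook) ---
# With anneal increasing by 1e-4 per batch and a cap of 0.3, the KL weight in
# MultVAE above ramps up linearly over the first 3000 training batches:
steps = np.arange(20000)
anneal_demo = np.minimum(0.3, steps * 1e-4)
print(anneal_demo[[0, 1500, 3000, 19999]])  # [0.   0.15 0.3  0.3 ]
# The DAE below is the same denoising autoencoder as CDAE without the
# per-user embedding term.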
class DAE(tf.keras.models.Model):
    def __init__(self, input_dim, latent_dim, lamda=1e-4):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.lamda = lamda
        self.model = self.build()

    def compile(self, optimizer, loss_fn=None):
        super().compile()
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def build(self):
        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()
        inputs = self.encoder.input
        outputs = self.decoder(self.encoder(inputs))
        return Model(inputs, outputs)

    def build_encoder(self):
        inputs = Input(shape=(self.input_dim,))
        encoder = Sequential()
        encoder.add(Dropout(0.2))  # input corruption (denoising)
        encoder.add(Dense(self.latent_dim, activation='tanh'))
        outputs = encoder(inputs)
        return Model(inputs, outputs)

    def build_decoder(self):
        inputs = Input(shape=(self.latent_dim,))
        decoder = Sequential()
        decoder.add(Dense(self.input_dim, activation='sigmoid'))
        outputs = decoder(inputs)
        return Model(inputs, outputs)

    def train_step(self, x):
        with tf.GradientTape() as tape:
            pred = self.model(x)
            rec_loss = tf.losses.binary_crossentropy(x, pred)
            loss = rec_loss
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        return {'loss': loss}

loader = tf.data.Dataset.from_tensor_slices(train.values)
loader = loader.shuffle(len(train)).batch(32, drop_remainder=True)

model = DAE(train.shape[1], 200)
model.compile(optimizer=tf.optimizers.Adam())
model.fit(loader, epochs=25)

top_k = 10
np.random.seed(42)

preds = model.model.predict(train.values)
scores = []
for i in tqdm(np.random.choice(train.index, 100)):
    row = train.index.get_loc(i)  # map userId to its row position
    item_to_pred = {item: pred for item, pred in zip(train.columns, preds[row])}
    test_ = test[(test['userId'] == i) & (test['rating'] == 1)]['movieId'].values
    watched = set(train.columns[train.values[row] > 0])
    items = list(np.random.choice([c for c in train.columns if c not in watched], 100)) + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
    scores.append(eval_NDCG(test_, top_k_items))
np.mean(scores)

df = load_data('./ml-1m/ratings.dat', threshold=3)
df.head()

df = df[df['rating'] == 1].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
cnt = tdf.sum(1)
df = df[df['userId'].isin(cnt[cnt >= 10].index)].reset_index(drop=True)
tdf = pd.pivot_table(df, index='userId', values='rating', columns='movieId').fillna(0)
tdf.iloc[:, :] = 0

test_idx = []
for i in tdf.index:
    test_idx += list(np.random.choice(df[df['userId'] == i].index, 1))
train = df.iloc[list(set(df.index) - set(test_idx)), :]
test = df.iloc[test_idx, :]

for uid, iid in zip(train['userId'].values, train['movieId'].values):
    tdf.loc[uid, iid] = 1
train = tdf.copy().astype(np.float32)

loader = tf.data.Dataset.from_tensor_slices(train.values.astype(np.float32))
loader = loader.shuffle(len(train)).batch(8, drop_remainder=True)

def log_norm_pdf(x, mu, logvar):
    # Elementwise log-density of a diagonal Gaussian N(mu, exp(logvar)).
    return -0.5 * (logvar + tf.math.log(2 * np.pi) + tf.pow((x - mu), 2) / tf.exp(logvar))

def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.random.normal(shape=(batch, dim), stddev=0.01)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon
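# --- Added sanity check (not in the original notebook) ---
# log_norm_pdf returns the elementwise log-density of N(mu, exp(logvar)).
# At x = mu with logvar = 0 it should equal -0.5*log(2*pi) ≈ -0.9189:
val_demo = log_norm_pdf(tf.constant(0.), tf.constant(0.), tf.constant(0.))
print(val_demo.numpy(), -0.5 * np.log(2 * np.pi))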
class CompositePrior(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim, mixture_weights=[3/20, 15/20, 2/20]):
        super().__init__()
        # Frozen copy of the encoder from the previous alternation step.
        self.encoder_old = Encoder(x_dim, latent_dim, dropout_rate=0)
        self.latent_dim = latent_dim
        self.mixture_weights = mixture_weights

        self.mu_prior = self.add_weight(shape=(self.latent_dim,), initializer=tf.zeros_initializer(), trainable=False)
        self.logvar_prior = self.add_weight(shape=(self.latent_dim,), initializer=tf.zeros_initializer(), trainable=False)
        self.logvar_unif_prior = self.add_weight(shape=(self.latent_dim,), initializer=tf.constant_initializer(10), trainable=False)

    def call(self, x, z):
        post_mu, post_logvar = self.encoder_old(x)

        # Mixture of a standard normal, the old posterior, and a wide
        # "uniform-like" normal, as in the RecVAE paper.
        stnd_prior = log_norm_pdf(z, self.mu_prior, self.logvar_prior)
        post_prior = log_norm_pdf(z, post_mu, post_logvar)
        unif_prior = log_norm_pdf(z, self.mu_prior, self.logvar_unif_prior)

        gaussians = [stnd_prior, post_prior, unif_prior]
        gaussians = [g + tf.math.log(w) for g, w in zip(gaussians, self.mixture_weights)]

        density = tf.stack(gaussians, -1)
        return tf.reduce_logsumexp(density, -1)  # numerically stable log-sum-exp

class Encoder(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim, dropout_rate=0.1):
        super().__init__()
        self.latent_dim = latent_dim
        self.x_dim = x_dim
        self.dropout_rate = dropout_rate
        self.model = self.build_model()

    def build_model(self):
        # For now just a shallow net.
        x_in = Input(shape=(self.x_dim,))
        h = Dense(1024, activation='relu')(x_in)
        mu = Dense(self.latent_dim)(h)
        logvar = Dense(self.latent_dim)(h)
        return Model(x_in, [mu, logvar])

    def call(self, x, training=False):
        # L2-normalize each user vector before encoding.
        norm = tf.sqrt(tf.reduce_sum(tf.pow(x, 2), -1, keepdims=True))
        x = x / norm
        if training and self.dropout_rate > 0:
            # Apply dropout explicitly; a bare Dropout layer called here
            # would be inactive outside of fit().
            x = tf.nn.dropout(x, rate=self.dropout_rate)
        return self.model(x)

class RecVAE(tf.keras.models.Model):
    def __init__(self, x_dim, latent_dim):
        super().__init__()
        self.encoder = Encoder(x_dim, latent_dim)
        self.decoder = Dense(x_dim)
        self.prior = CompositePrior(x_dim, latent_dim)

    def call(self, data, training=False):
        mu, logvar = self.encoder(data, training=training)
        z = sampling([mu, logvar])
        recon = self.decoder(z)
        return mu, logvar, z, recon

    def predict(self, data):
        mu, logvar = self.encoder(data)
        z = sampling([mu, logvar])
        recon = self.decoder(z)
        return recon

    def update_prior(self):
        # Copy the current encoder weights into the frozen "old posterior" encoder.
        self.prior.encoder_old.set_weights(self.encoder.get_weights())

def tf_train(model, loader, optimizer, target, gamma=1.):
    total_loss = 0.
    for x in loader:
        # Per-user KL weight proportional to the number of interactions.
        norm = tf.reduce_sum(x, -1, keepdims=True)
        kl_weight = gamma * norm

        with tf.GradientTape() as tape:
            mu, logvar, z, pred = model(x, training=True)
            # Alternative: analytic KL against a standard normal prior (as in MultVAE):
            # kl_loss = tf.reduce_mean(tf.reduce_sum(0.5*(logvar + tf.exp(logvar) + tf.pow(mu, 2)-1), 1, keepdims=True))
            kl_loss = tf.reduce_mean(log_norm_pdf(z, mu, logvar) - tf.multiply(model.prior(x, z), kl_weight))
            ce_loss = -tf.reduce_mean(tf.reduce_sum(tf.nn.log_softmax(pred) * x, -1))
            # kl_weight is already applied inside kl_loss, so don't multiply again.
            loss = ce_loss + kl_loss
        if target == 'encoder':
            grads = tape.gradient(loss, model.encoder.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.encoder.trainable_weights))
        else:
            grads = tape.gradient(loss, model.decoder.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.decoder.trainable_weights))
        total_loss += loss
    return total_loss

epochs = 25

model = RecVAE(train.shape[1], 200)
enc_opt = optimizers.Adam()
dec_opt = optimizers.Adam()

# Alternating optimization: train the encoder, refresh the composite prior,
# then train the decoder.
for e in range(epochs):
    tf_train(model, loader, enc_opt, 'encoder')
    model.update_prior()
    tf_train(model, loader, dec_opt, 'decoder')

top_k = 10
np.random.seed(42)

preds = model.predict(train.values)
scores = []
for i in tqdm(np.random.choice(train.index, 100)):
    row = train.index.get_loc(i)  # map userId to its row position
    item_to_pred = {item: pred.numpy() for item, pred in zip(train.columns, preds[row])}
    test_ = test[(test['userId'] == i) & (test['rating'] == 1)]['movieId'].values
    watched = set(train.columns[train.values[row] > 0])
    items = list(np.random.choice([c for c in train.columns if c not in watched], 100)) + list(test_)
    top_k_items = heapq.nlargest(top_k, items, key=item_to_pred.get)
    scores.append(eval_NDCG(test_, top_k_items))
np.mean(scores)

!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d