import torch
import torch.nn as nn
import torch.nn.functional as F
class NGCF(nn.Module):
    """Neural Graph Collaborative Filtering (Wang et al., SIGIR 2019).

    Propagates user/item embeddings over the normalized user-item adjacency
    graph for `len(self.layers)` hops and concatenates the per-layer
    (L2-normalized) embeddings as the final representation.
    """

    def __init__(self, n_user, n_item, norm_adj, args):
        """
        Args:
            n_user: number of users.
            n_item: number of items.
            norm_adj: scipy sparse (n_user+n_item) x (n_user+n_item)
                normalized adjacency matrix.
            args: parsed CLI namespace (device, embed_size, batch_size,
                node_dropout, mess_dropout, layer_size, regs).
        """
        super(NGCF, self).__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.device = args.device
        self.emb_size = args.embed_size
        self.batch_size = args.batch_size  # fix: was assigned twice
        # node_dropout arrives as a one-element list; only the first is used.
        self.node_dropout = args.node_dropout[0]
        self.mess_dropout = args.mess_dropout
        self.norm_adj = norm_adj
        # NOTE: eval() on these strings is only safe because they come from
        # the argparse defaults; ast.literal_eval would be safer for
        # untrusted input.
        self.layers = eval(args.layer_size)
        self.decay = eval(args.regs)[0]
        """
        *********************************************************
        Init the weight of user-item.
        """
        self.embedding_dict, self.weight_dict = self.init_weight()
        """
        *********************************************************
        Get sparse adj.
        """
        self.sparse_norm_adj = self._convert_sp_mat_to_sp_tensor(self.norm_adj).to(self.device)

    def init_weight(self):
        """Xavier-initialize the embedding tables and per-layer GC weights."""
        initializer = nn.init.xavier_uniform_
        embedding_dict = nn.ParameterDict({
            'user_emb': nn.Parameter(initializer(torch.empty(self.n_user,
                                                             self.emb_size))),
            'item_emb': nn.Parameter(initializer(torch.empty(self.n_item,
                                                             self.emb_size)))
        })
        weight_dict = nn.ParameterDict()
        layers = [self.emb_size] + self.layers
        for k in range(len(self.layers)):
            # W_gc/b_gc transform the aggregated neighbor messages;
            # W_bi/b_bi transform the element-wise (bi) interaction messages.
            weight_dict.update({'W_gc_%d' % k: nn.Parameter(initializer(torch.empty(layers[k],
                                                                                    layers[k + 1])))})
            weight_dict.update({'b_gc_%d' % k: nn.Parameter(initializer(torch.empty(1, layers[k + 1])))})
            weight_dict.update({'W_bi_%d' % k: nn.Parameter(initializer(torch.empty(layers[k],
                                                                                    layers[k + 1])))})
            weight_dict.update({'b_bi_%d' % k: nn.Parameter(initializer(torch.empty(1, layers[k + 1])))})
        return embedding_dict, weight_dict

    def _convert_sp_mat_to_sp_tensor(self, X):
        """Convert a scipy sparse matrix to a torch sparse COO tensor."""
        coo = X.tocoo()
        i = torch.LongTensor([coo.row, coo.col])
        v = torch.from_numpy(coo.data).float()
        # fix: torch.sparse.FloatTensor is deprecated; use sparse_coo_tensor.
        return torch.sparse_coo_tensor(i, v, coo.shape)

    def sparse_dropout(self, x, rate, noise_shape):
        """Drop entries of sparse tensor x with probability `rate`.

        Kept entries are rescaled by 1/(1-rate) (inverted dropout), so the
        expected value of the matrix is unchanged.
        """
        keep_prob = 1 - rate
        # floor(keep_prob + U[0,1)) is 1 with probability keep_prob.
        random_tensor = keep_prob + torch.rand(noise_shape).to(x.device)
        dropout_mask = torch.floor(random_tensor).type(torch.bool)
        i = x._indices()[:, dropout_mask]
        v = x._values()[dropout_mask]
        out = torch.sparse_coo_tensor(i, v, x.shape).to(x.device)
        return out * (1. / keep_prob)

    def create_bpr_loss(self, users, pos_items, neg_items):
        """BPR loss plus L2 embedding regularization.

        Returns:
            (total_loss, mf_loss, emb_loss) tuple of scalar tensors.
        """
        pos_scores = torch.sum(torch.mul(users, pos_items), axis=1)
        neg_scores = torch.sum(torch.mul(users, neg_items), axis=1)
        maxi = nn.LogSigmoid()(pos_scores - neg_scores)
        mf_loss = -1 * torch.mean(maxi)
        # L2 regularizer over the embeddings used in this batch.
        regularizer = (torch.norm(users) ** 2
                       + torch.norm(pos_items) ** 2
                       + torch.norm(neg_items) ** 2) / 2
        emb_loss = self.decay * regularizer / self.batch_size
        return mf_loss + emb_loss, mf_loss, emb_loss

    def rating(self, u_g_embeddings, pos_i_g_embeddings):
        """All-pairs user-item scores: (n_users_batch, n_items_batch)."""
        return torch.matmul(u_g_embeddings, pos_i_g_embeddings.t())

    def forward(self, users, pos_items, neg_items, drop_flag=True):
        """Propagate embeddings and look up the requested users/items.

        Returns:
            (user_emb, pos_item_emb, neg_item_emb), each of width
            emb_size * (len(self.layers) + 1) from layer concatenation.
        """
        A_hat = (self.sparse_dropout(self.sparse_norm_adj,
                                     self.node_dropout,
                                     self.sparse_norm_adj._nnz())
                 if drop_flag else self.sparse_norm_adj)
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'],
                                    self.embedding_dict['item_emb']], 0)
        all_embeddings = [ego_embeddings]
        for k in range(len(self.layers)):
            side_embeddings = torch.sparse.mm(A_hat, ego_embeddings)
            # transformed sum messages of neighbors.
            sum_embeddings = torch.matmul(side_embeddings, self.weight_dict['W_gc_%d' % k]) \
                             + self.weight_dict['b_gc_%d' % k]
            # bi messages of neighbors: element-wise product, then transform.
            bi_embeddings = torch.mul(ego_embeddings, side_embeddings)
            bi_embeddings = torch.matmul(bi_embeddings, self.weight_dict['W_bi_%d' % k]) \
                            + self.weight_dict['b_bi_%d' % k]
            # non-linear activation.
            ego_embeddings = nn.LeakyReLU(negative_slope=0.2)(sum_embeddings + bi_embeddings)
            # message dropout.
            ego_embeddings = nn.Dropout(self.mess_dropout[k])(ego_embeddings)
            # normalize the distribution of embeddings.
            norm_embeddings = F.normalize(ego_embeddings, p=2, dim=1)
            all_embeddings += [norm_embeddings]
        all_embeddings = torch.cat(all_embeddings, 1)
        u_g_embeddings = all_embeddings[:self.n_user, :]
        i_g_embeddings = all_embeddings[self.n_user:, :]
        """
        *********************************************************
        look up.
        """
        u_g_embeddings = u_g_embeddings[users, :]
        pos_i_g_embeddings = i_g_embeddings[pos_items, :]
        neg_i_g_embeddings = i_g_embeddings[neg_items, :]
        return u_g_embeddings, pos_i_g_embeddings, neg_i_g_embeddings
!mkdir gowalla
%cd gowalla
!wget -q --show-progress https://github.com/huangtinglin/NGCF-PyTorch/raw/master/Data/gowalla/train.txt
!wget -q --show-progress https://github.com/huangtinglin/NGCF-PyTorch/raw/master/Data/gowalla/test.txt
!wget -q --show-progress https://github.com/huangtinglin/NGCF-PyTorch/raw/master/Data/gowalla/user_list.txt
!wget -q --show-progress https://github.com/huangtinglin/NGCF-PyTorch/raw/master/Data/gowalla/item_list.txt
%cd ..
/content/gowalla train.txt 100%[===================>] 4.42M --.-KB/s in 0.07s test.txt 100%[===================>] 1.31M --.-KB/s in 0.05s user_list.txt 100%[===================>] 342.97K --.-KB/s in 0.03s item_list.txt 100%[===================>] 495.04K --.-KB/s in 0.04s /content
import argparse
def parse_args():
    """Build the NGCF argument parser and return the default namespace.

    Called with an empty argument list so the defaults are used verbatim
    (notebook-friendly; fix: the original passed `args={}`, a dict, where
    argparse expects a sequence).
    """
    parser = argparse.ArgumentParser(description="Run NGCF.")
    parser.add_argument('--weights_path', nargs='?', default='./',
                        help='Store model path.')
    parser.add_argument('--data_path', nargs='?', default='./',
                        help='Input data path.')
    parser.add_argument('--proj_path', nargs='?', default='./',
                        help='Project path.')
    parser.add_argument('--dataset', nargs='?', default='gowalla',
                        help='Choose a dataset from {gowalla, yelp2018, amazon-book}')
    parser.add_argument('--pretrain', type=int, default=0,
                        help='0: No pretrain, -1: Pretrain with the learned embeddings, 1:Pretrain with stored models.')
    parser.add_argument('--verbose', type=int, default=1,
                        help='Interval of evaluation.')
    parser.add_argument('--epoch', type=int, default=400,
                        help='Number of epoch.')
    parser.add_argument('--embed_size', type=int, default=64,
                        help='Embedding size.')
    parser.add_argument('--layer_size', nargs='?', default='[64,64,64]',
                        help='Output sizes of every layer')
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Batch size.')
    parser.add_argument('--regs', nargs='?', default='[1e-5]',
                        help='Regularizations.')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='Learning rate.')
    parser.add_argument('--model_type', nargs='?', default='ngcf',
                        help='Specify the name of model (ngcf).')
    parser.add_argument('--adj_type', nargs='?', default='norm',
                        help='Specify the type of the adjacency (laplacian) matrix from {plain, norm, mean}.')
    parser.add_argument('--gpu_id', type=int, default=0)
    parser.add_argument('--node_dropout_flag', type=int, default=1,
                        help='0: Disable node dropout, 1: Activate node dropout')
    # fix: help text previously claimed this was a *keep* probability,
    # but the value is used as the drop ratio in NGCF.sparse_dropout.
    parser.add_argument('--node_dropout', nargs='?', default='[0.1]',
                        help='Node dropout ratio. 0: no dropout.')
    parser.add_argument('--mess_dropout', nargs='?', default='[0.1,0.1,0.1]',
                        help='Message dropout ratio for each deep layer. 0: no dropout.')
    # fix: help text was a copy-paste of --layer_size's.
    parser.add_argument('--Ks', nargs='?', default='[20, 40, 60, 80, 100]',
                        help='Top-K cutoffs used for evaluation.')
    parser.add_argument('--save_flag', type=int, default=0,
                        help='0: Disable model saver, 1: Activate model saver')
    parser.add_argument('--test_flag', nargs='?', default='part',
                        help='Specify the test type from {part, full}, indicating whether the reference is done in mini-batch')
    parser.add_argument('--report', type=int, default=0,
                        help='0: Disable performance report w.r.t. sparsity levels, 1: Show performance report w.r.t. sparsity levels')
    return parser.parse_args(args=[])
import numpy as np
from sklearn.metrics import roc_auc_score
def recall(rank, ground_truth, N):
    """Fraction of distinct ground-truth items that appear in the top-N of rank."""
    relevant = set(ground_truth)
    hits = relevant & set(rank[:N])
    return len(hits) / float(len(relevant))
def precision_at_k(r, k):
    """Precision @ k for a binary relevance vector.

    Relevance is binary (nonzero is relevant).

    Args:
        r: relevance scores ordered by rank.
        k: cutoff rank; must be >= 1.
    Returns:
        Precision @ k (mean of the first k relevance values).
    Raises:
        ValueError: if k < 1.  (Fix: the docstring promised ValueError but
        the code used `assert`, which is stripped under `python -O`.)
    """
    if k < 1:
        raise ValueError('k must be >= 1, got %d' % k)
    r = np.asarray(r)[:k]
    return np.mean(r)
def average_precision(r, cut):
    """Average precision (area under the PR curve) for binary relevance.

    Args:
        r: binary relevance vector ordered by rank (nonzero is relevant).
        cut: cutoff rank.
    Returns:
        Average precision over the first `cut` positions; 0. if no hits.
    """
    r = np.asarray(r)
    # fix: clamp to len(r) — the original indexed r[k] for k in range(cut)
    # and raised IndexError whenever cut > len(r).
    limit = min(cut, r.size)
    out = [precision_at_k(r, k + 1) for k in range(limit) if r[k]]
    if not out:
        return 0.
    return np.sum(out) / float(min(cut, np.sum(r)))
def mean_average_precision(rs):
    """Mean average precision over a batch of relevance vectors.

    Relevance is binary (nonzero is relevant).

    Fix: the original called `average_precision(r)` without the required
    `cut` argument, which raised TypeError; each vector is now cut at its
    full length.
    """
    return np.mean([average_precision(r, len(r)) for r in rs])
def dcg_at_k(r, k, method=1):
    """Discounted cumulative gain (DCG) @ k.

    Relevance is positive real values (binary also works).

    Args:
        r: relevance scores ordered by rank.
        k: cutoff rank.
        method: 0 -> discounts start at position 2 with log2(2);
                1 -> standard sum(r_i / log2(i + 1)).
    Returns:
        DCG @ k, or 0. for an empty vector.
    Raises:
        ValueError: if method is not 0 or 1.
    """
    # fix: np.asfarray was removed in NumPy 2.0; use asarray with dtype.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.
def ndcg_at_k(r, k, ground_truth, method=1):
    """Normalized discounted cumulative gain (NDCG) @ k.

    The ideal DCG is computed from a vector of |ground_truth| leading ones
    padded with zeros up to k ("low but correct" normalization).

    Returns 0. when the ideal DCG is zero.
    """
    n_relevant = len(set(ground_truth))
    ideal = [1.0] * min(n_relevant, k) + [0.0] * max(k - n_relevant, 0)
    dcg_max = dcg_at_k(ideal, k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max
def recall_at_k(r, k, all_pos_num):
    """Recall @ k: hits in the top-k over the total number of positives.

    NOTE(review): intentionally keeps the original's lack of an
    all_pos_num == 0 guard (the commented-out guard suggests it was a
    deliberate choice); callers pass len(user_pos_test) > 0.
    """
    # fix: np.asfarray was removed in NumPy 2.0; use asarray with dtype.
    r = np.asarray(r, dtype=float)[:k]
    return np.sum(r) / all_pos_num
def hit_at_k(r, k):
    """Return 1. if any of the first k entries of r is relevant, else 0."""
    top = np.array(r)[:k]
    return 1. if np.sum(top) > 0 else 0.
def F1(pre, rec):
    """Harmonic mean of precision and recall; 0. when both are zero."""
    denom = pre + rec
    if denom <= 0:
        return 0.
    return (2.0 * pre * rec) / denom
def AUC(ground_truth, prediction):
    """ROC-AUC score; 0. when sklearn cannot compute it (e.g. single class)."""
    try:
        score = roc_auc_score(y_true=ground_truth, y_score=prediction)
    except Exception:
        score = 0.
    return score
import numpy as np
import random as rd
import scipy.sparse as sp
from time import time
class Data(object):
    """Loads train/test user-item interactions and builds graph adjacencies.

    Expects `<path>/train.txt` and `<path>/test.txt`, one line per user:
    `<uid> <iid> <iid> ...` (space-separated 0-based integer ids).
    """

    def __init__(self, path, batch_size):
        self.path = path
        self.batch_size = batch_size

        train_file = path + '/train.txt'
        test_file = path + '/test.txt'

        # First pass: determine id ranges and interaction counts.
        self.n_users, self.n_items = 0, 0
        self.n_train, self.n_test = 0, 0
        self.neg_pools = {}
        self.exist_users = []

        with open(train_file) as f:
            for l in f.readlines():
                if len(l) > 0:
                    l = l.strip('\n').split(' ')
                    items = [int(i) for i in l[1:]]
                    uid = int(l[0])
                    self.exist_users.append(uid)
                    self.n_items = max(self.n_items, max(items))
                    self.n_users = max(self.n_users, uid)
                    self.n_train += len(items)

        with open(test_file) as f:
            for l in f.readlines():
                if len(l) > 0:
                    l = l.strip('\n')
                    try:
                        items = [int(i) for i in l.split(' ')[1:]]
                    except ValueError:
                        # narrowed from bare Exception: only malformed ids
                        # (int() failures) should skip the line.
                        continue
                    self.n_items = max(self.n_items, max(items))
                    self.n_test += len(items)

        # ids are 0-based, so counts are max id + 1.
        self.n_items += 1
        self.n_users += 1
        self.print_statistics()

        # Second pass: build the sparse interaction matrix R and the
        # per-user train/test item lists.
        self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32)
        self.train_items, self.test_set = {}, {}
        with open(train_file) as f_train, open(test_file) as f_test:
            for l in f_train.readlines():
                if len(l) == 0:
                    break
                l = l.strip('\n')
                items = [int(i) for i in l.split(' ')]
                uid, train_items = items[0], items[1:]
                for i in train_items:
                    self.R[uid, i] = 1.
                self.train_items[uid] = train_items

            for l in f_test.readlines():
                if len(l) == 0:
                    break
                l = l.strip('\n')
                try:
                    items = [int(i) for i in l.split(' ')]
                except ValueError:
                    continue
                uid, test_items = items[0], items[1:]
                self.test_set[uid] = test_items

    def get_adj_mat(self):
        """Load cached adjacency matrices from disk, or create and cache them."""
        try:
            t1 = time()
            adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz')
            norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz')
            mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz')
            print('already load adj matrix', adj_mat.shape, time() - t1)
        except Exception:
            # Any failure (missing or corrupt cache) triggers a rebuild.
            adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
            sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat)
            sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat)
            sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat)
        return adj_mat, norm_adj_mat, mean_adj_mat

    def create_adj_mat(self):
        """Build the (n_users+n_items)^2 bipartite adjacency and its normalizations.

        Returns:
            (adj, norm_adj, mean_adj) as CSR matrices, where
            norm_adj = D^-1 (A + I) and mean_adj = D^-1 A.
        """
        t1 = time()
        adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32)
        adj_mat = adj_mat.tolil()
        R = self.R.tolil()
        # Block structure: [[0, R], [R^T, 0]].
        adj_mat[:self.n_users, self.n_users:] = R
        adj_mat[self.n_users:, :self.n_users] = R.T
        adj_mat = adj_mat.todok()
        print('already create adjacency matrix', adj_mat.shape, time() - t1)

        t2 = time()

        def mean_adj_single(adj):
            # D^-1 * A (rows of zero degree get a 0 inverse).
            rowsum = np.array(adj.sum(1))
            d_inv = np.power(rowsum, -1).flatten()
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)
            norm_adj = d_mat_inv.dot(adj)
            print('generate single-normalized adjacency matrix.')
            return norm_adj.tocoo()

        def normalized_adj_single(adj):
            # D^-1/2 * A * D^-1/2 — kept for reference; see the commented
            # alternative below.
            rowsum = np.array(adj.sum(1))
            d_inv_sqrt = np.power(rowsum, -0.5).flatten()
            d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
            d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
            bi_lap = d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt)
            return bi_lap.tocoo()

        def check_adj_if_equal(adj):
            # Debug helper: dense D^-1 A for comparison against mean_adj_single.
            dense_A = np.array(adj.todense())
            degree = np.sum(dense_A, axis=1, keepdims=False)
            temp = np.dot(np.diag(np.power(degree, -1)), dense_A)
            print('check normalized adjacency matrix whether equal to this laplacian matrix.')
            return temp

        norm_adj_mat = mean_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
        # norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
        mean_adj_mat = mean_adj_single(adj_mat)
        print('already normalize adjacency matrix', time() - t2)
        return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr()

    def negative_pool(self):
        """Pre-sample 100 negative items for every training user."""
        t1 = time()
        for u in self.train_items.keys():
            neg_items = list(set(range(self.n_items)) - set(self.train_items[u]))
            pools = [rd.choice(neg_items) for _ in range(100)]
            self.neg_pools[u] = pools
        print('refresh negative pools', time() - t1)

    def sample(self):
        """Sample one (user, positive item, negative item) triple per batch slot.

        Returns:
            (users, pos_items, neg_items) parallel lists of length batch_size.
        """
        if self.batch_size <= self.n_users:
            users = rd.sample(self.exist_users, self.batch_size)
        else:
            # With replacement when the batch is larger than the user set.
            users = [rd.choice(self.exist_users) for _ in range(self.batch_size)]

        def sample_pos_items_for_u(u, num):
            # Sample `num` distinct positive items for user u.
            pos_items = self.train_items[u]
            n_pos_items = len(pos_items)
            pos_batch = []
            while True:
                if len(pos_batch) == num:
                    break
                pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
                pos_i_id = pos_items[pos_id]
                if pos_i_id not in pos_batch:
                    pos_batch.append(pos_i_id)
            return pos_batch

        def sample_neg_items_for_u(u, num):
            # Rejection-sample `num` distinct items user u has NOT interacted with.
            neg_items = []
            while True:
                if len(neg_items) == num:
                    break
                neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0]
                if neg_id not in self.train_items[u] and neg_id not in neg_items:
                    neg_items.append(neg_id)
            return neg_items

        def sample_neg_items_for_u_from_pools(u, num):
            neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u]))
            return rd.sample(neg_items, num)

        pos_items, neg_items = [], []
        for u in users:
            pos_items += sample_pos_items_for_u(u, 1)
            neg_items += sample_neg_items_for_u(u, 1)
        return users, pos_items, neg_items

    def get_num_users_items(self):
        """Return (n_users, n_items)."""
        return self.n_users, self.n_items

    def print_statistics(self):
        """Print dataset size and sparsity."""
        print('n_users=%d, n_items=%d' % (self.n_users, self.n_items))
        print('n_interactions=%d' % (self.n_train + self.n_test))
        print('n_train=%d, n_test=%d, sparsity=%.5f' % (self.n_train, self.n_test, (self.n_train + self.n_test) / (self.n_users * self.n_items)))

    def get_sparsity_split(self):
        """Load (or create and persist) the user split by interaction count."""
        try:
            split_uids, split_state = [], []
            # fix: the original leaked the file handle (open().readlines()).
            with open(self.path + '/sparsity.split', 'r') as f:
                lines = f.readlines()
            for idx, line in enumerate(lines):
                if idx % 2 == 0:
                    split_state.append(line.strip())
                    print(line.strip())
                else:
                    split_uids.append([int(uid) for uid in line.strip().split(' ')])
            print('get sparsity split.')
        except Exception:
            # Missing or unreadable split file -> regenerate it.
            split_uids, split_state = self.create_sparsity_split()
            # fix: the original never closed the write handle.
            with open(self.path + '/sparsity.split', 'w') as f:
                for idx in range(len(split_state)):
                    f.write(split_state[idx] + '\n')
                    f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n')
            print('create sparsity split.')
        return split_uids, split_state

    def create_sparsity_split(self):
        """Partition test users into buckets by total interaction count.

        NOTE(review): `count` is never incremented, so every bucket closes at
        0.25 of the total interactions — this matches the upstream NGCF code
        and is kept as-is; confirm before "fixing".
        """
        all_users_to_test = list(self.test_set.keys())
        user_n_iid = dict()
        # key = number of interactions, value = list of uids with that count.
        for uid in all_users_to_test:
            train_iids = self.train_items[uid]
            test_iids = self.test_set[uid]
            n_iids = len(train_iids) + len(test_iids)
            if n_iids not in user_n_iid.keys():
                user_n_iid[n_iids] = [uid]
            else:
                user_n_iid[n_iids].append(uid)

        split_uids = list()
        # split the whole user set into four subsets.
        temp = []
        count = 1
        fold = 4
        n_count = (self.n_train + self.n_test)
        n_rates = 0
        split_state = []
        for idx, n_iids in enumerate(sorted(user_n_iid)):
            temp += user_n_iid[n_iids]
            n_rates += n_iids * len(user_n_iid[n_iids])
            n_count -= n_iids * len(user_n_iid[n_iids])
            if n_rates >= count * 0.25 * (self.n_train + self.n_test):
                split_uids.append(temp)
                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)
                temp = []
                n_rates = 0
                fold -= 1
            if idx == len(user_n_iid.keys()) - 1 or n_count == 0:
                split_uids.append(temp)
                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)
        return split_uids, split_state
import os
import re
def txt2list(file_src):
    """Return all lines of file_src (trailing newlines kept).

    Fix: the original never closed the file handle.
    """
    with open(file_src, "r") as orig_file:
        return orig_file.readlines()
def ensureDir(dir_path):
    """Create the parent directory of dir_path if it does not exist.

    Operates on os.path.dirname(dir_path), so pass a file path or a
    directory path ending in a separator.

    Fixes: check-then-create race replaced by exist_ok=True; no longer
    crashes when dirname is '' (bare filename).
    """
    d = os.path.dirname(dir_path)
    if d:
        os.makedirs(d, exist_ok=True)
def uni2str(unicode_str):
    """ASCII-encode, drop non-ASCII chars, strip newlines and edge whitespace.

    Note: returns the repr of the bytes object (e.g. "b'abc'"), preserving
    the original Python-2-era behaviour of str() on bytes.
    """
    encoded = unicode_str.encode('ascii', 'ignore')
    return str(encoded).replace('\n', '').strip()
def hasNumbers(inputString):
    """Return True if the string contains at least one decimal digit."""
    return re.search(r'\d', inputString) is not None
def delMultiChar(inputString, chars):
    """Remove every occurrence of each character in chars from inputString."""
    result = inputString
    for ch in chars:
        result = result.replace(ch, '')
    return result
def merge_two_dicts(x, y):
    """Return a new dict with x's entries, overridden by y's on key clashes."""
    merged = dict(x)
    merged.update(y)
    return merged
def early_stopping(log_value, best_value, stopping_step, expected_order='acc', flag_step=100):
    """Track the best metric and count non-improving steps.

    Args:
        log_value: current metric value.
        best_value: best metric seen so far.
        stopping_step: consecutive non-improving steps so far.
        expected_order: 'acc' if higher is better, 'dec' if lower is better.
        flag_step: patience; stop after this many non-improving steps.
    Returns:
        (best_value, stopping_step, should_stop).
    """
    assert expected_order in ['acc', 'dec']
    improved = ((expected_order == 'acc' and log_value >= best_value)
                or (expected_order == 'dec' and log_value <= best_value))
    if improved:
        stopping_step = 0
        best_value = log_value
    else:
        stopping_step += 1
    should_stop = stopping_step >= flag_step
    if should_stop:
        print("Early stopping is trigger at step: {} log:{}".format(flag_step, log_value))
    return best_value, stopping_step, should_stop
import multiprocessing
import heapq

# Use half the available cores for the evaluation worker pool.
cores = multiprocessing.cpu_count() // 2

# Parse the default CLI arguments (parse_args is invoked with no argv).
args = parse_args()
# Top-K cutoffs used during evaluation, e.g. [20, 40, 60, 80, 100].
Ks = eval(args.Ks)

# Global dataset loader shared by the multiprocessing evaluation workers
# below (test_one_user reads these module-level names in each worker).
data_generator = Data(path=args.data_path + args.dataset, batch_size=args.batch_size)
USR_NUM, ITEM_NUM = data_generator.n_users, data_generator.n_items
N_TRAIN, N_TEST = data_generator.n_train, data_generator.n_test
BATCH_SIZE = args.batch_size
def ranklist_by_heapq(user_pos_test, test_items, rating, Ks):
    """Binary relevance vector for the top-max(Ks) candidate items.

    Args:
        user_pos_test: the user's held-out positive items.
        test_items: candidate item ids (training items already removed).
        rating: per-item scores, indexable by item id.
        Ks: list of cutoff values.
    Returns:
        (r, auc) where r[i] is 1 iff the i-th ranked item is a positive;
        AUC is not computed in this mode and is returned as 0.
    """
    item_score = {item: rating[item] for item in test_items}
    top_items = heapq.nlargest(max(Ks), item_score, key=item_score.get)
    r = [1 if item in user_pos_test else 0 for item in top_items]
    auc = 0.
    return r, auc
def get_auc(item_score, user_pos_test):
    """AUC over all scored items, ranked by descending score.

    Fix: the original assigned `auc = auc(...)`, making `auc` a local name
    and raising UnboundLocalError on the call; the metric function defined
    above is `AUC`.
    """
    ranked = sorted(item_score.items(), key=lambda kv: kv[1])
    ranked.reverse()
    item_sort = [x[0] for x in ranked]
    posterior = [x[1] for x in ranked]
    r = [1 if i in user_pos_test else 0 for i in item_sort]
    return AUC(ground_truth=r, prediction=posterior)
def ranklist_by_sorted(user_pos_test, test_items, rating, Ks):
    """Like ranklist_by_heapq, but also computes AUC over all test items."""
    item_score = {item: rating[item] for item in test_items}
    top_items = heapq.nlargest(max(Ks), item_score, key=item_score.get)
    r = [1 if item in user_pos_test else 0 for item in top_items]
    auc = get_auc(item_score, user_pos_test)
    return r, auc
def get_performance(user_pos_test, r, auc, Ks):
    """Compute precision/recall/NDCG/hit-ratio at every cutoff in Ks."""
    precision = [precision_at_k(r, K) for K in Ks]
    recall = [recall_at_k(r, K, len(user_pos_test)) for K in Ks]
    ndcg = [ndcg_at_k(r, K, user_pos_test) for K in Ks]
    hit_ratio = [hit_at_k(r, K) for K in Ks]
    return {'recall': np.array(recall), 'precision': np.array(precision),
            'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc}
def test_one_user(x):
    """Evaluate one user inside a multiprocessing worker.

    Args:
        x: (rating, uid) pair — the user's score vector over all items and
           the user id. Reads the module-level data_generator/args/Ks.
    Returns:
        The metrics dict from get_performance.
    """
    rating = x[0]
    u = x[1]
    # User u's training items (users absent from the training dict get []).
    try:
        training_items = data_generator.train_items[u]
    except KeyError:
        # fix: narrowed from bare Exception — only a missing uid is expected.
        training_items = []
    # User u's held-out positives.
    user_pos_test = data_generator.test_set[u]
    all_items = set(range(ITEM_NUM))
    # Rank only items the user has NOT seen in training.
    test_items = list(all_items - set(training_items))
    if args.test_flag == 'part':
        r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks)
    else:
        r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks)
    return get_performance(user_pos_test, r, auc, Ks)
def test(model, users_to_test, drop_flag=False, batch_test_flag=False):
    """Evaluate the model over users_to_test, averaging per-user metrics.

    Args:
        model: NGCF instance.
        users_to_test: list of user ids to score.
        drop_flag: forward with node dropout when True.
        batch_test_flag: score items in chunks of BATCH_SIZE (memory saver).
    Returns:
        dict of mean precision/recall/ndcg/hit_ratio arrays (one entry per
        K in Ks) and mean AUC.

    Fixes vs. original: the worker pool is released in a finally block
    (it leaked on any exception); the four copy-pasted model-call branches
    are deduplicated by forwarding drop_flag directly; rate_batch is
    converted to numpy once, which also fixes the batch_test_flag path
    calling .numpy() on what is already a numpy array.
    """
    result = {'precision': np.zeros(len(Ks)), 'recall': np.zeros(len(Ks)),
              'ndcg': np.zeros(len(Ks)), 'hit_ratio': np.zeros(len(Ks)),
              'auc': 0.}
    pool = multiprocessing.Pool(cores)
    u_batch_size = BATCH_SIZE * 2
    i_batch_size = BATCH_SIZE
    test_users = users_to_test
    n_test_users = len(test_users)
    n_user_batchs = n_test_users // u_batch_size + 1
    count = 0
    try:
        for u_batch_id in range(n_user_batchs):
            start = u_batch_id * u_batch_size
            end = (u_batch_id + 1) * u_batch_size
            user_batch = test_users[start: end]
            if batch_test_flag:
                # Score items in chunks to bound memory.
                n_item_batchs = ITEM_NUM // i_batch_size + 1
                rate_batch = np.zeros(shape=(len(user_batch), ITEM_NUM))
                i_count = 0
                for i_batch_id in range(n_item_batchs):
                    i_start = i_batch_id * i_batch_size
                    i_end = min((i_batch_id + 1) * i_batch_size, ITEM_NUM)
                    item_batch = range(i_start, i_end)
                    u_g_embeddings, pos_i_g_embeddings, _ = model(
                        user_batch, item_batch, [], drop_flag=drop_flag)
                    i_rate_batch = model.rating(
                        u_g_embeddings, pos_i_g_embeddings).detach().cpu().numpy()
                    rate_batch[:, i_start: i_end] = i_rate_batch
                    i_count += i_rate_batch.shape[1]
                assert i_count == ITEM_NUM
            else:
                # Score all items in one shot.
                item_batch = range(ITEM_NUM)
                u_g_embeddings, pos_i_g_embeddings, _ = model(
                    user_batch, item_batch, [], drop_flag=drop_flag)
                rate_batch = model.rating(
                    u_g_embeddings, pos_i_g_embeddings).detach().cpu().numpy()
            user_batch_rating_uid = zip(rate_batch, user_batch)
            batch_result = pool.map(test_one_user, user_batch_rating_uid)
            count += len(batch_result)
            for re in batch_result:
                result['precision'] += re['precision'] / n_test_users
                result['recall'] += re['recall'] / n_test_users
                result['ndcg'] += re['ndcg'] / n_test_users
                result['hit_ratio'] += re['hit_ratio'] / n_test_users
                result['auc'] += re['auc'] / n_test_users
        assert count == n_test_users
    finally:
        pool.close()
        pool.join()
    return result
n_users=29858, n_items=40981 n_interactions=1027370 n_train=810128, n_test=217242, sparsity=0.00084
import torch
import torch.optim as optim
from time import time
import warnings

# Silence library deprecation/user warnings so the per-epoch log stays readable.
warnings.filterwarnings('ignore')
if __name__ == '__main__':
    # fix: fall back to CPU when CUDA is unavailable — the original always
    # requested cuda:<gpu_id> and crashed on CPU-only machines.
    if torch.cuda.is_available():
        args.device = torch.device('cuda:' + str(args.gpu_id))
    else:
        args.device = torch.device('cpu')

    plain_adj, norm_adj, mean_adj = data_generator.get_adj_mat()

    # Parse the dropout strings into lists before handing them to the model.
    args.node_dropout = eval(args.node_dropout)
    args.mess_dropout = eval(args.mess_dropout)

    model = NGCF(data_generator.n_users,
                 data_generator.n_items,
                 norm_adj,
                 args).to(args.device)

    t0 = time()
    """
    *********************************************************
    Train.
    """
    cur_best_pre_0, stopping_step = 0, 0
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    loss_loger, pre_loger, rec_loger, ndcg_loger, hit_loger = [], [], [], [], []

    for epoch in range(args.epoch):
        t1 = time()
        loss, mf_loss, emb_loss = 0., 0., 0.
        n_batch = data_generator.n_train // args.batch_size + 1
        for idx in range(n_batch):
            users, pos_items, neg_items = data_generator.sample()
            u_g_embeddings, pos_i_g_embeddings, neg_i_g_embeddings = model(users,
                                                                           pos_items,
                                                                           neg_items,
                                                                           drop_flag=args.node_dropout_flag)
            batch_loss, batch_mf_loss, batch_emb_loss = model.create_bpr_loss(u_g_embeddings,
                                                                              pos_i_g_embeddings,
                                                                              neg_i_g_embeddings)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            # fix: accumulate floats via .item() — summing the loss tensors
            # kept every batch's autograd graph alive (GPU memory leak).
            loss += batch_loss.item()
            mf_loss += batch_mf_loss.item()
            emb_loss += batch_emb_loss.item()

        # Evaluate only every 10th epoch; otherwise just log the train loss.
        if (epoch + 1) % 10 != 0:
            if args.verbose > 0 and epoch % args.verbose == 0:
                perf_str = 'Epoch %d [%.1fs]: train==[%.5f=%.5f + %.5f]' % (
                    epoch, time() - t1, loss, mf_loss, emb_loss)
                print(perf_str)
            continue

        t2 = time()
        users_to_test = list(data_generator.test_set.keys())
        ret = test(model, users_to_test, drop_flag=False)
        t3 = time()

        loss_loger.append(loss)
        rec_loger.append(ret['recall'])
        pre_loger.append(ret['precision'])
        ndcg_loger.append(ret['ndcg'])
        hit_loger.append(ret['hit_ratio'])

        if args.verbose > 0:
            perf_str = 'Epoch %d [%.1fs + %.1fs]: train==[%.5f=%.5f + %.5f], recall=[%.5f, %.5f], ' \
                       'precision=[%.5f, %.5f], hit=[%.5f, %.5f], ndcg=[%.5f, %.5f]' % \
                       (epoch, t2 - t1, t3 - t2, loss, mf_loss, emb_loss, ret['recall'][0], ret['recall'][-1],
                        ret['precision'][0], ret['precision'][-1], ret['hit_ratio'][0], ret['hit_ratio'][-1],
                        ret['ndcg'][0], ret['ndcg'][-1])
            print(perf_str)

        cur_best_pre_0, stopping_step, should_stop = early_stopping(ret['recall'][0], cur_best_pre_0,
                                                                    stopping_step, expected_order='acc',
                                                                    flag_step=5)
        # *********************************************************
        # early stopping when recall@Ks[0] has not improved for 5 evaluations.
        if should_stop:
            break

        # *********************************************************
        # save the user & item embeddings for pretraining.
        if ret['recall'][0] == cur_best_pre_0 and args.save_flag == 1:
            torch.save(model.state_dict(), args.weights_path + str(epoch) + '.pkl')
            print('save the weights in path: ', args.weights_path + str(epoch) + '.pkl')

    # Report the best evaluation epoch across the whole run.
    recs = np.array(rec_loger)
    pres = np.array(pre_loger)
    ndcgs = np.array(ndcg_loger)
    hit = np.array(hit_loger)
    best_rec_0 = max(recs[:, 0])
    idx = list(recs[:, 0]).index(best_rec_0)
    final_perf = "Best Iter=[%d]@[%.1f]\trecall=[%s], precision=[%s], hit=[%s], ndcg=[%s]" % \
                 (idx, time() - t0, '\t'.join(['%.5f' % r for r in recs[idx]]),
                  '\t'.join(['%.5f' % r for r in pres[idx]]),
                  '\t'.join(['%.5f' % r for r in hit[idx]]),
                  '\t'.join(['%.5f' % r for r in ndcgs[idx]]))
    print(final_perf)
already create adjacency matrix (70839, 70839) 79.06983399391174 generate single-normalized adjacency matrix. generate single-normalized adjacency matrix. already normalize adjacency matrix 2.180420160293579 Epoch 0 [173.5s]: train==[432.81360=432.77740 + 0.03577] Epoch 1 [173.0s]: train==[238.26131=238.22516 + 0.03618] Epoch 2 [173.6s]: train==[198.87595=198.83954 + 0.03643] Epoch 3 [172.9s]: train==[183.36285=183.32619 + 0.03663] Epoch 4 [174.2s]: train==[169.82016=169.78339 + 0.03679] Epoch 5 [173.9s]: train==[154.85112=154.81422 + 0.03695] Epoch 6 [175.6s]: train==[143.06267=143.02547 + 0.03708] Epoch 7 [174.5s]: train==[134.72755=134.69037 + 0.03721] Epoch 8 [174.8s]: train==[129.35472=129.31735 + 0.03733]
--------------------------------------------------------------------------- RemoteTraceback Traceback (most recent call last) RemoteTraceback: """ Traceback (most recent call last): File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker result = (True, func(*args, **kwds)) File "/usr/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar return list(map(*args)) File "<ipython-input-7-9522e3d80e26>", line 98, in test_one_user return get_performance(user_pos_test, r, auc, Ks) File "<ipython-input-7-9522e3d80e26>", line 67, in get_performance precision.append(metrics.precision_at_k(r, K)) NameError: name 'metrics' is not defined """ The above exception was the direct cause of the following exception: NameError Traceback (most recent call last) <ipython-input-8-4a88fc9b12e1> in <module>() 62 t2 = time() 63 users_to_test = list(data_generator.test_set.keys()) ---> 64 ret = test(model, users_to_test, drop_flag=False) 65 66 t3 = time() <ipython-input-7-9522e3d80e26> in test(model, users_to_test, drop_flag, batch_test_flag) 168 169 user_batch_rating_uid = zip(rate_batch.numpy(), user_batch) --> 170 batch_result = pool.map(test_one_user, user_batch_rating_uid) 171 count += len(batch_result) 172 /usr/lib/python3.7/multiprocessing/pool.py in map(self, func, iterable, chunksize) 266 in a list that is returned. 267 ''' --> 268 return self._map_async(func, iterable, mapstar, chunksize).get() 269 270 def starmap(self, func, iterable, chunksize=None): /usr/lib/python3.7/multiprocessing/pool.py in get(self, timeout) 655 return self._value 656 else: --> 657 raise self._value 658 659 def _set(self, i, obj): NameError: name 'metrics' is not defined