!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-geometric
import torch
torch.__version__
'1.10.0+cu111'
from torch import nn
from torch import Tensor
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
import os
import pickle
import warnings
warnings.filterwarnings('ignore')
class Args:
dataset = 'ML-1M' # Dataset
version = 1 # Dataset version
batch_size = 1024 # Batch size
dim = 64 # Dimension
lr = 5e-3 # Learning rate
    offset = 3.5 # Rating threshold separating likes from dislikes
K = 40 # The number of negative samples
num_layers = 4 # The number of layers of a GNN model for the graph with positive edges
MLP_layers = 2 # The number of layers of MLP for the graph with negative edges
epoch = 5 # The number of epochs
reg = 0.05 # Regularization coefficient
args = Args()
Args.__dict__
mappingproxy({'K': 40, 'MLP_layers': 2, '__dict__': <attribute '__dict__' of 'Args' objects>, '__doc__': None, '__module__': '__main__', '__weakref__': <attribute '__weakref__' of 'Args' objects>, 'batch_size': 1024, 'dataset': 'ML-1M', 'dim': 64, 'epoch': 5, 'lr': 0.005, 'num_layers': 4, 'offset': 3.5, 'reg': 0.05, 'version': 1})
!git clone -q --branch v2 https://github.com/RecoHut-Datasets/movielens_1m.git
class Data_loader():
def __init__(self,dataset,version):
self.dataset=dataset; self.version=version
self.sep='::'
        self.names=['userId','movieId','rating','timestamp']
self.path_for_whole='./movielens_1m/ratings.dat'
self.path_for_train='./movielens_1m/train_1m%s.dat'%(version)
self.path_for_test='./movielens_1m/test_1m%s.dat'%(version)
self.num_u=6040; self.num_v=3952;
def data_load(self):
        self.whole_=pd.read_csv(self.path_for_whole, names=self.names, sep=self.sep, engine='python').drop('timestamp',axis=1).sample(frac=1,replace=False,random_state=self.version)
        self.train_set = pd.read_csv(self.path_for_train,engine='python',names=self.names).drop('timestamp',axis=1)
        self.test_set = pd.read_csv(self.path_for_test,engine='python',names=self.names).drop('timestamp',axis=1)
return self.train_set, self.test_set
class bipartite_dataset(Dataset):
def __init__(self, train,neg_dist,offset,num_u,num_v,K):
self.edge_1 = torch.tensor(train['userId'].values-1)
self.edge_2 = torch.tensor(train['movieId'].values-1) +num_u
self.edge_3 = torch.tensor(train['rating'].values) - offset
self.neg_dist = neg_dist
self.K = K;
self.num_u = num_u
self.num_v = num_v
self.tot = np.arange(num_v)
self.train = train
def negs_gen_(self):
print('negative sampling...'); st=time.time()
self.edge_4 = torch.empty((len(self.edge_1),self.K),dtype=torch.long)
prog = tqdm(desc='negative sampling for each epoch...',total=len(set(self.train['userId'].values)),position=0)
for j in set(self.train['userId'].values):
pos=self.train[self.train['userId']==j]['movieId'].values-1
neg = np.setdiff1d(self.tot,pos)
temp = (torch.tensor(np.random.choice(neg,len(pos)*self.K,replace=True,p=self.neg_dist[neg]/self.neg_dist[neg].sum()))+self.num_u).long()
self.edge_4[self.edge_1==j-1]=temp.view(int(len(temp)/self.K),self.K)
prog.update(1)
prog.close()
        self.edge_4 = self.edge_4.long() # already a long tensor; avoids the torch.tensor() copy warning
print('complete ! %s'%(time.time()-st))
def negs_gen_EP(self,epoch):
print('negative sampling for next epochs...'); st=time.time()
self.edge_4_tot = torch.empty((len(self.edge_1),self.K,epoch),dtype=torch.long)
prog = tqdm(desc='negative sampling for next epochs...',total=len(set(self.train['userId'].values)),position=0)
for j in set(self.train['userId'].values):
pos=self.train[self.train['userId']==j]['movieId'].values-1
neg = np.setdiff1d(self.tot,pos)
temp = (torch.tensor(np.random.choice(neg,len(pos)*self.K*epoch,replace=True,p=self.neg_dist[neg]/self.neg_dist[neg].sum()))+self.num_u).long()
self.edge_4_tot[self.edge_1==j-1]=temp.view(int(len(temp)/self.K/epoch),self.K,epoch)
prog.update(1)
prog.close()
        self.edge_4_tot = self.edge_4_tot.long() # already a long tensor; avoids the torch.tensor() copy warning
print('complete ! %s'%(time.time()-st))
def __len__(self):
return len(self.edge_1)
def __getitem__(self,idx):
u = self.edge_1[idx]
v = self.edge_2[idx]
w = self.edge_3[idx]
negs = self.edge_4[idx]
return u,v,w,negs
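A quick way to sanity-check the sampler is to run it on a toy interaction table. The sketch below uses made-up ratings and a uniform negative distribution; every name here is local to the snippet, not part of the pipeline.
# Toy check of bipartite_dataset: 2 users, 3 items, K=2 negatives per positive edge.
toy = pd.DataFrame({'userId': [1, 1, 2], 'movieId': [1, 2, 2], 'rating': [5.0, 2.0, 4.0]})
ds_toy = bipartite_dataset(toy, neg_dist=np.ones(3), offset=3.5, num_u=2, num_v=3, K=2)
ds_toy.negs_gen_()
u, v, w, negs = ds_toy[0]
print(u, v, w, negs) # user 0, item node 0+2, signed weight 5.0-3.5=1.5, 2 negative item nodes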
def deg_dist(train, num_v):
uni, cou = np.unique(train['movieId'].values-1,return_counts=True)
cou = cou**(0.75)
deg = np.zeros(num_v)
deg[uni] = cou
return torch.tensor(deg)
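deg_dist turns raw item frequencies into the negative-sampling distribution: counts are raised to the 0.75 power (the smoothing exponent popularized by word2vec), so popular items are oversampled as negatives, but sub-linearly. A toy illustration with made-up counts:
# Items rated 4x, 2x and 1x; masses are counts ** 0.75, zero for unseen items.
toy_items = pd.DataFrame({'movieId': [1, 1, 1, 1, 2, 2, 3]})
d = deg_dist(toy_items, num_v=4)
print(d) # tensor([2.8284, 1.6818, 1.0000, 0.0000], dtype=torch.float64)
print((d / d.sum()).numpy()) # normalized sampling probabilities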
def gen_top_K(data_class,emb,train,directory_):
no_items = np.array(list(set(np.arange(1,data_class.num_v+1))-set(train['movieId'])))
total_users = set(np.arange(1,data_class.num_u+1))
reco = dict()
pbar = tqdm(desc = 'top-k recommendation...',total=len(total_users),position=0)
for j in total_users:
pos = train[train['userId']==j]['movieId'].values-1
embedding_ = emb[j-1].view(1,len(emb[0])).mm(emb[data_class.num_u:].t()).detach();
embedding_[0][no_items-1]=-np.inf;
embedding_[0][pos]=-np.inf;
reco[j]=torch.topk(embedding_[0],300).indices.cpu().numpy()+1
pbar.update(1)
pbar.close()
return reco
class LightGConv(MessagePassing):
def __init__(self):
super().__init__(aggr='add')
def forward(self,x,edge_index):
row, col = edge_index
deg = degree(col, x.size(0), dtype=x.dtype)
deg_inv_sqrt = deg.pow(-0.5)
deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]
return self.propagate(edge_index, x=x, norm=norm)
def message(self,x_j,norm):
return norm.view(-1,1) * x_j
def update(self,inputs: Tensor) -> Tensor:
return inputs
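LightGConv is the parameter-free propagation rule of LightGCN: each node sums its neighbors' embeddings scaled by 1/sqrt(deg_i * deg_j), with no feature transform or nonlinearity. A minimal sketch on a toy graph of two users and one item (indices local to this snippet):
# Undirected toy graph: user nodes 0,1 both connected to item node 2,
# with each edge stored in both directions, as in model.data_p below.
x_toy = torch.eye(3) # one-hot features expose the normalization weights
ei_toy = torch.tensor([[0, 2, 1, 2],
                       [2, 0, 2, 1]])
print(LightGConv()(x_toy, ei_toy))
# Item node 2 (degree 2) receives each user's feature scaled by 1/sqrt(1*2) ~= 0.707.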
class LRGCCF(MessagePassing):
def __init__(self, in_channels,out_channels):
super(LRGCCF,self).__init__(aggr='mean')
self.lin = torch.nn.Linear(in_channels, out_channels)
def forward(self,x,edge_index):
edge_index, _ = add_self_loops(edge_index, num_nodes = x.size(0));
return self.lin(self.propagate(edge_index,x=x))
def message(self,x_j):
return x_j
def update(self,inputs: Tensor) -> Tensor:
return inputs
class SiReN(nn.Module):
def __init__(self,train,num_u,num_v,offset,num_layers = 2,MLP_layers=2,dim = 64,reg=1e-4
,device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
super(SiReN,self).__init__()
self.M = num_u; self.N = num_v;
self.num_layers = num_layers
self.MLP_layers = MLP_layers
self.device = device
self.reg = reg
self.embed_dim = dim
edge_user = torch.tensor(train[train['rating']>offset]['userId'].values-1)
edge_item = torch.tensor(train[train['rating']>offset]['movieId'].values-1)+self.M
edge_ = torch.stack((torch.cat((edge_user,edge_item),0),torch.cat((edge_item,edge_user),0)),0)
self.data_p=Data(edge_index=edge_)
# For the graph with positive edges
self.E = nn.Parameter(torch.empty(self.M + self.N, dim))
nn.init.xavier_normal_(self.E.data)
self.convs = nn.ModuleList()
self.mlps = nn.ModuleList()
for _ in range(num_layers):
# self.convs.append(LRGCCF(dim,dim))
self.convs.append(LightGConv())
# For the graph with negative edges
self.E2 = nn.Parameter(torch.empty(self.M + self.N, dim))
nn.init.xavier_normal_(self.E2.data)
for _ in range(MLP_layers):
self.mlps.append(nn.Linear(dim,dim,bias=True))
nn.init.xavier_normal_(self.mlps[-1].weight.data)
        # Attention model
self.attn = nn.Linear(dim,dim,bias=True)
self.q = nn.Linear(dim,1,bias=False)
self.attn_softmax = nn.Softmax(dim=1)
def aggregate(self):
# Generate embeddings z_p
B=[]; B.append(self.E)
x = self.convs[0](self.E,self.data_p.edge_index)
B.append(x)
for i in range(1,self.num_layers):
x = self.convs[i](x,self.data_p.edge_index)
B.append(x)
z_p = sum(B)/len(B)
# Generate embeddings z_n
C = []; C.append(self.E2)
x = F.dropout(F.relu(self.mlps[0](self.E2)),p=0.5,training=self.training)
for i in range(1,self.MLP_layers):
x = self.mlps[i](x);
x = F.relu(x)
x = F.dropout(x,p=0.5,training=self.training)
C.append(x)
z_n = C[-1]
        # Attention for final embeddings Z
w_p = self.q(F.dropout(torch.tanh((self.attn(z_p))),p=0.5,training=self.training))
w_n = self.q(F.dropout(torch.tanh((self.attn(z_n))),p=0.5,training=self.training))
alpha_ = self.attn_softmax(torch.cat([w_p,w_n],dim=1))
Z = alpha_[:,0].view(len(z_p),1) * z_p + alpha_[:,1].view(len(z_p),1) * z_n
return Z
def forward(self,u,v,w,n,device):
emb = self.aggregate()
u_ = emb[u].to(device);
v_ = emb[v].to(device);
n_ = emb[n].to(device);
w_ = w.to(device)
positivebatch = torch.mul(u_ , v_ );
negativebatch = torch.mul(u_.view(len(u_),1,self.embed_dim),n_)
sBPR_loss = F.logsigmoid((torch.sign(w_).view(len(u_),1) * (positivebatch.sum(dim=1).view(len(u_),1))) - negativebatch.sum(dim=2)).sum(dim=1)
reg_loss = u_.norm(dim=1).pow(2).sum() + v_.norm(dim=1).pow(2).sum() + n_.norm(dim=2).pow(2).sum();
return -torch.sum(sBPR_loss) + self.reg * reg_loss
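Before training, it can help to confirm the loss plumbing end to end. Below is a minimal smoke test on CPU with toy data (2 users, 2 items, 8-dim embeddings; every value is made up):
# Forward pass should return a scalar sBPR + regularization loss.
toy_train = pd.DataFrame({'userId': [1, 1, 2], 'movieId': [1, 2, 2], 'rating': [5.0, 2.0, 4.0]})
toy_model = SiReN(toy_train, num_u=2, num_v=2, offset=3.5, num_layers=2, MLP_layers=2, dim=8, device=torch.device('cpu'))
u0 = torch.tensor([0]) # user index
v0 = torch.tensor([2]) # positive item node index (item 0 + num_u)
w0 = torch.tensor([1.5]) # signed rating weight
n0 = torch.tensor([[3, 3]]) # K=2 negative item nodes
print(toy_model(u0, v0, w0, n0, torch.device('cpu'))) # scalar tensor with grad_fn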
class evaluate():
def __init__(self,reco,train,test,threshold,num_u,num_v,N=[5,10,15,20,25],ratings=[20,50]):
        '''
        reco : top-K recommendation lists per user
        train : training set
        test : test set
        threshold : rating threshold used to build the ground-truth set from the test set
        num_u, num_v : numbers of users and items
        N : cutoffs at which the metrics are reported
        ratings : interaction-count boundaries for the three user groups
        '''
self.reco = reco
self.num_u = num_u;
self.num_v = num_v;
self.N=N
self.p=[]
self.r=[]
self.NDCG=[]
self.p_c1=[]; self.p_c2=[]; self.p_c3=[]
self.r_c1=[]; self.r_c2=[]; self.r_c3=[]
self.NDCG_c1=[]; self.NDCG_c2=[]; self.NDCG_c3=[]
self.tr = train; self.te = test;
self.threshold = threshold;
self.gen_ground_truth_set()
self.ratings = ratings
self.partition_into_groups_(self.ratings)
print('\nevaluating recommendation accuracy....')
self.precision_and_recall_G(self.group1,1)
self.precision_and_recall_G(self.group2,2)
self.precision_and_recall_G(self.group3,3)
self.Normalized_DCG_G(self.group1,1)
self.Normalized_DCG_G(self.group2,2)
self.Normalized_DCG_G(self.group3,3)
self.metric_total()
def gen_ground_truth_set(self):
result = dict()
GT = self.te[self.te['rating']>=self.threshold];
U = set(GT['userId'])
for i in U:
result[i] = list(set([j for j in GT[GT['userId']==i]['movieId']]))#-set(self.TOP))
if len(result[i])==0:
del(result[i])
self.GT = result
def precision_and_recall(self):
user_in_GT=[j for j in self.GT];
for n in self.N:
p=0; r=0;
for i in user_in_GT:
topn=self.reco[i][:n]
num_hit=len(set(topn).intersection(set(self.GT[i])));
p+=num_hit/n; r+=num_hit/len(self.GT[i]);
self.p.append(p/len(user_in_GT)); self.r.append(r/len(user_in_GT));
def Normalized_DCG(self):
maxn=max(self.N);
user_in_GT=[j for j in self.GT];
ndcg=np.zeros(maxn);
for i in user_in_GT:
idcg_len = min(len(self.GT[i]), maxn)
temp_idcg = np.cumsum(1.0 / np.log2(np.arange(2, maxn + 2)))
temp_idcg[idcg_len:] = temp_idcg[idcg_len-1]
temp_dcg=np.cumsum([1.0/np.log2(idx+2) if item in self.GT[i] else 0.0 for idx, item in enumerate(self.reco[i][:maxn])])
ndcg+=temp_dcg/temp_idcg;
ndcg/=len(user_in_GT);
for n in self.N:
self.NDCG.append(ndcg[n-1])
def metric_total(self):
self.p = self.len1 * np.array(self.p_c1) + self.len2 * np.array(self.p_c2) + self.len3 * np.array(self.p_c3);
self.p/= self.len1 + self.len2 + self.len3
self.p = list(self.p)
self.r = self.len1 * np.array(self.r_c1) + self.len2 * np.array(self.r_c2) + self.len3 * np.array(self.r_c3);
self.r/= self.len1 + self.len2 + self.len3
self.r = list(self.r)
self.NDCG = self.len1 * np.array(self.NDCG_c1) + self.len2 * np.array(self.NDCG_c2) + self.len3 * np.array(self.NDCG_c3);
self.NDCG/= self.len1 + self.len2 + self.len3
self.NDCG = list(self.NDCG)
def partition_into_groups_(self,ratings=[20,50]):
unique_u, counts_u = np.unique(self.tr['userId'].values,return_counts=True)
self.group1 = unique_u[np.argwhere(counts_u<ratings[0])]
temp = unique_u[np.argwhere(counts_u<ratings[1])]
self.group2 = np.setdiff1d(temp,self.group1)
self.group3 = np.setdiff1d(unique_u,temp)
self.cold_groups = ratings
self.group1 = list(self.group1.reshape(-1))
self.group2 = list(self.group2.reshape(-1))
self.group3 = list(self.group3.reshape(-1))
def precision_and_recall_G(self,group,gn):
user_in_GT=[j for j in self.GT];
leng = 0 ; maxn = max(self.N) ; p = np.zeros(maxn); r = np.zeros(maxn);
for i in user_in_GT:
if i in group:
leng+=1
hit_ = np.cumsum([1.0 if item in self.GT[i] else 0.0 for idx, item in enumerate(self.reco[i][:maxn])])
p+=hit_ / np.arange(1,maxn+1); r+=hit_/len(self.GT[i])
p/= leng; r/=leng;
for n in self.N:
if gn == 1 :
self.p_c1.append(p[n-1])
self.r_c1.append(r[n-1])
self.len1 = leng;
elif gn == 2 :
self.p_c2.append(p[n-1])
self.r_c2.append(r[n-1])
self.len2 = leng;
elif gn == 3 :
self.p_c3.append(p[n-1])
self.r_c3.append(r[n-1])
self.len3 = leng;
def Normalized_DCG_G(self,group,gn):
maxn=max(self.N);
user_in_GT=[j for j in self.GT];
ndcg=np.zeros(maxn);
leng = 0
for i in user_in_GT:
if i in group:
leng+=1
idcg_len = min(len(self.GT[i]), maxn)
temp_idcg = np.cumsum(1.0 / np.log2(np.arange(2, maxn + 2)))
temp_idcg[idcg_len:] = temp_idcg[idcg_len-1]
temp_dcg=np.cumsum([1.0/np.log2(idx+2) if item in self.GT[i] else 0.0 for idx, item in enumerate(self.reco[i][:maxn])])
ndcg+=temp_dcg/temp_idcg;
ndcg/=leng
for n in self.N:
if gn == 1 :
self.NDCG_c1.append(ndcg[n-1])
elif gn == 2 :
self.NDCG_c2.append(ndcg[n-1])
elif gn == 3 :
self.NDCG_c3.append(ndcg[n-1])
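The NDCG@n used throughout evaluate divides the cumulative DCG of the recommended list by an ideal DCG capped at the ground-truth size, so a user with only two relevant items can still reach 1.0. A hand-computed toy example of that formulation:
# Ground truth {1, 4}; recommended list [1, 2, 4]; hits at ranks 1 and 3.
gt = {1, 4}; rec = [1, 2, 4]
dcg = np.cumsum([1.0/np.log2(i+2) if m in gt else 0.0 for i, m in enumerate(rec)])
idcg = np.cumsum(1.0/np.log2(np.arange(2, 5)))
idcg[len(gt):] = idcg[len(gt)-1] # cap the ideal at |GT| = 2 hits
print(dcg/idcg) # NDCG@1..3 ~= [1.0, 0.613, 0.920]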
data_class=Data_loader(args.dataset,args.version)
threshold = round(args.offset) # To generate ground truth set
print('data loading...'); st=time.time()
train,test = data_class.data_load();
train = train.astype({'userId':'int64', 'movieId':'int64'})
print('loading complete! time :: %s'%(time.time()-st))
print('generate negative candidates...'); st=time.time()
neg_dist = deg_dist(train,data_class.num_v)
print('complete ! time : %s'%(time.time()-st))
data loading...
loading complete! time :: 9.793904066085815
generate negative candidates...
complete ! time : 0.08616352081298828
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model= SiReN(train, data_class.num_u,data_class.num_v,offset=args.offset,num_layers = args.num_layers,MLP_layers=args.MLP_layers,dim=args.dim,device=device,reg=args.reg)#.to(device);
model.data_p.to(device)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr = args.lr)
print("\nTraining on {}...\n".format(device))
model.train()
training_dataset=bipartite_dataset(train,neg_dist,args.offset,data_class.num_u,data_class.num_v,args.K);
for EPOCH in range(1,args.epoch+1):
    if EPOCH % 2 == 1: # negatives are resampled in two-epoch blocks
        training_dataset.negs_gen_EP(2)
LOSS=0
    training_dataset.edge_4 = training_dataset.edge_4_tot[:,:,EPOCH%2-1] # slice 0 on odd epochs, -1 (the second slice) on even ones
ds = DataLoader(training_dataset,batch_size=args.batch_size,shuffle=True)
q=0
pbar = tqdm(desc = 'Version : {} Epoch {}/{}'.format(args.version,EPOCH,args.epoch),total=len(ds),position=0)
for u,v,w,negs in ds:
q+=len(u)
st=time.time()
optimizer.zero_grad()
loss = model(u,v,w,negs,device) # original
loss.backward()
optimizer.step()
        LOSS+=loss.item() # the model already returns a batch-summed loss, so no extra weighting is needed
pbar.update(1);
pbar.set_postfix({'loss':loss.item()})
pbar.close()
if EPOCH%2==0 :
directory = os.getcwd() + '/results/%s/SiReN/epoch%s_batch%s_dim%s_lr%s_offset%s_K%s_num_layers%s_MLP_layers%s_threshold%s_reg%s/'%(args.dataset,EPOCH,args.batch_size,args.dim,args.lr,args.offset,args.K,args.num_layers,args.MLP_layers,threshold,args.reg)
if not os.path.exists(directory):
os.makedirs(directory)
model.eval()
emb = model.aggregate();
top_k_list = gen_top_K(data_class,emb,train,directory+'r%s_reco.pickle'%(args.version))
eval_ = evaluate(top_k_list,train,test,threshold,data_class.num_u,data_class.num_v,N=[10,15,20],ratings=[20,50])
print("\n***************************************************************************************")
print(" /* Recommendation Accuracy */")
print('Precision at [10, 15, 20] :: ',eval_.p)
print('Recall at [10, 15, 20] :: ',eval_.r)
print('NDCG at [10, 15, 20] :: ',eval_.NDCG)
print("***************************************************************************************")
directory_ = directory+'r%s_reco.pickle'%(args.version)
with open(directory_,'wb') as fw:
pickle.dump(eval_,fw)
model.train()
Training on cuda:0...

negative sampling for next epochs...
negative sampling for next epochs...: 0%| | 0/6040 [00:00<?, ?it/s]
complete ! 47.54057741165161
Version : 1 Epoch 1/5: 0%| | 0/782 [00:00<?, ?it/s]
Version : 1 Epoch 2/5: 0%| | 0/782 [00:00<?, ?it/s]
top-k recommendation...: 0%| | 0/6040 [00:00<?, ?it/s]
evaluating recommendation accuracy....

***************************************************************************************
 /* Recommendation Accuracy */
Precision at [10, 15, 20] ::  [0.17743606886177699, 0.15966349100228705, 0.14585492227979271]
Recall at [10, 15, 20] ::  [0.10826288445828194, 0.1428493921120366, 0.1714220782607128]
NDCG at [10, 15, 20] ::  [0.19677987212069598, 0.1955777256736281, 0.19695887816559263]
***************************************************************************************
negative sampling for next epochs...
negative sampling for next epochs...: 0%| | 0/6040 [00:00<?, ?it/s]
complete ! 97.63998460769653
Version : 1 Epoch 3/5: 0%| | 0/782 [00:00<?, ?it/s]
Version : 1 Epoch 4/5: 0%| | 0/782 [00:00<?, ?it/s]
top-k recommendation...: 0%| | 0/6040 [00:00<?, ?it/s]
evaluating recommendation accuracy....

***************************************************************************************
 /* Recommendation Accuracy */
Precision at [10, 15, 20] ::  [0.21833528330269186, 0.19131985068806362, 0.1724887180344305]
Recall at [10, 15, 20] ::  [0.1381091856308494, 0.17661215295682997, 0.2087517778197392]
NDCG at [10, 15, 20] ::  [0.26384756370294105, 0.2564248659896275, 0.2555622447550156]
***************************************************************************************
negative sampling for next epochs...
negative sampling for next epochs...: 0%| | 0/6040 [00:00<?, ?it/s]
complete ! 45.37535858154297
Version : 1 Epoch 5/5: 0%| | 0/782 [00:00<?, ?it/s]
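The evaluate objects pickled during training can be reloaded later without rerunning the model. A minimal sketch, assuming the directory layout produced by the run above (the path encodes the hyperparameters, so adjust it if you changed any):
# Reload the metrics saved at the last checkpointed epoch (epoch 4 in this run).
saved_path = os.getcwd() + '/results/ML-1M/SiReN/epoch4_batch1024_dim64_lr0.005_offset3.5_K40_num_layers4_MLP_layers2_threshold4_reg0.05/r1_reco.pickle'
with open(saved_path, 'rb') as fr:
    saved_eval = pickle.load(fr)
print(saved_eval.p, saved_eval.r, saved_eval.NDCG)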
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p torch_geometric
Author: Sparsh A.

Last updated: 2021-11-24 14:00:16

torch_geometric: 2.0.2

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

torch  : 1.10.0+cu111
pandas : 1.1.5
IPython: 5.5.0
numpy  : 1.19.5
END