import numpy as np
import pandas as pd
import keras
from keras import Model
from keras.regularizers import l2
from keras.optimizers import (
Adam,
Adamax,
Adagrad,
SGD,
RMSprop
)
from keras.layers import (
Embedding,
Input,
Flatten,
Multiply,
Concatenate,
Dense
)
import sys
sys.path.append('../')
from cf_ec2 import (
GMF,
MLP,
NCF,
Data,
evaluation
)
Using TensorFlow backend.
train = pd.read_csv('../data/ml-1m.train.rating',sep='\t',header=None,names=['user','item','rating','event_ts'])
test = pd.read_csv('../data/ml-1m.test.rating',sep='\t',header=None,names=['user','item','rating','event_ts'])
train.head(3)
| | user | item | rating | event_ts |
|---|---|---|---|---|
| 0 | 0 | 32 | 4 | 978824330 |
| 1 | 0 | 34 | 4 | 978824330 |
| 2 | 0 | 4 | 5 | 978824291 |
test.head(3)
| | user | item | rating | event_ts |
|---|---|---|---|---|
| 0 | 0 | 25 | 5 | 978824351 |
| 1 | 1 | 133 | 3 | 978300174 |
| 2 | 2 | 207 | 4 | 978298504 |
test.user.nunique(), test.shape
(6040, (6040, 4))
dataset = Data(
train=train,
test=test,
col_user='user',
col_item='item',
col_rating='rating',
col_time='event_ts',
binary=True,
n_neg=4,
n_neg_test=100
)
dataset.prepTrainDNN()
dataset.prepTestDNN()
dataset.negativeSampling()
len(dataset.users),train.shape
(4970845, (994169, 6))
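With n_neg=4, negativeSampling pairs each of the 994,169 observed training interactions with four sampled negatives, so the flattened training arrays hold 994,169 × (1 + 4) = 4,970,845 examples (the train frame now reports six columns, presumably because the prep methods add helper columns to it in place).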
len(dataset.users_test),test.shape
(610040, (6040, 6))
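Likewise, with n_neg_test=100 each of the 6,040 test users keeps one held-out positive and gets 100 sampled negatives, giving 6,040 × 101 = 610,040 test examples.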
train.user.nunique(), test.user.nunique()
(6040, 6040)
train.item.nunique(), test.item.nunique()
(3704, 1921)
dataset.interaction_train.head(3)
| | user | item_interacted | item_negative |
|---|---|---|---|
| 0 | 0 | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | {52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6... |
| 1 | 1 | {15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5... | {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... |
| 2 | 2 | {2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1... | {0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15... |
len(set(dataset.users)), len(set(dataset.items))
(6040, 3704)
len(set(dataset.users_test)), len(set(dataset.items_test))
(6040, 3706)
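Two item ids appear in the test arrays but never in the training data (3,706 vs 3,704 distinct items), so the model has no trained embedding for them. The next cell drops those rows, which leaves 610,038 validation examples.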
#### drop test rows whose item never appears in the training set
newItems = set(dataset.items_test) - set(dataset.items)
idx2del = set()
for idx, item in enumerate(dataset.items_test):
    if item in newItems:
        idx2del.add(idx)
length_test_original = len(dataset.users_test)
dataset.users_test = [
    dataset.users_test[idx]
    for idx in range(length_test_original) if idx not in idx2del
]
dataset.items_test = [
    dataset.items_test[idx]
    for idx in range(length_test_original) if idx not in idx2del
]
dataset.ratings_test = [
    dataset.ratings_test[idx]
    for idx in range(length_test_original) if idx not in idx2del
]
n_users = 6040
n_items = 3704
n_factors_gmf = 32
layers_mlp = [64,32,16,8]
reg_gmf = 0.
reg_layers_mlp = [0.,0.,0.,0.]
learning_rate = 0.01
flg_pretrain = ''
filepath = ''
filepath_gmf_pretrain = ''
filepath_mlp_pretrain = ''
num_epochs = 20
batch_size = 100
ncf = NCF(
n_users=n_users,
n_items=n_items,
n_factors_gmf=n_factors_gmf,
layers_mlp=layers_mlp,
reg_gmf=reg_gmf,
reg_layers_mlp=reg_layers_mlp
)
model = ncf.create_model()
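The NCF class comes from the local cf_ec2 package and is not reproduced here. Judging from the model.summary() output further below, create_model fuses a GMF branch and an MLP branch roughly as in the following sketch; the layer names, activations, and sigmoid output are assumptions inferred from that summary, not the actual cf_ec2 code, and regularizers are omitted.
def create_model_sketch(n_users, n_items, n_factors_gmf, layers_mlp):
    # reuses the keras layers imported at the top of the notebook
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')
    # GMF branch: element-wise product of user and item embeddings
    gmf_user = Flatten()(Embedding(n_users, n_factors_gmf)(user_input))
    gmf_item = Flatten()(Embedding(n_items, n_factors_gmf)(item_input))
    gmf_vector = Multiply()([gmf_user, gmf_item])
    # MLP branch: concatenated embeddings pushed through the dense tower;
    # the first entry of layers_mlp is the size of the concatenated embedding
    mlp_user = Flatten()(Embedding(n_users, layers_mlp[0] // 2)(user_input))
    mlp_item = Flatten()(Embedding(n_items, layers_mlp[0] // 2)(item_input))
    mlp_vector = Concatenate()([mlp_user, mlp_item])
    for units in layers_mlp[1:]:
        mlp_vector = Dense(units, activation='relu')(mlp_vector)
    # fuse both branches into a single interaction probability
    prediction = Dense(1, activation='sigmoid', name='output')(
        Concatenate()([gmf_vector, mlp_vector]))
    return Model(inputs=[user_input, item_input], outputs=prediction)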
#### compile the model
model.compile(
optimizer=Adam(lr=learning_rate),
loss='binary_crossentropy',
metrics=['accuracy']
)
#### create the callback metrics
filepath="../metadata/ncf/ncf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(
filepath= filepath,
verbose=1,
save_best_only=True
)
csvlog = keras.callbacks.CSVLogger(
'../metadata/ncf/ncf_log.csv',
separator=',',
append=False
)
earlystop = keras.callbacks.EarlyStopping(patience=12)
lrreduce = keras.callbacks.ReduceLROnPlateau(
monitor="val_loss",
factor=0.3,
patience=4,
verbose=1
)
class newMetrics(keras.callbacks.Callback):
    # compute AUC on the validation data at the end of every epoch;
    # self.validation_data holds [user array, item array, labels, sample weights]
    def on_epoch_end(self, epoch, logs=None):
        X_val = [self.validation_data[0], self.validation_data[1]]
        y_val = self.validation_data[2]
        y_predict = model.predict(x=X_val)
        logs['val_auc'] = evaluation.auc(y_val, y_predict)
metrics2 = newMetrics()
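evaluation.auc also comes from cf_ec2; a minimal stand-in, assuming it is simply ROC AUC over the predicted probabilities (the real helper may differ), would be:
from sklearn.metrics import roc_auc_score

def auc(y_true, y_pred):
    # hypothetical stand-in for cf_ec2's evaluation.auc;
    # y_pred is the (n, 1) array of probabilities returned by model.predict
    return roc_auc_score(np.asarray(y_true), np.asarray(y_pred).ravel())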
#### train
hist = model.fit(
x = [
np.array(dataset.users),
np.array(dataset.items)
],
y = np.array(dataset.ratings),
batch_size=batch_size,
epochs=num_epochs,
verbose=2,
shuffle=True,
callbacks=[metrics2,checkpoint,csvlog,earlystop,lrreduce],
validation_data=(
[
np.array(dataset.users_test),
np.array(dataset.items_test)
],
np.array(dataset.ratings_test)
)
)
/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow_core/python/framework/indexed_slices.py:424: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory. "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Train on 4970845 samples, validate on 610038 samples
Epoch 1/20
 - 260s - loss: 0.3409 - accuracy: 0.8465 - val_loss: 0.1485 - val_accuracy: 0.9459
Epoch 00001: val_loss improved from inf to 0.14850, saving model to ../metadata/ncf/ncf-weights-improvement-01-0.1485.hdf5
Epoch 2/20
 - 252s - loss: 0.3123 - accuracy: 0.8611 - val_loss: 0.1842 - val_accuracy: 0.9258
Epoch 00002: val_loss did not improve from 0.14850
Epoch 3/20
 - 272s - loss: 0.3037 - accuracy: 0.8661 - val_loss: 0.1595 - val_accuracy: 0.9413
Epoch 00003: val_loss did not improve from 0.14850
Epoch 4/20
 - 264s - loss: 0.2990 - accuracy: 0.8687 - val_loss: 0.1650 - val_accuracy: 0.9344
Epoch 00004: val_loss did not improve from 0.14850
Epoch 5/20
 - 263s - loss: 0.2942 - accuracy: 0.8719 - val_loss: 0.1507 - val_accuracy: 0.9395
Epoch 00005: val_loss did not improve from 0.14850
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0029999999329447745.
Epoch 6/20
 - 247s - loss: 0.2747 - accuracy: 0.8808 - val_loss: 0.1807 - val_accuracy: 0.9248
Epoch 00006: val_loss did not improve from 0.14850
Epoch 7/20
 - 247s - loss: 0.2698 - accuracy: 0.8837 - val_loss: 0.1595 - val_accuracy: 0.9363
Epoch 00007: val_loss did not improve from 0.14850
Epoch 8/20
 - 237s - loss: 0.2627 - accuracy: 0.8878 - val_loss: 0.1495 - val_accuracy: 0.9404
Epoch 00008: val_loss did not improve from 0.14850
Epoch 9/20
 - 237s - loss: 0.2538 - accuracy: 0.8925 - val_loss: 0.1833 - val_accuracy: 0.9252
Epoch 00009: val_loss did not improve from 0.14850
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0009000000078231095.
Epoch 10/20
 - 233s - loss: 0.2358 - accuracy: 0.9011 - val_loss: 0.1506 - val_accuracy: 0.9392
Epoch 00010: val_loss did not improve from 0.14850
Epoch 11/20
 - 234s - loss: 0.2321 - accuracy: 0.9031 - val_loss: 0.1592 - val_accuracy: 0.9351
Epoch 00011: val_loss did not improve from 0.14850
Epoch 12/20
 - 235s - loss: 0.2291 - accuracy: 0.9046 - val_loss: 0.1607 - val_accuracy: 0.9341
Epoch 00012: val_loss did not improve from 0.14850
Epoch 13/20
 - 254s - loss: 0.2264 - accuracy: 0.9060 - val_loss: 0.1729 - val_accuracy: 0.9296
Epoch 00013: val_loss did not improve from 0.14850
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.00026999999536201356.
dataset.users_test[:5], dataset.items_test[:5], dataset.ratings_test[:5], dataset.ratings[:5]
([0, 0, 0, 0, 0], [398, 2310, 2068, 2263, 1366], [1.0, 0.0, 0.0, 0.0, 0.0], array([1., 0., 0., 0., 0.]))
model.summary()
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
user_input (InputLayer)         (None, 1)            0
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0
__________________________________________________________________________________________________
embedding_mlp_User (Embedding)  (None, 1, 32)        193280      user_input[0][0]
__________________________________________________________________________________________________
embedding_mlp_Item (Embedding)  (None, 1, 32)        118528      item_input[0][0]
__________________________________________________________________________________________________
flatten_mlp_User (Flatten)      (None, 32)           0           embedding_mlp_User[0][0]
__________________________________________________________________________________________________
flatten_mlp_Item (Flatten)      (None, 32)           0           embedding_mlp_Item[0][0]
__________________________________________________________________________________________________
concat_mlp_UserItem (Concatenat (None, 64)           0           flatten_mlp_User[0][0]
                                                                 flatten_mlp_Item[0][0]
__________________________________________________________________________________________________
embedding_gmf_User (Embedding)  (None, 1, 32)        193280      user_input[0][0]
__________________________________________________________________________________________________
embedding_gmf_Item (Embedding)  (None, 1, 32)        118528      item_input[0][0]
__________________________________________________________________________________________________
mlp_layer_1 (Dense)             (None, 32)           2080        concat_mlp_UserItem[0][0]
__________________________________________________________________________________________________
flatten_gmf_User (Flatten)      (None, 32)           0           embedding_gmf_User[0][0]
__________________________________________________________________________________________________
flatten_gmf_Item (Flatten)      (None, 32)           0           embedding_gmf_Item[0][0]
__________________________________________________________________________________________________
mlp_layer_2 (Dense)             (None, 16)           528         mlp_layer_1[0][0]
__________________________________________________________________________________________________
multiply_gmf_UserItem (Multiply (None, 32)           0           flatten_gmf_User[0][0]
                                                                 flatten_gmf_Item[0][0]
__________________________________________________________________________________________________
mlp_layer_3 (Dense)             (None, 8)             136         mlp_layer_2[0][0]
__________________________________________________________________________________________________
concat_gmf_mlp (Concatenate)    (None, 40)            0           multiply_gmf_UserItem[0][0]
                                                                  mlp_layer_3[0][0]
__________________________________________________________________________________________________
output (Dense)                  (None, 1)              41          concat_gmf_mlp[0][0]
==================================================================================================
Total params: 626,401
Trainable params: 626,401
Non-trainable params: 0
__________________________________________________________________________________________________
hist.history
{'val_loss': [0.1485011165742999, 0.18416567678242549, 0.15954290449423783, 0.1650358262217424, 0.1506881339903949, 0.18065066654141143, 0.1595049133731821, 0.14945811447602397, 0.18333839380089495, 0.15058260036174537, 0.15917451103293503, 0.16069093156649547, 0.1729039898706394],
 'val_accuracy': [0.945934534072876, 0.9257521629333496, 0.9413331747055054, 0.9343729615211487, 0.9395332932472229, 0.9247997403144836, 0.9362990260124207, 0.9404119253158569, 0.9251571297645569, 0.939182460308075, 0.9351253509521484, 0.9341335892677307, 0.9295814633369446],
 'loss': [0.34090790990596076, 0.31232653234047203, 0.3036990459802174, 0.2990263803599324, 0.2942262135450036, 0.27472649822994305, 0.2697909015162971, 0.2626914049020689, 0.2537869813717614, 0.2357732586596515, 0.2320512784263774, 0.22910663956301205, 0.22642496287824582],
 'accuracy': [0.84647256, 0.8611172, 0.86606985, 0.86869276, 0.8718952, 0.8808474, 0.8836669, 0.8877706, 0.89248186, 0.90110457, 0.9031352, 0.90458083, 0.90598017],
 'val_auc': [0.8447472837591501, 0.858340589621141, 0.8635911596862258, 0.8709023831407706, 0.8599701511015224, 0.8788743092862202, 0.8787586388578734, 0.8794580647071855, 0.8790272227328466, 0.8798275979502883, 0.8773889560378091, 0.8815762746534642, 0.8802717551533443],
 'lr': [0.01, 0.01, 0.01, 0.01, 0.01, 0.003, 0.003, 0.003, 0.003, 0.0009, 0.0009, 0.0009, 0.0009]}
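To see how the two ReduceLROnPlateau drops affect convergence, the history dict can be plotted directly; a quick matplotlib sketch (not part of the original run):
import matplotlib.pyplot as plt

# left panel: train vs validation loss; right panel: custom validation AUC per epoch
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(hist.history['loss'], label='train loss')
ax1.plot(hist.history['val_loss'], label='val loss')
ax1.set_xlabel('epoch')
ax1.legend()
ax2.plot(hist.history['val_auc'], label='val AUC')
ax2.set_xlabel('epoch')
ax2.legend()
plt.show()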