Estimate the predictability limits imposed by randomness in user behavior and by algorithm design for several session-based recommendation methods.
import numpy as np
import pandas as pd
from datetime import timezone, datetime, timedelta
import time
'''
preprocessing method ["info","org","org_min_date","days_test","slice","buys"]
info: just load and show info
org: from gru4rec (last day => test set)
org_min_date: from gru4rec (last day => test set) but from a minimal date onwards
days_test: adapted from gru4rec (last N days => test set)
slice: new (create multiple train-test combinations with a sliding-window approach)
buys: load buy events and save the file to the prepared folder
'''
# METHOD = "slice"
METHOD = input('Preprocessing method (info/org/org_min_date/days_test/slice/buys):') or 'slice'
assert(METHOD in 'info/org/org_min_date/days_test/slice/buys'.split('/')), 'Invalid Preprocessing method.'
'''
data config (all methods)
'''
PATH = './retailrocket/'
PATH_PROCESSED = './retailrocket/slices/'
FILE = 'events'
'''
org_min_date config
'''
MIN_DATE = '2015-09-02'
'''
filtering config (all methods)
'''
SESSION_LENGTH = 30 * 60 #30 minutes
MIN_SESSION_LENGTH = 2
MIN_ITEM_SUPPORT = 5
MIN_DATE = '2014-04-01' #note: overrides the org_min_date value set above
'''
days test default config
'''
DAYS_TEST = 2
'''
slicing default config
'''
NUM_SLICES = 5 #number of train/test slices to create
DAYS_OFFSET = 0 #offset in days from the first date in the data set
DAYS_SHIFT = 27 #number of days the training start date is shifted after creating one slice
#each slice consists of...
DAYS_TRAIN = 25
DAYS_TEST = 2
Preprocessing method (info/org/org_min_date/days_test/slice/buys):slice
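A quick sanity check (a sketch, not part of the original script): the sliding-window parameters above translate into the train/test windows below. The day arithmetic mirrors slice_data()/split_data_slice() further down; the start date is the first event date in the Retailrocket data (2015-05-03, as shown in the preprocessing output further below).
from datetime import datetime, timedelta
# print the train/test windows produced by the slicing config above
# (uses NUM_SLICES, DAYS_OFFSET, DAYS_SHIFT, DAYS_TRAIN, DAYS_TEST defined earlier)
window_start = datetime(2015, 5, 3)  # first event date observed in the data
for slice_id in range(NUM_SLICES):
    start = window_start + timedelta(DAYS_OFFSET + slice_id * DAYS_SHIFT)
    middle = start + timedelta(DAYS_TRAIN)  # train/test boundary
    end = middle + timedelta(DAYS_TEST)
    print('slice {}: train {} -> {}, test {} -> {}'.format(
        slice_id, start.date(), middle.date(), middle.date(), end.date()))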
!wget -q --show-progress https://github.com/RecoHut-Datasets/retail_rocket/raw/v2/retailrocket.zip
!unzip retailrocket.zip
!mkdir retailrocket/slices
retailrocket.zip 100%[===================>] 32.00M 147MB/s in 0.2s Archive: retailrocket.zip creating: retailrocket/ inflating: retailrocket/events.csv creating: retailrocket/prepared_window/ inflating: retailrocket/prepared_window/events.0.hdf inflating: retailrocket/prepared_window/events.1.hdf inflating: retailrocket/prepared_window/events.2.hdf inflating: retailrocket/prepared_window/events.3.hdf inflating: retailrocket/prepared_window/events.4.hdf
#preprocessing from original gru4rec
def preprocess_org( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
split_data_org( data, path_proc+file )
#preprocessing from original gru4rec but from a certain point in time
def preprocess_org_min_date( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH, min_date=MIN_DATE ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
data = filter_min_date( data, min_date )
split_data_org( data, path_proc+file )
#preprocessing adapted from original gru4rec
def preprocess_days_test( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH, days_test=DAYS_TEST ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
split_data( data, path_proc+file, days_test )
#preprocessing from original gru4rec but from a certain point in time
def preprocess_days_test_min_date( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH, days_test=DAYS_TEST, min_date=MIN_DATE ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
data = filter_min_date( data, min_date )
split_data( data, path_proc+file, days_test )
#preprocessing to create data slices with a sliding window
def preprocess_slices( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH,
num_slices = NUM_SLICES, days_offset = DAYS_OFFSET, days_shift = DAYS_SHIFT, days_train = DAYS_TRAIN, days_test=DAYS_TEST ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
slice_data( data, path_proc+file, num_slices, days_offset, days_shift, days_train, days_test )
#just load and show info
def preprocess_info( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
def preprocess_save( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH ):
data, buys = load_data( path+file )
data = filter_data( data, min_item_support, min_session_length )
data.to_csv(path_proc + file + '_preprocessed.txt', sep='\t', index=False)
#preprocessing to create a file with buy actions
def preprocess_buys( path=PATH, file=FILE, path_proc=PATH_PROCESSED ):
data, buys = load_data( path+file )
store_buys(buys, path_proc+file)
def load_data( file ) :
#load csv
data = pd.read_csv( file+'.csv', sep=',', header=0, usecols=[0,1,2,3], dtype={0:np.int64, 1:np.int32, 2:str, 3:np.int32})
#specify header names
data.columns = ['Time','UserId','Type','ItemId']
data['Time'] = (data.Time / 1000).astype( int )
data.sort_values( ['UserId','Time'], ascending=True, inplace=True )
#sessionize
data['TimeTmp'] = pd.to_datetime(data.Time, unit='s')
data.sort_values( ['UserId','TimeTmp'], ascending=True, inplace=True )
# users = data.groupby('UserId')
data['TimeShift'] = data['TimeTmp'].shift(1)
data['TimeDiff'] = (data['TimeTmp'] - data['TimeShift']).dt.total_seconds().abs()
data['SessionIdTmp'] = (data['TimeDiff'] > SESSION_LENGTH).astype( int )
data['SessionId'] = data['SessionIdTmp'].cumsum( skipna=False )
del data['SessionIdTmp'], data['TimeShift'], data['TimeDiff']
data.sort_values( ['SessionId','Time'], ascending=True, inplace=True )
cart = data[data.Type == 'addtocart']
data = data[data.Type == 'view']
del data['Type']
print(data)
#output
print( data.Time.min() )
print( data.Time.max() )
data_start = datetime.fromtimestamp( data.Time.min(), timezone.utc )
data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
del data['TimeTmp']
print('Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
format( len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(), data_end.date().isoformat() ) )
return data, cart;
def filter_data( data, min_item_support, min_session_length ) :
    #filter out sessions with only one event
session_lengths = data.groupby('SessionId').size()
data = data[np.in1d(data.SessionId, session_lengths[ session_lengths>1 ].index)]
#filter item support
item_supports = data.groupby('ItemId').size()
data = data[np.in1d(data.ItemId, item_supports[ item_supports>= min_item_support ].index)]
#filter session length
session_lengths = data.groupby('SessionId').size()
data = data[np.in1d(data.SessionId, session_lengths[ session_lengths>= min_session_length ].index)]
#output
data_start = datetime.fromtimestamp( data.Time.min(), timezone.utc )
data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
print('Filtered data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
format( len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(), data_end.date().isoformat() ) )
return data;
def filter_min_date( data, min_date='2014-04-01' ) :
min_datetime = datetime.strptime(min_date + ' 00:00:00', '%Y-%m-%d %H:%M:%S')
#filter
session_max_times = data.groupby('SessionId').Time.max()
session_keep = session_max_times[ session_max_times > min_datetime.timestamp() ].index
data = data[ np.in1d(data.SessionId, session_keep) ]
#output
data_start = datetime.fromtimestamp( data.Time.min(), timezone.utc )
data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
print('Filtered data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
format( len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(), data_end.date().isoformat() ) )
return data;
def split_data_org( data, output_file ) :
tmax = data.Time.max()
session_max_times = data.groupby('SessionId').Time.max()
session_train = session_max_times[session_max_times < tmax-86400].index
session_test = session_max_times[session_max_times >= tmax-86400].index
train = data[np.in1d(data.SessionId, session_train)]
test = data[np.in1d(data.SessionId, session_test)]
test = test[np.in1d(test.ItemId, train.ItemId)]
tslength = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]
print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
train.to_csv(output_file + '_train_full.txt', sep='\t', index=False)
print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
test.to_csv(output_file + '_test.txt', sep='\t', index=False)
tmax = train.Time.max()
session_max_times = train.groupby('SessionId').Time.max()
session_train = session_max_times[session_max_times < tmax-86400].index
session_valid = session_max_times[session_max_times >= tmax-86400].index
train_tr = train[np.in1d(train.SessionId, session_train)]
valid = train[np.in1d(train.SessionId, session_valid)]
valid = valid[np.in1d(valid.ItemId, train_tr.ItemId)]
tslength = valid.groupby('SessionId').size()
valid = valid[np.in1d(valid.SessionId, tslength[tslength>=2].index)]
print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), train_tr.SessionId.nunique(), train_tr.ItemId.nunique()))
train_tr.to_csv( output_file + '_train_tr.txt', sep='\t', index=False)
print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), valid.SessionId.nunique(), valid.ItemId.nunique()))
valid.to_csv( output_file + '_train_valid.txt', sep='\t', index=False)
def split_data( data, output_file, days_test ) :
data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
test_from = data_end - timedelta( days_test )
session_max_times = data.groupby('SessionId').Time.max()
session_train = session_max_times[ session_max_times < test_from.timestamp() ].index
session_test = session_max_times[ session_max_times >= test_from.timestamp() ].index
train = data[np.in1d(data.SessionId, session_train)]
test = data[np.in1d(data.SessionId, session_test)]
test = test[np.in1d(test.ItemId, train.ItemId)]
tslength = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]
print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
train.to_csv(output_file + '_train_full.txt', sep='\t', index=False)
print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
test.to_csv(output_file + '_test.txt', sep='\t', index=False)
def slice_data( data, output_file, num_slices, days_offset, days_shift, days_train, days_test ):
for slice_id in range( 0, num_slices ) :
split_data_slice( data, output_file, slice_id, days_offset+(slice_id*days_shift), days_train, days_test )
def split_data_slice( data, output_file, slice_id, days_offset, days_train, days_test ) :
data_start = datetime.fromtimestamp( data.Time.min(), timezone.utc )
data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
print('Full data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
format( slice_id, len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.isoformat(), data_end.isoformat() ) )
start = datetime.fromtimestamp( data.Time.min(), timezone.utc ) + timedelta( days_offset )
middle = start + timedelta( days_train )
end = middle + timedelta( days_test )
#prefilter the timespan
session_max_times = data.groupby('SessionId').Time.max()
greater_start = session_max_times[session_max_times >= start.timestamp()].index
lower_end = session_max_times[session_max_times <= end.timestamp()].index
data_filtered = data[np.in1d(data.SessionId, greater_start.intersection( lower_end ))]
print('Slice data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} / {}'.
format( slice_id, len(data_filtered), data_filtered.SessionId.nunique(), data_filtered.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat(), end.date().isoformat() ) )
#split to train and test
session_max_times = data_filtered.groupby('SessionId').Time.max()
sessions_train = session_max_times[session_max_times < middle.timestamp()].index
sessions_test = session_max_times[session_max_times >= middle.timestamp()].index
train = data[np.in1d(data.SessionId, sessions_train)]
print('Train set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
format( slice_id, len(train), train.SessionId.nunique(), train.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat() ) )
train.to_csv(output_file + '_train_full.'+str(slice_id)+'.txt', sep='\t', index=False)
test = data[np.in1d(data.SessionId, sessions_test)]
test = test[np.in1d(test.ItemId, train.ItemId)]
tslength = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]
print('Test set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} \n\n'.
format( slice_id, len(test), test.SessionId.nunique(), test.ItemId.nunique(), middle.date().isoformat(), end.date().isoformat() ) )
test.to_csv(output_file + '_test.'+str(slice_id)+'.txt', sep='\t', index=False)
def store_buys( buys, target ):
buys.to_csv( target + '_buys.txt', sep='\t', index=False )
if __name__ == '__main__':
'''
Run the preprocessing configured above.
'''
print( "START preprocessing ", METHOD )
sc, st = time.time(), time.time()
if METHOD == "info":
        preprocess_info( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH )
elif METHOD == "org":
preprocess_org( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH )
elif METHOD == "org_min_date":
preprocess_org_min_date( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH, MIN_DATE )
elif METHOD == "day_test":
preprocess_days_test( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH, DAYS_TEST )
elif METHOD == "day_test_min_date":
preprocess_days_test_min_date( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH, DAYS_TEST, MIN_DATE )
elif METHOD == "slice":
preprocess_slices( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH, NUM_SLICES, DAYS_OFFSET, DAYS_SHIFT, DAYS_TRAIN, DAYS_TEST )
elif METHOD == "buys":
preprocess_buys( PATH, FILE, PATH_PROCESSED )
elif METHOD == "save":
preprocess_save( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH )
else:
print( "Invalid method ", METHOD )
print( "END preproccessing ", (time.time() - sc), "c ", (time.time() - st), "s" )
START preprocessing slice Time UserId ItemId TimeTmp SessionId 1361687 1442004589 0 285930 2015-09-11 20:49:49 0 1367212 1442004759 0 357564 2015-09-11 20:52:39 0 1367342 1442004917 0 67045 2015-09-11 20:55:17 0 830385 1439487966 1 72028 2015-08-13 17:46:06 1 742616 1438969904 2 325215 2015-08-07 17:51:44 2 ... ... ... ... ... ... 206556 1433972768 1407575 121220 2015-06-10 21:46:08 1761093 47311 1433343689 1407576 356208 2015-06-03 15:01:29 1761094 1762583 1431899284 1407577 427784 2015-05-17 21:48:04 1761095 1744277 1431825683 1407578 188736 2015-05-17 01:21:23 1761096 482559 1435184526 1407579 2521 2015-06-24 22:22:06 1761097 [2664312 rows x 5 columns] 1430622011 1442545187 Loaded data set Events: 2664312 Sessions: 1755206 Items: 234838 Span: 2015-05-03 / 2015-09-18 Filtered data set Events: 1085763 Sessions: 306919 Items: 49070 Span: 2015-05-03 / 2015-09-18 Full data set 0 Events: 1085763 Sessions: 306919 Items: 49070 Span: 2015-05-03T03:00:33+00:00 / 2015-09-18T02:58:58+00:00 Slice data set 0 Events: 230003 Sessions: 63746 Items: 32977 Span: 2015-05-03 / 2015-05-28 / 2015-05-30 Train set 0 Events: 213660 Sessions: 59110 Items: 32052 Span: 2015-05-03 / 2015-05-28 Test set 0 Events: 14457 Sessions: 4136 Items: 6506 Span: 2015-05-28 / 2015-05-30 Full data set 1 Events: 1085763 Sessions: 306919 Items: 49070 Span: 2015-05-03T03:00:33+00:00 / 2015-09-18T02:58:58+00:00 Slice data set 1 Events: 229891 Sessions: 62631 Items: 33577 Span: 2015-05-30 / 2015-06-24 / 2015-06-26 Train set 1 Events: 212266 Sessions: 57795 Items: 32529 Span: 2015-05-30 / 2015-06-24 Test set 1 Events: 15425 Sessions: 4260 Items: 6801 Span: 2015-06-24 / 2015-06-26 Full data set 2 Events: 1085763 Sessions: 306919 Items: 49070 Span: 2015-05-03T03:00:33+00:00 / 2015-09-18T02:58:58+00:00 Slice data set 2 Events: 224835 Sessions: 62257 Items: 34396 Span: 2015-06-26 / 2015-07-21 / 2015-07-23 Train set 2 Events: 207176 Sessions: 57229 Items: 33453 Span: 2015-06-26 / 2015-07-21 Test set 2 Events: 15650 Sessions: 4486 Items: 6937 Span: 2015-07-21 / 2015-07-23 Full data set 3 Events: 1085763 Sessions: 306919 Items: 49070 Span: 2015-05-03T03:00:33+00:00 / 2015-09-18T02:58:58+00:00 Slice data set 3 Events: 206774 Sessions: 60383 Items: 33305 Span: 2015-07-23 / 2015-08-17 / 2015-08-19 Train set 3 Events: 191128 Sessions: 55786 Items: 32263 Span: 2015-07-23 / 2015-08-17 Test set 3 Events: 13486 Sessions: 3959 Items: 6065 Span: 2015-08-17 / 2015-08-19 Full data set 4 Events: 1085763 Sessions: 306919 Items: 49070 Span: 2015-05-03T03:00:33+00:00 / 2015-09-18T02:58:58+00:00 Slice data set 4 Events: 179452 Sessions: 53595 Items: 30420 Span: 2015-08-19 / 2015-09-13 / 2015-09-15 Train set 4 Events: 166160 Sessions: 49492 Items: 29543 Span: 2015-08-19 / 2015-09-13 Test set 4 Events: 11502 Sessions: 3561 Items: 5329 Span: 2015-09-13 / 2015-09-15 END preproccessing 17.43916940689087 c 17.43917155265808 s
import time
import os.path
import numpy as np
import pandas as pd
from datetime import timezone, datetime
def load_data( path, file, rows_train=None, rows_test=None, slice_num=None, density=1, train_eval=False ):
'''
Loads a tuple of training and test set with the given parameters.
Parameters
--------
path : string
Base path to look in for the prepared data files
file : string
Prefix of the dataset you want to use.
"yoochoose-clicks-full" loads yoochoose-clicks-full_train_full.txt and yoochoose-clicks-full_test.txt
rows_train : int or None
Number of rows to load from the training set file.
This option will automatically filter the test set to only retain items included in the training set.
rows_test : int or None
Number of rows to load from the test set file.
slice_num :
Adds a slice index to the constructed file_path
yoochoose-clicks-full_train_full.0.txt
density : float
Percentage of the sessions to randomly retain from the original data (0-1).
The result is cached for the execution of multiple experiments.
Returns
--------
out : tuple of pandas.DataFrame
(train, test)
'''
print('START load data')
st = time.time()
sc = time.time()
split = ''
if( slice_num != None and isinstance(slice_num, int ) ):
split = '.'+str(slice_num)
train_appendix = '_train_full'
test_appendix = '_test'
if train_eval:
train_appendix = '_train_tr'
test_appendix = '_train_valid'
density_appendix = ''
if( density < 1 ): #create sample
if not os.path.isfile( path + file + train_appendix + split + '.txt.'+str( density ) ) :
train = pd.read_csv(path + file + train_appendix + split + '.txt', sep='\t', dtype={'ItemId':np.int64})
test = pd.read_csv(path + file + test_appendix + split + '.txt', sep='\t', dtype={'ItemId':np.int64} )
sessions = train.SessionId.unique()
drop_n = round( len(sessions) - (len(sessions) * density) )
drop_sessions = np.random.choice(sessions, drop_n, replace=False)
train = train[ ~train.SessionId.isin( drop_sessions ) ]
train.to_csv( path + file + train_appendix +split+'.txt.'+str(density), sep='\t', index=False )
sessions = test.SessionId.unique()
drop_n = round( len(sessions) - (len(sessions) * density) )
drop_sessions = np.random.choice(sessions, drop_n, replace=False)
test = test[ ~test.SessionId.isin( drop_sessions ) ]
test = test[np.in1d(test.ItemId, train.ItemId)]
test.to_csv( path + file + test_appendix +split+'.txt.'+str(density), sep='\t', index=False )
density_appendix = '.'+str(density)
if( rows_train == None ):
train = pd.read_csv(path + file + train_appendix +split+'.txt'+density_appendix, sep='\t', dtype={'ItemId':np.int64})
else:
train = pd.read_csv(path + file + train_appendix +split+'.txt'+density_appendix, sep='\t', dtype={'ItemId':np.int64}, nrows=rows_train)
session_lengths = train.groupby('SessionId').size()
train = train[np.in1d(train.SessionId, session_lengths[ session_lengths>1 ].index)]
if( rows_test == None ):
test = pd.read_csv(path + file + test_appendix +split+'.txt'+density_appendix, sep='\t', dtype={'ItemId':np.int64} )
else :
test = pd.read_csv(path + file + test_appendix +split+'.txt'+density_appendix, sep='\t', dtype={'ItemId':np.int64}, nrows=rows_test )
session_lengths = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, session_lengths[ session_lengths>1 ].index)]
# rows_train = 10000
# train = train.tail(10000)
if( rows_train != None ):
test = test[np.in1d(test.ItemId, train.ItemId)]
session_lengths = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, session_lengths[ session_lengths>1 ].index)]
#output
data_start = datetime.fromtimestamp( train.Time.min(), timezone.utc )
data_end = datetime.fromtimestamp( train.Time.max(), timezone.utc )
print('Loaded train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n'.
format( len(train), train.SessionId.nunique(), train.ItemId.nunique(), data_start.date().isoformat(), data_end.date().isoformat() ) )
data_start = datetime.fromtimestamp( test.Time.min(), timezone.utc )
data_end = datetime.fromtimestamp( test.Time.max(), timezone.utc )
print('Loaded test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n'.
format( len(test), test.SessionId.nunique(), test.ItemId.nunique(), data_start.date().isoformat(), data_end.date().isoformat() ) )
print( 'END load data ', (time.time()-sc), 'c / ', (time.time()-st), 's' )
return (train, test)
def load_buys( path, file ):
'''
    Load all buy events from the yoochoose file, retain the events that fit the given test set, and merge both data sets into one
Parameters
--------
path : string
Base path to look in for the prepared data files
file : string
Prefix of the dataset you want to use.
"yoochoose-clicks-full" loads yoochoose-clicks-full_train_full.txt and yoochoose-clicks-full_test.txt
Returns
--------
out : pandas.DataFrame
test with buys
'''
print('START load buys')
st = time.time()
sc = time.time()
#load csv
buys = pd.read_csv(path + file + '.txt', sep='\t', dtype={'ItemId':np.int64})
print( 'END load buys ', (time.time()-sc), 'c / ', (time.time()-st), 's' )
return buys
def dump_sequence(data_path, file_prefix, out_fn, density=1, slic=0):
"""
Convert training/testing slices into a sequence format
suitable for entropy rate estimation
"""
train, test = load_data(data_path, file_prefix,
rows_train=None, rows_test=None, density=density,
slice_num=slic)
# append all
    all_data = pd.concat([train, test])  # DataFrame.append was removed in pandas 2.0
# sort by sequence, then timestamp
groupby = all_data.groupby("SessionId")
with open(out_fn, "w") as f:
for session_id, session in groupby:
item_ids = [item_id for
item_id in session.sort_values("Time")["ItemId"]]
for item_id in item_ids:
f.write("{}\n".format(item_id))
f.write("-1\n")
data_path = './retailrocket/slices/'
file_prefix = 'events'
output_file = './retailrocket/seq/s0.txt'
d = 1 # downsample the input data (0.1 = use only 10% of input)
s = 0 # slice number, 0-4
!mkdir ./retailrocket/seq
dump_sequence(data_path, file_prefix, output_file, d, s)
START load data Loaded train set Events: 213660 Sessions: 59110 Items: 32052 Span: 2015-05-03 / 2015-05-28 Loaded test set Events: 14457 Sessions: 4136 Items: 6506 Span: 2015-05-28 / 2015-05-30 END load data 0.10811591148376465 c / 0.10811710357666016 s
import numpy as np
from collections import defaultdict
def calc_entropy2(in_fn):
"""
Entropy rate estimation for a sequence
input: file with each sequence element (integer) on its own row
"""
with open(in_fn) as f:
events = [int(l.strip()) for l in f]
# calculate Lempel-Ziv estimate of entropy
lambda_sum = 0
seq1 = set() # single item sequences
seq2 = set() # two-item sequences
seq3 = defaultdict(list) # three-item sequences index
n = len(events)
print(in_fn, n)
timestep = int(n / 10) + 1
for i in range(n):
k_max = 0
# single item
if events[i] in seq1:
k_max = 1
# two items
if i + 1 < n and tuple(events[i:i+2]) in seq2:
k_max = 2
# three or more
if i + 2 < n:
for subseq_start in seq3[tuple(events[i:i+3])]:
k = 3
while subseq_start + k < i and i + k < n:
if events[subseq_start + k] != events[i + k]:
break
k += 1
k_max = max(k, k_max)
lambda_sum += (k_max + 1) # as in Xu, et al. (2019)
#print(i, ev, k_max)
# update index
seq1.add(events[i])
if i > 0:
seq2.add(tuple(events[i-1:i+1]))
if i > 1:
seq3[tuple(events[i-2:i+1])].append(i - 2)
if i % timestep == 0 and i > 0:
print(i, "done")
S = (n / lambda_sum) * np.log2(n)
print("S:", S)
print("m (for \Pi^max equation):", len(seq1))
input_file = './retailrocket/seq/s0.txt'
calc_entropy2(input_file)
./retailrocket/seq/s0.txt 291363 29137 done 58274 done 87411 done 116548 done 145685 done 174822 done 203959 done 233096 done 262233 done S: 7.136608275066677 m (for \Pi^max equation): 32053
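For reference (notation as in Xu et al. (2019), which the code comment cites): the quantity k_max + 1 added to lambda_sum is the length $\Lambda_i$ of the shortest subsequence starting at position $i$ that has not appeared earlier, and the printed S is the Lempel-Ziv entropy rate estimate

$$S \approx \frac{n \log_2 n}{\sum_{i=1}^{n} \Lambda_i},$$

which is exactly `S = (n / lambda_sum) * np.log2(n)` in calc_entropy2.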
The predictability limit Π^max can then be computed from the entropy rate estimate S and the unique event count m by numerically solving Fano's inequality, as sketched below.
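A minimal sketch of that computation (not from the original notebook), using simple bisection; the S and m values are the ones printed above for slice 0:
import numpy as np

def max_predictability(S, m, tol=1e-10):
    """Solve S = H(p) + (1 - p) * log2(m - 1) for p = Pi^max (Fano's inequality)."""
    def gap(p):
        h = -p * np.log2(p) - (1 - p) * np.log2(1 - p)  # binary entropy of p
        return h + (1 - p) * np.log2(m - 1) - S
    lo, hi = 1e-12, 1 - 1e-12  # gap(lo) > 0 and gap(hi) < 0 whenever S < log2(m)
    while hi - lo > tol:
        mid = (lo + hi) / 2
        if gap(mid) > 0:
            lo = mid
        else:
            hi = mid
    return (lo + hi) / 2

S = 7.136608275066677  # entropy rate estimate printed above (slice 0)
m = 32053              # unique symbol count printed above (slice 0)
print("Pi^max:", max_predictability(S, m))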
Next, calculate how often the item to be predicted (the target in a recommendation accuracy test) co-occurs in the training data with the current item (the input given to the recommender).
def test_all(data_path, file_prefix, density=1, slic=[0]):
all_stats = defaultdict(int)
for i in slic:
train, test = load_data(data_path, file_prefix,
rows_train=None, rows_test=None, density=density,
slice_num=i)
s, i2s = load_sessions(train)
print(data_path, file_prefix, i)
stats = test_reachability(s, i2s, test)
for k, v in stats.items():
all_stats[k] += v
for k, v in all_stats.items():
print(k, v)
def test_reachability(sessions, item2session, data, max_span=10):
"""Item co-occurrence in sessions"""
stats = {"r_cnt" : 0,
"cnt_next" : 0,
"cnt_fwd10" : 0,
"cnt_anywhere" : 0,
"cnt_anywhere_sess" : 0}
groupby = data.groupby("SessionId")
for session_id, session in groupby:
item_ids = [item_id for
item_id in session.sort_values("Time")["ItemId"]]
l = len(item_ids)
for i in range(l - 1):
# step 1: calculate relative to current item
# MC cnt_next
# SR, windowed NB cnt_fwd10
# AR cnt_anywhere
item_id = item_ids[i]
target_id = item_ids[i + 1]
next_found = 0
fwd10_found = 0
any_found = 0
sess_found = 0
seen_sessions = set()
# loop through all sessions
for train_sess_id in item2session[item_id]:
seen_sessions.add(train_sess_id)
train_sess = sessions[train_sess_id]
last_item = None
                for pos, train_item in enumerate(train_sess):  # separate index so the outer i is not clobbered
if train_item == target_id:
any_found = 1
sess_found = 1
if last_item == item_id:
next_found = 1
fwd10_found = 1
break
                    elif not fwd10_found and pos > 1 and item_id in train_sess[max(0, pos - max_span):pos - 1]:
fwd10_found = 1
last_item = train_item
if next_found:
break
# otherwise need to keep searching other sessions
# step 2: search using the remainder of the items seen so far
# NB cnt_anywhere_sess
if not sess_found:
sess_so_far = set(item_ids[:i])
for item_id in sess_so_far:
for train_sess_id in item2session[item_id]:
if train_sess_id in seen_sessions:
continue
seen_sessions.add(train_sess_id)
train_sess = sessions[train_sess_id]
last_item = None
                        for train_item in train_sess:  # index not needed here
if train_item == target_id:
sess_found = 1
break
# summarize results
stats["r_cnt"] += 1
stats["cnt_next"] += next_found
stats["cnt_fwd10"] += fwd10_found
stats["cnt_anywhere"] += any_found
stats["cnt_anywhere_sess"] += sess_found
return stats
def test_forward_backward(sessions, item2session, data):
"""Statistics of whether the item to predict occurs
before or after the current item (when co-occurring in a session)
"""
stats = {"f_cnt" : 0,
"cnt_bwd" : 0,
"cnt_fwd" : 0,
"cnt_both" : 0}
groupby = data.groupby("SessionId")
for session_id, session in groupby:
item_ids = [item_id for
item_id in session.sort_values("Time")["ItemId"]]
l = len(item_ids)
for i in range(l - 1):
item_id = item_ids[i]
target_id = item_ids[i + 1]
if item_id == target_id:
continue
common_sessions = set(item2session[item_id]).intersection(
set(item2session[target_id]))
bwd = 0
fwd = 0
both = 0
# loop through all sessions
for train_sess_id in common_sessions:
train_sess = sessions[train_sess_id]
item_pos = []
target_pos = []
for i in range(len(train_sess)):
if train_sess[i] == item_id:
item_pos.append(i)
elif train_sess[i] == target_id:
target_pos.append(i)
b = f = 0
if min(target_pos) < max(item_pos):
b = 1
if min(item_pos) < max(target_pos):
f = 1
bwd += b
fwd += f
if b == f:
both += 1
# summarize results
stats["f_cnt"] += len(common_sessions)
stats["cnt_bwd"] += bwd
stats["cnt_fwd"] += fwd
stats["cnt_both"] += both
return stats
def test_out_edges(sessions, item2session):
"""Count outgoing edges in an item-to-item graph
(edge is one item following another in a session)
"""
stats = {"e_cnt" : 0,
"cnt_u20" : 0,
"cnt_u10" : 0,
"cnt_u05" : 0}
out_cnt = defaultdict(set)
for session_id, item_ids in sessions.items():
last_item_id = None
for item_id in item_ids:
if last_item_id is not None:
out_cnt[last_item_id].add(item_id)
last_item_id = item_id
for item_id, out_edges in out_cnt.items():
stats["e_cnt"] += 1
l = len(out_edges)
if l <= 20:
stats["cnt_u20"] += 1
if l <= 10:
stats["cnt_u10"] += 1
if l <= 5:
stats["cnt_u05"] += 1
return stats
def load_sessions(data):
"""Build a dictionary of sessions and a lookup map for
finding which sessions an item belongs to
"""
sessions = defaultdict(list)
item2session = defaultdict(list)
groupby = data.groupby("SessionId")
for session_id, session in groupby:
item_ids = [item_id for
item_id in session.sort_values("Time")["ItemId"]]
sessions[session_id] = item_ids
for item_id in item_ids:
item2session[item_id].append(session_id)
return sessions, item2session
d = 1 # downsample the input data (0.1 = use only 10% of input)
data_path = './retailrocket/slices/'
file_prefix = 'events'
test_all(data_path, file_prefix, d, [0,1,2,3,4])
START load data Loaded train set Events: 213660 Sessions: 59110 Items: 32052 Span: 2015-05-03 / 2015-05-28 Loaded test set Events: 14457 Sessions: 4136 Items: 6506 Span: 2015-05-28 / 2015-05-30 END load data 0.10039901733398438 c / 0.10040020942687988 s ./retailrocket/slices/ events 0 START load data Loaded train set Events: 212266 Sessions: 57795 Items: 32529 Span: 2015-05-30 / 2015-06-24 Loaded test set Events: 15425 Sessions: 4260 Items: 6801 Span: 2015-06-24 / 2015-06-26 END load data 0.08721494674682617 c / 0.08721661567687988 s ./retailrocket/slices/ events 1 START load data Loaded train set Events: 207176 Sessions: 57229 Items: 33453 Span: 2015-06-26 / 2015-07-21 Loaded test set Events: 15650 Sessions: 4486 Items: 6937 Span: 2015-07-21 / 2015-07-23 END load data 0.08588290214538574 c / 0.08588480949401855 s ./retailrocket/slices/ events 2 START load data Loaded train set Events: 191128 Sessions: 55786 Items: 32263 Span: 2015-07-23 / 2015-08-17 Loaded test set Events: 13486 Sessions: 3959 Items: 6065 Span: 2015-08-17 / 2015-08-19 END load data 0.08804202079772949 c / 0.0880436897277832 s ./retailrocket/slices/ events 3 START load data Loaded train set Events: 166160 Sessions: 49492 Items: 29543 Span: 2015-08-19 / 2015-09-13 Loaded test set Events: 11502 Sessions: 3561 Items: 5329 Span: 2015-09-13 / 2015-09-15 END load data 0.07229804992675781 c / 0.07229948043823242 s ./retailrocket/slices/ events 4 r_cnt 50118 cnt_next 18197 cnt_fwd10 22035 cnt_anywhere 28417 cnt_anywhere_sess 40052
"r_cnt"
in results is the total number of test cases examined.
Interpreting the results (a short sketch after the table turns these counts into ratios):
Key | Item to predict appears | Applies to algorithm |
---|---|---|
cnt_next | next to current item | MC, SF-SKNN |
cnt_fwd10 | among 10 items after current item | SR |
cnt_anywhere | anywhere in session | AR, IKNN |
cnt_anywhere_sess | in session with any current session item | *SKNN |
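As a rough reading (a sketch, not in the original notebook): dividing each count by r_cnt gives the fraction of test cases in which the target item is reachable at all for that algorithm family, i.e. a design-imposed ceiling on its hit rate. The numbers are copied from the aggregated output for slices 0-4 above.
# reachability ratios = upper bounds on hit rate imposed by algorithm design
stats = {
    "r_cnt": 50118,
    "cnt_next": 18197,           # MC, SF-SKNN
    "cnt_fwd10": 22035,          # SR
    "cnt_anywhere": 28417,       # AR, IKNN
    "cnt_anywhere_sess": 40052,  # *SKNN
}
for key in ("cnt_next", "cnt_fwd10", "cnt_anywhere", "cnt_anywhere_sess"):
    print("{}: {:.3f}".format(key, stats[key] / stats["r_cnt"]))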
# !apt-get -qq install tree
# !rm -r sample_data
# !tree -h --du .
. ├── [217M] retailrocket │ ├── [ 90M] events.csv │ ├── [ 60M] prepared_window │ │ ├── [ 12M] events.0.hdf │ │ ├── [ 12M] events.1.hdf │ │ ├── [ 12M] events.2.hdf │ │ ├── [ 11M] events.3.hdf │ │ └── [ 11M] events.4.hdf │ └── [ 67M] slices │ ├── [115K] events_test.txt │ ├── [ 33M] events_train_full.txt │ ├── [ 33M] events_train_tr.txt │ └── [132K] events_train_valid.txt └── [ 32M] retailrocket.zip 249M used in 3 directories, 11 files
# !pip install -q watermark
# %reload_ext watermark
# %watermark -a "Sparsh A." -m -iv -u -t -d
Author: Sparsh A. Last updated: 2021-12-04 17:20:19 Compiler : GCC 7.5.0 OS : Linux Release : 5.4.104+ Machine : x86_64 Processor : x86_64 CPU cores : 2 Architecture: 64bit IPython: 5.5.0 pandas : 1.1.5 numpy : 1.19.5 sys : 3.7.12 (default, Sep 10 2021, 00:21:48) [GCC 7.5.0]
END