# --- Colab/IPython cell: clone the project repo on first run, sync it otherwise ---
# NOTE(review): notebook export — `!` (shell) and `%` (magic) lines only run
# under IPython, and the if/else bodies have lost their indentation in the
# export; re-indent before running this as a plain Python script.
import os
project_name = "chef-session"; branch = "main"; account = "sparsh-ai"
# Colab working area: /content/<project_name>
project_path = os.path.join('/content', project_name)
if not os.path.exists(project_path):
# First run: install DVC (with the Google Drive remote extra), restore git
# credentials from Drive, then init/pull the GitHub repo.
!pip install -U -q dvc dvc[gdrive]
!cp -r /content/drive/MyDrive/git_credentials/. ~
path = "/content/" + project_name;
!mkdir "{path}"
%cd "{path}"
!git init
!git remote add origin https://github.com/"{account}"/"{project_name}".git
!git pull origin "{branch}"
!git checkout "{branch}"
else:
# Repo already present: show status and push any local changes.
%cd "{project_path}"
!git status
# (the next line is captured `git status` output pasted into the export, not code)
On branch main Your branch is up to date with 'origin/main'. nothing to commit, working tree clean
!git add . && git commit -m 'commit' && git push origin "{branch}"
# Fetch the DVC-tracked raw booking dataset into ./data/bronze/booking/.
!dvc pull ./data/bronze/booking/*
import pandas as pd

# Load the raw training bookings and order each trip's stays chronologically.
train = (
    pd.read_parquet('./data/bronze/booking/train.parquet.snappy')
    .sort_values(['utrip_id', 'checkin'])
)
train  # notebook display
user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | utrip_id | |
---|---|---|---|---|---|---|---|---|---|
0 | 1000027 | 2016-08-13 | 2016-08-14 | 8183 | desktop | 7168 | Elbonia | Gondal | 1000027_1 |
1 | 1000027 | 2016-08-14 | 2016-08-16 | 15626 | desktop | 7168 | Elbonia | Gondal | 1000027_1 |
2 | 1000027 | 2016-08-16 | 2016-08-18 | 60902 | desktop | 7168 | Elbonia | Gondal | 1000027_1 |
3 | 1000027 | 2016-08-18 | 2016-08-21 | 30628 | desktop | 253 | Elbonia | Gondal | 1000027_1 |
4 | 1000033 | 2016-04-09 | 2016-04-11 | 38677 | mobile | 359 | Gondal | Cobra Island | 1000033_1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1166830 | 999855 | 2016-05-01 | 2016-05-02 | 20345 | mobile | 359 | Gondal | Fook Island | 999855_1 |
1166831 | 999944 | 2016-06-23 | 2016-06-24 | 17944 | desktop | 4541 | Gondal | Glubbdubdrib | 999944_1 |
1166832 | 999944 | 2016-06-24 | 2016-06-27 | 47075 | desktop | 2322 | Gondal | Glubbdubdrib | 999944_1 |
1166833 | 999944 | 2016-06-27 | 2016-06-29 | 228 | desktop | 384 | Gondal | Glubbdubdrib | 999944_1 |
1166834 | 999944 | 2016-06-29 | 2016-06-30 | 62930 | desktop | 4541 | Gondal | Glubbdubdrib | 999944_1 |
1166835 rows × 9 columns
# Load the test bookings and order each trip's stays chronologically.
test = (
    pd.read_parquet('./data/bronze/booking/test.parquet.snappy')
    .sort_values(['utrip_id', 'checkin'])
)
test  # notebook display
user_id | checkin | checkout | device_class | affiliate_id | booker_country | utrip_id | city_id | hotel_country | |
---|---|---|---|---|---|---|---|---|---|
0 | 1000066 | 2016-07-21 | 2016-07-23 | desktop | 9924 | Gondal | 1000066_2 | 56430 | Urkesh |
1 | 1000066 | 2016-07-23 | 2016-07-25 | desktop | 9924 | Gondal | 1000066_2 | 41971 | Urkesh |
2 | 1000066 | 2016-07-25 | 2016-07-28 | desktop | 9924 | Gondal | 1000066_2 | 5797 | Urkesh |
3 | 1000066 | 2016-07-28 | 2016-07-31 | mobile | 2436 | Gondal | 1000066_2 | 0 | None |
4 | 1000270 | 2016-02-08 | 2016-02-09 | mobile | 9452 | The Devilfire Empire | 1000270_1 | 50075 | The Devilfire Empire |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
378662 | 999911 | 2016-10-07 | 2016-10-08 | desktop | 9598 | Gondal | 999911_1 | 0 | None |
378663 | 999991 | 2016-08-15 | 2016-08-17 | desktop | 8065 | Elbonia | 999991_3 | 29770 | Elbonia |
378664 | 999991 | 2016-08-18 | 2016-08-19 | desktop | 8065 | Elbonia | 999991_3 | 36170 | Carpathia |
378665 | 999991 | 2016-08-19 | 2016-08-20 | tablet | 3631 | Elbonia | 999991_3 | 52155 | Elbonia |
378666 | 999991 | 2016-08-21 | 2016-08-22 | tablet | 3631 | Elbonia | 999991_3 | 0 | None |
378667 rows × 9 columns
# Generate Dummy Predictions - use top 4 cities in the trainset as benchmark recommendation
topcities = train.city_id.value_counts().index[:4]
# One row per distinct test trip. reset_index(drop=True) replaces the
# redundant reset_index().drop('index', axis=1) two-step.
test_trips = test[['utrip_id']].drop_duplicates().reset_index(drop=True)
# Every trip gets the same four most-popular cities as its "recommendation".
cities_prediction = pd.DataFrame(
    [topcities] * test_trips.shape[0],
    columns=['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4'],
)
cities_prediction = pd.concat([test_trips, cities_prediction], axis=1)
cities_prediction  # notebook display
utrip_id | city_id_1 | city_id_2 | city_id_3 | city_id_4 | |
---|---|---|---|---|---|
0 | 1000066_2 | 47499 | 23921 | 36063 | 17013 |
1 | 1000270_1 | 47499 | 23921 | 36063 | 17013 |
2 | 1000441_1 | 47499 | 23921 | 36063 | 17013 |
3 | 100048_1 | 47499 | 23921 | 36063 | 17013 |
4 | 1000543_1 | 47499 | 23921 | 36063 | 17013 |
... | ... | ... | ... | ... | ... |
70657 | 999674_1 | 47499 | 23921 | 36063 | 17013 |
70658 | 999797_1 | 47499 | 23921 | 36063 | 17013 |
70659 | 999862_1 | 47499 | 23921 | 36063 | 17013 |
70660 | 999911_1 | 47499 | 23921 | 36063 | 17013 |
70661 | 999991_3 | 47499 | 23921 | 36063 | 17013 |
70662 rows × 5 columns
# Load the per-trip ground truth (final city) keyed by utrip_id.
ground_truth = (
    pd.read_parquet('./data/bronze/booking/ground_truth.parquet.snappy')
    .set_index('utrip_id')
)
ground_truth  # notebook display
city_id | hotel_country | |
---|---|---|
utrip_id | ||
1038944_1 | 54085 | Sokovia |
1068715_1 | 29319 | Cobra Island |
1075528_1 | 55763 | Bozatta |
1110462_4 | 11930 | Alvonia |
1132565_1 | 58659 | Axphain |
... | ... | ... |
881470_1 | 28422 | Cobra Island |
886479_1 | 51291 | Glubbdubdrib |
90072_1 | 22175 | Cobra Island |
96245_1 | 58135 | Nevoruss |
990535_1 | 56503 | Axphain |
70662 rows × 2 columns
def evaluate_accuracy_at_4(predicted, actual):
    """Accuracy@4: fraction of trips whose true city is among the 4 recommended.

    Parameters
    ----------
    predicted : DataFrame with columns 'utrip_id' and 'city_id_1'..'city_id_4'.
    actual : DataFrame indexed by 'utrip_id' with the true 'city_id'.

    Returns
    -------
    float in [0, 1] (NaN if `predicted` is empty, as with the original mean()).
    """
    data = predicted.join(actual, on='utrip_id')
    rec_cols = ['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4']
    # Vectorized membership check replaces four chained '==' comparisons:
    # compare the true city against each recommended column row-wise.
    hits = data[rec_cols].eq(data['city_id'], axis=0).any(axis=1)
    return hits.mean()
# Score the popularity baseline against the ground truth (Accuracy@4).
evaluate_accuracy_at_4(cities_prediction, ground_truth)
# Install ipytest so pytest-style tests can run inside the notebook.
!pip install -q ipytest
import ipytest
ipytest.autoconfig()
# %%ipytest
# NOTE(review): the lines below are a design sketch of the target API, not
# executable Python — forms like `Dataset(path: str)` put signature syntax in
# a call position and would raise SyntaxError if run. Keep this cell as
# documentation only (or comment it out before executing the notebook
# top-to-bottom).
train = Dataset(path: str)
test = Dataset(path: str)
model = Model()
model.fit(train: pd.DataFrame)
model.recommend(test: pd.DataFrame, topk=4)
metrics = Metrics()
hr = metrics.HitRate(k=4)
eval = Evaluator(model,
data = test,
metrics=[hr])
eval.evaluate()
eval.save_results(path: str)
import numpy as np
import pandas as pd
from typing import List
class Dataset:
    """Thin fluent wrapper around a pandas DataFrame for load/clean steps.

    Every transform returns ``self`` so calls can be chained, e.g.
    ``Dataset().load(p).sort(by=[...]).filter(keep=[...])``.
    """

    def __init__(self, data=None):
        # Underlying DataFrame; populated by load() when not supplied here.
        self.data = data

    def load(self, path, type='parquet'):
        """Load the file at `path` into self.data (only 'parquet' supported).

        NOTE: the `type` parameter shadows the builtin; name kept for
        backward compatibility with existing callers.
        """
        if type == 'parquet':
            self.data = pd.read_parquet(path)
        return self

    def sort(self, by: List):
        """Sort rows by the given column list."""
        # BUG FIX: sort_values returns a new frame; the result was previously
        # discarded, making sort() a silent no-op. Assign it back.
        self.data = self.data.sort_values(by=by)
        return self

    def filter(self, by='cols', keep=None):
        """Keep only the columns listed in `keep` (when by='cols')."""
        # Avoid the mutable-default pitfall; [] remains the effective default.
        keep = [] if keep is None else keep
        if by == 'cols':
            self.data = self.data[keep]
        return self

    def rename(self, rename_map):
        """Rename columns via an {old: new} mapping."""
        self.data = self.data.rename(columns=rename_map)
        return self

    def cast(self, schema_map):
        """Cast columns via a {column: dtype} mapping."""
        self.data = self.data.astype(schema_map)
        return self

    def __repr__(self):
        # DataFrame.info() prints to stdout and returns None, which made the
        # old repr print as a side effect and embed the literal 'None'.
        # Capture the info text into a buffer instead.
        import io
        buf = io.StringIO()
        self.data.info(buf=buf)
        return '{}\n{}\n{}\n{}'.format(
            buf.getvalue(), '=' * 100, self.data.head(), '=' * 100
        )
class Model:
    """Popularity recommender: always suggests the globally most-booked items."""

    def __init__(self):
        # Item ids ordered from most to least frequent in the training data.
        self.items_by_popularity = []

    def fit(self, train):
        """Rank every ITEM_ID in `train.data` by booking frequency."""
        counts = train.data['ITEM_ID'].value_counts()
        self.items_by_popularity = list(counts.index)

    def recommend(self, uid=None, topk=4):
        """Return the `topk` most popular items; `uid` is ignored by design."""
        return self.items_by_popularity[:topk]
class HitRate:
    """Binary hit metric: 1 if any true item appears in the top-k list."""

    def __init__(self, k=4):
        # Number of leading recommendations considered.
        self.k = k

    def calculate(self, recommended_list, actual_list):
        """Return 1 when `actual_list` intersects the first k recommendations, else 0."""
        top_k = np.array(recommended_list)[:self.k]
        found = np.isin(np.array(actual_list), top_k)
        return found.any() * 1  # 0/1 flag, matching the original int result

    def __repr__(self):
        return 'HR@{}'.format(self.k)
class Evaluate:
    """Scores a fitted model against ground truth on a set of metrics.

    Parameters
    ----------
    model : object with .recommend(uid) -> list of item ids.
    test_ids : iterable of trip/user ids to evaluate.
    ground_truth : mapping of id -> true item id(s).
    metrics : list of objects with .calculate(recommended_list, actual_list).
    """

    def __init__(self, model, test_ids, ground_truth, metrics):
        self.model = model
        self.test_ids = test_ids
        self.ground_truth = ground_truth
        self.metrics = metrics
        self.results = {}          # metric -> mean score, filled by evaluate()
        self.recommendations = {}  # id -> recommended list, filled below
        self._calculate_recommendations()

    def _calculate_recommendations(self):
        # Precompute one recommendation list per id so every metric reuses it.
        for test_id in self.test_ids:
            self.recommendations[test_id] = self.model.recommend(test_id)

    def evaluate(self):
        """Average each metric over all test ids; store means in self.results."""
        for metric in self.metrics:
            scores = [
                metric.calculate(
                    recommended_list=self.recommendations[test_id],
                    actual_list=self.ground_truth[test_id],
                )
                for test_id in self.test_ids
            ]
            self.results[metric] = np.mean(scores)
        return self

    def save_results(self, path):
        """Write self.results to `path` as text.

        BUG FIX: previously called ``self.results.write(str(handle))`` —
        a dict has no .write(); the receiver and argument were swapped,
        so saving always raised AttributeError.
        """
        with open(path, 'wt') as handle:
            handle.write(str(self.results))

    def __repr__(self):
        return str(self.results)
# Build the modeling train set: one (trip, city) pair per booking, sorted
# chronologically within each trip, with generic USER_ID/ITEM_ID names.
train = Dataset()
train_info = (
    train.load('./data/bronze/booking/train.parquet.snappy')
    .sort(by=['utrip_id', 'checkin'])
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
train_info  # notebook display (info + head)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1166835 entries, 0 to 1166834 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 USER_ID 1166835 non-null object 1 ITEM_ID 1166835 non-null object dtypes: object(2) memory usage: 17.8+ MB
None ==================================================================================================== USER_ID ITEM_ID 0 1000027_1 8183 1 1000027_1 15626 2 1000027_1 60902 3 1000027_1 30628 4 1000033_1 38677 ====================================================================================================
# Build the test set with the same schema as the train set.
test = Dataset()
test_info = (
    test.load('./data/bronze/booking/test.parquet.snappy')
    .sort(by=['utrip_id', 'checkin'])
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
test_info  # notebook display (info + head)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 378667 entries, 0 to 378666 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 USER_ID 378667 non-null object 1 ITEM_ID 378667 non-null object dtypes: object(2) memory usage: 5.8+ MB
None ==================================================================================================== USER_ID ITEM_ID 0 1000066_2 56430 1 1000066_2 41971 2 1000066_2 5797 3 1000066_2 0 4 1000270_1 50075 ====================================================================================================
# Fit the popularity model on the prepared train set and preview the
# (identical-for-everyone) recommendation for one test trip.
model = Model()
model.fit(train)
model.recommend('1000066_2')
['47499', '23921', '36063', '17013']
# Sanity-check HitRate@4: '4' is inside the top-4 window (hit -> 1),
# '5' is the fifth recommendation and falls outside it (miss -> 0).
hitrate = HitRate(k=4)
hitrate
print(hitrate.calculate(recommended_list=['1','2','3','4','5'], actual_list = ['4']))
print(hitrate.calculate(recommended_list=['1','2','3','4','5'], actual_list = ['5']))
1 0
# Load the ground truth and rebind it as a plain {USER_ID: ITEM_ID} dict
# (keeping the last city per trip) for fast per-trip lookup in Evaluate.
ground_truth = Dataset()
gt_info = (
    ground_truth.load('./data/bronze/booking/ground_truth.parquet.snappy')
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
deduped = ground_truth.data.drop_duplicates(subset='USER_ID', keep='last')
ground_truth = deduped.set_index('USER_ID')['ITEM_ID'].to_dict()
print(type(ground_truth), len(ground_truth.keys()))
<class 'dict'> 70662
# Run the end-to-end evaluation of the popularity model (HR@4).
# `eval` shadowed the Python builtin; renamed to `evaluator`, with the old
# name kept as an alias so any downstream cells still work.
evaluator = Evaluate(model=model,
                     test_ids=test.data.USER_ID.unique(),
                     ground_truth=ground_truth,
                     metrics=[hitrate])
eval = evaluator  # backward-compat alias — avoid shadowing `eval` in new code
evaluator.evaluate()
{HR@4: 0.05271574537941185}