Booking.com trip recommendation part 1 - baseline model

Part 1 of the Booking.com trip recommendation challenge: a baseline that recommends the four most-visited cities to every trip.

toc: true
badges: true
comments: true
categories: [travel]
image:

In [1]:

import pandas as pd

In [ ]:

!wget https://github.com/sparsh-ai/reco-data/raw/master/BookingChallenge.zip
!unzip BookingChallenge.zip

In [5]:

# Load the training bookings, ordered by trip id and then by check-in within each trip.
train_set = (
    pd.read_csv('train_set.csv')
    .sort_values(by=['utrip_id', 'checkin'])
)

print(train_set.shape)
train_set.head()

(1166835, 9)

Out[5]:

	user_id	checkin	checkout	city_id	device_class	affiliate_id	booker_country	hotel_country	utrip_id
0	1000027	2016-08-13	2016-08-14	8183	desktop	7168	Elbonia	Gondal	1000027_1
1	1000027	2016-08-14	2016-08-16	15626	desktop	7168	Elbonia	Gondal	1000027_1
2	1000027	2016-08-16	2016-08-18	60902	desktop	7168	Elbonia	Gondal	1000027_1
3	1000027	2016-08-18	2016-08-21	30628	desktop	253	Elbonia	Gondal	1000027_1
4	1000033	2016-04-09	2016-04-11	38677	mobile	359	Gondal	Cobra Island	1000033_1

In [6]:

# Load the test bookings with the same ordering as the training set:
# by trip id, then by check-in within each trip.
test_set = (
    pd.read_csv('test_set.csv')
    .sort_values(by=['utrip_id', 'checkin'])
)

print(test_set.shape)
test_set.head()

(378667, 9)

Out[6]:

	user_id	checkin	checkout	device_class	affiliate_id	booker_country	utrip_id	city_id	hotel_country
0	1000066	2016-07-21	2016-07-23	desktop	9924	Gondal	1000066_2	56430	Urkesh
1	1000066	2016-07-23	2016-07-25	desktop	9924	Gondal	1000066_2	41971	Urkesh
2	1000066	2016-07-25	2016-07-28	desktop	9924	Gondal	1000066_2	5797	Urkesh
3	1000066	2016-07-28	2016-07-31	mobile	2436	Gondal	1000066_2	0	NaN
4	1000270	2016-02-08	2016-02-09	mobile	9452	The Devilfire Empire	1000270_1	50075	The Devilfire Empire

In [7]:

# Which four city_ids occur most often in the training bookings?
# value_counts() sorts by descending frequency, so the first four entries are the top 4.
topcities = train_set['city_id'].value_counts().head(4).index
topcities

Out[7]:

Int64Index([47499, 23921, 36063, 17013], dtype='int64')

In [9]:

# How many trips are there in the test set?
# Keep one row per distinct utrip_id and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and dropping it afterwards (which would
# raise a KeyError if the index ever carried a name).
test_trips = test_set[['utrip_id']].drop_duplicates().reset_index(drop=True)
len(test_trips)

Out[9]:

In [10]:

# Baseline: a simple rule with no learning - recommend the same four
# most-visited cities for every trip in the test set.
cities_prediction = pd.DataFrame(
    data=[topcities] * len(test_trips),
    columns=['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4'],
)
cities_prediction.head(5)

Out[10]:

	city_id_1	city_id_2	city_id_3	city_id_4
0	47499	23921	36063	17013
1	47499	23921	36063	17013
2	47499	23921	36063	17013
3	47499	23921	36063	17013
4	47499	23921	36063	17013

In [11]:

# Put each trip id next to its recommended cities. Both frames have the same
# length and a default 0..n-1 row index, so the column-wise concat pairs rows up
# one to one.
predictions = pd.concat(
    [test_trips, cities_prediction],
    axis='columns',
)

print(predictions.shape)
predictions.head()

(70662, 5)

Out[11]:

	utrip_id	city_id_1	city_id_2	city_id_3	city_id_4
0	1000066_2	47499	23921	36063	17013
1	1000270_1	47499	23921	36063	17013
2	1000441_1	47499	23921	36063	17013
3	100048_1	47499	23921	36063	17013
4	1000543_1	47499	23921	36063	17013

In [12]:

# Load the true city_id / hotel_country per test trip; the first CSV column
# (the trip id) becomes the index so it can be joined against later.
ground_truth = pd.read_csv(
    'ground_truth.csv',
    index_col=[0],
)

print(ground_truth.shape)
ground_truth.head()

(70662, 2)

Out[12]:

	city_id	hotel_country
utrip_id
1038944_1	54085	Sokovia
1068715_1	29319	Cobra Island
1075528_1	55763	Bozatta
1110462_4	11930	Alvonia
1132565_1	58659	Axphain

In [13]:

def evaluate_accuracy_at_4(predictions, ground_truth):
    '''Return the share of trips whose true city is among the four recommended ones.

    predictions: DataFrame with a `utrip_id` column and the recommendation
        columns `city_id_1`, `city_id_2`, `city_id_3` and `city_id_4`.
    ground_truth: DataFrame indexed by trip id that holds the true `city_id`.

    The ground truth is left-joined onto the predictions, so a trip without a
    ground-truth row gets a missing `city_id` and counts as a miss.
    '''
    merged = predictions.join(ground_truth, on='utrip_id')
    true_city = merged['city_id']

    # OR together one boolean "this slot matches the true city" mask per slot.
    hit_mask = None
    for slot in ('city_id_1', 'city_id_2', 'city_id_3', 'city_id_4'):
        slot_match = true_city == merged[slot]
        hit_mask = slot_match if hit_mask is None else hit_mask | slot_match

    # Turn the boolean mask into 0/1 so that its mean is the hit rate.
    return (hit_mask * 1).mean()

In [14]:

# Accuracy@4 of the "same top-4 cities for everyone" baseline on the test trips.
evaluate_accuracy_at_4(predictions=predictions, ground_truth=ground_truth)

Out[14]:

0.05271574537941185

In [ ]: