# --- Colab/IPython cell: clone the project repo on first run, sync it otherwise ---
# NOTE(review): notebook export — `!` (shell) and `%` (magic) lines only run
# under IPython, and the if/else bodies have lost their indentation in the
# export; re-indent before running this as a plain Python script.
import os
project_name = "chef-session"; branch = "main"; account = "sparsh-ai"
# Colab working area: /content/<project_name>
project_path = os.path.join('/content', project_name)
if not os.path.exists(project_path):
# First run: install DVC (with the Google Drive remote extra), restore git
# credentials from Drive, then init/pull the GitHub repo.
!pip install -U -q dvc dvc[gdrive]
!cp -r /content/drive/MyDrive/git_credentials/. ~
path = "/content/" + project_name;
!mkdir "{path}"
%cd "{path}"
!git init
!git remote add origin https://github.com/"{account}"/"{project_name}".git
!git pull origin "{branch}"
!git checkout "{branch}"
else:
# Repo already present: show status and push any local changes.
%cd "{project_path}"
!git status
# (the next line is captured `git status` output pasted into the export, not code)
On branch main Your branch is up to date with 'origin/main'. nothing to commit, working tree clean
!git add . && git commit -m 'commit' && git push origin "{branch}"
# Fetch the DVC-tracked raw booking dataset into ./data/bronze/booking/.
!dvc pull ./data/bronze/booking/*
import pandas as pd

# Load the raw training bookings and order each trip's stays chronologically.
train = (
    pd.read_parquet('./data/bronze/booking/train.parquet.snappy')
    .sort_values(['utrip_id', 'checkin'])
)
train  # notebook display
user_id | checkin | checkout | city_id | device_class | affiliate_id | booker_country | hotel_country | utrip_id | |
---|---|---|---|---|---|---|---|---|---|
0 | 1000027 | 2016-08-13 | 2016-08-14 | 8183 | desktop | 7168 | Elbonia | Gondal | 1000027_1 |
1 | 1000027 | 2016-08-14 | 2016-08-16 | 15626 | desktop | 7168 | Elbonia | Gondal | 1000027_1 |
2 | 1000027 | 2016-08-16 | 2016-08-18 | 60902 | desktop | 7168 | Elbonia | Gondal | 1000027_1 |
3 | 1000027 | 2016-08-18 | 2016-08-21 | 30628 | desktop | 253 | Elbonia | Gondal | 1000027_1 |
4 | 1000033 | 2016-04-09 | 2016-04-11 | 38677 | mobile | 359 | Gondal | Cobra Island | 1000033_1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1166830 | 999855 | 2016-05-01 | 2016-05-02 | 20345 | mobile | 359 | Gondal | Fook Island | 999855_1 |
1166831 | 999944 | 2016-06-23 | 2016-06-24 | 17944 | desktop | 4541 | Gondal | Glubbdubdrib | 999944_1 |
1166832 | 999944 | 2016-06-24 | 2016-06-27 | 47075 | desktop | 2322 | Gondal | Glubbdubdrib | 999944_1 |
1166833 | 999944 | 2016-06-27 | 2016-06-29 | 228 | desktop | 384 | Gondal | Glubbdubdrib | 999944_1 |
1166834 | 999944 | 2016-06-29 | 2016-06-30 | 62930 | desktop | 4541 | Gondal | Glubbdubdrib | 999944_1 |
1166835 rows × 9 columns
# Load the test bookings and order each trip's stays chronologically.
test = (
    pd.read_parquet('./data/bronze/booking/test.parquet.snappy')
    .sort_values(['utrip_id', 'checkin'])
)
test  # notebook display
user_id | checkin | checkout | device_class | affiliate_id | booker_country | utrip_id | city_id | hotel_country | |
---|---|---|---|---|---|---|---|---|---|
0 | 1000066 | 2016-07-21 | 2016-07-23 | desktop | 9924 | Gondal | 1000066_2 | 56430 | Urkesh |
1 | 1000066 | 2016-07-23 | 2016-07-25 | desktop | 9924 | Gondal | 1000066_2 | 41971 | Urkesh |
2 | 1000066 | 2016-07-25 | 2016-07-28 | desktop | 9924 | Gondal | 1000066_2 | 5797 | Urkesh |
3 | 1000066 | 2016-07-28 | 2016-07-31 | mobile | 2436 | Gondal | 1000066_2 | 0 | None |
4 | 1000270 | 2016-02-08 | 2016-02-09 | mobile | 9452 | The Devilfire Empire | 1000270_1 | 50075 | The Devilfire Empire |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
378662 | 999911 | 2016-10-07 | 2016-10-08 | desktop | 9598 | Gondal | 999911_1 | 0 | None |
378663 | 999991 | 2016-08-15 | 2016-08-17 | desktop | 8065 | Elbonia | 999991_3 | 29770 | Elbonia |
378664 | 999991 | 2016-08-18 | 2016-08-19 | desktop | 8065 | Elbonia | 999991_3 | 36170 | Carpathia |
378665 | 999991 | 2016-08-19 | 2016-08-20 | tablet | 3631 | Elbonia | 999991_3 | 52155 | Elbonia |
378666 | 999991 | 2016-08-21 | 2016-08-22 | tablet | 3631 | Elbonia | 999991_3 | 0 | None |
378667 rows × 9 columns
# Generate Dummy Predictions - use top 4 cities in the trainset as benchmark recommendation
topcities = train.city_id.value_counts().index[:4]
# One row per distinct test trip. reset_index(drop=True) replaces the
# redundant reset_index().drop('index', axis=1) two-step.
test_trips = test[['utrip_id']].drop_duplicates().reset_index(drop=True)
# Every trip gets the same four most-popular cities as its "recommendation".
cities_prediction = pd.DataFrame(
    [topcities] * test_trips.shape[0],
    columns=['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4'],
)
cities_prediction = pd.concat([test_trips, cities_prediction], axis=1)
cities_prediction  # notebook display
utrip_id | city_id_1 | city_id_2 | city_id_3 | city_id_4 | |
---|---|---|---|---|---|
0 | 1000066_2 | 47499 | 23921 | 36063 | 17013 |
1 | 1000270_1 | 47499 | 23921 | 36063 | 17013 |
2 | 1000441_1 | 47499 | 23921 | 36063 | 17013 |
3 | 100048_1 | 47499 | 23921 | 36063 | 17013 |
4 | 1000543_1 | 47499 | 23921 | 36063 | 17013 |
... | ... | ... | ... | ... | ... |
70657 | 999674_1 | 47499 | 23921 | 36063 | 17013 |
70658 | 999797_1 | 47499 | 23921 | 36063 | 17013 |
70659 | 999862_1 | 47499 | 23921 | 36063 | 17013 |
70660 | 999911_1 | 47499 | 23921 | 36063 | 17013 |
70661 | 999991_3 | 47499 | 23921 | 36063 | 17013 |
70662 rows × 5 columns
# Load the per-trip ground truth (final city) keyed by utrip_id.
ground_truth = (
    pd.read_parquet('./data/bronze/booking/ground_truth.parquet.snappy')
    .set_index('utrip_id')
)
ground_truth  # notebook display
city_id | hotel_country | |
---|---|---|
utrip_id | ||
1038944_1 | 54085 | Sokovia |
1068715_1 | 29319 | Cobra Island |
1075528_1 | 55763 | Bozatta |
1110462_4 | 11930 | Alvonia |
1132565_1 | 58659 | Axphain |
... | ... | ... |
881470_1 | 28422 | Cobra Island |
886479_1 | 51291 | Glubbdubdrib |
90072_1 | 22175 | Cobra Island |
96245_1 | 58135 | Nevoruss |
990535_1 | 56503 | Axphain |
70662 rows × 2 columns
def evaluate_accuracy_at_4(predicted, actual):
    """Accuracy@4: fraction of trips whose true city is among the 4 recommended.

    Parameters
    ----------
    predicted : DataFrame with columns 'utrip_id' and 'city_id_1'..'city_id_4'.
    actual : DataFrame indexed by 'utrip_id' with the true 'city_id'.

    Returns
    -------
    float in [0, 1] (NaN if `predicted` is empty, as with the original mean()).
    """
    data = predicted.join(actual, on='utrip_id')
    rec_cols = ['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4']
    # Vectorized membership check replaces four chained '==' comparisons:
    # compare the true city against each recommended column row-wise.
    hits = data[rec_cols].eq(data['city_id'], axis=0).any(axis=1)
    return hits.mean()
# Score the popularity baseline against the ground truth (Accuracy@4).
evaluate_accuracy_at_4(cities_prediction, ground_truth)
# Install ipytest so pytest-style tests can run inside the notebook.
!pip install -q ipytest
import ipytest
ipytest.autoconfig()
# %%ipytest
# NOTE(review): the lines below are a design sketch of the target API, not
# executable Python — forms like `Dataset(path: str)` put signature syntax in
# a call position and would raise SyntaxError if run. Keep this cell as
# documentation only (or comment it out before executing the notebook
# top-to-bottom).
train = Dataset(path: str)
test = Dataset(path: str)
model = Model()
model.fit(train: pd.DataFrame)
model.recommend(test: pd.DataFrame, topk=4)
metrics = Metrics()
hr = metrics.HitRate(k=4)
eval = Evaluator(model,
data = test,
metrics=[hr])
eval.evaluate()
eval.save_results(path: str)
import numpy as np
import pandas as pd
from typing import List
class Dataset:
    """Thin fluent wrapper around a pandas DataFrame for load/clean steps.

    Every transform returns ``self`` so calls can be chained, e.g.
    ``Dataset().load(p).sort(by=[...]).filter(keep=[...])``.
    """

    def __init__(self, data=None):
        # Underlying DataFrame; populated by load() when not supplied here.
        self.data = data

    def load(self, path, type='parquet'):
        """Load the file at `path` into self.data (only 'parquet' supported).

        NOTE: the `type` parameter shadows the builtin; name kept for
        backward compatibility with existing callers.
        """
        if type == 'parquet':
            self.data = pd.read_parquet(path)
        return self

    def sort(self, by: List):
        """Sort rows by the given column list."""
        # BUG FIX: sort_values returns a new frame; the result was previously
        # discarded, making sort() a silent no-op. Assign it back.
        self.data = self.data.sort_values(by=by)
        return self

    def filter(self, by='cols', keep=None):
        """Keep only the columns listed in `keep` (when by='cols')."""
        # Avoid the mutable-default pitfall; [] remains the effective default.
        keep = [] if keep is None else keep
        if by == 'cols':
            self.data = self.data[keep]
        return self

    def rename(self, rename_map):
        """Rename columns via an {old: new} mapping."""
        self.data = self.data.rename(columns=rename_map)
        return self

    def cast(self, schema_map):
        """Cast columns via a {column: dtype} mapping."""
        self.data = self.data.astype(schema_map)
        return self

    def __repr__(self):
        # DataFrame.info() prints to stdout and returns None, which made the
        # old repr print as a side effect and embed the literal 'None'.
        # Capture the info text into a buffer instead.
        import io
        buf = io.StringIO()
        self.data.info(buf=buf)
        return '{}\n{}\n{}\n{}'.format(
            buf.getvalue(), '=' * 100, self.data.head(), '=' * 100
        )
class Model:
    """Popularity recommender: always suggests the globally most-booked items."""

    def __init__(self):
        # Item ids ordered from most to least frequent in the training data.
        self.items_by_popularity = []

    def fit(self, train):
        """Rank every ITEM_ID in `train.data` by booking frequency."""
        counts = train.data['ITEM_ID'].value_counts()
        self.items_by_popularity = list(counts.index)

    def recommend(self, uid=None, topk=4):
        """Return the `topk` most popular items; `uid` is ignored by design."""
        return self.items_by_popularity[:topk]
class HitRate:
    """Binary hit metric: 1 if any true item appears in the top-k list."""

    def __init__(self, k=4):
        # Number of leading recommendations considered.
        self.k = k

    def calculate(self, recommended_list, actual_list):
        """Return 1 when `actual_list` intersects the first k recommendations, else 0."""
        top_k = np.array(recommended_list)[:self.k]
        found = np.isin(np.array(actual_list), top_k)
        return found.any() * 1  # 0/1 flag, matching the original int result

    def __repr__(self):
        return 'HR@{}'.format(self.k)
class Evaluate:
    """Scores a fitted model against ground truth on a set of metrics.

    Parameters
    ----------
    model : object with .recommend(uid) -> list of item ids.
    test_ids : iterable of trip/user ids to evaluate.
    ground_truth : mapping of id -> true item id(s).
    metrics : list of objects with .calculate(recommended_list, actual_list).
    """

    def __init__(self, model, test_ids, ground_truth, metrics):
        self.model = model
        self.test_ids = test_ids
        self.ground_truth = ground_truth
        self.metrics = metrics
        self.results = {}          # metric -> mean score, filled by evaluate()
        self.recommendations = {}  # id -> recommended list, filled below
        self._calculate_recommendations()

    def _calculate_recommendations(self):
        # Precompute one recommendation list per id so every metric reuses it.
        for test_id in self.test_ids:
            self.recommendations[test_id] = self.model.recommend(test_id)

    def evaluate(self):
        """Average each metric over all test ids; store means in self.results."""
        for metric in self.metrics:
            scores = [
                metric.calculate(
                    recommended_list=self.recommendations[test_id],
                    actual_list=self.ground_truth[test_id],
                )
                for test_id in self.test_ids
            ]
            self.results[metric] = np.mean(scores)
        return self

    def save_results(self, path):
        """Write self.results to `path` as text.

        BUG FIX: previously called ``self.results.write(str(handle))`` —
        a dict has no .write(); the receiver and argument were swapped,
        so saving always raised AttributeError.
        """
        with open(path, 'wt') as handle:
            handle.write(str(self.results))

    def __repr__(self):
        return str(self.results)
# Build the modeling train set: one (trip, city) pair per booking, sorted
# chronologically within each trip, with generic USER_ID/ITEM_ID names.
train = Dataset()
train_info = (
    train.load('./data/bronze/booking/train.parquet.snappy')
    .sort(by=['utrip_id', 'checkin'])
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
train_info  # notebook display (info + head)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1166835 entries, 0 to 1166834 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 USER_ID 1166835 non-null object 1 ITEM_ID 1166835 non-null object dtypes: object(2) memory usage: 17.8+ MB
None ==================================================================================================== USER_ID ITEM_ID 0 1000027_1 8183 1 1000027_1 15626 2 1000027_1 60902 3 1000027_1 30628 4 1000033_1 38677 ====================================================================================================
# Build the test set with the same schema as the train set.
test = Dataset()
test_info = (
    test.load('./data/bronze/booking/test.parquet.snappy')
    .sort(by=['utrip_id', 'checkin'])
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
test_info  # notebook display (info + head)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 378667 entries, 0 to 378666 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 USER_ID 378667 non-null object 1 ITEM_ID 378667 non-null object dtypes: object(2) memory usage: 5.8+ MB
None ==================================================================================================== USER_ID ITEM_ID 0 1000066_2 56430 1 1000066_2 41971 2 1000066_2 5797 3 1000066_2 0 4 1000270_1 50075 ====================================================================================================
# Fit the popularity model on the prepared train set and preview the
# (identical-for-everyone) recommendation for one test trip.
model = Model()
model.fit(train)
model.recommend('1000066_2')
['47499', '23921', '36063', '17013']
# Sanity-check HitRate@4: '4' is inside the top-4 window (hit -> 1),
# '5' is the fifth recommendation and falls outside it (miss -> 0).
hitrate = HitRate(k=4)
hitrate
print(hitrate.calculate(recommended_list=['1','2','3','4','5'], actual_list = ['4']))
print(hitrate.calculate(recommended_list=['1','2','3','4','5'], actual_list = ['5']))
1 0
# Load the ground truth and rebind it as a plain {USER_ID: ITEM_ID} dict
# (keeping the last city per trip) for fast per-trip lookup in Evaluate.
ground_truth = Dataset()
gt_info = (
    ground_truth.load('./data/bronze/booking/ground_truth.parquet.snappy')
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
deduped = ground_truth.data.drop_duplicates(subset='USER_ID', keep='last')
ground_truth = deduped.set_index('USER_ID')['ITEM_ID'].to_dict()
print(type(ground_truth), len(ground_truth.keys()))
<class 'dict'> 70662
# Run the end-to-end evaluation of the popularity model (HR@4).
# `eval` shadowed the Python builtin; renamed to `evaluator`, with the old
# name kept as an alias so any downstream cells still work.
evaluator = Evaluate(model=model,
                     test_ids=test.data.USER_ID.unique(),
                     ground_truth=ground_truth,
                     metrics=[hitrate])
eval = evaluator  # backward-compat alias — avoid shadowing `eval` in new code
evaluator.evaluate()
{HR@4: 0.05271574537941185}