- See here for more information.
- Author: Lilian Besson.
- License: MIT License.
We have a few CSV files; let's start by reading them.
from tqdm import tqdm
import numpy as np
import pandas as pd
!ls -larth *.csv
-rw-r--r-- 1 lilian lilian 350K juin 27 15:10 titles.csv
-rw-r--r-- 1 lilian lilian 3,2M juin 27 15:25 watched.csv
-rw-r--r-- 1 lilian lilian 1010K juin 27 15:34 test.csv
-rw-r--r-- 1 lilian lilian 124K juin 28 17:55 train.csv
-rw-r--r-- 1 lilian lilian 2,4M juin 28 17:57 submission.csv
!cp -vf submission.csv submission.csv.old
'submission.csv' -> 'submission.csv.old'
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
titles = pd.read_csv("titles.csv")
watched = pd.read_csv("watched.csv")
np.unique(titles.category)
array(['album', 'anime', 'manga'], dtype=object)
Just to check that they have been read correctly:
train[:5]
len(train)
min(train['user_id']), max(train['user_id'])
min(train['work_id']), max(train['work_id'])
| | user_id | work_id | rating |
---|---|---|---|
0 | 50 | 4041 | 0 |
1 | 508 | 1713 | 0 |
2 | 1780 | 7053 | 1 |
3 | 658 | 8853 | 0 |
4 | 1003 | 9401 | 0 |
11112
(1, 1982)
(2, 9884)
test[:5]
len(test)
min(test['user_id']), max(test['user_id'])
min(test['work_id']), max(test['work_id'])
| | user_id | work_id |
---|---|---|
0 | 486 | 1086 |
1 | 1509 | 3296 |
2 | 617 | 1086 |
3 | 270 | 9648 |
4 | 459 | 3647 |
100015
(0, 1982)
(2, 9884)
watched[:5]
len(watched)
min(watched['user_id']), max(watched['user_id'])
min(watched['work_id']), max(watched['work_id'])
| | user_id | work_id | rating |
---|---|---|---|
0 | 717 | 8025 | dislike |
1 | 1106 | 1027 | neutral |
2 | 1970 | 3949 | neutral |
3 | 1685 | 9815 | like |
4 | 1703 | 3482 | like |
198970
(0, 1982)
(0, 9896)
A first naive solution: for each work, predict the mean rating of the users who saw it, using data from the train data.

submission = test.copy()
total_average_rating = train.rating.mean()
submission[:5]
len(submission)
| | user_id | work_id |
---|---|---|
0 | 486 | 1086 |
1 | 1509 | 3296 |
2 | 617 | 1086 |
3 | 270 | 9648 |
4 | 459 | 3647 |
100015
works_id = np.unique(np.append(test.work_id.unique(), train.work_id.unique()))
mean_ratings = pd.DataFrame(data={'mean_rating': 0}, index=works_id)
mean_ratings[:5]
len(mean_ratings)
| | mean_rating |
---|---|
2 | 0 |
3 | 0 |
4 | 0 |
5 | 0 |
9 | 0 |
2706
computed_means = pd.DataFrame(data={'mean_rating': train.groupby('work_id')['rating'].mean()}, index=works_id)
computed_means[:5]
len(computed_means)
| | mean_rating |
---|---|
2 | 0.230769 |
3 | NaN |
4 | 0.500000 |
5 | 0.333333 |
9 | 0.200000 |
2706
mean_ratings.update(computed_means)
mean_ratings[:10]
len(mean_ratings)
| | mean_rating |
---|---|
2 | 0.230769 |
3 | 0.000000 |
4 | 0.500000 |
5 | 0.333333 |
9 | 0.200000 |
22 | 0.000000 |
23 | 0.000000 |
24 | 0.000000 |
27 | 0.333333 |
28 | 0.333333 |
2706
submission = submission.join(mean_ratings, on='work_id')
submission.rename(columns={'mean_rating': 'prob_willsee'}, inplace=True)
# in case some mean is NaN (no ratings at all), fall back to the global average
submission.fillna(value=total_average_rating, inplace=True)
submission[:10]
| | user_id | work_id | prob_willsee |
---|---|---|---|
0 | 486 | 1086 | 0.440000 |
1 | 1509 | 3296 | 0.000000 |
2 | 617 | 1086 | 0.440000 |
3 | 270 | 9648 | 0.529412 |
4 | 459 | 3647 | 0.333333 |
5 | 41 | 3562 | 0.724138 |
6 | 1780 | 9156 | 0.333333 |
7 | 284 | 5502 | 0.500000 |
8 | 1521 | 7250 | 0.500000 |
9 | 130 | 8209 | 0.500000 |
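For reference, these join-and-fill steps could be condensed with Series.map (a minimal sketch; note it is not exactly equivalent: works never rated in the train data would get the global mean here, whereas the update-based version above leaves them at 0):

# Sketch: per-work mean rating, with the global train mean as fallback
per_work_mean = train.groupby('work_id')['rating'].mean()
submission['prob_willsee'] = submission['work_id'].map(per_work_mean).fillna(total_average_rating)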
Let's save it to submission_naive1.csv:
submission.to_csv("submission_naive1.csv", index=False)
!ls -larth submission_naive1.csv
-rw-rw-r-- 1 lilian lilian 2,0M sept. 24 14:40 submission_naive1.csv
watched.csv

The bonus data set watched.csv can give a lot of information: there are about 200,000 entries in it, and only 100,000 in test.csv.
len(test), len(watched)
(100015, 198970)
ratings = np.unique(watched.rating).tolist()
ratings
['dislike', 'like', 'love', 'neutral']
watched[:5]
| | user_id | work_id | rating |
---|---|---|---|
0 | 717 | 8025 | dislike |
1 | 1106 | 1027 | neutral |
2 | 1970 | 3949 | neutral |
3 | 1685 | 9815 | like |
4 | 1703 | 3482 | like |
By using the train pairs (user, work) that are also in watched, we can learn to map the string ratings, i.e., 'dislike', 'neutral', 'like', 'love', to a probability of having seen the movie.
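Concretely, the goal is something like this minimal sketch (it assumes the rating column of watched has been renamed to strrating, as done just below, and that common pairs exist, which we check next):

# Sketch of the goal: estimate P(seen | strrating) from the (user, work)
# pairs present in both tables.
pairs = train.merge(watched, on=['user_id', 'work_id'])
pairs.groupby('strrating').rating.mean()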
watched.rename(columns={'rating': 'strrating'}, inplace=True)
watched[:5]
| | user_id | work_id | strrating |
---|---|---|---|
0 | 717 | 8025 | dislike |
1 | 1106 | 1027 | neutral |
2 | 1970 | 3949 | neutral |
3 | 1685 | 9815 | like |
4 | 1703 | 3482 | like |
train[:5]
| | user_id | work_id | rating |
---|---|---|---|
0 | 50 | 4041 | 0 |
1 | 508 | 1713 | 0 |
2 | 1780 | 7053 | 1 |
3 | 658 | 8853 | 0 |
4 | 1003 | 9401 | 0 |
Are there pairs (user, work) for which both train data and watched data are available (i.e., both see/notsee and liked/disliked)?
train.merge(watched, on=['user_id', 'work_id'])
| user_id | work_id | rating | strrating |
---|---|---|---|
0 rows × 4 columns
And what about the test data?
test.merge(watched, on=['user_id', 'work_id'])
| user_id | work_id | strrating |
---|---|---|
0 rows × 3 columns
test.merge(watched, on=['work_id'])
| | user_id_x | work_id | user_id_y | strrating |
---|---|---|---|---|
0 | 486 | 1086 | 1120 | neutral |
1 | 486 | 1086 | 1934 | dislike |
2 | 486 | 1086 | 684 | neutral |
3 | 486 | 1086 | 45 | neutral |
4 | 486 | 1086 | 1245 | neutral |
5 | 486 | 1086 | 1705 | dislike |
6 | 486 | 1086 | 1671 | like |
7 | 486 | 1086 | 1082 | neutral |
8 | 486 | 1086 | 1672 | dislike |
9 | 486 | 1086 | 1606 | dislike |
10 | 486 | 1086 | 1392 | dislike |
11 | 486 | 1086 | 1463 | dislike |
12 | 486 | 1086 | 1668 | neutral |
13 | 486 | 1086 | 657 | neutral |
14 | 486 | 1086 | 466 | dislike |
15 | 486 | 1086 | 114 | neutral |
16 | 486 | 1086 | 1795 | like |
17 | 486 | 1086 | 843 | neutral |
18 | 486 | 1086 | 910 | dislike |
19 | 486 | 1086 | 831 | neutral |
20 | 486 | 1086 | 1416 | dislike |
21 | 486 | 1086 | 356 | neutral |
22 | 486 | 1086 | 1125 | dislike |
23 | 486 | 1086 | 1587 | dislike |
24 | 486 | 1086 | 1962 | like |
25 | 486 | 1086 | 208 | dislike |
26 | 486 | 1086 | 231 | neutral |
27 | 486 | 1086 | 610 | dislike |
28 | 486 | 1086 | 1333 | like |
29 | 486 | 1086 | 181 | neutral |
... | ... | ... | ... | ... |
17983485 | 1238 | 4595 | 1635 | like |
17983486 | 1238 | 4595 | 1268 | like |
17983487 | 1238 | 4595 | 1559 | like |
17983488 | 1238 | 4595 | 561 | dislike |
17983489 | 1238 | 4595 | 872 | neutral |
17983490 | 1238 | 4595 | 1802 | neutral |
17983491 | 1238 | 4595 | 297 | like |
17983492 | 1238 | 4595 | 274 | like |
17983493 | 1238 | 4595 | 968 | neutral |
17983494 | 1238 | 4595 | 962 | dislike |
17983495 | 425 | 4595 | 1635 | like |
17983496 | 425 | 4595 | 1268 | like |
17983497 | 425 | 4595 | 1559 | like |
17983498 | 425 | 4595 | 561 | dislike |
17983499 | 425 | 4595 | 872 | neutral |
17983500 | 425 | 4595 | 1802 | neutral |
17983501 | 425 | 4595 | 297 | like |
17983502 | 425 | 4595 | 274 | like |
17983503 | 425 | 4595 | 968 | neutral |
17983504 | 425 | 4595 | 962 | dislike |
17983505 | 802 | 4595 | 1635 | like |
17983506 | 802 | 4595 | 1268 | like |
17983507 | 802 | 4595 | 1559 | like |
17983508 | 802 | 4595 | 561 | dislike |
17983509 | 802 | 4595 | 872 | neutral |
17983510 | 802 | 4595 | 1802 | neutral |
17983511 | 802 | 4595 | 297 | like |
17983512 | 802 | 4595 | 274 | like |
17983513 | 802 | 4595 | 968 | neutral |
17983514 | 802 | 4595 | 962 | dislike |
17983515 rows × 4 columns
No! So we can forget about the user_id, and we will learn how to map liked/disliked to see/notsee for each movie.
all_train = watched.merge(train, on='work_id')
all_train[:5]
| | user_id_x | work_id | strrating | user_id_y | rating |
---|---|---|---|---|---|
0 | 717 | 8025 | dislike | 863 | 0 |
1 | 717 | 8025 | dislike | 329 | 0 |
2 | 717 | 8025 | dislike | 1046 | 0 |
3 | 717 | 8025 | dislike | 794 | 0 |
4 | 717 | 8025 | dislike | 820 | 1 |
del all_train['user_id_x']
del all_train['user_id_y']
We can delete the user_id columns.
all_train[:5]
| | work_id | strrating | rating |
---|---|---|---|
0 | 8025 | dislike | 0 |
1 | 8025 | dislike | 0 |
2 | 8025 | dislike | 0 |
3 | 8025 | dislike | 0 |
4 | 8025 | dislike | 1 |
We can first get the average rating of each work:
all_train.groupby('work_id').rating.mean()[:10]
work_id
2     0.230769
4     0.500000
5     0.333333
9     0.200000
23    0.000000
24    0.000000
27    0.333333
28    0.333333
33    0.250000
48    0.400000
Name: rating, dtype: float64
The all_train table now contains, for each work, many (strrating, rating) pairs. These could be combined into one concise mapping from strrating to a probability, for instance of this form:
mapping_strrating_probwillsee = {
'dislike': 0,
'neutral': 0.50,
'like': 0.75,
'love': 1,
}
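Such a hand-chosen mapping could then be applied to the watched data, for instance (a sketch; prob_willsee is a hypothetical column name):

# Sketch: map each textual rating to an assumed probability of having seen the work
# (the numbers above are hand-chosen, not learned)
watched['prob_willsee'] = watched['strrating'].map(mapping_strrating_probwillsee)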
Manually, for instance, for one movie:
all_train[(all_train.work_id == 8025) & (all_train.strrating == 'dislike')]
| | work_id | strrating | rating |
---|---|---|---|
0 | 8025 | dislike | 0 |
1 | 8025 | dislike | 0 |
2 | 8025 | dislike | 0 |
3 | 8025 | dislike | 0 |
4 | 8025 | dislike | 1 |
5 | 8025 | dislike | 0 |
6 | 8025 | dislike | 0 |
7 | 8025 | dislike | 1 |
8 | 8025 | dislike | 0 |
9 | 8025 | dislike | 0 |
10 | 8025 | dislike | 0 |
11 | 8025 | dislike | 0 |
12 | 8025 | dislike | 0 |
13 | 8025 | dislike | 1 |
14 | 8025 | dislike | 0 |
15 | 8025 | dislike | 1 |
16 | 8025 | dislike | 0 |
17 | 8025 | dislike | 1 |
18 | 8025 | dislike | 1 |
19 | 8025 | dislike | 0 |
20 | 8025 | dislike | 0 |
21 | 8025 | dislike | 0 |
22 | 8025 | dislike | 0 |
23 | 8025 | dislike | 1 |
24 | 8025 | dislike | 0 |
25 | 8025 | dislike | 0 |
26 | 8025 | dislike | 1 |
27 | 8025 | dislike | 0 |
28 | 8025 | dislike | 0 |
29 | 8025 | dislike | 0 |
... | ... | ... | ... |
17754 | 8025 | dislike | 0 |
17755 | 8025 | dislike | 0 |
17756 | 8025 | dislike | 0 |
17757 | 8025 | dislike | 0 |
17758 | 8025 | dislike | 0 |
17759 | 8025 | dislike | 1 |
17760 | 8025 | dislike | 0 |
17761 | 8025 | dislike | 1 |
17762 | 8025 | dislike | 0 |
17763 | 8025 | dislike | 1 |
17764 | 8025 | dislike | 1 |
17784 | 8025 | dislike | 0 |
17785 | 8025 | dislike | 0 |
17786 | 8025 | dislike | 0 |
17787 | 8025 | dislike | 0 |
17788 | 8025 | dislike | 1 |
17789 | 8025 | dislike | 0 |
17790 | 8025 | dislike | 0 |
17791 | 8025 | dislike | 1 |
17792 | 8025 | dislike | 0 |
17793 | 8025 | dislike | 0 |
17794 | 8025 | dislike | 0 |
17795 | 8025 | dislike | 0 |
17796 | 8025 | dislike | 0 |
17797 | 8025 | dislike | 1 |
17798 | 8025 | dislike | 0 |
17799 | 8025 | dislike | 1 |
17800 | 8025 | dislike | 0 |
17801 | 8025 | dislike | 1 |
17802 | 8025 | dislike | 1 |
4294 rows × 3 columns
all_train[all_train.work_id == 8025].rating.mean()
0.31578947368421051
len(all_train[(all_train.work_id == 8025) & (all_train.strrating == 'dislike')].rating)
all_train[(all_train.work_id == 8025) & (all_train.strrating == 'dislike')].rating.mean()
4294
0.31578947368421051
len(all_train[(all_train.work_id == 8025) & (all_train.strrating == 'neutral')].rating)
all_train[(all_train.work_id == 8025) & (all_train.strrating == 'neutral')].rating.mean()
4598
0.31578947368421051
len(all_train[(all_train.work_id == 8025) & (all_train.strrating == 'like')].rating)
all_train[(all_train.work_id == 8025) & (all_train.strrating == 'like')].rating.mean()
8151
0.31578947368421051
len(all_train[(all_train.work_id == 8025) & (all_train.strrating == 'love')].rating)
all_train[(all_train.work_id == 8025) & (all_train.strrating == 'love')].rating.mean()
817
0.31578947368421051
That's weird!
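One way to double-check these suspiciously identical means is a single groupby (a minimal sketch):

# Cross-check: mean rating per strrating for this work, in one shot
all_train[all_train.work_id == 8025].groupby('strrating').rating.mean()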
titles.csv

I don't think I want to use the titles, but clustering the works by categories could help, maybe.
categories = np.unique(titles.category).tolist()
categories
['album', 'anime', 'manga']
for cat in categories:
print("There is {:>5} work(s) in category '{}'.".format(sum(titles.category == cat), cat))
There is  2808 work(s) in category 'manga'.
There is     1 work(s) in category 'album'.
There is  7088 work(s) in category 'anime'.
One category contains only a single work, so let's merge it with 'anime':
categories = {
'anime': 0,
'album': 0,
'manga': 1,
}
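This encoding could then be attached to the works, for instance (a sketch; catcode is a hypothetical column name):

# Sketch: encode each work's category as 0 (anime/album) or 1 (manga)
titles['catcode'] = titles['category'].map(categories)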
TODO!