Let's try out some of the ideas from last lecture on the MovieLens dataset.
import os
import urllib
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
# Download the data.
def download_data(url='https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1',
                  zip_name='ml-latest-small.zip'):
    """Download a zip archive and extract it into the working directory.

    Args:
        url: Location of the zip archive to fetch.
        zip_name: Local filename the archive is saved under.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is
    # loaded, so import it explicitly here.
    import urllib.request

    urllib.request.urlretrieve(url, zip_name)
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_name) as zfile:
        zfile.extractall()
path = 'ml-latest-small'
# Only fetch the archive when the extracted folder is not already present,
# so re-running this cell does not download the data again.
if not os.path.isdir(path):
    download_data()
# Load the three CSV tables used in the rest of the notebook.
ratings = pd.read_csv(os.path.join(path, 'ratings.csv'))
movies = pd.read_csv(os.path.join(path, 'movies.csv'))
tags = pd.read_csv(os.path.join(path, 'tags.csv'))
ratings.head(3)
userId | movieId | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 31 | 2.5 | 1260759144 |
1 | 1 | 1029 | 3.0 | 1260759179 |
2 | 1 | 1061 | 3.0 | 1260759182 |
# Preview the first three rows of the movies table.
movies.head(n=3)
movieId | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
# Preview the first three rows of the tags table.
tags.head(n=3)
userId | movieId | tag | timestamp | |
---|---|---|---|---|
0 | 15 | 339 | sandra 'boring' bullock | 1138537770 |
1 | 15 | 1955 | dentist | 1193435061 |
2 | 15 | 7478 | Cambodia | 1170560997 |
# Show every rating given by user 1.
ratings.loc[ratings['userId'] == 1]
userId | movieId | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 31 | 2.5 | 1260759144 |
1 | 1 | 1029 | 3.0 | 1260759179 |
2 | 1 | 1061 | 3.0 | 1260759182 |
3 | 1 | 1129 | 2.0 | 1260759185 |
4 | 1 | 1172 | 4.0 | 1260759205 |
5 | 1 | 1263 | 2.0 | 1260759151 |
6 | 1 | 1287 | 2.0 | 1260759187 |
7 | 1 | 1293 | 2.0 | 1260759148 |
8 | 1 | 1339 | 3.5 | 1260759125 |
9 | 1 | 1343 | 2.0 | 1260759131 |
10 | 1 | 1371 | 2.5 | 1260759135 |
11 | 1 | 1405 | 1.0 | 1260759203 |
12 | 1 | 1953 | 4.0 | 1260759191 |
13 | 1 | 2105 | 4.0 | 1260759139 |
14 | 1 | 2150 | 3.0 | 1260759194 |
15 | 1 | 2193 | 2.0 | 1260759198 |
16 | 1 | 2294 | 2.0 | 1260759108 |
17 | 1 | 2455 | 2.5 | 1260759113 |
18 | 1 | 2968 | 1.0 | 1260759200 |
19 | 1 | 3671 | 3.0 | 1260759117 |
Let's use the item-item method to predict user 1's rating for movie 3671 (Blazing Saddles).
# Look up the genres string of the first movies row whose id is 3671.
movies.loc[movies['movieId'] == 3671, 'genres'].iloc[0]
movieId | title | genres | |
---|---|---|---|
2925 | 3671 | Blazing Saddles (1974) | Comedy|Western |
# Gather the distinct user ids in ascending order so the users can be counted.
user_ids = sorted(set(ratings['userId']))
# Peek at the ten smallest ids.
user_ids[:10]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Shift every user id down by one so ids can be used as 0-based positions
# in the per-movie rating vectors built later in the notebook.
ratings['userId'] = ratings['userId'].sub(1)
# Rebuild the sorted list of distinct ids after the shift.
user_ids = sorted(set(ratings['userId']))
# Peek at the ten smallest ids.
user_ids[:10]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Show every rating that any user gave to movie 3671.
ratings.loc[ratings['movieId'] == 3671]
userId | movieId | rating | timestamp | |
---|---|---|---|---|
19 | 0 | 3671 | 3.0 | 1260759117 |
1679 | 14 | 3671 | 2.0 | 1166586157 |
4436 | 22 | 3671 | 3.5 | 1149868554 |
5761 | 29 | 3671 | 4.0 | 960918106 |
8453 | 55 | 3671 | 4.0 | 1467003357 |
10809 | 72 | 3671 | 3.0 | 1255595938 |
11981 | 74 | 3671 | 4.0 | 1165596914 |
12031 | 75 | 3671 | 3.5 | 1194384277 |
12239 | 76 | 3671 | 4.0 | 1163079471 |
13017 | 82 | 3671 | 4.5 | 1156206112 |
16005 | 101 | 3671 | 4.0 | 959975744 |
18453 | 119 | 3671 | 2.5 | 1167422038 |
18840 | 124 | 3671 | 4.5 | 1269735510 |
19659 | 129 | 3671 | 2.0 | 1149644335 |
20255 | 133 | 3671 | 4.5 | 1361244873 |
21162 | 147 | 3671 | 4.0 | 1059604766 |
24820 | 177 | 3671 | 4.0 | 1437427720 |
26709 | 194 | 3671 | 5.0 | 976288624 |
26848 | 195 | 3671 | 5.0 | 959223213 |
28827 | 211 | 3671 | 2.5 | 1218954775 |
31181 | 221 | 3671 | 3.0 | 960920305 |
36982 | 264 | 3671 | 5.0 | 960060002 |
39237 | 284 | 3671 | 3.0 | 965091993 |
39772 | 290 | 3671 | 5.0 | 1111489095 |
40676 | 293 | 3671 | 3.0 | 1062536588 |
43028 | 305 | 3671 | 4.0 | 959197066 |
43260 | 308 | 3671 | 5.0 | 1114567315 |
44986 | 314 | 3671 | 5.0 | 1046663466 |
47765 | 351 | 3671 | 4.0 | 1420521398 |
51493 | 379 | 3671 | 4.0 | 1199154721 |
... | ... | ... | ... | ... |
56105 | 404 | 3671 | 4.0 | 1097698251 |
58240 | 422 | 3671 | 3.5 | 1355514296 |
58423 | 423 | 3671 | 3.5 | 1088826857 |
59126 | 427 | 3671 | 4.5 | 1304130933 |
59401 | 429 | 3671 | 4.5 | 1111489222 |
60953 | 441 | 3671 | 5.0 | 1227968502 |
62245 | 451 | 3671 | 4.0 | 976421517 |
64006 | 459 | 3671 | 4.0 | 1072837173 |
66319 | 467 | 3671 | 2.5 | 1296192490 |
67861 | 471 | 3671 | 5.0 | 958950037 |
72067 | 504 | 3671 | 3.5 | 1340406082 |
72953 | 508 | 3671 | 4.0 | 978938266 |
74831 | 518 | 3671 | 4.5 | 1469927119 |
76376 | 528 | 3671 | 4.0 | 959965342 |
81283 | 552 | 3671 | 3.5 | 1423010113 |
82292 | 560 | 3671 | 4.5 | 1172734423 |
84549 | 563 | 3671 | 5.0 | 974712545 |
85997 | 574 | 3671 | 4.0 | 1012594537 |
87014 | 579 | 3671 | 4.0 | 1155617921 |
88086 | 584 | 3671 | 4.0 | 975363032 |
88440 | 586 | 3671 | 4.0 | 1112034902 |
90220 | 597 | 3671 | 4.0 | 1008571310 |
90860 | 603 | 3671 | 3.5 | 1277532575 |
91255 | 604 | 3671 | 4.0 | 980175465 |
92650 | 614 | 3671 | 3.5 | 1468174876 |
93369 | 620 | 3671 | 5.0 | 1116476740 |
94105 | 623 | 3671 | 5.0 | 1019127174 |
95945 | 633 | 3671 | 3.5 | 1309492285 |
97871 | 653 | 3671 | 4.5 | 1145390260 |
99965 | 670 | 3671 | 3.0 | 1065149267 |
62 rows × 4 columns
# Get the ratings from all users assigned to the target movie.
# Store ratings in a numpy array with dimension equal to number of users;
# a 0 entry means "this user did not rate the movie".
target_movie_id = 3671
target_movie_vector = np.zeros(len(user_ids))
target_rows = ratings[ratings.movieId == target_movie_id]
# Index with the integer userId column directly. Rows produced by iterrows()
# are upcast to float here, and NumPy does not accept float indices.
target_movie_vector[target_rows.userId.values] = target_rows.rating.values
# Remove target user's rating for this movie:
target_movie_vector[0] = 0
target_movie_vector
/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:6: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
array([ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. , 0. , 4. , 3.5, 4. , 0. , 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 2. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 5. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 3. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 3. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 5. 
, 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 3.5, 0. , 0. , 0. , 4.5, 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 4. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 3.5, 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. ])
from scipy.stats import pearsonr
# Correlation between two item vectors (e.g., all ratings given to movie j)
def correlation(v1, v2):
    """Return the Pearson correlation of two rating vectors over co-rated entries.

    A zero entry is treated as "no rating"; only positions where both vectors
    are non-zero take part in the comparison. The co-rated entries of each
    vector are printed before the result is computed.

    Args:
        v1: 1-D NumPy array of ratings (0 = missing).
        v2: 1-D NumPy array of ratings with the same length as ``v1``.

    Returns:
        Pearson's r over the shared positions, or 0 when fewer than two
        positions are shared or when r is undefined (nan).
    """
    both_rated = (v1 != 0) & (v2 != 0)
    shared_v1 = v1[both_rated]
    shared_v2 = v2[both_rated]
    print(shared_v1)
    print(shared_v2)
    if len(shared_v1) < 2:
        return 0
    r = pearsonr(shared_v1, shared_v2)[0]
    # pearsonr yields nan when one side is constant. Treat that as "no
    # evidence of correlation" so callers can sort the results reliably.
    return 0 if np.isnan(r) else r
# Try the function on two small vectors; zeros mark missing ratings.
correlation(
    np.array([0, 4, 0, 5, 0, 5, 2]),
    np.array([4, 3, 0, 4, 0, 5, 1]),
)
[4 5 5 2] [3 4 5 1]
0.96609178307929588
# Same first vector, but with a changed second vector.
correlation(
    np.array([0, 4, 0, 5, 0, 5, 2]),
    np.array([4, 5, 0, 4, 0, 4, 5]),
)
[4 5 5 2] [5 4 4 5]
-0.81649658092772615
# For every other movie that user 1 rated, compute its correlation with the target movie.
target_user_id = 0  # user 1 after the ids were shifted to start at 0
correlations = []  # (correlation, movieId) tuples
rated_movie_ids = ratings.loc[ratings.userId == target_user_id, 'movieId']
for movie_id in rated_movie_ids:  # for each movie this user has rated.
    if movie_id == target_movie_id:  # ignore the movie being predicted
        continue
    # get all user ratings for this title.
    movie_rows = ratings[ratings.movieId == movie_id]
    movie_vector = np.zeros(len(user_ids))
    # Index with the integer userId column directly. Rows produced by
    # iterrows() are upcast to float, and NumPy does not accept float indices.
    movie_vector[movie_rows.userId.values] = movie_rows.rating.values
    corr = correlation(target_movie_vector, movie_vector)
    # Store the id as a plain int so it can be used for lookups as-is.
    correlations.append((corr, int(movie_id)))
# Show the ten most strongly correlated movies, best first.
print(sorted(correlations, reverse=True)[:10])
[(0.63592247307435135, 2294.0), (0.61616638738992613, 1287.0), (0.53082272889989246, 2455.0), (0.39498658934488512, 1343.0), (0.37922646140545141, 1029.0), (0.36838579111178049, 1953.0), (0.33029706730581576, 1405.0), (0.3273268353539886, 31.0), (0.29746710191544001, 2193.0), (0.28743499113013798, 1293.0)]
/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:10: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
# Look up the movies row for id 1287.
movies.loc[movies['movieId'] == 1287]
movieId | title | genres | |
---|---|---|---|
1041 | 1287 | Ben-Hur (1959) | Action|Adventure|Drama |
# Now, take top K movies and do weighted average to compute predicted score.
K = 5
top_movies = sorted(correlations, reverse=True)[:K]
top_movie_ids = [int(movie_id) for _, movie_id in top_movies]
top_movie_corrs = [corr for corr, _ in top_movies]
# get target user's ratings for these movies:
# Filter to the target user once, then build each movie mask on that filtered
# frame. Applying a mask built on the full frame to the filtered frame makes
# pandas reindex the mask and emit a UserWarning.
target_user_ratings = ratings[ratings.userId == 0]
top_ratings = [
    target_user_ratings.loc[target_user_ratings.movieId == tmid, 'rating'].iloc[0]
    for tmid in top_movie_ids
]
top_ratings
/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
[2.0, 2.0, 2.5, 2.0, 3.0]
# weighted average:
# Each of the user's ratings is weighted by that movie's correlation with the
# target movie. Normalize by the total absolute correlation so a negative
# weight cannot cancel the positive ones and shrink the denominator toward
# zero; when all weights are positive this equals the plain sum.
np.dot(np.array(top_ratings), np.array(top_movie_corrs)) / sum(abs(c) for c in top_movie_corrs)
# True rating: 3.0
2.2520948004421606