CS579: Lecture 19¶

Recommendation Systems

Dr. Aron Culotta
Illinois Institute of Technology

Recommendation Systems, continued.¶

Let's try out some of the ideas from last lecture on the MovieLens dataset.

In [11]:

import matplotlib.pyplot as plt
import os
import pandas as pd
import urllib
import zipfile
%matplotlib inline

In [87]:

# Download the data.
def download_data():
    """ Download and unzip data.
    DONE ALREADY.
    """
    url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    zfile = zipfile.ZipFile('ml-latest-small.zip')
    zfile.extractall()
    zfile.close()
    
download_data()
path = 'ml-latest-small'
ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')
movies = pd.read_csv(path + os.path.sep + 'movies.csv')
tags = pd.read_csv(path + os.path.sep + 'tags.csv')

In [88]:

ratings.head(3)

Out[88]:

	userId	movieId	rating	timestamp
0	1	31	2.5	1260759144
1	1	1029	3.0	1260759179
2	1	1061	3.0	1260759182

In [90]:

movies.head(3)

Out[90]:

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance

In [91]:

tags.head(3)

Out[91]:

	userId	movieId	tag	timestamp
0	15	339	sandra 'boring' bullock	1138537770
1	15	1955	dentist	1193435061
2	15	7478	Cambodia	1170560997

In [100]:

ratings[ratings.userId==1]

Out[100]:

	userId	movieId	rating	timestamp
0	1	31	2.5	1260759144
1	1	1029	3.0	1260759179
2	1	1061	3.0	1260759182
3	1	1129	2.0	1260759185
4	1	1172	4.0	1260759205
5	1	1263	2.0	1260759151
6	1	1287	2.0	1260759187
7	1	1293	2.0	1260759148
8	1	1339	3.5	1260759125
9	1	1343	2.0	1260759131
10	1	1371	2.5	1260759135
11	1	1405	1.0	1260759203
12	1	1953	4.0	1260759191
13	1	2105	4.0	1260759139
14	1	2150	3.0	1260759194
15	1	2193	2.0	1260759198
16	1	2294	2.0	1260759108
17	1	2455	2.5	1260759113
18	1	2968	1.0	1260759200
19	1	3671	3.0	1260759117

Let's use the item-item method to predict user 1's rating for movie 3671

In [101]:

movies[movies.movieId==3671].iloc[0]['genres']

Out[101]:

	movieId	title	genres
2925	3671	Blazing Saddles (1974)	Comedy\|Western

In [105]:

# how many users are there?
user_ids = sorted(set(ratings.userId))
#len(user_ids)
user_ids[:10]

Out[105]:

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [106]:

# make user ids start at 0.
ratings['userId'] = ratings['userId'] - 1

In [107]:

# how many users are there?
user_ids = sorted(set(ratings.userId))
#len(user_ids)
user_ids[:10]

Out[107]:

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [111]:

# What are the ratings for 3671?
ratings[ratings.movieId==3671]

Out[111]:

	userId	movieId	rating	timestamp
19	0	3671	3.0	1260759117
1679	14	3671	2.0	1166586157
4436	22	3671	3.5	1149868554
5761	29	3671	4.0	960918106
8453	55	3671	4.0	1467003357
10809	72	3671	3.0	1255595938
11981	74	3671	4.0	1165596914
12031	75	3671	3.5	1194384277
12239	76	3671	4.0	1163079471
13017	82	3671	4.5	1156206112
16005	101	3671	4.0	959975744
18453	119	3671	2.5	1167422038
18840	124	3671	4.5	1269735510
19659	129	3671	2.0	1149644335
20255	133	3671	4.5	1361244873
21162	147	3671	4.0	1059604766
24820	177	3671	4.0	1437427720
26709	194	3671	5.0	976288624
26848	195	3671	5.0	959223213
28827	211	3671	2.5	1218954775
31181	221	3671	3.0	960920305
36982	264	3671	5.0	960060002
39237	284	3671	3.0	965091993
39772	290	3671	5.0	1111489095
40676	293	3671	3.0	1062536588
43028	305	3671	4.0	959197066
43260	308	3671	5.0	1114567315
44986	314	3671	5.0	1046663466
47765	351	3671	4.0	1420521398
51493	379	3671	4.0	1199154721
...	...	...	...	...
56105	404	3671	4.0	1097698251
58240	422	3671	3.5	1355514296
58423	423	3671	3.5	1088826857
59126	427	3671	4.5	1304130933
59401	429	3671	4.5	1111489222
60953	441	3671	5.0	1227968502
62245	451	3671	4.0	976421517
64006	459	3671	4.0	1072837173
66319	467	3671	2.5	1296192490
67861	471	3671	5.0	958950037
72067	504	3671	3.5	1340406082
72953	508	3671	4.0	978938266
74831	518	3671	4.5	1469927119
76376	528	3671	4.0	959965342
81283	552	3671	3.5	1423010113
82292	560	3671	4.5	1172734423
84549	563	3671	5.0	974712545
85997	574	3671	4.0	1012594537
87014	579	3671	4.0	1155617921
88086	584	3671	4.0	975363032
88440	586	3671	4.0	1112034902
90220	597	3671	4.0	1008571310
90860	603	3671	3.5	1277532575
91255	604	3671	4.0	980175465
92650	614	3671	3.5	1468174876
93369	620	3671	5.0	1116476740
94105	623	3671	5.0	1019127174
95945	633	3671	3.5	1309492285
97871	653	3671	4.5	1145390260
99965	670	3671	3.0	1065149267

62 rows × 4 columns

In [112]:

# Get the ratings from all users assigned to movie 3671
# Store ratings in a numpy array with dimension equal to number of users.
target_movie_id = 3671
target_movie_vector = np.zeros(len(user_ids))
for index, row in ratings[ratings.movieId==3671].iterrows():
    target_movie_vector[row.userId] = row.rating
# Remove target user's rating for this movie:
target_movie_vector[0] = 0
target_movie_vector

/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:6: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

Out[112]:

array([ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  2. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        3.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3. ,  0. ,  4. ,  3.5,  4. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  2.5,  0. ,
        0. ,  0. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  2. ,  0. ,  0. ,
        0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  5. ,  5. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  2.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  3. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        5. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  5. ,  0. ,  0. ,  3. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,
        5. ,  0. ,  0. ,  0. ,  0. ,  0. ,  5. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  4. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  3. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  5. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  3.5,  3.5,  0. ,  0. ,  0. ,  4.5,  0. ,
        4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  5. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  2.5,  0. ,  0. ,  0. ,  5. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3.5,  0. ,
        0. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  3.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  4.5,
        0. ,  0. ,  5. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  4. ,  0. ,  0. ,  0. ,
        0. ,  4. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  4. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3.5,  4. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3.5,  0. ,
        0. ,  0. ,  0. ,  0. ,  5. ,  0. ,  0. ,  5. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3.5,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  3. ])

In [118]:

from scipy.stats import pearsonr
# Correlation between two item vectors (e.g., all ratings given to movie j)
def correlation(v1, v2):
    indices = [i for i in range(len(v1)) if v1[i] != 0 and v2[i] != 0]
    print(v1[indices])
    print(v2[indices])
    if len(indices) < 2:
        return 0
    else:
        return pearsonr(v1[indices], v2[indices])[0]
        
        
correlation(np.array([0,4,0,5,0,5,2]),
            np.array([4,3,0,4,0,5,1]))

[4 5 5 2]
[3 4 5 1]

Out[118]:

0.96609178307929588

In [119]:

correlation(np.array([0,4,0,5,0,5,2]),
            np.array([4,5,0,4,0,4,5]))  # change second vector 

[4 5 5 2]
[5 4 4 5]

Out[119]:

-0.81649658092772615

In [114]:

# For every other movie that user 1 rated, compute its correlation with movie 3671
correlations = []  # (correlation, movieId) tuples
for index, row in ratings[ratings.userId==0].iterrows():  # for each movie this user has rated.
    if row.movieId != 3671:  # ignore Blazing Saddles
        movie = movies[movies.movieId==row.movieId].iloc[0]  # iloc: to get index of 
        # print(movie['title'])
        movie_vector = np.zeros(len(user_ids))
        # get all user ratings for this title.
        for j, row2 in ratings[ratings.movieId==row.movieId].iterrows():
            movie_vector[row2.userId] = row2.rating
        corr = correlation(target_movie_vector, movie_vector)
        correlations.append((corr, row.movieId))
        
print(sorted(correlations)[::-1][:10])    

[(0.63592247307435135, 2294.0), (0.61616638738992613, 1287.0), (0.53082272889989246, 2455.0), (0.39498658934488512, 1343.0), (0.37922646140545141, 1029.0), (0.36838579111178049, 1953.0), (0.33029706730581576, 1405.0), (0.3273268353539886, 31.0), (0.29746710191544001, 2193.0), (0.28743499113013798, 1293.0)]

/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:10: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [116]:

movies[movies.movieId==1287]

Out[116]:

	movieId	title	genres
1041	1287	Ben-Hur (1959)	Action\|Adventure\|Drama

In [120]:

# Now, take top K movies and do weighted average to compute predicted score.
K = 5
top_movies = sorted(correlations)[::-1][:K]
top_movie_ids = [int(x[1]) for x in top_movies]
top_movie_corrs = [x[0] for x in top_movies]
# get target user's ratings for these movies:
top_ratings = [ratings[ratings.userId==0][ratings.movieId == tmid]['rating'].iloc[0] for tmid in top_movie_ids]
top_ratings

/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.

Out[120]:

[2.0, 2.0, 2.5, 2.0, 3.0]

In [121]:

# weighted average:
np.dot(np.array(top_ratings), np.array(top_movie_corrs)) / sum(top_movie_corrs)

# True rating: 3.0

Out[121]:

2.2520948004421606