"""Low-rank matrix completion of a user/movie rating table.

Pipeline: load ratings, build sparse rating/indicator matrices, keep only
sufficiently rated movies and sufficiently active users, hold out a random
subset of the known ratings for validation, fit a trace-regularized
semidefinite program on the remaining entries, and evaluate the prediction
on the held-out entries.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as ssp

import cvxpy as cp

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
df = pd.read_csv('http://jonghank.github.io/ee786/files/ratings.csv')
df.head()
df.tail()

md = pd.read_csv('http://jonghank.github.io/ee786/files/movies.csv')
md.head()
md.tail()

userId = df['userId'].values
movieId = df['movieId'].values
rating = df['rating'].values

movieIdList = md['movieId'].values
titleList = md['title'].values

# ---------------------------------------------------------------------------
# Sparse rating table R and indicator table B (1 where a rating exists).
# The raw ids are used directly as row/column indices.
# ---------------------------------------------------------------------------
R = ssp.csr_matrix((rating, (userId, movieId)))
B = ssp.csr_matrix((np.ones_like(rating), (userId, movieId)), dtype='int')

userIdList = np.arange(B.shape[0], dtype='int')
titleIdList = np.arange(B.shape[1], dtype='int')

m, n = B.shape

plt.figure(figsize=(10, 10))
plt.imshow(R[:, :1000].toarray(), cmap='gray', vmin=0, vmax=5)
plt.xlabel('Movie id')
plt.ylabel('User id')
plt.title(f'Rating table: {m} users, {n} movies')
plt.show()

# ---------------------------------------------------------------------------
# Keep movies with more than minRatesPerMovie ratings, then users with more
# than minRatesPerPerson ratings among the remaining movies.
# ---------------------------------------------------------------------------
minRatesPerPerson, minRatesPerMovie = 40, 60

# Sparse .sum() returns a 2-D matrix; flatten it into a 1-D boolean mask.
selectedMovies = np.asarray(B.sum(axis=0) > minRatesPerMovie).flatten()
R = R[:, selectedMovies]
B = B[:, selectedMovies]
titleIdList = titleIdList[selectedMovies]

selectedUsers = np.asarray(B.sum(axis=1) > minRatesPerPerson).flatten()
R = R[selectedUsers, :]
B = B[selectedUsers, :]
userIdList = userIdList[selectedUsers]

selectedUserNumbers = userIdList
# One pandas Series of matching title(s) per selected movie id.
selectedMovieTitles = [md['title'][md['movieId'] == i] for i in titleIdList]

m, n = R.shape

plt.figure(figsize=(10, 10))
plt.imshow(R.toarray(), cmap='gray', vmin=0, vmax=5)
plt.xlabel('Movie id')
plt.ylabel('User id')
plt.title(f'Rating table: {m} users, {n} movies')
plt.show()

# ---------------------------------------------------------------------------
# Train / validation split: every known rating is held out independently
# with probability 0.1 (uniform draw > 0.9).
# ---------------------------------------------------------------------------
B = B.toarray()
R = R.toarray()

B_true = np.copy(B)
R_true = np.copy(R)

# Vectorized replacement for the per-entry double loop.
heldOut = (B > 0) & (np.random.rand(m, n) > 0.9)
B[heldOut] = 0
R[heldOut] = 0

plt.figure(figsize=(20, 10))
plt.subplot(121)
plt.imshow(R, cmap='gray', vmin=0, vmax=5)
plt.xlabel('Movie id')
plt.ylabel('User id')
plt.title(f'Rating table (training set): {m} users, {n} movies')
plt.subplot(122)
# R_true - R is non-zero only on the held-out entries.
plt.imshow(R_true - R, cmap='gray', vmin=0, vmax=5)
plt.xlabel('Movie id')
plt.ylabel('User id')
plt.title(f'Rating table (validation set): {m} users, {n} movies')
plt.show()

# ---------------------------------------------------------------------------
# Matrix completion as a semidefinite program:
#   minimize  tr(X) + tr(Y) + sum of squared errors on the training entries
#   s.t.      [[X, P], [P^T, Y]] is positive semidefinite
# The trace term is the SDP form of a nuclear-norm penalty on P, which
# favors low-rank predictions.
# ---------------------------------------------------------------------------
P = cp.Variable((m, n))
X = cp.Variable((m, m), symmetric=True)
Y = cp.Variable((n, n), symmetric=True)

obj = cp.trace(X) + cp.trace(Y) + 1.0*cp.sum_squares(P[B > 0] - R[B > 0])
obj = cp.Minimize(obj)
constr = [cp.bmat([[X, P], [P.T, Y]]) >> 0]

prob = cp.Problem(obj, constr)
prob.solve(verbose=True)

# Fail with a clear message instead of a TypeError on `2*P.value` below.
if P.value is None:
    raise RuntimeError(
        f'Solver did not return a solution (status: {prob.status})'
    )

# ---------------------------------------------------------------------------
# Snap the prediction onto the half-step grid 0, 0.5, ..., 5.
# ---------------------------------------------------------------------------
P_opt = np.round(np.clip(2*P.value, 0, 10))/2

plt.figure(figsize=(20, 10))
plt.subplot(121)
plt.imshow(P_opt, cmap='gray', vmin=0, vmax=5)
plt.xlabel('Movie id')
plt.ylabel('User id')
plt.title('Preference prediction')
plt.subplot(122)
plt.imshow(R, cmap='gray', vmin=0, vmax=5)
plt.xlabel('Movie id')
plt.ylabel('User id')
plt.title('Rating table')
plt.show()

# ---------------------------------------------------------------------------
# Singular values of the solution, on log and linear axes.
# ---------------------------------------------------------------------------
UP, SP, VP = np.linalg.svd(P.value)
UX, SX, VX = np.linalg.svd(X.value)
UY, SY, VY = np.linalg.svd(Y.value)

plt.figure(figsize=(10, 10))
plt.subplot(211)
plt.semilogy(SP, label=r'$\sigma(P)$')
plt.semilogy(SX, label=r'$\sigma(X)$')
plt.semilogy(SY, label=r'$\sigma(Y)$')
# Title fixed: this panel is the semilogy (log-scale) plot.
plt.title('Singular values (log scale)')
plt.grid()
plt.legend()
plt.subplot(212)
plt.plot(SP, label=r'$\sigma(P)$')
plt.plot(SX, label=r'$\sigma(X)$')
plt.plot(SY, label=r'$\sigma(Y)$')
# Title fixed: this panel is the linear-axis plot.
plt.title('Singular values (linear scale)')
plt.grid()
plt.legend()
plt.ylim(0, 100)
plt.show()

# ---------------------------------------------------------------------------
# Per-user comparison for 10 randomly drawn users (repeats possible).
# ---------------------------------------------------------------------------
users = np.random.randint(m, size=(10,))
movies = np.arange(n)

for u in users:
    idx_trn = B[u, :] > 0
    # Held-out entries: rated in the full table but removed from training.
    idx_val = np.bitwise_and(B_true[u, :] > 0, B[u, :] == 0)
    plt.figure(figsize=(10, 4))
    plt.plot(P_opt[u, :], 'o:', label='Preference prediction')
    plt.plot(movies[idx_trn], R[u, idx_trn], 'kd', markersize=7,
             label='Training set')
    plt.plot(movies[idx_val], R_true[u, idx_val], 'r*', markersize=12,
             label='Validation set')
    plt.ylabel(f'user #{u}')
    plt.ylim(0, 5.5)
    plt.grid()
    plt.legend()
plt.show()

# ---------------------------------------------------------------------------
# Validation error on the held-out entries (un-rounded prediction).
# ---------------------------------------------------------------------------
validation_set = np.where(B_true - B > 0)
error_validation = P.value[validation_set] - R_true[validation_set]
# Root-mean-square error. np.std would subtract the mean error and hide
# any systematic bias in the predictions.
RMSE_validation = np.sqrt(np.mean(np.square(error_validation)))

print(f'Validation RMSE: {RMSE_validation}')

plt.figure(figsize=(10, 6))
plt.hist(error_validation, bins=np.arange(-4, 4.5, 0.5))
plt.grid()
plt.xlabel('Prediction error')
plt.ylabel('Frequency')
plt.title('Prediction accuracy')
plt.show()