# MovieLens 1M recommendation notebook (exported to a script; original cell
# boundaries were lost, so bare expressions such as `df.head()` are notebook
# display cells and are no-ops outside IPython, and the `!`/`%` lines are
# IPython magics, not plain Python).
#
# Pipeline: download data -> popularity / mean-rating recommenders with
# hitrate@K and RMSE evaluation -> genre-, occupation-, age-based
# recommenders -> user-user collaborative filtering on a truncated rating
# matrix -> Keras autoencoder that reconstructs (and thus predicts) ratings.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import math
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

# Download and unpack the MovieLens 1M dataset (shell magics).
!wget -q --show-progress http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

# The .dat files are "::"-separated with no header row; engine='python' is
# required because "::" is a multi-character separator.
df_ratings = pd.read_csv("./ml-1m/ratings.dat", sep="::", header=None, engine='python',
                         names=["UserID", "MovieID", "Rating", "Timestamp"])
df_ratings.head()
df_movies = pd.read_csv("./ml-1m/movies.dat", sep="::", header=None, engine='python',
                        names=["MovieID", "Title", "Genres"])
df_movies.head()
df_users = pd.read_csv("./ml-1m/users.dat", sep="::", header=None, engine='python',
                       names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
df_users.head()
df_ratings.shape, df_movies.shape, df_users.shape

# merge: one row per rating, enriched with movie and user attributes.
df_ratings_movies = pd.merge(df_ratings, df_movies, on='MovieID')
df = pd.merge(df_ratings_movies, df_users, on="UserID")
# Keep an untouched copy; later sections restore it with `df = df_raw.copy()`.
df_raw = df.copy()

# drop columns not needed for the popularity recommender.
df = df.drop(['Timestamp','Genres','Gender','Age','Occupation','Zip-code'], axis=1)

# Recommend 5 most seen movies
def recommend_movie(n):
    """Return titles for the n most-rated movies (popularity baseline)."""
    # value_counts() is sorted descending, so [:n] is the top-n MovieIDs.
    movie_rank = df['MovieID'].value_counts()[:n]
    # NOTE(review): movie_rank.index holds MovieID values, but df has a
    # RangeIndex of row numbers, so .loc selects arbitrary rows here, not the
    # ranked movies — looks like a bug; verify against df_movies lookup.
    recommend_movies = df.loc[movie_rank.index]
    recommend = recommend_movies['Title']
    return recommend

recommend_movie(5)

# Split train, test set
x_train, x_test = train_test_split(df, test_size=0.05)

# Recommend n most popular movies on a dataset
def popular_movie(dataset, n):
    """Return rows for the n most-rated movies in `dataset`."""
    movie_rank = dataset['MovieID'].value_counts()[:n]
    # NOTE(review): .iloc is positional, but movie_rank.index contains
    # MovieID values — this picks rows by position equal to the MovieID,
    # which is not the intended selection; confirm.
    popular_movies = dataset.iloc[movie_rank.index]
    return popular_movies

# Calculate hitrate@K: fraction of the full-data top-K that also appears in
# the test-split top-K.
def hitrate(K):
    raw_ranking = popular_movie(df, K)
    pred_ranking = popular_movie(x_test, K)
    # NOTE(review): indexing [True] raises KeyError when there are zero hits.
    return raw_ranking['MovieID'].isin(pred_ranking['MovieID']).value_counts(normalize=True)[True]

hitrate(100)

# Recommend 5 movies with high ratings (mean-rating baseline).
def recommend_movie2(n):
    """Return titles for the n movies with the highest mean rating."""
    # `movie_mean` (defined below, before this is called) is indexed by MovieID.
    movie_sort = movie_mean.sort_values(ascending=False)[:n]
    # NOTE(review): same label/position mismatch as recommend_movie —
    # movie_sort.index are MovieIDs but df's index is row numbers.
    recommend_movies = df.loc[movie_sort.index]
    recommendation = recommend_movies['Title']
    return recommendation

# Mean rating per movie over the whole dataset.
movie_mean = df.groupby(['MovieID'])['Rating'].mean()
recommend_movie2(5)

# Split train, test set
x_train, x_test = train_test_split(df, test_size=0.05)

# Recommend n most popular movies on a dataset (re-defined, same body as above).
def popular_movie(dataset, n):
    movie_rank = dataset['MovieID'].value_counts()[:n]
    # NOTE(review): same positional-vs-label indexing issue as above.
    popular_movies = dataset.iloc[movie_rank.index]
    return popular_movies

# Calculate hitrate@K (re-defined, same body as above).
def hitrate(K):
    raw_ranking = popular_movie(df, K)
    pred_ranking = popular_movie(x_test, K)
    return raw_ranking['MovieID'].isin(pred_ranking['MovieID']).value_counts(normalize=True)[True]

hitrate(100)

# Accuracy calculation
def RMSE(y_true, y_pred):
    """Root-mean-square error between two equal-length sequences."""
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# Per-"user" RMSE of the mean-rating model.
# NOTE(review): df.index is a RangeIndex of row numbers, not UserIDs, so this
# loops over individual rows rather than users — presumably
# df.groupby('UserID') was intended; confirm.
rmse = []
for user in set(df.index):
    y_true = df.loc[user]['Rating']
    y_pred = movie_mean[df.loc[user]['MovieID']]
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)
print(np.mean(rmse))

# Split train, test set — stratified by user so every user appears in train.
x = df.copy()
y = df['UserID']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.05,stratify=y)

# Accuracy calculation (re-defined, identical to above).
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# Calculate RMSE by model: `model` is a callable (user_id, movie_id) -> rating.
def score(model):
    id_pairs = zip(x_test['UserID'], x_test['MovieID'])
    y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    y_true = np.array(x_test['Rating'])
    return RMSE(y_true, y_pred)

# Get full matrix with training df (users x movies; NaN where unrated).
rating_matrix = x_train.pivot(index='UserID', columns='MovieID', values='Rating')

# The default model for calculating forecasts by the overall mean.
def best_seller(user_id, movie_id):
    """Predict a movie's train-set mean rating; 3.0 for unseen movies."""
    # NOTE(review): bare except hides real errors; a KeyError check on
    # train_mean would be more precise.
    try:
        rating = train_mean[movie_id]
    except:
        rating = 3.0
    return rating

# Mean training rating per movie (used by best_seller above).
train_mean = x_train.groupby(['MovieID'])['Rating'].mean()
score(best_seller)

# Restore the full merged frame (with Genres etc.) for the content sections.
df = df_raw.copy()
count = df['UserID'].value_counts()
count_index = count[count > 50]
# NOTE(review): isin(count_index) tests UserID against the *values* of
# count_index (the rating counts), not the qualifying user ids — presumably
# count_index.index was intended; confirm.
data_pre = df[df['UserID'].isin(count_index)]
data_pre.head()
x_train, x_test = train_test_split(data_pre, test_size=0.05)

# Top-n most-rated movies within a single genre.
def genre_pop(dataset, genre, n):
    # One-hot encode the pipe-separated Genres string, then keep rows
    # belonging to `genre`.
    dataset_genres = dataset['Genres'].str.get_dummies("|")
    select_genre = dataset_genres[dataset_genres[genre]==1]
    genre_popmovie = dataset.loc[select_genre.index]
    # reset_index so the positional/row-number lookup below is well-defined.
    genre_popmovie = genre_popmovie.reset_index()
    genre_popmovie_rank = genre_popmovie['MovieID'].value_counts()[:n]
    # NOTE(review): .loc here still indexes by MovieID values against a
    # fresh RangeIndex — same mismatch as earlier popularity helpers.
    recomm_movie = genre_popmovie.loc[genre_popmovie_rank.index]
    return recomm_movie

genre_pop(x_train, 'Comedy', 10)

# Count of ratings per genre for one user, most-watched genre first.
def user_genre(dataset, userid):
    user_data = dataset[dataset['UserID']==userid]
    user_data_genres = user_data['Genres'].str.get_dummies("|")
    user_genre_ranking = user_data_genres.sum().sort_values(ascending=False)
    return user_genre_ranking

user_genre(x_train, 54)

# Recommend n popular movies from the user's single most-watched genre.
def user_genre_recommend(dataset, userid, n):
    genre_pref = user_genre(dataset, userid)
    recomm = genre_pop(dataset, genre_pref.index[0], n)
    return recomm

user_genre_recommend(x_train, 54, 5)

# Re-definition of popular_movie / hitrate1 for the filtered data.
def popular_movie(dataset, n):
    movie_rank = dataset['MovieID'].value_counts()[:n]
    popular_movies = dataset.iloc[movie_rank.index]
    return popular_movies

def hitrate1(K):
    raw_ranking = popular_movie(df, K)
    pred_ranking = popular_movie(x_test, K)
    return raw_ranking['MovieID'].isin(pred_ranking['MovieID']).value_counts(normalize=True)[True]

# NOTE(review): hitrate1 is deterministic, so averaging 100 identical calls
# just recomputes the same number 100 times.
s = 0
for i in range (100):
    s += hitrate1(100)
s /= 100
s

# (re-defined, identical to above)
def user_genre_recommend(dataset, userid, n):
    genre_pref = user_genre(dataset, userid)
    recomm = genre_pop(dataset, genre_pref.index[0], n)
    return recomm

# Hitrate of the genre recommender: agreement between recommendations built
# from the full filtered data vs. from the training split, for a random user.
def hitrate2(K):
    user = x_train.sample(n=1)['UserID'].values[0]
    raw_recomm = user_genre_recommend(data_pre, user, K)
    pred_recomm = user_genre_recommend(x_train, user, K)
    return raw_recomm['MovieID'].isin(pred_recomm['MovieID']).value_counts(normalize=True)[True]

# Average hitrate2 over 100 successful samples.
# NOTE(review): the bare except swallows all failures, and `s += _s` runs
# even on a failed iteration — re-adding the previous sample (or raising
# NameError if the very first call fails); the accumulation probably belongs
# inside the try block.
s = 0
count = 0
while count!=100:
    try:
        _s = hitrate2(100)
        count+=1
    except:
        pass
    if count%10==0:
        print(count)
    s += _s
s /= 100
s

# Demographic recommenders: restart from the raw merge, keep demographics.
df = df_raw.copy()
# drop columns
df = df.drop(['Timestamp','Gender','Zip-code'], axis=1)
# leave only data with a rating of 3 or higher
data_pre = df[df['Rating'] > 2]
x_train, x_test = train_test_split(data_pre, test_size=0.05)

# Recommend n popular movies by occupation
def occu_pop(dataset, occu, n):
    data_occu = dataset[dataset['Occupation'] == occu]
    data_occu = data_occu.reset_index()
    occu_pop_rank = data_occu['MovieID'].value_counts()[:n]
    # NOTE(review): same MovieID-as-row-label lookup as the other *_pop helpers.
    recommend_movies = data_occu.loc[occu_pop_rank.index]
    return recommend_movies

# Recommend n movies depending on user's occupation
def user_occu_recommend(dataset, userid, n):
    # Occupation is constant per user, so the first matching row suffices.
    user_occu = dataset[dataset['UserID'] == userid]['Occupation'].values[0]
    recomm = occu_pop(dataset, user_occu, n)
    return recomm

user_occu_recommend(x_train, 46, 5)

# Recommend n popular movies by age group
def age_pop(dataset, age, n):
    data_age = dataset[dataset['Age'] == age]
    data_age = data_age.reset_index()
    age_pop_rank = data_age['MovieID'].value_counts()[:n]
    recommend_movies = data_age.loc[age_pop_rank.index]
    return recommend_movies

# Recommend n movies based on user's age
def user_age_recommend(dataset, userid, n):
    user_age = dataset[dataset['UserID'] == userid]['Age'].values[0]
    recomm = age_pop(dataset, user_age, n)
    return recomm

user_age_recommend(x_train, 46, 5)

# Re-defined evaluation helpers: these now return the value_counts Series
# (indexed by MovieID) instead of DataFrame rows, avoiding the earlier
# label/position confusion.
def popular_movie(dataset, n):
    data_pop = dataset.copy()
    movie_rank = data_pop['MovieID'].value_counts()[:n]
    return movie_rank

def hitrate1(K):
    raw_ranking = popular_movie(data_pre, K)
    pred_ranking = popular_movie(x_test, K)
    # Compare the top-K MovieID sets directly via the Series indexes.
    return pd.DataFrame(raw_ranking.index.isin(pred_ranking.index)).value_counts(normalize=True)[True]

# NOTE(review): hitrate1 is deterministic per split; looping len(x_test)
# times only repeats the same computation.
s = 0
for i in tqdm(range(len(x_test.index))):
    s += hitrate1(100)
s /= len(x_test.index)
s

# Series-returning variant of occu_pop for evaluation.
def occu_pop(dataset, occu, n):
    data_occu = dataset[dataset['Occupation'] == occu]
    occu_pop_rank = data_occu['MovieID'].value_counts()[:n]
    return occu_pop_rank

def user_occu_recommend(dataset, userid, n):
    user_occu = dataset[dataset['UserID'] == userid]['Occupation'].values[0]
    recomm = occu_pop(dataset, user_occu, n)
    return recomm

# Hitrate of the occupation recommender for one user: full data vs. test split.
def hitrate2(user, K):
    raw_recomm = user_occu_recommend(data_pre, user, K)
    pred_recomm = user_occu_recommend(x_test, user, K)
    return pd.DataFrame(raw_recomm.index.isin(pred_recomm.index)).value_counts(normalize=True)[True]

s = 0
for i in tqdm(x_test['UserID'].index):
    s += hitrate2(x_test['UserID'][i], 100)
s /= len(x_test.index)
s

# Series-returning variant of age_pop for evaluation.
def age_pop(dataset, age, n):
    data_age = dataset[dataset['Age'] == age]
    data_age = data_age.reset_index()
    age_pop_rank = data_age['MovieID'].value_counts()[:n]
    return age_pop_rank

def user_age_recommend(dataset, userid, n):
    user_age = dataset[dataset['UserID'] == userid]['Age'].values[0]
    recomm = age_pop(dataset, user_age, n)
    return recomm

# Hitrate of the age recommender for one user: full data vs. train split.
def hitrate3(user, K):
    raw_recomm = user_age_recommend(data_pre, user, K)
    pred_recomm = user_age_recommend(x_train, user, K)
    return pd.DataFrame(raw_recomm.index.isin(pred_recomm.index)).value_counts(normalize=True)[True]

s = 0
for i in tqdm(x_test['UserID'].index):
    s += hitrate3(x_test['UserID'][i], 100)
s /= len(x_test.index)
s

# ---- User-user collaborative filtering on an explicit rating matrix ----
df = df_ratings.copy()
df.columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
# NOTE(review): positional `axis` argument was deprecated and removed in
# pandas 2.0 — needs axis=1 (or columns='unix_timestamp') on modern pandas.
df = df.drop('unix_timestamp', 1)

# Dense user x movie frame, NaN where unrated; ids are 1-based.
input_df = pd.DataFrame(index=range(1,max(df['user_id'])+1), columns=range(1,max(df['movie_id'])+1))
# NOTE(review): chained assignment (df[col][row] = v) is slow here and is an
# error under pandas copy-on-write; .at/.loc would be the safe spelling.
for index,row in df.iterrows():
    input_df[row['movie_id']][row['user_id']]=row['rating']
print(input_df.shape)
input_df

# Keep only the first 64 movies, then drop users who rated none of them.
input_df = input_df.truncate(after=64, axis=1)
input_df = input_df.dropna(axis=0, how='all')
# Per-user mean rating over the kept movies (ignores NaN).
mean_col = input_df.mean(axis=1)
input_df.shape

def user_similarity(a,b):
    """Pearson correlation between users a and b over co-rated movies.

    Returns NaN if either user was dropped from input_df, 0 when a user has
    zero variance on the co-rated set.
    """
    if (not a in input_df.index or not b in input_df.index):
        return np.nan
    cov = 0.0
    var_a = 0.0
    var_b = 0.0
    for column in input_df:
        avg_rating_a = mean_col[a]
        avg_rating_b = mean_col[b]
        j_rating_a = input_df[column][a]
        j_rating_b = input_df[column][b]
        # Only movies rated by both users contribute.
        if (not np.isnan(j_rating_a) and not np.isnan(j_rating_b)):
            cov = cov + (j_rating_a - avg_rating_a) * (j_rating_b - avg_rating_b)
            var_a = var_a + (j_rating_a - avg_rating_a) * (j_rating_a - avg_rating_a)
            var_b = var_b + (j_rating_b - avg_rating_b) * (j_rating_b - avg_rating_b)
    if (var_a == 0 or var_b == 0):
        return 0
    return (cov/(math.sqrt(var_a*var_b)))

# Symmetric user-similarity matrix; indices are 0-based (user_id - 1).
# NOTE(review): O(n^2) pure-Python loop over ~6040 users — very slow; a
# vectorized corrcoef over the matrix would be much faster.
sim = np.zeros(shape=(max(df['user_id']), max(df['user_id'])))
num_of_users = max(df['user_id'])
it = 0  # NOTE(review): unused.
for i in tqdm(range(num_of_users)):
    for j in range(i+1):
        sim[i][j] = user_similarity(i+1, j+1)
        sim[j][i] = sim[i][j]
sim

def round_off_rating(val):
    """Round to the nearest 0.5 star and clamp to [1, 5]."""
    new_val = int(val)
    frac = val - int(val)
    if (frac >= 0.75):
        new_val = new_val + 1
    elif (frac >= 0.25):
        new_val = new_val + 0.5
    return max(min(new_val, 5.0), 1)

def predict_column_rating(column_no):
    """Fill the NaNs of one movie column in-place with similarity-weighted
    mean-offset predictions from users who rated that movie."""
    # Users who actually rated this movie (the "neighbors").
    temp = input_df[input_df[column_no].notnull()][column_no]
    # NOTE(review): Series.iteritems() was removed in pandas 2.0; .items()
    # is the modern equivalent.
    for index, null_rating in input_df[column_no].iteritems():
        num_sum = 0
        den_sum = 0
        if (np.isnan(null_rating)):
            # index/i are 1-based user ids; sim is 0-based, hence the -1.
            for i,rating in temp.iteritems():
                num_sum = num_sum + sim[index-1][i-1] * (rating - mean_col[i])
                den_sum = den_sum + sim[index-1][i-1]
            # Fall back to the user's own mean when similarities cancel out.
            if (den_sum == 0):
                input_df[column_no][index] = round_off_rating(mean_col[index])
            else:
                input_df[column_no][index] = round_off_rating(mean_col[index] + num_sum/den_sum)

# Impute every movie column; after this input_df has no NaNs.
for column_no in input_df:
    predict_column_rating(column_no)
input_df

# ---- Autoencoder on the (imputed) 64-movie rating matrix ----
encoding_dim1 = 16
encoding_dim2 = 5
# 64 -> 16 -> 5 bottleneck -> 16 -> 64; sigmoid output matches the /5-scaled
# ratings in [0, 1].
input_rating = Input(shape=(64,))
encoded = Dense(16, activation='relu')(input_rating) # 64->16
encoded = Dense(5, activation='relu')(encoded) # 16->05
decoded = Dense(16, activation='relu')(encoded) # 05->16
decoded = Dense(64, activation='sigmoid')(decoded) # 16->64
autoencoder = Model(input_rating, decoded)
# Standalone encoder/decoder models that share the autoencoder's layers
# (weights stay tied, so they reflect training of `autoencoder`).
encoder1 = Model(input_rating, autoencoder.layers[1](input_rating))
input_encoding = Input(shape=(encoding_dim1,))
encoder2 = Model(input_encoding, autoencoder.layers[2](input_encoding))
encoded_input1 = Input(shape=(encoding_dim2,))
encoded_input2 = Input(shape=(encoding_dim1,))
decoder_layer1 = autoencoder.layers[-2]
decoder_layer2 = autoencoder.layers[-1]
decoder1 = Model(encoded_input1, decoder_layer1(encoded_input1))
decoder2 = Model(encoded_input2, decoder_layer2(encoded_input2))
autoencoder.compile(optimizer='adam', loss='mse')

# Scale ratings from [1, 5] into [0, 1] for the sigmoid output.
input_df = input_df/5
input_df
# 80/20 train/test split by user rows; fixed seed for reproducibility.
x_train = input_df.sample(frac=0.8, random_state=200).astype(float)
x_test = input_df.drop(x_train.index).astype(float)
x_train.shape

#collapse-hide
# Self-supervised: the autoencoder reconstructs its own input.
autoencoder.fit(x_train, x_train,
                epochs=100,
                batch_size=100,
                shuffle=True,
                validation_data=(x_test,x_test))

# Push every user through encoder -> decoder to get reconstructed ratings.
encoded_output1 = encoder1.predict(input_df.astype(float))
encoded_output2 = encoder2.predict(encoded_output1)
decoded_output1 = decoder1.predict(encoded_output2)
decoded_output2 = decoder2.predict(decoded_output1)
encoded_output2
decoded_output2

# Rescale back to [1, 5] stars and snap to half-star increments.
ans = decoded_output2 * 5
for (x,y), value in np.ndenumerate(ans):
    ans[x][y] = round_off_rating(ans[x][y])
ans
ans_df = pd.DataFrame(ans)
df = input_df.copy()
df = df * 5
ans_df

# Compare one user's original vs. predicted ratings across the 64 movies.
# NOTE(review): df keeps the original 1-based user-id index while ans_df is
# 0-based positional, so iloc[6] and iloc[3] are likely different users —
# confirm the intended alignment.
plt.figure(figsize=(10,10))
plt.subplot(211)
line1, = plt.plot(range(1,65), df.iloc[6], 'b')
line2, = plt.plot(range(1,65), ans_df.iloc[3], 'k')
plt.ylabel('ratings')
plt.xlabel('movie ids')
plt.legend([line1, line2], ['Initial Filtered', 'Predicted AE'])

# Compare one movie's original vs. predicted ratings across all users.
# NOTE(review): 4811 is hard-coded to the number of users remaining after
# dropna; this breaks if the filtering above changes.
plt.figure(figsize=(50,50))
plt.subplot(212)
line1, = plt.plot(range(1,4812), df[1].tolist(), 'bo')
line2, = plt.plot(range(1,4812), ans_df[0].tolist(), 'ko')
plt.ylabel('ratings')
plt.xlabel('movie ids')
plt.legend([line1, line2], ['Initial Filtered', 'Predicted AE'])

# Environment stamp (IPython magics).
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p tensorflow