In [1]:

import random
import numpy as np

my_seed = 1337
random.seed(my_seed)
np.random.seed(my_seed)

In [2]:

import pandas as pd
import numpy as np
from typing import *
from IPython.display import display, HTML, Markdown

import warnings
warnings.filterwarnings('ignore')


def display_best_and_worse_recommendations(recommendations: pd.DataFrame):
    recommendations.sort_values('Estimated Prediction', ascending=False, inplace=True)

    top_recommendations = recommendations.iloc[:10]
    top_recommendations.columns = ['Prediction (sorted by best)', 'Movie Title']

    worse_recommendations = recommendations.iloc[-10:]
    worse_recommendations.columns = ['Prediction (sorted by worse)', 'Movie Title']

    display(HTML("<h1>Recommendations your user will love</h1>"))
    display(top_recommendations)

    display(HTML("<h1>Recommendations your user will hate</h1>"))
    display(worse_recommendations)
    

def load_movies_dataset() -> pd.DataFrame:
    movie_data_columns = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'url',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
    'War', 'Western'
    ]

    movie_data = pd.read_csv(
        'datasets/ml-100k/u.item', 
        sep = '|', 
        encoding = "ISO-8859-1", 
        header = None, 
        names = movie_data_columns,
        index_col = 'movie_id'
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings() -> pd.DataFrame:
    ratings_data = pd.read_csv(
        'datasets/ml-100k/u.data',
        sep = '\t',
        encoding = "ISO-8859-1",
        header = None,
        names=['user_id', 'movie_id', 'rating', 'timestamp']
    )
    return ratings_data[['user_id', 'movie_id', 'rating']]

def load_movielens() -> pd.DataFrame:
    ratings_data = load_ratings()
    movies_data = load_movies_dataset()
    ratings_data['user_id'] = ratings_data['user_id'].map(lambda k: f"User {k}")
    
    ratings_and_movies = ratings_data \
        .set_index('movie_id') \
        .join(movies_data['title']) \
        .reset_index()
    
    ratings_and_movies['movie_title'] = ratings_and_movies['title']
    return ratings_and_movies[['user_id', 'movie_title', 'rating']].sample(frac=1)
    
    

Table of contents¶

1) Training a SVD model¶

Downloading and exploring the MovieLens dataset
Training a SVD using Surprise in 4 simple steps

2) Generating recommendations¶

Recommendations via Matrix Reconstruction: Using the predict() API inside of Surprise
Recommendations via Product based CF: Finding similarity between vectors

Downloading and exploring the MovieLens dataset¶

Open Source dataset
20 million ratings
27,000 movies
138,000 users

In [3]:

movielens_df: pd.DataFrame = load_movielens()
movielens_df.head(5)

Out[3]:

	user_id	movie_title	rating
36649	User 742	Jerry Maguire (1996)	4
2478	User 908	Usual Suspects, The (1995)	3
82838	User 758	Real Genius (1985)	4
69729	User 393	Things to Do in Denver when You're Dead (1995)	3
36560	User 66	Jerry Maguire (1996)	4

In [4]:

# Remove movies with few ratings
movie_ratings = movielens_df.groupby('movie_title').size()
valid_movies = movie_ratings[movie_ratings > 50]
movie_ratings = movielens_df.set_index('movie_title', drop=False).join(valid_movies.to_frame(), how='inner').reset_index(drop=True)

del movie_ratings[0]

movie_ratings = movie_ratings.sample(frac=1)
movie_ratings.head(5)

movielens_df = movie_ratings

Training a SVD using Surprise in 4 simple steps¶

In [5]:

from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

# Step 1: create a Reader.
# A reader tells our SVD what the lower and upper bound of our ratings is.
# MovieLens ratings are from 1 to 5
reader = Reader(rating_scale=(1, 5))

In [6]:

# Step 2: create a new Dataset instance with a DataFrame and the reader
# The DataFrame needs to have 3 columns in this specific order: [user_id, product_id, rating]
data = Dataset.load_from_df(movielens_df, reader)

In [7]:

# Step 3: keep 25% of your trainset for testing
trainset, testset = train_test_split(data, test_size=.25)

In [8]:

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movielens_df, reader)
trainset, testset = train_test_split(data, test_size=.01)

In [9]:

# Step 4: train a new SVD with 100 latent features (number was chosen arbitrarily)
model = SVD(n_factors=100)
model.fit(trainset)

Out[9]:

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1130ead68>

In [10]:

# Normalization
pd.DataFrame(model.qi).iloc[0].pow(2).sum()
model.qi /= np.linalg.norm(model.qi, ord=2, axis=1).reshape(-1, 1)
pd.DataFrame(model.qi).iloc[0].pow(2).sum()

Out[10]:

0.9999999999999999

Inspecting our Product Matrix¶

Surprise SVD stores the product matrix under the model.qi attribute.

In [11]:

model.qi.shape

Out[11]:

(596, 100)

The matrix has n_factors columns (we chose 100). Every row represents a movie

Mapping every vector back to it's movie¶

Every row is mapped to a movie. How do we map every movie to it's vector?

In [12]:

def display(df: pd.DataFrame):
    item_to_row_idx_df = pd.DataFrame(
        list(item_to_row_idx.items()),
        columns=['Movie name', 'model.qi row idx'],
    ).set_index('Movie name')
    return item_to_row_idx_df.head(5)

In [13]:

item_to_row_idx: Dict[Any, int] = model.trainset._raw2inner_id_items

# `display()` is a utility function to make `item_to_row_idx` more readable
display(item_to_row_idx)

Out[13]:

	model.qi row idx
Movie name
Lion King, The (1994)	0
African Queen, The (1951)	1
Day the Earth Stood Still, The (1951)	2
Fried Green Tomatoes (1991)	3
Blues Brothers, The (1980)	4

Identifying Toy Story¶

In [14]:

toy_story_row_idx : int = item_to_row_idx['Toy Story (1995)']

In [15]:

model.qi[toy_story_row_idx]

Out[15]:

array([-0.00889267, -0.03901101, -0.19582206, -0.06800691,  0.11612643,
       -0.0133471 , -0.0067134 ,  0.00288335,  0.18905863, -0.01727417,
       -0.05463992,  0.03962723, -0.01882104,  0.01020398, -0.02117866,
        0.16177179, -0.04796802,  0.01428753,  0.13078113, -0.02725028,
        0.12102731,  0.07361403, -0.03889315,  0.21971317,  0.10844565,
       -0.02779188, -0.06676929,  0.06646453, -0.00768229, -0.14992161,
       -0.07929755,  0.00377584, -0.18182449, -0.07932236,  0.0837675 ,
       -0.08436358,  0.10939826, -0.21550487, -0.00997129, -0.14068558,
       -0.07365779, -0.06704182,  0.01132891,  0.10421864,  0.11748961,
        0.07426254,  0.09342114,  0.01356848, -0.0250024 ,  0.12239668,
       -0.20936433, -0.22866096, -0.04916814,  0.0842263 , -0.1353041 ,
       -0.03717908, -0.17404182,  0.02941116,  0.04993152,  0.06490656,
       -0.05549422, -0.10358558,  0.00789368,  0.09439441, -0.07726498,
       -0.08448086,  0.08246883,  0.17941641,  0.01990596, -0.02759331,
        0.06862457, -0.12098117, -0.03077882,  0.08178186,  0.10700504,
       -0.01529634, -0.00385706,  0.04940254,  0.28700017, -0.0197356 ,
        0.02827431,  0.13303162, -0.05905182, -0.0673481 ,  0.0471547 ,
       -0.01943226,  0.09228729,  0.12408544,  0.07230831,  0.09700075,
       -0.14674701,  0.03890628,  0.00311309, -0.02259477,  0.00057669,
       -0.01448026, -0.00467238, -0.20787822, -0.19006575,  0.05603329])

In [16]:

print(f"Every product has {model.qi[toy_story_row_idx].shape[0]} features")

Every product has 100 features

Recommendations via Product based CF: Finding similarity between vectors¶

2 products are "similar" when the cosine distance is close to 0

In [17]:

from scipy.spatial.distance import cosine


def get_vector_by_movie_title(movie_title: str, trained_model: SVD) -> np.array:
    """Returns the latent features of a movie in the form of a numpy array"""
    movie_row_idx = trained_model.trainset._raw2inner_id_items[movie_title]
    return trained_model.qi[movie_row_idx]


def cosine_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Returns a float indicating the similarity between two vectors"""
    return cosine(vector_a, vector_b)

In [18]:

# Fetch the vectors of "Toy Story" and "Wizard of Oz"
toy_story_vec = get_vector_by_movie_title('Toy Story (1995)', model)
wizard_of_oz_vec = get_vector_by_movie_title('Wizard of Oz, The (1939)', model)

# Calculate the distance between the vectors. The smaller the number,
# the more similar the two movies are
similarity_score = cosine_distance(toy_story_vec, wizard_of_oz_vec)
similarity_score

Out[18]:

0.9461284008856982

Recommendations via Matrix Reconstruction¶

Use cases:¶

Predict a score between any combination of user and a product

Recommendations via Matrix Reconstruction: Using the predict() API inside of Surprise¶

Computes the rating prediction for given user and movie with model.predict(). Pick a random user and movie, and calculate the score between them

In [19]:

# Refresher: ratings data-frame.
movielens_df.head(2)

Out[19]:

	user_id	movie_title	rating
49469	User 437	Monty Python and the Holy Grail (1974)	3
12181	User 85	Butch Cassidy and the Sundance Kid (1969)	4

In [20]:

a_user = "User 196"
a_product = "Toy Story (1995)"
model.predict(a_user, a_product)

Out[20]:

Prediction(uid='User 196', iid='Toy Story (1995)', r_ui=None, est=4.103242838730761, details={'was_impossible': False})

Recommendations via Item Similarity¶

Use cases¶

Clustering: find clusters of items that are similar (ex. Amazon)
Item-based collaborative filtering

Recommendations via Item Similarity: Finding similarity between vectors¶

2 products are "similar" when the cosine distance is close to 0

In [21]:

from scipy.spatial.distance import cosine as cosine_distance

In [22]:

# Fetch indices for Toy Story and Wizard of Oz
starwars_idx = model.trainset._raw2inner_id_items['Star Wars (1977)']
roj_idx = model.trainset._raw2inner_id_items['Return of the Jedi (1983)']
aladdin_idx = model.trainset._raw2inner_id_items['Aladdin (1992)']

# Get vectors for both movies
starwars_vector = model.qi[starwars_idx]
return_of_jedi_vector = model.qi[roj_idx]
aladdin_vector = model.qi[aladdin_idx]

In [23]:

# Distance between Starwars and Return of the Jedi
cosine_distance(starwars_vector, return_of_jedi_vector)

Out[23]:

0.29566718216988797

In [24]:

# Distance between Starwars and Aladdin
cosine_distance(starwars_vector, aladdin_vector)

Out[24]:

0.8587662155892206

In [25]:

def display(similarity_table):
    similarity_table = pd.DataFrame(
        similarity_table,
        columns=['vector cosine distance', 'movie title']
    ).sort_values('vector cosine distance', ascending=True)
    return similarity_table.iloc[:4]

Finding similar movies by ranking¶

In [26]:

def get_top_similarities(movie_title: str, model: SVD) -> pd.DataFrame:
    """Returns the top 5 most similar movies to a specified movie"""
    ...

In [27]:

def get_top_similarities(movie_title: str, model: SVD) -> pd.DataFrame:
    """Returns the top 5 most similar movies to a specified movie
    
    This function iterates over every possible movie in MovieLens and calculates
    distance between `movie_title` vector and that movie's vector.
    """
    
    # Get the first movie vector
    movie_vector: np.array = get_vector_by_movie_title(movie_title, model)
    similarity_table = []
    
    # Iterate over every possible movie and calculate similarity
    for other_movie_title in model.trainset._raw2inner_id_items.keys():
        other_movie_vector = get_vector_by_movie_title(other_movie_title, model)
        
        # Get the second movie vector, and calculate distance
        similarity_score = cosine_distance(other_movie_vector, movie_vector)
        similarity_table.append((similarity_score, other_movie_title))
    
    # sort movies by ascending similarity
    return display(sorted(similarity_table))

In [28]:

get_top_similarities('Star Wars (1977)', model)

Out[28]:

	vector cosine distance	movie title
0	0.000000	Star Wars (1977)
1	0.262668	Empire Strikes Back, The (1980)
2	0.295667	Return of the Jedi (1983)
3	0.435423	Raiders of the Lost Ark (1981)

In [29]:

get_top_similarities('Pulp Fiction (1994)', model)

Out[29]:

	vector cosine distance	movie title
0	0.000000	Pulp Fiction (1994)
1	0.514664	Ed Wood (1994)
2	0.658022	Trainspotting (1996)
3	0.659555	From Dusk Till Dawn (1996)

In conclusion¶

SVD is a really powerful technique for providing recommendations
Latent features can be used in many different ways
Once the latent features are generated, collaborative filtering becomes entirely platform agnostic. The vectors are very portable
Surprise has a really low barrier of entry.

A special thanks to¶

David O'Steen
Upom Malik
Danni Hu

Slides: https://github.com/PirosB3/PyConUS2018