import pandas as pd
import numpy as np
import pickle
import fasttext
from rake_nltk import Rake
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.pipeline import Pipeline
from tqdm import tqdm
tqdm.pandas()
Creating a good recommendation system typically needs a combination of content and user data. Here, instead of using user data, I will use only content data: the recommendation system returns the items that are most similar to the item entered by the user.
The dataset I used for this experiment is the Goodreads' Best Book Dataset, which is available on Kaggle. The dataset contains 54301 rows of data and 12 columns.
# Load the Goodreads dataset from disk and preview its first rows.
DATA_PATH = 'data/book_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()
book_authors | book_desc | book_edition | book_format | book_isbn | book_pages | book_rating | book_rating_count | book_review_count | book_title | genres | image_url | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Suzanne Collins | Winning will make you famous. Losing means cer... | NaN | Hardcover | 9.78044E+12 | 374 pages | 4.33 | 5519135 | 160706 | The Hunger Games | Young Adult|Fiction|Science Fiction|Dystopia|F... | https://images.gr-assets.com/books/1447303603l... |
1 | J.K. Rowling|Mary GrandPré | There is a door at the end of a silent corrido... | US Edition | Paperback | 9.78044E+12 | 870 pages | 4.48 | 2041594 | 33264 | Harry Potter and the Order of the Phoenix | Fantasy|Young Adult|Fiction | https://images.gr-assets.com/books/1255614970l... |
2 | Harper Lee | The unforgettable novel of a childhood in a sl... | 50th Anniversary | Paperback | 9.78006E+12 | 324 pages | 4.27 | 3745197 | 79450 | To Kill a Mockingbird | Classics|Fiction|Historical|Historical Fiction... | https://images.gr-assets.com/books/1361975680l... |
3 | Jane Austen|Anna Quindlen|Mrs. Oliphant|George... | «È cosa ormai risaputa che a uno scapolo in po... | Modern Library Classics, USA / CAN | Paperback | 9.78068E+12 | 279 pages | 4.25 | 2453620 | 54322 | Pride and Prejudice | Classics|Fiction|Romance | https://images.gr-assets.com/books/1320399351l... |
4 | Stephenie Meyer | About three things I was absolutely positive.F... | NaN | Paperback | 9.78032E+12 | 498 pages | 3.58 | 4281268 | 97991 | Twilight | Young Adult|Fantasy|Romance|Paranormal|Vampire... | https://images.gr-assets.com/books/1361039443l... |
# Number of rows (books) in the raw dataset.
df.shape[0]
54301
As I mentioned previously, we need to calculate the similarity between items to provide book recommendations for users. The similarity is calculated simply by looking at how similar the items' keywords are. In this Goodreads recommendation system, the keywords of a book contain its first author, its genres, and the most important key phrases extracted from its description.
Here, in the KeywordsTransformer class, we gather each book's keywords into one column. We also filter the dataset so that it only contains books written in English, detected from each book's title and description using fasttext.
class KeywordsTransformer(BaseEstimator, TransformerMixin):
    """
    Gather the keywords of each book into a single ``keywords`` column.

    The keywords contain the book's first author (spaces removed, so the name
    becomes one token), the top-ranked RAKE phrases of its description, and
    its genres. Rows that are not detected as English are dropped, and so are
    duplicates of the same (first author, title) pair.
    """

    # label fastText's language-identification model emits for English text
    ENGLISH_LABEL = '__label__en'
    # number of top-ranked RAKE phrases taken from each description
    DESCRIPTION_KEYWORDS_COUNT = 2

    def __init__(self):
        print('keywords transformer called...')

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """
        Build the ``lang_status``, ``first_author`` and ``keywords`` columns.

        X is expected to provide the columns ``book_title``, ``book_desc``,
        ``book_authors`` and ``genres``. Returns a filtered copy of X with a
        fresh 0..n-1 index; X itself is not modified.
        """
        X_ = X.copy()

        # getting only English books for the recommendation system
        print('filtering English books only...')
        self.lang_model = fasttext.load_model('lid.176.ftz')
        X_['lang_status'] = X_.progress_apply(self.lang_detect, axis=1)

        # getting the first author of the book
        print('getting first author...')
        X_['first_author'] = X_.apply(self.set_author, axis=1)
        # the author's name is collapsed into one token so that the first and
        # last name are not matched separately
        X_['keywords'] = X_['first_author'].progress_apply(
            lambda author: author.replace(' ', '')
        )

        # getting important keywords in the description
        print('getting description...')
        rake = Rake()
        X_['keywords'] = X_.progress_apply(
            lambda row: self.set_description(rake, row), axis=1
        )

        # getting the genre of the books
        print('getting genre...')
        X_['keywords'] = X_.progress_apply(self.set_genres, axis=1)

        # eliminating the books that are not written in English
        X_ = X_[X_.lang_status == 'en']

        # eliminating duplicate books based on first author + book title
        X_ = X_.drop_duplicates(
            ['first_author', 'book_title'], keep='first'
        ).reset_index(drop=True)
        return X_

    def _is_english(self, text):
        """
        Return True when fastText's top prediction for ``text`` is English.

        Non-string and blank values are never English.
        """
        if not isinstance(text, str):
            return False
        # fastText's predict() rejects text containing newlines, so collapse
        # every run of whitespace (including line breaks) into single spaces
        single_line = ' '.join(text.split())
        if not single_line:
            return False
        labels = self.lang_model.predict(single_line)[0]
        return len(labels) > 0 and labels[0] == self.ENGLISH_LABEL

    def lang_detect(self, x):
        """
        Detect a book's language from its title and description.

        Returns 'en' when the title is English and the description is either
        English or missing (NaN, which pandas stores as a float); returns
        'other' in every other case.
        """
        description = x['book_desc']
        # the missing-description check must come before any prediction:
        # predict() cannot handle NaN, and a book without a description
        # should be judged on its title alone
        description_ok = isinstance(description, float) or self._is_english(description)
        if description_ok and self._is_english(x['book_title']):
            return 'en'
        return 'other'

    def set_author(self, x):
        """
        Return the first of the '|'-separated authors of the book.

        A missing (non-string) author yields an empty string.
        """
        authors = x['book_authors']
        if not isinstance(authors, str):
            return ''
        return authors.split('|')[0]

    def set_description(self, rake, x):
        """
        Append the top-ranked RAKE phrases of the description to the keywords.

        Rows whose description is not a string keep their keywords unchanged.
        """
        description = x['book_desc']
        if not isinstance(description, str):
            return x['keywords']
        rake.extract_keywords_from_text(description)
        phrases = rake.get_ranked_phrases()[:self.DESCRIPTION_KEYWORDS_COUNT]
        return x['keywords'] + ' ' + ' '.join(phrases)

    def set_genres(self, x):
        """
        Append the '|'-separated genres of the book to the keywords.

        Rows whose genres value is not a string keep their keywords unchanged.
        """
        genres = x['genres']
        if not isinstance(genres, str):
            return x['keywords']
        return x['keywords'] + ' ' + ' '.join(genres.split('|'))
After extracting each book's keywords, the next step is to build the recommendation system from the book dataset.
class BookRecommender():
    """
    Content-based book recommender.

    Books are compared through the TF-IDF vectors of their ``keywords`` column
    (built by KeywordsTransformer); the books most similar to a given title
    are reported as recommendations.
    """

    def __init__(self, data):
        # raw book dataset; it is only read, never modified
        self.data = data

    def transform(self):
        """
        Preprocess the raw dataset, vectorise its keywords with TF-IDF and
        compute the pairwise similarity between all books.

        Sets ``self.processed_data`` and ``self.cosine_similarity``.
        """
        self.preprocessed()
        print('preprocess completed. getting tfidf...')
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(self.processed_data['keywords'])
        print('getting cosine similarity...')
        # TfidfVectorizer L2-normalises each row by default, so the plain dot
        # product computed by linear_kernel equals the cosine similarity
        self.cosine_similarity = linear_kernel(tfidf, tfidf)
        print('transformation completed.')

    def preprocessed(self):
        """
        Preprocess the dataset with KeywordsTransformer and store the result
        in ``self.processed_data``.
        """
        kt = KeywordsTransformer()
        kt.fit(self.data)
        self.processed_data = kt.transform(self.data)

    def book_recommender(self, title, count=10):
        """
        Print the books most similar to the book called ``title``.

        title: book title to look up (case-insensitive exact match; the first
            matching row is used).
        count: maximum number of recommendations to print (default 10).

        Returns 'Book is not found!' when no book has that title; otherwise
        the result is printed and None is returned.
        """
        titles = self.processed_data.book_title.str.lower()
        # positional lookup: rows of the similarity matrix are positional
        matches = (titles == title.lower()).to_numpy().nonzero()[0]
        if len(matches) == 0:
            return 'Book is not found!'
        query_pos = int(matches[0])

        scores = self.cosine_similarity[query_pos]
        # Exclude the queried book explicitly instead of assuming it sorts
        # first: a book with identical keywords ties with it and could
        # otherwise take its place as the "title input".
        candidates = [
            (pos, score) for pos, score in enumerate(scores) if pos != query_pos
        ]
        top = sorted(candidates, key=lambda item: item[1], reverse=True)[:max(count, 0)]

        # the printer expects the queried book first, then the recommendations
        ranked = [(query_pos, scores[query_pos])] + top
        books = []
        for pos, score in ranked:
            row = self.processed_data.iloc[pos]
            books.append(((pos, score), {'book_id': pos,
                                         'book_title': row['book_title'],
                                         'book_author': row['first_author'],
                                         'similarity': score}))
        return self.book_recommender_printer(books)

    def book_recommender_printer(self, books):
        """
        Print the recommended book results.

        books: sequence of (key, info) pairs; only ``info`` is read. The first
            pair describes the queried book and the remaining pairs are the
            recommendations, in the order they should be printed.
        """
        queried = books[0][1]
        print('---------title input : ', queried['book_title'], ' by', queried['book_author'], '\n')
        print('---------recommendations: \n')
        for _, info in books[1:]:
            print(info['book_title'], 'by', info['book_author'])
            print('similarity: ', info['similarity'])
            print('\n')
# load the dataset
# Read the raw book catalogue from disk.
df = pd.read_csv('data/book_data.csv')
# Wrap the catalogue in a recommender; nothing is computed yet.
recommender = BookRecommender(data=df)
# Run the preprocessing and build the similarity matrix used for lookups.
recommender.transform()
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
keywords transformer called... filtering English books only...
100%|███████████████████████████████████| 54301/54301 [00:09<00:00, 5520.73it/s]
getting first author...
100%|█████████████████████████████████| 54301/54301 [00:00<00:00, 111068.85it/s]
getting description...
100%|███████████████████████████████████| 54301/54301 [00:32<00:00, 1660.22it/s]
getting genre...
100%|██████████████████████████████████| 54301/54301 [00:00<00:00, 66708.60it/s]
preprocess completed. getting tfidf... getting cosine similarity... transformation completed.
Let's test the recommendation system by entering one of my favorite books: And Then There Were None by Agatha Christie!
# Title lookup is case-insensitive, so the lowercase spelling works.
recommender.book_recommender(title='and then there were none')
---------title input : And Then There Were None by Agatha Christie ---------recommendations: The Murder on the Links by Agatha Christie similarity: 0.44428498622376605 Evil Under the Sun by Agatha Christie similarity: 0.35410649242110337 N or M? by Agatha Christie similarity: 0.33615155083710596 Sleeping Murder by Agatha Christie similarity: 0.33395712265493865 The Mystery of the Blue Train by Agatha Christie similarity: 0.3297086837196586 Witness for the Prosecution and Selected Plays by Agatha Christie similarity: 0.3160301751208531 How Does Your Garden Grow? and Other Stories by Agatha Christie similarity: 0.3062958948772468 Dead Simple by Peter James similarity: 0.3033607052474662 Faithful Place by Tana French similarity: 0.302753760386371 In The Dark by Brian Freeman similarity: 0.30251517321055743
As expected, the book recommendation system generates several books that are highly similar to the entered title, in this case And Then There Were None by Agatha Christie. Since I'm a big fan of hers, I have already read The Murder on the Links and N or M?, and both are pretty solid books, so I was very excited when the system gave me Evil Under the Sun, a title that I haven't read yet. It's already on my to-be-read list and I can't wait to read it!!!
Next, I will enter a book that I'm currently reading: The Secret History by Donna Tartt
# Look up recommendations for the book currently being read.
recommender.book_recommender(title='the secret history')
---------title input : The Secret History by Donna Tartt ---------recommendations: The Pugilist at Rest by Thom Jones similarity: 0.29270328313073773 The Goldfinch by Donna Tartt similarity: 0.2666224989294841 Losing It by Cora Carmack similarity: 0.2430829833151303 The Lady That I Love by Crystal Linn similarity: 0.22847475855102853 The Destiny of Violet & Luke by Jessica Sorensen similarity: 0.22359497790387248 Sweet Girl by Sierra Hill similarity: 0.22280415744966814 A Winter Haunting by Dan Simmons similarity: 0.2220611264340409 The Little Friend by Donna Tartt similarity: 0.2127591170494097 Elite by Rachel Van Dyken similarity: 0.21057750312655546 Dark Matter by Blake Crouch similarity: 0.18968665820726552
I'm embarrassed to say that I've never heard of The Pugilist at Rest by Thom Jones. It's a short story collection, and I rarely read that sort of book, so that's kind of understandable. The Goldfinch by Donna Tartt is the only book that I recognized from this list, and it's already on my to-be-read list. But it's still very exciting to explore some new books and maybe find a new all-time favorite!