#!/usr/bin/env python
# coding: utf-8

# # GOODREADS CONTENT-BASED BOOK RECOMMENDATION SYSTEM

# In[1]:

import pandas as pd
import numpy as np
import pickle
import fasttext
from rake_nltk import Rake
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.pipeline import Pipeline
from tqdm import tqdm

tqdm.pandas()

# ## GOAL
#
# Creating a good recommendation system typically needs a combination of content
# and user data. Here, instead of employing user data, I will use only content
# data to create a recommendation system by generating items with high
# similarity to the item entered by the user.
#
# ## DATASET
#
# The dataset used for this experiment is the
# [Goodreads' Best Book Dataset](https://www.kaggle.com/datasets/meetnaren/goodreads-best-books)
# that is available on Kaggle. The dataset contains 54301 rows of data and 12 columns.

# ## Preprocessing
# As mentioned previously, we need to calculate the similarity between items to
# provide book recommendations. The similarity is calculated simply by looking
# at **how similar the items' keywords are**. The keywords contain:
# - the book's first author
# - important keywords from the description
# - the genres of the book
#
# The KeywordsTransformer class gathers each book's keywords into one column.
# It also filters the dataset so it only contains books written in English,
# based on title and description, using fasttext.


class KeywordsTransformer(BaseEstimator, TransformerMixin):
    """Gather each book's keywords into a single ``keywords`` column.

    The keywords combine the book's first author, important phrases
    extracted from the description (via RAKE), and the book's genres.
    Books not detected as English (fasttext language identification on
    title and description) and duplicate (author, title) pairs are dropped.
    """

    # How many top-ranked RAKE phrases to keep from each description.
    DESCRIPTION_KEYWORDS_COUNT = 2

    def __init__(self):
        print('keywords transformer called...')

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``keywords`` built and non-English /
        duplicate books removed.

        Expects the Goodreads columns ``book_desc``, ``book_title``,
        ``book_authors`` and ``genres``.
        """
        X_ = X.copy()

        # Keep only English books for the recommendation system.
        print('filtering English books only...')
        self.lang_model = fasttext.load_model('lid.176.ftz')
        X_['lang_status'] = X_.progress_apply(self.lang_detect, axis=1)

        # The first author seeds the keywords string; internal whitespace is
        # removed so the full name counts as a single token for TF-IDF.
        print('getting first author...')
        X_['first_author'] = X_.apply(self.set_author, axis=1)
        X_['keywords'] = X_.progress_apply(
            lambda x: ''.join(x['first_author'].split(' ')), axis=1)

        # Append important phrases extracted from the description.
        print('getting description...')
        rake = Rake()
        X_['keywords'] = X_.progress_apply(
            lambda x: self.set_description(rake, x), axis=1)

        # Append the book's genres.
        print('getting genre...')
        X_['keywords'] = X_.progress_apply(self.set_genres, axis=1)

        # Eliminate the books that are not written in English.
        X_ = X_[X_.lang_status == 'en']

        # Eliminate duplicate books based on (first author, book title).
        X_ = (X_.drop_duplicates(['first_author', 'book_title'], keep='first')
                .reset_index(drop=True))
        return X_

    def lang_detect(self, x):
        """Return ``'en'`` when the book looks English, else ``'other'``.

        The title must be labelled English; the description must be
        labelled English too, unless it is missing (NaN), in which case
        only the title decides.  (The original code evaluated
        ``predict(book_desc)`` before its ``isinstance`` float check, so a
        NaN description always raised and the book was rejected — the
        missing-description allowance was dead code.  Fixed by testing for
        a string first.)
        """
        try:
            title_en = self._is_english(x['book_title'])
            desc = x['book_desc']
            desc_en = (not isinstance(desc, str)) or self._is_english(desc)
            return 'en' if (title_en and desc_en) else 'other'
        except Exception:
            # Any prediction failure is conservatively treated as non-English.
            return 'other'

    def _is_english(self, text):
        """True when fasttext labels *text* as English.

        Newlines are replaced because fasttext's ``predict`` rejects
        multi-line input.  Note: ``label == '__label__en'`` — the original
        ``label in ('__label__en')`` was an accidental substring test
        (the parentheses do not create a tuple).
        """
        label = self.lang_model.predict(text.replace('\n', ' '))[0][0]
        return label == '__label__en'

    def set_author(self, x):
        """Return the first author from the ``'|'``-separated authors field."""
        return x['book_authors'].split('|')[0]

    def set_description(self, rake, x):
        """Append the top RAKE phrases of the description to the keywords.

        Rows with a missing (non-string) description are left unchanged.
        """
        if isinstance(x['book_desc'], str):
            rake.extract_keywords_from_text(x['book_desc'])
            key = rake.get_ranked_phrases()[:self.DESCRIPTION_KEYWORDS_COUNT]
            return x['keywords'] + ' ' + ' '.join(key)
        return x['keywords']

    def set_genres(self, x):
        """Append the ``'|'``-separated genres to the keywords.

        Rows with a missing (non-string) genres field are left unchanged.
        """
        if isinstance(x['genres'], str):
            return x['keywords'] + ' ' + ' '.join(x['genres'].split('|'))
        return x['keywords']


# ## Creating the book recommendation system
# After building each book's keywords, the next step is building the
# recommendation system from the book dataset.
# - First, we create TF-IDF vectors from the keywords using TfidfVectorizer.
#   TF-IDF, or Term Frequency-Inverse Document Frequency, is a numerical
#   statistic reflecting how important a word is to a document in a
#   collection of documents.[*](https://en.wikipedia.org/wiki/Tf–idf)
# - Then, using the
#   [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
#   between the books' TF-IDF vectors, we can return the books most similar
#   to the one entered by the user.


class BookRecommender():
    """Content-based book recommender built on TF-IDF keyword similarity.

    Parameters
    ----------
    data : pandas.DataFrame
        The raw Goodreads book dataset.
    """

    def __init__(self, data):
        self.data = data

    def transform(self):
        """Preprocess the raw dataset, compute TF-IDF vectors from its
        keywords, and compute their pairwise cosine-similarity matrix."""
        self.preprocessed()
        print('preprocess completed. getting tfidf...')
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(self.processed_data['keywords'])
        print('getting cosine similarity...')
        # linear_kernel on the L2-normalised TF-IDF rows equals cosine
        # similarity and is faster than cosine_similarity here.
        self.cosine_similarity = linear_kernel(tfidf, tfidf)
        print('transformation completed.')

    def preprocessed(self):
        """Preprocess the dataset using KeywordsTransformer."""
        kt = KeywordsTransformer()
        kt.fit(self.data)
        self.processed_data = kt.transform(self.data)

    def book_recommender(self, title):
        """Print books with high similarity to *title* (case-insensitive).

        Returns the printer's result, or the string ``'Book is not found!'``
        when no book matches the given title.
        """
        BOOK_RECOMMENDATION_COUNT = 10
        try:
            matches = self.processed_data.book_title.str.lower() == title.lower()
            ids = self.processed_data.loc[matches].index[0]
        except IndexError:
            # ``index[0]`` raised: no row matched the requested title.
            return 'Book is not found!'
        # Highest-similarity rows first; slot 0 is the queried book itself,
        # hence the +1 to still yield BOOK_RECOMMENDATION_COUNT suggestions.
        sim_scores = sorted(
            enumerate(self.cosine_similarity[ids]),
            key=lambda s: s[1],
            reverse=True,
        )[:BOOK_RECOMMENDATION_COUNT + 1]
        books = [
            {
                'book_id': i,
                'book_title': self.processed_data.iloc[i].book_title,
                'book_author': self.processed_data.iloc[i].first_author,
                'similarity': score,
            }
            for i, score in sim_scores
        ]
        return self.book_recommender_printer(books)

    def book_recommender_printer(self, books):
        """Print the query book (first entry) followed by its recommendations."""
        query = books[0]
        print('---------title input : ', query['book_title'],
              ' by', query['book_author'], '\n')
        print('---------recommendations: \n')
        for b in books[1:]:
            print(b['book_title'], 'by', b['book_author'])
            print('similarity: ', b['similarity'])
            print('\n')


if __name__ == '__main__':
    # load the dataset
    df = pd.read_csv('data/book_data.csv')
    df.head()
    len(df)

    # initiate the book recommender
    recommender = BookRecommender(df)

    # transform the data and create the book recommender
    recommender.transform()

    # ## Testing the recommendation system
    # Let's test the recommendation system by entering one of my favorite
    # books: **And Then There Were None by Agatha Christie!**
    recommender.book_recommender('and then there were none')

    # As expected, the recommendation system generates several books with high
    # similarity to the entered title, in this case And Then There Were None by
    # Agatha Christie. Since I'm a big fan of hers, I have already read The
    # Murder on the Links and N or M?! Both are pretty solid books, so I was
    # very excited when the system gave me Evil Under the Sun, a title I
    # haven't read yet. Already on my to-be-read list and I can't wait to
    # read it!!!

    # Next, I will enter a book I'm currently reading:
    # **The Secret History by Donna Tartt**
    recommender.book_recommender('the secret history')

    # I'm embarrassed to say that I've never heard of The Pugilist at Rest by
    # Thom Jones. It's a short story collection, and I rarely read that sort
    # of book, so that's kind of understandable. The Goldfinch by Donna Tartt
    # is the only book I recognized from this list, and it's on my to-be-read
    # list already. But it's still very exciting to explore some new books
    # and maybe find a new all-time favorite!