# ! apt-get update
# ! apt-get install g++ openjdk-8-jdk
# ! pip3 install nltk konlpy matplotlib gensim
# ! apt-get install fonts-nanum-eco
# ! apt-get install fontconfig
# ! fc-cache -fv
# ! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/
# ! rm -rf /content/.cache/matplotlib/*
# Ensure the WordNet corpus is present; WordNetLemmatizer (used below) needs it.
import nltk
nltk.download('wordnet')
import pandas as pd
import io, requests
# Movie metadata CSV hosted on GitHub; fetched over HTTP on every run.
url = "https://raw.githubusercontent.com/YongBeomKim/nltk_basic/master/data/movies_metadata.csv"
response = requests.get(url).content
# Parse only the three columns used later; low_memory=False avoids
# mixed-dtype warnings from chunked parsing of this large file.
movies = pd.read_csv(io.StringIO(response.decode('utf-8')),
usecols=['original_title', 'overview', 'title'], low_memory=False)
# Drop any row with a missing value; note this leaves non-contiguous index labels.
movies = movies.dropna(axis=0)
movies.shape
[nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Package wordnet is already up-to-date!
(44506, 3)
# Plot summaries are the model input; titles are used only for display.
movie_plot_li = movies['overview']
movie_info_li = movies['title']
movies.head(3)
original_title | overview | title | |
---|---|---|---|
0 | Toy Story | Led by Woody, Andy's toys live happily in his ... | Toy Story |
1 | Jumanji | When siblings Judy and Peter discover an encha... | Jumanji |
2 | Grumpier Old Men | A family wedding reignites the ancient feud be... | Grumpier Old Men |
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer


class LemmaTokenizer(object):
    """Callable tokenizer for TfidfVectorizer: splits a document into runs of
    ASCII letters and lemmatizes each token with WordNet."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # BUGFIX: the original pattern '[A-z]+' also matches the characters
        # between 'Z' and 'a' in ASCII ('[', '\', ']', '^', '_', '`'),
        # so tokens like "woody_s" could survive. '[A-Za-z]+' matches
        # letters only. '(?u)' (Unicode flag) is kept from the original;
        # it is the default in Python 3 and harmless.
        self.tokenizer = RegexpTokenizer('(?u)[A-Za-z]+')

    def __call__(self, doc):
        # Invoked once per document by TfidfVectorizer during fit/transform.
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)]
# Plug the custom tokenizer defined above into scikit-learn's TF-IDF vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=3, tokenizer=LemmaTokenizer(), stop_words='english')
# Limit to the first 25,000 plots to avoid running out of memory.
X = vectorizer.fit_transform(movie_plot_li[:25000])
# NOTE(review): 'vocabluary' is misspelled; kept as-is in case later cells
# (not visible here) reference it. get_feature_names() is deprecated in
# newer scikit-learn in favor of get_feature_names_out().
vocabluary = vectorizer.get_feature_names()
/usr/local/lib/python3.6/dist-packages/sklearn/feature_extraction/text.py:301: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words. 'stop_words.' % sorted(inconsistent))
# Build a pairwise cosine-similarity matrix between all vectorized movie plots
# (basis of the "similar movie" recommendations below).
from sklearn.metrics.pairwise import cosine_similarity
movie_sim = cosine_similarity(X)
print(movie_sim.shape)
movie_sim
(25000, 25000)
array([[1. , 0.02707166, 0.00539808, ..., 0. , 0.00976661, 0.01429638], [0.02707166, 1. , 0.0466465 , ..., 0. , 0.00801626, 0.02456379], [0.00539808, 0.0466465 , 1. , ..., 0. , 0.02812631, 0.01745318], ..., [0. , 0. , 0. , ..., 1. , 0. , 0.01639453], [0.00976661, 0.00801626, 0.02812631, ..., 0. , 1. , 0.01056217], [0.01429638, 0.02456379, 0.01745318, ..., 0.01639453, 0.01056217, 1. ]])
# Print the list of movies whose plots are most similar to a given movie.
def similar_recommend_by_movie_id(movielens_id, rank=8):
    """Print the `rank` most similar movies to the movie with 1-based id
    `movielens_id`, using the global cosine matrix `movie_sim` and the
    title series `movie_info_li` (row order assumed to match — TODO confirm).
    """
    movie_index = movielens_id - 1  # ids are 1-based; matrix rows are 0-based
    # Sort every movie by similarity to the target, most similar first.
    # Position 0 is the movie itself (self-similarity 1.0).
    similar_movies = sorted(
        enumerate(movie_sim[movie_index]), key=lambda x: x[1], reverse=True
    )
    # BUGFIX: use positional indexing (.iloc) — after dropna() the Series'
    # labels are non-contiguous, so label lookup movie_info_li[i] can hit
    # the wrong row or raise KeyError.
    print("----- {} : 관람객 추천영화 -------".format(movie_info_li.iloc[similar_movies[0][0]]))
    # BUGFIX: the original sliced [1:rank] (only rank-1 movies) and numbered
    # them from 0. Slice [1:rank+1] skips the movie itself and yields exactly
    # `rank` items; enumerate(..., start=1) makes the ranking 1-based.
    for no, (idx, _score) in enumerate(similar_movies[1:rank + 1], start=1):
        print('추천영화 {}순위 : {}'.format(no, movie_info_li.iloc[idx]))
# Demo: print the recommendations for movie id 20 ("Money Train").
similar_recommend_by_movie_id(20, rank=20)
----- Money Train : 관람객 추천영화 ------- 추천영화 0순위 : The One Percent 추천영화 1순위 : The Long Good Friday 추천영화 2순위 : The Milagro Beanfield War 추천영화 3순위 : Gone in Sixty Seconds 추천영화 4순위 : The January Man 추천영화 5순위 : Armored Car Robbery 추천영화 6순위 : You Can Count on Me 추천영화 7순위 : Funny People 추천영화 8순위 : Uranus 추천영화 9순위 : OMG: Oh My God! 추천영화 10순위 : Hush! 추천영화 11순위 : Rich and Famous 추천영화 12순위 : RoboCop 3 추천영화 13순위 : Afterschool 추천영화 14순위 : The Searchers 추천영화 15순위 : Anatomy of Hell 추천영화 16순위 : Lagaan: Once Upon a Time in India 추천영화 17순위 : The Krays 추천영화 18순위 : A Face in the Crowd