The Reader class is used to parse a file containing ratings.
Such a file is assumed to specify only one rating per line,
and each line needs to respect the following structure:

    user ; item ; rating ; [timestamp]
# Print the installed scikit-surprise version (this session recorded 1.0.6).
import surprise
print(surprise.__version__)
1.0.6
%%time
# Load the built-in sample dataset (MovieLens 100k) and fit a baseline SVD model.
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import train_test_split
data = Dataset.load_builtin('ml-100k')
# Hold out 25% of the ratings as a test set; fixed seed for reproducibility.
trainset, testset = train_test_split(data, test_size=.25, random_state=0)
algo = SVD()
algo.fit(trainset)
CPU times: user 4.13 s, sys: 17 ms, total: 4.15 s Wall time: 4.15 s
# Predict a rating for every (user, item) pair in the held-out test set.
predictions = algo.test(testset)
print('prediction type :{} size:{}'.format(type(predictions),len(predictions)))
# Show the first five Prediction results.
print('prediction 예측모델의 최초 5개 결과물 추출')
predictions[:5]
prediction type :<class 'list'> size:25000 prediction 예측모델의 최초 5개 결과물 추출
[Prediction(uid='120', iid='282', r_ui=4.0, est=3.7169020090370926, details={'was_impossible': False}), Prediction(uid='882', iid='291', r_ui=4.0, est=3.7336545625605893, details={'was_impossible': False}), Prediction(uid='535', iid='507', r_ui=5.0, est=3.9110320749400636, details={'was_impossible': False}), Prediction(uid='697', iid='244', r_ui=5.0, est=3.5976295337676976, details={'was_impossible': False}), Prediction(uid='751', iid='385', r_ui=4.0, est=3.4908622254480104, details={'was_impossible': False})]
# (UserID, MovieID, rating) triples.
# Only these three DataFrame fields are used for training.
[ (pred.uid, pred.iid, pred.est) for pred in predictions[:3] ]
[('120', '282', 3.7169020090370926), ('882', '291', 3.7336545625605893), ('535', '507', 3.9110320749400636)]
# Given a user id and an item id, use the fitted model to predict a
# single rating. The raw ids are passed as strings here.
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
print(pred)
user: 196 item: 302 r_ui = None est = 4.21 {'was_impossible': False}
# Evaluate the train/test model: RMSE over the held-out test predictions.
accuracy.rmse(predictions)
RMSE: 0.9493
0.9493021636428113
# Load the MovieLens ratings as a pandas DataFrame.
# (ratings_noh.csv below is the same data with the header row removed.)
import pandas as pd
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
ratings.head(3)
userId | movieId | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 31 | 2.5 | 1260759144 |
1 | 1 | 1029 | 3.0 | 1260759179 |
2 | 1 | 1061 | 3.0 | 1260759182 |
# Write a header-less copy for Surprise's file loader, then peek at the raw values.
ratings.to_csv('./data/ml-latest-small/ratings_noh.csv', index=False, header=False)
ratings.values
array([[1.00000000e+00, 3.10000000e+01, 2.50000000e+00, 1.26075914e+09], [1.00000000e+00, 1.02900000e+03, 3.00000000e+00, 1.26075918e+09], [1.00000000e+00, 1.06100000e+03, 3.00000000e+00, 1.26075918e+09], ..., [6.71000000e+02, 6.36500000e+03, 4.00000000e+00, 1.07094036e+09], [6.71000000e+02, 6.38500000e+03, 2.50000000e+00, 1.07097966e+09], [6.71000000e+02, 6.56500000e+03, 3.50000000e+00, 1.07478472e+09]])
# Convert the raw CSV into a Surprise Dataset via a Reader spec.
from surprise import Reader, Dataset
reader = Reader(line_format = 'user item rating timestamp', # column order per line
                sep = ',',                                  # field separator
                rating_scale = (0.5, 5))                    # min ~ max rating range
data = Dataset.load_from_file('./data/ml-latest-small/ratings_noh.csv', reader=reader)
data
<surprise.dataset.DatasetAutoFolds at 0x7fc934234be0>
# n_factors=50 : train the model with 50 latent factors (K=50).
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25, random_state=0)
algo = SVD(n_factors=50, random_state=0)
algo.fit(trainset)
# Evaluate the model on the test set with RMSE.
from surprise import accuracy
predictions = algo.test(testset)
accuracy.rmse(predictions)
RMSE: 0.8908
0.8907754769926038
# Reload the ratings DataFrame for the load_from_df workflow below.
import pandas as pd
from surprise import Reader, Dataset
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
ratings.head(3)
userId | movieId | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 31 | 2.5 | 1260759144 |
1 | 1 | 1029 | 3.0 | 1260759179 |
2 | 1 | 1061 | 3.0 | 1260759182 |
# The columns passed from the ratings DataFrame must be in the order:
# user id, item id, rating.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
algo = SVD(n_factors=50, random_state=0)
trainset, testset = train_test_split(data, test_size=.25, random_state=0)
algo.fit(trainset)
predictions = algo.test( testset )
accuracy.rmse(predictions)
RMSE: 0.8908
0.8907754769926038
%%time
# 5-fold cross-validation of SVD, reporting RMSE and MAE per fold.
from surprise.model_selection import cross_validate
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
reader = Reader(rating_scale = (0.5, 5.0)) # Reader instance (rating range 0.5-5.0)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
algo = SVD(random_state = 0)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
Evaluating RMSE, MAE of algorithm SVD on 5 split(s). Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std RMSE (testset) 0.8922 0.8964 0.8896 0.9073 0.8931 0.8957 0.0062 MAE (testset) 0.6891 0.6923 0.6845 0.6966 0.6864 0.6898 0.0043 Fit time 3.99 3.93 3.95 3.93 4.06 3.97 0.05 Test time 0.13 0.18 0.18 0.13 0.18 0.16 0.03 CPU times: user 21.5 s, sys: 23.9 ms, total: 21.5 s Wall time: 21.5 s
%%time
# 최적화 검증용 파라미터, GridSearchCV (KFold CV 3개 분할)
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [20, 40, 60], 'n_factors': [50, 100, 200] }
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
gs.fit(data)
# 최고 RMSE Evaluation 점수와 그때의 하이퍼 파라미터
print("Bast Score: {}\nParams: {}".format(
gs.best_score['rmse'], gs.best_params['rmse']))
Best Score: 0.900002299884766 Params: {'n_epochs': 20, 'n_factors': 50} CPU times: user 1min 7s, sys: 405 ms, total: 1min 7s Wall time: 2min 5s
# Older Surprise versions raised
#   AttributeError: 'DatasetAutoFolds' object has no attribute 'global_mean'
# when fitting/predicting without a train_test_split() step.
# The code below runs without error even without train_test_split();
# recent versions produce the result directly instead of raising.
# Train on the FULL dataset (no train/test split) via build_full_trainset().
import pandas as pd
from surprise import Reader, Dataset, SVD
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
# NOTE(review): line_format/sep describe file parsing; load_from_df presumably
# ignores them and only rating_scale matters here — confirm against Surprise docs.
reader = Reader(line_format = 'user item rating timestamp',
                sep = ',',
                rating_scale = (0.5, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
algo = SVD(n_factors=50, random_state=0)
algo.fit(trainset)
<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc93b526400>
# # Train using the entire dataset (no split).
# from surprise.dataset import DatasetAutoFolds
# reader = Reader(line_format = 'user item rating timestamp',
#                 sep = ',',
#                 rating_scale = (0.5, 5))
# # Load the file via the DatasetAutoFolds class.
# data_folds = DatasetAutoFolds(ratings_file = './data/ml-latest-small/ratings_noh.csv',
#                               reader = reader)
# trainset = data_folds.build_full_trainset()
# algo = SVD(n_epochs=20, n_factors=50, random_state=0)
# algo.fit(trainset)
# Load the movie metadata (title, genres) as a DataFrame.
movies = pd.read_csv('./data/ml-latest-small/movies.csv')
movies.head(3)
movieId | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
# Extract userId=9's movieId ratings and check whether movieId=42 is among them.
movieIds = ratings[ratings['userId']==9]['movieId']
if movieIds[movieIds==42].count() == 0:
    print('사용자 아이디 9는 영화 아이디 42의 평점 없음')
    # Look up movie id 42 in the movies metadata.
    print(movies[movies['movieId']==42])
사용자 아이디 9는 영화 아이디 42의 평점 없음 movieId title genres 40 42 Dead Presidents (1995) Action|Crime|Drama
# User 9 has no rating for movie 42, so fill in a predicted rating.
uid = str(9)
iid = str(42)
pred = algo.predict(uid, iid, verbose=True)
user: 9 item: 42 r_ui = None est = 3.54 {'was_impossible': False}
# Helper that lists the movies a given user has not rated yet.
def get_unseen_surprise(ratings, movies, userId):
    """Return the movieIds in `movies` that `userId` has not rated.

    Parameters:
        ratings: pandas DataFrame with 'userId' and 'movieId' columns.
        movies:  pandas DataFrame with a 'movieId' column.
        userId:  user id to look up in `ratings`.

    Returns:
        list of movieIds, in `movies` order, with no rating from `userId`.
    """
    seen_movies = ratings[ratings['userId']== userId]['movieId'].tolist()  # movies rated by userId
    total_movies = movies['movieId'].tolist()  # all movieIds
    # Use a set for O(1) membership tests; the original list lookup made the
    # comprehension O(len(total_movies) * len(seen_movies)).
    seen_set = set(seen_movies)
    unseen_movies = [movie for movie in total_movies  # movieIds without a rating
                     if movie not in seen_set]
    print('평점 매긴 영화수: {}\n추천 대상 영화수: {:,}\n전체 영화수: {:,}'.format(
          len(seen_movies),len(unseen_movies),len(total_movies)))
    return unseen_movies
# Build the recommendation-candidate list for user 9.
unseen_movies = get_unseen_surprise(ratings, movies, 9)
평점 매긴 영화수: 45 추천 대상 영화수: 9,080 전체 영화수: 9,125