!pip install -q surprise import pandas as pd import numpy as np import datetime as dt import surprise from surprise.prediction_algorithms import * from surprise import Reader, Dataset from surprise import SVD, accuracy from surprise.model_selection import train_test_split from surprise.model_selection import GridSearchCV import warnings warnings.filterwarnings('ignore') !pip install -q watermark %reload_ext watermark %watermark -m -iv # Poetry is a tool for dependency management and packaging in Python. # It allows you to declare the libraries your project depends on and it will manage (install/update) them for you. # https://python-poetry.org/docs/basic-usage/ # !curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python - # !/root/.local/bin/poetry --version # !/root/.local/bin/poetry new poetry-demo # %cd poetry-demo # !/root/.local/bin/poetry install # !/root/.local/bin/poetry add numpy # import pandas as pd # import requests # import json # import random # import numpy # with open('creds.json') as f: # creds = json.load(f) # users = [] # for i in random.sample(range(1, 12000000), 50000): # try: # url ='https://api.ravelry.com/people/' + str(i) +'.json' # response = requests.get(url, auth=(creds['id'], creds['key'])) # users.append(response.json()['user']['username']) # except ValueError: # pass # if len(set(users)) >10000: # break # parsed_data = [] # for i, user in enumerate(users[9935:]): # url ='https://api.ravelry.com/projects/' + user + '/list.json?sort=completed_' # response = requests.get(url, auth=(creds['id'], creds['key'])) # try: # for project in response.json()['projects']: # if project['craft_name'] == 'Knitting': # if project['pattern_id'] != None: # pattern_url ='https://api.ravelry.com/patterns.json?ids=' + str(int(project['pattern_id'])) # pattern_response = requests.get(pattern_url, auth=(creds['id'], creds['key'])) # project_tuple = (user, project['completed'], project['rating'], 
project['status_name'], # project['pattern_id'], # pattern_response.json()['patterns'][str(int(project['pattern_id']))]['rating_average'], # pattern_response.json()['patterns'][str(int(project['pattern_id']))]['rating_count']) # parsed_data.append(project_tuple) # except ValueError: # pass # print(i, len(parsed_data)) # df = pd.DataFrame(parsed_data, columns= ['user', 'completed', 'rating', 'status', 'pattern_id', 'average_rating', 'rating_count']) # finished_projects = df[df['status'] == 'Finished'] # finished_projects.to_csv('ravelry_interactions.csv', index=False) df = pd.read_csv('https://raw.githubusercontent.com/recohut/reco-data/ravelry/ravelry/v1/ravelry_interactions.csv') df df_drop_nans = df[['user', 'pattern_id', 'rating']].dropna(subset = ['rating']) df_drop_nans df_drop_nans.describe(include='all').T df_replace_nans = df[['user', 'pattern_id', 'rating', 'average_rating']] rating_replace_nans = df_replace_nans['rating'].fillna(df_replace_nans['average_rating']) df_replace_nans['rating'] = rating_replace_nans df_replace_nans.drop(columns = 'average_rating', inplace = True) df_replace_nans reader = Reader() data_replace = Dataset.load_from_df(df_replace_nans, reader) data_drop = Dataset.load_from_df(df_drop_nans, reader) drop_trainset, drop_testset = train_test_split(data_drop, test_size=0.25) replace_trainset, replace_testset = train_test_split(data_replace, test_size=0.25) drop_trainset.global_mean algo = SVD(n_factors = 50, n_epochs = 45, lr_all = 0.004, reg_all = 0.2) algo.fit(drop_trainset) predictions = algo.test(drop_testset) accuracy.rmse(predictions) param_grid = {'n_factors':[5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'n_epochs': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'lr_all': [0.002, 0.003, 0.004, 0.005], 'reg_all': [0.2, 0.3, 0.4, 0.5, 0.6]} gs_model = GridSearchCV(SVD, param_grid=param_grid, n_jobs = -1, joblib_verbose=5) gs_model.fit(data_drop) gs_model.best_params algo = SVD(**gs_model.best_params['rmse']) algo.fit(drop_trainset) 
# Score the hold-out split with the tuned model and report its RMSE.
predictions = algo.test(drop_testset)
accuracy.rmse(predictions)

# Flatten the Surprise prediction tuples into a DataFrame, one row per
# (user, item) pair, taking all four fields in a single pass.
predictions_df = pd.DataFrame(
    [(p.uid, p.iid, p.r_ui, p.est) for p in predictions],
    columns=["user", "item", "actual", "estimated"],
)

# Spot-check a single user: summary of their held-out predictions ...
predictions_df.loc[predictions_df['user'] == 'Ona'].describe()

# ... their raw interactions in the source data ...
df.loc[df['user'] == "Ona"]

# ... and a single point estimate for pattern id 1.
algo.predict('Ona', 1)