import graphlab as gl
# set canvas to show sframes and sgraphs in ipython notebook
gl.canvas.set_target('ipynb')
import matplotlib.pyplot as plt
%matplotlib inline
# Input: a tab-delimited file without a header row (header=False), so the
# parser assigns the default column names X1, X2, X3.
# Remote alternative to the local path below:
#   train_file = 'http://s3.amazonaws.com/dato-datasets/millionsong/10000.txt'
train_file = '/Users/chengjun/bigdata/millionsong/song_usage_10000.txt'
sf = gl.SFrame.read_csv(train_file, header=False, delimiter='\t', verbose=False)

# Give the columns the names every later cell refers to.  The result is
# rebound to `sf` rather than relying on `rename` mutating the frame in place:
# this is a no-op when `rename` returns the same (mutated) SFrame, and keeps
# the notebook correct if `rename` returns a new SFrame instead.
sf = sf.rename({'X1': 'user_id', 'X2': 'music_id', 'X3': 'rating'})
sf.show()
------------------------------------------------------ Inferred types from first line of file as column_type_hints=[str,str,int] If parsing fails due to incorrect types, you can correct the inferred type list above and pass it to read_csv in the column_type_hints argument ------------------------------------------------------ PROGRESS: Read 844838 lines. Lines per second: 810295 PROGRESS: Finished parsing file /Users/chengjun/bigdata/millionsong/song_usage_10000.txt PROGRESS: Parsing completed. Parsed 2000000 lines in 1.59616 secs.
# Random 80/20 split of the observations; the fixed seed keeps the split
# reproducible between runs.
train_set, test_set = sf.random_split(0.8, seed=1)

# Popularity recommender trained on the training split, using the 'rating'
# column as the target.
popularity_model = gl.popularity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating'
)
PROGRESS: Recsys training: model = popularity PROGRESS: Preparing data set. PROGRESS: Data has 1599753 observations with 76085 users and 10000 items. PROGRESS: Data prepared in: 1.23558s PROGRESS: 1599753 observations to process; with 10000 unique items.
# Item-similarity recommender on the same training split, with cosine
# similarity between items and 'rating' as the target column.
item_sim_model = gl.item_similarity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
    similarity_type='cosine'
)
PROGRESS: Recsys training: model = item_similarity PROGRESS: Preparing data set. PROGRESS: Data has 1599753 observations with 76085 users and 10000 items. PROGRESS: Data prepared in: 1.34152s PROGRESS: Computing item similarity statistics: PROGRESS: Computing most similar items for 10000 items: PROGRESS: +-----------------+-----------------+ PROGRESS: | Number of items | Elapsed Time | PROGRESS: +-----------------+-----------------+ PROGRESS: | 1000 | 1.67234 | PROGRESS: | 2000 | 1.70878 | PROGRESS: | 3000 | 1.74289 | PROGRESS: | 4000 | 1.77751 | PROGRESS: | 5000 | 1.81794 | PROGRESS: | 6000 | 1.85361 | PROGRESS: | 7000 | 1.88976 | PROGRESS: | 8000 | 1.92744 | PROGRESS: | 9000 | 1.96709 | PROGRESS: | 10000 | 2.08439 | PROGRESS: +-----------------+-----------------+ PROGRESS: Finished training in 2.50669s PROGRESS: Finished prediction in 0.734376s
# Factorization recommender on the training split with 'rating' as the
# target; all other hyper-parameters are left at the library defaults.
factorization_machine_model = gl.recommender.factorization_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating'
)
PROGRESS: Recsys training: model = factorization_recommender PROGRESS: Preparing data set. PROGRESS: Data has 1599753 observations with 76085 users and 10000 items. PROGRESS: Data prepared in: 1.31298s PROGRESS: Training factorization_recommender for recommendations. PROGRESS: +--------------------------------+--------------------------------------------------+----------+ PROGRESS: | Parameter | Description | Value | PROGRESS: +--------------------------------+--------------------------------------------------+----------+ PROGRESS: | num_factors | Factor Dimension | 8 | PROGRESS: | regularization | L2 Regularization on Factors | 1e-08 | PROGRESS: | solver | Solver used for training | sgd | PROGRESS: | linear_regularization | L2 Regularization on Linear Coefficients | 1e-10 | PROGRESS: | max_iterations | Maximum Number of Iterations | 50 | PROGRESS: +--------------------------------+--------------------------------------------------+----------+ PROGRESS: Optimizing model using SGD; tuning step size. PROGRESS: Using 199969 / 1599753 points for tuning the step size. PROGRESS: +---------+-------------------+------------------------------------------+ PROGRESS: | Attempt | Initial Step Size | Estimated Objective Value | PROGRESS: +---------+-------------------+------------------------------------------+ PROGRESS: | 0 | 25 | No Decrease (234.956 >= 45.6461) | PROGRESS: | 1 | 6.25 | No Decrease (222.818 >= 45.6461) | PROGRESS: | 2 | 1.5625 | No Decrease (193.879 >= 45.6461) | PROGRESS: | 3 | 0.390625 | No Decrease (93.6001 >= 45.6461) | PROGRESS: | 4 | 0.0976562 | 18.1929 | PROGRESS: | 5 | 0.0488281 | 12.7349 | PROGRESS: | 6 | 0.0244141 | 27.6064 | PROGRESS: +---------+-------------------+------------------------------------------+ PROGRESS: | Final | 0.0488281 | 12.7349 | PROGRESS: +---------+-------------------+------------------------------------------+ PROGRESS: Starting Optimization. 
PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+ PROGRESS: | Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size | PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+ PROGRESS: | Initial | 388us | 43.795 | 6.61778 | | PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+ PROGRESS: | 1 | 242.781ms | 43.525 | 6.59695 | 0.0488281 | PROGRESS: | 2 | 369.391ms | 40.9211 | 6.3966 | 0.0290334 | PROGRESS: | 3 | 491.657ms | 37.9834 | 6.1627 | 0.0214205 | PROGRESS: | 4 | 603.858ms | 35.2255 | 5.93471 | 0.0172633 | PROGRESS: | 5 | 743.824ms | 32.7566 | 5.7229 | 0.014603 | PROGRESS: | 6 | 861.2ms | 30.8412 | 5.553 | 0.0127367 | PROGRESS: | 10 | 1.39s | 24.7548 | 4.97477 | 0.008683 | PROGRESS: | 11 | 1.53s | 23.5887 | 4.85613 | 0.00808399 | PROGRESS: | 20 | 2.76s | 17.6337 | 4.19832 | 0.00516295 | PROGRESS: | 30 | 3.96s | 14.4135 | 3.79539 | 0.00380916 | PROGRESS: | 40 | 5.17s | 12.5212 | 3.53725 | 0.00306991 | PROGRESS: | 50 | 6.39s | 9.83216 | 3.13412 | 0.00154408 | PROGRESS: +---------+--------------+-------------------+-----------------------+-------------+ PROGRESS: Optimization Complete: Maximum number of passes through the data reached. PROGRESS: Computing final objective value and training RMSE. PROGRESS: Final objective value: 8.86198 PROGRESS: Final training RMSE: 2.97532
# Evaluate the three trained models on the held-out split.
# user_sample=0.1 evaluates on a 10% sample of users.
# NOTE(review): skip_set=train_set presumably keeps interactions already seen
# in training out of the recommendations -- confirm against the
# compare_models documentation.
result = gl.recommender.util.compare_models(
    test_set,
    [
        popularity_model,
        item_sim_model,
        factorization_machine_model,
    ],
    user_sample=0.1,
    skip_set=train_set
)
compare_models: using 6871 users to estimate model performance PROGRESS: Evaluate model M0 PROGRESS: recommendations finished on 1000/6871 queries. users per second: 12410 PROGRESS: recommendations finished on 2000/6871 queries. users per second: 14958.4 PROGRESS: recommendations finished on 3000/6871 queries. users per second: 15825.3 PROGRESS: recommendations finished on 4000/6871 queries. users per second: 16808.7 PROGRESS: recommendations finished on 5000/6871 queries. users per second: 17280.5 PROGRESS: recommendations finished on 6000/6871 queries. users per second: 17228 Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 2 | 0.000363848057051 | 0.000222530101733 | | 4 | 0.000509387279872 | 0.000644168294629 | | 6 | 0.000460874205598 | 0.000838220591723 | | 8 | 0.000418425265609 | 0.000983759814544 | | 10 | 0.000465725513026 | 0.00128720279373 | | 12 | 0.000412361131325 | 0.00132237477257 | | 14 | 0.000457408986007 | 0.00161781917277 | | 16 | 0.000491194877019 | 0.00189451695711 | | 18 | 0.000468959717977 | 0.00196078928179 | | 20 | 0.000480279435308 | 0.00211815339109 | +--------+-------------------+-------------------+ [10 rows x 3 columns] Overall RMSE: 5.79840126177 Per User RMSE (best) +-------------------------------+-------+-----------------+ | user_id | count | rmse | +-------------------------------+-------+-----------------+ | 907f83008d1b7a7958766544a0... | 1 | 0.0160085378869 | +-------------------------------+-------+-----------------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 2c263b458bb317ee91c346ae90... 
| 4 | 172.795342779 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+------+ | music_id | count | rmse | +--------------------+-------+------+ | SOZWCBD12AB01848DD | 1 | 0.0 | +--------------------+-------+------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOTGIKV12AB0182176 | 1 | 173.804878049 | +--------------------+-------+---------------+ [1 rows x 3 columns] PROGRESS: Evaluate model M1 PROGRESS: recommendations finished on 1000/6871 queries. users per second: 1291.91 PROGRESS: recommendations finished on 2000/6871 queries. users per second: 1303.24 PROGRESS: recommendations finished on 3000/6871 queries. users per second: 1303.16 PROGRESS: recommendations finished on 4000/6871 queries. users per second: 1309.34 PROGRESS: recommendations finished on 5000/6871 queries. users per second: 1309.57 PROGRESS: recommendations finished on 6000/6871 queries. 
users per second: 1318.84 Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 2 | 0.000509387279872 | 0.000106894612147 | | 4 | 0.000509387279872 | 0.00041709974353 | | 6 | 0.000557900354145 | 0.000741928943341 | | 8 | 0.000491194877019 | 0.000896131215139 | | 10 | 0.000509387279872 | 0.0010508900222 | | 12 | 0.000533643817009 | 0.00124220214402 | | 14 | 0.000509387279872 | 0.00144984014027 | | 16 | 0.000491194877019 | 0.00164649135206 | | 18 | 0.000517472792251 | 0.00213570216873 | | 20 | 0.000509387279872 | 0.00236702556808 | +--------+-------------------+-------------------+ [10 rows x 3 columns] PROGRESS: Finished prediction in 0.226961s Overall RMSE: 6.09897586494 Per User RMSE (best) +-------------------------------+-------+------+ | user_id | count | rmse | +-------------------------------+-------+------+ | 91e5266cafbdd11964d70fb1d8... | 1 | 0.0 | +-------------------------------+-------+------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 2c263b458bb317ee91c346ae90... | 4 | 161.518485703 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+------+ | music_id | count | rmse | +--------------------+-------+------+ | SOOIQZC12A6701FEA1 | 2 | 0.0 | +--------------------+-------+------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+-------+ | music_id | count | rmse | +--------------------+-------+-------+ | SOTGIKV12AB0182176 | 1 | 172.0 | +--------------------+-------+-------+ [1 rows x 3 columns] PROGRESS: Evaluate model M2 PROGRESS: recommendations finished on 1000/6871 queries. users per second: 10178.8 PROGRESS: recommendations finished on 2000/6871 queries. 
users per second: 11326.8 PROGRESS: recommendations finished on 3000/6871 queries. users per second: 12072.2 PROGRESS: recommendations finished on 4000/6871 queries. users per second: 12724.1 PROGRESS: recommendations finished on 5000/6871 queries. users per second: 12371.5 PROGRESS: recommendations finished on 6000/6871 queries. users per second: 12328.1 Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 2 | 0.000291078445641 | 0.000204967738806 | | 4 | 0.000291078445641 | 0.000321852261162 | | 6 | 0.000315334982778 | 0.000400252854408 | | 8 | 0.000382040459904 | 0.000704556888155 | | 10 | 0.000480279435308 | 0.00107200975662 | | 12 | 0.000533643817009 | 0.00138531272247 | | 14 | 0.000550969914964 | 0.0016547624225 | | 16 | 0.000591253092708 | 0.00185731432805 | | 18 | 0.000582156891282 | 0.00207921438986 | | 20 | 0.000574879930141 | 0.00223034316254 | +--------+-------------------+-------------------+ [10 rows x 3 columns] Overall RMSE: 7.66449264849 Per User RMSE (best) +-------------------------------+-------+-------------------+ | user_id | count | rmse | +-------------------------------+-------+-------------------+ | ac810151e32857e9f4200e8fa7... | 1 | 0.000812624627255 | +-------------------------------+-------+-------------------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 2c263b458bb317ee91c346ae90... 
| 4 | 182.725743431 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+-------------------+ | music_id | count | rmse | +--------------------+-------+-------------------+ | SOJWIJT12A8C136100 | 1 | 0.000881766015195 | +--------------------+-------+-------------------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+--------------+ | music_id | count | rmse | +--------------------+-------+--------------+ | SOTGIKV12AB0182176 | 1 | 236.91250578 | +--------------------+-------+--------------+ [1 rows x 3 columns]
/Users/chengjun/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:387: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure "matplotlib is currently using a non-GUI backend, "
# Number of recommendations requested per user.
K = 10

# Take 100 distinct user ids from the full (unsplit) data set and ask the
# item-similarity model for the top-K items for each of them.
unique_user_ids = sf['user_id'].unique()
users = gl.SArray(unique_user_ids.head(100))
recs = item_sim_model.recommend(users=users, k=K)

# Last expression of the cell: the notebook displays the first rows.
recs.head()
user_id | music_id | score | rank |
---|---|---|---|
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOFCGSE12AF72A674F | 20.686440678 | 1 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOQYHJW12AB0182AA6 | 20.0 | 2 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOELDGL12A8C135ED7 | 20.0 | 3 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOUWZPO12A6D4F83E3 | 20.0 | 4 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SONAQRQ12AB017FD0B | 20.0 | 5 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOMVIOV12A6D4F719A | 20.0 | 6 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOHJWLZ12A6D4F7756 | 20.0 | 7 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOIRUXQ12A8C133060 | 20.0 | 8 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOLVHIW12A8C13BA03 | 20.0 | 9 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... | SOPWZGK12A67020744 | 20.0 | 10 |