集体智慧是指为了创造新想法,将一群人的行为、偏好或思想组合在一起。一般基于聪明的算法(Netflix, Google)或者提供内容的用户(Wikipedia)。
集体智慧编程所强调的是前者,即通过编写计算机程序、构造具有智能的算法收集并分析用户的数据,发现新的信息甚至是知识。
Toby Segaran, 2007, Programming Collective Intelligence. O'Reilly.
用户满意度
预测准确度
$r_{ui}$用户实际打分, $\hat{r_{ui}}$推荐算法预测打分, T为测量次数
均方根误差RMSE
$RMSE = \sqrt{\frac{\sum_{u, i \in T} (r_{ui} - \hat{r_{ui}})}{ T }^2} $
平均绝对误差MAE
$ MAE = \frac{\sum_{u, i \in T} \left | r_{ui} - \hat{r_{ui}} \right|}{ T}$
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
critics['Lisa Rose']['Lady in the Water']
2.5
critics['Toby']['Snakes on a Plane']
4.5
critics['Toby']
{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}
# 欧几里得距离
import numpy as np
np.sqrt(np.power(5-4, 2) + np.power(4-1, 2))
3.1622776601683795
1.0 /(1 + np.sqrt(np.power(5-4, 2) + np.power(4-1, 2)) )
0.2402530733520421
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
# Get the list of shared_items
si={}
for item in prefs[person1]:
if item in prefs[person2]:
si[item]=1
# if they have no ratings in common, return 0
if len(si)==0: return 0
# Add up the squares of all the differences
sum_of_squares=np.sum([np.power(prefs[person1][item]-prefs[person2][item],2)
for item in prefs[person1] if item in prefs[person2]])
#for item in si.keys()])#
return 1/(1+np.sqrt(sum_of_squares) )
sim_distance(critics, 'Lisa Rose','Toby')
0.3483314773547883
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
# Get the list of mutually rated items
si={}
for item in prefs[p1]:
if item in prefs[p2]: si[item]=1
# Find the number of elements
n=len(si)
# if they are no ratings in common, return 0
if n==0: return 0
# Add up all the preferences
sum1=np.sum([prefs[p1][it] for it in si])
sum2=np.sum([prefs[p2][it] for it in si])
# Sum up the squares
sum1Sq=np.sum([np.power(prefs[p1][it],2) for it in si])
sum2Sq=np.sum([np.power(prefs[p2][it],2) for it in si])
# Sum up the products
pSum=np.sum([prefs[p1][it]*prefs[p2][it] for it in si])
# Calculate Pearson score
num=pSum-(sum1*sum2/n)
den=np.sqrt((sum1Sq-np.power(sum1,2)/n)*(sum2Sq-np.power(sum2,2)/n))
if den==0: return 0
return num/den
sim_pearson(critics, 'Lisa Rose','Toby')
0.9912407071619299
# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
scores=[(similarity(prefs,person,other),other)
for other in prefs if other!=person]
# Sort the list so the highest scores appear at the top
scores.sort( )
scores.reverse( )
return scores[0:n]
topMatches(critics,'Toby',n=3) # topN
[(0.9912407071619299, 'Lisa Rose'), (0.9244734516419049, 'Mick LaSalle'), (0.8934051474415647, 'Claudia Puig')]
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
totals={}
simSums={}
for other in prefs:
# don't compare me to myself
if other==person: continue
sim=similarity(prefs,person,other)
# ignore scores of zero or lower
if sim<=0: continue
for item in prefs[other]:
# only score movies I haven't seen yet
if item not in prefs[person]:# or prefs[person][item]==0:
# Similarity * Score
totals.setdefault(item,0)
totals[item]+=prefs[other][item]*sim
# Sum of similarities
simSums.setdefault(item,0)
simSums[item]+=sim
# Create the normalized list
rankings=[(total/simSums[item],item) for item,total in totals.items()]
# Return the sorted list
rankings.sort()
rankings.reverse()
return rankings
# Now you can find out what movies I should watch next:
getRecommendations(critics,'Toby')
[(3.3477895267131013, 'The Night Listener'), (2.8325499182641614, 'Lady in the Water'), (2.530980703765565, 'Just My Luck')]
# You’ll find that the results are only affected very slightly by the choice of similarity metric.
getRecommendations(critics,'Toby',similarity=sim_distance)
[(3.457128694491423, 'The Night Listener'), (2.778584003814923, 'Lady in the Water'), (2.422482042361917, 'Just My Luck')]
# you just need to swap the people and the items.
def transformPrefs(prefs):
result={}
for person in prefs:
for item in prefs[person]:
result.setdefault(item,{})
# Flip item and person
result[item][person]=prefs[person][item]
return result
movies = transformPrefs(critics)
topMatches(movies,'Superman Returns')
[(0.6579516949597695, 'You, Me and Dupree'), (0.4879500364742689, 'Lady in the Water'), (0.11180339887498941, 'Snakes on a Plane'), (-0.1798471947990544, 'The Night Listener'), (-0.42289003161103106, 'Just My Luck')]
def calculateSimilarItems(prefs,n=10):
# Create a dictionary of items showing which other items they
# are most similar to.
result={}
# Invert the preference matrix to be item-centric
itemPrefs=transformPrefs(prefs)
c=0
for item in itemPrefs:
# Status updates for large datasets
c+=1
if c%100==0:
print("%d / %d" % (c,len(itemPrefs)))
# Find the most similar items to this one
scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
result[item]=scores
return result
itemsim=calculateSimilarItems(critics)
itemsim['Superman Returns']
[(0.3090169943749474, 'Snakes on a Plane'), (0.252650308587072, 'The Night Listener'), (0.2402530733520421, 'Lady in the Water'), (0.20799159651347807, 'Just My Luck'), (0.1918253663634734, 'You, Me and Dupree')]
Toby看过三个电影(snakes、Superman、dupree)和评分(依次是4.5、4.0、1.0)
表格2-3给出这三部电影与另外三部电影的相似度
R.xNight表示Toby对自己看过的三部定影的评分与Night这部电影相似度的乘积
那么Toby对于Night的评分可以表达为0.818+0.412+0.148 = 1.378
def getRecommendedItems(prefs,itemMatch,user):
userRatings=prefs[user]
scores={}
totalSim={}
# Loop over items rated by this user
for (item,rating) in userRatings.items( ):
# Loop over items similar to this one
for (similarity,item2) in itemMatch[item]:
# Ignore if this user has already rated this item
if item2 in userRatings: continue
# Weighted sum of rating times similarity
scores.setdefault(item2,0)
scores[item2]+=similarity*rating
# Sum of all the similarities
totalSim.setdefault(item2,0)
totalSim[item2]+=similarity
# Divide each total score by total weighting to get an average
rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
# Return the rankings from highest to lowest
rankings.sort( )
rankings.reverse( )
return rankings
getRecommendedItems(critics,itemsim,'Toby')
[(3.1667425234070894, 'The Night Listener'), (2.9366294028444346, 'Just My Luck'), (2.868767392626467, 'Lady in the Water')]
getRecommendations(movies,'Just My Luck')
[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]
getRecommendations(movies, 'You, Me and Dupree')
[(3.1637361366111816, 'Michael Phillips')]
使用二分图表示用户行为,因此基于图的算法可以应用到推荐系统当中。
# https://github.com/ParticleWave/RecommendationSystemStudy/blob/d1960056b96cfaad62afbfe39225ff680240d37e/PersonalRank.py
import os
import random
class Graph:
def __init__(self):
self.G = dict()
def addEdge(self, p, q):
if p not in self.G: self.G[p] = dict()
if q not in self.G: self.G[q] = dict()
self.G[p][q] = 1
self.G[q][p] = 1
def getGraphMatrix(self):
return self.G
graph = Graph()
graph.addEdge('A', 'a')
graph.addEdge('A', 'c')
graph.addEdge('B', 'a')
graph.addEdge('B', 'b')
graph.addEdge('B', 'c')
graph.addEdge('B', 'd')
graph.addEdge('C', 'c')
graph.addEdge('C', 'd')
G = graph.getGraphMatrix()
print(G.keys())
dict_keys(['A', 'd', 'b', 'C', 'a', 'c', 'B'])
G
{'A': {'a': 1, 'c': 1}, 'B': {'a': 1, 'b': 1, 'c': 1, 'd': 1}, 'C': {'c': 1, 'd': 1}, 'a': {'A': 1, 'B': 1}, 'b': {'B': 1}, 'c': {'A': 1, 'B': 1, 'C': 1}, 'd': {'B': 1, 'C': 1}}
for i, ri in G.items():
for j, wij in ri.items():
print(i, j, wij)
A a 1 A c 1 d C 1 d B 1 b B 1 C d 1 C c 1 a B 1 a A 1 c C 1 c B 1 c A 1 B a 1 B d 1 B c 1 B b 1
def PersonalRank(G, alpha, root, max_step):
# G is the biparitite graph of users' ratings on items
# alpha is the probability of random walk forward
# root is the studied User
# max_step if the steps of iterations.
rank = dict()
rank = {x:0.0 for x in G.keys()}
rank[root] = 1.0
for k in range(max_step):
tmp = {x:0.0 for x in G.keys()}
for i,ri in G.items():
for j,wij in ri.items():
if j not in tmp: tmp[j] = 0.0 #
tmp[j] += alpha * rank[i] / (len(ri)*1.0)
if j == root: tmp[j] += 1.0 - alpha
rank = tmp
print(k, rank)
return rank
PersonalRank(G, 0.8, 'A', 20)
# print(PersonalRank(G, 0.8, 'B', 20))
# print(PersonalRank(G, 0.8, 'C', 20))
0 {'c': 0.4, 'd': 0.0, 'b': 0.0, 'C': 0.0, 'a': 0.4, 'A': 0.3999999999999999, 'B': 0.0} 1 {'c': 0.15999999999999998, 'd': 0.0, 'b': 0.0, 'C': 0.10666666666666669, 'a': 0.15999999999999998, 'A': 0.6666666666666666, 'B': 0.2666666666666667} 2 {'c': 0.3626666666666667, 'd': 0.09600000000000003, 'b': 0.053333333333333344, 'C': 0.04266666666666666, 'a': 0.32, 'A': 0.5066666666666666, 'B': 0.10666666666666665} 3 {'c': 0.24106666666666665, 'd': 0.03839999999999999, 'b': 0.02133333333333333, 'C': 0.13511111111111113, 'a': 0.22399999999999998, 'A': 0.624711111111111, 'B': 0.3057777777777778} 4 {'c': 0.36508444444444443, 'd': 0.11520000000000002, 'b': 0.06115555555555557, 'C': 0.07964444444444443, 'a': 0.31104, 'A': 0.5538844444444444, 'B': 0.1863111111111111} 5 {'c': 0.29067377777777775, 'd': 0.06911999999999999, 'b': 0.03726222222222222, 'C': 0.14343585185185187, 'a': 0.258816, 'A': 0.6217718518518518, 'B': 0.31677629629629633} 6 {'c': 0.3694383407407408, 'd': 0.12072960000000002, 'b': 0.06335525925925926, 'C': 0.1051610074074074, 'a': 0.312064, 'A': 0.5810394074074073, 'B': 0.2384971851851852} 7 {'c': 0.322179602962963, 'd': 0.08976384000000001, 'b': 0.047699437037037044, 'C': 0.14680873086419757, 'a': 0.2801152, 'A': 0.6233424908641975, 'B': 0.322318538271605} 8 {'c': 0.37252419634567907, 'd': 0.12318720000000004, 'b': 0.06446370765432101, 'C': 0.12182009679012348, 'a': 0.313800704, 'A': 0.5979606407901235, 'B': 0.27202572641975314} 9 {'c': 0.34231744031604944, 'd': 0.10313318400000002, 'b': 0.05440514528395063, 'C': 0.14861466569218112, 'a': 0.2935894016, 'A': 0.624860067292181, 'B': 0.3257059134156379} 10 {'c': 0.37453107587687245, 'd': 0.12458704896000004, 'b': 0.06514118268312759, 'C': 0.13253792435094652, 'a': 0.3150852096, 'A': 0.6087204113909463, 'B': 0.29349780121810704} 11 {'c': 0.35520289454037857, 'd': 0.11171472998400003, 'b': 0.05869956024362141, 'C': 0.14970977315116601, 'a': 0.3021877248, 'A': 0.6259090374071659, 'B': 0.3278568031376681} 12 {'c': 0.3758188848508664, 'd': 0.12545526988800004, 'b': 0.06557136062753362, 'C': 0.1394066638710343, 'a': 0.31593497559039996, 'A': 0.6155958617974342, 'B': 0.3072414019859314} 13 {'c': 0.3634492906645737, 'd': 0.1172109459456, 'b': 0.061448280397186285, 'C': 0.1504004772487644, 'a': 0.30768662511615996, 'A': 0.6265923595297243, 'B': 0.3292315559869513} 14 {'c': 0.37664344590878573, 'd': 0.126006502096896, 'b': 0.06584631119739026, 'C': 0.14380418922212634, 'a': 0.31648325500928, 'A': 0.6199944608903503, 'B': 0.3160374635863394} 15 {'c': 0.36872695276225853, 'd': 0.12072916840611841, 'b': 0.06320749271726787, 'C': 0.1508408530811013, 'a': 0.311205277073408, 'A': 0.6270315542460547, 'B': 0.3301112040427255} 16 {'c': 0.3771712037394075, 'd': 0.12635858204098563, 'b': 0.0660222408085451, 'C': 0.14661885476571632, 'a': 0.316834862506967, 'A': 0.6228092982326321, 'B': 0.3216669597688938} 17 {'c': 0.37210465315311814, 'd': 0.1229809338600653, 'b': 0.06433339195377877, 'C': 0.15112242048023627, 'a': 0.3134571112468316, 'A': 0.6273129326666287, 'B': 0.3306741581298592} 18 {'c': 0.3775089728847178, 'd': 0.12658379981806633, 'b': 0.06613483162597183, 'C': 0.1484202810515243, 'a': 0.3170600046926233, 'A': 0.6246107520062307, 'B': 0.32526983911327995} 19 {'c': 0.37426638104575805, 'd': 0.1244220802432657, 'b': 0.06505396782265599, 'C': 0.15130257936315128, 'a': 0.3148982686251483, 'A': 0.627493061312974, 'B': 0.3310344465409781}
{'A': 0.627493061312974, 'B': 0.3310344465409781, 'C': 0.15130257936315128, 'a': 0.3148982686251483, 'b': 0.06505396782265599, 'c': 0.37426638104575805, 'd': 0.1244220802432657}
MovieLens是一个电影评价的真实数据,由明尼苏达州立大学的GroupLens项目组开发。
http://grouplens.org/datasets/movielens/1m/
These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
made by 6,040 MovieLens users who joined MovieLens in 2000.
All ratings are contained in the file "ratings.dat" and are in the following format:
UserID::MovieID::Rating::Timestamp
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
def loadMovieLens(path='/Users/datalab/bigdata/cjc/ml-1m/'):
# Get movie titles
movies={}
for line in open(path+'movies.dat', encoding = 'iso-8859-15'):
(id,title)=line.split('::')[0:2]
movies[id]=title
# Load data
prefs={}
for line in open(path+'/ratings.dat'):
(user,movieid,rating,ts)=line.split('::')
prefs.setdefault(user,{})
prefs[user][movies[movieid]]=float(rating)
return prefs
prefs=loadMovieLens()
prefs['87']
{'Alice in Wonderland (1951)': 1.0, 'Army of Darkness (1993)': 3.0, 'Bad Boys (1995)': 5.0, 'Benji (1974)': 1.0, 'Brady Bunch Movie, The (1995)': 1.0, 'Braveheart (1995)': 5.0, 'Buffalo 66 (1998)': 1.0, 'Chambermaid on the Titanic, The (1998)': 1.0, 'Cowboy Way, The (1994)': 1.0, 'Cyrano de Bergerac (1990)': 4.0, 'Dear Diary (Caro Diario) (1994)': 1.0, 'Die Hard (1988)': 3.0, 'Diebinnen (1995)': 1.0, 'Dr. No (1962)': 1.0, 'Escape from the Planet of the Apes (1971)': 1.0, 'Fast, Cheap & Out of Control (1997)': 1.0, 'Faster Pussycat! Kill! Kill! (1965)': 1.0, 'From Russia with Love (1963)': 1.0, 'Fugitive, The (1993)': 5.0, 'Get Shorty (1995)': 1.0, 'Gladiator (2000)': 5.0, 'Goldfinger (1964)': 5.0, 'Good, The Bad and The Ugly, The (1966)': 4.0, 'Hunt for Red October, The (1990)': 5.0, 'Hurricane, The (1999)': 5.0, 'Indiana Jones and the Last Crusade (1989)': 4.0, 'Jaws (1975)': 5.0, 'Jurassic Park (1993)': 5.0, 'King Kong (1933)': 1.0, 'King of New York (1990)': 1.0, 'Last of the Mohicans, The (1992)': 1.0, 'Lethal Weapon (1987)': 5.0, 'Longest Day, The (1962)': 1.0, 'Man with the Golden Gun, The (1974)': 5.0, 'Mask of Zorro, The (1998)': 5.0, 'Matrix, The (1999)': 5.0, "On Her Majesty's Secret Service (1969)": 1.0, 'Out of Sight (1998)': 1.0, 'Palookaville (1996)': 1.0, 'Planet of the Apes (1968)': 1.0, 'Pope of Greenwich Village, The (1984)': 1.0, 'Princess Bride, The (1987)': 3.0, 'Raiders of the Lost Ark (1981)': 4.0, 'Rock, The (1996)': 5.0, 'Rocky (1976)': 5.0, 'Saving Private Ryan (1998)': 4.0, 'Shanghai Noon (2000)': 1.0, 'Speed (1994)': 1.0, 'Star Wars: Episode IV - A New Hope (1977)': 5.0, 'Star Wars: Episode V - The Empire Strikes Back (1980)': 5.0, 'Taking of Pelham One Two Three, The (1974)': 1.0, 'Terminator 2: Judgment Day (1991)': 5.0, 'Terminator, The (1984)': 4.0, 'Thelma & Louise (1991)': 1.0, 'True Romance (1993)': 1.0, 'U-571 (2000)': 5.0, 'Untouchables, The (1987)': 5.0, 'Westworld (1973)': 1.0, 'X-Men (2000)': 4.0}
getRecommendations(prefs,'87')[0:30]
[(5.0, 'Time of the Gypsies (Dom za vesanje) (1989)'), (5.0, 'Tigrero: A Film That Was Never Made (1994)'), (5.0, 'Schlafes Bruder (Brother of Sleep) (1995)'), (5.0, 'Return with Honor (1998)'), (5.0, 'Lured (1947)'), (5.0, 'Identification of a Woman (Identificazione di una donna) (1982)'), (5.0, 'I Am Cuba (Soy Cuba/Ya Kuba) (1964)'), (5.0, 'Hour of the Pig, The (1993)'), (5.0, 'Gay Deceivers, The (1969)'), (5.0, 'Gate of Heavenly Peace, The (1995)'), (5.0, 'Foreign Student (1994)'), (5.0, 'Dingo (1992)'), (5.0, 'Dangerous Game (1993)'), (5.0, 'Callejón de los milagros, El (1995)'), (5.0, 'Bittersweet Motel (2000)'), (4.820460101722989, 'Apple, The (Sib) (1998)'), (4.738956184936386, 'Lamerica (1994)'), (4.681816541467396, 'Bells, The (1926)'), (4.664958072522234, 'Hurricane Streets (1998)'), (4.650741840804562, 'Sanjuro (1962)'), (4.649974172600346, 'On the Ropes (1999)'), (4.636825408739504, 'Shawshank Redemption, The (1994)'), (4.627888709544556, 'For All Mankind (1989)'), (4.582048349280509, 'Midaq Alley (Callejón de los milagros, El) (1995)'), (4.579778646871153, "Schindler's List (1993)"), (4.57519994103739, 'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)'), (4.574904988403455, 'Godfather, The (1972)'), (4.5746840191882345, "Ed's Next Move (1996)"), (4.558519037147828, 'Hanging Garden, The (1997)'), (4.527760042775589, 'Close Shave, A (1995)')]
itemsim=calculateSimilarItems(prefs,n=50)
100 / 3706 200 / 3706 300 / 3706 400 / 3706 500 / 3706 600 / 3706 700 / 3706 800 / 3706 900 / 3706 1000 / 3706
getRecommendedItems(prefs,itemsim,'87')[0:30]
[(5.0, 'Uninvited Guest, An (2000)'), (5.0, 'Two Much (1996)'), (5.0, 'Two Family House (2000)'), (5.0, 'Trial by Jury (1994)'), (5.0, 'Tom & Viv (1994)'), (5.0, 'This Is My Father (1998)'), (5.0, 'Something to Sing About (1937)'), (5.0, 'Slappy and the Stinkers (1998)'), (5.0, 'Running Free (2000)'), (5.0, 'Roula (1995)'), (5.0, 'Prom Night IV: Deliver Us From Evil (1992)'), (5.0, 'Project Moon Base (1953)'), (5.0, 'Price Above Rubies, A (1998)'), (5.0, 'Open Season (1996)'), (5.0, 'Only Angels Have Wings (1939)'), (5.0, 'Onegin (1999)'), (5.0, 'Once Upon a Time... When We Were Colored (1995)'), (5.0, 'Office Killer (1997)'), (5.0, 'N\xe9nette et Boni (1996)'), (5.0, 'No Looking Back (1998)'), (5.0, 'Never Met Picasso (1996)'), (5.0, 'Music From Another Room (1998)'), (5.0, "Mummy's Tomb, The (1942)"), (5.0, 'Modern Affair, A (1995)'), (5.0, 'Machine, The (1994)'), (5.0, 'Lured (1947)'), (5.0, 'Low Life, The (1994)'), (5.0, 'Lodger, The (1926)'), (5.0, 'Loaded (1994)'), (5.0, 'Line King: Al Hirschfeld, The (1996)')]
In this notebook we will import Turicreate and use it to
%matplotlib inline
import turicreate as tc
import matplotlib.pyplot as plt
sf = tc.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],
'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],
'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
sf
item_id | rating | user_id |
---|---|---|
a | 1 | 0 |
b | 3 | 0 |
c | 2 | 0 |
a | 5 | 1 |
b | 4 | 1 |
b | 1 | 2 |
c | 4 | 2 |
d | 3 | 2 |
m = tc.recommender.create(sf, target='rating')
recs = m.recommend()
recs
Preparing data set.
Data has 8 observations with 3 users and 4 items.
Data prepared in: 0.003196s
Training ranking_factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter | Description | Value |
+--------------------------------+--------------------------------------------------+----------+
| num_factors | Factor Dimension | 32 |
| regularization | L2 Regularization on Factors | 1e-09 |
| solver | Solver used for training | sgd |
| linear_regularization | L2 Regularization on Linear Coefficients | 1e-09 |
| ranking_regularization | Rank-based Regularization Weight | 0.25 |
| max_iterations | Maximum Number of Iterations | 25 |
+--------------------------------+--------------------------------------------------+----------+
Optimizing model using SGD; tuning step size.
Using 8 / 8 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value |
+---------+-------------------+------------------------------------------+
| 0 | 25 | Not Viable |
| 1 | 6.25 | Not Viable |
| 2 | 1.5625 | Not Viable |
| 3 | 0.390625 | 2.79772 |
| 4 | 0.195312 | 2.74043 |
| 5 | 0.0976562 | 2.79932 |
| 6 | 0.0488281 | 3.00656 |
| 7 | 0.0244141 | 3.28278 |
+---------+-------------------+------------------------------------------+
| Final | 0.195312 | 2.74043 |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 141us | 3.89999 | 1.3637 | |
+---------+--------------+-------------------+-----------------------+-------------+
| 1 | 762us | 4.11274 | 1.60028 | 0.195312 |
| 2 | 1.298ms | 3.00481 | 1.39367 | 0.116134 |
| 3 | 1.73ms | 2.6827 | 1.23558 | 0.0856819 |
| 4 | 2.257ms | 2.49202 | 1.17326 | 0.0580668 |
| 5 | 2.757ms | 2.4385 | 1.16425 | 0.0491185 |
| 10 | 5.116ms | 2.28964 | 1.08345 | 0.029206 |
| 25 | 12.067ms | 2.20318 | 1.06674 | 0.0146899 |
user_id | item_id | score | rank |
---|---|---|---|
0 | d | 1.2718666195869446 | 1 |
1 | c | 4.015937566757202 | 1 |
1 | d | 3.1318363547325134 | 2 |
2 | a | 2.478732317686081 | 1 |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
Final objective value: 2.79567
Final training RMSE: 1.03992
Loading of the CourseTalk database.
#train_file = 'http://s3.amazonaws.com/dato-datasets/millionsong/10000.txt'
train_file = '../data/ratings.dat'
sf = tc.SFrame.read_csv(train_file, header=False,
delimiter='|', verbose=False)
sf = sf.rename({'X1':'user_id', 'X2':'course_id', 'X3':'rating'})
sf.show()
Materializing SFrame
In order to evaluate the performance of our model, we randomly split the observations in our data set into two partitions: we will use train_set
when creating our model and test_set
for evaluating its performance.
sf
user_id | course_id | rating |
---|---|---|
1 | 1 | 5.0 |
2 | 1 | 5.0 |
3 | 1 | 5.0 |
4 | 1 | 5.0 |
5 | 1 | 5.0 |
6 | 1 | 5.0 |
7 | 1 | 5.0 |
8 | 1 | 5.0 |
9 | 1 | 5.0 |
10 | 1 | 5.0 |
train_set, test_set = sf.random_split(0.8, seed=1)
Create a model that makes recommendations using item popularity. When no target column is provided, the popularity is determined by the number of observations involving each item. When a target is provided, popularity is computed using the item’s mean target value. When the target column contains ratings, for example, the model computes the mean rating for each item and uses this to rank items for recommendations.
One typically wants to initially create a simple recommendation system that can be used as a baseline and to verify that the rest of the pipeline works as expected. The recommender
package has several models available for this purpose. For example, we can create a model that predicts songs based on their overall popularity across all users.
popularity_model = tc.popularity_recommender.create(train_set, 'user_id', 'course_id', target = 'rating')
Preparing data set.
Data has 2202 observations with 1651 users and 201 items.
Data prepared in: 0.006547s
2202 observations to process; with 201 unique items.
If your data is implicit, i.e., you only observe interactions between users and items, without a rating, then use ItemSimilarityModel with Jaccard similarity.
If your data is explicit, i.e., the observations include an actual rating given by the user, then you have a wide array of options. ItemSimilarityModel with cosine or Pearson similarity can incorporate ratings. In addition, MatrixFactorizationModel, FactorizationModel, as well as LinearRegressionModel all support rating prediction.
itemsim_cosine_model = graphlab.recommender.create(data, target=’rating’, method=’item_similarity’, similarity_type=’cosine’)
factorization_machine_model = graphlab.recommender.create(data, target=’rating’, method=’factorization_model’)
In the following code block, we compute all the item-item similarities and create an object that can be used for recommendations.
item_sim_model = tc.item_similarity_recommender.create(
train_set, 'user_id', 'course_id', target = 'rating',
similarity_type='cosine')
Preparing data set.
Data has 2202 observations with 1651 users and 201 items.
Data prepared in: 0.006935s
Training model from provided data.
Gathering per-item and per-user statistics.
+--------------------------------+------------+
| Elapsed Time (Item Statistics) | % Complete |
+--------------------------------+------------+
| 2.388ms | 60.5 |
| 2.607ms | 100 |
+--------------------------------+------------+
Setting up lookup tables.
Processing data in one pass using dense lookup tables.
+-------------------------------------+------------------+-----------------+
| Elapsed Time (Constructing Lookups) | Total % Complete | Items Processed |
+-------------------------------------+------------------+-----------------+
| 3.247ms | 0 | 0 |
| 6.225ms | 100 | 201 |
+-------------------------------------+------------------+-----------------+
Finalizing lookup tables.
Generating candidate set for working with new users.
Finished training in 0.008353s
Create a FactorizationRecommender that learns latent factors for each user and item and uses them to make rating predictions. This includes both standard matrix factorization as well as factorization machines models (in the situation where side data is available for users and/or items). link
factorization_machine_model = tc.recommender.factorization_recommender.create(
train_set, 'user_id', 'course_id',
target='rating')
Preparing data set.
Data has 2202 observations with 1651 users and 201 items.
Data prepared in: 0.006938s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter | Description | Value |
+--------------------------------+--------------------------------------------------+----------+
| num_factors | Factor Dimension | 8 |
| regularization | L2 Regularization on Factors | 1e-08 |
| solver | Solver used for training | sgd |
| linear_regularization | L2 Regularization on Linear Coefficients | 1e-10 |
| max_iterations | Maximum Number of Iterations | 50 |
+--------------------------------+--------------------------------------------------+----------+
Optimizing model using SGD; tuning step size.
Using 2202 / 2202 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value |
+---------+-------------------+------------------------------------------+
| 0 | 25 | Not Viable |
| 1 | 6.25 | Not Viable |
| 2 | 1.5625 | Not Viable |
| 3 | 0.390625 | 0.131214 |
| 4 | 0.195312 | 0.168634 |
| 5 | 0.0976562 | 0.237488 |
| 6 | 0.0488281 | 0.338842 |
+---------+-------------------+------------------------------------------+
| Final | 0.390625 | 0.131214 |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 61us | 0.891401 | 0.94414 | |
+---------+--------------+-------------------+-----------------------+-------------+
| 1 | 31.222ms | 0.904366 | 0.950979 | 0.390625 |
| 2 | 63.351ms | 0.493281 | 0.702338 | 0.232267 |
| 3 | 100.4ms | 0.273931 | 0.523383 | 0.171364 |
| 4 | 143.322ms | 0.20113 | 0.448475 | 0.116134 |
| 5 | 174.795ms | 0.150903 | 0.388462 | 0.098237 |
| 10 | 318.538ms | 0.0530298 | 0.230279 | 0.0584121 |
| 50 | 1.50s | 0.00183493 | 0.04281 | 0.0174693 |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
Final objective value: 0.00162513
Final training RMSE: 0.0402852
It's straightforward to use GraphLab to compare models on a small subset of users in the test_set
. The precision-recall plot that is computed shows the benefits of using the similarity-based model instead of the baseline popularity_model
: better curves tend toward the upper-right hand corner of the plot.
The following command finds the top-ranked items for all users in the first 500 rows of test_set
. The observations in train_set
are not included in the predicted items.
result = tc.recommender.util.compare_models(
test_set, [popularity_model, item_sim_model, factorization_machine_model],
user_sample=.5, skip_set=train_set)
compare_models: using 246 users to estimate model performance PROGRESS: Evaluate model M0 Precision and recall summary statistics by cutoff +--------+-----------------------+-----------------------+ | cutoff | mean_precision | mean_recall | +--------+-----------------------+-----------------------+ | 1 | 0.0 | 0.0 | | 2 | 0.0 | 0.0 | | 3 | 0.0 | 0.0 | | 4 | 0.0 | 0.0 | | 5 | 0.0 | 0.0 | | 6 | 0.0006775067750677507 | 0.0020325203252032527 | | 7 | 0.0005807200929152149 | 0.0020325203252032527 | | 8 | 0.0010162601626016261 | 0.006097560975609755 | | 9 | 0.0009033423667570006 | 0.006097560975609755 | | 10 | 0.0008130081300813011 | 0.006097560975609755 | +--------+-----------------------+-----------------------+ [10 rows x 3 columns] Overall RMSE: 0.7309854474588884 Per User RMSE (best) +---------+------+-------+ | user_id | rmse | count | +---------+------+-------+ | 1621 | 0.0 | 1 | +---------+------+-------+ [1 rows x 3 columns] Per User RMSE (worst) +---------+-------------------+-------+ | user_id | rmse | count | +---------+-------------------+-------+ | 2009 | 3.006811989100817 | 1 | +---------+-------------------+-------+ [1 rows x 3 columns] Per Item RMSE (best) +-----------+------+-------+ | course_id | rmse | count | +-----------+------+-------+ | 8 | 0.0 | 1 | +-----------+------+-------+ [1 rows x 3 columns] Per Item RMSE (worst) +-----------+--------------------+-------+ | course_id | rmse | count | +-----------+--------------------+-------+ | 144 | 3.0954212400029215 | 3 | +-----------+--------------------+-------+ [1 rows x 3 columns] PROGRESS: Evaluate model M1 Precision and recall summary statistics by cutoff +--------+----------------------+----------------------+ | cutoff | mean_precision | mean_recall | +--------+----------------------+----------------------+ | 1 | 0.004065040650406504 | 0.004065040650406504 | | 2 | 0.006097560975609755 | 0.010162601626016263 | | 3 | 0.005420054200542005 | 0.011178861788617888 | | 4 | 0.007113821138211382 | 0.018292682926829285 | | 5 | 0.008130081300813007 | 0.026422764227642268 | | 6 | 0.007452574525745257 | 0.03048780487804879 | | 7 | 0.00696864111498258 | 0.03455284552845529 | | 8 | 0.007113821138211383 | 0.0426829268292683 | | 9 | 0.006775067750677503 | 0.04471544715447155 | | 10 | 0.006097560975609756 | 0.04471544715447156 | +--------+----------------------+----------------------+ [10 rows x 3 columns] Overall RMSE: 4.586134473539705 Per User RMSE (best) +---------+------+-------+ | user_id | rmse | count | +---------+------+-------+ | 1300 | 0.5 | 1 | +---------+------+-------+ [1 rows x 3 columns] Per User RMSE (worst) +---------+------+-------+ | user_id | rmse | count | +---------+------+-------+ | 1704 | 5.0 | 1 | +---------+------+-------+ [1 rows x 3 columns] Per Item RMSE (best) +-----------+------+-------+ | course_id | rmse | count | +-----------+------+-------+ | 200 | 0.5 | 1 | +-----------+------+-------+ [1 rows x 3 columns] Per Item RMSE (worst) +-----------+------+-------+ | course_id | rmse | count | +-----------+------+-------+ | 43 | 5.0 | 1 | +-----------+------+-------+ [1 rows x 3 columns] PROGRESS: Evaluate model M2 Precision and recall summary statistics by cutoff +--------+-----------------------+-----------------------+ | cutoff | mean_precision | mean_recall | +--------+-----------------------+-----------------------+ | 1 | 0.0 | 0.0 | | 2 | 0.0020325203252032522 | 0.0010162601626016261 | | 3 | 0.002710027100271004 | 0.005081300813008129 | | 4 | 0.003048780487804878 | 0.009146341463414632 | | 5 | 0.0024390243902439033 | 0.009146341463414632 | | 6 | 0.0020325203252032527 | 0.009146341463414632 | | 7 | 0.001742160278745646 | 0.009146341463414632 | | 8 | 0.001524390243902439 | 0.009146341463414632 | | 9 | 0.0013550135501355005 | 0.009146341463414632 | | 10 | 0.0016260162601626018 | 0.013211382113821135 | +--------+-----------------------+-----------------------+ [10 rows x 3 columns] Overall RMSE: 0.8346975060578289 Per User RMSE (best) +---------+-----------------------+-------+ | user_id | rmse | count | +---------+-----------------------+-------+ | 1156 | 0.0025112800279867287 | 1 | +---------+-----------------------+-------+ [1 rows x 3 columns] Per User RMSE (worst) +---------+--------------------+-------+ | user_id | rmse | count | +---------+--------------------+-------+ | 1646 | 3.3541036468019474 | 1 | +---------+--------------------+-------+ [1 rows x 3 columns] Per Item RMSE (best) +-----------+---------------------+-------+ | course_id | rmse | count | +-----------+---------------------+-------+ | 129 | 0.00681198910081271 | 1 | +-----------+---------------------+-------+ [1 rows x 3 columns] Per Item RMSE (worst) +-----------+--------------------+-------+ | course_id | rmse | count | +-----------+--------------------+-------+ | 144 | 3.5404041983447554 | 3 | +-----------+--------------------+-------+ [1 rows x 3 columns]
Now let's ask the item similarity model for song recommendations on several users. We first create a list of users and create a subset of observations, users_ratings
, that pertain to these users.
K = 10
users = tc.SArray(sf['user_id'].unique().head(100))
users
dtype: int Rows: 100 [232, 363, 431, 738, 1860, 732, 187, 1368, 1753, 764, 926, 1180, 1323, 1742, 1685, 1876, 1232, 614, 1573, 786, 1158, 1072, 863, 695, 454, 1211, 1404, 1242, 696, 444, 349, 1883, 499, 354, 573, 1531, 71, 1889, 312, 578, 1556, 1572, 79, 416, 848, 1805, 550, 1644, 1017, 521, 566, 703, 196, 1401, 533, 1054, 398, 1077, 341, 982, 516, 1473, 1982, 1670, 529, 1544, 1920, 1414, 223, 465, 376, 1231, 193, 202, 1128, 1853, 1907, 1331, 966, 810, 1895, 1704, 267, 1615, 350, 979, 69, 138, 1645, 837, 1841, 1801, 853, 803, 405, 1172, 112, 1653, 937, 1429]
Next we use the recommend()
function to query the model we created for recommendations. The returned object has four columns: user_id
, song_id
, the score
that the algorithm gave this user for this song, and the song's rank (an integer from 0 to K-1). To see this we can grab the top few rows of recs
:
recs = item_sim_model.recommend(users=users, k=K)
recs.head()
user_id | course_id | score | rank |
---|---|---|---|
232 | 93 | 0.21959692239761353 | 1 |
232 | 180 | 0.2127474546432495 | 2 |
232 | 188 | 0.20187050104141235 | 3 |
232 | 108 | 0.17809301614761353 | 4 |
232 | 55 | 0.1753571629524231 | 5 |
232 | 168 | 0.16715019941329956 | 6 |
232 | 133 | 0.16673386096954346 | 7 |
232 | 186 | 0.16153007745742798 | 8 |
232 | 164 | 0.1601088047027588 | 9 |
232 | 187 | 0.15785843133926392 | 10 |
To learn what songs these ids pertain to, we can merge in metadata about each song.
# Get the meta data of the courses
courses = tc.SFrame.read_csv('../data/cursos.dat', header=False, delimiter='|', verbose=False)
courses =courses.rename({'X1':'course_id', 'X2':'title', 'X3':'avg_rating',
'X4':'workload', 'X5':'university', 'X6':'difficulty', 'X7':'provider'})
courses.show()
courses = courses[['course_id', 'title', 'provider']]
results = recs.join(courses, on='course_id', how='inner')
#Populate observed user-course data with course info
userset = frozenset(users)
ix = sf['user_id'].apply(lambda x: x in userset, int)
user_data = sf[ix]
user_data = user_data.join(courses, on='course_id')[['user_id', 'title', 'provider']]
Materializing SFrame
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) ~/Applications/anaconda/lib/python3.5/site-packages/turicreate/data_structures/sframe.py in join(self, right, on, how) 4351 with cython_context(): -> 4352 return SFrame(_proxy=self.__proxy__.join(right.__proxy__, how, join_keys)) 4353 turicreate/_cython/cy_sframe.pyx in turicreate._cython.cy_sframe.UnitySFrameProxy.join() turicreate/_cython/cy_sframe.pyx in turicreate._cython.cy_sframe.UnitySFrameProxy.join() RuntimeError: Column name course_id does not exist. During handling of the above exception, another exception occurred: RuntimeError Traceback (most recent call last) <ipython-input-42-09739df6b78f> in <module>() 6 7 courses = courses[['course_id', 'title', 'provider']] ----> 8 results = recs.join(courses, on='course_id', how='inner') 9 10 #Populate observed user-course data with course info ~/Applications/anaconda/lib/python3.5/site-packages/turicreate/data_structures/sframe.py in join(self, right, on, how) 4350 4351 with cython_context(): -> 4352 return SFrame(_proxy=self.__proxy__.join(right.__proxy__, how, join_keys)) 4353 4354 def filter_by(self, values, column_name, exclude=False): ~/Applications/anaconda/lib/python3.5/site-packages/turicreate/_cython/context.py in __exit__(self, exc_type, exc_value, traceback) 47 if not self.show_cython_trace: 48 # To hide cython trace, we re-raise from here ---> 49 raise exc_type(exc_value) 50 else: 51 # To show the full trace, we do nothing and let exception propagate RuntimeError: Column name course_id does not exist.
# Print out some recommendations
for i in range(5):
user = list(users)[i]
print("User: " + str(i + 1))
user_obs = user_data[user_data['user_id'] == user].head(K)
del user_obs['user_id']
user_recs = results[results['user_id'] == str(user)][['title', 'provider']]
print("We were told that the user liked these courses: ")
print (user_obs.head(K))
print ("We recommend these other courses:")
print (user_recs.head(K))
print ("")
User: 1 We were told that the user liked these courses: +-------------------------------+----------+ | title | provider | +-------------------------------+----------+ | An Introduction to Interac... | coursera | +-------------------------------+----------+ [1 rows x 2 columns] We recommend these other courses: +-------+----------+ | title | provider | +-------+----------+ +-------+----------+ [0 rows x 2 columns] User: 2 We were told that the user liked these courses: +-------------------------------+----------+ | title | provider | +-------------------------------+----------+ | An Introduction to Interac... | coursera | +-------------------------------+----------+ [1 rows x 2 columns] We recommend these other courses: +-------+----------+ | title | provider | +-------+----------+ +-------+----------+ [0 rows x 2 columns] User: 3 We were told that the user liked these courses: +-------------------------------+----------+ | title | provider | +-------------------------------+----------+ | An Introduction to Interac... | coursera | +-------------------------------+----------+ [1 rows x 2 columns] We recommend these other courses: +-------+----------+ | title | provider | +-------+----------+ +-------+----------+ [0 rows x 2 columns] User: 4 We were told that the user liked these courses: +-------------------------------+----------+ | title | provider | +-------------------------------+----------+ | A Beginner's Guide to ... | coursera | | Gamification | coursera | +-------------------------------+----------+ [2 rows x 2 columns] We recommend these other courses: +-------+----------+ | title | provider | +-------+----------+ +-------+----------+ [0 rows x 2 columns] User: 5 We were told that the user liked these courses: +-------------------------------+----------+ | title | provider | +-------------------------------+----------+ | Web Intelligence and Big Data | coursera | +-------------------------------+----------+ [1 rows x 2 columns] We recommend these other courses: +-------+----------+ | title | provider | +-------+----------+ +-------+----------+ [0 rows x 2 columns]