-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnetflix.py
128 lines (96 loc) · 4.09 KB
/
netflix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Code from the book "Programming Collective Intelligence".
from math import sqrt
def transformPrefs(prefs):
    """Swap the two key levels of a nested preference dictionary.

    Given ``{person: {item: rating}}`` this returns
    ``{item: {person: rating}}``; the rating values are carried over
    unchanged. The input dictionary is not modified.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # Create the inner dict on first sight of the item, then file
            # the rating under the person who gave it.
            flipped.setdefault(item, {})[person] = rating
    return flipped
def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity for two entries of prefs.

    Only keys rated by both ``person1`` and ``person2`` are compared. The
    result is ``1 / (1 + d)`` where ``d`` is the Euclidean distance between
    the two rating vectors, so identical ratings give 1.0 and the value
    shrinks towards 0 as the ratings diverge. Returns 0 when the two have
    nothing in common.
    """
    # Note carried over from the original: the book's listing of this
    # function contains a typo.
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    first = prefs[person1]
    second = prefs[person2]
    squared_gap = 0
    for item in shared:
        squared_gap += (first[item] - second[item]) ** 2
    return 1 / (1 + sqrt(squared_gap))
def sim_pearson(prefs, person1, person2):
    """Return the Pearson correlation of two entries of prefs.

    Only keys rated by both ``person1`` and ``person2`` are used. The
    result lies (up to rounding) between -1 and 1.

    Returns 0 when there is nothing in common, or when either set of shared
    ratings has no spread (the correlation is undefined in that case).
    """
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    # float so the divisions below are true divisions on Python 2 as well
    n = float(len(shared))
    if n == 0:
        return 0  # 0 means no linear relationship between the two variables

    ratings1 = [prefs[person1][item] for item in shared]
    ratings2 = [prefs[person2][item] for item in shared]

    # Sums of the ratings, of their squares, and of their pairwise products.
    sum1 = sum(ratings1)
    sum2 = sum(ratings2)
    sumSq1 = sum([r ** 2 for r in ratings1])
    sumSq2 = sum([r ** 2 for r in ratings2])
    pSum = sum([a * b for a, b in zip(ratings1, ratings2)])

    numerator = pSum - (sum1 * sum2) / n
    spread = (sumSq1 - sum1 ** 2 / n) * (sumSq2 - sum2 ** 2 / n)
    # Each factor is mathematically >= 0, but floating-point rounding can
    # leave a (near-)constant rating set with a tiny negative value. That
    # made the product negative and sqrt() raise ValueError; treat it the
    # same as a zero spread instead.
    if spread <= 0:
        return 0
    return numerator / sqrt(spread)
def topMatches(prefs, person, n=5, similarityMetric=sim_distance):
    """Return the ``n`` entries of prefs most similar to ``person``.

    ``similarityMetric(prefs, person, other)`` is evaluated for every other
    key of ``prefs``. The result is a list of ``(score, other)`` tuples
    ordered from the highest score down, truncated to ``n`` entries.

    Worth comparing, to see how the metrics differ and which feels more
    intuitive:
        topMatches(critics, 'Lisa Rose', similarityMetric=sim_pearson)
        topMatches(critics, 'Lisa Rose', similarityMetric=sim_distance)
    """
    ranked = []
    for other in prefs:
        # Never compare the subject with itself.
        if other != person:
            ranked.append((similarityMetric(prefs, person, other), other))

    # Ascending sort, then flip so the best matches come first.
    ranked.sort()
    return ranked[::-1][:n]
def calculateSimilarItems(prefs, n=10):
    """Build a lookup of the ``n`` most similar entries for every key of prefs.

    ``prefs`` is used as given (it is not inverted here), so its top-level
    keys are the things being compared. Similarity is measured with
    ``sim_distance``.

    Returns a dict mapping each key to the list of ``(score, other)`` tuples
    produced by ``topMatches``.

    Progress is written to stdout: every key as it is processed, plus a
    ``done / total`` line after each 100 keys.
    """
    result = {}
    total = len(prefs)
    for count, item in enumerate(prefs, 1):
        # Single-argument print(...) prints the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print(item)
        if count % 100 == 0:
            print("%d / %d" % (count, total))
        result[item] = topMatches(prefs, item, n=n,
                                  similarityMetric=sim_distance)
    return result
def getRecommendedItems(prefs, itemMatch, user):
    """Rank the items ``user`` has not rated by a predicted rating.

    Parameters:
        prefs: item-centric ratings, ``{item: {user: rating}}``.
        itemMatch: ``{item: [(similarity, other_item), ...]}``, e.g. the
            output of ``calculateSimilarItems``.
        user: the user to produce recommendations for.

    Returns a list of ``(predicted_rating, item)`` tuples sorted from the
    highest prediction down. Each prediction is the similarity-weighted
    average of the user's own ratings of the neighbouring items. Candidates
    whose similarities sum to zero are left out, since no weighted average
    exists for them.

    Raises KeyError if ``user`` has not rated anything in ``prefs``, or if
    a rated item is missing from ``itemMatch``.
    """
    # Collect only this user's ratings instead of inverting the whole
    # data set just to read a single entry from it.
    userRatings = {}
    for item, ratings in prefs.items():
        if user in ratings:
            userRatings[item] = ratings[user]
    if not userRatings:
        raise KeyError(user)

    scores = {}
    totalSim = {}
    # Loop over the items rated by this user...
    for item, rating in userRatings.items():
        # ...and over the items similar to each of them.
        for similarity, item2 in itemMatch[item]:
            # Ignore anything the user has already rated.
            if item2 in userRatings:
                continue
            # Weighted sum of rating * similarity, and the sum of weights.
            scores[item2] = scores.get(item2, 0) + similarity * rating
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by the total weighting to get an average.
    # This only works well with lots of overlap between what users have
    # rated; for smaller, sparser data, dividing by sqrt(totalSim[item])
    # instead seemed to give more expected results - evaluate both.
    # A zero total weight would raise ZeroDivisionError, so skip those.
    rankings = [(score / totalSim[item], item)
                for item, score in scores.items()
                if totalSim[item] != 0]

    # Highest predictions first.
    rankings.sort(reverse=True)
    return rankings