-
Notifications
You must be signed in to change notification settings - Fork 69
/
recommendations.py
142 lines (103 loc) · 4.2 KB
/
recommendations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from math import sqrt
import collections
import clusters
# prefs is a map from people to a map from things to scores
def sim_distance(prefs, person1, person2):
    """Distance-based similarity between two entries of prefs.

    prefs maps each person to a map from item to score.  Only items scored
    by both person1 and person2 take part in the comparison.

    Returns 0 when the two share no items; otherwise 1/(1 + d), where d is
    clusters.hypot() applied to the per-item score differences (presumably
    their Euclidean norm -- confirm in the clusters module).
    """
    mine = prefs[person1]
    # Score difference for every item that both entries have rated.
    diffs = {item: mine[item] - prefs[person2][item]
             for item in mine if item in prefs[person2]}
    if not diffs:
        return 0
    return 1/(1 + clusters.hypot(diffs.values()))
def sim_pearson(prefs, person1, person2):
    """Pearson-based similarity between two entries of prefs.

    prefs maps each person to a map from item to score.  The scores of the
    items rated by both person1 and person2 are collected into two aligned
    vectors and handed to clusters.pearson().

    Special cases:
      * no shared items  -> 0, matching sim_distance and sim_tanimoto,
        instead of passing empty vectors to clusters.pearson();
      * exactly one shared item -> falls back to sim_distance(), because a
        single data point confuses the Pearson metric.
    """
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        # Nothing in common: there is no evidence of similarity at all.
        return 0
    if len(shared) == 1:  # confuses pearson metric
        return sim_distance(prefs, person1, person2)
    v1 = [prefs[person1][item] for item in shared]
    v2 = [prefs[person2][item] for item in shared]
    return clusters.pearson(v1, v2)
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return the n entries of prefs most similar to person.

    Every other key of prefs is scored with similarity(prefs, person, other).
    The result is a list of at most n (score, other) tuples, best match
    first.
    """
    ranked = sorted(
        ((similarity(prefs, person, other), other)
         for other in prefs if other != person),
        reverse=True)
    return ranked[:n]
def getRecommendations(prefs, person, similarity=sim_pearson):
    """Recommend items to person from the scores of similar people.

    For every other person with a positive similarity to person, each item
    that person has not scored (or has scored 0) accumulates that other
    person's score weighted by the similarity.  The accumulated value is
    divided by the sum of the contributing similarities.

    Returns a list of (predicted score, item) tuples, highest score first.
    """
    weighted = collections.defaultdict(float)
    simTotals = collections.defaultdict(float)
    for other in prefs:
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        if sim <= 0:
            continue
        for item, score in prefs[other].items():
            # Only items that person has not rated yet are candidates.
            if prefs[person].get(item, 0) == 0:
                weighted[item] += score*sim  # weight score by similarity
                simTotals[item] += sim
    # Because of the renormalization, an item known to only one other person
    # ends up with exactly that person's score, no matter how similar they
    # are to person.
    rankings = [(total/simTotals[item], item)
                for item, total in weighted.items()]
    rankings.sort(reverse=True)
    return rankings
def transformPrefs(prefs):
    """Invert a person -> item -> score map into item -> person -> score.

    The result (a defaultdict of dicts) can be passed to topMatches() in
    place of prefs to find items similar to a given item.
    """
    byItem = collections.defaultdict(dict)
    for person, ratings in prefs.items():
        for item, score in ratings.items():
            byItem[item][person] = score
    return byItem
def calculateSimilarItems(prefs, n=10):
    """Precompute, for every item, its n most similar items.

    This is item-based collaborative filtering, as opposed to the user-based
    filtering of getRecommendations().  prefs (person -> item -> score) is
    inverted with transformPrefs(), and each item is then matched against
    all other items with topMatches() using sim_distance.  Items come out as
    similar when they are rated alike by the same people (roughly).

    This is useful because item sets are more stable than person sets.

    Returns a dict mapping each item to the list of (similarity, other item)
    tuples produced by topMatches().  A progress line "done / total" is
    printed to standard output after every 100 items.
    """
    # Invert preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)
    result = {}
    for count, item in enumerate(itemPrefs, 1):
        # Status updates for large datasets.  The parenthesised form prints
        # the same text under Python 2 and is valid syntax under Python 3.
        if count % 100 == 0:
            print('%d / %d' % (count, total))
        # Find the items most similar to current one
        result[item] = topMatches(itemPrefs, item, n=n,
                                  similarity=sim_distance)
    return result
def getRecommendedItems(prefs, itemMatch, user):
    """Recommend items to user based on precomputed item similarities.

    prefs maps person -> item -> score.  itemMatch maps each item to a list
    of (similarity, other item) tuples, e.g. the result of
    calculateSimilarItems().  Because only the user's own ratings are
    walked, this is faster than getRecommendations(), which loops over all
    users.

    Each item the user has not rated yet gets the similarity-weighted
    average of the user's ratings of the items it is similar to.  Candidates
    whose similarities sum to zero are left out, since that average is
    undefined for them.

    Returns a list of (predicted score, item) tuples, highest score first.
    Raises KeyError if user is missing from prefs or a rated item is missing
    from itemMatch.
    """
    userRatings = prefs[user]
    scores = collections.defaultdict(float)
    totalSim = collections.defaultdict(float)
    for item, rating in userRatings.items():
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated item2.
            if item2 in userRatings:
                continue
            scores[item2] += similarity * rating
            totalSim[item2] += similarity
    # Normalize scores.  Skip candidates with zero total similarity: dividing
    # by it would raise ZeroDivisionError.
    rankings = [(score / totalSim[item], item)
                for item, score in scores.items()
                if totalSim[item] != 0]
    return sorted(rankings, reverse=True)
def sim_tanimoto(prefs, person1, person2):
    """Tanimoto coefficient between two entries of prefs.

    prefs maps each person to a map from item to score.  With a and b the
    sums of squared scores of person1 and person2 and c the sum of the
    products of their scores, the result is c / (a + b - c).  All three sums
    run over the items rated by both persons only; an alternative would be
    to take a and b over each person's full set of ratings.

    Returns 0 when the two share no items, and also when every shared score
    is zero (the denominator is then zero and the coefficient undefined).
    """
    # get common items
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0
    a = sum(prefs[person1][k] ** 2 for k in shared)
    b = sum(prefs[person2][k] ** 2 for k in shared)
    c = sum(prefs[person1][k] * prefs[person2][k] for k in shared)
    denominator = a + b - c
    if denominator == 0:
        return 0
    # float() forces true division: with integer scores, Python 2's "/"
    # would truncate the coefficient to 0 or 1.
    return float(c) / denominator