-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecommender_systems.py
176 lines (140 loc) · 6.98 KB
/
recommender_systems.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2018 David Arroyo Menéndez
# Author: David Arroyo Menéndez <[email protected]>
# Maintainer: David Arroyo Menéndez <[email protected]>
# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
# This file is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with GNU Emacs; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA,
from __future__ import division
import math, random
from collections import defaultdict, Counter
from linear_algebra import dot
# Sample data set: one list of interests per user. Other code in this file
# addresses a user by its position (index) in this list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

# (interest, count) pairs over every user's interests, most common first.
popular_interests = Counter(
    topic
    for topics in users_interests
    for topic in topics
).most_common()
def most_popular_new_interests(user_interests, max_results=5):
    """suggest popular interests that are not already in user_interests

    walks popular_interests in its existing order, skips every interest
    found in user_interests, and returns at most max_results of the
    remaining (interest, frequency) pairs"""
    candidates = []
    for interest, frequency in popular_interests:
        if interest in user_interests:
            continue
        candidates.append((interest, frequency))
    return candidates[:max_results]
#
# User-Based Collaborative Filtering
#
def cosine_similarity(v, w):
    """return the cosine of the angle between vectors v and w

    computed as dot(v, w) / sqrt(dot(v, v) * dot(w, w)); when that
    denominator is zero (for example an all-zero vector) the similarity is
    reported as 0.0 instead of raising ZeroDivisionError"""
    norm_product = math.sqrt(dot(v, v) * dot(w, w))
    if norm_product == 0:
        # The cosine is undefined here; 0.0 means "no similarity", which the
        # callers in this file already discard via their `similarity > 0` test.
        return 0.0
    return dot(v, w) / norm_product
# Every distinct interest across all users, in sorted order. The position of
# an interest in this list is its index in the interest vectors built below.
unique_interests = sorted({
    topic
    for topics in users_interests
    for topic in topics
})
def make_user_interest_vector(user_interests):
    """turn a list of interests into a 0/1 vector aligned with
    unique_interests: position i holds 1 when unique_interests[i] appears
    in user_interests and 0 when it does not"""
    flags = []
    for known_interest in unique_interests:
        # `in` always yields a bool, so int() gives exactly 1 or 0
        flags.append(int(known_interest in user_interests))
    return flags
# One 0/1 interest vector per user, in the same order as users_interests.
# Materialised as a list: on Python 3 a bare map() is a one-shot iterator,
# but this matrix is iterated in nested loops below and indexed by user id
# elsewhere in the file.
user_interest_matrix = list(map(make_user_interest_vector, users_interests))

# user_similarities[i][j] is the cosine similarity between the interest
# vectors of user i and user j.
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j)
                      for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]
def most_similar_users_to(user_id):
    """return (other_user_id, similarity) pairs for the users similar to
    user_id, most similar first

    user_id itself and any user whose similarity is not strictly positive
    are left out"""
    pairs = [(other_user_id, similarity)                      # find other
             for other_user_id, similarity in                 # users with
             enumerate(user_similarities[user_id])            # nonzero
             if user_id != other_user_id and similarity > 0]  # similarity

    # The key takes the whole pair and indexes it: tuple-unpacking lambda
    # parameters (`lambda (_, s): s`) are a syntax error on Python 3.
    return sorted(pairs,
                  key=lambda pair: pair[1],
                  reverse=True)
def user_based_suggestions(user_id, include_current_interests=False):
    """suggest interests for user_id based on what similar users like

    every interest of every similar user is weighted by that user's
    similarity to user_id and the weights are summed per interest; the
    result is a list of (interest, weight) pairs, heaviest first

    interests user_id already has are dropped unless
    include_current_interests is true"""
    # sum up the similarities
    weights = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    # convert them to a sorted list; the key indexes the pair because
    # tuple-unpacking lambda parameters do not exist on Python 3
    ranked = sorted(weights.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    # and (maybe) exclude interests the user already has
    if include_current_interests:
        return ranked

    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in current_interests]
#
# Item-Based Collaborative Filtering
#
# Transpose of user_interest_matrix: row j lists, for each user in order,
# that user's entry for unique_interests[j].
interest_user_matrix = [[user_row[interest_index]
                         for user_row in user_interest_matrix]
                        for interest_index, _ in enumerate(unique_interests)]

# interest_similarities[i][j] is the cosine similarity between the user
# vectors of interest i and interest j.
interest_similarities = [[cosine_similarity(users_of_a, users_of_b)
                          for users_of_b in interest_user_matrix]
                         for users_of_a in interest_user_matrix]
def most_similar_interests_to(interest_id):
    """return (interest_name, similarity) pairs for the interests similar to
    the one at index interest_id in unique_interests, most similar first

    the interest itself and any interest whose similarity is not strictly
    positive are left out"""
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]

    # The key indexes the pair: tuple-unpacking lambda parameters
    # (`lambda (_, s): s`) are a syntax error on Python 3.
    return sorted(pairs,
                  key=lambda pair: pair[1],
                  reverse=True)
def item_based_suggestions(user_id, include_current_interests=False):
    """suggest interests for user_id based on interests similar to the ones
    the user already has

    for each interest flagged 1 in the user's interest vector, the
    similarities of its similar interests are added up per suggested
    interest; the result is a list of (interest, weight) pairs, heaviest
    first

    interests user_id already has are dropped unless
    include_current_interests is true"""
    # add up the similarities of everything similar to the user's interests
    weights = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                weights[interest] += similarity

    # sort by weight; the key indexes the pair because tuple-unpacking
    # lambda parameters do not exist on Python 3
    ranked = sorted(weights.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    if include_current_interests:
        return ranked

    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in current_interests]
# Demo: print the output of each recommender defined in this file.
if __name__ == "__main__":

    print("Popular Interests")
    print(popular_interests)

    print("Most Popular New Interests")
    # Each sample list is printed and then passed to the recommender; looping
    # avoids repeating the literal (the repeated call was where a closing
    # parenthesis had been dropped).
    for already_liked in (
        ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
        ["R", "Python", "statistics", "regression", "probability"],
    ):
        print("already like:", already_liked)
        print(most_popular_new_interests(already_liked))

    print("User based similarity")
    print("most similar to 0")
    print(most_similar_users_to(0))

    print("Suggestions for 0")
    print(user_based_suggestions(0))

    print("Item based similarity")
    # index 0 of the sorted unique_interests list is 'Big Data'
    print("most similar to 'Big Data'")
    print(most_similar_interests_to(0))

    print("suggestions for user 0")
    print(item_based_suggestions(0))