-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeaturize_helper_functions.py
143 lines (87 loc) · 3.88 KB
/
featurize_helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
from pyzipcode import ZipCodeDatabase
zcdb = ZipCodeDatabase()
import colorsys
def get_N_HexCol(N=5, a=0.5, b=0.5):
    """
    Build a list of N hex colour strings with evenly spaced hues.

    The i-th colour is the HSV triple (i / N, a, b) converted to RGB,
    with each channel truncated to an integer in 0..255 and rendered
    as two lowercase hex digits.

    N - number of colours to generate (0 yields an empty list)
    a - saturation shared by every colour, expected in [0, 1]
    b - value (brightness) shared by every colour, expected in [0, 1]

    Returns a list of strings of the form "#rrggbb".
    """
    hex_out = []
    # range() and %-formatting behave the same on Python 2 and Python 3,
    # unlike xrange() and str.encode('hex'), which only exist on Python 2.
    for step in range(N):
        channels = colorsys.hsv_to_rgb(step * 1.0 / N, a, b)
        hex_out.append("#" + "".join("%02x" % int(c * 255) for c in channels))
    return hex_out
def get_zip_data(zipcode, zip_feature):
    """
    Look up a single attribute of a zip code's record in the
    module-level zip code database (zcdb).

    zipcode - key used to index into zcdb
    zip_feature - name of the attribute to read from the record
                  (e.g. state, latitude, longitude)

    Returns the attribute value, or NaN when the lookup raises
    an IndexError.
    """
    try:
        zip_record = zcdb[zipcode]
        feature_value = getattr(zip_record, zip_feature)
    except IndexError:
        # The zip code could not be resolved: report it as missing data
        feature_value = np.nan
    return feature_value
def get_user_features(users, items, ratings):
    """
    Given a data set of users, movies, and movie ratings,
    generate user demographic and rating features.

    users - data frame with the columns 'user_id', 'age', 'state',
            'sex' and 'occupation'; user IDs must be unique
    items - data frame with a 'movie id' column; movie IDs must be unique
    ratings - data frame with the columns 'user_id', 'movie_id'
              and 'rating'

    Returns a tuple (user_features, user_demographics, user_movie_ratings):
    user_demographics - age divided by 130 plus one-hot encoded state,
                        sex and occupation, indexed by user ID
    user_movie_ratings - one row per rating user, one column per movie ID
                         (column labels are strings), ratings divided
                         by 5; unrated movies are NaN
    user_features - outer join of the two frames on user ID; a user that
                    appears in only one of them gets NaN in the columns
                    coming from the other

    Raises ValueError if 'users' contains duplicate user IDs or 'items'
    contains duplicate movie IDs.
    """

    def get_user_demographics(users):
        """
        Generate demographic features from a set of users
        """
        state_dummies = pd.get_dummies(users['state'])
        sex_dummies = pd.get_dummies(users['sex'])
        occupation_dummies = pd.get_dummies(users['occupation'])

        normed_age = (users['age'] / 130.0).to_frame('normed_age')
        user_demographics = pd.concat(
            [normed_age, state_dummies, sex_dummies, occupation_dummies],
            axis=1)
        # Every piece above still carries the row index of 'users', so the
        # rows are in the same order as users.user_id and can be relabelled
        # positionally.
        user_demographics.index = users['user_id']
        return user_demographics

    def get_user_ratings(items, ratings):
        """
        Generate rating features from a set of movies and user ratings
        """
        user_movie_ratings = pd.merge(items, ratings, how='right',
                                      left_on='movie id', right_on='movie_id')
        user_movie_ratings = user_movie_ratings.pivot(
            index='user_id', columns='movie_id', values='rating')
        user_movie_ratings.columns = user_movie_ratings.columns.astype(str)
        return user_movie_ratings

    if not users['user_id'].is_unique:
        raise ValueError('Error: Duplicate user IDs detected in "users" dataframe')

    if not items['movie id'].is_unique:
        raise ValueError('Error: Duplicate movie IDs detected in "items" dataframe')

    # Generate the demographic and rating features
    user_demographics = get_user_demographics(users)
    user_movie_ratings = get_user_ratings(items, ratings)

    # Normalize the rating features by the maximum rating
    max_rating = 5.
    user_movie_ratings = user_movie_ratings / max_rating

    # Combine demographic and rating features into one data frame.
    # Both frames are indexed by user ID, so the merged frame is already
    # labelled with the correct user IDs. Its index must NOT be overwritten
    # with users.user_id afterwards: the outer join orders (and may extend)
    # the rows by key, so a positional relabel would attach the wrong ID to
    # each row whenever 'users' is not sorted by ID, and would fail outright
    # when 'ratings' mentions a user missing from 'users'.
    user_features = pd.merge(user_demographics, user_movie_ratings,
                             left_index=True, right_index=True, how='outer')

    return user_features, user_demographics, user_movie_ratings
def get_item_features(items):
    """
    Split the 'items' data frame into its feature columns,
    movie titles and movie IDs.

    items - data frame with 'movie id' and 'movie title' columns;
            every column from position 6 onward is treated as a feature

    Returns a tuple (item_features, item_names, item_ids), all copies:
    item_features - the columns from position 6 onward, indexed by movie ID
    item_names - the 'movie title' column
    item_ids - the 'movie id' column
    """
    movie_ids = items['movie id'].copy()
    movie_titles = items['movie title'].copy()

    # Everything before column 6 is skipped; the rest forms the feature table
    feature_table = items.iloc[:, 6:].copy()
    feature_table.index = movie_ids

    return feature_table, movie_titles, movie_ids
def euclidean_score(vec1, vec2):
    """
    Compute a user similarity score based on
    euclidean distance of user feature vectors.

    vec1, vec2 - numeric arrays of the same length (singleton dimensions
                 are squeezed away); NaN marks a missing feature

    Only positions where both vectors are non-NaN take part in the
    distance.

    Returns a tuple (dist_score, wdist_score, weight):
    dist_score - 1 / (1 + euclidean distance over the shared positions)
    wdist_score - dist_score multiplied by weight
    weight - fraction of positions that are non-NaN in both vectors
             (0 for empty input)
    """
    # atleast_1d keeps a single-feature vector indexable after squeeze,
    # which would otherwise collapse it to a 0-d array.
    vec1 = np.atleast_1d(np.squeeze(np.asarray(vec1, dtype=float)))
    vec2 = np.atleast_1d(np.squeeze(np.asarray(vec2, dtype=float)))

    shared = ~(np.isnan(vec1) | np.isnan(vec2))
    total = vec1.size
    if total:
        weight = np.float64(shared.sum()) / total
    else:
        # Nothing to compare: avoid a 0/0 division
        weight = np.float64(0.0)

    # The L2 norm of the difference is the euclidean distance; it only needs
    # numpy, whereas pdist was never imported into this module.
    dist = np.linalg.norm(vec1[shared] - vec2[shared])

    dist_score = 1. / (1 + dist)
    wdist_score = dist_score * weight
    return dist_score, wdist_score, weight