-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplotting_helper_functions.py
257 lines (198 loc) · 9.36 KB
/
plotting_helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import featurize_helper_functions as fhf
import numpy as np
from plotly import tools
import plotly.graph_objs as go
from matplotlib import pyplot as plt
def get_job_groups():
    """Return the mapping used to bin raw occupations into job groups.

    Each key is a tuple of occupation names that share a bin; each value is
    the display label of that bin.
    """
    # The data contains many user jobs, so similar ones are grouped together.
    # The pairs are listed in the order the groups should be iterated.
    occupation_bins = [
        (('administrator', 'executive', 'marketing', 'salesman'),
         'Business / Management'),
        (('librarian', 'educator'), 'Education'),
        (('doctor', 'healthcare'), 'Healthcare'),
        (('lawyer',), 'Law'),
        (('other', 'none', 'technician', 'homemaker'), 'Other'),
        (('artist', 'writer', 'entertainment'), 'Content Producer'),
        (('retired',), 'Retired'),
        (('engineer', 'programmer', 'scientist'), 'Technical'),
        (('student',), 'Student'),
    ]
    return dict(occupation_bins)
def get_avg_hg_ratings(user_features, item_features):
    """Compute per-genre average ratings and the users who rate a genre highly.

    Parameters
    ----------
    user_features : pandas.DataFrame
        One row per user. Columns whose name equals a movie id (as a string)
        are used as that user's ratings; NaN entries are ignored when
        averaging.
    item_features : pandas.DataFrame
        Indexed by movie id with one column per genre; a value of 1 marks the
        movie as belonging to that genre.

    Returns
    -------
    avg_ratings : numpy.ndarray
        Array of shape (n_genres, n_users) holding each user's mean rating
        over the movies of each genre (NaN if the user rated none of them).
    high_g_ratings : numpy.ndarray
        Object array of length n_genres; element ``i`` is the integer array of
        row positions of the users whose average for genre ``i`` is >= 0.8
        (presumably "4 out of 5" on a rating scale normalized to [0, 1] --
        TODO confirm).
    """
    column_names = user_features.columns.values

    avg_ratings = []
    for genre in item_features.columns:
        # Ids of the movies flagged as belonging to this genre
        genre_movie_ids = item_features.index.values[item_features[genre].values == 1]
        # Positions of the rating columns for those movies
        rating_cols = np.where(np.isin(column_names, genre_movie_ids.astype(str)))[0]
        avg_ratings.append(np.nanmean(user_features.iloc[:, rating_cols].values, 1))
    avg_ratings = np.array(avg_ratings)

    # The number of high raters differs between genres, so the index arrays
    # are ragged. np.array() on a ragged list raises on recent NumPy versions;
    # fill an explicit object array instead.
    high_g_ratings = np.empty(len(avg_ratings), dtype=object)
    for genre_pos, genre_avgs in enumerate(avg_ratings):
        high_g_ratings[genre_pos] = np.where(genre_avgs >= 0.8)[0]

    return avg_ratings, high_g_ratings
def user_scatter_plot(user_features):
    """Build the plotly traces and layout for a 3D scatter plot of users.

    Users are placed by latitude (x), longitude (y) and age (z), with one
    trace -- and therefore one colour and legend entry -- per job group.

    Parameters
    ----------
    user_features : pandas.DataFrame
        Must provide ``zip_code`` and ``normed_age`` columns plus one column
        per occupation named in ``get_job_groups()``, where a value of 1
        marks the user's occupation.

    Returns
    -------
    traces : list of plotly.graph_objs.Scatter3d
        One scatter trace per job group.
    layout : dict
        Layout settings (size, camera, axis titles, margins) for the figure.
    """
    job_groups = get_job_groups()

    # Latitude and longitude are looked up per zip code and scaled by their
    # maximum magnitudes (90 and 180 degrees)
    lat = np.array([fhf.get_zip_data(z, 'latitude') for z in user_features.zip_code])
    lon = np.array([fhf.get_zip_data(z, 'longitude') for z in user_features.zip_code])
    x = lat / 90
    y = lon / 180
    # Age was normalized by 130, un-normalize for plot
    z = np.array(user_features.normed_age * 130)

    # Generate a list of colors for plotting, one for each job group
    color_list = fhf.get_N_HexCol(len(job_groups), 0.6, 0.75)

    # For each job group, collect its members and build their scatter trace
    traces = []
    for group_pos, (occupations, group_label) in enumerate(job_groups.items()):
        member_idx = []
        for job in occupations:
            member_idx.extend(np.where(user_features[job] == 1)[0].tolist())
        # Force an integer dtype: an empty list would otherwise become a float
        # array, which cannot be used as an index (IndexError).
        member_idx = np.array(member_idx, dtype=np.intp)

        traces.append(
            go.Scatter3d(
                x=x[member_idx], y=y[member_idx], z=z[member_idx],
                name=group_label, mode='markers',
                marker=dict(color=color_list[group_pos], size=5,
                            line=dict(color='rgba(58, 71, 80, 1.0)', width=0.5),
                            opacity=0.8),
            )
        )

    # Set up the 3d plot's camera position
    camera = dict(eye=dict(x=1.35, y=1.35, z=0.65))

    # Set up the plot's layout
    layout = dict(
        height=500,
        width=780,
        scene=dict(camera=camera,
                   xaxis=dict(title='Latitude',),
                   yaxis=dict(title='Longitude',),
                   zaxis=dict(title='Age',)),
        margin=dict(l=30, r=30, t=30, b=30,),
    )

    # Get plotting traces and layout
    return traces, layout
def user_job_stacked_bar(user_features):
    """Build the plotly traces and layout for a stacked bar of job shares.

    For every job group from ``get_job_groups()`` the rounded percentage of
    users in that group is computed; the groups are then emitted as
    horizontal bar segments ordered by ascending percentage.

    Returns
    -------
    trace1 : list of plotly.graph_objs.Bar
        One single-segment bar per job group.
    layout : dict
        Layout settings with ``barmode='stack'`` so the segments stack.
    """
    job_groups = get_job_groups()
    color_list = fhf.get_N_HexCol(len(job_groups.keys()), 0.6, 0.75)

    # Fraction of users falling into each job group
    group_labels = []
    group_fractions = []
    for occupations, label in job_groups.items():
        group_labels.append(label)
        member_count = sum(
            len(np.where(user_features[job] == 1)[0]) for job in occupations
        )
        group_fractions.append(np.float64(member_count) / len(user_features))

    job_perc = np.round(np.array(group_fractions) * 100)
    sortidx = np.argsort(job_perc)

    # Percentages, labels and colours, all in ascending-percentage order
    sorted_perc = job_perc[sortidx].tolist()
    sorted_labels = np.array(group_labels)[sortidx].tolist()
    sorted_colors = np.array(color_list)[sortidx].tolist()

    # One bar per job group; together they form the stacked bar plot
    trace1 = [
        go.Bar(y=['Job Percentages'], x=[perc], name=label, orientation='h',
               marker=dict(color=color,
                           line=dict(color='rgba(58, 71, 80, 1.0)', width=2)))
        for perc, label, color in zip(sorted_perc, sorted_labels, sorted_colors)
    ]

    # barmode 'stack' turns the individual bars into a stacked bar plot
    layout = dict(barmode='stack', showlegend=False, height=300,
                  margin=dict(l=100, r=20, t=170, b=70,))

    # Get plotting traces and layout
    return trace1, layout
def inventory_hbar(user_features, item_features):
    """Build plotly traces comparing genre inventory share with genre appeal.

    Parameters
    ----------
    user_features : pandas.DataFrame
        User data, passed through to ``get_avg_hg_ratings``.
    item_features : pandas.DataFrame
        One column per genre; the column sums are used as the number of
        movies in each genre.

    Returns
    -------
    trace0 : plotly.graph_objs.Bar
        Horizontal bars with the rounded percentage of movies in each genre.
    trace1 : plotly.graph_objs.Bar
        Horizontal bars with the rounded percentage of users (among those
        with a non-NaN average for the genre) who rated the genre highly.
    layout : dict
        Layout settings for showing the two traces side by side.

    Both traces list the genres in ascending order of inventory share.
    """
    # Per-genre average rating of every user, and the positions of the users
    # whose average for the genre is high (4/5 or higher)
    avg_ratings, high_g_ratings = get_avg_hg_ratings(user_features, item_features)

    # Compute the percentage of users who rated each genre 4/5 or higher,
    # relative to the users that have an average for that genre at all
    perc_users = [
        np.float64(len(high_idx)) / np.sum(~np.isnan(genre_avgs))
        for high_idx, genre_avgs in zip(high_g_ratings, avg_ratings)
    ]
    perc_users = np.round(100 * np.array(perc_users))

    # Compute the percentage of movies that make up each genre.
    # Sum explicitly over rows and continue with plain NumPy arrays so the
    # positional sort below does not depend on pandas' handling of
    # np.sum(DataFrame) or of integer keys on a label-indexed Series.
    genre_counts = item_features.sum(axis=0).values.astype(float)
    percentages = np.round(100 * genre_counts / genre_counts.sum())
    sortidx = np.argsort(percentages)
    genres_sorted = item_features.columns[sortidx].tolist()

    # Generate bar plot data for the percentage of items that make up each
    # genre in the library
    trace0 = go.Bar(
        x=percentages[sortidx].tolist(), y=list(genres_sorted),
        marker=dict(color='rgba(171, 50, 96, 0.6)',
                    line=dict(color='rgba(171, 50, 96, 1.0)', width=1),),
        name='Percentage of movies in genre', orientation='h')

    # Generate bar plot data for the percentage of users who rated each genre
    # highly
    trace1 = go.Bar(
        x=perc_users[sortidx].tolist(), y=list(genres_sorted),
        marker=dict(color='rgba(50, 171, 96, 0.6)',
                    line=dict(color='rgba(50, 171, 96, 1.0)', width=1),),
        name='Percentage users rating genre high', orientation='h')

    # Create the layout of the plot
    layout = dict(
        yaxis1=dict(showgrid=False, showline=True, linewidth=1,
                    showticklabels=True, domain=[0, 0.85],),
        yaxis2=dict(showgrid=False, showline=True, showticklabels=False,
                    linewidth=1, domain=[0, 0.85],),
        xaxis1=dict(zeroline=False, showline=False, showticklabels=True,
                    showgrid=True, domain=[0, 0.42],),
        xaxis2=dict(zeroline=False, showline=False, showticklabels=True,
                    showgrid=True, domain=[0.47, 1],),
        legend=dict(x=0.029, y=1.038, font=dict(size=10,),),
        margin=dict(l=100, r=20, t=70, b=70),
        paper_bgcolor='rgb(248, 248, 255)', plot_bgcolor='rgb(248, 248, 255)',
    )

    # Get plotting traces and layouts
    return trace0, trace1, layout
def conditional_interest(user_features, item_features):
    """Show a heat map of how fans of one genre rate every genre.

    Row ``i`` of the matrix is restricted to the users who rated genre ``i``
    highly (as returned by ``get_avg_hg_ratings``); column ``j`` holds the
    NaN-ignoring mean of all ratings those users gave to movies of genre
    ``j``. The matrix is scaled column-wise by its maxima and drawn with
    matplotlib; the figure is displayed via ``plt.show()``.

    Parameters
    ----------
    user_features : pandas.DataFrame
        One row per user; columns named after a movie id (as a string) are
        used as ratings.
    item_features : pandas.DataFrame
        Indexed by movie id with one column per genre; 1 marks membership.

    Returns
    -------
    None
    """
    # Compute conditional ratings:
    # e.g. given that a user rated a given genre highly ...
    # ... how did the user rate each of the other genres?
    _, high_g_ratings = get_avg_hg_ratings(user_features, item_features)

    # Positions of the rating columns belonging to each genre. These depend
    # only on the genre, so compute them once instead of once per genre pair.
    column_names = user_features.columns.values
    genre_cols = []
    for genre in item_features.columns:
        genre_movie_ids = item_features.index.values[item_features[genre].values == 1]
        genre_cols.append(
            np.where(np.isin(column_names, genre_movie_ids.astype(str)))[0]
        )

    cond_ratings = np.array([
        [np.nanmean(user_features.iloc[fans, cols].values) for cols in genre_cols]
        for fans in high_g_ratings
    ])

    # Normalize every column by that column's maximum (np.max over axis 0), so
    # for each rated genre the fan group that rates it highest maps to 1.
    # NOTE(review): the previous comment described this as a per-row
    # normalization; the behaviour is kept as-is -- confirm the intended axis.
    cond_norm = cond_ratings / np.max(cond_ratings, 0)

    # Make a heat map / color matrix of the normalized ratings
    fig, ax = plt.subplots()
    ax.pcolor(cond_norm, cmap=plt.cm.Blues, alpha=0.8)

    # Set figure size
    fig.set_size_inches(12, 6)

    # turn off the frame
    ax.set_frame_on(False)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(cond_norm.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(cond_norm.shape[1]) + 0.5, minor=False)

    # reverse the axes so they go from top to bottom, left to right
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position("top")

    # Set the labels
    labels = item_features.columns.values
    ax.set_xticklabels(labels, minor=False)
    ax.set_yticklabels(labels, minor=False)

    # rotate the text of the x axis to be vertical
    plt.xticks(rotation=90)

    # turn off grid
    ax.grid(False)

    # Turn off all the tick marks while keeping the labels. The per-tick
    # tick1On/tick2On attributes are no longer honoured by current matplotlib
    # releases, so go through tick_params instead.
    ax.tick_params(axis='both', which='major',
                   top=False, bottom=False, left=False, right=False)

    # Set titles for axes
    ax.set_xlabel('... like [genre] this much.')
    ax.set_ylabel('People who like [genre]...')

    # Show plot
    plt.show()