"""Generate the experiment environment to test optimisation algorithms."""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
class ExperimentEnvironment:
"""
Create the environment that simulates scenarios that happen out there.
Motivation: We need an environment that provides a common framework for all the algorithms so,
we can compare their effectiveness.
There are some attributes that control the shape of the environment:
- Min and max: the amount of people in the team that can vary depending on the day and the
hires (that's why they are picked uniformly).
- Days: a set of days with their probability of happening as most of the sprints we will
have the full amount of days. In our experiment we are assuming that the sprint lasts two
weeks in which case the max total days would be 10
"""
min_people, max_people = 3, 7
days = pd.Series(range(7, 11), index=[0.03, 0.07, 0.1, 0.8])
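    # Note: the probabilities live in the Series index so that `sample(weights=days.index)`
    # can use them directly; they sum to 1, with ten-day sprints at 80%.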
    def __init__(self, sprints=5000, no_20=False):
        """
        Class constructor.

        Args:
            sprints: int, the number of generated sprints. It has no effect if get_env is
                passed use_cache=True.
            no_20: bool, if True, drop the stories estimated at 20 points or more from the
                source estimations.
        """
self.sprints = sprints
shortcut_stories = pd.read_csv(
"source_data/stories_estimations_from_shortcut.csv", index_col=0
)
if no_20:
shortcut_stories = shortcut_stories[shortcut_stories.estimate < 20]
self.shortcut_stories = shortcut_stories.copy()
self.no_20 = no_20
    def _draw_days(self):
        """
        Generate samples from a distribution of days given some weights.

        Since 10 days is the most frequent length, we want to sample that number most of
        the time.
        """
return self.days.sample(
self.sprints, weights=self.days.index, replace=True
).values
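
    # A quick sanity check for _draw_days (illustrative; assumes the source_data CSVs are
    # present so the environment can be built):
    #   env = ExperimentEnvironment(sprints=10_000)
    #   (env._draw_days() == 10).mean()  # ~0.8, matching the weight of ten-day sprints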
    def _get_noisy_sprint(self, days):
        """
        Get a noisy set of sprints for a single person.

        Basically, we use some collected data about the effective time dedicated to burning
        points per sprint (performance), and we assume that the mean of that distribution
        represents one point.

        We need a shift such that, when we take a sprint with ten points and reduce it by a
        factor of the mean, mu, adding this shift to that reduction returns ten again.

        Args:
            days: a numpy array containing the number of days for each sprint.

        Returns: a numpy array with the real length of the sprint for a single person.
        """
percentages = pd.read_csv("source_data/performance.csv").points
mu = percentages.mean()
hs = pd.Series(np.linspace(0, 1, 101))
weights = gaussian_kde(percentages).pdf(hs.values)
samples = hs.sample(self.sprints, weights=weights, replace=True).reset_index(
drop=True
)
shift = days * (1 - mu)
return days * samples.values + shift
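
    # Worked check of _get_noisy_sprint's shift: a person sampling exactly the mean
    # performance mu gets days * mu + days * (1 - mu) = days, i.e. an average performer
    # burns exactly `days` points.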
    def _get_total_noise_per_sprint(self, days, people):
        """
        Get the total noise per sprint in the set given how many people are in it.

        Each person draws samples from the distribution created in _get_noisy_sprint that
        represent their effective time in the sprint. Then we sum the points done by all
        the people.

        This has a caveat: each person drawing individually from the distribution implies
        that every individual is independent of everyone else, which is unlikely to be
        true, as a week with an incident will affect the whole team.

        Args:
            days: a numpy array containing the number of days for each sprint.
            people: int, the number of people in the sprint.

        Returns: a numpy array containing the total noise per sprint.
        """
"""
total_sprint = []
for _ in range(people):
individual_sprint = self._get_noisy_sprint(days)
total_sprint.append(individual_sprint)
return np.array(total_sprint).sum(axis=0)
def _get_real_lengths(self, days, people):
"""
Get the real lengths of the set of sprints.
Args:
days: a numpy array containing the number of days for each sprint.
people: a numpy array containing the number of people for each sprint.
Returns: a numpy array with the real size for each sprint in integers.
"""
real_lengths = pd.DataFrame({"people": people, "sprint_len": 0})
for p in range(people.min(), people.max() + 1):
people_subset = real_lengths[real_lengths.people == p]
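            # _get_total_noise_per_sprint returns noise for *all* self.sprints sprints; we
            # keep only the entries whose sprint actually has p people, so every row gets
            # filled.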
k1 = self._get_total_noise_per_sprint(days, p)
real_lengths.loc[people_subset.index, "sprint_len"] = k1[
people_subset.index
]
assert real_lengths[real_lengths.sprint_len == 0].empty
return real_lengths.sprint_len.astype(int)
    def plot_sprint_diff(self, use_cache=False):
        """Plot the difference between the estimation and the real length of a sprint."""
        if use_cache:
            data = pd.read_csv("cached_experiment/experiment_sprints.csv")
        else:
            data = self._create_sprints()
        diff = data.points - data.real_length
        _, ax = plt.subplots()
        ax.set_title("Sprint diff estimation vs real")
        ax.set_xlabel("Sprint no.")
        ax.set_ylabel("Diff in points")
        ax.plot(diff.index, diff.values)
def _create_sprints(self):
"""Get the main dataframe with the information of each sprint."""
        # randint's upper bound is exclusive, so use max_people + 1 to include max_people.
        people = np.random.randint(self.min_people, self.max_people + 1, size=self.sprints)
days = self._draw_days()
points = people * days
return pd.DataFrame(
{
"people": people,
"days": days,
"points": points,
"real_length": self._get_real_lengths(days, people),
},
index=np.arange(self.sprints),
)
    def _create_bag_of_stories(self):
        """
        Create a dataframe that represents the stories available for the sprints.

        Basically, to get the estimated sizes we sample stories from the distribution of
        story points, `max_people * (days.max() + 3)` of them per sprint, pretending that
        all the stories in the sprint are exclusively one-pointers.

        In practice, though, we will likely have more points per story, so this way we
        ensure that the algorithms have enough room to woefully overshoot. With this
        setting of `self.days.max() + 3` we average ~246 points per sprint worth of
        stories.

        To avoid errors with indices, a unique id whose values match the index is also
        returned.

        Returns:
            A pandas DataFrame with a set of stories for each sprint defined by `sprint_id`
            along with their estimated and real sizes and a unique id.
        """
max_points_per_sprint = self.max_people * (self.days.max() + 3)
no_of_samples = max_points_per_sprint * self.sprints
estimated_size = self.shortcut_stories.sample(
no_of_samples, replace=True
).estimate.values
sprint_id = np.full(
(max_points_per_sprint, self.sprints), np.arange(self.sprints)
).T.ravel()
return pd.DataFrame(
{
"story_id": np.arange(no_of_samples),
"sprint_id": sprint_id,
"estimated_size": estimated_size,
"real_size": self._get_real_size(estimated_size),
}
)
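
    # Shape check for the sprint_id construction in _create_bag_of_stories (illustrative):
    #   np.full((3, 2), np.arange(2)).T.ravel() -> array([0, 0, 0, 1, 1, 1])
    # i.e. each sprint_id is repeated max_points_per_sprint times, in order.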
    def _get_noise_for_stories(self):
        """
        Extract the error from the points and the durations.

        The source data contains stories whose duration and estimated points are known.
        Since points are a proxy for time, we can compare them as long as they are
        normalised.

        To do so, we scale the durations to match the max amount of points we ever defined
        in Shortcut, pretending that the story that lasted the longest should have carried
        that max amount of points.

        Once the durations are normalised into the same units as the score, we can compute
        the difference that we will use as the noise.

        Returns:
            A numpy array with the differences between the (normalised) real duration and
            the estimation, where negative values represent overestimated stories.
        """
data = pd.read_csv("source_data/story_points.csv")
durations = self.shortcut_stories.estimate.max() * data.dur / data.dur.max()
scores = data.score
return (durations - scores).round().astype(np.int8)
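
    # Illustrative numbers (not from the real CSVs): if the max Shortcut estimate were 20
    # and the longest duration 40 days, a 10-day story would normalise to 5 points; with a
    # score of 3, its noise would be +2 (underestimated).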
    def _get_real_size(self, estimated_size):
        """
        Calculate the real size of a story.

        As a starting point we can naively assume that the noise does not depend on the
        story; this way, for each story in the estimated-size array we draw a value from
        the noise bag.

        This can make some stories, especially one-pointers, land on negative real values,
        so for those we keep drawing samples from the noise bag. The effect is that stories
        are at least 1 point and at most the max points a story ever had in Shortcut plus
        the max noise (around ~27 points).

        Args:
            estimated_size: a numpy array with the estimated sizes of the stories in the
                bag of stories.

        Returns:
            A numpy array with the real size corresponding to each story in the bag of
            stories.
        """
        assert isinstance(estimated_size, np.ndarray)
        # The negative values are usually exhausted after ~10 iterations, but we set this
        # number higher in case we increase the size of the experiment.
max_iter = 100
real_size = np.zeros(estimated_size.size)
noise_bag = self._get_noise_for_stories()
under_1 = real_size < 1
while max_iter and np.sum(under_1):
entries = estimated_size[under_1]
noise = noise_bag.sample(entries.shape[0], replace=True).values
real_size[under_1] = entries + noise
under_1 = real_size < 1
max_iter -= 1
        assert np.sum(real_size < 1) == 0, (
            f"Max iterations exhausted, {under_1.sum()} stories still under 1 point"
        )
return real_size.astype(np.int8)
    def get_env(self, use_cache=False, save=True):
        """
        Get the environment.

        Args:
            use_cache: bool, whether a cached experiment should be used; otherwise
                `get_env` will generate a new one.
            save: bool, whether a newly generated environment should be saved to be used
                as a cache.

        Returns:
            A tuple of dataframes containing the sprints and the stories.
        """
names = ("sprints", "stories")
if self.no_20:
names = ("sprints", "stories_no_20")
if use_cache:
sprints, stories = [
pd.read_csv(f"cached_experiment/experiment_{name}.csv")
for name in names
]
else:
sprints = self._create_sprints()
stories = self._create_bag_of_stories()
            # Ensure that the index matches the story_ids
assert (stories.index == stories.story_id).all()
            if save:
                for df, name in zip((sprints, stories), names):
                    df.to_csv(f"cached_experiment/experiment_{name}.csv", index=False)
return sprints, stories
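

# Example usage of the environment (a sketch; assumes the source_data CSVs are present):
#   env = ExperimentEnvironment(sprints=5000)
#   sprints, stories = env.get_env(save=False)
# sprints has columns people, days, points, real_length; stories has columns story_id,
# sprint_id, estimated_size, real_size.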
class ExperimentEvaluator:
"""Evaluate the performance of an algorithm."""
    def __init__(self, algorithm_outcome, sprints, stories):
        """
        Class constructor.

        Args:
            algorithm_outcome: a pandas DataFrame containing two columns:
                - story_id: the story, within each of the experiment's sprints, at which
                  the evaluator should stop to compute the sums.
                - extra_points: the extra points the evaluator should add on top to
                  fine-tune the main sum of points.
            sprints: a pandas DataFrame like the one created by ExperimentEnvironment.
            stories: a pandas DataFrame like the one created by ExperimentEnvironment.
        """
assert algorithm_outcome.shape == (sprints.shape[0], 2)
assert (algorithm_outcome.extra_points >= 0).all()
self.outcome = algorithm_outcome.copy()
self.sprints = sprints
self.stories = stories
    def _check_extra_points(self):
        """
        Ensure that the extra points are smaller than the next story after the breakpoint.

        This implies that the algorithm may not be able to nail the sprint, as in the
        following example:
            sprint.real_length = 10
            algorithm.outcome = [3, 2]  # 3rd index, 2 extra points
            real sum of the stories up to the 3rd index = 8
        However, if the story at the 4th index is [1, 5] (estimated_size, real_size), the
        algorithm will be forced to choose [4, 1], which turns into an overshoot: the real
        sum of the stories up to the 4th index is 13, and when we add the extra point we
        end up overshooting by 4.
        """
next_story_indices = self.outcome.story_id + 1
sizes = self.stories.loc[next_story_indices, "estimated_size"]
assert np.all(sizes.values > self.outcome.extra_points.values)
    def _compute_diff(self):
        """
        Compute the values the evaluator will use to compute the regret.

        Remember that sizes refer to stories whereas lengths refer to sprints.

        Most of the columns created in the outcome dataframe are for debugging purposes;
        the only one the regret will use is `distance`. In a nutshell, this is the logic:

        The distance is the difference between the sprint's real length and the sum of the
        real sizes of the stories plus the extra points:
            distance = real_length - real_size - extra_points
        This effectively means that overshooting returns a negative distance whereas
        falling short returns a positive one.

        `point_distance` holds the integer values, that is, distances in terms of story
        points. However, squaring such a number results in a huge regret, so we reduce the
        distance by some factor, in this case the mean of the real sizes of the
        environment stories (~3.2).
        """
estimated_sizes, real_sizes = list(), list()
for bkp in self.outcome.story_id:
# locate the sprint_id represented by the breakpoint bkp
sprint_id = self.stories.at[bkp, "sprint_id"]
# Locate the first story in that sprint
start = self.stories[self.stories.sprint_id == sprint_id].index.min()
# Filter the range
story_range = self.stories.loc[start:bkp]
# Compute the estimated_size and real size in the range
estimated_size = story_range.estimated_size.sum()
real_size = story_range.real_size.sum()
# Finally, pack all the stuff
estimated_sizes.append(estimated_size)
real_sizes.append(real_size)
self.outcome["estimated_size"] = estimated_sizes
self.outcome["real_size"] = real_sizes
self.outcome["total_real_size"] = (
self.outcome.real_size + self.outcome.extra_points
)
self.outcome["real_length"] = self.sprints.real_length
self.outcome["point_distance"] = (
self.outcome.real_length - self.outcome.total_real_size
)
self.outcome["distance"] = (
self.outcome.point_distance / self.stories.real_size.mean()
)
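
    # Worked example of the distance: real_length = 40, real_size = 38, extra_points = 1
    # gives point_distance = 1 and distance = 1 / 3.2 ≈ 0.31 (slightly falling short).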
    def _compute_regret(self):
        """
        Compute the performance value of the algorithm.

        We evaluate the algorithm's performance with a number we call regret, which
        represents how much we regret not having nailed the sprint's real size with our
        set of stories + extra points.

        This regret is asymmetrical, as overshooting is worse than falling short. Getting
        close to the real size of the sprint while falling short, though regrettable, is
        comparatively not as bad as overshooting. That's why we square the distance for
        all distances under 1: near-misses shrink while overshoots grow quadratically,
        especially beyond the mean of the real sizes (~3 points, i.e. distance < -1).
        https://www.desmos.com/calculator/igkkwcxv0e
        """
self.outcome["regret"] = self.outcome.distance
under_1 = self.outcome.distance < 1
squared_entries = self.outcome[under_1]
self.outcome.loc[squared_entries.index, "regret"] = (
squared_entries.distance**2
)
self.outcome["regret"] = self.outcome.regret
def describe(self):
"""Show some stats about the point distances and plot them."""
self.outcome.point_distance.plot()
return self.outcome.point_distance.describe()
def nailed_sprints(self):
"""Return the number of sprints where we land in the optimal capacity."""
return np.sum(self.outcome.point_distance == 0)
    def sprints_overshoot(self):
        """Return the number of sprints where we overshoot the real capacity."""
        return np.sum(self.outcome.point_distance < 0)
def accumulated_regret(self):
"""
Show a plot of how the regret increases over time.
Ideally we would like this plot to grow logarithmically.
"""
self.outcome.regret.cumsum().plot()
    def run(self):
        """Run the evaluation pipeline and return a short summary of the results."""
        # self._check_extra_points()
self._compute_diff()
self._compute_regret()
return (
f"Total regret: {self.outcome.regret.sum().astype(int)}; "
f"Nailed Sprints {self.nailed_sprints()}; "
f"Sprints overshoot {self.sprints_overshoot()}"
)
def evaluate_algorithm(algorithm_class, no_20=False, *args, **kwargs):
    """
    Run the common procedure to evaluate a given algorithm so the process is the same for
    all of them.
    """
    sprints, stories = ExperimentEnvironment(no_20=no_20).get_env(use_cache=True)  # use the 5k env
a1 = algorithm_class(sprints, stories, *args, **kwargs)
algorithm_outcome = a1.run()
ev = ExperimentEvaluator(algorithm_outcome, sprints, stories)
outcome = ev.run()
print(outcome)
return ev
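

# Illustrative only: a minimal, hypothetical baseline showing the interface that
# `evaluate_algorithm` expects from an algorithm class. `GreedyEstimateBaseline` is not
# part of the experiment suite; it fills each sprint with stories up to the sprint's
# nominal capacity (people * days) and never requests extra points.
class GreedyEstimateBaseline:
    """Hypothetical baseline: stop at the last story whose cumulative estimate fits."""

    def __init__(self, sprints, stories):
        self.sprints = sprints
        self.stories = stories

    def run(self):
        breakpoints = []
        for sprint_id, capacity in self.sprints.points.items():
            bag = self.stories[self.stories.sprint_id == sprint_id]
            cumulative = bag.estimated_size.cumsum()
            fits = cumulative[cumulative <= capacity]
            # If not even the first story fits, take it anyway so that every sprint has a
            # breakpoint.
            breakpoints.append(fits.index.max() if not fits.empty else bag.index.min())
        return pd.DataFrame({"story_id": breakpoints, "extra_points": 0})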