-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy path050 Parallel Algorithms.jl
297 lines (230 loc) · 8.37 KB
/
050 Parallel Algorithms.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# Activate the project environment in this file's directory and install its dependencies.
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()
# # Parallel Algorithms: Thinking in Parallel
#
# Now that we're starting to see the challenges of parallelism, it's worth taking
# a step back and examining how we might go about designing parallel algorithms.
#
# This is adapted from a [workshop paper](http://jiahao.github.io/parallel-prefix/) by Jiahao Chen and
# Alan Edelman entitled "Parallel Prefix Polymorphism Permits Parallelization, Presentation & Proof", which
# appeared in the proceedings of the [First Workshop for High Performance Technical Computing in Dynamic
# Languages](http://jiahao.github.io/hptcdl-sc14/), held in conjunction with [SC14: The International Conference on High Performance Computing, Networking, Storage and Analysis](http://sc14.supercomputing.org/)
#-
using Compose, Gadfly
# # `reduce()`
#
# Reduction applies a binary operator to a vector repeatedly to return a scalar. Thus + becomes sum, and * becomes prod.
#
# It is considered a basic parallel computing primitive.
reduce(+, 1:8) == sum(1:8) # triangular numbers
#-
reduce(*, 1:8) == prod(1:8) # factorials
# You can also use reduce to compute Fibonacci numbers using their recurrences.
M=[1 1; 1 0]
## Repeatedly multiplying the recurrence matrix generates Fibonacci numbers.
reduce(*,fill(M,3))
prod(fill(M,3)) # `prod` is the `*` reduction
#-
n= 40 # Try changing n to pick different values (try between 0-100)
## `big.(M)` promotes the entries to BigInt so large powers don't overflow Int64.
@show prod(fill(big.(M),n))
# # `prefix` or `scan`
#
# Having discussed `reduce`, we are now ready for the idea behind prefix sum.
# Prefix or scan or accumulate is long considered an important parallel
# primitive as well.
#
# Suppose you wanted to compute the partial sums of a vector, i.e. given
# `y[1:n]`, we want to overwrite the vector `y` with the vector of partial sums
#
# ```julia
# new_y[1] = y[1]
# new_y[2] = y[1] + y[2]
# new_y[3] = y[1] + y[2] + y[3]
# ...
# ```
#
# At first blush, it seems impossible to parallelize this, since
#
# ```julia
# new_y[1] = y[1]
# new_y[2] = new_y[1] + y[2]
# new_y[3] = new_y[2] + y[3]
# ...
# ```
#
# which appears to be an intrinsically serial process. As written with a `+`
# operator, this is `cumsum` — but note that it can generalize to any operation.
# Serial scan: overwrite `y` in place with its running ⊕-accumulation,
# so that `y[i]` ends up holding `y[1] ⊕ y[2] ⊕ … ⊕ y[i]`.
# `⊕` is any binary operator; with `+` this is `cumsum`.
function prefix_serial!(⊕, y)
    for idx = 2:length(y)
        y[idx] = y[idx-1] ⊕ y[idx]
    end
    return y
end
#-
## With `+`, the serial scan reproduces `cumsum`:
@show prefix_serial!(+, [1:8;])
@show cumsum(1:8)
#-
## ... and with `*` it reproduces `cumprod`:
@show prefix_serial!(*, [1:8;])
@show cumprod(1:8)
#-
## Base's generic scan over any operator is `accumulate`:
@show accumulate(*, [1:8;])
# However, it turns out that because these operations are associative, we can regroup the _order_ of how these sums or products are carried out. (This of course extends to other associative operations, too.) Another ordering of 8 associative operations is provided by `prefix8!`:
## Magic :)
# Length-8 prefix using a regrouped ordering of the ⊕ applications:
# an up-sweep over strides 1, 2, 4 followed by a down-sweep that fills in
# the remaining positions.  Each (stride, targets) pair below performs
# `y[t] = y[t-stride] ⊕ y[t]` at every target, in the same order as the
# original hand-unrolled loops.
function prefix8!(⊕, y)
    length(y)==8 || error("length 8 only")
    schedule = ((1, (2, 4, 6, 8)),
                (2, (4, 8)),
                (4, (8,)),
                (2, (6,)),
                (1, (3, 5, 7)))
    for (stride, targets) in schedule
        for t in targets
            y[t] = y[t-stride] ⊕ y[t]
        end
    end
    return y
end
#-
## The reordered length-8 scan agrees with the serial result:
prefix8!(+, [1:8;]) == cumsum(1:8)
# In fact, this can generalize beyond just length-8 arrays:
## More magic
"""
    prefix!(⊕, y)

In-place prefix (scan) over any associative binary operator `⊕`: after the
call, `y[i]` holds `y[1] ⊕ y[2] ⊕ … ⊕ y[i]`.  The operations are regrouped
into an up-sweep ("reduce") and a down-sweep ("expand") so that each level's
updates are mutually independent — the structure exploited by the threaded
version later in this file.  Works for any length, including 0 and 1.
"""
function prefix!(⊕, y)
    l = length(y)
    l ≤ 1 && return y  # nothing to combine; also avoids ceil(Int, log2(0)) throwing
    k = ceil(Int, log2(l))
    # Up-sweep ("reduce"): at level j, combine each index i with the partial
    # result a half-stride (2^(j-1)) to its left.
    @inbounds for j=1:k, i=2^j:2^j:min(l, 2^k)
        y[i] = y[i-2^(j-1)] ⊕ y[i]
    end
    # Down-sweep ("expand"): fill in the intermediate positions skipped above.
    @inbounds for j=(k-1):-1:1, i=3*2^(j-1):2^j:min(l, 2^k)
        y[i] = y[i-2^(j-1)] ⊕ y[i]
    end
    return y
end
# -
## Check correctness on a length that is not a power of two:
A = rand(0:9, 123)
prefix!(*, copy(A)) == cumprod(A)
# ## What is this magic?
#-
# We can visualize the operations with a little bit of trickery. In Julia, arrays are simply types that expose the array protocol. In particular, they need to implement methods for the generic functions `length`, `getindex` and `setindex!`. The last two are used in indexing operations, since statements
#
# y = A[1]
# A[3] = y
#
# get desugared to
#
# y = getindex(A, 1)
# setindex!(A, y, 3)
#
# respectively.
#
# We can trace through the algorithm by introducing a dummy array type, `AccessArray`, which records every access to `getindex` and `setindex!`.
#
# Specifically:
#
# - `length(A::AccessArray)` returns the length of the array it wraps
# - `getindex(A::AccessArray, i)` records read access to the index `i` in the `A.read` field and then actually returns the value in the array it wraps.
# - `setindex!(A::AccessArray, x, i)` records write access to the index `i`. The `A.history` field is appended with a new tuple consisting of the current `A.read` field and the index `i`, and then it performs the assignment.
#
# The way `AccessArray` works, it assumes an association between a single `setindex!` call and all the preceding `getindex` calls since the previous `setindex!` call, which is sufficient for the purposes of tracing through prefix calls.
# `AccessArray` wraps an array and records its access pattern: every read is
# accumulated in `read`, and every write appends a `(reads, written_index)`
# pair to `history`.  Implementing `length`, `getindex`, and `setindex!` is
# enough for the prefix functions above to run on it unmodified.
mutable struct AccessArray{T,N,A}
    data :: A                                  # the wrapped array
    read :: Vector{Int}                        # indices read since the last write
    history :: Vector{Tuple{Vector{Int},Int}}  # one (reads, write) pair per setindex!
end
# Construct `history` with its declared element type instead of relying on the
# implicit conversion of an empty `Int[]` (which only works because it is empty).
AccessArray(A) = AccessArray{eltype(A), ndims(A), typeof(A)}(A, Int[], Tuple{Vector{Int},Int}[])
Base.length(A::AccessArray) = length(A.data)
function Base.getindex(A::AccessArray, i::Int)
    push!(A.read, i)  # record the read, then forward to the wrapped array
    A.data[i]
end
function Base.setindex!(A::AccessArray, x, i::Int)
    push!(A.history, (A.read, i))  # attribute all pending reads to this write
    A.read = Int[]                 # fresh read set (reassign; the stored one stays intact)
    A.data[i] = x
end
#-
## Wrap a random length-8 vector so reads and writes are recorded.
M = AccessArray(rand(8))
#-
## A write depending on two reads…
M[7] = M[3] + M[2]
#-
## …shows up in the history as the pair (reads = [3, 2], write = 7).
M.history
# So now we can trace the access pattern when calling `prefix8`!
A=prefix8!(+, AccessArray(rand(8)))
#-
A.history
# Now let's visualize this! Each entry in `A.history` is rendered by a gate object:
using Compose: circle, mm
#-
# A `Gate` is one ⊕ node in the circuit diagram: `ins` holds the wire
# indices it reads, `outs` the wire index (or indices) it writes.
struct Gate{I,O}
    ins :: I
    outs :: O
end
import Gadfly.render
# Draw a single gate at depth `y₀`: small circles for its inputs, a larger
# circle for its output, and line segments connecting each input to each output.
#
#   x₁ — number of wires (horizontal extent of the coordinate system)
#   y₁ — maximum depth of the circuit (vertical extent is y₁+1)
#   y₀ — the level at which this gate sits
#   rᵢ, rₒ — radii of the input and output circles
#
# NOTE(review): `G.outs` may be a single Int (as in `Gate([1,2],2)` below);
# iterating a scalar in Julia yields the scalar itself, so the comprehensions work.
function render(G::Gate, x₁, y₁, y₀; rᵢ=0.1, rₒ=0.25)
    ipoints = [(i, y₀+rᵢ) for i in G.ins]    # input terminal coordinates
    opoints = [(i, y₀+0.5) for i in G.outs]  # output terminal coordinates
    igates = [circle(i..., rᵢ) for i in ipoints]
    ogates = [circle(i..., rₒ) for i in opoints]
    lines = [line([i, j]) for i in ipoints, j in opoints]  # every input → every output
    compose(context(units=UnitBox(0.5,0,x₁,y₁+1)),
        compose(context(), stroke(colorant"black"), fill(colorant"white"),
            igates..., ogates...),
        compose(context(), linewidth(0.3mm), stroke(colorant"black"),
            lines...))
end
## A two-input, one-output gate rendered on a two-wire canvas:
A=Gate([1,2],2)
render(A,2,0,0)
# Now we render the whole algorithm. We have to scan through the trace twice; the first time merely calculates the maximum depth that needs to be drawn and the second time actually generates the objects.
# Render a full access trace as a circuit: one `Gate` per recorded write.
# The history is scanned twice — the first pass only computes the total depth
# (needed to size the canvas), the second emits the gate graphics.
function render(A::AccessArray)
    # Pass 1: compute the maximum depth.  A gate is placed one level deeper
    # whenever one of its reads is ≤ the previously written index, i.e. it may
    # depend on a write already placed on the current level.
    olast = depth = 0
    for y in A.history
        (any(y[1] .≤ olast)) && (depth += 1)
        olast = maximum(y[2])  # y[2] is the written index (maximum of a scalar is itself)
    end
    maxdepth = depth
    # Pass 2: replay the same level assignment, rendering each gate in place.
    olast = depth = 0
    C = []
    for y in A.history
        (any(y[1] .≤ olast)) && (depth += 1)
        push!(C, render(Gate(y...), length(A), maxdepth, depth))
        olast = maximum(y[2])
    end
    # Overlay the vertical "wires", one per array index.
    push!(C, compose(context(units=UnitBox(0.5,0,length(A),1)),
        [line([(i,0), (i,1)]) for i=1:length(A)]...,
        linewidth(0.1mm), stroke(colorant"grey")))
    compose(context(), C...)
end
#-
## Trace `prefix!` on 8 elements and draw the resulting circuit.
render(prefix!(+, AccessArray(zeros(8))))
# Now we can see that `prefix!` rearranges the operations to form two spanning trees:
# Try changing the number of elements!
render(prefix!(+, AccessArray(zeros(16))))
# as contrasted with the serial code:
render(prefix_serial!(+, AccessArray(zeros(8))))
# # Now exploit the parallelism in the _algorithm_ to use a parallel _implementation_
using .Threads
"""
    prefix_threads!(⊕, y)

Multithreaded in-place prefix (scan) over the associative operator `⊕`.
Identical reorganization to `prefix!`, but each level's independent updates
run under `Threads.@threads`.  Within a level every iteration writes a
distinct index and reads only indices untouched at that level, so the
iterations are safe to run in parallel; levels remain sequential.
Works for any length, including 0 and 1.
"""
function prefix_threads!(⊕, y)
    l = length(y)
    l ≤ 1 && return y  # nothing to combine; also avoids ceil(Int, log2(0)) throwing
    k = ceil(Int, log2(l))
    for j=1:k
        @threads for i=2^j:2^j:min(l, 2^k) #"reduce"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    for j=(k-1):-1:1
        @threads for i=3*2^(j-1):2^j:min(l, 2^k) #"expand"
            @inbounds y[i] = y[i-2^(j-1)] ⊕ y[i]
        end
    end
    return y
end
## Benchmark the three variants on a largish vector.
A = rand(500_000);
using BenchmarkTools
## `$`-interpolation evaluates the argument up front, so the benchmarks don't
## pay the untyped-global penalty and don't mutate `A` itself.
@btime prefix_serial!(+, $(copy(A)));
@btime prefix!(+, $(copy(A)));
@btime prefix_threads!(+, $(copy(A)));
## Chained comparison: the two scans agree exactly with each other, and agree
## with `cumsum` up to floating-point roundoff (⊕ = + is only approximately
## associative for floats, hence `≈`).
prefix_threads!(+, copy(A)) == prefix!(+, copy(A)) ≈ cumsum(A)
# # Thinking in parallel
#
# Notice how we didn't need to contort ourselves to make our algorithm
# work with `@threads`. We really did _just_ put a `@threads` on it and it
# just worked. It was both accurate _and_ fast.
#
# Coming up with rearrangements that make your particular algorithm parallel
# friendly isn't always easy, but when possible it makes everything else
# just fall out naturally.
#
# Finally, note that there can be clever ways to visualize algorithms as sanity checks.