// ParallelNative.h
#pragma once

#include <ATen/ATen.h>
#include <c10/core/thread_pool.h>

#include <algorithm>
#include <cstddef>
#include <exception>
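
// signals that this build provides a native intra-op parallel backend;
// other ATen headers presumably check for INTRA_OP_PARALLEL when deciding
// whether parallel implementations of the primitives are available (an
// assumption about how the macro is consumed elsewhere)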
#define INTRA_OP_PARALLEL

namespace at {
namespace internal {

// internal function to get access to the intra-op thread pool from
// the template parallel primitives (parallel_for, parallel_reduce)
CAFFE2_API TaskThreadPoolBase& _get_intraop_pool();

// internal utility function to mark the master thread as being in a
// parallel region while executing parallel primitives
CAFFE2_API void _set_in_parallel_region(bool);

// simulate OpenMP's omp_get_thread_num() by force-setting the thread-local
// task id as the thread number when executing parallel primitives
CAFFE2_API void _set_thread_num(size_t thread_num);
CAFFE2_API void _unset_thread_num();

} // namespace internal
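
// the hooks above let pool workers mimic OpenMP-style per-thread state;
// the protocol, as used by parallel_for and parallel_reduce below (the
// state is presumably read back via at::get_thread_num() and
// at::in_parallel_region() from ATen/Parallel.h):
//
//   internal::_set_thread_num(task_id);
//   internal::_set_in_parallel_region(true);
//   // ... run the user function ...
//   internal::_set_in_parallel_region(false);
//   internal::_unset_thread_num();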

template <class F>
inline void parallel_for(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const F& f) {
  TORCH_CHECK(grain_size >= 0);
  if (begin >= end) {
    return;
  }
  if (((end - begin) >= grain_size) && !in_parallel_region()) {
    // choose number of tasks based on grain size and number of threads
    size_t chunk_size = divup((end - begin), get_num_threads());
    // make sure each task is at least grain_size in size
    chunk_size = std::max((size_t)grain_size, chunk_size);
    size_t num_tasks = divup((end - begin), chunk_size);

    std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
    std::exception_ptr eptr;
    auto task = [f, &eptr, &err_flag]
        (int64_t task_id, int64_t local_start, int64_t local_end) {
      internal::_set_thread_num(task_id);
      internal::_set_in_parallel_region(true);
      try {
        f(local_start, local_end);
      } catch (...) {
        if (!err_flag.test_and_set()) {
          eptr = std::current_exception();
        }
      }
      internal::_set_in_parallel_region(false);
      internal::_unset_thread_num();
    };

    std::vector<c10::ivalue::Future> futures(num_tasks);
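    // tasks 1..num_tasks-1 are scheduled on the pool; task 0 runs inline
    // on the calling thread below, so futures[0] is never waited on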
    for (size_t task_id = 1; task_id < num_tasks; ++task_id) {
      int64_t local_start = begin + task_id * chunk_size;
      if (local_start < end) {
        int64_t local_end = std::min(end, (int64_t)(chunk_size + local_start));
        internal::_get_intraop_pool().run(
          // copy task_id, local_start, local_end
          [&task, &futures, task_id, local_start, local_end]() {
            task(task_id, local_start, local_end);
            futures[task_id].markCompleted();
          }
        );
      } else {
        futures[task_id].markCompleted();
      }
    }
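
    // run the first chunk inline on the calling (master) thread while the
    // pool workers execute the remaining tasks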
    int64_t first_task_end = std::min(end, (int64_t)(chunk_size + begin));
    task(0, begin, first_task_end);

    // wait for all tasks to finish
    for (size_t task_id = 1; task_id < num_tasks; ++task_id) {
      futures[task_id].wait();
    }
    if (eptr) {
      std::rethrow_exception(eptr);
    }
  } else {
    f(begin, end);
  }
}
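
// a minimal usage sketch (illustrative only; `src` and `dst` are
// hypothetical buffers, not part of this header):
//
//   std::vector<float> src(1 << 20, 1.0f);
//   std::vector<float> dst(src.size());
//   at::parallel_for(0, (int64_t)src.size(), /*grain_size=*/2048,
//       [&](int64_t begin, int64_t end) {
//         // each task owns the half-open range [begin, end)
//         for (int64_t i = begin; i < end; ++i) {
//           dst[i] = 2.0f * src[i];
//         }
//       });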

template <class scalar_t, class F, class SF>
inline scalar_t parallel_reduce(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const scalar_t ident,
    const F& f,
    const SF& sf) {
  TORCH_CHECK(grain_size >= 0);
  if (begin >= end) {
    return ident;
  }
  if (((end - begin) >= grain_size) && !in_parallel_region()) {
    size_t chunk_size = divup((end - begin), get_num_threads());
    chunk_size = std::max((size_t)grain_size, chunk_size);
    size_t num_tasks = divup((end - begin), chunk_size);

    std::vector<scalar_t> results(num_tasks);
    scalar_t* results_data = results.data();
    std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
    std::exception_ptr eptr;
    auto task = [f, ident, results_data, &eptr, &err_flag]
        (int64_t task_id, int64_t local_start, int64_t local_end) {
      internal::_set_thread_num(task_id);
      internal::_set_in_parallel_region(true);
      try {
        results_data[task_id] = f(local_start, local_end, ident);
      } catch (...) {
        if (!err_flag.test_and_set()) {
          eptr = std::current_exception();
        }
      }
      internal::_set_in_parallel_region(false);
      internal::_unset_thread_num();
    };

    std::vector<c10::ivalue::Future> futures(num_tasks);
    for (size_t task_id = 1; task_id < num_tasks; ++task_id) {
      int64_t local_start = begin + task_id * chunk_size;
      if (local_start < end) {
        int64_t local_end = std::min(end, (int64_t)(chunk_size + local_start));
        internal::_get_intraop_pool().run(
          // copy task_id, local_start, local_end
          [&task, &futures, task_id, local_start, local_end]() {
            task(task_id, local_start, local_end);
            futures[task_id].markCompleted();
          }
        );
      } else {
        futures[task_id].markCompleted();
      }
    }
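
    // as in parallel_for, run the first chunk inline on the calling thread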
    int64_t first_task_end = std::min(end, (int64_t)(chunk_size + begin));
    task(0, begin, first_task_end);

    for (size_t task_id = 1; task_id < num_tasks; ++task_id) {
      futures[task_id].wait();
    }
    if (eptr) {
      std::rethrow_exception(eptr);
    }

    scalar_t result = ident;
    for (auto partial_result : results) {
      result = sf(result, partial_result);
    }
    return result;
  } else {
    return f(begin, end, ident);
  }
}
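
// a minimal usage sketch (illustrative only; `v` is a hypothetical buffer):
// f produces a partial result for [begin, end) starting from ident, and
// sf combines two partial results:
//
//   std::vector<float> v(1 << 20, 1.0f);
//   float sum = at::parallel_reduce(
//       0, (int64_t)v.size(), /*grain_size=*/2048, /*ident=*/0.0f,
//       [&](int64_t begin, int64_t end, float partial) {
//         for (int64_t i = begin; i < end; ++i) {
//           partial += v[i];
//         }
//         return partial;
//       },
//       std::plus<float>());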
} // namespace at