// Project headers pulled in by this file, followed by divide_round_up().
// divide_round_up: ceiling division on size_t. Returns dividend / divisor when the
// division is exact and dividend / divisor + 1 otherwise.
// No check for divisor == 0 appears before the modulo, so a zero divisor is undefined
// behaviour here; callers must pass a non-zero divisor.
13 #include "caffe2/core/logging.h" 14 #include "caffe2/utils/fixed_divisor.h" 15 #include "caffe2/utils/threadpool/pthreadpool.h" 18 static inline size_t divide_round_up(
size_t dividend,
size_t divisor) {
// Exact multiple: the plain quotient is already the answer.
19 if (dividend % divisor == 0) {
20 return dividend / divisor;
// Remainder present: one extra unit is needed to cover it.
22 return dividend / divisor + 1;
// File-local helper taking two size_t values and returning a size_t.
// Its body is not part of this excerpt. The callers below use it to clamp a tile extent
// to the elements remaining in a range, so it presumably returns the smaller argument;
// confirm against the full file.
26 static inline size_t min(
size_t a,
size_t b) {
// User callback for the 1D tiled dispatch. This appears to be a member of the context
// object read by compute_1d_tiled (the enclosing struct header is not shown here), which
// calls it as function(argument, index, tile).
31 pthreadpool_function_1d_tiled_t
function;
// Per-tile callback with the (void*, size_t) shape that pthreadpool_compute_1d_tiled hands
// to pthreadpool_compute_1d. It turns a tile number into a start offset and a tile length
// and forwards both to the user function stored in the context.
// The statement that derives `context` from context_ is not shown in this excerpt.
37 static void compute_1d_tiled(
void* context_,
size_t linear_index) {
// Each work item is one tile, so the linear index is used directly as the tile number.
39 const size_t tile_index = linear_index;
// Offset of the first element covered by this tile.
40 const size_t index = tile_index * context->tile;
// Extent handed to the callback: min() of the nominal tile size and what is left of range,
// so that the final tile does not run past the end.
41 const size_t tile = min(context->tile, context->range - index);
42 context->function(context->argument, index, tile);
// Invokes `function` once per tile of size `tile` over the index space [0, range).
// The remaining parameters (argument, range, tile) and the initialisation of `context`
// are declared in lines not shown in this excerpt.
// NOTE(review): tile == 0 is not rejected in the visible code. The inline loop below would
// never advance (i += 0) and divide_round_up would divide by zero; confirm callers
// guarantee tile > 0.
45 void pthreadpool_compute_1d_tiled(
46 pthreadpool_t threadpool,
47 pthreadpool_function_1d_tiled_t
function,
// No pool supplied: walk the tiles in order inside this call, clamping the last tile.
52 if (threadpool == NULL) {
54 for (
size_t i = 0; i < range; i += tile) {
55 function(argument, i, min(range - i, tile));
// Pool path: the number of work items is the number of tiles, rounded up.
59 const size_t tile_range = divide_round_up(range, tile);
// Dispatch one 1D work item per tile; compute_1d_tiled decodes the tile number.
66 pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
// User callback for the 2D dispatch. This appears to be a member of the context object
// read by compute_2d (the enclosing struct header is not shown here), which calls it as
// function(argument, q, r).
71 pthreadpool_function_2d_t
function;
// Per-item callback that pthreadpool_compute_2d hands to pthreadpool_compute_1d. It decodes
// a flattened index into a pair of indices and forwards them to the user function.
// The declarations of q, r and `context` are in lines not shown in this excerpt.
76 static void compute_2d(
void* context_,
size_t linear_index) {
// The index is narrowed to int32_t below, so check that it fits first.
77 DCHECK_LE(linear_index, std::numeric_limits<int32_t>::max());
// Split the flattened index by range_j. DivMod presumably stores the quotient in q
// (outer index) and the remainder in r (inner index); confirm against FixedDivisor.
82 context->range_j.DivMod(static_cast<int32_t>(linear_index), &q, &r);
83 context->function(context->argument, q, r);
// Invokes `function(argument, i, j)` for every pair in [0, range_i) x [0, range_j).
// The remaining parameters (argument, range_i, range_j) and part of the context
// initialiser are in lines not shown in this excerpt.
86 void pthreadpool_compute_2d(
87 struct pthreadpool* threadpool,
88 pthreadpool_function_2d_t
function,
// No pool supplied: iterate inline, with j varying fastest.
93 if (threadpool == NULL) {
95 for (
size_t i = 0; i < range_i; i++) {
96 for (
size_t j = 0; j < range_j; j++) {
97 function(argument, i, j);
// compute_2d narrows the flattened index to int32_t, so the total item count must fit.
// NOTE(review): the product is formed before the check and could wrap for very large
// ranges (assuming range_i and range_j are size_t); confirm this is acceptable.
101 DCHECK_LE(range_i * range_j, (
size_t)std::numeric_limits<int32_t>::max());
// Context fields consumed by compute_2d.
104 .function =
function,
105 .argument = argument,
// Flatten the 2D space into range_i * range_j one-dimensional work items.
107 pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
// User callback for the 2D tiled dispatch. This appears to be a member of the context
// object read by compute_2d_tiled (the enclosing struct header is not shown here), which
// calls it as function(argument, index_i, index_j, tile_i, tile_j).
112 pthreadpool_function_2d_tiled_t
function;
// Per-tile callback that pthreadpool_compute_2d_tiled hands to pthreadpool_compute_1d.
// It decodes a flattened tile number into tile coordinates, converts them to element
// offsets and clamped tile extents, and forwards them to the user function.
// The declarations of q, r and `context` are in lines not shown in this excerpt.
121 static void compute_2d_tiled(
void* context_,
size_t linear_index) {
// Split the flattened tile number by the tile count along j (presumably q = tile row,
// r = tile column).
// NOTE(review): unlike compute_2d and compute_3d_tiled, linear_index is passed here
// without an explicit static_cast<int32_t>; confirm the implicit conversion is intended.
126 context->tile_range_j.DivMod(linear_index, &q, &r);
127 const size_t max_tile_i = context->tile_i;
128 const size_t max_tile_j = context->tile_j;
// Element offsets of the tile's first row and column.
129 const size_t index_i = q * max_tile_i;
130 const size_t index_j = r * max_tile_j;
// Extents handed to the callback: min() of the nominal tile size and the elements
// remaining along each dimension.
131 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
132 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
133 context->function(context->argument, index_i, index_j, tile_i, tile_j);
// Invokes `function` once per (tile_i x tile_j) tile of the space
// [0, range_i) x [0, range_j), passing the tile's offsets and clamped extents.
// The remaining parameters and part of the context initialiser are in lines not shown in
// this excerpt.
// NOTE(review): a zero tile_i or tile_j is not rejected in the visible code; the inline
// loops would not advance and divide_round_up would divide by zero.
136 void pthreadpool_compute_2d_tiled(
137 pthreadpool_t threadpool,
138 pthreadpool_function_2d_tiled_t
function,
// No pool supplied: walk the tiles inline, with the j tiles varying fastest.
145 if (threadpool == NULL) {
147 for (
size_t i = 0; i < range_i; i += tile_i) {
148 for (
size_t j = 0; j < range_j; j += tile_j) {
149 function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
// Number of tiles along each dimension, rounded up.
154 const size_t tile_range_i = divide_round_up(range_i, tile_i);
155 const size_t tile_range_j = divide_round_up(range_j, tile_j);
// Operands of a bound check whose opening line is not shown (presumably DCHECK_LE, as in
// pthreadpool_compute_2d): the total tile count is compared against the int32_t maximum.
157 tile_range_i * tile_range_j,
158 (
size_t)std::numeric_limits<int32_t>::max());
// Context fields consumed by compute_2d_tiled.
160 .function =
function,
161 .argument = argument,
// Dispatch one 1D work item per tile.
167 pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
// User callback for the 3D tiled dispatch. This appears to be a member of the context
// object read by compute_3d_tiled (the enclosing struct header is not shown here).
172 pthreadpool_function_3d_tiled_t
function;
// Per-tile callback that pthreadpool_compute_3d_tiled hands to pthreadpool_compute_1d.
// It decodes a flattened tile number into (i, j, k) tile coordinates, converts them to
// element offsets and clamped extents, and passes them on.
// The first parameter and the statement deriving `context` are in lines not shown here.
184 static void compute_3d_tiled(
186 size_t linear_index) {
187 int32_t tile_index_ij, tile_index_k;
// First split by the tile count along k: presumably quotient = combined (i, j) tile
// number, remainder = k tile number. The index is narrowed to int32_t for DivMod.
189 context->tile_range_k.DivMod(
190 static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_k);
191 int32_t tile_index_i, tile_index_j;
// Second split by the tile count along j separates the i and j tile numbers.
192 context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
193 const size_t max_tile_i = context->tile_i;
194 const size_t max_tile_j = context->tile_j;
195 const size_t max_tile_k = context->tile_k;
// Element offsets: each tile number is cast to uint32_t before the size_t multiply.
196 const size_t index_i =
static_cast<uint32_t
>(tile_index_i) * max_tile_i;
197 const size_t index_j =
static_cast<uint32_t
>(tile_index_j) * max_tile_j;
198 const size_t index_k =
static_cast<uint32_t
>(tile_index_k) * max_tile_k;
// Extents: min() of the nominal tile size and the elements remaining in each dimension.
199 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
200 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
201 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
// Argument list of a call whose opening line is not shown (presumably context->function).
203 context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
// Tiled dispatch over the space [0, range_i) x [0, range_j) x [0, range_k) with tile
// sizes tile_i, tile_j and tile_k.
// The remaining parameters, the opening of the inline callback call and part of the
// context initialiser are in lines not shown in this excerpt.
// NOTE(review): zero tile sizes are not rejected in the visible code; the inline loops
// would not advance and divide_round_up would divide by zero.
206 void pthreadpool_compute_3d_tiled(
207 pthreadpool_t threadpool,
208 pthreadpool_function_3d_tiled_t
function,
// No pool supplied: walk the tiles inline, with the k tiles varying fastest.
216 if (threadpool == NULL) {
219 for (
size_t i = 0; i < range_i; i += tile_i) {
220 for (
size_t j = 0; j < range_j; j += tile_j) {
221 for (
size_t k = 0; k < range_k; k += tile_k) {
// Trailing arguments of the inline callback call: tile extents clamped to what remains.
227 min(range_i - i, tile_i),
228 min(range_j - j, tile_j),
229 min(range_k - k, tile_k));
// Number of tiles along each dimension, rounded up.
235 const size_t tile_range_i = divide_round_up(range_i, tile_i);
236 const size_t tile_range_j = divide_round_up(range_j, tile_j);
237 const size_t tile_range_k = divide_round_up(range_k, tile_k);
// Operands of a bound check whose opening line is not shown (presumably DCHECK_LE).
// NOTE(review): the limit is numeric_limits<int>, while the 2D variants use int32_t and
// compute_3d_tiled narrows to int32_t; confirm the difference is intentional.
239 tile_range_i * tile_range_j * tile_range_k,
240 (
size_t)std::numeric_limits<int>::max());
// Context fields consumed by compute_3d_tiled.
242 .function =
function,
243 .argument = argument,
// Dispatch one 1D work item per tile; only some of the call's arguments are visible.
252 pthreadpool_compute_1d(
254 (pthreadpool_function_1d_t)compute_3d_tiled,
256 tile_range_i * tile_range_j * tile_range_k);
// User callback for the 4D tiled dispatch. This appears to be a member of the context
// object read by compute_4d_tiled (the enclosing struct header is not shown here).
261 pthreadpool_function_4d_tiled_t
function;
// Per-tile callback that pthreadpool_compute_4d_tiled hands to pthreadpool_compute_1d.
// It decodes a flattened tile number into (i, j, k, l) tile coordinates and derives the
// element offsets and clamped extents for that tile.
// The first parameter, the statement deriving `context` and the final callback call are
// in lines not shown in this excerpt.
276 static void compute_4d_tiled(
278 size_t linear_index) {
279 int32_t tile_index_ij, tile_index_kl;
// First split by tile_range_kl: presumably quotient = combined (i, j) tile number,
// remainder = combined (k, l) tile number. The index is narrowed to int32_t for DivMod.
281 context->tile_range_kl.DivMod(
282 static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_kl);
283 int32_t tile_index_i, tile_index_j;
// Separate the i and j tile numbers using the tile count along j.
284 context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
285 int32_t tile_index_k, tile_index_l;
// Separate the k and l tile numbers using the tile count along l.
286 context->tile_range_l.DivMod(tile_index_kl, &tile_index_k, &tile_index_l);
287 const size_t max_tile_i = context->tile_i;
288 const size_t max_tile_j = context->tile_j;
289 const size_t max_tile_k = context->tile_k;
290 const size_t max_tile_l = context->tile_l;
// Element offsets: each tile number is cast to uint32_t before the size_t multiply.
291 const size_t index_i =
static_cast<uint32_t
>(tile_index_i) * max_tile_i;
292 const size_t index_j =
static_cast<uint32_t
>(tile_index_j) * max_tile_j;
293 const size_t index_k =
static_cast<uint32_t
>(tile_index_k) * max_tile_k;
294 const size_t index_l =
static_cast<uint32_t
>(tile_index_l) * max_tile_l;
// Extents: min() of the nominal tile size and the elements remaining in each dimension.
295 const size_t tile_i = min(max_tile_i, context->range_i - index_i);
296 const size_t tile_j = min(max_tile_j, context->range_j - index_j);
297 const size_t tile_k = min(max_tile_k, context->range_k - index_k);
298 const size_t tile_l = min(max_tile_l, context->range_l - index_l);
// Tiled dispatch over the space [0, range_i) x [0, range_j) x [0, range_k) x [0, range_l)
// with tile sizes tile_i, tile_j, tile_k and tile_l.
// The remaining parameters, the opening of the inline callback call and part of the
// context initialiser are in lines not shown in this excerpt.
// NOTE(review): zero tile sizes are not rejected in the visible code; the inline loops
// would not advance and divide_round_up would divide by zero.
311 void pthreadpool_compute_4d_tiled(
312 pthreadpool_t threadpool,
313 pthreadpool_function_4d_tiled_t
function,
// No pool supplied: walk the tiles inline, with the l tiles varying fastest.
323 if (threadpool == NULL) {
326 for (
size_t i = 0; i < range_i; i += tile_i) {
327 for (
size_t j = 0; j < range_j; j += tile_j) {
328 for (
size_t k = 0; k < range_k; k += tile_k) {
329 for (
size_t l = 0; l < range_l; l += tile_l) {
// Trailing arguments of the inline callback call: tile extents clamped to what remains.
336 min(range_i - i, tile_i),
337 min(range_j - j, tile_j),
338 min(range_k - k, tile_k),
339 min(range_l - l, tile_l));
// Number of tiles along each dimension, rounded up.
346 const size_t tile_range_i = divide_round_up(range_i, tile_i);
347 const size_t tile_range_j = divide_round_up(range_j, tile_j);
348 const size_t tile_range_k = divide_round_up(range_k, tile_k);
349 const size_t tile_range_l = divide_round_up(range_l, tile_l);
// Operands of a bound check whose opening line is not shown (presumably DCHECK_LE).
// NOTE(review): the limit is numeric_limits<int>, while the 2D variants use int32_t and
// compute_4d_tiled narrows to int32_t; confirm the difference is intentional.
351 tile_range_i * tile_range_j * tile_range_k * tile_range_l,
352 (
size_t)std::numeric_limits<int>::max());
// Context fields consumed by compute_4d_tiled.
354 .function =
function,
355 .argument = argument,
// Dispatch one 1D work item per tile; only some of the call's arguments are visible.
367 pthreadpool_compute_1d(
369 (pthreadpool_function_1d_t)compute_4d_tiled,
371 tile_range_i * tile_range_j * tile_range_k * tile_range_l);