Caffe2 - C++ API
A deep learning, cross-platform ML framework
pthreadpool.cc
1 /* Standard C headers */
2 #include <stdint.h>
3 #include <stdbool.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <assert.h>
7 
8 /* POSIX headers */
9 #include <pthread.h>
10 #include <unistd.h>
11 
12 /* Library header */
13 #include "caffe2/core/logging.h"
14 #include "caffe2/utils/fixed_divisor.h"
15 #include "caffe2/utils/threadpool/pthreadpool.h"
16 
17 
/*
 * Ceiling division: returns dividend / divisor rounded up to the next
 * integer. The divisor is used in both a division and a modulo, so it must
 * be non-zero.
 */
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
  const size_t quotient = dividend / divisor;
  const size_t remainder = dividend % divisor;
  /* A non-zero remainder means one extra, partially filled chunk. */
  return remainder != 0 ? quotient + 1 : quotient;
}
25 
/* Returns the smaller of the two sizes. */
static inline size_t min(size_t a, size_t b) {
  if (b < a) {
    return b;
  }
  return a;
}
29 
31  pthreadpool_function_1d_tiled_t function;
32  void* argument;
33  size_t range;
34  size_t tile;
35 };
36 
37 static void compute_1d_tiled(void* context_, size_t linear_index) {
38  const struct compute_1d_tiled_context* context = (compute_1d_tiled_context*) context_;
39  const size_t tile_index = linear_index;
40  const size_t index = tile_index * context->tile;
41  const size_t tile = min(context->tile, context->range - index);
42  context->function(context->argument, index, tile);
43 }
44 
45 void pthreadpool_compute_1d_tiled(
46  pthreadpool_t threadpool,
47  pthreadpool_function_1d_tiled_t function,
48  void* argument,
49  size_t range,
50  size_t tile)
51 {
52  if (threadpool == NULL) {
53  /* No thread pool provided: execute function sequentially on the calling thread */
54  for (size_t i = 0; i < range; i += tile) {
55  function(argument, i, min(range - i, tile));
56  }
57  } else {
58  /* Execute in parallel on the thread pool using linearized index */
59  const size_t tile_range = divide_round_up(range, tile);
60  struct compute_1d_tiled_context context = {
61  .function = function,
62  .argument = argument,
63  .range = range,
64  .tile = tile
65  };
66  pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range);
67  }
68 }
69 
71  pthreadpool_function_2d_t function;
72  void* argument;
74 };
75 
76 static void compute_2d(void* context_, size_t linear_index) {
77  DCHECK_LE(linear_index, std::numeric_limits<int32_t>::max());
78 
79  const struct compute_2d_context* context = static_cast<compute_2d_context*>(context_);
80  int32_t q;
81  int32_t r;
82  context->range_j.DivMod(static_cast<int32_t>(linear_index), &q, &r);
83  context->function(context->argument, q, r);
84 }
85 
86 void pthreadpool_compute_2d(
87  struct pthreadpool* threadpool,
88  pthreadpool_function_2d_t function,
89  void* argument,
90  size_t range_i,
91  size_t range_j)
92 {
93  if (threadpool == NULL) {
94  /* No thread pool provided: execute function sequentially on the calling thread */
95  for (size_t i = 0; i < range_i; i++) {
96  for (size_t j = 0; j < range_j; j++) {
97  function(argument, i, j);
98  }
99  }
100  } else {
101  DCHECK_LE(range_i * range_j, (size_t)std::numeric_limits<int32_t>::max());
102  /* Execute in parallel on the thread pool using linearized index */
103  struct compute_2d_context context = {
104  .function = function,
105  .argument = argument,
106  .range_j = caffe2::FixedDivisor<int32_t>(range_j)};
107  pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j);
108  }
109 }
110 
112  pthreadpool_function_2d_tiled_t function;
113  void* argument;
114  caffe2::FixedDivisor<int32_t> tile_range_j;
115  size_t range_i;
116  size_t range_j;
117  size_t tile_i;
118  size_t tile_j;
119 };
120 
121 static void compute_2d_tiled(void* context_, size_t linear_index) {
122  int32_t q;
123  int32_t r;
124 
125  const struct compute_2d_tiled_context* context = static_cast<compute_2d_tiled_context*>(context_);
126  context->tile_range_j.DivMod(linear_index, &q, &r);
127  const size_t max_tile_i = context->tile_i;
128  const size_t max_tile_j = context->tile_j;
129  const size_t index_i = q * max_tile_i;
130  const size_t index_j = r * max_tile_j;
131  const size_t tile_i = min(max_tile_i, context->range_i - index_i);
132  const size_t tile_j = min(max_tile_j, context->range_j - index_j);
133  context->function(context->argument, index_i, index_j, tile_i, tile_j);
134 }
135 
136 void pthreadpool_compute_2d_tiled(
137  pthreadpool_t threadpool,
138  pthreadpool_function_2d_tiled_t function,
139  void* argument,
140  size_t range_i,
141  size_t range_j,
142  size_t tile_i,
143  size_t tile_j)
144 {
145  if (threadpool == NULL) {
146  /* No thread pool provided: execute function sequentially on the calling thread */
147  for (size_t i = 0; i < range_i; i += tile_i) {
148  for (size_t j = 0; j < range_j; j += tile_j) {
149  function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j));
150  }
151  }
152  } else {
153  /* Execute in parallel on the thread pool using linearized index */
154  const size_t tile_range_i = divide_round_up(range_i, tile_i);
155  const size_t tile_range_j = divide_round_up(range_j, tile_j);
156  DCHECK_LE(
157  tile_range_i * tile_range_j,
158  (size_t)std::numeric_limits<int32_t>::max());
159  struct compute_2d_tiled_context context = {
160  .function = function,
161  .argument = argument,
162  .tile_range_j = caffe2::FixedDivisor<int32_t>(tile_range_j),
163  .range_i = range_i,
164  .range_j = range_j,
165  .tile_i = tile_i,
166  .tile_j = tile_j};
167  pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
168  }
169 }
170 
172  pthreadpool_function_3d_tiled_t function;
173  void* argument;
174  caffe2::FixedDivisor<int32_t> tile_range_j;
175  caffe2::FixedDivisor<int32_t> tile_range_k;
176  size_t range_i;
177  size_t range_j;
178  size_t range_k;
179  size_t tile_i;
180  size_t tile_j;
181  size_t tile_k;
182 };
183 
184 static void compute_3d_tiled(
185  void* context_,
186  size_t linear_index) {
187  int32_t tile_index_ij, tile_index_k;
188  const struct compute_3d_tiled_context* context = static_cast<compute_3d_tiled_context*>(context_);
189  context->tile_range_k.DivMod(
190  static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_k);
191  int32_t tile_index_i, tile_index_j;
192  context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
193  const size_t max_tile_i = context->tile_i;
194  const size_t max_tile_j = context->tile_j;
195  const size_t max_tile_k = context->tile_k;
196  const size_t index_i = static_cast<uint32_t>(tile_index_i) * max_tile_i;
197  const size_t index_j = static_cast<uint32_t>(tile_index_j) * max_tile_j;
198  const size_t index_k = static_cast<uint32_t>(tile_index_k) * max_tile_k;
199  const size_t tile_i = min(max_tile_i, context->range_i - index_i);
200  const size_t tile_j = min(max_tile_j, context->range_j - index_j);
201  const size_t tile_k = min(max_tile_k, context->range_k - index_k);
202  context->function(
203  context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
204 }
205 
206 void pthreadpool_compute_3d_tiled(
207  pthreadpool_t threadpool,
208  pthreadpool_function_3d_tiled_t function,
209  void* argument,
210  size_t range_i,
211  size_t range_j,
212  size_t range_k,
213  size_t tile_i,
214  size_t tile_j,
215  size_t tile_k) {
216  if (threadpool == NULL) {
217  /* No thread pool provided: execute function sequentially on the calling
218  * thread */
219  for (size_t i = 0; i < range_i; i += tile_i) {
220  for (size_t j = 0; j < range_j; j += tile_j) {
221  for (size_t k = 0; k < range_k; k += tile_k) {
222  function(
223  argument,
224  i,
225  j,
226  k,
227  min(range_i - i, tile_i),
228  min(range_j - j, tile_j),
229  min(range_k - k, tile_k));
230  }
231  }
232  }
233  } else {
234  /* Execute in parallel on the thread pool using linearized index */
235  const size_t tile_range_i = divide_round_up(range_i, tile_i);
236  const size_t tile_range_j = divide_round_up(range_j, tile_j);
237  const size_t tile_range_k = divide_round_up(range_k, tile_k);
238  DCHECK_LE(
239  tile_range_i * tile_range_j * tile_range_k,
240  (size_t)std::numeric_limits<int>::max());
241  struct compute_3d_tiled_context context = {
242  .function = function,
243  .argument = argument,
244  .tile_range_j = caffe2::FixedDivisor<int>(tile_range_j),
245  .tile_range_k = caffe2::FixedDivisor<int>(tile_range_k),
246  .range_i = range_i,
247  .range_j = range_j,
248  .range_k = range_k,
249  .tile_i = tile_i,
250  .tile_j = tile_j,
251  .tile_k = tile_k};
252  pthreadpool_compute_1d(
253  threadpool,
254  (pthreadpool_function_1d_t)compute_3d_tiled,
255  &context,
256  tile_range_i * tile_range_j * tile_range_k);
257  }
258 }
259 
261  pthreadpool_function_4d_tiled_t function;
262  void* argument;
263  caffe2::FixedDivisor<int32_t> tile_range_kl;
264  caffe2::FixedDivisor<int32_t> tile_range_j;
265  caffe2::FixedDivisor<int32_t> tile_range_l;
266  size_t range_i;
267  size_t range_j;
268  size_t range_k;
269  size_t range_l;
270  size_t tile_i;
271  size_t tile_j;
272  size_t tile_k;
273  size_t tile_l;
274 };
275 
276 static void compute_4d_tiled(
277  void* context_,
278  size_t linear_index) {
279  int32_t tile_index_ij, tile_index_kl;
280  const struct compute_4d_tiled_context* context = static_cast<compute_4d_tiled_context*>(context_);
281  context->tile_range_kl.DivMod(
282  static_cast<int32_t>(linear_index), &tile_index_ij, &tile_index_kl);
283  int32_t tile_index_i, tile_index_j;
284  context->tile_range_j.DivMod(tile_index_ij, &tile_index_i, &tile_index_j);
285  int32_t tile_index_k, tile_index_l;
286  context->tile_range_l.DivMod(tile_index_kl, &tile_index_k, &tile_index_l);
287  const size_t max_tile_i = context->tile_i;
288  const size_t max_tile_j = context->tile_j;
289  const size_t max_tile_k = context->tile_k;
290  const size_t max_tile_l = context->tile_l;
291  const size_t index_i = static_cast<uint32_t>(tile_index_i) * max_tile_i;
292  const size_t index_j = static_cast<uint32_t>(tile_index_j) * max_tile_j;
293  const size_t index_k = static_cast<uint32_t>(tile_index_k) * max_tile_k;
294  const size_t index_l = static_cast<uint32_t>(tile_index_l) * max_tile_l;
295  const size_t tile_i = min(max_tile_i, context->range_i - index_i);
296  const size_t tile_j = min(max_tile_j, context->range_j - index_j);
297  const size_t tile_k = min(max_tile_k, context->range_k - index_k);
298  const size_t tile_l = min(max_tile_l, context->range_l - index_l);
299  context->function(
300  context->argument,
301  index_i,
302  index_j,
303  index_k,
304  index_l,
305  tile_i,
306  tile_j,
307  tile_k,
308  tile_l);
309 }
310 
311 void pthreadpool_compute_4d_tiled(
312  pthreadpool_t threadpool,
313  pthreadpool_function_4d_tiled_t function,
314  void* argument,
315  size_t range_i,
316  size_t range_j,
317  size_t range_k,
318  size_t range_l,
319  size_t tile_i,
320  size_t tile_j,
321  size_t tile_k,
322  size_t tile_l) {
323  if (threadpool == NULL) {
324  /* No thread pool provided: execute function sequentially on the calling
325  * thread */
326  for (size_t i = 0; i < range_i; i += tile_i) {
327  for (size_t j = 0; j < range_j; j += tile_j) {
328  for (size_t k = 0; k < range_k; k += tile_k) {
329  for (size_t l = 0; l < range_l; l += tile_l) {
330  function(
331  argument,
332  i,
333  j,
334  k,
335  l,
336  min(range_i - i, tile_i),
337  min(range_j - j, tile_j),
338  min(range_k - k, tile_k),
339  min(range_l - l, tile_l));
340  }
341  }
342  }
343  }
344  } else {
345  /* Execute in parallel on the thread pool using linearized index */
346  const size_t tile_range_i = divide_round_up(range_i, tile_i);
347  const size_t tile_range_j = divide_round_up(range_j, tile_j);
348  const size_t tile_range_k = divide_round_up(range_k, tile_k);
349  const size_t tile_range_l = divide_round_up(range_l, tile_l);
350  DCHECK_LE(
351  tile_range_i * tile_range_j * tile_range_k * tile_range_l,
352  (size_t)std::numeric_limits<int>::max());
353  struct compute_4d_tiled_context context = {
354  .function = function,
355  .argument = argument,
356  .tile_range_kl = caffe2::FixedDivisor<int>(tile_range_k * tile_range_l),
357  .tile_range_j = caffe2::FixedDivisor<int>(tile_range_j),
358  .tile_range_l = caffe2::FixedDivisor<int>(tile_range_l),
359  .range_i = range_i,
360  .range_j = range_j,
361  .range_k = range_k,
362  .range_l = range_l,
363  .tile_i = tile_i,
364  .tile_j = tile_j,
365  .tile_k = tile_k,
366  .tile_l = tile_l};
367  pthreadpool_compute_1d(
368  threadpool,
369  (pthreadpool_function_1d_t)compute_4d_tiled,
370  &context,
371  tile_range_i * tile_range_j * tile_range_k * tile_range_l);
372  }
373 }