Caffe2 - C++ API
A deep learning, cross platform ML framework
dnnlowp_partition.cc
1 #include "dnnlowp_partition.h"
2 
3 #include "caffe2/core/logging.h"
4 
5 namespace caffe2 {
6 
// Returns the amount of work assigned to each thread when `work` items are
// split across `nthreads` threads in units of `work_align` items.
//
// The work is first rounded up to a whole number of aligned chunks, the chunks
// are divided among the threads (rounding up), and the result is converted
// back to a count of items, so the return value is a multiple of work_align.
static size_t GetWorkPerThread_(size_t work, int nthreads, int work_align) {
  // ceil(work / work_align): number of aligned chunks needed to cover work.
  const size_t aligned_chunks = (work + work_align - 1) / work_align;
  // ceil(aligned_chunks / nthreads): chunks handled by a single thread.
  const size_t chunks_per_thread = (aligned_chunks + nthreads - 1) / nthreads;
  return chunks_per_thread * work_align;
}
11 
12 std::pair<size_t, size_t>
13 Get1DPartition(size_t work, int nthreads, int tid, int work_align /*=1*/) {
14  size_t work_per_thread = GetWorkPerThread_(work, nthreads, work_align);
15  size_t work_begin = std::min(tid * work_per_thread, work);
16  size_t work_end = std::min(work_begin + work_per_thread, work);
17  return {work_begin, work_end};
18 }
19 
21  int m,
22  int n,
23  int nthreads,
24  int tid,
25  int* m_begin,
26  int* m_end,
27  int* n_begin,
28  int* n_end,
29  int n_align /*=1*/) {
30  if (m >= nthreads) {
31  // When m >= nthreads, just parallelize over m.
32  std::tie(*m_begin, *m_end) = Get1DPartition(m, nthreads, tid);
33  *n_begin = 0;
34  *n_end = n;
35  } else {
36  // Otherwise, each row is parallelized by multiple threads.
37  // nthreads_per_row is floor(nthreads / m). If we use ceil, some rows won't
38  // be handled by any thread.
39  int nthreads_per_row = nthreads / m;
40  *m_begin = std::max(std::min(tid / nthreads_per_row, m - 1), 0);
41  *m_end = std::min(*m_begin + 1, m);
42 
43  int tid_of_m_begin = std::min(*m_begin * nthreads_per_row, nthreads);
44  int tid_of_m_end = std::min(
45  (*m_end == m) ? nthreads : (tid_of_m_begin + nthreads_per_row),
46  nthreads);
47  int nthreads_within_row = tid_of_m_end - tid_of_m_begin;
48  int tid_within_row = tid - tid_of_m_begin;
49  CAFFE_ENFORCE_GE(tid_within_row, 0);
50  CAFFE_ENFORCE_LT(tid_within_row, nthreads_within_row);
51 
52  std::tie(*n_begin, *n_end) =
53  Get1DPartition(n, nthreads_within_row, tid_within_row, n_align);
54  }
55 }
56 
57 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
void Get1DPartitionOf2D(int m, int n, int nthreads, int tid, int *m_begin, int *m_end, int *n_begin, int *n_end, int n_align)
1D-partition m x n 2D work.