Caffe2 - C++ API
A deep learning, cross-platform ML framework
transpose.cc
1 #include "caffe2/utils/math/transpose.h"
2 
3 #include <algorithm>
4 #include <functional>
5 #include <limits>
6 #include <numeric>
7 
8 #ifdef CAFFE2_USE_MKL
9 #include <mkl.h>
10 #endif // CAFFE2_USE_MKL
11 
12 #ifdef CAFFE2_USE_HPTT
13 #include <hptt.h>
14 #endif // CAFFE2_USE_HPTT
15 
16 #include "caffe2/core/context.h"
17 #include "caffe2/utils/eigen_utils.h"
18 #include "caffe2/utils/math/utils.h"
19 
20 namespace caffe2 {
21 namespace math {
22 
23 namespace {
24 
25 template <typename TIndex, typename TData>
26 void Transpose2D(
27  const TIndex rows,
28  const TIndex cols,
29  const TData* X,
30  TData* Y) {
31  EigenMatrixMap<TData>(Y, rows, cols) =
32  ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
33 }
34 
35 #ifdef CAFFE2_USE_MKL
36 
37 #define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc) \
38  template <> \
39  void Transpose2D<TIndex, TData>( \
40  const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
41  MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows); \
42  }
43 DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy);
44 DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy);
45 DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy);
46 DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy);
47 #undef DELEGATE_TRANSPOSE_2D
48 
49 #endif // CAFFE2_USE_MKL
50 
51 #ifdef CAFFE2_USE_HPTT
52 
53 template <typename TIndex, typename TData>
54 bool TransposeByHPTT(
55  const int ndim,
56  const TIndex* dims,
57  const int* axes,
58  const TData* X,
59  TData* Y) {
60  for (int i = 0; i < ndim; ++i) {
61  if (dims[i] <= 0 || dims[i] > std::numeric_limits<int>::max()) {
62  return false;
63  }
64  }
65 
66  std::vector<int> axes_cm(ndim);
67  std::vector<int> dims_cm(ndim);
68  // Convert row-major index to column-major.
69  const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
70  for (int i = 0; i < ndim; ++i) {
71  axes_cm[i] = cm_fn(axes[cm_fn(i)]);
72  dims_cm[i] = dims[cm_fn(i)];
73  }
74  auto plan = hptt::create_plan(
75  axes_cm.data(),
76  ndim,
77  TData(1),
78  X,
79  dims_cm.data(),
80  nullptr,
81  TData(0),
82  Y,
83  nullptr,
84  hptt::ESTIMATE,
85  1 /* num_threads */);
86  if (plan == nullptr) {
87  return false;
88  }
89  plan->execute();
90  return true;
91 }
92 
93 #endif // CAFFE2_USE_HPTT
94 
95 template <typename TIndex, typename TData>
96 void TransposeND(
97  const int ndim,
98  const TIndex* dims,
99  const int* axes,
100  const TData* X,
101  TData* Y) {
102  std::vector<TIndex> Y_dims(ndim);
103  for (int i = 0; i < ndim; ++i) {
104  Y_dims[i] = dims[axes[i]];
105  }
106  // Measure amount of contiguous data we can copy at once
107  int pivot = ndim - 1;
108  TIndex block_size = 1;
109  for (; pivot >= 0 && axes[pivot] == pivot; --pivot) {
110  block_size *= Y_dims[pivot];
111  }
112  ++pivot;
113  const TIndex num_blocks = std::accumulate(
114  Y_dims.cbegin(),
115  Y_dims.cbegin() + pivot,
116  TIndex(1),
117  std::multiplies<TIndex>());
118  std::vector<TIndex> X_strides(pivot);
119  utils::ComputeTransposedStrides<TIndex>(pivot, dims, axes, X_strides.data());
120  std::vector<TIndex> index(pivot, 0);
121  for (TIndex Y_index = 0; Y_index < num_blocks; ++Y_index) {
122  const TIndex X_index = std::inner_product(
123  X_strides.cbegin(), X_strides.cend(), index.cbegin(), TIndex(0));
124  if (block_size == 1) {
125  Y[Y_index] = X[X_index];
126  } else {
127  std::memcpy(
128  Y + block_size * Y_index,
129  X + block_size * X_index,
130  block_size * sizeof(TData));
131  }
132  utils::IncreaseIndexInDims<TIndex>(pivot, Y_dims.data(), index.data());
133  }
134 }
135 
136 template <typename TIndex, typename TData>
137 void TransposeImpl(
138  const int ndim,
139  const TIndex* dims,
140  const int* axes,
141  const TData* X,
142  TData* Y) {
143  const TIndex size =
144  std::accumulate(dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
145  if (size == 0) {
146  return;
147  }
148  if (utils::IsIdentityPermutation(ndim, axes)) {
149  std::memcpy(Y, X, size * sizeof(TData));
150  return;
151  }
152  if (utils::IsBatchTranspose2D(ndim, axes)) {
153  const TIndex H = dims[ndim - 2];
154  const TIndex W = dims[ndim - 1];
155  const TIndex N = size / (H * W);
156  for (TIndex i = 0; i < N; ++i) {
157  Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W);
158  }
159  return;
160  }
161  TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
162 }
163 
164 #ifdef CAFFE2_USE_HPTT
165 
166 #define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData) \
167  template <> \
168  void TransposeImpl<TIndex, TData>( \
169  const int ndim, \
170  const TIndex* dims, \
171  const int* axes, \
172  const TData* X, \
173  TData* T) { \
174  const TIndex size = std::accumulate( \
175  dims, dims + ndim, TIndex(1), std::multiplies<TIndex>()); \
176  if (size == 0) { \
177  return; \
178  } \
179  if (utils::IsIdentityPermutation(ndim, axes)) { \
180  std::memcpy(Y, X, size * sizeof(TData)); \
181  return; \
182  } \
183  if (TransposeByHPTT(ndim, dims, axes, X, Y)) { \
184  return; \
185  } \
186  if (utils::IsBatchTranspose2D(ndim, axes)) { \
187  const TIndex H = dims[ndim - 2]; \
188  const TIndex W = dims[ndim - 1]; \
189  const TIndex N = size / (H * W); \
190  for (TIndex i = 0; i < N; ++i) { \
191  Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
192  } \
193  return; \
194  } \
195  TransposeND<TIndex, TData>(ndim, dims, axes, X, Y); \
196  }
197 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
198 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
199 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
200 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
201 #undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL
202 
203 #endif // CAFFE2_USE_HPTT
204 
205 } // namespace
206 
207 #define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData) \
208  template <> \
209  C10_EXPORT void Transpose<TIndex, TData, CPUContext>( \
210  const int ndim, \
211  const TIndex* dims, \
212  const int* axes, \
213  const TData* X, \
214  TData* Y, \
215  CPUContext* /* context */) { \
216  TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
217  }
218 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
219 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
220 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
221 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
222 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
223 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
224 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
225 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
226 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
227 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
228 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
229 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
230 #undef CAFFE2_SPECIALIZED_TRANSPOSE
231 
232 #define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \
233  template <> \
234  C10_EXPORT void NCHW2NHWC<T, CPUContext>( \
235  const int N, \
236  const int C, \
237  const int HxW, \
238  const T* X, \
239  T* Y, \
240  CPUContext* /* context */) { \
241  const int stride = C * HxW; \
242  for (int i = 0; i < N; ++i) { \
243  Transpose2D<T>(C, HxW, X + i * stride, Y + i * stride); \
244  } \
245  }
246 CAFFE2_SPECIALIZED_NCHW2NHWC(float)
247 #undef CAFFE2_SPECIALIZED_NCHW2NHWC
248 
249 #define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \
250  template <> \
251  C10_EXPORT void NHWC2NCHW<T, CPUContext>( \
252  const int N, \
253  const int C, \
254  const int HxW, \
255  const T* X, \
256  T* Y, \
257  CPUContext* /* context */) { \
258  const int stride = HxW * C; \
259  for (int i = 0; i < N; ++i) { \
260  Transpose2D<T>(HxW, C, X + i * stride, Y + i * stride); \
261  } \
262  }
263 CAFFE2_SPECIALIZED_NHWC2NCHW(float)
264 #undef CAFFE2_SPECIALIZED_NHWC2NCHW
265 
266 } // namespace math
267 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13