Caffe2 - C++ API
A deep learning, cross-platform ML framework
math_cpu.cc
17 // Implements the math functions for CPU.
18 // The implementation in this file allows us to route the underlying numerical
19 // computation library to different backends. Notably:
20 // (1) For all BLAS-related functions, one can explicitly request a BLAS backend
21 // such as MKL, OpenBLAS, or ATLAS. To see the set of supported backends
22 // currently provided, check //third_party/blas/.
23 // (2) If one chooses to link against MKL, we utilize MKL's vector math library
24 // (VML) for a few functions such as Exp and Log.
25 // (3) Fallback implementations are provided in Eigen for cross-platform
26 // support. Since Eigen is a header-only library and supports a number of
27 // platforms, it allows one to quickly port Caffe2 to different platforms
28 // where BLAS may not be present.
29 
30 #include <algorithm>
31 #include <atomic>
32 #include <chrono>
33 #include <cstring>
34 #include <numeric>
35 #include <random>
36 #include <unordered_set>
37 #include <vector>
38 
39 #include "caffe2/utils/math.h"
40 #include "caffe2/utils/cpu_neon.h"
41 #include "caffe2/core/context.h"
42 #include "Eigen/Core"
43 #include "Eigen/Dense"
44 
45 #ifdef CAFFE2_USE_MKL
46 #include <mkl.h>
47 #endif // CAFFE2_USE_MKL
48 
49 #ifdef CAFFE2_USE_HPTT
50 #include <hptt.h>
51 #endif // CAFFE2_USE_HPTT
52 
53 #if defined(_MSC_VER)
54 #include <process.h>
55 #endif
56 
57 namespace caffe2 {
58 namespace math {
59 
61 // BLAS alternatives.
62 // Depending on whether an external BLAS library has been specified, we
63 // delegate the BLAS-related Caffe2 math functions to either the CBLAS call
64 // or the Eigen implementation.
66 #ifdef CAFFE2_USE_EIGEN_FOR_BLAS
67 
68 // Caffe2 gemm provides a simpler interface to the gemm functions, with the
69 // limitation that the data has to be contiguous in memory.
70 //
71 // The gemm call implements the following operation:
72 //
73 // C = alpha * op(A) * op(B) + beta * C
74 //
75 // where op(A) has size M x K, op(B) has size K x N, and C has size M x N. Each
76 // of A, B, and C are matrices and alpha and beta are scalars. Note that the
77 // most common use case of gemm will involve setting alpha to 1 and beta to 0.
78 //
79 // op(A) and op(B) represent the transformations applied to A and B before
80 // the matrix multiply: op(A) equals A when TransA is CblasNoTrans and A^T
81 // (transpose) when TransA is CblasTrans, and op(B) is chosen the same way
82 // according to TransB.
83 template <>
84 void Gemm<float, CPUContext>(
85  const CBLAS_TRANSPOSE TransA,
86  const CBLAS_TRANSPOSE TransB,
87  const int M,
88  const int N,
89  const int K,
90  const float alpha,
91  const float* A,
92  const float* B,
93  const float beta,
94  float* C,
95  CPUContext* context,
96  TensorProto::DataType math_type) {
97  auto C_mat = EigenMatrixMap<float>(C, N, M);
98  if (beta == 0) {
99  C_mat.setZero();
100  } else {
101  C_mat *= beta;
102  }
103  switch (TransA) {
104  case CblasNoTrans: {
105  switch (TransB) {
106  case CblasNoTrans:
107  C_mat.noalias() += alpha * (
108  ConstEigenMatrixMap<float>(B, N, K) *
109  ConstEigenMatrixMap<float>(A, K, M));
110  return;
111  case CblasTrans:
112  C_mat.noalias() += alpha * (
113  ConstEigenMatrixMap<float>(B, K, N).transpose() *
114  ConstEigenMatrixMap<float>(A, K, M));
115  return;
116  default:
117  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
118  }
119  }
120  case CblasTrans: {
121  switch (TransB) {
122  case CblasNoTrans:
123  C_mat.noalias() += alpha * (
124  ConstEigenMatrixMap<float>(B, N, K) *
125  ConstEigenMatrixMap<float>(A, M, K).transpose());
126  return;
127  case CblasTrans:
128  C_mat.noalias() += alpha * (
129  ConstEigenMatrixMap<float>(B, K, N).transpose() *
130  ConstEigenMatrixMap<float>(A, M, K).transpose());
131  return;
132  default:
133  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
134  }
135  }
136  default:
137  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA";
138  }
139 }
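// Illustrative sketch (not part of the original file): one way a caller might
// invoke the Gemm specialization above. The helper name and buffers are
// hypothetical, and the trailing math_type argument is assumed to take its
// default from math.h.
//
//   static void ExampleGemmUsage(CPUContext* context) {
//     const int M = 2, N = 3, K = 4;
//     std::vector<float> A(M * K, 1.0f);  // op(A) is M x K
//     std::vector<float> B(K * N, 2.0f);  // op(B) is K x N
//     std::vector<float> C(M * N);        // C is M x N
//     // C = 1.0f * A * B + 0.0f * C; every entry of C becomes 8.0f here.
//     Gemm<float, CPUContext>(
//         CblasNoTrans, CblasNoTrans, M, N, K,
//         1.0f, A.data(), B.data(), 0.0f, C.data(), context);
//   }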
140 
141 template <>
142 void GemmEx<float, CPUContext>(
143  const CBLAS_TRANSPOSE TransA,
144  const CBLAS_TRANSPOSE TransB,
145  const int M,
146  const int N,
147  const int K,
148  const float alpha,
149  const float* A,
150  const int lda,
151  const float* B,
152  const int ldb,
153  const float beta,
154  float* C,
155  const int ldc,
156  CPUContext*) {
157  using OuterStride = Eigen::OuterStride<Eigen::Dynamic>;
158  using StridedMap = Eigen::Map<Eigen::MatrixXf, 0, OuterStride>;
159  using ConstStridedMap = Eigen::Map<const Eigen::MatrixXf, 0, OuterStride>;
160  auto C_mat = StridedMap(C, N, M, OuterStride(ldc));
161  if (beta == 0) {
162  C_mat.setZero();
163  } else {
164  C_mat *= beta;
165  }
166  switch (TransA) {
167  case CblasNoTrans: {
168  switch (TransB) {
169  case CblasNoTrans:
170  C_mat.noalias() +=
171  alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
172  ConstStridedMap(A, K, M, OuterStride(lda)));
173  return;
174  case CblasTrans:
175  C_mat.noalias() +=
176  alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
177  ConstStridedMap(A, K, M, OuterStride(lda)));
178  return;
179  default:
180  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
181  }
182  }
183  case CblasTrans: {
184  switch (TransB) {
185  case CblasNoTrans:
186  C_mat.noalias() +=
187  alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
188  ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
189  return;
190  case CblasTrans:
191  C_mat.noalias() +=
192  alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
193  ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
194  return;
195  default:
196  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
197  }
198  }
199  default:
200  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA";
201  }
202 }
203 
204 template <>
205 void Gemv<float, CPUContext>(
206  const CBLAS_TRANSPOSE TransA,
207  const int M,
208  const int N,
209  const float alpha,
210  const float* A,
211  const float* x,
212  const float beta,
213  float* y,
214  CPUContext* context,
215  TensorProto::DataType math_type) {
216  EigenVectorMap<float> y_vec(y, TransA == CblasNoTrans ? M : N);
217  if (beta == 0) {
218  // In Caffe2 we often lazily initialize buffers, which may leave NaNs in the
219  // float values. As a result, if beta is 0, we explicitly call setZero.
220  y_vec.setZero();
221  } else {
222  y_vec *= beta;
223  }
224  switch (TransA) {
225  case CblasNoTrans: {
226  y_vec.noalias() += alpha * (
227  ConstEigenMatrixMap<float>(A, N, M).transpose() *
228  ConstEigenVectorMap<float>(x, N));
229  return;
230  }
231  case CblasTrans: {
232  y_vec.noalias() += alpha * (
233  ConstEigenMatrixMap<float>(A, N, M) *
234  ConstEigenVectorMap<float>(x, M));
235  return;
236  }
237  default:
238  LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
239  }
240 }
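// Illustrative sketch (not part of the original file): with
// TransA == CblasNoTrans, the Gemv specialization above computes
// y = alpha * A * x + beta * y for a row-major M x N matrix A, an x of
// length N and a y of length M. The helper name is hypothetical and
// math_type is assumed to default as declared in math.h.
//
//   static void ExampleGemvUsage(
//       const float* A, const float* x, float* y, int M, int N,
//       CPUContext* context) {
//     // y = A * x; beta = 0 discards any stale values in y.
//     Gemv<float, CPUContext>(
//         CblasNoTrans, M, N, 1.0f, A, x, 0.0f, y, context);
//   }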
241 
242 #define CAFFE2_SPECIALIZED_SCALE(T) \
243  template <> \
244  void Scale<T, CPUContext>( \
245  const int n, const float alpha, const T* x, T* y, CPUContext* context) { \
246  EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \
247  } \
248  template <> \
249  void Scale<T, CPUContext>( \
250  const int n, \
251  const float* alpha, \
252  const T* x, \
253  T* y, \
254  CPUContext* context) { \
255  EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha); \
256  }
257 CAFFE2_SPECIALIZED_SCALE(float)
258 #undef CAFFE2_SPECIALIZED_SCALE
259 
260 #define CAFFE2_SPECIALIZED_DOT(T) \
261 template<> \
262 void Dot<T, CPUContext>( \
263  const int N, const T* a, const T* b, T* y, \
264  CPUContext* context) { \
265  *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
266 }
267 CAFFE2_SPECIALIZED_DOT(float)
268 #undef CAFFE2_SPECIALIZED_DOT
269 
270 #define CAFFE2_SPECIALIZED_AXPY(T) \
271  template <> \
272  void Axpy<T, CPUContext>( \
273  const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
274  EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
275  } \
276  template <> \
277  void Axpy<T, CPUContext>( \
278  const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
279  EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \
280  }
281 CAFFE2_SPECIALIZED_AXPY(float)
282 #undef CAFFE2_SPECIALIZED_AXPY
283 
284 #define CAFFE2_SPECIALIZED_AXPBY(T) \
285 template <> \
286 void Axpby<T, CPUContext>(const int N, const T alpha, const T* x, \
287  const T beta, T* y, CPUContext* context) { \
288  EigenVectorMap<T> y_vec(y, N); \
289  y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha; \
290 }
291 CAFFE2_SPECIALIZED_AXPBY(float)
292 #undef CAFFE2_SPECIALIZED_AXPBY
293 
294 #else // CAFFE2_USE_EIGEN_FOR_BLAS
295 
296 template <>
297 void Gemm<float, CPUContext>(
298  const CBLAS_TRANSPOSE TransA,
299  const CBLAS_TRANSPOSE TransB,
300  const int M,
301  const int N,
302  const int K,
303  const float alpha,
304  const float* A,
305  const float* B,
306  const float beta,
307  float* C,
308  CPUContext* /*context*/,
309  TensorProto::DataType /*math_type*/) {
310  int lda = (TransA == CblasNoTrans) ? K : M;
311  int ldb = (TransB == CblasNoTrans) ? N : K;
312  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
313  beta, C, N);
314 }
315 
316 template <>
317 void GemmEx<float, CPUContext>(
318  const CBLAS_TRANSPOSE TransA,
319  const CBLAS_TRANSPOSE TransB,
320  const int M,
321  const int N,
322  const int K,
323  const float alpha,
324  const float* A,
325  const int lda,
326  const float* B,
327  const int ldb,
328  const float beta,
329  float* C,
330  const int ldc,
331  CPUContext* /*context*/) {
332  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
333  beta, C, ldc);
334 }
335 
336 template <>
337 void Gemv<float, CPUContext>(
338  const CBLAS_TRANSPOSE TransA,
339  const int M,
340  const int N,
341  const float alpha,
342  const float* A,
343  const float* x,
344  const float beta,
345  float* y,
346  CPUContext* /*context*/,
347  TensorProto::DataType /*math_type*/) {
348  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
349 }
350 
351 #define CAFFE2_SPECIALIZED_SCALE(T, prefix) \
352  template <> \
353  void Scale<T, CPUContext>( \
354  const int n, const float alpha, const T* x, T* y, CPUContext*) { \
355  if (y != x) \
356  cblas_##prefix##copy(n, x, 1, y, 1); \
357  cblas_##prefix##scal(n, static_cast<float>(alpha), y, 1); \
358  } \
359  template <> \
360  void Scale<T, CPUContext>( \
361  const int n, const float* alpha, const T* x, T* y, CPUContext*) { \
362  if (y != x) \
363  cblas_##prefix##copy(n, x, 1, y, 1); \
364  cblas_##prefix##scal(n, static_cast<float>(*alpha), y, 1); \
365  }
366 CAFFE2_SPECIALIZED_SCALE(float, s)
367 #undef CAFFE2_SPECIALIZED_SCALE
368 
369 #define CAFFE2_SPECIALIZED_DOT(T, prefix) \
370  template <> \
371  void Dot<T, CPUContext>( \
372  const int N, const T* a, const T* b, T* y, CPUContext*) { \
373  *y = cblas_##prefix##dot(N, a, 1, b, 1); \
374  }
375 CAFFE2_SPECIALIZED_DOT(float, s)
376 #undef CAFFE2_SPECIALIZED_DOT
377 
378 #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
379  template <> \
380  void Axpy<T, CPUContext>( \
381  const int N, const T alpha, const T* x, T* y, CPUContext*) { \
382  cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
383  } \
384  template <> \
385  void Axpy<T, CPUContext>( \
386  const int N, const T* alpha, const T* x, T* y, CPUContext*) { \
387  cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
388  }
389 CAFFE2_SPECIALIZED_AXPY(float, s)
390 #undef CAFFE2_SPECIALIZED_AXPY
391 
392 // cblas_[sd]axpby is not a standard BLAS function, so if MKL is not present
393 // we need to implement it ourselves.
394 #ifdef CAFFE2_USE_MKL
395 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
396  template <> \
397  void Axpby<T, CPUContext>( \
398  const int N, \
399  const T alpha, \
400  const T* x, \
401  const T beta, \
402  T* y, \
403  CPUContext*) { \
404  cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \
405  }
406 #else // CAFFE2_USE_MKL
407 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
408  template <> \
409  void Axpby<T, CPUContext>( \
410  const int N, \
411  const T alpha, \
412  const T* x, \
413  const T beta, \
414  T* y, \
415  CPUContext*) { \
416  cblas_##prefix##scal(N, beta, y, 1); \
417  cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
418  }
419 #endif // CAFFE2_USE_MKL
420 CAFFE2_SPECIALIZED_AXPBY(float, s)
421 #undef CAFFE2_SPECIALIZED_AXPBY
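// Illustrative note (not part of the original file): Axpby computes
// y := alpha * x + beta * y. Without MKL, the fallback above reproduces this
// with two standard BLAS calls: scal first rescales y := beta * y, then axpy
// adds y := alpha * x + y. For example, with alpha = 2, beta = 3, x = {1, 1}
// and y = {10, 20}, scal gives y = {30, 60} and axpy then gives y = {32, 62},
// which matches computing alpha * x + beta * y directly.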
422 
423 #endif // CAFFE2_USE_EIGEN_FOR_BLAS
424 
425 template <>
426 void GemmBatched<float, CPUContext>(
427  const CBLAS_TRANSPOSE TransA,
428  const CBLAS_TRANSPOSE TransB,
429  const int batch_size,
430  const int M,
431  const int N,
432  const int K,
433  const float alpha,
434  const float* A,
435  const float* B,
436  const float beta,
437  float* C,
438  CPUContext* context,
439  Tensor<CPUContext>*, /* scratch */
440  TensorProto::DataType /* math_type */) {
441  const int a_stride = M * K;
442  const int b_stride = K * N;
443  const int c_stride = M * N;
444 
445 #ifdef CAFFE2_USE_MKL
446  (void)context;
447 
448  const int lda = (TransA == CblasNoTrans) ? K : M;
449  const int ldb = (TransB == CblasNoTrans) ? N : K;
450  std::vector<const float*> a_array(batch_size, nullptr);
451  std::vector<const float*> b_array(batch_size, nullptr);
452  std::vector<float*> c_array(batch_size, nullptr);
453  for (int i = 0; i < batch_size; ++i) {
454  a_array[i] = A + a_stride * i;
455  b_array[i] = B + b_stride * i;
456  c_array[i] = C + c_stride * i;
457  }
458  cblas_sgemm_batch(
459  CblasRowMajor,
460  &TransA,
461  &TransB,
462  &M,
463  &N,
464  &K,
465  &alpha,
466  a_array.data(),
467  &lda,
468  b_array.data(),
469  &ldb,
470  &beta,
471  c_array.data(),
472  &N, // ldc_array
473  1,
474  &batch_size);
475 #else // CAFFE2_USE_MKL
476  // loop over matrices in the batch
477  for (int i = 0; i < batch_size; ++i) {
478  math::Gemm<float, CPUContext>(
479  TransA,
480  TransB,
481  M,
482  N,
483  K,
484  alpha,
485  A + a_stride * i,
486  B + b_stride * i,
487  beta,
488  C + c_stride * i,
489  context);
490  }
491 #endif
492 }
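// Illustrative sketch (not part of the original file): GemmBatched assumes a
// densely packed batch, i.e. matrix i starts at A + i * M * K, B + i * K * N
// and C + i * M * N. The helper name below is hypothetical, and the trailing
// scratch/math_type arguments are assumed to take their defaults from math.h.
//
//   static void ExampleGemmBatchedUsage(
//       const float* A, const float* B, float* C,
//       int batch_size, int M, int N, int K, CPUContext* context) {
//     // For every i in [0, batch_size): C_i = A_i * B_i.
//     GemmBatched<float, CPUContext>(
//         CblasNoTrans, CblasNoTrans, batch_size, M, N, K,
//         1.0f, A, B, 0.0f, C, context);
//   }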
493 
495 // MKL VML alternatives.
496 // Depending on whether we are using MKL, we delegate the VML-related Caffe2
497 // math functions to either the VML call or the Eigen implementation. If the
498 // compiler flags (such as AVX) are set correctly for your CPU architecture,
499 // Eigen usually delivers throughput comparable to that of the VML
500 // functions.
502 #ifdef CAFFE2_USE_MKL
503 
504 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \
505  template <> \
506  void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
507  OriginalFunc(N, x, y, ##__VA_ARGS__); \
508  }
509 DELEGATE_SIMPLE_UNARY_FUNCTION(
510  float,
511  Exp,
512  vmsExp,
513  VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
514 DELEGATE_SIMPLE_UNARY_FUNCTION(
515  double,
516  Exp,
517  vmdExp,
518  VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
519 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn)
520 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn)
521 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, vsCos)
522 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, vdCos)
523 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, vsSin)
524 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, vdSin)
525 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, vsAbs)
526 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, vdAbs)
527 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, vsSqrt)
528 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, vdSqrt)
529 DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, vsInvSqrt)
530 DELEGATE_SIMPLE_UNARY_FUNCTION(double, InvSqrt, vdInvSqrt)
531 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr)
532 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr)
533 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
534 
535 #define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \
536  template <> \
537  void SinCos<T, CPUContext>( \
538  const int N, const T* a, T* ys, T* yc, CPUContext*) { \
539  OriginalFunc(N, a, ys, yc); \
540  }
541 DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
542 DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
543 #undef DELEGATE_SINCOS_FUNCTION
544 
545 #define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \
546  template <> \
547  void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \
548  OriginalFunc(N, a, b, y); \
549  }
550 DELEGATE_POWX_FUNCTION(float, vsPowx)
551 DELEGATE_POWX_FUNCTION(double, vdPowx)
552 #undef DELEGATE_POWX_FUNCTION
553 
554 #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
555  template <> \
556  void Funcname<T, CPUContext>( \
557  const int N, const T* a, const T* b, T* y, CPUContext*) { \
558  OriginalFunc(N, a, b, y); \
559  }
560 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
561 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
562 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub)
563 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub)
564 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul)
565 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul)
566 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
567 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
568 #undef DELEGATE_SIMPLE_BINARY_FUNCTION
569 
570 #else // CAFFE2_USE_MKL
571 
572 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \
573  template <> \
574  void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
575  EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(x, N).array().expr(); \
576  }
577 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
578 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
579 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos)
580 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin)
581 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, abs)
582 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt)
583 DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, rsqrt)
584 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
585 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
586 
587 #define DELEGATE_SINCOS_FUNCTION(T) \
588  template <> \
589  void SinCos<T, CPUContext>( \
590  const int N, const T* x, T* ys, T* yc, CPUContext*) { \
591  EigenVectorMap<T>(ys, N) = ConstEigenVectorMap<T>(x, N).array().sin(); \
592  EigenVectorMap<T>(yc, N) = ConstEigenVectorMap<T>(x, N).array().cos(); \
593  }
594 DELEGATE_SINCOS_FUNCTION(float)
595 DELEGATE_SINCOS_FUNCTION(double)
596 #undef DELEGATE_SINCOS_FUNCTION
597 
598 #define DELEGATE_POWX_FUNCTION(T) \
599  template <> \
600  void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \
601  EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(a, N).array().pow(b); \
602  }
603 DELEGATE_POWX_FUNCTION(float)
604 #undef DELEGATE_POWX_FUNCTION
605 
606 #endif // CAFFE2_USE_MKL
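// Illustrative sketch (not part of the original file): whichever backend is
// compiled in (the MKL VML delegations above or the Eigen expressions
// otherwise), callers use the same element-wise entry points. The helper name
// below is hypothetical.
//
//   static void ExampleUnaryUsage(
//       const int N, const float* x, float* y, CPUContext* context) {
//     Exp<float, CPUContext>(N, x, y, context);         // y[i] = exp(x[i])
//     Powx<float, CPUContext>(N, x, 2.0f, y, context);  // y[i] = x[i]^2
//   }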
607 
608 
609 #define EIGEN_SIMPLE_BINARY_FUNCTION(T, Funcname, expr) \
610 template <> \
611 void Funcname<T, CPUContext>( \
612  const int N, const T* a, const T* b, T* y, \
613  CPUContext*) { \
614  EigenVectorMap<T>(y, N) = \
615  ConstEigenVectorMap<T>(a, N).array() expr \
616  ConstEigenVectorMap<T>(b, N).array(); \
617 }
618 
619 #ifdef CAFFE2_USE_MKL
620 
621 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \
622 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \
623 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
624 
625 #else
626 
627 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \
628 EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \
629 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \
630 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
631 
632 #endif
633 
634 DEFINE_SIMPLE_BINARY_FUNCTION(Add, +)
635 DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -)
636 DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *)
637 DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
638 
639 #undef EIGEN_SIMPLE_BINARY_FUNCTION
640 #undef DEFINE_SIMPLE_BINARY_FUNCTION
641 
642 
644 // Common math functions used in Caffe2 that do not have a BLAS or MKL
645 // equivalent. For all these functions, we simply implement them either via
646 // Eigen or via custom code.
648 
649 #define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
650  template <> \
651  void ReduceMin<T, CPUContext>( \
652  const int N, \
653  const T* x, \
654  T* y, \
655  Tensor<CPUContext>* /*scratch_ptr*/, \
656  CPUContext* /*context*/) { \
657  *y = *std::min_element(x, x + N); \
658  }
659 CAFFE2_SPECIALIZED_REDUCEMIN(float)
660 #undef CAFFE2_SPECIALIZED_REDUCEMIN
661 
662 #define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
663  template <> \
664  void ReduceMax<T, CPUContext>( \
665  const int N, \
666  const T* x, \
667  T* y, \
668  Tensor<CPUContext>* /*scratch_ptr*/, \
669  CPUContext* /*context*/) { \
670  *y = *std::max_element(x, x + N); \
671  }
672 CAFFE2_SPECIALIZED_REDUCEMAX(float)
673 CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
674 CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
675 
676 #undef CAFFE2_SPECIALIZED_REDUCEMAX
677 
678 #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \
679  template <> \
680  void RowwiseMax<T, CPUContext>( \
681  const int N, const int D, const T* x, T* y, CPUContext*) { \
682  EigenVectorMap<T>(y, N) = \
683  ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \
684  }
685 CAFFE2_SPECIALIZED_ROWWISEMAX(float)
686 #undef CAFFE2_SPECIALIZED_ROWWISEMAX
687 
688 #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \
689  template <> \
690  void ColwiseMax<T, CPUContext>( \
691  const int N, const int D, const T* x, T* y, CPUContext*) { \
692  EigenVectorMap<T>(y, D) = \
693  ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \
694  }
695 CAFFE2_SPECIALIZED_COLWISEMAX(float)
696 #undef CAFFE2_SPECIALIZED_COLWISEMAX
697 
698 #define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \
699  template <> \
700  void ElemwiseMax<T, CPUContext>( \
701  const int N, const T* x, const T* y, T* z, CPUContext* /*context*/) { \
702  std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \
703  return std::max(x_i, y_i); \
704  }); \
705  }
706 CAFFE2_SPECIALIZED_ELEMWISEMAX(float)
707 #undef CAFFE2_SPECIALIZED_ELEMWISEMAX
708 
709 #define CAFFE2_SPECIALIZED_MAXIMUM(T) \
710  template <> \
711  void Maximum<T, CPUContext>( \
712  const int N, const float alpha, const T* x, T* y, CPUContext* context) { \
713  std::transform( \
714  x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \
715  }
716 CAFFE2_SPECIALIZED_MAXIMUM(float)
717 #undef CAFFE2_SPECIALIZED_MAXIMUM
718 
719 // AddToRow and AddToCol add the corresponding row/col vector b to the matrix a
720 // of shape M x N. The implementation uses Eigen, which is column-major, so note
721 // the row/column swap below; a small worked example follows the macros.
722 #define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr) \
723  template <> \
724  void Funcname##ToRow<T, CPUContext>( \
725  const int M, const int N, const T* a, const T* b, T* y, CPUContext*) { \
726  EigenArrayMap<T>(y, N, M) = ConstEigenArrayMap<T>(a, N, M).colwise() \
727  expr ConstEigenVectorArrayMap<T>(b, N); \
728  } \
729  /* inplace versions */ \
730  template <> \
731  void Funcname##ToRow<T, CPUContext>( \
732  const int M, const int N, const T* x, T* y, CPUContext*) { \
733  EigenArrayMap<T>(y, N, M).colwise() expr## = \
734  ConstEigenVectorArrayMap<T>(x, N); \
735  } \
736  template <> \
737  void Funcname##ToCol<T, CPUContext>( \
738  const int M, const int N, const T* x, T* y, CPUContext*) { \
739  EigenArrayMap<T>(y, N, M).rowwise() expr## = \
740  ConstEigenVectorArrayMap<T>(x, M).transpose(); \
741  }
742 
743 #define DEFINE_BROADCAST_BINARY_FUNCTION(name, op) \
744  DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \
745  DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \
746  DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \
747 
748 DEFINE_BROADCAST_BINARY_FUNCTION(Add, +)
749 DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -)
750 DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *)
751 DEFINE_BROADCAST_BINARY_FUNCTION(Div, /)
752 
753 #undef DEFINE_BROADCAST_BINARY_FUNCTION
754 #undef DELEGATE_BROADCAST_BINARY_FUNCTION
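// Illustrative note (not part of the original file): for an M x N row-major
// matrix a and a length-N vector b, AddToRow computes
// y[i * N + j] = a[i * N + j] + b[j], i.e. b is broadcast across the rows;
// AddToCol broadcasts a length-M vector across the columns instead. For
// example, with M = 2, N = 3, a = {0, 1, 2, 3, 4, 5} and b = {10, 20, 30},
// AddToRow yields y = {10, 21, 32, 13, 24, 35}.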
755 
756 #define CAFFE2_SPECIALIZED_SET(T) \
757  template <> \
758  void Set<T, CPUContext>(const size_t N, const T alpha, T* Y, CPUContext*) { \
759  if (alpha == (T)0) { \
760  memset(Y, 0, N * sizeof(T)); \
761  } else { \
762  EigenVectorMap<T>(Y, N).setConstant(alpha); \
763  } \
764  }
765 
766 CAFFE2_SPECIALIZED_SET(float);
767 CAFFE2_SPECIALIZED_SET(double);
768 CAFFE2_SPECIALIZED_SET(int8_t);
769 CAFFE2_SPECIALIZED_SET(int16_t);
770 CAFFE2_SPECIALIZED_SET(int);
771 CAFFE2_SPECIALIZED_SET(int64_t);
772 CAFFE2_SPECIALIZED_SET(bool);
773 CAFFE2_SPECIALIZED_SET(char);
774 CAFFE2_SPECIALIZED_SET(uint8_t);
775 CAFFE2_SPECIALIZED_SET(uint16_t);
776 #undef CAFFE2_SPECIALIZED_SET
777 
778 #define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \
779  template <> \
780  void name<T, CPUContext>( \
781  const int n, const T* a, const T* b, bool* y, CPUContext*) { \
782  for (int i = 0; i < n; ++i) { \
783  y[i] = a[i] op b[i]; \
784  } \
785  } \
786  template <> \
787  void name##ToRow<T, CPUContext>( \
788  const int m, \
789  const int n, \
790  const T* a, \
791  const T* b, \
792  bool* y, \
793  CPUContext*) { \
794  for (int i = 0; i < n * m; ++i) { \
795  y[i] = a[i] op b[i % n]; \
796  } \
797  }
798 
799 #define CAFFE2_DEFINE_BINARY_OP(name, op) \
800  CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \
801  CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \
802  CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t)
803 
804 CAFFE2_DEFINE_BINARY_OP(LT, <);
805 CAFFE2_DEFINE_BINARY_OP(LE, <=);
806 CAFFE2_DEFINE_BINARY_OP(GT, >);
807 CAFFE2_DEFINE_BINARY_OP(GE, >=);
808 
809 CAFFE2_INSTANTIATE_BINARY_OP(Or, |, bool);
810 CAFFE2_INSTANTIATE_BINARY_OP(And, &, bool);
811 CAFFE2_INSTANTIATE_BINARY_OP(Xor, ^, bool);
812 
813 template <>
814 void Not<bool, CPUContext>(
815  const int n,
816  const bool* x,
817  bool* y,
818  CPUContext* /*context*/) {
819  for (int i = 0; i < n; ++i) {
820  y[i] = !x[i];
821  }
822 }
823 
824 #undef CAFFE2_DEFINE_BINARY_OP
825 #undef CAFFE2_INSTANTIATE_BINARY_OP
826 
827 #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \
828  template <> \
829  void AddStripedBatch( \
830  const int N, \
831  const T* first, \
832  T* y, \
833  const int stripe, \
834  const int batch, \
835  CPUContext* context) { \
836  for (int j = 0; j < batch; j++) { \
837  Add<T, CPUContext>(N, first + j * stripe, y, y, context); \
838  } \
839  }
840 
841 CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
842 #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
843 
844 template <>
845 void RandUniform<float, CPUContext>(
846  const size_t n,
847  const float a,
848  const float b,
849  float* r,
850  CPUContext* context) {
851  std::uniform_real_distribution<float> distribution(a, b);
852  for (auto i = 0; i < n; ++i) {
853  r[i] = distribution(context->RandGenerator());
854  }
855 }
856 
857 template <>
858 void RandUniform<int, CPUContext>(
859  const size_t n,
860  const int a,
861  const int b,
862  int* r,
863  CPUContext* context) {
864  std::uniform_int_distribution<int> distribution(a, b);
865  for (auto i = 0; i < n; ++i) {
866  r[i] = distribution(context->RandGenerator());
867  }
868 }
869 
870 #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \
871  template <> \
872  void RandUniformUnique<T, CPUContext>( \
873  const size_t n, \
874  const T a, \
875  const T b, \
876  T* r, \
877  const size_t m, \
878  const T* avoid, \
879  CPUContext* context) { \
880  CAFFE_ENFORCE_LE( \
881  n, b - a - m + 1, "Cannot satisfy the unique requirement"); \
882  std::unordered_set<T> avoid_set(n); \
883  if (m) { \
884  avoid_set.insert(avoid, avoid + m); \
885  CAFFE_ENFORCE_EQ(m, avoid_set.size(), "Avoid should be unique"); \
886  } \
887  std::uniform_int_distribution<T> distribution(a, b); \
888  T v = 0; \
889  for (size_t i = 0; i < n; ++i) { \
890  do { \
891  v = distribution(context->RandGenerator()); \
892  } while (avoid_set.count(v)); \
893  r[i] = v; \
894  avoid_set.insert(v); \
895  } \
896  }
897 
898 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
899 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
900 #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE
901 
902 template <>
903 void RandGaussian<float, CPUContext>(
904  const size_t n,
905  const float mean,
906  const float std,
907  float* r,
908  CPUContext* context) {
909  std::normal_distribution<float> distribution(mean, std);
910  for (auto i = 0; i < n; ++i) {
911  r[i] = distribution(context->RandGenerator());
912  }
913 }
914 
915 #define CAFFE2_SPECIALIZED_SUM(T) \
916  template <> \
917  void Sum<T, CPUContext>( \
918  const int N, \
919  const T* x, \
920  T* y, \
921  CPUContext* /* unused */, \
922  Tensor<CPUContext>* /* unused */) { \
923  *y = ConstEigenVectorMap<T>(x, N).sum(); \
924  }
925 
926 CAFFE2_SPECIALIZED_SUM(float);
927 CAFFE2_SPECIALIZED_SUM(int32_t);
928 CAFFE2_SPECIALIZED_SUM(int64_t);
929 
930 #undef CAFFE2_SPECIALIZED_SUM
931 
932 template <>
933 void SumSqr<float, CPUContext>(
934  const int N,
935  const float* x,
936  float* y,
937  CPUContext* /*context*/ /* unused */,
938  Tensor<CPUContext>* /*scratch_ptr*/ /* unused */) {
939  *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
940 }
941 
942 template <>
943 void Select<float, CPUContext>(
944  const int N,
945  const int D,
946  const float* x,
947  const int* idx,
948  float* y,
949  CPUContext* /*context*/) {
950  for (int i = 0; i < N; ++i) {
951  DCHECK_LT(idx[i], D);
952  y[i] = x[i * D + idx[i]];
953  }
954 }
955 // Ported from Caffe 1.
956 template <>
957 void Im2colNd<float, CPUContext, StorageOrder::NCHW>(
958  const float* data_img,
959  const int* im_shape,
960  const int* col_shape,
961  const int /* img_size*/,
962  const int /* col_size*/,
963  const int* kernel_shape,
964  const int* stride,
965  const int* dilation,
966  const int* pad,
967  const int N,
968  float* data_col,
969  CPUContext* /* context */,
970  bool accumulate_output) {
971  int kernel_size = 1;
972  for (int i = 0; i < N; ++i) {
973  kernel_size *= kernel_shape[i];
974  }
975  const int channels_col = col_shape[0];
976  vector<int> d_offset(N, 0);
977  vector<int> d_iter(N, 0);
978  for (int c_col = 0; c_col < channels_col; ++c_col) {
979  // Loop over spatial axes in reverse order to compute a per-axis offset.
980  int offset = c_col;
981  for (int d_i = N - 1; d_i >= 0; --d_i) {
982  if (d_i < N - 1) {
983  offset /= kernel_shape[d_i + 1];
984  }
985  d_offset[d_i] = offset % kernel_shape[d_i];
986  }
987  for (bool incremented = true; incremented;) {
988  // Loop over spatial axes in forward order to compute the indices in the
989  // image and column, and whether the index lies in the padding.
990  int index_col = c_col;
991  int index_im = c_col / kernel_size;
992  bool is_padding = false;
993  for (int d_i = 0; d_i < N; ++d_i) {
994  const int d = d_iter[d_i];
995  const int d_im =
996  d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
997  is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
998  index_col *= col_shape[d_i + 1];
999  index_col += d;
1000  index_im *= im_shape[d_i + 1];
1001  index_im += d_im;
1002  }
1003  if (!accumulate_output) {
1004  if (is_padding) {
1005  data_col[index_col] = 0;
1006  } else {
1007  data_col[index_col] = data_img[index_im];
1008  }
1009  } else if (!is_padding) { // col2im
1010  data_col[index_im] += data_img[index_col];
1011  }
1012  // Loop over spatial axes in reverse order to choose an index,
1013  // like counting.
1014  incremented = false;
1015  for (int d_i = N - 1; d_i >= 0; --d_i) {
1016  const int d_max = col_shape[d_i + 1];
1017  DCHECK_LT(d_iter[d_i], d_max);
1018  if (d_iter[d_i] == d_max - 1) {
1019  d_iter[d_i] = 0;
1020  } else { // d_iter[d_i] < d_max - 1
1021  ++d_iter[d_i];
1022  incremented = true;
1023  break;
1024  }
1025  }
1026  } // while(incremented) {
1027  } // for (int c = 0; c < channels_col; ++c) {
1028 }
1029 
1030 template <>
1031 void Col2imNd<float, CPUContext, StorageOrder::NCHW>(
1032  const float* data_col,
1033  const int* img_shape,
1034  const int* col_shape,
1035  const int img_size,
1036  const int col_size,
1037  const int* kernel_shape,
1038  const int* stride,
1039  const int* dilation,
1040  const int* pad,
1041  const int N,
1042  float* data_img,
1043  CPUContext* context) {
1044  Set<float, CPUContext>(img_size, 0, data_img, context);
1045  Im2colNd<float, CPUContext, StorageOrder::NCHW>(
1046  data_col,
1047  img_shape,
1048  col_shape,
1049  img_size,
1050  col_size,
1051  kernel_shape,
1052  stride,
1053  dilation,
1054  pad,
1055  N,
1056  data_img,
1057  context,
1058  true);
1059 }
1060 
1061 template <>
1062 void Im2col<float, CPUContext, StorageOrder::NCHW>(
1063  const float* data_im,
1064  const int channels,
1065  const int height,
1066  const int width,
1067  const int kernel_h,
1068  const int kernel_w,
1069  const int dilation_h,
1070  const int dilation_w,
1071  const int pad_t,
1072  const int pad_l,
1073  const int pad_b,
1074  const int pad_r,
1075  const int stride_h,
1076  const int stride_w,
1077  float* data_col,
1078  CPUContext* /*context*/) {
1079  const int output_h =
1080  (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1081  1;
1082  const int output_w =
1083  (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1084  1;
1085 
1086  // Fast path for zero padding and no dilation
1087  // From Torch, THNN_(unfolded_copy)
1088  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
1089  pad_t == 0 && pad_b == 0) {
1090  for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
1091  const auto nip = k / (kernel_h * kernel_w);
1092  const auto rest = k % (kernel_h * kernel_w);
1093  const auto kh = rest / kernel_w;
1094  const auto kw = rest % kernel_w;
1095  auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
1096  kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
1097  const auto* src = data_im + nip * (height * width);
1098  for (auto y = 0; y < output_h; y++) {
1099  const auto iy = y * stride_h + kh;
1100  const auto ix = kw;
1101  if (stride_w == 1) {
1102  memcpy(
1103  dst + (y * output_w),
1104  src + (iy * width + ix),
1105  sizeof(float) * output_w);
1106  } else {
1107  for (auto x = 0; x < output_w; x++) {
1108  memcpy(
1109  dst + (y * output_w + x),
1110  src + (iy * width + ix + x * stride_w),
1111  sizeof(float));
1112  }
1113  }
1114  }
1115  }
1116  return;
1117  }
1118 
1119  // Fast path for equal padding
1120  if (pad_l == pad_r && pad_t == pad_b) {
1121  // From Intel, https://github.com/BVLC/caffe/pull/3536
1122  const int pad_h = pad_t;
1123  const int pad_w = pad_l;
1124  const int channel_size = height * width;
1125  for (int channel = channels; channel--; data_im += channel_size) {
1126  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1127  for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1128  int input_row = -pad_h + kernel_row * dilation_h;
1129  for (int output_rows = output_h; output_rows; output_rows--) {
1130  if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
1131  for (int output_cols = output_w; output_cols; output_cols--) {
1132  *(data_col++) = 0;
1133  }
1134  } else {
1135  int input_col = -pad_w + kernel_col * dilation_w;
1136  for (int output_col = output_w; output_col; output_col--) {
1137  if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
1138  *(data_col++) = data_im[input_row * width + input_col];
1139  } else {
1140  *(data_col++) = 0;
1141  }
1142  input_col += stride_w;
1143  }
1144  }
1145  input_row += stride_h;
1146  }
1147  }
1148  }
1149  }
1150  return;
1151  }
1152 
1153  // Baseline
1154  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1155  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1156 
1157  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1158  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1159 
1160  int channels_col = channels * kernel_h * kernel_w;
1161  for (int c = 0; c < channels_col; ++c) {
1162  int w_offset = c % kernel_w;
1163  int h_offset = (c / kernel_w) % kernel_h;
1164  int c_im = c / kernel_h / kernel_w;
1165  for (int h = 0; h < height_col; ++h) {
1166  for (int w = 0; w < width_col; ++w) {
1167  int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
1168  int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
1169  if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
1170  data_col[(c * height_col + h) * width_col + w] =
1171  data_im[(c_im * height + h_pad) * width + w_pad];
1172  else
1173  data_col[(c * height_col + h) * width_col + w] = 0;
1174  }
1175  }
1176  }
1177 }
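// Illustrative note (not part of the original file): the output sizes above
// follow output = (input + pad_begin + pad_end - dilated_kernel) / stride + 1,
// where dilated_kernel = dilation * (kernel - 1) + 1. For example, a 5 x 5
// input with a 3 x 3 kernel, pad = 1 on every side, stride = 2 and
// dilation = 1 gives output_h = output_w = (5 + 2 - 3) / 2 + 1 = 3, so the
// NCHW data_col has channels * 3 * 3 rows (one per channel and kernel offset),
// each covering the 3 * 3 = 9 output positions.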
1178 
1179 template <>
1180 void Im2col<float, CPUContext, StorageOrder::NHWC>(
1181  const float* data_im,
1182  const int channels,
1183  const int height,
1184  const int width,
1185  const int kernel_h,
1186  const int kernel_w,
1187  const int dilation_h,
1188  const int dilation_w,
1189  const int pad_t,
1190  const int pad_l,
1191  const int pad_b,
1192  const int pad_r,
1193  const int stride_h,
1194  const int stride_w,
1195  float* data_col,
1196  CPUContext* /*context*/) {
1197  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1198  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1199 
1200  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1201  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1202 
1203  int h_pad = -pad_t;
1204  for (int h = 0; h < height_col; ++h) {
1205  int w_pad = -pad_l;
1206  for (int w = 0; w < width_col; ++w) {
1207  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
1208  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
1209  if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
1210  memcpy(data_col, data_im + (ih * width + iw) * channels,
1211  sizeof(float) * channels);
1212  } else {
1213  // This should be simply padded with zero.
1214  memset(data_col, 0, sizeof(float) * channels);
1215  }
1216  data_col += channels;
1217  }
1218  }
1219  w_pad += stride_w;
1220  }
1221  h_pad += stride_h;
1222  }
1223 }
1224 
1225 template <>
1226 void Col2im<float, CPUContext, StorageOrder::NCHW>(
1227  const float* data_col,
1228  const int channels,
1229  const int height,
1230  const int width,
1231  const int kernel_h,
1232  const int kernel_w,
1233  const int dilation_h,
1234  const int dilation_w,
1235  const int pad_t,
1236  const int pad_l,
1237  const int pad_b,
1238  const int pad_r,
1239  const int stride_h,
1240  const int stride_w,
1241  float* data_im,
1242  CPUContext* context) {
1243  const int output_h =
1244  (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1245  1;
1246  const int output_w =
1247  (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1248  1;
1249 
1250  Set<float, CPUContext>(height * width * channels, 0, data_im, context);
1251 
1252  // Fast path for zero padding and no dilation
1253  // From Torch, modified THNN_(unfolded_acc)
1254  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
1255  pad_t == 0 && pad_b == 0) {
1256  for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
1257  const auto nip = k / (kernel_h * kernel_w);
1258  const auto rest = k % (kernel_h * kernel_w);
1259  const auto kh = rest / kernel_w;
1260  const auto kw = rest % kernel_w;
1261  const auto* dst = data_col +
1262  nip * (kernel_h * kernel_w * output_h * output_w) +
1263  kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
1264  auto* src = data_im + nip * (height * width);
1265  for (auto y = 0; y < output_h; y++) {
1266  const auto iy = y * stride_h + kh;
1267  const auto ix = kw;
1268  if (stride_w == 1) {
1269  auto offsrc = src + (iy * width + ix);
1270  const auto offdst = dst + (y * output_w);
1271  for (auto i = 0; i < output_w; ++i) {
1272  offsrc[i] += offdst[i];
1273  }
1274  } else {
1275  for (auto x = 0; x < output_w; x++) {
1276  auto offsrc = src + (iy * width + ix + x * stride_w);
1277  const auto offdst = dst + (y * output_w + x);
1278  *offsrc += *offdst;
1279  }
1280  }
1281  }
1282  }
1283  return;
1284  }
1285 
1286  // Fast path for equal padding
1287  if (pad_l == pad_r && pad_t == pad_b) {
1288  // From Intel, https://github.com/BVLC/caffe/pull/3536
1289  const int pad_h = pad_t;
1290  const int pad_w = pad_l;
1291  const int channel_size = height * width;
1292  for (int channel = channels; channel--; data_im += channel_size) {
1293  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1294  for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1295  int input_row = -pad_h + kernel_row * dilation_h;
1296  for (int output_rows = output_h; output_rows; output_rows--) {
1297  if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
1298  data_col += output_w;
1299  } else {
1300  int input_col = -pad_w + kernel_col * dilation_w;
1301  for (int output_col = output_w; output_col; output_col--) {
1302  if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
1303  data_im[input_row * width + input_col] += *data_col;
1304  }
1305  data_col++;
1306  input_col += stride_w;
1307  }
1308  }
1309  input_row += stride_h;
1310  }
1311  }
1312  }
1313  }
1314  return;
1315  }
1316 
1317  // Fallback
1318  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1319  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1320 
1321  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1322  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1323  int channels_col = channels * kernel_h * kernel_w;
1324  for (int c = 0; c < channels_col; ++c) {
1325  int w_offset = c % kernel_w;
1326  int h_offset = (c / kernel_w) % kernel_h;
1327  int c_im = c / kernel_h / kernel_w;
1328  for (int h = 0; h < height_col; ++h) {
1329  for (int w = 0; w < width_col; ++w) {
1330  int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
1331  int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
1332  if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) {
1333  data_im[(c_im * height + h_pad) * width + w_pad] +=
1334  data_col[(c * height_col + h) * width_col + w];
1335  }
1336  }
1337  }
1338  }
1339 }
1340 
1341 template <>
1342 void Col2im<float, CPUContext, StorageOrder::NHWC>(
1343  const float* data_col,
1344  const int channels,
1345  const int height,
1346  const int width,
1347  const int kernel_h,
1348  const int kernel_w,
1349  const int dilation_h,
1350  const int dilation_w,
1351  const int pad_t,
1352  const int pad_l,
1353  const int pad_b,
1354  const int pad_r,
1355  const int stride_h,
1356  const int stride_w,
1357  float* data_im,
1358  CPUContext* context) {
1359  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1360  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1361 
1362  Set<float, CPUContext>(height * width * channels, 0, data_im, context);
1363  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1364  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1365  int h_pad = -pad_t;
1366  for (int h = 0; h < height_col; ++h) {
1367  int w_pad = -pad_l;
1368  for (int w = 0; w < width_col; ++w) {
1369  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
1370  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
1371  if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
1372  auto* data_im_patch = data_im + (ih * width + iw) * channels;
1373  Add<float, CPUContext>(
1374  channels, data_im_patch, data_col, data_im_patch, context);
1375  }
1376  data_col += channels;
1377  }
1378  }
1379  w_pad += stride_w;
1380  }
1381  h_pad += stride_h;
1382  }
1383 }
1384 
1385 template <>
1386 void BiasCHW<float, CPUContext>(
1387  const float* bias,
1388  const int bias_channels,
1389  const int image_size,
1390  float* image,
1391  CPUContext* /*context*/) {
1392  // Sum the per-channel bias into every image plane
1393  for (int c = 0; c < bias_channels; ++c) {
1394  float b = bias[c];
1395 
1396 #ifdef __ARM_NEON__
1397  float32x4_t vBias = vdupq_n_f32(b);
1398 
1399  // We give alignment hints for additional speed, so handle the
1400  // non-vectorizable prologue separately
1401  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
1402 
1403  // FIXME: if input < kVecSizeInFloat, can't vectorize at all
1404 
1405  int prologue =
1406  kVecSizeInFloat -
1407  // remainder in floats
1408  (((uintptr_t) image) % (sizeof(float32x4_t))) / sizeof(float);
1409 
1410  int i = 0;
1411  // Prologue loop
1412  for (; i < prologue; ++i) {
1413  image[i] += b;
1414  }
1415 
1416  // The loop is manually unrolled by 8
1417  constexpr int kUnroll = 8;
1418  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
1419 
1420  int remainder = image_size - prologue;
1421  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
1422 
1423  // Vectorizable body
1424  for (; i < vectorizable; i += kFloatsPerLoop) {
1425  // Manually unrolled
1426  float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
1427  float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
1428  float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
1429  float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
1430  float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
1431  float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
1432  float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
1433  float32x4_t v7 = vld1q_f32_aligned(image + i + 28);
1434 
1435  v0 = vaddq_f32(v0, vBias);
1436  v1 = vaddq_f32(v1, vBias);
1437  v2 = vaddq_f32(v2, vBias);
1438  v3 = vaddq_f32(v3, vBias);
1439  v4 = vaddq_f32(v4, vBias);
1440  v5 = vaddq_f32(v5, vBias);
1441  v6 = vaddq_f32(v6, vBias);
1442  v7 = vaddq_f32(v7, vBias);
1443 
1444  vst1q_f32_aligned(image + i + 0, v0);
1445  vst1q_f32_aligned(image + i + 4, v1);
1446  vst1q_f32_aligned(image + i + 8, v2);
1447  vst1q_f32_aligned(image + i + 12, v3);
1448  vst1q_f32_aligned(image + i + 16, v4);
1449  vst1q_f32_aligned(image + i + 20, v5);
1450  vst1q_f32_aligned(image + i + 24, v6);
1451  vst1q_f32_aligned(image + i + 28, v7);
1452  }
1453 
1454  // Non-vectorizable epilogue
1455  for (; i < image_size; ++i) {
1456  image[i] += b;
1457  }
1458 #else
1459  // Non-NEON CPU implementation
1460  for (int i = 0; i < image_size; ++i) {
1461  image[i] += b;
1462  }
1463 #endif // __ARM_NEON__
1464 
1465  image += image_size;
1466  }
1467 }
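// Illustrative note (not part of the original file): in the NEON branch above,
// the prologue length is kVecSizeInFloat minus the number of floats by which
// `image` is past a 16-byte boundary. For example, if `image` sits 8 bytes
// past a 16-byte boundary, then (8 / sizeof(float)) = 2 and
// prologue = 4 - 2 = 2, so two scalar additions bring the pointer to a
// 16-byte-aligned position before the vectorized body runs.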
1468 
1469 template <>
1470 void CopyMatrix<CPUContext>(
1471  const size_t itemsize,
1472  const int M,
1473  const int N,
1474  const void* A,
1475  const int lda,
1476  void* B,
1477  const int ldb,
1478  CPUContext* /*context*/,
1479  TypeMeta::TypedCopy copy) {
1480  if (lda == N && ldb == N) {
1481  // can coalesce into a single memcpy of size M * N
1482  if (copy) {
1483  copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
1484  } else {
1485  memcpy(
1486  static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
1487  }
1488  return;
1489  }
1490 
1491  for (int i = 0; i < M; ++i) {
1492  if (copy) {
1493  copy(
1494  static_cast<const char*>(A) + lda * i * itemsize,
1495  static_cast<char*>(B) + ldb * i * itemsize,
1496  N);
1497  } else {
1498  memcpy(
1499  static_cast<char*>(B) + ldb * i * itemsize,
1500  static_cast<const char*>(A) + lda * i * itemsize,
1501  itemsize * N);
1502  }
1503  }
1504 }
1505 
1506 #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \
1507  template <> \
1508  void CopyVector<T, CPUContext>( \
1509  const int N, const T* src, T* dst, CPUContext* /*context*/) { \
1510  if (src != dst && N > 0) { \
1511  memcpy(dst, src, sizeof(T) * N); \
1512  } \
1513  }
1514 CAFFE2_SPECIALIZED_COPYVECTOR(float)
1515 #undef CAFFE2_SPECIALIZED_COPYVECTOR
1516 
1517 namespace {
1518 
1519 #ifdef CAFFE2_USE_HPTT
1520 
1521 bool TryTransposeWithHPTT(
1522  const int num_axes,
1523  const int* dims,
1524  const int* axes,
1525  const float* X,
1526  float* Y) {
1527  std::vector<int> axes_cm(num_axes);
1528  std::vector<int> dims_cm(num_axes);
1529 
1530  // Convert row-major index to column-major.
1531  const auto cm_fn = [num_axes](const int i) { return num_axes - i - 1; };
1532  for (int i = 0; i < num_axes; ++i) {
1533  axes_cm[i] = cm_fn(axes[cm_fn(i)]);
1534  dims_cm[i] = dims[cm_fn(i)];
1535  }
1536  auto plan = hptt::create_plan(
1537  axes_cm.data(),
1538  num_axes,
1539  1.0,
1540  X,
1541  dims_cm.data(),
1542  nullptr,
1543  0.0,
1544  Y,
1545  nullptr,
1546  hptt::ESTIMATE,
1547  1);
1548  if (plan == nullptr) {
1549  return false;
1550  }
1551  plan->execute();
1552  return true;
1553 }
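// Illustrative note (not part of the original file): cm_fn reverses the axis
// order, re-expressing the row-major permutation in HPTT's column-major
// convention. For example, with num_axes = 3 and axes = {0, 2, 1} (swap the
// last two row-major axes), the loop produces axes_cm = {1, 0, 2}, i.e. the
// same swap applied to the first two column-major axes.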
1554 
1555 #endif // CAFFE2_USE_HPTT
1556 
1557 std::vector<int>
1558 ComputeXStrides(const int num_axes, const int* dims, const int* axes) {
1559  std::vector<int> x_strides(num_axes);
1560  std::vector<int> buff(num_axes);
1561  int cur_stride = 1;
1562  for (int i = num_axes - 1; i >= 0; --i) {
1563  buff[i] = cur_stride;
1564  cur_stride *= dims[i];
1565  }
1566  for (int i = 0; i < num_axes; ++i) {
1567  x_strides[i] = buff[axes[i]];
1568  }
1569  return x_strides;
1570 }
1571 
1572 void IncreaseIndex(const int* dims, std::vector<int>* index) {
1573  for (int i = index->size() - 1; i >= 0; --i) {
1574  ++index->at(i);
1575  if (index->at(i) >= dims[i]) {
1576  index->at(i) -= dims[i];
1577  } else {
1578  break;
1579  }
1580  }
1581 }
1582 
1583 template <typename T>
1584 void TransposeCPU(
1585  const int num_axes,
1586  const int* x_dims,
1587  const int* y_dims,
1588  const int* axes,
1589  const int data_size,
1590  const T* X,
1591  T* Y) {
1592  // Measure amount of contiguous data we can copy at once
1593  int block_size = 1;
1594  int num_shared_idxs = 0;
1595  for (int i = num_axes - 1; i >= 0 && axes[i] == i; --i) {
1596  block_size *= y_dims[i];
1597  ++num_shared_idxs;
1598  }
1599 
1600  if (num_axes < 2 || num_shared_idxs == num_axes) {
1601  memcpy(Y, X, data_size * sizeof(T));
1602  return;
1603  }
1604 
1605  const int itr_axes = num_axes - num_shared_idxs;
1606  const std::vector<int> x_strides = ComputeXStrides(itr_axes, x_dims, axes);
1607  std::vector<int> index_digits(itr_axes, 0);
1608  const int num_blocks = data_size / block_size;
1609  for (int y_index = 0; y_index < num_blocks; ++y_index) {
1610  const int x_index = std::inner_product(
1611  x_strides.cbegin(), x_strides.cend(), index_digits.cbegin(), 0);
1612  if (block_size == 1) {
1613  Y[y_index] = X[x_index];
1614  } else {
1615  memcpy(
1616  Y + block_size * y_index,
1617  X + block_size * x_index,
1618  block_size * sizeof(T));
1619  }
1620  IncreaseIndex(y_dims, &index_digits);
1621  }
1622 }
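// Illustrative note (not part of the original file): for a 2 x 3 row-major
// input X with axes = {1, 0} (a plain 2-D transpose), y_dims = {3, 2} and
// ComputeXStrides returns {1, 3}. Walking the output in row-major order with
// the odometer in IncreaseIndex visits X[0], X[3], X[1], X[4], X[2], X[5],
// which is exactly the column-major traversal that fills the 3 x 2 result.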
1623 
1624 } // namespace
1625 
1626 template <>
1627 void Transpose<float, CPUContext>(
1628  const int num_axes,
1629  const int* x_dims,
1630  const int* y_dims,
1631  const int* axes,
1632  const int data_size,
1633  const float* X,
1634  float* Y,
1635  CPUContext* /* context */) {
1636 #ifdef CAFFE2_USE_HPTT
1637  if (TryTransposeWithHPTT(num_axes, x_dims, axes, X, Y)) {
1638  return;
1639  }
1640 #endif // CAFFE2_USE_HPTT
1641  TransposeCPU(num_axes, x_dims, y_dims, axes, data_size, X, Y);
1642 }
1643 
1644 #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \
1645  template <> \
1646  void Transpose<T, CPUContext>( \
1647  const int num_axes, \
1648  const int* x_dims, \
1649  const int* y_dims, \
1650  const int* axes, \
1651  const int data_size, \
1652  const T* X, \
1653  T* Y, \
1654  CPUContext* /* context */) { \
1655  TransposeCPU(num_axes, x_dims, y_dims, axes, data_size, X, Y); \
1656  }
1657 CAFFE2_SPECIALIZED_TRANSPOSE(double)
1658 CAFFE2_SPECIALIZED_TRANSPOSE(int)
1659 CAFFE2_SPECIALIZED_TRANSPOSE(long)
1660 #undef CAFFE2_SPECIALIZED_TRANSPOSE
1661 
1662 } // namespace math
1663 } // namespace caffe2