Caffe2 - C++ API
A deep learning, cross-platform ML framework
math_cpu.cc
1 // Implements the math functions for CPU.
2 // The implementation in this file allows us to route the underlying numerical
3 // computation library to different backends. Notably:
4 // (1) For all BLAS-related functions, one can explicitly request a BLAS backend
5 // such as MKL, OpenBLAS, or ATLAS. To see the set of supported backends
6 // currently provided, check //third_party/blas/.
7 // (2) If one chooses to link against MKL, we utilize MKL's vector math library
8 // (VML) for a few functions such as Exp and Log.
9 // (3) Fallback implementations are provided in Eigen for cross-platform
10 // support. Since Eigen is a header-only library and supports a number of
11 // platforms, it allows one to quickly port Caffe2 to different platforms
12 // where BLAS may not be present.
13 
14 #include "caffe2/utils/math.h"
15 
16 #include <algorithm>
17 #include <array>
18 #include <atomic>
19 #include <chrono>
20 #include <cmath>
21 #include <cstdlib>
22 #include <cstring>
23 #include <functional>
24 #include <limits>
25 #include <numeric>
26 #include <random>
27 #include <tuple>
28 #include <unordered_set>
29 #include <vector>
30 
31 #include "caffe2/core/context.h"
32 #include "caffe2/utils/cpu_neon.h"
33 #include "caffe2/utils/eigen_utils.h"
34 #include "caffe2/utils/fixed_divisor.h"
35 
36 #include "Eigen/Core"
37 #include "Eigen/Dense"
38 
39 #ifdef CAFFE2_USE_MKL
40 #include <mkl.h>
41 #endif // CAFFE2_USE_MKL
42 
43 #ifdef CAFFE2_USE_HPTT
44 #include <hptt.h>
45 #endif // CAFFE2_USE_HPTT
46 
47 #if defined(_MSC_VER)
48 #include <process.h>
49 #endif
50 
51 namespace caffe2 {
52 
53 namespace math {
54 
56 // BLAS alternatives.
57 // Depending on whether we have specified an external BLAS library or not, we
58 // will delegate the BLAS-related Caffe2 math functions to either the CBLAS
59 // call or the Eigen implementation.
61 #ifdef CAFFE2_USE_EIGEN_FOR_BLAS
62 
63 // Caffe2 gemm provides a simpler interface to the gemm functions, with the
64 // limitation that the data has to be contiguous in memory.
65 //
66 // The gemm call implements the following operation:
67 //
68 // C = alpha * op(A) * op(B) + beta * C
69 //
70 // where op(A) has size M x K, op(B) has size K x N, and C has size M x N. Each
71 // of A, B, and C are matrices and alpha and beta are scalars. Note that the
72 // most common use case of gemm will involve setting alpha to 1 and beta to 0.
73 //
74 // op(A) and op(B) represent the transformations applied to A and B before the
75 // matrix multiply: op(A) equals A when trans_A is CblasNoTrans and A^T
76 // (transpose) when trans_A is CblasTrans, and op(B) is defined analogously by
77 // trans_B.
78 template <>
79 C10_EXPORT void Gemm<float, CPUContext>(
80  const CBLAS_TRANSPOSE trans_A,
81  const CBLAS_TRANSPOSE trans_B,
82  const int M,
83  const int N,
84  const int K,
85  const float alpha,
86  const float* A,
87  const float* B,
88  const float beta,
89  float* C,
90  CPUContext* context,
91  TensorProto::DataType math_type) {
92  auto C_mat = EigenMatrixMap<float>(C, N, M);
93  if (beta == 0) {
94  C_mat.setZero();
95  } else {
96  C_mat *= beta;
97  }
98  switch (trans_A) {
99  case CblasNoTrans: {
100  switch (trans_B) {
101  case CblasNoTrans:
102  C_mat.noalias() += alpha *
103  (ConstEigenMatrixMap<float>(B, N, K) *
104  ConstEigenMatrixMap<float>(A, K, M));
105  return;
106  case CblasTrans:
107  C_mat.noalias() += alpha *
108  (ConstEigenMatrixMap<float>(B, K, N).transpose() *
109  ConstEigenMatrixMap<float>(A, K, M));
110  return;
111  default:
112  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
113  }
114  }
115  case CblasTrans: {
116  switch (trans_B) {
117  case CblasNoTrans:
118  C_mat.noalias() += alpha *
119  (ConstEigenMatrixMap<float>(B, N, K) *
120  ConstEigenMatrixMap<float>(A, M, K).transpose());
121  return;
122  case CblasTrans:
123  C_mat.noalias() += alpha *
124  (ConstEigenMatrixMap<float>(B, K, N).transpose() *
125  ConstEigenMatrixMap<float>(A, M, K).transpose());
126  return;
127  default:
128  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
129  }
130  }
131  default:
132  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
133  }
134 }
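
// Usage sketch (illustrative only; ExampleGemmUsage is a hypothetical helper,
// not part of the library): with row-major, contiguous buffers, the call below
// computes C = 1.0 * A * B + 0.0 * C for a 2x3 A and a 3x2 B. Because beta is
// 0, C does not need to be initialized first.
inline void ExampleGemmUsage(CPUContext* context) {
  const float A[2 * 3] = {1, 2, 3, 4, 5, 6}; // op(A) is M x K = 2 x 3
  const float B[3 * 2] = {1, 0, 0, 1, 1, 1}; // op(B) is K x N = 3 x 2
  float C[2 * 2];                            // C is M x N = 2 x 2
  Gemm<float, CPUContext>(
      CblasNoTrans, CblasNoTrans, 2, 2, 3, 1.0f, A, B, 0.0f, C, context);
  // C now holds {4, 5, 10, 11}.
}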
135 
136 template <>
137 C10_EXPORT void GemmEx<float, CPUContext>(
138  const CBLAS_TRANSPOSE trans_A,
139  const CBLAS_TRANSPOSE trans_B,
140  const int M,
141  const int N,
142  const int K,
143  const float alpha,
144  const float* A,
145  const int lda,
146  const float* B,
147  const int ldb,
148  const float beta,
149  float* C,
150  const int ldc,
151  CPUContext*) {
152  EigenOuterStridedMatrixMap<float> C_mat(C, N, M, EigenOuterStride(ldc));
153  if (beta == 0) {
154  C_mat.setZero();
155  } else {
156  C_mat *= beta;
157  }
158  switch (trans_A) {
159  case CblasNoTrans: {
160  switch (trans_B) {
161  case CblasNoTrans:
162  C_mat.noalias() += alpha *
163  (ConstEigenOuterStridedMatrixMap<float>(
164  B, N, K, EigenOuterStride(ldb)) *
165  ConstEigenOuterStridedMatrixMap<float>(
166  A, K, M, EigenOuterStride(lda)));
167  return;
168  case CblasTrans:
169  C_mat.noalias() += alpha *
170  (ConstEigenOuterStridedMatrixMap<float>(
171  B, K, N, EigenOuterStride(ldb))
172  .transpose() *
173  ConstEigenOuterStridedMatrixMap<float>(
174  A, K, M, EigenOuterStride(lda)));
175  return;
176  default:
177  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
178  }
179  }
180  case CblasTrans: {
181  switch (trans_B) {
182  case CblasNoTrans:
183  C_mat.noalias() += alpha *
184  (ConstEigenOuterStridedMatrixMap<float>(
185  B, N, K, EigenOuterStride(ldb)) *
186  ConstEigenOuterStridedMatrixMap<float>(
187  A, M, K, EigenOuterStride(lda))
188  .transpose());
189  return;
190  case CblasTrans:
191  C_mat.noalias() += alpha *
192  (ConstEigenOuterStridedMatrixMap<float>(
193  B, K, N, EigenOuterStride(ldb))
194  .transpose() *
195  ConstEigenOuterStridedMatrixMap<float>(
196  A, M, K, EigenOuterStride(lda))
197  .transpose());
198  return;
199  default:
200  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
201  }
202  }
203  default:
204  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
205  }
206 }
207 
208 template <>
209 C10_EXPORT void Gemv<float, CPUContext>(
210  const CBLAS_TRANSPOSE trans_A,
211  const int M,
212  const int N,
213  const float alpha,
214  const float* A,
215  const float* x,
216  const float beta,
217  float* y,
218  CPUContext* context,
219  TensorProto::DataType math_type) {
220  EigenVectorMap<float> y_vec(y, trans_A == CblasNoTrans ? M : N);
221  if (beta == 0) {
222 // In Caffe2 we often use lazy initialization, so y may contain NaNs; since
223 // 0 * NaN is still NaN, if beta is 0 we explicitly zero y instead of scaling it.
224  y_vec.setZero();
225  } else {
226  y_vec *= beta;
227  }
228  switch (trans_A) {
229  case CblasNoTrans: {
230  y_vec.noalias() += alpha *
231  (ConstEigenMatrixMap<float>(A, N, M).transpose() *
232  ConstEigenVectorMap<float>(x, N));
233  return;
234  }
235  case CblasTrans: {
236  y_vec.noalias() += alpha *
237  (ConstEigenMatrixMap<float>(A, N, M) *
238  ConstEigenVectorMap<float>(x, M));
239  return;
240  }
241  default:
242  LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
243  }
244 }
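
// Usage sketch (illustrative only; ExampleGemvUsage is a hypothetical helper,
// not part of the library): computes y = 1.0 * A * x + 0.0 * y for a row-major
// 2x3 A, so y has length M = 2.
inline void ExampleGemvUsage(CPUContext* context) {
  const float A[2 * 3] = {1, 2, 3, 4, 5, 6};
  const float x[3] = {1, 1, 1};
  float y[2];
  Gemv<float, CPUContext>(CblasNoTrans, 2, 3, 1.0f, A, x, 0.0f, y, context);
  // y now holds {6, 15}.
}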
245 
246 #define CAFFE2_SPECIALIZED_DOT(T) \
247  template <> \
248  C10_EXPORT void Dot<T, CPUContext>( \
249  const int N, const T* a, const T* b, T* y, CPUContext* context) { \
250  *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
251  }
252 CAFFE2_SPECIALIZED_DOT(float)
253 #undef CAFFE2_SPECIALIZED_DOT
254 
255 #define CAFFE2_SPECIALIZED_AXPY(T) \
256  template <> \
257  C10_EXPORT void Axpy<T, CPUContext>( \
258  const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
259  EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
260  } \
261  template <> \
262  C10_EXPORT void Axpy<T, CPUContext>( \
263  const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
264  EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \
265  }
266 CAFFE2_SPECIALIZED_AXPY(float)
267 #undef CAFFE2_SPECIALIZED_AXPY
268 
269 #define CAFFE2_SPECIALIZED_AXPBY(T) \
270  template <> \
271  C10_EXPORT void Axpby<T, T, CPUContext>( \
272  const int N, \
273  const T alpha, \
274  const T* x, \
275  const T beta, \
276  T* y, \
277  CPUContext* context) { \
278  EigenVectorArrayMap<T> y_arr(y, N); \
279  y_arr = y_arr * beta + ConstEigenVectorArrayMap<T>(x, N) * alpha; \
280  } \
281  template <> \
282  C10_EXPORT void Axpby<T, T, CPUContext>( \
283  const int N, \
284  const T* alpha, \
285  const T* x, \
286  const T* beta, \
287  T* y, \
288  CPUContext* context) { \
289  EigenVectorArrayMap<T> y_arr(y, N); \
290  y_arr = y_arr * *beta + ConstEigenVectorArrayMap<T>(x, N) * *alpha; \
291  }
292 CAFFE2_SPECIALIZED_AXPBY(float)
293 #undef CAFFE2_SPECIALIZED_AXPBY
294 
295 #else // CAFFE2_USE_EIGEN_FOR_BLAS
296 
297 template <>
298 C10_EXPORT void Gemm<float, CPUContext>(
299  const CBLAS_TRANSPOSE trans_A,
300  const CBLAS_TRANSPOSE trans_B,
301  const int M,
302  const int N,
303  const int K,
304  const float alpha,
305  const float* A,
306  const float* B,
307  const float beta,
308  float* C,
309  CPUContext* /*context*/,
310  TensorProto::DataType /*math_type*/) {
311  const int lda = (trans_A == CblasNoTrans) ? K : M;
312  const int ldb = (trans_B == CblasNoTrans) ? N : K;
313  cblas_sgemm(
314  CblasRowMajor,
315  trans_A,
316  trans_B,
317  M,
318  N,
319  K,
320  alpha,
321  A,
322  lda,
323  B,
324  ldb,
325  beta,
326  C,
327  N);
328 }
329 
330 template <>
331 C10_EXPORT void GemmEx<float, CPUContext>(
332  const CBLAS_TRANSPOSE trans_A,
333  const CBLAS_TRANSPOSE trans_B,
334  const int M,
335  const int N,
336  const int K,
337  const float alpha,
338  const float* A,
339  const int lda,
340  const float* B,
341  const int ldb,
342  const float beta,
343  float* C,
344  const int ldc,
345  CPUContext* /*context*/) {
346  cblas_sgemm(
347  CblasRowMajor,
348  trans_A,
349  trans_B,
350  M,
351  N,
352  K,
353  alpha,
354  A,
355  lda,
356  B,
357  ldb,
358  beta,
359  C,
360  ldc);
361 }
362 
363 template <>
364 C10_EXPORT void Gemv<float, CPUContext>(
365  const CBLAS_TRANSPOSE trans_A,
366  const int M,
367  const int N,
368  const float alpha,
369  const float* A,
370  const float* x,
371  const float beta,
372  float* y,
373  CPUContext* /*context*/,
374  TensorProto::DataType /*math_type*/) {
375  cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
376 }
377 
378 #define CAFFE2_SPECIALIZED_DOT(T, prefix) \
379  template <> \
380  C10_EXPORT void Dot<T, CPUContext>( \
381  const int N, const T* a, const T* b, T* y, CPUContext*) { \
382  *y = cblas_##prefix##dot(N, a, 1, b, 1); \
383  }
384 CAFFE2_SPECIALIZED_DOT(float, s)
385 #undef CAFFE2_SPECIALIZED_DOT
386 
387 #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
388  template <> \
389  C10_EXPORT void Axpy<T, CPUContext>( \
390  const int N, const T alpha, const T* x, T* y, CPUContext*) { \
391  cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
392  } \
393  template <> \
394  C10_EXPORT void Axpy<T, CPUContext>( \
395  const int N, const T* alpha, const T* x, T* y, CPUContext*) { \
396  cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
397  }
398 CAFFE2_SPECIALIZED_AXPY(float, s)
399 #undef CAFFE2_SPECIALIZED_AXPY
400 
401 // cblas_[sd]axpby is not a standard BLAS function, so if MKL is not present
402 // we need to implement it ourselves.
403 #ifdef CAFFE2_USE_MKL
404 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
405  template <> \
406  C10_EXPORT void Axpby<T, T, CPUContext>( \
407  const int N, \
408  const T alpha, \
409  const T* x, \
410  const T beta, \
411  T* y, \
412  CPUContext*) { \
413  cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \
414  } \
415  template <> \
416  C10_EXPORT void Axpby<T, T, CPUContext>( \
417  const int N, \
418  const T* alpha, \
419  const T* x, \
420  const T* beta, \
421  T* y, \
422  CPUContext*) { \
423  cblas_##prefix##axpby(N, *alpha, x, 1, *beta, y, 1); \
424  }
425 #else // CAFFE2_USE_MKL
426 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
427  template <> \
428  C10_EXPORT void Axpby<T, T, CPUContext>( \
429  const int N, \
430  const T alpha, \
431  const T* x, \
432  const T beta, \
433  T* y, \
434  CPUContext*) { \
435  cblas_##prefix##scal(N, beta, y, 1); \
436  cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
437  } \
438  template <> \
439  C10_EXPORT void Axpby<T, T, CPUContext>( \
440  const int N, \
441  const T* alpha, \
442  const T* x, \
443  const T* beta, \
444  T* y, \
445  CPUContext*) { \
446  cblas_##prefix##scal(N, *beta, y, 1); \
447  cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
448  }
449 #endif // CAFFE2_USE_MKL
450 CAFFE2_SPECIALIZED_AXPBY(float, s)
451 #undef CAFFE2_SPECIALIZED_AXPBY
452 
453 #endif // CAFFE2_USE_EIGEN_FOR_BLAS
454 
455 template <>
456 C10_EXPORT void GemmBatched<float, CPUContext>(
457  const CBLAS_TRANSPOSE trans_A,
458  const CBLAS_TRANSPOSE trans_B,
459  const int batch_size,
460  const int M,
461  const int N,
462  const int K,
463  const float alpha,
464  const float** A,
465  const float** B,
466  const float beta,
467  float** C,
468  CPUContext* context,
469  TensorProto::DataType /* math_type */) {
470 #ifdef CAFFE2_USE_MKL
471  (void)context;
472  const int lda = (trans_A == CblasNoTrans) ? K : M;
473  const int ldb = (trans_B == CblasNoTrans) ? N : K;
474  const int ldc = N;
475  cblas_sgemm_batch(
476  CblasRowMajor,
477  &trans_A,
478  &trans_B,
479  &M,
480  &N,
481  &K,
482  &alpha,
483  A,
484  &lda,
485  B,
486  &ldb,
487  &beta,
488  C,
489  &ldc,
490  1,
491  &batch_size);
492 #else // CAFFE2_USE_MKL
493  // loop over matrices in the batch
494  for (int i = 0; i < batch_size; ++i) {
495  math::Gemm<float, CPUContext>(
496  trans_A, trans_B, M, N, K, alpha, A[i], B[i], beta, C[i], context);
497  }
498 #endif // CAFFE2_USE_MKL
499 }
500 
501 template <>
502 C10_EXPORT void GemmStridedBatched<float, CPUContext>(
503  const CBLAS_TRANSPOSE trans_A,
504  const CBLAS_TRANSPOSE trans_B,
505  const int batch_size,
506  const int M,
507  const int N,
508  const int K,
509  const float alpha,
510  const float* A,
511  const int A_stride,
512  const float* B,
513  const int B_stride,
514  const float beta,
515  float* C,
516  const int C_stride,
517  CPUContext* context,
518  TensorProto::DataType /* math_type */) {
519 #ifdef CAFFE2_USE_MKL
520  (void)context;
521  const int lda = (trans_A == CblasNoTrans) ? K : M;
522  const int ldb = (trans_B == CblasNoTrans) ? N : K;
523  const int ldc = N;
524  std::vector<const float*> A_array(batch_size);
525  std::vector<const float*> B_array(batch_size);
526  std::vector<float*> C_array(batch_size);
527  for (int i = 0; i < batch_size; ++i) {
528  A_array[i] = A + i * A_stride;
529  B_array[i] = B + i * B_stride;
530  C_array[i] = C + i * C_stride;
531  }
532  cblas_sgemm_batch(
533  CblasRowMajor,
534  &trans_A,
535  &trans_B,
536  &M,
537  &N,
538  &K,
539  &alpha,
540  A_array.data(),
541  &lda,
542  B_array.data(),
543  &ldb,
544  &beta,
545  C_array.data(),
546  &ldc,
547  1,
548  &batch_size);
549 #else // CAFFE2_USE_MKL
550  // loop over matrices in the batch
551  for (int i = 0; i < batch_size; ++i) {
552  math::Gemm<float, CPUContext>(
553  trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context);
554  A += A_stride;
555  B += B_stride;
556  C += C_stride;
557  }
558 #endif
559 }
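
// Usage sketch (illustrative only; ExampleGemmStridedBatchedUsage is a
// hypothetical helper, not part of the library): batch entry b computes
// C_b = A_b * B_b, where A_b, B_b, and C_b are the b-th M*K, K*N, and M*N
// slices of contiguous buffers, addressed via the per-batch strides.
inline void ExampleGemmStridedBatchedUsage(CPUContext* context) {
  constexpr int kBatch = 4;
  constexpr int kM = 2;
  constexpr int kN = 3;
  constexpr int kK = 5;
  std::vector<float> A(kBatch * kM * kK, 1.0f);
  std::vector<float> B(kBatch * kK * kN, 1.0f);
  std::vector<float> C(kBatch * kM * kN);
  GemmStridedBatched<float, CPUContext>(
      CblasNoTrans, CblasNoTrans, kBatch, kM, kN, kK, 1.0f,
      A.data(), kM * kK, B.data(), kK * kN, 0.0f, C.data(), kM * kN, context);
  // Every entry of C equals K = 5.
}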
560 
562 // Common math functions used in Caffe2 that do not have a BLAS or MKL
563 // equivalent. For all of these functions, we simply implement them either via
564 // Eigen or via custom code.
566 
567 namespace {
568 
569 template <typename T>
570 C10_EXPORT void BroadcastImpl(
571  const int X_ndim,
572  const int* X_dims,
573  const int Y_ndim,
574  const int* Y_dims,
575  const T alpha,
576  const T* X,
577  T* Y,
578  CPUContext* context) {
579  CAFFE_ENFORCE_LE(X_ndim, Y_ndim);
580  std::vector<int> X_dims_vector(Y_ndim);
581  const int d = Y_ndim - X_ndim;
582  std::fill(X_dims_vector.begin(), X_dims_vector.begin() + d, 1);
583  for (int i = d; i < Y_ndim; ++i) {
584  CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]);
585  X_dims_vector[i] = X_dims[i - d];
586  }
587  X_dims = X_dims_vector.data();
588  const int Y_size =
589  std::accumulate(Y_dims, Y_dims + Y_ndim, 1, std::multiplies<int>());
590  std::vector<int> index(Y_ndim, 0);
591  for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
592  const int X_index = utils::GetIndexFromDims(Y_ndim, X_dims, index.data());
593  Y[Y_index] = X[X_index];
594  utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data());
595  }
596  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
597 }
598 
599 } // namespace
600 
601 #define CAFFE2_SPECIALIZED_BROADCAST(T) \
602  template <> \
603  C10_EXPORT void Broadcast<T, CPUContext>( \
604  const int X_ndim, \
605  const int* X_dims, \
606  const int Y_ndim, \
607  const int* Y_dims, \
608  const T alpha, \
609  const T* X, \
610  T* Y, \
611  CPUContext* context) { \
612  BroadcastImpl<T>(X_ndim, X_dims, Y_ndim, Y_dims, alpha, X, Y, context); \
613  }
614 CAFFE2_SPECIALIZED_BROADCAST(std::int32_t)
615 CAFFE2_SPECIALIZED_BROADCAST(std::int64_t)
616 CAFFE2_SPECIALIZED_BROADCAST(float)
617 CAFFE2_SPECIALIZED_BROADCAST(double)
618 #undef CAFFE2_SPECIALIZED_BROADCAST
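
// Worked example (illustrative only; ExampleBroadcastUsage is a hypothetical
// helper, not part of the library): broadcasting a 1x3 row into a 2x3 output
// with alpha = 2 repeats the row along the leading dimension and scales it.
inline void ExampleBroadcastUsage(CPUContext* context) {
  const int X_dims[2] = {1, 3};
  const int Y_dims[2] = {2, 3};
  const float X[3] = {1, 2, 3};
  float Y[6];
  Broadcast<float, CPUContext>(2, X_dims, 2, Y_dims, 2.0f, X, Y, context);
  // Y now holds {2, 4, 6, 2, 4, 6}.
}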
619 
620 #define CAFFE2_SPECIALIZED_INV_STD(T) \
621  template <> \
622  void InvStd<T, CPUContext>( \
623  const int N, \
624  const T epsilon, \
625  const T* var, \
626  T* inv_std, \
627  CPUContext* context) { \
628  EigenVectorArrayMap<T>(inv_std, N) = \
629  (ConstEigenVectorArrayMap<T>(var, N) + epsilon).rsqrt(); \
630  }
631 CAFFE2_SPECIALIZED_INV_STD(float)
632 #undef CAFFE2_SPECIALIZED_INV_STD
633 
634 #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \
635  template <> \
636  C10_EXPORT void RowwiseMax<T, CPUContext>( \
637  const int N, const int D, const T* x, T* y, CPUContext*) { \
638  EigenVectorMap<T>(y, N) = \
639  ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \
640  }
641 CAFFE2_SPECIALIZED_ROWWISEMAX(float)
642 #undef CAFFE2_SPECIALIZED_ROWWISEMAX
643 
644 #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \
645  template <> \
646  C10_EXPORT void ColwiseMax<T, CPUContext>( \
647  const int N, const int D, const T* x, T* y, CPUContext*) { \
648  EigenVectorMap<T>(y, D) = \
649  ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \
650  }
651 CAFFE2_SPECIALIZED_COLWISEMAX(float)
652 #undef CAFFE2_SPECIALIZED_COLWISEMAX
653 
654 #define CAFFE2_SPECIALIZED_MAXIMUM(T) \
655  template <> \
656  C10_EXPORT void Maximum<T, CPUContext>( \
657  const int N, const float alpha, const T* x, T* y, CPUContext* context) { \
658  std::transform( \
659  x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \
660  }
661 CAFFE2_SPECIALIZED_MAXIMUM(float)
662 #undef CAFFE2_SPECIALIZED_MAXIMUM
663 
664 // The implementations below use Eigen, which is column-major, so note the
665 // row/column swap relative to Caffe2's row-major layout.
666 
667 #define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \
668  template <> \
669  C10_EXPORT void Rowwise##Func<T, CPUContext, true>( \
670  const int rows, \
671  const int cols, \
672  const T* A, \
673  const T* B, \
674  T* C, \
675  CPUContext*) { \
676  if (C == B) { \
677  EigenArrayMap<T>(C, cols, rows).colwise() expr## = \
678  ConstEigenVectorArrayMap<T>(A, cols); \
679  } else { \
680  EigenArrayMap<T>(C, cols, rows) = \
681  ConstEigenArrayMap<T>(B, cols, rows) \
682  .colwise() expr ConstEigenVectorArrayMap<T>(A, cols); \
683  } \
684  } \
685  template <> \
686  C10_EXPORT void Colwise##Func<T, CPUContext, true>( \
687  const int rows, \
688  const int cols, \
689  const T* A, \
690  const T* B, \
691  T* C, \
692  CPUContext*) { \
693  if (C == B) { \
694  EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \
695  ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
696  } else { \
697  EigenArrayMap<T>(C, cols, rows) = \
698  ConstEigenArrayMap<T>(B, cols, rows) \
699  .rowwise() expr ConstEigenVectorArrayMap<T>(A, rows) \
700  .transpose(); \
701  } \
702  }
703 
704 #define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \
705  template <> \
706  C10_EXPORT void Rowwise##Func<T, CPUContext, false>( \
707  const int rows, \
708  const int cols, \
709  const T* A, \
710  const T* B, \
711  T* C, \
712  CPUContext*) { \
713  if (C == A) { \
714  EigenArrayMap<T>(C, cols, rows).colwise() expr## = \
715  ConstEigenVectorArrayMap<T>(B, cols); \
716  } else { \
717  EigenArrayMap<T>(C, cols, rows) = \
718  ConstEigenArrayMap<T>(A, cols, rows) \
719  .colwise() expr ConstEigenVectorArrayMap<T>(B, cols); \
720  } \
721  } \
722  template <> \
723  C10_EXPORT void Colwise##Func<T, CPUContext, false>( \
724  const int rows, \
725  const int cols, \
726  const T* A, \
727  const T* B, \
728  T* C, \
729  CPUContext*) { \
730  if (C == A) { \
731  EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \
732  ConstEigenVectorArrayMap<T>(B, rows).transpose(); \
733  } else { \
734  EigenArrayMap<T>(C, cols, rows) = \
735  ConstEigenArrayMap<T>(A, cols, rows) \
736  .rowwise() expr ConstEigenVectorArrayMap<T>(B, rows) \
737  .transpose(); \
738  } \
739  }
740 
741 #define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \
742  DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \
743  DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr)
744 
745 #define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \
746  DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \
747  DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \
748  DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \
749  DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr)
750 
751 DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +)
752 DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *)
753 
754 #undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION
755 #undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION
756 
757 #define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \
758  template <> \
759  C10_EXPORT void RowwiseSub<T, CPUContext, true>( \
760  const int rows, \
761  const int cols, \
762  const T* A, \
763  const T* B, \
764  T* C, \
765  CPUContext*) { \
766  EigenArrayMap<T>(C, cols, rows) = \
767  (-ConstEigenArrayMap<T>(B, cols, rows)).colwise() + \
768  ConstEigenVectorArrayMap<T>(A, cols); \
769  } \
770  template <> \
771  C10_EXPORT void ColwiseSub<T, CPUContext, true>( \
772  const int rows, \
773  const int cols, \
774  const T* A, \
775  const T* B, \
776  T* C, \
777  CPUContext*) { \
778  EigenArrayMap<T>(C, cols, rows) = \
779  (-ConstEigenArrayMap<T>(B, cols, rows)).rowwise() + \
780  ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
781  } \
782  DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -)
783 
784 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float)
785 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double)
786 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t)
787 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t)
788 
789 #undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION
790 
791 #define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \
792  template <> \
793  C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \
794  const int rows, \
795  const int cols, \
796  const T* A, \
797  const T* B, \
798  T* C, \
799  CPUContext*) { \
800  EigenArrayMap<T>(C, cols, rows) = \
801  ConstEigenArrayMap<T>(B, cols, rows).inverse().colwise() * \
802  ConstEigenVectorArrayMap<T>(A, cols); \
803  } \
804  template <> \
805  C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \
806  const int rows, \
807  const int cols, \
808  const T* A, \
809  const T* B, \
810  T* C, \
811  CPUContext*) { \
812  EigenArrayMap<T>(C, cols, rows) = \
813  ConstEigenArrayMap<T>(B, cols, rows).inverse().rowwise() * \
814  ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
815  } \
816  DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /)
817 
818 DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float)
819 DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double)
820 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /)
821 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /)
822 
823 #undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION
824 
825 #undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION
826 #undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION
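
// Semantics sketch (illustrative only; ExampleRowwiseMulUsage is a
// hypothetical helper, not part of the library): with kBroadcast1st = false,
// RowwiseMul multiplies every row of the row-major rows x cols matrix A by the
// length-cols vector B, i.e. C(i, j) = A(i, j) * B(j).
inline void ExampleRowwiseMulUsage(CPUContext* context) {
  const float A[2 * 3] = {1, 2, 3, 4, 5, 6}; // 2 x 3, row-major
  const float B[3] = {10, 100, 1000};        // broadcast along the rows
  float C[2 * 3];
  RowwiseMul<float, CPUContext, false>(2, 3, A, B, C, context);
  // C now holds {10, 200, 3000, 40, 500, 6000}.
}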
827 
828 template <>
829 C10_EXPORT void Not<bool, CPUContext>(
830  const int N,
831  const bool* x,
832  bool* y,
833  CPUContext* /*context*/) {
834  for (int i = 0; i < N; ++i) {
835  y[i] = !x[i];
836  }
837 }
838 
839 #undef C10_DEFINE_BINARY_OP
840 #undef CAFFE2_INSTANTIATE_BINARY_OP
841 
842 #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \
843  template <> \
844  C10_EXPORT void AddStripedBatch( \
845  const int N, \
846  const T* first, \
847  T* y, \
848  const int stripe, \
849  const int batch, \
850  CPUContext* context) { \
851  for (int j = 0; j < batch; j++) { \
852  Add<T, CPUContext>(N, first + j * stripe, y, y, context); \
853  } \
854  }
855 
856 CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
857 #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
858 
859 namespace {
860 
861 template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
862 C10_EXPORT void RowwiseBinaryOp(
863  const int rows,
864  const int cols,
865  const BinaryOperator& op,
866  const TIn* A,
867  const TIn* B,
868  TOut* C) {
869  for (int i = 0; i < rows; ++i) {
870  for (int j = 0; j < cols; ++j) {
871  const int C_index = i * cols + j;
872  const int A_index = kBroadcast1st ? j : C_index;
873  const int B_index = kBroadcast1st ? C_index : j;
874  C[C_index] = op(A[A_index], B[B_index]);
875  }
876  }
877 }
878 
879 template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
880 C10_EXPORT void ColwiseBinaryOp(
881  const int rows,
882  const int cols,
883  const BinaryOperator& op,
884  const TIn* A,
885  const TIn* B,
886  TOut* C) {
887  for (int i = 0; i < rows; ++i) {
888  for (int j = 0; j < cols; ++j) {
889  const int C_index = i * cols + j;
890  const int A_index = kBroadcast1st ? i : C_index;
891  const int B_index = kBroadcast1st ? C_index : i;
892  C[C_index] = op(A[A_index], B[B_index]);
893  }
894  }
895 }
896 
897 template <typename TIn, typename TOut, class BinaryOperator>
898 C10_EXPORT void BroadcastBinaryOpImpl(
899  const int ndim,
900  const int* A_dims,
901  const int* B_dims,
902  const int* C_dims,
903  const BinaryOperator& op,
904  const TIn* A,
905  const TIn* B,
906  TOut* C) {
907  std::vector<int> index(ndim, 0);
908  const int C_size =
909  std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies<int>());
910  for (int C_index = 0; C_index < C_size; ++C_index) {
911  const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data());
912  const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data());
913  C[C_index] = op(A[A_index], B[B_index]);
914  utils::IncreaseIndexInDims(ndim, C_dims, index.data());
915  }
916 }
917 
918 } // namespace
919 
920 #define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
921  template <> \
922  C10_EXPORT void Rowwise##Func<TIn, CPUContext, true>( \
923  const int rows, \
924  const int cols, \
925  const TIn* A, \
926  const TIn* B, \
927  TOut* C, \
928  CPUContext*) { \
929  RowwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \
930  } \
931  template <> \
932  C10_EXPORT void Rowwise##Func<TIn, CPUContext, false>( \
933  const int rows, \
934  const int cols, \
935  const TIn* A, \
936  const TIn* B, \
937  TOut* C, \
938  CPUContext*) { \
939  RowwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \
940  rows, cols, Op<TIn>(), A, B, C); \
941  } \
942  template <> \
943  C10_EXPORT void Colwise##Func<TIn, CPUContext, true>( \
944  const int rows, \
945  const int cols, \
946  const TIn* A, \
947  const TIn* B, \
948  TOut* C, \
949  CPUContext*) { \
950  ColwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \
951  } \
952  template <> \
953  C10_EXPORT void Colwise##Func<TIn, CPUContext, false>( \
954  const int rows, \
955  const int cols, \
956  const TIn* A, \
957  const TIn* B, \
958  TOut* C, \
959  CPUContext*) { \
960  ColwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \
961  rows, cols, Op<TIn>(), A, B, C); \
962  }
963 
964 #define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \
965  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \
966  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \
967  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
968  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
969  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op)
970 
971 DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to)
972 DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to)
973 DEFINE_2D_COMPARE_FUNCTION(LT, std::less)
974 DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal)
975 DEFINE_2D_COMPARE_FUNCTION(GT, std::greater)
976 DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal)
977 
978 #undef DEFINE_2D_COMPARE_FUNCTION
979 
980 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
981 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
982 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
983 
984 #define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
985  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
986  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
987  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
988 
989 DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
990 DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
991 DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
992 
993 #undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION
994 
995 #undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION
996 
997 #define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \
998  template <> \
999  C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \
1000  const int rows, \
1001  const int cols, \
1002  const T* A, \
1003  const T* B, \
1004  T* C, \
1005  CPUContext*) { \
1006  RowwiseBinaryOp<T, T, std::divides<T>, true>( \
1007  rows, cols, std::divides<T>(), A, B, C); \
1008  } \
1009  template <> \
1010  C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \
1011  const int rows, \
1012  const int cols, \
1013  const T* A, \
1014  const T* B, \
1015  T* C, \
1016  CPUContext*) { \
1017  ColwiseBinaryOp<T, T, std::divides<T>, true>( \
1018  rows, cols, std::divides<T>(), A, B, C); \
1019  }
1020 DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t)
1021 DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t)
1022 #undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION
1023 
1024 #define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
1025  template <> \
1026  C10_EXPORT void Func<TIn, CPUContext>( \
1027  const int A_ndim, \
1028  const int* A_dims, \
1029  const int B_ndim, \
1030  const int* B_dims, \
1031  const TIn* A, \
1032  const TIn* B, \
1033  TOut* C, \
1034  CPUContext* context) { \
1035  const int ndim = std::max(A_ndim, B_ndim); \
1036  std::vector<int> A_dims_array(ndim); \
1037  std::vector<int> B_dims_array(ndim); \
1038  std::vector<int> C_dims_array(ndim); \
1039  utils::ComputeBroadcastBinaryOpDims( \
1040  A_ndim, \
1041  A_dims, \
1042  B_ndim, \
1043  B_dims, \
1044  A_dims_array.data(), \
1045  B_dims_array.data(), \
1046  C_dims_array.data()); \
1047  if (A_dims_array == B_dims_array) { \
1048  const int size = std::accumulate( \
1049  C_dims_array.cbegin(), \
1050  C_dims_array.cend(), \
1051  1, \
1052  std::multiplies<int>()); \
1053  Func<TIn, CPUContext>(size, A, B, C, context); \
1054  return; \
1055  } \
1056  int rows; \
1057  int cols; \
1058  bool broadcast_1st; \
1059  if (utils::IsRowwiseBroadcastBinaryOp( \
1060  ndim, \
1061  A_dims_array.data(), \
1062  B_dims_array.data(), \
1063  &rows, \
1064  &cols, \
1065  &broadcast_1st)) { \
1066  if (broadcast_1st) { \
1067  Rowwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \
1068  } else { \
1069  Rowwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \
1070  } \
1071  return; \
1072  } \
1073  if (utils::IsColwiseBroadcastBinaryOp( \
1074  ndim, \
1075  A_dims_array.data(), \
1076  B_dims_array.data(), \
1077  &rows, \
1078  &cols, \
1079  &broadcast_1st)) { \
1080  if (broadcast_1st) { \
1081  Colwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \
1082  } else { \
1083  Colwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \
1084  } \
1085  return; \
1086  } \
1087  int pre; \
1088  int mid; \
1089  int nxt; \
1090  if (utils::IsBothEndsBroadcastBinaryOp( \
1091  ndim, \
1092  A_dims_array.data(), \
1093  B_dims_array.data(), \
1094  &pre, \
1095  &mid, \
1096  &nxt, \
1097  &broadcast_1st)) { \
1098  const int stride = mid * nxt; \
1099  for (int i = 0; i < pre; ++i) { \
1100  if (broadcast_1st) { \
1101  Colwise##Func<TIn, CPUContext, true>( \
1102  mid, nxt, A, B + i * stride, C + i * stride, context); \
1103  } else { \
1104  Colwise##Func<TIn, CPUContext, false>( \
1105  mid, nxt, A + i * stride, B, C + i * stride, context); \
1106  } \
1107  } \
1108  return; \
1109  } \
1110  BroadcastBinaryOpImpl( \
1111  ndim, \
1112  A_dims_array.data(), \
1113  B_dims_array.data(), \
1114  C_dims_array.data(), \
1115  Op<TIn>(), \
1116  A, \
1117  B, \
1118  C); \
1119  }
1120 
1121 #define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \
1122  DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \
1123  DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \
1124  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
1125  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
1126  DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op)
1127 
1128 DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to)
1129 DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to)
1130 DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less)
1131 DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal)
1132 DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater)
1133 DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal)
1134 
1135 #undef DEFINE_BROADCAST_COMPARE_FUNCTION
1136 
1137 #define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \
1138  DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \
1139  DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \
1140  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
1141  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
1142 
1143 DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus)
1144 DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus)
1145 DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies)
1146 DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides)
1147 
1148 #undef DEFINE_BROADCAST_BINARY_FUNCTION
1149 
1150 DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
1151 DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
1152 DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
1153 
1154 #define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
1155  DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
1156  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
1157  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
1158 
1159 DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
1160 DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
1161 DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
1162 
1163 #undef DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION
1164 
1165 #undef DELEGATE_BROADCAST_BINARY_FUNCTION
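
// Semantics sketch (illustrative only; ExampleBroadcastAddUsage is a
// hypothetical helper, not part of the library): Add with shapes {2, 3} and
// {3} broadcasts B across the rows of A, NumPy-style, which dispatches to the
// RowwiseAdd fast path above.
inline void ExampleBroadcastAddUsage(CPUContext* context) {
  const int A_dims[2] = {2, 3};
  const int B_dims[1] = {3};
  const float A[6] = {1, 2, 3, 4, 5, 6};
  const float B[3] = {10, 20, 30};
  float C[6];
  Add<float, CPUContext>(2, A_dims, 1, B_dims, A, B, C, context);
  // C now holds {11, 22, 33, 14, 25, 36}.
}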
1166 
1167 #define CAFFE2_RAND_UNIFORM_REAL(T) \
1168  template <> \
1169  C10_EXPORT void RandUniform<T, CPUContext>( \
1170  const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1171  std::uniform_real_distribution<T> distribution(a, b); \
1172  for (size_t i = 0; i < n; ++i) { \
1173  r[i] = distribution(context->RandGenerator()); \
1174  } \
1175  }
1176 CAFFE2_RAND_UNIFORM_REAL(float);
1177 CAFFE2_RAND_UNIFORM_REAL(double);
1178 #undef CAFFE2_RAND_UNIFORM_REAL
1179 
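// Note: std::uniform_int_distribution is not defined for 8-bit (char) types,
// so the specializations below draw 16-bit values and cast them down.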
1180 #define CAFFE2_RAND_UNIFORM_CHAR(T) \
1181  template <> \
1182  C10_EXPORT void RandUniform<T, CPUContext>( \
1183  const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1184  std::uniform_int_distribution<short> distribution((short)a, (short)b); \
1185  for (size_t i = 0; i < n; ++i) { \
1186  r[i] = static_cast<T>(distribution(context->RandGenerator())); \
1187  } \
1188  }
1189 CAFFE2_RAND_UNIFORM_CHAR(int8_t);
1190 CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
1191 #undef CAFFE2_RAND_UNIFORM_CHAR
1192 
1193 #define CAFFE2_RAND_UNIFORM_INT(T) \
1194  template <> \
1195  C10_EXPORT void RandUniform<T, CPUContext>( \
1196  const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1197  std::uniform_int_distribution<T> distribution(a, b); \
1198  for (size_t i = 0; i < n; ++i) { \
1199  r[i] = distribution(context->RandGenerator()); \
1200  } \
1201  }
1202 
1203 CAFFE2_RAND_UNIFORM_INT(int16_t);
1204 CAFFE2_RAND_UNIFORM_INT(int32_t);
1205 CAFFE2_RAND_UNIFORM_INT(int64_t);
1206 CAFFE2_RAND_UNIFORM_INT(uint16_t);
1207 CAFFE2_RAND_UNIFORM_INT(uint32_t);
1208 CAFFE2_RAND_UNIFORM_INT(uint64_t);
1209 #undef CAFFE2_RAND_UNIFORM_INT
1210 
1211 // The values produced here are not uniformly distributed between a and b.
1212 // Instead, the implementation takes advantage of a normal distribution to
1213 // generate numbers with mean = sum / n.
1214 // Ideally the algorithm would generate n numbers between 0 and 1, add them
1215 // up as scaled_sum, and use sum / scaled_sum to rescale the values into the
1216 // range between a and b.
1217 // That algorithm is non-trivial, given that the adjustment would be different
1218 // for each value.
1219 #define CAFFE2_RAND_FIXED_SUM(T) \
1220  template <> \
1221  C10_EXPORT void RandFixedSum<T, CPUContext>( \
1222  const size_t n, \
1223  const T a, \
1224  const T b, \
1225  const T sum, \
1226  T* r, \
1227  CPUContext* context) { \
1228  CAFFE_ENFORCE_GE(a, 0); \
1229  CAFFE_ENFORCE_GE(sum / (double)n, a); \
1230  CAFFE_ENFORCE_LE(sum / (double)n, b); \
1231  T current_sum = 0; \
1232  T remaining_sum = sum; \
1233  for (size_t i = 0; i < n; ++i) { \
1234  auto remaining_numbers = n - 1 - i; \
1235  double mean = (sum - current_sum) / (remaining_numbers + 1); \
1236  double stdev = std::min(mean - a, b - mean); \
1237  std::normal_distribution<double> distribution{mean, stdev / 4.0}; \
1238  T value, remaining_sum_test; \
1239  do { \
1240  value = distribution(context->RandGenerator()); \
1241  remaining_sum_test = remaining_sum - value; \
1242  } while (value < a || remaining_sum_test < a * remaining_numbers || \
1243  value > b || remaining_sum_test > b * remaining_numbers); \
1244  r[i] = value; \
1245  CAFFE_ENFORCE(a <= value && value <= b); \
1246  current_sum += value; \
1247  remaining_sum -= value; \
1248  CAFFE_ENFORCE_GE(remaining_sum, a* remaining_numbers); \
1249  CAFFE_ENFORCE_LE(remaining_sum, b* remaining_numbers); \
1250  } \
1251  r[n - 1] += remaining_sum; \
1252  current_sum += remaining_sum; \
1253  CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \
1254  CAFFE_ENFORCE_EQ(current_sum, sum); \
1255  }
1256 CAFFE2_RAND_FIXED_SUM(float);
1257 CAFFE2_RAND_FIXED_SUM(double);
1258 CAFFE2_RAND_FIXED_SUM(int8_t);
1259 CAFFE2_RAND_FIXED_SUM(int16_t);
1260 CAFFE2_RAND_FIXED_SUM(int32_t);
1261 CAFFE2_RAND_FIXED_SUM(int64_t);
1262 CAFFE2_RAND_FIXED_SUM(uint8_t);
1263 CAFFE2_RAND_FIXED_SUM(uint16_t);
1264 CAFFE2_RAND_FIXED_SUM(uint32_t);
1265 CAFFE2_RAND_FIXED_SUM(uint64_t);
1266 #undef CAFFE2_RAND_FIXED_SUM
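
// Usage sketch (illustrative only; ExampleRandFixedSumUsage is a hypothetical
// helper, not part of the library): draws 4 values, each in [0, 10], whose
// total is the requested sum of 20.
inline void ExampleRandFixedSumUsage(CPUContext* context) {
  float r[4];
  RandFixedSum<float, CPUContext>(4, 0.0f, 10.0f, 20.0f, r, context);
  // Each r[i] lies in [0, 10] and the four values add up to 20.
}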
1267 
1268 template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app>
1269 Ind_t generate_stack_distance(
1270  std::vector<Ind_t>& cum_val,
1271  std::vector<Val_t>& cum_dis,
1272  std::vector<Ind_t>& cum_map,
1273  Ind_t max_i,
1274  Ind_t i,
1275  Context_t* context) {
1276  /* Description:
1277  Uses the inverse transform sampling method to generate values of a random
1278  variable X described by the cumulative distribution F (cum_val, cum_dis).
1279  Note that we may choose to use the inverse map of F (cum_map) as an
1280  approximation to avoid searching. Also, the probability is scaled so that
1281  the values stay within max_i references, because a stack distance cannot
1282  exceed the number of already generated references (max_i).
1283  */
1284  Ind_t j, k, n;
1285  Val_t u, f, fi;
1286 
1287  // generate a random number u in [0,1] from a uniform distribution U
1288  math::RandUniform<Val_t, Context_t>(1, 0, 1, &u, context);
1289 
1290  // scale the random number u to be within range [0,f(i)], if needed
1291  if (i < max_i) {
1292  // approach 2: allows gaps in the distribution
1293  j = (std::upper_bound(cum_val.begin(), cum_val.end(), i) -
1294  cum_val.begin()) -
1295  1;
1296  fi = cum_dis[j];
1297  u *= fi;
1298  }
1299  // 2. compute the stack distance value of x, s.t. F(x)=u
1300  // notice that the cumulative distribution F increases monotonically up to 1
1301  if (cdf_app) {
1302  // look up cum_val corresponding to u <= cum_dis[j]
1303  k = cum_map.size();
1304  n = (Ind_t)round(u * k);
1305  j = cum_map[n];
1306  return cum_val[j];
1307  } else {
1308  // iterate until you find the cum_val corresponding to u <= cum_dis[j]
1309  for (j = 0; j < cum_dis.size(); j++) {
1310  f = cum_dis[j];
1311  if (u <= f) {
1312  return cum_val[j];
1313  }
1314  }
1315  return cum_val[j - 1];
1316  }
1317 }
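
// Minimal sketch of the inverse transform sampling step above (illustrative
// only; InverseCdfSample is a hypothetical helper, not part of the library):
// given support values cum_val and their cumulative probabilities cum_dis,
// return the first value whose CDF entry reaches u.
inline int InverseCdfSample(
    const std::vector<int>& cum_val,
    const std::vector<double>& cum_dis,
    const double u) {
  const auto it = std::lower_bound(cum_dis.begin(), cum_dis.end(), u);
  const std::size_t j = std::min(
      static_cast<std::size_t>(it - cum_dis.begin()), cum_val.size() - 1);
  return cum_val[j];
}
// For cum_val = {0, 1, 3, 4, 5} and cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0},
// u = 0.7 falls into the third bucket, so the sample is 3.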
1318 
1319 template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app>
1320 C10_EXPORT void generate_trace_lru(
1321  std::vector<Ind_t>& uni_ref,
1322  std::vector<Ind_t>& cum_val,
1323  std::vector<Val_t>& cum_dis,
1324  std::vector<Ind_t>& cum_map,
1325  Context_t* context,
1326  Ind_t cache_line_size,
1327  Ind_t n,
1328  Type min,
1329  Type max,
1330  Type* syn_ref) {
1331  /* Description:
1332  Generates a synthetic trace from a list of unique accesses uni_ref and a
1333  cumulative distribution of the distances (cum_val, cum_dis) between them.
1334  There is also an option to use the cum_map approximation to avoid searching.
1335  */
1336  Ind_t i, j, k, sd, line_ref, mem_ref, mem_ref_within_line;
1337  Ind_t max_sd = cum_val.back();
1338  Ind_t l = uni_ref.size();
1339 
1340  for (i = 0, j = 0; j < n; j++) {
1341  // generate stack distance
1342  sd = generate_stack_distance<Type, Val_t, Ind_t, Context_t, cdf_app>(
1343  cum_val, cum_dis, cum_map, max_sd, i, context);
1344  // fixed access within cache line
1345  mem_ref_within_line = 0;
1346  // random access within cache line
1347  // Val_t r;
1348  // math::RandUniform<Val_t, Context_t>(1, 0, 1, &r, context);
1349  // mem_ref_within_line = floor(r*cache_line_size);
1350 
1351  // generate memory reference
1352  if (sd == 0) {
1353  k = 0;
1354  i++;
1355  } else {
1356  k = l - sd;
1357  }
1358  line_ref = uni_ref[k]; // pop k-th element
1359  uni_ref.erase(uni_ref.begin() + k);
1360  uni_ref.push_back(line_ref); // append it back
1361  mem_ref = line_ref * cache_line_size + mem_ref_within_line;
1362  /*
1363  //debug prints
1364  if ((mem_ref < min) || (mem_ref > max)) {
1365  //printf("mem_ref[%d]=%d (%ld) \n",j,mem_ref,syn_ref[j]);
1366  std::cout << "syn_ref[" << j << "]=" << (Type)mem_ref << " ";
1367  std::cout << "(" << mem_ref << ") ";
1368  std::cout << "[" << min << "," << max << "]" << std::endl;
1369  int scanf_temp;
1370  scanf("%d",&scanf_temp);
1371  }
1372  */
1373 
1374  // patch mem_ref to be within range
1375 // WARNING: this should not be needed if the instantiation type and the
1376 // distribution choice are correct. It is remedying a symptom of earlier mistakes.
1377  if (mem_ref < min) {
1378  mem_ref = min;
1379  // std::cout << "clamping (min) mem_ref=" << mem_ref << std::endl;
1380  }
1381  if (mem_ref > max) {
1382  mem_ref = max; // mem_ref % max;
1383  // std::cout << "clamping (max) mem_ref=" << mem_ref << std::endl;
1384  }
1385 
1386  // save generated memory reference
1387  syn_ref[j] = (Type)mem_ref;
1388  }
1389 }
1390 
1391 // Generate n values from a synthetic data distribution,
1392 // defined by unique accesses and stack distances.
1393 // WARNING: this can be created for all tables or per table, but in the latter
1394 // case we need to know the table id to sample from the right distribution.
1395 #define CAFFE2_RAND_SYNTHETIC_DATA(T) \
1396  template <> \
1397  C10_EXPORT void RandSyntheticData<T, CPUContext>( \
1398  const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1399  /* unique memory references */ \
1400  std::vector<int> mem_ref = {1, 2, 3, 4, 5, 6}; \
1401  /* cumulative distribution of distances */ \
1402  std::vector<int> cum_val = {0, 1, 3, 4, 5}; \
1403  std::vector<double> cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0}; \
1404  /* inverse map of cumulative distribution (for O(1) lookup) */ \
1405  /* std::vector<int> cum_map = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4}; */ \
1406  int k = 10; /* 100; */ \
1407  std::vector<int> cum_map(k, 0); \
1408  for (int j = 0; j < cum_dis.size();) { \
1409  int sz = (int)round(cum_dis[j] * k); \
1410  for (int i = 0; i < sz; i++) { \
1411  cum_map[j + i] = j; \
1412  } \
1413  j += sz; \
1414  } \
1415  \
1416  /* code to generate the synthetic data from the above values */ \
1417  const int cache_line = 1; /* 64; */ \
1418  generate_trace_lru<T, double, int, CPUContext, false>( \
1419  mem_ref, cum_val, cum_dis, cum_map, context, cache_line, n, a, b, r); \
1420  }
1421 
1422 CAFFE2_RAND_SYNTHETIC_DATA(float);
1423 CAFFE2_RAND_SYNTHETIC_DATA(double);
1424 CAFFE2_RAND_SYNTHETIC_DATA(int8_t);
1425 CAFFE2_RAND_SYNTHETIC_DATA(int16_t);
1426 CAFFE2_RAND_SYNTHETIC_DATA(int32_t);
1427 CAFFE2_RAND_SYNTHETIC_DATA(int64_t);
1428 CAFFE2_RAND_SYNTHETIC_DATA(uint8_t);
1429 CAFFE2_RAND_SYNTHETIC_DATA(uint16_t);
1430 CAFFE2_RAND_SYNTHETIC_DATA(uint32_t);
1431 CAFFE2_RAND_SYNTHETIC_DATA(uint64_t);
1432 #undef CAFFE2_RAND_SYNTHETIC_DATA
1433 
1434 #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \
1435  template <> \
1436  C10_EXPORT void RandUniformUnique<T, CPUContext>( \
1437  const size_t n, \
1438  const T a, \
1439  const T b, \
1440  T* r, \
1441  const size_t m, \
1442  const T* avoid, \
1443  CPUContext* context) { \
1444  CAFFE_ENFORCE_LE( \
1445  n, b - a - m + 1, "Cannot satisfy the unique requirement"); \
1446  std::unordered_set<T> avoid_set(n); \
1447  if (m) { \
1448  avoid_set.insert(avoid, avoid + m); \
1449  CAFFE_ENFORCE_EQ( \
1450  m, avoid_set.size(), "Avoid should be unique"); \
1451  } \
1452  std::uniform_int_distribution<T> distribution(a, b); \
1453  T v = 0; \
1454  for (size_t i = 0; i < n; ++i) { \
1455  do { \
1456  v = distribution(context->RandGenerator()); \
1457  } while (avoid_set.count(v)); \
1458  r[i] = v; \
1459  avoid_set.insert(v); \
1460  } \
1461  }
1462 
1463 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
1464 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
1465 #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE
1466 
1467 template <>
1468 C10_EXPORT void RandGaussian<float, CPUContext>(
1469  const size_t n,
1470  const float mean,
1471  const float std,
1472  float* r,
1473  CPUContext* context) {
1474  std::normal_distribution<float> distribution(mean, std);
1475  for (size_t i = 0; i < n; ++i) {
1476  r[i] = distribution(context->RandGenerator());
1477  }
1478 }
1479 
1480 #define CAFFE2_SPECIALIZED_SUM(T) \
1481  template <> \
1482  C10_EXPORT void Sum<T, CPUContext>( \
1483  const int N, \
1484  const T* x, \
1485  T* y, \
1486  CPUContext* /* unused */, \
1487  Tensor* /* unused */) { \
1488  *y = ConstEigenVectorMap<T>(x, N).sum(); \
1489  }
1490 
1491 CAFFE2_SPECIALIZED_SUM(float);
1492 CAFFE2_SPECIALIZED_SUM(int32_t);
1493 CAFFE2_SPECIALIZED_SUM(int64_t);
1494 
1495 #undef CAFFE2_SPECIALIZED_SUM
1496 
1497 template <>
1498 C10_EXPORT void SumSqr<float, CPUContext>(
1499  const int N,
1500  const float* x,
1501  float* y,
1502  CPUContext* /*context*/ /* unused */,
1503  Tensor* /*scratch_ptr*/ /* unused */) {
1504  *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
1505 }
1506 
1507 template <>
1508 C10_EXPORT void Select<float, CPUContext>(
1509  const int N,
1510  const int D,
1511  const float* x,
1512  const int* idx,
1513  float* y,
1514  CPUContext* /*context*/) {
1515  for (int i = 0; i < N; ++i) {
1516  DCHECK_LT(idx[i], D);
1517  y[i] = x[i * D + idx[i]];
1518  }
1519 }
1520 
1521 template <>
1522 C10_EXPORT void CopyMatrix<CPUContext>(
1523  const size_t itemsize,
1524  const int M,
1525  const int N,
1526  const void* A,
1527  const int lda,
1528  void* B,
1529  const int ldb,
1530  CPUContext* /*context*/,
1531  TypeMeta::Copy copy) {
1532  if (A == nullptr || B == nullptr) {
1533  return;
1534  }
1535  if (lda == N && ldb == N) {
1536  // can coalesce into a single memcpy of size M * N
1537  if (copy) {
1538  copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
1539  } else {
1540  memcpy(
1541  static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
1542  }
1543  return;
1544  }
1545 
1546  for (int i = 0; i < M; ++i) {
1547  if (copy) {
1548  copy(
1549  static_cast<const char*>(A) + lda * i * itemsize,
1550  static_cast<char*>(B) + ldb * i * itemsize,
1551  N);
1552  } else {
1553  memcpy(
1554  static_cast<char*>(B) + ldb * i * itemsize,
1555  static_cast<const char*>(A) + lda * i * itemsize,
1556  itemsize * N);
1557  }
1558  }
1559 }
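
// Usage sketch (illustrative only; ExampleCopyMatrixUsage is a hypothetical
// helper, not part of the library): lda and ldb are leading dimensions (row
// strides in elements), so the call below copies the top-left 2x2 block of a
// row-major 3x4 float matrix into a packed 2x2 buffer.
inline void ExampleCopyMatrixUsage(CPUContext* context) {
  const float src[3 * 4] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  float dst[2 * 2];
  CopyMatrix<CPUContext>(
      sizeof(float), 2, 2, src, /* lda */ 4, dst, /* ldb */ 2, context,
      /* copy */ nullptr);
  // dst now holds {0, 1, 4, 5}.
}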
1560 
1561 #ifdef CAFFE2_USE_MKL
1562 
1563 #define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \
1564  template <> \
1565  C10_EXPORT void CopyMatrix<T, CPUContext>( \
1566  const int M, \
1567  const int N, \
1568  const T* A, \
1569  const int lda, \
1570  T* B, \
1571  const int ldb, \
1572  CPUContext* /* context */) { \
1573  Func('R', 'N', M, N, T(1), A, lda, B, ldb); \
1574  } \
1575  template <> \
1576  C10_EXPORT void CopyMatrix<T, CPUContext>( \
1577  const int M, \
1578  const int N, \
1579  const T* A, \
1580  const int A_outer_stride, \
1581  const int A_inner_stride, \
1582  T* B, \
1583  const int B_outer_stride, \
1584  const int B_inner_stride, \
1585  CPUContext* /* context */) { \
1586  Func##2( \
1587  'R', \
1588  'N', \
1589  M, \
1590  N, \
1591  T(1), \
1592  A, \
1593  A_outer_stride, \
1594  A_inner_stride, \
1595  B, \
1596  B_outer_stride, \
1597  B_inner_stride); \
1598  }
1599 DELEGATE_COPY_MATRIX_FUNCTION(float, mkl_somatcopy)
1600 DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy)
1601 #undef DELEGATE_COPY_MATRIX_FUNCTION
1602 
1603 #endif // CAFFE2_USE_MKL
1604 
1605 #define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \
1606  template <> \
1607  C10_EXPORT void CopyMatrix<T, CPUContext>( \
1608  const int M, \
1609  const int N, \
1610  const T* A, \
1611  const int lda, \
1612  T* B, \
1613  const int ldb, \
1614  CPUContext* /* context */) { \
1615  if (M == 0 || N == 0) { \
1616  return; \
1617  } \
1618  if (lda == N) { \
1619  if (ldb == N) { \
1620  std::memcpy(B, A, sizeof(T) * M * N); \
1621  } else { \
1622  EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \
1623  ConstEigenMatrixMap<T>(A, N, M); \
1624  } \
1625  } else { \
1626  if (ldb == N) { \
1627  EigenMatrixMap<T>(B, N, M) = ConstEigenOuterStridedMatrixMap<T>( \
1628  A, N, M, EigenOuterStride(lda)); \
1629  } else { \
1630  EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \
1631  ConstEigenOuterStridedMatrixMap<T>( \
1632  A, N, M, EigenOuterStride(lda)); \
1633  } \
1634  } \
1635  } \
1636  template <> \
1637  C10_EXPORT void CopyMatrix<T, CPUContext>( \
1638  const int M, \
1639  const int N, \
1640  const T* A, \
1641  const int A_outer_stride, \
1642  const int A_inner_stride, \
1643  T* B, \
1644  const int B_outer_stride, \
1645  const int B_inner_stride, \
1646  CPUContext* context) { \
1647  if (A_inner_stride == 1 && B_inner_stride == 1) { \
1648  CopyMatrix<T, CPUContext>( \
1649  M, N, A, A_outer_stride, B, B_outer_stride, context); \
1650  return; \
1651  } \
1652  EigenStridedMatrixMap<T>( \
1653  B, N, M, EigenStride(B_outer_stride, B_inner_stride)) = \
1654  ConstEigenStridedMatrixMap<T>( \
1655  A, N, M, EigenStride(A_outer_stride, A_inner_stride)); \
1656  }
1657 
1658 #ifndef CAFFE2_USE_MKL
1659 CAFFE2_SPECIALIZED_COPY_MATRIX(float)
1660 CAFFE2_SPECIALIZED_COPY_MATRIX(double)
1661 #endif // CAFFE2_USE_MKL
1662 
1663 CAFFE2_SPECIALIZED_COPY_MATRIX(int)
1664 CAFFE2_SPECIALIZED_COPY_MATRIX(int64_t)
1665 CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint8_t)
1666 CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t)
1667 
1668 #undef CAFFE2_SPECIALIZED_COPY_MATRIX
1669 
1670 namespace {
1671 
1672 template <typename T>
1673 C10_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW(
1674  const int C,
1675  const int H,
1676  const int W,
1677  const int kernel_h,
1678  const int kernel_w,
1679  const int stride_h,
1680  const int stride_w,
1681  const T* img_data,
1682  T* col_data,
1683  CPUContext* context) {
1684  const int output_h = (H - kernel_h) / stride_h + 1;
1685  const int output_w = (W - kernel_w) / stride_w + 1;
1686  const int output_size = output_h * output_w;
1687  for (int c = 0; c < C; ++c) {
1688  for (int kh = 0; kh < kernel_h; ++kh) {
1689  for (int kw = 0; kw < kernel_w; ++kw) {
1690  const T* src = img_data + kh * W + kw;
1691  if (stride_w == 1) {
1692  CopyMatrix<T, CPUContext>(
1693  output_h,
1694  output_w,
1695  src,
1696  stride_h * W,
1697  col_data,
1698  output_w,
1699  context);
1700  } else {
1701  CopyMatrix<T, CPUContext>(
1702  output_h,
1703  output_w,
1704  src,
1705  stride_h * W,
1706  stride_w,
1707  col_data,
1708  output_w,
1709  1,
1710  context);
1711  }
1712  col_data += output_size;
1713  }
1714  }
1715  img_data += H * W;
1716  }
1717 }
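
// Worked example (illustrative only; ExampleIm2ColFastPath is a hypothetical
// helper, not part of the library): for a single-channel 3x3 image, a 2x2
// kernel, and stride 1, the fast path above emits a
// (C * kernel_h * kernel_w) x (output_h * output_w) = 4 x 4 column buffer,
// one row per (channel, kernel offset) pair.
template <typename T>
void ExampleIm2ColFastPath(CPUContext* context) {
  const T img[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; // 1 x 3 x 3, NCHW
  T col[16];                                    // 4 x 4 column buffer
  Im2ColZeroPaddingAndNoDilationNCHW<T>(
      1, 3, 3, 2, 2, 1, 1, img, col, context);
  // col rows: {1, 2, 4, 5}, {2, 3, 5, 6}, {4, 5, 7, 8}, {5, 6, 8, 9}.
}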
1718 
1719 template <typename T>
1720 C10_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW(
1721  const int C,
1722  const int H,
1723  const int W,
1724  const int kernel_h,
1725  const int kernel_w,
1726  const int stride_h,
1727  const int stride_w,
1728  const T* col_data,
1729  T* img_data,
1730  CPUContext* context) {
1731  Set<T, CPUContext>(C * H * W, T(0), img_data, context);
1732  const int output_h = (H - kernel_h) / stride_h + 1;
1733  const int output_w = (W - kernel_w) / stride_w + 1;
1734  const int output_size = output_h * output_w;
1735  for (int c = 0; c < C; ++c) {
1736  for (int kh = 0; kh < kernel_h; ++kh) {
1737  for (int kw = 0; kw < kernel_w; ++kw) {
1738  T* dst = img_data + kh * W + kw;
1739  if (stride_w == 1) {
1740  EigenOuterStridedArrayMap<T>(
1741  dst, output_w, output_h, EigenOuterStride(stride_h * W)) +=
1742  ConstEigenArrayMap<T>(col_data, output_w, output_h);
1743  } else {
1744  EigenStridedArrayMap<T>(
1745  dst, output_w, output_h, EigenStride(stride_h * W, stride_w)) +=
1746  ConstEigenArrayMap<T>(col_data, output_w, output_h);
1747  }
1748  col_data += output_size;
1749  }
1750  }
1751  img_data += H * W;
1752  }
1753 }
1754 
1755 template <typename T>
1756 C10_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC(
1757  const int C,
1758  const int H,
1759  const int W,
1760  const int kernel_h,
1761  const int kernel_w,
1762  const int stride_h,
1763  const int stride_w,
1764  const T* img_data,
1765  T* col_data,
1766  CPUContext* context) {
1767  const int output_h = (H - kernel_h) / stride_h + 1;
1768  const int output_w = (W - kernel_w) / stride_w + 1;
1769  const int kernel_size = kernel_h * kernel_w;
1770  for (int yh = 0; yh < output_h; ++yh) {
1771  for (int yw = 0; yw < output_w; ++yw) {
1772  const T* src = img_data + (yh * stride_h * W + yw * stride_w) * C;
1773  CopyMatrix<T, CPUContext>(
1774  kernel_h, kernel_w * C, src, W * C, col_data, kernel_w * C, context);
1775  col_data += kernel_size * C;
1776  }
1777  }
1778 }
1779 
1780 template <typename T>
1781 C10_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC(
1782  const int C,
1783  const int H,
1784  const int W,
1785  const int kernel_h,
1786  const int kernel_w,
1787  const int stride_h,
1788  const int stride_w,
1789  const T* col_data,
1790  T* img_data,
1791  CPUContext* context) {
1792  Set<T, CPUContext>(H * W * C, T(0), img_data, context);
1793  const int output_h = (H - kernel_h) / stride_h + 1;
1794  const int output_w = (W - kernel_w) / stride_w + 1;
1795  const int kernel_size = kernel_h * kernel_w;
1796  for (int yh = 0; yh < output_h; ++yh) {
1797  for (int yw = 0; yw < output_w; ++yw) {
1798  T* dst = img_data + (yh * stride_h * W + yw * stride_w) * C;
1799  EigenOuterStridedArrayMap<T>(
1800  dst, kernel_w * C, kernel_h, EigenOuterStride(W * C)) +=
1801  ConstEigenArrayMap<T>(col_data, kernel_w * C, kernel_h);
1802  col_data += kernel_size * C;
1803  }
1804  }
1805 }
1806 
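// Shared N-dimensional NCHW kernel for Im2ColNd (kCol2Im == false) and
// Col2ImNd (kCol2Im == true). The outer loop runs over col_shape[0]
// (channels x kernel positions) and DivMod splits each index into per-axis
// kernel offsets; the inner loop runs over the output positions, mapping each
// column index to an image index and treating out-of-range positions as
// zero padding.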
1807 template <typename T, bool kCol2Im>
1808 C10_EXPORT void Im2ColNdNCHWImpl(
1809  const int N,
1810  const int img_size,
1811  const int col_size,
1812  const int* img_shape,
1813  const int* col_shape,
1814  const int* kernel_shape,
1815  const int* stride,
1816  const int* dilation,
1817  const int* pad,
1818  const float* X_data,
1819  float* Y_data) {
1820  if (kCol2Im) {
1821  std::memset(Y_data, 0, img_size * sizeof(float));
1822  }
1823  const int outer_size = col_shape[0];
1824  const int inner_size = col_size / outer_size;
1825  const int kernel_size = std::accumulate(
1826  kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
1827  std::vector<FixedDivisor<int>> kernel_shape_div(N);
1828  for (int i = 0; i < N; ++i) {
1829  kernel_shape_div[i] = FixedDivisor<int>(kernel_shape[i]);
1830  }
1831  std::vector<int> d_offset(N, 0);
1832  std::vector<int> d_iter(N, 0);
1833  for (int i = 0; i < outer_size; ++i) {
1834  // Loop over spatial axes in reverse order to compute a per-axis offset.
1835  int offset = i;
1836  for (int d_i = N - 1; d_i >= 0; --d_i) {
1837  kernel_shape_div[d_i].DivMod(offset, &offset, &d_offset[d_i]);
1838  }
1839  for (int j = 0; j < inner_size; ++j) {
1840  // Loop over spatial axes in forward order to compute the indices in the
1841  // image and column, and whether the index lies in the padding.
1842  const int col_index = i * inner_size + j;
1843  int img_index = i / kernel_size;
1844  bool is_padding = false;
1845  for (int d_i = 0; d_i < N; ++d_i) {
1846  const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
1847  d_offset[d_i] * dilation[d_i];
1848  is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape[d_i + 1]);
1849  img_index = img_index * img_shape[d_i + 1] + d_img;
1850  }
1851  if (!kCol2Im) {
1852  Y_data[col_index] = is_padding ? 0 : X_data[img_index];
1853  } else if (!is_padding) {
1854  Y_data[img_index] += X_data[col_index];
1855  }
1856  utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
1857  }
1858  }
1859 }
1860 
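// 3D (video) im2col in NCHW order. pad_p/pad_a are the temporal (front/back)
// pads, pad_t/pad_b the vertical pads and pad_l/pad_r the horizontal pads.
// Three paths follow: a memcpy fast path for zero padding and no dilation, a
// fast path for equal front/back, top/bottom and left/right padding, and a
// generic baseline.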
1861 template <typename T>
1862 void Im2Col3dNCHWImpl(
1863  const int channels,
1864  const int clip_len,
1865  const int height,
1866  const int width,
1867  const int kernel_t,
1868  const int kernel_h,
1869  const int kernel_w,
1870  const int dilation_t,
1871  const int dilation_h,
1872  const int dilation_w,
1873  const int pad_p,
1874  const int pad_t,
1875  const int pad_l,
1876  const int pad_a,
1877  const int pad_b,
1878  const int pad_r,
1879  const int stride_t,
1880  const int stride_h,
1881  const int stride_w,
1882  const T* img_data,
1883  T* col_data) {
1884  const int output_t =
1885  (clip_len + pad_p + pad_a - (dilation_t * (kernel_t - 1) + 1)) /
1886  stride_t +
1887  1;
1888  const int output_h =
1889  (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1890  1;
1891  const int output_w =
1892  (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1893  1;
1894  const int kernel_size = kernel_t * kernel_h * kernel_w;
1895  const int kernel_hw_size = kernel_h * kernel_w;
1896  const int output_size = output_t * output_h * output_w;
1897  const int channel_size = clip_len * height * width;
1898  const int output_hw_size = output_h * output_w;
1899  const int channel_hw_size = height * width;
1900 
1901  // Fast path for zero padding and no dilation
1902  // From Torch, THNN_(unfolded_copy)
1903  if (dilation_t == 1 && dilation_h == 1 && dilation_w == 1 && pad_a == 0 &&
1904  pad_p == 0 && pad_l == 0 && pad_r == 0 && pad_t == 0 && pad_b == 0) {
1905  for (auto k = 0; k < channels * kernel_size; k++) {
1906  const auto nip = k / kernel_size;
1907  const auto rest = k % kernel_size;
1908  const auto kt = rest / kernel_hw_size;
1909  const auto rest_hw = rest % kernel_hw_size;
1910  const auto kh = rest_hw / kernel_w;
1911  const auto kw = rest_hw % kernel_w;
1912  auto* dst = col_data + nip * (kernel_size * output_size) +
1913  kt * (kernel_hw_size * output_size) + kh * (kernel_w * output_size) +
1914  kw * output_size;
1915  const auto* src = img_data + nip * channel_size;
1916  for (auto t = 0; t < output_t; t++) {
1917  const auto it = t * stride_t + kt;
1918  for (auto y = 0; y < output_h; y++) {
1919  const auto iy = y * stride_h + kh;
1920  const auto ix = kw;
1921  if (stride_w == 1) {
1922  memcpy(
1923  dst + (t * output_hw_size + y * output_w),
1924  src + (it * channel_hw_size + iy * width + ix),
1925  sizeof(T) * output_w);
1926  } else {
1927  for (auto x = 0; x < output_w; x++) {
1928  memcpy(
1929  dst + (t * output_hw_size + y * output_w + x),
1930  src + (it * channel_hw_size + iy * width + ix + x * stride_w),
1931  sizeof(T));
1932  }
1933  }
1934  }
1935  }
1936  }
1937  return;
1938  }
1939  // Fast path for equal padding
1940  if (pad_a == pad_p && pad_l == pad_r && pad_t == pad_b) {
1941  const int pad_f = pad_a;
1942  const int pad_h = pad_t;
1943  const int pad_w = pad_l;
1944  for (int channel = channels; channel--; img_data += channel_size) {
1945  for (int kernel_frame = 0; kernel_frame < kernel_t; kernel_frame++) {
1946  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1947  for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1948  int input_frame = -pad_f + kernel_frame * dilation_t;
1949  for (int output_frames = output_t; output_frames; output_frames--) {
1950  if (!utils::IsAGeZeroAndALtB(input_frame, clip_len)) {
1951  for (int output_rows = output_h; output_rows; output_rows--) {
1952  for (int output_cols = output_w; output_cols; output_cols--) {
1953  *(col_data++) = 0;
1954  }
1955  }
1956  } else {
1957  int input_row = -pad_h + kernel_row * dilation_h;
1958  for (int output_rows = output_h; output_rows; output_rows--) {
1959  if (!utils::IsAGeZeroAndALtB(input_row, height)) {
1960  for (int output_cols = output_w; output_cols;
1961  output_cols--) {
1962  *(col_data++) = 0;
1963  }
1964  } else {
1965  int input_col = -pad_w + kernel_col * dilation_w;
1966  for (int output_col = output_w; output_col; output_col--) {
1967  if (utils::IsAGeZeroAndALtB(input_col, width)) {
1968  *(col_data++) = img_data
1969  [(input_frame * height + input_row) * width +
1970  input_col];
1971  } else {
1972  *(col_data++) = 0;
1973  }
1974  input_col += stride_w;
1975  }
1976  }
1977  input_row += stride_h;
1978  }
1979  }
1980  input_frame += stride_t;
1981  }
1982  }
1983  }
1984  }
1985  }
1986  return;
1987  }
1988 
1989  // Baseline
1990  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
1991  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1992  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1993 
1994  int clip_col = (clip_len + pad_p + pad_a - dkernel_t) / stride_t + 1;
1995  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1996  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1997 
1998  int channels_col = channels * kernel_t * kernel_h * kernel_w;
1999  for (int c = 0; c < channels_col; ++c) {
2000  int w_offset = c % kernel_w;
2001  int h_offset = (c / kernel_w) % kernel_h;
2002  int t_offset = (c / kernel_w / kernel_h) % kernel_t;
2003  int c_im = c / kernel_h / kernel_w / kernel_t;
2004  for (int t = 0; t < clip_col; ++t) {
2005  for (int h = 0; h < height_col; ++h) {
2006  for (int w = 0; w < width_col; ++w) {
2007  int t_pad = t * stride_t - pad_p + t_offset * dilation_t;
2008  int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
2009  int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
2010  if (t_pad >= 0 && t_pad < clip_len && h_pad >= 0 && h_pad < height &&
2011  w_pad >= 0 && w_pad < width) {
2012  col_data[((c * clip_col + t) * height_col + h) * width_col + w] =
2013  img_data
2014  [((c_im * clip_len + t_pad) * height + h_pad) * width +
2015  w_pad];
2016  } else {
2017  col_data[((c * clip_col + t) * height_col + h) * width_col + w] = 0;
2018  }
2019  }
2020  }
2021  }
2022  }
2023 }
2024 
2025 } // namespace
2026 
2027 template <>
2028 C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NCHW>(
2029  const int N,
2030  const int img_size,
2031  const int col_size,
2032  const int* img_shape,
2033  const int* col_shape,
2034  const int* kernel_shape,
2035  const int* stride,
2036  const int* dilation,
2037  const int* pad,
2038  const float* img_data,
2039  float* col_data,
2040  CPUContext* /* context */,
2041  const int /* groups */) {
2042  // In NCHW, the number of groups doesn't affect Im2Col.
2043  if (N == 3) {
2044  const int channels =
2045  col_shape[0] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
2046  Im2Col3dNCHWImpl<float>(
2047  channels,
2048  img_shape[1],
2049  img_shape[2],
2050  img_shape[3],
2051  kernel_shape[0],
2052  kernel_shape[1],
2053  kernel_shape[2],
2054  dilation[0],
2055  dilation[1],
2056  dilation[2],
2057  pad[0],
2058  pad[1],
2059  pad[2],
2060  pad[3],
2061  pad[4],
2062  pad[5],
2063  stride[0],
2064  stride[1],
2065  stride[2],
2066  img_data,
2067  col_data);
2068  } else {
2069  Im2ColNdNCHWImpl<float, false>(
2070  N,
2071  img_size,
2072  col_size,
2073  img_shape,
2074  col_shape,
2075  kernel_shape,
2076  stride,
2077  dilation,
2078  pad,
2079  img_data,
2080  col_data);
2081  }
2082 }
2083 
2084 template <>
2085 C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NCHW>(
2086  const int N,
2087  const int img_size,
2088  const int col_size,
2089  const int* img_shape,
2090  const int* col_shape,
2091  const int* kernel_shape,
2092  const int* stride,
2093  const int* dilation,
2094  const int* pad,
2095  const float* col_data,
2096  float* img_data,
2097  CPUContext* /* context */,
2098  const int /* groups */) {
2099  // In NCHW, the number of groups doesn't affect Col2Im.
2100  Im2ColNdNCHWImpl<float, true>(
2101  N,
2102  img_size,
2103  col_size,
2104  img_shape,
2105  col_shape,
2106  kernel_shape,
2107  stride,
2108  dilation,
2109  pad,
2110  col_data,
2111  img_data);
2112 }
2113 
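// For a C x H x W image, the NCHW column buffer written below holds
// C * kernel_h * kernel_w * output_h * output_w floats, with
// output_h = (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1
// and output_w defined analogously. For example, a 3 x 32 x 32 image with a
// 3 x 3 kernel, stride 1, pad 1 and no dilation produces a 27 x 1024 column
// matrix.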
2114 template <>
2115 C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NCHW>(
2116  const int C,
2117  const int H,
2118  const int W,
2119  const int kernel_h,
2120  const int kernel_w,
2121  const int dilation_h,
2122  const int dilation_w,
2123  const int pad_t,
2124  const int pad_l,
2125  const int pad_b,
2126  const int pad_r,
2127  const int stride_h,
2128  const int stride_w,
2129  const float* img_data,
2130  float* col_data,
2131  CPUContext* context,
2132  const int /* groups */) {
2133  // In NCHW, the number of groups doesn't affect Im2Col.
2134 
2135  // Fast path for zero padding and no dilation
2136  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
2137  dilation_w == 1) {
2138  Im2ColZeroPaddingAndNoDilationNCHW<float>(
2139  C,
2140  H,
2141  W,
2142  kernel_h,
2143  kernel_w,
2144  stride_h,
2145  stride_w,
2146  img_data,
2147  col_data,
2148  context);
2149  return;
2150  }
2151 
2152  // Baseline
2153  const int output_h =
2154  (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
2155  const int output_w =
2156  (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
2157  const int output_size = output_h * output_w;
2158  for (int c = 0; c < C; ++c) {
2159  for (int kh = 0; kh < kernel_h; ++kh) {
2160  for (int kw = 0; kw < kernel_w; ++kw) {
2161  for (int h = 0; h < output_h; ++h) {
2162  const int h_pad = h * stride_h - pad_t + kh * dilation_h;
2163  if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
2164  std::memset(col_data + h * output_w, 0, output_w * sizeof(float));
2165  continue;
2166  }
2167  for (int w = 0; w < output_w; ++w) {
2168  const int w_pad = w * stride_w - pad_l + kw * dilation_w;
2169  col_data[h * output_w + w] = utils::IsAGeZeroAndALtB(w_pad, W)
2170  ? img_data[(c * H + h_pad) * W + w_pad]
2171  : 0;
2172  }
2173  }
2174  col_data += output_size;
2175  }
2176  }
2177  }
2178 }
2179 
2180 template <>
2181 C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NHWC>(
2182  const int C,
2183  const int H,
2184  const int W,
2185  const int kernel_h,
2186  const int kernel_w,
2187  const int dilation_h,
2188  const int dilation_w,
2189  const int pad_t,
2190  const int pad_l,
2191  const int pad_b,
2192  const int pad_r,
2193  const int stride_h,
2194  const int stride_w,
2195  const float* img_data,
2196  float* col_data,
2197  CPUContext* context,
2198  const int groups) {
2199  // Fast path for zero padding and no dilation
2200  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
2201  dilation_w == 1 && groups == 1) {
2202  Im2ColZeroPaddingAndNoDilationNHWC<float>(
2203  C,
2204  H,
2205  W,
2206  kernel_h,
2207  kernel_w,
2208  stride_h,
2209  stride_w,
2210  img_data,
2211  col_data,
2212  context);
2213  return;
2214  }
2215 
2216  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2217  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2218  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2219  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2220  int h_pad = -pad_t;
2221  if (groups == 1) {
2222  for (int h = 0; h < output_h; ++h) {
2223  int w_pad = -pad_l;
2224  for (int w = 0; w < output_w; ++w) {
2225  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
2226  if (!utils::IsAGeZeroAndALtB(ih, H)) {
2227  std::memset(col_data, 0, sizeof(float) * kernel_w * C);
2228  col_data += kernel_w * C;
2229  continue;
2230  }
2231  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
2232  if (utils::IsAGeZeroAndALtB(iw, W)) {
2233  std::memcpy(
2234  col_data, img_data + (ih * W + iw) * C, sizeof(float) * C);
2235  } else {
2236  std::memset(col_data, 0, sizeof(float) * C);
2237  }
2238  col_data += C;
2239  } // iw
2240  } // ih
2241  w_pad += stride_w;
2242  } // w
2243  h_pad += stride_h;
2244  } // h
2245  } else {
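// Grouped NHWC case: img_data is read as H x W x (G x C/G) and, for each
// output position, col_data is written as G x kernel_h x kernel_w x C/G, so
// the group dimension sits ahead of the kernel offsets and each group's block
// stays contiguous for a per-group GEMM.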
2252  const int C_per_G = C / groups;
2253  for (int h = 0; h < output_h; ++h) {
2254  int w_pad = -pad_l;
2255  for (int w = 0; w < output_w; ++w) {
2256  int r = 0;
2257  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2258  int s = 0;
2259  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
2260  if (utils::IsAGeZeroAndALtB(ih, H) &&
2261  utils::IsAGeZeroAndALtB(iw, W)) {
2262  for (int g = 0; g < groups; ++g) {
2263  std::memcpy(
2264  col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
2265  img_data + (ih * W + iw) * C + g * C_per_G,
2266  sizeof(float) * C_per_G);
2267  }
2268  } else {
2269  for (int g = 0; g < groups; ++g) {
2270  std::memset(
2271  col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
2272  0,
2273  sizeof(float) * C_per_G);
2274  }
2275  }
2276  } // iw
2277  } // ih
2278  col_data += kernel_h * kernel_w * C;
2279  w_pad += stride_w;
2280  } // w
2281  h_pad += stride_h;
2282  } // h
2283  }
2284 }
2285 
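// 3D (video) im2col in NHWC order with group support. The image is read as
// T x H x W x C; for each output position the column buffer is written as
// G x kernel_t x kernel_h x kernel_w x C/G, mirroring the grouped 2D layout
// above.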
2291 template <typename TData>
2292 C10_EXPORT void Im2Col3dNHWCImpl(
2293  const int C,
2294  const int T,
2295  const int H,
2296  const int W,
2297  const int kernel_t,
2298  const int kernel_h,
2299  const int kernel_w,
2300  const int dilation_t,
2301  const int dilation_h,
2302  const int dilation_w,
2303  const int pad_p, // previous frame
2304  const int pad_t, // top
2305  const int pad_l, // left
2306  const int pad_n, // next frame
2307  const int pad_b, // bottom
2308  const int pad_r, // right
2309  const int stride_t,
2310  const int stride_h,
2311  const int stride_w,
2312  const TData* img_data,
2313  TData* col_data,
2314  const int groups) {
2315  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
2316  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2317  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2318  const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1;
2319  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2320  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2321  const int C_per_G = C / groups;
2322  int t_pad = -pad_p;
2323  for (int t = 0; t < output_t; ++t) {
2324  int h_pad = -pad_t;
2325  for (int h = 0; h < output_h; ++h) {
2326  int w_pad = -pad_l;
2327  for (int w = 0; w < output_w; ++w) {
2328  int q = 0;
2329  for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
2330  int r = 0;
2331  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2332  int s = 0;
2333  for (int iw = w_pad; iw < w_pad + dkernel_w;
2334  iw += dilation_w, ++s) {
2335  if (utils::IsAGeZeroAndALtB(it, T) &&
2336  utils::IsAGeZeroAndALtB(ih, H) &&
2337  utils::IsAGeZeroAndALtB(iw, W)) {
2338  for (int g = 0; g < groups; ++g) {
2339  std::memcpy(
2340  col_data +
2341  (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
2342  C_per_G,
2343  img_data + ((it * H + ih) * W + iw) * C + g * C_per_G,
2344  sizeof(TData) * C_per_G);
2345  }
2346  } else {
2347  for (int g = 0; g < groups; ++g) {
2348  std::memset(
2349  col_data +
2350  (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
2351  C_per_G,
2352  0,
2353  sizeof(TData) * C_per_G);
2354  }
2355  }
2356  } // iw
2357  } // ih
2358  } // it
2359  col_data += kernel_t * kernel_h * kernel_w * C;
2360  w_pad += stride_w;
2361  } // w
2362  h_pad += stride_h;
2363  } // h
2364  t_pad += stride_t;
2365  } // t
2366 }
2367 
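// The NHWC Im2ColNd specialization currently handles only the 3-D (video)
// case; any other rank falls through to CAFFE_NOT_IMPLEMENTED.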
2368 template <>
2369 C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NHWC>(
2370  const int N,
2371  const int /*img_size*/,
2372  const int /*col_size*/,
2373  const int* img_shape,
2374  const int* col_shape,
2375  const int* kernel_shape,
2376  const int* stride,
2377  const int* dilation,
2378  const int* pad,
2379  const float* img_data,
2380  float* col_data,
2381  CPUContext* /* context */,
2382  const int groups) {
2383  if (N == 3) {
2384  const int channels =
2385  col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
2386  Im2Col3dNHWCImpl<float>(
2387  channels,
2388  img_shape[0],
2389  img_shape[1],
2390  img_shape[2],
2391  kernel_shape[0],
2392  kernel_shape[1],
2393  kernel_shape[2],
2394  dilation[0],
2395  dilation[1],
2396  dilation[2],
2397  pad[0],
2398  pad[1],
2399  pad[2],
2400  pad[3],
2401  pad[4],
2402  pad[5],
2403  stride[0],
2404  stride[1],
2405  stride[2],
2406  img_data,
2407  col_data,
2408  groups);
2409  } else {
2410  CAFFE_NOT_IMPLEMENTED;
2411  }
2412 }
2413 
2414 template <>
2415 C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NCHW>(
2416  const int C,
2417  const int H,
2418  const int W,
2419  const int kernel_h,
2420  const int kernel_w,
2421  const int dilation_h,
2422  const int dilation_w,
2423  const int pad_t,
2424  const int pad_l,
2425  const int pad_b,
2426  const int pad_r,
2427  const int stride_h,
2428  const int stride_w,
2429  const float* col_data,
2430  float* img_data,
2431  CPUContext* context,
2432  const int /* groups */) {
2433  // In NCHW, the number of groups doesn't affect Col2Im.
2434 
2435  // Fast path for zero padding and no dilation
2436  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
2437  dilation_w == 1) {
2438  Col2ImZeroPaddingAndNoDilationNCHW<float>(
2439  C,
2440  H,
2441  W,
2442  kernel_h,
2443  kernel_w,
2444  stride_h,
2445  stride_w,
2446  col_data,
2447  img_data,
2448  context);
2449  return;
2450  }
2451 
2452  // Fallback
2453  Set<float, CPUContext>(C * H * W, 0.0f, img_data, context);
2454  const int output_h =
2455  (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
2456  const int output_w =
2457  (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
2458  const int output_size = output_h * output_w;
2459  for (int c = 0; c < C; ++c) {
2460  for (int kh = 0; kh < kernel_h; ++kh) {
2461  for (int kw = 0; kw < kernel_w; ++kw) {
2462  for (int h = 0; h < output_h; ++h) {
2463  const int h_pad = h * stride_h - pad_t + kh * dilation_h;
2464  if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
2465  continue;
2466  }
2467  for (int w = 0; w < output_w; ++w) {
2468  const int w_pad = w * stride_w - pad_l + kw * dilation_w;
2469  if (utils::IsAGeZeroAndALtB(w_pad, W)) {
2470  img_data[(c * H + h_pad) * W + w_pad] +=
2471  col_data[h * output_w + w];
2472  }
2473  }
2474  }
2475  col_data += output_size;
2476  }
2477  }
2478  }
2479 }
2480 
2481 template <>
2482 C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NHWC>(
2483  const int C,
2484  const int H,
2485  const int W,
2486  const int kernel_h,
2487  const int kernel_w,
2488  const int dilation_h,
2489  const int dilation_w,
2490  const int pad_t,
2491  const int pad_l,
2492  const int pad_b,
2493  const int pad_r,
2494  const int stride_h,
2495  const int stride_w,
2496  const float* col_data,
2497  float* img_data,
2498  CPUContext* context,
2499  const int groups) {
2500  // Fast path for zero padding and no dilation
2501  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
2502  dilation_w == 1 && groups == 1) {
2503  Col2ImZeroPaddingAndNoDilationNHWC<float>(
2504  C,
2505  H,
2506  W,
2507  kernel_h,
2508  kernel_w,
2509  stride_h,
2510  stride_w,
2511  col_data,
2512  img_data,
2513  context);
2514  return;
2515  }
2516 
2517  Set<float, CPUContext>(H * W * C, 0, img_data, context);
2518  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2519  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2520  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2521  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2522 
2523  int h_pad = -pad_t;
2524  if (groups == 1) {
2525  for (int h = 0; h < output_h; ++h) {
2526  int w_pad = -pad_l;
2527  for (int w = 0; w < output_w; ++w) {
2528  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
2529  if (!utils::IsAGeZeroAndALtB(ih, H)) {
2530  col_data += kernel_w * C;
2531  continue;
2532  }
2533  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
2534  if (utils::IsAGeZeroAndALtB(iw, W)) {
2535  float* img_data_patch = img_data + (ih * W + iw) * C;
2536  Add<float, CPUContext>(
2537  C, img_data_patch, col_data, img_data_patch, context);
2538  }
2539  col_data += C;
2540  } // iw
2541  } // ih
2542  w_pad += stride_w;
2543  } // w
2544  h_pad += stride_h;
2545  } // h
2546  } else {
2547  const int C_per_G = C / groups;
2548  for (int h = 0; h < output_h; ++h) {
2549  int w_pad = -pad_l;
2550  for (int w = 0; w < output_w; ++w) {
2551  int r = 0;
2552  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2553  int s = 0;
2554  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
2555  if (utils::IsAGeZeroAndALtB(ih, H) &&
2556  utils::IsAGeZeroAndALtB(iw, W)) {
2557  float* img_data_patch = img_data + (ih * W + iw) * C;
2558  for (int g = 0; g < groups; ++g) {
2559  Add<float, CPUContext>(
2560  C_per_G,
2561  img_data_patch + g * C_per_G,
2562  col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
2563  img_data_patch + g * C_per_G,
2564  context);
2565  }
2566  }
2567  } // iw
2568  } // ih
2569  col_data += kernel_h * kernel_w * C;
2570  w_pad += stride_w;
2571  } // w
2572  h_pad += stride_h;
2573  } // h
2574  }
2575 }
2576 
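// Inverse of Im2Col3dNHWCImpl above: zero-fills the T x H x W x C image and
// accumulates every column entry back into its source position, skipping
// entries that fall in the padding.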
2582 template <typename TData>
2583 C10_EXPORT void Col2Im3dNHWCImpl(
2584  const int C,
2585  const int T,
2586  const int H,
2587  const int W,
2588  const int kernel_t,
2589  const int kernel_h,
2590  const int kernel_w,
2591  const int dilation_t,
2592  const int dilation_h,
2593  const int dilation_w,
2594  const int pad_p, // previous frame
2595  const int pad_t, // top
2596  const int pad_l, // left
2597  const int pad_n, // next frame
2598  const int pad_b, // bottom
2599  const int pad_r, // right
2600  const int stride_t,
2601  const int stride_h,
2602  const int stride_w,
2603  const TData* col_data,
2604  TData* img_data,
2605  CPUContext* context,
2606  const int groups) {
2607  Set<float, CPUContext>(T * H * W * C, 0, img_data, context);
2608  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
2609  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2610  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2611  const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1;
2612  const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2613  const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2614  const int C_per_G = C / groups;
2615 
2616  int t_pad = -pad_p;
2617  for (int t = 0; t < output_t; ++t) {
2618  int h_pad = -pad_t;
2619  for (int h = 0; h < output_h; ++h) {
2620  int w_pad = -pad_l;
2621  for (int w = 0; w < output_w; ++w) {
2622  int q = 0;
2623  for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
2624  int r = 0;
2625  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2626  int s = 0;
2627  for (int iw = w_pad; iw < w_pad + dkernel_w;
2628  iw += dilation_w, ++s) {
2629  if (utils::IsAGeZeroAndALtB(it, T) &&
2630  utils::IsAGeZeroAndALtB(ih, H) &&
2631  utils::IsAGeZeroAndALtB(iw, W)) {
2632  float* img_data_patch = img_data + ((it * H + ih) * W + iw) * C;
2633  for (int g = 0; g < groups; ++g) {
2634  Add<float, CPUContext>(
2635  C_per_G,
2636  img_data_patch + g * C_per_G,
2637  col_data +
2638  (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
2639  C_per_G,
2640  img_data_patch + g * C_per_G,
2641  context);
2642  }
2643  }
2644  } // iw
2645  } // ih
2646  } // it
2647  col_data += kernel_t * kernel_h * kernel_w * C;
2648  w_pad += stride_w;
2649  } // w
2650  h_pad += stride_h;
2651  } // h
2652  t_pad += stride_t;
2653  } // t
2654 }
2655 
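// As with Im2ColNd above, the NHWC Col2ImNd specialization is implemented only
// for the 3-D (video) case.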
2656 template <>
2657 C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NHWC>(
2658  const int N,
2659  const int /*img_size*/,
2660  const int /*col_size*/,
2661  const int* img_shape,
2662  const int* col_shape,
2663  const int* kernel_shape,
2664  const int* stride,
2665  const int* dilation,
2666  const int* pad,
2667  const float* col_data,
2668  float* img_data,
2669  CPUContext* context,
2670  const int groups) {
2671  if (N == 3) {
2672  const int channels =
2673  col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
2674  Col2Im3dNHWCImpl<float>(
2675  channels,
2676  img_shape[0],
2677  img_shape[1],
2678  img_shape[2],
2679  kernel_shape[0],
2680  kernel_shape[1],
2681  kernel_shape[2],
2682  dilation[0],
2683  dilation[1],
2684  dilation[2],
2685  pad[0],
2686  pad[1],
2687  pad[2],
2688  pad[3],
2689  pad[4],
2690  pad[5],
2691  stride[0],
2692  stride[1],
2693  stride[2],
2694  col_data,
2695  img_data,
2696  context,
2697  groups);
2698  } else {
2699  CAFFE_NOT_IMPLEMENTED;
2700  }
2701 }
2702 
2703 template <>
2704 C10_EXPORT void BiasCHW<float, CPUContext>(
2705  const float* bias,
2706  const float* /*bias_multiplier*/,
2707  const int bias_channels,
2708  const int image_size,
2709  float* image,
2710  CPUContext* /*context*/) {
2711  // Sum the per-channel bias into every image plane
2712  for (int c = 0; c < bias_channels; ++c) {
2713  float b = bias[c];
2714 
2715 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
2716  float32x4_t vBias = vdupq_n_f32(b);
2717 
2718  // We give alignment hints for additional speed, so handle the
2719  // non-vectorizable prologue separately
2720  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
2721 
2722  // FIXME: if input < kVecSizeInFloat, can't vectorize at all
2723 
2724  int prologue = kVecSizeInFloat -
2725  // remainder in floats
2726  (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float);
2727 
2728  int i = 0;
2729  // Prologue loop
2730  for (; i < prologue; ++i) {
2731  image[i] += b;
2732  }
2733 
2734  // The loop is manually unrolled by 8
2735  constexpr int kUnroll = 8;
2736  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
2737 
2738  int remainder = image_size - prologue;
2739  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
2740 
2741  // Vectorizable body
2742  for (; i < vectorizable; i += kFloatsPerLoop) {
2743  // Manually unrolled
2744  float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
2745  float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
2746  float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
2747  float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
2748  float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
2749  float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
2750  float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
2751  float32x4_t v7 = vld1q_f32_aligned(image + i + 28);
2752 
2753  v0 = vaddq_f32(v0, vBias);
2754  v1 = vaddq_f32(v1, vBias);
2755  v2 = vaddq_f32(v2, vBias);
2756  v3 = vaddq_f32(v3, vBias);
2757  v4 = vaddq_f32(v4, vBias);
2758  v5 = vaddq_f32(v5, vBias);
2759  v6 = vaddq_f32(v6, vBias);
2760  v7 = vaddq_f32(v7, vBias);
2761 
2762  vst1q_f32_aligned(image + i + 0, v0);
2763  vst1q_f32_aligned(image + i + 4, v1);
2764  vst1q_f32_aligned(image + i + 8, v2);
2765  vst1q_f32_aligned(image + i + 12, v3);
2766  vst1q_f32_aligned(image + i + 16, v4);
2767  vst1q_f32_aligned(image + i + 20, v5);
2768  vst1q_f32_aligned(image + i + 24, v6);
2769  vst1q_f32_aligned(image + i + 28, v7);
2770  }
2771 
2772  // Non-vectorizable epilogue
2773  for (; i < image_size; ++i) {
2774  image[i] += b;
2775  }
2776 #else
2777  // Non-NEON CPU implementation
2778  for (int i = 0; i < image_size; ++i) {
2779  image[i] += b;
2780  }
2781 #endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
2782 
2783  image += image_size;
2784  }
2785 }
2786 
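// CopyVector forwards to memcpy when the source and destination buffers
// differ; the macro below instantiates it for float only.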
2787 #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \
2788  template <> \
2789  C10_EXPORT void CopyVector<T, CPUContext>( \
2790  const int N, const T* src, T* dst, CPUContext* /*context*/) { \
2791  if (src != dst && N > 0) { \
2792  memcpy(dst, src, sizeof(T) * N); \
2793  } \
2794  }
2795 CAFFE2_SPECIALIZED_COPYVECTOR(float)
2796 #undef CAFFE2_SPECIALIZED_COPYVECTOR
2797 
2798 } // namespace math
2799 } // namespace caffe2