14 #include "caffe2/utils/math.h" 28 #include <unordered_set> 31 #include "caffe2/core/context.h" 32 #include "caffe2/utils/cpu_neon.h" 33 #include "caffe2/utils/eigen_utils.h" 34 #include "caffe2/utils/fixed_divisor.h" 37 #include "Eigen/Dense" 41 #endif // CAFFE2_USE_MKL 43 #ifdef CAFFE2_USE_HPTT 45 #endif // CAFFE2_USE_HPTT 61 #ifdef CAFFE2_USE_EIGEN_FOR_BLAS 79 C10_EXPORT
void Gemm<float, CPUContext>(
80 const CBLAS_TRANSPOSE trans_A,
81 const CBLAS_TRANSPOSE trans_B,
91 TensorProto::DataType math_type) {
  auto C_mat = EigenMatrixMap<float>(C, N, M);
  switch (trans_A) {
    case CblasNoTrans: {
      switch (trans_B) {
        case CblasNoTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, N, K) *
               ConstEigenMatrixMap<float>(A, K, M));
          return;
        case CblasTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, K, N).transpose() *
               ConstEigenMatrixMap<float>(A, K, M));
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
      }
    }
    case CblasTrans: {
      switch (trans_B) {
        case CblasNoTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, N, K) *
               ConstEigenMatrixMap<float>(A, M, K).transpose());
          return;
        case CblasTrans:
          C_mat.noalias() += alpha *
              (ConstEigenMatrixMap<float>(B, K, N).transpose() *
               ConstEigenMatrixMap<float>(A, M, K).transpose());
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
      }
    }
    default:
      LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
  }
}
template <>
C10_EXPORT void GemmEx<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const int lda,
    const float* B,
    const int ldb,
    const float beta,
    float* C,
    const int ldc,
    CPUContext* context) {
  EigenOuterStridedMatrixMap<float> C_mat(C, N, M, EigenOuterStride(ldc));
  switch (trans_A) {
    case CblasNoTrans: {
      switch (trans_B) {
        case CblasNoTrans:
          C_mat.noalias() += alpha *
              (ConstEigenOuterStridedMatrixMap<float>(
                   B, N, K, EigenOuterStride(ldb)) *
               ConstEigenOuterStridedMatrixMap<float>(
                   A, K, M, EigenOuterStride(lda)));
          return;
        case CblasTrans:
          C_mat.noalias() += alpha *
              (ConstEigenOuterStridedMatrixMap<float>(
                   B, K, N, EigenOuterStride(ldb)).transpose() *
               ConstEigenOuterStridedMatrixMap<float>(
                   A, K, M, EigenOuterStride(lda)));
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
      }
    }
    case CblasTrans: {
      switch (trans_B) {
        case CblasNoTrans:
          C_mat.noalias() += alpha *
              (ConstEigenOuterStridedMatrixMap<float>(
                   B, N, K, EigenOuterStride(ldb)) *
               ConstEigenOuterStridedMatrixMap<float>(
                   A, M, K, EigenOuterStride(lda)).transpose());
          return;
        case CblasTrans:
          C_mat.noalias() += alpha *
              (ConstEigenOuterStridedMatrixMap<float>(
                   B, K, N, EigenOuterStride(ldb)).transpose() *
               ConstEigenOuterStridedMatrixMap<float>(
                   A, M, K, EigenOuterStride(lda)).transpose());
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
      }
    }
    default:
      LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
  }
}
template <>
C10_EXPORT void Gemv<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const int M,
    const int N,
    const float alpha,
    const float* A,
    const float* x,
    const float beta,
    float* y,
    CPUContext* context,
    TensorProto::DataType math_type) {
  EigenVectorMap<float> y_vec(y, trans_A == CblasNoTrans ? M : N);
  switch (trans_A) {
    case CblasNoTrans: {
      y_vec.noalias() += alpha *
          (ConstEigenMatrixMap<float>(A, N, M).transpose() *
           ConstEigenVectorMap<float>(x, N));
      return;
    }
    case CblasTrans: {
      y_vec.noalias() += alpha *
          (ConstEigenMatrixMap<float>(A, N, M) *
           ConstEigenVectorMap<float>(x, M));
      return;
    }
    default:
      LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
  }
}
#define CAFFE2_SPECIALIZED_DOT(T) \
  template <> \
  C10_EXPORT void Dot<T, CPUContext>( \
      const int N, const T* a, const T* b, T* y, CPUContext* context) { \
    *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
  }
CAFFE2_SPECIALIZED_DOT(float)
#undef CAFFE2_SPECIALIZED_DOT

#define CAFFE2_SPECIALIZED_AXPY(T) \
  template <> \
  C10_EXPORT void Axpy<T, CPUContext>( \
      const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
    EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
  } \
  template <> \
  C10_EXPORT void Axpy<T, CPUContext>( \
      const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
    EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \
  }
CAFFE2_SPECIALIZED_AXPY(float)
#undef CAFFE2_SPECIALIZED_AXPY

#define CAFFE2_SPECIALIZED_AXPBY(T) \
  template <> \
  C10_EXPORT void Axpby<T, T, CPUContext>( \
      const int N, \
      const T alpha, \
      const T* x, \
      const T beta, \
      T* y, \
      CPUContext* context) { \
    EigenVectorArrayMap<T> y_arr(y, N); \
    y_arr = y_arr * beta + ConstEigenVectorArrayMap<T>(x, N) * alpha; \
  } \
  template <> \
  C10_EXPORT void Axpby<T, T, CPUContext>( \
      const int N, \
      const T* alpha, \
      const T* x, \
      const T* beta, \
      T* y, \
      CPUContext* context) { \
    EigenVectorArrayMap<T> y_arr(y, N); \
    y_arr = y_arr * *beta + ConstEigenVectorArrayMap<T>(x, N) * *alpha; \
  }
CAFFE2_SPECIALIZED_AXPBY(float)
#undef CAFFE2_SPECIALIZED_AXPBY

#else // CAFFE2_USE_EIGEN_FOR_BLAS
template <>
C10_EXPORT void Gemm<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const float* B,
    const float beta,
    float* C,
    CPUContext* context,
    TensorProto::DataType) {
  const int lda = (trans_A == CblasNoTrans) ? K : M;
  const int ldb = (trans_B == CblasNoTrans) ? N : K;
  cblas_sgemm(
      CblasRowMajor, trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
}
template <>
C10_EXPORT void GemmEx<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const int lda,
    const float* B,
    const int ldb,
    const float beta,
    float* C,
    const int ldc,
    CPUContext* /* context */) {
  cblas_sgemm(
      CblasRowMajor, trans_A, trans_B, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}

template <>
C10_EXPORT void Gemv<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const int M,
    const int N,
    const float alpha,
    const float* A,
    const float* x,
    const float beta,
    float* y,
    CPUContext* context,
    TensorProto::DataType) {
  cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
}
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
  template <> \
  C10_EXPORT void Dot<T, CPUContext>( \
      const int N, const T* a, const T* b, T* y, CPUContext*) { \
    *y = cblas_##prefix##dot(N, a, 1, b, 1); \
  }
CAFFE2_SPECIALIZED_DOT(float, s)
#undef CAFFE2_SPECIALIZED_DOT

#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
  template <> \
  C10_EXPORT void Axpy<T, CPUContext>( \
      const int N, const T alpha, const T* x, T* y, CPUContext*) { \
    cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
  } \
  template <> \
  C10_EXPORT void Axpy<T, CPUContext>( \
      const int N, const T* alpha, const T* x, T* y, CPUContext*) { \
    cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
  }
CAFFE2_SPECIALIZED_AXPY(float, s)
#undef CAFFE2_SPECIALIZED_AXPY

#ifdef CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
  template <> \
  C10_EXPORT void Axpby<T, T, CPUContext>( \
      const int N, const T alpha, const T* x, const T beta, T* y, CPUContext*) { \
    cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \
  } \
  template <> \
  C10_EXPORT void Axpby<T, T, CPUContext>( \
      const int N, const T* alpha, const T* x, const T* beta, T* y, CPUContext*) { \
    cblas_##prefix##axpby(N, *alpha, x, 1, *beta, y, 1); \
  }
#else // CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
  template <> \
  C10_EXPORT void Axpby<T, T, CPUContext>( \
      const int N, const T alpha, const T* x, const T beta, T* y, CPUContext*) { \
    cblas_##prefix##scal(N, beta, y, 1); \
    cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
  } \
  template <> \
  C10_EXPORT void Axpby<T, T, CPUContext>( \
      const int N, const T* alpha, const T* x, const T* beta, T* y, CPUContext*) { \
    cblas_##prefix##scal(N, *beta, y, 1); \
    cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
  }
#endif // CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_AXPBY(float, s)
#undef CAFFE2_SPECIALIZED_AXPBY

#endif // CAFFE2_USE_EIGEN_FOR_BLAS
template <>
C10_EXPORT void GemmBatched<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float** A,
    const float** B,
    const float beta,
    float** C,
    CPUContext* context,
    TensorProto::DataType) {
#ifdef CAFFE2_USE_MKL
  const int lda = (trans_A == CblasNoTrans) ? K : M;
  const int ldb = (trans_B == CblasNoTrans) ? N : K;
#else // CAFFE2_USE_MKL
  for (int i = 0; i < batch_size; ++i) {
    math::Gemm<float, CPUContext>(
        trans_A, trans_B, M, N, K, alpha, A[i], B[i], beta, C[i], context);
  }
#endif // CAFFE2_USE_MKL
}
template <>
C10_EXPORT void GemmStridedBatched<float, CPUContext>(
    const CBLAS_TRANSPOSE trans_A,
    const CBLAS_TRANSPOSE trans_B,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const int A_stride,
    const float* B,
    const int B_stride,
    const float beta,
    float* C,
    const int C_stride,
    CPUContext* context,
    TensorProto::DataType) {
#ifdef CAFFE2_USE_MKL
  const int lda = (trans_A == CblasNoTrans) ? K : M;
  const int ldb = (trans_B == CblasNoTrans) ? N : K;
  std::vector<const float*> A_array(batch_size);
  std::vector<const float*> B_array(batch_size);
  std::vector<float*> C_array(batch_size);
  for (int i = 0; i < batch_size; ++i) {
    A_array[i] = A + i * A_stride;
    B_array[i] = B + i * B_stride;
    C_array[i] = C + i * C_stride;
  }
#else // CAFFE2_USE_MKL
  for (int i = 0; i < batch_size; ++i) {
    math::Gemm<float, CPUContext>(
        trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context);
    A += A_stride;
    B += B_stride;
    C += C_stride;
  }
#endif // CAFFE2_USE_MKL
}
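// GemmStridedBatched runs batch_size independent GEMMs over contiguous
// sub-tensors: batch i uses A + i * A_stride, B + i * B_stride and
// C + i * C_stride.
//
// Illustrative sketch (hypothetical shapes, assuming a CPUContext `context`):
//   // 4 independent [2x3] * [3x5] products stored back to back.
//   math::GemmStridedBatched<float, CPUContext>(
//       CblasNoTrans, CblasNoTrans, 4, 2, 5, 3, 1.0f,
//       A, 2 * 3, B, 3 * 5, 0.0f, C, 2 * 5, &context);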
template <typename T>
C10_EXPORT void BroadcastImpl(
    const int X_ndim,
    const int* X_dims,
    const int Y_ndim,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  CAFFE_ENFORCE_LE(X_ndim, Y_ndim);
  std::vector<int> X_dims_vector(Y_ndim);
  const int d = Y_ndim - X_ndim;
  std::fill(X_dims_vector.begin(), X_dims_vector.begin() + d, 1);
  for (int i = d; i < Y_ndim; ++i) {
    CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]);
    X_dims_vector[i] = X_dims[i - d];
  }
  X_dims = X_dims_vector.data();
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + Y_ndim, 1, std::multiplies<int>());
  std::vector<int> index(Y_ndim, 0);
  for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
    const int X_index = utils::GetIndexFromDims(Y_ndim, X_dims, index.data());
    Y[Y_index] = X[X_index];
    utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data());
  }
  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
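// BroadcastImpl left-pads X's shape with 1s up to Y's rank, walks Y in
// row-major order pulling each element from the broadcast-clamped X index, and
// finally scales the result by alpha.
//
// Illustrative sketch (hypothetical shapes, assuming a CPUContext `context`):
//   // X has shape {3}, Y has shape {2, 3}; each row of Y becomes 2 * X.
//   const int X_dims[] = {3};
//   const int Y_dims[] = {2, 3};
//   math::Broadcast<float, CPUContext>(1, X_dims, 2, Y_dims, 2.0f, X, Y, &context);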
#define CAFFE2_SPECIALIZED_BROADCAST(T) \
  template <> \
  C10_EXPORT void Broadcast<T, CPUContext>( \
      const int X_ndim, \
      const int* X_dims, \
      const int Y_ndim, \
      const int* Y_dims, \
      const T alpha, \
      const T* X, \
      T* Y, \
      CPUContext* context) { \
    BroadcastImpl<T>(X_ndim, X_dims, Y_ndim, Y_dims, alpha, X, Y, context); \
  }
CAFFE2_SPECIALIZED_BROADCAST(std::int32_t)
CAFFE2_SPECIALIZED_BROADCAST(std::int64_t)
CAFFE2_SPECIALIZED_BROADCAST(float)
CAFFE2_SPECIALIZED_BROADCAST(double)
#undef CAFFE2_SPECIALIZED_BROADCAST

#define CAFFE2_SPECIALIZED_INV_STD(T) \
  template <> \
  void InvStd<T, CPUContext>( \
      const int N, \
      const T epsilon, \
      const T* var, \
      T* inv_std, \
      CPUContext* context) { \
    EigenVectorArrayMap<T>(inv_std, N) = \
        (ConstEigenVectorArrayMap<T>(var, N) + epsilon).rsqrt(); \
  }
CAFFE2_SPECIALIZED_INV_STD(float)
#undef CAFFE2_SPECIALIZED_INV_STD

#define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \
  template <> \
  C10_EXPORT void RowwiseMax<T, CPUContext>( \
      const int N, const int D, const T* x, T* y, CPUContext*) { \
    EigenVectorMap<T>(y, N) = \
        ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \
  }
CAFFE2_SPECIALIZED_ROWWISEMAX(float)
#undef CAFFE2_SPECIALIZED_ROWWISEMAX

#define CAFFE2_SPECIALIZED_COLWISEMAX(T) \
  template <> \
  C10_EXPORT void ColwiseMax<T, CPUContext>( \
      const int N, const int D, const T* x, T* y, CPUContext*) { \
    EigenVectorMap<T>(y, D) = \
        ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \
  }
CAFFE2_SPECIALIZED_COLWISEMAX(float)
#undef CAFFE2_SPECIALIZED_COLWISEMAX

#define CAFFE2_SPECIALIZED_MAXIMUM(T) \
  template <> \
  C10_EXPORT void Maximum<T, CPUContext>( \
      const int N, const float alpha, const T* x, T* y, CPUContext* context) { \
    std::transform( \
        x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \
  }
CAFFE2_SPECIALIZED_MAXIMUM(float)
662 #undef CAFFE2_SPECIALIZED_MAXIMUM 667 #define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ 669 C10_EXPORT void Rowwise##Func<T, CPUContext, true>( \ 677 EigenArrayMap<T>(C, cols, rows).colwise() expr## = \ 678 ConstEigenVectorArrayMap<T>(A, cols); \ 680 EigenArrayMap<T>(C, cols, rows) = \ 681 ConstEigenArrayMap<T>(B, cols, rows) \ 682 .colwise() expr ConstEigenVectorArrayMap<T>(A, cols); \ 686 C10_EXPORT void Colwise##Func<T, CPUContext, true>( \ 694 EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \ 695 ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ 697 EigenArrayMap<T>(C, cols, rows) = \ 698 ConstEigenArrayMap<T>(B, cols, rows) \ 699 .rowwise() expr ConstEigenVectorArrayMap<T>(A, rows) \ 704 #define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ 706 C10_EXPORT void Rowwise##Func<T, CPUContext, false>( \ 714 EigenArrayMap<T>(C, cols, rows).colwise() expr## = \ 715 ConstEigenVectorArrayMap<T>(B, cols); \ 717 EigenArrayMap<T>(C, cols, rows) = \ 718 ConstEigenArrayMap<T>(A, cols, rows) \ 719 .colwise() expr ConstEigenVectorArrayMap<T>(B, cols); \ 723 C10_EXPORT void Colwise##Func<T, CPUContext, false>( \ 731 EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \ 732 ConstEigenVectorArrayMap<T>(B, rows).transpose(); \ 734 EigenArrayMap<T>(C, cols, rows) = \ 735 ConstEigenArrayMap<T>(A, cols, rows) \ 736 .rowwise() expr ConstEigenVectorArrayMap<T>(B, rows) \ 741 #define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \ 742 DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ 743 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) 745 #define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \ 746 DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \ 747 DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \ 748 DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \ 749 DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr) 751 DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(
Add, +)
752 DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *)
754 #undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION 755 #undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION 757 #define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ 759 C10_EXPORT void RowwiseSub<T, CPUContext, true>( \ 766 EigenArrayMap<T>(C, cols, rows) = \ 767 (-ConstEigenArrayMap<T>(B, cols, rows)).colwise() + \ 768 ConstEigenVectorArrayMap<T>(A, cols); \ 771 C10_EXPORT void ColwiseSub<T, CPUContext, true>( \ 778 EigenArrayMap<T>(C, cols, rows) = \ 779 (-ConstEigenArrayMap<T>(B, cols, rows)).rowwise() + \ 780 ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ 782 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -) 784 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(
float)
785 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(
double)
786 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(
std::int32_t)
787 DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(
std::int64_t)
789 #undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION 791 #define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ 793 C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \ 800 EigenArrayMap<T>(C, cols, rows) = \ 801 ConstEigenArrayMap<T>(B, cols, rows).inverse().colwise() * \ 802 ConstEigenVectorArrayMap<T>(A, cols); \ 805 C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \ 812 EigenArrayMap<T>(C, cols, rows) = \ 813 ConstEigenArrayMap<T>(B, cols, rows).inverse().rowwise() * \ 814 ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ 816 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /) 818 DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(
float)
819 DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(
double)
820 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(
std::int32_t, Div, /)
821 DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(
std::int64_t, Div, /)
823 #undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION 825 #undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION 826 #undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION 829 C10_EXPORT
void Not<bool, CPUContext>(
    const int N,
    const bool* x,
    bool* y,
    CPUContext* /* context */) {
  for (int i = 0; i < N; ++i) {
    y[i] = !x[i];
  }
}

#undef C10_DEFINE_BINARY_OP
#undef CAFFE2_INSTANTIATE_BINARY_OP

#define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \
  template <> \
  C10_EXPORT void AddStripedBatch( \
      const int N, \
      const T* first, \
      T* y, \
      const int stripe, \
      const int batch, \
      CPUContext* context) { \
    for (int j = 0; j < batch; j++) { \
      Add<T, CPUContext>(N, first + j * stripe, y, y, context); \
    } \
  }
CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
#undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH

template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
C10_EXPORT void RowwiseBinaryOp(
    const int rows,
    const int cols,
    const BinaryOperator& op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const int C_index = i * cols + j;
      const int A_index = kBroadcast1st ? j : C_index;
      const int B_index = kBroadcast1st ? C_index : j;
      C[C_index] = op(A[A_index], B[B_index]);
    }
  }
}
template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
C10_EXPORT void ColwiseBinaryOp(
    const int rows,
    const int cols,
    const BinaryOperator& op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const int C_index = i * cols + j;
      const int A_index = kBroadcast1st ? i : C_index;
      const int B_index = kBroadcast1st ? C_index : i;
      C[C_index] = op(A[A_index], B[B_index]);
    }
  }
}
template <typename TIn, typename TOut, class BinaryOperator>
C10_EXPORT void BroadcastBinaryOpImpl(
    const int ndim,
    const int* A_dims,
    const int* B_dims,
    const int* C_dims,
    const BinaryOperator& op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  std::vector<int> index(ndim, 0);
  const int C_size =
      std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies<int>());
  for (int C_index = 0; C_index < C_size; ++C_index) {
    const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data());
    const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data());
    C[C_index] = op(A[A_index], B[B_index]);
    utils::IncreaseIndexInDims(ndim, C_dims, index.data());
  }
}
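// The three helpers above cover the broadcast shapes that the binary-op macros
// dispatch to: RowwiseBinaryOp repeats a length-`cols` vector across rows,
// ColwiseBinaryOp repeats a length-`rows` vector across columns, and
// BroadcastBinaryOpImpl handles arbitrary N-d broadcasting by walking C in
// row-major order and translating each multi-index into A/B offsets.
//
// Illustrative sketch (hypothetical data): adding a row vector B to every row
// of a 2 x 3 matrix A reduces to
//   RowwiseBinaryOp<float, float, std::plus<float>, false>(
//       2, 3, std::plus<float>(), A, B, C);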
920 #define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ 922 C10_EXPORT void Rowwise##Func<TIn, CPUContext, true>( \ 929 RowwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \ 932 C10_EXPORT void Rowwise##Func<TIn, CPUContext, false>( \ 939 RowwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \ 940 rows, cols, Op<TIn>(), A, B, C); \ 943 C10_EXPORT void Colwise##Func<TIn, CPUContext, true>( \ 950 ColwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \ 953 C10_EXPORT void Colwise##Func<TIn, CPUContext, false>( \ 960 ColwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \ 961 rows, cols, Op<TIn>(), A, B, C); \ 964 #define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \ 965 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ 966 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ 967 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ 968 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ 969 DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) 971 DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to)
DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to)
DEFINE_2D_COMPARE_FUNCTION(LT, std::less)
DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal)
DEFINE_2D_COMPARE_FUNCTION(GT, std::greater)
DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal)
#undef DEFINE_2D_COMPARE_FUNCTION

DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
#define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
  DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)

DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
993 #undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION 995 #undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION 997 #define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ 999 C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \ 1006 RowwiseBinaryOp<T, T, std::divides<T>, true>( \ 1007 rows, cols, std::divides<T>(), A, B, C); \ 1010 C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \ 1017 ColwiseBinaryOp<T, T, std::divides<T>, true>( \ 1018 rows, cols, std::divides<T>(), A, B, C); \ 1020 DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t)
DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t)
1022 #undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION 1024 #define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ 1026 C10_EXPORT void Func<TIn, CPUContext>( \ 1028 const int* A_dims, \ 1030 const int* B_dims, \ 1034 CPUContext* context) { \ 1035 const int ndim = std::max(A_ndim, B_ndim); \ 1036 std::vector<int> A_dims_array(ndim); \ 1037 std::vector<int> B_dims_array(ndim); \ 1038 std::vector<int> C_dims_array(ndim); \ 1039 utils::ComputeBroadcastBinaryOpDims( \ 1044 A_dims_array.data(), \ 1045 B_dims_array.data(), \ 1046 C_dims_array.data()); \ 1047 if (A_dims_array == B_dims_array) { \ 1048 const int size = std::accumulate( \ 1049 C_dims_array.cbegin(), \ 1050 C_dims_array.cend(), \ 1052 std::multiplies<int>()); \ 1053 Func<TIn, CPUContext>(size, A, B, C, context); \ 1058 bool broadcast_1st; \ 1059 if (utils::IsRowwiseBroadcastBinaryOp( \ 1061 A_dims_array.data(), \ 1062 B_dims_array.data(), \ 1065 &broadcast_1st)) { \ 1066 if (broadcast_1st) { \ 1067 Rowwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \ 1069 Rowwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \ 1073 if (utils::IsColwiseBroadcastBinaryOp( \ 1075 A_dims_array.data(), \ 1076 B_dims_array.data(), \ 1079 &broadcast_1st)) { \ 1080 if (broadcast_1st) { \ 1081 Colwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \ 1083 Colwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \ 1090 if (utils::IsBothEndsBroadcastBinaryOp( \ 1092 A_dims_array.data(), \ 1093 B_dims_array.data(), \ 1097 &broadcast_1st)) { \ 1098 const int stride = mid * nxt; \ 1099 for (int i = 0; i < pre; ++i) { \ 1100 if (broadcast_1st) { \ 1101 Colwise##Func<TIn, CPUContext, true>( \ 1102 mid, nxt, A, B + i * stride, C + i * stride, context); \ 1104 Colwise##Func<TIn, CPUContext, false>( \ 1105 mid, nxt, A + i * stride, B, C + i * stride, context); \ 1110 BroadcastBinaryOpImpl( \ 1112 A_dims_array.data(), \ 1113 B_dims_array.data(), \ 1114 C_dims_array.data(), \ 1121 #define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \ 1122 DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ 1123 DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ 1124 DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ 1125 DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ 1126 DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) 1128 DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to)
DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to)
DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less)
DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal)
DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater)
DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal)
#undef DEFINE_BROADCAST_COMPARE_FUNCTION

#define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)

DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus)
DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus)
DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies)
DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides)
#undef DEFINE_BROADCAST_BINARY_FUNCTION

DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
#define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
  DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)

DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
#undef DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION
#undef DELEGATE_BROADCAST_BINARY_FUNCTION

#define CAFFE2_RAND_UNIFORM_REAL(T) \
  template <> \
  C10_EXPORT void RandUniform<T, CPUContext>( \
      const size_t n, const T a, const T b, T* r, CPUContext* context) { \
    std::uniform_real_distribution<T> distribution(a, b); \
    for (size_t i = 0; i < n; ++i) { \
      r[i] = distribution(context->RandGenerator()); \
    } \
  }
CAFFE2_RAND_UNIFORM_REAL(float);
CAFFE2_RAND_UNIFORM_REAL(double);
#undef CAFFE2_RAND_UNIFORM_REAL

#define CAFFE2_RAND_UNIFORM_CHAR(T) \
  template <> \
  C10_EXPORT void RandUniform<T, CPUContext>( \
      const size_t n, const T a, const T b, T* r, CPUContext* context) { \
    std::uniform_int_distribution<short> distribution((short)a, (short)b); \
    for (size_t i = 0; i < n; ++i) { \
      r[i] = static_cast<T>(distribution(context->RandGenerator())); \
    } \
  }
CAFFE2_RAND_UNIFORM_CHAR(int8_t);
CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
#undef CAFFE2_RAND_UNIFORM_CHAR

#define CAFFE2_RAND_UNIFORM_INT(T) \
  template <> \
  C10_EXPORT void RandUniform<T, CPUContext>( \
      const size_t n, const T a, const T b, T* r, CPUContext* context) { \
    std::uniform_int_distribution<T> distribution(a, b); \
    for (size_t i = 0; i < n; ++i) { \
      r[i] = distribution(context->RandGenerator()); \
    } \
  }
CAFFE2_RAND_UNIFORM_INT(int16_t);
CAFFE2_RAND_UNIFORM_INT(int32_t);
CAFFE2_RAND_UNIFORM_INT(int64_t);
CAFFE2_RAND_UNIFORM_INT(uint16_t);
CAFFE2_RAND_UNIFORM_INT(uint32_t);
CAFFE2_RAND_UNIFORM_INT(uint64_t);
#undef CAFFE2_RAND_UNIFORM_INT
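// RandFixedSum (below) draws n values in [a, b] whose total equals `sum`. Each
// value is sampled from a normal distribution centered on the mean still
// needed for the remaining slots, rejecting draws that would make the target
// sum unreachable, and whatever remainder is left is added to the last element.
//
// Illustrative sketch (hypothetical values, assuming a CPUContext `context`):
//   std::vector<int32_t> r(5);
//   math::RandFixedSum<int32_t, CPUContext>(5, 0, 10, 20, r.data(), &context);
//   // r now holds 5 values in [0, 10] summing to 20.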
#define CAFFE2_RAND_FIXED_SUM(T) \
  template <> \
  C10_EXPORT void RandFixedSum<T, CPUContext>( \
      const size_t n, \
      const T a, \
      const T b, \
      const T sum, \
      T* r, \
      CPUContext* context) { \
    CAFFE_ENFORCE_GE(a, 0); \
    CAFFE_ENFORCE_GE(sum / (double)n, a); \
    CAFFE_ENFORCE_LE(sum / (double)n, b); \
    T current_sum = 0; \
    T remaining_sum = sum; \
    for (size_t i = 0; i < n; ++i) { \
      auto remaining_numbers = n - 1 - i; \
      double mean = (sum - current_sum) / (remaining_numbers + 1); \
      double stdev = std::min(mean - a, b - mean); \
      std::normal_distribution<double> distribution{mean, stdev / 4.0}; \
      T value, remaining_sum_test; \
      do { \
        value = distribution(context->RandGenerator()); \
        remaining_sum_test = remaining_sum - value; \
      } while (value < a || remaining_sum_test < a * remaining_numbers || \
               value > b || remaining_sum_test > b * remaining_numbers); \
      r[i] = value; \
      CAFFE_ENFORCE(a <= value && value <= b); \
      current_sum += value; \
      remaining_sum -= value; \
      CAFFE_ENFORCE_GE(remaining_sum, a * remaining_numbers); \
      CAFFE_ENFORCE_LE(remaining_sum, b * remaining_numbers); \
    } \
    r[n - 1] += remaining_sum; \
    current_sum += remaining_sum; \
    CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \
    CAFFE_ENFORCE_EQ(current_sum, sum); \
  }
CAFFE2_RAND_FIXED_SUM(float);
CAFFE2_RAND_FIXED_SUM(double);
CAFFE2_RAND_FIXED_SUM(int8_t);
CAFFE2_RAND_FIXED_SUM(int16_t);
CAFFE2_RAND_FIXED_SUM(int32_t);
CAFFE2_RAND_FIXED_SUM(int64_t);
CAFFE2_RAND_FIXED_SUM(uint8_t);
CAFFE2_RAND_FIXED_SUM(uint16_t);
CAFFE2_RAND_FIXED_SUM(uint32_t);
CAFFE2_RAND_FIXED_SUM(uint64_t);
1266 #undef CAFFE2_RAND_FIXED_SUM 1268 template <
class Type,
class Val_t,
class Ind_t,
class Context_t,
bool cdf_app>
1269 Ind_t generate_stack_distance(
1270 std::vector<Ind_t>& cum_val,
1271 std::vector<Val_t>& cum_dis,
1272 std::vector<Ind_t>& cum_map,
1275 Context_t* context) {
1288 math::RandUniform<Val_t, Context_t>(1, 0, 1, &u, context);
1293 j = (std::upper_bound(cum_val.begin(), cum_val.end(), i) -
1304 n = (Ind_t)round(u * k);
1309 for (j = 0; j < cum_dis.size(); j++) {
1315 return cum_val[j - 1];
1319 template <
class Type,
class Val_t,
class Ind_t,
class Context_t,
bool cdf_app>
1320 C10_EXPORT
void generate_trace_lru(
1321 std::vector<Ind_t>& uni_ref,
1322 std::vector<Ind_t>& cum_val,
1323 std::vector<Val_t>& cum_dis,
1324 std::vector<Ind_t>& cum_map,
1326 Ind_t cache_line_size,
1336 Ind_t i, j, k, sd, line_ref, mem_ref, mem_ref_within_line;
1337 Ind_t max_sd = cum_val.back();
1338 Ind_t l = uni_ref.size();
1340 for (i = 0, j = 0; j < n; j++) {
1342 sd = generate_stack_distance<Type, Val_t, Ind_t, Context_t, cdf_app>(
1343 cum_val, cum_dis, cum_map, max_sd, i, context);
1345 mem_ref_within_line = 0;
1358 line_ref = uni_ref[k];
1359 uni_ref.erase(uni_ref.begin() + k);
1360 uni_ref.push_back(line_ref);
1361 mem_ref = line_ref * cache_line_size + mem_ref_within_line;
1377 if (mem_ref < min) {
1381 if (mem_ref > max) {
1387 syn_ref[j] = (Type)mem_ref;
1395 #define CAFFE2_RAND_SYNTHETIC_DATA(T) \ 1397 C10_EXPORT void RandSyntheticData<T, CPUContext>( \ 1398 const size_t n, const T a, const T b, T* r, CPUContext* context) { \ 1400 std::vector<int> mem_ref = {1, 2, 3, 4, 5, 6}; \ 1402 std::vector<int> cum_val = {0, 1, 3, 4, 5}; \ 1403 std::vector<double> cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0}; \ 1407 std::vector<int> cum_map(k, 0); \ 1408 for (int j = 0; j < cum_dis.size();) { \ 1409 int sz = (int)round(cum_dis[j] * k); \ 1410 for (int i = 0; i < sz; i++) { \ 1411 cum_map[j + i] = j; \ 1417 const int cache_line = 1; \ 1418 generate_trace_lru<T, double, int, CPUContext, false>( \ 1419 mem_ref, cum_val, cum_dis, cum_map, context, cache_line, n, a, b, r); \ 1422 CAFFE2_RAND_SYNTHETIC_DATA(
float);
1423 CAFFE2_RAND_SYNTHETIC_DATA(
double);
1424 CAFFE2_RAND_SYNTHETIC_DATA(int8_t);
1425 CAFFE2_RAND_SYNTHETIC_DATA(int16_t);
1426 CAFFE2_RAND_SYNTHETIC_DATA(int32_t);
1427 CAFFE2_RAND_SYNTHETIC_DATA(int64_t);
1428 CAFFE2_RAND_SYNTHETIC_DATA(uint8_t);
1429 CAFFE2_RAND_SYNTHETIC_DATA(uint16_t);
1430 CAFFE2_RAND_SYNTHETIC_DATA(uint32_t);
1431 CAFFE2_RAND_SYNTHETIC_DATA(uint64_t);
1432 #undef CAFFE2_RAND_SYNTHETIC_DATA 1434 #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ 1436 C10_EXPORT void RandUniformUnique<T, CPUContext>( \ 1443 CPUContext* context) { \ 1445 n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ 1446 std::unordered_set<T> avoid_set(n); \ 1448 avoid_set.insert(avoid, avoid + m); \ 1450 m, avoid_set.size(), "AC10_EXPORT void should be unique"); \ 1452 std::uniform_int_distribution<T> distribution(a, b); \ 1454 for (size_t i = 0; i < n; ++i) { \ 1456 v = distribution(context->RandGenerator()); \ 1457 } while (avoid_set.count(v)); \ 1459 avoid_set.insert(v); \ 1463 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
1464 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
#undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE

template <>
C10_EXPORT void RandGaussian<float, CPUContext>(
    const size_t n,
    const float mean,
    const float std,
    float* r,
    CPUContext* context) {
  std::normal_distribution<float> distribution(mean, std);
  for (size_t i = 0; i < n; ++i) {
    r[i] = distribution(context->RandGenerator());
  }
}

#define CAFFE2_SPECIALIZED_SUM(T) \
  template <> \
  C10_EXPORT void Sum<T, CPUContext>( \
      const int N, \
      const T* x, \
      T* y, \
      CPUContext* /* context */, \
      Tensor* /* scratch_ptr */) { \
    *y = ConstEigenVectorMap<T>(x, N).sum(); \
  }

CAFFE2_SPECIALIZED_SUM(float);
CAFFE2_SPECIALIZED_SUM(int32_t);
CAFFE2_SPECIALIZED_SUM(int64_t);
#undef CAFFE2_SPECIALIZED_SUM

template <>
C10_EXPORT void SumSqr<float, CPUContext>(
    const int N,
    const float* x,
    float* y,
    CPUContext* /* context */,
    Tensor* /* scratch_ptr */) {
  *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
}

template <>
C10_EXPORT void Select<float, CPUContext>(
    const int N, const int D, const float* x, const int* idx, float* y,
    CPUContext* /* context */) {
  for (int i = 0; i < N; ++i) {
    DCHECK_LT(idx[i], D);
    y[i] = x[i * D + idx[i]];
  }
}
template <>
C10_EXPORT void CopyMatrix<CPUContext>(
    const size_t itemsize,
    const int M,
    const int N,
    const void* A,
    const int lda,
    void* B,
    const int ldb,
    CPUContext* /* context */,
    TypeMeta::Copy copy) {
  if (A == nullptr || B == nullptr) {
    return;
  }
  if (lda == N && ldb == N) {
    if (copy) {
      copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
    } else {
      memcpy(
          static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
    }
    return;
  }
  for (int i = 0; i < M; ++i) {
    if (copy) {
      copy(
          static_cast<const char*>(A) + lda * i * itemsize,
          static_cast<char*>(B) + ldb * i * itemsize,
          N);
    } else {
      memcpy(
          static_cast<char*>(B) + ldb * i * itemsize,
          static_cast<const char*>(A) + lda * i * itemsize,
          itemsize * N);
    }
  }
}
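// CopyMatrix copies an M x N block between row-major buffers whose rows may be
// padded: lda and ldb are the source and destination leading dimensions in
// elements. When both equal N the block is contiguous and collapses to one
// copy; otherwise it is copied row by row, through the TypeMeta `copy`
// function when one is supplied (non-POD types) and memcpy otherwise.
//
// Illustrative sketch (hypothetical sizes): copying a 3 x 4 float patch out of
// a 3 x 10 row-major image into a packed 3 x 4 buffer:
//   math::CopyMatrix<CPUContext>(sizeof(float), 3, 4, src, 10, dst, 4,
//                                &context, nullptr);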
1561 #ifdef CAFFE2_USE_MKL 1563 #define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \ 1565 C10_EXPORT void CopyMatrix<T, CPUContext>( \ 1573 Func('R', 'N', M, N, T(1), A, lda, B, ldb); \ 1576 C10_EXPORT void CopyMatrix<T, CPUContext>( \ 1580 const int A_outer_stride, \ 1581 const int A_inner_stride, \ 1583 const int B_outer_stride, \ 1584 const int B_inner_stride, \ 1599 DELEGATE_COPY_MATRIX_FUNCTION(
float, mkl_somatcopy)
1600 DELEGATE_COPY_MATRIX_FUNCTION(
double, mkl_domatcopy)
1601 #undef DELEGATE_COPY_MATRIX_FUNCTION 1603 #endif // CAFFE2_USE_MKL 1605 #define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \ 1607 C10_EXPORT void CopyMatrix<T, CPUContext>( \ 1615 if (M == 0 || N == 0) { \ 1620 std::memcpy(B, A, sizeof(T) * M * N); \ 1622 EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \ 1623 ConstEigenMatrixMap<T>(A, N, M); \ 1627 EigenMatrixMap<T>(B, N, M) = ConstEigenOuterStridedMatrixMap<T>( \ 1628 A, N, M, EigenOuterStride(lda)); \ 1630 EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \ 1631 ConstEigenOuterStridedMatrixMap<T>( \ 1632 A, N, M, EigenOuterStride(lda)); \ 1637 C10_EXPORT void CopyMatrix<T, CPUContext>( \ 1641 const int A_outer_stride, \ 1642 const int A_inner_stride, \ 1644 const int B_outer_stride, \ 1645 const int B_inner_stride, \ 1646 CPUContext* context) { \ 1647 if (A_inner_stride == 1 && B_inner_stride == 1) { \ 1648 CopyMatrix<T, CPUContext>( \ 1649 M, N, A, A_outer_stride, B, B_outer_stride, context); \ 1652 EigenStridedMatrixMap<T>( \ 1653 B, N, M, EigenStride(B_outer_stride, B_inner_stride)) = \ 1654 ConstEigenStridedMatrixMap<T>( \ 1655 A, N, M, EigenStride(A_outer_stride, A_inner_stride)); \ 1658 #ifndef CAFFE2_USE_MKL 1659 CAFFE2_SPECIALIZED_COPY_MATRIX(
float)
1660 CAFFE2_SPECIALIZED_COPY_MATRIX(
double)
1661 #endif // CAFFE2_USE_MKL 1663 CAFFE2_SPECIALIZED_COPY_MATRIX(
int)
1664 CAFFE2_SPECIALIZED_COPY_MATRIX(int64_t)
1665 CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint8_t)
1666 CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t)
#undef CAFFE2_SPECIALIZED_COPY_MATRIX

template <typename T>
C10_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const T* img_data,
    T* col_data,
    CPUContext* context) {
  const int output_h = (H - kernel_h) / stride_h + 1;
  const int output_w = (W - kernel_w) / stride_w + 1;
  const int output_size = output_h * output_w;
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const T* src = img_data + kh * W + kw;
        if (stride_w == 1) {
          CopyMatrix<T, CPUContext>(
              output_h,
              output_w,
              src,
              stride_h * W,
              col_data,
              output_w,
              context);
        } else {
          CopyMatrix<T, CPUContext>(
              output_h,
              output_w,
              src,
              stride_h * W,
              stride_w,
              col_data,
              output_w,
              1,
              context);
        }
        col_data += output_size;
      }
    }
    img_data += H * W;
  }
}
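// With zero padding and no dilation the output spatial size reduces to
// output = (input - kernel) / stride + 1, and each (c, kh, kw) slice of the
// column buffer is just a strided 2D copy out of the image, which is why the
// helpers here can delegate to CopyMatrix instead of indexing element-wise.
//
// Worked example (hypothetical sizes): H = W = 5, kernel = 3, stride = 1 gives
// output_h = output_w = (5 - 3) / 1 + 1 = 3, so col_data holds C * 3 * 3 rows
// of 3 * 3 = 9 elements each.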
template <typename T>
C10_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const T* col_data,
    T* img_data,
    CPUContext* context) {
  Set<T, CPUContext>(C * H * W, T(0), img_data, context);
  const int output_h = (H - kernel_h) / stride_h + 1;
  const int output_w = (W - kernel_w) / stride_w + 1;
  const int output_size = output_h * output_w;
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        T* dst = img_data + kh * W + kw;
        if (stride_w == 1) {
          EigenOuterStridedArrayMap<T>(
              dst, output_w, output_h, EigenOuterStride(stride_h * W)) +=
              ConstEigenArrayMap<T>(col_data, output_w, output_h);
        } else {
          EigenStridedArrayMap<T>(
              dst, output_w, output_h, EigenStride(stride_h * W, stride_w)) +=
              ConstEigenArrayMap<T>(col_data, output_w, output_h);
        }
        col_data += output_size;
      }
    }
    img_data += H * W;
  }
}
template <typename T>
C10_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const T* img_data,
    T* col_data,
    CPUContext* context) {
  const int output_h = (H - kernel_h) / stride_h + 1;
  const int output_w = (W - kernel_w) / stride_w + 1;
  const int kernel_size = kernel_h * kernel_w;
  for (int yh = 0; yh < output_h; ++yh) {
    for (int yw = 0; yw < output_w; ++yw) {
      const T* src = img_data + (yh * stride_h * W + yw * stride_w) * C;
      CopyMatrix<T, CPUContext>(
          kernel_h, kernel_w * C, src, W * C, col_data, kernel_w * C, context);
      col_data += kernel_size * C;
    }
  }
}
template <typename T>
C10_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const T* col_data,
    T* img_data,
    CPUContext* context) {
  Set<T, CPUContext>(H * W * C, T(0), img_data, context);
  const int output_h = (H - kernel_h) / stride_h + 1;
  const int output_w = (W - kernel_w) / stride_w + 1;
  const int kernel_size = kernel_h * kernel_w;
  for (int yh = 0; yh < output_h; ++yh) {
    for (int yw = 0; yw < output_w; ++yw) {
      T* dst = img_data + (yh * stride_h * W + yw * stride_w) * C;
      EigenOuterStridedArrayMap<T>(
          dst, kernel_w * C, kernel_h, EigenOuterStride(W * C)) +=
          ConstEigenArrayMap<T>(col_data, kernel_w * C, kernel_h);
      col_data += kernel_size * C;
    }
  }
}
template <typename T, bool kCol2Im>
C10_EXPORT void Im2ColNdNCHWImpl(
    const int N,
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* X_data,
    float* Y_data) {
  if (kCol2Im) {
    std::memset(Y_data, 0, img_size * sizeof(float));
  }
  const int outer_size = col_shape[0];
  const int inner_size = col_size / outer_size;
  const int kernel_size = std::accumulate(
      kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
  std::vector<FixedDivisor<int>> kernel_shape_div(N);
  for (int i = 0; i < N; ++i) {
    kernel_shape_div[i] = FixedDivisor<int>(kernel_shape[i]);
  }
  std::vector<int> d_offset(N, 0);
  std::vector<int> d_iter(N, 0);
  for (int i = 0; i < outer_size; ++i) {
    int offset = i;
    for (int d_i = N - 1; d_i >= 0; --d_i) {
      kernel_shape_div[d_i].DivMod(offset, &offset, &d_offset[d_i]);
    }
    for (int j = 0; j < inner_size; ++j) {
      const int col_index = i * inner_size + j;
      int img_index = i / kernel_size;
      bool is_padding = false;
      for (int d_i = 0; d_i < N; ++d_i) {
        const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
            d_offset[d_i] * dilation[d_i];
        is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape[d_i + 1]);
        img_index = img_index * img_shape[d_i + 1] + d_img;
      }
      if (!kCol2Im) {
        Y_data[col_index] = is_padding ? 0 : X_data[img_index];
      } else if (!is_padding) {
        Y_data[img_index] += X_data[col_index];
      }
      utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
    }
  }
}
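// Im2ColNdNCHWImpl is shared by the N-d im2col and col2im kernels: the column
// buffer is indexed as (channel x kernel offsets) x (output positions), and
// each column element is matched with one image element or padding. With
// kCol2Im == false it gathers image values into the column buffer; with
// kCol2Im == true it runs the same index computation in reverse and
// accumulates column values back into the image, which is the adjoint of the
// gather.
//
// For example (hypothetical 2-d case expressed through the N-d API), a
// 1 x 4 x 4 image with a 3 x 3 kernel, stride 1 and pad 0 yields col_shape
// {1 * 3 * 3, 2, 2}: nine rows, one per kernel offset, each holding the 2 x 2
// output positions.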
1861 template <
typename T>
1862 void Im2Col3dNCHWImpl(
1870 const int dilation_t,
1871 const int dilation_h,
1872 const int dilation_w,
1884 const int output_t =
1885 (clip_len + pad_p + pad_a - (dilation_t * (kernel_t - 1) + 1)) /
1888 const int output_h =
1889 (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1891 const int output_w =
1892 (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1894 const int kernel_size = kernel_t * kernel_h * kernel_w;
1895 const int kernel_hw_size = kernel_h * kernel_w;
1896 const int output_size = output_t * output_h * output_w;
1897 const int channel_size = clip_len * height * width;
1898 const int output_hw_size = output_h * output_w;
1899 const int channel_hw_size = height * width;
1903 if (dilation_t == 1 && dilation_h == 1 && dilation_w == 1 && pad_a == 0 &&
1904 pad_p == 0 && pad_l == 0 && pad_r == 0 && pad_t == 0 && pad_b == 0) {
1905 for (
auto k = 0; k < channels * kernel_size; k++) {
1906 const auto nip = k / kernel_size;
1907 const auto rest = k % kernel_size;
1908 const auto kt = rest / kernel_hw_size;
1909 const auto rest_hw = rest % kernel_hw_size;
1910 const auto kh = rest_hw / kernel_w;
1911 const auto kw = rest_hw % kernel_w;
1912 auto* dst = col_data + nip * (kernel_size * output_size) +
1913 kt * (kernel_hw_size * output_size) + kh * (kernel_w * output_size) +
1915 const auto* src = img_data + nip * channel_size;
1916 for (
auto t = 0; t < output_t; t++) {
1917 const auto it = t * stride_t + kt;
1918 for (
auto y = 0; y < output_h; y++) {
1919 const auto iy = y * stride_h + kh;
1921 if (stride_w == 1) {
1923 dst + (t * output_hw_size + y * output_w),
1924 src + (it * channel_hw_size + iy * width + ix),
1925 sizeof(
T) * output_w);
1927 for (
auto x = 0; x < output_w; x++) {
1929 dst + (t * output_hw_size + y * output_w + x),
1930 src + (it * channel_hw_size + iy * width + ix + x * stride_w),
1940 if (pad_a == pad_p && pad_l == pad_r && pad_t == pad_b) {
1941 const int pad_f = pad_a;
1942 const int pad_h = pad_t;
1943 const int pad_w = pad_l;
1944 for (
int channel = channels; channel--; img_data += channel_size) {
1945 for (
int kernel_frame = 0; kernel_frame < kernel_t; kernel_frame++) {
1946 for (
int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1947 for (
int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1948 int input_frame = -pad_f + kernel_frame * dilation_t;
1949 for (
int output_frames = output_t; output_frames; output_frames--) {
1950 if (!utils::IsAGeZeroAndALtB(input_frame, clip_len)) {
1951 for (
int output_rows = output_h; output_rows; output_rows--) {
1952 for (
int output_cols = output_w; output_cols; output_cols--) {
1957 int input_row = -pad_h + kernel_row * dilation_h;
1958 for (
int output_rows = output_h; output_rows; output_rows--) {
1959 if (!utils::IsAGeZeroAndALtB(input_row, height)) {
1960 for (
int output_cols = output_w; output_cols;
1965 int input_col = -pad_w + kernel_col * dilation_w;
1966 for (
int output_col = output_w; output_col; output_col--) {
1967 if (utils::IsAGeZeroAndALtB(input_col, width)) {
1968 *(col_data++) = img_data
1969 [(input_frame * height + input_row) * width +
1974 input_col += stride_w;
1977 input_row += stride_h;
1980 input_frame += stride_t;
1990 const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
1991 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1992 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1994 int clip_col = (clip_len + pad_p + pad_a - dkernel_t) / stride_t + 1;
1995 int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1996 int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1998 int channels_col = channels * kernel_t * kernel_h * kernel_w;
1999 for (
int c = 0; c < channels_col; ++c) {
2000 int w_offset = c % kernel_w;
2001 int h_offset = (c / kernel_w) % kernel_h;
2002 int t_offset = (c / kernel_w / kernel_h) % kernel_t;
2003 int c_im = c / kernel_h / kernel_w / kernel_t;
2004 for (
int t = 0; t < clip_col; ++t) {
2005 for (
int h = 0; h < height_col; ++h) {
2006 for (
int w = 0; w < width_col; ++w) {
2007 int t_pad = t * stride_t - pad_p + t_offset * dilation_t;
2008 int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
2009 int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
2010 if (t_pad >= 0 && t_pad < clip_len && h_pad >= 0 && h_pad < height &&
2011 w_pad >= 0 && w_pad < width) {
2012 col_data[((c * clip_col + t) * height_col + h) * width_col + w] =
2014 [((c_im * clip_len + t_pad) * height + h_pad) * width +
2017 col_data[((c * clip_col + t) * height_col + h) * width_col + w] = 0;
2028 C10_EXPORT
void Im2ColNd<float, CPUContext, StorageOrder::NCHW>(
2032 const int* img_shape,
2033 const int* col_shape,
2034 const int* kernel_shape,
2036 const int* dilation,
2038 const float* img_data,
2044 const int channels =
2045 col_shape[0] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
2046 Im2Col3dNCHWImpl<float>(
2069 Im2ColNdNCHWImpl<float, false>(
2085 C10_EXPORT
void Col2ImNd<float, CPUContext, StorageOrder::NCHW>(
2089 const int* img_shape,
2090 const int* col_shape,
2091 const int* kernel_shape,
2093 const int* dilation,
2095 const float* col_data,
2100 Im2ColNdNCHWImpl<float, true>(
template <>
C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NCHW>(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* img_data,
    float* col_data,
    CPUContext* context,
    const int /* groups */) {
  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
      dilation_w == 1) {
    Im2ColZeroPaddingAndNoDilationNCHW<float>(
        C, H, W, kernel_h, kernel_w, stride_h, stride_w, img_data, col_data,
        context);
    return;
  }
  const int output_h =
      (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w =
      (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int output_size = output_h * output_w;
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        for (int h = 0; h < output_h; ++h) {
          const int h_pad = h * stride_h - pad_t + kh * dilation_h;
          if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
            std::memset(col_data + h * output_w, 0, output_w * sizeof(float));
            continue;
          }
          for (int w = 0; w < output_w; ++w) {
            const int w_pad = w * stride_w - pad_l + kw * dilation_w;
            col_data[h * output_w + w] = utils::IsAGeZeroAndALtB(w_pad, W)
                ? img_data[(c * H + h_pad) * W + w_pad]
                : 0;
          }
        }
        col_data += output_size;
      }
    }
  }
}
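// A convolution can then be written as im2col followed by a single GEMM.
//
// Illustrative sketch (hypothetical shapes, assuming a CPUContext `context`):
//   // 3-channel 5x5 input, 3x3 kernel, stride 1, no padding -> 3x3 output.
//   // col buffer: (3 * 3 * 3) x (3 * 3) = 27 x 9 floats.
//   math::Im2Col<float, CPUContext, StorageOrder::NCHW>(
//       3, 5, 5, 3, 3, 1, 1, 0, 0, 0, 0, 1, 1, img, col, &context, 1);
//   // weights: [num_filters x 27], output: [num_filters x 9]
//   math::Gemm<float, CPUContext>(CblasNoTrans, CblasNoTrans, num_filters, 9,
//                                 27, 1.0f, weights, col, 0.0f, out, &context);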
2181 C10_EXPORT
void Im2Col<float, CPUContext, StorageOrder::NHWC>(
2187 const int dilation_h,
2188 const int dilation_w,
2195 const float* img_data,
2197 CPUContext* context,
2200 if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
2201 dilation_w == 1 && groups == 1) {
2202 Im2ColZeroPaddingAndNoDilationNHWC<float>(
2216 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2217 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2218 const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2219 const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2222 for (
int h = 0; h < output_h; ++h) {
2224 for (
int w = 0; w < output_w; ++w) {
2225 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
2226 if (!utils::IsAGeZeroAndALtB(ih, H)) {
2227 std::memset(col_data, 0,
sizeof(
float) * kernel_w * C);
2228 col_data += kernel_w * C;
2231 for (
int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
2232 if (utils::IsAGeZeroAndALtB(iw, W)) {
2234 col_data, img_data + (ih * W + iw) * C,
sizeof(
float) * C);
2236 std::memset(col_data, 0,
sizeof(
float) * C);
2252 const int C_per_G = C / groups;
2253 for (
int h = 0; h < output_h; ++h) {
2255 for (
int w = 0; w < output_w; ++w) {
2257 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2259 for (
int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
2260 if (utils::IsAGeZeroAndALtB(ih, H) &&
2261 utils::IsAGeZeroAndALtB(iw, W)) {
2262 for (
int g = 0; g < groups; ++g) {
2264 col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
2265 img_data + (ih * W + iw) * C + g * C_per_G,
2266 sizeof(
float) * C_per_G);
2269 for (
int g = 0; g < groups; ++g) {
2271 col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
2273 sizeof(
float) * C_per_G);
2278 col_data += kernel_h * kernel_w * C;
2291 template <
typename TData>
2292 C10_EXPORT
void Im2Col3dNHWCImpl(
2300 const int dilation_t,
2301 const int dilation_h,
2302 const int dilation_w,
2312 const TData* img_data,
2315 const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
2316 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2317 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2318 const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1;
2319 const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2320 const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2321 const int C_per_G = C / groups;
2323 for (
int t = 0; t < output_t; ++t) {
2325 for (
int h = 0; h < output_h; ++h) {
2327 for (
int w = 0; w < output_w; ++w) {
2329 for (
int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
2331 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2333 for (
int iw = w_pad; iw < w_pad + dkernel_w;
2334 iw += dilation_w, ++s) {
2335 if (utils::IsAGeZeroAndALtB(it, T) &&
2336 utils::IsAGeZeroAndALtB(ih, H) &&
2337 utils::IsAGeZeroAndALtB(iw, W)) {
2338 for (
int g = 0; g < groups; ++g) {
2341 (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
2343 img_data + ((it * H + ih) * W + iw) * C + g * C_per_G,
2344 sizeof(TData) * C_per_G);
2347 for (
int g = 0; g < groups; ++g) {
2350 (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
2353 sizeof(TData) * C_per_G);
2359 col_data += kernel_t * kernel_h * kernel_w * C;
2369 C10_EXPORT
void Im2ColNd<float, CPUContext, StorageOrder::NHWC>(
2373 const int* img_shape,
2374 const int* col_shape,
2375 const int* kernel_shape,
2377 const int* dilation,
2379 const float* img_data,
2384 const int channels =
2385 col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
2386 Im2Col3dNHWCImpl<float>(
2410 CAFFE_NOT_IMPLEMENTED;
template <>
C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NCHW>(
    const int C,
    const int H,
    const int W,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* col_data,
    float* img_data,
    CPUContext* context,
    const int /* groups */) {
  if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
      dilation_w == 1) {
    Col2ImZeroPaddingAndNoDilationNCHW<float>(
        C, H, W, kernel_h, kernel_w, stride_h, stride_w, col_data, img_data,
        context);
    return;
  }
  Set<float, CPUContext>(C * H * W, 0.0f, img_data, context);
  const int output_h =
      (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w =
      (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int output_size = output_h * output_w;
  for (int c = 0; c < C; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        for (int h = 0; h < output_h; ++h) {
          const int h_pad = h * stride_h - pad_t + kh * dilation_h;
          if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
            continue;
          }
          for (int w = 0; w < output_w; ++w) {
            const int w_pad = w * stride_w - pad_l + kw * dilation_w;
            if (utils::IsAGeZeroAndALtB(w_pad, W)) {
              img_data[(c * H + h_pad) * W + w_pad] +=
                  col_data[h * output_w + w];
            }
          }
        }
        col_data += output_size;
      }
    }
  }
}
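// Col2Im is the adjoint of Im2Col: instead of gathering image pixels into the
// column buffer it accumulates (+=) column entries back into the image, so
// pixels covered by several kernel positions receive the sum of their
// contributions. This is what the convolution backward pass uses to turn the
// gradient with respect to the column buffer into a gradient image.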
2482 C10_EXPORT
void Col2Im<float, CPUContext, StorageOrder::NHWC>(
2488 const int dilation_h,
2489 const int dilation_w,
2496 const float* col_data,
2498 CPUContext* context,
2501 if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
2502 dilation_w == 1 && groups == 1) {
2503 Col2ImZeroPaddingAndNoDilationNHWC<float>(
2517 Set<float, CPUContext>(H * W * C, 0, img_data, context);
2518 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2519 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2520 const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2521 const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2525 for (
int h = 0; h < output_h; ++h) {
2527 for (
int w = 0; w < output_w; ++w) {
2528 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
2529 if (!utils::IsAGeZeroAndALtB(ih, H)) {
2530 col_data += kernel_w * C;
2533 for (
int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
2534 if (utils::IsAGeZeroAndALtB(iw, W)) {
2535 float* img_data_patch = img_data + (ih * W + iw) * C;
2537 C, img_data_patch, col_data, img_data_patch, context);
2547 const int C_per_G = C / groups;
2548 for (
int h = 0; h < output_h; ++h) {
2550 for (
int w = 0; w < output_w; ++w) {
2552 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2554 for (
int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
2555 if (utils::IsAGeZeroAndALtB(ih, H) &&
2556 utils::IsAGeZeroAndALtB(iw, W)) {
2557 float* img_data_patch = img_data + (ih * W + iw) * C;
2558 for (
int g = 0; g < groups; ++g) {
2561 img_data_patch + g * C_per_G,
2562 col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G,
2563 img_data_patch + g * C_per_G,
2569 col_data += kernel_h * kernel_w * C;
2582 template <
typename TData>
2583 C10_EXPORT
void Col2Im3dNHWCImpl(
2591 const int dilation_t,
2592 const int dilation_h,
2593 const int dilation_w,
2603 const TData* col_data,
2605 CPUContext* context,
2607 Set<float, CPUContext>(T * H * W * C, 0, img_data, context);
2608 const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
2609 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
2610 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
2611 const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1;
2612 const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
2613 const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
2614 const int C_per_G = C / groups;
2617 for (
int t = 0; t < output_t; ++t) {
2619 for (
int h = 0; h < output_h; ++h) {
2621 for (
int w = 0; w < output_w; ++w) {
2623 for (
int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
2625 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
2627 for (
int iw = w_pad; iw < w_pad + dkernel_w;
2628 iw += dilation_w, ++s) {
2629 if (utils::IsAGeZeroAndALtB(it, T) &&
2630 utils::IsAGeZeroAndALtB(ih, H) &&
2631 utils::IsAGeZeroAndALtB(iw, W)) {
2632 float* img_data_patch = img_data + ((it * T + ih) * W + iw) * C;
2633 for (
int g = 0; g < groups; ++g) {
2636 img_data_patch + g * C_per_G,
2638 (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
2640 img_data_patch + g * C_per_G,
2647 col_data += kernel_t * kernel_h * kernel_w * C;
2657 C10_EXPORT
void Col2ImNd<float, CPUContext, StorageOrder::NHWC>(
2661 const int* img_shape,
2662 const int* col_shape,
2663 const int* kernel_shape,
2665 const int* dilation,
2667 const float* col_data,
2669 CPUContext* context,
2672 const int channels =
2673 col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2];
2674 Col2Im3dNHWCImpl<float>(
2699 CAFFE_NOT_IMPLEMENTED;
template <>
C10_EXPORT void BiasCHW<float, CPUContext>(
    const float* bias,
    const float* /* bias_multiplier */,
    const int bias_channels,
    const int image_size,
    float* image,
    CPUContext* /* context */) {
  for (int c = 0; c < bias_channels; ++c) {
    const float b = bias[c];
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    float32x4_t vBias = vdupq_n_f32(b);
    constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
    int prologue = kVecSizeInFloat -
        (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float);
    int i = 0;
    for (; i < prologue; ++i) {
      image[i] += b;
    }
    constexpr int kUnroll = 8;
    constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
    int remainder = image_size - prologue;
    int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
    for (; i < vectorizable; i += kFloatsPerLoop) {
      float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
      float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
      float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
      float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
      float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
      float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
      float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
      float32x4_t v7 = vld1q_f32_aligned(image + i + 28);
      v0 = vaddq_f32(v0, vBias);
      v1 = vaddq_f32(v1, vBias);
      v2 = vaddq_f32(v2, vBias);
      v3 = vaddq_f32(v3, vBias);
      v4 = vaddq_f32(v4, vBias);
      v5 = vaddq_f32(v5, vBias);
      v6 = vaddq_f32(v6, vBias);
      v7 = vaddq_f32(v7, vBias);
      vst1q_f32_aligned(image + i + 0, v0);
      vst1q_f32_aligned(image + i + 4, v1);
      vst1q_f32_aligned(image + i + 8, v2);
      vst1q_f32_aligned(image + i + 12, v3);
      vst1q_f32_aligned(image + i + 16, v4);
      vst1q_f32_aligned(image + i + 20, v5);
      vst1q_f32_aligned(image + i + 24, v6);
      vst1q_f32_aligned(image + i + 28, v7);
    }
    for (; i < image_size; ++i) {
      image[i] += b;
    }
#else
    for (int i = 0; i < image_size; ++i) {
      image[i] += b;
    }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    image += image_size;
  }
}
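// On ARM the bias add is vectorized by hand: a scalar prologue walks up to the
// first 16-byte-aligned address so the aligned NEON loads/stores are valid,
// the main loop processes 8 float32x4 registers (32 floats) per iteration, and
// a scalar epilogue handles the tail. On other CPUs the plain scalar loop is
// used instead.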
#define CAFFE2_SPECIALIZED_COPYVECTOR(T) \
  template <> \
  C10_EXPORT void CopyVector<T, CPUContext>( \
      const int N, const T* src, T* dst, CPUContext* /* context */) { \
    if (src != dst && N > 0) { \
      memcpy(dst, src, sizeof(T) * N); \
    } \
  }
CAFFE2_SPECIALIZED_COPYVECTOR(float)
#undef CAFFE2_SPECIALIZED_COPYVECTOR
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...