#include "caffe2/utils/math/reduce.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <limits>
#include <numeric>
#include <vector>

#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL

#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/utils.h"

namespace caffe2 {
namespace math {

namespace {

// Reduces each row of the row-major rows x cols matrix X into Y with the
// given Eigen reduction, scaling the result by alpha.
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc)              \
  template <typename T>                                                \
  void Rowwise##Func(                                                  \
      const int rows,                                                  \
      const int cols,                                                  \
      const T alpha,                                                   \
      const T* X,                                                      \
      T* Y,                                                            \
      CPUContext* /* context */) {                                     \
    EigenVectorMap<T>(Y, rows) = ConstEigenMatrixMap<T>(X, cols, rows) \
                                     .colwise()                        \
                                     .EigenFunc()                      \
                                     .transpose() *                    \
        alpha;                                                         \
  }
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION

#ifndef CAFFE2_USE_EIGEN_FOR_BLAS

// When a BLAS backend is available, the rowwise L1/L2 norms delegate to the
// cblas_?asum / cblas_?nrm2 routines, one call per row.
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
  template <>                                               \
  void Rowwise##Func(                                       \
      const int rows,                                       \
      const int cols,                                       \
      const T alpha,                                        \
      const T* X,                                           \
      T* Y,                                                 \
      CPUContext* /* context */) {                          \
    for (int i = 0; i < rows; ++i) {                        \
      Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha;       \
    }                                                       \
  }
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION

#endif // CAFFE2_USE_EIGEN_FOR_BLAS

// Reduces the rows of X elementwise into the cols-sized output Y, seeding Y
// with row 0 and folding in the remaining rows with the binary MathFunc.
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc)          \
  template <typename T>                                           \
  void Colwise##Func(                                             \
      const int rows,                                             \
      const int cols,                                             \
      const T alpha,                                              \
      const T* X,                                                 \
      T* Y,                                                       \
      CPUContext* context) {                                      \
    std::memcpy(Y, X, sizeof(T) * cols);                          \
    for (int i = 1; i < rows; ++i) {                              \
      MathFunc<T, CPUContext>(cols, Y, X + i * cols, Y, context); \
    }                                                             \
    Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);          \
  }
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION

template <typename T>
void ColwiseReduceMean(
    const int rows,
    const int cols,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  // Fold the 1 / rows normalization into alpha so the mean reuses the
  // ColwiseReduceSum kernel without a second pass over Y.
  ColwiseReduceSum<T>(rows, cols, alpha / static_cast<T>(rows), X, Y, context);
}
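// A minimal worked sketch of the colwise path (values are illustrative):
// with alpha = 1 and the row-major 2 x 3 input
//
//   X = [1 2 3]
//       [4 5 6]
//
// ColwiseReduceSum produces Y = [5 7 9], one reduction per column.
// ColwiseReduceMean reuses exactly this kernel: with alpha pre-divided by
// rows = 2 it would instead yield Y = [2.5 3.5 4.5].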
template <typename T>
void ColwiseReduceL1(
    const int rows,
    const int cols,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  ConstEigenArrayMap<T> X_arr(X, cols, rows);
  EigenVectorArrayMap<T> Y_arr(Y, cols);
  Y_arr = X_arr.col(0).abs();
  for (int i = 1; i < rows; ++i) {
    Y_arr += X_arr.col(i).abs();
  }
  Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
}
template <typename T>
void ColwiseReduceL2(
    const int rows,
    const int cols,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* /* context */) {
  ConstEigenArrayMap<T> X_arr(X, cols, rows);
  EigenVectorArrayMap<T> Y_arr(Y, cols);
  Y_arr = X_arr.col(0).square();
  for (int i = 1; i < rows; ++i) {
    Y_arr += X_arr.col(i).square();
  }
  Y_arr = Y_arr.sqrt() * alpha;
}
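// Worked example for ColwiseReduceL2 (alpha = 1, values illustrative): for a
// column holding [3, 4], the accumulated sum of squares is 9 + 16 = 25, and
// the final sqrt() pass gives 5, i.e. Y[j] = alpha * sqrt(sum_i X(i, j)^2).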
template <typename T>
void BothEndsReduceMin(
    const int M,
    const int N,
    const int K,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  EigenVectorArrayMap<T> Y_arr(Y, N);
  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
  for (int i = 1; i < M; ++i) {
    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
    for (int j = 0; j < N; ++j) {
      Y[j] = std::min(Y[j], X_arr.col(j).minCoeff());
    }
  }
  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
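// The BothEnds* kernels view X as an M x N x K tensor and reduce over the
// outer (M) and inner (K) dimensions, leaving one output per middle index.
// A minimal sketch with M = N = K = 2 (values illustrative):
//
//   X = { {{1, 2}, {3, 4}},
//         {{5, 0}, {7, 8}} }
//
// BothEndsReduceMin with alpha = 1 gives Y[j] = min over i, k of X[i][j][k],
// i.e. Y = [0, 3].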
template <typename T>
void BothEndsReduceMax(
    const int M,
    const int N,
    const int K,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  EigenVectorArrayMap<T> Y_arr(Y, N);
  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
  for (int i = 1; i < M; ++i) {
    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
    for (int j = 0; j < N; ++j) {
      Y[j] = std::max(Y[j], X_arr.col(j).maxCoeff());
    }
  }
  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceSum(
    const int M,
    const int N,
    const int K,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  EigenVectorArrayMap<T> Y_arr(Y, N);
  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
  for (int i = 1; i < M; ++i) {
    Y_arr +=
        ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum().transpose();
  }
  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMean(
    const int M,
    const int N,
    const int K,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  EigenVectorArrayMap<T> Y_arr(Y, N);
  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
  for (int i = 1; i < M; ++i) {
    Y_arr +=
        ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum().transpose();
  }
  Scale<T, T, CPUContext>(N, alpha / static_cast<T>(M * K), Y, Y, context);
}
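// As with ColwiseReduceMean, the mean folds its 1 / (M * K) normalization
// into the final Scale call, so no second pass over Y is needed. E.g. with
// M = K = 2 and a middle slice whose four elements sum to 12, Y[j] = 3.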
template <typename T>
void BothEndsReduceL1(
    const int M,
    const int N,
    const int K,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  EigenVectorMap<T> Y_vec(Y, N);
  Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
  for (int i = 1; i < M; ++i) {
    Y_vec += ConstEigenMatrixMap<T>(X + i * N * K, K, N)
                 .colwise()
                 .template lpNorm<1>()
                 .transpose();
  }
  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceL2(
    const int M,
    const int N,
    const int K,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* /* context */) {
  ConstEigenArrayMap<T> X0_arr(X, K, N);
  EigenVectorArrayMap<T> Y_arr(Y, N);
  for (int i = 0; i < N; ++i) {
    Y_arr(i) = X0_arr.col(i).square().sum();
  }
  for (int i = 1; i < M; ++i) {
    ConstEigenArrayMap<T> Xi_arr(X + i * N * K, K, N);
    for (int j = 0; j < N; ++j) {
      Y_arr(j) += Xi_arr.col(j).square().sum();
    }
  }
  Y_arr = Y_arr.sqrt() * alpha;
}
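// Sketch (values illustrative): with M = K = 2 and the two slice columns for
// index j holding {1, 2} and {2, 4}, BothEndsReduceL2 accumulates
// 1 + 4 + 4 + 16 = 25 and then emits sqrt(25) * alpha.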
template <typename T, class Reducer>
void ReduceTensorImpl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const Reducer& reducer,
    const T init,
    const T* X,
    T* Y,
    CPUContext* context) {
  const int X_size =
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Set<T, CPUContext>(Y_size, init, Y, context);
  std::vector<int> index(ndim, 0);
  for (int X_index = 0; X_index < X_size; ++X_index) {
    const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
    Y[Y_index] = reducer(Y[Y_index], X[X_index]);
    utils::IncreaseIndexInDims(ndim, X_dims, index.data());
  }
}
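// ReduceTensorImpl is the fully general fallback: it walks X linearly while
// tracking the multi-index of the current element, and folds each value into
// the output slot found by projecting that index onto Y_dims (reduced axes
// have Y_dims[d] == 1, so they contribute nothing to Y_index). A minimal
// sketch, assuming X_dims = {2, 3} and Y_dims = {2, 1}:
//
//   index visits (0,0) (0,1) (0,2) (1,0) (1,1) (1,2);
//   GetIndexFromDims maps the first three to Y_index 0 and the last three to
//   Y_index 1, so Y[0] reduces row 0 and Y[1] reduces row 1.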
template <typename T>
void ReduceMinImpl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  ReduceTensorImpl(
      ndim,
      X_dims,
      Y_dims,
      [](const T a, const T b) { return std::min(a, b); },
      std::numeric_limits<T>::max(),
      X,
      Y,
      context);
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMaxImpl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  ReduceTensorImpl(
      ndim,
      X_dims,
      Y_dims,
      [](const T a, const T b) { return std::max(a, b); },
      std::numeric_limits<T>::lowest(),
      X,
      Y,
      context);
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceSumImpl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMeanImpl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
  const int X_size =
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Scale<T, T, CPUContext>(
      Y_size,
      alpha * static_cast<T>(Y_size) / static_cast<T>(X_size),
      Y,
      Y,
      context);
}
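// Each output element accumulates X_size / Y_size inputs, so scaling the
// summed result by alpha * Y_size / X_size turns ReduceSumImpl's output into
// a mean. E.g. with X_size = 6 and Y_size = 2, each output slot averages
// three inputs.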
template <typename T>
void ReduceL1Impl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  ReduceTensorImpl(
      ndim,
      X_dims,
      Y_dims,
      [](const T a, const T b) { return a + std::abs(b); },
      T(0),
      X,
      Y,
      context);
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceL2Impl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* /* context */) {
  ReduceTensorImpl(
      ndim,
      X_dims,
      Y_dims,
      [](const T a, const T b) { return a + b * b; },
      T(0),
      X,
      Y,
      nullptr);
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  EigenVectorArrayMap<T> Y_arr(Y, Y_size);
  Y_arr = Y_arr.sqrt() * alpha;
}
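// Note that ReduceL1Impl and ReduceL2Impl are plain accumulations over
// ReduceTensorImpl (a + |b| and a + b * b respectively); only the L2 case
// needs the trailing elementwise sqrt. E.g. reducing [3, -4] yields
// L1 = 7 and L2 = sqrt(25) = 5.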
template <typename T>
void RowwiseMoments(
    const int rows,
    const int cols,
    const T* X,
    T* mean,
    T* var) {
  ConstEigenArrayMap<T> X_arr(X, cols, rows);
  for (int i = 0; i < rows; ++i) {
    mean[i] = X_arr.col(i).mean();
    var[i] = X_arr.col(i).square().mean() - mean[i] * mean[i];
  }
}
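// The moment kernels use the single-pass identity Var[x] = E[x^2] - E[x]^2.
// Worked example for a row holding [1, 2, 3]: mean = 2 and E[x^2] = 14 / 3,
// so var = 14 / 3 - 4 = 2 / 3.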
template <typename T>
void ColwiseMoments(
    const int rows,
    const int cols,
    const T* X,
    T* mean,
    T* var) {
  ConstEigenArrayMap<T> X_arr(X, cols, rows);
  EigenVectorArrayMap<T> mean_arr(mean, cols);
  EigenVectorArrayMap<T> var_arr(var, cols);
  mean_arr = X_arr.col(0);
  var_arr = X_arr.col(0).square();
  for (int i = 1; i < rows; ++i) {
    mean_arr += X_arr.col(i);
    var_arr += X_arr.col(i).square();
  }
  const T scale = T(1) / static_cast<T>(rows);
  mean_arr *= scale;
  var_arr = var_arr * scale - mean_arr.square();
}
template <typename T>
void BothEndsMoments(
    const int M,
    const int N,
    const int K,
    const T* X,
    T* mean,
    T* var) {
  ConstEigenArrayMap<T> X_arr(X, K, M * N);
  EigenVectorArrayMap<T> mean_arr(mean, N);
  EigenVectorArrayMap<T> var_arr(var, N);
  for (int i = 0; i < N; ++i) {
    mean_arr(i) = X_arr.col(i).sum();
    var_arr(i) = X_arr.col(i).square().sum();
  }
  for (int i = 1; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      const int c = i * N + j;
      mean_arr(j) += X_arr.col(c).sum();
      var_arr(j) += X_arr.col(c).square().sum();
    }
  }
  const T scale = T(1) / static_cast<T>(M * K);
  mean_arr *= scale;
  var_arr = var_arr * scale - mean_arr.square();
}
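// BothEndsMoments maps X as one K x (M * N) column-major array, so the
// column for outer index i and middle index j is simply column i * N + j;
// e.g. with M = N = K = 2, column 3 holds X[1][1][0..1].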
template <typename T>
void MomentsImpl(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const T* X,
    T* mean,
    T* var,
    CPUContext* /* context */) {
  const int X_size =
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  if (X_size == 0) {
    std::memset(mean, 0, sizeof(T) * Y_size);
    std::memset(var, 0, sizeof(T) * Y_size);
    return;
  }
  if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
    // No reduction: every element is its own mean with zero variance.
    std::memcpy(mean, X, sizeof(T) * Y_size);
    std::memset(var, 0, sizeof(T) * Y_size);
    return;
  }
  int rows;
  int cols;
  if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
    RowwiseMoments<T>(rows, cols, X, mean, var);
    return;
  }
  if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
    ColwiseMoments<T>(rows, cols, X, mean, var);
    return;
  }
  int pre;
  int mid;
  int nxt;
  if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &pre, &mid, &nxt)) {
    BothEndsMoments<T>(pre, mid, nxt, X, mean, var);
    return;
  }
  // Generic fallback: accumulate sums and sums of squares per output slot.
  std::memset(mean, 0, sizeof(T) * Y_size);
  std::memset(var, 0, sizeof(T) * Y_size);
  std::vector<int> index(ndim, 0);
  for (int X_index = 0; X_index < X_size; ++X_index) {
    const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
    mean[Y_index] += X[X_index];
    var[Y_index] += X[X_index] * X[X_index];
    utils::IncreaseIndexInDims(ndim, X_dims, index.data());
  }
  const T scale = static_cast<T>(Y_size) / static_cast<T>(X_size);
  EigenVectorArrayMap<T> mean_arr(mean, Y_size);
  EigenVectorArrayMap<T> var_arr(var, Y_size);
  mean_arr *= scale;
  var_arr = var_arr * scale - mean_arr.square();
}

} // namespace
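// The exported specializations below share one dispatch pattern: handle the
// degenerate cases (empty input, alpha == 0, no reduction), then try the
// rowwise / colwise / both-ends fast paths before falling back to the
// generic strided implementation. A hypothetical caller-side sketch (buffer
// names are illustrative, not part of this file):
//
//   const int X_dims[2] = {2, 3};
//   const int Y_dims[2] = {1, 3}; // reduce away the first dimension
//   const float X[6] = {1, 2, 3, 4, 5, 6};
//   float Y[3];
//   CPUContext context;
//   ReduceSum<float, CPUContext>(2, X_dims, Y_dims, 1.0f, X, Y, &context);
//   // Y == {5, 7, 9}, computed via the colwise fast path.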
#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) \
  template <>                                               \
  C10_EXPORT void Func<T, CPUContext>(                      \
      const int N,                                          \
      const T* X,                                           \
      T* Y,                                                 \
      CPUContext* /* context */) {                          \
    *Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc();     \
  }
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
#undef DELEGATE_GLOBAL_REDUCE_FUNCTION

#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm)                  \
  template <>                                                              \
  C10_EXPORT void Func<T, CPUContext>(                                     \
      const int ndim,                                                      \
      const int* X_dims,                                                   \
      const int* Y_dims,                                                   \
      const T alpha,                                                       \
      const T* X,                                                          \
      T* Y,                                                                \
      CPUContext* context) {                                               \
    const int X_size =                                                     \
        std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); \
    const int Y_size =                                                     \
        std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); \
    if (X_size == 0) {                                                     \
      Set<T, CPUContext>(Y_size, alpha * kInit, Y, context);               \
      return;                                                              \
    }                                                                      \
    if (alpha == T(0)) {                                                   \
      std::memset(Y, 0, sizeof(T) * Y_size);                               \
      return;                                                              \
    }                                                                      \
    if (std::equal(X_dims, X_dims + ndim, Y_dims)) {                       \
      if (kIsNorm) {                                                       \
        EigenVectorArrayMap<T>(Y, Y_size) =                                \
            ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha;          \
      } else {                                                             \
        Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context);             \
      }                                                                    \
      return;                                                              \
    }                                                                      \
    int rows;                                                              \
    int cols;                                                              \
    if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {      \
      Rowwise##Func<T>(rows, cols, alpha, X, Y, context);                  \
      return;                                                              \
    }                                                                      \
    if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {      \
      Colwise##Func<T>(rows, cols, alpha, X, Y, context);                  \
      return;                                                              \
    }                                                                      \
    int M;                                                                 \
    int N;                                                                 \
    int K;                                                                 \
    if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) {       \
      BothEnds##Func<T>(M, N, K, alpha, X, Y, context);                    \
      return;                                                              \
    }                                                                      \
    Func##Impl<T>(ndim, X_dims, Y_dims, alpha, X, Y, context);             \
  }
DELEGATE_REDUCE_FUNCTION(
    float,
    ReduceMin,
    std::numeric_limits<float>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    double,
    ReduceMin,
    std::numeric_limits<double>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int32_t,
    ReduceMin,
    std::numeric_limits<std::int32_t>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int64_t,
    ReduceMin,
    std::numeric_limits<std::int64_t>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    float,
    ReduceMax,
    std::numeric_limits<float>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(
    double,
    ReduceMax,
    std::numeric_limits<double>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int32_t,
    ReduceMax,
    std::numeric_limits<std::int32_t>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int64_t,
    ReduceMax,
    std::numeric_limits<std::int64_t>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
#undef DELEGATE_REDUCE_FUNCTION

#define CAFFE2_SPECIALIZED_MOMENTS(T)                            \
  template <>                                                    \
  C10_EXPORT void Moments<T, CPUContext>(                        \
      const int ndim,                                            \
      const int* X_dims,                                         \
      const int* Y_dims,                                         \
      const T* X,                                                \
      T* mean,                                                   \
      T* var,                                                    \
      CPUContext* context) {                                     \
    MomentsImpl<T>(ndim, X_dims, Y_dims, X, mean, var, context); \
  }
CAFFE2_SPECIALIZED_MOMENTS(float)
#undef CAFFE2_SPECIALIZED_MOMENTS

} // namespace math
} // namespace caffe2
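// A hypothetical usage sketch for the Moments<float, CPUContext>
// specialization above (buffer names are illustrative, not part of this
// file):
//
//   const int X_dims[2] = {2, 3};
//   const int Y_dims[2] = {2, 1}; // per-row mean and variance
//   const float X[6] = {1, 2, 3, 4, 5, 6};
//   float mean[2];
//   float var[2];
//   CPUContext context;
//   Moments<float, CPUContext>(2, X_dims, Y_dims, X, mean, var, &context);
//   // mean == {2, 5}, var == {2/3, 2/3}, via the rowwise path.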