#include "caffe2/operators/pool_op.h"

#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
#include "caffe2/quantization/server/op_wrapper.h"
#include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
#include "caffe2/utils/eigen_utils.h"

namespace caffe2 {

using namespace std;
using namespace dnnlowp;

namespace {

// Policy class for average pooling: accumulate, then divide by window size.
template <typename T>
class AveragePool {
 public:
  static float initialize() {
    return 0.0;
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) += x_mat.col(x_col);
  }

  static void process(const T& x_data, T& y_data) {
    y_data += x_data;
  }

  static void finalize(const int size, T& y_data) {
    y_data /= size;
  }
};
// Policy class for max pooling: element-wise maximum, finalization is a no-op.
template <typename T>
class MaxPool {
 public:
  static T initialize() {
    return std::numeric_limits<T>::lowest();
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) = y_mat.col(y_col).cwiseMax(x_mat.col(x_col));
  }

  static void process(const T& x_data, T& y_data) {
    if (x_data > y_data) {
      y_data = x_data;
    }
  }

  static void finalize(const int /*size*/, T& /*y_data*/) {}
};
using AveragePoolFp32Op =
    PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>;
template <typename T>
class AveragePoolDnnLowPOp final
    : public ConvPoolDNNLowPOpBase<T, AveragePoolFp32Op> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, AveragePoolFp32Op);
  AveragePoolDnnLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : BaseType(operator_def, ws) {
    for (int i = 0; i < this->kernel_.size(); ++i) {
      CAFFE_ENFORCE(
          dilation_[i] == 1,
          "Pooling op does not support dilation right now.");
    }
    if (!global_pooling_) {
      for (int i = 0; i < this->kernel_.size(); ++i) {
        CAFFE_ENFORCE(
            pads_[i] < kernel_[i] &&
                pads_[i + this->kernel_.size()] < kernel_[i],
            "Pad should be smaller than kernel.");
      }
    }
  }
  bool RunOnDeviceWithOrderNCHW() override {
    this->ParseDNNLowPOperatorArguments_();

    in_qparams_[0] =
        GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

    // Quantize the input on the fly if it arrives as fp32.
    vector<T> X_temp;
    const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

    GetOutputQuantizationParams_();

    auto& X = InputTensorCPU_(0);
    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
    auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());

    T* Ydata = GetQuantizedOutputData_();

    int channels = X.dim32(1);
    int height = X.dim32(2);
    int width = this->kernel_.size() > 1 ? X.dim32(3) : 1;
    int depth = this->kernel_.size() > 2 ? X.dim32(4) : 1;
    int pooled_height = Y->dim32(2);
    int pooled_width = this->kernel_.size() > 1 ? Y->dim32(3) : 1;
    int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(4) : 1;

    // Clamp range for the output precision, e.g. [0, 255] for uint8_t.
    bool is_signed = std::is_signed<T>::value;
    int precision = out_qparams_.precision;
    int32_t minimum = is_signed ? -(1 << (precision - 1)) : 0;
    int32_t maximum =
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1;
    switch (this->kernel_.size()) {
      case 2:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            const T* Xdata_temp = Xdata + height * width * (c + channels * n);
            T* Ydata_temp =
                Ydata + pooled_height * pooled_width * (c + channels * n);
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);

                int size = (hend - hstart) * (wend - wstart);
                const int pool_index = ph * pooled_width + pw;
                // Accumulate sum(x_q) - size * zero_point in 32-bit.
                int32_t Yh = -in_qparams_[0].zero_point * size;
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    const int input_index = h * width + w;
                    Yh += Xdata_temp[input_index];
                  }
                }
                // Requantize: scale, shift by the output zero point, round,
                // and clamp.
                float multiplier =
                    in_qparams_[0].scale / out_qparams_.scale / size;
                Ydata_temp[pool_index] = std::min<int32_t>(
                    std::max<int32_t>(
                        nearbyint(Yh * multiplier + out_qparams_.zero_point),
                        minimum),
                    maximum);
              }
            }
          }
        }
        break;
      case 3:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            const T* Xdata_temp =
                Xdata + height * width * depth * (c + channels * n);
            T* Ydata_temp = Ydata +
                pooled_height * pooled_width * pooled_depth *
                    (c + channels * n);
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);
                for (int pd = 0; pd < pooled_depth; ++pd) {
                  int dstart = pd * stride_[2] - pads_[2];
                  int dend = min(dstart + kernel_[2], depth);
                  dstart = max(dstart, 0);

                  int size =
                      (hend - hstart) * (wend - wstart) * (dend - dstart);
                  const int pool_index =
                      ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                  int32_t Yh = -in_qparams_[0].zero_point * size;
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      for (int d = dstart; d < dend; ++d) {
                        const int input_index =
                            h * width * depth + w * depth + d;
                        Yh += Xdata_temp[input_index];
                      }
                    }
                  }
                  float multiplier =
                      in_qparams_[0].scale / out_qparams_.scale / size;
                  Ydata_temp[pool_index] = std::min<int32_t>(
                      std::max<int32_t>(
                          nearbyint(Yh * multiplier + out_qparams_.zero_point),
                          minimum),
                      maximum);
                }
              }
            }
          }
        }
        break;
      default:
        CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
    }

    RunOnDeviceEpilogue_();

    return true;
  } // RunOnDeviceWithOrderNCHW
  bool RunOnDeviceWithOrderNHWC() override {
    this->ParseDNNLowPOperatorArguments_();

    in_qparams_[0] =
        GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

    vector<T> X_temp;
    const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

    GetOutputQuantizationParams_();

    auto& X = InputTensorCPU_(0);
    int channels = X.dim32(X.ndim() - 1);
    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels);
    auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());

    T* Ydata = GetQuantizedOutputData_();

    int height = X.dim32(1);
    int width = this->kernel_.size() > 1 ? X.dim32(2) : 1;
    int depth = this->kernel_.size() > 2 ? X.dim32(3) : 1;
    int pooled_height = Y->dim32(1);
    int pooled_width = this->kernel_.size() > 1 ? Y->dim32(2) : 1;
    int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(3) : 1;

    bool is_signed = std::is_signed<T>::value;
    int precision = out_qparams_.precision;
    int32_t minimum = is_signed ? -(1 << (precision - 1)) : 0;
    int32_t maximum =
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1;
    switch (this->kernel_.size()) {
      case 2:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          const T* Xdata_temp = Xdata + n * height * width * channels;
          T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              int size = (hend - hstart) * (wend - wstart);
              float multiplier =
                  in_qparams_[0].scale / out_qparams_.scale / size;

              for (int c = 0; c < channels; ++c) {
                const int pool_idx = (ph * pooled_width + pw) * channels + c;
                int32_t Yh = -in_qparams_[0].zero_point * size;
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    const int input_idx = (h * width + w) * channels + c;
                    Yh += Xdata_temp[input_idx];
                  }
                }
                Ydata_temp[pool_idx] = std::min<int32_t>(
                    std::max<int32_t>(
                        nearbyint(Yh * multiplier + out_qparams_.zero_point),
                        minimum),
                    maximum);
              }
            }
          }
        }
        break;
      case 3:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          const T* Xdata_temp = Xdata + n * height * width * depth * channels;
          T* Ydata_temp = Ydata +
              n * pooled_height * pooled_width * pooled_depth * channels;
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                int size = (hend - hstart) * (wend - wstart) * (dend - dstart);
                float multiplier =
                    in_qparams_[0].scale / out_qparams_.scale / size;

                for (int c = 0; c < channels; ++c) {
                  const int pool_idx =
                      ((ph * pooled_width + pw) * pooled_depth + pd) *
                          channels +
                      c;
                  int32_t Yh = -in_qparams_[0].zero_point * size;
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      for (int d = dstart; d < dend; ++d) {
                        const int input_idx =
                            ((h * width + w) * depth + d) * channels + c;
                        Yh += Xdata_temp[input_idx];
                      }
                    }
                  }
                  Ydata_temp[pool_idx] = std::min<int32_t>(
                      std::max<int32_t>(
                          nearbyint(Yh * multiplier + out_qparams_.zero_point),
                          minimum),
                      maximum);
                }
              }
            }
          }
        }
        break;
      default:
        CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
    }

    RunOnDeviceEpilogue_();

    return true;
  } // RunOnDeviceWithOrderNHWC
}; // class AveragePoolDnnLowPOp
using MaxPoolFp32Op = PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>;
template <typename T>
class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, MaxPoolFp32Op);
  MaxPoolDnnLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : BaseType(operator_def, ws) {
    for (int i = 0; i < this->kernel_.size(); ++i) {
      CAFFE_ENFORCE(
          dilation_[i] == 1,
          "Pooling op does not support dilation right now.");
    }
    if (!global_pooling_) {
      for (int i = 0; i < this->kernel_.size(); ++i) {
        CAFFE_ENFORCE(
            pads_[i] < kernel_[i] &&
                pads_[i + this->kernel_.size()] < kernel_[i],
            "Pad should be smaller than kernel.");
      }
    }
  }
  bool RunOnDeviceWithOrderNCHW() override {
    this->ParseDNNLowPOperatorArguments_();

    in_qparams_[0] =
        GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

    // Max pooling preserves quantization parameters.
    out_qparams_ = in_qparams_[0];

    vector<T> X_temp;
    const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

    auto& X = InputTensorCPU_(0);
    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
    auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());

    T* Ydata = GetQuantizedOutputData_();

    int channels = X.dim32(1);
    int height = X.dim32(2);
    int width = this->kernel_.size() > 1 ? X.dim32(3) : 1;
    int depth = this->kernel_.size() > 2 ? X.dim32(4) : 1;
    int pooled_height = Y->dim32(2);
    int pooled_width = this->kernel_.size() > 1 ? Y->dim32(3) : 1;
    int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(4) : 1;
    switch (this->kernel_.size()) {
      case 1:
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              T Yh = numeric_limits<T>::lowest();
              for (int h = hstart; h < hend; ++h) {
                Yh = max(Yh, Xdata[h]);
              }
              Ydata[ph] = Yh;
            }
            // Advance to the next channel.
            Xdata += height;
            Ydata += pooled_height;
          }
        }
        break;
      case 2:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            const T* Xdata_temp = Xdata + height * width * (c + channels * n);
            T* Ydata_temp =
                Ydata + pooled_height * pooled_width * (c + channels * n);
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);
                const int pool_index = ph * pooled_width + pw;
                T Yh = numeric_limits<T>::lowest();
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    const int input_index = h * width + w;
                    Yh = max(Yh, Xdata_temp[input_index]);
                  }
                }
                Ydata_temp[pool_index] = Yh;
              }
            }
          }
        }
        break;
      case 3:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            const T* Xdata_temp =
                Xdata + height * width * depth * (c + channels * n);
            T* Ydata_temp = Ydata +
                pooled_height * pooled_width * pooled_depth *
                    (c + channels * n);
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);
                for (int pd = 0; pd < pooled_depth; ++pd) {
                  int dstart = pd * stride_[2] - pads_[2];
                  int dend = min(dstart + kernel_[2], depth);
                  dstart = max(dstart, 0);
                  const int pool_index =
                      ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                  T Yh = numeric_limits<T>::lowest();
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      for (int d = dstart; d < dend; ++d) {
                        const int input_index =
                            h * width * depth + w * depth + d;
                        Yh = max(Yh, Xdata_temp[input_index]);
                      }
                    }
                  }
                  // finalize() is a no-op for max pooling.
                  MaxPool<T>::finalize(
                      (hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
                  Ydata_temp[pool_index] = Yh;
                }
              }
            }
          }
        }
        break;
      default:
        CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
    }

    if (measure_quantization_error_) {
      // Run the fp32 reference implementation to measure quantization error.
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
    }

    RunOnDeviceEpilogue_();

    return true;
  } // RunOnDeviceWithOrderNCHW
  bool RunOnDeviceWithOrderNHWC() override {
    this->ParseDNNLowPOperatorArguments_();

    in_qparams_[0] =
        GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

    // As in the NCHW path, max pooling keeps the input quantization.
    out_qparams_ = in_qparams_[0];

    vector<T> X_temp;
    const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

    auto& X = InputTensorCPU_(0);
    int channels = X.dim32(X.ndim() - 1);
    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels);
    auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());

    T* Ydata = GetQuantizedOutputData_();

    int height = X.dim32(1);
    int width = this->kernel_.size() > 1 ? X.dim32(2) : 1;
    int depth = this->kernel_.size() > 2 ? X.dim32(3) : 1;
    int pooled_height = Y->dim32(1);
    int pooled_width = this->kernel_.size() > 1 ? Y->dim32(2) : 1;
    int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(3) : 1;
    switch (this->kernel_.size()) {
      case 1:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          const T* Xdata_temp = Xdata + n * height * channels;
          T* Ydata_temp = Ydata + n * pooled_height * channels;
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int c = 0; c < channels; ++c) {
              T Yh = numeric_limits<T>::lowest();
              const int pool_idx = ph * channels + c;
              for (int h = hstart; h < hend; ++h) {
                const int input_idx = h * channels + c;
                Yh = max(Yh, Xdata_temp[input_idx]);
              }
              Ydata_temp[pool_idx] = Yh;
            }
          }
        }
        break;
      case 2:
        if (is_same<T, uint8_t>::value) {
          // uint8_t fast path: vectorized kernel from pool_dnnlowp_op_avx2.h.
          // (Argument list reconstructed from context; see the header for the
          // exact signature.)
#ifdef _OPENMP
#pragma omp parallel for
#endif
          for (int n = 0; n < X.dim32(0); ++n) {
            max_pool_avx2(
                reinterpret_cast<const uint8_t*>(Xdata),
                n, height, width, channels, pooled_height, pooled_width,
                kernel_h(), kernel_w(), stride_h(), stride_w(),
                pad_t(), pad_l(),
                reinterpret_cast<uint8_t*>(Ydata));
          }
        } else {
          // Scalar fallback for non-uint8_t types.
#ifdef _OPENMP
#pragma omp parallel for
#endif
          for (int n = 0; n < X.dim32(0); ++n) {
            const T* Xdata_temp = Xdata + n * height * width * channels;
            T* Ydata_temp =
                Ydata + n * pooled_height * pooled_width * channels;
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);
                int size = (hend - hstart) * (wend - wstart);
                for (int c = 0; c < channels; ++c) {
                  T Yh = numeric_limits<T>::lowest();
                  const int pool_idx = (ph * pooled_width + pw) * channels + c;
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      const int input_idx = (h * width + w) * channels + c;
                      Yh = max(Yh, Xdata_temp[input_idx]);
                    }
                  }
                  // finalize() is a no-op for max pooling.
                  MaxPool<T>::finalize(size, Yh);
                  Ydata_temp[pool_idx] = Yh;
                }
              }
            }
          }
        }
        break;
      case 3:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          const T* Xdata_temp = Xdata + n * height * width * depth * channels;
          T* Ydata_temp = Ydata +
              n * pooled_height * pooled_width * pooled_depth * channels;
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                int size = (hend - hstart) * (wend - wstart) * (dend - dstart);
                for (int c = 0; c < channels; ++c) {
                  T Yh = numeric_limits<T>::lowest();
                  const int pool_idx =
                      ((ph * pooled_width + pw) * pooled_depth + pd) *
                          channels +
                      c;
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      for (int d = dstart; d < dend; ++d) {
                        const int input_idx =
                            ((h * width + w) * depth + d) * channels + c;
                        Yh = max(Yh, Xdata_temp[input_idx]);
                      }
                    }
                  }
                  MaxPool<T>::finalize(size, Yh);
                  Ydata_temp[pool_idx] = Yh;
                }
              }
            }
          }
        }
        break;
      default:
        CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
    }

    if (measure_quantization_error_) {
      // Run the fp32 reference implementation to measure quantization error.
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
    }

    RunOnDeviceEpilogue_();

    return true;
  } // RunOnDeviceWithOrderNHWC
}; // class MaxPoolDnnLowPOp

} // namespace
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    AveragePool,
    DNNLOWP,
    AveragePoolDnnLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, DNNLOWP, MaxPoolDnnLowPOp<uint8_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    AveragePool,
    DNNLOWP_16,
    AveragePoolDnnLowPOp<uint16_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    MaxPool,
    DNNLOWP_16,
    MaxPoolDnnLowPOp<uint16_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8AveragePool,
    DNNLOWP,
    AveragePoolDnnLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8MaxPool,
    DNNLOWP,
    MaxPoolDnnLowPOp<uint8_t>);

} // namespace caffe2
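// Illustrative usage sketch: the DNNLOWP implementations above are selected
// through the OperatorDef engine field. Assuming a workspace that already
// holds a quantized uint8 tensor (the blob names "X_q"/"Y_q" below are
// hypothetical), an operator could be built roughly as follows:
//
//   OperatorDef op;
//   op.set_type("Int8AveragePool");   // or "AveragePool"
//   op.set_engine("DNNLOWP");         // selects AveragePoolDnnLowPOp<uint8_t>
//   op.add_input("X_q");
//   op.add_output("Y_q");
//   auto* kernel = op.add_arg();      // 2x2 pooling window
//   kernel->set_name("kernel");
//   kernel->set_i(2);
//   auto pool_op = CreateOperator(op, &workspace);
//   pool_op->Run();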