// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// pool_dnnlowp_op.cc
1 #include "caffe2/operators/pool_op.h"
2 
3 #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
4 #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
5 #include "caffe2/quantization/server/op_wrapper.h"
6 #include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
7 #include "caffe2/utils/eigen_utils.h"
8 
9 namespace caffe2 {
10 
11 using namespace std;
12 
13 namespace {
14 
15 template <typename T>
16 class AveragePool {
17  public:
18  static float initialize() {
19  return 0.0;
20  }
21 
22  static void process(
23  const int x_col,
24  const int y_col,
25  ConstEigenMatrixMap<float>& x_mat,
26  EigenMatrixMap<float>& y_mat) {
27  y_mat.col(y_col) += x_mat.col(x_col);
28  }
29 
30  static void process(const T& x_data, T& y_data) {
31  y_data += x_data;
32  }
33 
34  static void finalize(const int size, T& y_data) {
35  y_data /= size;
36  }
37 };
38 
39 template <typename T>
40 class MaxPool {
41  public:
42  static T initialize() {
43  return std::numeric_limits<T>::lowest();
44  }
45 
46  static void process(
47  const int x_col,
48  const int y_col,
49  ConstEigenMatrixMap<float>& x_mat,
50  EigenMatrixMap<float>& y_mat) {
51  y_mat.col(y_col) = y_mat.col(y_col).cwiseMax(x_mat.col(x_col));
52  }
53 
54  static void process(const T& x_data, T& y_data) {
55  if (x_data > y_data) {
56  y_data = x_data;
57  }
58  }
59 
60  static void finalize(const int /*size*/, T& /*y_data*/) {}
61 };
62 
// fp32 reference AveragePool op; used as the shadow implementation by the
// DNNLOWP base class (e.g. for quantization-error measurement).
using AveragePoolFp32Op =
    PoolOp<float, CPUContext, AveragePoolFunctor<CPUContext>>;
65 
// Quantized (DNNLOWP) average pooling. Accumulates quantized inputs in
// int32, removes the input zero point, rescales the windowed average into
// the output quantization domain, and clamps to the representable range.
template <typename T>
class AveragePoolDnnLowPOp final
    : public ConvPoolDNNLowPOpBase<T, AveragePoolFp32Op> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, AveragePoolFp32Op);

  // Validates the pooling configuration: dilation is unsupported, and
  // (unless global pooling) every pad must be strictly smaller than the
  // corresponding kernel extent.
  AveragePoolDnnLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : BaseType(operator_def, ws) {
    for (int i = 0; i < this->kernel_.size(); ++i) {
      CAFFE_ENFORCE(
          dilation_[i] == 1, "Pooling op does not support dilation right now.");
    }
    if (!global_pooling_) {
      for (int i = 0; i < this->kernel_.size(); ++i) {
        CAFFE_ENFORCE(
            pads_[i] < kernel_[i] &&
                pads_[i + this->kernel_.size()] < kernel_[i],
            "Pad should be smaller than kernel.");
      }
    }
  }

  // 2-D / 3-D quantized average pooling over NCHW-ordered input.
  bool RunOnDeviceWithOrderNCHW() override {
    using namespace dnnlowp;

    this->ParseDNNLowPOperatorArguments_();

    in_qparams_[0] =
        GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

    // Quantize input if needed
    vector<T> X_temp;
    const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

    GetOutputQuantizationParams_();

    auto& X = InputTensorCPU_(0);
    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
    auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());

    T* Ydata = GetQuantizedOutputData_();

    // The main loop
    int channels = X.dim32(1);
    int height = X.dim32(2);
    // Trailing spatial dimensions collapse to 1 for lower-rank pooling.
    int width = this->kernel_.size() > 1 ? X.dim32(3) : 1;
    int depth = this->kernel_.size() > 2 ? X.dim32(4) : 1;
    int pooled_height = Y->dim32(2);
    int pooled_width = this->kernel_.size() > 1 ? Y->dim32(3) : 1;
    int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(4) : 1;

    // Clamp bounds of the quantized output domain, derived from T's
    // signedness and the requested output precision.
    bool is_signed = std::is_signed<T>::value;
    int precision = out_qparams_.precision;
    int32_t minimum = is_signed ? -(1 << (precision - 1)) : 0;
    int32_t maximum =
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1;

    switch (this->kernel_.size()) {
      case 2:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            const T* Xdata_temp = Xdata + height * width * (c + channels * n);
            T* Ydata_temp =
                Ydata + pooled_height * pooled_width * (c + channels * n);
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);

                // Number of input elements this window actually covers
                // (padding excluded); also the averaging denominator.
                int size = (hend - hstart) * (wend - wstart);

                const int pool_index = ph * pooled_width + pw;
                // Pre-subtract the input zero point once per addend so Yh
                // accumulates the sum of de-biased quantized values.
                int32_t Yh = -in_qparams_[0].zero_point * size;
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    const int input_index = h * width + w;
                    Yh += Xdata_temp[input_index];
                  }
                }
                // Rescale the window sum into the output domain and divide
                // by the window size in one multiplier.
                float multiplier =
                    in_qparams_[0].scale / out_qparams_.scale / size;
                Ydata_temp[pool_index] = std::min<int32_t>(
                    std::max<int32_t>(
                        nearbyint(Yh * multiplier + out_qparams_.zero_point),
                        minimum),
                    maximum);
              } // width
            } // height
          } // channel
        } // for each image
        break;
      case 3:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          for (int c = 0; c < channels; ++c) {
            const T* Xdata_temp =
                Xdata + height * width * depth * (c + channels * n);
            T* Ydata_temp = Ydata +
                pooled_height * pooled_width * pooled_depth *
                    (c + channels * n);
            for (int ph = 0; ph < pooled_height; ++ph) {
              int hstart = ph * stride_h() - pad_t();
              int hend = min(hstart + kernel_h(), height);
              hstart = max(hstart, 0);
              for (int pw = 0; pw < pooled_width; ++pw) {
                int wstart = pw * stride_w() - pad_l();
                int wend = min(wstart + kernel_w(), width);
                wstart = max(wstart, 0);
                for (int pd = 0; pd < pooled_depth; ++pd) {
                  // Third spatial dimension uses stride_[2]/pads_[2]
                  // directly (no named accessor).
                  int dstart = pd * stride_[2] - pads_[2];
                  int dend = min(dstart + kernel_[2], depth);
                  dstart = max(dstart, 0);

                  int size =
                      (hend - hstart) * (wend - wstart) * (dend - dstart);
                  const int pool_index =
                      ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                  // Same de-biased accumulation as the 2-D case.
                  int32_t Yh = -in_qparams_[0].zero_point * size;
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      for (int d = dstart; d < dend; ++d) {
                        const int input_index =
                            h * width * depth + w * depth + d;
                        Yh += Xdata_temp[input_index];
                      }
                    }
                  }
                  float multiplier =
                      in_qparams_[0].scale / out_qparams_.scale / size;
                  Ydata_temp[pool_index] = std::min<int32_t>(
                      std::max<int32_t>(
                          nearbyint(Yh * multiplier + out_qparams_.zero_point),
                          minimum),
                      maximum);
                } // depth
              } // width
            } // height
            // Do offset.
          } // channel
        } // for each image
        break;
      default:
        CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
        return false;
    }

    RunOnDeviceEpilogue_();
    return true;
  }

  // 2-D / 3-D quantized average pooling over NHWC-ordered (channels-last)
  // input; channels form the innermost loop for contiguous access.
  bool RunOnDeviceWithOrderNHWC() override {
    // average pooling
    using namespace dnnlowp;

    this->ParseDNNLowPOperatorArguments_();

    in_qparams_[0] =
        GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

    // Quantize input if needed
    vector<T> X_temp;
    const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);

    GetOutputQuantizationParams_();

    auto& X = InputTensorCPU_(0);
    int channels = X.dim32(X.ndim() - 1);
    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels);
    auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());

    T* Ydata = GetQuantizedOutputData_();

    int height = X.dim32(1);
    int width = this->kernel_.size() > 1 ? X.dim32(2) : 1;
    int depth = this->kernel_.size() > 2 ? X.dim32(3) : 1;
    int pooled_height = Y->dim32(1);
    int pooled_width = this->kernel_.size() > 1 ? Y->dim32(2) : 1;
    int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(3) : 1;

    // Clamp bounds of the quantized output domain (same derivation as the
    // NCHW path).
    bool is_signed = std::is_signed<T>::value;
    int precision = out_qparams_.precision;
    int32_t minimum = is_signed ? -(1 << (precision - 1)) : 0;
    int32_t maximum =
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1;

    switch (this->kernel_.size()) {
      case 2:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          const T* Xdata_temp = Xdata + n * height * width * channels;
          T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              int size = (hend - hstart) * (wend - wstart);
              // Window-invariant rescale factor hoisted out of the channel
              // loop.
              float multiplier =
                  in_qparams_[0].scale / out_qparams_.scale / size;

              for (int c = 0; c < channels; ++c) {
                const int pool_idx = (ph * pooled_width + pw) * channels + c;
                // De-biased int32 accumulation (see NCHW path).
                int32_t Yh = -in_qparams_[0].zero_point * size;
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    const int input_idx = (h * width + w) * channels + c;
                    Yh += Xdata_temp[input_idx];
                  }
                }
                Ydata_temp[pool_idx] = std::min<int32_t>(
                    std::max<int32_t>(
                        nearbyint(Yh * multiplier + out_qparams_.zero_point),
                        minimum),
                    maximum);
              } // channel
            } // width
          } // height
        } // for each image
        break;
      case 3:
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int n = 0; n < X.dim32(0); ++n) {
          const T* Xdata_temp = Xdata + n * height * width * depth * channels;
          T* Ydata_temp = Ydata +
              n * pooled_height * pooled_width * pooled_depth * channels;
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                int size = (hend - hstart) * (wend - wstart) * (dend - dstart);
                float multiplier =
                    in_qparams_[0].scale / out_qparams_.scale / size;

                for (int c = 0; c < channels; ++c) {
                  const int pool_idx =
                      ((ph * pooled_width + pw) * pooled_depth + pd) *
                          channels +
                      c;
                  int32_t Yh = -in_qparams_[0].zero_point * size;
                  for (int h = hstart; h < hend; ++h) {
                    for (int w = wstart; w < wend; ++w) {
                      for (int d = dstart; d < dend; ++d) {
                        const int input_idx =
                            ((h * width + w) * depth + d) * channels + c;
                        Yh += Xdata_temp[input_idx];
                      }
                    }
                  }
                  Ydata_temp[pool_idx] = std::min<int32_t>(
                      std::max<int32_t>(
                          nearbyint(Yh * multiplier + out_qparams_.zero_point),
                          minimum),
                      maximum);
                } // channel
              } // depth
            } // width
          } // height
        } // for each image
        break;
      default:
        CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
        return false;
    }

    RunOnDeviceEpilogue_();
    return true;
  }
}; // class AveragePoolDnnLowPOp
359 
360 using MaxPoolFp32Op = PoolOp<float, CPUContext, MaxPoolFunctor<CPUContext>>;
361 
362 template <typename T>
363 class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> {
364  public:
365  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
366  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, MaxPoolFp32Op);
367 
368  MaxPoolDnnLowPOp(const OperatorDef& operator_def, Workspace* ws)
369  : BaseType(operator_def, ws) {
370  for (int i = 0; i < this->kernel_.size(); ++i) {
371  CAFFE_ENFORCE(
372  dilation_[i] == 1, "Pooling op does not support dilation right now.");
373  }
374  if (!global_pooling_) {
375  for (int i = 0; i < this->kernel_.size(); ++i) {
376  CAFFE_ENFORCE(
377  pads_[i] < kernel_[i] &&
378  pads_[i + this->kernel_.size()] < kernel_[i],
379  "Pad should be smaller than kernel.");
380  }
381  }
382  }
383 
384  bool RunOnDeviceWithOrderNCHW() override {
385  using namespace dnnlowp;
386 
387  this->ParseDNNLowPOperatorArguments_();
388 
389  in_qparams_[0] =
390  GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
391  // Even if there is a pre-chosen quantization parameters for the output,
392  // it is ignored because maxpool output quantization should be same as the
393  // input.
394  out_qparams_ = in_qparams_[0];
395 
396  // Quantize input if needed
397  vector<T> X_temp;
398  const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
399 
400  auto& X = InputTensorCPU_(0);
401  auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1));
402  auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
403 
404  T* Ydata = GetQuantizedOutputData_();
405 
406  // The main loop
407  int channels = X.dim32(1);
408  int height = X.dim32(2);
409  int width = this->kernel_.size() > 1 ? X.dim32(3) : 1;
410  int depth = this->kernel_.size() > 2 ? X.dim32(4) : 1;
411  int pooled_height = Y->dim32(2);
412  int pooled_width = this->kernel_.size() > 1 ? Y->dim32(3) : 1;
413  int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(4) : 1;
414 
415  switch (this->kernel_.size()) {
416  case 1:
417  for (int n = 0; n < X.dim32(0); ++n) {
418  for (int c = 0; c < channels; ++c) {
419  for (int ph = 0; ph < pooled_height; ++ph) {
420  int hstart = ph * stride_h() - pad_t();
421  int hend = min(hstart + kernel_h(), height);
422  hstart = max(hstart, 0);
423  T Yh = MaxPool<T>::initialize();
424  for (int h = hstart; h < hend; ++h) {
425  MaxPool<T>::process(Xdata[h], Yh);
426  }
427  MaxPool<T>::finalize(hend - hstart, Yh);
428  Ydata[ph] = Yh;
429  }
430  // Do offset.
431  Xdata += height;
432  Ydata += pooled_height;
433  }
434  }
435  break;
436  case 2:
437 #ifdef _OPENMP
438 #pragma omp parallel for
439 #endif
440  for (int n = 0; n < X.dim32(0); ++n) {
441  for (int c = 0; c < channels; ++c) {
442  // Do offset.
443  const T* Xdata_temp = Xdata + height * width * (c + channels * n);
444  T* Ydata_temp =
445  Ydata + pooled_height * pooled_width * (c + channels * n);
446  for (int ph = 0; ph < pooled_height; ++ph) {
447  int hstart = ph * stride_h() - pad_t();
448  int hend = min(hstart + kernel_h(), height);
449  hstart = max(hstart, 0);
450  for (int pw = 0; pw < pooled_width; ++pw) {
451  int wstart = pw * stride_w() - pad_l();
452  int wend = min(wstart + kernel_w(), width);
453  wstart = max(wstart, 0);
454  const int pool_index = ph * pooled_width + pw;
455  T Yh = MaxPool<T>::initialize();
456  for (int h = hstart; h < hend; ++h) {
457  for (int w = wstart; w < wend; ++w) {
458  const int input_index = h * width + w;
459  MaxPool<T>::process(Xdata_temp[input_index], Yh);
460  }
461  }
462  MaxPool<T>::finalize((hend - hstart) * (wend - wstart), Yh);
463  Ydata_temp[pool_index] = Yh;
464  }
465  }
466  }
467  }
468  break;
469  case 3:
470 #ifdef _OPENMP
471 #pragma omp parallel for
472 #endif
473  for (int n = 0; n < X.dim32(0); ++n) {
474  for (int c = 0; c < channels; ++c) {
475  // Do offset.
476  const T* Xdata_temp =
477  Xdata + height * width * depth * (c + channels * n);
478  T* Ydata_temp = Ydata +
479  pooled_height * pooled_width * pooled_depth *
480  (c + channels * n);
481  for (int ph = 0; ph < pooled_height; ++ph) {
482  int hstart = ph * stride_h() - pad_t();
483  int hend = min(hstart + kernel_h(), height);
484  hstart = max(hstart, 0);
485  for (int pw = 0; pw < pooled_width; ++pw) {
486  int wstart = pw * stride_w() - pad_l();
487  int wend = min(wstart + kernel_w(), width);
488  wstart = max(wstart, 0);
489  for (int pd = 0; pd < pooled_depth; ++pd) {
490  int dstart = pd * stride_[2] - pads_[2];
491  int dend = min(dstart + kernel_[2], depth);
492  dstart = max(dstart, 0);
493  const int pool_index =
494  ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
495  T Yh = MaxPool<T>::initialize();
496  for (int h = hstart; h < hend; ++h) {
497  for (int w = wstart; w < wend; ++w) {
498  for (int d = dstart; d < dend; ++d) {
499  const int input_index =
500  h * width * depth + w * depth + d;
501  MaxPool<T>::process(Xdata_temp[input_index], Yh);
502  }
503  }
504  }
506  (hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
507  Ydata_temp[pool_index] = Yh;
508  }
509  }
510  }
511  }
512  }
513  break;
514  default:
515  CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
516  return false;
517  }
518 
519  if (measure_quantization_error_) {
520  // to measure quantization error, run ref impl.
521  Fp32Op_()->DequantizeInput();
522  Fp32Op_()->Get()->RunOnDevice();
523  }
524 
525  RunOnDeviceEpilogue_();
526  return true;
527  }
528 
529  bool RunOnDeviceWithOrderNHWC() override {
530  // max pooling
531  using namespace dnnlowp;
532 
533  this->ParseDNNLowPOperatorArguments_();
534 
535  in_qparams_[0] =
536  GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
537  // Even if there is a pre-chosen quantization parameters for the output,
538  // it is ignored because maxpool output quantization should be same as the
539  // input.
540  out_qparams_ = in_qparams_[0];
541 
542  // Quantize input if needed
543  vector<T> X_temp;
544  const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
545 
546  auto& X = InputTensorCPU_(0);
547  int channels = X.dim32(X.ndim() - 1);
548  auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels);
549  auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>());
550 
551  T* Ydata = GetQuantizedOutputData_();
552 
553  int height = X.dim32(1);
554  int width = this->kernel_.size() > 1 ? X.dim32(2) : 1;
555  int depth = this->kernel_.size() > 2 ? X.dim32(3) : 1;
556  int pooled_height = Y->dim32(1);
557  int pooled_width = this->kernel_.size() > 1 ? Y->dim32(2) : 1;
558  int pooled_depth = this->kernel_.size() > 2 ? Y->dim32(3) : 1;
559 
560  switch (this->kernel_.size()) {
561  case 1:
562 #ifdef _OPENMP
563 #pragma omp parallel for
564 #endif
565  for (int n = 0; n < X.dim32(0); ++n) {
566  const T* Xdata_temp = Xdata + n * height * channels;
567  T* Ydata_temp = Ydata + n * pooled_height * channels;
568  for (int ph = 0; ph < pooled_height; ++ph) {
569  int hstart = ph * stride_h() - pad_t();
570  int hend = min(hstart + kernel_h(), height);
571  hstart = max(hstart, 0);
572  for (int c = 0; c < channels; ++c) {
573  T Yh = MaxPool<T>::initialize();
574  const int pool_idx = ph * channels + c;
575  for (int h = hstart; h < hend; ++h) {
576  const int input_idx = h * channels + c;
577  MaxPool<T>::process(Xdata_temp[input_idx], Yh);
578  }
579  MaxPool<T>::finalize(hend - hstart, Yh);
580  Ydata_temp[pool_idx] = Yh;
581  }
582  }
583  }
584  break;
585  case 2:
586  if (is_same<T, uint8_t>::value) {
587 #ifdef _OPENMP
588 #pragma omp parallel for
589 #endif
590  for (int n = 0; n < X.dim32(0); ++n) {
591  max_pool_avx2(
592  reinterpret_cast<const uint8_t*>(Xdata),
593  n,
594  height,
595  width,
596  channels,
597  pooled_height,
598  pooled_width,
599  kernel_h(),
600  kernel_w(),
601  stride_h(),
602  stride_w(),
603  pad_t(),
604  pad_l(),
605  reinterpret_cast<uint8_t*>(Ydata));
606  }
607  } else {
608 #ifdef _OPENMP
609 #pragma omp parallel for
610 #endif
611  for (int n = 0; n < X.dim32(0); ++n) {
612  const T* Xdata_temp = Xdata + n * height * width * channels;
613  T* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
614  for (int ph = 0; ph < pooled_height; ++ph) {
615  int hstart = ph * stride_h() - pad_t();
616  int hend = min(hstart + kernel_h(), height);
617  hstart = max(hstart, 0);
618  for (int pw = 0; pw < pooled_width; ++pw) {
619  int wstart = pw * stride_w() - pad_l();
620  int wend = min(wstart + kernel_w(), width);
621  wstart = max(wstart, 0);
622  int size = (hend - hstart) * (wend - wstart);
623  for (int c = 0; c < channels; ++c) {
624  T Yh = MaxPool<T>::initialize();
625  const int pool_idx = (ph * pooled_width + pw) * channels + c;
626  for (int h = hstart; h < hend; ++h) {
627  for (int w = wstart; w < wend; ++w) {
628  const int input_idx = (h * width + w) * channels + c;
629  MaxPool<T>::process(Xdata_temp[input_idx], Yh);
630  }
631  }
632  MaxPool<T>::finalize(size, Yh);
633  Ydata_temp[pool_idx] = Yh;
634  }
635  }
636  }
637  }
638  }
639  break;
640  case 3:
641 #ifdef _OPENMP
642 #pragma omp parallel for
643 #endif
644  for (int n = 0; n < X.dim32(0); ++n) {
645  const T* Xdata_temp = Xdata + n * height * width * depth * channels;
646  T* Ydata_temp = Ydata +
647  n * pooled_height * pooled_width * pooled_depth * channels;
648  for (int ph = 0; ph < pooled_height; ++ph) {
649  int hstart = ph * stride_h() - pad_t();
650  int hend = min(hstart + kernel_h(), height);
651  hstart = max(hstart, 0);
652  for (int pw = 0; pw < pooled_width; ++pw) {
653  int wstart = pw * stride_w() - pad_l();
654  int wend = min(wstart + kernel_w(), width);
655  wstart = max(wstart, 0);
656  for (int pd = 0; pd < pooled_depth; ++pd) {
657  int dstart = pd * stride_[2] - pads_[2];
658  int dend = min(dstart + kernel_[2], depth);
659  dstart = max(dstart, 0);
660  int size = (hend - hstart) * (wend - wstart) * (dend - dstart);
661  for (int c = 0; c < channels; ++c) {
662  T Yh = MaxPool<T>::initialize();
663  const int pool_idx =
664  ((ph * pooled_width + pw) * pooled_depth + pd) *
665  channels +
666  c;
667  for (int h = hstart; h < hend; ++h) {
668  for (int w = wstart; w < wend; ++w) {
669  for (int d = dstart; d < dend; ++d) {
670  const int input_idx =
671  ((h * width + w) * depth + d) * channels + c;
672  MaxPool<T>::process(Xdata_temp[input_idx], Yh);
673  }
674  }
675  }
676  MaxPool<T>::finalize(size, Yh);
677  Ydata_temp[pool_idx] = Yh;
678  }
679  }
680  }
681  }
682  }
683  break;
684  default:
685  CAFFE_THROW("Unsupported pooling size : ", this->kernel_.size());
686  return false;
687  }
688 
689  if (measure_quantization_error_) {
690  // to measure quantization error, run ref impl.
691  Fp32Op_()->DequantizeInput();
692  Fp32Op_()->Get()->RunOnDevice();
693  }
694 
695  RunOnDeviceEpilogue_();
696  return true;
697  }
698 }; // class MaxPoolDnnLowPOp
699 
// 8-bit quantized pooling under the DNNLOWP engine.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    AveragePool,
    DNNLOWP,
    AveragePoolDnnLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, DNNLOWP, MaxPoolDnnLowPOp<uint8_t>);

// 16-bit quantized variants under the DNNLOWP_16 engine.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    AveragePool,
    DNNLOWP_16,
    AveragePoolDnnLowPOp<uint16_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    MaxPool,
    DNNLOWP_16,
    MaxPoolDnnLowPOp<uint16_t>);

// Int8-prefixed operator names map to the same 8-bit implementations.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8AveragePool,
    DNNLOWP,
    AveragePoolDnnLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8MaxPool,
    DNNLOWP,
    MaxPoolDnnLowPOp<uint8_t>);
723 
724 } // namespace
725 
726 } // namespace caffe2
// NOTE: documentation-extraction residue followed here ("A global dictionary
// that holds information about what Caffe2 modules have been loaded...",
// Definition: blob.h:13); it is unrelated to this translation unit.