Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_dnnlowp_acc16_op.cc
1 #include "conv_dnnlowp_acc16_op.h"
2 
3 // #define DNNLOWP_ACC16_IN_SLOW_PATH
4 // #define DNNLOWP_MEASURE_TIME_BREAKDOWN
5 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
6 #include <chrono>
7 #endif
8 #ifdef _OPENMP
9 #include <omp.h>
10 #endif
11 
12 #include "dnnlowp_op.h"
13 #include "dnnlowp_partition.h"
14 #include "fbgemm_pack_op.h"
15 #include "im2col_dnnlowp.h"
16 
17 C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);
18 C10_DECLARE_int32(caffe2_dnnlowp_copy_to_32bit_frequency);
19 C10_DECLARE_bool(caffe2_dnnlowp_shared_int32_buffer);
20 
21 // Thresholds to fall back to 32-bit accumulation when 16-bit accumulation
22 // doesn't provide performance benefits.
23 C10_DEFINE_double(
24  caffe2_dnnlowp_acc16_density_threshold,
25  0.05,
26  "If the density of outliers is higher than this, fall back to 32-bit accumulation");
27 C10_DEFINE_int32(
28  caffe2_dnnlowp_acc16_m_threshold,
29  0,
30  "If m is smaller than this, fall back to 32-bit accumulation");
31 C10_DEFINE_int32(
32  caffe2_dnnlowp_acc16_n_threshold,
33  0,
34  "If n is smaller than this, fall back to 32-bit accumulation");
35 C10_DEFINE_int32(
36  caffe2_dnnlowp_acc16_k_threshold,
37  0,
38  "If k is smaller than this, fall back to 32-bit accumulation");
39 
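The DNNLOWP_ACC16 engine accumulates uint8 x int8 products in 16-bit registers and periodically spills the partial sum into a 32-bit accumulator; this is faster than straight 32-bit accumulation but can saturate, and the thresholds above disable the optimization when the GEMM is too small for it to pay off. A minimal scalar sketch of the accumulation scheme, mirroring the DNNLOWP_ACC16_IN_SLOW_PATH reference path later in this file (the production path uses fbgemm's vectorized acc16 kernels):

#include <algorithm>
#include <cstdint>
#include <limits>

// Scalar sketch: accumulate in int16 with saturation and spill the partial
// sum into an int32 accumulator every `copy_to_32bit_frequency` products.
int32_t dot_acc16(const uint8_t* x, const int8_t* w, int k,
                  int copy_to_32bit_frequency) {
  int32_t acc32 = 0;
  int16_t acc16 = 0;
  for (int i = 0; i < k; ++i) {
    int32_t prod = static_cast<int32_t>(x[i]) * static_cast<int32_t>(w[i]);
    // Saturating 16-bit add, as in the reference loops below.
    acc16 = std::max<int32_t>(
        std::numeric_limits<int16_t>::min(),
        std::min<int32_t>(std::numeric_limits<int16_t>::max(), acc16 + prod));
    if (i % copy_to_32bit_frequency == copy_to_32bit_frequency - 1) {
      acc32 += acc16;
      acc16 = 0;
    }
  }
  return acc32 + acc16;
}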
40 namespace caffe2 {
41 
42 using namespace std;
43 
44 template <bool ReluFused>
45 ConvDNNLowPAcc16Op<ReluFused>::ConvDNNLowPAcc16Op(
46  const OperatorDef& operator_def,
47  Workspace* ws)
48  : ConvDNNLowPOp<uint8_t, ReluFused>(operator_def, ws),
49  nbits_in_non_outlier_(this->template GetSingleArgument<int>(
50  "nbits_in_non_outlier",
51  FLAGS_caffe2_dnnlowp_nbits_in_non_outlier)),
52  copy_to_32bit_frequency_(this->template GetSingleArgument<int>(
53  "copy_to_32bit_frequency",
54  FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency)) {
55  if (nbits_in_non_outlier_ == 0) {
56  LOG(INFO) << "nbits_in_non_outlier == 0 means everything is an outlier so we "
57  "fall back to acc32";
58  fallback_to_32_bit_accumulation_ = true;
59  }
60 }
61 
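The constructor pulls nbits_in_non_outlier and copy_to_32bit_frequency from the operator's arguments, falling back to the gflags declared above. Below is a hedged sketch of an OperatorDef that would reach this operator through the DNNLOWP_ACC16 engine registered at the bottom of this file; blob names, argument values, and the proto header path are illustrative only.

#include "caffe2/proto/caffe2_pb.h"

// Illustrative only: builds an Int8Conv OperatorDef that selects the
// DNNLOWP_ACC16 engine and passes the two arguments this constructor reads.
caffe2::OperatorDef MakeAcc16ConvDef() {
  caffe2::OperatorDef def;
  def.set_type("Int8Conv");
  def.set_engine("DNNLOWP_ACC16");
  def.add_input("X_q");  // quantized activations (uint8, NHWC)
  def.add_input("W_q");  // quantized (optionally pre-packed) weights
  def.add_input("b_q");  // quantized bias
  def.add_output("Y_q");

  auto add_int_arg = [&](const char* name, int64_t value) {
    auto* arg = def.add_arg();
    arg->set_name(name);
    arg->set_i(value);
  };
  add_int_arg("kernel", 3);
  add_int_arg("nbits_in_non_outlier", 6);      // read by this constructor
  add_int_arg("copy_to_32bit_frequency", 32);  // read by this constructor

  auto* order = def.add_arg();
  order->set_name("order");
  order->set_s("NHWC");  // the fast acc16 path requires NHWC
  return def;
}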
62 template <bool ReluFused>
63 bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
64  if (fallback_to_32_bit_accumulation_) {
65  return true;
66  }
67 
68  if (!BaseType::GetQuantizationParameters_()) {
69  return false;
70  }
71 
72  if (!Wq_acc16_packed_ &&
73  this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
74  CAFFE_ENFORCE_EQ(
75  this->order_,
76  StorageOrder::NHWC,
77  "Pre-packed weight only works with NHWC layout");
78  // If the input is already packed
79  const auto& packed_filter =
80  this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
81  Wq_outlier_ = packed_filter.W_outlier;
82  Wq_acc16_packed_ = packed_filter.W_acc16;
83 
84  if (nbits_in_non_outlier_ != packed_filter.nbits_in_non_outlier) {
85  LOG(WARNING)
86  << "nbits_in_non_outlier in packed weight "
87  << packed_filter.nbits_in_non_outlier
88  << " doesn't match with nbits_in_non_outlier specified in operator "
89  << nbits_in_non_outlier_;
90  }
91 
92  first_invocation_ = false;
93  return true;
94  }
95 
96  int kernel_dim = this->KernelDim_();
97  const auto& filter = InputTensorCPU_(FILTER);
98  int num_out_channels = filter.dim32(0);
99 
100  // Check if we should fall back to 32-bit accumulation
101  if (this->order_ == StorageOrder::NHWC) {
102  const Tensor& X = InputTensorCPU_(INPUT);
103  int N = X.dim32(0);
104 
105  auto sizes = this->GetOutputSize(X, filter.dim32(0));
106  Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
107  const int output_image_size = this->GetDimsSize(*Y);
108 
109  if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) {
110  LOG(INFO) << "M " << N * output_image_size
111  << " of Conv layer with weight blob "
112  << this->debug_def().input(1) << " is smaller than threshold "
113  << FLAGS_caffe2_dnnlowp_acc16_m_threshold
114  << " . Falling back to acc32";
115  fallback_to_32_bit_accumulation_ = true;
116  return true;
117  }
118  if (num_out_channels / group_ < FLAGS_caffe2_dnnlowp_acc16_n_threshold) {
119  LOG(INFO) << "N " << num_out_channels / group_
120  << " of Conv layer with weight blob "
121  << this->debug_def().input(1) << " is smaller than threshold "
122  << FLAGS_caffe2_dnnlowp_acc16_n_threshold
123  << " . Falling back to acc32";
124  fallback_to_32_bit_accumulation_ = true;
125  return true;
126  }
127  if (kernel_dim < FLAGS_caffe2_dnnlowp_acc16_k_threshold) {
128  LOG(INFO) << "K " << kernel_dim << " of Conv layer with weight blob "
129  << this->debug_def().input(1) << " is smaller than threshold "
130  << FLAGS_caffe2_dnnlowp_acc16_k_threshold
131  << " . Falling back to acc32";
132  fallback_to_32_bit_accumulation_ = true;
133  return true;
134  }
135  }
136 
137  // Separate out outliers
138  if (!Wq_outlier_ && this->order_ == StorageOrder::NHWC &&
139  nbits_in_non_outlier_ < 8) {
140  CAFFE_ENFORCE(!W_quantized_.empty());
141 
142  Wq_outlier_.reset(ExtractOutlierMatrix(
143  group_,
144  kernel_dim,
145  num_out_channels,
146  nbits_in_non_outlier_,
147  W_quantized_));
148  int outlier_cnt = Wq_outlier_->ColPtr()[num_out_channels];
149 
150  LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
151  << this->debug_def().input(1) << " is "
152  << static_cast<float>(outlier_cnt) / W_quantized_.size();
153  LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_
154  << " copy_to_32bit_frequency " << copy_to_32bit_frequency_;
155 
156  if (static_cast<float>(outlier_cnt) / W_quantized_.size() >
157  FLAGS_caffe2_dnnlowp_acc16_density_threshold) {
158  LOG(INFO) << "Density of outliers is higher than threshold "
159  << FLAGS_caffe2_dnnlowp_acc16_density_threshold
160  << " . Falling back to acc32";
161  fallback_to_32_bit_accumulation_ = true;
162  Wq_outlier_.reset();
163  return true;
164  }
165  }
166 
167  bool packW = this->order_ == StorageOrder::NHWC && GetCpuId().avx2();
168 
169  if (first_invocation_) {
170  if (!packW) {
171  string reason;
172  if (this->order_ != StorageOrder::NHWC) {
173  reason = "fbgemm only supports NHWC layout";
174  } else if (!GetCpuId().avx2()) {
175  reason = "fbgemm only supports AVX2+";
176  } else {
177  assert(false);
178  }
179 
180  if (!reason.empty()) {
181  static int log_occurences = 0;
182  if (log_occurences < 32) {
183  ++log_occurences;
184  LOG(WARNING) << "Conv with weight " << this->debug_def().input(FILTER)
185  << " falls back to slow path because " << reason;
186  }
187  }
188  }
189  if (nbits_in_non_outlier_ < 8 && this->order_ != StorageOrder::NHWC) {
190  static int log_occurences = 0;
191  if (log_occurences < 32) {
192  ++log_occurences;
193  LOG(WARNING) << "Outlier-aware quantization only supports "
194  "NHWC layout";
195  }
196  }
197  first_invocation_ = false;
198  }
199 
200  if (packW && !Wq_acc16_packed_) {
201  Wq_acc16_packed_.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
202  fbgemm::matrix_op_t::Transpose,
203  group_ * kernel_dim,
204  num_out_channels / group_,
205  W_quantized_.data(),
206  kernel_dim, // ld
207  nullptr, // pmat
208  group_));
209  vector<int8_t>().swap(W_quantized_);
210  }
211 
212  return true;
213 }
214 
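When nbits_in_non_outlier_ < 8, ExtractOutlierMatrix splits the quantized weights: values that fit in the reduced bit width stay in the dense matrix that gets packed for 16-bit accumulation, while the remaining "outlier" values are moved to a sparse matrix (and zeroed in the dense copy) so they can be applied later with a 32-bit sparse multiply. A hedged sketch of that split; the real routine returns an fbgemm::CompressedSparseColumn, whereas this illustration uses a plain (index, value) list:

#include <cstdint>
#include <utility>
#include <vector>

// Hedged sketch: move weights that do not fit in `nbits` signed bits into a
// sparse list and zero them in the dense matrix used for the acc16 GEMM.
std::vector<std::pair<int, int8_t>> split_outliers(
    std::vector<int8_t>& w_dense, int nbits) {
  const int max_val = (1 << (nbits - 1)) - 1;
  const int min_val = -(1 << (nbits - 1));
  std::vector<std::pair<int, int8_t>> outliers;  // (flat index, value)
  for (int i = 0; i < static_cast<int>(w_dense.size()); ++i) {
    if (w_dense[i] > max_val || w_dense[i] < min_val) {
      outliers.emplace_back(i, w_dense[i]);
      w_dense[i] = 0;  // the outlier's full value lives in the sparse part
    }
  }
  return outliers;
}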
215 template <bool ReluFused>
216 bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
217  VLOG(2) << "Running DNNLOWP_ACC16 Conv";
218 
219  using namespace dnnlowp;
220 
221  // Get quantization parameters
222  if (!GetQuantizationParameters_()) {
223  return false;
224  }
225  if (fallback_to_32_bit_accumulation_) {
226  return BaseType::RunOnDeviceWithOrderNCHW();
227  }
228 
229  const Tensor& X = InputTensorCPU_(INPUT);
230  auto& filter = InputTensorCPU_(FILTER);
231  const int N = X.dim32(0), C = X.dim32(1);
232  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
233  const int M = filter.dim32(0);
234  CAFFE_ENFORCE_EQ(
235  C,
236  filter.dim32(1) * group_,
237  "Convolution op: input channels do not match: # of input channels ",
238  C,
239  " is not equal to kernel channels * group:",
240  filter.dim32(1),
241  "*",
242  group_);
243  CAFFE_ENFORCE_EQ(
244  M % group_,
245  0,
246  "The number of output channels is not divisible by group.");
247 
248  auto sizes = this->GetOutputSize(X, filter.dim32(0));
249  Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
250 
251  const vector<int> input_dims = GetDims(X);
252  const vector<int> output_dims = GetDims(*Y);
253  const int input_image_size = this->GetDimsSize(X);
254  const int output_image_size = this->GetDimsSize(*Y);
255 
256  // The dimension of each kernel
257  const int kernel_dim = this->KernelDim_();
258 
259  vector<int> img_shape;
260  img_shape.assign(X.sizes().begin() + 1, X.sizes().end());
261 
262  vector<int> buffer_shape;
263  buffer_shape.push_back(kernel_dim);
264  buffer_shape.insert(
265  buffer_shape.end(), output_dims.begin(), output_dims.end());
266  buffer_shape.insert(buffer_shape.begin(), dnnlowp_get_max_threads());
267 
268  if (this->kernel_.size() != 2) {
269  SetDeviceTensor(img_shape, &(this->img_shape_device_));
270  SetDeviceTensor(buffer_shape, &(this->col_buffer_shape_device_));
271  }
272 
273  const int col_buffer_size = kernel_dim * output_image_size;
274 
275  // The offset corresponding to a single input image, and a single output
276  // image.
277  const int input_offset = C / group_ * input_image_size;
278 
279  // The col buffer is stored in CHW order as well - kernel_dim, and the
280  // height and width.
281  const uint8_t* Xdata = X.template data<uint8_t>();
282 
283  auto f = [&](Tensor* col_buffer, vector<int32_t>* Y_int32) {
284  col_buffer->Resize(buffer_shape);
285  uint8_t* col_buffer_data = col_buffer->template mutable_data<uint8_t>();
286 
287  Y_int32->resize(M * output_image_size * dnnlowp_get_max_threads());
288  vector<int> buffer_shape_per_thread(
289  buffer_shape.begin() + 1, buffer_shape.end());
290 
291  // Im2Col, followed by gemm.
292  uint8_t* Y_data = Y->template mutable_data<uint8_t>();
293  this->column_offsets_->resize(
294  output_image_size * dnnlowp_get_max_threads());
295 
296 #ifdef _OPENMP
297 #pragma omp parallel for
298 #endif
299  for (int image_id = 0; image_id < N; ++image_id) {
300  int tid = dnnlowp_get_thread_num();
301  for (int group_id = 0; group_id < group_; ++group_id) {
302  if (this->kernel_.size() == 2) {
303  math::Im2ColNCHW<uint8_t>(
304  C / group_,
305  input_dims[0],
306  input_dims[1],
307  kernel_h(),
308  kernel_w(),
309  dilation_h(),
310  dilation_w(),
311  pad_t(),
312  pad_l(),
313  pad_b(),
314  pad_r(),
315  stride_h(),
316  stride_w(),
317  Xdata + (group_ * image_id + group_id) * input_offset,
318  col_buffer_data + tid * col_buffer_size,
319  &context_,
320  in_qparams_[INPUT].zero_point);
321  } else {
322  math::Im2ColNdNCHW<uint8_t>(
323  this->kernel_.size(),
324  C * input_image_size,
325  col_buffer_size,
326  img_shape.data(),
327  buffer_shape_per_thread.data(),
328  this->kernel_.data(),
329  this->stride_.data(),
330  this->dilation_.data(),
331  this->pads_.data(),
332  Xdata + (group_ * image_id + group_id) * input_offset,
333  col_buffer_data + tid * col_buffer_size,
334  &context_,
335  in_qparams_[INPUT].zero_point);
336  }
337 
338  // quantize col_buffer
339  uint8_t* col_buffer_private = col_buffer_data + tid * col_buffer_size;
340 
341  // main GEMM
342  int32_t* Y_int32_temp = Y_int32->data() +
343  ((M / group_) * group_id + M * tid) * output_image_size;
344  int8_t* W_quantized_group =
345  W_quantized_.data() + (M / group_) * group_id * kernel_dim;
346 
347  static int log_occurences = 0;
348  if (log_occurences < 32) {
349  ++log_occurences;
350  LOG(WARNING)
351  << "Consider using DNNLOWP instead of DNNLOWP_ACC16 engine since "
352  "we're falling back to a slow path because of NCHW layout";
353  }
354 
355  for (int i = 0; i < M / group_; ++i) {
356  for (int j = 0; j < output_image_size; ++j) {
357  int32_t int32_sum = 0;
358  int16_t int16_sum = 0;
359  for (int k = 0; k < kernel_dim; ++k) {
360  int32_t w = W_quantized_group[i * kernel_dim + k];
361  int32_t x = col_buffer_private[k * output_image_size + j];
362 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
363  int16_sum = std::max<int32_t>(
364  numeric_limits<int16_t>::min(),
365  std::min<int32_t>(
366  numeric_limits<int16_t>::max(), int16_sum + x * w));
367  if (k % copy_to_32bit_frequency_ ==
368  copy_to_32bit_frequency_ - 1) {
369  int32_sum += int16_sum;
370  int16_sum = 0;
371  }
372 #else
373  int32_sum += w * x;
374 #endif
375  }
376  Y_int32_temp[i * output_image_size + j] = int32_sum + int16_sum;
377  }
378  }
379 
380  this->RunOnDeviceEpilogueNCHW_(
381  col_buffer_private,
382  Y_int32_temp,
383  Y_data + (M * image_id + M / group_ * group_id) * output_image_size,
384  M / group_ * group_id,
385  group_id);
386  } // for each group
387  } // for each image_id
388  }; // f
389 
390  this->RunWithSharedBuffer_(&col_buffer_, &(this->Y_int32_), f);
391 
392  PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
393 
394  this->MeasureQuantizationError_();
395 
396  return true;
397 } // RunOnDeviceWithOrderNCHW
398 
399 static void conv_nhwc_acc16_ref_(
400  int num_groups,
401  int N,
402  int output_image_size,
403  int M,
404  int kernel_dim,
405  const uint8_t* col_buffer,
406  const int8_t* W,
407  int32_t* Y
408 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
409  ,
410  OperatorBase* op
411 #endif
412 ) {
413 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
414  uint64_t underflow_cnt = 0, overflow_cnt = 0;
415 #endif
416  for (int group_id = 0; group_id < num_groups; ++group_id) {
417  for (int i = 0; i < N * output_image_size; ++i) {
418  for (int j = 0; j < M / num_groups; ++j) {
419  int32_t int32_sum = 0;
420  int16_t int16_sum = 0;
421 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
422  bool overflowed = false, underflowed = false;
423 #endif
424  for (int k = 0; k < kernel_dim; ++k) {
425  int32_t x = col_buffer[(i * num_groups + group_id) * kernel_dim + k];
426  int32_t w = W[(group_id * (M / num_groups) + j) * kernel_dim + k];
427 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
428  if (!overflowed && !underflowed) {
429  if (int16_sum + x * w > numeric_limits<int16_t>::max()) {
430  overflowed = true;
431  } else if (int16_sum + x * w < numeric_limits<int16_t>::min()) {
432  underflowed = true;
433  }
434  }
435 
436  int16_sum = std::max<int32_t>(
437  numeric_limits<int16_t>::min(),
438  std::min<int32_t>(
439  numeric_limits<int16_t>::max(), int16_sum + x * w));
440  if (k % FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency == FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency - 1) {
441  int32_sum += int16_sum;
442  int16_sum = 0;
443  }
444 #else
445  int32_sum += x * w;
446 #endif
447  }
448  Y[i * M + group_id * (M / num_groups) + j] = int32_sum + int16_sum;
449 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
450  if (overflowed) {
451  ++overflow_cnt;
452  } else if (underflowed) {
453  ++underflow_cnt;
454  }
455 #ifdef DNNLOWP_DETAILED_LOG_IN_ACC16_SLOW_PATH
456  if (overflowed || underflowed) {
457  int32_t sum = 0;
458  for (int k = 0; k < kernel_dim; ++k) {
459  int32_t x =
460  col_buffer[(i * num_groups + group_id) * kernel_dim + k];
461  int32_t w = W[(group_id * (M / num_groups) + j) * kernel_dim + k];
462  LOG(INFO) << k << ": " << sum << " + " << x << " * " << w << " = "
463  << sum + x * w;
464  sum += x * w;
465  }
466  }
467 #endif
468 #endif
469  }
470  }
471  } // for each group
472 
473 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
474  LOG(INFO) << op->debug_def().input(1) << " underflow_cnt " << underflow_cnt
475  << " (" << (float)underflow_cnt / (N * output_image_size * M) * 100
476  << ") overflow_cnt " << overflow_cnt << " ("
477  << (float)overflow_cnt / (N * output_image_size * M) * 100 << ")";
478 #endif
479 }
480 
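For intuition on why the reference loop above tracks saturation: a uint8 activation (at most 255) times an int8 weight (magnitude up to 128) can reach 32640 in magnitude, which already nearly fills the int16 range of [-32768, 32767], so a second worst-case product overflows immediately. Restricting non-outlier weights to, say, 6 signed bits (magnitude at most 32) caps a product at 255 * 32 = 8160, leaving room for roughly 32767 / 8160 ≈ 4 worst-case products and far more for typical small values; copy_to_32bit_frequency and the outlier separation exist to keep accumulation runs inside that headroom. (Illustrative numbers only; the vectorized fbgemm kernels accumulate with their own blocking.)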
481 template <bool ReluFused>
482 template <fbgemm::QuantizationGranularity Q_GRAN>
483 void ConvDNNLowPAcc16Op<ReluFused>::DispatchFBGEMM_(
484  fbgemm::PackAWithRowOffset<uint8_t, int16_t>& packA,
485  const uint8_t* col_buffer_data,
486  vector<int32_t>* Y_int32,
487  uint8_t* Y_uint8_data) {
488  // This function is called within an OpenMP region
489  auto& filter = InputTensorCPU_(FILTER);
490  const int M = filter.dim32(0);
491 
492  assert(Wq_acc16_packed_.get());
493  int kernel_dim = this->KernelDim_();
494 
495  int nthreads = dnnlowp_get_num_threads();
496  int tid = dnnlowp_get_thread_num();
497 
498  using namespace fbgemm;
499  DoNothing<> doNothingObj{};
500  ReQuantizeOutput<ReluFused, Q_GRAN> reqObj(
501  doNothingObj,
502  this->requantization_multipliers_.data(),
503  out_qparams_.zero_point,
504  in_qparams_[INPUT].zero_point,
505  this->filter_zero_points_.data(),
506  packA.getRowOffsetBuffer(),
507  this->column_offsets_->data(),
508  InputSize() == 3 ? this->b_quantized_data_ : nullptr,
509  M,
510  group_);
511 
512  if (nbits_in_non_outlier_ < 8) {
513  DoSpmdmOnInpBuffer<
514  typename ReQuantizeOutput<ReluFused>::outType,
515  int32_t,
516  ReQuantizeOutput<ReluFused, Q_GRAN>>
517  spmdmObj(
518  reqObj, col_buffer_data, group_ * kernel_dim, *Wq_outlier_, group_);
519 
520  fbgemmPacked(
521  packA,
522  *Wq_acc16_packed_,
523  Y_uint8_data,
524  Y_int32->data(),
525  M,
526  spmdmObj,
527  tid,
528  nthreads);
529  } else {
530  fbgemmPacked(
531  packA,
532  *Wq_acc16_packed_,
533  Y_uint8_data,
534  Y_int32->data(),
535  M,
536  reqObj,
537  tid,
538  nthreads);
539  }
540 }
541 
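DispatchFBGEMM_ hands the packed matrices to fbgemmPacked together with a ReQuantizeOutput epilogue, which turns the corrected 32-bit accumulators back into uint8 outputs. A hedged scalar sketch of that requantization for per-tensor granularity; the exact row/column offset bookkeeping and per-group multipliers follow fbgemm's conventions and are only summarized here:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hedged sketch of requantization: undo the zero points, add bias, scale by
// the requantization multiplier, shift to the output zero point, and clamp.
uint8_t requantize(
    int32_t acc,      // raw accumulator: sum_k Xq[k] * Wq[k]
    int32_t x_sum,    // sum_k Xq[k] (the role of the row-offset buffer)
    int32_t w_sum,    // sum_k Wq[k] (the role of the column offsets)
    int k,            // reduction length
    int32_t x_zp, int32_t w_zp, int32_t y_zp,
    int32_t bias, float multiplier, bool relu_fused) {
  // sum_k (Xq[k] - x_zp) * (Wq[k] - w_zp), expanded.
  int32_t corrected =
      acc - w_zp * x_sum - x_zp * w_sum + k * x_zp * w_zp + bias;
  int32_t y =
      y_zp + static_cast<int32_t>(std::nearbyint(multiplier * corrected));
  int32_t lo = relu_fused ? y_zp : 0;  // fused ReLU clamps at the zero point
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(lo, y)));
}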
542 template <bool ReluFused>
543 void ConvDNNLowPAcc16Op<ReluFused>::ConvOutlier_(
544  const uint8_t* col_buffer,
545  vector<int32_t>* Y_int32) {
546  if (nbits_in_non_outlier_ < 8) {
547  const Tensor& X = InputTensorCPU_(INPUT);
548  auto& filter = InputTensorCPU_(FILTER);
549  Tensor* Y = OutputTensorCPU_(0);
550  const int N = X.dim32(0);
551  const int M = filter.dim32(0);
552 
553  const int kernel_dim = this->KernelDim_();
554  const int output_image_size = this->GetDimsSize(*Y);
555 
556 #ifdef _OPENMP
557 #pragma omp parallel
558 #endif
559  {
560  int group_begin, group_end, i_begin, i_end;
561  this->PartitionGroupedNHWCConv_(
562  &group_begin,
563  &group_end,
564  &i_begin,
565  &i_end,
566  group_,
567  N * output_image_size,
568  dnnlowp_get_num_threads(),
569  dnnlowp_get_thread_num());
570 
571  for (int group_id = group_begin; group_id < group_end; ++group_id) {
572  CAFFE_ENFORCE_EQ(Wq_outlier_->NumOfRows(), kernel_dim);
573  // Dense-matrix times sparse-matrix multiplication for outlier
574  fbgemm::block_type_t block = {
575  0, i_end - i_begin, group_id * (M / group_), M / group_};
576  Wq_outlier_->SpMDM(
577  block,
578  col_buffer + (i_begin * group_ + group_id) * kernel_dim,
579  group_ * kernel_dim,
580  true /* accumulate */,
581  Y_int32->data() + i_begin * M + group_id * (M / group_),
582  M);
583  }
584  }
585  }
586 }
587 
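ConvOutlier_ adds the contribution of the sparse outlier weights on top of the 16-bit GEMM result, accumulating into the int32 output before the epilogue runs. A scalar sketch of that dense-times-sparse accumulation for the group_ == 1 case, using a per-output-channel (index, value) outlier list in the spirit of the earlier sketch; the real code calls fbgemm::CompressedSparseColumn::SpMDM on a partition of rows and groups:

#include <cstdint>
#include <utility>
#include <vector>

// Hedged sketch: for each output channel j, add the products of its outlier
// weights with the corresponding im2col entries into the int32 output.
void conv_outlier_ref(
    int num_rows,    // N * output_image_size
    int kernel_dim,
    int M,           // number of output channels
    const uint8_t* col_buffer,  // num_rows x kernel_dim (NHWC im2col)
    const std::vector<std::vector<std::pair<int, int8_t>>>& outliers,
    int32_t* Y_int32) {  // num_rows x M, already holds the dense acc16 result
  for (int i = 0; i < num_rows; ++i) {
    for (int j = 0; j < M; ++j) {
      int32_t acc = 0;
      for (const auto& kv : outliers[j]) {  // kv = (k within kernel_dim, weight)
        acc += static_cast<int32_t>(col_buffer[i * kernel_dim + kv.first]) *
            static_cast<int32_t>(kv.second);
      }
      Y_int32[i * M + j] += acc;  // accumulate on top of the acc16 GEMM
    }
  }
}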
588 template <bool ReluFused>
589 bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() {
590  CAFFE_ENFORCE_LE(
591  this->kernel_.size(),
592  3,
593  "Only 1-3d convolution is supported for NHWC storage type");
594 
595  using namespace dnnlowp;
596 
597 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
598  chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
599 
600  t_begin = chrono::system_clock::now();
601  t_very_begin = t_begin;
602 #endif
603 
604  // Get quantization parameters
605  if (!GetQuantizationParameters_()) {
606  return false;
607  }
608 
609  if (fallback_to_32_bit_accumulation_) {
610  return BaseType::RunOnDeviceWithOrderNHWC();
611  }
612 
613 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
614  t_end = chrono::system_clock::now();
615  double dt = chrono::duration<double>(t_end - t_begin).count();
616  LOG(INFO) << "this=" << this << " get_quant_params: " << dt * 1e3 << " ms";
617 #endif
618 
619  const Tensor& X = InputTensorCPU_(INPUT);
620  auto& filter = InputTensorCPU_(FILTER);
621  const int N = X.dim32(0), C = X.dim32(X.ndim() - 1);
622 
623  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
624  const int M = filter.dim32(0);
625  CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
626 
627  auto sizes = this->GetOutputSize(X, filter.dim32(0));
628  Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>());
629  // The dimension of each kernel
630  const int kernel_dim = this->KernelDim_();
631  // The output image size is the spatial size of the output.
632  const int output_image_size = this->GetDimsSize(*Y);
633  // The col buffer is stored in HWC order as well - kernel_dim, and the height
634  // and width.
635 
636  auto f = [&](Tensor* col_buffer, vector<int32_t>* Y_int32) {
637  Y_int32->resize(Y->numel());
638 
639 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
640  t_begin = chrono::system_clock::now();
641 #endif
642 
643  bool no_im2col = this->NoIm2ColNHWC_();
644 
645  // Im2Col, followed by gemm.
646  const uint8_t* Xdata = X.template data<uint8_t>();
647  const uint8_t* col_buffer_data =
648  no_im2col ? Xdata : this->Im2ColNHWC_(col_buffer);
649 
650 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
651  t_end = chrono::system_clock::now();
652  dt = chrono::duration<double>(t_end - t_begin).count();
653  LOG(INFO) << "this=" << this << " im2col: " << dt * 1e3 << " ms";
654  t_begin = chrono::system_clock::now();
655 #endif
656 
657  using namespace fbgemm;
658  int row_offset_size_per_thread = -1;
659  int x_pack_buf_size_per_thread = -1;
660  if (Wq_acc16_packed_) {
661  row_offset_size_per_thread =
662  PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
663  x_pack_buf_size_per_thread =
664  PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
665  row_offsets_.resize(
666  dnnlowp_get_max_threads() * row_offset_size_per_thread);
667  X_pack_buf_.resize(
668  dnnlowp_get_max_threads() * x_pack_buf_size_per_thread);
669  }
670 
671  uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>();
672 
673  // Main GEMM for non-outlier
674  if (Wq_acc16_packed_)
675 #ifdef _OPENMP
676 #pragma omp parallel
677 #endif
678  {
679  // fast path
680  int tid = dnnlowp_get_thread_num();
681 
682  // no im2col fusion
683  PackAWithRowOffset<uint8_t, int16_t> packA(
684  matrix_op_t::NoTranspose,
685  N * output_image_size,
686  group_ * kernel_dim,
687  col_buffer_data,
688  group_ * kernel_dim,
689  X_pack_buf_.data() + tid * x_pack_buf_size_per_thread,
690  group_,
691  row_offsets_.data() + tid * row_offset_size_per_thread);
692 
693  if (this->quantize_groupwise_) {
694  DispatchFBGEMM_<QuantizationGranularity::GROUP>(
695  packA, col_buffer_data, Y_int32, Y_uint8_data);
696  } else {
697  DispatchFBGEMM_<QuantizationGranularity::TENSOR>(
698  packA, col_buffer_data, Y_int32, Y_uint8_data);
699  }
700  } else {
701  // slow path
702  conv_nhwc_acc16_ref_(
703  group_,
704  N,
705  output_image_size,
706  M,
707  kernel_dim,
708  col_buffer_data,
709  W_quantized_.data(),
710  Y_int32->data()
711 #ifdef DNNLOWP_ACC16_IN_SLOW_PATH
712  ,
713  this
714 #endif
715  );
716  } // slow path
717 
718 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
719  t_end = chrono::system_clock::now();
720  dt = chrono::duration<double>(t_end - t_begin).count();
721  double ops = 2. * N * output_image_size * M * kernel_dim;
722  double gops = ops / dt / 1e9;
723  LOG(INFO) << "this=" << this << " GEMM: " << dt * 1e3 << " ms " << gops
724  << " gops";
725  t_begin = chrono::system_clock::now();
726 #endif
727 
728  if (!Wq_acc16_packed_) {
729  ConvOutlier_(col_buffer_data, Y_int32);
730  }
731 
732 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
733  t_end = chrono::system_clock::now();
734  dt = chrono::duration<double>(t_end - t_begin).count();
735  LOG(INFO) << "this=" << this << " out-lier: " << dt * 1e3 << " ms";
736  t_begin = chrono::system_clock::now();
737 #endif
738 
739  if (!Wq_acc16_packed_) {
740  this->RunOnDeviceEpilogueNHWC_(col_buffer_data, Y_int32->data());
741  } else {
742  PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
743  }
744  }; // f
745 
746  this->RunWithSharedBuffer_(&col_buffer_, &(this->Y_int32_), f);
747 
748 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
749  t_end = chrono::system_clock::now();
750  dt = chrono::duration<double>(t_end - t_begin).count();
751  LOG(INFO) << "this=" << this << " prologue: " << dt * 1e3 << " ms";
752  t_begin = chrono::system_clock::now();
753 
754  t_end = chrono::system_clock::now();
755  dt = chrono::duration<double>(t_end - t_very_begin).count();
756  double ops = 2. * N * output_image_size * M * kernel_dim;
757  double gops = ops / dt / 1e9;
758  LOG(INFO) << "this=" << this << " " << this->debug_def().type()
759  << " output=" << this->debug_def().output(0) << " "
760  << N * output_image_size << "x" << M << "x" << kernel_dim
761  << " G=" << group_ << " C/G=" << C / group_ << " K/G=" << M / group_
762  << " R=" << kernel_h() << " S=" << kernel_w() << " : " << dt * 1e3
763  << " ms " << gops << " gops";
764 #endif
765 
766  this->MeasureQuantizationError_();
767 
768  return true;
769 }
770 
771 REGISTER_CPU_OPERATOR_WITH_ENGINE(
772  Conv,
773  DNNLOWP_ACC16,
774  ConvDNNLowPAcc16Op<false>);
775 REGISTER_CPU_OPERATOR_WITH_ENGINE(
776  ConvRelu,
777  DNNLOWP_ACC16,
778  ConvDNNLowPAcc16Op<true>);
779 
780 REGISTER_CPU_OPERATOR_WITH_ENGINE(
781  Int8Conv,
782  DNNLOWP_ACC16,
783  ConvDNNLowPAcc16Op<false>);
784 REGISTER_CPU_OPERATOR_WITH_ENGINE(
785  Int8ConvRelu,
786  DNNLOWP_ACC16,
787  ConvDNNLowPAcc16Op<true>);
788 
789 } // namespace caffe2