Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_pool_dnnlowp_op_base.h
#pragma once

#ifdef _OPENMP
#include <omp.h>
#endif

#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/op_wrapper.h"

#ifdef _OPENMP
C10_DECLARE_int(caffe2_omp_num_threads);
#endif
C10_DECLARE_bool(caffe2_dnnlowp_shared_int32_buffer);
C10_DECLARE_bool(caffe2_force_shared_col_buffer);

namespace caffe2 {

// TODO: code duplication with dnnlowp_op.h
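// Common base for DNNLOWP (quantized) convolution and pooling operators on
// CPU. On top of ConvPoolOpBase it keeps per-input/output quantization
// parameters, wraps a reference FP32 operator (OpWrapper) for dynamic
// quantization and error measurement, and provides helpers for shared
// column / int32 accumulation buffers.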
template <typename T, typename FP32_OP>
class ConvPoolDNNLowPOpBase : public ConvPoolOpBase<CPUContext> {
  static_assert(std::is_integral<T>::value, "Integral required.");

 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  ConvPoolDNNLowPOpBase(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        in_qparams_(InputSize()),
        qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {
#ifdef _OPENMP
    if (FLAGS_caffe2_omp_num_threads > 0) {
      omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
    }
#endif

    if (this->debug_def().engine() == "DNNLOWP_16" ||
        this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
      LOG(WARNING)
          << this->debug_def().engine()
          << " is an experimental feature mostly for testing accuracy with "
             "fixed-point precision higher than 8 and performance is very slow";
    }
  }

  virtual ~ConvPoolDNNLowPOpBase() {
    if (measure_quantization_error_) {
      dnnlowp::ReportQuantizationError(this, quantization_error_stats_);
      LOG(WARNING) << this->debug_def().output(0) << " with type "
                   << this->debug_def().type() << " has output qparams : "
                   << "scale " << out_qparams_.scale << " offset "
                   << out_qparams_.zero_point << "; ";
    }
  }

 protected:
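  // Returns the plain CPU tensor backing input idx, whether the input is an
  // Int8TensorCPU, a packed weight blob, or a regular Tensor.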
  const TensorCPU& InputTensorCPU_(int idx) {
    if (InputIsType<int8::Int8TensorCPU>(idx)) {
      return this->Input<int8::Int8TensorCPU>(idx).t;
    } else if (InputIsType<Int8ConvDNNLowPPackedWeightBlob>(idx)) {
      return this->Input<Int8ConvDNNLowPPackedWeightBlob>(idx).original_tensor;
    } else {
      return Input(idx);
    }
  }

  TensorCPU* OutputTensorCPU_(int idx) {
    return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
  }

  Tensor*
  OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) {
    auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
    ReinitializeTensor(t, dims, options.device(CPU));
    return t;
  }

  T* GetQuantizedOutputData_() {
    return OutputTensorCPU_(0)->template mutable_data<T>();
  }

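  // Dequantizes the quantized output (if needed) and compares it against the
  // FP32 reference output produced by Fp32Op_(), accumulating the result in
  // quantization_error_stats_.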
  void MeasureQuantizationError_() {
    if (!measure_quantization_error_ || !Fp32Op_()) {
      return;
    }

    const float* actual = nullptr;
    vector<float> actual_temp;
    if (OutputTensorCPU_(0)->template IsType<float>()) {
      actual = OutputTensorCPU_(0)->template data<float>();
    } else {
      actual_temp.resize(OutputTensorCPU_(0)->numel());
      fbgemm::Dequantize<T>(
          OutputTensorCPU_(0)->template data<T>(),
          actual_temp.data(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
      actual = actual_temp.data();
    }

    TensorCPU* float_tensor = Fp32Op_()->Get()->Output(0);
    float* ref = float_tensor->template mutable_data<float>();
    if (followed_by_ == "Relu" || debug_def().type() == "ConvRelu" ||
        debug_def().type() == "Int8ConvRelu") {
      for (int i = 0; i < OutputTensorCPU_(0)->numel(); ++i) {
        ref[i] = std::max(0.f, ref[i]);
      }
    }

    dnnlowp::MeasureQuantizationError(
        actual, ref, OutputTensorCPU_(0)->numel(), &quantization_error_stats_);
  }

  void RunOnDeviceEpilogue_() {
    dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);

    MeasureQuantizationError_();
  }

  void ParseDNNLowPOperatorArguments_() {
    if (!arguments_parsed_) {
      bool dequantize_output;
      dnnlowp::ParseDNNLowPOperatorArguments(
          this,
          &dequantize_output,
          &measure_quantization_error_,
          &followed_by_);
      CAFFE_ENFORCE_EQ(
          dequantize_output,
          false,
          "Conv DNNLOWP operators don't support dequantize_output");
      arguments_parsed_ = true;
    }
  }

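  // Determines out_qparams_: either from static quantization arguments on the
  // operator, or dynamically by running the FP32 reference op and deriving
  // the parameters from its output.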
  void GetOutputQuantizationParams_() {
    using namespace dnnlowp;

    ParseDNNLowPOperatorArguments_();

    if (HasStaticQuantization(this)) {
      out_qparams_ = GetStaticQuantizationParamsOf(this, 0);

      if (measure_quantization_error_) {
        // To measure quantization error, run the reference fp32 implementation
        // here. This doesn't really belong in this function, but it has to
        // happen before the quantized computation of some in-place operators
        // overwrites their inputs.
        Fp32Op_()->DequantizeInput();
        Fp32Op_()->Get()->RunOnDevice();
      }
    } else {
      // TODO: this is only needed when dequantize_output_ == false, but leave
      // it as is for now because some code relies on out_qparams_ being
      // initialized even though it never actually uses it.
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
      out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
    }
  }

  OpWrapper<FP32_OP, T>* Fp32Op_() {
    if (!fp32_op_) {
      fp32_op_.reset(new OpWrapper<FP32_OP, T>(this, qfactory_.get()));
    }
    return fp32_op_.get();
  }

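  // Creates the workspace blobs (and the guarding mutex) for the int32
  // accumulation buffer that can be shared across DNNLOWP operators.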
  void CreateSharedInt32Buffer_() {
    auto* mutexPtr =
        ws_->CreateBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU_MUTEX__")
            ->GetMutable<std::unique_ptr<std::mutex>>();
    mutexPtr->reset(new std::mutex());
    ws_->CreateBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU__");
  }

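  // Invokes f either with the operator-local buffers or, depending on the
  // caffe2_force_shared_col_buffer / caffe2_dnnlowp_shared_int32_buffer
  // flags, with workspace-wide shared column and int32 buffers (guarded by a
  // mutex).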
  void RunWithSharedBuffer_(
      Tensor* col_buffer,
      vector<int32_t>* Y_int32,
      std::function<
          void(Tensor* col_buffer_shared, vector<int32_t>* Y_int32_shared)> f) {
    auto f2 = [this, Y_int32, f](Tensor* col_buffer_shared) {
      if (FLAGS_caffe2_dnnlowp_shared_int32_buffer) {
        auto* mutexBlob =
            ws_->GetBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU_MUTEX__");
        CAFFE_ENFORCE(mutexBlob, "Must call CreateSharedInt32Buffer() first");

        auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
        std::lock_guard<std::mutex> g(**mutexPtr);

        auto* Y_int32_shared =
            ws_->GetBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU__")
                ->template GetMutable<vector<int32_t>>();
        f(col_buffer_shared, Y_int32_shared);
      } else {
        f(col_buffer_shared, Y_int32);
      }
    };

    if (FLAGS_caffe2_force_shared_col_buffer || this->shared_buffer_) {
      runWithSharedBuffer<CPUContext>(this->ws_, f2);
    } else {
      f2(col_buffer);
    }
  }

  bool measure_quantization_error_{false};
  std::string followed_by_;

  std::vector<dnnlowp::TensorQuantizationParams> in_qparams_;
  dnnlowp::TensorQuantizationParams out_qparams_;

  std::unique_ptr<OpWrapper<FP32_OP, T>> fp32_op_;
  std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;

  // Buffer to store quantized output temporarily
  // when we output dequantized values.
  std::vector<T> out_temp_;

  dnnlowp::QuantizationErrorStats quantization_error_stats_;

  bool arguments_parsed_{false};
};

#define USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FP32_OP)          \
  /* using override */ using BaseType = ConvPoolDNNLowPOpBase<T, FP32_OP>; \
  /* using override */ using BaseType::GetOutputQuantizationParams_;       \
  /* using override */ using BaseType::GetQuantizedOutputData_;            \
  /* using override */ using BaseType::Fp32Op_;                            \
  /* using override */ using BaseType::InputTensorCPU_;                    \
  /* using override */ using BaseType::MeasureQuantizationError_;          \
  /* using override */ using BaseType::OutputTensorCPU_;                   \
  /* using override */ using BaseType::RunOnDeviceEpilogue_;               \
  /* using override */ using BaseType::followed_by_;                       \
  /* using override */ using BaseType::in_qparams_;                        \
  /* using override */ using BaseType::measure_quantization_error_;        \
  /* using override */ using BaseType::out_qparams_;                       \
  /* using override */ using BaseType::qfactory_;

} // namespace caffe2
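The following is a minimal, hypothetical sketch (not part of the header above) of how a quantized operator might derive from ConvPoolDNNLowPOpBase. It assumes ConvOp<float, CPUContext> from caffe2/operators/conv_op.h as the FP32 reference type; MyConvDNNLowPOp, the ConvFp32Op alias, and the identity-shaped output are illustrative only, and the actual quantized computation is elided. The helper names come from the base class listed above.

// Hypothetical example: a quantized conv-style operator built on
// ConvPoolDNNLowPOpBase. This is a sketch, not the real ConvDNNLowPOp.
#include <algorithm>

#include "caffe2/operators/conv_op.h"
#include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"

namespace caffe2 {

using ConvFp32Op = ConvOp<float, CPUContext>; // assumed FP32 reference op

template <typename T>
class MyConvDNNLowPOp final // hypothetical operator name
    : public ConvPoolDNNLowPOpBase<T, ConvFp32Op> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, ConvFp32Op);

  MyConvDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolDNNLowPOpBase<T, ConvFp32Op>(operator_def, ws) {}

  bool RunOnDeviceWithOrderNCHW() override {
    const TensorCPU& X = InputTensorCPU_(0);

    // Resolve out_qparams_ from static arguments or from the FP32 reference.
    GetOutputQuantizationParams_();

    // Allocate the quantized output. A real conv op would derive the output
    // spatial size from pads/strides; the input shape is reused here only to
    // keep the sketch short.
    Tensor* Y = OutputTensorCPU_(0, X.sizes(), at::dtype<T>());
    T* Y_data = Y->template mutable_data<T>();

    // ... the actual quantized convolution would write into Y_data here ...
    std::fill(Y_data, Y_data + Y->numel(), T(0));

    // Propagate out_qparams_ to consumers and, if requested, measure
    // quantization error against the FP32 reference output.
    RunOnDeviceEpilogue_();
    return true;
  }
};

} // namespace caffe2

In the actual DNNLOWP operators, classes of this shape are registered against a DNNLOWP engine (for example via REGISTER_CPU_OPERATOR_WITH_ENGINE), and CreateSharedInt32Buffer_() / RunWithSharedBuffer_() are used in the convolution path when the shared-buffer flags are enabled.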