7 #include "caffe2/core/tensor_int8.h" 8 #include "caffe2/operators/conv_op_shared.h" 9 #include "caffe2/operators/conv_pool_op_base.h" 10 #include "caffe2/quantization/server/fbgemm_pack_blob.h" 11 #include "caffe2/quantization/server/op_wrapper.h" 14 C10_DECLARE_int(caffe2_omp_num_threads);
16 C10_DECLARE_bool(caffe2_dnnlowp_shared_int32_buffer);
17 C10_DECLARE_bool(caffe2_force_shared_col_buffer);
namespace caffe2 {

template <typename T, typename FP32_OP>
class ConvPoolDNNLowPOpBase : public ConvPoolOpBase<CPUContext> {
  static_assert(std::is_integral<T>::value, "Integral required.");
 public:
  ConvPoolDNNLowPOpBase(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        in_qparams_(InputSize()),
        qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {
#ifdef _OPENMP
    if (FLAGS_caffe2_omp_num_threads > 0) {
      omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
    }
#endif
    if (this->debug_def().engine() == "DNNLOWP_16" ||
        this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
      LOG(WARNING)
          << this->debug_def().engine()
          << " is an experimental feature mostly for testing accuracy with "
             "fixed-point precision higher than 8 and performance is very slow";
    }
  }
  virtual ~ConvPoolDNNLowPOpBase() {
    if (measure_quantization_error_) {
      dnnlowp::ReportQuantizationError(this, quantization_error_stats_);
      LOG(WARNING) << this->debug_def().output(0) << " with type "
                   << this->debug_def().type() << " has output qparams : "
                   << "scale " << out_qparams_.scale << " offset "
                   << out_qparams_.zero_point << "; ";
    }
  }

 protected:
  const TensorCPU& InputTensorCPU_(int idx) {
    if (InputIsType<int8::Int8TensorCPU>(idx)) {
      return this->Input<int8::Int8TensorCPU>(idx).t;
    } else if (InputIsType<Int8ConvDNNLowPPackedWeightBlob>(idx)) {
      return this->Input<Int8ConvDNNLowPPackedWeightBlob>(idx).original_tensor;
    } else {
      return Input(idx);
    }
  }
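  // Access output idx as the TensorCPU held inside its Int8TensorCPU wrapper;
  // the second overload also (re)initializes the tensor to the given
  // dims/options before returning it.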
  TensorCPU* OutputTensorCPU_(int idx) {
    return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
  }

  TensorCPU* OutputTensorCPU_(
      int idx,
      at::IntArrayRef dims,
      at::TensorOptions options) {
    auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
    ReinitializeTensor(t, dims, options.device(CPU));
    return t;
  }
  T* GetQuantizedOutputData_() {
    return OutputTensorCPU_(0)->template mutable_data<T>();
  }
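  // Compare the operator's (dequantized) output against the shadow fp32
  // operator's output and record the error statistics. When this op is fused
  // with or followed by Relu, the fp32 reference is clipped at 0 first so
  // both sides see the same nonlinearity.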
  void MeasureQuantizationError_() {
    if (!measure_quantization_error_ || !Fp32Op_()) {
      return;
    }

    const float* actual = nullptr;
    vector<float> actual_temp;
    if (OutputTensorCPU_(0)->template IsType<float>()) {
      actual = OutputTensorCPU_(0)->template data<float>();
    } else {
      actual_temp.resize(OutputTensorCPU_(0)->numel());
      fbgemm::Dequantize<T>(
          OutputTensorCPU_(0)->template data<T>(),
          actual_temp.data(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
      actual = actual_temp.data();
    }
    TensorCPU* float_tensor = Fp32Op_()->Get()->Output(0);
    float* ref = float_tensor->template mutable_data<float>();
    if (followed_by_ == "Relu" || debug_def().type() == "ConvRelu" ||
        debug_def().type() == "Int8ConvRelu") {
      for (int i = 0; i < OutputTensorCPU_(0)->numel(); ++i) {
        ref[i] = std::max(0.f, ref[i]);
      }
    }

    dnnlowp::MeasureQuantizationError(
        actual, ref, OutputTensorCPU_(0)->numel(), &quantization_error_stats_);
  }
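  // Common epilogue for derived operators: attach out_qparams_ to the output
  // Int8TensorCPU and, if requested, measure quantization error.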
  void RunOnDeviceEpilogue_() {
    dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
    MeasureQuantizationError_();
  }
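  // Parse DNNLOWP-specific operator arguments (measure_quantization_error,
  // followed_by, ...) exactly once. dequantize_output is rejected because
  // conv DNNLOWP operators always produce quantized outputs.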
  void ParseDNNLowPOperatorArguments_() {
    if (!arguments_parsed_) {
      bool dequantize_output;
      dnnlowp::ParseDNNLowPOperatorArguments(
          this,
          &dequantize_output,
          &measure_quantization_error_,
          &followed_by_);
      CAFFE_ENFORCE(
          !dequantize_output,
          "Conv DNNLOWP operators don't support dequantize_output");
      arguments_parsed_ = true;
    }
  }
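  // Decide out_qparams_: use statically specified quantization parameters if
  // the operator has them; otherwise run the shadow fp32 operator and derive
  // the parameters from its output via the quantization factory.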
  void GetOutputQuantizationParams_() {
    using namespace dnnlowp;

    ParseDNNLowPOperatorArguments_();

    if (HasStaticQuantization(this)) {
      out_qparams_ = GetStaticQuantizationParamsOf(this, 0);

      if (measure_quantization_error_) {
        // Run the shadow fp32 operator so its output can be compared against
        // the quantized output later.
        Fp32Op_()->DequantizeInput();
        Fp32Op_()->Get()->RunOnDevice();
      }
    } else {
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
      out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
    }
  }
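  // Lazily create the wrapper around the fp32 counterpart of this operator.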
  OpWrapper<FP32_OP, T>* Fp32Op_() {
    if (!fp32_op_) {
      fp32_op_.reset(new OpWrapper<FP32_OP, T>(this, qfactory_.get()));
    }
    return fp32_op_.get();
  }
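  // Create the workspace blobs (buffer + mutex) used for the int32
  // accumulation buffer that can be shared across DNNLOWP operators.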
  void CreateSharedInt32Buffer_() {
    auto* mutexPtr =
        ws_->CreateBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU_MUTEX__")
            ->GetMutable<std::unique_ptr<std::mutex>>();
    mutexPtr->reset(new std::mutex());
    ws_->CreateBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU__");
  }
  void RunWithSharedBuffer_(
      Tensor* col_buffer,
      vector<int32_t>* Y_int32,
      std::function<
          void(Tensor* col_buffer_shared, vector<int32_t>* Y_int32_shared)> f) {
    auto f2 = [this, Y_int32, f](Tensor* col_buffer_shared) {
      if (FLAGS_caffe2_dnnlowp_shared_int32_buffer) {
        auto* mutexBlob =
            ws_->GetBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU_MUTEX__");
        CAFFE_ENFORCE(mutexBlob, "Must call CreateSharedInt32Buffer() first");

        auto* mutexPtr = mutexBlob->GetMutable<std::unique_ptr<std::mutex>>();
        std::lock_guard<std::mutex> g(**mutexPtr);

        auto* Y_int32_shared =
            ws_->GetBlob("__CAFFE2_DNNLOWP_SHARED_INT32_BUFFER_CPU__")
                ->template GetMutable<vector<int32_t>>();
        f(col_buffer_shared, Y_int32_shared);
      } else {
        f(col_buffer_shared, Y_int32);
      }
    };

    if (FLAGS_caffe2_force_shared_col_buffer || this->shared_buffer_) {
      runWithSharedBuffer<CPUContext>(this->ws_, f2);
    } else {
      f2(col_buffer);
    }
  }
  bool measure_quantization_error_{false};
  std::string followed_by_;

  std::vector<dnnlowp::TensorQuantizationParams> in_qparams_;
  dnnlowp::TensorQuantizationParams out_qparams_;

  std::unique_ptr<OpWrapper<FP32_OP, T>> fp32_op_;
  std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;

  std::vector<T> out_temp_;

  // Accumulated error statistics reported from the destructor when
  // measure_quantization_error_ is set.
  dnnlowp::QuantizationErrorStats quantization_error_stats_;

  bool arguments_parsed_{false};
};
#define USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FP32_OP) \
  using BaseType = ConvPoolDNNLowPOpBase<T, FP32_OP>;             \
  using BaseType::GetOutputQuantizationParams_;                   \
  using BaseType::GetQuantizedOutputData_;                        \
  using BaseType::Fp32Op_;                                        \
  using BaseType::InputTensorCPU_;                                \
  using BaseType::MeasureQuantizationError_;                      \
  using BaseType::OutputTensorCPU_;                               \
  using BaseType::RunOnDeviceEpilogue_;                           \
  using BaseType::followed_by_;                                   \
  using BaseType::in_qparams_;                                    \
  using BaseType::measure_quantization_error_;                    \
  using BaseType::out_qparams_;                                   \
  using BaseType::qfactory_;

} // namespace caffe2
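// Illustrative usage sketch (not part of this header): a derived quantized
// conv operator is expected to inherit from ConvPoolDNNLowPOpBase and pull in
// the base helpers with the macro above. Class and method names below are
// hypothetical; see the actual DNNLOWP conv operators in this directory for
// the real pattern.
//
//   template <typename T>
//   class MyInt8ConvOp final
//       : public ConvPoolDNNLowPOpBase<T, ConvOp<float, CPUContext>> {
//    public:
//     USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(
//         T, ConvOp<float, CPUContext>);
//
//     bool RunOnDeviceWithOrderNCHW() override {
//       GetOutputQuantizationParams_(); // choose output scale/zero_point
//       // ... compute quantized convolution into OutputTensorCPU_(0) ...
//       RunOnDeviceEpilogue_(); // propagate qparams, measure error
//       return true;
//     }
//   };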