#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/op_wrapper.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"

C10_DECLARE_int(caffe2_omp_num_threads);

namespace caffe2 {
/**
 * A convenient base class for C2 operators with DNNLOWP engine.
 */
template <typename T, typename FP32_OP>
class DNNLowPOp : public Operator<CPUContext> {
  static_assert(std::is_integral<T>::value, "Integral required.");
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);

  DNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws),
        in_qparams_(InputSize()),
        qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {
#ifdef _OPENMP
    if (FLAGS_caffe2_omp_num_threads > 0) {
      omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
    }
#endif

    if (this->debug_def().engine() == "DNNLOWP_16" ||
        this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
      LOG(WARNING)
          << this->debug_def().engine()
          << " is an experimental feature mostly for testing accuracy with "
             "fixed-point precision higher than 8 and performance is very slow";
    }
  }
  virtual ~DNNLowPOp() {
    if (measure_quantization_error_) {
      dnnlowp::ReportQuantizationError(this, quantization_error_stats_);
    }
  }
 protected:
  const TensorCPU& InputTensorCPU_(int idx) {
    if (InputIsType<int8::Int8TensorCPU>(idx)) {
      return this->Input<int8::Int8TensorCPU>(idx).t;
    } else if (InputIsType<Int8FCDNNLowPPackedWeightBlob>(idx)) {
      return this->Input<Int8FCDNNLowPPackedWeightBlob>(idx).original_tensor;
    } else {
      return Input(idx);
    }
  }
  TensorCPU* OutputTensorCPU_(int idx) {
    if (dequantize_output_) {
      return Output(idx);
    } else {
      return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
    }
  }

  TensorCPU* OutputTensorCPU_(int idx, at::IntArrayRef dims, at::TensorOptions options) {
    if (dequantize_output_) {
      return Output(idx, dims, options.device(CPU));
    } else {
      auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
      ReinitializeTensor(t, dims, options.device(CPU));
      return t;
    }
  }
  T* GetQuantizedOutputData_() {
    if (dequantize_output_) {
      // When dequantizing the output, compute into a temporary quantized
      // buffer; RunOnDeviceEpilogue_ dequantizes it into the fp32 output.
      out_temp_.resize(Output(0)->numel());
      return out_temp_.data();
    } else {
      return OutputTensorCPU_(0)->template mutable_data<T>();
    }
  }
  void MeasureQuantizationError_() {
    if (!measure_quantization_error_ || !Fp32Op_()) {
      return;
    }

    const float* actual = nullptr;
    vector<float> actual_temp;
    if (OutputTensorCPU_(0)->template IsType<float>()) {
      actual = OutputTensorCPU_(0)->template data<float>();
    } else {
      actual_temp.resize(OutputTensorCPU_(0)->numel());
      fbgemm::Dequantize<T>(
          OutputTensorCPU_(0)->template data<T>(),
          actual_temp.data(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
      actual = actual_temp.data();
    }

    float* ref = Fp32Op_()->Get()->Output(0)->template mutable_data<float>();
    if (followed_by_ == "Relu") {
      for (int i = 0; i < Output(0)->numel(); ++i) {
        ref[i] = std::max(0.f, ref[i]);
      }
    }

    dnnlowp::MeasureQuantizationError(
        actual, ref, OutputTensorCPU_(0)->numel(), &quantization_error_stats_);
  }
  void RunOnDeviceEpilogue_() {
    if (dequantize_output_) {
      fbgemm::Dequantize<T>(
          out_temp_.data(),
          OutputTensorCPU_(0)->template mutable_data<float>(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
    } else {
      dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
    }

    MeasureQuantizationError_();
  }
  void ParseDNNLowPOperatorArguments_() {
    if (!arguments_parsed_) {
      dnnlowp::ParseDNNLowPOperatorArguments(
          this,
          &dequantize_output_,
          &measure_quantization_error_,
          &followed_by_);
      arguments_parsed_ = true;
    }
  }
  void GetOutputQuantizationParams_() {
    using namespace dnnlowp;

    ParseDNNLowPOperatorArguments_();

    if (HasStaticQuantization(this)) {
      out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
      if (measure_quantization_error_) {
        // Run the fp32 reference so its output can be compared against the
        // quantized output even when static quantization parameters are given.
        Fp32Op_()->DequantizeInput();
        Fp32Op_()->Get()->RunOnDevice();
      }
    } else {
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
      out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
    }
  }
  // Lazily wraps the floating-point operator with quantized inputs of type T
  // so it can serve as a reference implementation.
  OpWrapper<FP32_OP, T>* Fp32Op_() {
    if (!fp32_op_) {
      fp32_op_.reset(new OpWrapper<FP32_OP, T>(this, qfactory_.get()));
    }
    return fp32_op_.get();
  }

  bool dequantize_output_{false}, measure_quantization_error_{false};
  std::string followed_by_;

  std::vector<dnnlowp::TensorQuantizationParams> in_qparams_;
  dnnlowp::TensorQuantizationParams out_qparams_;

  std::unique_ptr<OpWrapper<FP32_OP, T>> fp32_op_;
  std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;

  // Temporary buffer for the quantized output when dequantize_output_ is set.
  std::vector<T> out_temp_;

  dnnlowp::QuantizationErrorStats quantization_error_stats_;

  bool arguments_parsed_{false};
}; // class DNNLowPOp
#define USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FP32_OP) \
  using BaseType = DNNLowPOp<T, FP32_OP>;               \
  using BaseType::GetOutputQuantizationParams_;         \
  using BaseType::GetQuantizedOutputData_;              \
  using BaseType::Fp32Op_;                              \
  using BaseType::InputTensorCPU_;                      \
  using BaseType::MeasureQuantizationError_;            \
  using BaseType::OutputTensorCPU_;                     \
  using BaseType::RunOnDeviceEpilogue_;                 \
  using BaseType::dequantize_output_;                   \
  using BaseType::followed_by_;                         \
  using BaseType::in_qparams_;                          \
  using BaseType::measure_quantization_error_;          \
  using BaseType::out_qparams_;                         \
  using BaseType::qfactory_;

inline int dnnlowp_get_num_threads() {
#ifdef _OPENMP
  return omp_get_num_threads();
#else
  return 1;
#endif
}
inline int dnnlowp_get_max_threads() {
#ifdef _OPENMP
  return omp_get_max_threads();
#else
  return 1;
#endif
}
inline int dnnlowp_get_thread_num() {
#ifdef _OPENMP
  return omp_get_thread_num();
#else
  return 0;
#endif
}

} // namespace caffe2
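// Usage sketch (illustrative only, not part of the original header): a
// hypothetical derived operator "MyDNNLowPOp" with a hypothetical fp32
// counterpart "MyFP32Op". The typical pattern is to pick the output
// quantization parameters, compute into the quantized output buffer, and
// finish with RunOnDeviceEpilogue_():
//
//   template <typename T>
//   class MyDNNLowPOp final : public DNNLowPOp<T, MyFP32Op> {
//    public:
//     USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, MyFP32Op);
//
//     MyDNNLowPOp(const OperatorDef& def, Workspace* ws)
//         : DNNLowPOp<T, MyFP32Op>(def, ws) {}
//
//     bool RunOnDevice() override {
//       GetOutputQuantizationParams_();   // static qparams, or derived from an fp32 run
//       const TensorCPU& X = InputTensorCPU_(0);
//       OutputTensorCPU_(0)->ResizeLike(X);
//       T* Y = GetQuantizedOutputData_(); // quantized buffer (temporary if dequantizing)
//       // ... compute Y from X in fixed point ...
//       RunOnDeviceEpilogue_();           // dequantize and/or propagate out_qparams_
//       return true;
//     }
//   };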