1 #include "utility_dnnlowp_ops.h" 8 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN 12 #include "dnnlowp_partition.h" 18 template <
typename T,
bool ReluFused>
// Constructor: forwards operator_def and ws to the BaseType constructor and
// does nothing else (empty body).
// NOTE(review): the parameter line that declares `ws` is not present in this
// listing; it is only visible as an argument to BaseType below.
19 SumDNNLowPOp<T, ReluFused>::SumDNNLowPOp(
20 const OperatorDef& operator_def,
22 : BaseType(operator_def, ws) {}
// RunOnDevice: element-wise sum of all inputs, written to output 0 as
// quantized values of type T.
//
// Visible steps: (1) one-time argument parsing, (2) quantization parameter
// setup via GetQuantizationParameters_(), (3) accumulation of every input in
// an int32 "intermediate" quantized domain, (4) requantization of the
// accumulated value into the output domain, (5) RunOnDeviceEpilogue_().
// When DNNLOWP_MEASURE_TIME_BREAKDOWN is defined, per-phase wall-clock times
// are logged in milliseconds.
//
// NOTE(review): this listing is missing a number of original lines (closing
// braces, some conditions and some call arguments), so comments below only
// describe what is actually visible.
24 template <
typename T,
bool ReluFused>
25 bool SumDNNLowPOp<T, ReluFused>::RunOnDevice() {
// Parse operator arguments only once; arguments_parsed_ is set to true below.
26 if (!this->arguments_parsed_) {
27 dnnlowp::ParseDNNLowPOperatorArguments(
28 this, &dequantize_output_, &measure_quantization_error_, &followed_by_);
// The condition guarding the next two statements is not visible here;
// presumably they apply only when ReluFused is true -- TODO confirm.
34 followed_by_ =
"Relu";
35 dnnlowp::AdjustOutputTensorQuantizationParamsWithFollowedBy(
38 this->arguments_parsed_ =
true;
// Optional timing instrumentation: start of the "get_quant_params" phase.
41 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN 42 chrono::time_point<chrono::system_clock> t_begin, t_end;
44 t_begin = chrono::system_clock::now();
// Set up input/intermediate/output quantization parameters. The body taken
// when this call reports failure is not visible in this listing.
47 if (!GetQuantizationParameters_()) {
51 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN 52 t_end = chrono::system_clock::now();
53 double dt = chrono::duration<double>(t_end - t_begin).count();
54 LOG(INFO) <<
"this=" <<
this <<
" get_quant_params: " << dt * 1e3 <<
" ms";
56 t_begin = chrono::system_clock::now();
// Element count is taken from input 0 and stored in an int.
// NOTE(review): confirm the count cannot exceed the range of int.
61 int len = InputTensorCPU_(0).size();
// Total zero-point offset that is subtracted from the accumulated sum below:
// the intermediate zero point multiplied by the number of inputs.
64 int32_t intermediate_zero_point =
65 intermediate_qparams_.zero_point * InputSize();
// Output 0 takes the shape of input 0.
67 auto* output = OutputTensorCPU_(0);
68 output->ResizeLike(InputTensorCPU_(0));
70 T* output_data = GetQuantizedOutputData_();
// Path selection is based on the element type of input 0 only: type T here,
// float in the later section that reads data<float>().
72 if (InputTensorCPU_(0).
template IsType<T>()) {
// Specialised path: exactly two inputs, T == uint8_t, CPU reports AVX2.
// The remainder of this condition is not visible in this listing.
73 if (InputSize() == 2 && is_same<T, uint8_t>::value && GetCpuId().avx2() &&
76 array<const T*, 2> input_data;
77 for (
int i = 0; i < 2; ++i) {
78 input_data[i] = InputTensorCPU_(i).template data<T>();
// Each thread gets a [j_begin, j_end) slice of the len elements; VLEN is
// passed as the extra (fourth) argument to Get1DPartition on this path only.
85 constexpr
int VLEN = 8;
87 tie(j_begin, j_end) = Get1DPartition(
88 len, dnnlowp_get_num_threads(), dnnlowp_get_thread_num(), VLEN);
// Sum this thread's slice of the two inputs. Several arguments of this call
// are missing from the listing.
90 internal::ElementWiseSumAVX2<T, ReluFused>(
91 input_data[0] + j_begin,
92 input_data[1] + j_begin,
93 output_data + j_begin,
96 in_qparams_[0].zero_point,
98 in_qparams_[1].zero_point,
100 out_qparams_.zero_point);
// Generic quantized path for any number of inputs.
// NOTE(review): both arrays below use InputSize() as their bound, i.e. they
// are runtime-sized arrays, which is a compiler extension rather than
// standard C++.
103 RequantizationParams in_requantization_params[InputSize()];
104 const T* input_data[InputSize()];
// Per input: multiplier that rescales from that input's scale to the
// intermediate scale, plus the raw data pointer.
105 for (
int i = 0; i < InputSize(); ++i) {
106 float real_multiplier =
107 in_qparams_[i].scale / intermediate_qparams_.scale;
108 in_requantization_params[i] = qfactory_->ChooseRequantizationMultiplier(
109 real_multiplier, intermediate_qparams_);
110 input_data[i] = InputTensorCPU_(i).template data<T>();
118 tie(j_begin, j_end) = Get1DPartition(
119 len, dnnlowp_get_num_threads(), dnnlowp_get_thread_num());
// For every element in this thread's slice: accumulate each input (its own
// zero point removed, then requantized to the intermediate domain) into acc.
// The declaration of acc is not visible in this listing.
121 for (
int j = j_begin; j < j_end; ++j) {
123 for (
int i = 0; i < InputSize(); ++i) {
124 acc += fbgemm::Requantize<int32_t>(
125 input_data[i][j] - in_qparams_[i].zero_point,
126 in_requantization_params[i]);
128 int32_t raw = acc - intermediate_zero_point;
// Clamp negatives to zero. The guarding condition is not visible;
// presumably this applies only when ReluFused is true -- TODO confirm.
130 raw = std::max(0, raw);
// Requantize into the output domain; the assignment target is on a line
// missing from this listing.
133 fbgemm::Requantize<T>(raw, out_requantization_params_);
// Float-input path: inputs are read as float and quantized directly with the
// intermediate parameters before accumulation.
// NOTE(review): runtime-sized array again (see note above).
139 const float* input_data[InputSize()];
140 for (
int i = 0; i < InputSize(); ++i) {
141 input_data[i] = InputTensorCPU_(i).template data<float>();
149 tie(j_begin, j_end) = Get1DPartition(
150 len, dnnlowp_get_num_threads(), dnnlowp_get_thread_num());
152 for (
int j = j_begin; j < j_end; ++j) {
154 for (
int i = 0; i < InputSize(); ++i) {
// NOTE(review): the C-style cast below is redundant; input_data[i] is already
// declared as const float*.
155 acc += fbgemm::Quantize<int32_t>(
156 ((
const float*)input_data[i])[j],
157 intermediate_qparams_.zero_point,
158 intermediate_qparams_.scale,
159 qfactory_->GetEltwiseQuantizePrecision());
161 int32_t raw = acc - intermediate_zero_point;
// Same clamp as in the quantized path; guarding condition not visible.
163 raw = std::max(0, raw);
165 output_data[j] = fbgemm::Requantize<T>(raw, out_requantization_params_);
// Timing: end of the accumulation/requantization phase.
170 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN 171 t_end = chrono::system_clock::now();
172 dt = chrono::duration<double>(t_end - t_begin).count();
173 LOG(INFO) <<
"this=" <<
this <<
" requantize inputs: " << dt * 1e3 <<
" ms";
175 t_begin = chrono::system_clock::now();
178 RunOnDeviceEpilogue_();
// NOTE(review): the interval logged below is measured around
// RunOnDeviceEpilogue_(), yet the label reads " prologue: ". This looks like
// a mislabel for "epilogue" -- confirm before changing the log text.
180 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN 181 t_end = chrono::system_clock::now();
182 dt = chrono::duration<double>(t_end - t_begin).count();
183 LOG(INFO) <<
"this=" <<
this <<
" prologue: " << dt * 1e3 <<
" ms";
185 t_begin = chrono::system_clock::now();
// GetQuantizationParameters_: derives the quantization parameters used by
// RunOnDevice.
//   - finds the smallest Min() and largest Max() over all inputs' parameters,
//   - chooses intermediate_qparams_ through qfactory_,
//   - obtains the output parameters via GetOutputQuantizationParams_(),
//   - computes out_requantization_params_ from the ratio of the intermediate
//     scale to the output scale.
// Declared to return bool; the return statement is not visible in this
// listing.
191 template <
typename T,
bool ReluFused>
192 bool SumDNNLowPOp<T, ReluFused>::GetQuantizationParameters_() {
// Start from an inverted range so the first input always tightens it.
196 float global_min = numeric_limits<float>::max(),
197 global_max = numeric_limits<float>::lowest();
199 for (
int i = 0; i < InputSize(); ++i) {
// The target this call's result is assigned to is on a missing line;
// in_qparams_[i] is read immediately afterwards, so presumably it is stored
// there -- TODO confirm.
201 GetInputTensorQuantizationParamsOf(
this, i, qfactory_.get());
203 global_min = std::min(global_min, in_qparams_[i].Min());
204 global_max = std::max(global_max, in_qparams_[i].Max());
// The leading arguments of this call are missing from the listing
// (presumably global_min and global_max -- TODO confirm).
207 intermediate_qparams_ = qfactory_->ChooseQuantizationParams(
210 qfactory_->GetEltwiseQuantizePrecision(),
211 qfactory_->GetPreserveActivationSparsity());
213 GetOutputQuantizationParams_();
// Multiplier that rescales from the intermediate domain to the output domain.
216 float real_multiplier = intermediate_qparams_.scale / out_qparams_.scale;
217 out_requantization_params_ =
218 qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);
// Operator schema builder chain. The head of the chain (the expression the
// first `.NumInputs` call is applied to) is not visible in this listing, so
// the operator name this schema belongs to cannot be confirmed from here.
// Visible constraints: between 1 and INT_MAX inputs; the {0, 0} in-place pair
// (the input 0 description says it "Can be inplace"); inputs may cross
// devices.
224 .NumInputs(1, INT_MAX)
226 .AllowInplace({{0, 0}})
227 .InputsCanCrossDevices()
228 .IdenticalTypeAndShapeOfInput(0)
229 .Input(0,
"data_0",
"First of the input tensors. Can be inplace.")
230 .Output(0,
"sum",
"Output tensor. Same dimension as inputs.");
// CPU operator registrations for the SumDNNLowPOp instantiations.
// The first registration binds operator `Sum` with engine `DNNLOWP` to the
// uint8_t, non-ReLU-fused variant.
232 REGISTER_CPU_OPERATOR_WITH_ENGINE(
Sum, DNNLOWP, SumDNNLowPOp<uint8_t, false>);
// NOTE(review): for the remaining registrations the operator-name and engine
// arguments are on lines missing from this listing; only the implementation
// type (element type and ReluFused flag) is visible.
233 REGISTER_CPU_OPERATOR_WITH_ENGINE(
236 SumDNNLowPOp<uint8_t, true>);
238 REGISTER_CPU_OPERATOR_WITH_ENGINE(
241 SumDNNLowPOp<uint8_t, false>);
242 REGISTER_CPU_OPERATOR_WITH_ENGINE(
245 SumDNNLowPOp<uint8_t, true>);
// 16-bit variants.
247 REGISTER_CPU_OPERATOR_WITH_ENGINE(
250 SumDNNLowPOp<uint16_t, false>);
251 REGISTER_CPU_OPERATOR_WITH_ENGINE(
254 SumDNNLowPOp<uint16_t, true>);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...