#include "caffe2/operators/elementwise_add_op.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "elementwise_dnnlowp_op.h"
#include "op_wrapper.h"

namespace caffe2 {

using namespace std;
using namespace dnnlowp;

using AddFp32Op =
    BinaryElementwiseOp<NumericTypes, CPUContext, AddFunctor<CPUContext>>;

template <typename T>
class AddDNNLowPOp final : public BinaryElementwiseDNNLowPOp<T, AddFp32Op> {
 public:
  USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, AddFp32Op);
  using BinaryElementwiseDNNLowPOp<T, AddFp32Op>::axis_;
  using BinaryElementwiseDNNLowPOp<T, AddFp32Op>::enable_broadcast_;
  using BinaryElementwiseDNNLowPOp<T, AddFp32Op>::requantization_params_;

  AddDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : BinaryElementwiseDNNLowPOp<T, AddFp32Op>(operator_def, ws) {}
  bool RunOnDevice() override {
    if (!GetQuantizationParameters_()) {
      return false;
    }
    const auto& A = InputTensorCPU_(0);
    const auto& B = InputTensorCPU_(1);
    auto* C = OutputTensorCPU_(0);
    CAFFE_ENFORCE(
        &B != C || !enable_broadcast_,
        "In-place is allowed only with the first tensor when broadcasting");
    C->ResizeLike(A);
    // Bring both inputs into a shared intermediate quantized domain.
    vector<int32_t> A_quantized(A.numel()), B_quantized(B.numel());
    for (int i = 0; i < 2; ++i) {
      int32_t* quantized_in = i == 0 ? A_quantized.data() : B_quantized.data();
      if (InputTensorCPU_(i).template IsType<T>()) {
        // Input is already quantized; requantize it to the intermediate
        // scale and zero point.
        float real_multiplier =
            in_qparams_[i].scale / intermediate_qparams_.scale;
        RequantizationParams in_requantization_params =
            qfactory_->ChooseRequantizationMultiplier(
                real_multiplier, intermediate_qparams_);

        const T* input_data = InputTensorCPU_(i).template data<T>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Requantize<int32_t>(
              input_data[j] - in_qparams_[i].zero_point,
              in_requantization_params);
        }
      } else {
        // Input is still in float; quantize it directly into the
        // intermediate domain.
        assert(A.template IsType<float>());
        const float* input_data = InputTensorCPU_(i).template data<float>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Quantize<uint32_t>(
              input_data[j],
              intermediate_qparams_.zero_point,
              intermediate_qparams_.scale,
              qfactory_->GetEltwiseQuantizePrecision());
        }
      }
    }
    // Each quantized operand carries one copy of the intermediate zero point,
    // so the integer sum carries it InputSize() (= 2) times.
    int32_t intermediate_zero_point =
        intermediate_qparams_.zero_point * InputSize();

    T* C_quantized = GetQuantizedOutputData_();
    if (!enable_broadcast_) {
      CAFFE_ENFORCE_EQ(
          A.sizes(),
          B.sizes(),
          "Dimension mismatch - did you forget to set broadcast=1?");
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[i] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else if (B.numel() == 1) {
      // Broadcast the single element of B over every element of A.
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[0] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else {
      // General legacy broadcast: B is tiled along the middle dimension.
      size_t pre, n, post;
      std::tie(pre, n, post) =
          elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < pre; ++i) {
        for (int j = 0; j < n; ++j) {
          for (int k = 0; k < post; ++k) {
            int32_t raw = A_quantized[((i * n) + j) * post + k] +
                B_quantized[j] - intermediate_zero_point;
            C_quantized[((i * n) + j) * post + k] =
                fbgemm::Requantize<T>(raw, requantization_params_);
          }
        }
      }
    }

    RunOnDeviceEpilogue_();

    return true;
  }
 private:
  bool GetQuantizationParameters_() {
    // Choose one intermediate quantization range wide enough to cover the
    // value ranges of all inputs.
    float global_min = numeric_limits<float>::max(),
          global_max = numeric_limits<float>::lowest();

    for (int i = 0; i < InputSize(); ++i) {
      in_qparams_[i] =
          GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());

      global_min = std::min(global_min, in_qparams_[i].Min());
      global_max = std::max(global_max, in_qparams_[i].Max());
    }

    intermediate_qparams_ = qfactory_->ChooseQuantizationParams(
        global_min,
        global_max,
        qfactory_->GetEltwiseQuantizePrecision(),
        qfactory_->GetPreserveActivationSparsity());

    GetOutputQuantizationParams_();

    // Requantize from the intermediate precision to the output precision.
    float real_multiplier = intermediate_qparams_.scale / out_qparams_.scale;
    requantization_params_ = qfactory_->ChooseRequantizationMultiplier(
        real_multiplier, out_qparams_);

    return true;
  }
  dnnlowp::TensorQuantizationParams intermediate_qparams_;
}; // class AddDNNLowPOp

} // namespace caffe2
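// ---------------------------------------------------------------------------
// A minimal standalone sketch of the arithmetic the operator above performs,
// reduced to scalars. The names below (QParams, QuantizeScalar,
// QuantizedAddScalar) are illustrative stand-ins, not the dnnlowp/fbgemm
// APIs: the real operator uses ChooseRequantizationMultiplier and
// fbgemm::Requantize (fixed-point multipliers) rather than float math.
//
// Scheme: (1) bring both inputs into one shared intermediate quantized
// domain, (2) add them as plain integers, subtracting the doubled
// intermediate zero point, (3) requantize the raw sum to the output
// scale and zero point.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <cstdint>

namespace dnnlowp_add_sketch {

struct QParams { // stand-in for dnnlowp::TensorQuantizationParams
  float scale;
  int32_t zero_point;
};

inline int32_t QuantizeScalar(float x, const QParams& q) {
  return q.zero_point + static_cast<int32_t>(std::nearbyint(x / q.scale));
}

inline uint8_t QuantizedAddScalar(
    float a,
    float b,
    const QParams& intermediate,
    const QParams& out) {
  // Step 1: both operands in the intermediate domain.
  int32_t a_q = QuantizeScalar(a, intermediate);
  int32_t b_q = QuantizeScalar(b, intermediate);
  // Step 2: integer add; the sum carries the intermediate zero point twice,
  // mirroring intermediate_zero_point = zero_point * InputSize() above.
  int32_t raw = a_q + b_q - 2 * intermediate.zero_point;
  // Step 3: scale the raw sum by s_intermediate / s_out, add the output
  // zero point, and clamp to the uint8 range.
  float real_multiplier = intermediate.scale / out.scale;
  int32_t c_q = out.zero_point +
      static_cast<int32_t>(std::nearbyint(raw * real_multiplier));
  return static_cast<uint8_t>(std::min(255, std::max(0, c_q)));
}

} // namespace dnnlowp_add_sketch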