1 #include "caffe2/operators/elementwise_mul_op.h" 2 #include "caffe2/quantization/server/elementwise_dnnlowp_op.h" 3 #include "caffe2/quantization/server/op_wrapper.h" 4 #include "caffe2/quantization/server/sigmoid.h" 12 BinaryElementwiseOp<NumericTypes, CPUContext, MulFunctor<CPUContext>>;
18 USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(
T,
MulFp32Op);
26 bool RunOnDevice()
override {
27 if (!GetQuantizationParameters_()) {
31 const auto&
A = InputTensorCPU_(0);
32 const auto&
B = InputTensorCPU_(1);
33 auto*
C = OutputTensorCPU_(0);
35 &
B !=
C || !enable_broadcast_,
36 "In-place is allowed only with the first tensor when broadcasting");
40 vector<T> A_temp, B_temp;
41 const T* A_quantized =
42 QuantizeInputIfNeeded<T>(
this, 0, in_qparams_[0], A_temp);
43 const T* B_quantized =
44 QuantizeInputIfNeeded<T>(
this, 1, in_qparams_[1], B_temp);
46 T* C_quantized = GetQuantizedOutputData_();
48 if (!enable_broadcast_) {
52 "Dimension mismatch - did you forget to set broadcast=1?");
54 #pragma omp parallel for 56 for (
int i = 0; i <
C->size(); ++i) {
57 int32_t raw = (A_quantized[i] - in_qparams_[0].zero_point) *
58 (B_quantized[i] - in_qparams_[1].zero_point);
59 C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
61 }
else if (
B.size() == 1) {
63 #pragma omp parallel for 65 for (
int i = 0; i <
C->size(); ++i) {
66 int32_t raw = (A_quantized[i] - in_qparams_[0].zero_point) *
67 (B_quantized[0] - in_qparams_[1].zero_point);
68 C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
72 std::tie(pre, n, post) =
73 elementwise_ops_utils::ComputeLegacyBroadcastSizes(
A,
B, axis_);
75 #pragma omp parallel for 77 for (
int i = 0; i < pre; ++i) {
78 for (
int j = 0; j < n; ++j) {
79 for (
int k = 0; k < post; ++k) {
80 int32_t raw = (A_quantized[((i * n) + j) * post + k] -
81 in_qparams_[0].zero_point) *
82 (B_quantized[j] - in_qparams_[1].zero_point);
83 C_quantized[((i * n) + j) * post + k] =
84 fbgemm::Requantize<T>(raw, requantization_params_);
90 RunOnDeviceEpilogue_();
96 bool GetQuantizationParameters_() {
99 GetInputTensorQuantizationParamsOf(
this, 0, qfactory_.get());
101 GetInputTensorQuantizationParamsOf(
this, 1, qfactory_.get());
103 GetOutputQuantizationParams_();
105 float real_multiplier =
106 in_qparams_[0].scale * in_qparams_[1].scale / out_qparams_.scale;
107 requantization_params_ = qfactory_->ChooseRequantizationMultiplier(
108 real_multiplier, out_qparams_);
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Performs bound shape inference given a Caffe2 net.