// Pulls in the operator's own header, then defines the constructor.
// The constructor forwards operator_def and ws to BaseType and initializes
// axis_ from the integer operator argument "axis", defaulting to 1.
// NOTE(review): the numeric prefixes and the gaps between them suggest this
// listing was extracted with lines missing (the parameter `ws` used below has
// no visible declaration) - confirm against the original file.
17 #include "elementwise_linear_dnnlowp_op.h" 24 ElementwiseLinearDNNLowPOp<T>::ElementwiseLinearDNNLowPOp(
25 const OperatorDef& operator_def,
27 : BaseType(operator_def, ws),
28 axis_(this->template GetSingleArgument<int>(
"axis", 1)) {}
// Runs the quantized elementwise-linear computation.
// Inputs: X (input 0), a (input 1), b (input 2); output: Y (output 0).
// For every (n, d) the visible code forms
//   raw = (X_quantized[n * D + d] - in_qparams_[0].zero_point)
//         * (a_quantized_[d] - in_qparams_[1].zero_point) + <term not visible>
// and stores fbgemm::Requantize<T>(raw, requantization_params_) into
// Y_quantized[n * D + d].
// Returns bool; the return statements are not visible in this listing.
// NOTE(review): several original lines are missing here (gaps in the numeric
// prefixes), so comments below describe only what is shown.
31 bool ElementwiseLinearDNNLowPOp<T>::RunOnDevice() {
// Quantization parameters are set up first; the body of this `if` (the
// failure path) is not visible.
32 if (!GetQuantizationParameters_()) {
36 const auto& X = InputTensorCPU_(0);
37 const auto& a = InputTensorCPU_(1);
38 const auto& b = InputTensorCPU_(2);
39 auto* Y = OutputTensorCPU_(0);
// N and D are taken from X.size_to_dim / X.size_from_dim at the canonical
// axis; the loops below index X and Y as n * D + d with n < N and d < D.
41 const auto canonical_axis = X.canonical_axis_index(axis_);
42 const int N = X.size_to_dim(canonical_axis);
43 const int D = X.size_from_dim(canonical_axis);
// a and b must each be 1-dimensional with first dimension equal to D.
// NOTE(review): the size checks pass a.ndim() / b.ndim() as the extra macro
// argument; possibly the size was meant to be reported instead - confirm.
45 CAFFE_ENFORCE_EQ(a.ndim(), 1, a.ndim());
46 CAFFE_ENFORCE_EQ(a.size(0), D, a.ndim());
47 CAFFE_ENFORCE_EQ(b.ndim(), 1, b.ndim());
48 CAFFE_ENFORCE_EQ(b.size(0), D, b.ndim());
// Obtain X as values of type T via QuantizeInputIfNeeded, using
// in_qparams_[0]. X_temp is declared on a line not visible here; presumably
// it is scratch storage for the helper - verify.
54 const T* X_quantized =
55 QuantizeInputIfNeeded<T>(
this, 0, in_qparams_[0], X_temp);
// Quantize each element of b to int32_t. The scale passed is the product of
// the X and a scales (in_qparams_[0].scale * in_qparams_[1].scale); the
// remaining arguments of fbgemm::Quantize are not visible in this listing.
58 vector<int32_t> b_quantized(b.numel());
59 const float* b_data = b.template data<float>();
// The loop over b's elements is annotated with an OpenMP parallel-for pragma.
61 #pragma omp parallel for 63 for (
int i = 0; i < b.numel(); ++i) {
64 b_quantized[i] = fbgemm::Quantize<int32_t>(
67 in_qparams_[0].scale * in_qparams_[1].scale,
72 T* Y_quantized = GetQuantizedOutputData_();
// Main loop: the outer loop over n carries the OpenMP parallel-for pragma,
// the inner loop walks the D entries of one row.
74 #pragma omp parallel for 76 for (
int n = 0; n < N; ++n) {
77 for (
int d = 0; d < D; ++d) {
// Zero points are subtracted from both operands before the multiply; the
// expression ends with `+` and its final operand is on a missing line
// (presumably b_quantized[d] - TODO confirm).
78 int32_t raw = (X_quantized[n * D + d] - in_qparams_[0].zero_point) *
79 (a_quantized_[d] - in_qparams_[1].zero_point) +
// Convert the 32-bit intermediate to the output type T.
81 Y_quantized[n * D + d] =
82 fbgemm::Requantize<T>(raw, requantization_params_);
// Called after the loops complete; its effect is defined outside this listing.
86 RunOnDeviceEpilogue_();
// Sets up the quantization state used by RunOnDevice:
//   - in_qparams_[0]: parameters for input 0,
//   - in_qparams_[1] and a_quantized_: parameters and quantized values for
//     input 1, filled in only while a_quantized_ is empty,
//   - the output parameters (via GetOutputQuantizationParams_()),
//   - requantization_params_, derived from the three scales.
// Returns bool; the return statements are not visible in this listing.
92 bool ElementwiseLinearDNNLowPOp<T>::GetQuantizationParameters_() {
96 in_qparams_[0] = GetInputTensorQuantizationParamsOf(
this, 0, qfactory_.get());
// Input 1 is processed only when a_quantized_ is empty, so a populated
// a_quantized_ is reused on later calls.
// NOTE(review): this assumes input 1 does not change between runs - confirm.
99 if (a_quantized_.empty()) {
100 const auto& a = InputTensorCPU_(1);
// Parameters for a are chosen from its float data and element count.
// NOTE(review): the meaning of the trailing `true` flag is not visible here.
101 in_qparams_[1] = qfactory_->ChooseQuantizationParams(
102 a.template data<float>(), a.numel(),
true );
104 a_quantized_.resize(a.numel());
// The call that consumes this argument (presumably the one filling
// a_quantized_) is on lines missing from this listing.
106 a.template data<float>(),
112 GetOutputQuantizationParams_();
// Multiplier that maps the int32 product scale (X scale * a scale) onto the
// output scale.
114 float real_multiplier =
115 in_qparams_[0].scale * in_qparams_[1].scale / out_qparams_.scale;
116 requantization_params_ =
117 qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);
// Operator registrations. Both invocations register the uint8_t
// instantiation of ElementwiseLinearDNNLowPOp; the second one uses the
// operator name Int8ElementwiseLinear.
// NOTE(review): the operator name of the first invocation and the engine
// argument of both are on lines missing from this listing.
122 REGISTER_CPU_OPERATOR_WITH_ENGINE(
125 ElementwiseLinearDNNLowPOp<uint8_t>);
126 REGISTER_CPU_OPERATOR_WITH_ENGINE(
127 Int8ElementwiseLinear,
129 ElementwiseLinearDNNLowPOp<uint8_t>);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...