1 #include "caffe2/quantization/server/relu_dnnlowp_op.h" 8 bool ReluDNNLowPOp<T>::RunOnDevice() {
9 auto& X = InputIsType<int8::Int8TensorCPU>(0)
10 ? (this->
template Input<int8::Int8TensorCPU>(0)).t
  TensorCPU* Y = nullptr;
  if (InputIsType<int8::Int8TensorCPU>(0)) {
    // The output is Int8TensorCPU like the input so that ReLU can run
    // in place.
    Y = &Outputs()[0]->template GetMutable<int8::Int8TensorCPU>()->t;
  } else {
    Y = Output(0);
  }
  Y->ResizeLike(X);
  using namespace dnnlowp;

  // Choose quantization parameters for the input.
  TensorQuantizationParams in_qparams =
      GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

  // Quantize the input if needed.
  std::vector<T> X_temp, Y_temp;
  const T* X_data = QuantizeInputIfNeeded(this, 0, in_qparams, X_temp);
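  // QuantizeInputIfNeeded passes the input through when it is already
  // quantized as T; for a float input it quantizes into X_temp and returns
  // X_temp.data() (a reading of the dnnlowp helper as it is used here).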
  T* Y_data = nullptr;
  if (X.template IsType<T>()) {
    Y_data = Y->template mutable_data<T>();
  } else {
    // Float input: compute into a temporary buffer and dequantize at the end.
    Y_temp.resize(Y->numel());
    Y_data = Y_temp.data();
  }
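  // With affine quantization x_real = scale * (x_q - zero_point), real zero
  // maps to zero_point, so ReLU in the quantized domain reduces to
  // max(x_q, zero_point); no dequantize/requantize round trip is needed.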
  CAFFE_ENFORCE_GE(in_qparams.zero_point, std::numeric_limits<T>::lowest());
  CAFFE_ENFORCE_LE(in_qparams.zero_point, std::numeric_limits<T>::max());
  const int N = X.numel();
  if (in_qparams.zero_point == std::numeric_limits<T>::lowest()) {
    // zero_point is the smallest representable value, so every quantized
    // value is already >= zero_point and ReLU is the identity; just copy
    // when the buffers differ.
    if (Y_data != X_data) {
      std::memcpy(Y_data, X_data, N * sizeof(T));
    }
  } else {
    if (GetCpuId().avx2()) {
      internal::ReluAVX2<T>(N, in_qparams.zero_point, X_data, Y_data);
    } else {
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < N; ++i) {
        Y_data[i] =
            std::max(X_data[i], static_cast<T>(in_qparams.zero_point));
      }
    }
  }
  // Even if quantization parameters are pre-chosen for the output, they are
  // ignored: ReLU output quantization should be the same as the input's.
  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);

  // If the input was not quantized, the output should be dequantized because
  // ReLU can run in place.
  if (!X.template IsType<T>()) {
    fbgemm::Dequantize<T>(
        Y_data, Y->template mutable_data<float>(), Y->numel(), in_qparams);
  }

  return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, DNNLOWP, ReluDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, DNNLOWP_16, ReluDNNLowPOp<uint16_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(Int8Relu, DNNLOWP, ReluDNNLowPOp<uint8_t>);

} // namespace caffe2
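// Example: selecting this implementation from a NetDef (a minimal sketch;
// the blob names are illustrative, while the op type and engine are the
// ones registered above):
//
//   op {
//     input: "X"
//     output: "Y"
//     type: "Relu"
//     engine: "DNNLOWP"
//   }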