1 #include "caffe2/operators/elementwise_add_op.h"     2 #include "caffe2/quantization/server/sigmoid.h"     3 #include "elementwise_dnnlowp_op.h"     4 #include "op_wrapper.h"    12     BinaryElementwiseOp<NumericTypes, CPUContext, AddFunctor<CPUContext>>;
    18   USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(
T, 
AddFp32Op);
    26   bool RunOnDevice()
 override {
    27     if (!GetQuantizationParameters_()) {
    31     const auto& 
A = InputTensorCPU_(0);
    32     const auto& 
B = InputTensorCPU_(1);
    33     auto* 
C = OutputTensorCPU_(0);
    35         &
B != 
C || !enable_broadcast_,
    36         "In-place is allowed only with the first tensor when broadcasting");
    // Quantize both inputs to a shared intermediate scale so they can be
    // added directly in the integer domain.
    vector<int32_t> A_quantized(A.numel()), B_quantized(B.numel());
    for (int i = 0; i < 2; ++i) {
      int32_t* quantized_in = i == 0 ? A_quantized.data() : B_quantized.data();
      if (InputTensorCPU_(i).template IsType<T>()) {
        // Input is already quantized: requantize from its own scale to the
        // intermediate scale.
        float real_multiplier =
            in_qparams_[i].scale / intermediate_qparams_.scale;
        RequantizationParams in_requantization_params =
            qfactory_->ChooseRequantizationMultiplier(
                real_multiplier, intermediate_qparams_);

        const T* input_data = InputTensorCPU_(i).template data<T>();

#pragma omp parallel for
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Requantize<int32_t>(
              input_data[j] - in_qparams_[i].zero_point,
              in_requantization_params);
        }
      } else {
        // Input is still fp32: quantize it directly with the intermediate
        // quantization parameters.
        assert(InputTensorCPU_(i).template IsType<float>());
        const float* input_data = InputTensorCPU_(i).template data<float>();

#pragma omp parallel for
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Quantize<uint32_t>(
              input_data[j],
              intermediate_qparams_.zero_point,
              intermediate_qparams_.scale,
              qfactory_->GetEltwiseQuantizePrecision());
        }
      }
    }
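
    // Illustrative numbers for the two branches above (made up for
    // exposition, not taken from the source): with intermediate scale 0.05
    // and zero point 10, the fp32 value 1.25 quantizes to
    // round(1.25 / 0.05) + 10 = 35. An already-quantized uint8 input with
    // scale 0.1 and zero point 128 holding q = 141 (real value 1.3) is
    // requantized as (141 - 128) * (0.1 / 0.05) + 10 = 36.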
    // Each quantized addend carries one copy of the intermediate zero point,
    // so the raw sum carries InputSize() copies that must be subtracted.
    int32_t intermediate_zero_point =
        intermediate_qparams_.zero_point * InputSize();

    T* C_quantized = GetQuantizedOutputData_();
    if (!enable_broadcast_) {
      // No broadcasting: A, B, and C all have the same shape.
      CAFFE_ENFORCE_EQ(
          A.sizes(),
          B.sizes(),
          "Dimension mismatch - did you forget to set broadcast=1?");

#pragma omp parallel for
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[i] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else if (B.numel() == 1) {
      // Scalar broadcast: add the single element of B to every element of A.
#pragma omp parallel for
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[0] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else {
      // Legacy broadcast: B is tiled along the "pre" and "post" dimensions
      // of A as determined by axis_.
      size_t pre, n, post;
      std::tie(pre, n, post) =
          elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);

#pragma omp parallel for
      for (int i = 0; i < pre; ++i) {
        for (int j = 0; j < n; ++j) {
          for (int k = 0; k < post; ++k) {
            int32_t raw = A_quantized[((i * n) + j) * post + k] +
                B_quantized[j] - intermediate_zero_point;
            C_quantized[((i * n) + j) * post + k] =
                fbgemm::Requantize<T>(raw, requantization_params_);
          }
        }
      }
    }
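
    // Continuing the illustrative numbers from above: the addends 35
    // (for 1.25) and 36 (for 1.3) sum to 71; subtracting
    // intermediate_zero_point = 10 * 2 gives raw = 51, which is exactly
    // (1.25 + 1.3) / 0.05. Requantize<T> then multiplies by
    // intermediate_scale / output_scale and adds the output zero point to
    // produce the final quantized sum.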
    RunOnDeviceEpilogue_();

    return true;
  }

 private:
  bool GetQuantizationParameters_() {
    // Choose a single quantization range that covers both inputs so that A
    // and B can share the same intermediate scale.
    float global_min = numeric_limits<float>::max(),
          global_max = numeric_limits<float>::lowest();

    for (int i = 0; i < InputSize(); ++i) {
      in_qparams_[i] =
          GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());

      global_min = std::min(global_min, in_qparams_[i].Min());
      global_max = std::max(global_max, in_qparams_[i].Max());
    }

    intermediate_qparams_ = qfactory_->ChooseQuantizationParams(
        global_min,
        global_max,
        qfactory_->GetEltwiseQuantizePrecision(),
        qfactory_->GetPreserveActivationSparsity());

    GetOutputQuantizationParams_();

    // Requantize from the intermediate precision to the output precision.
    float real_multiplier = intermediate_qparams_.scale / out_qparams_.scale;
    requantization_params_ = qfactory_->ChooseRequantizationMultiplier(
        real_multiplier, out_qparams_);

    return true;
  }

  dnnlowp::TensorQuantizationParams intermediate_qparams_;
};

} // namespace caffe2
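
// Usage sketch (an assumption following the registration convention used by
// other DNNLOWP operators; the actual registration lives in the matching .cc
// file and may differ):
//
//   REGISTER_CPU_OPERATOR_WITH_ENGINE(Int8Add, DNNLOWP, AddDNNLowPOp<uint8_t>);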
 