Caffe2 - C++ API
A deep learning, cross-platform ML framework
utility_dnnlowp_ops.h
#pragma once

#include "caffe2/operators/utility_ops.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp.h"
#include "caffe2/quantization/server/dnnlowp_op.h"

namespace caffe2 {

// Quantized (DNNLOWP) Sum operator. When ReluFused is true, a ReLU is fused
// into the requantization of the output.
template <typename T, bool ReluFused = false>
class SumDNNLowPOp final : public DNNLowPOp<T, SumOp<CPUContext>> {
 public:
  SumDNNLowPOp(const OperatorDef& operator_def, Workspace* ws);
  bool RunOnDevice() override;

  USE_OPERATOR_FUNCTIONS(CPUContext);
  USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, SumOp<CPUContext>);

 private:
  bool GetQuantizationParameters_();

  dnnlowp::TensorQuantizationParams intermediate_qparams_;

  dnnlowp::RequantizationParams out_requantization_params_;
}; // class SumDNNLowPOp

// Quantized Gather operator over Int8TensorCPU data. Gathering is a plain
// row copy, so the quantized values are moved byte-for-byte.
template <typename T>
class GatherDNNLowPOp final : public GatherOp<CPUContext> {
  static_assert(std::is_integral<T>::value, "Integral required.");

 public:
  GatherDNNLowPOp(const OperatorDef& operator_def, Workspace* ws);
  ~GatherDNNLowPOp();
  bool RunOnDevice() override;

  template <typename Index>
  bool DoRunWithType() {
    // If we end up using this on GPU, doing O(N) memcpys is probably not the
    // best approach :)
    // TODO: implement prefetching if it starts mattering (TF does it)
    auto& data = (this->template Input<int8::Int8TensorCPU>(DATA)).t;
    auto& indices = Input(INDICES);
    auto* output =
        &Outputs()[0]->template GetMutable<int8::Int8TensorCPU>()->t;

    CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D");
    auto shape = indices.sizes().vec();
    shape.insert(shape.end(), data.sizes().begin() + 1, data.sizes().end());
    output->Resize(shape);

    int block_size = data.size_from_dim(1);
    auto block_bytesize = data.size_from_dim(1) * data.dtype().itemsize();
    int N = indices.numel();

    auto src_base = static_cast<const char*>(data.raw_data());
    const Index* idxs = indices.template data<Index>();
    auto out = static_cast<char*>(output->raw_mutable_data(data.dtype()));

    for (int i = 0; i < N; ++i) {
      auto idx = idxs[i];
      CAFFE_ENFORCE(
          0 <= idx && idx < data.size(0),
          "INDICES element is out of DATA bounds, id=",
          idx,
          " data_dim=",
          data.size(0));
      auto src = src_base + idx * block_bytesize;
      context_.CopyItemsSameDevice(
          data.dtype(), block_size, src, out + block_bytesize * i);
    }
    return true;
  }

  USE_OPERATOR_FUNCTIONS(CPUContext);

 private:
  OpWrapper<GatherOp<CPUContext>, T>* Fp32Op_() {
    if (!fp32_op_) {
      fp32_op_.reset(
          new OpWrapper<GatherOp<CPUContext>, T>(this, qfactory_.get()));
    }
    return fp32_op_.get();
  }

  std::unique_ptr<OpWrapper<GatherOp<CPUContext>, T>> fp32_op_;
  bool dequantize_output_{false}, measure_quantization_error_{false};

  std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;

  dnnlowp::QuantizationErrorStats quantization_error_stats_;

  bool arguments_parsed_{false};
}; // class GatherDNNLowPOp

namespace internal {

// AVX2 kernel that adds two quantized tensors element-wise and requantizes
// the result to the output scale/zero point (optionally fusing a ReLU).
template <typename T, bool ReluFused>
void ElementWiseSumAVX2(
    const T* input0,
    const T* input1,
    T* output,
    int len,
    float a_scale,
    int32_t a_zero_point,
    float b_scale,
    int32_t b_zero_point,
    float c_scale,
    int32_t c_zero_points);

} // namespace internal

} // namespace caffe2
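The header only declares ElementWiseSumAVX2; the vectorized implementation lives in the accompanying .cc file. As a rough scalar reference for what a kernel with these parameters computes, the sketch below dequantizes both inputs, adds them, optionally applies the fused ReLU, and requantizes into the output's scale and zero point. This is an illustrative approximation of the usual dequantize-add-requantize pattern, not the actual AVX2 code; the function name ElementWiseSumScalarSketch is hypothetical.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Illustrative scalar analogue of ElementWiseSumAVX2 (assumption: the real
// AVX2 kernel follows the usual dequantize-add-requantize pattern).
template <typename T, bool ReluFused>
void ElementWiseSumScalarSketch(
    const T* input0,
    const T* input1,
    T* output,
    int len,
    float a_scale,
    int32_t a_zero_point,
    float b_scale,
    int32_t b_zero_point,
    float c_scale,
    int32_t c_zero_point) {
  constexpr float kMin = static_cast<float>(std::numeric_limits<T>::min());
  constexpr float kMax = static_cast<float>(std::numeric_limits<T>::max());
  for (int i = 0; i < len; ++i) {
    // Dequantize both inputs to real values.
    float a = a_scale * (static_cast<int32_t>(input0[i]) - a_zero_point);
    float b = b_scale * (static_cast<int32_t>(input1[i]) - b_zero_point);
    float sum = a + b;
    if (ReluFused) {
      sum = std::max(sum, 0.0f); // fused ReLU on the real-valued sum
    }
    // Requantize into the output's quantization parameters and saturate.
    float q = std::nearbyint(sum / c_scale) + static_cast<float>(c_zero_point);
    output[i] = static_cast<T>(std::min(kMax, std::max(kMin, q)));
  }
}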
Cross references:
CPUContext (context.h:40) — the CPU context, representing the bare minimum of what a Context class in Caffe2 should implement.
Workspace (workspace.h:47) — a class that holds all the related objects created during runtime: (1) all blobs...
Input (operator.h:702) — const Tensor& Input(int idx, DeviceType type = CPUContext::GetDeviceType()): retrieves a non-owning reference to the input at position 'idx' for this operator.
caffe2 (blob.h:13) — a global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
DNNLowPOp (dnnlowp_op.h:77) — a convenient base class for C2 operators with the DNNLOWP engine.
OpWrapper (op_wrapper.h:15) — wraps a floating-point operator with quantized inputs of type T.
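This header only declares the operator classes; registration for the DNNLOWP engine happens in the accompanying .cc file. As a rough illustration (not taken from this file), the sketch below shows how a quantized Sum could be driven through the standard Workspace/OperatorDef API, assuming the DNNLOWP registration is in place and that the quantized input blobs (the names X0_q, X1_q, Y_q are made up) have already been populated as int8::Int8TensorCPU with their scale and zero point set.

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

void RunQuantizedSumSketch(caffe2::Workspace* ws) {
  // Assumption: "X0_q" and "X1_q" already hold quantized tensors in the
  // workspace; all blob names here are hypothetical.
  caffe2::OperatorDef def;
  def.set_type("Sum");
  def.add_input("X0_q");
  def.add_input("X1_q");
  def.add_output("Y_q");
  def.set_engine("DNNLOWP"); // dispatch to the quantized SumDNNLowPOp
  CAFFE_ENFORCE(ws->RunOperatorOnce(def));
}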