Caffe2 - C++ API
A deep learning, cross platform ML framework
conv_dnnlowp_acc16_op.h
#pragma once

#include "caffe2/quantization/server/conv_dnnlowp_op.h"
#include "fbgemm/Fbgemm.h"

namespace caffe2 {

// Quantized Conv operator with 16-bit accumulation.
template <bool ReluFused = false>
class ConvDNNLowPAcc16Op final : public ConvDNNLowPOp<std::uint8_t, ReluFused> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  ConvDNNLowPAcc16Op(const OperatorDef& operator_def, Workspace* ws);

  using BaseType = ConvDNNLowPOp<std::uint8_t, ReluFused>;
  using BaseType::BIAS;
  using BaseType::col_buffer_;
  using BaseType::FILTER;
  using BaseType::in_qparams_;
  using BaseType::INPUT;
  using BaseType::InputTensorCPU_;
  using BaseType::out_qparams_;
  using BaseType::OutputTensorCPU_;
  using BaseType::row_offsets_;
  using BaseType::W_quantized_;
  using BaseType::X_pack_buf_;
  using BaseType::Y_int32_;

 private:
  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

  bool GetQuantizationParameters_();

  template <fbgemm::QuantizationGranularity Q_GRAN>
  void DispatchFBGEMM_(
      fbgemm::PackAWithRowOffset<std::uint8_t, std::int16_t>& packA,
      const std::uint8_t* col_buffer_data,
      vector<std::int32_t>* Y_int32,
      uint8_t* Y_uint8_data);

  void ConvOutlier_(
      const std::uint8_t* col_buffer,
      vector<std::int32_t>* Y_int32);

  virtual bool Acc16() const override {
    return !fallback_to_32_bit_accumulation_;
  }

  std::shared_ptr<fbgemm::PackBMatrix<std::int8_t, std::int16_t>>
      Wq_acc16_packed_;

  // Wq outlier in CSC format
  std::shared_ptr<fbgemm::CompressedSparseColumn> Wq_outlier_;

  // Threshold to decide whether a weight is an outlier.
  // For example, if nbits_in_non_outlier_ == 7, w is an outlier if w < -64 or
  // w >= 64.
  // nbits_in_non_outlier_ == 0 means every weight is an outlier.
  // nbits_in_non_outlier_ == 8 means no weight is an outlier.
  int nbits_in_non_outlier_;
  int copy_to_32bit_frequency_;

  bool first_invocation_{true};
  // If the outlier matrix is not sparse enough, 16-bit accumulation won't give
  // a speedup because the overhead of the sparse matrix multiplication or
  // sparse convolution outweighs it, so fall back to 32-bit accumulation.
  bool fallback_to_32_bit_accumulation_{false};
}; // class ConvDNNLowPAcc16Op

} // namespace caffe2
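
The nbits_in_non_outlier_ threshold above determines how the quantized weights are split between the densely packed matrix that is multiplied with 16-bit accumulation (Wq_acc16_packed_) and the sparse outlier matrix that is handled separately (Wq_outlier_). The following is a minimal, self-contained sketch of that splitting rule, not the actual Caffe2/FBGEMM code; SplitOutliers and SparseEntry are illustrative names introduced here.

// A standalone sketch of the outlier split controlled by nbits_in_non_outlier_
// (illustrative only; SplitOutliers/SparseEntry are not Caffe2 or FBGEMM APIs).
#include <cstdint>
#include <cstdio>
#include <vector>

struct SparseEntry {
  int row;
  int col;
  std::int8_t value;
};

// Zeroes out every weight that does not fit in nbits_in_non_outlier bits and
// returns those "outliers" as sparse entries; the remaining dense matrix is
// what would be packed for the 16-bit-accumulation GEMM.
std::vector<SparseEntry> SplitOutliers(
    std::vector<std::int8_t>& W,
    int rows,
    int cols,
    int nbits_in_non_outlier) {
  std::vector<SparseEntry> outliers;
  if (nbits_in_non_outlier >= 8) {
    return outliers; // 8 bits: no weight is an outlier
  }
  // With nbits_in_non_outlier == 7, w is an outlier if w < -64 or w >= 64;
  // with 0, the bound is 0 and every weight is an outlier.
  const int bound =
      nbits_in_non_outlier > 0 ? (1 << (nbits_in_non_outlier - 1)) : 0;
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::int8_t& w = W[i * cols + j];
      if (w < -bound || w >= bound) {
        outliers.push_back({i, j, w});
        w = 0; // handled by the sparse path instead
      }
    }
  }
  return outliers;
}

int main() {
  std::vector<std::int8_t> W = {3, -120, 64, -64, 5, 100};
  std::vector<SparseEntry> outliers =
      SplitOutliers(W, /*rows=*/2, /*cols=*/3, /*nbits_in_non_outlier=*/7);
  std::printf("%zu outliers\n", outliers.size()); // -120, 64, 100 -> 3
  return 0;
}

With nbits_in_non_outlier_ == 7, only weights outside [-64, 64) end up in the sparse matrix; the sparser that matrix turns out to be, the more the 16-bit path pays off, which is what the fallback_to_32_bit_accumulation_ check above guards.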
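
copy_to_32bit_frequency_ addresses the main risk of 16-bit accumulation: the int16 partial sums can saturate, so they are periodically spilled into 32-bit accumulators. The sketch below illustrates the idea on a plain dot product; DotAcc16 and the chosen parameter values are made up for illustration, while the real work happens inside FBGEMM's packed SIMD kernels.

// A sketch of periodic spilling from 16-bit to 32-bit accumulators
// (illustrative only; DotAcc16 is not a Caffe2 or FBGEMM API).
#include <cstdint>
#include <cstdio>
#include <vector>

std::int32_t DotAcc16(
    const std::vector<std::uint8_t>& a,
    const std::vector<std::int8_t>& w,
    int copy_to_32bit_frequency) {
  std::int32_t acc32 = 0;
  std::int16_t acc16 = 0;
  for (std::size_t i = 0; i < a.size(); ++i) {
    // A single uint8 x int8 product always fits in int16; it is the running
    // sum that can saturate.
    acc16 = static_cast<std::int16_t>(
        acc16 + static_cast<std::int16_t>(a[i]) * w[i]);
    // Spill the 16-bit partial sum into the 32-bit accumulator often enough
    // that it never gets close to overflowing.
    if ((i + 1) % copy_to_32bit_frequency == 0) {
      acc32 += acc16;
      acc16 = 0;
    }
  }
  return acc32 + acc16;
}

int main() {
  std::vector<std::uint8_t> a(256, 100);
  std::vector<std::int8_t> w(256, 50);
  // The full sum 256 * 100 * 50 = 1280000 overflows int16 but not int32.
  std::printf(
      "%d\n", static_cast<int>(DotAcc16(a, w, /*copy_to_32bit_frequency=*/4)));
  return 0;
}

The spill frequency has to be chosen small enough, given the weight magnitudes left in the dense matrix after the outlier split, that the int16 partial sum cannot overflow between spills.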