Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_dnnlowp_op.h
1 #pragma once
2 
3 #include <fbgemm/Fbgemm.h>
4 #include <fbgemm/src/FbgemmI8DepthwiseAvx2.h>
5 #include "caffe2/operators/conv_op.h"
6 #include "caffe2/operators/conv_pool_op_base.h"
7 #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
8 #include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
9 #include "caffe2/quantization/server/dnnlowp.h"
10 #include "caffe2/quantization/server/op_wrapper.h"
11 
12 namespace caffe2 {
13 
// Alias for ConvOp instantiated with float and CPUContext. It is used below
// as the second template argument of ConvPoolDNNLowPOpBase and of
// USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS.
14 using ConvFp32Op = ConvOp<float, CPUContext>;
15 
16 // Convolutional layer computed in integer with quantization
    //
    // Template parameters:
    //   T         - element type of the quantized data; its signed counterpart
    //               (T_signed) is used for weights, see below.
    //   ReluFused - compile-time flag, false by default. Presumably selects a
    //               variant with ReLU folded into the output; the header alone
    //               does not show how it is consumed - TODO confirm in the .cc.
    //
    // Derives from ConvPoolDNNLowPOpBase<T, ConvFp32Op>. Only declarations are
    // visible here; every member function except Acc16() is defined elsewhere.
17 template <typename T, bool ReluFused = false>
18 class ConvDNNLowPOp : public ConvPoolDNNLowPOpBase<T, ConvFp32Op> {
19  public:
20  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
21  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, ConvFp32Op);
    // Constructed from an operator definition and the workspace it runs in.
22  ConvDNNLowPOp(const OperatorDef& operator_def, Workspace* ws);
    // NOTE(review): declared `virtual` rather than `override`; presumably the
    // base destructor is already virtual - confirm against the base class.
23  virtual ~ConvDNNLowPOp();
24 
25  protected:
    // Entry points overridden from the base, one per storage order named in
    // the method (NCHW / NHWC). Each returns bool.
26  bool RunOnDeviceWithOrderNCHW() override;
27  bool RunOnDeviceWithOrderNHWC() override;
28 
    // NOTE(review): the listing's embedded line numbers skip 29 and 31-34
    // here; whatever the original header has on those lines is not visible.
30 
    // Helper predicates / queries. Only IsConvGEMM_ is const-qualified.
35  bool IsConvGEMM_() const;
36  bool NoIm2ColNHWC_();
37  int KernelDim_();
38 
    // Takes a column-buffer tensor and returns a pointer to const T data.
    // Presumably the returned pointer refers to the im2col result (or to the
    // input directly when im2col is skipped) - TODO confirm in the .cc.
39  const T* Im2ColNHWC_(Tensor* col_buffer);
40 
    // Per-group accessors returning mutable references. The backing storage
    // is presumably filter_qparams_ / requantization_params_ declared below.
41  dnnlowp::TensorQuantizationParams& FilterQuantizationParams(int group_id);
42  dnnlowp::RequantizationParams& RequantizationParams(int group_id);
43 
    // Static helper taking four int* arguments followed by four int values.
    // Judging by the names, the pointers are outputs describing the
    // [group_begin, group_end) x [i_begin, i_end) slice assigned to thread
    // `thread_id` out of `nthreads` - assumption, verify against the
    // definition.
44  static void PartitionGroupedNHWCConv_(
45  int* group_begin,
46  int* group_end,
47  int* i_begin,
48  int* i_end,
49  int num_groups,
50  int m,
51  int nthreads,
52  int thread_id);
53 
    // Virtual hook that returns false in this class; subclasses may override.
    // Presumably reports whether 16-bit accumulation is in use - TODO confirm.
54  virtual bool Acc16() const {
55  return false;
56  }
57 
    // Scratch tensors, all constructed for CPU.
58  Tensor col_buffer_{CPU};
59  Tensor img_shape_device_{CPU};
60  Tensor col_buffer_shape_device_{CPU};
61 
62  // Input: X, W, b
63  // Output: Y
64  INPUT_TAGS(INPUT, FILTER, BIAS);
65 
66  // x86 only provides SIMD instructions that multiply a signed integer with an
67  // unsigned integer. We use signed for weights.
68  using T_signed = typename std::make_signed<T>::type;
69 
70  // used in slow path for T != uint8_t
71  std::vector<T_signed> W_quantized_;
72 
73  // pre-computed biases and offsets
    // column_offsets_ is held through a shared_ptr while row_offsets_ is owned
    // by value. b_quantized_data_ is a raw, non-owning-looking pointer that
    // defaults to nullptr; it presumably points into b_quantized_ (private,
    // below) or an externally supplied bias - TODO confirm.
74  std::shared_ptr<std::vector<std::int32_t>> column_offsets_;
75  std::vector<std::int32_t> row_offsets_;
76  const std::int32_t* b_quantized_data_{nullptr};
77 
    // Byte buffer; by its name, scratch space for packing the input X.
78  std::vector<std::uint8_t> X_pack_buf_;
79 
    // Epilogue steps for the two storage orders. Both take the column-buffer
    // data and an int32 buffer; the NCHW variant additionally takes the output
    // pointer, an offset and a group id.
80  void RunOnDeviceEpilogueNCHW_(
81  const T* col_buffer_data,
82  std::int32_t* Y_int32,
83  T* Y_data,
84  std::size_t i_offset,
85  int group_id);
86  void RunOnDeviceEpilogueNHWC_(
87  const T* col_buffer_data,
88  std::int32_t* Y_int32);
89 
    // int32 output buffer plus filter quantization state (parameters and zero
    // points kept as separate vectors).
90  std::vector<std::int32_t> Y_int32_;
91  std::vector<dnnlowp::TensorQuantizationParams> filter_qparams_;
92  std::vector<std::int32_t> filter_zero_points_;
93 
94  std::vector<float> requantization_multipliers_;
    // NOTE(review): no default member initializer here, unlike
    // b_quantized_data_ and in_qparams_scale_old_; presumably assigned in the
    // constructor (defined elsewhere) - confirm.
95  bool quantize_groupwise_;
96 
97  private:
    // One-time preparation steps (names suggest weight quantization, offset
    // pre-computation and bias quantization); definitions are not in view.
98  void QuantizeWeight_();
99  void PreComputeRowColumnOffsets_();
100  void QuantizeBias_();
101 
    // Predicates choosing among specialized paths; each one has a matching
    // packed-weight member further down.
102  bool TakeDepthWise3x3FastPath_();
103  bool TakeDepthWise3x3x3FastPath_();
104  bool TakeGConvFastPath_();
105 
    // Templated on the packed-A matrix type and on the fbgemm quantization
    // granularity.
    // NOTE(review): `vector` and `uint8_t` are unqualified in the next two
    // declarations while the rest of the class spells std::vector and
    // std::uint8_t; this relies on those names being visible in this scope.
    // Consider qualifying them for consistency.
106  template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
107  void DispatchFBGEMM_(
108  PackAMatrix& packA,
109  vector<std::int32_t>* Y_int32,
110  uint8_t* Y_uint8_data);
111 
112  void ConvNHWCCore_(const T* col_buffer_data, vector<std::int32_t>* Y_int32);
113 
    // Presumably the storage behind RequantizationParams(group_id) above.
114  std::vector<dnnlowp::RequantizationParams> requantization_params_;
115 
116  // used in fast path for T == uint8_t
117  std::shared_ptr<fbgemm::PackBMatrix<std::int8_t>> Wq_packed_;
118 
119  // For depthwise 3x3 conv
120  std::shared_ptr<fbgemm::Packed3x3ConvMatrix> Wq_depthwise_3x3_packed_;
121  // For depthwise 3x3x3 conv
122  std::shared_ptr<fbgemm::Packed3x3x3ConvMatrix> Wq_depthwise_3x3x3_packed_;
123  // For small gconv
124  std::shared_ptr<fbgemm::PackWeightMatrixForGConv<std::int8_t>>
125  Wq_gconv_packed_;
126 
127  // pre-computed biases and offsets
128  std::shared_ptr<std::vector<std::int32_t>> b_quantized_;
129 
    // Defaults to 0. By its name, the input quantization scale seen
    // previously, presumably kept to detect a change - TODO confirm usage.
130  float in_qparams_scale_old_ = 0;
131 }; // class ConvDNNLowPOp
132 
133 } // namespace caffe2
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Definition: context.h:40
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13