Caffe2 - C++ API
A deep learning, cross platform ML framework
conv_op.cc
#include <mutex> // std::once_flag / std::call_once used in initNNPACK()

#include "caffe2/core/common.h"

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_pool_op_base.h"

#include "caffe2/utils/math.h"
#include "caffe2/utils/threadpool/pthreadpool_impl.h"
#include "nnpack.h"

CAFFE2_DEFINE_bool(
    caffe2_profile_nnpack,
    false,
    "If set, log per-call NNPACK convolution profiling information.");
namespace caffe2 {

void initNNPACK() {
  static std::once_flag once;
  std::call_once(once, []() {
    enum nnp_status nnpack_status = nnp_initialize();
    CAFFE_ENFORCE(
        nnpack_status == nnp_status_success, "NNPack is not supported here!");
  });
}
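
// nnp_initialize() probes the CPU for the SIMD features NNPACK needs and
// returns an unsupported-hardware status when they are missing, which is
// what trips the ENFORCE above; std::call_once keeps the initialization
// thread-safe and one-time.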

// Definitions

class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algorithm_(getConvolutionAlgorithm()),
        transformStrategy_(getConvolutionTransformStrategy()),
        ws_(ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding a "
        "TransposeOp with axes=[0, 3, 1, 2] before the NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        pad_t() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_b() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_l() < kernel_w(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_r() < kernel_w(), "NNPACK only supports pad < kernel size");

    createSharedBuffer<CPUContext>(ws);
  }

  bool RunOnDeviceWithOrderNCHW() override;

 private:
  nnp_convolution_algorithm getConvolutionAlgorithm() const;
  nnp_convolution_transform_strategy getConvolutionTransformStrategy() const;

  const nnp_convolution_algorithm algorithm_;
  // Modified after precomputing the kernels. State transitions are:
  // - precompute -> (first call to Run()) -> reuse (on successful precompute)
  //                                       -> compute (on failed precompute)
  // - compute (stays compute)
  nnp_convolution_transform_strategy transformStrategy_;
  Workspace* ws_;
  // Per-group transformed filters
  std::vector<TensorCPU*> transformedFilters_;
};

// Implementations

nnp_convolution_algorithm NNPACKConvOp::getConvolutionAlgorithm() const {
  if (!OperatorBase::HasSingleArgumentOfType<std::string>("algo")) {
    // No preference is stated. The best heuristic for mobile devices differs
    // from NNPACK's built-in one: Winograd tends to be a lot faster there,
    // so use it whenever the convolution is 3x3 with unit dilation and
    // unit stride (3x3d1s1).
    if (kernel_h() == 3 && kernel_w() == 3 && dilation_h() == 1 &&
        dilation_w() == 1 && stride_h() == 1 && stride_w() == 1) {
      // use Winograd
      return nnp_convolution_algorithm_wt8x8;
    }

    return nnp_convolution_algorithm_auto;
  }

  // Otherwise, there is a preference.
  auto algo = OperatorBase::GetSingleArgument<std::string>("algo", "AUTO");
  if (algo == "AUTO") {
    return nnp_convolution_algorithm_auto;
  }
  if (algo == "WINOGRAD") {
    return nnp_convolution_algorithm_wt8x8;
  }
  if (algo == "WINOGRAD_FP16") {
    return nnp_convolution_algorithm_wt8x8_fp16;
  }
  if (algo == "FT16") {
    return nnp_convolution_algorithm_ft16x16;
  }
  if (algo == "FT8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  if (algo == "IMPLICIT_GEMM") {
    return nnp_convolution_algorithm_implicit_gemm;
  }
  if (algo == "DIRECT") {
    return nnp_convolution_algorithm_direct;
  }
  return nnp_convolution_algorithm_auto;
}

nnp_convolution_transform_strategy
NNPACKConvOp::getConvolutionTransformStrategy() const {
  auto kts = OperatorBase::GetSingleArgument<std::string>(
      "convolution_transform_strategy", "COMPUTE");
  if (kts == "PRECOMPUTE") {
    return nnp_convolution_transform_strategy_precompute;
  }
  // Default to computing each time.
  return nnp_convolution_transform_strategy_compute;
}
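
// Example (a sketch, not part of this file): selecting the engine, algorithm,
// and transform strategy on an OperatorDef. The argument strings match what
// the two getters above parse; the blob names are hypothetical.
//
//   OperatorDef def;
//   def.set_type("Conv");
//   def.add_input("X"); def.add_input("W"); def.add_input("b");
//   def.add_output("Y");
//   def.set_engine("NNPACK");
//   AddArgument<std::string>("algo", "WINOGRAD", &def);
//   AddArgument<std::string>(
//       "convolution_transform_strategy", "PRECOMPUTE", &def);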

bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& filter = Input(1);
  auto& bias = Input(2);
  auto* Y = Output(0);
  CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE_EQ(filter.ndim(), 4);
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(C % this->group_ == 0, "");
  CAFFE_ENFORCE(M % this->group_ == 0, "");
  CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
  CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
  CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
  CAFFE_ENFORCE(bias.ndim() == 1, "");
  CAFFE_ENFORCE(bias.dim32(0) == M, "");
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  const int oH = Y->dim32(2), oW = Y->dim32(3);
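
  // For reference (a sketch, assuming unit dilation): SetOutputSize computes
  // the usual convolution output shape,
  //   oH = (H + pad_t() + pad_b() - kernel_h()) / stride_h() + 1
  //   oW = (W + pad_l() + pad_r() - kernel_w()) / stride_w() + 1
  // e.g. H = 32, kernel 3, stride 1, pad 1 on each side gives oH = 32.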

  const size_t batch_size = X.dim32(0);
  const size_t input_channels = X.dim32(1);
  const size_t output_channels = Y->dim32(1);
  const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                               .height = static_cast<size_t>(X.dim32(2))};
  // filter is MCHW
  const nnp_size kernel_size = {.width = static_cast<size_t>(filter.dim32(3)),
                                .height = static_cast<size_t>(filter.dim32(2))};
  // pad is tblr
  const nnp_padding padding = {.top = static_cast<size_t>(pad_t()),
                               .right = static_cast<size_t>(pad_r()),
                               .bottom = static_cast<size_t>(pad_b()),
                               .left = static_cast<size_t>(pad_l())};

  const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
                                     .height = static_cast<size_t>(stride_h())};
  initNNPACK();
  pthreadpool pool(ws_->GetThreadPool());

  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor<CPUContext>* buffer) {
    if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
      transformedFilters_.resize(group_);

      size_t transformedFilterSize = 0;
      nnp_status status = nnp_convolution_inference(
          algorithm_,
          nnp_convolution_transform_strategy_precompute,
          C / group_,
          M / group_,
          input_size,
          padding,
          kernel_size,
          output_subsample,
          nullptr /* input */,
          nullptr /* filters */,
          nullptr /* bias */,
          nullptr /* output */,
          nullptr /* workspace buffer = transformed filter */,
          &transformedFilterSize,
          nnp_activation_identity,
          nullptr /* activation parameter */,
          &pool,
          nullptr /* profile */);
      if (status == nnp_status_success) {
        /* For these convolution parameters filter transforms can be
         * pre-computed */

        /* Division with rounding up, in case size is not a multiple of
         * sizeof(float) */
        const size_t transformedFilterElements =
            (transformedFilterSize + sizeof(float) - 1) / sizeof(float);
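        /* e.g. transformedFilterSize = 10 and sizeof(float) = 4 gives
         * (10 + 3) / 4 = 3 elements, i.e. 12 bytes >= 10 */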

        for (auto g = 0; g < group_; g++) {
          transformedFilters_[g] =
              ws_->CreateBlob(
                     debug_def().name() + "_transformed_" + to_string(g))
                  ->GetMutable<TensorCPU>();
          transformedFilters_[g]->Resize(transformedFilterElements);

          status = nnp_convolution_inference(
              algorithm_,
              nnp_convolution_transform_strategy_precompute,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              filter.template data<float>() + filter.size() / group_ * g,
              nullptr /* bias */,
              nullptr /* output */,
              static_cast<void*>(
                  transformedFilters_[g]->template mutable_data<float>()),
              &transformedFilterSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              &pool,
              nullptr /* profile */);
          CAFFE_ENFORCE(
              nnp_status_success == status,
              "NNPACK convolution filter pre-transformation returned an error");
        }

        /*
         * Now we've precomputed all our filter transformations. Switch to the
         * reuse strategy to avoid doing the transformation again on the next
         * iteration.
         */
        if (transformStrategy_ ==
            nnp_convolution_transform_strategy_precompute) {
          CAFFE_ENFORCE_EQ(transformedFilters_.size(), group_);
          transformStrategy_ = nnp_convolution_transform_strategy_reuse;
        }
      } else {
        LOG(WARNING)
            << "Failed to query workspace size to precompute kernels, "
               "falling back to the re-compute strategy";
        transformStrategy_ = nnp_convolution_transform_strategy_compute;
      }

      // Enforce that we have transitioned out of the precompute state by the
      // time we leave this block.
      CAFFE_ENFORCE(
          transformStrategy_ != nnp_convolution_transform_strategy_precompute);
    }

    CAFFE_ENFORCE(
        transformStrategy_ == nnp_convolution_transform_strategy_reuse ||
        transformStrategy_ == nnp_convolution_transform_strategy_compute);
    for (auto n = 0; n < N; ++n) {
      for (auto g = 0; g < group_; ++g) {
        nnp_profile profile;
        size_t workspaceSize = buffer->nbytes();
        if (workspaceSize == 0) {
          /* Allocate some memory to ensure the buffer pointer is not NULL.
           * This simplifies further logic. */
          buffer->Resize(1);
          workspaceSize = buffer->nbytes();
        }
        nnp_status status = nnp_convolution_inference(
            algorithm_,
            transformStrategy_,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X.template data<float>() + n * C * H * W + g * H * W * (C / group_),
            transformStrategy_ == nnp_convolution_transform_strategy_reuse
                ? transformedFilters_[g]->template data<float>()
                : filter.template data<float>() + filter.size() / group_ * g,
            bias.template data<float>() + bias.size() / group_ * g,
            Y->template mutable_data<float>() + n * oH * oW * M +
                g * oH * oW * (M / group_),
            static_cast<void*>(buffer->template mutable_data<float>()),
            &workspaceSize,
            nnp_activation_identity,
            nullptr /* activation parameter */,
            &pool,
            FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
        if (status == nnp_status_insufficient_buffer) {
          /* Query the required workspace size, increase the buffer, and try
           * again */
          status = nnp_convolution_inference(
              algorithm_,
              transformStrategy_,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              nullptr /* filters */,
              nullptr /* bias */,
              nullptr /* output */,
              nullptr /* workspace buffer */,
              &workspaceSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              &pool,
              nullptr /* profile */);
          if (status == nnp_status_success) {
            /* Division with rounding up, in case size is not a multiple of
             * sizeof(float) */
            const size_t workspace_elements =
                (workspaceSize + sizeof(float) - 1) / sizeof(float);
            buffer->Resize(workspace_elements);

            /* Try convolution_inference again. If it fails this time, it is
             * fatal. */
            status = nnp_convolution_inference(
                algorithm_,
                transformStrategy_,
                C / group_,
                M / group_,
                input_size,
                padding,
                kernel_size,
                output_subsample,
                X.template data<float>() + n * C * H * W +
                    g * H * W * (C / group_),
                transformStrategy_ == nnp_convolution_transform_strategy_reuse
                    ? transformedFilters_[g]->template data<float>()
                    : filter.template data<float>() +
                        filter.size() / group_ * g,
                bias.template data<float>() + bias.size() / group_ * g,
                Y->template mutable_data<float>() + n * oH * oW * M +
                    g * oH * oW * (M / group_),
                static_cast<void*>(buffer->template mutable_data<float>()),
                &workspaceSize,
                nnp_activation_identity,
                nullptr /* activation parameter */,
                &pool,
                FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
          }
        }

        VLOG(1) << "NNPACK buffer size: " << buffer->nbytes();
        CAFFE_ENFORCE(
            nnp_status_success == status,
            "NNPACK convolution computation returned an error");
        if (FLAGS_caffe2_profile_nnpack) {
          // Local text buffer for the profile log (distinct from the shared
          // workspace tensor named buffer above).
          char logBuffer[1024];
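          // MAC count for this single (image, group) call: each of the
          // oH * oW * (M / group_) outputs accumulates over
          // (C / group_) * kernel_h() * kernel_w() inputs, hence the two
          // divisions by group_. One MAC is two FLOPs and profile.total is
          // in seconds, so gflops below is the achieved GFLOP/s of the call.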
          const double gmacs =
              double(
                  Y->dim32(2) * Y->dim32(3) * Y->dim32(1) * X.dim32(1) *
                  kernel_size.width * kernel_size.height / group_ / group_) /
              1.0E9;
          const double gflops = 2 * gmacs / profile.total;
          auto ret = snprintf(
              logBuffer,
              sizeof(logBuffer),
              "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, P: %1zu, GMACs: "
              "%4.2f, totalT: %6.3f, inputT: %6.3f, "
              "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
              size_t(X.dim(2)),
              size_t(X.dim(3)),
              size_t(X.dim(1)),
              size_t(Y->dim(1)),
              size_t(kernel_size.width),
              size_t(output_subsample.width),
              size_t(padding.top),
              gmacs,
              profile.total * 1E3,
              profile.input_transform * 1E3,
              profile.kernel_transform * 1E3,
              profile.block_multiplication * 1E3,
              profile.output_transform * 1E3,
              gflops);
          CAFFE_ENFORCE(ret > 0);
          LOG(INFO) << logBuffer;
        }
      }
    }
  });
  return true;
}

REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
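
// With this registration, any "Conv" operator whose definition sets
// engine="NNPACK" is dispatched to NNPACKConvOp instead of the default
// CPU convolution implementation.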

} // namespace caffe2