Caffe2 - C++ API
A deep learning, cross platform ML framework
conv_op.cc
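NNPACK-based convolution operator. It is registered at the bottom of this file as the "NNPACK" engine for the CPU Conv operator, and supports NCHW storage order only.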
#include <iostream>
#include <mutex>
#include <string>
#include <vector>

#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"

#include "nnpack.h"
C10_DEFINE_bool(caffe2_profile_nnpack, false, "");
namespace caffe2 {

void initNNPACK() {
  static std::once_flag once;
  std::call_once(once, []() {
    enum nnp_status nnpack_status = nnp_initialize();
    CAFFE_ENFORCE(
        nnpack_status == nnp_status_success, "NNPACK is not supported here!");
  });
}

////////////////////////////////////////////////////////////////////////////////
// Definitions
////////////////////////////////////////////////////////////////////////////////
class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algorithm_(getConvolutionAlgorithm()),
        activation_(getActivationType()),
        transformStrategy_(getConvolutionTransformStrategy()),
        ws_(ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPACK only supports NCHW order. Please consider adding a "
        "TransposeOp with axes=[0, 3, 1, 2] before the NNPACK Conv.");
    OPERATOR_NEEDS_FEATURE(
        pad_t() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_b() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_l() < kernel_w(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_r() < kernel_w(), "NNPACK only supports pad < kernel size");

    createSharedBuffer<CPUContext>(ws);
  }

  bool RunOnDeviceWithOrderNCHW() override;

 private:
  nnp_convolution_algorithm getConvolutionAlgorithm() const;
  nnp_convolution_transform_strategy getConvolutionTransformStrategy() const;
  nnp_activation getActivationType() const;

  const nnp_convolution_algorithm algorithm_;
  const nnp_activation activation_;
  // Modified after precomputing the kernels. State transitions are:
  // - precompute -> (first call to Run()) -> reuse (if precompute succeeded)
  //                                       -> compute (if precompute failed)
  // - compute -> compute (filters are transformed on every call)
  nnp_convolution_transform_strategy transformStrategy_;
  Workspace* ws_;
  // Per-group transformed filters
  std::vector<TensorCPU*> transformedFilters_;
  // Zero-filled bias for convolutions without a bias input; needed because
  // the NNPACK interface always expects a convolution with bias.
  std::vector<float> dummyBias_;
};

////////////////////////////////////////////////////////////////////////////////
// Implementations
////////////////////////////////////////////////////////////////////////////////

nnp_convolution_algorithm NNPACKConvOp::getConvolutionAlgorithm() const {
  if (!OperatorBase::HasSingleArgumentOfType<std::string>("algo")) {
    // No preference is stated. Heuristics for the best algorithm on
    // mobile devices differ from NNPACK's own, as Winograd tends to be
    // a lot faster there. Use Winograd if the convolution is 3x3 with
    // dilation 1 and stride 1 (3x3d1s1).
    if (kernel_h() == 3 && kernel_w() == 3 && dilation_h() == 1 &&
        dilation_w() == 1 && stride_h() == 1 && stride_w() == 1) {
      // Use Winograd.
      return nnp_convolution_algorithm_wt8x8;
    }

    return nnp_convolution_algorithm_auto;
  }

  // Otherwise, there is a preference.
  auto algo = OperatorBase::GetSingleArgument<std::string>("algo", "AUTO");
  if (algo == "AUTO") {
    return nnp_convolution_algorithm_auto;
  }
  if (algo == "WINOGRAD") {
    return nnp_convolution_algorithm_wt8x8;
  }
  if (algo == "WINOGRAD_FP16") {
    return nnp_convolution_algorithm_wt8x8_fp16;
  }
  if (algo == "FT16") {
    return nnp_convolution_algorithm_ft16x16;
  }
  if (algo == "FT8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  if (algo == "IMPLICIT_GEMM") {
    return nnp_convolution_algorithm_implicit_gemm;
  }
  if (algo == "DIRECT") {
    return nnp_convolution_algorithm_direct;
  }
  // Unrecognized value: fall back to letting NNPACK decide.
  return nnp_convolution_algorithm_auto;
}

nnp_convolution_transform_strategy
NNPACKConvOp::getConvolutionTransformStrategy() const {
  auto kts = OperatorBase::GetSingleArgument<std::string>(
      "convolution_transform_strategy", "COMPUTE");
  if (kts == "PRECOMPUTE") {
    return nnp_convolution_transform_strategy_precompute;
  }
  // Default to computing the transforms each time.
  return nnp_convolution_transform_strategy_compute;
}

nnp_activation NNPACKConvOp::getActivationType() const {
  auto activation =
      OperatorBase::GetSingleArgument<std::string>("activation", "identity");
  if (activation == "identity") {
    return nnp_activation_identity;
  } else if (activation == "Relu") {
    return nnp_activation_relu;
  } else {
    CAFFE_THROW("unsupported activation type \"", activation, "\"");
  }
}
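
The three getters above map plain string arguments ("algo", "convolution_transform_strategy", "activation") from the operator definition onto NNPACK enums. A minimal sketch of setting these arguments when constructing an OperatorDef by hand; the blob names X, W, b, and Y are illustrative, and the ConvPoolOpBase base class additionally consumes the usual kernel/stride/pad arguments:

caffe2::OperatorDef def;
def.set_type("Conv");
def.set_engine("NNPACK");  // dispatch to NNPACKConvOp via the engine registry
def.add_input("X");        // input activations, NCHW
def.add_input("W");        // filters, MCHW
def.add_input("b");        // optional bias; omit to use the zero-filled dummy
def.add_output("Y");
auto* kernel = def.add_arg();
kernel->set_name("kernel");
kernel->set_i(3);
auto* algo = def.add_arg();
algo->set_name("algo");
algo->set_s("WINOGRAD");   // AUTO, WINOGRAD, WINOGRAD_FP16, FT16, FT8,
                           // IMPLICIT_GEMM, or DIRECT (see above)
auto* act = def.add_arg();
act->set_name("activation");
act->set_s("Relu");        // fuse a ReLU; "identity" is the default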

bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
  /* Global variable with a unique ID of the pre-transformed kernel blob */
  volatile static uint32_t precomputed_transform_id = 0;

  auto& X = Input(0);
  auto& filter = Input(1);
  auto* Y = Output(0);
  CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(filter.ndim() == 4, "");
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(C % this->group_ == 0, "");
  CAFFE_ENFORCE(M % this->group_ == 0, "");
  CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
  CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
  CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  const int oH = Y->dim32(2), oW = Y->dim32(3);

  const float* biasData = nullptr;
  if (InputSize() == 3) {
    /* Convolution with bias */
    auto& bias = Input(2);
    CAFFE_ENFORCE(bias.ndim() == 1, "");
    CAFFE_ENFORCE(bias.dim32(0) == M, "");
    biasData = bias.template data<float>();
  } else {
    /* NNPACK interface requires a bias. Use a dummy zero-filled vector. */
    if (dummyBias_.size() != M) {
      dummyBias_.resize(M);
    }
    biasData = dummyBias_.data();
  }

  const size_t batch_size = X.dim32(0);
  const size_t input_channels = X.dim32(1);
  const size_t output_channels = Y->dim32(1);
  const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                               .height = static_cast<size_t>(X.dim32(2))};
  // filter is MCHW
  const nnp_size kernel_size = {.width = static_cast<size_t>(filter.dim32(3)),
                                .height = static_cast<size_t>(filter.dim32(2))};
  // pad is tblr
  const nnp_padding padding = {.top = static_cast<size_t>(pad_t()),
                               .right = static_cast<size_t>(pad_r()),
                               .bottom = static_cast<size_t>(pad_b()),
                               .left = static_cast<size_t>(pad_l())};

  const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
                                     .height = static_cast<size_t>(stride_h())};
  initNNPACK();
  pthreadpool_t pool = reinterpret_cast<pthreadpool_t>(ws_->GetThreadPool());

  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor* buffer) {
    if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
      transformedFilters_.resize(group_);

      size_t transformedFilterSize = 0;
      /* First pass: query the size of the transformed-filter buffer. */
      nnp_status status = nnp_convolution_inference(
          algorithm_,
          nnp_convolution_transform_strategy_precompute,
          C / group_,
          M / group_,
          input_size,
          padding,
          kernel_size,
          output_subsample,
          nullptr /* input */,
          nullptr /* filters */,
          nullptr /* bias */,
          nullptr /* output */,
          nullptr /* workspace buffer = transformed filter */,
          &transformedFilterSize,
          nnp_activation_identity,
          nullptr /* activation parameter */,
          pool,
          nullptr /* profile */);
      if (status == nnp_status_success) {
        /* For these convolution parameters, filter transforms can be
         * pre-computed */

        /* Division with rounding up, in case size is not a multiple of
         * sizeof(float) */
        const size_t transformedFilterElements =
            (transformedFilterSize + sizeof(float) - 1) / sizeof(float);

        for (auto g = 0; g < group_; g++) {
          transformedFilters_[g] = BlobGetMutableTensor(
              ws_->CreateBlob(
                  "__transformed_kernel_" +
                  to_string(
                      __sync_fetch_and_add(&precomputed_transform_id, 1))),
              CPU);
          transformedFilters_[g]->Resize(transformedFilterElements);

          /* Second pass: write the transformed filters for this group into
           * the newly created blob. */
          status = nnp_convolution_inference(
              algorithm_,
              nnp_convolution_transform_strategy_precompute,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              filter.template data<float>() + filter.size() / group_ * g,
              nullptr /* bias */,
              nullptr /* output */,
              static_cast<void*>(
                  transformedFilters_[g]->template mutable_data<float>()),
              &transformedFilterSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              pool,
              nullptr /* profile */);
          CAFFE_ENFORCE(
              nnp_status_success == status,
              "NNPACK convolution filter pre-transformation returned an error");
        }

        /*
         * Now that all filter transforms are precomputed, switch to the
         * reuse strategy to avoid redoing the transformation on the next
         * iteration.
         */
        if (transformStrategy_ ==
            nnp_convolution_transform_strategy_precompute) {
          CAFFE_ENFORCE_EQ(transformedFilters_.size(), group_);
          transformStrategy_ = nnp_convolution_transform_strategy_reuse;
        }
      } else {
        LOG(WARNING)
            << "Failed to query workspace size to precompute kernels, "
               "falling back to the re-compute strategy";
        transformStrategy_ = nnp_convolution_transform_strategy_compute;
      }

      // Enforce that we have transitioned out of the precompute state by
      // the time we leave this block.
      CAFFE_ENFORCE(
          transformStrategy_ != nnp_convolution_transform_strategy_precompute);
    }

    CAFFE_ENFORCE(
        transformStrategy_ == nnp_convolution_transform_strategy_reuse ||
        transformStrategy_ == nnp_convolution_transform_strategy_compute);
    // N, C, H, W, M, oH, and oW are captured by reference from the
    // enclosing scope.
    for (auto n = 0; n < N; ++n) {
      for (auto g = 0; g < group_; ++g) {
        nnp_profile profile;
        size_t workspaceSize = buffer->nbytes();
        if (workspaceSize == 0) {
          /* Allocate some memory to ensure that the buffer pointer is not
           * NULL. This simplifies further logic. */
          buffer->Resize(1);
          workspaceSize = buffer->nbytes();
        }
        nnp_status status = nnp_convolution_inference(
            algorithm_,
            transformStrategy_,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X.template data<float>() + n * C * H * W + g * H * W * (C / group_),
            transformStrategy_ == nnp_convolution_transform_strategy_reuse
                ? transformedFilters_[g]->template data<float>()
                : filter.template data<float>() + filter.size() / group_ * g,
            biasData + M / group_ * g,
            Y->template mutable_data<float>() + n * oH * oW * M +
                g * oH * oW * (M / group_),
            static_cast<void*>(buffer->template mutable_data<float>()),
            &workspaceSize,
            activation_,
            nullptr /* activation parameter */,
            pool,
            FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
        if (status == nnp_status_insufficient_buffer) {
          /* Query the required workspace size, enlarge the buffer, and try
           * again */
          status = nnp_convolution_inference(
              algorithm_,
              transformStrategy_,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              nullptr /* filters */,
              nullptr /* bias */,
              nullptr /* output */,
              nullptr /* workspace buffer */,
              &workspaceSize,
              activation_,
              nullptr /* activation parameter */,
              pool,
              nullptr /* profile */);
          if (status == nnp_status_success) {
            /* Division with rounding up, in case size is not a multiple of
             * sizeof(float) */
            const size_t workspace_elements =
                (workspaceSize + sizeof(float) - 1) / sizeof(float);
            buffer->Resize(workspace_elements);

            /* Try convolution_inference again. If it fails this time, the
             * error is fatal. */
            status = nnp_convolution_inference(
                algorithm_,
                transformStrategy_,
                C / group_,
                M / group_,
                input_size,
                padding,
                kernel_size,
                output_subsample,
                X.template data<float>() + n * C * H * W +
                    g * H * W * (C / group_),
                transformStrategy_ == nnp_convolution_transform_strategy_reuse
                    ? transformedFilters_[g]->template data<float>()
                    : filter.template data<float>() +
                        filter.size() / group_ * g,
                biasData + M / group_ * g,
                Y->template mutable_data<float>() + n * oH * oW * M +
                    g * oH * oW * (M / group_),
                static_cast<void*>(buffer->template mutable_data<float>()),
                &workspaceSize,
                activation_,
                nullptr /* activation parameter */,
                pool,
                FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
          }
        }

        VLOG(1) << "NNPACK buffer size: " << buffer->nbytes();
        CAFFE_ENFORCE(
            nnp_status_success == status,
            "NNPACK convolution computation returned an error");
        if (FLAGS_caffe2_profile_nnpack) {
          // Note: this local shadows the shared workspace buffer above; it
          // only holds the formatted profiling line.
          char buffer[1024];
          const double gmacs =
              double(
                  Y->dim32(2) * Y->dim32(3) * Y->dim32(1) * X.dim32(1) *
                  kernel_size.width * kernel_size.height / group_ / group_) /
              1.0E9;
          const double gflops = 2 * gmacs / profile.total;
          auto ret = snprintf(
              buffer,
              sizeof(buffer),
              "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, "
              "P: %1zu, GMACs: %4.2f, totalT: %6.3f, inputT: %6.3f, "
              "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
              size_t(X.dim(2)),
              size_t(X.dim(3)),
              size_t(X.dim(1)),
              size_t(Y->dim(1)),
              size_t(kernel_size.width),
              size_t(output_subsample.width),
              size_t(padding.top),
              gmacs,
              profile.total * 1E3,
              profile.input_transform * 1E3,
              profile.kernel_transform * 1E3,
              profile.block_multiplication * 1E3,
              profile.output_transform * 1E3,
              gflops);
          CAFFE_ENFORCE(ret > 0);
          std::cout << buffer << std::endl;
        }
      }
    }
  });
  return true;
}

REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);

} // namespace caffe2
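
For completeness, a minimal sketch of running the registered operator directly through a Workspace. This assumes a Caffe2 build with NNPACK enabled; the shapes, fill values, and blob names are illustrative only, and def is the operator definition from the earlier sketch:

#include <algorithm>  // std::fill_n, in addition to the headers above

caffe2::Workspace ws;
auto* X = caffe2::BlobGetMutableTensor(ws.CreateBlob("X"), caffe2::CPU);
X->Resize(1, 8, 32, 32);  // N x C x H x W
std::fill_n(X->mutable_data<float>(), X->size(), 1.0f);
auto* W = caffe2::BlobGetMutableTensor(ws.CreateBlob("W"), caffe2::CPU);
W->Resize(16, 8, 3, 3);   // M x C/group x kH x kW
std::fill_n(W->mutable_data<float>(), W->size(), 0.01f);
auto* b = caffe2::BlobGetMutableTensor(ws.CreateBlob("b"), caffe2::CPU);
b->Resize(16);
std::fill_n(b->mutable_data<float>(), b->size(), 0.0f);
CAFFE_ENFORCE(ws.RunOperatorOnce(def));
const auto& Y = ws.GetBlob("Y")->Get<caffe2::TensorCPU>();
LOG(INFO) << "Y shape: " << Y.DebugString();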