4 #include "caffe2/core/common.h" 7 #include "caffe2/core/context.h" 8 #include "caffe2/core/logging.h" 9 #include "caffe2/core/operator.h" 10 #include "caffe2/operators/conv_op_shared.h" 11 #include "caffe2/operators/conv_pool_op_base.h" 13 #include "caffe2/utils/math.h" 16 C10_DEFINE_bool(caffe2_profile_nnpack,
false,
"");
20 static std::once_flag once;
21 std::call_once(once, []() {
22 enum nnp_status nnpack_status = nnp_initialize();
24 nnpack_status == nnp_status_success,
"NNPack is not supported here!");
// NNPACKConvOp constructor (signature on lines missing from this extraction).
// Member-initializer list resolves the convolution algorithm, activation, and
// filter-transform strategy once from operator arguments; the body then
// validates the feature constraints NNPACK imposes.
36 algorithm_(getConvolutionAlgorithm()),
37 activation_(getActivationType()),
38 transformStrategy_(getConvolutionTransformStrategy()),
// NNPACK kernels are written for NCHW layouts only.
40 OPERATOR_NEEDS_FEATURE(
41 this->order_ == StorageOrder::NCHW,
42 "NNPack only supports NCHW order. Please consider add \ 43 TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
// NNPACK requires every padding to be strictly smaller than the kernel
// extent on the corresponding axis (top/bottom vs kernel height,
// left/right vs kernel width).
44 OPERATOR_NEEDS_FEATURE(
45 pad_t() < kernel_h(),
"NNPACK only supports pad < kernel size");
46 OPERATOR_NEEDS_FEATURE(
47 pad_b() < kernel_h(),
"NNPACK only supports pad < kernel size");
48 OPERATOR_NEEDS_FEATURE(
49 pad_l() < kernel_w(),
"NNPACK only supports pad < kernel size");
50 OPERATOR_NEEDS_FEATURE(
51 pad_r() < kernel_w(),
"NNPACK only supports pad < kernel size");
// Register the workspace-shared scratch buffer used later by
// runWithSharedBuffer<CPUContext>() in RunOnDeviceWithOrderNCHW().
53 createSharedBuffer<CPUContext>(ws);
// Class-interior declarations (the enclosing class header is outside this
// view). Method declarations followed by data members.
// Main NCHW execution entry point, overriding the ConvPoolOpBase hook.
56 bool RunOnDeviceWithOrderNCHW()
override;
// Argument-resolution helpers; each reads an operator argument once and maps
// it onto the corresponding NNPACK enum (definitions appear below).
59 nnp_convolution_algorithm getConvolutionAlgorithm()
const;
60 nnp_convolution_transform_strategy getConvolutionTransformStrategy()
const;
61 nnp_activation getActivationType()
const;
// Resolved once in the constructor and immutable thereafter.
63 const nnp_convolution_algorithm algorithm_;
64 const nnp_activation activation_;
// Mutable: may be downgraded at run time (precompute -> reuse, or -> compute
// on failure) -- see RunOnDeviceWithOrderNCHW().
69 nnp_convolution_transform_strategy transformStrategy_;
// Per-group pre-transformed filter tensors, populated when the precompute
// strategy succeeds. NOTE(review): raw non-owning pointers -- the underlying
// tensors appear to live in workspace blobs; confirm ownership in full source.
72 std::vector<TensorCPU*> transformedFilters_;
// Zero-filled stand-in bias used when the operator has no bias input, since
// NNPACK's inference API expects a non-null bias pointer.
75 std::vector<float> dummyBias_;
// Maps the optional "algo" string argument onto an nnp_convolution_algorithm.
// With no argument given, applies a heuristic: 3x3 stride-1 undilated
// convolutions get Winograd (wt8x8), everything else defers to NNPACK's auto
// selection. Unknown strings fall through to auto at the end.
// NOTE(review): several original lines are missing from this extraction
// (84-87, 90, 92-93, 95-97, 99, 101, ...); in particular the conditions
// guarding the returns at original lines 100 ("AUTO"?) and 112 ("FT8"?) are
// not visible -- confirm against the full source before relying on them.
82 nnp_convolution_algorithm NNPACKConvOp::getConvolutionAlgorithm()
const {
83 if (!OperatorBase::HasSingleArgumentOfType<std::string>(
"algo")) {
// Heuristic path: Winograd 8x8 is the fast choice for 3x3/s1/d1 kernels.
88 if (kernel_h() == 3 && kernel_w() == 3 && dilation_h() == 1 &&
89 dilation_w() == 1 && stride_h() == 1 && stride_w() == 1) {
91 return nnp_convolution_algorithm_wt8x8;
94 return nnp_convolution_algorithm_auto;
// Explicit selection path: read the argument (default "AUTO") and map each
// recognized name to its NNPACK enum value.
98 auto algo = OperatorBase::GetSingleArgument<std::string>(
"algo",
"AUTO");
// (guarding condition missing from view -- presumably algo == "AUTO")
100 return nnp_convolution_algorithm_auto;
102 if (algo ==
"WINOGRAD") {
103 return nnp_convolution_algorithm_wt8x8;
105 if (algo ==
"WINOGRAD_FP16") {
106 return nnp_convolution_algorithm_wt8x8_fp16;
108 if (algo ==
"FT16") {
109 return nnp_convolution_algorithm_ft16x16;
// (guarding condition missing from view -- presumably algo == "FT8")
112 return nnp_convolution_algorithm_ft8x8;
114 if (algo ==
"IMPLICIT_GEMM") {
115 return nnp_convolution_algorithm_implicit_gemm;
117 if (algo ==
"DIRECT") {
118 return nnp_convolution_algorithm_direct;
// Unrecognized algorithm names silently fall back to auto selection.
120 return nnp_convolution_algorithm_auto;
123 nnp_convolution_transform_strategy
124 NNPACKConvOp::getConvolutionTransformStrategy()
const {
125 auto kts = OperatorBase::GetSingleArgument<std::string>(
126 "convolution_transform_strategy",
"COMPUTE");
127 if (kts ==
"PRECOMPUTE") {
128 return nnp_convolution_transform_strategy_precompute;
131 return nnp_convolution_transform_strategy_compute;
135 NNPACKConvOp::getActivationType()
const {
136 auto activation = OperatorBase::GetSingleArgument<std::string>(
137 "activation",
"identity");
138 if (activation ==
"identity") {
139 return nnp_activation_identity;
140 }
else if (activation ==
"Relu") {
141 return nnp_activation_relu;
143 CAFFE_THROW(
"unsupported activation type \"", activation,
"\"");
// Executes the convolution with NNPACK on NCHW tensors.
// Flow: validate shapes -> resolve bias -> build NNPACK size/padding structs
// -> (optionally) pre-transform filters once per group -> run
// nnp_convolution_inference per image and per group, growing the shared
// scratch buffer on nnp_status_insufficient_buffer -> optional profiling.
// NOTE(review): this extraction is missing many original lines (e.g. most
// arguments of every nnp_convolution_inference call, several enforce/log
// statement heads, loop/brace closes); comments below describe only what is
// visible -- confirm details against the full source.
147 bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
// Process-wide counter used to mint unique blob names for pre-transformed
// kernels; bumped atomically via __sync_fetch_and_add below.
149 volatile static uint32_t precomputed_transform_id = 0;
152 auto& filter =
Input(1);
// --- Shape validation: X is NCHW, filter is (M, C/group, kH, kW). ---
154 CAFFE_ENFORCE(X.ndim() == 4,
"Input dim should be 4");
155 const int N = X.dim32(0),
C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
156 CAFFE_ENFORCE(filter.ndim() == 4,
"");
157 const int M = filter.dim32(0);
158 CAFFE_ENFORCE(
C % this->group_ == 0,
"");
159 CAFFE_ENFORCE(M % this->group_ == 0,
"");
160 CAFFE_ENFORCE(filter.dim32(1) ==
C / this->group_,
"");
161 CAFFE_ENFORCE(filter.dim32(2) == kernel_h(),
"");
162 CAFFE_ENFORCE(filter.dim32(3) == kernel_w(),
"");
164 const int oH = Y->dim32(2), oW = Y->dim32(3);
// --- Bias: use the third input when present, otherwise point NNPACK at a
// zero-filled dummy vector of length M (NNPACK needs a non-null bias). ---
166 const float* biasData = NULL;
167 if (InputSize() == 3) {
169 auto& bias =
Input(2);
170 CAFFE_ENFORCE(bias.ndim() == 1,
"");
171 CAFFE_ENFORCE(bias.dim32(0) == M,
"");
172 biasData = bias.template data<float>();
175 if (dummyBias_.size() != M) {
176 dummyBias_.resize(M);
178 biasData = dummyBias_.data();
// --- Assemble the NNPACK descriptor structs. Note NNPACK takes
// width-then-height while Caffe2 stores H before W. ---
181 const size_t batch_size = X.dim32(0);
182 const size_t input_channels = X.dim32(1);
183 const size_t output_channels = Y->dim32(1);
184 const nnp_size input_size = {.width =
static_cast<size_t>(X.dim32(3)),
185 .height = static_cast<size_t>(X.dim32(2))};
187 const nnp_size kernel_size = {.width =
static_cast<size_t>(filter.dim32(3)),
188 .height = static_cast<size_t>(filter.dim32(2))};
190 const nnp_padding padding = {.top =
static_cast<size_t>(pad_t()),
191 .right = static_cast<size_t>(pad_r()),
192 .bottom = static_cast<size_t>(pad_b()),
193 .left = static_cast<size_t>(pad_l())};
195 const nnp_size output_subsample = {.width =
static_cast<size_t>(stride_w()),
196 .height = static_cast<size_t>(stride_h())};
// Hand NNPACK the workspace's thread pool (cast between pool handle types).
198 pthreadpool_t pool =
reinterpret_cast<pthreadpool_t
>(ws_->GetThreadPool());
// All NNPACK calls run inside the shared-scratch-buffer critical section.
200 runWithSharedBuffer<CPUContext>(ws_, [&](
Tensor* buffer) {
// --- One-time filter pre-transformation (PRECOMPUTE strategy). First a
// size-query call obtains transformedFilterSize, then one call per group
// writes the transformed filter into a freshly named workspace blob. ---
201 if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
202 transformedFilters_.resize(group_);
204 size_t transformedFilterSize = 0;
// Size query: output pointer is null, only transformedFilterSize is filled.
205 nnp_status status = nnp_convolution_inference(
207 nnp_convolution_transform_strategy_precompute,
219 &transformedFilterSize,
220 nnp_activation_identity,
224 if (status == nnp_status_success) {
// Round the byte size up to whole floats for tensor sizing.
230 const size_t transformedFilterElements =
231 (transformedFilterSize +
sizeof(float) - 1) /
sizeof(float);
233 for (
auto g = 0; g < group_; g++) {
// Unique blob name per transformation to avoid collisions across ops.
234 transformedFilters_[g] = BlobGetMutableTensor(
236 "__transformed_kernel_" +
238 __sync_fetch_and_add(&precomputed_transform_id, 1))),
240 transformedFilters_[g]->Resize(transformedFilterElements);
// Actual per-group transformation of the g-th filter slice.
242 status = nnp_convolution_inference(
244 nnp_convolution_transform_strategy_precompute,
252 filter.template data<float>() + filter.size() / group_ * g,
256 transformedFilters_[g]->template mutable_data<float>()),
257 &transformedFilterSize,
258 nnp_activation_identity,
263 nnp_status_success == status,
264 "NNPACK convolution filter pre-transformation return error");
// Precompute succeeded for all groups: switch to REUSE so the main loop
// feeds the transformed filters instead of re-transforming each call.
272 if (transformStrategy_ ==
273 nnp_convolution_transform_strategy_precompute) {
274 CAFFE_ENFORCE_EQ(transformedFilters_.size(), group_);
275 transformStrategy_ = nnp_convolution_transform_strategy_reuse;
// Size query failed: log and fall back to recomputing transforms per call.
279 <<
"Failed to query workspace size to precompute kernels, falling back to re-compute strategy";
280 transformStrategy_ = nnp_convolution_transform_strategy_compute;
// By this point the strategy must have been resolved to REUSE or COMPUTE.
286 transformStrategy_ != nnp_convolution_transform_strategy_precompute);
290 transformStrategy_ == nnp_convolution_transform_strategy_reuse ||
291 transformStrategy_ == nnp_convolution_transform_strategy_compute);
// --- Main loop: one nnp_convolution_inference call per (image, group). ---
292 const auto N = X.dim32(0);
293 for (
auto n = 0; n < N; ++n) {
294 for (
auto g = 0; g < group_; ++g) {
296 size_t workspaceSize = buffer->nbytes();
297 if (workspaceSize == 0) {
301 workspaceSize = buffer->nbytes();
// First attempt with the current scratch buffer. Input/output pointers are
// offset to the n-th image and g-th channel group; the filter pointer is
// either the pre-transformed blob (REUSE) or the raw filter slice (COMPUTE).
303 nnp_status status = nnp_convolution_inference(
312 X.template data<float>() + n *
C * H * W + g * H * W * (
C / group_),
313 transformStrategy_ == nnp_convolution_transform_strategy_reuse
314 ? transformedFilters_[g]->template data<float>()
315 : filter.template data<float>() + filter.size() / group_ * g,
316 biasData + M / group_ * g,
317 Y->template mutable_data<float>() + n * oH * oW * M +
318 g * oH * oW * (M / group_),
319 static_cast<void*
>(buffer->template mutable_data<float>()),
324 FLAGS_caffe2_profile_nnpack ? &profile :
nullptr);
// --- Retry path: the buffer was too small. Re-query the required size,
// grow the shared buffer, and repeat the call once. ---
325 if (status == nnp_status_insufficient_buffer) {
// Size re-query (workspace pointer omitted; fills workspaceSize).
327 status = nnp_convolution_inference(
346 if (status == nnp_status_success) {
// Round required bytes up to whole float elements before resizing.
349 const size_t workspace_elements =
350 (workspaceSize +
sizeof(float) - 1) /
sizeof(float);
351 buffer->Resize(workspace_elements);
// Second attempt with the enlarged buffer; same pointer arithmetic as above.
355 status = nnp_convolution_inference(
364 X.template data<float>() + n *
C * H * W +
365 g * H * W * (
C / group_),
366 transformStrategy_ == nnp_convolution_transform_strategy_reuse
367 ? transformedFilters_[g]->template data<float>()
368 : filter.template data<float>() +
369 filter.size() / group_ * g,
370 biasData + M / group_ * g,
371 Y->template mutable_data<float>() + n * oH * oW * M +
372 g * oH * oW * (M / group_),
373 static_cast<void*
>(buffer->template mutable_data<float>()),
378 FLAGS_caffe2_profile_nnpack ? &profile :
nullptr);
382 VLOG(1) <<
"NNPACK buffer size: " << buffer->nbytes();
// Any remaining non-success status is fatal.
384 nnp_status_success == status,
385 "NNPACK convolution computation returned error");
// --- Optional per-call profiling dump, gated by --caffe2_profile_nnpack.
// Computes GMACs/GFLOPS from the output volume and NNPACK's timing data. ---
386 if (FLAGS_caffe2_profile_nnpack) {
390 Y->dim32(2) * Y->dim32(3) * Y->dim32(1) * X.dim32(1) *
391 kernel_size.width * kernel_size.height / group_ / group_) /
393 const double gflops = 2 * gmacs / profile.total;
397 "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, P: %1zu, GMACs: " 398 "%4.2f, totalT: %6.3f, inputT: %6.3f, " 399 "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
404 size_t(kernel_size.width),
405 size_t(output_subsample.width),
409 profile.input_transform * 1E3,
410 profile.kernel_transform * 1E3,
411 profile.block_multiplication * 1E3,
412 profile.output_transform * 1E3,
414 CAFFE_ENFORCE(ret > 0);
415 std::cout << buffer << std::endl;
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
const Tensor & Input(int idx, DeviceType type=CPUContext::GetDeviceType())
Retrieve a non-owning reference to the input at position 'idx' for this operator. ...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...