Caffe2 - C++ API
A deep learning, cross platform ML framework
conv_op.cc
#include <mutex> // std::once_flag / std::call_once used in initNNPACK()

#include "caffe2/core/common.h"

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_pool_op_base.h"

#include "caffe2/utils/math.h"
#include "caffe2/utils/threadpool/pthreadpool_impl.h"
#include "nnpack.h"

CAFFE2_DEFINE_bool(
    caffe2_profile_nnpack,
    false,
    "If set, log per-call NNPACK convolution profiling information.");
namespace caffe2 {

void initNNPACK() {
  static std::once_flag once;
  std::call_once(once, []() {
    enum nnp_status nnpack_status = nnp_initialize();
    CAFFE_ENFORCE(
        nnpack_status == nnp_status_success, "NNPack is not supported here!");
  });
}
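
// nnp_initialize() probes the CPU for the SIMD features NNPACK needs and
// returns an unsupported-hardware status when they are missing, which is
// what trips the ENFORCE above; std::call_once keeps the initialization
// thread-safe and one-time.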

// Definitions

class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algorithm_(getConvolutionAlgorithm()),
        transformStrategy_(getConvolutionTransformStrategy()),
        ws_(ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding a "
        "TransposeOp with axes=[0, 3, 1, 2] before the NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        pad_t() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_b() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_l() < kernel_w(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_r() < kernel_w(), "NNPACK only supports pad < kernel size");

    createSharedBuffer<CPUContext>(ws);
  }

  bool RunOnDeviceWithOrderNCHW() override;

 private:
  nnp_convolution_algorithm getConvolutionAlgorithm() const;
  nnp_convolution_transform_strategy getConvolutionTransformStrategy() const;

  const nnp_convolution_algorithm algorithm_;
  // Modified after precomputing the kernels. State transitions are:
  // - precompute -> (first call to Run()) -> reuse (on successful precompute)
  //                                       -> compute (on failed precompute)
  // - compute (stays compute)
  nnp_convolution_transform_strategy transformStrategy_;
  Workspace* ws_;
  // Per-group transformed filters
  std::vector<TensorCPU*> transformedFilters_;
};

// Implementations

nnp_convolution_algorithm NNPACKConvOp::getConvolutionAlgorithm() const {
  if (!OperatorBase::HasSingleArgumentOfType<std::string>("algo")) {
    // No preference is stated. The best heuristic for mobile devices differs
    // from NNPACK's built-in one: Winograd tends to be a lot faster there,
    // so use it whenever the convolution is 3x3 with unit dilation and
    // unit stride (3x3d1s1).
    if (kernel_h() == 3 && kernel_w() == 3 && dilation_h() == 1 &&
        dilation_w() == 1 && stride_h() == 1 && stride_w() == 1) {
      // use Winograd
      return nnp_convolution_algorithm_wt8x8;
    }

    return nnp_convolution_algorithm_auto;
  }

  // Otherwise, there is a preference.
  auto algo = OperatorBase::GetSingleArgument<std::string>("algo", "AUTO");
  if (algo == "AUTO") {
    return nnp_convolution_algorithm_auto;
  }
  if (algo == "WINOGRAD") {
    return nnp_convolution_algorithm_wt8x8;
  }
  if (algo == "WINOGRAD_FP16") {
    return nnp_convolution_algorithm_wt8x8_fp16;
  }
  if (algo == "FT16") {
    return nnp_convolution_algorithm_ft16x16;
  }
  if (algo == "FT8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  if (algo == "IMPLICIT_GEMM") {
    return nnp_convolution_algorithm_implicit_gemm;
  }
  if (algo == "DIRECT") {
    return nnp_convolution_algorithm_direct;
  }
  return nnp_convolution_algorithm_auto;
}

nnp_convolution_transform_strategy
NNPACKConvOp::getConvolutionTransformStrategy() const {
  auto kts = OperatorBase::GetSingleArgument<std::string>(
      "convolution_transform_strategy", "COMPUTE");
  if (kts == "PRECOMPUTE") {
    return nnp_convolution_transform_strategy_precompute;
  }
  // Default to computing each time.
  return nnp_convolution_transform_strategy_compute;
}
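
// Example (a sketch, not part of this file): selecting the engine, algorithm,
// and transform strategy on an OperatorDef. The argument strings match what
// the two getters above parse; the blob names are hypothetical.
//
//   OperatorDef def;
//   def.set_type("Conv");
//   def.add_input("X"); def.add_input("W"); def.add_input("b");
//   def.add_output("Y");
//   def.set_engine("NNPACK");
//   AddArgument<std::string>("algo", "WINOGRAD", &def);
//   AddArgument<std::string>(
//       "convolution_transform_strategy", "PRECOMPUTE", &def);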

bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& filter = Input(1);
  auto& bias = Input(2);
  auto* Y = Output(0);
  CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE_EQ(filter.ndim(), 4);
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(C % this->group_ == 0, "");
  CAFFE_ENFORCE(M % this->group_ == 0, "");
  CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
  CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
  CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
  CAFFE_ENFORCE(bias.ndim() == 1, "");
  CAFFE_ENFORCE(bias.dim32(0) == M, "");
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  const int oH = Y->dim32(2), oW = Y->dim32(3);
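
  // For reference (a sketch, assuming unit dilation): SetOutputSize computes
  // the usual convolution output shape,
  //   oH = (H + pad_t() + pad_b() - kernel_h()) / stride_h() + 1
  //   oW = (W + pad_l() + pad_r() - kernel_w()) / stride_w() + 1
  // e.g. H = 32, kernel 3, stride 1, pad 1 on each side gives oH = 32.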

  const size_t batch_size = X.dim32(0);
  const size_t input_channels = X.dim32(1);
  const size_t output_channels = Y->dim32(1);
  const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                               .height = static_cast<size_t>(X.dim32(2))};
  // filter is MCHW
  const nnp_size kernel_size = {.width = static_cast<size_t>(filter.dim32(3)),
                                .height = static_cast<size_t>(filter.dim32(2))};
  // pad is tblr
  const nnp_padding padding = {.top = static_cast<size_t>(pad_t()),
                               .right = static_cast<size_t>(pad_r()),
                               .bottom = static_cast<size_t>(pad_b()),
                               .left = static_cast<size_t>(pad_l())};

  const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
                                     .height = static_cast<size_t>(stride_h())};
  initNNPACK();
  pthreadpool pool(ws_->GetThreadPool());

  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor<CPUContext>* buffer) {
    if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
      transformedFilters_.resize(group_);

      size_t transformedFilterSize = 0;
      nnp_status status = nnp_convolution_inference(
          algorithm_,
          nnp_convolution_transform_strategy_precompute,
          C / group_,
          M / group_,
          input_size,
          padding,
          kernel_size,
          output_subsample,
          nullptr /* input */,
          nullptr /* filters */,
          nullptr /* bias */,
          nullptr /* output */,
          nullptr /* workspace buffer = transformed filter */,
          &transformedFilterSize,
          nnp_activation_identity,
          nullptr /* activation parameter */,
          &pool,
          nullptr /* profile */);
      if (status == nnp_status_success) {
        /* For these convolution parameters filter transforms can be
         * pre-computed */

        /* Division with rounding up, in case size is not a multiple of
         * sizeof(float) */
        const size_t transformedFilterElements =
            (transformedFilterSize + sizeof(float) - 1) / sizeof(float);
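        /* e.g. transformedFilterSize = 10 and sizeof(float) = 4 gives
         * (10 + 3) / 4 = 3 elements, i.e. 12 bytes >= 10 */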

        for (auto g = 0; g < group_; g++) {
          transformedFilters_[g] =
              ws_->CreateBlob(
                     debug_def().name() + "_transformed_" + to_string(g))
                  ->GetMutable<TensorCPU>();
          transformedFilters_[g]->Resize(transformedFilterElements);

          status = nnp_convolution_inference(
              algorithm_,
              nnp_convolution_transform_strategy_precompute,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              filter.template data<float>() + filter.size() / group_ * g,
              nullptr /* bias */,
              nullptr /* output */,
              static_cast<void*>(
                  transformedFilters_[g]->template mutable_data<float>()),
              &transformedFilterSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              &pool,
              nullptr /* profile */);
          CAFFE_ENFORCE(
              nnp_status_success == status,
              "NNPACK convolution filter pre-transformation returned an error");
        }

        /*
         * Now we've precomputed all our filter transformations. Switch to the
         * reuse strategy to avoid doing the transformation again on the next
         * iteration.
         */
        if (transformStrategy_ ==
            nnp_convolution_transform_strategy_precompute) {
          CAFFE_ENFORCE_EQ(transformedFilters_.size(), group_);
          transformStrategy_ = nnp_convolution_transform_strategy_reuse;
        }
      } else {
        LOG(WARNING)
            << "Failed to query workspace size to precompute kernels, "
               "falling back to the re-compute strategy";
        transformStrategy_ = nnp_convolution_transform_strategy_compute;
      }

      // Enforce that we have transitioned out of the precompute state by the
      // time we leave this block.
      CAFFE_ENFORCE(
          transformStrategy_ != nnp_convolution_transform_strategy_precompute);
    }

    CAFFE_ENFORCE(
        transformStrategy_ == nnp_convolution_transform_strategy_reuse ||
        transformStrategy_ == nnp_convolution_transform_strategy_compute);
    for (auto n = 0; n < N; ++n) {
      for (auto g = 0; g < group_; ++g) {
        nnp_profile profile;
        size_t workspaceSize = buffer->nbytes();
        if (workspaceSize == 0) {
          /* Allocate some memory to ensure the buffer pointer is not NULL.
           * This simplifies further logic. */
          buffer->Resize(1);
          workspaceSize = buffer->nbytes();
        }
        nnp_status status = nnp_convolution_inference(
            algorithm_,
            transformStrategy_,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X.template data<float>() + n * C * H * W + g * H * W * (C / group_),
            transformStrategy_ == nnp_convolution_transform_strategy_reuse
                ? transformedFilters_[g]->template data<float>()
                : filter.template data<float>() + filter.size() / group_ * g,
            bias.template data<float>() + bias.size() / group_ * g,
            Y->template mutable_data<float>() + n * oH * oW * M +
                g * oH * oW * (M / group_),
            static_cast<void*>(buffer->template mutable_data<float>()),
            &workspaceSize,
            nnp_activation_identity,
            nullptr /* activation parameter */,
            &pool,
            FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
        if (status == nnp_status_insufficient_buffer) {
          /* Query the required workspace size, increase the buffer, and try
           * again */
          status = nnp_convolution_inference(
              algorithm_,
              transformStrategy_,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              nullptr /* filters */,
              nullptr /* bias */,
              nullptr /* output */,
              nullptr /* workspace buffer */,
              &workspaceSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              &pool,
              nullptr /* profile */);
          if (status == nnp_status_success) {
            /* Division with rounding up, in case size is not a multiple of
             * sizeof(float) */
            const size_t workspace_elements =
                (workspaceSize + sizeof(float) - 1) / sizeof(float);
            buffer->Resize(workspace_elements);

            /* Try convolution_inference again. If it fails this time, it is
             * fatal. */
            status = nnp_convolution_inference(
                algorithm_,
                transformStrategy_,
                C / group_,
                M / group_,
                input_size,
                padding,
                kernel_size,
                output_subsample,
                X.template data<float>() + n * C * H * W +
                    g * H * W * (C / group_),
                transformStrategy_ == nnp_convolution_transform_strategy_reuse
                    ? transformedFilters_[g]->template data<float>()
                    : filter.template data<float>() +
                        filter.size() / group_ * g,
                bias.template data<float>() + bias.size() / group_ * g,
                Y->template mutable_data<float>() + n * oH * oW * M +
                    g * oH * oW * (M / group_),
                static_cast<void*>(buffer->template mutable_data<float>()),
                &workspaceSize,
                nnp_activation_identity,
                nullptr /* activation parameter */,
                &pool,
                FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
          }
        }

        VLOG(1) << "NNPACK buffer size: " << buffer->nbytes();
        CAFFE_ENFORCE(
            nnp_status_success == status,
            "NNPACK convolution computation returned an error");
        if (FLAGS_caffe2_profile_nnpack) {
          // Local text buffer for the profile log (distinct from the shared
          // workspace tensor named buffer above).
          char logBuffer[1024];
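          // MAC count for this single (image, group) call: each of the
          // oH * oW * (M / group_) outputs accumulates over
          // (C / group_) * kernel_h() * kernel_w() inputs, hence the two
          // divisions by group_. One MAC is two FLOPs and profile.total is
          // in seconds, so gflops below is the achieved GFLOP/s of the call.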
          const double gmacs =
              double(
                  Y->dim32(2) * Y->dim32(3) * Y->dim32(1) * X.dim32(1) *
                  kernel_size.width * kernel_size.height / group_ / group_) /
              1.0E9;
          const double gflops = 2 * gmacs / profile.total;
          auto ret = snprintf(
              logBuffer,
              sizeof(logBuffer),
              "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, P: %1zu, GMACs: "
              "%4.2f, totalT: %6.3f, inputT: %6.3f, "
              "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
              size_t(X.dim(2)),
              size_t(X.dim(3)),
              size_t(X.dim(1)),
              size_t(Y->dim(1)),
              size_t(kernel_size.width),
              size_t(output_subsample.width),
              size_t(padding.top),
              gmacs,
              profile.total * 1E3,
              profile.input_transform * 1E3,
              profile.kernel_transform * 1E3,
              profile.block_multiplication * 1E3,
              profile.output_transform * 1E3,
              gflops);
          CAFFE_ENFORCE(ret > 0);
          LOG(INFO) << logBuffer;
        }
      }
    }
  });
  return true;
}

REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
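
// With this registration, any "Conv" operator whose definition sets
// engine="NNPACK" is dispatched to NNPACKConvOp instead of the default
// CPU convolution implementation.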

} // namespace caffe2