Caffe2 - C++ API
A deep learning, cross-platform ML framework
batch_matmul_op.h
// Copyright (c) 2016-present, Facebook, Inc.

#ifndef CAFFE2_OPERATORS_MATMUL_OP_H_
#define CAFFE2_OPERATORS_MATMUL_OP_H_

#include <sstream>

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <class Context, class Engine = DefaultEngine>
class BatchMatMulOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  BatchMatMulOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        trans_a_(OperatorBase::GetSingleArgument<int>("trans_a", 0)),
        trans_b_(OperatorBase::GetSingleArgument<int>("trans_b", 0)),
        broadcast_(OperatorBase::GetSingleArgument<int>("broadcast", 0)),
        use_scratch_(OperatorBase::GetSingleArgument<int>("use_scratch", 0)) {
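    // If use_scratch is set, allocate a scratch tensor that is shared across
    // the GemmBatched calls below (assumption: some GemmBatched
    // implementations, e.g. on GPU, can use it as temporary workspace
    // instead of allocating per call).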
    if (use_scratch_) {
      scratch_ = std::make_shared<Tensor<Context>>();
    }
  }

  ~BatchMatMulOp() {}

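  // Dispatch on the element type of the first input; only float is listed,
  // so only float tensors are supported by this operator.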
  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& A = Input(0);
    const auto& B = Input(1);
    auto* Y = Output(0);

    auto ndims_A = A.ndim();
    auto dims_A = A.dims();
    auto ndims_B = B.ndim();
    auto dims_B = B.dims();

    auto noBroadcastErrorMsg = [](size_t dim1, size_t dim2) {
      std::stringstream ss;
      ss << "Inputs with dimensions A = ";
      ss << dim1;
      ss << " and B = ";
      ss << dim2;
      ss << " are not supported with broadcast=0. Did you forget to set the "
            "broadcast flag?";
      return ss.str();
    };

    // These should all be false if we're not broadcasting.
    bool dimMismatch = ndims_A != ndims_B;
    bool dimsLessThan1D = ndims_A < 2;
    CAFFE_ENFORCE(
        broadcast_ || (!dimMismatch && !dimsLessThan1D),
        noBroadcastErrorMsg(ndims_A, ndims_B));

    auto* data_A = A.template data<T>();
    auto* data_B = B.template data<T>();

    auto dimMismatchErrorString = [](size_t dimnum1,
                                     size_t dim1,
                                     size_t dimnum2,
                                     size_t dim2,
                                     bool trans_a,
                                     bool trans_b) {
      std::stringstream ss;
      ss << "Expected dimension ";
      ss << dimnum1;
      ss << " of tensor A with value ";
      ss << dim1;
      ss << " to match dimension ";
      ss << dimnum2;
      ss << " of tensor B with value ";
      ss << dim2;
      ss << ". trans_a = ";
      ss << trans_a;
      ss << " trans_b = ";
      ss << trans_b;
      return ss.str();
    };

    if (ndims_A == 1 && ndims_B == 1) {
      // vector-vector
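      // e.g. A = [n] and B = [n] produce Y of shape [1] holding their
      // dot product.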
      CAFFE_ENFORCE_EQ(
          dims_A[0],
          dims_B[0],
          "Vector-vector product requires each of the vectors to "
          "be the same size.");
      Y->Resize(1);
      math::Dot<T, Context>(
          dims_A[0], data_A, data_B, Y->template mutable_data<T>(), &context_);
    } else {
      bool A_broadcasted = false, B_broadcasted = false;
      if (ndims_A == 1) {
        dims_A.insert(dims_A.begin(), 1);
        ndims_A = 2;
        A_broadcasted = true;
      }
      if (ndims_B == 1) {
        dims_B.push_back(1);
        ndims_B = 2;
        B_broadcasted = true;
      }
      // matrix-matrix with batches
      // [B1..., M, K] * [B2..., K, N] -> [B..., M, N]
      // If A or B is one-dimensional, the leading (for A) or trailing (for B)
      // 1 inserted above is not added to the output tensor's size.

      // First step: partition the tensors into inner and outer blocks.
      // Ignoring the last two dimensions of A and B, ensure that one of the
      // tensors' dimensions is a suffix of the other. For example,
      // [4, x, x] is a suffix of [2, 3, 4, x, x]. In this example, the
      // dimensions of size 2 and 3 will be broadcasted, so we partition into
      // 2*3=6 individual instances of batched GEMM with A and B \in [4, x, x].
      size_t num_inner_dims = std::min(ndims_A, ndims_B);
      for (size_t i = 2; i < num_inner_dims; ++i) {
        auto first_r_itr = dims_A.rbegin();
        auto second_r_itr = dims_B.rbegin();
        CAFFE_ENFORCE_EQ(
            *(first_r_itr + i),
            *(second_r_itr + i),
            dimMismatchErrorString(
                ndims_A - i - 1,
                *(first_r_itr + i),
                ndims_B - i - 1,
                *(second_r_itr + i),
                trans_a_,
                trans_b_));
      }
      size_t num_outer_dims = std::max(ndims_A, ndims_B) - num_inner_dims;

      // Standard M, N, and K parameters respecting GEMM API and transpose
      // flags
      size_t M, N, K, K_dim;
      if (trans_a_) {
        M = dims_A[ndims_A - 1];
        K = dims_A[ndims_A - 2];
        K_dim = ndims_A - 2;
      } else {
        M = dims_A[ndims_A - 2];
        K = dims_A[ndims_A - 1];
        K_dim = ndims_A - 1;
      }
      if (trans_b_) {
        N = dims_B[ndims_B - 2];
        CAFFE_ENFORCE_EQ(
            K,
            dims_B[ndims_B - 1],
            dimMismatchErrorString(
                K_dim,
                K,
                ndims_B - 1,
                dims_B[ndims_B - 1],
                trans_a_,
                trans_b_));
      } else {
        N = dims_B[ndims_B - 1];
        CAFFE_ENFORCE_EQ(
            K,
            dims_B[ndims_B - 2],
            dimMismatchErrorString(
                K_dim,
                K,
                ndims_B - 2,
                dims_B[ndims_B - 2],
                trans_a_,
                trans_b_));
      }
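      // Example: with trans_a = 1 and trans_b = 0, A is read as [..., K, M]
      // and B as [..., K, N]; an A of shape [2, 5, 3] gives M = 3 and K = 5,
      // and the shared K dimension of B is checked above.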

      // Calculate output tensor shapes [B..., (M), (N)]
      // Batch dimensions will be broadcasted out to those of the longer
      // tensor, A or B. M or N is omitted from the output shape if A or B,
      // respectively, is 1-D.
      std::vector<TIndex> new_dims;
      if (ndims_A >= ndims_B) {
        new_dims.assign(dims_A.begin(), dims_A.end() - 2);
      } else {
        new_dims.assign(dims_B.begin(), dims_B.end() - 2);
      }
      if (!A_broadcasted) {
        new_dims.push_back(M);
      } else {
        new_dims.push_back(1);
      }
      if (!B_broadcasted) {
        new_dims.push_back(N);
      } else {
        new_dims.push_back(1);
      }
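      // Example: A = [K] (A_broadcasted) and B = [2, K, N] give
      // new_dims = [2, 1, N] at this point; the placeholder 1 is erased
      // below, so the final output shape is [2, N].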

      // Calculate strides. Continuing our example above,
      // [4, M, K] * [2, 3, 4, K, N] = [2, 3, 4, M, N]
      // We calculate this as follows:
      // 1) Treat the outer batch dimensions as flattened, i.e. view the B
      //    tensor here as [6, 4, K, N] and Y as [6, 4, M, N]. The reasoning
      //    is analogous for the case where # dims A >= # dims B.
      // 2) Perform this operation:
      //    for i in range(6):
      //      Y[i, :, :, :] = BatchMatMul(A, B[i, :, :, :])
      size_t A_stride = 1; // How far to increment A pointer each itr
      size_t B_stride = 1; // How far to increment B pointer each itr
      size_t Y_stride = 1; // How far to increment Y pointer each itr
      // How many "inner batches" we have. That is, the product of sizes for
      // the slices excluding M, K, and N, for their respective matrices.
      size_t num_sub_batches = 1;
      if (ndims_A >= ndims_B) {
        auto first_r_itr = dims_A.rbegin();
        auto output_r_itr = new_dims.rbegin();
        for (size_t i = 0; i < num_inner_dims; ++i) {
          A_stride *= *(first_r_itr + i);
          Y_stride *= *(output_r_itr + i);
          if (i >= 2) {
            num_sub_batches *= *(first_r_itr + i);
          }
        }
        B_stride = 0;
      } else {
        A_stride = 0;
        auto second_r_itr = dims_B.rbegin();
        auto output_r_itr = new_dims.rbegin();
        for (size_t i = 0; i < num_inner_dims; ++i) {
          B_stride *= *(second_r_itr + i);
          Y_stride *= *(output_r_itr + i);
          if (i >= 2) {
            num_sub_batches *= *(second_r_itr + i);
          }
        }
      }
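      // Continuing the example: ndims_A (3) < ndims_B (5), so A_stride = 0
      // (the whole of A is reused for every outer batch), B_stride = 4*K*N,
      // Y_stride = 4*M*N, and num_sub_batches = 4. num_outer_batches,
      // computed next, is 2*3 = 6, so the GemmBatched loop below runs 6
      // times.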

      size_t num_outer_batches = 1;
      for (size_t i = 0; i < num_outer_dims; ++i) {
        num_outer_batches *= new_dims[i];
      }

      // A_broadcasted and B_broadcasted are mutually exclusive here, since
      // otherwise we would've taken the vector-vector path above.
      if (A_broadcasted) {
        new_dims.erase(new_dims.end() - 2);
      } else if (B_broadcasted) {
        new_dims.erase(new_dims.end() - 1);
      }

      // Allocate output tensor
      Y->Resize(new_dims);
      auto* Y_data = Y->template mutable_data<T>();

      // A zero batch dimension indicates there are no elements to compute.
      if (num_sub_batches == 0 || num_outer_batches == 0) {
        return true;
      }

      // TODO(T23893772): doing this in a loop is likely going to be slow on
      // GPU
      for (size_t p = 0; p < num_outer_batches; ++p) {
        math::GemmBatched<T, Context, Engine>(
            trans_a_ ? CblasTrans : CblasNoTrans,
            trans_b_ ? CblasTrans : CblasNoTrans,
            num_sub_batches,
            M,
            N,
            K,
            1.0f,
            data_A + p * A_stride,
            data_B + p * B_stride,
            0.0f,
            Y_data + p * Y_stride,
            &context_,
            use_scratch_ ? scratch_.get() : nullptr);
      }
    }
    return true;
  }

 protected:
  bool trans_a_;
  bool trans_b_;
  bool broadcast_;

  bool use_scratch_;
  std::shared_ptr<Tensor<Context>> scratch_;
};

} // namespace caffe2

#endif /* CAFFE2_OPERATORS_MATMUL_OP_H_ */
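For reference, here is a minimal sketch of driving this operator end to end through a Caffe2 workspace, using the running example shapes from the comments above ([4, M, K] x [2, 3, 4, K, N] with broadcast = 1). It assumes the CPU operator is registered under the name "BatchMatMul" (as in batch_matmul_op.cc) and that the program links against the Caffe2 core and operator libraries; the blob names and concrete sizes are illustrative only.

#include <algorithm>
#include <iostream>

#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

using namespace caffe2;

int main(int argc, char** argv) {
  GlobalInit(&argc, &argv);
  Workspace ws;

  // A: [4, M, K] with M = 2, K = 3; B: [2, 3, 4, K, N] with N = 5.
  auto* A = ws.CreateBlob("A")->GetMutable<TensorCPU>();
  A->Resize(4, 2, 3);
  auto* B = ws.CreateBlob("B")->GetMutable<TensorCPU>();
  B->Resize(2, 3, 4, 3, 5);
  // Fill both inputs with ones so every output element equals K = 3.
  std::fill(
      A->mutable_data<float>(), A->mutable_data<float>() + A->size(), 1.f);
  std::fill(
      B->mutable_data<float>(), B->mutable_data<float>() + B->size(), 1.f);

  // Describe the op: Y = BatchMatMul(A, B) with broadcasting enabled.
  OperatorDef def;
  def.set_type("BatchMatMul");
  def.add_input("A");
  def.add_input("B");
  def.add_output("Y");
  auto* arg = def.add_arg();
  arg->set_name("broadcast");
  arg->set_i(1);

  CAFFE_ENFORCE(ws.RunOperatorOnce(def));

  // Expected output shape: [2, 3, 4, 2, 5].
  const auto& Y = ws.GetBlob("Y")->Get<TensorCPU>();
  std::cout << Y.DebugString() << std::endl;
  return 0;
}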