1 #include "concat_dnnlowp_op.h"     7 #include "dnnlowp_partition.h"    14 ConcatDNNLowPOp<T>::ConcatDNNLowPOp(
    15     const OperatorDef& operator_def,
    17     : BaseType(operator_def, ws) {
    19     axis_ = this->
template GetSingleArgument<int>(
"axis", -1);
    20     add_axis_ = this->
template GetSingleArgument<int>(
"add_axis", 0);
    22     axis_ = GetDimFromOrderString(
    23         this->
template GetSingleArgument<string>(
"order", 
"NCHW"));
    26   CAFFE_ENFORCE_GE(axis_, 0);
    27   requantization_params_.resize(InputSize());
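
// With no explicit "axis" argument, the axis is derived from the storage
// order: GetDimFromOrderString maps "NCHW" to 1 and "NHWC" to 3, so the
// default is to concatenate along the channel dimension.
//
// RunOnDevice flow: pick quantization parameters for all inputs and the
// output, validate shapes, requantize (or quantize) each input into the
// output's quantization domain, then copy each slab into place.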
template <typename T>
bool ConcatDNNLowPOp<T>::RunOnDevice() {
  GetQuantizationParameters_();

  auto* output = OutputTensorCPU_(0);
  Tensor* split = nullptr;
  int* axis_data = nullptr;
  if (OutputSize() >= 2) {
    split = this->template Output<Tensor>(1, CPU);
    split->Resize(vector<int64_t>(1, InputSize()));
    axis_data = split->template mutable_data<int>();
  }
  auto& input_zero = InputTensorCPU_(0);
  CAFFE_ENFORCE_LT(
      axis_,
      input_zero.ndim() + (add_axis_ ? 1 : 0),
      "Axis not in input ndim range.");
  for (int i = 1; i < InputSize(); ++i) {
    CAFFE_ENFORCE(
        InputTensorCPU_(i).dtype() == input_zero.dtype(),
        "All inputs must have the same type, expected: ",
        input_zero.dtype().name(),
        " but got: ",
        InputTensorCPU_(i).dtype().name(),
        " for input: ",
        i);
  }
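
  // Each input is viewed as a (before, axis_dim, after) 3D tensor: 'before'
  // is the product of the dimensions preceding axis_, 'after' the product of
  // those following it. E.g. concatenating NCHW inputs of shape (2, 3, 4, 5)
  // along axis 1 gives before = 2, axis_dim = 3, after = 4 * 5 = 20.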
  int before = 1, after = 1;
  vector<int64_t> output_dims(input_zero.sizes().vec());
  for (int i = 0; i < input_zero.ndim(); ++i) {
    if (i == axis_ && !add_axis_) {
      continue;
    }
    int dim = input_zero.dim32(i);
    if (i < axis_) {
      before *= dim;
    } else { // i > axis_ || (i == axis_ && add_axis_)
      after *= dim;
    }
    // Check that the input dims are compatible.
    for (int j = 1; j < InputSize(); ++j) {
      int dim_j = InputTensorCPU_(j).dim32(i);
      CAFFE_ENFORCE(
          dim == dim_j,
          "Expect dimension = ",
          dim,
          " got ",
          dim_j,
          " at axis = ",
          i,
          " for input: ",
          j,
          ". The input tensors can only have different dimensions "
          "when arg 'add_axis' = 0 and along the axis = ",
          axis_,
          " <",
          InputTensorCPU_(0).sizes(),
          "> vs <",
          InputTensorCPU_(j).sizes(),
          ">.");
    }
  }
  int output_channels = 0;
  for (int i = 0; i < InputSize(); ++i) {
    auto dim = add_axis_ ? 1 : InputTensorCPU_(i).dim32(axis_);
    if (axis_data) {
      axis_data[i] = dim;
    }
    output_channels += dim;
  }
  if (add_axis_) {
    output_dims.insert(output_dims.begin() + axis_, output_channels);
  } else {
    output_dims[axis_] = output_channels;
  }
  output->Resize(output_dims);
  size_t output_offset = 0;

  char* output_data = reinterpret_cast<char*>(GetQuantizedOutputData_());
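
  // Each input occupies a contiguous slab of axis_dim * after elements within
  // every 'before' row of the output; output_offset (in bytes) tracks where
  // the current input's slab starts. E.g. inputs with 3 and 5 channels give
  // output_channels = 8, and the second slab starts at 3 * after * sizeof(T).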
  for (int i = 0; i < InputSize(); ++i) {
    auto& input = InputTensorCPU_(i);
    auto axis_dim = add_axis_ ? 1 : input.dim32(axis_);

    vector<T> input_temp(input.numel());
#ifdef _OPENMP
#pragma omp parallel
#endif
    {
      int nthreads = dnnlowp_get_num_threads();
      int tid = dnnlowp_get_thread_num();
      int before_begin, before_end;
      int after_begin, after_end;

      // 1D-partition the before x (axis_dim * after) 2D work across threads.
      Get1DPartitionOf2D(
          before,
          axis_dim * after,
          nthreads,
          tid,
          &before_begin,
          &before_end,
          &after_begin,
          &after_end);

      int j_begin = before_begin * axis_dim * after + after_begin;
      int j_end = (before_end - 1) * axis_dim * after + after_end;
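
      // If the input is already stored as T, shift it from its own
      // quantization domain into the output's. fbgemm::Requantize computes,
      // roughly, q_out = clamp(round(multiplier * (q_in - in_zero_point)) +
      // out_zero_point), where multiplier = in_scale / out_scale is chosen in
      // GetQuantizationParameters_ below.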
      if (InputTensorCPU_(i).template IsType<T>()) {
        const T* input_data = input.template data<T>();
        for (int j = j_begin; j < j_end; ++j) {
          input_temp[j] = fbgemm::Requantize<T>(
              input_data[j] - in_qparams_[i].zero_point,
              requantization_params_[i]);
        }
      } else {
        // Float input: quantize it using this input's quantization parameters.
        fbgemm::Quantize<T>(
            input.template data<float>() + j_begin,
            input_temp.data() + j_begin,
            j_end - j_begin,
            in_qparams_[i]);
      }

      // Strided copy of this thread's slab into its position in the output.
      math::CopyMatrix<CPUContext>(
          sizeof(T),
          before_end - before_begin,
          after_end - after_begin,
          input_temp.data() + before_begin * axis_dim * after + after_begin,
          axis_dim * after,
          output_data + output_offset + before_begin * output_channels * after +
              after_begin * sizeof(T),
          output_channels * after,
          &context_,
          input_zero.dtype().copy());
    }

    output_offset += axis_dim * after * sizeof(T);
  }

  RunOnDeviceEpilogue_();

  return true;
}
template <typename T>
void ConcatDNNLowPOp<T>::GetQuantizationParameters_() {
  for (int i = 0; i < InputSize(); ++i) {
    in_qparams_[i] =
        GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());
  }

  GetOutputQuantizationParams_();

  for (int i = 0; i < InputSize(); ++i) {
    float real_multiplier = in_qparams_[i].scale / out_qparams_.scale;
    requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier(
        real_multiplier, out_qparams_);
  }
}
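
// Illustrative numbers (not from this file): with in_scale = 0.02 and
// out_scale = 0.04, real_multiplier = 0.5, so requantization roughly halves
// each zero-point-adjusted value when mapping it onto the output's scale.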
REGISTER_CPU_OPERATOR_WITH_ENGINE(Concat, DNNLOWP, ConcatDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8Concat,
    DNNLOWP,
    ConcatDNNLowPOp<uint8_t>);

} // namespace caffe2
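
// Usage sketch (illustrative; blob names are hypothetical): a net dispatches
// to this kernel by setting engine "DNNLOWP" on a Concat (or Int8Concat) op,
// e.g. in NetDef text format:
//   op {
//     input: "X0"  input: "X1"
//     output: "Y"  output: "split_info"
//     type: "Concat"
//     engine: "DNNLOWP"
//     arg { name: "axis" i: 1 }
//   }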
 