1 #include "concat_dnnlowp_op.h" 7 #include "dnnlowp_partition.h" 14 ConcatDNNLowPOp<T>::ConcatDNNLowPOp(
15 const OperatorDef& operator_def,
17 : BaseType(operator_def, ws) {
19 axis_ = this->
template GetSingleArgument<int>(
"axis", -1);
20 add_axis_ = this->
template GetSingleArgument<int>(
"add_axis", 0);
22 axis_ = GetDimFromOrderString(
23 this->
template GetSingleArgument<string>(
"order",
"NCHW"));
26 CAFFE_ENFORCE_GE(axis_, 0);
27 requantization_params_.resize(InputSize());
31 bool ConcatDNNLowPOp<T>::RunOnDevice() {
32 GetQuantizationParameters_();
34 auto* output = OutputTensorCPU_(0);
36 int* axis_data =
nullptr;
37 if (OutputSize() >= 2) {
38 split = this->
template Output<Tensor>(1, CPU);
39 split->Resize(vector<int64_t>(1, InputSize()));
40 axis_data = split->template mutable_data<int>();
42 auto& input_zero = InputTensorCPU_(0);
45 input_zero.ndim() + (add_axis_ ? 1 : 0),
46 "Axis not in input ndim range.");
47 for (
int i = 1; i < InputSize(); ++i) {
49 InputTensorCPU_(i).dtype() == input_zero.dtype(),
50 "All inputs must have the same type, expected: ",
51 input_zero.dtype().name(),
53 InputTensorCPU_(i).dtype().name(),
58 int before = 1, after = 1;
59 vector<int64_t> output_dims(input_zero.sizes().vec());
60 for (
int i = 0; i < input_zero.ndim(); ++i) {
61 if (i == axis_ && !add_axis_) {
64 int dim = input_zero.dim32(i);
71 for (
int j = 1; j < InputSize(); ++j) {
72 int dim_j = InputTensorCPU_(j).dim32(i);
75 "Expect dimension = ",
83 ". The input tensors can only have different dimensions " 84 "when arg 'add_axis' = 0 and along the axis = ",
87 InputTensorCPU_(0).sizes(),
89 InputTensorCPU_(j).sizes(),
94 int output_channels = 0;
95 for (
int i = 0; i < InputSize(); ++i) {
96 auto dim = add_axis_ ? 1 : InputTensorCPU_(i).dim32(axis_);
100 output_channels += dim;
103 output_dims.insert(output_dims.begin() + axis_, output_channels);
105 output_dims[axis_] = output_channels;
107 output->Resize(output_dims);
108 size_t output_offset = 0;
110 char* output_data =
reinterpret_cast<char*
>(GetQuantizedOutputData_());
112 for (
int i = 0; i < InputSize(); ++i) {
113 auto& input = InputTensorCPU_(i);
114 auto axis_dim = add_axis_ ? 1 : input.dim32(axis_);
116 vector<T> input_temp(input.numel());
121 int nthreads = dnnlowp_get_num_threads();
122 int tid = dnnlowp_get_thread_num();
123 int before_begin, before_end;
124 int after_begin, after_end;
136 int j_begin = before_begin * axis_dim * after + after_begin;
137 int j_end = (before_end - 1) * axis_dim * after + after_end;
139 if (InputTensorCPU_(i).template IsType<T>()) {
140 const T* input_data = input.template data<T>();
141 for (
int j = j_begin; j < j_end; ++j) {
142 input_temp[j] = fbgemm::Requantize<T>(
143 input_data[j] - in_qparams_[i].zero_point,
144 requantization_params_[i]);
148 input.template data<float>() + j_begin,
149 input_temp.data() + j_begin,
154 math::CopyMatrix<CPUContext>(
156 before_end - before_begin,
157 after_end - after_begin,
158 input_temp.data() + before_begin * axis_dim * after + after_begin,
160 output_data + output_offset + before_begin * output_channels * after +
161 after_begin *
sizeof(
T),
162 output_channels * after,
164 input_zero.dtype().copy());
167 output_offset += axis_dim * after *
sizeof(
T);
170 RunOnDeviceEpilogue_();
175 template <
typename T>
176 void ConcatDNNLowPOp<T>::GetQuantizationParameters_() {
178 for (
int i = 0; i < InputSize(); ++i) {
180 GetInputTensorQuantizationParamsOf(
this, i, qfactory_.get());
183 GetOutputQuantizationParams_();
185 for (
int i = 0; i < InputSize(); ++i) {
186 float real_multiplier = in_qparams_[i].scale / out_qparams_.scale;
187 requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier(
188 real_multiplier, out_qparams_);
192 REGISTER_CPU_OPERATOR_WITH_ENGINE(
Concat, DNNLOWP, ConcatDNNLowPOp<uint8_t>);
193 REGISTER_CPU_OPERATOR_WITH_ENGINE(
196 ConcatDNNLowPOp<uint8_t>);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime.
bool HasArgument(const string &name) const
Checks if the operator has an argument of the given name.
void Get1DPartitionOf2D(int m, int n, int nthreads, int tid, int *m_begin, int *m_end, int *n_begin, int *n_end, int n_align)
1D-partition m x n 2D work.