4 #include "caffe2/operators/conv_pool_op_base.h" 5 #include "caffe2/utils/eigen_utils.h" 10 void uniformQuantize2b1b(
const TensorCPU& X,
11 const std::vector<std::unique_ptr<TensorCPU>>& XQ,
13 float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.sizes().vec();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // Build one packed byte (8 channels) per bitplane.
      std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          float v = Xdata[qc * 8 + b + C * n];
          if (v < offset) {
            // Level 0: both bits stay zero.
          } else if (v < offset + inter_center_distance) {
            p[0] |= 1 << b;
          } else if (v < offset + 2 * inter_center_distance) {
            p[1] |= 1 << b;
          } else {
            p[0] |= 1 << b;
            p[1] |= 1 << b;
          }
        }
      }
      for (auto i = 0; i < k2b1bXBits; ++i) {
        XQdata[i][qc + QC * n] = p[i];
      }
    }
  }
}
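
// Note on the encoding above: each activation is quantized to one of four
// centers {0, 1, 2, 3} (starting at `offset`, spaced by
// `inter_center_distance`), and the two-bit code is split across two
// bitplanes (p[0] = low bit, p[1] = high bit) so each plane can run through a
// 1-bit XOR/popcount convolution. A small worked example with the values used
// by run2b1bConvGeneric (offset = 0.5, inter_center_distance = 1.0):
//   v = 0.2 -> level 0 -> p[0] = 0, p[1] = 0
//   v = 0.9 -> level 1 -> p[0] = 1, p[1] = 0
//   v = 1.7 -> level 2 -> p[0] = 0, p[1] = 1
//   v = 3.5 -> level 3 -> p[0] = 1, p[1] = 1
// Eight channels pack into one uint8_t per plane, so the quantized channel
// dimension is divRoundUp(C, 8).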

void qconv(const ConvArgs& args,
           const TensorCPU& X,
           const TensorCPU& W,
           const TensorCPU* b,
           TensorCPU* Y) {
  const auto N = X.dim32(0);
  const auto IH = X.dim32(1);
  const auto IW = X.dim32(2);
  const auto KH = W.dim32(1);
  const auto KW = W.dim32(2);
  const auto KC = W.dim32(3);
  Y->Resize(X.dim32(0),
            (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
            (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
            W.dim32(0));
  const auto OH = Y->dim32(1);
  const auto OW = Y->dim32(2);
  const auto OC = Y->dim32(3);

  CAFFE_ENFORCE_EQ(W.dim32(3), X.dim32(3));
  const auto* Xdata = X.data<uint8_t>();
  const auto* Wdata = W.data<uint8_t>();
  auto* Ydata = Y->mutable_data<float>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      for (size_t ow = 0; ow < OW; ++ow) {
        for (size_t oc = 0; oc < OC; ++oc) {
          float acc = 0.0;
          for (size_t kh = 0; kh < KH; ++kh) {
            const int32_t ih = (int32_t)kh + (int32_t)args.stride_h * oh - (int32_t)args.pad_t;
            for (size_t kw = 0; kw < KW; ++kw) {
              const int32_t iw = (int32_t)kw + (int32_t)args.stride_w * ow - (int32_t)args.pad_l;
              for (size_t kc = 0; kc < KC; ++kc) {
                const uint8_t w = Wdata[kc + KC * kw + KC * KW * kh + KC * KW * KH * oc];
                // The unsigned comparison folds the (ih < 0 || ih >= IH)
                // checks into a single test.
                if ((size_t)ih >= (size_t)IH || (size_t)iw >= (size_t)IW) {
                  // Out-of-bounds taps read as zero padding.
                  acc += __builtin_popcount(0 ^ w);
                } else {
                  const uint8_t x =
                      Xdata[kc + KC * (size_t)iw + KC * IW * (size_t)ih + n * KC * IW * IH];
                  acc += __builtin_popcount(x ^ w);
                }
              }
            }
          }
          Ydata[oc + OC * ow + OC * OW * oh + n * OC * OW * OH] =
              KW * KH * KC * 8 - 2 * acc + (b ? b->data<float>()[oc] : 0.0);
        }
      }
    }
  }
}
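
// Why `KW * KH * KC * 8 - 2 * acc` above: each packed bit stands for a
// {-1, +1} value via s(bit) = 2 * bit - 1. For a single bit pair, s(x) * s(w)
// is +1 when the bits agree and -1 when they differ, and popcount(x ^ w)
// counts the disagreements. Over total = KW * KH * KC * 8 bit positions, the
// +/-1 dot product is therefore (total - acc) - acc = total - 2 * acc.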

void qpad_zero(const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  CAFFE_ENFORCE_EQ(args.stride_h, 1);
  CAFFE_ENFORCE_EQ(args.stride_w, 1);
  const auto* Xdata = X.data<uint8_t>();
  Y->Resize(X.dim32(0),
            X.dim32(1) + args.pad_t + args.pad_b,
            X.dim32(2) + args.pad_l + args.pad_r,
            X.dim32(3));
  auto* Ydata = Y->mutable_data<uint8_t>();
  ::memset(Ydata, 0, Y->nbytes());
  const auto C = Y->dim32(3);
  const auto XrowSize = X.dim32(3) * X.dim32(2);
  const auto YrowSize = Y->dim32(3) * Y->dim32(2);
  math::CopyMatrix<CPUContext>(1,
                               X.dim32(1),
                               XrowSize,
                               Xdata,
                               XrowSize,
                               Ydata + C * args.pad_l + YrowSize * args.pad_t,
                               YrowSize,
                               nullptr);
}
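
// qpad_zero materializes explicit zero padding for the stride-1 case: Y is
// zero-filled, then each of the X.dim32(1) input rows (XrowSize bytes each)
// is copied into Y's interior at an offset of pad_t rows and pad_l pixels,
// with destination row stride YrowSize.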

void signQuantize(const TensorCPU& X, TensorCPU* XQ) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.sizes().vec();
  XQs[X.ndim() - 1] = QC;
  XQ->Resize(XQs);
  const float* Xdata = X.data<float>();
  uint8_t* XQdata = XQ->mutable_data<uint8_t>();
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // Pack the signs of 8 consecutive channels into one byte.
      uint8_t p = 0;
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          p |= (Xdata[c + C * n] > 0) << b;
        }
      }
      XQdata[qc + QC * n] = p;
    }
  }
}
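
// signQuantize is the 1-bit weight quantizer: bit b of each packed byte is 1
// iff the corresponding float is positive, i.e. sign binarization. It is
// applied to the filter tensor in create2b1bConvState below.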

void filterNormalization11(const TensorCPU& WQ, TensorCPU* WQN) {
  const auto F = WQ.dim32(0);
  // The NEON kernel reads WQN in kGEMMTileSize-sized tiles, so round the
  // allocation up to a tile multiple to keep those reads in bounds.
  WQN->Resize(divRoundUp(F, kGEMMTileSize) * kGEMMTileSize);
  const auto WQs = WQ.size() / F;
  const auto WQbits = 8 * WQs;
  const auto* WQdata = WQ.data<uint8_t>();
  auto* WQNdata = WQN->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    int32_t bitSum = 0;
    for (auto j = 0; j < WQs; ++j) {
      bitSum += __builtin_popcount(WQdata[f * WQs + j]);
    }
    DCHECK_LE(bitSum, WQbits);
    WQNdata[f] = 2 * bitSum - WQbits;
  }
}
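
// filterNormalization11 computes, per filter f, the sum of its weights in the
// +/-1 domain: with bitSum ones and (WQbits - bitSum) zeros,
// sum_j s(w_j) = bitSum - (WQbits - bitSum) = 2 * bitSum - WQbits. This is
// exactly the WQN term run2b1bUnification uses to recenter the bitplane
// outputs.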

void filterNormalizationL1(const TensorCPU& W, TensorCPU* WL1) {
  const auto F = W.dim32(0);
  WL1->Resize(F);
  const auto Ws = W.size() / F;
  const auto* Wdata = W.data<float>();
  auto* WL1data = WL1->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    double l1sum = 0.0;
    for (auto j = 0; j < Ws; ++j) {
      l1sum += std::abs(Wdata[f * Ws + j]);
    }
    // Mean absolute value of filter f.
    WL1data[f] = l1sum / Ws;
  }
}
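
// filterNormalizationL1 computes the mean absolute value of each float
// filter. The generic path below does not consume state->WQL1Norm, so this
// per-filter scale is presumably used by the NEON kernel (run2b1bConvNeon)
// to rescale the binarized filters.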

void qim2col(const ConvArgs& args,
             const TensorCPU& XQ,
             const TensorCPU& WQ,
             TensorCPU* XQcol) {
  CAFFE_ENFORCE_EQ(XQ.dim32(3), WQ.dim32(3));
  const size_t N = XQ.dim32(0);
  const size_t IH = XQ.dim32(1);
  const size_t IW = XQ.dim32(2);
  const size_t KH = WQ.dim32(1);
  const size_t KW = WQ.dim32(2);
  const size_t KC = WQ.dim32(3);

  XQcol->Resize(XQ.dim32(0),
                (XQ.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
                (XQ.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
                KH * KW * KC);
  // Fast path: for 1x1 kernels with stride 1 and no padding, im2col is the
  // identity, so alias the input buffer instead of copying.
  if (args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 && args.pad_t == 0 &&
      args.stride_h == 1 && args.stride_w == 1 && KH == 1 && KW == 1) {
    CAFFE_ENFORCE_EQ(XQ.size(), XQcol->size());
    XQcol->ShareExternalPointer(const_cast<uint8_t*>(XQ.data<uint8_t>()), XQ.size());
    return;
  }
  const size_t OH = XQcol->dim32(1);
  const size_t OW = XQcol->dim32(2);

  const uint8_t* XQdata = XQ.data<uint8_t>();
  uint8_t* XQcoldata = XQcol->mutable_data<uint8_t>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      int32_t h_pad = (int32_t)(args.stride_h * oh) - (int32_t)args.pad_t;
      for (size_t ow = 0; ow < OW; ++ow) {
        int32_t w_pad = (int32_t)(args.stride_w * ow) - (int32_t)args.pad_l;
        for (size_t kh = 0; kh < KH; ++kh) {
          int32_t ih = (int32_t)kh + h_pad;
          if ((size_t)ih < (size_t)IH && (size_t)w_pad < (size_t)IW &&
              (size_t)((int32_t)w_pad + (int32_t)KW) < (size_t)IW) {
            // The whole kernel row is in bounds; copy it in one shot.
            size_t off = kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                n * KH * KW * KC * OW * OH;
            std::memcpy(&XQcoldata[off],
                        &XQdata[((int32_t)w_pad) * KC + ih * IW * KC + n * IW * KC * IH],
                        KW * KC);
          } else {
            for (size_t kw = 0; kw < KW; ++kw) {
              int32_t iw = (int32_t)kw + w_pad;
              // The unsigned comparison folds the (iw < 0 || iw >= IW)
              // checks into a single test.
              size_t off = kw * KC + kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                  n * KH * KW * KC * OW * OH;
              if ((size_t)ih < (size_t)IH && (size_t)iw < (size_t)IW) {
                std::memcpy(
                    &XQcoldata[off], &XQdata[iw * KC + ih * IW * KC + n * KC * IW * IH], KC);
              } else {
                // Out-of-bounds taps are zero padding.
                std::memset(&XQcoldata[off], 0, KC);
              }
            }
          }
        }
      }
    }
  }
}
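
// Layout note for qim2col: XQcol is (N, OH, OW, KH * KW * KC), i.e. one
// contiguous patch of KH * KW * KC packed bytes per output pixel, so the
// convolution reduces to a GEMM-like XOR/popcount between XQcol rows and the
// F x (KH * KW * KC) packed filters. The fast path above can alias the input
// buffer because a 1x1/stride-1/no-padding im2col is the identity.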

std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
                                                const TensorCPU& W,
                                                const TensorCPU* b) {
  auto state = caffe2::make_unique<QConvState>();
  state->XQs.resize(k2b1bXBits);
  state->YQs.resize(k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    state->XQs[i] = caffe2::make_unique<Tensor>(CPU);
    state->YQs[i] = caffe2::make_unique<Tensor>(CPU);
  }
  state->WQ = caffe2::make_unique<Tensor>(CPU);
  state->WQN = caffe2::make_unique<Tensor>(CPU);
  state->WQL1Norm = caffe2::make_unique<Tensor>(CPU);
  state->scratch = caffe2::make_unique<Tensor>(CPU);
  state->scratchColBuffer = caffe2::make_unique<Tensor>(CPU);

  signQuantize(W, state->WQ.get());
  filterNormalization11(*(state->WQ), state->WQN.get());
  filterNormalizationL1(W, state->WQL1Norm.get());
  // Run on the workspace thread pool on mobile builds (C10_MOBILE is assumed
  // as the guard here), serially otherwise.
  state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
#ifdef C10_MOBILE
    ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
#else
    for (size_t v = 0; v < range; ++v) {
      f(v);
    }
#endif
  };
  if (b) {
    state->bias = caffe2::make_unique<Tensor>(*b, CPU);
  }
  return state;
}
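
// create2b1bConvState does all the weight-side work once, ahead of time: WQ
// holds the sign bits of W, WQN the +/-1 filter sums, WQL1Norm the per-filter
// mean |w|, and the scratch tensors are reused across runs. Only the
// activation-side quantization remains to be done per invocation.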

void run2b1bConvGeneric(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Prefer the hand-written NEON kernel; fall through to the portable
  // reference path if it cannot handle this configuration.
  if (run2b1bConvNeon(state, args, X, Y)) {
    return;
  }
#endif
  uniformQuantize2b1b(X, state->XQs, 0.5, 1.0);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    qconv(args, *(state->XQs[i]), *(state->WQ), nullptr, state->YQs[i].get());
  }
  Y->ResizeLike(*(state->YQs[0]));
  const auto F = state->WQ->dim(0);
  const auto N = Y->size() / F;
  run2b1bUnification(state,
                     N,
                     F,
                     state->WQN->data<float>(),
                     state->YQs[0]->data<float>(),
                     state->YQs[1]->data<float>(),
                     F,
                     Y->mutable_data<float>(),
                     F,
                     state->bias ? state->bias->data<float>() : nullptr);
}
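
// run2b1bUnification recombines the per-bitplane results into the real dot
// product <x, s(w)> with x in {0, 1, 2, 3} and s(w) in {-1, +1}. Writing
// s(bit) = 2 * bit - 1, each qconv output is YQi = sum_j s(xi_j) * s(w_j), so
//   2^-1 * YQ0 + 2^0 * YQ1
//     = sum_j (x0_j + 2 * x1_j) * s(w_j) - (2^k2b1bXBits - 1) / 2 * sum_j s(w_j)
// and adding back (2^k2b1bXBits - 1) / 2 * WQN (with WQN = sum_j s(w_j))
// yields sum_j x_j * s(w_j), plus the bias when present.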

void run2b1bUnification(QConvState* state,
                        size_t N,
                        size_t C,
                        const float* WQNVdata,
                        const float* YQs0Vdata,
                        const float* YQs1Vdata,
                        size_t YQstride,
                        float* Ydata,
                        size_t Ystride,
                        const float* bias) {
  ConstEigenVectorArrayMap<float> WQNV(WQNVdata, C);
  for (size_t j = 0; j < N; ++j) {
    ConstEigenVectorArrayMap<float> YQs0V(YQs0Vdata + YQstride * j, C);
    ConstEigenVectorArrayMap<float> YQs1V(YQs1Vdata + YQstride * j, C);
    EigenVectorArrayMap<float> YNV(Ydata + Ystride * j, C);
    if (bias) {
      ConstEigenVectorArrayMap<float> BV(bias, C);
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
          std::pow<float>(2, 0) * YQs1V + BV;
    } else {
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
          std::pow<float>(2, 0) * YQs1V;
    }
  }
}
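
// QConvOp exposes the 2b1b pipeline as a Caffe2 operator: NHWC only, no
// dilation, no grouping. The quantized filter state is built from the filter
// input on the first run and cached in state_ for subsequent runs.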

class QConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  QConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws), ws_(ws) {
    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NHWC,
                           "QConvOp only supports NHWC order");
    OPERATOR_NEEDS_FEATURE(this->dilation_h() == 1, "QConvOp does not support dilation");
    OPERATOR_NEEDS_FEATURE(this->dilation_w() == 1, "QConvOp does not support dilation");
    OPERATOR_NEEDS_FEATURE(this->group_ == 1, "QConvOp does not support group convolution");
  }

  bool RunOnDeviceWithOrderNHWC() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    const auto* bias = InputSize() == 3 ? &Input(2) : nullptr;
    auto* Y = Output(0);
    // Quantize and pack the filters once; reuse the cached state afterwards.
    if (!state_) {
      state_ = create2b1bConvState(ws_, filter, bias);
    }
    ConvArgs args;
    args.pad_l = this->pad_l();
    args.pad_t = this->pad_t();
    args.pad_b = this->pad_b();
    args.pad_r = this->pad_r();
    args.stride_h = this->stride_h();
    args.stride_w = this->stride_w();
    run2b1bConvGeneric(state_.get(), args, X, Y);
    return true;
  }

 private:
  std::unique_ptr<QConvState> state_;
  Workspace* ws_;
};

REGISTER_CPU_OPERATOR(QConv, QConvOp);

} // namespace caffe2
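
// A minimal usage sketch (assuming the standard ConvPoolOpBase arguments; the
// blob names here are hypothetical):
//
//   op {
//     type: "QConv"
//     input: "X"        # NHWC float activations
//     input: "filter"   # (F, KH, KW, C) float filters, binarized internally
//     input: "bias"     # optional third input
//     output: "Y"
//     arg { name: "order" s: "NHWC" }
//     arg { name: "kernel" i: 3 }
//     arg { name: "stride" i: 1 }
//     arg { name: "pad" i: 1 }
//   }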