#include "caffe2/core/operator.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"

#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif

namespace caffe2 {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)

// ARM NEON helpers for the packed-uint8 <-> planar-float conversions below.

inline float32x4_t to_v4_f32(uint16x4_t v) {
  return vcvtq_f32_u32(vmovl_u16(v));
}
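// Widens 16 packed uint8 values into four float32x4 vectors, in order.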
inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
  float32x4x4_t out;

  uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));

  out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
  out.val[1] = to_v4_f32(vget_high_u16(lo_u16));

  uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));

  out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
  out.val[3] = to_v4_f32(vget_high_u16(hi_u16));

  return out;
}
// Clamps each lane to the representable range of uint8_t.
inline void clamp(float32x4_t& v) {
  v = vmaxq_f32(v, vdupq_n_f32(0));
  v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max()));
}
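// Adds the per-channel mean back and clamps the result to [0, 255].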
inline void addMeanAndClamp(float32x4_t& v, float mean) {
  v = vaddq_f32(v, vdupq_n_f32(mean));
  clamp(v);
}
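// Converts two float32x4 vectors to unsigned integers and narrows them into a
// single uint8x8_t; the inputs are expected to already be clamped to [0, 255].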
inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
  uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
  uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
  uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
  return vmovn_u16(u16_01);
}
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
    : public Operator<CPUContext> {
 public:
  // Expect this many channels as input (packed BGRA)
  static constexpr int kInputChannels = 4;

  // Produce this many channels as output (planar BGR)
  static constexpr int kOutputChannels = 3;

  // Amount of noise we read per vectorized inner-loop iteration
  static constexpr int kNeonNoiseReadSize = kOutputChannels * 16;

  PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp(
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<CPUContext>(operator_def, ws), ws_(ws) {}
  bool RunOnDevice() override {
    const auto& X = Input(0);
    const auto& mean = Input(1);

    auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__");
    auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
        "noise_size", 491 /* prime to avoid artifacts */);

    if (!BlobIsTensorType(*noiseBlob, CPU)) {
      // Initialize random noise on first use; cache it in the workspace so
      // repeated runs see a consistent noise pattern.
      auto* t = BlobGetMutableTensor(noiseBlob, CPU);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // The vectorized path reads noise in aligned chunks, so it needs a
      // slightly larger buffer.
      initNoiseCPUNeon(t, defaultNoiseSize);
#else
      initNoiseCPU(t, defaultNoiseSize);
#endif
    }
    const auto& noise = noiseBlob->template Get<TensorCPU>();
    CAFFE_ENFORCE(noise.numel() >= defaultNoiseSize);
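    // Input is packed NHWC uint8 BGRA; output is planar NCHW float BGR with
    // the mean subtracted and noise added per channel.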
    CAFFE_ENFORCE(X.dim() == 4);
    const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);

    // Make sure we have the right number of mean values
    CAFFE_ENFORCE(mean.numel() == kOutputChannels);

    // We only support 4-channel (BGRA) input
    CAFFE_ENFORCE(C == kInputChannels);
    auto* Y = Output(0, {N, kOutputChannels, H, W}, at::dtype<float>());

    runBatch(
        N,
        C,
        H,
        W,
        defaultNoiseSize,
        X.data<uint8_t>(),
        mean.data<float>(),
        noise.data<float>(),
        Y->template mutable_data<float>());

    return true;
  }
#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void initNoiseCPU(Tensor* noise, int size) {
    noise->Resize(size);

    math::RandGaussian<float, CPUContext>(
        size,
        0.0,
        OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
        noise->template mutable_data<float>(),
        &context_);
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void initNoiseCPUNeon(Tensor* noise, int size) {
    // The vectorized loop reads noise in multiples of kNeonNoiseReadSize, so
    // round the requested size up to the next multiple and add slack.
    size = math::RoundUp(size, kNeonNoiseReadSize) + size;
    noise->Resize(size);

    math::RandGaussian<float, CPUContext>(
        size,
        0.0,
        OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
        noise->template mutable_data<float>(),
        &context_);
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

  void runBatch(
      int N,
      int /*C*/,
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    int planeSize = H * W;

    for (int n = 0; n < N; ++n) {
      auto curInput = input + n * kInputChannels * planeSize;
      auto curOutput = output + n * kOutputChannels * planeSize;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#else
      runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    }
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void runCPU(
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    int planeSize = H * W;
    int noiseOffset = 0;

    for (int point = 0; point < planeSize; ++point) {
      for (int c = 0; c < kOutputChannels; ++c) {
        float v = (float)input[point * kInputChannels + c];
        output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset];

        if (++noiseOffset >= noiseCycle) {
          noiseOffset = 0;
        }
      }
    }
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void runCPUNeon(
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    // Unroll factor for the vectorized loop
    constexpr int kUnroll = 1;

    // How much input we load per inner-loop iteration
    constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);

    // Granularity of each output store
    constexpr int kInnerStoreSize = sizeof(float32x4_t);

    // Pixels loaded per vectorized load (16 BGRA pixels)
    constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
    static_assert(kLoadPixels == 16, "unexpected");

    // Pixels handled per loop iteration
    constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;

    // We need at least this much noise each time through the loop
    CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);
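    // Noise bookkeeping: curNoise walks through the noise buffer as pixels are
    // processed and wraps back once a full noiseCycle worth has been consumed.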
    int noiseUsed = 0;
    const float* curNoise = noise;

    float mean[kOutputChannels] = {
        meanChannel[0], meanChannel[1], meanChannel[2]};
    int planeSize = H * W;

    int point = 0;

    // If the data is not appropriately aligned we fall back to the scalar
    // path for the whole plane.
    bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
        isPointerAligned(output, kInnerStoreSize) &&
        // Output is written at offsets of planeSize floats per channel, so
        // planeSize must also be a multiple of the store size.
        (planeSize % kInnerStoreSize == 0);

    // Portion of the plane handled by the vectorized loop
    int limit =
        isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;
    for (; point < limit; point += kLoadPixelsPerLoop) {
      // Manually unrolled by kUnroll
      for (int j = 0; j < kUnroll; ++j) {
        // Load 16 packed BGRA pixels; vld4q_u8 de-interleaves them so that
        // loadV.val[c] holds 16 consecutive values of channel c.
        const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
            input + (point + j * kLoadPixels) * kInputChannels,
            sizeof(uint8x16x4_t));
        uint8x16x4_t loadV = vld4q_u8(inputAligned);
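        // For each of the three output channels: widen the 16 uint8 values to
        // float, subtract the channel mean, add noise, and store the 16 floats
        // contiguously into that channel's plane.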
        {
          constexpr int kChannel = 0;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }
        {
          constexpr int kChannel = 1;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }
        {
          constexpr int kChannel = 2;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }
      }
      curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
      noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;

      if (noiseUsed >= noiseCycle) {
        noiseUsed = 0;
        curNoise = noise + ((curNoise - noise) % noiseCycle);
      }
    }
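    // Scalar epilogue: handles unaligned data (limit == 0) and any pixels left
    // over after the vectorized loop.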
    for (; point < planeSize; ++point) {
      for (int c = 0; c < kOutputChannels; ++c) {
        float v = (float)input[point * kInputChannels + c];
        output[c * planeSize + point] = v - mean[c] + *curNoise++;
        ++noiseUsed;
      }

      if (noiseUsed >= noiseCycle) {
        noiseUsed = 0;
        curNoise = noise + ((curNoise - noise) % noiseCycle);
      }
    }
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

 private:
  Workspace* ws_;
};

// Clamps a float to the representable range of T before casting.
template <typename T>
static inline T clamped_cast(float f) {
  if (f >= std::numeric_limits<T>::max()) {
    return std::numeric_limits<T>::max();
  }
  if (f <= std::numeric_limits<T>::min()) {
    return std::numeric_limits<T>::min();
  }
  return static_cast<T>(f);
}
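// The deprocess op performs the inverse transform: planar NCHW float BGR back
// to packed NHWC uint8 BGRA, adding the channel means back and filling the
// alpha channel with 255.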
class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
    : public Operator<CPUContext> {
 public:
  using Operator<CPUContext>::Operator;

  // Expect this many channels as input (planar BGR)
  static constexpr int kInputChannels = 3;

  // Produce this many channels as output (packed BGRA)
  static constexpr int kOutputChannels = 4;
  bool RunOnDevice() override {
    const auto& X = Input(0);
    const auto& mean = Input(1);

    CAFFE_ENFORCE(X.dim() == 4);
    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);

    // Make sure we have the right number of mean values
    CAFFE_ENFORCE(mean.numel() == kInputChannels);
    CAFFE_ENFORCE(C == kInputChannels);
    auto* Y = Output(0, {N, H, W, kOutputChannels}, at::dtype<uint8_t>());

    runBatch(
        N,
        C,
        H,
        W,
        X.data<float>(),
        mean.data<float>(),
        Y->template mutable_data<uint8_t>());

    return true;
  }
  void runBatch(
      int N,
      int /*C*/,
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    int planeSize = H * W;

    for (int n = 0; n < N; ++n) {
      auto curInput = input + n * kInputChannels * planeSize;
      auto curOutput = output + n * kOutputChannels * planeSize;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      runCPUNeon(H, W, curInput, meanChannel, curOutput);
#else
      runCPU(H, W, curInput, meanChannel, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    }
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void runCPU(
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    int planeSize = H * W;

    for (int point = 0; point < planeSize; ++point) {
      for (int c = 0; c < kInputChannels; ++c) {
        uint8_t v = clamped_cast<uint8_t>(
            input[c * planeSize + point] + meanChannel[c]);
        output[point * kOutputChannels + c] = v;
      }

      // Alpha channel is fully opaque
      output[point * kOutputChannels + (kOutputChannels - 1)] =
          std::numeric_limits<uint8_t>::max();
    }
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void runCPUNeon(
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    // Vectorized load/store parameters
    constexpr int kLoadUnit = sizeof(float32x4_t);
    constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float));

    // We write out 8 packed BGRA pixels per store
    constexpr int kStoreUnit = sizeof(uint8x8x4_t);

    // Pixels handled per loop iteration
    constexpr int kLoadPixels = 2 * kLoadFloats;

    float mean[kInputChannels] = {
        meanChannel[0], meanChannel[1], meanChannel[2]};
    int planeSize = H * W;

    int point = 0;

    // If the data is not appropriately aligned we fall back to the scalar
    // path for the whole plane.
    bool isAligned = isPointerAligned(input, kLoadUnit) &&
        isPointerAligned(output, kStoreUnit) &&
        // Input is read at offsets of planeSize floats per channel, so
        // planeSize must also be a multiple of the load unit.
        (planeSize % kLoadUnit == 0);

    // Portion of the plane handled by the vectorized loop
    int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;
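    // Vectorized main loop: 8 pixels per iteration, loaded as two float32x4
    // vectors from each channel plane.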
    for (; point < limit; point += kLoadPixels) {
      // Load 8 floats from each of the three channel planes
      float32x4_t inputc0_0 =
          vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc0_1 =
          vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);

      float32x4_t inputc1_0 =
          vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc1_1 =
          vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);

      float32x4_t inputc2_0 =
          vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc2_1 =
          vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);

      // Add the mean back, clamp to [0, 255], and narrow to uint8
      addMeanAndClamp(inputc0_0, mean[0]);
      addMeanAndClamp(inputc0_1, mean[0]);
      uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);

      addMeanAndClamp(inputc1_0, mean[1]);
      addMeanAndClamp(inputc1_1, mean[1]);
      uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);

      addMeanAndClamp(inputc2_0, mean[2]);
      addMeanAndClamp(inputc2_1, mean[2]);
      uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);

      // Alpha channel is fully opaque
      uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());

      // Interleave the four channels and store 8 packed BGRA pixels
      uint8x8x4_t u8_out = {{u8_c0, u8_c1, u8_c2, u8_c3}};
      vst4_u8_aligned(output + kOutputChannels * point, u8_out);
    }
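    // Scalar epilogue for unaligned data and leftover pixels.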
    for (; point < planeSize; ++point) {
      for (int c = 0; c < kInputChannels; ++c) {
        uint8_t v =
            clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
        output[point * kOutputChannels + c] = v;
      }

      // Alpha channel is fully opaque
      output[point * kOutputChannels + (kOutputChannels - 1)] =
          std::numeric_limits<uint8_t>::max();
    }
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
};

REGISTER_CPU_OPERATOR(
    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
    PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp);
OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
    .NumInputs(2)
    .NumOutputs(1);
REGISTER_CPU_OPERATOR(
    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
    BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
    .NumInputs(2)
    .NumOutputs(1);

#ifdef CAFFE2_USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
    IDEEPFallbackOp<BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp>);
REGISTER_IDEEP_OPERATOR(
    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
    IDEEPFallbackOp<PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp>);
#endif

} // namespace caffe2