Caffe2 - C++ API
A deep learning, cross-platform ML framework
stylizer_ops.cc
17 #include "caffe2/core/operator.h"
18 #include "caffe2/utils/cpu_neon.h"
19 #include "caffe2/utils/math.h"
20 
21 namespace caffe2 {
22 
23 #ifdef __ARM_NEON__
24 namespace {
25 
26 //
27 // ARM Neon code utilities
28 //
29 
30 inline float32x4_t to_v4_f32(uint16x4_t v) {
31  return vcvtq_f32_u32(vmovl_u16(v));
32 }
33 
34 inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
35  float32x4x4_t out;
36 
37  uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));
38 
39  out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
40  out.val[1] = to_v4_f32(vget_high_u16(lo_u16));
41 
42  uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));
43 
44  out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
45  out.val[3] = to_v4_f32(vget_high_u16(hi_u16));
46 
47  return out;
48 }
49 
50 inline void clamp(float32x4_t& v) {
51  v = vmaxq_f32(v, vdupq_n_f32(0));
52  v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max()));
53 }
54 
55 inline void addMeanAndClamp(float32x4_t& v, float mean) {
56  v = vaddq_f32(v, vdupq_n_f32(mean));
57  clamp(v);
58 }
59 
60 inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
61  uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
62  uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
63  uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
64  return vmovn_u16(u16_01);
65 }
66 
67 } // unnamed namespace
68 #endif // __ARM_NEON__
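
The helpers above are only compiled for ARM NEON builds. As a rough scalar sketch (illustrative only, not part of the source, assuming standard NEON semantics where vcvtq_u32_f32 truncates toward zero), they amount to widening packed uint8 data to float, adding a constant and clamping to the uint8 range, and truncating back down to uint8:

// Illustrative scalar equivalents of the NEON helpers above.
#include <algorithm>
#include <cstdint>

// to_f32_v4_x4: widen 16 packed uint8 values into 16 floats.
inline void widen_u8x16_to_f32(const uint8_t in[16], float out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = static_cast<float>(in[i]);
  }
}

// addMeanAndClamp / clamp: add a constant, then bound to [0, 255].
inline float add_mean_and_clamp(float v, float mean) {
  return std::min(std::max(v + mean, 0.0f), 255.0f);
}

// convertNarrowAndPack: truncate 8 floats back down to 8 uint8 values
// (inputs are assumed to be pre-clamped, as in the NEON path).
inline void narrow_f32x8_to_u8(const float in[8], uint8_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<uint8_t>(in[i]);
  }
}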
69 
70 class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp final
71  : public Operator<CPUContext> {
72  public:
73  // Expect this many channels as input
74  static constexpr int kInputChannels = 4;
75 
76  // Expect this many channels as output
77  static constexpr int kOutputChannels = 3;
78 
79  // We read this much noise per vectorized cycle
80  static constexpr int kNeonNoiseReadSize = kOutputChannels * 16;
81 
82  USE_OPERATOR_FUNCTIONS(CPUContext);
83  PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp(
84  const OperatorDef& operator_def,
85  Workspace* ws)
86  : Operator<CPUContext>(operator_def, ws), ws_(ws) {}
87 
88  bool RunOnDevice() {
89  const auto& X = Input(0);
90  const auto& mean = Input(1);
91  auto* Y = Output(0);
92  auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__");
93  auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
94  "noise_size", 491 /* prime to avoid artifacts */);
95 
96  if (!noiseBlob->IsType<TensorCPU>()) {
97  // Initialize random noise on first use.
98  // Cache it to maintain temporal consistency.
99  auto* t = noiseBlob->template GetMutable<TensorCPU>();
100 
101 #ifdef __ARM_NEON__
102  // Noise space is larger for vectorized code due to the
103  // vectorized load
104  initNoiseCPUNeon(t, defaultNoiseSize);
105 #else
106  initNoiseCPU(t, defaultNoiseSize);
107 #endif
108  }
109  const auto& noise = noiseBlob->template Get<TensorCPU>();
110  CAFFE_ENFORCE(noise.size() >= defaultNoiseSize);
111 
112  CAFFE_ENFORCE(X.ndim() == 4);
113  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
114  // Assume BGR or BGRA
115  CAFFE_ENFORCE(mean.size() == kOutputChannels);
116 
117  CAFFE_ENFORCE(C == kInputChannels);
118  Y->Resize(N, kOutputChannels, H, W);
119 
120  runBatch(
121  N,
122  C,
123  H,
124  W,
125  defaultNoiseSize,
126  X.data<uint8_t>(),
127  mean.data<float>(),
128  noise.data<float>(),
129  Y->mutable_data<float>());
130 
131  return true;
132  }
133 
134 #ifndef __ARM_NEON__
135  void initNoiseCPU(Tensor<CPUContext>* noise, int size) {
136  noise->Resize(size);
137 
138  math::RandGaussian<float, CPUContext>(
139  size,
140  0.0,
141  OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
142  noise->template mutable_data<float>(),
143  &context_);
144  }
145 #endif // !__ARM_NEON__
146 
147 #ifdef __ARM_NEON__
148  void initNoiseCPUNeon(Tensor<CPUContext>* noise, int size) {
149  // For ARM NEON, we read in multiples of kNeonNoiseReadSize since
150  // the inner loop is vectorized. Round up to the next highest
151  // multiple of kNeonNoiseReadSize
152  size = math::roundUp(size, kNeonNoiseReadSize) + size;
153  noise->Resize(size);
154 
155  math::RandGaussian<float, CPUContext>(
156  size,
157  0.0,
158  OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
159  noise->template mutable_data<float>(),
160  &context_);
161  }
162 #endif // __ARM_NEON__
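
As a worked example of the sizing above (not in the source): with the default noise_size of 491 and kNeonNoiseReadSize = 3 * 16 = 48, math::roundUp(491, 48) = 528, so the NEON path allocates 528 + 491 = 1019 floats. The slack lets the vectorized loop keep reading whole 48-float noise chunks from any starting offset below noiseCycle without running past the end of the buffer before the cursor wraps.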
163 
164  void runBatch(
165  int N,
166  int /*C*/,
167  int H,
168  int W,
169  int noiseCycle,
170  const uint8_t* input,
171  const float* meanChannel,
172  const float* noise,
173  float* output) {
174  int planeSize = H * W;
175 
176  for (int n = 0; n < N; ++n) {
177  auto curInput = input + n * kInputChannels * planeSize;
178  auto curOutput = output + n * kOutputChannels * planeSize;
179 
180 #ifdef __ARM_NEON__
181  runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
182 #else
183  runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
184 #endif // __ARM_NEON__
185  }
186  }
187 
188 #ifndef __ARM_NEON__
189  void runCPU(
190  int H,
191  int W,
192  int noiseCycle,
193  const uint8_t* input,
194  const float* meanChannel,
195  const float* noise,
196  float* output) {
197  int planeSize = H * W;
198  int noiseOffset = 0;
199 
200  for (int point = 0; point < planeSize; ++point) {
201  for (int c = 0; c < kOutputChannels; ++c) {
202  float v = (float)input[point * kInputChannels + c];
203  output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset];
204 
205  if (++noiseOffset >= noiseCycle) {
206  noiseOffset = 0;
207  }
208  }
209  }
210  }
211 #endif // !__ARM_NEON__
212 
213 #ifdef __ARM_NEON__
214  void runCPUNeon(
215  int H,
216  int W,
217  int noiseCycle,
218  const uint8_t* input,
219  const float* meanChannel,
220  const float* noise,
221  float* output) {
222  // Vectorized load parameters:
223 
224  // Loop unroll factor
225  // FIXME: this doesn't actually unroll; clang has per-loop unroll
226  // pragmas but GCC does not
227  constexpr int kUnroll = 1;
228 
229  // How much data we load for each inner loop
230  constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);
231 
232  // What we write out
233  constexpr int kInnerStoreSize = sizeof(float32x4_t);
234 
235  // We load 16 pixels at a time, with 4 channels each
236  constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
237  static_assert(kLoadPixels == 16, "unexpected");
238 
239  // How many pixels we load per loop
240  constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;
241 
242  // We need at least this much noise each loop through
243  CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);
244 
245  int noiseUsed = 0;
246  const float* curNoise = noise;
247 
248  float mean[kOutputChannels] = {
249  meanChannel[0], meanChannel[1], meanChannel[2]};
250  int planeSize = H * W;
251 
252  // Vectorized portion
253  int point = 0;
254 
255  // If the slice is not aligned, then we have to use the
256  // un-vectorized version
257  bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
258  isPointerAligned(output, kInnerStoreSize) &&
259  // Because we are writing to output at offsets of planeSize,
260  // planeSize has to be an even multiple of kInnerStoreSize
261  (planeSize % kInnerStoreSize == 0);
262 
263  // What portion the vectorized loop will handle
264  int limit =
265  isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;
266 
267  for (; point < limit; point += kLoadPixelsPerLoop) {
268  // Unroll load/update/store by kUnroll
269  for (int j = 0; j < kUnroll; ++j) {
270  // We load 16 pixels x 4 channels at a time
271  const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
272  input + (point + j * kLoadPixels) * kInputChannels,
273  sizeof(uint8x16x4_t));
274  uint8x16x4_t loadV = vld4q_u8(inputAligned);
275 
276  // The compiler doesn't want to unroll this when we put it in a
277  // loop, and in GCC there's no per-loop unroll pragma, so we do
278  // it manually.
279  // This seems to involve no register spillage, crossing fingers
280  // that it remains that way.
281  {
282  constexpr int kChannel = 0;
283  float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
284  float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
285  float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
286  float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);
287 
288  float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
289  float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
290  outV.val[0] = vsubq_f32(outV.val[0], meanV);
291  outV.val[1] = vsubq_f32(outV.val[1], meanV);
292  outV.val[2] = vsubq_f32(outV.val[2], meanV);
293  outV.val[3] = vsubq_f32(outV.val[3], meanV);
294 
295  outV.val[0] = vaddq_f32(outV.val[0], noise0);
296  outV.val[1] = vaddq_f32(outV.val[1], noise1);
297  outV.val[2] = vaddq_f32(outV.val[2], noise2);
298  outV.val[3] = vaddq_f32(outV.val[3], noise3);
299 
300  float* outputAligned = (float*)__builtin_assume_aligned(
301  &output[kChannel * planeSize + (point + j * kLoadPixels)],
302  sizeof(float32x4_t));
303 
304  vst1q_f32(outputAligned + 0, outV.val[0]);
305  vst1q_f32(outputAligned + 4, outV.val[1]);
306  vst1q_f32(outputAligned + 8, outV.val[2]);
307  vst1q_f32(outputAligned + 12, outV.val[3]);
308  }
309 
310  {
311  constexpr int kChannel = 1;
312  float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
313  float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
314  float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
315  float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);
316 
317  float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
318  float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
319  outV.val[0] = vsubq_f32(outV.val[0], meanV);
320  outV.val[1] = vsubq_f32(outV.val[1], meanV);
321  outV.val[2] = vsubq_f32(outV.val[2], meanV);
322  outV.val[3] = vsubq_f32(outV.val[3], meanV);
323 
324  outV.val[0] = vaddq_f32(outV.val[0], noise0);
325  outV.val[1] = vaddq_f32(outV.val[1], noise1);
326  outV.val[2] = vaddq_f32(outV.val[2], noise2);
327  outV.val[3] = vaddq_f32(outV.val[3], noise3);
328 
329  float* outputAligned = (float*)__builtin_assume_aligned(
330  &output[kChannel * planeSize + (point + j * kLoadPixels)],
331  sizeof(float32x4_t));
332 
333  vst1q_f32(outputAligned + 0, outV.val[0]);
334  vst1q_f32(outputAligned + 4, outV.val[1]);
335  vst1q_f32(outputAligned + 8, outV.val[2]);
336  vst1q_f32(outputAligned + 12, outV.val[3]);
337  }
338 
339  {
340  constexpr int kChannel = 2;
341  float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
342  float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
343  float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
344  float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);
345 
346  float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
347  float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
348  outV.val[0] = vsubq_f32(outV.val[0], meanV);
349  outV.val[1] = vsubq_f32(outV.val[1], meanV);
350  outV.val[2] = vsubq_f32(outV.val[2], meanV);
351  outV.val[3] = vsubq_f32(outV.val[3], meanV);
352 
353  outV.val[0] = vaddq_f32(outV.val[0], noise0);
354  outV.val[1] = vaddq_f32(outV.val[1], noise1);
355  outV.val[2] = vaddq_f32(outV.val[2], noise2);
356  outV.val[3] = vaddq_f32(outV.val[3], noise3);
357 
358  float* outputAligned = (float*)__builtin_assume_aligned(
359  &output[kChannel * planeSize + (point + j * kLoadPixels)],
360  sizeof(float32x4_t));
361 
362  vst1q_f32(outputAligned + 0, outV.val[0]);
363  vst1q_f32(outputAligned + 4, outV.val[1]);
364  vst1q_f32(outputAligned + 8, outV.val[2]);
365  vst1q_f32(outputAligned + 12, outV.val[3]);
366  }
367  }
368 
369  curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
370  noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;
371 
372  if (noiseUsed >= noiseCycle) {
373  noiseUsed = 0;
374  curNoise = noise + ((curNoise - noise) % noiseCycle);
375  }
376  }
377 
378  // Epilogue: non-vectorized remainder
379  for (; point < planeSize; ++point) {
380  for (int c = 0; c < kOutputChannels; ++c) {
381  float v = (float)input[point * kInputChannels + c];
382  output[c * planeSize + point] = v - mean[c] + *curNoise++;
383  ++noiseUsed;
384  }
385 
386  if (noiseUsed >= noiseCycle) {
387  noiseUsed = 0;
388  curNoise = noise + ((curNoise - noise) % noiseCycle);
389  }
390  }
391  }
392 #endif // __ARM_NEON__
393 
394  private:
395  Workspace* ws_;
396 };
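
A minimal usage sketch for this operator (not part of the file; the blob names, shapes, and function name are placeholders). It assumes the operator is registered under the schema name given at the bottom of this file and uses the same Workspace/Blob/Tensor calls that appear in the operator above:

// Hypothetical driver: uint8 NHWC BGRA input {1, H, W, 4} plus a 3-element
// per-channel mean, producing a float NCHW BGR output {1, 3, H, W}.
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

void runPreprocessExample() {
  caffe2::Workspace ws;

  auto* x = ws.CreateBlob("X")->GetMutable<caffe2::TensorCPU>();
  x->Resize(1, 224, 224, 4);
  x->mutable_data<uint8_t>();  // fill with packed BGRA image bytes in practice

  auto* mean = ws.CreateBlob("mean")->GetMutable<caffe2::TensorCPU>();
  mean->Resize(3);
  mean->mutable_data<float>();  // fill with per-channel BGR means

  caffe2::OperatorDef def;
  def.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess");
  def.add_input("X");
  def.add_input("mean");
  def.add_output("Y");

  auto op = caffe2::CreateOperator(def, &ws);
  op->Run();  // "Y" now holds the mean-subtracted, noise-perturbed NCHW tensor
}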
397 
398 namespace {
399 
400 template <typename T>
401 static inline T clamped_cast(float f) {
402  if (f >= std::numeric_limits<T>::max()) {
403  return std::numeric_limits<T>::max();
404  }
405  if (f <= std::numeric_limits<T>::min()) {
406  return std::numeric_limits<T>::min();
407  }
408  return static_cast<T>(f);
409 }
410 
411 } // unnamed namespace
412 
413 class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp final
414  : public Operator<CPUContext> {
415  public:
416  USE_SIMPLE_CTOR_DTOR(BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
417 
418  // Expect this many channels as input
419  static constexpr int kInputChannels = 3;
420 
421  // Expect this many channels as output
422  static constexpr int kOutputChannels = 4;
423 
424  bool RunOnDevice() {
425  const auto& X = Input(0);
426  const auto& mean = Input(1);
427  auto* Y = Output(0);
428  CAFFE_ENFORCE(X.ndim() == 4);
429  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
430  // Assume BGR or BGRA
431  CAFFE_ENFORCE(mean.size() == kInputChannels);
432  CAFFE_ENFORCE(C == kInputChannels);
433  // RGB
434  Y->Resize(N, H, W, kOutputChannels);
435 
436  runBatch(
437  N,
438  C,
439  H,
440  W,
441  X.data<float>(),
442  mean.data<float>(),
443  Y->mutable_data<uint8_t>());
444 
445  return true;
446  }
447 
448  void runBatch(
449  int N,
450  int /*C*/,
451  int H,
452  int W,
453  const float* input,
454  const float* meanChannel,
455  uint8_t* output) {
456  int planeSize = H * W;
457 
458  for (int n = 0; n < N; ++n) {
459  auto curInput = input + n * kInputChannels * planeSize;
460  auto curOutput = output + n * kOutputChannels * planeSize;
461 
462 #ifdef __ARM_NEON__
463  runCPUNeon(H, W, curInput, meanChannel, curOutput);
464 #else
465  runCPU(H, W, curInput, meanChannel, curOutput);
466 #endif // __ARM_NEON__
467  }
468  }
469 
470 #ifndef __ARM_NEON__
471  void runCPU(
472  int H,
473  int W,
474  const float* input,
475  const float* meanChannel,
476  uint8_t* output) {
477  int planeSize = H * W;
478 
479  for (int point = 0; point < planeSize; ++point) {
480  for (int c = 0; c < kInputChannels; ++c) {
481  uint8_t v = clamped_cast<uint8_t>(
482  input[c * planeSize + point] + meanChannel[c]);
483  output[point * kOutputChannels + c] = v;
484  }
485 
486  // alpha
487  output[point * kOutputChannels + (kOutputChannels - 1)] =
488  std::numeric_limits<uint8_t>::max();
489  }
490  }
491 #endif // !__ARM_NEON__
492 
493 #ifdef __ARM_NEON__
494  void runCPUNeon(
495  int H,
496  int W,
497  const float* input,
498  const float* meanChannel,
499  uint8_t* output) {
500  // Vectorized load parameters:
501 
502  // We load in chunks of this size
503  constexpr int kLoadUnit = sizeof(float32x4_t);
504  constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float));
505 
506  // We store in chunks of this size
507  constexpr int kStoreUnit = sizeof(uint8x8x4_t);
508 
509  // The vector portion loads this many f32 pixels at a time (8)
510  constexpr int kLoadPixels = 2 * kLoadFloats;
511 
512  float mean[kInputChannels] = {
513  meanChannel[0], meanChannel[1], meanChannel[2]};
514  int planeSize = H * W;
515 
516  // Vectorized portion
517  int point = 0;
518 
519  // If the slice is not aligned, then we have to use the
520  // un-vectorized version
521  bool isAligned = isPointerAligned(input, kLoadUnit) &&
522  isPointerAligned(output, kStoreUnit) &&
523  // Because we are reading from input at offsets of planeSize,
524  // planeSize has to be an even multiple of kLoadUnit
525  (planeSize % kLoadUnit == 0);
526 
527  // What portion the vectorized loop will handle
528  int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;
529 
530  for (; point < limit; point += kLoadPixels) {
531  // Load 8 f32 pixels from each channel; loading 16 involves
532  // register spills it seems
533  float32x4_t inputc0_0 =
534  vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
535  float32x4_t inputc0_1 =
536  vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);
537 
538  float32x4_t inputc1_0 =
539  vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
540  float32x4_t inputc1_1 =
541  vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);
542 
543  float32x4_t inputc2_0 =
544  vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
545  float32x4_t inputc2_1 =
546  vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);
547 
548  addMeanAndClamp(inputc0_0, mean[0]);
549  addMeanAndClamp(inputc0_1, mean[0]);
550  uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);
551 
552  addMeanAndClamp(inputc1_0, mean[1]);
553  addMeanAndClamp(inputc1_1, mean[1]);
554  uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);
555 
556  addMeanAndClamp(inputc2_0, mean[2]);
557  addMeanAndClamp(inputc2_1, mean[2]);
558  uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);
559 
560  // This is the alpha channel
561  uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());
562 
563  // We now have 8 bytes of each channel in a separate vector
564  // Write BGRA interleaved to output
565  uint8x8x4_t u8_out = {{ u8_c0, u8_c1, u8_c2, u8_c3 }};
566  vst4_u8_aligned(output + kOutputChannels * point, u8_out);
567  }
568 
569  // Epilogue: non-vectorized remainder
570  for (; point < planeSize; ++point) {
571  for (int c = 0; c < kInputChannels; ++c) {
572  uint8_t v =
573  clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
574  output[point * kOutputChannels + c] = v;
575  }
576 
577  // alpha
578  output[point * kOutputChannels + (kOutputChannels - 1)] =
579  std::numeric_limits<uint8_t>::max();
580  }
581  }
582 #endif // __ARM_NEON__
583 };
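
And a corresponding sketch for the inverse (deprocess) step, again with placeholder blob and function names; it assumes the same workspace, "Y" tensor, and "mean" blob produced in the preprocess example above:

// Hypothetical inverse step: float NCHW {1, 3, H, W} plus the same mean back
// to packed uint8 NHWC BGRA {1, H, W, 4}, with the alpha channel forced to 255.
void runDeprocessExample(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("BRGNCHWCToPackedInt8BGRAStylizerDeprocess");
  def.add_input("Y");      // e.g. the stylized float NCHW tensor
  def.add_input("mean");   // the 3-element mean used during preprocess
  def.add_output("out");

  auto op = caffe2::CreateOperator(def, ws);
  op->Run();  // "out" now holds clamped uint8 BGRA pixels
}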
584 
585 namespace {
586 
587 REGISTER_CPU_OPERATOR(
588  PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
589  PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp);
590 OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
591  .NumInputs(2)
592  .NumOutputs(1);
593 REGISTER_CPU_OPERATOR(
594  BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
595  BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
596 OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
597  .NumInputs(2)
598  .NumOutputs(1);
599 } // namespace
600 } // namespace caffe2