Caffe2 - C++ API
A deep learning, cross-platform ML framework
stylizer_ops.cc
#include "caffe2/core/operator.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"

#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif

namespace caffe2 {

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {

//
// ARM Neon code utilities
//

inline float32x4_t to_v4_f32(uint16x4_t v) {
  return vcvtq_f32_u32(vmovl_u16(v));
}

inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
  float32x4x4_t out;

  uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));

  out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
  out.val[1] = to_v4_f32(vget_high_u16(lo_u16));

  uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));

  out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
  out.val[3] = to_v4_f32(vget_high_u16(hi_u16));

  return out;
}

inline void clamp(float32x4_t& v) {
  v = vmaxq_f32(v, vdupq_n_f32(0));
  v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max()));
}

inline void addMeanAndClamp(float32x4_t& v, float mean) {
  v = vaddq_f32(v, vdupq_n_f32(mean));
  clamp(v);
}

inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
  uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
  uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
  uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
  return vmovn_u16(u16_01);
}

} // unnamed namespace
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
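
// Illustrative scalar sketch (assumed helper names; not used by the operators
// below): the NEON helpers above are the vectorized form of this per-element
// logic. vcvtq_u32_f32 truncates toward zero, so the narrowing step behaves
// like a plain static_cast after clamping.
inline float to_f32_scalar(uint8_t v) {
  // to_f32_v4_x4() performs this conversion for 16 lanes at once
  return static_cast<float>(v);
}

inline uint8_t clampAndNarrowScalar(float v) {
  // clamp() + convertNarrowAndPack() perform this for 8 lanes at once:
  // clamp to [0, 255], then truncate to an unsigned byte
  v = v < 0.0f ? 0.0f : v;
  v = v > 255.0f ? 255.0f : v;
  return static_cast<uint8_t>(v);
}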

class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
    : public Operator<CPUContext> {
 public:
  // Expect this many channels as input
  static constexpr int kInputChannels = 4;

  // Expect this many channels as output
  static constexpr int kOutputChannels = 3;

  // We read this much noise per vectorized cycle
  static constexpr int kNeonNoiseReadSize = kOutputChannels * 16;

  USE_OPERATOR_FUNCTIONS(CPUContext);
  explicit PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp(
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<CPUContext>(operator_def, ws), ws_(ws) {}

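  // Shape contract, with a concrete example: a (1, 8, 8, 4) uint8 BGRA input
  // and a 3-element float mean produce a (1, 3, 8, 8) float NCHW output, with
  // the per-channel mean subtracted and cached Gaussian noise added.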
  bool RunOnDevice() override {
    const auto& X = Input(0);
    const auto& mean = Input(1);

    auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__");
    auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
        "noise_size", 491 /* prime to avoid artifacts */);

    if (!BlobIsTensorType(*noiseBlob, CPU)) {
      // Initialize random noise on first use.
      // Cache it to maintain temporal consistency.
      auto* t = BlobGetMutableTensor(noiseBlob, CPU);

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Noise space is larger for vectorized code due to the
      // vectorized load
      initNoiseCPUNeon(t, defaultNoiseSize);
#else
      initNoiseCPU(t, defaultNoiseSize);
#endif
    }
    const auto& noise = noiseBlob->template Get<TensorCPU>();
    CAFFE_ENFORCE(noise.numel() >= defaultNoiseSize);

    CAFFE_ENFORCE(X.dim() == 4);
    const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
    // Assume BGR or BGRA
    CAFFE_ENFORCE(mean.numel() == kOutputChannels);

    CAFFE_ENFORCE(C == kInputChannels);
    auto* Y = Output(0, {N, kOutputChannels, H, W}, at::dtype<float>());

    runBatch(
        N,
        C,
        H,
        W,
        defaultNoiseSize,
        X.data<uint8_t>(),
        mean.data<float>(),
        noise.data<float>(),
        Y->template mutable_data<float>());

    return true;
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void initNoiseCPU(Tensor* noise, int size) {
    noise->Resize(size);

    math::RandGaussian<float, CPUContext>(
        size,
        0.0,
        OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
        noise->template mutable_data<float>(),
        &context_);
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void initNoiseCPUNeon(Tensor* noise, int size) {
    // For ARM NEON, we read in multiples of kNeonNoiseReadSize since
    // the inner loop is vectorized. Round up to the next highest
    // multiple of kNeonNoiseReadSize
    size = math::RoundUp(size, kNeonNoiseReadSize) + size;
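    // For example, with the default noise_size of 491 and
    // kNeonNoiseReadSize = 3 * 16 = 48, this allocates
    // RoundUp(491, 48) + 491 = 528 + 491 = 1019 floats: even after the noise
    // pointer wraps to an offset just below 491, a full cycle of 48-float
    // reads (at most 528 floats) stays within the buffer.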
    noise->Resize(size);

    math::RandGaussian<float, CPUContext>(
        size,
        0.0,
        OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
        noise->template mutable_data<float>(),
        &context_);
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

  void runBatch(
      int N,
      int /*C*/,
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    int planeSize = H * W;

    for (int n = 0; n < N; ++n) {
      auto curInput = input + n * kInputChannels * planeSize;
      auto curOutput = output + n * kOutputChannels * planeSize;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#else
      runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    }
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void runCPU(
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    int planeSize = H * W;
    int noiseOffset = 0;

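    // Index math, with a concrete example: for H = 2, W = 4 (planeSize = 8),
    // pixel point = 5 and channel c = 1 (G in BGRA) read
    // input[5 * 4 + 1] = input[21] from the interleaved NHWC layout and write
    // output[1 * 8 + 5] = output[13] in the planar NCHW layout.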
    for (int point = 0; point < planeSize; ++point) {
      for (int c = 0; c < kOutputChannels; ++c) {
        float v = (float)input[point * kInputChannels + c];
        output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset];

        if (++noiseOffset >= noiseCycle) {
          noiseOffset = 0;
        }
      }
    }
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
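
  // The NEON variant below produces the same result as runCPU above; it only
  // changes how many pixels are processed per step and how the noise buffer
  // is consumed.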

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void runCPUNeon(
      int H,
      int W,
      int noiseCycle,
      const uint8_t* input,
      const float* meanChannel,
      const float* noise,
      float* output) {
    // Vectorized load parameters:

    // Loop unroll factor
    // FIXME: this doesn't actually unroll; clang has per-loop unroll
    // pragmas but GCC does not
    constexpr int kUnroll = 1;

    // How much data we load for each inner loop
    constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);

    // What we write out
    constexpr int kInnerStoreSize = sizeof(float32x4_t);

    // We load 16 pixels at a time, with 4 channels each
    constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
    static_assert(kLoadPixels == 16, "unexpected");

    // How many pixels we load per loop
    constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;

    // We need at least this much noise each loop through
    CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);

    int noiseUsed = 0;
    const float* curNoise = noise;

    float mean[kOutputChannels] = {
        meanChannel[0], meanChannel[1], meanChannel[2]};
    int planeSize = H * W;

    // Vectorized portion
    int point = 0;

    // If the slice is not aligned, then we have to use the
    // un-vectorized version
    bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
        isPointerAligned(output, kInnerStoreSize) &&
        // Because we are writing to output at offsets of planeSize,
        // planeSize has to be an even multiple of kInnerStoreSize
        (planeSize % kInnerStoreSize == 0);

    // What portion the vectorized loop will handle
    int limit =
        isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;

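    // Each vectorized iteration below consumes 48 noise floats, laid out as
    // [0, 16) for channel 0, [16, 32) for channel 1, and [32, 48) for
    // channel 2, matching the curNoise offsets in the three blocks.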
    for (; point < limit; point += kLoadPixelsPerLoop) {
      // Unroll load/update/store by kUnroll
      for (int j = 0; j < kUnroll; ++j) {
        // We load 16 pixels x 4 channels at a time
        const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
            input + (point + j * kLoadPixels) * kInputChannels,
            sizeof(uint8x16x4_t));
        uint8x16x4_t loadV = vld4q_u8(inputAligned);

        // The compiler doesn't want to unroll this when we put it in a
        // loop, and in GCC there's no per-loop unroll pragma, so we do
        // it manually.
        // This seems to involve no register spillage, crossing fingers
        // that it remains that way.
        {
          constexpr int kChannel = 0;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }

        {
          constexpr int kChannel = 1;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }

        {
          constexpr int kChannel = 2;
          float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
          float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
          float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
          float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);

          float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
          float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
          outV.val[0] = vsubq_f32(outV.val[0], meanV);
          outV.val[1] = vsubq_f32(outV.val[1], meanV);
          outV.val[2] = vsubq_f32(outV.val[2], meanV);
          outV.val[3] = vsubq_f32(outV.val[3], meanV);

          outV.val[0] = vaddq_f32(outV.val[0], noise0);
          outV.val[1] = vaddq_f32(outV.val[1], noise1);
          outV.val[2] = vaddq_f32(outV.val[2], noise2);
          outV.val[3] = vaddq_f32(outV.val[3], noise3);

          float* outputAligned = (float*)__builtin_assume_aligned(
              &output[kChannel * planeSize + (point + j * kLoadPixels)],
              sizeof(float32x4_t));

          vst1q_f32(outputAligned + 0, outV.val[0]);
          vst1q_f32(outputAligned + 4, outV.val[1]);
          vst1q_f32(outputAligned + 8, outV.val[2]);
          vst1q_f32(outputAligned + 12, outV.val[3]);
        }
      }

      curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
      noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;

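      // Once a full noise cycle has been consumed, wrap the noise pointer
      // back into the first noiseCycle floats; the padding allocated in
      // initNoiseCPUNeon() guarantees the 48-float reads above never run
      // past the end of the noise buffer.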
      if (noiseUsed >= noiseCycle) {
        noiseUsed = 0;
        curNoise = noise + ((curNoise - noise) % noiseCycle);
      }
    }

    // Epilogue: non-vectorized remainder
    for (; point < planeSize; ++point) {
      for (int c = 0; c < kOutputChannels; ++c) {
        float v = (float)input[point * kInputChannels + c];
        output[c * planeSize + point] = v - mean[c] + *curNoise++;
        ++noiseUsed;
      }

      if (noiseUsed >= noiseCycle) {
        noiseUsed = 0;
        curNoise = noise + ((curNoise - noise) % noiseCycle);
      }
    }
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)

 private:
  Workspace* ws_;
};
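
// Illustrative NetDef usage (blob names are hypothetical); the schema below
// declares two inputs (packed uint8 BGRA image, per-channel mean) and one
// output:
//
//   op {
//     type: "PackedInt8BGRANHWCToNCHWCStylizerPreprocess"
//     input: "packed_bgra"
//     input: "mean_bgr"
//     output: "styled_input"
//     arg { name: "noise_std" f: 10.0 }
//     arg { name: "noise_size" i: 491 }
//   }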

namespace {

template <typename T>
static inline T clamped_cast(float f) {
  if (f >= std::numeric_limits<T>::max()) {
    return std::numeric_limits<T>::max();
  }
  if (f <= std::numeric_limits<T>::min()) {
    return std::numeric_limits<T>::min();
  }
  return static_cast<T>(f);
}

} // unnamed namespace
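
// For example, clamped_cast<uint8_t>(-3.2f) == 0,
// clamped_cast<uint8_t>(270.0f) == 255, and
// clamped_cast<uint8_t>(128.6f) == 128 (in-range values are truncated).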

class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
    : public Operator<CPUContext> {
 public:
  using Operator<CPUContext>::Operator;

  // Expect this many channels as input
  static constexpr int kInputChannels = 3;

  // Expect this many channels as output
  static constexpr int kOutputChannels = 4;

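  // Shape contract, with a concrete example: a (1, 3, 8, 8) float NCHW input
  // and a 3-element float mean produce a (1, 8, 8, 4) uint8 BGRA output with
  // the mean added back, values clamped to [0, 255], and alpha set to 255.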
  bool RunOnDevice() override {
    const auto& X = Input(0);
    const auto& mean = Input(1);

    CAFFE_ENFORCE(X.dim() == 4);
    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
    // Assume BGR or BGRA
    CAFFE_ENFORCE(mean.numel() == kInputChannels);
    CAFFE_ENFORCE(C == kInputChannels);
    // RGB
    auto* Y = Output(0, {N, H, W, kOutputChannels}, at::dtype<uint8_t>());

    runBatch(
        N,
        C,
        H,
        W,
        X.data<float>(),
        mean.data<float>(),
        Y->template mutable_data<uint8_t>());

    return true;
  }

  void runBatch(
      int N,
      int /*C*/,
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    int planeSize = H * W;

    for (int n = 0; n < N; ++n) {
      auto curInput = input + n * kInputChannels * planeSize;
      auto curOutput = output + n * kOutputChannels * planeSize;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      runCPUNeon(H, W, curInput, meanChannel, curOutput);
#else
      runCPU(H, W, curInput, meanChannel, curOutput);
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
    }
  }

#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
  void runCPU(
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    int planeSize = H * W;

    for (int point = 0; point < planeSize; ++point) {
      for (int c = 0; c < kInputChannels; ++c) {
        uint8_t v = clamped_cast<uint8_t>(
            input[c * planeSize + point] + meanChannel[c]);
        output[point * kOutputChannels + c] = v;
      }

      // alpha
      output[point * kOutputChannels + (kOutputChannels - 1)] =
          std::numeric_limits<uint8_t>::max();
    }
  }
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  void runCPUNeon(
      int H,
      int W,
      const float* input,
      const float* meanChannel,
      uint8_t* output) {
    // Vectorized load parameters:

    // We load in chunks of this size
    constexpr int kLoadUnit = sizeof(float32x4_t);
    constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float));

    // We store in chunks of this size
    constexpr int kStoreUnit = sizeof(uint8x8x4_t);

    // The vector portion loads this many f32 pixels at a time (8)
    constexpr int kLoadPixels = 2 * kLoadFloats;

    float mean[kInputChannels] = {
        meanChannel[0], meanChannel[1], meanChannel[2]};
    int planeSize = H * W;

    // Vectorized portion
    int point = 0;

    // If the slice is not aligned, then we have to use the
    // un-vectorized version
    bool isAligned = isPointerAligned(input, kLoadUnit) &&
        isPointerAligned(output, kStoreUnit) &&
        // Because we are reading from input at offsets of planeSize,
        // planeSize has to be an even multiple of kLoadUnit
        (planeSize % kLoadUnit == 0);

    // What portion the vectorized loop will handle
    int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;

    for (; point < limit; point += kLoadPixels) {
      // Load 8 f32 pixels from each channel; loading 16 involves
      // register spills it seems
      float32x4_t inputc0_0 =
          vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc0_1 =
          vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);

      float32x4_t inputc1_0 =
          vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc1_1 =
          vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);

      float32x4_t inputc2_0 =
          vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
      float32x4_t inputc2_1 =
          vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);

      addMeanAndClamp(inputc0_0, mean[0]);
      addMeanAndClamp(inputc0_1, mean[0]);
      uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);

      addMeanAndClamp(inputc1_0, mean[1]);
      addMeanAndClamp(inputc1_1, mean[1]);
      uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);

      addMeanAndClamp(inputc2_0, mean[2]);
      addMeanAndClamp(inputc2_1, mean[2]);
      uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);

      // This is the alpha channel
      uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());

      // We now have 8 bytes of each channel in a separate vector
      // Write BGRA interleaved to output
      uint8x8x4_t u8_out = {{u8_c0, u8_c1, u8_c2, u8_c3}};
      vst4_u8_aligned(output + kOutputChannels * point, u8_out);
    }
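
    // Each iteration above writes 8 pixels x 4 bytes = 32 interleaved bytes,
    // i.e. exactly one kStoreUnit (sizeof(uint8x8x4_t)).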

    // Epilogue: non-vectorized remainder
    for (; point < planeSize; ++point) {
      for (int c = 0; c < kInputChannels; ++c) {
        uint8_t v =
            clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
        output[point * kOutputChannels + c] = v;
      }

      // alpha
      output[point * kOutputChannels + (kOutputChannels - 1)] =
          std::numeric_limits<uint8_t>::max();
    }
  }
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
};

namespace {

REGISTER_CPU_OPERATOR(
    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
    PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp);
OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
    .NumInputs(2)
    .NumOutputs(1);
REGISTER_CPU_OPERATOR(
    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
    BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
    .NumInputs(2)
    .NumOutputs(1);

#ifdef CAFFE2_USE_MKLDNN
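// IDEEPFallbackOp wraps a CPU operator so that it can run as an IDEEP
// operator; both stylizer ops fall back to the CPU implementations above
// when Caffe2 is built with MKL-DNN.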
REGISTER_IDEEP_OPERATOR(
    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
    IDEEPFallbackOp<BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp, SkipIndices<0>>);
REGISTER_IDEEP_OPERATOR(
    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
    IDEEPFallbackOp<PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp, SkipIndices<0>>);
#endif
} // namespace
} // namespace caffe2