Caffe2 - C++ API
A deep learning, cross-platform ML framework
utility_ops.cc
#include "caffe2/operators/utility_ops.h"

#include <cmath>

namespace caffe2 {

template <>
bool WeightedSumOp<CPUContext>::RunOnDevice() {
  return DoRunWithType<float>();
}

template <>
bool WeightedSumGradientOp<CPUContext>::RunOnDevice() {
  return DoRunWithType<float>();
}

template <>
template <typename T>
void UniqueOp<CPUContext>::DoRun() {
  auto& inputTensor = Input(0);
  // use dim32 to enforce that it's fine to have remapping of type int
  int N = inputTensor.dim32(0);
  CAFFE_ENFORCE_EQ(inputTensor.ndim(), 1, "Input should be a vector");
  auto* uniqueTensor = Output(UNIQUE);

  int* remapping = nullptr;
  if (REMAPPING < OutputSize()) {
    auto* remappingTensor = Output(REMAPPING);
    remappingTensor->ResizeLike(inputTensor);
    remapping = remappingTensor->template mutable_data<int>();
  }

  const T* input = inputTensor.template data<T>();
  // TODO(dzhulgakov): if perf becomes an issue consider doing hash table
  // instead of sorting
  order_.resize(N);
  std::iota(order_.begin(), order_.end(), 0);
  std::sort(order_.begin(), order_.end(), [input](const int x, const int y) {
    return input[x] < input[y];
  });
  int K = N;
  for (int i = 1; i < N; ++i) {
    K -= input[order_[i]] == input[order_[i - 1]];
  }
  uniqueTensor->Resize(K);
  T* unique = uniqueTensor->template mutable_data<T>();
  K = 0;
  T prev = -1;
  for (int i = 0; i < N; ++i) {
    if (i == 0 || prev != input[order_[i]]) {
      prev = unique[K++] = input[order_[i]];
    }
    if (remapping) {
      remapping[order_[i]] = K - 1;
    }
  }
}

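// Illustrative sketch (not part of the original source): assuming an int32
// input, the sort-based dedup above yields the unique values in ascending
// order plus an optional remapping back into them, e.g.:
//
//   indices        = [3, 1, 3, 2]
//   unique_indices = [1, 2, 3]
//   remapping      = [2, 0, 2, 1]  // Gather(unique_indices, remapping) == indices
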
REGISTER_CPU_OPERATOR(WallClockTime, WallClockTimeOp<CPUContext>);
REGISTER_CPU_OPERATOR(Print, PrintOp<CPUContext>);
REGISTER_CPU_OPERATOR(FlattenToVec, FlattenToVecOp<CPUContext>);
REGISTER_CPU_OPERATOR(Alias, AliasOp<CPUContext>);
REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
REGISTER_CPU_OPERATOR(WeightedSumGradient, WeightedSumGradientOp<CPUContext>);
REGISTER_CPU_OPERATOR(
    ScatterWeightedSum,
    ScatterWeightedSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
// From whatever the current context is, ensure that the output is a TensorCPU.
REGISTER_CPU_OPERATOR(
    EnsureCPUOutput,
    CopyOp<CPUContext, CPUContext, CPUContext>);
// From a CPU input, copy to whatever the current context is.
REGISTER_CPU_OPERATOR(
    CopyFromCPUInput,
    CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(
    CopyOnDeviceLike,
    CopyOnDeviceLikeOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp<CPUContext>);
REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
REGISTER_CPU_OPERATOR(Gather, GatherOp<CPUContext>);
REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsGather, LengthsGatherOp<CPUContext>);
REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
REGISTER_CPU_OPERATOR(SegmentIdsToRanges, SegmentIdsToRangesOp<CPUContext>);
REGISTER_CPU_OPERATOR(LengthsToWeights, LengthsToWeightsOp<CPUContext>);
REGISTER_CPU_OPERATOR(EnsureDense, EnsureDenseOp<CPUContext>);
REGISTER_CPU_OPERATOR(
    AccumulateHistogram,
    AccumulateHistogramOp<float, CPUContext>);

OPERATOR_SCHEMA(WallClockTime)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc("Time since epoch in nanoseconds.")
    .Output(0, "time", "The time in nanoseconds.");

REGISTER_CPU_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp<CPUContext>);

OPERATOR_SCHEMA(Print)
    .NumInputs(1)
    .NumOutputs(0)
    .SetDoc("Logs shape and contents of input tensor to stderr or to a file.")
    .Arg(
        "to_file",
        "(bool) if 1, saves contents to the root folder of the current "
        "workspace, appending the tensor contents to a file named after "
        "the blob name. Otherwise, logs to stderr.")
    .Input(0, "tensor", "The tensor to print.");

OPERATOR_SCHEMA(LengthsToShape).NumInputs(1).NumOutputs(1);

OPERATOR_SCHEMA(FlattenToVec)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out(1);
      int total = 1;
      for (auto d : in[0].dims()) {
        total *= d;
      }
      out[0].set_data_type(in[0].data_type());
      out[0].add_dims(total);
      return out;
    })
    .SetDoc(R"DOC(
Flattens the input tensor into a 1D vector.
)DOC")
    .Input(0, "input", "A tensor of rank >= 1.")
    .Output(
        0,
        "output",
        "A tensor of rank 1 with the contents of the input tensor");

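// Illustrative example (not part of the original source), assuming a float
// input:
//
//   input  : shape (2, 3) = [[1, 2, 3], [4, 5, 6]]
//   output : shape (6,)   = [1, 2, 3, 4, 5, 6]
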
OPERATOR_SCHEMA(Alias)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(
Makes the output and the input share the same underlying storage.

WARNING: in general, in caffe2's operator interface, different tensors should
have different underlying storage, which is the assumption made by
components such as the dependency engine and memory optimization. Thus, in
normal situations you should not use the AliasOp, especially in a normal
forward-backward pass.

The Alias op is provided so one can achieve true asynchrony, such as
Hogwild, in a graph. But make sure you understand all the implications,
similar to those of multi-threaded computation, before you use it explicitly.
)DOC")
    .Input(0, "input", "Input tensor whose storage will be shared.")
    .Output(0, "output", "Tensor of same shape as input, sharing its storage.");

OPERATOR_SCHEMA(ResizeLike)
    .NumInputs(2)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      // The output takes the shape of the second input and the data type of
      // the first.
      vector<TensorShape> out(1);
      out[0] = in[1];
      out[0].set_data_type(in[0].data_type());
      return out;
    })
    .SetDoc(R"DOC(
Produces a tensor containing the data of the first input and the shape of the
second input.
)DOC")
    .Input(0, "data", "Tensor whose data will be copied into the output.")
    .Input(1, "shape_tensor", "Tensor whose shape will be applied to output.")
    .Output(0, "output", "Tensor with data of input 0 and shape of input 1.");

OPERATOR_SCHEMA(SumInt)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .InputsCanCrossDevices()
    .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                const vector<TensorShape>& in) {
      // The output has the shape of the first input and an int32 data type.
      vector<TensorShape> out(1);
      out[0] = in[0];
      out[0].set_data_type(TensorProto::INT32);
      return out;
    })
    .AllowInplace({{0, 0}});

OPERATOR_SCHEMA(WeightedSum)
    .NumInputs([](int n) { return (n > 0 && n % 2 == 0); })
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(
Element-wise weighted sum of several data, weight tensor pairs.
Input should be in the form X_0, weight_0, X_1, weight_1, ... where the X_i all
have the same shape and the weight_i are size-1 tensors that specify the weight
of each tensor. Note that if one wants to do in-place computation, it can only
be done with X_0 as the output, not any other X_i.
)DOC")
    .Input(0, "data_0", "First of the input tensors.")
    .Input(1, "weight_0", "Weight of the first input in the sum.")
    .Output(0, "output", "Result containing weighted elem-wise sum of inputs.");

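// Illustrative example (not part of the original source), assuming float
// inputs:
//
//   X_0 = [1, 2], weight_0 = [2.0]
//   X_1 = [3, 4], weight_1 = [0.5]
//   output = 2.0 * X_0 + 0.5 * X_1 = [3.5, 6.0]
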
OPERATOR_SCHEMA(WeightedSumGradient)
    .NumInputs([](int n) { return (n > 0 && n % 2 == 1); })
    .NumOutputs(1, INT_MAX);

OPERATOR_SCHEMA(ScatterWeightedSum)
    .NumInputs([](int n) { return (n > 3 && (n - 3) % 2 == 0); })
    .NumOutputs(1)
    .EnforceInplace({{0, 0}})
    .SetDoc(R"DOC(
Similar to WeightedSum, computes the weighted sum of several tensors, with
the difference that the inputs are sliced tensors. The first tensor has to be
in-place, and only the slices of it on the first dimension indexed by INDICES
will be updated.

Note: The op largely ignores the exact shapes of the input arguments and
cares only about sizes. This is done for performance reasons, to avoid
unnecessary reshapes. Only the first dimension of X_0 is important; let's call
it N. If M is the total size of X_0 and K is the size of INDICES, then X_i is
assumed to be of shape K x (M / N) regardless of the real shape.

Note: Each update in INDICES is applied independently, which means that if
duplicated elements are present in INDICES, the corresponding slice of X_0
will be scaled multiple times. Manual collapsing of INDICES is required
beforehand if necessary.

Note: Updates are applied sequentially by inputs, which might have undesired
consequences if the input tensor is accessed concurrently by a different op
(e.g. when doing Hogwild). Other threads might see intermediate results even
at the individual-slice level, e.g. X_0 scaled by weight_0 but without any
updates applied.

Currently only works on CPU because of access to INDICES.
)DOC")
    .Input(0, "X_0", "Tensor to be updated.")
    .Input(
        1,
        "Weight_0",
        "Scalar weight for X_0, applied only to slices affected.")
    .Input(
        2,
        "INDICES",
        "1-D list of indices on the first dimension of X_0 "
        "that need to be updated")
    .Input(3, "X_1", "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
    .Input(4, "Weight_1", "Scalar weight for X_1 update")
    .Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
    .EnforceInplace({{0, 0}});

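// Illustrative example (not part of the original source): with
//
//   X_0 = [[1, 1], [2, 2], [3, 3]], Weight_0 = [1.0]
//   INDICES = [0, 2]
//   X_1 = [[10, 10], [20, 20]], Weight_1 = [0.1]
//
// slice 0 becomes 1.0 * [1, 1] + 0.1 * [10, 10] = [2, 2], slice 2 becomes
// 1.0 * [3, 3] + 0.1 * [20, 20] = [5, 5], and slice 1 is left untouched, so
// X_0 ends up as [[2, 2], [2, 2], [5, 5]].
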
OPERATOR_SCHEMA(ScatterAssign)
    .NumInputs(3)
    .NumOutputs(1)
    .EnforceInplace({{0, 0}})
    .SetDoc(R"DOC(
Update slices of the tensor in-place by overriding the current value.

Note: The op largely ignores the exact shapes of the input arguments and
cares only about sizes. This is done for performance reasons, to avoid
unnecessary reshapes. Only the first dimension of DATA is important; let's
call it N. If M is the total size of DATA and K is the size of INDICES, then
SLICES is assumed to be of shape K x (M / N) regardless of the real shape.

Note: Each update in INDICES is applied independently, which means that if
duplicated elements are present in INDICES, an arbitrary one will win.

Currently only works on CPU because of access to INDICES.
)DOC")
    .Input(0, "DATA", "Tensor to be updated.")
    .Input(
        1,
        "INDICES",
        "1-D list of indices on the first dimension "
        "of DATA that need to be updated")
    .Input(
        2,
        "SLICES",
        "Update slices, with shape len(INDICES) + shape(DATA)[1:]")
    .Output(0, "DATA", "Has to be exactly the same tensor as the input 0");

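// Illustrative example (not part of the original source): with
//
//   DATA = [[1, 1], [2, 2], [3, 3]]
//   INDICES = [0, 2]
//   SLICES = [[7, 7], [9, 9]]
//
// rows 0 and 2 are overwritten, so DATA ends up as [[7, 7], [2, 2], [9, 9]].
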
OPERATOR_SCHEMA(Copy)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .SetDoc("Copy input tensor into output, potentially across devices.")
    .Input(0, "input", "The input tensor.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");

OPERATOR_SCHEMA(CopyGPUToCPU)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      CAFFE_ENFORCE(
          def.has_device_option(),
          "CopyGPUToCPU op should have cuda device option.");
      auto& cuda_option = def.device_option();
      auto cpu_option = DeviceOption();
      vector<DeviceOption> in_dev(def.input_size(), cuda_option);
      vector<DeviceOption> out_dev(def.output_size(), cpu_option);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Copy tensor from GPU to CPU context. Must be run under GPU device option.
)DOC")
    .Input(0, "input", "The input tensor.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");

OPERATOR_SCHEMA(CopyCPUToGPU)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      CAFFE_ENFORCE(
          def.has_device_option(),
          "CopyCPUToGPU op should have cuda device option.");
      auto& cuda_option = def.device_option();
      auto cpu_option = DeviceOption();
      vector<DeviceOption> in_dev(def.input_size(), cpu_option);
      vector<DeviceOption> out_dev(def.output_size(), cuda_option);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Copy tensor from CPU to GPU context. Must be run under GPU device option.
)DOC")
    .Input(0, "input", "The input tensor.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");

OPERATOR_SCHEMA(EnsureCPUOutput)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      auto op_device =
          def.has_device_option() ? def.device_option() : DeviceOption();
      auto cpu_option = DeviceOption();
      vector<DeviceOption> in_dev(def.input_size(), op_device);
      vector<DeviceOption> out_dev(def.output_size(), cpu_option);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Take an input tensor in the current Context (GPU or CPU) and create an output
which is always a TensorCPU. This may involve a cross-device MemCpy.
)DOC")
    .Input(0, "input", "The input CUDA or CPU tensor.")
    .Output(0, "output", "TensorCPU that is a copy of the input.");

OPERATOR_SCHEMA(CopyFromCPUInput)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .InputsCanCrossDevices()
    .DeviceInferenceFunction([](const OperatorDef& def) {
      auto op_device =
          def.has_device_option() ? def.device_option() : DeviceOption();
      auto cpu_option = DeviceOption();
      vector<DeviceOption> in_dev(def.input_size(), cpu_option);
      vector<DeviceOption> out_dev(def.output_size(), op_device);
      return std::make_pair(in_dev, out_dev);
    })
    .SetDoc(R"DOC(
Take a CPU input tensor and copy it to an output in the current
Context (GPU or CPU). This may involve a cross-device MemCpy.
)DOC")
    .Input(0, "input", "The input CPU tensor.")
    .Output(0, "output", "Either a TensorCUDA or a TensorCPU.");

OPERATOR_SCHEMA(CopyOnDeviceLike)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc("Copy input tensor into output on the device given by the second input.")
    .Input(0, "input", "The input tensor.")
    .Input(1, "dst", "Tensor whose device determines where the copy is performed.")
    .Output(0, "output", "Tensor that will contain a copy of the input.");

OPERATOR_SCHEMA(HasElements)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc("Returns true iff the input tensor has size > 0")
    .Input(0, "tensor", "Tensor of any type.")
    .Output(
        0,
        "has_elements",
        "Scalar bool tensor. True if input is not empty.");

OPERATOR_SCHEMA(IsEmpty)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc("Returns true iff the input tensor has size == 0")
    .ScalarType(::caffe2::TensorProto_DataType::TensorProto_DataType_BOOL)
    .Input(0, "tensor", "Tensor of any type.")
    .Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");

OPERATOR_SCHEMA(Gather)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given DATA tensor of rank r >= 1, and INDICES tensor of rank q, gather
entries of the outer-most dimension of DATA indexed by INDICES, and concatenate
them in an output tensor of rank q + (r - 1).

Example:
  DATA = [
      [1.0, 1.2],
      [2.3, 3.4],
      [4.5, 5.7],
  ]
  INDICES = [
      [0, 1],
      [1, 2],
  ]
  OUTPUT = [
      [
          [1.0, 1.2],
          [2.3, 3.4],
      ],
      [
          [2.3, 3.4],
          [4.5, 5.7],
      ],
  ]
)DOC")
    .Input(0, "DATA", "Tensor of rank r >= 1.")
    .Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
    .Output(0, "OUTPUT", "Tensor of rank q + (r - 1).")
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out(1);
      for (auto d : in[1].dims()) {
        out[0].add_dims(d);
      }
      for (int i = 1; i < in[0].dims_size(); ++i) {
        out[0].add_dims(in[0].dims(i));
      }
      out[0].set_data_type(in[0].data_type());
      return out;
    });

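// Illustrative note (not part of the original source): the shape inference
// above concatenates the INDICES dims with the trailing DATA dims, so for the
// example in the doc string, DATA of shape (3, 2) gathered with INDICES of
// shape (2, 2) yields OUTPUT of shape (2, 2, 2).
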
OPERATOR_SCHEMA(GatherRanges)
    .NumInputs(2)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Given DATA tensor of rank 1, and RANGES tensor of rank 3, gather the
corresponding ranges into a 1-D tensor OUTPUT.

RANGES dimensions description:
1: represents the list of examples within a batch
2: represents the list of features
3: two values, which are the start and length of a range (to be applied on DATA)

Another output, LENGTHS, holds the length of each example within OUTPUT.

Example:
  DATA = [1, 2, 3, 4, 5, 6]
  RANGES = [
      [
          [0, 1],
          [2, 2],
      ],
      [
          [4, 1],
          [5, 1],
      ]
  ]
  OUTPUT = [1, 3, 4, 5, 6]
  LENGTHS = [3, 2]
)DOC")
    .Input(0, "DATA", "Tensor of rank 1.")
    .Input(
        1,
        "RANGES",
        "Tensor of int32/int64 ranges, of dims (N, M, 2). "
        "Where N is the number of examples and M is the size of each example. "
        "The last dimension represents a range in the format (start, length).")
    .Output(0, "OUTPUT", "1-D tensor of size sum of range lengths")
    .Output(
        1,
        "LENGTHS",
        "1-D tensor of size N with lengths over gathered data"
        " for each row in a batch. sum(LENGTHS) == OUTPUT.size()");

OPERATOR_SCHEMA(LengthsGather)
    .NumInputs(3)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Gather items from a sparse tensor. The sparse tensor is described by ITEMS and
LENGTHS. This operator gathers the items corresponding to the lengths at the
given indices. It deliberately doesn't return lengths for OUTPUT so that both
lists and maps can be supported without special cases. If you need a lengths
tensor for OUTPUT, use `Gather`.

Example:
  ITEMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
  LENGTHS = [0, 2, 3, 1, 4]
  INDICES = [0, 2, 4]

  OUTPUT = [2, 3, 4, 6, 7, 8, 9]
)DOC")
    .Input(0, "ITEMS", "items tensor")
    .Input(1, "LENGTHS", "lengths tensor")
    .Input(2, "INDICES", "indices into LENGTHS where items should be gathered")
    .Output(0, "OUTPUT", "1-D tensor containing gathered items");

OPERATOR_SCHEMA(Unique)
    .NumInputs(1)
    .NumOutputs(1, 2)
    .SetDoc(R"DOC(
Deduplicates the input indices vector and optionally produces a reverse
remapping. There are no guarantees on the ordering of the output indices.
)DOC")
    .Input(0, "indices", "1D tensor of int32 or int64 indices.")
    .Output(0, "unique_indices", "1D tensor of deduped entries.")
    .Output(
        1,
        "remapping",
        "(optional) mapping from `indices` to `unique_indices`. This has the "
        "same shape as `indices`. Its elements are the indices into "
        "`unique_indices` such that `Gather(['unique_indices', 'remapping'])` "
        "yields `indices`.")
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      std::vector<TensorShape> out(1);
      out[0].set_data_type(in[0].data_type());
      CAFFE_ENFORCE_EQ(in[0].dims_size(), 1);
      if (in[0].dims(0) <= 1) {
        // This special case is useful in some situations, e.g., when feeding
        // tensor inference with an empty tensor (where the first dim is the
        // batch size).
        out[0].add_dims(in[0].dims(0));
      } else {
        out[0].set_unknown_shape(true);
      }
      if (def.output_size() > 1) {
        // Remapping has the same shape as the input tensor.
        out.push_back(in[0]);
        out.back().set_data_type(TensorProto::INT32);
      }
      return out;
    });

OPERATOR_SCHEMA(LengthsToSegmentIds)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a vector of segment lengths, returns a zero-based, consecutive vector
of segment_ids. For example, [1, 3, 0, 2] will produce [0, 1, 1, 1, 3, 3].
In general, the inverse operation is SegmentIdsToLengths. Notice though that
trailing empty segments can't be properly recovered from segment ids.
)DOC")
    .Input(0, "lengths", "1D tensor of int32 or int64 segment lengths.")
    .Output(0, "segment_ids", "1D tensor of length `sum(lengths)`");

OPERATOR_SCHEMA(LengthsToRanges)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given a vector of segment lengths, calculates offsets of each segment and packs
them next to the lengths. For an input vector of length N the output is an N x 2
matrix with (offset, length) pairs for each segment.

For example, `[1, 3, 0, 2]` transforms into `[[0, 1], [1, 3], [4, 0], [4, 2]]`.
)DOC")
    .Input(0, "lengths", "1D tensor of int32 segment lengths.")
    .Output(
        0,
        "ranges",
        "2D tensor of shape len(lengths) X 2 and the same type as `lengths`");

OPERATOR_SCHEMA(SegmentIdsToLengths)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Transforms a vector of segment ids into a vector of segment lengths. This
operation supports non-consecutive segment ids. Segments not appearing in the
input vector will have length 0. If the second input is provided, the number of
segments = the size of its first dimension. Otherwise, the number of segments =
the last index in the first input vector + 1.

In general, for consecutive, zero-based segment IDs, this is the inverse
operation of LengthsToSegmentIds, except that a vector of segment IDs
cannot represent empty segments at the end (if the second input is absent).
)DOC")
    .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
    .Input(
        1,
        "data (optional)",
        "if provided, number of segments = the size of its first dimension")
    .Output(0, "lengths", "1-D int64_t tensor of segment lengths");

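// Illustrative example (not part of the original source): with only the first
// input given,
//
//   segment_ids = [0, 0, 2, 2, 2]
//   lengths     = [2, 0, 3]   // 3 segments = last id + 1; segment 1 is empty
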
OPERATOR_SCHEMA(SegmentIdsToRanges)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Transforms a vector of segment ids into a vector of segment ranges. This
operation supports non-consecutive segment ids. Segments not appearing in the
input vector will have length 0. If the second input is provided, the number of
segments = the size of its first dimension. Otherwise, the number of segments =
the last index in the first input vector + 1.
)DOC")
    .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
    .Input(
        1,
        "data (optional)",
        "if provided, number of segments = the size of its first dimension")
    .Output(0, "ranges", "tensor of segment ranges");

OPERATOR_SCHEMA(LengthsToWeights)
    .NumInputs(1)
    .NumOutputs(1)
    .Arg("power", "n of 1/pow(length, n) for normalization")
    .SetDoc(R"DOC(
Similar to LengthsToSegmentIds, but outputs a vector of segment weights derived
from the lengths, i.e. 1/pow(length, power).
)DOC")
    .Input(0, "lengths", "1-D int32_t or int64_t tensor of lengths")
    .Output(0, "a vector of weights", "1-D float tensor of weights by length");

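// Illustrative sketch (not part of the original source), assuming the output
// is expanded per element in the same way as LengthsToSegmentIds:
//
//   lengths = [1, 4], power = 0.5
//   weights = [1.0, 0.5, 0.5, 0.5, 0.5]   // 1/sqrt(1), then 1/sqrt(4) four times
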
SHOULD_NOT_DO_GRADIENT(WallClockTime);

OPERATOR_SCHEMA(UnsafeCoalesce)
    .NumInputsOutputs([](int inputs, int outputs) {
      return inputs + 1 == outputs;
    })
    .AllowInplace([](int input, int output) { return input == output; })
    .SetDoc(R"DOC(
Coalesce the N inputs into N outputs and a single coalesced output blob.

This allows operations that operate over multiple small kernels (e.g.
biases in a deep CNN) to be coalesced into a single larger operation,
amortizing the kernel launch overhead, synchronization costs for
distributed computation, etc.

The operator:

- computes the total size of the coalesced blob by summing the input sizes
- allocates the coalesced output blob as the total size
- copies the inputs into the coalesced blob, at the correct offsets
- aliases each Output(i) to point into the coalesced blob, at the corresponding offset for Input(i)

This is 'unsafe' as the output vectors are aliased, so use with
caution.
)DOC");

OPERATOR_SCHEMA(EnsureDense)
    .NumInputs(1)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(
This operator converts dense or sparse gradients to dense ones. Therefore,
sparse gradients can be back-propagated to operators that consume dense
gradients only (e.g., FCGradient).

The operator's behavior:

- In the forward pass, simply pass the input through (in place) or copy it to the output.
- In the backward pass, if the incoming gradient is sparse, convert it to a dense gradient in linear time; otherwise, simply pass the dense gradient through.
)DOC")
    .Input(0, "input", "Input tensor.")
    .Output(0, "output", "Output tensor. Same dimensions as the input.");

OPERATOR_SCHEMA(AccumulateHistogram)
    .NumInputs(1)
    .NumOutputs(2)
    .SetDoc(R"DOC(
This operator calculates the histogram of values in the input tensor.
There are 2 outputs, one for the histogram of the current input tensor, and
another for the histogram of all input tensors accumulated through history.
Each output contains num_buckets + 2 values: indices [1, num_buckets] count
values in the [lower_bound, upper_bound) interval, and the remaining 2 count
values smaller than lower_bound or greater than upper_bound, respectively.
)DOC")
    .Input(0, "X", "Input tensor.")
    .Output(0, "CurHist", "Output histogram of the current tensor.")
    .Output(1, "AccHist", "Accumulated histogram of the history tensor.")
    .Arg("lower_bound", "the lower bound value")
    .Arg("upper_bound", "the upper bound value")
    .Arg(
        "num_buckets",
        "number of buckets to use in [lower_bound, upper_bound)");

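// Illustrative sketch (not part of the original source), assuming
// lower_bound = 0, upper_bound = 10, num_buckets = 2 (uniform bucket width 5):
//
//   X = [-1, 2, 7, 11]
//   CurHist = [1, 1, 1, 1]   // [below 0, in [0, 5), in [5, 10), above 10]
//
// AccHist additionally carries the running sum over all previous calls.
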
class GetEnsureDenseGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    CAFFE_ENFORCE(
        GradOut(0).IsSparse() || GradOut(0).IsDense(),
        "Input gradient ",
        O(0),
        " should be either sparse or dense.");

    if (GradOut(0).IsDense()) {
      SetDense(0, GO(0));
      return vector<OperatorDef>();
    } else {
      return SingleGradientDef(
          "SparseToDense",
          "",
          vector<string>{GO_I(0), GO_V(0), I(0)},
          vector<string>{GI(0)});
    }
  }
};
REGISTER_GRADIENT(EnsureDense, GetEnsureDenseGradient);

SHOULD_NOT_DO_GRADIENT(Print);
SHOULD_NOT_DO_GRADIENT(HasElements);
SHOULD_NOT_DO_GRADIENT(IsEmpty);
SHOULD_NOT_DO_GRADIENT(LengthsToShape);
SHOULD_NOT_DO_GRADIENT(UnsafeCoalesce);

class GetAliasGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // We will simply pass along the gradient. Nothing needs to
    // be calculated.
    SetDense(0, GO(0));
    return vector<OperatorDef>();
  }
};
REGISTER_GRADIENT(Alias, GetAliasGradient);

SHOULD_NOT_DO_GRADIENT(ResizeLike);

class GetSumGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    for (auto i = 0; i < def_.input_size(); ++i) {
      SetDense(i, GO(0));
    }
    return vector<OperatorDef>();
  }
};
REGISTER_GRADIENT(Sum, GetSumGradient);

SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
SHOULD_NOT_DO_GRADIENT(ScatterAssign);

class GetWeightedSumGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    ArgumentHelper argsHelper(def_);
    const bool grad_on_w = argsHelper.GetSingleArgument<bool>("grad_on_w", 0);

    auto inputs = vector<string>{GO(0)};
    auto outputs = vector<string>();
    for (int i = 0; i < def_.input_size(); i += 2) {
      inputs.push_back(I(i));
      inputs.push_back(I(i + 1));
      outputs.push_back(GI(i));
    }

    if (grad_on_w) {
      for (int i = 0; i < def_.input_size(); i += 2) {
        outputs.push_back(GI(i + 1));
      }
    }

    return SingleGradientDef("WeightedSumGradient", "", inputs, outputs);
  }
};
REGISTER_GRADIENT(WeightedSum, GetWeightedSumGradient);

class GetGatherGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    ArgumentHelper argsHelper(def_);
    const bool dense_gradient =
        argsHelper.GetSingleArgument<bool>("dense_gradient", false);

    using Op = GatherOp<CPUContext>;

    if (dense_gradient) {
      return vector<OperatorDef>{CreateOperatorDef(
          "SparseToDense",
          "",
          vector<string>{I(Op::INDICES), GO(0), I(Op::DATA)},
          vector<string>{GI(Op::DATA)})};
    } else {
      // For now we don't do any reshaping as the consumer of this op would
      // probably be ScatterUpdate which intentionally ignores shapes. We
      // might need to revisit it in the future for correctness purposes. The
      // right shape for the output would be to flatten INDICES and collapse
      // the first X dims of GRAD.
      SetSparse(Op::DATA, I(Op::INDICES), GO(0));
      return vector<OperatorDef>();
    }
  }
};
REGISTER_GRADIENT(Gather, GetGatherGradient);

struct GetFlattenToVecGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "ResizeLike", "", vector<string>{GO(0), I(0)}, vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient);

struct GetCopyGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "CopyOnDeviceLike",
        "",
        vector<string>{GO(0), I(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(Copy, GetCopyGradient);

struct GetGPUToCPUGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (g_output_[0].IsDense()) {
      return SingleGradientDef(
          "CopyCPUToGPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
    } else {
      return vector<OperatorDef>{CreateOperatorDef(
                                     "CopyCPUToGPU",
                                     "",
                                     std::vector<string>{GO_I(0)},
                                     std::vector<string>{GI_I(0)}),
                                 CreateOperatorDef(
                                     "CopyCPUToGPU",
                                     "",
                                     std::vector<string>{GO_V(0)},
                                     std::vector<string>{GI_V(0)})};
    }
  }
};
REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient);

struct GetCPUToGPUGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (g_output_[0].IsDense()) {
      return SingleGradientDef(
          "CopyGPUToCPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
    } else {
      return vector<OperatorDef>{CreateOperatorDef(
                                     "CopyGPUToCPU",
                                     "",
                                     std::vector<string>{GO_I(0)},
                                     std::vector<string>{GI_I(0)}),
                                 CreateOperatorDef(
                                     "CopyGPUToCPU",
                                     "",
                                     std::vector<string>{GO_V(0)},
                                     std::vector<string>{GI_V(0)})};
    }
  }
};
REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);

SHOULD_NOT_DO_GRADIENT(Unique);
SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges);
SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengthWeights);
SHOULD_NOT_DO_GRADIENT(GatherRangesOp);
SHOULD_NOT_DO_GRADIENT(LengthsGather);
SHOULD_NOT_DO_GRADIENT(AccumulateHistogram);

template <>
bool NanCheckOp<CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  const int D = X.size();
  const float* data = X.data<float>();
  ConstEigenVectorMap<float> input_data(data, D);

  bool all_finite = input_data.allFinite();

  if (!all_finite) {
    std::cerr << "Tensor contained NaN or inf: [" << this->debug_def().input(0)
              << "]" << std::endl;

    for (int j = 0; j < InputSize(); j++) {
      std::cerr << "Tensor name: " << this->debug_def().input(j) << std::endl;
      std::cerr << "Input tensor:" << std::endl;
      tensorPrinter_.Print<float>(Input(j));
      std::cerr << "NaN idxs:" << std::endl;
      const float* x = Input(j).data<float>();
      for (size_t i = 0; i < Input(j).size(); ++i) {
        if (std::isnan(x[i]) || std::isinf(x[i])) {
          std::cerr << i << " ";
        }
      }
      std::cerr << std::endl;
    }
    return false;
  }

  if (&X != Y) {
    Y->CopyFrom(X, &context_);
  }
  return true;
}
REGISTER_CPU_OPERATOR(NanCheck, NanCheckOp<CPUContext>);
REGISTER_GRADIENT(NanCheck, GetNanCheckGradient);

OPERATOR_SCHEMA(NanCheck)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc("Identity operator, but checks all values for nan or inf")
    .Input(0, "tensor", "Tensor to check for nan/inf")
    .Output(
        0,
        "output",
        "Tensor to copy input into if no NaNs or inf."
        " Can be in-place");

OPERATOR_SCHEMA(Size)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(
        "Return a 1D tensor of type int64 that contains the number "
        "of elements of the input tensor")
    .Input(0, "tensor", "Tensor to calculate number of elements")
    .Output(
        0,
        "output",
        "1D tensor of type int64 that contains the number of "
        "elements in the input tensor.");

REGISTER_CPU_OPERATOR(Size, SizeOp<CPUContext>);
NO_GRADIENT(Size);

template <>
template <typename T>
bool RangeOp<CPUContext>::DoRunOnDevice(
    const T& start,
    const T& step,
    Tensor<CPUContext>* output) {
  auto* output_data = output->template mutable_data<T>();
  for (int i = 0; i < output->size(); ++i) {
    output_data[i] = i * step + start;
  }
  return true;
}

OPERATOR_SCHEMA(Range)
    .NumInputs(1, 3)
    .NumOutputs(1)
    .SetDoc(
        "Values are generated within the half-open interval [start, stop) "
        "(in other words, the interval including start but excluding stop). "
        "When called with a single value, this will return `[0, v)` with the "
        "result type inferred from the input types.")
    .Input(
        0,
        "start",
        "Optional scalar Tensor with the start of the interval (inclusive).")
    .Input(1, "stop", "scalar Tensor with the end of the interval (exclusive)")
    .Input(2, "step", "Optional scalar Tensor with spacing between values.")
    .Output(
        0,
        "output",
        "1D tensor of same type as inputs that contains the sequence.");

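// Illustrative example (not part of the original source): with start = 2,
// stop = 8, step = 2, the CPU kernel above fills
// output_data[i] = i * step + start, giving output = [2, 4, 6].
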
REGISTER_CPU_OPERATOR(Range, RangeOp<CPUContext>);
NO_GRADIENT(Range);

} // namespace caffe2
Copyright (c) 2016-present, Facebook, Inc.