Caffe2 - C++ API
A deep learning, cross-platform ML framework
distance_op.cc
1 #include "caffe2/operators/distance_op.h"
2 #include "caffe2/utils/eigen_utils.h"
3 #ifdef CAFFE2_USE_MKLDNN
4 #include <caffe2/ideep/operators/operator_fallback_ideep.h>
5 #include <caffe2/ideep/utils/ideep_operator.h>
6 #endif
7 
8 namespace caffe2 {
9 
10 template<>
11 bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
12  auto& X = Input(0);
13  auto& Y = Input(1);
14 
15  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
16  for (int i = 0; i < X.dim(); ++i) {
17  CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
18  }
19  int N = X.dim() > 0 ? X.dim32(0) : 1;
20  auto* distance = Output(0, {N}, at::dtype<float>());
21  int D = N > 0 ? X.numel() / N : 0;
22  float* distance_data = distance->template mutable_data<float>();
23  const float* X_data = X.data<float>();
24  const float* Y_data = Y.data<float>();
25  for (int i = 0; i < N; ++i) {
26  float Xscale, Yscale, cross;
27  math::Dot<float, CPUContext>(
28  D, X_data + i * D, X_data + i * D, &Xscale, &context_);
29  math::Dot<float, CPUContext>(
30  D, Y_data + i * D, Y_data + i * D, &Yscale, &context_);
31  math::Dot<float, CPUContext>(
32  D, X_data + i * D, Y_data + i * D, &cross, &context_);
33  distance_data[i] = (Xscale + Yscale) * 0.5 - cross; // = 0.5 * ||x - y||^2 for row i
34  }
35  return true;
36 }
37 
38 template <>
39 bool L1DistanceOp<float, CPUContext>::RunOnDevice() {
40  auto& X = Input(0);
41  auto& Y = Input(1);
42 
43  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
44  for (int i = 0; i < X.dim(); ++i) {
45  CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
46  }
47  int N = X.dim() > 0 ? X.dim32(0) : 1;
48  auto* distance = Output(0, {N}, at::dtype<float>());
49  int D = N > 0 ? X.numel() / N : 0;
50 
51  const float* X_data = X.data<float>();
52  const float* Y_data = Y.data<float>();
53 
54  for (int i = 0; i < N; ++i) {
55  (distance->template mutable_data<float>())[i] =
56  (ConstEigenVectorMap<float>(X_data + i * D, D).array() -
57  ConstEigenVectorMap<float>(Y_data + i * D, D).array())
58  .abs()
59  .sum();
60  }
61  return true;
62 }
63 
64 template <>
65 bool L1DistanceGradientOp<float, CPUContext>::RunOnDevice() {
66  auto& X = Input(0);
67  auto& Y = Input(1);
68  auto& dDistance = Input(2);
69 
70  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
71  for (int i = 0; i < X.dim(); ++i) {
72  CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
73  }
74  int N = X.dim() > 0 ? X.dim32(0) : 1;
75  int D = N > 0 ? X.numel() / N : 0;
76  CAFFE_ENFORCE(X.dim() == Y.dim());
77  for (int i = 0; i < X.dim(); ++i) {
78  CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
79  }
80  CAFFE_ENFORCE(dDistance.dim() == 1);
81  CAFFE_ENFORCE(dDistance.dim32(0) == N);
82  auto* dX = Output(0, X.sizes(), at::dtype<float>());
83  auto* dY = Output(1, Y.sizes(), at::dtype<float>());
84 
85  for (int i = 0; i < N; ++i) {
86  auto offset = i * D;
87  for (int j = 0; j < D; ++j) {
88  const float temp =
89  (X.data<float>())[offset + j] - (Y.data<float>())[offset + j];
90  const float kEps = 1e-12f; // d|x - y|/dx = sign(x - y); |x - y| <= kEps is treated as zero
91  if (temp < -kEps) {
92  dX->template mutable_data<float>()[offset + j] =
93  -(dDistance.data<float>())[i];
94  dY->template mutable_data<float>()[offset + j] =
95  (dDistance.data<float>())[i];
96  } else if (temp > kEps) {
97  dX->template mutable_data<float>()[offset + j] =
98  (dDistance.data<float>())[i];
99  dY->template mutable_data<float>()[offset + j] =
100  -(dDistance.data<float>())[i];
101  } else {
102  dX->template mutable_data<float>()[offset + j] = 0;
103  dY->template mutable_data<float>()[offset + j] = 0;
104  }
105  }
106  }
107  return true;
108 }
109 
110 template <>
111 bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
112  auto& X = Input(X_IN);
113  auto& Y = Input(Y_IN);
114 
115  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
116  for (int i = 0; i < X.dim(); ++i) {
117  CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
118  }
119  const int N = X.dim() > 0 ? X.dim32(0) : 1;
120  const int D = X.size_from_dim(1);
121  auto* result = Output(COS_OUT, {N}, at::dtype<float>());
122  float* result_data = result->template mutable_data<float>();
123  const float* X_data = X.data<float>();
124  const float* Y_data = Y.data<float>();
125  float X2, Y2;
126  const float kEps = 1e-12f;
127  for (int i = 0; i < N; ++i) { // TODO: multithreading
128  auto offset = i * D;
129  math::Dot<float, CPUContext>(
130  D, X_data + offset, X_data + offset, &X2, &context_);
131  math::Dot<float, CPUContext>(
132  D, Y_data + offset, Y_data + offset, &Y2, &context_);
133  math::Dot<float, CPUContext>(
134  D, X_data + offset, Y_data + offset, result_data + i, &context_);
135  result_data[i] /= std::sqrt(std::max(X2, kEps) * std::max(Y2, kEps));
136  }
137  return true;
138 }
139 
140 template <>
141 bool CosineSimilarityGradientOp<float, CPUContext>::RunOnDevice() {
142  auto& X = Input(X_IN);
143  auto& Y = Input(Y_IN);
144  auto& dCos = Input(DER_COS_IN);
145 
146  const int N = X.dim() > 0 ? X.dim32(0) : 1;
147  const int D = X.size_from_dim(1);
148  CAFFE_ENFORCE(X.dim() == Y.dim());
149  for (int i = 0; i < X.dim(); ++i) {
150  CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
151  }
152  CAFFE_ENFORCE(dCos.dim() == 1);
153  CAFFE_ENFORCE(dCos.dim32(0) == N);
154  auto* dX = Output(DER_X_OUT, X.sizes(), at::dtype<float>());
155  auto* dY = Output(DER_Y_OUT, Y.sizes(), at::dtype<float>());
156 
157  const auto* X_data = X.template data<float>();
158  const auto* Y_data = Y.template data<float>();
159  const auto* dCos_data = dCos.template data<float>();
160  auto* dX_data = dX->template mutable_data<float>();
161  auto* dY_data = dY->template mutable_data<float>();
162  float XN, YN, XY;
163  const float kEps = 1e-12f;
164  for (int i = 0; i < N; ++i) { // TODO: multithreading
165  auto offset = i * D;
166 
167  // TODO: cache these result from the forward pass
168  // ||x||
169  math::Dot<float, CPUContext>(
170  D, X_data + offset, X_data + offset, &XN, &context_);
171  XN = std::sqrt(std::max(XN, kEps));
172  // ||y||
173  math::Dot<float, CPUContext>(
174  D, Y_data + offset, Y_data + offset, &YN, &context_);
175  YN = std::sqrt(std::max(YN, kEps));
176  // ||x|| * || y ||
177  float XYN = XN * YN;
178  // x^Ty
179  math::Dot<float, CPUContext>(
180  D, X_data + offset, Y_data + offset, &XY, &context_);
181  // dX = dCos * (Y / (||X|| ||Y||) - (X.Y) * X / (||X||^3 ||Y||)); dY is analogous with X and Y swapped
182  math::Scale<float, float, CPUContext>(
183  D, dCos_data[i] / XYN, Y_data + offset, dX_data + offset, &context_);
184  math::Axpy(
185  D,
186  -dCos_data[i] * XY / (XN * XN * XYN),
187  X_data + offset,
188  dX_data + offset,
189  &context_);
190 
191  math::Scale<float, float, CPUContext>(
192  D, dCos_data[i] / XYN, X_data + offset, dY_data + offset, &context_);
193  math::Axpy(
194  D,
195  -dCos_data[i] * XY / (YN * YN * XYN),
196  Y_data + offset,
197  dY_data + offset,
198  &context_);
199  }
200 
201  return true;
202 }
203 
204 template <>
205 bool DotProductOp<float, CPUContext>::RunOnDevice() {
206  auto& X = Input(X_IN);
207  auto& Y = Input(Y_IN);
208 
209  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
210  for (int i = 0; i < X.dim(); ++i) {
211  CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i), "dimension at ", i);
212  }
213  int N, D;
214  if (X.numel() > 0) {
215  N = X.dim() > 0 ? X.dim32(0) : 1;
216  D = X.numel() / N;
217  } else {
218  N = 0;
219  D = 0;
220  }
221  auto* result = Output(DOT_OUT, {N}, at::dtype<float>());
222  float* result_data = result->template mutable_data<float>();
223  const float* X_data = X.template data<float>();
224  const float* Y_data = Y.template data<float>();
225  for (int i = 0; i < N; ++i) { // TODO: multithreading
226  auto offset = i * D;
227  math::Dot<float, CPUContext>(
228  D, X_data + offset, Y_data + offset, result_data + i, &context_);
229  }
230  return true;
231 }
232 
233 vector<TensorShape> TensorInferenceForDotProduct(
234  const OperatorDef& /* def */,
235  const vector<TensorShape>& in) {
236  CAFFE_ENFORCE_GT(in.size(), 0);
237 
238  vector<int64_t> dims(1);
239  dims[0] = in[0].dims().size() > 0 ? in[0].dims(0) : 1;
240  return vector<TensorShape>{CreateTensorShape(dims, in[0].data_type())};
241 }
242 
243 OpSchema::Cost CostInferenceForDotProduct(
244  const OperatorDef& def,
245  const vector<TensorShape>& in) {
246  std::vector<TensorShape> out = TensorInferenceForDotProduct(def, in);
247  CAFFE_ENFORCE_GT(out.size(), 0);
248  CAFFE_ENFORCE_EQ(out[0].dims().size(), 1);
249 
250  struct OpSchema::Cost c = PointwiseCostInference<2>(def, in);
251  c.bytes_written = out[0].dims(0) * sizeof(out[0].data_type());
252  c.params_bytes = 0;
253  return c;
254 }
255 
256 template <>
257 bool DotProductGradientOp<float, CPUContext>::RunOnDevice() {
258  auto& X = Input(X_IN);
259  auto& Y = Input(Y_IN);
260  auto& dDot = Input(DER_DOT_IN);
261 
262  int N, D;
263  if (X.numel() > 0) {
264  N = X.dim() > 0 ? X.dim32(0) : 1;
265  D = X.numel() / N;
266  } else {
267  N = 0;
268  D = 0;
269  }
270  CAFFE_ENFORCE(X.dim() == Y.dim());
271  for (int i = 0; i < X.dim(); ++i) {
272  CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
273  }
274  CAFFE_ENFORCE(dDot.dim() == 1);
275  CAFFE_ENFORCE(dDot.dim32(0) == N);
276  auto* dX = Output(DER_X_OUT, X.sizes(), at::dtype<float>());
277  auto* dY = Output(DER_Y_OUT, Y.sizes(), at::dtype<float>());
278 
279  const auto* X_data = X.template data<float>();
280  const auto* Y_data = Y.template data<float>();
281  const auto* dDot_data = dDot.template data<float>();
282  auto* dX_data = dX->template mutable_data<float>();
283  auto* dY_data = dY->template mutable_data<float>();
284  for (int i = 0; i < N; ++i) { // TODO: multithreading
285  auto offset = i * D;
286  math::Scale<float, float, CPUContext>(
287  D, dDot_data[i], X_data + offset, dY_data + offset, &context_);
288  math::Scale<float, float, CPUContext>(
289  D, dDot_data[i], Y_data + offset, dX_data + offset, &context_);
290  }
291  return true;
292 }
293 
294 template <>
295 bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
296  auto& X = Input(X_IN);
297  auto& Y = Input(Y_IN);
298 
299  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
300  CAFFE_ENFORCE_EQ(X.dim32(0), Y.dim32(0));
301 
302  int N, D, DX, DY, restD;
303  if (X.numel() > 0) {
304  N = X.dim() > 0 ? X.dim32(0) : 1;
305  DX = X.numel() / N;
306  DY = Y.numel() / N;
307  } else {
308  N = 0;
309  DX = 0;
310  DY = 0;
311  }
312 
313  D = std::min(DX, DY);
314  restD = std::max(DX, DY) - D;
315  auto* result = Output(DOT_OUT, {N}, at::dtype<float>());
316  float* result_data = result->template mutable_data<float>();
317  const float* X_data = X.data<float>();
318  const float* Y_data = Y.data<float>();
319  for (int i = 0; i < N; ++i) { // TODO: multithreading
320  auto offsetX = i * DX, offsetY = i * DY;
321  if (replicate_) {
322  // L_ for longer vector and S_ for shorter vector
323  const float *L_data, *S_data;
324  int DL, DS;
325  if (DX > DY) {
326  L_data = X_data + offsetX;
327  S_data = Y_data + offsetY;
328  DL = DX;
329  DS = DY;
330  } else {
331  L_data = Y_data + offsetY;
332  S_data = X_data + offsetX;
333  DL = DY;
334  DS = DX;
335  }
336  float sum = 0.0;
337  float tmp = 0.0;
338  for (int j = 0; j < DL / DS; j++) {
339  math::Dot<float, CPUContext>(
340  DS, L_data + j * DS, S_data, &tmp, &context_);
341  sum += tmp;
342  }
343  *(result_data + i) = sum;
344  } else {
345  math::Dot<float, CPUContext>(
346  D, X_data + offsetX, Y_data + offsetY, result_data + i, &context_);
347  }
348 
349  if (!replicate_ && DX != DY) {
350  const float* rest_data;
351  float rest_sum = 0;
352  if (DX > DY) {
353  rest_data = X_data + offsetX + D;
354  } else {
355  rest_data = Y_data + offsetY + D;
356  }
357  math::Sum<float, CPUContext>(restD, rest_data, &rest_sum, &context_);
358  result_data[i] += rest_sum * pad_value_;
359  }
360  }
361  return true;
362 }
363 
364 // L2
365 REGISTER_CPU_OPERATOR(SquaredL2Distance,
366  SquaredL2DistanceOp<float, CPUContext>);
367 REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
368  SquaredL2DistanceGradientOp<float, CPUContext>);
369 
370 OPERATOR_SCHEMA(SquaredL2Distance)
371  .NumInputs(2)
372  .NumOutputs(1)
373  .IdenticalTypeAndShapeOfInputDim(0, 0)
374  .SetDoc(R"DOC(
375 Given two input float tensors X and Y, produces one output float tensor
376 containing the squared L2 distance between corresponding rows of X and Y, computed as $\frac{1}{2}\|X - Y\|^2$.
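
For intuition, the row-wise computation can be sketched in NumPy (an illustrative sketch of the operator's semantics with made-up values, not the implementation; it assumes 2D inputs):

```
import numpy as np

X = np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32)
Y = np.array([[1., 1., 1.], [2., 2., 2.]], dtype=np.float32)

# One value per row: half of the squared L2 distance between corresponding rows.
Z = 0.5 * np.sum((X - Y) ** 2, axis=1)
print(Z)  # [ 2.5 14.5]
```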
377 )DOC")
378  .Input(0, "X", "1D or 2D input tensor")
379  .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
380  .Output(0, "Z", "1D output tensor");
382 OPERATOR_SCHEMA(SquaredL2DistanceGradient).NumInputs(3).NumOutputs(2);
383 
384 class GetSquaredL2DistanceGradient : public GradientMakerBase {
385  using GradientMakerBase::GradientMakerBase;
386  vector<OperatorDef> GetGradientDefs() override {
387  return SingleGradientDef(
388  "SquaredL2DistanceGradient", "",
389  vector<string>{I(0), I(1), GO(0)},
390  vector<string>{GI(0), GI(1)});
391  }
392 };
393 REGISTER_GRADIENT(SquaredL2Distance, GetSquaredL2DistanceGradient);
394 
395 // L1
396 REGISTER_CPU_OPERATOR(L1Distance, L1DistanceOp<float, CPUContext>);
397 REGISTER_CPU_OPERATOR(
398  L1DistanceGradient,
399  L1DistanceGradientOp<float, CPUContext>);
400 #ifdef CAFFE2_USE_MKLDNN
401 REGISTER_IDEEP_OPERATOR(
402  L1DistanceGradient,
403  IDEEPFallbackOp<L1DistanceGradientOp<float, CPUContext>>);
404 #endif
405 
406 OPERATOR_SCHEMA(L1Distance)
407  .NumInputs(2)
408  .NumOutputs(1)
409  .IdenticalTypeAndShapeOfInputDim(0, 0)
410  .SetDoc(R"DOC(
411 Computes the row-wise L1 Distance between the two input tensors $X$ and $Y$, which is defined as
412 
413 $$L1Distance(\mathbf{x},\mathbf{y}) = \sum_{i}\mid x_i - y_i\mid$$
415 Note that both inputs must be either 1-dimensional or 2-dimensional, and both must have the same shape. The output $Z$ is always 1-dimensional, and its length equals the number of rows in the inputs.
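
As a quick reference, the same row-wise quantity can be written in NumPy (an illustrative sketch only, using the same values as the workspace example below):

```
import numpy as np

X = 5 * np.ones((1, 4), dtype=np.float32)
Y = np.ones((1, 4), dtype=np.float32)

# One value per row: the sum of absolute element-wise differences.
Z = np.abs(X - Y).sum(axis=1)
print(Z)  # [16.]
```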
416 
417 Github Links:
418 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.h
419 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc
420 
421 <details>
422 
423 <summary> <b>Example</b> </summary>
424 
425 **Code**
426 
427 ```
428 
429 workspace.ResetWorkspace()
430 
431 op = core.CreateOperator(
432  "L1Distance",
433  ["X", "Y"],
434  ["Z"]
435 )
436 
437 # Create X
438 X = 5*np.ones((1, 4))
439 print("X:\n",X)
440 
441 # Create Y
442 Y = np.ones((1, 4))
443 print("Y:\n",Y)
444 
445 # Feed X & Y into workspace
446 workspace.FeedBlob("X", X.astype(np.float32))
447 workspace.FeedBlob("Y", Y.astype(np.float32))
448 
449 # Run op
450 workspace.RunOperatorOnce(op)
452 # Collect Output
453 print("Z:\n", workspace.FetchBlob("Z"))
454 
455 ```
456 
457 **Result**
458 
459 ```
460 
461 X:
462  [[5. 5. 5. 5.]]
463 Y:
464  [[1. 1. 1. 1.]]
465 Z:
466  [16.]
467 
468 ```
469 
470 </details>
471 
472 )DOC")
473  .Input(0, "X", "First input tensor. (1D or 2D)")
474  .Input(1, "Y", "Second input tensor. (must have the same shape as $X$)")
475  .Output(0, "Z", "1D output tensor. One value for each row of the inputs.");
476 
477 OPERATOR_SCHEMA(L1DistanceGradient).NumInputs(3).NumOutputs(2);
478 
479 class GetL1DistanceGradient : public GradientMakerBase {
480  using GradientMakerBase::GradientMakerBase;
481  vector<OperatorDef> GetGradientDefs() override {
482  return SingleGradientDef(
483  "L1DistanceGradient",
484  "",
485  vector<string>{I(0), I(1), GO(0)},
486  vector<string>{GI(0), GI(1)});
487  }
488 };
489 
490 REGISTER_GRADIENT(L1Distance, GetL1DistanceGradient);
491 
492 // Dot Product
493 REGISTER_CPU_OPERATOR(DotProduct, DotProductOp<float, CPUContext>);
494 REGISTER_CPU_OPERATOR(
495  DotProductGradient,
496  DotProductGradientOp<float, CPUContext>);
497 
498 OPERATOR_SCHEMA(DotProduct)
499  .NumInputs(2)
500  .NumOutputs(1)
501  .IdenticalTypeAndShapeOfInputDim(0, 0)
502  .SetDoc(R"DOC(
503 Computes and outputs the dot product of the two input float tensors `X` and `Y`.
504 Note that `X` and `Y` must be either 1D or 2D, and they must be the same shape.
505 The output tensor is 1D: if the inputs are 1D, it contains the element-wise
506 products of `X` and `Y`; if the inputs are 2D, it contains one dot product per
507 row. Note that the conventional dot product of two 1D vectors is a single
508 scalar, which here equals the sum of the elements in the 1D output
509 tensor.
510 
511 For 1D inputs:
512 Given two vectors $X = [x_0, x_1, x_2]$ and $Y = [y_0, y_1, y_2]$; $Z = [x_0 * y_0, x_1 * y_1, x_2 * y_2]$
513 
514 For 2D inputs:
515 Given two matrices:
516 $$X = [[x_0^0, x_1^0, x_2^0], \\ [x_0^1, x_1^1, x_2^1], \\ [x_0^2, x_1^2, x_2^2], \\ ..., \\ [x_0^n, x_1^n, x_2^n]]$$
517 
518 and
519 
520 $$Y = [[y_0^0, y_1^0, y_2^0], \\ [y_0^1, y_1^1, y_2^1], \\ [y_0^2, y_1^2, y_2^2], \\ ..., \\ [y_0^n, y_1^n, y_2^n]]$$
522 then
523 
524 $$Z = \biggl[\Big((x_0^0 * y_0^0) + (x_1^0 * y_1^0) + (x_2^0 * y_2^0)\Big), \\ \Big((x_0^1 * y_0^1) + (x_1^1 * y_1^1) + (x_2^1 * y_2^1)\Big), \\ \Big((x_0^2 * y_0^2) + (x_1^2 * y_1^2) + (x_2^2 * y_2^2)\Big), \\ ..., \\ \Big((x_0^n * y_0^n) + (x_1^n * y_1^n) + (x_2^n * y_2^n)\Big)\biggr]$$
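
Equivalently, a minimal NumPy sketch of the semantics described above (illustrative values only, not the operator implementation):

```
import numpy as np

# 1D case: element-wise products.
X1 = np.array([1., 2., 3.], dtype=np.float32)
Y1 = np.array([4., 5., 6.], dtype=np.float32)
print(X1 * Y1)                # [ 4. 10. 18.]

# 2D case: one dot product per row.
X2 = np.array([[1., 2.], [3., 4.]], dtype=np.float32)
Y2 = np.array([[5., 6.], [7., 8.]], dtype=np.float32)
print((X2 * Y2).sum(axis=1))  # [17. 53.]
```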
525 
526 Github Link:
527 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc
528 
529 <details>
530 
531 <summary> <b>Example</b> </summary>
532 
533 **Code**
534 
535 ```
536 
537 workspace.ResetWorkspace()
538 
539 op = core.CreateOperator(
540  "DotProduct",
541  ["X", "Y"],
542  ["Z"]
543 )
544 
545 workspace.FeedBlob("X", np.random.randint(20, size=(5)).astype(np.float32))
546 workspace.FeedBlob("Y", np.random.randint(20, size=(5)).astype(np.float32))
547 print("X:\n", workspace.FetchBlob("X"))
548 print("Y:\n", workspace.FetchBlob("Y"))
549 workspace.RunOperatorOnce(op)
550 print("Z:\n", workspace.FetchBlob("Z"))
551 
552 
553 workspace.ResetWorkspace()
554 workspace.FeedBlob("X", np.random.randint(10, size=(3,3)).astype(np.float32))
555 workspace.FeedBlob("Y", np.random.randint(10, size=(3,3)).astype(np.float32))
556 print("X:\n", workspace.FetchBlob("X"))
557 print("Y:\n", workspace.FetchBlob("Y"))
558 workspace.RunOperatorOnce(op)
559 print("Z:\n", workspace.FetchBlob("Z"))
560 
561 ```
562 
563 **Result**
564 
565 ```
566 
567 X:
568  [ 2. 15. 2. 7. 12.]
569 Y:
570  [ 3. 12. 9. 3. 18.]
571 Z:
572  [  6. 180.  18.  21. 216.]
573 X:
574  [[2. 0. 4.]
575  [7. 7. 4.]
576  [7. 9. 9.]]
577 Y:
578  [[2. 0. 8.]
579  [9. 6. 1.]
580  [7. 8. 0.]]
581 Z:
582  [ 36. 109. 121.]
583 
584 ```
585 
586 </details>
587 
588 )DOC")
589  .Input(0, "X", "*(type: Tensor`<float>`)* 1D or 2D input tensor.")
590  .Input(
591  1,
592  "Y",
593  "*(type: Tensor`<float>`)* 1D or 2D input tensor (must have the same shape as X).")
594  .Output(0, "Z", "*(type: Tensor`<float>`)* 1D output tensor.")
595  .TensorInferenceFunction(TensorInferenceForDotProduct)
596  .CostInferenceFunction(
597  OpSchema::CostInferenceFunctionType(CostInferenceForDotProduct))
598  .InheritOnnxSchema();
599 
600 OPERATOR_SCHEMA(DotProductGradient).NumInputs(3).NumOutputs(2);
601 
602 class GetDotProductGradient : public GradientMakerBase {
603  using GradientMakerBase::GradientMakerBase;
604  vector<OperatorDef> GetGradientDefs() override {
605  return SingleGradientDef(
606  "DotProductGradient",
607  "",
608  vector<string>{I(0), I(1), GO(0)},
609  vector<string>{GI(0), GI(1)});
610  }
611 };
612 REGISTER_GRADIENT(DotProduct, GetDotProductGradient);
613 
614 // Cosine Similarity
615 REGISTER_CPU_OPERATOR(CosineSimilarity, CosineSimilarityOp<float, CPUContext>);
616 REGISTER_CPU_OPERATOR(
617  CosineSimilarityGradient,
618  CosineSimilarityGradientOp<float, CPUContext>);
619 
620 OPERATOR_SCHEMA(CosineSimilarity)
621  .NumInputs(2)
622  .NumOutputs(1)
623  .IdenticalTypeAndShapeOfInputDim(0, 0)
624  .SetDoc(R"DOC(
625 This op takes two input float tensors of the same size, $X$ and $Y$, and produces one output float tensor, $Z$, calculated as the cosine similarity between $X$ and $Y$. Recall that the cosine similarity between two tensors $X$ and $Y$ is defined as:
626 
627 $$\mathbf{Z}=CosineSimilarity(\mathbf{X},\mathbf{Y}) = \frac{\mathbf{X}\cdot\mathbf{Y}}{\|\mathbf{X}\|\|\mathbf{Y}\|} = \frac{\sum_{i=1}^{n}X_iY_i}{\sqrt{\sum_{i=1}^{n}X_i^2}\sqrt{\sum_{i=1}^{n}Y_i^2}}$$
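
A minimal NumPy sketch of this formula, applied row by row (illustrative values only; the CPU implementation additionally clamps each squared norm below by a small epsilon before taking the square root):

```
import numpy as np

X = np.array([[1., 0., 0.], [1., 2., 2.]], dtype=np.float32)
Y = np.array([[0.5, 0., 0.], [2., 4., 4.]], dtype=np.float32)

Z = (X * Y).sum(axis=1) / (np.linalg.norm(X, axis=1) * np.linalg.norm(Y, axis=1))
print(Z)  # [1. 1.] -- each row of Y is a positive multiple of the matching row of X
```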
628 
629 Github Links:
630 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.h
631 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc
632 
633 <details>
634 
635 <summary> <b>Example</b> </summary>
636 
637 **Code**
638 
639 ```
640 
641 workspace.ResetWorkspace()
642 
643 op = core.CreateOperator(
644  "CosineSimilarity",
645  ["X", "Y"],
646  ["Z"]
647 )
648 
649 # Create X
650 X = np.random.randn(3, 3)
651 print("X:\n",X)
652 
653 # Create Y
654 Y = np.random.randn(3, 3)
655 print("Y:\n",Y)
656 
657 # Feed X & Y into workspace
658 workspace.FeedBlob("X", X.astype(np.float32))
659 workspace.FeedBlob("Y", Y.astype(np.float32))
660 
661 # Run op
662 workspace.RunOperatorOnce(op)
663 
664 # Collect Output
665 print("Z:\n", workspace.FetchBlob("Z"))
666 
667 ```
668 
669 **Result**
670 
671 ```
672 
673 X:
674  [[-0.42635564 -0.23831588 -0.25515547]
675  [ 1.43914719 -1.05613228 1.01717373]
676  [ 0.06883105 0.33386519 -1.46648334]]
677 Y:
678  [[-0.90648691 -0.14241514 -1.1070837 ]
679  [ 0.92152729 -0.28115511 -0.17756722]
680  [-0.88394254 1.34654037 -0.80080998]]
681 Z:
682  [-1.7849885e-23 1.7849885e-23 -1.0842022e-07]
683 
684 ```
685 
686 </details>
687 
688 )DOC")
689  .Input(0, "X", "1D or 2D input tensor")
690  .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
691  .Output(0, "Z", "1D output tensor");
692 
693 OPERATOR_SCHEMA(CosineSimilarityGradient).NumInputs(3).NumOutputs(2);
694 
695 class GetCosineSimilarityGradient : public GradientMakerBase {
696  using GradientMakerBase::GradientMakerBase;
697  vector<OperatorDef> GetGradientDefs() override {
698  return SingleGradientDef(
699  "CosineSimilarityGradient",
700  "",
701  vector<string>{I(0), I(1), GO(0)},
702  vector<string>{GI(0), GI(1)});
703  }
704 };
705 REGISTER_GRADIENT(CosineSimilarity, GetCosineSimilarityGradient);
706 
707 // Dot Product allows padding
708 REGISTER_CPU_OPERATOR(
709  DotProductWithPadding,
710  DotProductWithPaddingOp<float, CPUContext>);
711 REGISTER_CPU_OPERATOR(
712  DotProductWithPaddingGradient,
713  DotProductWithPaddingGradientOp<float, CPUContext>);
714 
715 OPERATOR_SCHEMA(DotProductWithPadding)
716  .NumInputs(2)
717  .NumOutputs(1)
718  .SetDoc(R"DOC(
719 Given two input float tensors X and Y with different shapes, produces one
720 output float tensor of the dot product between X and Y. Two strategies are
721 currently supported for reconciling the shapes before the normal dot product
722 is taken: 1) pad the smaller tensor (using pad_value) to the same shape as the
723 other one, or 2) replicate the smaller tensor to the same shape as the other one.
724 Note that the first dimension of X and Y must be equal, and only the second dimension of X or Y
725 can be padded.
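
For illustration, the two strategies can be sketched in NumPy as follows (a rough sketch of the documented behavior with made-up values, not the operator itself):

```
import numpy as np

X = np.array([[1., 2., 3., 4.]], dtype=np.float32)  # shape (1, 4)
Y = np.array([[1., 2.]], dtype=np.float32)          # shape (1, 2)
pad_value = 0.5

# Strategy 1: pad the shorter rows with pad_value, then take row-wise dot products.
Y_pad = np.pad(Y, ((0, 0), (0, X.shape[1] - Y.shape[1])), constant_values=pad_value)
print((X * Y_pad).sum(axis=1))  # 1*1 + 2*2 + 3*0.5 + 4*0.5 -> [8.5]

# Strategy 2 (replicate): tile the shorter rows up to the longer length.
Y_rep = np.tile(Y, (1, X.shape[1] // Y.shape[1]))
print((X * Y_rep).sum(axis=1))  # 1*1 + 2*2 + 3*1 + 4*2 -> [16.]
```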
726 )DOC")
727  .Input(0, "X", "1D or 2D input tensor")
728  .Input(1, "Y", "1D or 2D input tensor")
729  .Output(0, "Z", "1D output tensor")
730  .IdenticalTypeAndShapeOfInputDim(0, 0)
731  .Arg("pad_value", "the padding value for tensors with smaller dimension")
732  .Arg("replicate", "whether to replicate the smaller tensor or not");
733 
734 OPERATOR_SCHEMA(DotProductWithPaddingGradient).NumInputs(3).NumOutputs(2);
735 
736 class GetDotProductWithPaddingGradient : public GradientMakerBase {
737  using GradientMakerBase::GradientMakerBase;
738  vector<OperatorDef> GetGradientDefs() override {
739  float pad_value = 0;
740  bool replicate = false;
741  if (ArgumentHelper::HasArgument(Def(), "pad_value")) {
742  pad_value = GetArgument(Def(), "pad_value").f();
743  }
744  if (ArgumentHelper::HasArgument(Def(), "replicate")) {
745  replicate = GetArgument(Def(), "replicate").i();
746  }
747 
748  const auto dot_arg =
749  vector<Argument>{MakeArgument<float>("pad_value", pad_value),
750  MakeArgument<bool>("replicate", replicate)};
751 
752  return SingleGradientDef(
753  "DotProductWithPaddingGradient",
754  "",
755  vector<string>{I(0), I(1), GO(0)},
756  vector<string>{GI(0), GI(1)},
757  dot_arg);
758  }
759 };
760 REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
761 } // namespace caffe2