Caffe2 - C++ API
A deep learning, cross-platform ML framework
distance_op.cc
// Copyright (c) 2016-present, Facebook, Inc.
#include "caffe2/operators/distance_op.h"

namespace caffe2 {

template <>
bool SquaredL2DistanceOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto* distance = Output(0);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N = X.ndim() > 0 ? X.dim32(0) : 1;
  distance->Resize(N);
  int D = N > 0 ? X.size() / N : 0;
  float* distance_data = distance->mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
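  // For each row i this evaluates 0.5 * (x_i . x_i + y_i . y_i) - x_i . y_i,
  // which equals 0.5 * ||x_i - y_i||^2, using three dot products per row.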
  for (int i = 0; i < N; ++i) {
    float Xscale, Yscale, cross;
    math::Dot<float, CPUContext>(
        D, X_data + i * D, X_data + i * D, &Xscale, &context_);
    math::Dot<float, CPUContext>(
        D, Y_data + i * D, Y_data + i * D, &Yscale, &context_);
    math::Dot<float, CPUContext>(
        D, X_data + i * D, Y_data + i * D, &cross, &context_);
    distance_data[i] = (Xscale + Yscale) * 0.5 - cross;
  }
  return true;
}

template <>
bool L1DistanceOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto* distance = Output(0);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N = X.ndim() > 0 ? X.dim32(0) : 1;
  distance->Resize(N);
  int D = N > 0 ? X.size() / N : 0;

  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();

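  // Each output element is the sum of absolute element-wise differences of
  // row i, evaluated through Eigen vector maps over the row's D elements.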
  for (int i = 0; i < N; ++i) {
    (distance->mutable_data<float>())[i] =
        (ConstEigenVectorMap<float>(X_data + i * D, D).array() -
         ConstEigenVectorMap<float>(Y_data + i * D, D).array())
            .abs()
            .sum();
  }
  return true;
}

template <>
bool L1DistanceGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dDistance = Input(2);
  auto* dX = Output(0);
  auto* dY = Output(1);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N = X.ndim() > 0 ? X.dim32(0) : 1;
  int D = N > 0 ? X.size() / N : 0;
  CAFFE_ENFORCE(X.ndim() == Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dDistance.ndim() == 1);
  CAFFE_ENFORCE(dDistance.dim32(0) == N);
  dX->ResizeLike(X);
  dY->ResizeLike(Y);

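  // Subgradient of |x - y|: each gradient element is +/- dDistance[i]
  // depending on sign(x - y), with differences within kEps of zero mapped to 0.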
  for (int i = 0; i < N; ++i) {
    auto offset = i * D;
    for (int j = 0; j < D; ++j) {
      const float temp =
          (X.data<float>())[offset + j] - (Y.data<float>())[offset + j];
      const float kEps = 1e-12f;
      if (temp < -kEps) {
        dX->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
        dY->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
      } else if (temp > kEps) {
        dX->mutable_data<float>()[offset + j] = (dDistance.data<float>())[i];
        dY->mutable_data<float>()[offset + j] = -(dDistance.data<float>())[i];
      } else {
        dX->mutable_data<float>()[offset + j] = 0;
        dY->mutable_data<float>()[offset + j] = 0;
      }
    }
  }
  return true;
}

template <>
bool CosineSimilarityOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto* result = Output(COS_OUT);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  const int N = X.ndim() > 0 ? X.dim32(0) : 1;
  const int D = X.size_from_dim(1);
  result->Resize(N);
  float* result_data = result->mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
  float X2, Y2;
  const float kEps = 1e-12f;
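  // result[i] = x_i . y_i / (||x_i|| * ||y_i||); kEps keeps the denominator
  // away from zero when a row is all zeros.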
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;
    math::Dot<float, CPUContext>(
        D, X_data + offset, X_data + offset, &X2, &context_);
    math::Dot<float, CPUContext>(
        D, Y_data + offset, Y_data + offset, &Y2, &context_);
    math::Dot<float, CPUContext>(
        D, X_data + offset, Y_data + offset, result_data + i, &context_);
    result_data[i] /= std::sqrt(std::max(X2, kEps) * std::max(Y2, kEps));
  }
  return true;
}

template <>
bool CosineSimilarityGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto& dCos = Input(DER_COS_IN);
  auto* dX = Output(DER_X_OUT);
  auto* dY = Output(DER_Y_OUT);
  const int N = X.ndim() > 0 ? X.dim32(0) : 1;
  const int D = X.size_from_dim(1);
  CAFFE_ENFORCE(X.ndim() == Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dCos.ndim() == 1);
  CAFFE_ENFORCE(dCos.dim32(0) == N);
  dX->ResizeLike(X);
  dY->ResizeLike(Y);

  const auto* X_data = X.template data<float>();
  const auto* Y_data = Y.template data<float>();
  const auto* dCos_data = dCos.template data<float>();
  auto* dX_data = dX->template mutable_data<float>();
  auto* dY_data = dY->template mutable_data<float>();
  float XN, YN, XY;
  const float kEps = 1e-12f;
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;

    // TODO: cache these results from the forward pass
    // ||x||
    math::Dot<float, CPUContext>(
        D, X_data + offset, X_data + offset, &XN, &context_);
    XN = std::sqrt(std::max(XN, kEps));
    // ||y||
    math::Dot<float, CPUContext>(
        D, Y_data + offset, Y_data + offset, &YN, &context_);
    YN = std::sqrt(std::max(YN, kEps));
    // ||x|| * ||y||
    float XYN = XN * YN;
    // x^T y
    math::Dot<float, CPUContext>(
        D, X_data + offset, Y_data + offset, &XY, &context_);

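    // dX = dCos[i] * ( y / (||x|| ||y||) - (x . y) * x / (||x||^3 ||y||) ),
    // and symmetrically for dY, assembled below from one Scale and one Axpy.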
    math::Scale<float, CPUContext>(
        D, dCos_data[i] / XYN, Y_data + offset, dX_data + offset, &context_);
    math::Axpy(
        D,
        -dCos_data[i] * XY / (XN * XN * XYN),
        X_data + offset,
        dX_data + offset,
        &context_);

    math::Scale<float, CPUContext>(
        D, dCos_data[i] / XYN, X_data + offset, dY_data + offset, &context_);
    math::Axpy(
        D,
        -dCos_data[i] * XY / (YN * YN * XYN),
        Y_data + offset,
        dY_data + offset,
        &context_);
  }

  return true;
}

template <>
bool DotProductOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto* result = Output(DOT_OUT);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i), "dimension at ", i);
  }
  int N, D;
  if (X.size() > 0) {
    N = X.ndim() > 0 ? X.dim32(0) : 1;
    D = X.size() / N;
  } else {
    N = 0;
    D = 0;
  }
  result->Resize(N);
  float* result_data = result->template mutable_data<float>();
  const float* X_data = X.template data<float>();
  const float* Y_data = Y.template data<float>();
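  // result[i] is the dot product of row i of X with row i of Y.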
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;
    math::Dot<float, CPUContext>(
        D, X_data + offset, Y_data + offset, result_data + i, &context_);
  }
  return true;
}

template <>
bool DotProductGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto& dDot = Input(DER_DOT_IN);
  auto* dX = Output(DER_X_OUT);
  auto* dY = Output(DER_Y_OUT);
  int N, D;
  if (X.size() > 0) {
    N = X.ndim() > 0 ? X.dim32(0) : 1;
    D = X.size() / N;
  } else {
    N = 0;
    D = 0;
  }
  CAFFE_ENFORCE(X.ndim() == Y.ndim());
  for (int i = 0; i < X.ndim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dDot.ndim() == 1);
  CAFFE_ENFORCE(dDot.dim32(0) == N);
  dX->ResizeLike(X);
  dY->ResizeLike(Y);

  const auto* X_data = X.template data<float>();
  const auto* Y_data = Y.template data<float>();
  const auto* dDot_data = dDot.template data<float>();
  auto* dX_data = dX->template mutable_data<float>();
  auto* dY_data = dY->template mutable_data<float>();
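  // d(x . y)/dx = y and d(x . y)/dy = x, so each gradient row is the other
  // input's row scaled by dDot[i].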
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offset = i * D;
    math::Scale<float, CPUContext>(
        D, dDot_data[i], X_data + offset, dY_data + offset, &context_);
    math::Scale<float, CPUContext>(
        D, dDot_data[i], Y_data + offset, dX_data + offset, &context_);
  }
  return true;
}

template <>
bool DotProductWithPaddingOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto* result = Output(DOT_OUT);
  CAFFE_ENFORCE_EQ(X.ndim(), Y.ndim());
  CAFFE_ENFORCE_EQ(X.dim32(0), Y.dim32(0));

  int N, D, DX, DY, restD;
  if (X.size() > 0) {
    N = X.ndim() > 0 ? X.dim32(0) : 1;
    DX = X.size() / N;
    DY = Y.size() / N;
  } else {
    N = 0;
    DX = 0;
    DY = 0;
  }

  D = std::min(DX, DY);
  restD = std::max(DX, DY) - D;
  result->Resize(N);
  float* result_data = result->mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();
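  // Two modes: with replicate_, the shorter row is tiled across the longer one
  // and the partial dot products are summed; otherwise only the overlapping
  // prefix of length D is dotted and the leftover tail contributes
  // pad_value_ * sum(tail).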
  for (int i = 0; i < N; ++i) { // TODO: multithreading
    auto offsetX = i * DX, offsetY = i * DY;
    if (replicate_) {
      // L_ for longer vector and S_ for shorter vector
      const float *L_data, *S_data;
      int DL, DS;
      if (DX > DY) {
        L_data = X_data + offsetX;
        S_data = Y_data + offsetY;
        DL = DX;
        DS = DY;
      } else {
        L_data = Y_data + offsetY;
        S_data = X_data + offsetX;
        DL = DY;
        DS = DX;
      }
      float sum = 0.0;
      float tmp = 0.0;
      for (int j = 0; j < DL / DS; j++) {
        math::Dot<float, CPUContext>(
            DS, L_data + j * DS, S_data, &tmp, &context_);
        sum += tmp;
      }
      *(result_data + i) = sum;
    } else {
      math::Dot<float, CPUContext>(
          D, X_data + offsetX, Y_data + offsetY, result_data + i, &context_);
    }

    if (!replicate_ && DX != DY) {
      const float* rest_data;
      float rest_sum = 0;
      if (DX > DY) {
        rest_data = X_data + offsetX + D;
      } else {
        rest_data = Y_data + offsetY + D;
      }
      math::Sum<float, CPUContext>(restD, rest_data, &rest_sum, &context_);
      result_data[i] += rest_sum * pad_value_;
    }
  }
  return true;
}

// L2
REGISTER_CPU_OPERATOR(SquaredL2Distance,
                      SquaredL2DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SquaredL2DistanceGradient,
                      SquaredL2DistanceGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(SquaredL2Distance)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the squared L2 distance between X and Y, computed as
||X - Y||^2 / 2.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(SquaredL2DistanceGradient).NumInputs(3).NumOutputs(2);

class GetSquaredL2DistanceGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SquaredL2DistanceGradient", "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(SquaredL2Distance, GetSquaredL2DistanceGradient);

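// Each Get*Gradient maker in this file emits a single gradient operator that
// consumes the two forward inputs plus the output gradient (I(0), I(1), GO(0))
// and produces the two input gradients (GI(0), GI(1)); the padded variant
// additionally forwards its pad_value and replicate arguments.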
// L1
REGISTER_CPU_OPERATOR(L1Distance, L1DistanceOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    L1DistanceGradient,
    L1DistanceGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(L1Distance)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the L1 distance between X and Y, computed as
L1(x, y) = sum over |x - y|.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(L1DistanceGradient).NumInputs(3).NumOutputs(2);

class GetL1DistanceGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "L1DistanceGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};

REGISTER_GRADIENT(L1Distance, GetL1DistanceGradient);

// Dot Product
REGISTER_CPU_OPERATOR(DotProduct, DotProductOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    DotProductGradient,
    DotProductGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(DotProduct)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the dot product between X and Y.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(DotProductGradient).NumInputs(3).NumOutputs(2);

class GetDotProductGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "DotProductGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(DotProduct, GetDotProductGradient);

// Cosine Similarity
REGISTER_CPU_OPERATOR(CosineSimilarity, CosineSimilarityOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    CosineSimilarityGradient,
    CosineSimilarityGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(CosineSimilarity)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two input float tensors X and Y, this op produces one output float
tensor of the cosine similarity between X and Y.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)")
    .Output(0, "Z", "1D output tensor");

OPERATOR_SCHEMA(CosineSimilarityGradient).NumInputs(3).NumOutputs(2);

class GetCosineSimilarityGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "CosineSimilarityGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(CosineSimilarity, GetCosineSimilarityGradient);

// Dot Product allows padding
REGISTER_CPU_OPERATOR(
    DotProductWithPadding,
    DotProductWithPaddingOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    DotProductWithPaddingGradient,
    DotProductWithPaddingGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(DotProductWithPadding)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Given two input float tensors X and Y with possibly different shapes, this op
produces one output float tensor of the dot product between X and Y. Two
strategies are currently supported to make the shapes compatible before the
normal dot product: 1) pad the smaller tensor (using pad_value) to the same
shape as the other one, or 2) replicate the smaller tensor to the same shape
as the other one. Note that the first dimension of X and Y must be equal; only
the second dimension of X or Y can be padded.
)DOC")
    .Input(0, "X", "1D or 2D input tensor")
    .Input(1, "Y", "1D or 2D input tensor")
    .Output(0, "Z", "1D output tensor")
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .Arg("pad_value", "the padding value for tensors with smaller dimension")
    .Arg("replicate", "whether to replicate the smaller tensor or not");

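// For example, with rows x = [1, 2, 3, 4] and y = [1, 2]:
//   replicate = true : [1,2].[1,2] + [3,4].[1,2] = 5 + 11 = 16
//   replicate = false: [1,2].[1,2] + (3 + 4) * pad_value = 5 + 7 * pad_value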
OPERATOR_SCHEMA(DotProductWithPaddingGradient).NumInputs(3).NumOutputs(2);

class GetDotProductWithPaddingGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    float pad_value = 0;
    bool replicate = false;
    if (ArgumentHelper::HasArgument(Def(), "pad_value")) {
      pad_value = GetArgument(Def(), "pad_value").f();
    }
    if (ArgumentHelper::HasArgument(Def(), "replicate")) {
      replicate = GetArgument(Def(), "replicate").i();
    }

    const auto dot_arg =
        vector<Argument>{MakeArgument<float>("pad_value", pad_value),
                         MakeArgument<bool>("replicate", replicate)};

    return SingleGradientDef(
        "DotProductWithPaddingGradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0), GI(1)},
        dot_arg);
  }
};
REGISTER_GRADIENT(DotProductWithPadding, GetDotProductWithPaddingGradient);
} // namespace caffe2