Caffe2 - C++ API
A deep learning, cross-platform ML framework
cross_entropy_op.cc
#include "caffe2/operators/cross_entropy_op.h"

namespace caffe2 {

namespace {

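// Numerically stable per-element log-likelihood of a sigmoid unit. The two
// branches on (lgt >= 0) rewrite tgt * lgt - log(1 + exp(lgt)) as
//   lgt * (tgt - 1{lgt >= 0}) - log(1 + exp(-|lgt|))
// so a large positive logit is never exponentiated. The callers negate the
// summed result to obtain the loss.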
inline float sigmoid_xent_forward(float lgt, float tgt) {
  return lgt * (tgt - (lgt >= 0)) - log(1 + exp(lgt - 2 * lgt * (lgt >= 0)));
}

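// Derivative of sigmoid_xent_forward with respect to the logit:
//   d/dlgt [tgt * lgt - log(1 + exp(lgt))] = tgt - sigmoid(lgt).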
inline float sigmoid_xent_backward(float lgt, float tgt) {
  return tgt - 1. / (1. + exp(-lgt));
}
}

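// LabelCrossEntropy: given class probabilities X (N x D) and one integer
// label per example, computes
//   Y[i] = -log(max(X[i][label[i]], kLOG_THRESHOLD)).
// A 1D input X is treated as a single example (N = 1).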
template <>
bool LabelCrossEntropyOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto* Y = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  Y->Resize(N);
  const auto* Xdata = X.data<float>();
  const auto* labelData = label.data<int>();
  auto* Ydata = Y->mutable_data<float>();
  CAFFE_ENFORCE(
      (ConstEigenVectorArrayMap<int>(labelData, N) < D).all() &&
          (ConstEigenVectorArrayMap<int>(labelData, N) >= 0).all(),
      "Label seems to be outside of supported range. Supported labels are in "
      "range [0,",
      D,
      ")");
  for (int i = 0; i < N; ++i) {
    Ydata[i] = -log(std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD()));
  }
  return true;
}

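// SigmoidCrossEntropyWithLogits: the last dimension of `logits` indexes
// classes; for each example the per-class forward terms are summed, and the
// negated mean over classes is emitted, yielding one loss per example.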
template <>
bool SigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
  auto& logits = Input(0);
  auto& targets = Input(1);
  CAFFE_ENFORCE_EQ(logits.dims(), targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;

  auto* out = Output(0);
  if (logits.ndim() == 0) {
    out->Resize(std::vector<TIndex>{});
  } else {
    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
    out->Resize(dims);
  }
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    float value = 0;
    for (int j = 0; j < inner_size; ++j) {
      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
    out_ptr[i] = -value / inner_size;
  }
  return true;
}

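// Gradient of SigmoidCrossEntropyWithLogits: each logit receives
//   dL/dlogit = -(g / inner_size) * (target - sigmoid(logit)),
// where g is the incoming per-example gradient.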
template <>
bool SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::RunOnDevice() {
  auto& g = Input(0);
  auto& logits = Input(1);
  auto& targets = Input(2);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;
  CAFFE_ENFORCE(g.size() == outer_size);

  auto* out = Output(0);
  out->ResizeLike(logits);
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* g_ptr = g.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]);
      ++in_idx;
    }
  }
  return true;
}

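// WeightedSigmoidCrossEntropyWithLogits: identical to the unweighted op,
// except each per-element forward term is scaled by the matching weight
// before the per-example mean is taken.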
template <>
bool WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>::RunOnDevice() {
  auto& logits = Input(0);
  auto& targets = Input(1);
  auto& weights = Input(2);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  CAFFE_ENFORCE(weights.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;

  auto* out = Output(0);
  if (logits.ndim() == 0) {
    out->Resize(std::vector<TIndex>{});
  } else {
    std::vector<TIndex> dims(logits.dims().begin(), logits.dims().end() - 1);
    out->Resize(dims);
  }
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* weights_ptr = weights.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    float value = 0;
    for (int j = 0; j < inner_size; ++j) {
      value += sigmoid_xent_forward(logits_ptr[in_idx], targets_ptr[in_idx]) *
          weights_ptr[in_idx];
      ++in_idx;
    }
    out_ptr[i] = -value / inner_size;
  }
  return true;
}

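// Gradient of the weighted op: the unweighted per-logit gradient, scaled by
// the same per-element weight used in the forward pass.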
template <>
bool WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>::
    RunOnDevice() {
  auto& g = Input(0);
  auto& logits = Input(1);
  auto& targets = Input(2);
  auto& weights = Input(3);
  CAFFE_ENFORCE(logits.dims() == targets.dims());
  CAFFE_ENFORCE(weights.dims() == targets.dims());
  const auto inner_size = logits.ndim() > 0 ? logits.dims().back() : 1;
  const auto outer_size = logits.size() / inner_size;
  CAFFE_ENFORCE(g.size() == outer_size);

  auto* out = Output(0);
  out->ResizeLike(logits);
  auto* out_ptr = out->mutable_data<float>();

  auto* logits_ptr = logits.data<float>();
  auto* targets_ptr = targets.data<float>();
  auto* weights_ptr = weights.data<float>();
  auto* g_ptr = g.data<float>();

  auto in_idx = 0;
  for (int i = 0; i < outer_size; ++i) {
    auto g_factor = -g_ptr[i] / inner_size;
    for (int j = 0; j < inner_size; ++j) {
      out_ptr[in_idx] = g_factor *
          sigmoid_xent_backward(logits_ptr[in_idx], targets_ptr[in_idx]) *
          weights_ptr[in_idx];
      ++in_idx;
    }
  }
  return true;
}

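// Gradient of LabelCrossEntropy: dX is zero everywhere except at the true
// class of each example, where
//   dX[i][label[i]] = -dY[i] / max(X[i][label[i]], kLOG_THRESHOLD),
// matching the clamped log used in the forward pass.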
template <>
bool LabelCrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == 1));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  CAFFE_ENFORCE_EQ(dY.ndim(), 1);
  CAFFE_ENFORCE_EQ(dY.dim32(0), N);
  dX->ResizeLike(X);
  math::Set<float, CPUContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  const float* Xdata = X.data<float>();
  const float* dYdata = dY.data<float>();
  const int* labelData = label.data<int>();
  float* dXdata = dX->mutable_data<float>();
  for (int i = 0; i < N; ++i) {
    dXdata[i * D + labelData[i]] =
        -dYdata[i] / std::max(Xdata[i * D + labelData[i]], kLOG_THRESHOLD());
  }
  return true;
}

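// MakeTwoClass: expands a vector of probabilities p into rows [1 - p, p],
// adding a trailing dimension of size 2 for binary classification.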
template <>
bool MakeTwoClassOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto* Y = Output(0);
  auto shape = X.dims();
  shape.push_back(2);
  TIndex N = X.size();
  Y->Resize(shape);
  const auto* Xdata = X.data<float>();
  auto* Ydata = Y->mutable_data<float>();
  for (TIndex i = 0; i < N; ++i) {
    DCHECK_GE(Xdata[i], 0.0);
    DCHECK_LE(Xdata[i], 1.0);
    Ydata[i * 2] = 1.0 - Xdata[i];
    Ydata[i * 2 + 1] = Xdata[i];
  }
  return true;
}

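// Gradient of MakeTwoClass: since Y = [1 - X, X], the chain rule gives
// dX[i] = dY[i][1] - dY[i][0].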
template <>
bool MakeTwoClassGradientOp<float, CPUContext>::RunOnDevice() {
  auto& dY = Input(0);
  auto* dX = Output(0);
  auto shape = dY.dims();
  CAFFE_ENFORCE_GE(shape.size(), 1);
  CAFFE_ENFORCE_EQ(shape.back(), 2);
  shape.pop_back();
  dX->Resize(shape);
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  TIndex N = dX->size();
  // use eigen?
  for (TIndex i = 0; i < N; ++i) {
    dXdata[i] = dYdata[i * 2 + 1] - dYdata[i * 2];
  }
  return true;
}

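// CrossEntropy (soft labels): labels are per-class probabilities, and
//   Y[i] = -sum_j label[i][j] * log(max(X[i][j], kLOG_THRESHOLD)),
// computed column-wise over the D x N column-major Eigen views.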
template <>
bool CrossEntropyOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto* Y = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  Y->Resize(vector<TIndex>{N});
  const float* Xdata = X.data<float>();
  const float* labelData = label.data<float>();
  auto* Ydata = Y->mutable_data<float>();
  CAFFE_ENFORCE(
      (ConstEigenArrayMap<float>(labelData, D, N) <= 1.0f).all() &&
          (ConstEigenArrayMap<float>(labelData, D, N) >= 0.0f).all(),
      "Soft label seems incorrect: label value should be a probability ",
      "between 0 and 1.0. You may be using the wrong cross entropy operator; ",
      "use LabelCrossEntropy if the labels are integers whose values are at ",
      "most the number of classes, ",
      D,
      ".");
  EigenArrayMap<float>(Ydata, 1, N) =
      -(ConstEigenArrayMap<float>(labelData, D, N) *
        ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()).log())
           .colwise()
           .sum();
  return true;
}

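// Gradient of CrossEntropy:
//   dX[i][j] = -dY[i] * label[i][j] / max(X[i][j], kLOG_THRESHOLD),
// again using the clamped denominator from the forward pass.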
template <>
bool CrossEntropyGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& label = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  int N, D;
  if (X.ndim() > 1) {
    N = X.dim32(0);
    D = X.size_from_dim(1);
  } else {
    N = 1;
    D = X.dim32(0);
  }
  CAFFE_ENFORCE(
      (label.ndim() == 1) || (label.ndim() == 2 && label.dim32(1) == D));
  CAFFE_ENFORCE_EQ(label.dim32(0), N);
  CAFFE_ENFORCE_EQ(dY.ndim(), 1);
  CAFFE_ENFORCE_EQ(dY.dim32(0), N);
  dX->ResizeLike(X);
  math::Set<float, CPUContext>(
      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
  const float* Xdata = X.data<float>();
  const float* dYdata = dY.data<float>();
  const float* labelData = label.data<float>();
  float* dXdata = dX->mutable_data<float>();
  EigenArrayMap<float>(dXdata, D, N) =
      (ConstEigenArrayMap<float>(labelData, D, N) /
       ConstEigenArrayMap<float>(Xdata, D, N).cwiseMax(kLOG_THRESHOLD()))
          .rowwise() *
      (-ConstEigenVectorArrayMap<float>(dYdata, N).transpose());
  return true;
}

REGISTER_CPU_OPERATOR(LabelCrossEntropy,
                      LabelCrossEntropyOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LabelCrossEntropyGradient,
                      LabelCrossEntropyGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(LabelCrossEntropy)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Operator computes the cross entropy between the input and the label set. In
practice, it is most commonly used at the end of models, after the SoftMax
operator and before the AveragedLoss operator. Note that LabelCrossEntropy
assumes that the label provided is either a 1D array of size N (batch size) or
a 2D array of size N x 1 (batch size). Each entry in the label vector indicates
which is the correct class; as such, each entry must be between 0 and D - 1,
inclusive, where D is the total number of classes. The formula used is:

  Y[i] = -log(X[i][label[i]])

where X[i][j] is the classifier's predicted probability for example i and
class j, label[i] is the correct class for example i, and N is the batch size.
Each log has a lower limit for numerical stability.
)DOC")
    .Input(
        0,
        "X",
        "Input blob from the previous layer, which is almost always "
        "the result of a softmax operation; X is a 2D array of size N x D, "
        "where N is the batch size and D is the number of classes")
    .Input(1, "label", "Blob containing the labels used to compare the input")
    .Output(0, "Y", "Output blob after the cross entropy computation");
OPERATOR_SCHEMA(LabelCrossEntropyGradient)
    .NumInputs(3)
    .NumOutputs(1);

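// The gradient makers below declare, for each forward operator, which blobs
// its gradient operator consumes and produces: I(n) names the nth forward
// input, GO(0) the gradient of the forward output, and GI(0) the gradient of
// the forward input that the gradient op writes.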
class GetLabelCrossEntropyGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LabelCrossEntropyGradient", "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LabelCrossEntropy, GetLabelCrossEntropyGradient);

REGISTER_CPU_OPERATOR(MakeTwoClass,
                      MakeTwoClassOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(MakeTwoClassGradient,
                      MakeTwoClassGradientOp<float, CPUContext>);

REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogits,
    SigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SigmoidCrossEntropyWithLogitsGradient,
    SigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);

REGISTER_CPU_OPERATOR(
    WeightedSigmoidCrossEntropyWithLogits,
    WeightedSigmoidCrossEntropyWithLogitsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    WeightedSigmoidCrossEntropyWithLogitsGradient,
    WeightedSigmoidCrossEntropyWithLogitsGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(MakeTwoClass)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        [](const OperatorDef& /* unused */, const vector<TensorShape>& in) {
          vector<TensorShape> out(1);
          out[0].add_dims(in[0].dims(0));
          out[0].add_dims(2);
          return out;
        })
    .SetDoc(R"DOC(
Given a vector of probabilities, this operator transforms this into a 2-column
matrix with complementary probabilities for binary classification. In explicit
terms, given the vector X, the output Y has rows [1 - X[i], X[i]].
)DOC")
    .Input(0, "X", "Input vector of probabilities")
    .Output(
        0,
        "Y",
        "2-column matrix with complementary probabilities of X for "
        "binary classification");

OPERATOR_SCHEMA(MakeTwoClassGradient)
    .NumInputs(1)
    .NumOutputs(1);

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogits)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given two matrices logits and targets, of the same shape
(batch_size, num_classes), computes the sigmoid cross entropy between the two.
Returns a tensor of shape (batch_size,) of losses for each example.
)DOC")
    .Input(0, "logits", "matrix of logits for each example and class.")
    .Input(1, "targets", "matrix of targets, same shape as logits.")
    .Output(0, "xentropy", "Vector with the total xentropy for each example.");

OPERATOR_SCHEMA(SigmoidCrossEntropyWithLogitsGradient)
    .NumInputs(3)
    .NumOutputs(1);

OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogits)
    .NumInputs(3)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Given three matrices logits, targets, and weights, all of the same shape
(batch_size, num_classes), computes the weighted sigmoid cross entropy between
logits and targets. Specifically, at each position (r, c), this computes
weights[r, c] * crossentropy(sigmoid(logits[r, c]), targets[r, c]), and then
averages over each row.
Returns a tensor of shape (batch_size,) of losses for each example.
)DOC")
    .Input(0, "logits", "matrix of logits for each example and class.")
    .Input(1, "targets", "matrix of targets, same shape as logits.")
    .Input(2, "weights", "matrix of weights, same shape as logits.")
    .Output(0, "xentropy", "Vector with the total xentropy for each example.");

OPERATOR_SCHEMA(WeightedSigmoidCrossEntropyWithLogitsGradient)
    .NumInputs(4)
    .NumOutputs(1);

class GetMakeTwoClassGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "MakeTwoClassGradient",
        "",
        vector<string>{GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(MakeTwoClass, GetMakeTwoClassGradient);

class GetSigmoidCrossEntropyWithLogitsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "SigmoidCrossEntropyWithLogitsGradient",
        "",
        vector<string>{GO(0), I(0), I(1)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(
    SigmoidCrossEntropyWithLogits,
    GetSigmoidCrossEntropyWithLogitsGradient);

class GetWeightedSigmoidCrossEntropyWithLogitsGradient
    : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "WeightedSigmoidCrossEntropyWithLogitsGradient",
        "",
        vector<string>{GO(0), I(0), I(1), I(2)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(
    WeightedSigmoidCrossEntropyWithLogits,
    GetWeightedSigmoidCrossEntropyWithLogitsGradient);

REGISTER_CPU_OPERATOR(CrossEntropy,
                      CrossEntropyOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(CrossEntropyGradient,
                      CrossEntropyGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(CrossEntropy)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInputDim(0, 0)
    .SetDoc(R"DOC(
Operator computes the cross entropy between the input and the label set. In
practice, it is most commonly used at the end of models, after the SoftMax
operator and before the AveragedLoss operator. Note that CrossEntropy
assumes that the soft labels provided are a 2D array of size N x D
(batch size x number of classes). Each row of the 2D label array is the
soft-label distribution for the corresponding input, where each element is the
probability of that class being the correct one. As such, each element must be
between 0 and 1, and all elements in a row must sum to 1. The formula used is:

  Y[i] = -sum_j (label[i][j] * log(X[i][j]))

where X[i][j] is the classifier's predicted probability for example i and
class j, and N is the batch size. Each log has a lower limit for numerical
stability.
)DOC")
    .Input(
        0,
        "X",
        "Input blob from the previous layer, which is almost always "
        "the result of a softmax operation; X is a 2D array of size N x D, "
        "where N is the batch size and D is the number of classes")
    .Input(1, "label", "Blob containing the labels used to compare the input")
    .Output(0, "Y", "Output blob after the cross entropy computation");
OPERATOR_SCHEMA(CrossEntropyGradient)
    .NumInputs(3)
    .NumOutputs(1);

class GetCrossEntropyGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "CrossEntropyGradient", "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(CrossEntropy, GetCrossEntropyGradient);

} // namespace caffe2