Caffe2 - C++ API
A deep learning, cross-platform ML framework
softmax_with_loss_op.cc
#include "softmax_with_loss_op.h"
#include "softmax_shared.h"

namespace caffe2 {

REGISTER_CPU_OPERATOR(SoftmaxWithLoss, SoftmaxWithLossOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    SoftmaxWithLossGradient,
    SoftmaxWithLossGradientOp<float, CPUContext>);

// Input: X (logits), T (labels), [weights]; Output: P (probs), avg_loss
OPERATOR_SCHEMA(SoftmaxWithLoss)
    .NumInputs(2, 3)
    .NumOutputs(2)
    .TensorInferenceFunction(
        [](const OperatorDef& def, const vector<TensorShape>& in) {
          ArgumentHelper helper(def);
          auto axis = helper.GetSingleArgument<int32_t>("axis", 1);

          vector<TensorShape> out(2);

          auto logits = in[0]; // Tensor with shape [batch_size, num_classes]
          auto labels = in[1]; // Tensor with shape [batch_size,]
          const auto canonical_axis =
              canonical_axis_index_(axis, logits.dims().size());
          const int batch_size =
              size_to_dim_(canonical_axis, GetDimsVector(logits));
          const int num_classes =
              size_from_dim_(canonical_axis, GetDimsVector(logits));

          out[0].set_data_type(logits.data_type());
          out[0].add_dims(batch_size);
          out[0].add_dims(num_classes);

          return out;
        })
    .SetDoc(R"DOC(
Combined Softmax and Cross-Entropy loss operator.
The operator computes the softmax normalized values for each example in the
batch of the given input, after which cross-entropy loss is computed. This
operator is numerically more stable than separate Softmax and CrossEntropy ops.
The inputs are a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions) and a tensor of labels (ground truth).
The outputs are a tensor with the probability of each label for each example
(N x D) and the averaged loss (a scalar).
Use the argument label_prob=1 to pass labels as a probability distribution over
classes.
An optional third input blob can be used to weight the samples for the loss.
)DOC")
    .Input(0, "logits", "Unscaled log probabilities")
    .Input(1, "labels", "Ground truth")
    .Input(
        2,
        "weight_tensor",
        "Optional blob to be used to weight the samples for the loss.")
    .Output(0, "softmax", "Tensor with softmax probabilities")
    .Output(1, "loss", "Average loss");
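The numerical-stability claim in the doc string comes from working in log space: the forward pass below subtracts the per-row maximum (rowmax_) and keeps log-probabilities until the loss is accumulated. Written out for the single-label case, with logits x_{ij}, label t_i and per-sample weight w_i, the computation the code performs is:

\ell_i \;=\; -\log p_{i,t_i} \;=\; -\bigl(x_{i,t_i} - m_i\bigr) + \log \sum_{j=1}^{D} e^{x_{ij} - m_i},
\qquad m_i = \max_j x_{ij},
\qquad \text{avg\_loss} \;=\; \frac{\text{scale}}{\sum_i w_i} \sum_{i=1}^{N} w_i\, \ell_i .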

// Input: X, T, P, dY; Output: dX
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);

#define DONT_CARE (-1)

template <>
bool SoftmaxWithLossOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels / targets
  auto* P = Output(0); // Probabilities from softmax
  auto* avg_loss = Output(1); // Average loss

  const auto canonical_axis = X.canonical_axis_index(axis_);
  int N, D;
  N = X.size_to_dim(canonical_axis); // batch size
  D = X.size_from_dim(canonical_axis);
  P->ResizeLike(X);

  if (sum_multiplier_.size() != D) {
    sum_multiplier_.Resize(D);
    math::Set<float, CPUContext>(
        D, 1.f, sum_multiplier_.mutable_data<float>(), &context_);
  }

  float* Pdata = P->mutable_data<float>();
  const float* weights = (InputSize() > 2 ? Input(2).data<float>() : nullptr);

  if (label_prob_mode_) {
    CAFFE_ENFORCE_GE(T.ndim(), 2);
    CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
    CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
  } else {
    if (T.ndim() == canonical_axis) {
      CAFFE_ENFORCE_EQ(T.size(), N);
    } else {
      CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
      CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
    }
  }

  rowmax_.Resize(N);
  losses_.Resize(N);

  // When label_prob_mode_ is off, SoftmaxCPU writes log-probabilities into
  // Pdata (hence the math::Exp call after the loss loop below); otherwise it
  // writes probabilities directly.
  SoftmaxCPU(
      context_,
      N,
      D,
      X.data<float>(),
      Pdata,
      losses_.mutable_data<float>(),
      sum_multiplier_.data<float>(),
      !label_prob_mode_,
      rowmax_.mutable_data<float>());

  // Then compute cross entropy
  float loss_sum = 0.0;
  float weight_sum = 0.0;
  if (!label_prob_mode_) {
    const int* label_data = T.data<int>();

    for (int i = 0; i < N; ++i) {
      CAFFE_ENFORCE(
          label_data[i] < D && label_data[i] >= 0,
          "Label seems incorrect: label value must be in [0, num_classes): ",
          label_data[i],
          " vs ",
          D);
      float weight = weights ? weights[i] : 1.0;
      float l = -Pdata[i * D + label_data[i]] * weight;
      loss_sum += l;
      weight_sum += weight;
    }
    math::Exp(N * D, Pdata, Pdata, &context_);
  } else {
    const float* label_data = T.data<float>();

    for (int i = 0; i < N; ++i) {
      float l = 0.0;
      float total_prob = 0.0;
      float weight = weights ? weights[i] : 1.0;
      for (int j = 0; j < D; ++j) {
        CAFFE_ENFORCE(
            label_data[i * D + j] >= 0,
            "Label prob seems incorrect: label prob value must be nonnegative:",
            " ",
            label_data[i * D + j]);
        l += -log(std::max(Pdata[i * D + j], 1e-20f)) * label_data[i * D + j] *
            weight;
        total_prob += label_data[i * D + j];
      }
      loss_sum += l;
      CAFFE_ENFORCE(
          std::abs(total_prob - 1.) < 1e-5f,
          "Label prob seems incorrect: label prob values do not sum to 1.0: ",
          total_prob,
          " vs 1.0 (+/- 1e-5)");
      weight_sum += weight;
    }
  }

  avg_loss->Resize(vector<TIndex>());
  float* avg_loss_data = avg_loss->mutable_data<float>();
  if (weight_sum != 0.0) {
    avg_loss_data[0] = loss_sum * scale_ / weight_sum;
  } else {
    avg_loss_data[0] = 0.0;
  }
  return true;
}
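For readers following the forward pass, here is a minimal, dependency-free sketch of the single-label (label_prob=0) path. The function name, signature, and defaults are assumptions for illustration only; the operator itself delegates the softmax to SoftmaxCPU (softmax_shared.h) as shown above.

#include <algorithm>
#include <cmath>
#include <vector>

// Reference computation for the single-label path: per-row log-softmax with
// the row max subtracted, weighted negative log-likelihood, weighted average.
// N = batch size, D = number of classes.
float SoftmaxWithLossReference(
    const std::vector<float>& X, // N x D logits, row-major
    const std::vector<int>& labels, // N labels, each in [0, D)
    const std::vector<float>& weights, // N weights, or empty for all-ones
    int N,
    int D,
    float scale = 1.0f) {
  float loss_sum = 0.0f;
  float weight_sum = 0.0f;
  for (int i = 0; i < N; ++i) {
    const float* x = &X[i * D];
    // Subtract the row max before exponentiating (the role of rowmax_ above).
    const float m = *std::max_element(x, x + D);
    float z = 0.0f;
    for (int j = 0; j < D; ++j) {
      z += std::exp(x[j] - m);
    }
    // log p_t = (x_t - m) - log(sum_j exp(x_j - m))
    const float log_p = (x[labels[i]] - m) - std::log(z);
    const float w = weights.empty() ? 1.0f : weights[i];
    loss_sum += -log_p * w;
    weight_sum += w;
  }
  return weight_sum != 0.0f ? scale * loss_sum / weight_sum : 0.0f;
}

For a small batch, the value returned by this sketch should agree with the operator's avg_loss output up to floating-point rounding.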

template <>
bool SoftmaxWithLossGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Logits
  auto& T = Input(1); // Labels / targets
  // Input(2) is weights if given
  auto& P = Input(InputSize() - 2); // Probabilities from softmax
  auto& d_avg_loss = Input(InputSize() - 1); // Gradient w.r.t. avg loss
  auto* dX = Output(0);
  const float* weights = (InputSize() > 4 ? Input(2).data<float>() : nullptr);

  const auto canonical_axis = X.canonical_axis_index(axis_);
  int N, D;
  N = X.size_to_dim(canonical_axis); // batch size
  D = X.size_from_dim(canonical_axis);
  dX->ResizeLike(X);

  if (label_prob_mode_) {
    CAFFE_ENFORCE_GE(T.ndim(), 2);
    CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
    CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), D);
  } else {
    if (T.ndim() == canonical_axis) {
      CAFFE_ENFORCE_EQ(T.size(), N);
    } else {
      CAFFE_ENFORCE_EQ(T.size_to_dim(canonical_axis), N);
      CAFFE_ENFORCE_EQ(T.size_from_dim(canonical_axis), 1);
    }
  }

  const float* Pdata = P.data<float>();
  float* dX_data = dX->mutable_data<float>();

  // Start from the softmax probabilities: for every class except the correct
  // label, the gradient of the loss w.r.t. the logit is simply the softmax
  // probability p_j.
  context_.Copy<float, CPUContext, CPUContext>(P.size(), Pdata, dX_data);

  // Compute gradient for the matching labels.
  float total_weight = 0.0f;
  if (!label_prob_mode_) {
    const int* label_data = T.data<int>();

    if (weights) {
      for (int i = 0; i < N; ++i) {
        int idx = i * D + label_data[i];
        float weight = weights[i];
        dX_data[idx] = Pdata[idx] - 1.0;
        for (int d = 0; d < D; d++) {
          int k = i * D + d;
          dX_data[k] *= weight;
        }

        total_weight += weight;
      }
    } else {
      for (int i = 0; i < N; ++i) {
        int idx = i * D + label_data[i];
        dX_data[idx] = Pdata[idx] - 1.0f;
      }
      total_weight = N;
    }
  } else {
    const float* label_data = T.data<float>();

    if (weights) {
      for (int i = 0; i < N; ++i) {
        float weight = weights[i];
        for (int j = 0; j < D; ++j) {
          int idx = i * D + j;
          dX_data[idx] = (Pdata[idx] - label_data[idx]) * weight;
        }
        total_weight += weight;
      }
    } else {
      for (int i = 0; i < N; ++i) {
        for (int j = 0; j < D; ++j) {
          int idx = i * D + j;
          dX_data[idx] = Pdata[idx] - label_data[idx];
        }
      }
      total_weight = N;
    }
  }

  // Scale by scale_ * d_avg_loss / total_weight.
  if (total_weight > 0) {
    math::Scale<float, CPUContext>(
        dX->size(),
        scale_ / total_weight * d_avg_loss.data<float>()[0],
        dX->data<float>(),
        dX_data,
        &context_);
  }
  return true;
}
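In equation form: with p = softmax(x), incoming gradient g = d_avg_loss, per-sample weights w_i, and total weight W = sum_i w_i (W = N when no weight blob is given), the two branches above compute

\frac{\partial L}{\partial x_{ij}} \;=\; \frac{\text{scale}\cdot g}{W}\, w_i \bigl(p_{ij} - \mathbf{1}[j = t_i]\bigr) \quad \text{(hard labels)},
\qquad
\frac{\partial L}{\partial x_{ij}} \;=\; \frac{\text{scale}\cdot g}{W}\, w_i \bigl(p_{ij} - q_{ij}\bigr) \quad \text{(label\_prob mode, soft labels } q_{ij}\text{)}.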

namespace {
class GetSoftmaxWithLossGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> blob_names{
        {I(0), I(1), O(0), GO(1)},
    };

    // Add weight blob, if given
    if (def_.input_size() == 3) {
      blob_names.emplace(blob_names.begin() + 2, I(2));
    }
    return SingleGradientDef(
        "SoftmaxWithLossGradient", "", blob_names, vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(SoftmaxWithLoss, GetSoftmaxWithLossGradient);
} // namespace
} // namespace caffe2
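To make the maker's blob bookkeeping concrete, here is a sketch of the OperatorDef it would emit for a hypothetical forward op with inputs {"X", "labels"} and outputs {"softmax", "loss"}. The blob names and the helper below are assumptions for illustration; in practice the framework derives the names from the forward net.

#include "caffe2/proto/caffe2.pb.h"

// Illustrative only: the gradient op the maker above would generate.
caffe2::OperatorDef MakeExpectedGradientDef() {
  caffe2::OperatorDef grad_def;
  grad_def.set_type("SoftmaxWithLossGradient");
  grad_def.add_input("X");         // I(0): logits
  grad_def.add_input("labels");    // I(1): targets
  grad_def.add_input("softmax");   // O(0): probabilities from the forward pass
  grad_def.add_input("loss_grad"); // GO(1): gradient w.r.t. the averaged loss
  grad_def.add_output("X_grad");   // GI(0): gradient w.r.t. the logits
  return grad_def;
}

When a weight blob is present, I(2) is inserted at position 2, which is why SoftmaxWithLossGradientOp reads the probabilities and d_avg_loss from the back (InputSize() - 2 and InputSize() - 1) and only reads weights from Input(2) when there are more than four inputs.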