Caffe2 - C++ API
A deep learning, cross-platform ML framework
batch_box_cox_op.cc
#include "caffe2/operators/batch_box_cox_op.h"

#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL

namespace caffe2 {

#ifdef CAFFE2_USE_MKL
namespace {

// Helpers for copying parameters.
template <typename T>
void TileArrayIntoVector(const T* a, int D, int K, vector<T>* b) {
  b->resize(K * D);
  for (int k = 0; k < K; k++) {
    std::copy(a, a + D, b->begin() + k * D);
  }
}

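// TileIndicesInPlace replicates the indices in `v` K times, offsetting the
// k-th copy by k * D; e.g., v = {0, 2} with D = 3, K = 2 becomes {0, 2, 3, 5}.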
void TileIndicesInPlace(vector<int>* v, int D, int K) {
  int n = v->size();
  v->resize(K * n);
  for (int k = 1; k < K; k++) {
    for (int j = 0; j < n; j++) {
      (*v)[k * n + j] = (*v)[j] + k * D;
    }
  }
}

// MKL VML function templates.
template <typename T>
void PackV(const int N, const T* a, const int* ia, T* y);
template <typename T>
void UnpackV(const int N, const T* a, T* y, const int* iy);
template <typename T>
void Pow(const int N, const T* a, const T* b, T* y);

#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
  template <>                                                   \
  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
    OriginalFunc(N, a, ia, y);                                  \
  }
DELEGATE_PACKV_FUNCTION(float, vsPackV)
DELEGATE_PACKV_FUNCTION(double, vdPackV)
#undef DELEGATE_PACKV_FUNCTION

#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
  template <>                                                     \
  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
    OriginalFunc(N, a, y, iy);                                    \
  }
DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
#undef DELEGATE_UNPACKV_FUNCTION

#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
  template <>                                                      \
  void Funcname<T>(const int N, const T* a, const T* b, T* y) {    \
    OriginalFunc(N, a, b, y);                                      \
  }
DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
#undef DELEGATE_SIMPLE_BINARY_FUNCTION

} // namespace
#endif // CAFFE2_USE_MKL

template <>
template <typename T>
bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
  auto& data = Input(DATA);
  auto& lambda1 = Input(LAMBDA1);
  auto& lambda2 = Input(LAMBDA2);
  CAFFE_ENFORCE_GE(data.ndim(), 1);
  auto N = data.dim(0);
  auto D = data.size_from_dim(1);

  auto* output = Output(0);
  output->ResizeLike(Input(DATA));
  auto* output_ptr = output->template mutable_data<T>();

  if (data.size() <= 0) {
    return true;
  }

  CAFFE_ENFORCE_EQ(lambda1.size(), D);
  CAFFE_ENFORCE_EQ(lambda2.size(), D);

  const auto* data_ptr = data.template data<T>();
  const auto* lambda1_ptr = lambda1.template data<T>();
  const auto* lambda2_ptr = lambda2.template data<T>();

  const T k_eps = static_cast<T>(1e-6);

#ifdef CAFFE2_USE_MKL
  if (min_block_size_ < 1) {
    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
  } else {
    // Find zero-valued columns, since they get special treatment.
    nonzeros_.clear();
    zeros_.clear();
    nonzeros_.reserve(D);
    zeros_.reserve(D);
    for (TIndex j = 0; j < D; j++) {
      if (lambda1_ptr[j] == 0) {
        zeros_.push_back(j);
      } else {
        nonzeros_.push_back(j);
      }
    }

    // Process K rows at a time for effective vectorization with small rows.
    const int K = std::min(N, (min_block_size_ + D - 1) / D);
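    // E.g., if min_block_size_ is 256 and D is 10, then K = min(N, 26), so
    // each vectorized call below covers roughly min_block_size_ elements.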

    // Avoid copying data if all lambda1 values are zero, or if all are nonzero.
    // In each of the three cases here, when K > 1, first process batches of K
    // rows by replicating the input parameters K times. Then finish row-by-row.
    TypedCachedBuffers<T>& b = GetBuffers<T>();
    if (nonzeros_.size() == D) {
      TIndex i = 0;
      if (K > 1) {
        TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_);
        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_);
        DCHECK_EQ(K * D, b.lambda1_.size());
        DCHECK_EQ(K * D, b.lambda2_.size());
        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
          BoxCoxNonzeroLambda(
              K * D,
              data_ptr,
              b.lambda1_.data(),
              b.lambda2_.data(),
              k_eps,
              output_ptr);
        }
      }
      for (; i < N; i++, data_ptr += D, output_ptr += D) {
        BoxCoxNonzeroLambda(
            D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
      }
    } else if (zeros_.size() == D) {
      TIndex i = 0;
      if (K > 1) {
        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_);
        DCHECK_EQ(K * D, b.lambda2_z_.size());
        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
          BoxCoxZeroLambda(
              K * D, data_ptr, b.lambda2_z_.data(), k_eps, output_ptr);
        }
      }
      for (; i < N; i++, data_ptr += D, output_ptr += D) {
        BoxCoxZeroLambda(D, data_ptr, lambda2_ptr, k_eps, output_ptr);
      }
    } else { // General case of mixed zero and non-zero lambda1 values.
      int n = nonzeros_.size();
      if (K > 1) {
        TileIndicesInPlace(&nonzeros_, 0, K);
        TileIndicesInPlace(&zeros_, 0, K);
      }

      // Gather parameter values into contiguous memory.
      b.lambda1_.resize(nonzeros_.size());
      b.lambda2_.resize(nonzeros_.size());
      b.lambda2_z_.resize(zeros_.size());
      PackV(nonzeros_.size(), lambda1_ptr, nonzeros_.data(), b.lambda1_.data());
      PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data());
      PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data());

      TIndex i = 0;
      b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size()));
      if (K > 1) {
        // Truncate to original size, and re-tile with offsets this time.
        nonzeros_.resize(n);
        zeros_.resize(D - n);
        TileIndicesInPlace(&nonzeros_, D, K);
        TileIndicesInPlace(&zeros_, D, K);
        DCHECK_EQ(nonzeros_.size(), b.lambda1_.size());
        DCHECK_EQ(nonzeros_.size(), b.lambda2_.size());
        DCHECK_EQ(zeros_.size(), b.lambda2_z_.size());
        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
          BoxCoxMixedLambda(
              data_ptr,
              nonzeros_,
              zeros_,
              b.lambda1_.data(),
              b.lambda2_.data(),
              b.lambda2_z_.data(),
              k_eps,
              b.accumulator_.data(),
              output_ptr);
        }
        // Truncate to original size.
        nonzeros_.resize(n);
        zeros_.resize(D - n);
      }
      for (; i < N; i++, data_ptr += D, output_ptr += D) {
        BoxCoxMixedLambda(
            data_ptr,
            nonzeros_,
            zeros_,
            b.lambda1_.data(),
            b.lambda2_.data(),
            b.lambda2_z_.data(),
            k_eps,
            b.accumulator_.data(),
            output_ptr);
      }
    }
  }
#else // CAFFE2_USE_MKL
  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
#endif // CAFFE2_USE_MKL
  return true;
}

template <>
template <typename T>
void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
    TIndex N,
    TIndex D,
    const T* data_ptr,
    const T* lambda1_ptr,
    const T* lambda2_ptr,
    T k_eps,
    T* output_ptr) {
  for (TIndex i = 0; i < N; i++) {
    for (TIndex j = 0; j < D; j++, data_ptr++, output_ptr++) {
      T lambda1_v = lambda1_ptr[j];
      T lambda2_v = lambda2_ptr[j];
      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
      if (lambda1_v == 0) {
        *output_ptr = std::log(tmp);
      } else {
        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
      }
    }
  }
}

#ifdef CAFFE2_USE_MKL

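// Vectorized path for columns with nonzero lambda1:
// out = ((x + lambda2)^lambda1 - 1) / lambda1, with (x + lambda2) clamped
// below at k_eps.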
template <>
template <typename T>
void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
    TIndex D,
    const T* data_ptr,
    const T* lambda1,
    const T* lambda2,
    T k_eps,
    T* out) {
  caffe2::math::Add(D, data_ptr, lambda2, out, &context_);
  for (TIndex j = 0; j < D; j++) {
    out[j] = std::max(out[j], k_eps);
  }
  Pow(D, out, lambda1, out);
  for (TIndex j = 0; j < D; j++) {
    out[j] -= 1.0;
  }
  caffe2::math::Div(D, out, lambda1, out, &context_);
}

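// Vectorized path for columns with lambda1 == 0:
// out = ln(max(x + lambda2, k_eps)).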
template <>
template <typename T>
void BatchBoxCoxOp<CPUContext>::BoxCoxZeroLambda(
    TIndex D,
    const T* data_ptr,
    const T* lambda2,
    T k_eps,
    T* output_ptr) {
  caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_);
  for (TIndex j = 0; j < D; j++) {
    output_ptr[j] = std::max(output_ptr[j], k_eps);
  }
  caffe2::math::Log(D, output_ptr, output_ptr, &context_);
}

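// Mixed case: gather (PackV) the nonzero-lambda1 and zero-lambda1 columns
// into a contiguous buffer, apply the corresponding transform, and scatter
// (UnpackV) the results back into the output row(s).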
template <>
template <typename T>
void BatchBoxCoxOp<CPUContext>::BoxCoxMixedLambda(
    const T* data_ptr,
    const vector<int>& nonzeros,
    const vector<int>& zeros,
    const T* lambda1,
    const T* lambda2,
    const T* lambda2_z,
    T k_eps,
    T* buffer,
    T* output_ptr) {
  PackV(nonzeros.size(), data_ptr, nonzeros.data(), buffer);
  BoxCoxNonzeroLambda(nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
  UnpackV(nonzeros.size(), buffer, output_ptr, nonzeros.data());

  PackV(zeros.size(), data_ptr, zeros.data(), buffer);
  BoxCoxZeroLambda(zeros.size(), buffer, lambda2_z, k_eps, buffer);
  UnpackV(zeros.size(), buffer, output_ptr, zeros.data());
}

// Helpers to access cached buffers.
#define DEFINE_CACHED_BUFFERS(T, tag)                                         \
  template <>                                                                 \
  template <>                                                                 \
  BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>&                           \
  BatchBoxCoxOp<CPUContext>::GetBuffers<T>() {                                \
    if (!buffers_ || buffers_->type_ != tag) {                                \
      buffers_.reset(new BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>()); \
      buffers_->type_ = tag;                                                  \
    }                                                                         \
    return *static_cast<TypedCachedBuffers<T>*>(buffers_.get());              \
  }
DEFINE_CACHED_BUFFERS(float, 1);
DEFINE_CACHED_BUFFERS(double, 2);
#undef DEFINE_CACHED_BUFFERS

#endif // CAFFE2_USE_MKL

namespace {

REGISTER_CPU_OPERATOR(BatchBoxCox, BatchBoxCoxOp<CPUContext>);
OPERATOR_SCHEMA(BatchBoxCox)
    .NumInputs(3)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInput(0)
    .AllowInplace({{0, 0}})
    .SetDoc(R"DOC(
Input `data` is an N * D matrix. Apply the Box-Cox transform to each column.
`lambda1` and `lambda2` are of size D and define the hyper-parameters for
the transform of each column `x` of the input `data`:

    ln(x + lambda2), if lambda1 == 0
    ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0

)DOC")
    .Input(0, "data", "input float or double N * D matrix")
    .Input(1, "lambda1", "tensor of size D with the same type as data")
    .Input(2, "lambda2", "tensor of size D with the same type as data")
    .Output(0, "output", "output matrix with the Box-Cox transform applied");
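// For example, an element x = 1.5 in a column with lambda1 = 2 and
// lambda2 = 0.5 maps to ((1.5 + 0.5)^2 - 1) / 2 = 1.5; in a column with
// lambda1 = 0 it maps to ln(1.5 + 0.5) ~= 0.693.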

GRADIENT_NOT_IMPLEMENTED_YET(BatchBoxCox);
} // namespace
} // namespace caffe2
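
The following is a minimal, self-contained sketch (not part of the Caffe2 source) that mirrors the naive per-element transform implemented by BoxCoxNaive above; the function name BoxCoxReference and the example values are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Applies the Box-Cox transform to an N x D row-major matrix, with one
// (lambda1, lambda2) pair per column, clamping (x + lambda2) at k_eps.
template <typename T>
void BoxCoxReference(
    int N,
    int D,
    const std::vector<T>& data,
    const std::vector<T>& lambda1,
    const std::vector<T>& lambda2,
    T k_eps,
    std::vector<T>* out) {
  out->resize(data.size());
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < D; j++) {
      T tmp = std::max(data[i * D + j] + lambda2[j], k_eps);
      (*out)[i * D + j] = (lambda1[j] == 0)
          ? std::log(tmp)
          : (std::pow(tmp, lambda1[j]) - 1) / lambda1[j];
    }
  }
}

int main() {
  // 2 x 2 example: the first column uses lambda1 = 0 (log branch), the
  // second column uses lambda1 = 2 (power branch).
  std::vector<double> data = {1.5, 1.5, 0.5, 3.0};
  std::vector<double> lambda1 = {0.0, 2.0};
  std::vector<double> lambda2 = {0.5, 0.5};
  std::vector<double> out;
  BoxCoxReference(2, 2, data, lambda1, lambda2, 1e-6, &out);
  for (double v : out) {
    std::printf("%g\n", v);  // 0.693147, 1.5, 0, 5.625
  }
  return 0;
}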