Caffe2 - C++ API
A deep learning, cross platform ML framework
batch_box_cox_op.cc
1 #include "caffe2/operators/batch_box_cox_op.h"
2 
3 #include "caffe2/core/operator.h"
4 #include "caffe2/core/tensor.h"
5 
6 #ifdef CAFFE2_USE_MKL
7 #include <mkl.h>
8 #endif // CAFFE2_USE_MKL
9 
10 namespace caffe2 {
11 
12 #ifdef CAFFE2_USE_MKL
13 namespace {
14 
15 // Helpers for copying parameters.
// Fills *b with K back-to-back copies of the D-element array `a`.
// *b is resized to K * D elements first, so any previous contents are
// discarded or overwritten.
template <typename T>
void TileArrayIntoVector(const T* a, int D, int K, std::vector<T>* b) {
  b->resize(K * D);
  const T* const src_end = a + D;
  auto dst = b->begin();
  for (int copy_idx = 0; copy_idx < K; ++copy_idx) {
    // std::copy returns the position just past the last element written,
    // which is exactly where the next replica starts.
    dst = std::copy(a, src_end, dst);
  }
}
23 
// Expands the index list *v in place to K replicas of itself.
// Replica k (k = 0..K-1) holds the original indices shifted by k * D, so
// with D equal to a row width the result addresses the same columns in K
// consecutive rows; with D == 0 the indices are simply repeated.
void TileIndicesInPlace(std::vector<int>* v, int D, int K) {
  const int base_count = v->size();
  v->resize(K * base_count);
  std::vector<int>& idx = *v;
  for (int replica = 1; replica < K; ++replica) {
    const int shift = replica * D;
    int* const dst = idx.data() + replica * base_count;
    for (int j = 0; j < base_count; ++j) {
      dst[j] = idx[j] + shift;
    }
  }
}
33 
34 // MKL VML function templates.
35 template <typename T>
36 void PackV(const int N, const T* a, const int* ia, T* y);
37 template <typename T>
38 void UnpackV(const int N, const T* a, T* y, const int* iy);
39 template <typename T>
40 void Pow(const int N, const T* a, const T* b, T* y);
41 
42 #define DELEGATE_PACKV_FUNCTION(T, OriginalFunc) \
43  template <> \
44  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
45  OriginalFunc(N, a, ia, y); \
46  }
47 DELEGATE_PACKV_FUNCTION(float, vsPackV)
48 DELEGATE_PACKV_FUNCTION(double, vdPackV)
49 #undef DELEGATE_PACKV_FUNCTION
50 
51 #define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc) \
52  template <> \
53  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
54  OriginalFunc(N, a, y, iy); \
55  }
56 DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
57 DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
58 #undef DELEGATE_UNPACKV_FUNCTION
59 
60 #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
61  template <> \
62  void Funcname<T>(const int N, const T* a, const T* b, T* y) { \
63  OriginalFunc(N, a, b, y); \
64  }
65 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
66 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
67 #undef DELEGATE_SIMPLE_BINARY_FUNCTION
68 
69 } // namespace
70 #endif // CAFFE2_USE_MKL
71 
72 template <>
73 template <typename T>
74 bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
75  auto& data = Input(DATA);
76  auto& lambda1 = Input(LAMBDA1);
77  auto& lambda2 = Input(LAMBDA2);
78  CAFFE_ENFORCE_GE(data.dim(), 1);
79  auto N = data.size(0);
80  auto D = data.size_from_dim(1);
81 
82  auto* output = Output(0, Input(DATA).sizes(), at::dtype<T>());
83  auto* output_ptr = output->template mutable_data<T>();
84 
85  if (data.numel() <= 0) {
86  return true;
87  }
88 
89  CAFFE_ENFORCE_EQ(lambda1.numel(), D);
90  CAFFE_ENFORCE_EQ(lambda2.numel(), D);
91 
92  const auto* data_ptr = data.template data<T>();
93  const auto* lambda1_ptr = lambda1.template data<T>();
94  const auto* lambda2_ptr = lambda2.template data<T>();
95 
96  const T k_eps = static_cast<T>(1e-6);
97 
98 #ifdef CAFFE2_USE_MKL
99  if (min_block_size_ < 1) {
100  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
101  } else {
102  // Find zero-valued columns, since they get special treatment.
103  nonzeros_.clear();
104  zeros_.clear();
105  nonzeros_.reserve(D);
106  zeros_.reserve(D);
107  for (int64_t j = 0; j < D; j++) {
108  if (lambda1_ptr[j] == 0) {
109  zeros_.push_back(j);
110  } else {
111  nonzeros_.push_back(j);
112  }
113  }
114 
115  // Process K rows at a time for effective vectorization with small rows.
116  const int K = std::min(N, (min_block_size_ + D - 1) / D);
117 
118  // Avoid copying data if all lambda1 values are zero, or if all are nonzero.
119  // In each of the three cases here, when K > 1, first process batches of K
120  // rows by replicating the input parameters K times. Then finish row-by-row.
121  TypedCachedBuffers<T>& b = GetBuffers<T>();
122  if (nonzeros_.size() == D) {
123  int64_t i = 0;
124  if (K > 1) {
125  TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_);
126  TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_);
127  DCHECK_EQ(K * D, b.lambda1_.size());
128  DCHECK_EQ(K * D, b.lambda2_.size());
129  for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
130  BoxCoxNonzeroLambda(
131  K * D,
132  data_ptr,
133  b.lambda1_.data(),
134  b.lambda2_.data(),
135  k_eps,
136  output_ptr);
137  }
138  }
139  for (; i < N; i++, data_ptr += D, output_ptr += D) {
140  BoxCoxNonzeroLambda(
141  D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
142  }
143  } else if (zeros_.size() == D) {
144  int64_t i = 0;
145  if (K > 1) {
146  TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_);
147  DCHECK_EQ(K * D, b.lambda2_z_.size());
148  for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
149  BoxCoxZeroLambda(
150  K * D, data_ptr, b.lambda2_z_.data(), k_eps, output_ptr);
151  }
152  }
153  for (; i < N; i++, data_ptr += D, output_ptr += D) {
154  BoxCoxZeroLambda(D, data_ptr, lambda2_ptr, k_eps, output_ptr);
155  }
156  } else { // General case of mixed zero and non-zero lambda1 values.
157  int n = nonzeros_.size();
158  if (K > 1) {
159  TileIndicesInPlace(&nonzeros_, 0, K);
160  TileIndicesInPlace(&zeros_, 0, K);
161  }
162 
163  // Gather parameter values into contiguous memory.
164  b.lambda1_.resize(nonzeros_.size());
165  b.lambda2_.resize(nonzeros_.size());
166  b.lambda2_z_.resize(zeros_.size());
167  PackV(nonzeros_.size(), lambda1_ptr, nonzeros_.data(), b.lambda1_.data());
168  PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data());
169  PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data());
170 
171  int64_t i = 0;
172  b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size()));
173  if (K > 1) {
174  // Truncate to original size, and re-tile with offsets this time.
175  nonzeros_.resize(n);
176  zeros_.resize(D - n);
177  TileIndicesInPlace(&nonzeros_, D, K);
178  TileIndicesInPlace(&zeros_, D, K);
179  DCHECK_EQ(nonzeros_.size(), b.lambda1_.size());
180  DCHECK_EQ(nonzeros_.size(), b.lambda2_.size());
181  DCHECK_EQ(zeros_.size(), b.lambda2_z_.size());
182  for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
183  BoxCoxMixedLambda(
184  data_ptr,
185  nonzeros_,
186  zeros_,
187  b.lambda1_.data(),
188  b.lambda2_.data(),
189  b.lambda2_z_.data(),
190  k_eps,
191  b.accumulator_.data(),
192  output_ptr);
193  }
194  // Truncate to original size.
195  nonzeros_.resize(n);
196  zeros_.resize(D - n);
197  }
198  for (; i < N; i++, data_ptr += D, output_ptr += D) {
199  BoxCoxMixedLambda(
200  data_ptr,
201  nonzeros_,
202  zeros_,
203  b.lambda1_.data(),
204  b.lambda2_.data(),
205  b.lambda2_z_.data(),
206  k_eps,
207  b.accumulator_.data(),
208  output_ptr);
209  }
210  }
211  }
212 #else // CAFFE2_USE_MKL
213  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
214 #endif // CAFFE2_USE_MKL
215  return true;
216 }
217 
218 template <>
219 template <typename T>
220 void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
221  int64_t N,
222  int64_t D,
223  const T* data_ptr,
224  const T* lambda1_ptr,
225  const T* lambda2_ptr,
226  T k_eps,
227  T* output_ptr) {
228  for (int64_t i = 0; i < N; i++) {
229  for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
230  T lambda1_v = lambda1_ptr[j];
231  T lambda2_v = lambda2_ptr[j];
232  T tmp = std::max(*data_ptr + lambda2_v, k_eps);
233  if (lambda1_v == 0) {
234  *output_ptr = std::log(tmp);
235  } else {
236  *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
237  }
238  }
239  }
240 }
241 
242 #ifdef CAFFE2_USE_MKL
243 
244 template <>
245 template <typename T>
246 void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
247  int64_t D,
248  const T* data_ptr,
249  const T* lambda1,
250  const T* lambda2,
251  T k_eps,
252  T* out) {
253  caffe2::math::Add(D, data_ptr, lambda2, out, &context_);
254  for (int64_t j = 0; j < D; j++) {
255  out[j] = std::max(out[j], k_eps);
256  }
257  Pow(D, out, lambda1, out);
258  for (int64_t j = 0; j < D; j++) {
259  out[j] -= 1.0;
260  }
261  caffe2::math::Div(D, out, lambda1, out, &context_);
262 }
263 
264 template <>
265 template <typename T>
266 void BatchBoxCoxOp<CPUContext>::BoxCoxZeroLambda(
267  int64_t D,
268  const T* data_ptr,
269  const T* lambda2,
270  T k_eps,
271  T* output_ptr) {
272  caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_);
273  for (int64_t j = 0; j < D; j++) {
274  output_ptr[j] = std::max(output_ptr[j], k_eps);
275  }
276  caffe2::math::Log(D, output_ptr, output_ptr, &context_);
277 }
278 
279 template <>
280 template <typename T>
281 void BatchBoxCoxOp<CPUContext>::BoxCoxMixedLambda(
282  const T* data_ptr,
283  const vector<int>& nonzeros,
284  const vector<int>& zeros,
285  const T* lambda1,
286  const T* lambda2,
287  const T* lambda2_z,
288  T k_eps,
289  T* buffer,
290  T* output_ptr) {
291  PackV(nonzeros.size(), data_ptr, nonzeros.data(), buffer);
292  BoxCoxNonzeroLambda(nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
293  UnpackV(nonzeros.size(), buffer, output_ptr, nonzeros.data());
294 
295  PackV(zeros.size(), data_ptr, zeros.data(), buffer);
296  BoxCoxZeroLambda(zeros.size(), buffer, lambda2_z, k_eps, buffer);
297  UnpackV(zeros.size(), buffer, output_ptr, zeros.data());
298 }
299 
300 // Helpers to access cached buffers.
301 #define DEFINE_CACHED_BUFFERS(T, tag) \
302  template <> \
303  template <> \
304  BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>& \
305  BatchBoxCoxOp<CPUContext>::GetBuffers<T>() { \
306  if (!buffers_ || buffers_->type_ != tag) { \
307  buffers_.reset(new BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>()); \
308  buffers_->type_ = tag; \
309  } \
310  return *static_cast<TypedCachedBuffers<T>*>(buffers_.get()); \
311  }
312 DEFINE_CACHED_BUFFERS(float, 1);
313 DEFINE_CACHED_BUFFERS(double, 2);
314 #undef DEFINE_CACHED_BUFFERS
315 
316 #endif // CAFFE2_USE_MKL
317 
318 namespace {
319 
320 REGISTER_CPU_OPERATOR(BatchBoxCox, BatchBoxCoxOp<CPUContext>);
321 OPERATOR_SCHEMA(BatchBoxCox)
322  .NumInputs(3)
323  .NumOutputs(1)
324  .IdenticalTypeAndShapeOfInput(0)
325  .AllowInplace({{0, 0}})
326  .SetDoc(R"DOC(
327 Input `data` is a N * D matrix. Apply box-cox transform for each column.
328 `lambda1` and `lambda2` is of size D that defines the hyper-parameters for
329 the transform of each column `x` of the input `data`:
330 
331  ln(x + lambda2), if lambda1 == 0
332  ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0
333 
334 )DOC")
335  .Input(0, "data", "input float or double N * D matrix")
336  .Input(1, "lambda1", "tensor of size D with the same type as data")
337  .Input(2, "lambda2", "tensor of size D with the same type as data")
338  .Output(0, "output", "output matrix that applied box-cox transform");
339 
340 GRADIENT_NOT_IMPLEMENTED_YET(BatchBoxCox);
341 } // namespace
342 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
Definition: static.cpp:70