Caffe2 - C++ API
A deep learning, cross platform ML framework
im2col_dnnlowp.h
1 #pragma once
2 
3 #ifdef _OPENMP
4 #include <omp.h>
5 #endif
6 
7 #include "caffe2/core/operator.h"
8 #include "caffe2/utils/math.h"
9 #include "caffe2/utils/math/utils.h"
10 
11 namespace caffe2 {
12 
13 namespace math {
14 
15 template <typename T>
16 static void Im2ColNCHW(
17  const int channels,
18  const int height,
19  const int width,
20  const int kernel_h,
21  const int kernel_w,
22  const int dilation_h,
23  const int dilation_w,
24  const int pad_t,
25  const int pad_l,
26  const int pad_b,
27  const int pad_r,
28  const int stride_h,
29  const int stride_w,
30  const T* data_im,
31  T* data_col,
32  CPUContext* /*context*/,
33  const T& zero_point = 0) {
34  const int output_h =
35  (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
36  1;
37  const int output_w =
38  (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
39  1;
40 
41  // Fast path for zero padding and no dilation
42  // From Torch, THNN_(unfolded_copy)
43  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
44  pad_t == 0 && pad_b == 0) {
45  for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
46  const auto nip = k / (kernel_h * kernel_w);
47  const auto rest = k % (kernel_h * kernel_w);
48  const auto kh = rest / kernel_w;
49  const auto kw = rest % kernel_w;
50  auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
51  kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
52  const auto* src = data_im + nip * (height * width);
53  for (auto y = 0; y < output_h; y++) {
54  const auto iy = y * stride_h + kh;
55  const auto ix = kw;
56  if (stride_w == 1) {
57  memcpy(
58  dst + (y * output_w),
59  src + (iy * width + ix),
60  sizeof(T) * output_w);
61  } else {
62  for (auto x = 0; x < output_w; x++) {
63  memcpy(
64  dst + (y * output_w + x),
65  src + (iy * width + ix + x * stride_w),
66  sizeof(T));
67  }
68  }
69  }
70  }
71  return;
72  }
73 
74  // Fast path for equal padding
75  if (pad_l == pad_r && pad_t == pad_b) {
76  // From Intel, https://github.com/BVLC/caffe/pull/3536
77  const int pad_h = pad_t;
78  const int pad_w = pad_l;
79  const int channel_size = height * width;
80  for (int channel = channels; channel--; data_im += channel_size) {
81  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
82  for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
83  int input_row = -pad_h + kernel_row * dilation_h;
84  for (int output_rows = output_h; output_rows; output_rows--) {
85  if (!utils::IsAGeZeroAndALtB(input_row, height)) {
86  for (int output_cols = output_w; output_cols; output_cols--) {
87  *(data_col++) = zero_point;
88  }
89  } else {
90  int input_col = -pad_w + kernel_col * dilation_w;
91  for (int output_col = output_w; output_col; output_col--) {
92  if (utils::IsAGeZeroAndALtB(input_col, width)) {
93  *(data_col++) = data_im[input_row * width + input_col];
94  } else {
95  *(data_col++) = zero_point;
96  }
97  input_col += stride_w;
98  }
99  }
100  input_row += stride_h;
101  }
102  }
103  }
104  }
105  return;
106  }
107 
108  // Baseline
109  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
110  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
111 
112  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
113  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
114 
115  int channels_col = channels * kernel_h * kernel_w;
116  for (int c = 0; c < channels_col; ++c) {
117  int w_offset = c % kernel_w;
118  int h_offset = (c / kernel_w) % kernel_h;
119  int c_im = c / kernel_h / kernel_w;
120  for (int h = 0; h < height_col; ++h) {
121  for (int w = 0; w < width_col; ++w) {
122  int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
123  int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
124  if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
125  data_col[(c * height_col + h) * width_col + w] =
126  data_im[(c_im * height + h_pad) * width + w_pad];
127  else
128  data_col[(c * height_col + h) * width_col + w] = zero_point;
129  }
130  }
131  }
132 }
133 
134 template <typename T>
135 static void Im2ColNdNCHW(
136  const int N,
137  const int /* img_size*/,
138  const int col_size,
139  const int* img_shape,
140  const int* col_shape,
141  const int* kernel_shape,
142  const int* stride,
143  const int* dilation,
144  const int* pad,
145  const T* X_data,
146  T* Y_data,
147  CPUContext* /* context */,
148  const T& zero_point = 0) {
149  const int outer_size = col_shape[0];
150  const int inner_size = col_size / outer_size;
151  const int kernel_size = std::accumulate(
152  kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
153  std::vector<int> d_offset(N, 0);
154  std::vector<int> d_iter(N, 0);
155  for (int i = 0; i < outer_size; ++i) {
156  // Loop over spatial axes in reverse order to compute a per-axis offset.
157  int offset = i;
158  for (int d_i = N - 1; d_i >= 0; --d_i) {
159  d_offset[d_i] = offset % kernel_shape[d_i];
160  offset /= kernel_shape[d_i];
161  }
162  for (int j = 0; j < inner_size; ++j) {
163  // Loop over spatial axes in forward order to compute the indices in the
164  // image and column, and whether the index lies in the padding.
165  const int col_index = i * inner_size + j;
166  int img_index = i / kernel_size;
167  bool is_padding = false;
168  for (int d_i = 0; d_i < N; ++d_i) {
169  const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
170  d_offset[d_i] * dilation[d_i];
171  is_padding |= d_img < 0 || d_img >= img_shape[d_i + 1];
172  img_index = img_index * img_shape[d_i + 1] + d_img;
173  }
174  Y_data[col_index] = is_padding ? zero_point : X_data[img_index];
175  utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
176  }
177  }
178 }
179 
185 template <typename T>
186 static void Im2ColNHWC(
187  const int channels,
188  const int height,
189  const int width,
190  const int kernel_h,
191  const int kernel_w,
192  const int dilation_h,
193  const int dilation_w,
194  const int pad_t,
195  const int pad_l,
196  const int pad_b,
197  const int pad_r,
198  const int stride_h,
199  const int stride_w,
200  const T* data_im,
201  T* data_col,
202  CPUContext* /*context*/,
203  const int groups,
204  const T& zero_point) {
205  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
206  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
207 
208  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
209  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
210 
211 #ifdef _OPENMP
212 #pragma omp parallel for if (!omp_in_parallel())
213 #endif
214  for (int h = 0; h < height_col; ++h) {
215  int h_pad = -pad_t + h * stride_h;
216  T* data_col_temp =
217  data_col + h * width_col * kernel_h * kernel_w * channels;
218  int w_pad = -pad_l;
219  for (int w = 0; w < width_col; ++w) {
220  int r = 0;
221  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
222  int s = 0;
223  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) {
224  if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
225  for (int g = 0; g < groups; ++g) {
226  memcpy(
227  data_col_temp +
228  ((g * kernel_h + r) * kernel_w + s) * (channels / groups),
229  data_im + (ih * width + iw) * channels +
230  g * (channels / groups),
231  sizeof(T) * (channels / groups));
232  }
233  } else {
234  // This should be simply padded with zero.
235  for (int g = 0; g < groups; ++g) {
236  for (int i = 0; i < channels / groups; ++i) {
237  data_col_temp
238  [(((g * kernel_h + r) * kernel_w) + s) *
239  (channels / groups) +
240  i] = zero_point;
241  }
242  }
243  }
244  } // for each iw
245  } // for each ih
246  data_col_temp += kernel_h * kernel_w * channels;
247  w_pad += stride_w;
248  } // for each output pixel
249  } // for each image row
250 }
251 
257 template <typename T>
258 static void Im2Col3DNHWC(
259  const int channels,
260  const int num_frames,
261  const int height,
262  const int width,
263  const int kernel_t,
264  const int kernel_h,
265  const int kernel_w,
266  const int dilation_t,
267  const int dilation_h,
268  const int dilation_w,
269  const int pad_p, // previous frame
270  const int pad_t, // top
271  const int pad_l, // left
272  const int pad_n, // next frame
273  const int pad_b, // bottom
274  const int pad_r, // right
275  const int stride_t,
276  const int stride_h,
277  const int stride_w,
278  const T* data_im,
279  T* data_col,
280  CPUContext* /*context*/,
281  const int groups,
282  const T& zero_point) {
283  const int dkernel_t = dilation_t * (kernel_t - 1) + 1;
284  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
285  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
286 
287  int frame_col = (num_frames + pad_p + pad_n - dkernel_t) / stride_t + 1;
288  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
289  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
290 
291 #ifdef _OPENMP
292 #pragma omp parallel for if (!omp_in_parallel())
293 #endif
294  for (int t = 0; t < frame_col; ++t) {
295  int t_pad = -pad_p + t * stride_t;
296  for (int h = 0; h < height_col; ++h) {
297  int h_pad = -pad_t + h * stride_h;
298  T* data_col_temp = data_col +
299  (t * height_col + h) * width_col * kernel_t * kernel_h * kernel_w *
300  channels;
301  for (int w = 0; w < width_col; ++w) {
302  int w_pad = -pad_l + w * stride_w;
303  int q = 0;
304  for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) {
305  int r = 0;
306  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) {
307  int s = 0;
308  for (int iw = w_pad; iw < w_pad + dkernel_w;
309  iw += dilation_w, ++s) {
310  if (it >= 0 && it < num_frames && ih >= 0 && ih < height &&
311  iw >= 0 && iw < width) {
312  for (int g = 0; g < groups; ++g) {
313  memcpy(
314  data_col_temp +
315  (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) *
316  (channels / groups),
317  data_im + ((it * height + ih) * width + iw) * channels +
318  g * (channels / groups),
319  sizeof(T) * (channels / groups));
320  }
321  } else {
322  // This should be simply padded with zero.
323  for (int g = 0; g < groups; ++g) {
324  for (int i = 0; i < channels / groups; ++i) {
325  data_col_temp
326  [((((g * kernel_t + q) * kernel_h + r) * kernel_w) +
327  s) *
328  (channels / groups) +
329  i] = zero_point;
330  }
331  }
332  }
333  } // for each iw
334  } // for each ih
335  } // for each it
336  data_col_temp += kernel_t * kernel_h * kernel_w * channels;
337  } // for each output pixel
338  } // for each image row
339  } // for each frame
340 }
341 
342 } // namespace math
343 
344 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13