Caffe2 - C++ API
A deep learning, cross platform ML framework
unfold.c
1 #ifndef TH_GENERIC_FILE
2 #define TH_GENERIC_FILE "THNN/generic/unfold.c"
3 #else
4 
5 /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
6 void THNN_(unfolded_acc)(
7  THTensor *finput,
8  THTensor *input,
9  int kW,
10  int kH,
11  int dW,
12  int dH,
13  int padW,
14  int padH,
15  int nInputPlane,
16  int inputWidth,
17  int inputHeight,
18  int outputWidth,
19  int outputHeight)
20 {
21  // This function assumes that
22  // outputHeight*dH does not overflow a int64_t
23  // outputWidth*dW does not overflow a int64_t
24 
25  int nip;
26 
27  scalar_t *input_data = input->data<scalar_t>();
28  scalar_t *finput_data = finput->data<scalar_t>();
29 
30 #pragma omp parallel for private(nip)
31  for(nip = 0; nip < nInputPlane; nip++)
32  {
33  int kw, kh, y, x;
34  int64_t ix, iy;
35  for(kh = 0; kh < kH; kh++)
36  {
37  for(kw = 0; kw < kW; kw++)
38  {
39  scalar_t *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
40  scalar_t *dst = input_data + nip*((size_t)inputHeight*inputWidth);
41  if (padW > 0 || padH > 0) {
42  int lpad,rpad;
43  for(y = 0; y < outputHeight; y++) {
44  iy = (int64_t)y*dH - padH + kh;
45  if (iy < 0 || iy >= inputHeight) {
46  } else {
47  if (dW==1){
48  ix = 0 - padW + kw;
49  lpad = fmaxf(0,padW-kw);
50  rpad = fmaxf(0,padW-(kW-kw-1));
51  scalar_t *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
52  THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
53  }
54  else{
55  for (x=0; x<outputWidth; x++){
56  ix = (int64_t)x*dW - padW + kw;
57  if (ix < 0 || ix >= inputWidth){
58  }else{
59  scalar_t *dst_slice = dst+(size_t)iy*inputWidth+ix;
60  THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
61  }
62  }
63  }
64  }
65  }
66  } else {
67  for(y = 0; y < outputHeight; y++) {
68  iy = (int64_t)y*dH + kh;
69  ix = 0 + kw;
70  if (dW == 1 ) {
71  scalar_t *dst_slice = dst+(size_t)iy*inputWidth+ix;
72  THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
73  }else{
74  for(x = 0; x < outputWidth; x++) {
75  scalar_t *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
76  THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
77  }
78  }
79  }
80  }
81  }
82  }
83  }
84 }
85 
86 void THNN_(unfolded_copy)(
87  THTensor *finput,
88  THTensor *input,
89  int kW,
90  int kH,
91  int dW,
92  int dH,
93  int padW,
94  int padH,
95  int nInputPlane,
96  int inputWidth,
97  int inputHeight,
98  int outputWidth,
99  int outputHeight)
100 {
101  // This function assumes that
102  // kH*kW does not overflow an int
103  // nInputPlane*kH*kW does not overflow a int64_t
104  // outputHeight*dH does not overflow a int64_t
105  // outputWidth*dW does not overflow a int64_t
106 
107  int64_t k;
108  scalar_t *input_data = input->data<scalar_t>();
109  scalar_t *finput_data = finput->data<scalar_t>();
110 
111 #pragma omp parallel for private(k)
112  for(k = 0; k < (int64_t)nInputPlane*kH*kW; k++) {
113  int64_t nip = k / (kH*kW);
114  int64_t rest = k % (kH*kW);
115  int64_t kh = rest / kW;
116  int64_t kw = rest % kW;
117  int x, y;
118  int64_t ix, iy;
119  scalar_t *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
120  scalar_t *src = input_data + nip*((size_t)inputHeight*inputWidth);
121  if (padW > 0 || padH > 0) {
122  int64_t lpad,rpad;
123  for(y = 0; y < outputHeight; y++) {
124  iy = (int64_t)y*dH - padH + kh;
125  if (iy < 0 || iy >= inputHeight) {
126  memset(dst+(size_t)y*outputWidth, 0, sizeof(scalar_t)*outputWidth);
127  } else {
128  if (dW==1){
129  ix = 0 - padW + kw;
130  lpad = fmaxf(0,padW-kw);
131  rpad = fmaxf(0,padW-(kW-kw-1));
132  if (outputWidth-rpad-lpad <= 0) {
133  memset(dst+(size_t)y*outputWidth, 0, sizeof(scalar_t)*outputWidth);
134  } else {
135  if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(scalar_t)*lpad);
136  memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(scalar_t)*(outputWidth-rpad-lpad));
137  if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(scalar_t)*rpad);
138  }
139  }
140  else{
141  for (x=0; x<outputWidth; x++){
142  ix = (int64_t)x*dW - padW + kw;
143  if (ix < 0 || ix >= inputWidth)
144  memset(dst+(size_t)y*outputWidth+x, 0, sizeof(scalar_t)*1);
145  else
146  memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(scalar_t)*(1));
147  }
148  }
149  }
150  }
151  } else {
152  for(y = 0; y < outputHeight; y++) {
153  iy = (int64_t)y*dH + kh;
154  ix = 0 + kw;
155  if (dW == 1)
156  memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(scalar_t)*outputWidth);
157  else{
158  for (x=0; x<outputWidth; x++)
159  memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(int64_t)x*dW, sizeof(scalar_t)*(1));
160  }
161  }
162  }
163  }
164 }
165 
166 #endif