Caffe2 - C++ API
A deep learning, cross-platform ML framework
THTensorConv.cpp
1 #ifndef TH_GENERIC_FILE
2 #define TH_GENERIC_FILE "TH/generic/THTensorConv.cpp"
3 #else
4 
5 /*
6  2D Input, 2D kernel : cross-correlate the given image with the given kernel (valid mode).
7 */
8 void THTensor_(validXCorr2Dptr)(scalar_t *r_,
9  scalar_t alpha,
10  scalar_t *t_, int64_t ir, int64_t ic,
11  scalar_t *k_, int64_t kr, int64_t kc,
12  int64_t sr, int64_t sc)
13 {
14  int64_t or_ = (ir - kr) / sr + 1;
15  int64_t oc = (ic - kc) / sc + 1;
16 
17  int64_t xx, yy, kx, ky;
18 
19  if ((sc != 1) || (oc < 4)) {
20  /* regular convolution */
21  for(yy = 0; yy < or_; yy++) {
22  for(xx = 0; xx < oc; xx++) {
23  /* Dot product in two dimensions... (between input image and the mask) */
24  scalar_t *pi_ = t_ + yy*sr*ic + xx*sc;
25  scalar_t *pw_ = k_;
26  scalar_t sum = 0;
27  for(ky = 0; ky < kr; ky++) {
28  for(kx = 0; kx < kc; kx++) {
29  sum += pi_[kx]*pw_[kx];
30  }
31  pi_ += ic; /* next input line */
32  pw_ += kc; /* next mask line */
33  }
34  /* Update output */
35  *r_++ += alpha*sum;
36  }
37  }
38 
39  } else {
40  /* SSE-based convolution */
41  for(yy = 0; yy < or_; yy++) {
42  scalar_t *pi_ = t_ + yy*sr*ic;
43  scalar_t *pw_ = k_;
44  for (ky = 0; ky < kr; ky++) {
45  scalar_t *pis_ = pi_;
46  for (kx = 0; kx < kc; kx++) {
47  THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc);
48  pis_++;
49  }
50  pi_ += ic; /* next input line */
51  pw_ += kc; /* next mask line */
52  }
53  r_ += oc;
54  }
55  }
56 }
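The scalar branch above is a strided, accumulating dot product: each output element gathers a kr x kc window of the input, scales the window sum by alpha, and adds it into r_. Below is a minimal standalone sketch of the same arithmetic on plain float buffers; the helper name, the sizes, and the driver are illustrative assumptions, not part of the TH API.

/* Standalone sketch of the scalar path of validXCorr2Dptr. */
#include <stdio.h>
#include <stdint.h>

static void valid_xcorr2d(float *r, float alpha,
                          const float *t, int64_t ir, int64_t ic,
                          const float *k, int64_t kr, int64_t kc,
                          int64_t sr, int64_t sc)
{
  int64_t orows = (ir - kr) / sr + 1;   /* same output-size formula as above */
  int64_t ocols = (ic - kc) / sc + 1;
  for (int64_t yy = 0; yy < orows; yy++) {
    for (int64_t xx = 0; xx < ocols; xx++) {
      float sum = 0;
      for (int64_t ky = 0; ky < kr; ky++)
        for (int64_t kx = 0; kx < kc; kx++)
          sum += t[(yy*sr + ky)*ic + xx*sc + kx] * k[ky*kc + kx];
      *r++ += alpha * sum;              /* accumulate, as the TH kernel does */
    }
  }
}

int main(void) {
  float img[3*3] = {1,2,3, 4,5,6, 7,8,9};
  float ker[2*2] = {1,0, 0,1};
  float out[2*2] = {0};                 /* (3-2)/1+1 = 2 rows and 2 cols */
  valid_xcorr2d(out, 1.0f, img, 3, 3, ker, 2, 2, 1, 1);
  for (int i = 0; i < 4; i++) printf("%g ", out[i]);   /* 6 8 12 14 */
  printf("\n");
  return 0;
}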
57 
58 /*
59  2D Input, 2D kernel : convolve the given image with the given kernel (valid mode; the kernel is traversed flipped).
60 */
61 void THTensor_(validConv2Dptr)(scalar_t *r_,
62  scalar_t alpha,
63  scalar_t *t_, int64_t ir, int64_t ic,
64  scalar_t *k_, int64_t kr, int64_t kc,
65  int64_t sr, int64_t sc)
66 {
67  int64_t or_ = (ir - kr) / sr + 1;
68  int64_t oc = (ic - kc) / sc + 1;
69 
70  int64_t xx, yy, kx, ky;
71 
72  if ((sc != 1) || (oc < 4)) {
73  /* regular convolution */
74  for(yy = 0; yy < or_; yy++) {
75  for(xx = 0; xx < oc; xx++) {
76  /* Dot product in two dimensions... (between input image and the mask) */
77  scalar_t *pi_ = t_ + yy*sr*ic + xx*sc;
78  scalar_t *pw_ = k_ + kr*kc - 1;
79  scalar_t sum = 0;
80  for(ky = 0; ky < kr; ky++) {
81  for(kx = 0; kx < kc; kx++) {
82  sum += pi_[kx]*pw_[-kx];
83  }
84  pi_ += ic; /* next input line */
85  pw_ -= kc; /* next mask line */
86  }
87  /* Update output */
88  *r_++ += alpha*sum;
89  }
90  }
91 
92  } else {
93  /* SSE-based convolution */
94  for(yy = 0; yy < or_; yy++) {
95  scalar_t *pw_ = k_ + kr*kc - 1;
96  scalar_t *pi_ = t_ + yy*sr*ic;
97  for (ky = 0; ky < kr; ky++) {
98  scalar_t *pis_ = pi_;
99  for (kx = 0; kx < kc; kx++) {
100  THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc);
101  pis_++;
102  }
103  pi_ += ic; /* next input line */
104  pw_ -= kc; /* next mask line */
105  }
106  r_ += oc;
107  }
108  }
109 }
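validConv2Dptr differs from validXCorr2Dptr only in kernel traversal: pw_ starts at the last kernel element (k_ + kr*kc - 1) and walks backwards, so the routine computes a true convolution, i.e. a cross-correlation against the 180-degree-rotated kernel. A small sketch of that flip on a contiguous buffer follows; flip2d and the driver are illustrative, not TH functions.

/* Sketch: reversing the flat kr x kc buffer rotates the kernel by 180 degrees,
   which is exactly what the pw_[-kx] / pw_ -= kc indexing above implements. */
#include <stdio.h>
#include <stdint.h>

static void flip2d(const float *k, float *kf, int64_t kr, int64_t kc)
{
  for (int64_t i = 0; i < kr * kc; i++)
    kf[i] = k[kr * kc - 1 - i];     /* reverse rows and columns in one pass */
}

int main(void) {
  float k[2*3]  = {1, 2, 3,
                   4, 5, 6};
  float kf[2*3];
  flip2d(k, kf, 2, 3);
  for (int i = 0; i < 6; i++) printf("%g ", kf[i]);   /* 6 5 4 3 2 1 */
  printf("\n");
  return 0;
}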
110 
111 /*
112  2D Input, 2D kernel : convolve given image with the given kernel, full convolution.
113 */
114 void THTensor_(fullConv2Dptr)(scalar_t *r_,
115  scalar_t alpha,
116  scalar_t *t_, int64_t ir, int64_t ic,
117  scalar_t *k_, int64_t kr, int64_t kc,
118  int64_t sr, int64_t sc)
119 {
120  int64_t oc = (ic - 1) * sc + kc;
121 
122  int64_t xx, yy, kx, ky;
123 
124  if ((sc != 1) || (ic < 4)) {
125  /* regular convolution */
126  for(yy = 0; yy < ir; yy++) {
127  for(xx = 0; xx < ic; xx++) {
128  /* Outer product in two dimensions... (between input image and the mask) */
129  scalar_t *po_ = r_ + yy*sr*oc + xx*sc;
130  scalar_t *pw_ = k_;
131  for(ky = 0; ky < kr; ky++)
132  {
133  scalar_t z = *t_ * alpha;
134  for(kx = 0; kx < kc; kx++) {
135  po_[kx] += z * pw_[kx];
136  }
137  po_ += oc; /* next input line */
138  pw_ += kc; /* next mask line */
139  }
140  t_++;
141  }
142  }
143 
144  } else {
145  /* SSE-based convolution */
146  for(yy = 0; yy < ir; yy++) {
147  scalar_t *po_ = r_ + yy*sr*oc;
148  scalar_t *pw_ = k_;
149  for (ky = 0; ky < kr; ky++) {
150  scalar_t *pos_ = po_;
151  for (kx = 0; kx < kc; kx++) {
152  THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic);
153  pos_++;
154  }
155  po_ += oc; /* next input line */
156  pw_ += kc; /* next mask line */
157  }
158  t_ += ic;
159  }
160  }
161 }
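The full-mode routines work scatter-style rather than gather-style: every input element deposits an alpha-scaled copy of the kernel into an output of size ((ir-1)*sr + kr) x ((ic-1)*sc + kc). A standalone sketch of that scatter, with an assumed 1x1 input so the result is just the scaled kernel; names and sizes are illustrative, not the TH API.

/* Sketch of the scalar path of fullConv2Dptr (scatter-add of the kernel). */
#include <stdio.h>
#include <stdint.h>

static void full_conv2d(float *r, float alpha,
                        const float *t, int64_t ir, int64_t ic,
                        const float *k, int64_t kr, int64_t kc,
                        int64_t sr, int64_t sc)
{
  int64_t ocols = (ic - 1) * sc + kc;        /* same size formula as above */
  for (int64_t yy = 0; yy < ir; yy++)
    for (int64_t xx = 0; xx < ic; xx++) {
      float z = t[yy * ic + xx] * alpha;
      for (int64_t ky = 0; ky < kr; ky++)
        for (int64_t kx = 0; kx < kc; kx++)
          r[(yy * sr + ky) * ocols + xx * sc + kx] += z * k[ky * kc + kx];
    }
}

int main(void) {
  float t[1]   = {2};                        /* 1x1 input */
  float k[2*2] = {1, 2, 3, 4};
  float r[2*2] = {0};                        /* (1-1)*1+2 = 2 rows and 2 cols */
  full_conv2d(r, 1.0f, t, 1, 1, k, 2, 2, 1, 1);
  printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);   /* 2 4 6 8 */
  return 0;
}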
162 
163 /*
164  2D Input, 2D kernel : cross-correlate the given image with the given kernel, full (zero-padding) mode.
165 */
166 void THTensor_(fullXCorr2Dptr)(scalar_t *r_,
167  scalar_t alpha,
168  scalar_t *t_, int64_t ir, int64_t ic,
169  scalar_t *k_, int64_t kr, int64_t kc,
170  int64_t sr, int64_t sc)
171 {
172  int64_t oc = (ic - 1) * sc + kc;
173 
174  int64_t xx, yy, kx, ky;
175 
176  if ((sc != 1) || (ic < 4)) {
177  /* regular convolution */
178  for(yy = 0; yy < ir; yy++) {
179  for(xx = 0; xx < ic; xx++) {
180  /* Outer product in two dimensions... (between input image and the mask) */
181  scalar_t *po_ = r_ + yy*sr*oc + xx*sc;
182  scalar_t *pw_ = k_ + kr*kc -1;
183  int64_t kx, ky;
184  for(ky = 0; ky < kr; ky++)
185  {
186  scalar_t z = *t_ * alpha;
187  for(kx = 0; kx < kc; kx++) {
188  po_[kx] += z * pw_[-kx];
189  }
190  po_ += oc; /* next input line */
191  pw_ -= kc; /* next mask line */
192  }
193  t_++;
194  }
195  }
196 
197  } else {
198  /* SSE-based convolution */
199  for(yy = 0; yy < ir; yy++) {
200  scalar_t *po_ = r_ + yy*sr*oc;
201  scalar_t *pw_ = k_ + kr*kc -1;
202  for (ky = 0; ky < kr; ky++) {
203  scalar_t *pos_ = po_;
204  for (kx = 0; kx < kc; kx++) {
205  THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic);
206  pos_++;
207  }
208  po_ += oc; /* next input line */
209  pw_ -= kc; /* next mask line */
210  }
211  t_ += ic;
212  }
213  }
214 }
215 
216 /*
217  2D Input, 2D kernel : cross-correlate the given image with the given kernel, valid mode.
218  for sr,sc=1 this is equivalent to validXCorr2Dptr, but otherwise it is useful for
219  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
220 */
221 void THTensor_(validXCorr2DRevptr)(scalar_t *r_,
222  scalar_t alpha,
223  scalar_t *t_, int64_t ir, int64_t ic,
224  scalar_t *k_, int64_t kr, int64_t kc,
225  int64_t sr, int64_t sc)
226 {
227  int64_t or_ = ir - (kr - 1) * sr;
228  int64_t oc = ic - (kc - 1) * sc;
229 
230  int64_t xx, yy, kx, ky;
231 
232  if ((sc != 1) || (kc < 4)) {
233  /* regular convolution */
234  for(yy = 0; yy < kr; yy++) {
235  for(xx = 0; xx < kc; xx++) {
236  scalar_t *po_ = r_;
237  scalar_t *pi_ = t_ + yy*sr*ic + xx*sc;
238  scalar_t z = *k_++ * alpha;
239 
240  for(ky = 0; ky < or_; ky++) {
241  for(kx = 0; kx < oc; kx++)
242  po_[kx] += z * pi_[kx];
243  pi_ += ic;
244  po_ += oc;
245  }
246  }
247  }
248 
249  } else {
250  /* SSE-based convolution */
251  for(yy = 0; yy < kr; yy++) {
252  for(xx = 0; xx < kc; xx++) {
253  scalar_t *po_ = r_;
254  scalar_t *pi_ = t_ + yy*sr*ic + xx*sc;
255  scalar_t z = *k_++ * alpha;
256 
257  for(ky = 0; ky < or_; ky++) {
258  THVector_(cadd)(po_, po_, pi_, z, oc);
259  pi_ += ic;
260  po_ += oc;
261  }
262  }
263  }
264  }
265 }
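The Rev variant inverts the loop nesting: the outer loops walk kernel positions and, for each one, a strided slice of the input is accumulated into the entire output, which is what makes it convenient for kernel gradients under stride. A 1D analogue, written as a standalone sketch with assumed sizes (not the TH API):

/* 1D analogue of validXCorr2DRevptr; with stride 1 it reduces to a plain
   valid cross-correlation. */
#include <stdio.h>
#include <stdint.h>

static void valid_xcorr1d_rev(float *r, float alpha,
                              const float *t, int64_t ic,
                              const float *k, int64_t kc, int64_t sc)
{
  int64_t oc = ic - (kc - 1) * sc;            /* output length, as above */
  for (int64_t xx = 0; xx < kc; xx++) {
    float z = k[xx] * alpha;                  /* one kernel element at a time */
    for (int64_t i = 0; i < oc; i++)
      r[i] += z * t[xx * sc + i];             /* strided slice of the input */
  }
}

int main(void) {
  float t[5] = {1, 2, 3, 4, 5};
  float k[2] = {10, 1};
  float r[3] = {0};                           /* 5 - (2-1)*2 = 3 */
  valid_xcorr1d_rev(r, 1.0f, t, 5, k, 2, 2);
  printf("%g %g %g\n", r[0], r[1], r[2]);     /* 13 24 35 */
  return 0;
}
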
266 /*
267  3D Input, 3D kernel : cross-correlate the given volume with the given kernel (valid mode).
268 */
269 void THTensor_(validXCorr3Dptr)(scalar_t *r_,
270  scalar_t alpha,
271  scalar_t *t_, int64_t it, int64_t ir, int64_t ic,
272  scalar_t *k_, int64_t kt, int64_t kr, int64_t kc,
273  int64_t st, int64_t sr, int64_t sc)
274 {
275  int64_t ot = (it - kt) / st + 1;
276  int64_t or_ = (ir - kr) / sr + 1;
277  int64_t oc = (ic - kc) / sc + 1;
278 
279  int64_t zz, xx, yy;
280 
281  for (zz = 0; zz < ot; zz++)
282  {
283  for(yy = 0; yy < or_; yy++)
284  {
285  for(xx = 0; xx < oc; xx++)
286  {
287  /* Dot product in two dimensions... (between input image and the mask) */
288  scalar_t *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
289  scalar_t *pw_ = k_;
290  scalar_t sum = 0;
291  int64_t kz, kx, ky;
292  for(kz = 0; kz < kt; kz++)
293  {
294  for(ky = 0; ky < kr; ky++)
295  {
296  for(kx = 0; kx < kc; kx++) {
297  sum += pi_[kx]*pw_[kx];
298  }
299  pi_ += ic; /* next input line */
300  pw_ += kc; /* next mask line */
301  }
302  pi_ += (ir-kr)*ic; /* next input slice */
303  }
304  /* Update output */
305  *r_++ += sum*alpha;
306  }
307  }
308  }
309 }
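The 3D routines only add an outer depth loop; after each kr x kc sweep the input pointer skips (ir - kr)*ic elements to reach the next slice, and the output extent follows the same valid formula per dimension. A size-only sketch with assumed dimensions:

/* Output shape of the valid 3D routines for concrete (assumed) sizes. */
#include <stdio.h>
#include <stdint.h>

static int64_t valid_size(int64_t x, int64_t k, int64_t s) { return (x - k) / s + 1; }

int main(void) {
  int64_t it = 8,  ir = 16, ic = 16;    /* input  depth x rows x cols */
  int64_t kt = 3,  kr = 5,  kc = 5;     /* kernel depth x rows x cols */
  int64_t st = 1,  sr = 2,  sc = 2;
  printf("output: %lld x %lld x %lld\n",
         (long long)valid_size(it, kt, st),
         (long long)valid_size(ir, kr, sr),
         (long long)valid_size(ic, kc, sc));   /* 6 x 6 x 6 */
  return 0;
}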
310 
311 /*
312  3D Input, 3D kernel : convolve given volume with the given kernel.
313 */
314 void THTensor_(validConv3Dptr)(scalar_t *r_,
315  scalar_t alpha,
316  scalar_t *t_, int64_t it, int64_t ir, int64_t ic,
317  scalar_t *k_, int64_t kt, int64_t kr, int64_t kc,
318  int64_t st, int64_t sr, int64_t sc)
319 {
320  int64_t ot = (it - kt) / st + 1;
321  int64_t or_ = (ir - kr) / sr + 1;
322  int64_t oc = (ic - kc) / sc + 1;
323 
324  int64_t zz, xx, yy;
325 
326  for(zz = 0; zz < ot; zz++)
327  {
328  for(yy = 0; yy < or_; yy++)
329  {
330  for(xx = 0; xx < oc; xx++)
331  {
332  /* Dot product in two dimensions... (between input image and the mask) */
333  scalar_t *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
334  scalar_t *pw_ = k_ + kt*kr*kc - 1;
335  scalar_t sum = 0;
336  int64_t kz, kx, ky;
337  for(kz = 0; kz < kt; kz++)
338  {
339  for(ky = 0; ky < kr; ky++)
340  {
341  for(kx = 0; kx < kc; kx++) {
342  sum += pi_[kx]*pw_[-kx];
343  }
344  pi_ += ic; /* next input line */
345  pw_ -= kc; /* next mask line */
346  }
347  pi_ += (ir-kr)*ic; /* next input slice */
348  }
349  /* Update output */
350  *r_++ += alpha*sum;
351  }
352  }
353  }
354 }
355 
356 
357 /*
358  3D Input, 3D kernel : convolve given volume with the given kernel, full convolution.
359 */
360 void THTensor_(fullConv3Dptr)(scalar_t *r_,
361  scalar_t alpha,
362  scalar_t *t_, int64_t it, int64_t ir, int64_t ic,
363  scalar_t *k_, int64_t kt, int64_t kr, int64_t kc,
364  int64_t st, int64_t sr, int64_t sc)
365 {
366  int64_t or_ = (ir - 1) * sr + kr;
367  int64_t oc = (ic - 1) * sc + kc;
368 
369  int64_t zz, xx, yy;
370 
371  for(zz = 0; zz < it; zz++)
372  {
373  for(yy = 0; yy < ir; yy++)
374  {
375  for(xx = 0; xx < ic; xx++)
376  {
377  /* Outer product in two dimensions... (between input image and the mask) */
378  scalar_t *po_ = r_ + zz*st*or_*oc + yy*sr*oc + xx*sc;
379  scalar_t *pw_ = k_;
380  int64_t kz, kx, ky;
381  /* printf("Output Plane : %ld,%ld,%ld, input val=%g\n",zz,yy,xx,*t_); */
382  for(kz = 0; kz < kt; kz++)
383  {
384  for(ky = 0; ky < kr; ky++)
385  {
386  scalar_t z = *t_ * alpha;
387  for(kx = 0; kx < kc; kx++) {
388  /* printf("o=%g,k=%g," , po_[kx],pw_[kx]); */
389  po_[kx] += z * pw_[kx];
390  /* printf("o=%g " , po_[kx]); */
391  }
392  /* printf("\n"); */
393  po_ += oc; /* next input line */
394  pw_ += kc; /* next mask line */
395  }
396  po_ += (or_-kr)*oc; /* next output slice */
397  /* printf("\n"); */
398  }
399  t_++;
400  }
401  }
402  }
403 }
404 
405 /*
406  3D Input, 3D kernel : cross-correlate the given volume with the given kernel, full mode.
407 */
408 void THTensor_(fullXCorr3Dptr)(scalar_t *r_,
409  scalar_t alpha,
410  scalar_t *t_, int64_t it, int64_t ir, int64_t ic,
411  scalar_t *k_, int64_t kt, int64_t kr, int64_t kc,
412  int64_t st, int64_t sr, int64_t sc)
413 {
414  int64_t or_ = (ir - 1) * sr + kr;
415  int64_t oc = (ic - 1) * sc + kc;
416 
417  int64_t zz, xx, yy;
418 
419  for(zz = 0; zz < it; zz++)
420  {
421  for(yy = 0; yy < ir; yy++)
422  {
423  for(xx = 0; xx < ic; xx++)
424  {
425  /* Outer product in two dimensions... (between input image and the mask) */
426  scalar_t *po_ = r_ + zz*st*or_*oc + yy*sr*oc + xx*sc;
427  scalar_t *pw_ = k_ + kt*kr*kc -1;
428  int64_t kz, kx, ky;
429  for(kz = 0; kz < kt; kz++)
430  {
431  for(ky = 0; ky < kr; ky++)
432  {
433  scalar_t z = *t_ * alpha;
434  for(kx = 0; kx < kc; kx++) {
435  po_[kx] += z * pw_[-kx];
436  }
437  po_ += oc; /* next input line */
438  pw_ -= kc; /* next mask line */
439  }
440  po_ += (or_-kr)*oc; /* next output slice */
441  }
442  t_++;
443  }
444  }
445  }
446 }
447 
448 /*
449  3D Input, 3D kernel : cross-correlate the given volume with the given kernel, valid mode.
450  for st,sr,sc=1 this is equivalent to validXCorr3Dptr, but otherwise it is useful for
451  calculating derivatives wrt a kernel that is applied with stride st,sr,sc != 1
452 */
453 void THTensor_(validXCorr3DRevptr)(scalar_t *r_,
454  scalar_t alpha,
455  scalar_t *t_, int64_t it, int64_t ir, int64_t ic,
456  scalar_t *k_, int64_t kt, int64_t kr, int64_t kc,
457  int64_t st, int64_t sr, int64_t sc)
458 {
459  int64_t ot = it - (kt - 1) * st;
460  int64_t or_ = ir - (kr - 1) * sr;
461  int64_t oc = ic - (kc - 1) * sc;
462 
463  int64_t zz, xx, yy;
464  for(zz = 0; zz < kt; zz++)
465  {
466  for(yy = 0; yy < kr; yy++)
467  {
468  for(xx = 0; xx < kc; xx++)
469  {
470  scalar_t *po_ = r_;
471  scalar_t *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
472  scalar_t z = *k_++ * alpha;
473  int64_t kz, kx, ky;
474  for(kz = 0; kz < ot; kz++)
475  {
476  for(ky = 0; ky < or_; ky++)
477  {
478  for(kx = 0; kx < oc; kx++)
479  po_[kx] += z * pi_[kx];
480  pi_ += ic;
481  po_ += oc;
482  }
483  pi_ += (ir-or_)*ic; /* next input slice */
484  }
485  }
486  }
487  }
488 }
489 
490 void THTensor_(conv2d)(scalar_t* output_data,
491  scalar_t alpha,
492  scalar_t* ptr_input, int64_t nInputRows, int64_t nInputCols,
493  scalar_t* ptr_weight, int64_t nKernelRows, int64_t nKernelCols,
494  int64_t srow, int64_t scol,
495  const char *vf, const char *xc)
496 {
497  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
498  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
499  if (*vf == 'F')
500  if (*xc == 'X')
501  THTensor_(fullXCorr2Dptr)(output_data,
502  alpha,
503  ptr_input, nInputRows, nInputCols,
504  ptr_weight, nKernelRows, nKernelCols,
505  srow, scol);
506  else
507  THTensor_(fullConv2Dptr)(output_data,
508  alpha,
509  ptr_input, nInputRows, nInputCols,
510  ptr_weight, nKernelRows, nKernelCols,
511  srow, scol);
512  else
513  if (*xc == 'X')
514  THTensor_(validXCorr2Dptr)(output_data,
515  alpha,
516  ptr_input, nInputRows, nInputCols,
517  ptr_weight, nKernelRows, nKernelCols,
518  srow, scol);
519  else
520  THTensor_(validConv2Dptr)(output_data,
521  alpha,
522  ptr_input, nInputRows, nInputCols,
523  ptr_weight, nKernelRows, nKernelCols,
524  srow, scol);
525 }
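conv2d is a pure dispatcher on two one-character flags: *vf picks valid ('V') or full ('F') sizing, and *xc picks cross-correlation ('X') or true convolution ('C'). The standalone helper below (not part of TH) simply spells out the four combinations:

/* Sketch: the (vf, xc) flag pair used by conv2d / conv3d above. */
#include <stdio.h>

static const char *conv_mode(const char *vf, const char *xc)
{
  if (*vf == 'F')
    return (*xc == 'X') ? "full cross-correlation  (fullXCorr2Dptr)"
                        : "full convolution        (fullConv2Dptr)";
  return   (*xc == 'X') ? "valid cross-correlation (validXCorr2Dptr)"
                        : "valid convolution       (validConv2Dptr)";
}

int main(void) {
  printf("%s\n", conv_mode("V", "X"));   /* e.g. "V","X" */
  printf("%s\n", conv_mode("F", "C"));
  return 0;
}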
526 
527 void THTensor_(conv3d)(scalar_t* output_data,
528  scalar_t alpha,
529  scalar_t* ptr_input, int64_t nInputDepth, int64_t nInputRows, int64_t nInputCols,
530  scalar_t* ptr_weight, int64_t nKernelDepth, int64_t nKernelRows, int64_t nKernelCols,
531  int64_t sdepth, int64_t srow, int64_t scol,
532  const char *vf, const char *xc)
533 {
534  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
535  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
536  if (*vf == 'F')
537  if (*xc == 'X')
538  THTensor_(fullXCorr3Dptr)(output_data,
539  alpha,
540  ptr_input, nInputDepth, nInputRows, nInputCols,
541  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
542  sdepth, srow, scol);
543  else
544  THTensor_(fullConv3Dptr)(output_data,
545  alpha,
546  ptr_input, nInputDepth, nInputRows, nInputCols,
547  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
548  sdepth, srow, scol);
549  else
550  if (*xc == 'X')
551  THTensor_(validXCorr3Dptr)(output_data,
552  alpha,
553  ptr_input, nInputDepth, nInputRows, nInputCols,
554  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
555  sdepth, srow, scol);
556  else
557  THTensor_(validConv3Dptr)(output_data,
558  alpha,
559  ptr_input, nInputDepth, nInputRows, nInputCols,
560  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
561  sdepth, srow, scol);
562 }
563 
564 int64_t THTensor_(convsize)(int64_t x, int64_t k, int64_t s, const char* vf)
565 {
566  THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'");
567  if (*vf == 'V')
568  return (x-k)/s + 1;
569  else
570  return (x-1)*s + k;
571 }
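convsize gives the output extent along one dimension for either mode; when (x - k) is divisible by s the two modes invert each other. A worked example with assumed numbers (standalone re-statement of the formulas above, not a TH call):

/* Sketch: convsize() with concrete numbers. */
#include <stdio.h>
#include <stdint.h>

static int64_t convsize(int64_t x, int64_t k, int64_t s, char vf)
{
  return (vf == 'V') ? (x - k) / s + 1 : (x - 1) * s + k;
}

int main(void) {
  int64_t x = 7, k = 3, s = 2;
  int64_t v = convsize(x, k, s, 'V');          /* (7-3)/2+1 = 3 */
  int64_t f = convsize(v, k, s, 'F');          /* (3-1)*2+3 = 7 */
  printf("valid: %lld, full of that: %lld\n", (long long)v, (long long)f);
  return 0;
}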
572 
573 
574 /*
575  3D input, 3D kernel, 4D output
576  like rank1 update
577  A <- xx' + beta*A
578  for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for
579  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
580 */
581 void THTensor_(conv2DRevger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol)
582 {
583  int64_t nInputPlane, nInputRows, nInputCols;
584  int64_t nKernelPlane, nKernelRows, nKernelCols;
585  int64_t nOutputRows, nOutputCols;
586  int64_t istride0, kstride0;
587  THTensor *input;
588  THTensor *kernel;
589  scalar_t *input_data;
590  scalar_t *weight_data;
591  scalar_t *output_data;
592  ptrdiff_t nelem;
593  int64_t k;
594 
595  AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes());
596  AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes());
597  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
598  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
599 
600  input = THTensor_(newContiguous)(t_);
601  kernel = THTensor_(newContiguous)(k_);
602 
603  nInputPlane = input->size(0);
604  istride0 = input->stride(0);
605  nInputRows = input->size(1);
606  nInputCols = input->size(2);
607 
608  kstride0 = kernel->stride(0);
609  nKernelPlane = kernel->size(0);
610  nKernelRows = kernel->size(1);
611  nKernelCols = kernel->size(2);
612 
613  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols, 2, "conv2DRevger : Input image is smaller than kernel");
614 
615  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
616  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
617 
618  nelem = THTensor_(nElement)(r_);
619  THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
620 
621  input_data = input->data<scalar_t>();
622  weight_data = kernel->data<scalar_t>();
623  output_data = r_->data<scalar_t>();
624 
625  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
626  {
627  /*THTensor_(zero)(r_);*/
628 
629 #pragma omp parallel for private(k)
630  for (k = 0; k < r_->size(0)*r_->size(1); k++)
631  {
632  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
633  int64_t l;
634  for (l = 0; l < nOutputRows*nOutputCols; l++)
635  ptr_output[l] = 0.0;
636  }
637  }
638  else if (beta != 1)
639  {
640  /*THTensor_(mul)(r_, beta);*/
641 #pragma omp parallel for private(k)
642  for (k = 0; k < r_->size(0)*r_->size(1); k++)
643  {
644  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
645  int64_t l;
646  for (l = 0; l < nOutputRows*nOutputCols; l++)
647  ptr_output[l] *= beta;
648  }
649  }
650 
651 #pragma omp parallel for private(k)
652  for(k = 0; k < nKernelPlane; k++)
653  {
654  int64_t i;
655  /* get kernel */
656  scalar_t *ptr_weight = weight_data+k*kstride0;
657 
658  for(i = 0; i < nInputPlane; i++)
659  {
660  /* get output */
661  scalar_t *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
662  /* get input */
663  scalar_t *ptr_input = input_data+i*istride0;
664 
665  /* do image, kernel convolution */
666  THTensor_(validXCorr2DRevptr)(ptr_output,
667  alpha,
668  ptr_input, nInputRows, nInputCols,
669  ptr_weight, nKernelRows, nKernelCols,
670  srow, scol);
671  /* Next output plane */
672  /* output_data += nOutputCols*nOutputRows; */
673  }
674  }
675  c10::raw::intrusive_ptr::decref(input);
676  c10::raw::intrusive_ptr::decref(kernel);
677 }
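All of the conv*ger/mv/mm/mul drivers share the same beta/alpha contract: the destination is zeroed when it was freshly resized (or beta == 0), otherwise scaled by beta, and the low-level *2Dptr/*3Dptr kernels then accumulate alpha times the convolution into it. A scalar stand-in, purely illustrative:

/* Sketch: y <- beta*y + alpha*conv, with scalars standing in for planes. */
#include <stdio.h>

int main(void) {
  float y = 10.0f;          /* existing output value              */
  float conv = 4.0f;        /* what the convolution would produce */
  float beta = 0.5f, alpha = 2.0f;
  y = beta * y;             /* the "else if (beta != 1)" branch   */
  y += alpha * conv;        /* done inside the pointer kernels    */
  printf("%g\n", y);        /* 13 */
  return 0;
}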
678 
679 
680 /*
681  3D input, 3D kernel, 4D output
682  like rank1 update
683  A <- xx' + beta*A
684  for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for
685  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
686 */
687 void THTensor_(conv2DRevgerm)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol)
688 {
689  int64_t nbatch, nInputPlane, nInputRows, nInputCols;
690  int64_t nKernelPlane, nKernelRows, nKernelCols;
691  int64_t nOutputRows, nOutputCols;
692  int64_t istride0, kstride0, istride1, kstride1;
693  THTensor *input;
694  THTensor *kernel;
695  scalar_t *input_data;
696  scalar_t *weight_data;
697  scalar_t *output_data;
698  ptrdiff_t nelem;
699  int64_t k;
700 
701  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
702  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
703  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
704  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
705 
706  input = THTensor_(newContiguous)(t_);
707  kernel = THTensor_(newContiguous)(k_);
708 
709  istride0 = input->stride(0);
710  istride1 = input->stride(1);
711  nbatch = input->size(0);
712  nInputPlane = input->size(1);
713  nInputRows = input->size(2);
714  nInputCols = input->size(3);
715 
716  kstride0 = kernel->stride(0);
717  kstride1 = kernel->stride(1);
718  nKernelPlane = kernel->size(1);
719  nKernelRows = kernel->size(2);
720  nKernelCols = kernel->size(3);
721 
722  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols, 2, "conv2DRevgerm : Input image is smaller than kernel");
723  THArgCheck(kernel->size(0) == input->size(0), 2, "conv2DRevgerm : Input batch and kernel batch are not the same size");
724 
725  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
726  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
727 
728  nelem = THTensor_(nElement)(r_);
729  THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
730 
731  input_data = input->data<scalar_t>();
732  weight_data = kernel->data<scalar_t>();
733  output_data = r_->data<scalar_t>();
734 
735  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
736  {
737  /*THTensor_(zero)(r_);*/
738 
739 #pragma omp parallel for private(k)
740  for (k = 0; k < r_->size(0)*r_->size(1); k++)
741  {
742  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
743  int64_t l;
744  for (l = 0; l < nOutputRows*nOutputCols; l++)
745  ptr_output[l] = 0.0;
746  }
747  }
748  else if (beta != 1)
749  {
750  /*THTensor_(mul)(r_, beta);*/
751 #pragma omp parallel for private(k)
752  for (k = 0; k < r_->size(0)*r_->size(1); k++)
753  {
754  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
755  int64_t l;
756  for (l = 0; l < nOutputRows*nOutputCols; l++)
757  ptr_output[l] *= beta;
758  }
759  }
760 
761 #pragma omp parallel for private(k)
762  for(k = 0; k < nKernelPlane; k++)
763  {
764  int64_t i;
765  for(i = 0; i < nInputPlane; i++)
766  {
767  int64_t p;
768  for(p = 0; p < nbatch; p++)
769  {
770  /* get kernel */
771  scalar_t *ptr_weight = weight_data + p*kstride0 + k*kstride1;
772  /* get output */
773  scalar_t *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
774  /* get input */
775  scalar_t *ptr_input = input_data + p*istride0 + i*istride1;
776 
777  /* do image, kernel convolution */
778  THTensor_(validXCorr2DRevptr)(ptr_output,
779  alpha,
780  ptr_input, nInputRows, nInputCols,
781  ptr_weight, nKernelRows, nKernelCols,
782  srow, scol);
783  /* Next output plane */
784  /* output_data += nOutputCols*nOutputRows; */
785  }
786  }
787  }
788  c10::raw::intrusive_ptr::decref(input);
789  c10::raw::intrusive_ptr::decref(kernel);
790 }
791 
792 
793 /*
794  3D input, 3D kernel, 4D output
795  like rank1 update
796  A <- xx' + beta*A
797 */
798 void THTensor_(conv2Dger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc)
799 {
800  int64_t nInputPlane, nInputRows, nInputCols;
801  int64_t nKernelPlane, nKernelRows, nKernelCols;
802  int64_t nOutputRows, nOutputCols;
803  int64_t istride0, kstride0;
804 
805  THTensor *input;
806  THTensor *kernel;
807  scalar_t *input_data;
808  scalar_t *weight_data;
809  scalar_t *output_data;
810  ptrdiff_t nelem;
811  int64_t k;
812 
813  AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes());
814  AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes());
815  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
816  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
817  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
818  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
819 
820  input = THTensor_(newContiguous)(t_);
821  kernel = THTensor_(newContiguous)(k_);
822 
823  nInputPlane = input->size(0);
824  istride0 = input->stride(0);
825  nInputRows = input->size(1);
826  nInputCols = input->size(2);
827 
828  kstride0 = kernel->stride(0);
829  nKernelPlane = kernel->size(0);
830  nKernelRows = kernel->size(1);
831  nKernelCols = kernel->size(2);
832 
833  THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel");
834 
835  if (*vf == 'F') {
836  nOutputRows = (nInputRows - 1) * srow + nKernelRows;
837  nOutputCols = (nInputCols - 1) * scol + nKernelCols;
838  } else { /* valid */
839  nOutputRows = (nInputRows - nKernelRows) / srow + 1;
840  nOutputCols = (nInputCols - nKernelCols) / scol + 1;
841  }
842 
843  nelem = THTensor_(nElement)(r_);
844  THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
845 
846  input_data = input->data<scalar_t>();
847  weight_data = kernel->data<scalar_t>();
848  output_data = r_->data<scalar_t>();
849 
850  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
851  {
852  /*THTensor_(zero)(r_);*/
853 #pragma omp parallel for private(k)
854  for (k = 0; k < r_->size(0)*r_->size(1); k++)
855  {
856  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
857  int64_t l;
858  for (l = 0; l < nOutputRows*nOutputCols; l++)
859  ptr_output[l] = 0.0;
860  }
861  }
862  else if (beta != 1)
863  {
864  /*THTensor_(mul)(r_, beta);*/
865 #pragma omp parallel for private(k)
866  for (k = 0; k < r_->size(0)*r_->size(1); k++)
867  {
868  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
869  int64_t l;
870  for (l = 0; l < nOutputRows*nOutputCols; l++)
871  ptr_output[l] *= beta;
872  }
873  }
874 
875 #pragma omp parallel for private(k)
876  for(k = 0; k < nKernelPlane; k++)
877  {
878  int64_t i;
879  /* get kernel */
880  scalar_t *ptr_weight = weight_data+k*kstride0;
881 
882  for(i = 0; i < nInputPlane; i++)
883  {
884  /* get output */
885  scalar_t *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
886  /* get input */
887  scalar_t *ptr_input = input_data+i*istride0;
888 
889  /* do image, kernel convolution */
890  if (*vf == 'F')
891  if (*xc == 'X')
892  THTensor_(fullXCorr2Dptr)(ptr_output,
893  alpha,
894  ptr_input, nInputRows, nInputCols,
895  ptr_weight, nKernelRows, nKernelCols,
896  srow, scol);
897  else
898  THTensor_(fullConv2Dptr)(ptr_output,
899  alpha,
900  ptr_input, nInputRows, nInputCols,
901  ptr_weight, nKernelRows, nKernelCols,
902  srow, scol);
903  else
904  if (*xc == 'X')
905  THTensor_(validXCorr2Dptr)(ptr_output,
906  alpha,
907  ptr_input, nInputRows, nInputCols,
908  ptr_weight, nKernelRows, nKernelCols,
909  srow, scol);
910  else
911  THTensor_(validConv2Dptr)(ptr_output,
912  alpha,
913  ptr_input, nInputRows, nInputCols,
914  ptr_weight, nKernelRows, nKernelCols,
915  srow, scol);
916  /* Next output plane */
917  /* output_data += nOutputCols*nOutputRows; */
918  }
919  }
920  c10::raw::intrusive_ptr::decref(input);
921  c10::raw::intrusive_ptr::decref(kernel);
922 }
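conv2Dger pairs every kernel plane with every input plane (the "rank-1 update" analogy), which is why the result is 4D with layout (nKernelPlane, nInputPlane, nOutputRows, nOutputCols). The sketch below recomputes, with assumed sizes, the same plane offset used for ptr_output in the loop above:

/* Sketch: offset of the (k, i) output plane in the dense 4D result. */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  int64_t nKernelPlane = 4, nInputPlane = 3, oH = 10, oW = 12;
  int64_t k = 2, i = 1;                       /* example plane pair */
  int64_t offset = k * nInputPlane * oH * oW + i * oH * oW;
  printf("plane (%lld,%lld) starts at element %lld of %lld\n",
         (long long)k, (long long)i, (long long)offset,
         (long long)(nKernelPlane * nInputPlane * oH * oW));
  return 0;
}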
923 
924 
925 /*
926  3D input, 4D kernel, 3D output
927  matrix vector product like
928  y <- Ax + beta*y
929 */
930 void THTensor_(conv2Dmv)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc)
931 {
932  int64_t nInputPlane, nInputRows, nInputCols;
933  int64_t nKernelRows, nKernelCols;
934  int64_t nOutputPlane, nOutputRows, nOutputCols;
935  int64_t istride0, kstride0, kstride1;
936  THTensor *input;
937  THTensor* kernel;
938  scalar_t *input_data;
939  scalar_t *weight_data;
940  scalar_t *output_data;
941  ptrdiff_t nelem;
942  int64_t k;
943 
944  AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes());
945  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
946  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
947  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
948  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
949  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
950 
951  input = THTensor_(newContiguous)(t_);
952  if (!(k_->stride(3) == 1) || !(k_->stride(2) == k_->size(3))) {
953  kernel = THTensor_(newContiguous)(k_);
954  } else {
955  THTensor_(retain)(k_);
956  kernel = k_;
957  }
958 
959  nInputPlane = input->size(0);
960  istride0 = input->stride(0);
961  nInputRows = input->size(1);
962  nInputCols = input->size(2);
963 
964  kstride0 = kernel->stride(0);
965  kstride1 = kernel->stride(1);
966  nKernelRows = kernel->size(2);
967  nKernelCols = kernel->size(3);
968  nOutputPlane = kernel->size(0);
969  THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes");
970 
971  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel");
972 
973  if (*vf == 'F') {
974  nOutputRows = (nInputRows - 1) * srow + nKernelRows;
975  nOutputCols = (nInputCols - 1) * scol + nKernelCols;
976  } else { /* valid */
977  nOutputRows = (nInputRows - nKernelRows) / srow + 1;
978  nOutputCols = (nInputCols - nKernelCols) / scol + 1;
979  }
980 
981  nelem = THTensor_(nElement)(r_);
982  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);
983 
984  input_data = input->data<scalar_t>();
985  weight_data = kernel->data<scalar_t>();
986  output_data = r_->data<scalar_t>();
987 
988  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
989  {
990  /*THTensor_(zero)(r_);*/
991 #pragma omp parallel for private(k)
992  for (k = 0; k < r_->size(0); k++)
993  {
994  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
995  int64_t l;
996  for (l = 0; l < nOutputRows*nOutputCols; l++)
997  ptr_output[l] = 0.0;
998  }
999  }
1000  else if (beta != 1)
1001  {
1002  /*THTensor_(mul)(r_, beta);*/
1003 #pragma omp parallel for private(k)
1004  for (k = 0; k < r_->size(0); k++)
1005  {
1006  scalar_t* ptr_output = output_data + k*nOutputCols*nOutputRows;
1007  int64_t l;
1008  for (l = 0; l < nOutputRows*nOutputCols; l++)
1009  ptr_output[l] *= beta;
1010  }
1011  }
1012 
1013 #pragma omp parallel for private(k)
1014  for(k = 0; k < nOutputPlane; k++)
1015  {
1016  int64_t i;
1017  /* get output */
1018  scalar_t *ptr_output = output_data + k*nOutputCols*nOutputRows;
1019  for(i = 0; i < nInputPlane; i++)
1020  {
1021  /* get kernel */
1022  scalar_t *ptr_weight = weight_data + k*kstride0 + i*kstride1;
1023  /* get input */
1024  scalar_t *ptr_input = input_data + i*istride0;
1025 
1026  /* do image, kernel convolution */
1027  if (*vf == 'F')
1028  if (*xc == 'X')
1029  THTensor_(fullXCorr2Dptr)(ptr_output,
1030  alpha,
1031  ptr_input, nInputRows, nInputCols,
1032  ptr_weight, nKernelRows, nKernelCols,
1033  srow, scol);
1034  else
1035  THTensor_(fullConv2Dptr)(ptr_output,
1036  alpha,
1037  ptr_input, nInputRows, nInputCols,
1038  ptr_weight, nKernelRows, nKernelCols,
1039  srow, scol);
1040  else
1041  if (*xc == 'X')
1042  THTensor_(validXCorr2Dptr)(ptr_output,
1043  alpha,
1044  ptr_input, nInputRows, nInputCols,
1045  ptr_weight, nKernelRows, nKernelCols,
1046  srow, scol);
1047  else
1048  THTensor_(validConv2Dptr)(ptr_output,
1049  alpha,
1050  ptr_input, nInputRows, nInputCols,
1051  ptr_weight, nKernelRows, nKernelCols,
1052  srow, scol);
1053  }
1054  /* Next output plane */
1055  /* output_data += nOutputCols*nOutputRows;*/
1056  }
1057  c10::raw::intrusive_ptr::decref(input);
1058  c10::raw::intrusive_ptr::decref(kernel);
1059 }
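conv2Dmv accumulates over input planes the way a matrix-vector product accumulates over columns: output plane k collects conv(input[i], weight[k][i]) for every i. In the sketch below scalars stand in for whole planes; the numbers are assumptions:

/* Sketch of the conv2Dmv accumulation pattern, y = A x over planes. */
#include <stdio.h>

int main(void) {
  /* "A" is nOutputPlane x nInputPlane, "x" has nInputPlane entries */
  float A[2][3] = {{1, 2, 3}, {4, 5, 6}};
  float x[3]    = {1, 1, 2};
  float y[2]    = {0, 0};
  for (int k = 0; k < 2; k++)
    for (int i = 0; i < 3; i++)
      y[k] += A[k][i] * x[i];    /* conv(input[i], weight[k][i]) in the real code */
  printf("%g %g\n", y[0], y[1]); /* 9 21 */
  return 0;
}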
1060 
1061 
1062 /*
1063  4D input (batched), 4D kernel, 4D output
1064  batched matrix vector product like
1065  y <- Ax + beta*y
1066 */
1067 void THTensor_(conv2Dmm)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc)
1068 {
1069  int64_t nInputPlane, nInputRows, nInputCols;
1070  int64_t nKernelRows, nKernelCols;
1071  int64_t nOutputPlane, nOutputRows, nOutputCols;
1072  int64_t kstride0, kstride1;
1073  THTensor *input;
1074  THTensor* kernel;
1075  int64_t nbatch;
1076  ptrdiff_t nelem;
1077  scalar_t *input_data;
1078  scalar_t *weight_data;
1079  scalar_t *output_data;
1080  int64_t p;
1081 
1082  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
1083  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
1084  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
1085  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
1086  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
1087  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
1088 
1089  input = THTensor_(newContiguous)(t_);
1090  if (!(k_->stride(3) == 1) || !(k_->stride(2) == k_->size(3))) {
1091  kernel = THTensor_(newContiguous)(k_);
1092  } else {
1093  THTensor_(retain)(k_);
1094  kernel = k_;
1095  }
1096 
1097  nbatch = input->size(0);
1098  nInputPlane = input->size(1);
1099  nInputRows = input->size(2);
1100  nInputCols = input->size(3);
1101 
1102  kstride0 = kernel->stride(0);
1103  kstride1 = kernel->stride(1);
1104  nKernelRows = kernel->size(2);
1105  nKernelCols = kernel->size(3);
1106  nOutputPlane = kernel->size(0);
1107  THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes");
1108 
1109  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmm : Input image is smaller than kernel");
1110 
1111  if (*vf == 'F') {
1112  nOutputRows = (nInputRows - 1) * srow + nKernelRows;
1113  nOutputCols = (nInputCols - 1) * scol + nKernelCols;
1114  } else { /* valid */
1115  nOutputRows = (nInputRows - nKernelRows) / srow + 1;
1116  nOutputCols = (nInputCols - nKernelCols) / scol + 1;
1117  }
1118 
1119  nelem = THTensor_(nElement)(r_);
1120  THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols);
1121 
1122  input_data = input->data<scalar_t>();
1123  weight_data = kernel->data<scalar_t>();
1124  output_data = r_->data<scalar_t>();
1125 
1126  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1127  {
1128  /*THTensor_(zero)(r_);*/
1129 #pragma omp parallel for private(p)
1130  for (p=0; p < r_->size(0); p++)
1131  {
1132  int64_t k;
1133  for (k = 0; k < r_->size(1); k++)
1134  {
1135  scalar_t* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
1136  int64_t l;
1137  for (l = 0; l < nOutputRows*nOutputCols; l++)
1138  ptr_output[l] = 0.0;
1139  }
1140  }
1141  }
1142  else if (beta != 1)
1143  {
1144  /*THTensor_(mul)(r_, beta);*/
1145 #pragma omp parallel for private(p)
1146  for(p=0; p < r_->size(0); p++)
1147  {
1148  int64_t k;
1149  for (k = 0; k < r_->size(1); k++)
1150  {
1151  scalar_t* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
1152  int64_t l;
1153  for (l = 0; l < nOutputRows*nOutputCols; l++)
1154  ptr_output[l] *= beta;
1155  }
1156  }
1157  }
1158 
1159 #pragma omp parallel for private(p)
1160  for(p=0; p < nbatch; p++)
1161  {
1162  int64_t k;
1163  for(k = 0; k < nOutputPlane; k++)
1164  {
1165  int64_t i;
1166  /* get output */
1167  scalar_t *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows;
1168  for(i = 0; i < nInputPlane; i++)
1169  {
1170  /* get kernel */
1171  scalar_t *ptr_weight = weight_data + k*kstride0 + i*kstride1;
1172  /* get input */
1173  scalar_t *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols;
1174 
1175  /* do image, kernel convolution */
1176  if (*vf == 'F')
1177  if (*xc == 'X')
1178  THTensor_(fullXCorr2Dptr)(ptr_output,
1179  alpha,
1180  ptr_input, nInputRows, nInputCols,
1181  ptr_weight, nKernelRows, nKernelCols,
1182  srow, scol);
1183  else
1184  THTensor_(fullConv2Dptr)(ptr_output,
1185  alpha,
1186  ptr_input, nInputRows, nInputCols,
1187  ptr_weight, nKernelRows, nKernelCols,
1188  srow, scol);
1189  else
1190  if (*xc == 'X')
1191  THTensor_(validXCorr2Dptr)(ptr_output,
1192  alpha,
1193  ptr_input, nInputRows, nInputCols,
1194  ptr_weight, nKernelRows, nKernelCols,
1195  srow, scol);
1196  else
1197  THTensor_(validConv2Dptr)(ptr_output,
1198  alpha,
1199  ptr_input, nInputRows, nInputCols,
1200  ptr_weight, nKernelRows, nKernelCols,
1201  srow, scol);
1202  }
1203  /* Next output plane */
1204  /* output_data += nOutputCols*nOutputRows;*/
1205  }
1206  }
1207  c10::raw::intrusive_ptr::decref(input);
1208  c10::raw::intrusive_ptr::decref(kernel);
1209 }
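conv2Dmm is the batched form of conv2Dmv: an extra outer loop over nbatch, with the contiguous input indexed per sample and per plane. The sketch recomputes that ptr_input offset for assumed sizes:

/* Sketch: dense input offset for sample p, plane i (same expression as the
   ptr_input computation in the loop above). */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  int64_t nInputPlane = 3, H = 32, W = 32;
  int64_t p = 2, i = 1;                      /* example sample and plane */
  int64_t off = p * nInputPlane * H * W + i * H * W;
  printf("sample %lld, plane %lld starts at %lld\n",
         (long long)p, (long long)i, (long long)off);   /* 7168 */
  return 0;
}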
1210 
1211 
1212 /*
1213  2D input, 2D kernel, 2D output
1214  scalar multiplication like
1215  y <- x*y + beta*y
1216 */
1217 void THTensor_(conv2Dmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc)
1218 {
1219  THTensor *input;
1220  THTensor* kernel;
1221  int64_t nInputRows;
1222  int64_t nInputCols;
1223  int64_t nKernelRows;
1224  int64_t nKernelCols;
1225  int64_t nOutputRows, nOutputCols;
1226  scalar_t *ptr_input;
1227  scalar_t *ptr_weight;
1228  scalar_t *output_data;
1229  ptrdiff_t nelem;
1230 
1231  AT_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes());
1232  AT_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes());
1233  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
1234  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
1235 
1236  input = THTensor_(newContiguous)(t_);
1237  kernel = THTensor_(newContiguous)(k_);
1238 
1239  nInputRows = input->size(0);
1240  nInputCols = input->size(1);
1241  nKernelRows = kernel->size(0);
1242  nKernelCols = kernel->size(1);
1243 
1244  THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel");
1245 
1246  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1247  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1248 
1249  nelem = THTensor_(nElement)(r_);
1250  THTensor_(resize2d)(r_, nOutputRows, nOutputCols);
1251  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1252  THTensor_(zero)(r_);
1253  else if (beta != 1)
1254  THTensor_(mul)(r_, r_, beta);
1255 
1256  ptr_input = input->data<scalar_t>();
1257  ptr_weight = kernel->data<scalar_t>();
1258  output_data = r_->data<scalar_t>();
1259 
1260 
1261  /* do image, kernel convolution */
1262  THTensor_(conv2d)(output_data,
1263  alpha,
1264  ptr_input, nInputRows, nInputCols,
1265  ptr_weight, nKernelRows, nKernelCols,
1266  srow, scol, vf, xc);
1267  c10::raw::intrusive_ptr::decref(input);
1268  c10::raw::intrusive_ptr::decref(kernel);
1269 }
1270 
1271 /*
1272  3D input, 3D kernel, 3D output
1273  component wise multiplication like
1274  y <- y.*x + beta*y
1275 */
1276 void THTensor_(conv2Dcmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, int64_t srow, int64_t scol, const char *vf, const char *xc)
1277 {
1278  int64_t nInputPlane, nInputRows, nInputCols;
1279  int64_t nKernelRows, nKernelCols;
1280  int64_t nOutputPlane, nOutputRows, nOutputCols;
1281  int64_t istride0, kstride0;
1282  THTensor *input;
1283  THTensor *kernel;
1284  scalar_t *input_data;
1285  scalar_t *weight_data;
1286  scalar_t *output_data;
1287  ptrdiff_t nelem;
1288  int64_t k;
1289 
1290  AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes());
1291  AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes());
1292  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
1293  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
1294 
1295  input = THTensor_(newContiguous)(t_);
1296  kernel = THTensor_(newContiguous)(k_);
1297 
1298  istride0 = input->stride(0);
1299  nInputPlane = input->size(0);
1300  nInputRows = input->size(1);
1301  nInputCols = input->size(2);
1302 
1303  kstride0 = kernel->stride(0);
1304  nOutputPlane = kernel->size(0);
1305  nKernelRows = kernel->size(1);
1306  nKernelCols = kernel->size(2);
1307 
1308  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
1309  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel");
1310 
1311  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1312  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1313 
1314  nelem = THTensor_(nElement)(r_);
1315  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);
1316 
1317  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1318  {
1319  THTensor_(zero)(r_);
1320  }
1321  else if (beta != 1)
1322  THTensor_(mul)(r_, r_, beta);
1323 
1324  input_data = input->data<scalar_t>();
1325  weight_data = kernel->data<scalar_t>();
1326  output_data = r_->data<scalar_t>();
1327 
1328  for(k = 0; k < nOutputPlane; k++)
1329  {
1330  /* get kernel */
1331  scalar_t *ptr_weight = weight_data + k*kstride0;
1332  /* get input */
1333  scalar_t *ptr_input = input_data + k*istride0;
1334 
1335  /* do image, kernel convolution */
1336  THTensor_(conv2d)(output_data,
1337  alpha,
1338  ptr_input, nInputRows, nInputCols,
1339  ptr_weight, nKernelRows, nKernelCols,
1340  srow, scol, vf, xc);
1341  /* Next output plane */
1342  output_data += nOutputCols*nOutputRows;
1343  }
1344  c10::raw::intrusive_ptr::decref(input);
1345  c10::raw::intrusive_ptr::decref(kernel);
1346 }
1347 
1348 /*
1349  3D input, 3D kernel, 3D output
1350  component wise multiplication like with a permutation map
1351  y <- y.*x + beta*y
1352 */
1353 void THTensor_(conv2Dmap)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, THTensor *map, int64_t srow, int64_t scol, const char *vf, const char *xc)
1354 {
1355  int64_t nInputPlane, nInputRows, nInputCols;
1356  int64_t nKernelRows, nKernelCols;
1357  int64_t nOutputPlane, nOutputRows, nOutputCols;
1358  int64_t istride0, kstride0;
1359  THTensor *input;
1360  THTensor* kernel;
1361  scalar_t *input_data;
1362  scalar_t *weight_data;
1363  scalar_t *output_data;
1364  int64_t nmaps;
1365  ptrdiff_t nelem;
1366  int64_t k;
1367 
1368  AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes());
1369  AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes());
1370  THArgCheck(THTensor_nDimensionLegacyAll(map) == 2 , 4, "map: 2D Tensor expected");
1371  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
1372  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
1373 
1374  input = THTensor_(newContiguous)(t_);
1375  kernel = THTensor_(newContiguous)(k_);
1376 
1377  istride0 = input->stride(0);
1378  nInputPlane = input->size(0);
1379  nInputRows = input->size(1);
1380  nInputCols = input->size(2);
1381 
1382  kstride0 = kernel->stride(0);
1383  nOutputPlane = kernel->size(0);
1384  nKernelRows = kernel->size(1);
1385  nKernelCols = kernel->size(2);
1386 
1387  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
1388  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols)
1389  || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel");
1390 
1391  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1392  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1393 
1394  nelem = THTensor_(nElement)(r_);
1395  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);
1396 
1397  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1398  {
1399  THTensor_(zero)(r_);
1400  }
1401  else if (beta != 1)
1402  THTensor_(mul)(r_, r_, beta);
1403 
1404  input_data = input->data<scalar_t>();
1405  weight_data = kernel->data<scalar_t>();
1406  output_data = r_->data<scalar_t>();
1407 
1408  nmaps = map->size(0);
1409 
1410  for(k = 0; k < nmaps; k++)
1411  {
1412  /* get indices */
1413  int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1;
1414  int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1;
1415 
1416  /* get kernel */
1417  scalar_t *ptr_weight = weight_data + k*kstride0;
1418  /* get input */
1419  scalar_t *ptr_input = input_data + from*istride0;
1420  /* get output */
1421  scalar_t *ptr_output = output_data + to*nOutputRows*nOutputCols;
1422 
1423  /* do image, kernel convolution */
1424  THTensor_(conv2d)(ptr_output,
1425  alpha,
1426  ptr_input, nInputRows, nInputCols,
1427  ptr_weight, nKernelRows, nKernelCols,
1428  srow, scol, vf, xc);
1429  }
1430  c10::raw::intrusive_ptr::decref(input);
1431  c10::raw::intrusive_ptr::decref(kernel);
1432 }
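The map argument of conv2Dmap is a two-column list of 1-based (from, to) plane indices: row k routes input plane "from" through kernel plane k into output plane "to", and the code subtracts 1 before indexing. A plain-array stand-in for the map tensor (illustrative only):

/* Sketch: decoding a connection map the way conv2Dmap does. */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* three connections: kernel 0: in 1 -> out 1, kernel 1: in 2 -> out 1, ... */
  int64_t map[3][2] = { {1, 1}, {2, 1}, {2, 2} };   /* 1-based, as in the code */
  for (int64_t k = 0; k < 3; k++) {
    int64_t from = map[k][0] - 1;   /* the code subtracts 1 before indexing */
    int64_t to   = map[k][1] - 1;
    printf("kernel %lld: input plane %lld -> output plane %lld\n",
           (long long)k, (long long)from, (long long)to);
  }
  return 0;
}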
1433 
1434 /*
1435  4D input, 4D kernel, 5D output
1436  like rank1 update
1437  A <- xx' + beta*A
1438  for st,sr,sc=1 this is equivalent to conv3Dger, but otherwise it is useful for
1439  calculating derivatives wrt a kernel that is applied with stride st,sr,sc != 1
1440 */
1441 void THTensor_(conv3DRevger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_,
1442  int64_t sdepth, int64_t srow, int64_t scol)
1443 {
1444  int64_t nInputPlane, nInputDepth, nInputRows, nInputCols;
1445  int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
1446  int64_t nOutputDepth, nOutputRows, nOutputCols;
1447  int64_t istride0, kstride0;
1448  THTensor *input;
1449  THTensor *kernel;
1450  scalar_t *input_data;
1451  scalar_t *weight_data;
1452  scalar_t *output_data;
1453  ptrdiff_t nelem;
1454  int64_t k, i;
1455 
1456  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
1457  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
1458  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
1459  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
1460  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
1461 
1462  input = THTensor_(newContiguous)(t_);
1463  kernel = THTensor_(newContiguous)(k_);
1464 
1465  nInputPlane = input->size(0);
1466  istride0 = input->stride(0);
1467  nInputDepth = input->size(1);
1468  nInputRows = input->size(2);
1469  nInputCols = input->size(3);
1470 
1471  kstride0 = kernel->stride(0);
1472  nKernelPlane = kernel->size(0);
1473  nKernelDepth= kernel->size(1);
1474  nKernelRows = kernel->size(2);
1475  nKernelCols = kernel->size(3);
1476 
1477  THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel");
1478 
1479  nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth;
1480  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
1481  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
1482 
1483  nelem = THTensor_(nElement)(r_);
1484  THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);
1485 
1486  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1487  {
1488  THTensor_(zero)(r_);
1489  }
1490  else if (beta != 1)
1491  THTensor_(mul)(r_, r_, beta);
1492 
1493  input_data = input->data<scalar_t>();
1494  weight_data = kernel->data<scalar_t>();
1495  output_data = r_->data<scalar_t>();
1496 
1497  for(k = 0; k < nKernelPlane; k++)
1498  {
1499  /* get kernel */
1500  scalar_t *ptr_weight = weight_data+k*kstride0;
1501 
1502  for(i = 0; i < nInputPlane; i++)
1503  {
1504  /* get input */
1505  scalar_t *ptr_input = input_data+i*istride0;
1506 
1507  /* do image, kernel convolution */
1508  THTensor_(validXCorr3DRevptr)(output_data,
1509  alpha,
1510  ptr_input, nInputDepth, nInputRows, nInputCols,
1511  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
1512  sdepth, srow, scol);
1513  /* Next output plane */
1514  output_data += nOutputDepth*nOutputCols*nOutputRows;
1515  }
1516  }
1517  c10::raw::intrusive_ptr::decref(input);
1518  c10::raw::intrusive_ptr::decref(kernel);
1519 }
1520 
1521 
1522 /*
1523  4D input, 4D kernel, 5D output
1524  like rank1 update
1525  A <- xx' + beta*A
1526 */
1527 void THTensor_(conv3Dger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_,
1528  int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc)
1529 {
1530  int64_t nInputPlane, nInputDepth, nInputRows, nInputCols;
1531  int64_t nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
1532  int64_t nOutputDepth, nOutputRows, nOutputCols;
1533  int64_t istride0, kstride0;
1534  THTensor *input;
1535  THTensor *kernel;
1536  scalar_t *input_data;
1537  scalar_t *weight_data;
1538  scalar_t *output_data;
1539  ptrdiff_t nelem;
1540  int64_t k, i;
1541 
1542  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
1543  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
1544  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
1545  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
1546  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
1547  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can be 'V' or 'F'");
1548  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can be 'X' or 'C'");
1549 
1550  input = THTensor_(newContiguous)(t_);
1551  kernel = THTensor_(newContiguous)(k_);
1552 
1553  nInputPlane = input->size(0);
1554  istride0 = input->stride(0);
1555  nInputDepth = input->size(1);
1556  nInputRows = input->size(2);
1557  nInputCols = input->size(3);
1558 
1559  kstride0 = kernel->stride(0);
1560  nKernelPlane = kernel->size(0);
1561  nKernelDepth = kernel->size(1);
1562  nKernelRows = kernel->size(2);
1563  nKernelCols = kernel->size(3);
1564 
1565  THArgCheck((nInputDepth >= nKernelDepth
1566  && nInputRows >= nKernelRows
1567  && nInputCols >= nKernelCols)
1568  || *vf == 'F', 2, "conv3Dger : Input image is smaller than kernel");
1569 
1570  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
1571  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1572  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1573 
1574  nelem = THTensor_(nElement)(r_);
1575  THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);
1576 
1577  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1578  {
1579  THTensor_(zero)(r_);
1580  }
1581  else if (beta != 1)
1582  THTensor_(mul)(r_, r_, beta);
1583 
1584  input_data = input->data<scalar_t>();
1585  weight_data = kernel->data<scalar_t>();
1586  output_data = r_->data<scalar_t>();
1587 
1588  for(k = 0; k < nKernelPlane; k++)
1589  {
1590  /* get kernel */
1591  scalar_t *ptr_weight = weight_data+k*kstride0;
1592 
1593  for(i = 0; i < nInputPlane; i++)
1594  {
1595  /* get input */
1596  scalar_t *ptr_input = input_data+i*istride0;
1597 
1598  /* do image, kernel convolution */
1599  THTensor_(conv3d)(output_data,
1600  alpha,
1601  ptr_input, nInputDepth, nInputRows, nInputCols,
1602  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
1603  sdepth, srow, scol, vf, xc);
1604 
1605  /* Next output plane */
1606  output_data += nOutputDepth*nOutputCols*nOutputRows;
1607  }
1608  }
1609  c10::raw::intrusive_ptr::decref(input);
1610  c10::raw::intrusive_ptr::decref(kernel);
1611 }
1612 
1613 /*
1614  4D input, 5D kernel, 4D output
1615  matrix vector product like
1616  y <- Ax + beta*y
1617 */
1618 void THTensor_(conv3Dmv)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_,
1619  int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc)
1620 {
1621  int64_t nInputPlane, nInputDepth, nInputRows, nInputCols;
1622  int64_t nKernelDepth, nKernelRows, nKernelCols;
1623  int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
1624  int64_t istride0, kstride0, kstride1;
1625  THTensor *input;
1626  THTensor *kernel;
1627  scalar_t *input_data;
1628  scalar_t *weight_data;
1629  scalar_t *output_data;
1630  ptrdiff_t nelem;
1631  int64_t k, i;
1632 
1633  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
1634  AT_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes());
1635  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
1636  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
1637  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
1638  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can be 'V' or 'F'");
1639  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can be 'X' or 'C'");
1640 
1641  input = THTensor_(newContiguous)(t_);
1642  if (!(k_->stride(4) == 1) || !(k_->stride(3) == k_->size(4))) {
1643  kernel = THTensor_(newContiguous)(k_);
1644  } else {
1645  THTensor_(retain)(k_);
1646  kernel = k_;
1647  }
1648 
1649  nInputPlane = input->size(0);
1650  istride0 = input->stride(0);
1651  nInputDepth = input->size(1);
1652  nInputRows = input->size(2);
1653  nInputCols = input->size(3);
1654 
1655  kstride0 = kernel->stride(0);
1656  kstride1 = kernel->stride(1);
1657  nKernelDepth = kernel->size(2);
1658  nKernelRows = kernel->size(3);
1659  nKernelCols = kernel->size(4);
1660  nOutputPlane = kernel->size(0);
1661  THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes");
1662 
1663  THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel");
1664 
1665  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
1666  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1667  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1668 
1669  nelem = THTensor_(nElement)(r_);
1670  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);
1671 
1672  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1673  {
1674  THTensor_(zero)(r_);
1675  }
1676  else if (beta != 1)
1677  THTensor_(mul)(r_, r_, beta);
1678 
1679  input_data = input->data<scalar_t>();
1680  weight_data = kernel->data<scalar_t>();
1681  output_data = r_->data<scalar_t>();
1682 
1683  for(k = 0; k < nOutputPlane; k++)
1684  {
1685  for(i = 0; i < nInputPlane; i++)
1686  {
1687  /* get kernel */
1688  scalar_t *ptr_weight = weight_data + k*kstride0 + i*kstride1;
1689  /* get input */
1690  scalar_t *ptr_input = input_data + i*istride0;
1691 
1692  /* do image, kernel convolution */
1693  THTensor_(conv3d)(output_data,
1694  alpha,
1695  ptr_input, nInputDepth, nInputRows, nInputCols,
1696  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
1697  sdepth, srow, scol, vf, xc);
1698  }
1699  /* Next output plane */
1700  output_data += nOutputDepth*nOutputCols*nOutputRows;
1701  }
1702  c10::raw::intrusive_ptr::decref(input);
1703  c10::raw::intrusive_ptr::decref(kernel);
1704 }
1705 
1706 /*
1707  3D input, 3D kernel, 3D output
1708  scalar multiplication like
1709  y <- x*y + beta*y
1710 */
1711 void THTensor_(conv3Dmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_,
1712  int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc)
1713 {
1714  THTensor *input;
1715  THTensor* kernel;
1716  int64_t nInputDepth;
1717  int64_t nInputRows;
1718  int64_t nInputCols;
1719  int64_t nKernelDepth;
1720  int64_t nKernelRows;
1721  int64_t nKernelCols;
1722  int64_t nOutputDepth, nOutputRows, nOutputCols;
1723  scalar_t *ptr_input;
1724  scalar_t *ptr_weight;
1725  scalar_t *output_data;
1726  ptrdiff_t nelem;
1727 
1728  AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes());
1729  AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes());
1730  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
1731  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
1732  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
1733  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can be 'V' or 'F'");
1734  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can be 'X' or 'C'");
1735 
1736  input = THTensor_(newContiguous)(t_);
1737  kernel = THTensor_(newContiguous)(k_);
1738 
1739  nInputDepth = input->size(0);
1740  nInputRows = input->size(1);
1741  nInputCols = input->size(2);
1742  nKernelDepth = kernel->size(0);
1743  nKernelRows = kernel->size(1);
1744  nKernelCols = kernel->size(2);
1745 
1746  THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel");
1747 
1748  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
1749  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1750  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1751 
1752  nelem = THTensor_(nElement)(r_);
1753  THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols);
1754  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1755  THTensor_(zero)(r_);
1756  else if (beta != 1)
1757  THTensor_(mul)(r_, r_, beta);
1758 
1759  ptr_input = input->data<scalar_t>();
1760  ptr_weight = kernel->data<scalar_t>();
1761  output_data = r_->data<scalar_t>();
1762 
1763 
1764  /* do image, kernel convolution */
1765  THTensor_(conv3d)(output_data,
1766  alpha,
1767  ptr_input, nInputDepth, nInputRows, nInputCols,
1768  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
1769  sdepth, srow, scol, vf, xc);
1770  c10::raw::intrusive_ptr::decref(input);
1771  c10::raw::intrusive_ptr::decref(kernel);
1772 }
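
/*
  A minimal usage sketch (not from the original sources), assuming the float
  instantiation of conv3Dmul above and made-up shapes: one 3D volume convolved
  with one 3D kernel.

    THFloatTensor *input  = THFloatTensor_newWithSize3d(8, 16, 16);
    THFloatTensor *kernel = THFloatTensor_newWithSize3d(3, 3, 3);
    THFloatTensor *output = THFloatTensor_new();
    THFloatTensor_fill(input, 1);
    THFloatTensor_fill(kernel, 1);
    // "F" = full output size, "C" = true convolution (kernel flipped);
    // with unit strides the output becomes 10 x 18 x 18
    THFloatTensor_conv3Dmul(output, 0, 1, input, kernel, 1, 1, 1, "F", "C");
*/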
1773 
1774 /*
1775  4D input, 4D kernel, 4D output
1776  plane-wise convolution ("component-wise multiplication"-like pairing of planes):
1777  y[k] <- alpha * conv3d(x[k], kernel[k]) + beta * y[k]
1778 */
1779 void THTensor_(conv3Dcmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_,
1780  int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc)
1781 {
1782  int64_t nInputPlane, nInputDepth, nInputRows, nInputCols;
1783  int64_t nKernelDepth, nKernelRows, nKernelCols;
1784  int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
1785  int64_t istride0, kstride0;
1786 
1787  THTensor *input;
1788  THTensor *kernel;
1789  scalar_t *input_data;
1790  scalar_t *weight_data;
1791  scalar_t *output_data;
1792  ptrdiff_t nelem;
1793  int64_t k;
1794 
1795  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
1796  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
1797  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
1798  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
1799  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
1800  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
1801 
1802  input = THTensor_(newContiguous)(t_);
1803  kernel = THTensor_(newContiguous)(k_);
1804 
1805  istride0 = input->stride(0);
1806  nInputPlane = input->size(0);
1807  nInputDepth = input->size(1);
1808  nInputRows = input->size(2);
1809  nInputCols = input->size(3);
1810 
1811  kstride0 = kernel->stride(0);
1812  nOutputPlane = kernel->size(0);
1813  nKernelDepth = kernel->size(1);
1814  nKernelRows = kernel->size(2);
1815  nKernelCols = kernel->size(3);
1816 
1817  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
1818  THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel");
1819 
1820  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
1821  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1822  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1823 
1824  nelem = THTensor_(nElement)(r_);
1825  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);
1826 
1827  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1828  {
1829  THTensor_(zero)(r_);
1830  }
1831  else if (beta != 1)
1832  THTensor_(mul)(r_, r_, beta);
1833 
1834  input_data = input->data<scalar_t>();
1835  weight_data = kernel->data<scalar_t>();
1836  output_data = r_->data<scalar_t>();
1837 
1838  for(k = 0; k < nOutputPlane; k++)
1839  {
1840  /* get kernel */
1841  scalar_t *ptr_weight = weight_data + k*kstride0;
1842  /* get input */
1843  scalar_t *ptr_input = input_data + k*istride0;
1844 
1845  /* do image, kernel convolution */
1846  THTensor_(conv3d)(output_data,
1847  alpha,
1848  ptr_input, nInputDepth, nInputRows, nInputCols,
1849  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
1850  sdepth, srow, scol, vf, xc);
1851 
1852  /* Next output plane */
1853  output_data += nOutputDepth*nOutputCols*nOutputRows;
1854  }
1855  c10::raw::intrusive_ptr::decref(input);
1856  c10::raw::intrusive_ptr::decref(kernel);
1857 }
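
/*
  A minimal usage sketch (not from the original sources), assuming the float
  instantiation of conv3Dcmul above and made-up shapes: plane k of the input is
  convolved with plane k of the kernel, so both tensors must carry the same
  number of planes.

    THFloatTensor *input  = THFloatTensor_newWithSize4d(3, 8, 16, 16);
    THFloatTensor *kernel = THFloatTensor_newWithSize4d(3, 3, 3, 3);
    THFloatTensor *output = THFloatTensor_new();
    THFloatTensor_fill(input, 1);
    THFloatTensor_fill(kernel, 1);
    // valid cross-correlation with unit strides: output becomes 3 x 6 x 14 x 14
    THFloatTensor_conv3Dcmul(output, 0, 1, input, kernel, 1, 1, 1, "V", "X");
*/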
1858 
1859 /*
1860  4D input, 4D kernel, 4D output
1861  plane-wise convolution routed through a connection map:
1862  y[to] <- alpha * conv3d(x[from], kernel[k]) + beta * y[to], for each row k = (from, to) of the map
1863 */
1864 void THTensor_(conv3Dmap)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor *t_, THTensor *k_, THTensor *map,
1865  int64_t sdepth, int64_t srow, int64_t scol, const char *vf, const char *xc)
1866 {
1867  int64_t nInputPlane, nInputDepth, nInputRows, nInputCols;
1868  int64_t nKernelDepth, nKernelRows, nKernelCols;
1869  int64_t nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
1870  int64_t istride0, kstride0;
1871 
1872  THTensor *input;
1873  THTensor *kernel;
1874  ptrdiff_t nelem;
1875  scalar_t *input_data;
1876  scalar_t *weight_data;
1877  scalar_t *output_data;
1878  int64_t nmaps;
1879  int64_t k;
1880 
1881  AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes());
1882  AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes());
1883  THArgCheck(THTensor_nDimensionLegacyAll(map) == 2, 4, "map: 2D Tensor expected");
1884  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
1885  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
1886  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can be 'V' or 'F'");
1887  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can be 'X' or 'C'");
1888 
1889  input = THTensor_(newContiguous)(t_);
1890  kernel = THTensor_(newContiguous)(k_);
1891 
1892  istride0 = input->stride(0);
1893  nInputPlane = input->size(0);
1894  nInputDepth = input->size(1);
1895  nInputRows = input->size(2);
1896  nInputCols = input->size(3);
1897 
1898  kstride0 = kernel->stride(0);
1899  nOutputPlane = kernel->size(0);
1900  nKernelDepth = kernel->size(1);
1901  nKernelRows = kernel->size(2);
1902  nKernelCols = kernel->size(3);
1903 
1904  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
1905  THArgCheck((nInputDepth >= nKernelDepth
1906  && nInputRows >= nKernelRows
1907  && nInputCols >= nKernelCols) || *vf == 'F',
1908  2, "conv3Dmap : Input image is smaller than kernel");
1909 
1910  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
1911  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
1912  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
1913 
1914  nelem = THTensor_(nElement)(r_);
1915  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);
1916 
1917  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
1918  {
1919  THTensor_(zero)(r_);
1920  }
1921  else if (beta != 1)
1922  THTensor_(mul)(r_, r_, beta);
1923 
1924  input_data = input->data<scalar_t>();
1925  weight_data = kernel->data<scalar_t>();
1926  output_data = r_->data<scalar_t>();
1927 
1928  nmaps = map->size(0);
1929 
1930  for(k = 0; k < nmaps; k++)
1931  {
1932  /* get indices (map entries are 1-based, hence the -1 below) */
1933  int64_t from = (int64_t)THTensor_(get2d)(map,k,0)-1;
1934  int64_t to = (int64_t)THTensor_(get2d)(map,k,1)-1;
1935 
1936  /* get kernel */
1937  scalar_t *ptr_weight = weight_data + k*kstride0;
1938  /* get input */
1939  scalar_t *ptr_input = input_data + from*istride0;
1940  /* get output */
1941  scalar_t *ptr_output = output_data + to*nOutputDepth*nOutputRows*nOutputCols;
1942 
1943  /* do image, kernel convolution */
1944  THTensor_(conv3d)(ptr_output,
1945  alpha,
1946  ptr_input, nInputDepth, nInputRows, nInputCols,
1947  ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
1948  sdepth, srow, scol, vf, xc);
1949  }
1950  c10::raw::intrusive_ptr::decref(input);
1951  c10::raw::intrusive_ptr::decref(kernel);
1952 }
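
/*
  A minimal usage sketch (not from the original sources), assuming the float
  instantiation of conv3Dmap above and made-up shapes. Each row of the two-column
  map holds a (from, to) pair of 1-based plane indices; kernel plane k is applied
  to input plane "from" and accumulated into output plane "to".

    THFloatTensor *input  = THFloatTensor_newWithSize4d(2, 8, 16, 16);
    THFloatTensor *kernel = THFloatTensor_newWithSize4d(2, 3, 3, 3);
    THFloatTensor *map    = THFloatTensor_newWithSize2d(2, 2);
    THFloatTensor_fill(input, 1);
    THFloatTensor_fill(kernel, 1);
    // swap the two planes: input plane 1 -> output plane 2, input plane 2 -> output plane 1
    THFloatTensor_set2d(map, 0, 0, 1); THFloatTensor_set2d(map, 0, 1, 2);
    THFloatTensor_set2d(map, 1, 0, 2); THFloatTensor_set2d(map, 1, 1, 1);
    THFloatTensor *output = THFloatTensor_new();
    THFloatTensor_conv3Dmap(output, 0, 1, input, kernel, map, 1, 1, 1, "V", "X");
    // output becomes 2 x 6 x 14 x 14
*/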
1953 #endif