THTensorMath.cpp
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "TH/generic/THTensorMath.cpp"
#else

#include <TH/generic/THTensorApply.hpp>

// HEY YOU!
//
// Looking for a function which used to be in THTensorMath.cpp, but
// can't find it anymore? Check THTensorMoreMath.cpp and
// THTensorEvenMoreMath.cpp. These source files have been split up
// because they were getting too big (a whopping 4669 lines at time
// of writing) and causing MSVC to run out of memory. Did you come
// here because you saw:
//
//    fatal error C1002: compiler is out of heap space in pass 2
//
// Try splitting up the file some more.
//
// At some point, we should reorganize these files in a way that makes
// sense (rather than just having cut the file down the middle, which is
// what I did when I split these up originally).


// Should wrap if the value (a) has a different sign than the divisor (b), but is not 0.
static inline bool modulo_wrap(scalar_t a, scalar_t b) {
  return (a != 0) && (a < 0) != (b < 0);
}

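// Elementwise bitwise OR with a scalar: r_ = t | value. Only defined for
// integer tensor types; float/double/half report an error. The contiguous
// case is parallelized with OpenMP, everything else goes through the
// TH_TENSOR_APPLY2 macros.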
void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value)
{
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
  (void)r_;
  (void)t;
  (void)value;
  return THError("bitor is only supported for integer type tensors");
#else
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int serial_path = 0;
  if (r_Contig && tContig) {
    scalar_t *tp = t->data<scalar_t>();
    scalar_t *rp = r_->data<scalar_t>();
    int64_t i;
    #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
    for (i=0; i<r_Size; i++) {
      rp[i] = tp[i] | value;
    }
  } else {
#ifdef _OPENMP
    int inOMP = omp_in_parallel();
    if (inOMP) {
      serial_path = 1;
    } else {
      TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data | value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
    }
#else
    serial_path = 1;
#endif
  }
  if (serial_path) {
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = *t_data | value;);
  }
#endif
}

void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value)
{
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
  (void)r_;
  (void)t;
  (void)value;
  return THError("bitxor is only supported for integer type tensors");
#else
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int serial_path = 0;
  if (r_Contig && tContig) {
    scalar_t *tp = t->data<scalar_t>();
    scalar_t *rp = r_->data<scalar_t>();
    int64_t i;
    #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i)
    for (i=0; i<r_Size; i++) {
      rp[i] = tp[i] ^ value;
    }
  } else {
#ifdef _OPENMP
    int inOMP = omp_in_parallel();
    if (inOMP) {
      serial_path = 1;
    } else {
      TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data ^ value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
    }
#else
    serial_path = 1;
#endif
  }
  if (serial_path) {
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = *t_data ^ value;);
  }
#endif
}

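// Elementwise clamp: r_[i] = min(max(t[i], min_value), max_value).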
void THTensor_(clamp)(THTensor *r_, THTensor *t, scalar_t min_value, scalar_t max_value)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int serial_path = 0;
  if (r_Contig && tContig) {
    scalar_t *tp = t->data<scalar_t>();
    scalar_t *rp = r_->data<scalar_t>();
    /* scalar_t t_val; */
    int64_t i;
    #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
    for (i=0; i<r_Size; i++)
      rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
  } else {
#ifdef _OPENMP
    int inOMP = omp_in_parallel();
    if (inOMP) {
      serial_path = 1;
    } else {
      TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
    }
#else
    serial_path = 1;
#endif
  }
  if (serial_path) {
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
  }
}

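// r_ = t + value * src. When the operation is in-place (r_ == t) and all
// tensors are contiguous, this reduces to a BLAS axpy; otherwise it falls
// back to the vectorized or apply-macro paths.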
void THTensor_(cadd)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      if(r_ == t) {
        THBlas_(axpy)(THTensor_(nElement)(t), value, src->data<scalar_t>(), 1, r_->data<scalar_t>(), 1);
      } else {
        TH_TENSOR_APPLY3_CONTIG(scalar_t, r_, scalar_t, t, scalar_t, src, THVector_(cadd)(r__data, t_data, src_data, value, r__len););
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data + value * *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data + value * *src_data;);
  }
}

void THTensor_(csub)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src)
{
  THTensor_(cadd)(r_, t, -value, src);
}

void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      TH_TENSOR_APPLY3_CONTIG(scalar_t, r_, scalar_t, t, scalar_t, src, THVector_(cmul)(r__data, t_data, src_data, r__len););
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data * *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data * *src_data;);
  }
}

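// Elementwise power with a scalar exponent. Common exponents (1, 2, 3, and
// for floating types 0.5, -0.5, -1, -2) are special-cased to cheaper
// operations; everything else goes through pow()/powOne().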
void THTensor_(pow)(THTensor *r_, THTensor *t, scalar_t value)
{
  THTensor_(resizeAs)(r_, t);
  if(value == 1) {
    at::Tensor r__wrap = THTensor_wrap(r_);
    at::Tensor t_wrap = THTensor_wrap(t);
    at::_copy_same_type_(r__wrap, t_wrap);
  }
  else if(value == 2){
    THTensor_(cmul)(r_, t, t);
  }
  else if(value == 3){
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = *t_data * *t_data * *t_data;);
  }
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#if defined (TH_REAL_IS_FLOAT)
#define TH_MATH_NAME(fn) fn##f
#else
#define TH_MATH_NAME(fn) fn
#endif
  else if(value == 0.5){
    THTensor_(sqrt)(r_, t);
  }
  else if(value == -0.5){
    THTensor_(rsqrt)(r_, t);
  }
  else if(value == -1){
    THTensor_(cinv)(r_, t);
  }
  else if(value == -2){
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = TH_MATH_NAME(1.0) / (*t_data * *t_data););
  }
  else{
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = TH_MATH_NAME(pow)(*t_data, value););
  }
#undef TH_MATH_NAME
#else
  else {
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = THTensor_(powOne)(*t_data, value););
  }
#endif
}

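// Scalar power helper. Floating types defer to powf/pow; integer types use
// exponentiation by squaring and reject negative exponents.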
scalar_t THTensor_(powOne)(scalar_t x, scalar_t y) {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_HALF)
  return powf(x, y);
#elif defined(TH_REAL_IS_DOUBLE)
  return pow(x, y);
#else
  THArgCheck(y >= 0, 1,
      "Integers to negative integer powers are not allowed");
  scalar_t result = 1;
  while (y) {
    if (y & 1) {
      result *= x;
    }
    y /= 2;
    x *= x;
  }
  return result;
#endif
}

void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++)
        rp[i] = THTensor_(powOne)(tp[i], sp[i]);
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = THTensor_(powOne)(*t_data, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = THTensor_(powOne)(*t_data, *src_data););
  }
}

void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      TH_TENSOR_APPLY3_CONTIG(scalar_t, r_, scalar_t, t, scalar_t, src, THVector_(cdiv)(r__data, t_data, src_data, r__len););
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data / *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data / *src_data;);
  }
}

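// Elementwise left shift: r_ = t << src. Floating-point types emulate the
// shift by multiplying with 2^src; integer types shift the bits directly
// (as unsigned for the signed integer types).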
void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src)
{
#if defined(TH_REAL_IS_HALF)
  return THError("clshift is not supported for torch.HalfTensor");
#endif
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
#if defined(TH_REAL_IS_FLOAT)
        rp[i] = tp[i] * powf(2, sp[i]);
#elif defined(TH_REAL_IS_DOUBLE)
        rp[i] = tp[i] * pow(2, sp[i]);
#elif defined(TH_REAL_IS_BYTE)
        rp[i] = ((scalar_t) tp[i]) << sp[i];
#else
        rp[i] = ((ureal) tp[i]) << sp[i];
#endif
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
#if defined(TH_REAL_IS_FLOAT)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data * powf(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#elif defined(TH_REAL_IS_DOUBLE)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data * pow(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#elif defined(TH_REAL_IS_BYTE)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((scalar_t)*t_data) << *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#else
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((ureal)*t_data) << *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#endif
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
#if defined(TH_REAL_IS_FLOAT)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data * powf(2, *src_data););
#elif defined(TH_REAL_IS_DOUBLE)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data * pow(2, *src_data););
#elif defined(TH_REAL_IS_BYTE)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((scalar_t)*t_data) << *src_data;);
#else
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((ureal)*t_data) << *src_data;);
#endif
  }
}

void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src)
{
#if defined(TH_REAL_IS_HALF)
  return THError("crshift is not supported for torch.HalfTensor");
#endif
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
#if defined(TH_REAL_IS_FLOAT)
        rp[i] = tp[i] / powf(2, sp[i]);
#elif defined(TH_REAL_IS_DOUBLE)
        rp[i] = tp[i] / pow(2, sp[i]);
#elif defined(TH_REAL_IS_BYTE)
        rp[i] = ((scalar_t) tp[i]) >> sp[i];
#else
        rp[i] = ((ureal) tp[i]) >> sp[i];
#endif
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
#if defined(TH_REAL_IS_FLOAT)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data / powf(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#elif defined(TH_REAL_IS_DOUBLE)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data / pow(2, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#elif defined(TH_REAL_IS_BYTE)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((scalar_t)*t_data) >> *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#else
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((ureal)*t_data) >> *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#endif
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
#if defined(TH_REAL_IS_FLOAT)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data / powf(2, *src_data););
#elif defined(TH_REAL_IS_DOUBLE)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data / pow(2, *src_data););
#elif defined(TH_REAL_IS_BYTE)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((scalar_t)*t_data) >> *src_data;);
#else
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = ((ureal)*t_data) >> *src_data;);
#endif
  }
}

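// Elementwise C-style remainder: fmod() for floating types, the % operator
// for integer types. The result takes the sign of the dividend.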
void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
        rp[i] = fmod(tp[i], sp[i]);
#else
        rp[i] = tp[i] % sp[i];
#endif
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = fmod(*t_data, *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#else
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = (*t_data % *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#endif
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = fmod(*t_data, *src_data););
#else
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = (*t_data % *src_data););
#endif
  }
}

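// Elementwise floored modulo: the result has the same sign as the divisor.
// Floating types compute t - src * floor(t / src) (NAN when src is 0);
// integer types adjust the % result with modulo_wrap.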
void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
        rp[i] = (sp[i] == 0)? NAN : tp[i] - sp[i] * floor(tp[i] / sp[i]);
#else
        // There is no NAN for integers
        rp[i] = tp[i] % sp[i];
        if (modulo_wrap(rp[i], sp[i]))
          rp[i] += sp[i];
#endif
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = (*src_data == 0)? NAN : *t_data - *src_data * floor(*t_data / *src_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#else
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data % *src_data;
          if (modulo_wrap(*r__data, *src_data)) *r__data += *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
#endif
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = (*src_data == 0)? NAN : *t_data - *src_data * floor(*t_data / *src_data););
#else
    // There is no NAN for integers
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data % *src_data;
      if (modulo_wrap(*r__data, *src_data)) *r__data += *src_data;);
#endif
  }
}

void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src)
{
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
  (void)r_;
  (void)t;
  (void)src;
  return THError("cbitand is only supported for integer type tensors");
#else
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
        rp[i] = tp[i] & sp[i];
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;);
  }
#endif
}

void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src)
{
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
  (void)r_;
  (void)t;
  (void)src;
  return THError("cbitor is only supported for integer type tensors");
#else
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
        rp[i] = tp[i] | sp[i];
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;);
  }
#endif
}

void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src)
{
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
  (void)r_;
  (void)t;
  (void)src;
  return THError("cbitxor is only supported for integer type tensors");
#else
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t srcSize = THTensor_(nElement)(src);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int srcContig = THTensor_(isContiguous)(src);
  int serial_path = 0;
  if (srcSize == r_Size){
    if (r_Contig && tContig && srcContig) {
      scalar_t *tp = t->data<scalar_t>();
      scalar_t *sp = src->data<scalar_t>();
      scalar_t *rp = r_->data<scalar_t>();
      int64_t i;
      #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
      for (i=0; i<r_Size; i++) {
        rp[i] = tp[i] ^ sp[i];
      }
    } else {
#if _OPENMP
      int inOMP = omp_in_parallel();
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
      }
#else
      serial_path = 1;
#endif
    }
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;);
  }
#endif
}

void THTensor_(tpow)(THTensor *r_, scalar_t value, THTensor *t)
{
  THTensor_(resizeAs)(r_, t);
  int64_t r_Size = THTensor_(nElement)(r_);
  int r_Contig = THTensor_(isContiguous)(r_);
  int tContig = THTensor_(isContiguous)(t);
  int serial_path = 0;
  if (r_Contig && tContig) {
    scalar_t *tp = t->data<scalar_t>();
    scalar_t *rp = r_->data<scalar_t>();
    int64_t i;
    #pragma omp parallel for if(r_Size > TH_OMP_OVERHEAD_THRESHOLD) private(i)
    for (i=0; i<r_Size; i++)
      rp[i] = THTensor_(powOne)(value, tp[i]);
  } else {
#if _OPENMP
    int inOMP = omp_in_parallel();
    if (inOMP) {
      serial_path = 1;
    } else {
      TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = THTensor_(powOne)(value, *t_data);, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
    }
#else
    serial_path = 1;
#endif
  }
  if (serial_path) {
    TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = THTensor_(powOne)(value, *t_data););
  }
}

void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2)
{
  if(r_ != t)
  {
    THTensor_(resizeAs)(r_, t);
    at::Tensor r__wrap = THTensor_wrap(r_);
    at::Tensor t_wrap = THTensor_wrap(t);
    at::_copy_same_type_(r__wrap, t_wrap);
  }
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t src1Size = THTensor_(nElement)(src1);
  int64_t src2Size = THTensor_(nElement)(src2);
  int r_Contig = THTensor_(isContiguous)(r_);
  int src1Contig = THTensor_(isContiguous)(src1);
  int src2Contig = THTensor_(isContiguous)(src2);
  int serial_path = 0;
  if( (src1Size == src2Size) && (src1Size == r_Size) ){
#if _OPENMP
    int inOMP = omp_in_parallel();
    if (inOMP) {
      serial_path = 1;
    } else {
      TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, src1Contig, src2Contig, scalar_t, r_, scalar_t, src1, scalar_t, src2, *r__data += value * *src1_data * *src2_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
    }
#else
    (void)r_Contig;
    (void)src1Contig;
    (void)src2Contig;
    serial_path = 1;
#endif
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, src1, scalar_t, src2, *r__data += value * *src1_data * *src2_data;);
  }
}

void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2)
{
  if(r_ != t)
  {
    THTensor_(resizeAs)(r_, t);
    at::Tensor r__wrap = THTensor_wrap(r_);
    at::Tensor t_wrap = THTensor_wrap(t);
    at::_copy_same_type_(r__wrap, t_wrap);
  }
  int64_t r_Size = THTensor_(nElement)(r_);
  int64_t src1Size = THTensor_(nElement)(src1);
  int64_t src2Size = THTensor_(nElement)(src2);
  int r_Contig = THTensor_(isContiguous)(r_);
  int src1Contig = THTensor_(isContiguous)(src1);
  int src2Contig = THTensor_(isContiguous)(src2);
  int serial_path = 0;
  if( (src1Size == src2Size) && (src1Size == r_Size) ){
#if _OPENMP
    int inOMP = omp_in_parallel();
    if (inOMP) {
      serial_path = 1;
    } else {
      TH_TENSOR_APPLY3_OMP(r_Size, r_Contig, src1Contig, src2Contig, scalar_t, r_, scalar_t, src1, scalar_t, src2, *r__data += value * *src1_data / *src2_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
    }
#else
    (void)r_Contig;
    (void)src1Contig;
    (void)src2Contig;
    serial_path = 1;
#endif
  } else {
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, src1, scalar_t, src2, *r__data += value * *src1_data / *src2_data;);
  }
}

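// Matrix-vector multiply-accumulate: r_ = beta * t + alpha * (mat x vec),
// dispatched to BLAS gemv. The transpose argument and leading dimension are
// chosen from mat's strides; if neither layout is BLAS-compatible, mat is
// copied to a contiguous temporary first.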
void THTensor_(addmv)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *mat, THTensor *vec)
{
  if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) )
    THError("matrix and vector expected, got %dD, %dD",
      mat->dim(), THTensor_nDimensionLegacyNoScalars(vec));

  if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) {
    THDescBuff bm = THTensor_(sizeDesc)(mat);
    THDescBuff bv = THTensor_(sizeDesc)(vec);
    THError("size mismatch, %s, %s", bm.str, bv.str);
  }

  if(THTensor_nDimensionLegacyNoScalars(t) != 1)
    THError("vector expected, got t: %dD", t->dim());

  if(THTensor_sizeLegacyNoScalars(t, 0) != mat->size(0)) {
    THDescBuff bt = THTensor_(sizeDesc)(t);
    THDescBuff bm = THTensor_(sizeDesc)(mat);
    THError("size mismatch, t: %s, mat: %s", bt.str, bm.str);
  }

  if(r_ != t)
  {
    THTensor_(resizeAs)(r_, t);
    at::Tensor r__wrap = THTensor_wrap(r_);
    at::Tensor t_wrap = THTensor_wrap(t);
    at::_copy_same_type_(r__wrap, t_wrap);
  }

  auto r_stride = THTensor_strideLegacyNoScalars(r_, 0);

  // n == 1 || lda >= max(1, m)
  #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M)))

  if(mat->stride(0) == 1 && LDA_COND(mat->size(0), mat->size(1), mat->stride(1)))
  {
    THBlas_(gemv)('n', mat->size(0), mat->size(1),
                  alpha, mat->data<scalar_t>(), mat->stride(1),
                  vec->data<scalar_t>(), THTensor_strideLegacyNoScalars(vec, 0),
                  beta, r_->data<scalar_t>(), r_stride);
  }
  else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0)))
  {
    THBlas_(gemv)('t', mat->size(1), mat->size(0),
                  alpha, mat->data<scalar_t>(), mat->stride(0),
                  vec->data<scalar_t>(), THTensor_strideLegacyNoScalars(vec, 0),
                  beta, r_->data<scalar_t>(), r_stride);
  }
  else
  {
    THTensor *cmat = THTensor_(newContiguous)(mat);

    THBlas_(gemv)('t', mat->size(1), mat->size(0),
                  alpha, cmat->data<scalar_t>(), cmat->stride(0),
                  vec->data<scalar_t>(), THTensor_strideLegacyNoScalars(vec, 0),
                  beta, r_->data<scalar_t>(), r_stride);

    c10::raw::intrusive_ptr::decref(cmat);
  }

  // In gemv (x,0).mv(0) does not
  // handle beta, whereas gemm does for case where (x,0).mm(0,y).
  if (THTensor_sizeLegacyNoScalars(vec, 0) == 0 && mat->size(0) != 0) {
    if (beta == 0) {
      THTensor_(zero)(r_);
    } else if (beta != 1) {
      THTensor_(mul)(r_, r_, beta);
    }
  }

  #undef LDA_COND
}

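// For every pair of rows (i, j) from m1 and m2, computes the squared
// Euclidean distance scaled by gain: r_[i][j] = gain * ||m1[i] - m2[j]||^2.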
void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, scalar_t gain)
{
  int64_t N1 = m1->size(0);
  int64_t N2 = m2->size(0);
  int64_t dim;
  scalar_t *m1_p;
  scalar_t *m2_p;
  scalar_t *r_p;
  int64_t i;

  THTensor_(resize2d)(r_, N1, N2);

  m1 = THTensor_(newContiguous)(m1);
  m2 = THTensor_(newContiguous)(m2);

  THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1);
  THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2);

  dim = m1->size(1);
  THArgCheck(m1->size(1) == m2->size(1), 3, "m1 and m2 must have the same inner vector dim");

  m1_p = m1->data<scalar_t>();
  m2_p = m2->data<scalar_t>();
  r_p = r_->data<scalar_t>();

#pragma omp parallel for private(i)
  for (i=0; i<N1; i++) {
    int64_t j,k;
    for (j=0; j<N2; j++) {
      scalar_t sum = 0;
      for (k=0; k<dim; k++) {
        scalar_t term = m1_p[ i*dim + k ] - m2_p[ j*dim + k ];
        sum += term*term;
      }
      r_p[ i*N2 + j ] = gain * sum;
    }
  }

  c10::raw::intrusive_ptr::decref(m1);
  c10::raw::intrusive_ptr::decref(m2);
}

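// Matrix-matrix multiply-accumulate: r_ = beta * t + alpha * (m1 x m2),
// dispatched to BLAS gemm. The code below picks transpose flags and leading
// dimensions so that each operand satisfies gemm's layout requirements,
// copying to contiguous temporaries only when necessary.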
void THTensor_(addmm)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *m1, THTensor *m2)
{
  char transpose_r, transpose_m1, transpose_m2;
  THTensor *r__, *m1_, *m2_;
  int free_m1 = 0;
  int free_m2 = 0;

  if( (m1->dim() != 2) || (m2->dim() != 2))
    THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim());

  if(m1->size(1) != m2->size(0)) {
    THDescBuff bm1 = THTensor_(sizeDesc)(m1);
    THDescBuff bm2 = THTensor_(sizeDesc)(m2);
    THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str);
  }

  if( t->dim() != 2 )
    THError("matrix expected, got %dD tensor for t", t->dim());

  if( (t->size(0) != m1->size(0)) || (t->size(1) != m2->size(1)) ) {
    THDescBuff bt = THTensor_(sizeDesc)(t);
    THDescBuff bm1 = THTensor_(sizeDesc)(m1);
    THDescBuff bm2 = THTensor_(sizeDesc)(m2);
    THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str);
  }

  if(t != r_)
  {
    THTensor_(resizeAs)(r_, t);
    if (beta != 0.0) {
      at::Tensor r__wrap = THTensor_wrap(r_);
      at::Tensor t_wrap = THTensor_wrap(t);
      at::_copy_same_type_(r__wrap, t_wrap);
    }
  }

  // n == 1 || ldc >= max(1, m)
  #define LDC_COND(M, N, LDC) ((N) == 1 || (LDC) >= THMax(1, M))

  /* r_ */
  if(r_->stride(0) == 1 &&
     LDC_COND(r_->size(0), r_->size(1), r_->stride(1)))
  {
    transpose_r = 'n';
    r__ = r_;
  }
  else if(r_->stride(1) == 1 &&
          LDC_COND(r_->size(1), r_->size(0), r_->stride(0)))
  {
    THTensor *swap = m2;
    m2 = m1;
    m1 = swap;
    transpose_r = 't';
    r__ = r_;
  }
  else
  {
    transpose_r = 'n';
    // make r__ FORTRAN contiguous
    THTensor *transp_r_ = THTensor_(newTranspose)(r_, 0, 1);
    r__ = THTensor_(newClone)(transp_r_);
    c10::raw::intrusive_ptr::decref(transp_r_);
    THTensor_(transpose)(r__, NULL, 0, 1);
  }

  #undef LDC_COND

  int64_t m = r__->size((transpose_r == 'n' ? 0 : 1));
  int64_t n = r__->size((transpose_r == 'n' ? 1 : 0));
  int64_t k = m1->size((transpose_r == 'n' ? 1 : 0));
  int64_t ldr__ = r__->stride((transpose_r == 'n' ? 1 : 0));

  /* m1 */
  /* Need ldm1_ >= max(1, (transpose_m1 == 'n' ? m : k)) */
  if(m1->stride((transpose_r == 'n' ? 0 : 1)) == 1 &&
     m1->stride((transpose_r == 'n' ? 1 : 0)) >= THMax(1, m))
  {
    transpose_m1 = 'n';
    m1_ = m1;
  }
  else if(m1->stride((transpose_r == 'n' ? 1 : 0)) == 1 &&
          m1->stride((transpose_r == 'n' ? 0 : 1)) >= THMax(1, k))
  {
    transpose_m1 = 't';
    m1_ = m1;
  }
  else
  {
    transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
    m1_ = THTensor_(newContiguous)(m1);
    free_m1 = 1;
  }

  /* m2 */
  /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */
  if(m2->stride((transpose_r == 'n' ? 0 : 1)) == 1 &&
     m2->stride((transpose_r == 'n' ? 1 : 0)) >= THMax(1, k))
  {
    transpose_m2 = 'n';
    m2_ = m2;
  }
  else if(m2->stride((transpose_r == 'n' ? 1 : 0)) == 1 &&
          m2->stride((transpose_r == 'n' ? 0 : 1)) >= THMax(1, n))
  {
    transpose_m2 = 't';
    m2_ = m2;
  }
  else
  {
    transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
    m2_ = THTensor_(newContiguous)(m2);
    free_m2 = 1;
  }

  int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1)));
  int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1)));

  /* do the operation */
  THBlas_(gemm)(transpose_m1,
                transpose_m2,
                m,
                n,
                k,
                alpha,
                m1_->data<scalar_t>(),
                ldm1_,
                m2_->data<scalar_t>(),
                ldm2_,
                beta,
                r__->data<scalar_t>(),
                ldr__);

  /* free intermediate variables */
  if(free_m1)
    c10::raw::intrusive_ptr::decref(m1_);

  if(free_m2)
    c10::raw::intrusive_ptr::decref(m2_);

  if(r__ != r_)
    THTensor_(freeCopyTo)(r__, r_);
}

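// Rank-1 update (outer product): r_ = beta * t + alpha * vec1 (x) vec2,
// dispatched to BLAS ger. beta is applied to r_ up front since ger only
// accumulates.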
void THTensor_(addr)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *vec1, THTensor *vec2)
{
  if( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) )
    THError("vector and vector expected, got %dD, %dD tensors",
        THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2));

  if(t->dim() != 2)
    THError("expected matrix, got %dD tensor for t", t->dim());

  auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0);
  auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0);
  auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0);
  auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0);

  if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) {
    THDescBuff bt = THTensor_(sizeDesc)(t);
    THDescBuff bv1 = THTensor_(sizeDesc)(vec1);
    THDescBuff bv2 = THTensor_(sizeDesc)(vec2);
    THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str);
  }

  if(r_ != t)
  {
    THTensor_(resizeAs)(r_, t);
    at::Tensor r__wrap = THTensor_wrap(r_);
    at::Tensor t_wrap = THTensor_wrap(t);
    at::_copy_same_type_(r__wrap, t_wrap);
  }

  if(beta == 0) {
    THTensor_(zero)(r_);
  }
  else if(beta != 1)
    THTensor_(mul)(r_, r_, beta);

  // n == 1 || lda >= max(1, m)
  #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M)))

  if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1)))
  {
    THBlas_(ger)(vec1_size, vec2_size,
                 alpha, vec1->data<scalar_t>(), vec1_stride,
                 vec2->data<scalar_t>(), vec2_stride,
                 r_->data<scalar_t>(), r_->stride(1));
  }
  else if(r_->stride(1) == 1 && LDA_COND(vec2_size, vec1_size, r_->stride(0)))
  {
    THBlas_(ger)(vec2_size, vec1_size,
                 alpha, vec2->data<scalar_t>(), vec2_stride,
                 vec1->data<scalar_t>(), vec1_stride,
                 r_->data<scalar_t>(), r_->stride(0));
  }
  else
  {
    THTensor *cr = THTensor_(newClone)(r_);

    THBlas_(ger)(vec2_size, vec1_size,
                 alpha, vec2->data<scalar_t>(), vec2_stride,
                 vec1->data<scalar_t>(), vec1_stride,
                 cr->data<scalar_t>(), cr->stride(0));

    THTensor_(freeCopyTo)(cr, r_);
  }

  #undef LDA_COND
}

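// Batched matrix-matrix product accumulated into a single matrix:
// result = beta * t + alpha * sum_b batch1[b] x batch2[b]. Each batch is
// handled by addmm; beta is set to 1 after the first batch so later batches
// accumulate instead of rescaling.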
void THTensor_(addbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *batch1, THTensor *batch2)
{
  int64_t batch;

  THArgCheck(THTensor_(nDimensionLegacyNoScalars)(batch1) == 3, 1, "expected 3D tensor");
  THArgCheck(THTensor_(nDimensionLegacyNoScalars)(batch2) == 3, 2, "expected 3D tensor");
  THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2,
             "equal number of batches expected, got %d, %d",
             THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0));
  THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2,
             "wrong matrix size, batch1: %dx%d, batch2: %dx%d",
             THTensor_(size)(batch1, 1), THTensor_(size)(batch1, 2),
             THTensor_(size)(batch2, 1), THTensor_(size)(batch2, 2));

  int64_t dim1 = THTensor_(size)(batch1, 1);
  int64_t dim2 = THTensor_(size)(batch2, 2);
  THArgCheck(THTensor_(size)(t, 0) == dim1, 1, "output tensor of incorrect size");
  THArgCheck(THTensor_(size)(t, 1) == dim2, 1, "output tensor of incorrect size");

  if (t != result) {
    THTensor_(resizeAs)(result, t);
    if (beta != 0.0) {
      at::Tensor result_wrap = THTensor_wrap(result);
      at::Tensor t_wrap = THTensor_wrap(t);
      at::_copy_same_type_(result_wrap, t_wrap);
    }
  }

  THTensor *matrix1 = THTensor_(new)();
  THTensor *matrix2 = THTensor_(new)();

  for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) {
    THTensor_(select)(matrix1, batch1, 0, batch);
    THTensor_(select)(matrix2, batch2, 0, batch);

    THTensor_(addmm)(result, beta, result, alpha, matrix1, matrix2);
    beta = 1; // accumulate output once
  }

  c10::raw::intrusive_ptr::decref(matrix1);
  c10::raw::intrusive_ptr::decref(matrix2);
}

#endif /* TH_GENERIC_FILE */