THTensorMoreMath.cpp
1 #ifndef TH_GENERIC_FILE
2 #define TH_GENERIC_FILE "TH/generic/THTensorMoreMath.cpp"
3 #else
4 
5 #include <TH/generic/THTensorApply.hpp>
6 #include <TH/THGenerator.hpp>
7 
8 void THTensor_(baddbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *batch1, THTensor *batch2)
9 {
10  int64_t batch;
11 
12  THArgCheck(THTensor_(nDimensionLegacyNoScalars)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimensionLegacyNoScalars)(batch1));
13  THArgCheck(THTensor_(nDimensionLegacyNoScalars)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimensionLegacyNoScalars)(batch2));
14  THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2,
15  "equal number of batches expected, got %d, %d",
16  THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0));
17  THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2,
18  "wrong matrix size, batch1: %dx%d, batch2: %dx%d",
19  THTensor_(size)(batch1, 1), THTensor_(size)(batch1, 2),
20  THTensor_(size)(batch2, 1), THTensor_(size)(batch2, 2));
21 
22  int64_t bs = THTensor_(size)(batch1, 0);
23  int64_t dim1 = THTensor_(size)(batch1, 1);
24  int64_t dim2 = THTensor_(size)(batch2, 2);
25  THArgCheck(THTensor_(size)(t, 0) == bs, 1, "output tensor of incorrect size");
26  THArgCheck(THTensor_(size)(t, 1) == dim1, 1, "output tensor of incorrect size");
27  THArgCheck(THTensor_(size)(t, 2) == dim2, 1, "output tensor of incorrect size");
28 
29  if (t != result) {
30  THTensor_(resizeAs)(result, t);
31  if (beta != 0.0) {
32  at::Tensor result_wrap = THTensor_wrap(result);
33  at::Tensor t_wrap = THTensor_wrap(t);
34  at::_copy_same_type_(result_wrap, t_wrap);
35  }
36  }
37 
38  THTensor *matrix1 = THTensor_(new)();
39  THTensor *matrix2 = THTensor_(new)();
40  THTensor *result_matrix = THTensor_(new)();
41 
42  for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) {
43  THTensor_(select)(matrix1, batch1, 0, batch);
44  THTensor_(select)(matrix2, batch2, 0, batch);
45  THTensor_(select)(result_matrix, result, 0, batch);
46 
47  THTensor_(addmm)(result_matrix, beta, result_matrix, alpha, matrix1, matrix2);
48  }
49 
50  c10::raw::intrusive_ptr::decref(matrix1);
51  c10::raw::intrusive_ptr::decref(matrix2);
52  c10::raw::intrusive_ptr::decref(result_matrix);
53 }
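/* Each pass of the loop above computes result[b] = beta * t[b] + alpha * (batch1[b] x batch2[b])
 * for one batch index b in [0, bs). A minimal standalone sketch of the same computation over
 * contiguous row-major float buffers (illustrative reference only, not the TH code path):
 *
 *   void baddbmm_ref(float *out, const float *t, const float *b1, const float *b2,
 *                    int64_t bs, int64_t n, int64_t k, int64_t m, float beta, float alpha) {
 *     for (int64_t b = 0; b < bs; ++b)
 *       for (int64_t i = 0; i < n; ++i)
 *         for (int64_t j = 0; j < m; ++j) {
 *           float acc = 0.f;
 *           for (int64_t p = 0; p < k; ++p)
 *             acc += b1[(b*n + i)*k + p] * b2[(b*k + p)*m + j];
 *           out[(b*n + i)*m + j] = beta * t[(b*n + i)*m + j] + alpha * acc;
 *         }
 *   }
 */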
54 
55 ptrdiff_t THTensor_(numel)(THTensor *t)
56 {
57  return THTensor_(nElement)(t);
58 }
59 
60 
61 // Helper function to be used in a reduction operation.
62 // Due to resize semantics of outputs, if the specified output tensor r_ has
63  // the same size as the output of the reduction operation, then any noncontiguities
64 // in r_ should be preserved.
65 // The reduction operation, however, needs to act on r_ with an extra dimension
66 // (the reduced dimension), so this function "resizes" r_ and preserves its
67 // noncontiguities if necessary.
68 void THTensor_(preserveReduceDimSemantics)(
69  THTensor *r_, int in_dims, int reduce_dimension, int keepdim) {
70  if (r_ && !keepdim &&
71  THTensor_(nDimensionLegacyAll)(r_) == in_dims - 1 &&
72  THTensor_(nDimensionLegacyAll)(r_) != 0) {
73  THTensor_(unsqueeze1d)(r_, r_, reduce_dimension);
74  }
75 }
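/* Example: reducing dimension 1 of a 4x5x6 input with keepdim == 0 ultimately yields a 4x6
 * result. If the caller already supplied r_ with shape 4x6, the call
 *   THTensor_(preserveReduceDimSemantics)(r_, 3, 1, 0);
 * unsqueezes it to 4x1x6 so the reduction kernel can index the reduced dimension, and the
 * subsequent resize to {4, 1, 6} leaves its layout untouched; the reductions below squeeze
 * that dimension away again in their !keepdim branches. */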
76 
77 void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
78 {
79  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 2, "dimension %d out of range",
80  dimension);
81 
82  int in_dims = THTensor_(nDimensionLegacyAll)(t);
83  THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim);
84  THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim);
85  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
86  dim[dimension] = 1;
87  THTensor_(resize)(values_, dim, {});
88  THLongTensor_resize(indices_, dim, {});
89 
90  // two implementations optimized for data locality
91  if (THTensor_strideLegacyNoScalars(t, dimension) == 1) {
92  scalar_t theMax;
93  scalar_t value;
94  int64_t theIndex;
95  int64_t i;
96  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, values_, int64_t, indices_, dimension,
97  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
98  theMax = t_data[0];
99  theIndex = 0;
100 
101  for(i = 0; i < t_size; i++)
102  {
103  value = t_data[i*t_stride];
104  /* This is not the same as value>theMax in the case of NaNs */
105  if(!(value <= theMax))
106  {
107  theIndex = i;
108  theMax = value;
109  th_isnan_break(value)
110  }
111  }
112  *indices__data = theIndex;
113  *values__data = theMax;);
114  } else {
115  if (THTensor_(nDimensionLegacyAll)(t) > 1) {
116  THTensor *t0 = THTensor_(newSelect)(t, dimension, 0);
117  at::Tensor values__wrap = THTensor_wrap(values_);
118  at::Tensor t0_wrap = THTensor_wrap(t0);
119  at::_copy_same_type_(values__wrap, t0_wrap);
120  c10::raw::intrusive_ptr::decref(t0);
121  } else {
122  THTensor_(fill)(values_, THTensor_(get1d)(t, 0));
123  }
124  THLongTensor_zero(indices_);
125 
126  if(THTensor_sizeLegacyNoScalars(t, dimension) == 1) {
127  if (!keepdim) {
128  THTensor_(squeeze1d)(values_, values_, dimension);
129  THLongTensor_squeeze1d(indices_, indices_, dimension);
130  }
131  return;
132  }
133 
134  THTensor *tempValues_ = THTensor_(newWithTensor)(values_);
135  // tempValues_.expand_as(t)
136  tempValues_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
137  tempValues_->set_stride(dimension, 0);
138 
139  THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_);
140  // tempIndices_.expand_as(t)
141  tempIndices_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
142  tempIndices_->set_stride(dimension, 0);
143 
144  TH_TENSOR_APPLY3_D(scalar_t, t, scalar_t, tempValues_, int64_t, tempIndices_, dimension,
145  if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) {
146  *tempValues__data = *t_data;
147  *tempIndices__data = *tempIndices__dimOffset;
148  });
149 
150  c10::raw::intrusive_ptr::decref(tempValues_);
151  THLongTensor_free(tempIndices_);
152  }
153 
154  if (!keepdim) {
155  THTensor_(squeeze1d)(values_, values_, dimension);
156  THLongTensor_squeeze1d(indices_, indices_, dimension);
157  }
158 }
159 
160 void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
161 {
162  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 2, "dimension %d out of range",
163  dimension);
164 
165  int in_dims = THTensor_(nDimensionLegacyAll)(t);
166  THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim);
167  THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim);
168  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
169  dim[dimension] = 1;
170  THTensor_(resize)(values_, dim, {});
171  THLongTensor_resize(indices_, dim, {});
172 
173  // two implementations optimized for data locality
174  if (THTensor_strideLegacyNoScalars(t, dimension) == 1) {
175  scalar_t theMin;
176  scalar_t value;
177  int64_t theIndex;
178  int64_t i;
179  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, values_, int64_t, indices_, dimension,
180  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
181  theMin = t_data[0];
182  theIndex = 0;
183 
184  for(i = 0; i < t_size; i++)
185  {
186  value = t_data[i*t_stride];
187  /* This is not the same as value<theMin in the case of NaNs */
188  if(!(value >= theMin))
189  {
190  theIndex = i;
191  theMin = value;
192  th_isnan_break(value)
193  }
194  }
195  *indices__data = theIndex;
196  *values__data = theMin;);
197  } else {
198  if (THTensor_(nDimensionLegacyAll)(t) > 1) {
199  THTensor *t0 = THTensor_(newSelect)(t, dimension, 0);
200  at::Tensor values__wrap = THTensor_wrap(values_);
201  at::Tensor t0_wrap = THTensor_wrap(t0);
202  at::_copy_same_type_(values__wrap, t0_wrap);
203  c10::raw::intrusive_ptr::decref(t0);
204  } else {
205  THTensor_(fill)(values_, THTensor_(get1d)(t, 0));
206  }
207  THLongTensor_zero(indices_);
208 
209  if(THTensor_sizeLegacyNoScalars(t, dimension) == 1) {
210  if (!keepdim) {
211  THTensor_(squeeze1d)(values_, values_, dimension);
212  THLongTensor_squeeze1d(indices_, indices_, dimension);
213  }
214  return;
215  }
216 
217  THTensor *tempValues_ = THTensor_(newWithTensor)(values_);
218  // tempValues_.expand_as(t)
219  tempValues_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
220  tempValues_->set_stride(dimension, 0);
221 
222  THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_);
223  // tempIndices_.expand_as(t)
224  tempIndices_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
225  tempIndices_->set_stride(dimension, 0);
226 
227  TH_TENSOR_APPLY3_D(scalar_t, t, scalar_t, tempValues_, int64_t, tempIndices_, dimension,
228  if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) {
229  *tempValues__data = *t_data;
230  *tempIndices__data = *tempIndices__dimOffset;
231  });
232 
233  c10::raw::intrusive_ptr::decref(tempValues_);
234  THLongTensor_free(tempIndices_);
235  }
236 
237  if (!keepdim) {
238  THTensor_(squeeze1d)(values_, values_, dimension);
239  THLongTensor_squeeze1d(indices_, indices_, dimension);
240  }
241 }
242 
243 void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim)
244 {
245  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 2, "dimension %d out of range",
246  dimension);
247 
248  THTensor_(preserveReduceDimSemantics)(r_, THTensor_(nDimensionLegacyAll)(t), dimension, keepdim);
249  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
250  dim[dimension] = 1;
251  THTensor_(resize)(r_, dim, {});
252 
253  int serial_path = 0;
254 #ifdef _OPENMP
255  int inOMP = omp_in_parallel();
256  if (inOMP) {
257  serial_path = 1;
258  } else {
259  int r_Contig = THTensor_(isContiguous)(r_);
260  scalar_t *tp = t->data<scalar_t>();
261  scalar_t *rp = r_->data<scalar_t>();
262  if(r_Contig && (tp != rp)){
263  ptrdiff_t iter = 0;
264  ptrdiff_t r_Size = THTensor_(nElement)(r_);
265  int r_Dim = THTensor_nDimensionLegacyAll(r_);
266  #pragma omp parallel for if ( r_Size > HYPER_TH_OMP_OVERHEAD_THRESHOLD)
267  for (iter = 0; iter < r_Size; iter++) {
268  int j;
269  int64_t quot;
270  int64_t rem = iter;
271  ptrdiff_t tBasicIndex = 0;
272 
273  for(j = 0; j < r_Dim; ++j) {
274  if(j != dimension){
275  quot = rem/r_->stride(j);
276  rem = rem%r_->stride(j);
277  tBasicIndex += quot*t->stride(j);
278  }
279  }
280  scalar_t *t_data = tp+tBasicIndex;
281  scalar_t *r__data = rp+iter;
282  *r__data = 1;
283  for(j=0; j < THTensor_sizeLegacyNoScalars(t, dimension); ++j) {
284  *r__data *= *(t_data + j*THTensor_strideLegacyNoScalars(t, dimension));
285  }
286  }
287  } else {
288  serial_path = 1;
289  }
290  }
291 #else
292  serial_path = 1;
293 #endif
294 
295  if(serial_path) {
296  // two implementations optimized for data locality
297  if (THTensor_strideLegacyNoScalars(t, dimension) == 1) {
298  TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
299  accreal prod = 1;
300  int64_t i;
301  for(i = 0; i < t_size; i++)
302  prod *= t_data[i*t_stride];
303  *r__data = (scalar_t)prod;);
304  } else {
305  THTensor_(fill)(r_, 1);
306  THTensor *temp_ = THTensor_(newWithTensor)(r_);
307  // r_.expand_as(t)
308  temp_->set_size(dimension,THTensor_sizeLegacyNoScalars(t, dimension));
309  temp_->set_stride(dimension, 0);
310 
311  TH_TENSOR_APPLY2(scalar_t, temp_, scalar_t, t, *temp__data = *temp__data * *t_data;);
312  c10::raw::intrusive_ptr::decref(temp_);
313  }
314  }
315  if (!keepdim) {
316  THTensor_(squeeze1d)(r_, r_, dimension);
317  }
318 }
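/* In the OpenMP branch above, each flat index into the (contiguous) result is mapped back to a
 * base offset in t by div/mod over the result strides, skipping the reduced dimension (whose
 * size in r_ is 1). Worked example: t of size 2x4x3 reduced over dimension 1 gives r_ of size
 * 2x1x3 with strides {3, 3, 1}; flat index 5 decomposes as 5/3 = 1 (dim 0), 2/1 = 2 (dim 2),
 * so tBasicIndex = 1*t->stride(0) + 2*t->stride(2), and the inner j-loop then sweeps the four
 * entries of the reduced dimension from that offset. */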
319 
320 void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension)
321 {
322  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyNoScalars)(t), 2, "dimension %d out of range",
323  dimension);
324 
325  THTensor_(resizeAs)(r_, t);
326 
327  TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
328  accreal cumsum = 0;
329  int64_t i;
330  for(i = 0; i < t_size; i++)
331  {
332  cumsum += t_data[i*t_stride];
333  r__data[i*r__stride] = (scalar_t)cumsum;
334  });
335 }
336 
337 void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension)
338 {
339  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyNoScalars)(t), 2, "dimension %d out of range",
340  dimension);
341 
342  THTensor_(resizeAs)(r_, t);
343 
344  TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
345  accreal cumprod = 1;
346  int64_t i;
347  for(i = 0; i < t_size; i++)
348  {
349  cumprod *= t_data[i*t_stride];
350  r__data[i*r__stride] = (scalar_t)cumprod;
351  });
352 }
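/* Worked example: along a length-4 slice holding {1, 2, 3, 4}, cumsum writes {1, 3, 6, 10}
 * and cumprod writes {1, 2, 6, 24}; both accumulate in accreal before casting back to
 * scalar_t. */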
353 
354 
355 void THTensor_(sign)(THTensor *r_, THTensor *t)
356 {
357  THTensor_(resizeAs)(r_, t);
358 
359 #if defined (TH_REAL_IS_BYTE)
360  TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t,
361  if (*t_data > 0) *r__data = 1;
362  else *r__data = 0;);
363 #else
364  TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t,
365  if (*t_data > 0) *r__data = 1;
366  else if (*t_data < 0) *r__data = -1;
367  else *r__data = 0;);
368 #endif
369 }
370 
371 
372 accreal THTensor_(trace)(THTensor *t)
373 {
374  scalar_t *t_data = t->data<scalar_t>();
375  accreal sum = 0;
376  int64_t i = 0;
377  int64_t t_stride_0, t_stride_1, t_diag_size;
378 
379  THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
380 
381  t_stride_0 = THTensor_(stride)(t, 0);
382  t_stride_1 = THTensor_(stride)(t, 1);
383  t_diag_size = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1));
384  while(i < t_diag_size)
385  {
386  sum += t_data[i*(t_stride_0+t_stride_1)];
387  i++;
388  }
389 
390  return sum;
391 }
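/* The loop walks the main diagonal by stepping i*(t_stride_0 + t_stride_1): for a contiguous
 * 3x3 matrix (strides {3, 1}) the diagonal elements sit at flat offsets 0, 4 and 8. */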
392 
393 void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension)
394 {
395  int i;
396 
397  if(THTensor_(nDimensionLegacyNoScalars)(a) != THTensor_(nDimensionLegacyNoScalars)(b))
398  THError("inconsistent tensor dimension %dD, %dD",
399  THTensor_(nDimensionLegacyNoScalars)(a), THTensor_(nDimensionLegacyNoScalars)(b));
400 
401  for(i = 0; i < a->dim(); i++)
402  {
403  if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) {
404  THDescBuff ba = THTensor_(sizeDesc)(a);
405  THDescBuff bb = THTensor_(sizeDesc)(b);
406  THError("inconsistent tensor sizes %s, %s", ba.str, bb.str);
407  }
408  }
409 
410  if(dimension < 0)
411  {
412  for(i = 0; i < THTensor_(nDimensionLegacyNoScalars)(a); i++)
413  {
414  if(THTensor_sizeLegacyNoScalars(a, i) == 3)
415  {
416  dimension = i;
417  break;
418  }
419  }
420  if(dimension < 0) {
421  THDescBuff ba = THTensor_(sizeDesc)(a);
422  THError("no dimension of size 3 in a: %s", ba.str);
423  }
424  }
425 
426  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyNoScalars)(a), 3, "dimension %d out of range",
427  dimension);
428  THArgCheck(THTensor_sizeLegacyNoScalars(a, dimension) == 3, 3, "dimension %d does not have size 3",
429  dimension);
430 
431  THTensor_(resizeAs)(r_, a);
432 
433  TH_TENSOR_DIM_APPLY3(scalar_t, a, scalar_t, b, scalar_t, r_, dimension,
434  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
435  r__data[0*r__stride] = a_data[1*a_stride]*b_data[2*b_stride] - a_data[2*a_stride]*b_data[1*b_stride];
436  r__data[1*r__stride] = a_data[2*a_stride]*b_data[0*b_stride] - a_data[0*a_stride]*b_data[2*b_stride];
437  r__data[2*r__stride] = a_data[0*a_stride]*b_data[1*b_stride] - a_data[1*a_stride]*b_data[0*b_stride];);
438 }
439 
440 void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src) {
441  THTensor_(resizeAs)(r, t);
442  TH_TENSOR_APPLY3(scalar_t, r, scalar_t, t, scalar_t, src,
443  *r_data = *t_data > *src_data ? *t_data : *src_data;);
444 }
445 
446 void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src) {
447  THTensor_(resizeAs)(r, t);
448  TH_TENSOR_APPLY3(scalar_t, r, scalar_t, t, scalar_t, src,
449  *r_data = *t_data < *src_data ? *t_data : *src_data;);
450 }
451 
452 void THTensor_(cmaxValue)(THTensor *r, THTensor *t, scalar_t value) {
453  THTensor_(resizeAs)(r, t);
454  TH_TENSOR_APPLY2(scalar_t, r, scalar_t, t,
455  *r_data = *t_data < value ? value : *t_data;); // this order propagates NaN
456 }
457 
458 void THTensor_(cminValue)(THTensor *r, THTensor *t, scalar_t value) {
459  THTensor_(resizeAs)(r, t);
460  TH_TENSOR_APPLY2(scalar_t, r, scalar_t, t,
461  *r_data = *t_data > value ? value : *t_data;); // this order propagates NaN
462 }
463 
464 void THTensor_(diag)(THTensor *r_, THTensor *t, int k)
465 {
466  THArgCheck(THTensor_(nDimensionLegacyNoScalars)(t) == 1 || THTensor_(nDimensionLegacyNoScalars)(t) == 2, 1, "matrix or a vector expected");
467 
468  if(THTensor_(nDimensionLegacyNoScalars)(t) == 1)
469  {
470  scalar_t *t_data = t->data<scalar_t>();
471  int64_t t_stride_0 = THTensor_strideLegacyNoScalars(t, 0);
472  int64_t t_size = THTensor_sizeLegacyNoScalars(t, 0);
473  int64_t sz = t_size + (k >= 0 ? k : -k);
474  scalar_t *r__data;
475  int64_t r__stride_0;
476  int64_t r__stride_1;
477  int64_t i;
478 
479  THTensor_(resize2d)(r_, sz, sz);
480  THTensor_(zero)(r_);
481  r__data = r_->data<scalar_t>();
482  r__stride_0 = THTensor_(stride)(r_, 0);
483  r__stride_1 = THTensor_(stride)(r_, 1);
484  r__data += (k >= 0 ? k*r__stride_1 : -k*r__stride_0);
485 
486  for(i = 0; i < t_size; i++)
487  r__data[i*(r__stride_0+r__stride_1)] = t_data[i*t_stride_0];
488  }
489  else
490  {
491  scalar_t *t_data = t->data<scalar_t>();
492  int64_t t_stride_0 = THTensor_(stride)(t, 0);
493  int64_t t_stride_1 = THTensor_(stride)(t, 1);
494  int64_t sz;
495  scalar_t *r__data;
496  int64_t r__stride_0;
497  int64_t i;
498 
499  if(k >= 0)
500  sz = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)-k);
501  else
502  sz = THMin(THTensor_(size)(t, 0)+k, THTensor_(size)(t, 1));
503  THTensor_(resize1d)(r_, sz);
504  r__data = r_->data<scalar_t>();
505  r__stride_0 = THTensor_(stride)(r_, 0);
506 
507  t_data += (k >= 0 ? k*t_stride_1 : -k*t_stride_0);
508  for(i = 0; i < sz; i++)
509  r__data[i*r__stride_0] = t_data[i*(t_stride_0+t_stride_1)];
510  }
511 }
512 
513 
514 /* I cut and pasted (slightly adapted) the quicksort code from
515  Sedgewick's 1978 "Implementing Quicksort Programs" article
516  http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf
517 
518  It is the state-of-the-art implementation. The macros
519  are here to make as close a match as possible to the pseudocode of
520  Program 2 p.851
521 
522  Note that other partition schemes exist, and are typically presented
523  in textbooks, but those are less efficient. See e.g.
524  http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto
525 
526  Julien, November 12th 2013
527 */
528 #define MAX_LEVELS 300
529 #define M_SMALL 10 /* Limit for small subfiles */
530 
531 #define ARR(III) arr[(III)*stride]
532 #define IDX(III) idx[(III)*stride]
533 
534 #define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap
535 #define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap
536 
537 #define ARR_SWAP(III, JJJ) \
538  REAL_SWAP(ARR(III), ARR(JJJ));
539 
540 #define BOTH_SWAP(III, JJJ) \
541  REAL_SWAP(ARR(III), ARR(JJJ)); \
542  LONG_SWAP(IDX(III), IDX(JJJ))
543 
544 /* Emulate NumPy behavior of putting NaNs
545  * at the end of an ascending list. */
546 #define GT_OR_NAN(x, y) \
547  ((x != x && y == y) || (x > y))
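/* Example of the ordering this comparator induces: GT_OR_NAN(NAN, 1.0) and GT_OR_NAN(2.0, 1.0)
 * are true while GT_OR_NAN(1.0, NAN) is false, so an ascending sort of {3.0, NAN, 1.0} yields
 * {1.0, 3.0, NAN} -- NaNs sink to the end, as in NumPy. */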
548 
549 static void THTensor_(quicksortascend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
550 {
551  int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
552  scalar_t rswap, piv;
553  unsigned char done = 0;
554 
555  /* beg[0]=0; end[0]=elements; */
556  stack = 0;
557  L = 0; R = elements-1;
558  done = elements-1 <= M_SMALL;
559 
560  while(!done) {
561  /* Use median of three for pivot choice */
562  P=(L+R)>>1;
563  BOTH_SWAP(P, L+1);
564  if (GT_OR_NAN(ARR(L+1), ARR(R))) { BOTH_SWAP(L+1, R); }
565  if (GT_OR_NAN(ARR(L), ARR(R))) { BOTH_SWAP(L, R); }
566  if (GT_OR_NAN(ARR(L+1), ARR(L))) { BOTH_SWAP(L+1, L); }
567 
568  i = L+1; j = R; piv = ARR(L); pid = IDX(L);
569 
570  do {
571  do { i = i+1; } while(GT_OR_NAN(piv, ARR(i)));
572  do { j = j-1; } while(GT_OR_NAN(ARR(j), piv));
573  if (j < i)
574  break;
575  BOTH_SWAP(i, j);
576  } while(1);
577  BOTH_SWAP(L, j);
578  /* Left subfile is (L, j-1) */
579  /* Right subfile is (i, R) */
580  sz_left = j-L;
581  sz_right = R-i+1;
582  if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
583  /* both subfiles are small */
584  /* if stack empty */
585  if (stack == 0) {
586  done = 1;
587  } else {
588  stack--;
589  L = beg[stack];
590  R = end[stack];
591  }
592  } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
593  /* exactly one of the subfiles is small */
594  /* (L,R) = large subfile */
595  if (sz_left > sz_right) {
596  /* Implicit: L = L; */
597  R = j-1;
598  } else {
599  L = i;
600  /* Implicit: R = R; */
601  }
602  } else {
603  /* none of the subfiles is small */
604  /* push large subfile */
605  /* (L,R) = small subfile */
606  if (sz_left > sz_right) {
607  beg[stack] = L;
608  end[stack] = j-1;
609  stack++;
610  L = i;
611  /* Implicit: R = R */
612  } else {
613  beg[stack] = i;
614  end[stack] = R;
615  stack++;
616  /* Implicit: L = L; */
617  R = j-1;
618  }
619  }
620  } /* while not done */
621  /* Now insertion sort on the concatenation of subfiles */
622  for(i=elements-2; i>=0; i--) {
623  if (GT_OR_NAN(ARR(i),ARR(i+1))) {
624  piv = ARR(i);
625  pid = IDX(i);
626  j = i+1;
627  do {
628  ARR(j-1) = ARR(j);
629  IDX(j-1) = IDX(j);
630  j = j+1;
631  } while(j < elements && GT_OR_NAN(piv, ARR(j)));
632  ARR(j-1) = piv;
633  IDX(j-1) = pid;
634  }
635  }
636 }
637 
638 static void THTensor_(quicksortdescend)(scalar_t *arr, int64_t *idx, int64_t elements, int64_t stride)
639 {
640  int64_t beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
641  scalar_t rswap, piv;
642  unsigned char done = 0;
643 
644  /* beg[0]=0; end[0]=elements; */
645  stack = 0;
646  L = 0; R = elements-1;
647  done = elements-1 <= M_SMALL;
648 
649  while(!done) {
650  /* Use median of three for pivot choice */
651  P=(L+R)>>1;
652  BOTH_SWAP(P, L+1);
653  if (GT_OR_NAN(ARR(R), ARR(L+1))) { BOTH_SWAP(L+1, R); }
654  if (GT_OR_NAN(ARR(R), ARR(L))) { BOTH_SWAP(L, R); }
655  if (GT_OR_NAN(ARR(L), ARR(L+1))) { BOTH_SWAP(L+1, L); }
656 
657  i = L+1; j = R; piv = ARR(L); pid = IDX(L);
658 
659  do {
660  do { i = i+1; } while(GT_OR_NAN(ARR(i), piv));
661  do { j = j-1; } while(GT_OR_NAN(piv, ARR(j)));
662  if (j < i)
663  break;
664  BOTH_SWAP(i, j);
665  } while(1);
666  BOTH_SWAP(L, j);
667  /* Left subfile is (L, j-1) */
668  /* Right subfile is (i, R) */
669  sz_left = j-L;
670  sz_right = R-i+1;
671  if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
672  /* both subfiles are small */
673  /* if stack empty */
674  if (stack == 0) {
675  done = 1;
676  } else {
677  stack--;
678  L = beg[stack];
679  R = end[stack];
680  }
681  } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
682  /* exactly one of the subfiles is small */
683  /* (L,R) = large subfile */
684  if (sz_left > sz_right) {
685  /* Implicit: L = L; */
686  R = j-1;
687  } else {
688  L = i;
689  /* Implicit: R = R; */
690  }
691  } else {
692  /* none of the subfiles is small */
693  /* push large subfile */
694  /* (L,R) = small subfile */
695  if (sz_left > sz_right) {
696  beg[stack] = L;
697  end[stack] = j-1;
698  stack++;
699  L = i;
700  /* Implicit: R = R */
701  } else {
702  beg[stack] = i;
703  end[stack] = R;
704  stack++;
705  /* Implicit: L = L; */
706  R = j-1;
707  }
708  }
709  } /* while not done */
710  /* Now insertion sort on the concatenation of subfiles */
711  for(i=elements-2; i>=0; i--) {
712  if (GT_OR_NAN(ARR(i+1), ARR(i))) {
713  piv = ARR(i);
714  pid = IDX(i);
715  j = i+1;
716  do {
717  ARR(j-1) = ARR(j);
718  IDX(j-1) = IDX(j);
719  j = j+1;
720  } while(j < elements && GT_OR_NAN(ARR(j), piv));
721  ARR(j-1) = piv;
722  IDX(j-1) = pid;
723  }
724  }
725 }
726 
727 #undef MAX_LEVELS
728 #undef M_SMALL
729 
730 void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder)
731 {
732  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyNoScalars)(t), 2, "invalid dimension %d",
733  dimension);
734 
735  THTensor_(resizeAs)(rt_, t);
736  at::Tensor rt__wrap = THTensor_wrap(rt_);
737  at::Tensor t_wrap = THTensor_wrap(t);
738  at::_copy_same_type_(rt__wrap, t_wrap);
739  THLongTensor_resize(ri_, t->sizes(), {});
740 
741  if(descendingOrder)
742  {
743  TH_TENSOR_DIM_APPLY2(scalar_t, rt_, int64_t, ri_, dimension,
744  int64_t i;
745  for(i = 0; i < ri__size; i++)
746  ri__data[i*ri__stride] = i;
747  THTensor_(quicksortdescend)(rt__data, ri__data, rt__size, rt__stride);)
748  }
749  else
750  {
751  TH_TENSOR_DIM_APPLY2(scalar_t, rt_, int64_t, ri_, dimension,
752  int64_t i;
753  for(i = 0; i < ri__size; i++)
754  ri__data[i*ri__stride] = i;
755  THTensor_(quicksortascend)(rt__data, ri__data, rt__size, rt__stride);)
756  }
757 }
758 
759 /* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
760 public domain implementation at http://ndevilla.free.fr/median/median/
761 Adapted similarly to the above Quicksort algorithm.
762 This version does not produce indices along with values. */
763 static void THTensor_(quickselectnoidx)(scalar_t *arr, int64_t k, int64_t elements, int64_t stride)
764 {
765  int64_t P, L, R, i, j;
766  scalar_t rswap, piv;
767  L = 0;
768  R = elements-1;
769 
770  do {
771  if (R <= L) /* One element only */
772  return;
773 
774  if (R == L+1) { /* Two elements only */
775  if (ARR(L) > ARR(R)) {
776  ARR_SWAP(L, R);
777  }
778  return;
779  }
780 
781  /* Use median of three for pivot choice */
782  P=(L+R)>>1;
783  ARR_SWAP(P, L+1);
784  if (ARR(L+1) > ARR(R)) { ARR_SWAP(L+1, R); }
785  if (ARR(L) > ARR(R)) { ARR_SWAP(L, R); }
786  if (ARR(L+1) > ARR(L)) { ARR_SWAP(L+1, L); }
787 
788  i = L+1;
789  j = R;
790  piv = ARR(L);
791  do {
792  do i++; while(ARR(i) < piv);
793  do j--; while(ARR(j) > piv);
794  if (j < i)
795  break;
796  ARR_SWAP(i, j);
797  } while(1);
798  ARR_SWAP(L, j);
799 
800  /* Re-set active partition */
801  if (j <= k) L=i;
802  if (j >= k) R=j-1;
803  } while(1);
804 }
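/* For a NaN-free contiguous slice, THTensor_(quickselectnoidx)(data, k, n, 1) has the same net
 * effect as the standard-library selection below (sketch only, not the code path used here):
 *
 *   #include <algorithm>
 *   void kth_smallest(float *data, int64_t k, int64_t n) {
 *     std::nth_element(data, data + k, data + n);  // afterwards data[k] is the k-th smallest (0-based)
 *   }
 */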
805 
806 /* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
807 public domain implementation at http://ndevilla.free.fr/median/median/
808 Adapted similarly to the above Quicksort algorithm. */
809 static void THTensor_(quickselect)(scalar_t *arr, int64_t *idx, int64_t k, int64_t elements, int64_t stride)
810 {
811  int64_t P, L, R, i, j, swap;
812  scalar_t rswap, piv;
813  L = 0;
814  R = elements-1;
815 
816  do {
817  if (R <= L) /* One element only */
818  return;
819 
820  if (R == L+1) { /* Two elements only */
821  if (ARR(L) > ARR(R)) {
822  BOTH_SWAP(L, R);
823  }
824  return;
825  }
826 
827  /* Use median of three for pivot choice */
828  P=(L+R)>>1;
829  BOTH_SWAP(P, L+1);
830  if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
831  if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
832  if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
833 
834  i = L+1;
835  j = R;
836  piv = ARR(L);
837  do {
838  do i++; while(ARR(i) < piv);
839  do j--; while(ARR(j) > piv);
840  if (j < i)
841  break;
842  BOTH_SWAP(i, j);
843  } while(1);
844  BOTH_SWAP(L, j);
845 
846  /* Re-set active partition */
847  if (j <= k) L=i;
848  if (j >= k) R=j-1;
849  } while(1);
850 }
851 
852 #undef ARR
853 #undef IDX
854 #undef LONG_SWAP
855 #undef REAL_SWAP
856 #undef BOTH_SWAP
857 
858 scalar_t THTensor_(medianall)(THTensor *tensor)
859 {
860  THArgCheck(THTensor_nDimensionLegacyAll(tensor) > 0, 1, "tensor must have at least one dimension");
861 
862  scalar_t theMedian;
863  ptrdiff_t numel;
864  int64_t k;
865  THTensor *temp_;
866  scalar_t *temp__data;
867 
868  numel = THTensor_(nElement)(tensor);
869  k = (numel-1) >> 1;
870 
871  temp_ = THTensor_(newClone)(tensor);
872  temp__data = temp_->data<scalar_t>();
873 
874  THTensor_(quickselectnoidx)(temp__data, k, numel, 1);
875 
876  theMedian = temp__data[k];
877 
878  c10::raw::intrusive_ptr::decref(temp_);
879 
880  return theMedian;
881 }
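/* Example: for a tensor holding {5, 1, 3}, numel = 3 and k = (3-1)>>1 = 1, so quickselect
 * leaves the 0-based 1st-smallest element, 3, at temp__data[1] and 3 is returned. For an even
 * number of elements the lower of the two middle values is returned (no averaging). */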
882 
883 void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
884 {
885  THTensor *temp_;
886  THLongTensor *tempi_;
887  scalar_t *temp__data;
888  int64_t *tempi__data;
889  int64_t t_size_dim;
890 
891  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "dimension out of range");
892 
893  int in_dims = THTensor_(nDimensionLegacyAll)(t);
894  THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim);
895  THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim);
896  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
897  dim[dimension] = 1;
898  THTensor_(resize)(values_, dim, {});
899  THLongTensor_resize(indices_, dim, {});
900 
901  t_size_dim = THTensor_sizeLegacyNoScalars(t, dimension);
902 
903  temp_ = THTensor_(new)();
904  THTensor_(resize1d)(temp_, t_size_dim);
905  temp__data = temp_->data<scalar_t>();
906 
907  tempi_ = THLongTensor_new();
908  THLongTensor_resize1d(tempi_, t_size_dim);
909  tempi__data = THLongTensor_data(tempi_);
910 
911  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, values_, int64_t, indices_, dimension,
912  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
913  int64_t i;
914  scalar_t mode = 0;
915  int64_t modei = 0;
916  int64_t temp_freq = 0;
917  int64_t max_freq = 0;
918  for(i = 0; i < t_size_dim; i++)
919  temp__data[i] = t_data[i*t_stride];
920  for(i = 0; i < t_size_dim; i++)
921  tempi__data[i] = i;
922  THTensor_(quicksortascend)(temp__data, tempi__data, t_size_dim, 1);
923 
924  for(i = 0; i < t_size_dim; i++)
925  {
926  temp_freq++;
927  if ((i == t_size_dim - 1) || (temp__data[i] != temp__data[i+1]))
928  {
929  if (temp_freq > max_freq)
930  {
931  mode = temp__data[i];
932  modei = tempi__data[i];
933  max_freq = temp_freq;
934  }
935  temp_freq = 0;
936  }
937  }
938  *values__data = mode;
939  *indices__data = modei;);
940 
941  c10::raw::intrusive_ptr::decref(temp_);
942  THLongTensor_free(tempi_);
943  if (!keepdim) {
944  THTensor_(squeeze1d)(values_, values_, dimension);
945  THLongTensor_squeeze1d(indices_, indices_, dimension);
946  }
947 }
948 
949 void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t k, int dimension, int keepdim)
950 {
951  THTensor *temp_;
952  THLongTensor *tempi_;
953  scalar_t *temp__data;
954  int64_t *tempi__data;
955  int64_t t_size_dim;
956 
957  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "dimension out of range");
958  THArgCheck(k > 0 && k <= THTensor_sizeLegacyNoScalars(t, dimension), 2, "selected index out of range");
959 
960  int in_dims = THTensor_(nDimensionLegacyAll)(t);
961  THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim);
962  THLongTensor_preserveReduceDimSemantics(indices_, in_dims, dimension, keepdim);
963  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
964  dim[dimension] = 1;
965  THTensor_(resize)(values_, dim, {});
966  THLongTensor_resize(indices_, dim, {});
967 
968  t_size_dim = THTensor_sizeLegacyNoScalars(t, dimension);
969 
970  temp_ = THTensor_(new)();
971  THTensor_(resize1d)(temp_, t_size_dim);
972  temp__data = temp_->data<scalar_t>();
973 
974  tempi_ = THLongTensor_new();
975  THLongTensor_resize1d(tempi_, t_size_dim);
976  tempi__data = THLongTensor_data(tempi_);
977 
978  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, values_, int64_t, indices_, dimension,
979  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
980  int64_t i;
981  for(i = 0; i < t_size_dim; i++)
982  temp__data[i] = t_data[i*t_stride];
983  for(i = 0; i < t_size_dim; i++)
984  tempi__data[i] = i;
985  THTensor_(quickselect)(temp__data, tempi__data, k - 1, t_size_dim, 1);
986  *values__data = temp__data[k-1];
987  *indices__data = tempi__data[k-1];);
988 
989  c10::raw::intrusive_ptr::decref(temp_);
990  THLongTensor_free(tempi_);
991  if (!keepdim) {
992  THTensor_(squeeze1d)(values_, values_, dimension);
993  THLongTensor_squeeze1d(indices_, indices_, dimension);
994  }
995 }
996 
997 void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
998 {
999  int64_t t_size_dim, k;
1000 
1001  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "dimension out of range");
1002 
1003  t_size_dim = THTensor_sizeLegacyNoScalars(t, dimension);
1004  k = (t_size_dim-1) >> 1; /* take middle or one-before-middle element */
1005 
1006  THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim);
1007 }
1008 
1009 void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted)
1010 {
1011  int numDims = THTensor_(nDimensionLegacyNoScalars)(t);
1012  THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range");
1013 
1014  int64_t sliceSize = THTensor_sizeLegacyNoScalars(t, dim);
1015  THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension");
1016 
1017  THTensor *tmpResults = THTensor_(new)();
1018  THTensor_(resize1d)(tmpResults, sliceSize);
1019  scalar_t *tmp__data = tmpResults->data<scalar_t>();
1020 
1021  THLongTensor *tmpIndices = THLongTensor_new();
1022  THLongTensor_resize1d(tmpIndices, sliceSize);
1023  int64_t *tmpi__data = THLongTensor_data(tmpIndices);
1024 
1025  std::vector<int64_t> topKSize = t->sizes().vec();
1026  if (topKSize.size() > 0) { // handle 0-dim vs 1-dim differences.
1027  topKSize[dim] = k;
1028  }
1029  THTensor_(resize)(rt_, topKSize, {});
1030  THLongTensor_resize(ri_, topKSize, {});
1031 
1032  if (dir) {
1033  /* k largest elements, descending order (optional: see sorted) */
1034  int64_t K = sliceSize - k;
1035  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, rt_, int64_t, ri_, dim,
1036  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
1037  int64_t i;
1038  for(i = 0; i < sliceSize; i++)
1039  {
1040  tmp__data[i] = t_data[i*t_stride];
1041  tmpi__data[i] = i;
1042  }
1043  if (K > 0)
1044  THTensor_(quickselect)(tmp__data, tmpi__data, K - 1, sliceSize, 1);
1045  if (sorted)
1046  THTensor_(quicksortdescend)(tmp__data + K, tmpi__data + K, k, 1);
1047  for(i = 0; i < k; i++)
1048  {
1049  rt__data[i*rt__stride] = tmp__data[i + K];
1050  ri__data[i*ri__stride] = tmpi__data[i + K];
1051  })
1052  }
1053  else {
1054  /* k smallest elements, ascending order (optional: see sorted) */
1055  TH_TENSOR_DIM_APPLY3(scalar_t, t, scalar_t, rt_, int64_t, ri_, dim,
1056  TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM,
1057  int64_t i;
1058  for(i = 0; i < sliceSize; i++)
1059  {
1060  tmp__data[i] = t_data[i*t_stride];
1061  tmpi__data[i] = i;
1062  }
1063  THTensor_(quickselect)(tmp__data, tmpi__data, k - 1, sliceSize, 1);
1064  if (sorted)
1065  THTensor_(quicksortascend)(tmp__data, tmpi__data, k - 1, 1);
1066  for(i = 0; i < k; i++)
1067  {
1068  rt__data[i*rt__stride] = tmp__data[i];
1069  ri__data[i*ri__stride] = tmpi__data[i];
1070  })
1071  }
1072 
1073  c10::raw::intrusive_ptr::decref(tmpResults);
1074  THLongTensor_free(tmpIndices);
1075 }
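/* Example (dir != 0, i.e. k largest, with k = 2 over the slice {4, 1, 3, 2}): K = sliceSize - k
 * = 2, so quickselect partitions around the 0-based index K-1 = 1 and the two largest values
 * {3, 4} end up in tmp__data[2..3]; the optional descending quicksort then orders that tail as
 * {4, 3} before it is copied into rt_ and ri_. */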
1076 
1077 void THTensor_(triu)(THTensor *r_, THTensor *t, int64_t k)
1078 {
1079  int64_t t_size_0, t_size_1;
1080  int64_t t_stride_0, t_stride_1;
1081  int64_t r__stride_0, r__stride_1;
1082  scalar_t *t_data, *r__data;
1083  int64_t r, c;
1084 
1085  THArgCheck(THTensor_(nDimensionLegacyAll)(t) == 2, 1, "expected a matrix");
1086 
1087  THTensor_(resizeAs)(r_, t);
1088 
1089  t_size_0 = THTensor_(size)(t, 0);
1090  t_size_1 = THTensor_(size)(t, 1);
1091  t_stride_0 = THTensor_(stride)(t, 0);
1092  t_stride_1 = THTensor_(stride)(t, 1);
1093  r__stride_0 = THTensor_(stride)(r_, 0);
1094  r__stride_1 = THTensor_(stride)(r_, 1);
1095  r__data = r_->data<scalar_t>();
1096  t_data = t->data<scalar_t>();
1097 
1098  for(r = 0; r < t_size_0; r++)
1099  {
1100  int64_t sz = THMin(r+k, t_size_1);
1101  for(c = THMax(0, r+k); c < t_size_1; c++)
1102  r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
1103  for(c = 0; c < sz; c++)
1104  r__data[r*r__stride_0+c*r__stride_1] = 0;
1105  }
1106 }
1107 
1108 int THTensor_(equal)(THTensor *ta, THTensor* tb)
1109 {
1110  int equal = 1;
1111  if(!THTensor_(isSameSizeAs)(ta, tb))
1112  return 0;
1113 
1114  if (THTensor_(isContiguous)(ta) && THTensor_(isContiguous)(tb)) {
1115  scalar_t *tap = ta->data<scalar_t>();
1116  scalar_t *tbp = tb->data<scalar_t>();
1117  ptrdiff_t sz = THTensor_(nElement)(ta);
1118  ptrdiff_t i;
1119  for (i=0; i<sz; ++i){
1120  if(tap[i] != tbp[i]) return 0;
1121  }
1122  } else {
1123  // Short-circuit the apply function on inequality
1124  TH_TENSOR_APPLY2(scalar_t, ta, scalar_t, tb,
1125  if (equal && *ta_data != *tb_data) {
1126  equal = 0;
1127  TH_TENSOR_APPLY_hasFinished = 1; break;
1128  })
1129  }
1130  return equal;
1131 }
1132 
1133 #define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \
1134  void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, scalar_t value) \
1135  { \
1136  THByteTensor_resizeNd(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
1137  TH_TENSOR_APPLY2(unsigned char, r_, scalar_t, t, \
1138  *r__data = (*t_data OP value) ? 1 : 0;); \
1139  } \
1140  void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, scalar_t value) \
1141  { \
1142  THTensor_(resizeNd)(r_, t->dim(), THTensor_getSizePtr(t), NULL); \
1143  TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, \
1144  *r__data = (*t_data OP value) ? 1 : 0;); \
1145  } \
1146  void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
1147  { \
1148  THByteTensor_resizeNd(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
1149  TH_TENSOR_APPLY3(unsigned char, r_, scalar_t, ta, scalar_t, tb, \
1150  *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
1151  } \
1152  void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
1153  { \
1154  THTensor_(resizeNd)(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \
1155  TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, ta, scalar_t, tb, \
1156  *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \
1157  } \
1158 
1159 
1160 TENSOR_IMPLEMENT_LOGICAL(lt,<)
1161 TENSOR_IMPLEMENT_LOGICAL(gt,>)
1162 TENSOR_IMPLEMENT_LOGICAL(le,<=)
1163 TENSOR_IMPLEMENT_LOGICAL(ge,>=)
1164 TENSOR_IMPLEMENT_LOGICAL(eq,==)
1165 TENSOR_IMPLEMENT_LOGICAL(ne,!=)
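/* Each instantiation above stamps out four comparison kernels; e.g. TENSOR_IMPLEMENT_LOGICAL(lt,<)
 * defines
 *   THTensor_(ltValue)   -- byte mask, tensor < scalar
 *   THTensor_(ltValueT)  -- mask of the tensor's own type, tensor < scalar
 *   THTensor_(ltTensor)  -- byte mask, elementwise tensor < tensor
 *   THTensor_(ltTensorT) -- mask of the tensor's own type, elementwise tensor < tensor
 * each writing 1 where the comparison holds and 0 elsewhere. */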
1166 
1167 
1168 #ifdef _OPENMP
1169 
1170 #define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, OMP_THRESHOLD) \
1171  void THTensor_(NAME)(THTensor *r_, THTensor *t) \
1172  { \
1173  THTensor_(resizeAs)(r_, t); \
1174  ptrdiff_t r_Size = THTensor_(nElement)(r_); \
1175  int r_Contig = THTensor_(isContiguous)(r_); \
1176  int tContig = THTensor_(isContiguous)(t); \
1177  int inOMP = omp_in_parallel(); \
1178  if( !inOMP ){ \
1179  TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);, OMP_THRESHOLD); \
1180  } else { \
1181  TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);); \
1182  } \
1183  }
1184 
1185 #define LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \
1186  LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)
1187 
1188 #define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, OMP_THRESHOLD) \
1189  void THTensor_(NAME)(THTensor *r_, THTensor *t) \
1190  { \
1191  THTensor_(resizeAs)(r_, t); \
1192  ptrdiff_t r_Size = THTensor_(nElement)(r_); \
1193  int r_Contig = THTensor_(isContiguous)(r_); \
1194  int tContig = THTensor_(isContiguous)(t); \
1195  if (r_Contig && tContig) { \
1196  TH_TENSOR_APPLY2_CONTIG(scalar_t, r_, scalar_t, t, THVector_(NAME)(r__data, t_data, r__len);); \
1197  } else { \
1198  int inOMP = omp_in_parallel(); \
1199  if( !inOMP ){ \
1200  TH_TENSOR_APPLY2_OMP(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);, OMP_THRESHOLD); \
1201  } \
1202  else { \
1203  TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, *r__data = CFUNC(*t_data);); \
1204  } \
1205  } \
1206  }
1207 
1208 #define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \
1209  LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD)
1210 
1211 #else
1212 
1213 #define LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC) \
1214  void THTensor_(NAME)(THTensor *r_, THTensor *t) \
1215  { \
1216  THTensor_(resizeAs)(r_, t); \
1217  TH_TENSOR_APPLY2(scalar_t, t, scalar_t, r_, *r__data = CFUNC(*t_data);); \
1218  } \
1219 
1220 #define LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS(NAME, CFUNC, PSEUDO_OMP_THRESHOLD) \
1221  LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS(NAME, CFUNC)
1222 
1223 #define LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC) \
1224  void THTensor_(NAME)(THTensor *r_, THTensor *t) \
1225  { \
1226  THTensor_(resizeAs)(r_, t); \
1227  int r_Contig = THTensor_(isContiguous)(r_); \
1228  int tContig = THTensor_(isContiguous)(t); \
1229  if (r_Contig && tContig) { \
1230  TH_TENSOR_APPLY2_CONTIG(scalar_t, r_, scalar_t, t, THVector_(NAME)(r__data, t_data, r__len);); \
1231  } else { \
1232  TH_TENSOR_APPLY2(scalar_t, t, scalar_t, r_, *r__data = CFUNC(*t_data);); \
1233  } \
1234  } \
1235 
1236 #define LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS(NAME, CFUNC, PSEUDO_OMP_THRESHOLD) \
1237  LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS(NAME, CFUNC)
1238 
1239 #endif
1240 
1241 #define EXPAND(...) __VA_ARGS__
1242 
1243 #define GET_4TH_ARG(ARG0, ARG1, ARG2, ARG3, ...) ARG3
1244 
1245 #define LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(...) \
1246  EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS, LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, ))
1247 
1248 #define LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(...) \
1249  EXPAND(GET_4TH_ARG(__VA_ARGS__, LAB_IMPLEMENT_VECTORIZED_FUNCTION_3_ARGS, LAB_IMPLEMENT_VECTORIZED_FUNCTION_2_ARGS, ))
1250 
1251 #define LAB_IMPLEMENT_BASIC_FUNCTION(...) EXPAND(LAB_IMPLEMENT_BASIC_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__))
1252 
1253 #define LAB_IMPLEMENT_VECTORIZED_FUNCTION(...) EXPAND(LAB_IMPLEMENT_VECTORIZED_FUNCTION_CHOOSE(__VA_ARGS__)(__VA_ARGS__))
1254 
1255 /*
1256  * LAB_IMPLEMENT_BASIC_FUNCTION is a macro with an optional third parameter, so it can be used flexibly.
1257  * If OpenMP is unavailable, the macro discards the (then unused) OpenMP threshold; if the threshold is omitted, a default one is supplied.
1258  * In other words:
1259  * (A) If OpenMP is unavailable, both usages below are valid.
1260  * (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD) // the unused OpenMP threshold is discarded
1261  * (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity)
1262  * (B) If OpenMP is available, both usages below are also valid.
1263  * (1) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity, OMP_OVERHEAD_THRESHOLD)
1264  * (2) LAB_IMPLEMENT_BASIC_FUNCTION(type_func, func_entity) // the default OpenMP threshold is used
1265  * The same applies to LAB_IMPLEMENT_VECTORIZED_FUNCTION.
1266 */
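/* Dispatch example: the three-argument use further below,
 *   LAB_IMPLEMENT_BASIC_FUNCTION(cosh, TH_MATH_NAME(cosh), HYPER_TH_OMP_OVERHEAD_THRESHOLD)
 * expands GET_4TH_ARG(cosh, TH_MATH_NAME(cosh), HYPER_TH_OMP_OVERHEAD_THRESHOLD,
 * LAB_IMPLEMENT_BASIC_FUNCTION_3_ARGS, LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, ) and so selects the
 * _3_ARGS variant; a two-argument use such as LAB_IMPLEMENT_BASIC_FUNCTION(abs, labs) shifts the
 * argument list so the fourth slot is LAB_IMPLEMENT_BASIC_FUNCTION_2_ARGS, which falls back to
 * the default OpenMP threshold. */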
1267 
1268 LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)
1269 
1270 #if defined(TH_REAL_IS_LONG)
1271 LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs)
1272 #endif /* int64_t only part */
1273 
1274 #if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_CHAR)
1275 LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
1276 #endif /* int only part */
1277 
1278 #if defined(TH_REAL_IS_BYTE)
1279 LAB_IMPLEMENT_BASIC_FUNCTION(abs,)
1280 #endif /* for byte, abs is the identity since the type is unsigned */
1281 
1282 /* floating point only now */
1283 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
1284 
1285 #if defined (TH_REAL_IS_FLOAT)
1286 #define TH_MATH_NAME(fn) fn##f
1287 #else
1288 #define TH_MATH_NAME(fn) fn
1289 #endif
1290 
1291 LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma))
1292 LAB_IMPLEMENT_BASIC_FUNCTION(digamma,TH_MATH_NAME(TH_digamma))
1293 LAB_IMPLEMENT_BASIC_FUNCTION(trigamma,TH_MATH_NAME(TH_trigamma))
1294 LAB_IMPLEMENT_BASIC_FUNCTION(erfinv,TH_erfinv)
1295 LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs))
1296 LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac))
1297 LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / )
1298 
1299 LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
1300 LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
1301 LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
1302 LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
1303 LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
1304 
1305 LAB_IMPLEMENT_VECTORIZED_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid),HYPER_TH_OMP_OVERHEAD_THRESHOLD)
1306 
1307 void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty)
1308 {
1309  THTensor_(resizeAs)(r_, tx);
1310  TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, tx, scalar_t, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data););
1311 }
1312 
1313 void THTensor_(polygamma)(THTensor *r_, int64_t n, THTensor *t) {
1314  switch (n) {
1315  case 0: THTensor_(digamma)(r_, t); return;
1316  case 1: THTensor_(trigamma)(r_, t); return;
1317  default: THError("polygamma(n,x) is not implemented for n>=2");
1318  }
1319 }
1320 
1321 void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim)
1322 {
1323  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "invalid dimension %d",
1324  dimension);
1325 
1326  THTensor_(preserveReduceDimSemantics)(r_, THTensor_(nDimensionLegacyAll)(t), dimension, keepdim);
1327  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
1328  dim[dimension] = 1;
1329  THTensor_(resize)(r_, dim, {});
1330 
1331  TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
1332  // Uses Welford's algorithm for numeric stability
1333  accreal mean = 0;
1334  accreal M2 = 0;
1335 
1336  int64_t i;
1337  for (i = 0; i < t_size; i++)
1338  {
1339  scalar_t z = t_data[i*t_stride];
1340  scalar_t delta = z - mean;
1341  mean += delta / (i + 1);
1342  scalar_t delta2 = z - mean;
1343  M2 += delta * delta2;
1344  }
1345 
1346  if (biased && t_size >= 2)
1347  {
1348  *r__data = TH_MATH_NAME(sqrt)(M2 / t_size);
1349  } else if (!biased && t_size >= 2) {
1350  *r__data = TH_MATH_NAME(sqrt)(M2 / (t_size - 1));
1351  } else if (biased && t_size == 1) {
1352  *r__data = 0;
1353  } else {
1354  *r__data = NAN;
1355  });
1356 
1357  if (!keepdim) {
1358  THTensor_(squeeze1d)(r_, r_, dimension);
1359  }
1360 }
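/* Welford reference (the same one-pass update used above and in var() below): after n values,
 * `mean` is the running mean and M2 the sum of squared deviations from it, so the unbiased
 * variance is M2/(n-1) and the biased one M2/n. Minimal standalone sketch:
 *
 *   double variance(const double *x, int64_t n, int unbiased) {
 *     double mean = 0, M2 = 0;
 *     for (int64_t i = 0; i < n; i++) {
 *       double delta = x[i] - mean;       // deviation from the old mean
 *       mean += delta / (i + 1);          // update the running mean
 *       M2 += delta * (x[i] - mean);      // delta * delta2, as in the kernels
 *     }
 *     return M2 / (unbiased ? n - 1 : n); // caller must ensure n >= 2 for the unbiased case
 *   }
 */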
1361 
1362 void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim)
1363 {
1364  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "invalid dimension %d",
1365  dimension);
1366 
1367  THTensor_(preserveReduceDimSemantics)(r_, THTensor_(nDimensionLegacyAll)(t), dimension, keepdim);
1368  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
1369  dim[dimension] = 1;
1370  THTensor_(resize)(r_, dim, {});
1371 
1372  TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension,
1373  // Uses Welford's algorithm for numeric stability
1374  accreal mean = 0;
1375  accreal M2 = 0;
1376 
1377  int64_t i;
1378  for (i = 0; i < t_size; i++)
1379  {
1380  scalar_t z = t_data[i*t_stride];
1381  scalar_t delta = z - mean;
1382  mean += delta / (i + 1);
1383  scalar_t delta2 = z - mean;
1384  M2 += delta * delta2;
1385  }
1386 
1387  if (biased && t_size >= 2)
1388  {
1389  *r__data = M2 / t_size;
1390  } else if (!biased && t_size >= 2) {
1391  *r__data = M2 / (t_size - 1);
1392  } else if (biased && t_size == 1) {
1393  *r__data = 0;
1394  } else {
1395  *r__data = NAN;
1396  });
1397 
1398  if (!keepdim) {
1399  THTensor_(squeeze1d)(r_, r_, dimension);
1400  }
1401 }
1402 
1403 void THTensor_(norm)(THTensor *r_, THTensor *t, scalar_t value, int dimension, int keepdim)
1404 {
1405  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(t), 3, "invalid dimension %d",
1406  dimension);
1407 
1408  THTensor_(preserveReduceDimSemantics)(r_, THTensor_(nDimensionLegacyAll)(t), dimension, keepdim);
1409  std::vector<int64_t> dim = THTensor_sizesLegacyNoScalars(t);
1410  dim[dimension] = 1;
1411  THTensor_(resize)(r_, dim, {});
1412 
1413  #define DIM_REDUCE(reduce, transform, init) \
1414  TH_TENSOR_DIM_APPLY2(scalar_t, t, scalar_t, r_, dimension, \
1415  accreal sum = init; \
1416  int64_t i; \
1417  for(i = 0; i < t_size; i++) { \
1418  (reduce); \
1419  } \
1420  (transform);) \
1421 
1422  if(value == 0) {
1423  DIM_REDUCE(sum += t_data[i*t_stride] != 0.0,
1424  *r__data = sum, 0);
1425  } else if (value == 1) {
1426  DIM_REDUCE(sum += TH_MATH_NAME(fabs)(t_data[i*t_stride]),
1427  *r__data = sum, 0);
1428  } else if (value == 2) {
1429  DIM_REDUCE(sum += t_data[i*t_stride] * t_data[i*t_stride],
1430  *r__data = TH_MATH_NAME(sqrt)(sum), 0);
1431  } else if (value == 3) {
1432  DIM_REDUCE(sum += TH_MATH_NAME(fabs)(t_data[i*t_stride] * t_data[i*t_stride] * t_data[i*t_stride]),
1433  *r__data = TH_MATH_NAME(pow)(sum, 1.0/3), 0);
1434  } else if (value == INFINITY) {
1435  DIM_REDUCE(sum = THMax(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])),
1436  *r__data = sum, 0);
1437  } else if (value == -INFINITY) {
1438  DIM_REDUCE(sum = THMin(sum, TH_MATH_NAME(fabs)(t_data[i*t_stride])),
1439  *r__data = sum, INFINITY);
1440  } else {
1441  DIM_REDUCE(sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(t_data[i*t_stride]), value),
1442  *r__data = TH_MATH_NAME(pow)(sum, 1.0/value), 0);
1443  }
1444 
1445  if (!keepdim) {
1446  THTensor_(squeeze1d)(r_, r_, dimension);
1447  }
1448  #undef DIM_REDUCE
1449 }
1450 
1451 accreal THTensor_(normall)(THTensor *tensor, scalar_t value)
1452 {
1453  accreal sum = 0;
1454  if(value == 0) {
1455  TH_TENSOR_APPLY(scalar_t, tensor, sum += *tensor_data != 0.0;);
1456  return sum;
1457  } else if(value == 1) {
1458  TH_TENSOR_APPLY(scalar_t, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data););
1459  return sum;
1460  } else if(value == 2) {
1461  TH_TENSOR_APPLY(scalar_t, tensor, accreal z = *tensor_data; sum += z*z;);
1462  return sqrt(sum);
1463  } else if(value == 3) {
1464  TH_TENSOR_APPLY(scalar_t, tensor, accreal z = *tensor_data; sum += std::abs(z*z*z););
1465  return TH_MATH_NAME(pow)(sum, 1.0/3);
1466  } else if(value == INFINITY) {
1467  TH_TENSOR_APPLY(scalar_t, tensor, sum = THMax(sum, TH_MATH_NAME(fabs)(*tensor_data)););
1468  return sum;
1469  } else if(value == -INFINITY) {
1470  sum = INFINITY;
1471  TH_TENSOR_APPLY(scalar_t, tensor, sum = THMin(sum, TH_MATH_NAME(fabs)(*tensor_data)););
1472  return sum;
1473  } else {
1474  TH_TENSOR_APPLY(scalar_t, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value););
1475  return TH_MATH_NAME(pow)(sum, 1.0/value);
1476  }
1477 }
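/* Worked example: for a tensor holding {3, -4, 0}, normall returns 2 for value == 0 (count of
 * nonzeros), 7 for value == 1, 5 for value == 2, 4 for value == INFINITY (largest magnitude)
 * and 0 for value == -INFINITY (smallest magnitude). */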
1478 
1479 void THTensor_(renorm)(THTensor *res, THTensor *src, scalar_t value, int dimension, scalar_t maxnorm)
1480 {
1481  THTensor *rowR, *rowS;
1482 
1483  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyNoScalars)(src), 3, "invalid dimension %d",
1484  dimension);
1485  THArgCheck(value > 0, 2, "non-positive-norm not supported");
1486  THArgCheck(THTensor_(nDimensionLegacyNoScalars)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions",
1487  THTensor_(nDimensionLegacyNoScalars)(src));
1488 
1489  rowR = THTensor_(new)();
1490  rowS = THTensor_(new)();
1491 
1492  THTensor_(resizeAs)(res, src);
1493 
1494  for (int64_t i = 0; i < THTensor_sizeLegacyNoScalars(src, dimension); i++)
1495  {
1496  scalar_t norm = 0;
1497  scalar_t new_norm;
1498 
1499  THTensor_(select)(rowS, src, dimension, i);
1500  THTensor_(select)(rowR, res, dimension, i);
1501  if (value == 1) {
1502  TH_TENSOR_APPLY(scalar_t, rowS, norm += fabs(*rowS_data););
1503  } else if (value == 2) {
1504  TH_TENSOR_APPLY(scalar_t, rowS, accreal z = *rowS_data; norm += z*z;);
1505  } else if (value == INFINITY) {
1506  TH_TENSOR_APPLY(scalar_t, rowS, norm = THMax(norm, TH_MATH_NAME(fabs)(*rowS_data)););
1507  } else {
1508  TH_TENSOR_APPLY(scalar_t, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value););
1509  }
1510 
1511  if (value != INFINITY) {
1512  norm = pow(norm, 1/value);
1513  }
1514 
1515  if (norm > maxnorm)
1516  {
1517  new_norm = maxnorm / (norm + 1e-7);
1518 
1519  TH_TENSOR_APPLY2(
1520  scalar_t, rowR, scalar_t, rowS,
1521  *rowR_data = (*rowS_data) * new_norm;
1522  )
1523  }
1524  else
1525  {
1526  at::Tensor rowR_wrap = THTensor_wrap(rowR);
1527  at::Tensor rowS_wrap = THTensor_wrap(rowS);
1528  at::_copy_same_type_(rowR_wrap, rowS_wrap);
1529  }
1530  }
1531 
1532  c10::raw::intrusive_ptr::decref(rowR);
1533  c10::raw::intrusive_ptr::decref(rowS);
1534 }
1535 
1536 accreal THTensor_(dist)(THTensor *tensor, THTensor *src, scalar_t value)
1537 {
1538  scalar_t sum;
1539  if (value == INFINITY) {
1540  sum = -1.0;
1541  TH_TENSOR_APPLY2(scalar_t, tensor, scalar_t, src,
1542  sum = THMax(sum, TH_MATH_NAME(fabs)(*tensor_data - *src_data)););
1543  return sum;
1544  } else if (value == -INFINITY) {
1545  sum = INFINITY;
1546  TH_TENSOR_APPLY2(scalar_t, tensor, scalar_t, src,
1547  sum = THMin(sum, TH_MATH_NAME(fabs)(*tensor_data - *src_data)););
1548  return sum;
1549  } else if (value == 0.0) {
1550  sum = 0.0;
1551  TH_TENSOR_APPLY2(scalar_t, tensor, scalar_t, src,
1552  sum += (*tensor_data - *src_data != 0.0););
1553  return sum;
1554  } else {
1555  sum = 0.0;
1556  TH_TENSOR_APPLY2(scalar_t, tensor, scalar_t, src,
1557  sum += TH_MATH_NAME(pow)(
1558  TH_MATH_NAME(fabs)(*tensor_data - *src_data), value););
1559  return TH_MATH_NAME(pow)(sum, 1.0/value);
1560  }
1561 }
1562 
1563 accreal THTensor_(meanall)(THTensor *tensor)
1564 {
1565  return THTensor_(sumall)(tensor)/THTensor_(nElement)(tensor);
1566 }
1567 
1568 accreal THTensor_(varall)(THTensor *tensor, int biased)
1569 {
1570  accreal mean = THTensor_(meanall)(tensor);
1571  accreal sum = 0;
1572  TH_TENSOR_APPLY(scalar_t, tensor, sum += (*tensor_data - mean)*(*tensor_data - mean););
1573  sum /= std::max<int64_t>(0, THTensor_(nElement)(tensor) - (biased ? 0 : 1));
1574  return sum;
1575 }
1576 
1577 accreal THTensor_(stdall)(THTensor *tensor, int biased)
1578 {
1579  return sqrt(THTensor_(varall)(tensor, biased));
1580 }
1581 
1582 void THTensor_(histc)(THTensor *hist, THTensor *tensor, int64_t nbins, scalar_t minvalue, scalar_t maxvalue)
1583 {
1584  if (nbins <= 0) {
1585  THError("bins must be > 0");
1586  }
1587  scalar_t minval;
1588  scalar_t maxval;
1589  scalar_t *h_data;
1590 
1591  THTensor_(resize1d)(hist, nbins);
1592  THTensor_(zero)(hist);
1593  minval = minvalue;
1594  maxval = maxvalue;
1595  if (minval == maxval)
1596  {
1597  minval = THTensor_(minall)(tensor);
1598  maxval = THTensor_(maxall)(tensor);
1599  }
1600  if (minval == maxval)
1601  {
1602  minval = minval - 1;
1603  maxval = maxval + 1;
1604  }
1605 
1606  h_data = hist->data<scalar_t>();
1607 
1608  TH_TENSOR_APPLY(scalar_t, tensor,
1609  if (*tensor_data >= minval && *tensor_data <= maxval) {
1610  const int bin = (int)((*tensor_data-minval) / (maxval-minval) * nbins);
1611  h_data[THMin(bin, nbins-1)] += 1;
1612  }
1613  );
1614 }
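/* Binning example: with nbins = 4 over [0, 8], a value of 5 lands in bin
 * (int)((5-0)/(8-0)*4) = 2, while the right edge 8 maps to bin 4 and is clamped to the last
 * bin (index 3) by THMin(bin, nbins-1). */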
1615 
1616 void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, scalar_t minvalue, scalar_t maxvalue)
1617 {
1618  THArgCheck(THTensor_(nDimensionLegacyAll)(tensor) < 3, 2, "invalid dimension %d, the input must be a 2d tensor", THTensor_(nDimensionLegacyAll)(tensor));
1619 
1620  int dimension = 1;
1621  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimensionLegacyAll)(tensor), 2, "invalid dimension %d",
1622  dimension);
1623 
1624  scalar_t minval;
1625  scalar_t maxval;
1626 
1627  THTensor_(resize2d)(hist, THTensor_sizeLegacyNoScalars(tensor, 0), nbins);
1628  THTensor_(zero)(hist);
1629 
1630  minval = minvalue;
1631  maxval = maxvalue;
1632  if (minval == maxval)
1633  {
1634  minval = THTensor_(minall)(tensor);
1635  maxval = THTensor_(maxall)(tensor);
1636  }
1637  if (minval == maxval)
1638  {
1639  minval = minval - 1;
1640  maxval = maxval + 1;
1641  }
1642 
1643  TH_TENSOR_DIM_APPLY2(scalar_t, tensor, scalar_t, hist, dimension, int64_t i;
1644  for(i = 0; i < tensor_size; i++)
1645  {
1646  if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) {
1647  const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins);
1648  hist_data[THMin(bin, nbins-1)] += 1;
1649  }
1650  }
1651  );
1652 }
1653 
1654 // Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha.
1655 // Assumes x is close to zero and uses a Taylor expansion.
1656 static inline scalar_t THTensor_(beta_grad_alpha_small)(scalar_t x, scalar_t alpha, scalar_t beta) {
1657  const scalar_t factor = TH_MATH_NAME(TH_digamma)(alpha) - TH_MATH_NAME(TH_digamma)(alpha + beta) - TH_MATH_NAME(log)(x);
1658  scalar_t numer = 1;
1659  scalar_t series = numer / alpha * (factor + 1 / alpha);
1660  for (int i = 1; i <= 10; ++i) {
1661  numer *= (i - beta) * x / i;
1662  const scalar_t denom = alpha + i;
1663  series += numer / denom * (factor + 1 / denom);
1664  }
1665  const scalar_t result = x * TH_MATH_NAME(pow)(1 - x, -beta) * series;
1666  return th_isnan(result) ? 0.0 : result;
1667 }
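
// For reference, the truncated series accumulated above is
//   result = x * (1 - x)^(-beta)
//          * sum_{i=0}^{10} n_i / (alpha + i) * (F + 1 / (alpha + i)),
// with F = digamma(alpha) - digamma(alpha + beta) - log(x), n_0 = 1 and
// n_i = prod_{k=1}^{i} (k - beta) * x / k. The final th_isnan guard maps a
// NaN arising from the expansion (for example when x == 0 and log(x) is
// -inf) to a zero gradient.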
1668 
1669 // Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta.
1670 // Assumes x is close to zero and uses a Taylor expansion.
1671 static inline scalar_t THTensor_(beta_grad_beta_small)(scalar_t x, scalar_t alpha, scalar_t beta) {
1672  const scalar_t factor = TH_MATH_NAME(TH_digamma)(alpha+beta) - TH_MATH_NAME(TH_digamma)(beta);
1673  scalar_t numer = 1;
1674  scalar_t betas = 1;
1675  scalar_t dbetas = 0;
1676  scalar_t series = factor / alpha;
1677  for (int i = 1; i <= 8; ++i) {
1678  numer *= -x / i;
1679  dbetas = dbetas * (beta - i) + betas;
1680  betas = betas * (beta - i);
1681  series += numer / (alpha + i) * (dbetas + factor * betas);
1682  }
1683  const scalar_t result = -TH_MATH_NAME(pow)(1 - x, 1 - beta) * series;
1684  return th_isnan(result) ? 0.0 : result;
1685 }
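
// In the loop above, betas tracks the product prod_{k=1}^{i} (beta - k) and
// dbetas its derivative with respect to beta, maintained by the product-rule
// recurrence dbetas_i = dbetas_{i-1} * (beta - i) + betas_{i-1}; each series
// term then combines that derivative with factor * betas in a single pass.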
1686 
1687 // Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha.
1688 // Assumes alpha and beta are both large and uses a Rice saddle point expansion.
1689 // To ensure numerical stability, this computation is performed at higher precision.
1690 static inline scalar_t THTensor_(beta_grad_alpha_mid)(double x, double alpha, double beta) {
1691  const double total = alpha + beta;
1692  const double mean = alpha / total;
1693  const double std = sqrt(alpha * beta / (total + 1)) / total;
1694  if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) {
1695  // Avoid the singularity at x = mean.
1696  const double poly = 47 * x * (beta*beta)*(beta*beta) + alpha * (
1697  (43 + 20 * (16 + 27 * beta) * x) * (beta*beta)*beta + alpha * (
1698  3 * (59 + 180 * beta - 90 * x) * (beta*beta) + alpha * (
1699  (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * (
1700  8 * (1 - x) * (135 * beta - 11)))));
1701  const double prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total);
1702  const double prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total);
1703  return prefactor_num / (1 - x) * poly / prefactor_den;
1704  }
1705  const double prefactor = -x / sqrt(2 * alpha * beta / total);
1706  const double stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha*alpha))
1707  * (1 + 1 / (12 * beta) + 1 / (288 * beta*beta))
1708  / (1 + 1 / (12 * total) + 1 / (288 * total*total));
1709  const double term1_num = 2 * (alpha*alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta*beta);
1710  const double axbx = alpha * (x-1) + beta * x;
1711  const double term1_den = sqrt(2 * alpha / beta) * pow(total, 1.5f) * axbx*axbx;
1712  const double term1 = term1_num / term1_den;
1713  const double term2 = 0.5f * log(alpha / (total * x));
1714  const double term3_num = sqrt(8 * alpha * beta / total);
1715  const double term3_den = beta * x + alpha * (x - 1);
1716  const double term3 = term3_num / term3_den;
1717  const double term4_base = beta * log(beta / (total * (1 - x))) +
1718  alpha * log(alpha / (total * x));
1719  const double term4 = pow(term4_base, -1.5f);
1720  const double term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4));
1721  return stirling * prefactor * term1234;
1722 }
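
// The first branch above handles x within 0.1 standard deviations of the
// mean alpha / total, where the general expression is singular: the factor
// axbx = alpha * (x - 1) + beta * x (and hence term1_den and term3_den)
// vanishes exactly at x = mean, so a polynomial approximation is used in
// that neighborhood instead.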
1723 
1724 // Computes a scaled reparameterized gradient
1725 // -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x)
1726 // for a random number x drawn from a Beta distribution Beta(alpha,beta).
1727 // This function takes total = alpha + beta as an argument, which makes it
1728 // easy to implement Dirichlet reparameterized gradients in terms of Betas.
1729 static inline scalar_t THTensor_(dirichlet_grad_one)(scalar_t x, scalar_t alpha, scalar_t total) {
1730  const scalar_t beta = total - alpha;
1731  const scalar_t boundary = total * x * (1 - x);
1732 
1733  // Use an asymptotic approximation for x close to 0.
1734  if (x <= 0.5f && boundary < 2.5f) {
1735  return THTensor_(beta_grad_alpha_small)(x, alpha, beta);
1736  }
1737 
1738  // Use an asymptotic approximation for x close to 1.
1739  if (x >= 0.5f && boundary < 0.75f) {
1740  return -THTensor_(beta_grad_beta_small)(1 - x, beta, alpha);
1741  }
1742 
1743  // Use an asymptotic approximation when alpha and (total - alpha) are both large.
1744  if (alpha > 6 && beta > 6) {
1745  return THTensor_(beta_grad_alpha_mid)(x, alpha, beta);
1746  }
1747 
1748  // Use a rational correction to an analytic approximation.
1749  static const scalar_t c[2][3][3][4] = {
1750  {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863},
1751  {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033},
1752  {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}},
1753  {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814},
1754  {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057},
1755  {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}},
1756  {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565},
1757  {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181},
1758  {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}},
1759  {{{1, -0.02924021934, -0.04438342661, 0.007285809825},
1760  {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521},
1761  {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}},
1762  {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273},
1763  {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956},
1764  {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}},
1765  {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05},
1766  {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05},
1767  {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}},
1768  };
1769  const scalar_t u = TH_MATH_NAME(log)(x);
1770  const scalar_t a = TH_MATH_NAME(log)(alpha) - u;
1771  const scalar_t b = TH_MATH_NAME(log)(total) - a;
1772  const scalar_t pow_u[3] = {1, u, u * u};
1773  const scalar_t pow_a[3] = {1, a, a * a};
1774  scalar_t p = 0.0;
1775  scalar_t q = 0.0;
1776  for (int i = 0; i < 3; ++i) {
1777  for (int j = 0; j < 3; ++j) {
1778  const scalar_t ua = pow_u[i] * pow_a[j];
1779  p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3])));
1780  q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3])));
1781  }
1782  }
1783  const scalar_t approx = x * (TH_MATH_NAME(TH_digamma)(total) - TH_MATH_NAME(TH_digamma)(alpha)) / beta;
1784  return p / q * approx;
1785 }
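
// In the fallback branch above, the analytic approximation
//   approx = x * (digamma(total) - digamma(alpha)) / beta
// is multiplied by a rational correction p / q, where p and q are cubic
// polynomials in b = log(total) - a whose coefficients are polynomials of
// degree at most two in u = log(x) and a = log(alpha) - u, read from the
// 2x3x3x4 table c. As a concrete (illustrative) example of the dispatch,
// THTensor_(dirichlet_grad_one)(0.3, 2.0, 5.0) evaluates the scaled gradient
// for x = 0.3 drawn from Beta(2, 3); since boundary = 5 * 0.3 * 0.7 = 1.05
// is below 2.5 and x <= 0.5, it takes the small-x branch above.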
1786 
1787 void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alpha, THTensor *total)
1788 {
1789  x = THTensor_(newContiguous)(x);
1790  alpha = THTensor_(newContiguous)(alpha);
1791  total = THTensor_(newContiguous)(total);
1792  TH_CHECK_SAME_SIZE(alpha, x);
1793  TH_CHECK_SAME_SIZE(total, x);
1794  THTensor_(resizeAs)(self, x);
1795  THTensor* grad = THTensor_(newContiguous)(self);
1796 
1797  scalar_t*const grad_data = grad->data<scalar_t>();
1798  scalar_t*const x_data = x->data<scalar_t>();
1799  scalar_t*const alpha_data = alpha->data<scalar_t>();
1800  scalar_t*const total_data = total->data<scalar_t>();
1801  const int64_t numel = THTensor_(nElement)(x);
1802  int64_t i;
1803  #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i)
1804  for(i = 0; i < numel; ++i) {
1805  grad_data[i] = THTensor_(dirichlet_grad_one)(x_data[i], alpha_data[i], total_data[i]);
1806  }
1807 
 // Release the contiguous input copies created above; THTensor_(newContiguous)
 // returns an owning reference, so skipping this would leak.
 c10::raw::intrusive_ptr::decref(x);
 c10::raw::intrusive_ptr::decref(alpha);
 c10::raw::intrusive_ptr::decref(total);
1808  THTensor_(freeCopyTo)(grad, self);
1809 }
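
// THTensor_(dirichlet_grad) above follows the usual TH pattern for an
// elementwise ternary op: make contiguous copies of the inputs, resize the
// output, run a scalar kernel over raw data pointers under an OpenMP
// parallel for (engaged only above TH_OMP_OVERHEAD_THRESHOLD), and copy the
// contiguous result back into the possibly non-contiguous output. A
// standalone sketch of that pattern on plain buffers; the kernel and the
// threshold value here are hypothetical stand-ins, not part of the TH build:
#include <cstdint>

// Stand-in for a per-element kernel such as THTensor_(dirichlet_grad_one).
static inline double plain_kernel(double x, double a, double t) {
  return x * (a + t);
}

static inline void plain_map3(double* out, const double* x, const double* a,
                              const double* t, int64_t numel) {
  const int64_t omp_threshold = 100000;  // hypothetical analogue of TH_OMP_OVERHEAD_THRESHOLD
  int64_t i;
  #pragma omp parallel for if(numel > omp_threshold) private(i)
  for (i = 0; i < numel; ++i) {
    out[i] = plain_kernel(x[i], a[i], t[i]);
  }
}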
1810 
1811 #undef TH_MATH_NAME
1812 #endif /* floating point only part */
1813 #undef IS_NONZERO
1814 
1815 #endif /* TH_GENERIC_FILE */