Caffe2 - C++ API
A deep learning, cross platform ML framework
THTensorLapack.cpp
1 #ifndef TH_GENERIC_FILE
2 #define TH_GENERIC_FILE "TH/generic/THTensorLapack.cpp"
3 #else
4 
5 /*
6 Check if self is transpose of a contiguous matrix
7 */
8 static int THTensor_(isTransposedContiguous)(THTensor *self)
9 {
10  return self->stride(0) == 1 && self->stride(1) == self->size(0);
11 }
12 /*
13 If a matrix is a regular contiguous matrix, make sure it is transposed
14 because this is what we return from Lapack calls.
15 */
16 static void THTensor_(checkTransposed)(THTensor *self)
17 {
18  if(THTensor_(isContiguous)(self))
19  THTensor_(transpose)(self, NULL, 0, 1);
20  return;
21 }
22 /*
23 newContiguous followed by transpose
24 Similar to (newContiguous), but checks if the transpose of the matrix
25 is contiguous and also limited to 2D matrices.
26 */
27 static THTensor *THTensor_(newTransposedContiguous)(THTensor *self)
28 {
29  THTensor *tensor;
30  if(THTensor_(isTransposedContiguous)(self))
31  {
32  THTensor_(retain)(self);
33  tensor = self;
34  }
35  else
36  {
37  tensor = THTensor_(newContiguous)(self);
38  THTensor_(transpose)(tensor, NULL, 0, 1);
39  }
40 
41  return tensor;
42 }
43 
44 /*
45 Given the result tensor and src tensor, decide if the lapack call should use the
46 provided result tensor or should allocate a new space to put the result in.
47 
48 The returned tensor have to be freed by the calling function.
49 
50 nrows is required, because some lapack calls, require output space smaller than
51 input space, like underdetermined gels.
52 */
53 static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows)
54 {
55  /* check if user wants to reuse src and if it is correct shape/size */
56  if (src == result && THTensor_(isTransposedContiguous)(src) && src->size(1) == nrows)
57  THTensor_(retain)(result);
58  else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */
59  result = THTensor_(new)();
60  else
61  THTensor_(retain)(result);
62  return result;
63 }
64 
65 /*
66 Same as cloneColumnMajor, but accepts nrows argument, because some lapack calls require
67 the resulting tensor to be larger than src.
68 */
69 static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, int nrows)
70 {
71  THTensor *result;
72  THTensor *view;
73 
74  if (src == NULL)
75  src = self;
76  result = THTensor_(checkLapackClone)(self, src, nrows);
77  if (src == result)
78  return result;
79 
80  THTensor_(resize2d)(result, src->size(1), nrows);
81  THTensor_(checkTransposed)(result);
82 
83  if (src->size(0) == nrows) {
84  at::Tensor result_wrap = THTensor_wrap(result);
85  at::Tensor src_wrap = THTensor_wrap(src);
86  at::_copy_same_type_(result_wrap, src_wrap);
87  }
88  else
89  {
90  view = THTensor_(newNarrow)(result, 0, 0, src->size(0));
91  at::Tensor view_wrap = THTensor_wrap(view);
92  at::Tensor src_wrap = THTensor_wrap(src);
93  at::_copy_same_type_(view_wrap, src_wrap);
94  c10::raw::intrusive_ptr::decref(view);
95  }
96  return result;
97 }
98 
99 /*
100 Create a clone of src in self column major order for use with Lapack.
101 If src == self, a new tensor is allocated, in any case, the return tensor should be
102 freed by calling function.
103 */
104 static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src)
105 {
106  return THTensor_(cloneColumnMajorNrows)(self, src, src->size(0));
107 }
108 
109 void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
110 {
111  int free_b = 0;
112  // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_.
113  if (a == NULL) a = ra_;
114  if (b == NULL) b = rb_;
115  THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d",
116  a->dim());
117  THArgCheck(!a->is_empty(), 2, "A should not be empty");
118  THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 "
119  "dimensions, but has %d", b->dim());
120  THArgCheck(!b->is_empty(), 1, "B should not be empty");
121  AT_CHECK(a->size(0) == b->size(0), "Expected A and b to have same size "
122  "at dim 0, but A has ", a->size(0), " rows and B has ", b->size(0), " rows");
123 
124  if (THTensor_nDimensionLegacyAll(b) == 1) {
125  b = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(b), b->storage_offset(), b->size(0),
126  b->stride(0), 1, 0);
127  free_b = 1;
128  }
129 
130  int m, n, nrhs, lda, ldb, info, lwork;
131  THTensor *work = NULL;
132  scalar_t wkopt = 0;
133 
134  THTensor *ra__ = NULL; // working version of A matrix to be passed into lapack GELS
135  THTensor *rb__ = NULL; // working version of B matrix to be passed into lapack GELS
136 
137  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
138 
139  m = ra__->size(0);
140  n = ra__->size(1);
141  lda = m;
142  ldb = (m > n) ? m : n;
143 
144  rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb);
145 
146  nrhs = rb__->size(1);
147  info = 0;
148 
149 
150  /* get optimal workspace size */
151  THLapack_(gels)('N', m, n, nrhs, ra__->data<scalar_t>(), lda,
152  rb__->data<scalar_t>(), ldb,
153  &wkopt, -1, &info);
154  lwork = (int)wkopt;
155  work = THTensor_(newWithSize1d)(lwork);
156  THLapack_(gels)('N', m, n, nrhs, ra__->data<scalar_t>(), lda,
157  rb__->data<scalar_t>(), ldb,
158  work->data<scalar_t>(), lwork, &info);
159 
160  THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
161  THCleanup(c10::raw::intrusive_ptr::decref(ra__);
162  c10::raw::intrusive_ptr::decref(rb__);
163  c10::raw::intrusive_ptr::decref(work);
164  if (free_b) c10::raw::intrusive_ptr::decref(b);),
165  "gels", info,"");
166 
167  /*
168  * In the m < n case, if the input b is used as the result (so b == _rb),
169  * then rb_ was originally m by nrhs but now should be n by nrhs.
170  * This is larger than before, so we need to expose the new rows by resizing.
171  */
172  if (m < n && b == rb_) {
173  THTensor_(resize2d)(rb_, n, nrhs);
174  }
175 
176  THTensor_(freeCopyTo)(ra__, ra_);
177  THTensor_(freeCopyTo)(rb__, rb_);
178  c10::raw::intrusive_ptr::decref(work);
179  if (free_b) c10::raw::intrusive_ptr::decref(b);
180 }
181 
182 void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr)
183 {
184  int n, lda, lwork, info, ldvr;
185  THTensor *work=nullptr, *wi, *wr, *a;
186  scalar_t wkopt;
187  scalar_t *rv_data;
188  int64_t i;
189 
190  THTensor *re__ = NULL;
191  THTensor *rv__ = NULL;
192 
193  THArgCheck(a_->dim() == 2, 1, "A should be 2 dimensional");
194  THArgCheck(a_->size(0) == a_->size(1), 1,"A should be square");
195 
196  /* we want to definitely clone a_ for geev*/
197  a = THTensor_(cloneColumnMajor)(NULL, a_);
198 
199  n = a->size(0);
200  lda = n;
201 
202  wi = THTensor_(newWithSize1d)(n);
203  wr = THTensor_(newWithSize1d)(n);
204 
205  rv_data = NULL;
206  ldvr = 1;
207  if (*jobvr == 'V')
208  {
209  THTensor_(resize2d)(rv_,n,n);
210  /* guard against someone passing a correct size, but wrong stride */
211  rv__ = THTensor_(newTransposedContiguous)(rv_);
212  rv_data = rv__->data<scalar_t>();
213  ldvr = n;
214  }
215  THTensor_(resize2d)(re_,n,2);
216  re__ = THTensor_(newContiguous)(re_);
217 
218  if (n > 0) { // lapack doesn't work with size 0
219  /* get optimal workspace size */
220  THLapack_(geev)('N', jobvr[0], n, a->data<scalar_t>(), lda, wr->data<scalar_t>(), wi->data<scalar_t>(),
221  NULL, 1, rv_data, ldvr, &wkopt, -1, &info);
222 
223  lwork = (int)wkopt;
224  work = THTensor_(newWithSize1d)(lwork);
225 
226  THLapack_(geev)('N', jobvr[0], n, a->data<scalar_t>(), lda, wr->data<scalar_t>(), wi->data<scalar_t>(),
227  NULL, 1, rv_data, ldvr, work->data<scalar_t>(), lwork, &info);
228 
229  THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero",
230  THCleanup(c10::raw::intrusive_ptr::decref(re__);
231  c10::raw::intrusive_ptr::decref(rv__);
232  c10::raw::intrusive_ptr::decref(a);
233  c10::raw::intrusive_ptr::decref(wi);
234  c10::raw::intrusive_ptr::decref(wr);
235  c10::raw::intrusive_ptr::decref(work);),
236  "geev", info,"");
237  }
238 
239  {
240  scalar_t *re_data = re__->data<scalar_t>();
241  scalar_t *wi_data = wi->data<scalar_t>();
242  scalar_t *wr_data = wr->data<scalar_t>();
243  for (i=0; i<n; i++)
244  {
245  re_data[2*i] = wr_data[i];
246  re_data[2*i+1] = wi_data[i];
247  }
248  }
249 
250  if (*jobvr == 'V')
251  {
252  THTensor_(checkTransposed)(rv_);
253  THTensor_(freeCopyTo)(rv__, rv_);
254  }
255  THTensor_(freeCopyTo)(re__, re_);
256  c10::raw::intrusive_ptr::decref(a);
257  c10::raw::intrusive_ptr::decref(wi);
258  c10::raw::intrusive_ptr::decref(wr);
259  c10::raw::intrusive_ptr::decref(work);
260 }
261 
262 void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz, const char *uplo)
263 {
264  if (a == NULL) a = rv_;
265  THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional");
266  THArgCheck(a->size(0) == a->size(1), 1,"A should be square");
267 
268  int n, lda, lwork, info;
269  THTensor *work = nullptr;
270  scalar_t wkopt;
271 
272  THTensor *rv__ = NULL;
273  THTensor *re__ = NULL;
274 
275  rv__ = THTensor_(cloneColumnMajor)(rv_, a);
276 
277  n = THTensor_sizeLegacyNoScalars(rv__, 0);
278  lda = n;
279 
280  THTensor_(resize1d)(re_,n);
281  re__ = THTensor_(newContiguous)(re_);
282 
283  /* get optimal workspace size */
284  if (n != 0) {
285  THLapack_(syev)(jobz[0], uplo[0], n, rv__->data<scalar_t>(), lda,
286  re_->data<scalar_t>(), &wkopt, -1, &info);
287  lwork = (int)wkopt;
288  work = THTensor_(newWithSize1d)(lwork);
289  THLapack_(syev)(jobz[0], uplo[0], n, rv__->data<scalar_t>(), lda,
290  re_->data<scalar_t>(), work->data<scalar_t>(), lwork, &info);
291 
292  THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero",
293  THCleanup(c10::raw::intrusive_ptr::decref(rv__);
294  c10::raw::intrusive_ptr::decref(re__);
295  c10::raw::intrusive_ptr::decref(work);),
296  "syev", info,"");
297  }
298 
299  // No eigenvectors specified
300  if (*jobz == 'N') {
301  THTensor_(fill)(rv_, 0);
302  }
303 
304  THTensor_(freeCopyTo)(rv__, rv_);
305  THTensor_(freeCopyTo)(re__, re_);
306  c10::raw::intrusive_ptr::decref(work);
307 }
308 
309 void THTensor_(gesdd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* some, const char* compute_uv)
310 {
311  THTensor *ra_ = THTensor_(new)();
312  THTensor_(gesdd2)(ru_, rs_, rv_, ra_, a, some, compute_uv);
313  c10::raw::intrusive_ptr::decref(ra_);
314 }
315 
316 void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a,
317  const char* some, const char* compute_uv)
318 {
319  if (a == NULL) a = ra_;
320  THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional");
321  THArgCheck(!a->is_empty(), 1, "A should not be empty");
322 
323  int k, m, n, lda, ldu, ldvt, lwork, info;
324  THTensor *work;
325  scalar_t wkopt;
326  THIntTensor *iwork;
327 
328  THTensor *ra__ = NULL;
329  THTensor *ru__ = NULL;
330  THTensor *rs__ = NULL;
331  THTensor *rv__ = NULL;
332 
333  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
334 
335  m = ra__->size(0);
336  n = ra__->size(1);
337  k = (m < n ? m : n);
338 
339  lda = m;
340  ldu = m;
341  ldvt = n;
342 
343  iwork = k ? THIntTensor_newWithSize1d((int64_t)(8 * m)) : THIntTensor_newWithSize1d((int64_t)(8 * n));
344 
345  THTensor_(resize1d)(rs_,k);
346  THTensor *rvf_ = NULL;
347 
348  if (*compute_uv != 'N') {
349  rvf_ = THTensor_(new)();
350  THTensor_(resize2d)(rvf_,ldvt,n);
351  if (*some == 'A')
352  THTensor_(resize2d)(ru_,m,ldu);
353  else
354  THTensor_(resize2d)(ru_,k,ldu);
355  } else {
356  THTensor_(resize2d)(rv_,ldvt,n);
357  THTensor_(resize2d)(ru_,m,ldu);
358  }
359 
360  THTensor_(checkTransposed)(ru_);
361 
362  char jobz = 'N';
363  scalar_t *rs__data = NULL;
364  scalar_t *ru__data = NULL;
365  scalar_t *rv__data = NULL;
366 
367  rs__ = THTensor_(newContiguous)(rs_);
368  rs__data = rs__->data<scalar_t>();
369  if (*compute_uv != 'N') {
370  /* guard against someone passing a correct size, but wrong stride */
371  ru__ = THTensor_(newTransposedContiguous)(ru_);
372  rv__ = THTensor_(newContiguous)(rvf_);
373 
374  ru__data = ru__->data<scalar_t>();
375  rv__data = rv__->data<scalar_t>();
376 
377  jobz = some[0];
378  }
379 
380  THLapack_(gesdd)(jobz,
381  m,n,ra__->data<scalar_t>(),lda,
382  rs__data,
383  ru__data,
384  ldu,
385  rv__data, ldvt,
386  &wkopt, -1, THIntTensor_data(iwork), &info);
387  lwork = (int)wkopt;
388  work = THTensor_(newWithSize1d)(lwork);
389  THLapack_(gesdd)(jobz,
390  m,n,ra__->data<scalar_t>(),lda,
391  rs__data,
392  ru__data,
393  ldu,
394  rv__data, ldvt,
395  work->data<scalar_t>(),lwork, THIntTensor_data(iwork), &info);
396 
397  if (jobz != 'N') {
398  THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.",
399  THCleanup(
400  c10::raw::intrusive_ptr::decref(ru__);
401  c10::raw::intrusive_ptr::decref(rs__);
402  c10::raw::intrusive_ptr::decref(rv__);
403  c10::raw::intrusive_ptr::decref(ra__);
404  c10::raw::intrusive_ptr::decref(work);
405  c10::raw::intrusive_ptr::decref(iwork);),
406  "gesdd", info, "");
407  } else {
408  THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.",
409  THCleanup(
410  c10::raw::intrusive_ptr::decref(rs__);
411  c10::raw::intrusive_ptr::decref(ra__);
412  c10::raw::intrusive_ptr::decref(work);
413  c10::raw::intrusive_ptr::decref(iwork);),
414  "gesdd", info, "");
415  }
416 
417  THTensor_(freeCopyTo)(ra__, ra_);
418  THTensor_(freeCopyTo)(rs__, rs_);
419  c10::raw::intrusive_ptr::decref(work);
420  c10::raw::intrusive_ptr::decref(iwork);
421 
422  if (jobz != 'N') {
423  if (jobz == 'S')
424  THTensor_(narrow)(rv__,NULL,1,0,k);
425 
426  THTensor_(freeCopyTo)(ru__, ru_);
427  THTensor_(freeCopyTo)(rv__, rvf_);
428 
429  if (jobz == 'S')
430  THTensor_(narrow)(rvf_,NULL,1,0,k);
431 
432  THTensor_(resizeAs)(rv_, rvf_);
433  at::Tensor rv__wrap = THTensor_wrap(rv_);
434  at::Tensor rvf__wrap = THTensor_wrap(rvf_);
435  at::_copy_same_type_(rv__wrap, rvf__wrap);
436  c10::raw::intrusive_ptr::decref(rvf_);
437  } else {
438  THTensor_(zero)(ru_);
439  THTensor_(zero)(rv_);
440  }
441 }
442 
443 void THTensor_(getri)(THTensor *ra_, THTensor *a)
444 {
445  if (a == NULL) a = ra_;
446  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
447  THArgCheck(a->size(0) == a->size(1), 1, "A should be square");
448 
449  int m, n, lda, info, lwork;
450  scalar_t wkopt;
451  THIntTensor *ipiv;
452  THTensor *work;
453  THTensor *ra__ = NULL;
454 
455  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
456 
457  m = ra__->size(0);
458  n = ra__->size(1);
459  lda = m;
460  ipiv = THIntTensor_newWithSize1d((int64_t)m);
461 
462  /* Run LU */
463  THLapack_(getrf)(n, n, ra__->data<scalar_t>(), lda, THIntTensor_data(ipiv), &info);
464  THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
465  THCleanup(
466  c10::raw::intrusive_ptr::decref(ra__);
467  THIntTensor_free(ipiv);),
468  "getrf", info, info);
469 
470  /* Run inverse */
471  THLapack_(getri)(n, ra__->data<scalar_t>(), lda, THIntTensor_data(ipiv), &wkopt, -1, &info);
472  lwork = (int)wkopt;
473  work = THTensor_(newWithSize1d)(lwork);
474  THLapack_(getri)(n, ra__->data<scalar_t>(), lda, THIntTensor_data(ipiv), work->data<scalar_t>(), lwork, &info);
475  THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
476  THCleanup(
477  c10::raw::intrusive_ptr::decref(ra__);
478  c10::raw::intrusive_ptr::decref(work);
479  THIntTensor_free(ipiv);),
480  "getri", info, info);
481 
482  THTensor_(freeCopyTo)(ra__, ra_);
483  c10::raw::intrusive_ptr::decref(work);
484  THIntTensor_free(ipiv);
485 }
486 
487 void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo)
488 {
489  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
490  THArgCheck(a->size(0) == a->size(1), 1, "A should be square");
491 
492  int n = a->size(0);
493 
494  /* Build full matrix */
495  scalar_t *p = a->data<scalar_t>();
496  int64_t i, j;
497 
498  /* Upper Triangular Case */
499  if (uplo[0] == 'U')
500  {
501  /* Clear lower triangle (excluding diagonals) */
502  for (i=0; i<n; i++) {
503  for (j=i+1; j<n; j++) {
504  p[n*i + j] = 0;
505  }
506  }
507  }
508  /* Lower Triangular Case */
509  else if (uplo[0] == 'L')
510  {
511  /* Clear upper triangle (excluding diagonals) */
512  for (i=0; i<n; i++) {
513  for (j=0; j<i; j++) {
514  p[n*i + j] = 0;
515  }
516  }
517  }
518 }
519 
520 void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo)
521 {
522  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
523  THArgCheck(a->size(0) == a->size(1), 1, "A should be square");
524 
525  int n = a->size(0);
526 
527  /* Build full matrix */
528  scalar_t *p = a->data<scalar_t>();
529  int64_t i, j;
530 
531  /* Upper Triangular Case */
532  if (uplo[0] == 'U')
533  {
534  /* Clear lower triangle (excluding diagonals) */
535  for (i=0; i<n; i++) {
536  for (j=i+1; j<n; j++) {
537  p[n*i + j] = p[n*j+i];
538  }
539  }
540  }
541  /* Lower Triangular Case */
542  else if (uplo[0] == 'L')
543  {
544  /* Clear upper triangle (excluding diagonals) */
545  for (i=0; i<n; i++) {
546  for (j=0; j<i; j++) {
547  p[n*i + j] = p[n*j+i];
548  }
549  }
550  }
551 }
552 
553 void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo)
554 {
555  if (a == NULL) a = ra_;
556  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
557  THArgCheck(a->size(0) == a->size(1), 1, "A should be square");
558 
559  int n, lda, info;
560  THTensor *ra__ = NULL;
561 
562  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
563 
564  n = THTensor_sizeLegacyNoScalars(ra__, 0);
565  lda = n;
566 
567  /* Run inverse */
568  THLapack_(potri)(uplo[0], n, ra__->data<scalar_t>(), lda, &info);
569  THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized",
570  THCleanup(c10::raw::intrusive_ptr::decref(ra__);),
571  "potri", info, info);
572 
573  THTensor_(copyUpLoTriangle)(ra__, uplo);
574  THTensor_(freeCopyTo)(ra__, ra_);
575 }
576 
577 /*
578  Computes the Cholesky factorization with complete pivoting of a real symmetric
579  positive semidefinite matrix.
580 
581  Args:
582  * `ra_` - result Tensor in which to store the factor U or L from the
583  Cholesky factorization.
584  * `rpiv_` - result IntTensor containing sparse permutation matrix P, encoded
585  as P[rpiv_[k], k] = 1.
586  * `a` - input Tensor; the input matrix to factorize.
587  * `uplo` - string; specifies whether the upper or lower triangular part of
588  the symmetric matrix A is stored. "U"/"L" for upper/lower
589  triangular.
590  * `tol` - double; user defined tolerance, or < 0 for automatic choice.
591  The algorithm terminates when the pivot <= tol.
592  */
593 void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, scalar_t tol) {
594  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
595  THArgCheck(a->size(0) == a->size(1), 1, "A should be square");
596 
597  int n = a->size(0);
598 
599  THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a);
600  THIntTensor_resize1d(rpiv_, n);
601 
602  // Allocate working tensor
603  THTensor *work = THTensor_(newWithSize1d)(2 * n);
604 
605  // Run Cholesky factorization
606  int lda = n;
607  int rank, info;
608 
609  THLapack_(pstrf)(uplo[0], n, ra__->data<scalar_t>(), lda,
610  THIntTensor_data(rpiv_), &rank, tol,
611  work->data<scalar_t>(), &info);
612 
613  THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite",
614  THCleanup(
615  c10::raw::intrusive_ptr::decref(ra__);
616  c10::raw::intrusive_ptr::decref(work);),
617  "pstrf", info,"");
618 
619  THTensor_(clearUpLoTriangle)(ra__, uplo);
620 
621  THTensor_(freeCopyTo)(ra__, ra_);
622  c10::raw::intrusive_ptr::decref(work);
623 }
624 
625 /*
626  Perform a QR decomposition of a matrix.
627 
628  In LAPACK, two parts of the QR decomposition are implemented as two separate
629  functions: geqrf and orgqr. For flexibility and efficiency, these are wrapped
630  directly, below - but to make the common usage convenient, we also provide
631  this function, which calls them both and returns the results in a more
632  intuitive form.
633 
634  Args:
635  * `rq_` - result Tensor in which to store the Q part of the decomposition.
636  * `rr_` - result Tensor in which to store the R part of the decomposition.
637  * `a` - input Tensor; the matrix to decompose.
638 
639 */
640 void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a)
641 {
642  int m = a->size(0);
643  int n = a->size(1);
644  int k = (m < n ? m : n);
645  THTensor *ra_ = THTensor_(new)();
646  THTensor *rtau_ = THTensor_(new)();
647  THTensor *rr__ = THTensor_(new)();
648  THTensor_(geqrf)(ra_, rtau_, a);
649  THTensor_(resize2d)(rr__, k, ra_->size(1));
650  THTensor_(narrow)(rr__, ra_, 0, 0, k);
651  THTensor_(triu)(rr_, rr__, 0);
652  THTensor_(resize2d)(rq_, ra_->size(0), k);
653  THTensor_(orgqr)(rq_, ra_, rtau_);
654  THTensor_(narrow)(rq_, rq_, 1, 0, k);
655  c10::raw::intrusive_ptr::decref(ra_);
656  c10::raw::intrusive_ptr::decref(rtau_);
657  c10::raw::intrusive_ptr::decref(rr__);
658 }
659 
660 /*
661  The geqrf function does the main work of QR-decomposing a matrix.
662  However, rather than producing a Q matrix directly, it produces a sequence of
663  elementary reflectors which may later be composed to construct Q - for example
664  with the orgqr function, below.
665 
666  Args:
667  * `ra_` - Result matrix which will contain:
668  i) The elements of R, on and above the diagonal.
669  ii) Directions of the reflectors implicitly defining Q.
670  * `rtau_` - Result tensor which will contain the magnitudes of the reflectors
671  implicitly defining Q.
672  * `a` - Input matrix, to decompose. If NULL, `ra_` is used as input.
673 
674  For further details, please see the LAPACK documentation.
675 
676 */
677 void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a)
678 {
679  if (a == NULL) ra_ = a;
680  THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional");
681  THArgCheck(!a->is_empty(), 1, "A should not be empty");
682 
683  THTensor *ra__ = NULL;
684 
685  /* Prepare the input for LAPACK, making a copy if necessary. */
686  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
687 
688  int m = ra__->size(0);
689  int n = ra__->size(1);
690  int k = (m < n ? m : n);
691  int lda = m;
692  THTensor_(resize1d)(rtau_, k);
693 
694  /* Dry-run to query the suggested size of the workspace. */
695  int info = 0;
696  scalar_t wkopt = 0;
697  THLapack_(geqrf)(m, n, ra__->data<scalar_t>(), lda,
698  rtau_->data<scalar_t>(),
699  &wkopt, -1, &info);
700 
701  /* Allocate the workspace and call LAPACK to do the real work. */
702  int lwork = (int)wkopt;
703  THTensor *work = THTensor_(newWithSize1d)(lwork);
704  THLapack_(geqrf)(m, n, ra__->data<scalar_t>(), lda,
705  rtau_->data<scalar_t>(),
706  work->data<scalar_t>(), lwork, &info);
707 
708  THLapackCheckWithCleanup("Lapack Error %s : unknown Lapack error. info = %i",
709  THCleanup(
710  c10::raw::intrusive_ptr::decref(ra__);
711  c10::raw::intrusive_ptr::decref(work);),
712  "geqrf", info,"");
713 
714  THTensor_(freeCopyTo)(ra__, ra_);
715  c10::raw::intrusive_ptr::decref(work);
716 }
717 
718 /*
719  The orgqr function allows reconstruction of a matrix Q with orthogonal
720  columns, from a sequence of elementary reflectors, such as is produced by the
721  geqrf function.
722 
723  Args:
724  * `ra_` - result Tensor, which will contain the matrix Q.
725  * `a` - input Tensor, which should be a matrix with the directions of the
726  elementary reflectors below the diagonal. If NULL, `ra_` is used as
727  input.
728  * `tau` - input Tensor, containing the magnitudes of the elementary
729  reflectors.
730 
731  For further details, please see the LAPACK documentation.
732 
733 */
734 void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau)
735 {
736  if (a == NULL) a = ra_;
737  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
738 
739  THTensor *ra__ = NULL;
740  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
741 
742  int m = THTensor_sizeLegacyNoScalars(ra__, 0);
743  int k = THTensor_sizeLegacyNoScalars(tau, 0);
744  int lda = m;
745 
746  /* Dry-run to query the suggested size of the workspace. */
747  int info = 0;
748  scalar_t wkopt = 0;
749  THLapack_(orgqr)(m, k, k, ra__->data<scalar_t>(), lda,
750  tau->data<scalar_t>(),
751  &wkopt, -1, &info);
752 
753  /* Allocate the workspace and call LAPACK to do the real work. */
754  int lwork = (int)wkopt;
755  THTensor *work = THTensor_(newWithSize1d)(lwork);
756  THLapack_(orgqr)(m, k, k, ra__->data<scalar_t>(), lda,
757  tau->data<scalar_t>(),
758  work->data<scalar_t>(), lwork, &info);
759 
760  THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i",
761  THCleanup(
762  c10::raw::intrusive_ptr::decref(ra__);
763  c10::raw::intrusive_ptr::decref(work);),
764  "orgqr", info,"");
765  THTensor_(freeCopyTo)(ra__, ra_);
766  c10::raw::intrusive_ptr::decref(work);
767 }
768 
769 /*
770  The ormqr function multiplies Q with another matrix from a sequence of
771  elementary reflectors, such as is produced by the geqrf function.
772 
773  Args:
774  * `ra_` - result Tensor, which will contain the matrix Q' c.
775  * `a` - input Tensor, which should be a matrix with the directions of the
776  elementary reflectors below the diagonal. If NULL, `ra_` is used as
777  input.
778  * `tau` - input Tensor, containing the magnitudes of the elementary
779  reflectors.
780  * `c` - input Tensor, containing the matrix to be multiplied.
781  * `side` - char, determining whether c is left- or right-multiplied with Q.
782  * `trans` - char, determining whether to transpose Q before multiplying.
783 
784  For further details, please see the LAPACK documentation.
785 
786 */
787 void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans)
788 {
789  if (a == NULL) a = ra_;
790  THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional");
791 
792  THTensor *ra__ = NULL;
793  ra__ = THTensor_(cloneColumnMajor)(ra_, c);
794 
795  int m = c->size(0);
796  int n = c->size(1);
797  int k = THTensor_sizeLegacyNoScalars(tau, 0);
798  int lda;
799  if (*side == 'L')
800  {
801  lda = m;
802  }
803  else
804  {
805  lda = n;
806  }
807  int ldc = m;
808 
809  /* Dry-run to query the suggested size of the workspace. */
810  int info = 0;
811  scalar_t wkopt = 0;
812  THLapack_(ormqr)(side[0], trans[0], m, n, k, a->data<scalar_t>(), lda,
813  tau->data<scalar_t>(), ra__->data<scalar_t>(), ldc,
814  &wkopt, -1, &info);
815 
816  /* Allocate the workspace and call LAPACK to do the real work. */
817  int lwork = (int)wkopt;
818  THTensor *work = THTensor_(newWithSize1d)(lwork);
819  THLapack_(ormqr)(side[0], trans[0], m, n, k, a->data<scalar_t>(), lda,
820  tau->data<scalar_t>(), ra__->data<scalar_t>(), ldc,
821  work->data<scalar_t>(), lwork, &info);
822 
823  THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i",
824  THCleanup(
825  c10::raw::intrusive_ptr::decref(ra__);
826  c10::raw::intrusive_ptr::decref(work);),
827  "ormqr", info,"");
828  THTensor_(freeCopyTo)(ra__, ra_);
829  c10::raw::intrusive_ptr::decref(work);
830 }
831 
832 void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots)
833 {
834  AT_CHECK(!atf->is_empty() && THTensor_(nDimensionLegacyNoScalars)(atf) == 3, "expected non-empty 3D tensor, got size: ",
835  atf->sizes());
836  AT_CHECK(!b->is_empty() && (THTensor_(nDimensionLegacyNoScalars)(b) == 3 ||
837  THTensor_(nDimensionLegacyNoScalars)(b) == 2), "expected non-empty 2D or 3D tensor, got size: ", b->sizes());
838  THArgCheck(THTensor_(size)(atf, 0) ==
839  THTensor_(size)(b, 0), 3, "number of batches must be equal");
840  THArgCheck(THTensor_(size)(atf, 1) ==
841  THTensor_(size)(atf, 2), 3, "A matrices must be square");
842  THArgCheck(THTensor_(size)(atf, 1) ==
843  THTensor_(size)(b, 1), 3, "dimensions of A and b must be equal");
844 
845  if (rb_ != b) {
846  THTensor_(resizeAs)(rb_, b);
847  at::Tensor rb__wrap = THTensor_wrap(rb_);
848  at::Tensor b_wrap = THTensor_wrap(b);
849  at::_copy_same_type_(rb__wrap, b_wrap);
850  }
851 
852  int64_t num_batches = atf->size(0);
853  int64_t n = atf->size(1);
854  int nrhs = THTensor_nDimensionLegacyAll(rb_) > 2 ? rb_->size(2) : 1;
855 
856  int lda, ldb;
857  THTensor *atf_;
858  THTensor *rb__;
859 
860  // correct ordering of A
861  if (atf->stride(1) == 1) {
862  // column ordered, what BLAS wants
863  lda = atf->stride(2);
864  atf_ = atf;
865  } else {
866  // not column ordered, need to make it such (requires copy)
867  // it would be nice if we could use the op(A) flags to automatically
868  // transpose A if needed, but this leads to unpredictable behavior if the
869  // user clones A_tf later with a different ordering
870  THTensor *transp_r_ = THTensor_(newTranspose)(atf, 1, 2);
871  atf_ = THTensor_(newClone)(transp_r_);
872  c10::raw::intrusive_ptr::decref(transp_r_);
873  THTensor_(transpose)(atf_, NULL, 1, 2);
874  lda = atf_->stride(2);
875  }
876 
877  // correct ordering of B
878  if (rb_->stride(1) == 1) {
879  // column ordered
880  if (THTensor_nDimensionLegacyAll(rb_) == 2 || rb_->size(2) == 1) {
881  ldb = n;
882  } else {
883  ldb = rb_->stride(2);
884  }
885  rb__ = rb_;
886  } else {
887  // make column ordered
888  if (THTensor_nDimensionLegacyAll(rb_) > 2) {
889  THTensor *transp_r_ = THTensor_(newTranspose)(rb_, 1, 2);
890  rb__ = THTensor_(newClone)(transp_r_);
891  c10::raw::intrusive_ptr::decref(transp_r_);
892  THTensor_(transpose)(rb__, NULL, 1, 2);
893  ldb = rb__->stride(2);
894  } else {
895  rb__ = THTensor_(newClone)(rb_);
896  ldb = n;
897  }
898  }
899 
900  THTensor *ai = THTensor_(new)();
901  THTensor *rbi = THTensor_(new)();
902  THIntTensor *pivoti = THIntTensor_new();
903 
904  if (!THIntTensor_isContiguous(pivots)) {
905  THError("Error: rpivots_ is not contiguous.");
906  }
907 
908  for (int64_t batch = 0; batch < num_batches; ++batch) {
909  THTensor_(select)(ai, atf_, 0, batch);
910  THTensor_(select)(rbi, rb__, 0, batch);
911  THIntTensor_select(pivoti, pivots, 0, batch);
912 
913 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
914  int info;
915  THLapack_(getrs)('N', n, nrhs, ai->data<scalar_t>(), lda,
916  THIntTensor_data(pivoti), rbi->data<scalar_t>(),
917  ldb, &info);
918  if (info != 0) {
919  THError("Error: Nonzero info.");
920  }
921 #else
922  THError("Unimplemented");
923 #endif
924  }
925 
926  c10::raw::intrusive_ptr::decref(ai);
927  c10::raw::intrusive_ptr::decref(rbi);
928  THIntTensor_free(pivoti);
929 
930  if (atf_ != atf) {
931  c10::raw::intrusive_ptr::decref(atf_);
932  }
933 
934  if (rb__ != rb_) {
935  THTensor_(freeCopyTo)(rb__, rb_);
936  }
937 }
938 
939 #endif