1 #include "caffe2/utils/math/transpose.h" 10 #endif // CAFFE2_USE_MKL 12 #ifdef CAFFE2_USE_HPTT 14 #endif // CAFFE2_USE_HPTT 16 #include "caffe2/core/context.h" 17 #include "caffe2/utils/eigen_utils.h" 18 #include "caffe2/utils/math/utils.h" 25 template <
typename TIndex,
typename TData>
31 EigenMatrixMap<TData>(Y, rows, cols) =
32 ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
37 #define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc) \ 39 void Transpose2D<TIndex, TData>( \ 40 const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \ 41 MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows); \ 43 DELEGATE_TRANSPOSE_2D(std::int32_t,
float, mkl_somatcopy);
44 DELEGATE_TRANSPOSE_2D(std::int64_t,
float, mkl_somatcopy);
45 DELEGATE_TRANSPOSE_2D(std::int32_t,
double, mkl_domatcopy);
46 DELEGATE_TRANSPOSE_2D(std::int64_t,
double, mkl_domatcopy);
47 #undef DELEGATE_TRANSPOSE_2D 49 #endif // CAFFE2_USE_MKL 51 #ifdef CAFFE2_USE_HPTT 53 template <
typename TIndex,
typename TData>
60 for (
int i = 0; i < ndim; ++i) {
61 if (dims[i] <= 0 || dims[i] > std::numeric_limits<int>::max()) {
66 std::vector<int> axes_cm(ndim);
67 std::vector<int> dims_cm(ndim);
69 const auto cm_fn = [ndim](
const int i) {
return ndim - i - 1; };
70 for (
int i = 0; i < ndim; ++i) {
71 axes_cm[i] = cm_fn(axes[cm_fn(i)]);
72 dims_cm[i] = dims[cm_fn(i)];
74 auto plan = hptt::create_plan(
86 if (plan ==
nullptr) {
93 #endif // CAFFE2_USE_HPTT 95 template <
typename TIndex,
typename TData>
102 std::vector<TIndex> Y_dims(ndim);
103 for (
int i = 0; i < ndim; ++i) {
104 Y_dims[i] = dims[axes[i]];
107 int pivot = ndim - 1;
108 TIndex block_size = 1;
109 for (; pivot >= 0 && axes[pivot] == pivot; --pivot) {
110 block_size *= Y_dims[pivot];
113 const TIndex num_blocks = std::accumulate(
115 Y_dims.cbegin() + pivot,
117 std::multiplies<TIndex>());
118 std::vector<TIndex> X_strides(pivot);
119 utils::ComputeTransposedStrides<TIndex>(pivot, dims, axes, X_strides.data());
120 std::vector<TIndex> index(pivot, 0);
121 for (TIndex Y_index = 0; Y_index < num_blocks; ++Y_index) {
122 const TIndex X_index = std::inner_product(
123 X_strides.cbegin(), X_strides.cend(), index.cbegin(), TIndex(0));
124 if (block_size == 1) {
125 Y[Y_index] = X[X_index];
128 Y + block_size * Y_index,
129 X + block_size * X_index,
130 block_size *
sizeof(TData));
132 utils::IncreaseIndexInDims<TIndex>(pivot, Y_dims.data(), index.data());
136 template <
typename TIndex,
typename TData>
144 std::accumulate(dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
148 if (utils::IsIdentityPermutation(ndim, axes)) {
149 std::memcpy(Y, X, size *
sizeof(TData));
152 if (utils::IsBatchTranspose2D(ndim, axes)) {
153 const TIndex H = dims[ndim - 2];
154 const TIndex W = dims[ndim - 1];
155 const TIndex N = size / (H * W);
156 for (TIndex i = 0; i < N; ++i) {
157 Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W);
161 TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
164 #ifdef CAFFE2_USE_HPTT 166 #define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData) \ 168 void TransposeImpl<TIndex, TData>( \ 170 const TIndex* dims, \ 174 const TIndex size = std::accumulate( \ 175 dims, dims + ndim, TIndex(1), std::multiplies<TIndex>()); \ 179 if (utils::IsIdentityPermutation(ndim, axes)) { \ 180 std::memcpy(Y, X, size * sizeof(TData)); \ 183 if (TransposeByHPTT(ndim, dims, axes, X, Y)) { \ 186 if (utils::IsBatchTranspose2D(ndim, axes)) { \ 187 const TIndex H = dims[ndim - 2]; \ 188 const TIndex W = dims[ndim - 1]; \ 189 const TIndex N = size / (H * W); \ 190 for (TIndex i = 0; i < N; ++i) { \ 191 Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \ 195 TransposeND<TIndex, TData>(ndim, dims, axes, X, Y); \ 197 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t,
float)
198 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(
std::int64_t,
float)
199 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(
std::int32_t,
double)
200 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(
std::int64_t,
double)
201 #undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL 203 #endif // CAFFE2_USE_HPTT 207 #define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData) \ 209 C10_EXPORT void Transpose<TIndex, TData, CPUContext>( \ 211 const TIndex* dims, \ 216 TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \ 218 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t,
float)
219 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int64_t,
float)
220 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int32_t,
double)
221 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int64_t,
double)
222 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int32_t,
std::int32_t)
223 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int64_t,
std::int32_t)
224 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int32_t,
std::int64_t)
225 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int64_t,
std::int64_t)
226 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int32_t,
std::uint8_t)
227 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int64_t,
std::uint8_t)
228 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int32_t,
std::uint16_t)
229 CAFFE2_SPECIALIZED_TRANSPOSE(
std::int64_t,
std::uint16_t)
230 #undef CAFFE2_SPECIALIZED_TRANSPOSE 232 #define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \ 234 C10_EXPORT void NCHW2NHWC<T, CPUContext>( \ 241 const int stride = C * HxW; \ 242 for (int i = 0; i < N; ++i) { \ 243 Transpose2D<T>(C, HxW, X + i * stride, Y + i * stride); \ 246 CAFFE2_SPECIALIZED_NCHW2NHWC(
float)
247 #undef CAFFE2_SPECIALIZED_NCHW2NHWC 249 #define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \ 251 C10_EXPORT void NHWC2NCHW<T, CPUContext>( \ 258 const int stride = HxW * C; \ 259 for (int i = 0; i < N; ++i) { \ 260 Transpose2D<T>(HxW, C, X + i * stride, Y + i * stride); \ 263 CAFFE2_SPECIALIZED_NHWC2NCHW(
float)
264 #undef CAFFE2_SPECIALIZED_NHWC2NCHW
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...