Flush-To-Zero and Denormals-Are-Zero mode.
Typedefs
template<typename T , bool is_cuda> | |
using | acc_type = typename AccumulateType< T, is_cuda >::type |
using | DimVector = SmallVector< int64_t, 5 > |
A container for sizes or strides. | |
using | TensorList = ArrayRef< Tensor > |
using | DimMask = TensorIterator::DimMask |
using | PtrVector = TensorIterator::PtrVector |
using | loop_t = TensorIterator::loop_t |
using | loop2d_t = TensorIterator::loop2d_t |
using | CheckedFrom = const char * |
using | DataType = caffe2::TypeIdentifier |
Functions
Context & | globalContext () |
TypeExtendedInterface & | getType (TensorOptions options) |
TypeExtendedInterface & | getType (const TensorImpl *impl) |
TypeExtendedInterface & | getType (const Tensor &t) |
LegacyTHDispatcher & | getLegacyTHDispatcher (TensorOptions options) |
LegacyTHDispatcher & | getLegacyTHDispatcher (const TensorImpl *impl) |
Allocator * | getCPUAllocator () |
REGISTER_LEGACY_TYPE_INIT (LegacyDeviceTypeInit) | |
CAFFE2_API LegacyTHDispatcher & | getLegacyTHDispatcher (const Tensor &) |
C10_DEFINE_TYPED_REGISTRY (ContextRegistry, at::DeviceType, at::BaseContext, std::unique_ptr, at::Device) | |
C10_DECLARE_TYPED_REGISTRY (ContextRegistry, at::DeviceType, at::BaseContext, std::unique_ptr, at::Device) | |
std::unique_ptr< at::BaseContext > | CreateContext (const at::Device &device) |
std::ios_base & | defaultfloat (std::ios_base &__base) |
std::ostream & | operator<< (std::ostream &out, const Type &t) |
void | __printTensor (std::ostream &stream, Tensor &self, int64_t linesize) |
std::ostream & | print (std::ostream &stream, const Tensor &tensor_, int64_t linesize) |
C10_DEFINE_REGISTRY (LegacyDeviceTypeInitRegistry, LegacyDeviceTypeInitInterface, LegacyDeviceTypeInitArgs) const LegacyDeviceTypeInitInterface &getLegacyDeviceTypeInit() | |
C10_DECLARE_REGISTRY (LegacyDeviceTypeInitRegistry, LegacyDeviceTypeInitInterface, LegacyDeviceTypeInitArgs) | |
CAFFE2_API const LegacyDeviceTypeInitInterface & | getLegacyDeviceTypeInit () |
LegacyTypeDispatch & | globalLegacyTypeDispatch () |
Type & | legacyTensorType (const TensorImpl &tensor) |
Return the Type object corresponding to this Tensor, which we can use to perform dynamic dispatch to operators. | |
void | initializeLegacyTypeDispatchFor (const TensorImpl &tensor) |
std::ostream & | operator<< (std::ostream &out, const Range &range) |
int64_t | get_device (Tensor self) |
bool | is_cuda (Tensor self) |
bool | is_hip (Tensor self) |
bool | is_sparse (Tensor self) |
C10_DECLARE_REGISTRY (VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) | |
template<typename T > | |
std::pair< int64_t, int64_t > | collapse_dims (T *sizes, T *strides, int64_t dims, const int excludeDim=-1) |
Tensor | sort_strides (Tensor &tensor_) |
bool | _all_equal_numel (at::ArrayRef< Tensor > tensors) |
std::string | _all_equal_numel_error (at::ArrayRef< Tensor > tensors) |
bool | _apply_preamble (ArrayRef< Tensor > tensors) |
int64_t | _max_dim_tensors (ArrayRef< Tensor > tensors) |
void | iterate (int64_t size) |
template<typename Arg , typename... Args> | |
void | iterate (int64_t size, Arg &iter, Args &...iter_tail) |
bool | iterate_continue () |
template<typename Arg , typename... Args> | |
bool | iterate_continue (Arg &iter, Args &...iter_tail) |
int64_t | max_iterate_size () |
template<typename Arg , typename... Args> | |
int64_t | max_iterate_size (Arg &iter, Args &...iter_tail) |
void | iterate_overflow () |
template<typename Arg , typename... Args> | |
void | iterate_overflow (Arg &iter, Args &...iter_tail) |
void | forward (int64_t offset) |
template<typename Arg , typename... Args> | |
void | forward (int64_t offset, Arg &iter, Args &...iter_tail) |
int64_t | max_dim () |
template<typename Arg , typename... Args> | |
int64_t | max_dim (Arg &iter, Args &...iter_tail) |
void | apply_op () |
template<typename Op , typename... Args> | |
void | apply_op (int64_t numel, int64_t offset, const Op &op, Args...iters) |
void | apply_kernel () |
template<typename Op , typename... Args> | |
void | apply_kernel (int64_t numel, int64_t offset, const Op &op, Args...iters) |
template<typename scalar1 , typename scalar2 , typename Op > | |
void | CPU_tensor_parallel_kernel_apply2 (Tensor tensor1, Tensor tensor2, const Op op) |
template<typename scalar1 , typename Op > | |
void | CPU_tensor_apply1 (Tensor tensor1, const Op op) |
template<typename scalar1 , typename scalar2 , typename Op > | |
void | CPU_tensor_apply2 (Tensor tensor1, Tensor tensor2, const Op op) |
template<typename scalar1 , typename scalar2 , typename scalar3 , typename Op > | |
void | CPU_tensor_apply3 (Tensor tensor1, Tensor tensor2, Tensor tensor3, const Op op) |
template<typename scalar1 , typename scalar2 , typename scalar3 , typename scalar4 , typename Op > | |
void | CPU_tensor_apply4 (Tensor tensor1, Tensor tensor2, Tensor tensor3, Tensor tensor4, const Op op) |
template<typename scalar1 , typename Op > | |
void | CPU_tensor_parallel_apply1 (Tensor tensor1, const Op op, int64_t grain_size=internal::GRAIN_SIZE) |
template<typename scalar1 , typename scalar2 , typename Op > | |
void | CPU_tensor_parallel_apply2 (Tensor tensor1, Tensor tensor2, const Op op, int64_t grain_size=internal::GRAIN_SIZE) |
std::atomic< int > | num_threads (-1) |
void | set_num_threads (int num_threads_) |
int | get_num_threads () |
C10_DECLARE_REGISTRY (ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) | |
C10_DECLARE_REGISTRY (CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) | |
C10_DECLARE_REGISTRY (HIPHooksRegistry, HIPHooksInterface, HIPHooksArgs) | |
optional< Device > | device_of (Tensor t) |
Return the Device of a Tensor, if the Tensor is defined. | |
optional< Device > | device_of (TensorList t) |
Return the Device of a TensorList, if the list is non-empty and the first Tensor is defined. | |
ScalarType | toScalarType (const DLDataType &dtype) |
void | deleter (DLManagedTensor *arg) |
DLManagedTensor * | toDLPack (const Tensor &src) |
Tensor | fromDLPack (const DLManagedTensor *src) |
std::vector< int64_t > | infer_size (IntArrayRef a, IntArrayRef b) |
std::tuple< std::vector< int64_t >, std::vector< int64_t > > | inferExpandGeometry (IntArrayRef tensor_sizes, IntArrayRef tensor_strides, IntArrayRef sizes) |
void | check_defined (std::initializer_list< std::reference_wrapper< const Tensor >> tensors, const char *api_name) |
std::tuple< Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand) |
std::tuple< Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand, const char *api_name) |
std::tuple< Tensor, Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2) |
std::tuple< Tensor, Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2, const char *api_name) |
std::tuple< Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2) |
std::tuple< Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2, const char *api_name) |
std::tuple< Tensor, Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2, const Tensor &to_expand3) |
std::tuple< Tensor, Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2, const Tensor &to_expand3, const char *api_name) |
std::tuple< Tensor > | expand_size (const Tensor &to_expand, IntArrayRef sizes) |
std::tuple< Tensor > | expand_size (const Tensor &to_expand, IntArrayRef sizes, const char *api_name) |
std::vector< Tensor > | expand_outplace (TensorList to_expand) |
TensorOptions | initialTensorOptions () |
LegacyTHDispatch & | globalLegacyTHDispatch () |
MemOverlap | has_internal_overlap (const Tensor &tensor) |
MemOverlap | has_internal_overlap (TensorImpl *t) |
void | assert_no_internal_overlap (const Tensor &t, std::string op) |
void | assert_no_internal_overlap (TensorImpl *t, std::string op) |
template<typename T , typename std::enable_if< std::is_integral< T >::value, int >::type = 0> | |
bool | _isnan (T val) |
int64_t | divup (int64_t x, int64_t y) |
int | get_max_threads () |
int | get_thread_num () |
bool | in_parallel_region () |
template<class F > | |
void | parallel_for (const int64_t begin, const int64_t end, const int64_t grain_size, const F &f) |
template<class scalar_t , class F , class SF > | |
scalar_t | parallel_reduce (const int64_t begin, const int64_t end, const int64_t grain_size, const scalar_t ident, const F f, const SF sf) |
struct { | |
} | Generator (Context *context) |
virtual CAFFE2_API uint64_t | seed () override |
virtual CAFFE2_API uint64_t | initialSeed () override |
virtual CAFFE2_API void * | unsafeGetTH () override |
struct { | |
} | Dispatcher () |
void | register_cpu_types (Context *context) |
void | register_cuda_types (Context *context) |
struct { | |
} | Type () |
virtual ScalarType | scalarType () const override |
virtual caffe2::TypeMeta | typeMeta () const override |
virtual Backend | backend () const override |
virtual const char * | toString () const override |
virtual TypeID | ID () const override |
struct at::CAFFE2_API | get_function (const std::string &schema) |
Allocator * | allocator () const override |
Device | getDeviceFromPtr (void *data) const override |
std::unique_ptr< Generator > | generator () const override |
std::ostream & | operator<< (std::ostream &out, TensorGeometryArg t) |
void | checkDim (CheckedFrom c, const TensorGeometryArg &t, int64_t dim) |
void | checkDimRange (CheckedFrom c, const TensorGeometryArg &t, int64_t dim_start, int64_t dim_end) |
void | checkContiguous (CheckedFrom c, const TensorGeometryArg &t) |
void | checkAllContiguous (CheckedFrom c, at::ArrayRef< TensorArg > ts) |
void | checkSize (CheckedFrom c, const TensorGeometryArg &t, IntArrayRef sizes) |
void | checkSize (CheckedFrom c, const TensorGeometryArg &t, int64_t dim, int64_t size) |
void | checkAllSame (CheckedFrom c, ArrayRef< TensorArg > tensors, void(*fn)(CheckedFrom, const TensorArg &, const TensorArg &)) |
void | checkSameSize (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
void | checkAllSameSize (CheckedFrom c, ArrayRef< TensorArg > tensors) |
void | checkNumel (CheckedFrom c, const TensorGeometryArg &t, int64_t numel) |
void | checkSameNumel (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
void | checkAllSameNumel (CheckedFrom c, ArrayRef< TensorArg > tensors) |
void | checkSameGPU (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
void | checkAllSameGPU (CheckedFrom c, ArrayRef< TensorArg > tensors) |
void | checkSameType (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
void | checkScalarType (CheckedFrom c, const TensorArg &t, ScalarType ty) |
void | checkScalarTypes (CheckedFrom c, const TensorArg &t, at::ArrayRef< ScalarType > l) |
void | checkAllSameType (CheckedFrom c, ArrayRef< TensorArg > tensors) |
void | checkSameDim (CheckedFrom c, const TensorGeometryArg &t1, const TensorGeometryArg &t2) |
void | checkDefined (CheckedFrom c, const TensorArg &t) |
void | checkAllDefined (CheckedFrom c, ArrayRef< TensorArg > ts) |
void | checkBackend (CheckedFrom c, const Tensor &t, Backend backend) |
void | checkBackend (CheckedFrom c, ArrayRef< Tensor > tensors, at::Backend backend) |
void * | maybe_data_ptr (const Tensor &tensor) |
void * | maybe_data_ptr (const TensorArg &tensor) |
bool | geometry_is_contiguous (IntArrayRef sizes, IntArrayRef strides) |
CAFFE2_API void | checkSameNumel (CheckedFrom c, const TensorGeometryArg &t1, const TensorGeometryArg &t2) |
int | _crash_if_asan (int arg) |
template<size_t N> | |
std::array< int64_t, N > | check_intlist (ArrayRef< int64_t > list, const char *name, int pos, ArrayRef< int64_t > def={}) |
int64_t | sum_intlist (ArrayRef< int64_t > list) |
int64_t | prod_intlist (ArrayRef< int64_t > list) |
REGISTER_CONTEXT (DeviceType::CPU, caffe2::CPUContext) | |
REGISTER_COPY_BYTES_FUNCTION (DeviceType::CPU, DeviceType::CPU, caffe2::CopyBytesWrapper) | |
REGISTER_CONTEXT (DeviceType::IDEEP, caffe2::IDEEPContext) | |
REGISTER_COPY_BYTES_FUNCTION (DeviceType::IDEEP, DeviceType::CPU, CopyBytesWrapper) | |
REGISTER_COPY_BYTES_FUNCTION (DeviceType::CPU, DeviceType::IDEEP, CopyBytesWrapper) | |
REGISTER_COPY_BYTES_FUNCTION (DeviceType::IDEEP, DeviceType::IDEEP, CopyBytesWrapper) | |
REGISTER_COMPLEX_HOOKS (ComplexHooks) | |
Variables
thread_local bool | NonVariableTypeMode_enabled = false |
NOTE [ Treating Variables as non-Variables in type dispatch ]. | |
constexpr const char * | CUDA_HELP |
virtual | CAFFE2_API {name}Generator() |
Context * | context |
struct CAFFE2_API at::LegacyTHDispatcher | Generator |
class CAFFE2_API at::Tensor | Dispatcher |
at::TypeExtendedInterface | Type |
constexpr size_t | dim_bitset_size = 64 |
Flush-To-Zero and Denormals-Are-Zero mode.
Contains the implementation of parallel reductions in TensorIterator.
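A minimal usage sketch of the public at::parallel_reduce entry point listed in the function table above, assuming the signature shown there and that ATen/Parallel.h declares it; treat the helper name and grain size as illustrative choices, not library conventions:

    #include <ATen/Parallel.h>
    #include <cstdint>
    #include <vector>

    // Sum a buffer with at::parallel_reduce, following the signature listed above:
    // f reduces one sub-range starting from ident, sf combines two partial results.
    int64_t parallel_sum(const std::vector<int64_t>& data) {
      return at::parallel_reduce(
          /*begin=*/int64_t(0),
          /*end=*/static_cast<int64_t>(data.size()),
          /*grain_size=*/int64_t(2048),
          /*ident=*/int64_t(0),
          [&](int64_t begin, int64_t end, int64_t partial) {
            for (int64_t i = begin; i < end; ++i) partial += data[i];
            return partial;
          },
          [](int64_t a, int64_t b) { return a + b; });
    }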
Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass the IEEE 754 handling of denormal (subnormal) floating-point numbers on x86-64 and some x86 CPUs. They reduce precision for values near zero but improve performance.
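As an illustration of the CPU mechanism itself (not an ATen API): on x86 these modes live in the SSE MXCSR control register and can be toggled with the standard intrinsics.

    #include <pmmintrin.h>  // _MM_SET_DENORMALS_ZERO_MODE (SSE3)
    #include <xmmintrin.h>  // _MM_SET_FLUSH_ZERO_MODE

    // Enable FTZ (denormal results are flushed to zero) and DAZ (denormal
    // inputs are treated as zero) for the calling thread.
    void enable_ftz_daz() {
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    }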
Return the Device of a TensorList, if the list is non-empty and the first Tensor is defined.
(This function implicitly assumes that all tensors in the list have the same device.)
Definition at line 28 of file DeviceGuard.h.
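A minimal usage sketch of device_of(TensorList); the helper name and the CPU fallback are assumptions for illustration, not part of ATen:

    #include <ATen/ATen.h>

    // Return the device shared by the tensors in the list, or fall back to CPU
    // when the list is empty or its first tensor is undefined.
    at::Device device_or_cpu(at::TensorList tensors) {
      if (auto device = at::device_of(tensors)) {
        return *device;
      }
      return at::Device(at::kCPU);
    }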
Return the Type object corresponding to this Tensor, which we can use to perform dynamic dispatch to operators.
This method is NOT intended to be used by end-users; it is purely an implementation detail.
NOTE: We also check at::NonVariableTypeMode, and if it is enabled we always return the non-Variable type from this function. See NOTE [ Treating Variables as non-Variables in type dispatch ].
Definition at line 176 of file LegacyTypeDispatch.h.
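For illustration only, a sketch that inspects the dispatch Type selected for a tensor; this machinery is an implementation detail, and the accessors used below are simply the ones listed in the tables above:

    #include <ATen/ATen.h>
    #include <iostream>

    // Inspect the dispatch Type that getType() selects for a tensor. With
    // at::NonVariableTypeMode enabled, a Variable reports its non-Variable type.
    void describe_dispatch_type(const at::Tensor& t) {
      at::TypeExtendedInterface& type = at::getType(t);
      std::cout << type.toString()
                << " backend=" << static_cast<int>(type.backend())
                << " scalar_type=" << static_cast<int>(type.scalarType())
                << std::endl;
    }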
constexpr const char* at::CUDA_HELP
Definition at line 23 of file CUDAHooksInterface.h.
thread_local bool at::NonVariableTypeMode_enabled = false
NOTE [ Treating Variables as non-Variables in type dispatch ].
Previously, in VariableType_*.cpp (generated by gen_variable_type.py), when a function is using the 'use_derived' strategy, we call its implementation on the base non-Variable type (baseType), passing unwrapped tensors to the call so that any .type() calls in the implementation can treat the passed tensors as non-Variables and won't dispatch back to functions in VariableType.
However, after the Variable/Tensor merge, there is no concept of unwrapping a tensor anymore, and directly passing variables to the base type calls will cause the .type() dispatch in the implementation to treat the tensor as a variable, and any function dispatch based on .type() will dispatch back to VariableType, which is not what we want.
The solution to the above problem is to add at::NonVariableTypeMode, which when enabled will cause legacyTensorType() and getType() to always return non-Variable type, even if the tensor being called on is a variable.
TODO: Since torch::NoGradGuard serves the same purpose in libtorch, we should merge these two thread-local guards.
In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, thread_local is not supported. In that case, we don't provide at::NonVariableTypeMode.
Definition at line 31 of file LegacyTypeDispatch.cpp.
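A minimal sketch of how such a guard might look, assuming the thread-local flag above is declared in a visible header (e.g. LegacyTypeDispatch.h); ATen may provide its own guard for this, so treat the snippet as illustrative rather than the library's API:

    #include <ATen/core/LegacyTypeDispatch.h>  // assumed to declare the thread-local flag

    // Hypothetical RAII guard: treat tensors as non-Variables within a scope by
    // toggling at::NonVariableTypeMode_enabled, then restore the previous value.
    struct NonVariableTypeModeGuard {
      bool prev;
      NonVariableTypeModeGuard() : prev(at::NonVariableTypeMode_enabled) {
        at::NonVariableTypeMode_enabled = true;
      }
      ~NonVariableTypeModeGuard() {
        at::NonVariableTypeMode_enabled = prev;
      }
    };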