Flush-To-Zero and Denormals-Are-Zero mode.
Typedefs | |
| template<typename T , bool is_cuda> | |
| using | acc_type = typename AccumulateType< T, is_cuda >::type |
| using | DimVector = SmallVector< int64_t, 5 > |
| A container for sizes or strides. | |
| using | TensorList = ArrayRef< Tensor > |
| using | DimMask = TensorIterator::DimMask |
| using | PtrVector = TensorIterator::PtrVector |
| using | loop_t = TensorIterator::loop_t |
| using | loop2d_t = TensorIterator::loop2d_t |
| using | CheckedFrom = const char * |
| using | DataType = caffe2::TypeIdentifier |
Functions | |
| Context & | globalContext () |
| TypeExtendedInterface & | getType (TensorOptions options) |
| TypeExtendedInterface & | getType (const TensorImpl *impl) |
| TypeExtendedInterface & | getType (const Tensor &t) |
| LegacyTHDispatcher & | getLegacyTHDispatcher (TensorOptions options) |
| LegacyTHDispatcher & | getLegacyTHDispatcher (const TensorImpl *impl) |
| Allocator * | getCPUAllocator () |
| REGISTER_LEGACY_TYPE_INIT (LegacyDeviceTypeInit) | |
| CAFFE2_API LegacyTHDispatcher & | getLegacyTHDispatcher (const Tensor &) |
| C10_DEFINE_TYPED_REGISTRY (ContextRegistry, at::DeviceType, at::BaseContext, std::unique_ptr, at::Device) | |
| C10_DECLARE_TYPED_REGISTRY (ContextRegistry, at::DeviceType, at::BaseContext, std::unique_ptr, at::Device) | |
| std::unique_ptr< at::BaseContext > | CreateContext (const at::Device &device) |
| std::ios_base & | defaultfloat (std::ios_base &__base) |
| std::ostream & | operator<< (std::ostream &out, const Type &t) |
| void | __printTensor (std::ostream &stream, Tensor &self, int64_t linesize) |
| std::ostream & | print (std::ostream &stream, const Tensor &tensor_, int64_t linesize) |
| C10_DEFINE_REGISTRY (LegacyDeviceTypeInitRegistry, LegacyDeviceTypeInitInterface, LegacyDeviceTypeInitArgs) const LegacyDeviceTypeInitInterface &getLegacyDeviceTypeInit() | |
| C10_DECLARE_REGISTRY (LegacyDeviceTypeInitRegistry, LegacyDeviceTypeInitInterface, LegacyDeviceTypeInitArgs) | |
| CAFFE2_API const LegacyDeviceTypeInitInterface & | getLegacyDeviceTypeInit () |
| LegacyTypeDispatch & | globalLegacyTypeDispatch () |
| Type & | legacyTensorType (const TensorImpl &tensor) |
| Return the Type object corresponding to this Tensor, which we can use to do dynamic dispatch to operators from. | |
| void | initializeLegacyTypeDispatchFor (const TensorImpl &tensor) |
| std::ostream & | operator<< (std::ostream &out, const Range &range) |
| int64_t | get_device (Tensor self) |
| bool | is_cuda (Tensor self) |
| bool | is_hip (Tensor self) |
| bool | is_sparse (Tensor self) |
| C10_DECLARE_REGISTRY (VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) | |
| template<typename T > | |
| std::pair< int64_t, int64_t > | collapse_dims (T *sizes, T *strides, int64_t dims, const int excludeDim=-1) |
| Tensor | sort_strides (Tensor &tensor_) |
| bool | _all_equal_numel (at::ArrayRef< Tensor > tensors) |
| std::string | _all_equal_numel_error (at::ArrayRef< Tensor > tensors) |
| bool | _apply_preamble (ArrayRef< Tensor > tensors) |
| int64_t | _max_dim_tensors (ArrayRef< Tensor > tensors) |
| void | iterate (int64_t size) |
| template<typename Arg , typename... Args> | |
| void | iterate (int64_t size, Arg &iter, Args &...iter_tail) |
| bool | iterate_continue () |
| template<typename Arg , typename... Args> | |
| bool | iterate_continue (Arg &iter, Args &...iter_tail) |
| int64_t | max_iterate_size () |
| template<typename Arg , typename... Args> | |
| int64_t | max_iterate_size (Arg &iter, Args &...iter_tail) |
| void | iterate_overflow () |
| template<typename Arg , typename... Args> | |
| void | iterate_overflow (Arg &iter, Args &...iter_tail) |
| void | forward (int64_t offset) |
| template<typename Arg , typename... Args> | |
| void | forward (int64_t offset, Arg &iter, Args &...iter_tail) |
| int64_t | max_dim () |
| template<typename Arg , typename... Args> | |
| int64_t | max_dim (Arg &iter, Args &...iter_tail) |
| void | apply_op () |
| template<typename Op , typename... Args> | |
| void | apply_op (int64_t numel, int64_t offset, const Op &op, Args...iters) |
| void | apply_kernel () |
| template<typename Op , typename... Args> | |
| void | apply_kernel (int64_t numel, int64_t offset, const Op &op, Args...iters) |
| template<typename scalar1 , typename scalar2 , typename Op > | |
| void | CPU_tensor_parallel_kernel_apply2 (Tensor tensor1, Tensor tensor2, const Op op) |
| template<typename scalar1 , typename Op > | |
| void | CPU_tensor_apply1 (Tensor tensor1, const Op op) |
| template<typename scalar1 , typename scalar2 , typename Op > | |
| void | CPU_tensor_apply2 (Tensor tensor1, Tensor tensor2, const Op op) |
| template<typename scalar1 , typename scalar2 , typename scalar3 , typename Op > | |
| void | CPU_tensor_apply3 (Tensor tensor1, Tensor tensor2, Tensor tensor3, const Op op) |
| template<typename scalar1 , typename scalar2 , typename scalar3 , typename scalar4 , typename Op > | |
| void | CPU_tensor_apply4 (Tensor tensor1, Tensor tensor2, Tensor tensor3, Tensor tensor4, const Op op) |
| template<typename scalar1 , typename Op > | |
| void | CPU_tensor_parallel_apply1 (Tensor tensor1, const Op op, int64_t grain_size=internal::GRAIN_SIZE) |
| template<typename scalar1 , typename scalar2 , typename Op > | |
| void | CPU_tensor_parallel_apply2 (Tensor tensor1, Tensor tensor2, const Op op, int64_t grain_size=internal::GRAIN_SIZE) |
| std::atomic< int > | num_threads (-1) |
| void | set_num_threads (int num_threads_) |
| int | get_num_threads () |
| C10_DECLARE_REGISTRY (ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) | |
| C10_DECLARE_REGISTRY (CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) | |
| C10_DECLARE_REGISTRY (HIPHooksRegistry, HIPHooksInterface, HIPHooksArgs) | |
| optional< Device > | device_of (Tensor t) |
| Return the Device of a Tensor, if the Tensor is defined. | |
| optional< Device > | device_of (TensorList t) |
| Return the Device of a TensorList, if the list is non-empty and the first Tensor is defined. | |
| ScalarType | toScalarType (const DLDataType &dtype) |
| void | deleter (DLManagedTensor *arg) |
| DLManagedTensor * | toDLPack (const Tensor &src) |
| Tensor | fromDLPack (const DLManagedTensor *src) |
| std::vector< int64_t > | infer_size (IntArrayRef a, IntArrayRef b) |
| std::tuple< std::vector< int64_t >, std::vector< int64_t > > | inferExpandGeometry (IntArrayRef tensor_sizes, IntArrayRef tensor_strides, IntArrayRef sizes) |
| void | check_defined (std::initializer_list< std::reference_wrapper< const Tensor >> tensors, const char *api_name) |
| std::tuple< Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand) |
| std::tuple< Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand, const char *api_name) |
| std::tuple< Tensor, Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2) |
| std::tuple< Tensor, Tensor > | expand_inplace (const Tensor &tensor, const Tensor &to_expand1, const Tensor &to_expand2, const char *api_name) |
| std::tuple< Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2) |
| std::tuple< Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2, const char *api_name) |
| std::tuple< Tensor, Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2, const Tensor &to_expand3) |
| std::tuple< Tensor, Tensor, Tensor > | expand_outplace (const Tensor &to_expand1, const Tensor &to_expand2, const Tensor &to_expand3, const char *api_name) |
| std::tuple< Tensor > | expand_size (const Tensor &to_expand, IntArrayRef sizes) |
| std::tuple< Tensor > | expand_size (const Tensor &to_expand, IntArrayRef sizes, const char *api_name) |
| std::vector< Tensor > | expand_outplace (TensorList to_expand) |
| TensorOptions | initialTensorOptions () |
| LegacyTHDispatch & | globalLegacyTHDispatch () |
| MemOverlap | has_internal_overlap (const Tensor &tensor) |
| MemOverlap | has_internal_overlap (TensorImpl *t) |
| void | assert_no_internal_overlap (const Tensor &t, std::string op) |
| void | assert_no_internal_overlap (TensorImpl *t, std::string op) |
| template<typename T , typename std::enable_if< std::is_integral< T >::value, int >::type = 0> | |
| bool | _isnan (T val) |
| int64_t | divup (int64_t x, int64_t y) |
| int | get_max_threads () |
| int | get_thread_num () |
| bool | in_parallel_region () |
| template<class F > | |
| void | parallel_for (const int64_t begin, const int64_t end, const int64_t grain_size, const F &f) |
| template<class scalar_t , class F , class SF > | |
| scalar_t | parallel_reduce (const int64_t begin, const int64_t end, const int64_t grain_size, const scalar_t ident, const F f, const SF sf) |
| struct { | |
| } | Generator (Context *context) |
| virtual CAFFE2_API uint64_t | seed () override |
| virtual CAFFE2_API uint64_t | initialSeed () override |
| virtual CAFFE2_API void * | unsafeGetTH () override |
| struct { | |
| } | Dispatcher () |
| void | register_cpu_types (Context *context) |
| void | register_cuda_types (Context *context) |
| struct { | |
| } | Type () |
| virtual ScalarType | scalarType () const override |
| virtual caffe2::TypeMeta | typeMeta () const override |
| virtual Backend | backend () const override |
| virtual const char * | toString () const override |
| virtual TypeID | ID () const override |
| struct at::CAFFE2_API | get_function (const std::string &schema) |
| Allocator * | allocator () const override |
| Device | getDeviceFromPtr (void *data) const override |
| std::unique_ptr< Generator > | generator () const override |
| std::ostream & | operator<< (std::ostream &out, TensorGeometryArg t) |
| void | checkDim (CheckedFrom c, const TensorGeometryArg &t, int64_t dim) |
| void | checkDimRange (CheckedFrom c, const TensorGeometryArg &t, int64_t dim_start, int64_t dim_end) |
| void | checkContiguous (CheckedFrom c, const TensorGeometryArg &t) |
| void | checkAllContiguous (CheckedFrom c, at::ArrayRef< TensorArg > ts) |
| void | checkSize (CheckedFrom c, const TensorGeometryArg &t, IntArrayRef sizes) |
| void | checkSize (CheckedFrom c, const TensorGeometryArg &t, int64_t dim, int64_t size) |
| void | checkAllSame (CheckedFrom c, ArrayRef< TensorArg > tensors, void(*fn)(CheckedFrom, const TensorArg &, const TensorArg &)) |
| void | checkSameSize (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
| void | checkAllSameSize (CheckedFrom c, ArrayRef< TensorArg > tensors) |
| void | checkNumel (CheckedFrom c, const TensorGeometryArg &t, int64_t numel) |
| void | checkSameNumel (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
| void | checkAllSameNumel (CheckedFrom c, ArrayRef< TensorArg > tensors) |
| void | checkSameGPU (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
| void | checkAllSameGPU (CheckedFrom c, ArrayRef< TensorArg > tensors) |
| void | checkSameType (CheckedFrom c, const TensorArg &t1, const TensorArg &t2) |
| void | checkScalarType (CheckedFrom c, const TensorArg &t, ScalarType ty) |
| void | checkScalarTypes (CheckedFrom c, const TensorArg &t, at::ArrayRef< ScalarType > l) |
| void | checkAllSameType (CheckedFrom c, ArrayRef< TensorArg > tensors) |
| void | checkSameDim (CheckedFrom c, const TensorGeometryArg &t1, const TensorGeometryArg &t2) |
| void | checkDefined (CheckedFrom c, const TensorArg &t) |
| void | checkAllDefined (CheckedFrom c, ArrayRef< TensorArg > ts) |
| void | checkBackend (CheckedFrom c, const Tensor &t, Backend backend) |
| void | checkBackend (CheckedFrom c, ArrayRef< Tensor > tensors, at::Backend backend) |
| void * | maybe_data_ptr (const Tensor &tensor) |
| void * | maybe_data_ptr (const TensorArg &tensor) |
| bool | geometry_is_contiguous (IntArrayRef sizes, IntArrayRef strides) |
| CAFFE2_API void | checkSameNumel (CheckedFrom c, const TensorGeometryArg &t1, const TensorGeometryArg &t2) |
| int | _crash_if_asan (int arg) |
| template<size_t N> | |
| std::array< int64_t, N > | check_intlist (ArrayRef< int64_t > list, const char *name, int pos, ArrayRef< int64_t > def={}) |
| int64_t | sum_intlist (ArrayRef< int64_t > list) |
| int64_t | prod_intlist (ArrayRef< int64_t > list) |
| REGISTER_CONTEXT (DeviceType::CPU, caffe2::CPUContext) | |
| REGISTER_COPY_BYTES_FUNCTION (DeviceType::CPU, DeviceType::CPU, caffe2::CopyBytesWrapper) | |
| REGISTER_CONTEXT (DeviceType::IDEEP, caffe2::IDEEPContext) | |
| REGISTER_COPY_BYTES_FUNCTION (DeviceType::IDEEP, DeviceType::CPU, CopyBytesWrapper) | |
| REGISTER_COPY_BYTES_FUNCTION (DeviceType::CPU, DeviceType::IDEEP, CopyBytesWrapper) | |
| REGISTER_COPY_BYTES_FUNCTION (DeviceType::IDEEP, DeviceType::IDEEP, CopyBytesWrapper) | |
| REGISTER_COMPLEX_HOOKS (ComplexHooks) | |
Variables | |
| thread_local bool | NonVariableTypeMode_enabled = false |
| NOTE [ Treating Variables as non-Variables in type dispatch ]. | |
| constexpr const char * | CUDA_HELP |
| virtual | CAFFE2_API {name}Generator() |
| Context * | context |
| struct CAFFE2_API at::LegacyTHDispatcher | Generator |
| class CAFFE2_API at::Tensor | Dispatcher |
| at::TypeExtendedInterface | Type |
| constexpr size_t | dim_bitset_size = 64 |
Detailed Description
Flush-To-Zero and Denormals-Are-Zero mode.
Contains the implementation of parallel reductions in TensorIterator.
Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass the IEEE 754 handling of denormal (subnormal) floating-point numbers on x86-64 and some x86 CPUs. They reduce precision for values near zero but improve performance.
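A minimal sketch of what toggling these modes looks like on x86-64, using the SSE control-register intrinsics that CPU-side flush-denormal support is built on. The enable_ftz_daz helper name is made up for illustration, and the exact ATen entry point that wraps this (something like globalContext().setFlushDenormal(true)) is an assumption, not part of the listing above.

    #include <pmmintrin.h>  // _MM_SET_DENORMALS_ZERO_MODE (SSE3)
    #include <xmmintrin.h>  // _MM_SET_FLUSH_ZERO_MODE (SSE)

    // Hypothetical helper: enable both modes for the calling thread.
    void enable_ftz_daz() {
      // Results that would be denormal are flushed to signed zero.
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
      // Denormal inputs are read as signed zero.
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    }

The MXCSR register is per-thread state, so these calls only affect the thread that makes them.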
Return the Device of a TensorList, if the list is non-empty and the first Tensor is defined.
(This function implicitly assumes that all tensors in the list have the same device.)
Definition at line 28 of file DeviceGuard.h.
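A hedged usage sketch for device_of, assuming a translation unit that links against ATen; the tensor shapes and values are made up for illustration.

    #include <ATen/ATen.h>
    #include <vector>

    void device_of_example() {
      at::Tensor t = at::ones({2, 3});
      // device_of returns an optional<Device>; it is empty for an undefined tensor.
      if (auto dev = at::device_of(t)) {
        // *dev is the CPU device for a tensor created with default options.
      }
      std::vector<at::Tensor> tensors = {t, at::zeros({2, 3})};
      // For a TensorList, the device of the first (defined) tensor is reported.
      auto list_dev = at::device_of(at::TensorList(tensors));
      (void)list_dev;
    }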
Return the Type object corresponding to this Tensor, which we can use to do dynamic dispatch to operators from.
This method is NOT intended to be used by end-users; it is purely an implementation detail.
NOTE: We also check at::NonVariableTypeMode; if it is enabled, this function always returns the non-Variable type. See NOTE [ Treating Variables as non-Variables in type dispatch ].
Definition at line 176 of file LegacyTypeDispatch.h.
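This is internal machinery, but a short sketch can make the dispatch role of the Type object concrete. The snippet below is illustrative only and assumes an ATen build where these internals are reachable; it is not a supported end-user API.

    #include <ATen/ATen.h>

    void legacy_type_example() {
      at::Tensor t = at::ones({2});
      // Fetch the Type object that operator calls on this tensor would dispatch through.
      at::Type& ty = at::legacyTensorType(*t.unsafeGetTensorImpl());
      // ty.backend() and ty.scalarType() describe the dispatch target (e.g. CPU, Float).
      (void)ty;
    }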
| constexpr const char* at::CUDA_HELP |
Definition at line 23 of file CUDAHooksInterface.h.
| thread_local bool at::NonVariableTypeMode_enabled = false |
NOTE [ Treating Variables as non-Variables in type dispatch ].
Previously, in VariableType_*.cpp (generated by gen_variable_type.py), when a function used the 'use_derived' strategy, we called its implementation on the base non-Variable type (baseType), passing unwrapped tensors to the call so that any .type() calls in the implementation would treat the passed tensors as non-Variables and would not dispatch back to functions in VariableType.
However, after the Variable/Tensor merge there is no longer any concept of unwrapping a tensor. Passing variables directly to the base-type calls makes the .type() dispatch in the implementation treat the tensor as a variable, so any function dispatch based on .type() goes back to VariableType, which is not what we want.
The solution to the above problem is at::NonVariableTypeMode: when it is enabled, legacyTensorType() and getType() always return the non-Variable type, even if the tensor they are called on is a variable.
TODO: Since torch::NoGradGuard serves the same purpose in libtorch, we should merge these two thread-local guards. In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, thread_local is not supported. In that case, we don't provide at::NonVariableTypeMode.
Definition at line 31 of file LegacyTypeDispatch.cpp.
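A minimal sketch of the RAII-guard pattern this note implies, built directly on the documented thread-local flag. ATen ships its own guard for this purpose (an AutoNonVariableTypeMode-style helper in LegacyTypeDispatch.h); the class name and include path below are assumptions, written out by hand for illustration.

    #include <ATen/core/LegacyTypeDispatch.h>  // assumed path; the flag is declared in LegacyTypeDispatch.h per the docs above

    namespace {
    // Hypothetical guard: treat tensors as non-Variables for the lifetime of this object.
    struct NonVariableTypeGuard {
      bool prev_;
      NonVariableTypeGuard() : prev_(at::NonVariableTypeMode_enabled) {
        at::NonVariableTypeMode_enabled = true;   // getType()/legacyTensorType() now return non-Variable types
      }
      ~NonVariableTypeGuard() {
        at::NonVariableTypeMode_enabled = prev_;  // restore the previous thread-local state
      }
    };
    }  // namespace

Because the flag is thread_local, such a guard only affects dispatch on the thread that constructs it.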