#include <torch/csrc/python_headers.h>

#include <unordered_map>
#include <thread>
#include <chrono>
#include <sstream>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#ifdef USE_NCCL
#include <nccl.h>
#endif

#include <torch/csrc/cuda/THCP.h>

#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/auto_gil.h>
#include <torch/csrc/autograd/generated/VariableType.h>
#include <torch/csrc/utils/python_strings.h>
#include <torch/csrc/cuda/python_comm.h>
#include <torch/csrc/autograd/generated/variable_factories.h>

using namespace torch;

THCState *state;
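////////////////////////////////////////////////////////////////////////////////
// CUDA management methods
//
// The Python-facing wrappers below follow a common pattern: arguments arrive
// as PyObject*, are validated with the THPUtils_* helpers, and C++ exceptions
// are translated into Python exceptions by the HANDLE_TH_ERRORS /
// END_HANDLE_TH_ERRORS macro pair.
////////////////////////////////////////////////////////////////////////////////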
void THCPModule_setDevice(int device)
{
  THCudaCheck(cudaSetDevice(device));
}
PyObject * THCPModule_setDevice_wrap(PyObject *self, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to setDevice");
  int64_t device = THPUtils_unpackLong(arg);

  THCPModule_setDevice(device);

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_getDevice_wrap(PyObject *self)
{
  HANDLE_TH_ERRORS
  int device;
  THCudaCheck(cudaGetDevice(&device));
  return PyLong_FromLong(device);
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_getDeviceCount_wrap(PyObject *self)
{
  HANDLE_TH_ERRORS
  int ndevice;
  if (cudaGetDeviceCount(&ndevice) != cudaSuccess) {
    // Clear the error flag and report zero devices rather than raising.
    cudaGetLastError();
    ndevice = 0;
  }
  return PyLong_FromLong(ndevice);
  END_HANDLE_TH_ERRORS
}
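// Streams cross the Python/C++ boundary as opaque uint64_t values:
// CUDAStream::pack() encodes a stream into an integer that
// CUDAStream::unpack() (see THCPModule_setStream_wrap below) reverses.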
PyObject * THCPModule_getCurrentStream_wrap(
    PyObject * /* unused */, PyObject *device_index) {
  HANDLE_TH_ERRORS
  THPUtils_assert(
    THPUtils_checkLong(device_index), "invalid argument to getCurrentStream");
  int64_t device = THPUtils_unpackLong(device_index);
  return PyLong_FromUnsignedLongLong(
    at::cuda::getCurrentCUDAStream(device).pack());
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_getDefaultStream_wrap(
    PyObject * /* unused */, PyObject *device_index) {
  HANDLE_TH_ERRORS
  THPUtils_assert(
    THPUtils_checkLong(device_index), "invalid argument to getDefaultStream");
  int64_t device = THPUtils_unpackLong(device_index);
  return PyLong_FromUnsignedLongLong(
    at::cuda::getDefaultCUDAStream(device).pack());
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_setStream_wrap(PyObject *self, PyObject *obj)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(PyLong_Check(obj), "invalid stream");
  uint64_t bits = PyLong_AsUnsignedLongLong(obj);
  if (bits == static_cast<uint64_t>(-1) && PyErr_Occurred()) {
    throw python_error();
  }
  auto stream = at::cuda::CUDAStream::unpack(bits);
  int device;
  THCudaCheck(cudaGetDevice(&device));
  if (device != stream.device_index()) {
    THCPModule_setDevice(stream.device_index());
  }
  at::cuda::setCurrentCUDAStream(stream);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
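// Illustrative Python-side round trip through the bindings registered below
// in _THCPModule_methods (names are private and may vary by version):
//   bits = torch._C._cuda_getCurrentStream(0)  # packed uint64
//   torch._C._cuda_setStream(bits)             # restores the same stream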
PyObject * THCPModule_isDriverSufficient(PyObject *self)
{
  int count;
  cudaError_t err = cudaGetDeviceCount(&count);
  if (err == cudaErrorInsufficientDriver) {
    return PyBool_FromLong(0);
  }
  return PyBool_FromLong(1);
}
PyObject * THCPModule_getDriverVersion(PyObject *self)
{
  int driverVersion = -1;
  cudaError_t err = cudaDriverGetVersion(&driverVersion);
  if (err != cudaSuccess) {
    PyErr_Format(PyExc_RuntimeError,
                 "Error calling cudaDriverGetVersion: %d %s",
                 err, cudaGetErrorString(err));
    return nullptr;
  }
  return PyLong_FromLong((int64_t) driverVersion);
}
PyObject * THCPModule_getCompiledVersion(PyObject *self)
{
  return PyLong_FromLong((long) CUDA_VERSION);
}
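// RNG state management. The CUDA generator state is exposed to Python as a
// CPU ByteTensor so it can be saved and restored from the torch.cuda side.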
PyObject * THCPModule_getRNGState(PyObject *_unused)
{
  using namespace torch::autograd;
  HANDLE_TH_ERRORS
  Variable var = torch::empty(0, at::device(at::kCPU).dtype(at::kByte));
  THCRandom_getRNGState(state, (THByteTensor*)(var.data().unsafeGetTensorImpl()));
  return THPVariable_Wrap(var);
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_setRNGState(PyObject *_unused, PyObject *obj)
{
  HANDLE_TH_ERRORS
  if (!THPVariable_Check(obj) ||
      at::globalContext().getNonVariableType(
          THPVariable_Unpack(obj).type().backend(),
          THPVariable_Unpack(obj).scalar_type()).ID() != at::TypeID::CPUByte) {
    throw TypeError("set_rng_state expects a torch.ByteTensor, but got %s",
                    Py_TYPE(obj)->tp_name);
  }
  auto& tensor = THPVariable_UnpackData(obj);
  THCRandom_setRNGState(state, (THByteTensor*)tensor.unsafeGetTensorImpl());
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_manualSeed(PyObject *_unused, PyObject *seed)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(seed), "manual_seed expected a long, "
                  "but got %s", THPUtils_typename(seed));
  THCRandom_manualSeed(state, THPUtils_unpackLong(seed));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_manualSeedAll(PyObject *_unused, PyObject *seed)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(seed), "manual_seed expected a long, "
                  "but got %s", THPUtils_typename(seed));
  THCRandom_manualSeedAll(state, THPUtils_unpackLong(seed));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_seed(PyObject *_unused)
{
  HANDLE_TH_ERRORS
  return THPUtils_packUInt64(THCRandom_seed(state));
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_seedAll(PyObject *_unused)
{
  HANDLE_TH_ERRORS
  return THPUtils_packUInt64(THCRandom_seedAll(state));
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_initialSeed(PyObject *_unused)
{
  HANDLE_TH_ERRORS
  return THPUtils_packUInt64(THCRandom_initialSeed(state));
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_cudaHostAllocator(PyObject *_unused)
{
  HANDLE_TH_ERRORS
  c10::Allocator* allocator = THCState_getCudaHostAllocator(state);
  return PyLong_FromVoidPtr(allocator);
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_cudaSynchronize(PyObject *_unused)
{
  HANDLE_TH_ERRORS
  THCudaCheck(cudaDeviceSynchronize());
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
PyObject * THCPModule_cudaSleep(PyObject *_unused, PyObject *cycles)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(cycles), "torch.cuda._sleep(): expected 'int'");
  THC_sleep(LIBRARY_STATE THPUtils_unpackLong(cycles));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
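// We need to ensure that a thread never loses the GIL while it holds the CUDA
// mutex; otherwise another thread might get scheduled, try to e.g. allocate a
// new tensor, and deadlock. A single global is enough because the mutex is not
// recursive and can only be set once, by the thread that owns it.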
static PyGILState_STATE cudaMutexGILState;

PyObject * THCPModule_cudaLockMutex(PyObject *module)
{
  auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex();
  // This has to be a busy loop: we must hold the GIL while waiting, or
  // another Python thread could try to free a CUDA tensor and acquire the
  // cudaMutex without giving up the GIL, causing a deadlock.
  while (true) {
    if (mutex->try_lock())
      break;
    {
      AutoNoGIL no_gil;
      std::this_thread::sleep_for(std::chrono::microseconds(10));
    }
  }

  cudaMutexGILState = PyGILState_Ensure();
  Py_RETURN_NONE;
}
PyObject * THCPModule_cudaUnlockMutex(PyObject *module)
{
  auto mutex = c10::cuda::CUDACachingAllocator::getFreeMutex();
  PyGILState_Release(cudaMutexGILState);
  mutex->unlock();
  Py_RETURN_NONE;
}
PyObject * THCPModule_emptyCache(PyObject *_unused)
{
  HANDLE_TH_ERRORS
  c10::cuda::CUDACachingAllocator::emptyCache();
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}
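// Caching allocator statistics. Each wrapper takes a device index and returns
// a byte count: "allocated" counts memory in use by tensors, while "cached"
// counts memory held by the allocator, including free blocks kept for reuse.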
PyObject * THCPModule_memoryAllocated(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_allocated");
  int device = (int) THPUtils_unpackLong(arg);
  auto memory_allocated = c10::cuda::CUDACachingAllocator::currentMemoryAllocated(device);
  return PyLong_FromUnsignedLongLong(memory_allocated);
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_maxMemoryAllocated(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to max_memory_allocated");
  int device = (int) THPUtils_unpackLong(arg);
  auto max_memory_allocated = c10::cuda::CUDACachingAllocator::maxMemoryAllocated(device);
  return PyLong_FromUnsignedLongLong(max_memory_allocated);
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_resetMaxMemoryAllocated(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_max_memory_allocated");
  int device = (int) THPUtils_unpackLong(arg);
  c10::cuda::CUDACachingAllocator::resetMaxMemoryAllocated(device);
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}

PyObject * THCPModule_memoryCached(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to memory_cached");
  int device = (int) THPUtils_unpackLong(arg);
  auto memory_cached = c10::cuda::CUDACachingAllocator::currentMemoryCached(device);
  return PyLong_FromUnsignedLongLong(memory_cached);
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_maxMemoryCached(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to max_memory_cached");
  int device = (int) THPUtils_unpackLong(arg);
  auto max_memory_cached = c10::cuda::CUDACachingAllocator::maxMemoryCached(device);
  return PyLong_FromUnsignedLongLong(max_memory_cached);
  END_HANDLE_TH_ERRORS
}

PyObject * THCPModule_resetMaxMemoryCached(PyObject *_unused, PyObject *arg)
{
  HANDLE_TH_ERRORS
  THPUtils_assert(THPUtils_checkLong(arg), "invalid argument to reset_max_memory_cached");
  int device = (int) THPUtils_unpackLong(arg);
  c10::cuda::CUDACachingAllocator::resetMaxMemoryCached(device);
  END_HANDLE_TH_ERRORS
  Py_RETURN_NONE;
}
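////////////////////////////////////////////////////////////////////////////////
// Cuda module initialization
////////////////////////////////////////////////////////////////////////////////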
static void bindCudaDeviceProperties(PyObject* module) {
  // Add the _CudaDeviceProperties class and _get_device_properties to torch.cuda
  auto m = py::handle(module).cast<py::module>();
  py::class_<cudaDeviceProp>(m, "_CudaDeviceProperties")
    .def_readonly("name", &cudaDeviceProp::name)
    .def_readonly("major", &cudaDeviceProp::major)
    .def_readonly("minor", &cudaDeviceProp::minor)
    .def_readonly("is_multi_gpu_board", &cudaDeviceProp::isMultiGpuBoard)
    .def_readonly("is_integrated", &cudaDeviceProp::integrated)
    .def_readonly("multi_processor_count", &cudaDeviceProp::multiProcessorCount)
    .def_readonly("total_memory", &cudaDeviceProp::totalGlobalMem)
    .def("__repr__", [](const cudaDeviceProp &prop) {
      std::ostringstream stream;
      stream << "_CudaDeviceProperties(name='" << prop.name << "', major=" << prop.major
             << ", minor=" << prop.minor << ", total_memory=" << prop.totalGlobalMem / (1024 * 1024)
             << "MB, multi_processor_count=" << prop.multiProcessorCount << ")";
      return stream.str();
    });
  m.def("_get_device_properties", [](int device) -> cudaDeviceProp * {
    // The returned cudaDeviceProp is owned by ATen's global context, so
    // pybind must not take ownership of it.
    return at::cuda::getDeviceProperties(device);
  }, py::return_value_policy::reference);
}
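// Callback for the Python part of the extension. Performs additional
// initialization of Python classes once torch.cuda has been imported.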
static PyObject * THCPModule_initExtension(PyObject *self)
{
  HANDLE_TH_ERRORS
  state = at::globalContext().lazyInitCUDA();

  auto m = THPObjectPtr(PyImport_ImportModule("torch.cuda"));
  if (!m) throw python_error();

  // Register CUDA storage Python objects
  THCPDoubleStorage_postInit(m);
  THCPFloatStorage_postInit(m);
  THCPHalfStorage_postInit(m);
  THCPLongStorage_postInit(m);
  THCPIntStorage_postInit(m);
  THCPShortStorage_postInit(m);
  THCPCharStorage_postInit(m);
  THCPByteStorage_postInit(m);
  THCPBoolStorage_postInit(m);

  bool has_magma = at::hasMAGMA();
  if (has_magma) {
    THCMagma_init(state);
  }

  bool has_half = true;

  auto set_module_attr = [&](const char* name, PyObject* v) {
    // PyObject_SetAttrString doesn't steal a reference, so no incref is needed.
    if (PyObject_SetAttrString(m, name, v) < 0) {
      throw python_error();
    }
  };

  set_module_attr("has_magma", has_magma ? Py_True : Py_False);
  set_module_attr("has_half", has_half ? Py_True : Py_False);

  auto _state_cdata = THPObjectPtr(PyLong_FromVoidPtr(state));
  if (!_state_cdata) throw python_error();
  set_module_attr("_state_cdata", _state_cdata.get());

  bindCudaDeviceProperties(m);

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
#ifdef USE_NCCL
#include <torch/csrc/cuda/python_nccl.h>

void THCPModule_useNccl()
{
  // Reference an NCCL symbol so the library is actually loaded.
  ncclUniqueId uniqueId;
  ncclGetUniqueId(&uniqueId);
}
#endif
PyObject * THCPModule_getCurrentBlasHandle_wrap(PyObject *self)
{
  HANDLE_TH_ERRORS
  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
  return PyLong_FromVoidPtr(handle);
  END_HANDLE_TH_ERRORS
}
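// Method table exposing the wrappers above as private _cuda_* / _nccl_*
// functions on the extension module. NCCL entries are only compiled in when
// USE_NCCL is defined.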
static struct PyMethodDef _THCPModule_methods[] = {
  {"_cuda_init", (PyCFunction)THCPModule_initExtension, METH_NOARGS, nullptr},
  {"_cuda_setDevice", (PyCFunction)THCPModule_setDevice_wrap, METH_O, nullptr},
  {"_cuda_getDevice", (PyCFunction)THCPModule_getDevice_wrap, METH_NOARGS, nullptr},
  {"_cuda_getDeviceCount", (PyCFunction)THCPModule_getDeviceCount_wrap, METH_NOARGS, nullptr},
  {"_cuda_getCurrentStream", (PyCFunction)THCPModule_getCurrentStream_wrap, METH_O, nullptr},
  {"_cuda_getDefaultStream", (PyCFunction)THCPModule_getDefaultStream_wrap, METH_O, nullptr},
  {"_cuda_getCurrentBlasHandle", (PyCFunction)THCPModule_getCurrentBlasHandle_wrap, METH_NOARGS, nullptr},
  {"_cuda_setStream", (PyCFunction)THCPModule_setStream_wrap, METH_O, nullptr},
  {"_cuda_isDriverSufficient", (PyCFunction)THCPModule_isDriverSufficient, METH_NOARGS, nullptr},
  {"_cuda_getDriverVersion", (PyCFunction)THCPModule_getDriverVersion, METH_NOARGS, nullptr},
  {"_cuda_getCompiledVersion", (PyCFunction)THCPModule_getCompiledVersion, METH_NOARGS, nullptr},
  {"_cuda_getRNGState", (PyCFunction)THCPModule_getRNGState, METH_NOARGS, nullptr},
  {"_cuda_setRNGState", (PyCFunction)THCPModule_setRNGState, METH_O, nullptr},
  {"_cuda_emptyCache", (PyCFunction) THCPModule_emptyCache, METH_NOARGS, nullptr},
  {"_cuda_memoryAllocated", (PyCFunction) THCPModule_memoryAllocated, METH_O, nullptr},
  {"_cuda_maxMemoryAllocated", (PyCFunction) THCPModule_maxMemoryAllocated, METH_O, nullptr},
  {"_cuda_resetMaxMemoryAllocated", (PyCFunction) THCPModule_resetMaxMemoryAllocated, METH_O, nullptr},
  {"_cuda_memoryCached", (PyCFunction) THCPModule_memoryCached, METH_O, nullptr},
  {"_cuda_maxMemoryCached", (PyCFunction) THCPModule_maxMemoryCached, METH_O, nullptr},
  {"_cuda_resetMaxMemoryCached", (PyCFunction) THCPModule_resetMaxMemoryCached, METH_O, nullptr},
  {"_cuda_manualSeed", (PyCFunction)THCPModule_manualSeed, METH_O, nullptr},
  {"_cuda_manualSeedAll", (PyCFunction)THCPModule_manualSeedAll, METH_O, nullptr},
  {"_cuda_seed", (PyCFunction)THCPModule_seed, METH_NOARGS, nullptr},
  {"_cuda_seedAll", (PyCFunction)THCPModule_seedAll, METH_NOARGS, nullptr},
  {"_cuda_initialSeed", (PyCFunction)THCPModule_initialSeed, METH_NOARGS, nullptr},
  {"_cuda_cudaHostAllocator", (PyCFunction)THCPModule_cudaHostAllocator, METH_NOARGS, nullptr},
  {"_cuda_synchronize", (PyCFunction)THCPModule_cudaSynchronize, METH_NOARGS, nullptr},
  {"_cuda_sleep", (PyCFunction)THCPModule_cudaSleep, METH_O, nullptr},
  {"_cuda_lock_mutex", (PyCFunction)THCPModule_cudaLockMutex, METH_NOARGS, nullptr},
  {"_cuda_unlock_mutex", (PyCFunction)THCPModule_cudaUnlockMutex, METH_NOARGS, nullptr},
#ifdef USE_NCCL
  {"_nccl_version", (PyCFunction)THCPModule_nccl_version, METH_NOARGS, nullptr},
  {"_nccl_unique_id", (PyCFunction)THCPModule_nccl_unique_id, METH_NOARGS, nullptr},
  {"_nccl_init_rank", (PyCFunction)THCPModule_nccl_init_rank, METH_VARARGS, nullptr},
  {"_nccl_reduce", (PyCFunction)THCPModule_nccl_reduce, METH_VARARGS, nullptr},
  {"_nccl_all_reduce", (PyCFunction)THCPModule_nccl_all_reduce, METH_VARARGS, nullptr},
  {"_nccl_broadcast", (PyCFunction)THCPModule_nccl_broadcast, METH_VARARGS, nullptr},
  {"_nccl_all_gather", (PyCFunction)THCPModule_nccl_all_gather, METH_VARARGS, nullptr},
  {"_nccl_reduce_scatter", (PyCFunction)THCPModule_nccl_reduce_scatter, METH_VARARGS, nullptr},
#endif
  {nullptr}
};
PyMethodDef* THCPModule_methods() {
  return _THCPModule_methods;
}
namespace torch { namespace cuda {

void initModule(PyObject *module) {
  python::initCommMethods(module);
}

}} // namespace torch::cuda