12 #include <forward_list> 14 #include <ATen/ATen.h> 15 #include <torch/csrc/WindowsTorchApiMacro.h> 20 #include <torch/csrc/jit/code_template.h> 22 typedef struct CUevent_st* CUDAEventStub;
24 namespace torch {
namespace autograd {
31 virtual void record(
int* device, CUDAEventStub* event, int64_t* cpu_ns) {
34 virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
38 virtual void nvtxMarkA(
const char* name) {
41 virtual void nvtxRangePushA(
const char* name) {
44 virtual void nvtxRangePop() {
47 virtual bool enabled() {
50 virtual void onEachDevice(std::function<
void(
int)> op) {
53 virtual void synchronize() {
60 AT_ERROR(
"CUDA used in profiler but not enabled.");
64 TORCH_API
void registerCUDAMethods(
CUDAStubs* stubs);
/// Rounds `a` up to the nearest multiple of `b`.
///
/// Computed as quotient-plus-remainder-carry rather than `(a + b - 1) / b`,
/// because the latter wraps around when `a + b - 1` exceeds the range of
/// size_t and then silently yields a too-small result.
///
/// Kept as a single return statement so it remains usable in constant
/// expressions under C++11 rules.
///
/// @param a  value to round up.
/// @param b  granularity; must be non-zero (division by zero otherwise).
/// @return   the smallest multiple of `b` that is >= `a`. If that multiple
///           itself does not fit in size_t the result wraps.
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  return (a / b + (a % b != 0 ? 1 : 0)) * b;
}
70 #if defined(__MACH__) && !defined(CLOCK_REALTIME) 76 inline int64_t getTime() {
78 using namespace std::chrono;
79 using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
80 return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
81 #elif defined(__MACH__) && !defined(CLOCK_REALTIME) 83 gettimeofday(&now, NULL);
84 return static_cast<int64_t
>(now.tv_sec) * 1000000000 + static_cast<int64_t>(now.tv_usec) * 1000;
88 clock_gettime(CLOCK_MONOTONIC, &t);
89 return static_cast<int64_t
>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
93 enum class EventKind : uint16_t {
100 Event(EventKind kind, std::string name, uint16_t thread_id,
bool record_cuda)
101 : owned_name_(
new std::string(std::move(name)))
102 , name_ptr_(owned_name_->c_str())
104 , thread_id_(thread_id) { record(record_cuda); }
105 Event(EventKind kind,
const char* name, uint16_t thread_id,
bool record_cuda)
108 , thread_id_(thread_id) { record(record_cuda); }
110 void record(
bool record_cuda);
111 std::string kind()
const {
113 case EventKind::Mark:
return "mark";
114 case EventKind::PushRange:
return "push";
115 case EventKind::PopRange:
return "pop";
117 throw std::runtime_error(
"unknown EventKind");
119 const char* name()
const {
122 uint16_t thread_id()
const {
125 double cpu_elapsed_us(
const Event & e) {
126 return (e.cpu_ns_ - cpu_ns_)/(1000.0);
128 double cuda_elapsed_us(
const Event & e);
129 bool has_cuda()
const {
130 return event !=
nullptr;
140 std::unique_ptr<std::string> owned_name_;
141 const char * name_ptr_;
145 struct CUevent_st*
event =
nullptr;
152 constexpr
static size_t MB = 1024 * 1024;
153 constexpr
static size_t event_block_size = 16 * MB;
154 constexpr
static size_t num_block_elements =
155 event_block_size / ceilToMultiple(
sizeof(
Event),
alignof(
Event));
156 static_assert(
sizeof(
Event[num_block_elements]) <= event_block_size,
157 "num_block_elements is calculated incorrectly");
158 using block_type = std::vector<Event>;
161 blocks.emplace_front();
162 auto & new_block = blocks.front();
163 new_block.reserve(num_block_elements);
165 const char *
const end_ptr =
reinterpret_cast<char*
>(new_block.data() + num_block_elements);
166 for (
volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
167 ptr < end_ptr; ptr += 4 * 1024) {
172 template<
typename... Args>
173 void record(Args&&... args) {
174 if (blocks.empty() || blocks.front().size() == num_block_elements) {
177 blocks.front().emplace_back(std::forward<Args>(args)...);
180 std::vector<Event> consolidate() {
181 std::vector<Event> result;
182 for (
auto & block : blocks) {
183 result.insert(result.begin(),
184 std::make_move_iterator(block.begin()),
185 std::make_move_iterator(block.end()));
191 std::forward_list<block_type> blocks;
194 enum class ProfilerState {
202 TORCH_API
void mark(std::string name,
bool include_cuda =
true);
203 TORCH_API
void pushRange(std::string name);
204 TORCH_API
void popRange();
209 explicit RecordFunction(std::string name);
211 explicit RecordFunction(
const char* name);
213 explicit RecordFunction(
const char* name, int64_t current_sequence_nr);
220 using thread_event_lists = std::vector<std::vector<Event>>;
223 TORCH_API
void enableProfiler(ProfilerState new_state);
224 TORCH_API thread_event_lists disableProfiler();
235 RecordProfile(
const std::string& filename);
240 std::unique_ptr<std::ofstream> file_;
242 void processEvents(
const std::vector<Event*>& events);