Caffe2 - C++ API
A deep learning, cross-platform ML framework
profiler.h
1 #pragma once
2 
3 #include <thread>
4 #include <iostream>
5 #include <mutex>
6 #include <memory>
7 #include <vector>
8 #include <cstdint>
9 #include <string>
10 #include <list>
11 #include <sstream>
12 #include <forward_list>
13 #include <tuple>
14 #include <ATen/ATen.h>
15 #include <torch/csrc/WindowsTorchApiMacro.h>
16 #ifndef _WIN32
17 #include <ctime>
18 #endif
19 
20 #include <torch/csrc/jit/code_template.h>
21 
22 typedef struct CUevent_st* CUDAEventStub;
23 
24 namespace torch { namespace autograd {
25 
26 struct Function;
27 
28 namespace profiler {
29 
// Hook interface for the profiler's CUDA-specific operations.
// This default implementation throws on every CUDA call; when CUDA support
// is available, a concrete subclass is installed via registerCUDAMethods()
// so builds without CUDA can still link and run the CPU profiler.
struct TORCH_API CUDAStubs {
  // Record a CUDA event plus the matching CPU timestamp (all out-params).
  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
    fail();
  }
  // Elapsed time between two previously recorded CUDA events, in ms
  // presumably (units defined by the real implementation — confirm in .cpp).
  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
    fail();
    return 0.f;  // unreachable — fail() throws; keeps the compiler happy
  }
  // NVTX marker/range emitters (used by ProfilerState::NVTX).
  virtual void nvtxMarkA(const char* name) {
    fail();
  }
  virtual void nvtxRangePushA(const char* name) {
    fail();
  }
  virtual void nvtxRangePop() {
    fail();
  }
  // False for this stub; the real implementation reports CUDA availability.
  virtual bool enabled() {
    return false;
  }
  // Run `op` once per CUDA device.
  virtual void onEachDevice(std::function<void(int)> op) {
    fail();
  }
  virtual void synchronize() {
    fail();
  }
  virtual ~CUDAStubs();

private:
  // Every CUDA entry point of the stub funnels here: error out loudly
  // instead of silently producing bogus profiling data.
  void fail() {
    AT_ERROR("CUDA used in profiler but not enabled.");
  }
};
63 
// Install the CUDAStubs implementation the profiler should use,
// replacing the default (throwing) stub above.
TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
65 
// Round `a` up to the nearest multiple of `b` (b must be non-zero).
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  // Number of b-sized chunks needed to cover `a`, rounded up.
  const size_t chunks = (a + b - 1) / b;
  return chunks * b;
}
69 
70 #if defined(__MACH__) && !defined(CLOCK_REALTIME)
71 #include <sys/time.h>
72 // clock_gettime is not implemented on older versions of OS X (< 10.12).
73 // If implemented, CLOCK_REALTIME will have already been defined.
74 #endif
75 
// Current time in nanoseconds from a platform-appropriate clock.
// Only differences between two getTime() values are meaningful: the three
// branches use different epochs (and the macOS fallback is a wall clock,
// so it can jump if the system time is adjusted).
inline int64_t getTime() {
#ifdef _WIN32
  // Prefer high_resolution_clock, but only if it is steady (monotonic);
  // otherwise fall back to steady_clock.
  using namespace std::chrono;
  using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
#elif defined(__MACH__) && !defined(CLOCK_REALTIME)
  // Older macOS (< 10.12) lacks clock_gettime; use microsecond-resolution
  // gettimeofday and scale usec -> ns.
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<int64_t>(now.tv_sec) * 1000000000 + static_cast<int64_t>(now.tv_usec) * 1000;
#else
  // clock_gettime is *much* faster than std::chrono implementation on Linux
  struct timespec t{};
  clock_gettime(CLOCK_MONOTONIC, &t);
  return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
#endif
}
92 
// What a profiler Event represents. uint16_t keeps Event compact.
enum class EventKind : uint16_t {
  Mark,       // a single point-in-time marker
  PushRange,  // start of a named range
  PopRange    // end of the most recently pushed range
};
98 
99 struct TORCH_API Event final {
100  Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
101  : owned_name_(new std::string(std::move(name)))
102  , name_ptr_(owned_name_->c_str())
103  , kind_(kind)
104  , thread_id_(thread_id) { record(record_cuda); }
105  Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
106  : name_ptr_(name)
107  , kind_(kind)
108  , thread_id_(thread_id) { record(record_cuda); }
109 
110  void record(bool record_cuda);
111  std::string kind() const {
112  switch(kind_) {
113  case EventKind::Mark: return "mark";
114  case EventKind::PushRange: return "push";
115  case EventKind::PopRange: return "pop";
116  }
117  throw std::runtime_error("unknown EventKind");
118  }
119  const char* name() const {
120  return name_ptr_;
121  }
122  uint16_t thread_id() const {
123  return thread_id_;
124  }
125  double cpu_elapsed_us(const Event & e) {
126  return (e.cpu_ns_ - cpu_ns_)/(1000.0);
127  }
128  double cuda_elapsed_us(const Event & e);
129  bool has_cuda() const {
130  return event != nullptr;
131  }
132  int device() const {
133  return device_;
134  }
135 private:
136  int64_t cpu_ns_ = 0; // signed to allow for negative intervals, initialized for safety.
137  // std::string is a very large object (usually around 32B),
138  // and this field is used only for user-created ranges, so
139  // it's better to save on size of Events.
140  std::unique_ptr<std::string> owned_name_;
141  const char * name_ptr_;
142  EventKind kind_;
143  uint16_t thread_id_;
144  int device_ = -1;
145  struct CUevent_st* event = nullptr;
146 };
147 
148 // a linked-list of fixed sized vectors, to avoid
149 // a std::vector resize from taking a large amount of time inside
150 // a profiling event
152  constexpr static size_t MB = 1024 * 1024;
153  constexpr static size_t event_block_size = 16 * MB;
154  constexpr static size_t num_block_elements =
155  event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
156  static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
157  "num_block_elements is calculated incorrectly");
158  using block_type = std::vector<Event>;
159 
160  void allocBlock() {
161  blocks.emplace_front();
162  auto & new_block = blocks.front();
163  new_block.reserve(num_block_elements);
164  // Materialize all pages in the new block to release jitter when recording events.
165  const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
166  for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
167  ptr < end_ptr; ptr += 4 * 1024) {
168  (*ptr);
169  }
170  }
171 
172  template<typename... Args>
173  void record(Args&&... args) {
174  if (blocks.empty() || blocks.front().size() == num_block_elements) {
175  allocBlock();
176  }
177  blocks.front().emplace_back(std::forward<Args>(args)...);
178  }
179 
180  std::vector<Event> consolidate() {
181  std::vector<Event> result;
182  for (auto & block : blocks) {
183  result.insert(result.begin(),
184  std::make_move_iterator(block.begin()),
185  std::make_move_iterator(block.end()));
186  }
187  blocks.clear();
188  return result;
189  }
190 
191  std::forward_list<block_type> blocks;
192 };
193 
// Global profiler mode, set via enableProfiler()/disableProfiler().
enum class ProfilerState {
  Disabled,
  CPU, // CPU-only profiling
  CUDA, // CPU + CUDA events
  NVTX, // only emit NVTX markers
};
200 
// Event list used for recording; defined in the .cpp (presumably one per
// thread, given the thread_event_lists result type below — confirm there).
TORCH_API RangeEventList& getEventList();
// Record a point-in-time mark (with a CUDA event too, by default).
TORCH_API void mark(std::string name, bool include_cuda = true);
// Begin / end a named profiling range on the current thread.
TORCH_API void pushRange(std::string name);
TORCH_API void popRange();
205 
// RAII guard for profiling a function call: the destructor pops a range,
// and each constructor (defined in the .cpp) is expected to push the
// matching range for the given function/name.
struct TORCH_API RecordFunction {
  explicit RecordFunction(Function* fn);

  explicit RecordFunction(std::string name);

  explicit RecordFunction(const char* name);

  explicit RecordFunction(const char* name, int64_t current_sequence_nr);

  ~RecordFunction() {
    popRange();
  }
};
219 
// One vector of events per thread.
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure
// that no autograd functions are being executed when these functions are used.
TORCH_API void enableProfiler(ProfilerState new_state);
// Stops profiling and returns the events collected on each thread.
TORCH_API thread_event_lists disableProfiler();
225 
226 
// RAII guard that profiles the enclosed scope and writes the result as a
// chrome://tracing trace file.
// Usage:
// {
//   RecordProfile guard("filename.trace");
//   // code you want to profile
// }
// Then open filename.trace in chrome://tracing
struct TORCH_API RecordProfile {
  // Write the trace to an existing stream (caller keeps ownership).
  RecordProfile(std::ostream& out);
  // Write the trace to a file the guard opens and owns.
  RecordProfile(const std::string& filename);

  ~RecordProfile();
private:
  void init();
  std::unique_ptr<std::ofstream> file_; // owned only in the filename case
  std::ostream& out_;                   // destination for the trace output
  void processEvents(const std::vector<Event*>& events);
};
244 
245 
246 } // namespace profiler
247 }} // namespace torch::autograd
Definition: jit_type.h:17