Caffe2 - C++ API
A deep learning, cross-platform ML framework
profiler.cpp
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/autograd/function.h>

#include <sstream>
#include <fstream>
#include <cstring>  // strcmp is used in RecordProfile::processEvents below

namespace torch { namespace autograd { namespace profiler {

CUDAStubs default_stubs;
constexpr CUDAStubs* default_stubs_addr = &default_stubs;
// constant initialization, so it is guaranteed to be initialized before
// static initialization calls which may invoke registerCUDAMethods
static CUDAStubs* cuda_stubs = default_stubs_addr;

TORCH_API void registerCUDAMethods(CUDAStubs* stubs) {
  cuda_stubs = stubs;
}
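// Illustrative sketch (not part of the original file): a CUDA-enabled build
// would presumably provide a CUDAStubs subclass that forwards to the real CUDA
// event / NVTX APIs and swap it in via a static initializer. The names
// RealCUDAStubs and Registerer below are hypothetical; only
// registerCUDAMethods() itself is defined here.
//
//   struct RealCUDAStubs : CUDAStubs {
//     // override record(), elapsed(), nvtxMarkA(), synchronize(), ...
//   };
//   static RealCUDAStubs real_stubs;
//   static struct Registerer {
//     Registerer() { registerCUDAMethods(&real_stubs); }
//   } registerer;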

ProfilerState state = ProfilerState::Disabled;
uint16_t next_thread_id = 0;
std::mutex all_event_lists_mutex;
std::list<std::shared_ptr<RangeEventList>> all_event_lists;
thread_local std::shared_ptr<RangeEventList> event_list;
thread_local uint16_t thread_id;

RangeEventList& getEventList() {
  if (!event_list) {
    std::lock_guard<std::mutex> guard(all_event_lists_mutex);
    event_list = std::make_shared<RangeEventList>();
    thread_id = next_thread_id++;
    all_event_lists.emplace_front(event_list);
  }
  return *event_list;
}
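// getEventList() lazily creates a per-thread RangeEventList on first use and
// also stashes the shared_ptr in the global all_event_lists, so that
// disableProfiler() can consolidate events from every thread, including
// threads that have already exited (the global reference keeps the list alive).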

void mark(std::string name, bool include_cuda /* = true */) {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    cuda_stubs->nvtxMarkA(name.c_str());
  } else {
    getEventList().record(
        EventKind::Mark,
        std::move(name),
        thread_id,
        include_cuda && state == ProfilerState::CUDA);
  }
}

const char* c_str(const char* str) { return str; }
// NB: non-const to disallow temporaries (lifetime issues)
const char* c_str(std::string& str) { return str.c_str(); }

template<typename T>
void pushRangeImpl(T name, const char* msg = "", int64_t sequence_nr = -1) {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    if (sequence_nr >= 0) {
      std::stringstream s;
      s << name << msg << sequence_nr;
      cuda_stubs->nvtxRangePushA(s.str().c_str());
    } else {
      cuda_stubs->nvtxRangePushA(c_str(name));
    }
  } else {
    getEventList().record(
        EventKind::PushRange,
        std::move(name),
        thread_id,
        state == ProfilerState::CUDA);
  }
}

void pushRange(std::string name) {
  pushRangeImpl(std::move(name));
}

void popRange() {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    cuda_stubs->nvtxRangePop();
  } else {
    getEventList().record(
        EventKind::PopRange,
        "",
        thread_id,
        state == ProfilerState::CUDA);
  }
}
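// Illustrative usage sketch (assumes the profiler has already been enabled):
//
//   pushRange("forward_pass");   // opens a named range on the current thread
//   mark("checkpoint");          // records a single point-in-time event
//   popRange();                  // closes the most recently opened range
//
// In NVTX mode these calls map to nvtxRangePushA/nvtxMarkA/nvtxRangePop;
// otherwise they append PushRange/Mark/PopRange events to this thread's
// RangeEventList.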

RecordFunction::RecordFunction(Function* fn) {
  // typeid(*fn).name() would avoid an additional string allocation.
  // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
  // (Python-side) custom autograd function backward() methods to have the same name,
  // because they route through the same C++ side class.
  // fn->name() ensures that nvtx annotations for custom function backward() methods
  // receive a relevant, demangled name.
  pushRangeImpl(fn->name(), ", stashed seq=", fn->sequence_nr());
}

RecordFunction::RecordFunction(std::string name) {
  pushRangeImpl(std::move(name));
}

RecordFunction::RecordFunction(const char* name) {
  pushRangeImpl<const char*>(name);
}

RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr) {
  pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
}
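// Illustrative usage sketch: RecordFunction is meant to be used as a scoped
// guard. Each constructor pushes a range here; the matching popRange() is
// expected to come from RecordFunction's destructor, which is declared in
// profiler.h rather than defined in this file.
//
//   {
//     RecordFunction guard("my_custom_op");  // "my_custom_op" is an example name
//     // ... work attributed to this range ...
//   }  // range is popped when the guard goes out of scope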

void enableProfiler(ProfilerState new_state) {
  AT_ASSERT(new_state != ProfilerState::Disabled);
  if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
    throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
  if (state != ProfilerState::Disabled && new_state != state) {
    throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
  }
  state = new_state;

  if (state == ProfilerState::CUDA) {
    // event recording appears to have some startup overhead, so we need to
    // generate some dummy events first before recording synchronization events
    for (int i = 0; i < 5; i++) {
      cuda_stubs->onEachDevice([](int d) {
        mark("__cuda_startup");
        cuda_stubs->synchronize();
      });
    }

    // cuda events must be on the same device, so we need a start event recorded
    // for each gpu. we then use this event to synchronize time on the GPU
    // with the CPU clock.
    cuda_stubs->onEachDevice([](int d) {
      mark("__cuda_start_event");
    });
  }
  mark("__start_profile", false);
}

thread_event_lists disableProfiler() {
  if (state == ProfilerState::Disabled) {
    throw std::runtime_error("can't disable profiler when it's not running");
  }
  ProfilerState old_state = state;
  mark("__stop_profile");
  state = ProfilerState::Disabled;
  if (old_state == ProfilerState::NVTX) {
    return thread_event_lists();
  } else {
    thread_event_lists result;
    std::lock_guard<std::mutex> guard(all_event_lists_mutex);
    for (auto it = all_event_lists.begin(); it != all_event_lists.end();) {
      auto& list = *it;
      result.emplace_back(list->consolidate());
      // GC lists that are not held by any threads
      if (list.use_count() == 1) {
        auto current_it = it;
        ++it;
        all_event_lists.erase(current_it);
      } else {
        ++it;
      }
    }
    return result;
  }
}
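// Illustrative usage sketch of a CPU profiling session built from the two
// functions above; the loop body only touches accessors that appear elsewhere
// in this file (kind(), name(), thread_id()).
//
//   enableProfiler(ProfilerState::CPU);
//   // ... run the workload to be profiled ...
//   thread_event_lists lists = disableProfiler();
//   for (auto& thread_events : lists) {
//     for (Event& evt : thread_events) {
//       // evt.kind(), evt.name(), evt.thread_id(), ...
//     }
//   }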

void Event::record(bool record_cuda) {
  if (record_cuda) {
    cuda_stubs->record(&device_, &event, &cpu_ns_);
    return;
  }
  cpu_ns_ = getTime();
}

double Event::cuda_elapsed_us(const Event& e) {
  if (!e.has_cuda() || !has_cuda()) {
    throw std::logic_error("Events were not recorded for CUDA");
  }
  if (e.device() != device()) {
    throw std::logic_error("Events are not on the same device");
  }
  return cuda_stubs->elapsed(event, e.event);
}

CUDAStubs::~CUDAStubs() = default;


static jit::CodeTemplate event_template(R"(
{
  "name": "${name}",
  "ph": "X",
  "ts": ${ts},
  "dur": ${dur},
  "tid": ${tid},
  "pid": "CPU Functions",
  "args": {}
})");
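// Each formatted entry follows the Chrome Trace Event format ("ph": "X" is a
// complete event with a start timestamp and a duration), so the JSON array
// written by RecordProfile::processEvents() below can be loaded into
// chrome://tracing. A filled-in entry would look roughly like this (the name
// and timings are made-up values for illustration):
//
//   {
//     "name": "conv2d",
//     "ph": "X",
//     "ts": 1234,
//     "dur": 56,
//     "tid": 0,
//     "pid": "CPU Functions",
//     "args": {}
//   }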

RecordProfile::RecordProfile(std::ostream& out)
    : out_(out) {
  init();
}

RecordProfile::RecordProfile(const std::string& filename)
    : file_(new std::ofstream(filename)), out_(*file_) {
  init();
}

void RecordProfile::init() {
  enableProfiler(ProfilerState::CPU);
}

RecordProfile::~RecordProfile() {
  thread_event_lists event_lists = disableProfiler();
  std::vector<Event*> events;
  for (auto& l : event_lists) {
    for (auto& e : l) {
      events.push_back(&e);
    }
  }
  processEvents(events);
  if (file_) {
    file_->close();
  }
}
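// Illustrative usage sketch: RecordProfile wraps the enable/disable cycle in an
// RAII guard and serializes the collected events as a JSON trace. The file name
// below is just an example.
//
//   {
//     RecordProfile guard("trace.json");
//     // ... code to profile; CPU profiling is active here ...
//   }  // destructor stops the profiler and writes the trace to trace.json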

void RecordProfile::processEvents(const std::vector<Event*>& events) {
  AT_CHECK(out_, "could not open file");
  Event* start = nullptr;
  for (Event* e : events) {
    if (0 == strcmp(e->name(), "__start_profile")) {
      start = e;
      break;
    }
  }
  AT_CHECK(start, "could not find start?");
  std::vector<Event*> stack;
  out_ << "[\n";
  bool first = true;
  for (Event* e : events) {
    if (e->kind() == "push") {
      stack.push_back(e);
    } else if (e->kind() == "pop") {
      if (!first) {
        out_ << ",\n";
      }
      first = false;
      Event* e_start = stack.back();
      stack.pop_back();
      jit::TemplateEnv env;
      env.s("name", e_start->name());
      env.d("ts", start->cpu_elapsed_us(*e_start));
      env.d("dur", e_start->cpu_elapsed_us(*e));
      env.d("tid", e_start->thread_id());
      out_ << event_template.format(env);
    }
  }
  out_ << "]\n";
}

}}} // namespace torch::autograd::profiler