Caffe2 - C++ API
A deep learning, cross-platform ML framework
profiler.cpp
#include <torch/csrc/autograd/profiler.h>
#include <torch/csrc/autograd/function.h>

#include <sstream>
#include <fstream>
#include <cstring>  // strcmp is used in RecordProfile::processEvents below

namespace torch { namespace autograd { namespace profiler {

CUDAStubs default_stubs;
constexpr CUDAStubs* default_stubs_addr = &default_stubs;
// constant initialization, so it is guaranteed to be initialized before
// static initialization calls which may invoke registerCUDAMethods
static CUDAStubs* cuda_stubs = default_stubs_addr;

TORCH_API void registerCUDAMethods(CUDAStubs* stubs) {
  cuda_stubs = stubs;
}
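// Illustrative sketch (not part of the original file): a CUDA-enabled build
// would presumably provide a CUDAStubs subclass that forwards to the real CUDA
// event / NVTX APIs and swap it in via a static initializer. The names
// RealCUDAStubs and Registerer below are hypothetical; only
// registerCUDAMethods() itself is defined here.
//
//   struct RealCUDAStubs : CUDAStubs {
//     // override record(), elapsed(), nvtxMarkA(), synchronize(), ...
//   };
//   static RealCUDAStubs real_stubs;
//   static struct Registerer {
//     Registerer() { registerCUDAMethods(&real_stubs); }
//   } registerer;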

ProfilerState state = ProfilerState::Disabled;
uint16_t next_thread_id = 0;
std::mutex all_event_lists_mutex;
std::list<std::shared_ptr<RangeEventList>> all_event_lists;
thread_local std::shared_ptr<RangeEventList> event_list;
thread_local uint16_t thread_id;

RangeEventList& getEventList() {
  if (!event_list) {
    std::lock_guard<std::mutex> guard(all_event_lists_mutex);
    event_list = std::make_shared<RangeEventList>();
    thread_id = next_thread_id++;
    all_event_lists.emplace_front(event_list);
  }
  return *event_list;
}
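// getEventList() lazily creates a per-thread RangeEventList on first use and
// also stashes the shared_ptr in the global all_event_lists, so that
// disableProfiler() can consolidate events from every thread, including
// threads that have already exited (the global reference keeps the list alive).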

void mark(std::string name, bool include_cuda /* = true */) {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    cuda_stubs->nvtxMarkA(name.c_str());
  } else {
    getEventList().record(
        EventKind::Mark,
        std::move(name),
        thread_id,
        include_cuda && state == ProfilerState::CUDA);
  }
}

const char* c_str(const char* str) { return str; }
// NB: non-const to disallow temporaries (lifetime issues)
const char* c_str(std::string& str) { return str.c_str(); }

template<typename T>
void pushRangeImpl(T name, const char* msg = "", int64_t sequence_nr = -1) {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    if (sequence_nr >= 0) {
      std::stringstream s;
      s << name << msg << sequence_nr;
      cuda_stubs->nvtxRangePushA(s.str().c_str());
    } else {
      cuda_stubs->nvtxRangePushA(c_str(name));
    }
  } else {
    getEventList().record(
        EventKind::PushRange,
        std::move(name),
        thread_id,
        state == ProfilerState::CUDA);
  }
}

void pushRange(std::string name) {
  pushRangeImpl(std::move(name));
}

void popRange() {
  if (state == ProfilerState::Disabled) {
    return;
  }
  if (state == ProfilerState::NVTX) {
    cuda_stubs->nvtxRangePop();
  } else {
    getEventList().record(
        EventKind::PopRange,
        "",
        thread_id,
        state == ProfilerState::CUDA);
  }
}
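// Illustrative usage sketch (assumes the profiler has already been enabled):
//
//   pushRange("forward_pass");   // opens a named range on the current thread
//   mark("checkpoint");          // records a single point-in-time event
//   popRange();                  // closes the most recently opened range
//
// In NVTX mode these calls map to nvtxRangePushA/nvtxMarkA/nvtxRangePop;
// otherwise they append PushRange/Mark/PopRange events to this thread's
// RangeEventList.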

RecordFunction::RecordFunction(Function* fn) {
  // typeid(*fn).name() would avoid an additional string allocation.
  // However, typeid(*fn).name() would cause nvtx annotations for all user-defined
  // (Python-side) custom autograd function backward() methods to have the same name,
  // because they route through the same C++ side class.
  // fn->name() ensures that nvtx annotations for custom function backward() methods
  // receive a relevant, demangled name.
  pushRangeImpl(fn->name(), ", stashed seq=", fn->sequence_nr());
}

RecordFunction::RecordFunction(std::string name) {
  pushRangeImpl(std::move(name));
}

RecordFunction::RecordFunction(const char* name) {
  pushRangeImpl<const char*>(name);
}

RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr) {
  pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
}
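// Illustrative usage sketch: RecordFunction is meant to be used as a scoped
// guard. Each constructor pushes a range here; the matching popRange() is
// expected to come from RecordFunction's destructor, which is declared in
// profiler.h rather than defined in this file.
//
//   {
//     RecordFunction guard("my_custom_op");  // "my_custom_op" is an example name
//     // ... work attributed to this range ...
//   }  // range is popped when the guard goes out of scope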

void enableProfiler(ProfilerState new_state) {
  AT_ASSERT(new_state != ProfilerState::Disabled);
  if (new_state == ProfilerState::NVTX && !cuda_stubs->enabled())
    throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
  if (state != ProfilerState::Disabled && new_state != state) {
    throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
  }
  state = new_state;

  if (state == ProfilerState::CUDA) {
    // event recording appears to have some startup overhead, so we need to
    // generate some dummy events first before recording synchronization events
    for (int i = 0; i < 5; i++) {
      cuda_stubs->onEachDevice([](int d) {
        mark("__cuda_startup");
        cuda_stubs->synchronize();
      });
    }

    // cuda events must be on the same device, so we need a start event recorded
    // for each gpu. we then use this event to synchronize time on the GPU
    // with the CPU clock.
    cuda_stubs->onEachDevice([](int d) {
      mark("__cuda_start_event");
    });
  }
  mark("__start_profile", false);
}

thread_event_lists disableProfiler() {
  if (state == ProfilerState::Disabled) {
    throw std::runtime_error("can't disable profiler when it's not running");
  }
  ProfilerState old_state = state;
  mark("__stop_profile");
  state = ProfilerState::Disabled;
  if (old_state == ProfilerState::NVTX) {
    return thread_event_lists();
  } else {
    thread_event_lists result;
    std::lock_guard<std::mutex> guard(all_event_lists_mutex);
    for (auto it = all_event_lists.begin(); it != all_event_lists.end();) {
      auto& list = *it;
      result.emplace_back(list->consolidate());
      // GC lists that are not held by any threads
      if (list.use_count() == 1) {
        auto current_it = it;
        ++it;
        all_event_lists.erase(current_it);
      } else {
        ++it;
      }
    }
    return result;
  }
}
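// Illustrative usage sketch of a CPU profiling session built from the two
// functions above; the loop body only touches accessors that appear elsewhere
// in this file (kind(), name(), thread_id()).
//
//   enableProfiler(ProfilerState::CPU);
//   // ... run the workload to be profiled ...
//   thread_event_lists lists = disableProfiler();
//   for (auto& thread_events : lists) {
//     for (Event& evt : thread_events) {
//       // evt.kind(), evt.name(), evt.thread_id(), ...
//     }
//   }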

void Event::record(bool record_cuda) {
  if (record_cuda) {
    cuda_stubs->record(&device_, &event, &cpu_ns_);
    return;
  }
  cpu_ns_ = getTime();
}

double Event::cuda_elapsed_us(const Event& e) {
  if (!e.has_cuda() || !has_cuda()) {
    throw std::logic_error("Events were not recorded for CUDA");
  }
  if (e.device() != device()) {
    throw std::logic_error("Events are not on the same device");
  }
  return cuda_stubs->elapsed(event, e.event);
}

CUDAStubs::~CUDAStubs() = default;


static jit::CodeTemplate event_template(R"(
{
  "name": "${name}",
  "ph": "X",
  "ts": ${ts},
  "dur": ${dur},
  "tid": ${tid},
  "pid": "CPU Functions",
  "args": {}
})");
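// Each formatted entry follows the Chrome Trace Event format ("ph": "X" is a
// complete event with a start timestamp and a duration), so the JSON array
// written by RecordProfile::processEvents() below can be loaded into
// chrome://tracing. A filled-in entry would look roughly like this (the name
// and timings are made-up values for illustration):
//
//   {
//     "name": "conv2d",
//     "ph": "X",
//     "ts": 1234,
//     "dur": 56,
//     "tid": 0,
//     "pid": "CPU Functions",
//     "args": {}
//   }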

RecordProfile::RecordProfile(std::ostream& out)
    : out_(out) {
  init();
}

RecordProfile::RecordProfile(const std::string& filename)
    : file_(new std::ofstream(filename)), out_(*file_) {
  init();
}

void RecordProfile::init() {
  enableProfiler(ProfilerState::CPU);
}

RecordProfile::~RecordProfile() {
  thread_event_lists event_lists = disableProfiler();
  std::vector<Event*> events;
  for (auto& l : event_lists) {
    for (auto& e : l) {
      events.push_back(&e);
    }
  }
  processEvents(events);
  if (file_) {
    file_->close();
  }
}
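// Illustrative usage sketch: RecordProfile wraps the enable/disable cycle in an
// RAII guard and serializes the collected events as a JSON trace. The file name
// below is just an example.
//
//   {
//     RecordProfile guard("trace.json");
//     // ... code to profile; CPU profiling is active here ...
//   }  // destructor stops the profiler and writes the trace to trace.json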

void RecordProfile::processEvents(const std::vector<Event*>& events) {
  AT_CHECK(out_, "could not open file");
  Event* start = nullptr;
  for (Event* e : events) {
    if (0 == strcmp(e->name(), "__start_profile")) {
      start = e;
      break;
    }
  }
  AT_CHECK(start, "could not find start?");
  std::vector<Event*> stack;
  out_ << "[\n";
  bool first = true;
  for (Event* e : events) {
    if (e->kind() == "push") {
      stack.push_back(e);
    } else if (e->kind() == "pop") {
      if (!first) {
        out_ << ",\n";
      }
      first = false;
      Event* e_start = stack.back();
      stack.pop_back();
      jit::TemplateEnv env;
      env.s("name", e_start->name());
      env.d("ts", start->cpu_elapsed_us(*e_start));
      env.d("dur", e_start->cpu_elapsed_us(*e));
      env.d("tid", e_start->thread_id());
      out_ << event_template.format(env);
    }
  }
  out_ << "]\n";
}

}}} // namespace torch::autograd::profiler