Caffe2 - C++ API
A deep learning, cross-platform ML framework
profiler.h
1 #pragma once
2 
3 #include <thread>
4 #include <iostream>
5 #include <mutex>
6 #include <memory>
7 #include <vector>
8 #include <cstdint>
9 #include <string>
10 #include <list>
11 #include <sstream>
12 #include <forward_list>
13 #include <tuple>
14 #include <ATen/ATen.h>
15 #include <torch/csrc/WindowsTorchApiMacro.h>
16 #ifndef _WIN32
17 #include <ctime>
18 #endif
19 
20 #include <torch/csrc/jit/code_template.h>
21 
22 typedef struct CUevent_st* CUDAEventStub;
23 
24 namespace torch { namespace autograd {
25 
26 struct Function;
27 
28 namespace profiler {
29 
// Hook interface for the profiler's CUDA-specific operations.
// This default implementation throws on every CUDA call; when CUDA support
// is available, a concrete subclass is installed via registerCUDAMethods()
// so builds without CUDA can still link and run the CPU profiler.
struct TORCH_API CUDAStubs {
  // Record a CUDA event plus the matching CPU timestamp (all out-params).
  virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) {
    fail();
  }
  // Elapsed time between two previously recorded CUDA events, in ms
  // presumably (units defined by the real implementation — confirm in .cpp).
  virtual float elapsed(CUDAEventStub event, CUDAEventStub event2) {
    fail();
    return 0.f;  // unreachable — fail() throws; keeps the compiler happy
  }
  // NVTX marker/range emitters (used by ProfilerState::NVTX).
  virtual void nvtxMarkA(const char* name) {
    fail();
  }
  virtual void nvtxRangePushA(const char* name) {
    fail();
  }
  virtual void nvtxRangePop() {
    fail();
  }
  // False for this stub; the real implementation reports CUDA availability.
  virtual bool enabled() {
    return false;
  }
  // Run `op` once per CUDA device.
  virtual void onEachDevice(std::function<void(int)> op) {
    fail();
  }
  virtual void synchronize() {
    fail();
  }
  virtual ~CUDAStubs();

private:
  // Every CUDA entry point of the stub funnels here: error out loudly
  // instead of silently producing bogus profiling data.
  void fail() {
    AT_ERROR("CUDA used in profiler but not enabled.");
  }
};
63 
// Install the CUDAStubs implementation the profiler should use,
// replacing the default (throwing) stub above.
TORCH_API void registerCUDAMethods(CUDAStubs* stubs);
65 
// Round `a` up to the nearest multiple of `b` (b must be non-zero).
constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
  // Number of b-sized chunks needed to cover `a`, rounded up.
  const size_t chunks = (a + b - 1) / b;
  return chunks * b;
}
69 
70 #if defined(__MACH__) && !defined(CLOCK_REALTIME)
71 #include <sys/time.h>
72 // clock_gettime is not implemented on older versions of OS X (< 10.12).
73 // If implemented, CLOCK_REALTIME will have already been defined.
74 #endif
75 
// Current time in nanoseconds from a platform-appropriate clock.
// Only differences between two getTime() values are meaningful: the three
// branches use different epochs (and the macOS fallback is a wall clock,
// so it can jump if the system time is adjusted).
inline int64_t getTime() {
#ifdef _WIN32
  // Prefer high_resolution_clock, but only if it is steady (monotonic);
  // otherwise fall back to steady_clock.
  using namespace std::chrono;
  using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
  return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
#elif defined(__MACH__) && !defined(CLOCK_REALTIME)
  // Older macOS (< 10.12) lacks clock_gettime; use microsecond-resolution
  // gettimeofday and scale usec -> ns.
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<int64_t>(now.tv_sec) * 1000000000 + static_cast<int64_t>(now.tv_usec) * 1000;
#else
  // clock_gettime is *much* faster than std::chrono implementation on Linux
  struct timespec t{};
  clock_gettime(CLOCK_MONOTONIC, &t);
  return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
#endif
}
92 
// What a profiler Event represents. uint16_t keeps Event compact.
enum class EventKind : uint16_t {
  Mark,       // a single point-in-time marker
  PushRange,  // start of a named range
  PopRange    // end of the most recently pushed range
};
98 
99 struct TORCH_API Event final {
100  Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
101  : owned_name_(new std::string(std::move(name)))
102  , name_ptr_(owned_name_->c_str())
103  , kind_(kind)
104  , thread_id_(thread_id) { record(record_cuda); }
105  Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
106  : name_ptr_(name)
107  , kind_(kind)
108  , thread_id_(thread_id) { record(record_cuda); }
109 
110  void record(bool record_cuda);
111  std::string kind() const {
112  switch(kind_) {
113  case EventKind::Mark: return "mark";
114  case EventKind::PushRange: return "push";
115  case EventKind::PopRange: return "pop";
116  }
117  throw std::runtime_error("unknown EventKind");
118  }
119  const char* name() const {
120  return name_ptr_;
121  }
122  uint16_t thread_id() const {
123  return thread_id_;
124  }
125  double cpu_elapsed_us(const Event & e) {
126  return (e.cpu_ns_ - cpu_ns_)/(1000.0);
127  }
128  double cuda_elapsed_us(const Event & e);
129  bool has_cuda() const {
130  return event != nullptr;
131  }
132  int device() const {
133  return device_;
134  }
135 private:
136  int64_t cpu_ns_ = 0; // signed to allow for negative intervals, initialized for safety.
137  // std::string is a very large object (usually around 32B),
138  // and this field is used only for user-created ranges, so
139  // it's better to save on size of Events.
140  std::unique_ptr<std::string> owned_name_;
141  const char * name_ptr_;
142  EventKind kind_;
143  uint16_t thread_id_;
144  int device_ = -1;
145  struct CUevent_st* event = nullptr;
146 };
147 
148 // a linked-list of fixed sized vectors, to avoid
149 // a std::vector resize from taking a large amount of time inside
150 // a profiling event
152  constexpr static size_t MB = 1024 * 1024;
153  constexpr static size_t event_block_size = 16 * MB;
154  constexpr static size_t num_block_elements =
155  event_block_size / ceilToMultiple(sizeof(Event), alignof(Event));
156  static_assert(sizeof(Event[num_block_elements]) <= event_block_size,
157  "num_block_elements is calculated incorrectly");
158  using block_type = std::vector<Event>;
159 
160  void allocBlock() {
161  blocks.emplace_front();
162  auto & new_block = blocks.front();
163  new_block.reserve(num_block_elements);
164  // Materialize all pages in the new block to release jitter when recording events.
165  const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
166  for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
167  ptr < end_ptr; ptr += 4 * 1024) {
168  (*ptr);
169  }
170  }
171 
172  template<typename... Args>
173  void record(Args&&... args) {
174  if (blocks.empty() || blocks.front().size() == num_block_elements) {
175  allocBlock();
176  }
177  blocks.front().emplace_back(std::forward<Args>(args)...);
178  }
179 
180  std::vector<Event> consolidate() {
181  std::vector<Event> result;
182  for (auto & block : blocks) {
183  result.insert(result.begin(),
184  std::make_move_iterator(block.begin()),
185  std::make_move_iterator(block.end()));
186  }
187  blocks.clear();
188  return result;
189  }
190 
191  std::forward_list<block_type> blocks;
192 };
193 
// Global profiler mode, set via enableProfiler()/disableProfiler().
enum class ProfilerState {
  Disabled,
  CPU, // CPU-only profiling
  CUDA, // CPU + CUDA events
  NVTX, // only emit NVTX markers
};
200 
// Event list used for recording; defined in the .cpp (presumably one per
// thread, given the thread_event_lists result type below — confirm there).
TORCH_API RangeEventList& getEventList();
// Record a point-in-time mark (with a CUDA event too, by default).
TORCH_API void mark(std::string name, bool include_cuda = true);
// Begin / end a named profiling range on the current thread.
TORCH_API void pushRange(std::string name);
TORCH_API void popRange();
205 
// RAII guard for profiling a function call: the destructor pops a range,
// and each constructor (defined in the .cpp) is expected to push the
// matching range for the given function/name.
struct TORCH_API RecordFunction {
  explicit RecordFunction(Function* fn);

  explicit RecordFunction(std::string name);

  explicit RecordFunction(const char* name);

  explicit RecordFunction(const char* name, int64_t current_sequence_nr);

  ~RecordFunction() {
    popRange();
  }
};
219 
// One vector of events per thread.
using thread_event_lists = std::vector<std::vector<Event>>;
// NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure
// that no autograd functions are being executed when these functions are used.
TORCH_API void enableProfiler(ProfilerState new_state);
// Stops profiling and returns the events collected on each thread.
TORCH_API thread_event_lists disableProfiler();
225 
226 
// RAII guard that profiles the enclosed scope and writes the result as a
// chrome://tracing trace file.
// Usage:
// {
//   RecordProfile guard("filename.trace");
//   // code you want to profile
// }
// Then open filename.trace in chrome://tracing
struct TORCH_API RecordProfile {
  // Write the trace to an existing stream (caller keeps ownership).
  RecordProfile(std::ostream& out);
  // Write the trace to a file the guard opens and owns.
  RecordProfile(const std::string& filename);

  ~RecordProfile();
private:
  void init();
  std::unique_ptr<std::ofstream> file_; // owned only in the filename case
  std::ostream& out_;                   // destination for the trace output
  void processEvents(const std::vector<Event*>& events);
};
244 
245 
246 } // namespace profiler
247 }} // namespace torch::autograd
Definition: jit_type.h:17