Caffe2 - Python API
A deep learning, cross-platform ML framework
profiler.py
1 import subprocess
2 import re
3 import os
4 import sys
5 import itertools
6 from collections import defaultdict
7 
8 import torch
9 from torch._six import FileNotFoundError
10 
11 
12 class range(object):
13  def __init__(self, name):
14  self.name = name
15 
16  def __enter__(self):
17  torch.autograd._push_range(self.name)
18 
19  def __exit__(self, *args):
20  torch.autograd._pop_range()
21  return False
22 
23 
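# Editorial usage sketch (not part of profiler.py): the `range` helper above pushes and
# pops a named range on the autograd profiler, which can be used to label a region of
# code while a profiling run is active. The label 'my_block' is a hypothetical example.
#
#     >>> with torch.autograd.profiler.profile() as prof:
#     ...     with torch.autograd.profiler.range('my_block'):
#     ...         y = x * 2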
24 class EventList(list):
25  """A list of Events (for pretty printing)"""
26  def __init__(self, *args, **kwargs):
27  super(EventList, self).__init__(*args, **kwargs)
28 
29  def __str__(self):
30  return self.table()
31 
32  def table(self, sort_by=None):
33  """Prints an EventList as a nicely formatted table.
34 
35  Arguments:
36  sort_by (str, optional): Attribute used to sort entries. By default
37  they are printed in the same order as they were registered.
38  Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
39  ``cuda_time_total``, ``count``.
40 
41  Returns:
42  A string containing the table.
43  """
44  return build_table(self, sort_by)
45 
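# Editorial usage sketch (not part of profiler.py): with a finished profile `prof`
# (see the `profile` context manager below), any of the keys listed in the docstring
# above is a valid sort attribute, e.g.:
#
#     >>> print(prof.table(sort_by='cpu_time_total'))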
46  def export_chrome_trace(self, path):
47  """Exports an EventList as a Chrome tracing tools file.
48 
 49  The checkpoint can later be loaded and inspected under the ``chrome://tracing`` URL.
50 
51  Arguments:
52  path (str): Path where the trace will be written.
53  """
54  import json
55  with open(path, 'w') as f:
56  chrome_events = []
57  next_id = 0
58  for evt in self:
59  chrome_events.append(dict(
60  name=evt.name,
61  ph='X',
62  ts=evt.cpu_interval.start,
63  dur=evt.cpu_interval.elapsed_us(),
64  tid=evt.thread,
65  pid='CPU functions',
66  args={},
67  ))
68  for k in evt.kernels:
69  # 's' and 'f' draw Flow arrows from
70  # the CPU launch to the GPU kernel
71  chrome_events.append(dict(
72  name=evt.name,
73  ph='s',
74  ts=evt.cpu_interval.start,
75  tid=evt.thread,
76  pid='CPU functions',
77  id=next_id,
78  cat='cpu_to_cuda',
79  args={},
80  ))
81  chrome_events.append(dict(
82  name=k.name,
83  ph='f',
84  ts=k.interval.start,
85  tid=k.device,
86  pid='CUDA functions',
87  id=next_id,
88  cat='cpu_to_cuda',
89  args={},
90  ))
91  chrome_events.append(dict(
92  name=k.name,
93  ph='X',
94  ts=k.interval.start,
95  dur=k.interval.elapsed_us(),
96  tid=k.device,
97  pid='CUDA functions',
98  args={},
99  ))
100  next_id += 1
101 
102  json.dump(chrome_events, f)
103 
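# Editorial usage sketch (not part of profiler.py): the exported JSON can be opened from
# the chrome://tracing page via its "Load" button; the file name below is hypothetical.
#
#     >>> prof.export_chrome_trace('profile_trace.json')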
104  def key_averages(self):
105  """Averages all function events over their keys.
106 
107  Returns:
108  An EventList containing FunctionEventAvg objects.
109  """
110  stats = defaultdict(FunctionEventAvg)
111  for evt in self:
112  stats[evt.key] += evt
113  return EventList(stats.values())
114 
115  def total_average(self):
116  """Averages all events.
117 
118  Returns:
119  A FunctionEventAvg object.
120  """
121  total_stat = FunctionEventAvg()
122  for evt in self:
123  total_stat += evt
 124  total_stat.key = None  # cleared each iteration so __iadd__ accepts events with different keys
125  total_stat.key = 'Total'
126  return total_stat
127 
128 
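# Editorial usage sketch (not part of profiler.py): aggregating a finished profile.
# key_averages() yields one FunctionEventAvg per distinct op name, while total_average()
# returns a single aggregate across all events.
#
#     >>> print(prof.key_averages().table(sort_by='cpu_time_total'))
#     >>> print(prof.total_average())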
129 class profile(object):
130  """Context manager that manages autograd profiler state and holds a summary of results.
131 
132  Arguments:
133  enabled (bool, optional): Setting this to False makes this context manager a no-op.
134  Default: ``True``.
135 
136  use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API.
137  Adds approximately 4us of overhead to each tensor operation.
138  Default: ``False``
139 
 140  .. warning::
 141  This context manager should not be called recursively, i.e. at most one
142  instance should be enabled at any given time.
143 
144  Example:
145  >>> x = torch.randn((1, 1), requires_grad=True)
146  >>> with torch.autograd.profiler.profile() as prof:
147  ... y = x ** 2
148  ... y.backward()
149  >>> # NOTE: some columns were removed for brevity
150  ... print(prof)
 151  ------------------------------------- --------------- ---------------
 152  Name                                         CPU time       CUDA time
 153  ------------------------------------- --------------- ---------------
 154  PowConstant                                 142.036us         0.000us
 155  N5torch8autograd9GraphRootE                  63.524us         0.000us
 156  PowConstantBackward                         184.228us         0.000us
 157  MulConstant                                  50.288us         0.000us
 158  PowConstant                                  28.439us         0.000us
 159  Mul                                          20.154us         0.000us
 160  N5torch8autograd14AccumulateGradE            13.790us         0.000us
 161  N5torch8autograd5CloneE                       4.088us         0.000us
162  """
163 
164  def __init__(self, enabled=True, use_cuda=False):
165  self.enabled = enabled
166  self.use_cuda = use_cuda
167  self.function_events = None
168  if not self.enabled:
169  return
170  self.entered = False
171 
172  def __enter__(self):
173  if not self.enabled:
174  return
175  if self.entered:
176  raise RuntimeError("autograd profiler traces are not reentrant")
177  self.entered = True
178  profiler_kind = torch.autograd.ProfilerState.CUDA if self.use_cuda \
179  else torch.autograd.ProfilerState.CPU
180  torch.autograd._enable_profiler(profiler_kind)
181  return self
182 
183  def __exit__(self, exc_type, exc_val, exc_tb):
184  if not self.enabled:
185  return
186  records = torch.autograd._disable_profiler()
187  self.function_events = EventList(parse_cpu_trace(records))
188  return False
189 
190  def __repr__(self):
191  if self.function_events is None:
192  return '<unfinished torch.autograd.profile>'
193  return repr(self.function_events)
194 
195  def __str__(self):
196  if self.function_events is None:
197  return '<unfinished torch.autograd.profile>'
198  return str(self.function_events)
199 
200  def _check_finish(self):
201  if self.function_events is None:
202  raise RuntimeError("can't export a trace that didn't finish running")
203 
204  def table(self, sort_by=None):
205  self._check_finish()
206  return self.function_events.table(sort_by)
207  table.__doc__ = EventList.table.__doc__
208 
209  def export_chrome_trace(self, path):
210  self._check_finish()
211  return self.function_events.export_chrome_trace(path)
212  export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__
213 
214  def key_averages(self):
215  self._check_finish()
216  return self.function_events.key_averages()
217  key_averages.__doc__ = EventList.key_averages.__doc__
218 
219  def total_average(self):
220  self._check_finish()
221  return self.function_events.total_average()
222  total_average.__doc__ = EventList.total_average.__doc__
223 
224 
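# Editorial usage sketch (not part of profiler.py): enabling CUDA kernel timing as well.
# `model` and `x` are placeholders, and a CUDA-capable build and device are assumed.
#
#     >>> with torch.autograd.profiler.profile(use_cuda=True) as prof:
#     ...     y = model(x)
#     >>> print(prof.table(sort_by='cuda_time_total'))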
225 class emit_nvtx(object):
226  """Context manager that makes every autograd operation emit an NVTX range.
227 
228  It is useful when running the program under nvprof::
229 
230  nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
231 
232  Unfortunately, there's no way to force nvprof to flush the data it collected
233  to disk, so for CUDA profiling one has to use this context manager to annotate
234  nvprof traces and wait for the process to exit before inspecting them.
235  Then, either NVIDIA Visual Profiler (nvvp) can be used to visualize the timeline, or
236  :func:`torch.autograd.profiler.load_nvprof` can load the results for inspection
237  e.g. in Python REPL.
238 
 239  .. warning::
240  This context manager should not be called recursively, i.e. at most one
241  instance should be enabled at any given time.
242 
243  Arguments:
244  enabled (bool, optional): Setting this to False makes this context manager a no-op.
245  Default: ``True``.
246 
247  Example:
248  >>> with torch.cuda.profiler.profile():
249  ... model(x) # Warmup CUDA memory allocator and profiler
250  ... with torch.autograd.profiler.emit_nvtx():
251  ... model(x)
252 
253  **Forward-backward correlation**
254 
255  When viewing a profile created using :class:`emit_nvtx` in the Nvidia Visual Profiler,
256  correlating each backward-pass op with the corresponding forward-pass op can be difficult.
257  To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
258  generates.
259 
260  During the forward pass, each function range is decorated with ``seq=<N>``. ``seq`` is a running
261  counter, incremented each time a new backward Function object is created and stashed for backward.
262  Thus, the `seq=<N>` annotation associated with each forward function range tells you that
263  if a backward Function object is created by this forward function,
264  the backward object will receive sequence number N.
265  During the backward pass, the top-level range wrapping each C++ backward Function's
266  ``apply()`` call is decorated with ``stashed seq=<M>``. ``M`` is the sequence number that
267  the backward object was created with. By comparing ``stashed seq`` numbers in backward with ``seq``
268  numbers in forward, you can track down which forward op created each backward Function.
269 
270  Any functions executed during the backward pass are also decorated with ``seq=<N>``. During
271  default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
272  ``N`` may simply be 0 for all such functions. Only the top-level ranges associated with
273  backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
274  objects with the earlier forward pass.
275 
276  **Double-backward**
277 
278  If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
279  if you are setting up for a double-backward), each function's execution during backward
280  is given a nonzero, useful ``seq=<N>``. Those functions may themselves create Function objects
281  to be executed later during double-backward, just as the original functions in the forward pass did.
282  The relationship between backward and double-backward is conceptually the same as the relationship
283  between forward and backward: The functions still emit current-sequence-number-tagged ranges,
284  the Function objects they create still stash those sequence numbers, and during the eventual
285  double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
286  numbers, which can be compared to `seq` numbers from the backward pass.
287 
 288  .. warning::
289  The sequence number is thread-local, and some forward functions don't create an associated
290  backward Function object (instead delegating that to sub-functions further down the call chain).
291  For these reasons, the correspondence of stashed sequence numbers in
292  backward Function ``apply()`` ranges with `seq` numbers in forward-pass ranges is
293  not guaranteed to be 1 to 1. The sequence numbers alone may not be enough to fully
294  disambiguate which forward function created which
295  backward Function object. You may need to make a judgment based on analytic knowledge of what
296  the expected correspondence should be.
297  """
298  def __init__(self, enabled=True):
299  self.enabled = enabled
300  self.entered = False
301 
302  def __enter__(self):
303  if not self.enabled:
304  return
305  if self.entered:
306  raise RuntimeError("NVTX annotation context manager is not reentrant")
 307  self.entered = True
 308  torch.cuda.synchronize()
 309  torch.autograd._enable_profiler(torch.autograd.ProfilerState.NVTX)
310  return self
311 
312  def __exit__(self, exc_type, exc_val, exc_tb):
313  if not self.enabled:
 314  return
 315  torch.cuda.synchronize()
 316  torch.autograd._disable_profiler()
317  return False
318 
319 
320 def load_nvprof(path):
321  """Opens an nvprof trace file and parses autograd annotations.
322 
323  Arguments:
324  path (str): path to nvprof trace
325  """
326  return EventList(parse_nvprof_trace(path))
327 
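# Editorial usage sketch (not part of profiler.py): loading a trace produced by running
# under nvprof as described for emit_nvtx above; 'trace_name.prof' matches the example
# nvprof command in that docstring.
#
#     >>> events = torch.autograd.profiler.load_nvprof('trace_name.prof')
#     >>> print(events.table(sort_by='cuda_time_total'))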
328 
329 ################################################################################
330 # FunctionEvent
331 
332 def format_time(time_us):
333  """Defines how to format time in FunctionEvent"""
334  return '{:.3f}us'.format(time_us)
335 
336 
337 def attr_formatter(name):
338  return property(lambda self: format_time(getattr(self, name)))
339 
340 
341 class FormattedTimesMixin(object):
342  """Helpers for FunctionEvent and FunctionEventAvg.
343 
344  The subclass should define `*_time_total` and `count` attributes.
345  """
346  cpu_time_str = attr_formatter('cpu_time')
347  cuda_time_str = attr_formatter('cuda_time')
348  cpu_time_total_str = attr_formatter('cpu_time_total')
349  cuda_time_total_str = attr_formatter('cuda_time_total')
350 
351  @property
352  def cpu_time(self):
353  return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count
354 
355  @property
356  def cuda_time(self):
357  return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count
358 
359 
360 class Interval(object):
361  def __init__(self, start, end):
362  self.start = start
363  self.end = end
364 
365  def elapsed_us(self):
366  return self.end - self.start
367 
368 
369 class Kernel(object):
370  def __init__(self, name, device, interval):
371  self.name = name
372  self.device = device
373  self.interval = interval
374 
375 
 376 # TODO: record TID too
 377 class FunctionEvent(FormattedTimesMixin):
 378  """Profiling information about a single function."""
379  def __init__(self, id, name, thread, cpu_start, cpu_end):
380  self.id = id
381  self.name = name
382  self.cpu_interval = Interval(cpu_start, cpu_end)
383  self.thread = thread
384  self.kernels = []
385  self.count = 1
386 
387  def append_kernel(self, name, device, start, end):
388  self.kernels.append(Kernel(name, device, Interval(start, end)))
389 
390  @property
391  def cuda_time_total(self):
392  return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels)
393 
394  @property
395  def cpu_time_total(self):
396  return self.cpu_interval.elapsed_us()
397 
398  @property
399  def key(self):
400  return self.name
401 
402  def __repr__(self):
403  return '<FunctionEvent id={} cpu_time={} cuda_time={} name={} thread={}>'.format(
404  self.id, self.cpu_time_str, self.cuda_time_str, self.name, self.thread)
405 
406 
408  """Used to average stats over multiple FunctionEvent objects."""
409  def __init__(self):
410  self.key = None
411  self.count = self.cpu_time_total = self.cuda_time_total = 0
412 
413  def __iadd__(self, other):
414  if self.key is None:
415  self.key = other.key
416  assert isinstance(other, FunctionEvent)
417  assert other.key == self.key
418  self.cpu_time_total += other.cpu_time
419  self.cuda_time_total += other.cuda_time
420  self.count += 1
421  return self
422 
423  def __repr__(self):
424  return '<FunctionEventAvg cpu_time={} cuda_time={} key={}>'.format(
425  self.cpu_time_str, self.cuda_time_str, self.key)
426 
427 
428 ################################################################################
429 # Utilities
430 
431 class StringTable(defaultdict):
432  def __missing__(self, key):
433  self[key] = torch._C._demangle(key)
434  return self[key]
435 
436 
437 ################################################################################
438 # CPU checkpoints
439 
440 def parse_cpu_trace(thread_records):
441  next_id = 0
442  start_record = None
443  cuda_records = {}
444  functions = []
445  record_stack = []
446  string_table = StringTable()
447 
448  # cuda start events and the overall profiler start event don't happen
449  # at exactly the same time because we need to record an event on each device
450  # and each record takes ~4us. So we adjust here by the difference
451  # adding the difference in CPU time between the profiler start event
452  # and the CPU time of the cuda start event for the device
453  def adjusted_time(cuda_record):
454  assert cuda_record.device() != -1
455  cuda_time_0 = cuda_records[cuda_record.device()]
456  return cuda_time_0.cuda_elapsed_us(cuda_record) + start_record.cpu_elapsed_us(cuda_time_0)
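 # Illustration (editorial note, hypothetical numbers): if the profiler start record is at
 # CPU t=0us, the device's __cuda_start_event was recorded ~4us later on the CPU clock, and
 # a kernel record falls 100us after that start event on the GPU clock, then adjusted_time
 # returns 100 + 4 = 104us on the common (CPU) timeline.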
457 
 458  # '__start_profile' is not guaranteed to be first, so we must find it here
459  for record in itertools.chain(*thread_records):
460  if record.name() == '__start_profile':
461  start_record = record
462  elif record.name() == '__cuda_start_event':
463  assert record.device() != -1
464  cuda_records[record.device()] = record
465  assert start_record is not None
466 
467  for record in itertools.chain(*thread_records):
468  if record.kind() == 'mark':
469  continue
470  elif record.kind() == 'push':
471  record_stack.append((next_id, record))
472  next_id += 1
473  elif record.kind() == 'pop':
474  function_id, start = record_stack.pop()
475  fe = FunctionEvent(
476  id=function_id,
477  name=string_table[start.name()],
478  thread=start.thread_id(),
479  cpu_start=start_record.cpu_elapsed_us(start),
480  cpu_end=start_record.cpu_elapsed_us(record))
481  if start.has_cuda():
482  cuda_start = adjusted_time(start)
483  cuda_end = adjusted_time(record)
484  fe.append_kernel(start.name(),
485  start.device(),
486  cuda_start,
487  cuda_end)
488  functions.append(fe)
489 
490  functions.sort(key=lambda evt: evt.cpu_interval.start)
491  return functions
492 
493 
494 ################################################################################
495 # CUDA checkpoints
496 
497 class EnforceUnique(object):
498  """Raises an error if a key is seen more than once."""
499  def __init__(self):
500  self.seen = set()
501 
502  def see(self, *key):
503  if key in self.seen:
504  raise RuntimeError('duplicate key: ' + str(key))
505  self.seen.add(key)
506 
507 
508 def parse_nvprof_trace(path):
509  import sqlite3
510  conn = sqlite3.connect(path)
511  conn.row_factory = sqlite3.Row
512 
513  # Parse strings table
514  strings = {}
515  for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
516  strings[r["id"]] = torch._C._demangle(r["value"])
517 
518  # First, find all functions and create FunctionEvents for them
519  marker_query = """
520  SELECT
521  start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time
522  FROM
523  CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
524  ON start.id = end.id
525  WHERE
526  start.name != 0 AND end.name = 0
527  """
528  functions = []
529  functions_map = {}
530  unique = EnforceUnique()
531  for row in conn.execute(marker_query):
532  unique.see(row['marker_id'])
533  evt = FunctionEvent(id=row['marker_id'],
534  name=strings[row['name']],
535  cpu_start=row['start_time'],
536  cpu_end=row['end_time'],
537  thread=0) # TODO: find in sqlite database
538  functions.append(evt)
539  functions_map[evt.id] = evt
540 
541  # Now, correlate all kernels with FunctionEvents
542  kernel_query = """
543  SELECT
544  start.id AS marker_id, start.name, start.timestamp, end.timestamp,
545  runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end,
546  kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name
547  FROM
548  CUPTI_ACTIVITY_KIND_MARKER AS start
549  INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
550  ON start.id = end.id
551  INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime
552  ON (start.timestamp < runtime.start AND runtime.end < end.timestamp)
553  INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel
554  ON kernel.correlationId = runtime.correlationId
555  """
556  unique = EnforceUnique()
557  for row in conn.execute(kernel_query):
558  unique.see(row['marker_id'], row['runtime_id'])
559  assert row['cbid'] == 13 # 13 == Launch
560  evt = functions_map[row['marker_id']]
561  evt.append_kernel(row['kernel_name'],
562  0,
563  row['kernel_start'],
564  row['kernel_end'])
565 
566  functions.sort(key=lambda evt: evt.cpu_interval.start)
567  return functions
568 
569 
570 ################################################################################
571 # Pretty printer
572 
573 def build_table(events, sort_by=None, header=None):
574  """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
575  if sort_by is not None:
576  events = sorted(events, key=lambda evt: getattr(evt, sort_by))
577 
578  name_lengths = [len(evt.key) for evt in events]
579  if len(name_lengths) == 0:
580  return ""
581  max_name_length = max(name_lengths)
582  max_name_length += 4 # Add some nice padding
583  col_width = 15
584  col_format = ' {: >' + str(col_width) + '}'
585  row_format = '{: <' + str(max_name_length) + '}' + col_format * 5
586  header_sep = '-' * max_name_length + (' ' + '-' * col_width) * 5
587 
588  # Have to use a list because nonlocal is Py3 only...
589  result = []
590 
591  def append(s):
592  result.append(s)
593  result.append('\n') # Yes, newline after the end as well
594 
595  # Actual printing
596  if header is not None:
597  line_length = max_name_length + (col_width + 2) * 5
598  append('=' * line_length)
599  append(header)
600  append(header_sep)
601  append(row_format.format('Name', 'CPU time', 'CUDA time', 'Calls', 'CPU total', 'CUDA total'))
602  append(header_sep)
603  for evt in events:
604  append(row_format.format(evt.key, evt.cpu_time_str, evt.cuda_time_str,
605  evt.count, evt.cpu_time_total_str, evt.cuda_time_total_str))
606 
607  return ''.join(result)