Caffe2 - C++ API
A deep learning, cross platform ML framework
signal_handler.cc
1 #include "caffe2/utils/signal_handler.h"
2 #include "caffe2/core/logging.h"
3 
4 #if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
5 
6 // Normal signal handler implementation.
7 #include <cxxabi.h>
8 #include <dirent.h>
9 #include <dlfcn.h>
10 #include <pthread.h>
11 #include <sys/syscall.h>
12 #include <sys/types.h>
13 #include <unistd.h>
14 #include <unwind.h>
15 
16 #include <atomic>
17 #include <csignal>
18 #include <cstdio>
19 #include <cstdlib>
20 #include <mutex>
21 #include <unordered_set>
22 
23 #include "caffe2/core/init.h"
24 #include "caffe2/core/workspace.h"
25 
26 #if C10_ANDROID
27 #ifndef SYS_gettid
28 #define SYS_gettid __NR_gettid
29 #endif
30 #ifndef SYS_tgkill
31 #define SYS_tgkill __NR_tgkill
32 #endif
33 #endif
34 
35 namespace {
36 
// Signal actions that were in effect before hookupHandler() replaced them.
struct sigaction previousSighup = {};
struct sigaction previousSigint = {};
// Number of SIGINT / SIGHUP deliveries observed by handleSignal().
std::atomic<int> sigintCount{0};
std::atomic<int> sighupCount{0};
// Number of outstanding hookupHandler() calls not yet matched by
// unhookHandler().
std::atomic<int> hookedUpCount{0};
42 
43 void handleSignal(int signal) {
44  switch (signal) {
45  // TODO: what if the previous handler uses sa_sigaction?
46  case SIGHUP:
47  sighupCount += 1;
48  if (previousSighup.sa_handler) {
49  previousSighup.sa_handler(signal);
50  }
51  break;
52  case SIGINT:
53  sigintCount += 1;
54  if (previousSigint.sa_handler) {
55  previousSigint.sa_handler(signal);
56  }
57  break;
58  }
59 }
60 
61 void hookupHandler() {
62  if (hookedUpCount++) {
63  return;
64  }
65  struct sigaction sa;
66  // Setup the handler
67  sa.sa_handler = &handleSignal;
68  // Restart the system call, if at all possible
69  sa.sa_flags = SA_RESTART;
70  // Block every signal during the handler
71  sigfillset(&sa.sa_mask);
72  // Intercept SIGHUP and SIGINT
73  if (sigaction(SIGHUP, &sa, &previousSighup) == -1) {
74  LOG(FATAL) << "Cannot install SIGHUP handler.";
75  }
76  if (sigaction(SIGINT, &sa, &previousSigint) == -1) {
77  LOG(FATAL) << "Cannot install SIGINT handler.";
78  }
79 }
80 
81 // Set the signal handlers to the default.
82 void unhookHandler() {
83  if (--hookedUpCount > 0) {
84  return;
85  }
86  struct sigaction sa;
87  // Setup the sighub handler
88  sa.sa_handler = SIG_DFL;
89  // Restart the system call, if at all possible
90  sa.sa_flags = SA_RESTART;
91  // Block every signal during the handler
92  sigfillset(&sa.sa_mask);
93  // Intercept SIGHUP and SIGINT
94  if (sigaction(SIGHUP, &previousSighup, nullptr) == -1) {
95  LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
96  }
97  if (sigaction(SIGINT, &previousSigint, nullptr) == -1) {
98  LOG(FATAL) << "Cannot uninstall SIGINT handler.";
99  }
100 }
101 
102 #if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
// Guards fatalSignalHandlersInstalled.
std::mutex fatalSignalHandlersInstallationMutex;
bool fatalSignalHandlersInstalled = false;
// The SIGUSR2 action that was installed before ours; kept so we can forward
// SIGUSR2 deliveries that we did not trigger ourselves.
struct sigaction previousSigusr2 = {};
// When true, the SIGUSR2 handler prints a stack trace; when false, it falls
// back to the previous SIGUSR2 handler.
std::atomic<bool> fatalSignalReceived{false};
// Recorded when a fatal signal arrives so that the backtracing threads can
// report which signal caused the dump.
const char* fatalSignalName = "<UNKNOWN>";
int fatalSignum = -1;
// The fatal signal handler waits on this condition for each other thread to
// finish writing its stack trace (pthread_join cannot be used because there
// is no way to convert a tid to a pthread_t).
pthread_cond_t writingCond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t writingMutex = PTHREAD_MUTEX_INITIALIZER;
121 
// Table of the fatal signals we intercept. Each row holds the printable
// name, the signal number and a slot for the action that was installed
// before ours. A row whose name is nullptr terminates the table.
struct {
  const char* name;
  int signum;
  struct sigaction previous;
} kSignalHandlers[] = {
    {"SIGABRT", SIGABRT, {}},
    {"SIGINT", SIGINT, {}},
    {"SIGILL", SIGILL, {}},
    {"SIGFPE", SIGFPE, {}},
    {"SIGBUS", SIGBUS, {}},
    {"SIGSEGV", SIGSEGV, {}},
    {nullptr, 0, {}}, // sentinel
};
135 
136 struct sigaction* getPreviousSigaction(int signum) {
137  for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
138  if (handler->signum == signum) {
139  return &handler->previous;
140  }
141  }
142  return nullptr;
143 }
144 
145 const char* getSignalName(int signum) {
146  for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
147  if (handler->signum == signum) {
148  return handler->name;
149  }
150  }
151  return nullptr;
152 }
153 
// _Unwind_Backtrace callback: appends the instruction pointer of the frame
// described by `context` to the std::vector<uintptr_t> that `userInfo` points
// to, and asks the unwinder to keep going.
_Unwind_Reason_Code unwinder(struct _Unwind_Context* context, void* userInfo) {
  auto* frames = static_cast<std::vector<uintptr_t>*>(userInfo);
  const uintptr_t ip = _Unwind_GetIP(context);
  frames->push_back(ip);
  return _URC_NO_REASON;
}
159 
160 std::vector<uintptr_t> getBacktrace() {
161  std::vector<uintptr_t> pcs;
162  _Unwind_Backtrace(unwinder, &pcs);
163  return pcs;
164 }
165 
166 void printBlobSizes() {
168  [&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
169 }
170 
171 void printStacktrace() {
172  std::vector<uintptr_t> pcs = getBacktrace();
173  Dl_info info;
174  size_t i = 0;
175  for (uintptr_t pcAddr : pcs) {
176  const void* pc = reinterpret_cast<const void*>(pcAddr);
177  const char* path = nullptr;
178  const char* name = "???";
179  char* demangled = nullptr;
180  int offset = -1;
181 
182  std::cerr << "[" << i << "] ";
183  if (dladdr(pc, &info)) {
184  path = info.dli_fname;
185  name = info.dli_sname ?: "???";
186  offset = reinterpret_cast<uintptr_t>(pc) -
187  reinterpret_cast<uintptr_t>(info.dli_saddr);
188 
189  int status;
190  demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
191  if (status == 0) {
192  name = demangled;
193  }
194  }
195  std::cerr << name;
196  if (offset >= 0) {
197  std::cerr << "+" << reinterpret_cast<void*>(offset);
198  }
199  std::cerr << "(" << pc << ")";
200  if (path) {
201  std::cerr << " in " << path;
202  }
203  std::cerr << std::endl;
204  if (demangled) {
205  free(demangled);
206  }
207  i += 1;
208  }
209 }
210 
// Forwards a signal to the handler described by `action`.
//
// If `action` was registered with SA_SIGINFO, its three-argument
// sa_sigaction handler is called with (signum, info, ctx); otherwise its
// one-argument sa_handler is called with signum. Nothing is called when the
// handler is null or is one of the SIG_DFL / SIG_IGN sentinels, because those
// are special values rather than callable functions.
void callPreviousSignalHandler(
    struct sigaction* action,
    int signum,
    siginfo_t* info,
    void* ctx) {
  if ((action->sa_flags & SA_SIGINFO) == SA_SIGINFO) {
    if (action->sa_sigaction) {
      action->sa_sigaction(signum, info, ctx);
    }
    return;
  }
  if (action->sa_handler == nullptr || action->sa_handler == SIG_DFL ||
      action->sa_handler == SIG_IGN) {
    return;
  }
  action->sa_handler(signum);
}
225 
226 // needsLock signals whether we need to lock our writing mutex.
227 void stacktraceSignalHandler(bool needsLock) {
228  if (needsLock) {
229  pthread_mutex_lock(&writingMutex);
230  }
231  pid_t tid = syscall(SYS_gettid);
232  std::cerr << fatalSignalName << "(" << fatalSignum << "), Thread " << tid
233  << ": " << std::endl;
234  printStacktrace();
235  std::cerr << std::endl;
236  if (needsLock) {
237  pthread_mutex_unlock(&writingMutex);
238  pthread_cond_signal(&writingCond);
239  }
240 }
241 
242 // Our fatal signal entry point
243 void fatalSignalHandler(int signum) {
244  // Check if this is a proper signal that we declared above.
245  const char* name = getSignalName(signum);
246  if (!name) {
247  return;
248  }
249  if (fatalSignalReceived) {
250  return;
251  }
252  // Set the flag so that our SIGUSR2 handler knows that we're aborting and
253  // that it should intercept any SIGUSR2 signal.
254  fatalSignalReceived = true;
255  // Set state for other threads.
256  fatalSignum = signum;
257  fatalSignalName = name;
258  // Linux doesn't have a nice userland API for enumerating threads so we
259  // need to use the proc pseudo-filesystem.
260  DIR* procDir = opendir("/proc/self/task");
261  if (procDir) {
262  pid_t pid = getpid();
263  pid_t currentTid = syscall(SYS_gettid);
264  struct dirent* entry;
265  pthread_mutex_lock(&writingMutex);
266  while ((entry = readdir(procDir)) != nullptr) {
267  if (entry->d_name[0] == '.') {
268  continue;
269  }
270  pid_t tid = atoi(entry->d_name);
271  // If we've found the current thread then we'll jump into the SIGUSR2
272  // handler before calling pthread_cond_wait thus deadlocking, so branch
273  // our directly to the backtrace handler instead of signaling it.
274  if (tid != currentTid) {
275  syscall(SYS_tgkill, pid, tid, SIGUSR2);
276  pthread_cond_wait(&writingCond, &writingMutex);
277  } else {
278  stacktraceSignalHandler(false);
279  }
280  }
281  pthread_mutex_unlock(&writingMutex);
282  } else {
283  perror("Failed to open /proc/self/task");
284  }
285  printBlobSizes();
286  sigaction(signum, getPreviousSigaction(signum), nullptr);
287  raise(signum);
288 }
289 
290 // Our SIGUSR2 entry point
291 void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx) {
292  if (fatalSignalReceived) {
293  stacktraceSignalHandler(true);
294  } else {
295  // We don't want to actually change the signal handler as we want to
296  // remain the signal handler so that we may get the usr2 signal later.
297  callPreviousSignalHandler(&previousSigusr2, signum, info, ctx);
298  }
299 }
300 
301 // Installs SIGABRT signal handler so that we get stack traces
302 // from every thread on SIGABRT caused exit. Also installs SIGUSR2 handler
303 // so that threads can communicate with each other (be sure if you use SIGUSR2)
304 // to install your handler before initing caffe2 (we properly fall back to
305 // the previous handler if we didn't initiate the SIGUSR2).
306 void installFatalSignalHandlers() {
307  std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
308  if (fatalSignalHandlersInstalled) {
309  return;
310  }
311  fatalSignalHandlersInstalled = true;
312  struct sigaction sa;
313  sigemptyset(&sa.sa_mask);
314  // Since we'll be in an exiting situation it's possible there's memory
315  // corruption, so make our own stack just in case.
316  sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
317  sa.sa_handler = ::fatalSignalHandler;
318  for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
319  if (sigaction(handler->signum, &sa, &handler->previous)) {
320  std::string str("Failed to add ");
321  str += handler->name;
322  str += " handler!";
323  perror(str.c_str());
324  }
325  }
326  sa.sa_sigaction = ::stacktraceSignalHandler;
327  if (sigaction(SIGUSR2, &sa, &::previousSigusr2)) {
328  perror("Failed to add SIGUSR2 handler!");
329  }
330 }
331 
332 void uninstallFatalSignalHandlers() {
333  std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
334  if (!fatalSignalHandlersInstalled) {
335  return;
336  }
337  fatalSignalHandlersInstalled = false;
338  for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
339  if (sigaction(handler->signum, &handler->previous, nullptr)) {
340  std::string str("Failed to remove ");
341  str += handler->name;
342  str += " handler!";
343  perror(str.c_str());
344  } else {
345  handler->previous = {};
346  }
347  }
348  if (sigaction(SIGUSR2, &::previousSigusr2, nullptr)) {
349  perror("Failed to add SIGUSR2 handler!");
350  } else {
351  ::previousSigusr2 = {};
352  }
353 }
354 #endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
355 
356 } // namespace
357 
358 #if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
359 C10_DEFINE_bool(
360  caffe2_print_stacktraces,
361  false,
362  "If set, prints stacktraces when a fatal signal is raised.");
363 #endif
364 
365 namespace caffe2 {
366 
367 SignalHandler::SignalHandler(
368  SignalHandler::Action SIGINT_action,
369  SignalHandler::Action SIGHUP_action)
370  : SIGINT_action_(SIGINT_action),
371  SIGHUP_action_(SIGHUP_action),
372  my_sigint_count_(sigintCount),
373  my_sighup_count_(sighupCount) {
374  hookupHandler();
375 }
376 
377 SignalHandler::~SignalHandler() {
378  unhookHandler();
379 }
380 
381 // Return true iff a SIGINT has been received since the last time this
382 // function was called.
383 bool SignalHandler::GotSIGINT() {
384  uint64_t count = sigintCount;
385  bool result = (count != my_sigint_count_);
386  my_sigint_count_ = count;
387  return result;
388 }
389 
390 // Return true iff a SIGHUP has been received since the last time this
391 // function was called.
392 bool SignalHandler::GotSIGHUP() {
393  uint64_t count = sighupCount;
394  bool result = (count != my_sighup_count_);
395  my_sighup_count_ = count;
396  return result;
397 }
398 
399 SignalHandler::Action SignalHandler::CheckForSignals() {
400  if (GotSIGHUP()) {
401  return SIGHUP_action_;
402  }
403  if (GotSIGINT()) {
404  return SIGINT_action_;
405  }
406  return SignalHandler::Action::NONE;
407 }
408 
409 #if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
410 void setPrintStackTracesOnFatalSignal(bool print) {
411  if (print) {
412  installFatalSignalHandlers();
413  } else {
414  uninstallFatalSignalHandlers();
415  }
416 }
417 bool printStackTracesOnFatalSignal() {
418  std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
419  return fatalSignalHandlersInstalled;
420 }
421 
422 namespace internal {
423 bool Caffe2InitFatalSignalHandler(int*, char***) {
424  if (FLAGS_caffe2_print_stacktraces) {
425  setPrintStackTracesOnFatalSignal(true);
426  }
427  return true;
428 }
429 
430 REGISTER_CAFFE2_INIT_FUNCTION(
431  Caffe2InitFatalSignalHandler,
432  &Caffe2InitFatalSignalHandler,
433  "Inits signal handlers for fatal signals so we can see what if"
434  " caffe2_print_stacktraces is set.");
435 
436 } // namepsace internal
437 #endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
438 } // namespace caffe2
439 
440 #else // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
441 
442 // TODO: Currently we do not support signal handling in non-Linux yet - below is
443 // a minimal implementation that makes things compile.
444 namespace caffe2 {
445 SignalHandler::SignalHandler(
446  SignalHandler::Action SIGINT_action,
447  SignalHandler::Action SIGHUP_action) {}
448 SignalHandler::~SignalHandler() {}
449 bool SignalHandler::GotSIGINT() {
450  return false;
451 }
452 bool SignalHandler::GotSIGHUP() {
453  return false;
454 }
455 SignalHandler::Action SignalHandler::CheckForSignals() {
456  return SignalHandler::Action::NONE;
457 }
458 } // namespace caffe2
459 
460 #endif // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
static void ForEach(F f)
Applies a function f on each workspace that currently exists.
Definition: workspace.h:302
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13