Add unit test for bgprocess smooth recovery

[oweals/dinit.git] / src / proc-service.cc
diff --git a/src/proc-service.cc b/src/proc-service.cc

index 2aa91ac0f05e2863eb484ea4258c0da32ffa5695..c7970dff824067e31f3a770d66e6b7cdca66c61d 100644 (file)
--- a/src/proc-service.cc
+++ b/src/proc-service.cc
@@ -1,4 +1,5 @@
  #include <cstring>
+#include <type_traits>
  
  #include <sys/un.h>
  #include <sys/socket.h>
@@ -15,6 +16,20 @@
   * See proc-service.h header for interface details.
   */
  
+// Strings describing the execution stages (failure points).
+const char * const exec_stage_descriptions[static_cast<int>(exec_stage::DO_EXEC) + 1] = {
+        "arranging file descriptors",   // ARRANGE_FDS
+        "reading environment file",     // READ_ENV_FILE
+        "setting environment variable", // SET_NOTIFYFD_VAR
+        "setting up activation socket", // SETUP_ACTIVATION_SOCKET
+        "setting up control socket",    // SETUP_CONTROL_SOCKET
+        "changing directory",           // CHDIR
+        "setting up standard input/output descriptors", // SETUP_STDINOUTERR
+        "setting resource limits",      // SET_RLIMITS
+        "setting user/group ID",        // SET_UIDGID
+        "executing command"             // DO_EXEC
+};
+
  // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  // store a null terminator for the argument. Return a `char *` vector containing the beginning
  // of each argument and a trailing nullptr. (The returned array is invalidated if the string is
@@ -47,7 +62,13 @@ void process_service::exec_succeeded() noexcept
      // might be stopped (and killed via a signal) during smooth recovery.  We don't to
      // process startup again in either case, so we check for state STARTING:
      if (get_state() == service_state_t::STARTING) {
-        started();
+        if (force_notification_fd != -1 || !notification_var.empty()) {
+            // Wait for readiness notification:
+            readiness_watcher.set_enabled(event_loop, true);
+        }
+        else {
+            started();
+        }
      }
      else if (get_state() == service_state_t::STOPPING) {
          // stopping, but smooth recovery was in process. That's now over so we can
@@ -70,8 +91,8 @@ rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) n
      base_process_service *sr = service;
      sr->waiting_for_execstat = false;
  
-    int exec_status;
-    int r = read(get_watched_fd(), &exec_status, sizeof(int));
+    run_proc_err exec_status;
+    int r = read(get_watched_fd(), &exec_status, sizeof(exec_status));
      deregister(loop);
      close(get_watched_fd());
  
@@ -102,6 +123,36 @@ rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) n
      return rearm::REMOVED;
  }
  
+rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
+{
+    char buf[128];
+    if (service->get_state() == service_state_t::STARTING) {
+        // can we actually read anything from the notification pipe?
+        int r = bp_sys::read(fd, buf, sizeof(buf));
+        if (r > 0) {
+            service->started();
+        }
+        else if (r == 0 || errno != EAGAIN) {
+            service->failed_to_start(false, false);
+            service->set_state(service_state_t::STOPPING);
+            service->bring_down();
+        }
+    }
+    else {
+        // Just keep consuming data from the pipe:
+        int r = bp_sys::read(fd, buf, sizeof(buf));
+        if (r == 0) {
+            // Process closed write end or terminated
+            close(fd);
+            service->notification_fd = -1;
+            return rearm::DISARM;
+        }
+    }
+
+    service->services->process_queues();
+    return rearm::REARM;
+}
+
  dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
  {
      base_process_service *sr = service;
@@ -109,10 +160,9 @@ dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t chil
      sr->pid = -1;
      sr->exit_status = bp_sys::exit_status(status);
  
-    // Ok, for a process service, any process death which we didn't rig
-    // ourselves is a bit... unexpected. Probably, the child died because
-    // we asked it to (sr->service_state == STOPPING). But even if
-    // we didn't, there's not much we can do.
+    // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
+    // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
+    // didn't, there's not much we can do.
  
      if (sr->waiting_for_execstat) {
          // We still don't have an exec() status from the forked child, wait for that
@@ -137,10 +187,15 @@ void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexce
  {
      bool did_exit = exit_status.did_exit();
      bool was_signalled = exit_status.was_signalled();
-    restarting = false;
      auto service_state = get_state();
  
-    if (exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
+    if (notification_fd != -1) {
+        readiness_watcher.deregister(event_loop);
+        bp_sys::close(notification_fd);
+        notification_fd = -1;
+    }
+
+    if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
          if (did_exit) {
              log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
                      exit_status.get_exit_status());
@@ -151,14 +206,17 @@ void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexce
          }
      }
  
+#if USE_UTMPX
+    if (*inittab_id || *inittab_line) {
+        clear_utmp_entry(inittab_id, inittab_line);
+    }
+#endif
+
      if (service_state == service_state_t::STARTING) {
-        if (exit_status.did_exit_clean()) {
-            started();
-        }
-        else {
-            stop_reason = stopped_reason_t::FAILED;
-            failed_to_start();
-        }
+        // If state is STARTING, we must be waiting for readiness notification; the process has
+        // terminated before becoming ready.
+        stop_reason = stopped_reason_t::FAILED;
+        failed_to_start();
      }
      else if (service_state == service_state_t::STOPPING) {
          // We won't log a non-zero exit status or termination due to signal here -
@@ -180,9 +238,17 @@ void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexce
      services->process_queues();
  }
  
-void process_service::exec_failed(int errcode) noexcept
+void process_service::exec_failed(run_proc_err errcode) noexcept
  {
-    log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
+    log(loglevel_t::ERROR, get_name(), ": execution failed - ",
+            exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
+
+    if (notification_fd != -1) {
+        readiness_watcher.deregister(event_loop);
+        bp_sys::close(notification_fd);
+        notification_fd = -1;
+    }
+
      if (get_state() == service_state_t::STARTING) {
          stop_reason = stopped_reason_t::EXECFAILED;
          failed_to_start();
@@ -196,6 +262,13 @@ void process_service::exec_failed(int errcode) noexcept
  
  void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
  {
+    // For bgproc services, receiving exit status can mean one of two things:
+    // 1. We were launching the process, and it finished (possibly after forking). If it did fork
+    //    we want to obtain the process id of the process that we should now monitor, the actual
+    //    daemon.
+    // 2. The above has already happened, and we are monitoring the daemon process, which has now
+    //    terminated for some reason.
+
      begin:
      bool did_exit = exit_status.did_exit();
      bool was_signalled = exit_status.was_signalled();
@@ -213,7 +286,8 @@ void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcep
      }
  
      // This may be a "smooth recovery" where we are restarting the process while leaving the
-    // service in the STARTED state.
+    // service in the STARTED state. This must be the case if 'restarting' is set while the state
+    // is currently STARTED.
      if (restarting && service_state == service_state_t::STARTED) {
          restarting = false;
          bool need_stop = false;
@@ -247,7 +321,6 @@ void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcep
          return;
      }
  
-    restarting = false;
      if (service_state == service_state_t::STARTING) {
          // POSIX requires that if the process exited clearly with a status code of 0,
          // the exit status value will be 0:
@@ -281,13 +354,10 @@ void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcep
      else {
          // we must be STARTED
          if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
+            restarting = true;
              do_smooth_recovery();
              return;
          }
-        if (! do_auto_restart() && start_explicit) {
-            start_explicit = false;
-            release(false);
-        }
          stop_reason = stopped_reason_t::TERMINATED;
          forced_stop();
          stop_dependents();
@@ -296,9 +366,11 @@ void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcep
      services->process_queues();
  }
  
-void bgproc_service::exec_failed(int errcode) noexcept
+void bgproc_service::exec_failed(run_proc_err errcode) noexcept
  {
-    log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
+    log(loglevel_t::ERROR, get_name(), ": execution failed - ",
+            exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
+
      // Only time we execute is for startup:
      stop_reason = stopped_reason_t::EXECFAILED;
      failed_to_start();
@@ -319,11 +391,15 @@ void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexc
          // We might be running the stop script, or we might be running the start script and have issued
          // a cancel order via SIGINT:
          if (interrupting_start) {
+            if (stop_timer_armed) {
+                restart_timer.stop_timer(event_loop);
+                stop_timer_armed = false;
+            }
              // We issued a start interrupt, so we expected this failure:
              if (did_exit && exit_status.get_exit_status() != 0) {
                  log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
                          exit_status.get_exit_status());
-                // Assume that a command terminating normally requires no cleanup:
+                // Assume that a command terminating normally (with failure status) requires no cleanup:
                  stopped();
              }
              else {
@@ -384,9 +460,10 @@ void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexc
      }
  }
  
-void scripted_service::exec_failed(int errcode) noexcept
+void scripted_service::exec_failed(run_proc_err errcode) noexcept
  {
-    log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
+    log(loglevel_t::ERROR, get_name(), ": execution failed - ",
+            exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
      auto service_state = get_state();
      if (service_state == service_state_t::STARTING) {
          stop_reason = stopped_reason_t::EXECFAILED;
@@ -399,11 +476,17 @@ void scripted_service::exec_failed(int errcode) noexcept
      }
  }
  
+// Return a value as an unsigned-type value.
+template <typename T> typename std::make_unsigned<T>::type make_unsigned_val(T val)
+{
+    return static_cast<typename std::make_unsigned<T>::type>(val);
+}
+
  bgproc_service::pid_result_t
  bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
  {
      const char *pid_file_c = pid_file.c_str();
-    int fd = open(pid_file_c, O_CLOEXEC);
+    int fd = bp_sys::open(pid_file_c, O_CLOEXEC);
      if (fd == -1) {
          log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
          return pid_result_t::FAILED;
@@ -414,17 +497,17 @@ bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
      if (r < 0) {
          // Could not read from PID file
          log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
-        close(fd);
+        bp_sys::close(fd);
          return pid_result_t::FAILED;
      }
  
-    close(fd);
+    bp_sys::close(fd);
      pidbuf[r] = 0; // store nul terminator
  
      bool valid_pid = false;
      try {
          unsigned long long v = std::stoull(pidbuf, nullptr, 0);
-        if (v <= std::numeric_limits<pid_t>::max()) {
+        if (v <= make_unsigned_val(std::numeric_limits<pid_t>::max())) {
              pid = (pid_t) v;
              valid_pid = true;
          }
@@ -440,7 +523,7 @@ bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
          pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
          if (wait_r == -1 && errno == ECHILD) {
              // We can't track this child - check process exists:
-            if (kill(pid, 0) == 0 || errno != ESRCH) {
+            if (bp_sys::kill(pid, 0) == 0 || errno != ESRCH) {
                  tracking_child = false;
                  return pid_result_t::OK;
              }