Fix case of process termination before readiness notification.
[oweals/dinit.git] / src / proc-service.cc
index 9e64eecc6e193062d4e0fea55ab9eae19509b25f..9cb94cdcde60a0278e04a8247114a5bea6df44e4 100644 (file)
@@ -1,20 +1,26 @@
+#include <cstring>
+
 #include <sys/un.h>
 #include <sys/socket.h>
 
+#include "dinit.h"
 #include "dinit-socket.h"
 #include "dinit-util.h"
+#include "dinit-log.h"
 #include "proc-service.h"
 
-extern eventloop_t event_loop;
-
-using clock_type = dasynq::clock_type;
-using rearm = dasynq::rearm;
-using time_val = dasynq::time_val;
+/*
+ * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
+ *
+ * See proc-service.h header for interface details.
+ */
 
 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
 // store a null terminator for the argument. Return a `char *` vector containing the beginning
-// of each argument and a trailing nullptr. (The returned array is invalidated if the string is later modified).
-std::vector<const char *> separate_args(std::string &s, std::list<std::pair<unsigned,unsigned>> &arg_indices)
+// of each argument and a trailing nullptr. (The returned array is invalidated if the string is
+// later modified).
+std::vector<const char *> separate_args(std::string &s,
+        const std::list<std::pair<unsigned,unsigned>> &arg_indices)
 {
     std::vector<const char *> r;
     r.reserve(arg_indices.size() + 1);
@@ -41,7 +47,13 @@ void process_service::exec_succeeded() noexcept
     // might be stopped (and killed via a signal) during smooth recovery.  We don't want to
     // process startup again in either case, so we check for state STARTING:
     if (get_state() == service_state_t::STARTING) {
-        started();
+        if (force_notification_fd != -1 || !notification_var.empty()) {
+            // Wait for readiness notification:
+            readiness_watcher.set_enabled(event_loop, true);
+        }
+        else {
+            started();
+        }
     }
     else if (get_state() == service_state_t::STOPPING) {
         // stopping, but smooth recovery was in process. That's now over so we can
@@ -53,6 +65,12 @@ void process_service::exec_succeeded() noexcept
     }
 }
 
+void scripted_service::exec_succeeded() noexcept
+{
+       // For a scripted service, this means nothing other than that the start/stop
+       // script will now begin.
+}
+
 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
 {
     base_process_service *sr = service;
@@ -90,17 +108,44 @@ rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) n
     return rearm::REMOVED;
 }
 
+rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
+{
+    char buf[128];
+    if (service->get_state() == service_state_t::STARTING) {
+        // can we actually read anything from the notification pipe?
+        int r = bp_sys::read(fd, buf, sizeof(buf));
+        if (r > 0) {
+            service->started();
+        }
+        else if (r == 0 || errno != EAGAIN) {
+            service->failed_to_start(false, false);
+            service->set_state(service_state_t::STOPPING);
+            service->bring_down();
+        }
+    }
+    else {
+        // Just keep consuming data from the pipe:
+        int r = bp_sys::read(fd, buf, sizeof(buf));
+        if (r == 0) {
+            // Process closed write end or terminated
+            close(fd);
+            service->notification_fd = -1;
+            return rearm::DISARM;
+        }
+    }
+    return rearm::REARM;
+}
+
 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
 {
     base_process_service *sr = service;
 
     sr->pid = -1;
-    sr->exit_status = status;
+    sr->exit_status = bp_sys::exit_status(status);
 
-    // Ok, for a process service, any process death which we didn't rig
-    // ourselves is a bit... unexpected. Probably, the child died because
-    // we asked it to (sr->service_state == STOPPING). But even if
-    // we didn't, there's not much we can do.
+    // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
+    // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
+    // didn't, there's not much we can do.
 
     if (sr->waiting_for_execstat) {
         // We still don't have an exec() status from the forked child, wait for that
@@ -117,39 +162,46 @@ dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t chil
         sr->stop_timer_armed = false;
     }
 
-    sr->handle_exit_status(status);
+    sr->handle_exit_status(bp_sys::exit_status(status));
     return dasynq::rearm::NOOP;
 }
 
-void process_service::handle_exit_status(int exit_status) noexcept
+void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 {
-    bool did_exit = WIFEXITED(exit_status);
-    bool was_signalled = WIFSIGNALED(exit_status);
+    bool did_exit = exit_status.did_exit();
+    bool was_signalled = exit_status.was_signalled();
     restarting = false;
     auto service_state = get_state();
 
-    if (exit_status != 0 && service_state != service_state_t::STOPPING) {
+    if (notification_fd != -1) {
+        readiness_watcher.deregister(event_loop);
+        bp_sys::close(notification_fd);
+        notification_fd = -1;
+    }
+
+    if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
         if (did_exit) {
             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
-                    WEXITSTATUS(exit_status));
+                    exit_status.get_exit_status());
         }
         else if (was_signalled) {
             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
-                    WTERMSIG(exit_status));
+                    exit_status.get_term_sig());
         }
     }
 
     if (service_state == service_state_t::STARTING) {
-        if (did_exit && WEXITSTATUS(exit_status) == 0) {
-            started();
-        }
-        else {
-            failed_to_start();
-        }
+        // If state is STARTING, we must be waiting for readiness notification; the process has
+        // terminated before becoming ready.
+        stop_reason = stopped_reason_t::FAILED;
+        failed_to_start();
     }
     else if (service_state == service_state_t::STOPPING) {
         // We won't log a non-zero exit status or termination due to signal here -
         // we assume that the process died because we signalled it.
+        if (stop_timer_armed) {
+            restart_timer.stop_timer(event_loop);
+        }
         stopped();
     }
     else if (smooth_recovery && service_state == service_state_t::STARTED
@@ -158,6 +210,7 @@ void process_service::handle_exit_status(int exit_status) noexcept
         return;
     }
     else {
+        stop_reason = stopped_reason_t::TERMINATED;
         emergency_stop();
     }
     services->process_queues();
@@ -166,30 +219,39 @@ void process_service::handle_exit_status(int exit_status) noexcept
 void process_service::exec_failed(int errcode) noexcept
 {
     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
+
+    if (notification_fd != -1) {
+        readiness_watcher.deregister(event_loop);
+        bp_sys::close(notification_fd);
+        notification_fd = -1;
+    }
+
     if (get_state() == service_state_t::STARTING) {
+        stop_reason = stopped_reason_t::EXECFAILED;
         failed_to_start();
     }
     else {
         // Process service in smooth recovery:
+        stop_reason = stopped_reason_t::TERMINATED;
         emergency_stop();
     }
 }
 
-void bgproc_service::handle_exit_status(int exit_status) noexcept
+void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 {
     begin:
-    bool did_exit = WIFEXITED(exit_status);
-    bool was_signalled = WIFSIGNALED(exit_status);
+    bool did_exit = exit_status.did_exit();
+    bool was_signalled = exit_status.was_signalled();
     auto service_state = get_state();
 
-    if (exit_status != 0 && service_state != service_state_t::STOPPING) {
+    if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
         if (did_exit) {
             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
-                    WEXITSTATUS(exit_status));
+                    exit_status.get_exit_status());
         }
         else if (was_signalled) {
             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
-                    WTERMSIG(exit_status));
+                    exit_status.get_term_sig());
         }
     }
 
@@ -198,7 +260,7 @@ void bgproc_service::handle_exit_status(int exit_status) noexcept
     if (restarting && service_state == service_state_t::STARTED) {
         restarting = false;
         bool need_stop = false;
-        if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) {
+        if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
             need_stop = true;
         }
         else {
@@ -220,6 +282,7 @@ void bgproc_service::handle_exit_status(int exit_status) noexcept
 
         if (need_stop) {
             // Failed startup: no auto-restart.
+            stop_reason = stopped_reason_t::TERMINATED;
             emergency_stop();
             services->process_queues();
         }
@@ -231,11 +294,12 @@ void bgproc_service::handle_exit_status(int exit_status) noexcept
     if (service_state == service_state_t::STARTING) {
         // POSIX requires that if the process exited cleanly with a status code of 0,
         // the exit status value will be 0:
-        if (exit_status == 0) {
+        if (exit_status.did_exit_clean()) {
             auto pid_result = read_pid_file(&exit_status);
             switch (pid_result) {
                 case pid_result_t::FAILED:
                     // Failed startup: no auto-restart.
+                    stop_reason = stopped_reason_t::FAILED;
                     failed_to_start();
                     break;
                 case pid_result_t::TERMINATED:
@@ -248,6 +312,7 @@ void bgproc_service::handle_exit_status(int exit_status) noexcept
             }
         }
         else {
+            stop_reason = stopped_reason_t::FAILED;
             failed_to_start();
         }
     }
@@ -264,8 +329,9 @@ void bgproc_service::handle_exit_status(int exit_status) noexcept
         }
         if (! do_auto_restart() && start_explicit) {
             start_explicit = false;
-            release();
+            release(false);
         }
+        stop_reason = stopped_reason_t::TERMINATED;
         forced_stop();
         stop_dependents();
         stopped();
@@ -277,13 +343,14 @@ void bgproc_service::exec_failed(int errcode) noexcept
 {
     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
     // Only time we execute is for startup:
+    stop_reason = stopped_reason_t::EXECFAILED;
     failed_to_start();
 }
 
-void scripted_service::handle_exit_status(int exit_status) noexcept
+void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 {
-    bool did_exit = WIFEXITED(exit_status);
-    bool was_signalled = WIFSIGNALED(exit_status);
+    bool did_exit = exit_status.did_exit();
+    bool was_signalled = exit_status.was_signalled();
     auto service_state = get_state();
 
     // For a scripted service, a termination occurs in one of three main cases:
@@ -294,61 +361,66 @@ void scripted_service::handle_exit_status(int exit_status) noexcept
     if (service_state == service_state_t::STOPPING) {
         // We might be running the stop script, or we might be running the start script and have issued
         // a cancel order via SIGINT:
-        if (did_exit && WEXITSTATUS(exit_status) == 0) {
-            if (interrupting_start) {
-                interrupting_start = false;
-                // launch stop script:
-                bring_down();
+        if (interrupting_start) {
+            // We issued a start interrupt, so we expected this failure:
+            if (did_exit && exit_status.get_exit_status() != 0) {
+                log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
+                        exit_status.get_exit_status());
+                // Assume that a command terminating normally requires no cleanup:
+                stopped();
             }
             else {
-                // We were running the stop script and finished successfully
-                stopped();
+                if (was_signalled) {
+                    log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
+                            exit_status.get_term_sig());
+                }
+                // If the start script completed successfully, or was interrupted via our signal,
+                // we want to run the stop script to clean up:
+                bring_down();
             }
+            interrupting_start = false;
+        }
+        else if (exit_status.did_exit_clean()) {
+            // We were running the stop script and finished successfully
+            stopped();
         }
         else {
-            if (interrupting_start) {
-                // We issued a start interrupt, so we expected this failure:
-                if (did_exit) {
-                    log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
-                            WEXITSTATUS(exit_status));
-                }
-                else if (was_signalled) {
-                    log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
-                            WTERMSIG(exit_status));
-                }
+            // ??? failed to stop! Let's log it as warning:
+            if (did_exit) {
+                log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
+                        exit_status.get_exit_status());
             }
-            else {
-                // ??? failed to stop! Let's log it as warning:
-                if (did_exit) {
-                    log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
-                            WEXITSTATUS(exit_status));
-                }
-                else if (was_signalled) {
-                    log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
-                            WTERMSIG(exit_status));
-                }
+            else if (was_signalled) {
+                log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
+                        exit_status.get_term_sig());
             }
             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
             // can be stopped. There's not really any other useful course of action here.
-            interrupting_start = false;
             stopped();
         }
         services->process_queues();
     }
     else { // STARTING
-        if (exit_status == 0) {
+        if (exit_status.did_exit_clean()) {
+            started();
+        }
+        else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
+            // A skippable service can be skipped by interrupting (eg by ^C if the service
+            // starts on the console).
+            start_skipped = true;
             started();
         }
         else {
             // failed to start
             if (did_exit) {
                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
-                        WEXITSTATUS(exit_status));
+                        exit_status.get_exit_status());
             }
             else if (was_signalled) {
                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
-                        WTERMSIG(exit_status));
+                        exit_status.get_term_sig());
             }
+            stop_reason = stopped_reason_t::FAILED;
             failed_to_start();
         }
         services->process_queues();
@@ -360,6 +432,7 @@ void scripted_service::exec_failed(int errcode) noexcept
     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
     auto service_state = get_state();
     if (service_state == service_state_t::STARTING) {
+        stop_reason = stopped_reason_t::EXECFAILED;
         failed_to_start();
     }
     else if (service_state == service_state_t::STOPPING) {
@@ -370,7 +443,7 @@ void scripted_service::exec_failed(int errcode) noexcept
 }
 
 bgproc_service::pid_result_t
-bgproc_service::read_pid_file(int *exit_status) noexcept
+bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
 {
     const char *pid_file_c = pid_file.c_str();
     int fd = open(pid_file_c, O_CLOEXEC);
@@ -380,7 +453,7 @@ bgproc_service::read_pid_file(int *exit_status) noexcept
     }
 
     char pidbuf[21]; // just enough to hold any 64-bit integer
-    int r = ss_read(fd, pidbuf, 20);
+    int r = complete_read(fd, pidbuf, 20);
     if (r < 0) {
         // Could not read from PID file
         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
@@ -440,7 +513,6 @@ bgproc_service::read_pid_file(int *exit_status) noexcept
 
 void process_service::bring_down() noexcept
 {
-    waiting_for_deps = false;
     if (waiting_for_execstat) {
         // The process is still starting. This should be uncommon, but can occur during
         // smooth recovery. We can't do much now; we have to wait until we get the
@@ -458,10 +530,37 @@ void process_service::bring_down() noexcept
             kill_pg(term_signal);
         }
 
+        // If there's a stop timeout, arm the timer now:
+        if (stop_timeout != time_val(0,0)) {
+            restart_timer.arm_timer_rel(event_loop, stop_timeout);
+            stop_timer_armed = true;
+        }
+
+        // The rest is done in handle_exit_status.
+    }
+    else {
+        // The process is already dead.
+        stopped();
+    }
+}
+
+void bgproc_service::bring_down() noexcept
+{
+    if (pid != -1) {
+        // The process is still kicking on - must actually kill it. We signal the process
+        // group (-pid) rather than just the process as there's less risk then of creating
+        // an orphaned process group:
+        if (! onstart_flags.no_sigterm) {
+            kill_pg(SIGTERM);
+        }
+        if (term_signal != -1) {
+            kill_pg(term_signal);
+        }
+
         // In most cases, the rest is done in handle_exit_status.
         // If we are a BGPROCESS and the process is not our immediate child, however, that
         // won't work - check for this now:
-        if (get_type() == service_type_t::BGPROCESS && ! tracking_child) {
+        if (! tracking_child) {
             stopped();
         }
         else if (stop_timeout != time_val(0,0)) {
@@ -477,7 +576,11 @@ void process_service::bring_down() noexcept
 
 void scripted_service::bring_down() noexcept
 {
-    waiting_for_deps = false;
+       if (pid != -1) {
+               // We're already running the stop script; nothing to do.
+               return;
+       }
+
     if (stop_command.length() == 0) {
         stopped();
     }
@@ -496,24 +599,7 @@ void scripted_service::bring_down() noexcept
 
 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 {
-    service->stop_timer_armed = false;
-
-    // Timer expires if:
-    // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are
-    // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting,
-    // including smooth recovery (restart timeout, state is STARTING or STARTED).
-    if (service->get_state() == service_state_t::STOPPING) {
-        service->kill_with_fire();
-    }
-    else if (service->pid != -1) {
-        // Starting, start timed out.
-        service->stop_dependents();
-        service->interrupt_start();
-    }
-    else {
-        // STARTING / STARTED, and we have a pid: must be restarting (smooth recovery if STARTED)
-        service->do_restart();
-    }
+    service->timer_expired();
 
     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
     return dasynq::rearm::NOOP;