Fix restart after unexpected termination.
[oweals/dinit.git] / src / service.cc
index c8e5f56f919e79fc1e788125d4c88d22915fcb89..504a9b6de0836710449785effbac4a3f04b97452 100644 (file)
@@ -1,17 +1,11 @@
 #include <cstring>
 #include <cerrno>
-#include <sstream>
 #include <iterator>
 #include <memory>
 #include <cstddef>
 
-#include <sys/types.h>
-#include <sys/stat.h>
 #include <sys/ioctl.h>
-#include <sys/un.h>
-#include <sys/socket.h>
 #include <fcntl.h>
-#include <unistd.h>
 #include <termios.h>
 
 #include "dinit.h"
@@ -19,6 +13,7 @@
 #include "dinit-log.h"
 #include "dinit-socket.h"
 #include "dinit-util.h"
+#include "baseproc-sys.h"
 
 /*
  * service.cc - Service management.
@@ -31,7 +26,7 @@ static service_record * find_service(const std::list<service_record *> & records
 {
     using std::list;
     list<service_record *>::const_iterator i = records.begin();
-    for ( ; i != records.end(); i++ ) {
+    for ( ; i != records.end(); ++i ) {
         if (strcmp((*i)->get_name().c_str(), name) == 0) {
             return *i;
         }
@@ -57,18 +52,18 @@ void service_set::stop_service(const std::string & name) noexcept
 // is due to an unexpected process termination.
 void service_record::stopped() noexcept
 {
-    if (onstart_flags.runs_on_console) {
-        tcsetpgrp(0, getpgrp());
-        discard_console_log_buffer();
+    if (have_console) {
+        bp_sys::tcsetpgrp(0, bp_sys::getpgrp());
         release_console();
     }
 
     force_stop = false;
 
-    // If we are a soft dependency of another target, break the acquisition from that target now:
+    // If we are a soft dependency of another target, break the acquisition from that target now,
+    // so that we don't re-start:
     for (auto & dependent : dependents) {
         if (dependent->dep_type != dependency_type::REGULAR) {
-            if (dependent->holding_acq) {
+            if (dependent->holding_acq  && ! dependent->waiting_on) {
                 dependent->holding_acq = false;
                 release();
             }
@@ -76,9 +71,9 @@ void service_record::stopped() noexcept
     }
 
     bool will_restart = (desired_state == service_state_t::STARTED)
-            && services->get_auto_restart();
+            && !services->is_shutting_down();
 
-    for (auto dependency : depends_on) {
+    for (auto dependency : depends_on) {
         // we signal dependencies in case they are waiting for us to stop:
         dependency.get_to()->dependent_stopped();
     }
@@ -91,55 +86,68 @@ void service_record::stopped() noexcept
         start(false);
     }
     else {
-        if (socket_fd != -1) {
-            close(socket_fd);
-            socket_fd = -1;
-        }
+        becoming_inactive();
         
         if (start_explicit) {
+            // If we were explicitly started, our required_by count must be at least 1. Use
+            // release() to correctly release, mark inactive and release dependencies.
             start_explicit = false;
             release();
         }
         else if (required_by == 0) {
+            // This can only be the case if we didn't have start_explicit, since required_by would
+            // otherwise be non-zero.
+            prop_release = !prop_require;
+            prop_require = false;
+            services->add_prop_queue(this);
             services->service_inactive(this);
         }
     }
 
-    log_service_stopped(service_name);
+    // Start failure will have been logged already, only log if we are stopped for other reasons:
+    if (! start_failed) {
+        log_service_stopped(service_name);
+
+        // If this service chains to another, start the other service now:
+        if (! will_restart && ! start_on_completion.empty()) {
+            try {
+                auto chain_to = services->load_service(start_on_completion.c_str());
+                chain_to->start();
+            }
+            catch (service_load_exc &sle) {
+                log(loglevel_t::ERROR, "Couldn't chain to service ", start_on_completion, ": ",
+                        "couldn't load ", sle.service_name, ": ", sle.exc_description);
+            }
+            catch (std::bad_alloc &bae) {
+                log(loglevel_t::ERROR, "Couldn't chain to service ", start_on_completion,
+                        ": Out of memory");
+            }
+        }
+    }
     notify_listeners(service_event_t::STOPPED);
 }
 
-
 bool service_record::do_auto_restart() noexcept
 {
     if (auto_restart) {
-        return services->get_auto_restart();
+        return !services->is_shutting_down();
     }
     return false;
 }
 
-void service_record::emergency_stop() noexcept
-{
-    if (! do_auto_restart() && start_explicit) {
-        start_explicit = false;
-        release();
-    }
-    forced_stop();
-    stop_dependents();
-    stopped();
-}
-
-
 void service_record::require() noexcept
 {
     if (required_by++ == 0) {
         prop_require = !prop_release;
         prop_release = false;
         services->add_prop_queue(this);
+        if (service_state != service_state_t::STARTING && service_state != service_state_t::STARTED) {
+            prop_start = true;
+        }
     }
 }
 
-void service_record::release() noexcept
+void service_record::release(bool issue_stop) noexcept
 {
     if (--required_by == 0) {
         desired_state = service_state_t::STOPPED;
@@ -153,7 +161,8 @@ void service_record::release() noexcept
         if (service_state == service_state_t::STOPPED) {
             services->service_inactive(this);
         }
-        else {
+        else if (issue_stop) {
+               stop_reason = stopped_reason_t::NORMAL;
             do_stop();
         }
     }
@@ -164,8 +173,10 @@ void service_record::release_dependencies() noexcept
     for (auto & dependency : depends_on) {
         service_record * dep_to = dependency.get_to();
         if (dependency.holding_acq) {
-            dep_to->release();
+            // We must clear holding_acq before calling release, otherwise the dependency
+            // may decide to stop, check this link and release itself a second time.
             dependency.holding_acq = false;
+            dep_to->release();
         }
     }
 }
@@ -197,6 +208,8 @@ void service_record::start(bool activate) noexcept
         services->service_active(this);
     }
 
+    start_failed = false;
+    start_skipped = false;
     service_state = service_state_t::STARTING;
     waiting_for_deps = true;
 
@@ -223,6 +236,7 @@ void service_record::do_propagation() noexcept
     
     if (prop_failure) {
         prop_failure = false;
+        stop_reason = stopped_reason_t::DEPFAILED;
         failed_to_start(true);
     }
     
@@ -243,12 +257,12 @@ void service_record::execute_transition() noexcept
     if (service_state == service_state_t::STARTING || (service_state == service_state_t::STARTED
             && restarting)) {
         if (check_deps_started()) {
-            bool have_console = service_state == service_state_t::STARTED && onstart_flags.runs_on_console;
-            all_deps_started(have_console);
+            all_deps_started();
         }
     }
     else if (service_state == service_state_t::STOPPING) {
         if (stop_check_dependents()) {
+            waiting_for_deps = false;
             bring_down();
         }
     }
@@ -275,6 +289,8 @@ void service_record::do_start() noexcept
 
 void service_record::dependency_started() noexcept
 {
+    // Note that we check for STARTED state here in case the service is in smooth recovery while pinned.
+    // In that case it will wait for dependencies to start before restarting the process.
     if ((service_state == service_state_t::STARTING || service_state == service_state_t::STARTED)
             && waiting_for_deps) {
         services->add_transition_queue(this);
@@ -311,89 +327,9 @@ bool service_record::check_deps_started() noexcept
     return true;
 }
 
-bool service_record::open_socket() noexcept
-{
-    if (socket_path.empty() || socket_fd != -1) {
-        // No socket, or already open
-        return true;
-    }
-    
-    const char * saddrname = socket_path.c_str();
-    
-    // Check the specified socket path
-    struct stat stat_buf;
-    if (stat(saddrname, &stat_buf) == 0) {
-        if ((stat_buf.st_mode & S_IFSOCK) == 0) {
-            // Not a socket
-            log(loglevel_t::ERROR, service_name, ": Activation socket file exists (and is not a socket)");
-            return false;
-        }
-    }
-    else if (errno != ENOENT) {
-        // Other error
-        log(loglevel_t::ERROR, service_name, ": Error checking activation socket: ", strerror(errno));
-        return false;
-    }
-
-    // Remove stale socket file (if it exists).
-    // We won't test the return from unlink - if it fails other than due to ENOENT, we should get an
-    // error when we try to create the socket anyway.
-    unlink(saddrname);
-
-    uint sockaddr_size = offsetof(struct sockaddr_un, sun_path) + socket_path.length() + 1;
-    struct sockaddr_un * name = static_cast<sockaddr_un *>(malloc(sockaddr_size));
-    if (name == nullptr) {
-        log(loglevel_t::ERROR, service_name, ": Opening activation socket: out of memory");
-        return false;
-    }
-
-    name->sun_family = AF_UNIX;
-    strcpy(name->sun_path, saddrname);
-
-    int sockfd = dinit_socket(AF_UNIX, SOCK_STREAM, 0, SOCK_NONBLOCK | SOCK_CLOEXEC);
-    if (sockfd == -1) {
-        log(loglevel_t::ERROR, service_name, ": Error creating activation socket: ", strerror(errno));
-        free(name);
-        return false;
-    }
-
-    if (bind(sockfd, (struct sockaddr *) name, sockaddr_size) == -1) {
-        log(loglevel_t::ERROR, service_name, ": Error binding activation socket: ", strerror(errno));
-        close(sockfd);
-        free(name);
-        return false;
-    }
-    
-    free(name);
-    
-    // POSIX (1003.1, 2013) says that fchown and fchmod don't necessarily work on sockets. We have to
-    // use chown and chmod instead.
-    if (chown(saddrname, socket_uid, socket_gid)) {
-        log(loglevel_t::ERROR, service_name, ": Error setting activation socket owner/group: ", strerror(errno));
-        close(sockfd);
-        return false;
-    }
-    
-    if (chmod(saddrname, socket_perms) == -1) {
-        log(loglevel_t::ERROR, service_name, ": Error setting activation socket permissions: ", strerror(errno));
-        close(sockfd);
-        return false;
-    }
-
-    if (listen(sockfd, 128) == -1) { // 128 "seems reasonable".
-        log(loglevel_t::ERROR, ": Error listening on activation socket: ", strerror(errno));
-        close(sockfd);
-        return false;
-    }
-    
-    socket_fd = sockfd;
-    return true;
-}
-
-void service_record::all_deps_started(bool has_console) noexcept
+void service_record::all_deps_started() noexcept
 {
-    if (onstart_flags.starts_on_console && ! has_console) {
-        waiting_for_deps = true;
+    if (onstart_flags.starts_on_console && ! have_console) {
         queue_for_console();
         return;
     }
@@ -405,10 +341,6 @@ void service_record::all_deps_started(bool has_console) noexcept
         return;
     }
 
-    if (! open_socket()) {
-        failed_to_start();
-    }
-
     bool start_success = bring_up();
     if (! start_success) {
         failed_to_start();
@@ -417,12 +349,15 @@ void service_record::all_deps_started(bool has_console) noexcept
 
 void service_record::acquired_console() noexcept
 {
+    waiting_for_console = false;
+    have_console = true;
+
     if (service_state != service_state_t::STARTING) {
         // We got the console but no longer want it.
         release_console();
     }
     else if (check_deps_started()) {
-        all_deps_started(true);
+        all_deps_started();
     }
     else {
         // We got the console but can't use it yet.
@@ -430,11 +365,11 @@ void service_record::acquired_console() noexcept
     }
 }
 
-
 void service_record::started() noexcept
 {
-    if (onstart_flags.starts_on_console && ! onstart_flags.runs_on_console) {
-        tcsetpgrp(0, getpgrp());
+    // If we start on console but don't keep it, release it now:
+    if (have_console && ! onstart_flags.runs_on_console) {
+        bp_sys::tcsetpgrp(0, bp_sys::getpgrp());
         release_console();
     }
 
@@ -443,7 +378,7 @@ void service_record::started() noexcept
     notify_listeners(service_event_t::STARTED);
 
     if (onstart_flags.rw_ready) {
-        open_control_socket();
+        rootfs_is_rw();
     }
     if (onstart_flags.log_ready) {
         setup_external_log();
@@ -462,21 +397,18 @@ void service_record::started() noexcept
     }
 }
 
-void service_record::failed_to_start(bool depfailed) noexcept
+void service_record::failed_to_start(bool depfailed, bool immediate_stop) noexcept
 {
-    if (!depfailed && onstart_flags.starts_on_console) {
-        tcsetpgrp(0, getpgrp());
-        release_console();
+    if (waiting_for_console) {
+        services->unqueue_console(this);
+        waiting_for_console = false;
     }
-    
-    log_service_failed(get_name());
-    service_state = service_state_t::STOPPED;
+
     if (start_explicit) {
         start_explicit = false;
-        release();
+        release(false);
     }
-    notify_listeners(service_event_t::FAILEDSTART);
-    
+
     // Cancel start of dependents:
     for (auto & dept : dependents) {
         switch (dept->dep_type) {
@@ -493,11 +425,22 @@ void service_record::failed_to_start(bool depfailed) noexcept
                 dept->waiting_on = false;
                 dept->get_from()->dependency_started();
             }
-            if (dept->holding_acq) {
-                dept->holding_acq = false;
-                release();
-            }
         }
+
+        // Always release now, so that our desired state will be STOPPED before we call
+        // stopped() below (if we do so). Otherwise it may decide to restart us.
+        if (dept->holding_acq) {
+            dept->holding_acq = false;
+            release(false);
+        }
+    }
+
+    start_failed = true;
+    log_service_failed(get_name());
+    notify_listeners(service_event_t::FAILEDSTART);
+
+    if (immediate_stop) {
+        stopped();
     }
 }
 
@@ -513,7 +456,10 @@ void service_record::forced_stop() noexcept
 {
     if (service_state != service_state_t::STOPPED) {
         force_stop = true;
-        services->add_transition_queue(this);
+        if (! pinned_started) {
+            prop_stop = true;
+            services->add_prop_queue(this);
+        }
     }
 }
 
@@ -531,37 +477,45 @@ void service_record::stop(bool bring_down) noexcept
         release();
     }
 
-    if (bring_down) {
+    if (bring_down && service_state != service_state_t::STOPPED
+               && service_state != service_state_t::STOPPING) {
+       stop_reason = stopped_reason_t::NORMAL;
         do_stop();
     }
 }
 
 void service_record::do_stop() noexcept
 {
-    if (pinned_started) return;
-
+    // A service that does actually stop for any reason should have its explicit activation released, unless
+    // it will restart:
     if (start_explicit && ! do_auto_restart()) {
         start_explicit = false;
-        release();
-        if (required_by == 0) return; // release will re-call us anyway
+        release(false);
     }
 
     bool all_deps_stopped = stop_dependents();
 
     if (service_state != service_state_t::STARTED) {
         if (service_state == service_state_t::STARTING) {
-            if (! waiting_for_deps) {
+            // If waiting for a dependency, or waiting for the console, we can interrupt start. Otherwise,
+            // we need to delegate to can_interrupt_start() (which can be overridden).
+            if (! waiting_for_deps && ! waiting_for_console) {
                 if (! can_interrupt_start()) {
-                    // Well this is awkward: we're going to have to continue starting. We can stop once we've
-                    // reached the started state.
+                    // Well this is awkward: we're going to have to continue starting. We can stop once
+                    // we've reached the started state.
                     return;
                 }
 
                 if (! interrupt_start()) {
                     // Now wait for service startup to actually end; we don't need to handle it here.
+                    notify_listeners(service_event_t::STARTCANCELLED);
                     return;
                 }
             }
+            else if (waiting_for_console) {
+                services->unqueue_console(this);
+                waiting_for_console = false;
+            }
 
             // We must have had desired_state == STARTED.
             notify_listeners(service_event_t::STARTCANCELLED);
@@ -576,6 +530,8 @@ void service_record::do_stop() noexcept
         }
     }
 
+    if (pinned_started) return;
+
     service_state = service_state_t::STOPPING;
     waiting_for_deps = true;
     if (all_deps_stopped) {
@@ -600,7 +556,9 @@ bool service_record::stop_dependents() noexcept
 {
     bool all_deps_stopped = true;
     for (auto dept : dependents) {
-        if (dept->dep_type == dependency_type::REGULAR) {
+        if (dept->dep_type == dependency_type::REGULAR ||
+                (dept->dep_type == dependency_type::MILESTONE &&
+                dept->get_from()->service_state != service_state_t::STARTED)) {
             if (! dept->get_from()->is_stopped()) {
                 // Note we check *first* since if the dependent service is not stopped,
                 // 1. We will issue a stop to it shortly and
@@ -617,6 +575,19 @@ bool service_record::stop_dependents() noexcept
             dept->get_from()->prop_stop = true;
             services->add_prop_queue(dept->get_from());
         }
+        else {
+            // waits-for or soft dependency:
+            if (dept->waiting_on) {
+                dept->waiting_on = false;
+                dept->get_from()->dependency_started();
+            }
+            if (dept->holding_acq) {
+                dept->holding_acq = false;
+                // release without issuing stop, since we should be called only when this
+                // service is already stopped/stopping:
+                release(false);
+            }
+        }
     }
 
     return all_deps_stopped;
@@ -649,17 +620,18 @@ void service_record::unpin() noexcept
 
 void service_record::queue_for_console() noexcept
 {
+    waiting_for_console = true;
     services->append_console_queue(this);
 }
 
 void service_record::release_console() noexcept
 {
+    have_console = false;
     services->pull_console_queue();
 }
 
 bool service_record::interrupt_start() noexcept
 {
-    services->unqueue_console(this);
     return true;
 }