From: Davin McCall Date: Thu, 11 Jan 2018 09:04:54 +0000 (+0000) Subject: Split service.cc into multiple files. X-Git-Tag: v0.08~58 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=9cb0528e4de8f04f863db3b0d4b7410475334b0a;p=oweals%2Fdinit.git Split service.cc into multiple files. service.cc becomes service.cc + proc-service.cc + baseproc-service.cc. The header service.h becomes service.h + proc-service.h. This refactoring should make testing easier. --- diff --git a/src/Makefile b/src/Makefile index 54ad5d9..be9cc4f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,9 +4,11 @@ ifeq ($(BUILD_SHUTDOWN),yes) SHUTDOWN=shutdown endif -objects = dinit.o load_service.o service.o control.o dinit-log.o dinit-main.o dinitctl.o shutdown.o +#objects = dinit.o load_service.o service.o proc-service.o baseproc-service.o control.o dinit-log.o dinit-main.o dinitctl.o shutdown.o -dinit_objects = dinit.o load_service.o service.o control.o dinit-log.o dinit-main.o +dinit_objects = dinit.o load_service.o service.o proc-service.o baseproc-service.o control.o dinit-log.o dinit-main.o + +objects = $(dinit_objects) dinitctl.o shutdown.o all: dinit dinitctl $(SHUTDOWN) diff --git a/src/baseproc-service.cc b/src/baseproc-service.cc new file mode 100644 index 0000000..dc13afe --- /dev/null +++ b/src/baseproc-service.cc @@ -0,0 +1,318 @@ +#include "dinit-socket.h" +#include "proc-service.h" + +/* + * Base process implementation (base_process_service). + * + * See proc-service.h for interface documentation. + */ + +extern eventloop_t event_loop; + +using clock_type = dasynq::clock_type; +using rearm = dasynq::rearm; +using time_val = dasynq::time_val; + +void base_process_service::do_smooth_recovery() noexcept +{ + if (! 
restart_ps_process()) { + emergency_stop(); + services->process_queues(); + } +} + +bool base_process_service::bring_up() noexcept +{ + if (restarting) { + if (pid == -1) { + return restart_ps_process(); + } + return true; + } + else { + event_loop.get_time(restart_interval_time, clock_type::MONOTONIC); + restart_interval_count = 0; + if (start_ps_process(exec_arg_parts, onstart_flags.starts_on_console)) { + if (start_timeout != time_val(0,0)) { + restart_timer.arm_timer_rel(event_loop, start_timeout); + stop_timer_armed = true; + } + else if (stop_timer_armed) { + restart_timer.stop_timer(event_loop); + stop_timer_armed = false; + } + return true; + } + return false; + } +} + +bool base_process_service::start_ps_process(const std::vector &cmd, bool on_console) noexcept +{ + // In general, you can't tell whether fork/exec is successful. We use a pipe to communicate + // success/failure from the child to the parent. The pipe is set CLOEXEC so a successful + // exec closes the pipe, and the parent sees EOF. If the exec is unsuccessful, the errno + // is written to the pipe, and the parent can read it. 
+ + event_loop.get_time(last_start_time, clock_type::MONOTONIC); + + int pipefd[2]; + if (dasynq::pipe2(pipefd, O_CLOEXEC)) { + log(loglevel_t::ERROR, get_name(), ": can't create status check pipe: ", strerror(errno)); + return false; + } + + const char * logfile = this->logfile.c_str(); + if (*logfile == 0) { + logfile = "/dev/null"; + } + + bool child_status_registered = false; + control_conn_t *control_conn = nullptr; + + int control_socket[2] = {-1, -1}; + if (onstart_flags.pass_cs_fd) { + if (dinit_socketpair(AF_UNIX, SOCK_STREAM, /* protocol */ 0, control_socket, SOCK_NONBLOCK)) { + log(loglevel_t::ERROR, get_name(), ": can't create control socket: ", strerror(errno)); + goto out_p; + } + + // Make the server side socket close-on-exec: + int fdflags = fcntl(control_socket[0], F_GETFD); + fcntl(control_socket[0], F_SETFD, fdflags | FD_CLOEXEC); + + try { + control_conn = new control_conn_t(event_loop, services, control_socket[0]); + } + catch (std::exception &exc) { + log(loglevel_t::ERROR, get_name(), ": can't launch process; out of memory"); + goto out_cs; + } + } + + // Set up complete, now fork and exec: + + pid_t forkpid; + + try { + child_status_listener.add_watch(event_loop, pipefd[0], dasynq::IN_EVENTS); + child_status_registered = true; + + // We specify a high priority (i.e. low priority value) so that process termination is + // handled early. This means we have always recorded that the process is terminated by the + // time that we handle events that might otherwise cause us to signal the process, so we + // avoid sending a signal to an invalid (and possibly recycled) process ID. 
+ forkpid = child_listener.fork(event_loop, reserved_child_watch, dasynq::DEFAULT_PRIORITY - 10); + reserved_child_watch = true; + } + catch (std::exception &e) { + log(loglevel_t::ERROR, get_name(), ": Could not fork: ", e.what()); + goto out_cs_h; + } + + if (forkpid == 0) { + run_child_proc(cmd.data(), logfile, on_console, pipefd[1], control_socket[1]); + } + else { + // Parent process + close(pipefd[1]); // close the 'other end' fd + if (control_socket[1] != -1) { + close(control_socket[1]); + } + pid = forkpid; + + waiting_for_execstat = true; + return true; + } + + // Failure exit: + + out_cs_h: + if (child_status_registered) { + child_status_listener.deregister(event_loop); + } + + if (onstart_flags.pass_cs_fd) { + delete control_conn; + + out_cs: + close(control_socket[0]); + close(control_socket[1]); + } + + out_p: + close(pipefd[0]); + close(pipefd[1]); + + return false; +} + +void base_process_service::bring_down() noexcept +{ + waiting_for_deps = false; + if (pid != -1) { + // The process is still kicking on - must actually kill it. We signal the process + // group (-pid) rather than just the process as there's less risk then of creating + // an orphaned process group: + if (! onstart_flags.no_sigterm) { + kill_pg(SIGTERM); + } + if (term_signal != -1) { + kill_pg(term_signal); + } + + // In most cases, the rest is done in handle_exit_status. + // If we are a BGPROCESS and the process is not our immediate child, however, that + // won't work - check for this now: + if (get_type() == service_type_t::BGPROCESS && ! tracking_child) { + stopped(); + } + else if (stop_timeout != time_val(0,0)) { + restart_timer.arm_timer_rel(event_loop, stop_timeout); + stop_timer_armed = true; + } + } + else { + // The process is already dead. 
+ stopped(); + } +} + +base_process_service::base_process_service(service_set *sset, string name, + service_type_t service_type_p, string &&command, + std::list> &command_offsets, + const std::list &deplist_p) + : service_record(sset, name, service_type_p, deplist_p), child_listener(this), + child_status_listener(this), restart_timer(this) +{ + program_name = std::move(command); + exec_arg_parts = separate_args(program_name, command_offsets); + + restart_interval_count = 0; + restart_interval_time = {0, 0}; + restart_timer.service = this; + restart_timer.add_timer(event_loop); + + // By default, allow a maximum of 3 restarts within 10.0 seconds: + restart_interval.seconds() = 10; + restart_interval.nseconds() = 0; + max_restart_interval_count = 3; + + waiting_restart_timer = false; + reserved_child_watch = false; + tracking_child = false; + stop_timer_armed = false; + start_is_interruptible = false; +} + +void base_process_service::do_restart() noexcept +{ + waiting_restart_timer = false; + restart_interval_count++; + auto service_state = get_state(); + + // We may be STARTING (regular restart) or STARTED ("smooth recovery"). This affects whether + // the process should be granted access to the console: + bool on_console = service_state == service_state_t::STARTING + ? onstart_flags.starts_on_console : onstart_flags.runs_on_console; + + if (service_state == service_state_t::STARTING) { + // for a smooth recovery, we want to check dependencies are available before actually + // starting: + if (! check_deps_started()) { + waiting_for_deps = true; + return; + } + } + + if (! 
start_ps_process(exec_arg_parts, on_console)) { + restarting = false; + if (service_state == service_state_t::STARTING) { + failed_to_start(); + } + else { + // desired_state = service_state_t::STOPPED; + forced_stop(); + } + services->process_queues(); + } +} + +bool base_process_service::restart_ps_process() noexcept +{ + using time_val = dasynq::time_val; + + time_val current_time; + event_loop.get_time(current_time, clock_type::MONOTONIC); + + if (max_restart_interval_count != 0) { + // Check whether we're still in the most recent restart check interval: + time_val int_diff = current_time - restart_interval_time; + if (int_diff < restart_interval) { + if (restart_interval_count >= max_restart_interval_count) { + log(loglevel_t::ERROR, "Service ", get_name(), " restarting too quickly; stopping."); + return false; + } + } + else { + restart_interval_time = current_time; + restart_interval_count = 0; + } + } + + // Check if enough time has lapsed since the previous restart. If not, start a timer: + time_val tdiff = current_time - last_start_time; + if (restart_delay <= tdiff) { + // > restart delay (normally 200ms) + do_restart(); + } + else { + time_val timeout = restart_delay - tdiff; + restart_timer.arm_timer_rel(event_loop, timeout); + waiting_restart_timer = true; + } + return true; +} + +bool base_process_service::interrupt_start() noexcept +{ + if (waiting_restart_timer) { + restart_timer.stop_timer(event_loop); + waiting_restart_timer = false; + return service_record::interrupt_start(); + } + else { + log(loglevel_t::WARN, "Interrupting start of service ", get_name(), " with pid ", pid, " (with SIGINT)."); + kill_pg(SIGINT); + if (stop_timeout != time_val(0,0)) { + restart_timer.arm_timer_rel(event_loop, stop_timeout); + stop_timer_armed = true; + } + else if (stop_timer_armed) { + restart_timer.stop_timer(event_loop); + stop_timer_armed = false; + } + set_state(service_state_t::STOPPING); + notify_listeners(service_event_t::STARTCANCELLED); + return false; 
+ } +} + +void base_process_service::kill_with_fire() noexcept +{ + if (pid != -1) { + log(loglevel_t::WARN, "Service ", get_name(), " with pid ", pid, " exceeded allowed stop time; killing."); + kill_pg(SIGKILL); + } +} + +void base_process_service::kill_pg(int signo) noexcept +{ + pid_t pgid = getpgid(pid); + if (pgid == -1) { + // only should happen if pid is invalid, which should never happen... + log(loglevel_t::ERROR, get_name(), ": can't signal process: ", strerror(errno)); + return; + } + kill(-pgid, signo); +} diff --git a/src/dinit-util.h b/src/dinit-util.h index 947878e..77170f4 100644 --- a/src/dinit-util.h +++ b/src/dinit-util.h @@ -1,8 +1,12 @@ #ifndef DINIT_UTIL_H_INCLUDED #define DINIT_UTIL_H_INCLUDED 1 +#include #include +#include +#include + // Signal-safe read. Read and re-try if interrupted by signal (EINTR). // *May* affect errno even on a successful read (when the return is less than n). inline ssize_t ss_read(int fd, void * buf, size_t n) diff --git a/src/load_service.cc b/src/load_service.cc index 7a3578f..52937b0 100644 --- a/src/load_service.cc +++ b/src/load_service.cc @@ -10,7 +10,7 @@ #include #include -#include "service.h" +#include "proc-service.h" using string = std::string; using string_iterator = std::string::iterator; diff --git a/src/proc-service.cc b/src/proc-service.cc new file mode 100644 index 0000000..0aeb9e3 --- /dev/null +++ b/src/proc-service.cc @@ -0,0 +1,494 @@ +#include +#include + +#include "dinit-socket.h" +#include "dinit-util.h" +#include "proc-service.h" + +extern eventloop_t event_loop; + +using clock_type = dasynq::clock_type; +using rearm = dasynq::rearm; +using time_val = dasynq::time_val; + +rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept +{ + base_process_service *sr = service; + sr->waiting_for_execstat = false; + + int exec_status; + int r = read(get_watched_fd(), &exec_status, sizeof(int)); + deregister(loop); + close(get_watched_fd()); + + if (r > 0) { + // We read 
an errno code; exec() failed, and the service startup failed. + if (sr->pid != -1) { + sr->child_listener.deregister(event_loop, sr->pid); + sr->reserved_child_watch = false; + if (sr->stop_timer_armed) { + sr->restart_timer.stop_timer(loop); + sr->stop_timer_armed = false; + } + } + sr->pid = -1; + sr->exec_failed(exec_status); + } + else { + // exec() succeeded. + if (sr->get_type() == service_type_t::PROCESS) { + // This could be a smooth recovery (state already STARTED). Even more, the process + // might be stopped (and killed via a signal) during smooth recovery. We don't want to + // process startup again in either case, so we check for state STARTING: + if (sr->get_state() == service_state_t::STARTING) { + sr->started(); + } + else if (sr->get_state() == service_state_t::STOPPING) { + // stopping, but smooth recovery was in progress. That's now over so we can + // commence normal stop. Note that if pid == -1 the process already stopped(!), + // that's handled below. + if (sr->pid != -1 && sr->stop_check_dependents()) { + sr->bring_down(); + } + } + } + + if (sr->pid == -1) { + // Somehow the process managed to complete before we even saw the status. + sr->handle_exit_status(sr->exit_status); + } + } + + sr->services->process_queues(); + + return rearm::REMOVED; +} + +dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept +{ + base_process_service *sr = service; + + sr->pid = -1; + sr->exit_status = status; + + // Ok, for a process service, any process death which we didn't rig + // ourselves is a bit... unexpected. Probably, the child died because + // we asked it to (sr->service_state == STOPPING). But even if + // we didn't, there's not much we can do. + + if (sr->waiting_for_execstat) { + // We still don't have an exec() status from the forked child, wait for that + // before doing any further processing. 
+ return dasynq::rearm::NOOP; // hold watch reservation + } + + // Must stop watch now since handle_exit_status might result in re-launch: + // (stop_watch instead of deregister, so that we hold watch reservation). + stop_watch(loop); + + if (sr->stop_timer_armed) { + sr->restart_timer.stop_timer(loop); + sr->stop_timer_armed = false; + } + + sr->handle_exit_status(status); + return dasynq::rearm::NOOP; +} + +void process_service::handle_exit_status(int exit_status) noexcept +{ + bool did_exit = WIFEXITED(exit_status); + bool was_signalled = WIFSIGNALED(exit_status); + restarting = false; + auto service_state = get_state(); + + if (exit_status != 0 && service_state != service_state_t::STOPPING) { + if (did_exit) { + log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ", + WEXITSTATUS(exit_status)); + } + else if (was_signalled) { + log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ", + WTERMSIG(exit_status)); + } + } + + if (service_state == service_state_t::STARTING) { + if (did_exit && WEXITSTATUS(exit_status) == 0) { + started(); + } + else { + failed_to_start(); + } + } + else if (service_state == service_state_t::STOPPING) { + // We won't log a non-zero exit status or termination due to signal here - + // we assume that the process died because we signalled it. 
+ stopped(); + } + else if (smooth_recovery && service_state == service_state_t::STARTED + && get_target_state() == service_state_t::STARTED) { + do_smooth_recovery(); + return; + } + else { + emergency_stop(); + } + services->process_queues(); +} + +void process_service::exec_failed(int errcode) noexcept +{ + log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode)); + if (get_state() == service_state_t::STARTING) { + failed_to_start(); + } + else { + // Process service in smooth recovery: + emergency_stop(); + } +} + +void bgproc_service::handle_exit_status(int exit_status) noexcept +{ + begin: + bool did_exit = WIFEXITED(exit_status); + bool was_signalled = WIFSIGNALED(exit_status); + auto service_state = get_state(); + + if (exit_status != 0 && service_state != service_state_t::STOPPING) { + if (did_exit) { + log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ", + WEXITSTATUS(exit_status)); + } + else if (was_signalled) { + log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ", + WTERMSIG(exit_status)); + } + } + + // This may be a "smooth recovery" where we are restarting the process while leaving the + // service in the STARTED state. + if (restarting && service_state == service_state_t::STARTED) { + restarting = false; + bool need_stop = false; + if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) { + need_stop = true; + } + else { + // We need to re-read the PID, since it has now changed. + if (pid_file.length() != 0) { + auto pid_result = read_pid_file(&exit_status); + switch (pid_result) { + case pid_result_t::FAILED: + // Failed startup: no auto-restart. + need_stop = true; + break; + case pid_result_t::TERMINATED: + goto begin; + case pid_result_t::OK: + break; + } + } + } + + if (need_stop) { + // Failed startup: no auto-restart. 
+ emergency_stop(); + services->process_queues(); + } + + return; + } + + restarting = false; + if (service_state == service_state_t::STARTING) { + // POSIX requires that if the process exited cleanly with a status code of 0, + // the exit status value will be 0: + if (exit_status == 0) { + auto pid_result = read_pid_file(&exit_status); + switch (pid_result) { + case pid_result_t::FAILED: + // Failed startup: no auto-restart. + failed_to_start(); + break; + case pid_result_t::TERMINATED: + // started, but immediately terminated + started(); + goto begin; + case pid_result_t::OK: + started(); + break; + } + } + else { + failed_to_start(); + } + } + else if (service_state == service_state_t::STOPPING) { + // We won't log a non-zero exit status or termination due to signal here - + // we assume that the process died because we signalled it. + stopped(); + } + else { + // we must be STARTED + if (smooth_recovery && get_target_state() == service_state_t::STARTED) { + do_smooth_recovery(); + return; + } + if (! 
do_auto_restart() && start_explicit) { + start_explicit = false; + release(); + } + forced_stop(); + stop_dependents(); + stopped(); + } + services->process_queues(); +} + +void bgproc_service::exec_failed(int errcode) noexcept +{ + log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode)); + // Only time we execute is for startup: + failed_to_start(); +} + +void scripted_service::handle_exit_status(int exit_status) noexcept +{ + bool did_exit = WIFEXITED(exit_status); + bool was_signalled = WIFSIGNALED(exit_status); + auto service_state = get_state(); + + // For a scripted service, a termination occurs in one of three main cases: + // - the start script completed (or failed), when service was STARTING + // - the start script was interrupted to cancel startup; state is STOPPING + // - the stop script complete (or failed), state is STOPPING + + if (service_state == service_state_t::STOPPING) { + // We might be running the stop script, or we might be running the start script and have issued + // a cancel order via SIGINT: + if (did_exit && WEXITSTATUS(exit_status) == 0) { + if (interrupting_start) { + interrupting_start = false; + // launch stop script: + bring_down(); + } + else { + // We were running the stop script and finished successfully + stopped(); + } + } + else { + if (interrupting_start) { + // We issued a start interrupt, so we expected this failure: + if (did_exit) { + log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ", + WEXITSTATUS(exit_status)); + } + else if (was_signalled) { + log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ", + WTERMSIG(exit_status)); + } + } + else { + // ??? failed to stop! 
Let's log it as warning: + if (did_exit) { + log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ", + WEXITSTATUS(exit_status)); + } + else if (was_signalled) { + log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ", + WTERMSIG(exit_status)); + } + } + // Even if the stop script failed, assume that service is now stopped, so that any dependencies + // can be stopped. There's not really any other useful course of action here. + interrupting_start = false; + stopped(); + } + services->process_queues(); + } + else { // STARTING + if (exit_status == 0) { + started(); + } + else { + // failed to start + if (did_exit) { + log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ", + WEXITSTATUS(exit_status)); + } + else if (was_signalled) { + log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ", + WTERMSIG(exit_status)); + } + failed_to_start(); + } + services->process_queues(); + } +} + +void scripted_service::exec_failed(int errcode) noexcept +{ + log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode)); + auto service_state = get_state(); + if (service_state == service_state_t::STARTING) { + failed_to_start(); + } + else if (service_state == service_state_t::STOPPING) { + // We've logged the failure, but it's probably better not to leave the service in + // STOPPING state: + stopped(); + } +} + +bgproc_service::pid_result_t +bgproc_service::read_pid_file(int *exit_status) noexcept +{ + const char *pid_file_c = pid_file.c_str(); + int fd = open(pid_file_c, O_CLOEXEC); + if (fd == -1) { + log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno)); + return pid_result_t::FAILED; + } + + char pidbuf[21]; // just enough to hold any 64-bit integer + int r = ss_read(fd, pidbuf, 20); + if (r < 0) { + // Could not read from PID file + log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno)); + 
close(fd); + return pid_result_t::FAILED; + } + + close(fd); + pidbuf[r] = 0; // store nul terminator + + bool valid_pid = false; + try { + unsigned long long v = std::stoull(pidbuf, nullptr, 0); + if (v <= std::numeric_limits::max()) { + pid = (pid_t) v; + valid_pid = true; + } + } + catch (std::out_of_range &exc) { + // Too large? + } + catch (std::invalid_argument &exc) { + // Ok, so it doesn't look like a number: proceed... + } + + if (valid_pid) { + pid_t wait_r = waitpid(pid, exit_status, WNOHANG); + if (wait_r == -1 && errno == ECHILD) { + // We can't track this child - check process exists: + if (kill(pid, 0) == 0 || errno != ESRCH) { + tracking_child = false; + return pid_result_t::OK; + } + else { + log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid"); + pid = -1; + return pid_result_t::FAILED; + } + } + else if (wait_r == pid) { + pid = -1; + return pid_result_t::TERMINATED; + } + else if (wait_r == 0) { + // We can track the child + child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10); + tracking_child = true; + reserved_child_watch = true; + return pid_result_t::OK; + } + } + + log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid"); + pid = -1; + return pid_result_t::FAILED; +} + +void process_service::bring_down() noexcept +{ + waiting_for_deps = false; + if (waiting_for_execstat) { + // The process is still starting. This should be uncommon, but can occur during + // smooth recovery. We can't do much now; we have to wait until we get the + // status, and then act appropriately. + return; + } + else if (pid != -1) { + // The process is still kicking on - must actually kill it. We signal the process + // group (-pid) rather than just the process as there's less risk then of creating + // an orphaned process group: + if (! 
onstart_flags.no_sigterm) { + kill_pg(SIGTERM); + } + if (term_signal != -1) { + kill_pg(term_signal); + } + + // In most cases, the rest is done in handle_exit_status. + // If we are a BGPROCESS and the process is not our immediate child, however, that + // won't work - check for this now: + if (get_type() == service_type_t::BGPROCESS && ! tracking_child) { + stopped(); + } + else if (stop_timeout != time_val(0,0)) { + restart_timer.arm_timer_rel(event_loop, stop_timeout); + stop_timer_armed = true; + } + } + else { + // The process is already dead. + stopped(); + } +} + +void scripted_service::bring_down() noexcept +{ + waiting_for_deps = false; + if (stop_command.length() == 0) { + stopped(); + } + else if (! start_ps_process(stop_arg_parts, false)) { + // Couldn't execute stop script, but there's not much we can do: + stopped(); + } + else { + // successfully started stop script: start kill timer: + if (stop_timeout != time_val(0,0)) { + restart_timer.arm_timer_rel(event_loop, stop_timeout); + stop_timer_armed = true; + } + } +} + +dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count) +{ + service->stop_timer_armed = false; + + // Timer expires if: + // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are + // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting, + // including smooth recovery (restart timeout, state is STARTING or STARTED). + if (service->get_state() == service_state_t::STOPPING) { + service->kill_with_fire(); + } + else if (service->pid != -1) { + // Starting, start timed out. 
+ service->stop_dependents(); + service->interrupt_start(); + } + else { + // STARTING / STARTED, and we have no pid: must be restarting (smooth recovery if STARTED) + service->do_restart(); + } + + // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed: + return dasynq::rearm::NOOP; +} diff --git a/src/proc-service.h b/src/proc-service.h new file mode 100644 index 0000000..edba90e --- /dev/null +++ b/src/proc-service.h @@ -0,0 +1,247 @@ +#include "service.h" + +// Given a string and a list of pairs of (start,end) indices for each argument in that string, +// store a null terminator for the argument. Return a `char *` vector containing the beginning +// of each argument and a trailing nullptr. (The returned array is invalidated if the string is later modified). +static std::vector separate_args(std::string &s, std::list> &arg_indices) +{ + std::vector r; + r.reserve(arg_indices.size() + 1); + + // First store nul terminator for each part: + for (auto index_pair : arg_indices) { + if (index_pair.second < s.length()) { + s[index_pair.second] = 0; + } + } + + // Now we can get the C string (c_str) and store offsets into it: + const char * cstr = s.c_str(); + for (auto index_pair : arg_indices) { + r.push_back(cstr + index_pair.first); + } + r.push_back(nullptr); + return r; +} + +class base_process_service; + +// A timer for process restarting. Used to ensure a minimum delay between process restarts (and +// also for timing service stop before the SIGKILL hammer is used). 
+class process_restart_timer : public eventloop_t::timer_impl +{ + public: + base_process_service * service; + + process_restart_timer(base_process_service *service_p) + : service(service_p) + { + } + + dasynq::rearm timer_expiry(eventloop_t &, int expiry_count); +}; + +class base_process_service : public service_record +{ + friend class service_child_watcher; + friend class exec_status_pipe_watcher; + friend class process_restart_timer; + + private: + // Re-launch process + void do_restart() noexcept; + + protected: + string program_name; // storage for program/script and arguments + std::vector exec_arg_parts; // pointer to each argument/part of the program_name, and nullptr + + string stop_command; // storage for stop program/script and arguments + std::vector stop_arg_parts; // pointer to each argument/part of the stop_command, and nullptr + + service_child_watcher child_listener; + exec_status_pipe_watcher child_status_listener; + process_restart_timer restart_timer; + time_val last_start_time; + + // Restart interval time and restart count are used to track the number of automatic restarts + // over an interval. Too many restarts over an interval will inhibit further restarts. + time_val restart_interval_time; // current restart interval + int restart_interval_count; // count of restarts within current interval + + time_val restart_interval; // maximum restart interval + int max_restart_interval_count; // number of restarts allowed over maximum interval + time_val restart_delay; // delay between restarts + + // Time allowed for service stop, after which SIGKILL is sent. 0 to disable. + time_val stop_timeout = {10, 0}; // default of 10 seconds + + // Time allowed for service start, after which SIGINT is sent (and then SIGKILL after + // stop_timeout). 0 to disable. 
+ time_val start_timeout = {60, 0}; // default of 1 minute + + bool waiting_restart_timer : 1; + bool stop_timer_armed : 1; + bool reserved_child_watch : 1; + bool tracking_child : 1; // whether we expect to see child process status + bool start_is_interruptible : 1; // whether we can interrupt start + + // Launch the process with the given arguments, return true on success + bool start_ps_process(const std::vector &args, bool on_console) noexcept; + + // Restart the process (due to start failure or unexpected termination). Restarts will be + // rate-limited. + bool restart_ps_process() noexcept; + + // Perform smooth recovery process + void do_smooth_recovery() noexcept; + + // Start the process, return true on success + virtual bool bring_up() noexcept override; + + virtual void bring_down() noexcept override; + + // Called when the process exits. The exit_status is the status value yielded by + // the "wait" system call. + virtual void handle_exit_status(int exit_status) noexcept = 0; + + // Called if an exec fails. + virtual void exec_failed(int errcode) noexcept = 0; + + virtual bool can_interrupt_start() noexcept override + { + return waiting_restart_timer || start_is_interruptible || service_record::can_interrupt_start(); + } + + virtual bool can_proceed_to_start() noexcept override + { + return ! 
waiting_restart_timer; + } + + virtual bool interrupt_start() noexcept override; + + // Kill with SIGKILL + void kill_with_fire() noexcept; + + // Signal the process group of the service process + void kill_pg(int signo) noexcept; + + public: + base_process_service(service_set *sset, string name, service_type_t record_type_p, string &&command, + std::list> &command_offsets, + const std::list &deplist_p); + + ~base_process_service() noexcept + { + } + + // Set the stop command and arguments (may throw std::bad_alloc) + void set_stop_command(std::string command, std::list> &stop_command_offsets) + { + stop_command = command; + stop_arg_parts = separate_args(stop_command, stop_command_offsets); + } + + void set_restart_interval(timespec interval, int max_restarts) noexcept + { + restart_interval = interval; + max_restart_interval_count = max_restarts; + } + + void set_restart_delay(timespec delay) noexcept + { + restart_delay = delay; + } + + void set_stop_timeout(timespec timeout) noexcept + { + stop_timeout = timeout; + } + + void set_start_timeout(timespec timeout) noexcept + { + start_timeout = timeout; + } + + void set_start_interruptible(bool value) noexcept + { + start_is_interruptible = value; + } +}; + +class process_service : public base_process_service +{ + virtual void handle_exit_status(int exit_status) noexcept override; + virtual void exec_failed(int errcode) noexcept override; + virtual void bring_down() noexcept override; + + public: + process_service(service_set *sset, string name, string &&command, + std::list> &command_offsets, + std::list depends_p) + : base_process_service(sset, name, service_type_t::PROCESS, std::move(command), command_offsets, + depends_p) + { + } + + ~process_service() noexcept + { + } +}; + +class bgproc_service : public base_process_service +{ + virtual void handle_exit_status(int exit_status) noexcept override; + virtual void exec_failed(int errcode) noexcept override; + + enum class pid_result_t { + OK, + FAILED, // failed 
to read pid or read invalid pid + TERMINATED // read pid successfully, but the process already terminated + }; + + // Read the pid-file, return false on failure + pid_result_t read_pid_file(int *exit_status) noexcept; + + public: + bgproc_service(service_set *sset, string name, string &&command, + std::list> &command_offsets, + std::list depends_p) + : base_process_service(sset, name, service_type_t::BGPROCESS, std::move(command), command_offsets, + depends_p) + { + } + + ~bgproc_service() noexcept + { + } +}; + +class scripted_service : public base_process_service +{ + virtual void handle_exit_status(int exit_status) noexcept override; + virtual void exec_failed(int errcode) noexcept override; + virtual void bring_down() noexcept override; + + virtual bool interrupt_start() noexcept override + { + // if base::interrupt_start() returns false, then start hasn't been fully interrupted, but an + // interrupt has been issued: + interrupting_start = ! base_process_service::interrupt_start(); + return ! 
interrupting_start; + } + + bool interrupting_start : 1; // running start script (true) or stop script (false) + + public: + scripted_service(service_set *sset, string name, string &&command, + std::list> &command_offsets, + std::list depends_p) + : base_process_service(sset, name, service_type_t::SCRIPTED, std::move(command), command_offsets, + depends_p), interrupting_start(false) + { + } + + ~scripted_service() noexcept + { + } +}; diff --git a/src/service.cc b/src/service.cc index 3420b7d..8e06719 100644 --- a/src/service.cc +++ b/src/service.cc @@ -27,7 +27,6 @@ // from dinit.cc: void open_control_socket(bool report_ro_failure = true) noexcept; void setup_external_log() noexcept; -extern eventloop_t event_loop; using clock_type = dasynq::clock_type; using rearm = dasynq::rearm; @@ -117,36 +116,6 @@ void service_record::stopped() noexcept notify_listeners(service_event_t::STOPPED); } -dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept -{ - base_process_service *sr = service; - - sr->pid = -1; - sr->exit_status = status; - - // Ok, for a process service, any process death which we didn't rig - // ourselves is a bit... unexpected. Probably, the child died because - // we asked it to (sr->service_state == STOPPING). But even if - // we didn't, there's not much we can do. - - if (sr->waiting_for_execstat) { - // We still don't have an exec() status from the forked child, wait for that - // before doing any further processing. - return dasynq::rearm::NOOP; // hold watch reservation - } - - // Must stop watch now since handle_exit_status might result in re-launch: - // (stop_watch instead of deregister, so that we hold watch reservation). 
- stop_watch(loop); - - if (sr->stop_timer_armed) { - sr->restart_timer.stop_timer(loop); - sr->stop_timer_armed = false; - } - - sr->handle_exit_status(status); - return dasynq::rearm::NOOP; -} bool service_record::do_auto_restart() noexcept { @@ -167,314 +136,6 @@ void service_record::emergency_stop() noexcept stopped(); } -void base_process_service::do_smooth_recovery() noexcept -{ - if (! restart_ps_process()) { - emergency_stop(); - services->process_queues(); - } -} - -void process_service::handle_exit_status(int exit_status) noexcept -{ - bool did_exit = WIFEXITED(exit_status); - bool was_signalled = WIFSIGNALED(exit_status); - restarting = false; - auto service_state = get_state(); - - if (exit_status != 0 && service_state != service_state_t::STOPPING) { - if (did_exit) { - log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ", - WEXITSTATUS(exit_status)); - } - else if (was_signalled) { - log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ", - WTERMSIG(exit_status)); - } - } - - if (service_state == service_state_t::STARTING) { - if (did_exit && WEXITSTATUS(exit_status) == 0) { - started(); - } - else { - failed_to_start(); - } - } - else if (service_state == service_state_t::STOPPING) { - // We won't log a non-zero exit status or termination due to signal here - - // we assume that the process died because we signalled it. 
- stopped(); - } - else if (smooth_recovery && service_state == service_state_t::STARTED - && get_target_state() == service_state_t::STARTED) { - do_smooth_recovery(); - return; - } - else { - emergency_stop(); - } - services->process_queues(); -} - -void process_service::exec_failed(int errcode) noexcept -{ - log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode)); - if (get_state() == service_state_t::STARTING) { - failed_to_start(); - } - else { - // Process service in smooth recovery: - emergency_stop(); - } -} - -void bgproc_service::handle_exit_status(int exit_status) noexcept -{ - begin: - bool did_exit = WIFEXITED(exit_status); - bool was_signalled = WIFSIGNALED(exit_status); - auto service_state = get_state(); - - if (exit_status != 0 && service_state != service_state_t::STOPPING) { - if (did_exit) { - log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ", - WEXITSTATUS(exit_status)); - } - else if (was_signalled) { - log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ", - WTERMSIG(exit_status)); - } - } - - // This may be a "smooth recovery" where we are restarting the process while leaving the - // service in the STARTED state. - if (restarting && service_state == service_state_t::STARTED) { - restarting = false; - bool need_stop = false; - if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) { - need_stop = true; - } - else { - // We need to re-read the PID, since it has now changed. - if (pid_file.length() != 0) { - auto pid_result = read_pid_file(&exit_status); - switch (pid_result) { - case pid_result_t::FAILED: - // Failed startup: no auto-restart. - need_stop = true; - break; - case pid_result_t::TERMINATED: - goto begin; - case pid_result_t::OK: - break; - } - } - } - - if (need_stop) { - // Failed startup: no auto-restart. 
- emergency_stop(); - services->process_queues(); - } - - return; - } - - restarting = false; - if (service_state == service_state_t::STARTING) { - // POSIX requires that if the process exited clearly with a status code of 0, - // the exit status value will be 0: - if (exit_status == 0) { - auto pid_result = read_pid_file(&exit_status); - switch (pid_result) { - case pid_result_t::FAILED: - // Failed startup: no auto-restart. - failed_to_start(); - break; - case pid_result_t::TERMINATED: - // started, but immediately terminated - started(); - goto begin; - case pid_result_t::OK: - started(); - break; - } - } - else { - failed_to_start(); - } - } - else if (service_state == service_state_t::STOPPING) { - // We won't log a non-zero exit status or termination due to signal here - - // we assume that the process died because we signalled it. - stopped(); - } - else { - // we must be STARTED - if (smooth_recovery && get_target_state() == service_state_t::STARTED) { - do_smooth_recovery(); - return; - } - if (! 
do_auto_restart() && start_explicit) { - start_explicit = false; - release(); - } - forced_stop(); - stop_dependents(); - stopped(); - } - services->process_queues(); -} - -void bgproc_service::exec_failed(int errcode) noexcept -{ - log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode)); - // Only time we execute is for startup: - failed_to_start(); -} - -void scripted_service::handle_exit_status(int exit_status) noexcept -{ - bool did_exit = WIFEXITED(exit_status); - bool was_signalled = WIFSIGNALED(exit_status); - auto service_state = get_state(); - - // For a scripted service, a termination occurs in one of three main cases: - // - the start script completed (or failed), when service was STARTING - // - the start script was interrupted to cancel startup; state is STOPPING - // - the stop script complete (or failed), state is STOPPING - - if (service_state == service_state_t::STOPPING) { - // We might be running the stop script, or we might be running the start script and have issued - // a cancel order via SIGINT: - if (did_exit && WEXITSTATUS(exit_status) == 0) { - if (interrupting_start) { - interrupting_start = false; - // launch stop script: - bring_down(); - } - else { - // We were running the stop script and finished successfully - stopped(); - } - } - else { - if (interrupting_start) { - // We issued a start interrupt, so we expected this failure: - if (did_exit) { - log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ", - WEXITSTATUS(exit_status)); - } - else if (was_signalled) { - log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ", - WTERMSIG(exit_status)); - } - } - else { - // ??? failed to stop! 
Let's log it as warning: - if (did_exit) { - log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ", - WEXITSTATUS(exit_status)); - } - else if (was_signalled) { - log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ", - WTERMSIG(exit_status)); - } - } - // Even if the stop script failed, assume that service is now stopped, so that any dependencies - // can be stopped. There's not really any other useful course of action here. - interrupting_start = false; - stopped(); - } - services->process_queues(); - } - else { // STARTING - if (exit_status == 0) { - started(); - } - else { - // failed to start - if (did_exit) { - log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ", - WEXITSTATUS(exit_status)); - } - else if (was_signalled) { - log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ", - WTERMSIG(exit_status)); - } - failed_to_start(); - } - services->process_queues(); - } -} - -void scripted_service::exec_failed(int errcode) noexcept -{ - log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode)); - auto service_state = get_state(); - if (service_state == service_state_t::STARTING) { - failed_to_start(); - } - else if (service_state == service_state_t::STOPPING) { - // We've logged the failure, but it's probably better not to leave the service in - // STOPPING state: - stopped(); - } -} - -rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept -{ - base_process_service *sr = service; - sr->waiting_for_execstat = false; - - int exec_status; - int r = read(get_watched_fd(), &exec_status, sizeof(int)); - deregister(loop); - close(get_watched_fd()); - - if (r > 0) { - // We read an errno code; exec() failed, and the service startup failed. 
- if (sr->pid != -1) { - sr->child_listener.deregister(event_loop, sr->pid); - sr->reserved_child_watch = false; - if (sr->stop_timer_armed) { - sr->restart_timer.stop_timer(loop); - sr->stop_timer_armed = false; - } - } - sr->pid = -1; - sr->exec_failed(exec_status); - } - else { - // exec() succeeded. - if (sr->get_type() == service_type_t::PROCESS) { - // This could be a smooth recovery (state already STARTED). Even more, the process - // might be stopped (and killed via a signal) during smooth recovery. We don't to - // process startup again in either case, so we check for state STARTING: - if (sr->get_state() == service_state_t::STARTING) { - sr->started(); - } - else if (sr->get_state() == service_state_t::STOPPING) { - // stopping, but smooth recovery was in process. That's now over so we can - // commence normal stop. Note that if pid == -1 the process already stopped(!), - // that's handled below. - if (sr->pid != -1 && sr->stop_check_dependents()) { - sr->bring_down(); - } - } - } - - if (sr->pid == -1) { - // Somehow the process managed to complete before we even saw the status. 
- sr->handle_exit_status(sr->exit_status); - } - } - - sr->services->process_queues(); - - return rearm::REMOVED; -} void service_record::require() noexcept { @@ -776,74 +437,6 @@ void service_record::acquired_console() noexcept } } -bgproc_service::pid_result_t -bgproc_service::read_pid_file(int *exit_status) noexcept -{ - const char *pid_file_c = pid_file.c_str(); - int fd = open(pid_file_c, O_CLOEXEC); - if (fd == -1) { - log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno)); - return pid_result_t::FAILED; - } - - char pidbuf[21]; // just enough to hold any 64-bit integer - int r = ss_read(fd, pidbuf, 20); - if (r < 0) { - // Could not read from PID file - log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno)); - close(fd); - return pid_result_t::FAILED; - } - - close(fd); - pidbuf[r] = 0; // store nul terminator - - bool valid_pid = false; - try { - unsigned long long v = std::stoull(pidbuf, nullptr, 0); - if (v <= std::numeric_limits::max()) { - pid = (pid_t) v; - valid_pid = true; - } - } - catch (std::out_of_range &exc) { - // Too large? - } - catch (std::invalid_argument &exc) { - // Ok, so it doesn't look like a number: proceed... 
- } - - if (valid_pid) { - pid_t wait_r = waitpid(pid, exit_status, WNOHANG); - if (wait_r == -1 && errno == ECHILD) { - // We can't track this child - check process exists: - if (kill(pid, 0) == 0 || errno != ESRCH) { - tracking_child = false; - return pid_result_t::OK; - } - else { - log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid"); - pid = -1; - return pid_result_t::FAILED; - } - } - else if (wait_r == pid) { - pid = -1; - return pid_result_t::TERMINATED; - } - else if (wait_r == 0) { - // We can track the child - child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10); - tracking_child = true; - reserved_child_watch = true; - return pid_result_t::OK; - } - } - - log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid"); - pid = -1; - return pid_result_t::FAILED; -} void service_record::started() noexcept { @@ -922,131 +515,6 @@ bool service_record::bring_up() noexcept return true; } -bool base_process_service::bring_up() noexcept -{ - if (restarting) { - if (pid == -1) { - return restart_ps_process(); - } - return true; - } - else { - event_loop.get_time(restart_interval_time, clock_type::MONOTONIC); - restart_interval_count = 0; - if (start_ps_process(exec_arg_parts, onstart_flags.starts_on_console)) { - if (start_timeout != time_val(0,0)) { - restart_timer.arm_timer_rel(event_loop, start_timeout); - stop_timer_armed = true; - } - else if (stop_timer_armed) { - restart_timer.stop_timer(event_loop); - stop_timer_armed = false; - } - return true; - } - return false; - } -} - -bool base_process_service::start_ps_process(const std::vector &cmd, bool on_console) noexcept -{ - // In general, you can't tell whether fork/exec is successful. We use a pipe to communicate - // success/failure from the child to the parent. The pipe is set CLOEXEC so a successful - // exec closes the pipe, and the parent sees EOF. 
If the exec is unsuccessful, the errno - // is written to the pipe, and the parent can read it. - - event_loop.get_time(last_start_time, clock_type::MONOTONIC); - - int pipefd[2]; - if (dasynq::pipe2(pipefd, O_CLOEXEC)) { - log(loglevel_t::ERROR, get_name(), ": can't create status check pipe: ", strerror(errno)); - return false; - } - - const char * logfile = this->logfile.c_str(); - if (*logfile == 0) { - logfile = "/dev/null"; - } - - bool child_status_registered = false; - control_conn_t *control_conn = nullptr; - - int control_socket[2] = {-1, -1}; - if (onstart_flags.pass_cs_fd) { - if (dinit_socketpair(AF_UNIX, SOCK_STREAM, /* protocol */ 0, control_socket, SOCK_NONBLOCK)) { - log(loglevel_t::ERROR, get_name(), ": can't create control socket: ", strerror(errno)); - goto out_p; - } - - // Make the server side socket close-on-exec: - int fdflags = fcntl(control_socket[0], F_GETFD); - fcntl(control_socket[0], F_SETFD, fdflags | FD_CLOEXEC); - - try { - control_conn = new control_conn_t(event_loop, services, control_socket[0]); - } - catch (std::exception &exc) { - log(loglevel_t::ERROR, get_name(), ": can't launch process; out of memory"); - goto out_cs; - } - } - - // Set up complete, now fork and exec: - - pid_t forkpid; - - try { - child_status_listener.add_watch(event_loop, pipefd[0], dasynq::IN_EVENTS); - child_status_registered = true; - - // We specify a high priority (i.e. low priority value) so that process termination is - // handled early. This means we have always recorded that the process is terminated by the - // time that we handle events that might otherwise cause us to signal the process, so we - // avoid sending a signal to an invalid (and possibly recycled) process ID. 
- forkpid = child_listener.fork(event_loop, reserved_child_watch, dasynq::DEFAULT_PRIORITY - 10); - reserved_child_watch = true; - } - catch (std::exception &e) { - log(loglevel_t::ERROR, get_name(), ": Could not fork: ", e.what()); - goto out_cs_h; - } - - if (forkpid == 0) { - run_child_proc(cmd.data(), logfile, on_console, pipefd[1], control_socket[1]); - } - else { - // Parent process - close(pipefd[1]); // close the 'other end' fd - if (control_socket[1] != -1) { - close(control_socket[1]); - } - pid = forkpid; - - waiting_for_execstat = true; - return true; - } - - // Failure exit: - - out_cs_h: - if (child_status_registered) { - child_status_listener.deregister(event_loop); - } - - if (onstart_flags.pass_cs_fd) { - delete control_conn; - - out_cs: - close(control_socket[0]); - close(control_socket[1]); - } - - out_p: - close(pipefd[0]); - close(pipefd[1]); - - return false; -} void service_record::run_child_proc(const char * const *args, const char *logfile, bool on_console, int wpipefd, int csfd) noexcept @@ -1289,104 +757,6 @@ void service_record::bring_down() noexcept stopped(); } -void base_process_service::kill_pg(int signo) noexcept -{ - pid_t pgid = getpgid(pid); - if (pgid == -1) { - // only should happen if pid is invalid, which should never happen... - log(loglevel_t::ERROR, get_name(), ": can't signal process: ", strerror(errno)); - return; - } - kill(-pgid, signo); -} - -void base_process_service::bring_down() noexcept -{ - waiting_for_deps = false; - if (pid != -1) { - // The process is still kicking on - must actually kill it. We signal the process - // group (-pid) rather than just the process as there's less risk then of creating - // an orphaned process group: - if (! onstart_flags.no_sigterm) { - kill_pg(SIGTERM); - } - if (term_signal != -1) { - kill_pg(term_signal); - } - - // In most cases, the rest is done in handle_exit_status. 
- // If we are a BGPROCESS and the process is not our immediate child, however, that - // won't work - check for this now: - if (get_type() == service_type_t::BGPROCESS && ! tracking_child) { - stopped(); - } - else if (stop_timeout != time_val(0,0)) { - restart_timer.arm_timer_rel(event_loop, stop_timeout); - stop_timer_armed = true; - } - } - else { - // The process is already dead. - stopped(); - } -} - -void process_service::bring_down() noexcept -{ - waiting_for_deps = false; - if (waiting_for_execstat) { - // The process is still starting. This should be uncommon, but can occur during - // smooth recovery. We can't do much now; we have to wait until we get the - // status, and then act appropriately. - return; - } - else if (pid != -1) { - // The process is still kicking on - must actually kill it. We signal the process - // group (-pid) rather than just the process as there's less risk then of creating - // an orphaned process group: - if (! onstart_flags.no_sigterm) { - kill_pg(SIGTERM); - } - if (term_signal != -1) { - kill_pg(term_signal); - } - - // In most cases, the rest is done in handle_exit_status. - // If we are a BGPROCESS and the process is not our immediate child, however, that - // won't work - check for this now: - if (get_type() == service_type_t::BGPROCESS && ! tracking_child) { - stopped(); - } - else if (stop_timeout != time_val(0,0)) { - restart_timer.arm_timer_rel(event_loop, stop_timeout); - stop_timer_armed = true; - } - } - else { - // The process is already dead. - stopped(); - } -} - -void scripted_service::bring_down() noexcept -{ - waiting_for_deps = false; - if (stop_command.length() == 0) { - stopped(); - } - else if (! 
start_ps_process(stop_arg_parts, false)) { - // Couldn't execute stop script, but there's not much we can do: - stopped(); - } - else { - // successfully started stop script: start kill timer: - if (stop_timeout != time_val(0,0)) { - restart_timer.arm_timer_rel(event_loop, stop_timeout); - stop_timer_armed = true; - } - } -} - void service_record::unpin() noexcept { if (pinned_started) { @@ -1430,156 +800,3 @@ void service_set::service_inactive(service_record *sr) noexcept { active_services--; } - -base_process_service::base_process_service(service_set *sset, string name, - service_type_t service_type_p, string &&command, - std::list> &command_offsets, - const std::list &deplist_p) - : service_record(sset, name, service_type_p, deplist_p), child_listener(this), - child_status_listener(this), restart_timer(this) -{ - program_name = std::move(command); - exec_arg_parts = separate_args(program_name, command_offsets); - - restart_interval_count = 0; - restart_interval_time = {0, 0}; - restart_timer.service = this; - restart_timer.add_timer(event_loop); - - // By default, allow a maximum of 3 restarts within 10.0 seconds: - restart_interval.seconds() = 10; - restart_interval.nseconds() = 0; - max_restart_interval_count = 3; - - waiting_restart_timer = false; - reserved_child_watch = false; - tracking_child = false; - stop_timer_armed = false; - start_is_interruptible = false; -} - -void base_process_service::do_restart() noexcept -{ - waiting_restart_timer = false; - restart_interval_count++; - auto service_state = get_state(); - - // We may be STARTING (regular restart) or STARTED ("smooth recovery"). This affects whether - // the process should be granted access to the console: - bool on_console = service_state == service_state_t::STARTING - ? 
onstart_flags.starts_on_console : onstart_flags.runs_on_console; - - if (service_state == service_state_t::STARTING) { - // for a smooth recovery, we want to check dependencies are available before actually - // starting: - if (! check_deps_started()) { - waiting_for_deps = true; - return; - } - } - - if (! start_ps_process(exec_arg_parts, on_console)) { - restarting = false; - if (service_state == service_state_t::STARTING) { - failed_to_start(); - } - else { - // desired_state = service_state_t::STOPPED; - forced_stop(); - } - services->process_queues(); - } -} - -bool base_process_service::restart_ps_process() noexcept -{ - using time_val = dasynq::time_val; - - time_val current_time; - event_loop.get_time(current_time, clock_type::MONOTONIC); - - if (max_restart_interval_count != 0) { - // Check whether we're still in the most recent restart check interval: - time_val int_diff = current_time - restart_interval_time; - if (int_diff < restart_interval) { - if (restart_interval_count >= max_restart_interval_count) { - log(loglevel_t::ERROR, "Service ", get_name(), " restarting too quickly; stopping."); - return false; - } - } - else { - restart_interval_time = current_time; - restart_interval_count = 0; - } - } - - // Check if enough time has lapsed since the prevous restart. 
If not, start a timer: - time_val tdiff = current_time - last_start_time; - if (restart_delay <= tdiff) { - // > restart delay (normally 200ms) - do_restart(); - } - else { - time_val timeout = restart_delay - tdiff; - restart_timer.arm_timer_rel(event_loop, timeout); - waiting_restart_timer = true; - } - return true; -} - -bool base_process_service::interrupt_start() noexcept -{ - if (waiting_restart_timer) { - restart_timer.stop_timer(event_loop); - waiting_restart_timer = false; - return service_record::interrupt_start(); - } - else { - log(loglevel_t::WARN, "Interrupting start of service ", get_name(), " with pid ", pid, " (with SIGINT)."); - kill_pg(SIGINT); - if (stop_timeout != time_val(0,0)) { - restart_timer.arm_timer_rel(event_loop, stop_timeout); - stop_timer_armed = true; - } - else if (stop_timer_armed) { - restart_timer.stop_timer(event_loop); - stop_timer_armed = false; - } - set_state(service_state_t::STOPPING); - notify_listeners(service_event_t::STARTCANCELLED); - return false; - } -} - -void base_process_service::kill_with_fire() noexcept -{ - if (pid != -1) { - log(loglevel_t::WARN, "Service ", get_name(), " with pid ", pid, " exceeded allowed stop time; killing."); - kill_pg(SIGKILL); - } -} - -dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count) -{ - service->stop_timer_armed = false; - - // Timer expires if: - // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are - // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting, - // including smooth recovery (restart timeout, state is STARTING or STARTED). - if (service->get_state() == service_state_t::STOPPING) { - service->kill_with_fire(); - } - else if (service->pid != -1) { - // Starting, start timed out. 
- service->stop_dependents(); - service->interrupt_start(); - } - else { - // STARTING / STARTED, and we have a pid: must be restarting (smooth recovery if STARTED) - service->do_restart(); - } - - // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed: - return dasynq::rearm::NOOP; -} diff --git a/src/service.h b/src/service.h index 2e77a79..de20521 100644 --- a/src/service.h +++ b/src/service.h @@ -213,30 +213,6 @@ class prelim_dep } }; -// Given a string and a list of pairs of (start,end) indices for each argument in that string, -// store a null terminator for the argument. Return a `char *` vector containing the beginning -// of each argument and a trailing nullptr. (The returned array is invalidated if the string is later modified). -static std::vector separate_args(std::string &s, std::list> &arg_indices) -{ - std::vector r; - r.reserve(arg_indices.size() + 1); - - // First store nul terminator for each part: - for (auto index_pair : arg_indices) { - if (index_pair.second < s.length()) { - s[index_pair.second] = 0; - } - } - - // Now we can get the C string (c_str) and store offsets into it: - const char * cstr = s.c_str(); - for (auto index_pair : arg_indices) { - r.push_back(cstr + index_pair.first); - } - r.push_back(nullptr); - return r; -} - class service_child_watcher : public eventloop_t::child_proc_watcher_impl { public: @@ -602,228 +578,6 @@ class service_record } }; -class base_process_service; - -// A timer for process restarting. Used to ensure a minimum delay between process restarts (and -// also for timing service stop before the SIGKILL hammer is used). 
-class process_restart_timer : public eventloop_t::timer_impl -{ - public: - base_process_service * service; - - process_restart_timer(base_process_service *service_p) - : service(service_p) - { - } - - dasynq::rearm timer_expiry(eventloop_t &, int expiry_count); -}; - -class base_process_service : public service_record -{ - friend class service_child_watcher; - friend class exec_status_pipe_watcher; - friend class process_restart_timer; - - private: - // Re-launch process - void do_restart() noexcept; - - protected: - string program_name; // storage for program/script and arguments - std::vector exec_arg_parts; // pointer to each argument/part of the program_name, and nullptr - - string stop_command; // storage for stop program/script and arguments - std::vector stop_arg_parts; // pointer to each argument/part of the stop_command, and nullptr - - service_child_watcher child_listener; - exec_status_pipe_watcher child_status_listener; - process_restart_timer restart_timer; - time_val last_start_time; - - // Restart interval time and restart count are used to track the number of automatic restarts - // over an interval. Too many restarts over an interval will inhibit further restarts. - time_val restart_interval_time; // current restart interval - int restart_interval_count; // count of restarts within current interval - - time_val restart_interval; // maximum restart interval - int max_restart_interval_count; // number of restarts allowed over maximum interval - time_val restart_delay; // delay between restarts - - // Time allowed for service stop, after which SIGKILL is sent. 0 to disable. - time_val stop_timeout = {10, 0}; // default of 10 seconds - - // Time allowed for service start, after which SIGINT is sent (and then SIGKILL after - // ). 0 to disable. 
- time_val start_timeout = {60, 0}; // default of 1 minute - - bool waiting_restart_timer : 1; - bool stop_timer_armed : 1; - bool reserved_child_watch : 1; - bool tracking_child : 1; // whether we expect to see child process status - bool start_is_interruptible : 1; // whether we can interrupt start - - // Launch the process with the given arguments, return true on success - bool start_ps_process(const std::vector &args, bool on_console) noexcept; - - // Restart the process (due to start failure or unexpected termination). Restarts will be - // rate-limited. - bool restart_ps_process() noexcept; - - // Perform smooth recovery process - void do_smooth_recovery() noexcept; - - // Start the process, return true on success - virtual bool bring_up() noexcept override; - - virtual void bring_down() noexcept override; - - // Called when the process exits. The exit_status is the status value yielded by - // the "wait" system call. - virtual void handle_exit_status(int exit_status) noexcept = 0; - - // Called if an exec fails. - virtual void exec_failed(int errcode) noexcept = 0; - - virtual bool can_interrupt_start() noexcept override - { - return waiting_restart_timer || start_is_interruptible || service_record::can_interrupt_start(); - } - - virtual bool can_proceed_to_start() noexcept override - { - return ! 
waiting_restart_timer; - } - - virtual bool interrupt_start() noexcept override; - - // Kill with SIGKILL - void kill_with_fire() noexcept; - - // Signal the process group of the service process - void kill_pg(int signo) noexcept; - - public: - base_process_service(service_set *sset, string name, service_type_t record_type_p, string &&command, - std::list> &command_offsets, - const std::list &deplist_p); - - ~base_process_service() noexcept - { - } - - // Set the stop command and arguments (may throw std::bad_alloc) - void set_stop_command(std::string command, std::list> &stop_command_offsets) - { - stop_command = command; - stop_arg_parts = separate_args(stop_command, stop_command_offsets); - } - - void set_restart_interval(timespec interval, int max_restarts) noexcept - { - restart_interval = interval; - max_restart_interval_count = max_restarts; - } - - void set_restart_delay(timespec delay) noexcept - { - restart_delay = delay; - } - - void set_stop_timeout(timespec timeout) noexcept - { - stop_timeout = timeout; - } - - void set_start_timeout(timespec timeout) noexcept - { - start_timeout = timeout; - } - - void set_start_interruptible(bool value) noexcept - { - start_is_interruptible = value; - } -}; - -class process_service : public base_process_service -{ - virtual void handle_exit_status(int exit_status) noexcept override; - virtual void exec_failed(int errcode) noexcept override; - virtual void bring_down() noexcept override; - - public: - process_service(service_set *sset, string name, string &&command, - std::list> &command_offsets, - std::list depends_p) - : base_process_service(sset, name, service_type_t::PROCESS, std::move(command), command_offsets, - depends_p) - { - } - - ~process_service() noexcept - { - } -}; - -class bgproc_service : public base_process_service -{ - virtual void handle_exit_status(int exit_status) noexcept override; - virtual void exec_failed(int errcode) noexcept override; - - enum class pid_result_t { - OK, - FAILED, // failed 
to read pid or read invalid pid - TERMINATED // read pid successfully, but the process already terminated - }; - - // Read the pid-file, return false on failure - pid_result_t read_pid_file(int *exit_status) noexcept; - - public: - bgproc_service(service_set *sset, string name, string &&command, - std::list> &command_offsets, - std::list depends_p) - : base_process_service(sset, name, service_type_t::BGPROCESS, std::move(command), command_offsets, - depends_p) - { - } - - ~bgproc_service() noexcept - { - } -}; - -class scripted_service : public base_process_service -{ - virtual void handle_exit_status(int exit_status) noexcept override; - virtual void exec_failed(int errcode) noexcept override; - virtual void bring_down() noexcept override; - - virtual bool interrupt_start() noexcept override - { - // if base::interrupt_start() returns false, then start hasn't been fully interrupted, but an - // interrupt has been issued: - interrupting_start = ! base_process_service::interrupt_start(); - return ! interrupting_start; - } - - bool interrupting_start : 1; // running start script (true) or stop script (false) - - public: - scripted_service(service_set *sset, string name, string &&command, - std::list> &command_offsets, - std::list depends_p) - : base_process_service(sset, name, service_type_t::SCRIPTED, std::move(command), command_offsets, - depends_p), interrupting_start(false) - { - } - - ~scripted_service() noexcept - { - } -}; - inline auto extract_prop_queue(service_record *sr) -> decltype(sr->prop_queue_node) & { return sr->prop_queue_node;