2 #include <sys/socket.h>
4 #include "dinit-socket.h"
5 #include "dinit-util.h"
6 #include "proc-service.h"
8 extern eventloop_t event_loop;
10 using clock_type = dasynq::clock_type;
11 using rearm = dasynq::rearm;
12 using time_val = dasynq::time_val;
// Event handler for the pipe over which a freshly forked child reports the outcome of exec().
// It clears the service's waiting_for_execstat flag, reads an int-sized status from the watched
// fd and closes that fd. Depending on what was read and on the service type/state it then
// either reports the exec() failure to the service or carries on with start/stop processing.
// The service set's queues are processed before returning, and the watcher always returns
// rearm::REMOVED.
14 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
16 base_process_service *sr = service;
17 sr->waiting_for_execstat = false;
// Read at most sizeof(int) bytes into exec_status; r receives read()'s return value.
20 int r = read(get_watched_fd(), &exec_status, sizeof(int));
// The pipe is only read once; close our end right away.
22 close(get_watched_fd());
25 // We read an errno code; exec() failed, and the service startup failed.
// Drop the child watch for this pid and release the watch reservation flag.
27 sr->child_listener.deregister(event_loop, sr->pid);
28 sr->reserved_child_watch = false;
// A pending timer on restart_timer is no longer relevant once exec() has failed.
29 if (sr->stop_timer_armed) {
30 sr->restart_timer.stop_timer(loop);
31 sr->stop_timer_armed = false;
// Hand the errno value read from the pipe to the service-type-specific failure handler.
35 sr->exec_failed(exec_status);
39 if (sr->get_type() == service_type_t::PROCESS) {
40 // This could be a smooth recovery (state already STARTED). Even more, the process
41 // might be stopped (and killed via a signal) during smooth recovery. We don't want to
42 // process startup again in either case, so we check for state STARTING:
43 if (sr->get_state() == service_state_t::STARTING) {
46 else if (sr->get_state() == service_state_t::STOPPING) {
47 // stopping, but smooth recovery was in process. That's now over so we can
48 // commence normal stop. Note that if pid == -1 the process already stopped(!),
49 // that's handled below.
50 if (sr->pid != -1 && sr->stop_check_dependents()) {
57 // Somehow the process managed to complete before we even saw the status.
// Replay the exit status that was stored on the service.
58 sr->handle_exit_status(sr->exit_status);
// Let the service set act on any state changes queued by the handling above.
62 sr->services->process_queues();
64 return rearm::REMOVED;
// Child-status callback for the process belonging to a service. Stores the reported status on
// the service, then either defers (while the exec() status is still outstanding) or stops any
// armed timer and forwards the status to the service's handle_exit_status(). Returns
// rearm::NOOP on both visible paths.
67 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
69 base_process_service *sr = service;
// Remember the status so it can be processed later if we have to defer.
72 sr->exit_status = status;
74 // Ok, for a process service, any process death which we didn't rig
75 // ourselves is a bit... unexpected. Probably, the child died because
76 // we asked it to (sr->service_state == STOPPING). But even if
77 // we didn't, there's not much we can do.
79 if (sr->waiting_for_execstat) {
80 // We still don't have an exec() status from the forked child, wait for that
81 // before doing any further processing.
82 return dasynq::rearm::NOOP; // hold watch reservation
85 // Must stop watch now since handle_exit_status might result in re-launch:
86 // (stop_watch instead of deregister, so that we hold watch reservation).
// The process has terminated, so a timer armed on restart_timer is cancelled.
89 if (sr->stop_timer_armed) {
90 sr->restart_timer.stop_timer(loop);
91 sr->stop_timer_armed = false;
94 sr->handle_exit_status(status);
95 return dasynq::rearm::NOOP;
// Processes the termination status of a process service's child.
//   exit_status - raw wait status; decoded with WIFEXITED / WIFSIGNALED / WEXITSTATUS / WTERMSIG.
// An unexpected termination (non-zero status while not STOPPING) is logged at ERROR level, then
// the action taken depends on the current service state. The service set's queues are
// processed at the end.
98 void process_service::handle_exit_status(int exit_status) noexcept
100 bool did_exit = WIFEXITED(exit_status);
101 bool was_signalled = WIFSIGNALED(exit_status);
103 auto service_state = get_state();
// Only log when the termination was not something we asked for (i.e. we are not STOPPING).
105 if (exit_status != 0 && service_state != service_state_t::STOPPING) {
107 log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
108 WEXITSTATUS(exit_status));
110 else if (was_signalled) {
111 log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
112 WTERMSIG(exit_status));
// Termination while STARTING: distinguish a normal exit with code 0 from anything else.
116 if (service_state == service_state_t::STARTING) {
117 if (did_exit && WEXITSTATUS(exit_status) == 0) {
124 else if (service_state == service_state_t::STOPPING) {
125 // We won't log a non-zero exit status or termination due to signal here -
126 // we assume that the process died because we signalled it.
// Smooth recovery applies only if it is enabled, the service is STARTED, and STARTED is
// still the target state.
129 else if (smooth_recovery && service_state == service_state_t::STARTED
130 && get_target_state() == service_state_t::STARTED) {
131 do_smooth_recovery();
137 services->process_queues();
// Called when exec() of the service process failed.
//   errcode - errno value describing the failure; rendered with strerror() in the log message.
// Handling differs depending on whether the service is currently STARTING.
140 void process_service::exec_failed(int errcode) noexcept
142 log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
143 if (get_state() == service_state_t::STARTING) {
147 // Process service in smooth recovery:
// Processes the termination status of a process associated with a bgprocess service.
//   exit_status - raw wait status; note that read_pid_file() is given its address and may
//                 therefore overwrite it.
// An unexpected termination (non-zero status while not STOPPING) is logged at ERROR level.
// The remaining handling depends on whether the service is restarting while STARTED,
// STARTING, STOPPING, or otherwise STARTED.
152 void bgproc_service::handle_exit_status(int exit_status) noexcept
155 bool did_exit = WIFEXITED(exit_status);
156 bool was_signalled = WIFSIGNALED(exit_status);
157 auto service_state = get_state();
// Only log when the termination was not something we asked for (i.e. we are not STOPPING).
159 if (exit_status != 0 && service_state != service_state_t::STOPPING) {
161 log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
162 WEXITSTATUS(exit_status));
164 else if (was_signalled) {
165 log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
166 WTERMSIG(exit_status));
170 // This may be a "smooth recovery" where we are restarting the process while leaving the
171 // service in the STARTED state.
172 if (restarting && service_state == service_state_t::STARTED) {
174 bool need_stop = false;
// A non-zero exit code or death by signal is checked for here.
175 if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) {
179 // We need to re-read the PID, since it has now changed.
// Only attempted when a pid file has been configured (non-empty path).
180 if (pid_file.length() != 0) {
181 auto pid_result = read_pid_file(&exit_status);
182 switch (pid_result) {
183 case pid_result_t::FAILED:
184 // Failed startup: no auto-restart.
187 case pid_result_t::TERMINATED:
189 case pid_result_t::OK:
196 // Failed startup: no auto-restart.
198 services->process_queues();
205 if (service_state == service_state_t::STARTING) {
206 // POSIX requires that if the process exited cleanly with a status code of 0,
207 // the exit status value will be 0:
208 if (exit_status == 0) {
// The process we started exited successfully; read the pid file it should have written.
209 auto pid_result = read_pid_file(&exit_status);
210 switch (pid_result) {
211 case pid_result_t::FAILED:
212 // Failed startup: no auto-restart.
215 case pid_result_t::TERMINATED:
216 // started, but immediately terminated
219 case pid_result_t::OK:
228 else if (service_state == service_state_t::STOPPING) {
229 // We won't log a non-zero exit status or termination due to signal here -
230 // we assume that the process died because we signalled it.
234 // we must be STARTED
235 if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
236 do_smooth_recovery();
// If no automatic restart takes place, an explicit start marking is cleared.
239 if (! do_auto_restart() && start_explicit) {
240 start_explicit = false;
247 services->process_queues();
// Called when exec() of the bgprocess launch command failed.
//   errcode - errno value describing the failure; rendered with strerror() in the log message.
250 void bgproc_service::exec_failed(int errcode) noexcept
252 log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
253 // Only time we execute is for startup:
// Processes the termination status of a start or stop script of a scripted service.
//   exit_status - raw wait status; decoded with WIFEXITED / WIFSIGNALED / WEXITSTATUS / WTERMSIG.
// Which script the status belongs to is worked out from the service state and the
// interrupting_start flag. Log severity varies: INFO for an expected start cancellation,
// WARN for a failed stop command, ERROR for a failed command otherwise.
257 void scripted_service::handle_exit_status(int exit_status) noexcept
259 bool did_exit = WIFEXITED(exit_status);
260 bool was_signalled = WIFSIGNALED(exit_status);
261 auto service_state = get_state();
263 // For a scripted service, a termination occurs in one of three main cases:
264 // - the start script completed (or failed), when service was STARTING
265 // - the start script was interrupted to cancel startup; state is STOPPING
266 // - the stop script completed (or failed), state is STOPPING
268 if (service_state == service_state_t::STOPPING) {
269 // We might be running the stop script, or we might be running the start script and have issued
270 // a cancel order via SIGINT:
271 if (did_exit && WEXITSTATUS(exit_status) == 0) {
272 if (interrupting_start) {
// The interrupted start script still exited with code 0; clear the flag before continuing.
273 interrupting_start = false;
274 // launch stop script:
278 // We were running the stop script and finished successfully
283 if (interrupting_start) {
284 // We issued a start interrupt, so we expected this failure:
286 log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
287 WEXITSTATUS(exit_status));
289 else if (was_signalled) {
290 log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
291 WTERMSIG(exit_status));
295 // ??? failed to stop! Let's log it as warning:
297 log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
298 WEXITSTATUS(exit_status));
300 else if (was_signalled) {
301 log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
302 WTERMSIG(exit_status));
305 // Even if the stop script failed, assume that service is now stopped, so that any dependencies
306 // can be stopped. There's not really any other useful course of action here.
307 interrupting_start = false;
310 services->process_queues();
// A wait status of exactly 0 means the script exited normally with exit code 0.
313 if (exit_status == 0) {
319 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
320 WEXITSTATUS(exit_status));
322 else if (was_signalled) {
323 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
324 WTERMSIG(exit_status));
328 services->process_queues();
// Called when exec() of a scripted service's command failed.
//   errcode - errno value describing the failure; rendered with strerror() in the log message.
// The follow-up action depends on whether the service is STARTING or STOPPING.
332 void scripted_service::exec_failed(int errcode) noexcept
334 log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
335 auto service_state = get_state();
336 if (service_state == service_state_t::STARTING) {
339 else if (service_state == service_state_t::STOPPING) {
340 // We've logged the failure, but it's probably better not to leave the service in
// Reads the service's pid file, parses the process id from it and checks on that process.
//   exit_status - out-parameter passed straight to waitpid(); receives the wait status if the
//                 process turns out to be a child of ours that has already terminated.
// Returns:
//   pid_result_t::FAILED     - the file could not be opened or read, or the pid is not valid;
//   pid_result_t::TERMINATED - waitpid() reported that the process has already terminated;
//   pid_result_t::OK         - the process exists; tracking_child records whether it is a child
//                              we can watch (true) or only a process we can probe (false).
346 bgproc_service::pid_result_t
347 bgproc_service::read_pid_file(int *exit_status) noexcept
349 const char *pid_file_c = pid_file.c_str();
350 int fd = open(pid_file_c, O_CLOEXEC);
352 log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
353 return pid_result_t::FAILED;
356 char pidbuf[21]; // just enough to hold any 64-bit integer
// Read at most 20 bytes so that one byte always remains for the terminator stored below.
357 int r = ss_read(fd, pidbuf, 20);
359 // Could not read from PID file
360 log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
362 return pid_result_t::FAILED;
366 pidbuf[r] = 0; // store nul terminator
368 bool valid_pid = false;
// Base 0 makes stoull auto-detect the numeric base from the text's prefix. It throws
// std::invalid_argument / std::out_of_range on bad input; both are caught below.
370 unsigned long long v = std::stoull(pidbuf, nullptr, 0);
// Reject values that do not fit in pid_t.
371 if (v <= std::numeric_limits<pid_t>::max()) {
376 catch (std::out_of_range &exc) {
379 catch (std::invalid_argument &exc) {
380 // Ok, so it doesn't look like a number: proceed...
// Non-blocking check: WNOHANG makes waitpid() return immediately instead of waiting.
384 pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
385 if (wait_r == -1 && errno == ECHILD) {
386 // We can't track this child - check process exists:
// Signal 0 only probes for existence; any error other than ESRCH is treated as "exists".
387 if (kill(pid, 0) == 0 || errno != ESRCH) {
388 tracking_child = false;
389 return pid_result_t::OK;
392 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
394 return pid_result_t::FAILED;
// waitpid() returned the pid itself: the process was our child and has already terminated.
397 else if (wait_r == pid) {
399 return pid_result_t::TERMINATED;
401 else if (wait_r == 0) {
402 // We can track the child
// Attach the child watcher to this pid, using a priority offset of -10 from the default.
403 child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
404 tracking_child = true;
405 reserved_child_watch = true;
406 return pid_result_t::OK;
410 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
412 return pid_result_t::FAILED;
// Begins stopping the service's process. Three situations are distinguished: the exec()
// status of a just-launched process is still pending, a process is running (pid != -1), or
// the process is already dead. For a running process the configured signals are sent and,
// if a stop timeout is configured, the timer is armed.
415 void process_service::bring_down() noexcept
417 waiting_for_deps = false;
418 if (waiting_for_execstat) {
419 // The process is still starting. This should be uncommon, but can occur during
420 // smooth recovery. We can't do much now; we have to wait until we get the
421 // status, and then act appropriately.
424 else if (pid != -1) {
425 // The process is still kicking on - must actually kill it. We signal the process
426 // group (-pid) rather than just the process as there's less risk then of creating
427 // an orphaned process group:
// SIGTERM handling can be disabled per service via the no_sigterm start flag.
428 if (! onstart_flags.no_sigterm) {
// A term_signal of -1 means no additional termination signal is configured.
431 if (term_signal != -1) {
432 kill_pg(term_signal);
435 // In most cases, the rest is done in handle_exit_status.
436 // If we are a BGPROCESS and the process is not our immediate child, however, that
437 // won't work - check for this now:
438 if (get_type() == service_type_t::BGPROCESS && ! tracking_child) {
// A zero stop_timeout disables the timer; otherwise arm it relative to now.
441 else if (stop_timeout != time_val(0,0)) {
442 restart_timer.arm_timer_rel(event_loop, stop_timeout);
443 stop_timer_armed = true;
447 // The process is already dead.
// Begins stopping a scripted service. Distinguishes three cases: no stop command is
// configured, the stop command could not be launched, or it was launched successfully.
// In the last case the timer is armed when a non-zero stop timeout is configured.
452 void scripted_service::bring_down() noexcept
454 waiting_for_deps = false;
455 if (stop_command.length() == 0) {
458 else if (! start_ps_process(stop_arg_parts, false)) {
459 // Couldn't execute stop script, but there's not much we can do:
463 // successfully started stop script: start kill timer:
// A zero stop_timeout disables the timer.
464 if (stop_timeout != time_val(0,0)) {
465 restart_timer.arm_timer_rel(event_loop, stop_timeout);
466 stop_timer_armed = true;
// Expiry callback for the timer shared between stop, start and restart timeouts. Clears the
// service's stop_timer_armed flag and then picks an action from the service state and from
// whether a process currently exists (pid != -1). Always returns rearm::NOOP.
471 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
473 service->stop_timer_armed = false;
476 // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are
477 // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting,
478 // including smooth recovery (restart timeout, state is STARTING or STARTED).
479 if (service->get_state() == service_state_t::STOPPING) {
// Stop timeout expired: escalate.
480 service->kill_with_fire();
482 else if (service->pid != -1) {
483 // Starting, start timed out.
484 service->stop_dependents();
485 service->interrupt_start();
488 // STARTING / STARTED, and we have no pid: must be restarting (smooth recovery if STARTED)
489 service->do_restart();
492 // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
493 return dasynq::rearm::NOOP;