0aeb9e3ee85b6cf56f0aff3688438ac6eea7557f
[oweals/dinit.git] / src / proc-service.cc
1 #include <sys/un.h>
2 #include <sys/socket.h>
3
4 #include "dinit-socket.h"
5 #include "dinit-util.h"
6 #include "proc-service.h"
7
8 extern eventloop_t event_loop;
9
10 using clock_type = dasynq::clock_type;
11 using rearm = dasynq::rearm;
12 using time_val = dasynq::time_val;
13
14 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
15 {
16     base_process_service *sr = service;
17     sr->waiting_for_execstat = false;
18
19     int exec_status;
20     int r = read(get_watched_fd(), &exec_status, sizeof(int));
21     deregister(loop);
22     close(get_watched_fd());
23
24     if (r > 0) {
25         // We read an errno code; exec() failed, and the service startup failed.
26         if (sr->pid != -1) {
27             sr->child_listener.deregister(event_loop, sr->pid);
28             sr->reserved_child_watch = false;
29             if (sr->stop_timer_armed) {
30                 sr->restart_timer.stop_timer(loop);
31                 sr->stop_timer_armed = false;
32             }
33         }
34         sr->pid = -1;
35         sr->exec_failed(exec_status);
36     }
37     else {
38         // exec() succeeded.
39         if (sr->get_type() == service_type_t::PROCESS) {
40             // This could be a smooth recovery (state already STARTED). Even more, the process
41             // might be stopped (and killed via a signal) during smooth recovery.  We don't to
42             // process startup again in either case, so we check for state STARTING:
43             if (sr->get_state() == service_state_t::STARTING) {
44                 sr->started();
45             }
46             else if (sr->get_state() == service_state_t::STOPPING) {
47                 // stopping, but smooth recovery was in process. That's now over so we can
48                 // commence normal stop. Note that if pid == -1 the process already stopped(!),
49                 // that's handled below.
50                 if (sr->pid != -1 && sr->stop_check_dependents()) {
51                     sr->bring_down();
52                 }
53             }
54         }
55
56         if (sr->pid == -1) {
57             // Somehow the process managed to complete before we even saw the status.
58             sr->handle_exit_status(sr->exit_status);
59         }
60     }
61
62     sr->services->process_queues();
63
64     return rearm::REMOVED;
65 }
66
67 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
68 {
69     base_process_service *sr = service;
70
71     sr->pid = -1;
72     sr->exit_status = status;
73
74     // Ok, for a process service, any process death which we didn't rig
75     // ourselves is a bit... unexpected. Probably, the child died because
76     // we asked it to (sr->service_state == STOPPING). But even if
77     // we didn't, there's not much we can do.
78
79     if (sr->waiting_for_execstat) {
80         // We still don't have an exec() status from the forked child, wait for that
81         // before doing any further processing.
82         return dasynq::rearm::NOOP; // hold watch reservation
83     }
84
85     // Must stop watch now since handle_exit_status might result in re-launch:
86     // (stop_watch instead of deregister, so that we hold watch reservation).
87     stop_watch(loop);
88
89     if (sr->stop_timer_armed) {
90         sr->restart_timer.stop_timer(loop);
91         sr->stop_timer_armed = false;
92     }
93
94     sr->handle_exit_status(status);
95     return dasynq::rearm::NOOP;
96 }
97
98 void process_service::handle_exit_status(int exit_status) noexcept
99 {
100     bool did_exit = WIFEXITED(exit_status);
101     bool was_signalled = WIFSIGNALED(exit_status);
102     restarting = false;
103     auto service_state = get_state();
104
105     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
106         if (did_exit) {
107             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
108                     WEXITSTATUS(exit_status));
109         }
110         else if (was_signalled) {
111             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
112                     WTERMSIG(exit_status));
113         }
114     }
115
116     if (service_state == service_state_t::STARTING) {
117         if (did_exit && WEXITSTATUS(exit_status) == 0) {
118             started();
119         }
120         else {
121             failed_to_start();
122         }
123     }
124     else if (service_state == service_state_t::STOPPING) {
125         // We won't log a non-zero exit status or termination due to signal here -
126         // we assume that the process died because we signalled it.
127         stopped();
128     }
129     else if (smooth_recovery && service_state == service_state_t::STARTED
130             && get_target_state() == service_state_t::STARTED) {
131         do_smooth_recovery();
132         return;
133     }
134     else {
135         emergency_stop();
136     }
137     services->process_queues();
138 }
139
140 void process_service::exec_failed(int errcode) noexcept
141 {
142     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
143     if (get_state() == service_state_t::STARTING) {
144         failed_to_start();
145     }
146     else {
147         // Process service in smooth recovery:
148         emergency_stop();
149     }
150 }
151
152 void bgproc_service::handle_exit_status(int exit_status) noexcept
153 {
154     begin:
155     bool did_exit = WIFEXITED(exit_status);
156     bool was_signalled = WIFSIGNALED(exit_status);
157     auto service_state = get_state();
158
159     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
160         if (did_exit) {
161             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
162                     WEXITSTATUS(exit_status));
163         }
164         else if (was_signalled) {
165             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
166                     WTERMSIG(exit_status));
167         }
168     }
169
170     // This may be a "smooth recovery" where we are restarting the process while leaving the
171     // service in the STARTED state.
172     if (restarting && service_state == service_state_t::STARTED) {
173         restarting = false;
174         bool need_stop = false;
175         if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) {
176             need_stop = true;
177         }
178         else {
179             // We need to re-read the PID, since it has now changed.
180             if (pid_file.length() != 0) {
181                 auto pid_result = read_pid_file(&exit_status);
182                 switch (pid_result) {
183                     case pid_result_t::FAILED:
184                         // Failed startup: no auto-restart.
185                         need_stop = true;
186                         break;
187                     case pid_result_t::TERMINATED:
188                         goto begin;
189                     case pid_result_t::OK:
190                         break;
191                 }
192             }
193         }
194
195         if (need_stop) {
196             // Failed startup: no auto-restart.
197             emergency_stop();
198             services->process_queues();
199         }
200
201         return;
202     }
203
204     restarting = false;
205     if (service_state == service_state_t::STARTING) {
206         // POSIX requires that if the process exited clearly with a status code of 0,
207         // the exit status value will be 0:
208         if (exit_status == 0) {
209             auto pid_result = read_pid_file(&exit_status);
210             switch (pid_result) {
211                 case pid_result_t::FAILED:
212                     // Failed startup: no auto-restart.
213                     failed_to_start();
214                     break;
215                 case pid_result_t::TERMINATED:
216                     // started, but immediately terminated
217                     started();
218                     goto begin;
219                 case pid_result_t::OK:
220                     started();
221                     break;
222             }
223         }
224         else {
225             failed_to_start();
226         }
227     }
228     else if (service_state == service_state_t::STOPPING) {
229         // We won't log a non-zero exit status or termination due to signal here -
230         // we assume that the process died because we signalled it.
231         stopped();
232     }
233     else {
234         // we must be STARTED
235         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
236             do_smooth_recovery();
237             return;
238         }
239         if (! do_auto_restart() && start_explicit) {
240             start_explicit = false;
241             release();
242         }
243         forced_stop();
244         stop_dependents();
245         stopped();
246     }
247     services->process_queues();
248 }
249
250 void bgproc_service::exec_failed(int errcode) noexcept
251 {
252     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
253     // Only time we execute is for startup:
254     failed_to_start();
255 }
256
257 void scripted_service::handle_exit_status(int exit_status) noexcept
258 {
259     bool did_exit = WIFEXITED(exit_status);
260     bool was_signalled = WIFSIGNALED(exit_status);
261     auto service_state = get_state();
262
263     // For a scripted service, a termination occurs in one of three main cases:
264     // - the start script completed (or failed), when service was STARTING
265     // - the start script was interrupted to cancel startup; state is STOPPING
266     // - the stop script complete (or failed), state is STOPPING
267
268     if (service_state == service_state_t::STOPPING) {
269         // We might be running the stop script, or we might be running the start script and have issued
270         // a cancel order via SIGINT:
271         if (did_exit && WEXITSTATUS(exit_status) == 0) {
272             if (interrupting_start) {
273                 interrupting_start = false;
274                 // launch stop script:
275                 bring_down();
276             }
277             else {
278                 // We were running the stop script and finished successfully
279                 stopped();
280             }
281         }
282         else {
283             if (interrupting_start) {
284                 // We issued a start interrupt, so we expected this failure:
285                 if (did_exit) {
286                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
287                             WEXITSTATUS(exit_status));
288                 }
289                 else if (was_signalled) {
290                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
291                             WTERMSIG(exit_status));
292                 }
293             }
294             else {
295                 // ??? failed to stop! Let's log it as warning:
296                 if (did_exit) {
297                     log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
298                             WEXITSTATUS(exit_status));
299                 }
300                 else if (was_signalled) {
301                     log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
302                             WTERMSIG(exit_status));
303                 }
304             }
305             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
306             // can be stopped. There's not really any other useful course of action here.
307             interrupting_start = false;
308             stopped();
309         }
310         services->process_queues();
311     }
312     else { // STARTING
313         if (exit_status == 0) {
314             started();
315         }
316         else {
317             // failed to start
318             if (did_exit) {
319                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
320                         WEXITSTATUS(exit_status));
321             }
322             else if (was_signalled) {
323                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
324                         WTERMSIG(exit_status));
325             }
326             failed_to_start();
327         }
328         services->process_queues();
329     }
330 }
331
332 void scripted_service::exec_failed(int errcode) noexcept
333 {
334     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
335     auto service_state = get_state();
336     if (service_state == service_state_t::STARTING) {
337         failed_to_start();
338     }
339     else if (service_state == service_state_t::STOPPING) {
340         // We've logged the failure, but it's probably better not to leave the service in
341         // STOPPING state:
342         stopped();
343     }
344 }
345
346 bgproc_service::pid_result_t
347 bgproc_service::read_pid_file(int *exit_status) noexcept
348 {
349     const char *pid_file_c = pid_file.c_str();
350     int fd = open(pid_file_c, O_CLOEXEC);
351     if (fd == -1) {
352         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
353         return pid_result_t::FAILED;
354     }
355
356     char pidbuf[21]; // just enough to hold any 64-bit integer
357     int r = ss_read(fd, pidbuf, 20);
358     if (r < 0) {
359         // Could not read from PID file
360         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
361         close(fd);
362         return pid_result_t::FAILED;
363     }
364
365     close(fd);
366     pidbuf[r] = 0; // store nul terminator
367
368     bool valid_pid = false;
369     try {
370         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
371         if (v <= std::numeric_limits<pid_t>::max()) {
372             pid = (pid_t) v;
373             valid_pid = true;
374         }
375     }
376     catch (std::out_of_range &exc) {
377         // Too large?
378     }
379     catch (std::invalid_argument &exc) {
380         // Ok, so it doesn't look like a number: proceed...
381     }
382
383     if (valid_pid) {
384         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
385         if (wait_r == -1 && errno == ECHILD) {
386             // We can't track this child - check process exists:
387             if (kill(pid, 0) == 0 || errno != ESRCH) {
388                 tracking_child = false;
389                 return pid_result_t::OK;
390             }
391             else {
392                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
393                 pid = -1;
394                 return pid_result_t::FAILED;
395             }
396         }
397         else if (wait_r == pid) {
398             pid = -1;
399             return pid_result_t::TERMINATED;
400         }
401         else if (wait_r == 0) {
402             // We can track the child
403             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
404             tracking_child = true;
405             reserved_child_watch = true;
406             return pid_result_t::OK;
407         }
408     }
409
410     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
411     pid = -1;
412     return pid_result_t::FAILED;
413 }
414
415 void process_service::bring_down() noexcept
416 {
417     waiting_for_deps = false;
418     if (waiting_for_execstat) {
419         // The process is still starting. This should be uncommon, but can occur during
420         // smooth recovery. We can't do much now; we have to wait until we get the
421         // status, and then act appropriately.
422         return;
423     }
424     else if (pid != -1) {
425         // The process is still kicking on - must actually kill it. We signal the process
426         // group (-pid) rather than just the process as there's less risk then of creating
427         // an orphaned process group:
428         if (! onstart_flags.no_sigterm) {
429             kill_pg(SIGTERM);
430         }
431         if (term_signal != -1) {
432             kill_pg(term_signal);
433         }
434
435         // In most cases, the rest is done in handle_exit_status.
436         // If we are a BGPROCESS and the process is not our immediate child, however, that
437         // won't work - check for this now:
438         if (get_type() == service_type_t::BGPROCESS && ! tracking_child) {
439             stopped();
440         }
441         else if (stop_timeout != time_val(0,0)) {
442             restart_timer.arm_timer_rel(event_loop, stop_timeout);
443             stop_timer_armed = true;
444         }
445     }
446     else {
447         // The process is already dead.
448         stopped();
449     }
450 }
451
452 void scripted_service::bring_down() noexcept
453 {
454     waiting_for_deps = false;
455     if (stop_command.length() == 0) {
456         stopped();
457     }
458     else if (! start_ps_process(stop_arg_parts, false)) {
459         // Couldn't execute stop script, but there's not much we can do:
460         stopped();
461     }
462     else {
463         // successfully started stop script: start kill timer:
464         if (stop_timeout != time_val(0,0)) {
465             restart_timer.arm_timer_rel(event_loop, stop_timeout);
466             stop_timer_armed = true;
467         }
468     }
469 }
470
471 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
472 {
473     service->stop_timer_armed = false;
474
475     // Timer expires if:
476     // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are
477     // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting,
478     // including smooth recovery (restart timeout, state is STARTING or STARTED).
479     if (service->get_state() == service_state_t::STOPPING) {
480         service->kill_with_fire();
481     }
482     else if (service->pid != -1) {
483         // Starting, start timed out.
484         service->stop_dependents();
485         service->interrupt_start();
486     }
487     else {
488         // STARTING / STARTED, and we have a pid: must be restarting (smooth recovery if STARTED)
489         service->do_restart();
490     }
491
492     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
493     return dasynq::rearm::NOOP;
494 }