src/proc-service.cc

   1 #include <sys/un.h>
   2 #include <sys/socket.h>
   3
   4 #include "dinit.h"
   5 #include "dinit-socket.h"
   6 #include "dinit-util.h"
   7 #include "proc-service.h"
   8
   9 /*
  10  * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  11  *
  12  * See proc-service.h header for interface details.
  13  */
  14
  15 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  16 // store a null terminator for the argument. Return a `char *` vector containing the beginning
  17 // of each argument and a trailing nullptr. (The returned array is invalidated if the string is later modified).
  18 std::vector<const char *> separate_args(std::string &s, std::list<std::pair<unsigned,unsigned>> &arg_indices)
  19 {
  20     std::vector<const char *> r;
  21     r.reserve(arg_indices.size() + 1);
  22
  23     // First store nul terminator for each part:
  24     for (auto index_pair : arg_indices) {
  25         if (index_pair.second < s.length()) {
  26             s[index_pair.second] = 0;
  27         }
  28     }
  29
  30     // Now we can get the C string (c_str) and store offsets into it:
  31     const char * cstr = s.c_str();
  32     for (auto index_pair : arg_indices) {
  33         r.push_back(cstr + index_pair.first);
  34     }
  35     r.push_back(nullptr);
  36     return r;
  37 }
  38
  39 void process_service::exec_succeeded() noexcept
  40 {
  41     // This could be a smooth recovery (state already STARTED). Even more, the process
  42     // might be stopped (and killed via a signal) during smooth recovery.  We don't to
  43     // process startup again in either case, so we check for state STARTING:
  44     if (get_state() == service_state_t::STARTING) {
  45         started();
  46     }
  47     else if (get_state() == service_state_t::STOPPING) {
  48         // stopping, but smooth recovery was in process. That's now over so we can
  49         // commence normal stop. Note that if pid == -1 the process already stopped(!),
  50         // that's handled below.
  51         if (pid != -1 && stop_check_dependents()) {
  52             bring_down();
  53         }
  54     }
  55 }
  56
  57 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  58 {
  59     base_process_service *sr = service;
  60     sr->waiting_for_execstat = false;
  61
  62     int exec_status;
  63     int r = read(get_watched_fd(), &exec_status, sizeof(int));
  64     deregister(loop);
  65     close(get_watched_fd());
  66
  67     if (r > 0) {
  68         // We read an errno code; exec() failed, and the service startup failed.
  69         if (sr->pid != -1) {
  70             sr->child_listener.deregister(event_loop, sr->pid);
  71             sr->reserved_child_watch = false;
  72             if (sr->stop_timer_armed) {
  73                 sr->restart_timer.stop_timer(loop);
  74                 sr->stop_timer_armed = false;
  75             }
  76         }
  77         sr->pid = -1;
  78         sr->exec_failed(exec_status);
  79     }
  80     else {
  81         sr->exec_succeeded();
  82
  83         if (sr->pid == -1) {
  84             // Somehow the process managed to complete before we even saw the exec() status.
  85             sr->handle_exit_status(sr->exit_status);
  86         }
  87     }
  88
  89     sr->services->process_queues();
  90
  91     return rearm::REMOVED;
  92 }
  93
  94 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
  95 {
  96     base_process_service *sr = service;
  97
  98     sr->pid = -1;
  99     sr->exit_status = status;
 100
 101     // Ok, for a process service, any process death which we didn't rig
 102     // ourselves is a bit... unexpected. Probably, the child died because
 103     // we asked it to (sr->service_state == STOPPING). But even if
 104     // we didn't, there's not much we can do.
 105
 106     if (sr->waiting_for_execstat) {
 107         // We still don't have an exec() status from the forked child, wait for that
 108         // before doing any further processing.
 109         return dasynq::rearm::NOOP; // hold watch reservation
 110     }
 111
 112     // Must stop watch now since handle_exit_status might result in re-launch:
 113     // (stop_watch instead of deregister, so that we hold watch reservation).
 114     stop_watch(loop);
 115
 116     if (sr->stop_timer_armed) {
 117         sr->restart_timer.stop_timer(loop);
 118         sr->stop_timer_armed = false;
 119     }
 120
 121     sr->handle_exit_status(status);
 122     return dasynq::rearm::NOOP;
 123 }
 124
 125 void process_service::handle_exit_status(int exit_status) noexcept
 126 {
 127     bool did_exit = WIFEXITED(exit_status);
 128     bool was_signalled = WIFSIGNALED(exit_status);
 129     restarting = false;
 130     auto service_state = get_state();
 131
 132     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
 133         if (did_exit) {
 134             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 135                     WEXITSTATUS(exit_status));
 136         }
 137         else if (was_signalled) {
 138             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 139                     WTERMSIG(exit_status));
 140         }
 141     }
 142
 143     if (service_state == service_state_t::STARTING) {
 144         if (did_exit && WEXITSTATUS(exit_status) == 0) {
 145             started();
 146         }
 147         else {
 148             failed_to_start();
 149         }
 150     }
 151     else if (service_state == service_state_t::STOPPING) {
 152         // We won't log a non-zero exit status or termination due to signal here -
 153         // we assume that the process died because we signalled it.
 154         stopped();
 155     }
 156     else if (smooth_recovery && service_state == service_state_t::STARTED
 157             && get_target_state() == service_state_t::STARTED) {
 158         do_smooth_recovery();
 159         return;
 160     }
 161     else {
 162         emergency_stop();
 163     }
 164     services->process_queues();
 165 }
 166
 167 void process_service::exec_failed(int errcode) noexcept
 168 {
 169     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 170     if (get_state() == service_state_t::STARTING) {
 171         failed_to_start();
 172     }
 173     else {
 174         // Process service in smooth recovery:
 175         emergency_stop();
 176     }
 177 }
 178
 179 void bgproc_service::handle_exit_status(int exit_status) noexcept
 180 {
 181     begin:
 182     bool did_exit = WIFEXITED(exit_status);
 183     bool was_signalled = WIFSIGNALED(exit_status);
 184     auto service_state = get_state();
 185
 186     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
 187         if (did_exit) {
 188             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 189                     WEXITSTATUS(exit_status));
 190         }
 191         else if (was_signalled) {
 192             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 193                     WTERMSIG(exit_status));
 194         }
 195     }
 196
 197     // This may be a "smooth recovery" where we are restarting the process while leaving the
 198     // service in the STARTED state.
 199     if (restarting && service_state == service_state_t::STARTED) {
 200         restarting = false;
 201         bool need_stop = false;
 202         if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) {
 203             need_stop = true;
 204         }
 205         else {
 206             // We need to re-read the PID, since it has now changed.
 207             if (pid_file.length() != 0) {
 208                 auto pid_result = read_pid_file(&exit_status);
 209                 switch (pid_result) {
 210                     case pid_result_t::FAILED:
 211                         // Failed startup: no auto-restart.
 212                         need_stop = true;
 213                         break;
 214                     case pid_result_t::TERMINATED:
 215                         goto begin;
 216                     case pid_result_t::OK:
 217                         break;
 218                 }
 219             }
 220         }
 221
 222         if (need_stop) {
 223             // Failed startup: no auto-restart.
 224             emergency_stop();
 225             services->process_queues();
 226         }
 227
 228         return;
 229     }
 230
 231     restarting = false;
 232     if (service_state == service_state_t::STARTING) {
 233         // POSIX requires that if the process exited clearly with a status code of 0,
 234         // the exit status value will be 0:
 235         if (exit_status == 0) {
 236             auto pid_result = read_pid_file(&exit_status);
 237             switch (pid_result) {
 238                 case pid_result_t::FAILED:
 239                     // Failed startup: no auto-restart.
 240                     failed_to_start();
 241                     break;
 242                 case pid_result_t::TERMINATED:
 243                     // started, but immediately terminated
 244                     started();
 245                     goto begin;
 246                 case pid_result_t::OK:
 247                     started();
 248                     break;
 249             }
 250         }
 251         else {
 252             failed_to_start();
 253         }
 254     }
 255     else if (service_state == service_state_t::STOPPING) {
 256         // We won't log a non-zero exit status or termination due to signal here -
 257         // we assume that the process died because we signalled it.
 258         stopped();
 259     }
 260     else {
 261         // we must be STARTED
 262         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
 263             do_smooth_recovery();
 264             return;
 265         }
 266         if (! do_auto_restart() && start_explicit) {
 267             start_explicit = false;
 268             release();
 269         }
 270         forced_stop();
 271         stop_dependents();
 272         stopped();
 273     }
 274     services->process_queues();
 275 }
 276
 277 void bgproc_service::exec_failed(int errcode) noexcept
 278 {
 279     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 280     // Only time we execute is for startup:
 281     failed_to_start();
 282 }
 283
 284 void scripted_service::handle_exit_status(int exit_status) noexcept
 285 {
 286     bool did_exit = WIFEXITED(exit_status);
 287     bool was_signalled = WIFSIGNALED(exit_status);
 288     auto service_state = get_state();
 289
 290     // For a scripted service, a termination occurs in one of three main cases:
 291     // - the start script completed (or failed), when service was STARTING
 292     // - the start script was interrupted to cancel startup; state is STOPPING
 293     // - the stop script complete (or failed), state is STOPPING
 294
 295     if (service_state == service_state_t::STOPPING) {
 296         // We might be running the stop script, or we might be running the start script and have issued
 297         // a cancel order via SIGINT:
 298         if (did_exit && WEXITSTATUS(exit_status) == 0) {
 299             if (interrupting_start) {
 300                 interrupting_start = false;
 301                 // launch stop script:
 302                 bring_down();
 303             }
 304             else {
 305                 // We were running the stop script and finished successfully
 306                 stopped();
 307             }
 308         }
 309         else {
 310             if (interrupting_start) {
 311                 // We issued a start interrupt, so we expected this failure:
 312                 if (did_exit) {
 313                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
 314                             WEXITSTATUS(exit_status));
 315                 }
 316                 else if (was_signalled) {
 317                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
 318                             WTERMSIG(exit_status));
 319                 }
 320             }
 321             else {
 322                 // ??? failed to stop! Let's log it as warning:
 323                 if (did_exit) {
 324                     log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
 325                             WEXITSTATUS(exit_status));
 326                 }
 327                 else if (was_signalled) {
 328                     log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
 329                             WTERMSIG(exit_status));
 330                 }
 331             }
 332             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
 333             // can be stopped. There's not really any other useful course of action here.
 334             interrupting_start = false;
 335             stopped();
 336         }
 337         services->process_queues();
 338     }
 339     else { // STARTING
 340         if (exit_status == 0) {
 341             started();
 342         }
 343         else {
 344             // failed to start
 345             if (did_exit) {
 346                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
 347                         WEXITSTATUS(exit_status));
 348             }
 349             else if (was_signalled) {
 350                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
 351                         WTERMSIG(exit_status));
 352             }
 353             failed_to_start();
 354         }
 355         services->process_queues();
 356     }
 357 }
 358
 359 void scripted_service::exec_failed(int errcode) noexcept
 360 {
 361     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 362     auto service_state = get_state();
 363     if (service_state == service_state_t::STARTING) {
 364         failed_to_start();
 365     }
 366     else if (service_state == service_state_t::STOPPING) {
 367         // We've logged the failure, but it's probably better not to leave the service in
 368         // STOPPING state:
 369         stopped();
 370     }
 371 }
 372
 373 bgproc_service::pid_result_t
 374 bgproc_service::read_pid_file(int *exit_status) noexcept
 375 {
 376     const char *pid_file_c = pid_file.c_str();
 377     int fd = open(pid_file_c, O_CLOEXEC);
 378     if (fd == -1) {
 379         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
 380         return pid_result_t::FAILED;
 381     }
 382
 383     char pidbuf[21]; // just enough to hold any 64-bit integer
 384     int r = ss_read(fd, pidbuf, 20);
 385     if (r < 0) {
 386         // Could not read from PID file
 387         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
 388         close(fd);
 389         return pid_result_t::FAILED;
 390     }
 391
 392     close(fd);
 393     pidbuf[r] = 0; // store nul terminator
 394
 395     bool valid_pid = false;
 396     try {
 397         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
 398         if (v <= std::numeric_limits<pid_t>::max()) {
 399             pid = (pid_t) v;
 400             valid_pid = true;
 401         }
 402     }
 403     catch (std::out_of_range &exc) {
 404         // Too large?
 405     }
 406     catch (std::invalid_argument &exc) {
 407         // Ok, so it doesn't look like a number: proceed...
 408     }
 409
 410     if (valid_pid) {
 411         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
 412         if (wait_r == -1 && errno == ECHILD) {
 413             // We can't track this child - check process exists:
 414             if (kill(pid, 0) == 0 || errno != ESRCH) {
 415                 tracking_child = false;
 416                 return pid_result_t::OK;
 417             }
 418             else {
 419                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 420                 pid = -1;
 421                 return pid_result_t::FAILED;
 422             }
 423         }
 424         else if (wait_r == pid) {
 425             pid = -1;
 426             return pid_result_t::TERMINATED;
 427         }
 428         else if (wait_r == 0) {
 429             // We can track the child
 430             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
 431             tracking_child = true;
 432             reserved_child_watch = true;
 433             return pid_result_t::OK;
 434         }
 435     }
 436
 437     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 438     pid = -1;
 439     return pid_result_t::FAILED;
 440 }
 441
 442 void process_service::bring_down() noexcept
 443 {
 444     waiting_for_deps = false;
 445     if (waiting_for_execstat) {
 446         // The process is still starting. This should be uncommon, but can occur during
 447         // smooth recovery. We can't do much now; we have to wait until we get the
 448         // status, and then act appropriately.
 449         return;
 450     }
 451     else if (pid != -1) {
 452         // The process is still kicking on - must actually kill it. We signal the process
 453         // group (-pid) rather than just the process as there's less risk then of creating
 454         // an orphaned process group:
 455         if (! onstart_flags.no_sigterm) {
 456             kill_pg(SIGTERM);
 457         }
 458         if (term_signal != -1) {
 459             kill_pg(term_signal);
 460         }
 461
 462         // In most cases, the rest is done in handle_exit_status.
 463         // If we are a BGPROCESS and the process is not our immediate child, however, that
 464         // won't work - check for this now:
 465         if (get_type() == service_type_t::BGPROCESS && ! tracking_child) {
 466             stopped();
 467         }
 468         else if (stop_timeout != time_val(0,0)) {
 469             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 470             stop_timer_armed = true;
 471         }
 472     }
 473     else {
 474         // The process is already dead.
 475         stopped();
 476     }
 477 }
 478
 479 void scripted_service::bring_down() noexcept
 480 {
 481     waiting_for_deps = false;
 482     if (stop_command.length() == 0) {
 483         stopped();
 484     }
 485     else if (! start_ps_process(stop_arg_parts, false)) {
 486         // Couldn't execute stop script, but there's not much we can do:
 487         stopped();
 488     }
 489     else {
 490         // successfully started stop script: start kill timer:
 491         if (stop_timeout != time_val(0,0)) {
 492             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 493             stop_timer_armed = true;
 494         }
 495     }
 496 }
 497
 498 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 499 {
 500     service->stop_timer_armed = false;
 501
 502     // Timer expires if:
 503     // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are
 504     // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting,
 505     // including smooth recovery (restart timeout, state is STARTING or STARTED).
 506     if (service->get_state() == service_state_t::STOPPING) {
 507         service->kill_with_fire();
 508     }
 509     else if (service->pid != -1) {
 510         // Starting, start timed out.
 511         service->stop_dependents();
 512         service->interrupt_start();
 513     }
 514     else {
 515         // STARTING / STARTED, and we have a pid: must be restarting (smooth recovery if STARTED)
 516         service->do_restart();
 517     }
 518
 519     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
 520     return dasynq::rearm::NOOP;
 521 }