src/proc-service.cc

   1 #include <sys/un.h>
   2 #include <sys/socket.h>
   3
   4 #include "dinit.h"
   5 #include "dinit-socket.h"
   6 #include "dinit-util.h"
   7 #include "dinit-log.h"
   8 #include "proc-service.h"
   9
  10 /*
  11  * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  12  *
  13  * See proc-service.h header for interface details.
  14  */
  15
  16 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  17 // store a null terminator for the argument. Return a `char *` vector containing the beginning
  18 // of each argument and a trailing nullptr. (The returned array is invalidated if the string is later modified).
  19 std::vector<const char *> separate_args(std::string &s, std::list<std::pair<unsigned,unsigned>> &arg_indices)
  20 {
  21     std::vector<const char *> r;
  22     r.reserve(arg_indices.size() + 1);
  23
  24     // First store nul terminator for each part:
  25     for (auto index_pair : arg_indices) {
  26         if (index_pair.second < s.length()) {
  27             s[index_pair.second] = 0;
  28         }
  29     }
  30
  31     // Now we can get the C string (c_str) and store offsets into it:
  32     const char * cstr = s.c_str();
  33     for (auto index_pair : arg_indices) {
  34         r.push_back(cstr + index_pair.first);
  35     }
  36     r.push_back(nullptr);
  37     return r;
  38 }
  39
  40 void process_service::exec_succeeded() noexcept
  41 {
  42     // This could be a smooth recovery (state already STARTED). Even more, the process
  43     // might be stopped (and killed via a signal) during smooth recovery.  We don't to
  44     // process startup again in either case, so we check for state STARTING:
  45     if (get_state() == service_state_t::STARTING) {
  46         started();
  47     }
  48     else if (get_state() == service_state_t::STOPPING) {
  49         // stopping, but smooth recovery was in process. That's now over so we can
  50         // commence normal stop. Note that if pid == -1 the process already stopped(!),
  51         // that's handled below.
  52         if (pid != -1 && stop_check_dependents()) {
  53             bring_down();
  54         }
  55     }
  56 }
  57
  58 void scripted_service::exec_succeeded() noexcept
  59 {
  60         // For a scripted service, this means nothing other than that the start/stop
  61         // script will now begin.
  62 }
  63
  64 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  65 {
  66     base_process_service *sr = service;
  67     sr->waiting_for_execstat = false;
  68
  69     int exec_status;
  70     int r = read(get_watched_fd(), &exec_status, sizeof(int));
  71     deregister(loop);
  72     close(get_watched_fd());
  73
  74     if (r > 0) {
  75         // We read an errno code; exec() failed, and the service startup failed.
  76         if (sr->pid != -1) {
  77             sr->child_listener.deregister(event_loop, sr->pid);
  78             sr->reserved_child_watch = false;
  79             if (sr->stop_timer_armed) {
  80                 sr->restart_timer.stop_timer(loop);
  81                 sr->stop_timer_armed = false;
  82             }
  83         }
  84         sr->pid = -1;
  85         sr->exec_failed(exec_status);
  86     }
  87     else {
  88         sr->exec_succeeded();
  89
  90         if (sr->pid == -1) {
  91             // Somehow the process managed to complete before we even saw the exec() status.
  92             sr->handle_exit_status(sr->exit_status);
  93         }
  94     }
  95
  96     sr->services->process_queues();
  97
  98     return rearm::REMOVED;
  99 }
 100
 101 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
 102 {
 103     base_process_service *sr = service;
 104
 105     sr->pid = -1;
 106     sr->exit_status = bp_sys::exit_status(status);
 107
 108     // Ok, for a process service, any process death which we didn't rig
 109     // ourselves is a bit... unexpected. Probably, the child died because
 110     // we asked it to (sr->service_state == STOPPING). But even if
 111     // we didn't, there's not much we can do.
 112
 113     if (sr->waiting_for_execstat) {
 114         // We still don't have an exec() status from the forked child, wait for that
 115         // before doing any further processing.
 116         return dasynq::rearm::NOOP; // hold watch reservation
 117     }
 118
 119     // Must stop watch now since handle_exit_status might result in re-launch:
 120     // (stop_watch instead of deregister, so that we hold watch reservation).
 121     stop_watch(loop);
 122
 123     if (sr->stop_timer_armed) {
 124         sr->restart_timer.stop_timer(loop);
 125         sr->stop_timer_armed = false;
 126     }
 127
 128     sr->handle_exit_status(bp_sys::exit_status(status));
 129     return dasynq::rearm::NOOP;
 130 }
 131
 132 void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 133 {
 134     bool did_exit = exit_status.did_exit();
 135     bool was_signalled = exit_status.was_signalled();
 136     restarting = false;
 137     auto service_state = get_state();
 138
 139     if (exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 140         if (did_exit) {
 141             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 142                     exit_status.get_exit_status());
 143         }
 144         else if (was_signalled) {
 145             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 146                     exit_status.get_term_sig());
 147         }
 148     }
 149
 150     if (service_state == service_state_t::STARTING) {
 151         if (exit_status.did_exit_clean()) {
 152             started();
 153         }
 154         else {
 155             failed_to_start();
 156         }
 157     }
 158     else if (service_state == service_state_t::STOPPING) {
 159         // We won't log a non-zero exit status or termination due to signal here -
 160         // we assume that the process died because we signalled it.
 161         if (stop_timer_armed) {
 162             restart_timer.stop_timer(event_loop);
 163         }
 164         stopped();
 165     }
 166     else if (smooth_recovery && service_state == service_state_t::STARTED
 167             && get_target_state() == service_state_t::STARTED) {
 168         do_smooth_recovery();
 169         return;
 170     }
 171     else {
 172         emergency_stop();
 173     }
 174     services->process_queues();
 175 }
 176
 177 void process_service::exec_failed(int errcode) noexcept
 178 {
 179     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 180     if (get_state() == service_state_t::STARTING) {
 181         failed_to_start();
 182     }
 183     else {
 184         // Process service in smooth recovery:
 185         emergency_stop();
 186     }
 187 }
 188
 189 void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 190 {
 191     begin:
 192     bool did_exit = exit_status.did_exit();
 193     bool was_signalled = exit_status.was_signalled();
 194     auto service_state = get_state();
 195
 196     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 197         if (did_exit) {
 198             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 199                     exit_status.get_exit_status());
 200         }
 201         else if (was_signalled) {
 202             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 203                     exit_status.get_term_sig());
 204         }
 205     }
 206
 207     // This may be a "smooth recovery" where we are restarting the process while leaving the
 208     // service in the STARTED state.
 209     if (restarting && service_state == service_state_t::STARTED) {
 210         restarting = false;
 211         bool need_stop = false;
 212         if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
 213             need_stop = true;
 214         }
 215         else {
 216             // We need to re-read the PID, since it has now changed.
 217             if (pid_file.length() != 0) {
 218                 auto pid_result = read_pid_file(&exit_status);
 219                 switch (pid_result) {
 220                     case pid_result_t::FAILED:
 221                         // Failed startup: no auto-restart.
 222                         need_stop = true;
 223                         break;
 224                     case pid_result_t::TERMINATED:
 225                         goto begin;
 226                     case pid_result_t::OK:
 227                         break;
 228                 }
 229             }
 230         }
 231
 232         if (need_stop) {
 233             // Failed startup: no auto-restart.
 234             emergency_stop();
 235             services->process_queues();
 236         }
 237
 238         return;
 239     }
 240
 241     restarting = false;
 242     if (service_state == service_state_t::STARTING) {
 243         // POSIX requires that if the process exited clearly with a status code of 0,
 244         // the exit status value will be 0:
 245         if (exit_status.did_exit_clean()) {
 246             auto pid_result = read_pid_file(&exit_status);
 247             switch (pid_result) {
 248                 case pid_result_t::FAILED:
 249                     // Failed startup: no auto-restart.
 250                     failed_to_start();
 251                     break;
 252                 case pid_result_t::TERMINATED:
 253                     // started, but immediately terminated
 254                     started();
 255                     goto begin;
 256                 case pid_result_t::OK:
 257                     started();
 258                     break;
 259             }
 260         }
 261         else {
 262             failed_to_start();
 263         }
 264     }
 265     else if (service_state == service_state_t::STOPPING) {
 266         // We won't log a non-zero exit status or termination due to signal here -
 267         // we assume that the process died because we signalled it.
 268         stopped();
 269     }
 270     else {
 271         // we must be STARTED
 272         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
 273             do_smooth_recovery();
 274             return;
 275         }
 276         if (! do_auto_restart() && start_explicit) {
 277             start_explicit = false;
 278             release();
 279         }
 280         forced_stop();
 281         stop_dependents();
 282         stopped();
 283     }
 284     services->process_queues();
 285 }
 286
 287 void bgproc_service::exec_failed(int errcode) noexcept
 288 {
 289     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 290     // Only time we execute is for startup:
 291     failed_to_start();
 292 }
 293
 294 void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 295 {
 296     bool did_exit = exit_status.did_exit();
 297     bool was_signalled = exit_status.was_signalled();
 298     auto service_state = get_state();
 299
 300     // For a scripted service, a termination occurs in one of three main cases:
 301     // - the start script completed (or failed), when service was STARTING
 302     // - the start script was interrupted to cancel startup; state is STOPPING
 303     // - the stop script complete (or failed), state is STOPPING
 304
 305     if (service_state == service_state_t::STOPPING) {
 306         // We might be running the stop script, or we might be running the start script and have issued
 307         // a cancel order via SIGINT:
 308         if (interrupting_start) {
 309             // We issued a start interrupt, so we expected this failure:
 310             if (did_exit && exit_status.get_exit_status() != 0) {
 311                 log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
 312                         exit_status.get_exit_status());
 313                 // Assume that a command terminating normally requires no cleanup:
 314                 stopped();
 315             }
 316             else {
 317                 if (was_signalled) {
 318                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
 319                             exit_status.get_term_sig());
 320                 }
 321                 // If the start script completed successfully, or was interrupted via our signal,
 322                 // we want to run the stop script to clean up:
 323                 bring_down();
 324             }
 325             interrupting_start = false;
 326         }
 327         else if (exit_status.did_exit_clean()) {
 328             // We were running the stop script and finished successfully
 329             stopped();
 330         }
 331         else {
 332             // ??? failed to stop! Let's log it as warning:
 333             if (did_exit) {
 334                 log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
 335                         exit_status.get_exit_status());
 336             }
 337             else if (was_signalled) {
 338                 log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
 339                         exit_status.get_term_sig());
 340             }
 341             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
 342             // can be stopped. There's not really any other useful course of action here.
 343             stopped();
 344         }
 345         services->process_queues();
 346     }
 347     else { // STARTING
 348         if (exit_status.did_exit_clean()) {
 349             started();
 350         }
 351         else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
 352             // A skippable service can be skipped by interrupting (eg by ^C if the service
 353             // starts on the console).
 354             start_skipped = true;
 355             started();
 356         }
 357         else {
 358             // failed to start
 359             if (did_exit) {
 360                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
 361                         exit_status.get_exit_status());
 362             }
 363             else if (was_signalled) {
 364                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
 365                         exit_status.get_term_sig());
 366             }
 367             failed_to_start();
 368         }
 369         services->process_queues();
 370     }
 371 }
 372
 373 void scripted_service::exec_failed(int errcode) noexcept
 374 {
 375     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 376     auto service_state = get_state();
 377     if (service_state == service_state_t::STARTING) {
 378         failed_to_start();
 379     }
 380     else if (service_state == service_state_t::STOPPING) {
 381         // We've logged the failure, but it's probably better not to leave the service in
 382         // STOPPING state:
 383         stopped();
 384     }
 385 }
 386
 387 bgproc_service::pid_result_t
 388 bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
 389 {
 390     const char *pid_file_c = pid_file.c_str();
 391     int fd = open(pid_file_c, O_CLOEXEC);
 392     if (fd == -1) {
 393         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
 394         return pid_result_t::FAILED;
 395     }
 396
 397     char pidbuf[21]; // just enough to hold any 64-bit integer
 398     int r = ss_read(fd, pidbuf, 20);
 399     if (r < 0) {
 400         // Could not read from PID file
 401         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
 402         close(fd);
 403         return pid_result_t::FAILED;
 404     }
 405
 406     close(fd);
 407     pidbuf[r] = 0; // store nul terminator
 408
 409     bool valid_pid = false;
 410     try {
 411         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
 412         if (v <= std::numeric_limits<pid_t>::max()) {
 413             pid = (pid_t) v;
 414             valid_pid = true;
 415         }
 416     }
 417     catch (std::out_of_range &exc) {
 418         // Too large?
 419     }
 420     catch (std::invalid_argument &exc) {
 421         // Ok, so it doesn't look like a number: proceed...
 422     }
 423
 424     if (valid_pid) {
 425         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
 426         if (wait_r == -1 && errno == ECHILD) {
 427             // We can't track this child - check process exists:
 428             if (kill(pid, 0) == 0 || errno != ESRCH) {
 429                 tracking_child = false;
 430                 return pid_result_t::OK;
 431             }
 432             else {
 433                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 434                 pid = -1;
 435                 return pid_result_t::FAILED;
 436             }
 437         }
 438         else if (wait_r == pid) {
 439             pid = -1;
 440             return pid_result_t::TERMINATED;
 441         }
 442         else if (wait_r == 0) {
 443             // We can track the child
 444             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
 445             tracking_child = true;
 446             reserved_child_watch = true;
 447             return pid_result_t::OK;
 448         }
 449     }
 450
 451     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 452     pid = -1;
 453     return pid_result_t::FAILED;
 454 }
 455
 456 void process_service::bring_down() noexcept
 457 {
 458     if (waiting_for_execstat) {
 459         // The process is still starting. This should be uncommon, but can occur during
 460         // smooth recovery. We can't do much now; we have to wait until we get the
 461         // status, and then act appropriately.
 462         return;
 463     }
 464     else if (pid != -1) {
 465         // The process is still kicking on - must actually kill it. We signal the process
 466         // group (-pid) rather than just the process as there's less risk then of creating
 467         // an orphaned process group:
 468         if (! onstart_flags.no_sigterm) {
 469             kill_pg(SIGTERM);
 470         }
 471         if (term_signal != -1) {
 472             kill_pg(term_signal);
 473         }
 474
 475         // If there's a stop timeout, arm the timer now:
 476         if (stop_timeout != time_val(0,0)) {
 477             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 478             stop_timer_armed = true;
 479         }
 480
 481         // The rest is done in handle_exit_status.
 482     }
 483     else {
 484         // The process is already dead.
 485         stopped();
 486     }
 487 }
 488
 489 void bgproc_service::bring_down() noexcept
 490 {
 491     if (pid != -1) {
 492         // The process is still kicking on - must actually kill it. We signal the process
 493         // group (-pid) rather than just the process as there's less risk then of creating
 494         // an orphaned process group:
 495         if (! onstart_flags.no_sigterm) {
 496             kill_pg(SIGTERM);
 497         }
 498         if (term_signal != -1) {
 499             kill_pg(term_signal);
 500         }
 501
 502         // In most cases, the rest is done in handle_exit_status.
 503         // If we are a BGPROCESS and the process is not our immediate child, however, that
 504         // won't work - check for this now:
 505         if (! tracking_child) {
 506             stopped();
 507         }
 508         else if (stop_timeout != time_val(0,0)) {
 509             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 510             stop_timer_armed = true;
 511         }
 512     }
 513     else {
 514         // The process is already dead.
 515         stopped();
 516     }
 517 }
 518
 519 void scripted_service::bring_down() noexcept
 520 {
 521         if (pid != -1) {
 522                 // We're already running the stop script; nothing to do.
 523                 return;
 524         }
 525
 526     if (stop_command.length() == 0) {
 527         stopped();
 528     }
 529     else if (! start_ps_process(stop_arg_parts, false)) {
 530         // Couldn't execute stop script, but there's not much we can do:
 531         stopped();
 532     }
 533     else {
 534         // successfully started stop script: start kill timer:
 535         if (stop_timeout != time_val(0,0)) {
 536             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 537             stop_timer_armed = true;
 538         }
 539     }
 540 }
 541
 542 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 543 {
 544     service->timer_expired();
 545
 546     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
 547     return dasynq::rearm::NOOP;
 548 }