src/proc-service.cc

   1 #include <cstring>
   2 #include <type_traits>
   3
   4 #include <sys/un.h>
   5 #include <sys/socket.h>
   6
   7 #include "dinit.h"
   8 #include "dinit-socket.h"
   9 #include "dinit-util.h"
  10 #include "dinit-log.h"
  11 #include "proc-service.h"
  12
  13 /*
  14  * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  15  *
  16  * See proc-service.h header for interface details.
  17  */
  18
  19 // Strings describing the execution stages (failure points).
  20 const char * const exec_stage_descriptions[static_cast<int>(exec_stage::DO_EXEC) + 1] = {
  21         "arranging file descriptors",   // ARRANGE_FDS
  22         "reading environment file",     // READ_ENV_FILE
  23         "setting environment variable", // SET_NOTIFYFD_VAR
  24         "setting up activation socket", // SETUP_ACTIVATION_SOCKET
  25         "setting up control socket",    // SETUP_CONTROL_SOCKET
  26         "changing directory",           // CHDIR
  27         "setting up standard input/output descriptors", // SETUP_STDINOUTERR
  28         "setting resource limits",      // SET_RLIMITS
  29         "setting user/group ID",        // SET_UIDGID
  30         "executing command"             // DO_EXEC
  31 };
  32
  33 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  34 // store a null terminator for the argument. Return a `char *` vector containing the beginning
  35 // of each argument and a trailing nullptr. (The returned array is invalidated if the string is
  36 // later modified).
  37 std::vector<const char *> separate_args(std::string &s,
  38         const std::list<std::pair<unsigned,unsigned>> &arg_indices)
  39 {
  40     std::vector<const char *> r;
  41     r.reserve(arg_indices.size() + 1);
  42
  43     // First store nul terminator for each part:
  44     for (auto index_pair : arg_indices) {
  45         if (index_pair.second < s.length()) {
  46             s[index_pair.second] = 0;
  47         }
  48     }
  49
  50     // Now we can get the C string (c_str) and store offsets into it:
  51     const char * cstr = s.c_str();
  52     for (auto index_pair : arg_indices) {
  53         r.push_back(cstr + index_pair.first);
  54     }
  55     r.push_back(nullptr);
  56     return r;
  57 }
  58
  59 void process_service::exec_succeeded() noexcept
  60 {
  61     // This could be a smooth recovery (state already STARTED). Even more, the process
  62     // might be stopped (and killed via a signal) during smooth recovery.  We don't to
  63     // process startup again in either case, so we check for state STARTING:
  64     if (get_state() == service_state_t::STARTING) {
  65         if (force_notification_fd != -1 || !notification_var.empty()) {
  66             // Wait for readiness notification:
  67             readiness_watcher.set_enabled(event_loop, true);
  68         }
  69         else {
  70             started();
  71         }
  72     }
  73     else if (get_state() == service_state_t::STOPPING) {
  74         // stopping, but smooth recovery was in process. That's now over so we can
  75         // commence normal stop. Note that if pid == -1 the process already stopped(!),
  76         // that's handled below.
  77         if (pid != -1 && stop_check_dependents()) {
  78             bring_down();
  79         }
  80     }
  81 }
  82
  83 void scripted_service::exec_succeeded() noexcept
  84 {
  85         // For a scripted service, this means nothing other than that the start/stop
  86         // script will now begin.
  87 }
  88
  89 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  90 {
  91     base_process_service *sr = service;
  92     sr->waiting_for_execstat = false;
  93
  94     run_proc_err exec_status;
  95     int r = read(get_watched_fd(), &exec_status, sizeof(exec_status));
  96     deregister(loop);
  97     close(get_watched_fd());
  98
  99     if (r > 0) {
 100         // We read an errno code; exec() failed, and the service startup failed.
 101         if (sr->pid != -1) {
 102             sr->child_listener.deregister(event_loop, sr->pid);
 103             sr->reserved_child_watch = false;
 104             if (sr->stop_timer_armed) {
 105                 sr->restart_timer.stop_timer(loop);
 106                 sr->stop_timer_armed = false;
 107             }
 108         }
 109         sr->pid = -1;
 110         sr->exec_failed(exec_status);
 111     }
 112     else {
 113         sr->exec_succeeded();
 114
 115         if (sr->pid == -1) {
 116             // Somehow the process managed to complete before we even saw the exec() status.
 117             sr->handle_exit_status(sr->exit_status);
 118         }
 119     }
 120
 121     sr->services->process_queues();
 122
 123     return rearm::REMOVED;
 124 }
 125
 126 rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
 127 {
 128     char buf[128];
 129     if (service->get_state() == service_state_t::STARTING) {
 130         // can we actually read anything from the notification pipe?
 131         int r = bp_sys::read(fd, buf, sizeof(buf));
 132         if (r > 0) {
 133             service->started();
 134         }
 135         else if (r == 0 || errno != EAGAIN) {
 136             service->failed_to_start(false, false);
 137             service->set_state(service_state_t::STOPPING);
 138             service->bring_down();
 139         }
 140     }
 141     else {
 142         // Just keep consuming data from the pipe:
 143         int r = bp_sys::read(fd, buf, sizeof(buf));
 144         if (r == 0) {
 145             // Process closed write end or terminated
 146             close(fd);
 147             service->notification_fd = -1;
 148             return rearm::DISARM;
 149         }
 150     }
 151
 152     service->services->process_queues();
 153     return rearm::REARM;
 154 }
 155
 156 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
 157 {
 158     base_process_service *sr = service;
 159
 160     sr->pid = -1;
 161     sr->exit_status = bp_sys::exit_status(status);
 162
 163     // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
 164     // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
 165     // didn't, there's not much we can do.
 166
 167     if (sr->waiting_for_execstat) {
 168         // We still don't have an exec() status from the forked child, wait for that
 169         // before doing any further processing.
 170         return dasynq::rearm::NOOP; // hold watch reservation
 171     }
 172
 173     // Must stop watch now since handle_exit_status might result in re-launch:
 174     // (stop_watch instead of deregister, so that we hold watch reservation).
 175     stop_watch(loop);
 176
 177     if (sr->stop_timer_armed) {
 178         sr->restart_timer.stop_timer(loop);
 179         sr->stop_timer_armed = false;
 180     }
 181
 182     sr->handle_exit_status(bp_sys::exit_status(status));
 183     return dasynq::rearm::NOOP;
 184 }
 185
 186 void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 187 {
 188     bool did_exit = exit_status.did_exit();
 189     bool was_signalled = exit_status.was_signalled();
 190     auto service_state = get_state();
 191
 192     if (notification_fd != -1) {
 193         readiness_watcher.deregister(event_loop);
 194         bp_sys::close(notification_fd);
 195         notification_fd = -1;
 196     }
 197
 198     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 199         if (did_exit) {
 200             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 201                     exit_status.get_exit_status());
 202         }
 203         else if (was_signalled) {
 204             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 205                     exit_status.get_term_sig());
 206         }
 207     }
 208
 209 #if USE_UTMPX
 210     if (*inittab_id || *inittab_line) {
 211         clear_utmp_entry(inittab_id, inittab_line);
 212     }
 213 #endif
 214
 215     if (service_state == service_state_t::STARTING) {
 216         // If state is STARTING, we must be waiting for readiness notification; the process has
 217         // terminated before becoming ready.
 218         stop_reason = stopped_reason_t::FAILED;
 219         failed_to_start();
 220     }
 221     else if (service_state == service_state_t::STOPPING) {
 222         // We won't log a non-zero exit status or termination due to signal here -
 223         // we assume that the process died because we signalled it.
 224         if (stop_timer_armed) {
 225             restart_timer.stop_timer(event_loop);
 226         }
 227         stopped();
 228     }
 229     else if (smooth_recovery && service_state == service_state_t::STARTED
 230             && get_target_state() == service_state_t::STARTED) {
 231         do_smooth_recovery();
 232         return;
 233     }
 234     else {
 235         stop_reason = stopped_reason_t::TERMINATED;
 236         emergency_stop();
 237     }
 238     services->process_queues();
 239 }
 240
 241 void process_service::exec_failed(run_proc_err errcode) noexcept
 242 {
 243     log(loglevel_t::ERROR, get_name(), ": execution failed - ",
 244             exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
 245
 246     if (notification_fd != -1) {
 247         readiness_watcher.deregister(event_loop);
 248         bp_sys::close(notification_fd);
 249         notification_fd = -1;
 250     }
 251
 252     if (get_state() == service_state_t::STARTING) {
 253         stop_reason = stopped_reason_t::EXECFAILED;
 254         failed_to_start();
 255     }
 256     else {
 257         // Process service in smooth recovery:
 258         stop_reason = stopped_reason_t::TERMINATED;
 259         emergency_stop();
 260     }
 261 }
 262
 263 void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 264 {
 265     begin:
 266     bool did_exit = exit_status.did_exit();
 267     bool was_signalled = exit_status.was_signalled();
 268     auto service_state = get_state();
 269
 270     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 271         if (did_exit) {
 272             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 273                     exit_status.get_exit_status());
 274         }
 275         else if (was_signalled) {
 276             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 277                     exit_status.get_term_sig());
 278         }
 279     }
 280
 281     // This may be a "smooth recovery" where we are restarting the process while leaving the
 282     // service in the STARTED state.
 283     if (restarting && service_state == service_state_t::STARTED) {
 284         bool need_stop = false;
 285         if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
 286             need_stop = true;
 287         }
 288         else {
 289             // We need to re-read the PID, since it has now changed.
 290             if (pid_file.length() != 0) {
 291                 auto pid_result = read_pid_file(&exit_status);
 292                 switch (pid_result) {
 293                     case pid_result_t::FAILED:
 294                         // Failed startup: no auto-restart.
 295                         need_stop = true;
 296                         break;
 297                     case pid_result_t::TERMINATED:
 298                         goto begin;
 299                     case pid_result_t::OK:
 300                         break;
 301                 }
 302             }
 303         }
 304
 305         if (need_stop) {
 306             // Failed startup: no auto-restart.
 307             stop_reason = stopped_reason_t::TERMINATED;
 308             emergency_stop();
 309             services->process_queues();
 310         }
 311
 312         return;
 313     }
 314
 315     if (service_state == service_state_t::STARTING) {
 316         // POSIX requires that if the process exited clearly with a status code of 0,
 317         // the exit status value will be 0:
 318         if (exit_status.did_exit_clean()) {
 319             auto pid_result = read_pid_file(&exit_status);
 320             switch (pid_result) {
 321                 case pid_result_t::FAILED:
 322                     // Failed startup: no auto-restart.
 323                     stop_reason = stopped_reason_t::FAILED;
 324                     failed_to_start();
 325                     break;
 326                 case pid_result_t::TERMINATED:
 327                     // started, but immediately terminated
 328                     started();
 329                     goto begin;
 330                 case pid_result_t::OK:
 331                     started();
 332                     break;
 333             }
 334         }
 335         else {
 336             stop_reason = stopped_reason_t::FAILED;
 337             failed_to_start();
 338         }
 339     }
 340     else if (service_state == service_state_t::STOPPING) {
 341         // We won't log a non-zero exit status or termination due to signal here -
 342         // we assume that the process died because we signalled it.
 343         stopped();
 344     }
 345     else {
 346         // we must be STARTED
 347         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
 348             do_smooth_recovery();
 349             return;
 350         }
 351         stop_reason = stopped_reason_t::TERMINATED;
 352         forced_stop();
 353         stop_dependents();
 354         stopped();
 355     }
 356     services->process_queues();
 357 }
 358
 359 void bgproc_service::exec_failed(run_proc_err errcode) noexcept
 360 {
 361     log(loglevel_t::ERROR, get_name(), ": execution failed - ",
 362             exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
 363
 364     // Only time we execute is for startup:
 365     stop_reason = stopped_reason_t::EXECFAILED;
 366     failed_to_start();
 367 }
 368
 369 void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 370 {
 371     bool did_exit = exit_status.did_exit();
 372     bool was_signalled = exit_status.was_signalled();
 373     auto service_state = get_state();
 374
 375     // For a scripted service, a termination occurs in one of three main cases:
 376     // - the start script completed (or failed), when service was STARTING
 377     // - the start script was interrupted to cancel startup; state is STOPPING
 378     // - the stop script complete (or failed), state is STOPPING
 379
 380     if (service_state == service_state_t::STOPPING) {
 381         // We might be running the stop script, or we might be running the start script and have issued
 382         // a cancel order via SIGINT:
 383         if (interrupting_start) {
 384             if (stop_timer_armed) {
 385                 restart_timer.stop_timer(event_loop);
 386                 stop_timer_armed = false;
 387             }
 388             // We issued a start interrupt, so we expected this failure:
 389             if (did_exit && exit_status.get_exit_status() != 0) {
 390                 log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
 391                         exit_status.get_exit_status());
 392                 // Assume that a command terminating normally (with failure status) requires no cleanup:
 393                 stopped();
 394             }
 395             else {
 396                 if (was_signalled) {
 397                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
 398                             exit_status.get_term_sig());
 399                 }
 400                 // If the start script completed successfully, or was interrupted via our signal,
 401                 // we want to run the stop script to clean up:
 402                 bring_down();
 403             }
 404             interrupting_start = false;
 405         }
 406         else if (exit_status.did_exit_clean()) {
 407             // We were running the stop script and finished successfully
 408             stopped();
 409         }
 410         else {
 411             // ??? failed to stop! Let's log it as warning:
 412             if (did_exit) {
 413                 log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
 414                         exit_status.get_exit_status());
 415             }
 416             else if (was_signalled) {
 417                 log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
 418                         exit_status.get_term_sig());
 419             }
 420             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
 421             // can be stopped. There's not really any other useful course of action here.
 422             stopped();
 423         }
 424         services->process_queues();
 425     }
 426     else { // STARTING
 427         if (exit_status.did_exit_clean()) {
 428             started();
 429         }
 430         else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
 431             // A skippable service can be skipped by interrupting (eg by ^C if the service
 432             // starts on the console).
 433             start_skipped = true;
 434             started();
 435         }
 436         else {
 437             // failed to start
 438             if (did_exit) {
 439                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
 440                         exit_status.get_exit_status());
 441             }
 442             else if (was_signalled) {
 443                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
 444                         exit_status.get_term_sig());
 445             }
 446             stop_reason = stopped_reason_t::FAILED;
 447             failed_to_start();
 448         }
 449         services->process_queues();
 450     }
 451 }
 452
 453 void scripted_service::exec_failed(run_proc_err errcode) noexcept
 454 {
 455     log(loglevel_t::ERROR, get_name(), ": execution failed - ",
 456             exec_stage_descriptions[static_cast<int>(errcode.stage)], ": ", strerror(errcode.st_errno));
 457     auto service_state = get_state();
 458     if (service_state == service_state_t::STARTING) {
 459         stop_reason = stopped_reason_t::EXECFAILED;
 460         failed_to_start();
 461     }
 462     else if (service_state == service_state_t::STOPPING) {
 463         // We've logged the failure, but it's probably better not to leave the service in
 464         // STOPPING state:
 465         stopped();
 466     }
 467 }
 468
 469 // Return a value as an unsigned-type value.
 470 template <typename T> typename std::make_unsigned<T>::type make_unsigned_val(T val)
 471 {
 472     return static_cast<typename std::make_unsigned<T>::type>(val);
 473 }
 474
 475 bgproc_service::pid_result_t
 476 bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
 477 {
 478     const char *pid_file_c = pid_file.c_str();
 479     int fd = open(pid_file_c, O_CLOEXEC);
 480     if (fd == -1) {
 481         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
 482         return pid_result_t::FAILED;
 483     }
 484
 485     char pidbuf[21]; // just enough to hold any 64-bit integer
 486     int r = complete_read(fd, pidbuf, 20);
 487     if (r < 0) {
 488         // Could not read from PID file
 489         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
 490         close(fd);
 491         return pid_result_t::FAILED;
 492     }
 493
 494     close(fd);
 495     pidbuf[r] = 0; // store nul terminator
 496
 497     bool valid_pid = false;
 498     try {
 499         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
 500         if (v <= make_unsigned_val(std::numeric_limits<pid_t>::max())) {
 501             pid = (pid_t) v;
 502             valid_pid = true;
 503         }
 504     }
 505     catch (std::out_of_range &exc) {
 506         // Too large?
 507     }
 508     catch (std::invalid_argument &exc) {
 509         // Ok, so it doesn't look like a number: proceed...
 510     }
 511
 512     if (valid_pid) {
 513         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
 514         if (wait_r == -1 && errno == ECHILD) {
 515             // We can't track this child - check process exists:
 516             if (kill(pid, 0) == 0 || errno != ESRCH) {
 517                 tracking_child = false;
 518                 return pid_result_t::OK;
 519             }
 520             else {
 521                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 522                 pid = -1;
 523                 return pid_result_t::FAILED;
 524             }
 525         }
 526         else if (wait_r == pid) {
 527             pid = -1;
 528             return pid_result_t::TERMINATED;
 529         }
 530         else if (wait_r == 0) {
 531             // We can track the child
 532             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
 533             tracking_child = true;
 534             reserved_child_watch = true;
 535             return pid_result_t::OK;
 536         }
 537     }
 538
 539     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 540     pid = -1;
 541     return pid_result_t::FAILED;
 542 }
 543
 544 void process_service::bring_down() noexcept
 545 {
 546     if (waiting_for_execstat) {
 547         // The process is still starting. This should be uncommon, but can occur during
 548         // smooth recovery. We can't do much now; we have to wait until we get the
 549         // status, and then act appropriately.
 550         return;
 551     }
 552     else if (pid != -1) {
 553         // The process is still kicking on - must actually kill it. We signal the process
 554         // group (-pid) rather than just the process as there's less risk then of creating
 555         // an orphaned process group:
 556         if (! onstart_flags.no_sigterm) {
 557             kill_pg(SIGTERM);
 558         }
 559         if (term_signal != -1) {
 560             kill_pg(term_signal);
 561         }
 562
 563         // If there's a stop timeout, arm the timer now:
 564         if (stop_timeout != time_val(0,0)) {
 565             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 566             stop_timer_armed = true;
 567         }
 568
 569         // The rest is done in handle_exit_status.
 570     }
 571     else {
 572         // The process is already dead.
 573         stopped();
 574     }
 575 }
 576
 577 void bgproc_service::bring_down() noexcept
 578 {
 579     if (pid != -1) {
 580         // The process is still kicking on - must actually kill it. We signal the process
 581         // group (-pid) rather than just the process as there's less risk then of creating
 582         // an orphaned process group:
 583         if (! onstart_flags.no_sigterm) {
 584             kill_pg(SIGTERM);
 585         }
 586         if (term_signal != -1) {
 587             kill_pg(term_signal);
 588         }
 589
 590         // In most cases, the rest is done in handle_exit_status.
 591         // If we are a BGPROCESS and the process is not our immediate child, however, that
 592         // won't work - check for this now:
 593         if (! tracking_child) {
 594             stopped();
 595         }
 596         else if (stop_timeout != time_val(0,0)) {
 597             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 598             stop_timer_armed = true;
 599         }
 600     }
 601     else {
 602         // The process is already dead.
 603         stopped();
 604     }
 605 }
 606
 607 void scripted_service::bring_down() noexcept
 608 {
 609         if (pid != -1) {
 610                 // We're already running the stop script; nothing to do.
 611                 return;
 612         }
 613
 614     if (stop_command.length() == 0) {
 615         stopped();
 616     }
 617     else if (! start_ps_process(stop_arg_parts, false)) {
 618         // Couldn't execute stop script, but there's not much we can do:
 619         stopped();
 620     }
 621     else {
 622         // successfully started stop script: start kill timer:
 623         if (stop_timeout != time_val(0,0)) {
 624             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 625             stop_timer_armed = true;
 626         }
 627     }
 628 }
 629
 630 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 631 {
 632     service->timer_expired();
 633
 634     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
 635     return dasynq::rearm::NOOP;
 636 }