src/proc-service.cc

   1 #include <cstring>
   2
   3 #include <sys/un.h>
   4 #include <sys/socket.h>
   5
   6 #include "dinit.h"
   7 #include "dinit-socket.h"
   8 #include "dinit-util.h"
   9 #include "dinit-log.h"
  10 #include "proc-service.h"
  11
  12 /*
  13  * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  14  *
  15  * See proc-service.h header for interface details.
  16  */
  17
  18 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  19 // store a null terminator for the argument. Return a `char *` vector containing the beginning
  20 // of each argument and a trailing nullptr. (The returned array is invalidated if the string is
  21 // later modified).
  22 std::vector<const char *> separate_args(std::string &s,
  23         const std::list<std::pair<unsigned,unsigned>> &arg_indices)
  24 {
  25     std::vector<const char *> r;
  26     r.reserve(arg_indices.size() + 1);
  27
  28     // First store nul terminator for each part:
  29     for (auto index_pair : arg_indices) {
  30         if (index_pair.second < s.length()) {
  31             s[index_pair.second] = 0;
  32         }
  33     }
  34
  35     // Now we can get the C string (c_str) and store offsets into it:
  36     const char * cstr = s.c_str();
  37     for (auto index_pair : arg_indices) {
  38         r.push_back(cstr + index_pair.first);
  39     }
  40     r.push_back(nullptr);
  41     return r;
  42 }
  43
  44 void process_service::exec_succeeded() noexcept
  45 {
  46     // This could be a smooth recovery (state already STARTED). Even more, the process
  47     // might be stopped (and killed via a signal) during smooth recovery.  We don't to
  48     // process startup again in either case, so we check for state STARTING:
  49     if (get_state() == service_state_t::STARTING) {
  50         if (force_notification_fd != -1 || !notification_var.empty()) {
  51             // Wait for readiness notification:
  52             readiness_watcher.set_enabled(event_loop, true);
  53         }
  54         else {
  55             started();
  56         }
  57     }
  58     else if (get_state() == service_state_t::STOPPING) {
  59         // stopping, but smooth recovery was in process. That's now over so we can
  60         // commence normal stop. Note that if pid == -1 the process already stopped(!),
  61         // that's handled below.
  62         if (pid != -1 && stop_check_dependents()) {
  63             bring_down();
  64         }
  65     }
  66 }
  67
  68 void scripted_service::exec_succeeded() noexcept
  69 {
  70         // For a scripted service, this means nothing other than that the start/stop
  71         // script will now begin.
  72 }
  73
  74 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  75 {
  76     base_process_service *sr = service;
  77     sr->waiting_for_execstat = false;
  78
  79     int exec_status;
  80     int r = read(get_watched_fd(), &exec_status, sizeof(int));
  81     deregister(loop);
  82     close(get_watched_fd());
  83
  84     if (r > 0) {
  85         // We read an errno code; exec() failed, and the service startup failed.
  86         if (sr->pid != -1) {
  87             sr->child_listener.deregister(event_loop, sr->pid);
  88             sr->reserved_child_watch = false;
  89             if (sr->stop_timer_armed) {
  90                 sr->restart_timer.stop_timer(loop);
  91                 sr->stop_timer_armed = false;
  92             }
  93         }
  94         sr->pid = -1;
  95         sr->exec_failed(exec_status);
  96     }
  97     else {
  98         sr->exec_succeeded();
  99
 100         if (sr->pid == -1) {
 101             // Somehow the process managed to complete before we even saw the exec() status.
 102             sr->handle_exit_status(sr->exit_status);
 103         }
 104     }
 105
 106     sr->services->process_queues();
 107
 108     return rearm::REMOVED;
 109 }
 110
 111 rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
 112 {
 113     char buf[128];
 114     if (service->get_state() == service_state_t::STARTING) {
 115         // can we actually read anything from the notification pipe?
 116         int r = bp_sys::read(fd, buf, sizeof(buf));
 117         if (r > 0) {
 118             service->started();
 119         }
 120         else if (r == 0 || errno != EAGAIN) {
 121             service->failed_to_start(false, false);
 122             service->set_state(service_state_t::STOPPING);
 123             service->bring_down();
 124         }
 125     }
 126     else {
 127         // Just keep consuming data from the pipe:
 128         int r = bp_sys::read(fd, buf, sizeof(buf));
 129         if (r == 0) {
 130             // Process closed write end or terminated
 131             close(fd);
 132             service->notification_fd = -1;
 133             return rearm::DISARM;
 134         }
 135     }
 136     return rearm::REARM;
 137 }
 138
 139 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
 140 {
 141     base_process_service *sr = service;
 142
 143     sr->pid = -1;
 144     sr->exit_status = bp_sys::exit_status(status);
 145
 146     // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
 147     // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
 148     // didn't, there's not much we can do.
 149
 150     if (sr->waiting_for_execstat) {
 151         // We still don't have an exec() status from the forked child, wait for that
 152         // before doing any further processing.
 153         return dasynq::rearm::NOOP; // hold watch reservation
 154     }
 155
 156     // Must stop watch now since handle_exit_status might result in re-launch:
 157     // (stop_watch instead of deregister, so that we hold watch reservation).
 158     stop_watch(loop);
 159
 160     if (sr->stop_timer_armed) {
 161         sr->restart_timer.stop_timer(loop);
 162         sr->stop_timer_armed = false;
 163     }
 164
 165     sr->handle_exit_status(bp_sys::exit_status(status));
 166     return dasynq::rearm::NOOP;
 167 }
 168
 169 void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 170 {
 171     bool did_exit = exit_status.did_exit();
 172     bool was_signalled = exit_status.was_signalled();
 173     restarting = false;
 174     auto service_state = get_state();
 175
 176     if (notification_fd != -1) {
 177         readiness_watcher.deregister(event_loop);
 178         bp_sys::close(notification_fd);
 179         notification_fd = -1;
 180     }
 181
 182     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 183         if (did_exit) {
 184             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 185                     exit_status.get_exit_status());
 186         }
 187         else if (was_signalled) {
 188             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 189                     exit_status.get_term_sig());
 190         }
 191     }
 192
 193 #if USE_UTMPX
 194     if (*inittab_id || *inittab_line) {
 195         clear_utmp_entry(inittab_id, inittab_line);
 196     }
 197 #endif
 198
 199     if (service_state == service_state_t::STARTING) {
 200         // If state is STARTING, we must be waiting for readiness notification; the process has
 201         // terminated before becoming ready.
 202         stop_reason = stopped_reason_t::FAILED;
 203         failed_to_start();
 204     }
 205     else if (service_state == service_state_t::STOPPING) {
 206         // We won't log a non-zero exit status or termination due to signal here -
 207         // we assume that the process died because we signalled it.
 208         if (stop_timer_armed) {
 209             restart_timer.stop_timer(event_loop);
 210         }
 211         stopped();
 212     }
 213     else if (smooth_recovery && service_state == service_state_t::STARTED
 214             && get_target_state() == service_state_t::STARTED) {
 215         do_smooth_recovery();
 216         return;
 217     }
 218     else {
 219         stop_reason = stopped_reason_t::TERMINATED;
 220         emergency_stop();
 221     }
 222     services->process_queues();
 223 }
 224
 225 void process_service::exec_failed(int errcode) noexcept
 226 {
 227     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 228
 229     if (notification_fd != -1) {
 230         readiness_watcher.deregister(event_loop);
 231         bp_sys::close(notification_fd);
 232         notification_fd = -1;
 233     }
 234
 235     if (get_state() == service_state_t::STARTING) {
 236         stop_reason = stopped_reason_t::EXECFAILED;
 237         failed_to_start();
 238     }
 239     else {
 240         // Process service in smooth recovery:
 241         stop_reason = stopped_reason_t::TERMINATED;
 242         emergency_stop();
 243     }
 244 }
 245
 246 void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 247 {
 248     begin:
 249     bool did_exit = exit_status.did_exit();
 250     bool was_signalled = exit_status.was_signalled();
 251     auto service_state = get_state();
 252
 253     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 254         if (did_exit) {
 255             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 256                     exit_status.get_exit_status());
 257         }
 258         else if (was_signalled) {
 259             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 260                     exit_status.get_term_sig());
 261         }
 262     }
 263
 264     // This may be a "smooth recovery" where we are restarting the process while leaving the
 265     // service in the STARTED state.
 266     if (restarting && service_state == service_state_t::STARTED) {
 267         restarting = false;
 268         bool need_stop = false;
 269         if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
 270             need_stop = true;
 271         }
 272         else {
 273             // We need to re-read the PID, since it has now changed.
 274             if (pid_file.length() != 0) {
 275                 auto pid_result = read_pid_file(&exit_status);
 276                 switch (pid_result) {
 277                     case pid_result_t::FAILED:
 278                         // Failed startup: no auto-restart.
 279                         need_stop = true;
 280                         break;
 281                     case pid_result_t::TERMINATED:
 282                         goto begin;
 283                     case pid_result_t::OK:
 284                         break;
 285                 }
 286             }
 287         }
 288
 289         if (need_stop) {
 290             // Failed startup: no auto-restart.
 291             stop_reason = stopped_reason_t::TERMINATED;
 292             emergency_stop();
 293             services->process_queues();
 294         }
 295
 296         return;
 297     }
 298
 299     restarting = false;
 300     if (service_state == service_state_t::STARTING) {
 301         // POSIX requires that if the process exited clearly with a status code of 0,
 302         // the exit status value will be 0:
 303         if (exit_status.did_exit_clean()) {
 304             auto pid_result = read_pid_file(&exit_status);
 305             switch (pid_result) {
 306                 case pid_result_t::FAILED:
 307                     // Failed startup: no auto-restart.
 308                     stop_reason = stopped_reason_t::FAILED;
 309                     failed_to_start();
 310                     break;
 311                 case pid_result_t::TERMINATED:
 312                     // started, but immediately terminated
 313                     started();
 314                     goto begin;
 315                 case pid_result_t::OK:
 316                     started();
 317                     break;
 318             }
 319         }
 320         else {
 321             stop_reason = stopped_reason_t::FAILED;
 322             failed_to_start();
 323         }
 324     }
 325     else if (service_state == service_state_t::STOPPING) {
 326         // We won't log a non-zero exit status or termination due to signal here -
 327         // we assume that the process died because we signalled it.
 328         stopped();
 329     }
 330     else {
 331         // we must be STARTED
 332         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
 333             do_smooth_recovery();
 334             return;
 335         }
 336         if (! do_auto_restart() && start_explicit) {
 337             start_explicit = false;
 338             release(false);
 339         }
 340         stop_reason = stopped_reason_t::TERMINATED;
 341         forced_stop();
 342         stop_dependents();
 343         stopped();
 344     }
 345     services->process_queues();
 346 }
 347
 348 void bgproc_service::exec_failed(int errcode) noexcept
 349 {
 350     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 351     // Only time we execute is for startup:
 352     stop_reason = stopped_reason_t::EXECFAILED;
 353     failed_to_start();
 354 }
 355
 356 void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 357 {
 358     bool did_exit = exit_status.did_exit();
 359     bool was_signalled = exit_status.was_signalled();
 360     auto service_state = get_state();
 361
 362     // For a scripted service, a termination occurs in one of three main cases:
 363     // - the start script completed (or failed), when service was STARTING
 364     // - the start script was interrupted to cancel startup; state is STOPPING
 365     // - the stop script complete (or failed), state is STOPPING
 366
 367     if (service_state == service_state_t::STOPPING) {
 368         // We might be running the stop script, or we might be running the start script and have issued
 369         // a cancel order via SIGINT:
 370         if (interrupting_start) {
 371             // We issued a start interrupt, so we expected this failure:
 372             if (did_exit && exit_status.get_exit_status() != 0) {
 373                 log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
 374                         exit_status.get_exit_status());
 375                 // Assume that a command terminating normally requires no cleanup:
 376                 stopped();
 377             }
 378             else {
 379                 if (was_signalled) {
 380                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
 381                             exit_status.get_term_sig());
 382                 }
 383                 // If the start script completed successfully, or was interrupted via our signal,
 384                 // we want to run the stop script to clean up:
 385                 bring_down();
 386             }
 387             interrupting_start = false;
 388         }
 389         else if (exit_status.did_exit_clean()) {
 390             // We were running the stop script and finished successfully
 391             stopped();
 392         }
 393         else {
 394             // ??? failed to stop! Let's log it as warning:
 395             if (did_exit) {
 396                 log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
 397                         exit_status.get_exit_status());
 398             }
 399             else if (was_signalled) {
 400                 log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
 401                         exit_status.get_term_sig());
 402             }
 403             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
 404             // can be stopped. There's not really any other useful course of action here.
 405             stopped();
 406         }
 407         services->process_queues();
 408     }
 409     else { // STARTING
 410         if (exit_status.did_exit_clean()) {
 411             started();
 412         }
 413         else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
 414             // A skippable service can be skipped by interrupting (eg by ^C if the service
 415             // starts on the console).
 416             start_skipped = true;
 417             started();
 418         }
 419         else {
 420             // failed to start
 421             if (did_exit) {
 422                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
 423                         exit_status.get_exit_status());
 424             }
 425             else if (was_signalled) {
 426                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
 427                         exit_status.get_term_sig());
 428             }
 429             stop_reason = stopped_reason_t::FAILED;
 430             failed_to_start();
 431         }
 432         services->process_queues();
 433     }
 434 }
 435
 436 void scripted_service::exec_failed(int errcode) noexcept
 437 {
 438     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 439     auto service_state = get_state();
 440     if (service_state == service_state_t::STARTING) {
 441         stop_reason = stopped_reason_t::EXECFAILED;
 442         failed_to_start();
 443     }
 444     else if (service_state == service_state_t::STOPPING) {
 445         // We've logged the failure, but it's probably better not to leave the service in
 446         // STOPPING state:
 447         stopped();
 448     }
 449 }
 450
 451 bgproc_service::pid_result_t
 452 bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
 453 {
 454     const char *pid_file_c = pid_file.c_str();
 455     int fd = open(pid_file_c, O_CLOEXEC);
 456     if (fd == -1) {
 457         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
 458         return pid_result_t::FAILED;
 459     }
 460
 461     char pidbuf[21]; // just enough to hold any 64-bit integer
 462     int r = complete_read(fd, pidbuf, 20);
 463     if (r < 0) {
 464         // Could not read from PID file
 465         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
 466         close(fd);
 467         return pid_result_t::FAILED;
 468     }
 469
 470     close(fd);
 471     pidbuf[r] = 0; // store nul terminator
 472
 473     bool valid_pid = false;
 474     try {
 475         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
 476         if (v <= std::numeric_limits<pid_t>::max()) {
 477             pid = (pid_t) v;
 478             valid_pid = true;
 479         }
 480     }
 481     catch (std::out_of_range &exc) {
 482         // Too large?
 483     }
 484     catch (std::invalid_argument &exc) {
 485         // Ok, so it doesn't look like a number: proceed...
 486     }
 487
 488     if (valid_pid) {
 489         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
 490         if (wait_r == -1 && errno == ECHILD) {
 491             // We can't track this child - check process exists:
 492             if (kill(pid, 0) == 0 || errno != ESRCH) {
 493                 tracking_child = false;
 494                 return pid_result_t::OK;
 495             }
 496             else {
 497                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 498                 pid = -1;
 499                 return pid_result_t::FAILED;
 500             }
 501         }
 502         else if (wait_r == pid) {
 503             pid = -1;
 504             return pid_result_t::TERMINATED;
 505         }
 506         else if (wait_r == 0) {
 507             // We can track the child
 508             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
 509             tracking_child = true;
 510             reserved_child_watch = true;
 511             return pid_result_t::OK;
 512         }
 513     }
 514
 515     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 516     pid = -1;
 517     return pid_result_t::FAILED;
 518 }
 519
 520 void process_service::bring_down() noexcept
 521 {
 522     if (waiting_for_execstat) {
 523         // The process is still starting. This should be uncommon, but can occur during
 524         // smooth recovery. We can't do much now; we have to wait until we get the
 525         // status, and then act appropriately.
 526         return;
 527     }
 528     else if (pid != -1) {
 529         // The process is still kicking on - must actually kill it. We signal the process
 530         // group (-pid) rather than just the process as there's less risk then of creating
 531         // an orphaned process group:
 532         if (! onstart_flags.no_sigterm) {
 533             kill_pg(SIGTERM);
 534         }
 535         if (term_signal != -1) {
 536             kill_pg(term_signal);
 537         }
 538
 539         // If there's a stop timeout, arm the timer now:
 540         if (stop_timeout != time_val(0,0)) {
 541             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 542             stop_timer_armed = true;
 543         }
 544
 545         // The rest is done in handle_exit_status.
 546     }
 547     else {
 548         // The process is already dead.
 549         stopped();
 550     }
 551 }
 552
 553 void bgproc_service::bring_down() noexcept
 554 {
 555     if (pid != -1) {
 556         // The process is still kicking on - must actually kill it. We signal the process
 557         // group (-pid) rather than just the process as there's less risk then of creating
 558         // an orphaned process group:
 559         if (! onstart_flags.no_sigterm) {
 560             kill_pg(SIGTERM);
 561         }
 562         if (term_signal != -1) {
 563             kill_pg(term_signal);
 564         }
 565
 566         // In most cases, the rest is done in handle_exit_status.
 567         // If we are a BGPROCESS and the process is not our immediate child, however, that
 568         // won't work - check for this now:
 569         if (! tracking_child) {
 570             stopped();
 571         }
 572         else if (stop_timeout != time_val(0,0)) {
 573             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 574             stop_timer_armed = true;
 575         }
 576     }
 577     else {
 578         // The process is already dead.
 579         stopped();
 580     }
 581 }
 582
 583 void scripted_service::bring_down() noexcept
 584 {
 585         if (pid != -1) {
 586                 // We're already running the stop script; nothing to do.
 587                 return;
 588         }
 589
 590     if (stop_command.length() == 0) {
 591         stopped();
 592     }
 593     else if (! start_ps_process(stop_arg_parts, false)) {
 594         // Couldn't execute stop script, but there's not much we can do:
 595         stopped();
 596     }
 597     else {
 598         // successfully started stop script: start kill timer:
 599         if (stop_timeout != time_val(0,0)) {
 600             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 601             stop_timer_armed = true;
 602         }
 603     }
 604 }
 605
 606 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 607 {
 608     service->timer_expired();
 609
 610     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
 611     return dasynq::rearm::NOOP;
 612 }