src/proc-service.cc

   1 #include <sys/un.h>
   2 #include <sys/socket.h>
   3
   4 #include "dinit-socket.h"
   5 #include "dinit-util.h"
   6 #include "proc-service.h"
   7
   8 extern eventloop_t event_loop;
   9
  10 using clock_type = dasynq::clock_type;
  11 using rearm = dasynq::rearm;
  12 using time_val = dasynq::time_val;
  13
  14 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  15 // store a null terminator for the argument. Return a `char *` vector containing the beginning
  16 // of each argument and a trailing nullptr. (The returned array is invalidated if the string is later modified).
  17 std::vector<const char *> separate_args(std::string &s, std::list<std::pair<unsigned,unsigned>> &arg_indices)
  18 {
  19     std::vector<const char *> r;
  20     r.reserve(arg_indices.size() + 1);
  21
  22     // First store nul terminator for each part:
  23     for (auto index_pair : arg_indices) {
  24         if (index_pair.second < s.length()) {
  25             s[index_pair.second] = 0;
  26         }
  27     }
  28
  29     // Now we can get the C string (c_str) and store offsets into it:
  30     const char * cstr = s.c_str();
  31     for (auto index_pair : arg_indices) {
  32         r.push_back(cstr + index_pair.first);
  33     }
  34     r.push_back(nullptr);
  35     return r;
  36 }
  37
  38 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  39 {
  40     base_process_service *sr = service;
  41     sr->waiting_for_execstat = false;
  42
  43     int exec_status;
  44     int r = read(get_watched_fd(), &exec_status, sizeof(int));
  45     deregister(loop);
  46     close(get_watched_fd());
  47
  48     if (r > 0) {
  49         // We read an errno code; exec() failed, and the service startup failed.
  50         if (sr->pid != -1) {
  51             sr->child_listener.deregister(event_loop, sr->pid);
  52             sr->reserved_child_watch = false;
  53             if (sr->stop_timer_armed) {
  54                 sr->restart_timer.stop_timer(loop);
  55                 sr->stop_timer_armed = false;
  56             }
  57         }
  58         sr->pid = -1;
  59         sr->exec_failed(exec_status);
  60     }
  61     else {
  62         // exec() succeeded.
  63         if (sr->get_type() == service_type_t::PROCESS) {
  64             // This could be a smooth recovery (state already STARTED). Even more, the process
  65             // might be stopped (and killed via a signal) during smooth recovery.  We don't to
  66             // process startup again in either case, so we check for state STARTING:
  67             if (sr->get_state() == service_state_t::STARTING) {
  68                 sr->started();
  69             }
  70             else if (sr->get_state() == service_state_t::STOPPING) {
  71                 // stopping, but smooth recovery was in process. That's now over so we can
  72                 // commence normal stop. Note that if pid == -1 the process already stopped(!),
  73                 // that's handled below.
  74                 if (sr->pid != -1 && sr->stop_check_dependents()) {
  75                     sr->bring_down();
  76                 }
  77             }
  78         }
  79
  80         if (sr->pid == -1) {
  81             // Somehow the process managed to complete before we even saw the status.
  82             sr->handle_exit_status(sr->exit_status);
  83         }
  84     }
  85
  86     sr->services->process_queues();
  87
  88     return rearm::REMOVED;
  89 }
  90
  91 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
  92 {
  93     base_process_service *sr = service;
  94
  95     sr->pid = -1;
  96     sr->exit_status = status;
  97
  98     // Ok, for a process service, any process death which we didn't rig
  99     // ourselves is a bit... unexpected. Probably, the child died because
 100     // we asked it to (sr->service_state == STOPPING). But even if
 101     // we didn't, there's not much we can do.
 102
 103     if (sr->waiting_for_execstat) {
 104         // We still don't have an exec() status from the forked child, wait for that
 105         // before doing any further processing.
 106         return dasynq::rearm::NOOP; // hold watch reservation
 107     }
 108
 109     // Must stop watch now since handle_exit_status might result in re-launch:
 110     // (stop_watch instead of deregister, so that we hold watch reservation).
 111     stop_watch(loop);
 112
 113     if (sr->stop_timer_armed) {
 114         sr->restart_timer.stop_timer(loop);
 115         sr->stop_timer_armed = false;
 116     }
 117
 118     sr->handle_exit_status(status);
 119     return dasynq::rearm::NOOP;
 120 }
 121
 122 void process_service::handle_exit_status(int exit_status) noexcept
 123 {
 124     bool did_exit = WIFEXITED(exit_status);
 125     bool was_signalled = WIFSIGNALED(exit_status);
 126     restarting = false;
 127     auto service_state = get_state();
 128
 129     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
 130         if (did_exit) {
 131             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 132                     WEXITSTATUS(exit_status));
 133         }
 134         else if (was_signalled) {
 135             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 136                     WTERMSIG(exit_status));
 137         }
 138     }
 139
 140     if (service_state == service_state_t::STARTING) {
 141         if (did_exit && WEXITSTATUS(exit_status) == 0) {
 142             started();
 143         }
 144         else {
 145             failed_to_start();
 146         }
 147     }
 148     else if (service_state == service_state_t::STOPPING) {
 149         // We won't log a non-zero exit status or termination due to signal here -
 150         // we assume that the process died because we signalled it.
 151         stopped();
 152     }
 153     else if (smooth_recovery && service_state == service_state_t::STARTED
 154             && get_target_state() == service_state_t::STARTED) {
 155         do_smooth_recovery();
 156         return;
 157     }
 158     else {
 159         emergency_stop();
 160     }
 161     services->process_queues();
 162 }
 163
 164 void process_service::exec_failed(int errcode) noexcept
 165 {
 166     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 167     if (get_state() == service_state_t::STARTING) {
 168         failed_to_start();
 169     }
 170     else {
 171         // Process service in smooth recovery:
 172         emergency_stop();
 173     }
 174 }
 175
 176 void bgproc_service::handle_exit_status(int exit_status) noexcept
 177 {
 178     begin:
 179     bool did_exit = WIFEXITED(exit_status);
 180     bool was_signalled = WIFSIGNALED(exit_status);
 181     auto service_state = get_state();
 182
 183     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
 184         if (did_exit) {
 185             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 186                     WEXITSTATUS(exit_status));
 187         }
 188         else if (was_signalled) {
 189             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 190                     WTERMSIG(exit_status));
 191         }
 192     }
 193
 194     // This may be a "smooth recovery" where we are restarting the process while leaving the
 195     // service in the STARTED state.
 196     if (restarting && service_state == service_state_t::STARTED) {
 197         restarting = false;
 198         bool need_stop = false;
 199         if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) {
 200             need_stop = true;
 201         }
 202         else {
 203             // We need to re-read the PID, since it has now changed.
 204             if (pid_file.length() != 0) {
 205                 auto pid_result = read_pid_file(&exit_status);
 206                 switch (pid_result) {
 207                     case pid_result_t::FAILED:
 208                         // Failed startup: no auto-restart.
 209                         need_stop = true;
 210                         break;
 211                     case pid_result_t::TERMINATED:
 212                         goto begin;
 213                     case pid_result_t::OK:
 214                         break;
 215                 }
 216             }
 217         }
 218
 219         if (need_stop) {
 220             // Failed startup: no auto-restart.
 221             emergency_stop();
 222             services->process_queues();
 223         }
 224
 225         return;
 226     }
 227
 228     restarting = false;
 229     if (service_state == service_state_t::STARTING) {
 230         // POSIX requires that if the process exited clearly with a status code of 0,
 231         // the exit status value will be 0:
 232         if (exit_status == 0) {
 233             auto pid_result = read_pid_file(&exit_status);
 234             switch (pid_result) {
 235                 case pid_result_t::FAILED:
 236                     // Failed startup: no auto-restart.
 237                     failed_to_start();
 238                     break;
 239                 case pid_result_t::TERMINATED:
 240                     // started, but immediately terminated
 241                     started();
 242                     goto begin;
 243                 case pid_result_t::OK:
 244                     started();
 245                     break;
 246             }
 247         }
 248         else {
 249             failed_to_start();
 250         }
 251     }
 252     else if (service_state == service_state_t::STOPPING) {
 253         // We won't log a non-zero exit status or termination due to signal here -
 254         // we assume that the process died because we signalled it.
 255         stopped();
 256     }
 257     else {
 258         // we must be STARTED
 259         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
 260             do_smooth_recovery();
 261             return;
 262         }
 263         if (! do_auto_restart() && start_explicit) {
 264             start_explicit = false;
 265             release();
 266         }
 267         forced_stop();
 268         stop_dependents();
 269         stopped();
 270     }
 271     services->process_queues();
 272 }
 273
 274 void bgproc_service::exec_failed(int errcode) noexcept
 275 {
 276     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 277     // Only time we execute is for startup:
 278     failed_to_start();
 279 }
 280
 281 void scripted_service::handle_exit_status(int exit_status) noexcept
 282 {
 283     bool did_exit = WIFEXITED(exit_status);
 284     bool was_signalled = WIFSIGNALED(exit_status);
 285     auto service_state = get_state();
 286
 287     // For a scripted service, a termination occurs in one of three main cases:
 288     // - the start script completed (or failed), when service was STARTING
 289     // - the start script was interrupted to cancel startup; state is STOPPING
 290     // - the stop script complete (or failed), state is STOPPING
 291
 292     if (service_state == service_state_t::STOPPING) {
 293         // We might be running the stop script, or we might be running the start script and have issued
 294         // a cancel order via SIGINT:
 295         if (did_exit && WEXITSTATUS(exit_status) == 0) {
 296             if (interrupting_start) {
 297                 interrupting_start = false;
 298                 // launch stop script:
 299                 bring_down();
 300             }
 301             else {
 302                 // We were running the stop script and finished successfully
 303                 stopped();
 304             }
 305         }
 306         else {
 307             if (interrupting_start) {
 308                 // We issued a start interrupt, so we expected this failure:
 309                 if (did_exit) {
 310                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
 311                             WEXITSTATUS(exit_status));
 312                 }
 313                 else if (was_signalled) {
 314                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
 315                             WTERMSIG(exit_status));
 316                 }
 317             }
 318             else {
 319                 // ??? failed to stop! Let's log it as warning:
 320                 if (did_exit) {
 321                     log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
 322                             WEXITSTATUS(exit_status));
 323                 }
 324                 else if (was_signalled) {
 325                     log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
 326                             WTERMSIG(exit_status));
 327                 }
 328             }
 329             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
 330             // can be stopped. There's not really any other useful course of action here.
 331             interrupting_start = false;
 332             stopped();
 333         }
 334         services->process_queues();
 335     }
 336     else { // STARTING
 337         if (exit_status == 0) {
 338             started();
 339         }
 340         else {
 341             // failed to start
 342             if (did_exit) {
 343                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
 344                         WEXITSTATUS(exit_status));
 345             }
 346             else if (was_signalled) {
 347                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
 348                         WTERMSIG(exit_status));
 349             }
 350             failed_to_start();
 351         }
 352         services->process_queues();
 353     }
 354 }
 355
 356 void scripted_service::exec_failed(int errcode) noexcept
 357 {
 358     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 359     auto service_state = get_state();
 360     if (service_state == service_state_t::STARTING) {
 361         failed_to_start();
 362     }
 363     else if (service_state == service_state_t::STOPPING) {
 364         // We've logged the failure, but it's probably better not to leave the service in
 365         // STOPPING state:
 366         stopped();
 367     }
 368 }
 369
 370 bgproc_service::pid_result_t
 371 bgproc_service::read_pid_file(int *exit_status) noexcept
 372 {
 373     const char *pid_file_c = pid_file.c_str();
 374     int fd = open(pid_file_c, O_CLOEXEC);
 375     if (fd == -1) {
 376         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
 377         return pid_result_t::FAILED;
 378     }
 379
 380     char pidbuf[21]; // just enough to hold any 64-bit integer
 381     int r = ss_read(fd, pidbuf, 20);
 382     if (r < 0) {
 383         // Could not read from PID file
 384         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
 385         close(fd);
 386         return pid_result_t::FAILED;
 387     }
 388
 389     close(fd);
 390     pidbuf[r] = 0; // store nul terminator
 391
 392     bool valid_pid = false;
 393     try {
 394         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
 395         if (v <= std::numeric_limits<pid_t>::max()) {
 396             pid = (pid_t) v;
 397             valid_pid = true;
 398         }
 399     }
 400     catch (std::out_of_range &exc) {
 401         // Too large?
 402     }
 403     catch (std::invalid_argument &exc) {
 404         // Ok, so it doesn't look like a number: proceed...
 405     }
 406
 407     if (valid_pid) {
 408         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
 409         if (wait_r == -1 && errno == ECHILD) {
 410             // We can't track this child - check process exists:
 411             if (kill(pid, 0) == 0 || errno != ESRCH) {
 412                 tracking_child = false;
 413                 return pid_result_t::OK;
 414             }
 415             else {
 416                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 417                 pid = -1;
 418                 return pid_result_t::FAILED;
 419             }
 420         }
 421         else if (wait_r == pid) {
 422             pid = -1;
 423             return pid_result_t::TERMINATED;
 424         }
 425         else if (wait_r == 0) {
 426             // We can track the child
 427             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
 428             tracking_child = true;
 429             reserved_child_watch = true;
 430             return pid_result_t::OK;
 431         }
 432     }
 433
 434     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 435     pid = -1;
 436     return pid_result_t::FAILED;
 437 }
 438
 439 void process_service::bring_down() noexcept
 440 {
 441     waiting_for_deps = false;
 442     if (waiting_for_execstat) {
 443         // The process is still starting. This should be uncommon, but can occur during
 444         // smooth recovery. We can't do much now; we have to wait until we get the
 445         // status, and then act appropriately.
 446         return;
 447     }
 448     else if (pid != -1) {
 449         // The process is still kicking on - must actually kill it. We signal the process
 450         // group (-pid) rather than just the process as there's less risk then of creating
 451         // an orphaned process group:
 452         if (! onstart_flags.no_sigterm) {
 453             kill_pg(SIGTERM);
 454         }
 455         if (term_signal != -1) {
 456             kill_pg(term_signal);
 457         }
 458
 459         // In most cases, the rest is done in handle_exit_status.
 460         // If we are a BGPROCESS and the process is not our immediate child, however, that
 461         // won't work - check for this now:
 462         if (get_type() == service_type_t::BGPROCESS && ! tracking_child) {
 463             stopped();
 464         }
 465         else if (stop_timeout != time_val(0,0)) {
 466             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 467             stop_timer_armed = true;
 468         }
 469     }
 470     else {
 471         // The process is already dead.
 472         stopped();
 473     }
 474 }
 475
 476 void scripted_service::bring_down() noexcept
 477 {
 478     waiting_for_deps = false;
 479     if (stop_command.length() == 0) {
 480         stopped();
 481     }
 482     else if (! start_ps_process(stop_arg_parts, false)) {
 483         // Couldn't execute stop script, but there's not much we can do:
 484         stopped();
 485     }
 486     else {
 487         // successfully started stop script: start kill timer:
 488         if (stop_timeout != time_val(0,0)) {
 489             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 490             stop_timer_armed = true;
 491         }
 492     }
 493 }
 494
 495 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 496 {
 497     service->stop_timer_armed = false;
 498
 499     // Timer expires if:
 500     // We are stopping, including after having startup cancelled (stop timeout, state is STOPPING); We are
 501     // starting (start timeout, state is STARTING); We are waiting for restart timer before restarting,
 502     // including smooth recovery (restart timeout, state is STARTING or STARTED).
 503     if (service->get_state() == service_state_t::STOPPING) {
 504         service->kill_with_fire();
 505     }
 506     else if (service->pid != -1) {
 507         // Starting, start timed out.
 508         service->stop_dependents();
 509         service->interrupt_start();
 510     }
 511     else {
 512         // STARTING / STARTED, and we have a pid: must be restarting (smooth recovery if STARTED)
 513         service->do_restart();
 514     }
 515
 516     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
 517     return dasynq::rearm::NOOP;
 518 }