src/proc-service.cc

   1 #include <cstring>
   2
   3 #include <sys/un.h>
   4 #include <sys/socket.h>
   5
   6 #include "dinit.h"
   7 #include "dinit-socket.h"
   8 #include "dinit-util.h"
   9 #include "dinit-log.h"
  10 #include "proc-service.h"
  11
  12 /*
  13  * Most of the implementation for process-based services (process, scripted, bgprocess) is here.
  14  *
  15  * See proc-service.h header for interface details.
  16  */
  17
  18 // Given a string and a list of pairs of (start,end) indices for each argument in that string,
  19 // store a null terminator for the argument. Return a `char *` vector containing the beginning
  20 // of each argument and a trailing nullptr. (The returned array is invalidated if the string is
  21 // later modified).
  22 std::vector<const char *> separate_args(std::string &s,
  23         const std::list<std::pair<unsigned,unsigned>> &arg_indices)
  24 {
  25     std::vector<const char *> r;
  26     r.reserve(arg_indices.size() + 1);
  27
  28     // First store nul terminator for each part:
  29     for (auto index_pair : arg_indices) {
  30         if (index_pair.second < s.length()) {
  31             s[index_pair.second] = 0;
  32         }
  33     }
  34
  35     // Now we can get the C string (c_str) and store offsets into it:
  36     const char * cstr = s.c_str();
  37     for (auto index_pair : arg_indices) {
  38         r.push_back(cstr + index_pair.first);
  39     }
  40     r.push_back(nullptr);
  41     return r;
  42 }
  43
  44 void process_service::exec_succeeded() noexcept
  45 {
  46     // This could be a smooth recovery (state already STARTED). Even more, the process
  47     // might be stopped (and killed via a signal) during smooth recovery.  We don't to
  48     // process startup again in either case, so we check for state STARTING:
  49     if (get_state() == service_state_t::STARTING) {
  50         if (force_notification_fd != -1 || !notification_var.empty()) {
  51             // Wait for readiness notification:
  52             readiness_watcher.set_enabled(event_loop, true);
  53         }
  54         else {
  55             started();
  56         }
  57     }
  58     else if (get_state() == service_state_t::STOPPING) {
  59         // stopping, but smooth recovery was in process. That's now over so we can
  60         // commence normal stop. Note that if pid == -1 the process already stopped(!),
  61         // that's handled below.
  62         if (pid != -1 && stop_check_dependents()) {
  63             bring_down();
  64         }
  65     }
  66 }
  67
  68 void scripted_service::exec_succeeded() noexcept
  69 {
  70         // For a scripted service, this means nothing other than that the start/stop
  71         // script will now begin.
  72 }
  73
  74 rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
  75 {
  76     base_process_service *sr = service;
  77     sr->waiting_for_execstat = false;
  78
  79     int exec_status;
  80     int r = read(get_watched_fd(), &exec_status, sizeof(int));
  81     deregister(loop);
  82     close(get_watched_fd());
  83
  84     if (r > 0) {
  85         // We read an errno code; exec() failed, and the service startup failed.
  86         if (sr->pid != -1) {
  87             sr->child_listener.deregister(event_loop, sr->pid);
  88             sr->reserved_child_watch = false;
  89             if (sr->stop_timer_armed) {
  90                 sr->restart_timer.stop_timer(loop);
  91                 sr->stop_timer_armed = false;
  92             }
  93         }
  94         sr->pid = -1;
  95         sr->exec_failed(exec_status);
  96     }
  97     else {
  98         sr->exec_succeeded();
  99
 100         if (sr->pid == -1) {
 101             // Somehow the process managed to complete before we even saw the exec() status.
 102             sr->handle_exit_status(sr->exit_status);
 103         }
 104     }
 105
 106     sr->services->process_queues();
 107
 108     return rearm::REMOVED;
 109 }
 110
 111 rearm ready_notify_watcher::fd_event(eventloop_t &, int fd, int flags) noexcept
 112 {
 113     char buf[128];
 114     if (service->get_state() == service_state_t::STARTING) {
 115         // can we actually read anything from the notification pipe?
 116         int r = bp_sys::read(fd, buf, sizeof(buf));
 117         if (r > 0) {
 118             service->started();
 119         }
 120         else if (r == 0 || errno != EAGAIN) {
 121             service->failed_to_start(false, false);
 122             service->set_state(service_state_t::STOPPING);
 123             service->bring_down();
 124         }
 125     }
 126     else {
 127         // Just keep consuming data from the pipe:
 128         int r = bp_sys::read(fd, buf, sizeof(buf));
 129         if (r == 0) {
 130             // Process closed write end or terminated
 131             close(fd);
 132             service->notification_fd = -1;
 133             return rearm::DISARM;
 134         }
 135     }
 136     return rearm::REARM;
 137 }
 138
 139 dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
 140 {
 141     base_process_service *sr = service;
 142
 143     sr->pid = -1;
 144     sr->exit_status = bp_sys::exit_status(status);
 145
 146     // Ok, for a process service, any process death which we didn't rig ourselves is a bit... unexpected.
 147     // Probably, the child died because we asked it to (sr->service_state == STOPPING). But even if we
 148     // didn't, there's not much we can do.
 149
 150     if (sr->waiting_for_execstat) {
 151         // We still don't have an exec() status from the forked child, wait for that
 152         // before doing any further processing.
 153         return dasynq::rearm::NOOP; // hold watch reservation
 154     }
 155
 156     // Must stop watch now since handle_exit_status might result in re-launch:
 157     // (stop_watch instead of deregister, so that we hold watch reservation).
 158     stop_watch(loop);
 159
 160     if (sr->stop_timer_armed) {
 161         sr->restart_timer.stop_timer(loop);
 162         sr->stop_timer_armed = false;
 163     }
 164
 165     sr->handle_exit_status(bp_sys::exit_status(status));
 166     return dasynq::rearm::NOOP;
 167 }
 168
 169 void process_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 170 {
 171     bool did_exit = exit_status.did_exit();
 172     bool was_signalled = exit_status.was_signalled();
 173     restarting = false;
 174     auto service_state = get_state();
 175
 176     if (notification_fd != -1) {
 177         readiness_watcher.deregister(event_loop);
 178         bp_sys::close(notification_fd);
 179         notification_fd = -1;
 180     }
 181
 182     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 183         if (did_exit) {
 184             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 185                     exit_status.get_exit_status());
 186         }
 187         else if (was_signalled) {
 188             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 189                     exit_status.get_term_sig());
 190         }
 191     }
 192
 193     if (service_state == service_state_t::STARTING) {
 194         // If state is STARTING, we must be waiting for readiness notification; the process has
 195         // terminated before becoming ready.
 196         stop_reason = stopped_reason_t::FAILED;
 197         failed_to_start();
 198     }
 199     else if (service_state == service_state_t::STOPPING) {
 200         // We won't log a non-zero exit status or termination due to signal here -
 201         // we assume that the process died because we signalled it.
 202         if (stop_timer_armed) {
 203             restart_timer.stop_timer(event_loop);
 204         }
 205         stopped();
 206     }
 207     else if (smooth_recovery && service_state == service_state_t::STARTED
 208             && get_target_state() == service_state_t::STARTED) {
 209         do_smooth_recovery();
 210         return;
 211     }
 212     else {
 213         stop_reason = stopped_reason_t::TERMINATED;
 214         emergency_stop();
 215     }
 216     services->process_queues();
 217 }
 218
 219 void process_service::exec_failed(int errcode) noexcept
 220 {
 221     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 222
 223     if (notification_fd != -1) {
 224         readiness_watcher.deregister(event_loop);
 225         bp_sys::close(notification_fd);
 226         notification_fd = -1;
 227     }
 228
 229     if (get_state() == service_state_t::STARTING) {
 230         stop_reason = stopped_reason_t::EXECFAILED;
 231         failed_to_start();
 232     }
 233     else {
 234         // Process service in smooth recovery:
 235         stop_reason = stopped_reason_t::TERMINATED;
 236         emergency_stop();
 237     }
 238 }
 239
 240 void bgproc_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 241 {
 242     begin:
 243     bool did_exit = exit_status.did_exit();
 244     bool was_signalled = exit_status.was_signalled();
 245     auto service_state = get_state();
 246
 247     if (!exit_status.did_exit_clean() && service_state != service_state_t::STOPPING) {
 248         if (did_exit) {
 249             log(loglevel_t::ERROR, "Service ", get_name(), " process terminated with exit code ",
 250                     exit_status.get_exit_status());
 251         }
 252         else if (was_signalled) {
 253             log(loglevel_t::ERROR, "Service ", get_name(), " terminated due to signal ",
 254                     exit_status.get_term_sig());
 255         }
 256     }
 257
 258     // This may be a "smooth recovery" where we are restarting the process while leaving the
 259     // service in the STARTED state.
 260     if (restarting && service_state == service_state_t::STARTED) {
 261         restarting = false;
 262         bool need_stop = false;
 263         if ((did_exit && exit_status.get_exit_status() != 0) || was_signalled) {
 264             need_stop = true;
 265         }
 266         else {
 267             // We need to re-read the PID, since it has now changed.
 268             if (pid_file.length() != 0) {
 269                 auto pid_result = read_pid_file(&exit_status);
 270                 switch (pid_result) {
 271                     case pid_result_t::FAILED:
 272                         // Failed startup: no auto-restart.
 273                         need_stop = true;
 274                         break;
 275                     case pid_result_t::TERMINATED:
 276                         goto begin;
 277                     case pid_result_t::OK:
 278                         break;
 279                 }
 280             }
 281         }
 282
 283         if (need_stop) {
 284             // Failed startup: no auto-restart.
 285             stop_reason = stopped_reason_t::TERMINATED;
 286             emergency_stop();
 287             services->process_queues();
 288         }
 289
 290         return;
 291     }
 292
 293     restarting = false;
 294     if (service_state == service_state_t::STARTING) {
 295         // POSIX requires that if the process exited clearly with a status code of 0,
 296         // the exit status value will be 0:
 297         if (exit_status.did_exit_clean()) {
 298             auto pid_result = read_pid_file(&exit_status);
 299             switch (pid_result) {
 300                 case pid_result_t::FAILED:
 301                     // Failed startup: no auto-restart.
 302                     stop_reason = stopped_reason_t::FAILED;
 303                     failed_to_start();
 304                     break;
 305                 case pid_result_t::TERMINATED:
 306                     // started, but immediately terminated
 307                     started();
 308                     goto begin;
 309                 case pid_result_t::OK:
 310                     started();
 311                     break;
 312             }
 313         }
 314         else {
 315             stop_reason = stopped_reason_t::FAILED;
 316             failed_to_start();
 317         }
 318     }
 319     else if (service_state == service_state_t::STOPPING) {
 320         // We won't log a non-zero exit status or termination due to signal here -
 321         // we assume that the process died because we signalled it.
 322         stopped();
 323     }
 324     else {
 325         // we must be STARTED
 326         if (smooth_recovery && get_target_state() == service_state_t::STARTED) {
 327             do_smooth_recovery();
 328             return;
 329         }
 330         if (! do_auto_restart() && start_explicit) {
 331             start_explicit = false;
 332             release(false);
 333         }
 334         stop_reason = stopped_reason_t::TERMINATED;
 335         forced_stop();
 336         stop_dependents();
 337         stopped();
 338     }
 339     services->process_queues();
 340 }
 341
 342 void bgproc_service::exec_failed(int errcode) noexcept
 343 {
 344     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 345     // Only time we execute is for startup:
 346     stop_reason = stopped_reason_t::EXECFAILED;
 347     failed_to_start();
 348 }
 349
 350 void scripted_service::handle_exit_status(bp_sys::exit_status exit_status) noexcept
 351 {
 352     bool did_exit = exit_status.did_exit();
 353     bool was_signalled = exit_status.was_signalled();
 354     auto service_state = get_state();
 355
 356     // For a scripted service, a termination occurs in one of three main cases:
 357     // - the start script completed (or failed), when service was STARTING
 358     // - the start script was interrupted to cancel startup; state is STOPPING
 359     // - the stop script complete (or failed), state is STOPPING
 360
 361     if (service_state == service_state_t::STOPPING) {
 362         // We might be running the stop script, or we might be running the start script and have issued
 363         // a cancel order via SIGINT:
 364         if (interrupting_start) {
 365             // We issued a start interrupt, so we expected this failure:
 366             if (did_exit && exit_status.get_exit_status() != 0) {
 367                 log(loglevel_t::INFO, "Service ", get_name(), " start cancelled; exit code ",
 368                         exit_status.get_exit_status());
 369                 // Assume that a command terminating normally requires no cleanup:
 370                 stopped();
 371             }
 372             else {
 373                 if (was_signalled) {
 374                     log(loglevel_t::INFO, "Service ", get_name(), " start cancelled from signal ",
 375                             exit_status.get_term_sig());
 376                 }
 377                 // If the start script completed successfully, or was interrupted via our signal,
 378                 // we want to run the stop script to clean up:
 379                 bring_down();
 380             }
 381             interrupting_start = false;
 382         }
 383         else if (exit_status.did_exit_clean()) {
 384             // We were running the stop script and finished successfully
 385             stopped();
 386         }
 387         else {
 388             // ??? failed to stop! Let's log it as warning:
 389             if (did_exit) {
 390                 log(loglevel_t::WARN, "Service ", get_name(), " stop command failed with exit code ",
 391                         exit_status.get_exit_status());
 392             }
 393             else if (was_signalled) {
 394                 log(loglevel_t::WARN, "Service ", get_name(), " stop command terminated due to signal ",
 395                         exit_status.get_term_sig());
 396             }
 397             // Even if the stop script failed, assume that service is now stopped, so that any dependencies
 398             // can be stopped. There's not really any other useful course of action here.
 399             stopped();
 400         }
 401         services->process_queues();
 402     }
 403     else { // STARTING
 404         if (exit_status.did_exit_clean()) {
 405             started();
 406         }
 407         else if (was_signalled && exit_status.get_term_sig() == SIGINT && onstart_flags.skippable) {
 408             // A skippable service can be skipped by interrupting (eg by ^C if the service
 409             // starts on the console).
 410             start_skipped = true;
 411             started();
 412         }
 413         else {
 414             // failed to start
 415             if (did_exit) {
 416                 log(loglevel_t::ERROR, "Service ", get_name(), " command failed with exit code ",
 417                         exit_status.get_exit_status());
 418             }
 419             else if (was_signalled) {
 420                 log(loglevel_t::ERROR, "Service ", get_name(), " command terminated due to signal ",
 421                         exit_status.get_term_sig());
 422             }
 423             stop_reason = stopped_reason_t::FAILED;
 424             failed_to_start();
 425         }
 426         services->process_queues();
 427     }
 428 }
 429
 430 void scripted_service::exec_failed(int errcode) noexcept
 431 {
 432     log(loglevel_t::ERROR, get_name(), ": execution failed: ", strerror(errcode));
 433     auto service_state = get_state();
 434     if (service_state == service_state_t::STARTING) {
 435         stop_reason = stopped_reason_t::EXECFAILED;
 436         failed_to_start();
 437     }
 438     else if (service_state == service_state_t::STOPPING) {
 439         // We've logged the failure, but it's probably better not to leave the service in
 440         // STOPPING state:
 441         stopped();
 442     }
 443 }
 444
 445 bgproc_service::pid_result_t
 446 bgproc_service::read_pid_file(bp_sys::exit_status *exit_status) noexcept
 447 {
 448     const char *pid_file_c = pid_file.c_str();
 449     int fd = open(pid_file_c, O_CLOEXEC);
 450     if (fd == -1) {
 451         log(loglevel_t::ERROR, get_name(), ": read pid file: ", strerror(errno));
 452         return pid_result_t::FAILED;
 453     }
 454
 455     char pidbuf[21]; // just enough to hold any 64-bit integer
 456     int r = complete_read(fd, pidbuf, 20);
 457     if (r < 0) {
 458         // Could not read from PID file
 459         log(loglevel_t::ERROR, get_name(), ": could not read from pidfile; ", strerror(errno));
 460         close(fd);
 461         return pid_result_t::FAILED;
 462     }
 463
 464     close(fd);
 465     pidbuf[r] = 0; // store nul terminator
 466
 467     bool valid_pid = false;
 468     try {
 469         unsigned long long v = std::stoull(pidbuf, nullptr, 0);
 470         if (v <= std::numeric_limits<pid_t>::max()) {
 471             pid = (pid_t) v;
 472             valid_pid = true;
 473         }
 474     }
 475     catch (std::out_of_range &exc) {
 476         // Too large?
 477     }
 478     catch (std::invalid_argument &exc) {
 479         // Ok, so it doesn't look like a number: proceed...
 480     }
 481
 482     if (valid_pid) {
 483         pid_t wait_r = waitpid(pid, exit_status, WNOHANG);
 484         if (wait_r == -1 && errno == ECHILD) {
 485             // We can't track this child - check process exists:
 486             if (kill(pid, 0) == 0 || errno != ESRCH) {
 487                 tracking_child = false;
 488                 return pid_result_t::OK;
 489             }
 490             else {
 491                 log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 492                 pid = -1;
 493                 return pid_result_t::FAILED;
 494             }
 495         }
 496         else if (wait_r == pid) {
 497             pid = -1;
 498             return pid_result_t::TERMINATED;
 499         }
 500         else if (wait_r == 0) {
 501             // We can track the child
 502             child_listener.add_reserved(event_loop, pid, dasynq::DEFAULT_PRIORITY - 10);
 503             tracking_child = true;
 504             reserved_child_watch = true;
 505             return pid_result_t::OK;
 506         }
 507     }
 508
 509     log(loglevel_t::ERROR, get_name(), ": pid read from pidfile (", pid, ") is not valid");
 510     pid = -1;
 511     return pid_result_t::FAILED;
 512 }
 513
 514 void process_service::bring_down() noexcept
 515 {
 516     if (waiting_for_execstat) {
 517         // The process is still starting. This should be uncommon, but can occur during
 518         // smooth recovery. We can't do much now; we have to wait until we get the
 519         // status, and then act appropriately.
 520         return;
 521     }
 522     else if (pid != -1) {
 523         // The process is still kicking on - must actually kill it. We signal the process
 524         // group (-pid) rather than just the process as there's less risk then of creating
 525         // an orphaned process group:
 526         if (! onstart_flags.no_sigterm) {
 527             kill_pg(SIGTERM);
 528         }
 529         if (term_signal != -1) {
 530             kill_pg(term_signal);
 531         }
 532
 533         // If there's a stop timeout, arm the timer now:
 534         if (stop_timeout != time_val(0,0)) {
 535             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 536             stop_timer_armed = true;
 537         }
 538
 539         // The rest is done in handle_exit_status.
 540     }
 541     else {
 542         // The process is already dead.
 543         stopped();
 544     }
 545 }
 546
 547 void bgproc_service::bring_down() noexcept
 548 {
 549     if (pid != -1) {
 550         // The process is still kicking on - must actually kill it. We signal the process
 551         // group (-pid) rather than just the process as there's less risk then of creating
 552         // an orphaned process group:
 553         if (! onstart_flags.no_sigterm) {
 554             kill_pg(SIGTERM);
 555         }
 556         if (term_signal != -1) {
 557             kill_pg(term_signal);
 558         }
 559
 560         // In most cases, the rest is done in handle_exit_status.
 561         // If we are a BGPROCESS and the process is not our immediate child, however, that
 562         // won't work - check for this now:
 563         if (! tracking_child) {
 564             stopped();
 565         }
 566         else if (stop_timeout != time_val(0,0)) {
 567             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 568             stop_timer_armed = true;
 569         }
 570     }
 571     else {
 572         // The process is already dead.
 573         stopped();
 574     }
 575 }
 576
 577 void scripted_service::bring_down() noexcept
 578 {
 579         if (pid != -1) {
 580                 // We're already running the stop script; nothing to do.
 581                 return;
 582         }
 583
 584     if (stop_command.length() == 0) {
 585         stopped();
 586     }
 587     else if (! start_ps_process(stop_arg_parts, false)) {
 588         // Couldn't execute stop script, but there's not much we can do:
 589         stopped();
 590     }
 591     else {
 592         // successfully started stop script: start kill timer:
 593         if (stop_timeout != time_val(0,0)) {
 594             restart_timer.arm_timer_rel(event_loop, stop_timeout);
 595             stop_timer_armed = true;
 596         }
 597     }
 598 }
 599
 600 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
 601 {
 602     service->timer_expired();
 603
 604     // Leave the timer disabled, or, if it has been reset by any processing above, leave it armed:
 605     return dasynq::rearm::NOOP;
 606 }