src/service.cc

   1 #include <cstring>
   2 #include <cerrno>
   3 #include <sstream>
   4 #include <iterator>
   5 #include <memory>
   6 #include <cstddef>
   7
   8 #include <sys/types.h>
   9 #include <sys/stat.h>
  10 #include <sys/ioctl.h>
  11 #include <sys/un.h>
  12 #include <sys/socket.h>
  13 #include <fcntl.h>
  14 #include <unistd.h>
  15 #include <termios.h>
  16
  17 #include "service.h"
  18 #include "dinit-log.h"
  19 #include "dinit-socket.h"
  20
  21 /*
  22  * service.cc - Service management.
  23  * See service.h for details.
  24  */
  25
  26 // from dinit.cc:
  27 void open_control_socket(bool report_ro_failure = true) noexcept;
  28 void setup_external_log() noexcept;
  29 extern eventloop_t eventLoop;
  30
  31 // Find the requested service by name
  32 static service_record * find_service(const std::list<service_record *> & records,
  33                                     const char *name) noexcept
  34 {
  35     using std::list;
  36     list<service_record *>::const_iterator i = records.begin();
  37     for ( ; i != records.end(); i++ ) {
  38         if (strcmp((*i)->getServiceName().c_str(), name) == 0) {
  39             return *i;
  40         }
  41     }
  42     return (service_record *)0;
  43 }
  44
  45 service_record * service_set::find_service(const std::string &name) noexcept
  46 {
  47     return ::find_service(records, name.c_str());
  48 }
  49
  50 void service_set::startService(const char *name)
  51 {
  52     using namespace std;
  53     service_record *record = loadServiceRecord(name);
  54
  55     record->start();
  56     processQueues(true);
  57 }
  58
  59 void service_set::stopService(const std::string & name) noexcept
  60 {
  61     service_record *record = find_service(name);
  62     if (record != nullptr) {
  63         record->stop();
  64         processQueues(false);
  65     }
  66 }
  67
// Called when a service has actually stopped; dependents have stopped already, unless this stop
// is due to an unexpected process termination.
//
// Gives back the console (if the service runs on it), drops acquisitions held on us by soft
// dependents, notifies hard dependencies, marks the service STOPPED, and then either restarts
// it or finishes deactivation. Finally logs the stop and notifies listeners.
void service_record::stopped() noexcept
{
    if (onstart_flags.runs_on_console) {
        // Make our own process group the foreground group of the terminal on fd 0 again
        // before handing the console back.
        tcsetpgrp(0, getpgrp());
        discard_console_log_buffer();
        release_console();
    }

    force_stop = false;

    // If we are a soft dependency of another target, break the acquisition from that target now:
    for (auto dependent : soft_dpts) {
        if (dependent->holding_acq) {
            dependent->holding_acq = false;
            release();
        }
    }

    // Decide on restart before the state is changed below. Note this consults the
    // service-set-wide auto-restart setting directly rather than do_auto_restart().
    bool will_restart = (desired_state == service_state_t::STARTED)
            && services->get_auto_restart();

    for (auto dependency : depends_on) {
        // we signal dependencies in case they are waiting for us to stop:
        dependency->dependentStopped();
    }

    service_state = service_state_t::STOPPED;

    if (will_restart) {
        // Desired state is "started".
        restarting = true;
        // start(false): do not take a new explicit activation.
        start(false);
    }
    else {
        // Not restarting: the activation socket (if open) is no longer needed.
        if (socket_fd != -1) {
            close(socket_fd);
            socket_fd = -1;
        }

        if (start_explicit) {
            // Drop the requirement taken by the explicit start.
            start_explicit = false;
            release();
        }
        else if (required_by == 0) {
            services->service_inactive(this);
        }
    }

    logServiceStopped(service_name);
    notifyListeners(service_event::STOPPED);
}
 121
// Event-loop callback invoked when the watched child process changes status.
// Records the termination on the owning service (pid reset to -1, raw status stored) and,
// unless the exec() status is still outstanding, hands the status to the service's
// handle_exit_status.
//   loop   - the event loop delivering the event
//   child  - pid of the child whose status changed
//   status - status value as delivered by the event loop (passed on unmodified)
dasynq::rearm service_child_watcher::status_change(eventloop_t &loop, pid_t child, int status) noexcept
{
    base_process_service *sr = service;

    sr->pid = -1;
    sr->exit_status = status;

    // Ok, for a process service, any process death which we didn't rig
    // ourselves is a bit... unexpected. Probably, the child died because
    // we asked it to (sr->service_state == STOPPING). But even if
    // we didn't, there's not much we can do.

    if (sr->waiting_for_execstat) {
        // We still don't have an exec() status from the forked child, wait for that
        // before doing any further processing.
        // NOTE(review): REMOVE (as opposed to REMOVED below) presumably asks the event loop
        // to remove this watch itself, since we have not deregistered here - confirm against dasynq.
        return rearm::REMOVE;
    }

    // Must deregister now since handle_exit_status might result in re-launch:
    deregister(loop, child);

    sr->handle_exit_status(status);
    return rearm::REMOVED;
}
 146
 147 bool service_record::do_auto_restart() noexcept
 148 {
 149     if (auto_restart) {
 150         return services->get_auto_restart();
 151     }
 152     return false;
 153 }
 154
 155 void service_record::emergency_stop() noexcept
 156 {
 157     if (! do_auto_restart() && start_explicit) {
 158         start_explicit = false;
 159         release();
 160     }
 161     forceStop();
 162     stopDependents();
 163     stopped();
 164 }
 165
// Handle termination of the process belonging to a "process" service.
// exit_status is a wait-style status, decoded here with WIFEXITED/WIFSIGNALED/WEXITSTATUS/WTERMSIG.
// What happens next depends on the service state: STARTING -> started or failed;
// STOPPING -> stopped; STARTED with smooth recovery -> attempt to restart the process;
// anything else -> emergency stop.
void process_service::handle_exit_status(int exit_status) noexcept
{
    bool did_exit = WIFEXITED(exit_status);
    bool was_signalled = WIFSIGNALED(exit_status);

    // Log abnormal termination, except while STOPPING (where termination is expected).
    if (exit_status != 0 && service_state != service_state_t::STOPPING) {
        if (did_exit) {
            log(LogLevel::ERROR, "Service ", service_name, " process terminated with exit code ", WEXITSTATUS(exit_status));
        }
        else if (was_signalled) {
            log(LogLevel::ERROR, "Service ", service_name, " terminated due to signal ", WTERMSIG(exit_status));
        }
    }

    if (service_state == service_state_t::STARTING) {
        // Process ended while the service was still starting: a clean exit counts as started,
        // anything else as a start failure.
        if (did_exit && WEXITSTATUS(exit_status) == 0) {
            started();
        }
        else {
            failed_to_start();
        }
    }
    else if (service_state == service_state_t::STOPPING) {
        // We won't log a non-zero exit status or termination due to signal here -
        // we assume that the process died because we signalled it.
        stopped();
    }
    else if (smooth_recovery && service_state == service_state_t::STARTED && desired_state == service_state_t::STARTED) {
        // TODO if we are pinned-started then we should probably check
        //      that dependencies have started before trying to re-start the
        //      service process.
        if (! restart_ps_process()) {
            emergency_stop();
            services->processQueues(false);
        }
        // Successful re-launch: the service stays STARTED, so the queues are not processed here.
        return;
    }
    else {
        emergency_stop();
    }
    services->processQueues(false);
}
 208
 209 void bgproc_service::handle_exit_status(int exit_status) noexcept
 210 {
 211     bool did_exit = WIFEXITED(exit_status);
 212     bool was_signalled = WIFSIGNALED(exit_status);
 213
 214     if (exit_status != 0 && service_state != service_state_t::STOPPING) {
 215         if (did_exit) {
 216             log(LogLevel::ERROR, "Service ", service_name, " process terminated with exit code ", WEXITSTATUS(exit_status));
 217         }
 218         else if (was_signalled) {
 219             log(LogLevel::ERROR, "Service ", service_name, " terminated due to signal ", WTERMSIG(exit_status));
 220         }
 221     }
 222
 223     if (doing_recovery) {
 224         // (BGPROCESS only)
 225         doing_recovery = false;
 226         bool need_stop = false;
 227         if ((did_exit && WEXITSTATUS(exit_status) != 0) || was_signalled) {
 228             need_stop = true;
 229         }
 230         else {
 231             // We need to re-read the PID, since it has now changed.
 232             if (pid_file.length() != 0) {
 233                 if (! read_pid_file()) {
 234                     need_stop = true;
 235                 }
 236             }
 237         }
 238
 239         if (need_stop) {
 240             // Failed startup: no auto-restart.
 241             emergency_stop();
 242             services->processQueues(false);
 243         }
 244
 245         return;
 246     }
 247
 248     if (service_state == service_state_t::STARTING) {
 249         // POSIX requires that if the process exited clearly with a status code of 0,
 250         // the exit status value will be 0:
 251         if (exit_status == 0) {
 252             if (pid_file.length() != 0 && ! read_pid_file()) {
 253                 failed_to_start();
 254             }
 255             else {
 256                 started();
 257             }
 258         }
 259         else {
 260             failed_to_start();
 261         }
 262     }
 263     else if (service_state == service_state_t::STOPPING) {
 264         // We won't log a non-zero exit status or termination due to signal here -
 265         // we assume that the process died because we signalled it.
 266         stopped();
 267     }
 268     else if (smooth_recovery && service_state == service_state_t::STARTED && desired_state == service_state_t::STARTED) {
 269         // TODO if we are pinned-started then we should probably check
 270         //      that dependencies have started before trying to re-start the
 271         //      service process.
 272         doing_recovery = true;
 273         if (! restart_ps_process()) {
 274             emergency_stop();
 275             services->processQueues();
 276         }
 277         return;
 278     }
 279     else {
 280         // we must be STARTED
 281         if (! do_auto_restart() && start_explicit) {
 282             start_explicit = false;
 283             release();
 284         }
 285         forceStop();
 286         stopDependents();
 287         stopped();
 288     }
 289     services->processQueues(false);
 290 }
 291
 292 void scripted_service::handle_exit_status(int exit_status) noexcept
 293 {
 294     bool did_exit = WIFEXITED(exit_status);
 295     bool was_signalled = WIFSIGNALED(exit_status);
 296
 297     if (service_state == service_state_t::STOPPING) {
 298         if (did_exit && WEXITSTATUS(exit_status) == 0) {
 299             stopped();
 300         }
 301         else {
 302             // ??? failed to stop! Let's log it as info:
 303             if (did_exit) {
 304                 log(LogLevel::INFO, "Service ", service_name, " stop command failed with exit code ", WEXITSTATUS(exit_status));
 305             }
 306             else if (was_signalled) {
 307                 log(LogLevel::INFO, "Serivice ", service_name, " stop command terminated due to signal ", WTERMSIG(exit_status));
 308             }
 309             // Just assume that we stopped, so that any dependencies
 310             // can be stopped:
 311             stopped();
 312         }
 313         services->processQueues(false);
 314     }
 315     else { // STARTING
 316         if (exit_status == 0) {
 317             started();
 318         }
 319         else {
 320             // failed to start
 321             if (did_exit) {
 322                 log(LogLevel::ERROR, "Service ", service_name, " command failed with exit code ", WEXITSTATUS(exit_status));
 323             }
 324             else if (was_signalled) {
 325                 log(LogLevel::ERROR, "Service ", service_name, " command terminated due to signal ", WTERMSIG(exit_status));
 326             }
 327             failed_to_start();
 328         }
 329         services->processQueues(true);
 330     }
 331 }
 332
// Event-loop callback for the exec-status pipe of a launched child.
// Reads one int from the watched fd: if data arrives it is treated as the errno from a failed
// exec(); otherwise exec() is taken to have succeeded. The watch is always removed and the
// fd closed. (The 'fd' and 'flags' arguments are not used; get_watched_fd() is used instead.)
rearm exec_status_pipe_watcher::fd_event(eventloop_t &loop, int fd, int flags) noexcept
{
    base_process_service *sr = service;
    sr->waiting_for_execstat = false;

    int exec_status;
    int r = read(get_watched_fd(), &exec_status, sizeof(int));
    deregister(loop);
    close(get_watched_fd());

    if (r > 0) {
        // We read an errno code; exec() failed, and the service startup failed.
        if (sr->pid != -1) {
            // The child has not been reaped yet; stop watching it since we handle the failure here.
            sr->child_listener.deregister(eventLoop, sr->pid);
        }
        sr->pid = -1;
        log(LogLevel::ERROR, sr->service_name, ": execution failed: ", strerror(exec_status));
        if (sr->service_state == service_state_t::STARTING) {
            sr->failed_to_start();
        }
        else if (sr->service_state == service_state_t::STOPPING) {
            // Must be a scripted service. We've logged the failure, but it's probably better
            // not to leave the service in STARTED state:
            sr->stopped();
        }
    }
    else {
        // exec() succeeded.
        if (sr->record_type == service_type::PROCESS) {
            // This could be a smooth recovery (state already STARTED). Even more, the process
            // might be stopped (and killed via a signal) during smooth recovery.  We don't want to
            // process startup again in either case, so we check for state STARTING:
            if (sr->service_state == service_state_t::STARTING) {
                sr->started();
            }
        }

        if (sr->pid == -1) {
            // Somehow the process managed to complete before we even saw the status.
            // (The child watcher stored the status in sr->exit_status and deferred handling to us.)
            sr->handle_exit_status(sr->exit_status);
        }
    }

    sr->services->processQueues(true);

    return rearm::REMOVED;
}
 380
 381 void service_record::require() noexcept
 382 {
 383     if (required_by++ == 0) {
 384         prop_require = !prop_release;
 385         prop_release = false;
 386         services->addToPropQueue(this);
 387     }
 388 }
 389
 390 void service_record::release() noexcept
 391 {
 392     if (--required_by == 0) {
 393         desired_state = service_state_t::STOPPED;
 394
 395         // Can stop, and can release dependencies now. We don't need to issue a release if
 396         // the require was pending though:
 397         prop_release = !prop_require;
 398         prop_require = false;
 399         services->addToPropQueue(this);
 400
 401         if (service_state == service_state_t::STOPPED) {
 402             services->service_inactive(this);
 403         }
 404         else {
 405             do_stop();
 406         }
 407     }
 408 }
 409
 410 void service_record::release_dependencies() noexcept
 411 {
 412     for (sr_iter i = depends_on.begin(); i != depends_on.end(); ++i) {
 413         (*i)->release();
 414     }
 415
 416     for (auto i = soft_deps.begin(); i != soft_deps.end(); ++i) {
 417         service_record * to = i->getTo();
 418         if (i->holding_acq) {
 419             to->release();
 420             i->holding_acq = false;
 421         }
 422     }
 423 }
 424
// Request that this service start.
// If 'activate' is true and the service has not already been explicitly started, this also
// takes a requirement on the service and marks it as explicitly started.
void service_record::start(bool activate) noexcept
{
    if (activate && ! start_explicit) {
        require();
        start_explicit = true;
    }

    // Already wanted started and not currently stopped: nothing more to do.
    if (desired_state == service_state_t::STARTED && service_state != service_state_t::STOPPED) return;

    // Record whether the service counted as active before this call (computed before
    // desired_state is overwritten below).
    bool was_active = service_state != service_state_t::STOPPED || desired_state != service_state_t::STOPPED;
    desired_state = service_state_t::STARTED;

    if (service_state != service_state_t::STOPPED) {
        // We're already starting/started, or we are stopping and need to wait for
        // that to complete.
        if (service_state != service_state_t::STOPPING || ! can_interrupt_stop()) {
            return;
        }
        // We're STOPPING, and that can be interrupted. Our dependencies might be STOPPING,
        // but if so they are waiting (for us), so they too can be instantly returned to
        // STARTING state.
        notifyListeners(service_event::STOPCANCELLED);
    }
    else if (! was_active) {
        services->service_active(this);
    }

    service_state = service_state_t::STARTING;
    waiting_for_deps = true;

    // Ask dependencies to start; if they are all started already, queue ourselves to start.
    if (startCheckDependencies(true)) {
        services->addToStartQueue(this);
    }
}
 459
 460 void service_record::do_propagation() noexcept
 461 {
 462     if (prop_require) {
 463         // Need to require all our dependencies
 464         for (sr_iter i = depends_on.begin(); i != depends_on.end(); ++i) {
 465             (*i)->require();
 466         }
 467
 468         for (auto i = soft_deps.begin(); i != soft_deps.end(); ++i) {
 469             service_record * to = i->getTo();
 470             to->require();
 471             i->holding_acq = true;
 472         }
 473
 474         prop_require = false;
 475     }
 476
 477     if (prop_release) {
 478         release_dependencies();
 479         prop_release = false;
 480     }
 481
 482     if (prop_failure) {
 483         prop_failure = false;
 484         failed_to_start(true);
 485     }
 486
 487     if (prop_start) {
 488         prop_start = false;
 489         start(false);
 490     }
 491
 492     if (prop_stop) {
 493         prop_stop = false;
 494         do_stop();
 495     }
 496 }
 497
 498 void service_record::execute_transition() noexcept
 499 {
 500     if (service_state == service_state_t::STARTING) {
 501         if (startCheckDependencies(false)) {
 502             allDepsStarted(false);
 503         }
 504     }
 505     else if (service_state == service_state_t::STOPPING) {
 506         if (stopCheckDependents()) {
 507             all_deps_stopped();
 508         }
 509     }
 510 }
 511
 512 void service_record::do_start() noexcept
 513 {
 514     if (pinned_stopped) return;
 515
 516     if (service_state != service_state_t::STARTING) {
 517         return;
 518     }
 519
 520     service_state = service_state_t::STARTING;
 521
 522     waiting_for_deps = true;
 523
 524     // Ask dependencies to start, mark them as being waited on.
 525     if (startCheckDependencies(false)) {
 526         // Once all dependencies are started, we start properly:
 527         allDepsStarted();
 528     }
 529 }
 530
 531 void service_record::dependencyStarted() noexcept
 532 {
 533     if (service_state == service_state_t::STARTING && waiting_for_deps) {
 534         services->addToStartQueue(this);
 535     }
 536 }
 537
// Check whether all dependencies permit this service to proceed with starting.
//   start_deps - if true, any dependency that is not STARTED is flagged to start (via the
//                propagation queue), and soft dependencies are marked as waited-on as appropriate;
//                if false, dependencies are only checked.
// Returns true if no dependency is outstanding.
bool service_record::startCheckDependencies(bool start_deps) noexcept
{
    bool all_deps_started = true;

    // Hard dependencies: every one must be in STARTED state.
    for (sr_iter i = depends_on.begin(); i != depends_on.end(); ++i) {
        if ((*i)->service_state != service_state_t::STARTED) {
            if (start_deps) {
                all_deps_started = false;
                (*i)->prop_start = true;
                services->addToPropQueue(*i);
            }
            else {
                // Check-only mode: one unstarted hard dependency is enough to answer "no".
                return false;
            }
        }
    }

    // Soft dependencies: we only wait while they are in the process of starting.
    for (auto i = soft_deps.begin(); i != soft_deps.end(); ++i) {
        service_record * to = i->getTo();
        if (start_deps) {
            if (to->service_state != service_state_t::STARTED) {
                to->prop_start = true;
                services->addToPropQueue(to);
                i->waiting_on = true;
                all_deps_started = false;
            }
            else {
                i->waiting_on = false;
            }
        }
        else if (i->waiting_on) {
            if (to->service_state != service_state_t::STARTING) {
                // Service has either started or is no longer starting
                i->waiting_on = false;
            }
            else {
                // We are still waiting on this service
                return false;
            }
        }
    }

    return all_deps_started;
}
 582
 583 bool service_record::open_socket() noexcept
 584 {
 585     if (socket_path.empty() || socket_fd != -1) {
 586         // No socket, or already open
 587         return true;
 588     }
 589
 590     const char * saddrname = socket_path.c_str();
 591     uint sockaddr_size = offsetof(struct sockaddr_un, sun_path) + socket_path.length() + 1;
 592
 593     struct sockaddr_un * name = static_cast<sockaddr_un *>(malloc(sockaddr_size));
 594     if (name == nullptr) {
 595         log(LogLevel::ERROR, service_name, ": Opening activation socket: out of memory");
 596         return false;
 597     }
 598
 599     // Un-link any stale socket. TODO: safety check? should at least confirm the path is a socket.
 600     unlink(saddrname);
 601
 602     name->sun_family = AF_UNIX;
 603     strcpy(name->sun_path, saddrname);
 604
 605     int sockfd = dinit_socket(AF_UNIX, SOCK_STREAM, 0, SOCK_NONBLOCK | SOCK_CLOEXEC);
 606     if (sockfd == -1) {
 607         log(LogLevel::ERROR, service_name, ": Error creating activation socket: ", strerror(errno));
 608         free(name);
 609         return false;
 610     }
 611
 612     if (bind(sockfd, (struct sockaddr *) name, sockaddr_size) == -1) {
 613         log(LogLevel::ERROR, service_name, ": Error binding activation socket: ", strerror(errno));
 614         close(sockfd);
 615         free(name);
 616         return false;
 617     }
 618
 619     free(name);
 620
 621     // POSIX (1003.1, 2013) says that fchown and fchmod don't necesarily work on sockets. We have to
 622     // use chown and chmod instead.
 623     if (chown(saddrname, socket_uid, socket_gid)) {
 624         log(LogLevel::ERROR, service_name, ": Error setting activation socket owner/group: ", strerror(errno));
 625         close(sockfd);
 626         return false;
 627     }
 628
 629     if (chmod(saddrname, socket_perms) == -1) {
 630         log(LogLevel::ERROR, service_name, ": Error setting activation socket permissions: ", strerror(errno));
 631         close(sockfd);
 632         return false;
 633     }
 634
 635     if (listen(sockfd, 128) == -1) { // 128 "seems reasonable".
 636         log(LogLevel::ERROR, ": Error listening on activation socket: ", strerror(errno));
 637         close(sockfd);
 638         return false;
 639     }
 640
 641     socket_fd = sockfd;
 642     return true;
 643 }
 644
 645 void service_record::allDepsStarted(bool has_console) noexcept
 646 {
 647     if (onstart_flags.starts_on_console && ! has_console) {
 648         waiting_for_deps = true;
 649         queue_for_console();
 650         return;
 651     }
 652
 653     waiting_for_deps = false;
 654
 655     // We overload can_interrupt_start to check whether there is any other
 656     // process (eg restart timer) that needs to finish before starting.
 657     if (can_interrupt_start()) {
 658         waiting_for_deps = true;
 659         return;
 660     }
 661
 662     if (! open_socket()) {
 663         failed_to_start();
 664     }
 665
 666     bool start_success = start_ps_process();
 667     if (! start_success) {
 668         failed_to_start();
 669     }
 670 }
 671
 672 void service_record::acquiredConsole() noexcept
 673 {
 674     if (service_state != service_state_t::STARTING) {
 675         // We got the console but no longer want it.
 676         release_console();
 677     }
 678     else if (startCheckDependencies(false)) {
 679         allDepsStarted(true);
 680     }
 681     else {
 682         // We got the console but can't use it yet.
 683         release_console();
 684     }
 685 }
 686
 687 bool bgproc_service::read_pid_file() noexcept
 688 {
 689     const char *pid_file_c = pid_file.c_str();
 690     int fd = open(pid_file_c, O_CLOEXEC);
 691     if (fd != -1) {
 692         char pidbuf[21]; // just enought to hold any 64-bit integer
 693         int r = read(fd, pidbuf, 20);
 694         if (r > 0) {
 695             pidbuf[r] = 0; // store nul terminator
 696             pid = std::atoi(pidbuf);
 697             if (kill(pid, 0) == 0) {
 698                 child_listener.add_watch(eventLoop, pid);
 699             }
 700             else {
 701                 log(LogLevel::ERROR, service_name, ": pid read from pidfile (", pid, ") is not valid");
 702                 pid = -1;
 703                 close(fd);
 704                 return false;
 705             }
 706         }
 707         close(fd);
 708         return true;
 709     }
 710     else {
 711         log(LogLevel::ERROR, service_name, ": read pid file: ", strerror(errno));
 712         return false;
 713     }
 714 }
 715
 716 void service_record::started() noexcept
 717 {
 718     if (onstart_flags.starts_on_console && ! onstart_flags.runs_on_console) {
 719         tcsetpgrp(0, getpgrp());
 720         release_console();
 721     }
 722
 723     logServiceStarted(service_name);
 724     service_state = service_state_t::STARTED;
 725     notifyListeners(service_event::STARTED);
 726
 727     if (onstart_flags.rw_ready) {
 728         open_control_socket();
 729     }
 730     if (onstart_flags.log_ready) {
 731         setup_external_log();
 732     }
 733
 734     if (force_stop || desired_state == service_state_t::STOPPED) {
 735         // We must now stop.
 736         do_stop();
 737         return;
 738     }
 739
 740     // Notify any dependents whose desired state is STARTED:
 741     for (auto i = dependents.begin(); i != dependents.end(); i++) {
 742         (*i)->dependencyStarted();
 743     }
 744     for (auto i = soft_dpts.begin(); i != soft_dpts.end(); i++) {
 745         (*i)->getFrom()->dependencyStarted();
 746     }
 747 }
 748
// Called when the service failed to start.
//   depfailed - true when called from do_propagation for a pending prop_failure (set below on
//               dependents when a dependency fails); in that case the console handling is skipped.
// Marks the service STOPPED, drops any explicit activation, notifies listeners, and cancels
// or unblocks the start of dependents.
void service_record::failed_to_start(bool depfailed) noexcept
{
    if (!depfailed && onstart_flags.starts_on_console) {
        // Make our own process group the foreground group of the terminal on fd 0 again
        // and give the console back.
        tcsetpgrp(0, getpgrp());
        release_console();
    }

    logServiceFailed(service_name);
    service_state = service_state_t::STOPPED;
    if (start_explicit) {
        start_explicit = false;
        release();
    }
    notifyListeners(service_event::FAILEDSTART);

    // Cancel start of dependents:
    for (sr_iter i = dependents.begin(); i != dependents.end(); i++) {
        if ((*i)->service_state == service_state_t::STARTING) {
            // Failure is propagated via the propagation queue rather than by a direct call.
            (*i)->prop_failure = true;
            services->addToPropQueue(*i);
        }
    }
    for (auto i = soft_dpts.begin(); i != soft_dpts.end(); i++) {
        // We can send 'start', because this is only a soft dependency.
        // Our startup failure means that they don't have to wait for us.
        if ((*i)->waiting_on) {
            (*i)->holding_acq = false;
            (*i)->waiting_on = false;
            (*i)->getFrom()->dependencyStarted();
            // Drop the requirement that the soft dependent was holding on us.
            release();
        }
    }
}
 782
// Start the process for this service, if it has one.
// Returns true on success; this base version always succeeds.
bool service_record::start_ps_process() noexcept
{
    // default implementation: there is no process, so we are started.
    started();
    return true;
}
 789
 790 bool base_process_service::start_ps_process() noexcept
 791 {
 792     if (restarting) {
 793         return restart_ps_process();
 794     }
 795     else {
 796         eventLoop.get_time(restart_interval_time, clock_type::MONOTONIC);
 797         restart_interval_count = 0;
 798         return start_ps_process(exec_arg_parts, onstart_flags.starts_on_console);
 799     }
 800 }
 801
// Fork and exec the given command as this service's process.
//   cmd        - argument vector for the command; cmd.data() is handed to run_child_proc
//   on_console - passed through to run_child_proc
// Returns true if the fork succeeded (pid is set and waiting_for_execstat is raised; the
// outcome of exec() is reported later through the status pipe). Returns false, after
// logging and releasing everything acquired here, if setup or fork failed.
bool base_process_service::start_ps_process(const std::vector<const char *> &cmd, bool on_console) noexcept
{
    // In general, you can't tell whether fork/exec is successful. We use a pipe to communicate
    // success/failure from the child to the parent. The pipe is set CLOEXEC so a successful
    // exec closes the pipe, and the parent sees EOF. If the exec is unsuccessful, the errno
    // is written to the pipe, and the parent can read it.

    eventLoop.get_time(last_start_time, clock_type::MONOTONIC);

    int pipefd[2];
    if (pipe2(pipefd, O_CLOEXEC)) {
        log(LogLevel::ERROR, service_name, ": can't create status check pipe: ", strerror(errno));
        return false;
    }

    // An empty log file setting means output goes to /dev/null.
    const char * logfile = this->logfile.c_str();
    if (*logfile == 0) {
        logfile = "/dev/null";
    }

    bool child_status_registered = false;
    control_conn_t *control_conn = nullptr;

    // Optional control socket pair: [0] is kept by us, [1] is handed to the child.
    int control_socket[2] = {-1, -1};
    if (onstart_flags.pass_cs_fd) {
        if (dinit_socketpair(AF_UNIX, SOCK_STREAM, /* protocol */ 0, control_socket, SOCK_NONBLOCK)) {
            log(LogLevel::ERROR, service_name, ": can't create control socket: ", strerror(errno));
            goto out_p;
        }

        // Make the server side socket close-on-exec:
        int fdflags = fcntl(control_socket[0], F_GETFD);
        fcntl(control_socket[0], F_SETFD, fdflags | FD_CLOEXEC);

        try {
            control_conn = new control_conn_t(&eventLoop, services, control_socket[0]);
        }
        catch (std::exception &exc) {
            log(LogLevel::ERROR, service_name, ": can't launch process; out of memory");
            goto out_cs;
        }
    }

    // Set up complete, now fork and exec:

    pid_t forkpid;

    try {
        // We add the status listener with a high priority (i.e. low priority value) so that process
        // termination is handled early. This means we have always recorded that the process is
        // terminated by the time that we handle events that might otherwise cause us to signal the
        // process, so we avoid sending a signal to an invalid (and possibly recycled) process ID.
        child_status_listener.add_watch(eventLoop, pipefd[0], IN_EVENTS, true, DEFAULT_PRIORITY - 10);
        child_status_registered = true;

        forkpid = child_listener.fork(eventLoop);
    }
    catch (std::exception &e) {
        log(LogLevel::ERROR, service_name, ": Could not fork: ", e.what());
        goto out_cs_h;
    }

    if (forkpid == 0) {
        // Child process.
        // NOTE(review): run_child_proc is presumably not expected to return - confirm.
        run_child_proc(cmd.data(), logfile, on_console, pipefd[1], control_socket[1]);
    }
    else {
        // Parent process
        close(pipefd[1]); // close the 'other end' fd
        if (control_socket[1] != -1) {
            close(control_socket[1]);
        }
        pid = forkpid;

        waiting_for_execstat = true;
        return true;
    }

    // Failure exit:
    // The labels below undo the setup in reverse order; each goto above enters at the point
    // matching how far setup had progressed.

    out_cs_h:
    if (child_status_registered) {
        child_status_listener.deregister(eventLoop);
    }

    if (onstart_flags.pass_cs_fd) {
        delete control_conn;

        out_cs:
        close(control_socket[0]);
        close(control_socket[1]);
    }

    out_p:
    close(pipefd[0]);
    close(pipefd[1]);

    return false;
}
 900
 901 void service_record::run_child_proc(const char * const *args, const char *logfile, bool on_console,
 902         int wpipefd, int csfd) noexcept
 903 {
 904     // Child process. Must not allocate memory (or otherwise risk throwing any exception)
 905     // from here until exit().
 906
 907     // If the console already has a session leader, presumably it is us. On the other hand
 908     // if it has no session leader, and we don't create one, then control inputs such as
 909     // ^C will have no effect.
 910     bool do_set_ctty = (tcgetsid(0) == -1);
 911
 912     // Copy signal mask, but unmask signals that we masked on startup. For the moment, we'll
 913     // also block all signals, since apparently dup() can be interrupted (!!! really, POSIX??).
 914     sigset_t sigwait_set;
 915     sigset_t sigall_set;
 916     sigfillset(&sigall_set);
 917     sigprocmask(SIG_SETMASK, &sigall_set, &sigwait_set);
 918     sigdelset(&sigwait_set, SIGCHLD);
 919     sigdelset(&sigwait_set, SIGINT);
 920     sigdelset(&sigwait_set, SIGTERM);
 921     sigdelset(&sigwait_set, SIGQUIT);
 922
 923     constexpr int bufsz = ((CHAR_BIT * sizeof(pid_t)) / 3 + 2) + 11;
 924     // "LISTEN_PID=" - 11 characters; the expression above gives a conservative estimate
 925     // on the maxiumum number of bytes required for LISTEN=nnn, including nul terminator,
 926     // where nnn is a pid_t in decimal (i.e. one decimal digit is worth just over 3 bits).
 927     char nbuf[bufsz];
 928
 929     // "DINIT_CS_FD=" - 12 bytes. (we -1 from sizeof(int) in account of sign bit).
 930     constexpr int csenvbufsz = ((CHAR_BIT * sizeof(int) - 1) / 3 + 2) + 12;
 931     char csenvbuf[csenvbufsz];
 932
 933     int minfd = (socket_fd == -1) ? 3 : 4;
 934
 935     // Move wpipefd/csfd to another fd if necessary
 936     if (wpipefd < minfd) {
 937         wpipefd = fcntl(wpipefd, F_DUPFD_CLOEXEC, minfd);
 938         if (wpipefd == -1) goto failure_out;
 939     }
 940
 941     if (csfd != -1 && csfd < minfd) {
 942         csfd = fcntl(csfd, F_DUPFD, minfd);
 943         if (csfd == -1) goto failure_out;
 944     }
 945
 946     if (socket_fd != -1) {
 947
 948         if (dup2(socket_fd, 3) == -1) goto failure_out;
 949         if (socket_fd != 3) {
 950             close(socket_fd);
 951         }
 952
 953         if (putenv(const_cast<char *>("LISTEN_FDS=1"))) goto failure_out;
 954         snprintf(nbuf, bufsz, "LISTEN_PID=%jd", static_cast<intmax_t>(getpid()));
 955         if (putenv(nbuf)) goto failure_out;
 956     }
 957
 958     if (csfd != -1) {
 959         snprintf(csenvbuf, csenvbufsz, "DINIT_CS_FD=%d", csfd);
 960         if (putenv(csenvbuf)) goto failure_out;
 961     }
 962
 963     if (! on_console) {
 964         // Re-set stdin, stdout, stderr
 965         close(0); close(1); close(2);
 966
 967         if (open("/dev/null", O_RDONLY) == 0) {
 968             // stdin = 0. That's what we should have; proceed with opening
 969             // stdout and stderr.
 970             if (open(logfile, O_WRONLY | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR) != 1) {
 971                 goto failure_out;
 972             }
 973             if (dup2(1, 2) != 2) {
 974                 goto failure_out;
 975             }
 976         }
 977         else goto failure_out;
 978
 979         // We have the option of creating a session and process group, or just a new process
 980         // group. If we just create a new process group, the child process cannot make itself
 981         // a session leader if it wants to do that (eg getty/login will generally want this).
 982         // If we do neither, and we are running with a controlling terminal, a ^C or similar
 983         // will also affect the child process (which probably isn't so bad, though since we
 984         // will handle the shutdown ourselves it's not necessary). Creating a new session
 985         // (and a new process group as part of that) seems like a safe bet, and has the
 986         // advantage of letting us signal the process as part of a process group.
 987         setsid();
 988     }
 989     else {
 990         // "run on console" - run as a foreground job on the terminal/console device
 991
 992         // if do_set_ctty is false, we are the session leader; we are probably running
 993         // as a user process. Don't create a new session leader in that case, and run
 994         // as part of the parent session. Otherwise, the new session cannot claim the
 995         // terminal as a controlling terminal (it is already claimed), meaning that it
 996         // will not see control signals from ^C etc.
 997
 998         if (do_set_ctty) {
 999             // Disable suspend (^Z) (and on some systems, delayed suspend / ^Y)
1000             signal(SIGTSTP, SIG_IGN);
1001
1002             // Become session leader
1003             setsid();
1004             ioctl(0, TIOCSCTTY, 0);
1005         }
1006         setpgid(0,0);
1007         tcsetpgrp(0, getpgrp());
1008     }
1009
1010     sigprocmask(SIG_SETMASK, &sigwait_set, nullptr);
1011
1012     execvp(args[0], const_cast<char **>(args));
1013
1014     // If we got here, the exec failed:
1015     failure_out:
1016     int exec_status = errno;
1017     write(wpipefd, &exec_status, sizeof(int));
1018     _exit(0);
1019 }
1020
1021 // Mark this and all dependent services as force-stopped.
1022 void service_record::forceStop() noexcept
1023 {
1024     if (service_state != service_state_t::STOPPED) {
1025         force_stop = true;
1026         services->addToStopQueue(this);
1027     }
1028 }
1029
1030 void service_record::dependentStopped() noexcept
1031 {
1032     if (service_state == service_state_t::STOPPING && waiting_for_deps) {
1033         services->addToStopQueue(this);
1034     }
1035 }
1036
1037 void service_record::stop(bool bring_down) noexcept
1038 {
1039     if (start_explicit) {
1040         start_explicit = false;
1041         release();
1042     }
1043
1044     if (bring_down) {
1045         do_stop();
1046     }
1047 }
1048
1049 void service_record::do_stop() noexcept
1050 {
1051     if (pinned_started) return;
1052
1053     if (service_state != service_state_t::STARTED) {
1054         if (service_state == service_state_t::STARTING) {
1055             if (! can_interrupt_start()) {
1056                 // Well this is awkward: we're going to have to continue
1057                 // starting, but we don't want any dependents to think that
1058                 // they are still waiting to start.
1059                 // Make sure they remain stopped:
1060                 stopDependents();
1061                 return;
1062             }
1063
1064             // We must have had desired_state == STARTED.
1065             notifyListeners(service_event::STARTCANCELLED);
1066
1067             interrupt_start();
1068
1069             // Reaching this point, we are starting interruptibly - so we
1070             // stop now (by falling through to below).
1071         }
1072         else {
1073             // If we're starting we need to wait for that to complete.
1074             // If we're already stopping/stopped there's nothing to do.
1075             return;
1076         }
1077     }
1078
1079     service_state = service_state_t::STOPPING;
1080     waiting_for_deps = true;
1081     if (stopDependents()) {
1082         services->addToStopQueue(this);
1083     }
1084 }
1085
1086 bool service_record::stopCheckDependents() noexcept
1087 {
1088     bool all_deps_stopped = true;
1089     for (sr_iter i = dependents.begin(); i != dependents.end(); ++i) {
1090         if (! (*i)->is_stopped()) {
1091             all_deps_stopped = false;
1092             break;
1093         }
1094     }
1095
1096     return all_deps_stopped;
1097 }
1098
1099 bool service_record::stopDependents() noexcept
1100 {
1101     bool all_deps_stopped = true;
1102     for (sr_iter i = dependents.begin(); i != dependents.end(); ++i) {
1103         if (! (*i)->is_stopped()) {
1104             // Note we check *first* since if the dependent service is not stopped,
1105             // 1. We will issue a stop to it shortly and
1106             // 2. It will notify us when stopped, at which point the stopCheckDependents()
1107             //    check is run anyway.
1108             all_deps_stopped = false;
1109         }
1110
1111         if (force_stop) {
1112             // If this service is to be forcefully stopped, dependents must also be.
1113             (*i)->forceStop();
1114         }
1115
1116         (*i)->prop_stop = true;
1117         services->addToPropQueue(*i);
1118     }
1119
1120     return all_deps_stopped;
1121 }
1122
1123 // All dependents have stopped; we can stop now, too. Only called when STOPPING.
1124 void service_record::all_deps_stopped() noexcept
1125 {
1126     waiting_for_deps = false;
1127     stopped();
1128 }
1129
1130 void base_process_service::all_deps_stopped() noexcept
1131 {
1132     waiting_for_deps = false;
1133     if (pid != -1) {
1134         // The process is still kicking on - must actually kill it. We signal the process
1135         // group (-pid) rather than just the process as there's less risk then of creating
1136         // an orphaned process group:
1137         if (! onstart_flags.no_sigterm) {
1138             kill(-pid, SIGTERM);
1139         }
1140         if (term_signal != -1) {
1141             kill(-pid, term_signal);
1142         }
1143
1144         // In most cases, the rest is done in handle_exit_status.
1145         // If we are a BGPROCESS and the process is not our immediate child, however, that
1146         // won't work - check for this now:
1147         if (record_type == service_type::BGPROCESS) {
1148             int status;
1149             pid_t r = waitpid(pid, &status, WNOHANG);
1150             if (r == -1 && errno == ECHILD) {
1151                 // We can't track this child (or it's terminated already)
1152                 stopped();
1153             }
1154             else if (r == pid) {
1155                 // Process may have died due to signal since we explicitly requested it to
1156                 // stop by signalling it; no need to log any termination status.
1157                 stopped();
1158             }
1159         }
1160     }
1161     else {
1162         // The process is already dead.
1163         stopped();
1164     }
1165 }
1166
1167 void scripted_service::all_deps_stopped() noexcept
1168 {
1169     waiting_for_deps = false;
1170     if (stop_command.length() == 0) {
1171         stopped();
1172     }
1173     else if (! start_ps_process(stop_arg_parts, false)) {
1174         // Couldn't execute stop script, but there's not much we can do:
1175         stopped();
1176     }
1177 }
1178
1179 void service_record::unpin() noexcept
1180 {
1181     if (pinned_started) {
1182         pinned_started = false;
1183         if (desired_state == service_state_t::STOPPED) {
1184             do_stop();
1185             services->processQueues(false);
1186         }
1187     }
1188     if (pinned_stopped) {
1189         pinned_stopped = false;
1190         if (desired_state == service_state_t::STARTED) {
1191             do_start();
1192             services->processQueues(true);
1193         }
1194     }
1195 }
1196
1197 void service_record::queue_for_console() noexcept
1198 {
1199     services->append_console_queue(this);
1200 }
1201
1202 void service_record::release_console() noexcept
1203 {
1204     services->pull_console_queue();
1205 }
1206
1207 void service_record::interrupt_start() noexcept
1208 {
1209     services->unqueue_console(this);
1210 }
1211
1212 void service_set::service_active(service_record *sr) noexcept
1213 {
1214     active_services++;
1215 }
1216
1217 void service_set::service_inactive(service_record *sr) noexcept
1218 {
1219     active_services--;
1220 }
1221
1222 base_process_service::base_process_service(service_set *sset, string name, service_type service_type_p, string &&command,
1223         std::list<std::pair<unsigned,unsigned>> &command_offsets,
1224         sr_list * pdepends_on, sr_list * pdepends_soft)
1225      : service_record(sset, name, service_type_p, std::move(command), command_offsets,
1226          pdepends_on, pdepends_soft), child_listener(this), child_status_listener(this)
1227 {
1228     restart_interval_count = 0;
1229     restart_interval_time = {0, 0};
1230     restart_timer.service = this;
1231     restart_timer.add_timer(eventLoop);
1232
1233     // By default, allow a maximum of 3 restarts within 10.0 seconds:
1234     restart_interval.tv_sec = 10;
1235     restart_interval.tv_nsec = 0;
1236     max_restart_interval_count = 3;
1237 }
1238
1239 void base_process_service::do_restart() noexcept
1240 {
1241     restarting = false;
1242     waiting_restart_timer = false;
1243     restart_interval_count++;
1244
1245     // We may be STARTING (regular restart) or STARTED ("smooth recovery"). This affects whether
1246     // the process should be granted access to the console:
1247     bool on_console = service_state == service_state_t::STARTING
1248             ? onstart_flags.starts_on_console : onstart_flags.runs_on_console;
1249
1250     if (! start_ps_process(exec_arg_parts, on_console)) {
1251         if (service_state == service_state_t::STARTING) {
1252             failed_to_start();
1253         }
1254         else {
1255             desired_state = service_state_t::STOPPED;
1256             forceStop();
1257         }
1258         services->processQueues();
1259     }
1260 }
1261
1262 bool base_process_service::restart_ps_process() noexcept
1263 {
1264         using time_val = eventloop_t::time_val;
1265
1266     time_val current_time;
1267     eventLoop.get_time(current_time, clock_type::MONOTONIC);
1268
1269     if (max_restart_interval_count != 0) {
1270         // Check whether we're still in the most recent restart check interval:
1271         time_val int_diff = current_time - restart_interval_time;
1272         if (int_diff < restart_interval) {
1273             if (restart_interval_count >= max_restart_interval_count) {
1274                 log(LogLevel::ERROR, "Service ", service_name, " restarting too quickly; stopping.");
1275                 return false;
1276             }
1277         }
1278         else {
1279             restart_interval_time = current_time;
1280             restart_interval_count = 0;
1281         }
1282     }
1283
1284     // Check if enough time has lapsed since the prevous restart. If not, start a timer:
1285     time_val tdiff = current_time - last_start_time;
1286     if (restart_delay < tdiff) {
1287         // > restart delay (normally 200ms)
1288         do_restart();
1289     }
1290     else {
1291         time_val timeout = restart_delay - tdiff;
1292         restart_timer.arm_timer_rel(eventLoop, timeout);
1293         waiting_restart_timer = true;
1294     }
1295     return true;
1296 }
1297
1298 void base_process_service::interrupt_start() noexcept
1299 {
1300     // overridden in subclasses
1301     if (waiting_restart_timer) {
1302         restart_timer.stop_timer(eventLoop);
1303         waiting_restart_timer = false;
1304     }
1305     service_record::interrupt_start();
1306 }
1307
1308 dasynq::rearm process_restart_timer::timer_expiry(eventloop_t &, int expiry_count)
1309 {
1310     service->do_restart();
1311     return dasynq::rearm::DISARM;
1312 }