Limit number of restarts within an interval
author Davin McCall <davmac@davmac.org>
Mon, 5 Jun 2017 09:26:23 +0000 (10:26 +0100)
committer Davin McCall <davmac@davmac.org>
Mon, 5 Jun 2017 09:26:23 +0000 (10:26 +0100)
src/service.cc
src/service.h

index 7b0461fc20bcad2b7fdb786677b4d06642c6e069..63f5a158f4aedc569a090edd4aeb2146a2729571 100644 (file)
@@ -188,7 +188,10 @@ void process_service::handle_exit_status(int exit_status) noexcept
         // TODO if we are pinned-started then we should probably check
         //      that dependencies have started before trying to re-start the
         //      service process.
-        restart_ps_process();
+        if (! restart_ps_process()) {
+            desired_state = ServiceState::STOPPED;
+            forceStop();
+        }
         return;
     }
     else {
@@ -263,7 +266,10 @@ void bgproc_service::handle_exit_status(int exit_status) noexcept
         //      that dependencies have started before trying to re-start the
         //      service process.
         doing_recovery = true;
-        restart_ps_process();
+        if (! restart_ps_process()) {
+            desired_state = ServiceState::STOPPED;
+            forceStop();
+        }
         return;
     }
     else {
@@ -763,10 +769,11 @@ bool ServiceRecord::start_ps_process() noexcept
 bool base_process_service::start_ps_process() noexcept
 {
     if (restarting) {
-        restart_ps_process();
-        return true;
+        return restart_ps_process();
     }
     else {
+        eventLoop.get_time(restart_interval_time, clock_type::MONOTONIC);
+        restart_interval_count = 0;
         return start_ps_process(exec_arg_parts, onstart_flags.starts_on_console);
     }
 }
@@ -1179,6 +1186,8 @@ base_process_service::base_process_service(ServiceSet *sset, string name, Servic
      : ServiceRecord(sset, name, service_type, std::move(command), command_offsets,
          pdepends_on, pdepends_soft), child_listener(this), child_status_listener(this)
 {
+    restart_interval_count = 0;
+    restart_interval_time = {0, 0};
     restart_timer.service = this;
     restart_timer.add_timer(eventLoop);
 }
@@ -1193,7 +1202,6 @@ void base_process_service::do_restart() noexcept
             ? onstart_flags.starts_on_console : onstart_flags.runs_on_console;
 
     if (! start_ps_process(exec_arg_parts, on_console)) {
-
         if (service_state == ServiceState::STARTING) {
             failed_to_start();
         }
@@ -1205,30 +1213,52 @@ void base_process_service::do_restart() noexcept
     }
 }
 
-void base_process_service::restart_ps_process() noexcept
+// Calculate the difference between two times (a more recent time, "now", and a previous time, "then").
+static timespec diff_time(timespec now, timespec then)
+{
+    timespec r;
+    r.tv_sec = now.tv_sec - then.tv_sec;  // whole-second difference; reduced by one below if a borrow is needed
+    if (now.tv_nsec >= then.tv_nsec) {
+        r.tv_nsec = now.tv_nsec - then.tv_nsec;  // no borrow: subtract the nanosecond parts directly
+    }
+    else {
+        r.tv_sec -= 1;  // borrow one second so the nanosecond part does not go negative
+        r.tv_nsec = 1000000000 - (then.tv_nsec - now.tv_nsec);  // one second (1e9 ns) minus the shortfall
+    }
+    return r;
+}
+
+bool base_process_service::restart_ps_process() noexcept
 {
     timespec current_time;
     eventLoop.get_time(current_time, clock_type::MONOTONIC);
-    auto tdiff_s = current_time.tv_sec - last_start_time.tv_sec;
-    decltype(current_time.tv_nsec) tdiff_ns;
-    if (current_time.tv_nsec >= last_start_time.tv_nsec) {
-        tdiff_ns = current_time.tv_nsec - last_start_time.tv_nsec;
+
+    // Check whether we're still in the most recent restart check interval:
+    timespec int_diff = diff_time(current_time, restart_interval_time);
+    if (int_diff.tv_sec < 10) {
+        if (++restart_interval_count >= 3) {
+            log(LogLevel::ERROR, "Service ", service_name, " restarting too quickly; stopping.");
+            return false;
+        }
     }
     else {
-        tdiff_s -= 1;
-        tdiff_ns = 1000000000 - (last_start_time.tv_nsec - current_time.tv_nsec);
+        restart_interval_time = current_time;
+        restart_interval_count = 0;
     }
 
-    if (tdiff_s > 0 || tdiff_ns > 200000000) {
+    // Check if enough time has elapsed since the previous restart. If not, start a timer:
+    timespec tdiff = diff_time(current_time, last_start_time);
+    if (tdiff.tv_sec > 0 || tdiff.tv_nsec > 200000000) {
         // > 200ms
         do_restart();
     }
     else {
         timespec timeout;
         timeout.tv_sec = 0;
-        timeout.tv_nsec = 200000000 - tdiff_ns;
+        timeout.tv_nsec = 200000000 - tdiff.tv_nsec;
         restart_timer.arm_timer_rel(eventLoop, timeout);
     }
+    return true;
 }
 
 dasynq::rearm process_restart_timer::timer_expiry(EventLoop_t &, int expiry_count)
index 8bb63898bd3e8fef83e5d7e4adeb38c2074f3684..40a50297d4491ebb0f04fe5e9c01636020d004a4 100644 (file)
@@ -585,13 +585,18 @@ class base_process_service : public ServiceRecord
     process_restart_timer restart_timer;
     timespec last_start_time;
 
+    // Restart interval time and restart count are used to track the number of automatic restarts
+    // over an interval. Too many restarts over an interval will inhibit further restarts.
+    timespec restart_interval_time;
+    int restart_interval_count;
+
     // Start the process, return true on success
     virtual bool start_ps_process() noexcept;
     bool start_ps_process(const std::vector<const char *> &args, bool on_console) noexcept;
 
     // Restart the process (due to start failure or unexpected termination). Restarts will be
     // rate-limited.
-    void restart_ps_process() noexcept;
+    bool restart_ps_process() noexcept;
 
     virtual void all_deps_stopped() noexcept;
     virtual void handle_exit_status(int exit_status) noexcept = 0;