2 This file is part of GNUnet.
3 (C) 2009 Christian Grothoff (and other contributing authors)
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
22 * @file arm/gnunet-service-arm.c
23 * @brief the automated restart manager service
24 * @author Christian Grothoff
27 * - multiple start-stop requests with RC>1 can result
28 * in UP/DOWN signals based on "pending" that are inaccurate...
29 * => have list of clients waiting for a resolution instead of
30 * giving instant (but incorrect) replies
31 * - need to test auto-restart code on configuration changes;
32 * - should refine restart code to check if *relevant* parts of the
33 * configuration were changed (anything in the section for the service)
34 * - should have a way to specify dependencies between services and
35 * manage restarts of groups of services
38 #include "gnunet_client_lib.h"
39 #include "gnunet_getopt_lib.h"
40 #include "gnunet_os_lib.h"
41 #include "gnunet_protocols.h"
42 #include "gnunet_service_lib.h"
43 #include "gnunet_signal_lib.h"
48 * Check for configuration file changes every 5s.
/* NOTE(review): no use of MAINT_FREQUENCY is visible in this extract;
 * confirm where the periodic configuration check is rescheduled. */
50 #define MAINT_FREQUENCY GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_SECONDS, 5)
53 * Threshold after which exponential backoff shouldn't increase (in ms); 30m
/* 1000 * 60 * 30 = 1,800,000.  maint_child_death compares a service's
 * backoff.value against this constant before multiplying the backoff by 2. */
55 #define EXPONENTIAL_BACKOFF_THRESHOLD (1000 * 60 * 30)
/* Per-service record, chained through `next`.
 * NOTE(review): only the declarations of next, killing_client, backoff and
 * restartAt are visible in this extract.  The members described below as
 * name, binary, config, pid, mtime and rc are used elsewhere in the file
 * (pos->name, sl->binary, sl->config, pos->pid, sl->mtime, pos->rc); their
 * declared types cannot be confirmed from here. */
59 * List of our services.
64 * List of our services.
69 * This is a linked list.
71 struct ServiceList *next;
74 * Name of the service.
79 * Name of the binary used.
84 * Name of the configuration file used.
89 * Client to notify upon kill completion (waitpid), NULL
90 * if we should simply restart the process.
/* Set by stop_service after it sends SIGTERM; read by maint_child_death. */
92 struct GNUNET_SERVER_Client *killing_client;
95 * Process ID of the child.
100 * Last time the config of this service was
105 /* Process exponential backoff time */
/* Initialised to GNUNET_TIME_UNIT_MILLISECONDS by start_service and reset
 * to the same value by config_change_task. */
106 struct GNUNET_TIME_Relative backoff;
108 /* Absolute time at which the process is scheduled to restart in case of death */
/* Initialised to GNUNET_TIME_UNIT_FOREVER_ABS by start_service. */
109 struct GNUNET_TIME_Absolute restartAt;
112 * Reference counter (counts how many times we've been
113 * asked to start the service). We only actually stop
114 * it once rc hits zero.
/* File-scope state shared by the scheduler tasks and message handlers in
 * this file. */
121 * List of running services.
123 static struct ServiceList *running;
/* Configuration handle; queried through GNUNET_CONFIGURATION_get_value_*
 * in start_process, start_service and run. */
128 static const struct GNUNET_CONFIGURATION_Handle *cfg;
/* Scheduler handle passed to the GNUNET_SCHEDULER_* calls in this file. */
133 static struct GNUNET_SCHEDULER_Handle *sched;
136 * Command to prepend to each actual command.
/* Used by start_process as the fallback when a service has no PREFIX
 * option of its own. */
138 static char *prefix_command;
141 * ID of task called whenever we get a SIGCHILD.
143 static GNUNET_SCHEDULER_TaskIdentifier child_death_task;
146 * ID of task called whenever the timeout for restarting a child
149 static GNUNET_SCHEDULER_TaskIdentifier child_restart_task;
152 * Context for our SIGCHILD handler.
154 static struct GNUNET_SIGNAL_Context *shc_chld;
157 * Pipe used to communicate shutdown via signal.
/* sighandler_child_death writes to the write end of this pipe. */
159 static struct GNUNET_DISK_PipeHandle *sigpipe;
162 * Reading end of the signal pipe.
164 static const struct GNUNET_DISK_FileHandle *pr;
167 * Are we in shutdown mode?
/* Set to GNUNET_YES by shutdown_task. */
169 static int in_shutdown;
173 * Handle to our server instance. Our server is a bit special in that
174 * its service is not immediately stopped once we get a shutdown
175 * request (since we need to continue service until all of our child
176 * processes are dead). This handle is used to shut down the server
177 * (and thus trigger process termination) once all child processes are
178 * also dead. A special option in the ARM configuration modifies the
179 * behaviour of the service implementation to not do the shutdown
182 static struct GNUNET_SERVER_Handle *server;
186 * If the configuration file changes, restart tasks that depended on that
189 * @param cls closure, NULL if we need to self-restart
/* Scheduler task: for a service whose configuration file has a newer
 * modification time than the recorded pos->mtime, log the restart, send
 * SIGTERM to the child and reset its backoff to one millisecond.
 * NOTE(review): the loop over the service list and the remaining parts of
 * the condition are on lines not visible in this extract.
 *
 * Fix: the log format contains a `%s' conversion but the call passed no
 * matching argument, which is undefined behaviour for a printf-style
 * function.  The service name (pos->name) is now supplied. */
193 config_change_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
195 struct ServiceList *pos;
201 /* FIXME: this test for config change is a bit too coarse grained */
202 if ( (0 == STAT (pos->config, &sbuf)) &&
203 (pos->mtime < sbuf.st_mtime) &&
206 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
207 _("Restarting service `%s' due to configuration file change.\n"), pos->name);
208 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
209 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
/* Restart promptly: presumably the actual restart happens once the
 * child's death is observed by maint_child_death -- confirm. */
211 pos->backoff = GNUNET_TIME_UNIT_MILLISECONDS;
220 * Transmit a status result message.
222 * @param cls pointer to "uint16_t*" with message type
223 * @param size number of bytes available in buf
224 * @param buf where to copy the message, NULL on error
225 * @return number of bytes copied to buf
/* Transmit callback: writes a header-only message into buf whose type is
 * the 16-bit value read through res, and reports the number of bytes
 * written.  res is declared on a line not visible in this extract;
 * presumably it aliases cls. */
228 write_result (void *cls, size_t size, void *buf)
231 struct GNUNET_MessageHeader *msg;
/* Failure path: nothing is written and 0 bytes are reported.
 * NOTE(review): signal_result allocates the closure with GNUNET_malloc; no
 * release of it is visible before this early return -- confirm that the
 * allocation is not leaked here. */
235 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
236 _("Could not send status result to client\n"));
237 return 0; /* error, not much we can do */
239 GNUNET_assert (size >= sizeof (struct GNUNET_MessageHeader));
/* Both header fields are converted to network byte order. */
241 msg->size = htons (sizeof (struct GNUNET_MessageHeader));
242 msg->type = htons (*res);
244 return sizeof (struct GNUNET_MessageHeader);
250 * Signal our client that we will start or stop the service.
253 * @param client who is being signalled
254 * @param name name of the service
255 * @param result message type to send
256 * @return NULL if it was not found
/* Queue a header-only status message of type `result` for `client`.
 * NOTE(review): the "@return NULL if it was not found" line in the comment
 * above this function looks copy-pasted from find_name; no caller in this
 * file uses a return value from signal_result. */
259 signal_result (struct GNUNET_SERVER_Client *client,
260 const char *name, uint16_t result)
/* Presumably guarded by a client == NULL check on a line not shown. */
266 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
268 ("Not sending status result to client: no client known\n"));
/* NOTE(review): every result other than ARM_IS_DOWN is printed as "up",
 * including GNUNET_MESSAGE_TYPE_ARM_IS_UNKNOWN, which stop_service passes
 * for unknown services. */
272 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
273 "Telling client that service `%s' is now %s\n",
275 result == GNUNET_MESSAGE_TYPE_ARM_IS_DOWN ? "down" : "up");
/* Heap copy of the message type; presumably handed to write_result as its
 * closure in the (not visible) trailing arguments of the call below. */
277 res = GNUNET_malloc (sizeof (uint16_t));
279 GNUNET_SERVER_notify_transmit_ready (client,
280 sizeof (struct GNUNET_MessageHeader),
281 GNUNET_TIME_UNIT_FOREVER_REL,
287 * Find the process with the given service
288 * name in the given list, remove it and return it.
290 * @param name which service entry to look up
291 * @return NULL if it was not found
/* Look up a service entry by exact name (strcmp) and unlink it from the
 * list it is found in.
 * NOTE(review): the function takes no list parameter although the comment
 * above says "in the given list"; presumably the global `running` list is
 * searched -- the loop set-up is on lines not visible here. */
293 static struct ServiceList *
294 find_name (const char *name)
296 struct ServiceList *pos;
297 struct ServiceList *prev;
303 if (0 == strcmp (pos->name, name))
/* Unlink pos by making its predecessor skip over it. */
308 prev->next = pos->next;
320 * Free an entry in the service list.
322 * @param pos entry to free
/* Release the strings owned by a service entry.  config and binary are
 * freed with the NULL-tolerant variant; name is freed unconditionally.
 * NOTE(review): freeing of `pos` itself is not visible in this extract. */
325 free_entry (struct ServiceList *pos)
327 GNUNET_free_non_null (pos->config);
328 GNUNET_free_non_null (pos->binary);
329 GNUNET_free (pos->name);
335 * Actually start the process for the given service.
337 * @param sl identifies service to start
/* Build an argument vector for service `sl` and launch it, storing the
 * resulting process id in sl->pid.  The vector is assembled from the
 * space-separated PREFIX tokens, the binary, "-c" plus the configuration
 * file, optionally "-L" "DEBUG", the space-separated OPTIONS tokens and a
 * terminating NULL. */
340 start_process (struct ServiceList *sl)
345 unsigned int argv_size;
348 const char *firstarg;
/* Fallbacks, presumably used when the lookup does not return GNUNET_OK:
 * the global prefix_command for PREFIX, the empty string for OPTIONS. */
353 GNUNET_CONFIGURATION_get_value_string (cfg,
354 sl->name, "PREFIX", &loprefix))
355 loprefix = GNUNET_strdup (prefix_command);
357 GNUNET_CONFIGURATION_get_value_string (cfg,
358 sl->name, "OPTIONS", &options))
359 options = GNUNET_strdup ("");
360 use_debug = GNUNET_CONFIGURATION_get_value_yesno (cfg, sl->name, "DEBUG");
362 GNUNET_log (GNUNET_ERROR_TYPE_INFO, _("Starting service `%s'\n"), sl->name);
364 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
365 "Starting service `%s' using binary `%s' and configuration `%s'\n",
366 sl->name, sl->binary, sl->config);
/* First pass over both strings; presumably this sizes argv (the loop
 * bodies are not visible in this extract). */
372 while ('\0' != *lopos)
379 while ('\0' != *optpos)
386 argv = GNUNET_malloc (argv_size * sizeof (char *));
/* Second pass: skip runs of spaces, record the start of each token, then
 * advance to the end of the token.  The argv entries point into the
 * loprefix buffer itself rather than into copies. */
390 while ('\0' != *lopos)
392 while (*lopos == ' ')
398 argv[argv_size++] = lopos;
399 while (('\0' != *lopos) && (' ' != *lopos))
/* NOTE(review): whether this assignment is conditional (e.g. only when no
 * prefix token was found) is not visible here. */
407 firstarg = sl->binary;
408 argv[argv_size++] = sl->binary;
409 argv[argv_size++] = "-c";
410 argv[argv_size++] = sl->config;
411 if (GNUNET_YES == use_debug)
413 argv[argv_size++] = "-L";
414 argv[argv_size++] = "DEBUG";
/* Same tokenisation for OPTIONS; entries point into the options buffer. */
417 while ('\0' != *optpos)
419 while (*optpos == ' ')
423 argv[argv_size++] = optpos;
424 while (('\0' != *optpos) && (' ' != *optpos))
431 argv[argv_size++] = NULL;
432 sl->pid = GNUNET_OS_start_process_v (firstarg, argv);
433 /* FIXME: should check sl->pid */
/* The buffers that argv points into are released only after the process
 * has been started. */
435 GNUNET_free (loprefix);
436 GNUNET_free (options);
441 * Start the specified service.
443 * @param client who is asking for this
444 * @param servicename name of the service to start
/* Handle a request to start `servicename` on behalf of `client`.  Every
 * visible outcome is reported through signal_result: ARM_IS_DOWN when ARM
 * is shutting down or the BINARY / configuration file cannot be resolved,
 * ARM_IS_UP when the service is already known or has just been set up. */
447 start_service (struct GNUNET_SERVER_Client *client, const char *servicename)
449 struct ServiceList *sl;
454 if (GNUNET_YES == in_shutdown)
456 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
457 _("ARM is shutting down, service `%s' not started.\n"),
459 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
/* find_name unlinks the entry it returns; presumably it is re-inserted on
 * lines not visible here -- confirm. */
462 sl = find_name (servicename);
465 /* already running, just increment RC */
466 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
467 _("Service `%s' already running.\n"), servicename);
471 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
475 GNUNET_CONFIGURATION_get_value_string (cfg,
476 servicename, "BINARY", &binary))
478 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
479 _("Binary implementing service `%s' not known!\n"),
481 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
/* The configuration file must both resolve to a filename and be
 * stat()-able; sbuf is reused below for the initial mtime. */
485 GNUNET_CONFIGURATION_get_value_filename (cfg,
489 (0 != STAT (config, &sbuf)))
/* NOTE(review): config is released with GNUNET_free_non_null below, which
 * suggests it may be NULL on this path, yet it is passed to a `%s'
 * conversion here -- confirm. */
491 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
492 _("Configuration file `%s' for service `%s' not known!\n"),
493 config, servicename);
494 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
495 GNUNET_free (binary);
496 GNUNET_free_non_null (config);
/* New entry: the name is copied, mtime comes from the stat above, backoff
 * starts at one millisecond and no restart is scheduled (FOREVER_ABS). */
499 sl = GNUNET_malloc (sizeof (struct ServiceList));
500 sl->name = GNUNET_strdup (servicename);
505 sl->mtime = sbuf.st_mtime;
506 sl->backoff = GNUNET_TIME_UNIT_MILLISECONDS;
507 sl->restartAt = GNUNET_TIME_UNIT_FOREVER_ABS;
512 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
517 * Stop the specified service.
519 * @param client who is asking for this
520 * @param servicename name of the service to stop
/* Handle a request to stop `servicename` on behalf of `client`.  The
 * early-exit paths answer the client at once and call
 * GNUNET_SERVER_receive_done; the final path sends SIGTERM and parks the
 * client in pos->killing_client until the child's death is observed. */
523 stop_service (struct GNUNET_SERVER_Client *client, const char *servicename)
525 struct ServiceList *pos;
527 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
528 _("Preparing to stop `%s'\n"), servicename);
529 pos = find_name (servicename);
/* Presumably the pos == NULL case: the service is not known. */
532 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UNKNOWN);
533 GNUNET_SERVER_receive_done (client, GNUNET_OK);
538 /* RC>1, just decrement RC */
543 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
544 "Service `%s' still used by %u clients, will keep it running!\n",
545 servicename, pos->rc);
547 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
548 GNUNET_SERVER_receive_done (client, GNUNET_OK);
552 pos->rc--; /* decrement RC to zero */
553 if (pos->killing_client != NULL)
555 /* killing already in progress */
557 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
558 "Service `%s' is already down\n", servicename);
560 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
561 GNUNET_SERVER_receive_done (client, GNUNET_OK);
565 if (GNUNET_YES == in_shutdown)
568 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
569 "Termination request already sent to `%s' (since ARM is in shutdown).\n",
572 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
573 GNUNET_SERVER_receive_done (client, GNUNET_OK);
577 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
578 "Sending kill signal to service `%s', waiting for process to die.\n",
/* A failed kill is only logged; the client is still parked below. */
581 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
582 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
/* Keep a reference on the client: maint_child_death later signals
 * ARM_IS_DOWN, calls receive_done and drops the reference. */
585 pos->killing_client = client;
586 GNUNET_SERVER_client_keep (client);
591 * Handle START-message.
593 * @param cls closure (always NULL)
594 * @param client identification of the client
595 * @param message the actual message
596 * @return GNUNET_OK to keep the connection open,
597 * GNUNET_SYSERR to close it (signal serious error)
/* Server handler for START messages.  The payload after the message header
 * is treated as the service name and must be a non-empty, NUL-terminated
 * string; otherwise the request is finished with GNUNET_SYSERR. */
600 handle_start (void *cls,
601 struct GNUNET_SERVER_Client *client,
602 const struct GNUNET_MessageHeader *message)
604 const char *servicename;
/* size becomes the payload length: total message size minus the header. */
607 size = ntohs (message->size);
608 size -= sizeof (struct GNUNET_MessageHeader);
609 servicename = (const char *) &message[1];
610 if ((size == 0) || (servicename[size - 1] != '\0'))
613 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
616 start_service (client, servicename);
617 GNUNET_SERVER_receive_done (client, GNUNET_OK);
622 * Handle STOP-message.
624 * @param cls closure (always NULL)
625 * @param client identification of the client
626 * @param message the actual message
627 * @return GNUNET_OK to keep the connection open,
628 * GNUNET_SYSERR to close it (signal serious error)
/* Server handler for STOP messages.  Validates the payload exactly like
 * handle_start and then delegates to stop_service.  No receive_done call
 * follows the delegation here: stop_service issues it on its immediate
 * paths, and maint_child_death issues it for the deferred kill path. */
631 handle_stop (void *cls,
632 struct GNUNET_SERVER_Client *client,
633 const struct GNUNET_MessageHeader *message)
635 const char *servicename;
/* size becomes the payload length: total message size minus the header. */
638 size = ntohs (message->size);
639 size -= sizeof (struct GNUNET_MessageHeader);
640 servicename = (const char *) &message[1];
641 if ((size == 0) || (servicename[size - 1] != '\0'))
644 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
647 stop_service (client, servicename);
652 * Remove all entries for tasks that are not running
653 * (pid = 0) from the running list (they will no longer
654 * be restarted since we are shutting down).
/* NOTE(review): only the local declarations of this helper are visible in
 * this extract; its name line and loop body are missing.  The three
 * cursors suggest a walk over a singly linked list with unlinking --
 * confirm against the full source. */
659 struct ServiceList *pos;
660 struct ServiceList *next;
661 struct ServiceList *prev;
684 * We are done with everything. Stop remaining
685 * tasks, signal handler and the server.
/* Final teardown steps.  NOTE(review): the name line of this helper and
 * any guards around these statements are not visible in this extract. */
690 GNUNET_SERVER_destroy (server);
692 GNUNET_SIGNAL_handler_uninstall (shc_chld);
/* Cancel the pending child-death task and mark it as not scheduled. */
694 GNUNET_SCHEDULER_cancel (sched, child_death_task);
695 child_death_task = GNUNET_SCHEDULER_NO_TASK;
700 * Task run for shutdown.
702 * @param cls closure, NULL if we need to self-restart
/* Scheduler task run on shutdown: flags shutdown mode and sends SIGTERM to
 * service processes.  NOTE(review): the list iteration and any pid check
 * are on lines not visible in this extract. */
706 shutdown_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
708 struct ServiceList *pos;
711 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, _("Stopping all services\n"));
/* From here on start_service refuses new starts and maint_child_death no
 * longer schedules restarts. */
713 in_shutdown = GNUNET_YES;
720 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
721 "Sending SIGTERM to `%s'\n", pos->name);
/* A failed kill is only logged. */
723 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
724 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
734 * Task run whenever it is time to restart a child that died.
736 * @param cls closure, always NULL
/* Scheduler task that restarts dead services whose restart time has
 * arrived and re-arms itself for the earliest restart still pending. */
740 delayed_restart_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
742 struct ServiceList *pos;
743 struct GNUNET_TIME_Relative lowestRestartDelay;
/* This invocation consumes the pending task id. */
745 child_restart_task = GNUNET_SCHEDULER_NO_TASK;
746 if (0 != (tc->reason & GNUNET_SCHEDULER_REASON_SHUTDOWN))
/* FOREVER_REL doubles as the "nothing pending" sentinel tested below. */
753 lowestRestartDelay = GNUNET_TIME_UNIT_FOREVER_REL;
755 /* check for services that need to be restarted due to
756 configuration changes or because the last restart failed */
/* Only entries without a live process (pid == 0) are considered, and only
 * while ARM is not shutting down. */
760 if ( (pos->pid == 0) &&
761 (GNUNET_YES != in_shutdown) )
/* Zero remaining time means restartAt has been reached. */
763 if (GNUNET_TIME_absolute_get_remaining (pos->restartAt).value == 0)
765 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
766 _("Restarting service `%s'.\n"), pos->name);
/* Otherwise, presumably, track the smallest remaining delay. */
772 = GNUNET_TIME_relative_min (lowestRestartDelay,
773 GNUNET_TIME_absolute_get_remaining
779 if (lowestRestartDelay.value != GNUNET_TIME_UNIT_FOREVER_REL.value)
782 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
783 "Will restart process in %llums\n",
784 (unsigned long long) lowestRestartDelay.value);
787 = GNUNET_SCHEDULER_add_delayed (sched,
789 &delayed_restart_task,
796 * Task triggered whenever we receive a SIGCHLD (child process died).
799 * @param cls closure, NULL if we need to self-restart
/* Scheduler task bound to the read end of the signal pipe.  It drains one
 * byte from the pipe, checks the status of service processes, notifies a
 * client waiting on a kill, schedules restarts for unexpected deaths and
 * re-registers itself on the pipe. */
803 maint_child_death (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
805 struct ServiceList *pos;
806 struct ServiceList *prev;
807 struct ServiceList *next;
812 enum GNUNET_OS_ProcessStatusType statusType;
813 unsigned long statusCode;
815 child_death_task = GNUNET_SCHEDULER_NO_TASK;
/* Not woken by readable data: re-register on the pipe.  The assignment
 * target and the early return are presumably on lines not shown here. */
816 if (0 == (tc->reason & GNUNET_SCHEDULER_REASON_READ_READY))
819 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
820 &maint_child_death, NULL);
823 /* consume the signal */
824 GNUNET_break (0 < GNUNET_DISK_file_read (pr, &c, sizeof (c)));
826 /* check for services that died (WAITPID) */
829 while (NULL != (pos = next))
/* An entry is left alone when the status query fails, reports GNUNET_NO,
 * or reports the process as stopped or still running. */
837 if ((GNUNET_SYSERR == (ret = GNUNET_OS_process_status (pos->pid,
840 ( (ret == GNUNET_NO) ||
841 (statusType == GNUNET_OS_PROCESS_STOPPED) ||
842 (statusType == GNUNET_OS_PROCESS_RUNNING)) )
/* Translate the termination kind into a label for the log messages.
 * NOTE(review): statusCode is unsigned long while statcode is printed with
 * %d below; the declaration of statcode is not visible -- confirm its
 * type. */
847 if (statusType == GNUNET_OS_PROCESS_EXITED)
849 statstr = _( /* process termination method */ "exit");
850 statcode = statusCode;
852 else if (statusType == GNUNET_OS_PROCESS_SIGNALED)
854 statstr = _( /* process termination method */ "signal");
855 statcode = statusCode;
859 statstr = _( /* process termination method */ "unknown");
/* A client parked by stop_service is waiting: answer it, finish its
 * request and drop the reference taken with GNUNET_SERVER_client_keep. */
863 if (NULL != pos->killing_client)
869 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
870 "Service `%s' stopped\n",
872 signal_result (pos->killing_client,
873 pos->name, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
874 GNUNET_SERVER_receive_done (pos->killing_client, GNUNET_OK);
875 GNUNET_SERVER_client_drop (pos->killing_client);
/* Unexpected death outside shutdown: schedule a restart. */
879 if (GNUNET_YES != in_shutdown)
881 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
883 ("Service `%s' terminated with status %s/%d, will try to restart it!\n"),
884 pos->name, statstr, statcode);
885 /* schedule restart */
/* The restart time is "now + backoff"; the backoff is then doubled while
 * it is still below EXPONENTIAL_BACKOFF_THRESHOLD. */
887 = GNUNET_TIME_relative_to_absolute (pos->backoff);
888 if (pos->backoff.value < EXPONENTIAL_BACKOFF_THRESHOLD)
890 = GNUNET_TIME_relative_multiply (pos->backoff, 2);
/* Replace any pending restart task so that delayed_restart_task can
 * recompute the earliest restart time. */
891 if (GNUNET_SCHEDULER_NO_TASK != child_restart_task)
892 GNUNET_SCHEDULER_cancel (sched, child_restart_task);
894 = GNUNET_SCHEDULER_add_with_priority (sched,
895 GNUNET_SCHEDULER_PRIORITY_IDLE,
896 &delayed_restart_task,
901 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
902 "Service `%s' terminated with status %s/%d\n",
903 pos->name, statstr, statcode);
/* Once the service list is empty (the rest of the condition is not
 * visible) the server is destroyed and the SIGCHLD handler removed. */
909 if ( (running == NULL) &&
912 GNUNET_SERVER_destroy (server);
913 GNUNET_SIGNAL_handler_uninstall (shc_chld);
/* Otherwise, presumably, keep listening for further child deaths. */
919 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
920 &maint_child_death, NULL);
926 * List of handlers for the messages understood by this service.
/* Dispatch table mapping the ARM START and STOP message types to their
 * handlers.  NOTE(review): the trailing 0 is presumably the expected
 * message size, with 0 meaning "variable" -- confirm against the
 * GNUNET_SERVER_MessageHandler definition.  The table terminator is not
 * visible in this extract. */
928 static struct GNUNET_SERVER_MessageHandler handlers[] = {
929 {&handle_start, NULL, GNUNET_MESSAGE_TYPE_ARM_START, 0},
930 {&handle_stop, NULL, GNUNET_MESSAGE_TYPE_ARM_STOP, 0},
935 * Signal handler called for SIGCHLD. Triggers the
936 * respective handler by writing to the trigger pipe.
/* SIGCHLD handler (installed in run): writes to the write end of sigpipe so
 * that maint_child_death, which waits on the read end, gets scheduled.  The
 * declaration of c and the length argument are on lines not shown here. */
939 sighandler_child_death ()
944 GNUNET_DISK_file_write (GNUNET_DISK_pipe_handle
945 (sigpipe, GNUNET_DISK_PIPE_END_WRITE), &c,
951 * Process arm requests.
954 * @param s scheduler to use
955 * @param serv the initialized server
956 * @param c configuration to use
/* Service entry point passed to GNUNET_SERVICE_run by main.  It installs
 * the SIGCHLD handler and signal pipe, registers the child-death task,
 * starts the default services, registers the message handlers and
 * schedules the configuration watcher.
 * NOTE(review): the name line, the first parameter and the assignments of
 * the cfg, sched and server globals are not visible in this extract. */
960 struct GNUNET_SCHEDULER_Handle *s,
961 struct GNUNET_SERVER_Handle *serv,
962 const struct GNUNET_CONFIGURATION_Handle *c)
964 char *defaultservices;
970 GNUNET_assert (serv != NULL);
971 shc_chld = GNUNET_SIGNAL_handler_install (SIGCHLD, &sighandler_child_death);
/* The signal pipe is created exactly once; pr is its read end. */
972 GNUNET_assert (sigpipe == NULL);
973 sigpipe = GNUNET_DISK_pipe (GNUNET_NO);
974 GNUNET_assert (sigpipe != NULL);
975 pr = GNUNET_DISK_pipe_handle (sigpipe, GNUNET_DISK_PIPE_END_READ);
976 GNUNET_assert (pr != NULL);
977 GNUNET_SERVER_ignore_shutdown (serv, GNUNET_YES);
/* Presumably registers shutdown_task; the callback argument is on a line
 * not visible here. */
978 GNUNET_SCHEDULER_add_delayed (sched,
979 GNUNET_TIME_UNIT_FOREVER_REL,
983 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
984 &maint_child_death, NULL);
/* The empty string is the fallback for prefix_command. */
987 GNUNET_CONFIGURATION_get_value_string (cfg,
991 prefix_command = GNUNET_strdup ("");
992 /* start default services... */
994 GNUNET_CONFIGURATION_get_value_string (cfg,
1000 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
1001 "Starting default services `%s'\n", defaultservices);
/* strtok splits the list in place on single spaces.  Each service is
 * started with a NULL client, so there is nobody to notify. */
1003 pos = strtok (defaultservices, " ");
1006 start_service (NULL, pos);
1007 pos = strtok (NULL, " ");
1009 GNUNET_free (defaultservices);
1014 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
1015 "No default services configured.\n");
1019 /* process client requests */
/* Uses the global `server`, not the `serv` parameter. */
1020 GNUNET_SERVER_add_handlers (server, handlers);
1022 /* manage services */
1023 GNUNET_SCHEDULER_add_with_priority (sched,
1024 GNUNET_SCHEDULER_PRIORITY_IDLE,
1025 &config_change_task, NULL);
1030 * The main function for the arm service.
1032 * @param argc number of arguments from the command line
1033 * @param argv command line arguments
1034 * @return 0 ok, 1 on error
/* Program entry point: runs the "arm" service with `run` as its callback
 * and maps a GNUNET_OK result to exit status 0, anything else to 1. */
1037 main (int argc, char *const *argv)
1039 return (GNUNET_OK ==
1040 GNUNET_SERVICE_run (argc,
1041 argv, "arm", GNUNET_YES, &run, NULL)) ? 0 : 1;
1044 /* end of gnunet-service-arm.c */