2 This file is part of GNUnet.
3 (C) 2009 Christian Grothoff (and other contributing authors)
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
22 * @file arm/gnunet-service-arm.c
23 * @brief the automated restart manager service
24 * @author Christian Grothoff
27 * - multiple start-stop requests with RC>1 can result
28 * in UP/DOWN signals based on "pending" that are inaccurate...
29 * => have list of clients waiting for a resolution instead of
30 * giving instant (but incorrect) replies
31 * - need to test auto-restart code on configuration changes;
32 * - should refine restart code to check if *relevant* parts of the
33 * configuration were changed (anything in the section for the service)
34 * - should have a way to specify dependencies between services and
35 * manage restarts of groups of services
38 #include "gnunet_client_lib.h"
39 #include "gnunet_getopt_lib.h"
40 #include "gnunet_os_lib.h"
41 #include "gnunet_protocols.h"
42 #include "gnunet_service_lib.h"
43 #include "gnunet_signal_lib.h"
48 * Check for configuration file changes every 5s.
/* NOTE(review): no reference to MAINT_FREQUENCY appears in the visible part
   of this file; the one visible scheduling of config_change_task uses
   GNUNET_SCHEDULER_add_with_priority without a delay. Confirm that the
   periodic check described above actually happens. */
50 #define MAINT_FREQUENCY GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_SECONDS, 5)
53 * Threshold after which exponential backoff shouldn't increase (in ms); 30m
/* 1000 ms per second * 60 s per minute * 30 minutes. maint_child_death
   compares backoff.value against this before multiplying the backoff by 2. */
55 #define EXPONENTIAL_BACKOFF_THRESHOLD (1000 * 60 * 30)
59 * List of our services.
64 * Function to call if waitpid informs us that
68 * @param pos entry in the service list of the process that died
70 typedef void (*CleanCallback) (void *cls, struct ServiceList * pos);
73 * List of our services.
78 * This is a linked list.
80 struct ServiceList *next;
83 * Name of the service.
88 * Name of the binary used.
93 * Name of the configuration file used.
98 * Function to call upon kill completion (waitpid), NULL
99 * if we should simply restart the process.
101 CleanCallback kill_continuation;
104 * Closure for kill_continuation.
106 void *kill_continuation_cls;
109 * Process ID of the child.
114 * Last time the config of this service was
119 /* Process exponential backoff time */
120 struct GNUNET_TIME_Relative backoff;
122 /* Absolute time at which the process is scheduled to restart in case of death */
123 struct GNUNET_TIME_Absolute restartAt;
126 * Reference counter (counts how many times we've been
127 * asked to start the service). We only actually stop
128 * it once rc hits zero.
135 * List of running services.
137 static struct ServiceList *running;
142 static const struct GNUNET_CONFIGURATION_Handle *cfg;
147 static struct GNUNET_SCHEDULER_Handle *sched;
150 * Command to prepend to each actual command.
152 static char *prefix_command;
155 * ID of task called whenever we get a SIGCHLD.
157 static GNUNET_SCHEDULER_TaskIdentifier child_death_task;
160 * ID of task called whenever the timeout for restarting a child
163 static GNUNET_SCHEDULER_TaskIdentifier child_restart_task;
166 * Context for our SIGCHLD handler.
168 static struct GNUNET_SIGNAL_Context *shc_chld;
171 * Pipe used to communicate shutdown via signal.
173 static struct GNUNET_DISK_PipeHandle *sigpipe;
176 * Reading end of the signal pipe.
178 static const struct GNUNET_DISK_FileHandle *pr;
181 * Are we in shutdown mode?
183 static int in_shutdown;
187 * Handle to our server instance. Our server is a bit special in that
188 * its service is not immediately stopped once we get a shutdown
189 * request (since we need to continue service until all of our child
190 * processes are dead). This handle is used to shut down the server
191 * (and thus trigger process termination) once all child processes are
192 * also dead. A special option in the ARM configuration modifies the
193 * behaviour of the service implementation to not do the shutdown
196 static struct GNUNET_SERVER_Handle *server;
200 * If the configuration file changes, restart tasks that depended on that
203 * @param cls closure (always NULL)
/*
 * Checks services for configuration-file changes. For an entry whose
 * config file can be STAT'ed and whose st_mtime is newer than the recorded
 * pos->mtime (a further condition sits on a line not visible in this
 * listing), an INFO message is logged and the process is sent SIGTERM.
 *
 * cls: closure; the only visible scheduling site passes NULL.
 * tc:  scheduler task context; not referenced in the visible lines.
 */
207 config_change_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
209 struct ServiceList *pos;
215 /* FIXME: this test for config change is a bit too coarse grained */
216 if ( (0 == STAT (pos->config, &sbuf)) &&
217 (pos->mtime < sbuf.st_mtime) &&
/* Fix: the format string contains a %s conversion, so the service name has
   to be passed; calling with no argument is undefined behaviour. */
220 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
221 _("Restarting service `%s' due to configuration file change.\n"), pos->name);
/* A failing kill is only reported as a warning. */
222 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
223 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
/* Backoff goes back to the same initial value start_service assigns.
   NOTE(review): the line between 223 and 225 is not visible, so whether
   this reset depends on the kill result cannot be told from here. */
225 pos->backoff = GNUNET_TIME_UNIT_MILLISECONDS;
234 * Transmit a status result message.
236 * @param cls pointer to "uint16_t*" with message type
237 * @param size number of bytes available in buf
238 * @param buf where to copy the message, NULL on error
239 * @return number of bytes copied to buf
/*
 * Fills in a header-only reply message. The message type is read through
 * res; the visible lines return 0 on the failure path and
 * sizeof (struct GNUNET_MessageHeader) otherwise.
 */
242 write_result (void *cls, size_t size, void *buf)
245 struct GNUNET_MessageHeader *msg;
/* Failure path (its guarding test is on a line not shown in this listing). */
249 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
250 _("Could not send status result to client\n"));
251 return 0; /* error, not much we can do */
/* There must be room for at least one message header. */
253 GNUNET_assert (size >= sizeof (struct GNUNET_MessageHeader));
/* No payload: the size field covers just the header, and both fields are
   converted with htons. */
255 msg->size = htons (sizeof (struct GNUNET_MessageHeader));
256 msg->type = htons (*res);
258 return sizeof (struct GNUNET_MessageHeader);
264 * Signal our client that we will start or stop the
267 * @param client who is being signalled
268 * @param name name of the service
269 * @param result message type to send
270 * @return NULL if it was not found
/*
 * Reports result for service name to client by requesting transmission of
 * a header-sized message with no timeout.
 */
273 signal_result (struct GNUNET_SERVER_Client *client,
274 const char *name, uint16_t result)
/* Logged on the no-client path (the test itself is not visible here). */
280 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
282 ("Not sending status result to client: no client known\n"));
/* Any result other than ARM_IS_DOWN is printed as "up" in this debug line,
   which includes ARM_IS_UNKNOWN as sent by stop_service. */
286 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
287 "Telling client that service `%s' is now %s\n",
289 result == GNUNET_MESSAGE_TYPE_ARM_IS_DOWN ? "down" : "up");
/* Heap cell for the result code. NOTE(review): the matching free is not
   visible in this listing; confirm the transmit callback releases it. */
291 res = GNUNET_malloc (sizeof (uint16_t));
293 GNUNET_SERVER_notify_transmit_ready (client,
294 sizeof (struct GNUNET_MessageHeader),
295 GNUNET_TIME_UNIT_FOREVER_REL,
301 * Find the process with the given service
302 * name in the given list, remove it and return it.
304 * @param name which service entry to look up
305 * @return NULL if it was not found
/*
 * Looks up a service entry by name. The visible lines show the comparison
 * and the unlink step; the traversal and the return statements are on
 * lines not shown in this listing.
 */
307 static struct ServiceList *
308 find_name (const char *name)
310 struct ServiceList *pos;
311 struct ServiceList *prev;
/* Exact, case-sensitive match on the service name. */
317 if (0 == strcmp (pos->name, name))
/* Unlink: the predecessor now points past the matched entry. */
322 prev->next = pos->next;
334 * Free an entry in the service list.
336 * @param pos entry to free
/* Releases the strings held by a service-list entry. */
339 free_entry (struct ServiceList *pos)
/* config and binary go through the _non_null variant, name through plain
   GNUNET_free. */
341 GNUNET_free_non_null (pos->config);
342 GNUNET_free_non_null (pos->binary);
343 GNUNET_free (pos->name);
349 * Actually start the process for the given service.
351 * @param sl identifies service to start
/*
 * Builds an argument vector for the service and starts it, storing the
 * result of GNUNET_OS_start_process_v in sl->pid. Visible order of the
 * vector: words of the prefix, the binary, -c and the config file,
 * optionally -L DEBUG, words of the options string, then NULL.
 */
354 start_process (struct ServiceList *sl)
359 unsigned int argv_size;
362 const char *firstarg;
/* PREFIX and OPTIONS are read from the configuration section named after
   the service. Lines 369 and 373 substitute prefix_command and the empty
   string; presumably that is the fallback when the lookup fails (the
   result checks are on lines not shown). */
367 GNUNET_CONFIGURATION_get_value_string (cfg,
368 sl->name, "PREFIX", &loprefix))
369 loprefix = GNUNET_strdup (prefix_command);
371 GNUNET_CONFIGURATION_get_value_string (cfg,
372 sl->name, "OPTIONS", &options))
373 options = GNUNET_strdup ("");
374 use_debug = GNUNET_CONFIGURATION_get_value_yesno (cfg, sl->name, "DEBUG");
376 GNUNET_log (GNUNET_ERROR_TYPE_INFO, _("Starting service `%s'\n"), sl->name);
378 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
379 "Starting service `%s' using binary `%s' and configuration `%s'\n",
380 sl->name, sl->binary, sl->config);
/* First pass over both strings. NOTE(review): the loop bodies are not
   visible; presumably they count words so that argv can be sized. */
386 while ('\0' != *lopos)
393 while ('\0' != *optpos)
400 argv = GNUNET_malloc (argv_size * sizeof (char *));
/* Second pass over the prefix: skip spaces, record where a word starts,
   then advance to the next space or the terminator. */
404 while ('\0' != *lopos)
406 while (*lopos == ' ')
412 argv[argv_size++] = lopos;
413 while (('\0' != *lopos) && (' ' != *lopos))
/* Fixed part of the command line. */
421 firstarg = sl->binary;
422 argv[argv_size++] = sl->binary;
423 argv[argv_size++] = "-c";
424 argv[argv_size++] = sl->config;
425 if (GNUNET_YES == use_debug)
427 argv[argv_size++] = "-L";
428 argv[argv_size++] = "DEBUG";
/* Same word splitting as above, applied to the options string. */
431 while ('\0' != *optpos)
433 while (*optpos == ' ')
437 argv[argv_size++] = optpos;
438 while (('\0' != *optpos) && (' ' != *optpos))
/* Terminate the vector and start the process. */
445 argv[argv_size++] = NULL;
446 sl->pid = GNUNET_OS_start_process_v (firstarg, argv);
447 /* FIXME: should check sl->pid */
/* argv stores the lopos/optpos pointers rather than copies (presumably
   pointing into loprefix and options), so these buffers are released only
   after the start call. */
449 GNUNET_free (loprefix);
450 GNUNET_free (options);
455 * Start the specified service.
457 * @param client who is asking for this
458 * @param servicename name of the service to start
/*
 * Handles a request to start servicename on behalf of client. Every
 * visible outcome is reported through signal_result: ARM_IS_DOWN when ARM
 * is shutting down or the binary/config cannot be resolved, ARM_IS_UP for
 * an already known service and at the end of the normal path.
 */
461 start_service (struct GNUNET_SERVER_Client *client, const char *servicename)
463 struct ServiceList *sl;
/* No new services once shutdown has begun. */
468 if (GNUNET_YES == in_shutdown)
470 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
471 _("ARM is shutting down, service `%s' not started.\n"),
473 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
/* find_name unlinks the entry it finds (see its prev->next update). */
476 sl = find_name (servicename);
479 /* already running, just increment RC */
480 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
481 _("Service `%s' already running.\n"), servicename);
485 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
/* The BINARY option of the section named after the service gives the
   executable. */
489 GNUNET_CONFIGURATION_get_value_string (cfg,
490 servicename, "BINARY", &binary))
492 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
493 _("Binary implementing service `%s' not known!\n"),
495 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
/* The configuration file must also be STAT-able; sbuf is used further
   down for the modification time. */
499 GNUNET_CONFIGURATION_get_value_filename (cfg,
503 (0 != STAT (config, &sbuf)))
/* NOTE(review): config is released with GNUNET_free_non_null below, which
   suggests it can be NULL here, yet it is passed to a %s conversion.
   Confirm. */
505 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
506 _("Configuration file `%s' for service `%s' not known!\n"),
507 config, servicename);
508 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
509 GNUNET_free (binary);
510 GNUNET_free_non_null (config);
/* New entry. mtime is recorded for the later comparison in
   config_change_task; backoff starts at one millisecond and no restart is
   scheduled yet. */
513 sl = GNUNET_malloc (sizeof (struct ServiceList));
514 sl->name = GNUNET_strdup (servicename);
519 sl->mtime = sbuf.st_mtime;
520 sl->backoff = GNUNET_TIME_UNIT_MILLISECONDS;
521 sl->restartAt = GNUNET_TIME_UNIT_FOREVER_ABS;
526 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
531 * Free the given entry in the service list and signal
532 * the given client that the service is now down.
534 * @param cls pointer to the client ("struct GNUNET_SERVER_Client*")
535 * @param pos entry for the service
/*
 * Kill continuation installed by stop_service. Tells the client that asked
 * for the stop that the service is down, completes that request and drops
 * the client reference that stop_service took with
 * GNUNET_SERVER_client_keep.
 */
538 free_and_signal (void *cls, struct ServiceList *pos)
540 struct GNUNET_SERVER_Client *client = cls;
/* NOTE(review): no find_name call is visible in this function; the comment
   below presumably refers to removal done elsewhere. Confirm. */
541 /* find_name will remove "pos" from the list! */
542 GNUNET_log (GNUNET_ERROR_TYPE_INFO, "Service `%s' stopped\n", pos->name);
543 signal_result (client, pos->name, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
544 GNUNET_SERVER_receive_done (client, GNUNET_OK);
545 GNUNET_SERVER_client_drop (client);
551 * Stop the specified service.
553 * @param client who is asking for this
554 * @param servicename name of the service to stop
/*
 * Handles a request to stop servicename. Each early-exit path visible here
 * answers the client through signal_result and then calls
 * GNUNET_SERVER_receive_done; the final path sends SIGTERM and leaves the
 * answer to the kill continuation.
 */
557 stop_service (struct GNUNET_SERVER_Client *client, const char *servicename)
559 struct ServiceList *pos;
561 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
562 _("Preparing to stop `%s'\n"), servicename);
563 pos = find_name (servicename);
/* Answer sent for a service that is not known (the test is on a line not
   shown in this listing). */
566 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UNKNOWN);
567 GNUNET_SERVER_receive_done (client, GNUNET_OK);
572 /* RC>1, just decrement RC */
577 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
578 "Service `%s' still used by %u clients, will keep it running!\n",
579 servicename, pos->rc);
/* Still referenced, so the client is told the service remains up. */
581 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
582 GNUNET_SERVER_receive_done (client, GNUNET_OK);
586 pos->rc--; /* decrement RC to zero */
/* A continuation that is already set means an earlier stop request is
   waiting for the process to die. */
587 if (pos->kill_continuation != NULL)
589 /* killing already in progress */
591 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
592 "Service `%s' is already down\n", servicename);
594 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
595 GNUNET_SERVER_receive_done (client, GNUNET_OK);
/* During shutdown shutdown_task has already sent SIGTERM to the services,
   so no second signal is sent here. */
599 if (GNUNET_YES == in_shutdown)
602 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
603 "Termination request already sent to `%s' (since ARM is in shutdown).\n",
606 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
607 GNUNET_SERVER_receive_done (client, GNUNET_OK);
613 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
614 "Sending kill signal to service `%s', waiting for process to die.\n",
617 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
618 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
/* Defer the reply: free_and_signal answers this client once
   maint_child_death invokes the continuation. The client is kept alive
   until then; free_and_signal drops it again. */
621 pos->kill_continuation = &free_and_signal;
622 pos->kill_continuation_cls = client;
623 GNUNET_SERVER_client_keep (client);
628 * Handle START-message.
630 * @param cls closure (always NULL)
631 * @param client identification of the client
632 * @param message the actual message
633 * @return GNUNET_OK to keep the connection open,
634 * GNUNET_SYSERR to close it (signal serious error)
/*
 * Handler for ARM_START messages: extracts the service name that follows
 * the message header and passes it to start_service.
 */
637 handle_start (void *cls,
638 struct GNUNET_SERVER_Client *client,
639 const struct GNUNET_MessageHeader *message)
641 const char *servicename;
/* size becomes the payload length: total message size minus the header. */
644 size = ntohs (message->size);
645 size -= sizeof (struct GNUNET_MessageHeader);
/* The payload starts right behind the header. */
646 servicename = (const char *) &message[1];
/* Reject an empty payload or one whose last byte is not the string
   terminator; such a request is finished with GNUNET_SYSERR. */
647 if ((size == 0) || (servicename[size - 1] != '\0'))
650 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
653 start_service (client, servicename);
654 GNUNET_SERVER_receive_done (client, GNUNET_OK);
659 * Handle STOP-message.
661 * @param cls closure (always NULL)
662 * @param client identification of the client
663 * @param message the actual message
664 * @return GNUNET_OK to keep the connection open,
665 * GNUNET_SYSERR to close it (signal serious error)
/*
 * Handler for ARM_STOP messages: extracts the service name that follows
 * the message header and passes it to stop_service. No receive_done call
 * is visible here for the accepted case; stop_service and free_and_signal
 * issue it.
 */
668 handle_stop (void *cls,
669 struct GNUNET_SERVER_Client *client,
670 const struct GNUNET_MessageHeader *message)
672 const char *servicename;
/* size becomes the payload length: total message size minus the header. */
675 size = ntohs (message->size);
676 size -= sizeof (struct GNUNET_MessageHeader);
/* The payload starts right behind the header. */
677 servicename = (const char *) &message[1];
/* Reject an empty payload or one whose last byte is not the string
   terminator; such a request is finished with GNUNET_SYSERR. */
678 if ((size == 0) || (servicename[size - 1] != '\0'))
681 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
684 stop_service (client, servicename);
689 * Remove all entries for tasks that are not running
690 * (pid = 0) from the running list (they will no longer
691 * be restarted since we are shutting down).
696 struct ServiceList *pos;
697 struct ServiceList *next;
698 struct ServiceList *prev;
719 * We are done with everything. Stop remaining
720 * tasks, signal handler and the server.
725 GNUNET_SERVER_destroy (server);
727 GNUNET_SIGNAL_handler_uninstall (shc_chld);
729 GNUNET_SCHEDULER_cancel (sched, child_death_task);
730 child_death_task = GNUNET_SCHEDULER_NO_TASK;
735 * Task run for shutdown.
737 * @param cls closure, NULL if we need to self-restart
/*
 * Shutdown task: switches ARM into shutdown mode and sends SIGTERM to
 * services. The loop over the service list and any condition on the
 * entries are on lines not shown in this listing.
 */
741 shutdown_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
743 struct ServiceList *pos;
745 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, _("Stopping all services\n"));
/* From here on start_service refuses requests, and stop_service and
   maint_child_death take their shutdown branches. */
746 in_shutdown = GNUNET_YES;
753 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
754 "Sending SIGTERM to `%s'\n", pos->name);
/* A failing kill is only reported as a warning. */
756 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
757 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
767 * Task run whenever it is time to restart a child that died.
769 * @param cls closure, always NULL
/*
 * Restart task: looks at entries that have no process (pid == 0) while ARM
 * is not shutting down. Entries whose restartAt time has been reached are
 * logged as restarting; for the others the smallest remaining delay is
 * tracked so that this task can be scheduled again.
 */
773 delayed_restart_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
775 struct ServiceList *pos;
776 struct GNUNET_TIME_Relative lowestRestartDelay;
/* This task is running now, so the stored identifier is cleared. */
778 child_restart_task = GNUNET_SCHEDULER_NO_TASK;
/* Special handling when run because of scheduler shutdown (the body of
   this branch is not visible in this listing). */
779 if (0 != (tc->reason & GNUNET_SCHEDULER_REASON_SHUTDOWN))
/* FOREVER_REL serves as the marker for "nothing left to wait for". */
786 lowestRestartDelay = GNUNET_TIME_UNIT_FOREVER_REL;
788 /* check for services that need to be restarted due to
789 configuration changes or because the last restart failed */
793 if ( (pos->pid == 0) &&
794 (GNUNET_YES != in_shutdown) )
/* A remaining time of zero means the restart time has been reached. */
796 if (GNUNET_TIME_absolute_get_remaining (pos->restartAt).value == 0)
798 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
799 _("Restarting service `%s'.\n"), pos->name);
/* Otherwise keep the minimum of the delays seen so far. */
805 = GNUNET_TIME_relative_min (lowestRestartDelay,
806 GNUNET_TIME_absolute_get_remaining
/* Schedule another run only if some entry is still waiting. */
812 if (lowestRestartDelay.value != GNUNET_TIME_UNIT_FOREVER_REL.value)
814 = GNUNET_SCHEDULER_add_delayed (sched,
816 &delayed_restart_task,
824 * @param cls closure (always NULL)
/*
 * Task run when the read end of the signal pipe becomes readable. It
 * queries the status of the services, and for one that has terminated it
 * either calls the pending kill continuation or, outside of shutdown,
 * schedules a restart with a growing backoff. It then re-registers itself
 * on the pipe.
 */
828 maint_child_death (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
830 struct ServiceList *pos;
831 struct ServiceList *prev;
832 struct ServiceList *next;
838 child_death_task = GNUNET_SCHEDULER_NO_TASK;
/* Not woken by readable data: just wait on the pipe again. */
839 if (0 == (tc->reason & GNUNET_SCHEDULER_REASON_READ_READY))
842 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
843 &maint_child_death, NULL);
/* NOTE(review): the result of this read is not checked. */
846 /* consume the signal */
847 GNUNET_DISK_file_read (pr, &c, sizeof (c));
849 /* check for services that died (WAITPID) */
852 while (NULL != (pos = next))
854 enum GNUNET_OS_ProcessStatusType statusType;
855 unsigned long statusCode;
/* Entries whose status query fails, reports GNUNET_NO, or shows the
   process as stopped or still running are not treated as terminated
   (the body of this branch is not visible in this listing). */
863 if ((GNUNET_SYSERR == (ret = GNUNET_OS_process_status (pos->pid,
866 ( (ret == GNUNET_NO) ||
867 (statusType == GNUNET_OS_PROCESS_STOPPED) ||
868 (statusType == GNUNET_OS_PROCESS_RUNNING)) )
/* Translate how the process ended into a label and code for the log. */
873 if (statusType == GNUNET_OS_PROCESS_EXITED)
875 statstr = _( /* process termination method */ "exit");
876 statcode = statusCode;
878 else if (statusType == GNUNET_OS_PROCESS_SIGNALED)
880 statstr = _( /* process termination method */ "signal");
881 statcode = statusCode;
885 statstr = _( /* process termination method */ "unknown");
/* A continuation is set by stop_service when a client asked for the stop;
   it is handed its closure and the entry. */
889 if (NULL != pos->kill_continuation)
895 pos->kill_continuation (pos->kill_continuation_cls, pos);
/* Unrequested termination while ARM is still running: plan a restart. */
898 if (GNUNET_YES != in_shutdown)
900 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
902 ("Service `%s' terminated with status %s/%d, will try to restart it!\n"),
903 pos->name, statstr, statcode);
904 /* schedule restart */
/* The restart time is now plus the current backoff; the multiplication by
   2 below applies only while the backoff is under the threshold. */
906 = GNUNET_TIME_relative_to_absolute (pos->backoff);
907 if (pos->backoff.value < EXPONENTIAL_BACKOFF_THRESHOLD)
909 = GNUNET_TIME_relative_multiply (pos->backoff, 2);
/* Replace any pending restart task with a fresh idle-priority run of
   delayed_restart_task. */
910 if (GNUNET_SCHEDULER_NO_TASK != child_restart_task)
911 GNUNET_SCHEDULER_cancel (sched, child_restart_task);
913 = GNUNET_SCHEDULER_add_with_priority (sched,
914 GNUNET_SCHEDULER_PRIORITY_IDLE,
915 &delayed_restart_task,
920 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
921 "Service `%s' terminated with status %s/%d\n",
922 pos->name, statstr, statcode);
/* With no services left (and a second condition on a line not shown), the
   server is destroyed and the SIGCHLD handler removed. */
926 if ( (running == NULL) &&
929 GNUNET_SERVER_destroy (server);
930 GNUNET_SIGNAL_handler_uninstall (shc_chld);
/* Wait for the next byte on the signal pipe. */
936 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
937 &maint_child_death, NULL);
943 * List of handlers for the messages understood by this service.
/* Maps the ARM_START and ARM_STOP message types to their handlers; both
   entries pass NULL as closure. NOTE(review): the trailing 0 is presumably
   the expected message size, with 0 meaning variable length; both handlers
   validate the payload size themselves. Confirm against the server API. */
945 static struct GNUNET_SERVER_MessageHandler handlers[] = {
946 {&handle_start, NULL, GNUNET_MESSAGE_TYPE_ARM_START, 0},
947 {&handle_stop, NULL, GNUNET_MESSAGE_TYPE_ARM_STOP, 0},
952 * Signal handler called for SIGCHLD; it writes to the signal pipe so that the child-death task gets to run.
/* Installed for SIGCHLD in run. It only writes to the write end of
   sigpipe; the actual work happens in maint_child_death, which waits on
   the read end. */
955 sighandler_child_death ()
959 GNUNET_DISK_file_write (GNUNET_DISK_pipe_handle
960 (sigpipe, GNUNET_DISK_PIPE_END_WRITE), &c,
966 * Process arm requests.
969 * @param s scheduler to use
970 * @param serv the initialized server
971 * @param c configuration to use
975 struct GNUNET_SCHEDULER_Handle *s,
976 struct GNUNET_SERVER_Handle *serv,
977 const struct GNUNET_CONFIGURATION_Handle *c)
/*
 * Service initialisation (passed as &run to GNUNET_SERVICE_run in main):
 * sets up the SIGCHLD handler and signal pipe, schedules the maintenance
 * tasks, starts the configured default services and registers the message
 * handlers.
 */
979 char *defaultservices;
985 GNUNET_assert (serv != NULL);
/* NOTE(review): the handler is installed before sigpipe is created two
   lines further down, and the handler uses sigpipe. Confirm that no
   SIGCHLD can arrive in between. */
986 shc_chld = GNUNET_SIGNAL_handler_install (SIGCHLD, &sighandler_child_death);
987 GNUNET_assert (sigpipe == NULL);
988 sigpipe = GNUNET_DISK_pipe (GNUNET_NO);
989 GNUNET_assert (sigpipe != NULL);
990 pr = GNUNET_DISK_pipe_handle (sigpipe, GNUNET_DISK_PIPE_END_READ);
991 GNUNET_assert (pr != NULL);
992 GNUNET_SERVER_ignore_shutdown (serv, GNUNET_YES);
/* A task with an infinite delay (its callback is on a line not shown;
   presumably the shutdown task). */
993 GNUNET_SCHEDULER_add_delayed (sched,
994 GNUNET_TIME_UNIT_FOREVER_REL,
/* maint_child_death runs once the read end of the pipe has data. */
998 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
999 &maint_child_death, NULL);
/* Global command prefix; the empty string is assigned on line 1006
   (presumably when the option is absent; the test is not visible). */
1002 GNUNET_CONFIGURATION_get_value_string (cfg,
1006 prefix_command = GNUNET_strdup ("");
1007 /* start default services... */
1009 GNUNET_CONFIGURATION_get_value_string (cfg,
1015 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
1016 "Starting default services `%s'\n", defaultservices);
/* The list is split on single spaces; strtok modifies defaultservices in
   place. Default services are started with no client to notify. */
1018 pos = strtok (defaultservices, " ");
1021 start_service (NULL, pos);
1022 pos = strtok (NULL, " ");
1024 GNUNET_free (defaultservices);
1029 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
1030 "No default services configured.\n");
1034 /* process client requests */
1035 GNUNET_SERVER_add_handlers (server, handlers);
1037 /* manage services */
1038 GNUNET_SCHEDULER_add_with_priority (sched,
1039 GNUNET_SCHEDULER_PRIORITY_IDLE,
1040 &config_change_task, NULL);
1045 * The main function for the arm service.
1047 * @param argc number of arguments from the command line
1048 * @param argv command line arguments
1049 * @return 0 ok, 1 on error
/* Program entry point: hands control to GNUNET_SERVICE_run under the
   service name "arm" with run as the callback, and maps a GNUNET_OK result
   to exit status 0, anything else to 1. */
1052 main (int argc, char *const *argv)
1054 return (GNUNET_OK ==
1055 GNUNET_SERVICE_run (argc,
1056 argv, "arm", GNUNET_YES, &run, NULL)) ? 0 : 1;
1059 /* end of gnunet-service-arm.c */