2 This file is part of GNUnet.
3 (C) 2009 Christian Grothoff (and other contributing authors)
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
22 * @file arm/gnunet-service-arm.c
23 * @brief the automated restart manager service
24 * @author Christian Grothoff
27 * - multiple start-stop requests with RC>1 can result
28 * in UP/DOWN signals based on "pending" that are inaccurate...
29 * => have list of clients waiting for a resolution instead of
30 * giving instant (but incorrect) replies
31 * - need to test auto-restart code on configuration changes;
32 * - should refine restart code to check if *relevant* parts of the
33 * configuration were changed (anything in the section for the service)
34 * - should have a way to specify dependencies between services and
35 * manage restarts of groups of services
38 #include "gnunet_client_lib.h"
39 #include "gnunet_getopt_lib.h"
40 #include "gnunet_os_lib.h"
41 #include "gnunet_protocols.h"
42 #include "gnunet_service_lib.h"
43 #include "gnunet_signal_lib.h"
48 * Check for configuration file changes every 5s.
50 #define MAINT_FREQUENCY GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_SECONDS, 5)
53 * Threshold after which exponential backoff shouldn't increase (in ms); 30m
55 #define EXPONENTIAL_BACKOFF_THRESHOLD (1000 * 60 * 30)
/**
 * List of our services.
 */
struct ServiceList;

/**
 * One entry in the singly-linked list of services managed by ARM.
 */
struct ServiceList
{
  /**
   * This is a linked list.
   */
  struct ServiceList *next;

  /**
   * Name of the service.
   */
  char *name;

  /**
   * Name of the binary used.
   */
  char *binary;

  /**
   * Name of the configuration file used.
   */
  char *config;

  /**
   * Client to notify upon kill completion (waitpid), NULL
   * if we should simply restart the process.
   */
  struct GNUNET_SERVER_Client *killing_client;

  /**
   * Process ID of the child.  Zero means that no process is
   * currently running for this entry (for example while the
   * service is waiting for a delayed restart).
   */
  pid_t pid;

  /**
   * Last time the config of this service was
   * modified (compared against st_mtime of the file).
   */
  time_t mtime;

  /**
   * Process exponential backoff time
   */
  struct GNUNET_TIME_Relative backoff;

  /**
   * Absolute time at which the process is scheduled to restart in case of death
   */
  struct GNUNET_TIME_Absolute restartAt;

  /**
   * Reference counter (counts how many times we've been
   * asked to start the service).  We only actually stop
   * it once rc hits zero.
   */
  unsigned int rc;

};
125 * List of running services.
127 static struct ServiceList *running;
132 static const struct GNUNET_CONFIGURATION_Handle *cfg;
137 static struct GNUNET_SCHEDULER_Handle *sched;
140 * Command to prepend to each actual command.
142 static char *prefix_command;
145 * ID of task called whenever we get a SIGCHILD.
147 static GNUNET_SCHEDULER_TaskIdentifier child_death_task;
150 * ID of task called whenever the timeout for restarting a child
153 static GNUNET_SCHEDULER_TaskIdentifier child_restart_task;
156 * Context for our SIGCHILD handler.
158 static struct GNUNET_SIGNAL_Context *shc_chld;
161 * Pipe used to communicate shutdown via signal.
163 static struct GNUNET_DISK_PipeHandle *sigpipe;
166 * Reading end of the signal pipe.
168 static const struct GNUNET_DISK_FileHandle *pr;
171 * Are we in shutdown mode?
173 static int in_shutdown;
177 * Handle to our server instance. Our server is a bit special in that
178 * its service is not immediately stopped once we get a shutdown
179 * request (since we need to continue service until all of our child
180 * processes are dead). This handle is used to shut down the server
181 * (and thus trigger process termination) once all child processes are
182 * also dead. A special option in the ARM configuration modifies the
183 * behaviour of the service implementation to not do the shutdown
186 static struct GNUNET_SERVER_Handle *server;
190 * If the configuration file changes, restart tasks that depended on that
193 * @param cls closure, NULL if we need to self-restart
197 config_change_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
199 struct ServiceList *pos;
205 /* FIXME: this test for config change is a bit too coarse grained */
206 if ( (0 == STAT (pos->config, &sbuf)) &&
207 (pos->mtime < sbuf.st_mtime) &&
210 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
211 _("Restarting service `%s' due to configuration file change.\n"));
212 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
213 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
215 pos->backoff = GNUNET_TIME_UNIT_MILLISECONDS;
224 * Transmit a status result message.
226 * @param cls pointer to "unit16_t*" with message type
227 * @param size number of bytes available in buf
228 * @param buf where to copy the message, NULL on error
229 * @return number of bytes copied to buf
232 write_result (void *cls, size_t size, void *buf)
235 struct GNUNET_MessageHeader *msg;
239 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
240 _("Could not send status result to client\n"));
241 return 0; /* error, not much we can do */
244 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
245 "Sending status response %u to client\n",
246 (unsigned int) *res);
248 GNUNET_assert (size >= sizeof (struct GNUNET_MessageHeader));
250 msg->size = htons (sizeof (struct GNUNET_MessageHeader));
251 msg->type = htons (*res);
253 return sizeof (struct GNUNET_MessageHeader);
259 * Signal our client that we will start or stop the
262 * @param client who is being signalled
263 * @param name name of the service
264 * @param result message type to send
265 * @return NULL if it was not found
268 signal_result (struct GNUNET_SERVER_Client *client,
269 const char *name, uint16_t result)
275 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
277 ("Not sending status result to client: no client known\n"));
281 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
282 "Telling client that service `%s' is now %s\n",
284 result == GNUNET_MESSAGE_TYPE_ARM_IS_DOWN ? "down" : "up");
286 res = GNUNET_malloc (sizeof (uint16_t));
288 GNUNET_SERVER_notify_transmit_ready (client,
289 sizeof (struct GNUNET_MessageHeader),
290 GNUNET_TIME_UNIT_FOREVER_REL,
296 * Find the process with the given service
297 * name in the given list, remove it and return it.
299 * @param name which service entry to look up
300 * @return NULL if it was not found
302 static struct ServiceList *
303 find_name (const char *name)
305 struct ServiceList *pos;
306 struct ServiceList *prev;
312 if (0 == strcmp (pos->name, name))
317 prev->next = pos->next;
329 * Free an entry in the service list.
331 * @param pos entry to free
334 free_entry (struct ServiceList *pos)
336 GNUNET_free_non_null (pos->config);
337 GNUNET_free_non_null (pos->binary);
338 GNUNET_free (pos->name);
344 * Actually start the process for the given service.
346 * @param sl identifies service to start
349 start_process (struct ServiceList *sl)
354 unsigned int argv_size;
357 const char *firstarg;
362 GNUNET_CONFIGURATION_get_value_string (cfg,
363 sl->name, "PREFIX", &loprefix))
364 loprefix = GNUNET_strdup (prefix_command);
366 GNUNET_CONFIGURATION_get_value_string (cfg,
367 sl->name, "OPTIONS", &options))
368 options = GNUNET_strdup ("");
369 use_debug = GNUNET_CONFIGURATION_get_value_yesno (cfg, sl->name, "DEBUG");
371 GNUNET_log (GNUNET_ERROR_TYPE_INFO, _("Starting service `%s'\n"), sl->name);
373 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
374 "Starting service `%s' using binary `%s' and configuration `%s'\n",
375 sl->name, sl->binary, sl->config);
381 while ('\0' != *lopos)
388 while ('\0' != *optpos)
395 argv = GNUNET_malloc (argv_size * sizeof (char *));
399 while ('\0' != *lopos)
401 while (*lopos == ' ')
407 argv[argv_size++] = lopos;
408 while (('\0' != *lopos) && (' ' != *lopos))
416 firstarg = sl->binary;
417 argv[argv_size++] = sl->binary;
418 argv[argv_size++] = "-c";
419 argv[argv_size++] = sl->config;
420 if (GNUNET_YES == use_debug)
422 argv[argv_size++] = "-L";
423 argv[argv_size++] = "DEBUG";
426 while ('\0' != *optpos)
428 while (*optpos == ' ')
432 argv[argv_size++] = optpos;
433 while (('\0' != *optpos) && (' ' != *optpos))
440 argv[argv_size] = NULL;
441 sl->pid = GNUNET_OS_start_process_v (firstarg, argv);
442 /* FIXME: should check sl->pid */
444 GNUNET_free (loprefix);
445 GNUNET_free (options);
450 * Start the specified service.
452 * @param client who is asking for this
453 * @param servicename name of the service to start
456 start_service (struct GNUNET_SERVER_Client *client, const char *servicename)
458 struct ServiceList *sl;
463 if (GNUNET_YES == in_shutdown)
465 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
466 _("ARM is shutting down, service `%s' not started.\n"),
468 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
471 sl = find_name (servicename);
474 /* already running, just increment RC */
475 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
476 _("Service `%s' already running.\n"), servicename);
480 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
484 GNUNET_CONFIGURATION_get_value_string (cfg,
485 servicename, "BINARY", &binary))
487 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
488 _("Binary implementing service `%s' not known!\n"),
490 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
494 GNUNET_CONFIGURATION_get_value_filename (cfg,
498 (0 != STAT (config, &sbuf)))
500 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
501 _("Configuration file `%s' for service `%s' not known!\n"),
502 config, servicename);
503 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
504 GNUNET_free (binary);
505 GNUNET_free_non_null (config);
508 sl = GNUNET_malloc (sizeof (struct ServiceList));
509 sl->name = GNUNET_strdup (servicename);
514 sl->mtime = sbuf.st_mtime;
515 sl->backoff = GNUNET_TIME_UNIT_MILLISECONDS;
516 sl->restartAt = GNUNET_TIME_UNIT_FOREVER_ABS;
521 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
526 * Stop the specified service.
528 * @param client who is asking for this
529 * @param servicename name of the service to stop
532 stop_service (struct GNUNET_SERVER_Client *client, const char *servicename)
534 struct ServiceList *pos;
536 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
537 _("Preparing to stop `%s'\n"), servicename);
538 pos = find_name (servicename);
541 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UNKNOWN);
542 GNUNET_SERVER_receive_done (client, GNUNET_OK);
547 /* RC>1, just decrement RC */
552 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
553 "Service `%s' still used by %u clients, will keep it running!\n",
554 servicename, pos->rc);
556 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
557 GNUNET_SERVER_receive_done (client, GNUNET_OK);
561 pos->rc--; /* decrement RC to zero */
562 if (pos->killing_client != NULL)
564 /* killing already in progress */
566 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
567 "Service `%s' is already down\n", servicename);
569 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
570 GNUNET_SERVER_receive_done (client, GNUNET_OK);
576 if (GNUNET_YES == in_shutdown)
579 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
580 "Termination request already sent to `%s' (since ARM is in shutdown).\n",
583 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
584 GNUNET_SERVER_receive_done (client, GNUNET_OK);
591 /* process is in delayed restart, simply remove it! */
593 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
594 GNUNET_SERVER_receive_done (client, GNUNET_OK);
598 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
599 "Sending kill signal to service `%s', waiting for process to die.\n",
602 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
603 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
606 pos->killing_client = client;
607 GNUNET_SERVER_client_keep (client);
612 * Handle START-message.
614 * @param cls closure (always NULL)
615 * @param client identification of the client
616 * @param message the actual message
617 * @return GNUNET_OK to keep the connection open,
618 * GNUNET_SYSERR to close it (signal serious error)
621 handle_start (void *cls,
622 struct GNUNET_SERVER_Client *client,
623 const struct GNUNET_MessageHeader *message)
625 const char *servicename;
628 size = ntohs (message->size);
629 size -= sizeof (struct GNUNET_MessageHeader);
630 servicename = (const char *) &message[1];
631 if ((size == 0) || (servicename[size - 1] != '\0'))
634 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
637 start_service (client, servicename);
638 GNUNET_SERVER_receive_done (client, GNUNET_OK);
643 * Handle STOP-message.
645 * @param cls closure (always NULL)
646 * @param client identification of the client
647 * @param message the actual message
648 * @return GNUNET_OK to keep the connection open,
649 * GNUNET_SYSERR to close it (signal serious error)
652 handle_stop (void *cls,
653 struct GNUNET_SERVER_Client *client,
654 const struct GNUNET_MessageHeader *message)
656 const char *servicename;
659 size = ntohs (message->size);
660 size -= sizeof (struct GNUNET_MessageHeader);
661 servicename = (const char *) &message[1];
662 if ((size == 0) || (servicename[size - 1] != '\0'))
665 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
668 stop_service (client, servicename);
673 * Remove all entries for tasks that are not running
674 * (pid = 0) from the running list (they will no longer
675 * be restarted since we are shutting down).
680 struct ServiceList *pos;
681 struct ServiceList *next;
682 struct ServiceList *prev;
/**
 * We are done with everything.  Stop remaining
 * tasks, signal handler and the server.
 *
 * Each handle is cleared right after it has been released.
 */
static void
do_shutdown ()
{
  GNUNET_SERVER_destroy (server);
  server = NULL;
  GNUNET_SIGNAL_handler_uninstall (shc_chld);
  shc_chld = NULL;
  /* stop waiting for SIGCHLD notifications on the pipe */
  GNUNET_SCHEDULER_cancel (sched, child_death_task);
  child_death_task = GNUNET_SCHEDULER_NO_TASK;
}
721 * Task run for shutdown.
723 * @param cls closure, NULL if we need to self-restart
727 shutdown_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
729 struct ServiceList *pos;
732 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG, _("Stopping all services\n"));
734 in_shutdown = GNUNET_YES;
741 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
742 "Sending SIGTERM to `%s'\n", pos->name);
744 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
745 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
755 * Task run whenever it is time to restart a child that died.
757 * @param cls closure, always NULL
761 delayed_restart_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
763 struct ServiceList *pos;
764 struct GNUNET_TIME_Relative lowestRestartDelay;
766 child_restart_task = GNUNET_SCHEDULER_NO_TASK;
767 if (0 != (tc->reason & GNUNET_SCHEDULER_REASON_SHUTDOWN))
774 lowestRestartDelay = GNUNET_TIME_UNIT_FOREVER_REL;
776 /* check for services that need to be restarted due to
777 configuration changes or because the last restart failed */
781 if ( (pos->pid == 0) &&
782 (GNUNET_YES != in_shutdown) )
784 if (GNUNET_TIME_absolute_get_remaining (pos->restartAt).value == 0)
786 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
787 _("Restarting service `%s'.\n"), pos->name);
793 = GNUNET_TIME_relative_min (lowestRestartDelay,
794 GNUNET_TIME_absolute_get_remaining
800 if (lowestRestartDelay.value != GNUNET_TIME_UNIT_FOREVER_REL.value)
803 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
804 "Will restart process in %llums\n",
805 (unsigned long long) lowestRestartDelay.value);
808 = GNUNET_SCHEDULER_add_delayed (sched,
810 &delayed_restart_task,
/**
 * Task triggered whenever we receive a SIGCHLD (child
 * process died).
 *
 * Checks every service that has a process, answers clients that were
 * waiting for a service to stop, and schedules restarts for services
 * that died without having been asked to.
 *
 * @param cls closure, NULL if we need to self-restart
 * @param tc context
 */
static void
maint_child_death (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
{
  struct ServiceList *pos;
  struct ServiceList *prev;
  struct ServiceList *next;
  const char *statstr;
  int statcode;
  int ret;
  char c;
  enum GNUNET_OS_ProcessStatusType statusType;
  unsigned long statusCode;

  child_death_task = GNUNET_SCHEDULER_NO_TASK;
  if (0 == (tc->reason & GNUNET_SCHEDULER_REASON_READ_READY))
    {
      /* not woken up by the pipe, just keep waiting */
      child_death_task =
        GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
                                        &maint_child_death, NULL);
      return;
    }
  /* consume the signal */
  GNUNET_break (0 < GNUNET_DISK_file_read (pr, &c, sizeof (c)));

  /* check for services that died (WAITPID) */
  prev = NULL;
  next = running;
  while (NULL != (pos = next))
    {
      next = pos->next;
      if (pos->pid == 0)
        {
          /* no process for this entry, nothing to check */
          prev = pos;
          continue;
        }
      if ((GNUNET_SYSERR == (ret = GNUNET_OS_process_status (pos->pid,
                                                             &statusType,
                                                             &statusCode))) ||
          ( (ret == GNUNET_NO) ||
            (statusType == GNUNET_OS_PROCESS_STOPPED) ||
            (statusType == GNUNET_OS_PROCESS_RUNNING)) )
        {
          /* status unavailable or process not terminated */
          prev = pos;
          continue;
        }

      if (statusType == GNUNET_OS_PROCESS_EXITED)
        {
          statstr = _( /* process termination method */ "exit");
          statcode = statusCode;
        }
      else if (statusType == GNUNET_OS_PROCESS_SIGNALED)
        {
          statstr = _( /* process termination method */ "signal");
          statcode = statusCode;
        }
      else
        {
          statstr = _( /* process termination method */ "unknown");
          statcode = 0;
        }
      pos->pid = 0;
      if (NULL != pos->killing_client)
        {
          /* a client asked for this service to stop: unlink the
             entry, answer the client and forget about the service */
          if (prev == NULL)
            running = next;
          else
            prev->next = next;
          GNUNET_log (GNUNET_ERROR_TYPE_INFO,
                      _("Service `%s' stopped\n"),
                      pos->name);
          signal_result (pos->killing_client,
                         pos->name, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
          GNUNET_SERVER_receive_done (pos->killing_client, GNUNET_OK);
          GNUNET_SERVER_client_drop (pos->killing_client);
          free_entry (pos);
          continue;
        }
      if (GNUNET_YES != in_shutdown)
        {
          GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
                      _
                      ("Service `%s' terminated with status %s/%d, will try to restart it!\n"),
                      pos->name, statstr, statcode);
          /* schedule restart */
          pos->restartAt
            = GNUNET_TIME_relative_to_absolute (pos->backoff);
          /* double the backoff until the threshold is reached */
          if (pos->backoff.value < EXPONENTIAL_BACKOFF_THRESHOLD)
            pos->backoff
              = GNUNET_TIME_relative_multiply (pos->backoff, 2);
          if (GNUNET_SCHEDULER_NO_TASK != child_restart_task)
            GNUNET_SCHEDULER_cancel (sched, child_restart_task);
          child_restart_task
            = GNUNET_SCHEDULER_add_with_priority (sched,
                                                  GNUNET_SCHEDULER_PRIORITY_IDLE,
                                                  &delayed_restart_task,
                                                  NULL);
        }
#if DEBUG_ARM
      else
        GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
                    "Service `%s' terminated with status %s/%d\n",
                    pos->name, statstr, statcode);
#endif
      prev = pos;
    }
  if (in_shutdown)
    clean_up_running ();
  if ( (running == NULL) &&
       (in_shutdown) )
    {
      /* last child is gone, now the server can go away as well */
      GNUNET_SERVER_destroy (server);
      GNUNET_SIGNAL_handler_uninstall (shc_chld);
      shc_chld = NULL;
    }
  else
    {
      /* wait for the next SIGCHLD notification */
      child_death_task =
        GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
                                        &maint_child_death, NULL);
    }
}
/**
 * List of handlers for the messages understood by this service.
 * Both entries use 0 in the last field; the handlers themselves
 * validate the length of the payload.
 */
static struct GNUNET_SERVER_MessageHandler handlers[] = {
  {&handle_start, NULL, GNUNET_MESSAGE_TYPE_ARM_START, 0},
  {&handle_stop, NULL, GNUNET_MESSAGE_TYPE_ARM_STOP, 0},
  {NULL, NULL, 0, 0}
};
957 * Signal handler called for SIGCHLD. Triggers the
958 * respective handler by writing to the trigger pipe.
961 sighandler_child_death ()
966 GNUNET_DISK_file_write (GNUNET_DISK_pipe_handle
967 (sigpipe, GNUNET_DISK_PIPE_END_WRITE), &c,
973 * Process arm requests.
976 * @param s scheduler to use
977 * @param serv the initialized server
978 * @param c configuration to use
982 struct GNUNET_SCHEDULER_Handle *s,
983 struct GNUNET_SERVER_Handle *serv,
984 const struct GNUNET_CONFIGURATION_Handle *c)
986 char *defaultservices;
992 GNUNET_assert (serv != NULL);
993 shc_chld = GNUNET_SIGNAL_handler_install (SIGCHLD, &sighandler_child_death);
994 GNUNET_assert (sigpipe == NULL);
995 sigpipe = GNUNET_DISK_pipe (GNUNET_NO);
996 GNUNET_assert (sigpipe != NULL);
997 pr = GNUNET_DISK_pipe_handle (sigpipe, GNUNET_DISK_PIPE_END_READ);
998 GNUNET_assert (pr != NULL);
999 GNUNET_SERVER_ignore_shutdown (serv, GNUNET_YES);
1000 GNUNET_SCHEDULER_add_delayed (sched,
1001 GNUNET_TIME_UNIT_FOREVER_REL,
1005 GNUNET_SCHEDULER_add_read_file (sched, GNUNET_TIME_UNIT_FOREVER_REL, pr,
1006 &maint_child_death, NULL);
1009 GNUNET_CONFIGURATION_get_value_string (cfg,
1013 prefix_command = GNUNET_strdup ("");
1014 /* start default services... */
1016 GNUNET_CONFIGURATION_get_value_string (cfg,
1022 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
1023 "Starting default services `%s'\n", defaultservices);
1025 pos = strtok (defaultservices, " ");
1028 start_service (NULL, pos);
1029 pos = strtok (NULL, " ");
1031 GNUNET_free (defaultservices);
1036 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
1037 "No default services configured.\n");
1041 /* process client requests */
1042 GNUNET_SERVER_add_handlers (server, handlers);
1044 /* manage services */
1045 GNUNET_SCHEDULER_add_with_priority (sched,
1046 GNUNET_SCHEDULER_PRIORITY_IDLE,
1047 &config_change_task, NULL);
1052 * The main function for the arm service.
1054 * @param argc number of arguments from the command line
1055 * @param argv command line arguments
1056 * @return 0 ok, 1 on error
1059 main (int argc, char *const *argv)
1061 return (GNUNET_OK ==
1062 GNUNET_SERVICE_run (argc,
1063 argv, "arm", GNUNET_YES, &run, NULL)) ? 0 : 1;
1066 /* end of gnunet-service-arm.c */