2 This file is part of GNUnet.
3 (C) 2009 Christian Grothoff (and other contributing authors)
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
22 * @file arm/gnunet-service-arm.c
23 * @brief the automated restart manager service
24 * @author Christian Grothoff
27 * - multiple start-stop requests with RC>1 can result
28 * in UP/DOWN signals based on "pending" that are inaccurate...
29 * => have list of clients waiting for a resolution instead of
30 * giving instant (but incorrect) replies
31 * - code could go into restart-loop for a service
32 * if service crashes instantly -- need exponential back-off
33 * - need to test auto-restart code on configuration changes;
34 * - should refine restart code to check if *relevant* parts of the
35 * configuration were changed (anything in the section for the service)
36 * - should have a way to specify dependencies between services and
37 * manage restarts of groups of services
40 #include "gnunet_client_lib.h"
41 #include "gnunet_getopt_lib.h"
42 #include "gnunet_os_lib.h"
43 #include "gnunet_protocols.h"
44 #include "gnunet_service_lib.h"
49 * Run normal maintenance every 2s.
51 #define MAINT_FREQUENCY GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_SECONDS, 2)
54 * Run fast maintenance after 100ms. This is used for an extra-job
55 * that is run to check for a process that we just killed.
57 #define MAINT_FAST_FREQUENCY GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_MILLISECONDS, 100)
60 * How long do we wait until we decide that a service
63 #define CHECK_TIMEOUT GNUNET_TIME_UNIT_MINUTES
66 * List of our services.
71 * Function to call if waitpid informs us that
75 * @param pos entry in the service list of the process that died
77 typedef void (*CleanCallback) (void *cls, struct ServiceList * pos);
80 * List of our services.
85 * This is a linked list.
87 struct ServiceList *next;
90 * Name of the service.
95 * Name of the binary used.
100 * Name of the configuration file used.
105 * Function to call upon kill completion (waitpid), NULL
106 * if we should simply restart the process.
108 CleanCallback kill_continuation;
111 * Closure for kill_continuation.
113 void *kill_continuation_cls;
116 * Process ID of the child.
121 * Last time the config of this service was
127 * Reference counter (counts how many times we've been
128 * asked to start the service). We only actually stop
129 * it once rc hits zero.
136 * List of running services.
138 static struct ServiceList *running;
143 static const struct GNUNET_CONFIGURATION_Handle *cfg;
148 static struct GNUNET_SCHEDULER_Handle *sched;
151 * Command to prepend to each actual command.
153 static char *prefix_command;
156 * Are we in shutdown mode?
158 static int in_shutdown;
161 * Handle to our server instance. Our server is a bit special in that
162 * its service is not immediately stopped once we get a shutdown
163 * request (since we need to continue service until all of our child
164 * processes are dead). This handle is used to shut down the server
165 * (and thus trigger process termination) once all child processes are
166 * also dead. A special option in the ARM configuration modifies the
167 * behaviour of the service implementation to not do the shutdown
170 static struct GNUNET_SERVER_Handle *server;
173 * Background task doing maintenance.
175 * @param cls closure, NULL if we need to self-restart
179 maint (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc);
183 * Transmit a status result message.
185 * @param cls pointer to "uint16_t*" with message type
186 * @param size number of bytes available in buf
187 * @param buf where to copy the message, NULL on error
188 * @return number of bytes copied to buf
191 write_result (void *cls, size_t size, void *buf)
194 struct GNUNET_MessageHeader *msg;
198 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
199 _("Could not send status result to client\n"));
200 return 0; /* error, not much we can do */
202 GNUNET_assert (size >= sizeof (struct GNUNET_MessageHeader));
204 msg->size = htons (sizeof (struct GNUNET_MessageHeader));
205 msg->type = htons (*res);
207 return sizeof (struct GNUNET_MessageHeader);
213 * Signal our client that we will start or stop the
216 * @param client who is being signalled
217 * @param name name of the service
218 * @param result message type to send
219 * @return NULL if it was not found
222 signal_result (struct GNUNET_SERVER_Client *client,
223 const char *name, uint16_t result)
229 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
230 _("Not sending status result to client: no client known\n"));
234 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
235 "Telling client that service `%s' is now %s\n",
237 result == GNUNET_MESSAGE_TYPE_ARM_IS_DOWN ? "down" : "up");
239 res = GNUNET_malloc (sizeof (uint16_t));
241 GNUNET_SERVER_notify_transmit_ready (client,
242 sizeof (struct GNUNET_MessageHeader),
243 GNUNET_TIME_UNIT_FOREVER_REL,
249 * Find the process with the given service
250 * name in the given list, remove it and return it.
252 * @param name which service entry to look up
253 * @return NULL if it was not found
255 static struct ServiceList *
256 find_name (const char *name)
258 struct ServiceList *pos;
259 struct ServiceList *prev;
265 if (0 == strcmp (pos->name, name))
270 prev->next = pos->next;
282 * Free an entry in the service list.
284 * @param pos entry to free
287 free_entry (struct ServiceList *pos)
289 GNUNET_free_non_null (pos->config);
290 GNUNET_free_non_null (pos->binary);
291 GNUNET_free (pos->name);
297 * Actually start the process for the given service.
299 * @param sl identifies service to start
302 start_process (struct ServiceList *sl)
307 unsigned int argv_size;
310 const char *firstarg;
315 GNUNET_CONFIGURATION_get_value_string (cfg,
316 sl->name, "PREFIX", &loprefix))
317 loprefix = GNUNET_strdup (prefix_command);
319 GNUNET_CONFIGURATION_get_value_string (cfg,
320 sl->name, "OPTIONS", &options))
321 options = GNUNET_strdup ("");
322 use_debug = GNUNET_CONFIGURATION_get_value_yesno (cfg, sl->name, "DEBUG");
324 GNUNET_log (GNUNET_ERROR_TYPE_INFO, _("Starting service `%s'\n"), sl->name);
326 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
327 "Starting service `%s' using binary `%s' and configuration `%s'\n",
328 sl->name, sl->binary, sl->config);
334 while ('\0' != *lopos)
341 while ('\0' != *optpos)
348 argv = GNUNET_malloc (argv_size * sizeof (char *));
352 while ('\0' != *lopos)
354 while (*lopos == ' ')
360 argv[argv_size++] = lopos;
361 while (('\0' != *lopos) && (' ' != *lopos))
369 firstarg = sl->binary;
370 argv[argv_size++] = sl->binary;
371 argv[argv_size++] = "-c";
372 argv[argv_size++] = sl->config;
373 if (GNUNET_YES == use_debug)
375 argv[argv_size++] = "-L";
376 argv[argv_size++] = "DEBUG";
379 while ('\0' != *optpos)
381 while (*optpos == ' ')
385 argv[argv_size++] = optpos;
386 while (('\0' != *optpos) && (' ' != *optpos))
393 argv[argv_size++] = NULL;
394 sl->pid = GNUNET_OS_start_process_v (firstarg, argv);
396 GNUNET_free (loprefix);
397 GNUNET_free (options);
402 * Start the specified service.
404 * @param client who is asking for this
405 * @param servicename name of the service to start
408 start_service (struct GNUNET_SERVER_Client *client, const char *servicename)
410 struct ServiceList *sl;
415 if (GNUNET_YES == in_shutdown)
417 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
418 _("ARM is shutting down, service `%s' not started.\n"), servicename);
419 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
422 sl = find_name (servicename);
425 /* already running, just increment RC */
426 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
427 _("Service `%s' already running.\n"), servicename);
431 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
435 GNUNET_CONFIGURATION_get_value_string (cfg,
436 servicename, "BINARY", &binary))
438 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
439 _("Binary implementing service `%s' not known!\n"),
441 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
445 GNUNET_CONFIGURATION_get_value_filename (cfg,
449 (0 != STAT (config, &sbuf)))
451 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
452 _("Configuration file `%s' for service `%s' not known!\n"),
453 config, servicename);
454 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
455 GNUNET_free (binary);
456 GNUNET_free_non_null (config);
459 sl = GNUNET_malloc (sizeof (struct ServiceList));
460 sl->name = GNUNET_strdup (servicename);
465 sl->mtime = sbuf.st_mtime;
469 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
474 * Free the given entry in the service list and signal
475 * the given client that the service is now down.
477 * @param cls pointer to the client ("struct GNUNET_SERVER_Client*")
478 * @param pos entry for the service
481 free_and_signal (void *cls, struct ServiceList *pos)
483 struct GNUNET_SERVER_Client *client = cls;
484 /* find_name will remove "pos" from the list! */
485 GNUNET_log (GNUNET_ERROR_TYPE_INFO, _("Service `%s' stopped\n"), pos->name);
486 signal_result (client, pos->name, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
487 GNUNET_SERVER_receive_done (client, GNUNET_OK);
488 GNUNET_SERVER_client_drop (client);
494 * Stop the specified service.
496 * @param client who is asking for this
497 * @param servicename name of the service to stop
500 stop_service (struct GNUNET_SERVER_Client *client,
501 const char *servicename)
503 struct ServiceList *pos;
505 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
506 _("Preparing to stop `%s'\n"), servicename);
507 pos = find_name (servicename);
510 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UNKNOWN);
511 GNUNET_SERVER_receive_done (client, GNUNET_OK);
516 /* RC>1, just decrement RC */
521 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
522 "Service `%s' still used by %u clients, will keep it running!\n",
526 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_UP);
527 GNUNET_SERVER_receive_done (client, GNUNET_OK);
531 pos->rc--; /* decrement RC to zero */
532 if (pos->kill_continuation != NULL)
534 /* killing already in progress */
536 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
537 "Service `%s' is already down\n", servicename);
539 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
540 GNUNET_SERVER_receive_done (client, GNUNET_OK);
544 if (GNUNET_YES == in_shutdown)
547 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
548 "Termination request already sent to `%s' (since ARM is in shutdown).\n",
551 signal_result (client, servicename, GNUNET_MESSAGE_TYPE_ARM_IS_DOWN);
552 GNUNET_SERVER_receive_done (client, GNUNET_OK);
558 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
559 "Sending kill signal to service `%s', waiting for process to die.\n",
562 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
563 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
566 pos->kill_continuation = &free_and_signal;
567 pos->kill_continuation_cls = client;
568 GNUNET_SERVER_client_keep (client);
569 GNUNET_SCHEDULER_add_delayed (sched,
570 MAINT_FAST_FREQUENCY, &maint, "non-null");
575 * Handle START-message.
577 * @param cls closure (always NULL)
578 * @param client identification of the client
579 * @param message the actual message
580 * @return GNUNET_OK to keep the connection open,
581 * GNUNET_SYSERR to close it (signal serious error)
584 handle_start (void *cls,
585 struct GNUNET_SERVER_Client *client,
586 const struct GNUNET_MessageHeader *message)
588 const char *servicename;
591 size = ntohs (message->size);
592 size -= sizeof (struct GNUNET_MessageHeader);
593 servicename = (const char *) &message[1];
594 if ((size == 0) || (servicename[size - 1] != '\0'))
597 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
600 start_service (client, servicename);
601 GNUNET_SERVER_receive_done (client, GNUNET_OK);
606 * Handle STOP-message.
608 * @param cls closure (always NULL)
609 * @param client identification of the client
610 * @param message the actual message
611 * @return GNUNET_OK to keep the connection open,
612 * GNUNET_SYSERR to close it (signal serious error)
615 handle_stop (void *cls,
616 struct GNUNET_SERVER_Client *client,
617 const struct GNUNET_MessageHeader *message)
619 const char *servicename;
622 size = ntohs (message->size);
623 size -= sizeof (struct GNUNET_MessageHeader);
624 servicename = (const char *) &message[1];
625 if ((size == 0) || (servicename[size - 1] != '\0'))
628 GNUNET_SERVER_receive_done (client, GNUNET_SYSERR);
631 stop_service (client, servicename);
636 * Background task doing maintenance.
638 * @param cls closure, NULL if we need to self-restart
642 maint (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
644 struct ServiceList *pos;
645 struct ServiceList *prev;
646 struct ServiceList *next;
652 if (0 != (tc->reason & GNUNET_SCHEDULER_REASON_SHUTDOWN))
654 GNUNET_log (GNUNET_ERROR_TYPE_INFO, _("Stopping all services\n"));
655 in_shutdown = GNUNET_YES;
662 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
663 "Sending SIGTERM to `%s'\n",
666 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
667 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
674 if ( (in_shutdown == GNUNET_YES) &&
678 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
679 "ARM service terminates.\n");
681 GNUNET_assert (server != NULL);
682 GNUNET_SERVER_destroy (server);
684 return; /* we are done! */
686 GNUNET_SCHEDULER_add_delayed (tc->sched,
687 (in_shutdown == GNUNET_YES)
688 ? MAINT_FAST_FREQUENCY
693 /* check for services that died (WAITPID) */
696 while (NULL != (pos = next))
698 enum GNUNET_OS_ProcessStatusType statusType;
699 unsigned long statusCode;
702 if ( (NULL != pos->kill_continuation) ||
703 ( (GNUNET_YES == in_shutdown) &&
710 if (NULL != pos->kill_continuation)
711 pos->kill_continuation (pos->kill_continuation_cls, pos);
716 if ( (GNUNET_SYSERR == (ret = GNUNET_OS_process_status(pos->pid,
719 ( (ret == GNUNET_NO) ||
720 (statusType == GNUNET_OS_PROCESS_STOPPED) ||
721 (statusType == GNUNET_OS_PROCESS_RUNNING) ) )
726 if (statusType == GNUNET_OS_PROCESS_EXITED)
728 statstr = _( /* process termination method */ "exit");
729 statcode = statusCode;
731 else if (statusType == GNUNET_OS_PROCESS_SIGNALED)
733 statstr = _( /* process termination method */ "signal");
734 statcode = statusCode;
738 statstr = _( /* process termination method */ "unknown");
741 if (GNUNET_YES != in_shutdown)
742 GNUNET_log (GNUNET_ERROR_TYPE_WARNING,
743 _("Service `%s' terminated with status %s/%d, will try to restart it!\n"),
744 pos->name, statstr, statcode);
747 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
748 "Service `%s' terminated with status %s/%d\n",
749 pos->name, statstr, statcode);
751 /* schedule restart */
756 /* check for services that need to be restarted due to
757 configuration changes or because the last restart failed */
/* Only signal a live child: with pid 0, kill() would target our own process group. */
761 if ((0 != pos->pid) && (0 == STAT (pos->config, &sbuf)) && (pos->mtime < sbuf.st_mtime))
763 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
764 _("Restarting service `%s' due to configuration file change.\n"), pos->name);
765 if (0 != PLIBC_KILL (pos->pid, SIGTERM))
766 GNUNET_log_strerror (GNUNET_ERROR_TYPE_WARNING, "kill");
768 if ( (pos->pid == 0) &&
769 (GNUNET_YES != in_shutdown) )
771 GNUNET_log (GNUNET_ERROR_TYPE_INFO,
772 _("Restarting service `%s'.\n"), pos->name);
773 /* FIXME: should have some exponentially
774 increasing timer to avoid tight restart loops */
783 * List of handlers for the messages understood by this service.
785 static struct GNUNET_SERVER_MessageHandler handlers[] = {
786 {&handle_start, NULL, GNUNET_MESSAGE_TYPE_ARM_START, 0},
787 {&handle_stop, NULL, GNUNET_MESSAGE_TYPE_ARM_STOP, 0},
793 * Process arm requests.
796 * @param s scheduler to use
797 * @param serv the initialized server
798 * @param c configuration to use
802 struct GNUNET_SCHEDULER_Handle *s,
803 struct GNUNET_SERVER_Handle *serv,
804 const struct GNUNET_CONFIGURATION_Handle *c)
806 char *defaultservices;
809 GNUNET_assert (serv != NULL);
810 GNUNET_SERVER_ignore_shutdown (serv, GNUNET_YES);
815 GNUNET_CONFIGURATION_get_value_string (cfg,
819 prefix_command = GNUNET_strdup ("");
820 /* start default services... */
822 GNUNET_CONFIGURATION_get_value_string (cfg,
828 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
829 "Starting default services `%s'\n", defaultservices);
831 pos = strtok (defaultservices, " ");
834 start_service (NULL, pos);
835 pos = strtok (NULL, " ");
837 GNUNET_free (defaultservices);
842 GNUNET_log (GNUNET_ERROR_TYPE_DEBUG,
843 "No default services configured.\n");
847 /* process client requests */
848 GNUNET_SERVER_add_handlers (server, handlers);
850 /* manage services */
851 GNUNET_SCHEDULER_add_with_priority (sched,
852 GNUNET_SCHEDULER_PRIORITY_IDLE,
858 * The main function for the arm service.
860 * @param argc number of arguments from the command line
861 * @param argv command line arguments
862 * @return 0 ok, 1 on error
865 main (int argc, char *const *argv)
868 GNUNET_SERVICE_run (argc,
871 &run, NULL)) ? 0 : 1;
874 /* end of gnunet-service-arm.c */