From 188759bbee057aa94db2bbb7cf7f5855f3b9ab53 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Fri, 1 Mar 2019 21:06:23 -0500 Subject: [PATCH] overhaul shared library ctor execution for dependency order, concurrency previously, shared library constructors at program start and dlopen time were executed in reverse load order. some libraries, however, rely on a depth-first dependency order, which most other dynamic linker implementations provide. this is a much more reasonable, less arbitrary order, and it turns out to have much better properties with regard to how slow-running ctors affect multi-threaded programs, and how recursive dlopen behaves. this commit builds on previous work tracking direct dependencies of each dso (commit 403555690775f7c8806372644f543518e6664e3b), and performs a topological sort on the dependency graph at load time while the main ldso lock is held and before success is committed, producing a queue of constructors needed by the newly-loaded dso (or main application). in the case of circular dependencies, the dependency chain is simply broken at points where it becomes circular. when the ctor queue is run, the init_fini_lock is held only for iteration purposes; it's released during execution of each ctor, so that arbitrarily-long-running application code no longer runs with a lock held in the caller. this prevents a dlopen with slow ctors in one thread from arbitrarily delaying other threads that call dlopen. fully-independent ctors can run concurrently; when multiple threads call dlopen with a shared dependency, one will end up executing the ctor while the other waits on a condvar for it to finish. another corner case improved by these changes is recursive dlopen (call from a ctor). previously, recursive calls to dlopen could cause a ctor for a library to be executed before the ctor for its dependency, even when there was no relation between the calling library and the library it was loading, simply due to the naive reverse-load-order traversal. now, we can guarantee that recursive dlopen in non-circular-dependency usage preserves the desired ctor execution order properties, and that even in circular usage, at worst the libraries whose ctors call dlopen will fail to have completed construction when ctors that depend on them run. init_fini_lock is changed to a normal, non-recursive mutex, since it is no longer held while calling back into application code. --- ldso/dynlink.c | 118 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 17 deletions(-) diff --git a/ldso/dynlink.c b/ldso/dynlink.c index 2617d553..777be489 100644 --- a/ldso/dynlink.c +++ b/ldso/dynlink.c @@ -76,6 +76,8 @@ struct dso { char runtime_loaded; struct dso **deps, *needed_by; size_t ndeps_direct; + size_t next_dep; + int ctor_visitor; char *rpath_orig, *rpath; struct tls_module tls; size_t tls_id; @@ -128,7 +130,9 @@ static struct debug debug; static struct tls_module *tls_tail; static size_t tls_cnt, tls_offset, tls_align = MIN_TLS_ALIGN; static size_t static_tls_cnt; -static pthread_mutex_t init_fini_lock = { ._m_type = PTHREAD_MUTEX_RECURSIVE }; +static pthread_mutex_t init_fini_lock; +static pthread_cond_t ctor_cond; +static struct dso **main_ctor_queue; static struct fdpic_loadmap *app_loadmap; static struct fdpic_dummy_loadmap app_dummy_loadmap; @@ -1361,22 +1365,87 @@ void __libc_exit_fini() } } -static void do_init_fini(struct dso *p) +static struct dso **queue_ctors(struct dso *dso) { - size_t dyn[DYN_CNT]; - int need_locking = libc.threads_minus_1; - /* Allow recursive calls that arise when a library calls - * dlopen from one of its constructors, but block any - * other threads until all ctors have finished. */ - if (need_locking) pthread_mutex_lock(&init_fini_lock); - for (; p; p=p->prev) { + size_t cnt, qpos, spos, i; + struct dso *p, **queue, **stack; + + if (ldd_mode) return 0; + + /* Bound on queue size is the total number of indirect deps. + * If a bfs deps list was built, we can use it. Otherwise, + * bound by the total number of DSOs, which is always safe and + * is reasonable we use it (for main app at startup). */ + if (dso->bfs_built) { + for (cnt=0; dso->deps[cnt]; cnt++) + dso->deps[cnt]->mark = 0; + cnt++; /* self, not included in deps */ + } else { + for (cnt=0, p=head; p; cnt++, p=p->next) + p->mark = 0; + } + cnt++; /* termination slot */ + stack = queue = calloc(cnt, sizeof *queue); + + if (!queue) { + error("Error allocating constructor queue: %m\n"); + if (runtime) longjmp(*rtld_fail, 1); + return 0; + } + + /* Opposite ends of the allocated buffer serve as an output queue + * and a working stack. Setup initial stack with just the argument + * dso and initial queue empty... */ + qpos = 0; + spos = cnt; + stack[--spos] = dso; + dso->next_dep = 0; + dso->mark = 1; + + /* Then perform pseudo-DFS sort, but ignoring circular deps. */ + while (sposnext_dep < p->ndeps_direct) { + if (p->deps[p->next_dep]->mark) { + p->next_dep++; + } else { + stack[--spos] = p; + p = p->deps[p->next_dep]; + p->next_dep = 0; + p->mark = 1; + } + } + queue[qpos++] = p; + } + queue[qpos] = 0; + for (i=0; imark = 0; + + return queue; +} + +static void do_init_fini(struct dso **queue) +{ + struct dso *p; + size_t dyn[DYN_CNT], i; + int self = __pthread_self()->tid; + + pthread_mutex_lock(&init_fini_lock); + for (i=0; (p=queue[i]); i++) { + while (p->ctor_visitor && p->ctor_visitor!=self) + pthread_cond_wait(&ctor_cond, &init_fini_lock); + if (p->ctor_visitor || p->constructed) + continue; if (p->constructed) continue; - p->constructed = 1; + p->ctor_visitor = self; + decode_vec(p->dynv, dyn, DYN_CNT); if (dyn[0] & ((1<fini_next = fini_head; fini_head = p; } + + pthread_mutex_unlock(&init_fini_lock); + #ifndef NO_LEGACY_INITFINI if ((dyn[0] & (1<ctor_visitor = 0; + p->constructed = 1; + pthread_cond_broadcast(&ctor_cond); } - if (need_locking) pthread_mutex_unlock(&init_fini_lock); + pthread_mutex_unlock(&init_fini_lock); } void __libc_start_init(void) { - do_init_fini(tail); + do_init_fini(main_ctor_queue); + /* This is valid because the queue was allocated after redoing + * relocations with any interposed malloc having taken effect. */ + free(main_ctor_queue); + main_ctor_queue = 0; } static void dl_debug_state(void) @@ -1793,6 +1867,8 @@ _Noreturn void __dls3(size_t *sp) } static_tls_cnt = tls_cnt; + main_ctor_queue = queue_ctors(&app); + if (ldso_fail) _exit(127); if (ldd_mode) _exit(0); @@ -1852,6 +1928,7 @@ void *dlopen(const char *file, int mode) size_t i; int cs; jmp_buf jb; + struct dso **volatile ctor_queue = 0; if (!file) return head; @@ -1886,6 +1963,7 @@ void *dlopen(const char *file, int mode) free(p->deps); unmap_library(p); free(p); + free(ctor_queue); } if (!orig_tls_tail) libc.tls_head = 0; tls_tail = orig_tls_tail; @@ -1911,6 +1989,9 @@ void *dlopen(const char *file, int mode) /* First load handling */ load_deps(p); extend_bfs_deps(p); + pthread_mutex_lock(&init_fini_lock); + if (!p->constructed) ctor_queue = queue_ctors(p); + pthread_mutex_unlock(&init_fini_lock); if (!p->relocated && (mode & RTLD_LAZY)) { prepare_lazy(p); for (i=0; p->deps[i]; i++) @@ -1948,7 +2029,10 @@ end: __release_ptc(); if (p) gencnt++; pthread_rwlock_unlock(&lock); - if (p) do_init_fini(orig_tail); + if (ctor_queue) { + do_init_fini(ctor_queue); + free(ctor_queue); + } pthread_setcancelstate(cs, 0); return p; } -- 2.25.1