From ba18c1ecc6a18203ad8496791154af86f706f632 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Fri, 22 Feb 2019 02:56:10 -0500 Subject: [PATCH] add membarrier syscall wrapper, refactor dynamic tls install to use it the motivation for this change is twofold. first, it gets the fallback logic out of the dynamic linker, improving code readability and organization. second, it provides application code that wants to use the membarrier syscall, which depends on preregistration of intent before the process becomes multithreaded unless unbounded latency is acceptable, with a symbol that, when linked, ensures that this registration happens. --- include/sys/membarrier.h | 17 ++++++++ ldso/dynlink.c | 38 +++--------------- src/include/sys/membarrier.h | 9 +++++ src/internal/pthread_impl.h | 2 +- src/linux/membarrier.c | 76 ++++++++++++++++++++++++++++++++++++ src/thread/pthread_create.c | 4 +- 6 files changed, 111 insertions(+), 35 deletions(-) create mode 100644 include/sys/membarrier.h create mode 100644 src/include/sys/membarrier.h create mode 100644 src/linux/membarrier.c diff --git a/include/sys/membarrier.h b/include/sys/membarrier.h new file mode 100644 index 00000000..10cb3108 --- /dev/null +++ b/include/sys/membarrier.h @@ -0,0 +1,17 @@ +#ifndef _SYS_MEMBARRIER_H +#define _SYS_MEMBARRIER_H + +#define MEMBARRIER_CMD_QUERY 0 +#define MEMBARRIER_CMD_GLOBAL 1 +#define MEMBARRIER_CMD_GLOBAL_EXPEDITED 2 +#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED 4 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED 8 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED 16 +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE 32 +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE 64 + +#define MEMBARRIER_CMD_SHARED MEMBARRIER_CMD_GLOBAL + +int membarrier(int, int); + +#endif diff --git a/ldso/dynlink.c b/ldso/dynlink.c index c7d9dd39..025ed1b0 100644 --- a/ldso/dynlink.c +++ b/ldso/dynlink.c @@ -18,6 +18,7 @@ #include <ctype.h> #include <dlfcn.h> #include <semaphore.h> +#include <sys/membarrier.h> #include "pthread_impl.h" #include "libc.h" 
#include "dynlink.h" @@ -1351,18 +1352,6 @@ static void update_tls_size() tls_align); } -void __dl_prepare_for_threads(void) -{ - /* MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED */ - __syscall(SYS_membarrier, 1<<4, 0); -} - -static sem_t barrier_sem; -static void bcast_barrier(int s) -{ - sem_post(&barrier_sem); -} - static void install_new_tls(void) { sigset_t set; @@ -1397,26 +1386,11 @@ static void install_new_tls(void) } /* Broadcast barrier to ensure contents of new dtv is visible - * if the new dtv pointer is. Use SYS_membarrier if it works, - * otherwise emulate with a signal. */ - - /* MEMBARRIER_CMD_PRIVATE_EXPEDITED */ - if (__syscall(SYS_membarrier, 1<<3, 0)) { - sem_init(&barrier_sem, 0, 0); - struct sigaction sa = { - .sa_flags = SA_RESTART, - .sa_handler = bcast_barrier - }; - memset(&sa.sa_mask, -1, sizeof sa.sa_mask); - __libc_sigaction(SIGSYNCCALL, &sa, 0); - for (td=self->next; td!=self; td=td->next) - __syscall(SYS_tkill, td->tid, SIGSYNCCALL); - for (td=self->next; td!=self; td=td->next) - sem_wait(&barrier_sem); - sa.sa_handler = SIG_IGN; - __libc_sigaction(SIGSYNCCALL, &sa, 0); - sem_destroy(&barrier_sem); - } + * if the new dtv pointer is. The __membarrier function has a + * fallback emulation using signals for kernels that lack the + * feature at the syscall level. */ + + __membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0); /* Install new dtv for each thread. 
*/ for (j=0, td=self; !j || td!=self; j++, td=td->next) { diff --git a/src/include/sys/membarrier.h b/src/include/sys/membarrier.h new file mode 100644 index 00000000..3654491c --- /dev/null +++ b/src/include/sys/membarrier.h @@ -0,0 +1,9 @@ +#ifndef SYS_MEMBARRIER_H +#define SYS_MEMBARRIER_H + +#include "../../../include/sys/membarrier.h" +#include <features.h> + +hidden int __membarrier(int, int); + +#endif diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h index de089967..9b001421 100644 --- a/src/internal/pthread_impl.h +++ b/src/internal/pthread_impl.h @@ -130,7 +130,7 @@ hidden int __init_tp(void *); hidden void *__copy_tls(unsigned char *); hidden void __reset_tls(); -hidden void __dl_prepare_for_threads(void); +hidden void __membarrier_init(void); hidden void __dl_thread_cleanup(void); hidden void __testcancel(); hidden void __do_cleanup_push(struct __ptcb *); diff --git a/src/linux/membarrier.c b/src/linux/membarrier.c new file mode 100644 index 00000000..26d143e7 --- /dev/null +++ b/src/linux/membarrier.c @@ -0,0 +1,76 @@ +#include <sys/membarrier.h> +#include <semaphore.h> +#include <signal.h> +#include <string.h> +#include "pthread_impl.h" +#include "syscall.h" + +static void dummy_0(void) +{ +} + +static void dummy_1(pthread_t t) +{ +} + +weak_alias(dummy_0, __tl_lock); +weak_alias(dummy_0, __tl_unlock); +weak_alias(dummy_1, __tl_sync); + +static sem_t barrier_sem; + +static void bcast_barrier(int s) +{ + sem_post(&barrier_sem); +} + +int __membarrier(int cmd, int flags) +{ + int r = __syscall(SYS_membarrier, cmd, flags); + /* Emulate the private expedited command, which is needed by the + * dynamic linker for installation of dynamic TLS, for older + * kernels that lack the syscall. Unlike the syscall, this only + * synchronizes with threads of the process, not other processes + * sharing the VM, but such sharing is not a supported usage + * anyway. 
*/ + if (r && cmd == MEMBARRIER_CMD_PRIVATE_EXPEDITED && !flags) { + pthread_t self=__pthread_self(), td; + sigset_t set; + __block_app_sigs(&set); + __tl_lock(); + sem_init(&barrier_sem, 0, 0); + struct sigaction sa = { + .sa_flags = SA_RESTART, + .sa_handler = bcast_barrier + }; + memset(&sa.sa_mask, -1, sizeof sa.sa_mask); + __libc_sigaction(SIGSYNCCALL, &sa, 0); + for (td=self->next; td!=self; td=td->next) + __syscall(SYS_tkill, td->tid, SIGSYNCCALL); + for (td=self->next; td!=self; td=td->next) + sem_wait(&barrier_sem); + sa.sa_handler = SIG_IGN; + __libc_sigaction(SIGSYNCCALL, &sa, 0); + sem_destroy(&barrier_sem); + __tl_unlock(); + __restore_sigs(&set); + return 0; + } + return __syscall_ret(r); +} + +void __membarrier_init(void) +{ + /* If membarrier is linked, attempt to pre-register to be able to use + * the private expedited command before the process becomes multi- + * threaded, since registering later has bad, potentially unbounded + * latency. This syscall should be essentially free, and it's arguably + * a mistake in the API design that registration was even required. + * For other commands, registration may impose some cost, so it's left + * to the application to do so if desired. Unfortunately this means + * library code initialized after the process becomes multi-threaded + * cannot use these features without accepting registration latency. 
*/ + __syscall(SYS_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0); +} + +weak_alias(__membarrier, membarrier); diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c index 54c03554..7d4dc2ed 100644 --- a/src/thread/pthread_create.c +++ b/src/thread/pthread_create.c @@ -15,7 +15,7 @@ weak_alias(dummy_0, __release_ptc); weak_alias(dummy_0, __pthread_tsd_run_dtors); weak_alias(dummy_0, __do_orphaned_stdio_locks); weak_alias(dummy_0, __dl_thread_cleanup); -weak_alias(dummy_0, __dl_prepare_for_threads); +weak_alias(dummy_0, __membarrier_init); static int tl_lock_count; static int tl_lock_waiters; @@ -246,7 +246,7 @@ int __pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict att init_file_lock(__stderr_used); __syscall(SYS_rt_sigprocmask, SIG_UNBLOCK, SIGPT_SET, 0, _NSIG/8); self->tsd = (void **)__pthread_tsd_main; - __dl_prepare_for_threads(); + __membarrier_init(); libc.threaded = 1; } if (attrp && !c11) attr = *attrp; -- 2.25.1