From 0dc7e1719993444ef1d25ee0b06af7e73365cc6b Mon Sep 17 00:00:00 2001 From: Andrii Repko Date: Mon, 16 Feb 2026 19:47:49 +0100 Subject: [PATCH] [libcxxabi] Use InitByteFutex for __cxa_guard in WASM Workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using -sWASM_WORKERS, __cxa_guard_acquire uses the GlobalMutex implementation (pthread_mutex_lock + pthread_cond_wait), but libc links pthread stubs where these are all no-ops. This is not just a performance problem — GlobalMutex does a non-atomic read-then-write on the init byte under a no-op lock, so two workers can both see UNSET and both become the initializer (double initialization / undefined behavior). Switch to the InitByteFutex implementation, which uses atomic CAS for correctness. Wait/wake are no-ops, so losers spin in the CAS retry loop rather than sleeping. We cannot use a real memory.atomic.wait32 because it traps on the main browser thread and there is no libcxxabi-compatible way to detect the main thread (emscripten_is_main_browser_thread is JS-only). In practice, contention on a single guard is rare and the spin is bounded by the static constructor duration (typically sub-microsecond). Fixes #26277 --- system/lib/libcxxabi/src/cxa_guard_impl.h | 10 ++++++++++ tools/system_libs.py | 2 ++ 2 files changed, 12 insertions(+) diff --git a/system/lib/libcxxabi/src/cxa_guard_impl.h b/system/lib/libcxxabi/src/cxa_guard_impl.h index 191a589176b1a..8acd355d89bb0 100644 --- a/system/lib/libcxxabi/src/cxa_guard_impl.h +++ b/system/lib/libcxxabi/src/cxa_guard_impl.h @@ -436,6 +436,16 @@ void PlatformFutexWake(int* addr) { __tsan_release(addr); syscall(SYS_futex, addr, WAKE, INT_MAX); } +#elif defined(__EMSCRIPTEN__) && defined(__EMSCRIPTEN_SHARED_MEMORY__) +// WASM Workers: pthread stubs are noops, making GlobalMutex broken +// (non-atomic read-then-write allows double initialization). +// InitByteFutex uses atomic CAS for correct single initialization. 
+// Wait/wake are no-ops — losers spin in the CAS retry loop. +// Cannot use memory.atomic.wait32 (traps on the main browser thread). +// In practice, contention on a single guard is rare and the spin is +// bounded by the static constructor duration (typically sub-microsecond). +void PlatformFutexWait(int*, int) {} +void PlatformFutexWake(int*) {} #else constexpr void (*PlatformFutexWait)(int*, int) = nullptr; constexpr void (*PlatformFutexWake)(int*) = nullptr; diff --git a/tools/system_libs.py b/tools/system_libs.py index 14eccb71331ba..22ecc3ab854fe 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -1648,6 +1648,8 @@ def get_cflags(self): cflags = super().get_cflags() if not self.is_mt and not self.is_ww: cflags.append('-D_LIBCXXABI_HAS_NO_THREADS') + elif self.is_ww: + cflags.append('-D_LIBCXXABI_USE_FUTEX') match self.eh_mode: case Exceptions.NONE: cflags.append('-D_LIBCXXABI_NO_EXCEPTIONS')