From 0dc7e1719993444ef1d25ee0b06af7e73365cc6b Mon Sep 17 00:00:00 2001
From: Andrii Repko <andrii.repko@3shape.com>
Date: Mon, 16 Feb 2026 19:47:49 +0100
Subject: [PATCH] [libcxxabi] Use InitByteFutex for __cxa_guard in WASM Workers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When building with -sWASM_WORKERS, __cxa_guard_acquire uses the
GlobalMutex implementation (pthread_mutex_lock + pthread_cond_wait),
but libc links pthread stubs in which these calls are all no-ops.  This
is not just a performance problem — GlobalMutex does a non-atomic
read-then-write on the init byte under a no-op lock, so two workers can
both see UNSET and both become the initializer (double initialization,
which is undefined behavior).

Switch to the InitByteFutex implementation, which uses an atomic CAS
for correctness.  The wait/wake hooks are no-ops, so threads that lose
the race spin in the CAS retry loop rather than sleeping.  A real
memory.atomic.wait32 cannot be used because it traps on the main
browser thread and there is no libcxxabi-compatible way to detect the
main thread (emscripten_is_main_browser_thread is JS-only).  In
practice, contention on a single guard is rare and the spin lasts only
as long as the winning thread's static initializer runs (typically
sub-microsecond).

Fixes #26277
---
 system/lib/libcxxabi/src/cxa_guard_impl.h | 10 ++++++++++
 tools/system_libs.py                      |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/system/lib/libcxxabi/src/cxa_guard_impl.h b/system/lib/libcxxabi/src/cxa_guard_impl.h
index 191a589176b1a..8acd355d89bb0 100644
--- a/system/lib/libcxxabi/src/cxa_guard_impl.h
+++ b/system/lib/libcxxabi/src/cxa_guard_impl.h
@@ -436,6 +436,16 @@ void PlatformFutexWake(int* addr) {
   __tsan_release(addr);
   syscall(SYS_futex, addr, WAKE, INT_MAX);
 }
+#elif defined(__EMSCRIPTEN__) && defined(__EMSCRIPTEN_SHARED_MEMORY__)
+// WASM Workers: the pthread stubs are no-ops, which breaks GlobalMutex
+// (its non-atomic read-then-write allows double initialization).
+// InitByteFutex uses an atomic CAS, so exactly one thread initializes.
+// Wait/wake are no-ops — threads that lose the race spin in the CAS loop.
+// memory.atomic.wait32 cannot be used: it traps on the main browser thread.
+// In practice, contention on a single guard is rare and the spin lasts only
+// as long as the winner's static initializer (typically sub-microsecond).
+void PlatformFutexWait(int*, int) {}
+void PlatformFutexWake(int*) {}
 #else
 constexpr void (*PlatformFutexWait)(int*, int) = nullptr;
 constexpr void (*PlatformFutexWake)(int*) = nullptr;
diff --git a/tools/system_libs.py b/tools/system_libs.py
index 14eccb71331ba..22ecc3ab854fe 100644
--- a/tools/system_libs.py
+++ b/tools/system_libs.py
@@ -1648,6 +1648,8 @@ def get_cflags(self):
     cflags = super().get_cflags()
     if not self.is_mt and not self.is_ww:
       cflags.append('-D_LIBCXXABI_HAS_NO_THREADS')
+    elif self.is_ww:
+      cflags.append('-D_LIBCXXABI_USE_FUTEX')
     match self.eh_mode:
       case Exceptions.NONE:
         cflags.append('-D_LIBCXXABI_NO_EXCEPTIONS')