#ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_
#define HIGHWAY_HWY_CONTRIB_THREAD_POOL_FUTEX_H_
#include <time.h>
#include <atomic>
#include <climits>
#include "hwy/base.h"
// On Apple platforms, define HWY_DISABLE_FUTEX (unless the user already did)
// when MAC_OS_X_VERSION_MAX_ALLOWED is below 101200. The Apple futex branches
// later in this file are only compiled when HWY_DISABLE_FUTEX is not defined.
// NOTE(review): 101200 presumably denotes macOS 10.12, the first SDK expected
// to provide the __ulock_* calls - confirm.
#if HWY_OS_APPLE
#include <AvailabilityMacros.h>
#if MAC_OS_X_VERSION_MAX_ALLOWED < 101200 && !defined(HWY_DISABLE_FUTEX)
#define HWY_DISABLE_FUTEX
#endif
#endif
// <windows.h> is included on Windows regardless of HWY_DISABLE_FUTEX, because
// NanoSleep uses the waitable-timer API. NOMINMAX and WIN32_LEAN_AND_MEAN are
// defined first (if not already) to limit what the header drags in.
#if HWY_OS_WIN
#ifndef NOMINMAX
#define NOMINMAX
#endif  // NOMINMAX
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif  // WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif  // HWY_OS_WIN
// Per-platform headers, declarations and fallback macros for the futex
// primitives used by BlockUntilDifferent and WakeAll. Exactly one branch is
// taken; the order here matches the order of the branches in those functions.
#if HWY_ARCH_WASM
#include <emscripten/threading.h>
#include <math.h>  // INFINITY

#elif HWY_OS_LINUX
#include <errno.h>        // EAGAIN, EINTR
#include <linux/futex.h>  // FUTEX_*
#include <pthread.h>
#include <sys/syscall.h>  // SYS_*
#include <unistd.h>

// Some C libraries do not provide SYS_futex; fall back to the time64 variant
// if present, otherwise to the raw syscall number.
#ifndef SYS_futex
#ifdef SYS_futex_time64
#define SYS_futex SYS_futex_time64
#else
#define SYS_futex __NR_futex
#endif  // SYS_futex_time64
#endif  // SYS_futex

// Fallbacks for headers that lack the *_PRIVATE operations.
// NOTE(review): 128 presumably equals FUTEX_PRIVATE_FLAG - confirm.
#ifndef FUTEX_WAIT_PRIVATE
#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | 128)
#endif
#ifndef FUTEX_WAKE_PRIVATE
#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | 128)
#endif

#elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX)
// These functions are declared here rather than obtained from a system header.
extern "C" {
int __ulock_wait(uint32_t op, void* address, uint64_t val, uint32_t max_us);
int __ulock_wake(uint32_t op, void* address, uint64_t zero);
}  // extern "C"
#define UL_COMPARE_AND_WAIT 1
#define ULF_WAKE_ALL 0x00000100

#elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX)
// WaitOnAddress / WakeByAddressAll live in synchronization.lib; request it
// from the linker on compilers that understand this pragma.
#if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL
#pragma comment(lib, "synchronization.lib")
#endif

#elif HWY_CXX_LANG < 202002L
// No OS futex and no C++20 std::atomic::wait: fall back to polling with
// NanoSleep.
#define HWY_FUTEX_SLEEP
#endif
namespace hwy {
// Puts the calling thread to sleep for roughly `ns` nanoseconds.
//
// Returns true if the sleep was carried out and false if the underlying OS
// call reported a failure.
//
// Windows: uses a thread-local waitable timer that is created on first use and
// reused afterwards (this function never closes it). The duration is truncated
// to the timer's 100 ns granularity.
// Elsewhere: uses nanosleep(), resuming with the remaining time when a signal
// interrupts the sleep, for at most three attempts in total.
static inline bool NanoSleep(uint64_t ns) {
#if HWY_OS_WIN
  static thread_local HANDLE hTimer = nullptr;
  if (HWY_UNLIKELY(hTimer == nullptr)) {
    // Unnamed manual-reset timer with default security attributes.
    hTimer = CreateWaitableTimer(nullptr, TRUE, nullptr);
    if (hTimer == nullptr) return false;
  }

  // A negative due time requests a relative interval in units of 100 ns.
  LARGE_INTEGER time;
  time.QuadPart = -static_cast<LONGLONG>(ns / 100);
  const LONG period = 0;  // One-shot, not periodic.
  if (!SetWaitableTimer(hTimer, &time, period, nullptr, nullptr, FALSE)) {
    return false;
  }

  // Report a failed wait instead of silently claiming success.
  return WaitForSingleObject(hTimer, INFINITE) == WAIT_OBJECT_0;
#else
  timespec duration;
  duration.tv_sec = static_cast<time_t>(ns / 1000000000);
  duration.tv_nsec = static_cast<decltype(duration.tv_nsec)>(ns % 1000000000);

  // Retry only a bounded number of times so that a steady stream of signals
  // cannot keep us here indefinitely.
  timespec remainder;
  for (int rep = 0; rep < 3; ++rep) {
    if (nanosleep(&duration, &remainder) == 0) return true;
    // Anything other than a signal interruption is a genuine failure; report
    // it, matching what the Windows branch does.
    if (errno != EINTR) return false;
    duration = remainder;
  }
  // Interrupted on every attempt: part of the interval has elapsed; treat
  // this best-effort sleep as done.
  return true;
#endif
}
// Blocks the calling thread until `current` holds a value other than `prev`,
// then returns that value (loaded with acquire ordering).
//
// Each platform branch re-loads `current` in a loop around the OS wait call,
// so spurious wakeups and "value already changed" results simply cause another
// check. The branch taken matches the platform selection at the top of this
// file; WakeAll is the corresponding wake-up call.
static inline uint32_t BlockUntilDifferent(
    const uint32_t prev, const std::atomic<uint32_t>& current) {
  const auto acq = std::memory_order_acquire;

#if HWY_ARCH_WASM
  // The Emscripten API takes a non-const volatile void*.
  volatile void* address =
      const_cast<volatile void*>(static_cast<const volatile void*>(&current));
  const double max_ms = INFINITY;  // No timeout.
  for (;;) {
    const uint32_t next = current.load(acq);
    if (next != prev) return next;
    const int ret = emscripten_futex_wait(address, prev, max_ms);
    HWY_DASSERT(ret >= 0);
    (void)ret;
  }

#elif HWY_OS_LINUX
  // NOTE(review): treats the std::atomic<uint32_t> object as a plain uint32_t
  // for the kernel - assumes identical size and layout.
  const uint32_t* address = reinterpret_cast<const uint32_t*>(&current);
  const int op = FUTEX_WAIT_PRIVATE;
  for (;;) {
    const uint32_t next = current.load(acq);
    if (next != prev) return next;
    // Null timeout: wait without a time limit.
    const auto ret = syscall(SYS_futex, address, op, prev, nullptr, nullptr, 0);
    if (ret == -1) {
      // EAGAIN: the value was already != prev when the kernel checked.
      // EINTR: the wait was interrupted by a signal.
      // Both are benign because the loop re-checks `current`; anything else
      // is an actual error.
      HWY_DASSERT(errno == EAGAIN || errno == EINTR);
    }
  }

#elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX)
  // WaitOnAddress takes non-const pointers for both the watched address and
  // the comparison value.
  volatile void* address =
      const_cast<volatile void*>(static_cast<const volatile void*>(&current));
  PVOID pprev = const_cast<void*>(static_cast<const void*>(&prev));
  const DWORD max_ms = INFINITE;
  for (;;) {
    const uint32_t next = current.load(acq);
    if (next != prev) return next;
    const BOOL ok = WaitOnAddress(address, pprev, sizeof(prev), max_ms);
    HWY_DASSERT(ok);
    (void)ok;
  }

#elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX)
  void* address = const_cast<void*>(static_cast<const void*>(&current));
  for (;;) {
    const uint32_t next = current.load(acq);
    if (next != prev) return next;
    // NOTE(review): max_us = 0 presumably means "no timeout" - confirm.
    __ulock_wait(UL_COMPARE_AND_WAIT, address, prev, 0);
  }

#elif defined(HWY_FUTEX_SLEEP)
  // No futex available: poll, sleeping 2000 ns between checks.
  for (;;) {
    const uint32_t next = current.load(acq);
    if (next != prev) return next;
    NanoSleep(2000);
  }

#elif HWY_CXX_LANG >= 202002L
  // C++20 atomic wait: returns once the value is observed to differ.
  current.wait(prev, acq);
  const uint32_t next = current.load(acq);
  HWY_DASSERT(next != prev);
  return next;

#else
#error "Logic error, should have reached HWY_FUTEX_SLEEP"
#endif
}
// Wakes every thread currently blocked on `current` via the platform's
// wake-all primitive. Does not modify `current` itself.
//
// In the HWY_FUTEX_SLEEP fallback this is a no-op.
static inline void WakeAll(std::atomic<uint32_t>& current) {
#if HWY_ARCH_WASM
  volatile void* address = static_cast<volatile void*>(&current);
  const int max_to_wake = INT_MAX;  // Wake all waiters.
  const int ret = emscripten_futex_wake(address, max_to_wake);
  HWY_DASSERT(ret >= 0);
  (void)ret;

#elif HWY_OS_LINUX
  // NOTE(review): treats the std::atomic<uint32_t> object as a plain uint32_t
  // for the kernel - assumes identical size and layout.
  uint32_t* address = reinterpret_cast<uint32_t*>(&current);
  const int max_to_wake = INT_MAX;  // Wake all waiters.
  const auto ret = syscall(SYS_futex, address, FUTEX_WAKE_PRIVATE, max_to_wake,
                           nullptr, nullptr, 0);
  HWY_DASSERT(ret >= 0);
  (void)ret;

#elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX)
  void* address = static_cast<void*>(&current);
  WakeByAddressAll(address);

#elif HWY_OS_APPLE && !defined(HWY_DISABLE_FUTEX)
  void* address = static_cast<void*>(&current);
  __ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, address, 0);

#elif defined(HWY_FUTEX_SLEEP)
  // Nothing to do here.
  (void)current;

#elif HWY_CXX_LANG >= 202002L
  current.notify_all();

#else
#error "Logic error, should have reached HWY_FUTEX_SLEEP"
#endif
}
}
#endif