#ifndef HIGHWAY_HWY_BASE_H_
#define HIGHWAY_HWY_BASE_H_
#include <stddef.h>
#include <stdint.h>
#if defined(HWY_HEADER_ONLY)
#include <cstdarg>
#include <cstdio>
#endif
#if !defined(HWY_NO_LIBCXX)
#include <ostream>
#endif
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#define HWY_MAJOR 1
#define HWY_MINOR 3
#define HWY_PATCH 0
#define HWY_VERSION_GE(major, minor) \
(HWY_MAJOR > (major) || (HWY_MAJOR == (major) && HWY_MINOR >= (minor)))
#define HWY_VERSION_LT(major, minor) \
(HWY_MAJOR < (major) || (HWY_MAJOR == (major) && HWY_MINOR < (minor)))
#if !HWY_IDE
#if !defined(HWY_NO_LIBCXX)
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#endif
#endif
#if !defined(HWY_NO_LIBCXX) || HWY_COMPILER_MSVC
#include <atomic>
#endif
#ifndef HWY_HAVE_COMPARE_HEADER
#define HWY_HAVE_COMPARE_HEADER 0
#if defined(__has_include)
#if __has_include(<compare>)
#undef HWY_HAVE_COMPARE_HEADER
#define HWY_HAVE_COMPARE_HEADER 1
#endif #endif #endif
#ifndef HWY_HAVE_CXX20_THREE_WAY_COMPARE
#if !defined(HWY_NO_LIBCXX) && defined(__cpp_impl_three_way_comparison) && \
__cpp_impl_three_way_comparison >= 201907L && HWY_HAVE_COMPARE_HEADER
#include <compare>
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 1
#else
#define HWY_HAVE_CXX20_THREE_WAY_COMPARE 0
#endif
#endif
#if HWY_COMPILER_MSVC
#include <string.h>
#endif
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
#if HWY_COMPILER_MSVC
#include <intrin.h>
#define HWY_FUNCTION __FUNCSIG__
#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_UNREACHABLE __assume(false)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif
#else
#define HWY_FUNCTION __PRETTY_FUNCTION__
#define HWY_RESTRICT __restrict__
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#if HWY_COMPILER_GCC || HWY_HAS_BUILTIN(__builtin_unreachable)
#define HWY_UNREACHABLE __builtin_unreachable()
#else
#define HWY_UNREACHABLE
#endif
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
#endif
namespace hwy {
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
__attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr)
#endif
#define HWY_RCAST_ALIGNED(type, ptr) \
reinterpret_cast<type>( \
HWY_ASSUME_ALIGNED((ptr), alignof(hwy::RemovePtr<type>)))
#if HWY_COMPILER_ICC
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#elif HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC_ACTUAL
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
#if HWY_COMPILER_GCC_ACTUAL
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
#if (HWY_CXX_LANG >= 202302L) && HWY_HAS_CPP_ATTRIBUTE(assume)
#define HWY_ASSUME(expr) [[assume(expr)]]
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
#define HWY_ASSUME(expr) __assume(expr)
#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
#define HWY_ASSUME(expr) __builtin_assume(expr)
#elif HWY_COMPILER_GCC_ACTUAL >= 405
#define HWY_ASSUME(expr) \
((expr) ? static_cast<void>(0) : __builtin_unreachable())
#else
#define HWY_ASSUME(expr) static_cast<void>(0)
#endif
#if !defined(HWY_NO_LIBCXX)
#define HWY_FENCE std::atomic_signal_fence(std::memory_order_seq_cst)
#elif HWY_COMPILER_GCC
#define HWY_FENCE asm volatile("" : : : "memory")
#else
#define HWY_FENCE
#endif
#define HWY_REP4(literal) literal, literal, literal, literal
#if defined(HWY_HEADER_ONLY)
HWY_DLLEXPORT inline void HWY_FORMAT(3, 4)
Warn(const char* file, int line, const char* format, ...) {
char buf[800];
va_list args;
va_start(args, format);
vsnprintf(buf, sizeof(buf), format, args);
va_end(args);
fprintf(stderr, "Warn at %s:%d: %s\n", file, line, buf);
}
HWY_DLLEXPORT HWY_NORETURN inline void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...) {
char buf[800];
va_list args;
va_start(args, format);
vsnprintf(buf, sizeof(buf), format, args);
va_end(args);
fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
fflush(stderr);
#if HWY_ARCH_RISCV
exit(1); #else
abort(); #endif
}
#else
typedef void (*WarnFunc)(const char* file, int line, const char* message);
typedef void (*AbortFunc)(const char* file, int line, const char* message);
HWY_DLLEXPORT WarnFunc& GetWarnFunc();
HWY_DLLEXPORT AbortFunc& GetAbortFunc();
HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func);
HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func);
HWY_DLLEXPORT void HWY_FORMAT(3, 4)
Warn(const char* file, int line, const char* format, ...);
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...);
#endif
#define HWY_WARN(format, ...) \
::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__)
#define HWY_ABORT(format, ...) \
::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
#define HWY_ASSERT_M(condition, msg) \
do { \
if (!(condition)) { \
HWY_ABORT("Assert %s: %s", #condition, msg); \
} \
} while (0)
#define HWY_ASSERT(condition) HWY_ASSERT_M(condition, "")
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) || \
defined(__SANITIZE_MEMORY__)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) || \
defined(__SANITIZE_ADDRESS__)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif
#if HWY_HAS_FEATURE(hwaddress_sanitizer) || defined(HWADDRESS_SANITIZER) || \
defined(__SANITIZE_HWADDRESS__)
#define HWY_IS_HWASAN 1
#else
#define HWY_IS_HWASAN 0
#endif
#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) || \
defined(__SANITIZE_THREAD__)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif
#if HWY_HAS_FEATURE(undefined_behavior_sanitizer) || \
defined(UNDEFINED_BEHAVIOR_SANITIZER)
#define HWY_IS_UBSAN 1
#else
#define HWY_IS_UBSAN 0
#endif
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif
#if HWY_IS_ASAN || HWY_IS_HWASAN || HWY_IS_MSAN || HWY_IS_TSAN || HWY_IS_UBSAN
#define HWY_IS_SANITIZER 1
#else
#define HWY_IS_SANITIZER 0
#endif
#if !defined(HWY_IS_DEBUG_BUILD)
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || \
(HWY_IS_SANITIZER && !HWY_IS_UBSAN) || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT_M(condition, msg) HWY_ASSERT_M(condition, msg)
#define HWY_DASSERT(condition) HWY_ASSERT_M(condition, "")
#else
#define HWY_DASSERT_M(condition, msg) \
do { \
} while (0)
#define HWY_DASSERT(condition) \
do { \
} while (0)
#endif
#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
#if HWY_COMPILER_MSVC
memcpy(to, from, kBytes);
#else
__builtin_memcpy(to, from, kBytes);
#endif
}
HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
size_t num_of_bytes_to_copy) {
#if HWY_COMPILER_MSVC
memcpy(to, from, num_of_bytes_to_copy);
#else
__builtin_memcpy(to, from, num_of_bytes_to_copy);
#endif
}
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
static_assert(sizeof(From) == sizeof(To), "");
CopyBytes<sizeof(From)>(from, to);
}
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
memset(to, 0, kBytes);
#else
__builtin_memset(to, 0, kBytes);
#endif
}
HWY_API void ZeroBytes(void* to, size_t num_bytes) {
#if HWY_COMPILER_MSVC
memset(to, 0, num_bytes);
#else
__builtin_memset(to, 0, num_bytes);
#endif
}
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; #elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
__riscv_v_intrinsic >= 11000
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
__riscv_v_intrinsic >= 11000
#define HWY_ALIGN_MAX alignas(8)
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
struct float16_t;
struct bfloat16_t;
using float32_t = float;
using float64_t = double;
#pragma pack(push, 1)
struct alignas(16) uint128_t {
uint64_t lo; uint64_t hi;
};
struct alignas(16) K64V64 {
uint64_t value; uint64_t key;
};
struct alignas(8) K32V32 {
uint32_t value; uint32_t key;
};
#pragma pack(pop)
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
const uint128_t& b) {
return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
}
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
const uint128_t& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
const uint128_t& b) {
return a.lo == b.lo && a.hi == b.hi;
}
#if !defined(HWY_NO_LIBCXX)
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
const uint128_t& n) {
return os << "[hi=" << n.hi << ",lo=" << n.lo << "]";
}
#endif
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
const K64V64& b) {
return a.key < b.key;
}
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
const K64V64& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
const K64V64& b) {
return a.key == b.key;
}
#if !defined(HWY_NO_LIBCXX)
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
const K64V64& n) {
return os << "[k=" << n.key << ",v=" << n.value << "]";
}
#endif
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
const K32V32& b) {
return a.key < b.key;
}
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
const K32V32& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
const K32V32& b) {
return a.key == b.key;
}
#if !defined(HWY_NO_LIBCXX)
static inline HWY_MAYBE_UNUSED std::ostream& operator<<(std::ostream& os,
const K32V32& n) {
return os << "[k=" << n.key << ",v=" << n.value << "]";
}
#endif
template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
using type = void;
};
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
template <typename T, typename U>
struct IsSameT {
enum { value = 0 };
};
template <typename T>
struct IsSameT<T, T> {
enum { value = 1 };
};
template <typename T, typename U>
HWY_API constexpr bool IsSame() {
return IsSameT<T, U>::value;
}
template <typename T, typename U1, typename U2>
HWY_API constexpr bool IsSameEither() {
return IsSameT<T, U1>::value || IsSameT<T, U2>::value;
}
template <bool Condition, typename Then, typename Else>
struct IfT {
using type = Then;
};
template <class Then, class Else>
struct IfT<false, Then, Else> {
using type = Else;
};
template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;
template <typename T>
struct IsConstT {
enum { value = 0 };
};
template <typename T>
struct IsConstT<const T> {
enum { value = 1 };
};
template <typename T>
HWY_API constexpr bool IsConst() {
return IsConstT<T>::value;
}
template <class T>
struct RemoveConstT {
using type = T;
};
template <class T>
struct RemoveConstT<const T> {
using type = T;
};
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
template <class T>
struct RemoveVolatileT {
using type = T;
};
template <class T>
struct RemoveVolatileT<volatile T> {
using type = T;
};
template <class T>
using RemoveVolatile = typename RemoveVolatileT<T>::type;
template <class T>
struct RemoveRefT {
using type = T;
};
template <class T>
struct RemoveRefT<T&> {
using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
using type = T;
};
template <class T>
using RemoveRef = typename RemoveRefT<T>::type;
template <class T>
using RemoveCvRef = RemoveConst<RemoveVolatile<RemoveRef<T>>>;
template <class T>
struct RemovePtrT {
using type = T;
};
template <class T>
struct RemovePtrT<T*> {
using type = T;
};
template <class T>
struct RemovePtrT<const T*> {
using type = T;
};
template <class T>
struct RemovePtrT<volatile T*> {
using type = T;
};
template <class T>
struct RemovePtrT<const volatile T*> {
using type = T;
};
template <class T>
using RemovePtr = typename RemovePtrT<T>::type;
#define HWY_IF_V_SIZE(T, kN, bytes) \
hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr
#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_NOT_UNSIGNED(T) hwy::EnableIf<hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>() && \
!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_INTEGER(T) hwy::EnableIf<hwy::IsInteger<T>()>* = nullptr
#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
#define HWY_IF_T_SIZE_LE(T, bytes) \
hwy::EnableIf<(sizeof(T) <= (bytes))>* = nullptr
#define HWY_IF_T_SIZE_GT(T, bytes) \
hwy::EnableIf<(sizeof(T) > (bytes))>* = nullptr
#define HWY_IF_SAME(T, expected) \
hwy::EnableIf<hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
#define HWY_IF_NOT_SAME(T, expected) \
hwy::EnableIf<!hwy::IsSame<hwy::RemoveCvRef<T>, expected>()>* = nullptr
#define HWY_IF_SAME2(T, expected1, expected2) \
hwy::EnableIf< \
hwy::IsSameEither<hwy::RemoveCvRef<T>, expected1, expected2>()>* = \
nullptr
#define HWY_IF_U8(T) HWY_IF_SAME(T, uint8_t)
#define HWY_IF_U16(T) HWY_IF_SAME(T, uint16_t)
#define HWY_IF_U32(T) HWY_IF_SAME(T, uint32_t)
#define HWY_IF_U64(T) HWY_IF_SAME(T, uint64_t)
#define HWY_IF_I8(T) HWY_IF_SAME(T, int8_t)
#define HWY_IF_I16(T) HWY_IF_SAME(T, int16_t)
#define HWY_IF_I32(T) HWY_IF_SAME(T, int32_t)
#define HWY_IF_I64(T) HWY_IF_SAME(T, int64_t)
#define HWY_IF_BF16(T) HWY_IF_SAME(T, hwy::bfloat16_t)
#define HWY_IF_NOT_BF16(T) HWY_IF_NOT_SAME(T, hwy::bfloat16_t)
#define HWY_IF_F16(T) HWY_IF_SAME(T, hwy::float16_t)
#define HWY_IF_NOT_F16(T) HWY_IF_NOT_SAME(T, hwy::float16_t)
#define HWY_IF_F32(T) HWY_IF_SAME(T, float)
#define HWY_IF_F64(T) HWY_IF_SAME(T, double)
#define HWY_IF_UI8(T) HWY_IF_SAME2(T, uint8_t, int8_t)
#define HWY_IF_UI16(T) HWY_IF_SAME2(T, uint16_t, int16_t)
#define HWY_IF_UI32(T) HWY_IF_SAME2(T, uint32_t, int32_t)
#define HWY_IF_UI64(T) HWY_IF_SAME2(T, uint64_t, int64_t)
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
template <size_t N>
struct SizeTag {};
template <class T>
class DeclValT {
private:
template <class U, class URef = U&&>
static URef TryAddRValRef(int);
template <class U, class Arg>
static U TryAddRValRef(Arg);
public:
using type = decltype(TryAddRValRef<T>(0));
enum { kDisableDeclValEvaluation = 1 };
};
template <class T>
HWY_API typename DeclValT<T>::type DeclVal() noexcept {
static_assert(!DeclValT<T>::kDisableDeclValEvaluation,
"DeclVal() cannot be used in an evaluated context");
}
template <class T>
struct IsArrayT {
enum { value = 0 };
};
template <class T>
struct IsArrayT<T[]> {
enum { value = 1 };
};
template <class T, size_t N>
struct IsArrayT<T[N]> {
enum { value = 1 };
};
template <class T>
static constexpr bool IsArray() {
return IsArrayT<T>::value;
}
#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4180, ignored "-Wignored-qualifiers")
#endif
template <class From, class To>
class IsConvertibleT {
private:
template <class T>
static hwy::SizeTag<1> TestFuncWithToArg(T);
template <class T, class U>
static decltype(IsConvertibleT<T, U>::template TestFuncWithToArg<U>(
DeclVal<T>()))
TryConvTest(int);
template <class T, class U, class Arg>
static hwy::SizeTag<0> TryConvTest(Arg);
public:
enum {
value = (IsSame<RemoveConst<RemoveVolatile<From>>, void>() &&
IsSame<RemoveConst<RemoveVolatile<To>>, void>()) ||
(!IsArray<To>() &&
(IsSame<To, decltype(DeclVal<To>())>() ||
!IsSame<const RemoveConst<To>, RemoveConst<To>>()) &&
IsSame<decltype(TryConvTest<From, To>(0)), hwy::SizeTag<1>>())
};
};
#if HWY_COMPILER_MSVC
HWY_DIAGNOSTICS(pop)
#endif
template <class From, class To>
HWY_API constexpr bool IsConvertible() {
return IsConvertibleT<From, To>::value;
}
template <class From, class To>
class IsStaticCastableT {
private:
template <class T, class U, class = decltype(static_cast<U>(DeclVal<T>()))>
static hwy::SizeTag<1> TryStaticCastTest(int);
template <class T, class U, class Arg>
static hwy::SizeTag<0> TryStaticCastTest(Arg);
public:
enum {
value = IsSame<decltype(TryStaticCastTest<From, To>(0)), hwy::SizeTag<1>>()
};
};
template <class From, class To>
static constexpr bool IsStaticCastable() {
return IsStaticCastableT<From, To>::value;
}
#define HWY_IF_CASTABLE(From, To) \
hwy::EnableIf<IsStaticCastable<From, To>()>* = nullptr
#define HWY_IF_OP_CASTABLE(op, T, Native) \
HWY_IF_CASTABLE(decltype(DeclVal<Native>() op DeclVal<T>()), Native)
template <class T, class From>
class IsAssignableT {
private:
template <class T1, class T2, class = decltype(DeclVal<T1>() = DeclVal<T2>())>
static hwy::SizeTag<1> TryAssignTest(int);
template <class T1, class T2, class Arg>
static hwy::SizeTag<0> TryAssignTest(Arg);
public:
enum {
value = IsSame<decltype(TryAssignTest<T, From>(0)), hwy::SizeTag<1>>()
};
};
template <class T, class From>
static constexpr bool IsAssignable() {
return IsAssignableT<T, From>::value;
}
#define HWY_IF_ASSIGNABLE(T, From) \
hwy::EnableIf<IsAssignable<T, From>()>* = nullptr
template <typename T>
HWY_API constexpr bool IsSpecialFloat() {
return IsSameEither<RemoveCvRef<T>, hwy::float16_t, hwy::bfloat16_t>();
}
template <class T>
HWY_API constexpr bool IsIntegerLaneType() {
return false;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int8_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint8_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int16_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint16_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int32_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint32_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<int64_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsIntegerLaneType<uint64_t>() {
return true;
}
namespace detail {
template <class T>
static HWY_INLINE constexpr bool IsNonCvInteger() {
return IsIntegerLaneType<T>() || IsSame<T, wchar_t>() ||
IsSameEither<T, size_t, ptrdiff_t>() ||
IsSameEither<T, intptr_t, uintptr_t>();
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<bool>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<signed char>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned char>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<short>() { return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned short>() { return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<int>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<long>() { return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long>() { return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<long long>() { return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<unsigned long long>() { return true;
}
#if defined(__cpp_char8_t) && __cpp_char8_t >= 201811L
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char8_t>() {
return true;
}
#endif
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char16_t>() {
return true;
}
template <>
HWY_INLINE constexpr bool IsNonCvInteger<char32_t>() {
return true;
}
}
template <class T>
HWY_API constexpr bool IsInteger() {
return detail::IsNonCvInteger<RemoveCvRef<T>>();
}
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
#define HWY_BITCASTSCALAR_CONSTEXPR constexpr
#else
#define HWY_BITCASTSCALAR_CONSTEXPR
#endif
#if __cpp_constexpr >= 201304L
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#else
#define HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {
template <class From>
struct BitCastScalarSrcCastHelper {
static HWY_INLINE constexpr const From& CastSrcValRef(const From& val) {
return val;
}
};
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
template <class To, class From,
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<To>>() &&
hwy::IsInteger<RemoveCvRef<From>>()>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
static_assert(sizeof(To) == sizeof(From),
"sizeof(To) == sizeof(From) must be true");
return static_cast<To>(val);
}
template <class To, class From,
hwy::EnableIf<!(hwy::IsInteger<RemoveCvRef<To>>() &&
hwy::IsInteger<RemoveCvRef<From>>())>* = nullptr>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR To
BuiltinBitCastScalar(const From& val) {
return __builtin_bit_cast(To, val);
}
#endif
}
template <class To, class From, HWY_IF_NOT_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
#if HWY_COMPILER_CLANG >= 900 && HWY_COMPILER_CLANG < 1000
return detail::BuiltinBitCastScalar<To>(
detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
val));
#else
return __builtin_bit_cast(
To, detail::BitCastScalarSrcCastHelper<RemoveCvRef<From>>::CastSrcValRef(
val));
#endif
}
template <class To, class From, HWY_IF_SPECIAL_FLOAT(To)>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
return To::FromBits(BitCastScalar<uint16_t>(val));
}
#else
template <class To, class From>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR To BitCastScalar(const From& val) {
To result;
CopySameSize(&val, &result);
return result;
}
#endif
#pragma pack(push, 1)
#ifndef HWY_NEON_HAVE_F16C
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \
(HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
(HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
#define HWY_NEON_HAVE_F16C 1
#else
#define HWY_NEON_HAVE_F16C 0
#endif
#endif
#if HWY_ARCH_RISCV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG >= 1600
#define HWY_RVV_HAVE_F16_VEC 1
#else
#define HWY_RVV_HAVE_F16_VEC 0
#endif
#if HWY_ARCH_X86 && defined(__SSE2__) && defined(__FLT16_MAX__) && \
((HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL) || \
HWY_COMPILER_GCC_ACTUAL >= 1200)
#define HWY_SSE2_HAVE_F16_TYPE 1
#else
#define HWY_SSE2_HAVE_F16_TYPE 0
#endif
#ifndef HWY_HAVE_SCALAR_F16_TYPE
#if HWY_NEON_HAVE_F16C || HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || \
__SPIRV_DEVICE__
#define HWY_HAVE_SCALAR_F16_TYPE 1
#else
#define HWY_HAVE_SCALAR_F16_TYPE 0
#endif
#endif
#ifndef HWY_HAVE_SCALAR_F16_OPERATORS
#if HWY_HAVE_SCALAR_F16_TYPE && \
(HWY_COMPILER_CLANG >= 1800 || HWY_COMPILER_GCC_ACTUAL >= 1200 || \
(HWY_COMPILER_CLANG >= 1500 && !HWY_COMPILER_CLANGCL && \
!defined(_WIN32)) || \
(HWY_ARCH_ARM && \
(HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)))
#define HWY_HAVE_SCALAR_F16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_F16_OPERATORS 0
#endif
#endif
namespace detail {
template <class T, class TVal = RemoveCvRef<T>, bool = IsSpecialFloat<TVal>()>
struct SpecialFloatUnwrapArithOpOperandT {};
template <class T, class TVal>
struct SpecialFloatUnwrapArithOpOperandT<T, TVal, false> {
using type = T;
};
template <class T>
using SpecialFloatUnwrapArithOpOperand =
typename SpecialFloatUnwrapArithOpOperandT<T>::type;
template <class T, class TVal = RemoveCvRef<T>>
struct NativeSpecialFloatToWrapperT {
using type = T;
};
template <class T>
using NativeSpecialFloatToWrapper =
typename NativeSpecialFloatToWrapperT<T>::type;
}
struct alignas(2) float16_t {
#if HWY_HAVE_SCALAR_F16_TYPE
#if HWY_RVV_HAVE_F16_VEC || HWY_SSE2_HAVE_F16_TYPE || __SPIRV_DEVICE__
using Native = _Float16;
#elif HWY_NEON_HAVE_F16C
using Native = __fp16;
#else
#error "Logic error: condition should be 'all but NEON_HAVE_F16C'"
#endif
#elif HWY_IDE
using Native = uint16_t;
#endif
union {
#if HWY_HAVE_SCALAR_F16_TYPE || HWY_IDE
Native native;
#endif
uint16_t bits;
};
float16_t() noexcept = default;
constexpr float16_t(const float16_t&) noexcept = default;
constexpr float16_t(float16_t&&) noexcept = default;
float16_t& operator=(const float16_t&) noexcept = default;
float16_t& operator=(float16_t&&) noexcept = default;
#if HWY_HAVE_SCALAR_F16_TYPE
constexpr float16_t(Native arg) noexcept : native(arg) {}
constexpr operator Native() const noexcept { return native; }
#endif
#if HWY_HAVE_SCALAR_F16_TYPE
static HWY_BITCASTSCALAR_CONSTEXPR float16_t FromBits(uint16_t bits) {
return float16_t(BitCastScalar<Native>(bits));
}
#else
private:
struct F16FromU16BitsTag {};
constexpr float16_t(F16FromU16BitsTag , uint16_t u16_bits)
: bits(u16_bits) {}
public:
static constexpr float16_t FromBits(uint16_t bits) {
return float16_t(F16FromU16BitsTag(), bits);
}
#endif
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
IsConvertible<T, Native>()>* = nullptr>
constexpr float16_t(T&& arg) noexcept
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, float16_t>() &&
!IsConvertible<T, Native>() &&
IsStaticCastable<T, Native>()>* = nullptr>
explicit constexpr float16_t(T&& arg) noexcept
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
HWY_CXX14_CONSTEXPR float16_t& operator--() noexcept {
native = static_cast<Native>(native - Native{1});
return *this;
}
HWY_CXX14_CONSTEXPR float16_t operator--(int) noexcept {
float16_t result = *this;
native = static_cast<Native>(native - Native{1});
return result;
}
HWY_CXX14_CONSTEXPR float16_t& operator++() noexcept {
native = static_cast<Native>(native + Native{1});
return *this;
}
HWY_CXX14_CONSTEXPR float16_t operator++(int) noexcept {
float16_t result = *this;
native = static_cast<Native>(native + Native{1});
return result;
}
constexpr float16_t operator-() const noexcept {
return float16_t(static_cast<Native>(-native));
}
constexpr float16_t operator+() const noexcept { return *this; }
#define HWY_FLOAT16_BINARY_OP(op, op_func, assign_func) \
constexpr float16_t op_func(const float16_t& rhs) const noexcept { \
return float16_t(static_cast<Native>(native op rhs.native)); \
} \
template <typename T, HWY_IF_NOT_F16(T), \
typename UnwrappedT = \
detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
typename RawResultT = \
decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
typename ResultT = \
detail::NativeSpecialFloatToWrapper<RawResultT>, \
HWY_IF_CASTABLE(RawResultT, ResultT)> \
constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
} \
HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func( \
const hwy::float16_t& rhs) noexcept { \
native = static_cast<Native>(native op rhs.native); \
return *this; \
} \
template <typename T, HWY_IF_NOT_F16(T), \
HWY_IF_OP_CASTABLE(op, const T&, Native), \
HWY_IF_ASSIGNABLE( \
Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
HWY_CXX14_CONSTEXPR hwy::float16_t& assign_func(const T& rhs) noexcept( \
noexcept( \
static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
native = static_cast<Native>(native op rhs); \
return *this; \
}
HWY_FLOAT16_BINARY_OP(+, operator+, operator+=)
HWY_FLOAT16_BINARY_OP(-, operator-, operator-=)
HWY_FLOAT16_BINARY_OP(*, operator*, operator*=)
HWY_FLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_FLOAT16_BINARY_OP
#endif };
static_assert(sizeof(hwy::float16_t) == 2, "Wrong size of float16_t");
#if HWY_HAVE_SCALAR_F16_TYPE
namespace detail {
#if HWY_HAVE_SCALAR_F16_OPERATORS
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::float16_t, true> {
using type = hwy::float16_t::Native;
};
#endif
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::float16_t::Native> {
using type = hwy::float16_t;
};
} #endif
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {
template <>
struct BitCastScalarSrcCastHelper<hwy::float16_t> {
#if HWY_HAVE_SCALAR_F16_TYPE
static HWY_INLINE constexpr const hwy::float16_t::Native& CastSrcValRef(
const hwy::float16_t& val) {
return val.native;
}
#else
static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
const hwy::float16_t& val) {
return val.bits;
}
#endif
};
} #endif
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_F16_CONSTEXPR constexpr
#else
#define HWY_F16_CONSTEXPR HWY_BITCASTSCALAR_CXX14_CONSTEXPR
#endif
HWY_API HWY_F16_CONSTEXPR float F32FromF16(float16_t f16) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
return static_cast<float>(f16);
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
const uint16_t bits16 = BitCastScalar<uint16_t>(f16);
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
if (biased_exp == 0) {
const float subnormal =
(1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
return sign ? -subnormal : subnormal;
}
const uint32_t biased_exp32 =
biased_exp == 31 ? 0xFF : biased_exp + (127 - 15);
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
return BitCastScalar<float>(bits32);
#endif }
#if HWY_IS_DEBUG_BUILD && \
(HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926)
#if defined(__cpp_if_consteval) && __cpp_if_consteval >= 202106L
#define HWY_F16_FROM_F32_DASSERT(condition) \
do { \
if !consteval { \
HWY_DASSERT(condition); \
} \
} while (0)
#elif HWY_HAS_BUILTIN(__builtin_is_constant_evaluated) || \
HWY_COMPILER_MSVC >= 1926
#define HWY_F16_FROM_F32_DASSERT(condition) \
do { \
if (!__builtin_is_constant_evaluated()) { \
HWY_DASSERT(condition); \
} \
} while (0)
#else
#define HWY_F16_FROM_F32_DASSERT(condition) \
do { \
} while (0)
#endif #else
#define HWY_F16_FROM_F32_DASSERT(condition) HWY_DASSERT(condition)
#endif
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF32(float f32) {
#if HWY_HAVE_SCALAR_F16_OPERATORS && !HWY_IDE
return float16_t(static_cast<float16_t::Native>(f32));
#endif
#if !HWY_HAVE_SCALAR_F16_OPERATORS || HWY_IDE
const uint32_t bits32 = BitCastScalar<uint32_t>(f32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
constexpr uint32_t kMantissaMask = 0x7FFFFF;
const uint32_t mantissa32 = bits32 & kMantissaMask;
const int32_t f16_ulp_bit_idx =
HWY_MIN(HWY_MAX(126 - static_cast<int32_t>(biased_exp32), 13), 24);
const uint32_t odd_bit = ((mantissa32 | 0x800000u) >> f16_ulp_bit_idx) & 1;
const uint32_t rounded =
mantissa32 + odd_bit + (uint32_t{1} << (f16_ulp_bit_idx - 1)) - 1u;
const bool carry = rounded >= (1u << 23);
const int32_t exp = static_cast<int32_t>(biased_exp32) - 127 + carry;
if (exp < -24) {
return float16_t::FromBits(static_cast<uint16_t>(sign << 15));
}
const bool is_nan = (biased_exp32 == 255) && mantissa32 != 0;
const bool overflowed = exp >= 16;
const uint32_t biased_exp16 =
static_cast<uint32_t>(HWY_MIN(HWY_MAX(0, exp + 15), 31));
const uint32_t sub_exp = static_cast<uint32_t>(HWY_MAX(-14 - exp, 0));
HWY_F16_FROM_F32_DASSERT(sub_exp < 11);
const uint32_t shifted_mantissa =
(rounded & kMantissaMask) >> (23 - 10 + sub_exp);
const uint32_t leading = sub_exp == 0u ? 0u : (1024u >> sub_exp);
const uint32_t mantissa16 = is_nan ? 0x3FF
: overflowed ? 0u
: (leading + shifted_mantissa);
#if HWY_IS_DEBUG_BUILD
if (exp < -14) {
HWY_F16_FROM_F32_DASSERT(biased_exp16 == 0);
HWY_F16_FROM_F32_DASSERT(sub_exp >= 1);
} else if (exp <= 15) {
HWY_F16_FROM_F32_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
HWY_F16_FROM_F32_DASSERT(sub_exp == 0);
}
#endif
HWY_F16_FROM_F32_DASSERT(mantissa16 < 1024);
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
HWY_F16_FROM_F32_DASSERT(bits16 < 0x10000);
const uint16_t narrowed = static_cast<uint16_t>(bits16); return float16_t::FromBits(narrowed);
#endif }
HWY_API HWY_F16_CONSTEXPR float16_t F16FromF64(double f64) {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return float16_t(static_cast<float16_t::Native>(f64));
#else
return F16FromF32(
static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
(BitCastScalar<uint64_t>(f64) & 0xFFFFFFFFE0000000ULL) |
((BitCastScalar<uint64_t>(f64) + 0x000000001FFFFFFFULL) &
0x0000000020000000ULL)))));
#endif
}
HWY_F16_CONSTEXPR inline bool operator==(float16_t lhs,
float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native == rhs.native;
#else
return F32FromF16(lhs) == F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator!=(float16_t lhs,
float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native != rhs.native;
#else
return F32FromF16(lhs) != F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native < rhs.native;
#else
return F32FromF16(lhs) < F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator<=(float16_t lhs,
float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native <= rhs.native;
#else
return F32FromF16(lhs) <= F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>(float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native > rhs.native;
#else
return F32FromF16(lhs) > F32FromF16(rhs);
#endif
}
HWY_F16_CONSTEXPR inline bool operator>=(float16_t lhs,
float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native >= rhs.native;
#else
return F32FromF16(lhs) >= F32FromF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_F16_CONSTEXPR inline std::partial_ordering operator<=>(
float16_t lhs, float16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_F16_OPERATORS
return lhs.native <=> rhs.native;
#else
return F32FromF16(lhs) <=> F32FromF16(rhs);
#endif
}
#endif
#if HWY_ARCH_ARM_A64 && \
(HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400)
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_ARM_HAVE_SCALAR_BF16_TYPE 0
#endif
#ifndef HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#if HWY_ARCH_X86 && defined(__SSE2__) && \
((HWY_COMPILER_CLANG >= 1700 && !HWY_COMPILER_CLANGCL && \
!HWY_IS_DEBUG_BUILD) || \
HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_SSE2_HAVE_SCALAR_BF16_TYPE 0
#endif
#endif
#if HWY_ARM_HAVE_SCALAR_BF16_TYPE || HWY_SSE2_HAVE_SCALAR_BF16_TYPE
#define HWY_HAVE_SCALAR_BF16_TYPE 1
#else
#define HWY_HAVE_SCALAR_BF16_TYPE 0
#endif
#ifndef HWY_HAVE_SCALAR_BF16_OPERATORS
#if HWY_HAVE_SCALAR_BF16_TYPE && (HWY_COMPILER_GCC_ACTUAL >= 1300)
#define HWY_HAVE_SCALAR_BF16_OPERATORS 1
#else
#define HWY_HAVE_SCALAR_BF16_OPERATORS 0
#endif
#endif
#if HWY_HAVE_SCALAR_BF16_OPERATORS
#define HWY_BF16_CONSTEXPR constexpr
#else
#define HWY_BF16_CONSTEXPR HWY_BITCASTSCALAR_CONSTEXPR
#endif
struct alignas(2) bfloat16_t {
#if HWY_HAVE_SCALAR_BF16_TYPE
using Native = __bf16;
#elif HWY_IDE
using Native = uint16_t;
#endif
union {
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
Native native;
#endif
uint16_t bits;
};
bfloat16_t() noexcept = default;
constexpr bfloat16_t(bfloat16_t&&) noexcept = default;
constexpr bfloat16_t(const bfloat16_t&) noexcept = default;
bfloat16_t& operator=(bfloat16_t&& arg) noexcept = default;
bfloat16_t& operator=(const bfloat16_t& arg) noexcept = default;
#if HWY_HAVE_SCALAR_BF16_TYPE || HWY_IDE
constexpr bfloat16_t(Native arg) noexcept : native(arg) {}
constexpr operator Native() const noexcept { return native; }
#endif
#if HWY_HAVE_SCALAR_BF16_TYPE
static HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t FromBits(uint16_t bits) {
return bfloat16_t(BitCastScalar<Native>(bits));
}
#else
private:
struct BF16FromU16BitsTag {};
constexpr bfloat16_t(BF16FromU16BitsTag , uint16_t u16_bits)
: bits(u16_bits) {}
public:
static constexpr bfloat16_t FromBits(uint16_t bits) {
return bfloat16_t(BF16FromU16BitsTag(), bits);
}
#endif
#if HWY_HAVE_SCALAR_BF16_OPERATORS || HWY_IDE
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
!IsSame<RemoveCvRef<T>, bfloat16_t>() &&
IsConvertible<T, Native>()>* = nullptr>
constexpr bfloat16_t(T&& arg) noexcept(
noexcept(static_cast<Native>(DeclVal<T>())))
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
template <typename T, hwy::EnableIf<!IsSame<RemoveCvRef<T>, Native>() &&
!IsSame<RemoveCvRef<T>, bfloat16_t>() &&
!IsConvertible<T, Native>() &&
IsStaticCastable<T, Native>()>* = nullptr>
explicit constexpr bfloat16_t(T&& arg) noexcept(
noexcept(static_cast<Native>(DeclVal<T>())))
: native(static_cast<Native>(static_cast<T&&>(arg))) {}
HWY_CXX14_CONSTEXPR bfloat16_t& operator=(Native arg) noexcept {
native = arg;
return *this;
}
HWY_CXX14_CONSTEXPR bfloat16_t& operator--() noexcept {
native = static_cast<Native>(native - Native{1});
return *this;
}
HWY_CXX14_CONSTEXPR bfloat16_t operator--(int) noexcept {
bfloat16_t result = *this;
native = static_cast<Native>(native - Native{1});
return result;
}
HWY_CXX14_CONSTEXPR bfloat16_t& operator++() noexcept {
native = static_cast<Native>(native + Native{1});
return *this;
}
HWY_CXX14_CONSTEXPR bfloat16_t operator++(int) noexcept {
bfloat16_t result = *this;
native = static_cast<Native>(native + Native{1});
return result;
}
constexpr bfloat16_t operator-() const noexcept {
return bfloat16_t(static_cast<Native>(-native));
}
constexpr bfloat16_t operator+() const noexcept { return *this; }
#define HWY_BFLOAT16_BINARY_OP(op, op_func, assign_func) \
constexpr bfloat16_t op_func(const bfloat16_t& rhs) const noexcept { \
return bfloat16_t(static_cast<Native>(native op rhs.native)); \
} \
template <typename T, HWY_IF_NOT_BF16(T), \
typename UnwrappedT = \
detail::SpecialFloatUnwrapArithOpOperand<const T&>, \
typename RawResultT = \
decltype(DeclVal<Native>() op DeclVal<UnwrappedT>()), \
typename ResultT = \
detail::NativeSpecialFloatToWrapper<RawResultT>, \
HWY_IF_CASTABLE(RawResultT, ResultT)> \
constexpr ResultT op_func(const T& rhs) const noexcept(noexcept( \
static_cast<ResultT>(DeclVal<Native>() op DeclVal<UnwrappedT>()))) { \
return static_cast<ResultT>(native op static_cast<UnwrappedT>(rhs)); \
} \
HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func( \
const hwy::bfloat16_t& rhs) noexcept { \
native = static_cast<Native>(native op rhs.native); \
return *this; \
} \
template <typename T, HWY_IF_NOT_BF16(T), \
HWY_IF_OP_CASTABLE(op, const T&, Native), \
HWY_IF_ASSIGNABLE( \
Native, decltype(DeclVal<Native>() op DeclVal<const T&>()))> \
HWY_CXX14_CONSTEXPR hwy::bfloat16_t& assign_func(const T& rhs) noexcept( \
noexcept( \
static_cast<Native>(DeclVal<Native>() op DeclVal<const T&>()))) { \
native = static_cast<Native>(native op rhs); \
return *this; \
}
HWY_BFLOAT16_BINARY_OP(+, operator+, operator+=)
HWY_BFLOAT16_BINARY_OP(-, operator-, operator-=)
HWY_BFLOAT16_BINARY_OP(*, operator*, operator*=)
HWY_BFLOAT16_BINARY_OP(/, operator/, operator/=)
#undef HWY_BFLOAT16_BINARY_OP
#endif };
static_assert(sizeof(hwy::bfloat16_t) == 2, "Wrong size of bfloat16_t");
#pragma pack(pop)
#if HWY_HAVE_SCALAR_BF16_TYPE
namespace detail {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
template <class T>
struct SpecialFloatUnwrapArithOpOperandT<T, hwy::bfloat16_t, true> {
using type = hwy::bfloat16_t::Native;
};
#endif
template <class T>
struct NativeSpecialFloatToWrapperT<T, hwy::bfloat16_t::Native> {
using type = hwy::bfloat16_t;
};
} #endif
#if HWY_HAS_BUILTIN(__builtin_bit_cast) || HWY_COMPILER_MSVC >= 1926
namespace detail {
template <>
struct BitCastScalarSrcCastHelper<hwy::bfloat16_t> {
#if HWY_HAVE_SCALAR_BF16_TYPE
static HWY_INLINE constexpr const hwy::bfloat16_t::Native& CastSrcValRef(
const hwy::bfloat16_t& val) {
return val.native;
}
#else
static HWY_INLINE constexpr const uint16_t& CastSrcValRef(
const hwy::bfloat16_t& val) {
return val.bits;
}
#endif
};
} #endif
HWY_API HWY_BF16_CONSTEXPR float F32FromBF16(bfloat16_t bf) {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return static_cast<float>(bf);
#else
return BitCastScalar<float>(static_cast<uint32_t>(
static_cast<uint32_t>(BitCastScalar<uint16_t>(bf)) << 16));
#endif
}
namespace detail {
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint32_t F32BitsToBF16RoundIncr(
const uint32_t f32_bits) {
return static_cast<uint32_t>(((f32_bits & 0x7FFFFFFFu) < 0x7F800000u)
? (0x7FFFu + ((f32_bits >> 16) & 1u))
: 0u);
}
static HWY_INLINE constexpr uint32_t BF16BitsIfSNAN(uint32_t f32_bits) {
return ((f32_bits & 0x7FFFFFFFu) > 0x7F800000u) ? (uint32_t{1} << 6) : 0;
}
static HWY_INLINE HWY_MAYBE_UNUSED constexpr uint16_t F32BitsToBF16Bits(
const uint32_t f32_bits) {
return static_cast<uint16_t>(
BF16BitsIfSNAN(f32_bits) |
((f32_bits + F32BitsToBF16RoundIncr(f32_bits)) >> 16));
}
}
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF32(float f) {
return bfloat16_t::FromBits(
detail::F32BitsToBF16Bits(BitCastScalar<uint32_t>(f)));
}
HWY_API HWY_BF16_CONSTEXPR bfloat16_t BF16FromF64(double f64) {
return BF16FromF32(
static_cast<float>(BitCastScalar<double>(static_cast<uint64_t>(
(BitCastScalar<uint64_t>(f64) & 0xFFFFFFC000000000ULL) |
((BitCastScalar<uint64_t>(f64) + 0x0000003FFFFFFFFFULL) &
0x0000004000000000ULL)))));
}
HWY_BF16_CONSTEXPR inline bool operator==(bfloat16_t lhs,
bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native == rhs.native;
#else
return F32FromBF16(lhs) == F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator!=(bfloat16_t lhs,
bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native != rhs.native;
#else
return F32FromBF16(lhs) != F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator<(bfloat16_t lhs,
bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native < rhs.native;
#else
return F32FromBF16(lhs) < F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator<=(bfloat16_t lhs,
bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native <= rhs.native;
#else
return F32FromBF16(lhs) <= F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator>(bfloat16_t lhs,
bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native > rhs.native;
#else
return F32FromBF16(lhs) > F32FromBF16(rhs);
#endif
}
HWY_BF16_CONSTEXPR inline bool operator>=(bfloat16_t lhs,
bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native >= rhs.native;
#else
return F32FromBF16(lhs) >= F32FromBF16(rhs);
#endif
}
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_BF16_CONSTEXPR inline std::partial_ordering operator<=>(
bfloat16_t lhs, bfloat16_t rhs) noexcept {
#if HWY_HAVE_SCALAR_BF16_OPERATORS
return lhs.native <=> rhs.native;
#else
return F32FromBF16(lhs) <=> F32FromBF16(rhs);
#endif
}
#endif
namespace detail {
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = uint16_t;
enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = int16_t;
enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Float = float16_t;
using Wide = uint32_t;
using Narrow = uint8_t;
enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Float = float16_t;
using Wide = int32_t;
using Narrow = int8_t;
enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = uint64_t;
using Narrow = uint16_t;
enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = int64_t;
using Narrow = int16_t;
enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Wide = uint128_t;
using Narrow = uint32_t;
enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Narrow = int32_t;
enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
using Unsigned = uint128_t;
using Narrow = uint64_t;
enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Float = float16_t;
using Wide = float;
enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = double;
using Narrow = float16_t;
enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Narrow = float;
enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
using Unsigned = uint8_t;
using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
};
template <>
struct TypeFromSize<8> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
};
template <>
struct TypeFromSize<16> {
using Unsigned = uint128_t;
};
}
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>; using FloatTag = SizeTag<0x200>;
using SpecialTag = SizeTag<0x300>;
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
-> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
}
using NonFloatTag = SizeTag<0x400>;
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}
template <typename T>
HWY_API constexpr bool IsFloat3264() {
return IsSameEither<RemoveCvRef<T>, float, double>();
}
template <typename T>
HWY_API constexpr bool IsFloat() {
return IsSame<RemoveCvRef<T>, float16_t>() || IsFloat3264<T>();
}
template <typename T>
HWY_API constexpr bool IsSigned() {
return static_cast<T>(0) > static_cast<T>(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
return true;
}
template <>
constexpr bool IsSigned<hwy::uint128_t>() {
return false;
}
template <>
constexpr bool IsSigned<hwy::K64V64>() {
return false;
}
template <>
constexpr bool IsSigned<hwy::K32V32>() {
return false;
}
template <typename T>
HWY_API constexpr bool IsUnsigned() {
return IsInteger<T>() && !IsSigned<T>();
}
template <typename T, bool = IsInteger<T>() && !IsIntegerLaneType<T>()>
struct MakeLaneTypeIfIntegerT {
using type = T;
};
template <typename T>
struct MakeLaneTypeIfIntegerT<T, true> {
using type = hwy::If<IsSigned<T>(), SignedFromSize<sizeof(T)>,
UnsignedFromSize<sizeof(T)>>;
};
template <typename T>
using MakeLaneTypeIfInteger = typename MakeLaneTypeIfIntegerT<T>::type;
template <typename T>
HWY_API constexpr T LimitsMax() {
static_assert(IsInteger<T>(), "Only for integer types");
using TU = UnsignedFromSize<sizeof(T)>;
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~TU(0)) >> 1)
: static_cast<TU>(~TU(0)));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
static_assert(IsInteger<T>(), "Only for integer types");
return IsSigned<T>() ? static_cast<T>(-1) - LimitsMax<T>()
: static_cast<T>(0);
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T LowestValue() {
return LimitsMin<T>();
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t LowestValue<bfloat16_t>() {
return bfloat16_t::FromBits(uint16_t{0xFF7Fu}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t LowestValue<float16_t>() {
return float16_t::FromBits(uint16_t{0xFBFFu}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float LowestValue<float>() {
return -3.402823466e+38F;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double LowestValue<double>() {
return -1.7976931348623158e+308;
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T HighestValue() {
return LimitsMax<T>();
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t HighestValue<bfloat16_t>() {
return bfloat16_t::FromBits(uint16_t{0x7F7Fu}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t HighestValue<float16_t>() {
return float16_t::FromBits(uint16_t{0x7BFFu}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float HighestValue<float>() {
return 3.402823466e+38F;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double HighestValue<double>() {
return 1.7976931348623158e+308;
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T Epsilon() {
return 1;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t Epsilon<bfloat16_t>() {
return bfloat16_t::FromBits(uint16_t{0x3C00u}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t Epsilon<float16_t>() {
return float16_t::FromBits(uint16_t{0x1400u}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float Epsilon<float>() {
return 1.192092896e-7f;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double Epsilon<double>() {
return 2.2204460492503131e-16;
}
template <typename T>
constexpr int MantissaBits() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
return 7;
}
template <>
constexpr int MantissaBits<float16_t>() {
return 10;
}
template <>
constexpr int MantissaBits<float>() {
return 23;
}
template <>
constexpr int MantissaBits<double>() {
return 52;
}
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) &
static_cast<MakeUnsigned<T>>(~SignMask<T>());
}
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
template <typename T>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bfloat16_t MantissaEnd<bfloat16_t>() {
return bfloat16_t::FromBits(uint16_t{0x4300u}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float16_t MantissaEnd<float16_t>() {
return float16_t::FromBits(uint16_t{0x6400u}); }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR float MantissaEnd<float>() {
return 8388608.0f; }
template <>
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR double MantissaEnd<double>() {
return 4503599627370496.0; }
template <typename T>
constexpr int ExponentBits() {
return 8 * sizeof(T) - 1 - MantissaBits<T>();
}
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
namespace detail {
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
NegativeInfOrLowestValue(hwy::FloatTag ) {
return BitCastScalar<T>(
static_cast<MakeUnsigned<T>>(SignMask<T>() | ExponentMask<T>()));
}
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
NegativeInfOrLowestValue(hwy::NonFloatTag ) {
return LowestValue<T>();
}
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
PositiveInfOrHighestValue(hwy::FloatTag ) {
return BitCastScalar<T>(ExponentMask<T>());
}
template <typename T>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T
PositiveInfOrHighestValue(hwy::NonFloatTag ) {
return HighestValue<T>();
}
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T NegativeInfOrLowestValue() {
return detail::NegativeInfOrLowestValue<T>(IsFloatTag<T>());
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR T PositiveInfOrHighestValue() {
return detail::PositiveInfOrHighestValue<T>(IsFloatTag<T>());
}
#if HWY_HAVE_SCALAR_F16_OPERATORS || HWY_HAVE_SCALAR_BF16_OPERATORS
#define HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T2) \
template < \
typename T1, \
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
typename RawResultT = decltype(DeclVal<T1>() op DeclVal<T2::Native>()), \
typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
HWY_IF_CASTABLE(RawResultT, ResultT)> \
static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
return static_cast<ResultT>(a op b.native); \
}
#define HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(op, assign_op, T2) \
template <typename T1, \
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T1>>() || \
hwy::IsFloat3264<RemoveCvRef<T1>>()>* = nullptr, \
typename ResultT = \
decltype(DeclVal<T1&>() assign_op DeclVal<T2::Native>())> \
static HWY_INLINE constexpr ResultT operator assign_op(T1& a, \
T2 b) noexcept { \
return (a assign_op b.native); \
}
#define HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(op, op_func, T1) \
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(op, op_func, T1) \
template < \
typename T2, \
hwy::EnableIf<hwy::IsInteger<RemoveCvRef<T2>>() || \
hwy::IsFloat3264<RemoveCvRef<T2>>()>* = nullptr, \
typename RawResultT = decltype(DeclVal<T1::Native>() op DeclVal<T2>()), \
typename ResultT = detail::NativeSpecialFloatToWrapper<RawResultT>, \
HWY_IF_CASTABLE(RawResultT, ResultT)> \
static HWY_INLINE constexpr ResultT op_func(T1 a, T2 b) noexcept { \
return static_cast<ResultT>(a.native op b); \
}
#if HWY_HAVE_SCALAR_F16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, float16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, float16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, float16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, float16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, float16_t)
#endif
#endif
#if HWY_HAVE_SCALAR_BF16_OPERATORS
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(+, operator+, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(-, operator-, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(*, operator*, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ARITH_OP(/, operator/, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(+, +=, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(-, -=, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(*, *=, bfloat16_t)
HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP(/, /=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(==, operator==, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(!=, operator!=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<, operator<, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=, operator<=, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>, operator>, bfloat16_t)
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(>=, operator>=, bfloat16_t)
#if HWY_HAVE_CXX20_THREE_WAY_COMPARE
HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP(<=>, operator<=>, bfloat16_t)
#endif
#endif
#undef HWY_RHS_SPECIAL_FLOAT_ARITH_OP
#undef HWY_RHS_SPECIAL_FLOAT_ASSIGN_OP
#undef HWY_SPECIAL_FLOAT_CMP_AGAINST_NON_SPECIAL_OP
#endif
HWY_API float F32FromF16Mem(const void* ptr) {
float16_t f16;
CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &f16);
return F32FromF16(f16);
}
HWY_API float F32FromBF16Mem(const void* ptr) {
bfloat16_t bf;
CopyBytes<2>(HWY_ASSUME_ALIGNED(ptr, 2), &bf);
return F32FromBF16(bf);
}
#if HWY_HAVE_SCALAR_F16_OPERATORS
#define HWY_BF16_TO_F16_CONSTEXPR HWY_BF16_CONSTEXPR
#else
#define HWY_BF16_TO_F16_CONSTEXPR HWY_F16_CONSTEXPR
#endif
namespace detail {
template <class TTo, class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TTo ConvertScalarToResult(
hwy::SizeTag<0> , TFrom in) {
return static_cast<TTo>(static_cast<TFrom>(in));
}
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
ConvertScalarToResult(hwy::FloatTag , float in) {
return F16FromF32(in);
}
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR TTo
ConvertScalarToResult(hwy::FloatTag , double in) {
return F16FromF64(in);
}
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
ConvertScalarToResult(hwy::SpecialTag , float in) {
return BF16FromF32(in);
}
template <class TTo>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR TTo
ConvertScalarToResult(hwy::SpecialTag , double in) {
return BF16FromF64(in);
}
template <class TFrom, HWY_IF_BF16(TFrom)>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_BF16_CONSTEXPR float
ConvertScalarSpecialFloatToF32(hwy::SpecialTag , TFrom in) {
return F32FromBF16(in);
}
template <class TFrom, HWY_IF_F16(TFrom)>
static HWY_INLINE HWY_MAYBE_UNUSED HWY_F16_CONSTEXPR float
ConvertScalarSpecialFloatToF32(hwy::SpecialTag , TFrom in) {
return F32FromF16(in);
}
template <class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr auto
ConvertScalarSpecialFloatToF32(hwy::FloatTag , TFrom in)
-> hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float> {
return static_cast<
hwy::If<hwy::IsSame<hwy::RemoveCvRef<TFrom>, double>(), double, float>>(
in);
}
template <class TFrom>
static HWY_INLINE HWY_MAYBE_UNUSED constexpr TFrom
ConvertScalarSpecialFloatToF32(hwy::SizeTag<0> , TFrom in) {
return static_cast<TFrom>(in);
}
}
template <typename TTo, typename TFrom>
HWY_API constexpr TTo ConvertScalarTo(TFrom in) {
return detail::ConvertScalarToResult<TTo>(
hwy::SizeTag<
(!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
hwy::IsSpecialFloat<TTo>())
? (hwy::IsSame<RemoveCvRef<TTo>, hwy::bfloat16_t>() ? 0x300
: 0x200)
: 0>(),
detail::ConvertScalarSpecialFloatToF32(
hwy::SizeTag<
(!hwy::IsSame<hwy::RemoveCvRef<TFrom>, hwy::RemoveCvRef<TTo>>() &&
(hwy::IsSpecialFloat<TFrom>() || hwy::IsSpecialFloat<TTo>()))
? (hwy::IsSpecialFloat<TFrom>() ? 0x300 : 0x200)
: 0>(),
static_cast<TFrom&&>(in)));
}
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
#if HWY_CXX_LANG >= 201703L
HWY_DASSERT(b != 0);
#endif
return (a + b - 1) / b;
}
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
return DivCeil(what, align) * align;
}
constexpr inline size_t RoundDownTo(size_t what, size_t align) {
return what - (what % align);
}
namespace detail {
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::UnsignedTag , T val,
int shift_amt) {
return static_cast<T>(val >> shift_amt);
}
template <class T>
static HWY_INLINE constexpr T ScalarShr(hwy::SignedTag , T val,
int shift_amt) {
using TU = MakeUnsigned<MakeLaneTypeIfInteger<T>>;
return static_cast<T>(
(val < 0) ? static_cast<TU>(
~(static_cast<TU>(~static_cast<TU>(val)) >> shift_amt))
: static_cast<TU>(static_cast<TU>(val) >> shift_amt));
}
}
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>)>
HWY_API constexpr RemoveCvRef<T> ScalarShr(T val, int shift_amt) {
using NonCvRefT = RemoveCvRef<T>;
return detail::ScalarShr(
hwy::SizeTag<((IsSigned<NonCvRefT>() &&
(LimitsMin<NonCvRefT>() >> (sizeof(T) * 8 - 1)) !=
static_cast<NonCvRefT>(-1))
? 0x100
: 0)>(),
static_cast<NonCvRefT>(val), shift_amt);
}
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
unsigned long index; _BitScanForward(&index, x);
return index;
#else
return static_cast<size_t>(__builtin_ctz(x));
#endif }
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
unsigned long index; _BitScanForward64(&index, x);
return index;
#else
uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
unsigned long index; if (lsb == 0) {
uint32_t msb = static_cast<uint32_t>(x >> 32u);
_BitScanForward(&index, msb);
return 32 + index;
} else {
_BitScanForward(&index, lsb);
return index;
}
#endif #else
return static_cast<size_t>(__builtin_ctzll(x));
#endif }
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
unsigned long index; _BitScanReverse(&index, x);
return 31 - index;
#else
return static_cast<size_t>(__builtin_clz(x));
#endif }
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
HWY_DASSERT(x != 0);
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
unsigned long index; _BitScanReverse64(&index, x);
return 63 - index;
#else
const uint32_t msb = static_cast<uint32_t>(x >> 32u);
unsigned long index; if (msb == 0) {
const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
_BitScanReverse(&index, lsb);
return 63 - index;
} else {
_BitScanReverse(&index, msb);
return 31 - index;
}
#endif #else
return static_cast<size_t>(__builtin_clzll(x));
#endif }
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
HWY_IF_T_SIZE_ONE_OF(RemoveCvRef<T>, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API size_t PopCount(T x) {
uint32_t u32_x = static_cast<uint32_t>(
static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
return static_cast<size_t>(__builtin_popcountl(u32_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
return static_cast<size_t>(_mm_popcnt_u32(u32_x));
#else
u32_x -= ((u32_x >> 1) & 0x55555555u);
u32_x = (((u32_x >> 2) & 0x33333333u) + (u32_x & 0x33333333u));
u32_x = (((u32_x >> 4) + u32_x) & 0x0F0F0F0Fu);
u32_x += (u32_x >> 8);
u32_x += (u32_x >> 16);
return static_cast<size_t>(u32_x & 0x3Fu);
#endif
}
template <class T, HWY_IF_INTEGER(RemoveCvRef<T>),
HWY_IF_T_SIZE(RemoveCvRef<T>, 8)>
HWY_API size_t PopCount(T x) {
uint64_t u64_x = static_cast<uint64_t>(
static_cast<UnsignedFromSize<sizeof(RemoveCvRef<T>)>>(x));
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG
return static_cast<size_t>(__builtin_popcountll(u64_x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
return _mm_popcnt_u64(u64_x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
return _mm_popcnt_u32(static_cast<uint32_t>(u64_x & 0xFFFFFFFFu)) +
_mm_popcnt_u32(static_cast<uint32_t>(u64_x >> 32));
#else
u64_x -= ((u64_x >> 1) & 0x5555555555555555ULL);
u64_x = (((u64_x >> 2) & 0x3333333333333333ULL) +
(u64_x & 0x3333333333333333ULL));
u64_x = (((u64_x >> 4) + u64_x) & 0x0F0F0F0F0F0F0F0FULL);
u64_x += (u64_x >> 8);
u64_x += (u64_x >> 16);
u64_x += (u64_x >> 32);
return static_cast<size_t>(u64_x & 0x7Fu);
#endif
}
template <typename TI>
constexpr size_t FloorLog2(TI x) {
return x == TI{1}
? 0
: static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}
template <typename TI>
constexpr size_t CeilLog2(TI x) {
return x == TI{1}
? 0
: static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}
template <typename T, typename T2, HWY_IF_FLOAT(T), HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
return t + static_cast<T>(increment);
}
template <typename T, typename T2, HWY_IF_SPECIAL_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 increment) {
return ConvertScalarTo<T>(ConvertScalarTo<float>(t) +
ConvertScalarTo<float>(increment));
}
template <typename T, typename T2, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
using TU = MakeUnsigned<T>;
return static_cast<T>(static_cast<TU>(
static_cast<unsigned long long>(static_cast<unsigned long long>(t) +
static_cast<unsigned long long>(n)) &
uint64_t{hwy::LimitsMax<TU>()}));
}
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_mul128)
#pragma intrinsic(_umul128)
#endif
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
__uint128_t product = (__uint128_t)a * (__uint128_t)b;
*upper = (uint64_t)(product >> 64);
return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
return _umul128(a, b, upper);
#else
constexpr uint64_t kLo32 = 0xFFFFFFFFU;
const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
const uint64_t hi_lo = (a >> 32) * (b & kLo32);
const uint64_t lo_hi = (a & kLo32) * (b >> 32);
const uint64_t hi_hi = (a >> 32) * (b >> 32);
const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
*upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
return (t << 32) | (lo_lo & kLo32);
#endif
}
HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
__int128_t product = (__int128_t)a * (__int128_t)b;
*upper = (int64_t)(product >> 64);
return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
return _mul128(a, b, upper);
#else
uint64_t unsigned_upper;
const int64_t lower = static_cast<int64_t>(Mul128(
static_cast<uint64_t>(a), static_cast<uint64_t>(b), &unsigned_upper));
*upper = static_cast<int64_t>(
unsigned_upper -
(static_cast<uint64_t>(ScalarShr(a, 63)) & static_cast<uint64_t>(b)) -
(static_cast<uint64_t>(ScalarShr(b, 63)) & static_cast<uint64_t>(a)));
return lower;
#endif
}
class Divisor {
public:
explicit Divisor(uint32_t divisor) : divisor_(divisor) {
if (divisor <= 1) return;
const uint32_t len =
static_cast<uint32_t>(31 - Num0BitsAboveMS1Bit_Nonzero32(divisor - 1));
const uint64_t u_hi = (2ULL << len) - divisor;
const uint32_t q = Truncate((u_hi << 32) / divisor);
mul_ = q + 1;
shift1_ = 1;
shift2_ = len;
}
uint32_t GetDivisor() const { return divisor_; }
uint32_t Divide(uint32_t n) const {
const uint64_t mul = mul_;
const uint32_t t = Truncate((mul * n) >> 32);
return (t + ((n - t) >> shift1_)) >> shift2_;
}
uint32_t Remainder(uint32_t n) const { return n - (Divide(n) * divisor_); }
private:
static uint32_t Truncate(uint64_t x) {
return static_cast<uint32_t>(x & 0xFFFFFFFFu);
}
uint32_t divisor_;
uint32_t mul_ = 1;
uint32_t shift1_ = 0;
uint32_t shift2_ = 0;
};
#ifndef HWY_HAVE_DIV128
#if (HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64) || \
(defined(__SIZEOF_INT128__) && !HWY_COMPILER_CLANGCL)
#define HWY_HAVE_DIV128 1
#else
#define HWY_HAVE_DIV128 0
#endif
#endif
#if HWY_HAVE_DIV128
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
#pragma intrinsic(_udiv128)
#pragma intrinsic(__umulh)
#endif
class Divisor64 {
public:
explicit Divisor64(uint64_t divisor) : divisor_(divisor) {
if (divisor <= 1) return;
const uint64_t len =
static_cast<uint64_t>(63 - Num0BitsAboveMS1Bit_Nonzero64(divisor - 1));
const uint64_t u_hi = (2ULL << len) - divisor;
const uint64_t q = Div128(u_hi, divisor);
mul_ = q + 1;
shift1_ = 1;
shift2_ = len;
}
uint64_t GetDivisor() const { return divisor_; }
uint64_t Divide(uint64_t n) const {
const uint64_t t = MulHigh(mul_, n);
return (t + ((n - t) >> shift1_)) >> shift2_;
}
uint64_t Remainder(uint64_t n) const { return n - (Divide(n) * divisor_); }
private:
uint64_t divisor_;
static uint64_t Div128(uint64_t hi, uint64_t div) {
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
unsigned __int64 remainder; return _udiv128(hi, uint64_t{0}, div, &remainder);
#else
using u128 = unsigned __int128;
const u128 hi128 = static_cast<u128>(hi) << 64;
return static_cast<uint64_t>(hi128 / static_cast<u128>(div));
#endif
}
static uint64_t MulHigh(uint64_t a, uint64_t b) {
#if HWY_COMPILER_MSVC >= 1920 && HWY_ARCH_X86_64
return __umulh(a, b);
#else
using u128 = unsigned __int128;
const u128 a128 = static_cast<u128>(a);
const u128 b128 = static_cast<u128>(b);
return static_cast<uint64_t>((a128 * b128) >> 64);
#endif
}
uint64_t mul_ = 1;
uint64_t shift1_ = 0;
uint64_t shift2_ = 0;
};
#else
class Divisor64 {
public:
explicit Divisor64(uint64_t divisor) : divisor_(divisor) {}
uint64_t GetDivisor() const { return divisor_; }
uint64_t Divide(uint64_t n) const { return n / divisor_; }
uint64_t Remainder(uint64_t n) const { return n % divisor_; }
private:
uint64_t divisor_;
};
#endif
namespace detail {
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T ScalarAbs(hwy::FloatTag ,
T val) {
using TU = MakeUnsigned<T>;
return BitCastScalar<T>(
static_cast<TU>(BitCastScalar<TU>(val) & (~SignMask<T>())));
}
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SpecialTag , T val) {
return ScalarAbs(hwy::FloatTag(), val);
}
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::SignedTag , T val) {
using TU = MakeUnsigned<T>;
return (val < T{0}) ? static_cast<T>(TU{0} - static_cast<TU>(val)) : val;
}
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T
ScalarAbs(hwy::UnsignedTag , T val) {
return val;
}
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarAbs(T val) {
using TVal = MakeLaneTypeIfInteger<
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
return detail::ScalarAbs(hwy::TypeTag<TVal>(), static_cast<TVal>(val));
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsNaN(T val) {
using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
using TU = MakeUnsigned<TF>;
return (BitCastScalar<TU>(ScalarAbs(val)) > ExponentMask<TF>());
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsInf(T val) {
using TF = detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>;
using TU = MakeUnsigned<TF>;
return static_cast<TU>(BitCastScalar<TU>(static_cast<TF>(val)) << 1) ==
static_cast<TU>(MaxExponentTimes2<TF>());
}
namespace detail {
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
hwy::FloatTag , T val) {
using TU = MakeUnsigned<T>;
return (BitCastScalar<TU>(hwy::ScalarAbs(val)) < ExponentMask<T>());
}
template <typename T>
static HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(
hwy::NonFloatTag , T ) {
return true;
}
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarIsFinite(T val) {
using TVal = MakeLaneTypeIfInteger<
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
return detail::ScalarIsFinite(hwy::IsFloatTag<TVal>(),
static_cast<TVal>(val));
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR RemoveCvRef<T> ScalarCopySign(T magn,
T sign) {
using TF = RemoveCvRef<detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
using TU = MakeUnsigned<TF>;
return BitCastScalar<TF>(static_cast<TU>(
(BitCastScalar<TU>(static_cast<TF>(magn)) & (~SignMask<TF>())) |
(BitCastScalar<TU>(static_cast<TF>(sign)) & SignMask<TF>())));
}
template <typename T>
HWY_API HWY_BITCASTSCALAR_CONSTEXPR bool ScalarSignBit(T val) {
using TVal = MakeLaneTypeIfInteger<
detail::NativeSpecialFloatToWrapper<RemoveCvRef<T>>>;
using TU = MakeUnsigned<TVal>;
return ((BitCastScalar<TU>(static_cast<TVal>(val)) & SignMask<TVal>()) != 0);
}
#if HWY_ARCH_PPC && (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \
!defined(_SOFT_FLOAT)
template <class T, HWY_IF_F32(T)>
HWY_API void PreventElision(T&& output) {
asm volatile("" : "+f"(output)::"memory");
}
template <class T, HWY_IF_F64(T)>
HWY_API void PreventElision(T&& output) {
asm volatile("" : "+d"(output)::"memory");
}
template <class T, HWY_IF_NOT_FLOAT3264(T)>
HWY_API void PreventElision(T&& output) {
asm volatile("" : "+r"(output)::"memory");
}
#else
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
static std::atomic<RemoveCvRef<T>> sink;
sink.store(output, std::memory_order_relaxed);
#else
asm volatile("" : "+r"(output) : : "memory");
#endif
}
#endif
}
#endif