#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
#define HWY_S390X_HAVE_Z14 1
#else
#define HWY_S390X_HAVE_Z14 0
#endif
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#if HWY_S390X_HAVE_Z14
#include <vecintrin.h>
#else
#include <altivec.h>
#endif
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
#include "hwy/ops/shared-inl.h"
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
(defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
(defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif
#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
// Primary template mapping a lane type T to its native vector representation.
// Specialized below for every supported lane type.
template <typename T>
struct Raw128;
// For each lane type: `type` is the native __vector type, `RawBoolVec` the
// matching __bool vector (used for masks), `RawT` the raw scalar lane type.
// The Aligned/Unaligned typedefs are GCC vector extensions with
// __may_alias__, used by Load/Store to express the required alignment.
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
template <> \
struct Raw128<LANE_TYPE> { \
using type = __vector RAW_VECT_LANE_TYPE; \
using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE; \
using RawT = RAW_VECT_LANE_TYPE; \
typedef LANE_TYPE AlignedRawVec \
__attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); \
typedef LANE_TYPE UnalignedRawVec __attribute__(( \
__vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
};
HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short) HWY_VSX_RAW128(uint16_t, unsigned short, short) HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long) HWY_VSX_RAW128(uint64_t, unsigned long long, long long) HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)
// Special floats are stored as their u16 bit patterns.
template <>
struct Raw128<bfloat16_t> : public Raw128<uint16_t> {};
template <>
struct Raw128<float16_t> : public Raw128<uint16_t> {};
#undef HWY_VSX_RAW128
}
// Vector of N lanes of type T, backed by a full 128-bit register. For N less
// than 16/sizeof(T), only the first N lanes are meaningful.
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
// Consumed by DFromV / TFromV to recover the lane type and count.
using PrivateT = T; static constexpr size_t kPrivateN = N;
// Compound assignment; forwards to the binary operators defined later in
// this header.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
return *this = (*this * other);
}
HWY_INLINE Vec128& operator/=(const Vec128 other) {
return *this = (*this / other);
}
HWY_INLINE Vec128& operator+=(const Vec128 other) {
return *this = (*this + other);
}
HWY_INLINE Vec128& operator-=(const Vec128 other) {
return *this = (*this - other);
}
HWY_INLINE Vec128& operator%=(const Vec128 other) {
return *this = (*this % other);
}
HWY_INLINE Vec128& operator&=(const Vec128 other) {
return *this = (*this & other);
}
HWY_INLINE Vec128& operator|=(const Vec128 other) {
return *this = (*this | other);
}
HWY_INLINE Vec128& operator^=(const Vec128 other) {
return *this = (*this ^ other);
}
// Native vector register; public so free functions can access it.
Raw raw;
};
// Aliases for vectors holding 8/4/2 bytes' worth of lanes.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;
template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;
template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
// Mask with one boolean lane per vector lane, stored as a native __bool
// vector (all bits of a lane set when true).
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
typename detail::Raw128<T>::RawBoolVec raw;
using PrivateT = T; static constexpr size_t kPrivateN = N; };
// Recover the Simd tag / lane type from a vector or mask type.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;
template <class V>
using TFromV = typename V::PrivateT;
// Returns a vector with all lanes zero.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T, HWY_MAX_LANES_D(D)> Zero(D ) {
return Vec128<T, HWY_MAX_LANES_D(D)>{
reinterpret_cast<typename detail::Raw128<T>::type>(vec_splats(0))};
}
// Vector type corresponding to tag D.
template <class D>
using VFromD = decltype(Zero(D()));
// Reinterprets the bits of v as lanes of TFromD<D>; free at runtime.
template <class D, typename FromT>
HWY_API VFromD<D> BitCast(D ,
Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
return VFromD<D>{
reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}
// Like BitCast, but the source vector size may differ from D's.
template <class D, typename FromV>
HWY_API VFromD<D> ResizeBitCast(D , FromV v) {
return VFromD<D>{
reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}
// Broadcasts t to all lanes.
template <class D, HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D , TFromD<D> t) {
using RawLane = typename detail::Raw128<TFromD<D>>::RawT;
return VFromD<D>{vec_splats(static_cast<RawLane>(t))};
}
// bf16/f16 overload: splat the u16 bit pattern, then cast back.
template <class D, HWY_IF_SPECIAL_FLOAT(TFromD<D>)>
HWY_API VFromD<D> Set(D d, TFromD<D> t) {
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, Set(du, BitCastScalar<TFromD<decltype(du)>>(t)));
}
// Returns a vector with unspecified contents. GCC falls back to Zero to
// avoid uninitialized-use diagnostics; other compilers suppress the warning.
template <class D>
HWY_API VFromD<D> Undefined(D d) {
#if HWY_COMPILER_GCC_ACTUAL
return Zero(d);
#elif HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
return VFromD<D>{__builtin_nondeterministic_value(Zero(d).raw)};
#else
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
typename detail::Raw128<TFromD<D>>::type raw;
return VFromD<decltype(d)>{raw};
HWY_DIAGNOSTICS(pop)
#endif
}
// Extracts lane 0 of `v` as a scalar.
template <typename T, size_t N>
HWY_API T GetLane(Vec128<T, N> v) {
  const auto lane0 = v.raw[0];
  return static_cast<T>(lane0);
}
// Builds a 128-bit vector from 16 explicit byte lanes (lane 0 first).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D , TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
TFromD<D> t11, TFromD<D> t12,
TFromD<D> t13, TFromD<D> t14,
TFromD<D> t15) {
const typename detail::Raw128<TFromD<D>>::type raw = {
t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15};
return VFromD<D>{raw};
}
// 8 x 16-bit integer lanes.
template <class D, HWY_IF_UI16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D , TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6,
TFromD<D> t7) {
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3,
t4, t5, t6, t7};
return VFromD<D>{raw};
}
// bf16/f16: route through the u16 overload via the scalar bit patterns.
template <class D, HWY_IF_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
TFromD<D> t5, TFromD<D> t6,
TFromD<D> t7) {
const RebindToUnsigned<decltype(d)> du;
return BitCast(
d, Dup128VecFromValues(
du, BitCastScalar<uint16_t>(t0), BitCastScalar<uint16_t>(t1),
BitCastScalar<uint16_t>(t2), BitCastScalar<uint16_t>(t3),
BitCastScalar<uint16_t>(t4), BitCastScalar<uint16_t>(t5),
BitCastScalar<uint16_t>(t6), BitCastScalar<uint16_t>(t7)));
}
// 4 x 32-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D , TFromD<D> t0, TFromD<D> t1,
TFromD<D> t2, TFromD<D> t3) {
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1, t2, t3};
return VFromD<D>{raw};
}
// 2 x 64-bit lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D , TFromD<D> t0, TFromD<D> t1) {
const typename detail::Raw128<TFromD<D>>::type raw = {t0, t1};
return VFromD<D>{raw};
}
// Bitwise AND. Operates on the unsigned representation so all lane types
// (including float) are supported. The Z14 path uses GCC vector operators;
// the PPC path uses the Altivec intrinsic.
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw});
#else
return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}
// Returns mask & ~not_mask (note vec_andc(x, y) computes x & ~y).
template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
const DFromV<decltype(mask)> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
return BitCast(
d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)});
}
// Bitwise OR.
template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw});
#else
return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}
// Bitwise XOR.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
const DFromV<decltype(a)> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw});
#else
return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)});
#endif
}
// Bitwise NOT, implemented as NOR(v, v).
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)});
}
namespace detail {
// Returns true if every lane of the raw vector is a compile-time constant,
// per __builtin_constant_p. Dispatched on lane size via SizeTag; callers use
// this to prefer constant-foldable code paths over single instructions.
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
hwy::SizeTag<1> , RawV v) {
return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
__builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
__builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
__builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) &&
__builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) &&
__builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) &&
__builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) &&
__builtin_constant_p(v[14]) && __builtin_constant_p(v[15]);
}
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
hwy::SizeTag<2> , RawV v) {
return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
__builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) &&
__builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) &&
__builtin_constant_p(v[6]) && __builtin_constant_p(v[7]);
}
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
hwy::SizeTag<4> , RawV v) {
return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) &&
__builtin_constant_p(v[2]) && __builtin_constant_p(v[3]);
}
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(
hwy::SizeTag<8> , RawV v) {
return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]);
}
// Entry point: selects the overload matching the lane size of RawV.
template <class RawV>
static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) {
return IsConstantRawAltivecVect(hwy::SizeTag<sizeof(decltype(v[0]))>(), v);
}
}
#if HWY_PPC_HAVE_10
namespace detail {

// Computes the three-input bitwise function selected by kTernLogOp (an
// 8-entry truth table, as consumed by the POWER10 xxeval instruction) over
// a, b and c. Works for any lane type via the unsigned representation.
template <uint8_t kTernLogOp, class V>
HWY_INLINE V TernaryLogic(V a, V b, V c) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const auto a_raw = BitCast(du, a).raw;
  const auto b_raw = BitCast(du, b).raw;
  const auto c_raw = BitCast(du, c).raw;
#if HWY_COMPILER_GCC_ACTUAL
  // GCC: emit xxeval directly via inline asm; "n" passes the truth table as
  // an immediate operand.
  typename detail::Raw128<TFromV<VU>>::type raw_ternlog_result;
  __asm__("xxeval %x0,%x1,%x2,%x3,%4"
          : "=wa"(raw_ternlog_result)
          : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw),
            "n"(static_cast<unsigned>(kTernLogOp))
          :);
#else
  const auto raw_ternlog_result =
      vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp);
#endif
  return BitCast(d, VU{raw_ternlog_result});
}

}  // namespace detail
// NOTE(fix): '#endif' must start its own line; it was fused onto the '}'.
#endif  // HWY_PPC_HAVE_10
// Returns x1 ^ x2 ^ x3. On POWER10 this lowers to a single xxeval via
// detail::TernaryLogic (truth table 0x69); otherwise two Xor ops.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If at least two operands are compile-time constants, plain Xor lets the
  // compiler constant-fold; a single xxeval would prevent that.
  if (static_cast<int>(detail::IsConstantRawAltivecVect(x1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(x3.raw)) >=
      2) {
    return Xor(x1, Xor(x2, x3));
  } else
// NOTE(fix): '#endif' must start its own line; it was fused after 'else'.
#endif
  {
    return detail::TernaryLogic<0x69>(x1, x2, x3);
  }
#else
  return Xor(x1, Xor(x2, x3));
#endif
}
// Returns o1 | o2 | o3. On POWER10 this lowers to a single xxeval via
// detail::TernaryLogic (truth table 0x7F); otherwise two Or ops.
template <typename T, size_t N>
HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // Prefer plain Or when most operands are constants so the compiler can
  // fold them (see Xor3 for rationale).
  if (static_cast<int>(detail::IsConstantRawAltivecVect(o1.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o2.raw)) +
          static_cast<int>(detail::IsConstantRawAltivecVect(o3.raw)) >=
      2) {
    return Or(o1, Or(o2, o3));
  } else
// NOTE(fix): '#endif' must start its own line; it was fused after 'else'.
#endif
  {
    return detail::TernaryLogic<0x7F>(o1, o2, o3);
  }
#else
  return Or(o1, Or(o2, o3));
#endif
}
// Returns o | (a1 & a2). On POWER10 this lowers to a single xxeval via
// detail::TernaryLogic (truth table 0x1F); otherwise Or + And.
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_PPC_HAVE_10
#if defined(__OPTIMIZE__)
  // If both AND operands are constants, let the compiler fold And(a1, a2).
  if (detail::IsConstantRawAltivecVect(a1.raw) &&
      detail::IsConstantRawAltivecVect(a2.raw)) {
    return Or(o, And(a1, a2));
  } else
// NOTE(fix): '#endif' must start its own line; it was fused after 'else'.
#endif
  {
    return detail::TernaryLogic<0x1F>(o, a1, a2);
  }
#else
  return Or(o, And(a1, a2));
#endif
}
// Per-bit select: for each bit, returns yes if the corresponding mask bit is
// set, else no (vec_sel argument order is no, yes, mask).
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
Vec128<T, N> no) {
const DFromV<decltype(yes)> d;
const RebindToUnsigned<decltype(d)> du;
return BitCast(
d, VFromD<decltype(du)>{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw,
BitCast(du, mask).raw)});
}
// Advertise a native bitwise select so generic_ops-inl does not synthesize
// one from And/AndNot/Or.
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif
template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
return IfVecThenElse(mask, yes, no);
}
// Operator sugar for the named bitwise functions above.
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(Vec128<T, N> a, Vec128<T, N> b) {
return And(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(Vec128<T, N> a, Vec128<T, N> b) {
return Or(a, b);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(Vec128<T, N> a, Vec128<T, N> b) {
return Xor(a, b);
}
// Signed integer negation: 0 - v (two's complement).
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
const DFromV<decltype(v)> d;
return Zero(d) - v;
}
// f32/f64 negation: flip the sign bit on Z14 (no vec_neg there), otherwise
// use the intrinsic.
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
return Xor(v, SignBit(DFromV<decltype(v)>()));
#else
return Vec128<T, N>{vec_neg(v.raw)};
#endif
}
// bf16/f16: flip the sign bit of the stored u16 pattern.
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
return Xor(v, SignBit(DFromV<decltype(v)>()));
}
// Signed integer absolute value via Max(v, -v). NOTE(review): for the
// minimum value (e.g. INT8_MIN), Neg wraps and the result equals the input.
template <class T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
return Max(v, Neg(v));
}
// Floating-point absolute value (clears the sign bit).
template <class T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
return Vec128<T, N>{vec_abs(v.raw)};
}
#if HWY_S390X_HAVE_Z14
// Returns a value with the magnitude of `magn` and the sign of `sign`,
// assembled with a bitwise select on the sign-bit mask.
template <class V>
HWY_API V CopySign(const V magn, const V sign) {
static_assert(IsFloat<TFromV<V>>(), "Only makes sense for floating-point");
const DFromV<decltype(magn)> d;
const auto msb = SignBit(d);
return BitwiseIfThenElse(msb, sign, magn);
}
#else
// f32: copy sign of `sign` onto `magn`. Older GCC/Clang need the builtins;
// note the vec_cpsgn fallback passes (sign, magn) in that order.
template <size_t N>
HWY_API Vec128<float, N> CopySign(Vec128<float, N> magn,
Vec128<float, N> sign) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
return Vec128<float, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp)
return Vec128<float, N>{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)};
#else
return Vec128<float, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}
// f64 variant of the above.
template <size_t N>
HWY_API Vec128<double, N> CopySign(Vec128<double, N> magn,
Vec128<double, N> sign) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200
return Vec128<double, N>{__builtin_vec_copysign(magn.raw, sign.raw)};
#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \
HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp)
return Vec128<double, N>{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)};
#else
return Vec128<double, N>{vec_cpsgn(sign.raw, magn.raw)};
#endif
}
#endif
// Same as CopySign; `abs` is expected to be non-negative, which allows other
// targets to use a cheaper OR, but here it simply forwards.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
return CopySign(abs, sign);
}
// Full 128-bit aligned load via the AlignedRawVec typedef (__may_alias__
// avoids strict-aliasing issues; diagnostics are silenced around it).
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> Load(D , const T* HWY_RESTRICT aligned) {
#if HWY_COMPILER_GCC
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
using ResultRaw = typename detail::Raw128<T>::type;
return Vec128<T>{reinterpret_cast<ResultRaw>(*p)};
#if HWY_COMPILER_GCC
HWY_DIAGNOSTICS(pop)
#endif
}
// Partial (<= 8 byte) load: copy the bytes into an integer, then splat.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;
BitsT bits;
const Repartition<BitsT, decltype(d)> d_bits;
CopyBytes<d.MaxBytes()>(p, &bits);
return BitCast(d, Set(d_bits, bits));
}
// Reinterprets a vector as a mask (expects all-0 or all-1 lanes).
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(Vec128<T, N> v) {
using Raw = typename detail::Raw128<T>::RawBoolVec;
return Mask128<T, N>{reinterpret_cast<Raw>(v.raw)};
}
// Mask type corresponding to tag D.
template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));
// Reinterprets a mask as a vector (true lanes become all-1 bits).
template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Mask128<T, N> v) {
return Vec128<T, N>{
reinterpret_cast<typename detail::Raw128<T>::type>(v.raw)};
}
template <class D>
HWY_API VFromD<D> VecFromMask(D , MFromD<D> v) {
return VFromD<D>{
reinterpret_cast<typename detail::Raw128<TFromD<D>>::type>(v.raw)};
}
// Per-lane select: mask ? yes : no (vec_sel argument order is no, yes, mask).
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
Vec128<T, N> no) {
const DFromV<decltype(yes)> d;
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, VFromD<decltype(du)>{vec_sel(
BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)});
}
// mask ? yes : 0
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}
// mask ? 0 : no
template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
// Logical operations on masks (overloads of the vector versions above).
// Mask NOT via NOR(m, m).
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(Mask128<T, N> m) {
return Mask128<T, N>{vec_nor(m.raw, m.raw)};
}
template <typename T, size_t N>
HWY_API Mask128<T, N> And(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
return Mask128<T, N>{a.raw & b.raw};
#else
return Mask128<T, N>{vec_and(a.raw, b.raw)};
#endif
}
// Returns ~a & b (vec_andc(x, y) computes x & ~y, hence swapped args).
template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(Mask128<T, N> a, Mask128<T, N> b) {
return Mask128<T, N>{vec_andc(b.raw, a.raw)};
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Or(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
return Mask128<T, N>{a.raw | b.raw};
#else
return Mask128<T, N>{vec_or(a.raw, b.raw)};
#endif
}
template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(Mask128<T, N> a, Mask128<T, N> b) {
#if HWY_S390X_HAVE_Z14
return Mask128<T, N>{a.raw ^ b.raw};
#else
return Mask128<T, N>{vec_xor(a.raw, b.raw)};
#endif
}
// Returns ~(a | b), i.e. true where neither mask is set.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(Mask128<T, N> a, Mask128<T, N> b) {
return Mask128<T, N>{vec_nor(a.raw, b.raw)};
}
// Shifts all lanes left by the same (runtime) amount. Uses the unsigned
// representation; Z14 uses vector operators, PPC the vec_sl intrinsic.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, const int bits) {
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
using TU = TFromD<decltype(du)>;
#if HWY_S390X_HAVE_Z14
return BitCast(d,
VFromD<decltype(du)>{BitCast(du, v).raw
<< Set(du, static_cast<TU>(bits)).raw});
#else
return BitCast(
d, VFromD<decltype(du)>{
vec_sl(BitCast(du, v).raw, Set(du, static_cast<TU>(bits)).raw)});
#endif
}
// Unsigned: logical right shift.
template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
#if HWY_S390X_HAVE_Z14
return Vec128<T, N>{v.raw >> vec_splats(static_cast<TU>(bits))};
#else
return Vec128<T, N>{vec_sr(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}
// Signed: arithmetic right shift (vec_sra; >> on signed vectors for Z14).
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, const int bits) {
#if HWY_S390X_HAVE_Z14
using TI = typename detail::Raw128<T>::RawT;
return Vec128<T, N>{v.raw >> vec_splats(static_cast<TI>(bits))};
#else
using TU = typename detail::Raw128<MakeUnsigned<T>>::RawT;
return Vec128<T, N>{vec_sra(v.raw, vec_splats(static_cast<TU>(bits)))};
#endif
}
// Compile-time-count shifts; bounds are checked via static_assert.
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
return ShiftLeftSame(v, kBits);
}
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
return ShiftRightSame(v, kBits);
}
// Replicates the sign bit to all bits of each lane (arithmetic shift by
// lane width - 1): all-ones for negative lanes, zero otherwise.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
return ShiftRightSame(v, static_cast<int>(sizeof(T) * 8 - 1));
}
// Byte-level table lookup: each byte of `from` selects a byte of `bytes`
// via vec_perm. Indices >= 16 wrap (both vec_perm inputs are `bytes`).
template <typename T, size_t N, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
Vec128<TI, NI> from) {
const Repartition<uint8_t, DFromV<decltype(from)>> du8_from;
return Vec128<TI, NI>{reinterpret_cast<typename detail::Raw128<TI>::type>(
vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))};
}
// Like TableLookupBytes, but indices with the MSB set (i.e. negative as
// int8) yield zero, matching x86 pshufb semantics.
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
const DFromV<VI> di;
Repartition<int8_t, decltype(di)> di8;
const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from)));
return AndNot(zeroOutMask, TableLookupBytes(bytes, from));
}
// Reverses the order of lanes in a full vector. GCC < 9 on Z14 lacks
// vec_reve, so emulate with byte shuffles per lane size.
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
HWY_COMPILER_GCC_ACTUAL < 900
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
HWY_IF_T_SIZE_D(D, 1)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
const Repartition<uint8_t, decltype(d)> du8;
return TableLookupBytes(
v, BitCast(d, Dup128VecFromValues(du8, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
5, 4, 3, 2, 1, 0)));
}
// 16-bit lanes: reverse lane order but keep bytes within each lane.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
HWY_IF_T_SIZE_D(D, 2)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
const Repartition<uint8_t, decltype(d)> du8;
return TableLookupBytes(
v, BitCast(d, Dup128VecFromValues(du8, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
4, 5, 2, 3, 0, 1)));
}
// 32-bit lanes.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
HWY_IF_T_SIZE_D(D, 4)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
const Repartition<uint8_t, decltype(d)> du8;
return TableLookupBytes(
v, BitCast(d, Dup128VecFromValues(du8, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5,
6, 7, 0, 1, 2, 3)));
}
// 64-bit lanes: swapping the two halves is an 8-byte rotate.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1),
HWY_IF_T_SIZE_D(D, 8)>
HWY_API Vec128<T> Reverse(D , Vec128<T> v) {
return Vec128<T>{vec_sld(v.raw, v.raw, 8)};
}
#else
// vec_reve reverses lanes directly for any lane size.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_GT_D(D, 1)>
HWY_API Vec128<T> Reverse(D , Vec128<T> v) {
return Vec128<T>{vec_reve(v.raw)};
}
#endif
// Swaps adjacent pairs of 32-bit lanes: [1,0,3,2] byte pattern via vec_perm.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(Vec128<T, N> v) {
static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3,
12, 13, 14, 15, 8, 9, 10, 11};
return Vec128<T, N>{vec_perm(v.raw, v.raw, kShuffle)};
}
namespace detail {
// Two-input shuffles used by generic swizzle ops. vec_perm indices 0..15
// select bytes of `a`, 16..31 select bytes of `b`; only the lanes a partial
// vector uses are listed, the rest are don't-care.
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo2301(Vec32<T> a, Vec32<T> b) {
const __vector unsigned char kShuffle16 = {1, 0, 19, 18};
return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle16)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo2301(Vec64<T> a, Vec64<T> b) {
const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21};
return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo2301(Vec128<T> a, Vec128<T> b) {
const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3,
28, 29, 30, 31, 24, 25, 26, 27};
return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo1230(Vec32<T> a, Vec32<T> b) {
const __vector unsigned char kShuffle = {0, 3, 18, 17};
return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo1230(Vec64<T> a, Vec64<T> b) {
const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19};
return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo1230(Vec128<T> a, Vec128<T> b) {
const __vector unsigned char kShuffle = {0, 1, 2, 3, 12, 13, 14, 15,
24, 25, 26, 27, 20, 21, 22, 23};
return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ShuffleTwo3012(Vec32<T> a, Vec32<T> b) {
const __vector unsigned char kShuffle = {2, 1, 16, 19};
return Vec32<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ShuffleTwo3012(Vec64<T> a, Vec64<T> b) {
const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23};
return Vec64<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ShuffleTwo3012(Vec128<T> a, Vec128<T> b) {
const __vector unsigned char kShuffle = {8, 9, 10, 11, 4, 5, 6, 7,
16, 17, 18, 19, 28, 29, 30, 31};
return Vec128<T>{vec_perm(a.raw, b.raw, kShuffle)};
}
}
// Swaps the 64-bit halves of a vector of 32-bit lanes.
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
const Full128<T> d;
const Full128<uint64_t> du64;
return BitCast(d, Reverse(du64, BitCast(du64, v)));
}
// Swaps the two 64-bit lanes.
template <class T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
return Reverse(Full128<T>(), v);
}
// Rotates 32-bit lanes right by one position ([0,3,2,1] -> lane order).
// vec_sld shifts by bytes from the left, hence the endian-dependent count.
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
#if HWY_IS_LITTLE_ENDIAN
return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#else
return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#endif
}
// Rotates 32-bit lanes left by one position.
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
#if HWY_IS_LITTLE_ENDIAN
return Vec128<T>{vec_sld(v.raw, v.raw, 4)};
#else
return Vec128<T>{vec_sld(v.raw, v.raw, 12)};
#endif
}
// Reverses all four 32-bit lanes.
template <class T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
return Reverse(Full128<T>(), v);
}
// Reinterprets a mask for lanes of equal size but different type.
template <class DTo, typename TFrom, size_t NFrom>
HWY_API MFromD<DTo> RebindMask(DTo , Mask128<TFrom, NFrom> m) {
static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
return MFromD<DTo>{m.raw};
}
// True for lanes where all bits of `bit` are set in `v` (integer only).
template <typename T, size_t N>
HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
return (v & bit) == bit;
}
// Lane-wise equality.
template <typename T, size_t N>
HWY_API Mask128<T, N> operator==(Vec128<T, N> a, Vec128<T, N> b) {
return Mask128<T, N>{vec_cmpeq(a.raw, b.raw)};
}
// Lane-wise inequality. POWER9 adds vec_cmpne for 8/16/32-bit lanes;
// otherwise (and always for 64-bit and float lanes) use Not(a == b).
template <size_t N>
HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
Vec128<uint8_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<uint8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
Vec128<uint16_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<uint16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
Vec128<uint32_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<uint32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
Vec128<uint64_t, N> b) {
return Not(a == b);
}
template <size_t N>
HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
Vec128<int8_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<int8_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
Vec128<int16_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<int16_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
#if HWY_PPC_HAVE_9
return Mask128<int32_t, N>{vec_cmpne(a.raw, b.raw)};
#else
return Not(a == b);
#endif
}
template <size_t N>
HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
Vec128<int64_t, N> b) {
return Not(a == b);
}
template <size_t N>
HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
return Not(a == b);
}
template <size_t N>
HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
Vec128<double, N> b) {
return Not(a == b);
}
// Lane-wise ordering comparisons. > and (for floats) >= are native; the
// remaining operators are derived by swapping/negating.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
return Mask128<T, N>{vec_cmpgt(a.raw, b.raw)};
}
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
return Mask128<T, N>{vec_cmpge(a.raw, b.raw)};
}
// Integers: a >= b  <=>  !(b > a).
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
return Not(b > a);
}
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
return b > a;
}
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
return b >= a;
}
// Full 128-bit unaligned load via the UnalignedRawVec typedef (alignment
// requirement reduced to alignof(T), __may_alias__ for aliasing safety).
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> LoadU(D , const T* HWY_RESTRICT p) {
using LoadRaw = typename detail::Raw128<T>::UnalignedRawVec;
const LoadRaw* HWY_RESTRICT praw = reinterpret_cast<const LoadRaw*>(p);
using ResultRaw = typename detail::Raw128<T>::type;
return Vec128<T>{reinterpret_cast<ResultRaw>(*praw)};
}
// Partial vectors: Load already copies byte-wise, so no alignment needed.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
// For <= 128-bit vectors, LoadDup128 is simply an unaligned load.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadDup128(D d, const T* HWY_RESTRICT p) {
return LoadU(d, p);
}
// Native bounded loads where the ISA supports length-limited vector loads
// (POWER9 vec_xl_len / Z14 vec_load_len).
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_LOAD_N
#undef HWY_NATIVE_LOAD_N
#else
#define HWY_NATIVE_LOAD_N
#endif
// Loads min(max_lanes_to_load, Lanes(d)) lanes from p; remaining lanes are
// zero. Does not fault on the bytes past the requested length.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadN(D d, const T* HWY_RESTRICT p,
size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
// Fast paths when the count is a compile-time constant.
if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
return Zero(d);
}
if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
return LoadU(d, p);
}
#endif
const size_t num_of_bytes_to_load =
HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
// vec_load_len takes the index of the last byte, hence len - 1; guard the
// len == 0 case separately to avoid underflow.
return (num_of_bytes_to_load > 0)
? BitCast(d, VFromD<decltype(du8)>{vec_load_len(
const_cast<unsigned char*>(
reinterpret_cast<const unsigned char*>(p)),
static_cast<unsigned>(num_of_bytes_to_load - 1))})
: Zero(d);
#else
return BitCast(
d,
VFromD<decltype(du8)>{vec_xl_len(
const_cast<unsigned char*>(reinterpret_cast<const unsigned char*>(p)),
num_of_bytes_to_load)});
#endif
}
// As LoadN, but lanes past the requested count come from `no`.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
size_t max_lanes_to_load) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) {
return no;
}
if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) &&
max_lanes_to_load >= HWY_MAX_LANES_D(D)) {
return LoadU(d, p);
}
#endif
return IfThenElse(FirstN(d, max_lanes_to_load),
LoadN(d, p, max_lanes_to_load), no);
}
#endif
namespace detail {
// Returns a vector with lane i equal to i, for each lane type.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU8Iota0});
}
// 16-bit integer lanes (excludes bf16/f16, which have no Iota here).
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU16Iota0});
}
template <class D, HWY_IF_UI32_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU32Iota0});
}
template <class D, HWY_IF_UI64_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
constexpr __vector unsigned long long kU64Iota0 = {0, 1};
return BitCast(d, VFromD<RebindToUnsigned<D>>{kU64Iota0});
}
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> Iota0(D ) {
constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f};
return VFromD<D>{kF32Iota0};
}
template <class D, HWY_IF_F64_D(D)>
HWY_INLINE VFromD<D> Iota0(D ) {
constexpr __vector double kF64Iota0 = {0.0, 1.0};
return VFromD<D>{kF64Iota0};
}
}
// Returns a vector whose lane i equals `first` + i, after converting `first`
// to the lane type.
template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  const VFromD<D> base = detail::Iota0(d);
  const VFromD<D> offset = Set(d, static_cast<TFromD<D>>(first));
  return base + offset;
}
// Mask with the first `num` lanes set. The comparison runs in the unsigned
// domain so it is valid for every lane type, then is rebound to d.
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  const auto lane_index = Iota(du, 0);
  const auto limit = Set(du, static_cast<TU>(num));
  return RebindMask(d, lane_index < limit);
}
// Loads the full vector, then zeroes the lanes not selected by `m`.
// Note: reads all 16 bytes regardless of the mask.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d, const T* HWY_RESTRICT p) {
  const VFromD<D> loaded = LoadU(d, p);
  return IfThenElseZero(m, loaded);
}
// Loads the full vector, then replaces the lanes not selected by `m` with the
// corresponding lanes of `v`. Reads all bytes regardless of the mask.
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const T* HWY_RESTRICT p) {
  const VFromD<D> loaded = LoadU(d, p);
  return IfThenElse(m, loaded, v);
}
// Stores a full 128-bit vector to 16-byte-aligned memory.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void Store(Vec128<T> v, D , T* HWY_RESTRICT aligned) {
#if HWY_COMPILER_GCC
  // The may_alias attribute on AlignedRawVec triggers -Wignored-attributes
  // in some contexts; silence it around the cast.
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4649, ignored "-Wignored-attributes")
#endif
  // AlignedRawVec is a may_alias vector type, so this store does not violate
  // strict aliasing even though `aligned` points to scalar T.
  using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
  *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
#if HWY_COMPILER_GCC
  HWY_DIAGNOSTICS(pop)
#endif
}
// Stores a full 128-bit vector to memory with no alignment requirement.
// UnalignedRawVec is aligned to alignof(T) and marked may_alias, so the
// compiler emits an unaligned, aliasing-safe store.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreU(Vec128<T> v, D , T* HWY_RESTRICT p) {
  using StoreRaw = typename detail::Raw128<T>::UnalignedRawVec;
  *reinterpret_cast<StoreRaw*>(p) = reinterpret_cast<StoreRaw>(v.raw);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
using BitsT = UnsignedFromSize<d.MaxBytes()>;
const Repartition<BitsT, decltype(d)> d_bits;
const BitsT bits = GetLane(BitCast(d_bits, v));
CopyBytes<d.MaxBytes()>(&bits, p);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
Store(v, d, p);
}
// Native length-limited store: POWER9 (64-bit) has vec_xst_len, z14 has
// vec_store_len.
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif
// Stores min(max_lanes_to_store, Lanes(d)) lanes of v to p; never writes past
// that many lanes.
template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
  // Compile-time fast paths for constant counts (nothing / everything).
  if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) {
    return;
  }
  if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) &&
      max_lanes_to_store >= HWY_MAX_LANES_D(D)) {
    StoreU(v, d, p);
    return;
  }
#endif
  const size_t num_of_bytes_to_store =
      HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD<D>);
  const Repartition<uint8_t, decltype(d)> du8;
#if HWY_S390X_HAVE_Z14
  // vec_store_len takes the index of the LAST byte to store (i.e. len - 1),
  // so a zero-byte store must be skipped explicitly.
  if (num_of_bytes_to_store > 0) {
    vec_store_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
                  static_cast<unsigned>(num_of_bytes_to_store - 1));
  }
#else
  // vec_xst_len takes a byte count and handles zero.
  vec_xst_len(BitCast(du8, v).raw, reinterpret_cast<unsigned char*>(p),
              num_of_bytes_to_store);
#endif
}
#endif
// Stores only the lanes selected by `m`; the other lanes at `p` keep their
// previous values. Implemented as a non-atomic read-modify-write of the full
// vector, so `p` must be readable/writable for all lanes.
template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  const VFromD<D> prev = LoadU(d, p);
  const VFromD<D> blended = IfThenElse(RebindMask(d, m), v, prev);
  StoreU(blended, d, p);
}
namespace detail {
// For integer lane types, yields the corresponding unsigned tag (so that
// arithmetic below is performed in the unsigned domain, avoiding signed
// overflow UB in the scalar fallback); floats keep their own tag.
template <class D>
using RebindToUnsignedIfNotFloat =
    hwy::If<(!hwy::IsFloat<TFromD<D>>() && !hwy::IsSpecialFloat<TFromD<D>>()),
            RebindToUnsigned<D>, D>;
}
// Lane-wise addition. Integer math is done in the unsigned type (wrap-around
// defined); the result is cast back to T.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
#if HWY_S390X_HAVE_Z14
  // z14 toolchains: use the native vector operator rather than vec_add.
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw +
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_add(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
// Lane-wise subtraction; same unsigned-domain strategy as operator+.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw -
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_sub(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
// Sums of each group of 8 consecutive u8 lanes, widened to u64: first sum
// quads (u8 -> u32), then pairs of those (u32 -> u64).
template <class V, HWY_IF_U8(TFromV<V>)>
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
  const auto quad_sums = SumsOf4(v);
  return SumsOf2(quad_sums);
}
// Sums of each group of 8 consecutive i8 lanes, widened to i64.
template <class V, HWY_IF_I8(TFromV<V>)>
HWY_API VFromD<RepartitionToWideX3<DFromV<V>>> SumsOf8(V v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> di8;
  const RebindToUnsigned<decltype(di8)> du8;
  const RepartitionToWideX3<decltype(di8)> di64;
  // Flip the sign bit to map i8 x onto u8 (x + 128), reuse the unsigned
  // SumsOf8, then subtract the accumulated bias: 8 lanes * 128 = 1024.
  return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) +
         Set(di64, int64_t{-1024});
#else
  return SumsOf2(SumsOf4(v));
#endif
}
#if HWY_S390X_HAVE_Z14
// z14 has no vec_adds; emulate saturating addition.
// Unsigned: a + min(b, ~a) cannot exceed the type maximum because
// min(b, ~a) <= ~a, and a + ~a == MAX.
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Add(a, Min(b, Not(a)));
}
// Signed: overflow occurred iff a and b have the same sign but the sum's
// sign differs; in that case substitute MAX (a >= 0) or MIN (a < 0).
template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  // Sign bit set iff overflow: (a ^ sum) & ~(a ^ b).
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  // MAX for non-negative a, MIN for negative a (sign-broadcast XOR MAX).
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
#else
// PPC has native saturating add for 8/16/32-bit lanes.
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif
#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_adds(a.raw, b.raw)};
}
#endif
#if HWY_PPC_HAVE_10
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif
// Saturating i64 addition using POWER10 xxeval via TernaryLogic.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  // 0x42 encodes (a ^ sum) & ~(a ^ b): sign set iff signed overflow.
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum));
  // MAX if a >= 0, MIN if a < 0.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}
#endif
#if HWY_S390X_HAVE_Z14
// z14 has no vec_subs; emulate saturating subtraction.
// Unsigned: a - min(a, b) clamps at zero.
template <typename T, size_t N, HWY_IF_UNSIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Sub(a, Min(a, b));
}
// Signed: overflow iff a and b differ in sign and the difference's sign
// differs from a's; substitute MAX/MIN depending on a's sign.
template <typename T, size_t N, HWY_IF_SIGNED(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  // Sign bit set iff overflow: (a ^ b) & (a ^ diff).
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax<T>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
#else
// PPC native saturating subtract for 8/16/32-bit lanes.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_subs(a.raw, b.raw)};
}
#endif
#if HWY_PPC_HAVE_10
// Saturating i64 subtraction using POWER10 xxeval via TernaryLogic.
template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  // 0x18 encodes (a ^ b) & (a ^ diff): sign set iff signed overflow.
  const auto overflow_mask =
      BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff));
  // MAX if a >= 0, MIN if a < 0.
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
#endif
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif
#if HWY_S390X_HAVE_Z14
// z14 vec_avg also supports 64-bit lanes, so advertise that too and place no
// size constraint on the template.
#ifdef HWY_NATIVE_AVERAGE_ROUND_UI64
#undef HWY_NATIVE_AVERAGE_ROUND_UI64
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI64
#endif
#define HWY_PPC_IF_AVERAGE_ROUND_T(T) void* = nullptr
#else
// PPC vec_avg handles 8/16/32-bit lanes only.
#define HWY_PPC_IF_AVERAGE_ROUND_T(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#endif
// Rounded average: (a + b + 1) / 2 without intermediate overflow.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_PPC_IF_AVERAGE_ROUND_T(T)>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_avg(a.raw, b.raw)};
}
#undef HWY_PPC_IF_AVERAGE_ROUND_T
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif
#ifdef HWY_NATIVE_MUL_64
#undef HWY_NATIVE_MUL_64
#else
#define HWY_NATIVE_MUL_64
#endif
// Lane-wise multiplication (low half of the product). Integer math runs in
// the unsigned type so wrap-around is well-defined.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> operator*(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const detail::RebindToUnsignedIfNotFloat<decltype(d)> d_arith;
#if HWY_S390X_HAVE_Z14
  return BitCast(d, VFromD<decltype(d_arith)>{BitCast(d_arith, a).raw *
                                              BitCast(d_arith, b).raw});
#else
  return BitCast(d, VFromD<decltype(d_arith)>{vec_mul(
                        BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)});
#endif
}
// Select which lane sizes use the native vec_mulh intrinsic for MulHigh:
// z14 supports 8/16/32-bit, POWER10 supports 32/64-bit, older targets none
// (the always-false EnableIf disables that overload entirely).
#if HWY_S390X_HAVE_Z14
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
#elif HWY_PPC_HAVE_10
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))
#else
#define HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T) \
  hwy::EnableIf<!hwy::IsSame<T, T>()>* = nullptr
#define HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T) \
  HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))
#endif
#if HWY_S390X_HAVE_Z14 || HWY_PPC_HAVE_10
// MulHigh via the native high-half multiply.
template <typename T, size_t N, HWY_PPC_IF_MULHIGH_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_mulh(a.raw, b.raw)};
}
#endif
// Single-lane MulHigh fallback: take the widened product of lane 0 and
// extract its upper half.
template <typename T, HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, 1> MulHigh(Vec128<T, 1> a, Vec128<T, 1> b) {
  const auto p_even = MulEven(a, b);
#if HWY_IS_LITTLE_ENDIAN
  // On LE the high half is the upper T of the widened lane; rotate it down
  // to byte 0 via vec_sld.
  const auto p_even_full = ResizeBitCast(Full128<T>(), p_even);
  return Vec128<T, 1>{
      vec_sld(p_even_full.raw, p_even_full.raw, 16 - sizeof(T))};
#else
  // On BE the high half already occupies the first bytes of the product.
  const DFromV<decltype(a)> d;
  return ResizeBitCast(d, p_even);
#endif
}
// Multi-lane MulHigh fallback: widen via MulEven/MulOdd, then pick the
// half-lane holding each product's upper half (odd half on LE, even on BE).
template <typename T, size_t N,
          HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH(T),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T), HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<T, N> MulHigh(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const auto p_even = BitCast(d, MulEven(a, b));
  const auto p_odd = BitCast(d, MulOdd(a, b));
#if HWY_IS_LITTLE_ENDIAN
  return InterleaveOdd(d, p_even, p_odd);
#else
  return InterleaveEven(d, p_even, p_odd);
#endif
}
#if !HWY_PPC_HAVE_10
// 64-bit MulHigh without native support: scalar 128-bit multiply per lane.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
  T p_hi;
  Mul128(GetLane(a), GetLane(b), &p_hi);
  return Set(Full64<T>(), p_hi);
}
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const Half<decltype(d)> dh;
  return Combine(d, MulHigh(UpperHalf(dh, a), UpperHalf(dh, b)),
                 MulHigh(LowerHalf(dh, a), LowerHalf(dh, b)));
}
#endif
// Helper macros are only needed for the MulHigh overload set above.
#undef HWY_PPC_IF_MULHIGH_USING_VEC_MULH
#undef HWY_PPC_IF_MULHIGH_8_16_32_NOT_USING_VEC_MULH
// Full (widened) products of the even-indexed lanes.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulEven(Vec128<T, N> a,
                                                 Vec128<T, N> b) {
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mule(a.raw, b.raw)};
}
// Full (widened) products of the odd-indexed lanes.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<MakeWide<T>, (N + 1) / 2> MulOdd(Vec128<T, N> a,
                                                Vec128<T, N> b) {
  return Vec128<MakeWide<T>, (N + 1) / 2>{vec_mulo(a.raw, b.raw)};
}
// Both PPC and s390x have native per-lane variable rotate for all lane sizes.
#ifdef HWY_NATIVE_ROL_ROR_8
#undef HWY_NATIVE_ROL_ROR_8
#else
#define HWY_NATIVE_ROL_ROR_8
#endif
#ifdef HWY_NATIVE_ROL_ROR_16
#undef HWY_NATIVE_ROL_ROR_16
#else
#define HWY_NATIVE_ROL_ROR_16
#endif
#ifdef HWY_NATIVE_ROL_ROR_32_64
#undef HWY_NATIVE_ROL_ROR_32_64
#else
#define HWY_NATIVE_ROL_ROR_32_64
#endif
// Rotate each lane of `a` left by the corresponding lane of `b`.
// vec_rl interprets the count modulo the lane size in bits.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> Rol(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, VFromD<decltype(du)>{vec_rl(BitCast(du, a).raw, BitCast(du, b).raw)});
}
// Rotate each lane of `a` right by the corresponding lane of `b`.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> Ror(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  // Rotating right by b equals rotating left by -b, since the rotate count
  // is taken modulo the lane size; negate in the signed domain.
  const auto negated_count = Neg(BitCast(di, b));
  return Rol(a, BitCast(d, negated_count));
}
// Rotate every lane right by the compile-time constant kBits.
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  // Right rotate by k == left rotate by (lane bits - k); kBits == 0 must be
  // special-cased because a left rotate by kSizeInBits is not representable
  // in T's value range for every lane size.
  return (kBits == 0)
             ? v
             : Rol(v, Set(d, static_cast<T>(static_cast<int>(kSizeInBits) -
                                            kBits)));
}
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
#undef HWY_NATIVE_ROL_ROR_SAME_8
#else
#define HWY_NATIVE_ROL_ROR_SAME_8
#endif
#ifdef HWY_NATIVE_ROL_ROR_SAME_16
#undef HWY_NATIVE_ROL_ROR_SAME_16
#else
#define HWY_NATIVE_ROL_ROR_SAME_16
#endif
#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
#else
#define HWY_NATIVE_ROL_ROR_SAME_32_64
#endif
// Rotate all lanes left by the same (runtime) count; vec_rl reduces the
// count modulo the lane size, so any `bits` value is safe.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateLeftSame(Vec128<T, N> v, int bits) {
  const DFromV<decltype(v)> d;
  return Rol(v, Set(d, static_cast<T>(static_cast<unsigned>(bits))));
}
// Rotate all lanes right by the same (runtime) count: rotate left by the
// two's-complement negation, which vec_rl reduces modulo the lane size.
template <typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRightSame(Vec128<T, N> v, int bits) {
  const DFromV<decltype(v)> d;
  return Rol(v, Set(d, static_cast<T>(0u - static_cast<unsigned>(bits))));
}
// Per-lane select: `yes` where the lane of v is negative, else `no`.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(v)> d;
#if HWY_PPC_HAVE_10
  // POWER10 vec_blendv selects on the sign bit directly.
  return BitCast(
      d, VFromD<decltype(du)>{vec_blendv(
             BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)});
#else
  // Otherwise broadcast the sign bit to a full-lane mask and blend.
  const RebindToSigned<decltype(d)> di;
  return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no);
#endif
}
#if HWY_PPC_HAVE_10
// With vec_blendv available, these specializations are a single blend.
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#else
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#endif
#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#else
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#endif
// `yes` where v is negative, zero elsewhere.
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenElseZero(V v, V yes) {
  const DFromV<decltype(v)> d;
  return IfNegativeThenElse(v, yes, Zero(d));
}
// Zero where v is negative, `no` elsewhere.
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenZeroElse(V v, V no) {
  const DFromV<decltype(v)> d;
  return IfNegativeThenElse(v, Zero(d), no);
}
#endif
// Absolute difference for floating-point lanes: |a - b|.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AbsDiff(Vec128<T, N> a, Vec128<T, N> b) {
  const Vec128<T, N> diff = a - b;
  return Abs(diff);
}
// Fused multiply-add: mul * x + add.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return Vec128<T, N>{vec_madd(mul.raw, x.raw, add.raw)};
}
// Fused negated multiply-add: add - mul * x.
// vec_nmsub(a, b, c) computes -(a * b - c) == c - a * b.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return Vec128<T, N>{vec_nmsub(mul.raw, x.raw, add.raw)};
}
// Fused multiply-subtract: mul * x - sub.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return Vec128<T, N>{vec_msub(mul.raw, x.raw, sub.raw)};
}
// Fused negated multiply-subtract: -(mul * x) - sub.
// vec_nmadd(a, b, c) computes -(a * b + c).
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Vec128<T, N>{vec_nmadd(mul.raw, x.raw, sub.raw)};
}
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif
// Floating-point division.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
#if HWY_S390X_HAVE_Z14
  // z14 toolchains: use the native vector operator.
  return Vec128<T, N>{a.raw / b.raw};
#else
  return Vec128<T, N>{vec_div(a.raw, b.raw)};
#endif
}
// Approximate reciprocal 1/v. z14 lacks a reciprocal-estimate instruction,
// so it falls back to an exact division (accurate, but slower).
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocal(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  return Set(d, T(1.0)) / v;
#else
  return Vec128<T, N>{vec_re(v.raw)};
#endif
}
#if HWY_S390X_HAVE_Z14
// z14 lacks an rsqrt-estimate instruction; use the classic bit-trick
// ("fast inverse square root") initial guess with magic constant 0x5F3759DF,
// refined by one Newton-Raphson step: guess * (1.5 - 0.5 * v * guess^2).
template <size_t N>
HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto half = v * Set(d, 0.5f);
  // Initial estimate from the float's bit pattern.
  const auto guess = BitCast(
      d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v)));
  // One NR iteration: NegMulAdd(a, x, c) == c - a * x.
  return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f));
}
#else
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif
// PPC has a native rsqrt estimate for f32 and f64.
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> ApproximateReciprocalSqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_rsqrte(v.raw)};
}
#endif
// Exact (correctly rounded) square root.
template <class T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
  return Vec128<T, N>{vec_sqrt(v.raw)};
}
#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_GET_BIASED_EXPONENT
#undef HWY_NATIVE_GET_BIASED_EXPONENT
#else
#define HWY_NATIVE_GET_BIASED_EXPONENT
#endif
// Extracts each lane's IEEE-754 biased exponent field (POWER9 xvxexp*).
template <class V, HWY_IF_FLOAT3264_V(V)>
HWY_API VFromD<RebindToUnsigned<DFromV<V>>> GetBiasedExponent(V v) {
  return VFromD<RebindToUnsigned<DFromV<V>>>{vec_extract_exp(v.raw)};
}
#endif
// Lane-wise minimum.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Min(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_min(a.raw, b.raw)};
}
// Lane-wise maximum.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> Max(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_max(a.raw, b.raw)};
}
#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif
// POWER9 has native unsigned absolute difference for 8/16/32-bit lanes.
template <class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API V AbsDiff(const V a, const V b) {
  return V{vec_absd(a.raw, b.raw)};
}
// u64 and all signed types: max - min avoids overflow of a plain |a - b|.
template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}
template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V AbsDiff(const V a, const V b) {
  return Sub(Max(a, b), Min(a, b));
}
#endif
#if HWY_PPC_HAVE_10
// POWER10 (ISA 3.1) adds vector integer divide/modulo instructions. Inline
// asm is used to emit them directly (vdiv*/vmod*); 8/16-bit lanes are
// handled by widening to the next lane size, dividing, and narrowing back.
#ifdef HWY_NATIVE_INT_DIV
#undef HWY_NATIVE_INT_DIV
#else
#define HWY_NATIVE_INT_DIV
#endif
// 32-bit signed division (vdivsw).
template <size_t N>
HWY_API Vec128<int32_t, N> operator/(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  __vector signed int raw_result;
  __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};
}
// 32-bit unsigned division (vdivuw).
template <size_t N>
HWY_API Vec128<uint32_t, N> operator/(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  __vector unsigned int raw_result;
  __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};
}
// 64-bit signed division (vdivsd).
template <size_t N>
HWY_API Vec128<int64_t, N> operator/(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  __vector signed long long raw_result;
  __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};
}
// 64-bit unsigned division (vdivud).
template <size_t N>
HWY_API Vec128<uint64_t, N> operator/(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  __vector unsigned long long raw_result;
  __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
}
// 8/16-bit full vectors: promote both halves to the wider type, divide,
// then narrow the two results back into one vector.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T> operator/(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b),
                          PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b));
}
// 8/16-bit partial vectors fit entirely after one promotion.
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b));
}
// 32-bit signed modulo (vmodsw).
template <size_t N>
HWY_API Vec128<int32_t, N> operator%(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  __vector signed int raw_result;
  __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int32_t, N>{raw_result};
}
// 32-bit unsigned modulo (vmoduw).
template <size_t N>
HWY_API Vec128<uint32_t, N> operator%(Vec128<uint32_t, N> a,
                                      Vec128<uint32_t, N> b) {
  __vector unsigned int raw_result;
  __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint32_t, N>{raw_result};
}
// 64-bit signed modulo (vmodsd).
template <size_t N>
HWY_API Vec128<int64_t, N> operator%(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  __vector signed long long raw_result;
  __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<int64_t, N>{raw_result};
}
// 64-bit unsigned modulo (vmodud).
template <size_t N>
HWY_API Vec128<uint64_t, N> operator%(Vec128<uint64_t, N> a,
                                      Vec128<uint64_t, N> b) {
  __vector unsigned long long raw_result;
  __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw));
  return Vec128<uint64_t, N>{raw_result};
}
// 8/16-bit modulo via widening, as for division above.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2))>
HWY_API Vec128<T> operator%(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  const RepartitionToWide<decltype(d)> dw;
  return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b),
                          PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b));
}
template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2)),
          HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Rebind<MakeWide<T>, decltype(d)> dw;
  return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b));
}
#endif
// Non-temporal-style store: hints the cache (prefetch for write, zero
// temporal locality) and then performs an ordinary aligned store. There is
// no true streaming-store instruction used here.
template <class D>
HWY_API void Stream(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  __builtin_prefetch(aligned, 1, 0);
  Store(v, d, aligned);
}
// Lower half of a vector, tag-dispatched form. The raw vector is reused
// unchanged; only the compile-time lane count shrinks.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> LowerHalf(D , VFromD<Twice<D>> v) {
  return VFromD<D>{v.raw};
}
// Lower half of a vector, deduced form; same zero-cost re-typing as above.
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>{v.raw};
}
// Shifts the whole vector left by kBytes bytes (toward higher lane indices),
// filling with zeros. Implemented with vec_sld, whose operand order and
// shift direction depend on endianness.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  // vec_sld requires a nonzero shift count, so 0 is special-cased.
  if (kBytes == 0) return v;
  const auto zeros = Zero(d);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#else
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#endif
}
// Convenience overload that deduces the tag from the vector type.
template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return ShiftLeftBytes<kBytes>(d, v);
}
// Shifts the vector left by kLanes whole lanes, zero-filling, by reusing the
// byte-granularity shift on a u8 view.
template <int kLanes, class D, typename T = TFromD<D>>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr int kShiftBytes = kLanes * static_cast<int>(sizeof(T));
  return BitCast(d, ShiftLeftBytes<kShiftBytes>(BitCast(d8, v)));
}
// Convenience overload that deduces the tag from the vector type.
template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  return ShiftLeftLanes<kLanes>(d, v);
}
// Shifts the whole vector right by kBytes bytes, filling with zeros.
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  // vec_sld requires a nonzero shift count, so 0 is special-cased.
  if (kBytes == 0) return v;
  // For partial vectors, the upper (unused) bytes of the raw register must
  // be zeroed first so they do not shift into the valid lanes.
  if (d.MaxBytes() != 16) {
    const Full128<TFromD<D>> dfull;
    VFromD<decltype(dfull)> vfull{v.raw};
    v = VFromD<D>{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw};
  }
  const auto zeros = Zero(d);
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)};
#else
  return VFromD<D>{vec_sld(v.raw, zeros.raw, kBytes)};
#endif
}
// Shifts the vector right by kLanes whole lanes, zero-filling, via the
// byte-granularity shift on a u8 view.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  return BitCast(d, ShiftRightBytes<kBytes>(d8, BitCast(d8, v)));
}
// Upper half of a vector: shift it down into the low bytes, then re-type.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> UpperHalf(D d, VFromD<Twice<D>> v) {
  return LowerHalf(d, ShiftRightBytes<d.MaxBytes()>(Twice<D>(), v));
}
// Returns lane i (0-based) of v. GCC/Clang vector extensions allow direct
// subscripting of the raw vector with a runtime index.
template <typename T, size_t N>
HWY_API T ExtractLane(Vec128<T, N> v, size_t i) {
  return static_cast<T>(v.raw[i]);
}
// Returns v with lane i replaced by t.
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
#if HWY_IS_LITTLE_ENDIAN
  // Direct subscript store into a copy of the raw vector; BitCastScalar
  // converts t to the raw element type.
  typename detail::Raw128<T>::type raw_result = v.raw;
  raw_result[i] = BitCastScalar<typename detail::Raw128<T>::RawT>(t);
  return Vec128<T, N>{raw_result};
#else
  // Big-endian: round-trip through memory to avoid lane-index mapping issues.
  DFromV<decltype(v)> d;
  alignas(16) T lanes[16 / sizeof(T)];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
#endif
}
// Returns bytes [kBytes, kBytes + 16) of the 32-byte concatenation hi:lo.
// vec_sld's operand order/shift count depends on endianness.
template <int kBytes, class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D , Vec128<T> hi, Vec128<T> lo) {
  constexpr size_t kSize = 16;
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)};
#else
  return Vec128<T>{vec_sld(lo.raw, hi.raw, kBytes)};
#endif
}
// Partial-vector variant: align the valid kSize bytes of lo to the top of a
// full 128-bit register, then reuse the full-vector implementation with an
// adjusted shift so the result lands in the low bytes.
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = Vec128<uint8_t>;
  const DFromV<V8> dfull8;
  const Repartition<TFromD<D>, decltype(dfull8)> dfull;
  const V8 hi8{BitCast(d8, hi).raw};
  // Move lo's valid bytes to the upper end so the combined shift selects
  // only valid data.
  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8);
  return VFromD<D>{BitCast(dfull, r).raw};
}
// Broadcasts lane kLane to every lane.
template <int kLane, typename T, size_t N>
HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<T, N>{vec_splat(v.raw, kLane)};
}
// Lane indices for TableLookupLanes, pre-expanded to per-byte shuffle
// indices suitable for vec_perm/TableLookupBytes.
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
  __vector unsigned char raw;
};
namespace detail {
// Helpers for converting lane indices into byte-shuffle indices:
// IndicesFromVecBroadcastLaneBytes replicates each lane index across that
// lane's bytes (picking the byte of the lane that holds the low-order index
// bits, which differs by endianness); IndicesFromVecByteOffsets supplies the
// per-lane byte offsets 0..sizeof(T)-1 that are added afterwards.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  // 1-byte lanes: the identity shuffle already replicates each index.
  return Iota(d8, 0);
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#else
  constexpr __vector unsigned char kBroadcastLaneBytes = {
      7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
#endif
  return VFromD<decltype(d8)>{kBroadcastLaneBytes};
}
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  // 1-byte lanes need no additional byte offset.
  return Zero(d8);
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1,
                                                   0, 1, 0, 1, 0, 1, 0, 1};
  return VFromD<decltype(d8)>{kByteOffsets};
}
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3,
                                                   0, 1, 2, 3, 0, 1, 2, 3};
  return VFromD<decltype(d8)>{kByteOffsets};
}
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7,
                                                   0, 1, 2, 3, 4, 5, 6, 7};
  return VFromD<decltype(d8)>{kByteOffsets};
}
}
// Builds Indices128 from a vector of lane indices. For 1-byte lanes the
// indices are already byte indices.
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  // Indices up to 2*MaxLanes are allowed so TwoTablesLookupLanes works.
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif
  const Repartition<uint8_t, decltype(d)> d8;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d8, vec).raw};
}
// Builds Indices128 for multi-byte lanes: replicate each lane index across
// its bytes, scale by sizeof(T) via a left shift, then add per-byte offsets
// to obtain full byte-shuffle indices.
template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  // Indices up to 2*MaxLanes are allowed so TwoTablesLookupLanes works.
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  // Multiply lane index by sizeof(T) (shift by log2).
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{sum.raw};
}
// Builds Indices128 from an array of lane indices in memory.
template <class D, typename TI>
HWY_API Indices128<TFromD<D>, HWY_MAX_LANES_D(D)> SetTableIndices(
    D d, const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  const auto index_vec = LoadU(di, idx);
  return IndicesFromVec(d, index_vec);
}
// Permutes lanes of v per idx (whose raw bytes are already byte-shuffle
// indices, see IndicesFromVec).
template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, TableLookupBytes(v, VFromD<decltype(d8)>{idx.raw}));
}
// Single-lane vector: the only valid permutation is the identity.
template <typename T>
HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
                                      Indices128<T, 1> ) {
  return v;
}
// Partial-vector two-table lookup: concatenate a and b into one full vector
// and do a single byte shuffle; indices >= N naturally select from b.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  const Repartition<uint8_t, decltype(dt)> dt_u8;
#if HWY_IS_MSAN
  // MSAN considers the upper (unused) bytes of idx uninitialized; duplicate
  // the valid half so every byte read by the shuffle is initialized.
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(
      d, TableLookupBytes(Combine(dt, b, a),
                          BitCast(dt, VFromD<decltype(dt_u8)>{idx2.raw})));
}
// Full-vector two-table lookup: vec_perm selects bytes from the 32-byte
// concatenation of a and b.
template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T> idx) {
  return Vec128<T>{vec_perm(a.raw, b.raw, idx.raw)};
}
// Single 128-bit block: reversing block order is a no-op.
template <class D>
HWY_API VFromD<D> ReverseBlocks(D , VFromD<D> v) {
  return v;
}
// Reverses the order of all lanes.
// One lane: nothing to do.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D , Vec128<T, 1> v) {
  return v;
}
// Two 32-bit lanes: swap adjacent 32-bit lanes; the upper half of the
// underlying 128-bit register is irrelevant to the 64-bit result.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec64<T> Reverse(D , Vec64<T> v) {
  return Vec64<T>{Shuffle2301(Vec128<T>{v.raw}).raw};
}
// Four 16-bit lanes: byte shuffle reversing the 16-bit lanes within each
// 64-bit half; only the low half is used by the Vec64 result.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> Reverse(D , Vec64<T> v) {
  const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return Vec64<T>{vec_perm(v.raw, v.raw, kShuffle)};
}
// Two 16-bit lanes: Reverse on the single-lane u32 vector is the identity;
// the 16-bit rotate of the containing 32-bit word swaps the two lanes.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
}
// Native byte reversal (vec_revb) exists on PPC9 and Z14+, but only with
// sufficiently new compilers per the version checks below.
#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \
    ((!HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 710) || \
     (HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 900) || \
     HWY_COMPILER_CLANG >= 400)
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif
// Reverses the bytes within each 16/32/64-bit lane.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V ReverseLaneBytes(V v) {
  return V{vec_revb(v.raw)};
}
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif
// Reverse2/4/8 for byte lanes: reversing groups of 2/4/8 bytes is exactly a
// lane-byte reversal when viewed at the 16/32/64-bit type.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, ReverseLaneBytes(BitCast(du16, v)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const Repartition<uint32_t, decltype(d)> du32;
  return BitCast(d, ReverseLaneBytes(BitCast(du32, v)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, ReverseLaneBytes(BitCast(du64, v)));
}
#endif  // native vec_revb available
// Full Reverse for small byte-lane vectors: with 2/4/8 lanes in total,
// reversing all lanes coincides with Reverse2/4/8 respectively.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec16<T> Reverse(D d, Vec16<T> v) {
  return Reverse2(d, v);
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> Reverse(D d, Vec32<T> v) {
  return Reverse4(d, v);
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> Reverse(D d, Vec64<T> v) {
  return Reverse8(d, v);
}
// Reverse2: swaps each adjacent pair of lanes.
// Single-lane vector: there is no pair to swap.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse2(D , Vec128<T, 1> v) {
  return v;
}
// 16-bit lanes: a 16-bit rotate of each containing 32-bit word swaps pairs.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint32_t, decltype(d)> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}
// 32-bit lanes: a 32-bit rotate of each containing 64-bit word swaps pairs.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, RotateRight<32>(BitCast(du64, v)));
}
// 64-bit lanes: swap the two halves of the 128-bit vector.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8)>
HWY_API VFromD<D> Reverse2(D , VFromD<D> v) {
  return Shuffle01(v);
}
// Reverse4: reverses each group of 4 lanes.
// 16-bit lanes: byte shuffle reversing the four 16-bit lanes per 64 bits.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D , VFromD<D> v) {
  const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1,
                                           14, 15, 12, 13, 10, 11, 8, 9};
  return VFromD<D>{vec_perm(v.raw, v.raw, kShuffle)};
}
// 32-bit lanes: a 128-bit vector has exactly 4 such lanes, so Reverse4 is a
// full Reverse.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  return Reverse(d, v);
}
// 64-bit lanes: 4 lanes would require 256 bits — unreachable on this target.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D , VFromD<D> ) {
  HWY_ASSERT(0);
}
// Reverse8: 16-bit lanes fill a 128-bit vector with exactly 8 lanes, so this
// is a full Reverse.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  return Reverse(d, v);
}
// 32/64-bit lanes: 8 lanes do not fit in 128 bits — unreachable.
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D , VFromD<D> ) {
  HWY_ASSERT(0);
}
// Interleaves lanes from the lower halves of a and b:
// result = {a[0], b[0], a[1], b[1], ...} via vec_mergeh.
template <typename T, size_t N>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{vec_mergeh(a.raw, b.raw)};
}
// Tag-taking overload for API symmetry; the tag is unused.
template <class D>
HWY_API VFromD<D> InterleaveLower(D , VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}
// Interleaves lanes from the upper halves of full vectors via vec_mergel.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> InterleaveUpper(D , Vec128<T> a, Vec128<T> b) {
  return Vec128<T>{vec_mergel(a.raw, b.raw)};
}
// Partial vectors: extract the upper halves, then interleave them as lower
// halves of temporaries.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  return InterleaveLower(d, VFromD<D>{UpperHalf(d2, a).raw},
                         VFromD<D>{UpperHalf(d2, b).raw});
}
// ZipLower/ZipUpper: interleave narrow lanes and reinterpret each resulting
// pair as one lane of twice the width.
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> ZipLower(V a, V b) {
  return BitCast(DW(), InterleaveLower(a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
  return BitCast(dw, InterleaveLower(D(), a, b));
}
template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
  return BitCast(dw, InterleaveUpper(D(), a, b));
}
namespace detail {
// Signals to generic_ops that this target natively builds the four-u32
// constant used by per-4-lane block shuffles.
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif
// Returns a vector whose u32 lanes are {x0, x1, x2, x3} (lane 0 first),
// resized/bitcast to the requested vector type.
template <class D>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  const __vector unsigned int raw = {x0, x1, x2, x3};
  return ResizeBitCast(d, Vec128<uint32_t>{raw});
}
}
// Shifts lanes of `v` toward higher indices by `amt` lanes, filling vacated
// lanes with zero. The shift amount is expressed in bits (amt * lane size
// * 8) and broadcast across the vector as required by the whole-vector
// shift intrinsics. Fix: the original ended with "#endif #endif }" on one
// line; tokens after #endif are invalid and the '}' on a directive line is
// discarded by the preprocessor, leaving the function unterminated.
template <class D>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto v_shift_amt =
      BitCast(Full128<uint8_t>(),
              Set(Full128<uint32_t>(),
                  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
#if HWY_S390X_HAVE_Z14
  // z/Arch: whole-vector shift right by bits (big-endian lane order).
  return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
#if HWY_IS_LITTLE_ENDIAN
  // Shift left by octets moves bytes toward higher lane indices on LE.
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  // On BE the byte order is reversed, so shift right by octets instead.
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif
#endif
}
// Shifts lanes toward lower indices by `amt` lanes (partial vectors, <= 64
// bits): view the whole vector as a single unsigned integer and shift it by
// amt lanes' worth of bits; direction depends on byte order.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  // One unsigned lane covering the entire (partial) vector.
  using TU = UnsignedFromSize<d.MaxBytes()>;
  const Repartition<TU, decltype(d)> du;
  const auto v_shift_amt =
      Set(du, static_cast<TU>(amt * sizeof(TFromD<D>) * 8));
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(d, BitCast(du, v) >> v_shift_amt);
#else
  return BitCast(d, BitCast(du, v) << v_shift_amt);
#endif
}
// Shifts lanes of a full 128-bit vector toward lower indices by `amt` lanes,
// zero-filling the vacated high lanes. Mirrors SlideUpLanes with the shift
// direction reversed. Fix: the original ended with "#endif #endif }" on one
// line; tokens after #endif are invalid and the '}' on a directive line is
// discarded by the preprocessor, leaving the function unterminated.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto v_shift_amt =
      BitCast(Full128<uint8_t>(),
              Set(Full128<uint32_t>(),
                  static_cast<uint32_t>(amt * sizeof(TFromD<D>) * 8)));
#if HWY_S390X_HAVE_Z14
  // z/Arch: whole-vector shift left by bits (big-endian lane order).
  return BitCast(d, VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
#if HWY_IS_LITTLE_ENDIAN
  // Shift right by octets moves bytes toward lower lane indices on LE.
  return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)});
#else
  return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)});
#endif
#endif
}
// Concatenates two half-width vectors: lo_half becomes the lower half of the
// result, hi_half the upper half. Each half is reinterpreted as one unsigned
// lane of its full size, then the two lanes are interleaved.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), class VH = VFromD<Half<D>>>
HWY_API VFromD<D> Combine(D d, VH hi_half, VH lo_half) {
  const Half<decltype(d)> dh;
  // One unsigned lane spanning an entire half.
  using VU = Vec128<UnsignedFromSize<dh.MaxBytes()>, 2>;
  using Raw = typename detail::Raw128<TFromV<VU>>::type;
  const VU lo{reinterpret_cast<Raw>(lo_half.raw)};
  const VU hi{reinterpret_cast<Raw>(hi_half.raw)};
  return BitCast(d, InterleaveLower(lo, hi));
}
// Widens `lo` to a full vector whose upper half is zero: keep the first
// MaxLanes(dh) lanes and zero the rest.
template <class D>
HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
  const Half<D> dh;
  return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD<D>{lo.raw});
}
// Concat ops on full 128-bit vectors, implemented on 64-bit lanes:
// result = {half of lo, half of hi} per the op's name.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerLower(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
}
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint64_t, decltype(d)> d64;
  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
}
// Lower half of the result is lo's upper half, upper half is hi's lower
// half; equivalent to an 8-byte shift of the hi:lo pair.
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatLowerUpper(D d, Vec128<T> hi, Vec128<T> lo) {
  return CombineShiftRightBytes<8>(d, hi, lo);
}
// Byte shuffle: bytes 0-7 from lo, bytes 24-31 (= hi's bytes 8-15).
template <class D, typename T = TFromD<D>>
HWY_API Vec128<T> ConcatUpperLower(D , Vec128<T> hi, Vec128<T> lo) {
  const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
}
// Concat ops on partial (<= 64-bit) vectors: extract the named halves and
// Combine them (lo argument becomes the lower half of the result).
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
  const Half<decltype(d)> d2;
  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
}
// Truncates a single unsigned lane to a narrower unsigned type by keeping
// its least-significant bytes.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 2)>* = nullptr,
          HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D , Vec128<FromT, 1> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
#if HWY_IS_LITTLE_ENDIAN
  // LE: the low-order bytes already sit at byte offset 0.
  return VFromD<D>{reinterpret_cast<Raw>(v.raw)};
#else
  // BE: shift the lane's low-order bytes to the front of the register.
  return VFromD<D>{reinterpret_cast<Raw>(
      vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD<D>)))};
#endif
}
namespace detail {
// Truncates two vectors of 2x-wide lanes and concatenates the results
// (lo first) via vec_pack, which keeps each lane's low half.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Truncate2To(
    D , Vec128<FromT, Repartition<FromT, D>().MaxLanes()> lo,
    Vec128<FromT, Repartition<FromT, D>().MaxLanes()> hi) {
  return VFromD<D>{vec_pack(lo.raw, hi.raw)};
}
}
// Truncates lanes to half width; the duplicated source argument to vec_pack
// is ignored by the narrower result's lane count.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D ,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_pack(v.raw, v.raw)};
}
// Truncation by a factor of 4 or more: recurse via the halfway-narrow type.
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr,
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D d,
                             Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, decltype(d)> d2;
  return TruncateTo(d, TruncateTo(d2, v));
}
// ConcatOdd: gathers odd-indexed lanes of lo then hi.
// Full vectors of byte lanes: move each odd byte into the low half of its
// 16-bit pair (which half that is depends on endianness), then pack.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#else
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
// 8 byte lanes: direct shuffle of odd bytes from lo (0-7) and hi (16-23).
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatOdd(D , Vec64<T> hi, Vec64<T> lo) {
  const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatOdd(D , Vec32<T> hi, Vec32<T> lo) {
  const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactOddU8)};
}
// 16-bit lanes: same shift-then-pack strategy at 32-bit granularity.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#else
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatOdd(D , Vec64<T> hi, Vec64<T> lo) {
  const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactOddU16)};
}
// 32-bit lanes: LE uses a byte shuffle; BE packs the 64-bit pairs, whose
// low halves are the odd 32-bit lanes.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatOdd(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  (void)d;
  const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15,
                                           20, 21, 22, 23, 28, 29, 30, 31};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#else
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint64_t, decltype(d)> dw;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#endif
}
// Two lanes: the odd lane of each input is its upper lane.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveUpper(d, lo, hi);
}
// ConcatEven: gathers even-indexed lanes of lo then hi. Mirrors ConcatOdd
// with the endian cases swapped (even lanes are already in the packed
// position on LE).
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint16_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint16_t> uH = BitCast(dw, hi);
  const Vec128<uint16_t> uL = BitCast(dw, lo);
#else
  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
// 8 byte lanes: direct shuffle of even bytes from lo (0-7) and hi (16-23).
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec64<T> ConcatEven(D , Vec64<T> hi, Vec64<T> lo) {
  const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec32<T> ConcatEven(D , Vec32<T> hi, Vec32<T> lo) {
  const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18};
  return Vec32<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU8)};
}
// 16-bit lanes: same strategy at 32-bit granularity.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
  const Repartition<uint32_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> uH = BitCast(dw, hi);
  const Vec128<uint32_t> uL = BitCast(dw, lo);
#else
  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
#endif
  return BitCast(d, detail::Truncate2To(du, uL, uH));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec64<T> ConcatEven(D , Vec64<T> hi, Vec64<T> lo) {
  const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21};
  return Vec64<T>{vec_perm(lo.raw, hi.raw, kCompactEvenU16)};
}
// 32-bit lanes: LE packs the 64-bit pairs (their low halves are the even
// lanes); BE uses a byte shuffle.
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> ConcatEven(D d, Vec128<T> hi, Vec128<T> lo) {
#if HWY_IS_LITTLE_ENDIAN
  const Repartition<uint64_t, decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi)));
#else
  (void)d;
  constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11,
                                               16, 17, 18, 19, 24, 25, 26, 27};
  return Vec128<T>{vec_perm(lo.raw, hi.raw, kShuffle)};
#endif
}
// Two lanes: the even lane of each input is its lower lane.
template <typename D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
  return InterleaveLower(d, lo, hi);
}
#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif
// Truncates a (first) and b (second) to half-width lanes, preserving lane
// order across the concatenation. Each wide lane's low half is at the even
// (LE) or odd (BE) narrow-lane position, hence the endian-dependent choice.
template <class D, HWY_IF_UNSIGNED_D(D), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedTruncate2To(D d, V a, V b) {
#if HWY_IS_LITTLE_ENDIAN
  return ConcatEven(d, BitCast(d, b), BitCast(d, a));
#else
  return ConcatOdd(d, BitCast(d, b), BitCast(d, a));
#endif
}
// DupEven: copies each even-indexed lane into the following odd lane.
// One lane: nothing to duplicate into.
template <typename T>
HWY_API Vec128<T, 1> DupEven(Vec128<T, 1> v) {
  return v;
}
// Two lanes: {v[0], v[0]}.
template <typename T>
HWY_API Vec128<T, 2> DupEven(Vec128<T, 2> v) {
  return InterleaveLower(DFromV<decltype(v)>(), v, v);
}
// Byte lanes: byte shuffle duplicating each even byte.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {0, 0, 2, 2, 4, 4, 6, 6,
                                               8, 8, 10, 10, 12, 12, 14, 14};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 16-bit lanes: byte shuffle duplicating each even 16-bit lane.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4, 5, 4, 5,
                                               8, 9, 8, 9, 12, 13, 12, 13};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 32-bit lanes: vec_mergee where available; Z14 lacks it, so shuffle bytes.
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> DupEven(Vec128<T> v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10,
                                        11, 8, 9, 10, 11)));
#else
  return Vec128<T>{vec_mergee(v.raw, v.raw)};
#endif
}
// DupOdd: copies each odd-indexed lane into the preceding even lane.
// Byte lanes: byte shuffle duplicating each odd byte.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {1, 1, 3, 3, 5, 5, 7, 7,
                                               9, 9, 11, 11, 13, 13, 15, 15};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 16-bit lanes: byte shuffle duplicating each odd 16-bit lane.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  constexpr __vector unsigned char kShuffle = {2, 3, 2, 3, 6, 7, 6, 7,
                                               10, 11, 10, 11, 14, 15, 14, 15};
  return TableLookupBytes(v, BitCast(d, VFromD<decltype(du8)>{kShuffle}));
}
// 32-bit lanes: vec_mergeo where available; Z14 lacks it, so shuffle bytes.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return TableLookupBytes(
      v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14,
                                        15, 12, 13, 14, 15)));
#else
  return Vec128<T, N>{vec_mergeo(v.raw, v.raw)};
#endif
}
// 64-bit lanes: {v[1], v[1]}.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
}
// OddEven: selects odd lanes from `a` and even lanes from `b`. The byte
// masks below have 0xFF on the bytes belonging to EVEN lanes, and
// IfVecThenElse picks `b` where the mask is set.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                       0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N>{mask}), b, a);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                       0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 2>{mask}), b, a);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0,
                                       0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 4>{mask}), b, a);
}
template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
HWY_INLINE Vec128<T, N> OddEven(Vec128<T, N> a, Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const __vector unsigned char mask = {
      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0};
  return IfVecThenElse(BitCast(d, Vec128<uint8_t, N * 8>{mask}), b, a);
}
// InterleaveEven: result lane 2i = a[2i], lane 2i+1 = b[2i]. Implemented as
// a two-source lane lookup over full 128-bit vectors; byte indices >= 16
// select from b.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 0, 16, 2, 18, 4, 20, 6, 22, 8, 24,
                          10, 26, 12, 28, 14, 30)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
                                                      16, 17, 4, 5, 20, 21, 8,
                                                      9, 24, 25, 12, 13, 28, 29)
                                      .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
// 32-bit lanes: vec_mergee on PPC; Z14 lacks it, so use the lookup instead.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveEven(D d, VFromD<D> a, VFromD<D> b) {
#if HWY_S390X_HAVE_Z14
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{Dup128VecFromValues(Full128<uint8_t>(), 0, 1,
                                                      2, 3, 16, 17, 18, 19, 8,
                                                      9, 10, 11, 24, 25, 26, 27)
                                      .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
#else
  (void)d;
  return VFromD<D>{vec_mergee(a.raw, b.raw)};
#endif
}
// 64-bit lanes: the even lanes are the lower lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveEven(D , VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}
// InterleaveOdd: result lane 2i = a[2i+1], lane 2i+1 = b[2i+1]. Same
// two-source lookup scheme as InterleaveEven with odd byte offsets.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 1, 17, 3, 19, 5, 21, 7, 23, 9, 25,
                          11, 27, 13, 29, 15, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 2, 3, 18, 19, 6, 7, 22, 23, 10,
                          11, 26, 27, 14, 15, 30, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
}
// 32-bit lanes: vec_mergeo on PPC; Z14 lacks it, so use the lookup instead.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
#if HWY_S390X_HAVE_Z14
  const Full128<TFromD<D>> d_full;
  const Indices128<TFromD<D>> idx{
      Dup128VecFromValues(Full128<uint8_t>(), 4, 5, 6, 7, 20, 21, 22, 23, 12,
                          13, 14, 15, 28, 29, 30, 31)
          .raw};
  return ResizeBitCast(d, TwoTablesLookupLanes(ResizeBitCast(d_full, a),
                                               ResizeBitCast(d_full, b), idx));
#else
  (void)d;
  return VFromD<D>{vec_mergeo(a.raw, b.raw)};
#endif
}
// 64-bit lanes: the odd lanes are the upper lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  return InterleaveUpper(d, a, b);
}
// A 128-bit vector contains exactly one block, so block 0 — an even block —
// is the entire result; the odd-block source is never used.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /*odd*/, Vec128<T, N> even) {
  return even;
}
// With a single block there is no adjacent block to exchange with.
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> vec) {
  return vec;
}
// Single-block vectors: interleaving the (only) even or odd blocks of two
// vectors reduces to returning the first operand's block.
template <class D, class V = VFromD<D>>
HWY_API V InterleaveEvenBlocks(D, V first, V /*second*/) {
  return first;
}
template <class D, class V = VFromD<D>>
HWY_API V InterleaveOddBlocks(D, V first, V /*second*/) {
  return first;
}
#if HWY_S390X_HAVE_Z14
// Q15 multiply with rounding: result = (a*b + 0x4000) >> 15 per lane.
// Single-lane version: widen via MulEven, add the rounding increment, then
// shift left by 1 so the desired (p >> 15) occupies the product's upper 16
// bits, which the truncating ResizeBitCast keeps — assumes big-endian lane
// layout on s390x, where the first bytes are a lane's most significant.
HWY_API Vec16<int16_t> MulFixedPoint15(Vec16<int16_t> a, Vec16<int16_t> b) {
  const DFromV<decltype(a)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const auto round_up_incr = Set(di32, 0x4000);
  const auto i32_product = MulEven(a, b) + round_up_incr;
  return ResizeBitCast(di16, ShiftLeft<1>(i32_product));
}
// Multi-lane version: compute widened even and odd products separately and
// recombine. Even lanes read the product's upper half (hence ShiftLeft<1>,
// as above); odd lanes read the lower half after an arithmetic >> 15.
template <size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> di16;
  const RepartitionToWide<decltype(di16)> di32;
  const auto round_up_incr = Set(di32, 0x4000);
  const auto even_product = MulEven(a, b) + round_up_incr;
  const auto odd_product = MulOdd(a, b) + round_up_incr;
  return OddEven(BitCast(di16, ShiftRight<15>(odd_product)),
                 BitCast(di16, ShiftLeft<1>(even_product)));
}
#else
// AltiVec: vec_mradds computes the rounded high product directly; the zero
// addend leaves just (a*b*2 + 0x8000) >> 16 == (a*b + 0x4000) >> 15.
template <size_t N>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  const Vec128<int16_t> zero = Zero(Full128<int16_t>());
  return Vec128<int16_t, N>{vec_mradds(a.raw, b.raw, zero.raw)};
}
#endif
namespace detail {
// Per-lane variable shift left for unsigned lanes. Z14 uses the zvector
// operator form; AltiVec uses vec_sl.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::UnsignedTag , Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw << bits.raw};
#else
  return Vec128<T, N>{vec_sl(v.raw, bits.raw)};
#endif
}
// Signed lanes: shifting left is bit-identical to the unsigned shift, so
// reuse it via BitCast.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shl(hwy::SignedTag , Vec128<T, N> v,
                         Vec128<T, N> bits) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}
}
// Dispatches on signedness of T.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}
namespace detail {
// Per-lane variable logical shift right for unsigned lanes.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::UnsignedTag , Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> bits.raw};
#else
  return Vec128<T, N>{vec_sr(v.raw, bits.raw)};
#endif
}
// Signed lanes: arithmetic shift right. vec_sra expects an unsigned shift
// count, hence the BitCast of `bits`.
template <typename T, size_t N>
HWY_API Vec128<T, N> Shr(hwy::SignedTag , Vec128<T, N> v,
                         Vec128<T, N> bits) {
#if HWY_S390X_HAVE_Z14
  return Vec128<T, N>{v.raw >> bits.raw};
#else
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  return Vec128<T, N>{vec_sra(v.raw, BitCast(du, bits).raw)};
#endif
}
}
// Dispatches on signedness of T (logical vs arithmetic shift).
template <typename T, size_t N>
HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, Vec128<T, N> bits) {
  return detail::Shr(hwy::TypeTag<T>(), v, bits);
}
// 64x64 -> 128-bit product of the even (lane 0) elements.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  // POWER10: vec_mule yields the full 128-bit product in one register.
  using V64 = typename detail::Raw128<T>::type;
  const V64 mul128_result = reinterpret_cast<V64>(vec_mule(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  // BE: swap the two 64-bit halves so the low half lands in lane 0.
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  // Fallback: scalar 64x64->128 multiply of lane 0, stored low then high.
  alignas(16) T mul[2];
  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
  return Load(Full128<T>(), mul);
#endif
}
// 64x64 -> 128-bit product of the odd (lane 1) elements.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  using V64 = typename detail::Raw128<T>::type;
  const V64 mul128_result = reinterpret_cast<V64>(vec_mulo(a.raw, b.raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<T>{mul128_result};
#else
  // BE: swap the two 64-bit halves so the low half lands in lane 0.
  return Vec128<T>{vec_sld(mul128_result, mul128_result, 8)};
#endif
#else
  // Fallback: scalar multiply of the upper-half lanes.
  alignas(16) T mul[2];
  const Full64<T> d2;
  mul[0] =
      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
  return Load(Full128<T>(), mul);
#endif
}
#include "hwy/ops/inside-inl.h"
// f32 += bf16 pairwise products: promotes even/odd bf16 lanes to f32,
// multiplies, and sums each pair via fused multiply-add.
template <class DF, HWY_IF_F32_D(DF),
          class VBF = VFromD<Repartition<bfloat16_t, DF>>>
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, VBF a, VBF b) {
  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
}
// i32/u32 = sum of adjacent widened 16-bit products. AltiVec has vec_msum
// (multiply-sum with a zero accumulator); Z14 composes it from even/odd
// widening multiplies.
template <class D32, HWY_IF_UI32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) {
#if HWY_S390X_HAVE_Z14
  (void)d32;
  return MulEven(a, b) + MulOdd(a, b);
#else
  return VFromD<D32>{vec_msum(a.raw, b.raw, Zero(d32).raw)};
#endif
}
// Accumulates pairwise widened products into sum0; sum1 (the reference
// parameter) is not needed on this target and is left untouched, which is
// why the matching RearrangeToOddPlusEven below returns sum0 unchanged.
template <class D32, HWY_IF_UI32_D(D32),
          class V16 = VFromD<RepartitionToNarrow<D32>>>
HWY_API VFromD<D32> ReorderWidenMulAccumulate(D32 , V16 a, V16 b,
                                              VFromD<D32> sum0,
                                              VFromD<D32>& ) {
#if HWY_S390X_HAVE_Z14
  return MulEven(a, b) + MulOdd(a, b) + sum0;
#else
  return VFromD<D32>{vec_msum(a.raw, b.raw, sum0.raw)};
#endif
}
// For 32-bit integer sums, ReorderWidenMulAccumulate above already folded
// everything into sum0, so there is nothing to rearrange.
template <size_t N>
HWY_API Vec128<int32_t, N> RearrangeToOddPlusEven(Vec128<int32_t, N> sum0,
                                                  Vec128<int32_t, N> ) {
  return sum0;
}
template <size_t N>
HWY_API Vec128<uint32_t, N> RearrangeToOddPlusEven(
    Vec128<uint32_t, N> sum0, Vec128<uint32_t, N> ) {
  return sum0;
}
// Generic case (e.g. float accumulators): both partial sums carry data, so
// the rearranged result is simply their lane-wise sum.
template <class VW>
HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
  return sum0 + sum1;
}
#if !HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#else
#define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
#endif
// i32 = saturated(sum + a[2i]*b[2i] + a[2i+1]*b[2i+1]) via vec_msums
// (multiply-sum, saturated).
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
    DI32 , VFromD<Repartition<int16_t, DI32>> a,
    VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
  return VFromD<DI32>{vec_msums(a.raw, b.raw, sum.raw)};
}
#endif
#if !HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
#endif
// u32 += sum of four adjacent u8*u8 products, via vec_msum.
template <class DU32, HWY_IF_U32_D(DU32)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
    DU32 , VFromD<Repartition<uint8_t, DU32>> a,
    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  return VFromD<DU32>{vec_msum(a.raw, b.raw, sum.raw)};
}
#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif
// i32 += sum of four adjacent u8*i8 products; vec_msum takes the signed
// operand first for this mixed-sign variant.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 16)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 , VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  return VFromD<DI32>{vec_msum(b_i.raw, a_u.raw, sum.raw)};
}
#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
#endif
// i32 += sum of four adjacent i8*i8 products. There is no signed*signed
// msum, so compute u8*i8 (which treats negative a lanes as a+256) and then
// subtract the error term 256 * sum(b where a < 0): the mask
// BroadcastSignBit(a) selects b for negative a, SumsOf4 sums each quad, and
// ShiftLeft<8> multiplies by 256.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
                                            VFromD<Repartition<int8_t, DI32>> a,
                                            VFromD<Repartition<int8_t, DI32>> b,
                                            VFromD<DI32> sum) {
  const Repartition<uint8_t, decltype(di32)> du8;
  const auto result_sum_0 =
      SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum);
  const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a))));
  return result_sum_0 - result_sum_1;
}
#endif
// Zero-extends unsigned lanes to twice the width by interleaving with a
// zero vector; which operand supplies the low half depends on endianness.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D ,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<FromT, Twice<D>> d2;
  const VFromD<decltype(d2)> twice{v.raw};
  const RepartitionToNarrow<D> dn;
#if HWY_IS_LITTLE_ENDIAN
  return ZipLower(BitCast(dn, twice), Zero(dn));
#else
  return ZipLower(Zero(dn), BitCast(dn, twice));
#endif
}
// Sign-extends signed lanes to twice the width via vec_unpackh.
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteTo(D ,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackh(v.raw))};
}
// Promotions by a factor of 4 or 8: chain two (or more) single-step
// promotions through the intermediate width(s).
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 4), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE(FromT, 1)>
HWY_API VFromD<D> PromoteTo(D d32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const DFromV<decltype(v)> d8;
  const Rebind<MakeWide<FromT>, decltype(d8)> d16;
  return PromoteTo(d32, PromoteTo(d16, v));
}
template <class D, typename FromT, HWY_IF_T_SIZE_D(D, 8), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(FromT),
          HWY_IF_T_SIZE_ONE_OF(FromT, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d64,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeWide<FromT>, decltype(d64)> dw;
  return PromoteTo(d64, PromoteTo(dw, v));
}
#if HWY_PPC_HAVE_9
// POWER9 has a native f16 -> f32 conversion.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif
template <class D, HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> PromoteTo(D , VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>{vec_extract_fp32_from_shorth(v.raw)};
}
#endif
// bf16 -> f32: a bf16 value is the upper 16 bits of the equivalent f32, so
// widen the bit pattern and shift it into the high half.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
// f32 -> f64: duplicate each source lane via InterleaveLower so that the
// even/odd-lane conversion intrinsic picks up the lower-half lanes on
// either endianness.
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D , VFromD<Rebind<float, D>> v) {
  const __vector float raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 1000
  // Workaround: older GCC for s390x lacks vec_doublee; call the underlying
  // builtin directly.
  return VFromD<D>{__builtin_s390_vflls(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}
// i32 -> f64 promotion of the lower half.
// Fix: the closing brace was fused onto the `#endif` line (`#endif }`),
// which leaves the function unbalanced; the brace now has its own line.
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<int32_t, D>> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: widen to i64 first, then convert to f64.
  const RebindToSigned<decltype(df64)> di64;
  return ConvertTo(df64, PromoteTo(di64, v));
#else
  (void)df64;
  // Duplicate the lower lanes into the positions read by the conversion.
  const __vector signed int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif
}
// u32 -> f64 promotion of the lower half.
// Fix: the closing brace was fused onto the `#endif` line (`#endif }`),
// which leaves the function unbalanced; the brace now has its own line.
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<uint32_t, D>> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: widen to u64 first, then convert to f64.
  const RebindToUnsigned<decltype(df64)> du64;
  return ConvertTo(df64, PromoteTo(du64, v));
#else
  (void)df64;
  // Duplicate the lower lanes into the positions read by the conversion.
  const __vector unsigned int raw_v = InterleaveLower(v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif
}
#if !HWY_S390X_HAVE_Z14
namespace detail {

// Replaces NaN lanes with zero prior to float->int conversion. Works around
// QEMU returning incorrect results for NaN inputs to the VSX conversion
// instructions; can be disabled via HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND.
template <class V>
static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
#if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND)
  // v == v is false only for NaN lanes.
  return IfThenElseZero(v == v, v);
#else
  return v;
#endif
}

// Wrapper around the VSX xvcvspsxds instruction (f32 -> i64), papering over
// the different ways compilers expose it (builtin, vec_signedo/e, or asm).
template <class VF32>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<int64_t, DFromV<VF32>>>
VsxXvcvspsxds(VF32 vf32) {
  using VI64 = VFromD<Repartition<int64_t, DFromV<VF32>>>;
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)
  return VI64{__builtin_vsx_xvcvspsxds(vf32.raw)};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
  return VI64{vec_signedo(vf32.raw)};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
  return VI64{vec_signede(vf32.raw)};
#else
  __vector signed long long raw_result;
  __asm__("xvcvspsxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
  return VI64{raw_result};
#endif
}

// As above, but for the unsigned conversion xvcvspuxds (f32 -> u64).
template <class VF32>
static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<uint64_t, DFromV<VF32>>>
VsxXvcvspuxds(VF32 vf32) {
  using VU64 = VFromD<Repartition<uint64_t, DFromV<VF32>>>;
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
    HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)
  return VU64{reinterpret_cast<__vector unsigned long long>(
      __builtin_vsx_xvcvspuxds(vf32.raw))};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
  return VU64{vec_unsignedo(vf32.raw)};
#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
  return VU64{vec_unsignede(vf32.raw)};
#else
  __vector unsigned long long raw_result;
  __asm__("xvcvspuxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :);
  return VU64{raw_result};
#endif
}

}  // namespace detail
#endif  // !HWY_S390X_HAVE_Z14
// f32 -> i64 promotion of the lower half.
template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
#if !HWY_S390X_HAVE_Z14
  const Repartition<float, decltype(di64)> dt_f32;
  const auto vt_f32 = ResizeBitCast(dt_f32, v);
  // NaN lanes are zeroed first (QEMU workaround), then converted.
  return detail::VsxXvcvspsxds(
      detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
#else
  // Z14: promote to f64 first, then convert to i64.
  const RebindToFloat<decltype(di64)> df64;
  return ConvertTo(di64, PromoteTo(df64, v));
#endif
}

// f32 -> u64 promotion of the lower half.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
#if !HWY_S390X_HAVE_Z14
  const Repartition<float, decltype(du64)> dt_f32;
  const auto vt_f32 = ResizeBitCast(dt_f32, v);
  return detail::VsxXvcvspuxds(
      detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32)));
#else
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteTo(df64, v));
#endif
}
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif
// PromoteUpperTo: promotes the upper half of a full 128-bit vector.
// Unsigned: zero-extend by zipping the source with zero.
template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_UNSIGNED(FromT)>
HWY_API VFromD<D> PromoteUpperTo(D d, Vec128<FromT> v) {
  const RebindToUnsigned<D> du;
  const RepartitionToNarrow<decltype(du)> dn;
#if HWY_IS_LITTLE_ENDIAN
  // Operand order differs by endianness so the zeros land in the high bits.
  return BitCast(d, ZipUpper(du, v, Zero(dn)));
#else
  return BitCast(d, ZipUpper(du, Zero(dn), v));
#endif
}

// Signed: single-instruction sign extension of the upper-half lanes.
template <class D, typename FromT, HWY_IF_V_SIZE_D(D, 16),
          HWY_IF_T_SIZE_D(D, 2 * sizeof(FromT)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D), HWY_IF_SIGNED(FromT)>
HWY_API VFromD<D> PromoteUpperTo(D , Vec128<FromT> v) {
  using Raw = typename detail::Raw128<TFromD<D>>::type;
  return VFromD<D>{reinterpret_cast<Raw>(vec_unpackl(v.raw))};
}
// f16 -> f32 promotion of the upper half.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<float16_t> v) {
#if HWY_PPC_HAVE_9
  (void)df32;
  return VFromD<D>{vec_extract_fp32_from_shortl(v.raw)};
#else
  // No native instruction: extract the upper half, then promote it.
  const Rebind<float16_t, decltype(df32)> dh;
  return PromoteTo(df32, UpperHalf(dh, v));
#endif
}

// bf16 -> f32 promotion of the upper half: widen, then shift the bf16 bits
// into the upper 16 bits of each f32 lane.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, Vec128<bfloat16_t> v) {
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
}
// f32 -> f64 promotion of the upper half.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D , Vec128<float> v) {
  // Duplicate the upper lanes into the positions read by the conversion.
  const __vector float raw_v = InterleaveUpper(Full128<float>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 1000
  // Old GCC on Z lacks vec_doublee; use the vflls builtin directly.
  return VFromD<D>{__builtin_s390_vflls(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
}
// i32 -> f64 promotion of the upper half.
// Fix: the closing brace was fused onto the `#endif` line (`#endif }`),
// which leaves the function unbalanced; the brace now has its own line.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<int32_t> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: widen to i64 first, then convert to f64.
  const RebindToSigned<decltype(df64)> di64;
  return ConvertTo(df64, PromoteUpperTo(di64, v));
#else
  (void)df64;
  // Duplicate the upper lanes into the positions read by the conversion.
  const __vector signed int raw_v =
      InterleaveUpper(Full128<int32_t>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif
}
// u32 -> f64 promotion of the upper half.
// Fix: the closing brace was fused onto the `#endif` line (`#endif }`),
// which leaves the function unbalanced; the brace now has its own line.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
#if HWY_S390X_HAVE_Z14
  // Z14 path: widen to u64 first, then convert to f64.
  const RebindToUnsigned<decltype(df64)> du64;
  return ConvertTo(df64, PromoteUpperTo(du64, v));
#else
  (void)df64;
  // Duplicate the upper lanes into the positions read by the conversion.
  const __vector unsigned int raw_v =
      InterleaveUpper(Full128<uint32_t>(), v, v).raw;
#if HWY_IS_LITTLE_ENDIAN
  return VFromD<D>{vec_doubleo(raw_v)};
#else
  return VFromD<D>{vec_doublee(raw_v)};
#endif
#endif
}
// f32 -> i64 promotion of the upper half.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
#if !HWY_S390X_HAVE_Z14
  (void)di64;
  // NaN lanes are zeroed first (QEMU workaround), then converted.
  return detail::VsxXvcvspsxds(
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
#else
  // Z14: promote to f64 first, then convert to i64.
  const RebindToFloat<decltype(di64)> df64;
  return ConvertTo(di64, PromoteUpperTo(df64, v));
#endif
}

// f32 -> u64 promotion of the upper half.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D du64, Vec128<float> v) {
#if !HWY_S390X_HAVE_Z14
  (void)du64;
  return detail::VsxXvcvspuxds(
      detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128<float>(), v, v)));
#else
  const RebindToFloat<decltype(du64)> df64;
  return ConvertTo(du64, PromoteUpperTo(df64, v));
#endif
}
// Generic fallback for partial (<= 8 byte) result vectors: extract the
// upper half of the source, then promote it with the single-argument path.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  const Rebind<TFromV<V>, D> d_half;
  const auto upper = UpperHalf(d_half, v);
  return PromoteTo(d, upper);
}
namespace detail {

// PromoteEvenTo/PromoteOddTo overloads, dispatched on (result type tag,
// result lane size, source type tag). These provide target-specific
// single-instruction paths where available.

#if HWY_PPC_HAVE_9 && \
    (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200)
#if HWY_IS_LITTLE_ENDIAN
// In-register sign extension (vec_signexti/vec_signextll); on little-endian
// these serve the even-lane variant.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag , hwy::SizeTag<4> ,
                                   hwy::SignedTag , D , V v) {
  return VFromD<D>{vec_signexti(v.raw)};
}
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag , hwy::SizeTag<8> ,
                                   hwy::SignedTag , D , V v) {
  return VFromD<D>{vec_signextll(v.raw)};
}
#else
// On big-endian the same instructions serve the odd-lane variant.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag , hwy::SizeTag<4> ,
                                  hwy::SignedTag , D , V v) {
  return VFromD<D>{vec_signexti(v.raw)};
}
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag , hwy::SizeTag<8> ,
                                  hwy::SignedTag , D , V v) {
  return VFromD<D>{vec_signextll(v.raw)};
}
#endif
#endif

#if HWY_S390X_HAVE_Z14
// f32 -> f64 from even lanes (vec_doublee).
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag , hwy::SizeTag<8> ,
                                   hwy::FloatTag , D , V v) {
  return VFromD<D>{vec_doublee(v.raw)};
}
// i32/u32 -> f64 from even lanes: widen to 64-bit integers, then convert.
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag , hwy::SizeTag<8> ,
                                   FromTypeTag , D d_to, V v) {
  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
  return ConvertTo(d_to, PromoteEvenTo(dw, v));
}
#else
// PPC: vec_doublee handles all 32-bit source types -> f64, even lanes.
template <class D, class V, class FromTypeTag>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::FloatTag , hwy::SizeTag<8> ,
                                   FromTypeTag , D , V v) {
  return VFromD<D>{vec_doublee(v.raw)};
}
#endif

// f32 -> i64 from even lanes.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag , hwy::SizeTag<8> ,
                                   hwy::FloatTag , D d_to, V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  // NaN lanes are zeroed first; see VsxF2INormalizeSrcVals.
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  // Rotate so the even-index lanes land where xvcvspsxds reads them.
  return VsxXvcvspsxds(Shuffle2103(normalized_v));
#else
  return VsxXvcvspsxds(normalized_v);
#endif
#else
  // Z14: go through f64, then convert.
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                       hwy::FloatTag(), df64, v));
#endif
}

// f32 -> u64 from even lanes.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag , hwy::SizeTag<8> ,
                                   hwy::FloatTag , D d_to, V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  return VsxXvcvspuxds(Shuffle2103(normalized_v));
#else
  return VsxXvcvspuxds(normalized_v);
#endif
#else
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                       hwy::FloatTag(), df64, v));
#endif
}

#if HWY_S390X_HAVE_Z14
// f32 -> f64 from odd lanes: rotate the odd lanes into even position and
// reuse the even-lane implementation.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag , hwy::SizeTag<8> ,
                                  hwy::FloatTag , D d_to, V v) {
  return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(),
                       d_to, V{vec_sld(v.raw, v.raw, 4)});
}
// i32/u32 -> f64 from odd lanes: widen, then convert.
template <class D, class V, class FromTypeTag, HWY_IF_UI32(TFromV<V>)>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag , hwy::SizeTag<8> ,
                                  FromTypeTag , D d_to, V v) {
  const Rebind<MakeWide<TFromV<V>>, decltype(d_to)> dw;
  return ConvertTo(d_to, PromoteOddTo(dw, v));
}
#else
// PPC: vec_doubleo handles all 32-bit source types -> f64, odd lanes.
template <class D, class V, class FromTypeTag>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::FloatTag , hwy::SizeTag<8> ,
                                  FromTypeTag , D , V v) {
  return VFromD<D>{vec_doubleo(v.raw)};
}
#endif

// f32 -> i64 from odd lanes.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag , hwy::SizeTag<8> ,
                                  hwy::FloatTag , D d_to, V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  return VsxXvcvspsxds(normalized_v);
#else
  // Rotate so the odd-index lanes land where xvcvspsxds reads them.
  return VsxXvcvspsxds(Shuffle0321(normalized_v));
#endif
#else
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                      hwy::FloatTag(), df64, v));
#endif
}

// f32 -> u64 from odd lanes.
template <class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag , hwy::SizeTag<8> ,
                                  hwy::FloatTag , D d_to, V v) {
#if !HWY_S390X_HAVE_Z14
  (void)d_to;
  const auto normalized_v = detail::VsxF2INormalizeSrcVals(v);
#if HWY_IS_LITTLE_ENDIAN
  return VsxXvcvspuxds(normalized_v);
#else
  return VsxXvcvspuxds(Shuffle0321(normalized_v));
#endif
#else
  const RebindToFloat<decltype(d_to)> df64;
  return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(),
                                      hwy::FloatTag(), df64, v));
#endif
}

}  // namespace detail
// Saturated integer demotion, signed -> unsigned (vec_packsu clamps
// negative inputs to zero).
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_SIGNED(FromT), HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D ,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packsu(v.raw, v.raw)};
}

// Saturated demotion, signed -> signed.
template <class D, typename FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D ,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}

// Saturated demotion, unsigned -> unsigned (unsigned vec_packs overload).
template <class D, typename FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE(FromT, sizeof(TFromD<D>) * 2)>
HWY_API VFromD<D> DemoteTo(D ,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_packs(v.raw, v.raw)};
}

// Demotions by a factor of 4 or more: narrow one step at a time.
template <class D, class FromT, HWY_IF_SIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_UNSIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeNarrow<FromT>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}

// Signed -> unsigned, multi-step: the first step saturates to the narrower
// *unsigned* intermediate so later steps stay in the unsigned domain.
template <class D, class FromT, HWY_IF_UNSIGNED_D(D), HWY_IF_SIGNED(FromT),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          hwy::EnableIf<(sizeof(FromT) >= sizeof(TFromD<D>) * 4)>* = nullptr>
HWY_API VFromD<D> DemoteTo(D d,
                           Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<MakeUnsigned<MakeNarrow<FromT>>, D> d2;
  return DemoteTo(d, DemoteTo(d2, v));
}
#if HWY_PPC_HAVE_9 && \
(HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp))
// f32 -> f16 demotion using Power9 conversion support.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
#if HWY_COMPILER_GCC_ACTUAL
  (void)df16;
  return VFromD<D>{vec_pack_to_short_fp32(v.raw, v.raw)};
#elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)
  // Clang: xvcvsphp yields the f16 bits inside u32 lanes; truncate to u16.
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint32_t, D> du;
  const VFromD<decltype(du)> bits16{
      reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))};
  return BitCast(df16, TruncateTo(du16, bits16));
#else
#error "Only define the function if we have a native implementation"
#endif
}
#endif
#if HWY_PPC_HAVE_9
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif
namespace detail {
// Wrapper for the scalar xscvdphp instruction: converts the f64 scalar held
// in the vector register to f16 bits. Callers select the doubleword that
// holds the result according to endianness.
static HWY_INLINE Vec128<uint64_t> VsxXscvdphp(Vec128<double> vf64) {
  __vector unsigned long long raw_result;
  __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw));
  return Vec128<uint64_t>{raw_result};
}
}  // namespace detail
// f64 -> f16 demotion, one lane: a single scalar conversion.
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint64_t, decltype(df16)> du64;
  const Full128<double> df64_full;
#if HWY_IS_LITTLE_ENDIAN
  // The scalar result lives in the doubleword selected here per endianness.
  const auto bits16_as_u64 =
      UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v)));
#else
  const auto bits16_as_u64 =
      LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v)));
#endif
  return BitCast(df16, TruncateTo(du16, bits16_as_u64));
}

// f64 -> f16 demotion, two lanes: two scalar conversions whose results are
// interleaved, then truncated to 16 bits.
template <class D, HWY_IF_F16_D(D), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const RebindToUnsigned<decltype(df16)> du16;
  const Rebind<uint64_t, decltype(df16)> du64;
  const Rebind<double, decltype(df16)> df64;
#if HWY_IS_LITTLE_ENDIAN
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(InterleaveLower(df64, v, v));
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64 =
      InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1);
#else
  const auto bits64_as_u64_0 = detail::VsxXscvdphp(v);
  const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v));
  const auto bits64_as_u64 =
      InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1);
#endif
  return BitCast(df16, TruncateTo(du16, bits64_as_u64));
}
#elif HWY_S390X_HAVE_Z14
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif
namespace detail {
// f64 -> f32 with round-to-odd (vledb rounding mode 3), which avoids
// double-rounding errors when the result is subsequently demoted to f16.
template <class DF32, HWY_IF_F32_D(DF32)>
static HWY_INLINE VFromD<DF32> DemoteToF32WithRoundToOdd(
    DF32 df32, VFromD<Rebind<double, DF32>> v) {
  const Twice<DF32> dt_f32;
  __vector float raw_f32_in_even;
  __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw));
  // Results occupy the even f32 lanes; compact them into the lower half.
  const VFromD<decltype(dt_f32)> f32_in_even{raw_f32_in_even};
  return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even));
}
}  // namespace detail

// f64 -> f16 via a round-to-odd f32 intermediate.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<float, decltype(df16)> df32;
  return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v));
}
#endif
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif
namespace detail {
// Wrapper for the Power10 xvcvspbf16 builtin: f32 -> bf16, returning the
// result bits in u32 lanes.
template <class D, HWY_IF_BF16_D(D)>
static HWY_INLINE VFromD<Rebind<uint32_t, D>> VsxXvcvspbf16(
    D dbf16, VFromD<Rebind<float, D>> v) {
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;
  using VU32 = __vector unsigned int;
  // The builtin takes/returns byte vectors, hence the casts.
  return VFromD<Rebind<uint32_t, D>>{reinterpret_cast<VU32>(
      __builtin_vsx_xvcvspbf16(BitCast(du32_as_du8, v).raw))};
}
}  // namespace detail

// f32 -> bf16 demotion using the native Power10 conversion.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  return BitCast(dbf16, TruncateTo(du16, detail::VsxXvcvspbf16(dbf16, v)));
}
#endif
// ReorderDemote2To: demotes two vectors into one narrower vector. Overloads
// below cover {signed->signed, signed->unsigned, unsigned->unsigned} for
// partial (<= 4 byte), half (8 byte) and full (16 byte) result vectors.

// Signed -> signed, partial: combine into one vector, then demote.
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}

// Signed -> signed, half vectors: pack, then compact the even 32-bit
// halves into the lower half of the result.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}

// Signed -> signed, full vectors: a single saturated pack.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_SIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN , V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}

// Signed -> unsigned, partial.
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
          HWY_IF_UNSIGNED_D(DN), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}

// Signed -> unsigned, half vectors (vec_packsu clamps negatives to zero).
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packsu(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}

// Signed -> unsigned, full vectors.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN , V a, V b) {
  return VFromD<DN>{vec_packsu(a.raw, b.raw)};
}

// Unsigned -> unsigned, partial.
template <class DN, typename V, HWY_IF_V_SIZE_LE_D(DN, 4),
          HWY_IF_UNSIGNED_D(DN), HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return DemoteTo(dn, Combine(dt, b, a));
}

// Unsigned -> unsigned, half vectors (unsigned vec_packs overload).
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 8), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const Twice<decltype(dn)> dn_full;
  const Repartition<uint32_t, decltype(dn_full)> du32_full;
  const VFromD<decltype(dn_full)> v_full{vec_packs(a.raw, b.raw)};
  const auto vu32_full = BitCast(du32_full, v_full);
  return LowerHalf(
      BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full)));
}

// Unsigned -> unsigned, full vectors.
template <class DN, typename V, HWY_IF_V_SIZE_D(DN, 16), HWY_IF_UNSIGNED_D(DN),
          HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_D(DN, (1 << 1) | (1 << 2) | (1 << 4)),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2)>
HWY_API VFromD<DN> ReorderDemote2To(DN , V a, V b) {
  return VFromD<DN>{vec_packs(a.raw, b.raw)};
}
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
// f32 x2 -> bf16: convert each input with the native Power10 instruction,
// then truncate-and-concatenate the u32 results into u16 lanes.
template <class D, class V, HWY_IF_BF16_D(D), HWY_IF_F32(TFromV<V>),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V) * 2)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, V a, V b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Half<decltype(dbf16)> dh_bf16;
  return BitCast(dbf16,
                 OrderedTruncate2To(du16, detail::VsxXvcvspbf16(dh_bf16, a),
                                    detail::VsxXvcvspbf16(dh_bf16, b)));
}
#endif
// OrderedDemote2To for integers: on this target ReorderDemote2To already
// yields the ordered lane arrangement, so simply forward.
template <class D, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), class V,
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<D>) * 2),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}
#if HWY_PPC_HAVE_10 && HWY_HAS_BUILTIN(__builtin_vsx_xvcvspbf16)
// OrderedDemote2To for bf16: the bf16 ReorderDemote2To uses an ordered
// truncation, so forwarding preserves lane order.
template <class D, HWY_IF_BF16_D(D), class V, HWY_IF_F32(TFromV<V>),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}
#endif
// f64 -> f32 demotion of a single lane.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D , Vec64<double> v) {
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 1000
  // Old GCC on Z lacks vec_floate; use the vflrd builtin directly.
  return Vec32<float>{__builtin_s390_vflrd(v.raw, 0, 0)};
#else
  return Vec32<float>{vec_floate(v.raw)};
#endif
}
// f64 -> f32 demotion of two lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D d, Vec128<double> v) {
#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \
    HWY_COMPILER_GCC_ACTUAL < 1000
  const Vec128<float> f64_to_f32{__builtin_s390_vflrd(v.raw, 0, 0)};
#elif HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN
  const Vec128<float> f64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> f64_to_f32{vec_floato(v.raw)};
#endif
#if HWY_S390X_HAVE_Z14
  // Results are in the even f32 lanes; compact them into the lower half.
  const Twice<decltype(d)> dt;
  return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32));
#else
  // Gather the per-64-bit-lane results via a 64 -> 32 bit truncation.
  const RebindToUnsigned<D> du;
  const Rebind<uint64_t, D> du64;
  return Vec64<float>{
      BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw};
#endif
}
// f64 -> i32 demotion, single lane.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D di32, Vec64<double> v) {
#if HWY_S390X_HAVE_Z14
  // Z14: convert to i64 first, then narrow with saturation.
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, ConvertTo(di64, v));
#else
  (void)di32;
  // NaN lanes are zeroed first; see VsxF2INormalizeSrcVals.
  return Vec32<int32_t>{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
}

// f64 -> i32 demotion, two lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D di32, Vec128<double> v) {
#if HWY_S390X_HAVE_Z14
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, ConvertTo(di64, v));
#else
  (void)di32;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<int32_t> f64_to_i32{
      vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)};
#else
  const Vec128<int32_t> f64_to_i32{
      vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
  // Pack the two converted lanes into the lower half of the result.
  const Rebind<int64_t, D> di64;
  const Vec128<int64_t> vi64 = BitCast(di64, f64_to_i32);
  return Vec64<int32_t>{vec_pack(vi64.raw, vi64.raw)};
#endif
}
// f64 -> u32 demotion, single lane.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D du32, Vec64<double> v) {
#if HWY_S390X_HAVE_Z14
  // Z14: convert to u64 first, then narrow with saturation.
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, ConvertTo(du64, v));
#else
  (void)du32;
  // NaN lanes are zeroed first; see VsxF2INormalizeSrcVals.
  return Vec32<uint32_t>{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
}

// f64 -> u32 demotion, two lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D du32, Vec128<double> v) {
#if HWY_S390X_HAVE_Z14
  const Rebind<uint64_t, decltype(du32)> du64;
  return DemoteTo(du32, ConvertTo(du64, v));
#else
  (void)du32;
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<uint32_t> f64_to_u32{
      vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)};
#else
  const Vec128<uint32_t> f64_to_u32{
      vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)};
#endif
  // Pack the two converted lanes into the lower half of the result.
  const Rebind<uint64_t, D> du64;
  const Vec128<uint64_t> vu64 = BitCast(du64, f64_to_u32);
  return Vec64<uint32_t>{vec_pack(vu64.raw, vu64.raw)};
#endif
}
#if HWY_S390X_HAVE_Z14
namespace detail {

// Fix: `} #endif` was on one line, so the `#endif` was not a preprocessor
// directive and the `#if` above was left unterminated; now split.

// i64 -> f64 using round-to-odd (vcdgb rounding mode 3), which prevents
// double rounding when the result is further demoted to f32.
template <class V, HWY_IF_I64(TFromV<V>)>
HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
  __vector double raw_result;
  __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
}

// u64 -> f64 with round-to-odd (vcdlgb).
template <class V, HWY_IF_U64(TFromV<V>)>
HWY_INLINE VFromD<RebindToFloat<DFromV<V>>> ConvToF64WithRoundToOdd(V v) {
  __vector double raw_result;
  __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<RebindToFloat<DFromV<V>>>{raw_result};
}

}  // namespace detail
#endif  // HWY_S390X_HAVE_Z14
// i64 -> f32 demotion, single lane.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D df32, Vec64<int64_t> v) {
#if HWY_S390X_HAVE_Z14
  // Round-to-odd i64 -> f64 first avoids double rounding in f64 -> f32.
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else
  (void)df32;
  return Vec32<float>{vec_floate(v.raw)};
#endif
}

// i64 -> f32 demotion, two lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D df32, Vec128<int64_t> v) {
#if HWY_S390X_HAVE_Z14
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<float> i64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> i64_to_f32{vec_floato(v.raw)};
#endif
  // Gather the per-64-bit-lane results via a 64 -> 32 bit truncation.
  const RebindToUnsigned<decltype(df32)> du32;
  const Rebind<uint64_t, decltype(df32)> du64;
  return Vec64<float>{
      BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw};
#endif
}
// u64 -> f32 demotion, single lane.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D df32, Vec64<uint64_t> v) {
#if HWY_S390X_HAVE_Z14
  // Round-to-odd u64 -> f64 first avoids double rounding in f64 -> f32.
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else
  (void)df32;
  return Vec32<float>{vec_floate(v.raw)};
#endif
}

// u64 -> f32 demotion, two lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D df32, Vec128<uint64_t> v) {
#if HWY_S390X_HAVE_Z14
  return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v));
#else
#if HWY_IS_LITTLE_ENDIAN
  const Vec128<float> u64_to_f32{vec_floate(v.raw)};
#else
  const Vec128<float> u64_to_f32{vec_floato(v.raw)};
#endif
  // Gather the per-64-bit-lane results via a 64 -> 32 bit truncation.
  const RebindToUnsigned<decltype(df32)> du;
  const Rebind<uint64_t, decltype(df32)> du64;
  return Vec64<float>{
      BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw};
#endif
}
// Narrows each u32 lane to u8 by truncation (keeps the low byte).
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  const DFromV<decltype(v)> d32;
  const Rebind<uint16_t, decltype(d32)> d16;
  const Rebind<uint8_t, decltype(d32)> d8;
  // Two truncation steps: 32 -> 16 bits, then 16 -> 8 bits.
  const auto half_width = TruncateTo(d16, v);
  return TruncateTo(d8, half_width);
}
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 (no native i32/u32 -> f32): convert via f64 and demote.
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D df32,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  const Rebind<double, decltype(df32)> df64;
  return DemoteTo(df32, PromoteTo(df64, v));
}

// Full vector: convert each half via f64, then concatenate the even lanes.
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D df32, Vec128<FromT> v) {
  const RepartitionToWide<decltype(df32)> df64;
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000
  // Old GCC on Z lacks vec_floate; use the vflrd builtin directly.
  const VFromD<D> vf32_lo{
      __builtin_s390_vflrd(PromoteLowerTo(df64, v).raw, 0, 0)};
  const VFromD<D> vf32_hi{
      __builtin_s390_vflrd(PromoteUpperTo(df64, v).raw, 0, 0)};
#else
  const VFromD<D> vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)};
  const VFromD<D> vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)};
#endif
  return ConcatEven(df32, vf32_hi, vf32_lo);
}
#else
// Native i32/u32 -> f32 conversion.
template <class D, typename FromT, HWY_IF_F32_D(D), HWY_IF_UI32(FromT)>
HWY_API VFromD<D> ConvertTo(D ,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
#if HWY_S390X_HAVE_Z15
  return VFromD<D>{vec_float(v.raw)};
#else
  // vec_ctf with scale 0 (no fractional bits).
  return VFromD<D>{vec_ctf(v.raw, 0)};
#endif
  HWY_DIAGNOSTICS(pop)
}
#endif
// i64/u64 -> f64 conversion (vec_double has overloads for both).
template <class D, typename FromT, HWY_IF_F64_D(D), HWY_IF_NOT_FLOAT(FromT),
          HWY_IF_T_SIZE_D(D, sizeof(FromT))>
HWY_API VFromD<D> ConvertTo(D ,
                            Vec128<FromT, Rebind<FromT, D>().MaxLanes()> v) {
  return VFromD<D>{vec_double(v.raw)};
}
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// Z14 (no native f32 -> i32): convert via i64 and demote.
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D di32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const Rebind<int64_t, decltype(di32)> di64;
  return DemoteTo(di32, PromoteTo(di64, v));
}

// Full vector: convert each half via i64, then demote both in order.
template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D di32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const RepartitionToWide<decltype(di32)> di64;
  return OrderedDemote2To(di32, PromoteLowerTo(di64, v),
                          PromoteUpperTo(di64, v));
}
#else
// f32 -> i32 conversion with saturation.
// Fix: the closing brace was fused onto the `#endif` line (`#endif }`),
// which leaves the function unbalanced; the brace now has its own line.
template <class D, HWY_IF_I32_D(D)>
HWY_API VFromD<D> ConvertTo(D ,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // Fold compile-time-constant inputs, clamping to the i32 range; a NaN
  // lane fails every comparison here and therefore yields 0.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr int32_t kMinI32 = LimitsMin<int32_t>();
    constexpr int32_t kMaxI32 = LimitsMax<int32_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= -2147483648.0f)
            ? ((v.raw[0] < 2147483648.0f) ? static_cast<int32_t>(v.raw[0])
                                          : kMaxI32)
            : ((v.raw[0] < 0) ? kMinI32 : 0),
        (v.raw[1] >= -2147483648.0f)
            ? ((v.raw[1] < 2147483648.0f) ? static_cast<int32_t>(v.raw[1])
                                          : kMaxI32)
            : ((v.raw[1] < 0) ? kMinI32 : 0),
        (v.raw[2] >= -2147483648.0f)
            ? ((v.raw[2] < 2147483648.0f) ? static_cast<int32_t>(v.raw[2])
                                          : kMaxI32)
            : ((v.raw[2] < 0) ? kMinI32 : 0),
        (v.raw[3] >= -2147483648.0f)
            ? ((v.raw[3] < 2147483648.0f) ? static_cast<int32_t>(v.raw[3])
                                          : kMaxI32)
            : ((v.raw[3] < 0) ? kMinI32 : 0));
  }
#endif
#if HWY_S390X_HAVE_Z15
  // vcfeb with rounding-mode modifier 5 (round toward zero).
  __vector signed int raw_result;
  __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{raw_result};
#else
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
  return VFromD<D>{vec_cts(v.raw, 0)};
  HWY_DIAGNOSTICS(pop)
#endif
}
#endif
// f64 -> i64 conversion with saturation (round toward zero).
template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> ConvertTo(D ,
                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
  // Fold compile-time-constant inputs, clamping to the i64 range; a NaN
  // lane fails every comparison here and therefore yields 0.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr int64_t kMinI64 = LimitsMin<int64_t>();
    constexpr int64_t kMaxI64 = LimitsMax<int64_t>();
    return Dup128VecFromValues(D(),
                               (v.raw[0] >= -9223372036854775808.0)
                                   ? ((v.raw[0] < 9223372036854775808.0)
                                          ? static_cast<int64_t>(v.raw[0])
                                          : kMaxI64)
                                   : ((v.raw[0] < 0) ? kMinI64 : 0LL),
                               (v.raw[1] >= -9223372036854775808.0)
                                   ? ((v.raw[1] < 9223372036854775808.0)
                                          ? static_cast<int64_t>(v.raw[1])
                                          : kMaxI64)
                                   : ((v.raw[1] < 0) ? kMinI64 : 0LL));
  }
#endif
  __vector signed long long raw_result;
#if HWY_S390X_HAVE_Z14
  __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#else
  // NaN lanes are zeroed first (QEMU workaround), then converted.
  __asm__("xvcvdpsxds %x0,%x1"
          : "=wa"(raw_result)
          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
#endif
  return VFromD<D>{raw_result};
}
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15
// f32 -> u32 for partial vectors on Z14 (no native f32 conversion): widen
// each lane to u64, then narrow back down to u32.
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ConvertTo(D du32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const Rebind<uint64_t, D> du64;
  const auto widened = PromoteTo(du64, v);
  return DemoteTo(du32, widened);
}
// f32 -> u32 for full vectors on Z14: convert each half through u64 and
// re-pack the halves in lane order.
template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ConvertTo(D du32,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
  const RepartitionToWide<decltype(du32)> du64;
  const auto lower_half = PromoteLowerTo(du64, v);
  const auto upper_half = PromoteUpperTo(du64, v);
  return OrderedDemote2To(du32, lower_half, upper_half);
}
#else
// f32 -> u32 conversion with saturation: negative values and NaN become 0,
// values >= 2^32 clamp to kMaxU32.
template <class D, HWY_IF_U32_D(D)>
HWY_API VFromD<D> ConvertTo(D /*du32*/,
                            Vec128<float, Rebind<float, D>().MaxLanes()> v) {
#if defined(__OPTIMIZE__)
  // Constant-fold when the compiler can see the lane values.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr uint32_t kMaxU32 = LimitsMax<uint32_t>();
    // 4294967296.0f == 2^32. NaN fails the >= compare and maps to 0.
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= 0.0f)
            ? ((v.raw[0] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[0])
                                          : kMaxU32)
            : 0,
        (v.raw[1] >= 0.0f)
            ? ((v.raw[1] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[1])
                                          : kMaxU32)
            : 0,
        (v.raw[2] >= 0.0f)
            ? ((v.raw[2] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[2])
                                          : kMaxU32)
            : 0,
        (v.raw[3] >= 0.0f)
            ? ((v.raw[3] < 4294967296.0f) ? static_cast<uint32_t>(v.raw[3])
                                          : kMaxU32)
            : 0);
  }
#endif
#if HWY_S390X_HAVE_Z15
  // z/Arch VCLFEB: f32 -> u32, rounding mode 5 = toward zero.
  __vector unsigned int raw_result;
  __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
  return VFromD<D>{raw_result};
#else
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
  VFromD<D> result{vec_ctu(v.raw, 0)};
  HWY_DIAGNOSTICS(pop)
  return result;
#endif
  // Fix: the closing brace was previously on the same line as "#endif" and
  // therefore consumed by the preprocessor, leaving the function unterminated.
}
#endif
// f64 -> u64 conversion with saturation: negative values and NaN become 0,
// values >= 2^64 clamp to kMaxU64.
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> ConvertTo(D /*du64*/,
                            Vec128<double, Rebind<double, D>().MaxLanes()> v) {
  HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_CLANG
  HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all")
#endif
#if defined(__OPTIMIZE__) && (!HWY_COMPILER_CLANG || !HWY_S390X_HAVE_Z14)
  // Constant-fold when the compiler can see the lane values.
  // 18446744073709551616.0 == 2^64.
  if (detail::IsConstantRawAltivecVect(v.raw)) {
    constexpr uint64_t kMaxU64 = LimitsMax<uint64_t>();
    return Dup128VecFromValues(
        D(),
        (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0)
                                 ? static_cast<uint64_t>(v.raw[0])
                                 : kMaxU64)
                          : 0,
        (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0)
                                 ? static_cast<uint64_t>(v.raw[1])
                                 : kMaxU64)
                          : 0);
  }
#endif
  __vector unsigned long long raw_result;
#if HWY_S390X_HAVE_Z14
  // z/Arch VCLGDB: f64 -> u64, rounding mode 5 = toward zero.
  __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw));
#else
  // VSX xvcvdpuxds converts with saturation; VsxF2INormalizeSrcVals
  // presumably canonicalizes problematic inputs (e.g. NaN) first — confirm.
  __asm__("xvcvdpuxds %x0,%x1"
          : "=wa"(raw_result)
          : "wa"(detail::VsxF2INormalizeSrcVals(v).raw));
#endif
  // Fix: balance the HWY_DIAGNOSTICS(push) above; the pop was previously
  // missing, leaking the warning suppression into the rest of the TU.
  HWY_DIAGNOSTICS(pop)
  return VFromD<D>{raw_result};
}
// Rounds each f32 lane to the nearest integer value (still as f32).
template <size_t N>
HWY_API Vec128<float, N> Round(Vec128<float, N> v) {
  const auto rounded = vec_round(v.raw);
  return Vec128<float, N>{rounded};
}
// Rounds each f64 lane to the nearest integer value (still as f64).
template <size_t N>
HWY_API Vec128<double, N> Round(Vec128<double, N> v) {
#if HWY_S390X_HAVE_Z14
  return Vec128<double, N>{vec_round(v.raw)};
#else
  // NOTE(review): VSX uses vec_rint here (current rounding mode, default
  // round-to-nearest-even), presumably because vec_round on f64 differs in
  // tie handling — confirm against upstream.
  return Vec128<double, N>{vec_rint(v.raw)};
#endif
}
// Rounds to nearest, then converts to the same-width signed integer type.
template <typename T, size_t N, HWY_IF_FLOAT3264(T)>
HWY_API Vec128<MakeSigned<T>, N> NearestInt(Vec128<T, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;
  const Vec128<T, N> rounded = Round(v);
  return ConvertTo(di, rounded);
}
// Rounds f64 lanes to nearest, then narrows them to i32.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
                                        VFromD<Rebind<double, DI32>> v) {
  const VFromD<Rebind<double, DI32>> rounded = Round(v);
  return DemoteTo(di32, rounded);
}
// Rounds each lane toward zero.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
  const auto truncated = vec_trunc(v.raw);
  return Vec128<T, N>{truncated};
}
// Rounds each lane toward positive infinity.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Ceil(Vec128<T, N> v) {
  const auto raised = vec_ceil(v.raw);
  return Vec128<T, N>{raised};
}
// Rounds each lane toward negative infinity.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Floor(Vec128<T, N> v) {
  const auto lowered = vec_floor(v.raw);
  return Vec128<T, N>{lowered};
}
// Returns a mask of lanes that are NaN. NaN is the only value that compares
// unequal to itself, so self-inequality detects it.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  return v != v;
}
// Returns a mask of lanes that are +/- infinity.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsInf(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  using TU = MakeUnsigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  // Adding the value to itself shifts out the sign bit; infinities then
  // compare equal to the doubled max-exponent pattern.
  const VFromD<decltype(du)> bits = BitCast(du, v);
  const VFromD<decltype(du)> doubled = Add(bits, bits);
  const VFromD<decltype(du)> exp2 =
      Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()));
  return RebindMask(d, Eq(doubled, exp2));
}
// Returns a mask of lanes that are neither NaN nor infinity.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsFinite(Vec128<T, N> v) {
  static_assert(IsFloat<T>(), "Only for float");
  using TU = MakeUnsigned<T>;
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  // Doubling shifts out the sign bit; finite values stay strictly below the
  // doubled max-exponent pattern (NaN/inf reach or exceed it).
  const VFromD<decltype(du)> bits = BitCast(du, v);
  const VFromD<decltype(du)> doubled = Add(bits, bits);
  const VFromD<decltype(du)> exp2 =
      Set(du, static_cast<TU>(hwy::MaxExponentTimes2<T>()));
  return RebindMask(d, Lt(doubled, exp2));
}
#if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO)
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif
namespace detail {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600
using CipherTag = Full128<uint64_t>;
#else
using CipherTag = Full128<uint8_t>;
#endif using CipherVec = VFromD<CipherTag>;
}
// One AES encryption round on a 128-bit state with the given round key.
HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  // vec_cipher_be operates in big-endian byte order; reverse inputs and the
  // output so the result matches the lane order callers expect on LE.
  return Reverse(du8,
                 BitCast(du8, detail::CipherVec{vec_cipher_be(
                                  BitCast(dc, Reverse(du8, state)).raw,
                                  BitCast(dc, Reverse(du8, round_key)).raw)}));
#else
  return BitCast(du8, detail::CipherVec{vec_cipher_be(
                          BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
#endif
}
// Final AES encryption round (no MixColumns) with the given round key.
HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  // vec_cipherlast_be is big-endian; reverse in/out on little-endian.
  return Reverse(du8,
                 BitCast(du8, detail::CipherVec{vec_cipherlast_be(
                                  BitCast(dc, Reverse(du8, state)).raw,
                                  BitCast(dc, Reverse(du8, round_key)).raw)}));
#else
  return BitCast(du8, detail::CipherVec{vec_cipherlast_be(
                          BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
#endif
}
// One AES decryption round. vec_ncipher_be adds its key before the inverse
// steps, so it is called with a zero key and the round key is XORed after to
// match the AESNI-style ordering callers expect.
HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
                                    Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  // vec_ncipher_be is big-endian; reverse in/out on little-endian.
  return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be(
                              BitCast(dc, Reverse(du8, state)).raw,
                              Zero(dc).raw)})),
             round_key);
#else
  return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be(
                 BitCast(dc, state).raw, Zero(dc).raw)}),
             round_key);
#endif
}
// Final AES decryption round (no InvMixColumns) with the given round key.
HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
                                        Vec128<uint8_t> round_key) {
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
#if HWY_IS_LITTLE_ENDIAN
  // vec_ncipherlast_be is big-endian; reverse in/out on little-endian.
  return Reverse(du8,
                 BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
                                  BitCast(dc, Reverse(du8, state)).raw,
                                  BitCast(dc, Reverse(du8, round_key)).raw)}));
#else
  return BitCast(du8, detail::CipherVec{vec_ncipherlast_be(
                          BitCast(dc, state).raw, BitCast(dc, round_key).raw)});
#endif
}
// Applies only the InvMixColumns step to the state.
HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
  const Full128<uint8_t> d;
  const Vec128<uint8_t> no_key = Zero(d);
  // Composing AESLastRound and AESRoundInv with an all-zero round key leaves
  // (presumably by construction of the two rounds) just InvMixColumns.
  const Vec128<uint8_t> partial = AESLastRound(state, no_key);
  return AESRoundInv(partial, no_key);
}
// AES key-generation assist (equivalent of x86 AESKEYGENASSIST with round
// constant kRcon): SubBytes each word, rotate alternate words, XOR kRcon.
template <uint8_t kRcon>
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
  // XOR kRcon into bytes 4 and 12 (the rotated words).
  constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0,
                                                   0, 0, 0, 0, kRcon, 0, 0, 0};
  // Byte shuffle implementing RotWord on words 1 and 3 (and duplicating them
  // into words 0 and 2).
  constexpr __vector unsigned char kRotWordShuffle = {
      4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12};
  const detail::CipherTag dc;
  const Full128<uint8_t> du8;
  // vec_sbox_be applies the AES S-box to every byte.
  const auto sub_word_result =
      BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)});
  const auto rot_word_result =
      TableLookupBytes(sub_word_result, Vec128<uint8_t>{kRotWordShuffle});
  return Xor(rot_word_result, Vec128<uint8_t>{kRconXorMask});
}
// Carry-less (polynomial) multiply of the lower u64 lanes of a and b.
template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> d;
  const auto zero = Zero(d);
  using VU64 = __vector unsigned long long;
  // vec_pmsum_be sums the carry-less products of both lane pairs; pairing
  // each lower lane with zero leaves only the lower-lane product.
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  // Big-endian: the 128-bit product lands in the opposite halves; rotate by
  // 8 bytes to restore lane order.
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
// Carry-less (polynomial) multiply of the upper u64 lanes of a and b.
template <size_t N>
HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
                                       Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> d;
  const auto zero = Zero(d);
  using VU64 = __vector unsigned long long;
  // Pair each upper lane with zero so vec_pmsum_be's sum reduces to the
  // single upper-lane carry-less product.
  const VU64 pmsum_result = reinterpret_cast<VU64>(
      vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));
#if HWY_IS_LITTLE_ENDIAN
  return Vec128<uint64_t, N>{pmsum_result};
#else
  // Big-endian: rotate by 8 bytes to restore lane order.
  return Vec128<uint64_t, N>{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
#endif
namespace detail {
// Expands the low bits of mask_bits into a lane mask (1-byte lanes).
// Fix applied to all four overloads below: the closing brace was previously
// on the same line as "#endif" and therefore consumed by the preprocessor,
// leaving each function unterminated.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  // Power10: vec_genbm expands a bitmask directly into a byte mask.
  const Vec128<uint8_t> mask_vec{vec_genbm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint8_t>(), mask_vec)).raw};
#endif
#else
  // Broadcast the 16 mask bits, replicate each byte across its 8 lanes, then
  // test the per-lane bit.
  const Full128<uint8_t> du8;
  const Full128<uint16_t> du16;
  const Vec128<uint8_t> vbits =
      BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0,
                                        1, 1, 1, 1, 1, 1, 1, 1};
#else
  const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1,
                                        0, 0, 0, 0, 0, 0, 0, 0};
#endif
  const Vec128<uint8_t> rep8{vec_perm(vbits.raw, vbits.raw, kRep8)};
  const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128,
                                       1, 2, 4, 8, 16, 32, 64, 128};
  return MFromD<D>{TestBit(rep8, Vec128<uint8_t>{kBit}).raw};
#endif
}
// 2-byte lanes.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  const Vec128<uint16_t> mask_vec{vec_genhm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint16_t>(), mask_vec)).raw};
#endif
#else
  // Broadcast the mask bits to all lanes and test each lane's bit.
  const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits =
      Set(Full128<uint16_t>(), static_cast<uint16_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint16_t>{kBit}).raw};
#endif
}
// 4-byte lanes.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  const Vec128<uint32_t> mask_vec{vec_genwm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint32_t>(), mask_vec)).raw};
#endif
#else
  const __vector unsigned int kBit = {1, 2, 4, 8};
  const auto vmask_bits =
      Set(Full128<uint32_t>(), static_cast<uint32_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint32_t>{kBit}).raw};
#endif
}
// 8-byte lanes.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE MFromD<D> LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
  const Vec128<uint64_t> mask_vec{vec_gendm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
  return MFromD<D>{MaskFromVec(mask_vec).raw};
#else
  return MFromD<D>{MaskFromVec(Reverse(Full128<uint64_t>(), mask_vec)).raw};
#endif
#else
  const __vector unsigned long long kBit = {1, 2};
  const auto vmask_bits =
      Set(Full128<uint64_t>(), static_cast<uint64_t>(mask_bits));
  return MFromD<D>{TestBit(vmask_bits, Vec128<uint64_t>{kBit}).raw};
#endif
}
}  // namespace detail
// Loads up to 8 mask bits from the first byte of `bits`.
template <class D, HWY_IF_LANES_LE_D(D, 8)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t kN = MaxLanes(d);
  uint64_t mask_bits = bits[0];
  // Clear bits past the lane count so inactive lanes stay false.
  if (kN < 8) mask_bits &= (1u << kN) - 1;
  return detail::LoadMaskBits128(d, mask_bits);
}
// Loads 16 mask bits (two bytes, little-endian bit order) from `bits`.
template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  uint16_t u16_mask_bits;
  CopyBytes<sizeof(uint16_t)>(bits, &u16_mask_bits);
#if HWY_IS_LITTLE_ENDIAN
  return detail::LoadMaskBits128(d, u16_mask_bits);
#else
  // Big-endian: the serialized format is little-endian, so byte-swap first.
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits));
#else
  return detail::LoadMaskBits128(
      d, static_cast<uint16_t>((u16_mask_bits << 8) | (u16_mask_bits >> 8)));
#endif
#endif
}
// Whether Compress returns kept lanes followed by the rejected lanes (a
// partition). True for all lane sizes except 1 byte on this target.
template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};
// Builds a mask from the low MaxLanes(d) bits of mask_bits.
template <class D>
HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  // Ignore bits beyond the lane count.
  const unsigned valid_bits =
      (kN < 8) ? (mask_bits & ((1u << kN) - 1)) : mask_bits;
  return detail::LoadMaskBits128(d, valid_bits);
}
namespace detail {
// Masks off bits beyond the partial vector's lane count; full vectors pass
// through unchanged.
template <class D>
constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) {
  return (d.MaxBytes() == 16) ? mask_bits
                              : mask_bits & ((1ull << d.MaxLanes()) - 1);
}
#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN
// Gathers one bit per lane using bit-permute; bit_shuffle holds, for each
// result bit, the index of the source bit within the 128-bit vector (128
// selects a zero bit).
template <size_t N>
HWY_INLINE uint64_t ExtractSignBits(Vec128<uint8_t, N> sign_bits,
                                    __vector unsigned char bit_shuffle) {
  using VU64 = detail::Raw128<uint64_t>::type;
#if HWY_S390X_HAVE_Z14
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_bperm_u128(sign_bits.raw, bit_shuffle))};
#else
  const Vec128<uint64_t> extracted{
      reinterpret_cast<VU64>(vec_vbpermq(sign_bits.raw, bit_shuffle))};
#endif
  // The permute result lands in a different u64 half depending on endianness.
  return extracted.raw[HWY_IS_LITTLE_ENDIAN];
}
#endif
}  // namespace detail
// Returns a bitmask with bit i set iff lane i of `mask` is true (1-byte
// lanes). Fix: the closing brace was previously on the same line as "#endif"
// and therefore consumed by the preprocessor.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  // Power10: vec_extractm gathers the MSB of each byte in one instruction.
  return detail::OnlyActive(d,
                            static_cast<uint64_t>(vec_extractm(sign_bits.raw)));
#else
  // Bit-permute: indices select the sign bit of each byte within the vector.
  const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64,
                                              56, 48, 40, 32, 24, 16, 8, 0};
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif
}
// Returns a bitmask with bit i set iff lane i of `mask` is true (2-byte
// lanes). Fix: the closing brace was previously on the same line as "#endif"
// and therefore consumed by the preprocessor.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  // Power10: vec_extractm gathers the MSB of each u16 lane.
  return detail::OnlyActive(
      d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
#else
  (void)du;
  // Bit-permute indices: one sign bit per 16-bit lane; 128 selects zero.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {
      112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {
      128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0};
#endif
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif
}
// Returns a bitmask with bit i set iff lane i of `mask` is true (4-byte
// lanes). Fix: the closing brace was previously on the same line as "#endif"
// and therefore consumed by the preprocessor.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  // Power10: vec_extractm gathers the MSB of each u32 lane.
  return detail::OnlyActive(
      d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
#else
  (void)du;
  // Bit-permute indices: one sign bit per 32-bit lane; 128 selects zero.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              96, 64, 32, 0};
#endif
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif
}
// Returns a bitmask with bit i set iff lane i of `mask` is true (8-byte
// lanes). Fix: the closing brace was previously on the same line as "#endif"
// and therefore consumed by the preprocessor.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> sign_bits = BitCast(du8, VecFromMask(d, mask));
#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN
  // Power10: vec_extractm gathers the MSB of each u64 lane.
  return detail::OnlyActive(
      d, static_cast<uint64_t>(vec_extractm(BitCast(du, sign_bits).raw)));
#else
  (void)du;
  // Bit-permute indices: one sign bit per 64-bit lane; 128 selects zero.
#if HWY_IS_LITTLE_ENDIAN
  const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128};
#else
  const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128,
                                              128, 128, 128, 128, 128, 128,
                                              128, 128, 64, 0};
#endif
  return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle));
#endif
}
// Stores up to 8 mask bits into one byte; returns the number of bytes written.
template <class D, HWY_IF_LANES_LE_D(D, 8)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  const uint64_t mask_bits = BitsFromMask(d, mask);
  bits[0] = static_cast<uint8_t>(mask_bits);
  return sizeof(uint8_t);
}
// Stores 16 mask bits as two bytes (little-endian bit order); returns the
// number of bytes written.
template <class D, HWY_IF_LANES_D(D, 16)>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  const auto mask_bits = BitsFromMask(d, mask);
#if HWY_IS_LITTLE_ENDIAN
  const uint16_t u16_mask_bits = static_cast<uint16_t>(mask_bits);
#else
  // Big-endian: byte-swap so the serialized format stays little-endian.
#if HWY_HAS_BUILTIN(__builtin_bswap16)
  const uint16_t u16_mask_bits =
      __builtin_bswap16(static_cast<uint16_t>(mask_bits));
#else
  const uint16_t u16_mask_bits = static_cast<uint16_t>(
      (mask_bits << 8) | (static_cast<uint16_t>(mask_bits) >> 8));
#endif
#endif
  CopyBytes<sizeof(uint16_t)>(&u16_mask_bits, bits);
  return sizeof(uint16_t);
}
// Returns true iff no lane of the full-vector mask is set.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API bool AllFalse(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  // Compare the mask lanes (viewed as unsigned) against all-zero.
  const auto mask_lanes = VecFromMask(du, RebindMask(du, mask));
  return static_cast<bool>(vec_all_eq(mask_lanes.raw, Zero(du).raw));
}
// Returns true iff every lane of the full-vector mask is set.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API bool AllTrue(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  // A true mask lane is all ones, i.e. the unsigned maximum.
  const auto mask_lanes = VecFromMask(du, RebindMask(du, mask));
  const auto all_ones = Set(du, hwy::LimitsMax<TU>());
  return static_cast<bool>(vec_all_eq(mask_lanes.raw, all_ones.raw));
}
// Partial vectors: force lanes past the lane count to false, then test the
// full vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API bool AllFalse(D d, MFromD<D> mask) {
  const Full128<TFromD<D>> d_full;
  constexpr size_t kN = MaxLanes(d);
  const MFromD<decltype(d_full)> full_mask{mask.raw};
  return AllFalse(d_full, And(full_mask, FirstN(d_full, kN)));
}
// Partial vectors: force lanes past the lane count to true, then test the
// full vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API bool AllTrue(D d, MFromD<D> mask) {
  const Full128<TFromD<D>> d_full;
  constexpr size_t kN = MaxLanes(d);
  const MFromD<decltype(d_full)> full_mask{mask.raw};
  return AllTrue(d_full, Or(full_mask, Not(FirstN(d_full, kN))));
}
// Returns the number of set lanes in the mask.
template <class D>
HWY_API size_t CountTrue(D d, MFromD<D> mask) {
  const uint64_t bits = BitsFromMask(d, mask);
  return PopCount(bits);
}
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
namespace detail {
// Fix: the namespace's closing brace and "#endif" were previously merged
// onto one line ("} #endif"), which is ill-formed — a directive's '#' must
// be the first token on its line.
// Counts leading bytes (in lane order) whose LSB is clear.
template <class V>
static HWY_INLINE size_t VsxCntlzLsbb(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  // NOTE(review): this intentionally emits vctzlsbb (the "trailing" form) —
  // presumably because GCC < 12 on little-endian mishandles the lsbb
  // intrinsics' endianness, so the opposite instruction yields the intended
  // lane-order count. Confirm against upstream before "fixing" the mnemonic.
  int idx;
  __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  return static_cast<size_t>(vec_cntlz_lsbb(v.raw));
#endif
}
// Counts trailing bytes (in lane order) whose LSB is clear.
template <class V>
static HWY_INLINE size_t VsxCnttzLsbb(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \
    HWY_IS_LITTLE_ENDIAN
  // NOTE(review): intentionally emits vclzlsbb here — see VsxCntlzLsbb.
  int idx;
  __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw));
  return static_cast<size_t>(idx);
#else
  return static_cast<size_t>(vec_cnttz_lsbb(v.raw));
#endif
}
}  // namespace detail
#endif  // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
// Returns the index of the first true lane; precondition: at least one lane
// is true. Fix: the fallback "return" was previously on the same line as
// "#endif" and therefore consumed by the preprocessor, so the function fell
// off the end when the PPC9 path was disabled or not taken.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  if (detail::IsFull(d)) {
    // PPC9: count leading clear-LSB bytes directly from the mask bytes.
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    return detail::VsxCntlzLsbb(bytes) / sizeof(T);
  }
#endif
  return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask));
}
// Returns the index of the first true lane, or -1 if none. Fix: the fallback
// code was previously on the same line as "#endif" and therefore consumed by
// the preprocessor.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // kN means no set byte was found.
    const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T);
    return idx == kN ? -1 : static_cast<intptr_t>(idx);
  }
#endif
  const uint64_t mask_bits = BitsFromMask(d, mask);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
}
// Returns the index of the last true lane; precondition: at least one lane
// is true. Fix: the fallback "return" was previously on the same line as
// "#endif" and therefore consumed by the preprocessor.
template <class D, typename T = TFromD<D>>
HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  if (detail::IsFull(d)) {
    // Count from the opposite end, then convert to a forward index.
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
    return 16 / sizeof(T) - 1 - idx;
  }
#endif
  return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(d, mask));
}
// Returns the index of the last true lane, or -1 if none. Fix: the fallback
// code was previously on the same line as "#endif" and therefore consumed by
// the preprocessor.
template <class D, typename T = TFromD<D>>
HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN)
  constexpr size_t kN = 16 / sizeof(T);
  if (detail::IsFull(d)) {
    const Repartition<uint8_t, D> d8;
    const auto bytes = BitCast(d8, VecFromMask(d, mask));
    // kN means no set byte was found; otherwise convert to a forward index.
    const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T);
    return idx == kN ? -1 : static_cast<intptr_t>(kN - 1 - idx);
  }
#endif
  const uint64_t mask_bits = BitsFromMask(d, mask);
  return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits))
                   : -1;
}
namespace detail {
#if HWY_PPC_HAVE_10
// Power10: generates permute indices for Compress (kIsCompress) or Expand
// via xxgenpcvbm (1-byte lanes). The mode immediate selects compress vs
// expand and byte order.
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvbm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
// As above, for 2-byte lanes (xxgenpcvhm).
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvhm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
// As above, for 4-byte lanes (xxgenpcvwm).
template <bool kIsCompress, class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> CompressOrExpandIndicesFromMask(D d, MFromD<D> mask) {
  constexpr unsigned kGenPcvmMode =
      (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u);
  typename detail::Raw128<TFromD<D>>::type idx;
  __asm__("xxgenpcvwm %x0, %1, %2"
          : "=wa"(idx)
          : "v"(mask.raw), "i"(kGenPcvmMode));
  return VFromD<decltype(d)>{idx};
}
#endif
// Returns byte-shuffle indices that compress 16-bit lanes: lanes selected by
// mask_bits come first (in order), followed by the rejected lanes.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Rebind<uint8_t, decltype(d)> d8;
  const Twice<decltype(d8)> d8t;
  const RebindToUnsigned<decltype(d)> du;
  // 256 entries of 8 bytes: for each mask, the even byte offsets (0..14) of
  // the 8 lanes, permuted so selected lanes precede unselected ones.
  alignas(16) static constexpr uint8_t table[2048] = {
      0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, 
      2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, 
      4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, 
      2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
  // Load the 8 byte indices for this mask value.
  const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
  // Expand each byte index into a byte pair (2i, 2i+1) for the 16-bit lanes.
  const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
  constexpr uint16_t kPairIndexIncrement =
      HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
  return BitCast(d, pairs + Set(du, kPairIndexIncrement));
}
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 256);
const Rebind<uint8_t, decltype(d)> d8;
const Twice<decltype(d8)> d8t;
const RebindToUnsigned<decltype(d)> du;
alignas(16) static constexpr uint8_t table[2048] = {
0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 0, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 0, 2, 6, 8, 10, 12, 14, 4, 2, 6, 8, 10, 12, 14, 0, 4, 0, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 0, 2, 4, 8, 10, 12, 14, 6, 2, 4, 8, 10, 12, 14, 0, 6, 0, 4, 8, 10, 12, 14, 2, 6, 4, 8, 10, 12, 14, 0, 2, 6, 0, 2, 8, 10, 12, 14, 4, 6, 2, 8, 10, 12, 14, 0, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 0, 2, 4, 6, 10, 12, 14, 8, 2, 4, 6, 10, 12, 14, 0, 8, 0, 4, 6, 10, 12, 14, 2, 8, 4, 6, 10, 12, 14, 0, 2, 8, 0, 2, 6, 10, 12, 14, 4, 8, 2, 6, 10, 12, 14, 0, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, 6, 10, 12, 14, 0, 2, 4, 8, 0, 2, 4, 10, 12, 14, 6, 8, 2, 4, 10, 12, 14, 0, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, 4, 10, 12, 14, 0, 2, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, 2, 10, 12, 14, 0, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 12, 14, 10, 2, 4, 6, 8, 12, 14, 0, 10, 0, 4, 6, 8, 12, 14, 2, 10, 4, 6, 8, 12, 14, 0, 2, 10, 0, 2, 6, 8, 12, 14, 4, 10, 2, 6, 8, 12, 14, 0, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, 6, 8, 12, 14, 0, 2, 4, 10, 0, 2, 4, 8, 12, 14, 6, 10, 2, 4, 8, 12, 14, 0, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, 4, 8, 12, 14, 0, 2, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, 2, 8, 12, 14, 0, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, 8, 12, 14, 0, 2, 4, 6, 10, 0, 2, 4, 6, 12, 14, 8, 10, 2, 4, 6, 12, 14, 0, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, 4, 6, 12, 14, 0, 2, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, 2, 6, 12, 14, 0, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, 6, 12, 14, 0, 2, 4, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, 2, 4, 12, 14, 0, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, 4, 12, 14, 0, 2, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, 2, 12, 14, 0, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 0, 2, 4, 6, 8, 10, 14, 12, 2, 4, 6, 8, 10, 14, 0, 12, 0, 4, 6, 8, 10, 14, 2, 12, 4, 6, 8, 10, 14, 0, 2, 12, 0, 2, 6, 8, 10, 14, 4, 12, 2, 6, 8, 10, 14, 0, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, 6, 8, 10, 14, 0, 2, 4, 12, 0, 2, 4, 8, 10, 14, 6, 12, 2, 4, 8, 10, 14, 0, 6, 12, 
0, 4, 8, 10, 14, 2, 6, 12, 4, 8, 10, 14, 0, 2, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, 2, 8, 10, 14, 0, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, 8, 10, 14, 0, 2, 4, 6, 12, 0, 2, 4, 6, 10, 14, 8, 12, 2, 4, 6, 10, 14, 0, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, 4, 6, 10, 14, 0, 2, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, 2, 6, 10, 14, 0, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, 6, 10, 14, 0, 2, 4, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, 2, 4, 10, 14, 0, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, 4, 10, 14, 0, 2, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, 2, 10, 14, 0, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, 10, 14, 0, 2, 4, 6, 8, 12, 0, 2, 4, 6, 8, 14, 10, 12, 2, 4, 6, 8, 14, 0, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, 4, 6, 8, 14, 0, 2, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, 2, 6, 8, 14, 0, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, 6, 8, 14, 0, 2, 4, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, 2, 4, 8, 14, 0, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, 4, 8, 14, 0, 2, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, 2, 8, 14, 0, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, 8, 14, 0, 2, 4, 6, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, 2, 4, 6, 14, 0, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, 4, 6, 14, 0, 2, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, 2, 6, 14, 0, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, 6, 14, 0, 2, 4, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, 2, 4, 14, 0, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, 4, 14, 0, 2, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, 2, 14, 0, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 12, 0, 14, 0, 4, 6, 8, 10, 12, 2, 14, 4, 6, 8, 10, 12, 0, 2, 14, 0, 2, 6, 8, 10, 12, 4, 14, 2, 6, 8, 10, 12, 0, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, 6, 8, 10, 12, 0, 2, 4, 14, 0, 2, 4, 8, 10, 12, 6, 14, 2, 4, 8, 10, 12, 0, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, 4, 8, 10, 12, 0, 2, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, 2, 8, 10, 12, 0, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, 8, 10, 12, 0, 2, 4, 6, 14, 0, 2, 4, 6, 10, 12, 8, 14, 2, 4, 6, 10, 12, 0, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, 4, 6, 10, 12, 0, 2, 8, 14, 
0, 2, 6, 10, 12, 4, 8, 14, 2, 6, 10, 12, 0, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, 6, 10, 12, 0, 2, 4, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, 2, 4, 10, 12, 0, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, 4, 10, 12, 0, 2, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, 2, 10, 12, 0, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, 10, 12, 0, 2, 4, 6, 8, 14, 0, 2, 4, 6, 8, 12, 10, 14, 2, 4, 6, 8, 12, 0, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, 4, 6, 8, 12, 0, 2, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, 2, 6, 8, 12, 0, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, 6, 8, 12, 0, 2, 4, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, 2, 4, 8, 12, 0, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, 4, 8, 12, 0, 2, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, 2, 8, 12, 0, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, 8, 12, 0, 2, 4, 6, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, 2, 4, 6, 12, 0, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, 4, 6, 12, 0, 2, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, 2, 6, 12, 0, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, 6, 12, 0, 2, 4, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, 2, 4, 12, 0, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, 4, 12, 0, 2, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, 2, 12, 0, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, 12, 0, 2, 4, 6, 8, 10, 14, 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 10, 0, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, 4, 6, 8, 10, 0, 2, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, 2, 6, 8, 10, 0, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, 6, 8, 10, 0, 2, 4, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, 2, 4, 8, 10, 0, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, 4, 8, 10, 0, 2, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, 2, 8, 10, 0, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, 8, 10, 0, 2, 4, 6, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, 2, 4, 6, 10, 0, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, 4, 6, 10, 0, 2, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, 2, 6, 10, 0, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, 6, 10, 0, 2, 4, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, 2, 4, 10, 0, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, 4, 10, 0, 2, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, 2, 10, 0, 4, 6, 8, 12, 14, 
0, 10, 2, 4, 6, 8, 12, 14, 10, 0, 2, 4, 6, 8, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 8, 0, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, 4, 6, 8, 0, 2, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, 2, 6, 8, 0, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, 6, 8, 0, 2, 4, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, 2, 4, 8, 0, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, 4, 8, 0, 2, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, 2, 8, 0, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, 8, 0, 2, 4, 6, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 6, 0, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, 4, 6, 0, 2, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, 2, 6, 0, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, 6, 0, 2, 4, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 2, 4, 0, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, 4, 0, 2, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
const VFromD<decltype(d8t)> byte_idx{Load(d8, table + mask_bits * 8).raw};
const VFromD<decltype(du)> pairs = ZipLower(byte_idx, byte_idx);
constexpr uint16_t kPairIndexIncrement =
HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001;
return BitCast(d, pairs + Set(du, kPairIndexIncrement));
}
// Returns byte-shuffle indices (for TableLookupBytes) that move the 32-bit
// lanes whose bit is set in mask_bits (bit i = lane i) to the front of the
// vector, in ascending lane order; indices of the unselected lanes follow.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);  // 4 lanes => 16 possible masks
  // One 16-byte row of byte indices per mask value (16 rows x 16 bytes).
  alignas(16) static constexpr uint8_t u8_indices[256] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
// Same as IndicesFromBits128 above, but gathers the 32-bit lanes whose bit is
// CLEAR in mask_bits to the front (used by CompressNot to avoid inverting the
// mask).
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);  // 4 lanes => 16 possible masks
  // One 16-byte row of byte indices per mask value.
  alignas(16) static constexpr uint8_t u8_indices[256] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
      14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
      2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
      6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
      10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
      12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
// 64-bit lane variant: only 4 possible 2-bit masks, so 4 rows. Moves the
// lanes whose bit is set in mask_bits to the front.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> IndicesFromBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);  // 2 lanes => 4 possible masks
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
// 64-bit lane variant of IndicesFromNotBits128: moves the lanes whose bit is
// CLEAR in mask_bits to the front.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> IndicesFromNotBits128(D d, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);  // 2 lanes => 4 possible masks
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
// Packs the lanes whose bits are set in mask_bits toward the front of the
// vector, preserving their relative order.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < (1ull << N));
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  // Byte indices from the per-mask lookup table, then a byte shuffle.
  const auto byte_idx = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  return BitCast(d, TableLookupBytes(BitCast(du, v), byte_idx));
}
// Packs the lanes whose bits are CLEAR in mask_bits toward the front of the
// vector, preserving their relative order.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < (1ull << N));
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto byte_idx =
      BitCast(du, detail::IndicesFromNotBits128(d, mask_bits));
  return BitCast(d, TableLookupBytes(BitCast(du, v), byte_idx));
}
}  // namespace detail
// Single-lane vector: compression is the identity, regardless of the mask.
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
  return v;
}
// Two 64-bit lanes: the only case requiring movement is mask = (false, true),
// where the upper lane must move down. Detect exactly that case (lower lane
// clear AND upper lane set) and conditionally swap the two lanes.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
  const Full128<T> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);  // lower lane's mask in both lanes
  const Vec128<T> maskH = DupOdd(m);   // upper lane's mask in both lanes
  const Vec128<T> swap = AndNot(maskL, maskH);  // true iff (!lower && upper)
  return IfVecThenElse(swap, Shuffle01(v), v);
}
#if HWY_PPC_HAVE_10
// POWER10 provides native 8-bit compression; advertise it so generic_ops
// does not emulate it.
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif
// 1-byte lanes on POWER10: derive shuffle indices directly from the mask
// (<true> selects compress rather than expand indices), then shuffle.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  return TableLookupBytes(
      v, detail::CompressOrExpandIndicesFromMask<true>(d, mask));
}
#endif
// 2- and 4-byte lanes: convert the mask to a bit pattern and dispatch to the
// table-driven compression helper.
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const uint64_t mask_bits = BitsFromMask(d, mask);
  return detail::CompressBits(v, mask_bits);
}
// Single-lane vector: compression of the complement is also the identity.
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*mask*/) {
  return v;
}
// Two 64-bit lanes, inverted mask: swap only when mask = (true, false), i.e.
// the lower lane IS selected-for-removal and the upper lane is kept. Note the
// AndNot argument order is reversed relative to Compress above.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
  const Full128<T> d;
  const Vec128<T> m = VecFromMask(d, mask);
  const Vec128<T> maskL = DupEven(m);
  const Vec128<T> maskH = DupOdd(m);
  const Vec128<T> swap = AndNot(maskH, maskL);  // true iff (lower && !upper)
  return IfVecThenElse(swap, Shuffle01(v), v);
}
#if HWY_PPC_HAVE_10
// 1-byte lanes on POWER10: same as Compress, but with the mask inverted.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  return TableLookupBytes(
      v, detail::CompressOrExpandIndicesFromMask<true>(d, Not(mask)));
}
#endif
// 2- and 4-byte lanes, inverted mask.
template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  // For partial vectors, Not() of the mask is about as cheap as the dedicated
  // not-bits table lookup, so reuse the plain CompressBits path.
  if (N < 16 / sizeof(T)) {
    return detail::CompressBits(v, BitsFromMask(d, Not(mask)));
  }
  return detail::CompressNotBits(v, BitsFromMask(d, mask));
}
// A 128-bit vector is a single block, so there is nothing to move.
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                           Mask128<uint64_t> /*mask*/) {
  return v;
}
#if HWY_PPC_HAVE_10
// 1-byte lanes: reconstruct a mask from the packed bit array, then use the
// native Compress above.
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  const DFromV<decltype(v)> d;
  return Compress(v, LoadMaskBits(d, bits));
}
#endif
// Compresses using a packed bit array; for partial vectors only the low N
// bits of the first byte are meaningful.
template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
                                  const uint8_t* HWY_RESTRICT bits) {
  // For N >= 8 lanes, all 8 bits of the byte are valid.
  const uint64_t valid_bits = (N < 8) ? ((1ull << N) - 1) : 0xFFu;
  return detail::CompressBits(v, static_cast<uint64_t>(bits[0]) & valid_bits);
}
#if HWY_PPC_HAVE_10
// 1-byte lanes on POWER10: compress via native index generation, store all 16
// bytes unconditionally, return the number of selected lanes. Bytes past
// `count` are overwritten with unspecified lane values (callers must allow
// this, per the CompressStore contract).
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const size_t count = CountTrue(d, m);
  const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
  const auto compressed = TableLookupBytes(v, indices);
  StoreU(compressed, d, unaligned);
  return count;
}
#endif
// Stores the compressed vector (full vector width, lanes past the count are
// unspecified) and returns how many lanes were selected.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> m, D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = BitsFromMask(d, m);
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  const size_t count = PopCount(mask_bits);
  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  StoreU(compressed, d, unaligned);
  return count;
}
#if HWY_PPC_HAVE_10
// Blended variant: writes exactly `count` bytes (StoreN), leaving subsequent
// memory untouched.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const size_t count = CountTrue(d, m);
  const auto indices = detail::CompressOrExpandIndicesFromMask<true>(d, m);
  const auto compressed = TableLookupBytes(v, indices);
  StoreN(compressed, d, unaligned, count);
  return count;
}
#endif
// Blended variant: writes only the `count` compressed lanes; memory past them
// is left untouched.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = BitsFromMask(d, m);
  HWY_DASSERT(mask_bits < (1ull << MaxLanes(d)));
  const size_t count = PopCount(mask_bits);
  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
#if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14
  // POWER9/Z14 have efficient variable-length stores.
  StoreN(compressed, d, unaligned, count);
#else
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
#endif
  return count;
}
#if HWY_PPC_HAVE_10
// 1-byte lanes: decode the bit array into a mask and reuse CompressStore.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
}
#endif
// Compresses according to a packed bit array and stores the full vector
// (lanes past the count are unspecified); returns the number of valid lanes.
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  uint64_t mask_bits = bits[0];
  constexpr size_t kN = MaxLanes(d);
  // Partial vectors: ignore the bits beyond the vector's lane count.
  if (kN < 8) {
    mask_bits &= (1ull << kN) - 1;
  }
  const size_t count = PopCount(mask_bits);
  const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits));
  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
  StoreU(compressed, d, unaligned);
  return count;
}
#if HWY_PPC_HAVE_10
// POWER10 has native index generation for Expand; advertise it.
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif
// Expand: moves the i-th input lane to the position of the i-th set mask bit;
// lanes at unset positions are zeroed. <false> selects expand (not compress)
// index generation.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const auto idx = detail::CompressOrExpandIndicesFromMask<false>(d, mask);
  return IfThenElseZero(mask, TableLookupBytes(v, idx));
}
// Two 64-bit lanes: for this lane count, Compress with the same mask places
// lanes where Expand needs them; zero the unselected lanes afterwards.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  return IfThenElseZero(mask, Compress(v, mask));
}
// Single lane: just zero it when the mask bit is unset.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  return IfThenElseZero(mask, v);
}
// Loads a full vector then expands; no masked-load needed since LoadU reads
// the whole (<=16 byte) vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}
#endif
namespace detail {
// Helpers that reverse lanes within 64-bit / 128-bit blocks, but only on
// big-endian targets. On little-endian targets the lane order already matches
// the integer bit order these callers rely on, so they are no-ops.
#if HWY_IS_LITTLE_ENDIAN
template <class V>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  return v;
}
template <class V>
HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
  return v;
}
#else
// Big-endian: reverse all lanes of a 64-bit block (8/4/2 lanes for 1/2/4-byte
// lane sizes; a single 8-byte lane needs no reversal).
template <class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse8(d, v);
}
template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse4(d, v);
}
template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) {
  return v;
}
// Big-endian: reverse all lanes of the full 128-bit vector.
template <class V>
HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) {
  const DFromV<decltype(v)> d;
  return Reverse(d, v);
}
#endif
// Returns a - b, treating each 128-bit vector as a single 128-bit integer.
template <class V>
HWY_INLINE V I128Subtract(V a, V b) {
#if HWY_S390X_HAVE_Z14
#if HWY_COMPILER_CLANG
  // Clang on Z: use native __uint128_t vector arithmetic.
  typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      reinterpret_cast<VU128>(a.raw) - reinterpret_cast<VU128>(b.raw))};
#else
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw),
                   reinterpret_cast<__vector unsigned char>(b.raw)))};
#endif
#elif defined(__SIZEOF_INT128__)
  // PPC with 128-bit integer support: vec_sub on a 1x128-bit vector.
  using VU128 = __vector unsigned __int128;
  const V diff_i128{reinterpret_cast<typename detail::Raw128<TFromV<V>>::type>(
      vec_sub(reinterpret_cast<VU128>(a.raw), reinterpret_cast<VU128>(b.raw)))};
#else
  // Fallback: 64-bit limb subtraction with manual borrow propagation.
  const DFromV<decltype(a)> d;
  const Repartition<uint64_t, decltype(d)> du64;
  const auto u64_a = BitCast(du64, a);
  const auto u64_b = BitCast(du64, b);
  const auto diff_u64 = u64_a - u64_b;
  // All-ones in a lane iff that 64-bit limb subtraction borrowed.
  const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b);
  // Move the low limb's borrow into the high limb's lane (position depends on
  // endianness), where adding all-ones (-1) applies the borrow.
#if HWY_IS_LITTLE_ENDIAN
  const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64);
#else
  const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64);
#endif
  const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted);
#endif
  return diff_i128;
}
}  // namespace detail
// Single lane: the lane at-or-after the first set lane is the lane itself.
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}
// Two lanes: OR the mask with a copy of its lower lane broadcast upward
// (InterleaveLower duplicates lane 0 into lane 1).
template <class T>
HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const auto vmask = VecFromMask(d, mask);
  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
}
// Vectors of at most 64 bits: treat the mask as a 64-bit little-endian
// integer x; x | (-x) sets all bits at and above the lowest set bit, which is
// exactly "at or after the first set lane".
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Full64<T> d_full64;
  const auto vmask = VecFromMask(d, mask);
  // Reverse lanes on big-endian so lane 0 maps to the low bits of the i64.
  const auto vmask_le64 =
      BitCast(Full64<int64_t>(),
              detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
  const auto neg_vmask_le64 = Neg(vmask_le64);
  const auto neg_vmask = ResizeBitCast(
      d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
  return MaskFromVec(Or(vmask, neg_vmask));
}
// Full 128-bit vectors: same x | (-x) trick, but with 128-bit arithmetic via
// I128Subtract(0, x).
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  const Full128<T> d;
  auto vmask = VecFromMask(d, mask);
  const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
  const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
  const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
  return MaskFromVec(BitCast(d, Or(vmask, neg_vmask)));
}
// Lanes strictly before the first set lane = complement of at-or-after.
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}
// Single lane: the first set lane is the lane itself.
template <class T>
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  return mask;
}
// Two lanes: keep lane 1 set only if lane 0 is clear. vmask2 is all-ones in
// lane 0 always (compares a zero lane to zero) and all-ones in lane 1 iff
// lane 0 of the mask was clear.
template <class T>
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const RebindToSigned<decltype(d)> di;
  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto zero = Zero(di);
  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
}
// Vectors of at most 64 bits: x & (-x) isolates the lowest set bit of the
// mask viewed as a little-endian integer. The final Or with Neg widens that
// single bit back to a full all-ones lane.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const Full64<T> d_full64;
  const RebindToSigned<decltype(d)> di;
  const auto vmask = VecFromMask(d, mask);
  const auto vmask_le64 =
      BitCast(Full64<int64_t>(),
              detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask)));
  const auto neg_vmask_le64 = Neg(vmask_le64);
  const auto neg_vmask = ResizeBitCast(
      d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64)));
  const auto first_vmask = BitCast(di, And(vmask, neg_vmask));
  return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask))));
}
// Full 128-bit vectors: same x & (-x) trick with 128-bit negation; the lowest
// set bit of a mask lane is its lane's LSB, and per-lane Neg(x & -x) turns it
// back into an all-ones lane.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  const Full128<T> d;
  const RebindToSigned<decltype(d)> di;
  const auto vmask = VecFromMask(d, mask);
  const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask);
  const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128);
  const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128);
  return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask)))));
}
// Single lane: the lane at-or-before the first set lane is always set
// (regardless of the mask value).
template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;
  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
// Shift the mask up one lane, then "before first" of the shifted mask equals
// "at or before first" of the original.
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}
namespace detail {
#if !HWY_S390X_HAVE_Z14
// Wrapper for the AltiVec vsum4sbs operation: for each 32-bit lane, the
// saturated (int32) sum of four signed bytes of `a` plus the corresponding
// int32 lane of `b`. When both inputs are compile-time constants, the result
// is folded at compile time instead of emitting the instruction.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4sbs(D d, __vector signed char a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Exact 64-bit sums, saturated to int32 below.
    const int64_t sum0 =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[0]);
    const int64_t sum1 =
        static_cast<int64_t>(a[4]) + static_cast<int64_t>(a[5]) +
        static_cast<int64_t>(a[6]) + static_cast<int64_t>(a[7]) +
        static_cast<int64_t>(b[1]);
    const int64_t sum2 =
        static_cast<int64_t>(a[8]) + static_cast<int64_t>(a[9]) +
        static_cast<int64_t>(a[10]) + static_cast<int64_t>(a[11]) +
        static_cast<int64_t>(b[2]);
    const int64_t sum3 =
        static_cast<int64_t>(a[12]) + static_cast<int64_t>(a[13]) +
        static_cast<int64_t>(a[14]) + static_cast<int64_t>(a[15]) +
        static_cast<int64_t>(b[3]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;
    // (sign == sum >> 31) iff the sum fits in int32; otherwise saturate to
    // INT32_MAX (sign 0) or INT32_MIN (sign -1) via sign ^ 0x7FFFFFFF.
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // not constant: fall through to the intrinsic
#endif
  {
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4sbs(a, b)});
  }
}
// Wrapper for the AltiVec vsum4ubs operation: for each 32-bit lane, the
// saturated (uint32) sum of four unsigned bytes of `a` plus the corresponding
// uint32 lane of `b`. Constant inputs are folded at compile time.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4ubs(D d, __vector unsigned char a,
                                     __vector unsigned int b) {
  const Repartition<uint32_t, D> du32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Exact 64-bit sums, clamped to UINT32_MAX below.
    const uint64_t sum0 =
        static_cast<uint64_t>(a[0]) + static_cast<uint64_t>(a[1]) +
        static_cast<uint64_t>(a[2]) + static_cast<uint64_t>(a[3]) +
        static_cast<uint64_t>(b[0]);
    const uint64_t sum1 =
        static_cast<uint64_t>(a[4]) + static_cast<uint64_t>(a[5]) +
        static_cast<uint64_t>(a[6]) + static_cast<uint64_t>(a[7]) +
        static_cast<uint64_t>(b[1]);
    const uint64_t sum2 =
        static_cast<uint64_t>(a[8]) + static_cast<uint64_t>(a[9]) +
        static_cast<uint64_t>(a[10]) + static_cast<uint64_t>(a[11]) +
        static_cast<uint64_t>(b[2]);
    const uint64_t sum3 =
        static_cast<uint64_t>(a[12]) + static_cast<uint64_t>(a[13]) +
        static_cast<uint64_t>(a[14]) + static_cast<uint64_t>(a[15]) +
        static_cast<uint64_t>(b[3]);
    return BitCast(
        d,
        VFromD<decltype(du32)>{(__vector unsigned int){
            static_cast<unsigned int>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
            static_cast<unsigned int>(sum3 <= 0xFFFFFFFFu ? sum3
                                                          : 0xFFFFFFFFu)}});
  } else  // not constant: fall through to the intrinsic
#endif
  {
    return BitCast(d, VFromD<decltype(du32)>{vec_vsum4ubs(a, b)});
  }
}
// Wrapper for the AltiVec vsum2sws operation: saturated sum of two adjacent
// int32 lanes of `a` plus one int32 lane of `b`, placed in the (endian-
// dependent) destination lane of each 64-bit half. Constant inputs are folded
// at compile time; otherwise inline asm is used rather than vec_vsum2sws —
// presumably to control the exact instruction/register form (TODO: confirm
// against the compiler issue this works around).
template <class D>
HWY_INLINE VFromD<D> AltivecVsum2sws(D d, __vector signed int a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  const Repartition<uint64_t, D> du64;
  // vsum2sws reads/writes lane 0 of each 64-bit half on LE, lane 1 on BE.
  constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN;
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) &&
      __builtin_constant_p(b[kDestLaneOffset + 2])) {
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[kDestLaneOffset]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[kDestLaneOffset + 2]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    // Saturate to int32 as in AltivecVsum4sbs above.
    return BitCast(d, VFromD<decltype(du64)>{(__vector unsigned long long){
                          (sign0 == (sum0 >> 31))
                              ? static_cast<uint32_t>(sum0)
                              : static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
                          (sign1 == (sum1 >> 31))
                              ? static_cast<uint32_t>(sum1)
                              : static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)}});
  } else  // not constant: fall through to the asm
#endif
  {
    __vector signed int sum;
    __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}
// Wrapper for the AltiVec vsum4shs operation: for each 32-bit lane, the
// saturated (int32) sum of two signed int16 of `a` plus the corresponding
// int32 lane of `b`. Constant inputs are folded at compile time.
template <class D>
HWY_INLINE VFromD<D> AltivecVsum4shs(D d, __vector signed short a,
                                     __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) {
    // Exact 64-bit sums, saturated to int32 below.
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
                         static_cast<int64_t>(a[1]) +
                         static_cast<int64_t>(b[0]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
                         static_cast<int64_t>(a[3]) +
                         static_cast<int64_t>(b[1]);
    const int64_t sum2 = static_cast<int64_t>(a[4]) +
                         static_cast<int64_t>(a[5]) +
                         static_cast<int64_t>(b[2]);
    const int64_t sum3 = static_cast<int64_t>(a[6]) +
                         static_cast<int64_t>(a[7]) +
                         static_cast<int64_t>(b[3]);
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    const int32_t sign2 = static_cast<int32_t>(sum2 >> 63);
    const int32_t sign3 = static_cast<int32_t>(sum3 >> 63);
    using Raw = typename detail::Raw128<int32_t>::type;
    // Saturate: (sign == sum >> 31) iff sum fits in int32; else
    // sign ^ 0x7FFFFFFF yields INT32_MAX or INT32_MIN.
    return BitCast(
        d,
        VFromD<decltype(di32)>{Raw{
            (sign0 == (sum0 >> 31)) ? static_cast<int32_t>(sum0)
                                    : static_cast<int32_t>(sign0 ^ 0x7FFFFFFF),
            (sign1 == (sum1 >> 31)) ? static_cast<int32_t>(sum1)
                                    : static_cast<int32_t>(sign1 ^ 0x7FFFFFFF),
            (sign2 == (sum2 >> 31)) ? static_cast<int32_t>(sum2)
                                    : static_cast<int32_t>(sign2 ^ 0x7FFFFFFF),
            (sign3 == (sum3 >> 31))
                ? static_cast<int32_t>(sum3)
                : static_cast<int32_t>(sign3 ^ 0x7FFFFFFF)}});
  } else  // not constant: fall through to the intrinsic
#endif
  {
    return BitCast(d, VFromD<decltype(di32)>{vec_vsum4shs(a, b)});
  }
}
// Wrapper for the AltiVec vsumsws operation: saturated sum of all four int32
// lanes of `a` plus one int32 lane of `b`, placed in the (endian-dependent)
// destination lane; the other lanes are zeroed. Constant inputs are folded at
// compile time; otherwise inline asm is used rather than vec_vsumsws —
// presumably to control the exact instruction form (TODO: confirm).
template <class D>
HWY_INLINE VFromD<D> AltivecVsumsws(D d, __vector signed int a,
                                    __vector signed int b) {
  const Repartition<int32_t, D> di32;
#ifdef __OPTIMIZE__
  // vsumsws reads b's (and writes) lane 0 on LE, lane 3 on BE.
  constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) {
    const int64_t sum =
        static_cast<int64_t>(a[0]) + static_cast<int64_t>(a[1]) +
        static_cast<int64_t>(a[2]) + static_cast<int64_t>(a[3]) +
        static_cast<int64_t>(b[kDestLaneOffset]);
    const int32_t sign = static_cast<int32_t>(sum >> 63);
#if HWY_IS_LITTLE_ENDIAN
    return BitCast(
        d, VFromD<decltype(di32)>{(__vector signed int){
               (sign == (sum >> 31)) ? static_cast<int32_t>(sum)
                                     : static_cast<int32_t>(sign ^ 0x7FFFFFFF),
               0, 0, 0}});
#else
    return BitCast(d, VFromD<decltype(di32)>{(__vector signed int){
                          0, 0, 0,
                          (sign == (sum >> 31))
                              ? static_cast<int32_t>(sum)
                              : static_cast<int32_t>(sign ^ 0x7FFFFFFF)}});
#endif
  } else  // not constant: fall through to the asm
#endif
  {
    __vector signed int sum;
    __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b));
    return BitCast(d, VFromD<decltype(di32)>{sum});
  }
}
// Sums adjacent pairs of uint16 lanes into int32 lanes. vsum4shs is signed,
// so bias each u16 lane into i16 range via XOR 0x8000 (subtracting 32768),
// then add back 2 * 32768 = 65536 per pair through the accumulator operand.
template <size_t N>
HWY_INLINE Vec128<int32_t, N / 2> AltivecU16SumsOf2(Vec128<uint16_t, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di16;
  const RepartitionToWide<decltype(di16)> di32;
  return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw,
                         Set(di32, 65536).raw);
}
#endif
// Sums of adjacent uint16 pairs -> uint32 lanes.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
#if HWY_S390X_HAVE_Z14
  // Z14 has a native unsigned pairwise-sum instruction.
  return VFromD<decltype(dw)>{vec_sum4(v.raw, Zero(d).raw)};
#else
  return BitCast(dw, AltivecU16SumsOf2(v));
#endif
}
// Sums of adjacent int16 pairs -> int32 lanes.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
#if HWY_S390X_HAVE_Z14
  // Z14 sum is unsigned-only: bias lanes to unsigned via XOR with the sign
  // bit, then subtract 2 * 32768 = 65536 per pair to undo the bias.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(),
                             BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw, int32_t{-65536});
#else
  return AltivecVsum4shs(dw, v.raw, Zero(dw).raw);
#endif
}
#if HWY_S390X_HAVE_Z14
// Sums of adjacent uint32 pairs -> uint64 lanes (Z-only native op).
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  return VFromD<decltype(dw)>{vec_sum2(v.raw, Zero(d).raw)};
}
// Sums of adjacent int32 pairs -> int64 lanes: bias to unsigned via the sign
// bit, then subtract 2 * 2^31 = 2^32 per pair to undo the bias.
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
                             BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw, int64_t{-4294967296LL});
}
#endif
// Sums of four consecutive uint8 lanes -> uint32 lanes.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWideX2<decltype(d)> dw2;
#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw2)>{vec_sum4(v.raw, Zero(d).raw)};
#else
  return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw);
#endif
}
// Sums of four consecutive int8 lanes -> int32 lanes.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWideX2<decltype(d)> dw2;
#if HWY_S390X_HAVE_Z14
  // Bias to unsigned via the sign bit, then subtract 4 * 128 = 512 per group
  // of four lanes to undo the bias.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(),
                              BitCast(du, Xor(v, SignBit(d))))) +
         Set(dw2, int32_t{-512});
#else
  return AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw);
#endif
}
// Sums of four consecutive uint16 lanes -> uint64 lanes.
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  const DFromV<V> d;
  const RepartitionToWide<decltype(d)> dw;
  const RepartitionToWide<decltype(dw)> dw2;
#if HWY_S390X_HAVE_Z14
  return VFromD<decltype(dw2)>{vec_sum2(v.raw, Zero(d).raw)};
#else
  // Pairwise u16 sums (as i32), then sum adjacent i32 pairs into 64-bit
  // halves via vsum2sws.
  const RebindToSigned<decltype(dw)> dw_i;
  return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw);
#endif
}
template <class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
hwy::SignedTag , hwy::SizeTag<2> , V v) {
const DFromV<V> d;
const RepartitionToWide<decltype(d)> dw;
const RepartitionToWide<decltype(dw)> dw2;
#if HWY_S390X_HAVE_Z14
const RebindToUnsigned<decltype(d)> du;
return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(),
BitCast(du, Xor(v, SignBit(d))))) +
Set(dw2, int64_t{-131072});
#else
const auto sums_of_4_in_lo32 =
AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw);
#if HWY_IS_LITTLE_ENDIAN
return PromoteEvenTo(dw2, sums_of_4_in_lo32);
#else
return PromoteOddTo(dw2, sums_of_4_in_lo32);
#endif #endif }
}
#undef HWY_IF_SUM_OF_LANES_D
#if HWY_S390X_HAVE_Z14
#define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D)
#else
#define HWY_IF_SUM_OF_LANES_D(D) \
HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))
#endif
#if HWY_S390X_HAVE_Z14
namespace detail {
#if HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_s390_vsumqf) && \
    HWY_HAS_BUILTIN(__builtin_s390_vsumqg)
// Sums all 32-bit lanes into a single 128-bit value (vsumqf), returned as a
// vector; only the lane holding the low bits of the u128 sum is meaningful.
template <class T, HWY_IF_UI32(T)>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VU128 sum = {__builtin_s390_vsumqf(BitCast(du, v).raw, Zero(du).raw)};
  return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
}
// Same for 64-bit lanes (vsumqg).
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  typedef __uint128_t VU128 __attribute__((__vector_size__(16)));
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const VU128 sum = {__builtin_s390_vsumqg(BitCast(du, v).raw, Zero(du).raw)};
  return Vec128<T>{reinterpret_cast<typename detail::Raw128<T>::type>(sum)};
}
#else
// Fallback via the vec_sum_u128 intrinsic, which handles both lane widths.
template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T),
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 4) | (1 << 8))>
HWY_INLINE Vec128<T> SumOfU32OrU64LanesAsU128(Vec128<T> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, Vec128<uint8_t>{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)});
}
#endif
}  // namespace detail
// 64-bit lanes on Z: the u128 sum's low 64 bits land in lane 1; broadcast.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI64_D(D)>
HWY_API VFromD<D> SumOfLanes(D /*d*/, VFromD<D> v) {
  return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v));
}
#endif
// Two u16 lanes (32-bit vector): pairwise sum leaves the result in the low
// u16 lane of the i32 on LE, the high one on BE; broadcast that lane.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> SumOfLanes(D du16, Vec32<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
}
// Four u16 lanes (64-bit vector): SumsOf4 leaves the total in the lowest u16
// lane of the wide result on LE, lane 3 on BE; broadcast it.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> SumOfLanes(D du16, Vec64<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v)));
}
// Eight u16 lanes (full vector): reduce via pairwise sums then a full cross-
// lane sum; the total lands in u16 lane 0 (LE) or 7 (BE).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> SumOfLanes(D du16, Vec128<uint16_t> v) {
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
#if HWY_S390X_HAVE_Z14
  return Broadcast<kSumLaneIdx>(
      BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
                        hwy::UnsignedTag(), hwy::SizeTag<2>(), v))));
#else
  const auto zero = Zero(Full128<int32_t>());
  return Broadcast<kSumLaneIdx>(
      detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw));
#endif
}
// Sums both i16 lanes of a 32-bit vector; result broadcast to both lanes.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API Vec32<int16_t> SumOfLanes(D di16, Vec32<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  // Reuse the unsigned overload: addition is modular, so the low 16 bits of
  // the sum are identical for signed and unsigned interpretations.
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  // Signed pairwise sum widens to i32; low half lane: 0 on LE, 1 on BE.
  constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN;
  return Broadcast<kSumLaneIdx>(
      BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
#endif
}
// Sums the four i16 lanes of a 64-bit vector; result broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> SumOfLanes(D di16, Vec64<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  // Modular addition: low 16 bits are the same signed or unsigned, so the
  // unsigned overload suffices.
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  // Signed sums-of-4 widen to i32; low half lane: 0 on LE, 3 on BE.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v)));
#endif
}
// Sums all eight i16 lanes of a 128-bit vector; result broadcast to all.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> SumOfLanes(D di16, Vec128<int16_t> v) {
#if HWY_S390X_HAVE_Z14
  // Modular addition: reuse the unsigned overload (same low 16 bits).
  const RebindToUnsigned<decltype(di16)> du16;
  return BitCast(di16, SumOfLanes(du16, BitCast(du16, v)));
#else
  // i16 lane that ends up holding the low 16 bits of the i32 total.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const Full128<int32_t> di32;
  const auto zero = Zero(di32);
  // vsum4shs: sums groups of four i16 into i32 partial sums; vsumsws then
  // reduces those to one i32. NOTE(review): both are saturating per AltiVec
  // semantics — presumed acceptable for this reduction; confirm upstream.
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw));
#endif
}
// Sums the four u8 lanes of a 32-bit vector; result broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> SumOfLanes(D du8, Vec32<uint8_t> v) {
  // SumsOf4 widens the four bytes into one u32 total; its low byte sits at
  // byte index 0 on LE and 3 on BE.
  const auto byte_sum =
      detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v);
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(BitCast(du8, byte_sum));
}
// Sums the two u8 lanes of a 16-bit vector; result broadcast to both lanes.
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> SumOfLanes(D du8, Vec16<uint8_t> v) {
  // Widen to four lanes with a zero upper half (which cannot change the
  // sum) and defer to the 4-lane overload.
  const Twice<decltype(du8)> dt_u8;
  const auto widened = Combine(dt_u8, Zero(du8), v);
  return LowerHalf(du8, SumOfLanes(dt_u8, widened));
}
// Sums the eight u8 lanes of a 64-bit vector; result broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> SumOfLanes(D du8, Vec64<uint8_t> v) {
  // SumsOf8 yields one u64 per 8 bytes; its low byte is at index 0 on LE
  // and index 7 on BE.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  const auto wide_sum = SumsOf8(v);
  return Broadcast<kSumLaneIdx>(BitCast(du8, wide_sum));
}
// Sums all sixteen u8 lanes of a 128-bit vector; result broadcast to all.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> SumOfLanes(D du8, Vec128<uint8_t> v) {
  // Byte index holding the low byte of the total (16*255 fits in 16 bits).
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
#if HWY_S390X_HAVE_Z14
  // Z14: widen groups of four bytes to u32, then reduce all u32 to a u128.
  return Broadcast<kSumLaneIdx>(
      BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4(
                       hwy::UnsignedTag(), hwy::SizeTag<1>(), v))));
#else
  // PPC: vsum4ubs sums groups of four u8 into u32 partial sums; vsumsws
  // then reduces those (plus zero) to a single i32.
  const Full128<uint32_t> du32;
  const RebindToSigned<decltype(du32)> di32;
  const Vec128<uint32_t> zero = Zero(du32);
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw,
      BitCast(di32, zero).raw));
#endif
}
// Sums the four i8 lanes of a 32-bit vector; result broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I8_D(D)>
HWY_API Vec32<int8_t> SumOfLanes(D di8, Vec32<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  // Modular addition: the low 8 bits of the sum are the same signed or
  // unsigned, so reuse the unsigned overload.
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  // Signed sums-of-4 widen to i32; low byte: index 0 on LE, 3 on BE.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3;
  return Broadcast<kSumLaneIdx>(
      BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v)));
#endif
}
// Sums the two i8 lanes of a 16-bit vector; result broadcast to both lanes.
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I8_D(D)>
HWY_API Vec16<int8_t> SumOfLanes(D di8, Vec16<int8_t> v) {
  // Widen to four lanes with a zero upper half (does not change the sum)
  // and defer to the 4-lane overload.
  const Twice<decltype(di8)> dt_i8;
  const auto widened = Combine(dt_i8, Zero(di8), v);
  return LowerHalf(di8, SumOfLanes(dt_i8, widened));
}
// Sums the eight i8 lanes of a 64-bit vector; result broadcast to all lanes.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> SumOfLanes(D di8, Vec64<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  // Modular addition: reuse the unsigned overload (same low 8 bits).
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  // SumsOf8 yields one wide sum per 8 bytes; low byte: 0 on LE, 7 on BE.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7;
  return Broadcast<kSumLaneIdx>(BitCast(di8, SumsOf8(v)));
#endif
}
// Sums all sixteen i8 lanes of a 128-bit vector; result broadcast to all.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> SumOfLanes(D di8, Vec128<int8_t> v) {
#if HWY_S390X_HAVE_Z14
  // Modular addition: reuse the unsigned overload (same low 8 bits).
  const RebindToUnsigned<decltype(di8)> du8;
  return BitCast(di8, SumOfLanes(du8, BitCast(du8, v)));
#else
  // Byte index of the low byte of the i32 total.
  constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15;
  const Full128<int32_t> di32;
  const Vec128<int32_t> zero = Zero(di32);
  // vsum4sbs: groups of four i8 into i32 partial sums; vsumsws reduces them
  // to a single i32 (both saturating per AltiVec semantics).
  return Broadcast<kSumLaneIdx>(detail::AltivecVsumsws(
      di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw));
#endif
}
#if HWY_S390X_HAVE_Z14
// Z14: sums the two u32/i32 lanes of a 64-bit vector; broadcast to both.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> SumOfLanes(D d32, VFromD<D> v) {
  const RebindToUnsigned<decltype(d32)> du32;
  // SumsOf2 widens the adjacent u32 pair to a u64; viewed as 32-bit lanes,
  // lane 1 holds the low half of that sum on this target.
  return Broadcast<1>(
      BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(),
                                   BitCast(du32, v))));
}
// Z14: sums all four u32/i32 lanes of a 128-bit vector; broadcast to all.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_UI32_D(D)>
HWY_API VFromD<D> SumOfLanes(D , VFromD<D> v) {
  // The u128 sum's low 32 bits land in lane 3 on this target.
  return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v));
}
#endif
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif
// Reduces the four u8/i8 lanes of a 32-bit vector to their scalar sum.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D , VFromD<D> v) {
  // SumsOf4 widens the four bytes into one wide total; truncate back to the
  // caller's lane type.
  const auto wide_sum = SumsOf4(v);
  return static_cast<TFromD<D>>(GetLane(wide_sum));
}
#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif
// BitShuffle: for each 64-bit lane of v, gathers the 8 bits selected by the
// corresponding eight u8 indices in idx (each in [0, 63], LSB-relative)
// into one byte of the result lane; the remaining bits are zero.
//
// Fix: the original closing line collapsed "#endif" and the function's "}"
// onto one line; tokens after #endif are discarded by the preprocessor, so
// the brace was lost. They must be on separate lines.
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>),
          HWY_IF_V_SIZE_V(VI, HWY_MAX_LANES_V(V) * 8)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d64;
  const RebindToUnsigned<decltype(d64)> du64;
  const Repartition<uint8_t, decltype(d64)> du8;
  const Full128<TFromD<decltype(du64)>> d_full_u64;
  const Full128<TFromD<decltype(du8)>> d_full_u8;
  using RawVU64 = __vector unsigned long long;
#if HWY_PPC_HAVE_9
#if HWY_IS_LITTLE_ENDIAN
  (void)d_full_u64;
  auto bit_idx = ResizeBitCast(d_full_u8, idx);
#else
  // Big-endian: byte-reverse each u64 of idx so the lane-relative byte
  // order matches what vec_bperm expects.
  auto bit_idx =
      BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx)));
#endif
  // vec_bperm numbers bits from the MSB; XOR with 0x3F converts our
  // LSB-relative indices to MSB-relative ones.
  bit_idx = Xor(bit_idx, Set(d_full_u8, uint8_t{0x3F}));
  return BitCast(d64, VFromD<decltype(du64)>{reinterpret_cast<RawVU64>(
                          vec_bperm(BitCast(du64, v).raw, bit_idx.raw))});
#else
  // Pre-PPC9 / Z14: only a 128-bit-wide bit-permute is available, so adjust
  // the indices per 64-bit half (0x7F.. marks out-of-range for one half,
  // 0x3F.. flips to MSB-relative for the other) and fix up afterwards.
#if HWY_IS_LITTLE_ENDIAN
  const auto bit_idx_xor_mask = BitCast(
      d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x7F7F7F7F7F7F7F7Fu},
                                     uint64_t{0x3F3F3F3F3F3F3F3Fu}));
  const auto bit_idx = Xor(ResizeBitCast(d_full_u8, idx), bit_idx_xor_mask);
  constexpr int kBitShufResultByteShrAmt = 8;
#else
  const auto bit_idx_xor_mask = BitCast(
      d_full_u8, Dup128VecFromValues(d_full_u64, uint64_t{0x3F3F3F3F3F3F3F3Fu},
                                     uint64_t{0x7F7F7F7F7F7F7F7Fu}));
  const auto bit_idx =
      Xor(BitCast(d_full_u8, ReverseLaneBytes(ResizeBitCast(d_full_u64, idx))),
          bit_idx_xor_mask);
  constexpr int kBitShufResultByteShrAmt = 6;
#endif
#if HWY_S390X_HAVE_Z14
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_bperm_u128(BitCast(du8, v).raw, bit_idx.raw))};
#elif defined(__SIZEOF_INT128__)
  using RawVU128 = __vector unsigned __int128;
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
#else
  // Some compilers lack the vector __int128 type; vec_vbpermq also accepts
  // a vector of unsigned char for the data operand.
  using RawVU128 = __vector unsigned char;
  const VFromD<decltype(d_full_u64)> bit_shuf_result{reinterpret_cast<RawVU64>(
      vec_vbpermq(reinterpret_cast<RawVU128>(v.raw), bit_idx.raw))};
#endif
  // The permute packs both result bytes into one 64-bit half; shift them to
  // byte positions 0/1 and zero-extend each byte back into its u64 lane.
  return ResizeBitCast(
      d64, PromoteTo(d_full_u64,
                     ResizeBitCast(
                         Rebind<uint8_t, decltype(d_full_u64)>(),
                         CombineShiftRightBytes<kBitShufResultByteShrAmt>(
                             d_full_u64, bit_shuf_result, bit_shuf_result))));
#endif
}
namespace detail {

// Returns a *vector* (not a mask) that is all-ones in both u64 lanes of each
// 128-bit block where the 128-bit number formed by that pair of lanes in a
// is below the one in b, else all-zeros in both lanes.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
#if HWY_IS_LITTLE_ENDIAN
  const VU128 a_u128 = reinterpret_cast<VU128>(a.raw);
  const VU128 b_u128 = reinterpret_cast<VU128>(b.raw);
#else
  // Big-endian: swap the 8-byte halves (vec_sld rotates by 8 bytes) so the
  // u128 reinterpretation sees the intended low/high ordering.
  const VU128 a_u128 = reinterpret_cast<VU128>(vec_sld(a.raw, a.raw, 8));
  const VU128 b_u128 = reinterpret_cast<VU128>(vec_sld(b.raw, b.raw, 8));
#endif
  return V{reinterpret_cast<VU64>(vec_cmplt(a_u128, b_u128))};
#else
  // Generic: a < b iff aH < bH, or (aH == bH and aL < bL).
  const auto eqHL = Eq(a, b);
  const V ltHL = VecFromMask(d, Lt(a, b));
  // Move each low-lane comparison result into the adjacent high-lane slot.
  const V ltLX = ShiftLeftLanes<1>(ltHL);
  // High lane: where the high halves are equal, use the low-half verdict.
  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
  // Replicate the high-lane verdict into both lanes of the 128-bit block.
  return InterleaveUpper(d, vecHx, vecHx);
#endif
}

// All-ones in both lanes of each 128-bit block where a == b as u128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  // Half order does not affect equality, so no byte rotation is needed.
  return V{reinterpret_cast<VU64>(vec_cmpeq(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  // Equal iff both u64 halves are equal: AND each lane with its partner.
  const auto eqHL = VecFromMask(d, Eq(a, b));
  const auto eqLH = Reverse2(d, eqHL);
  return And(eqHL, eqLH);
#endif
}

// All-ones in both lanes of each 128-bit block where a != b as u128.
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(D d, V a, V b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__)
  (void)d;
  using VU64 = __vector unsigned long long;
  using VU128 = __vector unsigned __int128;
  return V{reinterpret_cast<VU64>(vec_cmpne(reinterpret_cast<VU128>(a.raw),
                                            reinterpret_cast<VU128>(b.raw)))};
#else
  // Unequal iff either u64 half differs: OR each lane with its partner.
  const auto neHL = VecFromMask(d, Ne(a, b));
  const auto neLH = Reverse2(d, neHL);
  return Or(neHL, neLH);
#endif
}

// As Lt128Vec, but compares only the upper u64 lane of each 128-bit block.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(D d, V a, V b) {
  const V ltHL = VecFromMask(d, Lt(a, b));
  return InterleaveUpper(d, ltHL, ltHL);
}

// As Eq128Vec, but compares only the upper u64 lane of each 128-bit block.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128UpperVec(D d, V a, V b) {
  const V eqHL = VecFromMask(d, Eq(a, b));
  return InterleaveUpper(d, eqHL, eqHL);
}

// As Ne128Vec, but compares only the upper u64 lane of each 128-bit block.
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128UpperVec(D d, V a, V b) {
  const V neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail
// Mask form of detail::Lt128Vec: per-128-bit-block unsigned less-than.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128(D d, V a, V b) {
  const V cmp = detail::Lt128Vec(d, a, b);
  return MaskFromVec(cmp);
}
// Mask form of detail::Eq128Vec: per-128-bit-block equality.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128(D d, V a, V b) {
  const V cmp = detail::Eq128Vec(d, a, b);
  return MaskFromVec(cmp);
}
// Mask form of detail::Ne128Vec: per-128-bit-block inequality.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128(D d, V a, V b) {
  const V cmp = detail::Ne128Vec(d, a, b);
  return MaskFromVec(cmp);
}
// Mask form of detail::Lt128UpperVec: compares only the upper u64 lanes.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, V a, V b) {
  const V cmp = detail::Lt128UpperVec(d, a, b);
  return MaskFromVec(cmp);
}
// Mask form of detail::Eq128UpperVec: compares only the upper u64 lanes.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Eq128Upper(D d, V a, V b) {
  const V cmp = detail::Eq128UpperVec(d, a, b);
  return MaskFromVec(cmp);
}
// Mask form of detail::Ne128UpperVec: compares only the upper u64 lanes.
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128Upper(D d, V a, V b) {
  const V cmp = detail::Ne128UpperVec(d, a, b);
  return MaskFromVec(cmp);
}
// Per-128-bit-block unsigned minimum of a and b.
template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  const V a_below_b = detail::Lt128Vec(d, a, b);
  return IfVecThenElse(a_below_b, a, b);
}
// Per-128-bit-block unsigned maximum of a and b.
template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  const V b_below_a = detail::Lt128Vec(d, b, a);
  return IfVecThenElse(b_below_a, a, b);
}
// Per-block minimum, keyed only on the upper u64 lane of each block.
template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  const V a_below_b = detail::Lt128UpperVec(d, a, b);
  return IfVecThenElse(a_below_b, a, b);
}
// Per-block maximum, keyed only on the upper u64 lane of each block.
template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  const V b_below_a = detail::Lt128UpperVec(d, b, a);
  return IfVecThenElse(b_below_a, a, b);
}
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif
// Per-lane count of leading zero bits of an integer vector.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Empty asm pins v in a vector register under optimizing GCC.
  // NOTE(review): presumed workaround for a GCC issue with vec_cntlz on
  // Z14 — confirm against upstream history.
  __asm__("" : "+v"(v.raw));
#endif
  // vec_cntlz is defined on unsigned element types; cast and cast back.
  return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
#else
  return V{vec_cntlz(v.raw)};
#endif
}
// Per-lane index of the highest set bit: (bits-1) - LeadingZeroCount.
// All-zero lanes yield -1 (mod lane width).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  const V msb_idx = Set(d, T{sizeof(T) * 8 - 1});
  return BitCast(d, msb_idx - LeadingZeroCount(v));
}
#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
return V{vec_vctz(v.raw)};
#else
#if HWY_S390X_HAVE_Z14
const DFromV<decltype(v)> d;
const RebindToUnsigned<decltype(d)> du;
#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
__asm__("" : "+v"(v.raw));
#endif
return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
#else
return V{vec_cnttz(v.raw)};
#endif #endif }
#else
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
using TI = TFromD<decltype(di)>;
const auto vi = BitCast(di, v);
const auto lowest_bit = And(vi, Neg(vi));
constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
const auto bit_idx = HighestSetBitIndex(lowest_bit);
return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
Set(di, kNumOfBitsInT), bit_idx));
}
#endif
#undef HWY_PPC_HAVE_9
#undef HWY_PPC_HAVE_10
#undef HWY_S390X_HAVE_Z14
#undef HWY_S390X_HAVE_Z15
} } HWY_AFTER_NAMESPACE();