#if !defined(HWY_NAMESPACE)
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#define HWY_INSIDE_END_NAMESPACE
// Placeholder declarations, compiled only when HWY_NAMESPACE is not defined
// (see the enclosing #if). NOTE(review): presumably these exist so that an
// IDE or standalone parse of this header can resolve the names - confirm.

// Stand-in for the vector type of descriptor D.
template <class D>
using VFromD = int;
// Stand-in for the lane type of a vector.
template <class D>
using TFromV = int;
// Stand-in for the descriptor type of a vector.
template <class D>
struct DFromV {};
#endif
#if HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)
// Aggregate holding two vectors of type VFromD<D>. Kept as a plain aggregate
// so it can be brace-initialized as Vec2<D>{v0, v1}.
template <typename D>
struct Vec2 {
  VFromD<D> v0;  // element 0
  VFromD<D> v1;  // element 1
};
// Aggregate holding three vectors of type VFromD<D>. Kept as a plain
// aggregate so it can be brace-initialized as Vec3<D>{v0, v1, v2}.
template <typename D>
struct Vec3 {
  VFromD<D> v0;  // element 0
  VFromD<D> v1;  // element 1
  VFromD<D> v2;  // element 2
};
// Aggregate holding four vectors of type VFromD<D>. Kept as a plain
// aggregate so it can be brace-initialized as Vec4<D>{v0, v1, v2, v3}.
template <typename D>
struct Vec4 {
  VFromD<D> v0;  // element 0
  VFromD<D> v1;  // element 1
  VFromD<D> v2;  // element 2
  VFromD<D> v3;  // element 3
};
// Returns a Vec2<D> whose members are v0 and v1, in that order. The first
// (unnamed) argument is only used to deduce D.
template <class D>
HWY_API Vec2<D> Create2(D /*tag*/, VFromD<D> v0, VFromD<D> v1) {
  Vec2<D> tuple = {v0, v1};
  return tuple;
}
// Returns a Vec3<D> whose members are v0, v1 and v2, in that order. The first
// (unnamed) argument is only used to deduce D.
template <class D>
HWY_API Vec3<D> Create3(D /*tag*/, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) {
  Vec3<D> tuple = {v0, v1, v2};
  return tuple;
}
// Returns a Vec4<D> whose members are v0, v1, v2 and v3, in that order. The
// first (unnamed) argument is only used to deduce D.
template <class D>
HWY_API Vec4<D> Create4(D /*tag*/, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                        VFromD<D> v3) {
  Vec4<D> tuple = {v0, v1, v2, v3};
  return tuple;
}
// Returns element kIndex (0 or 1) of `tuple`; other indices are rejected at
// compile time.
template <size_t kIndex, class D>
HWY_API VFromD<D> Get2(Vec2<D> tuple) {
  static_assert(kIndex < 2, "Tuple index out of bounds");
  if (kIndex == 0) {
    return tuple.v0;
  } else {
    return tuple.v1;
  }
}
// Returns element kIndex (0..2) of `tuple`; other indices are rejected at
// compile time.
template <size_t kIndex, class D>
HWY_API VFromD<D> Get3(Vec3<D> tuple) {
  static_assert(kIndex < 3, "Tuple index out of bounds");
  switch (kIndex) {
    case 0:
      return tuple.v0;
    case 1:
      return tuple.v1;
    default:  // kIndex == 2, guaranteed by the static_assert
      return tuple.v2;
  }
}
// Returns element kIndex (0..3) of `tuple`; other indices are rejected at
// compile time.
template <size_t kIndex, class D>
HWY_API VFromD<D> Get4(Vec4<D> tuple) {
  static_assert(kIndex < 4, "Tuple index out of bounds");
  switch (kIndex) {
    case 0:
      return tuple.v0;
    case 1:
      return tuple.v1;
    case 2:
      return tuple.v2;
    default:  // kIndex == 3, guaranteed by the static_assert
      return tuple.v3;
  }
}
// Returns a copy of `tuple` with element kIndex (0 or 1) replaced by `val`.
// `tuple` is taken by value, so the caller's object is not modified.
template <size_t kIndex, class D>
HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 2, "Tuple index out of bounds");
  if (kIndex != 0) {
    tuple.v1 = val;
  } else {
    tuple.v0 = val;
  }
  return tuple;
}
// Returns a copy of `tuple` with element kIndex (0..2) replaced by `val`.
// `tuple` is taken by value, so the caller's object is not modified.
template <size_t kIndex, class D>
HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 3, "Tuple index out of bounds");
  switch (kIndex) {
    case 0:
      tuple.v0 = val;
      break;
    case 1:
      tuple.v1 = val;
      break;
    default:  // kIndex == 2, guaranteed by the static_assert
      tuple.v2 = val;
      break;
  }
  return tuple;
}
// Returns a copy of `tuple` with element kIndex (0..3) replaced by `val`.
// `tuple` is taken by value, so the caller's object is not modified.
template <size_t kIndex, class D>
HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 4, "Tuple index out of bounds");
  switch (kIndex) {
    case 0:
      tuple.v0 = val;
      break;
    case 1:
      tuple.v1 = val;
      break;
    case 2:
      tuple.v2 = val;
      break;
    default:  // kIndex == 3, guaranteed by the static_assert
      tuple.v3 = val;
      break;
  }
  return tuple;
}
#endif
#if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_8
#undef HWY_NATIVE_ROL_ROR_8
#else
#define HWY_NATIVE_ROL_ROR_8
#endif
// Rol for 8-bit lanes: combines Shl by (b & 7) with Shr by ((-b) & 7) on the
// unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 3 bits of each count are kept.
  const auto count_mask = Set(du, uint8_t{7});
  const auto bits = BitCast(du, a);

  const auto left_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto right_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
// Ror for 8-bit lanes: combines Shr by (b & 7) with Shl by ((-b) & 7) on the
// unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 3 bits of each count are kept.
  const auto count_mask = Set(du, uint8_t{7});
  const auto bits = BitCast(du, a);

  const auto right_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto left_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
#endif
#if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_16
#undef HWY_NATIVE_ROL_ROR_16
#else
#define HWY_NATIVE_ROL_ROR_16
#endif
// Rol for 16-bit lanes: combines Shl by (b & 15) with Shr by ((-b) & 15) on
// the unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 4 bits of each count are kept.
  const auto count_mask = Set(du, uint16_t{15});
  const auto bits = BitCast(du, a);

  const auto left_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto right_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
// Ror for 16-bit lanes: combines Shr by (b & 15) with Shl by ((-b) & 15) on
// the unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 4 bits of each count are kept.
  const auto count_mask = Set(du, uint16_t{15});
  const auto bits = BitCast(du, a);

  const auto right_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto left_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
#endif
#if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_32_64
#undef HWY_NATIVE_ROL_ROR_32_64
#else
#define HWY_NATIVE_ROL_ROR_32_64
#endif
// Rol for 32-bit lanes: combines Shl by (b & 31) with Shr by ((-b) & 31) on
// the unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 5 bits of each count are kept.
  const auto count_mask = Set(du, uint32_t{31});
  const auto bits = BitCast(du, a);

  const auto left_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto right_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
// Ror for 32-bit lanes: combines Shr by (b & 31) with Shl by ((-b) & 31) on
// the unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 5 bits of each count are kept.
  const auto count_mask = Set(du, uint32_t{31});
  const auto bits = BitCast(du, a);

  const auto right_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto left_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
#if HWY_HAVE_INTEGER64
// Rol for 64-bit lanes: combines Shl by (b & 63) with Shr by ((-b) & 63) on
// the unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 6 bits of each count are kept.
  const auto count_mask = Set(du, uint64_t{63});
  const auto bits = BitCast(du, a);

  const auto left_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto right_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
// Ror for 64-bit lanes: combines Shr by (b & 63) with Shl by ((-b) & 63) on
// the unsigned reinterpretation of `a`, then casts back to the type of `a`.
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  // Only the low 6 bits of each count are kept.
  const auto count_mask = Set(du, uint64_t{63});
  const auto bits = BitCast(du, a);

  const auto right_count = And(BitCast(du, b), count_mask);
  // Negation is done on the signed view of b before masking.
  const auto left_count = And(BitCast(du, Neg(BitCast(di, b))), count_mask);

  const auto rotated = Or(Shl(bits, left_count), Shr(bits, right_count));
  return BitCast(d, rotated);
}
#endif
#endif
#if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
#undef HWY_NATIVE_ROL_ROR_SAME_8
#else
#define HWY_NATIVE_ROL_ROR_SAME_8
#endif
// RotateLeftSame for 8-bit lanes: ORs ShiftLeftSame by (bits & 7) with
// ShiftRightSame by ((-bits) & 7), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int left_count = bits & 7;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int right_count = static_cast<int>(negated_bits & 7u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
// RotateRightSame for 8-bit lanes: ORs ShiftRightSame by (bits & 7) with
// ShiftLeftSame by ((-bits) & 7), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int right_count = bits & 7;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int left_count = static_cast<int>(negated_bits & 7u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
#endif
#if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_16
#undef HWY_NATIVE_ROL_ROR_SAME_16
#else
#define HWY_NATIVE_ROL_ROR_SAME_16
#endif
// RotateLeftSame for 16-bit lanes: ORs ShiftLeftSame by (bits & 15) with
// ShiftRightSame by ((-bits) & 15), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int left_count = bits & 15;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int right_count = static_cast<int>(negated_bits & 15u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
// RotateRightSame for 16-bit lanes: ORs ShiftRightSame by (bits & 15) with
// ShiftLeftSame by ((-bits) & 15), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int right_count = bits & 15;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int left_count = static_cast<int>(negated_bits & 15u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
#endif
#if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
#else
#define HWY_NATIVE_ROL_ROR_SAME_32_64
#endif
// RotateLeftSame for 32-bit lanes: ORs ShiftLeftSame by (bits & 31) with
// ShiftRightSame by ((-bits) & 31), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int left_count = bits & 31;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int right_count = static_cast<int>(negated_bits & 31u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
// RotateRightSame for 32-bit lanes: ORs ShiftRightSame by (bits & 31) with
// ShiftLeftSame by ((-bits) & 31), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int right_count = bits & 31;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int left_count = static_cast<int>(negated_bits & 31u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
#if HWY_HAVE_INTEGER64
// RotateLeftSame for 64-bit lanes: ORs ShiftLeftSame by (bits & 63) with
// ShiftRightSame by ((-bits) & 63), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int left_count = bits & 63;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int right_count = static_cast<int>(negated_bits & 63u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
// RotateRightSame for 64-bit lanes: ORs ShiftRightSame by (bits & 63) with
// ShiftLeftSame by ((-bits) & 63), both applied to the unsigned
// reinterpretation of `v`, and casts the result back to the type of `v`.
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto lanes = BitCast(du, v);

  const int right_count = bits & 63;
  // Negate in unsigned arithmetic so the subtraction cannot overflow.
  const unsigned negated_bits = 0u - static_cast<unsigned>(bits);
  const int left_count = static_cast<int>(negated_bits & 63u);

  const auto shifted_left = ShiftLeftSame(lanes, left_count);
  const auto shifted_right = ShiftRightSame(lanes, right_count);
  return BitCast(d, Or(shifted_left, shifted_right));
}
#endif
#endif
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {
// Signed -> signed overload of detail::PromoteEvenTo, selected by the three
// tag arguments (to-type tag, to-lane-size tag, from-type tag).
//
// On SVE targets this forwards to NativePromoteEvenTo on the bitcast vector.
// Otherwise `v` is reinterpreted as lanes of `d_to`, the even source lane is
// placed in the upper kToLaneSize * 4 bits of each wide lane (a left shift on
// little-endian; already in place on big-endian), and the lane is then shifted
// right by the same amount.
// NOTE(review): relies on ShiftRight sign-extending for signed lanes - confirm.
//
// The closing brace must be on its own line: tokens following `#endif` on the
// same line are discarded by the preprocessor, which would leave the function
// body unterminated.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // Move the even source lane into the upper half of each wide lane.
  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // No left shift is applied on big-endian.
  const auto even_in_hi = BitCast(d_to, v);
#endif
  return ShiftRight<kToLaneSize * 4>(even_in_hi);
#endif  // HWY_TARGET_IS_SVE
}
// Unsigned -> unsigned overload of detail::PromoteEvenTo, selected by the
// three tag arguments (to-type tag, to-lane-size tag, from-type tag).
//
// On SVE targets this forwards to NativePromoteEvenTo on the bitcast vector.
// Otherwise `v` is reinterpreted as lanes of `d_to`; on little-endian each
// wide lane is ANDed with the maximum value of the source lane type, and on
// big-endian each wide lane is shifted right by kToLaneSize * 4 bits.
//
// The closing brace must be on its own line: tokens following `#endif` on the
// same line are discarded by the preprocessor, which would leave the function
// body unterminated.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // Keep only the bits covered by the source lane type's maximum value.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#else
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#endif
#endif  // HWY_TARGET_IS_SVE
}
// Signed -> signed overload of detail::PromoteOddTo, selected by the tag
// arguments. Reinterprets `v` as lanes of `d_to`, left-shifts by
// kToLaneSize * 4 bits on big-endian only, then shifts right by
// kToLaneSize * 4 bits.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
  const auto wide = BitCast(d_to, v);
#if HWY_IS_LITTLE_ENDIAN
  // No left shift is applied on little-endian.
  const auto odd_in_hi = wide;
#else
  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(wide);
#endif
  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
}
// Unsigned -> unsigned overload of detail::PromoteOddTo, selected by the tag
// arguments. Reinterprets `v` as lanes of `d_to`; on little-endian each wide
// lane is shifted right by kToLaneSize * 4 bits, on big-endian it is ANDed
// with the maximum value of the source lane type.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const auto wide = BitCast(d_to, v);
#if HWY_IS_LITTLE_ENDIAN
  return ShiftRight<kToLaneSize * 4>(wide);
#else
  const auto source_lane_mask =
      Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>()));
  return And(wide, source_lane_mask);
#endif
}
// Unsigned source -> signed destination overload of detail::PromoteEvenTo.
// Runs the unsigned -> unsigned overload on the unsigned counterpart of
// `d_to` and bitcasts the result to `d_to`.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<D> du_to;
  const auto promoted_unsigned =
      PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                    hwy::UnsignedTag(), du_to, v);
  return BitCast(d_to, promoted_unsigned);
}
// Unsigned source -> signed destination overload of detail::PromoteOddTo.
// Runs the unsigned -> unsigned overload on the unsigned counterpart of
// `d_to` and bitcasts the result to `d_to`.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<D> du_to;
  const auto promoted_unsigned =
      PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                   hwy::UnsignedTag(), du_to, v);
  return BitCast(d_to, promoted_unsigned);
}
// detail::PromoteEvenTo overload for a float destination with 4-byte lanes,
// enabled when the lane type of VBF16 matches that of the bfloat16_t
// repartition of DF32. Works on the 32-bit unsigned view of `v`: shifts each
// lane left by 16 on little-endian, or keeps only the upper 16 bits on
// big-endian, then bitcasts to `d_to`.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
                                      FromTypeTag /*from_type_tag*/,
                                      DF32 d_to, VBF16 v) {
  const RebindToUnsigned<DF32> du_to;
  const auto bits = BitCast(du_to, v);
#if HWY_IS_LITTLE_ENDIAN
  const auto f32_bits = ShiftLeft<16>(bits);
#else
  const auto f32_bits = And(bits, Set(du_to, uint32_t{0xFFFF0000u}));
#endif
  return BitCast(d_to, f32_bits);
}
// detail::PromoteOddTo overload for a float destination with 4-byte lanes,
// enabled when the lane type of VBF16 matches that of the bfloat16_t
// repartition of DF32. Works on the 32-bit unsigned view of `v`: keeps only
// the upper 16 bits of each lane on little-endian, or shifts each lane left
// by 16 on big-endian, then bitcasts to `d_to`.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
                                     FromTypeTag /*from_type_tag*/,
                                     DF32 d_to, VBF16 v) {
  const RebindToUnsigned<DF32> du_to;
  const auto bits = BitCast(du_to, v);
#if HWY_IS_LITTLE_ENDIAN
  const auto f32_bits = And(bits, Set(du_to, uint32_t{0xFFFF0000u}));
#else
  const auto f32_bits = ShiftLeft<16>(bits);
#endif
  return BitCast(d_to, f32_bits);
}
// Catch-all detail::PromoteEvenTo overload for any tag combination when D has
// exactly one lane: forwards directly to PromoteLowerTo.
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  VFromD<D> promoted = PromoteLowerTo(d_to, v);
  return promoted;
}
// Catch-all detail::PromoteEvenTo overload for any tag combination when D has
// more than one lane: applies ConcatEven(d, v, v) to the source vector and
// passes the result to PromoteLowerTo.
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_GT_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<V> d_from;
  const auto even_lanes = ConcatEven(d_from, v, v);
  return PromoteLowerTo(d_to, even_lanes);
}
// Catch-all detail::PromoteOddTo overload for any tag combination: applies
// ConcatOdd(d, v, v) to the source vector and passes the result to
// PromoteLowerTo.
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    ToTypeTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<V> d_from;
  const auto odd_lanes = ConcatOdd(d_from, v, v);
  return PromoteLowerTo(d_to, odd_lanes);
}
}
// Public entry point. Enabled when the lanes of D are twice as wide as those
// of V and V has the lane count given by HWY_MAX_LANES_V of the TFromV<V>
// repartition of D. Dispatches to the detail::PromoteEvenTo overload chosen
// by the destination type tag, the destination lane size in bytes, and the
// source type tag.
template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
  using TTo = TFromD<D>;
  using TFrom = TFromV<V>;
  return detail::PromoteEvenTo(hwy::TypeTag<TTo>(),
                               hwy::SizeTag<sizeof(TTo)>(),
                               hwy::TypeTag<TFrom>(), d, v);
}
// Public entry point. Enabled when the lanes of D are twice as wide as those
// of V and V has the lane count given by HWY_MAX_LANES_V of the TFromV<V>
// repartition of D. Dispatches to the detail::PromoteOddTo overload chosen
// by the destination type tag, the destination lane size in bytes, and the
// source type tag.
template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteOddTo(D d, V v) {
  using TTo = TFromD<D>;
  using TFrom = TFromV<V>;
  return detail::PromoteOddTo(hwy::TypeTag<TTo>(),
                              hwy::SizeTag<sizeof(TTo)>(),
                              hwy::TypeTag<TFrom>(), d, v);
}
#endif
#ifdef HWY_INSIDE_END_NAMESPACE
#undef HWY_INSIDE_END_NAMESPACE
} } HWY_AFTER_NAMESPACE();
#endif