use super::*;
#[cfg(target_arch = "x86_64")]
use crate::core_arch::x86::Avx2;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
// Expands to a call of a 128-bit intrinsic as a method on `$ext`.
//
// The method name is assembled with `paste!` as `_mm_<func>_<suffix>`, where
// the suffix depends on the element type token: `ps` for `f32`, `pd` for
// `f64`, and `ep<ty>` for any other type (so `i32` yields `_mm_<func>_epi32`).
// All remaining arguments are forwarded unchanged.
macro_rules! x86_call_128 {
// `f32` lanes: `_mm_<func>_ps`.
($ext: expr, $func: ident, f32, $($arg: expr),*) => {
paste!($ext.[<_mm_ $func _ ps>]($($arg),*))
};
// `f64` lanes: `_mm_<func>_pd`.
($ext: expr, $func: ident, f64, $($arg: expr),*) => {
paste!($ext.[<_mm_ $func _ pd>]($($arg),*))
};
// Fallback for every other type: `_mm_<func>_ep<ty>`.
($ext: expr, $func: ident, $ty: ty, $($arg: expr),*) => {
paste!($ext.[<_mm_ $func _ep $ty>]($($arg),*))
}
}
// Expands to a call of a 256-bit intrinsic as a method on `$ext`.
//
// The method name is assembled with `paste!` as `_mm256_<func>_<suffix>`,
// where the suffix depends on the element type token: `ps` for `f32`, `pd`
// for `f64`, and `ep<ty>` for any other type (so `i32` yields
// `_mm256_<func>_epi32`). All remaining arguments are forwarded unchanged.
macro_rules! x86_call_256 {
// `f32` lanes: `_mm256_<func>_ps`.
($ext: expr, $func: ident, f32, $($arg: expr),*) => {
paste!($ext.[<_mm256_ $func _ ps>]($($arg),*))
};
// `f64` lanes: `_mm256_<func>_pd`.
($ext: expr, $func: ident, f64, $($arg: expr),*) => {
paste!($ext.[<_mm256_ $func _ pd>]($($arg),*))
};
// Fallback for every other type: `_mm256_<func>_ep<ty>`.
($ext: expr, $func: ident, $ty: ty, $($arg: expr),*) => {
paste!($ext.[<_mm256_ $func _ep $ty>]($($arg),*))
}
}
// Sign-agnostic front end for `x86_call_128!`.
//
// Unsigned (`u8`..`u64`) and mask (`m8`..`m64`) element types are rewritten to
// the signed integer type of the same width before forwarding, so the call
// resolves to the `epi*` intrinsic name. Any other type identifier is
// forwarded unchanged by the last arm.
macro_rules! x86_call_128_nosign {
// Unsigned integers -> signed integers of the same width.
($ext: expr, $func: ident, u8, $($arg: expr),*) => {
x86_call_128!($ext, $func, i8, $($arg),*)
};
($ext: expr, $func: ident, u16, $($arg: expr),*) => {
x86_call_128!($ext, $func, i16, $($arg),*)
};
($ext: expr, $func: ident, u32, $($arg: expr),*) => {
x86_call_128!($ext, $func, i32, $($arg),*)
};
($ext: expr, $func: ident, u64, $($arg: expr),*) => {
x86_call_128!($ext, $func, i64, $($arg),*)
};
// Mask types -> signed integers of the same width.
($ext: expr, $func: ident, m8, $($arg: expr),*) => {
x86_call_128!($ext, $func, i8, $($arg),*)
};
($ext: expr, $func: ident, m16, $($arg: expr),*) => {
x86_call_128!($ext, $func, i16, $($arg),*)
};
($ext: expr, $func: ident, m32, $($arg: expr),*) => {
x86_call_128!($ext, $func, i32, $($arg),*)
};
($ext: expr, $func: ident, m64, $($arg: expr),*) => {
x86_call_128!($ext, $func, i64, $($arg),*)
};
// Everything else is passed through as-is.
($ext: expr, $func: ident, $ty: ident, $($arg: expr),*) => {
x86_call_128!($ext, $func, $ty, $($arg),*)
};
}
// Sign-agnostic front end for `x86_call_256!`.
//
// Unsigned (`u8`..`u64`) and mask (`m8`..`m64`) element types are rewritten to
// the signed integer type of the same width before forwarding, so the call
// resolves to the `epi*` intrinsic name. Any other type identifier is
// forwarded unchanged by the last arm.
macro_rules! x86_call_256_nosign {
// Unsigned integers -> signed integers of the same width.
($ext: expr, $func: ident, u8, $($arg: expr),*) => {
x86_call_256!($ext, $func, i8, $($arg),*)
};
($ext: expr, $func: ident, u16, $($arg: expr),*) => {
x86_call_256!($ext, $func, i16, $($arg),*)
};
($ext: expr, $func: ident, u32, $($arg: expr),*) => {
x86_call_256!($ext, $func, i32, $($arg),*)
};
($ext: expr, $func: ident, u64, $($arg: expr),*) => {
x86_call_256!($ext, $func, i64, $($arg),*)
};
// Mask types -> signed integers of the same width.
($ext: expr, $func: ident, m8, $($arg: expr),*) => {
x86_call_256!($ext, $func, i8, $($arg),*)
};
($ext: expr, $func: ident, m16, $($arg: expr),*) => {
x86_call_256!($ext, $func, i16, $($arg),*)
};
($ext: expr, $func: ident, m32, $($arg: expr),*) => {
x86_call_256!($ext, $func, i32, $($arg),*)
};
($ext: expr, $func: ident, m64, $($arg: expr),*) => {
x86_call_256!($ext, $func, i64, $($arg),*)
};
// Everything else is passed through as-is.
($ext: expr, $func: ident, $ty: ident, $($arg: expr),*) => {
x86_call_256!($ext, $func, $ty, $($arg),*)
};
}
// Generates two-operand 128-bit wrapper methods.
//
// Each generated method is named `<func>_<ty>x<factor>`, takes two
// `<ty>x<factor>` operands, and returns `<out>x<factor>`. Its body casts the
// operands, calls the intrinsic selected by `x86_call_128!` for `$op` and
// `$ty` on `self.$ext`, and casts the result back.
macro_rules! binop_128 {
// Worker arm: emits a single method.
($func: ident, $op: ident, $doc: literal, $ty: ident, $out: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>], b: [<$ty x $factor>]) -> [<$out x $factor>] {
cast!(x86_call_128!(self.$ext, $op, $ty, cast!(a), cast!(b)))
}
}
};
// List form with an explicit method name and a per-entry output element
// type (`ty x factor => out`).
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal => $out: ident),*) => {
$(binop_128!($func, $op, $doc, $ty, $out, $factor, $ext);)*
};
// List form where the method name is the operation name and the output
// element type equals the input element type.
($ext: ident: $op: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(binop_128!($op, $op, $doc, $ty, $ty, $factor, $ext);)*
};
// List form with an explicit method name; output element type equals the
// input element type.
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal),*) => {
$(binop_128!($func, $op, $doc, $ty, $ty, $factor, $ext);)*
};
}
// Generates two-operand 256-bit wrapper methods.
//
// Each generated method is named `<func>_<ty>x<factor>`, takes two
// `<ty>x<factor>` operands, and returns `<out>x<factor>`. Its body casts the
// operands, calls the intrinsic selected by `x86_call_256!` for `$op` and
// `$ty` on `self.$ext`, and casts the result back.
macro_rules! binop_256 {
// Worker arm: emits a single method.
($func: ident, $op: ident, $doc: literal, $ty: ident, $out: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>], b: [<$ty x $factor>]) -> [<$out x $factor>] {
cast!(x86_call_256!(self.$ext, $op, $ty, cast!(a), cast!(b)))
}
}
};
// List form with an explicit method name and a per-entry output element
// type (`ty x factor => out`).
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal => $out: ident),*) => {
$(binop_256!($func, $op, $doc, $ty, $out, $factor, $ext);)*
};
// List form where the method name is the operation name and the output
// element type equals the input element type.
($ext: ident: $op: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(binop_256!($op, $op, $doc, $ty, $ty, $factor, $ext);)*
};
// List form with an explicit method name; output element type equals the
// input element type.
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal),*) => {
$(binop_256!($func, $op, $doc, $ty, $ty, $factor, $ext);)*
};
}
// Generates single-operand 128-bit wrapper methods.
//
// Each generated method is named `<func>_<ty>x<factor>`, takes one
// `<ty>x<factor>` operand and returns the same type. Its body casts the
// operand, calls the intrinsic selected by `x86_call_128!` for `$op` and `$ty`
// on `self.$ext`, and casts the result back.
macro_rules! unop_128 {
// Worker arm: emits a single method.
($func: ident, $op: ident, $doc: literal, $ty: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>]) -> [<$ty x $factor>] {
cast!(x86_call_128!(self.$ext, $op, $ty, cast!(a)))
}
}
};
// List form where the method name is the operation name.
($ext: ident: $op: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(unop_128!($op, $op, $doc, $ty, $factor, $ext);)*
};
// List form with an explicit method name.
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal),*) => {
$(unop_128!($func, $op, $doc, $ty, $factor, $ext);)*
};
}
// Generates single-operand 256-bit wrapper methods.
//
// Each generated method is named `<func>_<ty>x<factor>`, takes one
// `<ty>x<factor>` operand and returns the same type. Its body casts the
// operand, calls the intrinsic selected by `x86_call_256!` for `$op` and `$ty`
// on `self.$ext`, and casts the result back.
macro_rules! unop_256 {
// Worker arm: emits a single method.
($func: ident, $op: ident, $doc: literal, $ty: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>]) -> [<$ty x $factor>] {
cast!(x86_call_256!(self.$ext, $op, $ty, cast!(a)))
}
}
};
// List form where the method name is the operation name.
($ext: ident: $op: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(unop_256!($op, $op, $doc, $ty, $factor, $ext);)*
};
// List form with an explicit method name.
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal),*) => {
$(unop_256!($func, $op, $doc, $ty, $factor, $ext);)*
};
}
// Generates two-operand 128-bit wrapper methods for operations whose
// intrinsic does not depend on signedness.
//
// Identical in shape to the methods produced for signed types, but the
// intrinsic is resolved through `x86_call_128_nosign!`, which maps unsigned
// and mask element types to the signed integer type of the same width.
macro_rules! binop_128_nosign {
// Worker arm: emits a single method named `<func>_<ty>x<factor>` that
// returns `<out>x<factor>`.
($func: ident, $op: ident, $doc: literal, $ty: ident, $out: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>], b: [<$ty x $factor>]) -> [<$out x $factor>] {
cast!(x86_call_128_nosign!(self.$ext, $op, $ty, cast!(a), cast!(b)))
}
}
};
// List form with an explicit method name and a per-entry output element
// type (`ty x factor => out`).
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal => $out: ident),*) => {
$(binop_128_nosign!($func, $op, $doc, $ty, $out, $factor, $ext);)*
};
// List form with an explicit method name; output element type equals the
// input element type.
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal),*) => {
$(binop_128_nosign!($func, $op, $doc, $ty, $ty, $factor, $ext);)*
};
// List form where a single identifier serves as both the method name and
// the operation name.
($ext: ident: $func: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(binop_128_nosign!($func, $func, $doc, $ty, $ty, $factor, $ext);)*
};
}
// Generates two-operand 256-bit wrapper methods for operations whose
// intrinsic does not depend on signedness.
//
// Identical in shape to the methods produced for signed types, but the
// intrinsic is resolved through `x86_call_256_nosign!`, which maps unsigned
// and mask element types to the signed integer type of the same width.
macro_rules! binop_256_nosign {
// Worker arm: emits a single method named `<func>_<ty>x<factor>` that
// returns `<out>x<factor>`.
($func: ident, $op: ident, $doc: literal, $ty: ident, $out: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>], b: [<$ty x $factor>]) -> [<$out x $factor>] {
cast!(x86_call_256_nosign!(self.$ext, $op, $ty, cast!(a), cast!(b)))
}
}
};
// List form with an explicit method name and a per-entry output element
// type (`ty x factor => out`).
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal => $out: ident),*) => {
$(binop_256_nosign!($func, $op, $doc, $ty, $out, $factor, $ext);)*
};
// List form with an explicit method name; output element type equals the
// input element type.
($ext: ident: $op: ident, $doc: literal, $func: ident, $($ty: ident x $factor: literal),*) => {
$(binop_256_nosign!($func, $op, $doc, $ty, $ty, $factor, $ext);)*
};
// List form where a single identifier serves as both the method name and
// the operation name.
($ext: ident: $func: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(binop_256_nosign!($func, $func, $doc, $ty, $ty, $factor, $ext);)*
};
}
// Generates two-operand wrapper methods that operate on the whole 128-bit
// register regardless of element type.
//
// Each generated method is named `<func>_<ty>x<factor>` and forwards to
// `self.$ext._mm_<func>_si128`, casting the operands on the way in and the
// result on the way out.
macro_rules! binop_128_full {
// Worker arm: emits a single method.
($func: ident, $doc: literal, $ty: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>], b: [<$ty x $factor>]) -> [<$ty x $factor>] {
cast!(self.$ext.[<_mm_ $func _si128>](cast!(a), cast!(b)))
}
}
};
// List form: one method per `ty x factor` entry.
($ext: ident: $func: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(binop_128_full!($func, $doc, $ty, $factor, $ext);)*
};
}
// Generates two-operand wrapper methods that operate on the whole 256-bit
// register regardless of element type.
//
// Each generated method is named `<func>_<ty>x<factor>` and forwards to
// `self.$ext._mm256_<func>_si256`, casting the operands on the way in and the
// result on the way out.
macro_rules! binop_256_full {
// Worker arm: emits a single method.
($func: ident, $doc: literal, $ty: ident, $factor: literal, $ext: ident) => {
paste! {
#[inline(always)]
#[doc = $doc]
pub fn [<$func _ $ty x $factor>](self, a: [<$ty x $factor>], b: [<$ty x $factor>]) -> [<$ty x $factor>] {
cast!(self.$ext.[<_mm256_ $func _si256>](cast!(a), cast!(b)))
}
}
};
// List form: one method per `ty x factor` entry.
($ext: ident: $func: ident, $doc: literal, $($ty: ident x $factor: literal),*) => {
$(binop_256_full!($func, $doc, $ty, $factor, $ext);)*
};
}
mod v1;
mod v2;
mod v3;
#[cfg(feature = "x86-v4")]
#[cfg_attr(docsrs, doc(cfg(feature = "x86-v4")))]
mod v4;
pub use v1::*;
pub use v2::*;
pub use v3::*;
#[cfg(feature = "x86-v4")]
#[cfg_attr(docsrs, doc(cfg(feature = "x86-v4")))]
pub use v4::*;
/// Jumps to the assembly routine `f` and returns whatever is in `ymm0`
/// afterwards, reinterpreted as a `u32x8`.
///
/// The asm block stores the address of the local label `2` (the instruction
/// after the jump) in `rcx`, passes `ptr` in `rax`, and transfers control to
/// `f` with `jmp`. `rcx` and `ymm1` are declared as clobbered.
///
/// # Safety
///
/// The CPU must support the `avx` and `avx2` target features enabled on this
/// function.
///
/// NOTE(review): presumably `f` must read from the address in `rax` into
/// `ymm0`, touch no registers other than those listed in the asm block, and
/// resume execution by jumping to the address in `rcx`; `ptr` must be valid
/// for whatever `f` reads. Confirm against the routines that are passed here.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx,avx2")]
#[inline]
unsafe fn avx_ld_u32s(ptr: *const u32, f: unsafe extern "C" fn()) -> u32x8 {
let ret: __m256;
core::arch::asm! {
// Return address for `f`: the local label `2` placed right after the jump.
"lea rcx, [rip + 2f]",
"jmp {f}",
"2:",
f = in(reg) f,
in("rax") ptr,
out("rcx") _,
out("ymm0") ret,
out("ymm1") _,
};
cast!(ret)
}
/// Jumps to the assembly routine `f` with `ptr` in `rax` and `value`
/// (reinterpreted as `__m256`) in `ymm0`.
///
/// The asm block stores the address of the local label `2` (the instruction
/// after the jump) in `rcx` and transfers control to `f` with `jmp`. The final
/// contents of `ymm0` are discarded; `rcx` and `ymm1` are declared as
/// clobbered.
///
/// # Safety
///
/// The CPU must support the `avx` and `avx2` target features enabled on this
/// function.
///
/// NOTE(review): presumably `f` must write (part of) `ymm0` to the address in
/// `rax`, touch no registers other than those listed in the asm block, and
/// resume execution by jumping to the address in `rcx`; `ptr` must be valid
/// for whatever `f` writes. Confirm against the routines that are passed here.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx,avx2")]
#[inline]
unsafe fn avx_st_u32s(ptr: *mut u32, value: u32x8, f: unsafe extern "C" fn()) {
core::arch::asm! {
// Return address for `f`: the local label `2` placed right after the jump.
"lea rcx, [rip + 2f]",
"jmp {f}",
"2:",
f = in(reg) f,
in("rax") ptr,
out("rcx") _,
inout("ymm0") cast::<_, __m256>(value) => _,
out("ymm1") _,
};
}
/// Instruction-set selection used by [`Arch::dispatch`].
///
/// The set of variants depends on the enabled cargo features, and the enum is
/// marked `#[non_exhaustive]`.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
#[repr(u8)]
pub enum Arch {
/// No SIMD token; operations are dispatched with `Scalar`.
Scalar = 0,
/// Carries a `V3` token. Only present with the `x86-v3` feature.
#[cfg(feature = "x86-v3")]
#[cfg_attr(docsrs, doc(cfg(feature = "x86-v3")))]
V3(V3) = 1,
/// Carries a `V4` token. Only present with the `x86-v4` feature.
#[cfg(feature = "x86-v4")]
#[cfg_attr(docsrs, doc(cfg(feature = "x86-v4")))]
V4(V4) = 2,
}
impl Arch {
    /// Picks the first variant whose token can be created.
    ///
    /// `V4::try_new()` is consulted first (when the `x86-v4` feature is
    /// enabled), then `V3::try_new()` (when `x86-v3` is enabled). If neither
    /// yields a token the result is [`Arch::Scalar`].
    #[inline]
    pub fn new() -> Self {
        #[cfg(feature = "x86-v4")]
        if let Some(v4) = V4::try_new() {
            return Arch::V4(v4);
        }
        #[cfg(feature = "x86-v3")]
        if let Some(v3) = V3::try_new() {
            return Arch::V3(v3);
        }
        Arch::Scalar
    }

    /// Runs `op` through `Simd::vectorize` with the token stored in `self`
    /// (or with `Scalar` for [`Arch::Scalar`]) and returns its output.
    #[inline(always)]
    pub fn dispatch<Op: WithSimd>(self, op: Op) -> Op::Output {
        match self {
            Self::Scalar => Simd::vectorize(Scalar, op),
            #[cfg(feature = "x86-v3")]
            Self::V3(token) => Simd::vectorize(token, op),
            #[cfg(feature = "x86-v4")]
            Self::V4(token) => Simd::vectorize(token, op),
        }
    }
}
impl Default for Arch {
#[inline]
fn default() -> Self {
Self::new()
}
}
// Textually includes `x86_64_asm.rs` from the build's `OUT_DIR` (x86_64 only).
// NOTE(review): presumably generated by the crate's build script and the
// source of the routines passed as `f` to `avx_ld_u32s` / `avx_st_u32s` —
// confirm.
#[cfg(target_arch = "x86_64")]
include!(concat!(env!("OUT_DIR"), "/x86_64_asm.rs"));
#[cfg(test)]
mod tests {
extern crate alloc;
use super::*;
use alloc::vec;
use alloc::vec::Vec;
use assert_approx_eq::assert_approx_eq;
use core::iter::zip;
use rand::random;
// Debug-print helper for tests, shaped like `dbg!` but formatting values with
// `{:#X?}` (pretty-printed `Debug` with upper-case hexadecimal integers).
// Output goes to stderr, prefixed with the file name and line number.
// Marked `allow(unused_macros)` because it may have no callers.
#[allow(unused_macros)]
macro_rules! dbgx {
// No argument: print only the location.
() => {
::std::eprintln!("[{}:{}]", ::std::file!(), ::std::line!())
};
// One expression: print its source text and value, then yield the value.
($val:expr $(,)?) => {
match $val {
tmp => {
::std::eprintln!("[{}:{}] {} = {:#X?}",
::std::file!(), ::std::line!(), ::std::stringify!($val), &tmp);
tmp
}
}
};
// Several expressions: apply the single-expression form to each and yield
// the results as a tuple.
($($val:expr),+ $(,)?) => {
($(dbgx!($val)),+,)
};
}
/// Dispatches two operations on the detected `Arch`: a closure that doubles
/// every element, then `TimesThree`, which triples them. Afterwards element
/// `i` must equal `6 * i`.
#[test]
fn times_two() {
    /// Multiplies every element of the wrapped slice by three.
    struct TimesThree<'a>(&'a mut [f64]);

    impl WithSimd for TimesThree<'_> {
        type Output = ();

        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
            // Split into whole SIMD vectors and the scalar remainder.
            let (vectors, remainder) = S::as_mut_simd_f64s(self.0);
            let factor = simd.splat_f64s(3.0);
            for lanes in vectors.iter_mut() {
                *lanes = simd.mul_f64s(factor, *lanes);
            }
            for value in remainder.iter_mut() {
                *value *= 3.0;
            }
        }
    }

    let len = 1312;
    let mut values: Vec<_> = (0..len).map(|i| i as f64).collect();
    let arch = Arch::new();

    arch.dispatch(|| {
        for value in values.iter_mut() {
            *value *= 2.0;
        }
    });
    arch.dispatch(TimesThree(&mut values));

    for (index, value) in values.into_iter().enumerate() {
        assert_eq!(value, 6.0 * index as f64);
    }
}
/// Checks the element-wise complex `c32` kernels (`mul_e`, `conj_mul_e`,
/// `mul_add_e`, `conj_mul_add_e`) of `V3` (and `V4` with the `x86-v4`
/// feature) against the same expressions evaluated with scalar `c32`
/// arithmetic on random inputs.
#[test]
fn cplx_ops() {
    /// Asserts that both slices agree approximately, component by component.
    fn assert_all_close(expected: &[c32], actual: &[c32]) {
        for (target, actual) in zip(expected, actual) {
            assert_approx_eq!(target.re, actual.re);
            assert_approx_eq!(target.im, actual.im);
        }
    }

    let n = 16;
    let random_c32s = || -> Vec<c32> {
        (0..n)
            .map(|_| c32 {
                re: random(),
                im: random(),
            })
            .collect()
    };
    let a = random_c32s();
    let b = random_c32s();
    let c = random_c32s();

    // Scalar reference results.
    let axb_target: Vec<_> = zip(&a, &b).map(|(x, y)| x * y).collect();
    let conjaxb_target: Vec<_> = zip(&a, &b).map(|(x, y)| x.conj() * y).collect();
    let axbpc_target: Vec<_> = zip(zip(&a, &b), &c)
        .map(|((x, y), z)| x * y + z)
        .collect();
    let conjaxbpc_target: Vec<_> = zip(zip(&a, &b), &c)
        .map(|((x, y), z)| x.conj() * y + z)
        .collect();

    if let Some(simd) = V3::try_new() {
        let mut axb = vec![c32::new(0.0, 0.0); n];
        let mut conjaxb = vec![c32::new(0.0, 0.0); n];
        let mut axbpc = vec![c32::new(0.0, 0.0); n];
        let mut conjaxbpc = vec![c32::new(0.0, 0.0); n];
        {
            // Only the vector part (`.0`) of each split is used.
            let (lhs, _) = V3::as_simd_c32s(&a);
            let (rhs, _) = V3::as_simd_c32s(&b);
            let (acc, _) = V3::as_simd_c32s(&c);
            let (out_axb, _) = V3::as_mut_simd_c32s(&mut axb);
            let (out_conjaxb, _) = V3::as_mut_simd_c32s(&mut conjaxb);
            let (out_axbpc, _) = V3::as_mut_simd_c32s(&mut axbpc);
            let (out_conjaxbpc, _) = V3::as_mut_simd_c32s(&mut conjaxbpc);

            for (dst, (x, y)) in zip(out_axb, zip(lhs, rhs)) {
                *dst = simd.mul_e_c32s(*x, *y);
            }
            for (dst, (x, y)) in zip(out_conjaxb, zip(lhs, rhs)) {
                *dst = simd.conj_mul_e_c32s(*x, *y);
            }
            for (dst, ((x, y), z)) in zip(out_axbpc, zip(zip(lhs, rhs), acc)) {
                *dst = simd.mul_add_e_c32s(*x, *y, *z);
            }
            for (dst, ((x, y), z)) in zip(out_conjaxbpc, zip(zip(lhs, rhs), acc)) {
                *dst = simd.conj_mul_add_e_c32s(*x, *y, *z);
            }
        }
        assert_all_close(&axb_target, &axb);
        assert_all_close(&conjaxb_target, &conjaxb);
        assert_all_close(&axbpc_target, &axbpc);
        assert_all_close(&conjaxbpc_target, &conjaxbpc);
    }

    #[cfg(feature = "x86-v4")]
    if let Some(simd) = V4::try_new() {
        let mut axb = vec![c32::new(0.0, 0.0); n];
        let mut conjaxb = vec![c32::new(0.0, 0.0); n];
        let mut axbpc = vec![c32::new(0.0, 0.0); n];
        let mut conjaxbpc = vec![c32::new(0.0, 0.0); n];
        {
            // Only the vector part (`.0`) of each split is used.
            let (lhs, _) = V4::as_simd_c32s(&a);
            let (rhs, _) = V4::as_simd_c32s(&b);
            let (acc, _) = V4::as_simd_c32s(&c);
            let (out_axb, _) = V4::as_mut_simd_c32s(&mut axb);
            let (out_conjaxb, _) = V4::as_mut_simd_c32s(&mut conjaxb);
            let (out_axbpc, _) = V4::as_mut_simd_c32s(&mut axbpc);
            let (out_conjaxbpc, _) = V4::as_mut_simd_c32s(&mut conjaxbpc);

            for (dst, (x, y)) in zip(out_axb, zip(lhs, rhs)) {
                *dst = simd.mul_e_c32s(*x, *y);
            }
            for (dst, (x, y)) in zip(out_conjaxb, zip(lhs, rhs)) {
                *dst = simd.conj_mul_e_c32s(*x, *y);
            }
            for (dst, ((x, y), z)) in zip(out_axbpc, zip(zip(lhs, rhs), acc)) {
                *dst = simd.mul_add_e_c32s(*x, *y, *z);
            }
            for (dst, ((x, y), z)) in zip(out_conjaxbpc, zip(zip(lhs, rhs), acc)) {
                *dst = simd.conj_mul_add_e_c32s(*x, *y, *z);
            }
        }
        assert_all_close(&axb_target, &axb);
        assert_all_close(&conjaxb_target, &conjaxb);
        assert_all_close(&axbpc_target, &axbpc);
        assert_all_close(&conjaxbpc_target, &conjaxbpc);
    }
}
/// Obtains a reference from a `V2` token via `to_ref` and dereferences it.
#[test]
fn test_to_ref() {
    // NOTE(review): `new_unchecked` is called without a prior feature check,
    // unlike the `try_new` guard used by the other tests in this module —
    // confirm this is intended.
    let token = unsafe { V2::new_unchecked() };
    let simd_ref = token.to_ref();
    let _ = *simd_ref;
}
/// Checks `widening_mul_u32x4` (V2) and `widening_mul_u32x8` (V3) against a
/// per-lane 64-bit product split into its low and high 32-bit halves.
#[test]
fn test_widening_mul_u32x4() {
    if let Some(simd) = V2::try_new() {
        const N: usize = 4;
        let a = u32x4(2298413717, 568259975, 2905436181, 175547995);
        let b = u32x4(2022374205, 1446824162, 3165580604, 3011091403);
        let a_lanes: [u32; N] = cast!(a);
        let b_lanes: [u32; N] = cast!(b);

        let mut expected_lo = [0u32; N];
        let mut expected_hi = [0u32; N];
        for (i, (&x, &y)) in zip(&a_lanes, &b_lanes).enumerate() {
            let wide = u64::from(x) * u64::from(y);
            expected_lo[i] = wide as u32;
            expected_hi[i] = (wide >> 32) as u32;
        }

        let (lo, hi) = simd.widening_mul_u32x4(a, b);
        assert_eq!(lo, cast!(expected_lo));
        assert_eq!(hi, cast!(expected_hi));
    }
    if let Some(simd) = V3::try_new() {
        const N: usize = 8;
        let a = u32x8(
            2298413717, 568259975, 2905436181, 175547995, 2298413717, 568259975, 2905436181,
            175547995,
        );
        let b = u32x8(
            2022374205, 1446824162, 3165580604, 3011091403, 2022374205, 1446824162, 3165580604,
            3011091403,
        );
        let a_lanes: [u32; N] = cast!(a);
        let b_lanes: [u32; N] = cast!(b);

        let mut expected_lo = [0u32; N];
        let mut expected_hi = [0u32; N];
        for (i, (&x, &y)) in zip(&a_lanes, &b_lanes).enumerate() {
            let wide = u64::from(x) * u64::from(y);
            expected_lo[i] = wide as u32;
            expected_hi[i] = (wide >> 32) as u32;
        }

        let (lo, hi) = simd.widening_mul_u32x8(a, b);
        assert_eq!(lo, cast!(expected_lo));
        assert_eq!(hi, cast!(expected_hi));
    }
}
/// Checks `widening_mul_i32x4` (V2) and `widening_mul_i32x8` (V3) against a
/// per-lane signed 64-bit product split into its low and high 32-bit halves.
/// The inputs are the `u32` test vectors reinterpreted through `cast!`.
#[test]
fn test_widening_mul_i32() {
    if let Some(simd) = V2::try_new() {
        const N: usize = 4;
        let a = cast!(u32x4(2298413717, 568259975, 2905436181, 175547995));
        let b = cast!(u32x4(2022374205, 1446824162, 3165580604, 3011091403));
        let a_lanes: [i32; N] = cast!(a);
        let b_lanes: [i32; N] = cast!(b);

        let mut expected_lo = [0i32; N];
        let mut expected_hi = [0i32; N];
        for (i, (&x, &y)) in zip(&a_lanes, &b_lanes).enumerate() {
            let wide = i64::from(x) * i64::from(y);
            expected_lo[i] = wide as i32;
            expected_hi[i] = (wide >> 32) as i32;
        }

        let (lo, hi) = simd.widening_mul_i32x4(a, b);
        assert_eq!(lo, cast!(expected_lo));
        assert_eq!(hi, cast!(expected_hi));
    }
    if let Some(simd) = V3::try_new() {
        const N: usize = 8;
        let a = cast!(u32x8(
            2298413717, 568259975, 2905436181, 175547995, 2298413717, 568259975, 2905436181,
            175547995,
        ));
        let b = cast!(u32x8(
            2022374205, 1446824162, 3165580604, 3011091403, 2022374205, 1446824162, 3165580604,
            3011091403,
        ));
        let a_lanes: [i32; N] = cast!(a);
        let b_lanes: [i32; N] = cast!(b);

        let mut expected_lo = [0i32; N];
        let mut expected_hi = [0i32; N];
        for (i, (&x, &y)) in zip(&a_lanes, &b_lanes).enumerate() {
            let wide = i64::from(x) * i64::from(y);
            expected_lo[i] = wide as i32;
            expected_hi[i] = (wide >> 32) as i32;
        }

        let (lo, hi) = simd.widening_mul_i32x8(a, b);
        assert_eq!(lo, cast!(expected_lo));
        assert_eq!(hi, cast!(expected_hi));
    }
}
/// Shifting `u16` lanes left by 16 (constant) or by an all-ones count must
/// produce all-zero lanes.
#[test]
fn test_shift() {
    if let Some(simd) = V2::try_new() {
        let input = u16x8(54911, 46958, 49991, 22366, 46365, 39572, 22704, 60060);

        let by_const = simd.shl_const_u16x8::<16>(input);
        assert_eq!(by_const, simd.splat_u16x8(0));

        let by_huge_count = simd.shl_u16x8(input, simd.splat_u64x2(!0));
        assert_eq!(by_huge_count, simd.splat_u16x8(0));
    }
}
/// `abs_f32x4` and `abs_f64x2` must clear the sign of every lane.
#[test]
fn test_abs() {
    if let Some(simd) = V2::try_new() {
        let singles = f32x4(1.0, -2.0, -1.0, 2.0);
        let abs_singles = simd.abs_f32x4(singles);
        assert_eq!(abs_singles, f32x4(1.0, 2.0, 1.0, 2.0));

        let doubles = f64x2(1.0, -2.0);
        let abs_doubles = simd.abs_f64x2(doubles);
        assert_eq!(abs_doubles, f64x2(1.0, 2.0));
    }
}
/// `subadd_f32x4(a, a)` on `(1, -2, -1, 2)` must give `(0, -4, 0, 4)`:
/// even lanes subtracted, odd lanes added.
#[test]
fn test_subadd() {
    if let Some(simd) = V2::try_new() {
        let input = f32x4(1.0, -2.0, -1.0, 2.0);
        let result = simd.subadd_f32x4(input, input);
        assert_eq!(result, f32x4(0.0, -4.0, 0.0, 4.0));
    }
}
/// Converting `i8` lanes `1` and `-1` with `convert_i8x16_to_u64x2` must
/// yield `1` and an all-ones `u64` (`!0`).
#[test]
fn test_signed_to_unsigned() {
    if let Some(simd) = V2::try_new() {
        let bytes = i8x16(1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let widened = simd.convert_i8x16_to_u64x2(bytes);
        assert_eq!(widened, u64x2(1, !0));
    }
}
/// Checks the unsigned less-than comparisons (`cmp_lt_u8x16`, `cmp_lt_u16x8`,
/// `cmp_lt_u32x4`, `cmp_lt_u64x2`) against a per-lane scalar `<`.
#[test]
fn test_int_cmp() {
    if let Some(simd) = V2::try_new() {
        // 8-bit lanes.
        {
            const N: usize = 16;
            let a = u8x16(
                174, 191, 248, 232, 11, 186, 42, 236, 3, 59, 223, 72, 161, 146, 98, 69,
            );
            let b = u8x16(
                97, 239, 164, 173, 208, 0, 121, 247, 218, 58, 119, 131, 213, 133, 22, 128,
            );
            let lt = simd.cmp_lt_u8x16(a, b);
            let a_lanes: [u8; N] = cast!(a);
            let b_lanes: [u8; N] = cast!(b);
            let expected: [m8; N] = core::array::from_fn(|i| m8::new(a_lanes[i] < b_lanes[i]));
            assert_eq!(lt, cast!(expected));
        }
        // 16-bit lanes.
        {
            const N: usize = 8;
            let a = u16x8(174, 191, 248, 232, 11, 186, 42, 236);
            let b = u16x8(97, 239, 164, 173, 208, 0, 121, 247);
            let lt = simd.cmp_lt_u16x8(a, b);
            let a_lanes: [u16; N] = cast!(a);
            let b_lanes: [u16; N] = cast!(b);
            let expected: [m16; N] = core::array::from_fn(|i| m16::new(a_lanes[i] < b_lanes[i]));
            assert_eq!(lt, cast!(expected));
        }
        // 32-bit lanes.
        {
            const N: usize = 4;
            let a = u32x4(174, 191, 248, 232);
            let b = u32x4(97, 239, 164, 173);
            let lt = simd.cmp_lt_u32x4(a, b);
            let a_lanes: [u32; N] = cast!(a);
            let b_lanes: [u32; N] = cast!(b);
            let expected: [m32; N] = core::array::from_fn(|i| m32::new(a_lanes[i] < b_lanes[i]));
            assert_eq!(lt, cast!(expected));
        }
        // 64-bit lanes.
        {
            const N: usize = 2;
            let a = u64x2(174, 191);
            let b = u64x2(97, 239);
            let lt = simd.cmp_lt_u64x2(a, b);
            let a_lanes: [u64; N] = cast!(a);
            let b_lanes: [u64; N] = cast!(b);
            let expected: [m64; N] = core::array::from_fn(|i| m64::new(a_lanes[i] < b_lanes[i]));
            assert_eq!(lt, cast!(expected));
        }
    }
}
/// `is_nan_f32x4` / `is_nan_f64x2` must flag exactly the NaN lanes
/// (including `-NAN`) and leave zero and infinity unflagged.
#[test]
fn test_is_nan() {
    if let Some(simd) = V2::try_new() {
        let singles = f32x4(0.0, f32::NAN, f32::INFINITY, -f32::NAN);
        let expected_singles = m32x4(
            m32::new(false),
            m32::new(true),
            m32::new(false),
            m32::new(true),
        );
        assert_eq!(simd.is_nan_f32x4(singles), expected_singles);

        let doubles = f64x2(0.0, f64::NAN);
        let expected_doubles = m64x2(m64::new(false), m64::new(true));
        assert_eq!(simd.is_nan_f64x2(doubles), expected_doubles);
    }
}
/// For every rotation amount in `0..128`, lane `i` of the input must end up
/// at lane `(i + amount) % LANES` after `rotate_right_u32s` /
/// `rotate_right_u64s`, for `V3` and (with the `x86-v4` feature) `V4`.
#[test]
fn test_rotate() {
    if let Some(simd) = V3::try_new() {
        for amount in 0..128 {
            let lanes: [u32; 8] = core::array::from_fn(|i| 1000 + i as u32);
            let rotated: [u32; 8] = cast!(simd.rotate_right_u32s(cast!(lanes), amount));
            for (i, &lane) in lanes.iter().enumerate() {
                assert_eq!(rotated[(i + amount) % 8], lane);
            }
        }
        for amount in 0..128 {
            let lanes: [u64; 4] = core::array::from_fn(|i| 1000 + i as u64);
            let rotated: [u64; 4] = cast!(simd.rotate_right_u64s(cast!(lanes), amount));
            for (i, &lane) in lanes.iter().enumerate() {
                assert_eq!(rotated[(i + amount) % 4], lane);
            }
        }
    }
    #[cfg(feature = "x86-v4")]
    if let Some(simd) = V4::try_new() {
        for amount in 0..128 {
            let lanes: [u32; 16] = core::array::from_fn(|i| 1000 + i as u32);
            let rotated: [u32; 16] = cast!(simd.rotate_right_u32s(cast!(lanes), amount));
            for (i, &lane) in lanes.iter().enumerate() {
                assert_eq!(rotated[(i + amount) % 16], lane);
            }
        }
        for amount in 0..128 {
            let lanes: [u64; 8] = core::array::from_fn(|i| 1000 + i as u64);
            let rotated: [u64; 8] = cast!(simd.rotate_right_u64s(cast!(lanes), amount));
            for (i, &lane) in lanes.iter().enumerate() {
                assert_eq!(rotated[(i + amount) % 8], lane);
            }
        }
    }
}
/// For every prefix length `n`, `partial_load_f32s` must agree with a masked
/// pointer load using `mask_between_m32s(0, n)`, and a partial store of the
/// loaded value must reproduce the prefix while leaving the rest of the
/// destination zeroed. Runs for `V3` and (with the `x86-v4` feature) `V4`.
#[test]
fn test_partial() {
    if let Some(simd) = V3::try_new() {
        for n in 0..=8 {
            let src: [f32; 8] = core::array::from_fn(|i| i as f32);
            let mut dst = [0.0f32; 8];
            let zero = dst;

            let partial = simd.partial_load_f32s(&src[..n]);
            // NOTE(review): presumably sound because the mask only enables
            // the first `n <= 8` lanes, all inside `src` — confirm against
            // the contract of `mask_load_ptr_f32s`.
            let masked = unsafe {
                simd.mask_load_ptr_f32s(simd.mask_between_m32s(0, n as u32), src.as_ptr())
            };
            assert_eq!(partial, masked);

            {
                let src_prefix = &src[..n];
                let dst_prefix = &mut dst[..n];
                simd.partial_store_f32s(dst_prefix, simd.partial_load_f32s(src_prefix));
                assert_eq!(src_prefix, dst_prefix);
            }
            assert_eq!(dst[n..], zero[n..]);
        }
    }
    #[cfg(feature = "x86-v4")]
    if let Some(simd) = V4::try_new() {
        for n in 0..=16 {
            let src: [f32; 16] = core::array::from_fn(|i| i as f32);
            let mut dst = [0.0f32; 16];
            let zero = dst;

            let partial = simd.partial_load_f32s(&src[..n]);
            // NOTE(review): presumably sound because the mask only enables
            // the first `n <= 16` lanes, all inside `src` — confirm against
            // the contract of `mask_load_ptr_f32s`.
            let masked = unsafe {
                simd.mask_load_ptr_f32s(simd.mask_between_m32s(0, n as u32), src.as_ptr())
            };
            assert_eq!(partial, masked);

            {
                let src_prefix = &src[..n];
                let dst_prefix = &mut dst[..n];
                simd.partial_store_f32s(dst_prefix, simd.partial_load_f32s(src_prefix));
                assert_eq!(src_prefix, dst_prefix);
            }
            assert_eq!(dst[n..], zero[n..]);
        }
    }
}
/// Round-trips `deinterleave_shfl_*` / `interleave_shfl_*` for groups of 2, 3
/// (V4 `f64` only) and 4 vectors.
///
/// The inputs are laid out so that, after deinterleaving, vector `k` equals
/// vector `0` plus `0.k` in every lane; re-interleaving must give back the
/// original input.
#[test]
fn test_interleave() {
    if let Some(simd) = V3::try_new() {
        {
            let interleaved = [f64x4(0.0, 0.1, 1.0, 1.1), f64x4(2.0, 2.1, 3.0, 3.1)];
            let planes = simd.deinterleave_shfl_f64s(interleaved);
            assert_eq!(planes[1], simd.add_f64x4(planes[0], simd.splat_f64x4(0.1)));
            assert_eq!(interleaved, simd.interleave_shfl_f64s(planes));
        }
        {
            let interleaved = [
                f64x4(0.0, 0.1, 0.2, 0.3),
                f64x4(1.0, 1.1, 1.2, 1.3),
                f64x4(2.0, 2.1, 2.2, 2.3),
                f64x4(3.0, 3.1, 3.2, 3.3),
            ];
            let planes = simd.deinterleave_shfl_f64s(interleaved);
            assert_eq!(planes[1], simd.add_f64x4(planes[0], simd.splat_f64x4(0.1)));
            assert_eq!(planes[2], simd.add_f64x4(planes[0], simd.splat_f64x4(0.2)));
            assert_eq!(planes[3], simd.add_f64x4(planes[0], simd.splat_f64x4(0.3)));
            assert_eq!(interleaved, simd.interleave_shfl_f64s(planes));
        }
        {
            let interleaved = [
                f32x8(0.0, 0.1, 1.0, 1.1, 2.0, 2.1, 3.0, 3.1),
                f32x8(4.0, 4.1, 5.0, 5.1, 6.0, 6.1, 7.0, 7.1),
            ];
            let planes = simd.deinterleave_shfl_f32s(interleaved);
            assert_eq!(planes[1], simd.add_f32x8(planes[0], simd.splat_f32x8(0.1)));
            assert_eq!(interleaved, simd.interleave_shfl_f32s(planes));
        }
        {
            let interleaved = [
                f32x8(0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3),
                f32x8(2.0, 2.1, 2.2, 2.3, 3.0, 3.1, 3.2, 3.3),
                f32x8(4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3),
                f32x8(6.0, 6.1, 6.2, 6.3, 7.0, 7.1, 7.2, 7.3),
            ];
            let planes = simd.deinterleave_shfl_f32s(interleaved);
            assert_eq!(planes[1], simd.add_f32x8(planes[0], simd.splat_f32x8(0.1)));
            assert_eq!(planes[2], simd.add_f32x8(planes[0], simd.splat_f32x8(0.2)));
            assert_eq!(planes[3], simd.add_f32x8(planes[0], simd.splat_f32x8(0.3)));
            assert_eq!(interleaved, simd.interleave_shfl_f32s(planes));
        }
    }
    #[cfg(feature = "x86-v4")]
    if let Some(simd) = V4::try_new() {
        {
            let interleaved = [
                f64x8(0.0, 0.1, 1.0, 1.1, 2.0, 2.1, 3.0, 3.1),
                f64x8(4.0, 4.1, 5.0, 5.1, 6.0, 6.1, 7.0, 7.1),
            ];
            let planes = simd.deinterleave_shfl_f64s(interleaved);
            assert_eq!(planes[1], simd.add_f64x8(planes[0], simd.splat_f64x8(0.1)));
            assert_eq!(interleaved, simd.interleave_shfl_f64s(planes));
        }
        {
            let interleaved = [
                f64x8(0.0, 0.1, 0.2, 1.0, 1.1, 1.2, 2.0, 2.1),
                f64x8(2.2, 3.0, 3.1, 3.2, 4.0, 4.1, 4.2, 5.0),
                f64x8(5.1, 5.2, 6.0, 6.1, 6.2, 7.0, 7.1, 7.2),
            ];
            let planes = simd.deinterleave_shfl_f64s(interleaved);
            assert_eq!(planes[1], simd.add_f64x8(planes[0], simd.splat_f64x8(0.1)));
            assert_eq!(planes[2], simd.add_f64x8(planes[0], simd.splat_f64x8(0.2)));
            assert_eq!(interleaved, simd.interleave_shfl_f64s(planes));
        }
        {
            let interleaved = [
                f64x8(0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3),
                f64x8(2.0, 2.1, 2.2, 2.3, 3.0, 3.1, 3.2, 3.3),
                f64x8(4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3),
                f64x8(6.0, 6.1, 6.2, 6.3, 7.0, 7.1, 7.2, 7.3),
            ];
            let planes = simd.deinterleave_shfl_f64s(interleaved);
            assert_eq!(planes[1], simd.add_f64x8(planes[0], simd.splat_f64x8(0.1)));
            assert_eq!(planes[2], simd.add_f64x8(planes[0], simd.splat_f64x8(0.2)));
            assert_eq!(planes[3], simd.add_f64x8(planes[0], simd.splat_f64x8(0.3)));
            assert_eq!(interleaved, simd.interleave_shfl_f64s(planes));
        }
        {
            let interleaved = [
                f32x16(
                    0.0, 0.1, 1.0, 1.1, 2.0, 2.1, 3.0, 3.1, 4.0, 4.1, 5.0, 5.1, 6.0, 6.1, 7.0,
                    7.1,
                ),
                f32x16(
                    8.0, 8.1, 9.0, 9.1, 10.0, 10.1, 11.0, 11.1, 12.0, 12.1, 13.0, 13.1, 14.0,
                    14.1, 15.0, 15.1,
                ),
            ];
            let planes = simd.deinterleave_shfl_f32s(interleaved);
            assert_eq!(planes[1], simd.add_f32x16(planes[0], simd.splat_f32x16(0.1)));
            assert_eq!(interleaved, simd.interleave_shfl_f32s(planes));
        }
        {
            let interleaved = [
                f32x16(
                    0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3, 2.0, 2.1, 2.2, 2.3, 3.0, 3.1, 3.2,
                    3.3,
                ),
                f32x16(
                    4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3, 6.0, 6.1, 6.2, 6.3, 7.0, 7.1, 7.2,
                    7.3,
                ),
                f32x16(
                    8.0, 8.1, 8.2, 8.3, 9.0, 9.1, 9.2, 9.3, 10.0, 10.1, 10.2, 10.3, 11.0, 11.1,
                    11.2, 11.3,
                ),
                f32x16(
                    12.0, 12.1, 12.2, 12.3, 13.0, 13.1, 13.2, 13.3, 14.0, 14.1, 14.2, 14.3,
                    15.0, 15.1, 15.2, 15.3,
                ),
            ];
            let planes = simd.deinterleave_shfl_f32s(interleaved);
            assert_eq!(planes[1], simd.add_f32x16(planes[0], simd.splat_f32x16(0.1)));
            assert_eq!(planes[2], simd.add_f32x16(planes[0], simd.splat_f32x16(0.2)));
            assert_eq!(planes[3], simd.add_f32x16(planes[0], simd.splat_f32x16(0.3)));
            assert_eq!(interleaved, simd.interleave_shfl_f32s(planes));
        }
    }
}
}