#![allow(
non_camel_case_types,
clippy::zero_prefixed_literal,
clippy::identity_op,
clippy::too_many_arguments
)]
#![cfg_attr(feature = "nightly", feature(stdsimd), feature(avx512_target_feature))]
#![cfg_attr(not(feature = "std"), no_std)]
#![cfg_attr(docsrs, feature(doc_cfg))]
use bytemuck::{AnyBitPattern, NoUninit, Pod, Zeroable};
use core::fmt::Debug;
use core::marker::PhantomData;
use core::slice::{from_raw_parts, from_raw_parts_mut};
use num_complex::Complex;
use seal::Seal;
pub type c32 = Complex<f32>;
pub type c64 = Complex<f64>;
mod seal {
pub trait Seal {}
}
pub trait NullaryFnOnce {
type Output;
fn call(self) -> Self::Output;
}
impl<R, F: FnOnce() -> R> NullaryFnOnce for F {
type Output = R;
#[inline(always)]
fn call(self) -> Self::Output {
self()
}
}
pub trait WithSimd {
type Output;
fn with_simd<S: Simd>(self, simd: S) -> Self::Output;
}
impl<F: NullaryFnOnce> WithSimd for F {
type Output = F::Output;
#[inline(always)]
fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
let _simd = &simd;
self.call()
}
}
pub trait Simd: Seal + Debug + Copy + Send + Sync + 'static {
type m32s: Debug + Copy + Send + Sync + 'static;
type f32s: Debug + Copy + Send + Sync + Pod + 'static;
type c32s: Debug + Copy + Send + Sync + Pod + 'static;
type i32s: Debug + Copy + Send + Sync + Pod + 'static;
type u32s: Debug + Copy + Send + Sync + Pod + 'static;
type m64s: Debug + Copy + Send + Sync + 'static;
type f64s: Debug + Copy + Send + Sync + Pod + 'static;
type c64s: Debug + Copy + Send + Sync + Pod + 'static;
type i64s: Debug + Copy + Send + Sync + Pod + 'static;
type u64s: Debug + Copy + Send + Sync + Pod + 'static;
fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output;
#[inline(always)]
fn f32s_as_simd(slice: &[f32]) -> (&[Self::f32s], &[f32]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn f32s_as_mut_simd(slice: &mut [f32]) -> (&mut [Self::f32s], &mut [f32]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn c32s_as_simd(slice: &[c32]) -> (&[Self::c32s], &[c32]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn c32s_as_mut_simd(slice: &mut [c32]) -> (&mut [Self::c32s], &mut [c32]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn i32s_as_simd(slice: &[i32]) -> (&[Self::i32s], &[i32]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn i32s_as_mut_simd(slice: &mut [i32]) -> (&mut [Self::i32s], &mut [i32]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn u32s_as_simd(slice: &[u32]) -> (&[Self::u32s], &[u32]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn u32s_as_mut_simd(slice: &mut [u32]) -> (&mut [Self::u32s], &mut [u32]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn f64s_as_simd(slice: &[f64]) -> (&[Self::f64s], &[f64]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn f64s_as_mut_simd(slice: &mut [f64]) -> (&mut [Self::f64s], &mut [f64]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn c64s_as_simd(slice: &[c64]) -> (&[Self::c64s], &[c64]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn c64s_as_mut_simd(slice: &mut [c64]) -> (&mut [Self::c64s], &mut [c64]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn i64s_as_simd(slice: &[i64]) -> (&[Self::i64s], &[i64]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn i64s_as_mut_simd(slice: &mut [i64]) -> (&mut [Self::i64s], &mut [i64]) {
unsafe { split_mut_slice(slice) }
}
#[inline(always)]
fn u64s_as_simd(slice: &[u64]) -> (&[Self::u64s], &[u64]) {
unsafe { split_slice(slice) }
}
#[inline(always)]
fn u64s_as_mut_simd(slice: &mut [u64]) -> (&mut [Self::u64s], &mut [u64]) {
unsafe { split_mut_slice(slice) }
}
fn u32s_partial_load(self, slice: &[u32]) -> Self::u32s;
fn u32s_partial_store(self, slice: &mut [u32], values: Self::u32s);
fn u64s_partial_load(self, slice: &[u64]) -> Self::u64s;
fn u64s_partial_store(self, slice: &mut [u64], values: Self::u64s);
#[inline(always)]
fn i32s_partial_load(self, slice: &[i32]) -> Self::i32s {
cast(self.u32s_partial_load(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn i32s_partial_store(self, slice: &mut [i32], values: Self::i32s) {
self.u32s_partial_store(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn i64s_partial_load(self, slice: &[i64]) -> Self::i64s {
cast(self.u64s_partial_load(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn i64s_partial_store(self, slice: &mut [i64], values: Self::i64s) {
self.u64s_partial_store(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn f32s_partial_load(self, slice: &[f32]) -> Self::f32s {
cast(self.u32s_partial_load(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn f32s_partial_store(self, slice: &mut [f32], values: Self::f32s) {
self.u32s_partial_store(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn f64s_partial_load(self, slice: &[f64]) -> Self::f64s {
cast(self.u64s_partial_load(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn f64s_partial_store(self, slice: &mut [f64], values: Self::f64s) {
self.u64s_partial_store(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn c32s_partial_load(self, slice: &[c32]) -> Self::c32s {
cast(self.f64s_partial_load(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn c32s_partial_store(self, slice: &mut [c32], values: Self::c32s) {
self.f64s_partial_store(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn c64s_partial_load(self, slice: &[c64]) -> Self::c64s {
cast(self.f64s_partial_load(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn c64s_partial_store(self, slice: &mut [c64], values: Self::c64s) {
self.f64s_partial_store(bytemuck::cast_slice_mut(slice), cast(values))
}
fn u32s_partial_load_last(self, slice: &[u32]) -> Self::u32s;
fn u32s_partial_store_last(self, slice: &mut [u32], values: Self::u32s);
fn u64s_partial_load_last(self, slice: &[u64]) -> Self::u64s;
fn u64s_partial_store_last(self, slice: &mut [u64], values: Self::u64s);
#[inline(always)]
fn i32s_partial_load_last(self, slice: &[i32]) -> Self::i32s {
cast(self.u32s_partial_load_last(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn i32s_partial_store_last(self, slice: &mut [i32], values: Self::i32s) {
self.u32s_partial_store_last(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn i64s_partial_load_last(self, slice: &[i64]) -> Self::i64s {
cast(self.u64s_partial_load_last(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn i64s_partial_store_last(self, slice: &mut [i64], values: Self::i64s) {
self.u64s_partial_store_last(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn f32s_partial_load_last(self, slice: &[f32]) -> Self::f32s {
cast(self.u32s_partial_load_last(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn f32s_partial_store_last(self, slice: &mut [f32], values: Self::f32s) {
self.u32s_partial_store_last(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn f64s_partial_load_last(self, slice: &[f64]) -> Self::f64s {
cast(self.u64s_partial_load_last(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn f64s_partial_store_last(self, slice: &mut [f64], values: Self::f64s) {
self.u64s_partial_store_last(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn c32s_partial_load_last(self, slice: &[c32]) -> Self::c32s {
cast(self.f64s_partial_load_last(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn c32s_partial_store_last(self, slice: &mut [c32], values: Self::c32s) {
self.f64s_partial_store_last(bytemuck::cast_slice_mut(slice), cast(values))
}
#[inline(always)]
fn c64s_partial_load_last(self, slice: &[c64]) -> Self::c64s {
cast(self.f64s_partial_load_last(bytemuck::cast_slice(slice)))
}
#[inline(always)]
fn c64s_partial_store_last(self, slice: &mut [c64], values: Self::c64s) {
self.f64s_partial_store_last(bytemuck::cast_slice_mut(slice), cast(values))
}
fn m32s_not(self, a: Self::m32s) -> Self::m32s;
fn m32s_and(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
fn m32s_or(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
fn m32s_xor(self, a: Self::m32s, b: Self::m32s) -> Self::m32s;
fn m64s_not(self, a: Self::m64s) -> Self::m64s;
fn m64s_and(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
fn m64s_or(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
fn m64s_xor(self, a: Self::m64s, b: Self::m64s) -> Self::m64s;
fn u32s_not(self, a: Self::u32s) -> Self::u32s;
fn u32s_and(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
fn u32s_or(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
fn u32s_xor(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
fn u64s_not(self, a: Self::u64s) -> Self::u64s;
fn u64s_and(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
fn u64s_or(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
fn u64s_xor(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
fn m32s_select_u32s(
self,
mask: Self::m32s,
if_true: Self::u32s,
if_false: Self::u32s,
) -> Self::u32s;
fn m64s_select_u64s(
self,
mask: Self::m64s,
if_true: Self::u64s,
if_false: Self::u64s,
) -> Self::u64s;
#[inline]
fn i32s_not(self, a: Self::i32s) -> Self::i32s {
self.u32s_transmute_i32s(self.u32s_not(self.i32s_transmute_u32s(a)))
}
#[inline]
fn i32s_and(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
self.u32s_transmute_i32s(
self.u32s_and(self.i32s_transmute_u32s(a), self.i32s_transmute_u32s(b)),
)
}
#[inline]
fn i32s_or(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
self.u32s_transmute_i32s(
self.u32s_or(self.i32s_transmute_u32s(a), self.i32s_transmute_u32s(b)),
)
}
#[inline]
fn i32s_xor(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
self.u32s_transmute_i32s(
self.u32s_xor(self.i32s_transmute_u32s(a), self.i32s_transmute_u32s(b)),
)
}
#[inline]
fn i64s_not(self, a: Self::i64s) -> Self::i64s {
self.u64s_transmute_i64s(self.u64s_not(self.i64s_transmute_u64s(a)))
}
#[inline]
fn i64s_and(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
self.u64s_transmute_i64s(
self.u64s_and(self.i64s_transmute_u64s(a), self.i64s_transmute_u64s(b)),
)
}
#[inline]
fn i64s_or(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
self.u64s_transmute_i64s(
self.u64s_or(self.i64s_transmute_u64s(a), self.i64s_transmute_u64s(b)),
)
}
#[inline]
fn i64s_xor(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
self.u64s_transmute_i64s(
self.u64s_xor(self.i64s_transmute_u64s(a), self.i64s_transmute_u64s(b)),
)
}
#[inline]
fn f32s_not(self, a: Self::f32s) -> Self::f32s {
self.u32s_transmute_f32s(self.u32s_not(self.f32s_transmute_u32s(a)))
}
#[inline]
fn f32s_and(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
self.u32s_transmute_f32s(
self.u32s_and(self.f32s_transmute_u32s(a), self.f32s_transmute_u32s(b)),
)
}
#[inline]
fn f32s_or(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
self.u32s_transmute_f32s(
self.u32s_or(self.f32s_transmute_u32s(a), self.f32s_transmute_u32s(b)),
)
}
#[inline]
fn f32s_xor(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
self.u32s_transmute_f32s(
self.u32s_xor(self.f32s_transmute_u32s(a), self.f32s_transmute_u32s(b)),
)
}
#[inline]
fn f64s_not(self, a: Self::f64s) -> Self::f64s {
self.u64s_transmute_f64s(self.u64s_not(self.f64s_transmute_u64s(a)))
}
#[inline]
fn f64s_and(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
self.u64s_transmute_f64s(
self.u64s_and(self.f64s_transmute_u64s(a), self.f64s_transmute_u64s(b)),
)
}
#[inline]
fn f64s_or(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
self.u64s_transmute_f64s(
self.u64s_or(self.f64s_transmute_u64s(a), self.f64s_transmute_u64s(b)),
)
}
#[inline]
fn f64s_xor(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
self.u64s_transmute_f64s(
self.u64s_xor(self.f64s_transmute_u64s(a), self.f64s_transmute_u64s(b)),
)
}
#[inline]
fn m32s_select_i32s(
self,
mask: Self::m32s,
if_true: Self::i32s,
if_false: Self::i32s,
) -> Self::i32s {
self.u32s_transmute_i32s(self.m32s_select_u32s(
mask,
self.i32s_transmute_u32s(if_true),
self.i32s_transmute_u32s(if_false),
))
}
#[inline]
fn m32s_select_f32s(
self,
mask: Self::m32s,
if_true: Self::f32s,
if_false: Self::f32s,
) -> Self::f32s {
self.u32s_transmute_f32s(self.m32s_select_u32s(
mask,
self.f32s_transmute_u32s(if_true),
self.f32s_transmute_u32s(if_false),
))
}
#[inline]
fn m64s_select_i64s(
self,
mask: Self::m64s,
if_true: Self::i64s,
if_false: Self::i64s,
) -> Self::i64s {
self.u64s_transmute_i64s(self.m64s_select_u64s(
mask,
self.i64s_transmute_u64s(if_true),
self.i64s_transmute_u64s(if_false),
))
}
#[inline]
fn m64s_select_f64s(
self,
mask: Self::m64s,
if_true: Self::f64s,
if_false: Self::f64s,
) -> Self::f64s {
self.u64s_transmute_f64s(self.m64s_select_u64s(
mask,
self.f64s_transmute_u64s(if_true),
self.f64s_transmute_u64s(if_false),
))
}
fn u32s_splat(self, value: u32) -> Self::u32s;
fn u32s_add(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
fn u32s_sub(self, a: Self::u32s, b: Self::u32s) -> Self::u32s;
fn u32s_less_than(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
fn u32s_greater_than(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
fn u32s_less_than_or_equal(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
fn u32s_greater_than_or_equal(self, a: Self::u32s, b: Self::u32s) -> Self::m32s;
fn u32s_wrapping_dyn_shl(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
fn u32s_wrapping_dyn_shr(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s;
fn u32s_widening_mul(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s);
fn u64s_splat(self, value: u64) -> Self::u64s;
fn u64s_add(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
fn u64s_sub(self, a: Self::u64s, b: Self::u64s) -> Self::u64s;
#[inline]
fn i32s_splat(self, value: i32) -> Self::i32s {
self.u32s_transmute_i32s(self.u32s_splat(value as u32))
}
#[inline]
fn i32s_add(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
self.u32s_transmute_i32s(
self.u32s_add(self.i32s_transmute_u32s(a), self.i32s_transmute_u32s(b)),
)
}
#[inline]
fn i32s_sub(self, a: Self::i32s, b: Self::i32s) -> Self::i32s {
self.u32s_transmute_i32s(
self.u32s_sub(self.i32s_transmute_u32s(a), self.i32s_transmute_u32s(b)),
)
}
#[inline]
fn i64s_splat(self, value: i64) -> Self::i64s {
self.u64s_transmute_i64s(self.u64s_splat(value as u64))
}
#[inline]
fn i64s_add(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
self.u64s_transmute_i64s(
self.u64s_add(self.i64s_transmute_u64s(a), self.i64s_transmute_u64s(b)),
)
}
#[inline]
fn i64s_sub(self, a: Self::i64s, b: Self::i64s) -> Self::i64s {
self.u64s_transmute_i64s(
self.u64s_sub(self.i64s_transmute_u64s(a), self.i64s_transmute_u64s(b)),
)
}
fn f32s_splat(self, value: f32) -> Self::f32s;
#[inline]
fn f32s_abs(self, a: Self::f32s) -> Self::f32s {
self.f32s_and(self.f32s_not(self.f32s_splat(-0.0)), a)
}
#[inline]
fn f32s_neg(self, a: Self::f32s) -> Self::f32s {
self.f32s_xor(self.f32s_splat(-0.0), a)
}
fn f32s_add(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
fn f32s_sub(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
fn f32s_mul(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
fn f32s_div(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
#[inline]
fn f32s_mul_adde(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
self.f32s_add(self.f32s_mul(a, b), c)
}
#[inline]
fn f32_scalar_mul_adde(self, a: f32, b: f32, c: f32) -> f32 {
a * b + c
}
fn f32s_mul_add(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s;
fn f32s_equal(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
fn f32s_less_than(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
fn f32s_less_than_or_equal(self, a: Self::f32s, b: Self::f32s) -> Self::m32s;
#[inline]
fn f32s_greater_than(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
self.f32s_less_than(b, a)
}
#[inline]
fn f32s_greater_than_or_equal(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
self.f32s_less_than_or_equal(b, a)
}
fn f32s_min(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
fn f32s_max(self, a: Self::f32s, b: Self::f32s) -> Self::f32s;
fn f32s_reduce_sum(self, a: Self::f32s) -> f32;
fn f32s_reduce_product(self, a: Self::f32s) -> f32;
fn f32s_reduce_min(self, a: Self::f32s) -> f32;
fn f32s_reduce_max(self, a: Self::f32s) -> f32;
fn c32s_splat(self, value: c32) -> Self::c32s;
fn c32s_conj(self, a: Self::c32s) -> Self::c32s;
fn c32s_neg(self, a: Self::c32s) -> Self::c32s;
fn c32s_add(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
fn c32s_sub(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
fn c32s_mul(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
fn c32_scalar_mul(self, a: c32, b: c32) -> c32 {
a * b
}
fn c32s_conj_mul(self, a: Self::c32s, b: Self::c32s) -> Self::c32s;
fn c32_scalar_conj_mul(self, a: c32, b: c32) -> c32 {
a.conj() * b
}
fn c32s_mul_adde(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
fn c32_scalar_mul_adde(self, a: c32, b: c32, c: c32) -> c32 {
a * b + c
}
fn c32s_conj_mul_adde(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s;
fn c32_scalar_conj_mul_adde(self, a: c32, b: c32, c: c32) -> c32 {
a.conj() * b + c
}
fn c32s_abs2(self, a: Self::c32s) -> Self::c32s;
fn c32s_reduce_sum(self, a: Self::c32s) -> c32;
fn f64s_splat(self, value: f64) -> Self::f64s;
#[inline]
fn f64s_abs(self, a: Self::f64s) -> Self::f64s {
self.f64s_and(self.f64s_not(self.f64s_splat(-0.0)), a)
}
#[inline]
fn f64s_neg(self, a: Self::f64s) -> Self::f64s {
self.f64s_xor(a, self.f64s_splat(-0.0))
}
fn f64s_add(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
fn f64s_sub(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
fn f64s_mul(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
fn f64s_div(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
#[inline]
fn f64s_mul_adde(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
self.f64s_add(self.f64s_mul(a, b), c)
}
#[inline]
fn f64_scalar_mul_adde(self, a: f64, b: f64, c: f64) -> f64 {
a * b + c
}
fn f64s_mul_add(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s;
fn f64s_equal(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
fn f64s_less_than(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
fn f64s_less_than_or_equal(self, a: Self::f64s, b: Self::f64s) -> Self::m64s;
#[inline]
fn f64s_greater_than(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
self.f64s_less_than(b, a)
}
#[inline]
fn f64s_greater_than_or_equal(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
self.f64s_less_than_or_equal(b, a)
}
fn f64s_min(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
fn f64s_max(self, a: Self::f64s, b: Self::f64s) -> Self::f64s;
fn f64s_reduce_sum(self, a: Self::f64s) -> f64;
fn f64s_reduce_product(self, a: Self::f64s) -> f64;
fn f64s_reduce_min(self, a: Self::f64s) -> f64;
fn f64s_reduce_max(self, a: Self::f64s) -> f64;
fn c64s_splat(self, value: c64) -> Self::c64s;
fn c64s_conj(self, a: Self::c64s) -> Self::c64s;
fn c64s_neg(self, a: Self::c64s) -> Self::c64s;
fn c64s_add(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
fn c64s_sub(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
fn c64s_mul(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
fn c64_scalar_mul(self, a: c64, b: c64) -> c64 {
a * b
}
fn c64s_conj_mul(self, a: Self::c64s, b: Self::c64s) -> Self::c64s;
fn c64_scalar_conj_mul(self, a: c64, b: c64) -> c64 {
a.conj() * b
}
fn c64s_mul_adde(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
fn c64_scalar_mul_adde(self, a: c64, b: c64, c: c64) -> c64 {
a * b + c
}
fn c64s_conj_mul_adde(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s;
fn c64_scalar_conj_mul_adde(self, a: c64, b: c64, c: c64) -> c64 {
a.conj() * b + c
}
fn c64s_abs2(self, a: Self::c64s) -> Self::c64s;
fn c64s_reduce_sum(self, a: Self::c64s) -> c64;
#[inline]
fn f32s_transmute_i32s(self, a: Self::f32s) -> Self::i32s {
cast(a)
}
#[inline]
fn f32s_transmute_u32s(self, a: Self::f32s) -> Self::u32s {
cast(a)
}
#[inline]
fn i32s_transmute_f32s(self, a: Self::i32s) -> Self::f32s {
cast(a)
}
#[inline]
fn i32s_transmute_u32s(self, a: Self::i32s) -> Self::u32s {
cast(a)
}
#[inline]
fn u32s_transmute_f32s(self, a: Self::u32s) -> Self::f32s {
cast(a)
}
#[inline]
fn u32s_transmute_i32s(self, a: Self::u32s) -> Self::i32s {
cast(a)
}
#[inline]
fn f64s_transmute_i64s(self, a: Self::f64s) -> Self::i64s {
cast(a)
}
#[inline]
fn f64s_transmute_u64s(self, a: Self::f64s) -> Self::u64s {
cast(a)
}
#[inline]
fn i64s_transmute_f64s(self, a: Self::i64s) -> Self::f64s {
cast(a)
}
#[inline]
fn i64s_transmute_u64s(self, a: Self::i64s) -> Self::u64s {
cast(a)
}
#[inline]
fn u64s_transmute_f64s(self, a: Self::u64s) -> Self::f64s {
cast(a)
}
#[inline]
fn u64s_transmute_i64s(self, a: Self::u64s) -> Self::i64s {
cast(a)
}
}
#[derive(Debug, Copy, Clone)]
pub struct Scalar;
impl Default for Scalar {
#[inline]
fn default() -> Self {
Self::new()
}
}
impl Scalar {
#[inline]
pub fn new() -> Self {
Self
}
}
impl Seal for Scalar {}
impl Simd for Scalar {
#[inline]
fn vectorize<Op: WithSimd>(self, op: Op) -> Op::Output {
op.with_simd(self)
}
type m32s = bool;
type f32s = f32;
type c32s = c32;
type i32s = i32;
type u32s = u32;
type m64s = bool;
type f64s = f64;
type c64s = c64;
type i64s = i64;
type u64s = u64;
#[inline]
fn m32s_not(self, a: Self::m32s) -> Self::m32s {
!a
}
#[inline]
fn m32s_and(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
a & b
}
#[inline]
fn m32s_or(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
a | b
}
#[inline]
fn m32s_xor(self, a: Self::m32s, b: Self::m32s) -> Self::m32s {
a ^ b
}
#[inline]
fn f32s_splat(self, value: f32) -> Self::f32s {
value
}
#[inline]
fn f32s_add(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
a + b
}
#[inline]
fn f32s_sub(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
a - b
}
#[inline]
fn f32s_mul(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
a * b
}
#[inline]
fn f32s_div(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
a / b
}
#[inline]
fn f32s_equal(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
a == b
}
#[inline]
fn f32s_less_than(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
a < b
}
#[inline]
fn f32s_less_than_or_equal(self, a: Self::f32s, b: Self::f32s) -> Self::m32s {
a <= b
}
#[inline]
fn c32s_splat(self, value: c32) -> Self::c32s {
value
}
#[inline]
fn c32s_add(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
a + b
}
#[inline]
fn c32s_sub(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
a - b
}
#[inline]
fn c32s_mul(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
a * b
}
#[inline]
fn c64s_splat(self, value: c64) -> Self::c64s {
value
}
#[inline]
fn c64s_add(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
a + b
}
#[inline]
fn c64s_sub(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
a - b
}
#[inline]
fn c64s_mul(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
a * b
}
#[inline]
fn m64s_not(self, a: Self::m64s) -> Self::m64s {
!a
}
#[inline]
fn m64s_and(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
a & b
}
#[inline]
fn m64s_or(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
a | b
}
#[inline]
fn m64s_xor(self, a: Self::m64s, b: Self::m64s) -> Self::m64s {
a ^ b
}
#[inline]
fn f64s_splat(self, value: f64) -> Self::f64s {
value
}
#[inline]
fn f64s_add(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
a + b
}
#[inline]
fn f64s_sub(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
a - b
}
#[inline]
fn f64s_mul(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
a * b
}
#[inline]
fn f64s_div(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
a / b
}
#[inline]
fn f64s_equal(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
a == b
}
#[inline]
fn f64s_less_than(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
a < b
}
#[inline]
fn f64s_less_than_or_equal(self, a: Self::f64s, b: Self::f64s) -> Self::m64s {
a <= b
}
#[inline]
fn u32s_not(self, a: Self::u32s) -> Self::u32s {
!a
}
#[inline]
fn u32s_and(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
a & b
}
#[inline]
fn u32s_or(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
a | b
}
#[inline]
fn u32s_xor(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
a ^ b
}
#[inline]
fn u64s_not(self, a: Self::u64s) -> Self::u64s {
!a
}
#[inline]
fn u64s_and(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
a & b
}
#[inline]
fn u64s_or(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
a | b
}
#[inline]
fn u64s_xor(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
a ^ b
}
#[inline]
fn m32s_select_u32s(
self,
mask: Self::m32s,
if_true: Self::u32s,
if_false: Self::u32s,
) -> Self::u32s {
if mask {
if_true
} else {
if_false
}
}
#[inline]
fn m64s_select_u64s(
self,
mask: Self::m64s,
if_true: Self::u64s,
if_false: Self::u64s,
) -> Self::u64s {
if mask {
if_true
} else {
if_false
}
}
#[inline]
fn f32s_min(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
a.min(b)
}
#[inline]
fn f32s_max(self, a: Self::f32s, b: Self::f32s) -> Self::f32s {
a.max(b)
}
#[inline]
fn f64s_min(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
a.min(b)
}
#[inline]
fn f64s_max(self, a: Self::f64s, b: Self::f64s) -> Self::f64s {
a.max(b)
}
#[inline]
fn u32s_add(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
a.wrapping_add(b)
}
#[inline]
fn u32s_sub(self, a: Self::u32s, b: Self::u32s) -> Self::u32s {
a.wrapping_sub(b)
}
#[inline]
fn u64s_add(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
a.wrapping_add(b)
}
#[inline]
fn u64s_sub(self, a: Self::u64s, b: Self::u64s) -> Self::u64s {
a.wrapping_sub(b)
}
#[inline]
fn u32s_splat(self, value: u32) -> Self::u32s {
value
}
#[inline]
fn u64s_splat(self, value: u64) -> Self::u64s {
value
}
#[inline]
fn f32s_reduce_sum(self, a: Self::f32s) -> f32 {
a
}
#[inline]
fn f32s_reduce_product(self, a: Self::f32s) -> f32 {
a
}
#[inline]
fn f32s_reduce_min(self, a: Self::f32s) -> f32 {
a
}
#[inline]
fn f32s_reduce_max(self, a: Self::f32s) -> f32 {
a
}
#[inline]
fn f64s_reduce_sum(self, a: Self::f64s) -> f64 {
a
}
#[inline]
fn f64s_reduce_product(self, a: Self::f64s) -> f64 {
a
}
#[inline]
fn f64s_reduce_min(self, a: Self::f64s) -> f64 {
a
}
#[inline]
fn f64s_reduce_max(self, a: Self::f64s) -> f64 {
a
}
#[inline]
fn f32s_mul_add(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
f32::mul_add(a, b, c)
}
#[inline]
fn f64s_mul_add(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
f64::mul_add(a, b, c)
}
#[inline]
fn c32s_abs2(self, a: Self::c32s) -> Self::c32s {
let norm2 = a.re * a.re + a.im * a.im;
c32::new(norm2, norm2)
}
#[inline]
fn c64s_abs2(self, a: Self::c64s) -> Self::c64s {
let norm2 = a.re * a.re + a.im * a.im;
c64::new(norm2, norm2)
}
#[inline]
fn u32s_partial_load(self, slice: &[u32]) -> Self::u32s {
if let Some((head, _)) = slice.split_first() {
*head
} else {
0
}
}
#[inline]
fn u32s_partial_store(self, slice: &mut [u32], values: Self::u32s) {
if let Some((head, _)) = slice.split_first_mut() {
*head = values;
}
}
#[inline]
fn u64s_partial_load(self, slice: &[u64]) -> Self::u64s {
if let Some((head, _)) = slice.split_first() {
*head
} else {
0
}
}
#[inline]
fn u64s_partial_store(self, slice: &mut [u64], values: Self::u64s) {
if let Some((head, _)) = slice.split_first_mut() {
*head = values;
}
}
#[inline]
fn c64s_partial_load(self, slice: &[c64]) -> Self::c64s {
if let Some((head, _)) = slice.split_first() {
*head
} else {
c64 { re: 0.0, im: 0.0 }
}
}
#[inline]
fn c64s_partial_store(self, slice: &mut [c64], values: Self::c64s) {
if let Some((head, _)) = slice.split_first_mut() {
*head = values;
}
}
#[inline]
fn u32s_partial_load_last(self, slice: &[u32]) -> Self::u32s {
if let Some((head, _)) = slice.split_last() {
*head
} else {
0
}
}
#[inline]
fn u32s_partial_store_last(self, slice: &mut [u32], values: Self::u32s) {
if let Some((head, _)) = slice.split_last_mut() {
*head = values;
}
}
#[inline]
fn u64s_partial_load_last(self, slice: &[u64]) -> Self::u64s {
if let Some((head, _)) = slice.split_last() {
*head
} else {
0
}
}
#[inline]
fn u64s_partial_store_last(self, slice: &mut [u64], values: Self::u64s) {
if let Some((head, _)) = slice.split_last_mut() {
*head = values;
}
}
#[inline]
fn c64s_partial_load_last(self, slice: &[c64]) -> Self::c64s {
if let Some((head, _)) = slice.split_last() {
*head
} else {
c64 { re: 0.0, im: 0.0 }
}
}
#[inline]
fn c64s_partial_store_last(self, slice: &mut [c64], values: Self::c64s) {
if let Some((head, _)) = slice.split_last_mut() {
*head = values;
}
}
#[inline]
fn c32s_conj_mul(self, a: Self::c32s, b: Self::c32s) -> Self::c32s {
a.conj() * b
}
#[inline]
fn c32s_mul_adde(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
a * b + c
}
#[inline]
fn c32s_conj_mul_adde(self, a: Self::c32s, b: Self::c32s, c: Self::c32s) -> Self::c32s {
a.conj() * b + c
}
#[inline]
fn c64s_conj_mul(self, a: Self::c64s, b: Self::c64s) -> Self::c64s {
a.conj() * b
}
#[inline]
fn c64s_mul_adde(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
a * b + c
}
#[inline]
fn c64s_conj_mul_adde(self, a: Self::c64s, b: Self::c64s, c: Self::c64s) -> Self::c64s {
a.conj() * b + c
}
#[inline]
fn c32s_conj(self, a: Self::c32s) -> Self::c32s {
a.conj()
}
#[inline]
fn c64s_conj(self, a: Self::c64s) -> Self::c64s {
a.conj()
}
#[inline]
fn c32s_neg(self, a: Self::c32s) -> Self::c32s {
-a
}
#[inline]
fn c32s_reduce_sum(self, a: Self::c32s) -> c32 {
a
}
#[inline]
fn c64s_neg(self, a: Self::c64s) -> Self::c64s {
-a
}
#[inline]
fn c64s_reduce_sum(self, a: Self::c64s) -> c64 {
a
}
#[inline]
fn u32s_wrapping_dyn_shl(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
a.wrapping_shl(amount)
}
#[inline]
fn u32s_wrapping_dyn_shr(self, a: Self::u32s, amount: Self::u32s) -> Self::u32s {
a.wrapping_shr(amount)
}
#[inline]
fn u32s_widening_mul(self, a: Self::u32s, b: Self::u32s) -> (Self::u32s, Self::u32s) {
let c = a as u64 * b as u64;
let lo = c as u32;
let hi = (c >> 32) as u32;
(lo, hi)
}
#[inline]
fn u32s_less_than(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
a < b
}
#[inline]
fn u32s_greater_than(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
a > b
}
#[inline]
fn u32s_less_than_or_equal(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
a <= b
}
#[inline]
fn u32s_greater_than_or_equal(self, a: Self::u32s, b: Self::u32s) -> Self::m32s {
a >= b
}
}
#[inline(always)]
unsafe fn split_slice<T, U>(slice: &[T]) -> (&[U], &[T]) {
assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());
let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();
let len = slice.len();
let data = slice.as_ptr();
let div = len / chunk_size;
let rem = len % chunk_size;
(
from_raw_parts(data as *const U, div),
from_raw_parts(data.add(len - rem), rem),
)
}
#[inline(always)]
unsafe fn split_mut_slice<T, U>(slice: &mut [T]) -> (&mut [U], &mut [T]) {
assert_eq!(core::mem::size_of::<U>() % core::mem::size_of::<T>(), 0);
assert_eq!(core::mem::align_of::<U>(), core::mem::align_of::<T>());
let chunk_size = core::mem::size_of::<U>() / core::mem::size_of::<T>();
let len = slice.len();
let data = slice.as_mut_ptr();
let div = len / chunk_size;
let rem = len % chunk_size;
(
from_raw_parts_mut(data as *mut U, div),
from_raw_parts_mut(data.add(len - rem), rem),
)
}
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
enum ArchInner {
Scalar(crate::Scalar),
}
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
impl ArchInner {
#[inline]
pub fn new() -> Self {
Self::Scalar(crate::Scalar::new())
}
#[inline(always)]
pub fn dispatch<Op: WithSimd>(self, op: Op) -> Op::Output {
match self {
ArchInner::Scalar(simd) => simd.vectorize(op),
}
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use x86::ArchInner;
impl Arch {
#[inline]
pub fn new() -> Self {
Self {
inner: ArchInner::new(),
}
}
#[inline(always)]
pub fn dispatch<Op: WithSimd>(self, op: Op) -> Op::Output {
self.inner.dispatch(op)
}
}
impl Default for Arch {
#[inline]
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Copy)]
pub struct Arch {
inner: ArchInner,
}
#[doc(hidden)]
pub struct CheckSameSize<T, U>(PhantomData<(T, U)>);
impl<T, U> CheckSameSize<T, U> {
pub const VALID: () = {
assert!(core::mem::size_of::<T>() == core::mem::size_of::<U>());
};
}
#[doc(hidden)]
pub struct CheckSizeLessThanOrEqual<T, U>(PhantomData<(T, U)>);
impl<T, U> CheckSizeLessThanOrEqual<T, U> {
pub const VALID: () = {
assert!(core::mem::size_of::<T>() <= core::mem::size_of::<U>());
};
}
#[macro_export]
macro_rules! static_assert_same_size {
($t: ty, $u: ty) => {
let _ = $crate::CheckSameSize::<$t, $u>::VALID;
};
}
#[macro_export]
macro_rules! static_assert_size_less_than_or_equal {
($t: ty, $u: ty) => {
let _ = $crate::CheckSizeLessThanOrEqual::<$t, $u>::VALID;
};
}
#[inline(always)]
pub fn cast<T: NoUninit, U: AnyBitPattern>(value: T) -> U {
static_assert_same_size!(T, U);
let value = core::mem::ManuallyDrop::new(value);
let ptr = &value as *const core::mem::ManuallyDrop<T> as *const U;
unsafe { ptr.read_unaligned() }
}
#[inline(always)]
pub fn cast_lossy<T: NoUninit, U: AnyBitPattern>(value: T) -> U {
static_assert_size_less_than_or_equal!(U, T);
let value = core::mem::ManuallyDrop::new(value);
let ptr = &value as *const core::mem::ManuallyDrop<T> as *const U;
unsafe { ptr.read_unaligned() }
}
#[inline(always)]
pub fn as_arrays<const N: usize, T>(slice: &[T]) -> (&[[T; N]], &[T]) {
let n = slice.len();
let mid_div_n = n / N;
let mid = mid_div_n * N;
let ptr = slice.as_ptr();
unsafe {
(
from_raw_parts(ptr as *const [T; N], mid_div_n),
from_raw_parts(ptr.add(mid), n - mid),
)
}
}
#[inline(always)]
pub fn as_arrays_mut<const N: usize, T>(slice: &mut [T]) -> (&mut [[T; N]], &mut [T]) {
let n = slice.len();
let mid_div_n = n / N;
let mid = mid_div_n * N;
let ptr = slice.as_mut_ptr();
unsafe {
(
from_raw_parts_mut(ptr as *mut [T; N], mid_div_n),
from_raw_parts_mut(ptr.add(mid), n - mid),
)
}
}
#[doc(hidden)]
pub mod core_arch;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg_attr(docsrs, doc(cfg(any(target_arch = "x86", target_arch = "x86_64"))))]
pub mod x86;