use crate::Transformable;
use pulp::cast;
use std::marker::PhantomData;
use std::sync::LazyLock;
/// Element types whose SIMD lane count can be queried for a given SIMD token type.
pub trait Alignable {
    /// Returns the lane count of `Self` for the SIMD token type `S`.
    fn simd_lanes<S: pulp::Simd>(_: S) -> usize;

    /// Returns the lane count of `Self` for whichever token `crate::simd::ARCH`
    /// dispatches to.
    fn lanes() -> usize {
        // Operation that carries nothing but the element type, so it can travel through
        // `dispatch_wvlt` and answer with `T::simd_lanes` for the chosen token.
        struct LaneQuery<T: ?Sized>(PhantomData<T>);

        impl<T: Alignable + ?Sized> WithSimd for LaneQuery<T> {
            type Output = usize;

            #[inline(always)]
            fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
                <T as Alignable>::simd_lanes(simd)
            }
        }

        let query = LaneQuery(PhantomData::<Self>);
        crate::simd::ARCH.dispatch_wvlt(query)
    }
}
/// Implements `Alignable` for the element type `$elem`.
///
/// The generated `simd_lanes` ignores the token value and returns the associated
/// constant `$lanes` of the token type.
macro_rules! impl_alignable {
    ($elem:ty, $lanes:tt) => {
        impl Alignable for $elem {
            fn simd_lanes<S: pulp::Simd>(_: S) -> usize {
                S::$lanes
            }
        }
    };
}
// `Alignable` implementations for the supported integer, real and complex element
// types; each line names the `pulp::Simd` associated constant that `simd_lanes`
// returns for that type.
impl_alignable!(i8, I8_LANES);
impl_alignable!(i16, I16_LANES);
impl_alignable!(i32, I32_LANES);
impl_alignable!(i64, I64_LANES);
impl_alignable!(f32, F32_LANES);
impl_alignable!(f64, F64_LANES);
impl_alignable!(num_complex::Complex32, C32_LANES);
impl_alignable!(num_complex::Complex64, C64_LANES);
/// Crate-specific extension of `pulp::Simd`.
///
/// Adds a negated multiply-add, arithmetic between complex vectors and real vectors,
/// and the `vectorize_wvlt` entry point used to run a `WithSimd` operation.
///
/// The complex/real defaults reinterpret complex vectors as real vectors with `cast`;
/// implementors whose complex and real vector types cannot be cast into each other
/// need to override them.
pub trait Simd: pulp::Simd {
    /// Negates `a` and passes it, together with `b` and `c`, to `mul_add_f32s`.
    #[inline(always)]
    fn neg_mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
        self.mul_add_f32s(self.neg_f32s(a), b, c)
    }

    /// Negates `a` and passes it, together with `b` and `c`, to `mul_add_f64s`.
    #[inline(always)]
    fn neg_mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
        self.mul_add_f64s(self.neg_f64s(a), b, c)
    }

    /// Complex/real form of `neg_mul_add_f32s`: `a` and `c` are cast to `f32s`,
    /// combined with `b`, and the result is cast back to `c32s`.
    #[inline(always)]
    fn neg_mul_add_c32s_f32s(self, a: Self::c32s, b: Self::f32s, c: Self::c32s) -> Self::c32s {
        let a_flat: Self::f32s = cast(a);
        let c_flat: Self::f32s = cast(c);
        cast(self.neg_mul_add_f32s(a_flat, b, c_flat))
    }

    /// Complex/real form of `neg_mul_add_f64s`: `a` and `c` are cast to `f64s`,
    /// combined with `b`, and the result is cast back to `c64s`.
    #[inline(always)]
    fn neg_mul_add_c64s_f64s(self, a: Self::c64s, b: Self::f64s, c: Self::c64s) -> Self::c64s {
        let a_flat: Self::f64s = cast(a);
        let c_flat: Self::f64s = cast(c);
        cast(self.neg_mul_add_f64s(a_flat, b, c_flat))
    }

    /// Complex/real multiply-add: `a` and `c` are cast to `f32s`, passed with `b` to
    /// `mul_add_f32s`, and the result is cast back to `c32s`.
    #[inline(always)]
    fn mul_add_c32s_f32s(self, a: Self::c32s, b: Self::f32s, c: Self::c32s) -> Self::c32s {
        let a_flat: Self::f32s = cast(a);
        let c_flat: Self::f32s = cast(c);
        cast(self.mul_add_f32s(a_flat, b, c_flat))
    }

    /// Complex/real multiply-add: `a` and `c` are cast to `f64s`, passed with `b` to
    /// `mul_add_f64s`, and the result is cast back to `c64s`.
    #[inline(always)]
    fn mul_add_c64s_f64s(self, a: Self::c64s, b: Self::f64s, c: Self::c64s) -> Self::c64s {
        let a_flat: Self::f64s = cast(a);
        let c_flat: Self::f64s = cast(c);
        cast(self.mul_add_f64s(a_flat, b, c_flat))
    }

    /// Multiplies the complex vector `a`, viewed as `f32s`, by the real vector `b`.
    #[inline(always)]
    fn mul_c32s_f32s(self, a: Self::c32s, b: Self::f32s) -> Self::c32s {
        let a_flat: Self::f32s = cast(a);
        cast(self.mul_f32s(a_flat, b))
    }

    /// Multiplies the complex vector `a`, viewed as `f64s`, by the real vector `b`.
    #[inline(always)]
    fn mul_c64s_f64s(self, a: Self::c64s, b: Self::f64s) -> Self::c64s {
        let a_flat: Self::f64s = cast(a);
        cast(self.mul_f64s(a_flat, b))
    }

    /// Divides the complex vector `a`, viewed as `f32s`, by the real vector `b`.
    #[inline(always)]
    fn div_c32s_f32s(self, a: Self::c32s, b: Self::f32s) -> Self::c32s {
        let a_flat: Self::f32s = cast(a);
        cast(self.div_f32s(a_flat, b))
    }

    /// Divides the complex vector `a`, viewed as `f64s`, by the real vector `b`.
    #[inline(always)]
    fn div_c64s_f64s(self, a: Self::c64s, b: Self::f64s) -> Self::c64s {
        let a_flat: Self::f64s = cast(a);
        cast(self.div_f64s(a_flat, b))
    }

    /// Runs `op` with this token and returns the operation's output.
    fn vectorize_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output;
}
/// An operation that can be run with any [`Simd`] token type.
pub trait WithSimd {
/// Value produced by running the operation.
type Output;
/// Consumes the operation and runs it with the token `simd`.
fn with_simd<S: Simd>(self, simd: S) -> Self::Output;
}
/// Scalar fallback.
///
/// The complex helpers are written out per component: for this token the complex
/// "vector" is a single value with `re`/`im` fields and the real "vector" is a single
/// float, so the cast-based defaults of the trait are replaced here.
impl Simd for pulp::Scalar {
    /// Component-wise `mul_add` of `a` by `b` with addend `c`.
    #[inline(always)]
    fn mul_add_c32s_f32s(self, a: Self::c32s, b: Self::f32s, c: Self::c32s) -> Self::c32s {
        let re = a.re.mul_add(b, c.re);
        let im = a.im.mul_add(b, c.im);
        Self::c32s { re, im }
    }

    /// Component-wise `mul_add` of `a` by `b` with addend `c`.
    #[inline(always)]
    fn mul_add_c64s_f64s(self, a: Self::c64s, b: Self::f64s, c: Self::c64s) -> Self::c64s {
        let re = a.re.mul_add(b, c.re);
        let im = a.im.mul_add(b, c.im);
        Self::c64s { re, im }
    }

    /// Component-wise `mul_add` of the negated `a` by `b` with addend `c`.
    #[inline(always)]
    fn neg_mul_add_c32s_f32s(self, a: Self::c32s, b: Self::f32s, c: Self::c32s) -> Self::c32s {
        let re = (-a.re).mul_add(b, c.re);
        let im = (-a.im).mul_add(b, c.im);
        Self::c32s { re, im }
    }

    /// Component-wise `mul_add` of the negated `a` by `b` with addend `c`.
    #[inline(always)]
    fn neg_mul_add_c64s_f64s(self, a: Self::c64s, b: Self::f64s, c: Self::c64s) -> Self::c64s {
        let re = (-a.re).mul_add(b, c.re);
        let im = (-a.im).mul_add(b, c.im);
        Self::c64s { re, im }
    }

    /// Scales the complex value `a` by the real value `b`.
    #[inline(always)]
    fn mul_c32s_f32s(self, a: Self::c32s, b: Self::f32s) -> Self::c32s {
        let scaled = a * b;
        scaled
    }

    /// Scales the complex value `a` by the real value `b`.
    #[inline(always)]
    fn mul_c64s_f64s(self, a: Self::c64s, b: Self::f64s) -> Self::c64s {
        let scaled = a * b;
        scaled
    }

    /// Divides the complex value `a` by the real value `b`.
    #[inline(always)]
    fn div_c32s_f32s(self, a: Self::c32s, b: Self::f32s) -> Self::c32s {
        let quotient = a / b;
        quotient
    }

    /// Divides the complex value `a` by the real value `b`.
    #[inline(always)]
    fn div_c64s_f64s(self, a: Self::c64s, b: Self::f64s) -> Self::c64s {
        let quotient = a / b;
        quotient
    }

    /// Runs `op` directly with the scalar token.
    #[inline]
    fn vectorize_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output {
        Op::with_simd(op, self)
    }
}
/// Forwards a [`WithSimd`] operation to a concrete [`Simd`] token.
///
/// In this file it is implemented for pulp's per-architecture `Arch` enums, which pick
/// the token by matching on their variants and fall back to `pulp::Scalar`.
pub trait Dispatch {
/// Runs `op` with the token selected by `self` and returns its output.
fn dispatch_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output;
}
/// Expands to a `vectorize_wvlt` method for the SIMD token type `$token`.
///
/// The generated method pairs the token with the operation in a small adapter that
/// implements `pulp::NullaryFnOnce`, then hands that adapter to the token's own
/// `vectorize` method.
macro_rules! impl_vectorize_wvlt {
    ($token:ty) => {
        #[inline(always)]
        fn vectorize_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output {
            // Adapter holding the operation and the token it will be run with.
            struct Thunk<F> {
                simd: $token,
                task: F,
            }

            impl<F: WithSimd> pulp::NullaryFnOnce for Thunk<F> {
                type Output = F::Output;

                #[inline(always)]
                fn call(self) -> Self::Output {
                    let Thunk { simd, task } = self;
                    task.with_simd(simd)
                }
            }

            let thunk = Thunk {
                simd: self,
                task: op,
            };
            self.vectorize(thunk)
        }
    };
}
/// x86 / x86_64 back end: `Simd` implementations for pulp's `V2`, `V3` and (behind the
/// `x86-v4` feature) `V4` tokens, plus feature-gated dispatch over `pulp::x86::Arch`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub(crate) mod x86 {
    use super::*;
    #[cfg(feature = "x86-v4")]
    use pulp::x86::V4;
    use pulp::x86::{V2, V3};

    // `V2` keeps every default helper and only supplies `vectorize_wvlt`.
    impl Simd for V2 {
        impl_vectorize_wvlt! {V2}
    }

    impl Simd for V3 {
        // Single `fnmadd` call through the token's `fma` field instead of the default
        // negate-then-multiply-add sequence.
        #[inline(always)]
        fn neg_mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
            let fused = self.fma._mm256_fnmadd_ps(cast!(a), cast!(b), cast!(c));
            cast!(fused)
        }

        #[inline(always)]
        fn neg_mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
            let fused = self.fma._mm256_fnmadd_pd(cast!(a), cast!(b), cast!(c));
            cast!(fused)
        }

        impl_vectorize_wvlt! {V3}
    }

    #[cfg(feature = "x86-v4")]
    impl Simd for V4 {
        // Same idea as `V3`, using the 512-bit `fnmadd` reached through `avx512f`.
        #[inline(always)]
        fn neg_mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
            let fused = self.avx512f._mm512_fnmadd_ps(cast!(a), cast!(b), cast!(c));
            cast!(fused)
        }

        #[inline(always)]
        fn neg_mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
            let fused = self.avx512f._mm512_fnmadd_pd(cast!(a), cast!(b), cast!(c));
            cast!(fused)
        }

        impl_vectorize_wvlt! {V4}
    }

    impl Dispatch for pulp::x86::Arch {
        /// Runs `op` with the `V4` or `V3` token when the matching crate feature is
        /// enabled and `self` holds that variant; otherwise runs it with `pulp::Scalar`.
        #[inline]
        fn dispatch_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output {
            match self {
                #[cfg(feature = "x86-v4")]
                Self::V4(token) => token.vectorize_wvlt(op),
                #[cfg(feature = "x86-v3")]
                Self::V3(token) => token.vectorize_wvlt(op),
                // Also reached for variants whose arm is compiled out above.
                _ => pulp::Scalar.vectorize_wvlt(op),
            }
        }
    }
}
/// wasm32 back end: `Simd` implementations for pulp's `Simd128` and `RelaxedSimd`
/// tokens, plus dispatch over `pulp::wasm::Arch`.
#[cfg(target_arch = "wasm32")]
pub(crate) mod wasm {
    use super::*;
    use pulp::wasm::{RelaxedSimd, Simd128};

    // Both tokens keep every default helper and only supply `vectorize_wvlt`.
    impl Simd for Simd128 {
        impl_vectorize_wvlt! {Simd128}
    }

    impl Simd for RelaxedSimd {
        impl_vectorize_wvlt! {RelaxedSimd}
    }

    impl Dispatch for pulp::wasm::Arch {
        /// Runs `op` with the token held by `self`, or with `pulp::Scalar` for any
        /// other variant.
        #[inline]
        fn dispatch_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output {
            match self {
                Self::RelaxedSimd(token) => token.vectorize_wvlt(op),
                Self::Simd128(token) => token.vectorize_wvlt(op),
                _ => pulp::Scalar.vectorize_wvlt(op),
            }
        }
    }
}
/// aarch64 back end: `Simd` implementation for pulp's `Neon` token, plus dispatch over
/// `pulp::aarch64::Arch`.
#[cfg(target_arch = "aarch64")]
pub(crate) mod aarch64 {
    use super::*;
    use core::arch::aarch64::{vfmsq_f32, vfmsq_f64};
    use pulp::aarch64::Neon;

    impl Simd for Neon {
        // Replaces the default negate-then-multiply-add with one `vfmsq_f32` call.
        // Note the argument order: `c` is passed first, then `a` and `b`.
        #[inline(always)]
        fn neg_mul_add_f32s(self, a: Self::f32s, b: Self::f32s, c: Self::f32s) -> Self::f32s {
            // SAFETY: presumably sound because holding a `Neon` token implies the NEON
            // instructions are available — TODO confirm against pulp's token contract.
            unsafe {
                let fused = vfmsq_f32(cast!(c), cast!(a), cast!(b));
                cast!(fused)
            }
        }

        #[inline(always)]
        fn neg_mul_add_f64s(self, a: Self::f64s, b: Self::f64s, c: Self::f64s) -> Self::f64s {
            // SAFETY: same assumption as in `neg_mul_add_f32s` — TODO confirm.
            unsafe {
                let fused = vfmsq_f64(cast!(c), cast!(a), cast!(b));
                cast!(fused)
            }
        }

        impl_vectorize_wvlt! {Neon}
    }

    impl Dispatch for pulp::aarch64::Arch {
        /// Runs `op` with the `Neon` token when `self` holds one, otherwise with
        /// `pulp::Scalar`.
        #[inline]
        fn dispatch_wvlt<Op: WithSimd>(self, op: Op) -> Op::Output {
            match self {
                Self::Neon(token) => token.vectorize_wvlt(op),
                _ => pulp::Scalar.vectorize_wvlt(op),
            }
        }
    }
}
/// Element types that can be processed through the crate's [`Simd`] abstraction.
///
/// Every operation takes the SIMD token first. Multiplicative operations take their
/// second operand as a `SplatVector`; the complex implementations in this file define
/// it as a real vector, while the real ones reuse `Vector`.
pub trait SimdTransformable: Sized + Transformable + Alignable {
/// Vector type used for packed `Self` values under the token type `S`.
type Vector<S: Simd>: Copy + std::fmt::Debug;
/// Vector type returned by `simd_splat` and accepted as the `b` operand below.
type SplatVector<S: Simd>: Copy + std::fmt::Debug;
/// Views `x` as a slice of vectors plus a slice of plain elements
/// (presumably the elements that do not fill a whole vector — confirm against pulp).
fn as_simd<S: Simd>(simd: S, x: &[Self]) -> (&[Self::Vector<S>], &[Self]);
/// Mutable counterpart of `as_simd`.
fn as_mut_simd<S: Simd>(simd: S, x: &mut [Self]) -> (&mut [Self::Vector<S>], &mut [Self]);
/// Builds a `SplatVector` from the scalar `v`.
fn simd_splat<S: Simd>(simd: S, v: Self::Scalar) -> Self::SplatVector<S>;
/// Multiply-add of `a`, `b` and `c`; the implementations in this file forward to the
/// token's `mul_add_*` methods.
fn simd_mul_add<S: Simd>(
simd: S,
a: Self::Vector<S>,
b: Self::SplatVector<S>,
c: Self::Vector<S>,
) -> Self::Vector<S>;
/// Negated multiply-add of `a`, `b` and `c`; the implementations in this file forward
/// to the token's `neg_mul_add_*` methods.
fn simd_negate_mul_add<S: Simd>(
simd: S,
a: Self::Vector<S>,
b: Self::SplatVector<S>,
c: Self::Vector<S>,
) -> Self::Vector<S>;
/// Adds two vectors.
fn simd_add<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S>;
/// Subtracts `b` from `a`.
fn simd_sub<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S>;
/// Multiplies the vector `a` by the splat vector `b`.
fn simd_mul<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::SplatVector<S>) -> Self::Vector<S>;
/// Divides the vector `a` by the splat vector `b`.
fn simd_div<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::SplatVector<S>) -> Self::Vector<S>;
}
/// `f32` elements use the token's `f32s` vectors; the splat vector is the same type.
/// Every operation forwards to the matching `*_f32s` function of the token type.
impl SimdTransformable for f32 {
    type Vector<S: Simd> = S::f32s;
    type SplatVector<S: Simd> = Self::Vector<S>;

    #[inline(always)]
    fn as_simd<S: Simd>(_: S, slice: &[Self]) -> (&[Self::Vector<S>], &[Self]) {
        let (vectors, rest) = S::as_simd_f32s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn as_mut_simd<S: Simd>(_: S, slice: &mut [Self]) -> (&mut [Self::Vector<S>], &mut [Self]) {
        let (vectors, rest) = S::as_mut_simd_f32s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn simd_splat<S: Simd>(simd: S, v: Self) -> Self::SplatVector<S> {
        S::splat_f32s(simd, v)
    }

    #[inline(always)]
    fn simd_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::Vector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::mul_add_f32s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_negate_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::Vector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::neg_mul_add_f32s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_add<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::add_f32s(simd, a, b)
    }

    #[inline(always)]
    fn simd_sub<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::sub_f32s(simd, a, b)
    }

    #[inline(always)]
    fn simd_mul<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::mul_f32s(simd, a, b)
    }

    #[inline(always)]
    fn simd_div<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::div_f32s(simd, a, b)
    }
}
/// `f64` elements use the token's `f64s` vectors; the splat vector is the same type.
/// Every operation forwards to the matching `*_f64s` function of the token type.
impl SimdTransformable for f64 {
    type Vector<S: Simd> = S::f64s;
    type SplatVector<S: Simd> = Self::Vector<S>;

    #[inline(always)]
    fn as_simd<S: Simd>(_: S, slice: &[Self]) -> (&[Self::Vector<S>], &[Self]) {
        let (vectors, rest) = S::as_simd_f64s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn as_mut_simd<S: Simd>(_: S, slice: &mut [Self]) -> (&mut [Self::Vector<S>], &mut [Self]) {
        let (vectors, rest) = S::as_mut_simd_f64s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn simd_splat<S: Simd>(simd: S, v: Self) -> Self::Vector<S> {
        S::splat_f64s(simd, v)
    }

    #[inline(always)]
    fn simd_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::Vector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::mul_add_f64s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_negate_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::Vector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::neg_mul_add_f64s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_add<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::add_f64s(simd, a, b)
    }

    #[inline(always)]
    fn simd_sub<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::sub_f64s(simd, a, b)
    }

    #[inline(always)]
    fn simd_mul<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::mul_f64s(simd, a, b)
    }

    #[inline(always)]
    fn simd_div<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::div_f64s(simd, a, b)
    }
}
/// `Complex32` elements use the token's `c32s` vectors, while the splat vector is the
/// real `f32s` type. Multiplicative operations therefore go through the crate's
/// complex-by-real helpers (`*_c32s_f32s`).
impl SimdTransformable for num_complex::Complex32 {
    type Vector<S: Simd> = S::c32s;
    type SplatVector<S: Simd> = S::f32s;

    #[inline(always)]
    fn as_simd<S: Simd>(_: S, slice: &[Self]) -> (&[Self::Vector<S>], &[Self]) {
        let (vectors, rest) = S::as_simd_c32s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn as_mut_simd<S: Simd>(_: S, slice: &mut [Self]) -> (&mut [Self::Vector<S>], &mut [Self]) {
        let (vectors, rest) = S::as_mut_simd_c32s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn simd_splat<S: Simd>(simd: S, v: Self::Scalar) -> Self::SplatVector<S> {
        S::splat_f32s(simd, v)
    }

    #[inline(always)]
    fn simd_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::SplatVector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::mul_add_c32s_f32s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_negate_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::SplatVector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::neg_mul_add_c32s_f32s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_add<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::add_c32s(simd, a, b)
    }

    #[inline(always)]
    fn simd_sub<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::sub_c32s(simd, a, b)
    }

    #[inline(always)]
    fn simd_mul<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::SplatVector<S>) -> Self::Vector<S> {
        S::mul_c32s_f32s(simd, a, b)
    }

    #[inline(always)]
    fn simd_div<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::SplatVector<S>) -> Self::Vector<S> {
        S::div_c32s_f32s(simd, a, b)
    }
}
/// `Complex64` elements use the token's `c64s` vectors, while the splat vector is the
/// real `f64s` type. Multiplicative operations therefore go through the crate's
/// complex-by-real helpers (`*_c64s_f64s`).
impl SimdTransformable for num_complex::Complex64 {
    type Vector<S: Simd> = S::c64s;
    type SplatVector<S: Simd> = S::f64s;

    #[inline(always)]
    fn as_simd<S: Simd>(_: S, slice: &[Self]) -> (&[Self::Vector<S>], &[Self]) {
        let (vectors, rest) = S::as_simd_c64s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn as_mut_simd<S: Simd>(_: S, slice: &mut [Self]) -> (&mut [Self::Vector<S>], &mut [Self]) {
        let (vectors, rest) = S::as_mut_simd_c64s(slice);
        (vectors, rest)
    }

    #[inline(always)]
    fn simd_splat<S: Simd>(simd: S, v: Self::Scalar) -> Self::SplatVector<S> {
        S::splat_f64s(simd, v)
    }

    #[inline(always)]
    fn simd_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::SplatVector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::mul_add_c64s_f64s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_negate_mul_add<S: Simd>(
        simd: S,
        a: Self::Vector<S>,
        b: Self::SplatVector<S>,
        c: Self::Vector<S>,
    ) -> Self::Vector<S> {
        S::neg_mul_add_c64s_f64s(simd, a, b, c)
    }

    #[inline(always)]
    fn simd_add<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::add_c64s(simd, a, b)
    }

    #[inline(always)]
    fn simd_sub<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::Vector<S>) -> Self::Vector<S> {
        S::sub_c64s(simd, a, b)
    }

    #[inline(always)]
    fn simd_mul<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::SplatVector<S>) -> Self::Vector<S> {
        S::mul_c64s_f64s(simd, a, b)
    }

    #[inline(always)]
    fn simd_div<S: Simd>(simd: S, a: Self::Vector<S>, b: Self::SplatVector<S>) -> Self::Vector<S> {
        S::div_c64s_f64s(simd, a, b)
    }
}
/// Shared `pulp::Arch` value, constructed with `pulp::Arch::new` the first time it is
/// accessed and reused afterwards.
pub static ARCH: LazyLock<pulp::Arch> = LazyLock::new(pulp::Arch::new);