ferray-ufunc 0.2.10

// ferray-ufunc: SIMD dispatch via pulp (REQ-17, REQ-18, REQ-20)
//
// Provides runtime CPU feature detection and dispatch for SIMD-accelerated
// elementwise operations. Uses `pulp::Arch` for portable dispatch across
// SSE2, AVX2, AVX-512 on x86_64 and NEON on aarch64.

use pulp::Arch;

use std::sync::atomic::{AtomicU8, Ordering};

/// Tri-state cache for the `FERRAY_FORCE_SCALAR` lookup.
///
/// `0` means the environment has not been consulted yet, `1` records a
/// cached `false`, and `2` records a cached `true`.
static FORCE_SCALAR_CACHED: AtomicU8 = AtomicU8::new(0);

/// Report whether SIMD has been disabled with `FERRAY_FORCE_SCALAR=1`.
///
/// The environment variable is consulted only while the cache is in its
/// uninitialized state; the answer is then remembered and returned directly
/// on later calls. Any value other than the exact string `"1"` (including an
/// unset or non-Unicode variable) counts as "not forced".
///
/// Call [`reset_force_scalar`] to make the next call read the environment
/// again (useful for testing).
#[inline]
pub fn force_scalar() -> bool {
    match FORCE_SCALAR_CACHED.load(Ordering::Relaxed) {
        // Nothing cached yet: read the environment and remember the answer.
        0 => {
            let forced = matches!(
                std::env::var("FERRAY_FORCE_SCALAR").as_deref(),
                Ok("1")
            );
            let encoded = if forced { 2 } else { 1 };
            FORCE_SCALAR_CACHED.store(encoded, Ordering::Relaxed);
            forced
        }
        // 2 encodes `true`; every other non-zero state is `false`.
        cached => cached == 2,
    }
}

/// Clear the cached `FERRAY_FORCE_SCALAR` decision.
///
/// Puts the cache back into its "uninitialized" state, so the next call to
/// [`force_scalar`] re-reads the environment variable instead of returning
/// the remembered value.
///
/// Useful in tests that need to toggle between SIMD and scalar paths.
pub fn reset_force_scalar() {
    // 0 is the sentinel that `force_scalar` treats as "not read yet".
    FORCE_SCALAR_CACHED.store(0, Ordering::SeqCst);
}

// ---------------------------------------------------------------------------
// Generic unary / binary dispatch on contiguous float slices
//
// These four entry points used to route through `pulp::Arch::dispatch` and
// a `WithSimd` wrapper, giving the impression that the kernel was
// vectorized. That was misleading — the wrappers ignored the `simd`
// parameter and ran the same scalar loop on every ISA. See #377 for the
// audit. Until a real vectorized transcendental library (sleef-rs,
// libmvec, or hand-rolled pulp intrinsics) lands, these are honestly
// labeled as scalar loops. The signatures are stable so a future fix
// can swap in a real SIMD kernel without breaking callers.
//
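// The real SIMD kernels below (`SqrtF64Op`, `SqrtF32Op`, `AbsF64Op`,
// `NegF64Op`, `SquareF64Op`, `ReciprocalF64Op`) DO use pulp intrinsics
// and are unaffected by this change. `ExpFastF64Op` / `ExpFastF32Op`
// also still dispatch through pulp, but they rely on per-ISA compilation
// and LLVM auto-vectorization rather than explicit intrinsics.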
// ---------------------------------------------------------------------------

/// Map `scalar_fn` over a contiguous `f32` slice, writing into `output`.
///
/// This is a plain scalar loop on every platform. It was once routed through
/// a SIMD dispatch wrapper that never actually vectorized, and is now labeled
/// for what it is. Callers that want real vectorization should use one of the
/// pulp-dispatched entry points in this module (`simd_sqrt_f32`,
/// `dispatch_exp_fast_f32`, …).
///
/// Debug builds assert that `input` and `output` have equal lengths; without
/// that assertion only the overlapping prefix of the two slices is processed.
#[inline]
pub fn dispatch_unary_f32(input: &[f32], output: &mut [f32], scalar_fn: fn(f32) -> f32) {
    debug_assert_eq!(input.len(), output.len());
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = scalar_fn(src));
}

/// Map `scalar_fn` over a contiguous `f64` slice, writing into `output`.
///
/// The `f64` twin of [`dispatch_unary_f32`]: a plain scalar loop with no
/// SIMD dispatch. Debug builds assert that both slices have equal lengths;
/// without that assertion only the overlapping prefix is processed.
#[inline]
pub fn dispatch_unary_f64(input: &[f64], output: &mut [f64], scalar_fn: fn(f64) -> f64) {
    debug_assert_eq!(input.len(), output.len());
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = scalar_fn(src));
}

/// Combine two contiguous `f32` slices elementwise with `scalar_fn`.
///
/// `output[k]` receives `scalar_fn(a[k], b[k])`. This is a plain scalar loop
/// with no SIMD dispatch. Debug builds assert that `a`, `b` and `output` all
/// have the same length; without that assertion only the prefix common to
/// all three slices is processed.
#[inline]
pub fn dispatch_binary_f32(
    a: &[f32],
    b: &[f32],
    output: &mut [f32],
    scalar_fn: fn(f32, f32) -> f32,
) {
    debug_assert_eq!(a.len(), b.len());
    debug_assert_eq!(a.len(), output.len());
    a.iter()
        .zip(b)
        .zip(output.iter_mut())
        .for_each(|((&lhs, &rhs), dst)| *dst = scalar_fn(lhs, rhs));
}

/// Combine two contiguous `f64` slices elementwise with `scalar_fn`.
///
/// `output[k]` receives `scalar_fn(a[k], b[k])`. This is a plain scalar loop
/// with no SIMD dispatch. Debug builds assert that `a`, `b` and `output` all
/// have the same length; without that assertion only the prefix common to
/// all three slices is processed.
#[inline]
pub fn dispatch_binary_f64(
    a: &[f64],
    b: &[f64],
    output: &mut [f64],
    scalar_fn: fn(f64, f64) -> f64,
) {
    debug_assert_eq!(a.len(), b.len());
    debug_assert_eq!(a.len(), output.len());
    a.iter()
        .zip(b)
        .zip(output.iter_mut())
        .for_each(|((&lhs, &rhs), dst)| *dst = scalar_fn(lhs, rhs));
}

/// Map an `f32` kernel over an `f16` slice by round-tripping through `f32`.
///
/// Every input element is widened with `to_f32`, passed to `scalar_fn`, and
/// the result is narrowed back with `half::f16::from_f32` before being stored
/// in `output`. Debug builds assert that both slices have equal lengths.
#[cfg(feature = "f16")]
#[inline]
pub fn dispatch_unary_f16(
    input: &[half::f16],
    output: &mut [half::f16],
    scalar_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(input.len(), output.len());
    output.iter_mut().zip(input).for_each(|(dst, &src)| {
        let widened = src.to_f32();
        *dst = half::f16::from_f32(scalar_fn(widened));
    });
}

/// Combine two `f16` slices elementwise by round-tripping through `f32`.
///
/// Each pair of inputs is widened with `to_f32`, passed to `scalar_fn`, and
/// the result is narrowed back with `half::f16::from_f32` before being stored
/// in `output`. Debug builds assert that all three slices have equal lengths.
#[cfg(feature = "f16")]
#[inline]
pub fn dispatch_binary_f16(
    a: &[half::f16],
    b: &[half::f16],
    output: &mut [half::f16],
    scalar_fn: fn(f32, f32) -> f32,
) {
    debug_assert_eq!(a.len(), b.len());
    debug_assert_eq!(a.len(), output.len());
    a.iter()
        .zip(b)
        .zip(output.iter_mut())
        .for_each(|((&lhs, &rhs), dst)| {
            let combined = scalar_fn(lhs.to_f32(), rhs.to_f32());
            *dst = half::f16::from_f32(combined);
        });
}

// ---------------------------------------------------------------------------
// SIMD-accelerated operations that use actual hardware SIMD intrinsics
// ---------------------------------------------------------------------------

/// Elementwise square root over `f64` slices.
///
/// Unless [`force_scalar`] reports that SIMD is disabled, the work is handed
/// to the `SqrtF64Op` kernel through `pulp::Arch` dispatch. In forced-scalar
/// mode a plain `f64::sqrt` loop is used instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn simd_sqrt_f64(input: &[f64], output: &mut [f64]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(SqrtF64Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = src.sqrt());
}

/// Elementwise square root over `f32` slices.
///
/// Unless [`force_scalar`] reports that SIMD is disabled, the work is handed
/// to the `SqrtF32Op` kernel through `pulp::Arch` dispatch. In forced-scalar
/// mode a plain `f32::sqrt` loop is used instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn simd_sqrt_f32(input: &[f32], output: &mut [f32]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(SqrtF32Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = src.sqrt());
}

/// Elementwise absolute value over `f64` slices.
///
/// Unless [`force_scalar`] reports that SIMD is disabled, the work is handed
/// to the `AbsF64Op` kernel through `pulp::Arch` dispatch. In forced-scalar
/// mode a plain `f64::abs` loop is used instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn simd_abs_f64(input: &[f64], output: &mut [f64]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(AbsF64Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = src.abs());
}

/// Elementwise negation over `f64` slices.
///
/// Unless [`force_scalar`] reports that SIMD is disabled, the work is handed
/// to the `NegF64Op` kernel through `pulp::Arch` dispatch. In forced-scalar
/// mode a plain unary-minus loop is used instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn simd_neg_f64(input: &[f64], output: &mut [f64]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(NegF64Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = -src);
}

/// Elementwise square (`x * x`) over `f64` slices.
///
/// Unless [`force_scalar`] reports that SIMD is disabled, the work is handed
/// to the `SquareF64Op` kernel through `pulp::Arch` dispatch. In
/// forced-scalar mode a plain multiply loop is used instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn simd_square_f64(input: &[f64], output: &mut [f64]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(SquareF64Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = src * src);
}

/// Elementwise reciprocal (`1.0 / x`) over `f64` slices.
///
/// Unless [`force_scalar`] reports that SIMD is disabled, the work is handed
/// to the `ReciprocalF64Op` kernel through `pulp::Arch` dispatch. In
/// forced-scalar mode a plain division loop is used instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn simd_reciprocal_f64(input: &[f64], output: &mut [f64]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(ReciprocalF64Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    output
        .iter_mut()
        .zip(input)
        .for_each(|(dst, &src)| *dst = 1.0 / src);
}

// ---------------------------------------------------------------------------
// SIMD intrinsic implementations (actual hardware SIMD, not scalar fallback)
// ---------------------------------------------------------------------------
//
// The `*Op` kernels in this section genuinely use pulp SIMD intrinsics and
// will vectorize to SSE2/AVX2/AVX-512 on x86_64 or NEON on aarch64.
// The four generic dispatch functions at the top of this file were
// previously also routed through pulp via Unary{F32,F64}Op /
// Binary{F32,F64}Op wrappers, but those impls ignored the SIMD
// parameter and ran scalar loops — see #377 for the audit. They have
// been collapsed into direct scalar loops above until a real
// vectorized transcendental library is integrated.

/// Kernel object for the vectorized `f64` square root.
///
/// Carries the source and destination slices into `pulp::Arch::dispatch`;
/// the work itself happens in the `WithSimd` implementation below.
struct SqrtF64Op<'a> {
    /// Values to take the square root of.
    input: &'a [f64],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f64],
}

impl pulp::WithSimd for SqrtF64Op<'_> {
    type Output = ();

    /// Compute `sqrt` for every element of `input` into `output`.
    ///
    /// The slice is processed in three phases: a 4x-unrolled vector loop, a
    /// single-register vector loop for the remaining whole registers, and a
    /// scalar loop for the final partial register.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
        let n = self.input.len();
        // Number of f64 values held by one `S::f64s` register.
        let lane_count = size_of::<S::f64s>() / size_of::<f64>();
        // Elements consumed by one iteration of the unrolled loop.
        let stride = lane_count * 4;
        // Largest multiple of `stride` that fits in `n`.
        let unrolled_end = n - (n % stride);
        // Largest multiple of `lane_count` that fits in `n`.
        let simd_end = n - (n % lane_count);

        // 4-wide unroll to hide sqrt's ~12-cycle latency
        let mut i = 0;
        while i < unrolled_end {
            // Issue all four loads, then all four sqrts, then all four
            // stores, so the four sqrt operations are independent of each other.
            let v0 = simd.partial_load_f64s(&self.input[i..i + lane_count]);
            let v1 = simd.partial_load_f64s(&self.input[i + lane_count..i + lane_count * 2]);
            let v2 = simd.partial_load_f64s(&self.input[i + lane_count * 2..i + lane_count * 3]);
            let v3 = simd.partial_load_f64s(&self.input[i + lane_count * 3..i + stride]);
            let r0 = simd.sqrt_f64s(v0);
            let r1 = simd.sqrt_f64s(v1);
            let r2 = simd.sqrt_f64s(v2);
            let r3 = simd.sqrt_f64s(v3);
            simd.partial_store_f64s(&mut self.output[i..i + lane_count], r0);
            simd.partial_store_f64s(&mut self.output[i + lane_count..i + lane_count * 2], r1);
            simd.partial_store_f64s(&mut self.output[i + lane_count * 2..i + lane_count * 3], r2);
            simd.partial_store_f64s(&mut self.output[i + lane_count * 3..i + stride], r3);
            i += stride;
        }
        // Whole registers left over after the unrolled loop (fewer than four).
        while i < simd_end {
            let v = simd.partial_load_f64s(&self.input[i..i + lane_count]);
            let r = simd.sqrt_f64s(v);
            simd.partial_store_f64s(&mut self.output[i..i + lane_count], r);
            i += lane_count;
        }
        // Scalar tail: the final `n % lane_count` elements.
        for j in simd_end..n {
            self.output[j] = self.input[j].sqrt();
        }
    }
}

/// Kernel object for the vectorized `f32` square root.
///
/// Carries the source and destination slices into `pulp::Arch::dispatch`.
struct SqrtF32Op<'a> {
    /// Values to take the square root of.
    input: &'a [f32],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f32],
}

impl pulp::WithSimd for SqrtF32Op<'_> {
    type Output = ();

    /// Compute `sqrt` for every element: whole registers through
    /// `sqrt_f32s`, then a scalar loop for the leftover elements.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
        // Number of f32 values held by one `S::f32s` register.
        let lanes = size_of::<S::f32s>() / size_of::<f32>();
        let total = self.input.len();
        let vector_end = total - (total % lanes);

        // Whole-register portion.
        let mut offset = 0;
        while offset < vector_end {
            let next = offset + lanes;
            let loaded = simd.partial_load_f32s(&self.input[offset..next]);
            let rooted = simd.sqrt_f32s(loaded);
            simd.partial_store_f32s(&mut self.output[offset..next], rooted);
            offset = next;
        }

        // Scalar tail.
        for idx in vector_end..total {
            self.output[idx] = self.input[idx].sqrt();
        }
    }
}

/// Kernel object for the vectorized `f64` absolute value.
///
/// Carries the source and destination slices into `pulp::Arch::dispatch`.
struct AbsF64Op<'a> {
    /// Values to take the absolute value of.
    input: &'a [f64],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f64],
}

impl pulp::WithSimd for AbsF64Op<'_> {
    type Output = ();

    /// Compute `abs` for every element: whole registers through
    /// `abs_f64s`, then a scalar loop for the leftover elements.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
        // Number of f64 values held by one `S::f64s` register.
        let lanes = size_of::<S::f64s>() / size_of::<f64>();
        let total = self.input.len();
        let vector_end = total - (total % lanes);

        // Whole-register portion.
        let mut offset = 0;
        while offset < vector_end {
            let next = offset + lanes;
            let loaded = simd.partial_load_f64s(&self.input[offset..next]);
            let magnitude = simd.abs_f64s(loaded);
            simd.partial_store_f64s(&mut self.output[offset..next], magnitude);
            offset = next;
        }

        // Scalar tail.
        for idx in vector_end..total {
            self.output[idx] = self.input[idx].abs();
        }
    }
}

/// Kernel object for the vectorized `f64` negation.
///
/// Carries the source and destination slices into `pulp::Arch::dispatch`.
struct NegF64Op<'a> {
    /// Values to negate.
    input: &'a [f64],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f64],
}

impl pulp::WithSimd for NegF64Op<'_> {
    type Output = ();

    /// Negate every element: whole registers through `neg_f64s`, then a
    /// scalar loop for the leftover elements.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
        // Number of f64 values held by one `S::f64s` register.
        let lanes = size_of::<S::f64s>() / size_of::<f64>();
        let total = self.input.len();
        let vector_end = total - (total % lanes);

        // Whole-register portion.
        let mut offset = 0;
        while offset < vector_end {
            let next = offset + lanes;
            let loaded = simd.partial_load_f64s(&self.input[offset..next]);
            let negated = simd.neg_f64s(loaded);
            simd.partial_store_f64s(&mut self.output[offset..next], negated);
            offset = next;
        }

        // Scalar tail.
        for idx in vector_end..total {
            self.output[idx] = -self.input[idx];
        }
    }
}

/// Kernel object for the vectorized `f64` square (`x * x`).
///
/// Carries the source and destination slices into `pulp::Arch::dispatch`.
struct SquareF64Op<'a> {
    /// Values to square.
    input: &'a [f64],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f64],
}

impl pulp::WithSimd for SquareF64Op<'_> {
    type Output = ();

    /// Square every element: whole registers through `mul_f64s` with the
    /// register as both operands, then a scalar loop for the leftovers.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
        // Number of f64 values held by one `S::f64s` register.
        let lanes = size_of::<S::f64s>() / size_of::<f64>();
        let total = self.input.len();
        let vector_end = total - (total % lanes);

        // Whole-register portion.
        let mut offset = 0;
        while offset < vector_end {
            let next = offset + lanes;
            let loaded = simd.partial_load_f64s(&self.input[offset..next]);
            let squared = simd.mul_f64s(loaded, loaded);
            simd.partial_store_f64s(&mut self.output[offset..next], squared);
            offset = next;
        }

        // Scalar tail.
        for idx in vector_end..total {
            self.output[idx] = self.input[idx] * self.input[idx];
        }
    }
}

/// Kernel object for the vectorized `f64` reciprocal (`1.0 / x`).
///
/// Carries the source and destination slices into `pulp::Arch::dispatch`.
struct ReciprocalF64Op<'a> {
    /// Values to invert.
    input: &'a [f64],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f64],
}

impl pulp::WithSimd for ReciprocalF64Op<'_> {
    type Output = ();

    /// Invert every element: whole registers through `div_f64s` against a
    /// splatted `1.0`, then a scalar loop for the leftover elements.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
        // Number of f64 values held by one `S::f64s` register.
        let lanes = size_of::<S::f64s>() / size_of::<f64>();
        let total = self.input.len();
        let vector_end = total - (total % lanes);
        // Numerator register, built once and reused for every division.
        let ones = simd.splat_f64s(1.0);

        // Whole-register portion.
        let mut offset = 0;
        while offset < vector_end {
            let next = offset + lanes;
            let loaded = simd.partial_load_f64s(&self.input[offset..next]);
            let inverted = simd.div_f64s(ones, loaded);
            simd.partial_store_f64s(&mut self.output[offset..next], inverted);
            offset = next;
        }

        // Scalar tail.
        for idx in vector_end..total {
            self.output[idx] = 1.0 / self.input[idx];
        }
    }
}

// ---------------------------------------------------------------------------
// exp_fast: pulp-dispatched batch operations
// ---------------------------------------------------------------------------
// These route exp_fast through pulp's per-ISA compilation so LLVM can
// auto-vectorize the Remez polynomial for AVX2+FMA without requiring
// the user to set -C target-cpu=native.
// See: https://github.com/dollspace-gay/ferray/issues/6

/// Kernel object that runs `exp_fast_f64` over a slice inside pulp's
/// per-ISA dispatch.
///
/// No explicit SIMD intrinsics are used here: the loop is ordinary scalar
/// code, and the intent is for the compiler to auto-vectorize it for
/// whichever instruction set pulp selected.
struct ExpFastF64Op<'a> {
    /// Exponents to evaluate.
    input: &'a [f64],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f64],
}

impl pulp::WithSimd for ExpFastF64Op<'_> {
    type Output = ();

    /// Apply `crate::fast_exp::exp_fast_f64` to every element of `input`.
    ///
    /// # Panics
    ///
    /// Panics if `output` is shorter than `input`.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, _simd: S) -> Self::Output {
        let Self { input, output } = self;
        // Check the bounds once, up front. Indexing `output[i]` inside the
        // loop adds a panic branch to every iteration, which gets in the way
        // of the auto-vectorization this kernel exists for. A too-short
        // `output` still panics, just before the loop rather than part-way.
        let output = &mut output[..input.len()];
        for (dst, &src) in output.iter_mut().zip(input) {
            *dst = crate::fast_exp::exp_fast_f64(src);
        }
    }
}

/// Kernel object that runs `exp_fast_f32` over a slice inside pulp's
/// per-ISA dispatch.
///
/// No explicit SIMD intrinsics are used here: the loop is ordinary scalar
/// code, and the intent is for the compiler to auto-vectorize it for
/// whichever instruction set pulp selected.
struct ExpFastF32Op<'a> {
    /// Exponents to evaluate.
    input: &'a [f32],
    /// Destination for the results, written at the same indices as `input`.
    output: &'a mut [f32],
}

impl pulp::WithSimd for ExpFastF32Op<'_> {
    type Output = ();

    /// Apply `crate::fast_exp::exp_fast_f32` to every element of `input`.
    ///
    /// # Panics
    ///
    /// Panics if `output` is shorter than `input`.
    #[inline(always)]
    fn with_simd<S: pulp::Simd>(self, _simd: S) -> Self::Output {
        let Self { input, output } = self;
        // Check the bounds once, up front. Indexing `output[i]` inside the
        // loop adds a panic branch to every iteration, which gets in the way
        // of the auto-vectorization this kernel exists for. A too-short
        // `output` still panics, just before the loop rather than part-way.
        let output = &mut output[..input.len()];
        for (dst, &src) in output.iter_mut().zip(input) {
            *dst = crate::fast_exp::exp_fast_f32(src);
        }
    }
}

/// Evaluate `exp_fast_f64` over an `f64` slice via pulp's per-ISA dispatch.
///
/// Routing the loop through `pulp::Arch` is meant to let the Even/Odd Remez
/// polynomial auto-vectorize for AVX2+FMA even when the crate is built
/// without `-C target-cpu=native`. When [`force_scalar`] reports that SIMD
/// is disabled, the same function is applied in a plain loop instead.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn dispatch_exp_fast_f64(input: &[f64], output: &mut [f64]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(ExpFastF64Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    for (idx, &x) in input.iter().enumerate() {
        output[idx] = crate::fast_exp::exp_fast_f64(x);
    }
}

/// Evaluate `exp_fast_f32` over an `f32` slice via pulp's per-ISA dispatch.
///
/// When [`force_scalar`] reports that SIMD is disabled, the same function is
/// applied in a plain loop instead of going through `pulp::Arch`.
///
/// Debug builds assert that `input` and `output` have equal lengths.
#[inline]
pub fn dispatch_exp_fast_f32(input: &[f32], output: &mut [f32]) {
    debug_assert_eq!(input.len(), output.len());
    if !force_scalar() {
        Arch::new().dispatch(ExpFastF32Op { input, output });
        return;
    }
    // Forced-scalar fallback.
    for (idx, &x) in input.iter().enumerate() {
        output[idx] = crate::fast_exp::exp_fast_f32(x);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn dispatch_unary_f32_works() {
        // `dispatch_unary_f32` is a plain scalar loop on every platform and
        // never consults FERRAY_FORCE_SCALAR.
        let input = [1.0f32, 4.0, 9.0, 16.0];
        let mut output = [0.0f32; 4];
        dispatch_unary_f32(&input, &mut output, f32::sqrt);
        assert_eq!(output, [1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn dispatch_unary_f64_works() {
        // Scalar loop, same as the f32 variant.
        let input = [1.0f64, 4.0, 9.0, 16.0];
        let mut output = [0.0f64; 4];
        dispatch_unary_f64(&input, &mut output, f64::sqrt);
        assert_eq!(output, [1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn dispatch_binary_f32_works() {
        let a = [1.0f32, 2.0, 3.0];
        let b = [4.0f32, 5.0, 6.0];
        let mut out = [0.0f32; 3];
        dispatch_binary_f32(&a, &b, &mut out, |x, y| x + y);
        assert_eq!(out, [5.0, 7.0, 9.0]);
    }

    #[test]
    fn dispatch_binary_f64_works() {
        let a = [1.0f64, 2.0, 3.0];
        let b = [4.0f64, 5.0, 6.0];
        let mut out = [0.0f64; 3];
        dispatch_binary_f64(&a, &b, &mut out, |x, y| x * y);
        assert_eq!(out, [4.0, 10.0, 18.0]);
    }

    #[test]
    fn force_scalar_env() {
        // `force_scalar()` caches its first read of FERRAY_FORCE_SCALAR in an
        // atomic. The expectation is derived from the environment instead of
        // being hard-coded to `false`, so this test also passes when the
        // suite is run as `FERRAY_FORCE_SCALAR=1 cargo test` to exercise the
        // scalar fallbacks.
        let expected = matches!(
            std::env::var("FERRAY_FORCE_SCALAR").as_deref(),
            Ok("1")
        );
        assert_eq!(force_scalar(), expected);
        // A second call is served from the cache and must agree.
        assert_eq!(force_scalar(), expected);
    }

    #[test]
    fn simd_f64_kernels_match_scalar() {
        // 45 elements reaches the unrolled loop, the single-register loop and
        // the scalar tail of `SqrtF64Op` for registers of 2, 4 or 8 lanes.
        let positive: Vec<f64> = (1..=45).map(|k| f64::from(k) * 0.25).collect();
        let signed: Vec<f64> = positive
            .iter()
            .enumerate()
            .map(|(idx, &x)| if idx % 2 == 0 { x } else { -x })
            .collect();

        let check = |kernel: fn(&[f64], &mut [f64]), scalar: fn(f64) -> f64, data: &[f64]| {
            let mut out = vec![0.0f64; data.len()];
            kernel(data, &mut out);
            let expected: Vec<f64> = data.iter().map(|&x| scalar(x)).collect();
            assert_eq!(out, expected);
        };

        check(simd_sqrt_f64, f64::sqrt, &positive);
        check(simd_abs_f64, f64::abs, &signed);
        check(simd_neg_f64, |x| -x, &signed);
        check(simd_square_f64, |x| x * x, &signed);
        check(simd_reciprocal_f64, |x| 1.0 / x, &positive);
    }

    #[test]
    fn simd_sqrt_f32_matches_scalar() {
        // 45 elements leaves a scalar tail for registers of 4, 8 or 16 lanes.
        let input: Vec<f32> = (1u8..=45).map(|k| f32::from(k) * 0.25).collect();
        let mut out = vec![0.0f32; input.len()];
        simd_sqrt_f32(&input, &mut out);
        let expected: Vec<f32> = input.iter().map(|&x| x.sqrt()).collect();
        assert_eq!(out, expected);
    }

    #[cfg(feature = "f16")]
    #[test]
    fn dispatch_unary_f16_works() {
        let input = [
            half::f16::from_f32(1.0),
            half::f16::from_f32(4.0),
            half::f16::from_f32(9.0),
            half::f16::from_f32(16.0),
        ];
        let mut output = [half::f16::ZERO; 4];
        super::dispatch_unary_f16(&input, &mut output, f32::sqrt);
        let expected = [1.0f32, 2.0, 3.0, 4.0];
        for (o, &e) in output.iter().zip(expected.iter()) {
            assert!((o.to_f32() - e).abs() < 0.01);
        }
    }

    #[cfg(feature = "f16")]
    #[test]
    fn dispatch_binary_f16_works() {
        let a = [
            half::f16::from_f32(1.0),
            half::f16::from_f32(2.0),
            half::f16::from_f32(3.0),
        ];
        let b = [
            half::f16::from_f32(4.0),
            half::f16::from_f32(5.0),
            half::f16::from_f32(6.0),
        ];
        let mut out = [half::f16::ZERO; 3];
        super::dispatch_binary_f16(&a, &b, &mut out, |x, y| x + y);
        let expected = [5.0f32, 7.0, 9.0];
        for (o, &e) in out.iter().zip(expected.iter()) {
            assert!((o.to_f32() - e).abs() < 0.01);
        }
    }
}