// ferray-ufunc 0.5.0

// ferray-ufunc: Arithmetic functions
//
// add, subtract, multiply, divide, true_divide, floor_divide, power,
// remainder, mod_, fmod, divmod, absolute, fabs, sign, negative, positive,
// reciprocal, sqrt, cbrt, square, heaviside, gcd, lcm
//
// Cumulative: cumsum, cumprod, nancumsum, nancumprod
// Differences: diff, ediff1d, gradient
// Products: cross
// Integration: trapezoid
//
// Reduction: add_reduce, add_accumulate, multiply_outer
//
// ## REQ status — integer-dtype arithmetic + NEP-50 true-division
//
// SHIPPED:
//   - `divide` / `divide_into` / `true_divide` are *true division* (NumPy
//     `PyUFunc_TrueDivisionTypeResolver`, generate_umath.py:422): integer
//     operands promote to f64, float operands keep their float type. The
//     output element type is `<T as TrueDivide>::Output` (i*/u* -> f64,
//     f32 -> f32, f64 -> f64). Integer divide-by-zero NEVER panics — the
//     operands are cast to f64 first so `x/0.0` yields inf/nan exactly as
//     NumPy does. f32/f64 callers keep the byte-identical SIMD fast path.
//   - `floor_divide_int` / `remainder_int` / `mod_int`: integer floor-div
//     and Python-style modulo with divisor-zero -> 0 (NumPy
//     `loops_modulo`/`PyUFunc_RemainderTypeResolver` returns 0 + a
//     RuntimeWarning, never a panic — generate_umath.py:405,1039).
//   - `power_int`: integer exponentiation, int -> int (NumPy `power`
//     `TD(ints)`, generate_umath.py:480).
//   - `sign_int` / `negative_int` / `absolute_int`: preserve the integer
//     dtype (NumPy `sign`/`negative`/`absolute` `TD(ints)`,
//     generate_umath.py:496,516,534). Wrapping where NumPy wraps
//     (e.g. `negative(i8::MIN)`).
//   - `cumsum` / `cumprod` / `cumulative_sum` / `cumulative_prod` /
//     `add_accumulate` narrow-int accumulator promotion: each widens every
//     element into the NumPy reduction accumulator `ReduceAcc::Acc`
//     (ferray-core `array/reductions.rs`, reused here) — i8/i16/i32 -> i64,
//     u8/u16/u32 -> u64, bool -> i64, f32/f64/i64/complex unchanged — so a
//     narrow-int cumulative never overflows and its dtype matches NumPy's
//     promoted result (`numpy/_core/fromnumeric.py:2850-2855` cumsum,
//     `:3424-3429` cumprod; live oracle
//     `np.cumsum(np.array([100,100,100],dtype=np.int8)) == [100,200,300]`
//     int64). The shared `cumulative_promoted` kernel (this file) is the
//     consumer; `f32`/`f64` callers keep the byte-identical accumulation.
//
// Consumers: `divide` (public re-export, also reached via the `/` operator
// through `operator_overloads::array_div`) consumes `TrueDivide`. The int
// ops are re-exported from the crate root and reached through the public
// ufunc surface.

use ferray_core::Array;
use ferray_core::array::reductions::ReduceAcc;
use ferray_core::dimension::{Dimension, Ix1, IxDyn};
use ferray_core::dtype::Element;
use ferray_core::error::{FerrayError, FerrayResult};
use num_complex::Complex;
use num_traits::Float;

use crate::helpers::{
    binary_broadcast_op, binary_elementwise_op, binary_elementwise_op_into, try_simd_f32_binary,
    try_simd_f64_binary, unary_float_op, unary_float_op_compute, unary_float_op_into,
};
use crate::kernels::simd_f32::{add_f32, div_f32, mul_f32, sub_f32};
use crate::kernels::simd_f64::{add_f64, div_f64, mul_f64, sub_f64};

// ---------------------------------------------------------------------------
// Basic arithmetic (binary, same-shape)
// ---------------------------------------------------------------------------

/// Per-element kernels for `add` / `subtract` / `multiply`.
///
/// NumPy's fixed-width integer dtypes use modular arithmetic: an overflowing
/// `add`, `subtract` or `multiply` wraps around rather than trapping, e.g.
/// `np.add(int8(100), int8(100)) == int8(-56)`. A plain Rust `+` would panic
/// on that input in a debug build, so integer types implement this trait
/// with the `wrapping_*` family. Floating-point types implement it with the
/// ordinary IEEE operators, so their results are unchanged.
pub trait WrappingArith: Copy {
    /// Sum of `self` and `rhs`; integer implementations wrap on overflow.
    fn wadd(self, rhs: Self) -> Self;
    /// Difference `self - rhs`; integer implementations wrap on overflow.
    fn wsub(self, rhs: Self) -> Self;
    /// Product of `self` and `rhs`; integer implementations wrap on overflow.
    fn wmul(self, rhs: Self) -> Self;
}

// Integer implementations: every operation is the two's-complement
// wrapping variant.
macro_rules! impl_wrapping_arith_int {
    ($($int:ty),* $(,)?) => {
        $(
            impl WrappingArith for $int {
                #[inline]
                fn wadd(self, rhs: Self) -> Self {
                    <$int>::wrapping_add(self, rhs)
                }
                #[inline]
                fn wsub(self, rhs: Self) -> Self {
                    <$int>::wrapping_sub(self, rhs)
                }
                #[inline]
                fn wmul(self, rhs: Self) -> Self {
                    <$int>::wrapping_mul(self, rhs)
                }
            }
        )*
    };
}

impl_wrapping_arith_int!(i8, i16, i32, i64, i128, u8, u16, u32, u64, u128);

// Floating-point implementations: plain IEEE `+` / `-` / `*`, nothing wraps.
macro_rules! impl_wrapping_arith_float {
    ($($float:ty),* $(,)?) => {
        $(
            impl WrappingArith for $float {
                #[inline]
                fn wadd(self, rhs: Self) -> Self {
                    self + rhs
                }
                #[inline]
                fn wsub(self, rhs: Self) -> Self {
                    self - rhs
                }
                #[inline]
                fn wmul(self, rhs: Self) -> Self {
                    self * rhs
                }
            }
        )*
    };
}

impl_wrapping_arith_float!(f32, f64);

// Complex numbers have no notion of wrapping: `WrappingArith` for
// `Complex<f32>` / `Complex<f64>` simply forwards to the native
// `num_complex` operators (`+`, `-`, `*`). Providing the impl is what lets
// the generic `add` / `subtract` / `multiply` functions accept complex
// element types.
macro_rules! impl_wrapping_arith_complex {
    ($($part:ty),* $(,)?) => {
        $(
            impl WrappingArith for Complex<$part> {
                #[inline]
                fn wadd(self, rhs: Self) -> Self {
                    self + rhs
                }
                #[inline]
                fn wsub(self, rhs: Self) -> Self {
                    self - rhs
                }
                #[inline]
                fn wmul(self, rhs: Self) -> Self {
                    self * rhs
                }
            }
        )*
    };
}

impl_wrapping_arith_complex!(f32, f64);

// Half-precision arithmetic: both operands are widened to `f32`, combined
// there, and the result is narrowed back to `f16`.
#[cfg(feature = "f16")]
impl WrappingArith for half::f16 {
    #[inline]
    fn wadd(self, rhs: Self) -> Self {
        let (lhs, rhs) = (self.to_f32(), rhs.to_f32());
        Self::from_f32(lhs + rhs)
    }
    #[inline]
    fn wsub(self, rhs: Self) -> Self {
        let (lhs, rhs) = (self.to_f32(), rhs.to_f32());
        Self::from_f32(lhs - rhs)
    }
    #[inline]
    fn wmul(self, rhs: Self) -> Self {
        let (lhs, rhs) = (self.to_f32(), rhs.to_f32());
        Self::from_f32(lhs * rhs)
    }
}

/// Elementwise `a + b`.
///
/// Dispatch order:
/// 1. the f64 SIMD slice kernel (`add_f64`), when `try_simd_f64_binary`
///    accepts the inputs;
/// 2. the f32 SIMD slice kernel (`add_f32`), when `try_simd_f32_binary`
///    accepts the inputs;
/// 3. otherwise the generic elementwise helper with
///    [`WrappingArith::wadd`], so integer dtypes wrap on overflow
///    (`np.add(int8(100), int8(100)) == -56`) and floats use plain `+`. (#88)
pub fn add<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + WrappingArith,
    D: Dimension,
{
    try_simd_f64_binary(a, b, add_f64)
        .or_else(|| try_simd_f32_binary(a, b, add_f32))
        .unwrap_or_else(|| binary_elementwise_op(a, b, T::wadd))
}

/// Writes `a + b` into the caller-supplied `out`, the counterpart of
/// `np.add(a, b, out=out)`.
///
/// No result array is allocated. `a`, `b` and `out` must all be contiguous
/// (C-order) and share one shape; this path does not broadcast — call
/// [`add`] when broadcasting is required.
///
/// # Errors
/// - `FerrayError::ShapeMismatch` if shapes differ.
/// - `FerrayError::InvalidValue` if any array is non-contiguous.
pub fn add_into<T, D>(a: &Array<T, D>, b: &Array<T, D>, out: &mut Array<T, D>) -> FerrayResult<()>
where
    T: Element + WrappingArith,
    D: Dimension,
{
    binary_elementwise_op_into(a, b, out, "add", |lhs: T, rhs: T| lhs.wadd(rhs))
}

/// Elementwise `a - b`.
///
/// Tries the f64 SIMD kernel (`sub_f64`), then the f32 SIMD kernel
/// (`sub_f32`), and otherwise falls back to the generic elementwise helper
/// with [`WrappingArith::wsub`], so integer dtypes wrap on overflow (#88).
pub fn subtract<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + WrappingArith,
    D: Dimension,
{
    try_simd_f64_binary(a, b, sub_f64)
        .or_else(|| try_simd_f32_binary(a, b, sub_f32))
        .unwrap_or_else(|| binary_elementwise_op(a, b, T::wsub))
}

/// Writes `a - b` into the caller-supplied `out` — the `_into` form of
/// [`subtract`]. Integer dtypes wrap on overflow via
/// [`WrappingArith::wsub`].
pub fn subtract_into<T, D>(
    a: &Array<T, D>,
    b: &Array<T, D>,
    out: &mut Array<T, D>,
) -> FerrayResult<()>
where
    T: Element + WrappingArith,
    D: Dimension,
{
    binary_elementwise_op_into(a, b, out, "subtract", |lhs: T, rhs: T| lhs.wsub(rhs))
}

/// Elementwise `a * b`.
///
/// Tries the f64 SIMD kernel (`mul_f64`), then the f32 SIMD kernel
/// (`mul_f32`), and otherwise falls back to the generic elementwise helper
/// with [`WrappingArith::wmul`], so integer dtypes wrap on overflow (#88).
pub fn multiply<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + WrappingArith,
    D: Dimension,
{
    try_simd_f64_binary(a, b, mul_f64)
        .or_else(|| try_simd_f32_binary(a, b, mul_f32))
        .unwrap_or_else(|| binary_elementwise_op(a, b, T::wmul))
}

/// Writes `a * b` into the caller-supplied `out` — the `_into` form of
/// [`multiply`]. Integer dtypes wrap on overflow via
/// [`WrappingArith::wmul`].
pub fn multiply_into<T, D>(
    a: &Array<T, D>,
    b: &Array<T, D>,
    out: &mut Array<T, D>,
) -> FerrayResult<()>
where
    T: Element + WrappingArith,
    D: Dimension,
{
    binary_elementwise_op_into(a, b, out, "multiply", |lhs: T, rhs: T| lhs.wmul(rhs))
}

/// NumPy true-division element resolver (`PyUFunc_TrueDivisionTypeResolver`,
/// generate_umath.py:422).
///
/// `np.divide` / `np.true_divide` always perform *true* (float) division.
/// The output dtype depends on the input dtype: integer inputs promote to
/// `float64`, `float32` stays `float32`, `float64` stays `float64`. The
/// associated [`TrueDivide::Output`] captures that mapping at compile time
/// so `divide(&Array<i32>, &Array<i32>)` returns `Array<f64>`.
///
/// The integer implementations in this file cast both operands to `f64`
/// before dividing (never Rust integer `/`), so an integer divide-by-zero
/// produces `inf`/`nan` instead of panicking — matching NumPy's
/// RuntimeWarning-only behaviour.
pub trait TrueDivide: Element + Copy {
    /// The element type produced by dividing two values of `Self`, i.e. the
    /// dtype of `np.true_divide(self_dtype, self_dtype)`.
    type Output: Element + Copy;

    /// True (float) division of a single pair of elements: `self / rhs`
    /// evaluated in the [`TrueDivide::Output`] type.
    fn true_div(self, rhs: Self) -> Self::Output;
}

impl TrueDivide for f64 {
    type Output = f64;
    #[inline]
    fn true_div(self, rhs: f64) -> f64 {
        self / rhs
    }
}

impl TrueDivide for f32 {
    type Output = f32;
    #[inline]
    fn true_div(self, rhs: f32) -> f32 {
        self / rhs
    }
}

// Integer true division always yields `f64`: both operands are converted to
// `f64` first and the division happens in floating point.
macro_rules! impl_true_divide_int {
    ($($int:ty),* $(,)?) => {
        $(
            impl TrueDivide for $int {
                type Output = f64;
                #[inline]
                #[allow(
                    clippy::cast_lossless,
                    reason = "NEP-50: integer true-division promotes to f64; \
                              the f64 widening is the documented contract, not a bug"
                )]
                fn true_div(self, rhs: Self) -> f64 {
                    // Convert BEFORE dividing so a zero divisor produces
                    // inf/nan through IEEE rules; a Rust integer `/` by
                    // zero would panic instead.
                    let numerator = self as f64;
                    let denominator = rhs as f64;
                    numerator / denominator
                }
            }
        )*
    };
}

impl_true_divide_int!(i8, i16, i32, i64, u8, u16, u32, u64);

/// Smith-algorithm complex division of `a / b`, mirroring NumPy's
/// `cdiv@c@` (`numpy/_core/src/npymath/npy_math_complex.c.src:94`).
///
/// `num_complex`'s `Div` uses the naive `(ac+bd)/(c²+d²)` form, whose
/// `c²+d²` denominator OVERFLOWS for a near-subnormal divisor: e.g.
/// `(2+0j)/(6.675e-308+0j)` → `inf + NaN·j` even though the true quotient
/// `2.996e307+0j` is FINITE (R-DEV-1). NumPy avoids this by scaling on the
/// larger-magnitude denominator component (Smith 1962): if `|br| >= |bi|`,
/// `rat = bi/br; scl = 1/(br + bi*rat); re = (ar+ai*rat)*scl,
/// im = (ai-ar*rat)*scl`, else the symmetric branch. A true-zero divisor
/// still yields a non-finite (`inf`/`nan`) complex — matching numpy's
/// `ar/abs_br, ai/abs_bi` divide-by-zero result — so the binding's domain
/// mask for `complex / 0` is unaffected.
#[inline]
fn complex_smith_div<F>(a: Complex<F>, b: Complex<F>) -> Complex<F>
where
    F: num_traits::Float,
{
    let (ar, ai) = (a.re, a.im);
    let (br, bi) = (b.re, b.im);
    let abs_br = br.abs();
    let abs_bi = bi.abs();
    if abs_br >= abs_bi {
        if abs_br == F::zero() && abs_bi == F::zero() {
            // divide by zeros yields a complex inf or nan (matches numpy)
            Complex::new(ar / abs_br, ai / abs_bi)
        } else {
            let rat = bi / br;
            let scl = F::one() / (br + bi * rat);
            Complex::new((ar + ai * rat) * scl, (ai - ar * rat) * scl)
        }
    } else {
        let rat = br / bi;
        let scl = F::one() / (bi + br * rat);
        Complex::new((ar * rat + ai) * scl, (ai * rat - ar) * scl)
    }
}

// True division for complex elements keeps the complex width
// (`Complex<f32>` stays `Complex<f32>`, `Complex<f64>` stays `Complex<f64>`);
// it is never promoted to a real float type. The quotient is computed by
// `complex_smith_div` rather than `num_complex`'s `Div`, so a near-subnormal
// divisor keeps a finite quotient instead of overflowing. A zero divisor
// yields an `inf`/`nan` complex value (no panic); any numpy.ma domain mask
// for `complex / 0` is applied by the binding, not here.
macro_rules! impl_true_divide_complex {
    ($($part:ty),* $(,)?) => {
        $(
            impl TrueDivide for Complex<$part> {
                type Output = Complex<$part>;
                #[inline]
                fn true_div(self, rhs: Self) -> Self::Output {
                    complex_smith_div(self, rhs)
                }
            }
        )*
    };
}

impl_true_divide_complex!(f32, f64);

/// Elementwise *true* division `a / b` (`np.divide` / `np.true_divide`).
///
/// The result element type is `<T as TrueDivide>::Output`: integer operands
/// promote to `f64`, floats keep their own type (see [`TrueDivide`]). An
/// integer divide-by-zero yields `inf`/`nan` and never panics.
///
/// For `T == f64` / `T == f32` the SIMD slice kernels (`div_f64` /
/// `div_f32`) are tried first; when they decline, or for any other element
/// type, the generic `binary_map_op` path applies [`TrueDivide::true_div`]
/// per element (#88).
pub fn divide<T, D>(
    a: &Array<T, D>,
    b: &Array<T, D>,
) -> FerrayResult<Array<<T as TrueDivide>::Output, D>>
where
    T: TrueDivide,
    D: Dimension,
{
    use std::any::TypeId;

    // Only f64 and f32 have a SIMD kernel; every other element type skips
    // straight to the generic path.
    let elem = TypeId::of::<T>();
    let simd_result = if elem == TypeId::of::<f64>() {
        try_simd_f64_binary(a, b, div_f64)
    } else if elem == TypeId::of::<f32>() {
        try_simd_f32_binary(a, b, div_f32)
    } else {
        None
    };

    match simd_result {
        Some(quotient) => quotient.map(|arr| {
            // SAFETY: `simd_result` is only `Some` when the `TypeId` check
            // above established T == f64 or T == f32. For both of those
            // `<T as TrueDivide>::Output` is T itself, so reinterpreting
            // `Array<T>` as `Array<Output>` is an identity conversion of
            // the element type.
            unsafe { crate::helpers::reinterpret_array::<T, <T as TrueDivide>::Output, D>(arr) }
        }),
        None => crate::helpers::binary_map_op(a, b, T::true_div),
    }
}

/// True division into a caller-supplied buffer — the `_into` form of
/// [`divide`].
///
/// `out` holds the promoted element type `<T as TrueDivide>::Output`, so for
/// integer `T` the destination is an `f64` array (the equivalent of
/// `np.divide(int, int, out=float_arr)`).
///
/// The quotient is first computed by [`divide`] and then copied element by
/// element into `out`.
///
/// # Errors
/// Returns a shape-mismatch error unless `a`, `b` and `out` all have the
/// same shape; any error from [`divide`] is propagated.
pub fn divide_into<T, D>(
    a: &Array<T, D>,
    b: &Array<T, D>,
    out: &mut Array<<T as TrueDivide>::Output, D>,
) -> FerrayResult<()>
where
    T: TrueDivide,
    D: Dimension,
{
    let shapes_agree = a.shape() == b.shape() && a.shape() == out.shape();
    if !shapes_agree {
        return Err(FerrayError::shape_mismatch(format!(
            "divide_into: shapes {:?}, {:?}, out {:?} must match",
            a.shape(),
            b.shape(),
            out.shape()
        )));
    }

    let quotient = divide(a, b)?;
    out.iter_mut()
        .zip(quotient.iter())
        .for_each(|(slot, &value)| *slot = value);
    Ok(())
}

/// Alias for [`divide`] — true (float) division.
///
/// Forwards both arguments to [`divide`] unchanged and returns its result,
/// so the semantics are identical: the output element type is
/// `<T as TrueDivide>::Output` (integer inputs promote to `float64`).
pub fn true_divide<T, D>(
    a: &Array<T, D>,
    b: &Array<T, D>,
) -> FerrayResult<Array<<T as TrueDivide>::Output, D>>
where
    T: TrueDivide,
    D: Dimension,
{
    divide(a, b)
}

/// Floor division for float inputs: the largest integral value `q` such
/// that `q * b <= a` (Python `//`, `np.floor_divide`).
///
/// The quotient is derived from the exact `fmod` remainder, following
/// NumPy's `npy_divmod`, instead of `floor(a / b)`. Flooring the rounded
/// quotient is wrong whenever `a / b` rounds up to an integer: `1.0 / 0.1`
/// rounds to exactly `10.0`, yet `1.0 // 0.1` is `9.0` because `0.1` is
/// slightly larger than one tenth.
///
/// Special cases:
/// - `b == 0` returns `a / b` (`inf`, `-inf` or `nan` by IEEE rules);
/// - a zero quotient carries the sign of `a / b`.
///
/// For integer arrays use [`floor_divide_int`] (NumPy keeps integer
/// floor-division in the integer dtype with divisor-zero -> 0).
pub fn floor_divide<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    let zero = <T as Element>::zero();
    let one = <T as Element>::one();
    let half = one / (one + one);
    binary_elementwise_op(a, b, move |x, y| {
        if y == zero {
            // Division by zero: plain IEEE result.
            return x / y;
        }
        // `x - rem` is (very nearly) an exact integer multiple of `y`.
        let rem = x % y;
        let mut div = (x - rem) / y;
        // Python-style remainder takes the sign of the divisor; when fmod's
        // does not, the quotient is one lower.
        if rem != zero && ((y < zero) != (rem < zero)) {
            div = div - one;
        }
        if div == zero {
            // Zero quotient: keep the sign of the true ratio.
            return if (x / y).is_sign_negative() { -zero } else { zero };
        }
        // Snap to the nearest integral value to absorb rounding in `div`.
        let floored = div.floor();
        if div - floored > half {
            floored + one
        } else {
            floored
        }
    })
}

/// Floor division on integer arrays (`np.floor_divide` on integer dtypes,
/// generate_umath.py:405 `TD(ints, dispatch=loops_modulo)`).
///
/// The output keeps the input integer dtype and rounds toward negative
/// infinity (Python `//`), unlike Rust's truncating `/`.
///
/// Special cases (neither panics):
/// - a zero divisor yields `0` (NumPy returns 0 plus a RuntimeWarning);
/// - signed `MIN / -1` yields `MIN` (two's-complement wraparound).
pub fn floor_divide_int<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + num_traits::PrimInt,
    D: Dimension,
{
    let zero = <T as Element>::zero();
    let unit = <T as Element>::one();
    let lowest = <T as num_traits::Bounded>::min_value();
    // Only signed types have a minimum below zero.
    let signed = lowest < zero;

    binary_elementwise_op(a, b, move |num, den| {
        if den == zero {
            return zero;
        }
        // `den + unit` is evaluated only for negative `den`, where it cannot
        // overflow; it equals zero exactly when `den` is -1.
        if signed && num == lowest && den < zero && den + unit == zero {
            return lowest;
        }
        // Start from the truncated quotient; step down by one when the
        // division is inexact and the remainder's sign differs from the
        // divisor's (i.e. truncation rounded up).
        let truncated = num / den;
        let leftover = num - truncated * den;
        let rounded_up = leftover != zero && ((leftover < zero) != (den < zero));
        if rounded_up {
            truncated - unit
        } else {
            truncated
        }
    })
}

/// Elementwise `a^b` for float element types, evaluated with `powf`.
///
/// Integer bases/exponents go through [`power_int`] instead (NumPy keeps
/// integer power in the integer dtype).
pub fn power<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    binary_elementwise_op(a, b, |base: T, exponent: T| base.powf(exponent))
}

/// Elementwise complex power `a^b` (`np.power` on complex dtypes,
/// generate_umath.py:480 `TD(cmplx)`), mirroring `npy_cpow`
/// (`numpy/_core/src/npymath/npy_math_complex.c.src:438`).
///
/// `num_traits::Float` is not impl'd for `Complex`, so the generic [`power`]
/// (bound `T: Float`) cannot serve complex — this is the complex arm.
/// The kernel reproduces `npy_cpow`'s special cases verbatim so the result is
/// byte-identical to numpy's `power` ufunc:
///   - `b == 0` → `1 + 0j` (`npy_math_complex.c.src:452`);
///   - zero base `a == 0`: `0` if `Re(b) > 0`, else `nan + nan*j`
///     (`:462`-`:483`);
///   - real integral exponent `Im(b) == 0`, `Re(b)` an integer in `(-100,
///     100)`: integer power by repeated squaring over the complex `*` (`:485`-
///     `:521`) — this is why `(1+2j)**2` is EXACTLY `-3+4j` (not the
///     `4.0000000002j` that `powc`'s `exp(b*log(a))` would give);
///   - otherwise the general branch `a.powc(b)` = `exp(b * log(a))` (`:524`).
pub fn power_complex<T, D>(
    a: &Array<Complex<T>, D>,
    b: &Array<Complex<T>, D>,
) -> FerrayResult<Array<Complex<T>, D>>
where
    T: Element + Float,
    Complex<T>: Element,
    D: Dimension,
{
    binary_elementwise_op(a, b, cpow_kernel)
}

/// Single-element complex power mirroring `npy_cpow`
/// (`numpy/_core/src/npymath/npy_math_complex.c.src:438`). Factored out so the
/// same kernel serves both the array op and the unit tests.
///
/// Evaluation order:
/// 1. `b == 0` → `1 + 0j` (this includes `0^0`);
/// 2. `a == 0` → `0` when `Re(b) > 0`, otherwise `nan + nan*j`;
/// 3. `Im(b) == 0` and `Re(b)` an integer strictly inside `(-100, 100)` →
///    repeated squaring over the complex product; a negative exponent takes
///    the reciprocal of that result;
/// 4. otherwise `a.powc(b)`.
#[inline]
fn cpow_kernel<T: Float>(a: Complex<T>, b: Complex<T>) -> Complex<T> {
    let zero = <T as num_traits::Zero>::zero();
    let one = <T as num_traits::One>::one();

    // a^0 == 1 (including 0^0, by convention — npy_math_complex.c.src:452).
    if b.re == zero && b.im == zero {
        return Complex::new(one, zero);
    }
    // 0^b for non-zero b: 0 if Re(b) > 0, else nan + nan*j (:462-:483).
    if a.re == zero && a.im == zero {
        if b.re > zero {
            return Complex::new(zero, zero);
        }
        let nan = <T as Float>::nan();
        return Complex::new(nan, nan);
    }
    // Real integral exponent in (-100, 100): integer power by repeated
    // squaring over the complex product, so e.g. (1+2j)**2 is exactly -3+4j
    // (:485-:521).
    if b.im == zero {
        let br = b.re;
        let two = one + one;
        // Exactly 100, built from exactly-representable small products.
        // The bound is exclusive on both sides (numpy:
        // `n > -100 && n < 100`), so exponents of exactly ±100 must fall
        // through to the general branch.
        let ten = two * two * two + two;
        let hundred = ten * ten;
        // `br.trunc() == br` holds only when br is exactly integral.
        if br > zero - hundred && br < hundred && br.trunc() == br {
            let neg = br < zero;
            // `n` is at least 1 here: b == 0 was handled above.
            let mut n = br.abs();
            let mut aa = Complex::new(one, zero);
            let mut p = a;
            // Repeated-squaring exponentiation over the complex product.
            loop {
                // Is the current low bit set? n is integral, so
                // trunc(n/2)*2 != n ⇔ n is odd.
                let half = (n / two).trunc();
                let is_odd = (half + half) != n;
                if is_odd {
                    aa = aa * p;
                }
                n = half;
                if n == zero {
                    break;
                }
                p = p * p;
            }
            if neg {
                return Complex::new(one, zero) / aa;
            }
            return aa;
        }
    }
    // General branch: a^b = exp(b * log(a)) (:524, `cpow`).
    a.powc(b)
}

/// Integer power: a^b, int -> int (`np.power` on integer dtypes,
/// generate_umath.py:480 `TD(ints)`).
///
/// Non-negative exponents use square-and-multiply with wrapping
/// multiplication, so overflow wraps (fixed-width integer behaviour) instead
/// of panicking in debug builds.
///
/// Negative exponents never panic and produce the truncated value of the
/// real result:
/// - `1 ** -n == 1`;
/// - `(-1) ** -n` is `1` for even `n` and `-1` for odd `n`;
/// - every other base (including `0`) yields `0`.
///
/// NOTE(review): NumPy itself raises `ValueError` ("Integers to negative
/// integer powers are not allowed") for integer arrays; confirm that
/// returning the truncated value is the intended contract here.
pub fn power_int<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + num_traits::PrimInt + num_traits::WrappingMul,
    D: Dimension,
{
    let z = <T as Element>::zero();
    let one = <T as Element>::one();
    let two = one + one;
    binary_elementwise_op(a, b, move |base, exp| {
        if exp < z {
            // `exp < z` is only reachable for signed types, so `z - one`
            // below is a valid -1 and cannot underflow.
            if base == one {
                return one;
            }
            if base == z - one {
                // (-1)^exp: even -> 1, odd -> -1. The parity is read from
                // `exp` itself; negating it first would overflow (and panic
                // in debug builds) for `exp == MIN`. Rust's `%` returns 0
                // for every even value, negative or not.
                return if exp % two == z { one } else { z - one };
            }
            return z;
        }
        // Square-and-multiply on the non-negative exponent.
        let mut result = one;
        let mut b_acc = base;
        let mut e = exp;
        while e > z {
            if e % two == one {
                result = result.wrapping_mul(&b_acc);
            }
            e = e / two;
            // Skip the final squaring: its result would never be used.
            if e > z {
                b_acc = b_acc.wrapping_mul(&b_acc);
            }
        }
        result
    })
}

/// Elementwise Python-style modulo for float inputs.
///
/// Starts from the C-style remainder `x % y` (sign of the dividend) and adds
/// the divisor once when that remainder is non-zero and its sign is opposite
/// to the divisor's, so a non-zero result always carries the divisor's sign.
pub fn remainder<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    let zero = <T as Element>::zero();
    binary_elementwise_op(a, b, |dividend, divisor| {
        let rem = dividend % divisor;
        let negative_under_positive = rem < zero && divisor > zero;
        let positive_under_negative = rem > zero && divisor < zero;
        if negative_under_positive || positive_under_negative {
            // Shift the remainder onto the divisor's side of zero.
            rem + divisor
        } else {
            rem
        }
    })
}

/// Alias for [`remainder`].
///
/// Forwards both arguments to [`remainder`] unchanged and returns its
/// result (Python-style modulo for float inputs).
pub fn mod_<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    remainder(a, b)
}

/// Python-style modulo on integer arrays, keeping the integer dtype.
///
/// `np.remainder` / `np.mod` on integer dtypes
/// (`PyUFunc_RemainderTypeResolver`, generate_umath.py:1039): a non-zero
/// result takes the sign of the divisor.
///
/// Special cases (neither panics):
/// - a zero divisor yields `0` (NumPy returns 0 plus a RuntimeWarning);
/// - signed `MIN % -1` yields `0`.
pub fn remainder_int<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + num_traits::PrimInt,
    D: Dimension,
{
    let zero = <T as Element>::zero();
    let unit = <T as Element>::one();
    let lowest = <T as num_traits::Bounded>::min_value();
    // Only signed types have a minimum below zero.
    let signed = lowest < zero;

    binary_elementwise_op(a, b, move |num, den| {
        if den == zero {
            return zero;
        }
        // Same overflow guard as `floor_divide_int`: `den + unit` is only
        // evaluated for negative `den` and equals zero exactly for -1.
        if signed && num == lowest && den < zero && den + unit == zero {
            return zero;
        }
        // Truncating remainder (sign of the dividend), moved onto the
        // divisor's side of zero when the signs disagree.
        let rem = num % den;
        let wrong_side = rem != zero && ((rem < zero) != (den < zero));
        if wrong_side {
            rem + den
        } else {
            rem
        }
    })
}

/// Alias for [`remainder_int`] — integer Python-style modulo.
///
/// Forwards both arguments to [`remainder_int`] unchanged and returns its
/// result.
pub fn mod_int<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + num_traits::PrimInt,
    D: Dimension,
{
    remainder_int(a, b)
}

/// Elementwise C-style `fmod`: the remainder of `a / b` computed with the
/// `%` operator, which keeps the sign of the dividend. See [`remainder`]
/// for the Python-style variant whose result follows the divisor's sign.
pub fn fmod<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    binary_elementwise_op(a, b, |dividend: T, divisor: T| dividend % divisor)
}

/// Return `(quotient, remainder)` of Python-style floor division as a tuple
/// of arrays, with broadcasting.
///
/// Both results come from a single pass over the (broadcast) data. Each
/// pair is computed with NumPy's `npy_divmod` scheme, which starts from the
/// exact `fmod` remainder rather than from `floor(x / y)`:
///
/// - the remainder takes the sign of the divisor (a zero remainder carries
///   the divisor's sign);
/// - the quotient is `(x - remainder) / y` snapped to the nearest integral
///   value, so quotient and remainder always describe the same split of
///   `x`;
/// - a finite dividend with an infinite divisor keeps a finite remainder
///   (`divmod(5, inf) == (0, 5)`), where `x - floor(x / y) * y` would
///   produce `nan` from `0 * inf`;
/// - a zero divisor returns `(x / y, x % y)`, i.e. IEEE `inf`/`nan` values
///   and never a panic.
///
/// # Errors
/// Returns a shape-mismatch error when the shapes are not
/// broadcast-compatible, or when the broadcast shape cannot be represented
/// by the dimension type `D`.
pub fn divmod<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<(Array<T, D>, Array<T, D>)>
where
    T: Element + Float,
    D: Dimension,
{
    use ferray_core::dimension::broadcast::{broadcast_shapes, broadcast_to};

    let zero = <T as Element>::zero();
    let one = <T as Element>::one();
    let half = one / (one + one);

    // A zero carrying the sign of `like` (C `copysign(0, like)`).
    let signed_zero = |like: T| if like.is_sign_negative() { -zero } else { zero };

    // Single-pair kernel shared by the same-shape and broadcast paths.
    let kernel = |x: T, y: T| -> (T, T) {
        let mut rem = x % y;
        if y == zero {
            // Division by zero: plain IEEE results for both outputs.
            return (x / y, rem);
        }
        // `x - rem` is (very nearly) an exact integer multiple of `y`.
        let mut div = (x - rem) / y;
        if rem != zero {
            // Move the remainder onto the divisor's side of zero and keep
            // the quotient consistent with it.
            if (y < zero) != (rem < zero) {
                rem = rem + y;
                div = div - one;
            }
        } else {
            rem = signed_zero(y);
        }
        let quot = if div != zero {
            // Snap to the nearest integral value to absorb rounding in `div`.
            let floored = div.floor();
            if div - floored > half {
                floored + one
            } else {
                floored
            }
        } else {
            signed_zero(x / y)
        };
        (quot, rem)
    };

    // Fast path: identical shapes.
    if a.shape() == b.shape() {
        let (quot_data, rem_data): (Vec<T>, Vec<T>) = a
            .iter()
            .zip(b.iter())
            .map(|(&x, &y)| kernel(x, y))
            .unzip();
        let quot = Array::from_vec(a.dim().clone(), quot_data)?;
        let rem = Array::from_vec(a.dim().clone(), rem_data)?;
        return Ok((quot, rem));
    }

    // Broadcasting path.
    let target_shape = broadcast_shapes(a.shape(), b.shape()).map_err(|_| {
        FerrayError::shape_mismatch(format!(
            "divmod: shapes {:?} and {:?} are not broadcast-compatible",
            a.shape(),
            b.shape()
        ))
    })?;
    let a_view = broadcast_to(a, &target_shape)?;
    let b_view = broadcast_to(b, &target_shape)?;
    let (quot_data, rem_data): (Vec<T>, Vec<T>) = a_view
        .iter()
        .zip(b_view.iter())
        .map(|(&x, &y)| kernel(x, y))
        .unzip();
    let result_dim = D::from_dim_slice(&target_shape).ok_or_else(|| {
        FerrayError::shape_mismatch(format!(
            "divmod: cannot represent broadcast result shape {target_shape:?} as the input dimension type"
        ))
    })?;
    let quot = Array::from_vec(result_dim.clone(), quot_data)?;
    let rem = Array::from_vec(result_dim, rem_data)?;
    Ok((quot, rem))
}

// ---------------------------------------------------------------------------
// Unary arithmetic
// ---------------------------------------------------------------------------

/// Elementwise `|x|` for float element types.
///
/// Tries the f64 SIMD kernel (`simd_abs_f64`), then the f32 SIMD kernel
/// (`simd_abs_f32`), and otherwise applies `abs` through the generic unary
/// helper.
pub fn absolute<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    crate::helpers::try_simd_f64_unary(input, crate::dispatch::simd_abs_f64)
        .or_else(|| crate::helpers::try_simd_f32_unary(input, crate::dispatch::simd_abs_f32))
        .unwrap_or_else(|| unary_float_op(input, T::abs))
}

/// Writes `|x|` for every element of `input` into the caller-supplied `out`
/// — the `_into` form of [`absolute`].
pub fn absolute_into<T, D>(input: &Array<T, D>, out: &mut Array<T, D>) -> FerrayResult<()>
where
    T: Element + Float,
    D: Dimension,
{
    unary_float_op_into(input, out, "absolute", |value: T| value.abs())
}

/// Alias for [`absolute`] — float abs.
///
/// Forwards `input` to [`absolute`] unchanged and returns its result.
pub fn fabs<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    absolute(input)
}

/// Elementwise `|x|` on integer arrays, keeping the integer dtype
/// (`np.absolute`, generate_umath.py:496 `TD(bints + ... ints)`).
///
/// Negative elements are negated with `wrapping_neg`, so
/// `absolute(i8::MIN)` wraps (fixed-width behaviour) rather than panicking
/// in debug builds. Non-negative elements pass through unchanged.
pub fn absolute_int<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + PartialOrd + num_traits::WrappingNeg,
    D: Dimension,
{
    let zero = <T as Element>::zero();
    let mut magnitudes: Vec<T> = Vec::with_capacity(input.size());
    for &value in input.iter() {
        let magnitude = if value < zero {
            value.wrapping_neg()
        } else {
            value
        };
        magnitudes.push(magnitude);
    }
    Array::from_vec(input.dim().clone(), magnitudes)
}

/// Elementwise sign on integer arrays, keeping the integer dtype: `+1` for
/// positive elements, `-1` for negative elements, `0` otherwise
/// (`np.sign`, generate_umath.py:534 `TD(ints + flts)`).
pub fn sign_int<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + PartialOrd + num_traits::WrappingSub,
    D: Dimension,
{
    let zero = <T as Element>::zero();
    let plus_one = <T as Element>::one();
    // For signed types `0.wrapping_sub(1)` is -1. For unsigned types it
    // wraps to the maximum, but no element compares below zero there, so
    // the value is never emitted.
    let minus_one = zero.wrapping_sub(&plus_one);

    let signs: Vec<T> = input
        .iter()
        .map(|value| match value.partial_cmp(&zero) {
            Some(core::cmp::Ordering::Greater) => plus_one,
            Some(core::cmp::Ordering::Less) => minus_one,
            _ => zero,
        })
        .collect();
    Array::from_vec(input.dim().clone(), signs)
}

/// Integer negation that keeps the input's integer dtype.
///
/// `np.negative` on integer arrays keeps the integer dtype
/// (generate_umath.py:516 `TD(ints + flts)`). Every element goes through
/// `wrapping_neg`, so `negative(i8::MIN)` wraps like NumPy rather than
/// panicking.
pub fn negative_int<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + num_traits::WrappingNeg,
    D: Dimension,
{
    let negated: Vec<T> = input.iter().map(|v| v.wrapping_neg()).collect();
    let dim = input.dim().clone();
    Array::from_vec(dim, negated)
}

/// Elementwise float sign.
///
/// Per element: NaN maps to NaN, values above zero to `1`, values below
/// zero to `-1`, and everything else (i.e. zero) to `0`.
pub fn sign<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    unary_float_op(input, |v| {
        // NaN compares false against everything, so it is handled up front.
        if v.is_nan() {
            return <T as Float>::nan();
        }
        let zero = <T as Element>::zero();
        if v > zero {
            return <T as Element>::one();
        }
        if v < zero {
            return -<T as Element>::one();
        }
        zero
    })
}

/// Elementwise float negation.
///
/// The f64 SIMD kernel is tried first, then the f32 one; when neither
/// helper yields a result each element is negated through the scalar
/// `unary_float_op` path.
pub fn negative<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    crate::helpers::try_simd_f64_unary(input, crate::dispatch::simd_neg_f64)
        .or_else(|| crate::helpers::try_simd_f32_unary(input, crate::dispatch::simd_neg_f32))
        .unwrap_or_else(|| unary_float_op(input, |v| -v))
}

/// Writes the elementwise negation of `input` into `out` — the `_into`
/// counterpart of [`negative`].
///
/// Delegates to `unary_float_op_into` under the operation name
/// `"negative"`.
pub fn negative_into<T, D>(input: &Array<T, D>, out: &mut Array<T, D>) -> FerrayResult<()>
where
    T: Element + Float,
    D: Dimension,
{
    unary_float_op_into(input, out, "negative", |v| -v)
}

/// Elementwise unary plus: each element is passed through unchanged.
///
/// Runs the identity function through `unary_float_op` and returns that
/// helper's result.
pub fn positive<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    unary_float_op(input, std::convert::identity)
}

/// Elementwise reciprocal, `1/x`.
///
/// The f64 SIMD kernel is tried first, then the f32 one; when neither
/// helper yields a result the scalar `recip` is applied through
/// `unary_float_op`.
pub fn reciprocal<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    crate::helpers::try_simd_f64_unary(input, crate::dispatch::simd_reciprocal_f64)
        .or_else(|| {
            crate::helpers::try_simd_f32_unary(input, crate::dispatch::simd_reciprocal_f32)
        })
        .unwrap_or_else(|| unary_float_op(input, <T as Float>::recip))
}

/// Elementwise square root.
///
/// Tries the f64 SIMD kernel (hardware `vsqrtpd` for contiguous f64
/// arrays) first. When that helper yields no result, the scalar `sqrt` is
/// applied through `unary_float_op`. Only an f64 fast path is attempted
/// here.
pub fn sqrt<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    match crate::helpers::try_simd_f64_unary(input, crate::dispatch::simd_sqrt_f64) {
        Some(vectorised) => vectorised,
        None => unary_float_op(input, <T as Float>::sqrt),
    }
}

/// Writes the elementwise square root of `input` into `out` — the `_into`
/// counterpart of [`sqrt`].
///
/// Delegates to `unary_float_op_into` under the operation name `"sqrt"`.
pub fn sqrt_into<T, D>(input: &Array<T, D>, out: &mut Array<T, D>) -> FerrayResult<()>
where
    T: Element + Float,
    D: Dimension,
{
    unary_float_op_into(input, out, "sqrt", <T as Float>::sqrt)
}

/// Elementwise cube root.
///
/// Applies the `CrMath::cr_cbrt` kernel to every element through
/// `unary_float_op_compute`.
pub fn cbrt<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float + crate::cr_math::CrMath,
    D: Dimension,
{
    unary_float_op_compute(input, <T as crate::cr_math::CrMath>::cr_cbrt)
}

/// Elementwise square, `x * x`.
///
/// The f64 SIMD kernel is tried first, then the f32 one; when neither
/// helper yields a result each element is multiplied by itself through
/// `unary_float_op`.
pub fn square<T, D>(input: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    crate::helpers::try_simd_f64_unary(input, crate::dispatch::simd_square_f64)
        .or_else(|| crate::helpers::try_simd_f32_unary(input, crate::dispatch::simd_square_f32))
        .unwrap_or_else(|| unary_float_op(input, |v| v * v))
}

/// Writes the elementwise square of `input` into `out` — the `_into`
/// counterpart of [`square`].
///
/// Delegates to `unary_float_op_into` under the operation name `"square"`.
pub fn square_into<T, D>(input: &Array<T, D>, out: &mut Array<T, D>) -> FerrayResult<()>
where
    T: Element + Float,
    D: Dimension,
{
    unary_float_op_into(input, out, "square", |v| v * v)
}

/// Heaviside step function, evaluated pairwise over `x` and `h0`.
///
/// For each pair: a NaN `x` is returned as-is, `x < 0` gives 0, `x == 0`
/// gives the matching `h0` element, and any other `x` gives 1.
pub fn heaviside<T, D>(x: &Array<T, D>, h0: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    binary_elementwise_op(x, h0, |value, at_zero| {
        // NaN fails every ordered comparison, so it is propagated first.
        if value.is_nan() {
            return value;
        }
        let zero = <T as Element>::zero();
        if value < zero {
            return zero;
        }
        if value == zero {
            return at_zero;
        }
        <T as Element>::one()
    })
}

/// Float-typed `gcd` entry point, which always rejects its input.
///
/// `np.gcd` is registered INTEGER-ONLY in NumPy
/// (`numpy/_core/code_generators/generate_umath.py:1156` `'gcd': Ufunc(...
/// TD(ints) ...)`): apart from the object loop, only integer loops exist,
/// so a float array has no matching ufunc loop and NumPy raises
/// `TypeError` ("no loop matching the specified signature").
///
/// This symbol is bound `T: Element + Float`, so every `T` that can reach
/// it is a float type — exactly the input NumPy rejects. It therefore
/// reports [`FerrayError::invalid_dtype`] (the TypeError analog at the
/// library boundary) instead of computing anything. Neither argument is
/// read. The integer domain is served by [`gcd_int`].
///
/// # Errors
/// Always returns `FerrayError::invalid_dtype`, naming the dtype of `T`.
/// Use [`gcd_int`] for the integer domain NumPy accepts.
pub fn gcd<T, D>(a: &Array<T, D>, _b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    // The arrays are never inspected; only the element dtype is reported.
    let _ = a;
    let dtype = <T as Element>::dtype();
    let message = format!(
        "gcd: no loop matching the specified signature for dtype {:?}; \
         np.gcd is registered integer-only (generate_umath.py:1156 \
         TD(ints)) and rejects float input — use gcd_int for integer arrays",
        dtype
    );
    Err(FerrayError::invalid_dtype(message))
}

/// Float-typed `lcm` entry point, which always rejects its input.
///
/// `np.lcm` is registered INTEGER-ONLY in NumPy
/// (`numpy/_core/code_generators/generate_umath.py:1163` `'lcm': Ufunc(...
/// TD(ints) ...)`): like [`gcd`], only integer loops (plus the object
/// loop) exist, so float input raises `TypeError` in NumPy.
///
/// This `T: Element + Float` symbol can only be reached with a float type,
/// so it reports [`FerrayError::invalid_dtype`] (the TypeError analog)
/// without reading either argument. The integer domain is served by
/// [`lcm_int`].
///
/// # Errors
/// Always returns `FerrayError::invalid_dtype`, naming the dtype of `T`.
/// Use [`lcm_int`] for the integer domain NumPy accepts.
pub fn lcm<T, D>(a: &Array<T, D>, _b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    // The arrays are never inspected; only the element dtype is reported.
    let _ = a;
    let dtype = <T as Element>::dtype();
    let message = format!(
        "lcm: no loop matching the specified signature for dtype {:?}; \
         np.lcm is registered integer-only (generate_umath.py:1163 \
         TD(ints)) and rejects float input — use lcm_int for integer arrays",
        dtype
    );
    Err(FerrayError::invalid_dtype(message))
}

/// Magnitude step shared by the integer `gcd_int`/`lcm_int` kernels.
///
/// It replaces the `num_traits::Signed` bound those kernels once needed
/// purely for `.abs()`, so they can also serve unsigned dtypes. NumPy
/// registers `gcd`/`lcm` for ALL integer dtypes, unsigned included
/// (`numpy/_core/code_generators/generate_umath.py:1156` gcd, `:1163` lcm —
/// `TD(ints)` covers uint8/16/32/64).
///
/// - Signed `i8/i16/i32/i64`: `wrapping_abs`. This never traps at `MIN` —
///   `i8::MIN.wrapping_abs() == i8::MIN` — matching NumPy's two's-complement
///   wraparound (`np.gcd(np.int8(-128), np.int8(0)) == -128`), where
///   `Signed::abs()` would panic in debug builds.
/// - Unsigned `u8/u16/u32/u64`: identity, since an unsigned value already
///   is its own magnitude.
pub trait GcdAbs: Copy {
    /// Returns the magnitude of `self`: non-negative for every value except
    /// a signed `MIN`, which wraps back to `MIN`; unsigned values are
    /// returned unchanged.
    fn gcd_abs(self) -> Self;
}

// Signed integers: magnitude via `wrapping_abs`.
macro_rules! impl_gcd_abs_signed {
    ($($int:ty),* $(,)?) => {
        $(
            impl GcdAbs for $int {
                #[inline]
                fn gcd_abs(self) -> Self {
                    <$int>::wrapping_abs(self)
                }
            }
        )*
    };
}

impl_gcd_abs_signed!(i8, i16, i32, i64);

// Unsigned integers: the value is already its own magnitude.
macro_rules! impl_gcd_abs_unsigned {
    ($($uint:ty),* $(,)?) => {
        $(
            impl GcdAbs for $uint {
                #[inline]
                fn gcd_abs(self) -> Self {
                    self
                }
            }
        )*
    };
}

impl_gcd_abs_unsigned!(u8, u16, u32, u64);

/// Integer GCD using the Euclidean algorithm.
///
/// Works on every integer element type that implements [`GcdAbs`] — signed
/// `i8/i16/i32/i64` AND unsigned `u8/u16/u32/u64` — matching numpy's
/// `TD(ints)` registration (`generate_umath.py:1156`), which covers
/// unsigned.
///
/// Both operands are first reduced to their magnitude with
/// [`GcdAbs::gcd_abs`], and the loop result is normalised with `gcd_abs`
/// once more. The second step matters when an operand is a signed `MIN`:
/// `gcd_abs` leaves `MIN` negative, Rust's `%` truncates toward zero, and
/// so the remainders — and the value the loop ends on — can be negative
/// (`gcd(i8::MIN, 127)` ends on `-1`). Normalising yields `1`, the value
/// NumPy computes on the unsigned magnitudes. The only result that stays
/// negative is a magnitude of exactly `|MIN|`, which wraps to `MIN`
/// (`gcd_int(&[i8::MIN], &[0]) -> i8::MIN`, matching numpy).
///
/// Float-typed arrays have no GCD: [`gcd`] exists only to reject them.
///
/// # Errors
/// Propagates any error from `binary_elementwise_op`.
pub fn gcd_int<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy + PartialEq + std::ops::Rem<Output = T> + GcdAbs,
    D: Dimension,
{
    binary_elementwise_op(a, b, |x, y| {
        let zero = <T as Element>::zero();
        let mut ax = x.gcd_abs();
        let mut ay = y.gcd_abs();
        // `ay` is never -1 while `ax` is MIN, so `%` cannot overflow here:
        // `gcd_abs` only returns non-negative values or MIN itself.
        while ay != zero {
            let rem = ax % ay;
            ax = ay;
            ay = rem;
        }
        // The loop can finish on `-g` when MIN was involved; fold the sign.
        ax.gcd_abs()
    })
}

/// Integer LCM using the Euclidean GCD algorithm.
///
/// Works on every integer element type that implements [`GcdAbs`] (signed
/// AND unsigned), matching numpy's `TD(ints)` registration
/// (`generate_umath.py:1163`). If either operand is zero the result is
/// zero; otherwise it is `|x| / gcd(|x|, |y|) * |y|`, with magnitudes taken
/// through [`GcdAbs::gcd_abs`].
///
/// The gcd is normalised with `gcd_abs` before the division. When an
/// operand is a signed `MIN`, the Euclidean loop can end on a negative
/// value (truncating `%` on a negative `MIN`), e.g. `-1` for
/// `(i8::MIN, 127)`. Dividing `MIN` by that `-1` is an overflow that Rust
/// panics on in every build profile; after normalisation the divisor is
/// either positive or `MIN` itself, so the division cannot overflow.
///
/// NOTE(review): the final product uses `T`'s `*` operator, so what happens
/// when the true LCM does not fit in `T` follows that operator (for the
/// primitive integers: panic in debug builds, wrap in release). Removing
/// that would need a wrapping-multiply bound on `T`.
///
/// Float-typed arrays have no LCM: [`lcm`] exists only to reject them.
///
/// # Errors
/// Propagates any error from `binary_elementwise_op`.
pub fn lcm_int<T, D>(a: &Array<T, D>, b: &Array<T, D>) -> FerrayResult<Array<T, D>>
where
    T: Element
        + Copy
        + PartialEq
        + std::ops::Rem<Output = T>
        + std::ops::Div<Output = T>
        + std::ops::Mul<Output = T>
        + GcdAbs,
    D: Dimension,
{
    binary_elementwise_op(a, b, |x, y| {
        let zero = <T as Element>::zero();
        let ax = x.gcd_abs();
        let ay = y.gcd_abs();
        if ax == zero || ay == zero {
            return zero;
        }
        let mut gx = ax;
        let mut gy = ay;
        while gy != zero {
            let rem = gx % gy;
            gx = gy;
            gy = rem;
        }
        // Fold a possibly negative gcd so `ax / gcd` is never `MIN / -1`.
        let gcd = gx.gcd_abs();
        // Divide before multiplying to keep the intermediate value small.
        ax / gcd * ay
    })
}

// ---------------------------------------------------------------------------
// Broadcasting binary arithmetic
// ---------------------------------------------------------------------------

/// Broadcasting elementwise addition.
///
/// Each output element is `WrappingArith::wadd` of the broadcast operands.
/// Integer dtypes therefore wrap on overflow (NumPy fixed-width contract —
/// `np.add(int8([100,100]), int8([100])) == [-56,-56]`); floats are
/// unchanged. Mirrors the same-shape [`add`] (#88). NumPy registers
/// fixed-width integer `add` loops at `generate_umath.py:355`
/// (`TD(no_bool_times_obj, ..., ('loops_autovec', ints))`).
pub fn add_broadcast<T, D1, D2>(a: &Array<T, D1>, b: &Array<T, D2>) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + WrappingArith,
    D1: Dimension,
    D2: Dimension,
{
    binary_broadcast_op(a, b, |lhs, rhs| <T as WrappingArith>::wadd(lhs, rhs))
}

/// Broadcasting elementwise subtraction.
///
/// Each output element is `WrappingArith::wsub` of the broadcast operands.
/// Integer dtypes therefore wrap on overflow (NumPy fixed-width contract —
/// `np.subtract(int8([-100,-100]), int8([100])) == [56,56]`); floats are
/// unchanged. Mirrors the same-shape [`subtract`] (#88). NumPy registers
/// fixed-width integer `subtract` loops at `generate_umath.py:371`.
pub fn subtract_broadcast<T, D1, D2>(
    a: &Array<T, D1>,
    b: &Array<T, D2>,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + WrappingArith,
    D1: Dimension,
    D2: Dimension,
{
    binary_broadcast_op(a, b, |lhs, rhs| <T as WrappingArith>::wsub(lhs, rhs))
}

/// Broadcasting elementwise multiplication.
///
/// Each output element is `WrappingArith::wmul` of the broadcast operands.
/// Integer dtypes therefore wrap on overflow (NumPy fixed-width contract —
/// `np.multiply(int8([100,100]), int8([100])) == [16,16]`); floats are
/// unchanged. Mirrors the same-shape [`multiply`] (#88). NumPy registers
/// fixed-width integer `multiply` loops at `generate_umath.py:386`.
pub fn multiply_broadcast<T, D1, D2>(
    a: &Array<T, D1>,
    b: &Array<T, D2>,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + WrappingArith,
    D1: Dimension,
    D2: Dimension,
{
    binary_broadcast_op(a, b, |lhs, rhs| <T as WrappingArith>::wmul(lhs, rhs))
}

/// Broadcasting elementwise *true* division.
///
/// Matches `np.divide` / `np.true_divide`, which are ALWAYS true division
/// (`PyUFunc_TrueDivisionTypeResolver`, generate_umath.py:419-422). Every
/// output element is produced by [`TrueDivide::true_div`], and the output
/// element type is `<T as TrueDivide>::Output` (i*/u* -> f64, f32 -> f32,
/// f64 -> f64).
///
/// Because integer operands are cast to `f64` BEFORE dividing (inside
/// `true_div`), an integer divide-by-zero yields `inf`/`nan` and NEVER
/// panics — NumPy's RuntimeWarning-only behaviour.
pub fn divide_broadcast<T, D1, D2>(
    a: &Array<T, D1>,
    b: &Array<T, D2>,
) -> FerrayResult<Array<<T as TrueDivide>::Output, IxDyn>>
where
    T: TrueDivide,
    D1: Dimension,
    D2: Dimension,
{
    crate::helpers::binary_broadcast_map_op(a, b, |num, den| {
        <T as TrueDivide>::true_div(num, den)
    })
}

// ---------------------------------------------------------------------------
// Reductions
// ---------------------------------------------------------------------------

/// Sum along a single axis (column sums, row sums, ...).
///
/// Equivalent to `np.add.reduce(arr, axis=...)`. Delegates to the generic
/// [`crate::ufunc_methods::reduce_axis`], seeded with `0` and combining
/// with `+`.
///
/// AC-2: `add_reduce` computes correct column sums.
pub fn add_reduce<T, D>(input: &Array<T, D>, axis: usize) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + std::ops::Add<Output = T> + Copy,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_axis(input, axis, seed, <T as std::ops::Add>::add)
}

/// Sum along a single axis, optionally keeping the reduced axis.
///
/// Equivalent to `np.add.reduce(arr, axis=..., keepdims=...)` /
/// `np.sum(arr, axis=..., keepdims=...)`. With `keepdims = true` the
/// reduced axis survives as a size-1 dimension, so the result broadcasts
/// back against the input — the usual row/column centering pattern
/// (`arr - arr.sum(axis=1, keepdims=True)`). With `keepdims = false` this
/// behaves exactly like [`add_reduce`].
///
/// Added for #394.
pub fn add_reduce_keepdims<T, D>(
    input: &Array<T, D>,
    axis: usize,
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + std::ops::Add<Output = T> + Copy,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_axis_keepdims(input, axis, seed, keepdims, |sum, item| {
        sum + item
    })
}

/// Sum over several axes at once.
///
/// Equivalent to `np.add.reduce(arr, axis=axes, keepdims=keepdims)` /
/// `np.sum(arr, axis=axes, keepdims=keepdims)` with `axes` being the tuple
/// of axes to collapse. All listed axes are reduced in a single pass over
/// the input — no intermediates are materialized the way chained
/// `add_reduce` calls would, and the order of `axes` is irrelevant.
///
/// Added for #395.
pub fn add_reduce_axes<T, D>(
    input: &Array<T, D>,
    axes: &[usize],
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + std::ops::Add<Output = T> + Copy,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_axes(input, axes, seed, keepdims, |sum, item| sum + item)
}

/// Sum of every element (the `axis=None` form).
///
/// Equivalent to `np.add.reduce(arr, axis=None)` / `np.sum(arr)`. The
/// result is a bare scalar; use [`add_reduce_axes`] when an array result
/// with `keepdims` support is needed.
///
/// Added for #395.
pub fn add_reduce_all<T, D>(input: &Array<T, D>) -> T
where
    T: Element + std::ops::Add<Output = T> + Copy,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_all(input, seed, |sum, item| sum + item)
}

// ---------------------------------------------------------------------------
// NaN-aware reductions (#388)
//
// Parallel to add_reduce / multiply_reduce / max_reduce / min_reduce but
// with NaN-skipping kernels. ferray-stats already exposes high-level
// nansum / nanmean / etc. wrappers; these are the lower-level ufunc
// primitives that match the cumulative nancumsum/nancumprod pattern in
// the same module — they live here so the full reduction family
// (whole-array + axis + axes + keepdims) is available without depending
// on ferray-stats.
//
// All four families (sum, product, max, min) require `T: Element + Float`
// so the kernel can call `.is_nan()`. NaNs are neutralised per element:
// replaced by the reduction identity for sum (0) and product (1), and
// skipped for max/min, whose seeds are -inf and +inf respectively.
// Whole-array forms return a scalar; axis-aware forms delegate to the
// generic reduce_axes / reduce_axis_keepdims helpers.
// ---------------------------------------------------------------------------

/// NaN-skipping sum along a single axis, with optional `keepdims`.
///
/// Equivalent to `np.nansum(arr, axis=axis, keepdims=keepdims)`. Each
/// element is passed through `nan_to_zero` before being added, so NaNs
/// contribute nothing to the sum.
pub fn nan_add_reduce<T, D>(
    input: &Array<T, D>,
    axis: usize,
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_axis_keepdims(input, axis, seed, keepdims, |sum, item| {
        sum + nan_to_zero(item)
    })
}

/// NaN-skipping sum over several axes at once.
///
/// Seeds the reduction with zero and adds each element after passing it
/// through `nan_to_zero`, delegating the traversal to
/// `crate::ufunc_methods::reduce_axes`.
pub fn nan_add_reduce_axes<T, D>(
    input: &Array<T, D>,
    axes: &[usize],
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_axes(input, axes, seed, keepdims, |sum, item| {
        sum + nan_to_zero(item)
    })
}

/// NaN-skipping sum of every element.
///
/// Equivalent to `np.nansum(arr)` / `np.nansum(arr, axis=None)`. Returns a
/// scalar. NaNs are replaced by zero before being added, so an array made
/// only of NaNs sums to zero.
pub fn nan_add_reduce_all<T, D>(input: &Array<T, D>) -> T
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Element>::zero();
    crate::ufunc_methods::reduce_all(input, seed, |sum, item| sum + nan_to_zero(item))
}

/// NaN-skipping product along a single axis, with optional `keepdims`.
///
/// Equivalent to `np.nanprod(arr, axis=axis, keepdims=keepdims)`. Each
/// element is passed through `nan_to_one` before being multiplied in, so
/// NaNs contribute nothing to the product.
pub fn nan_multiply_reduce<T, D>(
    input: &Array<T, D>,
    axis: usize,
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Element>::one();
    crate::ufunc_methods::reduce_axis_keepdims(input, axis, seed, keepdims, |product, item| {
        product * nan_to_one(item)
    })
}

/// NaN-skipping product over several axes at once.
///
/// Seeds the reduction with one and multiplies in each element after
/// passing it through `nan_to_one`, delegating the traversal to
/// `crate::ufunc_methods::reduce_axes`.
pub fn nan_multiply_reduce_axes<T, D>(
    input: &Array<T, D>,
    axes: &[usize],
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Element>::one();
    crate::ufunc_methods::reduce_axes(input, axes, seed, keepdims, |product, item| {
        product * nan_to_one(item)
    })
}

/// NaN-skipping product of every element, returned as a scalar.
///
/// Seeds the reduction with one and multiplies in each element after
/// passing it through `nan_to_one`.
pub fn nan_multiply_reduce_all<T, D>(input: &Array<T, D>) -> T
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Element>::one();
    crate::ufunc_methods::reduce_all(input, seed, |product, item| product * nan_to_one(item))
}

/// NaN-skipping maximum along a single axis, with optional `keepdims`.
///
/// Equivalent to `np.nanmax(arr, axis=axis, keepdims=keepdims)`. The
/// reduction starts from `-inf`; an element replaces the running maximum
/// only when it is not NaN and strictly greater, so NaNs are skipped
/// (effectively treated as `-inf`).
pub fn nan_max_reduce<T, D>(
    input: &Array<T, D>,
    axis: usize,
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Float>::neg_infinity();
    crate::ufunc_methods::reduce_axis_keepdims(input, axis, seed, keepdims, |best, item| {
        if !item.is_nan() && item > best {
            item
        } else {
            best
        }
    })
}

/// NaN-skipping maximum over several axes at once.
///
/// Starts from `-inf`; an element replaces the running maximum only when
/// it is not NaN and strictly greater. Traversal is delegated to
/// `crate::ufunc_methods::reduce_axes`.
pub fn nan_max_reduce_axes<T, D>(
    input: &Array<T, D>,
    axes: &[usize],
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Float>::neg_infinity();
    crate::ufunc_methods::reduce_axes(input, axes, seed, keepdims, |best, item| {
        if !item.is_nan() && item > best {
            item
        } else {
            best
        }
    })
}

/// NaN-skipping maximum of every element.
///
/// Equivalent to `np.nanmax(arr)`, except that an all-NaN input returns the
/// `-inf` seed instead of raising. Callers that need the all-NaN error
/// semantics should use ferray-stats' `nanmax` (which checks the result and
/// errors out instead of returning the seed).
pub fn nan_max_reduce_all<T, D>(input: &Array<T, D>) -> T
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Float>::neg_infinity();
    crate::ufunc_methods::reduce_all(input, seed, |best, item| {
        if !item.is_nan() && item > best {
            item
        } else {
            best
        }
    })
}

/// NaN-skipping minimum along a single axis, with optional `keepdims`.
///
/// Equivalent to `np.nanmin(arr, axis=axis, keepdims=keepdims)`. The
/// reduction starts from `+inf`; an element replaces the running minimum
/// only when it is not NaN and strictly smaller, so NaNs are skipped
/// (effectively treated as `+inf`).
pub fn nan_min_reduce<T, D>(
    input: &Array<T, D>,
    axis: usize,
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Float>::infinity();
    crate::ufunc_methods::reduce_axis_keepdims(input, axis, seed, keepdims, |best, item| {
        if !item.is_nan() && item < best {
            item
        } else {
            best
        }
    })
}

/// NaN-skipping minimum over several axes at once.
///
/// Starts from `+inf`; an element replaces the running minimum only when
/// it is not NaN and strictly smaller. Traversal is delegated to
/// `crate::ufunc_methods::reduce_axes`.
pub fn nan_min_reduce_axes<T, D>(
    input: &Array<T, D>,
    axes: &[usize],
    keepdims: bool,
) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Float>::infinity();
    crate::ufunc_methods::reduce_axes(input, axes, seed, keepdims, |best, item| {
        if !item.is_nan() && item < best {
            item
        } else {
            best
        }
    })
}

/// NaN-skipping minimum of every element, returned as a scalar.
///
/// Starts from `+inf`; an element replaces the running minimum only when
/// it is not NaN and strictly smaller, so an input with no non-NaN element
/// yields the `+inf` seed.
pub fn nan_min_reduce_all<T, D>(input: &Array<T, D>) -> T
where
    T: Element + Float,
    D: Dimension,
{
    let seed = <T as Float>::infinity();
    crate::ufunc_methods::reduce_all(input, seed, |best, item| {
        if !item.is_nan() && item < best {
            item
        } else {
            best
        }
    })
}

/// Maps NaN to the additive identity (zero); any other value is returned
/// unchanged.
#[inline]
fn nan_to_zero<T: Float + Element>(x: T) -> T {
    if !x.is_nan() {
        return x;
    }
    <T as Element>::zero()
}

/// Maps NaN to the multiplicative identity (one); any other value is
/// returned unchanged.
#[inline]
fn nan_to_one<T: Float + Element>(x: T) -> T {
    if !x.is_nan() {
        return x;
    }
    <T as Element>::one()
}

/// Running sum along `axis` — the `np.add.accumulate` spelling of
/// [`cumsum`] with a mandatory axis.
///
/// AC-2: `add_accumulate` produces running sums. As with
/// `np.add.accumulate`, the result element type is the promoted
/// [`ReduceAcc::Acc`] (narrow ints widen so the accumulation never
/// overflows — live oracle
/// `np.add.accumulate(np.array([100,100,100],dtype=np.int8)).dtype == int64`).
pub fn add_accumulate<T, D>(
    input: &Array<T, D>,
    axis: usize,
) -> FerrayResult<Array<<T as ReduceAcc>::Acc, D>>
where
    T: ReduceAcc,
    D: Dimension,
{
    let along = Some(axis);
    cumsum(input, along)
}

/// Outer product of two 1-D arrays:
/// `multiply_outer(a, b)[i, j] = a[i] * b[j]`.
///
/// Equivalent to `np.multiply.outer(a, b)`. Delegates to the generic
/// [`crate::ufunc_methods::outer`], combining each pair with `*`.
///
/// AC-3: `multiply_outer` produces correct outer product.
pub fn multiply_outer<T>(a: &Array<T, Ix1>, b: &Array<T, Ix1>) -> FerrayResult<Array<T, IxDyn>>
where
    T: Element + std::ops::Mul<Output = T> + Copy,
{
    crate::ufunc_methods::outer(a, b, |row_value, col_value| row_value * col_value)
}

// ---------------------------------------------------------------------------
// Cumulative operations
// ---------------------------------------------------------------------------

/// Shared same-dtype cumulative kernel.
///
/// Builds the result buffer by running `preprocess` over every input
/// element, then folds it in place with `accumulate` — along `axis` when
/// one is given, or across the whole flattened buffer for `None`. The
/// output keeps the input's dimension either way. `nancumsum` and
/// `nancumprod` go through this single pass; they previously materialized
/// a cleaned copy and then a second buffer (#156).
///
/// # Errors
/// Returns `FerrayError::axis_out_of_bounds` when `axis` is not smaller
/// than `input.ndim()`; otherwise propagates any `Array::from_vec` error.
fn cumulative_with_preprocess<T, D, Pre, Acc>(
    input: &Array<T, D>,
    axis: Option<usize>,
    preprocess: Pre,
    accumulate: Acc,
) -> FerrayResult<Array<T, D>>
where
    T: Element + Copy,
    D: Dimension,
    Pre: Fn(T) -> T,
    Acc: Fn(T, T) -> T,
{
    // Reject a bad axis before touching any element.
    if let Some(ax) = axis {
        if ax >= input.ndim() {
            return Err(FerrayError::axis_out_of_bounds(ax, input.ndim()));
        }
    }

    let mut buf: Vec<T> = input.iter().map(|&v| preprocess(v)).collect();

    match axis {
        None => {
            for i in 1..buf.len() {
                buf[i] = accumulate(buf[i - 1], buf[i]);
            }
        }
        Some(ax) => {
            let shape = input.shape().to_vec();
            // Elements between consecutive positions along `ax`.
            let step: usize = shape[ax + 1..].iter().product();
            let extent = shape[ax];
            let lanes_before: usize = shape[..ax].iter().product();

            for before in 0..lanes_before {
                for after in 0..step {
                    let start = before * extent * step + after;
                    for k in 1..extent {
                        let here = start + k * step;
                        buf[here] = accumulate(buf[here - step], buf[here]);
                    }
                }
            }
        }
    }

    Array::from_vec(input.dim().clone(), buf)
}

/// Promoted cumulative kernel.
///
/// Widens every input element into the NumPy reduction accumulator
/// [`ReduceAcc::Acc`], then folds the buffer in place with `accumulate` —
/// along `axis` when one is given, or across the whole flattened buffer
/// for `None`. The output element type is the promoted `T::Acc`, so a
/// narrow-int cumulative can never overflow and its dtype matches NumPy's
/// promoted result.
///
/// NumPy promotes any integer dtype "with a precision less than that of the
/// default platform integer" before accumulating
/// (`numpy/_core/fromnumeric.py:2850-2855` cumsum, `:3424-3429` cumprod),
/// identical to the `sum`/`prod` rule (`:2321-2327`). For `f32`/`f64`/`i64`/
/// complex, `Acc == Self`, so those paths are byte-identical to before.
///
/// The axis is validated before the widened buffer is built, so an
/// out-of-range axis fails without allocating or touching any element.
///
/// # Errors
/// Returns `FerrayError::axis_out_of_bounds` when `axis` is not smaller
/// than `input.ndim()`; otherwise propagates any `Array::from_vec` error.
fn cumulative_promoted<T, D, Acc>(
    input: &Array<T, D>,
    axis: Option<usize>,
    accumulate: Acc,
) -> FerrayResult<Array<<T as ReduceAcc>::Acc, D>>
where
    T: ReduceAcc,
    D: Dimension,
    Acc: Fn(<T as ReduceAcc>::Acc, <T as ReduceAcc>::Acc) -> <T as ReduceAcc>::Acc,
{
    // Fail fast: no point widening the whole input for an invalid axis.
    if let Some(ax) = axis {
        if ax >= input.ndim() {
            return Err(FerrayError::axis_out_of_bounds(ax, input.ndim()));
        }
    }

    let mut result: Vec<<T as ReduceAcc>::Acc> = input.iter().map(|&x| x.widen()).collect();

    if let Some(ax) = axis {
        let shape = input.shape().to_vec();
        // Elements between consecutive positions along `ax`.
        let inner_size: usize = shape[ax + 1..].iter().product();
        let axis_len = shape[ax];
        let outer_size: usize = shape[..ax].iter().product();

        for outer in 0..outer_size {
            for inner in 0..inner_size {
                let base = outer * axis_len * inner_size + inner;
                for k in 1..axis_len {
                    let prev = base + (k - 1) * inner_size;
                    let curr = base + k * inner_size;
                    result[curr] = accumulate(result[prev], result[curr]);
                }
            }
        }
    } else {
        for i in 1..result.len() {
            result[i] = accumulate(result[i - 1], result[i]);
        }
    }
    Array::from_vec(input.dim().clone(), result)
}

/// Running sum along `axis`, or over the flattened data when `axis` is
/// `None`.
///
/// With `axis = None` the data is accumulated in flattened order but the
/// result keeps the original shape (unlike `NumPy`, which returns a 1-D
/// array), because the return type is the generic `Array<T::Acc, D>`.
///
/// The result element type is the NumPy reduction accumulator
/// [`ReduceAcc::Acc`]: narrow signed ints widen to `i64`, narrow unsigned
/// ints to `u64`, `bool` to `i64`, and `f32`/`f64`/`i64`/complex stay
/// themselves. A narrow-int cumsum therefore never overflows and its dtype
/// matches `np.cumsum`'s promoted result
/// (`numpy/_core/fromnumeric.py:2850-2855`).
///
/// AC-11: `cumsum([1,2,3,4]) == [1,3,6,10]`.
pub fn cumsum<T, D>(
    input: &Array<T, D>,
    axis: Option<usize>,
) -> FerrayResult<Array<<T as ReduceAcc>::Acc, D>>
where
    T: ReduceAcc,
    D: Dimension,
{
    cumulative_promoted(input, axis, |running, next| running + next)
}

/// Running product along `axis`, or over the flattened data when `axis` is
/// `None`.
///
/// With `axis = None` the data is accumulated in flattened order but the
/// result keeps the original shape (unlike `NumPy`, which returns a 1-D
/// array).
///
/// The result element type is the promoted [`ReduceAcc::Acc`] (same
/// narrow-int promotion as [`cumsum`];
/// `numpy/_core/fromnumeric.py:3424-3429`).
pub fn cumprod<T, D>(
    input: &Array<T, D>,
    axis: Option<usize>,
) -> FerrayResult<Array<<T as ReduceAcc>::Acc, D>>
where
    T: ReduceAcc,
    D: Dimension,
{
    cumulative_promoted(input, axis, |running, next| running * next)
}

/// [`cumsum`] under its Array API standard name.
///
/// Matches the Python Array API specification's `cumulative_sum` (added to
/// `numpy` in 2.0). Forwards both arguments to `cumsum`, so the result
/// element type is the same promoted [`ReduceAcc::Acc`].
pub fn cumulative_sum<T, D>(
    input: &Array<T, D>,
    axis: Option<usize>,
) -> FerrayResult<Array<<T as ReduceAcc>::Acc, D>>
where
    T: ReduceAcc,
    D: Dimension,
{
    cumsum(input, axis)
}

/// Array API spelling of the cumulative product.
///
/// The Python Array API specification calls this operation `cumulative_prod`
/// (a name `numpy` gained in 2.0). This function forwards straight to
/// [`cumprod`], so the result element type is the same promoted
/// [`ReduceAcc::Acc`].
pub fn cumulative_prod<T, D>(
    input: &Array<T, D>,
    axis: Option<usize>,
) -> FerrayResult<Array<<T as ReduceAcc>::Acc, D>>
where
    T: ReduceAcc,
    D: Dimension,
{
    cumprod::<T, D>(input, axis)
}

/// Cumulative sum in which NaN elements contribute nothing.
///
/// Every NaN is replaced by zero (the additive identity) before the running
/// sum is taken, so a NaN position simply repeats the previous partial sum.
/// `axis` is passed through unchanged to the shared cumulative helper.
pub fn nancumsum<T, D>(input: &Array<T, D>, axis: Option<usize>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    cumulative_with_preprocess(
        input,
        axis,
        // Preprocess step: NaN -> 0, everything else untouched.
        |value| {
            if value.is_nan() {
                <T as Element>::zero()
            } else {
                value
            }
        },
        // Accumulate step: plain addition.
        |running, next| running + next,
    )
}

/// Cumulative product in which NaN elements contribute nothing.
///
/// Every NaN is replaced by one (the multiplicative identity) before the
/// running product is taken, so a NaN position simply repeats the previous
/// partial product. `axis` is passed through unchanged to the shared
/// cumulative helper.
pub fn nancumprod<T, D>(input: &Array<T, D>, axis: Option<usize>) -> FerrayResult<Array<T, D>>
where
    T: Element + Float,
    D: Dimension,
{
    cumulative_with_preprocess(
        input,
        axis,
        // Preprocess step: NaN -> 1, everything else untouched.
        |value| {
            if value.is_nan() {
                <T as Element>::one()
            } else {
                value
            }
        },
        // Accumulate step: plain multiplication.
        |running, next| running * next,
    )
}

// ---------------------------------------------------------------------------
// Differences
// ---------------------------------------------------------------------------

/// n-th order discrete difference of a 1-D array.
///
/// One pass replaces the data with `data[i] - data[i - 1]` for every adjacent
/// pair, shrinking it by one element; the pass is repeated `n` times. With
/// `n == 0` the input values are returned unchanged. As soon as a pass starts
/// with at most one element the result is empty.
///
/// AC-11: `diff([1,3,6,10], 1) == [2,3,4]`.
pub fn diff<T>(input: &Array<T, Ix1>, n: usize) -> FerrayResult<Array<T, Ix1>>
where
    T: Element + std::ops::Sub<Output = T> + Copy,
{
    let mut current: Vec<T> = input.iter().copied().collect();
    for _ in 0..n {
        // Nothing left to difference: stop early instead of spinning through
        // the remaining passes.
        if current.len() <= 1 {
            current.clear();
            break;
        }
        current = current
            .windows(2)
            .map(|pair| pair[1] - pair[0])
            .collect();
    }
    let len = current.len();
    Array::from_vec(Ix1::new([len]), current)
}

/// First-order differences of consecutive elements, optionally framed by
/// caller-supplied values.
///
/// The output is laid out as `to_begin` (if given), then
/// `input[i] - input[i - 1]` for every adjacent pair, then `to_end` (if
/// given). An input with fewer than two elements contributes no differences.
pub fn ediff1d<T>(
    input: &Array<T, Ix1>,
    to_end: Option<&[T]>,
    to_begin: Option<&[T]>,
) -> FerrayResult<Array<T, Ix1>>
where
    T: Element + std::ops::Sub<Output = T> + Copy,
{
    let values: Vec<T> = input.iter().copied().collect();
    let head = to_begin.unwrap_or(&[]);
    let tail = to_end.unwrap_or(&[]);

    // Final length is known up front: prefix + (len - 1) differences + suffix.
    let mut out = Vec::with_capacity(head.len() + values.len().saturating_sub(1) + tail.len());
    out.extend_from_slice(head);
    out.extend(values.windows(2).map(|pair| pair[1] - pair[0]));
    out.extend_from_slice(tail);

    let len = out.len();
    Array::from_vec(Ix1::new([len]), out)
}

/// Numerical gradient of a 1-D array with uniform sample spacing.
///
/// Interior points use the central difference
/// `(data[i + 1] - data[i - 1]) / (2 * h)`; the first and last points use the
/// one-sided forward and backward differences divided by `h`. `h` is
/// `spacing`, or one when `spacing` is `None`.
///
/// An empty input yields an empty array, and a single-element input yields a
/// single zero.
pub fn gradient<T>(input: &Array<T, Ix1>, spacing: Option<T>) -> FerrayResult<Array<T, Ix1>>
where
    T: Element + Float,
{
    let samples: Vec<T> = input.iter().copied().collect();
    let n = samples.len();

    let slopes: Vec<T> = match n {
        0 => Vec::new(),
        1 => vec![<T as Element>::zero()],
        _ => {
            let step = spacing.unwrap_or_else(|| <T as Element>::one());
            let two = <T as Element>::one() + <T as Element>::one();
            let mut acc = Vec::with_capacity(n);
            // Leading edge: forward difference.
            acc.push((samples[1] - samples[0]) / step);
            // Interior: each 3-wide window is centred on one interior sample.
            acc.extend(
                samples
                    .windows(3)
                    .map(|w| (w[2] - w[0]) / (two * step)),
            );
            // Trailing edge: backward difference.
            acc.push((samples[n - 1] - samples[n - 2]) / step);
            acc
        }
    };

    Array::from_vec(Ix1::new([n]), slopes)
}

// ---------------------------------------------------------------------------
// Cross product
// ---------------------------------------------------------------------------

/// Vector cross product `a × b` for two 3-element 1-D arrays.
///
/// # Errors
///
/// Returns an invalid-value error unless both `a` and `b` hold exactly three
/// elements.
pub fn cross<T>(a: &Array<T, Ix1>, b: &Array<T, Ix1>) -> FerrayResult<Array<T, Ix1>>
where
    T: Element + std::ops::Mul<Output = T> + std::ops::Sub<Output = T> + Copy,
{
    let both_three = a.size() == 3 && b.size() == 3;
    if !both_three {
        return Err(FerrayError::invalid_value(
            "cross product requires 3-element vectors",
        ));
    }

    let u: Vec<T> = a.iter().copied().collect();
    let v: Vec<T> = b.iter().copied().collect();
    let (ux, uy, uz) = (u[0], u[1], u[2]);
    let (vx, vy, vz) = (v[0], v[1], v[2]);

    // Standard component formula for the 3-D cross product.
    let components = vec![
        uy * vz - uz * vy,
        uz * vx - ux * vz,
        ux * vy - uy * vx,
    ];
    Array::from_vec(Ix1::new([3]), components)
}

// ---------------------------------------------------------------------------
// Integration
// ---------------------------------------------------------------------------

/// Integrate `y` with the composite trapezoidal rule.
///
/// Each adjacent pair of samples contributes
/// `(y[i] + y[i - 1]) / 2 * width`, where `width` is:
///
/// - `x[i] - x[i - 1]` when `x` (the sample coordinates) is provided;
/// - otherwise `dx`, the uniform spacing, defaulting to one when `dx` is
///   `None`.
///
/// When both `x` and `dx` are supplied, `x` takes precedence and `dx` is
/// ignored. Fewer than two samples integrate to zero.
///
/// # Errors
///
/// Returns a shape-mismatch error when `x` is provided and its length differs
/// from that of `y`. The check runs before the short-input shortcut, so a
/// mismatched `x` is reported even when `y` has fewer than two samples.
pub fn trapezoid<T>(y: &Array<T, Ix1>, x: Option<&Array<T, Ix1>>, dx: Option<T>) -> FerrayResult<T>
where
    T: Element + Float,
{
    let ydata: Vec<T> = y.iter().copied().collect();
    let n = ydata.len();

    // Validate the coordinates first: returning zero for a short `y` must not
    // hide a caller passing an `x` of the wrong length.
    let xdata: Option<Vec<T>> = match x {
        Some(xarr) => {
            let coords: Vec<T> = xarr.iter().copied().collect();
            if coords.len() != n {
                return Err(FerrayError::shape_mismatch(
                    "x and y must have the same length for trapezoid",
                ));
            }
            Some(coords)
        }
        None => None,
    };

    if n < 2 {
        return Ok(<T as Element>::zero());
    }

    let two = <T as Element>::one() + <T as Element>::one();
    let zero = <T as Element>::zero();

    let total = match xdata {
        // Non-uniform grid: the width of each panel comes from `x`.
        Some(coords) => ydata
            .windows(2)
            .zip(coords.windows(2))
            .fold(zero, |acc, (yw, xw)| {
                acc + (yw[1] + yw[0]) / two * (xw[1] - xw[0])
            }),
        // Uniform grid: every panel has width `h`.
        None => {
            let h = dx.unwrap_or_else(|| <T as Element>::one());
            ydata
                .windows(2)
                .fold(zero, |acc, yw| acc + (yw[1] + yw[0]) / two * h)
        }
    };

    Ok(total)
}

// ---------------------------------------------------------------------------
// f16 variants (f32-promoted) — generated via the shared macros (#142).
// ---------------------------------------------------------------------------

use crate::helpers::{binary_f16_fn, unary_f16_fn};

// Simple unary f16 wrappers. Every invocation hands `unary_f16_fn!` a doc
// comment, the `f16` feature gate, the name of the function to generate, and
// the f32 operation to apply. Per the doc comments the work is done "via f32
// promotion"; the exact expansion lives in `crate::helpers` and is not visible
// here.
unary_f16_fn!(
    /// Elementwise absolute value for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    absolute_f16,
    f32::abs
);
unary_f16_fn!(
    /// Elementwise negation for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    negative_f16,
    |x: f32| -x
);
unary_f16_fn!(
    /// Elementwise square root for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    sqrt_f16,
    f32::sqrt
);
unary_f16_fn!(
    /// Elementwise cube root for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    cbrt_f16,
    f32::cbrt
);
unary_f16_fn!(
    /// Elementwise square for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    square_f16,
    |x: f32| x * x
);
unary_f16_fn!(
    /// Elementwise reciprocal for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    reciprocal_f16,
    f32::recip
);
// Sign function for f16. The f32 kernel checks NaN first so that NaN maps to
// NaN, then returns 1.0 / -1.0 for strictly positive / negative inputs and 0.0
// for everything else (both signed zeros land in the final branch).
unary_f16_fn!(
    /// Elementwise sign for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    sign_f16,
    |x: f32| {
        if x.is_nan() {
            f32::NAN
        } else if x > 0.0 {
            1.0
        } else if x < 0.0 {
            -1.0
        } else {
            0.0
        }
    }
);
// Simple binary f16 wrappers. Every invocation hands `binary_f16_fn!` a doc
// comment, the `f16` feature gate, the name of the function to generate, and
// the two-argument f32 operation to apply. The exact expansion lives in
// `crate::helpers` and is not visible here.
binary_f16_fn!(
    /// Elementwise addition for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    add_f16,
    |x: f32, y: f32| x + y
);
binary_f16_fn!(
    /// Elementwise subtraction for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    subtract_f16,
    |x: f32, y: f32| x - y
);
binary_f16_fn!(
    /// Elementwise multiplication for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    multiply_f16,
    |x: f32, y: f32| x * y
);
binary_f16_fn!(
    /// Elementwise division for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    divide_f16,
    |x: f32, y: f32| x / y
);
binary_f16_fn!(
    /// Elementwise power for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    power_f16,
    f32::powf
);
// Floor division is computed as the floor of the f32 quotient.
binary_f16_fn!(
    /// Floor division for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    floor_divide_f16,
    |x: f32, y: f32| (x / y).floor()
);
// Remainder for f16. Rust's `%` yields a result with the sign of the dividend;
// when that result is nonzero and its sign disagrees with the divisor, adding
// `y` shifts it so the returned remainder carries the divisor's sign.
binary_f16_fn!(
    /// Elementwise remainder for f16 arrays via f32 promotion.
    #[cfg(feature = "f16")]
    remainder_f16,
    |x: f32, y: f32| {
        let r = x % y;
        if (r < 0.0 && y > 0.0) || (r > 0.0 && y < 0.0) {
            r + y
        } else {
            r
        }
    }
);

#[cfg(test)]
mod tests {
    use super::*;
    use ferray_core::dimension::Ix2;

    use crate::test_util::arr1;

    /// Wrap a `Vec<i32>` as a 1-D array whose length is the vector's length.
    fn arr1_i32(data: Vec<i32>) -> Array<i32, Ix1> {
        let shape = Ix1::new([data.len()]);
        Array::from_vec(shape, data).unwrap()
    }

    /// Wrap a vector of any element width (i8/i16/i32/i64) as a 1-D test
    /// array whose length is the vector's length.
    fn ints<T: Element + Copy>(data: Vec<T>) -> Array<T, Ix1> {
        let shape = Ix1::new([data.len()]);
        Array::from_vec(shape, data).expect("test array shape is valid")
    }

    /// Wrap a `Vec<f32>` as a 1-D array whose length is the vector's length.
    fn arr1_f32(data: Vec<f32>) -> Array<f32, Ix1> {
        let shape = Ix1::new([data.len()]);
        Array::from_vec(shape, data).unwrap()
    }

    // ---- f32 coverage (#721) -------------------------------------------
    //
    // Existing arithmetic tests run against f64 only; the f32 SIMD
    // dispatch (try_simd_f32_binary / try_simd_f32_unary) is exercised
    // here. Arrays are sized at 32 elements so the SIMD path engages
    // (the threshold is >= a small multiple of the lane width).

    #[test]
    fn add_f32_simd_path() {
        // 32 elements: k + 2k must come back as 3k.
        let lhs = arr1_f32((0..32).map(|k| k as f32).collect());
        let rhs = arr1_f32((0..32).map(|k| k as f32 * 2.0).collect());
        let sum = add(&lhs, &rhs).unwrap();
        let got = sum.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = k as f32 * 3.0;
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn sub_f32_simd_path() {
        // 32 elements: 5k - 2k must come back as 3k.
        let lhs = arr1_f32((0..32).map(|k| k as f32 * 5.0).collect());
        let rhs = arr1_f32((0..32).map(|k| k as f32 * 2.0).collect());
        let difference = subtract(&lhs, &rhs).unwrap();
        let got = difference.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = k as f32 * 3.0;
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn mul_f32_simd_path() {
        // 32 elements: k * 3 against a constant right-hand side.
        let len = 32;
        let lhs = arr1_f32((0..len).map(|k| k as f32).collect());
        let rhs = arr1_f32(vec![3.0_f32; len]);
        let product = multiply(&lhs, &rhs).unwrap();
        let got = product.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = k as f32 * 3.0;
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn div_f32_simd_path() {
        // 32 elements: 4(k + 1) / 2 must come back as 2(k + 1).
        let len = 32;
        let numer = arr1_f32((0..len).map(|k| (k as f32 + 1.0) * 4.0).collect());
        let denom = arr1_f32(vec![2.0_f32; len]);
        let quotient = divide(&numer, &denom).unwrap();
        let got = quotient.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = (k as f32 + 1.0) * 2.0;
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn abs_f32_simd_path() {
        // Even indices are negated, odd ones left positive; |x| must be k.
        let alternating: Vec<f32> = (0..32)
            .map(|k| {
                let magnitude = k as f32;
                if k % 2 == 0 { -magnitude } else { magnitude }
            })
            .collect();
        let input = arr1_f32(alternating);
        let magnitudes = absolute(&input).unwrap();
        let got = magnitudes.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = k as f32;
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn neg_f32_simd_path() {
        // 32 elements: negating k must give -k.
        let input = arr1_f32((0..32).map(|k| k as f32).collect());
        let negated = negative(&input).unwrap();
        let got = negated.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = -(k as f32);
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn reciprocal_f32_simd_path() {
        // Inputs are 1..=32 so no element is zero; expect 1 / (k + 1).
        let input = arr1_f32((1..=32).map(|k| k as f32).collect());
        let inverted = reciprocal(&input).unwrap();
        let got = inverted.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let want = 1.0_f32 / ((k + 1) as f32);
            assert!((value - want).abs() < 1e-6);
        }
    }

    #[test]
    fn square_f32_simd_path() {
        // 32 elements: squaring k must give k * k (looser 1e-4 tolerance).
        let input = arr1_f32((0..32).map(|k| k as f32).collect());
        let squared = square(&input).unwrap();
        let got = squared.as_slice().unwrap();
        for (k, &value) in got.iter().enumerate() {
            let base = k as f32;
            let want = base * base;
            assert!((value - want).abs() < 1e-4);
        }
    }

    #[test]
    fn add_f32_below_simd_threshold_scalar_path() {
        // Three elements only — small enough that SIMD dispatch typically
        // falls back to the scalar loop.
        let lhs = arr1_f32(vec![1.5, 2.5, 3.5]);
        let rhs = arr1_f32(vec![0.5, 0.5, 0.5]);
        let sum = add(&lhs, &rhs).unwrap();
        let got = sum.as_slice().unwrap();
        assert_eq!(got, &[2.0, 3.0, 4.0]);
    }

    #[test]
    fn add_f32_force_scalar_env_var() {
        // FERRAY_FORCE_SCALAR=1 should bypass SIMD; result must still
        // be correct.

        /// Clears the override when dropped, so a panic inside the guarded
        /// call cannot leak `FERRAY_FORCE_SCALAR` into the rest of the run.
        struct ForceScalarGuard;

        impl Drop for ForceScalarGuard {
            fn drop(&mut self) {
                // SAFETY: same requirement as the `set_var` call in the
                // enclosing test — see the note there.
                unsafe {
                    std::env::remove_var("FERRAY_FORCE_SCALAR");
                }
            }
        }

        // SAFETY: `set_var`/`remove_var` require that no other thread reads or
        // writes the process environment except through `std::env`.
        // NOTE(review): the default test harness runs tests on multiple
        // threads, so this holds only if nothing in the test binary touches
        // the environment via foreign code (e.g. C `getenv`) — confirm, or run
        // this test with `--test-threads=1`. Other tests running concurrently
        // may also observe the override while it is set.
        unsafe {
            std::env::set_var("FERRAY_FORCE_SCALAR", "1");
        }
        let guard = ForceScalarGuard;

        let a = arr1_f32((0..32).map(|i| i as f32).collect());
        let b = arr1_f32(vec![1.0_f32; 32]);
        let r = add(&a, &b).unwrap();

        // Restore the environment before asserting, as the test always did on
        // the success path; the guard also covers the panic path above.
        drop(guard);

        for (i, &v) in r.as_slice().unwrap().iter().enumerate() {
            assert!((v - (i as f32 + 1.0)).abs() < 1e-6);
        }
    }

    #[test]
    fn test_add() {
        // [1, 2, 3] + [4, 5, 6] == [5, 7, 9]
        let lhs = arr1(vec![1.0, 2.0, 3.0]);
        let rhs = arr1(vec![4.0, 5.0, 6.0]);
        let sum = add(&lhs, &rhs).unwrap();
        let got = sum.as_slice().unwrap();
        assert_eq!(got, &[5.0, 7.0, 9.0]);
    }

    #[test]
    fn test_subtract() {
        // [5, 7, 9] - [1, 2, 3] == [4, 5, 6]
        let lhs = arr1(vec![5.0, 7.0, 9.0]);
        let rhs = arr1(vec![1.0, 2.0, 3.0]);
        let difference = subtract(&lhs, &rhs).unwrap();
        let got = difference.as_slice().unwrap();
        assert_eq!(got, &[4.0, 5.0, 6.0]);
    }

    #[test]
    fn test_multiply() {
        // [2, 3, 4] * [5, 6, 7] == [10, 18, 28]
        let lhs = arr1(vec![2.0, 3.0, 4.0]);
        let rhs = arr1(vec![5.0, 6.0, 7.0]);
        let product = multiply(&lhs, &rhs).unwrap();
        let got = product.as_slice().unwrap();
        assert_eq!(got, &[10.0, 18.0, 28.0]);
    }

    #[test]
    fn test_divide() {
        // [10, 20, 30] / [2, 4, 5] == [5, 5, 6]
        let numer = arr1(vec![10.0, 20.0, 30.0]);
        let denom = arr1(vec![2.0, 4.0, 5.0]);
        let quotient = divide(&numer, &denom).unwrap();
        let got = quotient.as_slice().unwrap();
        assert_eq!(got, &[5.0, 5.0, 6.0]);
    }

    #[test]
    fn test_floor_divide() {
        // Rounds toward negative infinity: 7 // 2 == 3, -7 // 2 == -4.
        let numer = arr1(vec![7.0, -7.0]);
        let denom = arr1(vec![2.0, 2.0]);
        let quotient = floor_divide(&numer, &denom).unwrap();
        let got = quotient.as_slice().unwrap();
        assert_eq!(got, &[3.0, -4.0]);
    }

    #[test]
    fn test_floor_divide_int_min_over_neg1_wraps() {
        // INT_MIN // -1 overflows; NumPy wraps to INT_MIN (no panic).
        // Pins divergence #1031. Live oracle: np.divmod(INT_MIN, -1)[0] == INT_MIN.

        // Narrowest width.
        let (num8, den8) = (ints(vec![i8::MIN]), ints(vec![-1i8]));
        let q8 = floor_divide_int(&num8, &den8)
            .ok()
            .and_then(|q| q.as_slice().map(<[_]>::to_vec));
        assert_eq!(q8, Some(vec![i8::MIN]));

        // Widest width.
        let (num64, den64) = (ints(vec![i64::MIN]), ints(vec![-1i64]));
        let q64 = floor_divide_int(&num64, &den64)
            .ok()
            .and_then(|q| q.as_slice().map(<[_]>::to_vec));
        assert_eq!(q64, Some(vec![i64::MIN]));

        // Normal floor-division (toward -inf) must not regress.
        let (num, den) = (ints(vec![-7i32, 7, -7]), ints(vec![2i32, -2, -2]));
        let q = floor_divide_int(&num, &den)
            .ok()
            .and_then(|q| q.as_slice().map(<[_]>::to_vec));
        assert_eq!(q, Some(vec![-4, -4, 3]));
    }

    #[test]
    fn test_remainder_int_min_over_neg1_wraps() {
        // INT_MIN % -1 overflows; NumPy wraps to 0 (no panic). Pins #1031.

        // Narrowest width.
        let (num8, den8) = (ints(vec![i8::MIN]), ints(vec![-1i8]));
        let r8 = remainder_int(&num8, &den8)
            .ok()
            .and_then(|r| r.as_slice().map(<[_]>::to_vec));
        assert_eq!(r8, Some(vec![0i8]));

        // Widest width.
        let (num64, den64) = (ints(vec![i64::MIN]), ints(vec![-1i64]));
        let r64 = remainder_int(&num64, &den64)
            .ok()
            .and_then(|r| r.as_slice().map(<[_]>::to_vec));
        assert_eq!(r64, Some(vec![0i64]));

        // Normal modulo (sign of divisor) must not regress.
        let (num, den) = (ints(vec![-7i32, 7]), ints(vec![2i32, -2]));
        let r = remainder_int(&num, &den)
            .ok()
            .and_then(|r| r.as_slice().map(<[_]>::to_vec));
        assert_eq!(r, Some(vec![1, -1]));
    }

    #[test]
    fn test_power() {
        // 2^3 == 8, 3^2 == 9
        let base = arr1(vec![2.0, 3.0]);
        let exponent = arr1(vec![3.0, 2.0]);
        let raised = power(&base, &exponent).unwrap();
        let got = raised.as_slice().unwrap();
        assert_eq!(got, &[8.0, 9.0]);
    }

    #[test]
    fn test_remainder() {
        // Result takes the sign of the divisor: 7 mod 3 == 1, -7 mod 3 == 2.
        let dividend = arr1(vec![7.0, -7.0]);
        let divisor = arr1(vec![3.0, 3.0]);
        let rem = remainder(&dividend, &divisor).unwrap();
        let got = rem.as_slice().unwrap();
        let (first, second) = (got[0], got[1]);
        assert!((first - 1.0).abs() < 1e-12);
        assert!((second - 2.0).abs() < 1e-12);
    }

    #[test]
    fn test_fmod() {
        // Result takes the sign of the dividend: fmod(7, 3) == 1, fmod(-7, 3) == -1.
        let dividend = arr1(vec![7.0, -7.0]);
        let divisor = arr1(vec![3.0, 3.0]);
        let rem = fmod(&dividend, &divisor).unwrap();
        let got = rem.as_slice().unwrap();
        let (first, second) = (got[0], got[1]);
        assert!((first - 1.0).abs() < 1e-12);
        assert!((second - (-1.0)).abs() < 1e-12);
    }

    #[test]
    fn test_absolute() {
        // |[-1, 2, -3]| == [1, 2, 3]
        let input = arr1(vec![-1.0, 2.0, -3.0]);
        let magnitudes = absolute(&input).unwrap();
        let got = magnitudes.as_slice().unwrap();
        assert_eq!(got, &[1.0, 2.0, 3.0]);
    }

    #[test]
    fn test_sign() {
        // Negative, zero and positive inputs map to -1, 0 and 1.
        let input = arr1(vec![-5.0, 0.0, 3.0]);
        let signs = sign(&input).unwrap();
        let got = signs.as_slice().unwrap();
        assert_eq!(got, &[-1.0, 0.0, 1.0]);
    }

    #[test]
    fn test_negative() {
        // -[1, -2, 3] == [-1, 2, -3]
        let input = arr1(vec![1.0, -2.0, 3.0]);
        let negated = negative(&input).unwrap();
        let got = negated.as_slice().unwrap();
        assert_eq!(got, &[-1.0, 2.0, -3.0]);
    }

    #[test]
    fn test_sqrt() {
        // Perfect squares, so the roots compare exactly.
        let input = arr1(vec![1.0, 4.0, 9.0, 16.0]);
        let roots = sqrt(&input).unwrap();
        let got = roots.as_slice().unwrap();
        assert_eq!(got, &[1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn test_cbrt() {
        // cbrt(8) == 2, cbrt(27) == 3 (within 1e-12).
        let input = arr1(vec![8.0, 27.0]);
        let roots = cbrt(&input).unwrap();
        let got = roots.as_slice().unwrap();
        let (first, second) = (got[0], got[1]);
        assert!((first - 2.0).abs() < 1e-12);
        assert!((second - 3.0).abs() < 1e-12);
    }

    #[test]
    fn test_square() {
        // [2, 3, 4]^2 == [4, 9, 16]
        let input = arr1(vec![2.0, 3.0, 4.0]);
        let squared = square(&input).unwrap();
        let got = squared.as_slice().unwrap();
        assert_eq!(got, &[4.0, 9.0, 16.0]);
    }

    #[test]
    fn test_reciprocal() {
        // 1 / [2, 4, 5] == [0.5, 0.25, 0.2]
        let input = arr1(vec![2.0, 4.0, 5.0]);
        let inverted = reciprocal(&input).unwrap();
        let got = inverted.as_slice().unwrap();
        assert_eq!(got, &[0.5, 0.25, 0.2]);
    }

    #[test]
    fn test_heaviside() {
        // Step function: 0 below zero, the supplied h0 value at zero, 1 above.
        let xs = arr1(vec![-1.0, 0.0, 1.0]);
        let at_zero = arr1(vec![0.5, 0.5, 0.5]);
        let stepped = heaviside(&xs, &at_zero).unwrap();
        let got = stepped.as_slice().unwrap();
        assert_eq!(got, &[0.0, 0.5, 1.0]);
    }

    #[test]
    fn test_gcd() {
        // np.gcd is integer-only (generate_umath.py:1156 TD(ints)): a FLOAT
        // input has no matching ufunc loop and raises TypeError. The public
        // `gcd` mirrors that by rejecting float input.
        let float_a = arr1(vec![12.0, 15.0]);
        let float_b = arr1(vec![8.0, 25.0]);
        assert!(gcd(&float_a, &float_b).is_err());

        // The integer domain NumPy accepts is served by `gcd_int`.
        // np.gcd([12,15],[8,25]) -> array([4, 5]) (live numpy 2.4.5).
        let int_a = arr1_i32(vec![12, 15]);
        let int_b = arr1_i32(vec![8, 25]);
        let divisors = gcd_int(&int_a, &int_b).ok();
        let got = divisors.as_ref().and_then(|arr| arr.as_slice());
        assert_eq!(got, Some(&[4, 5][..]));
    }

    #[test]
    fn test_lcm() {
        // np.lcm is integer-only (generate_umath.py:1163 TD(ints)): float
        // input raises TypeError, and the public `lcm` rejects it likewise.
        let float_a = arr1(vec![4.0, 6.0]);
        let float_b = arr1(vec![6.0, 8.0]);
        assert!(lcm(&float_a, &float_b).is_err());

        // The integer domain is served by `lcm_int`.
        // np.lcm([4,6],[6,8]) -> array([12, 24]) (live numpy 2.4.5).
        let int_a = arr1_i32(vec![4, 6]);
        let int_b = arr1_i32(vec![6, 8]);
        let multiples = lcm_int(&int_a, &int_b).ok();
        let got = multiples.as_ref().and_then(|arr| arr.as_slice());
        assert_eq!(got, Some(&[12, 24][..]));
    }

    #[test]
    fn test_gcd_int() {
        // Includes a zero operand: gcd(0, 7) is expected to be 7.
        let lhs: Array<i32, Ix1> = Array::from_vec(Ix1::new([3]), vec![12, 15, 0]).unwrap();
        let rhs: Array<i32, Ix1> = Array::from_vec(Ix1::new([3]), vec![8, 25, 7]).unwrap();
        let divisors = gcd_int(&lhs, &rhs).unwrap();
        let got = divisors.as_slice().unwrap();
        assert_eq!(got, &[4, 5, 7]);
    }

    #[test]
    fn test_lcm_int() {
        // Includes a zero operand: lcm(0, 5) is expected to be 0.
        let lhs: Array<i64, Ix1> = Array::from_vec(Ix1::new([3]), vec![4, 6, 0]).unwrap();
        let rhs: Array<i64, Ix1> = Array::from_vec(Ix1::new([3]), vec![6, 8, 5]).unwrap();
        let multiples = lcm_int(&lhs, &rhs).unwrap();
        let got = multiples.as_slice().unwrap();
        assert_eq!(got, &[12, 24, 0]);
    }

    #[test]
    fn test_gcd_int_negative() {
        // Negative operands on either side still give a non-negative gcd.
        let lhs: Array<i32, Ix1> = Array::from_vec(Ix1::new([2]), vec![-12, 15]).unwrap();
        let rhs: Array<i32, Ix1> = Array::from_vec(Ix1::new([2]), vec![8, -25]).unwrap();
        let divisors = gcd_int(&lhs, &rhs).unwrap();
        let got = divisors.as_slice().unwrap();
        assert_eq!(got, &[4, 5]);
    }

    #[test]
    fn test_cumsum_ac11() {
        // AC-11: cumsum([1,2,3,4]) == [1,3,6,10]
        let input = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let sums = cumsum(&input, None).unwrap();
        let got = sums.as_slice().unwrap();
        assert_eq!(got, &[1.0, 3.0, 6.0, 10.0]);
    }

    #[test]
    fn test_cumsum_i32() {
        // Same running sums as the float case, starting from i32 input.
        let input = arr1_i32(vec![1, 2, 3, 4]);
        let sums = cumsum(&input, None).unwrap();
        let got = sums.as_slice().unwrap();
        assert_eq!(got, &[1, 3, 6, 10]);
    }

    #[test]
    fn test_cumprod() {
        // Running products of 1..=4 are the factorials 1, 2, 6, 24.
        let input = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let products = cumprod(&input, None).unwrap();
        let got = products.as_slice().unwrap();
        assert_eq!(got, &[1.0, 2.0, 6.0, 24.0]);
    }

    #[test]
    fn test_cumulative_sum_alias() {
        // The Array API name must give the same running sums as `cumsum`.
        let input = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let sums = cumulative_sum(&input, None).unwrap();
        let got = sums.as_slice().unwrap();
        assert_eq!(got, &[1.0, 3.0, 6.0, 10.0]);
    }

    #[test]
    fn test_cumulative_prod_alias() {
        // The Array API name must give the same running products as `cumprod`.
        let input = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let products = cumulative_prod(&input, None).unwrap();
        let got = products.as_slice().unwrap();
        assert_eq!(got, &[1.0, 2.0, 6.0, 24.0]);
    }

    #[test]
    fn test_diff_ac11() {
        // AC-11: diff([1,3,6,10], 1) == [2,3,4]
        let input = arr1(vec![1.0, 3.0, 6.0, 10.0]);
        let first_order = diff(&input, 1).unwrap();
        let got = first_order.as_slice().unwrap();
        assert_eq!(got, &[2.0, 3.0, 4.0]);
    }

    #[test]
    fn test_diff_n2() {
        // Two passes: [1,3,6,10] -> [2,3,4] -> [1,1].
        let input = arr1(vec![1.0, 3.0, 6.0, 10.0]);
        let second_order = diff(&input, 2).unwrap();
        let got = second_order.as_slice().unwrap();
        assert_eq!(got, &[1.0, 1.0]);
    }

    #[test]
    fn test_ediff1d() {
        // No prefix or suffix: just the consecutive differences.
        let input = arr1(vec![1.0, 2.0, 4.0, 7.0]);
        let steps = ediff1d(&input, None, None).unwrap();
        let got = steps.as_slice().unwrap();
        assert_eq!(got, &[1.0, 2.0, 3.0]);
    }

    #[test]
    fn test_gradient() {
        let input = arr1(vec![1.0, 2.0, 4.0, 7.0, 11.0]);
        let slopes = gradient(&input, None).unwrap();
        let got = slopes.as_slice().unwrap();
        // Leading edge, forward difference: 2 - 1 = 1.
        assert!((got[0] - 1.0).abs() < 1e-12);
        // Interior, central differences: (4-1)/2, (7-2)/2, (11-4)/2.
        assert!((got[1] - 1.5).abs() < 1e-12);
        assert!((got[2] - 2.5).abs() < 1e-12);
        assert!((got[3] - 3.5).abs() < 1e-12);
        // Trailing edge, backward difference: 11 - 7 = 4.
        assert!((got[4] - 4.0).abs() < 1e-12);
    }

    #[test]
    fn test_cross() {
        // x-hat × y-hat == z-hat
        let x_hat = arr1(vec![1.0, 0.0, 0.0]);
        let y_hat = arr1(vec![0.0, 1.0, 0.0]);
        let normal = cross(&x_hat, &y_hat).unwrap();
        let got = normal.as_slice().unwrap();
        assert_eq!(got, &[0.0, 0.0, 1.0]);
    }

    #[test]
    fn test_trapezoid() {
        // Integrate y=x from 0 to 4 with unit spacing: area = 8.
        let samples = arr1(vec![0.0, 1.0, 2.0, 3.0, 4.0]);
        let area = trapezoid(&samples, None, Some(1.0)).unwrap();
        let error = (area - 8.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn test_trapezoid_with_x() {
        // Explicit coordinates with unit gaps:
        // (0+1)/2*1 + (1+4)/2*1 = 0.5 + 2.5 = 3.0
        let samples = arr1(vec![0.0, 1.0, 4.0]);
        let coords = arr1(vec![0.0, 1.0, 2.0]);
        let area = trapezoid(&samples, Some(&coords), None).unwrap();
        let error = (area - 3.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn test_add_reduce_ac2() {
        // AC-2: add_reduce computes correct column sums
        let cells = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let column_sums = add_reduce(&matrix, 0).unwrap();
        assert_eq!(column_sums.shape(), &[3]);
        let got: Vec<f64> = column_sums.iter().copied().collect();
        assert_eq!(got, vec![5.0, 7.0, 9.0]);
    }

    #[test]
    fn test_add_accumulate_ac2() {
        // Accumulating along axis 0 of a 1-D array gives the running sums.
        let input = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let sums = add_accumulate(&input, 0).unwrap();
        let got = sums.as_slice().unwrap();
        assert_eq!(got, &[1.0, 3.0, 6.0, 10.0]);
    }

    #[test]
    fn add_reduce_keepdims_true_preserves_row_axis() {
        // (2,3) + axis=1 + keepdims=true → (2,1) with row sums.
        let cells = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let row_sums = add_reduce_keepdims(&matrix, 1, true).unwrap();
        assert_eq!(row_sums.shape(), &[2, 1]);
        let got = row_sums.as_slice().unwrap();
        assert_eq!(got, &[6.0, 15.0]);
    }

    #[test]
    fn add_reduce_keepdims_true_preserves_col_axis() {
        // (2,3) + axis=0 + keepdims=true → (1,3) with column sums.
        let cells = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let column_sums = add_reduce_keepdims(&matrix, 0, true).unwrap();
        assert_eq!(column_sums.shape(), &[1, 3]);
        let got = column_sums.as_slice().unwrap();
        assert_eq!(got, &[5.0, 7.0, 9.0]);
    }

    #[test]
    fn add_reduce_keepdims_false_matches_legacy_add_reduce() {
        // keepdims=false must agree with plain `add_reduce` in shape and data.
        let cells = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let baseline = add_reduce(&matrix, 1).unwrap();
        let without_keepdims = add_reduce_keepdims(&matrix, 1, false).unwrap();
        assert_eq!(baseline.shape(), without_keepdims.shape());
        let baseline_data = baseline.as_slice().unwrap();
        let candidate_data = without_keepdims.as_slice().unwrap();
        assert_eq!(baseline_data, candidate_data);
    }

    #[test]
    fn add_reduce_axes_two_axes_3d() {
        // (2, 3, 4) reducing axes (0, 2) → length-3 result.
        use ferray_core::dimension::Ix3;
        let cells: Vec<f64> = (0..24).map(f64::from).collect();
        let cube = Array::<f64, Ix3>::from_vec(Ix3::new([2, 3, 4]), cells).unwrap();
        let reduced = add_reduce_axes(&cube, &[0, 2], false).unwrap();
        assert_eq!(reduced.shape(), &[3]);

        // For each j in 0..3: sum_{i,k} (i*12 + j*4 + k), where the flat
        // index of element (i, j, k) doubles as its value.
        let mut expected: Vec<f64> = Vec::with_capacity(3);
        for j in 0..3_i32 {
            let mut total = 0.0;
            for i in 0..2_i32 {
                for k in 0..4_i32 {
                    total += f64::from(i * 12 + j * 4 + k);
                }
            }
            expected.push(total);
        }
        assert_eq!(reduced.as_slice().unwrap(), expected.as_slice());
    }

    #[test]
    fn add_reduce_axes_keepdims_preserves_rank() {
        // With keepdims=true the reduced axes 0 and 2 stay as length-1 axes.
        use ferray_core::dimension::Ix3;
        let cells: Vec<f64> = (0..24).map(f64::from).collect();
        let cube = Array::<f64, Ix3>::from_vec(Ix3::new([2, 3, 4]), cells).unwrap();
        let reduced = add_reduce_axes(&cube, &[0, 2], true).unwrap();
        assert_eq!(reduced.shape(), &[1, 3, 1]);
    }

    #[test]
    fn add_reduce_axes_all_axes_collapses_to_scalar_array() {
        // Reducing over both axes leaves a one-element array holding the total.
        let cells = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let total = add_reduce_axes(&matrix, &[0, 1], false).unwrap();
        assert_eq!(total.shape(), &[1]);
        let got = total.as_slice().unwrap();
        assert_eq!(got, &[21.0]);
    }

    #[test]
    fn add_reduce_all_returns_scalar_sum() {
        // 1 + 2 + ... + 6 == 21, returned as a bare scalar.
        let cells = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let total = add_reduce_all(&matrix);
        let error = (total - 21.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn add_reduce_all_integer_input_works() {
        // Multi-axis reductions must work for integer Element types too.
        let cells = vec![1, 2, 3, 4, 5, 6];
        let matrix: Array<i32, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let total = add_reduce_all(&matrix);
        assert_eq!(total, 21);
    }

    // ---- nan-aware reductions (#388) ----

    #[test]
    fn nan_add_reduce_all_skips_nans() {
        // Only the finite entries count: 1 + 3 + 5 == 9.
        let input = arr1(vec![1.0, f64::NAN, 3.0, f64::NAN, 5.0]);
        let total = nan_add_reduce_all(&input);
        let error = (total - 9.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn nan_add_reduce_all_nans_only_returns_zero() {
        // With every element skipped, the sum is expected to be zero.
        let input = arr1(vec![f64::NAN, f64::NAN]);
        let total = nan_add_reduce_all(&input);
        let error = (total - 0.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn nan_add_reduce_axis_skips_nans_per_row() {
        // (2, 3) with row-1 having a NaN; reduce axis=1 → row sums.
        let cells = vec![1.0, 2.0, 3.0, 4.0, f64::NAN, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let row_sums = nan_add_reduce(&matrix, 1, false).unwrap();
        assert_eq!(row_sums.shape(), &[2]);
        let got = row_sums.as_slice().unwrap();
        let (top, bottom) = (got[0], got[1]);
        assert!((top - 6.0).abs() < 1e-12);
        assert!((bottom - 10.0).abs() < 1e-12);
    }

    #[test]
    fn nan_add_reduce_axes_multi_axis_skips_nans() {
        use ferray_core::dimension::Ix3;
        // (2, 2, 2) with one NaN; reduce axes (0, 2).
        let cells = vec![1.0, 2.0, 3.0, 4.0, f64::NAN, 6.0, 7.0, 8.0];
        let cube = Array::<f64, Ix3>::from_vec(Ix3::new([2, 2, 2]), cells).unwrap();
        let reduced = nan_add_reduce_axes(&cube, &[0, 2], false).unwrap();
        assert_eq!(reduced.shape(), &[2]);
        let got = reduced.as_slice().unwrap();
        // For j=0: sum(1, 2, NaN→0, 6) = 9.0
        assert!((got[0] - 9.0).abs() < 1e-12);
        // For j=1: sum(3, 4, 7, 8) = 22.0
        assert!((got[1] - 22.0).abs() < 1e-12);
    }

    #[test]
    fn nan_multiply_reduce_all_skips_nans() {
        // Only the finite entries count: 2 * 3 * 4 == 24.
        let input = arr1(vec![2.0, f64::NAN, 3.0, f64::NAN, 4.0]);
        let product = nan_multiply_reduce_all(&input);
        let error = (product - 24.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn nan_multiply_reduce_all_nans_only_returns_one() {
        // With every element skipped, the product is expected to be one.
        let input = arr1(vec![f64::NAN, f64::NAN]);
        let product = nan_multiply_reduce_all(&input);
        let error = (product - 1.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn nan_multiply_reduce_axis_per_row() {
        // Row products along axis 1; the NaN in row 1 acts as a factor of 1.
        let cells = vec![2.0, 3.0, 4.0, 5.0, f64::NAN, 6.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let row_products = nan_multiply_reduce(&matrix, 1, false).unwrap();
        let got = row_products.as_slice().unwrap();
        let (top, bottom) = (got[0], got[1]);
        assert!((top - 24.0).abs() < 1e-12); // 2*3*4
        assert!((bottom - 30.0).abs() < 1e-12); // 5*1*6
    }

    #[test]
    fn nan_max_reduce_all_skips_nans() {
        // The largest finite entry is 5.
        let input = arr1(vec![1.0, f64::NAN, 3.0, f64::NAN, 5.0, 2.0]);
        let largest = nan_max_reduce_all(&input);
        let error = (largest - 5.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn nan_max_reduce_all_nans_only_returns_neg_infinity() {
        // With every element skipped, the maximum is expected to be -inf.
        let input = arr1(vec![f64::NAN, f64::NAN]);
        let largest = nan_max_reduce_all(&input);
        let is_negative_infinity = largest.is_infinite() && largest.is_sign_negative();
        assert!(is_negative_infinity);
    }

    #[test]
    fn nan_max_reduce_axis_per_row_with_nans() {
        // Each row carries one NaN; the per-row maxima are 3 and 5.
        let cells = vec![1.0, f64::NAN, 3.0, f64::NAN, 5.0, 4.0];
        let matrix: Array<f64, Ix2> = Array::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let row_maxima = nan_max_reduce(&matrix, 1, false).unwrap();
        let got = row_maxima.as_slice().unwrap();
        let (top, bottom) = (got[0], got[1]);
        assert!((top - 3.0).abs() < 1e-12);
        assert!((bottom - 5.0).abs() < 1e-12);
    }

    #[test]
    fn nan_min_reduce_all_skips_nans() {
        // The smallest finite entry is 1.
        let input = arr1(vec![5.0, f64::NAN, 3.0, f64::NAN, 1.0, 4.0]);
        let smallest = nan_min_reduce_all(&input);
        let error = (smallest - 1.0).abs();
        assert!(error < 1e-12);
    }

    #[test]
    fn nan_min_reduce_all_nans_only_returns_infinity() {
        // With every element skipped, the minimum is expected to be +inf.
        let input = arr1(vec![f64::NAN, f64::NAN]);
        let smallest = nan_min_reduce_all(&input);
        let is_positive_infinity = smallest.is_infinite() && smallest.is_sign_positive();
        assert!(is_positive_infinity);
    }

    #[test]
    fn nan_min_reduce_axis_per_row_with_nans() {
        // Rows [5, NaN, 3] and [NaN, 5, 4], reduced along axis 1:
        // the expected per-row minima are 3 and 4.
        let cells = vec![5.0, f64::NAN, 3.0, f64::NAN, 5.0, 4.0];
        let grid = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let reduced = nan_min_reduce(&grid, 1, false).unwrap();
        let minima = reduced.as_slice().unwrap();
        let expected: [f64; 2] = [3.0, 4.0];
        for (row, want) in expected.iter().enumerate() {
            assert!((minima[row] - *want).abs() < 1e-12);
        }
    }

    #[test]
    fn nan_reductions_with_no_nans_match_regular_reductions() {
        // NaN-free input: each nan-aware reduction is checked against the
        // hand-computed value an ordinary reduction would produce.
        let clean = arr1(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        let total = nan_add_reduce_all(&clean);
        assert!((total - 15.0).abs() < 1e-12);

        let product = nan_multiply_reduce_all(&clean);
        assert!((product - 120.0).abs() < 1e-12);

        let largest = nan_max_reduce_all(&clean);
        assert!((largest - 5.0).abs() < 1e-12);

        let smallest = nan_min_reduce_all(&clean);
        assert!((smallest - 1.0).abs() < 1e-12);
    }

    #[test]
    fn nan_add_reduce_keepdims_preserves_axis() {
        // Reducing a (2, 3) array along axis 1 with keepdims = true must
        // leave a length-1 axis behind, giving shape (2, 1).
        let cells = vec![1.0, 2.0, 3.0, 4.0, f64::NAN, 6.0];
        let grid = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), cells).unwrap();
        let kept = nan_add_reduce(&grid, 1, true).unwrap();
        assert_eq!(kept.shape(), &[2, 1]);
    }

    #[test]
    fn test_multiply_outer_ac3() {
        // AC-3: the outer product of a length-3 and a length-2 vector is a
        // (3, 2) array whose (i, j) entry is lhs[i] * rhs[j].
        let lhs = arr1(vec![1.0, 2.0, 3.0]);
        let rhs = arr1(vec![4.0, 5.0]);
        let outer = multiply_outer(&lhs, &rhs).unwrap();
        assert_eq!(outer.shape(), &[3, 2]);
        assert_eq!(
            outer.iter().copied().collect::<Vec<f64>>(),
            vec![4.0, 5.0, 8.0, 10.0, 12.0, 15.0]
        );
    }

    #[test]
    fn test_nancumsum() {
        // The NaN at index 1 is treated as 0, so the running sum stays at 1
        // there and then continues: 1, 1, 4, 8.
        let input = arr1(vec![1.0, f64::NAN, 3.0, 4.0]);
        let summed = nancumsum(&input, None).unwrap();
        let running = summed.as_slice().unwrap();
        let expected: [f64; 4] = [1.0, 1.0, 4.0, 8.0];
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(running[idx], *want);
        }
    }

    #[test]
    fn test_nancumprod() {
        // The NaN at index 1 is treated as 1, so the running product stays
        // at 1 there and then continues: 1, 1, 3, 12.
        let input = arr1(vec![1.0, f64::NAN, 3.0, 4.0]);
        let multiplied = nancumprod(&input, None).unwrap();
        let running = multiplied.as_slice().unwrap();
        let expected: [f64; 4] = [1.0, 1.0, 3.0, 12.0];
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(running[idx], *want);
        }
    }

    #[test]
    fn test_add_broadcast() {
        // Cross-rank broadcast: a (2, 1) Ix2 column plus a (3,) Ix1 vector
        // must produce a (2, 3) result.
        let column = Array::<f64, Ix2>::from_vec(Ix2::new([2, 1]), vec![1.0, 2.0]).unwrap();
        let vector = Array::<f64, Ix1>::from_vec(Ix1::new([3]), vec![10.0, 20.0, 30.0]).unwrap();
        let summed = add_broadcast(&column, &vector).unwrap();
        assert_eq!(summed.shape(), &[2, 3]);
    }

    #[test]
    fn test_divmod() {
        // Expected pairs: divmod(7, 3) = (2, 1) and divmod(-7, 3) = (-3, 2).
        let dividend = arr1(vec![7.0, -7.0]);
        let divisor = arr1(vec![3.0, 3.0]);
        let (quotient, remainder_arr) = divmod(&dividend, &divisor).unwrap();
        assert_eq!(quotient.as_slice().unwrap(), &[2.0, -3.0]);

        let rem_vals = remainder_arr.as_slice().unwrap();
        let first_err = (rem_vals[0] - 1.0).abs();
        assert!(first_err < 1e-12);
        let second_err = (rem_vals[1] - 2.0).abs();
        assert!(second_err < 1e-12);
    }

    #[test]
    fn test_positive() {
        // `positive` must hand back the values unchanged, sign included.
        let input = arr1(vec![-1.0, 2.0]);
        let output = positive(&input).unwrap();
        let values = output.as_slice().unwrap();
        assert_eq!(values, &[-1.0, 2.0]);
    }

    #[test]
    fn test_true_divide() {
        // Element-wise quotients 10/3 and 20/7, compared against the same
        // f64 divisions computed directly.
        let numer = arr1(vec![10.0, 20.0]);
        let denom = arr1(vec![3.0, 7.0]);
        let quotient = true_divide(&numer, &denom).unwrap();
        let got = quotient.as_slice().unwrap();
        let expected: [f64; 2] = [10.0 / 3.0, 20.0 / 7.0];
        for (idx, want) in expected.iter().enumerate() {
            assert!((got[idx] - *want).abs() < 1e-12);
        }
    }

    // -----------------------------------------------------------------------
    // Broadcasting tests for arithmetic ops (issue #379)
    // -----------------------------------------------------------------------

    #[test]
    fn test_add_broadcasts_within_same_rank() {
        // Both operands are Ix2: a (3, 1) column and a (1, 4) row combine
        // into a (3, 4) grid where entry (i, j) = column[i] + row[j].
        let column = Array::<f64, Ix2>::from_vec(Ix2::new([3, 1]), vec![1.0, 2.0, 3.0]).unwrap();
        let row_data = vec![10.0, 20.0, 30.0, 40.0];
        let row = Array::<f64, Ix2>::from_vec(Ix2::new([1, 4]), row_data).unwrap();

        let summed = add(&column, &row).unwrap();
        assert_eq!(summed.shape(), &[3, 4]);

        let flat: Vec<_> = summed.iter().copied().collect();
        assert_eq!(
            flat,
            vec![
                11.0, 21.0, 31.0, 41.0, 12.0, 22.0, 32.0, 42.0, 13.0, 23.0, 33.0, 43.0,
            ]
        );
    }

    #[test]
    fn test_subtract_broadcasts() {
        // A (1, 3) row is subtracted from every row of a (2, 3) array.
        let minuend_data = vec![10.0, 20.0, 30.0, 40.0, 50.0, 60.0];
        let minuend = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), minuend_data).unwrap();
        let subtrahend =
            Array::<f64, Ix2>::from_vec(Ix2::new([1, 3]), vec![1.0, 2.0, 3.0]).unwrap();

        let difference = subtract(&minuend, &subtrahend).unwrap();
        assert_eq!(difference.shape(), &[2, 3]);

        let flat: Vec<_> = difference.iter().copied().collect();
        assert_eq!(flat, vec![9.0, 18.0, 27.0, 39.0, 48.0, 57.0]);
    }

    #[test]
    fn test_multiply_broadcasts() {
        // Integer operands: (3, 1) column times (1, 3) row gives a (3, 3)
        // multiplication table.
        let column = Array::<i32, Ix2>::from_vec(Ix2::new([3, 1]), vec![1, 2, 3]).unwrap();
        let row = Array::<i32, Ix2>::from_vec(Ix2::new([1, 3]), vec![10, 20, 30]).unwrap();

        let table = multiply(&column, &row).unwrap();
        assert_eq!(table.shape(), &[3, 3]);

        let flat: Vec<_> = table.iter().copied().collect();
        assert_eq!(flat, vec![10, 20, 30, 20, 40, 60, 30, 60, 90]);
    }

    #[test]
    fn test_divide_broadcasts() {
        // Every row of a (2, 3) array is divided by the same (1, 3) row.
        let numer_data = vec![10.0, 20.0, 30.0, 40.0, 50.0, 60.0];
        let numer = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), numer_data).unwrap();
        let denom = Array::<f64, Ix2>::from_vec(Ix2::new([1, 3]), vec![10.0, 5.0, 2.0]).unwrap();

        let quotient = divide(&numer, &denom).unwrap();
        assert_eq!(quotient.shape(), &[2, 3]);

        let flat: Vec<_> = quotient.iter().copied().collect();
        assert_eq!(flat, vec![1.0, 4.0, 15.0, 4.0, 10.0, 30.0]);
    }

    #[test]
    fn test_power_broadcasts() {
        // (3, 1) bases raised to (1, 3) exponents: entry (i, j) is
        // base[i] ^ exponent[j].
        let base_col = Array::<f64, Ix2>::from_vec(Ix2::new([3, 1]), vec![2.0, 3.0, 4.0]).unwrap();
        let exp_row = Array::<f64, Ix2>::from_vec(Ix2::new([1, 3]), vec![1.0, 2.0, 3.0]).unwrap();

        let powers = power(&base_col, &exp_row).unwrap();
        assert_eq!(powers.shape(), &[3, 3]);

        let flat: Vec<_> = powers.iter().copied().collect();
        assert_eq!(flat, vec![2.0, 4.0, 8.0, 3.0, 9.0, 27.0, 4.0, 16.0, 64.0]);
    }

    #[test]
    fn test_remainder_broadcasts() {
        // Each row of the (2, 3) dividend is reduced modulo the (1, 3) row
        // [3, 4, 5].
        let dividend_data = vec![7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
        let dividend = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), dividend_data).unwrap();
        let modulus = Array::<f64, Ix2>::from_vec(Ix2::new([1, 3]), vec![3.0, 4.0, 5.0]).unwrap();

        let rem = remainder(&dividend, &modulus).unwrap();
        assert_eq!(rem.shape(), &[2, 3]);

        let flat: Vec<_> = rem.iter().copied().collect();
        assert_eq!(flat, vec![1.0, 0.0, 4.0, 1.0, 3.0, 2.0]);
    }

    #[test]
    fn test_divmod_broadcasts() {
        // `divmod` is typed on a single dimension type, so both operands are
        // built as Ix2: a (2, 1) column against a (1, 3) row -> (2, 3).
        let a = Array::<f64, Ix2>::from_vec(Ix2::new([2, 1]), vec![7.0, 13.0]).unwrap();
        let b = Array::<f64, Ix2>::from_vec(Ix2::new([1, 3]), vec![2.0, 3.0, 4.0]).unwrap();
        let (q, r) = divmod(&a, &b).unwrap();
        assert_eq!(q.shape(), &[2, 3]);
        assert_eq!(r.shape(), &[2, 3]);
        // a broadcasts to [[7,7,7],[13,13,13]], b broadcasts to [[2,3,4],[2,3,4]]
        // divmod(7,2)=(3,1), divmod(7,3)=(2,1), divmod(7,4)=(1,3)
        // divmod(13,2)=(6,1), divmod(13,3)=(4,1), divmod(13,4)=(3,1)
        let q_vec: Vec<f64> = q.iter().copied().collect();
        let r_vec: Vec<f64> = r.iter().copied().collect();
        assert_eq!(q_vec, vec![3.0, 2.0, 1.0, 6.0, 4.0, 3.0]);
        assert_eq!(r_vec, vec![1.0, 1.0, 3.0, 1.0, 1.0, 1.0]);
    }

    #[test]
    fn test_gcd_int_broadcasts() {
        // (3, 1) column [12, 18, 24] against (1, 2) row [8, 9]:
        //   gcd(12,8)=4  gcd(12,9)=3
        //   gcd(18,8)=2  gcd(18,9)=9
        //   gcd(24,8)=8  gcd(24,9)=3
        let column = Array::<i32, Ix2>::from_vec(Ix2::new([3, 1]), vec![12, 18, 24]).unwrap();
        let row = Array::<i32, Ix2>::from_vec(Ix2::new([1, 2]), vec![8, 9]).unwrap();

        let divisors = gcd_int(&column, &row).unwrap();
        assert_eq!(divisors.shape(), &[3, 2]);

        let flat: Vec<_> = divisors.iter().copied().collect();
        assert_eq!(flat, vec![4, 3, 2, 9, 8, 3]);
    }

    #[test]
    fn test_lcm_int_broadcasts() {
        // (2, 1) column [4, 6] against (1, 2) row [6, 8]:
        //   lcm(4,6)=12  lcm(4,8)=8
        //   lcm(6,6)=6   lcm(6,8)=24
        let column = Array::<i32, Ix2>::from_vec(Ix2::new([2, 1]), vec![4, 6]).unwrap();
        let row = Array::<i32, Ix2>::from_vec(Ix2::new([1, 2]), vec![6, 8]).unwrap();

        let multiples = lcm_int(&column, &row).unwrap();
        assert_eq!(multiples.shape(), &[2, 2]);

        let flat: Vec<_> = multiples.iter().copied().collect();
        assert_eq!(flat, vec![12, 8, 6, 24]);
    }

    #[test]
    fn test_add_incompatible_shapes_errors() {
        // Lengths 3 and 4 cannot be broadcast together, so `add` must
        // return an error.
        let three = arr1(vec![1.0, 2.0, 3.0]);
        let four = arr1(vec![1.0, 2.0, 3.0, 4.0]);
        let outcome = add(&three, &four);
        assert!(outcome.is_err());
    }

    #[cfg(feature = "f16")]
    mod f16_tests {
        use super::*;

        /// Builds a 1-D `half::f16` array by converting each `f32` input.
        fn arr1_f16(data: &[f32]) -> Array<half::f16, Ix1> {
            let halves = data
                .iter()
                .copied()
                .map(half::f16::from_f32)
                .collect::<Vec<half::f16>>();
            let shape = Ix1::new([halves.len()]);
            Array::from_vec(shape, halves).unwrap()
        }

        /// Absolute distance between an `f16` value (widened to `f32`) and `want`.
        fn gap(got: half::f16, want: f32) -> f32 {
            (got.to_f32() - want).abs()
        }

        #[test]
        fn test_add_f16() {
            let lhs = arr1_f16(&[1.0, 2.0, 3.0]);
            let rhs = arr1_f16(&[4.0, 5.0, 6.0]);
            let summed = add_f16(&lhs, &rhs).unwrap();
            let got = summed.as_slice().unwrap();
            assert!(gap(got[0], 5.0) < 0.01);
            assert!(gap(got[1], 7.0) < 0.01);
            assert!(gap(got[2], 9.0) < 0.01);
        }

        #[test]
        fn test_multiply_f16() {
            let lhs = arr1_f16(&[2.0, 3.0]);
            let rhs = arr1_f16(&[4.0, 5.0]);
            let product = multiply_f16(&lhs, &rhs).unwrap();
            let got = product.as_slice().unwrap();
            assert!(gap(got[0], 8.0) < 0.01);
            // The second element is checked with a looser tolerance.
            assert!(gap(got[1], 15.0) < 0.1);
        }

        #[test]
        fn test_sqrt_f16() {
            let squares = arr1_f16(&[1.0, 4.0, 9.0, 16.0]);
            let roots = sqrt_f16(&squares).unwrap();
            let got = roots.as_slice().unwrap();
            assert!(gap(got[0], 1.0) < 0.01);
            assert!(gap(got[1], 2.0) < 0.01);
            assert!(gap(got[2], 3.0) < 0.01);
            assert!(gap(got[3], 4.0) < 0.01);
        }

        #[test]
        fn test_absolute_f16() {
            let signed = arr1_f16(&[-1.0, 2.0, -3.0]);
            let magnitudes = absolute_f16(&signed).unwrap();
            let got = magnitudes.as_slice().unwrap();
            assert!(gap(got[0], 1.0) < 0.01);
            assert!(gap(got[1], 2.0) < 0.01);
            assert!(gap(got[2], 3.0) < 0.01);
        }

        #[test]
        fn test_power_f16() {
            // 2^3 = 8 and 3^2 = 9, both checked with a 0.1 tolerance.
            let bases = arr1_f16(&[2.0, 3.0]);
            let exponents = arr1_f16(&[3.0, 2.0]);
            let powers = power_f16(&bases, &exponents).unwrap();
            let got = powers.as_slice().unwrap();
            assert!(gap(got[0], 8.0) < 0.1);
            assert!(gap(got[1], 9.0) < 0.1);
        }

        #[test]
        fn test_divide_f16() {
            let numer = arr1_f16(&[10.0, 20.0]);
            let denom = arr1_f16(&[2.0, 4.0]);
            let quotient = divide_f16(&numer, &denom).unwrap();
            let got = quotient.as_slice().unwrap();
            assert!(gap(got[0], 5.0) < 0.01);
            assert!(gap(got[1], 5.0) < 0.01);
        }
    }

    // -----------------------------------------------------------------------
    // In-place (_into) variants (issue #378)
    // -----------------------------------------------------------------------

    mod into_tests {
        use super::*;
        use ferray_core::Array;
        use ferray_core::dimension::Ix1;

        /// Builds a 1-D `f64` array holding a copy of `data`.
        fn arr(data: &[f64]) -> Array<f64, Ix1> {
            Array::<f64, Ix1>::from_vec(Ix1::new([data.len()]), data.to_vec()).unwrap()
        }

        /// Asserts that `expected` and `actual` have the same length and agree
        /// element-wise to within `tol`.
        ///
        /// The explicit length check matters: `zip` stops at the shorter
        /// input, so without it a truncated result would pass unnoticed.
        fn assert_slices_close(expected: &[f64], actual: &[f64], tol: f64) {
            assert_eq!(expected.len(), actual.len(), "slice lengths differ");
            for (&x, &y) in expected.iter().zip(actual) {
                assert!((x - y).abs() < tol, "{x} and {y} differ by at least {tol}");
            }
        }

        #[test]
        fn add_into_writes_result() {
            let a = arr(&[1.0, 2.0, 3.0]);
            let b = arr(&[10.0, 20.0, 30.0]);
            let mut out = arr(&[0.0, 0.0, 0.0]);
            add_into(&a, &b, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[11.0, 22.0, 33.0]);
        }

        #[test]
        fn subtract_into_writes_result() {
            let a = arr(&[10.0, 20.0, 30.0]);
            let b = arr(&[1.0, 2.0, 3.0]);
            let mut out = arr(&[0.0, 0.0, 0.0]);
            subtract_into(&a, &b, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[9.0, 18.0, 27.0]);
        }

        #[test]
        fn multiply_into_writes_result() {
            let a = arr(&[1.0, 2.0, 3.0]);
            let b = arr(&[4.0, 5.0, 6.0]);
            let mut out = arr(&[0.0; 3]);
            multiply_into(&a, &b, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[4.0, 10.0, 18.0]);
        }

        #[test]
        fn divide_into_writes_result() {
            let a = arr(&[10.0, 20.0, 30.0]);
            let b = arr(&[2.0, 4.0, 6.0]);
            let mut out = arr(&[0.0; 3]);
            divide_into(&a, &b, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[5.0, 5.0, 5.0]);
        }

        #[test]
        fn add_into_shape_mismatch_errors() {
            // Operand lengths 3 and 2 do not match.
            let a = arr(&[1.0, 2.0, 3.0]);
            let b = arr(&[1.0, 2.0]);
            let mut out = arr(&[0.0, 0.0, 0.0]);
            assert!(add_into(&a, &b, &mut out).is_err());
        }

        #[test]
        fn add_into_out_shape_mismatch_errors() {
            // Operands agree (length 3) but the output buffer is too short.
            let a = arr(&[1.0, 2.0, 3.0]);
            let b = arr(&[4.0, 5.0, 6.0]);
            let mut out = arr(&[0.0, 0.0]); // wrong size
            assert!(add_into(&a, &b, &mut out).is_err());
        }

        #[test]
        fn sqrt_into_writes_result() {
            let a = arr(&[1.0, 4.0, 9.0, 16.0]);
            let mut out = arr(&[0.0; 4]);
            sqrt_into(&a, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[1.0, 2.0, 3.0, 4.0]);
        }

        #[test]
        fn square_into_writes_result() {
            let a = arr(&[1.0, -2.0, 3.0, -4.0]);
            let mut out = arr(&[0.0; 4]);
            square_into(&a, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[1.0, 4.0, 9.0, 16.0]);
        }

        #[test]
        fn absolute_into_writes_result() {
            let a = arr(&[-1.0, 2.0, -3.0]);
            let mut out = arr(&[0.0; 3]);
            absolute_into(&a, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[1.0, 2.0, 3.0]);
        }

        #[test]
        fn negative_into_writes_result() {
            let a = arr(&[1.0, -2.0, 3.0]);
            let mut out = arr(&[0.0; 3]);
            negative_into(&a, &mut out).unwrap();
            assert_eq!(out.as_slice().unwrap(), &[-1.0, 2.0, -3.0]);
        }

        #[test]
        fn into_variants_are_chainable_no_alloc() {
            // A realistic pattern: apply a pipeline in-place over and over,
            // ping-ponging between two pre-built buffers instead of creating
            // a fresh output array on every step.
            let mut state = arr(&[1.0, 2.0, 3.0, 4.0]);
            let ones = arr(&[1.0; 4]);
            let mut scratch = arr(&[0.0; 4]);
            for _ in 0..100 {
                add_into(&state, &ones, &mut scratch).unwrap();
                // The freshly written buffer becomes the next input.
                std::mem::swap(&mut state, &mut scratch);
            }
            // After 100 increments of 1: [101, 102, 103, 104]
            assert_eq!(state.as_slice().unwrap(), &[101.0, 102.0, 103.0, 104.0]);
        }

        #[test]
        fn exp_into_matches_exp() {
            use crate::ops::explog::{exp, exp_into};
            let a = arr(&[0.0, 1.0, 2.0]);
            let expected = exp(&a).unwrap();
            let mut out = arr(&[0.0; 3]);
            exp_into(&a, &mut out).unwrap();
            assert_slices_close(expected.as_slice().unwrap(), out.as_slice().unwrap(), 1e-14);
        }

        #[test]
        fn sin_into_matches_sin() {
            use crate::ops::trig::{sin, sin_into};
            let a = arr(&[0.0, std::f64::consts::FRAC_PI_2, std::f64::consts::PI]);
            let expected = sin(&a).unwrap();
            let mut out = arr(&[0.0; 3]);
            sin_into(&a, &mut out).unwrap();
            assert_slices_close(expected.as_slice().unwrap(), out.as_slice().unwrap(), 1e-14);
        }

        #[test]
        fn cos_into_matches_cos() {
            use crate::ops::trig::{cos, cos_into};
            let a = arr(&[0.0, std::f64::consts::FRAC_PI_2, std::f64::consts::PI]);
            let expected = cos(&a).unwrap();
            let mut out = arr(&[0.0; 3]);
            cos_into(&a, &mut out).unwrap();
            assert_slices_close(expected.as_slice().unwrap(), out.as_slice().unwrap(), 1e-14);
        }

        #[test]
        fn log_into_matches_log() {
            use crate::ops::explog::{log, log_into};
            let a = arr(&[1.0, std::f64::consts::E, 10.0]);
            let expected = log(&a).unwrap();
            let mut out = arr(&[0.0; 3]);
            log_into(&a, &mut out).unwrap();
            assert_slices_close(expected.as_slice().unwrap(), out.as_slice().unwrap(), 1e-14);
        }
    }

    // ---- complex arithmetic (#869) -------------------------------------
    //
    // Expected values are hand-computed from the complex algebra and
    // cross-checked against the live numpy 2.4.5 oracle (R-CHAR-3):
    //   (1+2j)+(2+0j) = 3+2j      (1+2j)-(2+0j) = -1+2j
    //   (1+2j)*(2+0j) = 2+4j      (1+2j)/(2+0j) = 0.5+1j
    //   (1+2j)*(3+4j) = -5+10j
    //   (1+2j)**2     = -3+4j     (1+2j)**0.5   = 1.27201965+0.78615138j
    //   (1+2j)**(1+1j)= -0.24720004+0.69645049j
    mod complex_arith {
        use super::*;
        use num_complex::{Complex32, Complex64};

        /// Wraps `data` in a 1-D `Complex64` array of matching length.
        fn c64(data: Vec<Complex64>) -> FerrayResult<Array<Complex64, Ix1>> {
            let shape = Ix1::new([data.len()]);
            Array::from_vec(shape, data)
        }

        /// Wraps `data` in a 1-D `Complex32` array of matching length.
        fn c32(data: Vec<Complex32>) -> FerrayResult<Array<Complex32, Ix1>> {
            let shape = Ix1::new([data.len()]);
            Array::from_vec(shape, data)
        }

        /// One-element `Complex64` array holding `re + im*j`.
        fn single64(re: f64, im: f64) -> FerrayResult<Array<Complex64, Ix1>> {
            c64(vec![Complex64::new(re, im)])
        }

        /// Leading element of an array; the tests only ever pass singletons.
        fn first<T, D>(a: &Array<T, D>) -> T
        where
            T: Element + Copy,
            D: Dimension,
        {
            let head = a.iter().next();
            *head.expect("test array is non-empty")
        }

        /// Asserts both components of `a` and `b` agree to within 1e-12.
        fn close(a: Complex64, b: Complex64) {
            let re_ok = (a.re - b.re).abs() < 1e-12;
            let im_ok = (a.im - b.im).abs() < 1e-12;
            assert!(re_ok && im_ok, "{a:?} != {b:?}");
        }

        #[test]
        fn complex_add_sub_mul() -> FerrayResult<()> {
            // Operands (1+2j) and (2+0j); all three results are exact.
            let lhs = single64(1.0, 2.0)?;
            let rhs = single64(2.0, 0.0)?;

            let sum = add(&lhs, &rhs)?;
            assert_eq!(first(&sum), Complex64::new(3.0, 2.0));

            let difference = subtract(&lhs, &rhs)?;
            assert_eq!(first(&difference), Complex64::new(-1.0, 2.0));

            let product = multiply(&lhs, &rhs)?;
            assert_eq!(first(&product), Complex64::new(2.0, 4.0));
            Ok(())
        }

        #[test]
        fn complex_mul_cross_term() -> FerrayResult<()> {
            // (1+2j)*(3+4j) = (3-8) + (4+6)j = -5+10j
            let lhs = single64(1.0, 2.0)?;
            let rhs = single64(3.0, 4.0)?;
            let product = multiply(&lhs, &rhs)?;
            assert_eq!(first(&product), Complex64::new(-5.0, 10.0));
            Ok(())
        }

        #[test]
        fn complex_true_divide_keeps_complex() -> FerrayResult<()> {
            let numer = single64(1.0, 2.0)?;
            let denom = single64(2.0, 0.0)?;
            // TrueDivide::Output for Complex<f64> is Complex<f64> (NOT f64),
            // so the quotient can be compared as a Complex64.
            let quotient = divide(&numer, &denom)?;
            close(first(&quotient), Complex64::new(0.5, 1.0));
            Ok(())
        }

        #[test]
        fn complex_divide_by_zero_is_nan_not_panic() -> FerrayResult<()> {
            let numer = single64(2.0, 0.0)?;
            let zero = single64(0.0, 0.0)?;
            let quotient = divide(&numer, &zero)?;
            let v = first(&quotient);
            assert!(!v.is_finite(), "complex /0 should be non-finite, got {v:?}");
            Ok(())
        }

        #[test]
        fn complex_divide_subnormal_divisor_stays_finite() -> FerrayResult<()> {
            // Smith algorithm (numpy cdiv, npy_math_complex.c.src:94): dividing
            // by a near-subnormal value must NOT overflow the way the naive
            // (c²+d²) form in num_complex does. Reference from live numpy 2.4.4:
            // np.divide([2+0j], [6.675e-308+0j])[0] == 2.99625468164794e+307+0j.
            let numer = single64(2.0, 0.0)?;
            let tiny = single64(6.675e-308, 0.0)?;
            let quotient = divide(&numer, &tiny)?;
            let q = first(&quotient);
            assert!(
                q.is_finite(),
                "subnormal-divisor quotient must be finite, got {q:?}"
            );
            let expected = 2.996_254_681_647_94e307;
            let re_within_tol = (q.re - expected).abs() <= expected.abs() * 1e-12;
            let im_is_zero = q.im == 0.0;
            assert!(
                re_within_tol && im_is_zero,
                "{q:?} != numpy {expected:e}+0j"
            );
            Ok(())
        }

        #[test]
        fn complex_divide_normal_case_smith() -> FerrayResult<()> {
            // An ordinary quotient must still come out right under Smith:
            // (1+2j)/(3+4j) = 0.44+0.08j
            // (live numpy: np.divide([1+2j],[3+4j])[0] == (0.44+0.08j)).
            let numer = single64(1.0, 2.0)?;
            let denom = single64(3.0, 4.0)?;
            let quotient = divide(&numer, &denom)?;
            close(first(&quotient), Complex64::new(0.44, 0.08));
            Ok(())
        }

        #[test]
        fn complex_power_int_exponent_exact() -> FerrayResult<()> {
            // (1+2j)**2 == -3+4j EXACTLY (npy_cpow integer-exponent fast path),
            // hence assert_eq! rather than a tolerance check.
            let base = single64(1.0, 2.0)?;
            let exponent = single64(2.0, 0.0)?;
            let squared = power_complex(&base, &exponent)?;
            assert_eq!(first(&squared), Complex64::new(-3.0, 4.0));
            Ok(())
        }

        #[test]
        fn complex_power_zero_and_negative_int() -> FerrayResult<()> {
            let base = single64(1.0, 2.0)?;

            // base**0 == 1
            let zero_exp = single64(0.0, 0.0)?;
            let to_zero = power_complex(&base, &zero_exp)?;
            assert_eq!(first(&to_zero), Complex64::new(1.0, 0.0));

            // base**-1 == 1/base == (1-2j)/5 = 0.2-0.4j
            let minus_one = single64(-1.0, 0.0)?;
            let reciprocal = power_complex(&base, &minus_one)?;
            close(first(&reciprocal), Complex64::new(0.2, -0.4));
            Ok(())
        }

        #[test]
        fn complex_power_float_exponent() -> FerrayResult<()> {
            // (1+2j)**0.5 = 1.27201965+0.78615138j (general powc branch).
            let base = single64(1.0, 2.0)?;
            let exponent = single64(0.5, 0.0)?;
            let root = power_complex(&base, &exponent)?;
            let want = Complex64::new(1.272_019_649_514_069, 0.786_151_377_757_423_3);
            close(first(&root), want);
            Ok(())
        }

        #[test]
        fn complex_power_complex_exponent() -> FerrayResult<()> {
            // (1+2j)**(1+1j) = -0.24720004+0.69645049j.
            let base = single64(1.0, 2.0)?;
            let exponent = single64(1.0, 1.0)?;
            let raised = power_complex(&base, &exponent)?;
            let want = Complex64::new(-0.247_200_044_262_917_22, 0.696_450_487_082_543_2);
            close(first(&raised), want);
            Ok(())
        }

        #[test]
        fn complex_power_zero_base() -> FerrayResult<()> {
            // 0**positive = 0; 0**negative-real is non-finite.
            let zero = single64(0.0, 0.0)?;

            let positive_exp = single64(2.0, 0.0)?;
            let to_positive = power_complex(&zero, &positive_exp)?;
            assert_eq!(first(&to_positive), Complex64::new(0.0, 0.0));

            let negative_exp = single64(-1.0, 0.0)?;
            let to_negative = power_complex(&zero, &negative_exp)?;
            assert!(!first(&to_negative).is_finite());
            Ok(())
        }

        #[test]
        fn complex64_add_and_divide() -> FerrayResult<()> {
            // complex64 (Complex32 here) stays complex64 through add/divide.
            let lhs = c32(vec![Complex32::new(1.0, 2.0)])?;
            let rhs = c32(vec![Complex32::new(1.0, 1.0)])?;
            let sum = add(&lhs, &rhs)?;
            assert_eq!(first(&sum), Complex32::new(2.0, 3.0));

            let two = c32(vec![Complex32::new(2.0, 0.0)])?;
            let quotient = divide(&lhs, &two)?;
            let v = first(&quotient);
            let re_err = (v.re - 0.5).abs();
            let im_err = (v.im - 1.0).abs();
            assert!(re_err < 1e-6 && im_err < 1e-6);
            Ok(())
        }
    }
}