use crate::math::{f64, scalar};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 log10_u35
// Why kept:
// - local runtime-selected performance is clearly better than native scalar
// - the current implementation now builds on the kept portable f64 log2_u35 core
// Revisit when:
// - f64 core log2_u35 changes materially or non-x86 evidence disagrees sharply

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 atan2_u35 / hypot_u35 / fmod
// Why kept:
// - all three are strong local wins over native scalar, especially hypot and fmod
// - targeted adversarial tests already cover signed-zero, scale, and exceptional-lane behavior
// Revisit when:
// - backend-specific behavior diverges or the scalar contracts change materially

type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

// Sign bit of an f64 reinterpreted as an i64 (only the most significant bit set).
const F64_SIGN_MASK: i64 = i64::MIN;
// 2^52: at and above this magnitude an f64 quotient has no fractional bits left,
// so the floor/ceil truncation in the fmod fast path stops being trustworthy.
const FAST_FMOD_MAX_QUOTIENT: f64 = 4_503_599_627_370_496.0;

#[inline(always)]
fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    unsafe {
        let lanes = mask.as_array();
        for lane in 0..V::WIDTH {
            if lanes[lane] != 0 {
                return true;
            }
        }
    }

    false
}

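// Lane-patching strategy shared by the entry points below: compute the vector
// fast path for every lane, then rewrite only the lanes flagged exceptional
// with a per-lane scalar fallback. The next two helpers perform that rewrite
// for unary and binary functions.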
#[inline(always)]
fn patch_unary_exceptional_lanes<V>(
    input: V,
    output: V,
    exceptional_mask: SimdI64<V>,
    scalar_fallback: fn(f64) -> f64,
) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    if !any_lane_nonzero::<V>(exceptional_mask) {
        return output;
    }

    unsafe {
        let input_lanes = input.as_array();
        let mask_lanes = exceptional_mask.as_array();
        let mut output_lanes = output.as_array();

        for lane in 0..V::WIDTH {
            if mask_lanes[lane] != 0 {
                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
            }
        }

        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
    }
}

#[inline(always)]
fn patch_binary_exceptional_lanes<V>(
    lhs: V,
    rhs: V,
    output: V,
    exceptional_mask: SimdI64<V>,
    scalar_fallback: fn(f64, f64) -> f64,
) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    if !any_lane_nonzero::<V>(exceptional_mask) {
        return output;
    }

    unsafe {
        let lhs_lanes = lhs.as_array();
        let rhs_lanes = rhs.as_array();
        let mask_lanes = exceptional_mask.as_array();
        let mut output_lanes = output.as_array();

        for lane in 0..V::WIDTH {
            if mask_lanes[lane] != 0 {
                output_lanes[lane] = scalar_fallback(lhs_lanes[lane], rhs_lanes[lane]);
            }
        }

        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
    }
}

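// Mask of lanes that are finite, i.e. neither NaN (which fails x == x) nor
// +/-infinity.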
#[inline(always)]
fn finite_mask<V>(input: V) -> SimdI64<V>
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    input.cmp_eq(input).bitcast_i64() & input.abs().cmp_neq(V::set1(f64::INFINITY)).bitcast_i64()
}

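// Fast path: log10(x) = log2_u35(x) * log10(2). Lanes that are not strictly
// positive and finite are recomputed with the scalar fallback.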
#[inline(always)]
pub(crate) fn log10_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let positive_finite = input.cmp_gt(V::zeroes()).bitcast_i64() & finite_mask(input);
    let fast = f64::log2_u35(input) * V::set1(core::f64::consts::LOG10_2);
    patch_unary_exceptional_lanes(
        input,
        fast,
        positive_finite.cmp_eq(SimdI64::<V>::zeroes()),
        scalar::log10_u35_f64,
    )
}
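
// A minimal scalar sketch (hypothetical test, not part of the crate's suite)
// of the identity the fast path relies on, log10(x) = log2(x) * log10(2);
// `std`'s log2/log10 stand in here for the vector log2_u35 core.
#[cfg(test)]
mod log10_rescaling_sketch {
    #[test]
    fn log2_rescaled_matches_std_log10() {
        for &x in &[0.5_f64, 1.0, 2.0, 10.0, 1e-300, 1e300] {
            let rescaled = x.log2() * core::f64::consts::LOG10_2;
            let reference = x.log10();
            assert!((rescaled - reference).abs() <= 1e-13 * reference.abs().max(1.0));
        }
    }
}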

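// Fast path: fold |y| and |x| so the atan_u35 argument lies in [0, 1], undo
// the fold with pi/2 - z when |y| > |x|, fix the quadrant for negative x, and
// XOR in y's sign bit. Lanes with non-finite input or x == 0 are recomputed
// with the scalar fallback.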
#[inline(always)]
pub(crate) fn atan2_u35<V>(y: V, x: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let abs_y = y.abs();
    let abs_x = x.abs();

    let x_is_zero = abs_x.cmp_eq(V::zeroes()).bitcast_i64();
    let fast_mask = finite_mask(x) & finite_mask(y) & x_is_zero.cmp_eq(SimdI64::<V>::zeroes());
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    let large_mask = abs_y.cmp_gt(abs_x).bitcast_i64();
    let reduced_base = large_mask
        .bitcast_f64()
        .blendv(abs_y / abs_x, abs_x / abs_y);

    let z_atan = f64::atan_u35(reduced_base);

    let pi_over_2 = V::set1(core::f64::consts::FRAC_PI_2);
    let pi = V::set1(core::f64::consts::PI);
    let y_sign = y.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK);
    let x_negative = x.cmp_lt(V::zeroes()).bitcast_i64();

    let base = large_mask.bitcast_f64().blendv(z_atan, pi_over_2 - z_atan);
    let with_x_quadrant = x_negative.bitcast_f64().blendv(base, pi - base);
    let fast = (y_sign ^ with_x_quadrant.bitcast_i64()).bitcast_f64();

    patch_binary_exceptional_lanes(y, x, fast, exceptional_mask, scalar::atan2_u35_f64)
}
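
// A minimal scalar model (hypothetical test, not part of the crate's suite) of
// the reduction above; `model_atan2` is an illustrative helper, not crate API.
// The copysign at the end matches the vector code's sign-bit XOR because
// `quadrant` is always non-negative.
#[cfg(test)]
mod atan2_reduction_sketch {
    fn model_atan2(y: f64, x: f64) -> f64 {
        let (ay, ax) = (y.abs(), x.abs());
        // Fold so the atan argument is in [0, 1].
        let z = if ay > ax { (ax / ay).atan() } else { (ay / ax).atan() };
        // Undo the fold, then fix the quadrant for negative x.
        let base = if ay > ax { core::f64::consts::FRAC_PI_2 - z } else { z };
        let quadrant = if x < 0.0 { core::f64::consts::PI - base } else { base };
        quadrant.copysign(y)
    }

    #[test]
    fn model_matches_std_atan2_on_fast_path_inputs() {
        for &(y, x) in &[(1.0, 2.0), (2.0, 1.0), (1.0, -2.0), (-1.0, -2.0), (-2.0, 1.0)] {
            assert!((model_atan2(y, x) - y.atan2(x)).abs() < 1e-12);
        }
    }
}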

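// Fast path: max * sqrt(1 + (min/max)^2), which cannot overflow for finite
// inputs. Lanes with non-finite input or max == 0 (a 0/0 ratio) fall back to
// scalar.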
#[inline(always)]
pub(crate) fn hypot_u35<V>(x: V, y: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let abs_x = x.abs();
    let abs_y = y.abs();
    let x_gt_y = abs_x.cmp_gt(abs_y);
    let max_xy = x_gt_y.blendv(abs_y, abs_x);
    let min_xy = x_gt_y.blendv(abs_x, abs_y);

    let fast_mask = finite_mask(x) & finite_mask(y) & max_xy.cmp_neq(V::zeroes()).bitcast_i64();
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    let ratio = min_xy / max_xy;
    let fast = max_xy * (V::set1(1.0) + ratio * ratio).sqrt();

    patch_binary_exceptional_lanes(x, y, fast, exceptional_mask, scalar::hypot_u35_f64)
}
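
// A minimal scalar model (hypothetical test, not part of the crate's suite) of
// the scaling trick above; `model_hypot` is an illustrative helper, not crate
// API. A max of zero would produce 0/0 here, which is exactly why the fast
// path excludes that case.
#[cfg(test)]
mod hypot_scaling_sketch {
    fn model_hypot(x: f64, y: f64) -> f64 {
        let (ax, ay) = (x.abs(), y.abs());
        let (max, min) = if ax > ay { (ax, ay) } else { (ay, ax) };
        let ratio = min / max;
        max * (1.0 + ratio * ratio).sqrt()
    }

    #[test]
    fn model_survives_inputs_that_overflow_the_naive_sum() {
        let (x, y) = (1e200_f64, 1e200_f64);
        // The naive form overflows to infinity...
        assert!(!(x * x + y * y).sqrt().is_finite());
        // ...while the scaled form agrees with std's hypot.
        assert!((model_hypot(x, y) - x.hypot(y)).abs() <= 1e-12 * x.hypot(y));
    }
}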

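// Fast path: x - trunc(x / y) * y, with the sign of a zero result taken from
// x. Lanes with non-finite input, y == 0, or |quotient| >= 2^52 fall back to
// scalar.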
#[inline(always)]
pub(crate) fn fmod<V>(x: V, y: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let quotient = x / y;
    let trunc = quotient
        .cmp_lt(V::zeroes())
        .blendv(quotient.floor(), quotient.ceil());
    let fast = x - trunc * y;
    let zero_mask = fast.cmp_eq(V::zeroes()).bitcast_i64();
    let signed_zero = (x.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK)).bitcast_f64();
    let fast = zero_mask.bitcast_f64().blendv(fast, signed_zero);

    let finite_inputs = finite_mask(x) & finite_mask(y);
    let y_nonzero = y.cmp_neq(V::zeroes()).bitcast_i64();
    // finite_mask(x) already rules out an infinite x, so only the quotient
    // magnitude still needs checking before trusting the truncation.
    let small_quotient = quotient
        .abs()
        .cmp_lt(V::set1(FAST_FMOD_MAX_QUOTIENT))
        .bitcast_i64();
    let fast_mask = finite_inputs & y_nonzero & small_quotient;
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    patch_binary_exceptional_lanes(x, y, fast, exceptional_mask, scalar::fmod_f64)
}
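
// A minimal scalar model (hypothetical test, not part of the crate's suite) of
// the fast path above; `model_fmod` is an illustrative helper, not crate API.
// The (-0.0, 3.0) case shows why the signed-zero patch exists: the subtraction
// -0.0 - (-0.0) rounds to +0.0.
#[cfg(test)]
mod fmod_fast_path_sketch {
    fn model_fmod(x: f64, y: f64) -> f64 {
        let quotient = x / y;
        // Truncate toward zero: ceil for negative quotients, floor otherwise.
        let trunc = if quotient < 0.0 { quotient.ceil() } else { quotient.floor() };
        let fast = x - trunc * y;
        if fast == 0.0 { 0.0_f64.copysign(x) } else { fast }
    }

    #[test]
    fn model_matches_rusts_float_remainder() {
        for &(x, y) in &[(5.5, 2.0), (-5.5, 2.0), (5.5, -2.0), (-0.0, 3.0)] {
            // Rust's `%` on f64 has IEEE fmod semantics (sign follows x).
            assert_eq!(model_fmod(x, y).to_bits(), (x % y).to_bits());
        }
    }
}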