simdeez 3.0.1

SIMD library to abstract over different instruction sets and widths
Documentation
use crate::math::scalar;
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 asin_u35 / acos_u35 / atan_u35
// Why kept:
// - local runtime-selected throughput stays well above native scalar for all three functions
// - the current targeted tests cover near-one fallback boundaries and atan reduction thresholds
// Revisit when:
// - the fallback boundaries or rational approximations change materially

type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64;
const F64_SIGN_MASK: i64 = i64::MIN;

const ASIN_SQRT_FALLBACK_BOUND_BITS: u64 = 0x3FEF_F000_0000_0000;

const FRAC_PI_2: f64 = core::f64::consts::FRAC_PI_2;
const FRAC_PI_4: f64 = core::f64::consts::FRAC_PI_4;
const PI: f64 = core::f64::consts::PI;

const TAN_PI_8: f64 = 0.414_213_562_373_095_03;
const TAN_3PI_8: f64 = 2.414_213_562_373_095;

#[inline(always)]
fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    unsafe {
        let lanes = mask.as_array();
        for lane in 0..V::WIDTH {
            if lanes[lane] != 0 {
                return true;
            }
        }
    }

    false
}

#[inline(always)]
fn patch_exceptional_lanes<V>(
    input: V,
    output: V,
    exceptional_mask: SimdI64<V>,
    scalar_fallback: fn(f64) -> f64,
) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    if !any_lane_nonzero::<V>(exceptional_mask) {
        return output;
    }

    unsafe {
        let input_lanes = input.as_array();
        let mask_lanes = exceptional_mask.as_array();
        let mut output_lanes = output.as_array();

        for lane in 0..V::WIDTH {
            if mask_lanes[lane] != 0 {
                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
            }
        }

        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
    }
}

#[inline(always)]
fn restore_sign<V>(magnitude: V, sign_bits: SimdI64<V>) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    (magnitude.bitcast_i64() ^ sign_bits).bitcast_f64()
}

#[inline(always)]
fn finite_mask<V>(input: V) -> SimdI64<V>
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let exponent_bits = input.bitcast_i64() & SimdI64::<V>::set1(F64_EXPONENT_MASK);
    exponent_bits.cmp_neq(SimdI64::<V>::set1(F64_EXPONENT_MASK))
}

#[inline(always)]
fn atan_poly<V>(z: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let p0 = V::set1(-8.750_608_600_031_904e-1);
    let p1 = V::set1(-1.615_753_718_733_365_2e1);
    let p2 = V::set1(-7.500_855_792_314_705e1);
    let p3 = V::set1(-1.228_866_684_490_136_2e2);
    let p4 = V::set1(-6.485_021_904_942_025e1);

    let q0 = V::set1(2.485_846_490_142_306_4e1);
    let q1 = V::set1(1.650_270_098_316_988_6e2);
    let q2 = V::set1(4.328_810_604_912_903e2);
    let q3 = V::set1(4.853_903_996_359_137e2);
    let q4 = V::set1(1.945_506_571_482_614e2);

    let numerator = (((((p0 * z) + p1) * z) + p2) * z + p3) * z + p4;
    let denominator = (((((z + q0) * z) + q1) * z + q2) * z + q3) * z + q4;
    numerator / denominator
}

#[inline(always)]
fn atan_reduced<V>(reduced: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let z = reduced * reduced;
    reduced + reduced * z * atan_poly(z)
}

#[inline(always)]
fn atan_fast<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let sign_bits = input.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK);
    let abs_input = input.abs();

    let large_mask = abs_input.cmp_gt(V::set1(TAN_3PI_8)).bitcast_i64();
    let medium_base = abs_input.cmp_gt(V::set1(TAN_PI_8)).bitcast_i64();
    let medium_mask = medium_base & large_mask.cmp_eq(SimdI64::<V>::zeroes());
    let large_mask_f = large_mask.bitcast_f64();
    let medium_mask_f = medium_mask.bitcast_f64();

    let mut reduced = abs_input;
    reduced = medium_mask_f.blendv(
        reduced,
        (abs_input - V::set1(1.0)) / (abs_input + V::set1(1.0)),
    );
    reduced = large_mask_f.blendv(reduced, -V::set1(1.0) / abs_input);

    let mut offset = V::zeroes();
    offset = medium_mask_f.blendv(offset, V::set1(FRAC_PI_4));
    offset = large_mask_f.blendv(offset, V::set1(FRAC_PI_2));

    let magnitude = offset + atan_reduced(reduced);
    restore_sign(magnitude, sign_bits)
}

#[inline(always)]
fn asin_exceptional_mask<V>(input: V) -> SimdI64<V>
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let finite = finite_mask(input);
    let abs_input = input.abs();
    let out_of_domain = abs_input.cmp_gt(V::set1(1.0)).bitcast_i64();
    let near_edge = abs_input
        .bitcast_i64()
        .cmp_gt(SimdI64::<V>::set1(ASIN_SQRT_FALLBACK_BOUND_BITS as i64));
    finite.cmp_eq(SimdI64::<V>::zeroes()) | out_of_domain | near_edge
}

#[inline(always)]
fn acos_exceptional_mask<V>(input: V) -> SimdI64<V>
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    asin_exceptional_mask(input)
}

#[inline(always)]
fn asin_acos_core<V>(input: V) -> (V, V)
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let one = V::set1(1.0);
    let half = V::set1(0.5);
    let abs_input = input.abs();
    let sign_bits = input.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK);

    let small_mask = abs_input.cmp_lt(half).bitcast_i64();
    let small_mask_f = small_mask.bitcast_f64();

    let small_t = abs_input / (one - abs_input * abs_input).sqrt();
    let small_asin_mag = atan_fast(small_t);

    let large_t = ((one - abs_input) / (one + abs_input)).sqrt();
    let large_angle = V::set1(2.0) * atan_fast(large_t);
    let large_asin_mag = V::set1(FRAC_PI_2) - large_angle;

    let asin_mag = small_mask_f.blendv(large_asin_mag, small_asin_mag);
    let asin_out = restore_sign(asin_mag, sign_bits);

    let small_acos = V::set1(FRAC_PI_2) - asin_out;
    let large_acos_pos = large_angle;
    let large_acos_neg = V::set1(PI) - large_angle;
    let negative_mask = input.cmp_lt(V::zeroes()).bitcast_i64().bitcast_f64();
    let large_acos = negative_mask.blendv(large_acos_pos, large_acos_neg);
    let acos_out = small_mask_f.blendv(large_acos, small_acos);

    (asin_out, acos_out)
}

#[inline(always)]
pub(crate) fn asin_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let exceptional_mask = asin_exceptional_mask(input);
    let (fast, _) = asin_acos_core(input);
    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asin_u35_f64)
}

#[inline(always)]
pub(crate) fn acos_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let exceptional_mask = acos_exceptional_mask(input);
    let (_, fast) = asin_acos_core(input);
    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acos_u35_f64)
}

#[inline(always)]
pub(crate) fn atan_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let fast = atan_fast(input);
    let exceptional_mask = finite_mask(input).cmp_eq(SimdI64::<V>::zeroes());
    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atan_u35_f64)
}