doubled 0.3.1

Dekker arithmetic
Documentation
use crate::*;
use core::simd::{LaneCount, Simd, SimdFloat, SupportedLaneCount};
use std::simd::StdFloat;

type F64x<const N: usize> = Simd<f64, N>;
type U64x<const N: usize> = Simd<u64, N>;

impl<const N: usize> Upper for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[inline]
    fn upper(self) -> Self {
        Self::from_bits(self.to_bits() & U64x::splat((0x_ffff_ffff << 32) + 0x_f800_0000))
    }
}

impl<const N: usize> FromMask for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    type Mask = U64x<N>;
    fn from_mask(u0: Self::Mask, u1: Self::Mask) -> Self {
        Self::new(F64x::from_bits(u0), F64x::from_bits(u1))
    }
}

impl<const N: usize> core::convert::From<F64x<N>> for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[inline]
    fn from(f: F64x<N>) -> Self {
        Self::new(f, F64x::splat(0.))
    }
}

impl<const N: usize> core::convert::From<Doubled<F64x<N>>> for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[inline]
    fn from(f: Doubled<F64x<N>>) -> Self {
        f.0 + f.1
    }
}

impl<const N: usize> Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[inline]
    pub const fn splat(value: Doubled<f64>) -> Self {
        Self::new(
            F64x::from_array([value.0; N]),
            F64x::from_array([value.1; N]),
        )
    }

    #[inline]
    pub fn abs(self) -> Self {
        Self::new(
            self.0.abs(),
            F64x::from_bits(self.1.to_bits() ^ (self.0.to_bits() & F64x::splat(-0.).to_bits())),
        )
    }

    #[cfg(target_feature = "fma")]
    #[inline]
    pub fn square(self) -> Self {
        let r0 = self.0 * self.0;
        Self::new(
            r0,
            (self.0 + self.0).mul_add(self.1, self.0.mul_add(self.0, -r0)),
        )
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    pub fn square(self) -> Self {
        let xh = self.0.upper();
        let xl = self.0 - xh;
        let r0 = self.0 * self.0;
        Self::new(
            r0,
            xh * xh + (-r0) + (xh + xh) * xl + xl * xl + self.0 * (self.1 + self.1),
        )
    }

    #[cfg(target_feature = "fma")]
    #[inline]
    pub fn square_as_f(self) -> F64x<N> {
        self.0.mul_add(self.0, self.0 * self.1 + self.0 * self.1)
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    pub fn square_as_f(self) -> F64x<N> {
        let xh = self.0.upper();
        let xl = self.0 - xh;

        xh * self.1 + xh * self.1 + xl * xl + (xh * xl + xh * xl) + xh * xh
    }

    #[inline]
    pub fn sqrt(self) -> Self {
        let t = (self.0 + self.1).sqrt();
        ((self + t.mul_as_doubled(t)) * t.recip_as_doubled()).scale(F64x::splat(0.5))
    }

    #[cfg(target_feature = "fma")]
    #[inline]
    pub fn mul_as_f(self, other: Self) -> F64x<N> {
        self.0
            .mul_add(other.0, self.1.mul_add(other.0, self.0 * other.1))
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    pub fn mul_as_f(self, other: Self) -> F64x<N> {
        let xh = self.0.upper();
        let xl = self.0 - xh;
        let yh = other.0.upper();
        let yl = other.0 - yh;

        self.1 * yh + xh * other.1 + xl * yl + xh * yl + xl * yh + xh * yh
    }
    #[cfg(target_feature = "fma")]
    #[inline]
    pub fn recip(self) -> Self {
        let q0 = self.0.recip();
        Self::new(
            q0,
            q0 * (-self.1).mul_add(q0, (-self.0).mul_add(q0, F64x::splat(1.))),
        )
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    pub fn recip(self) -> Doubled<F64x<N>> {
        let t = self.0.recip();
        let dh = self.0.upper();
        let dl = self.0 - dh;
        let th = t.upper();
        let tl = t - th;
        let q0 = t;
        Self::new(
            q0,
            t * (F64x::splat(1.) - dh * th - dh * tl - dl * th - dl * tl - self.1 * t),
        )
    }
}

impl<const N: usize> core::ops::Add<Doubled<F64x<N>>> for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    type Output = Doubled<Self>;
    #[inline]
    fn add(self, other: Doubled<Self>) -> Self::Output {
        let r0 = self + other.0;
        let v = r0 - self;
        Doubled::new(r0, self - (r0 - v) + (other.0 - v) + other.1)
    }
}

impl<const N: usize> core::ops::Mul for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    type Output = Self;
    #[cfg(target_feature = "fma")]
    #[inline]
    fn mul(self, other: Self) -> Self {
        let r0 = self.0 * other.0;
        Self::new(
            r0,
            self.0.mul_add(
                other.1,
                self.1.mul_add(other.0, self.0.mul_add(other.0, -r0)),
            ),
        )
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    fn mul(self, other: Self) -> Self {
        let xh = self.0.upper();
        let xl = self.0 - xh;
        let yh = other.0.upper();
        let yl = other.0 - yh;
        let r0 = self.0 * other.0;
        Self::new(
            r0,
            xh * yh + (-r0) + xl * yh + xh * yl + xl * yl + self.0 * other.1 + self.1 * other.0,
        )
    }
}

impl<const N: usize> core::ops::Mul<F64x<N>> for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    type Output = Self;
    #[cfg(target_feature = "fma")]
    #[inline]
    fn mul(self, other: F64x<N>) -> Self {
        let r0 = self.0 * other;
        Self::new(r0, self.1.mul_add(other, self.0.mul_add(other, -r0)))
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    fn mul(self, other: F64x<N>) -> Self {
        let xh = self.0.upper();
        let xl = self.0 - xh;
        let yh = other.upper();
        let yl = other - yh;
        let r0 = self.0 * other;
        Self::new(
            r0,
            xh * yh + (-r0) + xl * yh + xh * yl + xl * yl + self.1 * other,
        )
    }
}

impl<const N: usize> core::ops::Div for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    type Output = Self;
    #[cfg(target_feature = "fma")]
    #[inline]
    fn div(self, other: Self) -> Self {
        let t = other.0.recip();

        let q0 = self.0 * t;
        let u = t.mul_add(self.0, -q0);
        let mut q1 = (-other.1).mul_add(t, (-other.0).mul_add(t, F64x::splat(1.)));
        q1 = q0.mul_add(q1, self.1.mul_add(t, u));

        Self::new(q0, q1)
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    fn div(self, other: Self) -> Self {
        let t = other.0.recip();
        let dh = other.0.upper();
        let dl = other.0 - dh;
        let th = t.upper();
        let tl = t - th;
        let nhh = self.0.upper();
        let nhl = self.0 - nhh;

        let q0 = self.0 * t;

        let u = nhh * th - q0
            + nhh * tl
            + nhl * th
            + nhl * tl
            + q0 * (F64x::splat(1.) - dh * th - dh * tl - dl * th - dl * tl);

        Self::new(q0, t * (self.1 - q0 * other.1) + u)
    }
}

impl<const N: usize> CheckOrder for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    fn check_order(self, _other: Self) {}
}

impl<const N: usize> CheckOrder<F64x<N>> for Doubled<F64x<N>>
where
    LaneCount<N>: SupportedLaneCount,
{
    fn check_order(self, _other: F64x<N>) {}
}

impl<const N: usize> CheckOrder<Doubled<F64x<N>>> for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    fn check_order(self, _other: Doubled<F64x<N>>) {}
}

impl<const N: usize> CheckOrder for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    fn check_order(self, _other: Self) {}
}

impl<const N: usize> AsDoubled for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[inline]
    fn as_doubled(self) -> Doubled<Self> {
        Doubled::new(self, Self::splat(0.))
    }
}

impl<const N: usize> MulAsDoubled for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[cfg(target_feature = "fma")]
    #[inline]
    fn mul_as_doubled(self, other: Self) -> Doubled<Self> {
        let r0 = self * other;
        Doubled::new(r0, self.mul_add(other, -r0))
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    fn mul_as_doubled(self, other: Self) -> Doubled<Self> {
        let xh = self.upper();
        let xl = self - xh;
        let yh = other.upper();
        let yl = other - yh;
        let r0 = self * other;
        Doubled::new(r0, xh * yh + (-r0) + xl * yh + xh * yl + xl * yl)
    }
}

impl<const N: usize> RecipAsDoubled for F64x<N>
where
    LaneCount<N>: SupportedLaneCount,
{
    #[cfg(target_feature = "fma")]
    #[inline]
    fn recip_as_doubled(self) -> Doubled<Self> {
        let q0 = self.recip();
        Doubled::new(q0, q0 * (-self).mul_add(q0, Self::splat(1.)))
    }
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    fn recip_as_doubled(self) -> Doubled<Self> {
        let t = self.recip();
        let dh = self.upper();
        let dl = self - dh;
        let th = t.upper();
        let tl = t - th;
        let q0 = t;
        Doubled::new(
            q0,
            t * (Self::splat(1.) - dh * th - dh * tl - dl * th - dl * tl),
        )
    }
}