tract-linalg 0.23.0-dev.4

Tiny, no-nonsense, self-contained, TensorFlow and ONNX inference
Documentation
use crate::num_traits::Zero;

// Sum kernel for `f32` slices written in AArch64 SIMD assembly, declared under
// the name `arm64simd_sum_f32_16n` through `reduce_impl_wrap!`.
//
// NOTE(review): `reduce_impl_wrap!` is defined outside this file; the meaning
// given to its positional arguments below is inferred from how `run` uses
// them — confirm against the macro definition.
reduce_impl_wrap!(
    f32,
    arm64simd_sum_f32_16n,
    // Presumably the number of items per block; `run` asserts a multiple of 16.
    16,
    // Presumably the required alignment, expressed in items — TODO confirm.
    4,
    // No extra parameter: `run` takes `()` as its second argument.
    (),
    // Presumably the neutral element of the reduction (zero for a sum).
    f32::zero(),
    // Sums all elements of `buf`. Panics unless `buf.len()` is a non-zero
    // multiple of 16.
    #[inline(never)]
    fn run(buf: &[f32], _: ()) -> f32 {
        // The assembly loop below executes its body before testing the counter
        // and only exits when the counter reaches exactly zero, so an empty
        // slice or a length that is not a multiple of 16 would make it read
        // past the end of `buf`.
        assert!(buf.len() % 16 == 0);
        assert!(buf.len() > 0);
        // Inner helper doing the actual work; it relies on the two conditions
        // asserted above and performs no check of its own.
        unsafe fn run(buf: &[f32]) -> f32 {
            unsafe {
                let len = buf.len();
                let ptr = buf.as_ptr();
                // Receives the raw bits of `s0` (lane 0 of `v0`) after the asm block.
                let mut out: u32;
                // Assembly outline:
                //  - zero the four accumulators v0..v3 (4 x f32 lanes each);
                //  - loop `2:`: load 16 floats (64 bytes) into v4..v7 while
                //    advancing `ptr` by 64, add them lane-wise into v0..v3,
                //    subtract 16 from `len` and branch back while the result
                //    is non-zero;
                //  - fold v1, v2 and v3 into v0, then two pairwise adds leave
                //    the total in lane 0 of v0.
                // Lanes are accumulated independently, so rounding may differ
                // from a sequential left-to-right sum.
                std::arch::asm!("
                movi v0.4s, #0
                movi v1.4s, #0
                movi v2.4s, #0
                movi v3.4s, #0
                2:
                    ld1 {{v4.4s, v5.4s, v6.4s, v7.4s}}, [{ptr}], 64
                    fadd v0.4s, v0.4s, v4.4s
                    fadd v1.4s, v1.4s, v5.4s
                    fadd v2.4s, v2.4s, v6.4s
                    fadd v3.4s, v3.4s, v7.4s

                    subs {len}, {len}, 16
                    bne 2b

                fadd v0.4s, v0.4s, v1.4s
                fadd v2.4s, v2.4s, v3.4s
                fadd v0.4s, v0.4s, v2.4s
                faddp v0.4s, v0.4s, v0.4s
                faddp v0.4s, v0.4s, v0.4s
                ",
                // Both registers are modified by the loop; their final values
                // are discarded.
                ptr = inout(reg) ptr => _,
                len = inout(reg) len => _,
                // The result comes back in `s0`; v1..v7 are declared as clobbered.
                out("s0") out, out("v1") _, out("v2") _, out("v3") _,
                out("v4") _, out("v5") _, out("v6") _, out("v7") _,);
                // Reinterpret the 32 bits read from `s0` as an `f32`.
                f32::from_bits(out)
            }
        }
        unsafe { run(buf) }
    },
    // Adds two `f32` values; presumably used by the macro to merge partial
    // results — confirm against `reduce_impl_wrap!`.
    #[inline(never)]
    fn reduce_two(a: f32, b: f32) -> f32 {
        a + b
    }
);

// Test module instantiating the crate's `sum_frame_tests!` macro for
// `arm64simd_sum_f32_16n` with element type `f32`.
// NOTE(review): the leading `true` argument is interpreted by
// `sum_frame_tests!`, which is defined elsewhere in the crate — confirm its
// meaning against the macro definition.
#[cfg(test)]
mod test_arm64simd_sum_f32_16n {
    use super::*;
    crate::sum_frame_tests!(true, f32, arm64simd_sum_f32_16n);
}