tract-linalg 0.23.0-dev.4

Tiny, no-nonsense, self-contained TensorFlow and ONNX inference
Documentation
// Declares the `x86_64_avx_f32_mul_by_scalar_32n` kernel through `ew_impl_wrap!`:
// an in-place "multiply every f32 by a scalar" operation whose `run` body
// forwards to the hand-written assembly routine
// `x86_64_avx_f32_mul_by_scalar_32n_run`.
//
// NOTE(review): the positional arguments `32` and `8` are presumably the number
// of items handled per iteration (what `Self::nr()` returns) and the required
// alignment expressed in items (8 * 4 bytes = 32 bytes) — confirm against the
// `ew_impl_wrap!` definition, which is not visible here.
ew_impl_wrap!(
    f32,
    x86_64_avx_f32_mul_by_scalar_32n,
    32,
    8,
    f32,
    fn run(x: &mut [f32], s: f32) {
        // Both checks are `debug_assert!`s, so they vanish in release builds:
        // the caller remains responsible for passing a slice whose length is a
        // multiple of `Self::nr()` and whose start address is a multiple of
        // `Self::alignment_bytes()`.
        debug_assert!(x.len() % Self::nr() == 0);
        debug_assert!(x.as_ptr() as usize % Self::alignment_bytes() == 0);
        // The callee is an `unsafe fn` carrying `#[target_feature]`, hence the
        // `unsafe` block around the call.
        unsafe { x86_64_avx_f32_mul_by_scalar_32n_run(x, s) }
    }
);

/// Multiplies every element of `buf` by `scalar`, in place.
///
/// Each loop iteration loads 32 `f32` values (four 256-bit YMM registers),
/// multiplies them by the broadcast scalar and stores them back. Only AVX
/// instructions are used, so the routine matches its `avx` target feature.
/// An empty slice is a no-op.
///
/// # Safety
///
/// * The running CPU must support AVX.
/// * `buf.len()` must be a multiple of 32: the loop counter is decremented by
///   32 and the loop only stops when it reaches exactly zero.
/// * `buf` must start on a 32-byte boundary, as required by `vmovaps` with
///   YMM operands.
#[target_feature(enable = "avx")]
unsafe fn x86_64_avx_f32_mul_by_scalar_32n_run(buf: &mut [f32], scalar: f32) {
    let len = buf.len();
    // The assembly loop is bottom-tested: without this guard an empty slice
    // would be read and written out of bounds, and the counter would wrap.
    if len == 0 {
        return;
    }
    debug_assert!(len % 32 == 0);
    debug_assert!(buf.as_ptr() as usize % 32 == 0);

    // The buffer is written through this pointer, so derive it mutably.
    let ptr = buf.as_mut_ptr();

    // SAFETY: the caller guarantees AVX support, a length that is a non-zero
    // multiple of 32 (zero was handled above) and 32-byte alignment, so every
    // 128-byte chunk touched by the loop lies inside `buf`. Every register the
    // assembly modifies is declared as an output or clobber.
    unsafe {
        // Prologue: splat the scalar over ymm0 with AVX-only instructions
        // (`vbroadcastss` with a register source would require AVX2):
        // `vshufps` fills the four low lanes, `vinsertf128` copies them into
        // the upper half.
        // Loop (label 2): load 4 x 8 floats, multiply, store, advance by
        // 128 bytes, count down by 32 items.
        std::arch::asm!("
            vshufps xmm0, xmm0, xmm0, 0
            vinsertf128 ymm0, ymm0, xmm0, 1
            2:
                vmovaps ymm4, [{ptr}]
                vmovaps ymm5, [{ptr} + 32]
                vmovaps ymm6, [{ptr} + 64]
                vmovaps ymm7, [{ptr} + 96]
                vmulps ymm4, ymm4, ymm0
                vmulps ymm5, ymm5, ymm0
                vmulps ymm6, ymm6, ymm0
                vmulps ymm7, ymm7, ymm0
                vmovaps [{ptr}], ymm4
                vmovaps [{ptr} + 32], ymm5
                vmovaps [{ptr} + 64], ymm6
                vmovaps [{ptr} + 96], ymm7
                add {ptr}, 128
                sub {len}, 32
                jnz 2b
            ",
        len = inout(reg) len => _,
        ptr = inout(reg) ptr => _,
        // The splat overwrites the register holding the scalar, so it must be
        // declared as modified rather than as a plain input.
        inout("xmm0") scalar => _,
        out("ymm4") _, out("ymm5") _, out("ymm6") _, out("ymm7") _
        );
    }
}

// Test module for the `x86_64_avx_f32_mul_by_scalar_32n` kernel, generated by
// `by_scalar_frame_tests!` with `|a, b| a * b` as the reference operation.
//
// NOTE(review): the first macro argument checks for `avx2`, whereas the
// assembly routine's `#[target_feature]` attribute only enables `avx`.
// Presumably this expression gates whether the generated tests run on the
// current CPU — confirm against the macro definition, and confirm that the
// stricter `avx2` requirement is intentional.
#[cfg(test)]
#[macro_use]
pub mod test_x86_64_avx_f32_mul_by_scalar_32n {
    use super::*;
    by_scalar_frame_tests!(
        is_x86_feature_detected!("avx2"),
        f32,
        x86_64_avx_f32_mul_by_scalar_32n,
        |a, b| a * b
    );
}