libblur 0.10.7

High-performance blur in pure Rust
Documentation
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(target_feature = "sse4.1")]
pub(crate) mod sse_utils {
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;

    /// Widens consecutive bytes starting at `ptr` into the 32-bit lanes of an SSE register.
    ///
    /// Lanes 0, 1 and 2 receive the zero-extended bytes at offsets 0, 1 and 2.
    /// Lane 3 receives the byte at offset 3 only when `CHANNELS_COUNT == 4`;
    /// for every other channel count it is set to zero and offset 3 is not read.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reading three bytes, or four bytes when
    /// `CHANNELS_COUNT == 4`. The first three bytes are read regardless of
    /// `CHANNELS_COUNT`.
    #[inline]
    pub(crate) unsafe fn load_u8_s32_fast<const CHANNELS_COUNT: usize>(
        ptr: *const u8,
    ) -> __m128i {
        let lane0 = i32::from(ptr.read());
        let lane1 = i32::from(ptr.add(1).read());
        let lane2 = i32::from(ptr.add(2).read());
        // Only touch the fourth byte when the pixel really has a fourth channel.
        let lane3 = if CHANNELS_COUNT == 4 {
            i32::from(ptr.add(3).read())
        } else {
            0
        };
        // `setr` takes its arguments in memory order: the first argument is lane 0.
        _mm_setr_epi32(lane0, lane1, lane2, lane3)
    }

    /// Loads bytes starting at `ptr` via `load_u8_s32_fast` and converts the
    /// resulting 32-bit integer lanes to single-precision floats.
    ///
    /// # Safety
    ///
    /// `ptr` is forwarded unchanged to `load_u8_s32_fast::<CHANNELS_COUNT>`,
    /// so it must satisfy that function's read requirements.
    #[inline]
    pub(crate) unsafe fn load_u8_f32_fast<const CHANNELS_COUNT: usize>(
        ptr: *const u8,
    ) -> __m128 {
        _mm_cvtepi32_ps(load_u8_s32_fast::<CHANNELS_COUNT>(ptr))
    }

    /// Reads the single byte at `ptr`, zero-extends it to 32 bits and
    /// broadcasts it to all four lanes of the returned register.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reading one byte.
    #[inline(always)]
    pub(crate) unsafe fn load_u8_u32_one(
        ptr: *const u8,
    ) -> __m128i {
        // A `u8` always fits in a non-negative `i32`, so the widening is lossless.
        let widened = i32::from(ptr.read());
        _mm_set1_epi32(widened)
    }

    /// Computes `a + b * c` per lane using a separate multiply and add.
    ///
    /// This definition is compiled only when the `fma` target feature is not
    /// enabled; the `fma` build of the same function uses a fused instruction
    /// instead.
    ///
    /// # Safety
    ///
    /// Declared `unsafe` to match the signature of the `fma` build; the body
    /// only invokes the `_mm_mul_ps` and `_mm_add_ps` intrinsics.
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    #[cfg(not(target_feature = "fma"))]
    #[inline]
    pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
        let product = _mm_mul_ps(b, c);
        _mm_add_ps(product, a)
    }

    /// Computes `a + b * c` per lane with the fused `_mm_fmadd_ps` intrinsic.
    ///
    /// This definition is compiled only when the `fma` target feature is
    /// enabled; otherwise the non-fused build of the same function is used.
    ///
    /// # Safety
    ///
    /// The body invokes the `fma` intrinsic `_mm_fmadd_ps`.
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    #[cfg(target_feature = "fma")]
    #[inline]
    pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
        // The accumulator `a` goes last: the intrinsic evaluates (arg0 * arg1) + arg2.
        _mm_fmadd_ps(b, c, a)
    }
}