// npsimd 0.3.0 - Docs.rs

//! Intrinsics for SSE4.1.

use super::*;

/// SSE4.1 (Streaming SIMD Extensions 4.1).
///
/// SSE4.1 was the first part of SSE4, providing general-purpose integer vector
/// operations that extend the capabilities of the earlier SSE generations.  It
/// was first released in 2007 with the Penryn microarchitecture.  While these
/// instructions are nice to have, they usually aren't strong requirements for
/// vectorized algorithms.
///
/// The inner unit field is private, so code outside this module cannot build
/// this type directly; `Feature::get_support` yields a value when the runtime
/// reports SSE4.1 support.
///
/// # Instructions
///
/// These instructions are implemented by [`Use`].  A checked box marks an
/// instruction that has at least one method here; an unchecked box marks one
/// that is not exposed yet.
///
/// Element insertion and extraction:
///
/// - [x] `pextrb`
///   - [`Use::get_u8x16`]
///   - [`Use::get_s8x16`]
/// - [x] `pextrd`
///   - [`Use::get_u32x4`]
///   - [`Use::get_s32x4`]
/// - [x] `pextrq`
///   - [`Use::get_u64x2`]
///   - [`Use::get_s64x2`]
///
/// - [x] `pinsrb`
///   - [`Use::put_u8x16`]
///   - [`Use::put_s8x16`]
/// - [x] `pinsrd`
///   - [`Use::put_u32x4`]
///   - [`Use::put_s32x4`]
/// - [x] `pinsrq`
///   - [`Use::put_u64x2`]
///   - [`Use::put_s64x2`]
///
/// Basic integer operations:
///
/// - [x] `pmaxuw`
///   - [`Use::max_u16x8`]
/// - [x] `pmaxud`
///   - [`Use::max_u32x4`]
/// - [x] `pmaxsb`
///   - [`Use::max_s8x16`]
/// - [x] `pmaxsd`
///   - [`Use::max_s32x4`]
///
/// - [x] `pminuw`
///   - [`Use::min_u16x8`]
/// - [x] `pminud`
///   - [`Use::min_u32x4`]
/// - [x] `pminsb`
///   - [`Use::min_s8x16`]
/// - [x] `pminsd`
///   - [`Use::min_s32x4`]
///
/// - [x] `pmuldq`
///   - [`Use::mul_s32_s64x2`]
/// - [x] `pmulld`
///   - [`Use::mul_lo_s32x4`]
///
/// Integer comparisons:
///
/// - [x] `pcmpeqq`
///   - [`Use::cmp_eq_u64x2`]
///   - [`Use::cmp_eq_s64x2`]
///
/// - [ ] `ptest`
///
/// Specialized integer operations:
///
/// - [x] `phminposuw`
///   - [`Use::min_pos_u16x8`]
///
/// - [ ] `mpsadbw`
///
/// Integer resizing:
///
/// - [x] `pmovzxbw`
///   - [`Use::expand_u8x16_u16x8`]
///   - [`Use::expand_u8x16_s16x8`]
/// - [x] `pmovzxbd`
///   - [`Use::expand_u8x16_u32x4`]
///   - [`Use::expand_u8x16_s32x4`]
/// - [x] `pmovzxbq`
///   - [`Use::expand_u8x16_u64x2`]
///   - [`Use::expand_u8x16_s64x2`]
/// - [x] `pmovzxwd`
///   - [`Use::expand_u16x8_u32x4`]
///   - [`Use::expand_u16x8_s32x4`]
/// - [x] `pmovzxwq`
///   - [`Use::expand_u16x8_u64x2`]
///   - [`Use::expand_u16x8_s64x2`]
/// - [x] `pmovzxdq`
///   - [`Use::expand_u32x4_u64x2`]
///   - [`Use::expand_u32x4_s64x2`]
///
/// - [x] `pmovsxbw`
///   - [`Use::expand_s8x16_s16x8`]
/// - [x] `pmovsxbd`
///   - [`Use::expand_s8x16_s32x4`]
/// - [x] `pmovsxbq`
///   - [`Use::expand_s8x16_s64x2`]
/// - [x] `pmovsxwd`
///   - [`Use::expand_s16x8_s32x4`]
/// - [x] `pmovsxwq`
///   - [`Use::expand_s16x8_s64x2`]
/// - [x] `pmovsxdq`
///   - [`Use::expand_s32x4_s64x2`]
///
/// - [x] `packusdw`
///   - [`Use::concat_and_saturate_u16_s32x4`]
///
/// Data shuffling:
///
/// - [x] `pblendvb`
///   - [`Use::blend_u8x16`]
///   - [`Use::blend_s8x16`]
///
/// - [x] `pblendw`
///   - [`Use::blend_by_u16x8`]
///   - [`Use::blend_by_s16x8`]
pub struct SSE4_1(());

impl Feature<FeatureGroup> for SSE4_1 {
    /// Produce an [`SSE4_1`] value when `runtime` reports SSE4.1 support, and
    /// `None` otherwise.
    fn get_support(runtime: &RuntimeSupport) -> Option<Self> {
        if runtime.sse4_1() {
            Some(Self(()))
        } else {
            None
        }
    }
}

/// SSE4.1 intrinsics.
///
/// See [`SSE4_1`] for information about this extension and which instructions
/// are supported here.
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, SSE4_1> {
    // Blend elements in 16-byte integer vectors with a constant mask.
    //
    // The first braced block appears to be the body shared by every signature
    // listed in the second block; the two functions differ only in the
    // signedness of their element type.
    defn_simd_shared!("sse4.1", {
        // NOTE(review): going by the (mask, if_set, if_clear) argument order,
        // each set bit of `MASK` presumably takes the matching element from
        // `y` and each clear bit keeps the one from `x` -- confirm against
        // `simd_select_bitmask`.
        simd_select_bitmask(MASK, y, x)
    }, {
        /// Blend unsigned 16-bit integers between vectors by a constant mask.
        #[intrinsic_for("pblendw")]
        #[intel_equivalents("_mm_blend_epi16")]
        pub fn blend_by_u16x8<MASK: u8>(x: u16x8, y: u16x8) -> u16x8;

        /// Blend signed 16-bit integers between vectors by a constant mask.
        #[intrinsic_for("pblendw")]
        #[intel_equivalents("_mm_blend_epi16")]
        pub fn blend_by_s16x8<MASK: u8>(x: s16x8, y: s16x8) -> s16x8;
    });

    // Select between two 16-byte integer vectors, byte by byte, under the
    // control of a runtime mask vector.
    defn_simd_shared!("sse4.1", {
        // A mask byte counts as "set" exactly when it is negative, i.e. when
        // its sign bit is 1; its remaining bits play no part.
        let take_y: s8x16 = simd_lt(mask, s8x16::splat(0));
        simd_select(take_y, y, x)
    }, {
        /// Blend unsigned 8-bit integers between vectors by a variable mask.
        #[intrinsic_for("pblendvb")]
        #[intel_equivalents("_mm_blendv_epi8")]
        pub fn blend_u8x16(x: u8x16, y: u8x16, mask: s8x16) -> u8x16;

        /// Blend signed 8-bit integers between vectors by a variable mask.
        #[intrinsic_for("pblendvb")]
        #[intel_equivalents("_mm_blendv_epi8")]
        pub fn blend_s8x16(x: s8x16, y: s8x16, mask: s8x16) -> s8x16;
    });

    // Widening multiply of the low 32-bit halves of the 64-bit elements in
    // 16-byte vectors.
    defn_simd_manual!("sse4.1", {
        /// Multiply the low signed 32 bits of 64-bit integers into 64 bits.
        #[intrinsic_for("pmuldq")]
        #[intel_equivalents("_mm_mul_epi32")]
        pub fn mul_s32_s64x2(a: s64x2, b: s64x2) -> s64x2 {
            // Narrow each 64-bit lane to a signed 32-bit lane first ...
            let a_lo: s32x2 = simd_cast(a);
            let b_lo: s32x2 = simd_cast(b);
            // ... then widen back (the target type is inferred from the
            // return type) so the product is formed at 64-bit width.
            simd_mul(simd_cast(a_lo), simd_cast(b_lo))
        }
    });

    // Multiply integer elements in 16-byte vectors for low words.
    defn_simd_manual!("sse4.1", {
        /// Multiply signed 32-bit integers, keeping the low 32 bits of each.
        #[intrinsic_for("pmulld")]
        #[intel_equivalents("_mm_mullo_epi32")]
        pub fn mul_lo_s32x4(a: s32x4, b: s32x4) -> s32x4 {
            // The result has the same 32-bit lanes as the inputs, so only the
            // low half of each full product fits.
            // NOTE(review): presumably `simd_mul` wraps on overflow rather
            // than trapping -- confirm.
            simd_mul(a, b)
        }
    });

    // Compare integer elements in two 16-byte vectors for equality.
    //
    // The `= simd_eq` form appears to forward both arguments directly to
    // `simd_eq`.
    // NOTE(review): the result comes back as an integer vector of the same
    // element type; presumably each lane is all-ones where the inputs were
    // equal and zero elsewhere, as `pcmpeqq` produces -- confirm.
    defn_simd_manual!("sse4.1", {
        /// Compare unsigned 64-bit integers for equality.
        #[intrinsic_for("pcmpeqq")]
        #[intel_equivalents("_mm_cmpeq_epi64")]
        pub fn cmp_eq_u64x2(a: u64x2, b: u64x2) -> u64x2 = simd_eq;

        /// Compare signed 64-bit integers for equality.
        #[intrinsic_for("pcmpeqq")]
        #[intel_equivalents("_mm_cmpeq_epi64")]
        pub fn cmp_eq_s64x2(a: s64x2, b: s64x2) -> s64x2 = simd_eq;
    });

    // Maximum of integer elements in 16-byte vectors.
    //
    // Every signature below reuses the one body `simd_max(a, b)`; the element
    // types listed here are the ones whose instruction belongs to SSE4.1
    // according to the `intrinsic_for` attributes.
    defn_simd_shared!("sse4.1", { simd_max(a, b) }, {
        /// Maximum of unsigned 16-bit integers.
        #[intrinsic_for("pmaxuw")]
        #[intel_equivalents("_mm_max_epu16")]
        pub fn max_u16x8(a: u16x8, b: u16x8) -> u16x8;

        /// Maximum of unsigned 32-bit integers.
        #[intrinsic_for("pmaxud")]
        #[intel_equivalents("_mm_max_epu32")]
        pub fn max_u32x4(a: u32x4, b: u32x4) -> u32x4;

        /// Maximum of signed 8-bit integers.
        #[intrinsic_for("pmaxsb")]
        #[intel_equivalents("_mm_max_epi8")]
        pub fn max_s8x16(a: s8x16, b: s8x16) -> s8x16;

        /// Maximum of signed 32-bit integers.
        #[intrinsic_for("pmaxsd")]
        #[intel_equivalents("_mm_max_epi32")]
        pub fn max_s32x4(a: s32x4, b: s32x4) -> s32x4;
    });

    // Minimum of integer elements in 16-byte vectors.
    //
    // Every signature below reuses the one body `simd_min(a, b)`; the element
    // types listed here are the ones whose instruction belongs to SSE4.1
    // according to the `intrinsic_for` attributes.
    defn_simd_shared!("sse4.1", { simd_min(a, b) }, {
        /// Minimum of unsigned 16-bit integers.
        #[intrinsic_for("pminuw")]
        #[intel_equivalents("_mm_min_epu16")]
        pub fn min_u16x8(a: u16x8, b: u16x8) -> u16x8;

        /// Minimum of unsigned 32-bit integers.
        #[intrinsic_for("pminud")]
        #[intel_equivalents("_mm_min_epu32")]
        pub fn min_u32x4(a: u32x4, b: u32x4) -> u32x4;

        /// Minimum of signed 8-bit integers.
        #[intrinsic_for("pminsb")]
        #[intel_equivalents("_mm_min_epi8")]
        pub fn min_s8x16(a: s8x16, b: s8x16) -> s8x16;

        /// Minimum of signed 32-bit integers.
        #[intrinsic_for("pminsd")]
        #[intel_equivalents("_mm_min_epi32")]
        pub fn min_s32x4(a: s32x4, b: s32x4) -> s32x4;
    });

    // Expand the lower integers in 16-byte vectors.
    //
    // The output has fewer lanes than the input, so only the low elements of
    // the input take part.  The middle type argument given to `simd_expand`
    // names the narrow vector holding just those elements (for example `u8x8`
    // for the low 8 bytes).
    // NOTE(review): presumably `simd_expand` first truncates the input to that
    // narrow vector and then widens each element to the return type -- confirm
    // against its definition.
    defn_simd_manual!("sse4.1", {
        /// Expand the low 8 unsigned 8-bit integers to unsigned 16 bits each.
        #[intrinsic_for("pmovzxbw")]
        #[intel_equivalents("_mm_cvtepu8_epi16")]
        pub fn expand_u8x16_u16x8(x: u8x16) -> u16x8
            = simd_expand::<_, u8x8, _>;

        /// Expand the low 8 unsigned 8-bit integers to signed 16 bits each.
        #[intrinsic_for("pmovzxbw")]
        #[intel_equivalents("_mm_cvtepu8_epi16")]
        pub fn expand_u8x16_s16x8(x: u8x16) -> s16x8
            = simd_expand::<_, u8x8, _>;

        /// Expand the low 4 unsigned 8-bit integers to unsigned 32 bits each.
        #[intrinsic_for("pmovzxbd")]
        #[intel_equivalents("_mm_cvtepu8_epi32")]
        pub fn expand_u8x16_u32x4(x: u8x16) -> u32x4
            = simd_expand::<_, u8x4, _>;

        /// Expand the low 4 unsigned 8-bit integers to signed 32 bits each.
        #[intrinsic_for("pmovzxbd")]
        #[intel_equivalents("_mm_cvtepu8_epi32")]
        pub fn expand_u8x16_s32x4(x: u8x16) -> s32x4
            = simd_expand::<_, u8x4, _>;

        /// Expand the low 2 unsigned 8-bit integers to unsigned 64 bits each.
        #[intrinsic_for("pmovzxbq")]
        #[intel_equivalents("_mm_cvtepu8_epi64")]
        pub fn expand_u8x16_u64x2(x: u8x16) -> u64x2
            = simd_expand::<_, u8x2, _>;

        /// Expand the low 2 unsigned 8-bit integers to signed 64 bits each.
        #[intrinsic_for("pmovzxbq")]
        #[intel_equivalents("_mm_cvtepu8_epi64")]
        pub fn expand_u8x16_s64x2(x: u8x16) -> s64x2
            = simd_expand::<_, u8x2, _>;

        /// Expand the low 4 unsigned 16-bit integers to unsigned 32 bits each.
        #[intrinsic_for("pmovzxwd")]
        #[intel_equivalents("_mm_cvtepu16_epi32")]
        pub fn expand_u16x8_u32x4(x: u16x8) -> u32x4
            = simd_expand::<_, u16x4, _>;

        /// Expand the low 4 unsigned 16-bit integers to signed 32 bits each.
        #[intrinsic_for("pmovzxwd")]
        #[intel_equivalents("_mm_cvtepu16_epi32")]
        pub fn expand_u16x8_s32x4(x: u16x8) -> s32x4
            = simd_expand::<_, u16x4, _>;

        /// Expand the low 2 unsigned 16-bit integers to unsigned 64 bits each.
        #[intrinsic_for("pmovzxwq")]
        #[intel_equivalents("_mm_cvtepu16_epi64")]
        pub fn expand_u16x8_u64x2(x: u16x8) -> u64x2
            = simd_expand::<_, u16x2, _>;

        /// Expand the low 2 unsigned 16-bit integers to signed 64 bits each.
        #[intrinsic_for("pmovzxwq")]
        #[intel_equivalents("_mm_cvtepu16_epi64")]
        pub fn expand_u16x8_s64x2(x: u16x8) -> s64x2
            = simd_expand::<_, u16x2, _>;

        /// Expand the low 2 unsigned 32-bit integers to unsigned 64 bits each.
        #[intrinsic_for("pmovzxdq")]
        #[intel_equivalents("_mm_cvtepu32_epi64")]
        pub fn expand_u32x4_u64x2(x: u32x4) -> u64x2
            = simd_expand::<_, u32x2, _>;

        /// Expand the low 2 unsigned 32-bit integers to signed 64 bits each.
        #[intrinsic_for("pmovzxdq")]
        #[intel_equivalents("_mm_cvtepu32_epi64")]
        pub fn expand_u32x4_s64x2(x: u32x4) -> s64x2
            = simd_expand::<_, u32x2, _>;

        /// Expand the low 8 signed 8-bit integers to signed 16 bits each.
        #[intrinsic_for("pmovsxbw")]
        #[intel_equivalents("_mm_cvtepi8_epi16")]
        pub fn expand_s8x16_s16x8(x: s8x16) -> s16x8
            = simd_expand::<_, s8x8, _>;

        /// Expand the low 4 signed 8-bit integers to signed 32 bits each.
        #[intrinsic_for("pmovsxbd")]
        #[intel_equivalents("_mm_cvtepi8_epi32")]
        pub fn expand_s8x16_s32x4(x: s8x16) -> s32x4
            = simd_expand::<_, s8x4, _>;

        /// Expand the low 2 signed 8-bit integers to signed 64 bits each.
        #[intrinsic_for("pmovsxbq")]
        #[intel_equivalents("_mm_cvtepi8_epi64")]
        pub fn expand_s8x16_s64x2(x: s8x16) -> s64x2
            = simd_expand::<_, s8x2, _>;

        /// Expand the low 4 signed 16-bit integers to signed 32 bits each.
        #[intrinsic_for("pmovsxwd")]
        #[intel_equivalents("_mm_cvtepi16_epi32")]
        pub fn expand_s16x8_s32x4(x: s16x8) -> s32x4
            = simd_expand::<_, s16x4, _>;

        /// Expand the low 2 signed 16-bit integers to signed 64 bits each.
        #[intrinsic_for("pmovsxwq")]
        #[intel_equivalents("_mm_cvtepi16_epi64")]
        pub fn expand_s16x8_s64x2(x: s16x8) -> s64x2
            = simd_expand::<_, s16x2, _>;

        /// Expand the low 2 signed 32-bit integers to signed 64 bits each.
        #[intrinsic_for("pmovsxdq")]
        #[intel_equivalents("_mm_cvtepi32_epi64")]
        pub fn expand_s32x4_s64x2(x: s32x4) -> s64x2
            = simd_expand::<_, s32x2, _>;
    });

    // Saturate and pack integer elements from two 16-byte vectors.
    //
    // The string after `=` names the LLVM intrinsic the function is bound to.
    // LLVM spells the SSE4.1 intrinsic `llvm.x86.sse41.packusdw`; unlike the
    // SSE2 pack intrinsics it carries no `.128` width suffix.
    defn_simd_llvm!("sse4.1", {
        /// Saturate two vectors of signed 32-bit integers to unsigned 16 bits.
        #[intrinsic_for("packusdw")]
        #[intel_equivalents("_mm_packus_epi32")]
        pub fn concat_and_saturate_u16_s32x4(a: s32x4, b: s32x4) -> u16x8
            = "llvm.x86.sse41.packusdw";
    });

    // Extract a primitive element from a 16-byte vector.
    //
    // `fn(T) -> R` appears to give the shared body names for each signature's
    // vector type (`T`) and return type (`R`).
    defn_simd_shared!("sse4.1", fn(T) -> R {
        // Reject any index that does not name a lane of the vector.
        // NOTE(review): presumably `const_assert!` fails the build rather
        // than panicking at run time -- confirm.
        const_assert!(INDEX < T::LEN as u8);
        simd_extract(x, const { INDEX as u32 })
    }, {
        /// Extract an unsigned 8-bit integer from a constant index.
        #[intrinsic_for("pextrb")]
        #[intel_equivalents("_mm_extract_epi8")]
        pub fn get_u8x16<INDEX: u8>(x: u8x16) -> u8;

        /// Extract an unsigned 32-bit integer from a constant index.
        #[intrinsic_for("pextrd")]
        #[intel_equivalents("_mm_extract_epi32")]
        pub fn get_u32x4<INDEX: u8>(x: u32x4) -> u32;

        /// Extract an unsigned 64-bit integer from a constant index.
        #[intrinsic_for("pextrq")]
        #[intel_equivalents("_mm_extract_epi64")]
        pub fn get_u64x2<INDEX: u8>(x: u64x2) -> u64;

        /// Extract a signed 8-bit integer from a constant index.
        #[intrinsic_for("pextrb")]
        #[intel_equivalents("_mm_extract_epi8")]
        pub fn get_s8x16<INDEX: u8>(x: s8x16) -> i8;

        /// Extract a signed 32-bit integer from a constant index.
        #[intrinsic_for("pextrd")]
        #[intel_equivalents("_mm_extract_epi32")]
        pub fn get_s32x4<INDEX: u8>(x: s32x4) -> i32;

        /// Extract a signed 64-bit integer from a constant index.
        #[intrinsic_for("pextrq")]
        #[intel_equivalents("_mm_extract_epi64")]
        pub fn get_s64x2<INDEX: u8>(x: s64x2) -> i64;
    });

    // Insert a primitive element into a 16-byte vector.
    //
    // `fn(T, E) -> R` appears to give the shared body names for each
    // signature's vector type (`T`), element type (`E`) and return type (`R`).
    defn_simd_shared!("sse4.1", fn(T, E) -> R {
        // Reject any index that does not name a lane of the vector.
        const_assert!(INDEX < T::LEN as u8);
        simd_insert(x, const { INDEX as u32 }, e)
    }, {
        /// Replace an unsigned 8-bit integer at a constant index.
        #[intrinsic_for("pinsrb")]
        #[intel_equivalents("_mm_insert_epi8")]
        pub fn put_u8x16<INDEX: u8>(x: u8x16, e: u8) -> u8x16;

        /// Replace an unsigned 32-bit integer at a constant index.
        #[intrinsic_for("pinsrd")]
        #[intel_equivalents("_mm_insert_epi32")]
        pub fn put_u32x4<INDEX: u8>(x: u32x4, e: u32) -> u32x4;

        /// Replace an unsigned 64-bit integer at a constant index.
        #[intrinsic_for("pinsrq")]
        #[intel_equivalents("_mm_insert_epi64")]
        pub fn put_u64x2<INDEX: u8>(x: u64x2, e: u64) -> u64x2;

        /// Replace a signed 8-bit integer at a constant index.
        #[intrinsic_for("pinsrb")]
        #[intel_equivalents("_mm_insert_epi8")]
        pub fn put_s8x16<INDEX: u8>(x: s8x16, e: i8) -> s8x16;

        /// Replace a signed 32-bit integer at a constant index.
        #[intrinsic_for("pinsrd")]
        #[intel_equivalents("_mm_insert_epi32")]
        pub fn put_s32x4<INDEX: u8>(x: s32x4, e: i32) -> s32x4;

        /// Replace a signed 64-bit integer at a constant index.
        #[intrinsic_for("pinsrq")]
        #[intel_equivalents("_mm_insert_epi64")]
        pub fn put_s64x2<INDEX: u8>(x: s64x2, e: i64) -> s64x2;
    });

    // Find the minimum value and index in a 16-byte integer vector.
    //
    // The instruction is spelled `phminposuw` (note the `u`: it operates on
    // unsigned words), and LLVM names its intrinsic to match.  Per Intel's
    // description of `_mm_minpos_epu16`, the minimum lands in element 0 of the
    // result, its index in element 1, and the remaining elements are zeroed.
    defn_simd_llvm!("sse4.1", {
        /// Find the minimum unsigned 16-bit integer and its index.
        #[intrinsic_for("phminposuw")]
        #[intel_equivalents("_mm_minpos_epu16")]
        pub fn min_pos_u16x8(x: u16x8) -> u16x8
            = "llvm.x86.sse41.phminposuw";
    });
}