//! # npsimd 0.3.0
//!
//! An ergonomic library for architecture-specific vectorization.
//!
//! Intrinsics for SSSE3.

use super::*;

/// SSSE3 (Supplementary Streaming SIMD Extensions 3).
///
/// SSSE3 is the integer counterpart to SSE3: it adds useful functionality for
/// some integer operations, common and specialized, as well as a very crucial
/// data shuffling instruction, [`pshufb`].  It was introduced in 2006 with the
/// Intel Core microarchitecture.  If you're writing a vectorized algorithm by
/// hand, `pshufb` makes SSSE3 quite essential, even if you are just mapping on
/// array elements.
///
/// [`pshufb`]: Use::shuffle_u8x16
///
/// # Instructions
///
/// These instructions are implemented by [`Use`].
///
/// Common integer operations:
///
/// - [x] `pabsb`
///   - [`Use::abs_s8x16`]
/// - [x] `pabsw`
///   - [`Use::abs_s16x8`]
/// - [x] `pabsd`
///   - [`Use::abs_s32x4`]
///
/// - [x] `phaddw`
///   - [`Use::concat_and_reduce_add_u16x2x4`]
///   - [`Use::concat_and_reduce_add_s16x2x4`]
/// - [x] `phaddd`
///   - [`Use::concat_and_reduce_add_u32x2x2`]
///   - [`Use::concat_and_reduce_add_s32x2x2`]
///
/// - [x] `phaddsw`
///   - [`Use::concat_and_reduce_saturating_add_s16x2x4`]
///
/// - [x] `phsubw`
///   - [`Use::concat_and_reduce_sub_u16x2x4`]
///   - [`Use::concat_and_reduce_sub_s16x2x4`]
/// - [x] `phsubd`
///   - [`Use::concat_and_reduce_sub_u32x2x2`]
///   - [`Use::concat_and_reduce_sub_s32x2x2`]
///
/// - [x] `phsubsw`
///   - [`Use::concat_and_reduce_saturating_sub_s16x2x4`]
///
/// - [x] `psignb`
///   - [`Use::mul_sign_s8x16`]
/// - [x] `psignw`
///   - [`Use::mul_sign_s16x8`]
/// - [x] `psignd`
///   - [`Use::mul_sign_s32x4`]
///
/// Specialized integer operations:
///
/// - [x] `pmaddubsw`
///   - [`Use::sum_of_prod_u8x2x8`]
/// - [x] `pmulhrsw`
///   - [`Use::mul_and_halve_s16x8`]
///
/// Data shuffling:
///
/// - [x] `palignr`
///   - [`Use::align_elems_by_u8x16`]
///   - [`Use::align_elems_by_s8x16`]
///
/// - [x] `pshufb`
///   - [`Use::shuffle_u8x16`]
///   - [`Use::shuffle_s8x16`]
pub struct SSSE3(());

impl Feature<FeatureGroup> for SSSE3 {
    /// Report SSSE3 support only when the CPU advertises it at runtime.
    fn get_support(runtime: &RuntimeSupport) -> Option<Self> {
        if runtime.ssse3() {
            Some(Self(()))
        } else {
            None
        }
    }
}

/// SSSE3 intrinsics.
///
/// See [`SSSE3`] for information about this extension and which instructions
/// are supported here.
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, SSSE3> {
    // Take the absolute value of integer elements in a 16-byte vector.
    defn_simd_shared!("ssse3", { simd_abs(x) }, {
        /// Absolute value of signed 8-bit integers.
        #[intrinsic_for("pabsb")]
        #[intel_equivalents("_mm_abs_epi8")]
        pub fn abs_s8x16(x: s8x16) -> s8x16;

        /// Absolute value of signed 16-bit integers.
        #[intrinsic_for("pabsw")]
        #[intel_equivalents("_mm_abs_epi16")]
        pub fn abs_s16x8(x: s16x8) -> s16x8;

        /// Absolute value of signed 32-bit integers.
        #[intrinsic_for("pabsd")]
        #[intel_equivalents("_mm_abs_epi32")]
        pub fn abs_s32x4(x: s32x4) -> s32x4;
    });

    // Horizontally add pairs of integers from two concatenated 16-byte vectors.
    defn_simd_llvm!("ssse3", {
        /// Horizontally add pairs of unsigned 16-bit integers from two vectors.
        #[intrinsic_for("phaddw")]
        #[intel_equivalents("_mm_hadd_epi16")]
        pub fn concat_and_reduce_add_u16x2x4(x: u16x8, y: u16x8) -> u16x8
            = "llvm.x86.ssse3.phadd.w.128";

        /// Horizontally add pairs of unsigned 32-bit integers from two vectors.
        #[intrinsic_for("phaddd")]
        #[intel_equivalents("_mm_hadd_epi32")]
        pub fn concat_and_reduce_add_u32x2x2(x: u32x4, y: u32x4) -> u32x4
            = "llvm.x86.ssse3.phadd.d.128";

        /// Horizontally add pairs of signed 16-bit integers from two vectors.
        #[intrinsic_for("phaddw")]
        #[intel_equivalents("_mm_hadd_epi16")]
        pub fn concat_and_reduce_add_s16x2x4(x: s16x8, y: s16x8) -> s16x8
            = "llvm.x86.ssse3.phadd.w.128";

        /// Horizontally add pairs of signed 32-bit integers from two vectors.
        #[intrinsic_for("phaddd")]
        #[intel_equivalents("_mm_hadd_epi32")]
        pub fn concat_and_reduce_add_s32x2x2(x: s32x4, y: s32x4) -> s32x4
            = "llvm.x86.ssse3.phadd.d.128";
    });

    // Horizontally add pairs of integers, with saturation, from two
    // concatenated 16-byte vectors.
    defn_simd_llvm!("ssse3", {
        /// Horizontally add pairs of signed 16-bit integers with saturation
        /// from two vectors.
        #[intrinsic_for("phaddsw")]
        #[intel_equivalents("_mm_hadds_epi16")]
        pub fn concat_and_reduce_saturating_add_s16x2x4
            (x: s16x8, y: s16x8) -> s16x8
            = "llvm.x86.ssse3.phadd.sw.128";
    });

    // Horizontally subtract pairs of integers from two concatenated 16-byte
    // vectors.
    defn_simd_llvm!("ssse3", {
        /// Horizontally subtract pairs of unsigned 16-bit integers from two
        /// vectors.
        #[intrinsic_for("phsubw")]
        #[intel_equivalents("_mm_hsub_epi16")]
        pub fn concat_and_reduce_sub_u16x2x4(x: u16x8, y: u16x8) -> u16x8
            = "llvm.x86.ssse3.phsub.w.128";

        /// Horizontally subtract pairs of unsigned 32-bit integers from two
        /// vectors.
        #[intrinsic_for("phsubd")]
        #[intel_equivalents("_mm_hsub_epi32")]
        pub fn concat_and_reduce_sub_u32x2x2(x: u32x4, y: u32x4) -> u32x4
            = "llvm.x86.ssse3.phsub.d.128";

        /// Horizontally subtract pairs of signed 16-bit integers from two
        /// vectors.
        #[intrinsic_for("phsubw")]
        #[intel_equivalents("_mm_hsub_epi16")]
        pub fn concat_and_reduce_sub_s16x2x4(x: s16x8, y: s16x8) -> s16x8
            = "llvm.x86.ssse3.phsub.w.128";

        /// Horizontally subtract pairs of signed 32-bit integers from two
        /// vectors.
        #[intrinsic_for("phsubd")]
        #[intel_equivalents("_mm_hsub_epi32")]
        pub fn concat_and_reduce_sub_s32x2x2(x: s32x4, y: s32x4) -> s32x4
            = "llvm.x86.ssse3.phsub.d.128";
    });

    // Horizontally subtract pairs of integers, with saturation, from two
    // concatenated 16-byte vectors.
    defn_simd_llvm!("ssse3", {
        /// Horizontally subtract pairs of signed 16-bit integers with
        /// saturation from two vectors.
        #[intrinsic_for("phsubsw")]
        #[intel_equivalents("_mm_hsubs_epi16")]
        pub fn concat_and_reduce_saturating_sub_s16x2x4
            (x: s16x8, y: s16x8) -> s16x8
            = "llvm.x86.ssse3.phsub.sw.128";
    });

    // Multiply integers from two 16-byte vectors and horizontally add pairs of
    // full-width products with saturation.
    defn_simd_llvm!("ssse3", {
        /// Multiply unsigned 8-bit integers by the corresponding signed 8-bit
        /// integers, horizontally add pairs of the 16-bit products, then
        /// saturate the sums to signed 16-bit integers.
        // NOTE: `pmaddubsw` mixes signedness — the first operand is treated
        // as unsigned bytes and the second as signed bytes, so the element
        // types here are `u8x16`/`s8x16`, matching the underlying LLVM
        // intrinsic's <16 x i8> operands.
        #[intrinsic_for("pmaddubsw")]
        #[intel_equivalents("_mm_maddubs_epi16")]
        pub fn sum_of_prod_u8x2x8
            (x: u8x16, y: s8x16) -> s16x8
            = "llvm.x86.ssse3.pmadd.ub.sw.128";
    });

    // Multiply integers from two 16-byte vectors and halve the results with
    // rounding.
    defn_simd_llvm!("ssse3", {
        /// Multiply signed 16-bit integers and halve the results with rounding.
        #[intrinsic_for("pmulhrsw")]
        #[intel_equivalents("_mm_mulhrs_epi16")]
        pub fn mul_and_halve_s16x8
            (x: s16x8, y: s16x8) -> s16x8
            = "llvm.x86.ssse3.pmul.hr.sw.128";
    });

    // Multiply integers in a 16-byte vector by corresponding signs.
    defn_simd_llvm!("ssse3", {
        /// Multiply each signed 8-bit integer by the sign of the corresponding
        /// integer.
        #[intrinsic_for("psignb")]
        #[intel_equivalents("_mm_sign_epi8")]
        pub fn mul_sign_s8x16(x: s8x16, y: s8x16) -> s8x16
            = "llvm.x86.ssse3.psign.b.128";

        /// Multiply each signed 16-bit integer by the sign of the corresponding
        /// integer.
        #[intrinsic_for("psignw")]
        #[intel_equivalents("_mm_sign_epi16")]
        pub fn mul_sign_s16x8(x: s16x8, y: s16x8) -> s16x8
            = "llvm.x86.ssse3.psign.w.128";

        /// Multiply each signed 32-bit integer by the sign of the corresponding
        /// integer.
        #[intrinsic_for("psignd")]
        #[intel_equivalents("_mm_sign_epi32")]
        pub fn mul_sign_s32x4(x: s32x4, y: s32x4) -> s32x4
            = "llvm.x86.ssse3.psign.d.128";
    });

    // Slice the concatenation of two 16-byte vectors.
    defn_simd_shared!("ssse3", fn(T, U) -> R {
        const_assert!(SHIFT < 16);
        simd_shuffle(x, y, const {
            simd_slice_indices::<R>(SHIFT as usize)
        })
    }, {
        /// Extract an unaligned vector between two contiguous vectors.
        ///
        /// The given vectors are concatenated in order and the given number of
        /// elements are truncated from its beginning.
        #[intrinsic_for("palignr")]
        #[intel_equivalents("_mm_alignr_epi8")]
        pub fn align_elems_by_u8x16<SHIFT: u8>(x: u8x16, y: u8x16) -> u8x16;

        /// Extract an unaligned vector between two contiguous vectors.
        ///
        /// The given vectors are concatenated in order and the given number of
        /// elements are truncated from its beginning.
        #[intrinsic_for("palignr")]
        #[intel_equivalents("_mm_alignr_epi8")]
        pub fn align_elems_by_s8x16<SHIFT: u8>(x: s8x16, y: s8x16) -> s8x16;
    });

    // Shuffle bytes within 16-byte vectors.
    defn_simd_llvm!("ssse3", {
        /// Index this array of unsigned 8-bit integers with the given vector.
        ///
        /// Each element of the index value is used modulo 16.  If an index is
        /// negative, then that position is filled with zero.
        #[intrinsic_for("pshufb")]
        #[intel_equivalents("_mm_shuffle_epi8")]
        pub fn shuffle_u8x16(x: u8x16, idxs: s8x16) -> u8x16
            = "llvm.x86.ssse3.pshuf.b.128";

        /// Index this array of signed 8-bit integers with the given vector.
        ///
        /// Each element of the index value is used modulo 16.  If an index is
        /// negative, then that position is filled with zero.
        #[intrinsic_for("pshufb")]
        #[intel_equivalents("_mm_shuffle_epi8")]
        pub fn shuffle_s8x16(x: s8x16, idxs: s8x16) -> s8x16
            = "llvm.x86.ssse3.pshuf.b.128";
    });
}