npsimd 0.3.0

An ergonomic library for architecture-specific vectorization.
Documentation
//! Intrinsics for SSSE3.

use core::ops::BitAnd;

use super::*;

/// SSSE3 (Supplemental Streaming SIMD Extensions 3).
///
/// SSSE3 is a component of the third extension in the SSE generation, released
/// in 2006 on the Intel "Woodcrest" Xeon.  It provides a crucial instruction,
/// `PSHUFB`, which is incredibly useful for complex byte manipulation.  It can
/// be used to look up a byte value by a 4-bit index, simplifying set membership
/// or lookup-based computations.  Most character processing algorithms need
/// SSSE3.
///
/// A value of this type carries no data.  Its only field is private, so code
/// outside of this module cannot build one directly: it is obtained either
/// from `new()` (which only exists when the `ssse3` target feature is enabled
/// at compile time) or from a successful query of `RuntimeSupport`.
#[derive(Copy, Clone, Debug)]
pub struct SSSE3(());

impl SSSE3 {
    /// Create the [`SSSE3`] feature token.
    ///
    /// This constructor is only compiled when the `ssse3` target feature is
    /// enabled for the build (or when documentation is being generated).
    #[cfg(any(doc, target_feature = "ssse3"))]
    pub const fn new() -> Self {
        SSSE3(())
    }
}

// Like `SSSE3::new()`, this is only available when the `ssse3` target feature
// is enabled at compile time (or when building documentation).
#[cfg(any(doc, target_feature = "ssse3"))]
impl Default for SSSE3 {
    /// Equivalent to [`SSSE3::new()`].
    fn default() -> Self {
        SSSE3::new()
    }
}

impl Features<SSE> for SSSE3 {
    fn query(support: &RuntimeSupport) -> Option<Self> {
        support.ssse3().then_some(Self(()))
    }
}

// This impl defines no items of its own; it only declares that `SSSE3`
// implements `Feature<SSE>`.
impl Feature<SSE> for SSSE3 {}

// Absolute value of elements in 16-byte vectors.

impl_simd!(SSE[SSSE3: "ssse3"], [i8; 16] {
    /// Compute the element-wise absolute value of signed 8-bit integers.
    ///
    /// Elements that compare less than zero are negated; all other elements
    /// are returned unchanged.  The tests in this module check the result
    /// against `i8::wrapping_abs`, i.e. `i8::MIN` maps to itself.
    #[intrinsic_for("pabsb")]
    #[intel_equivalents("_mm_abs_epi8")]
    pub fn abs(self => this) -> Self {
        let zero = vector![0i8; 16].primitive;
        // Per-element mask selecting the negative elements.
        let is_negative = simd_lt::<_, i8x16>(this.primitive, zero);
        simd_select(is_negative, simd_neg(this.primitive), this.primitive)
    }
});

impl_simd!(SSE[SSSE3: "ssse3"], [i16; 8] {
    /// Compute the element-wise absolute value of signed 16-bit integers.
    ///
    /// Elements that compare less than zero are negated; all other elements
    /// are returned unchanged.  The tests in this module check the result
    /// against `i16::wrapping_abs`, i.e. `i16::MIN` maps to itself.
    #[intrinsic_for("pabsw")]
    #[intel_equivalents("_mm_abs_epi16")]
    pub fn abs(self => this) -> Self {
        let zero = vector![0i16; 8].primitive;
        // Per-element mask selecting the negative elements.
        let is_negative = simd_lt::<_, i16x8>(this.primitive, zero);
        simd_select(is_negative, simd_neg(this.primitive), this.primitive)
    }
});

impl_simd!(SSE[SSSE3: "ssse3"], [i32; 4] {
    /// Compute the element-wise absolute value of signed 32-bit integers.
    ///
    /// Elements that compare less than zero are negated; all other elements
    /// are returned unchanged.  The tests in this module check the result
    /// against `i32::wrapping_abs`, i.e. `i32::MIN` maps to itself.
    #[intrinsic_for("pabsd")]
    #[intel_equivalents("_mm_abs_epi32")]
    pub fn abs(self => this) -> Self {
        let zero = vector![0i32; 4].primitive;
        // Per-element mask selecting the negative elements.
        let is_negative = simd_lt::<_, i32x4>(this.primitive, zero);
        simd_select(is_negative, simd_neg(this.primitive), this.primitive)
    }
});

// Shuffle elements of 16-byte vectors by variable indices.

impl_simd!(SSE[SSE2, SSSE3: "sse2,ssse3"], [u8; 16] {
    /// Gather the elements of this vector selected by `idxs`.
    ///
    /// Every index is reduced modulo 16 (only its low four bits are kept)
    /// before the lookup.  When all indices are known to be below 128 (their
    /// most significant bit is clear), [`Self::masking_index()`] gives the
    /// same result and is faster because it skips the masking step.
    ///
    /// This evaluates to `PAND` + `PSHUFB`.
    pub fn index(self => this, idxs: Vector<[u8; 16], SSE, FS>) -> Self {
        // Clearing the upper bits guarantees no index has its top bit set, so
        // `masking_index()` never takes its "write zero" path.
        let wrapped = idxs.bitand(imm!(0x0F, u8));
        this.masking_index(wrapped.cast()).primitive
    }

    /// Gather the elements of this vector selected by `idxs`, or zero.
    ///
    /// A negative index produces a zero element instead of a lookup.  Any
    /// other index is interpreted modulo 16.
    #[intrinsic_for("pshufb")]
    #[intel_equivalents("_mm_shuffle_epi8")]
    pub fn masking_index(
        self => this,
        idxs: Vector<[i8; 16], SSE, FS>,
    ) -> Self {
        decl_llvm_func!(
            "llvm.x86.ssse3.pshuf.b.128" as
            pshufb(table: u8x16, selectors: i8x16) -> u8x16);
        let table = this.primitive;
        let selectors = idxs.primitive;
        pshufb(table, selectors)
    }
});

impl_simd!(SSE[SSE2, SSSE3: "sse2,ssse3"], [i8; 16] {
    /// Gather the elements of this vector selected by `idxs`.
    ///
    /// Every index is reduced modulo 16 (only its low four bits are kept)
    /// before the lookup.  When all indices are known to be below 128 (their
    /// most significant bit is clear), [`Self::masking_index()`] gives the
    /// same result and is faster because it skips the masking step.
    ///
    /// This evaluates to `PAND` + `PSHUFB`.
    pub fn index(self => this, idxs: Vector<[u8; 16], SSE, FS>) -> Self {
        // Clearing the upper bits guarantees no index has its top bit set, so
        // `masking_index()` never takes its "write zero" path.
        let wrapped = idxs.bitand(imm!(0x0F, u8));
        this.masking_index(wrapped.cast()).primitive
    }

    /// Gather the elements of this vector selected by `idxs`, or zero.
    ///
    /// A negative index produces a zero element instead of a lookup.  Any
    /// other index is interpreted modulo 16.
    #[intrinsic_for("pshufb")]
    #[intel_equivalents("_mm_shuffle_epi8")]
    pub fn masking_index(
        self => this,
        idxs: Vector<[i8; 16], SSE, FS>,
    ) -> Self {
        decl_llvm_func!(
            "llvm.x86.ssse3.pshuf.b.128" as
            pshufb(table: i8x16, selectors: i8x16) -> i8x16);
        let table = this.primitive;
        let selectors = idxs.primitive;
        pshufb(table, selectors)
    }
});

// Concatenate 16-byte vectors and extract a vector at a constant offset.

impl_simd!(SSE[SSSE3: "ssse3"], [u8; 16] {
    /// Slice 16 elements out of `self` followed by `that`.
    ///
    /// The two vectors are treated as one 32-element sequence (`self` first,
    /// then `that`) and the 16 elements starting at position `Offset` are
    /// returned.  An offset of 0 yields `self` and an offset of 16 yields
    /// `that`.  `Offset` must not exceed 16; this is checked with
    /// `const_assert!`.
    #[intrinsic_for("palignr")]
    #[intel_equivalents("_mm_alignr_epi8")]
    pub fn concat_and_slice<Offset: Imm<u8>>(
        self => this, that: Self, _offset: Offset
    ) -> Self {
        const_assert!(Offset::VAL <= 16);
        match Offset::VAL {
            // The two extremes return one of the inputs without a shuffle.
            0 => this.primitive,
            16 => that.primitive,
            _ => simd_shuffle(this.primitive, that.primitive, const {
                simd_slice_indices::<16>(Offset::VAL as usize)
            }),
        }
    }
});

impl_simd!(SSE[SSSE3: "ssse3"], [i8; 16] {
    /// Slice 16 elements out of `self` followed by `that`.
    ///
    /// The two vectors are treated as one 32-element sequence (`self` first,
    /// then `that`) and the 16 elements starting at position `Offset` are
    /// returned.  An offset of 0 yields `self` and an offset of 16 yields
    /// `that`.  `Offset` must not exceed 16; this is checked with
    /// `const_assert!`.
    #[intrinsic_for("palignr")]
    #[intel_equivalents("_mm_alignr_epi8")]
    pub fn concat_and_slice<Offset: Imm<u8>>(
        self => this, that: Self, _offset: Offset
    ) -> Self {
        const_assert!(Offset::VAL <= 16);
        match Offset::VAL {
            // The two extremes return one of the inputs without a shuffle.
            0 => this.primitive,
            16 => that.primitive,
            _ => simd_shuffle(this.primitive, that.primitive, const {
                simd_slice_indices::<16>(Offset::VAL as usize)
            }),
        }
    }
});


#[cfg(test)]
#[cfg(target_feature = "ssse3")]
mod tests {
    use core::fmt::Debug;

    use super::*;

    use proptest::prelude::*;
    use proptest::test_runner::TestCaseResult;

    /// The features every vector in these tests is built with.
    type Feats = features!(SSE2, SSSE3);

    /// A value of [`Feats`]; constructible because this module is only
    /// compiled when the `ssse3` target feature is enabled.
    const FEATS: FeatureSet<SSE, Feats> =
        FeatureSet::new((SSE2::new(), (SSSE3::new(), ())));

    /// Shorthand for an SSE vector carrying [`Feats`].
    type Vector<T> = super::Vector<T, SSE, Feats>;

    /// Load an array into a vector using [`FEATS`].
    fn make<E, const LEN: usize>(elems: [E; LEN]) -> Vector<[E; LEN]>
    where E: Element<LEN> + Movable<SSE, LEN> {
        super::Vector::load(&elems, FEATS)
    }

    /// Check a unary vector operation against an element-wise reference.
    ///
    /// `vop` is applied to the vector loaded from `v`, `uop` is mapped over
    /// the elements of `v`, and the two results must be equal.
    fn test_una_map<T, R, const LEN: usize>(
        v: [T; LEN],
        vop: impl FnOnce(Vector<[T; LEN]>) -> Vector<[R; LEN]>,
        uop: impl Fn(T) -> R
    ) -> TestCaseResult
    where T: Element<LEN> + Movable<SSE, LEN>,
          R: Element<LEN> + PartialEq + Debug {
        let actual = vop(make(v));
        let expected = v.map(uop);
        prop_assert_eq!(actual.as_array(), &expected);
        Ok(())
    }

    // Test taking the absolute value of elements of a 16-byte vector.
    proptest! {
        #[test]
        fn abs_i8(v: [i8; 16]) {
            test_una_map(v, |x| x.abs(), i8::wrapping_abs)?;
        }

        #[test]
        fn abs_i16(v: [i16; 8]) {
            test_una_map(v, |x| x.abs(), i16::wrapping_abs)?;
        }

        #[test]
        fn abs_i32(v: [i32; 4]) {
            test_una_map(v, |x| x.abs(), i32::wrapping_abs)?;
        }
    }

    // Test shuffling elements of a 16-byte vector by variable indices.
    proptest! {
        #[test]
        fn index_u8(v: [u8; 16], i: [u8; 16]) {
            let actual = make(v).index(make(i));
            let expected = i.map(|idx| v[usize::from(idx) % 16]);
            prop_assert_eq!(actual.as_array(), &expected);
        }

        #[test]
        fn masking_index_u8(v: [u8; 16], i: [i8; 16]) {
            let actual = make(v).masking_index(make(i));
            // Negative indices must yield zero rather than a lookup.
            let expected = i.map(|idx| {
                if idx < 0 { 0 } else { v[idx as usize % 16] }
            });
            prop_assert_eq!(actual.as_array(), &expected);
        }

        #[test]
        fn index_i8(v: [i8; 16], i: [u8; 16]) {
            let actual = make(v).index(make(i));
            let expected = i.map(|idx| v[usize::from(idx) % 16]);
            prop_assert_eq!(actual.as_array(), &expected);
        }

        #[test]
        fn masking_index_i8(v: [i8; 16], i: [i8; 16]) {
            let actual = make(v).masking_index(make(i));
            // Negative indices must yield zero rather than a lookup.
            let expected = i.map(|idx| {
                if idx < 0 { 0 } else { v[idx as usize % 16] }
            });
            prop_assert_eq!(actual.as_array(), &expected);
        }
    }

    // Test concatenating 16-byte vectors and slicing at a constant offset.
    proptest! {
        #[test]
        fn concat_and_slice_u8(v: [u8; 32]) {
            let (head, tail) = (&v[..16], &v[16..]);
            let lo = Vector::<[u8; 16]>::load(head.try_into().unwrap(), FEATS);
            let hi = Vector::<[u8; 16]>::load(tail.try_into().unwrap(), FEATS);

            let at_0 = lo.concat_and_slice(hi, imm!(0, u8));
            prop_assert_eq!(at_0.as_array(), &v[0..16]);

            let at_1 = lo.concat_and_slice(hi, imm!(1, u8));
            prop_assert_eq!(at_1.as_array(), &v[1..17]);

            let at_8 = lo.concat_and_slice(hi, imm!(8, u8));
            prop_assert_eq!(at_8.as_array(), &v[8..24]);

            let at_16 = lo.concat_and_slice(hi, imm!(16, u8));
            prop_assert_eq!(at_16.as_array(), &v[16..32]);
        }

        #[test]
        fn concat_and_slice_i8(v: [i8; 32]) {
            let (head, tail) = (&v[..16], &v[16..]);
            let lo = Vector::<[i8; 16]>::load(head.try_into().unwrap(), FEATS);
            let hi = Vector::<[i8; 16]>::load(tail.try_into().unwrap(), FEATS);

            let at_0 = lo.concat_and_slice(hi, imm!(0, u8));
            prop_assert_eq!(at_0.as_array(), &v[0..16]);

            let at_1 = lo.concat_and_slice(hi, imm!(1, u8));
            prop_assert_eq!(at_1.as_array(), &v[1..17]);

            let at_8 = lo.concat_and_slice(hi, imm!(8, u8));
            prop_assert_eq!(at_8.as_array(), &v[8..24]);

            let at_16 = lo.concat_and_slice(hi, imm!(16, u8));
            prop_assert_eq!(at_16.as_array(), &v[16..32]);
        }
    }
}