use super::*;
/// Capability token for the SSSE3 instruction-set extension. The private
/// unit field means callers can only obtain one via `Feature::get_support`,
/// so holding a `SSSE3` value is proof the CPU supports these instructions.
pub struct SSSE3(());
impl Feature<FeatureGroup> for SSSE3 {
/// Yields the SSSE3 capability token if (and only if) the runtime CPU
/// feature probe reports SSSE3 support.
fn get_support(runtime: &RuntimeSupport) -> Option<Self> {
if runtime.ssse3() {
Some(Self(()))
} else {
None
}
}
}
// SSSE3 intrinsic surface, available whenever the feature set `FS` proves
// SSSE3 support. Each macro invocation declares one or more wrappers, tagged
// with the x86 instruction (`intrinsic_for`) and the Intel C intrinsic name
// (`intel_equivalents`); `defn_simd_shared!` lowers through a portable shared
// implementation, `defn_simd_llvm!` binds an LLVM intrinsic directly.
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, SSSE3> {
// Lane-wise absolute value (pabsb/pabsw/pabsd), via the shared `simd_abs`.
defn_simd_shared!("ssse3", { simd_abs(x) }, {
#[intrinsic_for("pabsb")]
#[intel_equivalents("_mm_abs_epi8")]
pub fn abs_s8x16(x: s8x16) -> s8x16;
#[intrinsic_for("pabsw")]
#[intel_equivalents("_mm_abs_epi16")]
pub fn abs_s16x8(x: s16x8) -> s16x8;
#[intrinsic_for("pabsd")]
#[intel_equivalents("_mm_abs_epi32")]
pub fn abs_s32x4(x: s32x4) -> s32x4;
});
// Horizontal (adjacent-pair) wrapping add. Wrapping addition is
// sign-agnostic, so the unsigned and signed variants deliberately bind the
// same LLVM intrinsic.
defn_simd_llvm!("ssse3", {
#[intrinsic_for("phaddw")]
#[intel_equivalents("_mm_hadd_epi16")]
pub fn concat_and_reduce_add_u16x2x4(x: u16x8, y: u16x8) -> u16x8
= "llvm.x86.ssse3.phadd.w.128";
#[intrinsic_for("phaddd")]
#[intel_equivalents("_mm_hadd_epi32")]
pub fn concat_and_reduce_add_u32x2x2(x: u32x4, y: u32x4) -> u32x4
= "llvm.x86.ssse3.phadd.d.128";
#[intrinsic_for("phaddw")]
#[intel_equivalents("_mm_hadd_epi16")]
pub fn concat_and_reduce_add_s16x2x4(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phadd.w.128";
#[intrinsic_for("phaddd")]
#[intel_equivalents("_mm_hadd_epi32")]
pub fn concat_and_reduce_add_s32x2x2(x: s32x4, y: s32x4) -> s32x4
= "llvm.x86.ssse3.phadd.d.128";
});
// Horizontal add with *signed* saturation. Saturation depends on
// signedness, so (unlike phaddw above) there is no unsigned variant.
defn_simd_llvm!("ssse3", {
#[intrinsic_for("phaddsw")]
#[intel_equivalents("_mm_hadds_epi16")]
pub fn concat_and_reduce_saturating_add_s16x2x4
(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phadd.sw.128";
});
// Horizontal (adjacent-pair) wrapping subtract; sign-agnostic, so the
// unsigned and signed variants share one LLVM intrinsic, as with phadd.
defn_simd_llvm!("ssse3", {
#[intrinsic_for("phsubw")]
#[intel_equivalents("_mm_hsub_epi16")]
pub fn concat_and_reduce_sub_u16x2x4(x: u16x8, y: u16x8) -> u16x8
= "llvm.x86.ssse3.phsub.w.128";
#[intrinsic_for("phsubd")]
#[intel_equivalents("_mm_hsub_epi32")]
pub fn concat_and_reduce_sub_u32x2x2(x: u32x4, y: u32x4) -> u32x4
= "llvm.x86.ssse3.phsub.d.128";
#[intrinsic_for("phsubw")]
#[intel_equivalents("_mm_hsub_epi16")]
pub fn concat_and_reduce_sub_s16x2x4(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phsub.w.128";
#[intrinsic_for("phsubd")]
#[intel_equivalents("_mm_hsub_epi32")]
pub fn concat_and_reduce_sub_s32x2x2(x: s32x4, y: s32x4) -> s32x4
= "llvm.x86.ssse3.phsub.d.128";
});
// Horizontal subtract with signed saturation (signed only, as for phaddsw).
defn_simd_llvm!("ssse3", {
#[intrinsic_for("phsubsw")]
#[intel_equivalents("_mm_hsubs_epi16")]
pub fn concat_and_reduce_saturating_sub_s16x2x4
(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phsub.sw.128";
});
// pmaddubsw: multiply unsigned bytes of `x` by the corresponding *signed*
// bytes of `y`, then saturating-add adjacent 16-bit products into s16 lanes.
// FIX: the operands were previously declared as `u16x8`/`u16x8`, but the
// instruction (and llvm.x86.ssse3.pmadd.ub.sw.128) consumes packed 8-bit
// lanes — unsigned in the first operand, signed in the second — matching the
// `u8x2x8` (eight pairs of u8) in the function name.
defn_simd_llvm!("ssse3", {
#[intrinsic_for("pmaddubsw")]
#[intel_equivalents("_mm_maddubs_epi16")]
pub fn sum_of_prod_u8x2x8
(x: u8x16, y: s8x16) -> s16x8
= "llvm.x86.ssse3.pmadd.ub.sw.128";
});
// pmulhrsw: per-lane Q15 fixed-point multiply with rounding — effectively
// ((x*y >> 14) + 1) >> 1. NOTE(review): "mul_and_halve" understates this
// (it keeps the rounded high half, not half the product) — confirm the
// naming intent before relying on the name alone.
defn_simd_llvm!("ssse3", {
#[intrinsic_for("pmulhrsw")]
#[intel_equivalents("_mm_mulhrs_epi16")]
pub fn mul_and_halve_s16x8
(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.pmul.hr.sw.128";
});
// psign: per lane, negate / zero / pass through `x` according to whether the
// corresponding lane of `y` is negative / zero / positive.
defn_simd_llvm!("ssse3", {
#[intrinsic_for("psignb")]
#[intel_equivalents("_mm_sign_epi8")]
pub fn mul_sign_s8x16(x: s8x16, y: s8x16) -> s8x16
= "llvm.x86.ssse3.psign.b.128";
#[intrinsic_for("psignw")]
#[intel_equivalents("_mm_sign_epi16")]
pub fn mul_sign_s16x8(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.psign.w.128";
#[intrinsic_for("psignd")]
#[intel_equivalents("_mm_sign_epi32")]
pub fn mul_sign_s32x4(x: s32x4, y: s32x4) -> s32x4
= "llvm.x86.ssse3.psign.d.128";
});
// palignr: take 16 consecutive bytes starting at byte offset SHIFT from the
// concatenation of the two inputs, lowered as a compile-time shuffle.
// NOTE(review): SHIFT is restricted to < 16 here although the hardware
// accepts immediates up to 31 (larger shifts zero-fill) — presumably
// intentional, since the shuffle-based shared impl has no zero source;
// confirm if the full immediate range is ever needed.
defn_simd_shared!("ssse3", fn(T, U) -> R {
const_assert!(SHIFT < 16);
simd_shuffle(x, y, const {
simd_slice_indices::<R>(SHIFT as usize)
})
}, {
#[intrinsic_for("palignr")]
#[intel_equivalents("_mm_alignr_epi8")]
pub fn align_elems_by_u8x16<SHIFT: u8>(x: u8x16, y: u8x16) -> u8x16;
#[intrinsic_for("palignr")]
#[intel_equivalents("_mm_alignr_epi8")]
pub fn align_elems_by_s8x16<SHIFT: u8>(x: s8x16, y: s8x16) -> s8x16;
});
// pshufb: per-byte table lookup — each output byte selects a byte of `x` by
// the low 4 bits of the matching index byte; an index byte with its high bit
// set yields zero (per the _mm_shuffle_epi8 specification).
defn_simd_llvm!("ssse3", {
#[intrinsic_for("pshufb")]
#[intel_equivalents("_mm_shuffle_epi8")]
pub fn shuffle_u8x16(x: u8x16, idxs: s8x16) -> u8x16
= "llvm.x86.ssse3.pshuf.b.128";
#[intrinsic_for("pshufb")]
#[intel_equivalents("_mm_shuffle_epi8")]
pub fn shuffle_s8x16(x: s8x16, idxs: s8x16) -> s8x16
= "llvm.x86.ssse3.pshuf.b.128";
});
}