use super::*;
/// Zero-sized witness type proving SSE4.1 is supported; the private unit
/// field prevents construction outside this module, so instances can only
/// come from a successful runtime check.
pub struct SSE4_1(());
/// Runtime detection for SSE4.1: the witness is handed out only when the
/// CPU reports support, so possession of an `SSE4_1` value proves the
/// feature is usable.
impl Feature<FeatureGroup> for SSE4_1 {
fn get_support(runtime: &RuntimeSupport) -> Option<Self> {
// Equivalent to `runtime.sse4_1().then_some(Self(()))`.
if runtime.sse4_1() {
Some(Self(()))
} else {
None
}
}
}
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, SSE4_1> {
// Immediate-mask 16-bit blend (pblendw): result lane i is taken from `y`
// when bit i of the const MASK is set, otherwise from `x`.
defn_simd_shared!("sse4.1", {
simd_select_bitmask(MASK, y, x)
}, {
#[intrinsic_for("pblendw")]
#[intel_equivalents("_mm_blend_epi16")]
pub fn blend_by_u16x8<MASK: u8>(x: u16x8, y: u16x8) -> u16x8;
#[intrinsic_for("pblendw")]
#[intel_equivalents("_mm_blend_epi16")]
pub fn blend_by_s16x8<MASK: u8>(x: s16x8, y: s16x8) -> s16x8;
});
// Variable byte blend (pblendvb): result lane i comes from `y` when the top
// bit of mask lane i is set (lane is negative), otherwise from `x`.
defn_simd_shared!("sse4.1", {
// Turn the sign bit of each mask byte into a full-lane boolean mask
// for simd_select, matching pblendvb's "high bit selects" semantics.
let mask: s8x16 = simd_lt(mask, s8x16::splat(0));
simd_select(mask, y, x)
}, {
#[intrinsic_for("pblendvb")]
#[intel_equivalents("_mm_blendv_epi8")]
pub fn blend_u8x16(x: u8x16, y: u8x16, mask: s8x16) -> u8x16;
#[intrinsic_for("pblendvb")]
#[intel_equivalents("_mm_blendv_epi8")]
pub fn blend_s8x16(x: s8x16, y: s8x16, mask: s8x16) -> s8x16;
});
// pmuldq: multiplies the (sign-extended) low 32 bits of each 64-bit lane,
// producing full 64-bit signed products.
defn_simd_manual!("sse4.1", {
#[intrinsic_for("pmuldq")]
#[intel_equivalents("_mm_mul_epi32")]
pub fn mul_s32_s64x2(a: s64x2, b: s64x2) -> s64x2 {
// Truncate each 64-bit lane to its low 32 bits...
let [a, b]: [s32x2; 2] = [simd_cast(a), simd_cast(b)];
// ...then sign-extend back to 64 bits and multiply, so the product is
// computed on the sign-extended low halves, exactly as pmuldq does.
simd_mul(simd_cast(a), simd_cast(b))
}
});
// pmulld: lane-wise 32-bit multiply keeping only the low 32 bits of each
// product (wrapping semantics).
defn_simd_manual!("sse4.1", {
#[intrinsic_for("pmulld")]
#[intel_equivalents("_mm_mullo_epi32")]
pub fn mul_lo_s32x4(a: s32x4, b: s32x4) -> s32x4 {
simd_mul(a, b)
}
});
// pcmpeqq: lane-wise 64-bit equality; each result lane is all-ones when the
// operand lanes are equal and all-zeros otherwise.
defn_simd_manual!("sse4.1", {
#[intrinsic_for("pcmpeqq")]
#[intel_equivalents("_mm_cmpeq_epi64")]
pub fn cmp_eq_u64x2(a: u64x2, b: u64x2) -> u64x2 = simd_eq;
#[intrinsic_for("pcmpeqq")]
#[intel_equivalents("_mm_cmpeq_epi64")]
pub fn cmp_eq_s64x2(a: s64x2, b: s64x2) -> s64x2 = simd_eq;
});
// Lane-wise maximum for the integer element widths SSE4.1 added
// (SSE2 already covered u8 and s16).
defn_simd_shared!("sse4.1", { simd_max(a, b) }, {
#[intrinsic_for("pmaxuw")]
#[intel_equivalents("_mm_max_epu16")]
pub fn max_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pmaxud")]
#[intel_equivalents("_mm_max_epu32")]
pub fn max_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("pmaxsb")]
#[intel_equivalents("_mm_max_epi8")]
pub fn max_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pmaxsd")]
#[intel_equivalents("_mm_max_epi32")]
pub fn max_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
// Lane-wise minimum for the integer element widths SSE4.1 added,
// mirroring the max_* group above it in instruction coverage.
defn_simd_shared!("sse4.1", { simd_min(a, b) }, {
#[intrinsic_for("pminuw")]
#[intel_equivalents("_mm_min_epu16")]
pub fn min_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pminud")]
#[intel_equivalents("_mm_min_epu32")]
pub fn min_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("pminsb")]
#[intel_equivalents("_mm_min_epi8")]
pub fn min_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pminsd")]
#[intel_equivalents("_mm_min_epi32")]
pub fn min_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
// Widening conversions (pmovzx* / pmovsx*): each function consumes only the
// low lanes of its input and extends them to the wider element type —
// pmovzx* zero-extends the unsigned sources, pmovsx* sign-extends the signed
// ones. The middle type parameter of simd_expand (e.g. u8x8, u8x4, u8x2)
// presumably names the sub-vector of low lanes actually consumed — its lane
// count always equals the output lane count (TODO confirm against the
// simd_expand definition).
defn_simd_manual!("sse4.1", {
#[intrinsic_for("pmovzxbw")]
#[intel_equivalents("_mm_cvtepu8_epi16")]
pub fn expand_u8x16_u16x8(x: u8x16) -> u16x8
= simd_expand::<_, u8x8, _>;
#[intrinsic_for("pmovzxbw")]
#[intel_equivalents("_mm_cvtepu8_epi16")]
pub fn expand_u8x16_s16x8(x: u8x16) -> s16x8
= simd_expand::<_, u8x8, _>;
#[intrinsic_for("pmovzxbd")]
#[intel_equivalents("_mm_cvtepu8_epi32")]
pub fn expand_u8x16_u32x4(x: u8x16) -> u32x4
= simd_expand::<_, u8x4, _>;
#[intrinsic_for("pmovzxbd")]
#[intel_equivalents("_mm_cvtepu8_epi32")]
pub fn expand_u8x16_s32x4(x: u8x16) -> s32x4
= simd_expand::<_, u8x4, _>;
#[intrinsic_for("pmovzxbq")]
#[intel_equivalents("_mm_cvtepu8_epi64")]
pub fn expand_u8x16_u64x2(x: u8x16) -> u64x2
= simd_expand::<_, u8x2, _>;
#[intrinsic_for("pmovzxbq")]
#[intel_equivalents("_mm_cvtepu8_epi64")]
pub fn expand_u8x16_s64x2(x: u8x16) -> s64x2
= simd_expand::<_, u8x2, _>;
#[intrinsic_for("pmovzxwd")]
#[intel_equivalents("_mm_cvtepu16_epi32")]
pub fn expand_u16x8_u32x4(x: u16x8) -> u32x4
= simd_expand::<_, u16x4, _>;
#[intrinsic_for("pmovzxwd")]
#[intel_equivalents("_mm_cvtepu16_epi32")]
pub fn expand_u16x8_s32x4(x: u16x8) -> s32x4
= simd_expand::<_, u16x4, _>;
#[intrinsic_for("pmovzxwq")]
#[intel_equivalents("_mm_cvtepu16_epi64")]
pub fn expand_u16x8_u64x2(x: u16x8) -> u64x2
= simd_expand::<_, u16x2, _>;
#[intrinsic_for("pmovzxwq")]
#[intel_equivalents("_mm_cvtepu16_epi64")]
pub fn expand_u16x8_s64x2(x: u16x8) -> s64x2
= simd_expand::<_, u16x2, _>;
#[intrinsic_for("pmovzxdq")]
#[intel_equivalents("_mm_cvtepu32_epi64")]
pub fn expand_u32x4_u64x2(x: u32x4) -> u64x2
= simd_expand::<_, u32x2, _>;
#[intrinsic_for("pmovzxdq")]
#[intel_equivalents("_mm_cvtepu32_epi64")]
pub fn expand_u32x4_s64x2(x: u32x4) -> s64x2
= simd_expand::<_, u32x2, _>;
// Sign-extending variants: only signed destinations exist, so there is
// one function per (source, destination) width pair.
#[intrinsic_for("pmovsxbw")]
#[intel_equivalents("_mm_cvtepi8_epi16")]
pub fn expand_s8x16_s16x8(x: s8x16) -> s16x8
= simd_expand::<_, s8x8, _>;
#[intrinsic_for("pmovsxbd")]
#[intel_equivalents("_mm_cvtepi8_epi32")]
pub fn expand_s8x16_s32x4(x: s8x16) -> s32x4
= simd_expand::<_, s8x4, _>;
#[intrinsic_for("pmovsxbq")]
#[intel_equivalents("_mm_cvtepi8_epi64")]
pub fn expand_s8x16_s64x2(x: s8x16) -> s64x2
= simd_expand::<_, s8x2, _>;
#[intrinsic_for("pmovsxwd")]
#[intel_equivalents("_mm_cvtepi16_epi32")]
pub fn expand_s16x8_s32x4(x: s16x8) -> s32x4
= simd_expand::<_, s16x4, _>;
#[intrinsic_for("pmovsxwq")]
#[intel_equivalents("_mm_cvtepi16_epi64")]
pub fn expand_s16x8_s64x2(x: s16x8) -> s64x2
= simd_expand::<_, s16x2, _>;
#[intrinsic_for("pmovsxdq")]
#[intel_equivalents("_mm_cvtepi32_epi64")]
pub fn expand_s32x4_s64x2(x: s32x4) -> s64x2
= simd_expand::<_, s32x2, _>;
});
// packusdw: narrows each signed 32-bit lane to u16 with unsigned saturation
// and concatenates `a` (low half of the result) with `b` (high half).
defn_simd_llvm!("sse4.1", {
#[intrinsic_for("packusdw")]
#[intel_equivalents("_mm_packus_epi32")]
pub fn concat_and_saturate_u16_s32x4(a: s32x4, b: s32x4) -> u16x8
// FIX: the SSE4.1 LLVM intrinsic carries no ".128" width suffix — that
// suffix belongs to the SSE2 pack intrinsics (e.g.
// "llvm.x86.sse2.packssdw.128"); "llvm.x86.sse41.packusdw.128" would
// fail to link.
= "llvm.x86.sse41.packusdw";
});
// Scalar lane extraction (pextrb/pextrd/pextrq): returns lane INDEX of the
// vector, with the index validated at compile time.
defn_simd_shared!("sse4.1", fn(T) -> R {
// Reject out-of-range lane indices at compile time rather than relying
// on the instruction's modular index behavior.
const_assert!(INDEX < T::LEN as u8);
simd_extract(x, const { INDEX as u32 })
}, {
#[intrinsic_for("pextrb")]
#[intel_equivalents("_mm_extract_epi8")]
pub fn get_u8x16<INDEX: u8>(x: u8x16) -> u8;
#[intrinsic_for("pextrd")]
#[intel_equivalents("_mm_extract_epi32")]
pub fn get_u32x4<INDEX: u8>(x: u32x4) -> u32;
#[intrinsic_for("pextrq")]
#[intel_equivalents("_mm_extract_epi64")]
pub fn get_u64x2<INDEX: u8>(x: u64x2) -> u64;
#[intrinsic_for("pextrb")]
#[intel_equivalents("_mm_extract_epi8")]
pub fn get_s8x16<INDEX: u8>(x: s8x16) -> i8;
#[intrinsic_for("pextrd")]
#[intel_equivalents("_mm_extract_epi32")]
pub fn get_s32x4<INDEX: u8>(x: s32x4) -> i32;
#[intrinsic_for("pextrq")]
#[intel_equivalents("_mm_extract_epi64")]
pub fn get_s64x2<INDEX: u8>(x: s64x2) -> i64;
});
// Scalar lane insertion (pinsrb/pinsrd/pinsrq): returns a copy of the vector
// with lane INDEX replaced by `e`, the index validated at compile time.
defn_simd_shared!("sse4.1", fn(T, E) -> R {
// Reject out-of-range lane indices at compile time, mirroring the get_*
// group above.
const_assert!(INDEX < T::LEN as u8);
simd_insert(x, const { INDEX as u32 }, e)
}, {
#[intrinsic_for("pinsrb")]
#[intel_equivalents("_mm_insert_epi8")]
pub fn put_u8x16<INDEX: u8>(x: u8x16, e: u8) -> u8x16;
#[intrinsic_for("pinsrd")]
#[intel_equivalents("_mm_insert_epi32")]
pub fn put_u32x4<INDEX: u8>(x: u32x4, e: u32) -> u32x4;
#[intrinsic_for("pinsrq")]
#[intel_equivalents("_mm_insert_epi64")]
pub fn put_u64x2<INDEX: u8>(x: u64x2, e: u64) -> u64x2;
#[intrinsic_for("pinsrb")]
#[intel_equivalents("_mm_insert_epi8")]
pub fn put_s8x16<INDEX: u8>(x: s8x16, e: i8) -> s8x16;
#[intrinsic_for("pinsrd")]
// FIX: was "_mm_insert_epi8" — a copy-paste error; the Intel intrinsic
// for pinsrd is _mm_insert_epi32 (cf. put_u32x4 above).
#[intel_equivalents("_mm_insert_epi32")]
pub fn put_s32x4<INDEX: u8>(x: s32x4, e: i32) -> s32x4;
#[intrinsic_for("pinsrq")]
#[intel_equivalents("_mm_insert_epi64")]
pub fn put_s64x2<INDEX: u8>(x: s64x2, e: i64) -> s64x2;
});
// phminposuw: horizontal minimum of unsigned 16-bit lanes. Result lane 0
// holds the minimum value, lane 1 its index, and the remaining lanes are
// zeroed.
defn_simd_llvm!("sse4.1", {
// FIX: the x86 instruction is PHMINPOSUW ("unsigned word"); the previous
// "phminposw" spelling is not a real mnemonic, and the matching LLVM
// symbol "llvm.x86.sse41.phminposw" does not exist and would fail to
// link.
#[intrinsic_for("phminposuw")]
#[intel_equivalents("_mm_minpos_epu16")]
pub fn min_pos_u16x8(x: u16x8) -> u16x8
= "llvm.x86.sse41.phminposuw";
});
}