use core::ptr;
use super::*;
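// Witness type: the only way to obtain an SSE2 value is through get_support,
// so holding one proves the CPU reported SSE2 support at runtime.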
pub struct SSE2(());
impl Feature<FeatureGroup> for SSE2 {
fn get_support(runtime: &RuntimeSupport) -> Option<Self> {
runtime.sse2().then_some(Self(()))
}
}
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, SSE2> {
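// Move a scalar into lane 0. Like _mm_cvtsi32_si128, simd_set_first
// presumably zeroes the remaining lanes. The 64-bit forms need a 64-bit GPR
// and so exist only on x86_64.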
defn_simd_shared!("sse2", { simd_set_first(value) }, {
#[intrinsic_for("movd")]
#[intel_equivalents("_mm_cvtsi32_si128")]
pub fn set_first_u32x4(value: u32) -> u32x4;
#[intrinsic_for("movd")]
#[intel_equivalents("_mm_cvtsi32_si128")]
pub fn set_first_s32x4(value: i32) -> s32x4;
#[intrinsic_for("movq")]
#[intel_equivalents("_mm_cvtsi64_si128", "_mm_cvtsi64x_si128")]
#[cfg(target_arch = "x86_64")]
pub fn set_first_u64x2(value: u64) -> u64x2;
#[intrinsic_for("movq")]
#[intel_equivalents("_mm_cvtsi64_si128", "_mm_cvtsi64x_si128")]
#[cfg(target_arch = "x86_64")]
pub fn set_first_s64x2(value: i64) -> s64x2;
});
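// Read lane 0 back out as a scalar (movd/movq in the register-to-GPR
// direction).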
defn_simd_shared!("sse2", { simd_extract(x, 0) }, {
#[intrinsic_for("movd")]
#[intel_equivalents("_mm_cvtsi128_si32")]
pub fn get_first_u32x4(x: u32x4) -> u32;
#[intrinsic_for("movq")]
#[intel_equivalents("_mm_cvtsi128_si64", "_mm_cvtsi128_si64x")]
#[cfg(target_arch = "x86_64")]
pub fn get_first_u64x2(x: u64x2) -> u64;
#[intrinsic_for("movd")]
#[intel_equivalents("_mm_cvtsi128_si32")]
pub fn get_first_s32x4(x: s32x4) -> i32;
#[intrinsic_for("movq")]
#[intel_equivalents("_mm_cvtsi128_si64", "_mm_cvtsi128_si64x")]
#[cfg(target_arch = "x86_64")]
pub fn get_first_s64x2(x: s64x2) -> i64;
});
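// Insert a 16-bit scalar at a compile-time lane index. pinsrw encodes the
// lane as an immediate, so INDEX is a const generic and the const_assert
// rejects out-of-range lanes at compile time.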
defn_simd_shared!("sse2", fn(T, E) -> R {
const_assert!(INDEX < T::LEN as u8);
simd_insert(x, const { INDEX as u32 }, e)
}, {
#[intrinsic_for("pinsrw")]
#[intel_equivalents("_mm_insert_epi16")]
pub fn put_u16x8<INDEX: u8>(x: u16x8, e: u16) -> u16x8;
#[intrinsic_for("pinsrw")]
#[intel_equivalents("_mm_insert_epi16")]
pub fn put_s16x8<INDEX: u8>(x: s16x8, e: i16) -> s16x8;
});
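// Extract a 16-bit lane by compile-time index (pextrw), the inverse of the
// put_* functions above.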
defn_simd_shared!("sse2", fn(T) -> E {
const_assert!(INDEX < T::LEN as u8);
simd_extract(x, const { INDEX as u32 })
}, {
#[intrinsic_for("pextrw")]
#[intel_equivalents("_mm_extract_epi16")]
pub fn get_u16x8<INDEX: u8>(x: u16x8) -> u16;
#[intrinsic_for("pextrw")]
#[intel_equivalents("_mm_extract_epi16")]
pub fn get_s16x8<INDEX: u8>(x: s16x8) -> i16;
});
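// Unaligned 128-bit loads (movdqu). The &[T; N] argument guarantees 16 valid
// bytes but imposes no 16-byte alignment, hence read_unaligned.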
defn_simd_shared!("sse2", {
ptr::read_unaligned(ptr as *const _ as *const _)
}, {
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u8x16(ptr: &[u8; 16]) -> u8x16;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u16x8(ptr: &[u16; 8]) -> u16x8;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u32x4(ptr: &[u32; 4]) -> u32x4;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u64x2(ptr: &[u64; 2]) -> u64x2;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s8x16(ptr: &[i8; 16]) -> s8x16;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s16x8(ptr: &[i16; 8]) -> s16x8;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s32x4(ptr: &[i32; 4]) -> s32x4;
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s64x2(ptr: &[i64; 2]) -> s64x2;
});
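// Aligned 128-bit loads (movdqa). References to the vector types carry the
// types' own 16-byte alignment, so a plain dereference is sound and is free
// to lower to an aligned load.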
defn_simd_shared!("sse2", { *ptr }, {
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u8x16(ptr: &u8x16) -> u8x16;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u16x8(ptr: &u16x8) -> u16x8;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u32x4(ptr: &u32x4) -> u32x4;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u64x2(ptr: &u64x2) -> u64x2;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s8x16(ptr: &s8x16) -> s8x16;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s16x8(ptr: &s16x8) -> s16x8;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s32x4(ptr: &s32x4) -> s32x4;
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s64x2(ptr: &s64x2) -> s64x2;
});
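// Unaligned 128-bit stores (movdqu), mirroring the loads above.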
defn_simd_shared!("sse2", {
ptr::write_unaligned(ptr as *mut _ as *mut _, x)
}, {
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u8x16(x: u8x16, ptr: &mut [u8; 16]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u16x8(x: u16x8, ptr: &mut [u16; 8]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u32x4(x: u32x4, ptr: &mut [u32; 4]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u64x2(x: u64x2, ptr: &mut [u64; 2]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s8x16(x: s8x16, ptr: &mut [i8; 16]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s16x8(x: s16x8, ptr: &mut [i16; 8]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s32x4(x: s32x4, ptr: &mut [i32; 4]);
#[intrinsic_for("movdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s64x2(x: s64x2, ptr: &mut [i64; 2]);
});
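// Aligned 128-bit stores (movdqa) through a reference to the vector type.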
defn_simd_shared!("sse2", { *ptr = x }, {
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u8x16(x: u8x16, ptr: &mut u8x16);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u16x8(x: u16x8, ptr: &mut u16x8);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u32x4(x: u32x4, ptr: &mut u32x4);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u64x2(x: u64x2, ptr: &mut u64x2);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s8x16(x: s8x16, ptr: &mut s8x16);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s16x8(x: s16x8, ptr: &mut s16x8);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s32x4(x: s32x4, ptr: &mut s32x4);
#[intrinsic_for("movdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s64x2(x: s64x2, ptr: &mut s64x2);
});
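// Lane-wise wrapping addition: overflow wraps modulo the lane width, e.g.
// 250u8 + 10 == 4 in each u8 lane. Subtraction below wraps the same way.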
defn_simd_shared!("sse2", { simd_add(a, b) }, {
#[intrinsic_for("paddb")]
#[intel_equivalents("_mm_add_epi8")]
pub fn add_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("paddw")]
#[intel_equivalents("_mm_add_epi16")]
pub fn add_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("paddd")]
#[intel_equivalents("_mm_add_epi32")]
pub fn add_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("paddq")]
#[intel_equivalents("_mm_add_epi64")]
pub fn add_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("paddb")]
#[intel_equivalents("_mm_add_epi8")]
pub fn add_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("paddw")]
#[intel_equivalents("_mm_add_epi16")]
pub fn add_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("paddd")]
#[intel_equivalents("_mm_add_epi32")]
pub fn add_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("paddq")]
#[intel_equivalents("_mm_add_epi64")]
pub fn add_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("sse2", { simd_sub(a, b) }, {
#[intrinsic_for("psubb")]
#[intel_equivalents("_mm_sub_epi8")]
pub fn sub_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("psubw")]
#[intel_equivalents("_mm_sub_epi16")]
pub fn sub_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("psubd")]
#[intel_equivalents("_mm_sub_epi32")]
pub fn sub_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("psubq")]
#[intel_equivalents("_mm_sub_epi64")]
pub fn sub_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("psubb")]
#[intel_equivalents("_mm_sub_epi8")]
pub fn sub_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("psubw")]
#[intel_equivalents("_mm_sub_epi16")]
pub fn sub_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("psubd")]
#[intel_equivalents("_mm_sub_epi32")]
pub fn sub_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("psubq")]
#[intel_equivalents("_mm_sub_epi64")]
pub fn sub_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
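// Saturating signed add/sub (this group and the next): results clamp to the
// lane type's range instead of wrapping, e.g. 100i8 + 100 == 127.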
defn_simd_shared!("sse2", { simd_saturating_add(a, b) }, {
#[intrinsic_for("paddsb")]
#[intel_equivalents("_mm_adds_epi8")]
pub fn saturating_add_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("paddsw")]
#[intel_equivalents("_mm_adds_epi16")]
pub fn saturating_add_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
defn_simd_shared!("sse2", { simd_saturating_sub(a, b) }, {
#[intrinsic_for("psubsb")]
#[intel_equivalents("_mm_subs_epi8")]
pub fn saturating_sub_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("psubsw")]
#[intel_equivalents("_mm_subs_epi16")]
pub fn saturating_sub_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
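// Low-half multiply: keeps the low 16 bits of each 32-bit product (pmullw);
// that bit pattern is the same for signed and unsigned lanes.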
defn_simd_shared!("sse2", { simd_mul(a, b) }, {
#[intrinsic_for("pmullw")]
#[intel_equivalents("_mm_mullo_epi16")]
pub fn mul_lo_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pmullw")]
#[intel_equivalents("_mm_mullo_epi16")]
pub fn mul_lo_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
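// High-half multiply has no generic simd intrinsic, so widen each lane to 32
// bits, multiply, shift right by 16, and narrow again; LLVM can lower this
// pattern to a single pmulhuw/pmulhw.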
defn_simd_manual!("sse2", {
#[intrinsic_for("pmulhuw")]
#[intel_equivalents("_mm_mulhi_epu16")]
pub fn mul_hi_u16x8(a: u16x8, b: u16x8) -> u16x8 {
let prod = simd_mul(simd_cast(a), simd_cast(b));
simd_cast(simd_shr(prod, u32x8::splat(16)))
}
#[intrinsic_for("pmulhw")]
#[intel_equivalents("_mm_mulhi_epi16")]
pub fn mul_hi_s16x8(a: s16x8, b: s16x8) -> s16x8 {
let prod = simd_mul(simd_cast(a), simd_cast(b));
simd_cast(simd_shr(prod, s32x8::splat(16)))
}
});
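// pmuludq: multiply the low 32 bits of each 64-bit lane into a full 64-bit
// product. Casting to u32x2 truncates each lane to its low half; casting
// back zero-extends before the multiply.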
defn_simd_manual!("sse2", {
#[intrinsic_for("pmuludq")]
#[intel_equivalents("_mm_mul_epu32")]
pub fn mul_u32_u64x2(a: u64x2, b: u64x2) -> u64x2 {
let [a, b]: [u32x2; 2] = [simd_cast(a), simd_cast(b)];
simd_mul(simd_cast(a), simd_cast(b))
}
});
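// pmaddwd: multiply corresponding i16 lanes into i32 products and add each
// horizontal pair, giving four i32 sums; no generic intrinsic expresses
// this, hence the direct LLVM binding.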
defn_simd_llvm!("sse2", {
#[intrinsic_for("pmaddwd")]
#[intel_equivalents("_mm_madd_epi16")]
pub fn sum_of_prod_s16x2x4(x: s16x8, y: s16x8) -> s32x4
= "llvm.x86.sse2.pmadd.wd";
});
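// Lane-wise compares return all-ones in a lane on success and all-zeros
// otherwise, directly usable as a select mask.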
defn_simd_shared!("sse2", { simd_eq(a, b) }, {
#[intrinsic_for("pcmpeqb")]
#[intel_equivalents("_mm_cmpeq_epi8")]
pub fn cmp_eq_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("pcmpeqw")]
#[intel_equivalents("_mm_cmpeq_epi16")]
pub fn cmp_eq_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pcmpeqd")]
#[intel_equivalents("_mm_cmpeq_epi32")]
pub fn cmp_eq_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("pcmpeqb")]
#[intel_equivalents("_mm_cmpeq_epi8")]
pub fn cmp_eq_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pcmpeqw")]
#[intel_equivalents("_mm_cmpeq_epi16")]
pub fn cmp_eq_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("pcmpeqd")]
#[intel_equivalents("_mm_cmpeq_epi32")]
pub fn cmp_eq_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
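// SSE2 has no unsigned packed compares, so only the signed greater-than
// forms exist; the _mm_cmplt_* intrinsics are the same instruction with the
// operands swapped.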
defn_simd_shared!("sse2", { simd_gt(a, b) }, {
#[intrinsic_for("pcmpgtb")]
#[intel_equivalents("_mm_cmpgt_epi8", "_mm_cmplt_epi8")]
pub fn cmp_gt_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pcmpgtw")]
#[intel_equivalents("_mm_cmpgt_epi16", "_mm_cmplt_epi16")]
pub fn cmp_gt_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("pcmpgtd")]
#[intel_equivalents("_mm_cmpgt_epi32", "_mm_cmplt_epi32")]
pub fn cmp_gt_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
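// The bitwise groups below (and, ior, xor) act on the full 128 bits
// regardless of lane width; the per-type wrappers exist purely for type
// safety.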
defn_simd_shared!("sse2", { simd_and(a, b) }, {
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("pand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("sse2", { simd_or(a, b) }, {
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("por")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("sse2", { simd_xor(a, b) }, {
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("pxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
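// pandn computes (!a) & b: the complement applies to the first operand
// (assuming simd_andnot follows the pandn argument order).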
defn_simd_shared!("sse2", { simd_andnot(a, b) }, {
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("pandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
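// Rounding average (a + b + 1) >> 1, computed in the doubled lane width so
// the intermediate sum cannot overflow, matching pavgb/pavgw.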
defn_simd_manual!("sse2", {
#[intrinsic_for("pavgb")]
#[intel_equivalents("_mm_avg_epu8")]
pub fn avg_u8x16(a: u8x16, b: u8x16) -> u8x16
= simd_avg::<u8x16, u16x16>;
#[intrinsic_for("pavgw")]
#[intel_equivalents("_mm_avg_epu16")]
pub fn avg_u16x8(a: u16x8, b: u16x8) -> u16x8
= simd_avg::<u16x8, u32x8>;
});
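// SSE2 provides only unsigned-byte and signed-word max/min (pmaxub/pmaxsw
// here, pminub/pminsw below); the other width/sign combinations arrived
// with SSE4.1.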
defn_simd_shared!("sse2", { simd_max(a, b) }, {
#[intrinsic_for("pmaxub")]
#[intel_equivalents("_mm_max_epu8")]
pub fn max_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("pmaxsw")]
#[intel_equivalents("_mm_max_epi16")]
pub fn max_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
defn_simd_shared!("sse2", { simd_min(a, b) }, {
#[intrinsic_for("pminub")]
#[intel_equivalents("_mm_min_epu8")]
pub fn min_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("pminsw")]
#[intel_equivalents("_mm_min_epi16")]
pub fn min_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
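// Whole-vector shifts: every lane shifts by the count held in the low 64
// bits of `s`. Unlike scalar shifts, counts >= the lane width are well
// defined: the result is zeroed (or sign-filled, for the psra forms below).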
defn_simd_llvm!("sse2", {
#[intrinsic_for("psllw")]
#[intel_equivalents("_mm_sll_epi16")]
pub fn shl_all_u16x8(x: u16x8, s: u64x2) -> u16x8
= "llvm.x86.sse2.psll.w";
#[intrinsic_for("pslld")]
#[intel_equivalents("_mm_sll_epi32")]
pub fn shl_all_u32x4(x: u32x4, s: u64x2) -> u32x4
= "llvm.x86.sse2.psll.d";
#[intrinsic_for("psllq")]
#[intel_equivalents("_mm_sll_epi64")]
pub fn shl_all_u64x2(x: u64x2, s: u64x2) -> u64x2
= "llvm.x86.sse2.psll.q";
#[intrinsic_for("psllw")]
#[intel_equivalents("_mm_sll_epi16")]
pub fn shl_all_s16x8(x: s16x8, s: u64x2) -> s16x8
= "llvm.x86.sse2.psll.w";
#[intrinsic_for("pslld")]
#[intel_equivalents("_mm_sll_epi32")]
pub fn shl_all_s32x4(x: s32x4, s: u64x2) -> s32x4
= "llvm.x86.sse2.psll.d";
#[intrinsic_for("psllq")]
#[intel_equivalents("_mm_sll_epi64")]
pub fn shl_all_s64x2(x: s64x2, s: u64x2) -> s64x2
= "llvm.x86.sse2.psll.q";
});
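// Immediate-count left shifts: BITS is a const generic so it can be encoded
// directly in the instruction (psllw/pslld/psllq with an imm8).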
defn_simd_shared!("sse2", { simd_shl_all::<_, BITS>(x) }, {
#[intrinsic_for("psllw")]
#[intel_equivalents("_mm_slli_epi16")]
pub fn shl_all_by_u16x8<BITS: u8>(x: u16x8) -> u16x8;
#[intrinsic_for("pslld")]
#[intel_equivalents("_mm_slli_epi32")]
pub fn shl_all_by_u32x4<BITS: u8>(x: u32x4) -> u32x4;
#[intrinsic_for("psllq")]
#[intel_equivalents("_mm_slli_epi64")]
pub fn shl_all_by_u64x2<BITS: u8>(x: u64x2) -> u64x2;
#[intrinsic_for("psllw")]
#[intel_equivalents("_mm_slli_epi16")]
pub fn shl_all_by_s16x8<BITS: u8>(x: s16x8) -> s16x8;
#[intrinsic_for("pslld")]
#[intel_equivalents("_mm_slli_epi32")]
pub fn shl_all_by_s32x4<BITS: u8>(x: s32x4) -> s32x4;
#[intrinsic_for("psllq")]
#[intel_equivalents("_mm_slli_epi64")]
pub fn shl_all_by_s64x2<BITS: u8>(x: s64x2) -> s64x2;
});
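// Right shifts are logical (psrl*) for unsigned lanes and arithmetic
// (psra*) for signed ones. SSE2 has no psraq, so the 64-bit arithmetic
// variant is absent here and in the immediate forms below.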
defn_simd_llvm!("sse2", {
#[intrinsic_for("psrlw")]
#[intel_equivalents("_mm_srl_epi16")]
pub fn shr_all_u16x8(x: u16x8, s: u64x2) -> u16x8
= "llvm.x86.sse2.psrl.w";
#[intrinsic_for("psrld")]
#[intel_equivalents("_mm_srl_epi32")]
pub fn shr_all_u32x4(x: u32x4, s: u64x2) -> u32x4
= "llvm.x86.sse2.psrl.d";
#[intrinsic_for("psrlq")]
#[intel_equivalents("_mm_srl_epi64")]
pub fn shr_all_u64x2(x: u64x2, s: u64x2) -> u64x2
= "llvm.x86.sse2.psrl.q";
#[intrinsic_for("psraw")]
#[intel_equivalents("_mm_sra_epi16")]
pub fn shr_all_s16x8(x: s16x8, s: u64x2) -> s16x8
= "llvm.x86.sse2.psra.w";
#[intrinsic_for("psrad")]
#[intel_equivalents("_mm_sra_epi32")]
pub fn shr_all_s32x4(x: s32x4, s: u64x2) -> s32x4
= "llvm.x86.sse2.psra.d";
});
defn_simd_shared!("sse2", fn(T) -> R { simd_shr_all::<_, BITS>(x) }, {
#[intrinsic_for("psrlw")]
#[intel_equivalents("_mm_srli_epi16")]
pub fn shr_all_by_u16x8<BITS: u8>(x: u16x8) -> u16x8;
#[intrinsic_for("psrld")]
#[intel_equivalents("_mm_srli_epi32")]
pub fn shr_all_by_u32x4<BITS: u8>(x: u32x4) -> u32x4;
#[intrinsic_for("psrlq")]
#[intel_equivalents("_mm_srli_epi64")]
pub fn shr_all_by_u64x2<BITS: u8>(x: u64x2) -> u64x2;
#[intrinsic_for("psraw")]
#[intel_equivalents("_mm_srai_epi16")]
pub fn shr_all_by_s16x8<BITS: u8>(x: s16x8) -> s16x8;
#[intrinsic_for("psrad")]
#[intel_equivalents("_mm_srai_epi32")]
pub fn shr_all_by_s32x4<BITS: u8>(x: s32x4) -> s32x4;
});
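// Narrowing packs: each lane is saturated to the narrower type, with `a`
// filling the low half of the result and `b` the high half.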
defn_simd_llvm!("sse2", {
#[intrinsic_for("packuswb")]
#[intel_equivalents("_mm_packus_epi16")]
pub fn concat_and_saturate_u8_s16x8(a: s16x8, b: s16x8) -> u8x16
= "llvm.x86.sse2.packuswb.128";
#[intrinsic_for("packsswb")]
#[intel_equivalents("_mm_packs_epi16")]
pub fn concat_and_saturate_s8_s16x8(a: s16x8, b: s16x8) -> s8x16
= "llvm.x86.sse2.packsswb.128";
#[intrinsic_for("packssdw")]
#[intel_equivalents("_mm_packs_epi32")]
pub fn concat_and_saturate_s16_s32x4(a: s32x4, b: s32x4) -> s16x8
= "llvm.x86.sse2.packssdw.128";
});
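// Arbitrary 4-lane shuffle with compile-time indices (pshufd imm8). The
// const fn checks that every index is in range and widens to the i32 index
// array simd_shuffle expects; the zero splat is an unused second operand.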
defn_simd_shared!("sse2", fn(T) -> R {
const fn indices(idxs: [u8; T::LEN]) -> [i32; T::LEN] {
let mut indices = [0i32; T::LEN];
let mut index = 0usize;
while index < idxs.len() {
assert!((idxs[index] as usize) < T::LEN);
indices[index] = idxs[index] as i32;
index += 1;
}
indices
}
simd_shuffle(x, T::splat(0), const { indices(Indices::VAL) })
}, {
#[intrinsic_for("pshufd")]
#[intel_equivalents("_mm_shuffle_epi32")]
pub fn shuffle_by_u32x4[Indices: [u8; 4]](x: u32x4) -> u32x4;
#[intrinsic_for("pshufd")]
#[intel_equivalents("_mm_shuffle_epi32")]
pub fn shuffle_by_s32x4[Indices: [u8; 4]](x: s32x4) -> s32x4;
});
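// punpckl*: interleave the low halves, result = [a0, b0, a1, b1, ...];
// simd_unpack_indices presumably builds that index pattern from the given
// starting lane.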
defn_simd_shared!("sse2", fn(T, U) -> R {
simd_shuffle(a, b, const { simd_unpack_indices::<R>(0) })
}, {
#[intrinsic_for("punpcklbw")]
#[intel_equivalents("_mm_unpacklo_epi8")]
pub fn interleave_lo_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("punpcklwd")]
#[intel_equivalents("_mm_unpacklo_epi16")]
pub fn interleave_lo_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("punpckldq")]
#[intel_equivalents("_mm_unpacklo_epi32")]
pub fn interleave_lo_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("punpcklqdq")]
#[intel_equivalents("_mm_unpacklo_epi64")]
pub fn interleave_lo_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("punpcklbw")]
#[intel_equivalents("_mm_unpacklo_epi8")]
pub fn interleave_lo_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("punpcklwd")]
#[intel_equivalents("_mm_unpacklo_epi16")]
pub fn interleave_lo_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("punpckldq")]
#[intel_equivalents("_mm_unpacklo_epi32")]
pub fn interleave_lo_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("punpcklqdq")]
#[intel_equivalents("_mm_unpacklo_epi64")]
pub fn interleave_lo_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
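// punpckh*: the same interleave starting from the upper halves, hence the
// T::LEN / 2 offset.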
defn_simd_shared!("sse2", fn(T, U) -> R {
simd_shuffle(a, b, const { simd_unpack_indices::<R>(T::LEN / 2) })
}, {
#[intrinsic_for("punpckhbw")]
#[intel_equivalents("_mm_unpackhi_epi8")]
pub fn interleave_hi_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("punpckhwd")]
#[intel_equivalents("_mm_unpackhi_epi16")]
pub fn interleave_hi_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("punpckhdq")]
#[intel_equivalents("_mm_unpackhi_epi32")]
pub fn interleave_hi_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("punpckhqdq")]
#[intel_equivalents("_mm_unpackhi_epi64")]
pub fn interleave_hi_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("punpckhbw")]
#[intel_equivalents("_mm_unpackhi_epi8")]
pub fn interleave_hi_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("punpckhwd")]
#[intel_equivalents("_mm_unpackhi_epi16")]
pub fn interleave_hi_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("punpckhdq")]
#[intel_equivalents("_mm_unpackhi_epi32")]
pub fn interleave_hi_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("punpckhqdq")]
#[intel_equivalents("_mm_unpackhi_epi64")]
pub fn interleave_hi_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
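// psadbw: absolute differences of all 16 byte pairs, summed separately over
// each 8-byte half into the two u64 lanes.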
defn_simd_llvm!("sse2", {
#[intrinsic_for("psadbw")]
#[intel_equivalents("_mm_sad_epu8")]
pub fn sum_of_abs_diff_u8x16(a: u8x16, b: u8x16) -> u64x2
= "llvm.x86.sse2.psad.bw";
});
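// pslldq moves lanes toward higher indices, filling with zeros from index 0:
// a shuffle over concat(zeros, x) starting at LEN - ELEMS. saturating_sub
// clamps ELEMS > LEN to an all-zero result, as the instruction does.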
defn_simd_shared!("sse2", fn(T) -> R {
simd_shuffle(T::splat(0), x, const {
simd_slice_indices::<T>(T::LEN.saturating_sub(ELEMS as usize))
})
}, {
#[intrinsic_for("pslldq")]
#[intel_equivalents("_mm_bslli_si128", "_mm_slli_si128")]
pub fn move_l_by_u8x16<ELEMS: u8>(x: u8x16) -> u8x16;
#[intrinsic_for("pslldq")]
#[intel_equivalents("_mm_bslli_si128", "_mm_slli_si128")]
pub fn move_l_by_s8x16<ELEMS: u8>(x: s8x16) -> s8x16;
});
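// psrldq is the mirror image: lanes move toward lower indices with zeros
// filling the top, i.e. concat(x, zeros) sliced from ELEMS, clamped to LEN
// so oversized counts likewise yield all zeros.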
defn_simd_shared!("sse2", fn(T) -> R {
simd_shuffle(x, T::splat(0), const {
simd_slice_indices::<T>((ELEMS as usize).min(T::LEN))
})
}, {
#[intrinsic_for("psrldq")]
#[intel_equivalents("_mm_bsrli_si128", "_mm_srli_si128")]
pub fn move_r_by_u8x16<ELEMS: u8>(x: u8x16) -> u8x16;
#[intrinsic_for("psrldq")]
#[intel_equivalents("_mm_bsrli_si128", "_mm_srli_si128")]
pub fn move_r_by_s8x16<ELEMS: u8>(x: s8x16) -> s8x16;
});
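// pmovmskb gathers the most significant bit of each byte into a 16-bit
// mask; for unsigned lanes the MSB is set iff x >= 0x80, for signed ones
// iff x < 0, hence the two different comparisons.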
defn_simd_manual!("sse2", {
#[intrinsic_for("pmovmskb")]
#[intel_equivalents("_mm_movemask_epi8")]
pub fn bitmask_u8x16(x: u8x16) -> u16 {
simd_bitmask(simd_ge::<_, u8x16>(x, u8x16::splat(0x80)))
}
#[intrinsic_for("pmovmskb")]
#[intel_equivalents("_mm_movemask_epi8")]
pub fn bitmask_s8x16(x: s8x16) -> u16 {
simd_bitmask(simd_lt::<_, s8x16>(x, s8x16::splat(0)))
}
});
}