use core::arch::x86_64::*;
/// Control vector for `_mm_shuffle_epi8` that reverses the byte order inside
/// each of the four 32-bit lanes (a per-lane endianness swap).
#[cfg(feature = "rgb")]
const BSWAP_U32X4_MASK: __m128i = {
    // Entry `i` is the source byte index copied into destination byte `i`.
    const CONTROL: [u8; 16] = [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12];
    // SAFETY: `[u8; 16]` and `__m128i` have the same size (the compiler checks
    // this for `transmute`) and every bit pattern is a valid `__m128i`.
    unsafe { core::mem::transmute::<[u8; 16], __m128i>(CONTROL) }
};
/// Loads 16 unaligned bytes from `ptr` as four 32-bit lanes.
///
/// When `BE` is `true` the bytes of every lane are reversed with
/// `BSWAP_U32X4_MASK`; otherwise the loaded vector is returned untouched.
///
/// # Safety
///
/// `ptr` must be valid for reading 16 bytes (no alignment requirement).
/// The `BE = true` path executes `_mm_shuffle_epi8`, an SSSE3 instruction,
/// so the running CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn x2_load_endian_u32x4<const BE: bool>(ptr: *const u8) -> __m128i {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let raw = _mm_loadu_si128(ptr.cast::<__m128i>());
        if !BE {
            return raw;
        }
        _mm_shuffle_epi8(raw, BSWAP_U32X4_MASK)
    }
}
/// Interleaves 16 planar R, G and B bytes into 48 bytes of packed
/// `R G B` triplets and stores them at `ptr`.
///
/// Byte `i` of `r`, `g` and `b` supplies the channels of output pixel `i`.
///
/// # Safety
///
/// `ptr` must be valid for writing 48 bytes (no alignment requirement).
/// `_mm_shuffle_epi8` is an SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(any(
    feature = "yuv-444-packed",
    feature = "rgb-legacy",
    feature = "mono",
    feature = "rgb",
    feature = "yuv-packed",
    feature = "gbr",
    feature = "yuv-semi-planar",
    feature = "yuv-planar",
    feature = "y2xx",
    feature = "xyz",
))]
#[inline(always)]
pub(super) unsafe fn write_rgb_16(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // A control byte of -1 zeroes the destination byte, so the three
        // per-channel shuffles never overlap and can simply be OR-ed together.
        let weave = |pick_r: __m128i, pick_g: __m128i, pick_b: __m128i| {
            _mm_or_si128(
                _mm_or_si128(_mm_shuffle_epi8(r, pick_r), _mm_shuffle_epi8(g, pick_g)),
                _mm_shuffle_epi8(b, pick_b),
            )
        };

        // Output bytes 0..16: R0 G0 B0 ... G4 B4 R5.
        let first = weave(
            _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5),
            _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1),
            _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1),
        );
        // Output bytes 16..32: G5 B5 R6 ... B9 R10 G10.
        let second = weave(
            _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1),
            _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10),
            _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1),
        );
        // Output bytes 32..48: B10 R11 G11 ... R15 G15 B15.
        let third = weave(
            _mm_setr_epi8(
                -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1,
            ),
            _mm_setr_epi8(
                -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1,
            ),
            _mm_setr_epi8(
                10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15,
            ),
        );

        let dst = ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// Interleaves 16 planar R, G, B and A bytes into 64 bytes of packed
/// `R G B A` quadruplets and stores them at `ptr`.
///
/// Byte `i` of `r`, `g`, `b` and `a` supplies the channels of output pixel `i`.
///
/// The interleave is done with the byte/word unpack ladder, the same approach
/// `write_rgba_u8_8` uses for 8 pixels, so only SSE2 instructions are executed.
///
/// # Safety
///
/// `ptr` must be valid for writing 64 bytes (no alignment requirement).
#[cfg(any(
    feature = "yuv-444-packed",
    feature = "rgb-legacy",
    feature = "mono",
    feature = "rgb",
    feature = "yuv-packed",
    feature = "gbr",
    feature = "yuv-semi-planar",
    feature = "yuv-planar",
    feature = "y2xx",
    feature = "xyz",
))]
#[inline(always)]
pub(super) unsafe fn write_rgba_16(r: __m128i, g: __m128i, b: __m128i, a: __m128i, ptr: *mut u8) {
    // SAFETY: the caller guarantees 64 writable bytes at `ptr`.
    unsafe {
        // Step 1: pair channels byte-wise -> R0 G0 R1 G1 ... / B0 A0 B1 A1 ...
        let rg_lo = _mm_unpacklo_epi8(r, g); // pixels 0..8
        let rg_hi = _mm_unpackhi_epi8(r, g); // pixels 8..16
        let ba_lo = _mm_unpacklo_epi8(b, a);
        let ba_hi = _mm_unpackhi_epi8(b, a);

        // Step 2: pair the 16-bit RG / BA halves -> R G B A per pixel.
        let px_0_3 = _mm_unpacklo_epi16(rg_lo, ba_lo);
        let px_4_7 = _mm_unpackhi_epi16(rg_lo, ba_lo);
        let px_8_11 = _mm_unpacklo_epi16(rg_hi, ba_hi);
        let px_12_15 = _mm_unpackhi_epi16(rg_hi, ba_hi);

        _mm_storeu_si128(ptr.cast(), px_0_3);
        _mm_storeu_si128(ptr.add(16).cast(), px_4_7);
        _mm_storeu_si128(ptr.add(32).cast(), px_8_11);
        _mm_storeu_si128(ptr.add(48).cast(), px_12_15);
    }
}
/// Interleaves 8 planar R, G and B bytes into 24 bytes of packed `R G B`
/// triplets and stores them at `ptr`.
///
/// Only bytes 0..8 of `r`, `g` and `b` are read; byte `i` supplies pixel `i`.
///
/// # Safety
///
/// `ptr` must be valid for writing 24 bytes (no alignment requirement).
/// `_mm_shuffle_epi8` is an SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(any(feature = "std", feature = "alloc"))]
#[cfg(feature = "xyz")]
#[inline(always)]
pub(super) unsafe fn write_rgb_u8_8(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // -1 control bytes produce zero, so the channel shuffles can be OR-ed.
        let weave = |pick_r: __m128i, pick_g: __m128i, pick_b: __m128i| {
            _mm_or_si128(
                _mm_or_si128(_mm_shuffle_epi8(r, pick_r), _mm_shuffle_epi8(g, pick_g)),
                _mm_shuffle_epi8(b, pick_b),
            )
        };

        // Output bytes 0..16: R0 G0 B0 ... G4 B4 R5.
        let head = weave(
            _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5),
            _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1),
            _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1),
        );
        // Output bytes 16..24 live in the low half: G5 B5 R6 G6 B6 R7 G7 B7.
        let tail = weave(
            _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
            _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1),
        );

        _mm_storeu_si128(ptr.cast::<__m128i>(), head);
        // Only the low 64 bits of `tail` are written.
        _mm_storel_epi64(ptr.add(16).cast::<__m128i>(), tail);
    }
}
/// Interleaves 8 planar R, G, B and A bytes into 32 bytes of packed
/// `R G B A` quadruplets and stores them at `ptr`.
///
/// Only bytes 0..8 of each input vector are read; byte `i` supplies pixel `i`.
///
/// # Safety
///
/// `ptr` must be valid for writing 32 bytes (no alignment requirement).
#[cfg(any(feature = "std", feature = "alloc"))]
#[cfg(feature = "xyz")]
#[inline(always)]
pub(super) unsafe fn write_rgba_u8_8(r: __m128i, g: __m128i, b: __m128i, a: __m128i, ptr: *mut u8) {
    // SAFETY: the caller guarantees 32 writable bytes at `ptr`.
    unsafe {
        // R0 G0 R1 G1 ... R7 G7 and B0 A0 B1 A1 ... B7 A7.
        let red_green = _mm_unpacklo_epi8(r, g);
        let blue_alpha = _mm_unpacklo_epi8(b, a);

        let dst = ptr.cast::<__m128i>();
        // Pixels 0..4, then pixels 4..8.
        _mm_storeu_si128(dst, _mm_unpacklo_epi16(red_green, blue_alpha));
        _mm_storeu_si128(dst.add(1), _mm_unpackhi_epi16(red_green, blue_alpha));
    }
}
/// Interleaves 8 planar 16-bit R, G and B samples into 24 packed `u16`
/// values (`R G B` per pixel, 48 bytes) and stores them at `ptr`.
///
/// 16-bit lane `i` of `r`, `g` and `b` supplies the channels of pixel `i`.
///
/// # Safety
///
/// `ptr` must be valid for writing 24 `u16` values / 48 bytes; the stores are
/// unaligned. `_mm_shuffle_epi8` is an SSSE3 instruction, so the CPU must
/// support SSSE3.
#[cfg(any(
    feature = "yuv-444-packed",
    feature = "rgb-legacy",
    feature = "mono",
    feature = "rgb",
    feature = "gbr",
    feature = "yuv-planar",
    feature = "yuv-semi-planar",
    feature = "y2xx",
))]
#[inline(always)]
pub(super) unsafe fn write_rgb_u16_8(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u16) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // Control bytes come in pairs (one 16-bit sample); -1 zeroes the byte
        // so the per-channel shuffles can be OR-ed together.
        let weave = |pick_r: __m128i, pick_g: __m128i, pick_b: __m128i| {
            _mm_or_si128(
                _mm_or_si128(_mm_shuffle_epi8(r, pick_r), _mm_shuffle_epi8(g, pick_g)),
                _mm_shuffle_epi8(b, pick_b),
            )
        };

        // Samples 0..8: R0 G0 B0 R1 G1 B1 R2 G2.
        let first = weave(
            _mm_setr_epi8(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1),
            _mm_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5),
            _mm_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1),
        );
        // Samples 8..16: B2 R3 G3 B3 R4 G4 B4 R5.
        let second = weave(
            _mm_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11),
            _mm_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1),
            _mm_setr_epi8(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1),
        );
        // Samples 16..24: G5 B5 R6 G6 B6 R7 G7 B7.
        let third = weave(
            _mm_setr_epi8(
                -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1,
            ),
            _mm_setr_epi8(
                10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1,
            ),
            _mm_setr_epi8(
                -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15,
            ),
        );

        let dst = ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// Interleaves 8 planar 16-bit R, G, B and A samples into 32 packed `u16`
/// values (`R G B A` per pixel, 64 bytes) and stores them at `ptr`.
///
/// 16-bit lane `i` of each input vector supplies the channels of pixel `i`.
///
/// # Safety
///
/// `ptr` must be valid for writing 32 `u16` values / 64 bytes; the stores are
/// unaligned.
#[cfg(any(
    feature = "yuv-444-packed",
    feature = "rgb-legacy",
    feature = "mono",
    feature = "rgb",
    feature = "gbr",
    feature = "yuv-planar",
    feature = "yuv-semi-planar",
    feature = "y2xx",
))]
#[inline(always)]
pub(super) unsafe fn write_rgba_u16_8(
    r: __m128i,
    g: __m128i,
    b: __m128i,
    a: __m128i,
    ptr: *mut u16,
) {
    // SAFETY: the caller guarantees 64 writable bytes at `ptr`.
    unsafe {
        // Pair the channels sample-wise: R G R G ... and B A B A ...
        let (rg_front, rg_back) = (_mm_unpacklo_epi16(r, g), _mm_unpackhi_epi16(r, g));
        let (ba_front, ba_back) = (_mm_unpacklo_epi16(b, a), _mm_unpackhi_epi16(b, a));

        // Pair the 32-bit RG / BA halves: two complete pixels per vector.
        let dst = ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, _mm_unpacklo_epi32(rg_front, ba_front)); // pixels 0, 1
        _mm_storeu_si128(dst.add(1), _mm_unpackhi_epi32(rg_front, ba_front)); // pixels 2, 3
        _mm_storeu_si128(dst.add(2), _mm_unpacklo_epi32(rg_back, ba_back)); // pixels 4, 5
        _mm_storeu_si128(dst.add(3), _mm_unpackhi_epi32(rg_back, ba_back)); // pixels 6, 7
    }
}
/// Copies 16 three-byte pixels (48 bytes) from `input_ptr` to `output_ptr`,
/// exchanging byte 0 and byte 2 of every pixel (e.g. RGB <-> BGR).
///
/// All three input vectors are loaded before the first store.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 48 bytes and `output_ptr` for
/// writing 48 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn swap_rb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let src = input_ptr.cast::<__m128i>();
        let lo = _mm_loadu_si128(src);
        let mid = _mm_loadu_si128(src.add(1));
        let hi = _mm_loadu_si128(src.add(2));

        // Pixels straddle the 16-byte boundaries, so each output vector also
        // borrows one or two bytes from its neighbours. -1 zeroes a byte.

        // Output bytes 0..16: 15 bytes from `lo`, last byte from `mid`.
        let from_lo = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, -1);
        let from_mid = _mm_setr_epi8(
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,
        );
        let first = _mm_or_si128(_mm_shuffle_epi8(lo, from_lo), _mm_shuffle_epi8(mid, from_mid));

        // Output bytes 16..32: one byte from `lo`, 14 from `mid`, one from `hi`.
        let from_lo = _mm_setr_epi8(
            -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        );
        let from_mid = _mm_setr_epi8(0, -1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, -1, 15);
        let from_hi = _mm_setr_epi8(
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1,
        );
        let second = _mm_or_si128(
            _mm_or_si128(_mm_shuffle_epi8(lo, from_lo), _mm_shuffle_epi8(mid, from_mid)),
            _mm_shuffle_epi8(hi, from_hi),
        );

        // Output bytes 32..48: first byte from `mid`, 15 bytes from `hi`.
        let from_mid = _mm_setr_epi8(
            14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        );
        let from_hi = _mm_setr_epi8(-1, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
        let third = _mm_or_si128(_mm_shuffle_epi8(mid, from_mid), _mm_shuffle_epi8(hi, from_hi));

        let dst = output_ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// Copies 4 four-byte pixels (16 bytes) from `input_ptr` to `output_ptr`,
/// exchanging byte 0 and byte 2 of every pixel and keeping bytes 1 and 3
/// (e.g. RGBA <-> BGRA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn swap_rb_alpha_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        let control = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_shuffle_epi8(pixels, control),
        );
    }
}
/// Converts 16 four-byte pixels (64 bytes) into 16 three-byte pixels
/// (48 bytes) by keeping bytes 0, 1 and 2 of every pixel and discarding
/// byte 3 (e.g. RGBA -> RGB).
///
/// All four input vectors are loaded before the first store.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 48 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn drop_alpha_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let src = input_ptr.cast::<__m128i>();
        let q0 = _mm_loadu_si128(src);
        let q1 = _mm_loadu_si128(src.add(1));
        let q2 = _mm_loadu_si128(src.add(2));
        let q3 = _mm_loadu_si128(src.add(3));

        // Each output vector is stitched from two neighbouring inputs; -1
        // zeroes a byte so the two shuffles can be OR-ed.
        let splice = |front: __m128i, pick_front: __m128i, back: __m128i, pick_back: __m128i| {
            _mm_or_si128(
                _mm_shuffle_epi8(front, pick_front),
                _mm_shuffle_epi8(back, pick_back),
            )
        };

        // 12 bytes from q0 + 4 bytes from q1.
        let first = splice(
            q0,
            _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1),
            q1,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4),
        );
        // 8 bytes from q1 + 8 bytes from q2.
        let second = splice(
            q1,
            _mm_setr_epi8(5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1),
            q2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9),
        );
        // 4 bytes from q2 + 12 bytes from q3.
        let third = splice(
            q2,
            _mm_setr_epi8(
                10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            ),
            q3,
            _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14),
        );

        let dst = output_ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// Converts 16 four-byte pixels (64 bytes) into 16 three-byte pixels
/// (48 bytes) by discarding byte 0 of every pixel and keeping bytes 1, 2
/// and 3 in order (e.g. ARGB -> RGB).
///
/// All four input vectors are loaded before the first store.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 48 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn argb_to_rgb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let src = input_ptr.cast::<__m128i>();
        let q0 = _mm_loadu_si128(src);
        let q1 = _mm_loadu_si128(src.add(1));
        let q2 = _mm_loadu_si128(src.add(2));
        let q3 = _mm_loadu_si128(src.add(3));

        // Each output vector is stitched from two neighbouring inputs; -1
        // zeroes a byte so the two shuffles can be OR-ed.
        let splice = |front: __m128i, pick_front: __m128i, back: __m128i, pick_back: __m128i| {
            _mm_or_si128(
                _mm_shuffle_epi8(front, pick_front),
                _mm_shuffle_epi8(back, pick_back),
            )
        };

        // 12 bytes from q0 + 4 bytes from q1.
        let first = splice(
            q0,
            _mm_setr_epi8(1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, -1, -1, -1, -1),
            q1,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 5),
        );
        // 8 bytes from q1 + 8 bytes from q2.
        let second = splice(
            q1,
            _mm_setr_epi8(6, 7, 9, 10, 11, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1),
            q2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 5, 6, 7, 9, 10),
        );
        // 4 bytes from q2 + 12 bytes from q3.
        let third = splice(
            q2,
            _mm_setr_epi8(
                11, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            ),
            q3,
            _mm_setr_epi8(-1, -1, -1, -1, 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15),
        );

        let dst = output_ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// Converts 16 four-byte pixels (64 bytes) into 16 three-byte pixels
/// (48 bytes) by discarding byte 0 of every pixel and emitting bytes 3, 2, 1
/// in that order (e.g. ABGR -> RGB).
///
/// All four input vectors are loaded before the first store.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 48 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn abgr_to_rgb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let src = input_ptr.cast::<__m128i>();
        let q0 = _mm_loadu_si128(src);
        let q1 = _mm_loadu_si128(src.add(1));
        let q2 = _mm_loadu_si128(src.add(2));
        let q3 = _mm_loadu_si128(src.add(3));

        // Each output vector is stitched from two neighbouring inputs; -1
        // zeroes a byte so the two shuffles can be OR-ed.
        let splice = |front: __m128i, pick_front: __m128i, back: __m128i, pick_back: __m128i| {
            _mm_or_si128(
                _mm_shuffle_epi8(front, pick_front),
                _mm_shuffle_epi8(back, pick_back),
            )
        };

        // 12 bytes from q0 + 4 bytes from q1.
        let first = splice(
            q0,
            _mm_setr_epi8(3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1),
            q1,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 2, 1, 7),
        );
        // 8 bytes from q1 + 8 bytes from q2.
        let second = splice(
            q1,
            _mm_setr_epi8(6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1, -1, -1, -1, -1),
            q2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10),
        );
        // 4 bytes from q2 + 12 bytes from q3.
        let third = splice(
            q2,
            _mm_setr_epi8(
                9, 15, 14, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            ),
            q3,
            _mm_setr_epi8(-1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13),
        );

        let dst = output_ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// Copies 4 four-byte pixels (16 bytes), rotating every pixel so that input
/// bytes `1, 2, 3, 0` become output bytes `0, 1, 2, 3` (e.g. ARGB -> RGBA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn argb_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        let control = _mm_setr_epi8(1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_shuffle_epi8(pixels, control),
        );
    }
}
/// Copies 4 four-byte pixels (16 bytes), reversing the byte order inside
/// every pixel (e.g. ABGR -> RGBA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn abgr_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        let control = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_shuffle_epi8(pixels, control),
        );
    }
}
/// Converts 4 four-byte pixels (16 bytes): input bytes `1, 2, 3` of every
/// pixel become output bytes `0, 1, 2`, input byte 0 is ignored, and output
/// byte 3 is forced to `0xFF` (e.g. XRGB -> opaque RGBA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn xrgb_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        // -1 leaves the fourth byte of each pixel zero, ready for the OR below.
        let control = _mm_setr_epi8(1, 2, 3, -1, 5, 6, 7, -1, 9, 10, 11, -1, 13, 14, 15, -1);
        // 0xFF in memory byte 3 of every 32-bit lane.
        let opaque = _mm_set1_epi32(i32::from_le_bytes([0x00, 0x00, 0x00, 0xFF]));
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_or_si128(_mm_shuffle_epi8(pixels, control), opaque),
        );
    }
}
/// Converts 4 four-byte pixels (16 bytes): bytes `0, 1, 2` of every pixel
/// are copied unchanged, input byte 3 is ignored, and output byte 3 is
/// forced to `0xFF` (e.g. RGBX -> opaque RGBA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn rgbx_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        // -1 leaves the fourth byte of each pixel zero, ready for the OR below.
        let control = _mm_setr_epi8(0, 1, 2, -1, 4, 5, 6, -1, 8, 9, 10, -1, 12, 13, 14, -1);
        // 0xFF in memory byte 3 of every 32-bit lane.
        let opaque = _mm_set1_epi32(i32::from_le_bytes([0x00, 0x00, 0x00, 0xFF]));
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_or_si128(_mm_shuffle_epi8(pixels, control), opaque),
        );
    }
}
/// Converts 4 four-byte pixels (16 bytes): input bytes `3, 2, 1` of every
/// pixel become output bytes `0, 1, 2`, input byte 0 is ignored, and output
/// byte 3 is forced to `0xFF` (e.g. XBGR -> opaque RGBA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn xbgr_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        // -1 leaves the fourth byte of each pixel zero, ready for the OR below.
        let control = _mm_setr_epi8(3, 2, 1, -1, 7, 6, 5, -1, 11, 10, 9, -1, 15, 14, 13, -1);
        // 0xFF in memory byte 3 of every 32-bit lane.
        let opaque = _mm_set1_epi32(i32::from_le_bytes([0x00, 0x00, 0x00, 0xFF]));
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_or_si128(_mm_shuffle_epi8(pixels, control), opaque),
        );
    }
}
/// Converts 4 four-byte pixels (16 bytes): input bytes `2, 1, 0` of every
/// pixel become output bytes `0, 1, 2`, input byte 3 is ignored, and output
/// byte 3 is forced to `0xFF` (e.g. BGRX -> opaque RGBA).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 16 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn bgrx_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let pixels = _mm_loadu_si128(input_ptr.cast::<__m128i>());
        // -1 leaves the fourth byte of each pixel zero, ready for the OR below.
        let control = _mm_setr_epi8(2, 1, 0, -1, 6, 5, 4, -1, 10, 9, 8, -1, 14, 13, 12, -1);
        // 0xFF in memory byte 3 of every 32-bit lane.
        let opaque = _mm_set1_epi32(i32::from_le_bytes([0x00, 0x00, 0x00, 0xFF]));
        _mm_storeu_si128(
            output_ptr.cast::<__m128i>(),
            _mm_or_si128(_mm_shuffle_epi8(pixels, control), opaque),
        );
    }
}
/// Converts 16 four-byte pixels (64 bytes) into 16 three-byte pixels
/// (48 bytes) by discarding byte 3 of every pixel and emitting bytes 2, 1, 0
/// in that order (e.g. BGRA -> RGB).
///
/// All four input vectors are loaded before the first store.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 48 bytes (no alignment requirement). `_mm_shuffle_epi8` is an
/// SSSE3 instruction, so the CPU must support SSSE3.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn bgra_to_rgb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let src = input_ptr.cast::<__m128i>();
        let q0 = _mm_loadu_si128(src);
        let q1 = _mm_loadu_si128(src.add(1));
        let q2 = _mm_loadu_si128(src.add(2));
        let q3 = _mm_loadu_si128(src.add(3));

        // Each output vector is stitched from two neighbouring inputs; -1
        // zeroes a byte so the two shuffles can be OR-ed.
        let splice = |front: __m128i, pick_front: __m128i, back: __m128i, pick_back: __m128i| {
            _mm_or_si128(
                _mm_shuffle_epi8(front, pick_front),
                _mm_shuffle_epi8(back, pick_back),
            )
        };

        // 12 bytes from q0 + 4 bytes from q1.
        let first = splice(
            q0,
            _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1),
            q1,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 1, 0, 6),
        );
        // 8 bytes from q1 + 8 bytes from q2.
        let second = splice(
            q1,
            _mm_setr_epi8(5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1),
            q2,
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9),
        );
        // 4 bytes from q2 + 12 bytes from q3.
        let third = splice(
            q2,
            _mm_setr_epi8(
                8, 14, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            ),
            q3,
            _mm_setr_epi8(-1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12),
        );

        let dst = output_ptr.cast::<__m128i>();
        _mm_storeu_si128(dst, first);
        _mm_storeu_si128(dst.add(1), second);
        _mm_storeu_si128(dst.add(2), third);
    }
}
/// For every 32-bit lane of `pix`, returns `(lane >> SHIFT) & 0xFF` using a
/// logical (zero-filling) right shift.
///
/// # Safety
///
/// Marked `unsafe` like its siblings; the body only uses SSE2 intrinsics.
#[inline(always)]
#[cfg(feature = "rgb")]
unsafe fn extract_10bit_to_u8_lane<const SHIFT: i32>(pix: __m128i) -> __m128i {
    // SAFETY: SSE2 is part of the x86_64 baseline.
    unsafe {
        let shifted = _mm_srli_epi32::<SHIFT>(pix);
        let low_byte = _mm_set1_epi32(0xFF);
        _mm_and_si128(shifted, low_byte)
    }
}
/// For every 32-bit lane of `pix`, returns `(lane >> SHIFT) & 0x3FF` (a
/// 10-bit field) using a logical (zero-filling) right shift.
///
/// # Safety
///
/// Marked `unsafe` like its siblings; the body only uses SSE2 intrinsics.
#[inline(always)]
#[cfg(feature = "rgb")]
unsafe fn extract_10bit_to_u16_lane<const SHIFT: i32>(pix: __m128i) -> __m128i {
    // SAFETY: SSE2 is part of the x86_64 baseline.
    unsafe {
        let shifted = _mm_srli_epi32::<SHIFT>(pix);
        let ten_bits = _mm_set1_epi32(0x3FF);
        _mm_and_si128(shifted, ten_bits)
    }
}
/// Narrows four vectors of 32-bit lanes into one vector of 16 bytes,
/// saturating every lane to `0..=255`.
///
/// Output bytes 0..4 come from `v0`, 4..8 from `v1`, 8..12 from `v2` and
/// 12..16 from `v3`. Lanes are interpreted as signed 32-bit integers:
/// negative values become 0 and values above 255 become 255.
///
/// The 32 -> 16 bit step uses *signed* saturation (`_mm_packs_epi32`) on
/// purpose: `_mm_packus_epi16` reads its input as signed 16-bit, so an
/// unsigned-saturated intermediate of `0x8000..=0xFFFF` would be seen as
/// negative and collapse to 0 instead of 255.
///
/// # Safety
///
/// Marked `unsafe` like its siblings; the body only uses SSE2 intrinsics.
#[inline(always)]
#[cfg(feature = "rgb")]
unsafe fn pack_u32x4_quad_to_u8x16(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
    // SAFETY: SSE2 is part of the x86_64 baseline.
    unsafe {
        let lo = _mm_packs_epi32(v0, v1);
        let hi = _mm_packs_epi32(v2, v3);
        _mm_packus_epi16(lo, hi)
    }
}
/// Converts 16 packed 32-bit pixels (64 bytes) into 16 three-byte `R G B`
/// pixels (48 bytes).
///
/// Each 32-bit word is loaded via `x2_load_endian_u32x4::<BE>` (byte-swapped
/// when `BE` is `true`). R is taken from bits 22..=29, G from bits 12..=19
/// and B from bits 2..=9, i.e. the upper 8 bits of three 10-bit fields.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 48 bytes. The CPU must support whatever the helpers it calls
/// require (`write_rgb_16` executes SSSE3 shuffles).
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2rgb10_to_rgb_16_pixels<const BE: bool>(
    input_ptr: *const u8,
    output_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // Four vectors of four 32-bit pixels each.
        let words = [
            x2_load_endian_u32x4::<BE>(input_ptr),
            x2_load_endian_u32x4::<BE>(input_ptr.add(16)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(32)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(48)),
        ];

        let red = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<22>(words[0]),
            extract_10bit_to_u8_lane::<22>(words[1]),
            extract_10bit_to_u8_lane::<22>(words[2]),
            extract_10bit_to_u8_lane::<22>(words[3]),
        );
        let green = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<12>(words[0]),
            extract_10bit_to_u8_lane::<12>(words[1]),
            extract_10bit_to_u8_lane::<12>(words[2]),
            extract_10bit_to_u8_lane::<12>(words[3]),
        );
        let blue = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<2>(words[0]),
            extract_10bit_to_u8_lane::<2>(words[1]),
            extract_10bit_to_u8_lane::<2>(words[2]),
            extract_10bit_to_u8_lane::<2>(words[3]),
        );

        write_rgb_16(red, green, blue, output_ptr);
    }
}
/// Converts 16 packed 32-bit pixels (64 bytes) into 16 four-byte `R G B A`
/// pixels (64 bytes) with alpha fixed at `0xFF`.
///
/// Each 32-bit word is loaded via `x2_load_endian_u32x4::<BE>` (byte-swapped
/// when `BE` is `true`). R is taken from bits 22..=29, G from bits 12..=19
/// and B from bits 2..=9, i.e. the upper 8 bits of three 10-bit fields.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 64 bytes. The CPU must support whatever the helpers it calls
/// require.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2rgb10_to_rgba_16_pixels<const BE: bool>(
    input_ptr: *const u8,
    output_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // Four vectors of four 32-bit pixels each.
        let words = [
            x2_load_endian_u32x4::<BE>(input_ptr),
            x2_load_endian_u32x4::<BE>(input_ptr.add(16)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(32)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(48)),
        ];

        let red = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<22>(words[0]),
            extract_10bit_to_u8_lane::<22>(words[1]),
            extract_10bit_to_u8_lane::<22>(words[2]),
            extract_10bit_to_u8_lane::<22>(words[3]),
        );
        let green = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<12>(words[0]),
            extract_10bit_to_u8_lane::<12>(words[1]),
            extract_10bit_to_u8_lane::<12>(words[2]),
            extract_10bit_to_u8_lane::<12>(words[3]),
        );
        let blue = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<2>(words[0]),
            extract_10bit_to_u8_lane::<2>(words[1]),
            extract_10bit_to_u8_lane::<2>(words[2]),
            extract_10bit_to_u8_lane::<2>(words[3]),
        );
        // Every alpha byte is 0xFF.
        let opaque = _mm_set1_epi8(-1i8);

        write_rgba_16(red, green, blue, opaque, output_ptr);
    }
}
/// Converts 8 packed 32-bit pixels (32 bytes) into 8 `R G B` pixels of
/// 16-bit samples (48 bytes), keeping the full 10-bit field values.
///
/// Each 32-bit word is loaded via `x2_load_endian_u32x4::<BE>` (byte-swapped
/// when `BE` is `true`). R is bits 20..=29, G is bits 10..=19 and B is bits
/// 0..=9; each sample is stored unscaled (`0..=1023`).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 32 bytes and `output_ptr` for
/// writing 48 bytes; `output_ptr` is reinterpreted as `*mut u16` and written
/// with unaligned stores. The CPU must support whatever the helpers it calls
/// require, plus SSE4.1 for `_mm_packus_epi32`.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2rgb10_to_rgb_u16_8_pixels<const BE: bool>(
    input_ptr: *const u8,
    output_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let front = x2_load_endian_u32x4::<BE>(input_ptr);
        let back = x2_load_endian_u32x4::<BE>(input_ptr.add(16));

        let red_front = extract_10bit_to_u16_lane::<20>(front);
        let red_back = extract_10bit_to_u16_lane::<20>(back);
        let green_front = extract_10bit_to_u16_lane::<10>(front);
        let green_back = extract_10bit_to_u16_lane::<10>(back);
        let blue_front = extract_10bit_to_u16_lane::<0>(front);
        let blue_back = extract_10bit_to_u16_lane::<0>(back);

        // Narrow each channel's eight 32-bit lanes to eight 16-bit lanes.
        write_rgb_u16_8(
            _mm_packus_epi32(red_front, red_back),
            _mm_packus_epi32(green_front, green_back),
            _mm_packus_epi32(blue_front, blue_back),
            output_ptr.cast::<u16>(),
        );
    }
}
/// Converts 16 packed 32-bit pixels (64 bytes) into 16 three-byte `R G B`
/// pixels (48 bytes).
///
/// Each 32-bit word is loaded via `x2_load_endian_u32x4::<BE>` (byte-swapped
/// when `BE` is `true`). R is taken from bits 2..=9, G from bits 12..=19
/// and B from bits 22..=29, i.e. the upper 8 bits of three 10-bit fields.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 48 bytes. The CPU must support whatever the helpers it calls
/// require (`write_rgb_16` executes SSSE3 shuffles).
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2bgr10_to_rgb_16_pixels<const BE: bool>(
    input_ptr: *const u8,
    output_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // Four vectors of four 32-bit pixels each.
        let words = [
            x2_load_endian_u32x4::<BE>(input_ptr),
            x2_load_endian_u32x4::<BE>(input_ptr.add(16)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(32)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(48)),
        ];

        let red = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<2>(words[0]),
            extract_10bit_to_u8_lane::<2>(words[1]),
            extract_10bit_to_u8_lane::<2>(words[2]),
            extract_10bit_to_u8_lane::<2>(words[3]),
        );
        let green = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<12>(words[0]),
            extract_10bit_to_u8_lane::<12>(words[1]),
            extract_10bit_to_u8_lane::<12>(words[2]),
            extract_10bit_to_u8_lane::<12>(words[3]),
        );
        let blue = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<22>(words[0]),
            extract_10bit_to_u8_lane::<22>(words[1]),
            extract_10bit_to_u8_lane::<22>(words[2]),
            extract_10bit_to_u8_lane::<22>(words[3]),
        );

        write_rgb_16(red, green, blue, output_ptr);
    }
}
/// Converts 16 packed 32-bit pixels (64 bytes) into 16 four-byte `R G B A`
/// pixels (64 bytes) with alpha fixed at `0xFF`.
///
/// Each 32-bit word is loaded via `x2_load_endian_u32x4::<BE>` (byte-swapped
/// when `BE` is `true`). R is taken from bits 2..=9, G from bits 12..=19
/// and B from bits 22..=29, i.e. the upper 8 bits of three 10-bit fields.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 64 bytes and `output_ptr` for
/// writing 64 bytes. The CPU must support whatever the helpers it calls
/// require.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2bgr10_to_rgba_16_pixels<const BE: bool>(
    input_ptr: *const u8,
    output_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        // Four vectors of four 32-bit pixels each.
        let words = [
            x2_load_endian_u32x4::<BE>(input_ptr),
            x2_load_endian_u32x4::<BE>(input_ptr.add(16)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(32)),
            x2_load_endian_u32x4::<BE>(input_ptr.add(48)),
        ];

        let red = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<2>(words[0]),
            extract_10bit_to_u8_lane::<2>(words[1]),
            extract_10bit_to_u8_lane::<2>(words[2]),
            extract_10bit_to_u8_lane::<2>(words[3]),
        );
        let green = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<12>(words[0]),
            extract_10bit_to_u8_lane::<12>(words[1]),
            extract_10bit_to_u8_lane::<12>(words[2]),
            extract_10bit_to_u8_lane::<12>(words[3]),
        );
        let blue = pack_u32x4_quad_to_u8x16(
            extract_10bit_to_u8_lane::<22>(words[0]),
            extract_10bit_to_u8_lane::<22>(words[1]),
            extract_10bit_to_u8_lane::<22>(words[2]),
            extract_10bit_to_u8_lane::<22>(words[3]),
        );
        // Every alpha byte is 0xFF.
        let opaque = _mm_set1_epi8(-1i8);

        write_rgba_16(red, green, blue, opaque, output_ptr);
    }
}
/// Converts 8 packed 32-bit pixels (32 bytes) into 8 `R G B` pixels of
/// 16-bit samples (48 bytes), keeping the full 10-bit field values.
///
/// Each 32-bit word is loaded via `x2_load_endian_u32x4::<BE>` (byte-swapped
/// when `BE` is `true`). R is bits 0..=9, G is bits 10..=19 and B is bits
/// 20..=29; each sample is stored unscaled (`0..=1023`).
///
/// # Safety
///
/// `input_ptr` must be valid for reading 32 bytes and `output_ptr` for
/// writing 48 bytes; `output_ptr` is reinterpreted as `*mut u16` and written
/// with unaligned stores. The CPU must support whatever the helpers it calls
/// require, plus SSE4.1 for `_mm_packus_epi32`.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2bgr10_to_rgb_u16_8_pixels<const BE: bool>(
    input_ptr: *const u8,
    output_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let front = x2_load_endian_u32x4::<BE>(input_ptr);
        let back = x2_load_endian_u32x4::<BE>(input_ptr.add(16));

        let red_front = extract_10bit_to_u16_lane::<0>(front);
        let red_back = extract_10bit_to_u16_lane::<0>(back);
        let green_front = extract_10bit_to_u16_lane::<10>(front);
        let green_back = extract_10bit_to_u16_lane::<10>(back);
        let blue_front = extract_10bit_to_u16_lane::<20>(front);
        let blue_back = extract_10bit_to_u16_lane::<20>(back);

        // Narrow each channel's eight 32-bit lanes to eight 16-bit lanes.
        write_rgb_u16_8(
            _mm_packus_epi32(red_front, red_back),
            _mm_packus_epi32(green_front, green_back),
            _mm_packus_epi32(blue_front, blue_back),
            output_ptr.cast::<u16>(),
        );
    }
}
/// Loads 16 packed three-byte pixels (48 bytes) and splits them into three
/// planar vectors.
///
/// Returns `(c0, c1, c2)` where byte `i` of `c0` is byte 0 of pixel `i`,
/// byte `i` of `c1` is byte 1 and byte `i` of `c2` is byte 2 — i.e.
/// `(r, g, b)` for RGB input.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 48 bytes (no alignment
/// requirement). `_mm_shuffle_epi8` is an SSSE3 instruction, so the CPU must
/// support SSSE3.
#[inline(always)]
pub(super) unsafe fn deinterleave_rgb_16(input_ptr: *const u8) -> (__m128i, __m128i, __m128i) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let src = input_ptr.cast::<__m128i>();
        let lo = _mm_loadu_si128(src);
        let mid = _mm_loadu_si128(src.add(1));
        let hi = _mm_loadu_si128(src.add(2));

        // Every channel has samples in all three input vectors. Each control
        // vector picks that channel's bytes from one input and zeroes (-1) the
        // rest, so the three partial results can be OR-ed.
        let gather = |pick_lo: __m128i, pick_mid: __m128i, pick_hi: __m128i| {
            _mm_or_si128(
                _mm_or_si128(_mm_shuffle_epi8(lo, pick_lo), _mm_shuffle_epi8(mid, pick_mid)),
                _mm_shuffle_epi8(hi, pick_hi),
            )
        };

        // Byte 0 of each pixel: source offsets 0, 3, 6, ...
        let red = gather(
            _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13),
        );
        // Byte 1 of each pixel: source offsets 1, 4, 7, ...
        let green = gather(
            _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14),
        );
        // Byte 2 of each pixel: source offsets 2, 5, 8, ...
        let blue = gather(
            _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1),
            _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15),
        );

        (red, green, blue)
    }
}
/// Widens the 16 unsigned bytes of `v` into four vectors of four `f32`
/// values each: bytes 0..4, 4..8, 8..12 and 12..16, in that order.
///
/// NOTE(review): this is a safe `fn` but executes `_mm_cvtepu8_epi32`
/// (SSE4.1); it presumably relies on every caller having already verified
/// SSE4.1 support — confirm at the call sites.
#[inline(always)]
fn u8x16_to_f32x4_quad(v: __m128i) -> (__m128, __m128, __m128, __m128) {
    unsafe {
        // Zero-extend the four lowest bytes to 32-bit integers, then to f32.
        let widen_low4 = |bytes: __m128i| _mm_cvtepi32_ps(_mm_cvtepu8_epi32(bytes));
        (
            widen_low4(v),
            // Byte-shift the next group of four down into the low lanes.
            widen_low4(_mm_srli_si128::<4>(v)),
            widen_low4(_mm_srli_si128::<8>(v)),
            widen_low4(_mm_srli_si128::<12>(v)),
        )
    }
}
/// Converts four vectors of four `f32` values into one vector of 16 bytes.
///
/// Each float is truncated toward zero to a 32-bit integer and then
/// saturated to `0..=255`. Output bytes 0..4 come from `a`, 4..8 from `b`,
/// 8..12 from `c` and 12..16 from `d`.
///
/// The 32 -> 16 bit step uses *signed* saturation (`_mm_packs_epi32`) on
/// purpose: `_mm_packus_epi16` reads its input as signed 16-bit, so an
/// unsigned-saturated intermediate of `0x8000..=0xFFFF` would be seen as
/// negative and collapse to 0 instead of 255.
#[inline(always)]
fn f32x4_quad_to_u8x16(a: __m128, b: __m128, c: __m128, d: __m128) -> __m128i {
    // SAFETY: only SSE2 intrinsics are used, which are part of the x86_64
    // baseline.
    unsafe {
        let ai = _mm_cvttps_epi32(a);
        let bi = _mm_cvttps_epi32(b);
        let ci = _mm_cvttps_epi32(c);
        let di = _mm_cvttps_epi32(d);
        // i32 -> i16 (clamped to -32768..=32767), then i16 -> u8 (0..=255).
        let ab = _mm_packs_epi32(ai, bi);
        let cd = _mm_packs_epi32(ci, di);
        _mm_packus_epi16(ab, cd)
    }
}
/// Converts four RGB pixels (one per `f32` lane) to HSV, still as `f32`.
///
/// Per lane, with `max`/`min` the largest/smallest of `r`, `g`, `b` and
/// `delta = max - min`:
///
/// * V = `max`
/// * S = `255 * delta / max`, or 0 when `max == 0`
/// * H (degrees) = 0 when `delta == 0`; otherwise `60 * (g - b) / delta`
///   (plus 360 if negative) when `max == r`, else
///   `60 * (b - r) / delta + 120` when `max == g`, else
///   `60 * (r - g) / delta + 240`
///
/// The returned `(h, s, v)` already have the rounding bias added and are
/// clamped: `h = clamp(H / 2 + 0.5, 0, 179)`, `s = clamp(S + 0.5, 0, 255)`,
/// `v = clamp(V + 0.5, 0, 255)`. A later truncating conversion therefore
/// yields round-half-up integers.
///
/// Divisions use `_mm_rcp_ps` refined by one Newton-Raphson step, so results
/// are approximate rather than correctly rounded quotients.
///
/// NOTE(review): this is a safe `fn` but executes `_mm_blendv_ps` (SSE4.1);
/// it presumably relies on callers having verified SSE4.1 support — confirm.
#[inline(always)]
fn hsv_group(r: __m128, g: __m128, b: __m128) -> (__m128, __m128, __m128) {
    unsafe {
        let zero = _mm_setzero_ps();
        let half = _mm_set1_ps(0.5);
        let two = _mm_set1_ps(2.0);
        let deg_60 = _mm_set1_ps(60.0);
        let max_sv = _mm_set1_ps(255.0);

        // Approximate 1/x: hardware estimate plus one Newton-Raphson step,
        // est * (2 - x * est). Yields NaN for x == 0; callers mask that out.
        let recip = |x: __m128| {
            let est = _mm_rcp_ps(x);
            _mm_mul_ps(est, _mm_sub_ps(two, _mm_mul_ps(x, est)))
        };
        // (60 * (lhs - rhs)) * inv_delta
        let sector = |lhs: __m128, rhs: __m128, inv_delta: __m128| {
            _mm_mul_ps(_mm_mul_ps(deg_60, _mm_sub_ps(lhs, rhs)), inv_delta)
        };
        // clamp(x + 0.5, 0, upper)
        let bias_clamp = |x: __m128, upper: __m128| {
            _mm_min_ps(_mm_max_ps(_mm_add_ps(x, half), zero), upper)
        };

        let value = _mm_max_ps(_mm_max_ps(r, g), b);
        let lowest = _mm_min_ps(_mm_min_ps(r, g), b);
        let delta = _mm_sub_ps(value, lowest);
        let inv_value = recip(value);
        let inv_delta = recip(delta);

        // Saturation, forced to 0 where value == 0 (hides the NaN lanes).
        let sat_raw = _mm_mul_ps(_mm_mul_ps(max_sv, delta), inv_value);
        let sat = _mm_blendv_ps(sat_raw, zero, _mm_cmpeq_ps(value, zero));

        // Hue candidates for each possible dominant channel.
        let hue_r_raw = sector(g, b, inv_delta);
        let hue_r = _mm_blendv_ps(
            hue_r_raw,
            _mm_add_ps(hue_r_raw, _mm_set1_ps(360.0)),
            _mm_cmplt_ps(hue_r_raw, zero),
        );
        let hue_g = _mm_add_ps(sector(b, r, inv_delta), _mm_set1_ps(120.0));
        let hue_b = _mm_add_ps(sector(r, g, inv_delta), _mm_set1_ps(240.0));

        // Selection priority: red, then green, then blue; grey pixels get 0.
        let hue_g_or_b = _mm_blendv_ps(hue_b, hue_g, _mm_cmpeq_ps(value, g));
        let hue_coloured = _mm_blendv_ps(hue_g_or_b, hue_r, _mm_cmpeq_ps(value, r));
        let hue = _mm_blendv_ps(hue_coloured, zero, _mm_cmpeq_ps(delta, zero));

        (
            bias_clamp(_mm_mul_ps(hue, half), _mm_set1_ps(179.0)),
            bias_clamp(sat, max_sv),
            bias_clamp(value, max_sv),
        )
    }
}
/// Converts 16 packed three-byte RGB pixels (48 bytes) to planar HSV,
/// writing 16 bytes to each of `h_ptr`, `s_ptr` and `v_ptr`.
///
/// The pixels are split into planes with `deinterleave_rgb_16`, processed
/// four at a time by `hsv_group`, and narrowed back to bytes with
/// `f32x4_quad_to_u8x16`.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 48 bytes; `h_ptr`, `s_ptr` and
/// `v_ptr` must each be valid for writing 16 bytes (no alignment
/// requirement). The CPU must support the instruction sets used by the
/// helpers (SSSE3 shuffles and SSE4.1 conversions/blends).
#[inline(always)]
pub(super) unsafe fn rgb_to_hsv_16_pixels(
    input_ptr: *const u8,
    h_ptr: *mut u8,
    s_ptr: *mut u8,
    v_ptr: *mut u8,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let (red_u8, green_u8, blue_u8) = deinterleave_rgb_16(input_ptr);

        // Four groups of four pixels per channel, as f32.
        let (ra, rb, rc, rd) = u8x16_to_f32x4_quad(red_u8);
        let (ga, gb, gc, gd) = u8x16_to_f32x4_quad(green_u8);
        let (ba, bb, bc, bd) = u8x16_to_f32x4_quad(blue_u8);

        let (hue_a, sat_a, val_a) = hsv_group(ra, ga, ba);
        let (hue_b, sat_b, val_b) = hsv_group(rb, gb, bb);
        let (hue_c, sat_c, val_c) = hsv_group(rc, gc, bc);
        let (hue_d, sat_d, val_d) = hsv_group(rd, gd, bd);

        let hue_u8 = f32x4_quad_to_u8x16(hue_a, hue_b, hue_c, hue_d);
        let sat_u8 = f32x4_quad_to_u8x16(sat_a, sat_b, sat_c, sat_d);
        let val_u8 = f32x4_quad_to_u8x16(val_a, val_b, val_c, val_d);

        _mm_storeu_si128(h_ptr.cast::<__m128i>(), hue_u8);
        _mm_storeu_si128(s_ptr.cast::<__m128i>(), sat_u8);
        _mm_storeu_si128(v_ptr.cast::<__m128i>(), val_u8);
    }
}
/// Converts 16 packed three-byte RGB pixels (48 bytes) to 16 luma bytes.
///
/// For every pixel it computes `(r * kr + g * kg + b * kb + rnd) >> 15` with
/// 32-bit lanes (see `q15_luma_quarter`) and saturates the result to
/// `0..=255`. When `full_range` is `false` that byte is additionally passed
/// through `limited_range_scale_16`.
///
/// `kr_v`, `kg_v`, `kb_v` and `rnd_v` hold one 32-bit value per lane.
/// NOTE(review): presumably Q15 coefficients with `rnd_v = 1 << 14` in every
/// lane — confirm against the caller.
///
/// # Safety
///
/// `input_ptr` must be valid for reading 48 bytes and `output_ptr` for
/// writing 16 bytes (no alignment requirement). The CPU must support the
/// instruction sets used by the helpers (SSSE3 shuffles, SSE4.1 widening and
/// multiplies).
#[inline(always)]
#[allow(clippy::too_many_arguments)]
pub(super) unsafe fn rgb_to_luma_16_pixels(
    input_ptr: *const u8,
    output_ptr: *mut u8,
    kr_v: __m128i,
    kg_v: __m128i,
    kb_v: __m128i,
    rnd_v: __m128i,
    full_range: bool,
) {
    // SAFETY: the caller upholds the contract documented above.
    unsafe {
        let (red, green, blue) = deinterleave_rgb_16(input_ptr);

        // Four pixels per call, selected by the byte offset.
        let luma_0_3 = q15_luma_quarter::<0>(red, green, blue, kr_v, kg_v, kb_v, rnd_v);
        let luma_4_7 = q15_luma_quarter::<4>(red, green, blue, kr_v, kg_v, kb_v, rnd_v);
        let luma_8_11 = q15_luma_quarter::<8>(red, green, blue, kr_v, kg_v, kb_v, rnd_v);
        let luma_12_15 = q15_luma_quarter::<12>(red, green, blue, kr_v, kg_v, kb_v, rnd_v);

        // i32 -> i16 (signed saturation) -> u8 (unsigned saturation).
        let full = _mm_packus_epi16(
            _mm_packs_epi32(luma_0_3, luma_4_7),
            _mm_packs_epi32(luma_8_11, luma_12_15),
        );

        let luma = if full_range {
            full
        } else {
            limited_range_scale_16(full, rnd_v)
        };
        _mm_storeu_si128(output_ptr.cast::<__m128i>(), luma);
    }
}
/// Computes `(r * kr + g * kg + b * kb + rnd) >> 15` for four pixels.
///
/// The four pixels are bytes `LANE..LANE + 4` of `r_u8`, `g_u8` and `b_u8`;
/// each byte is zero-extended to 32 bits before the multiply. `kr_v`,
/// `kg_v`, `kb_v` and `rnd_v` supply one 32-bit value per lane. The final
/// shift is arithmetic, and the result is returned as four 32-bit lanes.
///
/// NOTE(review): this is a safe `fn` but executes SSE4.1 intrinsics
/// (`_mm_cvtepu8_epi32`, `_mm_mullo_epi32`); it presumably relies on callers
/// having verified SSE4.1 support — confirm.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn q15_luma_quarter<const LANE: i32>(
    r_u8: __m128i,
    g_u8: __m128i,
    b_u8: __m128i,
    kr_v: __m128i,
    kg_v: __m128i,
    kb_v: __m128i,
    rnd_v: __m128i,
) -> __m128i {
    unsafe {
        // Move bytes LANE..LANE+4 to the bottom, widen to i32 and scale.
        let weighted = |plane: __m128i, coeff: __m128i| {
            let widened = _mm_cvtepu8_epi32(_mm_srli_si128::<LANE>(plane));
            _mm_mullo_epi32(widened, coeff)
        };

        let red_term = weighted(r_u8, kr_v);
        let green_term = weighted(g_u8, kg_v);
        let blue_term = weighted(b_u8, kb_v);

        let sum = _mm_add_epi32(_mm_add_epi32(red_term, green_term), blue_term);
        _mm_srai_epi32::<15>(_mm_add_epi32(sum, rnd_v))
    }
}
/// Maps 16 bytes through `((y * 28142 + rnd) >> 15) + 16`, saturating the
/// result to `0..=255`.
///
/// `28142 / 32768` is about `0.8588`, i.e. roughly `219 / 255`, so together
/// with the `+ 16` offset this compresses `0..=255` into about `16..=235`.
/// `rnd_v` supplies the per-lane 32-bit rounding term added before the shift.
///
/// NOTE(review): this is a safe `fn` but executes `_mm_cvtepu8_epi16`
/// (SSE4.1); it presumably relies on callers having verified SSE4.1 support
/// — confirm.
#[inline(always)]
fn limited_range_scale_16(y_clamp_u8: __m128i, rnd_v: __m128i) -> __m128i {
    unsafe {
        let gain = _mm_set1_epi32(28142);
        let offset = _mm_set1_epi16(16);

        // Widen bytes 0..8 and 8..16 to sixteen-bit lanes.
        let low_half = _mm_cvtepu8_epi16(y_clamp_u8);
        let high_half = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(y_clamp_u8));

        let low_scaled = _mm_add_epi16(limited_range_quarter(low_half, gain, rnd_v), offset);
        let high_scaled = _mm_add_epi16(limited_range_quarter(high_half, gain, rnd_v), offset);

        _mm_packus_epi16(low_scaled, high_scaled)
    }
}
/// Computes `(y * scale + rnd) >> 15` for the eight signed 16-bit lanes of
/// `y_i16`.
///
/// Each lane is sign-extended to 32 bits, multiplied by the matching lane of
/// `scale_i32`, offset by `rnd_v`, arithmetically shifted right by 15 and
/// narrowed back to 16 bits with signed saturation. Lane order is preserved.
///
/// NOTE(review): this is a safe `fn` but executes SSE4.1 intrinsics
/// (`_mm_cvtepi16_epi32`, `_mm_mullo_epi32`); it presumably relies on callers
/// having verified SSE4.1 support — confirm.
#[inline(always)]
fn limited_range_quarter(y_i16: __m128i, scale_i32: __m128i, rnd_v: __m128i) -> __m128i {
    unsafe {
        // Scale four sign-extended lanes taken from the bottom of `lanes`.
        let scale_low4 = |lanes: __m128i| {
            let wide = _mm_cvtepi16_epi32(lanes);
            _mm_srai_epi32::<15>(_mm_add_epi32(_mm_mullo_epi32(wide, scale_i32), rnd_v))
        };

        let front = scale_low4(y_i16);
        let back = scale_low4(_mm_srli_si128::<8>(y_i16));
        _mm_packs_epi32(front, back)
    }
}