use core::arch::x86_64::*;
#[allow(unused_imports)]
#[cfg(feature = "rgb")]
pub(super) use crate::row::arch::x86_common::{
abgr_to_rgb_16_pixels, abgr_to_rgba_4_pixels, argb_to_rgb_16_pixels, argb_to_rgba_4_pixels,
bgra_to_rgb_16_pixels, bgrx_to_rgba_4_pixels, drop_alpha_16_pixels, rgbx_to_rgba_4_pixels,
swap_rb_16_pixels, swap_rb_alpha_4_pixels, x2bgr10_to_rgb_16_pixels, x2bgr10_to_rgb_u16_8_pixels,
x2bgr10_to_rgba_16_pixels, x2rgb10_to_rgb_16_pixels, x2rgb10_to_rgb_u16_8_pixels,
x2rgb10_to_rgba_16_pixels, xbgr_to_rgba_4_pixels, xrgb_to_rgba_4_pixels,
};
#[allow(unused_imports)]
#[cfg(any(
feature = "yuv-444-packed",
feature = "rgb-legacy",
feature = "mono",
feature = "rgb",
feature = "yuv-packed",
feature = "gbr",
feature = "yuv-semi-planar",
feature = "yuv-planar",
feature = "y2xx",
feature = "xyz",
))]
pub(super) use crate::row::arch::x86_common::{write_rgb_16, write_rgba_16};
#[allow(unused_imports)]
#[cfg(any(
feature = "yuv-444-packed",
feature = "rgb-legacy",
feature = "mono",
feature = "rgb",
feature = "gbr",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
pub(super) use crate::row::arch::x86_common::{write_rgb_u16_8, write_rgba_u16_8};
#[allow(unused_imports)]
pub(super) use crate::{
ColorMatrix,
row::{
arch::x86_common::{deinterleave_rgb_16, rgb_to_hsv_16_pixels, rgb_to_luma_16_pixels},
scalar,
},
};
#[cfg(any(feature = "gbr", feature = "yuv-444-packed", feature = "yuva"))]
mod alpha_extract;
#[cfg(feature = "yuv-444-packed")]
mod ayuv64;
pub(crate) mod endian;
#[cfg(feature = "gray")]
mod gray;
mod hsv;
#[cfg(feature = "rgb-legacy")]
pub(crate) mod legacy_rgb;
#[cfg(feature = "mono")]
pub(crate) mod mono1bit;
#[cfg(feature = "rgb")]
mod packed_rgb;
#[cfg(feature = "rgb")]
mod packed_rgb_16bit;
#[cfg(feature = "rgb-float")]
mod packed_rgb_float;
#[cfg(feature = "yuv-packed")]
mod packed_yuv_4_1_1;
#[cfg(feature = "yuv-packed")]
mod packed_yuv_8bit;
#[cfg(feature = "gbr")]
mod planar_gbr;
#[cfg(feature = "gbr")]
mod planar_gbr_float;
#[cfg(feature = "gbr")]
mod planar_gbr_high_bit;
#[cfg(feature = "yuv-semi-planar")]
mod semi_planar_8bit;
#[cfg(all(feature = "yuv-planar", feature = "yuv-semi-planar"))]
mod subsampled_high_bit_pn_4_2_0;
#[cfg(feature = "yuv-semi-planar")]
mod subsampled_high_bit_pn_4_4_4;
#[cfg(feature = "v210")]
mod v210;
#[cfg(feature = "yuv-444-packed")]
mod v30x;
#[cfg(feature = "yuv-444-packed")]
mod v410;
#[cfg(feature = "yuv-444-packed")]
mod vuya;
#[cfg(feature = "yuv-444-packed")]
mod xv36;
#[cfg(all(feature = "xyz", any(feature = "std", feature = "alloc")))]
pub(crate) mod xyz12;
#[cfg(feature = "y2xx")]
mod y216;
#[cfg(feature = "y2xx")]
mod y2xx;
#[cfg(any(
feature = "gray",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "yuva",
))]
mod y_plane_to_luma_u16;
#[cfg(feature = "yuv-planar")]
mod yuv_planar_16bit;
#[cfg(feature = "yuv-planar")]
mod yuv_planar_8bit;
#[cfg(feature = "yuv-planar")]
mod yuv_planar_high_bit;
#[cfg(any(feature = "gbr", feature = "yuv-444-packed", feature = "yuva"))]
pub(crate) use alpha_extract::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use ayuv64::*;
#[cfg(feature = "gray")]
pub(crate) use gray::*;
pub(crate) use hsv::*;
#[cfg(feature = "rgb-legacy")]
#[allow(unused_imports)] pub(crate) use legacy_rgb::*;
#[cfg(feature = "mono")]
pub(crate) use mono1bit::*;
#[cfg(feature = "rgb")]
pub(crate) use packed_rgb::*;
#[cfg(feature = "rgb")]
#[allow(unused_imports)] pub(crate) use packed_rgb_16bit::*;
#[cfg(feature = "rgb-float")]
pub(crate) use packed_rgb_float::*;
#[cfg(feature = "yuv-packed")]
pub(crate) use packed_yuv_4_1_1::*;
#[cfg(feature = "yuv-packed")]
pub(crate) use packed_yuv_8bit::*;
#[cfg(feature = "gbr")]
pub(crate) use planar_gbr::*;
#[cfg(feature = "gbr")]
pub(crate) use planar_gbr_float::*;
#[cfg(feature = "gbr")]
#[allow(unused_imports)] pub(crate) use planar_gbr_high_bit::*;
#[cfg(feature = "yuv-semi-planar")]
pub(crate) use semi_planar_8bit::*;
#[cfg(all(feature = "yuv-planar", feature = "yuv-semi-planar"))]
pub(crate) use subsampled_high_bit_pn_4_2_0::*;
#[cfg(feature = "yuv-semi-planar")]
pub(crate) use subsampled_high_bit_pn_4_4_4::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use v30x::*;
#[cfg(feature = "v210")]
pub(crate) use v210::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use v410::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use vuya::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use xv36::*;
#[cfg(any(
feature = "gray",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "yuva",
))]
pub(crate) use y_plane_to_luma_u16::*;
#[cfg(feature = "y2xx")]
pub(crate) use y2xx::*;
#[cfg(feature = "y2xx")]
pub(crate) use y216::*;
#[cfg(feature = "yuv-planar")]
pub(crate) use yuv_planar_8bit::*;
#[cfg(feature = "yuv-planar")]
pub(crate) use yuv_planar_16bit::*;
#[cfg(feature = "yuv-planar")]
pub(crate) use yuv_planar_high_bit::*;
#[cfg(any(
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "v210",
feature = "yuv-444-packed",
feature = "y2xx",
))]
/// Clamps every 16-bit lane of `v` into the range `[zero_v, max_v]`.
///
/// Each lane is first raised to at least the matching lane of `zero_v` and
/// then lowered to at most the matching lane of `max_v`. Both comparisons are
/// the signed `epi16` variants, so lanes are compared as `i16`.
#[inline(always)]
pub(super) fn clamp_u16_max_x32(v: __m512i, zero_v: __m512i, max_v: __m512i) -> __m512i {
  // SAFETY: both intrinsics work on register values only; they need AVX-512BW
  // at run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let floored = _mm512_max_epi16(v, zero_v);
    _mm512_min_epi16(floored, max_v)
  }
}
#[cfg(any(feature = "yuv-planar", feature = "yuv-semi-planar"))]
#[inline(always)]
pub(super) unsafe fn write_quarter(r: __m512i, g: __m512i, b: __m512i, idx: u8, ptr: *mut u16) {
unsafe {
let (rq, gq, bq) = match idx {
0 => (
_mm512_extracti32x4_epi32::<0>(r),
_mm512_extracti32x4_epi32::<0>(g),
_mm512_extracti32x4_epi32::<0>(b),
),
1 => (
_mm512_extracti32x4_epi32::<1>(r),
_mm512_extracti32x4_epi32::<1>(g),
_mm512_extracti32x4_epi32::<1>(b),
),
2 => (
_mm512_extracti32x4_epi32::<2>(r),
_mm512_extracti32x4_epi32::<2>(g),
_mm512_extracti32x4_epi32::<2>(b),
),
_ => (
_mm512_extracti32x4_epi32::<3>(r),
_mm512_extracti32x4_epi32::<3>(g),
_mm512_extracti32x4_epi32::<3>(b),
),
};
write_rgb_u16_8(rq, gq, bq, ptr);
}
}
#[cfg(any(feature = "yuv-planar", feature = "yuv-semi-planar"))]
#[inline(always)]
pub(super) unsafe fn write_quarter_rgba(
r: __m512i,
g: __m512i,
b: __m512i,
a: __m128i,
idx: u8,
ptr: *mut u16,
) {
unsafe {
let (rq, gq, bq) = match idx {
0 => (
_mm512_extracti32x4_epi32::<0>(r),
_mm512_extracti32x4_epi32::<0>(g),
_mm512_extracti32x4_epi32::<0>(b),
),
1 => (
_mm512_extracti32x4_epi32::<1>(r),
_mm512_extracti32x4_epi32::<1>(g),
_mm512_extracti32x4_epi32::<1>(b),
),
2 => (
_mm512_extracti32x4_epi32::<2>(r),
_mm512_extracti32x4_epi32::<2>(g),
_mm512_extracti32x4_epi32::<2>(b),
),
_ => (
_mm512_extracti32x4_epi32::<3>(r),
_mm512_extracti32x4_epi32::<3>(g),
_mm512_extracti32x4_epi32::<3>(b),
),
};
write_rgba_u16_8(rq, gq, bq, a, ptr);
}
}
#[cfg(feature = "yuv-semi-planar")]
/// Loads 64 consecutive `u16` values from `ptr` and splits them by parity.
///
/// Returns `(even, odd)`: the first vector holds elements `0, 2, 4, …, 62` of
/// the input in order, the second holds elements `1, 3, 5, …, 63`. Going by
/// the function name these are the U and V samples of an interleaved UV row.
///
/// # Safety
///
/// `ptr` must be valid for reading 64 `u16` values (128 bytes). No alignment
/// is required because unaligned loads are used. The shuffles need AVX-512F
/// and AVX-512BW at run time.
#[inline(always)]
pub(super) unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) {
  // SAFETY: the two loads cover `ptr[0..32]` and `ptr[32..64]`, which the
  // caller guarantees are readable; everything after that is register-only.
  unsafe {
    let first = _mm512_loadu_si512(ptr.cast());
    let second = _mm512_loadu_si512(ptr.add(32).cast());

    // Within each 128-bit lane: even u16s to the low 64 bits, odd u16s to the
    // high 64 bits.
    let parity_split = _mm512_broadcast_i32x4(_mm_setr_epi8(
      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
    ));
    let first_split = _mm512_shuffle_epi8(first, parity_split);
    let second_split = _mm512_shuffle_epi8(second, parity_split);

    // Gather the even-parity qwords into the low 256 bits and the odd-parity
    // qwords into the high 256 bits of each register.
    let gather_qwords = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    let first_grouped = _mm512_permutexvar_epi64(gather_qwords, first_split);
    let second_grouped = _mm512_permutexvar_epi64(gather_qwords, second_split);

    // Index values 0..=7 read from `first_grouped`, 8..=15 from
    // `second_grouped`: low halves form the even stream, high halves the odd.
    let take_low_halves = _mm512_setr_epi64(0, 1, 2, 3, 8, 9, 10, 11);
    let take_high_halves = _mm512_setr_epi64(4, 5, 6, 7, 12, 13, 14, 15);
    (
      _mm512_permutex2var_epi64(first_grouped, take_low_halves, second_grouped),
      _mm512_permutex2var_epi64(first_grouped, take_high_halves, second_grouped),
    )
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "v210",
feature = "y2xx",
feature = "yuv-planar",
))]
/// Arithmetic right shift by 15 bits of every 32-bit lane of `v`.
///
/// The shift is sign-preserving (`srai`), so negative lanes round toward
/// negative infinity. Per the name, this is the final step of a Q15
/// fixed-point multiply.
#[inline(always)]
pub(super) fn q15_shift(v: __m512i) -> __m512i {
  // SAFETY: register-only intrinsic; needs AVX-512F at run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  let shifted = unsafe { _mm512_srai_epi32::<15>(v) };
  shifted
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "v210",
feature = "y2xx",
feature = "yuv-planar",
))]
/// Computes `(cu * u + cv * v + rnd) >> 15` for two groups of sixteen 32-bit
/// lanes and packs the results into 32 signed 16-bit lanes.
///
/// * `cu`, `cv` – 32-bit multipliers applied to the U and V deltas.
/// * `u_d_lo`, `v_d_lo` – first group of sixteen 32-bit deltas.
/// * `u_d_hi`, `v_d_hi` – second group of sixteen 32-bit deltas.
/// * `rnd` – value added before the arithmetic shift by 15.
/// * `pack_fixup` – 64-bit permutation indices applied after the pack.
///
/// Products use the low 32 bits only (`mullo`), and the pack saturates each
/// result to the `i16` range. Because the pack interleaves its two inputs per
/// 128-bit lane, `pack_fixup` decides the final element order; with indices
/// `0, 2, 4, 6, 1, 3, 5, 7` the "lo" results occupy lanes 0..16 and the "hi"
/// results lanes 16..32.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
pub(super) fn chroma_i16x32(
  cu: __m512i,
  cv: __m512i,
  u_d_lo: __m512i,
  v_d_lo: __m512i,
  u_d_hi: __m512i,
  v_d_hi: __m512i,
  rnd: __m512i,
  pack_fixup: __m512i,
) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F and AVX-512BW at
  // run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let sum_lo = _mm512_add_epi32(
      _mm512_mullo_epi32(cu, u_d_lo),
      _mm512_mullo_epi32(cv, v_d_lo),
    );
    let sum_hi = _mm512_add_epi32(
      _mm512_mullo_epi32(cu, u_d_hi),
      _mm512_mullo_epi32(cv, v_d_hi),
    );
    let q_lo = _mm512_srai_epi32::<15>(_mm512_add_epi32(sum_lo, rnd));
    let q_hi = _mm512_srai_epi32::<15>(_mm512_add_epi32(sum_hi, rnd));
    let packed = _mm512_packs_epi32(q_lo, q_hi);
    _mm512_permutexvar_epi64(pack_fixup, packed)
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "v210",
feature = "y2xx",
feature = "yuv-planar",
))]
/// Computes `((y - y_off) * y_scale + rnd) >> 15` for 32 signed 16-bit lanes.
///
/// * `y_i16` – 32 input lanes, treated as `i16`.
/// * `y_off_v` – 16-bit offsets subtracted lane-wise (wrapping `epi16`
///   subtraction).
/// * `y_scale_v` – 32-bit multipliers, one per widened lane.
/// * `rnd` – 32-bit value added before the arithmetic shift by 15.
/// * `pack_fixup` – 64-bit permutation indices applied after the pack.
///
/// The difference is sign-extended to 32 bits, multiplied (low 32 bits kept),
/// shifted, and packed back to `i16` with signed saturation. With
/// `pack_fixup = 0, 2, 4, 6, 1, 3, 5, 7` the output lanes are in the same
/// order as the input lanes.
#[inline(always)]
pub(super) fn scale_y(
  y_i16: __m512i,
  y_off_v: __m512i,
  y_scale_v: __m512i,
  rnd: __m512i,
  pack_fixup: __m512i,
) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F and AVX-512BW at
  // run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let centered = _mm512_sub_epi16(y_i16, y_off_v);

    // Widen the lower and upper 16 lanes to i32 separately.
    let wide_lo = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(centered));
    let wide_hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(centered));

    let prod_lo = _mm512_mullo_epi32(wide_lo, y_scale_v);
    let prod_hi = _mm512_mullo_epi32(wide_hi, y_scale_v);
    let q_lo = _mm512_srai_epi32::<15>(_mm512_add_epi32(prod_lo, rnd));
    let q_hi = _mm512_srai_epi32::<15>(_mm512_add_epi32(prod_hi, rnd));

    let packed = _mm512_packs_epi32(q_lo, q_hi);
    _mm512_permutexvar_epi64(pack_fixup, packed)
  }
}
#[cfg(any(
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "v210",
feature = "y2xx",
feature = "yuv-planar",
))]
/// Duplicates every 16-bit lane of `chroma` into an adjacent pair and returns
/// two vectors assembled from the duplicated data.
///
/// `unpacklo`/`unpackhi` of `chroma` with itself double each lane, but work
/// independently inside every 128-bit lane. `dup_lo_idx` and `dup_hi_idx` are
/// 64-bit two-source permutation indices (values `0..=7` read the `unpacklo`
/// result, `8..=15` the `unpackhi` result) that choose which doubled qwords
/// end up in the first and second returned vector respectively.
///
/// For example, `dup_lo_idx = 0, 1, 8, 9, 2, 3, 10, 11` yields lanes 0..16 of
/// `chroma`, each repeated twice, in order.
#[inline(always)]
pub(super) fn chroma_dup(
  chroma: __m512i,
  dup_lo_idx: __m512i,
  dup_hi_idx: __m512i,
) -> (__m512i, __m512i) {
  // SAFETY: register-only intrinsics; they need AVX-512F and AVX-512BW at
  // run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let doubled_low_halves = _mm512_unpacklo_epi16(chroma, chroma);
    let doubled_high_halves = _mm512_unpackhi_epi16(chroma, chroma);
    (
      _mm512_permutex2var_epi64(doubled_low_halves, dup_lo_idx, doubled_high_halves),
      _mm512_permutex2var_epi64(doubled_low_halves, dup_hi_idx, doubled_high_halves),
    )
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "v210",
feature = "y2xx",
feature = "yuv-planar",
))]
/// Packs two vectors of 32 signed 16-bit lanes into one vector of 64 unsigned
/// bytes, saturating each value to `0..=255`.
///
/// The pack instruction interleaves `lo` and `hi` per 128-bit lane, so the
/// result is then permuted in 64-bit units by `pack_fixup`. With
/// `pack_fixup = 0, 2, 4, 6, 1, 3, 5, 7` bytes 0..32 come from `lo` and bytes
/// 32..64 from `hi`, both in their original order.
#[inline(always)]
pub(super) fn narrow_u8x64(lo: __m512i, hi: __m512i, pack_fixup: __m512i) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F and AVX-512BW at
  // run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let lane_interleaved = _mm512_packus_epi16(lo, hi);
    _mm512_permutexvar_epi64(pack_fixup, lane_interleaved)
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "y2xx",
feature = "yuv-planar",
))]
#[inline(always)]
pub(super) unsafe fn write_rgb_64(r: __m512i, g: __m512i, b: __m512i, ptr: *mut u8) {
unsafe {
let r0: __m128i = _mm512_castsi512_si128(r);
let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r);
let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r);
let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r);
let g0: __m128i = _mm512_castsi512_si128(g);
let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g);
let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g);
let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g);
let b0: __m128i = _mm512_castsi512_si128(b);
let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b);
let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b);
let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b);
write_rgb_16(r0, g0, b0, ptr);
write_rgb_16(r1, g1, b1, ptr.add(48));
write_rgb_16(r2, g2, b2, ptr.add(96));
write_rgb_16(r3, g3, b3, ptr.add(144));
}
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-packed",
feature = "yuv-semi-planar",
feature = "y2xx",
feature = "yuv-planar",
))]
#[inline(always)]
pub(super) unsafe fn write_rgba_64(r: __m512i, g: __m512i, b: __m512i, a: __m512i, ptr: *mut u8) {
unsafe {
let r0: __m128i = _mm512_castsi512_si128(r);
let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r);
let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r);
let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r);
let g0: __m128i = _mm512_castsi512_si128(g);
let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g);
let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g);
let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g);
let b0: __m128i = _mm512_castsi512_si128(b);
let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b);
let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b);
let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b);
let a0: __m128i = _mm512_castsi512_si128(a);
let a1: __m128i = _mm512_extracti32x4_epi32::<1>(a);
let a2: __m128i = _mm512_extracti32x4_epi32::<2>(a);
let a3: __m128i = _mm512_extracti32x4_epi32::<3>(a);
write_rgba_16(r0, g0, b0, a0, ptr);
write_rgba_16(r1, g1, b1, a1, ptr.add(64));
write_rgba_16(r2, g2, b2, a2, ptr.add(128));
write_rgba_16(r3, g3, b3, a3, ptr.add(192));
}
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
/// Computes `(cu * u + cv * v + rnd) >> 15` in 64-bit precision for eight
/// lanes.
///
/// Every multiplication is a signed 32×32→64 product of the *low* 32 bits of
/// each 64-bit element (`_mm512_mul_epi32`); the upper 32 bits of the inputs
/// are ignored. The two products and `rnd_i64` are summed as `i64`, then
/// shifted right arithmetically by 15.
///
/// * `cu`, `cv` – multipliers, read from the low half of each 64-bit element.
/// * `u_d_even`, `v_d_even` – deltas, read from the low half of each 64-bit
///   element (i.e. the even-indexed 32-bit lanes).
/// * `rnd_i64` – 64-bit value added before the shift.
#[inline(always)]
pub(super) fn chroma_i64x8_avx512(
  cu: __m512i,
  cv: __m512i,
  u_d_even: __m512i,
  v_d_even: __m512i,
  rnd_i64: __m512i,
) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F at run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let u_term = _mm512_mul_epi32(cu, u_d_even);
    let v_term = _mm512_mul_epi32(cv, v_d_even);
    let biased = _mm512_add_epi64(_mm512_add_epi64(u_term, v_term), rnd_i64);
    _mm512_srai_epi64::<15>(biased)
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
/// Truncates two vectors of eight `i64` lanes to `i32` and merges them into
/// one vector of sixteen `i32` lanes.
///
/// `interleave_idx` holds 32-bit two-source permutation indices: values
/// `0..=7` select from the narrowed `even_i64`, values `16..=23` select from
/// the narrowed `odd_i64`. With `0, 16, 1, 17, …, 7, 23` the output
/// alternates even/odd results.
///
/// Only the low 256 bits of each permutation source are defined (the narrowed
/// vectors are widened with a cast that leaves the upper half unspecified),
/// so `interleave_idx` should not reference elements `8..=15` or `24..=31`.
#[inline(always)]
pub(super) fn reassemble_i32x16(
  even_i64: __m512i,
  odd_i64: __m512i,
  interleave_idx: __m512i,
) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F at run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    let even_narrow = _mm512_castsi256_si512(_mm512_cvtepi64_epi32(even_i64));
    let odd_narrow = _mm512_castsi256_si512(_mm512_cvtepi64_epi32(odd_i64));
    _mm512_permutex2var_epi32(even_narrow, interleave_idx, odd_narrow)
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
/// Computes `(y_minus_off * y_scale + rnd) >> 15` for sixteen 32-bit lanes
/// using 64-bit intermediate products.
///
/// `_mm512_mul_epi32` multiplies only the low 32 bits of every 64-bit
/// element, i.e. the even-indexed 32-bit lanes. The odd-indexed lanes are
/// therefore first copied into even positions with a 32-bit shuffle and
/// multiplied in a second pass. Both passes add `rnd_i64`, shift right
/// arithmetically by 15, and are merged by `reassemble_i32x16` using
/// `interleave_idx`.
///
/// * `y_minus_off` – sixteen 32-bit inputs.
/// * `y_scale_v` – multiplier; only the low 32 bits of each 64-bit element
///   are read.
/// * `rnd_i64` – 64-bit value added before the shift.
/// * `interleave_idx` – forwarded to `reassemble_i32x16`.
#[inline(always)]
pub(super) fn scale_y_i32x16_i64(
  y_minus_off: __m512i,
  y_scale_v: __m512i,
  rnd_i64: __m512i,
  interleave_idx: __m512i,
) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F at run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    // 0xF5 selects elements [1, 1, 3, 3] inside each 128-bit lane, which
    // places every odd-indexed 32-bit lane in the low half of a qword.
    let odd_in_low_halves = _mm512_shuffle_epi32::<0xF5>(y_minus_off);

    let even_prod = _mm512_mul_epi32(y_scale_v, y_minus_off);
    let odd_prod = _mm512_mul_epi32(y_scale_v, odd_in_low_halves);

    let even = _mm512_srai_epi64::<15>(_mm512_add_epi64(even_prod, rnd_i64));
    let odd = _mm512_srai_epi64::<15>(_mm512_add_epi64(odd_prod, rnd_i64));

    reassemble_i32x16(even, odd, interleave_idx)
  }
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "rgb",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
#[inline(always)]
pub(super) unsafe fn write_rgb_u16_32(r: __m512i, g: __m512i, b: __m512i, ptr: *mut u16) {
unsafe {
let r0: __m128i = _mm512_castsi512_si128(r);
let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r);
let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r);
let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r);
let g0: __m128i = _mm512_castsi512_si128(g);
let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g);
let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g);
let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g);
let b0: __m128i = _mm512_castsi512_si128(b);
let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b);
let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b);
let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b);
write_rgb_u16_8(r0, g0, b0, ptr);
write_rgb_u16_8(r1, g1, b1, ptr.add(24));
write_rgb_u16_8(r2, g2, b2, ptr.add(48));
write_rgb_u16_8(r3, g3, b3, ptr.add(72));
}
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
#[inline(always)]
pub(super) unsafe fn write_rgba_u16_32(
r: __m512i,
g: __m512i,
b: __m512i,
a: __m128i,
ptr: *mut u16,
) {
unsafe {
let r0: __m128i = _mm512_castsi512_si128(r);
let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r);
let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r);
let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r);
let g0: __m128i = _mm512_castsi512_si128(g);
let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g);
let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g);
let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g);
let b0: __m128i = _mm512_castsi512_si128(b);
let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b);
let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b);
let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b);
write_rgba_u16_8(r0, g0, b0, a, ptr);
write_rgba_u16_8(r1, g1, b1, a, ptr.add(32));
write_rgba_u16_8(r2, g2, b2, a, ptr.add(64));
write_rgba_u16_8(r3, g3, b3, a, ptr.add(96));
}
}
#[cfg(any(
feature = "yuv-444-packed",
feature = "yuv-planar",
feature = "yuv-semi-planar",
feature = "y2xx",
))]
/// Computes `((y - y_off) * y_scale + rnd) >> 15` for 32 unsigned 16-bit
/// lanes and packs the results into 32 signed 16-bit lanes.
///
/// * `y_u16x32` – 32 input lanes, zero-extended to 32 bits before use.
/// * `y_off_v` – 32-bit offsets subtracted after widening.
/// * `y_scale_v` – 32-bit multipliers.
/// * `rnd` – 32-bit value added before the arithmetic shift by 15.
/// * `pack_fixup` – 64-bit permutation indices applied after the pack.
///
/// Products keep the low 32 bits only (`mullo`). The pack saturates to the
/// `i16` range, so results above `32767` clamp to `32767`. With
/// `pack_fixup = 0, 2, 4, 6, 1, 3, 5, 7` the output lanes are in the same
/// order as the input lanes.
#[inline(always)]
pub(super) fn scale_y_u16_avx512(
  y_u16x32: __m512i,
  y_off_v: __m512i,
  y_scale_v: __m512i,
  rnd: __m512i,
  pack_fixup: __m512i,
) -> __m512i {
  // SAFETY: register-only intrinsics; they need AVX-512F and AVX-512BW at
  // run time.
  // NOTE(review): this function is neither `unsafe` nor `#[target_feature]`,
  // so CPU support is an unchecked convention of the callers — confirm.
  unsafe {
    // Zero-extend the lower and upper 16 lanes to i32 separately.
    let wide_lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(y_u16x32));
    let wide_hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64::<1>(y_u16x32));

    let centered_lo = _mm512_sub_epi32(wide_lo, y_off_v);
    let centered_hi = _mm512_sub_epi32(wide_hi, y_off_v);

    let prod_lo = _mm512_mullo_epi32(centered_lo, y_scale_v);
    let prod_hi = _mm512_mullo_epi32(centered_hi, y_scale_v);

    let q_lo = _mm512_srai_epi32::<15>(_mm512_add_epi32(prod_lo, rnd));
    let q_hi = _mm512_srai_epi32::<15>(_mm512_add_epi32(prod_hi, rnd));

    let packed = _mm512_packs_epi32(q_lo, q_hi);
    _mm512_permutexvar_epi64(pack_fixup, packed)
  }
}
#[cfg(all(test, feature = "std"))]
mod tests;