use core::arch::x86_64::*;
use super::endian::load_endian_u32x16;
use super::scalar;
use crate::row::arch::x86_avx2::endian::load_endian_u16x16;
/// `true` when the crate is compiled for a big-endian target
/// (`cfg!(target_endian = "big")`), `false` otherwise.
///
/// Passed as the `BE` parameter when a buffer is already in host byte order,
/// and compared against `BE` to decide whether a plain copy is sufficient.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Loads 16 `f32` values starting at `ptr` into one 512-bit vector.
///
/// The memory is fetched as sixteen 32-bit words through
/// `load_endian_u32x16::<BE>` and the integer vector is then reinterpreted
/// bit-for-bit as packed `f32`.
///
/// # Safety
///
/// The CPU must support `avx512f`.
/// NOTE(review): presumably `ptr` must be readable for 16 `f32` values
/// (64 bytes) — confirm against the contract of `load_endian_u32x16`.
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn load_f32x16<const BE: bool>(ptr: *const f32) -> __m512 {
    let bytes = ptr.cast::<u8>();
    // SAFETY: the pointer is forwarded unchanged; its validity is the
    // caller's obligation.
    unsafe { _mm512_castsi512_ps(load_endian_u32x16::<BE>(bytes)) }
}
/// Clamps, scales and converts 16 `f32` lanes to 32-bit integers.
///
/// Each lane of `v` is first raised to at least the matching lane of `zero`,
/// then lowered to at most the matching lane of `one`, multiplied by `scale`
/// and finally converted to a 32-bit integer using the round-to-nearest mode
/// with floating-point exceptions suppressed.
///
/// # Safety
///
/// The function carries no `target_feature` attribute itself but executes
/// AVX-512F instructions, so it must only be reached on a CPU that supports
/// `avx512f`.
#[inline(always)]
unsafe fn clamp_scale_to_u32_512(v: __m512, zero: __m512, one: __m512, scale: __m512) -> __m512i {
    const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    // SAFETY: only register-to-register intrinsics are used; CPU support is
    // the caller's obligation.
    unsafe {
        let floored = _mm512_max_ps(v, zero);
        let unit = _mm512_min_ps(floored, one);
        _mm512_cvt_roundps_epi32::<ROUNDING>(_mm512_mul_ps(unit, scale))
    }
}
/// Converts a row of packed RGB `f32` samples to packed 8-bit RGB.
///
/// Every sample is clamped to `[0.0, 1.0]`, multiplied by 255, rounded with
/// the round-to-nearest mode and narrowed to `u8` with unsigned saturation.
/// The vector loop consumes 48 samples (16 pixels) per iteration; any pixels
/// left over are handed to `scalar::rgbf32_to_rgb_row`.
///
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512bw`. `rgb_in` and `rgb_out` must
/// each hold at least `width * 3` elements: this is only verified by
/// `debug_assert!`, and the vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn rgbf32_to_rgb_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");

    let total_lanes = width * 3;
    // Number of complete 48-sample (16-pixel) groups handled by the vector loop.
    let full_blocks = total_lanes / 48;

    // SAFETY: every offset used below is smaller than `full_blocks * 48`,
    // which does not exceed `width * 3`; the caller guarantees both slices
    // are at least that long.
    unsafe {
        let lo = _mm512_setzero_ps();
        let hi = _mm512_set1_ps(1.0);
        let gain = _mm512_set1_ps(255.0);
        let src = rgb_in.as_ptr();
        let dst = rgb_out.as_mut_ptr();

        for block in 0..full_blocks {
            let base = block * 48;

            let q0 = _mm512_cvtusepi32_epi8(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(base)),
                lo,
                hi,
                gain,
            ));
            let q1 = _mm512_cvtusepi32_epi8(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(base + 16)),
                lo,
                hi,
                gain,
            ));
            let q2 = _mm512_cvtusepi32_epi8(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(base + 32)),
                lo,
                hi,
                gain,
            ));

            _mm_storeu_si128(dst.add(base).cast::<__m128i>(), q0);
            _mm_storeu_si128(dst.add(base + 16).cast::<__m128i>(), q1);
            _mm_storeu_si128(dst.add(base + 32).cast::<__m128i>(), q2);
        }

        // Pixels that did not fill a whole 16-pixel group go to the scalar path.
        let pixels_done = full_blocks * 16;
        if pixels_done < width {
            scalar::rgbf32_to_rgb_row::<BE>(
                &rgb_in[pixels_done * 3..width * 3],
                &mut rgb_out[pixels_done * 3..width * 3],
                width - pixels_done,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples to packed 8-bit RGBA with an
/// alpha byte of `0xFF`.
///
/// Samples are clamped to `[0.0, 1.0]`, multiplied by 255, rounded with the
/// round-to-nearest mode and narrowed to `u8` with unsigned saturation. Each
/// iteration converts 48 samples (16 pixels) into a staging buffer and then
/// expands every RGB triple into an RGBA quadruple. Remaining pixels are
/// handled by `scalar::rgbf32_to_rgba_row`.
///
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512bw`. `rgb_in` must hold at least
/// `width * 3` elements and `rgba_out` at least `width * 4`: this is only
/// verified by `debug_assert!`, while the vector loop uses raw pointers and
/// unchecked slicing.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn rgbf32_to_rgba_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");

    let total_lanes = width * 3;
    // Number of complete 48-sample (16-pixel) groups handled by the vector loop.
    let full_blocks = total_lanes / 48;

    // SAFETY: input offsets stay below `full_blocks * 48 <= width * 3` and
    // output offsets below `full_blocks * 64 <= width * 4`; the caller
    // guarantees the slices are at least that long.
    unsafe {
        let lo = _mm512_setzero_ps();
        let hi = _mm512_set1_ps(1.0);
        let gain = _mm512_set1_ps(255.0);
        let src = rgb_in.as_ptr();
        // Staging area for 16 packed RGB pixels; fully rewritten every iteration.
        let mut packed = [0u8; 48];

        for block in 0..full_blocks {
            let lane = block * 48;
            let first_px = block * 16;

            let q0 = _mm512_cvtusepi32_epi8(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(lane)),
                lo,
                hi,
                gain,
            ));
            let q1 = _mm512_cvtusepi32_epi8(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(lane + 16)),
                lo,
                hi,
                gain,
            ));
            let q2 = _mm512_cvtusepi32_epi8(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(lane + 32)),
                lo,
                hi,
                gain,
            ));

            let stage = packed.as_mut_ptr();
            _mm_storeu_si128(stage.cast::<__m128i>(), q0);
            _mm_storeu_si128(stage.add(16).cast::<__m128i>(), q1);
            _mm_storeu_si128(stage.add(32).cast::<__m128i>(), q2);

            // Expand each RGB triple into an RGBA quadruple with opaque alpha.
            let quads = rgba_out.get_unchecked_mut(first_px * 4..first_px * 4 + 64);
            for (rgba, rgb) in quads.chunks_exact_mut(4).zip(packed.chunks_exact(3)) {
                rgba[..3].copy_from_slice(rgb);
                rgba[3] = 0xFF;
            }
        }

        let pix = full_blocks * 16;
        if pix < width {
            scalar::rgbf32_to_rgba_row::<BE>(
                &rgb_in[pix * 3..width * 3],
                &mut rgba_out[pix * 4..width * 4],
                width - pix,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples to packed 16-bit RGB.
///
/// Every sample is clamped to `[0.0, 1.0]`, multiplied by 65535, rounded with
/// the round-to-nearest mode and narrowed to `u16` with unsigned saturation.
/// The vector loop consumes 48 samples (16 pixels) per iteration; any pixels
/// left over are handed to `scalar::rgbf32_to_rgb_u16_row`.
///
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512bw`. `rgb_in` and `rgb_out` must
/// each hold at least `width * 3` elements: this is only verified by
/// `debug_assert!`, and the vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn rgbf32_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");

    let total_lanes = width * 3;

    // SAFETY: both cursors advance in lock-step with `lane`, and a group is
    // only processed while at least 48 samples remain below `width * 3`; the
    // caller guarantees both slices are at least that long.
    unsafe {
        let lo = _mm512_setzero_ps();
        let hi = _mm512_set1_ps(1.0);
        let gain = _mm512_set1_ps(65535.0);

        let mut src = rgb_in.as_ptr();
        let mut dst = rgb_out.as_mut_ptr();
        let mut lane = 0usize;

        while total_lanes - lane >= 48 {
            let h0 = _mm512_cvtusepi32_epi16(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src),
                lo,
                hi,
                gain,
            ));
            let h1 = _mm512_cvtusepi32_epi16(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(16)),
                lo,
                hi,
                gain,
            ));
            let h2 = _mm512_cvtusepi32_epi16(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(32)),
                lo,
                hi,
                gain,
            ));

            _mm256_storeu_si256(dst.cast::<__m256i>(), h0);
            _mm256_storeu_si256(dst.add(16).cast::<__m256i>(), h1);
            _mm256_storeu_si256(dst.add(32).cast::<__m256i>(), h2);

            src = src.add(48);
            dst = dst.add(48);
            lane += 48;
        }

        // Pixels that did not fill a whole 16-pixel group go to the scalar path.
        let pixels_done = lane / 3;
        if pixels_done < width {
            scalar::rgbf32_to_rgb_u16_row::<BE>(
                &rgb_in[pixels_done * 3..width * 3],
                &mut rgb_out[pixels_done * 3..width * 3],
                width - pixels_done,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples to packed 16-bit RGBA with an
/// alpha value of `0xFFFF`.
///
/// Samples are clamped to `[0.0, 1.0]`, multiplied by 65535, rounded with the
/// round-to-nearest mode and narrowed to `u16` with unsigned saturation. Each
/// iteration converts 48 samples (16 pixels) into a staging buffer and then
/// expands every RGB triple into an RGBA quadruple. Remaining pixels are
/// handled by `scalar::rgbf32_to_rgba_u16_row`.
///
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512bw`. `rgb_in` must hold at least
/// `width * 3` elements and `rgba_out` at least `width * 4`: this is only
/// verified by `debug_assert!`, while the vector loop uses raw pointers and
/// unchecked slicing.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn rgbf32_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");

    let total_lanes = width * 3;
    // Number of complete 48-sample (16-pixel) groups handled by the vector loop.
    let full_blocks = total_lanes / 48;

    // SAFETY: input offsets stay below `full_blocks * 48 <= width * 3` and
    // output offsets below `full_blocks * 64 <= width * 4`; the caller
    // guarantees the slices are at least that long.
    unsafe {
        let lo = _mm512_setzero_ps();
        let hi = _mm512_set1_ps(1.0);
        let gain = _mm512_set1_ps(65535.0);
        let src = rgb_in.as_ptr();
        // Staging area for 16 packed RGB pixels; fully rewritten every iteration.
        let mut packed = [0u16; 48];

        for block in 0..full_blocks {
            let lane = block * 48;
            let first_px = block * 16;

            let h0 = _mm512_cvtusepi32_epi16(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(lane)),
                lo,
                hi,
                gain,
            ));
            let h1 = _mm512_cvtusepi32_epi16(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(lane + 16)),
                lo,
                hi,
                gain,
            ));
            let h2 = _mm512_cvtusepi32_epi16(clamp_scale_to_u32_512(
                load_f32x16::<BE>(src.add(lane + 32)),
                lo,
                hi,
                gain,
            ));

            let stage = packed.as_mut_ptr();
            _mm256_storeu_si256(stage.cast::<__m256i>(), h0);
            _mm256_storeu_si256(stage.add(16).cast::<__m256i>(), h1);
            _mm256_storeu_si256(stage.add(32).cast::<__m256i>(), h2);

            // Expand each RGB triple into an RGBA quadruple with opaque alpha.
            let quads = rgba_out.get_unchecked_mut(first_px * 4..first_px * 4 + 64);
            for (px, rgb) in packed.chunks_exact(3).enumerate() {
                let o = px * 4;
                quads[o] = rgb[0];
                quads[o + 1] = rgb[1];
                quads[o + 2] = rgb[2];
                quads[o + 3] = 0xFFFF;
            }
        }

        let pix = full_blocks * 16;
        if pix < width {
            scalar::rgbf32_to_rgba_u16_row::<BE>(
                &rgb_in[pix * 3..width * 3],
                &mut rgba_out[pix * 4..width * 4],
                width - pix,
            );
        }
    }
}
/// Copies a row of packed RGB `f32` samples into `rgb_out`, converting the
/// byte order when needed.
///
/// When `BE == HOST_NATIVE_BE` the samples are copied verbatim, 16 at a time
/// with a scalar tail. Otherwise the vector part goes through
/// `load_f32x16::<BE>` and the tail reinterprets each sample's bits with
/// `u32::from_be` (when `BE` is `true`) or `u32::from_le`.
///
/// # Safety
///
/// The CPU must support `avx512f` and `avx512bw`. `rgb_in` and `rgb_out` must
/// each hold at least `width * 3` elements: this is only verified by
/// `debug_assert!`, and every access below is unchecked.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn rgbf32_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");

    let total = width * 3;
    // Largest multiple of 16 that does not exceed `total`.
    let vec_end = total - total % 16;

    // SAFETY: all offsets are below `total == width * 3`; the caller
    // guarantees both slices are at least that long.
    unsafe {
        let src = rgb_in.as_ptr();
        let dst = rgb_out.as_mut_ptr();

        if BE == HOST_NATIVE_BE {
            // Input is already in host order: straight copy.
            for base in (0..vec_end).step_by(16) {
                _mm512_storeu_ps(dst.add(base), _mm512_loadu_ps(src.add(base)));
            }
            for i in vec_end..total {
                *dst.add(i) = *src.add(i);
            }
        } else {
            for base in (0..vec_end).step_by(16) {
                _mm512_storeu_ps(dst.add(base), load_f32x16::<BE>(src.add(base)));
            }
            for i in vec_end..total {
                let raw = (*src.add(i)).to_bits();
                let native = if BE {
                    u32::from_be(raw)
                } else {
                    u32::from_le(raw)
                };
                *dst.add(i) = f32::from_bits(native);
            }
        }
    }
}
/// Loads 16 half-precision values starting at `ptr` and widens them to a
/// 512-bit vector of `f32`.
///
/// The 16-bit words are fetched through `load_endian_u16x16::<BE>` and then
/// converted with `_mm512_cvtph_ps`.
///
/// # Safety
///
/// The CPU must support `avx512f` and `f16c`.
/// NOTE(review): presumably `ptr` must be readable for 16 `half::f16` values
/// (32 bytes) — confirm against the contract of `load_endian_u16x16`.
#[inline]
#[target_feature(enable = "avx512f,f16c")]
unsafe fn widen_f16x16_avx512<const BE: bool>(ptr: *const half::f16) -> __m512 {
    // SAFETY: the pointer is forwarded unchanged; its validity is the
    // caller's obligation.
    unsafe {
        let halves = load_endian_u16x16::<BE>(ptr.cast::<u8>());
        _mm512_cvtph_ps(halves)
    }
}
/// Converts a row of packed RGB half-precision samples to packed 8-bit RGB.
///
/// Each iteration widens 48 samples (16 pixels) to `f32` in a stack buffer
/// and passes that buffer to `rgbf32_to_rgb_row::<HOST_NATIVE_BE>`, since the
/// widened values are already in host byte order. Remaining pixels are
/// handled by `scalar::rgbf16_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support `avx512f`, `avx512bw` and `f16c`. `rgb_in` and
/// `rgb_out` must each hold at least `width * 3` elements: this is only
/// verified by `debug_assert!`, while the vector loop uses raw pointers and
/// unchecked slicing.
#[inline]
#[target_feature(enable = "avx512f,avx512bw,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");

    let total_lanes = width * 3;
    // Number of complete 48-sample (16-pixel) groups handled by the vector loop.
    let full_blocks = total_lanes / 48;
    // Widened samples of the current group; fully rewritten every iteration.
    let mut widened = [0.0f32; 48];

    for block in 0..full_blocks {
        let lane = block * 48;
        // SAFETY: `lane + 48 <= width * 3`, which the caller guarantees is
        // within both slices; `widened` holds exactly 48 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(lane);
            let stage = widened.as_mut_ptr();
            _mm512_storeu_ps(stage, widen_f16x16_avx512::<BE>(src));
            _mm512_storeu_ps(stage.add(16), widen_f16x16_avx512::<BE>(src.add(16)));
            _mm512_storeu_ps(stage.add(32), widen_f16x16_avx512::<BE>(src.add(32)));
            rgbf32_to_rgb_row::<HOST_NATIVE_BE>(
                &widened,
                rgb_out.get_unchecked_mut(lane..lane + 48),
                16,
            );
        }
    }

    // Pixels that did not fill a whole 16-pixel group go to the scalar path.
    let pixels_done = full_blocks * 16;
    if pixels_done < width {
        scalar::rgbf16_to_rgb_row::<BE>(
            &rgb_in[pixels_done * 3..width * 3],
            &mut rgb_out[pixels_done * 3..width * 3],
            width - pixels_done,
        );
    }
}
/// Converts a row of packed RGB half-precision samples to packed 8-bit RGBA.
///
/// Each iteration widens 48 samples (16 pixels) to `f32` in a stack buffer
/// and passes that buffer to `rgbf32_to_rgba_row::<HOST_NATIVE_BE>`, since
/// the widened values are already in host byte order. Remaining pixels are
/// handled by `scalar::rgbf16_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support `avx512f`, `avx512bw` and `f16c`. `rgb_in` must hold
/// at least `width * 3` elements and `rgba_out` at least `width * 4`: this is
/// only verified by `debug_assert!`, while the vector loop uses raw pointers
/// and unchecked slicing.
#[inline]
#[target_feature(enable = "avx512f,avx512bw,f16c")]
pub(crate) unsafe fn rgbf16_to_rgba_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");

    let total_lanes = width * 3;
    // Number of complete 48-sample (16-pixel) groups handled by the vector loop.
    let full_blocks = total_lanes / 48;
    // Widened samples of the current group; fully rewritten every iteration.
    let mut widened = [0.0f32; 48];

    for block in 0..full_blocks {
        let lane = block * 48;
        let out_start = block * 16 * 4;
        // SAFETY: `lane + 48 <= width * 3` and `out_start + 64 <= width * 4`,
        // which the caller guarantees are within the respective slices;
        // `widened` holds exactly 48 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(lane);
            let stage = widened.as_mut_ptr();
            _mm512_storeu_ps(stage, widen_f16x16_avx512::<BE>(src));
            _mm512_storeu_ps(stage.add(16), widen_f16x16_avx512::<BE>(src.add(16)));
            _mm512_storeu_ps(stage.add(32), widen_f16x16_avx512::<BE>(src.add(32)));
            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(
                &widened,
                rgba_out.get_unchecked_mut(out_start..out_start + 64),
                16,
            );
        }
    }

    // Pixels that did not fill a whole 16-pixel group go to the scalar path.
    let pix = full_blocks * 16;
    if pix < width {
        scalar::rgbf16_to_rgba_row::<BE>(
            &rgb_in[pix * 3..width * 3],
            &mut rgba_out[pix * 4..width * 4],
            width - pix,
        );
    }
}
/// Converts a row of packed RGB half-precision samples to packed 16-bit RGB.
///
/// Each iteration widens 48 samples (16 pixels) to `f32` in a stack buffer
/// and passes that buffer to `rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>`, since
/// the widened values are already in host byte order. Remaining pixels are
/// handled by `scalar::rgbf16_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support `avx512f`, `avx512bw` and `f16c`. `rgb_in` and
/// `rgb_out` must each hold at least `width * 3` elements: this is only
/// verified by `debug_assert!`, while the vector loop uses raw pointers and
/// unchecked slicing.
#[inline]
#[target_feature(enable = "avx512f,avx512bw,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");

    let total_lanes = width * 3;
    // Widened samples of the current group; fully rewritten every iteration.
    let mut widened = [0.0f32; 48];
    let mut lane = 0usize;

    while total_lanes - lane >= 48 {
        let group = lane..lane + 48;
        // SAFETY: `group` ends at or before `width * 3`, which the caller
        // guarantees is within both slices; `widened` holds exactly 48 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(group.start);
            let stage = widened.as_mut_ptr();
            _mm512_storeu_ps(stage, widen_f16x16_avx512::<BE>(src));
            _mm512_storeu_ps(stage.add(16), widen_f16x16_avx512::<BE>(src.add(16)));
            _mm512_storeu_ps(stage.add(32), widen_f16x16_avx512::<BE>(src.add(32)));
            rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(
                &widened,
                rgb_out.get_unchecked_mut(group),
                16,
            );
        }
        lane += 48;
    }

    // Pixels that did not fill a whole 16-pixel group go to the scalar path.
    let pixels_done = lane / 3;
    if pixels_done < width {
        scalar::rgbf16_to_rgb_u16_row::<BE>(
            &rgb_in[pixels_done * 3..width * 3],
            &mut rgb_out[pixels_done * 3..width * 3],
            width - pixels_done,
        );
    }
}
/// Converts a row of packed RGB half-precision samples to packed 16-bit RGBA.
///
/// Each iteration widens 48 samples (16 pixels) to `f32` in a stack buffer
/// and passes that buffer to `rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>`,
/// since the widened values are already in host byte order. Remaining pixels
/// are handled by `scalar::rgbf16_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support `avx512f`, `avx512bw` and `f16c`. `rgb_in` must hold
/// at least `width * 3` elements and `rgba_out` at least `width * 4`: this is
/// only verified by `debug_assert!`, while the vector loop uses raw pointers
/// and unchecked slicing.
#[inline]
#[target_feature(enable = "avx512f,avx512bw,f16c")]
pub(crate) unsafe fn rgbf16_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");

    let total_lanes = width * 3;
    // Widened samples of the current group; fully rewritten every iteration.
    let mut widened = [0.0f32; 48];
    let mut lane = 0usize;
    let mut pix = 0usize;

    while total_lanes - lane >= 48 {
        let out_start = pix * 4;
        // SAFETY: `lane + 48 <= width * 3` and `out_start + 64 <= width * 4`,
        // which the caller guarantees are within the respective slices;
        // `widened` holds exactly 48 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(lane);
            let stage = widened.as_mut_ptr();
            _mm512_storeu_ps(stage, widen_f16x16_avx512::<BE>(src));
            _mm512_storeu_ps(stage.add(16), widen_f16x16_avx512::<BE>(src.add(16)));
            _mm512_storeu_ps(stage.add(32), widen_f16x16_avx512::<BE>(src.add(32)));
            rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(
                &widened,
                rgba_out.get_unchecked_mut(out_start..out_start + 64),
                16,
            );
        }
        lane += 48;
        pix += 16;
    }

    // Pixels that did not fill a whole 16-pixel group go to the scalar path.
    if pix < width {
        scalar::rgbf16_to_rgba_u16_row::<BE>(
            &rgb_in[pix * 3..width * 3],
            &mut rgba_out[pix * 4..width * 4],
            width - pix,
        );
    }
}
/// Widens a row of packed RGB half-precision samples to `f32`.
///
/// Sixteen samples at a time are widened with `widen_f16x16_avx512::<BE>`.
/// The remaining samples are converted one by one: the stored bits are
/// reinterpreted with `u16::from_be` (when `BE` is `true`) or `u16::from_le`
/// and then converted with `half::f16::to_f32`.
///
/// # Safety
///
/// The CPU must support `avx512f`, `avx512bw` and `f16c`. `rgb_in` and
/// `rgb_out` must each hold at least `width * 3` elements: this is only
/// verified by `debug_assert!`, while the vector loop uses raw pointers and
/// the tail writes with `get_unchecked_mut`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");

    let total_lanes = width * 3;
    // Largest multiple of 16 that does not exceed `total_lanes`.
    let vec_end = total_lanes - total_lanes % 16;

    for lane in (0..vec_end).step_by(16) {
        // SAFETY: `lane + 16 <= width * 3`, which the caller guarantees is
        // within both slices.
        unsafe {
            let wide = widen_f16x16_avx512::<BE>(rgb_in.as_ptr().add(lane));
            _mm512_storeu_ps(rgb_out.as_mut_ptr().add(lane), wide);
        }
    }

    // The index is shared by the checked read and the unchecked write, so an
    // explicit range loop is kept.
    #[allow(clippy::needless_range_loop)]
    for i in vec_end..total_lanes {
        let raw = rgb_in[i].to_bits();
        let native = if BE {
            u16::from_be(raw)
        } else {
            u16::from_le(raw)
        };
        let value = half::f16::from_bits(native).to_f32();
        // SAFETY: `i < width * 3`, which the caller guarantees is within
        // `rgb_out`.
        unsafe {
            *rgb_out.get_unchecked_mut(i) = value;
        }
    }
}
/// Produces an RGB half-precision row from an RGB half-precision row.
///
/// There is no vector code here: after the debug-only length checks the call
/// is forwarded unchanged to `scalar::rgbf16_to_rgb_f16_row::<BE>`.
///
/// # Safety
///
/// The function is compiled with `avx512f`, `avx512bw` and `f16c` enabled, so
/// the CPU must support those features.
/// NOTE(review): the slice lengths are only checked by `debug_assert!` here —
/// confirm how the scalar routine behaves when a slice is shorter than
/// `width * 3`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_f16_row<const BE: bool>(
rgb_in: &[half::f16],
rgb_out: &mut [half::f16],
width: usize,
) {
debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short");
// Delegate the whole row to the scalar implementation.
scalar::rgbf16_to_rgb_f16_row::<BE>(rgb_in, rgb_out, width);
}