use core::arch::x86_64::*;
use super::endian::load_endian_u32x8;
use super::scalar;
use crate::row::arch::x86_sse41::endian::load_endian_u16x8;
/// `true` when the compilation target is big-endian, `false` otherwise.
///
/// Used in this file as the `BE` const-generic argument meaning "the data is
/// already in host byte order": the f16 kernels pass it when handing their
/// host-order `f32` staging buffer to the f32 kernels, and the f32
/// pass-through kernel compares its `BE` against it to pick the plain-copy path.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Loads eight consecutive `f32` values starting at `ptr` into a 256-bit
/// float vector.
///
/// The 32-bit words are fetched through `load_endian_u32x8::<BE>` and the
/// resulting integer vector is reinterpreted bit-for-bit as packed `f32`
/// (a cast, not a numeric conversion).
///
/// NOTE(review): presumably `load_endian_u32x8::<BE>` byte-swaps each word when
/// `BE` differs from the host byte order — confirm against its definition.
///
/// # Safety
///
/// The CPU must support AVX2, and `ptr` must satisfy whatever
/// `load_endian_u32x8` requires of its source pointer (presumably 32 readable
/// bytes — confirm against that helper).
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn load_f32x8<const BE: bool>(ptr: *const f32) -> __m256 {
    let raw_bytes = ptr.cast::<u8>();
    // SAFETY: forwarded verbatim from this function's own contract.
    unsafe { _mm256_castsi256_ps(load_endian_u32x8::<BE>(raw_bytes)) }
}
/// Clamps eight floats to `[zero, one]`, multiplies them by `scale`, rounds to
/// the nearest integer and returns the results as eight 32-bit integers.
///
/// * `v` — input samples.
/// * `zero`, `one` — lower and upper clamp bounds, broadcast by the caller.
/// * `scale` — multiplier applied after clamping (e.g. 255.0 or 65535.0).
///
/// The value is rounded explicitly with `_MM_FROUND_TO_NEAREST_INT` (exceptions
/// suppressed) before the truncating convert, so the result does not depend on
/// the current MXCSR rounding mode. `v` is the first operand of the `max`, so
/// a NaN lane takes the value of `zero`.
///
/// # Safety
///
/// Must only be executed on a CPU that supports AVX. The function carries no
/// `#[target_feature]` of its own and relies on being inlined into callers
/// that enable it.
#[inline(always)]
unsafe fn clamp_scale_to_u32_256(v: __m256, zero: __m256, one: __m256, scale: __m256) -> __m256i {
    const NEAREST_NO_EXC: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    // SAFETY: the caller guarantees AVX is available.
    unsafe {
        let floored = _mm256_max_ps(v, zero);
        let unit = _mm256_min_ps(floored, one);
        let rounded = _mm256_round_ps::<NEAREST_NO_EXC>(_mm256_mul_ps(unit, scale));
        // Already integral, so the truncating convert is exact.
        _mm256_cvttps_epi32(rounded)
    }
}
/// Converts a row of packed RGB `f32` samples into packed 8-bit RGB.
///
/// Each sample is clamped to `[0, 1]`, scaled by 255 and rounded to nearest
/// (see `clamp_scale_to_u32_256`). Eight pixels (24 samples) are processed per
/// vector iteration; any remaining pixels go to `scalar::rgbf32_to_rgb_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `load_f32x8::<BE>`.
/// * `rgb_out` — at least `width * 3` bytes.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2. The slice-length requirements are only checked
/// by `debug_assert!`; the vector loop reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgbf32_to_rgb_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // Swaps the two middle 64-bit quarters: undoes the per-128-bit-lane
    // interleave that the 256-bit pack instructions produce.
    const UNLANE: i32 = 0b11_01_10_00;
    // SAFETY: every vector iteration touches samples `done..done + 24`, which
    // the loop condition keeps below `width * 3`; the caller guarantees both
    // slices are at least that long and that AVX2 is available.
    unsafe {
        let lo = _mm256_setzero_ps();
        let hi = _mm256_set1_ps(1.0);
        let gain = _mm256_set1_ps(255.0);
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let mut done = 0usize;
        while done + 24 <= sample_count {
            let at = src.add(done);
            let q0 = clamp_scale_to_u32_256(load_f32x8::<BE>(at), lo, hi, gain);
            let q1 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(8)), lo, hi, gain);
            let q2 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(16)), lo, hi, gain);
            // i32 -> i16: samples 0..16 in order, then samples 16..24 (twice).
            let w01 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packs_epi32(q0, q1));
            let w22 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packs_epi32(q2, q2));
            // i16 -> u8: the first 24 bytes are the samples in order; the last
            // 8 bytes duplicate samples 16..24 and are discarded.
            let packed = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packus_epi16(w01, w22));
            let mut staged = [0u8; 32];
            _mm256_storeu_si256(staged.as_mut_ptr().cast::<__m256i>(), packed);
            rgb_out
                .get_unchecked_mut(done..done + 24)
                .copy_from_slice(&staged[..24]);
            done += 24;
        }
        // `done` is a multiple of 24, so this division is exact.
        let first_tail_px = done / 3;
        if first_tail_px < width {
            scalar::rgbf32_to_rgb_row::<BE>(
                &rgb_in[first_tail_px * 3..width * 3],
                &mut rgb_out[first_tail_px * 3..width * 3],
                width - first_tail_px,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples into packed 8-bit RGBA with the
/// alpha byte set to `0xFF`.
///
/// Each colour sample is clamped to `[0, 1]`, scaled by 255 and rounded to
/// nearest (see `clamp_scale_to_u32_256`). Eight pixels are processed per
/// vector iteration; remaining pixels go to `scalar::rgbf32_to_rgba_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `load_f32x8::<BE>`.
/// * `rgba_out` — at least `width * 4` bytes.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2. The slice-length requirements are only checked
/// by `debug_assert!`; the vector loop reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgbf32_to_rgba_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // Swaps the two middle 64-bit quarters: undoes the per-128-bit-lane
    // interleave that the 256-bit pack instructions produce.
    const UNLANE: i32 = 0b11_01_10_00;
    // SAFETY: each vector iteration reads samples `px * 3 .. px * 3 + 24` and
    // writes bytes `px * 4 .. px * 4 + 32`; the loop condition keeps the eight
    // pixels inside `width`, and the caller guarantees the slice lengths and
    // AVX2 availability.
    unsafe {
        let lo = _mm256_setzero_ps();
        let hi = _mm256_set1_ps(1.0);
        let gain = _mm256_set1_ps(255.0);
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let mut px = 0usize;
        while px * 3 + 24 <= sample_count {
            let at = src.add(px * 3);
            let q0 = clamp_scale_to_u32_256(load_f32x8::<BE>(at), lo, hi, gain);
            let q1 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(8)), lo, hi, gain);
            let q2 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(16)), lo, hi, gain);
            let w01 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packs_epi32(q0, q1));
            let w22 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packs_epi32(q2, q2));
            let packed = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packus_epi16(w01, w22));
            // Only the first 24 staged bytes (8 RGB triples) are meaningful.
            let mut staged = [0u8; 32];
            _mm256_storeu_si256(staged.as_mut_ptr().cast::<__m256i>(), packed);
            let dst = rgba_out.get_unchecked_mut(px * 4..px * 4 + 32);
            // Expand each RGB triple to RGBA with opaque alpha.
            for (rgba, rgb) in dst.chunks_exact_mut(4).zip(staged[..24].chunks_exact(3)) {
                rgba[..3].copy_from_slice(rgb);
                rgba[3] = 0xFF;
            }
            px += 8;
        }
        if px < width {
            scalar::rgbf32_to_rgba_row::<BE>(
                &rgb_in[px * 3..width * 3],
                &mut rgba_out[px * 4..width * 4],
                width - px,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples into packed 16-bit RGB.
///
/// Each sample is clamped to `[0, 1]`, scaled by 65535 and rounded to nearest
/// (see `clamp_scale_to_u32_256`). Eight pixels (24 samples) are processed per
/// vector iteration; remaining pixels go to `scalar::rgbf32_to_rgb_u16_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `load_f32x8::<BE>`.
/// * `rgb_out` — at least `width * 3` `u16` elements.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2. The slice-length requirements are only checked
/// by `debug_assert!`; the vector loop reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgbf32_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");
    // Swaps the two middle 64-bit quarters: undoes the per-128-bit-lane
    // interleave that the 256-bit pack instructions produce.
    const UNLANE: i32 = 0b11_01_10_00;
    // SAFETY: every vector iteration touches samples `done..done + 24`, which
    // the loop condition keeps below `width * 3`; the caller guarantees both
    // slices are at least that long and that AVX2 is available.
    unsafe {
        let lo = _mm256_setzero_ps();
        let hi = _mm256_set1_ps(1.0);
        let gain = _mm256_set1_ps(65535.0);
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let mut done = 0usize;
        while done + 24 <= sample_count {
            let at = src.add(done);
            let q0 = clamp_scale_to_u32_256(load_f32x8::<BE>(at), lo, hi, gain);
            let q1 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(8)), lo, hi, gain);
            let q2 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(16)), lo, hi, gain);
            // Unsigned-saturating i32 -> u16: samples 0..16, then 16..24 (twice).
            let w01 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packus_epi32(q0, q1));
            let w22 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packus_epi32(q2, q2));
            let mut staged = [0u16; 32];
            let stage_ptr = staged.as_mut_ptr();
            _mm256_storeu_si256(stage_ptr.cast::<__m256i>(), w01);
            _mm256_storeu_si256(stage_ptr.add(16).cast::<__m256i>(), w22);
            // Elements 24..32 duplicate samples 16..24 and are discarded.
            rgb_out
                .get_unchecked_mut(done..done + 24)
                .copy_from_slice(&staged[..24]);
            done += 24;
        }
        // `done` is a multiple of 24, so this division is exact.
        let first_tail_px = done / 3;
        if first_tail_px < width {
            scalar::rgbf32_to_rgb_u16_row::<BE>(
                &rgb_in[first_tail_px * 3..width * 3],
                &mut rgb_out[first_tail_px * 3..width * 3],
                width - first_tail_px,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples into packed 16-bit RGBA with the
/// alpha element set to `0xFFFF`.
///
/// Each colour sample is clamped to `[0, 1]`, scaled by 65535 and rounded to
/// nearest (see `clamp_scale_to_u32_256`). Eight pixels are processed per
/// vector iteration; remaining pixels go to `scalar::rgbf32_to_rgba_u16_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `load_f32x8::<BE>`.
/// * `rgba_out` — at least `width * 4` `u16` elements.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2. The slice-length requirements are only checked
/// by `debug_assert!`; the vector loop reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgbf32_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");
    // Swaps the two middle 64-bit quarters: undoes the per-128-bit-lane
    // interleave that the 256-bit pack instructions produce.
    const UNLANE: i32 = 0b11_01_10_00;
    // SAFETY: each vector iteration reads samples `px * 3 .. px * 3 + 24` and
    // writes elements `px * 4 .. px * 4 + 32`; the loop condition keeps the
    // eight pixels inside `width`, and the caller guarantees the slice lengths
    // and AVX2 availability.
    unsafe {
        let lo = _mm256_setzero_ps();
        let hi = _mm256_set1_ps(1.0);
        let gain = _mm256_set1_ps(65535.0);
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let mut px = 0usize;
        while px * 3 + 24 <= sample_count {
            let at = src.add(px * 3);
            let q0 = clamp_scale_to_u32_256(load_f32x8::<BE>(at), lo, hi, gain);
            let q1 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(8)), lo, hi, gain);
            let q2 = clamp_scale_to_u32_256(load_f32x8::<BE>(at.add(16)), lo, hi, gain);
            let w01 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packus_epi32(q0, q1));
            let w22 = _mm256_permute4x64_epi64::<UNLANE>(_mm256_packus_epi32(q2, q2));
            // Only the first 24 staged elements (8 RGB triples) are meaningful.
            let mut staged = [0u16; 32];
            let stage_ptr = staged.as_mut_ptr();
            _mm256_storeu_si256(stage_ptr.cast::<__m256i>(), w01);
            _mm256_storeu_si256(stage_ptr.add(16).cast::<__m256i>(), w22);
            let dst = rgba_out.get_unchecked_mut(px * 4..px * 4 + 32);
            // Expand each RGB triple to RGBA with opaque alpha.
            for (rgba, rgb) in dst.chunks_exact_mut(4).zip(staged[..24].chunks_exact(3)) {
                rgba[..3].copy_from_slice(rgb);
                rgba[3] = 0xFFFF;
            }
            px += 8;
        }
        if px < width {
            scalar::rgbf32_to_rgba_u16_row::<BE>(
                &rgb_in[px * 3..width * 3],
                &mut rgba_out[px * 4..width * 4],
                width - px,
            );
        }
    }
}
/// Copies a row of packed RGB `f32` samples, converting from the byte order
/// named by `BE` to host byte order.
///
/// When `BE` equals `HOST_NATIVE_BE` the samples are copied unchanged.
/// Otherwise full groups of eight are loaded through `load_f32x8::<BE>` and
/// the remaining samples are converted one at a time with
/// `u32::from_be` / `u32::from_le` on their bit patterns.
///
/// * `rgb_in` — at least `width * 3` samples.
/// * `rgb_out` — at least `width * 3` samples.
/// * `width` — number of pixels to copy.
///
/// # Safety
///
/// The CPU must support AVX2. The slice-length requirements are only checked
/// by `debug_assert!`; all reads and writes are unchecked.
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn rgbf32_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
    // SAFETY: every access is at an index below `width * 3`, which the caller
    // guarantees is within both slices; AVX2 availability is the caller's
    // responsibility as well.
    unsafe {
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let mut at = 0usize;
        if BE != HOST_NATIVE_BE {
            // Input byte order differs from the host: swap while copying.
            while at + 8 <= sample_count {
                _mm256_storeu_ps(dst.add(at), load_f32x8::<BE>(src.add(at)));
                at += 8;
            }
            while at < sample_count {
                let raw = src.add(at).read().to_bits();
                let native = if BE {
                    u32::from_be(raw)
                } else {
                    u32::from_le(raw)
                };
                dst.add(at).write(f32::from_bits(native));
                at += 1;
            }
        } else {
            // Already in host order: straight copy, eight samples at a time.
            while at + 8 <= sample_count {
                _mm256_storeu_ps(dst.add(at), _mm256_loadu_ps(src.add(at)));
                at += 8;
            }
            while at < sample_count {
                dst.add(at).write(src.add(at).read());
                at += 1;
            }
        }
    }
}
/// Loads eight half-precision values starting at `ptr` and widens them to a
/// vector of eight `f32`.
///
/// The 16-bit words are fetched through `load_endian_u16x8::<BE>` and then
/// converted with the F16C instruction behind `_mm256_cvtph_ps`.
///
/// NOTE(review): presumably `load_endian_u16x8::<BE>` byte-swaps each word when
/// `BE` differs from the host byte order — confirm against its definition.
///
/// # Safety
///
/// The CPU must support AVX2 and F16C, and `ptr` must satisfy whatever
/// `load_endian_u16x8` requires of its source pointer (presumably 16 readable
/// bytes — confirm against that helper).
#[inline]
#[target_feature(enable = "avx2,f16c")]
unsafe fn widen_f16x8_avx<const BE: bool>(ptr: *const half::f16) -> __m256 {
    let raw_bytes = ptr.cast::<u8>();
    // SAFETY: forwarded verbatim from this function's own contract.
    unsafe { _mm256_cvtph_ps(load_endian_u16x8::<BE>(raw_bytes)) }
}
/// Converts a row of packed RGB half-float samples into packed 8-bit RGB.
///
/// Eight pixels (24 samples) at a time are widened to `f32` in a stack buffer
/// and passed to `rgbf32_to_rgb_row`. That call uses `HOST_NATIVE_BE` because
/// the widened values are already in host byte order. Remaining pixels go to
/// `scalar::rgbf16_to_rgb_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `widen_f16x8_avx::<BE>`.
/// * `rgb_out` — at least `width * 3` bytes.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2 and F16C. The slice-length requirements are only
/// checked by `debug_assert!`; the vector loop reads and writes unchecked.
#[inline]
#[target_feature(enable = "avx2,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let sample_count = width * 3;
    // Fully overwritten on every iteration before it is read.
    let mut widened = [0.0f32; 24];
    let mut done = 0usize;
    while done + 24 <= sample_count {
        // SAFETY: samples `done..done + 24` are below `width * 3`, which the
        // caller guarantees is within both slices; `widened` holds 24 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(done);
            let stage = widened.as_mut_ptr();
            _mm256_storeu_ps(stage, widen_f16x8_avx::<BE>(src));
            _mm256_storeu_ps(stage.add(8), widen_f16x8_avx::<BE>(src.add(8)));
            _mm256_storeu_ps(stage.add(16), widen_f16x8_avx::<BE>(src.add(16)));
            let out = rgb_out.get_unchecked_mut(done..done + 24);
            rgbf32_to_rgb_row::<HOST_NATIVE_BE>(&widened, out, 8);
        }
        done += 24;
    }
    // `done` is a multiple of 24, so this division is exact.
    let first_tail_px = done / 3;
    if first_tail_px < width {
        scalar::rgbf16_to_rgb_row::<BE>(
            &rgb_in[first_tail_px * 3..width * 3],
            &mut rgb_out[first_tail_px * 3..width * 3],
            width - first_tail_px,
        );
    }
}
/// Converts a row of packed RGB half-float samples into packed 8-bit RGBA.
///
/// Eight pixels at a time are widened to `f32` in a stack buffer and passed to
/// `rgbf32_to_rgba_row`, which also writes the alpha channel. That call uses
/// `HOST_NATIVE_BE` because the widened values are already in host byte order.
/// Remaining pixels go to `scalar::rgbf16_to_rgba_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `widen_f16x8_avx::<BE>`.
/// * `rgba_out` — at least `width * 4` bytes.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2 and F16C. The slice-length requirements are only
/// checked by `debug_assert!`; the vector loop reads and writes unchecked.
#[inline]
#[target_feature(enable = "avx2,f16c")]
pub(crate) unsafe fn rgbf16_to_rgba_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let sample_count = width * 3;
    // Fully overwritten on every iteration before it is read.
    let mut widened = [0.0f32; 24];
    let mut px = 0usize;
    while px * 3 + 24 <= sample_count {
        // SAFETY: the eight pixels starting at `px` lie inside `width`, so the
        // 24 input samples and 32 output bytes are within the lengths the
        // caller guarantees; `widened` holds 24 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(px * 3);
            let stage = widened.as_mut_ptr();
            _mm256_storeu_ps(stage, widen_f16x8_avx::<BE>(src));
            _mm256_storeu_ps(stage.add(8), widen_f16x8_avx::<BE>(src.add(8)));
            _mm256_storeu_ps(stage.add(16), widen_f16x8_avx::<BE>(src.add(16)));
            let out = rgba_out.get_unchecked_mut(px * 4..px * 4 + 32);
            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(&widened, out, 8);
        }
        px += 8;
    }
    if px < width {
        scalar::rgbf16_to_rgba_row::<BE>(
            &rgb_in[px * 3..width * 3],
            &mut rgba_out[px * 4..width * 4],
            width - px,
        );
    }
}
/// Converts a row of packed RGB half-float samples into packed 16-bit RGB.
///
/// Eight pixels (24 samples) at a time are widened to `f32` in a stack buffer
/// and passed to `rgbf32_to_rgb_u16_row`. That call uses `HOST_NATIVE_BE`
/// because the widened values are already in host byte order. Remaining pixels
/// go to `scalar::rgbf16_to_rgb_u16_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `widen_f16x8_avx::<BE>`.
/// * `rgb_out` — at least `width * 3` `u16` elements.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2 and F16C. The slice-length requirements are only
/// checked by `debug_assert!`; the vector loop reads and writes unchecked.
#[inline]
#[target_feature(enable = "avx2,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");
    let sample_count = width * 3;
    // Fully overwritten on every iteration before it is read.
    let mut widened = [0.0f32; 24];
    let mut done = 0usize;
    while done + 24 <= sample_count {
        // SAFETY: samples `done..done + 24` are below `width * 3`, which the
        // caller guarantees is within both slices; `widened` holds 24 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(done);
            let stage = widened.as_mut_ptr();
            _mm256_storeu_ps(stage, widen_f16x8_avx::<BE>(src));
            _mm256_storeu_ps(stage.add(8), widen_f16x8_avx::<BE>(src.add(8)));
            _mm256_storeu_ps(stage.add(16), widen_f16x8_avx::<BE>(src.add(16)));
            let out = rgb_out.get_unchecked_mut(done..done + 24);
            rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(&widened, out, 8);
        }
        done += 24;
    }
    // `done` is a multiple of 24, so this division is exact.
    let first_tail_px = done / 3;
    if first_tail_px < width {
        scalar::rgbf16_to_rgb_u16_row::<BE>(
            &rgb_in[first_tail_px * 3..width * 3],
            &mut rgb_out[first_tail_px * 3..width * 3],
            width - first_tail_px,
        );
    }
}
/// Converts a row of packed RGB half-float samples into packed 16-bit RGBA.
///
/// Eight pixels at a time are widened to `f32` in a stack buffer and passed to
/// `rgbf32_to_rgba_u16_row`, which also writes the alpha channel. That call
/// uses `HOST_NATIVE_BE` because the widened values are already in host byte
/// order. Remaining pixels go to `scalar::rgbf16_to_rgba_u16_row`.
///
/// * `rgb_in` — at least `width * 3` samples, read via `widen_f16x8_avx::<BE>`.
/// * `rgba_out` — at least `width * 4` `u16` elements.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2 and F16C. The slice-length requirements are only
/// checked by `debug_assert!`; the vector loop reads and writes unchecked.
#[inline]
#[target_feature(enable = "avx2,f16c")]
pub(crate) unsafe fn rgbf16_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");
    let sample_count = width * 3;
    // Fully overwritten on every iteration before it is read.
    let mut widened = [0.0f32; 24];
    let mut px = 0usize;
    while px * 3 + 24 <= sample_count {
        // SAFETY: the eight pixels starting at `px` lie inside `width`, so the
        // 24 input samples and 32 output elements are within the lengths the
        // caller guarantees; `widened` holds 24 floats.
        unsafe {
            let src = rgb_in.as_ptr().add(px * 3);
            let stage = widened.as_mut_ptr();
            _mm256_storeu_ps(stage, widen_f16x8_avx::<BE>(src));
            _mm256_storeu_ps(stage.add(8), widen_f16x8_avx::<BE>(src.add(8)));
            _mm256_storeu_ps(stage.add(16), widen_f16x8_avx::<BE>(src.add(16)));
            let out = rgba_out.get_unchecked_mut(px * 4..px * 4 + 32);
            rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(&widened, out, 8);
        }
        px += 8;
    }
    if px < width {
        scalar::rgbf16_to_rgba_u16_row::<BE>(
            &rgb_in[px * 3..width * 3],
            &mut rgba_out[px * 4..width * 4],
            width - px,
        );
    }
}
/// Widens a row of packed RGB half-float samples to `f32`.
///
/// Full groups of eight samples are widened with `widen_f16x8_avx::<BE>`. The
/// remaining samples are handled one at a time: the 16-bit pattern is
/// converted from the byte order named by `BE` to host order and then widened
/// with `half::f16::to_f32`.
///
/// * `rgb_in` — at least `width * 3` samples.
/// * `rgb_out` — at least `width * 3` samples.
/// * `width` — number of pixels to convert.
///
/// # Safety
///
/// The CPU must support AVX2 and F16C. The slice-length requirements are only
/// checked by `debug_assert!`; the vector loop and the tail's output writes
/// are unchecked (the tail's input reads are bounds-checked).
#[inline]
#[target_feature(enable = "avx2,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
    let sample_count = width * 3;
    let mut done = 0usize;
    while done + 8 <= sample_count {
        // SAFETY: samples `done..done + 8` are below `width * 3`, which the
        // caller guarantees is within both slices.
        unsafe {
            let wide = widen_f16x8_avx::<BE>(rgb_in.as_ptr().add(done));
            _mm256_storeu_ps(rgb_out.as_mut_ptr().add(done), wide);
        }
        done += 8;
    }
    // Fewer than eight samples remain; convert them individually.
    while done < sample_count {
        let raw = rgb_in[done].to_bits();
        let native = if BE {
            u16::from_be(raw)
        } else {
            u16::from_le(raw)
        };
        let value = half::f16::from_bits(native).to_f32();
        // SAFETY: `done < width * 3`, within `rgb_out` per the caller's contract.
        unsafe {
            *rgb_out.get_unchecked_mut(done) = value;
        }
        done += 1;
    }
}
/// Half-float RGB to half-float RGB row conversion.
///
/// There is no vector path here: after the debug-only length checks the call
/// is forwarded unchanged to `scalar::rgbf16_to_rgb_f16_row::<BE>`.
///
/// * `rgb_in` — at least `width * 3` samples (debug-asserted).
/// * `rgb_out` — at least `width * 3` samples (debug-asserted).
/// * `width` — number of pixels, passed through to the scalar routine.
///
/// # Safety
///
/// Declared with `avx2,f16c` target features, so it must only be called on a
/// CPU that supports both, even though the body uses no intrinsics itself.
#[inline]
#[target_feature(enable = "avx2,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_f16_row<const BE: bool>(
rgb_in: &[half::f16],
rgb_out: &mut [half::f16],
width: usize,
) {
debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short");
// Delegate the whole row to the scalar implementation.
scalar::rgbf16_to_rgb_f16_row::<BE>(rgb_in, rgb_out, width);
}