use core::arch::x86_64::*;
use super::{endian::load_endian_u32x4, scalar};
/// Loads four consecutive `f32` lanes starting at `ptr`.
///
/// The raw 128 bits are fetched through `load_endian_u32x4::<BE>` (with `BE`
/// describing the byte order of the stored data) and then reinterpreted as
/// packed single-precision floats; no numeric conversion takes place.
///
/// # Safety
///
/// The CPU must support SSE4.1. `ptr` must satisfy whatever
/// `load_endian_u32x4` requires of its pointer — presumably validity for a
/// 16-byte read; confirm against that helper.
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn load_f32x4<const BE: bool>(ptr: *const f32) -> __m128 {
    // SAFETY: the caller upholds the pointer and CPU-feature requirements above.
    unsafe { _mm_castsi128_ps(load_endian_u32x4::<BE>(ptr.cast::<u8>())) }
}
/// Clamps each lane of `v` to `[zero, one]`, multiplies by `scale`, rounds to
/// the nearest integer and returns the four results as 32-bit integer lanes.
///
/// NaN lanes come out as `zero * scale`: `_mm_max_ps` yields its second
/// operand whenever either input is NaN, and `zero` is passed second.
///
/// # Safety
///
/// Uses `_mm_round_ps`, so the CPU must support SSE4.1. The function carries
/// no `#[target_feature]` itself; it is forced inline into its callers.
#[inline(always)]
unsafe fn clamp_scale_to_u32(v: __m128, zero: __m128, one: __m128, scale: __m128) -> __m128i {
    // Round to nearest without raising floating-point exceptions.
    const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    // SAFETY: the caller guarantees SSE4.1; only register operands are involved.
    unsafe {
        let floored = _mm_max_ps(v, zero);
        let unit = _mm_min_ps(floored, one);
        let rounded = _mm_round_ps::<NEAREST>(_mm_mul_ps(unit, scale));
        // The value is already integral, so truncation is exact here.
        _mm_cvttps_epi32(rounded)
    }
}
/// Converts a row of packed RGB `f32` samples to packed 8-bit RGB.
///
/// Samples are clamped to `[0, 1]`, scaled by 255 and rounded to nearest.
/// Twelve samples (four pixels) are handled per SIMD iteration; any remaining
/// pixels are forwarded to `scalar::rgbf32_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb_in` and `rgb_out` must each hold at
/// least `width * 3` elements: this is only checked by `debug_assert!`, and
/// the SIMD loop reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgbf32_to_rgb_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let sample_count = width * 3;
    // SAFETY: every access below stays within the first `width * 3` elements,
    // which the caller guarantees both slices contain.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(255.0);
        let src = rgb_in.as_ptr();
        let mut offset = 0usize;
        while sample_count - offset >= 12 {
            let q0 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(offset)), lo, hi, gain);
            let q1 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(offset + 4)), lo, hi, gain);
            let q2 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(offset + 8)), lo, hi, gain);
            // Narrow i32 -> i16 -> u8; the third vector is duplicated only to
            // fill the upper half of the register.
            let words_lo = _mm_packs_epi32(q0, q1);
            let words_hi = _mm_packs_epi32(q2, q2);
            let packed = _mm_packus_epi16(words_lo, words_hi);
            // Only 12 of the 16 bytes are payload, so stage them on the stack
            // instead of storing past the end of the 12-byte destination.
            let mut staging = [0u8; 16];
            _mm_storeu_si128(staging.as_mut_ptr().cast::<__m128i>(), packed);
            rgb_out
                .get_unchecked_mut(offset..offset + 12)
                .copy_from_slice(&staging[..12]);
            offset += 12;
        }
        let done_px = offset / 3;
        if done_px < width {
            scalar::rgbf32_to_rgb_row::<BE>(
                &rgb_in[done_px * 3..width * 3],
                &mut rgb_out[done_px * 3..width * 3],
                width - done_px,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples to packed 8-bit RGBA with an
/// opaque (`0xFF`) alpha channel.
///
/// Samples are clamped to `[0, 1]`, scaled by 255 and rounded to nearest.
/// Four pixels are handled per SIMD iteration; the 12 packed colour bytes are
/// spread to RGBA inside the register with a byte shuffle and written with a
/// single 16-byte store. Remaining pixels are forwarded to
/// `scalar::rgbf32_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb_in` must hold at least `width * 3`
/// elements and `rgba_out` at least `width * 4`: this is only checked by
/// `debug_assert!`, and the SIMD loop reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgbf32_to_rgba_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: each iteration reads samples `pix * 3 .. pix * 3 + 12` and writes
    // bytes `pix * 4 .. pix * 4 + 16` with `pix + 4 <= width`, which the caller
    // guarantees are in bounds.
    unsafe {
        let zero = _mm_setzero_ps();
        let one = _mm_set1_ps(1.0);
        let scale = _mm_set1_ps(255.0);
        // Byte i of the result takes source byte `spread[i]`; -1 (high bit
        // set) makes PSHUFB write zero, leaving room for alpha.
        let spread = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1);
        // 0xFF in the top byte of every 32-bit lane, i.e. byte 3 of each pixel
        // (x86 is little-endian). The cast only reinterprets the bit pattern.
        let opaque = _mm_set1_epi32(0xFF00_0000u32 as i32);
        let src = rgb_in.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let mut pix = 0usize;
        while pix + 4 <= width {
            let lane = pix * 3;
            let i0 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(lane)), zero, one, scale);
            let i1 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(lane + 4)), zero, one, scale);
            let i2 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(lane + 8)), zero, one, scale);
            // Narrow i32 -> i16 -> u8; bytes 0..12 hold the colour samples in
            // order, bytes 12..16 are a duplicate of the third vector.
            let i01 = _mm_packs_epi32(i0, i1);
            let i22 = _mm_packs_epi32(i2, i2);
            let bytes = _mm_packus_epi16(i01, i22);
            let rgba = _mm_or_si128(_mm_shuffle_epi8(bytes, spread), opaque);
            _mm_storeu_si128(dst.add(pix * 4) as *mut __m128i, rgba);
            pix += 4;
        }
        if pix < width {
            scalar::rgbf32_to_rgba_row::<BE>(
                &rgb_in[pix * 3..width * 3],
                &mut rgba_out[pix * 4..width * 4],
                width - pix,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples to packed 16-bit RGB.
///
/// Samples are clamped to `[0, 1]`, scaled by 65535 and rounded to nearest.
/// Twelve samples (four pixels) are handled per SIMD iteration; any remaining
/// pixels are forwarded to `scalar::rgbf32_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb_in` and `rgb_out` must each hold at
/// least `width * 3` elements: this is only checked by `debug_assert!`, and
/// the SIMD loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgbf32_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");
    // SAFETY: every access below stays within the first `width * 3` elements,
    // which the caller guarantees both slices contain.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let mut offset = 0usize;
        while sample_count - offset >= 12 {
            let q0 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(offset)), lo, hi, gain);
            let q1 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(offset + 4)), lo, hi, gain);
            let q2 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(offset + 8)), lo, hi, gain);
            // Unsigned-saturating narrow i32 -> u16.
            let first_eight = _mm_packus_epi32(q0, q1);
            let last_four = _mm_packus_epi32(q2, q2);
            // 8 + 4 samples: one full store, then only the low 64 bits.
            _mm_storeu_si128(dst.add(offset).cast::<__m128i>(), first_eight);
            _mm_storel_epi64(dst.add(offset + 8).cast::<__m128i>(), last_four);
            offset += 12;
        }
        let done_px = offset / 3;
        if done_px < width {
            scalar::rgbf32_to_rgb_u16_row::<BE>(
                &rgb_in[done_px * 3..width * 3],
                &mut rgb_out[done_px * 3..width * 3],
                width - done_px,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples to packed 16-bit RGBA with an
/// opaque (`0xFFFF`) alpha channel.
///
/// Samples are clamped to `[0, 1]`, scaled by 65535 and rounded to nearest.
/// Four pixels are handled per SIMD iteration; any remaining pixels are
/// forwarded to `scalar::rgbf32_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1. `rgb_in` must hold at least `width * 3`
/// elements and `rgba_out` at least `width * 4`: this is only checked by
/// `debug_assert!`, and the SIMD loop accesses both without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgbf32_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");
    // SAFETY: each iteration reads samples `px * 3 .. px * 3 + 12` and writes
    // elements `px * 4 .. px * 4 + 16` with `px + 4 <= width`, which the caller
    // guarantees are in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        let src = rgb_in.as_ptr();
        let mut px = 0usize;
        while px + 4 <= width {
            let base = px * 3;
            let q0 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(base)), lo, hi, gain);
            let q1 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(base + 4)), lo, hi, gain);
            let q2 = clamp_scale_to_u32(load_f32x4::<BE>(src.add(base + 8)), lo, hi, gain);
            // Unsigned-saturating narrow i32 -> u16.
            let first_eight = _mm_packus_epi32(q0, q1);
            let last_four = _mm_packus_epi32(q2, q2);
            // Stage the 12 colour samples, then interleave alpha per pixel.
            let mut staging = [0u16; 16];
            _mm_storeu_si128(staging.as_mut_ptr().cast::<__m128i>(), first_eight);
            _mm_storel_epi64(staging.as_mut_ptr().add(8).cast::<__m128i>(), last_four);
            let quad = rgba_out.get_unchecked_mut(px * 4..px * 4 + 16);
            // `quad` yields exactly four RGBA groups, so the zip stops after
            // the four RGB triples that carry data.
            for (rgba, rgb) in quad.chunks_exact_mut(4).zip(staging.chunks_exact(3)) {
                rgba[..3].copy_from_slice(rgb);
                rgba[3] = 0xFFFF;
            }
            px += 4;
        }
        if px < width {
            scalar::rgbf32_to_rgba_u16_row::<BE>(
                &rgb_in[px * 3..width * 3],
                &mut rgba_out[px * 4..width * 4],
                width - px,
            );
        }
    }
}
/// Copies a row of packed RGB `f32` samples into `rgb_out`, converting the
/// samples from the byte order described by `BE` to host byte order.
///
/// When `BE` already matches the host (`HOST_NATIVE_BE`) this is a plain
/// slice copy. Otherwise four samples at a time go through `load_f32x4::<BE>`
/// and the at most three leftover samples are byte-swapped one by one.
///
/// # Panics
///
/// Panics if either slice holds fewer than `width * 3` elements.
///
/// # Safety
///
/// The CPU must support SSE4.1.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn rgbf32_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
    let total = width * 3;
    // One bounds check per row; everything below works on these exact-length
    // views, so a short slice panics instead of being read or written out of
    // bounds in release builds.
    let src = &rgb_in[..total];
    let dst = &mut rgb_out[..total];

    if BE == HOST_NATIVE_BE {
        // Already in host order: nothing to convert.
        dst.copy_from_slice(src);
        return;
    }

    let vector_len = total - total % 4;
    let (src_vec, src_tail) = src.split_at(vector_len);
    let (dst_vec, dst_tail) = dst.split_at_mut(vector_len);
    for (out4, in4) in dst_vec.chunks_exact_mut(4).zip(src_vec.chunks_exact(4)) {
        // SAFETY: `chunks_exact(4)` yields slices of exactly four `f32`
        // (16 bytes), matching the 128-bit load and store; the caller
        // guarantees SSE4.1.
        unsafe {
            let v = load_f32x4::<BE>(in4.as_ptr());
            _mm_storeu_ps(out4.as_mut_ptr(), v);
        }
    }
    for (out, sample) in dst_tail.iter_mut().zip(src_tail) {
        let bits = sample.to_bits();
        let host_bits = if BE {
            u32::from_be(bits)
        } else {
            u32::from_le(bits)
        };
        *out = f32::from_bits(host_bits);
    }
}
use super::endian::load_endian_u16x4;
/// `true` when the compilation target is big-endian.
///
/// Used as the `BE` const argument whenever data is already in host byte
/// order (e.g. the scratch buffers produced by the f16 widening paths), and
/// compared against `BE` to detect when no byte swap is needed.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Loads four half-precision values starting at `ptr` and widens them to
/// four `f32` lanes.
///
/// The 16-bit patterns are fetched through `load_endian_u16x4::<BE>` (with
/// `BE` describing the byte order of the stored data) and converted with the
/// F16C instruction behind `_mm_cvtph_ps`.
///
/// # Safety
///
/// The CPU must support SSE4.1 and F16C. `ptr` must satisfy whatever
/// `load_endian_u16x4` requires of its pointer — presumably validity for an
/// 8-byte read; confirm against that helper.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
unsafe fn widen_f16x4_sse<const BE: bool>(ptr: *const half::f16) -> __m128 {
    // SAFETY: the caller upholds the pointer and CPU-feature requirements above.
    unsafe { _mm_cvtph_ps(load_endian_u16x4::<BE>(ptr.cast::<u8>())) }
}
/// Converts a row of packed RGB `f16` samples to packed 8-bit RGB.
///
/// Four pixels at a time are widened to `f32` into a scratch buffer and then
/// run through `rgbf32_to_rgb_row` in host byte order; any remaining pixels
/// are forwarded to `scalar::rgbf16_to_rgb_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1 and F16C. `rgb_in` and `rgb_out` must each
/// hold at least `width * 3` elements: this is only checked by
/// `debug_assert!`, and the SIMD loop accesses both without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let sample_count = width * 3;
    let mut offset = 0usize;
    while sample_count - offset >= 12 {
        let mut widened = [0.0f32; 12];
        // SAFETY: `offset + 12 <= width * 3`, so the twelve reads and the
        // twelve-byte output window are inside the slices the caller provided;
        // `widened` holds exactly the twelve floats stored into it.
        unsafe {
            let src = rgb_in.as_ptr().add(offset);
            for group in 0..3 {
                let lanes = widen_f16x4_sse::<BE>(src.add(group * 4));
                _mm_storeu_ps(widened.as_mut_ptr().add(group * 4), lanes);
            }
            let window = rgb_out.get_unchecked_mut(offset..offset + 12);
            // The scratch buffer is in host order regardless of `BE`.
            rgbf32_to_rgb_row::<HOST_NATIVE_BE>(&widened, window, 4);
        }
        offset += 12;
    }
    let done_px = offset / 3;
    if done_px < width {
        scalar::rgbf16_to_rgb_row::<BE>(
            &rgb_in[done_px * 3..width * 3],
            &mut rgb_out[done_px * 3..width * 3],
            width - done_px,
        );
    }
}
/// Converts a row of packed RGB `f16` samples to packed 8-bit RGBA.
///
/// Four pixels at a time are widened to `f32` into a scratch buffer and then
/// run through `rgbf32_to_rgba_row` in host byte order; any remaining pixels
/// are forwarded to `scalar::rgbf16_to_rgba_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1 and F16C. `rgb_in` must hold at least
/// `width * 3` elements and `rgba_out` at least `width * 4`: this is only
/// checked by `debug_assert!`, and the SIMD loop accesses both without bounds
/// checks.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn rgbf16_to_rgba_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let mut px = 0usize;
    while px + 4 <= width {
        let base = px * 3;
        let mut widened = [0.0f32; 12];
        // SAFETY: `px + 4 <= width`, so the twelve reads and the sixteen-byte
        // output window are inside the slices the caller provided; `widened`
        // holds exactly the twelve floats stored into it.
        unsafe {
            let src = rgb_in.as_ptr().add(base);
            for group in 0..3 {
                let lanes = widen_f16x4_sse::<BE>(src.add(group * 4));
                _mm_storeu_ps(widened.as_mut_ptr().add(group * 4), lanes);
            }
            let window = rgba_out.get_unchecked_mut(px * 4..px * 4 + 16);
            // The scratch buffer is in host order regardless of `BE`.
            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(&widened, window, 4);
        }
        px += 4;
    }
    if px < width {
        scalar::rgbf16_to_rgba_row::<BE>(
            &rgb_in[px * 3..width * 3],
            &mut rgba_out[px * 4..width * 4],
            width - px,
        );
    }
}
/// Converts a row of packed RGB `f16` samples to packed 16-bit RGB.
///
/// Four pixels at a time are widened to `f32` into a scratch buffer and then
/// run through `rgbf32_to_rgb_u16_row` in host byte order; any remaining
/// pixels are forwarded to `scalar::rgbf16_to_rgb_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1 and F16C. `rgb_in` and `rgb_out` must each
/// hold at least `width * 3` elements: this is only checked by
/// `debug_assert!`, and the SIMD loop accesses both without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");
    let sample_count = width * 3;
    let mut offset = 0usize;
    while sample_count - offset >= 12 {
        let mut widened = [0.0f32; 12];
        // SAFETY: `offset + 12 <= width * 3`, so the twelve reads and the
        // twelve-element output window are inside the slices the caller
        // provided; `widened` holds exactly the twelve floats stored into it.
        unsafe {
            let src = rgb_in.as_ptr().add(offset);
            for group in 0..3 {
                let lanes = widen_f16x4_sse::<BE>(src.add(group * 4));
                _mm_storeu_ps(widened.as_mut_ptr().add(group * 4), lanes);
            }
            let window = rgb_out.get_unchecked_mut(offset..offset + 12);
            // The scratch buffer is in host order regardless of `BE`.
            rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(&widened, window, 4);
        }
        offset += 12;
    }
    let done_px = offset / 3;
    if done_px < width {
        scalar::rgbf16_to_rgb_u16_row::<BE>(
            &rgb_in[done_px * 3..width * 3],
            &mut rgb_out[done_px * 3..width * 3],
            width - done_px,
        );
    }
}
/// Converts a row of packed RGB `f16` samples to packed 16-bit RGBA.
///
/// Four pixels at a time are widened to `f32` into a scratch buffer and then
/// run through `rgbf32_to_rgba_u16_row` in host byte order; any remaining
/// pixels are forwarded to `scalar::rgbf16_to_rgba_u16_row`.
///
/// # Safety
///
/// The CPU must support SSE4.1 and F16C. `rgb_in` must hold at least
/// `width * 3` elements and `rgba_out` at least `width * 4`: this is only
/// checked by `debug_assert!`, and the SIMD loop accesses both without bounds
/// checks.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn rgbf16_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");
    let mut px = 0usize;
    while px + 4 <= width {
        let base = px * 3;
        let mut widened = [0.0f32; 12];
        // SAFETY: `px + 4 <= width`, so the twelve reads and the
        // sixteen-element output window are inside the slices the caller
        // provided; `widened` holds exactly the twelve floats stored into it.
        unsafe {
            let src = rgb_in.as_ptr().add(base);
            for group in 0..3 {
                let lanes = widen_f16x4_sse::<BE>(src.add(group * 4));
                _mm_storeu_ps(widened.as_mut_ptr().add(group * 4), lanes);
            }
            let window = rgba_out.get_unchecked_mut(px * 4..px * 4 + 16);
            // The scratch buffer is in host order regardless of `BE`.
            rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(&widened, window, 4);
        }
        px += 4;
    }
    if px < width {
        scalar::rgbf16_to_rgba_u16_row::<BE>(
            &rgb_in[px * 3..width * 3],
            &mut rgba_out[px * 4..width * 4],
            width - px,
        );
    }
}
/// Widens a row of packed RGB `f16` samples to packed RGB `f32`.
///
/// Four samples at a time are converted with `widen_f16x4_sse`; the at most
/// three leftover samples are read with `load_f16_scalar` and converted with
/// `to_f32`.
///
/// # Safety
///
/// The CPU must support SSE4.1 and F16C. `rgb_in` and `rgb_out` must each
/// hold at least `width * 3` elements: this is only checked by
/// `debug_assert!`, and the output is written without bounds checks.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
    let sample_count = width * 3;
    let mut offset = 0usize;
    while sample_count - offset >= 4 {
        // SAFETY: `offset + 4 <= width * 3`, so the four-sample read and the
        // four-float store are inside the slices the caller provided.
        unsafe {
            let lanes = widen_f16x4_sse::<BE>(rgb_in.as_ptr().add(offset));
            _mm_storeu_ps(rgb_out.as_mut_ptr().add(offset), lanes);
        }
        offset += 4;
    }
    while offset < sample_count {
        // SAFETY: `offset < width * 3`, which the caller guarantees is within
        // `rgb_out`; the input read is bounds-checked by `load_f16_scalar`.
        unsafe {
            let sample = load_f16_scalar::<BE>(rgb_in, offset);
            *rgb_out.get_unchecked_mut(offset) = sample.to_f32();
        }
        offset += 1;
    }
}
/// Converts a row of packed RGB `f16` samples to packed RGB `f16`.
///
/// There is no SIMD work here: after the debug-only length checks the call
/// is forwarded unchanged to `scalar::rgbf16_to_rgb_f16_row::<BE>`.
///
/// # Safety
///
/// Declared with `sse4.1,f16c` target features, so the CPU must support
/// both. `rgb_in` and `rgb_out` are expected to hold at least `width * 3`
/// elements; this function only checks that with `debug_assert!`.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn rgbf16_to_rgb_f16_row<const BE: bool>(
rgb_in: &[half::f16],
rgb_out: &mut [half::f16],
width: usize,
) {
debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short");
// Pure delegation to the scalar implementation.
scalar::rgbf16_to_rgb_f16_row::<BE>(rgb_in, rgb_out, width);
}
/// Reads element `i` of `rgb_in` and returns it in host byte order, treating
/// the stored bit pattern as big-endian when `BE` is `true` and little-endian
/// otherwise.
///
/// # Panics
///
/// Panics if `i` is out of bounds for `rgb_in`.
#[inline(always)]
fn load_f16_scalar<const BE: bool>(rgb_in: &[half::f16], i: usize) -> half::f16 {
    let stored = rgb_in[i].to_bits();
    let native = match BE {
        true => u16::from_be(stored),
        false => u16::from_le(stored),
    };
    half::f16::from_bits(native)
}