use core::arch::x86_64::*;
use crate::{
ColorMatrix,
row::{
arch::x86_sse41::endian,
scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar},
},
};
// `true` when the compilation target is big-endian. The f16 kernels in this
// file pass it as the `BE` parameter of the scalar f32 kernels once the tail
// samples have gone through `widen_f16_be_to_host_f32` (presumably those are
// host-order `f32` values by then, as the helper's name suggests — confirm in
// the scalar module).
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Clamps every lane of `v` into the range `[zero, one]`.
///
/// The lower bound is applied first (`max` against `zero`), then the upper
/// bound (`min` against `one`).
///
/// # Safety
///
/// Only register-to-register SSE intrinsics are issued; the caller must be
/// running on a CPU that supports them.
#[inline(always)]
unsafe fn clamp01(v: __m128, zero: __m128, one: __m128) -> __m128 {
    // SAFETY: no memory is touched; instruction availability is the caller's
    // obligation.
    unsafe {
        let floored = _mm_max_ps(v, zero);
        _mm_min_ps(floored, one)
    }
}
/// Multiplies each lane of `v` by `scale`, adds 0.5 and truncates toward zero.
///
/// For the non-negative inputs produced by `clamp01` this amounts to rounding
/// half up.
///
/// # Safety
///
/// Only register-to-register SSE intrinsics are issued; the caller must be
/// running on a CPU that supports them.
#[inline(always)]
unsafe fn scale_round_i32(v: __m128, scale: __m128) -> __m128i {
    // SAFETY: no memory is touched; instruction availability is the caller's
    // obligation.
    unsafe {
        let scaled = _mm_mul_ps(v, scale);
        let biased = _mm_add_ps(scaled, _mm_set1_ps(0.5));
        _mm_cvttps_epi32(biased)
    }
}
/// Narrows four `i32` lanes to four `u8` values with saturation.
///
/// The lanes are first packed to `i16` with signed saturation and then to `u8`
/// with unsigned saturation, so negative inputs become 0 and inputs above 255
/// become 255. Element `i` of the result corresponds to lane `i` of `i32v`.
///
/// # Safety
///
/// Only register-to-register SSE intrinsics are issued; the caller must be
/// running on a CPU that supports them.
#[inline(always)]
unsafe fn i32x4_to_u8x4(i32v: __m128i) -> [u8; 4] {
    // SAFETY: no memory is touched; instruction availability is the caller's
    // obligation.
    unsafe {
        let words = _mm_packs_epi32(i32v, i32v);
        let bytes = _mm_packus_epi16(words, words);
        // The four results sit in the low 32 bits of `bytes`, lane 0 in the
        // least-significant byte.
        _mm_cvtsi128_si32(bytes).to_le_bytes()
    }
}
/// Narrows four `i32` lanes to `u16` by keeping the low 16 bits of each lane.
///
/// There is no saturation here: the `as u16` cast truncates. The callers in
/// this file pass values already confined to `0..=65535`.
///
/// # Safety
///
/// Only SSE intrinsics are issued; the caller must be running on a CPU that
/// supports them.
#[inline(always)]
unsafe fn i32x4_to_u16x4(i32v: __m128i) -> [u16; 4] {
    let mut lanes = [0i32; 4];
    // SAFETY: `lanes` is exactly 16 bytes and the store is the unaligned form.
    unsafe {
        _mm_storeu_si128(lanes.as_mut_ptr().cast::<__m128i>(), i32v);
    }
    lanes.map(|lane| lane as u16)
}
/// Converts planar G/B/R `f32` rows into packed 8-bit `R, G, B` pixels.
///
/// Every sample is clamped to `[0.0, 1.0]`, scaled by 255, biased by 0.5 and
/// truncated. Four pixels are handled per SSE iteration; the remaining
/// `width % 4` pixels are delegated to the scalar kernel of the same name.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available, and `g`, `b`, `r` must each hold at least `width`
/// samples: the vector loads use raw pointers and the length checks are
/// `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf32_to_rgb_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(255.0);
        // Largest multiple of four not exceeding `width`.
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_raw = endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_raw = endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_raw = endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_castsi128_ps(g_raw), lo, hi);
            let b01 = clamp01(_mm_castsi128_ps(b_raw), lo, hi);
            let r01 = clamp01(_mm_castsi128_ps(r_raw), lo, hi);
            let g8 = i32x4_to_u8x4(scale_round_i32(g01, gain));
            let b8 = i32x4_to_u8x4(scale_round_i32(b01, gain));
            let r8 = i32x4_to_u8x4(scale_round_i32(r01, gain));
            for (lane, ((&red, &green), &blue)) in r8.iter().zip(&g8).zip(&b8).enumerate() {
                let o = (px + lane) * 3;
                out[o] = red;
                out[o + 1] = green;
                out[o + 2] = blue;
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrpf32_to_rgb_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 3..],
                width - px,
            );
        }
    }
}
/// Converts planar G/B/R `f32` rows into packed 8-bit `R, G, B, A` pixels with
/// alpha fixed at `0xFF`.
///
/// Colour samples are clamped to `[0.0, 1.0]`, scaled by 255, biased by 0.5
/// and truncated. Four pixels are handled per SSE iteration; the remaining
/// `width % 4` pixels are delegated to the scalar kernel of the same name.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available, and `g`, `b`, `r` must each hold at least `width`
/// samples: the vector loads use raw pointers and the length checks are
/// `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf32_to_rgba_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(255.0);
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_raw = endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_raw = endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_raw = endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_castsi128_ps(g_raw), lo, hi);
            let b01 = clamp01(_mm_castsi128_ps(b_raw), lo, hi);
            let r01 = clamp01(_mm_castsi128_ps(r_raw), lo, hi);
            let g8 = i32x4_to_u8x4(scale_round_i32(g01, gain));
            let b8 = i32x4_to_u8x4(scale_round_i32(b01, gain));
            let r8 = i32x4_to_u8x4(scale_round_i32(r01, gain));
            for lane in 0..4 {
                // Opaque alpha is appended to every pixel.
                let pixel = [r8[lane], g8[lane], b8[lane], 0xFF];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrpf32_to_rgba_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Converts planar G/B/R `f32` rows into packed 16-bit `R, G, B` pixels.
///
/// Every sample is clamped to `[0.0, 1.0]`, scaled by 65535, biased by 0.5 and
/// truncated. Four pixels are handled per SSE iteration; the remaining
/// `width % 4` pixels are delegated to the scalar kernel of the same name.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available, and `g`, `b`, `r` must each hold at least `width`
/// samples: the vector loads use raw pointers and the length checks are
/// `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf32_to_rgb_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        // Largest multiple of four not exceeding `width`.
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_raw = endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_raw = endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_raw = endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_castsi128_ps(g_raw), lo, hi);
            let b01 = clamp01(_mm_castsi128_ps(b_raw), lo, hi);
            let r01 = clamp01(_mm_castsi128_ps(r_raw), lo, hi);
            let g16 = i32x4_to_u16x4(scale_round_i32(g01, gain));
            let b16 = i32x4_to_u16x4(scale_round_i32(b01, gain));
            let r16 = i32x4_to_u16x4(scale_round_i32(r01, gain));
            for (lane, ((&red, &green), &blue)) in r16.iter().zip(&g16).zip(&b16).enumerate() {
                let o = (px + lane) * 3;
                out[o] = red;
                out[o + 1] = green;
                out[o + 2] = blue;
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrpf32_to_rgb_u16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 3..],
                width - px,
            );
        }
    }
}
/// Converts planar G/B/R `f32` rows into packed 16-bit `R, G, B, A` pixels
/// with alpha fixed at `0xFFFF`.
///
/// Colour samples are clamped to `[0.0, 1.0]`, scaled by 65535, biased by 0.5
/// and truncated. Four pixels are handled per SSE iteration; the remaining
/// `width % 4` pixels are delegated to the scalar kernel of the same name.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available, and `g`, `b`, `r` must each hold at least `width`
/// samples: the vector loads use raw pointers and the length checks are
/// `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf32_to_rgba_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_raw = endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_raw = endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_raw = endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_castsi128_ps(g_raw), lo, hi);
            let b01 = clamp01(_mm_castsi128_ps(b_raw), lo, hi);
            let r01 = clamp01(_mm_castsi128_ps(r_raw), lo, hi);
            let g16 = i32x4_to_u16x4(scale_round_i32(g01, gain));
            let b16 = i32x4_to_u16x4(scale_round_i32(b01, gain));
            let r16 = i32x4_to_u16x4(scale_round_i32(r01, gain));
            for lane in 0..4 {
                // Opaque alpha is appended to every pixel.
                let pixel = [r16[lane], g16[lane], b16[lane], 0xFFFF];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrpf32_to_rgba_u16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Interleaves planar G/B/R `f32` rows into packed `f32` RGB.
///
/// There is no SIMD body here: after the debug-only length checks the call is
/// forwarded unchanged to the scalar kernel of the same name.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "sse4.1")]`, so the caller must
/// ensure SSE4.1 is available.
#[inline]
#[target_feature(enable = "sse4.1")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf32_to_rgb_f32_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [f32],
    width: usize,
) {
    for (plane, msg) in [
        (g, "g row too short"),
        (b, "b row too short"),
        (r, "r row too short"),
    ] {
        debug_assert!(plane.len() >= width, "{msg}");
    }
    debug_assert!(out.len() >= width * 3, "out row too short");
    scalar::gbrpf32_to_rgb_f32_row::<BE>(g, b, r, out, width);
}
/// Interleaves planar G/B/R `f32` rows into packed `f32` RGBA.
///
/// There is no SIMD body here: after the debug-only length checks the call is
/// forwarded unchanged to the scalar kernel of the same name.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "sse4.1")]`, so the caller must
/// ensure SSE4.1 is available.
#[inline]
#[target_feature(enable = "sse4.1")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf32_to_rgba_f32_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [f32],
    width: usize,
) {
    for (plane, msg) in [
        (g, "g row too short"),
        (b, "b row too short"),
        (r, "r row too short"),
    ] {
        debug_assert!(plane.len() >= width, "{msg}");
    }
    debug_assert!(out.len() >= width * 4, "out row too short");
    scalar::gbrpf32_to_rgba_f32_row::<BE>(g, b, r, out, width);
}
/// Converts planar G/B/R `f32` rows into packed half-float `R, G, B` pixels
/// using the F16C `_mm_cvtps_ph` conversion in round-to-nearest mode.
///
/// No clamping is applied. Four pixels are handled per iteration; the
/// remaining `width % 4` pixels are delegated to the scalar
/// `gbrpf32_to_rgb_f16_row` kernel.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available. `g`, `b`, `r` must each hold at least
/// `width` samples and `out` at least `width * 3` elements: both the vector
/// loads and the vector-path stores go through raw pointers, and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn gbrpf32_to_rgb_f16_row_f16c<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract the planes hold `width` samples and `out`
    // holds `width * 3` elements, so every access below is in bounds.
    unsafe {
        const ROUND: i32 = _MM_FROUND_TO_NEAREST_INT;
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                g.as_ptr().add(px).cast::<u8>(),
            ));
            let b_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                b.as_ptr().add(px).cast::<u8>(),
            ));
            let r_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                r.as_ptr().add(px).cast::<u8>(),
            ));
            // The four half results occupy the low 64 bits of each conversion.
            let mut r_half = [0u16; 4];
            let mut g_half = [0u16; 4];
            let mut b_half = [0u16; 4];
            _mm_storel_epi64(r_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(r_f32));
            _mm_storel_epi64(g_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(g_f32));
            _mm_storel_epi64(b_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(b_f32));
            // Output elements are written as raw 16-bit patterns.
            let dst = out.as_mut_ptr().cast::<u16>().add(px * 3);
            for lane in 0..4 {
                let pixel = dst.add(lane * 3);
                pixel.write(r_half[lane]);
                pixel.add(1).write(g_half[lane]);
                pixel.add(2).write(b_half[lane]);
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrpf32_to_rgb_f16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 3..],
                width - px,
            );
        }
    }
}
/// Converts planar G/B/R `f32` rows into packed half-float `R, G, B, A` pixels
/// using the F16C `_mm_cvtps_ph` conversion in round-to-nearest mode.
///
/// Alpha is written as the bit pattern `0x3C00` (half-float 1.0). No clamping
/// is applied. Four pixels are handled per iteration; the remaining
/// `width % 4` pixels are delegated to the scalar `gbrpf32_to_rgba_f16_row`
/// kernel.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available. `g`, `b`, `r` must each hold at least
/// `width` samples and `out` at least `width * 4` elements: both the vector
/// loads and the vector-path stores go through raw pointers, and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn gbrpf32_to_rgba_f16_row_f16c<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract the planes hold `width` samples and `out`
    // holds `width * 4` elements, so every access below is in bounds.
    unsafe {
        const ROUND: i32 = _MM_FROUND_TO_NEAREST_INT;
        // Bit pattern of 1.0 in IEEE half precision.
        const ALPHA_ONE: u16 = 0x3C00u16;
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                g.as_ptr().add(px).cast::<u8>(),
            ));
            let b_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                b.as_ptr().add(px).cast::<u8>(),
            ));
            let r_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                r.as_ptr().add(px).cast::<u8>(),
            ));
            // The four half results occupy the low 64 bits of each conversion.
            let mut r_half = [0u16; 4];
            let mut g_half = [0u16; 4];
            let mut b_half = [0u16; 4];
            _mm_storel_epi64(r_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(r_f32));
            _mm_storel_epi64(g_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(g_f32));
            _mm_storel_epi64(b_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(b_f32));
            // Output elements are written as raw 16-bit patterns.
            let dst = out.as_mut_ptr().cast::<u16>().add(px * 4);
            for lane in 0..4 {
                let pixel = dst.add(lane * 4);
                pixel.write(r_half[lane]);
                pixel.add(1).write(g_half[lane]);
                pixel.add(2).write(b_half[lane]);
                pixel.add(3).write(ALPHA_ONE);
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrpf32_to_rgba_f16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Derives 8-bit luma from planar G/B/R `f32` rows.
///
/// The row is processed in chunks of at most 64 pixels: each chunk is first
/// converted to packed RGB8 in a stack buffer by `gbrpf32_to_rgb_row`, then
/// handed to the scalar `rgb_to_luma_row` together with `matrix` and
/// `full_range`, which are forwarded untouched.
///
/// # Safety
///
/// Same requirements as `gbrpf32_to_rgb_row`: SSE4.1 must be available and
/// `g`, `b`, `r` must each hold at least `width` samples.
#[inline]
#[target_feature(enable = "sse4.1")]
#[allow(clippy::too_many_arguments)]
pub(crate) unsafe fn gbrpf32_to_luma_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width, "out row too short");
    const CHUNK: usize = 64;
    let mut rgb = [0u8; CHUNK * 3];
    for start in (0..width).step_by(CHUNK) {
        let n = CHUNK.min(width - start);
        let used = n * 3;
        // SAFETY: the sub-slices start at `start` and the caller guarantees
        // `width` samples per plane, so `n` samples are available in each.
        unsafe {
            gbrpf32_to_rgb_row::<BE>(&g[start..], &b[start..], &r[start..], &mut rgb[..used], n);
        }
        let end = start + n;
        crate::row::scalar::rgb_to_luma_row(
            &rgb[..used],
            &mut out[start..end],
            n,
            matrix,
            full_range,
        );
    }
}
/// Derives 16-bit luma from planar G/B/R `f32` rows.
///
/// The row is processed in chunks of at most 64 pixels: each chunk is first
/// converted to packed RGB8 in a stack buffer by `gbrpf32_to_rgb_row`, then
/// handed to the scalar `rgb_to_luma_u16_row` together with `matrix` and
/// `full_range`, which are forwarded untouched. Note that the intermediate is
/// 8-bit, so the `u16` output is computed from 8-bit colour.
///
/// # Safety
///
/// Same requirements as `gbrpf32_to_rgb_row`: SSE4.1 must be available and
/// `g`, `b`, `r` must each hold at least `width` samples.
#[inline]
#[target_feature(enable = "sse4.1")]
#[allow(clippy::too_many_arguments)]
pub(crate) unsafe fn gbrpf32_to_luma_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width, "out row too short");
    const CHUNK: usize = 64;
    let mut rgb = [0u8; CHUNK * 3];
    for start in (0..width).step_by(CHUNK) {
        let n = CHUNK.min(width - start);
        let used = n * 3;
        // SAFETY: the sub-slices start at `start` and the caller guarantees
        // `width` samples per plane, so `n` samples are available in each.
        unsafe {
            gbrpf32_to_rgb_row::<BE>(&g[start..], &b[start..], &r[start..], &mut rgb[..used], n);
        }
        let end = start + n;
        crate::row::scalar::rgb_to_luma_u16_row(
            &rgb[..used],
            &mut out[start..end],
            n,
            matrix,
            full_range,
        );
    }
}
/// Converts planar G/B/R `f32` rows into three 8-bit H, S and V planes.
///
/// The row is processed in chunks of at most 64 pixels: each chunk is first
/// converted to packed RGB8 in a stack buffer by `gbrpf32_to_rgb_row`, then
/// handed to the scalar `rgb_to_hsv_row`, which fills the matching ranges of
/// `h_out`, `s_out` and `v_out`.
///
/// # Safety
///
/// Same requirements as `gbrpf32_to_rgb_row`: SSE4.1 must be available and
/// `g`, `b`, `r` must each hold at least `width` samples.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf32_to_hsv_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(h_out.len() >= width, "h_out row too short");
    debug_assert!(s_out.len() >= width, "s_out row too short");
    debug_assert!(v_out.len() >= width, "v_out row too short");
    const CHUNK: usize = 64;
    let mut rgb = [0u8; CHUNK * 3];
    for start in (0..width).step_by(CHUNK) {
        let n = CHUNK.min(width - start);
        let used = n * 3;
        // SAFETY: the sub-slices start at `start` and the caller guarantees
        // `width` samples per plane, so `n` samples are available in each.
        unsafe {
            gbrpf32_to_rgb_row::<BE>(&g[start..], &b[start..], &r[start..], &mut rgb[..used], n);
        }
        let end = start + n;
        crate::row::scalar::rgb_to_hsv_row(
            &rgb[..used],
            &mut h_out[start..end],
            &mut s_out[start..end],
            &mut v_out[start..end],
            n,
        );
    }
}
/// Converts planar G/B/R/A `f32` rows into packed 8-bit `R, G, B, A` pixels.
///
/// All four channels, alpha included, are clamped to `[0.0, 1.0]`, scaled by
/// 255, biased by 0.5 and truncated. Four pixels are handled per SSE
/// iteration; the remaining `width % 4` pixels are delegated to the scalar
/// kernel of the same name.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available, and `g`, `b`, `r`, `a` must each hold at least
/// `width` samples: the vector loads use raw pointers and the length checks
/// are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrapf32_to_rgba_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(255.0);
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_raw = endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_raw = endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_raw = endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let a_raw = endian::load_endian_u32x4::<BE>(a.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_castsi128_ps(g_raw), lo, hi);
            let b01 = clamp01(_mm_castsi128_ps(b_raw), lo, hi);
            let r01 = clamp01(_mm_castsi128_ps(r_raw), lo, hi);
            let a01 = clamp01(_mm_castsi128_ps(a_raw), lo, hi);
            let g8 = i32x4_to_u8x4(scale_round_i32(g01, gain));
            let b8 = i32x4_to_u8x4(scale_round_i32(b01, gain));
            let r8 = i32x4_to_u8x4(scale_round_i32(r01, gain));
            let a8 = i32x4_to_u8x4(scale_round_i32(a01, gain));
            for lane in 0..4 {
                let pixel = [r8[lane], g8[lane], b8[lane], a8[lane]];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrapf32_to_rgba_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &a[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Converts planar G/B/R/A `f32` rows into packed 16-bit `R, G, B, A` pixels.
///
/// All four channels, alpha included, are clamped to `[0.0, 1.0]`, scaled by
/// 65535, biased by 0.5 and truncated. Four pixels are handled per SSE
/// iteration; the remaining `width % 4` pixels are delegated to the scalar
/// kernel of the same name.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available, and `g`, `b`, `r`, `a` must each hold at least
/// `width` samples: the vector loads use raw pointers and the length checks
/// are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrapf32_to_rgba_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_raw = endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_raw = endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_raw = endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let a_raw = endian::load_endian_u32x4::<BE>(a.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_castsi128_ps(g_raw), lo, hi);
            let b01 = clamp01(_mm_castsi128_ps(b_raw), lo, hi);
            let r01 = clamp01(_mm_castsi128_ps(r_raw), lo, hi);
            let a01 = clamp01(_mm_castsi128_ps(a_raw), lo, hi);
            let g16 = i32x4_to_u16x4(scale_round_i32(g01, gain));
            let b16 = i32x4_to_u16x4(scale_round_i32(b01, gain));
            let r16 = i32x4_to_u16x4(scale_round_i32(r01, gain));
            let a16 = i32x4_to_u16x4(scale_round_i32(a01, gain));
            for lane in 0..4 {
                let pixel = [r16[lane], g16[lane], b16[lane], a16[lane]];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrapf32_to_rgba_u16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &a[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Interleaves planar G/B/R/A `f32` rows into packed `f32` RGBA.
///
/// There is no SIMD body here: after the debug-only length checks the call is
/// forwarded unchanged to the scalar kernel of the same name.
///
/// # Safety
///
/// Declared with `#[target_feature(enable = "sse4.1")]`, so the caller must
/// ensure SSE4.1 is available.
#[inline]
#[target_feature(enable = "sse4.1")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrapf32_to_rgba_f32_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [f32],
    width: usize,
) {
    for (plane, msg) in [
        (g, "g row too short"),
        (b, "b row too short"),
        (r, "r row too short"),
        (a, "a row too short"),
    ] {
        debug_assert!(plane.len() >= width, "{msg}");
    }
    debug_assert!(out.len() >= width * 4, "out row too short");
    scalar::gbrapf32_to_rgba_f32_row::<BE>(g, b, r, a, out, width);
}
/// Converts planar G/B/R/A `f32` rows into packed half-float `R, G, B, A`
/// pixels using the F16C `_mm_cvtps_ph` conversion in round-to-nearest mode.
///
/// No clamping is applied to any channel. Four pixels are handled per
/// iteration; the remaining `width % 4` pixels are delegated to the scalar
/// `gbrapf32_to_rgba_f16_row` kernel.
///
/// `BE` is forwarded to `endian::load_endian_u32x4` and to the scalar tail
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available. `g`, `b`, `r`, `a` must each hold at
/// least `width` samples and `out` at least `width * 4` elements: both the
/// vector loads and the vector-path stores go through raw pointers, and the
/// length checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn gbrapf32_to_rgba_f16_row_f16c<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract the planes hold `width` samples and `out`
    // holds `width * 4` elements, so every access below is in bounds.
    unsafe {
        const ROUND: i32 = _MM_FROUND_TO_NEAREST_INT;
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                g.as_ptr().add(px).cast::<u8>(),
            ));
            let b_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                b.as_ptr().add(px).cast::<u8>(),
            ));
            let r_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                r.as_ptr().add(px).cast::<u8>(),
            ));
            let a_f32 = _mm_castsi128_ps(endian::load_endian_u32x4::<BE>(
                a.as_ptr().add(px).cast::<u8>(),
            ));
            // The four half results occupy the low 64 bits of each conversion.
            let mut r_half = [0u16; 4];
            let mut g_half = [0u16; 4];
            let mut b_half = [0u16; 4];
            let mut a_half = [0u16; 4];
            _mm_storel_epi64(r_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(r_f32));
            _mm_storel_epi64(g_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(g_f32));
            _mm_storel_epi64(b_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(b_f32));
            _mm_storel_epi64(a_half.as_mut_ptr().cast(), _mm_cvtps_ph::<ROUND>(a_f32));
            // Output elements are written as raw 16-bit patterns.
            let dst = out.as_mut_ptr().cast::<u16>().add(px * 4);
            for lane in 0..4 {
                let pixel = dst.add(lane * 4);
                pixel.write(r_half[lane]);
                pixel.add(1).write(g_half[lane]);
                pixel.add(2).write(b_half[lane]);
                pixel.add(3).write(a_half[lane]);
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar::gbrapf32_to_rgba_f16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &a[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Converts planar G/B/R half-float rows into packed 8-bit `R, G, B` pixels.
///
/// Four samples per plane are widened to `f32` with `_mm_cvtph_ps`, clamped to
/// `[0.0, 1.0]`, scaled by 255, biased by 0.5 and truncated. A tail of fewer
/// than four pixels is widened through `widen_f16_be_to_host_f32` and finished
/// by the scalar f32 kernel instantiated with `HOST_NATIVE_BE`.
///
/// `BE` is forwarded to `endian::load_endian_u16x4` and to the widening helper
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available, and `g`, `b`, `r` must each hold at
/// least `width` samples: the vector loads use raw pointers and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn gbrpf16_to_rgb_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(255.0);
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_bits = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_bits = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_bits = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_cvtph_ps(g_bits), lo, hi);
            let b01 = clamp01(_mm_cvtph_ps(b_bits), lo, hi);
            let r01 = clamp01(_mm_cvtph_ps(r_bits), lo, hi);
            let g8 = i32x4_to_u8x4(scale_round_i32(g01, gain));
            let b8 = i32x4_to_u8x4(scale_round_i32(b01, gain));
            let r8 = i32x4_to_u8x4(scale_round_i32(r01, gain));
            for (lane, ((&red, &green), &blue)) in r8.iter().zip(&g8).zip(&b8).enumerate() {
                let o = (px + lane) * 3;
                out[o] = red;
                out[o + 1] = green;
                out[o + 2] = blue;
            }
            px += 4;
        }
        let rest = width - px;
        if rest > 0 {
            // Widen the leftover samples, then reuse the scalar f32 kernel.
            let mut g_wide = [0.0f32; 4];
            let mut b_wide = [0.0f32; 4];
            let mut r_wide = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_wide, rest);
            scalar::gbrpf32_to_rgb_row::<HOST_NATIVE_BE>(
                &g_wide[..rest],
                &b_wide[..rest],
                &r_wide[..rest],
                &mut out[px * 3..],
                rest,
            );
        }
    }
}
/// Converts planar G/B/R half-float rows into packed 8-bit `R, G, B, A` pixels
/// with alpha fixed at `0xFF`.
///
/// Four samples per plane are widened to `f32` with `_mm_cvtph_ps`, clamped to
/// `[0.0, 1.0]`, scaled by 255, biased by 0.5 and truncated. A tail of fewer
/// than four pixels is widened through `widen_f16_be_to_host_f32` and finished
/// by the scalar f32 kernel instantiated with `HOST_NATIVE_BE`.
///
/// `BE` is forwarded to `endian::load_endian_u16x4` and to the widening helper
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available, and `g`, `b`, `r` must each hold at
/// least `width` samples: the vector loads use raw pointers and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
pub(crate) unsafe fn gbrpf16_to_rgba_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(255.0);
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_bits = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_bits = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_bits = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_cvtph_ps(g_bits), lo, hi);
            let b01 = clamp01(_mm_cvtph_ps(b_bits), lo, hi);
            let r01 = clamp01(_mm_cvtph_ps(r_bits), lo, hi);
            let g8 = i32x4_to_u8x4(scale_round_i32(g01, gain));
            let b8 = i32x4_to_u8x4(scale_round_i32(b01, gain));
            let r8 = i32x4_to_u8x4(scale_round_i32(r01, gain));
            for lane in 0..4 {
                // Opaque alpha is appended to every pixel.
                let pixel = [r8[lane], g8[lane], b8[lane], 0xFF];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        let rest = width - px;
        if rest > 0 {
            // Widen the leftover samples, then reuse the scalar f32 kernel.
            let mut g_wide = [0.0f32; 4];
            let mut b_wide = [0.0f32; 4];
            let mut r_wide = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_wide, rest);
            scalar::gbrpf32_to_rgba_row::<HOST_NATIVE_BE>(
                &g_wide[..rest],
                &b_wide[..rest],
                &r_wide[..rest],
                &mut out[px * 4..],
                rest,
            );
        }
    }
}
/// Converts planar G/B/R half-float rows into packed 16-bit `R, G, B` pixels.
///
/// Four samples per plane are widened to `f32` with `_mm_cvtph_ps`, clamped to
/// `[0.0, 1.0]`, scaled by 65535, biased by 0.5 and truncated. A tail of fewer
/// than four pixels is widened through `widen_f16_be_to_host_f32` and finished
/// by the scalar f32 kernel instantiated with `HOST_NATIVE_BE`.
///
/// `BE` is forwarded to `endian::load_endian_u16x4` and to the widening helper
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available, and `g`, `b`, `r` must each hold at
/// least `width` samples: the vector loads use raw pointers and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_rgb_u16_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_bits = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_bits = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_bits = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_cvtph_ps(g_bits), lo, hi);
            let b01 = clamp01(_mm_cvtph_ps(b_bits), lo, hi);
            let r01 = clamp01(_mm_cvtph_ps(r_bits), lo, hi);
            let g16 = i32x4_to_u16x4(scale_round_i32(g01, gain));
            let b16 = i32x4_to_u16x4(scale_round_i32(b01, gain));
            let r16 = i32x4_to_u16x4(scale_round_i32(r01, gain));
            for (lane, ((&red, &green), &blue)) in r16.iter().zip(&g16).zip(&b16).enumerate() {
                let o = (px + lane) * 3;
                out[o] = red;
                out[o + 1] = green;
                out[o + 2] = blue;
            }
            px += 4;
        }
        let rest = width - px;
        if rest > 0 {
            // Widen the leftover samples, then reuse the scalar f32 kernel.
            let mut g_wide = [0.0f32; 4];
            let mut b_wide = [0.0f32; 4];
            let mut r_wide = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_wide, rest);
            scalar::gbrpf32_to_rgb_u16_row::<HOST_NATIVE_BE>(
                &g_wide[..rest],
                &b_wide[..rest],
                &r_wide[..rest],
                &mut out[px * 3..],
                rest,
            );
        }
    }
}
/// Converts planar G/B/R half-float rows into packed 16-bit `R, G, B, A`
/// pixels with alpha fixed at `0xFFFF`.
///
/// Four samples per plane are widened to `f32` with `_mm_cvtph_ps`, clamped to
/// `[0.0, 1.0]`, scaled by 65535, biased by 0.5 and truncated. A tail of fewer
/// than four pixels is widened through `widen_f16_be_to_host_f32` and finished
/// by the scalar f32 kernel instantiated with `HOST_NATIVE_BE`.
///
/// `BE` is forwarded to `endian::load_endian_u16x4` and to the widening helper
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available, and `g`, `b`, `r` must each hold at
/// least `width` samples: the vector loads use raw pointers and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_rgba_u16_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let lo = _mm_setzero_ps();
        let hi = _mm_set1_ps(1.0);
        let gain = _mm_set1_ps(65535.0);
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_bits = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_bits = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_bits = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let g01 = clamp01(_mm_cvtph_ps(g_bits), lo, hi);
            let b01 = clamp01(_mm_cvtph_ps(b_bits), lo, hi);
            let r01 = clamp01(_mm_cvtph_ps(r_bits), lo, hi);
            let g16 = i32x4_to_u16x4(scale_round_i32(g01, gain));
            let b16 = i32x4_to_u16x4(scale_round_i32(b01, gain));
            let r16 = i32x4_to_u16x4(scale_round_i32(r01, gain));
            for lane in 0..4 {
                // Opaque alpha is appended to every pixel.
                let pixel = [r16[lane], g16[lane], b16[lane], 0xFFFF];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        let rest = width - px;
        if rest > 0 {
            // Widen the leftover samples, then reuse the scalar f32 kernel.
            let mut g_wide = [0.0f32; 4];
            let mut b_wide = [0.0f32; 4];
            let mut r_wide = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_wide, rest);
            scalar::gbrpf32_to_rgba_u16_row::<HOST_NATIVE_BE>(
                &g_wide[..rest],
                &b_wide[..rest],
                &r_wide[..rest],
                &mut out[px * 4..],
                rest,
            );
        }
    }
}
/// Widens planar G/B/R half-float rows into packed `f32` `R, G, B` pixels.
///
/// Four samples per plane are converted with `_mm_cvtph_ps` and written out
/// without clamping or scaling. A tail of fewer than four pixels is widened
/// through `widen_f16_be_to_host_f32` and finished by the scalar f32 kernel
/// instantiated with `HOST_NATIVE_BE`.
///
/// `BE` is forwarded to `endian::load_endian_u16x4` and to the widening helper
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available, and `g`, `b`, `r` must each hold at
/// least `width` samples: the vector loads use raw pointers and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_rgb_f32_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [f32],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let simd_end = width - width % 4;
        let mut px = 0usize;
        while px < simd_end {
            let g_bits = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_bits = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_bits = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            // Spill each widened vector to a small array for interleaving.
            let mut g_lanes = [0.0f32; 4];
            let mut b_lanes = [0.0f32; 4];
            let mut r_lanes = [0.0f32; 4];
            _mm_storeu_ps(g_lanes.as_mut_ptr(), _mm_cvtph_ps(g_bits));
            _mm_storeu_ps(b_lanes.as_mut_ptr(), _mm_cvtph_ps(b_bits));
            _mm_storeu_ps(r_lanes.as_mut_ptr(), _mm_cvtph_ps(r_bits));
            for (lane, ((&red, &green), &blue)) in
                r_lanes.iter().zip(&g_lanes).zip(&b_lanes).enumerate()
            {
                let o = (px + lane) * 3;
                out[o] = red;
                out[o + 1] = green;
                out[o + 2] = blue;
            }
            px += 4;
        }
        let rest = width - px;
        if rest > 0 {
            // Widen the leftover samples, then reuse the scalar f32 kernel.
            let mut g_wide = [0.0f32; 4];
            let mut b_wide = [0.0f32; 4];
            let mut r_wide = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_wide, rest);
            scalar::gbrpf32_to_rgb_f32_row::<HOST_NATIVE_BE>(
                &g_wide[..rest],
                &b_wide[..rest],
                &r_wide[..rest],
                &mut out[px * 3..],
                rest,
            );
        }
    }
}
/// Widens planar G/B/R half-float rows into packed `f32` `R, G, B, A` pixels
/// with alpha fixed at `1.0`.
///
/// Four samples per plane are converted with `_mm_cvtph_ps` and written out
/// without clamping or scaling. A tail of fewer than four pixels is widened
/// through `widen_f16_be_to_host_f32` and finished by the scalar f32 kernel
/// instantiated with `HOST_NATIVE_BE`.
///
/// `BE` is forwarded to `endian::load_endian_u16x4` and to the widening helper
/// (presumably it marks the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 and F16C must be available, and `g`, `b`, `r` must each hold at
/// least `width` samples: the vector loads use raw pointers and the length
/// checks are `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_rgba_f32_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [f32],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract every plane holds `width` samples, so
    // each four-lane load at `px` (with `px + 4 <= width`) stays in bounds.
    unsafe {
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            let g_bits = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_bits = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_bits = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            // Spill each widened vector to a small array for interleaving.
            let mut g_lanes = [0.0f32; 4];
            let mut b_lanes = [0.0f32; 4];
            let mut r_lanes = [0.0f32; 4];
            _mm_storeu_ps(g_lanes.as_mut_ptr(), _mm_cvtph_ps(g_bits));
            _mm_storeu_ps(b_lanes.as_mut_ptr(), _mm_cvtph_ps(b_bits));
            _mm_storeu_ps(r_lanes.as_mut_ptr(), _mm_cvtph_ps(r_bits));
            for lane in 0..4 {
                // Opaque alpha is appended to every pixel.
                let pixel = [r_lanes[lane], g_lanes[lane], b_lanes[lane], 1.0];
                let o = (px + lane) * 4;
                for (c, &v) in pixel.iter().enumerate() {
                    out[o + c] = v;
                }
            }
            px += 4;
        }
        let rest = width - px;
        if rest > 0 {
            // Widen the leftover samples, then reuse the scalar f32 kernel.
            let mut g_wide = [0.0f32; 4];
            let mut b_wide = [0.0f32; 4];
            let mut r_wide = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_wide, rest);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_wide, rest);
            scalar::gbrpf32_to_rgba_f32_row::<HOST_NATIVE_BE>(
                &g_wide[..rest],
                &b_wide[..rest],
                &r_wide[..rest],
                &mut out[px * 4..],
                rest,
            );
        }
    }
}
/// Interleaves planar G/B/R half-float rows into packed half-float `R, G, B`
/// pixels without any numeric conversion.
///
/// The 16-bit patterns returned by `endian::load_endian_u16x4` are stored
/// verbatim, four pixels per iteration. The remaining `width % 4` pixels are
/// delegated to the scalar f16 kernel of the same name.
///
/// `BE` is forwarded to the loader and to the scalar tail (presumably it marks
/// the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available. `g`, `b`, `r` must each hold at least `width`
/// samples and `out` at least `width * 3` elements: both the vector loads and
/// the vector-path stores go through raw pointers, and the length checks are
/// `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf16_to_rgb_f16_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");
    // SAFETY: per the caller contract the planes hold `width` samples and `out`
    // holds `width * 3` elements, so every access below is in bounds.
    unsafe {
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            // Only the low four 16-bit lanes of each load are used; spill them
            // to arrays so they can be indexed per pixel.
            let mut g_bits = [0u16; 4];
            let mut b_bits = [0u16; 4];
            let mut r_bits = [0u16; 4];
            _mm_storel_epi64(
                g_bits.as_mut_ptr().cast(),
                endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
            );
            _mm_storel_epi64(
                b_bits.as_mut_ptr().cast(),
                endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
            );
            _mm_storel_epi64(
                r_bits.as_mut_ptr().cast(),
                endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
            );
            // Output elements are written as raw 16-bit patterns.
            let dst = out.as_mut_ptr().cast::<u16>().add(px * 3);
            for lane in 0..4usize {
                let pixel = dst.add(lane * 3);
                pixel.write(r_bits[lane]);
                pixel.add(1).write(g_bits[lane]);
                pixel.add(2).write(b_bits[lane]);
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar_f16::gbrpf16_to_rgb_f16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 3..],
                width - px,
            );
        }
    }
}
/// Interleaves planar G/B/R half-float rows into packed half-float
/// `R, G, B, A` pixels without any numeric conversion.
///
/// The 16-bit patterns returned by `endian::load_endian_u16x4` are stored
/// verbatim, four pixels per iteration, and alpha is written as the bit
/// pattern `0x3C00` (half-float 1.0). The remaining `width % 4` pixels are
/// delegated to the scalar f16 kernel of the same name.
///
/// `BE` is forwarded to the loader and to the scalar tail (presumably it marks
/// the source samples as big-endian — confirm in `endian`).
///
/// # Safety
///
/// SSE4.1 must be available. `g`, `b`, `r` must each hold at least `width`
/// samples and `out` at least `width * 4` elements: both the vector loads and
/// the vector-path stores go through raw pointers, and the length checks are
/// `debug_assert!`s only.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrpf16_to_rgba_f16_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");
    // SAFETY: per the caller contract the planes hold `width` samples and `out`
    // holds `width * 4` elements, so every access below is in bounds.
    unsafe {
        // Bit pattern of 1.0 in IEEE half precision.
        const ALPHA_ONE: u16 = 0x3C00u16;
        let simd_end = width & !3;
        let mut px = 0usize;
        while px < simd_end {
            // Only the low four 16-bit lanes of each load are used; spill them
            // to arrays so they can be indexed per pixel.
            let mut g_bits = [0u16; 4];
            let mut b_bits = [0u16; 4];
            let mut r_bits = [0u16; 4];
            _mm_storel_epi64(
                g_bits.as_mut_ptr().cast(),
                endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
            );
            _mm_storel_epi64(
                b_bits.as_mut_ptr().cast(),
                endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
            );
            _mm_storel_epi64(
                r_bits.as_mut_ptr().cast(),
                endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
            );
            // Output elements are written as raw 16-bit patterns.
            let dst = out.as_mut_ptr().cast::<u16>().add(px * 4);
            for lane in 0..4usize {
                let pixel = dst.add(lane * 4);
                pixel.write(r_bits[lane]);
                pixel.add(1).write(g_bits[lane]);
                pixel.add(2).write(b_bits[lane]);
                pixel.add(3).write(ALPHA_ONE);
            }
            px += 4;
        }
        if px < width {
            // Fewer than four pixels remain: finish them with the scalar kernel.
            scalar_f16::gbrpf16_to_rgba_f16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}
/// Derives 8-bit luma from planar G/B/R half-float rows.
///
/// The row is processed in chunks of at most 64 pixels: each chunk is first
/// converted to packed RGB8 in a stack buffer by `gbrpf16_to_rgb_row_f16c`,
/// then handed to the scalar `rgb_to_luma_row` together with `matrix` and
/// `full_range`, which are forwarded untouched.
///
/// # Safety
///
/// Same requirements as `gbrpf16_to_rgb_row_f16c`: SSE4.1 and F16C must be
/// available and `g`, `b`, `r` must each hold at least `width` samples.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_luma_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width, "out row too short");
    const CHUNK: usize = 64;
    let mut rgb = [0u8; CHUNK * 3];
    for start in (0..width).step_by(CHUNK) {
        let n = CHUNK.min(width - start);
        let used = n * 3;
        // SAFETY: the sub-slices start at `start` and the caller guarantees
        // `width` samples per plane, so `n` samples are available in each.
        unsafe {
            gbrpf16_to_rgb_row_f16c::<BE>(
                &g[start..],
                &b[start..],
                &r[start..],
                &mut rgb[..used],
                n,
            );
        }
        let end = start + n;
        crate::row::scalar::rgb_to_luma_row(
            &rgb[..used],
            &mut out[start..end],
            n,
            matrix,
            full_range,
        );
    }
}
/// Converts a row of planar half-float G/B/R samples into 16-bit luma.
///
/// Works in chunks of at most 64 pixels: each chunk is converted to
/// 3-bytes-per-pixel data in a stack scratch buffer by
/// `gbrpf16_to_rgb_row_f16c`, after which the scalar `rgb_to_luma_u16_row`
/// produces the `u16` output using `matrix` and `full_range`.
///
/// `BE` is forwarded unchanged to the RGB conversion step.
///
/// # Safety
///
/// The executing CPU must support SSE4.1 and F16C. `g`, `b` and `r` are
/// expected to hold at least `width` samples and `out` at least `width`
/// elements (asserted in debug builds).
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_luma_u16_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width, "out row too short");

    const CHUNK: usize = 64;
    // The intermediate stays 8-bit: three bytes per pixel of the chunk.
    let mut rgb_scratch = [0u8; CHUNK * 3];

    for start in (0..width).step_by(CHUNK) {
        let pixels = CHUNK.min(width - start);
        let rgb_len = pixels * 3;

        // SAFETY: the required CPU features are part of this function's own
        // contract, and every slice passed down is a bounds-checked subslice.
        unsafe {
            gbrpf16_to_rgb_row_f16c::<BE>(
                &g[start..],
                &b[start..],
                &r[start..],
                &mut rgb_scratch[..rgb_len],
                pixels,
            );
        }

        crate::row::scalar::rgb_to_luma_u16_row(
            &rgb_scratch[..rgb_len],
            &mut out[start..start + pixels],
            pixels,
            matrix,
            full_range,
        );
    }
}
/// Converts a row of planar half-float G/B/R samples into three 8-bit
/// output planes (`h_out`, `s_out`, `v_out`).
///
/// Works in chunks of at most 64 pixels: each chunk is converted to
/// 3-bytes-per-pixel data in a stack scratch buffer by
/// `gbrpf16_to_rgb_row_f16c`, and the scalar `rgb_to_hsv_row` then fills the
/// matching window of the three output planes.
///
/// `BE` is forwarded unchanged to the RGB conversion step.
///
/// # Safety
///
/// The executing CPU must support SSE4.1 and F16C. All input and output
/// slices are expected to hold at least `width` elements (asserted in debug
/// builds).
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrpf16_to_hsv_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(h_out.len() >= width, "h_out row too short");
    debug_assert!(s_out.len() >= width, "s_out row too short");
    debug_assert!(v_out.len() >= width, "v_out row too short");

    const CHUNK: usize = 64;
    // Three bytes of intermediate RGB per pixel of the current chunk.
    let mut rgb_scratch = [0u8; CHUNK * 3];

    for start in (0..width).step_by(CHUNK) {
        let pixels = CHUNK.min(width - start);
        let rgb_len = pixels * 3;

        // SAFETY: the required CPU features are part of this function's own
        // contract, and every slice passed down is a bounds-checked subslice.
        unsafe {
            gbrpf16_to_rgb_row_f16c::<BE>(
                &g[start..],
                &b[start..],
                &r[start..],
                &mut rgb_scratch[..rgb_len],
                pixels,
            );
        }

        crate::row::scalar::rgb_to_hsv_row(
            &rgb_scratch[..rgb_len],
            &mut h_out[start..start + pixels],
            &mut s_out[start..start + pixels],
            &mut v_out[start..start + pixels],
            pixels,
        );
    }
}
/// Converts planar half-float G/B/R/A rows into interleaved 8-bit output,
/// written in R, G, B, A order (four bytes per pixel).
///
/// Four pixels are processed per iteration: each plane is loaded through
/// `endian::load_endian_u16x4::<BE>`, widened to `f32` with `_mm_cvtph_ps`,
/// clamped to `[0, 1]`, multiplied by 255 and rounded by adding 0.5 and
/// truncating. The up-to-three leftover pixels are widened by
/// `scalar_f16::widen_f16_be_to_host_f32` and finished by the scalar
/// `gbrapf32_to_rgba_row`.
///
/// # Safety
///
/// The executing CPU must support SSE4.1 and F16C. `g`, `b`, `r` and `a` must
/// each hold at least `width` samples, because the vector loop reads them
/// through raw pointers; `out` is expected to hold at least `width * 4` bytes.
/// These lengths are only asserted in debug builds.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrapf16_to_rgba_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    a: &[half::f16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    // Number of pixels covered by complete four-lane groups.
    let simd_width = width - width % 4;

    // SAFETY: the caller guarantees the CPU features and plane lengths stated
    // above, so every four-sample load starting at `px < simd_width` stays
    // inside its plane.
    unsafe {
        let lower = _mm_setzero_ps();
        let upper = _mm_set1_ps(1.0);
        let full_scale = _mm_set1_ps(255.0);

        let mut px = 0usize;
        while px < simd_width {
            let g_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                g.as_ptr().add(px).cast::<u8>(),
            ));
            let b_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                b.as_ptr().add(px).cast::<u8>(),
            ));
            let r_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                r.as_ptr().add(px).cast::<u8>(),
            ));
            let a_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                a.as_ptr().add(px).cast::<u8>(),
            ));

            let green = i32x4_to_u8x4(scale_round_i32(clamp01(g_f32, lower, upper), full_scale));
            let blue = i32x4_to_u8x4(scale_round_i32(clamp01(b_f32, lower, upper), full_scale));
            let red = i32x4_to_u8x4(scale_round_i32(clamp01(r_f32, lower, upper), full_scale));
            let alpha = i32x4_to_u8x4(scale_round_i32(clamp01(a_f32, lower, upper), full_scale));

            // Channels in output order; lane `i` of each belongs to pixel `px + i`.
            let channels = [red, green, blue, alpha];
            let group_base = px * 4;
            for lane in 0..4usize {
                for (c, channel) in channels.iter().enumerate() {
                    out[group_base + lane * 4 + c] = channel[lane];
                }
            }
            px += 4;
        }

        if px < width {
            // Fewer than four pixels remain: widen them to f32 and let the
            // scalar f32 kernel finish the row.
            let remaining = width - px;
            let mut g_tail = [0.0f32; 4];
            let mut b_tail = [0.0f32; 4];
            let mut r_tail = [0.0f32; 4];
            let mut a_tail = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(a, px, &mut a_tail, remaining);
            scalar::gbrapf32_to_rgba_row::<HOST_NATIVE_BE>(
                &g_tail[..remaining],
                &b_tail[..remaining],
                &r_tail[..remaining],
                &a_tail[..remaining],
                &mut out[px * 4..],
                remaining,
            );
        }
    }
}
/// Converts planar half-float G/B/R/A rows into interleaved 16-bit output,
/// written in R, G, B, A order (four `u16` per pixel).
///
/// Four pixels are processed per iteration: each plane is loaded through
/// `endian::load_endian_u16x4::<BE>`, widened to `f32` with `_mm_cvtph_ps`,
/// clamped to `[0, 1]`, multiplied by 65535 and rounded by adding 0.5 and
/// truncating. The up-to-three leftover pixels are widened by
/// `scalar_f16::widen_f16_be_to_host_f32` and finished by the scalar
/// `gbrapf32_to_rgba_u16_row`.
///
/// # Safety
///
/// The executing CPU must support SSE4.1 and F16C. `g`, `b`, `r` and `a` must
/// each hold at least `width` samples, because the vector loop reads them
/// through raw pointers; `out` is expected to hold at least `width * 4`
/// elements. These lengths are only asserted in debug builds.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrapf16_to_rgba_u16_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    a: &[half::f16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    // Number of pixels covered by complete four-lane groups.
    let simd_width = width - width % 4;

    // SAFETY: the caller guarantees the CPU features and plane lengths stated
    // above, so every four-sample load starting at `px < simd_width` stays
    // inside its plane.
    unsafe {
        let lower = _mm_setzero_ps();
        let upper = _mm_set1_ps(1.0);
        let full_scale = _mm_set1_ps(65535.0);

        let mut px = 0usize;
        while px < simd_width {
            let g_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                g.as_ptr().add(px).cast::<u8>(),
            ));
            let b_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                b.as_ptr().add(px).cast::<u8>(),
            ));
            let r_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                r.as_ptr().add(px).cast::<u8>(),
            ));
            let a_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                a.as_ptr().add(px).cast::<u8>(),
            ));

            let green = i32x4_to_u16x4(scale_round_i32(clamp01(g_f32, lower, upper), full_scale));
            let blue = i32x4_to_u16x4(scale_round_i32(clamp01(b_f32, lower, upper), full_scale));
            let red = i32x4_to_u16x4(scale_round_i32(clamp01(r_f32, lower, upper), full_scale));
            let alpha = i32x4_to_u16x4(scale_round_i32(clamp01(a_f32, lower, upper), full_scale));

            // Channels in output order; lane `i` of each belongs to pixel `px + i`.
            let channels = [red, green, blue, alpha];
            let group_base = px * 4;
            for lane in 0..4usize {
                for (c, channel) in channels.iter().enumerate() {
                    out[group_base + lane * 4 + c] = channel[lane];
                }
            }
            px += 4;
        }

        if px < width {
            // Fewer than four pixels remain: widen them to f32 and let the
            // scalar f32 kernel finish the row.
            let remaining = width - px;
            let mut g_tail = [0.0f32; 4];
            let mut b_tail = [0.0f32; 4];
            let mut r_tail = [0.0f32; 4];
            let mut a_tail = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(a, px, &mut a_tail, remaining);
            scalar::gbrapf32_to_rgba_u16_row::<HOST_NATIVE_BE>(
                &g_tail[..remaining],
                &b_tail[..remaining],
                &r_tail[..remaining],
                &a_tail[..remaining],
                &mut out[px * 4..],
                remaining,
            );
        }
    }
}
/// Widens planar half-float G/B/R/A rows into interleaved `f32` output,
/// written in R, G, B, A order (four floats per pixel).
///
/// No clamping or scaling is applied: each group of four samples is loaded
/// through `endian::load_endian_u16x4::<BE>`, converted with `_mm_cvtph_ps`
/// and copied to `out`. The up-to-three leftover pixels are widened by
/// `scalar_f16::widen_f16_be_to_host_f32` and finished by the scalar
/// `gbrapf32_to_rgba_f32_row`.
///
/// # Safety
///
/// The executing CPU must support SSE4.1 and F16C. `g`, `b`, `r` and `a` must
/// each hold at least `width` samples, because the vector loop reads them
/// through raw pointers; `out` is expected to hold at least `width * 4`
/// elements. These lengths are only asserted in debug builds.
#[inline]
#[target_feature(enable = "sse4.1,f16c")]
#[allow(dead_code)]
pub(crate) unsafe fn gbrapf16_to_rgba_f32_row_f16c<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    a: &[half::f16],
    out: &mut [f32],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    // Number of pixels covered by complete four-lane groups.
    let simd_width = width - width % 4;

    // SAFETY: the caller guarantees the CPU features and plane lengths stated
    // above, so every four-sample load starting at `px < simd_width` stays
    // inside its plane; the vector stores target local four-element arrays.
    unsafe {
        let mut px = 0usize;
        while px < simd_width {
            let g_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                g.as_ptr().add(px).cast::<u8>(),
            ));
            let b_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                b.as_ptr().add(px).cast::<u8>(),
            ));
            let r_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                r.as_ptr().add(px).cast::<u8>(),
            ));
            let a_f32 = _mm_cvtph_ps(endian::load_endian_u16x4::<BE>(
                a.as_ptr().add(px).cast::<u8>(),
            ));

            // Rows are channels in output order (R, G, B, A); columns are lanes.
            let mut channels = [[0.0f32; 4]; 4];
            _mm_storeu_ps(channels[0].as_mut_ptr(), r_f32);
            _mm_storeu_ps(channels[1].as_mut_ptr(), g_f32);
            _mm_storeu_ps(channels[2].as_mut_ptr(), b_f32);
            _mm_storeu_ps(channels[3].as_mut_ptr(), a_f32);

            let group_base = px * 4;
            for lane in 0..4usize {
                for (c, channel) in channels.iter().enumerate() {
                    out[group_base + lane * 4 + c] = channel[lane];
                }
            }
            px += 4;
        }

        if px < width {
            // Fewer than four pixels remain: widen them to f32 and let the
            // scalar f32 kernel interleave them.
            let remaining = width - px;
            let mut g_tail = [0.0f32; 4];
            let mut b_tail = [0.0f32; 4];
            let mut r_tail = [0.0f32; 4];
            let mut a_tail = [0.0f32; 4];
            scalar_f16::widen_f16_be_to_host_f32::<BE>(g, px, &mut g_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(b, px, &mut b_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(r, px, &mut r_tail, remaining);
            scalar_f16::widen_f16_be_to_host_f32::<BE>(a, px, &mut a_tail, remaining);
            scalar::gbrapf32_to_rgba_f32_row::<HOST_NATIVE_BE>(
                &g_tail[..remaining],
                &b_tail[..remaining],
                &r_tail[..remaining],
                &a_tail[..remaining],
                &mut out[px * 4..],
                remaining,
            );
        }
    }
}
/// Interleaves planar half-float G/B/R/A rows into half-float output,
/// written in R, G, B, A order (four `f16` per pixel).
///
/// No numeric conversion takes place: each group of four 16-bit words is
/// loaded through `endian::load_endian_u16x4::<BE>` and the words are stored
/// to `out` as raw `u16` bit patterns. The up-to-three leftover pixels are
/// delegated to `scalar_f16::gbrapf16_to_rgba_f16_row::<BE>`.
///
/// # Safety
///
/// The executing CPU must support SSE4.1. `g`, `b`, `r` and `a` must each
/// hold at least `width` samples and `out` at least `width * 4` samples: the
/// vector loop reads and writes through raw pointers without bounds checks,
/// and the lengths are only asserted in debug builds.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn gbrapf16_to_rgba_f16_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    a: &[half::f16],
    out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    // Number of pixels covered by complete four-lane groups.
    let simd_width = width - width % 4;

    // SAFETY: the caller guarantees the CPU feature and slice lengths stated
    // above, so every load at `px < simd_width` and every store below
    // `simd_width * 4` stays inside its slice.
    unsafe {
        let mut px = 0usize;
        while px < simd_width {
            let g_vec = endian::load_endian_u16x4::<BE>(g.as_ptr().add(px).cast::<u8>());
            let b_vec = endian::load_endian_u16x4::<BE>(b.as_ptr().add(px).cast::<u8>());
            let r_vec = endian::load_endian_u16x4::<BE>(r.as_ptr().add(px).cast::<u8>());
            let a_vec = endian::load_endian_u16x4::<BE>(a.as_ptr().add(px).cast::<u8>());

            // Pull the four low 16-bit lanes of every plane out once; the
            // extract intrinsic needs a compile-time lane index.
            let g_words = [
                _mm_extract_epi16::<0>(g_vec) as u16,
                _mm_extract_epi16::<1>(g_vec) as u16,
                _mm_extract_epi16::<2>(g_vec) as u16,
                _mm_extract_epi16::<3>(g_vec) as u16,
            ];
            let b_words = [
                _mm_extract_epi16::<0>(b_vec) as u16,
                _mm_extract_epi16::<1>(b_vec) as u16,
                _mm_extract_epi16::<2>(b_vec) as u16,
                _mm_extract_epi16::<3>(b_vec) as u16,
            ];
            let r_words = [
                _mm_extract_epi16::<0>(r_vec) as u16,
                _mm_extract_epi16::<1>(r_vec) as u16,
                _mm_extract_epi16::<2>(r_vec) as u16,
                _mm_extract_epi16::<3>(r_vec) as u16,
            ];
            let a_words = [
                _mm_extract_epi16::<0>(a_vec) as u16,
                _mm_extract_epi16::<1>(a_vec) as u16,
                _mm_extract_epi16::<2>(a_vec) as u16,
                _mm_extract_epi16::<3>(a_vec) as u16,
            ];

            let group_base = px * 4;
            for lane in 0..4usize {
                // First output sample (R) of pixel `px + lane`.
                let pixel = out.as_mut_ptr().add(group_base + lane * 4);
                pixel.cast::<u16>().write(r_words[lane]);
                pixel.add(1).cast::<u16>().write(g_words[lane]);
                pixel.add(2).cast::<u16>().write(b_words[lane]);
                pixel.add(3).cast::<u16>().write(a_words[lane]);
            }
            px += 4;
        }

        if px < width {
            // Scalar fallback for the last one to three pixels.
            scalar_f16::gbrapf16_to_rgba_f16_row::<BE>(
                &g[px..],
                &b[px..],
                &r[px..],
                &a[px..],
                &mut out[px * 4..],
                width - px,
            );
        }
    }
}