#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::wasm32::*;
use crate::{
ColorMatrix,
row::{
arch::wasm_simd128::endian,
scalar::{planar_gbr_f16 as scalar_f16, planar_gbr_float as scalar},
},
};
/// `true` when this crate is compiled for a big-endian target, `false` otherwise.
///
/// The f16 kernels in this file pass it as the `BE` const argument when they
/// hand their locally widened `f32` scratch arrays to the `f32` kernels.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Limits every `f32` lane of `v` to the range spanned by the `zero` and
/// `one` vectors: a lane-wise max against `zero` followed by a lane-wise min
/// against `one`.
#[inline(always)]
fn clamp01(v: v128, zero: v128, one: v128) -> v128 {
    let floored = f32x4_max(v, zero);
    f32x4_min(floored, one)
}
/// Computes `v * scale + half` per `f32` lane and converts the result to
/// `i32` lanes with the saturating truncation intrinsic.
#[inline(always)]
fn scale_round_i32(v: v128, scale: v128, half: v128) -> v128 {
    let scaled = f32x4_mul(v, scale);
    let biased = f32x4_add(scaled, half);
    i32x4_trunc_sat_f32x4(biased)
}
/// Converts planar G/B/R `f32` rows into packed 8-bit RGB using wasm `simd128`.
///
/// Whole groups of four pixels take the vector path: each plane is loaded
/// with `endian::load_endian_u32x4::<BE>`, clamped to `[0.0, 1.0]`, multiplied
/// by 255, offset by 0.5, truncated to `i32` and narrowed down to bytes.
/// Bytes are written to `out` in R, G, B order. The trailing `width % 4`
/// pixels are handed to `scalar::gbrpf32_to_rgb_row::<BE>`.
///
/// `BE` is forwarded to the loader and the scalar tail; presumably it marks
/// the plane data as big-endian — confirm against `load_endian_u32x4`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Plane lengths are only verified with
/// `debug_assert!`, yet the planes are read through raw pointers, so `g`, `b`
/// and `r` must each contain at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_rgb_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let gain = f32x4_splat(255.0);
    let bias = f32x4_splat(0.5);

    // Pixels covered by complete four-lane groups.
    let vec_end = width - width % 4;
    for px in (0..vec_end).step_by(4) {
        let mut g_lanes = [0u8; 16];
        let mut b_lanes = [0u8; 16];
        let mut r_lanes = [0u8; 16];
        // SAFETY: `px + 4 <= vec_end <= width` and the caller guarantees each
        // plane holds at least `width` samples, so the four-sample loads stay
        // in bounds (assuming the loader reads 16 bytes). Each store writes
        // 16 bytes into a 16-byte local array.
        unsafe {
            let g_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let g_w = i16x8_narrow_i32x4(g_q, g_q);
            v128_store(g_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(g_w, g_w));

            let b_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let b_w = i16x8_narrow_i32x4(b_q, b_q);
            v128_store(b_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(b_w, b_w));

            let r_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let r_w = i16x8_narrow_i32x4(r_q, r_q);
            v128_store(r_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(r_w, r_w));
        }

        // Only the first four lanes of each array carry distinct pixels.
        let base = px * 3;
        for lane in 0..4 {
            let at = base + lane * 3;
            out[at] = r_lanes[lane];
            out[at + 1] = g_lanes[lane];
            out[at + 2] = b_lanes[lane];
        }
    }

    if vec_end < width {
        scalar::gbrpf32_to_rgb_row::<BE>(
            &g[vec_end..],
            &b[vec_end..],
            &r[vec_end..],
            &mut out[vec_end * 3..],
            width - vec_end,
        );
    }
}
/// Converts planar G/B/R `f32` rows into packed 8-bit RGBA using wasm `simd128`.
///
/// Whole groups of four pixels take the vector path: each plane is loaded
/// with `endian::load_endian_u32x4::<BE>`, clamped to `[0.0, 1.0]`, multiplied
/// by 255, offset by 0.5, truncated to `i32` and narrowed down to bytes.
/// Bytes are written to `out` in R, G, B order followed by a constant `0xFF`
/// alpha. The trailing `width % 4` pixels are handed to
/// `scalar::gbrpf32_to_rgba_row::<BE>`.
///
/// `BE` is forwarded to the loader and the scalar tail; presumably it marks
/// the plane data as big-endian — confirm against `load_endian_u32x4`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Plane lengths are only verified with
/// `debug_assert!`, yet the planes are read through raw pointers, so `g`, `b`
/// and `r` must each contain at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_rgba_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let gain = f32x4_splat(255.0);
    let bias = f32x4_splat(0.5);

    // Pixels covered by complete four-lane groups.
    let vec_end = width - width % 4;
    for px in (0..vec_end).step_by(4) {
        let mut g_lanes = [0u8; 16];
        let mut b_lanes = [0u8; 16];
        let mut r_lanes = [0u8; 16];
        // SAFETY: `px + 4 <= vec_end <= width` and the caller guarantees each
        // plane holds at least `width` samples, so the four-sample loads stay
        // in bounds (assuming the loader reads 16 bytes). Each store writes
        // 16 bytes into a 16-byte local array.
        unsafe {
            let g_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let g_w = i16x8_narrow_i32x4(g_q, g_q);
            v128_store(g_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(g_w, g_w));

            let b_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let b_w = i16x8_narrow_i32x4(b_q, b_q);
            v128_store(b_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(b_w, b_w));

            let r_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let r_w = i16x8_narrow_i32x4(r_q, r_q);
            v128_store(r_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(r_w, r_w));
        }

        // Only the first four lanes of each array carry distinct pixels.
        let base = px * 4;
        for lane in 0..4 {
            let at = base + lane * 4;
            out[at] = r_lanes[lane];
            out[at + 1] = g_lanes[lane];
            out[at + 2] = b_lanes[lane];
            out[at + 3] = 0xFF;
        }
    }

    if vec_end < width {
        scalar::gbrpf32_to_rgba_row::<BE>(
            &g[vec_end..],
            &b[vec_end..],
            &r[vec_end..],
            &mut out[vec_end * 4..],
            width - vec_end,
        );
    }
}
/// Converts planar G/B/R `f32` rows into packed 16-bit RGB using wasm `simd128`.
///
/// Whole groups of four pixels take the vector path: each plane is loaded
/// with `endian::load_endian_u32x4::<BE>`, clamped to `[0.0, 1.0]`, multiplied
/// by 65535, offset by 0.5, truncated to `i32` and narrowed to `u16` lanes.
/// Samples are written to `out` in R, G, B order. The trailing `width % 4`
/// pixels are handed to `scalar::gbrpf32_to_rgb_u16_row::<BE>`.
///
/// `BE` is forwarded to the loader and the scalar tail; presumably it marks
/// the plane data as big-endian — confirm against `load_endian_u32x4`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Plane lengths are only verified with
/// `debug_assert!`, yet the planes are read through raw pointers, so `g`, `b`
/// and `r` must each contain at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_rgb_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let gain = f32x4_splat(65535.0);
    let bias = f32x4_splat(0.5);

    // Pixels covered by complete four-lane groups.
    let vec_end = width - width % 4;
    for px in (0..vec_end).step_by(4) {
        let mut g_lanes = [0u16; 8];
        let mut b_lanes = [0u16; 8];
        let mut r_lanes = [0u16; 8];
        // SAFETY: `px + 4 <= vec_end <= width` and the caller guarantees each
        // plane holds at least `width` samples, so the four-sample loads stay
        // in bounds (assuming the loader reads 16 bytes). Each store writes
        // 16 bytes into a 16-byte local array.
        unsafe {
            let g_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(g_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(g_q, g_q));

            let b_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(b_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(b_q, b_q));

            let r_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(r_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(r_q, r_q));
        }

        // Only the first four lanes of each array carry distinct pixels.
        let base = px * 3;
        for lane in 0..4 {
            let at = base + lane * 3;
            out[at] = r_lanes[lane];
            out[at + 1] = g_lanes[lane];
            out[at + 2] = b_lanes[lane];
        }
    }

    if vec_end < width {
        scalar::gbrpf32_to_rgb_u16_row::<BE>(
            &g[vec_end..],
            &b[vec_end..],
            &r[vec_end..],
            &mut out[vec_end * 3..],
            width - vec_end,
        );
    }
}
/// Converts planar G/B/R `f32` rows into packed 16-bit RGBA using wasm `simd128`.
///
/// Whole groups of four pixels take the vector path: each plane is loaded
/// with `endian::load_endian_u32x4::<BE>`, clamped to `[0.0, 1.0]`, multiplied
/// by 65535, offset by 0.5, truncated to `i32` and narrowed to `u16` lanes.
/// Samples are written to `out` in R, G, B order followed by a constant
/// `0xFFFF` alpha. The trailing `width % 4` pixels are handed to
/// `scalar::gbrpf32_to_rgba_u16_row::<BE>`.
///
/// `BE` is forwarded to the loader and the scalar tail; presumably it marks
/// the plane data as big-endian — confirm against `load_endian_u32x4`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Plane lengths are only verified with
/// `debug_assert!`, yet the planes are read through raw pointers, so `g`, `b`
/// and `r` must each contain at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_rgba_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let gain = f32x4_splat(65535.0);
    let bias = f32x4_splat(0.5);

    // Pixels covered by complete four-lane groups.
    let vec_end = width - width % 4;
    for px in (0..vec_end).step_by(4) {
        let mut g_lanes = [0u16; 8];
        let mut b_lanes = [0u16; 8];
        let mut r_lanes = [0u16; 8];
        // SAFETY: `px + 4 <= vec_end <= width` and the caller guarantees each
        // plane holds at least `width` samples, so the four-sample loads stay
        // in bounds (assuming the loader reads 16 bytes). Each store writes
        // 16 bytes into a 16-byte local array.
        unsafe {
            let g_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(g_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(g_q, g_q));

            let b_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(b_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(b_q, b_q));

            let r_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(r_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(r_q, r_q));
        }

        // Only the first four lanes of each array carry distinct pixels.
        let base = px * 4;
        for lane in 0..4 {
            let at = base + lane * 4;
            out[at] = r_lanes[lane];
            out[at + 1] = g_lanes[lane];
            out[at + 2] = b_lanes[lane];
            out[at + 3] = 0xFFFF;
        }
    }

    if vec_end < width {
        scalar::gbrpf32_to_rgba_u16_row::<BE>(
            &g[vec_end..],
            &b[vec_end..],
            &r[vec_end..],
            &mut out[vec_end * 4..],
            width - vec_end,
        );
    }
}
/// wasm `simd128` entry point for the planar G/B/R `f32` -> `f32` RGB row
/// conversion (three `out` elements per pixel, per the length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to `scalar::gbrpf32_to_rgb_f32_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
#[allow(dead_code)] pub(crate) unsafe fn gbrpf32_to_rgb_f32_row<const BE: bool>(
g: &[f32],
b: &[f32],
r: &[f32],
out: &mut [f32],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(out.len() >= width * 3, "out row too short");
// Pure delegation to the scalar implementation.
scalar::gbrpf32_to_rgb_f32_row::<BE>(g, b, r, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R `f32` -> `f32` RGBA row
/// conversion (four `out` elements per pixel, per the length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to `scalar::gbrpf32_to_rgba_f32_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
#[allow(dead_code)] pub(crate) unsafe fn gbrpf32_to_rgba_f32_row<const BE: bool>(
g: &[f32],
b: &[f32],
r: &[f32],
out: &mut [f32],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(out.len() >= width * 4, "out row too short");
// Pure delegation to the scalar implementation.
scalar::gbrpf32_to_rgba_f32_row::<BE>(g, b, r, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R `f32` -> `half::f16` RGB
/// row conversion (three `out` elements per pixel, per the length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to `scalar::gbrpf32_to_rgb_f16_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_rgb_f16_row<const BE: bool>(
g: &[f32],
b: &[f32],
r: &[f32],
out: &mut [half::f16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(out.len() >= width * 3, "out row too short");
// Pure delegation to the scalar implementation.
scalar::gbrpf32_to_rgb_f16_row::<BE>(g, b, r, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R `f32` -> `half::f16` RGBA
/// row conversion (four `out` elements per pixel, per the length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to `scalar::gbrpf32_to_rgba_f16_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_rgba_f16_row<const BE: bool>(
g: &[f32],
b: &[f32],
r: &[f32],
out: &mut [half::f16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(out.len() >= width * 4, "out row too short");
// Pure delegation to the scalar implementation.
scalar::gbrpf32_to_rgba_f16_row::<BE>(g, b, r, out, width);
}
/// Produces an 8-bit luma row from planar G/B/R `f32` rows.
///
/// The row is processed in chunks of up to 16 pixels: each chunk is first
/// quantized to packed 8-bit RGB in a stack buffer via
/// [`gbrpf32_to_rgb_row`], then `crate::row::scalar::rgb_to_luma_row` turns
/// that buffer into luma using `matrix` and `full_range`.
///
/// # Safety
///
/// Compiled with `simd128` enabled and calls [`gbrpf32_to_rgb_row`], whose
/// contract applies: `g`, `b` and `r` must each hold at least `width`
/// samples. Lengths are only verified with `debug_assert!` here.
#[inline]
#[target_feature(enable = "simd128")]
#[allow(clippy::too_many_arguments)]
pub(crate) unsafe fn gbrpf32_to_luma_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width, "out row too short");

    const CHUNK: usize = 16;
    // Packed RGB staging area for one chunk.
    let mut rgb_tmp = [0u8; CHUNK * 3];

    for start in (0..width).step_by(CHUNK) {
        let take = CHUNK.min(width - start);
        let rgb_len = take * 3;
        // SAFETY: `start + take <= width`, so the plane tails passed down
        // hold at least `take` samples under this function's own contract.
        unsafe {
            gbrpf32_to_rgb_row::<BE>(
                &g[start..],
                &b[start..],
                &r[start..],
                &mut rgb_tmp[..rgb_len],
                take,
            );
        }
        crate::row::scalar::rgb_to_luma_row(
            &rgb_tmp[..rgb_len],
            &mut out[start..start + take],
            take,
            matrix,
            full_range,
        );
    }
}
/// Produces a `u16` luma row from planar G/B/R `f32` rows.
///
/// The row is processed in chunks of up to 16 pixels: each chunk is first
/// quantized to packed 8-bit RGB in a stack buffer via
/// [`gbrpf32_to_rgb_row`], then `crate::row::scalar::rgb_to_luma_u16_row`
/// turns that buffer into luma using `matrix` and `full_range`.
///
/// # Safety
///
/// Compiled with `simd128` enabled and calls [`gbrpf32_to_rgb_row`], whose
/// contract applies: `g`, `b` and `r` must each hold at least `width`
/// samples. Lengths are only verified with `debug_assert!` here.
#[inline]
#[target_feature(enable = "simd128")]
#[allow(clippy::too_many_arguments)]
pub(crate) unsafe fn gbrpf32_to_luma_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    out: &mut [u16],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width, "out row too short");

    const CHUNK: usize = 16;
    // Packed RGB staging area for one chunk.
    let mut rgb_tmp = [0u8; CHUNK * 3];

    for start in (0..width).step_by(CHUNK) {
        let take = CHUNK.min(width - start);
        let rgb_len = take * 3;
        // SAFETY: `start + take <= width`, so the plane tails passed down
        // hold at least `take` samples under this function's own contract.
        unsafe {
            gbrpf32_to_rgb_row::<BE>(
                &g[start..],
                &b[start..],
                &r[start..],
                &mut rgb_tmp[..rgb_len],
                take,
            );
        }
        crate::row::scalar::rgb_to_luma_u16_row(
            &rgb_tmp[..rgb_len],
            &mut out[start..start + take],
            take,
            matrix,
            full_range,
        );
    }
}
/// Produces 8-bit H, S and V rows from planar G/B/R `f32` rows.
///
/// The row is processed in chunks of up to 16 pixels: each chunk is first
/// quantized to packed 8-bit RGB in a stack buffer via
/// [`gbrpf32_to_rgb_row`], then `crate::row::scalar::rgb_to_hsv_row` fills
/// the matching ranges of `h_out`, `s_out` and `v_out`.
///
/// # Safety
///
/// Compiled with `simd128` enabled and calls [`gbrpf32_to_rgb_row`], whose
/// contract applies: `g`, `b` and `r` must each hold at least `width`
/// samples. Lengths are only verified with `debug_assert!` here.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf32_to_hsv_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(h_out.len() >= width, "h_out row too short");
    debug_assert!(s_out.len() >= width, "s_out row too short");
    debug_assert!(v_out.len() >= width, "v_out row too short");

    const CHUNK: usize = 16;
    // Packed RGB staging area for one chunk.
    let mut rgb_tmp = [0u8; CHUNK * 3];

    for start in (0..width).step_by(CHUNK) {
        let take = CHUNK.min(width - start);
        let rgb_len = take * 3;
        // SAFETY: `start + take <= width`, so the plane tails passed down
        // hold at least `take` samples under this function's own contract.
        unsafe {
            gbrpf32_to_rgb_row::<BE>(
                &g[start..],
                &b[start..],
                &r[start..],
                &mut rgb_tmp[..rgb_len],
                take,
            );
        }
        crate::row::scalar::rgb_to_hsv_row(
            &rgb_tmp[..rgb_len],
            &mut h_out[start..start + take],
            &mut s_out[start..start + take],
            &mut v_out[start..start + take],
            take,
        );
    }
}
/// Converts planar G/B/R/A `f32` rows into packed 8-bit RGBA using wasm `simd128`.
///
/// Whole groups of four pixels take the vector path: each of the four planes
/// is loaded with `endian::load_endian_u32x4::<BE>`, clamped to `[0.0, 1.0]`,
/// multiplied by 255, offset by 0.5, truncated to `i32` and narrowed down to
/// bytes. Bytes are written to `out` in R, G, B, A order. The trailing
/// `width % 4` pixels are handed to `scalar::gbrapf32_to_rgba_row::<BE>`.
///
/// `BE` is forwarded to the loader and the scalar tail; presumably it marks
/// the plane data as big-endian — confirm against `load_endian_u32x4`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Plane lengths are only verified with
/// `debug_assert!`, yet the planes are read through raw pointers, so `g`,
/// `b`, `r` and `a` must each contain at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrapf32_to_rgba_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let gain = f32x4_splat(255.0);
    let bias = f32x4_splat(0.5);

    // Pixels covered by complete four-lane groups.
    let vec_end = width - width % 4;
    for px in (0..vec_end).step_by(4) {
        let mut g_lanes = [0u8; 16];
        let mut b_lanes = [0u8; 16];
        let mut r_lanes = [0u8; 16];
        let mut a_lanes = [0u8; 16];
        // SAFETY: `px + 4 <= vec_end <= width` and the caller guarantees each
        // plane holds at least `width` samples, so the four-sample loads stay
        // in bounds (assuming the loader reads 16 bytes). Each store writes
        // 16 bytes into a 16-byte local array.
        unsafe {
            let g_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let g_w = i16x8_narrow_i32x4(g_q, g_q);
            v128_store(g_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(g_w, g_w));

            let b_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let b_w = i16x8_narrow_i32x4(b_q, b_q);
            v128_store(b_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(b_w, b_w));

            let r_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let r_w = i16x8_narrow_i32x4(r_q, r_q);
            v128_store(r_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(r_w, r_w));

            let a_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(a.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            let a_w = i16x8_narrow_i32x4(a_q, a_q);
            v128_store(a_lanes.as_mut_ptr().cast(), u8x16_narrow_i16x8(a_w, a_w));
        }

        // Only the first four lanes of each array carry distinct pixels.
        let base = px * 4;
        for lane in 0..4 {
            let at = base + lane * 4;
            out[at] = r_lanes[lane];
            out[at + 1] = g_lanes[lane];
            out[at + 2] = b_lanes[lane];
            out[at + 3] = a_lanes[lane];
        }
    }

    if vec_end < width {
        scalar::gbrapf32_to_rgba_row::<BE>(
            &g[vec_end..],
            &b[vec_end..],
            &r[vec_end..],
            &a[vec_end..],
            &mut out[vec_end * 4..],
            width - vec_end,
        );
    }
}
/// Converts planar G/B/R/A `f32` rows into packed 16-bit RGBA using wasm `simd128`.
///
/// Whole groups of four pixels take the vector path: each of the four planes
/// is loaded with `endian::load_endian_u32x4::<BE>`, clamped to `[0.0, 1.0]`,
/// multiplied by 65535, offset by 0.5, truncated to `i32` and narrowed to
/// `u16` lanes. Samples are written to `out` in R, G, B, A order. The
/// trailing `width % 4` pixels are handed to
/// `scalar::gbrapf32_to_rgba_u16_row::<BE>`.
///
/// `BE` is forwarded to the loader and the scalar tail; presumably it marks
/// the plane data as big-endian — confirm against `load_endian_u32x4`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Plane lengths are only verified with
/// `debug_assert!`, yet the planes are read through raw pointers, so `g`,
/// `b`, `r` and `a` must each contain at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrapf32_to_rgba_u16_row<const BE: bool>(
    g: &[f32],
    b: &[f32],
    r: &[f32],
    a: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let gain = f32x4_splat(65535.0);
    let bias = f32x4_splat(0.5);

    // Pixels covered by complete four-lane groups.
    let vec_end = width - width % 4;
    for px in (0..vec_end).step_by(4) {
        let mut g_lanes = [0u16; 8];
        let mut b_lanes = [0u16; 8];
        let mut r_lanes = [0u16; 8];
        let mut a_lanes = [0u16; 8];
        // SAFETY: `px + 4 <= vec_end <= width` and the caller guarantees each
        // plane holds at least `width` samples, so the four-sample loads stay
        // in bounds (assuming the loader reads 16 bytes). Each store writes
        // 16 bytes into a 16-byte local array.
        unsafe {
            let g_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(g.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(g_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(g_q, g_q));

            let b_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(b.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(b_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(b_q, b_q));

            let r_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(r.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(r_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(r_q, r_q));

            let a_q = scale_round_i32(
                clamp01(
                    endian::load_endian_u32x4::<BE>(a.as_ptr().add(px).cast::<u8>()),
                    lo,
                    hi,
                ),
                gain,
                bias,
            );
            v128_store(a_lanes.as_mut_ptr().cast(), u16x8_narrow_i32x4(a_q, a_q));
        }

        // Only the first four lanes of each array carry distinct pixels.
        let base = px * 4;
        for lane in 0..4 {
            let at = base + lane * 4;
            out[at] = r_lanes[lane];
            out[at + 1] = g_lanes[lane];
            out[at + 2] = b_lanes[lane];
            out[at + 3] = a_lanes[lane];
        }
    }

    if vec_end < width {
        scalar::gbrapf32_to_rgba_u16_row::<BE>(
            &g[vec_end..],
            &b[vec_end..],
            &r[vec_end..],
            &a[vec_end..],
            &mut out[vec_end * 4..],
            width - vec_end,
        );
    }
}
/// wasm `simd128` entry point for the planar G/B/R/A `f32` -> `f32` RGBA row
/// conversion (four `out` elements per pixel, per the length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to `scalar::gbrapf32_to_rgba_f32_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
#[allow(dead_code)] pub(crate) unsafe fn gbrapf32_to_rgba_f32_row<const BE: bool>(
g: &[f32],
b: &[f32],
r: &[f32],
a: &[f32],
out: &mut [f32],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(a.len() >= width, "a row too short");
debug_assert!(out.len() >= width * 4, "out row too short");
// Pure delegation to the scalar implementation.
scalar::gbrapf32_to_rgba_f32_row::<BE>(g, b, r, a, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R/A `f32` -> `half::f16`
/// RGBA row conversion (four `out` elements per pixel, per the length
/// assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to `scalar::gbrapf32_to_rgba_f16_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrapf32_to_rgba_f16_row<const BE: bool>(
g: &[f32],
b: &[f32],
r: &[f32],
a: &[f32],
out: &mut [half::f16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(a.len() >= width, "a row too short");
debug_assert!(out.len() >= width * 4, "out row too short");
// Pure delegation to the scalar implementation.
scalar::gbrapf32_to_rgba_f16_row::<BE>(g, b, r, a, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R `half::f16` ->
/// `half::f16` RGB row conversion (three `out` elements per pixel, per the
/// length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to
/// `scalar_f16::gbrpf16_to_rgb_f16_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf16_to_rgb_f16_row<const BE: bool>(
g: &[half::f16],
b: &[half::f16],
r: &[half::f16],
out: &mut [half::f16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(out.len() >= width * 3, "out row too short");
// Pure delegation to the scalar f16 implementation.
scalar_f16::gbrpf16_to_rgb_f16_row::<BE>(g, b, r, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R `half::f16` ->
/// `half::f16` RGBA row conversion (four `out` elements per pixel, per the
/// length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to
/// `scalar_f16::gbrpf16_to_rgba_f16_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf16_to_rgba_f16_row<const BE: bool>(
g: &[half::f16],
b: &[half::f16],
r: &[half::f16],
out: &mut [half::f16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(out.len() >= width * 4, "out row too short");
// Pure delegation to the scalar f16 implementation.
scalar_f16::gbrpf16_to_rgba_f16_row::<BE>(g, b, r, out, width);
}
/// wasm `simd128` entry point for the planar G/B/R/A `half::f16` ->
/// `half::f16` RGBA row conversion (four `out` elements per pixel, per the
/// length assertion).
///
/// There is no vector code here: after the debug-only length checks, all
/// arguments are forwarded unchanged to
/// `scalar_f16::gbrapf16_to_rgba_f16_row::<BE>`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Slice lengths are checked only with
/// `debug_assert!` in this function.
/// NOTE(review): the body performs no unsafe operation itself; the `unsafe`
/// signature presumably mirrors the sibling SIMD kernels — confirm.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrapf16_to_rgba_f16_row<const BE: bool>(
g: &[half::f16],
b: &[half::f16],
r: &[half::f16],
a: &[half::f16],
out: &mut [half::f16],
width: usize,
) {
debug_assert!(g.len() >= width, "g row too short");
debug_assert!(b.len() >= width, "b row too short");
debug_assert!(r.len() >= width, "r row too short");
debug_assert!(a.len() >= width, "a row too short");
debug_assert!(out.len() >= width * 4, "out row too short");
// Pure delegation to the scalar f16 implementation.
scalar_f16::gbrapf16_to_rgba_f16_row::<BE>(g, b, r, a, out, width);
}
/// Converts planar G/B/R `half::f16` rows into packed 8-bit RGB.
///
/// Pixels are handled four at a time: each plane's samples are expanded into
/// a small `f32` stack array by `scalar_f16::widen_f16_be_to_host_f32::<BE>`
/// (presumably decoding `BE`-encoded f16 into host-order `f32` — confirm
/// against that helper), and the arrays are then fed to
/// [`gbrpf32_to_rgb_row`] with `HOST_NATIVE_BE`. A final partial group of
/// fewer than four pixels is widened the same way and finished by
/// `scalar::gbrpf32_to_rgb_row`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Input lengths are only verified with
/// `debug_assert!`; callers must pass planes of at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf16_to_rgb_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");

    const CHUNK: usize = 4;
    let mut g32 = [0.0f32; CHUNK];
    let mut b32 = [0.0f32; CHUNK];
    let mut r32 = [0.0f32; CHUNK];

    // Pixels covered by complete chunks.
    let full = width - width % CHUNK;
    for start in (0..full).step_by(CHUNK) {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, start, &mut g32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, start, &mut b32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, start, &mut r32, CHUNK);
        // SAFETY: the three scratch arrays hold exactly `CHUNK` samples,
        // which is the width passed to the f32 kernel.
        unsafe {
            gbrpf32_to_rgb_row::<HOST_NATIVE_BE>(
                &g32,
                &b32,
                &r32,
                &mut out[start * 3..(start + CHUNK) * 3],
                CHUNK,
            );
        }
    }

    let rest = width - full;
    if rest > 0 {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, full, &mut g32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, full, &mut b32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, full, &mut r32, rest);
        scalar::gbrpf32_to_rgb_row::<HOST_NATIVE_BE>(
            &g32[..rest],
            &b32[..rest],
            &r32[..rest],
            &mut out[full * 3..width * 3],
            rest,
        );
    }
}
/// Converts planar G/B/R `half::f16` rows into packed 8-bit RGBA.
///
/// Pixels are handled four at a time: each plane's samples are expanded into
/// a small `f32` stack array by `scalar_f16::widen_f16_be_to_host_f32::<BE>`
/// (presumably decoding `BE`-encoded f16 into host-order `f32` — confirm
/// against that helper), and the arrays are then fed to
/// [`gbrpf32_to_rgba_row`] with `HOST_NATIVE_BE`. A final partial group of
/// fewer than four pixels is widened the same way and finished by
/// `scalar::gbrpf32_to_rgba_row`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Input lengths are only verified with
/// `debug_assert!`; callers must pass planes of at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf16_to_rgba_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    const CHUNK: usize = 4;
    let mut g32 = [0.0f32; CHUNK];
    let mut b32 = [0.0f32; CHUNK];
    let mut r32 = [0.0f32; CHUNK];

    // Pixels covered by complete chunks.
    let full = width - width % CHUNK;
    for start in (0..full).step_by(CHUNK) {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, start, &mut g32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, start, &mut b32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, start, &mut r32, CHUNK);
        // SAFETY: the three scratch arrays hold exactly `CHUNK` samples,
        // which is the width passed to the f32 kernel.
        unsafe {
            gbrpf32_to_rgba_row::<HOST_NATIVE_BE>(
                &g32,
                &b32,
                &r32,
                &mut out[start * 4..(start + CHUNK) * 4],
                CHUNK,
            );
        }
    }

    let rest = width - full;
    if rest > 0 {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, full, &mut g32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, full, &mut b32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, full, &mut r32, rest);
        scalar::gbrpf32_to_rgba_row::<HOST_NATIVE_BE>(
            &g32[..rest],
            &b32[..rest],
            &r32[..rest],
            &mut out[full * 4..width * 4],
            rest,
        );
    }
}
/// Converts planar G/B/R `half::f16` rows into packed 16-bit RGB.
///
/// Pixels are handled four at a time: each plane's samples are expanded into
/// a small `f32` stack array by `scalar_f16::widen_f16_be_to_host_f32::<BE>`
/// (presumably decoding `BE`-encoded f16 into host-order `f32` — confirm
/// against that helper), and the arrays are then fed to
/// [`gbrpf32_to_rgb_u16_row`] with `HOST_NATIVE_BE`. A final partial group of
/// fewer than four pixels is widened the same way and finished by
/// `scalar::gbrpf32_to_rgb_u16_row`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Input lengths are only verified with
/// `debug_assert!`; callers must pass planes of at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf16_to_rgb_u16_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 3, "out row too short");

    const CHUNK: usize = 4;
    let mut g32 = [0.0f32; CHUNK];
    let mut b32 = [0.0f32; CHUNK];
    let mut r32 = [0.0f32; CHUNK];

    // Pixels covered by complete chunks.
    let full = width - width % CHUNK;
    for start in (0..full).step_by(CHUNK) {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, start, &mut g32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, start, &mut b32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, start, &mut r32, CHUNK);
        // SAFETY: the three scratch arrays hold exactly `CHUNK` samples,
        // which is the width passed to the f32 kernel.
        unsafe {
            gbrpf32_to_rgb_u16_row::<HOST_NATIVE_BE>(
                &g32,
                &b32,
                &r32,
                &mut out[start * 3..(start + CHUNK) * 3],
                CHUNK,
            );
        }
    }

    let rest = width - full;
    if rest > 0 {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, full, &mut g32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, full, &mut b32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, full, &mut r32, rest);
        scalar::gbrpf32_to_rgb_u16_row::<HOST_NATIVE_BE>(
            &g32[..rest],
            &b32[..rest],
            &r32[..rest],
            &mut out[full * 3..width * 3],
            rest,
        );
    }
}
/// Converts planar G/B/R `half::f16` rows into packed 16-bit RGBA.
///
/// Pixels are handled four at a time: each plane's samples are expanded into
/// a small `f32` stack array by `scalar_f16::widen_f16_be_to_host_f32::<BE>`
/// (presumably decoding `BE`-encoded f16 into host-order `f32` — confirm
/// against that helper), and the arrays are then fed to
/// [`gbrpf32_to_rgba_u16_row`] with `HOST_NATIVE_BE`. A final partial group
/// of fewer than four pixels is widened the same way and finished by
/// `scalar::gbrpf32_to_rgba_u16_row`.
///
/// # Safety
///
/// Compiled with `simd128` enabled. Input lengths are only verified with
/// `debug_assert!`; callers must pass planes of at least `width` samples.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gbrpf16_to_rgba_u16_row<const BE: bool>(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(out.len() >= width * 4, "out row too short");

    const CHUNK: usize = 4;
    let mut g32 = [0.0f32; CHUNK];
    let mut b32 = [0.0f32; CHUNK];
    let mut r32 = [0.0f32; CHUNK];

    // Pixels covered by complete chunks.
    let full = width - width % CHUNK;
    for start in (0..full).step_by(CHUNK) {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, start, &mut g32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, start, &mut b32, CHUNK);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, start, &mut r32, CHUNK);
        // SAFETY: the three scratch arrays hold exactly `CHUNK` samples,
        // which is the width passed to the f32 kernel.
        unsafe {
            gbrpf32_to_rgba_u16_row::<HOST_NATIVE_BE>(
                &g32,
                &b32,
                &r32,
                &mut out[start * 4..(start + CHUNK) * 4],
                CHUNK,
            );
        }
    }

    let rest = width - full;
    if rest > 0 {
        scalar_f16::widen_f16_be_to_host_f32::<BE>(g, full, &mut g32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(b, full, &mut b32, rest);
        scalar_f16::widen_f16_be_to_host_f32::<BE>(r, full, &mut r32, rest);
        scalar::gbrpf32_to_rgba_u16_row::<HOST_NATIVE_BE>(
            &g32[..rest],
            &b32[..rest],
            &r32[..rest],
            &mut out[full * 4..width * 4],
            rest,
        );
    }
}