use crate::SizeError;
#[cfg(target_arch = "x86_64")]
use archmage::X64V3Token;
use archmage::incant;
use archmage::prelude::*;
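/// Deinterleaves packed 8-bit RGB (`[r, g, b, r, g, b, ...]`) into three
/// `f32` planes, dispatching at runtime to AVX2, NEON, or scalar code.
///
/// Each plane must hold at least `src.len() / 3` elements; any extra
/// capacity is left untouched.
///
/// # Errors
///
/// Returns [`SizeError::NotPixelAligned`] if `src` is empty or not a whole
/// number of RGB triples, and [`SizeError::PixelCountMismatch`] if any plane
/// is shorter than the pixel count.
///
/// A minimal sketch of the call pattern (marked `ignore` because the crate
/// path is not assumed here):
///
/// ```ignore
/// let src = [10u8, 20, 30, 40, 50, 60]; // two RGB pixels
/// let (mut r, mut g, mut b) = ([0.0f32; 2], [0.0f32; 2], [0.0f32; 2]);
/// rgb24_to_planes_f32(&src, &mut r, &mut g, &mut b).unwrap();
/// assert_eq!(r, [10.0, 40.0]);
/// assert_eq!(g, [20.0, 50.0]);
/// assert_eq!(b, [30.0, 60.0]);
/// ```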
pub fn rgb24_to_planes_f32(
src: &[u8],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) -> Result<(), SizeError> {
if src.is_empty() || !src.len().is_multiple_of(3) {
return Err(SizeError::NotPixelAligned);
}
let pixels = src.len() / 3;
if r.len() < pixels || g.len() < pixels || b.len() < pixels {
return Err(SizeError::PixelCountMismatch);
}
let r = &mut r[..pixels];
let g = &mut g[..pixels];
let b = &mut b[..pixels];
incant!(rgb24_to_planes_impl(src, r, g, b), [v3, neon, scalar]);
Ok(())
}
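/// Deinterleaves packed 16-bit RGB into three `f32` planes; the 16-bit
/// analogue of [`rgb24_to_planes_f32`], with the same size requirements and
/// [`SizeError`] behaviour.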
pub fn rgb48_to_planes_f32(
src: &[u16],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) -> Result<(), SizeError> {
if src.is_empty() || !src.len().is_multiple_of(3) {
return Err(SizeError::NotPixelAligned);
}
let pixels = src.len() / 3;
if r.len() < pixels || g.len() < pixels || b.len() < pixels {
return Err(SizeError::PixelCountMismatch);
}
let r = &mut r[..pixels];
let g = &mut g[..pixels];
let b = &mut b[..pixels];
incant!(rgb48_to_planes_impl(src, r, g, b), [v3, neon, scalar]);
Ok(())
}
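// The `#[doc(hidden)]` helpers below bypass the runtime dispatch in the
// public API so benchmarks and tests can pin a specific code path. They skip
// the size validation and therefore panic if a plane is shorter than
// `src.len() / 3`.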
#[doc(hidden)]
pub fn scalar_only_rgb24(src: &[u8], r: &mut [f32], g: &mut [f32], b: &mut [f32]) {
let pixels = src.len() / 3;
rgb24_to_planes_loop_scalar(src, &mut r[..pixels], &mut g[..pixels], &mut b[..pixels]);
}
#[doc(hidden)]
pub fn scalar_only_rgb48(src: &[u16], r: &mut [f32], g: &mut [f32], b: &mut [f32]) {
let pixels = src.len() / 3;
rgb48_to_planes_loop_scalar(src, &mut r[..pixels], &mut g[..pixels], &mut b[..pixels]);
}
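// Forces the autovectorized x86-64-v3 path (the shared scalar loop compiled
// with AVX2 target features) when available, falling back to plain scalar
// otherwise. Returns `true` if the v3 path actually ran, so callers can tell
// which variant they measured.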
#[doc(hidden)]
pub fn autovec_avx2_rgb24(src: &[u8], r: &mut [f32], g: &mut [f32], b: &mut [f32]) -> bool {
let pixels = src.len() / 3;
let r = &mut r[..pixels];
let g = &mut g[..pixels];
let b = &mut b[..pixels];
#[cfg(target_arch = "x86_64")]
{
if let Some(t) = X64V3Token::summon() {
x86::rgb24_to_planes_impl_v3_autovec(t, src, r, g, b);
return true;
}
}
rgb24_to_planes_loop_scalar(src, r, g, b);
false
}
#[doc(hidden)]
pub fn autovec_avx2_rgb48(src: &[u16], r: &mut [f32], g: &mut [f32], b: &mut [f32]) -> bool {
let pixels = src.len() / 3;
let r = &mut r[..pixels];
let g = &mut g[..pixels];
let b = &mut b[..pixels];
#[cfg(target_arch = "x86_64")]
{
if let Some(t) = X64V3Token::summon() {
x86::rgb48_to_planes_impl_v3_autovec(t, src, r, g, b);
return true;
}
}
rgb48_to_planes_loop_scalar(src, r, g, b);
false
}
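// Scalar reference kernels: deinterleave one 8-pixel chunk into per-channel
// arrays. These mirror the SIMD chunk kernels below so all paths share the
// same chunking structure.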
#[inline(always)]
fn rgb24_chunk8_scalar(c: &[u8; 24]) -> ([f32; 8], [f32; 8], [f32; 8]) {
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
for i in 0..8 {
r[i] = c[i * 3] as f32;
g[i] = c[i * 3 + 1] as f32;
b[i] = c[i * 3 + 2] as f32;
}
(r, g, b)
}
#[inline(always)]
fn rgb48_chunk8_scalar(c: &[u16; 24]) -> ([f32; 8], [f32; 8], [f32; 8]) {
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
for i in 0..8 {
r[i] = c[i * 3] as f32;
g[i] = c[i * 3 + 1] as f32;
b[i] = c[i * 3 + 2] as f32;
}
(r, g, b)
}
#[inline(always)]
fn rgb24_to_planes_loop_scalar(src: &[u8], r: &mut [f32], g: &mut [f32], b: &mut [f32]) {
let pixels = src.len() / 3;
let n_chunks = pixels / 8;
for ci in 0..n_chunks {
let bs = ci * 24;
let ps = ci * 8;
let c: &[u8; 24] = src[bs..bs + 24].try_into().unwrap();
let (rv, gv, bv) = rgb24_chunk8_scalar(c);
r[ps..ps + 8].copy_from_slice(&rv);
g[ps..ps + 8].copy_from_slice(&gv);
b[ps..ps + 8].copy_from_slice(&bv);
}
for p in (n_chunks * 8)..pixels {
r[p] = src[p * 3] as f32;
g[p] = src[p * 3 + 1] as f32;
b[p] = src[p * 3 + 2] as f32;
}
}
#[inline(always)]
fn rgb48_to_planes_loop_scalar(src: &[u16], r: &mut [f32], g: &mut [f32], b: &mut [f32]) {
let pixels = src.len() / 3;
let n_chunks = pixels / 8;
for ci in 0..n_chunks {
let bs = ci * 24;
let ps = ci * 8;
let c: &[u16; 24] = src[bs..bs + 24].try_into().unwrap();
let (rv, gv, bv) = rgb48_chunk8_scalar(c);
r[ps..ps + 8].copy_from_slice(&rv);
g[ps..ps + 8].copy_from_slice(&gv);
b[ps..ps + 8].copy_from_slice(&bv);
}
for p in (n_chunks * 8)..pixels {
r[p] = src[p * 3] as f32;
g[p] = src[p * 3 + 1] as f32;
b[p] = src[p * 3 + 2] as f32;
}
}
pub(crate) fn rgb24_to_planes_impl_scalar(
_t: ScalarToken,
src: &[u8],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
rgb24_to_planes_loop_scalar(src, r, g, b);
}
pub(crate) fn rgb48_to_planes_impl_scalar(
_t: ScalarToken,
src: &[u16],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
rgb48_to_planes_loop_scalar(src, r, g, b);
}
#[cfg(target_arch = "x86_64")]
mod x86 {
use super::*;
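    // Byte-index tables for `_mm_shuffle_epi8` (pshufb): each entry selects
    // a source byte by index, and -128 (high bit set) zeroes that output
    // lane. Each channel is extracted with one shuffle per source vector and
    // the partial results are OR-ed together. The `*_LO`/`*_HI` tables cover
    // the two overlapping 16-byte loads of a 24-byte RGB24 chunk; the
    // `*16_V*` tables cover the three 16-byte vectors of a 48-byte RGB48
    // chunk.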
const R_LO: [i8; 16] = [
0, 3, 6, 9, 12, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
];
const R_HI: [i8; 16] = [
-128, -128, -128, -128, -128, 7, 10, 13, -128, -128, -128, -128, -128, -128, -128, -128,
];
const G_LO: [i8; 16] = [
1, 4, 7, 10, 13, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
];
const G_HI: [i8; 16] = [
-128, -128, -128, -128, -128, 8, 11, 14, -128, -128, -128, -128, -128, -128, -128, -128,
];
const B_LO: [i8; 16] = [
2, 5, 8, 11, 14, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
];
const B_HI: [i8; 16] = [
-128, -128, -128, -128, -128, 9, 12, 15, -128, -128, -128, -128, -128, -128, -128, -128,
];
const R16_V1: [i8; 16] = [
0, 1, 6, 7, 12, 13, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
];
const R16_V2: [i8; 16] = [
-128, -128, -128, -128, -128, -128, 2, 3, 8, 9, 14, 15, -128, -128, -128, -128,
];
const R16_V3: [i8; 16] = [
-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 4, 5, 10, 11,
];
const G16_V1: [i8; 16] = [
2, 3, 8, 9, 14, 15, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
];
const G16_V2: [i8; 16] = [
-128, -128, -128, -128, -128, -128, 4, 5, 10, 11, -128, -128, -128, -128, -128, -128,
];
const G16_V3: [i8; 16] = [
-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 0, 1, 6, 7, 12, 13,
];
const B16_V1: [i8; 16] = [
4, 5, 10, 11, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
];
const B16_V2: [i8; 16] = [
-128, -128, -128, -128, 0, 1, 6, 7, 12, 13, -128, -128, -128, -128, -128, -128,
];
const B16_V3: [i8; 16] = [
-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 2, 3, 8, 9, 14, 15,
];
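    // Deinterleaves one 8-pixel RGB24 chunk: two overlapping 16-byte loads
    // cover the 24 source bytes, pshufb gathers each channel's 8 bytes, and
    // the result is widened u8 -> i32 -> f32.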
#[rite]
pub fn rgb24_chunk8_v3(_t: X64V3Token, c: &[u8; 24]) -> ([f32; 8], [f32; 8], [f32; 8]) {
let lo16: &[u8; 16] = c[0..16].try_into().unwrap();
let hi16: &[u8; 16] = c[8..24].try_into().unwrap();
let lo = _mm_loadu_si128(lo16);
let hi = _mm_loadu_si128(hi16);
let r_lo = _mm_shuffle_epi8(lo, _mm_loadu_si128(&R_LO));
let r_hi = _mm_shuffle_epi8(hi, _mm_loadu_si128(&R_HI));
let r_u8 = _mm_or_si128(r_lo, r_hi);
let g_lo = _mm_shuffle_epi8(lo, _mm_loadu_si128(&G_LO));
let g_hi = _mm_shuffle_epi8(hi, _mm_loadu_si128(&G_HI));
let g_u8 = _mm_or_si128(g_lo, g_hi);
let b_lo = _mm_shuffle_epi8(lo, _mm_loadu_si128(&B_LO));
let b_hi = _mm_shuffle_epi8(hi, _mm_loadu_si128(&B_HI));
let b_u8 = _mm_or_si128(b_lo, b_hi);
let r_i32 = _mm256_cvtepu8_epi32(r_u8);
let g_i32 = _mm256_cvtepu8_epi32(g_u8);
let b_i32 = _mm256_cvtepu8_epi32(b_u8);
let r_f32 = _mm256_cvtepi32_ps(r_i32);
let g_f32 = _mm256_cvtepi32_ps(g_i32);
let b_f32 = _mm256_cvtepi32_ps(b_i32);
let mut r_out = [0.0f32; 8];
let mut g_out = [0.0f32; 8];
let mut b_out = [0.0f32; 8];
_mm256_storeu_ps(&mut r_out, r_f32);
_mm256_storeu_ps(&mut g_out, g_f32);
_mm256_storeu_ps(&mut b_out, b_f32);
(r_out, g_out, b_out)
}
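    // Same idea for RGB48: three 16-byte loads cover the 48 source bytes,
    // and each channel is assembled from three shuffles before widening
    // u16 -> i32 -> f32.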
#[rite]
pub fn rgb48_chunk8_v3(_t: X64V3Token, c: &[u16; 24]) -> ([f32; 8], [f32; 8], [f32; 8]) {
let bytes: &[u8; 48] = bytemuck::cast_ref(c);
let v1b: &[u8; 16] = bytes[0..16].try_into().unwrap();
let v2b: &[u8; 16] = bytes[16..32].try_into().unwrap();
let v3b: &[u8; 16] = bytes[32..48].try_into().unwrap();
let v1 = _mm_loadu_si128(v1b);
let v2 = _mm_loadu_si128(v2b);
let v3 = _mm_loadu_si128(v3b);
let r_v1 = _mm_shuffle_epi8(v1, _mm_loadu_si128(&R16_V1));
let r_v2 = _mm_shuffle_epi8(v2, _mm_loadu_si128(&R16_V2));
let r_v3 = _mm_shuffle_epi8(v3, _mm_loadu_si128(&R16_V3));
let r_u16 = _mm_or_si128(_mm_or_si128(r_v1, r_v2), r_v3);
let g_v1 = _mm_shuffle_epi8(v1, _mm_loadu_si128(&G16_V1));
let g_v2 = _mm_shuffle_epi8(v2, _mm_loadu_si128(&G16_V2));
let g_v3 = _mm_shuffle_epi8(v3, _mm_loadu_si128(&G16_V3));
let g_u16 = _mm_or_si128(_mm_or_si128(g_v1, g_v2), g_v3);
let b_v1 = _mm_shuffle_epi8(v1, _mm_loadu_si128(&B16_V1));
let b_v2 = _mm_shuffle_epi8(v2, _mm_loadu_si128(&B16_V2));
let b_v3 = _mm_shuffle_epi8(v3, _mm_loadu_si128(&B16_V3));
let b_u16 = _mm_or_si128(_mm_or_si128(b_v1, b_v2), b_v3);
let r_i32 = _mm256_cvtepu16_epi32(r_u16);
let g_i32 = _mm256_cvtepu16_epi32(g_u16);
let b_i32 = _mm256_cvtepu16_epi32(b_u16);
let r_f32 = _mm256_cvtepi32_ps(r_i32);
let g_f32 = _mm256_cvtepi32_ps(g_i32);
let b_f32 = _mm256_cvtepi32_ps(b_i32);
let mut r_out = [0.0f32; 8];
let mut g_out = [0.0f32; 8];
let mut b_out = [0.0f32; 8];
_mm256_storeu_ps(&mut r_out, r_f32);
_mm256_storeu_ps(&mut g_out, g_f32);
_mm256_storeu_ps(&mut b_out, b_f32);
(r_out, g_out, b_out)
}
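    // These bodies are the shared scalar loop, recompiled under `#[arcane]`
    // with the token's x86-64-v3 target features so LLVM is free to
    // autovectorize it. They exist to compare the compiler's output against
    // the hand-written shuffle kernels above.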
#[arcane]
pub(crate) fn rgb24_to_planes_impl_v3_autovec(
_t: X64V3Token,
src: &[u8],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
super::rgb24_to_planes_loop_scalar(src, r, g, b);
}
#[arcane]
pub(crate) fn rgb48_to_planes_impl_v3_autovec(
_t: X64V3Token,
src: &[u16],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
super::rgb48_to_planes_loop_scalar(src, r, g, b);
}
#[arcane]
pub(crate) fn rgb24_to_planes_impl_v3(
t: X64V3Token,
src: &[u8],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
let pixels = src.len() / 3;
let n_chunks = pixels / 8;
for ci in 0..n_chunks {
let bs = ci * 24;
let ps = ci * 8;
let c: &[u8; 24] = src[bs..bs + 24].try_into().unwrap();
let (rv, gv, bv) = rgb24_chunk8_v3(t, c);
r[ps..ps + 8].copy_from_slice(&rv);
g[ps..ps + 8].copy_from_slice(&gv);
b[ps..ps + 8].copy_from_slice(&bv);
}
for p in (n_chunks * 8)..pixels {
r[p] = src[p * 3] as f32;
g[p] = src[p * 3 + 1] as f32;
b[p] = src[p * 3 + 2] as f32;
}
}
#[arcane]
pub(crate) fn rgb48_to_planes_impl_v3(
t: X64V3Token,
src: &[u16],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
let pixels = src.len() / 3;
let n_chunks = pixels / 8;
for ci in 0..n_chunks {
let bs = ci * 24;
let ps = ci * 8;
let c: &[u16; 24] = src[bs..bs + 24].try_into().unwrap();
let (rv, gv, bv) = rgb48_chunk8_v3(t, c);
r[ps..ps + 8].copy_from_slice(&rv);
g[ps..ps + 8].copy_from_slice(&gv);
b[ps..ps + 8].copy_from_slice(&bv);
}
for p in (n_chunks * 8)..pixels {
r[p] = src[p * 3] as f32;
g[p] = src[p * 3 + 1] as f32;
b[p] = src[p * 3 + 2] as f32;
}
}
}
#[cfg(target_arch = "x86_64")]
use x86::{rgb24_to_planes_impl_v3, rgb48_to_planes_impl_v3};
#[cfg(target_arch = "x86_64")]
#[doc(hidden)]
pub use x86::rgb24_chunk8_v3 as rgb24_chunk8_to_planes_v3;
#[cfg(target_arch = "x86_64")]
#[doc(hidden)]
pub use x86::rgb48_chunk8_v3 as rgb48_chunk8_to_planes_v3;
#[doc(hidden)]
#[inline(always)]
pub fn rgb24_chunk8_to_planes_scalar(chunk: &[u8; 24]) -> ([f32; 8], [f32; 8], [f32; 8]) {
rgb24_chunk8_scalar(chunk)
}
#[doc(hidden)]
#[inline(always)]
pub fn rgb48_chunk8_to_planes_scalar(chunk: &[u16; 24]) -> ([f32; 8], [f32; 8], [f32; 8]) {
rgb48_chunk8_scalar(chunk)
}
#[cfg(target_arch = "aarch64")]
mod arm {
use super::*;
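    // NEON does the deinterleave in hardware: `vld3q_u8` loads 16 RGB pixels
    // (48 bytes) directly into three per-channel registers, which are then
    // widened u8 -> u16 -> u32 -> f32.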
#[arcane]
pub(crate) fn rgb24_to_planes_impl_neon(
_t: NeonToken,
src: &[u8],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
let pixels = src.len() / 3;
let n_chunks = pixels / 16;
for ci in 0..n_chunks {
let bs = ci * 48;
let ps = ci * 16;
let c: &[u8; 48] = src[bs..bs + 48].try_into().unwrap();
let uint8x16x3_t(r_u8x16, g_u8x16, b_u8x16) = vld3q_u8(c);
let r_u16_lo = vmovl_u8(vget_low_u8(r_u8x16));
let r_u16_hi = vmovl_high_u8(r_u8x16);
let g_u16_lo = vmovl_u8(vget_low_u8(g_u8x16));
let g_u16_hi = vmovl_high_u8(g_u8x16);
let b_u16_lo = vmovl_u8(vget_low_u8(b_u8x16));
let b_u16_hi = vmovl_high_u8(b_u8x16);
let r0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(r_u16_lo)));
let r1 = vcvtq_f32_u32(vmovl_high_u16(r_u16_lo));
let r2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(r_u16_hi)));
let r3 = vcvtq_f32_u32(vmovl_high_u16(r_u16_hi));
let g0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(g_u16_lo)));
let g1 = vcvtq_f32_u32(vmovl_high_u16(g_u16_lo));
let g2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(g_u16_hi)));
let g3 = vcvtq_f32_u32(vmovl_high_u16(g_u16_hi));
let b0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_lo)));
let b1 = vcvtq_f32_u32(vmovl_high_u16(b_u16_lo));
let b2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_hi)));
let b3 = vcvtq_f32_u32(vmovl_high_u16(b_u16_hi));
let r_chunk: &mut [f32; 16] = (&mut r[ps..ps + 16]).try_into().unwrap();
let g_chunk: &mut [f32; 16] = (&mut g[ps..ps + 16]).try_into().unwrap();
let b_chunk: &mut [f32; 16] = (&mut b[ps..ps + 16]).try_into().unwrap();
vst1q_f32_x4(bytemuck::cast_mut(r_chunk), float32x4x4_t(r0, r1, r2, r3));
vst1q_f32_x4(bytemuck::cast_mut(g_chunk), float32x4x4_t(g0, g1, g2, g3));
vst1q_f32_x4(bytemuck::cast_mut(b_chunk), float32x4x4_t(b0, b1, b2, b3));
}
for p in (n_chunks * 16)..pixels {
r[p] = src[p * 3] as f32;
g[p] = src[p * 3 + 1] as f32;
b[p] = src[p * 3 + 2] as f32;
}
}
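    // 16-bit variant: `vld3q_u16` deinterleaves 8 pixels (24 u16 values) per
    // iteration, then widens u16 -> u32 -> f32.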
#[arcane]
pub(crate) fn rgb48_to_planes_impl_neon(
_t: NeonToken,
src: &[u16],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
let pixels = src.len() / 3;
let n_chunks = pixels / 8;
for ci in 0..n_chunks {
let us = ci * 24;
let ps = ci * 8;
let c: &[u16; 24] = src[us..us + 24].try_into().unwrap();
let uint16x8x3_t(r_u16, g_u16, b_u16) = vld3q_u16(c);
let r_lo_f = vcvtq_f32_u32(vmovl_u16(vget_low_u16(r_u16)));
let r_hi_f = vcvtq_f32_u32(vmovl_high_u16(r_u16));
let g_lo_f = vcvtq_f32_u32(vmovl_u16(vget_low_u16(g_u16)));
let g_hi_f = vcvtq_f32_u32(vmovl_high_u16(g_u16));
let b_lo_f = vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16)));
let b_hi_f = vcvtq_f32_u32(vmovl_high_u16(b_u16));
let r_chunk: &mut [f32; 8] = (&mut r[ps..ps + 8]).try_into().unwrap();
let g_chunk: &mut [f32; 8] = (&mut g[ps..ps + 8]).try_into().unwrap();
let b_chunk: &mut [f32; 8] = (&mut b[ps..ps + 8]).try_into().unwrap();
vst1q_f32_x2(bytemuck::cast_mut(r_chunk), float32x4x2_t(r_lo_f, r_hi_f));
vst1q_f32_x2(bytemuck::cast_mut(g_chunk), float32x4x2_t(g_lo_f, g_hi_f));
vst1q_f32_x2(bytemuck::cast_mut(b_chunk), float32x4x2_t(b_lo_f, b_hi_f));
}
for p in (n_chunks * 8)..pixels {
r[p] = src[p * 3] as f32;
g[p] = src[p * 3 + 1] as f32;
b[p] = src[p * 3 + 2] as f32;
}
}
}
#[cfg(target_arch = "aarch64")]
use arm::{rgb24_to_planes_impl_neon, rgb48_to_planes_impl_neon};
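// The f32 <-> planar conversions below are pure shuffles with no numeric
// conversion, so they are written as plain loops; the v3/neon variants
// further down simply recompile these loops with wider target features
// enabled and leave the vectorization to the compiler.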
#[inline(always)]
fn rgb_f32_to_planes_loop_scalar(src: &[f32], r: &mut [f32], g: &mut [f32], b: &mut [f32]) {
let pixels = src.len() / 3;
for i in 0..pixels {
r[i] = src[i * 3];
g[i] = src[i * 3 + 1];
b[i] = src[i * 3 + 2];
}
}
#[inline(always)]
fn rgba_f32_to_planes_loop_scalar(
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
a: &mut [f32],
) {
let pixels = src.len() / 4;
for i in 0..pixels {
r[i] = src[i * 4];
g[i] = src[i * 4 + 1];
b[i] = src[i * 4 + 2];
a[i] = src[i * 4 + 3];
}
}
#[inline(always)]
fn planes_to_rgb_f32_loop_scalar(r: &[f32], g: &[f32], b: &[f32], dst: &mut [f32]) {
let pixels = r.len();
for i in 0..pixels {
dst[i * 3] = r[i];
dst[i * 3 + 1] = g[i];
dst[i * 3 + 2] = b[i];
}
}
#[inline(always)]
fn planes_to_rgba_f32_loop_scalar(r: &[f32], g: &[f32], b: &[f32], a: &[f32], dst: &mut [f32]) {
let pixels = r.len();
for i in 0..pixels {
dst[i * 4] = r[i];
dst[i * 4 + 1] = g[i];
dst[i * 4 + 2] = b[i];
dst[i * 4 + 3] = a[i];
}
}
pub(crate) fn rgb_f32_to_planes_impl_scalar(
_t: ScalarToken,
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
rgb_f32_to_planes_loop_scalar(src, r, g, b);
}
pub(crate) fn rgba_f32_to_planes_impl_scalar(
_t: ScalarToken,
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
a: &mut [f32],
) {
rgba_f32_to_planes_loop_scalar(src, r, g, b, a);
}
pub(crate) fn planes_to_rgb_f32_impl_scalar(
_t: ScalarToken,
r: &[f32],
g: &[f32],
b: &[f32],
dst: &mut [f32],
) {
planes_to_rgb_f32_loop_scalar(r, g, b, dst);
}
pub(crate) fn planes_to_rgba_f32_impl_scalar(
_t: ScalarToken,
r: &[f32],
g: &[f32],
b: &[f32],
a: &[f32],
dst: &mut [f32],
) {
planes_to_rgba_f32_loop_scalar(r, g, b, a, dst);
}
#[cfg(target_arch = "x86_64")]
mod x86_f32 {
use super::*;
#[arcane]
pub(crate) fn rgb_f32_to_planes_impl_v3(
_t: X64V3Token,
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
super::rgb_f32_to_planes_loop_scalar(src, r, g, b);
}
#[arcane]
pub(crate) fn rgba_f32_to_planes_impl_v3(
_t: X64V3Token,
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
a: &mut [f32],
) {
super::rgba_f32_to_planes_loop_scalar(src, r, g, b, a);
}
#[arcane]
pub(crate) fn planes_to_rgb_f32_impl_v3(
_t: X64V3Token,
r: &[f32],
g: &[f32],
b: &[f32],
dst: &mut [f32],
) {
super::planes_to_rgb_f32_loop_scalar(r, g, b, dst);
}
#[arcane]
pub(crate) fn planes_to_rgba_f32_impl_v3(
_t: X64V3Token,
r: &[f32],
g: &[f32],
b: &[f32],
a: &[f32],
dst: &mut [f32],
) {
super::planes_to_rgba_f32_loop_scalar(r, g, b, a, dst);
}
}
#[cfg(target_arch = "x86_64")]
use x86_f32::{
planes_to_rgb_f32_impl_v3, planes_to_rgba_f32_impl_v3, rgb_f32_to_planes_impl_v3,
rgba_f32_to_planes_impl_v3,
};
#[cfg(target_arch = "aarch64")]
mod arm_f32 {
use super::*;
#[arcane]
pub(crate) fn rgb_f32_to_planes_impl_neon(
_t: NeonToken,
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) {
super::rgb_f32_to_planes_loop_scalar(src, r, g, b);
}
#[arcane]
pub(crate) fn rgba_f32_to_planes_impl_neon(
_t: NeonToken,
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
a: &mut [f32],
) {
super::rgba_f32_to_planes_loop_scalar(src, r, g, b, a);
}
#[arcane]
pub(crate) fn planes_to_rgb_f32_impl_neon(
_t: NeonToken,
r: &[f32],
g: &[f32],
b: &[f32],
dst: &mut [f32],
) {
super::planes_to_rgb_f32_loop_scalar(r, g, b, dst);
}
#[arcane]
pub(crate) fn planes_to_rgba_f32_impl_neon(
_t: NeonToken,
r: &[f32],
g: &[f32],
b: &[f32],
a: &[f32],
dst: &mut [f32],
) {
super::planes_to_rgba_f32_loop_scalar(r, g, b, a, dst);
}
}
#[cfg(target_arch = "aarch64")]
use arm_f32::{
planes_to_rgb_f32_impl_neon, planes_to_rgba_f32_impl_neon, rgb_f32_to_planes_impl_neon,
rgba_f32_to_planes_impl_neon,
};
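/// Deinterleaves packed `f32` RGB into three planes, with the same size
/// requirements and [`SizeError`] behaviour as [`rgb24_to_planes_f32`].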
pub fn rgb_f32_to_planes_f32(
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
) -> Result<(), SizeError> {
if src.is_empty() || !src.len().is_multiple_of(3) {
return Err(SizeError::NotPixelAligned);
}
let pixels = src.len() / 3;
if r.len() < pixels || g.len() < pixels || b.len() < pixels {
return Err(SizeError::PixelCountMismatch);
}
let r = &mut r[..pixels];
let g = &mut g[..pixels];
let b = &mut b[..pixels];
incant!(rgb_f32_to_planes_impl(src, r, g, b), [v3, neon, scalar]);
Ok(())
}
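/// Deinterleaves packed `f32` RGBA into four planes. `src` must hold a whole
/// number of RGBA quadruples and each plane at least `src.len() / 4`
/// elements.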
pub fn rgba_f32_to_planes_f32(
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
a: &mut [f32],
) -> Result<(), SizeError> {
if src.is_empty() || !src.len().is_multiple_of(4) {
return Err(SizeError::NotPixelAligned);
}
let pixels = src.len() / 4;
if r.len() < pixels || g.len() < pixels || b.len() < pixels || a.len() < pixels {
return Err(SizeError::PixelCountMismatch);
}
let r = &mut r[..pixels];
let g = &mut g[..pixels];
let b = &mut b[..pixels];
let a = &mut a[..pixels];
incant!(rgba_f32_to_planes_impl(src, r, g, b, a), [v3, neon, scalar]);
Ok(())
}
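/// Interleaves three equal-length `f32` planes into packed RGB. `dst` must
/// hold at least `r.len() * 3` elements.
///
/// # Errors
///
/// Returns [`SizeError::NotPixelAligned`] if the planes are empty or their
/// lengths differ, and [`SizeError::PixelCountMismatch`] if `dst` is too
/// short.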
pub fn planes_f32_to_rgb_f32(
r: &[f32],
g: &[f32],
b: &[f32],
dst: &mut [f32],
) -> Result<(), SizeError> {
if r.is_empty() || r.len() != g.len() || r.len() != b.len() {
return Err(SizeError::NotPixelAligned);
}
let pixels = r.len();
if dst.len() < pixels * 3 {
return Err(SizeError::PixelCountMismatch);
}
let dst = &mut dst[..pixels * 3];
incant!(planes_to_rgb_f32_impl(r, g, b, dst), [v3, neon, scalar]);
Ok(())
}
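/// Interleaves four equal-length `f32` planes into packed RGBA; the alpha
/// counterpart of [`planes_f32_to_rgb_f32`], requiring
/// `dst.len() >= r.len() * 4`.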
pub fn planes_f32_to_rgba_f32(
r: &[f32],
g: &[f32],
b: &[f32],
a: &[f32],
dst: &mut [f32],
) -> Result<(), SizeError> {
if r.is_empty() || r.len() != g.len() || r.len() != b.len() || r.len() != a.len() {
return Err(SizeError::NotPixelAligned);
}
let pixels = r.len();
if dst.len() < pixels * 4 {
return Err(SizeError::PixelCountMismatch);
}
let dst = &mut dst[..pixels * 4];
incant!(planes_to_rgba_f32_impl(r, g, b, a, dst), [v3, neon, scalar]);
Ok(())
}
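// Dispatch-free scalar entry points for the f32 conversions, mirroring
// `scalar_only_rgb24`/`scalar_only_rgb48` above; like those, they panic
// rather than return an error when an output slice is too short.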
#[doc(hidden)]
#[inline(always)]
pub fn scalar_only_rgb_f32_to_planes(src: &[f32], r: &mut [f32], g: &mut [f32], b: &mut [f32]) {
let pixels = src.len() / 3;
rgb_f32_to_planes_loop_scalar(src, &mut r[..pixels], &mut g[..pixels], &mut b[..pixels]);
}
#[doc(hidden)]
#[inline(always)]
pub fn scalar_only_rgba_f32_to_planes(
src: &[f32],
r: &mut [f32],
g: &mut [f32],
b: &mut [f32],
a: &mut [f32],
) {
let pixels = src.len() / 4;
rgba_f32_to_planes_loop_scalar(
src,
&mut r[..pixels],
&mut g[..pixels],
&mut b[..pixels],
&mut a[..pixels],
);
}
#[doc(hidden)]
#[inline(always)]
pub fn scalar_only_planes_f32_to_rgb(r: &[f32], g: &[f32], b: &[f32], dst: &mut [f32]) {
planes_to_rgb_f32_loop_scalar(r, g, b, &mut dst[..r.len() * 3]);
}
#[doc(hidden)]
#[inline(always)]
pub fn scalar_only_planes_f32_to_rgba(r: &[f32], g: &[f32], b: &[f32], a: &[f32], dst: &mut [f32]) {
planes_to_rgba_f32_loop_scalar(r, g, b, a, &mut dst[..r.len() * 4]);
}
#[cfg(test)]
mod tests {
extern crate alloc;
use super::*;
use alloc::vec;
fn ref_planes_u8(
src: &[u8],
) -> (
alloc::vec::Vec<f32>,
alloc::vec::Vec<f32>,
alloc::vec::Vec<f32>,
) {
let pixels = src.len() / 3;
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
for i in 0..pixels {
r[i] = src[i * 3] as f32;
g[i] = src[i * 3 + 1] as f32;
b[i] = src[i * 3 + 2] as f32;
}
(r, g, b)
}
fn ref_planes_u16(
src: &[u16],
) -> (
alloc::vec::Vec<f32>,
alloc::vec::Vec<f32>,
alloc::vec::Vec<f32>,
) {
let pixels = src.len() / 3;
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
for i in 0..pixels {
r[i] = src[i * 3] as f32;
g[i] = src[i * 3 + 1] as f32;
b[i] = src[i * 3 + 2] as f32;
}
(r, g, b)
}
#[test]
fn rgb24_round_trip_aligned() {
let pixels = 8 * 64;
let src: alloc::vec::Vec<u8> = (0..pixels * 3).map(|i| (i & 0xFF) as u8).collect();
let (r_ref, g_ref, b_ref) = ref_planes_u8(&src);
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
rgb24_to_planes_f32(&src, &mut r, &mut g, &mut b).unwrap();
assert_eq!(r, r_ref);
assert_eq!(g, g_ref);
assert_eq!(b, b_ref);
}
#[test]
fn rgb24_round_trip_with_tail() {
let pixels = 67;
let src: alloc::vec::Vec<u8> = (0..pixels * 3)
.map(|i: usize| (i.wrapping_mul(31) & 0xFF) as u8)
.collect();
let (r_ref, g_ref, b_ref) = ref_planes_u8(&src);
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
rgb24_to_planes_f32(&src, &mut r, &mut g, &mut b).unwrap();
assert_eq!(r, r_ref);
assert_eq!(g, g_ref);
assert_eq!(b, b_ref);
}
#[test]
fn rgb48_round_trip_aligned() {
let pixels = 8 * 32;
let src: alloc::vec::Vec<u16> = (0..pixels * 3)
.map(|i: usize| (i.wrapping_mul(257) & 0xFFFF) as u16)
.collect();
let (r_ref, g_ref, b_ref) = ref_planes_u16(&src);
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
rgb48_to_planes_f32(&src, &mut r, &mut g, &mut b).unwrap();
assert_eq!(r, r_ref);
assert_eq!(g, g_ref);
assert_eq!(b, b_ref);
}
#[test]
fn rgb48_round_trip_with_tail() {
let pixels = 51;
let src: alloc::vec::Vec<u16> = (0..pixels * 3)
.map(|i: usize| (i.wrapping_mul(8191) & 0xFFFF) as u16)
.collect();
let (r_ref, g_ref, b_ref) = ref_planes_u16(&src);
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
rgb48_to_planes_f32(&src, &mut r, &mut g, &mut b).unwrap();
assert_eq!(r, r_ref);
assert_eq!(g, g_ref);
assert_eq!(b, b_ref);
}
#[test]
fn errors_rejected() {
let mut r = vec![0.0; 4];
let mut g = vec![0.0; 4];
let mut b = vec![0.0; 4];
assert_eq!(
rgb24_to_planes_f32(&[], &mut r, &mut g, &mut b),
Err(SizeError::NotPixelAligned)
);
let bad = [0u8; 7];
assert_eq!(
rgb24_to_planes_f32(&bad, &mut r, &mut g, &mut b),
Err(SizeError::NotPixelAligned)
);
let src = [0u8; 24];
let mut tiny = vec![0.0; 2];
assert_eq!(
rgb24_to_planes_f32(&src, &mut tiny, &mut g, &mut b),
Err(SizeError::PixelCountMismatch)
);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn rgb24_v3_matches_scalar() {
if let Some(t) = X64V3Token::summon() {
let pixels = 8 * 17;
let src: alloc::vec::Vec<u8> = (0..pixels * 3).map(|i| (i & 0xFF) as u8).collect();
let mut r_v = vec![0.0f32; pixels];
let mut g_v = vec![0.0f32; pixels];
let mut b_v = vec![0.0f32; pixels];
let mut r_s = vec![0.0f32; pixels];
let mut g_s = vec![0.0f32; pixels];
let mut b_s = vec![0.0f32; pixels];
x86::rgb24_to_planes_impl_v3(t, &src, &mut r_v, &mut g_v, &mut b_v);
rgb24_to_planes_impl_scalar(
ScalarToken::summon().unwrap(),
&src,
&mut r_s,
&mut g_s,
&mut b_s,
);
assert_eq!(r_v, r_s);
assert_eq!(g_v, g_s);
assert_eq!(b_v, b_s);
}
}
#[cfg(target_arch = "x86_64")]
#[test]
fn rgb48_v3_matches_scalar() {
if let Some(t) = X64V3Token::summon() {
let pixels = 8 * 17;
let src: alloc::vec::Vec<u16> = (0..pixels * 3)
.map(|i: usize| (i.wrapping_mul(257) & 0xFFFF) as u16)
.collect();
let mut r_v = vec![0.0f32; pixels];
let mut g_v = vec![0.0f32; pixels];
let mut b_v = vec![0.0f32; pixels];
let mut r_s = vec![0.0f32; pixels];
let mut g_s = vec![0.0f32; pixels];
let mut b_s = vec![0.0f32; pixels];
x86::rgb48_to_planes_impl_v3(t, &src, &mut r_v, &mut g_v, &mut b_v);
rgb48_to_planes_impl_scalar(
ScalarToken::summon().unwrap(),
&src,
&mut r_s,
&mut g_s,
&mut b_s,
);
assert_eq!(r_v, r_s);
assert_eq!(g_v, g_s);
assert_eq!(b_v, b_s);
}
}
#[test]
fn rgb_f32_round_trip() {
let pixels = 67;
let src: alloc::vec::Vec<f32> = (0..pixels * 3)
.map(|i: usize| i as f32 * 0.5 - 100.0)
.collect();
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
rgb_f32_to_planes_f32(&src, &mut r, &mut g, &mut b).unwrap();
for i in 0..pixels {
assert_eq!(r[i], src[i * 3]);
assert_eq!(g[i], src[i * 3 + 1]);
assert_eq!(b[i], src[i * 3 + 2]);
}
let mut interleaved = vec![0.0f32; pixels * 3];
planes_f32_to_rgb_f32(&r, &g, &b, &mut interleaved).unwrap();
assert_eq!(interleaved, src);
}
#[test]
fn rgba_f32_round_trip() {
let pixels = 51;
let src: alloc::vec::Vec<f32> = (0..pixels * 4)
.map(|i: usize| i as f32 * 0.25 + 1.0)
.collect();
let mut r = vec![0.0f32; pixels];
let mut g = vec![0.0f32; pixels];
let mut b = vec![0.0f32; pixels];
let mut a = vec![0.0f32; pixels];
rgba_f32_to_planes_f32(&src, &mut r, &mut g, &mut b, &mut a).unwrap();
for i in 0..pixels {
assert_eq!(r[i], src[i * 4]);
assert_eq!(g[i], src[i * 4 + 1]);
assert_eq!(b[i], src[i * 4 + 2]);
assert_eq!(a[i], src[i * 4 + 3]);
}
let mut interleaved = vec![0.0f32; pixels * 4];
planes_f32_to_rgba_f32(&r, &g, &b, &a, &mut interleaved).unwrap();
assert_eq!(interleaved, src);
}
#[test]
fn f32_errors_rejected() {
let mut r = vec![0.0; 4];
let mut g = vec![0.0; 4];
let mut b = vec![0.0; 4];
let a = vec![0.0; 4];
let mut dst = vec![0.0; 16];
assert_eq!(
rgb_f32_to_planes_f32(&[], &mut r, &mut g, &mut b),
Err(SizeError::NotPixelAligned)
);
let bad = [0.0f32; 7];
assert_eq!(
rgb_f32_to_planes_f32(&bad, &mut r, &mut g, &mut b),
Err(SizeError::NotPixelAligned)
);
        let src = [0.0f32; 12];
        let mut tiny = vec![0.0f32; 2];
assert_eq!(
rgb_f32_to_planes_f32(&src, &mut tiny, &mut g, &mut b),
Err(SizeError::PixelCountMismatch)
);
let r_short = vec![0.0; 3];
assert_eq!(
planes_f32_to_rgb_f32(&r_short, &g, &b, &mut dst),
Err(SizeError::NotPixelAligned)
);
let mut tiny_dst = vec![0.0; 5];
assert_eq!(
planes_f32_to_rgb_f32(&r, &g, &b, &mut tiny_dst),
Err(SizeError::PixelCountMismatch)
);
let a_short = vec![0.0; 3];
assert_eq!(
planes_f32_to_rgba_f32(&r, &g, &b, &a_short, &mut dst),
Err(SizeError::NotPixelAligned)
);
        let mut tiny_dst4 = vec![0.0; 7];
        assert_eq!(
            planes_f32_to_rgba_f32(&r, &g, &b, &a, &mut tiny_dst4),
            Err(SizeError::PixelCountMismatch)
        );
}
}