/// Splits an interleaved RGBA byte buffer into four consecutive planes.
///
/// The returned vector has the same length as `rgba`. The back ends write
/// all first-channel bytes first, followed by the second, third and fourth
/// channel planes, each `rgba.len() / 4` bytes long.
///
/// The back end is picked at compile time: the NEON routine on `aarch64`,
/// the scalar routine on `wasm32`, and the parallel routine everywhere else.
///
/// # Panics
///
/// Panics if `rgba.len()` is not a multiple of 4.
pub fn deinterleave_rgba(rgba: &[u8]) -> Vec<u8> {
    assert_eq!(rgba.len() % 4, 0, "rgba length must be a multiple of 4");
    let pixel_count = rgba.len() / 4;
    let mut planes = vec![0u8; rgba.len()];

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: `rgba` and `planes` both hold exactly `4 * pixel_count`
        // bytes, which is the range the NEON routine reads and writes.
        // NOTE(review): this also relies on NEON being available on every
        // aarch64 target the crate is built for — confirm.
        unsafe { deinterleave_neon(rgba, &mut planes, pixel_count) };
        return planes;
    }

    // On aarch64 the block above always returns, so this fallback is dead
    // code there; it is still compiled, hence the lint allowance.
    #[cfg_attr(target_arch = "aarch64", allow(unreachable_code))]
    {
        #[cfg(target_arch = "wasm32")]
        deinterleave_scalar(rgba, &mut planes, pixel_count);
        #[cfg(not(target_arch = "wasm32"))]
        deinterleave_parallel(rgba, &mut planes, pixel_count);
        planes
    }
}
/// Merges four consecutive channel planes back into interleaved RGBA bytes.
///
/// `planar` must contain `n_pixels` bytes for each of the four channels, one
/// plane after another. The returned vector has the same length as `planar`
/// and stores the four channel bytes of each pixel next to each other.
///
/// The back end is picked at compile time: the NEON routine on `aarch64`,
/// the scalar routine on `wasm32`, and the parallel routine everywhere else.
///
/// # Panics
///
/// Panics if `planar.len()` is not exactly `n_pixels * 4`, including when
/// `n_pixels * 4` does not fit in a `usize`.
pub fn reinterleave_rgba(planar: &[u8], n_pixels: usize) -> Vec<u8> {
    // A plain `n_pixels * 4` wraps silently in release builds. A wrapped
    // product could match `planar.len()` and let an oversized `n_pixels`
    // reach the unchecked pointer arithmetic of the NEON back end, so the
    // multiplication has to be checked before the length comparison.
    let Some(expected_len) = n_pixels.checked_mul(4) else {
        panic!("planar length must be n_pixels * 4 (n_pixels * 4 overflows usize)");
    };
    assert_eq!(
        planar.len(),
        expected_len,
        "planar length must be n_pixels * 4"
    );
    let mut out = vec![0u8; planar.len()];

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: the checks above guarantee that `planar` and `out` both
        // hold exactly `4 * n_pixels` bytes, which is the range the NEON
        // routine reads and writes.
        // NOTE(review): this also relies on NEON being available on every
        // aarch64 target the crate is built for — confirm.
        unsafe { reinterleave_neon(planar, &mut out, n_pixels) };
        return out;
    }

    // On aarch64 the block above always returns, so this fallback is dead
    // code there; it is still compiled, hence the lint allowance.
    #[cfg_attr(target_arch = "aarch64", allow(unreachable_code))]
    {
        #[cfg(not(target_arch = "wasm32"))]
        reinterleave_parallel(planar, &mut out, n_pixels);
        #[cfg(target_arch = "wasm32")]
        reinterleave_scalar(planar, &mut out, n_pixels);
        out
    }
}
/// Rayon-based deinterleave: scatters each 4-byte pixel of `rgba` into the
/// four `n`-byte planes stored back to back in `out`.
///
/// The zipped iterators stop at the shortest input, so at most `n` pixels
/// are written.
///
/// # Panics
///
/// Panics if `out` is shorter than `3 * n` bytes, because the plane splits
/// would fall outside the buffer.
#[cfg(not(target_arch = "wasm32"))]
fn deinterleave_parallel(rgba: &[u8], out: &mut [u8], n: usize) {
    use rayon::prelude::*;

    // Carve the output into four disjoint planes so each can be borrowed
    // mutably at the same time.
    let (red, tail) = out.split_at_mut(n);
    let (green, tail) = tail.split_at_mut(n);
    let (blue, alpha) = tail.split_at_mut(n);

    let plane_slots = red
        .par_iter_mut()
        .zip(green.par_iter_mut())
        .zip(blue.par_iter_mut())
        .zip(alpha.par_iter_mut());

    plane_slots
        .zip(rgba.par_chunks_exact(4))
        .for_each(|((((r_slot, g_slot), b_slot), a_slot), pixel)| {
            *r_slot = pixel[0];
            *g_slot = pixel[1];
            *b_slot = pixel[2];
            *a_slot = pixel[3];
        });
}
/// Rayon-based reinterleave: gathers one byte from each of the four
/// `n`-byte planes in `planar` into every 4-byte pixel of `out`.
///
/// The zipped iterators stop at the shortest input, so at most `n` pixels
/// are written.
///
/// # Panics
///
/// Panics if `planar` is shorter than `3 * n` bytes, because the plane
/// splits would fall outside the buffer.
#[cfg(not(target_arch = "wasm32"))]
fn reinterleave_parallel(planar: &[u8], out: &mut [u8], n: usize) {
    use rayon::prelude::*;

    let (red, tail) = planar.split_at(n);
    let (green, tail) = tail.split_at(n);
    let (blue, alpha) = tail.split_at(n);

    let plane_values = red
        .par_iter()
        .zip(green.par_iter())
        .zip(blue.par_iter())
        .zip(alpha.par_iter());

    plane_values
        .zip(out.par_chunks_exact_mut(4))
        .for_each(|((((r_val, g_val), b_val), a_val), pixel)| {
            pixel[0] = *r_val;
            pixel[1] = *g_val;
            pixel[2] = *b_val;
            pixel[3] = *a_val;
        });
}
/// Single-threaded deinterleave used on `wasm32`: copies each 4-byte pixel
/// of `rgba` into the four planes stored back to back in `out`.
///
/// The first three planes are `n` bytes long; the fourth is whatever
/// remains of `out`.
///
/// # Panics
///
/// Panics if `out` is shorter than `3 * n` bytes, or if `rgba` contains more
/// complete pixels than a plane can hold.
#[cfg(target_arch = "wasm32")]
fn deinterleave_scalar(rgba: &[u8], out: &mut [u8], n: usize) {
    let (red, tail) = out.split_at_mut(n);
    let (green, tail) = tail.split_at_mut(n);
    let (blue, alpha) = tail.split_at_mut(n);

    rgba.chunks_exact(4)
        .enumerate()
        .for_each(|(pixel_idx, pixel)| {
            red[pixel_idx] = pixel[0];
            green[pixel_idx] = pixel[1];
            blue[pixel_idx] = pixel[2];
            alpha[pixel_idx] = pixel[3];
        });
}
/// Single-threaded reinterleave used on `wasm32`: fills every 4-byte pixel
/// of `out` with one byte from each of the four planes in `planar`.
///
/// The first three planes are `n` bytes long; the fourth is whatever
/// remains of `planar`.
///
/// # Panics
///
/// Panics if `planar` is shorter than `3 * n` bytes, or if `out` contains
/// more complete pixels than a plane can supply.
#[cfg(target_arch = "wasm32")]
fn reinterleave_scalar(planar: &[u8], out: &mut [u8], n: usize) {
    let (red, tail) = planar.split_at(n);
    let (green, tail) = tail.split_at(n);
    let (blue, alpha) = tail.split_at(n);

    out.chunks_exact_mut(4)
        .enumerate()
        .for_each(|(pixel_idx, pixel)| {
            pixel[0] = red[pixel_idx];
            pixel[1] = green[pixel_idx];
            pixel[2] = blue[pixel_idx];
            pixel[3] = alpha[pixel_idx];
        });
}
/// NEON deinterleave: splits `n` interleaved 4-byte pixels from `rgba` into
/// four `n`-byte planes stored back to back in `out`.
///
/// Pixels are processed 16 at a time with a structured 4-way load; the
/// leftover `n % 16` pixels are copied one byte at a time.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `rgba` holds at least `4 * n` readable bytes,
/// - `out` holds at least `4 * n` writable bytes,
/// - the CPU supports the `neon` target feature.
///
/// No bounds checks are performed on either buffer.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn deinterleave_neon(rgba: &[u8], out: &mut [u8], n: usize) {
    use std::arch::aarch64::*;

    // Pixels handled per vector iteration (one byte lane per pixel).
    const LANES: usize = 16;

    let full_blocks = n / LANES;
    let vectorized = full_blocks * LANES;

    let src = rgba.as_ptr();
    // The four planes sit back to back, each `n` bytes long.
    let red = out.as_mut_ptr();
    let green = red.add(n);
    let blue = green.add(n);
    let alpha = blue.add(n);

    for block in 0..full_blocks {
        let plane_off = block * LANES;
        // Load 64 interleaved bytes, already split into one vector per channel.
        let uint8x16x4_t(r, g, b, a) = vld4q_u8(src.add(block * 64));
        vst1q_u8(red.add(plane_off), r);
        vst1q_u8(green.add(plane_off), g);
        vst1q_u8(blue.add(plane_off), b);
        vst1q_u8(alpha.add(plane_off), a);
    }

    // Tail: the pixels that did not fill a whole 16-pixel block.
    for pixel_idx in vectorized..n {
        let pixel = src.add(pixel_idx * 4);
        *red.add(pixel_idx) = *pixel;
        *green.add(pixel_idx) = *pixel.add(1);
        *blue.add(pixel_idx) = *pixel.add(2);
        *alpha.add(pixel_idx) = *pixel.add(3);
    }
}
/// NEON reinterleave: merges four `n`-byte planes stored back to back in
/// `planar` into `n` interleaved 4-byte pixels in `out`.
///
/// Pixels are processed 16 at a time with a structured 4-way store; the
/// leftover `n % 16` pixels are written through bounds-checked indexing.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `planar` holds at least `4 * n` readable bytes,
/// - `out` holds at least `4 * n` writable bytes,
/// - the CPU supports the `neon` target feature.
///
/// The vectorized part performs no bounds checks on either buffer.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn reinterleave_neon(planar: &[u8], out: &mut [u8], n: usize) {
    use std::arch::aarch64::*;

    // Pixels handled per vector iteration (one byte lane per pixel).
    const LANES: usize = 16;

    let full_blocks = n / LANES;
    let vectorized = full_blocks * LANES;

    // The four planes sit back to back, each `n` bytes long.
    let red = planar.as_ptr();
    let green = red.add(n);
    let blue = green.add(n);
    let alpha = blue.add(n);
    let dst = out.as_mut_ptr();

    for block in 0..full_blocks {
        let plane_off = block * LANES;
        let channels = uint8x16x4_t(
            vld1q_u8(red.add(plane_off)),
            vld1q_u8(green.add(plane_off)),
            vld1q_u8(blue.add(plane_off)),
            vld1q_u8(alpha.add(plane_off)),
        );
        // Store the four vectors interleaved as 64 consecutive bytes.
        vst4q_u8(dst.add(block * 64), channels);
    }

    // Tail: the pixels that did not fill a whole 16-pixel block.
    for pixel_idx in vectorized..n {
        let base = pixel_idx * 4;
        out[base] = *red.add(pixel_idx);
        out[base + 1] = *green.add(pixel_idx);
        out[base + 2] = *blue.add(pixel_idx);
        out[base + 3] = *alpha.add(pixel_idx);
    }
}
/// Premultiplies the three colour bytes of every pixel by its alpha, in place.
///
/// The buffer is treated as 4-byte pixels whose last byte is alpha. Each of
/// the first three bytes becomes `(value * alpha + 127) / 255`; the alpha
/// byte itself is left untouched.
///
/// # Panics
///
/// Panics if `rgba.len()` is not a multiple of 4.
pub fn premultiply_alpha(rgba: &mut [u8]) {
    assert_eq!(
        rgba.len() % 4,
        0,
        "rgba buffer must be a multiple of 4 bytes"
    );
    rgba.chunks_exact_mut(4).for_each(|pixel| {
        let (colour, alpha) = pixel.split_at_mut(3);
        let alpha = u32::from(alpha[0]);
        for channel in colour {
            // Adding 127 before the division rounds to nearest instead of
            // truncating.
            *channel = ((u32::from(*channel) * alpha + 127) / 255) as u8;
        }
    });
}
/// Swaps the first and third byte of every 4-byte pixel in place, turning
/// RGBA ordering into BGRA (and vice versa).
///
/// # Panics
///
/// Panics if `buf.len()` is not a multiple of 4.
pub fn rgba_to_bgra(buf: &mut [u8]) {
    assert_eq!(buf.len() % 4, 0, "buffer must be a multiple of 4 bytes");
    for pixel in buf.chunks_exact_mut(4) {
        // Every chunk is exactly four bytes long, so this pattern always matches.
        if let [first, _, third, _] = pixel {
            std::mem::swap(first, third);
        }
    }
}
/// Converts sRGB-encoded bytes to linear `f32` values via a 256-entry
/// lookup table.
///
/// Every byte of `input` is mapped independently and written to the slot
/// with the same index in `output`.
///
/// # Panics
///
/// Panics if `input` and `output` differ in length.
pub fn srgb_to_linear(input: &[u8], output: &mut [f32]) {
    assert_eq!(
        input.len(),
        output.len(),
        "output must be same length as input"
    );
    let table = srgb_lut();
    output
        .iter_mut()
        .zip(input)
        .for_each(|(linear, &encoded)| *linear = table[usize::from(encoded)]);
}
/// Returns the shared sRGB-to-linear lookup table, building it on first use.
///
/// Entry `i` holds the linear value for the encoded value `i / 255`: the
/// linear segment `s / 12.92` up to `0.04045`, and the power curve
/// `((s + 0.055) / 1.055)^2.4` above it.
fn srgb_lut() -> &'static [f32; 256] {
    use std::sync::OnceLock;

    static TABLE: OnceLock<[f32; 256]> = OnceLock::new();

    TABLE.get_or_init(|| {
        let mut table = [0.0f32; 256];
        for (code, slot) in table.iter_mut().enumerate() {
            let encoded = code as f32 / 255.0;
            *slot = if encoded <= 0.04045 {
                encoded / 12.92
            } else {
                ((encoded + 0.055) / 1.055).powf(2.4)
            };
        }
        table
    })
}