use archmage::prelude::*;
#[cfg(target_arch = "x86_64")]
use linear_srgb::tokens::x8 as trc_x8;
#[cfg(target_arch = "x86_64")]
use magetypes::simd::f32x8 as mt_f32x8;
/// Crate-visible entry point for the scalar 3×3 matrix transform.
///
/// Forwards its arguments unchanged to [`mat3x3`] and returns the resulting
/// `(r, g, b)` triple.
#[inline(always)]
pub(crate) fn mat3x3_scalar(m: &[[f32; 3]; 3], r: f32, g: f32, b: f32) -> (f32, f32, f32) {
mat3x3(m, r, g, b)
}
/// Multiplies the row-major 3×3 matrix `m` by the column vector `(r, g, b)`.
///
/// Every output channel is evaluated as `m0 * r + (m1 * g + (m2 * b))`, where
/// the two outer steps are fused with `f32::mul_add` and only the innermost
/// product is a plain multiplication.
#[inline(always)]
fn mat3x3(m: &[[f32; 3]; 3], r: f32, g: f32, b: f32) -> (f32, f32, f32) {
    /// Dot product of one matrix row with the input pixel.
    #[inline(always)]
    fn row_dot(row: &[f32; 3], r: f32, g: f32, b: f32) -> f32 {
        let [wr, wg, wb] = *row;
        wr.mul_add(r, wg.mul_add(g, wb * b))
    }

    let [first, second, third] = m;
    (
        row_dot(first, r, g, b),
        row_dot(second, r, g, b),
        row_dot(third, r, g, b),
    )
}
/// Eight-lane counterpart of [`mat3x3`]: applies the same 3×3 matrix to eight
/// pixels whose channels are supplied as three separate vectors.
///
/// Each matrix coefficient is broadcast to all lanes; the per-channel
/// evaluation order mirrors the scalar version
/// (`m0 * r + (m1 * g + (m2 * b))`, outer steps via `mul_add`).
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn mat3x3_x8(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    rl: mt_f32x8,
    gl: mt_f32x8,
    bl: mt_f32x8,
) -> (mt_f32x8, mt_f32x8, mt_f32x8) {
    // Output red: first matrix row.
    let red_inner = mt_f32x8::splat(token, m[0][2]) * bl;
    let red_mid = mt_f32x8::splat(token, m[0][1]).mul_add(gl, red_inner);
    let out_r = mt_f32x8::splat(token, m[0][0]).mul_add(rl, red_mid);

    // Output green: second matrix row.
    let green_inner = mt_f32x8::splat(token, m[1][2]) * bl;
    let green_mid = mt_f32x8::splat(token, m[1][1]).mul_add(gl, green_inner);
    let out_g = mt_f32x8::splat(token, m[1][0]).mul_add(rl, green_mid);

    // Output blue: third matrix row.
    let blue_inner = mt_f32x8::splat(token, m[2][2]) * bl;
    let blue_mid = mt_f32x8::splat(token, m[2][1]).mul_add(gl, blue_inner);
    let out_b = mt_f32x8::splat(token, m[2][0]).mul_add(rl, blue_mid);

    (out_r, out_g, out_b)
}
// Generates a family of in-place "decode -> 3x3 matrix -> encode" kernels for one
// (linearize, encode) transfer-function pair.
//
// For a given `$name` the macro emits:
//   * `fused_8px_rgb_<name>` / `fused_8px_rgba_<name>`   - x86_64 8-pixel kernels
//   * `convert_rgb_<name>_v3` / `convert_rgba_<name>_v3` - x86_64 slice drivers
//   * `convert_rgb_<name>_scalar` / `convert_rgba_<name>_scalar` - scalar versions
//
// Arguments:
//   * `simd_linearize` / `simd_encode`: paths called as `f(token, [f32; 8])`
//   * `scalar_linearize` / `scalar_encode`: paths called as `f(f32)`
//
// NOTE(review): the `_v3` / `_scalar` suffixes presumably match the naming that
// `incant!` expects when dispatching `convert_rgb_<name>` - confirm against the
// archmage documentation.
macro_rules! stamp_trc_kernels {
(
$name:ident,
simd_linearize: $simd_lin:path,
simd_encode: $simd_enc:path,
scalar_linearize: $scalar_lin:path,
scalar_encode: $scalar_enc:path
) => {
paste::paste! {
// Converts exactly 8 interleaved RGB pixels (the first 24 floats of `data`) in place.
#[cfg(target_arch = "x86_64")]
#[rite]
fn [<fused_8px_rgb_ $name>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
// De-interleave RGBRGB... into one array per channel.
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
for i in 0..8 {
r[i] = data[i * 3];
g[i] = data[i * 3 + 1];
b[i] = data[i * 3 + 2];
}
// Decode each channel, apply the matrix, then re-encode.
let rl = mt_f32x8::from_array(token, $simd_lin(token, r));
let gl = mt_f32x8::from_array(token, $simd_lin(token, g));
let bl = mt_f32x8::from_array(token, $simd_lin(token, b));
let (or, og, ob) = mat3x3_x8(token, m, rl, gl, bl);
let ro = $simd_enc(token, or.to_array());
let go = $simd_enc(token, og.to_array());
let bo = $simd_enc(token, ob.to_array());
// Re-interleave the results over the input.
for i in 0..8 {
data[i * 3] = ro[i];
data[i * 3 + 1] = go[i];
data[i * 3 + 2] = bo[i];
}
}
// Converts exactly 8 interleaved RGBA pixels (the first 32 floats of `data`) in place.
// The fourth element of every pixel is neither read nor written.
#[cfg(target_arch = "x86_64")]
#[rite]
fn [<fused_8px_rgba_ $name>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
for i in 0..8 {
r[i] = data[i * 4];
g[i] = data[i * 4 + 1];
b[i] = data[i * 4 + 2];
}
let rl = mt_f32x8::from_array(token, $simd_lin(token, r));
let gl = mt_f32x8::from_array(token, $simd_lin(token, g));
let bl = mt_f32x8::from_array(token, $simd_lin(token, b));
let (or, og, ob) = mat3x3_x8(token, m, rl, gl, bl);
let ro = $simd_enc(token, or.to_array());
let go = $simd_enc(token, og.to_array());
let bo = $simd_enc(token, ob.to_array());
for i in 0..8 {
data[i * 4] = ro[i];
data[i * 4 + 1] = go[i];
data[i * 4 + 2] = bo[i];
}
}
// RGB slice driver: 8-pixel (24-float) groups go through the fused kernel,
// the remaining whole pixels through the scalar functions.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn [<convert_rgb_ $name _v3>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
// Largest multiple of 24 floats that fits in `data`.
let bulk = (data.len() / 24) * 24;
for off in (0..bulk).step_by(24) {
[<fused_8px_rgb_ $name>](token, m, &mut data[off..off + 24]);
}
// Tail: `chunks_exact_mut` skips a trailing partial pixel, if any.
for pixel in data[bulk..].chunks_exact_mut(3) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
// RGBA slice driver: same structure with 32-float groups; `pixel[3]` is left untouched.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn [<convert_rgba_ $name _v3>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
let bulk = (data.len() / 32) * 32;
for off in (0..bulk).step_by(32) {
[<fused_8px_rgba_ $name>](token, m, &mut data[off..off + 32]);
}
for pixel in data[bulk..].chunks_exact_mut(4) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
// Scalar RGB version: every complete 3-float pixel is decoded, transformed and re-encoded.
fn [<convert_rgb_ $name _scalar>](_token: ScalarToken, m: &[[f32; 3]; 3], data: &mut [f32]) {
for pixel in data.chunks_exact_mut(3) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
// Scalar RGBA version: as above with a 4-float stride; `pixel[3]` is left untouched.
fn [<convert_rgba_ $name _scalar>](_token: ScalarToken, m: &[[f32; 3]; 3], data: &mut [f32]) {
for pixel in data.chunks_exact_mut(4) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
}
};
}
// Same-curve kernels: source and destination use the same transfer function
// (sRGB, BT.709, PQ, HLG), so only the matrix changes the colour values.
stamp_trc_kernels!(srgb,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
stamp_trc_kernels!(bt709,
simd_linearize: trc_x8::bt709_to_linear_v3,
simd_encode: trc_x8::linear_to_bt709_v3,
scalar_linearize: linear_srgb::tf::bt709_to_linear,
scalar_encode: linear_srgb::tf::linear_to_bt709
);
stamp_trc_kernels!(pq,
simd_linearize: trc_x8::pq_to_linear_v3,
simd_encode: trc_x8::linear_to_pq_v3,
scalar_linearize: linear_srgb::tf::pq_to_linear,
scalar_encode: linear_srgb::tf::linear_to_pq
);
stamp_trc_kernels!(hlg,
simd_linearize: trc_x8::hlg_to_linear_v3,
simd_encode: trc_x8::linear_to_hlg_v3,
scalar_linearize: linear_srgb::tf::hlg_to_linear,
scalar_encode: linear_srgb::tf::linear_to_hlg
);
/// Applies the 3×3 matrix `m` in place to every RGB triple in `data`.
///
/// No transfer function is involved: the values are passed to the matrix as
/// they are. Debug builds assert that `data.len()` is a multiple of 3; in
/// release builds a trailing partial pixel is left untouched.
pub fn convert_linear_rgb(m: &[[f32; 3]; 3], data: &mut [f32]) {
    debug_assert_eq!(data.len() % 3, 0);
    data.chunks_exact_mut(3).for_each(|px| {
        let (out_r, out_g, out_b) = mat3x3(m, px[0], px[1], px[2]);
        px.copy_from_slice(&[out_r, out_g, out_b]);
    });
}
/// Applies the 3×3 matrix `m` in place to the colour channels of every RGBA
/// quadruple in `data`.
///
/// The fourth element of each pixel is neither read nor written. Debug builds
/// assert that `data.len()` is a multiple of 4; in release builds a trailing
/// partial pixel is left untouched.
pub fn convert_linear_rgba(m: &[[f32; 3]; 3], data: &mut [f32]) {
    debug_assert_eq!(data.len() % 4, 0);
    data.chunks_exact_mut(4).for_each(|px| {
        let (out_r, out_g, out_b) = mat3x3(m, px[0], px[1], px[2]);
        px[..3].copy_from_slice(&[out_r, out_g, out_b]);
    });
}
// Cross-curve kernels: the source is decoded with one transfer function and
// the result is encoded with a different one (name pattern `<src>_to_<dst>`).
stamp_trc_kernels!(pq_to_srgb,
simd_linearize: trc_x8::pq_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::pq_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
stamp_trc_kernels!(hlg_to_srgb,
simd_linearize: trc_x8::hlg_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::hlg_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
stamp_trc_kernels!(srgb_to_pq,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: trc_x8::linear_to_pq_v3,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: linear_srgb::tf::linear_to_pq
);
stamp_trc_kernels!(bt709_to_srgb,
simd_linearize: trc_x8::bt709_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::bt709_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
stamp_trc_kernels!(srgb_to_bt709,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: trc_x8::linear_to_bt709_v3,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: linear_srgb::tf::linear_to_bt709
);
/// Power-curve exponent used by the `adobe_*` helpers and by the extended-range
/// `Gamma22` closures in this file. The value equals 563/256.
const ADOBE_GAMMA: f32 = 2.19921875;
/// Eight-lane decode with exponent [`ADOBE_GAMMA`]; forwards to
/// `trc_x8::gamma_to_linear_v3`. Has the `(token, [f32; 8]) -> [f32; 8]` shape
/// that `stamp_trc_kernels!` expects for `simd_linearize`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn adobe_to_linear_x8(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
trc_x8::gamma_to_linear_v3(token, v, ADOBE_GAMMA)
}
/// Eight-lane encode with exponent [`ADOBE_GAMMA`]; forwards to
/// `trc_x8::linear_to_gamma_v3`. Has the `(token, [f32; 8]) -> [f32; 8]` shape
/// that `stamp_trc_kernels!` expects for `simd_encode`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn adobe_from_linear_x8(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
trc_x8::linear_to_gamma_v3(token, v, ADOBE_GAMMA)
}
/// Scalar decode with exponent [`ADOBE_GAMMA`]; forwards to
/// `linear_srgb::default::gamma_to_linear`.
#[inline(always)]
fn adobe_to_linear_scalar(v: f32) -> f32 {
linear_srgb::default::gamma_to_linear(v, ADOBE_GAMMA)
}
/// Scalar encode with exponent [`ADOBE_GAMMA`]; forwards to
/// `linear_srgb::default::linear_to_gamma`.
#[inline(always)]
fn adobe_from_linear_scalar(v: f32) -> f32 {
linear_srgb::default::linear_to_gamma(v, ADOBE_GAMMA)
}
// Kernels built on the `ADOBE_GAMMA` power curve: same-curve (`adobe`) and the
// two conversions to and from sRGB.
stamp_trc_kernels!(adobe,
simd_linearize: adobe_to_linear_x8,
simd_encode: adobe_from_linear_x8,
scalar_linearize: adobe_to_linear_scalar,
scalar_encode: adobe_from_linear_scalar
);
stamp_trc_kernels!(adobe_to_srgb,
simd_linearize: adobe_to_linear_x8,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: adobe_to_linear_scalar,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
stamp_trc_kernels!(srgb_to_adobe,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: adobe_from_linear_x8,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: adobe_from_linear_scalar
);
use crate::TransferFunction;
/// Sign-mirrored sRGB decode.
///
/// Inputs satisfying `v >= 0.0` are passed straight to
/// `linear_srgb::precise::srgb_to_linear_extended`; every other input
/// (negative values and NaN) is negated, decoded, and negated again.
#[inline(always)]
fn linearize_srgb_extended(v: f32) -> f32 {
    if v >= 0.0 {
        return linear_srgb::precise::srgb_to_linear_extended(v);
    }
    let mirrored = linear_srgb::precise::srgb_to_linear_extended(-v);
    -mirrored
}
/// Sign-mirrored sRGB encode.
///
/// Inputs satisfying `v >= 0.0` are passed straight to
/// `linear_srgb::precise::linear_to_srgb_extended`; every other input
/// (negative values and NaN) is negated, encoded, and negated again.
#[inline(always)]
fn encode_srgb_extended(v: f32) -> f32 {
    if v >= 0.0 {
        return linear_srgb::precise::linear_to_srgb_extended(v);
    }
    let mirrored = linear_srgb::precise::linear_to_srgb_extended(-v);
    -mirrored
}
/// Sign-mirrored pure power decode: `v^gamma` for `v >= 0.0`, otherwise
/// `-((-v)^gamma)`.
///
/// NaN fails the `v >= 0.0` test and therefore takes the mirrored branch.
#[inline(always)]
fn linearize_gamma_extended(v: f32, gamma: f32) -> f32 {
    if v >= 0.0 {
        return v.powf(gamma);
    }
    let magnitude = (-v).powf(gamma);
    -magnitude
}
/// Sign-mirrored pure power encode: `v^inv_gamma` for `v >= 0.0`, otherwise
/// `-((-v)^inv_gamma)`.
///
/// The caller supplies the already-inverted exponent. NaN fails the
/// `v >= 0.0` test and therefore takes the mirrored branch.
#[inline(always)]
fn encode_gamma_extended(v: f32, inv_gamma: f32) -> f32 {
    if v >= 0.0 {
        return v.powf(inv_gamma);
    }
    let magnitude = (-v).powf(inv_gamma);
    -magnitude
}
/// Selects the scalar decode function used by the extended-range converters.
///
/// `Srgb` and `Gamma22` get sign-mirrored variants, `Linear` is the identity,
/// and every other transfer function falls back to [`scalar_linearize`]
/// (which may return `None`).
fn scalar_linearize_extended(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    /// Sign-mirrored power decode with the `ADOBE_GAMMA` exponent.
    fn gamma22_decode(v: f32) -> f32 {
        linearize_gamma_extended(v, ADOBE_GAMMA)
    }

    match trc {
        TransferFunction::Linear => Some(core::convert::identity),
        TransferFunction::Srgb => Some(linearize_srgb_extended),
        TransferFunction::Gamma22 => Some(gamma22_decode),
        _ => scalar_linearize(trc),
    }
}
/// Selects the scalar encode function used by the extended-range converters.
///
/// `Srgb` and `Gamma22` get sign-mirrored variants, `Linear` is the identity,
/// and every other transfer function falls back to [`scalar_encode`]
/// (which may return `None`).
fn scalar_encode_extended(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    /// Sign-mirrored power encode with exponent `1.0 / ADOBE_GAMMA`.
    fn gamma22_encode(v: f32) -> f32 {
        encode_gamma_extended(v, 1.0 / ADOBE_GAMMA)
    }

    match trc {
        TransferFunction::Linear => Some(core::convert::identity),
        TransferFunction::Srgb => Some(encode_srgb_extended),
        TransferFunction::Gamma22 => Some(gamma22_encode),
        _ => scalar_encode(trc),
    }
}
/// In-place extended-range RGB conversion: decode with `src_trc`, apply `m`,
/// encode with `dst_trc`.
///
/// Returns `false` without touching `data` when either transfer function has
/// no scalar implementation, `true` otherwise. A trailing partial pixel is
/// left untouched.
pub(crate) fn convert_f32_rgb_extended(
    m: &[[f32; 3]; 3],
    data: &mut [f32],
    src_trc: TransferFunction,
    dst_trc: TransferFunction,
) -> bool {
    // Both lookups are side-effect-free table selections, so resolving them
    // together is equivalent to bailing out after the first miss.
    let (Some(decode), Some(encode)) = (
        scalar_linearize_extended(src_trc),
        scalar_encode_extended(dst_trc),
    ) else {
        return false;
    };

    data.chunks_exact_mut(3).for_each(|px| {
        let (lr, lg, lb) = (decode(px[0]), decode(px[1]), decode(px[2]));
        let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
        px[0] = encode(mr);
        px[1] = encode(mg);
        px[2] = encode(mb);
    });
    true
}
/// In-place extended-range RGBA conversion: decode with `src_trc`, apply `m`,
/// encode with `dst_trc`. The fourth element of each pixel is not touched.
///
/// Returns `false` without touching `data` when either transfer function has
/// no scalar implementation, `true` otherwise. A trailing partial pixel is
/// left untouched.
pub(crate) fn convert_f32_rgba_extended(
    m: &[[f32; 3]; 3],
    data: &mut [f32],
    src_trc: TransferFunction,
    dst_trc: TransferFunction,
) -> bool {
    // Both lookups are side-effect-free table selections, so resolving them
    // together is equivalent to bailing out after the first miss.
    let (Some(decode), Some(encode)) = (
        scalar_linearize_extended(src_trc),
        scalar_encode_extended(dst_trc),
    ) else {
        return false;
    };

    data.chunks_exact_mut(4).for_each(|px| {
        let (lr, lg, lb) = (decode(px[0]), decode(px[1]), decode(px[2]));
        let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
        px[0] = encode(mr);
        px[1] = encode(mg);
        px[2] = encode(mb);
    });
    true
}
#[cfg(target_arch = "x86_64")]
pub(crate) fn has_simd_encode(trc: TransferFunction) -> bool {
matches!(
trc,
TransferFunction::Srgb
| TransferFunction::Bt709
| TransferFunction::Pq
| TransferFunction::Hlg
| TransferFunction::Gamma22
)
}
#[cfg(not(target_arch = "x86_64"))]
pub(crate) fn has_simd_encode(_trc: TransferFunction) -> bool {
false
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn simd_encode_x8_dispatch(token: X64V3Token, trc: TransferFunction, v: [f32; 8]) -> [f32; 8] {
match trc {
TransferFunction::Srgb => trc_x8::linear_to_srgb_v3(token, v),
TransferFunction::Bt709 => trc_x8::linear_to_bt709_v3(token, v),
TransferFunction::Pq => trc_x8::linear_to_pq_v3(token, v),
TransferFunction::Hlg => trc_x8::linear_to_hlg_v3(token, v),
TransferFunction::Gamma22 => adobe_from_linear_x8(token, v),
_ => v,
}
}
/// Returns the scalar decode (encoded → linear) function for `trc`, or `None`
/// when this file has no scalar implementation for it.
///
/// `Linear` maps to the identity function.
pub(crate) fn scalar_linearize(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    let decode: fn(f32) -> f32 = match trc {
        TransferFunction::Srgb => linear_srgb::tf::srgb_to_linear,
        TransferFunction::Bt709 => linear_srgb::tf::bt709_to_linear,
        TransferFunction::Pq => linear_srgb::tf::pq_to_linear,
        TransferFunction::Hlg => linear_srgb::tf::hlg_to_linear,
        TransferFunction::Gamma22 => adobe_to_linear_scalar,
        TransferFunction::Linear => core::convert::identity,
        _ => return None,
    };
    Some(decode)
}
/// Returns the scalar encode (linear → encoded) function for `trc`, or `None`
/// when this file has no scalar implementation for it.
///
/// `Linear` maps to the identity function.
pub(crate) fn scalar_encode(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    let encode: fn(f32) -> f32 = match trc {
        TransferFunction::Srgb => linear_srgb::tf::linear_to_srgb,
        TransferFunction::Bt709 => linear_srgb::tf::linear_to_bt709,
        TransferFunction::Pq => linear_srgb::tf::linear_to_pq,
        TransferFunction::Hlg => linear_srgb::tf::linear_to_hlg,
        TransferFunction::Gamma22 => adobe_from_linear_scalar,
        TransferFunction::Linear => core::convert::identity,
        _ => return None,
    };
    Some(encode)
}
/// In-place f32 RGB conversion: decode with `src_trc`, apply `m`, encode with
/// `dst_trc`, choosing the fastest path available in this file.
///
/// Order of preference:
/// 1. `Linear` → `Linear`: matrix only.
/// 2. On x86_64, a macro-generated kernel for the listed transfer-function
///    pairs, invoked through `incant!`.
/// 3. A generic scalar loop built from [`scalar_linearize`] / [`scalar_encode`].
///
/// Returns `false` (leaving `data` untouched) when no scalar function exists
/// for either transfer function; returns `true` after converting otherwise.
/// Debug builds assert that `data.len()` is a multiple of 3.
pub(crate) fn convert_f32_rgb_dispatch(
m: &[[f32; 3]; 3],
data: &mut [f32],
src_trc: TransferFunction,
dst_trc: TransferFunction,
) -> bool {
use TransferFunction::*;
debug_assert_eq!(data.len() % 3, 0);
// Fast path: nothing to decode or encode.
if src_trc == Linear && dst_trc == Linear {
convert_linear_rgb(m, data);
return true;
}
// Fused kernels exist only for the pairs stamped out by `stamp_trc_kernels!`.
// NOTE(review): `incant!` presumably selects between the `_v3` and `_scalar`
// variants at run time - confirm against the archmage documentation.
#[cfg(target_arch = "x86_64")]
match (src_trc, dst_trc) {
(Srgb, Srgb) => {
incant!(convert_rgb_srgb(m, data));
return true;
}
(Bt709, Bt709) => {
incant!(convert_rgb_bt709(m, data));
return true;
}
(Pq, Pq) => {
incant!(convert_rgb_pq(m, data));
return true;
}
(Hlg, Hlg) => {
incant!(convert_rgb_hlg(m, data));
return true;
}
(Gamma22, Gamma22) => {
incant!(convert_rgb_adobe(m, data));
return true;
}
(Pq, Srgb) => {
incant!(convert_rgb_pq_to_srgb(m, data));
return true;
}
(Hlg, Srgb) => {
incant!(convert_rgb_hlg_to_srgb(m, data));
return true;
}
(Srgb, Pq) => {
incant!(convert_rgb_srgb_to_pq(m, data));
return true;
}
(Bt709, Srgb) => {
incant!(convert_rgb_bt709_to_srgb(m, data));
return true;
}
(Srgb, Bt709) => {
incant!(convert_rgb_srgb_to_bt709(m, data));
return true;
}
(Gamma22, Srgb) => {
incant!(convert_rgb_adobe_to_srgb(m, data));
return true;
}
(Srgb, Gamma22) => {
incant!(convert_rgb_srgb_to_adobe(m, data));
return true;
}
_ => {} }
// Generic fallback: any pair for which both scalar functions exist.
let Some(lin) = scalar_linearize(src_trc) else {
return false;
};
let Some(enc) = scalar_encode(dst_trc) else {
return false;
};
for pixel in data.chunks_exact_mut(3) {
let r = lin(pixel[0]);
let g = lin(pixel[1]);
let b = lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = enc(nr);
pixel[1] = enc(ng);
pixel[2] = enc(nb);
}
true
}
/// In-place f32 RGBA conversion: decode with `src_trc`, apply `m`, encode with
/// `dst_trc`, choosing the fastest path available in this file. The scalar
/// fallback below never touches `pixel[3]`.
///
/// Order of preference:
/// 1. `Linear` → `Linear`: matrix only.
/// 2. On x86_64, a macro-generated kernel for the listed transfer-function
///    pairs, invoked through `incant!`.
/// 3. A generic scalar loop built from [`scalar_linearize`] / [`scalar_encode`].
///
/// Returns `false` (leaving `data` untouched) when no scalar function exists
/// for either transfer function; returns `true` after converting otherwise.
/// Debug builds assert that `data.len()` is a multiple of 4.
pub(crate) fn convert_f32_rgba_dispatch(
m: &[[f32; 3]; 3],
data: &mut [f32],
src_trc: TransferFunction,
dst_trc: TransferFunction,
) -> bool {
use TransferFunction::*;
debug_assert_eq!(data.len() % 4, 0);
// Fast path: nothing to decode or encode.
if src_trc == Linear && dst_trc == Linear {
convert_linear_rgba(m, data);
return true;
}
// Fused kernels exist only for the pairs stamped out by `stamp_trc_kernels!`.
// NOTE(review): `incant!` presumably selects between the `_v3` and `_scalar`
// variants at run time - confirm against the archmage documentation.
#[cfg(target_arch = "x86_64")]
match (src_trc, dst_trc) {
(Srgb, Srgb) => {
incant!(convert_rgba_srgb(m, data));
return true;
}
(Bt709, Bt709) => {
incant!(convert_rgba_bt709(m, data));
return true;
}
(Pq, Pq) => {
incant!(convert_rgba_pq(m, data));
return true;
}
(Hlg, Hlg) => {
incant!(convert_rgba_hlg(m, data));
return true;
}
(Gamma22, Gamma22) => {
incant!(convert_rgba_adobe(m, data));
return true;
}
(Pq, Srgb) => {
incant!(convert_rgba_pq_to_srgb(m, data));
return true;
}
(Hlg, Srgb) => {
incant!(convert_rgba_hlg_to_srgb(m, data));
return true;
}
(Srgb, Pq) => {
incant!(convert_rgba_srgb_to_pq(m, data));
return true;
}
(Bt709, Srgb) => {
incant!(convert_rgba_bt709_to_srgb(m, data));
return true;
}
(Srgb, Bt709) => {
incant!(convert_rgba_srgb_to_bt709(m, data));
return true;
}
(Gamma22, Srgb) => {
incant!(convert_rgba_adobe_to_srgb(m, data));
return true;
}
(Srgb, Gamma22) => {
incant!(convert_rgba_srgb_to_adobe(m, data));
return true;
}
_ => {}
}
// Generic fallback: any pair for which both scalar functions exist.
let Some(lin) = scalar_linearize(src_trc) else {
return false;
};
let Some(enc) = scalar_encode(dst_trc) else {
return false;
};
for pixel in data.chunks_exact_mut(4) {
let r = lin(pixel[0]);
let g = lin(pixel[1]);
let b = lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = enc(nr);
pixel[1] = enc(ng);
pixel[2] = enc(nb);
}
true
}
/// Builds a heap-allocated 256-entry table where entry `i` holds
/// `linearize_fn(i / 255.0)`, i.e. the decode of every possible 8-bit code.
pub(crate) fn build_linearize_lut(linearize_fn: fn(f32) -> f32) -> alloc::boxed::Box<[f32; 256]> {
    let samples: alloc::vec::Vec<f32> = (0..256usize)
        .map(|code| linearize_fn(code as f32 / 255.0))
        .collect();
    // The vector always holds exactly 256 entries, so the conversion to a
    // fixed-size boxed array cannot fail.
    samples.into_boxed_slice().try_into().ok().unwrap()
}
/// Converts 8-bit RGB pixels from `src` into `dst` with fully scalar math.
///
/// Each channel is normalised (`code / 255.0`), decoded with `linearize_fn`,
/// transformed by `m`, encoded with `encode_fn`, and quantised as
/// `encoded * 255 + 0.5` clamped to `[0, 255]`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`. Only pixels complete in both slices are processed.
pub(crate) fn convert_u8_rgb(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    linearize_fn: fn(f32) -> f32,
    encode_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    let decode = |code: u8| linearize_fn(code as f32 / 255.0);
    let quantize = |linear: f32| (encode_fn(linear) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;

    let pixels = src.chunks_exact(3).zip(dst.chunks_exact_mut(3));
    for (input, output) in pixels {
        let (lr, lg, lb) = (decode(input[0]), decode(input[1]), decode(input[2]));
        let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
        output[0] = quantize(mr);
        output[1] = quantize(mg);
        output[2] = quantize(mb);
    }
}
pub(crate) fn scalar_encode_u8(trc: TransferFunction) -> Option<fn(f32) -> u8> {
fn srgb_u8(v: f32) -> u8 {
linear_srgb::default::linear_to_srgb_u8(v)
}
fn quantize_with(enc: fn(f32) -> f32, v: f32) -> u8 {
(enc(v) * 255.0 + 0.5).clamp(0.0, 255.0) as u8
}
match trc {
TransferFunction::Srgb => Some(srgb_u8),
TransferFunction::Bt709 => Some(|v| quantize_with(linear_srgb::tf::linear_to_bt709, v)),
TransferFunction::Pq => Some(|v| quantize_with(linear_srgb::tf::linear_to_pq, v)),
TransferFunction::Hlg => Some(|v| quantize_with(linear_srgb::tf::linear_to_hlg, v)),
TransferFunction::Gamma22 => Some(|v| quantize_with(adobe_from_linear_scalar, v)),
TransferFunction::Linear => Some(|v| (v * 255.0 + 0.5).clamp(0.0, 255.0) as u8),
_ => None,
}
}
/// Converts 8-bit RGB pixels from `src` into `dst` using a decode table and a
/// per-value 8-bit encoder.
///
/// Each source code indexes `lin_lut`, the resulting linear triple is
/// transformed by `m`, and every output channel is produced by `enc_u8`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`. Only pixels complete in both slices are processed.
pub(crate) fn convert_u8_rgb_lut_lut(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    let decode = |code: u8| lin_lut[code as usize];

    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(input, output)| {
            let (lr, lg, lb) = (decode(input[0]), decode(input[1]), decode(input[2]));
            let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
            output[0] = enc_u8(mr);
            output[1] = enc_u8(mg);
            output[2] = enc_u8(mb);
        });
}
/// Converts 8 interleaved 8-bit RGB pixels: table decode, SIMD matrix, SIMD
/// encode selected by `dst_trc`, then quantisation back to 8 bits.
///
/// * `src` / `dst` - 24 bytes each (8 pixels × 3 channels).
/// * `lin_lut` - decode table indexed by the 8-bit source code.
/// * `dst_trc` - forwarded to `simd_encode_x8_dispatch`, which returns its
///   input unchanged for transfer functions it has no encoder for.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u8_rgb_fused(
token: X64V3Token,
m: &[[f32; 3]; 3],
src: &[u8; 24],
dst: &mut [u8; 24],
lin_lut: &[f32; 256],
dst_trc: TransferFunction,
) {
// Gather: de-interleave and decode each channel through the table.
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
for i in 0..8 {
r[i] = lin_lut[src[i * 3] as usize];
g[i] = lin_lut[src[i * 3 + 1] as usize];
b[i] = lin_lut[src[i * 3 + 2] as usize];
}
let rv = mt_f32x8::from_array(token, r);
let gv = mt_f32x8::from_array(token, g);
let bv = mt_f32x8::from_array(token, b);
let (or, og, ob) = mat3x3_x8(token, m, rv, gv, bv);
let ro = simd_encode_x8_dispatch(token, dst_trc, or.to_array());
let go = simd_encode_x8_dispatch(token, dst_trc, og.to_array());
let bo = simd_encode_x8_dispatch(token, dst_trc, ob.to_array());
// Quantise with +0.5 for rounding. A float-to-`u8` `as` cast saturates
// (and maps NaN to 0), so no explicit clamp is needed here.
for i in 0..8 {
dst[i * 3] = (ro[i] * 255.0 + 0.5) as u8;
dst[i * 3 + 1] = (go[i] * 255.0 + 0.5) as u8;
dst[i * 3 + 2] = (bo[i] * 255.0 + 0.5) as u8;
}
}
/// Converts 8 interleaved 8-bit RGB pixels using a decode table, a SIMD matrix
/// multiply that handles two pixels per vector, and a 4096-entry encode table.
///
/// * `src` / `dst` - 24 bytes each (8 pixels × 3 channels).
/// * `lin_lut` - decode table indexed by the 8-bit source code.
/// * `enc_lut` - encode table indexed by the linear value scaled to `0..=4095`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u8_rgb_matlut(
token: X64V3Token,
m: &[[f32; 3]; 3],
src: &[u8; 24],
dst: &mut [u8; 24],
lin_lut: &[f32; 256],
enc_lut: &[u8; 4096],
) {
// Each vector holds one matrix COLUMN, duplicated in lanes 0-2 and 4-6
// (lanes 3 and 7 are zero padding), so one vector serves two pixels.
let m0 = mt_f32x8::from_array(
token,
[
m[0][0], m[1][0], m[2][0], 0.0, m[0][0], m[1][0], m[2][0], 0.0,
],
);
let m1 = mt_f32x8::from_array(
token,
[
m[0][1], m[1][1], m[2][1], 0.0, m[0][1], m[1][1], m[2][1], 0.0,
],
);
let m2 = mt_f32x8::from_array(
token,
[
m[0][2], m[1][2], m[2][2], 0.0, m[0][2], m[1][2], m[2][2], 0.0,
],
);
let scale = mt_f32x8::splat(token, 4095.0);
let zero = mt_f32x8::zero(token);
let one = mt_f32x8::splat(token, 1.0);
let mut temp = [0i32; 8];
// Four iterations × two pixels (6 bytes) = 8 pixels.
for pair in 0..4 {
let off = pair * 6;
let p0r = lin_lut[src[off] as usize];
let p0g = lin_lut[src[off + 1] as usize];
let p0b = lin_lut[src[off + 2] as usize];
let p1r = lin_lut[src[off + 3] as usize];
let p1g = lin_lut[src[off + 4] as usize];
let p1b = lin_lut[src[off + 5] as usize];
// Broadcast pixel 0's channel into the low four lanes, pixel 1's into the high four.
let r = mt_f32x8::from_array(token, [p0r, p0r, p0r, p0r, p1r, p1r, p1r, p1r]);
let g = mt_f32x8::from_array(token, [p0g, p0g, p0g, p0g, p1g, p1g, p1g, p1g]);
let b = mt_f32x8::from_array(token, [p0b, p0b, p0b, p0b, p1b, p1b, p1b, p1b]);
// Lanes 0-2 / 4-6 now hold the output R, G, B of pixel 0 / pixel 1.
let v = r * m0 + g * m1 + b * m2;
// Clamp to [0, 1] and scale to a table index.
// NOTE(review): `to_i32_round` presumably rounds to nearest - confirm in magetypes.
let clamped = v.max(zero).min(one);
let scaled = clamped * scale;
let idx = scaled.to_i32_round();
idx.store(&mut temp);
// The `& 0xFFF` mask keeps every index inside the 4096-entry table.
dst[off] = enc_lut[temp[0] as usize & 0xFFF];
dst[off + 1] = enc_lut[temp[1] as usize & 0xFFF];
dst[off + 2] = enc_lut[temp[2] as usize & 0xFFF];
dst[off + 3] = enc_lut[temp[4] as usize & 0xFFF];
dst[off + 4] = enc_lut[temp[5] as usize & 0xFFF];
dst[off + 5] = enc_lut[temp[6] as usize & 0xFFF];
}
}
/// Returns the process-wide table mapping every 8-bit sRGB code to its linear
/// value (`linear_srgb::default::srgb_u8_to_linear`).
///
/// The table is built on first use and stored in a `OnceBox`.
pub(crate) fn srgb_lin_lut_u8() -> &'static [f32; 256] {
    use once_cell::race::OnceBox;
    static LUT: OnceBox<[f32; 256]> = OnceBox::new();
    LUT.get_or_init(|| {
        let mut table = alloc::boxed::Box::new([0.0f32; 256]);
        for code in 0..=u8::MAX {
            table[code as usize] = linear_srgb::default::srgb_u8_to_linear(code);
        }
        table
    })
}
/// Returns the process-wide 4096-entry table mapping a quantised linear value
/// (`index / 4095.0`) to its 8-bit sRGB code
/// (`linear_srgb::default::linear_to_srgb_u8`).
///
/// The table is built on first use and stored in a `OnceBox`.
pub(crate) fn srgb_enc_lut_u8() -> &'static [u8; 4096] {
    use once_cell::race::OnceBox;
    static LUT: OnceBox<[u8; 4096]> = OnceBox::new();
    LUT.get_or_init(|| {
        let mut table = alloc::boxed::Box::new([0u8; 4096]);
        table.iter_mut().enumerate().for_each(|(step, entry)| {
            *entry = linear_srgb::default::linear_to_srgb_u8(step as f32 / 4095.0);
        });
        table
    })
}
/// Private alias for [`srgb_enc_lut_u8`]; returns the same 4096-entry sRGB
/// encode table.
fn srgb_enc_lut_4096() -> &'static [u8; 4096] {
srgb_enc_lut_u8()
}
/// Returns the process-wide table mapping every 16-bit sRGB code to its linear
/// value (`linear_srgb::default::srgb_u16_to_linear`).
///
/// The table is built on first use and stored in a `OnceBox`. It is assembled
/// in a `Vec` and then converted to a boxed array, so the 65536 entries are
/// never placed on the stack as an array literal.
pub(crate) fn srgb_lin_lut_u16() -> &'static [f32; 65536] {
    use once_cell::race::OnceBox;
    static LUT: OnceBox<[f32; 65536]> = OnceBox::new();
    LUT.get_or_init(|| -> alloc::boxed::Box<[f32; 65536]> {
        let entries: alloc::vec::Vec<f32> = (0..=u16::MAX)
            .map(linear_srgb::default::srgb_u16_to_linear)
            .collect();
        // Exactly 65536 entries were collected, so the conversion cannot fail.
        entries.into_boxed_slice().try_into().ok().unwrap()
    })
}
/// Returns the process-wide 65536-entry table mapping a quantised linear value
/// (`index / 65535.0`) to its 16-bit sRGB code
/// (`linear_srgb::default::linear_to_srgb_u16`).
///
/// The table is built on first use and stored in a `OnceBox`. It is assembled
/// in a `Vec` and then converted to a boxed array, so the 65536 entries are
/// never placed on the stack as an array literal.
pub(crate) fn srgb_enc_lut_u16() -> &'static [u16; 65536] {
    use once_cell::race::OnceBox;
    static LUT: OnceBox<[u16; 65536]> = OnceBox::new();
    LUT.get_or_init(|| -> alloc::boxed::Box<[u16; 65536]> {
        let entries: alloc::vec::Vec<u16> = (0..=u16::MAX)
            .map(|step| linear_srgb::default::linear_to_srgb_u16(f32::from(step) / 65535.0))
            .collect();
        // Exactly 65536 entries were collected, so the conversion cannot fail.
        entries.into_boxed_slice().try_into().ok().unwrap()
    })
}
/// Converts 8 interleaved 16-bit RGB pixels using a decode table, a SIMD
/// matrix multiply that handles two pixels per vector, and a 65536-entry
/// encode table.
///
/// * `src` / `dst` - 24 values each (8 pixels × 3 channels).
/// * `lin_lut` - decode table indexed by the 16-bit source code.
/// * `enc_lut` - encode table indexed by the linear value scaled to `0..=65535`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u16_rgb_matlut(
token: X64V3Token,
m: &[[f32; 3]; 3],
src: &[u16; 24],
dst: &mut [u16; 24],
lin_lut: &[f32; 65536],
enc_lut: &[u16; 65536],
) {
// Each vector holds one matrix COLUMN, duplicated in lanes 0-2 and 4-6
// (lanes 3 and 7 are zero padding), so one vector serves two pixels.
let m0 = mt_f32x8::from_array(
token,
[
m[0][0], m[1][0], m[2][0], 0.0, m[0][0], m[1][0], m[2][0], 0.0,
],
);
let m1 = mt_f32x8::from_array(
token,
[
m[0][1], m[1][1], m[2][1], 0.0, m[0][1], m[1][1], m[2][1], 0.0,
],
);
let m2 = mt_f32x8::from_array(
token,
[
m[0][2], m[1][2], m[2][2], 0.0, m[0][2], m[1][2], m[2][2], 0.0,
],
);
let scale = mt_f32x8::splat(token, 65535.0);
let zero = mt_f32x8::zero(token);
let one = mt_f32x8::splat(token, 1.0);
let mut temp = [0i32; 8];
// Four iterations × two pixels (6 values) = 8 pixels.
for pair in 0..4 {
let off = pair * 6;
let p0r = lin_lut[src[off] as usize];
let p0g = lin_lut[src[off + 1] as usize];
let p0b = lin_lut[src[off + 2] as usize];
let p1r = lin_lut[src[off + 3] as usize];
let p1g = lin_lut[src[off + 4] as usize];
let p1b = lin_lut[src[off + 5] as usize];
// Broadcast pixel 0's channel into the low four lanes, pixel 1's into the high four.
let r = mt_f32x8::from_array(token, [p0r, p0r, p0r, p0r, p1r, p1r, p1r, p1r]);
let g = mt_f32x8::from_array(token, [p0g, p0g, p0g, p0g, p1g, p1g, p1g, p1g]);
let b = mt_f32x8::from_array(token, [p0b, p0b, p0b, p0b, p1b, p1b, p1b, p1b]);
// Lanes 0-2 / 4-6 now hold the output R, G, B of pixel 0 / pixel 1.
let v = r * m0 + g * m1 + b * m2;
// Clamp to [0, 1] and scale to a table index.
// NOTE(review): `to_i32_round` presumably rounds to nearest - confirm in magetypes.
let clamped = v.max(zero).min(one);
let scaled = clamped * scale;
let idx = scaled.to_i32_round();
idx.store(&mut temp);
// The `& 0xFFFF` mask keeps every index inside the 65536-entry table.
dst[off] = enc_lut[temp[0] as usize & 0xFFFF];
dst[off + 1] = enc_lut[temp[1] as usize & 0xFFFF];
dst[off + 2] = enc_lut[temp[2] as usize & 0xFFFF];
dst[off + 3] = enc_lut[temp[4] as usize & 0xFFFF];
dst[off + 4] = enc_lut[temp[5] as usize & 0xFFFF];
dst[off + 5] = enc_lut[temp[6] as usize & 0xFFFF];
}
}
/// x86_64 driver for 16-bit RGB conversion through decode/encode tables.
///
/// Groups of 8 pixels go through [`convert_8px_u16_rgb_matlut`]; the remaining
/// whole pixels are converted one by one with the scalar matrix and the same
/// tables. The pixel count is derived from `src.len() / 3`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u16_rgb_matlut_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u16],
    dst: &mut [u16],
    lin_lut: &[f32; 65536],
    enc_lut: &[u16; 65536],
) {
    // Linear value → encode-table index (clamp, scale, +0.5, truncate, mask).
    let lut_index = |linear: f32| ((linear.clamp(0.0, 1.0) * 65535.0 + 0.5) as usize) & 0xFFFF;

    let total_px = src.len() / 3;
    let simd_px = total_px - total_px % 8;

    for group in 0..simd_px / 8 {
        let first = group * 24;
        let input: &[u16; 24] = src[first..first + 24].try_into().unwrap();
        let output: &mut [u16; 24] = (&mut dst[first..first + 24]).try_into().unwrap();
        convert_8px_u16_rgb_matlut(token, m, input, output, lin_lut, enc_lut);
    }

    for px in simd_px..total_px {
        let at = px * 3;
        let lr = lin_lut[src[at] as usize];
        let lg = lin_lut[src[at + 1] as usize];
        let lb = lin_lut[src[at + 2] as usize];
        let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
        dst[at] = enc_lut[lut_index(mr)];
        dst[at + 1] = enc_lut[lut_index(mg)];
        dst[at + 2] = enc_lut[lut_index(mb)];
    }
}
/// Scalar 16-bit RGB conversion through decode/encode tables.
///
/// Every pixel complete in both slices is decoded via `lin_lut`, transformed
/// by `m`, clamped to `[0, 1]`, scaled to `0..=65535` and looked up in
/// `enc_lut`.
fn convert_u16_rgb_matlut_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u16],
    dst: &mut [u16],
    lin_lut: &[f32; 65536],
    enc_lut: &[u16; 65536],
) {
    let encode =
        |linear: f32| enc_lut[((linear.clamp(0.0, 1.0) * 65535.0 + 0.5) as usize) & 0xFFFF];

    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(input, output)| {
            let lr = lin_lut[input[0] as usize];
            let lg = lin_lut[input[1] as usize];
            let lb = lin_lut[input[2] as usize];
            let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
            output[0] = encode(mr);
            output[1] = encode(mg);
            output[2] = encode(mb);
        });
}
/// Converts 16-bit RGB pixels from `src` into `dst` through the supplied
/// decode (`lin_lut`) and encode (`enc_lut`) tables and the matrix `m`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`.
pub(crate) fn convert_u16_rgb_simd_matlut(
m: &[[f32; 3]; 3],
src: &[u16],
dst: &mut [u16],
lin_lut: &[f32; 65536],
enc_lut: &[u16; 65536],
) {
debug_assert_eq!(src.len() % 3, 0);
debug_assert_eq!(src.len(), dst.len());
// x86_64: let `incant!` pick the implementation.
// NOTE(review): presumably it resolves to `convert_u16_rgb_matlut_v3` or
// `convert_u16_rgb_matlut_scalar` at run time - confirm against archmage docs.
#[cfg(target_arch = "x86_64")]
{
incant!(convert_u16_rgb_matlut(m, src, dst, lin_lut, enc_lut));
return;
}
// Every other target calls the scalar implementation directly.
#[cfg(not(target_arch = "x86_64"))]
convert_u16_rgb_matlut_scalar(ScalarToken, m, src, dst, lin_lut, enc_lut);
}
/// Converts 8 interleaved 8-bit RGB pixels to matrix-transformed linear `f32`
/// values. No encode step and no clamping are applied to the output.
///
/// * `src` - 24 bytes (8 pixels × 3 channels).
/// * `dst` - 24 floats receiving the transformed linear channels.
/// * `lin_lut` - decode table indexed by the 8-bit source code.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u8_to_f32_lin(
token: X64V3Token,
m: &[[f32; 3]; 3],
src: &[u8; 24],
dst: &mut [f32; 24],
lin_lut: &[f32; 256],
) {
// Each vector holds one matrix COLUMN, duplicated in lanes 0-2 and 4-6
// (lanes 3 and 7 are zero padding), so one vector serves two pixels.
let m0 = mt_f32x8::from_array(
token,
[
m[0][0], m[1][0], m[2][0], 0.0, m[0][0], m[1][0], m[2][0], 0.0,
],
);
let m1 = mt_f32x8::from_array(
token,
[
m[0][1], m[1][1], m[2][1], 0.0, m[0][1], m[1][1], m[2][1], 0.0,
],
);
let m2 = mt_f32x8::from_array(
token,
[
m[0][2], m[1][2], m[2][2], 0.0, m[0][2], m[1][2], m[2][2], 0.0,
],
);
// Four iterations × two pixels = 8 pixels.
for pair in 0..4 {
// Input and output use the same element offset (3 channels per pixel on both sides).
let off = pair * 6;
let off_out = pair * 6;
let p0r = lin_lut[src[off] as usize];
let p0g = lin_lut[src[off + 1] as usize];
let p0b = lin_lut[src[off + 2] as usize];
let p1r = lin_lut[src[off + 3] as usize];
let p1g = lin_lut[src[off + 4] as usize];
let p1b = lin_lut[src[off + 5] as usize];
// Broadcast pixel 0's channel into the low four lanes, pixel 1's into the high four.
let r = mt_f32x8::from_array(token, [p0r, p0r, p0r, p0r, p1r, p1r, p1r, p1r]);
let g = mt_f32x8::from_array(token, [p0g, p0g, p0g, p0g, p1g, p1g, p1g, p1g]);
let b = mt_f32x8::from_array(token, [p0b, p0b, p0b, p0b, p1b, p1b, p1b, p1b]);
let v = r * m0 + g * m1 + b * m2;
let out = v.to_array();
// Lanes 3 and 7 are padding and are skipped.
dst[off_out] = out[0];
dst[off_out + 1] = out[1];
dst[off_out + 2] = out[2];
dst[off_out + 3] = out[4];
dst[off_out + 4] = out[5];
dst[off_out + 5] = out[6];
}
}
/// x86_64 driver converting 8-bit RGB to matrix-transformed linear `f32`.
///
/// Groups of 8 pixels go through [`convert_8px_u8_to_f32_lin`]; the remaining
/// whole pixels are decoded via `lin_lut` and transformed with the scalar
/// matrix. The pixel count is derived from `src.len() / 3`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u8_to_f32_lin_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [f32],
    lin_lut: &[f32; 256],
) {
    let total_px = src.len() / 3;
    let simd_px = total_px - total_px % 8;

    for group in 0..simd_px / 8 {
        let first = group * 24;
        let input: &[u8; 24] = src[first..first + 24].try_into().unwrap();
        let output: &mut [f32; 24] = (&mut dst[first..first + 24]).try_into().unwrap();
        convert_8px_u8_to_f32_lin(token, m, input, output, lin_lut);
    }

    for px in simd_px..total_px {
        let at = px * 3;
        let lr = lin_lut[src[at] as usize];
        let lg = lin_lut[src[at + 1] as usize];
        let lb = lin_lut[src[at + 2] as usize];
        let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
        dst[at] = mr;
        dst[at + 1] = mg;
        dst[at + 2] = mb;
    }
}
/// Scalar conversion of 8-bit RGB to matrix-transformed linear `f32`.
///
/// Every pixel complete in both slices is decoded via `lin_lut` and
/// transformed by `m`; no encode step is applied.
fn convert_u8_to_f32_lin_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [f32],
    lin_lut: &[f32; 256],
) {
    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(input, output)| {
            let lr = lin_lut[input[0] as usize];
            let lg = lin_lut[input[1] as usize];
            let lb = lin_lut[input[2] as usize];
            let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
            output.copy_from_slice(&[mr, mg, mb]);
        });
}
/// Converts 8-bit RGB pixels from `src` into matrix-transformed linear `f32`
/// values in `dst`, decoding through `lin_lut`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`.
pub(crate) fn convert_u8_to_f32_lin_simd(
m: &[[f32; 3]; 3],
src: &[u8],
dst: &mut [f32],
lin_lut: &[f32; 256],
) {
debug_assert_eq!(src.len() % 3, 0);
debug_assert_eq!(src.len(), dst.len());
// x86_64: let `incant!` pick the implementation.
// NOTE(review): presumably it resolves to `convert_u8_to_f32_lin_v3` or
// `convert_u8_to_f32_lin_scalar` at run time - confirm against archmage docs.
#[cfg(target_arch = "x86_64")]
{
incant!(convert_u8_to_f32_lin(m, src, dst, lin_lut));
return;
}
// Every other target calls the scalar implementation directly.
#[cfg(not(target_arch = "x86_64"))]
convert_u8_to_f32_lin_scalar(ScalarToken, m, src, dst, lin_lut);
}
/// Converts 8 interleaved linear `f32` RGB pixels to 8-bit codes: SIMD matrix
/// multiply (two pixels per vector) followed by a 4096-entry encode table.
///
/// * `src` - 24 floats (8 pixels × 3 channels), used as-is (no decode step).
/// * `dst` - 24 bytes receiving the encoded channels.
/// * `enc_lut` - encode table indexed by the linear value scaled to `0..=4095`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_f32_lin_to_u8(
token: X64V3Token,
m: &[[f32; 3]; 3],
src: &[f32; 24],
dst: &mut [u8; 24],
enc_lut: &[u8; 4096],
) {
// Each vector holds one matrix COLUMN, duplicated in lanes 0-2 and 4-6
// (lanes 3 and 7 are zero padding), so one vector serves two pixels.
let m0 = mt_f32x8::from_array(
token,
[
m[0][0], m[1][0], m[2][0], 0.0, m[0][0], m[1][0], m[2][0], 0.0,
],
);
let m1 = mt_f32x8::from_array(
token,
[
m[0][1], m[1][1], m[2][1], 0.0, m[0][1], m[1][1], m[2][1], 0.0,
],
);
let m2 = mt_f32x8::from_array(
token,
[
m[0][2], m[1][2], m[2][2], 0.0, m[0][2], m[1][2], m[2][2], 0.0,
],
);
let scale = mt_f32x8::splat(token, 4095.0);
let zero = mt_f32x8::zero(token);
let one = mt_f32x8::splat(token, 1.0);
let mut temp = [0i32; 8];
// Four iterations × two pixels (6 floats) = 8 pixels.
for pair in 0..4 {
let off = pair * 6;
// Red of pixel 0 in the low four lanes, red of pixel 1 in the high four.
let r = mt_f32x8::from_array(
token,
[
src[off],
src[off],
src[off],
src[off],
src[off + 3],
src[off + 3],
src[off + 3],
src[off + 3],
],
);
// Green, same layout.
let g = mt_f32x8::from_array(
token,
[
src[off + 1],
src[off + 1],
src[off + 1],
src[off + 1],
src[off + 4],
src[off + 4],
src[off + 4],
src[off + 4],
],
);
// Blue, same layout.
let b = mt_f32x8::from_array(
token,
[
src[off + 2],
src[off + 2],
src[off + 2],
src[off + 2],
src[off + 5],
src[off + 5],
src[off + 5],
src[off + 5],
],
);
// Lanes 0-2 / 4-6 now hold the output R, G, B of pixel 0 / pixel 1.
let v = r * m0 + g * m1 + b * m2;
// Clamp to [0, 1] and scale to a table index.
// NOTE(review): `to_i32_round` presumably rounds to nearest - confirm in magetypes.
let clamped = v.max(zero).min(one);
let scaled = clamped * scale;
let idx = scaled.to_i32_round();
idx.store(&mut temp);
// The `& 0xFFF` mask keeps every index inside the 4096-entry table.
dst[off] = enc_lut[temp[0] as usize & 0xFFF];
dst[off + 1] = enc_lut[temp[1] as usize & 0xFFF];
dst[off + 2] = enc_lut[temp[2] as usize & 0xFFF];
dst[off + 3] = enc_lut[temp[4] as usize & 0xFFF];
dst[off + 4] = enc_lut[temp[5] as usize & 0xFFF];
dst[off + 5] = enc_lut[temp[6] as usize & 0xFFF];
}
}
/// x86_64 driver converting linear `f32` RGB to 8-bit codes through a
/// 4096-entry encode table.
///
/// Groups of 8 pixels go through [`convert_8px_f32_lin_to_u8`]; the remaining
/// whole pixels use the scalar matrix and the same table. The pixel count is
/// derived from `src.len() / 3`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_f32_lin_to_u8_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[f32],
    dst: &mut [u8],
    enc_lut: &[u8; 4096],
) {
    // Linear value → encode-table index (clamp, scale, +0.5, truncate, mask).
    let lut_index = |linear: f32| ((linear.clamp(0.0, 1.0) * 4095.0 + 0.5) as usize) & 0xFFF;

    let total_px = src.len() / 3;
    let simd_px = total_px - total_px % 8;

    for group in 0..simd_px / 8 {
        let first = group * 24;
        let input: &[f32; 24] = src[first..first + 24].try_into().unwrap();
        let output: &mut [u8; 24] = (&mut dst[first..first + 24]).try_into().unwrap();
        convert_8px_f32_lin_to_u8(token, m, input, output, enc_lut);
    }

    for px in simd_px..total_px {
        let at = px * 3;
        let (mr, mg, mb) = mat3x3(m, src[at], src[at + 1], src[at + 2]);
        dst[at] = enc_lut[lut_index(mr)];
        dst[at + 1] = enc_lut[lut_index(mg)];
        dst[at + 2] = enc_lut[lut_index(mb)];
    }
}
/// Scalar conversion of linear `f32` RGB to 8-bit codes.
///
/// Every pixel complete in both slices is transformed by `m`, clamped to
/// `[0, 1]`, scaled to `0..=4095` and looked up in `enc_lut`.
fn convert_f32_lin_to_u8_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[f32],
    dst: &mut [u8],
    enc_lut: &[u8; 4096],
) {
    let encode = |linear: f32| enc_lut[((linear.clamp(0.0, 1.0) * 4095.0 + 0.5) as usize) & 0xFFF];

    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(input, output)| {
            let (mr, mg, mb) = mat3x3(m, input[0], input[1], input[2]);
            output[0] = encode(mr);
            output[1] = encode(mg);
            output[2] = encode(mb);
        });
}
/// Converts linear `f32` RGB pixels from `src` into 8-bit codes in `dst`,
/// applying the matrix `m` and encoding through the 4096-entry `enc_lut`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`.
pub(crate) fn convert_f32_lin_to_u8_simd(
m: &[[f32; 3]; 3],
src: &[f32],
dst: &mut [u8],
enc_lut: &[u8; 4096],
) {
debug_assert_eq!(src.len() % 3, 0);
debug_assert_eq!(src.len(), dst.len());
// x86_64: let `incant!` pick the implementation.
// NOTE(review): presumably it resolves to `convert_f32_lin_to_u8_v3` or
// `convert_f32_lin_to_u8_scalar` at run time - confirm against archmage docs.
#[cfg(target_arch = "x86_64")]
{
incant!(convert_f32_lin_to_u8(m, src, dst, enc_lut));
return;
}
// Every other target calls the scalar implementation directly.
#[cfg(not(target_arch = "x86_64"))]
convert_f32_lin_to_u8_scalar(ScalarToken, m, src, dst, enc_lut);
}
/// x86_64 driver for 8-bit RGB conversion: table decode, matrix, table/func
/// encode.
///
/// Groups of 8 pixels go through [`convert_8px_u8_rgb_matlut`] with the
/// 4096-entry sRGB encode table; the remaining whole pixels are encoded with
/// `enc_u8`. The pixel count is derived from `src.len() / 3`.
///
/// NOTE(review): the vector path always encodes through the sRGB table while
/// the tail uses `enc_u8` - presumably callers only pass an sRGB encoder here;
/// confirm at the call sites.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u8_rgb_matlut_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    let total_px = src.len() / 3;
    let simd_px = total_px - total_px % 8;
    let srgb_table = srgb_enc_lut_4096();

    for group in 0..simd_px / 8 {
        let first = group * 24;
        let input: &[u8; 24] = src[first..first + 24].try_into().unwrap();
        let output: &mut [u8; 24] = (&mut dst[first..first + 24]).try_into().unwrap();
        convert_8px_u8_rgb_matlut(token, m, input, output, lin_lut, srgb_table);
    }

    for px in simd_px..total_px {
        let at = px * 3;
        let lr = lin_lut[src[at] as usize];
        let lg = lin_lut[src[at + 1] as usize];
        let lb = lin_lut[src[at + 2] as usize];
        let (mr, mg, mb) = mat3x3(m, lr, lg, lb);
        dst[at] = enc_u8(mr);
        dst[at + 1] = enc_u8(mg);
        dst[at + 2] = enc_u8(mb);
    }
}
/// Scalar conversion of packed `u8` RGB pixels from `src` to `dst`.
///
/// Each input channel is looked up in `lin_lut`, the matrix `m` is applied
/// through `mat3x3`, and each resulting channel is passed to `enc_u8` to
/// produce the output byte.
///
/// Only whole three-byte chunks present in both slices are processed; any
/// surplus in either slice is left untouched. `_token` is unused by the body.
fn convert_u8_rgb_matlut_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    // A `u8` code is always a valid index into the 256-entry table.
    let linear = |code: u8| lin_lut[usize::from(code)];

    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(rgb_in, rgb_out)| {
            let (nr, ng, nb) = mat3x3(
                m,
                linear(rgb_in[0]),
                linear(rgb_in[1]),
                linear(rgb_in[2]),
            );
            rgb_out[0] = enc_u8(nr);
            rgb_out[1] = enc_u8(ng);
            rgb_out[2] = enc_u8(nb);
        });
}
/// Entry point for the `u8` RGB conversion that uses `lin_lut` and `enc_u8`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and that `src` and
/// `dst` have the same length. On x86-64 the call goes through `incant!` for
/// the `convert_u8_rgb_matlut` family; on every other architecture the scalar
/// implementation is called directly.
// NOTE(review): `incant!` presumably picks between the `_v3` and `_scalar`
// variants at runtime; confirm against the archmage documentation.
pub(crate) fn convert_u8_rgb_simd_matlut(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    // Exactly one of the two statements below survives cfg evaluation.
    #[cfg(target_arch = "x86_64")]
    incant!(convert_u8_rgb_matlut(m, src, dst, lin_lut, enc_u8));

    #[cfg(not(target_arch = "x86_64"))]
    convert_u8_rgb_matlut_scalar(ScalarToken, m, src, dst, lin_lut, enc_u8);
}
/// Converts packed `u8` RGB pixels from `src` to `dst`, eight pixels at a time.
///
/// Every complete group of eight pixels (24 bytes) is handed to
/// `convert_8px_u8_rgb_fused` together with `lin_lut` and `dst_trc`. The
/// 0..=7 leftover pixels are processed one at a time: each channel is looked
/// up in `lin_lut`, the matrix `m` is applied through `mat3x3`, and each
/// result is run through `scalar_enc`, scaled by 255 with `+ 0.5` rounding and
/// clamped to `0..=255`.
///
/// Bytes past the last whole pixel of `src` are ignored. Indexing panics if
/// `dst` is too short for the pixels being written.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u8_rgb_fused_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    dst_trc: TransferFunction,
    scalar_enc: fn(f32) -> f32,
) {
    let total_px = src.len() / 3;
    let full_groups = total_px / 8;

    // Eight-pixel groups: group `k` covers bytes `k * 24 .. k * 24 + 24`.
    for group in 0..full_groups {
        let start = group * 24;
        let end = start + 24;
        let input: &[u8; 24] = src[start..end].try_into().unwrap();
        let output: &mut [u8; 24] = (&mut dst[start..end]).try_into().unwrap();
        convert_8px_u8_rgb_fused(token, m, input, output, lin_lut, dst_trc);
    }

    // Encode one channel and round it to the nearest byte value.
    let quantize = |lin: f32| -> u8 { (scalar_enc(lin) * 255.0 + 0.5).clamp(0.0, 255.0) as u8 };

    // Leftover pixels that do not fill a whole group.
    for px in full_groups * 8..total_px {
        let at = px * 3;
        let r = lin_lut[usize::from(src[at])];
        let g = lin_lut[usize::from(src[at + 1])];
        let b = lin_lut[usize::from(src[at + 2])];
        let (nr, ng, nb) = mat3x3(m, r, g, b);
        dst[at] = quantize(nr);
        dst[at + 1] = quantize(ng);
        dst[at + 2] = quantize(nb);
    }
}
/// Scalar conversion of packed `u8` RGB pixels from `src` to `dst`.
///
/// Each input channel is looked up in `lin_lut`, the matrix `m` is applied
/// through `mat3x3`, and each resulting channel is run through `scalar_enc`,
/// scaled by 255 with `+ 0.5` rounding and clamped to `0..=255`.
///
/// Only whole three-byte chunks present in both slices are processed; any
/// surplus in either slice is left untouched. `_token` and `_dst_trc` are
/// unused by the body.
fn convert_u8_rgb_fused_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    _dst_trc: TransferFunction,
    scalar_enc: fn(f32) -> f32,
) {
    // A `u8` code is always a valid index into the 256-entry table.
    let linear = |code: u8| lin_lut[usize::from(code)];
    // Encode one channel and round it to the nearest byte value.
    let quantize = |lin: f32| -> u8 { (scalar_enc(lin) * 255.0 + 0.5).clamp(0.0, 255.0) as u8 };

    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(rgb_in, rgb_out)| {
            let (nr, ng, nb) = mat3x3(
                m,
                linear(rgb_in[0]),
                linear(rgb_in[1]),
                linear(rgb_in[2]),
            );
            rgb_out[0] = quantize(nr);
            rgb_out[1] = quantize(ng);
            rgb_out[2] = quantize(nb);
        });
}
/// Entry point for the `u8` RGB conversion that takes `dst_trc` and `scalar_enc`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and that `src` and
/// `dst` have the same length. On x86-64 the call goes through `incant!` for
/// the `convert_u8_rgb_fused` family; on every other architecture the scalar
/// implementation is called directly.
// NOTE(review): `incant!` presumably picks between the `_v3` and `_scalar`
// variants at runtime; confirm against the archmage documentation.
pub(crate) fn convert_u8_rgb_simd_fused(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    dst_trc: TransferFunction,
    scalar_enc: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    // Exactly one of the two statements below survives cfg evaluation.
    #[cfg(target_arch = "x86_64")]
    incant!(convert_u8_rgb_fused(m, src, dst, lin_lut, dst_trc, scalar_enc));

    #[cfg(not(target_arch = "x86_64"))]
    convert_u8_rgb_fused_scalar(ScalarToken, m, src, dst, lin_lut, dst_trc, scalar_enc);
}
/// Converts exactly eight RGBA `u8` pixels: reads `src[0..32]`, writes `dst[0..32]`.
///
/// The R, G and B bytes of each pixel are looked up in `lin_lut` and gathered
/// into one eight-lane array per channel. The matrix `m` is applied to all
/// eight pixels at once through `mat3x3_x8`, each resulting lane is passed to
/// `enc_u8`, and the alpha byte is copied from `src` unchanged.
///
/// Indexing panics if either slice holds fewer than 32 bytes.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u8_rgba_simd(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    // De-interleave: planes[0] = R lanes, planes[1] = G lanes, planes[2] = B lanes.
    let mut planes = [[0.0f32; 8]; 3];
    for lane in 0..8 {
        let at = lane * 4;
        planes[0][lane] = lin_lut[usize::from(src[at])];
        planes[1][lane] = lin_lut[usize::from(src[at + 1])];
        planes[2][lane] = lin_lut[usize::from(src[at + 2])];
    }
    let [r_in, g_in, b_in] = planes;

    let (r_vec, g_vec, b_vec) = mat3x3_x8(
        token,
        m,
        mt_f32x8::from_array(token, r_in),
        mt_f32x8::from_array(token, g_in),
        mt_f32x8::from_array(token, b_in),
    );
    let (r_out, g_out, b_out) = (r_vec.to_array(), g_vec.to_array(), b_vec.to_array());

    // Re-interleave the encoded channels; alpha passes through untouched.
    for lane in 0..8 {
        let at = lane * 4;
        dst[at] = enc_u8(r_out[lane]);
        dst[at + 1] = enc_u8(g_out[lane]);
        dst[at + 2] = enc_u8(b_out[lane]);
        dst[at + 3] = src[at + 3];
    }
}
/// Converts packed `u8` RGBA pixels from `src` to `dst`, eight pixels at a time.
///
/// Every complete group of eight pixels (32 bytes) is handed to
/// `convert_8px_u8_rgba_simd`. The 0..=7 leftover pixels are processed one at
/// a time: R, G and B are looked up in `lin_lut`, the matrix `m` is applied
/// through `mat3x3`, each result is passed to `enc_u8`, and alpha is copied
/// unchanged.
///
/// Bytes past the last whole pixel of `src` are ignored. Indexing panics if
/// `dst` is too short for the pixels being written.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u8_rgba_lut_simd_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    let total_px = src.len() / 4;
    let full_groups = total_px / 8;

    // Eight-pixel groups: group `k` covers bytes `k * 32 .. k * 32 + 32`.
    for group in 0..full_groups {
        let start = group * 32;
        let end = start + 32;
        let input = &src[start..end];
        let output = &mut dst[start..end];
        convert_8px_u8_rgba_simd(token, m, input, output, lin_lut, enc_u8);
    }

    // Leftover pixels that do not fill a whole group.
    for px in full_groups * 8..total_px {
        let at = px * 4;
        let r = lin_lut[usize::from(src[at])];
        let g = lin_lut[usize::from(src[at + 1])];
        let b = lin_lut[usize::from(src[at + 2])];
        let (nr, ng, nb) = mat3x3(m, r, g, b);
        dst[at] = enc_u8(nr);
        dst[at + 1] = enc_u8(ng);
        dst[at + 2] = enc_u8(nb);
        dst[at + 3] = src[at + 3];
    }
}
/// Scalar conversion of packed `u8` RGBA pixels from `src` to `dst`.
///
/// R, G and B are looked up in `lin_lut`, the matrix `m` is applied through
/// `mat3x3`, each resulting channel is passed to `enc_u8`, and the alpha byte
/// is copied unchanged.
///
/// Only whole four-byte chunks present in both slices are processed; any
/// surplus in either slice is left untouched. `_token` is unused by the body.
fn convert_u8_rgba_lut_simd_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    // A `u8` code is always a valid index into the 256-entry table.
    let linear = |code: u8| lin_lut[usize::from(code)];

    src.chunks_exact(4)
        .zip(dst.chunks_exact_mut(4))
        .for_each(|(rgba_in, rgba_out)| {
            let (nr, ng, nb) = mat3x3(
                m,
                linear(rgba_in[0]),
                linear(rgba_in[1]),
                linear(rgba_in[2]),
            );
            rgba_out[0] = enc_u8(nr);
            rgba_out[1] = enc_u8(ng);
            rgba_out[2] = enc_u8(nb);
            rgba_out[3] = rgba_in[3];
        });
}
/// Entry point for the `u8` RGBA conversion that uses `lin_lut` and `enc_u8`.
///
/// Debug builds assert that `src.len()` is a multiple of 4 and that `src` and
/// `dst` have the same length. On x86-64 the call goes through `incant!` for
/// the `convert_u8_rgba_lut_simd` family; on every other architecture the
/// scalar implementation is called directly.
// NOTE(review): `incant!` presumably picks between the `_v3` and `_scalar`
// variants at runtime; confirm against the archmage documentation.
pub(crate) fn convert_u8_rgba_simd_lut(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    debug_assert_eq!(src.len() % 4, 0);
    debug_assert_eq!(src.len(), dst.len());

    // Exactly one of the two statements below survives cfg evaluation.
    #[cfg(target_arch = "x86_64")]
    incant!(convert_u8_rgba_lut_simd(m, src, dst, lin_lut, enc_u8));

    #[cfg(not(target_arch = "x86_64"))]
    convert_u8_rgba_lut_simd_scalar(ScalarToken, m, src, dst, lin_lut, enc_u8);
}
/// Converts packed `u8` RGBA pixels from `src` to `dst` using function-pointer
/// transfer curves.
///
/// Per pixel: each of R, G, B is divided by 255 and passed to `linearize_fn`,
/// the matrix `m` is applied through `mat3x3`, and each result is passed to
/// `encode_fn`, scaled by 255 with `+ 0.5` rounding and clamped to `0..=255`.
/// The alpha byte is copied unchanged.
///
/// Debug builds assert that `src.len()` is a multiple of 4 and that `src` and
/// `dst` have the same length. Only whole four-byte chunks present in both
/// slices are processed.
pub(crate) fn convert_u8_rgba(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    linearize_fn: fn(f32) -> f32,
    encode_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 4, 0);
    debug_assert_eq!(src.len(), dst.len());

    // Byte code -> value in [0, 1] -> caller's linearisation.
    let decode = |code: u8| linearize_fn(f32::from(code) / 255.0);
    // Caller's encoding -> nearest byte value.
    let quantize = |lin: f32| -> u8 { (encode_fn(lin) * 255.0 + 0.5).clamp(0.0, 255.0) as u8 };

    src.chunks_exact(4)
        .zip(dst.chunks_exact_mut(4))
        .for_each(|(rgba_in, rgba_out)| {
            let (nr, ng, nb) = mat3x3(
                m,
                decode(rgba_in[0]),
                decode(rgba_in[1]),
                decode(rgba_in[2]),
            );
            rgba_out[0] = quantize(nr);
            rgba_out[1] = quantize(ng);
            rgba_out[2] = quantize(nb);
            rgba_out[3] = rgba_in[3];
        });
}
/// Converts packed `u16` RGB pixels from `src` to `dst` using function-pointer
/// transfer curves.
///
/// Per pixel: each channel is divided by 65535 and passed to `linearize_fn`,
/// the matrix `m` is applied through `mat3x3`, and each result is passed to
/// `encode_fn`, scaled by 65535 with `+ 0.5` rounding and clamped to
/// `0..=65535`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and that `src` and
/// `dst` have the same length. Only whole three-element chunks present in both
/// slices are processed.
pub(crate) fn convert_u16_rgb(
    m: &[[f32; 3]; 3],
    src: &[u16],
    dst: &mut [u16],
    linearize_fn: fn(f32) -> f32,
    encode_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    // 16-bit code -> value in [0, 1] -> caller's linearisation.
    let decode = |code: u16| linearize_fn(f32::from(code) / 65535.0);
    // Caller's encoding -> nearest 16-bit value.
    let quantize =
        |lin: f32| -> u16 { (encode_fn(lin) * 65535.0 + 0.5).clamp(0.0, 65535.0) as u16 };

    src.chunks_exact(3)
        .zip(dst.chunks_exact_mut(3))
        .for_each(|(rgb_in, rgb_out)| {
            let (nr, ng, nb) = mat3x3(
                m,
                decode(rgb_in[0]),
                decode(rgb_in[1]),
                decode(rgb_in[2]),
            );
            rgb_out[0] = quantize(nr);
            rgb_out[1] = quantize(ng);
            rgb_out[2] = quantize(nb);
        });
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ColorPrimaries;

    // `convert_f32_rgb_dispatch` returns a success flag (see
    // `dispatch_returns_false_for_unknown_trc`). Every test below asserts that
    // flag: the inputs used here (white, black, round-trips, in-gamut values)
    // would satisfy the numeric checks even if the call left `px` untouched,
    // so ignoring the flag would let a declined conversion pass unnoticed.

    /// Gamut matrix from `src` primaries to `dst` primaries; panics if
    /// `gamut_matrix_to` does not produce one.
    fn m(src: ColorPrimaries, dst: ColorPrimaries) -> [[f32; 3]; 3] {
        src.gamut_matrix_to(dst).unwrap()
    }

    #[test]
    fn dispatch_p3_srgb_white() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        assert!(
            convert_f32_rgb_dispatch(
                &mat,
                &mut px,
                TransferFunction::Srgb,
                TransferFunction::Srgb,
            ),
            "dispatch declined Srgb -> Srgb"
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-4, "white: {px:?}");
        }
    }

    #[test]
    fn dispatch_p3_srgb_black() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [0.0f32, 0.0, 0.0];
        assert!(
            convert_f32_rgb_dispatch(
                &mat,
                &mut px,
                TransferFunction::Srgb,
                TransferFunction::Srgb,
            ),
            "dispatch declined Srgb -> Srgb"
        );
        for c in &px {
            assert!(c.abs() < 1e-6, "black: {px:?}");
        }
    }

    #[test]
    fn dispatch_p3_srgb_roundtrip() {
        let fwd = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let inv = m(ColorPrimaries::Bt709, ColorPrimaries::DisplayP3);
        let original = [0.5f32, 0.3, 0.7];
        let mut px = original;
        assert!(
            convert_f32_rgb_dispatch(
                &fwd,
                &mut px,
                TransferFunction::Srgb,
                TransferFunction::Srgb,
            ),
            "forward dispatch declined Srgb -> Srgb"
        );
        assert!(
            convert_f32_rgb_dispatch(
                &inv,
                &mut px,
                TransferFunction::Srgb,
                TransferFunction::Srgb,
            ),
            "inverse dispatch declined Srgb -> Srgb"
        );
        for i in 0..3 {
            assert!((original[i] - px[i]).abs() < 1e-4, "ch{i}: {}", px[i]);
        }
    }

    #[test]
    fn dispatch_bt2020_sdr_srgb_white() {
        let mat = m(ColorPrimaries::Bt2020, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        assert!(
            convert_f32_rgb_dispatch(
                &mat,
                &mut px,
                TransferFunction::Bt709,
                TransferFunction::Srgb,
            ),
            "dispatch declined Bt709 -> Srgb"
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-4, "white: {px:?}");
        }
    }

    #[test]
    fn dispatch_bt2020_pq_srgb_black() {
        let mat = m(ColorPrimaries::Bt2020, ColorPrimaries::Bt709);
        let mut px = [0.0f32, 0.0, 0.0];
        assert!(
            convert_f32_rgb_dispatch(&mat, &mut px, TransferFunction::Pq, TransferFunction::Srgb),
            "dispatch declined Pq -> Srgb"
        );
        for c in &px {
            assert!(c.abs() < 1e-5, "black: {px:?}");
        }
    }

    #[test]
    fn dispatch_adobe_srgb_white() {
        let mat = m(ColorPrimaries::AdobeRgb, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        assert!(
            convert_f32_rgb_dispatch(
                &mat,
                &mut px,
                TransferFunction::Gamma22,
                TransferFunction::Srgb,
            ),
            "dispatch declined Gamma22 -> Srgb"
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-4, "white: {px:?}");
        }
    }

    #[test]
    fn dispatch_rgba_alpha_passthrough() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [0.5f32, 0.5, 0.5, 0.7];
        // NOTE(review): the return type of `convert_f32_rgba_dispatch` is not
        // visible from this module; if it is also a success flag, assert it
        // here, since alpha would be unchanged by a declined conversion too.
        convert_f32_rgba_dispatch(
            &mat,
            &mut px,
            TransferFunction::Srgb,
            TransferFunction::Srgb,
        );
        assert!((px[3] - 0.7).abs() < f32::EPSILON, "alpha: {px:?}");
    }

    #[test]
    fn dispatch_linear_white() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        assert!(
            convert_f32_rgb_dispatch(
                &mat,
                &mut px,
                TransferFunction::Linear,
                TransferFunction::Linear,
            ),
            "dispatch declined Linear -> Linear"
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-6, "linear white: {px:?}");
        }
    }

    #[test]
    fn dispatch_returns_false_for_unknown_trc() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [0.5f32, 0.5, 0.5];
        assert!(!convert_f32_rgb_dispatch(
            &mat,
            &mut px,
            TransferFunction::Unknown,
            TransferFunction::Srgb
        ));
    }

    #[test]
    fn u8_white_black() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut dst = [0u8; 3];
        convert_u8_rgb(
            &mat,
            &[255, 255, 255],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [255, 255, 255]);
        convert_u8_rgb(
            &mat,
            &[0, 0, 0],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [0, 0, 0]);
    }

    #[test]
    fn u8_lut_lut_matches_scalar() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let lut = build_linearize_lut(linear_srgb::tf::srgb_to_linear);
        let enc_u8 = scalar_encode_u8(TransferFunction::Srgb).unwrap();
        let src = [128u8, 64, 200];
        let mut dst_scalar = [0u8; 3];
        let mut dst_lut = [0u8; 3];
        convert_u8_rgb(
            &mat,
            &src,
            &mut dst_scalar,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        convert_u8_rgb_lut_lut(&mat, &src, &mut dst_lut, &lut, enc_u8);
        assert_eq!(
            dst_scalar, dst_lut,
            "LUT-LUT and scalar should produce identical u8 output"
        );
    }

    #[test]
    fn u8_rgba_alpha_passthrough() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut dst = [0u8; 4];
        convert_u8_rgba(
            &mat,
            &[128, 64, 32, 200],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst[3], 200);
    }

    #[test]
    fn u16_white_black() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut dst = [0u16; 3];
        convert_u16_rgb(
            &mat,
            &[65535, 65535, 65535],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [65535, 65535, 65535]);
        convert_u16_rgb(
            &mat,
            &[0, 0, 0],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [0, 0, 0]);
    }

    #[test]
    fn f32_roundtrip_accuracy() {
        let fwd = m(ColorPrimaries::Bt709, ColorPrimaries::DisplayP3);
        let inv = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut max_err: f32 = 0.0;
        for r in (0..=255).step_by(4) {
            for g in (0..=255).step_by(4) {
                for b in (0..=255).step_by(16) {
                    let original = [r as f32 / 255.0, g as f32 / 255.0, b as f32 / 255.0];
                    let mut px = original;
                    assert!(
                        convert_f32_rgb_dispatch(
                            &fwd,
                            &mut px,
                            TransferFunction::Srgb,
                            TransferFunction::Srgb,
                        ),
                        "forward dispatch declined Srgb -> Srgb"
                    );
                    assert!(
                        convert_f32_rgb_dispatch(
                            &inv,
                            &mut px,
                            TransferFunction::Srgb,
                            TransferFunction::Srgb,
                        ),
                        "inverse dispatch declined Srgb -> Srgb"
                    );
                    for i in 0..3 {
                        let err = (original[i] - px[i]).abs();
                        if err > max_err {
                            max_err = err;
                        }
                    }
                }
            }
        }
        assert!(max_err < 1e-4, "max roundtrip error: {max_err}");
    }

    #[test]
    fn srgb_subset_of_p3() {
        let mat = m(ColorPrimaries::Bt709, ColorPrimaries::DisplayP3);
        for r in (0..=255).step_by(17) {
            for g in (0..=255).step_by(17) {
                for b in (0..=255).step_by(17) {
                    let mut px = [r as f32 / 255.0, g as f32 / 255.0, b as f32 / 255.0];
                    assert!(
                        convert_f32_rgb_dispatch(
                            &mat,
                            &mut px,
                            TransferFunction::Srgb,
                            TransferFunction::Srgb,
                        ),
                        "dispatch declined Srgb -> Srgb"
                    );
                    for (i, c) in px.iter().enumerate() {
                        assert!(*c >= -1e-5 && *c <= 1.0 + 1e-5, "({r},{g},{b}) ch{i}: {c}");
                    }
                }
            }
        }
    }
}