use archmage::prelude::*;
#[cfg(target_arch = "x86_64")]
use linear_srgb::tokens::x8 as trc_x8;
#[cfg(target_arch = "x86_64")]
use magetypes::simd::f32x8 as mt_f32x8;
/// Crate-visible entry point for the scalar 3×3 matrix multiply.
///
/// Forwards its arguments unchanged to the private [`mat3x3`] helper and
/// returns that helper's `(r, g, b)` result.
#[inline(always)]
pub(crate) fn mat3x3_scalar(m: &[[f32; 3]; 3], r: f32, g: f32, b: f32) -> (f32, f32, f32) {
mat3x3(m, r, g, b)
}
/// Multiplies the row-major 3×3 matrix `m` by the column vector `(r, g, b)`.
///
/// Every output channel is evaluated as
/// `m[i][0] * r + (m[i][1] * g + m[i][2] * b)`, where the two additions are
/// performed with `f32::mul_add` and the innermost product is a plain multiply.
#[inline(always)]
fn mat3x3(m: &[[f32; 3]; 3], r: f32, g: f32, b: f32) -> (f32, f32, f32) {
    let [row_r, row_g, row_b] = m;

    // Innermost term: third coefficient of each row times blue.
    let tail_r = row_r[2] * b;
    let tail_g = row_g[2] * b;
    let tail_b = row_b[2] * b;

    // Middle term: fold in green.
    let mid_r = row_r[1].mul_add(g, tail_r);
    let mid_g = row_g[1].mul_add(g, tail_g);
    let mid_b = row_b[1].mul_add(g, tail_b);

    // Outer term: fold in red.
    (
        row_r[0].mul_add(r, mid_r),
        row_g[0].mul_add(r, mid_g),
        row_b[0].mul_add(r, mid_b),
    )
}
/// Eight-lane counterpart of [`mat3x3`]: applies the 3×3 matrix `m` to the
/// channel vectors `rl`, `gl`, `bl` and returns the three output vectors.
///
/// Each matrix coefficient is splatted across all lanes and combined with the
/// same nesting as the scalar helper: `c0 * r + (c1 * g + c2 * b)`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn mat3x3_x8(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    rl: mt_f32x8,
    gl: mt_f32x8,
    bl: mt_f32x8,
) -> (mt_f32x8, mt_f32x8, mt_f32x8) {
    let [row_r, row_g, row_b] = m;

    // Red output row.
    let tail_r = mt_f32x8::splat(token, row_r[2]) * bl;
    let mid_r = mt_f32x8::splat(token, row_r[1]).mul_add(gl, tail_r);
    let out_r = mt_f32x8::splat(token, row_r[0]).mul_add(rl, mid_r);

    // Green output row.
    let tail_g = mt_f32x8::splat(token, row_g[2]) * bl;
    let mid_g = mt_f32x8::splat(token, row_g[1]).mul_add(gl, tail_g);
    let out_g = mt_f32x8::splat(token, row_g[0]).mul_add(rl, mid_g);

    // Blue output row.
    let tail_b = mt_f32x8::splat(token, row_b[2]) * bl;
    let mid_b = mt_f32x8::splat(token, row_b[1]).mul_add(gl, tail_b);
    let out_b = mt_f32x8::splat(token, row_b[0]).mul_add(rl, mid_b);

    (out_r, out_g, out_b)
}
// Generates, for one linearize/encode transfer-function pair, a family of
// in-place f32 kernels whose names are built from `$name`:
//   * `fused_8px_rgb_<name>` / `fused_8px_rgba_<name>`   (x86_64 only): 8 pixels per call
//   * `convert_rgb_<name>_v3` / `convert_rgba_<name>_v3` (x86_64 only): whole-slice drivers
//   * `convert_rgb_<name>_scalar` / `convert_rgba_<name>_scalar`: per-pixel loops
// Every kernel does the same three steps per pixel: linearize R, G and B,
// multiply by the 3x3 matrix `m`, then encode the result.
// The RGBA variants only touch components 0..=2 of each 4-float pixel; the
// fourth component is never read or written.
// NOTE(review): `#[rite]` / `#[arcane]` are archmage attributes; presumably they
// enable the token's target features for the function body — confirm in the
// archmage docs. The `_v3` / `_scalar` suffixes look like the naming scheme
// `incant!` dispatches on — confirm.
macro_rules! stamp_trc_kernels {
(
$name:ident,
simd_linearize: $simd_lin:path,
simd_encode: $simd_enc:path,
scalar_linearize: $scalar_lin:path,
scalar_encode: $scalar_enc:path
) => {
paste::paste! {
// 8 interleaved RGB pixels: `data` must hold at least 24 floats (indices 0..=23 are accessed).
#[cfg(target_arch = "x86_64")]
#[rite]
fn [<fused_8px_rgb_ $name>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
// De-interleave RGBRGB... into one array per channel.
for i in 0..8 {
r[i] = data[i * 3];
g[i] = data[i * 3 + 1];
b[i] = data[i * 3 + 2];
}
let rl = mt_f32x8::from_array(token, $simd_lin(token, r));
let gl = mt_f32x8::from_array(token, $simd_lin(token, g));
let bl = mt_f32x8::from_array(token, $simd_lin(token, b));
let (or, og, ob) = mat3x3_x8(token, m, rl, gl, bl);
let ro = $simd_enc(token, or.to_array());
let go = $simd_enc(token, og.to_array());
let bo = $simd_enc(token, ob.to_array());
// Re-interleave the encoded channels back into `data`.
for i in 0..8 {
data[i * 3] = ro[i];
data[i * 3 + 1] = go[i];
data[i * 3 + 2] = bo[i];
}
}
// 8 interleaved RGBA pixels: `data` must hold at least 31 floats (highest index used is 7*4+2).
#[cfg(target_arch = "x86_64")]
#[rite]
fn [<fused_8px_rgba_ $name>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
let mut r = [0.0f32; 8];
let mut g = [0.0f32; 8];
let mut b = [0.0f32; 8];
// Gather only the colour components; component 3 of each pixel is skipped.
for i in 0..8 {
r[i] = data[i * 4];
g[i] = data[i * 4 + 1];
b[i] = data[i * 4 + 2];
}
let rl = mt_f32x8::from_array(token, $simd_lin(token, r));
let gl = mt_f32x8::from_array(token, $simd_lin(token, g));
let bl = mt_f32x8::from_array(token, $simd_lin(token, b));
let (or, og, ob) = mat3x3_x8(token, m, rl, gl, bl);
let ro = $simd_enc(token, or.to_array());
let go = $simd_enc(token, og.to_array());
let bo = $simd_enc(token, ob.to_array());
// Scatter the colour components back; component 3 stays as it was.
for i in 0..8 {
data[i * 4] = ro[i];
data[i * 4 + 1] = go[i];
data[i * 4 + 2] = bo[i];
}
}
// Whole-slice RGB driver: 24-float blocks go through the fused kernel,
// the remaining whole pixels through the scalar functions.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn [<convert_rgb_ $name _v3>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
// Largest multiple of 24 floats (8 RGB pixels) that fits in `data`.
let bulk = (data.len() / 24) * 24;
for off in (0..bulk).step_by(24) {
[<fused_8px_rgb_ $name>](token, m, &mut data[off..off + 24]);
}
// Tail: `chunks_exact_mut` skips a trailing partial pixel, if any.
for pixel in data[bulk..].chunks_exact_mut(3) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
// Whole-slice RGBA driver: 32-float blocks go through the fused kernel,
// the remaining whole pixels through the scalar functions.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn [<convert_rgba_ $name _v3>](token: X64V3Token, m: &[[f32; 3]; 3], data: &mut [f32]) {
// Largest multiple of 32 floats (8 RGBA pixels) that fits in `data`.
let bulk = (data.len() / 32) * 32;
for off in (0..bulk).step_by(32) {
[<fused_8px_rgba_ $name>](token, m, &mut data[off..off + 32]);
}
// Tail: only components 0..=2 are rewritten.
for pixel in data[bulk..].chunks_exact_mut(4) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
// Scalar RGB kernel (not cfg-gated); the token argument is unused.
fn [<convert_rgb_ $name _scalar>](_token: ScalarToken, m: &[[f32; 3]; 3], data: &mut [f32]) {
for pixel in data.chunks_exact_mut(3) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
// Scalar RGBA kernel (not cfg-gated); the token argument is unused and
// component 3 of each pixel is left alone.
fn [<convert_rgba_ $name _scalar>](_token: ScalarToken, m: &[[f32; 3]; 3], data: &mut [f32]) {
for pixel in data.chunks_exact_mut(4) {
let r = $scalar_lin(pixel[0]);
let g = $scalar_lin(pixel[1]);
let b = $scalar_lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = $scalar_enc(nr);
pixel[1] = $scalar_enc(ng);
pixel[2] = $scalar_enc(nb);
}
}
}
};
}
// Same-curve kernels: each invocation pairs a transfer function's linearize
// routine with the encode routine of that same transfer function.
// sRGB -> linear -> sRGB.
stamp_trc_kernels!(srgb,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
// BT.709 -> linear -> BT.709.
stamp_trc_kernels!(bt709,
simd_linearize: trc_x8::bt709_to_linear_v3,
simd_encode: trc_x8::linear_to_bt709_v3,
scalar_linearize: linear_srgb::tf::bt709_to_linear,
scalar_encode: linear_srgb::tf::linear_to_bt709
);
// PQ -> linear -> PQ.
stamp_trc_kernels!(pq,
simd_linearize: trc_x8::pq_to_linear_v3,
simd_encode: trc_x8::linear_to_pq_v3,
scalar_linearize: linear_srgb::tf::pq_to_linear,
scalar_encode: linear_srgb::tf::linear_to_pq
);
// HLG -> linear -> HLG.
stamp_trc_kernels!(hlg,
simd_linearize: trc_x8::hlg_to_linear_v3,
simd_encode: trc_x8::linear_to_hlg_v3,
scalar_linearize: linear_srgb::tf::hlg_to_linear,
scalar_encode: linear_srgb::tf::linear_to_hlg
);
/// Applies the 3×3 matrix `m` in place to every RGB triple of `data`.
///
/// No transfer function is involved: the values are fed to the matrix as-is.
/// Debug builds assert that `data.len()` is a multiple of 3; without that
/// assertion a trailing partial pixel is simply left untouched.
pub fn convert_linear_rgb(m: &[[f32; 3]; 3], data: &mut [f32]) {
    debug_assert_eq!(data.len() % 3, 0);
    for px in data.chunks_exact_mut(3) {
        // `chunks_exact_mut(3)` only yields 3-element slices, so this always matches.
        if let [r, g, b] = px {
            (*r, *g, *b) = mat3x3(m, *r, *g, *b);
        }
    }
}
/// Applies the 3×3 matrix `m` in place to the colour components of every
/// RGBA quadruple of `data`; the fourth component of each pixel is not modified.
///
/// No transfer function is involved. Debug builds assert that `data.len()` is
/// a multiple of 4; without that assertion a trailing partial pixel is left
/// untouched.
pub fn convert_linear_rgba(m: &[[f32; 3]; 3], data: &mut [f32]) {
    debug_assert_eq!(data.len() % 4, 0);
    for px in data.chunks_exact_mut(4) {
        // `chunks_exact_mut(4)` only yields 4-element slices, so this always matches.
        if let [r, g, b, _alpha] = px {
            (*r, *g, *b) = mat3x3(m, *r, *g, *b);
        }
    }
}
// Cross-curve kernels: the linearize and encode routines belong to different
// transfer functions, so one pass both re-encodes and applies the matrix.
// PQ -> linear -> sRGB.
stamp_trc_kernels!(pq_to_srgb,
simd_linearize: trc_x8::pq_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::pq_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
// HLG -> linear -> sRGB.
stamp_trc_kernels!(hlg_to_srgb,
simd_linearize: trc_x8::hlg_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::hlg_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
// sRGB -> linear -> PQ.
stamp_trc_kernels!(srgb_to_pq,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: trc_x8::linear_to_pq_v3,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: linear_srgb::tf::linear_to_pq
);
// BT.709 -> linear -> sRGB.
stamp_trc_kernels!(bt709_to_srgb,
simd_linearize: trc_x8::bt709_to_linear_v3,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: linear_srgb::tf::bt709_to_linear,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
// sRGB -> linear -> BT.709.
stamp_trc_kernels!(srgb_to_bt709,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: trc_x8::linear_to_bt709_v3,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: linear_srgb::tf::linear_to_bt709
);
/// Gamma exponent used by the `adobe_*` helpers and by the `Gamma22` arms of
/// the transfer-function tables in this file. Numerically equal to 563/256.
// NOTE(review): presumably the Adobe RGB (1998) encoding gamma — confirm against the spec.
const ADOBE_GAMMA: f32 = 2.19921875;
/// Eight-lane linearize step for the Adobe gamma curve.
///
/// Adapts `trc_x8::gamma_to_linear_v3` to the `(token, [f32; 8]) -> [f32; 8]`
/// shape expected by `stamp_trc_kernels!` by fixing the exponent to
/// [`ADOBE_GAMMA`].
#[cfg(target_arch = "x86_64")]
#[rite]
fn adobe_to_linear_x8(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
trc_x8::gamma_to_linear_v3(token, v, ADOBE_GAMMA)
}
/// Eight-lane encode step for the Adobe gamma curve.
///
/// Adapts `trc_x8::linear_to_gamma_v3` to the `(token, [f32; 8]) -> [f32; 8]`
/// shape expected by `stamp_trc_kernels!` by fixing the exponent to
/// [`ADOBE_GAMMA`].
#[cfg(target_arch = "x86_64")]
#[rite]
fn adobe_from_linear_x8(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
trc_x8::linear_to_gamma_v3(token, v, ADOBE_GAMMA)
}
/// Scalar linearize step for the Adobe gamma curve: calls
/// `linear_srgb::default::gamma_to_linear` with the exponent [`ADOBE_GAMMA`].
#[inline(always)]
fn adobe_to_linear_scalar(v: f32) -> f32 {
linear_srgb::default::gamma_to_linear(v, ADOBE_GAMMA)
}
/// Scalar encode step for the Adobe gamma curve: calls
/// `linear_srgb::default::linear_to_gamma` with the exponent [`ADOBE_GAMMA`].
#[inline(always)]
fn adobe_from_linear_scalar(v: f32) -> f32 {
linear_srgb::default::linear_to_gamma(v, ADOBE_GAMMA)
}
// Kernels involving the Adobe gamma curve, built from the `adobe_*` wrappers.
// Adobe gamma -> linear -> Adobe gamma.
stamp_trc_kernels!(adobe,
simd_linearize: adobe_to_linear_x8,
simd_encode: adobe_from_linear_x8,
scalar_linearize: adobe_to_linear_scalar,
scalar_encode: adobe_from_linear_scalar
);
// Adobe gamma -> linear -> sRGB.
stamp_trc_kernels!(adobe_to_srgb,
simd_linearize: adobe_to_linear_x8,
simd_encode: trc_x8::linear_to_srgb_v3,
scalar_linearize: adobe_to_linear_scalar,
scalar_encode: linear_srgb::tf::linear_to_srgb
);
// sRGB -> linear -> Adobe gamma.
stamp_trc_kernels!(srgb_to_adobe,
simd_linearize: trc_x8::srgb_to_linear_v3,
simd_encode: adobe_from_linear_x8,
scalar_linearize: linear_srgb::tf::srgb_to_linear,
scalar_encode: adobe_from_linear_scalar
);
use crate::TransferFunction;
/// Sign-preserving sRGB linearization.
///
/// Inputs `>= 0` go straight through
/// `linear_srgb::precise::srgb_to_linear_extended`; every other input is
/// negated, linearized, and the result negated again.
#[inline(always)]
fn linearize_srgb_extended(v: f32) -> f32 {
    if v >= 0.0 {
        return linear_srgb::precise::srgb_to_linear_extended(v);
    }
    // Mirror through the origin so the curve is odd-symmetric.
    let mirrored = linear_srgb::precise::srgb_to_linear_extended(-v);
    -mirrored
}
/// Sign-preserving sRGB encoding.
///
/// Inputs `>= 0` go straight through
/// `linear_srgb::precise::linear_to_srgb_extended`; every other input is
/// negated, encoded, and the result negated again.
#[inline(always)]
fn encode_srgb_extended(v: f32) -> f32 {
    if v >= 0.0 {
        return linear_srgb::precise::linear_to_srgb_extended(v);
    }
    // Mirror through the origin so the curve is odd-symmetric.
    let mirrored = linear_srgb::precise::linear_to_srgb_extended(-v);
    -mirrored
}
/// Sign-preserving power-law linearization: `v^gamma` for `v >= 0`,
/// otherwise `-((-v)^gamma)`.
#[inline(always)]
fn linearize_gamma_extended(v: f32, gamma: f32) -> f32 {
    if v >= 0.0 {
        return v.powf(gamma);
    }
    // Raise the magnitude, then restore the sign.
    let magnitude = (-v).powf(gamma);
    -magnitude
}
/// Sign-preserving power-law encoding: `v^inv_gamma` for `v >= 0`,
/// otherwise `-((-v)^inv_gamma)`.
#[inline(always)]
fn encode_gamma_extended(v: f32, inv_gamma: f32) -> f32 {
    if v >= 0.0 {
        return v.powf(inv_gamma);
    }
    // Raise the magnitude, then restore the sign.
    let magnitude = (-v).powf(inv_gamma);
    -magnitude
}
/// Picks the scalar linearize function for the extended conversion path.
///
/// `Srgb` and `Gamma22` get the sign-preserving variants defined in this
/// file, `Linear` is the identity, and every other transfer function falls
/// back to whatever [`scalar_linearize`] returns (possibly `None`).
fn scalar_linearize_extended(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    /// Sign-preserving power law with the Adobe exponent.
    fn gamma22_extended(v: f32) -> f32 {
        linearize_gamma_extended(v, ADOBE_GAMMA)
    }

    let chosen: fn(f32) -> f32 = match trc {
        TransferFunction::Srgb => linearize_srgb_extended,
        TransferFunction::Gamma22 => gamma22_extended,
        TransferFunction::Linear => core::convert::identity,
        _ => return scalar_linearize(trc),
    };
    Some(chosen)
}
/// Picks the scalar encode function for the extended conversion path.
///
/// `Srgb` and `Gamma22` get the sign-preserving variants defined in this
/// file, `Linear` is the identity, and every other transfer function falls
/// back to whatever [`scalar_encode`] returns (possibly `None`).
fn scalar_encode_extended(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    /// Sign-preserving power law with the reciprocal of the Adobe exponent.
    fn gamma22_extended(v: f32) -> f32 {
        encode_gamma_extended(v, 1.0 / ADOBE_GAMMA)
    }

    let chosen: fn(f32) -> f32 = match trc {
        TransferFunction::Srgb => encode_srgb_extended,
        TransferFunction::Gamma22 => gamma22_extended,
        TransferFunction::Linear => core::convert::identity,
        _ => return scalar_encode(trc),
    };
    Some(chosen)
}
/// Converts interleaved RGB `f32` data in place using the extended scalar
/// transfer functions: linearize with `src_trc`, multiply by `m`, encode with
/// `dst_trc`.
///
/// Returns `false` without touching `data` when either transfer function has
/// no extended scalar implementation; otherwise converts every whole pixel
/// and returns `true`.
pub(crate) fn convert_f32_rgb_extended(
    m: &[[f32; 3]; 3],
    data: &mut [f32],
    src_trc: TransferFunction,
    dst_trc: TransferFunction,
) -> bool {
    let (Some(lin), Some(enc)) = (
        scalar_linearize_extended(src_trc),
        scalar_encode_extended(dst_trc),
    ) else {
        return false;
    };

    for px in data.chunks_exact_mut(3) {
        let (nr, ng, nb) = mat3x3(m, lin(px[0]), lin(px[1]), lin(px[2]));
        px[0] = enc(nr);
        px[1] = enc(ng);
        px[2] = enc(nb);
    }
    true
}
/// Converts interleaved RGBA `f32` data in place using the extended scalar
/// transfer functions: linearize with `src_trc`, multiply by `m`, encode with
/// `dst_trc`. The fourth component of each pixel is not modified.
///
/// Returns `false` without touching `data` when either transfer function has
/// no extended scalar implementation; otherwise converts every whole pixel
/// and returns `true`.
pub(crate) fn convert_f32_rgba_extended(
    m: &[[f32; 3]; 3],
    data: &mut [f32],
    src_trc: TransferFunction,
    dst_trc: TransferFunction,
) -> bool {
    let (Some(lin), Some(enc)) = (
        scalar_linearize_extended(src_trc),
        scalar_encode_extended(dst_trc),
    ) else {
        return false;
    };

    for px in data.chunks_exact_mut(4) {
        let (nr, ng, nb) = mat3x3(m, lin(px[0]), lin(px[1]), lin(px[2]));
        px[0] = enc(nr);
        px[1] = enc(ng);
        px[2] = enc(nb);
    }
    true
}
/// Reports whether `trc` is one of the transfer functions with a SIMD encode
/// arm on this target.
///
/// Always `false` when not compiling for x86_64; on x86_64 it is `true` for
/// `Srgb`, `Bt709`, `Pq`, `Hlg` and `Gamma22`.
pub(crate) fn has_simd_encode(trc: TransferFunction) -> bool {
    cfg!(target_arch = "x86_64")
        && matches!(
            trc,
            TransferFunction::Srgb
                | TransferFunction::Bt709
                | TransferFunction::Pq
                | TransferFunction::Hlg
                | TransferFunction::Gamma22
        )
}
/// Encodes eight linear values with the SIMD encoder that matches `trc`.
///
/// The handled transfer functions are the same five that the x86_64
/// `has_simd_encode` reports as supported. Any other `trc` returns `v`
/// unchanged, so callers that need a real encode for such a `trc` must not
/// rely on this function.
#[cfg(target_arch = "x86_64")]
#[rite]
fn simd_encode_x8_dispatch(token: X64V3Token, trc: TransferFunction, v: [f32; 8]) -> [f32; 8] {
match trc {
TransferFunction::Srgb => trc_x8::linear_to_srgb_v3(token, v),
TransferFunction::Bt709 => trc_x8::linear_to_bt709_v3(token, v),
TransferFunction::Pq => trc_x8::linear_to_pq_v3(token, v),
TransferFunction::Hlg => trc_x8::linear_to_hlg_v3(token, v),
TransferFunction::Gamma22 => adobe_from_linear_x8(token, v),
// Pass-through for everything else (this includes `Linear`).
_ => v,
}
}
/// Looks up the scalar "encoded → linear" function for `trc`.
///
/// Returns `None` for transfer functions that have no entry in the table
/// below; `Linear` maps to the identity function.
pub(crate) fn scalar_linearize(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    let decode: fn(f32) -> f32 = match trc {
        TransferFunction::Srgb => linear_srgb::tf::srgb_to_linear,
        TransferFunction::Bt709 => linear_srgb::tf::bt709_to_linear,
        TransferFunction::Pq => linear_srgb::tf::pq_to_linear,
        TransferFunction::Hlg => linear_srgb::tf::hlg_to_linear,
        TransferFunction::Gamma22 => adobe_to_linear_scalar,
        TransferFunction::Linear => core::convert::identity,
        _ => return None,
    };
    Some(decode)
}
/// Looks up the scalar "linear → encoded" function for `trc`.
///
/// Returns `None` for transfer functions that have no entry in the table
/// below; `Linear` maps to the identity function.
pub(crate) fn scalar_encode(trc: TransferFunction) -> Option<fn(f32) -> f32> {
    let encode: fn(f32) -> f32 = match trc {
        TransferFunction::Srgb => linear_srgb::tf::linear_to_srgb,
        TransferFunction::Bt709 => linear_srgb::tf::linear_to_bt709,
        TransferFunction::Pq => linear_srgb::tf::linear_to_pq,
        TransferFunction::Hlg => linear_srgb::tf::linear_to_hlg,
        TransferFunction::Gamma22 => adobe_from_linear_scalar,
        TransferFunction::Linear => core::convert::identity,
        _ => return None,
    };
    Some(encode)
}
/// Converts interleaved RGB `f32` data in place: linearize with `src_trc`,
/// multiply by the 3×3 matrix `m`, encode with `dst_trc`.
///
/// Strategy, in order:
/// 1. `Linear -> Linear` skips both transfer functions and only applies `m`.
/// 2. On x86_64, the transfer-function pairs listed in the `match` below are
///    routed through `incant!` to the kernels stamped out by
///    `stamp_trc_kernels!`.
/// 3. Everything else uses a per-pixel loop built from [`scalar_linearize`]
///    and [`scalar_encode`].
///
/// Returns `false`, with `data` left untouched, when step 3 is reached and
/// either lookup yields `None`; returns `true` in every other case.
/// Debug builds assert that `data.len()` is a multiple of 3.
pub(crate) fn convert_f32_rgb_dispatch(
m: &[[f32; 3]; 3],
data: &mut [f32],
src_trc: TransferFunction,
dst_trc: TransferFunction,
) -> bool {
use TransferFunction::*;
debug_assert_eq!(data.len() % 3, 0);
// Pure gamut change: no transfer function on either side.
if src_trc == Linear && dst_trc == Linear {
convert_linear_rgb(m, data);
return true;
}
// Pairs with a dedicated stamped kernel. Each arm returns immediately.
// NOTE(review): `incant!` is an archmage macro; presumably it selects the
// `_v3` or `_scalar` variant of the named kernel at runtime — confirm.
#[cfg(target_arch = "x86_64")]
match (src_trc, dst_trc) {
(Srgb, Srgb) => {
incant!(convert_rgb_srgb(m, data));
return true;
}
(Bt709, Bt709) => {
incant!(convert_rgb_bt709(m, data));
return true;
}
(Pq, Pq) => {
incant!(convert_rgb_pq(m, data));
return true;
}
(Hlg, Hlg) => {
incant!(convert_rgb_hlg(m, data));
return true;
}
(Gamma22, Gamma22) => {
incant!(convert_rgb_adobe(m, data));
return true;
}
(Pq, Srgb) => {
incant!(convert_rgb_pq_to_srgb(m, data));
return true;
}
(Hlg, Srgb) => {
incant!(convert_rgb_hlg_to_srgb(m, data));
return true;
}
(Srgb, Pq) => {
incant!(convert_rgb_srgb_to_pq(m, data));
return true;
}
(Bt709, Srgb) => {
incant!(convert_rgb_bt709_to_srgb(m, data));
return true;
}
(Srgb, Bt709) => {
incant!(convert_rgb_srgb_to_bt709(m, data));
return true;
}
(Gamma22, Srgb) => {
incant!(convert_rgb_adobe_to_srgb(m, data));
return true;
}
(Srgb, Gamma22) => {
incant!(convert_rgb_srgb_to_adobe(m, data));
return true;
}
_ => {} }
// Generic fallback: any pair for which both scalar functions exist.
let Some(lin) = scalar_linearize(src_trc) else {
return false;
};
let Some(enc) = scalar_encode(dst_trc) else {
return false;
};
for pixel in data.chunks_exact_mut(3) {
let r = lin(pixel[0]);
let g = lin(pixel[1]);
let b = lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = enc(nr);
pixel[1] = enc(ng);
pixel[2] = enc(nb);
}
true
}
/// Converts interleaved RGBA `f32` data in place: linearize with `src_trc`,
/// multiply by the 3×3 matrix `m`, encode with `dst_trc`. The scalar fallback
/// below only rewrites components 0..=2 of each pixel.
///
/// Strategy, in order:
/// 1. `Linear -> Linear` skips both transfer functions and only applies `m`.
/// 2. On x86_64, the transfer-function pairs listed in the `match` below are
///    routed through `incant!` to the kernels stamped out by
///    `stamp_trc_kernels!`.
/// 3. Everything else uses a per-pixel loop built from [`scalar_linearize`]
///    and [`scalar_encode`].
///
/// Returns `false`, with `data` left untouched, when step 3 is reached and
/// either lookup yields `None`; returns `true` in every other case.
/// Debug builds assert that `data.len()` is a multiple of 4.
pub(crate) fn convert_f32_rgba_dispatch(
m: &[[f32; 3]; 3],
data: &mut [f32],
src_trc: TransferFunction,
dst_trc: TransferFunction,
) -> bool {
use TransferFunction::*;
debug_assert_eq!(data.len() % 4, 0);
// Pure gamut change: no transfer function on either side.
if src_trc == Linear && dst_trc == Linear {
convert_linear_rgba(m, data);
return true;
}
// Pairs with a dedicated stamped kernel. Each arm returns immediately.
// NOTE(review): `incant!` is an archmage macro; presumably it selects the
// `_v3` or `_scalar` variant of the named kernel at runtime — confirm.
#[cfg(target_arch = "x86_64")]
match (src_trc, dst_trc) {
(Srgb, Srgb) => {
incant!(convert_rgba_srgb(m, data));
return true;
}
(Bt709, Bt709) => {
incant!(convert_rgba_bt709(m, data));
return true;
}
(Pq, Pq) => {
incant!(convert_rgba_pq(m, data));
return true;
}
(Hlg, Hlg) => {
incant!(convert_rgba_hlg(m, data));
return true;
}
(Gamma22, Gamma22) => {
incant!(convert_rgba_adobe(m, data));
return true;
}
(Pq, Srgb) => {
incant!(convert_rgba_pq_to_srgb(m, data));
return true;
}
(Hlg, Srgb) => {
incant!(convert_rgba_hlg_to_srgb(m, data));
return true;
}
(Srgb, Pq) => {
incant!(convert_rgba_srgb_to_pq(m, data));
return true;
}
(Bt709, Srgb) => {
incant!(convert_rgba_bt709_to_srgb(m, data));
return true;
}
(Srgb, Bt709) => {
incant!(convert_rgba_srgb_to_bt709(m, data));
return true;
}
(Gamma22, Srgb) => {
incant!(convert_rgba_adobe_to_srgb(m, data));
return true;
}
(Srgb, Gamma22) => {
incant!(convert_rgba_srgb_to_adobe(m, data));
return true;
}
_ => {}
}
// Generic fallback: any pair for which both scalar functions exist.
let Some(lin) = scalar_linearize(src_trc) else {
return false;
};
let Some(enc) = scalar_encode(dst_trc) else {
return false;
};
for pixel in data.chunks_exact_mut(4) {
let r = lin(pixel[0]);
let g = lin(pixel[1]);
let b = lin(pixel[2]);
let (nr, ng, nb) = mat3x3(m, r, g, b);
pixel[0] = enc(nr);
pixel[1] = enc(ng);
pixel[2] = enc(nb);
}
true
}
/// Builds a heap-allocated 256-entry table in which entry `i` holds
/// `linearize_fn(i / 255.0)`, i.e. the linearized value of the 8-bit code `i`.
pub(crate) fn build_linearize_lut(linearize_fn: fn(f32) -> f32) -> alloc::boxed::Box<[f32; 256]> {
    let table: alloc::boxed::Box<[f32]> = (0..256usize)
        .map(|code| linearize_fn(code as f32 / 255.0))
        .collect();
    // The iterator yields exactly 256 items, so the length check inside
    // `try_into` always succeeds.
    table.try_into().ok().unwrap()
}
/// Converts interleaved 8-bit RGB from `src` into `dst`.
///
/// Per pixel: scale each byte to `[0, 1]`, apply `linearize_fn`, multiply by
/// `m`, apply `encode_fn`, then scale by 255, add 0.5, clamp to `[0, 255]`
/// and truncate to `u8`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`; without those assertions only the pixels present in both
/// slices are processed.
pub(crate) fn convert_u8_rgb(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    linearize_fn: fn(f32) -> f32,
    encode_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    let pixels = src.chunks_exact(3).zip(dst.chunks_exact_mut(3));
    for (input, output) in pixels {
        let [r, g, b] = [input[0], input[1], input[2]].map(|c| linearize_fn(c as f32 / 255.0));
        let (nr, ng, nb) = mat3x3(m, r, g, b);
        for (slot, linear) in output.iter_mut().zip([nr, ng, nb]) {
            *slot = (encode_fn(linear) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;
        }
    }
}
pub(crate) fn scalar_encode_u8(trc: TransferFunction) -> Option<fn(f32) -> u8> {
fn srgb_u8(v: f32) -> u8 {
linear_srgb::default::linear_to_srgb_u8(v)
}
fn quantize_with(enc: fn(f32) -> f32, v: f32) -> u8 {
(enc(v) * 255.0 + 0.5).clamp(0.0, 255.0) as u8
}
match trc {
TransferFunction::Srgb => Some(srgb_u8),
TransferFunction::Bt709 => Some(|v| quantize_with(linear_srgb::tf::linear_to_bt709, v)),
TransferFunction::Pq => Some(|v| quantize_with(linear_srgb::tf::linear_to_pq, v)),
TransferFunction::Hlg => Some(|v| quantize_with(linear_srgb::tf::linear_to_hlg, v)),
TransferFunction::Gamma22 => Some(|v| quantize_with(adobe_from_linear_scalar, v)),
TransferFunction::Linear => Some(|v| (v * 255.0 + 0.5).clamp(0.0, 255.0) as u8),
_ => None,
}
}
/// Converts interleaved 8-bit RGB from `src` into `dst` using a table lookup
/// for linearization and `enc_u8` for the final "linear → `u8`" step.
///
/// Per pixel: look each byte up in `lin_lut`, multiply by `m`, then pass
/// each resulting channel through `enc_u8`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`; without those assertions only the pixels present in both
/// slices are processed.
pub(crate) fn convert_u8_rgb_lut_lut(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    let pixels = src.chunks_exact(3).zip(dst.chunks_exact_mut(3));
    for (input, output) in pixels {
        let (nr, ng, nb) = mat3x3(
            m,
            lin_lut[usize::from(input[0])],
            lin_lut[usize::from(input[1])],
            lin_lut[usize::from(input[2])],
        );
        for (slot, linear) in output.iter_mut().zip([nr, ng, nb]) {
            *slot = enc_u8(linear);
        }
    }
}
/// Converts one block of 8 interleaved 8-bit RGB pixels.
///
/// Linearization is a lookup in `lin_lut`, the matrix multiply and the encode
/// for `dst_trc` run on 8-lane vectors, and each encoded value is quantized
/// with `x * 255 + 0.5`.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u8_rgb_fused(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8; 24],
    dst: &mut [u8; 24],
    lin_lut: &[f32; 256],
    dst_trc: TransferFunction,
) {
    // De-interleave while linearizing: one array per channel.
    let mut lin_r = [0.0f32; 8];
    let mut lin_g = [0.0f32; 8];
    let mut lin_b = [0.0f32; 8];
    for (lane, px) in src.chunks_exact(3).enumerate() {
        lin_r[lane] = lin_lut[px[0] as usize];
        lin_g[lane] = lin_lut[px[1] as usize];
        lin_b[lane] = lin_lut[px[2] as usize];
    }

    let (mixed_r, mixed_g, mixed_b) = mat3x3_x8(
        token,
        m,
        mt_f32x8::from_array(token, lin_r),
        mt_f32x8::from_array(token, lin_g),
        mt_f32x8::from_array(token, lin_b),
    );

    let enc_r = simd_encode_x8_dispatch(token, dst_trc, mixed_r.to_array());
    let enc_g = simd_encode_x8_dispatch(token, dst_trc, mixed_g.to_array());
    let enc_b = simd_encode_x8_dispatch(token, dst_trc, mixed_b.to_array());

    // Float-to-integer `as` casts saturate (and map NaN to 0), so no explicit
    // clamp is applied before the conversion.
    for (lane, px) in dst.chunks_exact_mut(3).enumerate() {
        px[0] = (enc_r[lane] * 255.0 + 0.5) as u8;
        px[1] = (enc_g[lane] * 255.0 + 0.5) as u8;
        px[2] = (enc_b[lane] * 255.0 + 0.5) as u8;
    }
}
/// x86_64 driver for the fused 8-bit RGB conversion.
///
/// Whole groups of 8 pixels (24 bytes) are handed to
/// `convert_8px_u8_rgb_fused`; the remaining whole pixels are converted one
/// at a time with `lin_lut`, the scalar matrix multiply and `scalar_enc`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u8_rgb_fused_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    dst_trc: TransferFunction,
    scalar_enc: fn(f32) -> f32,
) {
    let total_pixels = src.len() / 3;
    let full_blocks = total_pixels / 8;

    // Vector part: one 24-byte block per iteration.
    for block in 0..full_blocks {
        let start = block * 24;
        let end = start + 24;
        let block_src: &[u8; 24] = src[start..end].try_into().unwrap();
        let block_dst: &mut [u8; 24] = (&mut dst[start..end]).try_into().unwrap();
        convert_8px_u8_rgb_fused(token, m, block_src, block_dst, lin_lut, dst_trc);
    }

    // Scalar part: the pixels that did not fill a whole block.
    for px in (full_blocks * 8)..total_pixels {
        let at = px * 3;
        let lin_r = lin_lut[src[at] as usize];
        let lin_g = lin_lut[src[at + 1] as usize];
        let lin_b = lin_lut[src[at + 2] as usize];
        let (nr, ng, nb) = mat3x3(m, lin_r, lin_g, lin_b);
        dst[at] = (scalar_enc(nr) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;
        dst[at + 1] = (scalar_enc(ng) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;
        dst[at + 2] = (scalar_enc(nb) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;
    }
}
/// Scalar counterpart of the fused 8-bit RGB conversion.
///
/// Per pixel: look each byte up in `lin_lut`, multiply by `m`, apply
/// `scalar_enc`, then quantize with `x * 255 + 0.5` clamped to `[0, 255]`.
/// The token and `_dst_trc` arguments are unused.
fn convert_u8_rgb_fused_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    _dst_trc: TransferFunction,
    scalar_enc: fn(f32) -> f32,
) {
    let pixels = src.chunks_exact(3).zip(dst.chunks_exact_mut(3));
    for (input, output) in pixels {
        let (nr, ng, nb) = mat3x3(
            m,
            lin_lut[usize::from(input[0])],
            lin_lut[usize::from(input[1])],
            lin_lut[usize::from(input[2])],
        );
        for (slot, linear) in output.iter_mut().zip([nr, ng, nb]) {
            *slot = (scalar_enc(linear) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;
        }
    }
}
/// Crate-level entry point for the fused 8-bit RGB conversion
/// (table-lookup linearize, matrix multiply, encode, quantize).
///
/// `dst_trc` is forwarded to the x86_64 block kernel, while `scalar_enc` is
/// used for pixels converted one at a time.
/// NOTE(review): the two are presumably expected to describe the same
/// transfer function — verify against callers.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`.
pub(crate) fn convert_u8_rgb_simd_fused(
m: &[[f32; 3]; 3],
src: &[u8],
dst: &mut [u8],
lin_lut: &[f32; 256],
dst_trc: TransferFunction,
scalar_enc: fn(f32) -> f32,
) {
debug_assert_eq!(src.len() % 3, 0);
debug_assert_eq!(src.len(), dst.len());
// x86_64: dispatch through `incant!`.
// NOTE(review): presumably resolves to `convert_u8_rgb_fused_v3` or
// `convert_u8_rgb_fused_scalar` at runtime — confirm in the archmage docs.
#[cfg(target_arch = "x86_64")]
{
incant!(convert_u8_rgb_fused(
m, src, dst, lin_lut, dst_trc, scalar_enc
));
return;
}
// Other targets: call the scalar implementation directly.
#[cfg(not(target_arch = "x86_64"))]
convert_u8_rgb_fused_scalar(ScalarToken, m, src, dst, lin_lut, dst_trc, scalar_enc);
}
/// Converts one block of 8 interleaved 8-bit RGBA pixels.
///
/// Colour bytes are linearized through `lin_lut`, multiplied by `m` on
/// 8-lane vectors and encoded per lane with `enc_u8`; the fourth byte of each
/// pixel is copied from `src` to `dst` unchanged. Both slices are indexed up
/// to position 31.
#[cfg(target_arch = "x86_64")]
#[rite]
fn convert_8px_u8_rgba_simd(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    // De-interleave while linearizing: one array per colour channel.
    let mut lin_r = [0.0f32; 8];
    let mut lin_g = [0.0f32; 8];
    let mut lin_b = [0.0f32; 8];
    for (lane, at) in (0..32).step_by(4).enumerate() {
        lin_r[lane] = lin_lut[src[at] as usize];
        lin_g[lane] = lin_lut[src[at + 1] as usize];
        lin_b[lane] = lin_lut[src[at + 2] as usize];
    }

    let (mixed_r, mixed_g, mixed_b) = mat3x3_x8(
        token,
        m,
        mt_f32x8::from_array(token, lin_r),
        mt_f32x8::from_array(token, lin_g),
        mt_f32x8::from_array(token, lin_b),
    );
    let out_r = mixed_r.to_array();
    let out_g = mixed_g.to_array();
    let out_b = mixed_b.to_array();

    for (lane, at) in (0..32).step_by(4).enumerate() {
        dst[at] = enc_u8(out_r[lane]);
        dst[at + 1] = enc_u8(out_g[lane]);
        dst[at + 2] = enc_u8(out_b[lane]);
        // Fourth byte is carried over verbatim.
        dst[at + 3] = src[at + 3];
    }
}
/// x86_64 driver for the table-lookup 8-bit RGBA conversion.
///
/// Whole groups of 8 pixels (32 bytes) are handed to
/// `convert_8px_u8_rgba_simd`; the remaining whole pixels are converted one
/// at a time. The fourth byte of every pixel is copied through unchanged.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn convert_u8_rgba_lut_simd_v3(
    token: X64V3Token,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    let total_pixels = src.len() / 4;
    let full_blocks = total_pixels / 8;

    // Vector part: one 32-byte block per iteration.
    for block in 0..full_blocks {
        let start = block * 32;
        let end = start + 32;
        convert_8px_u8_rgba_simd(
            token,
            m,
            &src[start..end],
            &mut dst[start..end],
            lin_lut,
            enc_u8,
        );
    }

    // Scalar part: the pixels that did not fill a whole block.
    for px in (full_blocks * 8)..total_pixels {
        let at = px * 4;
        let lin_r = lin_lut[src[at] as usize];
        let lin_g = lin_lut[src[at + 1] as usize];
        let lin_b = lin_lut[src[at + 2] as usize];
        let (nr, ng, nb) = mat3x3(m, lin_r, lin_g, lin_b);
        dst[at] = enc_u8(nr);
        dst[at + 1] = enc_u8(ng);
        dst[at + 2] = enc_u8(nb);
        dst[at + 3] = src[at + 3];
    }
}
/// Scalar counterpart of the table-lookup 8-bit RGBA conversion.
///
/// Per pixel: look the three colour bytes up in `lin_lut`, multiply by `m`,
/// encode each channel with `enc_u8`, and copy the fourth byte through
/// unchanged. The token argument is unused.
fn convert_u8_rgba_lut_simd_scalar(
    _token: ScalarToken,
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    lin_lut: &[f32; 256],
    enc_u8: fn(f32) -> u8,
) {
    let pixels = src.chunks_exact(4).zip(dst.chunks_exact_mut(4));
    for (input, output) in pixels {
        let (nr, ng, nb) = mat3x3(
            m,
            lin_lut[usize::from(input[0])],
            lin_lut[usize::from(input[1])],
            lin_lut[usize::from(input[2])],
        );
        output[0] = enc_u8(nr);
        output[1] = enc_u8(ng);
        output[2] = enc_u8(nb);
        output[3] = input[3];
    }
}
/// Crate-level entry point for the table-lookup 8-bit RGBA conversion
/// (table-lookup linearize, matrix multiply, `enc_u8` per channel).
///
/// Debug builds assert that `src.len()` is a multiple of 4 and equals
/// `dst.len()`.
pub(crate) fn convert_u8_rgba_simd_lut(
m: &[[f32; 3]; 3],
src: &[u8],
dst: &mut [u8],
lin_lut: &[f32; 256],
enc_u8: fn(f32) -> u8,
) {
debug_assert_eq!(src.len() % 4, 0);
debug_assert_eq!(src.len(), dst.len());
// x86_64: dispatch through `incant!`.
// NOTE(review): presumably resolves to `convert_u8_rgba_lut_simd_v3` or
// `convert_u8_rgba_lut_simd_scalar` at runtime — confirm in the archmage docs.
#[cfg(target_arch = "x86_64")]
{
incant!(convert_u8_rgba_lut_simd(m, src, dst, lin_lut, enc_u8));
return;
}
// Other targets: call the scalar implementation directly.
#[cfg(not(target_arch = "x86_64"))]
convert_u8_rgba_lut_simd_scalar(ScalarToken, m, src, dst, lin_lut, enc_u8);
}
/// Converts interleaved 8-bit RGBA from `src` into `dst`.
///
/// Per pixel: scale each colour byte to `[0, 1]`, apply `linearize_fn`,
/// multiply by `m`, apply `encode_fn`, then scale by 255, add 0.5, clamp to
/// `[0, 255]` and truncate to `u8`. The fourth byte is copied unchanged.
///
/// Debug builds assert that `src.len()` is a multiple of 4 and equals
/// `dst.len()`; without those assertions only the pixels present in both
/// slices are processed.
pub(crate) fn convert_u8_rgba(
    m: &[[f32; 3]; 3],
    src: &[u8],
    dst: &mut [u8],
    linearize_fn: fn(f32) -> f32,
    encode_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 4, 0);
    debug_assert_eq!(src.len(), dst.len());

    let pixels = src.chunks_exact(4).zip(dst.chunks_exact_mut(4));
    for (input, output) in pixels {
        let [r, g, b] = [input[0], input[1], input[2]].map(|c| linearize_fn(c as f32 / 255.0));
        let (nr, ng, nb) = mat3x3(m, r, g, b);
        // `zip` stops after the three colour channels, leaving slot 3 for the copy below.
        for (slot, linear) in output.iter_mut().zip([nr, ng, nb]) {
            *slot = (encode_fn(linear) * 255.0 + 0.5).clamp(0.0, 255.0) as u8;
        }
        output[3] = input[3];
    }
}
/// Converts interleaved 16-bit RGB from `src` into `dst`.
///
/// Per pixel: scale each sample to `[0, 1]`, apply `linearize_fn`, multiply
/// by `m`, apply `encode_fn`, then scale by 65535, add 0.5, clamp to
/// `[0, 65535]` and truncate to `u16`.
///
/// Debug builds assert that `src.len()` is a multiple of 3 and equals
/// `dst.len()`; without those assertions only the pixels present in both
/// slices are processed.
pub(crate) fn convert_u16_rgb(
    m: &[[f32; 3]; 3],
    src: &[u16],
    dst: &mut [u16],
    linearize_fn: fn(f32) -> f32,
    encode_fn: fn(f32) -> f32,
) {
    debug_assert_eq!(src.len() % 3, 0);
    debug_assert_eq!(src.len(), dst.len());

    let pixels = src.chunks_exact(3).zip(dst.chunks_exact_mut(3));
    for (input, output) in pixels {
        let [r, g, b] = [input[0], input[1], input[2]].map(|c| linearize_fn(c as f32 / 65535.0));
        let (nr, ng, nb) = mat3x3(m, r, g, b);
        for (slot, linear) in output.iter_mut().zip([nr, ng, nb]) {
            *slot = (encode_fn(linear) * 65535.0 + 0.5).clamp(0.0, 65535.0) as u16;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ColorPrimaries;

    /// Gamut matrix from `src` primaries to `dst` primaries; panics if the
    /// crate cannot produce one.
    fn m(src: ColorPrimaries, dst: ColorPrimaries) -> [[f32; 3]; 3] {
        src.gamut_matrix_to(dst).unwrap()
    }

    /// Runs the RGB dispatcher and fails the test if it reports the pair as
    /// unsupported.
    ///
    /// Without this check most tests below would pass vacuously: white, black,
    /// roundtrip and in-gamut inputs all satisfy their assertions when the
    /// dispatcher returns `false` and leaves the data untouched.
    fn rgb_checked(
        mat: &[[f32; 3]; 3],
        px: &mut [f32],
        src_trc: TransferFunction,
        dst_trc: TransferFunction,
    ) {
        assert!(
            convert_f32_rgb_dispatch(mat, px, src_trc, dst_trc),
            "RGB dispatch reported an unsupported transfer-function pair"
        );
    }

    #[test]
    fn dispatch_p3_srgb_white() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        rgb_checked(
            &mat,
            &mut px,
            TransferFunction::Srgb,
            TransferFunction::Srgb,
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-4, "white: {px:?}");
        }
    }

    #[test]
    fn dispatch_p3_srgb_black() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [0.0f32, 0.0, 0.0];
        rgb_checked(
            &mat,
            &mut px,
            TransferFunction::Srgb,
            TransferFunction::Srgb,
        );
        for c in &px {
            assert!(c.abs() < 1e-6, "black: {px:?}");
        }
    }

    #[test]
    fn dispatch_p3_srgb_roundtrip() {
        let fwd = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let inv = m(ColorPrimaries::Bt709, ColorPrimaries::DisplayP3);
        let original = [0.5f32, 0.3, 0.7];
        let mut px = original;
        rgb_checked(
            &fwd,
            &mut px,
            TransferFunction::Srgb,
            TransferFunction::Srgb,
        );
        rgb_checked(
            &inv,
            &mut px,
            TransferFunction::Srgb,
            TransferFunction::Srgb,
        );
        for i in 0..3 {
            assert!((original[i] - px[i]).abs() < 1e-4, "ch{i}: {}", px[i]);
        }
    }

    #[test]
    fn dispatch_bt2020_sdr_srgb_white() {
        let mat = m(ColorPrimaries::Bt2020, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        rgb_checked(
            &mat,
            &mut px,
            TransferFunction::Bt709,
            TransferFunction::Srgb,
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-4, "white: {px:?}");
        }
    }

    #[test]
    fn dispatch_bt2020_pq_srgb_black() {
        let mat = m(ColorPrimaries::Bt2020, ColorPrimaries::Bt709);
        let mut px = [0.0f32, 0.0, 0.0];
        rgb_checked(&mat, &mut px, TransferFunction::Pq, TransferFunction::Srgb);
        for c in &px {
            assert!(c.abs() < 1e-5, "black: {px:?}");
        }
    }

    #[test]
    fn dispatch_adobe_srgb_white() {
        let mat = m(ColorPrimaries::AdobeRgb, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        rgb_checked(
            &mat,
            &mut px,
            TransferFunction::Gamma22,
            TransferFunction::Srgb,
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-4, "white: {px:?}");
        }
    }

    #[test]
    fn dispatch_rgba_alpha_passthrough() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [0.5f32, 0.5, 0.5, 0.7];
        // Assert success so the alpha check cannot pass on untouched data.
        assert!(
            convert_f32_rgba_dispatch(
                &mat,
                &mut px,
                TransferFunction::Srgb,
                TransferFunction::Srgb,
            ),
            "RGBA dispatch reported an unsupported transfer-function pair"
        );
        assert!((px[3] - 0.7).abs() < f32::EPSILON, "alpha: {px:?}");
    }

    #[test]
    fn dispatch_linear_white() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [1.0f32, 1.0, 1.0];
        rgb_checked(
            &mat,
            &mut px,
            TransferFunction::Linear,
            TransferFunction::Linear,
        );
        for c in &px {
            assert!((c - 1.0).abs() < 1e-6, "linear white: {px:?}");
        }
    }

    #[test]
    fn dispatch_returns_false_for_unknown_trc() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut px = [0.5f32, 0.5, 0.5];
        assert!(!convert_f32_rgb_dispatch(
            &mat,
            &mut px,
            TransferFunction::Unknown,
            TransferFunction::Srgb
        ));
    }

    #[test]
    fn u8_white_black() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut dst = [0u8; 3];
        convert_u8_rgb(
            &mat,
            &[255, 255, 255],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [255, 255, 255]);
        convert_u8_rgb(
            &mat,
            &[0, 0, 0],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [0, 0, 0]);
    }

    #[test]
    fn u8_lut_lut_matches_scalar() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let lut = build_linearize_lut(linear_srgb::tf::srgb_to_linear);
        let enc_u8 = scalar_encode_u8(TransferFunction::Srgb).unwrap();
        let src = [128u8, 64, 200];
        let mut dst_scalar = [0u8; 3];
        let mut dst_lut = [0u8; 3];
        convert_u8_rgb(
            &mat,
            &src,
            &mut dst_scalar,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        convert_u8_rgb_lut_lut(&mat, &src, &mut dst_lut, &lut, enc_u8);
        assert_eq!(
            dst_scalar, dst_lut,
            "LUT-LUT and scalar should produce identical u8 output"
        );
    }

    #[test]
    fn u8_rgba_alpha_passthrough() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut dst = [0u8; 4];
        convert_u8_rgba(
            &mat,
            &[128, 64, 32, 200],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst[3], 200);
    }

    #[test]
    fn u16_white_black() {
        let mat = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut dst = [0u16; 3];
        convert_u16_rgb(
            &mat,
            &[65535, 65535, 65535],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [65535, 65535, 65535]);
        convert_u16_rgb(
            &mat,
            &[0, 0, 0],
            &mut dst,
            linear_srgb::tf::srgb_to_linear,
            linear_srgb::tf::linear_to_srgb,
        );
        assert_eq!(dst, [0, 0, 0]);
    }

    #[test]
    fn f32_roundtrip_accuracy() {
        let fwd = m(ColorPrimaries::Bt709, ColorPrimaries::DisplayP3);
        let inv = m(ColorPrimaries::DisplayP3, ColorPrimaries::Bt709);
        let mut max_err: f32 = 0.0;
        for r in (0..=255).step_by(4) {
            for g in (0..=255).step_by(4) {
                for b in (0..=255).step_by(16) {
                    let original = [r as f32 / 255.0, g as f32 / 255.0, b as f32 / 255.0];
                    let mut px = original;
                    rgb_checked(
                        &fwd,
                        &mut px,
                        TransferFunction::Srgb,
                        TransferFunction::Srgb,
                    );
                    rgb_checked(
                        &inv,
                        &mut px,
                        TransferFunction::Srgb,
                        TransferFunction::Srgb,
                    );
                    for i in 0..3 {
                        let err = (original[i] - px[i]).abs();
                        if err > max_err {
                            max_err = err;
                        }
                    }
                }
            }
        }
        assert!(max_err < 1e-4, "max roundtrip error: {max_err}");
    }

    #[test]
    fn srgb_subset_of_p3() {
        let mat = m(ColorPrimaries::Bt709, ColorPrimaries::DisplayP3);
        for r in (0..=255).step_by(17) {
            for g in (0..=255).step_by(17) {
                for b in (0..=255).step_by(17) {
                    let mut px = [r as f32 / 255.0, g as f32 / 255.0, b as f32 / 255.0];
                    rgb_checked(
                        &mat,
                        &mut px,
                        TransferFunction::Srgb,
                        TransferFunction::Srgb,
                    );
                    for (i, c) in px.iter().enumerate() {
                        assert!(*c >= -1e-5 && *c <= 1.0 + 1e-5, "({r},{g},{b}) ch{i}: {c}");
                    }
                }
            }
        }
    }
}