use archmage::rite;
pub use archmage::X64V3Token;
use magetypes::simd::f32x8 as mt_f32x8;
/// Encoded-domain cutoff: `srgb_to_linear_v3` selects the linear segment for
/// lanes strictly below this value.
const SRGB_LINEAR_THRESHOLD: f32 = 0.039_293_37;
/// Linear-domain cutoff used by `linear_to_srgb_v3`. Numerically this is
/// `SRGB_LINEAR_THRESHOLD / 12.92` (to `f32` precision).
const LINEAR_THRESHOLD: f32 = 0.003_041_282_6;
/// Slope applied to the linear segment when decoding (encoded -> linear).
const LINEAR_SCALE: f32 = 1.0 / 12.92;
/// Slope applied to the linear segment when encoding (linear -> encoded).
const TWELVE_92: f32 = 12.92;
/// Decodes eight sRGB-encoded samples to linear values.
///
/// Every lane is first clamped to `[0.0, 1.0]`. Two candidate results are then
/// computed for all lanes and merged with a per-lane mask:
///
/// * the linear segment, `x * LINEAR_SCALE`;
/// * the rational approximation `P(x) / Q(x)`, where `P` and `Q` are the
///   degree-4 polynomials whose coefficients live in `S2L_P` / `S2L_Q`
///   (index 0 is the constant term), capped at `1.0`.
#[rite]
pub fn srgb_to_linear_v3(token: X64V3Token, srgb: [f32; 8]) -> [f32; 8] {
    use crate::rational_poly::{S2L_P, S2L_Q};

    let lo = mt_f32x8::zero(token);
    let hi = mt_f32x8::splat(token, 1.0);
    let x = mt_f32x8::from_array(token, srgb).max(lo).min(hi);

    // Horner evaluation of the numerator, highest coefficient first.
    let numer = mt_f32x8::splat(token, S2L_P[4])
        .mul_add(x, mt_f32x8::splat(token, S2L_P[3]))
        .mul_add(x, mt_f32x8::splat(token, S2L_P[2]))
        .mul_add(x, mt_f32x8::splat(token, S2L_P[1]))
        .mul_add(x, mt_f32x8::splat(token, S2L_P[0]));

    // Same scheme for the denominator.
    let denom = mt_f32x8::splat(token, S2L_Q[4])
        .mul_add(x, mt_f32x8::splat(token, S2L_Q[3]))
        .mul_add(x, mt_f32x8::splat(token, S2L_Q[2]))
        .mul_add(x, mt_f32x8::splat(token, S2L_Q[1]))
        .mul_add(x, mt_f32x8::splat(token, S2L_Q[0]));

    let curve = (numer / denom).min(hi);
    let segment = x * mt_f32x8::splat(token, LINEAR_SCALE);

    // `blend` receives (mask, segment, curve); lanes whose mask is set are
    // expected to take the linear segment.
    let below_cutoff = x.simd_lt(mt_f32x8::splat(token, SRGB_LINEAR_THRESHOLD));
    mt_f32x8::blend(below_cutoff, segment, curve).to_array()
}
/// Encodes eight linear samples to sRGB.
///
/// Every lane is first clamped to `[0.0, 1.0]`. Two candidate results are then
/// computed for all lanes and merged with a per-lane mask:
///
/// * the linear segment, `linear * TWELVE_92`;
/// * the rational approximation `P(s) / Q(s)` evaluated at `s = sqrt(linear)`,
///   where `P` and `Q` are the degree-4 polynomials whose coefficients live in
///   `L2S_P` / `L2S_Q` (index 0 is the constant term), capped at `1.0`.
#[rite]
pub fn linear_to_srgb_v3(token: X64V3Token, linear: [f32; 8]) -> [f32; 8] {
    use crate::rational_poly::{L2S_P, L2S_Q};

    let lo = mt_f32x8::zero(token);
    let hi = mt_f32x8::splat(token, 1.0);
    let clamped = mt_f32x8::from_array(token, linear).max(lo).min(hi);

    // The polynomials are expressed in the square root of the input.
    let root = clamped.sqrt();

    // Horner evaluation of the numerator, highest coefficient first.
    let numer = mt_f32x8::splat(token, L2S_P[4])
        .mul_add(root, mt_f32x8::splat(token, L2S_P[3]))
        .mul_add(root, mt_f32x8::splat(token, L2S_P[2]))
        .mul_add(root, mt_f32x8::splat(token, L2S_P[1]))
        .mul_add(root, mt_f32x8::splat(token, L2S_P[0]));

    // Same scheme for the denominator.
    let denom = mt_f32x8::splat(token, L2S_Q[4])
        .mul_add(root, mt_f32x8::splat(token, L2S_Q[3]))
        .mul_add(root, mt_f32x8::splat(token, L2S_Q[2]))
        .mul_add(root, mt_f32x8::splat(token, L2S_Q[1]))
        .mul_add(root, mt_f32x8::splat(token, L2S_Q[0]));

    let curve = (numer / denom).min(hi);
    let segment = clamped * mt_f32x8::splat(token, TWELVE_92);

    // `blend` receives (mask, segment, curve); lanes whose mask is set are
    // expected to take the linear segment.
    let below_cutoff = clamped.simd_lt(mt_f32x8::splat(token, LINEAR_THRESHOLD));
    mt_f32x8::blend(below_cutoff, segment, curve).to_array()
}
/// Power-law decode of eight samples: each lane is clamped to `[0.0, 1.0]`
/// and then raised to the exponent `gamma` via `pow_midp`.
#[rite]
pub fn gamma_to_linear_v3(token: X64V3Token, encoded: [f32; 8], gamma: f32) -> [f32; 8] {
    let lower = mt_f32x8::zero(token);
    let upper = mt_f32x8::splat(token, 1.0);
    mt_f32x8::from_array(token, encoded)
        .max(lower)
        .min(upper)
        .pow_midp(gamma)
        .to_array()
}
/// Power-law encode of eight samples: each lane is clamped to `[0.0, 1.0]`
/// and then raised to the exponent `1.0 / gamma` via `pow_midp`.
#[rite]
pub fn linear_to_gamma_v3(token: X64V3Token, linear: [f32; 8], gamma: f32) -> [f32; 8] {
    let lower = mt_f32x8::zero(token);
    let upper = mt_f32x8::splat(token, 1.0);
    // Encoding uses the reciprocal of the decode exponent.
    let exponent = 1.0 / gamma;
    mt_f32x8::from_array(token, linear)
        .max(lower)
        .min(upper)
        .pow_midp(exponent)
        .to_array()
}
/// Converts eight 8-bit sRGB codes to linear `f32` values by looking each one
/// up in `crate::const_luts::LINEAR_TABLE_8`.
///
/// The token is not used by the body; it is only part of the signature.
#[rite]
pub fn srgb_u8_to_linear_v3(_token: X64V3Token, srgb: [u8; 8]) -> [f32; 8] {
    let table = &crate::const_luts::LINEAR_TABLE_8;
    // One table read per input byte, in lane order.
    srgb.map(|code| table[code as usize])
}
/// Converts a slice of 8-bit sRGB codes to linear `f32` values, writing the
/// results into `output` at the same positions.
///
/// Each byte is looked up in `crate::const_luts::LINEAR_TABLE_8`. Work is
/// split into full groups of eight followed by a shorter tail.
///
/// # Panics
///
/// Panics if `input` and `output` have different lengths.
#[rite]
pub fn srgb_u8_to_linear_slice_v3(_token: X64V3Token, input: &[u8], output: &mut [f32]) {
    assert_eq!(input.len(), output.len());
    let table = &crate::const_luts::LINEAR_TABLE_8;

    let (src_blocks, src_tail) = input.as_chunks::<8>();
    let (dst_blocks, dst_tail) = output.as_chunks_mut::<8>();

    // Full groups of eight.
    for (dst, src) in dst_blocks.iter_mut().zip(src_blocks.iter()) {
        *dst = (*src).map(|code| table[code as usize]);
    }

    // Leftover elements (fewer than eight).
    for (dst, &code) in dst_tail.iter_mut().zip(src_tail.iter()) {
        *dst = table[code as usize];
    }
}
/// Converts eight linear `f32` samples to 8-bit sRGB codes through the
/// `crate::const_luts::LINEAR_TO_SRGB_U8` table.
///
/// Each lane is clamped to `[0.0, 1.0]`, scaled by `4095.0` and offset by
/// `0.5`; the truncating cast to `usize` then yields the nearest table index.
/// The index is additionally masked to 12 bits before the lookup.
#[rite]
pub fn linear_to_srgb_u8_v3(token: X64V3Token, linear: [f32; 8]) -> [u8; 8] {
    let lower = mt_f32x8::zero(token);
    let upper = mt_f32x8::splat(token, 1.0);
    let clamped = mt_f32x8::from_array(token, linear).max(lower).min(upper);

    // Kept as a separate multiply and add (not fused) to match the
    // established rounding behavior.
    let positions = clamped * mt_f32x8::splat(token, 4095.0) + mt_f32x8::splat(token, 0.5);

    let table = &crate::const_luts::LINEAR_TO_SRGB_U8;
    // The mask keeps every index in 0..=0xFFF — presumably so the bounds
    // check on the table can be elided (TODO confirm the table has 4096 entries).
    positions
        .to_array()
        .map(|pos| table[pos as usize & 0xFFF])
}
/// In-place sRGB -> linear conversion of `values`.
///
/// Full groups of eight go through `srgb_to_linear_v3`; the leftover tail
/// (fewer than eight elements) is converted one value at a time with
/// `crate::scalar::srgb_to_linear`.
#[rite]
pub fn srgb_to_linear_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let (blocks, tail) = values.as_chunks_mut::<8>();
    for lanes in blocks.iter_mut() {
        *lanes = srgb_to_linear_v3(token, *lanes);
    }
    for sample in tail.iter_mut() {
        *sample = crate::scalar::srgb_to_linear(*sample);
    }
}
/// In-place linear -> sRGB conversion of `values`.
///
/// Full groups of eight go through `linear_to_srgb_v3`; the leftover tail
/// (fewer than eight elements) is converted one value at a time with
/// `crate::scalar::linear_to_srgb`.
#[rite]
pub fn linear_to_srgb_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let (blocks, tail) = values.as_chunks_mut::<8>();
    for lanes in blocks.iter_mut() {
        *lanes = linear_to_srgb_v3(token, *lanes);
    }
    for sample in tail.iter_mut() {
        *sample = crate::scalar::linear_to_srgb(*sample);
    }
}
/// In-place power-law decode of `values` with exponent `gamma`.
///
/// Full groups of eight go through `gamma_to_linear_v3`; the leftover tail
/// (fewer than eight elements) is converted one value at a time with
/// `crate::scalar::gamma_to_linear`.
#[rite]
pub fn gamma_to_linear_slice_v3(token: X64V3Token, values: &mut [f32], gamma: f32) {
    let (blocks, tail) = values.as_chunks_mut::<8>();
    for lanes in blocks.iter_mut() {
        *lanes = gamma_to_linear_v3(token, *lanes, gamma);
    }
    for sample in tail.iter_mut() {
        *sample = crate::scalar::gamma_to_linear(*sample, gamma);
    }
}
/// In-place power-law encode of `values` for the given `gamma`.
///
/// Full groups of eight go through `linear_to_gamma_v3`; the leftover tail
/// (fewer than eight elements) is converted one value at a time with
/// `crate::scalar::linear_to_gamma`.
#[rite]
pub fn linear_to_gamma_slice_v3(token: X64V3Token, values: &mut [f32], gamma: f32) {
    let (blocks, tail) = values.as_chunks_mut::<8>();
    for lanes in blocks.iter_mut() {
        *lanes = linear_to_gamma_v3(token, *lanes, gamma);
    }
    for sample in tail.iter_mut() {
        *sample = crate::scalar::linear_to_gamma(*sample, gamma);
    }
}
/// Converts a slice of linear `f32` samples to 8-bit sRGB codes, writing the
/// results into `output` at the same positions.
///
/// Full groups of eight go through `linear_to_srgb_u8_v3`; the leftover tail
/// (fewer than eight elements) uses `crate::scalar::linear_to_srgb_u8`.
///
/// # Panics
///
/// Panics if `input` and `output` have different lengths.
#[rite]
pub fn linear_to_srgb_u8_slice_v3(token: X64V3Token, input: &[f32], output: &mut [u8]) {
    assert_eq!(input.len(), output.len());

    let (src_blocks, src_tail) = input.as_chunks::<8>();
    let (dst_blocks, dst_tail) = output.as_chunks_mut::<8>();

    for (dst, src) in dst_blocks.iter_mut().zip(src_blocks.iter()) {
        *dst = linear_to_srgb_u8_v3(token, *src);
    }
    for (dst, &sample) in dst_tail.iter_mut().zip(src_tail.iter()) {
        *dst = crate::scalar::linear_to_srgb_u8(sample);
    }
}
/// Array-in / array-out wrapper around `crate::tf::srgb::srgb_to_linear_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn tf_srgb_to_linear_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::srgb::srgb_to_linear_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::srgb::linear_to_srgb_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn tf_linear_to_srgb_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::srgb::linear_to_srgb_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::bt709::bt709_to_linear_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn bt709_to_linear_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::bt709::bt709_to_linear_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::bt709::linear_to_bt709_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn linear_to_bt709_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::bt709::linear_to_bt709_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::pq::pq_to_linear_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn pq_to_linear_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::pq::pq_to_linear_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::pq::linear_to_pq_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn linear_to_pq_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::pq::linear_to_pq_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::hlg::hlg_to_linear_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn hlg_to_linear_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::hlg::hlg_to_linear_x8(token, lanes);
    converted.to_array()
}
/// Array-in / array-out wrapper around `crate::tf::hlg::linear_to_hlg_x8`:
/// loads `v` into an eight-lane vector, applies the function, and returns the
/// lanes. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn linear_to_hlg_v3(token: X64V3Token, v: [f32; 8]) -> [f32; 8] {
    let lanes = mt_f32x8::from_array(token, v);
    let converted = crate::tf::hlg::linear_to_hlg_x8(token, lanes);
    converted.to_array()
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `tf_srgb_to_linear_v3`, the remaining elements use the scalar
/// `crate::tf::srgb_to_linear`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn tf_srgb_to_linear_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| tf_srgb_to_linear_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::srgb_to_linear);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `tf_linear_to_srgb_v3`, the remaining elements use the scalar
/// `crate::tf::linear_to_srgb`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn tf_linear_to_srgb_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| tf_linear_to_srgb_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::linear_to_srgb);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `bt709_to_linear_v3`, the remaining elements use the scalar
/// `crate::tf::bt709_to_linear`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn bt709_to_linear_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| bt709_to_linear_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::bt709_to_linear);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `linear_to_bt709_v3`, the remaining elements use the scalar
/// `crate::tf::linear_to_bt709`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn linear_to_bt709_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| linear_to_bt709_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::linear_to_bt709);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `pq_to_linear_v3`, the remaining elements use the scalar
/// `crate::tf::pq_to_linear`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn pq_to_linear_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| pq_to_linear_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::pq_to_linear);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `linear_to_pq_v3`, the remaining elements use the scalar
/// `crate::tf::linear_to_pq`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn linear_to_pq_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| linear_to_pq_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::linear_to_pq);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `hlg_to_linear_v3`, the remaining elements use the scalar
/// `crate::tf::hlg_to_linear`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn hlg_to_linear_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| hlg_to_linear_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::hlg_to_linear);
}
/// In-place conversion of `values` via `tf_slice_x8`: full groups of eight use
/// `linear_to_hlg_v3`, the remaining elements use the scalar
/// `crate::tf::linear_to_hlg`. Only compiled with the `transfer` feature.
#[cfg(feature = "transfer")]
#[rite]
pub fn linear_to_hlg_slice_v3(token: X64V3Token, values: &mut [f32]) {
    let vector_path = |lanes: [f32; 8]| linear_to_hlg_v3(token, lanes);
    tf_slice_x8(values, vector_path, crate::tf::linear_to_hlg);
}
/// Applies a transfer function to `values` in place.
///
/// The slice is split into full groups of eight plus a shorter tail. Each
/// group is replaced by `tf_x8(group)`; each tail element is replaced by
/// `tf_scalar(element)`. An empty slice is left untouched.
#[cfg(feature = "transfer")]
#[inline(always)]
fn tf_slice_x8(
    values: &mut [f32],
    tf_x8: impl Fn([f32; 8]) -> [f32; 8],
    tf_scalar: fn(f32) -> f32,
) {
    let (blocks, tail) = values.as_chunks_mut::<8>();

    // Plain `for` loops are kept so this body inlines into callers unchanged.
    for lanes in blocks.iter_mut() {
        *lanes = tf_x8(*lanes);
    }
    for sample in tail.iter_mut() {
        *sample = tf_scalar(*sample);
    }
}
// Tests for the `X64V3Token` kernels. Every test returns early (after printing
// a notice) when no token can be obtained on the running machine.
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(not(feature = "std"))]
    use alloc::{vec, vec::Vec};
    use archmage::SimdToken;

    /// Probe values shared by the eight-lane tests.
    const SAMPLES: [f32; 8] = [0.0, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1.0];

    /// Tries to obtain the token the kernels require.
    fn get_token() -> Option<X64V3Token> {
        X64V3Token::try_new()
    }

    // `#[archmage::arcane]` entry points that forward to the `#[rite]` kernels
    // under test.

    #[archmage::arcane]
    fn call_srgb_to_linear(token: X64V3Token, input: [f32; 8]) -> [f32; 8] {
        srgb_to_linear_v3(token, input)
    }

    #[archmage::arcane]
    fn call_linear_to_srgb(token: X64V3Token, input: [f32; 8]) -> [f32; 8] {
        linear_to_srgb_v3(token, input)
    }

    #[archmage::arcane]
    fn call_srgb_to_linear_slice(token: X64V3Token, values: &mut [f32]) {
        srgb_to_linear_slice_v3(token, values);
    }

    #[archmage::arcane]
    fn call_linear_to_srgb_slice(token: X64V3Token, values: &mut [f32]) {
        linear_to_srgb_slice_v3(token, values);
    }

    #[archmage::arcane]
    fn call_linear_to_srgb_u8(token: X64V3Token, input: [f32; 8]) -> [u8; 8] {
        linear_to_srgb_u8_v3(token, input)
    }

    /// The table-based u8 encoder may differ from the scalar one by at most one code.
    #[test]
    fn test_x8_linear_to_srgb_u8() {
        let token = match get_token() {
            Some(t) => t,
            None => {
                eprintln!("Skipping test: AVX2+FMA not available");
                return;
            }
        };
        let input = SAMPLES;
        let encoded = call_linear_to_srgb_u8(token, input);
        for (i, &inp) in input.iter().enumerate() {
            let got = encoded[i];
            let expected = crate::scalar::linear_to_srgb_u8(inp);
            let delta = (got as i32 - expected as i32).abs();
            assert!(
                delta <= 1,
                "u8 mismatch at {}: got {}, expected {} (input={})",
                i,
                got,
                expected,
                inp
            );
        }
    }

    /// Decoding then encoding eight samples returns to the start within 1e-4.
    #[test]
    fn test_x8_srgb_roundtrip() {
        let token = match get_token() {
            Some(t) => t,
            None => {
                eprintln!("Skipping test: AVX2+FMA not available");
                return;
            }
        };
        let input = SAMPLES;
        let roundtrip = call_linear_to_srgb(token, call_srgb_to_linear(token, input));
        for (i, &orig) in input.iter().enumerate() {
            let rt = roundtrip[i];
            let error = (orig - rt).abs();
            assert!(
                error < 1e-4,
                "roundtrip failed at {}: {} -> {}",
                i,
                orig,
                rt
            );
        }
    }

    /// The eight-lane decoder agrees with the scalar decoder within 1e-5.
    #[test]
    fn test_x8_matches_scalar() {
        let token = match get_token() {
            Some(t) => t,
            None => {
                eprintln!("Skipping test: AVX2+FMA not available");
                return;
            }
        };
        let input = SAMPLES;
        let decoded = call_srgb_to_linear(token, input);
        for (i, &inp) in input.iter().enumerate() {
            let got = decoded[i];
            let expected = crate::scalar::srgb_to_linear(inp);
            let error = (got - expected).abs();
            assert!(
                error < 1e-5,
                "mismatch at {}: got {}, expected {}",
                i,
                got,
                expected
            );
        }
    }

    /// The slice decoder agrees with the scalar decoder on 100 samples,
    /// a length that exercises both full groups and the tail.
    #[test]
    fn test_slice_matches_scalar() {
        let token = match get_token() {
            Some(t) => t,
            None => {
                eprintln!("Skipping test: AVX2+FMA not available");
                return;
            }
        };
        let source: Vec<f32> = (0..100).map(|i| i as f32 / 99.0).collect();
        let mut values = source.clone();
        call_srgb_to_linear_slice(token, &mut values);
        for (i, (&got, &inp)) in values.iter().zip(source.iter()).enumerate() {
            let exp = crate::scalar::srgb_to_linear(inp);
            let error = (got - exp).abs();
            assert!(
                error < 1e-5,
                "mismatch at {}: got {}, expected {}",
                i,
                got,
                exp
            );
        }
    }

    /// Decoding then encoding a 1000-element slice returns to the start within 1e-4.
    #[test]
    fn test_slice_roundtrip() {
        let token = match get_token() {
            Some(t) => t,
            None => {
                eprintln!("Skipping test: AVX2+FMA not available");
                return;
            }
        };
        let original: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
        let mut values = original.clone();
        call_srgb_to_linear_slice(token, &mut values);
        call_linear_to_srgb_slice(token, &mut values);
        for (i, (&orig, &conv)) in original.iter().zip(values.iter()).enumerate() {
            let error = (orig - conv).abs();
            assert!(
                error < 1e-4,
                "roundtrip failed at {}: {} -> {}",
                i,
                orig,
                conv
            );
        }
    }
}