#[cfg(not(target_arch = "x86_64"))]
fn main() {}
#[cfg(target_arch = "x86_64")]
mod x86_impl {
#![allow(clippy::needless_borrows_for_generic_args)]
use archmage::{SimdToken, X64V3Token, arcane};
use magetypes::simd::f32x8;
fn ulp_diff(a: f32, b: f32) -> u32 {
if a.is_nan() || b.is_nan() {
return u32::MAX;
}
if a == b {
return 0;
}
if a == 0.0 || b == 0.0 {
let non_zero = if a == 0.0 { b } else { a };
return non_zero.to_bits();
}
if a.is_sign_positive() != b.is_sign_positive() {
return u32::MAX; }
let a_bits = a.to_bits();
let b_bits = b.to_bits();
a_bits.abs_diff(b_bits)
}
#[allow(dead_code)]
struct AccuracyStats {
max_ulp: u32,
avg_ulp: f64,
max_rel_err: f64,
avg_rel_err: f64,
samples: usize,
}
impl std::fmt::Display for AccuracyStats {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"max_ulp: {:>8}, avg_ulp: {:>8.2}, max_rel: {:.2e}, avg_rel: {:.2e}",
self.max_ulp, self.avg_ulp, self.max_rel_err, self.avg_rel_err
)
}
}
fn measure_accuracy<F, R>(
_name: &str,
test_values: &[f32],
archmage_fn: F,
reference_fn: R,
) -> AccuracyStats
where
F: Fn(&[f32]) -> Vec<f32>,
R: Fn(f32) -> f32,
{
let archmage_results = archmage_fn(test_values);
let mut max_ulp = 0u32;
let mut sum_ulp = 0u64;
let mut max_rel_err = 0.0f64;
let mut sum_rel_err = 0.0f64;
let mut count = 0usize;
for (i, &input) in test_values.iter().enumerate() {
let expected = reference_fn(input);
let got = archmage_results[i];
if !expected.is_finite() || !got.is_finite() {
continue;
}
let ulp = ulp_diff(got, expected);
max_ulp = max_ulp.max(ulp);
sum_ulp += ulp as u64;
if expected.abs() > 1e-10 {
let rel_err = ((got - expected) / expected).abs() as f64;
max_rel_err = max_rel_err.max(rel_err);
sum_rel_err += rel_err;
}
count += 1;
}
AccuracyStats {
max_ulp,
avg_ulp: sum_ulp as f64 / count as f64,
max_rel_err,
avg_rel_err: sum_rel_err / count as f64,
samples: count,
}
}
#[arcane]
fn exp2_lowp_chunk(token: X64V3Token, input: &[f32; 8]) -> [f32; 8] {
f32x8::load(token, input).exp2_lowp().to_array()
}
#[arcane]
fn log2_lowp_chunk(token: X64V3Token, input: &[f32; 8]) -> [f32; 8] {
f32x8::load(token, input).log2_lowp().to_array()
}
#[arcane]
fn pow_lowp_chunk(token: X64V3Token, input: &[f32; 8], exp: f32) -> [f32; 8] {
f32x8::load(token, input).pow_lowp(exp).to_array()
}
#[arcane]
fn pow_midp_chunk(token: X64V3Token, input: &[f32; 8], exp: f32) -> [f32; 8] {
f32x8::load(token, input).pow_midp(exp).to_array()
}
fn archmage_exp2(input: &[f32]) -> Vec<f32> {
let Some(token) = X64V3Token::summon() else {
return input.iter().map(|&x| x.exp2()).collect();
};
let mut output = vec![0.0f32; input.len()];
let chunks = input.len() / 8;
for i in 0..chunks {
let start = i * 8;
let arr: &[f32; 8] = input[start..start + 8].try_into().unwrap();
output[start..start + 8].copy_from_slice(&exp2_lowp_chunk(token, arr));
}
for i in (chunks * 8)..input.len() {
output[i] = input[i].exp2();
}
output
}
fn archmage_log2(input: &[f32]) -> Vec<f32> {
let Some(token) = X64V3Token::summon() else {
return input.iter().map(|&x| x.log2()).collect();
};
let mut output = vec![0.0f32; input.len()];
let chunks = input.len() / 8;
for i in 0..chunks {
let start = i * 8;
let arr: &[f32; 8] = input[start..start + 8].try_into().unwrap();
output[start..start + 8].copy_from_slice(&log2_lowp_chunk(token, arr));
}
for i in (chunks * 8)..input.len() {
output[i] = input[i].log2();
}
output
}
fn archmage_pow_lowp(input: &[f32], exp: f32) -> Vec<f32> {
let Some(token) = X64V3Token::summon() else {
return input.iter().map(|&x| x.powf(exp)).collect();
};
let mut output = vec![0.0f32; input.len()];
let chunks = input.len() / 8;
for i in 0..chunks {
let start = i * 8;
let arr: &[f32; 8] = input[start..start + 8].try_into().unwrap();
output[start..start + 8].copy_from_slice(&pow_lowp_chunk(token, arr, exp));
}
for i in (chunks * 8)..input.len() {
output[i] = input[i].powf(exp);
}
output
}
fn archmage_pow_midp(input: &[f32], exp: f32) -> Vec<f32> {
let Some(token) = X64V3Token::summon() else {
return input.iter().map(|&x| x.powf(exp)).collect();
};
let mut output = vec![0.0f32; input.len()];
let chunks = input.len() / 8;
for i in 0..chunks {
let start = i * 8;
let arr: &[f32; 8] = input[start..start + 8].try_into().unwrap();
output[start..start + 8].copy_from_slice(&pow_midp_chunk(token, arr, exp));
}
for i in (chunks * 8)..input.len() {
output[i] = input[i].powf(exp);
}
output
}
fn test_roundtrip_with<F>(name: &str, bit_depth: u32, pow_fn: F)
where
F: Fn(&[f32], f32) -> Vec<f32>,
{
let levels = 1u32 << bit_depth;
let max_val = (levels - 1) as f32;
let inputs: Vec<f32> = (0..levels).map(|i| i as f32 / max_val).collect();
let linear = pow_fn(&inputs, 2.4);
let back = pow_fn(&linear, 1.0 / 2.4);
let mut exact = 0u32;
let mut off_one = 0u32;
let mut off_more = 0u32;
let mut max_err = 0i32;
for (i, &original) in inputs.iter().enumerate() {
let recovered = back[i];
let orig_level = (original * max_val).round() as i32;
let recv_level = (recovered * max_val).round() as i32;
let error = (recv_level - orig_level).abs();
if error == 0 {
exact += 1;
} else if error == 1 {
off_one += 1;
} else {
off_more += 1;
max_err = max_err.max(error);
}
}
let total = levels as f32;
print!(
" {:12} {:>2}-bit: exact {:>5.1}%, ±1 {:>5.1}%, >1 {:>5.1}%",
name,
bit_depth,
100.0 * exact as f32 / total,
100.0 * off_one as f32 / total,
100.0 * off_more as f32 / total
);
if max_err > 0 {
println!(" (max {})", max_err);
} else {
println!();
}
}
#[allow(dead_code)]
fn test_srgb_roundtrip(bit_depth: u32) {
let levels = 1u32 << bit_depth;
let max_val = (levels - 1) as f32;
println!(
"\n=== {}-bit sRGB Round-trip Test ({} levels) ===",
bit_depth, levels
);
let Some(_token) = X64V3Token::summon() else {
eprintln!("AVX2+FMA not available");
return;
};
let inputs: Vec<f32> = (0..levels).map(|i| i as f32 / max_val).collect();
let linear = archmage_pow_lowp(&inputs, 2.4);
let back_to_srgb = archmage_pow_lowp(&linear, 1.0 / 2.4);
let mut exact_matches = 0u32;
let mut off_by_one = 0u32;
let mut off_by_more = 0u32;
let mut max_error_levels = 0i32;
for (i, &original) in inputs.iter().enumerate() {
let recovered = back_to_srgb[i];
let orig_level = (original * max_val).round() as i32;
let recv_level = (recovered * max_val).round() as i32;
let error = (recv_level - orig_level).abs();
if error == 0 {
exact_matches += 1;
} else if error == 1 {
off_by_one += 1;
} else {
off_by_more += 1;
max_error_levels = max_error_levels.max(error);
}
}
let total = levels as f32;
println!(
" Exact matches: {:>6} ({:>5.1}%)",
exact_matches,
100.0 * exact_matches as f32 / total
);
println!(
" Off by 1 level: {:>6} ({:>5.1}%)",
off_by_one,
100.0 * off_by_one as f32 / total
);
println!(
" Off by >1: {:>6} ({:>5.1}%)",
off_by_more,
100.0 * off_by_more as f32 / total
);
if off_by_more > 0 {
println!(" Max error: {} levels", max_error_levels);
}
let linear_std: Vec<f32> = inputs.iter().map(|&x| x.powf(2.4)).collect();
let back_std: Vec<f32> = linear_std.iter().map(|&x| x.powf(1.0 / 2.4)).collect();
let mut std_exact = 0u32;
let mut std_off_one = 0u32;
let mut std_off_more = 0u32;
for (i, &original) in inputs.iter().enumerate() {
let recovered = back_std[i];
let orig_level = (original * max_val).round() as i32;
let recv_level = (recovered * max_val).round() as i32;
let error = (recv_level - orig_level).abs();
if error == 0 {
std_exact += 1;
} else if error == 1 {
std_off_one += 1;
} else {
std_off_more += 1;
}
}
println!("\n (std::f32 comparison)");
println!(
" Exact matches: {:>6} ({:>5.1}%)",
std_exact,
100.0 * std_exact as f32 / total
);
println!(
" Off by 1 level: {:>6} ({:>5.1}%)",
std_off_one,
100.0 * std_off_one as f32 / total
);
println!(
" Off by >1: {:>6} ({:>5.1}%)",
std_off_more,
100.0 * std_off_more as f32 / total
);
}
pub fn main() {
println!("Archmage Transcendental Accuracy Analysis");
println!("=========================================\n");
if X64V3Token::summon().is_none() {
eprintln!("AVX2+FMA not available, skipping");
return;
}
let range_0_1: Vec<f32> = (1..=10000).map(|i| i as f32 / 10000.0).collect();
let range_0_255: Vec<f32> = (1..=255).map(|i| i as f32).collect();
let range_0_16k: Vec<f32> = (1..=16384).map(|i| i as f32).collect();
let log_range_small: Vec<f32> = (1..=10000).map(|i| i as f32 / 10000.0).collect();
let _log_range_medium: Vec<f32> = (1..=255).map(|i| i as f32 / 255.0 + 0.001).collect();
println!("=== exp2 Accuracy ===\n");
println!("Range [0, 1] (10K samples, normalized sRGB input):");
let stats = measure_accuracy("exp2", &range_0_1, archmage_exp2, |x| x.exp2());
println!(" {}", stats);
println!("\nRange [-10, 10] (gain map range):");
let gain_range: Vec<f32> = (-10000..=10000).map(|i| i as f32 / 1000.0).collect();
let stats = measure_accuracy("exp2", &gain_range, archmage_exp2, |x| x.exp2());
println!(" {}", stats);
println!("\n=== log2 Accuracy ===\n");
println!("Range (0, 1] (normalized sRGB):");
let stats = measure_accuracy("log2", &log_range_small, archmage_log2, |x| x.log2());
println!(" {}", stats);
println!("\nRange [1, 255] (8-bit levels):");
let stats = measure_accuracy("log2", &range_0_255, archmage_log2, |x| x.log2());
println!(" {}", stats);
println!("\nRange [1, 16384] (HDR range):");
let stats = measure_accuracy("log2", &range_0_16k, archmage_log2, |x| x.log2());
println!(" {}", stats);
println!("\n=== pow(x, 2.4) Accuracy (sRGB decode) ===\n");
println!("Range (0, 1] (normalized sRGB input):");
let stats = measure_accuracy(
"pow_2.4",
&range_0_1,
|x| archmage_pow_lowp(x, 2.4),
|x| x.powf(2.4),
);
println!(" {}", stats);
println!("\n=== pow_lowp(x, 1/2.4) Accuracy (sRGB encode) ===\n");
println!("Range (0, 1] (linear RGB input):");
let stats = measure_accuracy(
"pow_0.417",
&range_0_1,
|x| archmage_pow_lowp(x, 1.0 / 2.4),
|x| x.powf(1.0 / 2.4),
);
println!(" {}", stats);
println!("\n=== pow_lowp(x, 2.2) Accuracy (simple gamma) ===\n");
println!("Range (0, 1]:");
let stats = measure_accuracy(
"pow_2.2",
&range_0_1,
|x| archmage_pow_lowp(x, 2.2),
|x| x.powf(2.2),
);
println!(" {}", stats);
println!("\n=== MID-PRECISION Functions ===\n");
println!("pow_midp(x, 2.4) - Range (0, 1]:");
let stats = measure_accuracy(
"pow_midp_2.4",
&range_0_1,
|x| archmage_pow_midp(x, 2.4),
|x| x.powf(2.4),
);
println!(" {}", stats);
println!("\npow_midp(x, 1/2.4) - Range (0, 1]:");
let stats = measure_accuracy(
"pow_midp_0.417",
&range_0_1,
|x| archmage_pow_midp(x, 1.0 / 2.4),
|x| x.powf(1.0 / 2.4),
);
println!(" {}", stats);
println!("\n=== sRGB Round-trip Comparison ===");
println!("(pow(x, 2.4) then pow(result, 1/2.4), checking if we get back original level)\n");
let std_pow =
|input: &[f32], exp: f32| -> Vec<f32> { input.iter().map(|&x| x.powf(exp)).collect() };
for bits in [8, 10, 12, 16] {
test_roundtrip_with("pow_lowp", bits, &archmage_pow_lowp);
test_roundtrip_with("pow_midp", bits, &archmage_pow_midp);
test_roundtrip_with("std::f32", bits, &std_pow);
println!();
}
println!("=== Summary ===\n");
println!("LOWP functions (pow_lowp, exp2_lowp, log2_lowp):");
println!(" - ~90,000 ULP max error, ~0.5% relative error");
println!(" - Suitable for preview/thumbnails only");
println!();
println!("MIDP functions (pow_midp, exp2_midp, log2_midp):");
println!(" - ~140 ULP max error, ~8e-6 relative error");
println!(" - 100% exact round-trips for 8-bit, 10-bit, 12-bit");
println!(" - 97% exact / 3% off-by-1 for 16-bit");
println!(" - Suitable for production color processing");
println!();
println!("For perfect precision, use std::f32 (scalar) at ~7-15x slower throughput.");
}
}
#[cfg(target_arch = "x86_64")]
fn main() {
x86_impl::main()
}