/// Fallback entry point, compiled only when the target architecture is not
/// x86_64. It does no SIMD work and just prints a notice.
#[cfg(not(target_arch = "x86_64"))]
fn main() {
    println!("This example requires x86_64 with AVX2");
}
/// Entry point for x86_64 builds: hands control straight to
/// `x86_impl::run`, which performs the runtime feature check and the demo.
#[cfg(target_arch = "x86_64")]
fn main() {
    x86_impl::run();
}
#[cfg(target_arch = "x86_64")]
mod x86_impl {
use archmage::{SimdToken, X64V3Token, arcane};
/// Multiplies the first three channels of each of two 4-channel pixels by
/// that pixel's alpha, leaving the alpha channel itself untouched.
///
/// `pixels` holds two pixels back to back; alpha is read from elements 3
/// and 7. The returned array has the same layout.
#[arcane(import_intrinsics)]
fn premultiply_2px(_token: X64V3Token, pixels: &[f32; 8]) -> [f32; 8] {
    // `_mm256_set_epi32` takes its arguments from lane 7 down to lane 0, so
    // lanes 0..=3 select element 3 and lanes 4..=7 select element 7.
    let alpha_lanes = _mm256_set_epi32(7, 7, 7, 7, 3, 3, 3, 3);
    // Sign bit set on lanes 3 and 7 only: those lanes take the unscaled input.
    let keep_alpha = _mm256_castsi256_ps(_mm256_set_epi32(-1, 0, 0, 0, -1, 0, 0, 0));

    let rgba = _mm256_loadu_ps(pixels);
    let broadcast_alpha = _mm256_permutevar8x32_ps(rgba, alpha_lanes);
    let scaled = _mm256_mul_ps(rgba, broadcast_alpha);
    // Without this blend the alpha lanes would hold alpha * alpha.
    let blended = _mm256_blendv_ps(scaled, rgba, keep_alpha);

    let mut premultiplied = [0.0f32; 8];
    _mm256_storeu_ps(&mut premultiplied, blended);
    premultiplied
}
/// Divides the first three channels of each of two 4-channel pixels by that
/// pixel's alpha, leaving the alpha channel itself untouched.
///
/// `pixels` holds two pixels back to back; alpha is read from elements 3
/// and 7. The divisor is clamped to a minimum of `1e-10`, so an alpha of
/// zero does not cause a division by zero.
#[arcane(import_intrinsics)]
fn unpremultiply_2px(_token: X64V3Token, pixels: &[f32; 8]) -> [f32; 8] {
    // `_mm256_set_epi32` takes its arguments from lane 7 down to lane 0, so
    // lanes 0..=3 select element 3 and lanes 4..=7 select element 7.
    let alpha_lanes = _mm256_set_epi32(7, 7, 7, 7, 3, 3, 3, 3);
    // Sign bit set on lanes 3 and 7 only: those lanes take the undivided input.
    let keep_alpha = _mm256_castsi256_ps(_mm256_set_epi32(-1, 0, 0, 0, -1, 0, 0, 0));
    let min_divisor = _mm256_set1_ps(1e-10);

    let rgba = _mm256_loadu_ps(pixels);
    let broadcast_alpha = _mm256_permutevar8x32_ps(rgba, alpha_lanes);
    let divisor = _mm256_max_ps(broadcast_alpha, min_divisor);
    let quotient = _mm256_div_ps(rgba, divisor);
    // Without this blend the alpha lanes would hold alpha / alpha.
    let blended = _mm256_blendv_ps(quotient, rgba, keep_alpha);

    let mut straight = [0.0f32; 8];
    _mm256_storeu_ps(&mut straight, blended);
    straight
}
/// Composites two source pixels over two destination pixels.
///
/// Every channel, alpha included, is computed as
/// `src + dst * (1 - src_alpha)`, where `src_alpha` is element 3 (first
/// pixel) or element 7 (second pixel) of `src`. The multiply and add are
/// done with a single fused multiply-add.
///
/// NOTE(review): this formula is the "over" operator for premultiplied
/// colour, so both inputs are presumably expected to be premultiplied;
/// the function itself does not check.
#[arcane(import_intrinsics)]
fn composite_over_2px(_token: X64V3Token, src: &[f32; 8], dst: &[f32; 8]) -> [f32; 8] {
    let source = _mm256_loadu_ps(src);
    let backdrop = _mm256_loadu_ps(dst);

    // `_mm256_set_epi32` takes its arguments from lane 7 down to lane 0, so
    // lanes 0..=3 select element 3 and lanes 4..=7 select element 7.
    let alpha_lanes = _mm256_set_epi32(7, 7, 7, 7, 3, 3, 3, 3);
    let source_alpha = _mm256_permutevar8x32_ps(source, alpha_lanes);
    let backdrop_weight = _mm256_sub_ps(_mm256_set1_ps(1.0), source_alpha);

    // backdrop_weight * backdrop + source
    let composited = _mm256_fmadd_ps(backdrop_weight, backdrop, source);

    let mut blended = [0.0f32; 8];
    _mm256_storeu_ps(&mut blended, composited);
    blended
}
/// Runs the demo on hard-coded sample pixels.
///
/// Tries to summon an `X64V3Token`; if that fails, prints a notice and
/// returns without doing any SIMD work. Otherwise it premultiplies two
/// pixels, reverses the operation, composites one pixel pair over another,
/// and prints each intermediate result to stdout.
pub fn run() {
    let token = match X64V3Token::summon() {
        Some(token) => token,
        None => {
            println!("AVX2+FMA not available");
            return;
        }
    };

    // Premultiply two sample pixels, then undo it.
    let pixels: [f32; 8] = [1.0, 0.0, 0.0, 0.5, 0.0, 1.0, 0.0, 0.8];
    println!("Input: {pixels:?}");

    let premul = premultiply_2px(token, &pixels);
    println!("Premultiplied: {premul:?}");

    let restored = unpremultiply_2px(token, &premul);
    println!("Restored: {restored:?}");

    // Composite a source with alpha 0.5 over a destination of all ones.
    let foreground: [f32; 8] = [0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.5, 0.5];
    let background: [f32; 8] = [1.0; 8];
    let over = composite_over_2px(token, &foreground, &background);
    println!("Over: {over:?}");
}
#[cfg(test)]
mod tests {
    use super::*;

    // Every test body runs only when `X64V3Token::summon()` yields a token;
    // otherwise the test finishes without asserting anything.

    /// The first three channels are scaled by alpha; the alpha channels
    /// (elements 3 and 7) come back unchanged.
    #[test]
    fn test_premultiply() {
        if let Some(token) = X64V3Token::summon() {
            let input: [f32; 8] = [1.0, 0.5, 0.0, 0.5, 0.0, 1.0, 0.0, 0.8];
            let expected: [f32; 8] = [0.5, 0.25, 0.0, 0.5, 0.0, 0.8, 0.0, 0.8];
            assert_eq!(premultiply_2px(token, &input), expected);
        }
    }

    /// Premultiplying and then unpremultiplying returns every element of
    /// the input to within `1e-5`.
    #[test]
    fn test_roundtrip() {
        if let Some(token) = X64V3Token::summon() {
            let original: [f32; 8] = [0.8, 0.4, 0.2, 0.6, 0.1, 0.9, 0.5, 1.0];
            let restored = unpremultiply_2px(token, &premultiply_2px(token, &original));
            for (a, b) in original.iter().zip(restored.iter()) {
                assert!((a - b).abs() < 1e-5, "{a} != {b}");
            }
        }
    }

    /// A source with alpha 1.0 fully replaces the destination.
    #[test]
    fn test_composite_over() {
        if let Some(token) = X64V3Token::summon() {
            let foreground: [f32; 8] = [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0];
            let background: [f32; 8] = [0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0];
            let expected: [f32; 8] = [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0];
            assert_eq!(composite_over_2px(token, &foreground, &background), expected);
        }
    }
}
}