#![cfg(target_arch = "x86_64")]
use archmage::{Desktop64, SimdToken, X64V3Token, arcane, rite};
use std::arch::x86_64::*;
// --- Token-taking `#[rite]` helpers and `#[arcane]` entry points -------------
// NOTE(review): call sites wrap `#[rite]` fns in `unsafe`, while `#[arcane]`
// fns are called safely after summoning a token — presumably the macros attach
// `#[target_feature]` and manage the safety boundary; confirm with archmage docs.

/// Lane-wise addition of two f32x8 arrays using AVX.
#[rite]
fn add_vectors(_token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
unsafe {
let va = _mm256_loadu_ps(a.as_ptr());
let vb = _mm256_loadu_ps(b.as_ptr());
let sum = _mm256_add_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(out.as_mut_ptr(), sum);
out
}
}
/// Lane-wise multiplication of two f32x8 arrays using AVX.
#[rite]
fn mul_vectors(_token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
unsafe {
let va = _mm256_loadu_ps(a.as_ptr());
let vb = _mm256_loadu_ps(b.as_ptr());
let prod = _mm256_mul_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(out.as_mut_ptr(), prod);
out
}
}
/// Horizontal sum of all 8 lanes of an AVX vector.
/// Two `hadd` passes reduce within each 128-bit half; the final scalar add
/// combines the halves. Note: no `unsafe` block here — presumably the `#[rite]`
/// expansion makes matching intrinsics callable safely inside the body.
#[rite]
fn horizontal_sum(_token: X64V3Token, v: __m256) -> f32 {
let sum = _mm256_hadd_ps(v, v);
let sum = _mm256_hadd_ps(sum, sum);
let low = _mm256_castps256_ps128(sum);
let high = _mm256_extractf128_ps::<1>(sum);
_mm_cvtss_f32(_mm_add_ss(low, high))
}
/// Dot product of two f32x8 arrays: lane-wise multiply, then horizontal sum.
/// `#[arcane]` entry point — calls `#[rite]` helpers without `unsafe` wrappers.
#[arcane]
fn dot_product(token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> f32 {
let products = mul_vectors(token, a, b);
unsafe {
let v = _mm256_loadu_ps(products.as_ptr());
horizontal_sum(token, v)
}
}
/// Computes sum(a * weight_a) + sum(b * weight_b): scale each input by its
/// splatted weight, add lane-wise, then horizontally sum.
#[arcane]
fn weighted_sum(
token: X64V3Token,
a: &[f32; 8],
b: &[f32; 8],
weight_a: f32,
weight_b: f32,
) -> f32 {
let scaled_a = {
let weights = [weight_a; 8];
mul_vectors(token, a, &weights)
};
let scaled_b = {
let weights = [weight_b; 8];
mul_vectors(token, b, &weights)
};
let sum = add_vectors(token, &scaled_a, &scaled_b);
unsafe {
let v = _mm256_loadu_ps(sum.as_ptr());
horizontal_sum(token, v)
}
}
// Calls a token-taking `#[rite]` fn directly; direct calls require `unsafe`.
#[test]
fn test_rite_basic() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [1.0f32; 8];
let sum = unsafe { add_vectors(token, &a, &b) };
assert_eq!(sum, [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
}
}
// `#[arcane]` entry is safe to call: dot([1..8], [2;8]) = 2 * 36 = 72.
#[test]
fn test_rite_from_arcane() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [2.0f32; 8];
let result = dot_product(token, &a, &b);
assert_eq!(result, 72.0);
}
}
// weighted_sum: 0.5*sum([1;8]) + 0.5*sum([2;8]) = 4 + 8 = 12.
#[test]
fn test_rite_complex() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32; 8];
let b = [2.0f32; 8];
let result = weighted_sum(token, &a, &b, 0.5, 0.5);
assert_eq!(result, 12.0);
}
}
// --- Tierless `#[rite(tier)]` variants: the tier is named in the attribute
// instead of being carried by a token parameter. ------------------------------

/// f32x8 add; v3 tier declared in the attribute, no token argument.
#[rite(v3)]
fn add_vectors_tierless(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
unsafe {
let va = _mm256_loadu_ps(a.as_ptr());
let vb = _mm256_loadu_ps(b.as_ptr());
let sum = _mm256_add_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(out.as_mut_ptr(), sum);
out
}
}
/// f32x8 multiply. With `import_intrinsics` the macro evidently supplies
/// intrinsic wrappers that accept `&[f32; 8]` directly (no `.as_ptr()`, no
/// `unsafe` block) — confirm against the archmage macro docs.
#[rite(v3, import_intrinsics)]
fn mul_vectors_tierless(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let prod = _mm256_mul_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, prod);
out
}
/// `#[arcane]` entry mixing a tierless rite (no token) with a token-based one.
#[arcane]
fn dot_product_tierless(token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> f32 {
let products = mul_vectors_tierless(a, b); unsafe {
let v = _mm256_loadu_ps(products.as_ptr());
horizontal_sum(token, v)
}
}
// Tierless rite called directly (still `unsafe`); token only gates the branch.
#[test]
fn test_rite_tier_basic() {
if X64V3Token::summon().is_some() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [1.0f32; 8];
let sum = unsafe { add_vectors_tierless(&a, &b) };
assert_eq!(sum, [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
}
}
// Exercises the `import_intrinsics` body (reference-taking intrinsic wrappers).
#[test]
fn test_rite_tier_import_intrinsics() {
if X64V3Token::summon().is_some() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [2.0f32; 8];
let products = unsafe { mul_vectors_tierless(&a, &b) };
assert_eq!(products, [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]);
}
}
// Arcane entry composing tierless and token-based rites: 2 * 36 = 72.
#[test]
fn test_rite_tier_from_arcane() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [2.0f32; 8];
let result = dot_product_tierless(token, &a, &b);
assert_eq!(result, 72.0);
}
}
#[rite(v3)]
fn negate_tierless(a: &[f32; 8]) -> [f32; 8] {
let zero = _mm256_setzero_ps();
unsafe {
let va = _mm256_loadu_ps(a.as_ptr());
let neg = _mm256_sub_ps(zero, va);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(out.as_mut_ptr(), neg);
out
}
}
#[rite(v2)]
fn popcount_tierless(val: i32) -> i32 {
core::arch::x86_64::_popcnt32(val)
}
#[test]
fn test_rite_tier_v2() {
if archmage::X64V2Token::summon().is_some() {
let result = unsafe { popcount_tierless(0b1010_1010) };
assert_eq!(result, 4);
}
}
#[test]
fn test_rite_tier_negate() {
if X64V3Token::summon().is_some() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let result = unsafe { negate_tierless(&a) };
assert_eq!(result, [-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0]);
}
}
#[rite]
fn scale_vector(_: X64V3Token, a: &[f32; 8], factor: f32) -> [f32; 8] {
unsafe {
let va = _mm256_loadu_ps(a.as_ptr());
let vf = _mm256_set1_ps(factor);
let result = _mm256_mul_ps(va, vf);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(out.as_mut_ptr(), result);
out
}
}
#[test]
fn test_rite_wildcard_token() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let result = unsafe { scale_vector(token, &a, 3.0) };
assert_eq!(result, [3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0]);
}
}
#[test]
fn test_rite_with_desktop64_alias() {
if let Some(token) = Desktop64::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [1.0f32; 8];
let sum = unsafe { add_vectors(token, &a, &b) };
assert_eq!(sum, [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
}
}
/// v1-tier (baseline SSE2) i32x4 addition; `.cast()` converts the `*const i32`
/// to the `*const __m128i` the load intrinsic expects.
#[rite(v1)]
fn add_i32x4_v1(a: &[i32; 4], b: &[i32; 4]) -> [i32; 4] {
unsafe {
let va = _mm_loadu_si128(a.as_ptr().cast());
let vb = _mm_loadu_si128(b.as_ptr().cast());
let sum = _mm_add_epi32(va, vb);
let mut out = [0i32; 4];
_mm_storeu_si128(out.as_mut_ptr().cast(), sum);
out
}
}
/// v1-tier f64x2 addition (SSE2 double-precision).
#[rite(v1)]
fn f64_add_v1(a: &[f64; 2], b: &[f64; 2]) -> [f64; 2] {
unsafe {
let va = _mm_loadu_pd(a.as_ptr());
let vb = _mm_loadu_pd(b.as_ptr());
let sum = _mm_add_pd(va, vb);
let mut out = [0.0f64; 2];
_mm_storeu_pd(out.as_mut_ptr(), sum);
out
}
}
#[test]
fn test_rite_tier_v1_i32_add() {
use archmage::X64V1Token;
if X64V1Token::summon().is_some() {
let a = [10i32, 20, 30, 40];
let b = [1, 2, 3, 4];
let result = unsafe { add_i32x4_v1(&a, &b) };
assert_eq!(result, [11, 22, 33, 44]);
}
}
#[test]
fn test_rite_tier_v1_f64_add() {
use archmage::X64V1Token;
if X64V1Token::summon().is_some() {
let a = [1.5f64, 2.5];
let b = [3.0, 4.0];
let result = unsafe { f64_add_v1(&a, &b) };
assert_eq!(result, [4.5, 6.5]);
}
}
/// v2-tier (SSE4.1) i16x8 blend: each set bit in the 0b1010_1010 control takes
/// that lane from `vb`, so odd lanes come from `b`, even lanes from `a`.
#[rite(v2)]
fn blend_i16_v2(a: &[i16; 8], b: &[i16; 8]) -> [i16; 8] {
unsafe {
let va = _mm_loadu_si128(a.as_ptr().cast());
let vb = _mm_loadu_si128(b.as_ptr().cast());
let result = _mm_blend_epi16::<0b1010_1010>(va, vb);
let mut out = [0i16; 8];
_mm_storeu_si128(out.as_mut_ptr().cast(), result);
out
}
}
/// One CRC32-C step over a single byte (SSE4.2, available at the v2 tier).
#[rite(v2, import_intrinsics)]
fn crc32_step_v2(crc: u32, data: u8) -> u32 {
_mm_crc32_u8(crc, data)
}
#[test]
fn test_rite_tier_v2_blend() {
use archmage::X64V2Token;
if X64V2Token::summon().is_some() {
let a = [1i16, 2, 3, 4, 5, 6, 7, 8];
let b = [10i16, 20, 30, 40, 50, 60, 70, 80];
let result = unsafe { blend_i16_v2(&a, &b) };
assert_eq!(result, [1, 20, 3, 40, 5, 60, 7, 80]);
}
}
// Only checks the CRC step is non-trivial and deterministic, not a known value.
#[test]
fn test_rite_tier_v2_crc32_import_intrinsics() {
use archmage::X64V2Token;
if X64V2Token::summon().is_some() {
let result = unsafe { crc32_step_v2(0, 1) };
assert_ne!(result, 0); let result2 = unsafe { crc32_step_v2(0, 1) };
assert_eq!(result, result2);
}
}
/// Fused multiply-add a*b + c per lane (FMA3, v3 tier).
#[rite(v3)]
fn fma_f32x8(a: &[f32; 8], b: &[f32; 8], c: &[f32; 8]) -> [f32; 8] {
unsafe {
let va = _mm256_loadu_ps(a.as_ptr());
let vb = _mm256_loadu_ps(b.as_ptr());
let vc = _mm256_loadu_ps(c.as_ptr());
let result = _mm256_fmadd_ps(va, vb, vc); let mut out = [0.0f32; 8];
_mm256_storeu_ps(out.as_mut_ptr(), result);
out
}
}
#[test]
fn test_rite_tier_v3_fma() {
if X64V3Token::summon().is_some() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [2.0f32; 8];
let c = [10.0f32; 8];
let result = unsafe { fma_f32x8(&a, &b, &c) };
assert_eq!(result, [12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0]);
}
}
/// Absolute value: andnot with the sign-bit mask (-0.0) clears each lane's
/// sign bit.
#[rite(v3, import_intrinsics)]
fn abs_f32x8_all_options(a: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let sign_mask = _mm256_set1_ps(-0.0);
let abs = _mm256_andnot_ps(sign_mask, va);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, abs);
out
}
#[test]
fn test_rite_tier_import_intrinsics_combo() {
if X64V3Token::summon().is_some() {
let a = [-1.0f32, 2.0, -3.0, 4.0, -5.0, 6.0, -7.0, 8.0];
let result = unsafe { abs_f32x8_all_options(&a) };
assert_eq!(result, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}
}
/// Horizontal sum of all 8 lanes (same hadd/hadd/split-add reduction as
/// `horizontal_sum`, but tierless and taking the array directly).
#[rite(v3, import_intrinsics)]
fn sum_f32x8_tierless(data: &[f32; 8]) -> f32 {
let v = _mm256_loadu_ps(data);
let sum = _mm256_hadd_ps(v, v);
let sum = _mm256_hadd_ps(sum, sum);
let low = _mm256_castps256_ps128(sum);
let high = _mm256_extractf128_ps::<1>(sum);
_mm_cvtss_f32(_mm_add_ss(low, high))
}
/// Multiplies every lane by the splatted `factor`.
#[rite(v3, import_intrinsics)]
fn scale_f32x8_tierless(data: &[f32; 8], factor: f32) -> [f32; 8] {
let v = _mm256_loadu_ps(data);
let f = _mm256_set1_ps(factor);
let result = _mm256_mul_ps(v, f);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
/// Normalizes the array so its lanes sum to 1; returns the input unchanged
/// when the total is exactly 0 to avoid dividing by zero.
#[arcane(import_intrinsics)]
fn normalize_f32x8(_token: X64V3Token, data: &[f32; 8]) -> [f32; 8] {
let total = sum_f32x8_tierless(data);
if total == 0.0 {
return *data;
}
let inv = 1.0 / total;
scale_f32x8_tierless(data, inv)
}
// Sum of 1..=8 is 36, so the normalized value of lane i is (i+1)/36.
// Approximate comparison because the division runs through SIMD floats.
#[test]
fn test_rite_tier_called_from_arcane_normalize() {
if let Some(token) = X64V3Token::summon() {
let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let result = normalize_f32x8(token, &data);
let expected: [f32; 8] = core::array::from_fn(|i| (i as f32 + 1.0) / 36.0);
for (r, e) in result.iter().zip(expected.iter()) {
assert!((r - e).abs() < 1e-6, "got {r}, expected {e}");
}
}
}
/// Squares every lane.
#[rite(v3, import_intrinsics)]
fn square_f32x8(data: &[f32; 8]) -> [f32; 8] {
let v = _mm256_loadu_ps(data);
let sq = _mm256_mul_ps(v, v);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, sq);
out
}
/// Sum of squares — a tierless rite calling two other tierless rites.
#[rite(v3, import_intrinsics)]
fn sum_of_squares_tierless(data: &[f32; 8]) -> f32 {
let squared = square_f32x8(data);
sum_f32x8_tierless(&squared)
}
/// Thin arcane entry over `sum_of_squares_tierless`.
#[arcane]
fn l2_norm_squared(_token: X64V3Token, data: &[f32; 8]) -> f32 {
sum_of_squares_tierless(data)
}
// 1 + 4 + 9 + 16 = 30.
#[test]
fn test_rite_tier_calling_tier_rite() {
if let Some(token) = X64V3Token::summon() {
let data = [1.0f32, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0];
let result = l2_norm_squared(token, &data);
assert_eq!(result, 30.0);
}
}
/// Token-based rite calling tierless rites — mixes both flavors in one body.
#[rite]
fn mixed_caller_token_based(_token: X64V3Token, data: &[f32; 8]) -> f32 {
let scaled = scale_f32x8_tierless(data, 2.0);
sum_f32x8_tierless(&scaled)
}
// [1;8] scaled by 2 sums to 16.
#[test]
fn test_mixed_token_rite_calls_tier_rite() {
if let Some(token) = X64V3Token::summon() {
let data = [1.0f32; 8];
let result = unsafe { mixed_caller_token_based(token, &data) };
assert_eq!(result, 16.0);
}
}
/// Arcane entry calling both a tierless rite and token-based rites.
/// `_mm256_loadu_ps(&products)` takes a reference — the `import_intrinsics`
/// wrappers evidently accept `&[f32; 8]` rather than raw pointers.
#[arcane(import_intrinsics)]
fn compose_mixed_flavors(token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> f32 {
let sum_a = sum_f32x8_tierless(a);
let products = mul_vectors(token, a, b);
let v = _mm256_loadu_ps(&products);
let sum_prod = horizontal_sum(token, v);
sum_a + sum_prod
}
// sum(a) = 36 and dot(a, [1;8]) = 36, so the total is 72.
#[test]
fn test_mixed_arcane_calls_both_rite_flavors() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [1.0f32; 8];
let result = compose_mixed_flavors(token, &a, &b);
assert_eq!(result, 72.0);
}
}
// Exercises `#[rite]` on inherent methods with `&self` receivers.
struct SimdProcessor {
// Multiplier applied to each lane.
scale: f32,
// Addend applied after scaling.
offset: f32,
}
impl SimdProcessor {
/// data * scale + offset per lane, via FMA.
#[rite(v3, import_intrinsics)]
fn process_chunk(&self, data: &[f32; 8]) -> [f32; 8] {
let v = _mm256_loadu_ps(data);
let scale = _mm256_set1_ps(self.scale);
let offset = _mm256_set1_ps(self.offset);
let result = _mm256_fmadd_ps(v, scale, offset); let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
/// Horizontal sum of all 8 lanes (hadd/hadd then add the two halves).
/// `self` is unused by the computation; this exercises the receiver handling.
#[rite(v3, import_intrinsics)]
fn reduce_sum(&self, data: &[f32; 8]) -> f32 {
let v = _mm256_loadu_ps(data);
let sum = _mm256_hadd_ps(v, v);
let sum = _mm256_hadd_ps(sum, sum);
let low = _mm256_castps256_ps128(sum);
let high = _mm256_extractf128_ps::<1>(sum);
_mm_cvtss_f32(_mm_add_ss(low, high))
}
}
// x * 2 + 10 for x in 1..=8.
#[test]
fn test_rite_tier_self_receiver() {
if X64V3Token::summon().is_some() {
let processor = SimdProcessor {
scale: 2.0,
offset: 10.0,
};
let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let result = unsafe { processor.process_chunk(&data) };
assert_eq!(result, [12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0]);
}
}
// Sum of 1..=8 is 36.
#[test]
fn test_rite_tier_self_reduce() {
if X64V3Token::summon().is_some() {
let processor = SimdProcessor {
scale: 1.0,
offset: 0.0,
};
let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let result = unsafe { processor.reduce_sum(&data) };
assert_eq!(result, 36.0);
}
}
/// Returns (min, max) over all 8 lanes. Reduction: fold the two 128-bit halves
/// together, then 4→2 lanes via `movehl`, then 2→1 via a shuffle that pulls
/// lane 1 into lane 0 (control 0x01).
#[rite(v3, import_intrinsics)]
fn minmax_f32x8(data: &[f32; 8]) -> (f32, f32) {
let v = _mm256_loadu_ps(data);
let hi128 = _mm256_extractf128_ps::<1>(v);
let lo128 = _mm256_castps256_ps128(v);
let min128 = _mm_min_ps(lo128, hi128);
let min64 = _mm_min_ps(min128, _mm_movehl_ps(min128, min128));
let min32 = _mm_min_ps(min64, _mm_shuffle_ps::<0x01>(min64, min64));
let min_val = _mm_cvtss_f32(min32);
let max128 = _mm_max_ps(lo128, hi128);
let max64 = _mm_max_ps(max128, _mm_movehl_ps(max128, max128));
let max32 = _mm_max_ps(max64, _mm_shuffle_ps::<0x01>(max64, max64));
let max_val = _mm_cvtss_f32(max32);
(min_val, max_val)
}
#[test]
fn test_rite_tier_tuple_return() {
if X64V3Token::summon().is_some() {
let data = [5.0f32, -3.0, 8.0, 1.0, -7.0, 4.0, 2.0, 6.0];
let (min, max) = unsafe { minmax_f32x8(&data) };
assert_eq!(min, -7.0);
assert_eq!(max, 8.0);
}
}
/// scale * sum((a + offset_a) * (b + offset_b)) — exercises a rite with five
/// parameters.
#[rite(v3, import_intrinsics)]
fn dot_with_offset(a: &[f32; 8], b: &[f32; 8], offset_a: f32, offset_b: f32, scale: f32) -> f32 {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let off_a = _mm256_set1_ps(offset_a);
let off_b = _mm256_set1_ps(offset_b);
let adjusted_a = _mm256_add_ps(va, off_a);
let adjusted_b = _mm256_add_ps(vb, off_b);
let products = _mm256_mul_ps(adjusted_a, adjusted_b);
let scaled = _mm256_mul_ps(products, _mm256_set1_ps(scale));
let sum = _mm256_hadd_ps(scaled, scaled);
let sum = _mm256_hadd_ps(sum, sum);
let low = _mm256_castps256_ps128(sum);
let high = _mm256_extractf128_ps::<1>(sum);
_mm_cvtss_f32(_mm_add_ss(low, high))
}
// (1+2)*(1+3)*0.5 = 6 per lane, times 8 lanes = 48.
#[test]
fn test_rite_tier_many_args() {
if X64V3Token::summon().is_some() {
let a = [1.0f32; 8];
let b = [1.0f32; 8];
let result = unsafe { dot_with_offset(&a, &b, 2.0, 3.0, 0.5) };
assert_eq!(result, 48.0);
}
}
/// Sums the first N lanes (N clamped to 8) — exercises a const-generic rite.
/// The mask is built from the all-ones bit pattern (a NaN when viewed as f32);
/// AND-ing keeps the selected lanes bit-exact and zeroes the rest.
#[rite(v3, import_intrinsics)]
fn sum_first_n<const N: usize>(data: &[f32; 8]) -> f32 {
let v = _mm256_loadu_ps(data);
let mut mask_arr = [0.0f32; 8];
let limit = if N > 8 { 8 } else { N };
for slot in mask_arr.iter_mut().take(limit) {
*slot = f32::from_bits(0xFFFF_FFFF);
}
let mask = _mm256_loadu_ps(&mask_arr);
let masked = _mm256_and_ps(v, mask);
let sum = _mm256_hadd_ps(masked, masked);
let sum = _mm256_hadd_ps(sum, sum);
let low = _mm256_castps256_ps128(sum);
let high = _mm256_extractf128_ps::<1>(sum);
_mm_cvtss_f32(_mm_add_ss(low, high))
}
#[test]
fn test_rite_tier_const_generic() {
if X64V3Token::summon().is_some() {
let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let sum4 = unsafe { sum_first_n::<4>(&data) };
assert_eq!(sum4, 10.0);
let sum2 = unsafe { sum_first_n::<2>(&data) };
assert_eq!(sum2, 3.0);
let sum8 = unsafe { sum_first_n::<8>(&data) };
assert_eq!(sum8, 36.0); }
}
/// Clamps each lane into [lo, hi]: max against lo first, then min against hi.
#[rite(v3, import_intrinsics)]
fn clamp_f32x8(data: &[f32; 8], lo: f32, hi: f32) -> [f32; 8] {
let v = _mm256_loadu_ps(data);
let vlo = _mm256_set1_ps(lo);
let vhi = _mm256_set1_ps(hi);
let clamped = _mm256_min_ps(_mm256_max_ps(v, vlo), vhi);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, clamped);
out
}
/// Lane-wise a - b.
#[rite(v3, import_intrinsics)]
fn subtract_f32x8(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let result = _mm256_sub_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
/// Clamps `data` into [0, 10], then returns the sum of squared deviations from
/// the mean of the clamped lanes (i.e. 8 × the population variance).
///
/// Composes four tierless `#[rite(v3)]` helpers from one `#[arcane]` entry.
#[arcane]
fn pipeline_norm(_token: X64V3Token, data: &[f32; 8]) -> f32 {
    let clamped = clamp_f32x8(data, 0.0, 10.0);
    let total = sum_f32x8_tierless(&clamped);
    let mean = total / 8.0;
    let mean_arr = [mean; 8];
    let centered = subtract_f32x8(&clamped, &mean_arr);
    // Bug fix: this argument was mojibake — `&c` had been corrupted into `¢`
    // (U+00A2), producing the non-compiling token `¢ered`. Restored `&centered`.
    let sq = square_f32x8(&centered);
    sum_f32x8_tierless(&sq)
}
// Mean of [2,4,6,8,...] is 5; deviations ±3,±1 give 2*(9+1+1+9) = 40.
#[test]
fn test_rite_tier_pipeline_composition() {
if let Some(token) = X64V3Token::summon() {
let data = [2.0f32, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0];
let result = pipeline_norm(token, &data);
assert_eq!(result, 40.0);
}
}
/// Reverses a f32x4: shuffle control 0b00_01_10_11 selects lanes 3,2,1,0.
#[rite(v1, import_intrinsics)]
fn shuffle_f32x4_v1(data: &[f32; 4]) -> [f32; 4] {
let v = _mm_loadu_ps(data);
let reversed = _mm_shuffle_ps::<0b00_01_10_11>(v, v);
let mut out = [0.0f32; 4];
_mm_storeu_ps(&mut out, reversed);
out
}
#[test]
fn test_rite_tier_v1_shuffle_always_available() {
use archmage::X64V1Token;
if X64V1Token::summon().is_some() {
let data = [1.0f32, 2.0, 3.0, 4.0];
let result = unsafe { shuffle_f32x4_v1(&data) };
assert_eq!(result, [4.0, 3.0, 2.0, 1.0]);
}
}
/// Rounds each lane to nearest integer (SSE4.1 `roundps`, v2 tier),
/// nearest-even mode with exceptions suppressed.
#[rite(v2, import_intrinsics)]
fn round_f32x4_v2(data: &[f32; 4]) -> [f32; 4] {
let v = _mm_loadu_ps(data);
let rounded = _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(v);
let mut out = [0.0f32; 4];
_mm_storeu_ps(&mut out, rounded);
out
}
// Nearest-even: -0.5 -> -0.0 (== 0.0) and 3.5 -> 4.0.
#[test]
fn test_rite_tier_v2_rounding() {
use archmage::X64V2Token;
if X64V2Token::summon().is_some() {
let data = [1.3f32, 2.7, -0.5, 3.5];
let result = unsafe { round_f32x4_v2(&data) };
assert_eq!(result, [1.0, 3.0, 0.0, 4.0]);
}
}
/// Evaluates a*x^2 + b*x + c per lane via Horner's rule with two FMAs.
#[rite(v3, import_intrinsics)]
fn poly_eval_v3(x: &[f32; 8], a: f32, b: f32, c: f32) -> [f32; 8] {
let vx = _mm256_loadu_ps(x);
let va = _mm256_set1_ps(a);
let vb = _mm256_set1_ps(b);
let vc = _mm256_set1_ps(c);
let ax_plus_b = _mm256_fmadd_ps(va, vx, vb);
let result = _mm256_fmadd_ps(ax_plus_b, vx, vc);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
// 2x^2 + 3x + 1 at each lane, compared approximately (FMA rounding differs
// from the two-step reference computation).
#[test]
fn test_rite_tier_v3_polynomial_fma() {
if X64V3Token::summon().is_some() {
let x = [0.0f32, 1.0, 2.0, 3.0, -1.0, -2.0, 0.5, 10.0];
let result = unsafe { poly_eval_v3(&x, 2.0, 3.0, 1.0) };
let expected = [1.0, 6.0, 15.0, 28.0, 0.0, 3.0, 3.0, 231.0];
for (r, e) in result.iter().zip(expected.iter()) {
assert!((r - e).abs() < 1e-4, "poly_eval: got {r}, expected {e}");
}
}
}
// Exercises `#[rite]` methods with `&mut self` receivers.
struct Accumulator {
// Running lane-wise totals.
buffer: [f32; 8],
}
impl Accumulator {
fn new() -> Self {
Self { buffer: [0.0; 8] }
}
/// Adds `data` lane-wise into the internal buffer (mutates self).
#[rite(v3, import_intrinsics)]
fn accumulate(&mut self, data: &[f32; 8]) {
let current = _mm256_loadu_ps(&self.buffer);
let incoming = _mm256_loadu_ps(data);
let sum = _mm256_add_ps(current, incoming);
_mm256_storeu_ps(&mut self.buffer, sum);
}
/// Horizontal sum of the accumulated buffer.
#[rite(v3, import_intrinsics)]
fn result(&self) -> f32 {
let v = _mm256_loadu_ps(&self.buffer);
let sum = _mm256_hadd_ps(v, v);
let sum = _mm256_hadd_ps(sum, sum);
let low = _mm256_castps256_ps128(sum);
let high = _mm256_extractf128_ps::<1>(sum);
_mm_cvtss_f32(_mm_add_ss(low, high))
}
}
// (1 + 2 + 3) per lane, times 8 lanes = 48.
#[test]
fn test_rite_tier_mut_self_accumulator() {
if X64V3Token::summon().is_some() {
let mut acc = Accumulator::new();
let batch1 = [1.0f32; 8];
let batch2 = [2.0f32; 8];
let batch3 = [3.0f32; 8];
unsafe {
acc.accumulate(&batch1);
acc.accumulate(&batch2);
acc.accumulate(&batch3);
}
let total = unsafe { acc.result() };
assert_eq!(total, 48.0);
}
}
/// u32 popcount via the i32 POPCNT intrinsic (v2 tier).
#[rite(v2)]
fn popcnt_u32_v2(val: u32) -> u32 {
core::arch::x86_64::_popcnt32(val as i32) as u32
}
/// Counts lanes != 0.0 by comparing against zero, extracting the sign-bit
/// mask, and popcounting it. An `#[arcane]` v3 entry calling a v2 rite —
/// presumably valid because v3 implies v2; confirm with the tier docs.
#[arcane(import_intrinsics)]
fn count_nonzero_lanes(_token: X64V3Token, data: &[f32; 8]) -> u32 {
let v = _mm256_loadu_ps(data);
let zero = _mm256_setzero_ps();
let cmp = _mm256_cmp_ps::<_CMP_NEQ_OQ>(v, zero);
let mask = _mm256_movemask_ps(cmp) as u32;
popcnt_u32_v2(mask)
}
// Five of the eight lanes are nonzero.
#[test]
fn test_rite_two_tiers_from_arcane() {
if let Some(token) = X64V3Token::summon() {
let data = [1.0f32, 0.0, 3.0, 0.0, 5.0, 6.0, 0.0, 8.0];
let count = count_nonzero_lanes(token, &data);
assert_eq!(count, 5); }
}
/// True iff every lane > 0: the 8-bit movemask must be all ones (0xFF).
#[rite(v3, import_intrinsics)]
fn all_positive(data: &[f32; 8]) -> bool {
let v = _mm256_loadu_ps(data);
let zero = _mm256_setzero_ps();
let cmp = _mm256_cmp_ps::<_CMP_GT_OQ>(v, zero);
let mask = _mm256_movemask_ps(cmp);
mask == 0xFF }
/// True iff any lane < 0: any set bit in the movemask.
#[rite(v3, import_intrinsics)]
fn any_negative(data: &[f32; 8]) -> bool {
let v = _mm256_loadu_ps(data);
let zero = _mm256_setzero_ps();
let cmp = _mm256_cmp_ps::<_CMP_LT_OQ>(v, zero);
let mask = _mm256_movemask_ps(cmp);
mask != 0 }
#[test]
fn test_rite_tier_bool_return() {
if X64V3Token::summon().is_some() {
let all_pos = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
assert!(unsafe { all_positive(&all_pos) });
assert!(!unsafe { any_negative(&all_pos) });
let has_neg = [1.0f32, -2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
assert!(!unsafe { all_positive(&has_neg) });
assert!(unsafe { any_negative(&has_neg) });
let all_neg = [-1.0f32; 8];
assert!(!unsafe { all_positive(&all_neg) });
assert!(unsafe { any_negative(&all_neg) });
}
}
/// Swaps the two f64 lanes (shuffle control 0b01 picks lane 1 then lane 0).
#[rite(v1, import_intrinsics)]
fn reverse_pairs_v1(data: &[f64; 2]) -> [f64; 2] {
let v = _mm_loadu_pd(data);
let swapped = _mm_shuffle_pd::<0b01>(v, v);
let mut out = [0.0f64; 2];
_mm_storeu_pd(&mut out, swapped);
out
}
#[test]
fn test_rite_tier_v1_import_intrinsics() {
use archmage::X64V1Token;
if X64V1Token::summon().is_some() {
let data = [42.0f64, 99.0];
let result = unsafe { reverse_pairs_v1(&data) };
assert_eq!(result, [99.0, 42.0]);
}
}
/// Interleaves low lanes of each 128-bit half: [a0,b0,a1,b1 | a4,b4,a5,b5].
#[rite(v3, import_intrinsics)]
fn interleave_low_f32x8(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let result = _mm256_unpacklo_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
/// Interleaves high lanes of each 128-bit half: [a2,b2,a3,b3 | a6,b6,a7,b7].
#[rite(v3, import_intrinsics)]
fn interleave_high_f32x8(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let result = _mm256_unpackhi_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
/// Arcane entry chaining three tierless rites.
#[arcane]
fn interleave_and_sum(_token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> (f32, f32) {
let lo = interleave_low_f32x8(a, b);
let hi = interleave_high_f32x8(a, b);
let sum_lo = sum_f32x8_tierless(&lo);
let sum_hi = sum_f32x8_tierless(&hi);
(sum_lo, sum_hi)
}
// lo = [1,10,2,20,5,50,6,60] -> 154; hi = [3,30,4,40,7,70,8,80] -> 242.
#[test]
fn test_rite_tier_chain_three_helpers() {
if let Some(token) = X64V3Token::summon() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [10.0f32, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0];
let (sum_lo, sum_hi) = interleave_and_sum(token, &a, &b);
assert_eq!(sum_lo, 154.0);
assert_eq!(sum_hi, 242.0);
}
}
/// Bitmask of lanes >= threshold — exercises an explicit lifetime parameter.
#[rite(v3, import_intrinsics)]
fn load_and_compare<'a>(data: &'a [f32; 8], threshold: f32) -> i32 {
let v = _mm256_loadu_ps(data);
let t = _mm256_set1_ps(threshold);
let cmp = _mm256_cmp_ps::<_CMP_GE_OQ>(v, t);
_mm256_movemask_ps(cmp)
}
// Lanes 1, 3, 5, 7 are >= 5.0.
#[test]
fn test_rite_tier_lifetime_generic() {
if X64V3Token::summon().is_some() {
let data = [1.0f32, 5.0, 3.0, 7.0, 2.0, 6.0, 4.0, 8.0];
let mask = unsafe { load_and_compare(&data, 5.0) };
assert_eq!(mask, 0b1010_1010_u32 as i32);
}
}
// --- Multi-tier rites: listing several tiers in the attribute evidently
// generates one suffixed variant per tier (`sum_array_v1`, `sum_array_v3`,
// ...), as the call sites below show. -----------------------------------------

/// Scalar sum of four lanes, compiled once per listed tier.
#[rite(v1, v3)]
fn sum_array(data: &[f32; 4]) -> f32 {
data[0] + data[1] + data[2] + data[3]
}
// The v1 variant needs no runtime check; v3 is gated on summoning its token.
#[test]
fn test_rite_multi_tier_v1_v3() {
let data = [1.0f32, 2.0, 3.0, 4.0];
let r1 = unsafe { sum_array_v1(&data) };
assert_eq!(r1, 10.0);
if X64V3Token::summon().is_some() {
let r3 = unsafe { sum_array_v3(&data) };
assert_eq!(r3, 10.0);
}
}
/// f32x4 horizontal sum via two SSE3 hadds; multi-tier with import_intrinsics.
#[rite(v2, v3, import_intrinsics)]
fn multi_tier_loadu(data: &[f32; 4]) -> f32 {
let v = _mm_loadu_ps(data);
let sum = _mm_hadd_ps(v, v);
let sum = _mm_hadd_ps(sum, sum);
_mm_cvtss_f32(sum)
}
#[test]
fn test_rite_multi_tier_import_intrinsics() {
use archmage::X64V2Token;
let data = [1.0f32, 2.0, 3.0, 4.0];
if X64V2Token::summon().is_some() {
let r2 = unsafe { multi_tier_loadu_v2(&data) };
assert_eq!(r2, 10.0);
}
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_loadu_v3(&data) };
assert_eq!(r3, 10.0);
}
}
/// Scalar lane-wise scale, generated for v2 and v3.
#[rite(v2, v3)]
fn inner_scale(data: &[f32; 4], factor: f32) -> [f32; 4] {
[
data[0] * factor,
data[1] * factor,
data[2] * factor,
data[3] * factor,
]
}
/// Arcane entry picking the v3 variant of a multi-tier rite explicitly.
#[arcane]
fn process_with_multi_tier(_token: X64V3Token, data: &[f32; 4]) -> [f32; 4] {
inner_scale_v3(data, 2.0)
}
#[test]
fn test_rite_multi_tier_called_from_arcane() {
if let Some(token) = X64V3Token::summon() {
let data = [1.0f32, 2.0, 3.0, 4.0];
let result = process_with_multi_tier(token, &data);
assert_eq!(result, [2.0, 4.0, 6.0, 8.0]);
}
}
// Multi-tier rites spanning v3/v4 — only built with the `avx512` feature,
// since the v4 token type exists only behind it.
#[cfg(feature = "avx512")]
mod multi_tier_avx512 {
use archmage::{SimdToken, X64V3Token, X64V4Token, rite};
/// Trivial scalar body, generated for both v3 and v4.
#[rite(v3, v4)]
fn add_pair(a: f32, b: f32) -> f32 {
a + b
}
#[test]
fn test_multi_tier_v3_v4() {
if X64V3Token::summon().is_some() {
let r3 = unsafe { add_pair_v3(3.0, 4.0) };
assert_eq!(r3, 7.0);
}
if X64V4Token::summon().is_some() {
let r4 = unsafe { add_pair_v4(3.0, 4.0) };
assert_eq!(r4, 7.0);
}
}
/// f32x4 horizontal sum; intrinsics come into scope via `import_intrinsics`
/// (this module imports no `std::arch` names itself).
#[rite(v3, v4, import_intrinsics)]
fn multi_tier_intrinsics_v3_v4(data: &[f32; 4]) -> f32 {
let v = _mm_loadu_ps(data);
let sum = _mm_hadd_ps(v, v);
let sum = _mm_hadd_ps(sum, sum);
_mm_cvtss_f32(sum)
}
#[test]
fn test_multi_tier_intrinsics_v3_v4() {
let data = [10.0f32, 20.0, 30.0, 40.0];
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_intrinsics_v3_v4_v3(&data) };
assert_eq!(r3, 100.0);
}
if X64V4Token::summon().is_some() {
let r4 = unsafe { multi_tier_intrinsics_v3_v4_v4(&data) };
assert_eq!(r4, 100.0);
}
}
}
// Verifies the generated variants keep the original item's visibility
// (`pub` / `pub(crate)`) and are reachable across module boundaries.
mod multi_tier_visibility {
use archmage::rite;
#[rite(v1, v3)]
pub fn public_multi(x: f32) -> f32 {
x * 2.0
}
#[rite(v1, v3)]
pub(crate) fn crate_multi(x: f32) -> f32 {
x * 3.0
}
}
#[test]
fn test_multi_tier_visibility() {
let r1 = unsafe { multi_tier_visibility::public_multi_v1(5.0) };
assert_eq!(r1, 10.0);
let r2 = unsafe { multi_tier_visibility::crate_multi_v1(5.0) };
assert_eq!(r2, 15.0);
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_visibility::public_multi_v3(5.0) };
assert_eq!(r3, 10.0);
}
}
/// Multi-tier rite returning a tuple.
#[rite(v1, v3)]
fn multi_tier_returns_tuple(a: f32, b: f32) -> (f32, f32) {
(a + b, a * b)
}
/// Multi-tier rite returning a bool.
#[rite(v1, v3)]
fn multi_tier_returns_bool(x: f32) -> bool {
x > 0.0
}
#[test]
fn test_multi_tier_return_types() {
let (sum, prod) = unsafe { multi_tier_returns_tuple_v1(3.0, 4.0) };
assert_eq!(sum, 7.0);
assert_eq!(prod, 12.0);
assert!(unsafe { multi_tier_returns_bool_v1(1.0) });
assert!(!unsafe { multi_tier_returns_bool_v1(-1.0) });
}
/// Same scalar dot product generated for three tiers at once.
#[rite(v1, v2, v3)]
fn three_tier_dot(a: &[f32; 4], b: &[f32; 4]) -> f32 {
a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]
}
// All available variants must agree on 5+12+21+32 = 70.
#[test]
fn test_rite_multi_tier_three_tiers_agree() {
let a = [1.0f32, 2.0, 3.0, 4.0];
let b = [5.0f32, 6.0, 7.0, 8.0];
let expected = 70.0;
let r1 = unsafe { three_tier_dot_v1(&a, &b) };
assert_eq!(r1, expected);
if archmage::X64V2Token::summon().is_some() {
let r2 = unsafe { three_tier_dot_v2(&a, &b) };
assert_eq!(r2, expected);
}
if X64V3Token::summon().is_some() {
let r3 = unsafe { three_tier_dot_v3(&a, &b) };
assert_eq!(r3, expected);
}
}
/// Multi-tier rite with a type parameter and trait bound.
#[rite(v1, v3)]
fn multi_tier_generic_add<T: core::ops::Add<Output = T> + Copy>(a: T, b: T) -> T {
a + b
}
#[test]
fn test_rite_multi_tier_generic() {
let r1 = unsafe { multi_tier_generic_add_v1(10i32, 20i32) };
assert_eq!(r1, 30);
let r1f = unsafe { multi_tier_generic_add_v1(1.5f64, 2.5f64) };
assert_eq!(r1f, 4.0);
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_generic_add_v3(10i32, 20i32) };
assert_eq!(r3, 30);
}
}
/// Multi-tier rite with a const generic and a (trivially true) `where` clause
/// — exercises that the macro carries the clause through to the variants.
#[rite(v1, v3)]
fn multi_tier_const_generic<const N: usize>(data: &[f32; N]) -> f32
where
[(); N]:,
{
let mut sum = 0.0f32;
for x in data {
sum += x;
}
sum
}
#[test]
fn test_rite_multi_tier_const_generic() {
let data3 = [1.0f32, 2.0, 3.0];
let data5 = [1.0f32, 2.0, 3.0, 4.0, 5.0];
let r1_3 = unsafe { multi_tier_const_generic_v1(&data3) };
assert_eq!(r1_3, 6.0);
let r1_5 = unsafe { multi_tier_const_generic_v1(&data5) };
assert_eq!(r1_5, 15.0);
if X64V3Token::summon().is_some() {
let r3_3 = unsafe { multi_tier_const_generic_v3(&data3) };
assert_eq!(r3_3, 6.0);
}
}
/// Multi-tier rite with an explicit lifetime tying the return to the input.
#[rite(v1, v3)]
fn multi_tier_lifetime<'a>(data: &'a [f32]) -> &'a f32 {
&data[0]
}
#[test]
fn test_rite_multi_tier_lifetime() {
let data = [42.0f32, 1.0, 2.0];
let r1 = unsafe { multi_tier_lifetime_v1(&data) };
assert_eq!(*r1, 42.0);
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_lifetime_v3(&data) };
assert_eq!(*r3, 42.0);
}
}
/// Verifies extra attributes below `#[rite]` survive macro expansion
/// (the `#[allow]` must suppress the unused-variable lint in every variant).
#[rite(v1, v3)]
#[allow(unused_variables)]
fn multi_tier_with_attrs(x: f32, y: f32) -> f32 {
let unused = x * 2.0; y
}
#[test]
fn test_rite_multi_tier_preserves_attrs() {
let r1 = unsafe { multi_tier_with_attrs_v1(1.0, 2.0) };
assert_eq!(r1, 2.0);
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_with_attrs_v3(1.0, 2.0) };
assert_eq!(r3, 2.0);
}
}
/// In-place scaling through a `&mut` parameter; unit return.
#[rite(v1, v3)]
fn multi_tier_mutate(data: &mut [f32; 4], factor: f32) {
data[0] *= factor;
data[1] *= factor;
data[2] *= factor;
data[3] *= factor;
}
#[test]
fn test_rite_multi_tier_mutate() {
let mut data = [1.0f32, 2.0, 3.0, 4.0];
unsafe { multi_tier_mutate_v1(&mut data, 3.0) };
assert_eq!(data, [3.0, 6.0, 9.0, 12.0]);
if X64V3Token::summon().is_some() {
let mut data2 = [1.0f32, 2.0, 3.0, 4.0];
unsafe { multi_tier_mutate_v3(&mut data2, 3.0) };
assert_eq!(data2, [3.0, 6.0, 9.0, 12.0]);
}
}
/// Small multi-tier helper used by the chain test below.
#[rite(v1, v3)]
fn multi_tier_helper_square(x: f32) -> f32 {
x * x
}
/// Single-tier rite calling the generated v3 variant of a multi-tier rite.
#[rite(v3)]
fn caller_uses_v3_variant(data: &[f32; 4]) -> f32 {
multi_tier_helper_square_v3(data[0])
+ multi_tier_helper_square_v3(data[1])
+ multi_tier_helper_square_v3(data[2])
+ multi_tier_helper_square_v3(data[3])
}
/// Arcane entry for the rite-calls-rite chain.
#[arcane]
fn entry_for_multi_tier_chain(_token: X64V3Token, data: &[f32; 4]) -> f32 {
caller_uses_v3_variant(data)
}
// 1 + 4 + 9 + 16 = 30.
#[test]
fn test_rite_multi_tier_rite_calls_rite() {
if let Some(token) = X64V3Token::summon() {
let data = [1.0f32, 2.0, 3.0, 4.0];
let result = entry_for_multi_tier_chain(token, &data);
assert_eq!(result, 30.0); }
}
/// Multi-tier rite with a genuinely SIMD body (SSE f32x4 add).
#[rite(v2, v3, import_intrinsics)]
fn multi_tier_simd_add(a: &[f32; 4], b: &[f32; 4]) -> [f32; 4] {
let va = _mm_loadu_ps(a);
let vb = _mm_loadu_ps(b);
let sum = _mm_add_ps(va, vb);
let mut out = [0.0f32; 4];
_mm_storeu_ps(&mut out, sum);
out
}
#[test]
fn test_rite_multi_tier_real_simd() {
let a = [1.0f32, 2.0, 3.0, 4.0];
let b = [10.0f32, 20.0, 30.0, 40.0];
if archmage::X64V2Token::summon().is_some() {
let r2 = unsafe { multi_tier_simd_add_v2(&a, &b) };
assert_eq!(r2, [11.0, 22.0, 33.0, 44.0]);
}
if X64V3Token::summon().is_some() {
let r3 = unsafe { multi_tier_simd_add_v3(&a, &b) };
assert_eq!(r3, [11.0, 22.0, 33.0, 44.0]);
}
}
/// 256-bit lane-wise multiply; single tier (v3), so no suffixed variants —
/// it is called by its own name below.
#[rite(v3, import_intrinsics)]
fn single_tier_256bit_mul(a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let prod = _mm256_mul_ps(va, vb);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, prod);
out
}
/// 256-bit FMA a*b + c; also single-tier despite the `multi_tier_` name.
#[rite(v3, import_intrinsics)]
fn multi_tier_256bit_fma(a: &[f32; 8], b: &[f32; 8], c: &[f32; 8]) -> [f32; 8] {
let va = _mm256_loadu_ps(a);
let vb = _mm256_loadu_ps(b);
let vc = _mm256_loadu_ps(c);
let result = _mm256_fmadd_ps(va, vb, vc);
let mut out = [0.0f32; 8];
_mm256_storeu_ps(&mut out, result);
out
}
#[test]
fn test_rite_single_tier_256bit_ops() {
if X64V3Token::summon().is_some() {
let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = [2.0f32; 8];
let c = [10.0f32; 8];
let mul = unsafe { single_tier_256bit_mul(&a, &b) };
assert_eq!(mul, [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]);
let fma = unsafe { multi_tier_256bit_fma(&a, &b, &c) };
assert_eq!(fma, [12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0]);
}
}
/// |x| + 1 generated for v1/v2/v3 — target for tier-downgrade calls below.
#[rite(v1, v2, v3)]
fn multi_tier_downgrade_target(x: f32) -> f32 {
x.abs() + 1.0
}
/// A v3 arcane entry may call a lower-tier (v2) variant — presumably sound
/// because the v3 token implies the v2 features; confirm with the tier docs.
#[arcane]
fn entry_calls_v2_from_v3(_token: X64V3Token) -> f32 {
multi_tier_downgrade_target_v2(-5.0)
}
/// Same downgrade pattern, calling the v1 variant from a v3 entry.
#[arcane]
fn entry_calls_v1_from_v3(_token: X64V3Token) -> f32 {
multi_tier_downgrade_target_v1(-7.0)
}
#[test]
fn test_rite_multi_tier_downgrade_from_arcane() {
if let Some(token) = X64V3Token::summon() {
assert_eq!(entry_calls_v2_from_v3(token), 6.0);
assert_eq!(entry_calls_v1_from_v3(token), 8.0);
}
}
/// Identity function generated for all three tiers.
#[rite(v1, v2, v3)]
fn multi_tier_identity(x: i32) -> i32 {
x
}
#[test]
fn test_rite_multi_tier_all_variants_callable() {
assert_eq!(unsafe { multi_tier_identity_v1(42) }, 42);
assert_eq!(unsafe { multi_tier_identity_v1(-1) }, -1);
assert_eq!(unsafe { multi_tier_identity_v1(0) }, 0);
if archmage::X64V2Token::summon().is_some() {
assert_eq!(unsafe { multi_tier_identity_v2(42) }, 42);
assert_eq!(unsafe { multi_tier_identity_v2(-1) }, -1);
}
if X64V3Token::summon().is_some() {
assert_eq!(unsafe { multi_tier_identity_v3(42) }, 42);
assert_eq!(unsafe { multi_tier_identity_v3(i32::MAX) }, i32::MAX);
assert_eq!(unsafe { multi_tier_identity_v3(i32::MIN) }, i32::MIN);
}
}
/// Counts how many elements of `data` exceed `threshold`.
///
/// Rewritten from a manual counting loop to the idiomatic
/// `filter(..).count()` chain — same semantics (NaN elements never compare
/// greater than the threshold, so they are never counted), no per-element
/// bounds checks, and the empty slice trivially yields 0.
#[rite(v1, v3)]
fn multi_tier_complex_body(data: &[f32], threshold: f32) -> usize {
    data.iter().filter(|&&x| x > threshold).count()
}
#[test]
fn test_rite_multi_tier_complex_body() {
    // Exactly four samples (5, 7, 6, 8) sit above the 4.0 threshold.
    let samples = [1.0f32, 5.0, 3.0, 7.0, 2.0, 6.0, 4.0, 8.0];
    assert_eq!(unsafe { multi_tier_complex_body_v1(&samples, 4.0) }, 4);
    if X64V3Token::summon().is_some() {
        assert_eq!(unsafe { multi_tier_complex_body_v3(&samples, 4.0) }, 4);
    }
}
/// Writes the minimum and maximum of `input` through the two out-references.
///
/// Keeps strict `<` / `>` comparisons (rather than `f32::min`/`max`) so the
/// NaN and signed-zero behavior is identical to a plain comparison scan.
#[rite(v1, v3)]
fn multi_tier_out_params(input: &[f32; 4], min_out: &mut f32, max_out: &mut f32) {
    *min_out = input[0];
    *max_out = input[0];
    for &candidate in input.iter().skip(1) {
        if candidate < *min_out {
            *min_out = candidate;
        }
        if candidate > *max_out {
            *max_out = candidate;
        }
    }
}
#[test]
fn test_rite_multi_tier_out_params() {
    let values = [3.0f32, 1.0, 4.0, 2.0];
    let (mut lo, mut hi) = (0.0f32, 0.0f32);
    unsafe { multi_tier_out_params_v1(&values, &mut lo, &mut hi) };
    assert_eq!((lo, hi), (1.0, 4.0));
    if X64V3Token::summon().is_some() {
        let (mut lo3, mut hi3) = (0.0f32, 0.0f32);
        unsafe { multi_tier_out_params_v3(&values, &mut lo3, &mut hi3) };
        assert_eq!((lo3, hi3), (1.0, 4.0));
    }
}
/// Square root guarded against negative inputs.
///
/// The predicate is `x >= 0.0`, so NaN (which fails every comparison) is
/// reported as an error exactly like the negative case.
#[rite(v1, v3)]
fn multi_tier_result(x: f32) -> Result<f32, &'static str> {
    (x >= 0.0).then(|| x.sqrt()).ok_or("negative input")
}
#[test]
fn test_rite_multi_tier_result() {
    assert_eq!(unsafe { multi_tier_result_v1(4.0) }, Ok(2.0));
    assert_eq!(unsafe { multi_tier_result_v1(-1.0) }, Err("negative input"));
    if X64V3Token::summon().is_none() {
        return;
    }
    assert_eq!(unsafe { multi_tier_result_v3(9.0) }, Ok(3.0));
    assert_eq!(unsafe { multi_tier_result_v3(-1.0) }, Err("negative input"));
}
/// Checked element lookup: `None` when `idx` is out of bounds.
#[rite(v1, v3)]
fn multi_tier_option(data: &[f32], idx: usize) -> Option<f32> {
    data.get(idx).map(|&value| value)
}
#[test]
fn test_rite_multi_tier_option() {
    let values = [10.0f32, 20.0, 30.0];
    assert_eq!(unsafe { multi_tier_option_v1(&values, 1) }, Some(20.0));
    assert_eq!(unsafe { multi_tier_option_v1(&values, 5) }, None);
    if X64V3Token::summon().is_none() {
        return;
    }
    assert_eq!(unsafe { multi_tier_option_v3(&values, 0) }, Some(10.0));
    assert_eq!(unsafe { multi_tier_option_v3(&values, 99) }, None);
}
/// Exercises `#[rite]` applied to inherent methods with `&self` and
/// `&mut self` receivers.
mod multi_tier_methods {
    use archmage::rite;

    /// Minimal stateful type carrying a single running value.
    pub struct Accumulator {
        pub value: f32,
    }

    impl Accumulator {
        /// Returns `self.value + x` without mutating the accumulator.
        #[rite(v1, v3)]
        pub fn add(&self, x: f32) -> f32 {
            x + self.value
        }

        /// Builds a fresh accumulator with the value scaled by `factor`.
        #[rite(v1, v3)]
        pub fn scale(&self, factor: f32) -> Accumulator {
            let value = self.value * factor;
            Accumulator { value }
        }

        /// Adds `x` into the accumulator in place.
        #[rite(v1, v3)]
        pub fn accumulate(&mut self, x: f32) {
            self.value = self.value + x;
        }
    }
}
#[test]
fn test_rite_multi_tier_self_receiver() {
    use multi_tier_methods::Accumulator;
    // Borrowing receiver (&self).
    let base = Accumulator { value: 10.0 };
    assert_eq!(unsafe { base.add_v1(5.0) }, 15.0);
    assert_eq!(unsafe { base.scale_v1(3.0) }.value, 30.0);
    // Mutating receiver (&mut self).
    let mut tally = Accumulator { value: 0.0 };
    unsafe {
        tally.accumulate_v1(5.0);
        tally.accumulate_v1(3.0);
    }
    assert_eq!(tally.value, 8.0);
    if X64V3Token::summon().is_some() {
        let big = Accumulator { value: 100.0 };
        assert_eq!(unsafe { big.add_v3(50.0) }, 150.0);
        let mut tally3 = Accumulator { value: 0.0 };
        unsafe { tally3.accumulate_v3(7.0) };
        assert_eq!(tally3.value, 7.0);
    }
}
/// Applies `f` to every element of `data` in place.
#[rite(v1, v3)]
fn multi_tier_apply(data: &mut [f32; 4], f: fn(f32) -> f32) {
    data.iter_mut().for_each(|slot| *slot = f(*slot));
}
/// Doubles `x`; passed by `fn` pointer into `multi_tier_apply` below.
fn double(x: f32) -> f32 {
    2.0 * x
}
/// Flips the sign of `x`; passed by `fn` pointer into `multi_tier_apply`.
/// Kept as unary negation so signed zeros behave exactly as before.
fn negate(x: f32) -> f32 {
    -x
}
#[test]
fn test_rite_multi_tier_fn_pointer() {
    let mut values = [1.0f32, 2.0, 3.0, 4.0];
    unsafe { multi_tier_apply_v1(&mut values, double) };
    assert_eq!(values, [2.0, 4.0, 6.0, 8.0]);
    unsafe { multi_tier_apply_v1(&mut values, negate) };
    assert_eq!(values, [-2.0, -4.0, -6.0, -8.0]);
    if X64V3Token::summon().is_none() {
        return;
    }
    let mut values3 = [10.0f32, 20.0, 30.0, 40.0];
    unsafe { multi_tier_apply_v3(&mut values3, double) };
    assert_eq!(values3, [20.0, 40.0, 60.0, 80.0]);
}
/// Buckets `x` into sentinel values used by the agreement test:
/// NaN → -1, +inf → 1, -inf → -2, ±0 → 0, otherwise the sign of `x`.
#[rite(v1, v3)]
fn multi_tier_edge_cases(x: f32) -> f32 {
    if x.is_nan() {
        return -1.0;
    }
    if x.is_infinite() {
        // Only +inf satisfies `x > 0.0` here.
        return if x > 0.0 { 1.0 } else { -2.0 };
    }
    if x == 0.0 {
        return 0.0;
    }
    x.signum()
}
#[test]
fn test_rite_multi_tier_edge_cases_agree() {
    // (input, expected bucket) pairs covering zeros, signs, NaN, infinities.
    let cases = [
        (0.0f32, 0.0),
        (1.0, 1.0),
        (-1.0, -1.0),
        (f32::NAN, -1.0),
        (f32::INFINITY, 1.0),
        (f32::NEG_INFINITY, -2.0),
        (42.0, 1.0),
        (-0.001, -1.0),
    ];
    let v3_available = X64V3Token::summon().is_some();
    for (input, expected) in cases {
        let r1 = unsafe { multi_tier_edge_cases_v1(input) };
        assert_eq!(r1, expected, "v1 failed for input {input}");
        if v3_available {
            let r3 = unsafe { multi_tier_edge_cases_v3(input) };
            assert_eq!(r3, expected, "v3 failed for input {input}");
        }
    }
}
/// Sum of squares over `data`, accumulated left-to-right in f32.
///
/// The iterator `sum()` is a 0.0-seeded left fold, so the evaluation order
/// matches the explicit accumulator loop it replaces.
#[rite(v1, v3)]
fn multi_tier_sum_of_squares(data: &[f32]) -> f32 {
    data.iter().map(|&x| x * x).sum()
}
#[test]
fn test_rite_multi_tier_auto_vectorizable() {
    // 0^2 + 1^2 + ... + 63^2, with the reference sum built the same way.
    let data: Vec<f32> = (0..64).map(|i| i as f32).collect();
    let expected: f32 = data.iter().map(|x| x * x).sum();
    let r1 = unsafe { multi_tier_sum_of_squares_v1(&data) };
    assert!((r1 - expected).abs() < 0.01, "v1: {r1} != {expected}");
    if X64V3Token::summon().is_none() {
        return;
    }
    let r3 = unsafe { multi_tier_sum_of_squares_v3(&data) };
    assert!((r3 - expected).abs() < 0.01, "v3: {r3} != {expected}");
}
/// Sums a byte slice into a `u32` accumulator.
#[rite(v1, v3)]
fn multi_tier_slice_ops(data: &[u8]) -> u32 {
    data.iter().map(|&b| u32::from(b)).sum()
}
#[test]
fn test_rite_multi_tier_byte_slice() {
    // Every byte value once: 0 + 1 + ... + 255 = 32640.
    let data: Vec<u8> = (0..=255).collect();
    let expected: u32 = (0..=255u32).sum();
    assert_eq!(unsafe { multi_tier_slice_ops_v1(&data) }, expected);
    if X64V3Token::summon().is_some() {
        assert_eq!(unsafe { multi_tier_slice_ops_v3(&data) }, expected);
    }
    // Empty input must sum to zero.
    assert_eq!(unsafe { multi_tier_slice_ops_v1(&[]) }, 0);
}
/// Hand-written function whose name ends in `_v1`; the macro-generated
/// variants of `manually_written` must not collide with it.
fn manually_written_v1(x: f32) -> f32 {
    100.0 + x
}
/// Generated only at v2/v3, leaving the `_v1` suffix free for the manual fn.
#[rite(v2, v3)]
fn manually_written(x: f32) -> f32 {
    200.0 + x
}
#[test]
fn test_rite_multi_tier_no_collision_with_manual() {
    // The hand-written `_v1` is a plain safe fn: no token, no unsafe.
    assert_eq!(manually_written_v1(1.0), 101.0);
    if archmage::X64V2Token::summon().is_some() {
        // The macro-generated `_v2` is called like any other variant.
        assert_eq!(unsafe { manually_written_v2(1.0) }, 201.0);
    }
}
/// v3-only rite; the test below calls it by its unsuffixed name.
#[rite(v3)]
fn single_tier_helper(x: f32) -> f32 {
    x * x
}
/// Sums the four lanes strictly left-to-right (same f32 association as
/// `data[0] + data[1] + data[2] + data[3]`).
#[rite(v1, v3)]
fn multi_tier_uses_single(data: &[f32; 4]) -> f32 {
    let partial = data[0] + data[1];
    let partial = partial + data[2];
    partial + data[3]
}
#[test]
fn test_rite_multi_tier_coexists_with_single() {
    // The multi-tier v1 variant is always callable on x86-64.
    assert_eq!(
        unsafe { multi_tier_uses_single_v1(&[10.0, 20.0, 30.0, 40.0]) },
        100.0
    );
    if X64V3Token::summon().is_none() {
        return;
    }
    assert_eq!(unsafe { single_tier_helper(5.0) }, 25.0);
    assert_eq!(
        unsafe { multi_tier_uses_single_v3(&[1.0, 2.0, 3.0, 4.0]) },
        10.0
    );
}