#![allow(dead_code, unused_variables, unused_imports)]
#[cfg(target_arch = "x86_64")]
mod pattern_concrete_token {
    use archmage::intrinsics::x86_64::*;
    use archmage::{Desktop64, SimdToken, X64V3Token, arcane};

    /// Horizontal sum of eight `f32` lanes loaded from `data`.
    ///
    /// The token is proof that AVX2 is available at runtime.
    #[arcane(import_intrinsics)]
    pub fn sum_f32x8(token: X64V3Token, data: &[f32; 8]) -> f32 {
        let lanes = _mm256_loadu_ps(data);
        // Two horizontal adds reduce within each 128-bit half; the low and
        // high halves are then combined with a scalar add.
        let pairs = _mm256_hadd_ps(lanes, lanes);
        let quads = _mm256_hadd_ps(pairs, pairs);
        let lo = _mm256_castps256_ps128(quads);
        let hi = _mm256_extractf128_ps::<1>(quads);
        _mm_cvtss_f32(_mm_add_ss(lo, hi))
    }

    /// Lane-wise fused multiply-add: `a * b + c` for all eight lanes.
    #[arcane(import_intrinsics)]
    pub fn fma_f32x8(token: Desktop64, a: &[f32; 8], b: &[f32; 8], c: &[f32; 8]) -> [f32; 8] {
        let fused = _mm256_fmadd_ps(
            _mm256_loadu_ps(a),
            _mm256_loadu_ps(b),
            _mm256_loadu_ps(c),
        );
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(&mut out, fused);
        out
    }

    #[test]
    fn test_concrete_token() {
        // Skip silently when the CPU lacks the required features.
        let Some(token) = X64V3Token::summon() else { return };
        let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        assert_eq!(sum_f32x8(token, &data), 36.0);
    }

    #[test]
    fn test_desktop64_alias() {
        let Some(token) = Desktop64::summon() else { return };
        let a = [2.0f32; 8];
        let b = [3.0f32; 8];
        let c = [1.0f32; 8];
        assert_eq!(fma_f32x8(token, &a, &b, &c), [7.0f32; 8]);
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_feature_traits {
    use archmage::intrinsics::x86_64::*;
    use archmage::{HasX64V2, SimdToken, X64V2Token, X64V3Token, arcane};
    #[cfg(feature = "avx512")]
    use archmage::{HasX64V4, X64V4Token};

    /// Total population count over four `u64` words.
    ///
    /// Bound written `impl Trait` style — any token proving at least v2 works.
    #[arcane(import_intrinsics)]
    pub fn popcnt_array(token: impl HasX64V2, data: &[u64; 4]) -> u32 {
        let mut total = 0u32;
        for &word in data {
            total += _popcnt64(word as i64) as u32;
        }
        total
    }

    /// SSE horizontal sum of four lanes; bound written inline-generic style.
    #[arcane(import_intrinsics)]
    pub fn sum_sse<T: HasX64V2>(token: T, data: &[f32; 4]) -> f32 {
        let lanes = _mm_loadu_ps(data);
        let step1 = _mm_hadd_ps(lanes, lanes);
        let step2 = _mm_hadd_ps(step1, step1);
        _mm_cvtss_f32(step2)
    }

    /// Dot product of two 4-lane vectors via `dpps`; bound written `where` style.
    #[arcane(import_intrinsics)]
    pub fn dot_sse<T>(token: T, a: &[f32; 4], b: &[f32; 4]) -> f32
    where
        T: HasX64V2,
    {
        let lhs = _mm_loadu_ps(a);
        let rhs = _mm_loadu_ps(b);
        // 0xFF: multiply-and-sum all four lanes, broadcast result to all lanes.
        _mm_cvtss_f32(_mm_dp_ps::<0xFF>(lhs, rhs))
    }

    #[test]
    fn test_hasx64v2_with_v2_token() {
        if let Some(token) = X64V2Token::summon() {
            let words = [u64::MAX; 4];
            assert_eq!(popcnt_array(token, &words), 256);
        }
    }

    #[test]
    fn test_hasx64v2_with_v3_token() {
        // A v3 token satisfies the v2 trait bound, too.
        if let Some(token) = X64V3Token::summon() {
            assert_eq!(sum_sse(token, &[1.0f32, 2.0, 3.0, 4.0]), 10.0);
        }
    }

    #[cfg(feature = "avx512")]
    #[test]
    fn test_hasx64v4_with_v4_token() {
        if let Some(token) = X64V4Token::summon() {
            let a = [1.0f32, 2.0, 3.0, 4.0];
            let b = [2.0f32; 4];
            assert_eq!(dot_sse(token, &a, &b), 20.0);
        }
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_width_traits_deprecated {
    use archmage::intrinsics::x86_64::*;
    use archmage::{Has256BitSimd, SimdToken, X64V3Token, arcane};

    /// Lane-wise addition, bounded by the (deprecated) width trait.
    #[arcane(import_intrinsics)]
    pub fn add_f32x8_deprecated(token: impl Has256BitSimd, a: &[f32; 8], b: &[f32; 8]) -> [f32; 8] {
        let lhs = _mm256_loadu_ps(a);
        let rhs = _mm256_loadu_ps(b);
        let sum = _mm256_add_ps(lhs, rhs);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(&mut out, sum);
        out
    }

    /// Lane-wise `a * b + c`, using the preferred concrete-token parameter.
    #[arcane(import_intrinsics)]
    pub fn fma_f32x8_correct(
        token: X64V3Token,
        a: &[f32; 8],
        b: &[f32; 8],
        c: &[f32; 8],
    ) -> [f32; 8] {
        let fused = _mm256_fmadd_ps(
            _mm256_loadu_ps(a),
            _mm256_loadu_ps(b),
            _mm256_loadu_ps(c),
        );
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(&mut out, fused);
        out
    }

    #[test]
    fn test_deprecated_pattern_still_works_for_now() {
        if let Some(token) = X64V3Token::summon() {
            let result = add_f32x8_deprecated(token, &[1.0f32; 8], &[2.0f32; 8]);
            assert_eq!(result, [3.0f32; 8]);
        }
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_self_receiver {
    use archmage::intrinsics::x86_64::*;
    use archmage::{HasX64V2, SimdToken, X64V3Token, arcane};

    /// Eight packed `f32` lanes stored as a plain array.
    #[derive(Clone, Copy, Debug, PartialEq)]
    pub struct Vec8f32(pub [f32; 8]);

    /// Demonstrates the three receiver kinds (`&self`, `self`, `&mut self`)
    /// under the `#[arcane]` macro.
    pub trait SimdOps {
        fn double(&self, token: X64V3Token) -> Self;
        fn square(self, token: X64V3Token) -> Self;
        fn scale(&mut self, token: X64V3Token, factor: f32);
    }

    impl SimdOps for Vec8f32 {
        // `_self = Vec8f32` tells the macro the receiver's concrete type;
        // inside the body the receiver is spelled `_self`, not `self`.
        #[arcane(_self = Vec8f32, import_intrinsics)]
        fn double(&self, _token: X64V3Token) -> Self {
            let v = _mm256_loadu_ps(&_self.0);
            // x + x doubles each lane without a multiply.
            let doubled = _mm256_add_ps(v, v);
            let mut out = [0.0f32; 8];
            _mm256_storeu_ps(&mut out, doubled);
            Vec8f32(out)
        }

        // By-value receiver: consumes the vector and returns its square.
        #[arcane(_self = Vec8f32, import_intrinsics)]
        fn square(self, _token: X64V3Token) -> Self {
            let v = _mm256_loadu_ps(&_self.0);
            let squared = _mm256_mul_ps(v, v);
            let mut out = [0.0f32; 8];
            _mm256_storeu_ps(&mut out, squared);
            Vec8f32(out)
        }

        // Mutable receiver: scales every lane in place by `factor`.
        #[arcane(_self = Vec8f32, import_intrinsics)]
        fn scale(&mut self, _token: X64V3Token, factor: f32) {
            let v = _mm256_loadu_ps(&_self.0);
            let scale = _mm256_set1_ps(factor);
            let scaled = _mm256_mul_ps(v, scale);
            _mm256_storeu_ps(&mut _self.0, scaled);
        }
    }

    #[test]
    fn test_self_ref() {
        if let Some(token) = X64V3Token::summon() {
            let v = Vec8f32([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
            let result = v.double(token);
            assert_eq!(result.0, [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]);
        }
    }

    #[test]
    fn test_self_owned() {
        if let Some(token) = X64V3Token::summon() {
            let v = Vec8f32([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
            let result = v.square(token);
            assert_eq!(result.0, [1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0]);
        }
    }

    #[test]
    fn test_self_mut_ref() {
        if let Some(token) = X64V3Token::summon() {
            let mut v = Vec8f32([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
            v.scale(token, 3.0);
            assert_eq!(v.0, [3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0]);
        }
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_token_passthrough {
    use archmage::intrinsics::x86_64::*;
    use archmage::{SimdToken, X64V3Token, arcane};

    /// Lane-wise add helper; the token parameter lets callers pass proof through.
    #[arcane(import_intrinsics)]
    fn add_vectors(token: X64V3Token, a: __m256, b: __m256) -> __m256 {
        _mm256_add_ps(a, b)
    }

    /// Lane-wise multiply helper.
    #[arcane(import_intrinsics)]
    fn mul_vectors(token: X64V3Token, a: __m256, b: __m256) -> __m256 {
        _mm256_mul_ps(a, b)
    }

    /// Dot product of two 8-lane vectors, composed from the helpers above.
    #[arcane(import_intrinsics)]
    pub fn dot_product(token: X64V3Token, a: &[f32; 8], b: &[f32; 8]) -> f32 {
        let prod = mul_vectors(token, _mm256_loadu_ps(a), _mm256_loadu_ps(b));
        // Horizontal reduction: fold within 128-bit halves, then across them.
        let fold1 = _mm256_hadd_ps(prod, prod);
        let fold2 = _mm256_hadd_ps(fold1, fold1);
        let lo = _mm256_castps256_ps128(fold2);
        let hi = _mm256_extractf128_ps::<1>(fold2);
        _mm_cvtss_f32(_mm_add_ss(lo, hi))
    }

    /// Evaluates `a*x^2 + b*x + c` lane-wise, passing the token to each helper.
    #[arcane(import_intrinsics)]
    pub fn polynomial(token: X64V3Token, x: &[f32; 8], a: f32, b: f32, c: f32) -> [f32; 8] {
        let vx = _mm256_loadu_ps(x);
        let coeff_a = _mm256_set1_ps(a);
        let coeff_b = _mm256_set1_ps(b);
        let coeff_c = _mm256_set1_ps(c);
        let x2 = mul_vectors(token, vx, vx);
        let term_a = mul_vectors(token, coeff_a, x2);
        let term_b = mul_vectors(token, coeff_b, vx);
        let partial = add_vectors(token, term_a, term_b);
        let total = add_vectors(token, partial, coeff_c);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(&mut out, total);
        out
    }

    #[test]
    fn test_token_passthrough() {
        if let Some(token) = X64V3Token::summon() {
            let a = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
            assert_eq!(dot_product(token, &a, &[2.0f32; 8]), 72.0);
        }
    }

    #[test]
    fn test_composed_helpers() {
        if let Some(token) = X64V3Token::summon() {
            let xs = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
            let got = polynomial(token, &xs, 2.0, 3.0, 1.0);
            let want: [f32; 8] = xs.map(|x| 2.0 * x * x + 3.0 * x + 1.0);
            assert_eq!(got, want);
        }
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_manual_dispatch_x86 {
    #[cfg(feature = "avx512")]
    use archmage::X64V4Token;
    use archmage::intrinsics::x86_64::*;
    use archmage::{SimdToken, X64V3Token, arcane};

    /// AVX2 path: accumulates eight lanes at a time, folds the accumulator,
    /// then adds any leftover elements that did not fill a full chunk.
    #[arcane(import_intrinsics)]
    pub fn sum_v3(token: X64V3Token, data: &[f32]) -> f32 {
        let chunks = data.chunks_exact(8);
        let tail = chunks.remainder();
        let mut acc = _mm256_setzero_ps();
        for chunk in chunks {
            acc = _mm256_add_ps(acc, _mm256_loadu_ps(chunk.first_chunk().unwrap()));
        }
        // Horizontal reduction of the 8-lane accumulator.
        let fold1 = _mm256_hadd_ps(acc, acc);
        let fold2 = _mm256_hadd_ps(fold1, fold1);
        let lo = _mm256_castps256_ps128(fold2);
        let hi = _mm256_extractf128_ps::<1>(fold2);
        let mut total = _mm_cvtss_f32(_mm_add_ss(lo, hi));
        for &x in tail {
            total += x;
        }
        total
    }

    /// Portable scalar fallback.
    pub fn sum_scalar(data: &[f32]) -> f32 {
        data.iter().sum()
    }

    /// Runtime dispatch: try the widest available token first, scalar last.
    pub fn sum(data: &[f32]) -> f32 {
        #[cfg(feature = "avx512")]
        if let Some(token) = X64V4Token::summon() {
            // A v4 token can be downgraded to drive the v3 kernel.
            return sum_v3(token.v3(), data);
        }
        if let Some(token) = X64V3Token::summon() {
            return sum_v3(token, data);
        }
        sum_scalar(data)
    }

    #[test]
    fn test_dispatch() {
        let values: Vec<f32> = (1..=100).map(|i| i as f32).collect();
        assert_eq!(sum(&values), 5050.0);
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_magetypes {
    use archmage::{SimdToken, X64V3Token};
    use magetypes::simd::f32x8;

    #[test]
    fn test_magetypes_basic() {
        // Skip silently when AVX2 is unavailable.
        let Some(token) = X64V3Token::summon() else { return };
        let lhs = f32x8::splat(token, 2.0);
        let rhs = f32x8::splat(token, 3.0);
        assert_eq!((lhs + rhs).to_array(), [5.0f32; 8]);
    }

    #[test]
    fn test_magetypes_load_store() {
        let Some(token) = X64V3Token::summon() else { return };
        let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let v = f32x8::load(token, &data);
        let twice = v + v;
        assert_eq!(
            twice.to_array(),
            [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]
        );
    }

    #[test]
    fn test_magetypes_reduce() {
        let Some(token) = X64V3Token::summon() else { return };
        let data = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        assert_eq!(f32x8::load(token, &data).reduce_add(), 36.0);
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_separate_platforms_x86 {
    use archmage::intrinsics::x86_64::*;
    use archmage::{SimdToken, X64V3Token, arcane};

    /// Squares every lane in place, eight floats per iteration.
    /// Elements past the last full chunk of 8 are left untouched.
    #[arcane(import_intrinsics)]
    pub fn process_x86(token: X64V3Token, data: &mut [f32]) {
        for window in data.chunks_exact_mut(8) {
            let lanes = _mm256_loadu_ps(window.first_chunk().unwrap());
            let squared = _mm256_mul_ps(lanes, lanes);
            _mm256_storeu_ps(window.first_chunk_mut().unwrap(), squared);
        }
    }

    #[test]
    fn test_x86_impl() {
        if let Some(token) = X64V3Token::summon() {
            let mut values = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
            process_x86(token, &mut values);
            assert_eq!(values, [1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0]);
        }
    }
}
#[cfg(target_arch = "aarch64")]
mod pattern_separate_platforms_arm {
    use archmage::intrinsics::aarch64::*;
    use archmage::{NeonToken, SimdToken, arcane};

    /// Squares every lane in place, four floats per iteration (NEON).
    /// Elements past the last full chunk of 4 are left untouched.
    #[arcane(import_intrinsics)]
    pub fn process_arm(token: NeonToken, data: &mut [f32]) {
        for window in data.chunks_exact_mut(4) {
            let lanes = vld1q_f32(window.first_chunk().unwrap());
            let squared = vmulq_f32(lanes, lanes);
            vst1q_f32(window.first_chunk_mut().unwrap(), squared);
        }
    }

    #[test]
    fn test_arm_impl() {
        if let Some(token) = NeonToken::summon() {
            let mut values = [1.0f32, 2.0, 3.0, 4.0];
            process_arm(token, &mut values);
            assert_eq!(values, [1.0, 4.0, 9.0, 16.0]);
        }
    }
}
#[cfg(target_arch = "x86_64")]
mod pattern_token_extraction {
    #[cfg(feature = "avx512")]
    use archmage::X64V4Token;
    use archmage::intrinsics::x86_64::*;
    use archmage::{SimdToken, X64V2Token, X64V3Token, arcane};

    /// SSE3 horizontal sum; only needs a v2-level token.
    #[arcane(import_intrinsics)]
    fn sse_operation(token: X64V2Token, data: &[f32; 4]) -> f32 {
        let lanes = _mm_loadu_ps(data);
        let once = _mm_hadd_ps(lanes, lanes);
        let twice = _mm_hadd_ps(once, once);
        _mm_cvtss_f32(twice)
    }

    /// A v3 token downgrades to v2 to call v2-only helpers.
    #[arcane(import_intrinsics)]
    pub fn flexible_sum(token: X64V3Token, data: &[f32; 4]) -> f32 {
        sse_operation(token.v2(), data)
    }

    /// Likewise, a v4 token downgrades to v3.
    #[cfg(feature = "avx512")]
    #[arcane(import_intrinsics)]
    pub fn avx512_with_fallback(token: X64V4Token, data: &[f32; 4]) -> f32 {
        flexible_sum(token.v3(), data)
    }

    #[test]
    fn test_token_extraction() {
        if let Some(token) = X64V3Token::summon() {
            assert_eq!(flexible_sum(token, &[1.0f32, 2.0, 3.0, 4.0]), 10.0);
        }
    }
}