#[cfg(all(target_os = "macos", feature = "coreml"))]
use std::ffi::c_void;
/// Activation-function selector passed across FFI to Apple's BNNS
/// (Basic Neural Network Subroutines) API.
///
/// NOTE(review): the discriminants (0-6, then 50-54) are assumed to mirror
/// the `BNNSActivationFunction*` constants in the Accelerate SDK headers —
/// confirm against the SDK version actually linked before relying on these
/// in FFI calls.
#[repr(i32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BNNSActivationFunction {
    Identity = 0,
    ReLU = 1,
    LeakyReLU = 2,
    Sigmoid = 3,
    Tanh = 4,
    ScaledTanh = 5,
    Softmax = 6,
    // Newer activations occupy a separate value range.
    SiLU = 50,
    GELU = 51,
    GELUApprox = 52,
    HardSigmoid = 53,
    HardSwish = 54,
}
/// Element data-type codes for BNNS tensor descriptors.
///
/// The values encode a type class plus bit width (e.g. `0x10000` float class,
/// `0x20000` integer class, low bits = width) — presumably matching the
/// Accelerate header encoding; confirm against the SDK.
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BNNSDataType {
    Float16 = 0x10010,
    Float32 = 0x10020,
    Int8 = 0x20008,
    Int16 = 0x20010,
    Int32 = 0x20020,
}
/// `#[repr(C)]` mirror of the BNNS n-dimensional array descriptor used for
/// FFI calls (up to 8 dimensions).
///
/// NOTE(review): field order and types must stay byte-for-byte in sync with
/// the `BNNSNDArrayDescriptor` definition in the Accelerate SDK headers;
/// `#[repr(C)]` only guarantees the order declared here.
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct BNNSNDArrayDescriptor {
    /// Raw BNNS flag bits (see `BNNS_FLAGS_NONE`).
    pub flags: u32,
    /// Raw BNNS data-layout code.
    pub layout: u32,
    /// Extent of each dimension; unused trailing entries are zero.
    pub size: [usize; 8],
    /// Per-dimension strides; unused trailing entries are zero.
    pub stride: [isize; 8],
    /// Pointer to tensor contents (may be null before binding).
    pub data: *mut c_void,
    pub data_type: BNNSDataType,
    /// Optional side table — presumably for table-based quantization;
    /// confirm against BNNS documentation.
    pub table_data: *mut c_void,
    pub table_data_type: BNNSDataType,
    /// Scale/bias applied to quantized representations.
    pub data_scale: f32,
    pub data_bias: f32,
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
impl Default for BNNSNDArrayDescriptor {
    /// Zeroed descriptor: null data pointers, `Float32` element types, and
    /// identity quantization (`scale = 1.0`, `bias = 0.0`).
    fn default() -> Self {
        Self {
            flags: 0,
            layout: 0,
            size: [0; 8],
            stride: [0; 8],
            data: std::ptr::null_mut(),
            data_type: BNNSDataType::Float32,
            table_data: std::ptr::null_mut(),
            table_data_type: BNNSDataType::Float32,
            data_scale: 1.0,
            data_bias: 0.0,
        }
    }
}
/// `#[repr(C)]` mirror of the BNNS activation descriptor: a selector plus
/// two scalar parameters whose meaning depends on the chosen function.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct BNNSActivation {
    pub function: BNNSActivationFunction,
    /// First activation parameter; interpretation depends on `function`.
    pub alpha: f32,
    /// Second activation parameter; interpretation depends on `function`.
    pub beta: f32,
}
/// BNNS flags value meaning "no special behavior requested".
pub const BNNS_FLAGS_NONE: u32 = 0;
/// Opaque handle to a BNNS filter object returned by the C API.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub type BNNSFilter = *mut c_void;
/// Direct binding to single-precision GEMM from Apple's Accelerate framework
/// via its CBLAS interface. `order`/`transa`/`transb` are raw CBLAS enum
/// values passed as `i32`.
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[link(name = "Accelerate", kind = "framework")]
extern "C" {
    fn cblas_sgemm(
        order: i32,
        transa: i32,
        transb: i32,
        m: i32,
        n: i32,
        k: i32,
        alpha: f32,
        a: *const f32,
        lda: i32,
        b: *const f32,
        ldb: i32,
        beta: f32,
        c: *mut f32,
        ldc: i32,
    );
}
/// Reports whether the Apple Neural Engine code path can be used.
///
/// True only for builds targeting macOS on Apple silicon (`aarch64`) with
/// the `coreml` feature enabled; false everywhere else. The answer is a
/// compile-time constant, so repeated calls always agree.
#[inline(always)]
pub fn is_ane_available() -> bool {
    cfg!(all(target_os = "macos", feature = "coreml", target_arch = "aarch64"))
}
// Dispatch-heuristic tuning constants. NOTE(review): the thresholds below
// look empirically chosen for ANE-vs-GPU crossover behavior — confirm
// against the benchmarks that produced them before changing.
/// Smallest batch the ANE heuristics accept.
const ANE_MIN_BATCH: usize = 1;
/// Largest batch the ANE heuristics accept.
const ANE_MAX_BATCH: usize = 64;
/// Minimum feature dimension considered worth offloading.
const ANE_MIN_DIM: usize = 64;
/// Above this max dimension, matmuls start favoring other backends.
const ANE_MATMUL_CROSSOVER_DIM: usize = 1536;
/// Dimension regime where the ANE is treated as clearly fastest.
const ANE_OPTIMAL_DIM: usize = 512;
/// Beyond this dimension the GPU is treated as dominant.
const GPU_DOMINANCE_DIM: usize = 2048;
/// Element-count cap for routing activations to the ANE path.
const ANE_ACTIVATION_MAX_SIZE: usize = 10_000_000;
/// Heuristic gate for routing an elementwise op over a `[batch_size, dim]`
/// buffer to the ANE path.
///
/// Requires ANE availability, `batch_size` within
/// `[ANE_MIN_BATCH, ANE_MAX_BATCH]`, `dim >= ANE_MIN_DIM`, and `dim` a
/// multiple of 16.
#[inline(always)]
pub fn should_use_ane(batch_size: usize, dim: usize) -> bool {
    if !is_ane_available() {
        return false;
    }
    let batch_ok = (ANE_MIN_BATCH..=ANE_MAX_BATCH).contains(&batch_size);
    let dim_ok = dim >= ANE_MIN_DIM && dim % 16 == 0;
    batch_ok && dim_ok
}
/// Heuristic for routing a single `m x k` by `k x n` GEMM to the ANE path.
///
/// Decision zones by the largest dimension:
/// - `<= ANE_OPTIMAL_DIM`: use ANE whenever `1 <= m <= ANE_MAX_BATCH`;
/// - `<= ANE_MATMUL_CROSSOVER_DIM`: use ANE for modest `m`, bounded total
///   work, and at least one 16-aligned inner dimension;
/// - `> GPU_DOMINANCE_DIM`: never use ANE.
///
/// NOTE(review): the final expression is dead code. At that point
/// `max_dim > ANE_MATMUL_CROSSOVER_DIM` is guaranteed (the earlier branch
/// returned otherwise), so the `max_dim <= ANE_MATMUL_CROSSOVER_DIM`
/// conjunct is always false and the function always returns `false` in the
/// crossover-to-GPU zone. Confirm whether the `m == 1` (GEMV) fast path was
/// intended to apply there; if so, drop that conjunct.
#[inline(always)]
pub fn should_use_ane_matmul(m: usize, k: usize, n: usize) -> bool {
    if !is_ane_available() {
        return false;
    }
    let max_dim = m.max(k).max(n);
    let total_ops = m * k * n;
    if max_dim <= ANE_OPTIMAL_DIM {
        return m >= 1 && m <= ANE_MAX_BATCH;
    }
    if max_dim > GPU_DOMINANCE_DIM {
        return false;
    }
    if max_dim <= ANE_MATMUL_CROSSOVER_DIM {
        return m >= 1
            && m <= ANE_MAX_BATCH
            && total_ops < 100_000_000
            && (k % 16 == 0 || n % 16 == 0);
    }
    // Dead in practice: max_dim is strictly greater than
    // ANE_MATMUL_CROSSOVER_DIM here, making this conjunction always false.
    m == 1
        && k >= ANE_MIN_DIM
        && n >= ANE_MIN_DIM
        && max_dim <= ANE_MATMUL_CROSSOVER_DIM
        && (k % 16 == 0 || n % 16 == 0)
}
/// Heuristic gate for routing an activation over a `[batch_size, dim]`
/// buffer to the ANE path.
///
/// Activations tolerate a larger batch (`2 * ANE_MAX_BATCH`) than matmuls,
/// but the total element count must stay under `ANE_ACTIVATION_MAX_SIZE`
/// and `dim` must be a 16-multiple of at least `ANE_MIN_DIM`.
#[inline(always)]
pub fn should_use_ane_activation(batch_size: usize, dim: usize) -> bool {
    let element_count = batch_size * dim;
    if !is_ane_available() {
        return false;
    }
    batch_size >= ANE_MIN_BATCH
        && batch_size <= ANE_MAX_BATCH * 2
        && dim >= ANE_MIN_DIM
        && dim % 16 == 0
        && element_count < ANE_ACTIVATION_MAX_SIZE
}
/// Produces a human-readable routing recommendation for an `m x k` by
/// `k x n` matmul, including a confidence score and a rough expected
/// speedup relative to the alternative backend.
///
/// When the ANE is unavailable the answer is a definitive "don't use it";
/// otherwise the verdict is bucketed by the largest dimension involved.
pub fn get_ane_recommendation(m: usize, k: usize, n: usize) -> AneRecommendation {
    if !is_ane_available() {
        return AneRecommendation {
            use_ane: false,
            confidence: 1.0,
            reason: "ANE not available on this device",
            expected_speedup: 1.0,
        };
    }
    let largest = m.max(k).max(n);
    // Select all four fields at once, then build the struct a single time.
    let (use_ane, confidence, reason, expected_speedup) = if largest <= ANE_OPTIMAL_DIM {
        (true, 0.95, "Small matrix - ANE has 30-50% advantage", 1.4)
    } else if largest <= ANE_MATMUL_CROSSOVER_DIM {
        (true, 0.7, "Medium matrix - ANE has slight advantage", 1.15)
    } else if largest <= GPU_DOMINANCE_DIM {
        (false, 0.6, "Crossover zone - GPU has slight advantage", 0.9)
    } else {
        (false, 0.95, "Large matrix - GPU has 30-50% advantage", 0.65)
    };
    AneRecommendation {
        use_ane,
        confidence,
        reason,
        expected_speedup,
    }
}
/// Result of `get_ane_recommendation`: a routing verdict plus diagnostics.
#[derive(Debug, Clone)]
pub struct AneRecommendation {
    /// Whether the ANE path is recommended for this shape.
    pub use_ane: bool,
    /// Confidence in the verdict, in `[0, 1]`.
    pub confidence: f32,
    /// Human-readable justification for the verdict.
    pub reason: &'static str,
    /// Estimated speedup vs. the alternative backend (>1 favors ANE).
    pub expected_speedup: f32,
}
/// Row-major single-precision matmul `C = A * B` with size checks.
///
/// Despite the name, this currently routes to Accelerate's CBLAS `sgemm`
/// (see `matmul_ane_unchecked`) rather than a dedicated ANE kernel.
///
/// # Panics
/// In debug builds, panics if slice lengths do not match `m*k`, `k*n`,
/// `m*n`; release builds skip the checks.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn matmul_ane(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    debug_assert_eq!(a.len(), m * k, "Matrix A size mismatch");
    debug_assert_eq!(b.len(), k * n, "Matrix B size mismatch");
    debug_assert_eq!(c.len(), m * n, "Matrix C size mismatch");
    // SAFETY: lengths are validated above in debug builds; release builds
    // rely on callers honoring the documented size contract.
    unsafe {
        matmul_ane_unchecked(a, b, c, m, k, n);
    }
}
/// Raw CBLAS `sgemm` call: `C = 1.0 * A * B + 0.0 * C`, row-major, no
/// transposition.
///
/// # Safety
/// The caller must guarantee `a.len() >= m*k`, `b.len() >= k*n`, and
/// `c.len() >= m*n`; the pointers are handed straight to BLAS with no
/// bounds checking. NOTE(review): dimensions are cast to `i32`, so callers
/// must also keep `m`, `k`, `n` below `i32::MAX`.
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[inline(always)]
pub unsafe fn matmul_ane_unchecked(
    a: &[f32],
    b: &[f32],
    c: &mut [f32],
    m: usize,
    k: usize,
    n: usize,
) {
    // CBLAS enum values: CblasRowMajor = 101, CblasNoTrans = 111.
    const ROW_MAJOR: i32 = 101;
    const NO_TRANS: i32 = 111;
    cblas_sgemm(
        ROW_MAJOR,
        NO_TRANS,
        NO_TRANS,
        m as i32, // rows of A and C
        n as i32, // cols of B and C
        k as i32, // shared inner dimension
        1.0,      // alpha
        a.as_ptr(),
        k as i32, // lda (row-major A)
        b.as_ptr(),
        n as i32, // ldb (row-major B)
        0.0,      // beta: overwrite C
        c.as_mut_ptr(),
        n as i32, // ldc (row-major C)
    );
}
/// Batched row-major matmul: for each batch `i`, computes
/// `C[i] = A[i] * B[i]` over contiguous per-batch panels of sizes
/// `m*k`, `k*n`, and `m*n` respectively.
///
/// # Panics
/// In debug builds, panics when the slice lengths do not match the batched
/// shapes. Out-of-range slicing panics in all builds if the inputs are too
/// short.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn batched_matmul_ane(
    a: &[f32],
    b: &[f32],
    c: &mut [f32],
    batch_size: usize,
    m: usize,
    k: usize,
    n: usize,
) {
    debug_assert_eq!(a.len(), batch_size * m * k);
    debug_assert_eq!(b.len(), batch_size * k * n);
    debug_assert_eq!(c.len(), batch_size * m * n);
    let (len_a, len_b, len_c) = (m * k, k * n, m * n);
    for i in 0..batch_size {
        let lhs = &a[i * len_a..][..len_a];
        let rhs = &b[i * len_b..][..len_b];
        let dst = &mut c[i * len_c..][..len_c];
        // SAFETY: each panel has exactly m*k / k*n / m*n elements by
        // construction of the slices above.
        unsafe {
            matmul_ane_unchecked(lhs, rhs, dst, m, k, n);
        }
    }
}
/// GELU over a `[batch_size, dim]` buffer on the "ANE" path.
///
/// Currently delegates to the scalar CPU implementation; `batch_size` and
/// `dim` are only used for the length sanity check. Presumably a real
/// BNNS/ANE kernel is intended here eventually — TODO confirm.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn gelu_ane(x: &mut [f32], batch_size: usize, dim: usize) {
    debug_assert_eq!(x.len(), batch_size * dim);
    gelu_scalar(x);
}
/// SiLU over a `[batch_size, dim]` buffer on the "ANE" path.
///
/// Currently delegates to the scalar CPU implementation; `batch_size` and
/// `dim` are only used for the length sanity check — same caveat as
/// `gelu_ane`.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn silu_ane(x: &mut [f32], batch_size: usize, dim: usize) {
    debug_assert_eq!(x.len(), batch_size * dim);
    silu_scalar(x);
}
/// Row-wise softmax over a `[batch_size, dim]` buffer on the "ANE" path.
///
/// Delegates each row of length `dim` to the scalar implementation.
/// Note: `chunks_mut` panics if `dim == 0`.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn softmax_ane(x: &mut [f32], batch_size: usize, dim: usize) {
    debug_assert_eq!(x.len(), batch_size * dim);
    x.chunks_mut(dim).for_each(softmax_scalar);
}
/// Per-row layer normalization over a `[batch_size, dim]` buffer:
/// each row is centered, scaled by `1 / sqrt(var + eps)`, then affinely
/// transformed by `weight` and `bias` (both of length `dim`).
///
/// # Panics
/// In debug builds, panics on length mismatches.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn layer_norm_ane(
    x: &mut [f32],
    weight: &[f32],
    bias: &[f32],
    batch_size: usize,
    dim: usize,
    eps: f32,
) {
    debug_assert_eq!(x.len(), batch_size * dim);
    debug_assert_eq!(weight.len(), dim);
    debug_assert_eq!(bias.len(), dim);
    for row in 0..batch_size {
        let start = row * dim;
        let row_slice = &mut x[start..start + dim];
        let mean = row_slice.iter().sum::<f32>() / dim as f32;
        let variance =
            row_slice.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / dim as f32;
        let inv_std = 1.0 / (variance + eps).sqrt();
        for ((v, &w), &b) in row_slice.iter_mut().zip(weight).zip(bias) {
            *v = (*v - mean) * inv_std * w + b;
        }
    }
}
/// Per-row RMS normalization over a `[batch_size, dim]` buffer:
/// each row is scaled by `1 / sqrt(mean(x^2) + eps)` and then elementwise
/// multiplied by `weight` (length `dim`). No centering is performed.
///
/// # Panics
/// In debug builds, panics on length mismatches.
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub fn rms_norm_ane(x: &mut [f32], weight: &[f32], batch_size: usize, dim: usize, eps: f32) {
    debug_assert_eq!(x.len(), batch_size * dim);
    debug_assert_eq!(weight.len(), dim);
    for row in 0..batch_size {
        let start = row * dim;
        let row_slice = &mut x[start..start + dim];
        let sum_sq: f32 = row_slice.iter().map(|v| v * v).sum();
        let rms = (sum_sq / dim as f32 + eps).sqrt();
        let inv_rms = 1.0 / rms;
        for (v, &w) in row_slice.iter_mut().zip(weight) {
            *v = *v * inv_rms * w;
        }
    }
}
/// In-place tanh-approximation GELU:
/// `gelu(t) ≈ 0.5 * t * (1 + tanh(sqrt(2/pi) * (t + 0.044715 * t^3)))`.
fn gelu_scalar(x: &mut [f32]) {
    const K0: f32 = 0.7978845608; // sqrt(2 / pi)
    const K1: f32 = 0.044715;
    x.iter_mut().for_each(|v| {
        let t = *v;
        // Same association order as the reference formula to keep results
        // bit-identical: ((K1 * t) * t) * t.
        let inner = K0 * (t + K1 * t * t * t);
        *v = 0.5 * t * (1.0 + inner.tanh());
    });
}
/// In-place SiLU (a.k.a. swish): `silu(t) = t * sigmoid(t) = t / (1 + e^-t)`.
fn silu_scalar(x: &mut [f32]) {
    for value in x.iter_mut() {
        let denom = 1.0 + (-*value).exp();
        *value /= denom;
    }
}
/// In-place numerically-stable softmax over a single row: subtracts the
/// maximum before exponentiating, then normalizes so the row sums to 1.
/// Empty rows are left untouched.
fn softmax_scalar(x: &mut [f32]) {
    if x.is_empty() {
        return;
    }
    let peak = x.iter().fold(f32::NEG_INFINITY, |acc, &v| acc.max(v));
    let mut total = 0.0;
    for v in x.iter_mut() {
        *v = (*v - peak).exp();
        total += *v;
    }
    let scale = 1.0 / total;
    for v in x.iter_mut() {
        *v *= scale;
    }
}
/// Stub for builds without macOS + `coreml`.
///
/// # Panics
/// Always — ANE support is compiled out in this configuration.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn matmul_ane(_a: &[f32], _b: &[f32], _c: &mut [f32], _m: usize, _k: usize, _n: usize) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Stub for builds without macOS + `coreml`; always panics.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn batched_matmul_ane(
    _a: &[f32],
    _b: &[f32],
    _c: &mut [f32],
    _batch_size: usize,
    _m: usize,
    _k: usize,
    _n: usize,
) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Stub for builds without macOS + `coreml`; always panics.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn gelu_ane(_x: &mut [f32], _batch_size: usize, _dim: usize) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Stub for builds without macOS + `coreml`; always panics.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn silu_ane(_x: &mut [f32], _batch_size: usize, _dim: usize) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Stub for builds without macOS + `coreml`; always panics.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn softmax_ane(_x: &mut [f32], _batch_size: usize, _dim: usize) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Stub for builds without macOS + `coreml`; always panics.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn layer_norm_ane(
    _x: &mut [f32],
    _weight: &[f32],
    _bias: &[f32],
    _batch_size: usize,
    _dim: usize,
    _eps: f32,
) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Stub for builds without macOS + `coreml`; always panics.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub fn rms_norm_ane(_x: &mut [f32], _weight: &[f32], _batch_size: usize, _dim: usize, _eps: f32) {
    panic!("ANE operations require macOS with 'coreml' feature enabled");
}
/// Backend-dispatching matmul `C = A * B` (row-major, `m x k` by `k x n`).
///
/// Priority: ANE path (macOS + `coreml`) when the heuristic approves, then
/// Accelerate BLAS (macOS + `accelerate`), otherwise the NEON kernel.
/// Note: on macOS with `coreml` but without `accelerate`, heuristic-rejected
/// shapes fall through to `gemm_neon`.
pub fn matmul_auto(a: &[f32], b: &[f32], c: &mut [f32], m: usize, k: usize, n: usize) {
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        if should_use_ane_matmul(m, k, n) {
            matmul_ane(a, b, c, m, k, n);
            return;
        }
    }
    #[cfg(all(target_os = "macos", feature = "accelerate"))]
    {
        crate::kernels::accelerate::gemm_accelerate(a, b, c, m, k, n);
        return;
    }
    #[cfg(not(all(target_os = "macos", feature = "accelerate")))]
    {
        crate::kernels::matmul::gemm_neon(a, b, c, m, k, n);
    }
}
/// GELU with automatic backend choice: ANE path when the activation
/// heuristic approves, otherwise the batched CPU kernel.
pub fn gelu_auto(x: &mut [f32], batch_size: usize, dim: usize) {
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        if should_use_ane(batch_size, dim) {
            gelu_ane(x, batch_size, dim);
            return;
        }
    }
    crate::kernels::activations::batch_gelu(x, dim);
}
/// SiLU with automatic backend choice: ANE path when the activation
/// heuristic approves, otherwise the batched CPU kernel.
pub fn silu_auto(x: &mut [f32], batch_size: usize, dim: usize) {
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        if should_use_ane(batch_size, dim) {
            silu_ane(x, batch_size, dim);
            return;
        }
    }
    crate::kernels::activations::batch_silu(x, dim);
}
/// Row-wise softmax with automatic backend choice: ANE path when the
/// heuristic approves, otherwise the batched CPU kernel.
pub fn softmax_auto(x: &mut [f32], batch_size: usize, dim: usize) {
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        if should_use_ane(batch_size, dim) {
            softmax_ane(x, batch_size, dim);
            return;
        }
    }
    crate::kernels::activations::batch_softmax(x, dim);
}
/// Layer normalization with automatic backend choice: ANE path when the
/// heuristic approves, otherwise the batched NEON kernel.
pub fn layer_norm_auto(
    x: &mut [f32],
    weight: &[f32],
    bias: &[f32],
    batch_size: usize,
    dim: usize,
    eps: f32,
) {
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        if should_use_ane(batch_size, dim) {
            layer_norm_ane(x, weight, bias, batch_size, dim, eps);
            return;
        }
    }
    crate::kernels::norm::batched_layer_norm_neon(x, weight, bias, batch_size, dim, eps);
}
/// RMS normalization with automatic backend choice: ANE path when the
/// heuristic approves, otherwise the batched NEON kernel.
pub fn rms_norm_auto(x: &mut [f32], weight: &[f32], batch_size: usize, dim: usize, eps: f32) {
    #[cfg(all(target_os = "macos", feature = "coreml"))]
    {
        if should_use_ane(batch_size, dim) {
            rms_norm_ane(x, weight, batch_size, dim, eps);
            return;
        }
    }
    crate::kernels::norm::batched_rms_norm_neon(x, weight, batch_size, dim, eps);
}
#[cfg(test)]
mod tests {
use super::*;
const EPSILON: f32 = 1e-4;
const LOOSE_EPSILON: f32 = 0.01;
fn approx_eq(a: f32, b: f32, eps: f32) -> bool {
(a - b).abs() < eps
}
#[test]
fn test_ane_availability() {
let _ = is_ane_available();
}
#[test]
fn test_ane_availability_consistency() {
let result1 = is_ane_available();
let result2 = is_ane_available();
let result3 = is_ane_available();
assert_eq!(result1, result2);
assert_eq!(result2, result3);
}
#[test]
fn test_should_use_ane_thresholds() {
assert!(!should_use_ane(1, 32));
assert!(!should_use_ane(1, 100));
assert!(!should_use_ane(100, 256));
if is_ane_available() {
assert!(should_use_ane(1, 128));
assert!(should_use_ane(32, 256));
assert!(should_use_ane(64, 4096));
}
}
#[test]
fn test_should_use_ane_boundary_conditions() {
assert!(!should_use_ane(0, 64)); assert!(!should_use_ane(1, 63)); assert!(!should_use_ane(65, 64));
assert!(!should_use_ane(1, 65)); assert!(!should_use_ane(1, 17));
if is_ane_available() {
assert!(should_use_ane(1, 64)); assert!(should_use_ane(64, 64)); assert!(should_use_ane(1, 80)); }
}
#[test]
fn test_should_use_ane_matmul_boundaries() {
assert!(!should_use_ane_matmul(0, 64, 64));
if is_ane_available() {
assert!(should_use_ane_matmul(1, 64, 64));
assert!(should_use_ane_matmul(32, 128, 256));
}
}
#[test]
fn test_should_use_ane_activation() {
assert!(!should_use_ane_activation(0, 64));
if is_ane_available() {
assert!(should_use_ane_activation(1, 64));
assert!(should_use_ane_activation(64, 256));
assert!(should_use_ane_activation(100, 128));
}
assert!(!should_use_ane_activation(10000, 10000));
}
#[test]
fn test_get_ane_recommendation() {
let rec_small = get_ane_recommendation(1, 256, 256);
let rec_large = get_ane_recommendation(1, 4096, 4096);
if is_ane_available() {
assert!(rec_small.use_ane);
assert!(rec_small.confidence > 0.5);
assert!(rec_small.expected_speedup > 1.0);
assert!(!rec_large.use_ane);
assert!(rec_large.confidence > 0.5);
assert!(rec_large.expected_speedup < 1.0);
} else {
assert!(!rec_small.use_ane);
assert!(!rec_large.use_ane);
assert_eq!(rec_small.expected_speedup, 1.0);
assert_eq!(rec_large.expected_speedup, 1.0);
}
}
#[test]
fn test_ane_recommendation_struct() {
let rec = AneRecommendation {
use_ane: true,
confidence: 0.9,
reason: "Test reason",
expected_speedup: 1.5,
};
let cloned = rec.clone();
assert_eq!(rec.use_ane, cloned.use_ane);
assert_eq!(rec.confidence, cloned.confidence);
assert_eq!(rec.reason, cloned.reason);
assert_eq!(rec.expected_speedup, cloned.expected_speedup);
let debug_str = format!("{:?}", rec);
assert!(debug_str.contains("use_ane"));
assert!(debug_str.contains("confidence"));
}
#[test]
fn test_gelu_scalar_correctness() {
let mut x = vec![0.0, 1.0, -1.0, 2.0];
let expected = vec![
0.0, 0.8412, -0.159, 1.954, ];
gelu_scalar(&mut x);
for (got, exp) in x.iter().zip(expected.iter()) {
assert!(
approx_eq(*got, *exp, LOOSE_EPSILON),
"GELU mismatch: got {}, expected {}",
got,
exp
);
}
}
#[test]
fn test_gelu_scalar_edge_cases() {
let mut empty: Vec<f32> = vec![];
gelu_scalar(&mut empty);
assert!(empty.is_empty());
let mut single = vec![0.5];
gelu_scalar(&mut single);
assert!(single[0].is_finite());
let mut large = vec![100.0];
gelu_scalar(&mut large);
assert!(large[0].is_finite());
assert!(large[0] > 99.0);
let mut small = vec![-100.0];
gelu_scalar(&mut small);
assert!(small[0].is_finite());
assert!(small[0].abs() < 0.1); }
#[test]
fn test_gelu_scalar_zero() {
let mut x = vec![0.0];
gelu_scalar(&mut x);
assert_eq!(x[0], 0.0);
}
#[test]
fn test_gelu_scalar_symmetry() {
let mut pos = vec![1.0];
let mut neg = vec![-1.0];
gelu_scalar(&mut pos);
gelu_scalar(&mut neg);
assert!(pos[0] > neg[0].abs());
}
#[test]
fn test_silu_scalar_correctness() {
let mut x = vec![0.0f32, 1.0, -1.0, 2.0];
let expected: Vec<f32> = vec![0.0f32, 1.0, -1.0, 2.0]
.iter()
.map(|&v: &f32| v / (1.0 + (-v).exp()))
.collect();
silu_scalar(&mut x);
for (got, exp) in x.iter().zip(expected.iter()) {
assert!(
approx_eq(*got, *exp, EPSILON),
"SiLU mismatch: got {}, expected {}",
got,
exp
);
}
}
#[test]
fn test_silu_scalar_edge_cases() {
let mut empty: Vec<f32> = vec![];
silu_scalar(&mut empty);
assert!(empty.is_empty());
let mut single = vec![0.5];
silu_scalar(&mut single);
assert!(single[0].is_finite());
let mut large_pos = vec![50.0];
silu_scalar(&mut large_pos);
assert!(large_pos[0].is_finite());
assert!(approx_eq(large_pos[0], 50.0, 0.001));
let mut large_neg = vec![-50.0];
silu_scalar(&mut large_neg);
assert!(large_neg[0].is_finite());
assert!(large_neg[0].abs() < 0.001); }
#[test]
fn test_silu_scalar_zero() {
let mut x = vec![0.0];
silu_scalar(&mut x);
assert_eq!(x[0], 0.0);
}
#[test]
fn test_silu_scalar_monotonicity() {
let mut values: Vec<f32> = (0..100).map(|i| i as f32 * 0.1).collect();
silu_scalar(&mut values);
for i in 1..values.len() {
assert!(
values[i] >= values[i - 1],
"SiLU should be monotonic for positive x: {} < {} at indices {}, {}",
values[i],
values[i - 1],
i,
i - 1
);
}
}
#[test]
fn test_softmax_scalar_correctness() {
let mut x = vec![1.0, 2.0, 3.0, 4.0];
softmax_scalar(&mut x);
let sum: f32 = x.iter().sum();
assert!(
approx_eq(sum, 1.0, EPSILON),
"Softmax sum should be 1.0, got {}",
sum
);
assert!(x.iter().all(|&v| v > 0.0));
for i in 1..x.len() {
assert!(x[i] > x[i - 1], "Softmax should preserve order");
}
}
#[test]
fn test_softmax_scalar_empty() {
let mut empty: Vec<f32> = vec![];
softmax_scalar(&mut empty);
assert!(empty.is_empty());
}
#[test]
fn test_softmax_scalar_single_element() {
let mut single = vec![5.0];
softmax_scalar(&mut single);
assert!(approx_eq(single[0], 1.0, EPSILON));
}
#[test]
fn test_softmax_scalar_uniform() {
let mut uniform = vec![1.0, 1.0, 1.0, 1.0];
softmax_scalar(&mut uniform);
let expected = 0.25;
for v in &uniform {
assert!(approx_eq(*v, expected, EPSILON));
}
}
#[test]
fn test_softmax_scalar_numerical_stability() {
let mut large = vec![1000.0, 1001.0, 1002.0];
softmax_scalar(&mut large);
let sum: f32 = large.iter().sum();
assert!(
approx_eq(sum, 1.0, EPSILON),
"Softmax should sum to 1 even with large inputs"
);
assert!(large.iter().all(|v| v.is_finite()));
}
#[test]
fn test_softmax_scalar_negative_values() {
let mut negative = vec![-1.0, -2.0, -3.0];
softmax_scalar(&mut negative);
let sum: f32 = negative.iter().sum();
assert!(approx_eq(sum, 1.0, EPSILON));
assert!(negative.iter().all(|&v| v > 0.0));
assert!(negative[0] > negative[1]);
assert!(negative[1] > negative[2]);
}
#[test]
fn test_softmax_scalar_extreme_difference() {
let mut extreme = vec![0.0, 0.0, 100.0];
softmax_scalar(&mut extreme);
assert!(extreme[2] > 0.99, "Dominant value should be close to 1.0");
assert!(extreme[0] < 0.01 && extreme[1] < 0.01);
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_matmul_ane_correctness() {
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let mut c = vec![0.0; 4];
matmul_ane(&a, &b, &mut c, 2, 2, 2);
assert!(approx_eq(c[0], 19.0, EPSILON));
assert!(approx_eq(c[1], 22.0, EPSILON));
assert!(approx_eq(c[2], 43.0, EPSILON));
assert!(approx_eq(c[3], 50.0, EPSILON));
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_matmul_ane_identity() {
let a = vec![1.0, 2.0, 3.0, 4.0];
let identity = vec![1.0, 0.0, 0.0, 1.0];
let mut c = vec![0.0; 4];
matmul_ane(&a, &identity, &mut c, 2, 2, 2);
for (got, exp) in c.iter().zip(a.iter()) {
assert!(approx_eq(*got, *exp, EPSILON));
}
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_matmul_ane_zero_matrix() {
let a = vec![1.0, 2.0, 3.0, 4.0];
let zero = vec![0.0; 4];
let mut c = vec![999.0; 4];
matmul_ane(&a, &zero, &mut c, 2, 2, 2);
for v in &c {
assert!(approx_eq(*v, 0.0, EPSILON));
}
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_matmul_ane_larger_matrices() {
let m = 8;
let k = 16;
let n = 8;
let a: Vec<f32> = (0..m * k).map(|i| (i % 10) as f32).collect();
let b: Vec<f32> = (0..k * n).map(|i| ((i + 1) % 10) as f32).collect();
let mut c = vec![0.0; m * n];
matmul_ane(&a, &b, &mut c, m, k, n);
assert!(c.iter().all(|v| v.is_finite()));
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_batched_matmul_ane() {
let batch_size = 2;
let m = 2;
let k = 2;
let n = 2;
let a = vec![
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ];
let b = vec![
1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, ];
let mut c = vec![0.0; batch_size * m * n];
batched_matmul_ane(&a, &b, &mut c, batch_size, m, k, n);
assert!(approx_eq(c[0], 1.0, EPSILON));
assert!(approx_eq(c[1], 2.0, EPSILON));
assert!(approx_eq(c[2], 3.0, EPSILON));
assert!(approx_eq(c[3], 4.0, EPSILON));
assert!(approx_eq(c[4], 10.0, EPSILON));
assert!(approx_eq(c[5], 12.0, EPSILON));
assert!(approx_eq(c[6], 14.0, EPSILON));
assert!(approx_eq(c[7], 16.0, EPSILON));
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_gelu_ane_matches_scalar() {
let dim = 64;
let batch_size = 4;
let mut x_ane: Vec<f32> = (0..batch_size * dim)
.map(|i| (i as f32) * 0.1 - 3.0)
.collect();
let mut x_scalar = x_ane.clone();
gelu_ane(&mut x_ane, batch_size, dim);
gelu_scalar(&mut x_scalar);
for i in 0..(batch_size * dim) {
assert!(
approx_eq(x_ane[i], x_scalar[i], LOOSE_EPSILON),
"GELU mismatch at {}: {} vs {}",
i,
x_ane[i],
x_scalar[i]
);
}
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_silu_ane_matches_scalar() {
let dim = 64;
let batch_size = 4;
let mut x_ane: Vec<f32> = (0..batch_size * dim)
.map(|i| (i as f32) * 0.1 - 3.0)
.collect();
let mut x_scalar = x_ane.clone();
silu_ane(&mut x_ane, batch_size, dim);
silu_scalar(&mut x_scalar);
for i in 0..(batch_size * dim) {
assert!(
approx_eq(x_ane[i], x_scalar[i], LOOSE_EPSILON),
"SiLU mismatch at {}: {} vs {}",
i,
x_ane[i],
x_scalar[i]
);
}
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_softmax_ane_matches_scalar() {
let dim = 64;
let batch_size = 4;
let mut x_ane: Vec<f32> = (0..batch_size * dim).map(|i| (i as f32) * 0.01).collect();
let mut x_scalar = x_ane.clone();
softmax_ane(&mut x_ane, batch_size, dim);
for chunk in x_scalar.chunks_mut(dim) {
softmax_scalar(chunk);
}
for i in 0..(batch_size * dim) {
assert!(
approx_eq(x_ane[i], x_scalar[i], LOOSE_EPSILON),
"Softmax mismatch at {}: {} vs {}",
i,
x_ane[i],
x_scalar[i]
);
}
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_layer_norm_ane() {
let dim = 16;
let batch_size = 2;
let mut x: Vec<f32> = (0..batch_size * dim).map(|i| (i as f32) * 0.1).collect();
let weight = vec![1.0; dim];
let bias = vec![0.0; dim];
layer_norm_ane(&mut x, &weight, &bias, batch_size, dim, 1e-6);
for b in 0..batch_size {
let offset = b * dim;
let mean: f32 = x[offset..offset + dim].iter().sum::<f32>() / dim as f32;
assert!(
mean.abs() < 1e-4,
"Batch {} mean should be ~0, got {}",
b,
mean
);
}
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_layer_norm_ane_with_weights() {
let dim = 8;
let batch_size = 1;
let mut x: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let weight = vec![2.0; dim]; let bias = vec![1.0; dim];
layer_norm_ane(&mut x, &weight, &bias, batch_size, dim, 1e-6);
let mean: f32 = x.iter().sum::<f32>() / dim as f32;
assert!(approx_eq(mean, 1.0, LOOSE_EPSILON));
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_rms_norm_ane() {
let dim = 16;
let batch_size = 2;
let mut x: Vec<f32> = (0..batch_size * dim)
.map(|i| (i as f32) * 0.1 + 0.1)
.collect();
let weight = vec![1.0; dim];
rms_norm_ane(&mut x, &weight, batch_size, dim, 1e-6);
assert!(x.iter().all(|v| v.is_finite()));
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_rms_norm_ane_constant_input() {
let dim = 8;
let batch_size = 1;
let mut x = vec![2.0; dim];
let weight = vec![1.0; dim];
rms_norm_ane(&mut x, &weight, batch_size, dim, 1e-6);
for v in &x {
assert!(approx_eq(*v, 1.0, LOOSE_EPSILON));
}
}
#[test]
fn test_auto_dispatch_functions() {
let dim = 64;
let batch_size = 2;
let mut x = vec![1.0f32; batch_size * dim];
gelu_auto(&mut x, batch_size, dim);
assert!(x.iter().all(|v| v.is_finite()));
let mut x = vec![1.0f32; batch_size * dim];
silu_auto(&mut x, batch_size, dim);
assert!(x.iter().all(|v| v.is_finite()));
let mut x = vec![1.0f32; batch_size * dim];
softmax_auto(&mut x, batch_size, dim);
let sum: f32 = x[0..dim].iter().sum();
assert!(approx_eq(sum, 1.0, EPSILON));
let mut x = vec![1.0f32; batch_size * dim];
let weight = vec![1.0f32; dim];
let bias = vec![0.0f32; dim];
layer_norm_auto(&mut x, &weight, &bias, batch_size, dim, 1e-6);
assert!(x.iter().all(|v| v.is_finite()));
let mut x = vec![1.0f32; batch_size * dim];
rms_norm_auto(&mut x, &weight, batch_size, dim, 1e-6);
assert!(x.iter().all(|v| v.is_finite()));
}
#[test]
fn test_auto_dispatch_small_dimensions() {
let dim = 32; let batch_size = 1;
let mut x = vec![1.0f32; batch_size * dim];
gelu_auto(&mut x, batch_size, dim);
assert!(x.iter().all(|v| v.is_finite()));
let mut x = vec![1.0f32; batch_size * dim];
silu_auto(&mut x, batch_size, dim);
assert!(x.iter().all(|v| v.is_finite()));
}
#[test]
fn test_auto_dispatch_large_batch() {
let dim = 128;
let batch_size = 100;
let mut x = vec![1.0f32; batch_size * dim];
gelu_auto(&mut x, batch_size, dim);
assert!(x.iter().all(|v| v.is_finite()));
}
#[test]
fn test_bnns_activation_function_values() {
assert_eq!(BNNSActivationFunction::Identity as i32, 0);
assert_eq!(BNNSActivationFunction::ReLU as i32, 1);
assert_eq!(BNNSActivationFunction::Sigmoid as i32, 3);
assert_eq!(BNNSActivationFunction::Softmax as i32, 6);
assert_eq!(BNNSActivationFunction::SiLU as i32, 50);
assert_eq!(BNNSActivationFunction::GELU as i32, 51);
}
#[test]
fn test_bnns_data_type_values() {
assert_eq!(BNNSDataType::Float16 as u32, 0x10010);
assert_eq!(BNNSDataType::Float32 as u32, 0x10020);
assert_eq!(BNNSDataType::Int8 as u32, 0x20008);
assert_eq!(BNNSDataType::Int16 as u32, 0x20010);
assert_eq!(BNNSDataType::Int32 as u32, 0x20020);
}
#[test]
fn test_bnns_activation_function_traits() {
let func = BNNSActivationFunction::GELU;
let cloned = func.clone();
let copied = func;
assert_eq!(func, cloned);
assert_eq!(func, copied);
let debug_str = format!("{:?}", func);
assert!(debug_str.contains("GELU"));
assert_eq!(BNNSActivationFunction::GELU, BNNSActivationFunction::GELU);
assert_ne!(BNNSActivationFunction::GELU, BNNSActivationFunction::SiLU);
}
#[test]
fn test_bnns_data_type_traits() {
let dtype = BNNSDataType::Float32;
let cloned = dtype.clone();
let copied = dtype;
assert_eq!(dtype, cloned);
assert_eq!(dtype, copied);
let debug_str = format!("{:?}", dtype);
assert!(debug_str.contains("Float32"));
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
fn test_bnns_nd_array_descriptor_default() {
let desc = BNNSNDArrayDescriptor::default();
assert_eq!(desc.flags, 0);
assert_eq!(desc.layout, 0);
assert_eq!(desc.size, [0; 8]);
assert_eq!(desc.stride, [0; 8]);
assert!(desc.data.is_null());
assert_eq!(desc.data_type, BNNSDataType::Float32);
assert!(desc.table_data.is_null());
assert_eq!(desc.table_data_type, BNNSDataType::Float32);
assert_eq!(desc.data_scale, 1.0);
assert_eq!(desc.data_bias, 0.0);
}
#[test]
fn test_gelu_precision_near_zero() {
let mut x: Vec<f32> = (-10..=10).map(|i| i as f32 * 0.01).collect();
gelu_scalar(&mut x);
for i in 1..x.len() - 1 {
let diff1 = x[i] - x[i - 1];
let diff2 = x[i + 1] - x[i];
assert!(
(diff1 - diff2).abs() < 0.1,
"Discontinuity detected at index {}",
i
);
}
}
#[test]
fn test_silu_precision_near_zero() {
let mut x: Vec<f32> = (-10..=10).map(|i| i as f32 * 0.01).collect();
silu_scalar(&mut x);
assert!(x.iter().all(|v| v.is_finite()));
for i in 11..x.len() {
assert!(x[i] >= x[i - 1], "SiLU should be monotonic for positive x");
}
}
#[test]
fn test_softmax_precision_extreme_values() {
let mut x = vec![-1000.0, 0.0, 1000.0];
softmax_scalar(&mut x);
assert!(x.iter().all(|v| v.is_finite()));
let sum: f32 = x.iter().sum();
assert!(approx_eq(sum, 1.0, EPSILON));
assert!(x[2] > 0.99);
}
#[test]
fn test_ane_availability_thread_safe() {
use std::sync::Arc;
use std::thread;
let results: Vec<_> = (0..4)
.map(|_| thread::spawn(|| is_ane_available()))
.collect();
let first = results.into_iter().next().unwrap().join().unwrap();
for _ in 0..3 {
assert_eq!(is_ane_available(), first);
}
}
#[test]
fn test_scalar_operations_concurrent() {
use std::thread;
let handles: Vec<_> = (0..4)
.map(|i| {
thread::spawn(move || {
let mut data: Vec<f32> = (0..64).map(|j| (i * 64 + j) as f32 * 0.1).collect();
gelu_scalar(&mut data);
data.iter().all(|v| v.is_finite())
})
})
.collect();
for handle in handles {
assert!(handle.join().unwrap());
}
}
#[test]
#[ignore] fn test_activation_performance() {
use std::time::Instant;
let dim = 4096;
let batch_size = 32;
let iterations = 100;
let mut data: Vec<f32> = (0..batch_size * dim)
.map(|i| (i as f32) * 0.001 - 1.0)
.collect();
let start = Instant::now();
for _ in 0..iterations {
gelu_scalar(&mut data);
}
let gelu_time = start.elapsed();
for (i, v) in data.iter_mut().enumerate() {
*v = (i as f32) * 0.001 - 1.0;
}
let start = Instant::now();
for _ in 0..iterations {
silu_scalar(&mut data);
}
let silu_time = start.elapsed();
println!(
"GELU: {:?} per iteration, SiLU: {:?} per iteration",
gelu_time / iterations as u32,
silu_time / iterations as u32
);
}
#[cfg(all(target_os = "macos", feature = "coreml"))]
#[test]
#[ignore] fn test_ane_vs_scalar_performance() {
use std::time::Instant;
let dim = 4096;
let batch_size = 32;
let iterations = 100;
let mut data_scalar: Vec<f32> = (0..batch_size * dim)
.map(|i| (i as f32) * 0.001 - 1.0)
.collect();
let start = Instant::now();
for _ in 0..iterations {
gelu_scalar(&mut data_scalar);
}
let scalar_time = start.elapsed();
let mut data_ane: Vec<f32> = (0..batch_size * dim)
.map(|i| (i as f32) * 0.001 - 1.0)
.collect();
let start = Instant::now();
for _ in 0..iterations {
gelu_ane(&mut data_ane, batch_size, dim);
}
let ane_time = start.elapsed();
println!(
"Scalar GELU: {:?} total, ANE GELU: {:?} total, speedup: {:.2}x",
scalar_time,
ane_time,
scalar_time.as_secs_f64() / ane_time.as_secs_f64()
);
}
}