//! SIMD kernels for transformer inference on Apple Silicon: attention,
//! matrix multiplication, normalization, quantized ops, RoPE, and
//! activation functions, with NEON fast paths plus optional Accelerate
//! (BLAS) and Apple Neural Engine (ANE) backends.

pub mod activations;
pub mod attention;
pub mod matmul;
pub mod norm;
pub mod quantized;
pub mod rope;

/// Accelerate-framework bindings; built on macOS (and for docs builds, so
/// the API appears in rendered documentation).
#[cfg(any(target_os = "macos", doc))]
pub mod accelerate;

/// Apple Neural Engine dispatch; built on every target so the availability
/// probe can always be queried (see the fallback re-export below).
pub mod ane_ops;
pub use activations::{
    batch_gelu, batch_silu, batch_softmax, gelu, gelu_exact, gelu_vec, leaky_relu, relu, relu_vec,
    silu, silu_vec, softmax, softmax_temperature, softmax_vec,
};
pub use attention::{
    flash_attention_auto, flash_attention_into, flash_attention_neon, flash_attention_v2,
    flash_attention_with_scratch, grouped_query_attention_neon, multi_query_attention_neon,
    paged_attention_neon, select_block_size, AttentionScratch, PagedKvCache, BLOCK_SIZE_LARGE,
    BLOCK_SIZE_MEDIUM, BLOCK_SIZE_SMALL,
};
#[cfg(not(target_arch = "wasm32"))]
pub use attention::THREAD_LOCAL_SCRATCH;
#[cfg(all(feature = "parallel", not(target_arch = "wasm32")))]
pub use attention::{
    grouped_query_attention_parallel, multi_head_attention_parallel,
    multi_query_attention_parallel,
};
pub use matmul::{batched_gemm_neon, gemm_neon, gemv_neon};
#[cfg(all(feature = "parallel", not(target_arch = "wasm32")))]
pub use matmul::{
    batched_gemm_parallel, configure_thread_pool, gemm_parallel, gemv_parallel, get_physical_cores,
};
pub use norm::{layer_norm_neon, rms_norm_neon};
pub use quantized::{
    dequantize_int4, dequantize_int8, int4_gemv_neon, int8_gemv_neon, q4k_gemv_neon,
    quantize_to_int4, quantize_to_int8, quantize_to_q4k, BlockQ4K, QuantizedInt4, QuantizedInt8,
    INT4_BLOCK_SIZE, Q4K_SUPER_BLOCK_SIZE,
};
pub use rope::{apply_rope_neon, precompute_rope_tables, RopeConfig};
#[cfg(all(target_os = "macos", feature = "accelerate"))]
pub use accelerate::{
    axpy_accelerate, dot_accelerate, gemm_accelerate, gemv_accelerate, gemv_scaled_accelerate,
    gemv_transpose_accelerate, is_accelerate_available, scal_accelerate, should_use_accelerate,
    MatrixLayout,
};
/// Fallback probe for builds without the Accelerate backend (non-macOS
/// targets, or the `accelerate` feature disabled): always reports `false`.
#[cfg(not(all(target_os = "macos", feature = "accelerate")))]
#[inline(always)]
pub fn is_accelerate_available() -> bool {
    false
}
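// Typical caller-side dispatch, sketched below. Illustrative only: it assumes
// the `gemm_*` kernels share a signature and that `should_use_accelerate`
// takes the problem shape; neither is guaranteed by this module.
//
//     if is_accelerate_available() && should_use_accelerate(m, n, k) {
//         gemm_accelerate(a, b, &mut c, m, n, k);
//     } else {
//         gemm_neon(a, b, &mut c, m, n, k);
//     }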
#[cfg(all(target_os = "macos", feature = "coreml"))]
pub use ane_ops::{
    batched_matmul_ane, gelu_ane, gelu_auto, get_ane_recommendation, is_ane_available,
    layer_norm_ane, layer_norm_auto, matmul_ane, matmul_auto, rms_norm_ane, rms_norm_auto,
    should_use_ane, should_use_ane_activation, should_use_ane_matmul, silu_ane, silu_auto,
    softmax_ane, softmax_auto, AneRecommendation,
};
// `ane_ops` compiles everywhere; without CoreML it still exposes the
// availability probe so callers can query it unconditionally.
#[cfg(not(all(target_os = "macos", feature = "coreml")))]
pub use ane_ops::is_ane_available;
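// Hedged sketch of gating work onto the ANE; `matmul_ane`'s signature is
// assumed here, and `should_use_ane_matmul` may already perform the
// availability check itself:
//
//     if is_ane_available() && should_use_ane_matmul(m, n, k) {
//         matmul_ane(&a, &b, &mut c, m, n, k);
//     } else {
//         gemm_neon(a, b, &mut c, m, n, k); // CPU fallback
//     }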
/// Number of f32 lanes in a 128-bit NEON vector register.
pub const NEON_LANE_WIDTH: usize = 4;
/// Loop-unroll factor for the SIMD kernels.
pub const UNROLL_FACTOR: usize = 4;
/// Look-ahead distance for software prefetching.
pub const PREFETCH_DISTANCE: usize = 64;
/// Returns `true` when NEON SIMD is usable. NEON is a mandatory feature of
/// AArch64, so this resolves at compile time.
#[inline(always)]
pub fn is_neon_available() -> bool {
    cfg!(target_arch = "aarch64")
}
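// Because the result is a compile-time constant, callers can branch on it and
// let the dead arm be optimized away. A sketch, with `rms_norm_neon`'s
// signature assumed and a hypothetical scalar fallback:
//
//     if is_neon_available() {
//         rms_norm_neon(&mut x, &weight, 1e-5);
//     } else {
//         rms_norm_scalar(&mut x, &weight, 1e-5); // hypothetical
//     }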
/// Shape and masking parameters shared by the attention kernels.
#[derive(Debug, Clone, Copy)]
pub struct AttentionConfig {
    /// Number of query heads.
    pub num_heads: usize,
    /// Number of key/value heads; fewer than `num_heads` enables GQA/MQA.
    pub num_kv_heads: usize,
    /// Dimension of each attention head.
    pub head_dim: usize,
    /// Maximum supported sequence length.
    pub max_seq_len: usize,
    /// Whether to apply a causal (lower-triangular) mask.
    pub causal: bool,
    /// Softmax scale; `0.0` is a sentinel for the default `1/sqrt(head_dim)`.
    pub scale: f32,
}
impl Default for AttentionConfig {
    fn default() -> Self {
        Self {
            num_heads: 32,
            num_kv_heads: 8,
            head_dim: 128,
            max_seq_len: 4096,
            causal: true,
            scale: 0.0, // sentinel: resolved by `effective_scale()`
        }
    }
}
impl AttentionConfig {
    /// The scale actually applied to attention logits: `self.scale`, or
    /// `1/sqrt(head_dim)` when `scale` is the `0.0` sentinel.
    #[inline(always)]
    pub fn effective_scale(&self) -> f32 {
        if self.scale == 0.0 {
            1.0 / (self.head_dim as f32).sqrt()
        } else {
            self.scale
        }
    }

    /// Number of query heads sharing each KV head (1 = MHA; `num_heads` = MQA).
    #[inline(always)]
    pub fn gqa_ratio(&self) -> usize {
        debug_assert!(
            self.num_kv_heads > 0 && self.num_heads % self.num_kv_heads == 0,
            "num_heads must be a positive multiple of num_kv_heads"
        );
        self.num_heads / self.num_kv_heads
    }
}
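// Worked example: the defaults describe a 32-head / 8-KV-head layout, so four
// query heads share each KV head and the scale resolves to 1/sqrt(128):
//
//     let cfg = AttentionConfig::default();
//     assert_eq!(cfg.gqa_ratio(), 4);
//     assert!((cfg.effective_scale() - 0.0883883).abs() < 1e-6);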
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_attention_config() {
        let config = AttentionConfig::default();
        assert_eq!(config.gqa_ratio(), 4);
        assert!((config.effective_scale() - 0.088388).abs() < 0.001);
    }

    #[test]
    fn test_neon_available() {
        #[cfg(target_arch = "aarch64")]
        assert!(is_neon_available());
        #[cfg(not(target_arch = "aarch64"))]
        assert!(!is_neon_available());
    }
}