use crate::cpu::platform_optimization::{ArmMicroarchitecture, CpuFeatures};
use crate::error::BackendResult;
use torsh_core::error::TorshError;
use std::sync::OnceLock;
#[cfg(not(feature = "std"))]
use alloc::{boxed::Box, string::String, vec::Vec};
#[derive(Debug)]
pub struct EnhancedARM64Optimizer {
microarch: ArmMicroarchitecture,
features: CpuFeatures,
optimization_params: ARM64OptimizationParameters,
vector_unit_config: ARM64VectorUnitConfig,
cache_config: ARM64CacheConfiguration,
apple_silicon_config: Option<AppleSiliconConfig>,
execution_units: ARM64ExecutionUnitInfo,
}
#[derive(Debug, Clone)]
pub struct ARM64OptimizationParameters {
pub loop_unroll_factor: usize,
pub preferred_vector_width: usize,
pub sve_vector_length: Option<usize>,
pub max_memory_bandwidth_utilization: f64,
pub scheduling_window_size: usize,
pub branch_prediction_accuracy: f64,
pub cache_blocking_factors: ARM64CacheBlockingFactors,
pub parallel_params: ARM64ParallelExecutionParams,
pub apple_silicon_params: Option<AppleSiliconOptimizationParams>,
}
#[derive(Debug, Clone)]
pub struct ARM64CacheBlockingFactors {
pub l1_block_size: usize,
pub l2_block_size: usize,
pub l3_block_size: usize, pub tlb_block_size: usize,
pub system_cache_block_size: Option<usize>, }
#[derive(Debug, Clone)]
pub struct ARM64ParallelExecutionParams {
pub optimal_thread_count: usize,
pub performance_core_count: usize,
pub efficiency_core_count: usize,
pub work_stealing_threshold: usize,
pub chunk_size_multiplier: f64,
pub load_balancing_strategy: ARM64LoadBalancingStrategy,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ARM64LoadBalancingStrategy {
Uniform,
PerformancePreferred,
EfficiencyPreferred,
AdaptivePowerThermal,
NumaAware,
}
#[derive(Debug, Clone)]
pub struct ARM64VectorUnitConfig {
pub neon_available: bool,
pub sve_available: bool,
pub sve2_available: bool,
pub neon_register_count: usize,
pub sve_register_count: usize,
pub vector_lane_width: usize,
pub optimal_vector_size: usize,
pub crypto_available: bool,
pub fp16_available: bool,
pub bf16_available: bool,
pub i8mm_available: bool,
pub matrix_multiply_available: bool, }
#[derive(Debug, Clone)]
pub struct ARM64CacheConfiguration {
pub l1i_size: usize,
pub l1d_size: usize,
pub l2_size: usize,
pub l3_size: Option<usize>, pub system_cache_size: Option<usize>, pub l1_associativity: usize,
pub l2_associativity: usize,
pub l3_associativity: Option<usize>,
pub cache_line_size: usize,
pub tlb_entries: usize,
pub prefetch_distance: usize,
pub cache_coherency_level: CacheCoherencyLevel,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheCoherencyLevel {
PoC,
PoU,
SystemCache,
}
#[derive(Debug, Clone)]
pub struct AppleSiliconConfig {
pub neural_engine_available: bool,
pub neural_engine_ops_per_second: f64,
pub media_engine_available: bool,
pub accelerated_ml_compute: bool,
pub unified_memory_bandwidth: f64, pub performance_controller_available: bool,
pub thermal_management_advanced: bool,
pub memory_compression_available: bool,
pub secure_enclave_available: bool,
pub system_cache_size: usize,
}
#[derive(Debug, Clone)]
pub struct AppleSiliconOptimizationParams {
pub prefer_neural_engine_threshold: usize, pub use_media_engine_for_conv: bool,
pub optimize_for_unified_memory: bool,
pub thermal_throttling_mitigation: bool,
pub power_efficiency_mode: PowerEfficiencyMode,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PowerEfficiencyMode {
MaxPerformance,
Balanced,
MaxEfficiency,
Adaptive,
}
#[derive(Debug, Clone)]
pub struct ARM64ExecutionUnitInfo {
pub integer_units: usize,
pub fp_units: usize,
pub neon_units: usize,
pub sve_units: usize,
pub load_units: usize,
pub store_units: usize,
pub branch_units: usize,
pub crypto_units: usize,
pub issue_width: usize,
pub retire_width: usize,
pub out_of_order_window: usize,
}
impl EnhancedARM64Optimizer {
pub fn new() -> BackendResult<Self> {
let microarch = Self::detect_microarchitecture()?;
let features = Self::detect_cpu_features()?;
let optimization_params = Self::get_optimization_parameters(µarch, &features);
let vector_unit_config = Self::get_vector_unit_config(µarch, &features);
let cache_config = Self::detect_cache_configuration(µarch)?;
let apple_silicon_config = Self::detect_apple_silicon_config(µarch);
let execution_units = Self::get_execution_unit_info(µarch);
Ok(Self {
microarch,
features,
optimization_params,
vector_unit_config,
cache_config,
apple_silicon_config,
execution_units,
})
}
fn detect_microarchitecture() -> BackendResult<ArmMicroarchitecture> {
#[cfg(target_arch = "aarch64")]
{
if Self::is_apple_silicon() {
return Ok(Self::detect_apple_microarch());
}
#[cfg(target_os = "linux")]
{
if let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") {
return Ok(Self::parse_arm_cpuinfo(&cpuinfo));
}
}
}
Ok(ArmMicroarchitecture::Unknown)
}
#[cfg(target_arch = "aarch64")]
fn is_apple_silicon() -> bool {
#[cfg(target_os = "macos")]
{
use std::process::Command;
if let Ok(output) = Command::new("sysctl")
.arg("-n")
.arg("machdep.cpu.brand_string")
.output()
{
let brand = String::from_utf8_lossy(&output.stdout);
return brand.contains("Apple");
}
}
false
}
#[cfg(target_arch = "aarch64")]
fn detect_apple_microarch() -> ArmMicroarchitecture {
#[cfg(target_os = "macos")]
{
use std::process::Command;
if let Ok(output) = Command::new("sysctl")
.arg("-n")
.arg("machdep.cpu.brand_string")
.output()
{
let brand = String::from_utf8_lossy(&output.stdout);
if brand.contains("M1") {
return ArmMicroarchitecture::M1;
} else if brand.contains("M2") {
return ArmMicroarchitecture::M2;
} else if brand.contains("M3") {
return ArmMicroarchitecture::M3;
}
}
if let Ok(output) = Command::new("sysctl")
.arg("-n")
.arg("hw.ncpu")
.output()
{
let core_count_str = String::from_utf8_lossy(&output.stdout);
if let Ok(core_count) = core_count_str.trim().parse::<u32>() {
match core_count {
8 => ArmMicroarchitecture::M1, 10 => ArmMicroarchitecture::M2, 11..=16 => ArmMicroarchitecture::M3, _ => ArmMicroarchitecture::M1, }
} else {
ArmMicroarchitecture::M1
}
} else {
ArmMicroarchitecture::M1
}
}
#[cfg(not(target_os = "macos"))]
ArmMicroarchitecture::Unknown
}
#[cfg(target_os = "linux")]
fn parse_arm_cpuinfo(cpuinfo: &str) -> ArmMicroarchitecture {
for line in cpuinfo.lines() {
if line.starts_with("CPU part") {
if let Some(part) = line.split(':').nth(1) {
let part = part.trim();
match part {
"0xd03" => return ArmMicroarchitecture::CortexA53,
"0xd05" => return ArmMicroarchitecture::CortexA55,
"0xd07" => return ArmMicroarchitecture::CortexA57,
"0xd08" => return ArmMicroarchitecture::CortexA72,
"0xd09" => return ArmMicroarchitecture::CortexA73,
"0xd0a" => return ArmMicroarchitecture::CortexA75,
"0xd0b" => return ArmMicroarchitecture::CortexA76,
"0xd0c" => return ArmMicroarchitecture::CortexA77,
"0xd0d" => return ArmMicroarchitecture::CortexA78,
"0xd44" => return ArmMicroarchitecture::CortexX1,
"0xd46" => return ArmMicroarchitecture::CortexA510,
"0xd47" => return ArmMicroarchitecture::CortexA710,
"0xd48" => return ArmMicroarchitecture::CortexX2,
_ => {}
}
}
}
}
ArmMicroarchitecture::Unknown
}
fn detect_cpu_features() -> BackendResult<CpuFeatures> {
let mut features = CpuFeatures::default();
#[cfg(target_arch = "aarch64")]
{
features.neon = true; features.fp = true; features.asimd = true;
#[cfg(target_os = "linux")]
{
if let Ok(cpuinfo) = std::fs::read_to_string("/proc/cpuinfo") {
Self::parse_arm_features(&mut features, &cpuinfo);
}
}
#[cfg(target_os = "macos")]
{
Self::detect_macos_arm_features(&mut features);
}
}
Ok(features)
}
#[cfg(target_os = "linux")]
fn parse_arm_features(features: &mut CpuFeatures, cpuinfo: &str) {
for line in cpuinfo.lines() {
if line.starts_with("Features") {
if let Some(feature_list) = line.split(':').nth(1) {
for feature in feature_list.split_whitespace() {
match feature {
"aes" => features.aes_arm = true,
"pmull" => features.pmull = true,
"sha1" => features.sha1 = true,
"sha2" => features.sha256 = true,
"crc32" => features.crc32 = true,
"atomics" => features.atomics = true,
"fphp" => features.fphp = true,
"asimdhp" => features.asimdhp = true,
"cpuid" => features.cpuid = true,
"asimdrdm" => features.asimdrdm = true,
"jscvt" => features.jscvt = true,
"fcma" => features.fcma = true,
"lrcpc" => features.lrcpc = true,
"dcpop" => features.dcpop = true,
"sha3" => features.sha3 = true,
"sm3" => features.sm3 = true,
"sm4" => features.sm4 = true,
"asimddp" => features.asimddp = true,
"sha512" => features.sha512 = true,
"sve" => features.sve = true,
"sve2" => features.sve2 = true,
"sveaes" => features.sveaes = true,
"svepmull" => features.svepmull = true,
"svebitperm" => features.svebitperm = true,
"svesha3" => features.svesha3 = true,
"svesm4" => features.svesm4 = true,
"flagm" => features.flagm = true,
"ssbs" => features.ssbs = true,
"sb" => features.sb = true,
"paca" => features.paca = true,
"pacg" => features.pacg = true,
"dgh" => features.dgh = true,
"bf16" => features.bf16 = true,
"i8mm" => features.i8mm = true,
"rng" => features.rng = true,
"bti" => features.bti = true,
"mte" => features.mte = true,
_ => {}
}
}
}
}
}
}
#[cfg(target_os = "macos")]
fn detect_macos_arm_features(features: &mut CpuFeatures) {
use std::process::Command;
features.aes_arm = true;
features.pmull = true;
features.sha1 = true;
features.sha256 = true;
features.crc32 = true;
features.atomics = true;
features.fphp = true;
features.asimdhp = true;
features.asimdrdm = true;
features.jscvt = true;
features.fcma = true;
features.lrcpc = true;
features.dcpop = true;
features.sha3 = true;
features.asimddp = true;
features.sha512 = true;
features.flagm = true;
features.ssbs = true;
features.sb = true;
features.dgh = true;
if let Ok(output) = Command::new("sysctl")
.arg("-n")
.arg("machdep.cpu.brand_string")
.output()
{
let brand = String::from_utf8_lossy(&output.stdout);
if brand.contains("M2") || brand.contains("M3") {
features.bf16 = true;
features.i8mm = true;
}
}
}
fn get_optimization_parameters(microarch: &ArmMicroarchitecture, features: &CpuFeatures) -> ARM64OptimizationParameters {
let (loop_unroll, vector_width, sve_length, memory_bw, sched_window, branch_acc) = match microarch {
ArmMicroarchitecture::M1 => {
(8, 128, None, 0.95, 256, 0.98) }
ArmMicroarchitecture::M2 => {
(12, 128, None, 0.96, 288, 0.985) }
ArmMicroarchitecture::M3 => {
(16, 128, None, 0.97, 320, 0.99) }
ArmMicroarchitecture::CortexA78 | ArmMicroarchitecture::CortexX1 => {
(8, 128, if features.sve { Some(256) } else { None }, 0.88, 224, 0.95)
}
ArmMicroarchitecture::CortexA710 | ArmMicroarchitecture::CortexX2 => {
(12, 128, if features.sve { Some(512) } else { None }, 0.90, 256, 0.96)
}
ArmMicroarchitecture::NeoverseV1 => {
(16, 128, Some(2048), 0.92, 288, 0.97) }
_ => (4, 128, None, 0.85, 192, 0.93),
};
let cache_blocking = match microarch {
ArmMicroarchitecture::M1 | ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3 => {
ARM64CacheBlockingFactors {
l1_block_size: 48 * 1024, l2_block_size: 3 * 1024 * 1024, l3_block_size: 0, tlb_block_size: 16 * 1024 * 1024, system_cache_block_size: Some(24 * 1024 * 1024), }
}
ArmMicroarchitecture::CortexA78..=ArmMicroarchitecture::CortexX3 => {
ARM64CacheBlockingFactors {
l1_block_size: 48 * 1024, l2_block_size: 384 * 1024, l3_block_size: 2 * 1024 * 1024, tlb_block_size: 2 * 1024 * 1024,
system_cache_block_size: None,
}
}
ArmMicroarchitecture::NeoverseV1 | ArmMicroarchitecture::NeoverseN1 | ArmMicroarchitecture::NeoverseN2 => {
ARM64CacheBlockingFactors {
l1_block_size: 48 * 1024, l2_block_size: 768 * 1024, l3_block_size: 16 * 1024 * 1024, tlb_block_size: 2 * 1024 * 1024,
system_cache_block_size: None,
}
}
_ => {
ARM64CacheBlockingFactors {
l1_block_size: 24 * 1024,
l2_block_size: 192 * 1024,
l3_block_size: 1024 * 1024,
tlb_block_size: 2 * 1024 * 1024,
system_cache_block_size: None,
}
}
};
let (perf_cores, eff_cores) = match microarch {
ArmMicroarchitecture::M1 => (4, 4),
ArmMicroarchitecture::M2 => (4, 4),
ArmMicroarchitecture::M3 => (6, 6), _ => {
let total_cores = num_cpus::get();
(total_cores, 0) }
};
let parallel_params = ARM64ParallelExecutionParams {
optimal_thread_count: perf_cores + eff_cores,
performance_core_count: perf_cores,
efficiency_core_count: eff_cores,
work_stealing_threshold: 1000,
chunk_size_multiplier: 1.2, load_balancing_strategy: if eff_cores > 0 {
ARM64LoadBalancingStrategy::PerformancePreferred
} else {
ARM64LoadBalancingStrategy::Uniform
},
};
let apple_silicon_params = if matches!(microarch, ArmMicroarchitecture::M1 | ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3) {
Some(AppleSiliconOptimizationParams {
prefer_neural_engine_threshold: 10000,
use_media_engine_for_conv: true,
optimize_for_unified_memory: true,
thermal_throttling_mitigation: true,
power_efficiency_mode: PowerEfficiencyMode::Balanced,
})
} else {
None
};
ARM64OptimizationParameters {
loop_unroll_factor: loop_unroll,
preferred_vector_width: vector_width,
sve_vector_length: sve_length,
max_memory_bandwidth_utilization: memory_bw,
scheduling_window_size: sched_window,
branch_prediction_accuracy: branch_acc,
cache_blocking_factors: cache_blocking,
parallel_params,
apple_silicon_params,
}
}
fn get_vector_unit_config(microarch: &ArmMicroarchitecture, features: &CpuFeatures) -> ARM64VectorUnitConfig {
match microarch {
ArmMicroarchitecture::M1 | ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3 => {
ARM64VectorUnitConfig {
neon_available: true,
sve_available: false, sve2_available: false,
neon_register_count: 32,
sve_register_count: 0,
vector_lane_width: 128,
optimal_vector_size: 128,
crypto_available: true,
fp16_available: true,
bf16_available: matches!(microarch, ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3),
i8mm_available: matches!(microarch, ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3),
matrix_multiply_available: false, }
}
ArmMicroarchitecture::CortexA78..=ArmMicroarchitecture::CortexX3 => {
ARM64VectorUnitConfig {
neon_available: true,
sve_available: features.sve,
sve2_available: features.sve2,
neon_register_count: 32,
sve_register_count: if features.sve { 32 } else { 0 },
vector_lane_width: 128,
optimal_vector_size: if features.sve { 256 } else { 128 },
crypto_available: features.aes_arm && features.sha256,
fp16_available: features.fphp,
bf16_available: features.bf16,
i8mm_available: features.i8mm,
matrix_multiply_available: false,
}
}
ArmMicroarchitecture::NeoverseV1 | ArmMicroarchitecture::NeoverseN2 => {
ARM64VectorUnitConfig {
neon_available: true,
sve_available: true,
sve2_available: features.sve2,
neon_register_count: 32,
sve_register_count: 32,
vector_lane_width: 128,
optimal_vector_size: 512, crypto_available: true,
fp16_available: true,
bf16_available: features.bf16,
i8mm_available: features.i8mm,
matrix_multiply_available: false,
}
}
_ => {
ARM64VectorUnitConfig {
neon_available: true,
sve_available: features.sve,
sve2_available: features.sve2,
neon_register_count: 32,
sve_register_count: if features.sve { 32 } else { 0 },
vector_lane_width: 128,
optimal_vector_size: 128,
crypto_available: features.aes_arm,
fp16_available: features.fphp,
bf16_available: features.bf16,
i8mm_available: features.i8mm,
matrix_multiply_available: false,
}
}
}
}
fn detect_cache_configuration(microarch: &ArmMicroarchitecture) -> BackendResult<ARM64CacheConfiguration> {
let config = match microarch {
ArmMicroarchitecture::M1 => {
ARM64CacheConfiguration {
l1i_size: 128 * 1024,
l1d_size: 64 * 1024,
l2_size: 4 * 1024 * 1024,
l3_size: None,
system_cache_size: Some(32 * 1024 * 1024),
l1_associativity: 8,
l2_associativity: 12,
l3_associativity: None,
cache_line_size: 64,
tlb_entries: 128,
prefetch_distance: 256,
cache_coherency_level: CacheCoherencyLevel::SystemCache,
}
}
ArmMicroarchitecture::M2 => {
ARM64CacheConfiguration {
l1i_size: 128 * 1024,
l1d_size: 64 * 1024,
l2_size: 4 * 1024 * 1024,
l3_size: None,
system_cache_size: Some(32 * 1024 * 1024),
l1_associativity: 8,
l2_associativity: 12,
l3_associativity: None,
cache_line_size: 64,
tlb_entries: 128,
prefetch_distance: 384,
cache_coherency_level: CacheCoherencyLevel::SystemCache,
}
}
ArmMicroarchitecture::M3 => {
ARM64CacheConfiguration {
l1i_size: 128 * 1024,
l1d_size: 64 * 1024,
l2_size: 6 * 1024 * 1024, l3_size: None,
system_cache_size: Some(48 * 1024 * 1024),
l1_associativity: 8,
l2_associativity: 12,
l3_associativity: None,
cache_line_size: 64,
tlb_entries: 256,
prefetch_distance: 512,
cache_coherency_level: CacheCoherencyLevel::SystemCache,
}
}
ArmMicroarchitecture::CortexA78 | ArmMicroarchitecture::CortexX1 => {
ARM64CacheConfiguration {
l1i_size: 64 * 1024,
l1d_size: 64 * 1024,
l2_size: 512 * 1024,
l3_size: Some(2 * 1024 * 1024),
system_cache_size: None,
l1_associativity: 4,
l2_associativity: 8,
l3_associativity: Some(16),
cache_line_size: 64,
tlb_entries: 64,
prefetch_distance: 128,
cache_coherency_level: CacheCoherencyLevel::PoC,
}
}
ArmMicroarchitecture::NeoverseV1 => {
ARM64CacheConfiguration {
l1i_size: 64 * 1024,
l1d_size: 64 * 1024,
l2_size: 1024 * 1024,
l3_size: Some(32 * 1024 * 1024), system_cache_size: None,
l1_associativity: 4,
l2_associativity: 8,
l3_associativity: Some(16),
cache_line_size: 64,
tlb_entries: 128,
prefetch_distance: 256,
cache_coherency_level: CacheCoherencyLevel::PoC,
}
}
_ => {
ARM64CacheConfiguration {
l1i_size: 32 * 1024,
l1d_size: 32 * 1024,
l2_size: 256 * 1024,
l3_size: Some(1024 * 1024),
system_cache_size: None,
l1_associativity: 4,
l2_associativity: 8,
l3_associativity: Some(8),
cache_line_size: 64,
tlb_entries: 64,
prefetch_distance: 64,
cache_coherency_level: CacheCoherencyLevel::PoC,
}
}
};
Ok(config)
}
fn detect_apple_silicon_config(microarch: &ArmMicroarchitecture) -> Option<AppleSiliconConfig> {
match microarch {
ArmMicroarchitecture::M1 => {
Some(AppleSiliconConfig {
neural_engine_available: true,
neural_engine_ops_per_second: 11.5e12, media_engine_available: true,
accelerated_ml_compute: true,
unified_memory_bandwidth: 68.25, performance_controller_available: true,
thermal_management_advanced: true,
memory_compression_available: true,
secure_enclave_available: true,
system_cache_size: 32 * 1024 * 1024,
})
}
ArmMicroarchitecture::M2 => {
Some(AppleSiliconConfig {
neural_engine_available: true,
neural_engine_ops_per_second: 15.8e12, media_engine_available: true,
accelerated_ml_compute: true,
unified_memory_bandwidth: 100.0, performance_controller_available: true,
thermal_management_advanced: true,
memory_compression_available: true,
secure_enclave_available: true,
system_cache_size: 32 * 1024 * 1024,
})
}
ArmMicroarchitecture::M3 => {
Some(AppleSiliconConfig {
neural_engine_available: true,
neural_engine_ops_per_second: 18.0e12, media_engine_available: true,
accelerated_ml_compute: true,
unified_memory_bandwidth: 150.0, performance_controller_available: true,
thermal_management_advanced: true,
memory_compression_available: true,
secure_enclave_available: true,
system_cache_size: 48 * 1024 * 1024,
})
}
_ => None,
}
}
fn get_execution_unit_info(microarch: &ArmMicroarchitecture) -> ARM64ExecutionUnitInfo {
match microarch {
ArmMicroarchitecture::M1 => {
ARM64ExecutionUnitInfo {
integer_units: 6,
fp_units: 4,
neon_units: 4,
sve_units: 0,
load_units: 3,
store_units: 2,
branch_units: 2,
crypto_units: 2,
issue_width: 8,
retire_width: 8,
out_of_order_window: 630, }
}
ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3 => {
ARM64ExecutionUnitInfo {
integer_units: 6,
fp_units: 4,
neon_units: 4,
sve_units: 0,
load_units: 3,
store_units: 2,
branch_units: 2,
crypto_units: 2,
issue_width: 8,
retire_width: 8,
out_of_order_window: 700, }
}
ArmMicroarchitecture::CortexA78 | ArmMicroarchitecture::CortexX1 => {
ARM64ExecutionUnitInfo {
integer_units: 4,
fp_units: 2,
neon_units: 2,
sve_units: 1,
load_units: 2,
store_units: 1,
branch_units: 1,
crypto_units: 1,
issue_width: 4,
retire_width: 4,
out_of_order_window: 288,
}
}
ArmMicroarchitecture::NeoverseV1 => {
ARM64ExecutionUnitInfo {
integer_units: 8,
fp_units: 4,
neon_units: 2,
sve_units: 4, load_units: 3,
store_units: 2,
branch_units: 2,
crypto_units: 2,
issue_width: 8,
retire_width: 8,
out_of_order_window: 288,
}
}
_ => {
ARM64ExecutionUnitInfo {
integer_units: 2,
fp_units: 2,
neon_units: 1,
sve_units: 0,
load_units: 1,
store_units: 1,
branch_units: 1,
crypto_units: 0,
issue_width: 2,
retire_width: 2,
out_of_order_window: 128,
}
}
}
}
pub fn get_matmul_params(&self, m: usize, n: usize, k: usize) -> ARM64MatmulParams {
let vector_width = self.vector_unit_config.optimal_vector_size / 32; let cache_blocking = &self.optimization_params.cache_blocking_factors;
let (block_m, block_n, block_k) = if matches!(self.microarch, ArmMicroarchitecture::M1 | ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3) {
let use_large_blocks = m * n * k > 1_000_000;
if use_large_blocks {
(
(cache_blocking.l1_block_size / (k * 4)).min(512).max(64),
(cache_blocking.l2_block_size / (m * 4)).min(1024).max(128),
(cache_blocking.system_cache_block_size.unwrap_or(0) / (m * n * 4)).min(2048).max(256),
)
} else {
(
(cache_blocking.l1_block_size / (k * 4)).min(256).max(32),
(cache_blocking.l2_block_size / (m * 4)).min(512).max(64),
(cache_blocking.system_cache_block_size.unwrap_or(0) / (m * n * 4)).min(1024).max(128),
)
}
} else {
(
(cache_blocking.l1_block_size / (k * 4)).min(128).max(16),
(cache_blocking.l2_block_size / (m * 4)).min(256).max(32),
(cache_blocking.l3_block_size / (m * n * 4)).min(512).max(64),
)
};
ARM64MatmulParams {
block_m,
block_n,
block_k,
vector_width,
unroll_factor: self.optimization_params.loop_unroll_factor,
use_neon: self.vector_unit_config.neon_available,
use_sve: self.vector_unit_config.sve_available && m * n * k > 10000,
use_fp16: self.vector_unit_config.fp16_available,
use_bf16: self.vector_unit_config.bf16_available,
use_i8mm: self.vector_unit_config.i8mm_available,
prefer_neural_engine: self.should_use_neural_engine(m * n * k),
}
}
pub fn get_conv_params(&self, batch: usize, channels: usize, height: usize, width: usize) -> ARM64ConvParams {
let total_elements = batch * channels * height * width;
let vector_width = self.vector_unit_config.optimal_vector_size / 32;
let (tile_h, tile_w, unroll_channels) = match self.microarch {
ArmMicroarchitecture::M1 | ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3 => {
if total_elements > 2_000_000 {
(32, 32, 16) } else {
(16, 16, 8)
}
}
ArmMicroarchitecture::NeoverseV1 | ArmMicroarchitecture::NeoverseN2 => {
if total_elements > 1_000_000 {
(24, 24, 16)
} else {
(12, 12, 8)
}
}
_ => {
if total_elements > 500_000 {
(16, 16, 8)
} else {
(8, 8, 4)
}
}
};
ARM64ConvParams {
tile_height: tile_h,
tile_width: tile_w,
channel_unroll: unroll_channels,
vector_width,
use_neon: self.vector_unit_config.neon_available,
use_sve: self.vector_unit_config.sve_available,
use_fp16: self.vector_unit_config.fp16_available && total_elements > 100_000,
use_winograd: height >= 3 && width >= 3 && total_elements > 50_000,
prefer_media_engine: self.should_use_media_engine(total_elements),
}
}
fn should_use_neural_engine(&self, element_count: usize) -> bool {
if let Some(apple_config) = &self.apple_silicon_config {
if let Some(apple_params) = &self.optimization_params.apple_silicon_params {
return apple_config.neural_engine_available &&
element_count >= apple_params.prefer_neural_engine_threshold;
}
}
false
}
fn should_use_media_engine(&self, element_count: usize) -> bool {
if let Some(apple_config) = &self.apple_silicon_config {
if let Some(apple_params) = &self.optimization_params.apple_silicon_params {
return apple_config.media_engine_available &&
apple_params.use_media_engine_for_conv &&
element_count > 100_000;
}
}
false
}
pub fn get_microarch_info(&self) -> ARM64MicroarchInfo {
ARM64MicroarchInfo {
name: format!("{:?}", self.microarch),
vendor: if matches!(self.microarch, ArmMicroarchitecture::M1 | ArmMicroarchitecture::M2 | ArmMicroarchitecture::M3) {
"Apple".to_string()
} else {
"ARM".to_string()
},
features: self.features,
optimization_params: self.optimization_params.clone(),
cache_config: self.cache_config.clone(),
apple_silicon_config: self.apple_silicon_config.clone(),
}
}
}
#[derive(Debug, Clone)]
pub struct ARM64MatmulParams {
pub block_m: usize,
pub block_n: usize,
pub block_k: usize,
pub vector_width: usize,
pub unroll_factor: usize,
pub use_neon: bool,
pub use_sve: bool,
pub use_fp16: bool,
pub use_bf16: bool,
pub use_i8mm: bool,
pub prefer_neural_engine: bool,
}
#[derive(Debug, Clone)]
pub struct ARM64ConvParams {
pub tile_height: usize,
pub tile_width: usize,
pub channel_unroll: usize,
pub vector_width: usize,
pub use_neon: bool,
pub use_sve: bool,
pub use_fp16: bool,
pub use_winograd: bool,
pub prefer_media_engine: bool,
}
#[derive(Debug, Clone)]
pub struct ARM64MicroarchInfo {
pub name: String,
pub vendor: String,
pub features: CpuFeatures,
pub optimization_params: ARM64OptimizationParameters,
pub cache_config: ARM64CacheConfiguration,
pub apple_silicon_config: Option<AppleSiliconConfig>,
}
static GLOBAL_ARM64_OPTIMIZER: OnceLock<EnhancedARM64Optimizer> = OnceLock::new();
pub fn get_arm64_optimizer() -> &'static EnhancedARM64Optimizer {
GLOBAL_ARM64_OPTIMIZER.get_or_init(|| {
EnhancedARM64Optimizer::new().unwrap_or_else(|_| {
EnhancedARM64Optimizer {
microarch: ArmMicroarchitecture::Unknown,
features: CpuFeatures::default(),
optimization_params: ARM64OptimizationParameters {
loop_unroll_factor: 4,
preferred_vector_width: 128,
sve_vector_length: None,
max_memory_bandwidth_utilization: 0.8,
scheduling_window_size: 128,
branch_prediction_accuracy: 0.9,
cache_blocking_factors: ARM64CacheBlockingFactors {
l1_block_size: 24 * 1024,
l2_block_size: 192 * 1024,
l3_block_size: 1024 * 1024,
tlb_block_size: 2 * 1024 * 1024,
system_cache_block_size: None,
},
parallel_params: ARM64ParallelExecutionParams {
optimal_thread_count: num_cpus::get(),
performance_core_count: num_cpus::get(),
efficiency_core_count: 0,
work_stealing_threshold: 1000,
chunk_size_multiplier: 1.0,
load_balancing_strategy: ARM64LoadBalancingStrategy::Uniform,
},
apple_silicon_params: None,
},
vector_unit_config: ARM64VectorUnitConfig {
neon_available: true,
sve_available: false,
sve2_available: false,
neon_register_count: 32,
sve_register_count: 0,
vector_lane_width: 128,
optimal_vector_size: 128,
crypto_available: false,
fp16_available: false,
bf16_available: false,
i8mm_available: false,
matrix_multiply_available: false,
},
cache_config: ARM64CacheConfiguration {
l1i_size: 32 * 1024,
l1d_size: 32 * 1024,
l2_size: 256 * 1024,
l3_size: Some(1024 * 1024),
system_cache_size: None,
l1_associativity: 4,
l2_associativity: 8,
l3_associativity: Some(8),
cache_line_size: 64,
tlb_entries: 64,
prefetch_distance: 64,
cache_coherency_level: CacheCoherencyLevel::PoC,
},
apple_silicon_config: None,
execution_units: ARM64ExecutionUnitInfo {
integer_units: 2,
fp_units: 2,
neon_units: 1,
sve_units: 0,
load_units: 1,
store_units: 1,
branch_units: 1,
crypto_units: 0,
issue_width: 2,
retire_width: 2,
out_of_order_window: 128,
},
}
})
})
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_arm64_optimizer_creation() {
let optimizer = EnhancedARM64Optimizer::new();
assert!(optimizer.is_ok());
}
#[test]
fn test_arm64_feature_detection() {
let features = EnhancedARM64Optimizer::detect_cpu_features();
assert!(features.is_ok());
#[cfg(target_arch = "aarch64")]
{
let features = features.expect("operation should succeed");
assert!(features.neon);
assert!(features.fp);
assert!(features.asimd);
}
}
#[test]
fn test_arm64_matmul_params() {
let optimizer = get_arm64_optimizer();
let params = optimizer.get_matmul_params(128, 128, 128);
assert!(params.block_m > 0);
assert!(params.block_n > 0);
assert!(params.block_k > 0);
assert!(params.vector_width > 0);
assert!(params.use_neon);
}
#[test]
fn test_arm64_conv_params() {
let optimizer = get_arm64_optimizer();
let params = optimizer.get_conv_params(1, 32, 224, 224);
assert!(params.tile_height > 0);
assert!(params.tile_width > 0);
assert!(params.channel_unroll > 0);
assert!(params.vector_width > 0);
assert!(params.use_neon);
}
#[test]
fn test_arm64_microarch_info() {
let optimizer = get_arm64_optimizer();
let info = optimizer.get_microarch_info();
assert!(!info.name.is_empty());
assert!(!info.vendor.is_empty());
}
#[cfg(target_arch = "aarch64")]
#[test]
fn test_apple_silicon_detection() {
let is_apple = EnhancedARM64Optimizer::is_apple_silicon();
assert!(is_apple || !is_apple);
}
#[test]
fn test_cache_configuration() {
let optimizer = get_arm64_optimizer();
let cache = &optimizer.cache_config;
assert!(cache.l1d_size > 0);
assert!(cache.l2_size > 0);
assert!(cache.cache_line_size > 0);
}
}