ruvllm 2.2.1

LLM serving runtime with Ruvector integration - Paged attention, KV cache, and SONA learning
//! Intelligent Auto-Detection System for RuvLLM
//!
//! This module provides automatic detection of system capabilities and optimal
//! configuration selection based on the runtime environment. It handles:
//!
//! - Platform and architecture detection (macOS, Linux, Windows, WASM, iOS, Android)
//! - CPU feature detection (NEON, AVX2, AVX-512, SSE4.2)
//! - GPU capability detection (Metal, CUDA, WebGPU)
//! - Memory and core count detection
//! - Automatic configuration selection based on detected capabilities
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use ruvllm::autodetect::{SystemCapabilities, InferenceConfig};
//!
//! // Auto-detect system capabilities
//! let caps = SystemCapabilities::detect();
//! println!("Platform: {:?}, Arch: {:?}", caps.platform, caps.arch);
//! println!("GPU: {:?}", caps.gpu);
//!
//! // Get optimal configuration
//! let config = caps.optimal_config();
//! println!("Recommended backend: {:?}", config.compute_backend);
//! println!("Recommended threads: {}", config.thread_count);
//!
//! // Or use auto-configuration directly
//! let config = InferenceConfig::auto();
//! ```
//!
//! ## Platform Support Matrix
//!
//! | Platform | Architecture | GPU Backend | Features |
//! |----------|--------------|-------------|----------|
//! | macOS | aarch64 | Metal | NEON always available |
//! | macOS | x86_64 | Metal | AVX2/AVX-512 if available |
//! | Linux | x86_64 | CUDA/CPU | AVX2/AVX-512, SSE4.2 |
//! | Linux | aarch64 | CPU | NEON always available |
//! | Windows | x86_64 | CUDA/CPU | AVX2/AVX-512, SSE4.2 |
//! | WASM | wasm32 | WebGPU | Limited feature detection |
//! | iOS | aarch64 | Metal | NEON always available |
//! | Android | aarch64 | CPU | NEON always available |

use serde::{Deserialize, Serialize};

#[cfg(feature = "coreml")]
use crate::backends::{AneCapabilities, ComputeUnits};
use crate::backends::{DType, DeviceType, Quantization};
use crate::kernels::AttentionConfig;

// =============================================================================
// Platform and Architecture Types
// =============================================================================

/// Supported operating system platforms
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Platform {
    /// macOS (Intel or Apple Silicon)
    MacOS,
    /// Linux distributions
    Linux,
    /// Windows
    Windows,
    /// WebAssembly (browser or Node.js)
    Wasm,
    /// iOS (iPhone, iPad)
    IOS,
    /// Android
    Android,
    /// Unknown or unsupported platform
    Unknown,
}

impl Default for Platform {
    fn default() -> Self {
        Self::detect()
    }
}

impl Platform {
    /// Detect the current platform (resolved at compile time via `cfg` attributes)
    pub fn detect() -> Self {
        #[cfg(target_os = "macos")]
        {
            Self::MacOS
        }

        #[cfg(target_os = "linux")]
        {
            // Check if running on Android (Linux kernel)
            #[cfg(target_os = "android")]
            {
                Self::Android
            }
            #[cfg(not(target_os = "android"))]
            {
                Self::Linux
            }
        }

        #[cfg(target_os = "windows")]
        {
            Self::Windows
        }

        #[cfg(target_arch = "wasm32")]
        {
            Self::Wasm
        }

        #[cfg(target_os = "ios")]
        {
            Self::IOS
        }

        #[cfg(target_os = "android")]
        {
            Self::Android
        }

        #[cfg(not(any(
            target_os = "macos",
            target_os = "linux",
            target_os = "windows",
            target_arch = "wasm32",
            target_os = "ios",
            target_os = "android"
        )))]
        {
            Self::Unknown
        }
    }

    /// Check if this platform supports GPU acceleration
    pub fn supports_gpu(&self) -> bool {
        matches!(
            self,
            Self::MacOS | Self::Linux | Self::Windows | Self::IOS | Self::Wasm
        )
    }

    /// Get the default GPU backend for this platform
    pub fn default_gpu_backend(&self) -> Option<GpuBackend> {
        match self {
            Self::MacOS | Self::IOS => Some(GpuBackend::Metal),
            Self::Linux | Self::Windows => Some(GpuBackend::Cuda),
            Self::Wasm => Some(GpuBackend::WebGPU),
            Self::Android | Self::Unknown => None,
        }
    }
}

/// CPU architecture
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Architecture {
    /// ARM 64-bit (Apple Silicon, ARM servers)
    Aarch64,
    /// x86 64-bit (Intel, AMD)
    X86_64,
    /// WebAssembly 32-bit
    Wasm32,
    /// Unknown architecture
    Unknown,
}

impl Default for Architecture {
    fn default() -> Self {
        Self::detect()
    }
}

impl Architecture {
    /// Detect the current architecture
    pub fn detect() -> Self {
        #[cfg(target_arch = "aarch64")]
        {
            Self::Aarch64
        }

        #[cfg(target_arch = "x86_64")]
        {
            Self::X86_64
        }

        #[cfg(target_arch = "wasm32")]
        {
            Self::Wasm32
        }

        #[cfg(not(any(
            target_arch = "aarch64",
            target_arch = "x86_64",
            target_arch = "wasm32"
        )))]
        {
            Self::Unknown
        }
    }

    /// Check if SIMD is available for this architecture
    pub fn has_simd(&self) -> bool {
        matches!(self, Self::Aarch64 | Self::X86_64)
    }
}

// =============================================================================
// CPU Features Detection
// =============================================================================

/// CPU SIMD feature flags
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct CpuFeatures {
    /// ARM NEON (128-bit SIMD, always available on aarch64)
    pub neon: bool,
    /// Intel/AMD AVX2 (256-bit SIMD)
    pub avx2: bool,
    /// Intel AVX-512 (512-bit SIMD)
    pub avx512: bool,
    /// Intel SSE 4.2
    pub sse42: bool,
    /// ARM SVE (Scalable Vector Extension)
    pub sve: bool,
    /// ARM SVE2
    pub sve2: bool,
}

impl CpuFeatures {
    /// Detect CPU features at runtime
    pub fn detect() -> Self {
        let mut features = Self::default();

        // aarch64 detection
        #[cfg(target_arch = "aarch64")]
        {
            // NEON is always available on aarch64
            features.neon = true;

            // SVE/SVE2 detection would require runtime checks
            // For now, assume not available unless we can detect it
            #[cfg(target_os = "linux")]
            {
                // On Linux, we could check /proc/cpuinfo or use getauxval
                // For simplicity, assume SVE is not available
                features.sve = false;
                features.sve2 = false;
            }
        }

        // x86_64 detection
        #[cfg(target_arch = "x86_64")]
        {
            #[cfg(target_feature = "sse4.2")]
            {
                features.sse42 = true;
            }

            #[cfg(target_feature = "avx2")]
            {
                features.avx2 = true;
            }

            #[cfg(target_feature = "avx512f")]
            {
                features.avx512 = true;
            }

            // Runtime detection using std::arch (if the feature was not detected at compile time)
            #[cfg(not(target_feature = "avx2"))]
            {
                features.avx2 = Self::detect_avx2_runtime();
            }

            #[cfg(not(target_feature = "sse4.2"))]
            {
                features.sse42 = Self::detect_sse42_runtime();
            }
        }

        features
    }

    /// Runtime AVX2 detection for x86_64
    #[cfg(target_arch = "x86_64")]
    fn detect_avx2_runtime() -> bool {
        #[cfg(all(target_arch = "x86_64", not(target_feature = "avx2")))]
        {
            // ruvllm always links std; no `feature = "std"` gate needed.
            std::arch::is_x86_feature_detected!("avx2")
        }
        #[cfg(target_feature = "avx2")]
        {
            true
        }
    }

    /// Runtime SSE 4.2 detection for x86_64
    #[cfg(target_arch = "x86_64")]
    fn detect_sse42_runtime() -> bool {
        #[cfg(all(target_arch = "x86_64", not(target_feature = "sse4.2")))]
        {
            std::arch::is_x86_feature_detected!("sse4.2")
        }
        #[cfg(target_feature = "sse4.2")]
        {
            true
        }
    }

    /// Get the best available SIMD width in bits
    pub fn best_simd_width(&self) -> usize {
        if self.avx512 {
            512
        } else if self.avx2 {
            256
        } else if self.neon || self.sse42 {
            128
        } else {
            0
        }
    }

    /// Get the number of floats that can be processed in parallel
    pub fn simd_float_lanes(&self) -> usize {
        self.best_simd_width() / 32 // f32 is 32 bits
    }
}
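
/// Illustrative sketch (not part of the crate API): using the detected SIMD
/// lane count to pick a processing chunk size. Uses only `std`; a real kernel
/// would dispatch to NEON/AVX2 implementations based on `CpuFeatures` instead.
#[allow(dead_code)]
fn example_lane_sized_sum(data: &[f32]) -> f32 {
    // At least one lane even on scalar-only hardware.
    let lanes = CpuFeatures::detect().simd_float_lanes().max(1);
    // Walk the slice in lane-sized chunks, as a vectorized kernel would.
    data.chunks(lanes).map(|chunk| chunk.iter().sum::<f32>()).sum()
}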

// =============================================================================
// GPU Capabilities
// =============================================================================

/// GPU compute backend types
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum GpuBackend {
    /// Apple Metal (macOS, iOS)
    Metal,
    /// NVIDIA CUDA
    Cuda,
    /// WebGPU (browser, cross-platform)
    WebGPU,
    /// Vulkan compute
    Vulkan,
    /// OpenCL
    OpenCL,
}

/// GPU capabilities and specifications
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct GpuCapabilities {
    /// GPU compute backend
    pub backend: GpuBackend,
    /// Video RAM in megabytes (if detectable)
    pub vram_mb: Option<usize>,
    /// Number of compute units/streaming multiprocessors
    pub compute_units: Option<usize>,
    /// GPU name/model
    pub name: Option<String>,
    /// Whether the GPU supports FP16 compute
    pub supports_fp16: bool,
    /// Whether the GPU supports INT8 compute
    pub supports_int8: bool,
    /// Whether the GPU supports tensor cores / matrix engines
    pub has_tensor_cores: bool,
    /// Maximum shared memory per compute unit (bytes)
    pub max_shared_memory: Option<usize>,
}

impl GpuCapabilities {
    /// Detect GPU capabilities
    pub fn detect() -> Option<Self> {
        // Metal detection for macOS/iOS
        #[cfg(all(target_os = "macos", feature = "metal-compute"))]
        {
            return Self::detect_metal();
        }

        #[cfg(all(target_os = "macos", not(feature = "metal-compute")))]
        {
            // Metal is available on macOS but the feature isn't enabled
            // Return basic capabilities
            return Some(Self {
                backend: GpuBackend::Metal,
                vram_mb: None,
                compute_units: None,
                name: Some("Apple GPU (metal-compute feature not enabled)".to_string()),
                supports_fp16: true,
                supports_int8: true,
                has_tensor_cores: false,
                max_shared_memory: Some(32 * 1024), // 32KB typical
            });
        }

        #[cfg(target_os = "ios")]
        {
            return Some(Self {
                backend: GpuBackend::Metal,
                vram_mb: None,
                compute_units: None,
                name: Some("Apple GPU (iOS)".to_string()),
                supports_fp16: true,
                supports_int8: true,
                has_tensor_cores: false,
                max_shared_memory: Some(32 * 1024),
            });
        }

        // CUDA detection for Linux/Windows
        #[cfg(any(target_os = "linux", target_os = "windows"))]
        {
            if let Some(cuda) = Self::detect_cuda() {
                return Some(cuda);
            }
        }

        // WebGPU for WASM
        #[cfg(target_arch = "wasm32")]
        {
            return Self::detect_webgpu();
        }

        #[allow(unreachable_code)]
        None
    }

    /// Detect Metal GPU capabilities
    #[cfg(all(target_os = "macos", feature = "metal-compute"))]
    fn detect_metal() -> Option<Self> {
        use crate::metal::{get_device_info, is_metal_available};

        if !is_metal_available() {
            return None;
        }

        match get_device_info() {
            Some(info) => {
                // Check if this is Apple Silicon (M-series) for feature detection
                let is_apple_silicon = info.has_unified_memory;

                Some(Self {
                    backend: GpuBackend::Metal,
                    vram_mb: Some(info.recommended_max_working_set_size / (1024 * 1024)),
                    compute_units: Some(info.max_threads_per_threadgroup),
                    name: Some(info.name),
                    supports_fp16: is_apple_silicon, // Apple Silicon has excellent FP16
                    supports_int8: true,
                    has_tensor_cores: is_apple_silicon, // AMX on Apple Silicon
                    max_shared_memory: Some(32 * 1024), // 32KB typical threadgroup memory
                })
            }
            None => Some(Self {
                backend: GpuBackend::Metal,
                vram_mb: None,
                compute_units: None,
                name: Some("Apple GPU".to_string()),
                supports_fp16: true,
                supports_int8: true,
                has_tensor_cores: false,
                max_shared_memory: Some(32 * 1024),
            }),
        }
    }

    /// Detect CUDA GPU capabilities
    #[cfg(any(target_os = "linux", target_os = "windows"))]
    fn detect_cuda() -> Option<Self> {
        // CUDA detection would require CUDA runtime
        // For now, return None and let the user configure manually
        // In a full implementation, this would use cuda_runtime_sys or similar
        None
    }

    /// Detect WebGPU capabilities
    #[cfg(target_arch = "wasm32")]
    fn detect_webgpu() -> Option<Self> {
        // WebGPU detection requires JavaScript interop
        // Return a placeholder that indicates WebGPU might be available
        Some(Self {
            backend: GpuBackend::WebGPU,
            vram_mb: None,
            compute_units: None,
            name: Some("WebGPU (browser)".to_string()),
            supports_fp16: true,
            supports_int8: false, // WebGPU INT8 support varies
            has_tensor_cores: false,
            max_shared_memory: Some(16 * 1024), // 16KB typical for WebGPU
        })
    }

    /// Check whether a model of the given size (in GB) is likely to fit in VRAM
    pub fn can_fit_model(&self, model_size_gb: f32) -> bool {
        if let Some(vram_mb) = self.vram_mb {
            let vram_gb = vram_mb as f32 / 1024.0;
            // Need ~1.2x model size for activations and KV cache
            vram_gb >= model_size_gb * 1.2
        } else {
            // Unknown VRAM, assume it can fit
            true
        }
    }
}
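
/// Illustrative sketch: choosing weight placement from detected VRAM. The
/// 13.5 GB figure is an assumed FP16 footprint for a 7B-parameter model, not
/// a value provided by this module.
#[allow(dead_code)]
fn example_gpu_placement() -> &'static str {
    match GpuCapabilities::detect() {
        Some(gpu) if gpu.can_fit_model(13.5) => "load weights on the GPU",
        Some(_) => "GPU detected but VRAM is too small; quantize or offload to CPU",
        None => "no GPU detected; run on CPU",
    }
}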

// =============================================================================
// Core Information
// =============================================================================

/// CPU core information
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct CoreInfo {
    /// Number of physical CPU cores
    pub physical_cores: usize,
    /// Number of logical CPU cores (with hyperthreading)
    pub logical_cores: usize,
    /// Number of performance cores (if heterogeneous, e.g., Apple M-series)
    pub performance_cores: Option<usize>,
    /// Number of efficiency cores (if heterogeneous)
    pub efficiency_cores: Option<usize>,
}

impl Default for CoreInfo {
    fn default() -> Self {
        Self::detect()
    }
}

impl CoreInfo {
    /// Detect core information
    pub fn detect() -> Self {
        let logical_cores = Self::detect_logical_cores();
        let physical_cores = Self::detect_physical_cores(logical_cores);

        // Detect heterogeneous cores on Apple Silicon
        #[cfg(target_os = "macos")]
        {
            let (perf, eff) = Self::detect_apple_cores();
            return Self {
                physical_cores,
                logical_cores,
                performance_cores: perf,
                efficiency_cores: eff,
            };
        }

        #[cfg(not(target_os = "macos"))]
        Self {
            physical_cores,
            logical_cores,
            performance_cores: None,
            efficiency_cores: None,
        }
    }

    /// Detect logical core count
    fn detect_logical_cores() -> usize {
        // Try std::thread::available_parallelism first
        std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1)
    }

    /// Detect physical core count
    fn detect_physical_cores(logical: usize) -> usize {
        // On most systems, physical = logical / 2 if hyperthreading is enabled
        // This is a heuristic; accurate detection requires platform-specific APIs

        #[cfg(target_os = "macos")]
        {
            // Use sysctl on macOS
            Self::sysctl_physical_cores().unwrap_or(logical)
        }

        #[cfg(target_os = "linux")]
        {
            // Parse /proc/cpuinfo on Linux
            Self::linux_physical_cores().unwrap_or(logical / 2).max(1)
        }

        #[cfg(target_os = "windows")]
        {
            // Windows detection would use GetLogicalProcessorInformation
            // For now, use heuristic
            (logical / 2).max(1)
        }

        #[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
        {
            logical
        }
    }

    /// Get physical cores via sysctl on macOS
    #[cfg(target_os = "macos")]
    fn sysctl_physical_cores() -> Option<usize> {
        use std::process::Command;

        let output = Command::new("sysctl")
            .args(["-n", "hw.physicalcpu"])
            .output()
            .ok()?;

        String::from_utf8_lossy(&output.stdout).trim().parse().ok()
    }

    /// Get physical cores from /proc/cpuinfo on Linux
    #[cfg(target_os = "linux")]
    fn linux_physical_cores() -> Option<usize> {
        use std::fs;

        let cpuinfo = fs::read_to_string("/proc/cpuinfo").ok()?;

        // Count unique physical id + core id pairs
        let mut cores = std::collections::HashSet::new();

        let mut physical_id = None;
        let mut core_id = None;

        for line in cpuinfo.lines() {
            if line.starts_with("physical id") {
                physical_id = line
                    .split(':')
                    .nth(1)
                    .and_then(|s| s.trim().parse::<usize>().ok());
            } else if line.starts_with("core id") {
                core_id = line
                    .split(':')
                    .nth(1)
                    .and_then(|s| s.trim().parse::<usize>().ok());
            }

            if let (Some(pid), Some(cid)) = (physical_id, core_id) {
                cores.insert((pid, cid));
                physical_id = None;
                core_id = None;
            }
        }

        if cores.is_empty() {
            // Fallback: count "processor" entries (logical CPUs; this may
            // overcount physical cores when SMT/hyperthreading is enabled)
            Some(
                cpuinfo
                    .lines()
                    .filter(|l| l.starts_with("processor"))
                    .count(),
            )
        } else {
            Some(cores.len())
        }
    }

    /// Detect Apple Silicon core configuration
    #[cfg(target_os = "macos")]
    fn detect_apple_cores() -> (Option<usize>, Option<usize>) {
        use std::process::Command;

        // Try to get performance core count
        let perf = Command::new("sysctl")
            .args(["-n", "hw.perflevel0.physicalcpu"])
            .output()
            .ok()
            .and_then(|o| String::from_utf8_lossy(&o.stdout).trim().parse().ok());

        // Try to get efficiency core count
        let eff = Command::new("sysctl")
            .args(["-n", "hw.perflevel1.physicalcpu"])
            .output()
            .ok()
            .and_then(|o| String::from_utf8_lossy(&o.stdout).trim().parse().ok());

        (perf, eff)
    }

    /// Get the recommended thread count for parallel workloads
    pub fn recommended_threads(&self) -> usize {
        // Prefer performance cores if available
        if let Some(perf) = self.performance_cores {
            perf
        } else {
            // Use physical cores to avoid cache contention from hyperthreading
            self.physical_cores
        }
    }
}
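
/// Illustrative sketch: splitting a workload across the recommended thread
/// count using only `std::thread` (no external thread-pool crate assumed).
#[allow(dead_code)]
fn example_parallel_map(items: Vec<u64>) -> Vec<u64> {
    let threads = CoreInfo::detect().recommended_threads().max(1);
    let chunk = ((items.len() + threads - 1) / threads).max(1);
    let mut out = Vec::with_capacity(items.len());
    std::thread::scope(|s| {
        // One scoped worker per chunk; scoped threads may borrow `items`.
        let handles: Vec<_> = items
            .chunks(chunk)
            .map(|c| s.spawn(move || c.iter().map(|x| x * 2).collect::<Vec<_>>()))
            .collect();
        for h in handles {
            out.extend(h.join().unwrap());
        }
    });
    out
}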

// =============================================================================
// System Capabilities (Main Detection Struct)
// =============================================================================

/// Apple Neural Engine (ANE) capabilities
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AneInfo {
    /// Whether ANE is available on this device
    pub available: bool,
    /// ANE compute power in TOPS (Trillion Operations Per Second)
    pub tops: f32,
    /// Maximum recommended model size in MB for ANE
    pub max_model_size_mb: usize,
    /// Supported operation types
    pub supported_ops: Vec<String>,
}

impl Default for AneInfo {
    fn default() -> Self {
        Self::detect()
    }
}

impl AneInfo {
    /// Detect ANE capabilities
    pub fn detect() -> Self {
        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            // Apple Silicon has ANE
            // M4 Pro: 38 TOPS, M3: 18 TOPS, M2: 15.8 TOPS, M1: 11 TOPS
            Self {
                available: true,
                tops: Self::detect_ane_tops(),
                max_model_size_mb: 2048, // ~2GB models work well on ANE
                supported_ops: vec![
                    "MatMul".to_string(),
                    "Conv2D".to_string(),
                    "GELU".to_string(),
                    "SiLU".to_string(),
                    "LayerNorm".to_string(),
                    "Softmax".to_string(),
                    "Add".to_string(),
                    "Mul".to_string(),
                ],
            }
        }

        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
        {
            Self {
                available: false,
                tops: 0.0,
                max_model_size_mb: 0,
                supported_ops: vec![],
            }
        }
    }

    /// Detect ANE TOPS based on chip model
    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
    fn detect_ane_tops() -> f32 {
        use std::process::Command;

        // Try to get chip model from sysctl
        if let Ok(output) = Command::new("sysctl")
            .args(["-n", "machdep.cpu.brand_string"])
            .output()
        {
            let brand = String::from_utf8_lossy(&output.stdout).to_lowercase();

            // M4 series (base/Pro/Max all ~38 TOPS)
            if brand.contains("m4") {
                return 38.0;
            }

            // M3 series (base/Pro/Max all ~18 TOPS)
            if brand.contains("m3") {
                return 18.0;
            }

            // M2 series
            if brand.contains("m2") {
                if brand.contains("ultra") {
                    return 31.6; // 2x M2 Max
                }
                return 15.8; // M2 / Pro / Max
            }

            // M1 series
            if brand.contains("m1") {
                if brand.contains("ultra") {
                    return 22.0; // 2x M1 Max
                }
                return 11.0; // M1 / Pro / Max
            }
        }

        // Default to M1 level if detection fails
        11.0
    }

    /// Check if a model of given size is suitable for ANE
    pub fn is_model_suitable(&self, model_size_mb: usize) -> bool {
        self.available && model_size_mb <= self.max_model_size_mb
    }

    /// Get recommended compute strategy for a given model size
    pub fn recommended_strategy(&self, model_size_mb: usize) -> AneStrategy {
        if !self.available {
            return AneStrategy::GpuOnly;
        }

        if model_size_mb <= 500 {
            // Small models: ANE is great
            AneStrategy::AneOnly
        } else if model_size_mb <= self.max_model_size_mb {
            // Medium models: hybrid is best
            AneStrategy::Hybrid
        } else {
            // Large models: GPU is better
            AneStrategy::GpuOnly
        }
    }
}

/// ANE usage strategy
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AneStrategy {
    /// Use only ANE (best for small models)
    AneOnly,
    /// Use GPU + ANE hybrid (ANE for MLP, GPU for attention)
    Hybrid,
    /// Use only GPU (best for large models)
    GpuOnly,
}
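
/// Illustrative sketch: turning the detected ANE strategy into a placement
/// decision. The 900 MB model size is an assumed example value.
#[allow(dead_code)]
fn example_ane_routing() -> &'static str {
    match AneInfo::detect().recommended_strategy(900) {
        AneStrategy::AneOnly => "run the whole model on the Neural Engine",
        AneStrategy::Hybrid => "ANE for MLP/FFN layers, GPU for attention",
        AneStrategy::GpuOnly => "run the whole model on the GPU (or CPU)",
    }
}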

/// Complete system capabilities for inference configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemCapabilities {
    /// Operating system platform
    pub platform: Platform,
    /// CPU architecture
    pub arch: Architecture,
    /// CPU SIMD features
    pub cpu_features: CpuFeatures,
    /// GPU capabilities (if available)
    pub gpu: Option<GpuCapabilities>,
    /// Apple Neural Engine capabilities (if available)
    pub ane: AneInfo,
    /// Total system memory in megabytes
    pub memory_mb: usize,
    /// Available memory in megabytes (if detectable)
    pub available_memory_mb: Option<usize>,
    /// CPU core information
    pub cores: CoreInfo,
}

impl Default for SystemCapabilities {
    fn default() -> Self {
        Self::detect()
    }
}

impl SystemCapabilities {
    /// Detect all system capabilities
    pub fn detect() -> Self {
        Self {
            platform: Platform::detect(),
            arch: Architecture::detect(),
            cpu_features: CpuFeatures::detect(),
            gpu: GpuCapabilities::detect(),
            ane: AneInfo::detect(),
            memory_mb: Self::detect_total_memory(),
            available_memory_mb: Self::detect_available_memory(),
            cores: CoreInfo::detect(),
        }
    }

    /// Detect total system memory in MB
    fn detect_total_memory() -> usize {
        #[cfg(target_os = "macos")]
        {
            Self::macos_total_memory().unwrap_or(8 * 1024) // Default 8GB
        }

        #[cfg(target_os = "linux")]
        {
            Self::linux_total_memory().unwrap_or(8 * 1024)
        }

        #[cfg(target_os = "windows")]
        {
            Self::windows_total_memory().unwrap_or(8 * 1024)
        }

        #[cfg(target_arch = "wasm32")]
        {
            // WASM: estimate based on navigator.deviceMemory (typically 4-8GB)
            4 * 1024
        }

        #[cfg(not(any(
            target_os = "macos",
            target_os = "linux",
            target_os = "windows",
            target_arch = "wasm32"
        )))]
        {
            4 * 1024 // Conservative default
        }
    }

    /// Detect available memory (not just total)
    fn detect_available_memory() -> Option<usize> {
        #[cfg(target_os = "macos")]
        {
            // macOS doesn't easily expose available memory
            // Would need vm_statistics or memory_pressure
            None
        }

        #[cfg(target_os = "linux")]
        {
            Self::linux_available_memory()
        }

        #[cfg(not(any(target_os = "macos", target_os = "linux")))]
        {
            None
        }
    }

    #[cfg(target_os = "macos")]
    fn macos_total_memory() -> Option<usize> {
        use std::process::Command;

        let output = Command::new("sysctl")
            .args(["-n", "hw.memsize"])
            .output()
            .ok()?;

        let bytes: u64 = String::from_utf8_lossy(&output.stdout)
            .trim()
            .parse()
            .ok()?;

        Some((bytes / (1024 * 1024)) as usize)
    }

    #[cfg(target_os = "linux")]
    fn linux_total_memory() -> Option<usize> {
        use std::fs;

        let meminfo = fs::read_to_string("/proc/meminfo").ok()?;

        for line in meminfo.lines() {
            if line.starts_with("MemTotal:") {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let kb: usize = parts[1].parse().ok()?;
                    return Some(kb / 1024); // Convert KB to MB
                }
            }
        }

        None
    }

    #[cfg(target_os = "linux")]
    fn linux_available_memory() -> Option<usize> {
        use std::fs;

        let meminfo = fs::read_to_string("/proc/meminfo").ok()?;

        for line in meminfo.lines() {
            if line.starts_with("MemAvailable:") {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let kb: usize = parts[1].parse().ok()?;
                    return Some(kb / 1024);
                }
            }
        }

        None
    }

    #[cfg(target_os = "windows")]
    fn windows_total_memory() -> Option<usize> {
        // Would use GetPhysicallyInstalledSystemMemory or GlobalMemoryStatusEx
        // For now, return None to use default
        None
    }

    /// Get the optimal inference configuration based on detected capabilities
    pub fn optimal_config(&self) -> InferenceConfig {
        let compute_backend = self.select_compute_backend();
        let quantization = self.optimal_quantization(7.0); // Assume ~7GB of weights (7B-class model)
        let batch_size = self.recommended_batch_size(2048); // Default 2K context
        let thread_count = self.cores.recommended_threads();
        let block_size = self.optimal_block_size();

        InferenceConfig {
            compute_backend,
            quantization,
            batch_size,
            thread_count,
            block_size,
            use_flash_attention: true,
            device_type: self.optimal_device_type(),
            dtype: self.optimal_dtype(),
        }
    }

    /// Get optimal attention configuration
    pub fn optimal_attention_config(&self) -> AttentionConfig {
        // Default Mistral-7B style configuration
        let mut config = AttentionConfig {
            num_heads: 32,
            num_kv_heads: 8, // GQA 4:1
            head_dim: 128,
            max_seq_len: self.optimal_max_seq_len(),
            causal: true,
            scale: 0.0, // Auto-compute
        };

        // Cap the sequence length under memory pressure; otherwise keep the
        // value derived from total memory in `optimal_max_seq_len`.
        let available_mb = self.available_memory_mb.unwrap_or(self.memory_mb / 2);
        if available_mb < 4096 {
            config.max_seq_len = config.max_seq_len.min(2048);
        } else if available_mb < 8192 {
            config.max_seq_len = config.max_seq_len.min(4096);
        }

        config
    }

    /// Select optimal quantization based on model size and available memory
    pub fn optimal_quantization(&self, model_size_gb: f32) -> Quantization {
        let available_mb = self.available_memory_mb.unwrap_or(self.memory_mb / 2);
        let available_gb = available_mb as f32 / 1024.0;

        // Check GPU VRAM if available
        if let Some(ref gpu) = self.gpu {
            if let Some(vram_mb) = gpu.vram_mb {
                let vram_gb = vram_mb as f32 / 1024.0;

                // Need ~1.5x model size for activations and KV cache
                if vram_gb >= model_size_gb * 1.5 {
                    // Full precision fits
                    return Quantization::F16;
                } else if vram_gb >= model_size_gb * 0.75 {
                    // INT8 fits
                    return Quantization::Q8;
                } else if vram_gb >= model_size_gb * 0.4 {
                    // Q4K fits (best quality 4-bit)
                    return Quantization::Q4K;
                }
            }
        }

        // Fall back to CPU memory estimation
        if available_gb >= model_size_gb * 4.0 {
            Quantization::F16
        } else if available_gb >= model_size_gb * 1.5 {
            Quantization::Q8
        } else if available_gb >= model_size_gb * 0.6 {
            Quantization::Q4K
        } else {
            // Very low memory: use aggressive quantization
            Quantization::Q4
        }
    }

    /// Calculate recommended batch size based on memory and sequence length
    pub fn recommended_batch_size(&self, seq_len: usize) -> usize {
        let available_mb = self.available_memory_mb.unwrap_or(self.memory_mb / 2);

        // Estimate memory per batch item (very rough):
        // KV cache per token: 2 (K+V) * num_layers * num_kv_heads * head_dim * 2 bytes (FP16)
        // For Mistral-7B style: 2 * 32 * 8 * 128 * 2 bytes = 128KB per token
        let kv_per_token_kb = 128.0;
        let kv_per_batch_mb = (kv_per_token_kb * seq_len as f32) / 1024.0;

        // Reserve 50% of available memory for model weights
        let available_for_batch_mb = available_mb as f32 * 0.5;

        let max_batch = (available_for_batch_mb / kv_per_batch_mb).floor() as usize;

        // Clamp to reasonable range
        max_batch.clamp(1, 64)
    }

    /// Select the best compute backend
    fn select_compute_backend(&self) -> ComputeBackend {
        self.select_compute_backend_for_model(7.0 * 1024.0) // Default to 7B model (~7GB)
    }

    /// Select the best compute backend for a specific model size (in MB)
    pub fn select_compute_backend_for_model(&self, model_size_mb: f32) -> ComputeBackend {
        // Check if ANE is available and suitable for this model
        #[cfg(feature = "coreml")]
        {
            if self.ane.available {
                let strategy = self.ane.recommended_strategy(model_size_mb as usize);
                match strategy {
                    AneStrategy::AneOnly => {
                        // Small model: pure ANE is best
                        return ComputeBackend::CoreML;
                    }
                    AneStrategy::Hybrid => {
                        // Medium model: hybrid ANE+GPU if Metal is available
                        if let Some(ref gpu) = self.gpu {
                            if matches!(gpu.backend, GpuBackend::Metal) {
                                return ComputeBackend::HybridAne;
                            }
                        }
                        // Fall back to CoreML if no GPU
                        return ComputeBackend::CoreML;
                    }
                    AneStrategy::GpuOnly => {
                        // Large model: use GPU (fall through)
                    }
                }
            }
        }

        // Prefer GPU if available
        if let Some(ref gpu) = self.gpu {
            match gpu.backend {
                GpuBackend::Metal => return ComputeBackend::Metal,
                GpuBackend::Cuda => return ComputeBackend::Cuda,
                GpuBackend::WebGPU => return ComputeBackend::WebGPU,
                _ => {}
            }
        }

        // Fall back to CPU with SIMD
        if self.cpu_features.avx512 {
            ComputeBackend::CpuAvx512
        } else if self.cpu_features.avx2 {
            ComputeBackend::CpuAvx2
        } else if self.cpu_features.neon {
            ComputeBackend::CpuNeon
        } else {
            ComputeBackend::CpuScalar
        }
    }

    /// Select compute backend optimized for power efficiency (battery life)
    pub fn select_power_efficient_backend(&self) -> ComputeBackend {
        // ANE is 3-4x more power efficient than GPU
        #[cfg(feature = "coreml")]
        {
            if self.ane.available {
                return ComputeBackend::CoreML;
            }
        }

        // Fall back to standard selection
        self.select_compute_backend()
    }

    /// Get optimal device type for the backend crate
    fn optimal_device_type(&self) -> DeviceType {
        if let Some(ref gpu) = self.gpu {
            match gpu.backend {
                GpuBackend::Metal => DeviceType::Metal,
                GpuBackend::Cuda => DeviceType::Cuda(0),
                _ => DeviceType::Cpu,
            }
        } else {
            DeviceType::Cpu
        }
    }

    /// Get optimal dtype for the backend
    fn optimal_dtype(&self) -> DType {
        // Prefer FP16 if GPU supports it, otherwise F32
        if let Some(ref gpu) = self.gpu {
            if gpu.supports_fp16 {
                return DType::F16;
            }
        }

        // CPU: use F32 for best compatibility
        // (NEON and AVX2 have good F32 support)
        DType::F32
    }

    /// Get optimal block size for attention
    fn optimal_block_size(&self) -> usize {
        // Based on cache hierarchy
        if let Some(ref gpu) = self.gpu {
            if let Some(shared_mem) = gpu.max_shared_memory {
                // Target 50% shared memory utilization
                // block_size * head_dim * 4 bytes * 2 (K+V) = shared_mem / 2
                let head_dim = 128; // Typical
                let max_block = shared_mem / (head_dim * 4 * 2 * 2);
                return max_block.clamp(32, 128);
            }
        }

        // CPU: optimize for L1 cache (32KB is typical; Apple Silicon
        // performance cores have considerably larger L1 caches)
        #[cfg(target_os = "macos")]
        {
            64 // Apple Silicon's larger L1 comfortably fits 64-token blocks
        }

        #[cfg(not(target_os = "macos"))]
        {
            32 // Conservative for 32KB L1
        }
    }

    /// Get optimal max sequence length
    fn optimal_max_seq_len(&self) -> usize {
        let available_mb = self.available_memory_mb.unwrap_or(self.memory_mb / 2);

        if available_mb >= 32 * 1024 {
            // 32GB+: can handle very long contexts
            32768
        } else if available_mb >= 16 * 1024 {
            16384
        } else if available_mb >= 8 * 1024 {
            8192
        } else if available_mb >= 4 * 1024 {
            4096
        } else {
            2048
        }
    }

    /// Check if the system can run a model of given size
    pub fn can_run_model(&self, model_size_gb: f32) -> bool {
        let available_mb = self.available_memory_mb.unwrap_or(self.memory_mb / 2);
        let available_gb = available_mb as f32 / 1024.0;

        // With Q4K quantization, need ~0.4x model size in memory
        // Plus overhead for activations and KV cache
        let min_required_gb = model_size_gb * 0.4 + 2.0; // 2GB overhead

        available_gb >= min_required_gb
    }

    /// Get a human-readable summary of capabilities
    pub fn summary(&self) -> String {
        let mut parts = vec![];

        parts.push(format!("{:?} ({:?})", self.platform, self.arch));
        parts.push(format!(
            "{} cores ({} physical)",
            self.cores.logical_cores, self.cores.physical_cores
        ));

        if let Some(perf) = self.cores.performance_cores {
            parts.push(format!(
                "{}P+{}E cores",
                perf,
                self.cores.efficiency_cores.unwrap_or(0)
            ));
        }

        parts.push(format!("{}GB RAM", self.memory_mb / 1024));

        if let Some(ref gpu) = self.gpu {
            let gpu_info = match gpu.vram_mb {
                Some(vram) => format!("{:?} ({}GB VRAM)", gpu.backend, vram / 1024),
                None => format!("{:?}", gpu.backend),
            };
            parts.push(gpu_info);
        } else {
            parts.push("No GPU".to_string());
        }

        // Add ANE info if available
        if self.ane.available {
            parts.push(format!("ANE ({:.0} TOPS)", self.ane.tops));
        }

        let simd = if self.cpu_features.avx512 {
            "AVX-512"
        } else if self.cpu_features.avx2 {
            "AVX2"
        } else if self.cpu_features.neon {
            "NEON"
        } else if self.cpu_features.sse42 {
            "SSE4.2"
        } else {
            "Scalar"
        };
        parts.push(simd.to_string());

        parts.join(", ")
    }

    /// Get ANE-specific summary
    pub fn ane_summary(&self) -> String {
        if !self.ane.available {
            return "ANE: Not available".to_string();
        }

        format!(
            "ANE: {:.0} TOPS, max model {}MB, {} supported ops",
            self.ane.tops,
            self.ane.max_model_size_mb,
            self.ane.supported_ops.len()
        )
    }
}
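
/// Illustrative sketch: deriving a backend/quantization pair for a specific
/// model from the detected capabilities. `model_size_gb` is the caller's own
/// estimate of the weight footprint; the example values are not crate defaults.
#[allow(dead_code)]
fn example_model_setup(model_size_gb: f32) -> (ComputeBackend, Quantization) {
    let caps = SystemCapabilities::detect();
    // Backend routing expects the size in MB; quantization selection in GB.
    let backend = caps.select_compute_backend_for_model(model_size_gb * 1024.0);
    let quantization = caps.optimal_quantization(model_size_gb);
    (backend, quantization)
}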

// =============================================================================
// Compute Backend Selection
// =============================================================================

/// Compute backend for inference
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ComputeBackend {
    /// Apple Metal GPU
    Metal,
    /// Apple Neural Engine via Core ML (38 TOPS on M4 Pro)
    /// Optimal for small models (<1B params) and batch inference
    CoreML,
    /// Hybrid Metal GPU + ANE (best of both worlds)
    /// Uses ANE for MLP/FFN layers, GPU for attention
    HybridAne,
    /// NVIDIA CUDA GPU
    Cuda,
    /// WebGPU (browser/cross-platform)
    WebGPU,
    /// CPU with AVX-512 SIMD
    CpuAvx512,
    /// CPU with AVX2 SIMD
    CpuAvx2,
    /// CPU with ARM NEON SIMD
    CpuNeon,
    /// CPU scalar (no SIMD)
    CpuScalar,
}

impl ComputeBackend {
    /// Check if this is a GPU/accelerator backend
    pub fn is_gpu(&self) -> bool {
        matches!(
            self,
            Self::Metal | Self::CoreML | Self::HybridAne | Self::Cuda | Self::WebGPU
        )
    }

    /// Check if this backend uses the Neural Engine
    pub fn uses_ane(&self) -> bool {
        matches!(self, Self::CoreML | Self::HybridAne)
    }

    /// Get expected relative performance (higher = better)
    /// Note: ANE performance depends heavily on model size and batch configuration
    pub fn relative_performance(&self) -> f32 {
        match self {
            Self::HybridAne => 12.0, // Best for models that benefit from ANE+GPU
            Self::Metal => 10.0,     // Apple Silicon GPU is very efficient
            Self::CoreML => 8.0,     // ANE alone (great for small models, limited for large)
            Self::Cuda => 15.0,      // NVIDIA is fastest for large models
            Self::WebGPU => 5.0,     // WebGPU has overhead
            Self::CpuAvx512 => 4.0,  // AVX-512 is fast
            Self::CpuAvx2 => 2.5,    // AVX2 is good
            Self::CpuNeon => 2.0,    // NEON is comparable to AVX2
            Self::CpuScalar => 1.0,  // Baseline
        }
    }

    /// Get power efficiency rating (higher = more efficient)
    /// ANE is significantly more power efficient than GPU
    pub fn power_efficiency(&self) -> f32 {
        match self {
            Self::CoreML => 4.0,    // ANE is 3-4x more power efficient than GPU
            Self::HybridAne => 3.0, // Hybrid gets some efficiency benefits
            Self::Metal => 2.0,     // Apple Silicon GPU is efficient
            Self::Cuda => 1.0,      // NVIDIA uses more power
            Self::WebGPU => 1.5,    // Varies
            Self::CpuAvx512 => 1.2,
            Self::CpuAvx2 => 1.3,
            Self::CpuNeon => 1.5, // ARM is power efficient
            Self::CpuScalar => 1.0,
        }
    }
}
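
/// Illustrative sketch: blending the two ratings above into a single score.
/// The 0.5 battery weight is an arbitrary example value, not a crate default.
#[allow(dead_code)]
fn example_backend_score(backend: ComputeBackend, on_battery: bool) -> f32 {
    let w = if on_battery { 0.5 } else { 0.0 };
    (1.0 - w) * backend.relative_performance() + w * backend.power_efficiency()
}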

// =============================================================================
// Inference Configuration
// =============================================================================

/// Configuration generated by auto-detection
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceConfig {
    /// Selected compute backend
    pub compute_backend: ComputeBackend,
    /// Recommended quantization
    pub quantization: Quantization,
    /// Recommended batch size
    pub batch_size: usize,
    /// Recommended thread count for CPU inference
    pub thread_count: usize,
    /// Optimal block size for attention
    pub block_size: usize,
    /// Whether to use flash attention
    pub use_flash_attention: bool,
    /// Device type for the backend crate
    pub device_type: DeviceType,
    /// Data type for tensors
    pub dtype: DType,
}

impl Default for InferenceConfig {
    fn default() -> Self {
        Self::auto()
    }
}

impl InferenceConfig {
    /// Create an auto-configured inference config
    pub fn auto() -> Self {
        SystemCapabilities::detect().optimal_config()
    }

    /// Create a config optimized for low memory usage
    pub fn low_memory() -> Self {
        let mut config = Self::auto();
        config.quantization = Quantization::Q4K;
        config.batch_size = 1;
        config.block_size = 32;
        config
    }

    /// Create a config optimized for high throughput
    pub fn high_throughput() -> Self {
        let caps = SystemCapabilities::detect();
        let mut config = caps.optimal_config();

        // Increase batch size for throughput
        config.batch_size = (config.batch_size * 2).min(32);

        // Use larger blocks
        config.block_size = 128;

        config
    }

    /// Create a config optimized for low latency
    pub fn low_latency() -> Self {
        let mut config = Self::auto();

        // Use single batch for lowest latency
        config.batch_size = 1;

        // Smaller blocks reduce per-block overhead
        config.block_size = 32;

        // Use all threads for parallel decode
        let caps = SystemCapabilities::detect();
        config.thread_count = caps.cores.logical_cores;

        config
    }

    /// Get estimated tokens per second for this configuration
    pub fn estimated_tokens_per_second(&self) -> f32 {
        let base = match self.compute_backend {
            ComputeBackend::HybridAne => 90.0, // Hybrid can exceed pure Metal for suitable models
            ComputeBackend::Metal => 80.0,
            ComputeBackend::CoreML => 60.0, // ANE alone (great for small models)
            ComputeBackend::Cuda => 100.0,
            ComputeBackend::WebGPU => 40.0,
            ComputeBackend::CpuAvx512 => 30.0,
            ComputeBackend::CpuAvx2 => 20.0,
            ComputeBackend::CpuNeon => 20.0,
            ComputeBackend::CpuScalar => 5.0,
        };

        // Adjust for quantization
        let quant_factor = match self.quantization {
            Quantization::Q4 | Quantization::Q4K => 2.0, // 4-bit is very fast
            Quantization::Q8 => 1.5,
            Quantization::F16 | Quantization::Bf16 => 1.0,
            Quantization::None => 0.5,
            Quantization::Q2K => 2.5, // Most aggressive quantization
        };

        // Adjust for batch size (throughput scales sublinearly)
        let batch_factor = (self.batch_size as f32).sqrt();

        base * quant_factor * batch_factor
    }

    /// Create a config optimized for power efficiency (uses ANE when available)
    pub fn power_efficient() -> Self {
        let caps = SystemCapabilities::detect();
        let mut config = caps.optimal_config();

        // Override with power-efficient backend selection
        config.compute_backend = caps.select_power_efficient_backend();

        // Use smaller batches for better power efficiency
        config.batch_size = 1;

        config
    }
}
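
/// Illustrative sketch: printing the heuristic throughput estimate of each
/// preset on the current machine. These are rough estimates, not benchmarks.
#[allow(dead_code)]
fn example_compare_presets() {
    for (name, config) in [
        ("auto", InferenceConfig::auto()),
        ("low_memory", InferenceConfig::low_memory()),
        ("high_throughput", InferenceConfig::high_throughput()),
        ("low_latency", InferenceConfig::low_latency()),
    ] {
        println!("{name}: ~{:.0} tok/s", config.estimated_tokens_per_second());
    }
}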

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_platform_detection() {
        let platform = Platform::detect();

        #[cfg(target_os = "macos")]
        assert_eq!(platform, Platform::MacOS);

        #[cfg(target_os = "linux")]
        assert_eq!(platform, Platform::Linux);

        #[cfg(target_os = "windows")]
        assert_eq!(platform, Platform::Windows);
    }

    #[test]
    fn test_architecture_detection() {
        let arch = Architecture::detect();

        #[cfg(target_arch = "aarch64")]
        assert_eq!(arch, Architecture::Aarch64);

        #[cfg(target_arch = "x86_64")]
        assert_eq!(arch, Architecture::X86_64);
    }

    #[test]
    fn test_cpu_features_detection() {
        let features = CpuFeatures::detect();

        #[cfg(target_arch = "aarch64")]
        assert!(features.neon, "NEON should always be available on aarch64");

        // Supported architectures guarantee at least 128-bit SIMD
        #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
        assert!(
            features.best_simd_width() >= 128,
            "Should have at least 128-bit SIMD"
        );
    }

    #[test]
    fn test_system_capabilities_detect() {
        let caps = SystemCapabilities::detect();

        // Should always have at least 1 core
        assert!(caps.cores.physical_cores >= 1);
        assert!(caps.cores.logical_cores >= 1);

        // Should have some memory detected
        assert!(caps.memory_mb > 0, "Memory should be detected");

        // Platform and arch should match
        #[cfg(target_os = "macos")]
        assert_eq!(caps.platform, Platform::MacOS);

        #[cfg(target_arch = "aarch64")]
        assert_eq!(caps.arch, Architecture::Aarch64);
    }

    #[test]
    fn test_optimal_config() {
        let caps = SystemCapabilities::detect();
        let config = caps.optimal_config();

        // Config should have reasonable values
        assert!(config.batch_size >= 1);
        assert!(config.thread_count >= 1);
        assert!(config.block_size >= 16);

        // Backend should match platform capabilities
        #[cfg(all(target_os = "macos", feature = "metal-compute"))]
        {
            if caps.gpu.is_some() {
                assert_eq!(config.compute_backend, ComputeBackend::Metal);
            }
        }

        #[cfg(target_arch = "aarch64")]
        {
            if !config.compute_backend.is_gpu() {
                assert_eq!(config.compute_backend, ComputeBackend::CpuNeon);
            }
        }
    }

    #[test]
    fn test_inference_config_auto() {
        let config = InferenceConfig::auto();

        assert!(config.batch_size >= 1);
        assert!(config.thread_count >= 1);
        assert!(config.use_flash_attention);
    }

    #[test]
    fn test_inference_config_presets() {
        let low_mem = InferenceConfig::low_memory();
        let high_throughput = InferenceConfig::high_throughput();
        let low_latency = InferenceConfig::low_latency();

        // Low memory should use aggressive quantization
        assert!(matches!(
            low_mem.quantization,
            Quantization::Q4 | Quantization::Q4K | Quantization::Q2K
        ));
        assert_eq!(low_mem.batch_size, 1);

        // Low latency should use batch size 1
        assert_eq!(low_latency.batch_size, 1);

        // High throughput should have larger batch
        assert!(high_throughput.batch_size >= 2);
    }
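
    // Additional property sketch: the estimator scales with sqrt(batch_size)
    // (see `estimated_tokens_per_second`), so the batched preset should never
    // estimate below the single-batch preset. Assumes detection is
    // deterministic within a single run.
    #[test]
    fn test_preset_estimate_ordering_sketch() {
        let high_throughput = InferenceConfig::high_throughput();
        let low_latency = InferenceConfig::low_latency();

        assert!(
            high_throughput.estimated_tokens_per_second()
                >= low_latency.estimated_tokens_per_second(),
            "Batched preset should not estimate below the single-batch preset"
        );
    }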

    #[test]
    fn test_optimal_quantization() {
        let caps = SystemCapabilities::detect();

        // Small model should use higher precision
        let quant_small = caps.optimal_quantization(1.0);

        // Large model should use more aggressive quantization
        let quant_large = caps.optimal_quantization(70.0);

        // Large model quantization should save more memory
        assert!(
            quant_large.bytes_per_weight() <= quant_small.bytes_per_weight(),
            "Larger models should use more aggressive quantization"
        );
    }

    #[test]
    fn test_recommended_batch_size() {
        let caps = SystemCapabilities::detect();

        // Shorter sequences should allow larger batches
        let batch_short = caps.recommended_batch_size(512);
        let batch_long = caps.recommended_batch_size(8192);

        assert!(
            batch_short >= batch_long,
            "Shorter sequences should allow larger batches"
        );
    }

    #[test]
    fn test_can_run_model() {
        let caps = SystemCapabilities::detect();

        // Should be able to run a tiny model
        assert!(caps.can_run_model(0.1), "Should be able to run 100MB model");

        // A 1TB model should exceed any realistic host's memory
        assert!(
            !caps.can_run_model(1000.0),
            "Should not be able to run 1TB model"
        );
    }

    #[test]
    fn test_system_summary() {
        let caps = SystemCapabilities::detect();
        let summary = caps.summary();

        // Summary should contain platform info
        assert!(!summary.is_empty());
        assert!(summary.contains("cores") || summary.contains("RAM"));
    }

    #[test]
    fn test_compute_backend_properties() {
        assert!(ComputeBackend::Metal.is_gpu());
        assert!(ComputeBackend::Cuda.is_gpu());
        assert!(!ComputeBackend::CpuNeon.is_gpu());
        assert!(!ComputeBackend::CpuScalar.is_gpu());

        // GPU should have higher relative performance
        assert!(
            ComputeBackend::Metal.relative_performance()
                > ComputeBackend::CpuNeon.relative_performance()
        );
    }

    #[test]
    fn test_gpu_can_fit_model() {
        let gpu = GpuCapabilities {
            backend: GpuBackend::Metal,
            vram_mb: Some(16 * 1024), // 16GB
            compute_units: Some(128),
            name: Some("Test GPU".to_string()),
            supports_fp16: true,
            supports_int8: true,
            has_tensor_cores: true,
            max_shared_memory: Some(32 * 1024),
        };

        // 16GB should fit 7B model (needs ~10GB with overhead)
        assert!(gpu.can_fit_model(7.0));

        // 16GB should not fit 70B model (needs ~100GB)
        assert!(!gpu.can_fit_model(70.0));
    }
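
    // Back-of-envelope for the numbers above: 7B parameters at roughly one
    // byte per weight (Q8) is ~7 GB; KV cache and activations push that
    // toward the ~10 GB noted in the comment, so a 16 GB card fits. 70B at
    // the same footprint is ~70 GB before overhead, well past 16 GB. The
    // exact overhead model lives in `can_fit_model`.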

    #[test]
    fn test_core_info() {
        let cores = CoreInfo::detect();

        // Should have at least 1 core
        assert!(cores.physical_cores >= 1);
        assert!(cores.logical_cores >= 1);

        // Logical should be >= physical
        assert!(cores.logical_cores >= cores.physical_cores);

        // Recommended threads should be reasonable
        let recommended = cores.recommended_threads();
        assert!(recommended >= 1);
        assert!(recommended <= cores.logical_cores);
    }

    #[test]
    fn test_estimated_tokens_per_second() {
        let config = InferenceConfig::auto();
        let tps = config.estimated_tokens_per_second();

        // Should be positive
        assert!(tps > 0.0);

        // The low-latency preset trades batched throughput for latency;
        // its estimate should still be positive
        let low_latency = InferenceConfig::low_latency();
        let tps_low_latency = low_latency.estimated_tokens_per_second();
        assert!(tps_low_latency > 0.0);
    }
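
    // Sketch: vary only the quantization field on fresh auto() baselines and
    // check the estimator reflects the factor table above (Q4K > Q8 > F16).
    // Assumes detection is deterministic within a single run.
    #[test]
    fn test_estimate_quantization_ordering_sketch() {
        let mut q4 = InferenceConfig::auto();
        q4.quantization = Quantization::Q4K;
        let mut q8 = InferenceConfig::auto();
        q8.quantization = Quantization::Q8;
        let mut f16 = InferenceConfig::auto();
        f16.quantization = Quantization::F16;

        assert!(q4.estimated_tokens_per_second() > q8.estimated_tokens_per_second());
        assert!(q8.estimated_tokens_per_second() > f16.estimated_tokens_per_second());
    }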

    // =========================================================================
    // ANE (Apple Neural Engine) Tests
    // =========================================================================

    #[test]
    fn test_ane_info_detect() {
        let ane = AneInfo::detect();

        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            assert!(ane.available, "ANE should be available on Apple Silicon");
            assert!(ane.tops > 0.0, "ANE TOPS should be positive");
            assert!(
                ane.max_model_size_mb > 0,
                "ANE max model size should be positive"
            );
            assert!(
                !ane.supported_ops.is_empty(),
                "ANE should have supported ops"
            );
        }

        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
        {
            assert!(
                !ane.available,
                "ANE should not be available on non-Apple Silicon"
            );
        }
    }

    #[test]
    fn test_ane_model_suitability() {
        let ane = AneInfo {
            available: true,
            tops: 38.0,
            max_model_size_mb: 2048,
            supported_ops: vec!["MatMul".to_string()],
        };

        // Small model should be suitable
        assert!(ane.is_model_suitable(500));
        assert!(ane.is_model_suitable(2048));

        // Large model should not be suitable
        assert!(!ane.is_model_suitable(4096));
        assert!(!ane.is_model_suitable(8192));
    }

    #[test]
    fn test_ane_strategy_recommendation() {
        let ane = AneInfo {
            available: true,
            tops: 38.0,
            max_model_size_mb: 2048,
            supported_ops: vec!["MatMul".to_string()],
        };

        // Small model: ANE only
        assert_eq!(ane.recommended_strategy(300), AneStrategy::AneOnly);

        // Medium model: Hybrid
        assert_eq!(ane.recommended_strategy(1000), AneStrategy::Hybrid);

        // Large model: GPU only
        assert_eq!(ane.recommended_strategy(4000), AneStrategy::GpuOnly);
    }
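
    // The exact size cutoffs are internal to `recommended_strategy`; these
    // assertions only pin the ordering: small models run ANE-only, mid-sized
    // models that still fit under max_model_size_mb go Hybrid, and anything
    // past the cap falls back to GpuOnly.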

    #[test]
    fn test_ane_strategy_unavailable() {
        let ane = AneInfo {
            available: false,
            tops: 0.0,
            max_model_size_mb: 0,
            supported_ops: vec![],
        };

        // All sizes should recommend GPU when ANE unavailable
        assert_eq!(ane.recommended_strategy(100), AneStrategy::GpuOnly);
        assert_eq!(ane.recommended_strategy(1000), AneStrategy::GpuOnly);
        assert_eq!(ane.recommended_strategy(10000), AneStrategy::GpuOnly);
    }

    #[test]
    fn test_compute_backend_ane_properties() {
        // CoreML and HybridAne should use ANE
        assert!(ComputeBackend::CoreML.uses_ane());
        assert!(ComputeBackend::HybridAne.uses_ane());

        // Other backends should not use ANE
        assert!(!ComputeBackend::Metal.uses_ane());
        assert!(!ComputeBackend::Cuda.uses_ane());
        assert!(!ComputeBackend::CpuNeon.uses_ane());

        // ANE backends should be considered GPU/accelerator
        assert!(ComputeBackend::CoreML.is_gpu());
        assert!(ComputeBackend::HybridAne.is_gpu());
    }

    #[test]
    fn test_compute_backend_power_efficiency() {
        // ANE should have highest power efficiency
        assert!(
            ComputeBackend::CoreML.power_efficiency() > ComputeBackend::Metal.power_efficiency(),
            "CoreML should be more power efficient than Metal"
        );
        assert!(
            ComputeBackend::HybridAne.power_efficiency() > ComputeBackend::Metal.power_efficiency(),
            "HybridAne should be more power efficient than Metal"
        );
    }

    #[test]
    fn test_system_capabilities_includes_ane() {
        let caps = SystemCapabilities::detect();

        // ANE info should be populated
        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            assert!(caps.ane.available);
            // Summary should mention ANE
            let summary = caps.summary();
            assert!(summary.contains("ANE"), "Summary should include ANE info");
        }
    }

    #[test]
    fn test_ane_summary() {
        let caps = SystemCapabilities::detect();
        let ane_summary = caps.ane_summary();

        #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
        {
            assert!(ane_summary.contains("TOPS"));
            assert!(ane_summary.contains("supported ops"));
        }

        #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
        {
            assert!(ane_summary.contains("Not available"));
        }
    }

    #[test]
    fn test_power_efficient_config() {
        let config = InferenceConfig::power_efficient();

        // Power efficient config should use batch size 1
        assert_eq!(config.batch_size, 1);

        // On Apple Silicon with coreml feature, should prefer ANE
        #[cfg(all(target_os = "macos", target_arch = "aarch64", feature = "coreml"))]
        {
            assert!(
                config.compute_backend.uses_ane(),
                "Power efficient config should use ANE on Apple Silicon"
            );
        }
    }

    #[test]
    fn test_select_compute_backend_for_model_size() {
        let caps = SystemCapabilities::detect();

        // Different model sizes should potentially get different backends
        let _small_backend = caps.select_compute_backend_for_model(500.0);
        let _medium_backend = caps.select_compute_backend_for_model(2000.0);
        let _large_backend = caps.select_compute_backend_for_model(10000.0);

        // Only exercises the selection paths without panicking; the concrete
        // backends depend on platform and feature flags, so no assertions are made.
    }
}