ai-hwaccel 1.1.0

//! Hardware detection: probes sysfs, /dev, and PATH tools to discover accelerators.

#[cfg(feature = "amd-xdna")]
pub(crate) mod amd_xdna;
#[cfg(feature = "apple")]
pub(crate) mod apple;
pub mod bandwidth;
#[cfg(feature = "cerebras")]
pub(crate) mod cerebras;
pub(crate) mod command;
#[cfg(feature = "cuda")]
pub mod cuda;
pub(crate) mod disk;
pub(crate) mod environment;
#[cfg(feature = "gaudi")]
pub mod gaudi;
#[cfg(feature = "graphcore")]
pub(crate) mod graphcore;
#[cfg(feature = "groq")]
pub(crate) mod groq;
#[cfg(feature = "intel-npu")]
pub(crate) mod intel_npu;
#[cfg(feature = "intel-oneapi")]
pub(crate) mod intel_oneapi;
pub mod interconnect;
#[cfg(feature = "mediatek-apu")]
pub(crate) mod mediatek_apu;
#[cfg(feature = "aws-neuron")]
pub(crate) mod neuron;
pub(crate) mod numa;
pub mod pcie;
#[cfg(feature = "qualcomm")]
pub(crate) mod qualcomm;
#[cfg(feature = "rocm")]
pub(crate) mod rocm;
#[cfg(feature = "samsung-npu")]
pub(crate) mod samsung_npu;
#[cfg(feature = "tpu")]
pub(crate) mod tpu;
#[cfg(feature = "vulkan")]
pub mod vulkan;
#[cfg(feature = "windows-wmi")]
pub(crate) mod windows;

pub mod platform;

use std::collections::HashMap;
use std::path::Path;
use std::time::{Duration, Instant};

use tracing::debug;

use crate::error::DetectionError;
use crate::hardware::AcceleratorType;
use crate::profile::AcceleratorProfile;
use crate::registry::{AcceleratorRegistry, Backend, DetectBuilder};
use crate::system_io::SystemIo;

/// Per-backend detection result: the profiles a backend discovered, paired
/// with the [`DetectionError`] warnings it collected along the way.
type DetectResult = (Vec<AcceleratorProfile>, Vec<DetectionError>);

/// Per-backend detection result with timing: profiles, collected warnings,
/// and the [`Duration`] the backend's detect function took to run.
type TimedDetectResult = (Vec<AcceleratorProfile>, Vec<DetectionError>, Duration);

// ---------------------------------------------------------------------------
// Backend registration table — single source of truth
// ---------------------------------------------------------------------------

/// Invokes `$callback!(feature, Backend, name, sync_detect_fn)` for every backend.
///
/// Arguments handed to the callback, in order:
/// - `feature`: cargo feature name as a string literal (hyphenated, e.g.
///   `"intel-npu"`); the callbacks in this file put it in `#[cfg(feature = ...)]`.
/// - `Backend`: the enum variant checked against the builder's selection.
/// - `name`: string literal (underscored, e.g. `"intel_npu"`) used as the key
///   in the per-backend timing map.
/// - `sync_detect_fn`: detect function, called as `f(&mut profiles, &mut warnings)`.
///
/// Adding a new backend? Add one line here and the corresponding `mod` + feature
/// declarations. The sync and timed dispatch code expands from this table. The
/// async path expands from `async_cli_backends!` and `sysfs_backends!` instead,
/// so a new backend must also be listed in exactly one of those.
macro_rules! backend_table {
    ($callback:ident) => {
        // Entries expand in this order, which is the order backends are
        // spawned (threaded path) or run (sequential path).
        $callback!("cuda", Backend::Cuda, "cuda", cuda::detect_cuda);
        $callback!("rocm", Backend::Rocm, "rocm", rocm::detect_rocm);
        $callback!(
            "apple",
            Backend::Apple,
            "apple",
            apple::detect_metal_and_ane
        );
        $callback!("vulkan", Backend::Vulkan, "vulkan", vulkan::detect_vulkan);
        $callback!(
            "intel-npu",
            Backend::IntelNpu,
            "intel_npu",
            intel_npu::detect_intel_npu
        );
        $callback!(
            "amd-xdna",
            Backend::AmdXdna,
            "amd_xdna",
            amd_xdna::detect_amd_xdna
        );
        $callback!("tpu", Backend::Tpu, "tpu", tpu::detect_tpu);
        $callback!("gaudi", Backend::Gaudi, "gaudi", gaudi::detect_gaudi);
        $callback!(
            "aws-neuron",
            Backend::AwsNeuron,
            "aws_neuron",
            neuron::detect_aws_neuron
        );
        $callback!(
            "intel-oneapi",
            Backend::IntelOneApi,
            "intel_oneapi",
            intel_oneapi::detect_intel_oneapi
        );
        $callback!(
            "qualcomm",
            Backend::Qualcomm,
            "qualcomm",
            qualcomm::detect_qualcomm_ai100
        );
        $callback!(
            "cerebras",
            Backend::Cerebras,
            "cerebras",
            cerebras::detect_cerebras_wse
        );
        $callback!(
            "graphcore",
            Backend::Graphcore,
            "graphcore",
            graphcore::detect_graphcore_ipu
        );
        $callback!("groq", Backend::Groq, "groq", groq::detect_groq_lpu);
        $callback!(
            "samsung-npu",
            Backend::SamsungNpu,
            "samsung_npu",
            samsung_npu::detect_samsung_npu
        );
        $callback!(
            "mediatek-apu",
            Backend::MediaTekApu,
            "mediatek_apu",
            mediatek_apu::detect_mediatek_apu
        );
        $callback!(
            "windows-wmi",
            Backend::WindowsWmi,
            "windows_wmi",
            windows::detect_windows_gpu
        );
    };
}

/// Async CLI backends — those with `detect_*_async` functions.
/// Invokes `$callback!(feature, Backend, async_detect_fn)` per entry.
///
/// The async orchestrator in this file calls each `async_detect_fn` with no
/// arguments and spawns the returned future as its own tokio task; the task's
/// output is collected as a `DetectResult`.
///
/// Every backend in `backend_table!` appears either here or in
/// `sysfs_backends!`, never both.
#[cfg(feature = "async-detect")]
macro_rules! async_cli_backends {
    ($callback:ident) => {
        $callback!("cuda", Backend::Cuda, cuda::detect_cuda_async);
        $callback!("vulkan", Backend::Vulkan, vulkan::detect_vulkan_async);
        $callback!("gaudi", Backend::Gaudi, gaudi::detect_gaudi_async);
        $callback!(
            "aws-neuron",
            Backend::AwsNeuron,
            neuron::detect_aws_neuron_async
        );
        $callback!("apple", Backend::Apple, apple::detect_metal_and_ane_async);
        $callback!(
            "intel-oneapi",
            Backend::IntelOneApi,
            intel_oneapi::detect_intel_oneapi_async
        );
    };
}

/// Sysfs-only backends for async path (everything not in `async_cli_backends`).
/// Invokes `$callback!(feature, Backend, sync_detect_fn)` per entry.
///
/// These are the same synchronous detect functions listed in `backend_table!`.
/// The async orchestrator runs them one after another, in this order, inside a
/// single `spawn_blocking` task, calling each as
/// `f(&mut profiles, &mut warnings)`.
///
/// Together with `async_cli_backends!` this list must cover every entry of
/// `backend_table!`; a backend missing from both is skipped by async detection.
#[cfg(feature = "async-detect")]
macro_rules! sysfs_backends {
    ($callback:ident) => {
        $callback!("rocm", Backend::Rocm, rocm::detect_rocm);
        $callback!("intel-npu", Backend::IntelNpu, intel_npu::detect_intel_npu);
        $callback!("amd-xdna", Backend::AmdXdna, amd_xdna::detect_amd_xdna);
        $callback!("tpu", Backend::Tpu, tpu::detect_tpu);
        $callback!(
            "qualcomm",
            Backend::Qualcomm,
            qualcomm::detect_qualcomm_ai100
        );
        $callback!("cerebras", Backend::Cerebras, cerebras::detect_cerebras_wse);
        $callback!(
            "graphcore",
            Backend::Graphcore,
            graphcore::detect_graphcore_ipu
        );
        $callback!("groq", Backend::Groq, groq::detect_groq_lpu);
        $callback!(
            "samsung-npu",
            Backend::SamsungNpu,
            samsung_npu::detect_samsung_npu
        );
        $callback!(
            "mediatek-apu",
            Backend::MediaTekApu,
            mediatek_apu::detect_mediatek_apu
        );
        // NOTE(review): grouped here because it has no async variant; the
        // "sysfs" label presumably does not apply literally to this backend.
        $callback!(
            "windows-wmi",
            Backend::WindowsWmi,
            windows::detect_windows_gpu
        );
    };
}

/// Detection results with per-backend timing information.
///
/// Returned by [`AcceleratorRegistry::detect_with_timing`].
#[derive(Debug, Clone)]
pub struct TimedDetection {
    /// The registry with all detected hardware.
    pub registry: AcceleratorRegistry,
    /// Per-backend detection duration, keyed by backend name (e.g. `"cuda"`,
    /// `"intel_npu"`).
    ///
    /// Besides backend names, the timed path records three extra keys:
    /// `"vulkan_sysfs"` (present only when the sysfs Vulkan fallback ran),
    /// `"_enrich"` (bandwidth, PCIe and NUMA post-passes) and `"_system_io"`
    /// (interconnect, storage and environment detection).
    ///
    /// When backends run on parallel threads their durations overlap, so the
    /// entries are not expected to sum to [`total`](Self::total).
    pub timings: HashMap<String, Duration>,
    /// Total wall-clock detection time.
    pub total: Duration,
}

impl AcceleratorRegistry {
    /// Probes the system for all available accelerators.
    ///
    /// Runs detection with the backend selection of `DetectBuilder::new()`.
    ///
    /// Detection is best-effort: missing tools or sysfs entries simply mean
    /// the corresponding accelerator is not registered. Non-fatal issues are
    /// collected in [`AcceleratorRegistry::warnings`].
    ///
    /// When two or more backends are enabled they run **in parallel** via
    /// [`std::thread::scope`] for lower wall-clock latency on systems with
    /// multiple CLI tools; with fewer, detection runs sequentially on the
    /// calling thread.
    ///
    /// Backends can be disabled at compile time via cargo features
    /// (e.g. `default-features = false, features = ["cuda", "tpu"]`).
    pub fn detect() -> Self {
        detect_with_builder(DetectBuilder::new())
    }

    /// Like [`detect`](Self::detect), but also returns per-backend timing.
    ///
    /// Useful for diagnosing slow backends. The `timings` map contains
    /// backend names (e.g. `"cuda"`, `"vulkan"`) and how long each took,
    /// plus entries for the post-detection passes (see
    /// [`TimedDetection::timings`]).
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use ai_hwaccel::AcceleratorRegistry;
    ///
    /// let result = AcceleratorRegistry::detect_with_timing();
    /// for (backend, duration) in &result.timings {
    ///     println!("{}: {:.1}ms", backend, duration.as_secs_f64() * 1000.0);
    /// }
    /// ```
    pub fn detect_with_timing() -> TimedDetection {
        detect_with_builder_timed(DetectBuilder::new())
    }
}

/// Run detection with a builder's backend selection.
///
/// When 2+ backends are enabled, runs them in parallel via `std::thread::scope`.
/// When 0-1 are enabled, runs sequentially to avoid thread spawn overhead.
pub(crate) fn detect_with_builder(builder: DetectBuilder) -> AcceleratorRegistry {
    // Pre-allocate for typical system: 1 CPU + up to 8 accelerators.
    let mut all_profiles = Vec::with_capacity(8);
    all_profiles.push(cpu_profile());
    let mut all_warnings: Vec<DetectionError> = Vec::new();

    let use_threads = builder.enabled_count() >= 2;

    if use_threads {
        std::thread::scope(|s| {
            let mut handles: Vec<std::thread::ScopedJoinHandle<'_, DetectResult>> = Vec::new();

            macro_rules! do_spawn {
                ($feature:literal, $backend:expr, $name:literal, $detect_fn:expr) => {
                    #[cfg(feature = $feature)]
                    if builder.backend_enabled($backend) {
                        handles.push(s.spawn(|| {
                            let mut p = Vec::new();
                            let mut w = Vec::new();
                            $detect_fn(&mut p, &mut w);
                            (p, w)
                        }));
                    }
                };
            }
            backend_table!(do_spawn);

            for handle in handles {
                if let Ok((profiles, warnings)) = handle.join() {
                    all_profiles.extend(profiles);
                    all_warnings.extend(warnings);
                }
            }
        });
    } else {
        macro_rules! do_run {
            ($feature:literal, $backend:expr, $name:literal, $detect_fn:expr) => {
                #[cfg(feature = $feature)]
                if builder.backend_enabled($backend) {
                    $detect_fn(&mut all_profiles, &mut all_warnings);
                }
            };
        }
        backend_table!(do_run);
    }

    // Post-pass: if vulkaninfo found no Vulkan devices, try sysfs fallback.
    #[cfg(feature = "vulkan")]
    {
        let has_vulkan = all_profiles
            .iter()
            .any(|p| matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
        let has_dedicated = all_profiles.iter().any(|p| {
            matches!(
                p.accelerator,
                AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
            )
        });
        if !has_vulkan && !has_dedicated && builder.backend_enabled(Backend::Vulkan) {
            vulkan::detect_vulkan_sysfs(&mut all_profiles, &mut all_warnings);
        }
    }

    // Post-pass: remove Vulkan GPUs if a dedicated CUDA or ROCm GPU was found.
    let has_dedicated = all_profiles.iter().any(|p| {
        matches!(
            p.accelerator,
            AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
        )
    });
    if has_dedicated {
        all_profiles.retain(|p| !matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
    }

    // Post-pass: enrich profiles with memory bandwidth, PCIe, and NUMA.
    // Compute PCI address lists once, shared between PCIe and NUMA passes.
    bandwidth::enrich_bandwidth(&mut all_profiles, &mut all_warnings);
    let nvidia_pci = list_driver_pci_addrs("nvidia");
    let amdgpu_pci = list_driver_pci_addrs("amdgpu");
    pcie::enrich_pcie(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
    numa::enrich_numa(&mut all_profiles, &nvidia_pci, &amdgpu_pci);

    // Detect system-level I/O: interconnects and storage.
    let system_interconnects = interconnect::detect_interconnects(&mut all_warnings);
    let system_storage = disk::detect_storage();
    let system_environment = environment::detect_environment();
    let system_io = SystemIo {
        interconnects: system_interconnects,
        storage: system_storage,
        environment: Some(system_environment),
    };

    debug!(
        count = all_profiles.len(),
        warnings = all_warnings.len(),
        interconnects = system_io.interconnects.len(),
        storage_devices = system_io.storage.len(),
        "accelerator detection complete"
    );
    AcceleratorRegistry {
        schema_version: crate::registry::SCHEMA_VERSION,
        profiles: all_profiles,
        warnings: all_warnings,
        system_io,
    }
}

/// Run detection with timing information per backend.
pub(crate) fn detect_with_builder_timed(builder: DetectBuilder) -> TimedDetection {
    let wall_start = Instant::now();
    let mut all_profiles = Vec::with_capacity(8);
    all_profiles.push(cpu_profile());
    let mut all_warnings: Vec<DetectionError> = Vec::new();
    let mut timings: HashMap<String, Duration> = HashMap::new();

    let use_threads = builder.enabled_count() >= 2;

    if use_threads {
        std::thread::scope(|s| {
            let mut handles: Vec<(&str, std::thread::ScopedJoinHandle<'_, TimedDetectResult>)> =
                Vec::new();

            macro_rules! do_spawn_timed {
                ($feature:literal, $backend:expr, $name:literal, $detect_fn:expr) => {
                    #[cfg(feature = $feature)]
                    if builder.backend_enabled($backend) {
                        handles.push((
                            $name,
                            s.spawn(|| {
                                let start = Instant::now();
                                let mut p = Vec::new();
                                let mut w = Vec::new();
                                $detect_fn(&mut p, &mut w);
                                (p, w, start.elapsed())
                            }),
                        ));
                    }
                };
            }
            backend_table!(do_spawn_timed);

            for (name, handle) in handles {
                if let Ok((profiles, warnings, duration)) = handle.join() {
                    all_profiles.extend(profiles);
                    all_warnings.extend(warnings);
                    timings.insert(name.into(), duration);
                }
            }
        });
    } else {
        macro_rules! do_run_timed {
            ($feature:literal, $backend:expr, $name:literal, $detect_fn:expr) => {
                #[cfg(feature = $feature)]
                if builder.backend_enabled($backend) {
                    let start = Instant::now();
                    $detect_fn(&mut all_profiles, &mut all_warnings);
                    timings.insert($name.into(), start.elapsed());
                }
            };
        }
        backend_table!(do_run_timed);
    }

    // Post-pass: sysfs Vulkan fallback (same as detect_with_builder).
    #[cfg(feature = "vulkan")]
    {
        let has_vulkan = all_profiles
            .iter()
            .any(|p| matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
        let has_dedicated = all_profiles.iter().any(|p| {
            matches!(
                p.accelerator,
                AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
            )
        });
        if !has_vulkan && !has_dedicated && builder.backend_enabled(Backend::Vulkan) {
            let start = Instant::now();
            vulkan::detect_vulkan_sysfs(&mut all_profiles, &mut all_warnings);
            timings.insert("vulkan_sysfs".into(), start.elapsed());
        }
    }

    // Post-pass: remove Vulkan GPUs if a dedicated CUDA or ROCm GPU was found.
    let has_dedicated = all_profiles.iter().any(|p| {
        matches!(
            p.accelerator,
            AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
        )
    });
    if has_dedicated {
        all_profiles.retain(|p| !matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
    }

    let enrich_start = Instant::now();
    bandwidth::enrich_bandwidth(&mut all_profiles, &mut all_warnings);
    let nvidia_pci = list_driver_pci_addrs("nvidia");
    let amdgpu_pci = list_driver_pci_addrs("amdgpu");
    pcie::enrich_pcie(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
    numa::enrich_numa(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
    timings.insert("_enrich".into(), enrich_start.elapsed());

    let sysio_start = Instant::now();
    let system_interconnects = interconnect::detect_interconnects(&mut all_warnings);
    let system_storage = disk::detect_storage();
    let system_environment = environment::detect_environment();
    let system_io = SystemIo {
        interconnects: system_interconnects,
        storage: system_storage,
        environment: Some(system_environment),
    };
    timings.insert("_system_io".into(), sysio_start.elapsed());

    let registry = AcceleratorRegistry {
        schema_version: crate::registry::SCHEMA_VERSION,
        profiles: all_profiles,
        warnings: all_warnings,
        system_io,
    };

    TimedDetection {
        registry,
        timings,
        total: wall_start.elapsed(),
    }
}

// ---------------------------------------------------------------------------
// Shared helpers
// ---------------------------------------------------------------------------

/// Collect the PCI addresses of devices bound to `driver`, sorted ascending.
///
/// Scans the entries of `/sys/bus/pci/drivers/<driver>` and keeps only names
/// shaped like a PCI address (e.g. `0000:01:00.0`): at least one `:` and one
/// `.`, and nothing but hex digits, colons and dots. A missing or unreadable
/// driver directory yields an empty list.
pub(super) fn list_driver_pci_addrs(driver: &str) -> Vec<String> {
    // Single pass over the name: reject on the first foreign byte, and
    // remember whether both separators were seen.
    fn looks_like_pci_addr(bytes: &[u8]) -> bool {
        let mut saw_colon = false;
        let mut saw_dot = false;
        for &byte in bytes {
            match byte {
                b':' => saw_colon = true,
                b'.' => saw_dot = true,
                other if other.is_ascii_hexdigit() => {}
                _ => return false,
            }
        }
        saw_colon && saw_dot
    }

    let driver_dir = Path::new("/sys/bus/pci/drivers").join(driver);
    let entries = match driver_dir
        .exists()
        .then(|| std::fs::read_dir(&driver_dir))
    {
        Some(Ok(entries)) => entries,
        _ => return Vec::new(),
    };

    let mut addrs: Vec<String> = Vec::new();
    // Entries that fail to read are skipped rather than aborting the scan.
    for entry in entries.flatten() {
        let file_name = entry.file_name();
        if looks_like_pci_addr(file_name.as_encoded_bytes()) {
            addrs.push(file_name.to_string_lossy().into_owned());
        }
    }
    addrs.sort();
    addrs
}

/// Iterate over the numeric ids of `/dev` nodes named `<prefix><digits>`.
///
/// For example, `iter_dev_devices("neuron")` yields `0` for `/dev/neuron0`,
/// `1` for `/dev/neuron1`, and so on, in directory-listing order. Names with
/// an empty or non-digit suffix, or a number that does not fit in `u32`, are
/// skipped. An unreadable `/dev` produces an empty iterator.
pub(super) fn iter_dev_devices(prefix: &str) -> impl Iterator<Item = u32> + '_ {
    // Extract the id from one entry name, or reject the entry.
    fn device_id(file_name: &str, prefix: &str) -> Option<u32> {
        let digits = file_name.strip_prefix(prefix)?;
        // Explicit digit check: `parse` alone would also accept a leading `+`.
        let numeric = !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit());
        if numeric { digits.parse().ok() } else { None }
    }

    let dev_entries = std::fs::read_dir("/dev").ok();
    dev_entries
        .into_iter()
        .flatten()
        .filter_map(Result::ok)
        .filter_map(move |entry| device_id(&entry.file_name().to_string_lossy(), prefix))
}

/// Report whether `/dev` contains any entry whose name begins with `prefix`.
///
/// The suffix is unconstrained, so `has_dev_device("groq")` is true as soon as
/// any `/dev/groq*` node exists. An unreadable `/dev` counts as "no match".
pub(super) fn has_dev_device(prefix: &str) -> bool {
    let Ok(entries) = std::fs::read_dir("/dev") else {
        return false;
    };
    // Entries that fail to read are skipped, not treated as errors.
    for entry in entries.flatten() {
        let file_name = entry.file_name();
        if file_name.to_string_lossy().starts_with(prefix) {
            return true;
        }
    }
    false
}

/// Build a default CPU profile with detected system memory.
pub(crate) fn cpu_profile() -> AcceleratorProfile {
    AcceleratorProfile {
        accelerator: AcceleratorType::Cpu,
        available: true,
        memory_bytes: detect_cpu_memory(),
        ..Default::default()
    }
}

/// System memory from /proc/meminfo (fallback: 16 GiB).
pub(crate) fn detect_cpu_memory() -> u64 {
    if let Some(info) = read_sysfs_string(std::path::Path::new("/proc/meminfo"), 64 * 1024) {
        for line in info.lines() {
            if line.starts_with("MemTotal:")
                && let Some(kb_str) = line.split_whitespace().nth(1)
                && let Ok(kb) = kb_str.parse::<u64>()
            {
                return kb.saturating_mul(1024);
            }
        }
    }
    // macOS fallback via safe command runner (absolute path, timeout).
    if let Ok(output) = command::run_tool("sysctl", &["-n", "hw.memsize"], command::DEFAULT_TIMEOUT)
        && let Ok(bytes) = output.stdout.trim().parse::<u64>()
    {
        return bytes;
    }
    debug!("could not read system memory, defaulting to 16 GiB");
    16 * 1024 * 1024 * 1024
}

/// Parse the contents of a sysfs file as a decimal `u64`.
///
/// At most 64 bytes are read. Surrounding whitespace (such as the trailing
/// newline) is ignored. Returns `None` if the file cannot be read, is longer
/// than 64 bytes, or does not hold a valid number.
pub(super) fn read_sysfs_u64(path: &Path) -> Option<u64> {
    let raw = read_sysfs_string(path, 64)?;
    raw.trim().parse::<u64>().ok()
}

/// Read a string from a sysfs file, capped at `max_bytes` to prevent DoS.
///
/// Sysfs pseudo-files report `st_size = 4096` regardless of actual content,
/// so we can't use metadata for size checking. Instead, we read up to
/// `max_bytes + 1` bytes and discard the result if the file turned out to be
/// longer than `max_bytes`.
///
/// The file is read until EOF (or the cap), not with a single `read` call:
/// one `read` may legitimately return fewer bytes than are available, which
/// would make truncated content look complete.
///
/// Returns `None` when the file cannot be opened or read, when its content
/// exceeds `max_bytes`, or when the content is not valid UTF-8.
pub(super) fn read_sysfs_string(path: &Path, max_bytes: usize) -> Option<String> {
    use std::io::Read;

    // Upper bound on the initial allocation; larger content grows the buffer.
    const INITIAL_CAPACITY: usize = 512;

    let file = std::fs::File::open(path).ok()?;

    // One byte past the cap is enough to tell "exactly max_bytes" apart from
    // "too long". `usize` is at most 64 bits wide, so the cast is lossless,
    // and the saturating add keeps `usize::MAX` from overflowing.
    let limit = (max_bytes as u64).saturating_add(1);

    let mut buf = Vec::with_capacity(max_bytes.min(INITIAL_CAPACITY));
    file.take(limit).read_to_end(&mut buf).ok()?;
    if buf.len() > max_bytes {
        return None;
    }
    String::from_utf8(buf).ok()
}

// ---------------------------------------------------------------------------
// True async detection (requires `async-detect` feature)
// ---------------------------------------------------------------------------

/// Async detection orchestrator using `tokio::process::Command`.
///
/// CLI backends run as concurrent tokio tasks with true async subprocess I/O.
/// Sysfs-only backends run in a single `spawn_blocking` task since they are
/// fast filesystem reads. Post-passes (bandwidth, PCIe, NUMA) run after all
/// backends complete.
///
/// The returned registry starts with the CPU profile, followed by the results
/// of the async CLI backends (in `async_cli_backends!` order) and then the
/// sysfs backends. The same Vulkan fallback and Vulkan de-duplication rules as
/// the synchronous path are applied afterwards.
///
/// A task that fails to join (panic or cancellation) contributes nothing; the
/// failure is logged at `warn` level and detection carries on.
#[cfg(feature = "async-detect")]
pub(crate) async fn detect_with_builder_async(builder: DetectBuilder) -> AcceleratorRegistry {
    // True when a dedicated CUDA or ROCm GPU is among the profiles.
    fn has_dedicated_gpu(profiles: &[AcceleratorProfile]) -> bool {
        profiles.iter().any(|p| {
            matches!(
                p.accelerator,
                AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
            )
        })
    }

    let mut all_profiles = vec![cpu_profile()];
    let mut all_warnings: Vec<DetectionError> = Vec::new();

    debug!(
        backends = builder.enabled_count(),
        "starting async detection"
    );

    // Spawn async CLI backends as concurrent tokio tasks. The feature name is
    // kept next to each handle so a failed task can be attributed.
    let mut handles: Vec<(&str, tokio::task::JoinHandle<DetectResult>)> = Vec::new();

    macro_rules! do_spawn_async {
        ($feature:literal, $backend:expr, $detect_fn:path) => {
            #[cfg(feature = $feature)]
            if builder.backend_enabled($backend) {
                handles.push(($feature, tokio::spawn($detect_fn())));
            }
        };
    }
    async_cli_backends!(do_spawn_async);

    // Sysfs-only backends run in a single blocking task.
    let sysfs_builder = builder.clone();
    let sysfs_handle = tokio::task::spawn_blocking(move || {
        let mut profiles = Vec::new();
        let mut warnings: Vec<DetectionError> = Vec::new();

        macro_rules! do_run_sysfs {
            ($feature:literal, $backend:expr, $detect_fn:expr) => {
                #[cfg(feature = $feature)]
                if sysfs_builder.backend_enabled($backend) {
                    $detect_fn(&mut profiles, &mut warnings);
                }
            };
        }
        sysfs_backends!(do_run_sysfs);

        (profiles, warnings)
    });

    // Collect async CLI results. The tasks are already running concurrently;
    // awaiting them in order only fixes the order of the merged output.
    for (name, handle) in handles {
        match handle.await {
            Ok((profiles, warnings)) => {
                all_profiles.extend(profiles);
                all_warnings.extend(warnings);
            }
            Err(err) => {
                tracing::warn!(
                    backend = name,
                    error = %err,
                    "async detection task failed; its results were discarded"
                );
            }
        }
    }

    // Collect sysfs results.
    match sysfs_handle.await {
        Ok((profiles, warnings)) => {
            all_profiles.extend(profiles);
            all_warnings.extend(warnings);
        }
        Err(err) => {
            tracing::warn!(
                error = %err,
                "sysfs detection task failed; its results were discarded"
            );
        }
    }

    // Post-pass: sysfs Vulkan fallback.
    #[cfg(feature = "vulkan")]
    {
        let has_vulkan = all_profiles
            .iter()
            .any(|p| matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
        if !has_vulkan
            && !has_dedicated_gpu(&all_profiles)
            && builder.backend_enabled(Backend::Vulkan)
        {
            vulkan::detect_vulkan_sysfs(&mut all_profiles, &mut all_warnings);
        }
    }

    // Post-pass: remove Vulkan GPUs if a dedicated CUDA or ROCm GPU was found.
    // Re-evaluated here because the fallback above may have changed the list.
    if has_dedicated_gpu(&all_profiles) {
        all_profiles.retain(|p| !matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
    }

    // Post-pass: enrich with bandwidth (async), PCIe, NUMA.
    bandwidth::enrich_bandwidth_async(&mut all_profiles, &mut all_warnings).await;
    let nvidia_pci = list_driver_pci_addrs("nvidia");
    let amdgpu_pci = list_driver_pci_addrs("amdgpu");
    pcie::enrich_pcie(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
    numa::enrich_numa(&mut all_profiles, &nvidia_pci, &amdgpu_pci);

    // System I/O: async interconnects + blocking storage.
    let (system_interconnects, ic_warnings) = interconnect::detect_interconnects_async().await;
    all_warnings.extend(ic_warnings);

    let system_storage = match tokio::task::spawn_blocking(disk::detect_storage).await {
        Ok(storage) => storage,
        Err(err) => {
            // Fall back to the default (empty) storage list, as before, but
            // record why it is empty.
            tracing::warn!(
                error = %err,
                "storage detection task failed; reporting no storage devices"
            );
            Default::default()
        }
    };

    let system_environment = environment::detect_environment();
    let system_io = SystemIo {
        interconnects: system_interconnects,
        storage: system_storage,
        environment: Some(system_environment),
    };

    debug!(
        count = all_profiles.len(),
        warnings = all_warnings.len(),
        interconnects = system_io.interconnects.len(),
        storage_devices = system_io.storage.len(),
        "async accelerator detection complete"
    );
    AcceleratorRegistry {
        schema_version: crate::registry::SCHEMA_VERSION,
        profiles: all_profiles,
        warnings: all_warnings,
        system_io,
    }
}