ai_hwaccel/detect/
mod.rs

1//! Hardware detection: probes sysfs, /dev, and PATH tools to discover accelerators.
2
3#[cfg(feature = "amd-xdna")]
4pub(crate) mod amd_xdna;
5#[cfg(feature = "apple")]
6pub(crate) mod apple;
7pub mod bandwidth;
8#[cfg(feature = "cerebras")]
9pub(crate) mod cerebras;
10pub(crate) mod command;
11#[cfg(feature = "cuda")]
12pub mod cuda;
13pub(crate) mod disk;
14pub(crate) mod environment;
15#[cfg(feature = "gaudi")]
16pub mod gaudi;
17#[cfg(feature = "graphcore")]
18pub(crate) mod graphcore;
19#[cfg(feature = "groq")]
20pub(crate) mod groq;
21#[cfg(feature = "intel-npu")]
22pub(crate) mod intel_npu;
23#[cfg(feature = "intel-oneapi")]
24pub(crate) mod intel_oneapi;
25pub mod interconnect;
26#[cfg(feature = "mediatek-apu")]
27pub(crate) mod mediatek_apu;
28#[cfg(feature = "aws-neuron")]
29pub(crate) mod neuron;
30pub(crate) mod numa;
31pub mod pcie;
32#[cfg(feature = "qualcomm")]
33pub(crate) mod qualcomm;
34#[cfg(feature = "rocm")]
35pub(crate) mod rocm;
36#[cfg(feature = "samsung-npu")]
37pub(crate) mod samsung_npu;
38#[cfg(feature = "tpu")]
39pub(crate) mod tpu;
40#[cfg(feature = "vulkan")]
41pub mod vulkan;
42#[cfg(feature = "windows-wmi")]
43pub(crate) mod windows;
44
45pub mod platform;
46
47use std::collections::HashMap;
48use std::path::Path;
49use std::time::{Duration, Instant};
50
51use tracing::debug;
52
53use crate::error::DetectionError;
54use crate::hardware::AcceleratorType;
55use crate::profile::AcceleratorProfile;
56use crate::registry::{AcceleratorRegistry, Backend, DetectBuilder};
57use crate::system_io::SystemIo;
58
/// Per-backend detection result.
///
/// The first element holds the accelerator profiles a backend discovered;
/// the second holds the [`DetectionError`]s it reported, which are merged
/// into the registry's warnings.
type DetectResult = (Vec<AcceleratorProfile>, Vec<DetectionError>);
61
/// Per-backend detection result with timing.
///
/// Elements are, in order: the discovered profiles, the reported
/// [`DetectionError`]s, and the elapsed time of the backend's detection call
/// as measured inside the backend's own thread.
type TimedDetectResult = (Vec<AcceleratorProfile>, Vec<DetectionError>, Duration);
64
/// Detection results with per-backend timing information.
///
/// Returned by [`AcceleratorRegistry::detect_with_timing`].
#[derive(Debug, Clone)]
pub struct TimedDetection {
    /// The registry with all detected hardware.
    pub registry: AcceleratorRegistry,
    /// Per-backend detection duration, keyed by backend name
    /// (e.g. `"cuda"`, `"intel_npu"`).
    ///
    /// Only backends that actually ran have an entry. In addition to backend
    /// names the map contains the bookkeeping keys `"_enrich"`
    /// (bandwidth/PCIe/NUMA enrichment) and `"_system_io"` (interconnect,
    /// storage and environment detection), plus `"vulkan_sysfs"` when the
    /// sysfs Vulkan fallback was executed.
    pub timings: HashMap<String, Duration>,
    /// Total wall-clock detection time.
    pub total: Duration,
}
75
impl AcceleratorRegistry {
    /// Probes the system for all available accelerators.
    ///
    /// Detection is best-effort: missing tools or sysfs entries simply mean
    /// the corresponding accelerator is not registered. Non-fatal issues are
    /// collected in [`AcceleratorRegistry::warnings`].
    ///
    /// When two or more backends are enabled they run **in parallel** via
    /// [`std::thread::scope`] for lower wall-clock latency on systems with
    /// multiple CLI tools; with zero or one enabled backend, detection runs
    /// sequentially on the calling thread.
    ///
    /// Backends can be disabled at compile time via cargo features
    /// (e.g. `default-features = false, features = ["cuda", "tpu"]`).
    ///
    /// This is equivalent to running detection with a default
    /// [`DetectBuilder`].
    pub fn detect() -> Self {
        detect_with_builder(DetectBuilder::new())
    }

    /// Like [`detect`](Self::detect), but also returns per-backend timing.
    ///
    /// Useful for diagnosing slow backends. The `timings` map contains
    /// backend names (e.g. `"cuda"`, `"vulkan"`) and how long each took.
    /// It also contains the non-backend keys `"_enrich"` and `"_system_io"`
    /// for the post-processing phases, and `"vulkan_sysfs"` when the sysfs
    /// Vulkan fallback ran.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use ai_hwaccel::AcceleratorRegistry;
    ///
    /// let result = AcceleratorRegistry::detect_with_timing();
    /// for (backend, duration) in &result.timings {
    ///     println!("{}: {:.1}ms", backend, duration.as_secs_f64() * 1000.0);
    /// }
    /// ```
    pub fn detect_with_timing() -> TimedDetection {
        detect_with_builder_timed(DetectBuilder::new())
    }
}
111
112/// Run detection with a builder's backend selection.
113///
114/// When 2+ backends are enabled, runs them in parallel via `std::thread::scope`.
115/// When 0-1 are enabled, runs sequentially to avoid thread spawn overhead.
116pub(crate) fn detect_with_builder(builder: DetectBuilder) -> AcceleratorRegistry {
117    // Pre-allocate for typical system: 1 CPU + up to 8 accelerators.
118    let mut all_profiles = Vec::with_capacity(8);
119    all_profiles.push(cpu_profile());
120    let mut all_warnings: Vec<DetectionError> = Vec::new();
121
122    let use_threads = builder.enabled_count() >= 2;
123
124    macro_rules! run_backend {
125        ($feature:literal, $backend:expr, $detect_fn:expr) => {
126            #[cfg(feature = $feature)]
127            if builder.backend_enabled($backend) {
128                $detect_fn(&mut all_profiles, &mut all_warnings);
129            }
130        };
131    }
132
133    macro_rules! spawn_backend {
134        ($feature:literal, $backend:expr, $detect_fn:expr, $handles:expr, $s:expr) => {
135            #[cfg(feature = $feature)]
136            if builder.backend_enabled($backend) {
137                $handles.push($s.spawn(|| {
138                    let mut p = Vec::new();
139                    let mut w = Vec::new();
140                    $detect_fn(&mut p, &mut w);
141                    (p, w)
142                }));
143            }
144        };
145    }
146
147    if use_threads {
148        std::thread::scope(|s| {
149            let mut handles: Vec<std::thread::ScopedJoinHandle<'_, DetectResult>> = Vec::new();
150
151            spawn_backend!("cuda", Backend::Cuda, cuda::detect_cuda, handles, s);
152            spawn_backend!("rocm", Backend::Rocm, rocm::detect_rocm, handles, s);
153            spawn_backend!(
154                "apple",
155                Backend::Apple,
156                apple::detect_metal_and_ane,
157                handles,
158                s
159            );
160            spawn_backend!("vulkan", Backend::Vulkan, vulkan::detect_vulkan, handles, s);
161            spawn_backend!(
162                "intel-npu",
163                Backend::IntelNpu,
164                intel_npu::detect_intel_npu,
165                handles,
166                s
167            );
168            spawn_backend!(
169                "amd-xdna",
170                Backend::AmdXdna,
171                amd_xdna::detect_amd_xdna,
172                handles,
173                s
174            );
175            spawn_backend!("tpu", Backend::Tpu, tpu::detect_tpu, handles, s);
176            spawn_backend!("gaudi", Backend::Gaudi, gaudi::detect_gaudi, handles, s);
177            spawn_backend!(
178                "aws-neuron",
179                Backend::AwsNeuron,
180                neuron::detect_aws_neuron,
181                handles,
182                s
183            );
184            spawn_backend!(
185                "intel-oneapi",
186                Backend::IntelOneApi,
187                intel_oneapi::detect_intel_oneapi,
188                handles,
189                s
190            );
191            spawn_backend!(
192                "qualcomm",
193                Backend::Qualcomm,
194                qualcomm::detect_qualcomm_ai100,
195                handles,
196                s
197            );
198            spawn_backend!(
199                "cerebras",
200                Backend::Cerebras,
201                cerebras::detect_cerebras_wse,
202                handles,
203                s
204            );
205            spawn_backend!(
206                "graphcore",
207                Backend::Graphcore,
208                graphcore::detect_graphcore_ipu,
209                handles,
210                s
211            );
212            spawn_backend!("groq", Backend::Groq, groq::detect_groq_lpu, handles, s);
213            spawn_backend!(
214                "samsung-npu",
215                Backend::SamsungNpu,
216                samsung_npu::detect_samsung_npu,
217                handles,
218                s
219            );
220            spawn_backend!(
221                "mediatek-apu",
222                Backend::MediaTekApu,
223                mediatek_apu::detect_mediatek_apu,
224                handles,
225                s
226            );
227            spawn_backend!(
228                "windows-wmi",
229                Backend::WindowsWmi,
230                windows::detect_windows_gpu,
231                handles,
232                s
233            );
234
235            for handle in handles {
236                if let Ok((profiles, warnings)) = handle.join() {
237                    all_profiles.extend(profiles);
238                    all_warnings.extend(warnings);
239                }
240            }
241        });
242    } else {
243        run_backend!("cuda", Backend::Cuda, cuda::detect_cuda);
244        run_backend!("rocm", Backend::Rocm, rocm::detect_rocm);
245        run_backend!("apple", Backend::Apple, apple::detect_metal_and_ane);
246        run_backend!("vulkan", Backend::Vulkan, vulkan::detect_vulkan);
247        run_backend!("intel-npu", Backend::IntelNpu, intel_npu::detect_intel_npu);
248        run_backend!("amd-xdna", Backend::AmdXdna, amd_xdna::detect_amd_xdna);
249        run_backend!("tpu", Backend::Tpu, tpu::detect_tpu);
250        run_backend!("gaudi", Backend::Gaudi, gaudi::detect_gaudi);
251        run_backend!("aws-neuron", Backend::AwsNeuron, neuron::detect_aws_neuron);
252        run_backend!(
253            "intel-oneapi",
254            Backend::IntelOneApi,
255            intel_oneapi::detect_intel_oneapi
256        );
257        run_backend!(
258            "qualcomm",
259            Backend::Qualcomm,
260            qualcomm::detect_qualcomm_ai100
261        );
262        run_backend!("cerebras", Backend::Cerebras, cerebras::detect_cerebras_wse);
263        run_backend!(
264            "graphcore",
265            Backend::Graphcore,
266            graphcore::detect_graphcore_ipu
267        );
268        run_backend!("groq", Backend::Groq, groq::detect_groq_lpu);
269        run_backend!(
270            "samsung-npu",
271            Backend::SamsungNpu,
272            samsung_npu::detect_samsung_npu
273        );
274        run_backend!(
275            "mediatek-apu",
276            Backend::MediaTekApu,
277            mediatek_apu::detect_mediatek_apu
278        );
279        run_backend!(
280            "windows-wmi",
281            Backend::WindowsWmi,
282            windows::detect_windows_gpu
283        );
284    }
285
286    // Post-pass: if vulkaninfo found no Vulkan devices, try sysfs fallback.
287    #[cfg(feature = "vulkan")]
288    {
289        let has_vulkan = all_profiles
290            .iter()
291            .any(|p| matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
292        let has_dedicated = all_profiles.iter().any(|p| {
293            matches!(
294                p.accelerator,
295                AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
296            )
297        });
298        if !has_vulkan && !has_dedicated && builder.backend_enabled(Backend::Vulkan) {
299            vulkan::detect_vulkan_sysfs(&mut all_profiles, &mut all_warnings);
300        }
301    }
302
303    // Post-pass: remove Vulkan GPUs if a dedicated CUDA or ROCm GPU was found.
304    let has_dedicated = all_profiles.iter().any(|p| {
305        matches!(
306            p.accelerator,
307            AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
308        )
309    });
310    if has_dedicated {
311        all_profiles.retain(|p| !matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
312    }
313
314    // Post-pass: enrich profiles with memory bandwidth, PCIe, and NUMA.
315    // Compute PCI address lists once, shared between PCIe and NUMA passes.
316    bandwidth::enrich_bandwidth(&mut all_profiles, &mut all_warnings);
317    let nvidia_pci = list_driver_pci_addrs("nvidia");
318    let amdgpu_pci = list_driver_pci_addrs("amdgpu");
319    pcie::enrich_pcie(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
320    numa::enrich_numa(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
321
322    // Detect system-level I/O: interconnects and storage.
323    let system_interconnects = interconnect::detect_interconnects(&mut all_warnings);
324    let system_storage = disk::detect_storage();
325    let system_environment = environment::detect_environment();
326    let system_io = SystemIo {
327        interconnects: system_interconnects,
328        storage: system_storage,
329        environment: Some(system_environment),
330    };
331
332    debug!(
333        count = all_profiles.len(),
334        warnings = all_warnings.len(),
335        interconnects = system_io.interconnects.len(),
336        storage_devices = system_io.storage.len(),
337        "accelerator detection complete"
338    );
339    AcceleratorRegistry {
340        schema_version: crate::registry::SCHEMA_VERSION,
341        profiles: all_profiles,
342        warnings: all_warnings,
343        system_io,
344    }
345}
346
347/// Run detection with timing information per backend.
348pub(crate) fn detect_with_builder_timed(builder: DetectBuilder) -> TimedDetection {
349    let wall_start = Instant::now();
350    let mut all_profiles = Vec::with_capacity(8);
351    all_profiles.push(cpu_profile());
352    let mut all_warnings: Vec<DetectionError> = Vec::new();
353    let mut timings: HashMap<String, Duration> = HashMap::new();
354
355    macro_rules! run_backend_timed {
356        ($feature:literal, $backend:expr, $name:literal, $detect_fn:expr) => {
357            #[cfg(feature = $feature)]
358            if builder.backend_enabled($backend) {
359                let start = Instant::now();
360                $detect_fn(&mut all_profiles, &mut all_warnings);
361                timings.insert($name.into(), start.elapsed());
362            }
363        };
364    }
365
366    macro_rules! spawn_backend_timed {
367        ($feature:literal, $backend:expr, $name:literal, $detect_fn:expr, $handles:expr, $s:expr) => {
368            #[cfg(feature = $feature)]
369            if builder.backend_enabled($backend) {
370                $handles.push((
371                    $name,
372                    $s.spawn(|| {
373                        let start = Instant::now();
374                        let mut p = Vec::new();
375                        let mut w = Vec::new();
376                        $detect_fn(&mut p, &mut w);
377                        (p, w, start.elapsed())
378                    }),
379                ));
380            }
381        };
382    }
383
384    let use_threads = builder.enabled_count() >= 2;
385
386    if use_threads {
387        std::thread::scope(|s| {
388            let mut handles: Vec<(&str, std::thread::ScopedJoinHandle<'_, TimedDetectResult>)> =
389                Vec::new();
390
391            spawn_backend_timed!("cuda", Backend::Cuda, "cuda", cuda::detect_cuda, handles, s);
392            spawn_backend_timed!("rocm", Backend::Rocm, "rocm", rocm::detect_rocm, handles, s);
393            spawn_backend_timed!(
394                "apple",
395                Backend::Apple,
396                "apple",
397                apple::detect_metal_and_ane,
398                handles,
399                s
400            );
401            spawn_backend_timed!(
402                "vulkan",
403                Backend::Vulkan,
404                "vulkan",
405                vulkan::detect_vulkan,
406                handles,
407                s
408            );
409            spawn_backend_timed!(
410                "intel-npu",
411                Backend::IntelNpu,
412                "intel_npu",
413                intel_npu::detect_intel_npu,
414                handles,
415                s
416            );
417            spawn_backend_timed!(
418                "amd-xdna",
419                Backend::AmdXdna,
420                "amd_xdna",
421                amd_xdna::detect_amd_xdna,
422                handles,
423                s
424            );
425            spawn_backend_timed!("tpu", Backend::Tpu, "tpu", tpu::detect_tpu, handles, s);
426            spawn_backend_timed!(
427                "gaudi",
428                Backend::Gaudi,
429                "gaudi",
430                gaudi::detect_gaudi,
431                handles,
432                s
433            );
434            spawn_backend_timed!(
435                "aws-neuron",
436                Backend::AwsNeuron,
437                "aws_neuron",
438                neuron::detect_aws_neuron,
439                handles,
440                s
441            );
442            spawn_backend_timed!(
443                "intel-oneapi",
444                Backend::IntelOneApi,
445                "intel_oneapi",
446                intel_oneapi::detect_intel_oneapi,
447                handles,
448                s
449            );
450            spawn_backend_timed!(
451                "qualcomm",
452                Backend::Qualcomm,
453                "qualcomm",
454                qualcomm::detect_qualcomm_ai100,
455                handles,
456                s
457            );
458            spawn_backend_timed!(
459                "cerebras",
460                Backend::Cerebras,
461                "cerebras",
462                cerebras::detect_cerebras_wse,
463                handles,
464                s
465            );
466            spawn_backend_timed!(
467                "graphcore",
468                Backend::Graphcore,
469                "graphcore",
470                graphcore::detect_graphcore_ipu,
471                handles,
472                s
473            );
474            spawn_backend_timed!(
475                "groq",
476                Backend::Groq,
477                "groq",
478                groq::detect_groq_lpu,
479                handles,
480                s
481            );
482            spawn_backend_timed!(
483                "samsung-npu",
484                Backend::SamsungNpu,
485                "samsung_npu",
486                samsung_npu::detect_samsung_npu,
487                handles,
488                s
489            );
490            spawn_backend_timed!(
491                "mediatek-apu",
492                Backend::MediaTekApu,
493                "mediatek_apu",
494                mediatek_apu::detect_mediatek_apu,
495                handles,
496                s
497            );
498            spawn_backend_timed!(
499                "windows-wmi",
500                Backend::WindowsWmi,
501                "windows_wmi",
502                windows::detect_windows_gpu,
503                handles,
504                s
505            );
506
507            for (name, handle) in handles {
508                if let Ok((profiles, warnings, duration)) = handle.join() {
509                    all_profiles.extend(profiles);
510                    all_warnings.extend(warnings);
511                    timings.insert(name.into(), duration);
512                }
513            }
514        });
515    } else {
516        run_backend_timed!("cuda", Backend::Cuda, "cuda", cuda::detect_cuda);
517        run_backend_timed!("rocm", Backend::Rocm, "rocm", rocm::detect_rocm);
518        run_backend_timed!(
519            "apple",
520            Backend::Apple,
521            "apple",
522            apple::detect_metal_and_ane
523        );
524        run_backend_timed!("vulkan", Backend::Vulkan, "vulkan", vulkan::detect_vulkan);
525        run_backend_timed!(
526            "intel-npu",
527            Backend::IntelNpu,
528            "intel_npu",
529            intel_npu::detect_intel_npu
530        );
531        run_backend_timed!(
532            "amd-xdna",
533            Backend::AmdXdna,
534            "amd_xdna",
535            amd_xdna::detect_amd_xdna
536        );
537        run_backend_timed!("tpu", Backend::Tpu, "tpu", tpu::detect_tpu);
538        run_backend_timed!("gaudi", Backend::Gaudi, "gaudi", gaudi::detect_gaudi);
539        run_backend_timed!(
540            "aws-neuron",
541            Backend::AwsNeuron,
542            "aws_neuron",
543            neuron::detect_aws_neuron
544        );
545        run_backend_timed!(
546            "intel-oneapi",
547            Backend::IntelOneApi,
548            "intel_oneapi",
549            intel_oneapi::detect_intel_oneapi
550        );
551        run_backend_timed!(
552            "qualcomm",
553            Backend::Qualcomm,
554            "qualcomm",
555            qualcomm::detect_qualcomm_ai100
556        );
557        run_backend_timed!(
558            "cerebras",
559            Backend::Cerebras,
560            "cerebras",
561            cerebras::detect_cerebras_wse
562        );
563        run_backend_timed!(
564            "graphcore",
565            Backend::Graphcore,
566            "graphcore",
567            graphcore::detect_graphcore_ipu
568        );
569        run_backend_timed!("groq", Backend::Groq, "groq", groq::detect_groq_lpu);
570        run_backend_timed!(
571            "samsung-npu",
572            Backend::SamsungNpu,
573            "samsung_npu",
574            samsung_npu::detect_samsung_npu
575        );
576        run_backend_timed!(
577            "mediatek-apu",
578            Backend::MediaTekApu,
579            "mediatek_apu",
580            mediatek_apu::detect_mediatek_apu
581        );
582        run_backend_timed!(
583            "windows-wmi",
584            Backend::WindowsWmi,
585            "windows_wmi",
586            windows::detect_windows_gpu
587        );
588    }
589
590    // Post-pass: sysfs Vulkan fallback (same as detect_with_builder).
591    #[cfg(feature = "vulkan")]
592    {
593        let has_vulkan = all_profiles
594            .iter()
595            .any(|p| matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
596        let has_dedicated = all_profiles.iter().any(|p| {
597            matches!(
598                p.accelerator,
599                AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
600            )
601        });
602        if !has_vulkan && !has_dedicated && builder.backend_enabled(Backend::Vulkan) {
603            let start = Instant::now();
604            vulkan::detect_vulkan_sysfs(&mut all_profiles, &mut all_warnings);
605            timings.insert("vulkan_sysfs".into(), start.elapsed());
606        }
607    }
608
609    // Post-pass: remove Vulkan GPUs if a dedicated CUDA or ROCm GPU was found.
610    let has_dedicated = all_profiles.iter().any(|p| {
611        matches!(
612            p.accelerator,
613            AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
614        )
615    });
616    if has_dedicated {
617        all_profiles.retain(|p| !matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
618    }
619
620    let enrich_start = Instant::now();
621    bandwidth::enrich_bandwidth(&mut all_profiles, &mut all_warnings);
622    let nvidia_pci = list_driver_pci_addrs("nvidia");
623    let amdgpu_pci = list_driver_pci_addrs("amdgpu");
624    pcie::enrich_pcie(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
625    numa::enrich_numa(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
626    timings.insert("_enrich".into(), enrich_start.elapsed());
627
628    let sysio_start = Instant::now();
629    let system_interconnects = interconnect::detect_interconnects(&mut all_warnings);
630    let system_storage = disk::detect_storage();
631    let system_environment = environment::detect_environment();
632    let system_io = SystemIo {
633        interconnects: system_interconnects,
634        storage: system_storage,
635        environment: Some(system_environment),
636    };
637    timings.insert("_system_io".into(), sysio_start.elapsed());
638
639    let registry = AcceleratorRegistry {
640        schema_version: crate::registry::SCHEMA_VERSION,
641        profiles: all_profiles,
642        warnings: all_warnings,
643        system_io,
644    };
645
646    TimedDetection {
647        registry,
648        timings,
649        total: wall_start.elapsed(),
650    }
651}
652
653// ---------------------------------------------------------------------------
654// Shared helpers
655// ---------------------------------------------------------------------------
656
657/// List PCI addresses bound to a given driver (sorted).
658pub(super) fn list_driver_pci_addrs(driver: &str) -> Vec<String> {
659    let dir = Path::new("/sys/bus/pci/drivers").join(driver);
660    if !dir.exists() {
661        return Vec::new();
662    }
663    let mut addrs: Vec<String> = std::fs::read_dir(&dir)
664        .into_iter()
665        .flatten()
666        .flatten()
667        .filter_map(|e| {
668            let name = e.file_name();
669            let name_bytes = name.as_encoded_bytes();
670            // PCI addresses look like "0000:01:00.0" — only hex digits, colons, dots.
671            if name_bytes.contains(&b':')
672                && name_bytes.contains(&b'.')
673                && name_bytes
674                    .iter()
675                    .all(|&b| b.is_ascii_hexdigit() || b == b':' || b == b'.')
676            {
677                Some(name.to_string_lossy().into_owned())
678            } else {
679                None
680            }
681        })
682        .collect();
683    addrs.sort();
684    addrs
685}
686
687/// Enumerate `/dev` device nodes matching a prefix with numeric suffixes.
688///
689/// For example, `count_dev_devices("neuron")` counts `/dev/neuron0`, `/dev/neuron1`, etc.
690/// Returns an iterator of parsed device IDs.
691pub(super) fn iter_dev_devices(prefix: &str) -> impl Iterator<Item = u32> + '_ {
692    std::fs::read_dir("/dev")
693        .into_iter()
694        .flatten()
695        .flatten()
696        .filter_map(move |entry| {
697            let name = entry.file_name();
698            let name_str = name.to_string_lossy();
699            let suffix = name_str.strip_prefix(prefix)?;
700            if suffix.is_empty() || !suffix.chars().all(|c| c.is_ascii_digit()) {
701                return None;
702            }
703            suffix.parse::<u32>().ok()
704        })
705}
706
707/// Check if any `/dev` device node matches a prefix (any suffix).
708///
709/// For example, `has_dev_device("groq")` returns true if `/dev/groq*` exists.
710pub(super) fn has_dev_device(prefix: &str) -> bool {
711    std::fs::read_dir("/dev")
712        .into_iter()
713        .flatten()
714        .flatten()
715        .any(|entry| entry.file_name().to_string_lossy().starts_with(prefix))
716}
717
718/// Build a default CPU profile with detected system memory.
719pub(crate) fn cpu_profile() -> AcceleratorProfile {
720    AcceleratorProfile {
721        accelerator: AcceleratorType::Cpu,
722        available: true,
723        memory_bytes: detect_cpu_memory(),
724        ..Default::default()
725    }
726}
727
728/// System memory from /proc/meminfo (fallback: 16 GiB).
729pub(crate) fn detect_cpu_memory() -> u64 {
730    if let Some(info) = read_sysfs_string(std::path::Path::new("/proc/meminfo"), 64 * 1024) {
731        for line in info.lines() {
732            if line.starts_with("MemTotal:")
733                && let Some(kb_str) = line.split_whitespace().nth(1)
734                && let Ok(kb) = kb_str.parse::<u64>()
735            {
736                return kb.saturating_mul(1024);
737            }
738        }
739    }
740    // macOS fallback via safe command runner (absolute path, timeout).
741    if let Ok(output) = command::run_tool("sysctl", &["-n", "hw.memsize"], command::DEFAULT_TIMEOUT)
742        && let Ok(bytes) = output.stdout.trim().parse::<u64>()
743    {
744        return bytes;
745    }
746    debug!("could not read system memory, defaulting to 16 GiB");
747    16 * 1024 * 1024 * 1024
748}
749
750/// Read a u64 from a sysfs file, capped at 64 bytes.
751pub(super) fn read_sysfs_u64(path: &Path) -> Option<u64> {
752    read_sysfs_string(path, 64).and_then(|s| s.trim().parse().ok())
753}
754
755/// Read a string from a sysfs file, capped at `max_bytes` to prevent DoS.
756///
757/// Sysfs pseudo-files report `st_size = 4096` regardless of actual content,
758/// so we can't use metadata for size checking. Instead, we read up to
759/// `max_bytes` and discard if truncated.
760///
761/// Uses a stack buffer for small reads (≤ 512 bytes) to avoid heap allocation
762/// in the common case.
763pub(super) fn read_sysfs_string(path: &Path, max_bytes: usize) -> Option<String> {
764    use std::io::Read;
765    let mut file = std::fs::File::open(path).ok()?;
766
767    // Stack buffer for common small reads, heap for larger ones.
768    const STACK_SIZE: usize = 512;
769    if max_bytes < STACK_SIZE {
770        let mut buf = [0u8; STACK_SIZE];
771        let n = file.read(&mut buf[..max_bytes + 1]).ok()?;
772        if n > max_bytes {
773            return None;
774        }
775        return String::from_utf8(buf[..n].to_vec()).ok();
776    }
777
778    let mut buf = vec![0u8; max_bytes + 1];
779    let n = file.read(&mut buf).ok()?;
780    if n > max_bytes {
781        return None;
782    }
783    buf.truncate(n);
784    String::from_utf8(buf).ok()
785}
786
787// ---------------------------------------------------------------------------
788// True async detection (requires `async-detect` feature)
789// ---------------------------------------------------------------------------
790
/// Async detection orchestrator using `tokio::process::Command`.
///
/// CLI backends run as concurrent tokio tasks with true async subprocess I/O.
/// Sysfs-only backends run in a single `spawn_blocking` task since they are
/// fast filesystem reads. Post-passes (bandwidth, PCIe, NUMA) run after all
/// backends complete.
///
/// Detection is best-effort: a backend task that fails to complete (its
/// `JoinHandle` resolves to an error) contributes no profiles or warnings.
/// The failure is reported through a `tracing` warning instead of being
/// propagated to the caller.
#[cfg(feature = "async-detect")]
pub(crate) async fn detect_with_builder_async(builder: DetectBuilder) -> AcceleratorRegistry {
    let mut all_profiles = vec![cpu_profile()];
    let mut all_warnings: Vec<DetectionError> = Vec::new();

    debug!(
        backends = builder.enabled_count(),
        "starting async detection"
    );

    // Spawn async CLI backends as concurrent tokio tasks.
    let mut handles: Vec<tokio::task::JoinHandle<DetectResult>> = Vec::new();

    macro_rules! spawn_async_backend {
        ($feature:literal, $backend:expr, $detect_fn:path) => {
            #[cfg(feature = $feature)]
            if builder.backend_enabled($backend) {
                handles.push(tokio::spawn($detect_fn()));
            }
        };
    }

    spawn_async_backend!("cuda", Backend::Cuda, cuda::detect_cuda_async);
    spawn_async_backend!("vulkan", Backend::Vulkan, vulkan::detect_vulkan_async);
    spawn_async_backend!("gaudi", Backend::Gaudi, gaudi::detect_gaudi_async);
    spawn_async_backend!(
        "aws-neuron",
        Backend::AwsNeuron,
        neuron::detect_aws_neuron_async
    );
    spawn_async_backend!("apple", Backend::Apple, apple::detect_metal_and_ane_async);
    spawn_async_backend!(
        "intel-oneapi",
        Backend::IntelOneApi,
        intel_oneapi::detect_intel_oneapi_async
    );

    // Sysfs-only backends run in a single blocking task.
    let sysfs_builder = builder.clone();
    let sysfs_handle = tokio::task::spawn_blocking(move || {
        let mut profiles = Vec::new();
        let mut warnings: Vec<DetectionError> = Vec::new();

        macro_rules! run_sysfs {
            ($feature:literal, $backend:expr, $detect_fn:expr) => {
                #[cfg(feature = $feature)]
                if sysfs_builder.backend_enabled($backend) {
                    $detect_fn(&mut profiles, &mut warnings);
                }
            };
        }

        run_sysfs!("rocm", Backend::Rocm, rocm::detect_rocm);
        run_sysfs!("intel-npu", Backend::IntelNpu, intel_npu::detect_intel_npu);
        run_sysfs!("amd-xdna", Backend::AmdXdna, amd_xdna::detect_amd_xdna);
        run_sysfs!("tpu", Backend::Tpu, tpu::detect_tpu);
        run_sysfs!(
            "qualcomm",
            Backend::Qualcomm,
            qualcomm::detect_qualcomm_ai100
        );
        run_sysfs!("cerebras", Backend::Cerebras, cerebras::detect_cerebras_wse);
        run_sysfs!(
            "graphcore",
            Backend::Graphcore,
            graphcore::detect_graphcore_ipu
        );
        run_sysfs!("groq", Backend::Groq, groq::detect_groq_lpu);
        run_sysfs!(
            "samsung-npu",
            Backend::SamsungNpu,
            samsung_npu::detect_samsung_npu
        );
        run_sysfs!(
            "mediatek-apu",
            Backend::MediaTekApu,
            mediatek_apu::detect_mediatek_apu
        );
        run_sysfs!(
            "windows-wmi",
            Backend::WindowsWmi,
            windows::detect_windows_gpu
        );

        (profiles, warnings)
    });

    // Collect async CLI results.
    for handle in handles {
        match handle.await {
            Ok((profiles, warnings)) => {
                all_profiles.extend(profiles);
                all_warnings.extend(warnings);
            }
            // Keep detection best-effort, but do not drop the failure silently.
            Err(err) => {
                tracing::warn!(
                    error = %err,
                    "async backend detection task failed; its results were discarded"
                );
            }
        }
    }

    // Collect sysfs results.
    match sysfs_handle.await {
        Ok((profiles, warnings)) => {
            all_profiles.extend(profiles);
            all_warnings.extend(warnings);
        }
        Err(err) => {
            tracing::warn!(
                error = %err,
                "sysfs backend detection task failed; its results were discarded"
            );
        }
    }

    // Post-pass: sysfs Vulkan fallback.
    #[cfg(feature = "vulkan")]
    {
        let has_vulkan = all_profiles
            .iter()
            .any(|p| matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
        let has_dedicated = all_profiles.iter().any(|p| {
            matches!(
                p.accelerator,
                AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
            )
        });
        if !has_vulkan && !has_dedicated && builder.backend_enabled(Backend::Vulkan) {
            vulkan::detect_vulkan_sysfs(&mut all_profiles, &mut all_warnings);
        }
    }

    // Post-pass: remove Vulkan GPUs if a dedicated CUDA or ROCm GPU was found.
    let has_dedicated = all_profiles.iter().any(|p| {
        matches!(
            p.accelerator,
            AcceleratorType::CudaGpu { .. } | AcceleratorType::RocmGpu { .. }
        )
    });
    if has_dedicated {
        all_profiles.retain(|p| !matches!(p.accelerator, AcceleratorType::VulkanGpu { .. }));
    }

    // Post-pass: enrich with bandwidth (async), PCIe, NUMA.
    bandwidth::enrich_bandwidth_async(&mut all_profiles, &mut all_warnings).await;
    let nvidia_pci = list_driver_pci_addrs("nvidia");
    let amdgpu_pci = list_driver_pci_addrs("amdgpu");
    pcie::enrich_pcie(&mut all_profiles, &nvidia_pci, &amdgpu_pci);
    numa::enrich_numa(&mut all_profiles, &nvidia_pci, &amdgpu_pci);

    // System I/O: async interconnects + blocking storage.
    let (system_interconnects, ic_warnings) = interconnect::detect_interconnects_async().await;
    all_warnings.extend(ic_warnings);

    let system_storage = tokio::task::spawn_blocking(disk::detect_storage)
        .await
        .unwrap_or_default();

    let system_environment = environment::detect_environment();
    let system_io = SystemIo {
        interconnects: system_interconnects,
        storage: system_storage,
        environment: Some(system_environment),
    };

    debug!(
        count = all_profiles.len(),
        warnings = all_warnings.len(),
        interconnects = system_io.interconnects.len(),
        storage_devices = system_io.storage.len(),
        "async accelerator detection complete"
    );
    AcceleratorRegistry {
        schema_version: crate::registry::SCHEMA_VERSION,
        profiles: all_profiles,
        warnings: all_warnings,
        system_io,
    }
}
ai_hwaccel/detect/mod.rs

ai_hwaccel/detect/
mod.rs