Skip to main content

neuronbox_runtime/host/
probe.rs

1//! Build a single `HostSnapshot` for the whole runtime / CLI.
2
3use std::sync::Mutex;
4use std::time::{Duration, Instant};
5
6use super::apple;
7use super::nvidia;
8use super::rocm;
9use super::snapshot::{
10    infer_training_backend, platform_info, HostSnapshot, ProbeStatus, HOST_SNAPSHOT_SCHEMA_VERSION,
11};
12
13/// Cache TTL to avoid multiple heavy probes in the same CLI flow.
14const SNAPSHOT_CACHE_TTL: Duration = Duration::from_secs(2);
15
16static SNAPSHOT_CACHE: Mutex<Option<(Instant, HostSnapshot)>> = Mutex::new(None);
17
/// Probe the host once (system tools + training-backend heuristic).
///
/// This is a field-less marker type: it holds no state of its own. The common traits are
/// derived so the type can be debug-printed, copied and default-constructed like any other
/// public type.
#[derive(Debug, Clone, Copy, Default)]
pub struct HostProbe;
20
21impl HostProbe {
22    pub fn snapshot() -> HostSnapshot {
23        let now = Instant::now();
24        let mut guard = SNAPSHOT_CACHE.lock().unwrap_or_else(|e| e.into_inner());
25        if let Some((t, ref snap)) = *guard {
26            if now.duration_since(t) < SNAPSHOT_CACHE_TTL {
27                return snap.clone();
28            }
29        }
30        let snap = Self::snapshot_uncached();
31        *guard = Some((now, snap.clone()));
32        snap
33    }
34
35    /// Bypass cache (tests or forced refresh).
36    pub fn snapshot_fresh() -> HostSnapshot {
37        Self::snapshot_uncached()
38    }
39
40    fn snapshot_uncached() -> HostSnapshot {
41        let platform = platform_info();
42        let mut probes = ProbeStatus::default();
43        let mut gpus = Vec::new();
44
45        let nvidia_list = nvidia::query_gpus();
46        probes.nvidia_smi_gpu_list = nvidia_list.probe_ok;
47        probes.nvml = nvidia_list.used_nvml;
48        if let Some(list) = nvidia_list.gpus {
49            if !list.is_empty() {
50                gpus = list;
51            }
52        }
53
54        if gpus.is_empty() {
55            let (rocm_gpus, rocm_ok) = rocm::query_gpus();
56            probes.rocm_smi = rocm_ok;
57            if let Some(list) = rocm_gpus {
58                if !list.is_empty() {
59                    gpus = list;
60                }
61            }
62        }
63
64        if gpus.is_empty() {
65            let (apple_gpu, apple_ok) = apple::query_gpu();
66            probes.apple_system_profiler = apple_ok;
67            if let Some(g) = apple_gpu {
68                gpus.push(g);
69            }
70        }
71
72        probes.nvidia_smi_compute = nvidia::compute_apps_pid_memory_mb().is_some();
73
74        let training_backend = infer_training_backend(&gpus);
75
76        HostSnapshot {
77            schema_version: HOST_SNAPSHOT_SCHEMA_VERSION,
78            platform,
79            gpus,
80            training_backend,
81            probes,
82        }
83    }
84}