neuronbox_runtime/host/
probe.rs1use std::sync::Mutex;
4use std::time::{Duration, Instant};
5
6use super::apple;
7use super::nvidia;
8use super::rocm;
9use super::snapshot::{
10 infer_training_backend, platform_info, HostSnapshot, ProbeStatus, HOST_SNAPSHOT_SCHEMA_VERSION,
11};
12
/// Time-to-live of the cached host snapshot: [`HostProbe::snapshot`] reuses the cached
/// value while it is younger than this and re-probes the host otherwise.
const SNAPSHOT_CACHE_TTL: Duration = Duration::from_secs(2);
15
/// Process-wide cache backing [`HostProbe::snapshot`].
///
/// Holds the most recently stored snapshot together with the `Instant` used to decide
/// whether it has outlived [`SNAPSHOT_CACHE_TTL`]; `None` until a snapshot has been stored.
static SNAPSHOT_CACHE: Mutex<Option<(Instant, HostSnapshot)>> = Mutex::new(None);
17
/// Entry point for host hardware probing.
///
/// Field-less marker type: its functions are associated functions (no `self`), so it is
/// used as `HostProbe::snapshot()` / `HostProbe::snapshot_fresh()` without an instance.
pub struct HostProbe;
20
21impl HostProbe {
22 pub fn snapshot() -> HostSnapshot {
23 let now = Instant::now();
24 let mut guard = SNAPSHOT_CACHE.lock().unwrap_or_else(|e| e.into_inner());
25 if let Some((t, ref snap)) = *guard {
26 if now.duration_since(t) < SNAPSHOT_CACHE_TTL {
27 return snap.clone();
28 }
29 }
30 let snap = Self::snapshot_uncached();
31 *guard = Some((now, snap.clone()));
32 snap
33 }
34
35 pub fn snapshot_fresh() -> HostSnapshot {
37 Self::snapshot_uncached()
38 }
39
40 fn snapshot_uncached() -> HostSnapshot {
41 let platform = platform_info();
42 let mut probes = ProbeStatus::default();
43 let mut gpus = Vec::new();
44
45 let nvidia_list = nvidia::query_gpus();
46 probes.nvidia_smi_gpu_list = nvidia_list.probe_ok;
47 probes.nvml = nvidia_list.used_nvml;
48 if let Some(list) = nvidia_list.gpus {
49 if !list.is_empty() {
50 gpus = list;
51 }
52 }
53
54 if gpus.is_empty() {
55 let (rocm_gpus, rocm_ok) = rocm::query_gpus();
56 probes.rocm_smi = rocm_ok;
57 if let Some(list) = rocm_gpus {
58 if !list.is_empty() {
59 gpus = list;
60 }
61 }
62 }
63
64 if gpus.is_empty() {
65 let (apple_gpu, apple_ok) = apple::query_gpu();
66 probes.apple_system_profiler = apple_ok;
67 if let Some(g) = apple_gpu {
68 gpus.push(g);
69 }
70 }
71
72 probes.nvidia_smi_compute = nvidia::compute_apps_pid_memory_mb().is_some();
73
74 let training_backend = infer_training_backend(&gpus);
75
76 HostSnapshot {
77 schema_version: HOST_SNAPSHOT_SCHEMA_VERSION,
78 platform,
79 gpus,
80 training_backend,
81 probes,
82 }
83 }
84}