// neuronbox_runtime/host/snapshot.rs
1//! Serializable schema of machine state (GPU / platform) for CLI debug and client-side decisions.
2
3use serde::{Deserialize, Serialize};
4
5/// `HostSnapshot` schema version (bump if incompatible fields change).
6pub const HOST_SNAPSHOT_SCHEMA_VERSION: u32 = 1;
7
/// Host platform identification, filled from compile-time `std::env::consts` values
/// (see `platform_info()` below).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PlatformInfo {
    /// Operating system name (`std::env::consts::OS`, e.g. `"linux"`, `"macos"`, `"windows"`).
    pub os: String,
    /// CPU architecture (`std::env::consts::ARCH`, e.g. `"x86_64"`, `"aarch64"`).
    pub arch: String,
}
13
/// Which GPU-detection probes succeeded when the snapshot was taken.
///
/// `Default` (all `false`) means no probe has run or succeeded; `#[serde(default)]`
/// on `nvml` keeps older serialized snapshots (schema v1 without that field)
/// deserializable.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ProbeStatus {
    /// `nvidia-smi --query-gpu=...` succeeded.
    pub nvidia_smi_gpu_list: bool,
    /// `nvidia-smi --query-compute-apps=pid,used_gpu_memory` succeeded (VRAM per PID).
    pub nvidia_smi_compute: bool,
    /// NVIDIA GPU list from **NVML** (`nvml` feature, Linux) rather than `nvidia-smi`.
    #[serde(default)]
    pub nvml: bool,
    /// AMD probe succeeded — presumably via the `rocm-smi` tool; confirm against the prober.
    pub rocm_smi: bool,
    /// Apple probe succeeded — presumably via `system_profiler` on macOS; confirm against the prober.
    pub apple_system_profiler: bool,
}
26
/// Training backend classification derived from the detected GPU list
/// (see `infer_training_backend`). Serialized in `snake_case` (`"cuda"`, `"rocm"`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TrainingBackend {
    /// At least one GPU whose backend string contains `"CUDA"`.
    Cuda,
    /// At least one GPU whose backend string contains `"ROCm"` (and none with CUDA).
    Rocm,
    /// At least one GPU whose backend string contains `"Metal"` (and none with CUDA/ROCm).
    Metal,
    /// GPUs present but backend not classified (rare).
    Cpu,
    /// No GPUs detected at all.
    Unknown,
}
37
/// One detected GPU as reported by whichever probe found it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuRecord {
    /// Device index as reported by the probe (index 0 is treated as primary for CUDA
    /// in `HostSnapshot::primary_vram_mb`).
    pub index: u32,
    /// Human-readable device name.
    pub name: String,
    /// Total VRAM in MiB; 0 is treated as "unknown" by consumers of this record.
    pub memory_total_mb: u64,
    /// E.g. `CUDA (driver 535.x)`, `ROCm`, or `Metal`.
    pub backend: String,
}
46
/// Serializable snapshot of the host's GPU/platform state, for CLI debug output and
/// client-side decisions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HostSnapshot {
    /// Schema version of this snapshot; see `HOST_SNAPSHOT_SCHEMA_VERSION`.
    pub schema_version: u32,
    /// OS / architecture identification.
    pub platform: PlatformInfo,
    /// All GPUs found by the probes (may be empty).
    pub gpus: Vec<GpuRecord>,
    /// Backend classification derived from `gpus`.
    pub training_backend: TrainingBackend,
    /// Which detection probes succeeded.
    pub probes: ProbeStatus,
}
55
56impl HostSnapshot {
57    /// Primary VRAM for pre-checks: NVIDIA GPU at index 0, else first ROCm GPU with known VRAM.
58    /// `None` if unknown or zero (e.g. Apple with no exposed VRAM size).
59    pub fn primary_vram_mb(&self) -> Option<u64> {
60        for g in &self.gpus {
61            if g.backend.contains("CUDA") && g.index == 0 && g.memory_total_mb > 0 {
62                return Some(g.memory_total_mb);
63            }
64        }
65        for g in &self.gpus {
66            if g.backend.contains("ROCm") && g.memory_total_mb > 0 {
67                return Some(g.memory_total_mb);
68            }
69        }
70        None
71    }
72}
73
74pub(crate) fn infer_training_backend(gpus: &[GpuRecord]) -> TrainingBackend {
75    if gpus.iter().any(|g| g.backend.contains("CUDA")) {
76        return TrainingBackend::Cuda;
77    }
78    if gpus.iter().any(|g| g.backend.contains("ROCm")) {
79        return TrainingBackend::Rocm;
80    }
81    if gpus.iter().any(|g| g.backend.contains("Metal")) {
82        return TrainingBackend::Metal;
83    }
84    if !gpus.is_empty() {
85        return TrainingBackend::Cpu;
86    }
87    TrainingBackend::Unknown
88}
89
90pub(crate) fn platform_info() -> PlatformInfo {
91    PlatformInfo {
92        os: std::env::consts::OS.to_string(),
93        arch: std::env::consts::ARCH.to_string(),
94    }
95}