// neuronbox_runtime/host/nvidia.rs

1//! NVIDIA: NVML on Linux (`nvml` feature), else `nvidia-smi` subprocess.
2
3use std::collections::HashMap;
4use std::process::Command;
5
6use super::snapshot::GpuRecord;
7
8#[cfg(all(target_os = "linux", feature = "nvml"))]
9use super::nvml_linux;
10
/// Result of the NVIDIA GPU list probe.
#[derive(Debug, Clone)]
pub struct NvidiaGpuListResult {
    /// Discovered GPUs; `None` when the probe failed or returned no rows.
    pub gpus: Option<Vec<GpuRecord>>,
    /// NVIDIA tool responded (NVML or `nvidia-smi`).
    pub probe_ok: bool,
    /// Data from NVML (otherwise `nvidia-smi`).
    pub used_nvml: bool,
}
20
21pub fn query_gpus() -> NvidiaGpuListResult {
22    #[cfg(all(target_os = "linux", feature = "nvml"))]
23    if let Some(snap) = nvml_linux::try_snapshot() {
24        if !snap.gpus.is_empty() {
25            return NvidiaGpuListResult {
26                gpus: Some(snap.gpus),
27                probe_ok: true,
28                used_nvml: true,
29            };
30        }
31    }
32
33    let (gpus, ok) = query_gpus_nvidia_smi();
34    NvidiaGpuListResult {
35        gpus,
36        probe_ok: ok,
37        used_nvml: false,
38    }
39}
40
41fn query_gpus_nvidia_smi() -> (Option<Vec<GpuRecord>>, bool) {
42    let out = match Command::new("nvidia-smi")
43        .args([
44            "--query-gpu=index,name,memory.total,driver_version",
45            "--format=csv,noheader,nounits",
46        ])
47        .output()
48    {
49        Ok(o) => o,
50        Err(_) => return (None, false),
51    };
52
53    if !out.status.success() {
54        return (None, false);
55    }
56
57    let text = String::from_utf8_lossy(&out.stdout);
58    let mut gpus = Vec::new();
59    for line in text.lines() {
60        let line = line.trim();
61        if line.is_empty() {
62            continue;
63        }
64        let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
65        if parts.len() < 3 {
66            continue;
67        }
68        let Ok(index) = parts[0].parse::<u32>() else {
69            continue;
70        };
71        let name = parts[1].to_string();
72        let Ok(memory_total_mb) = parts[2].parse::<u64>() else {
73            continue;
74        };
75        let driver = parts.get(3).map(|s| s.to_string()).unwrap_or_default();
76        gpus.push(GpuRecord {
77            index,
78            name,
79            memory_total_mb,
80            backend: format!("CUDA (driver {driver})"),
81        });
82    }
83
84    if gpus.is_empty() {
85        (None, true)
86    } else {
87        (Some(gpus), true)
88    }
89}
90
91/// PID → used MiB (soft VRAM monitoring, stats).
92pub fn compute_apps_pid_memory_mb() -> Option<HashMap<u32, u64>> {
93    #[cfg(all(target_os = "linux", feature = "nvml"))]
94    if let Some(snap) = nvml_linux::try_snapshot() {
95        return Some(snap.pid_memory_mb);
96    }
97
98    compute_apps_pid_memory_mb_smi()
99}
100
/// `nvidia-smi` fallback for per-process VRAM usage.
///
/// Runs `nvidia-smi --query-compute-apps=pid,used_gpu_memory` and folds the
/// CSV output into a PID → used-MiB map. Returns `None` when the tool cannot
/// be spawned or exits non-zero. Unparseable rows (e.g. `[N/A]` memory on
/// some platforms) are skipped rather than failing the whole query.
fn compute_apps_pid_memory_mb_smi() -> Option<HashMap<u32, u64>> {
    let out = Command::new("nvidia-smi")
        .args([
            // NOTE(review): `used_gpu_memory` here vs `used_memory` in the
            // display-lines query below — both appear to be accepted aliases;
            // confirm against `nvidia-smi --help-query-compute-apps`.
            "--query-compute-apps=pid,used_gpu_memory",
            "--format=csv,noheader,nounits",
        ])
        .output()
        .ok()?;
    if !out.status.success() {
        return None;
    }
    let text = String::from_utf8_lossy(&out.stdout);
    let mut map: HashMap<u32, u64> = HashMap::new();
    for line in text.lines() {
        let line = line.trim();
        if line.is_empty() {
            continue;
        }
        let parts: Vec<&str> = line.split(',').map(|x| x.trim()).collect();
        if parts.len() < 2 {
            continue;
        }
        let Ok(pid) = parts[0].parse::<u32>() else {
            continue;
        };
        // `nounits` should already strip the suffix; the replace is
        // belt-and-braces for tool versions that emit it anyway.
        let mem = parts[1].replace(" MiB", "");
        let Ok(mb) = mem.trim().parse::<u64>() else {
            continue;
        };
        // FIX: the same PID can appear on several rows (one per GPU / CUDA
        // context); a plain `insert` kept only the last row's value. Sum the
        // rows so the map reflects the process's total usage.
        *map.entry(pid).or_insert(0) += mb;
    }
    Some(map)
}
134
135/// Lines for `DaemonResponse::Stats`.
136pub fn compute_apps_display_lines() -> Vec<String> {
137    #[cfg(all(target_os = "linux", feature = "nvml"))]
138    if let Some(snap) = nvml_linux::try_snapshot() {
139        return snap.display_lines;
140    }
141
142    compute_apps_display_lines_smi()
143}
144
/// `nvidia-smi` fallback for the per-process display lines.
///
/// Returns the trimmed, non-empty CSV rows of
/// `--query-compute-apps=pid,process_name,used_memory` verbatim; an empty
/// vector when the tool cannot be spawned or exits non-zero.
fn compute_apps_display_lines_smi() -> Vec<String> {
    let result = Command::new("nvidia-smi")
        .args([
            "--query-compute-apps=pid,process_name,used_memory",
            "--format=csv,noheader",
        ])
        .output();

    let Ok(output) = result else {
        return Vec::new();
    };
    if !output.status.success() {
        return Vec::new();
    }

    String::from_utf8_lossy(&output.stdout)
        .lines()
        .filter_map(|raw| {
            let trimmed = raw.trim();
            (!trimmed.is_empty()).then(|| trimmed.to_string())
        })
        .collect()
}