Skip to main content

studio_worker/
sys.rs

//! Host-system probes: hostname, OS user, VRAM.
//!
//! Every probe emits a structured tracing breadcrumb so an operator can
//! tell from the logs *why* a worker reports the values it does (in
//! particular, why VRAM came back as `0.0` — was the sysfs tree missing,
//! present-but-unparseable, or is the worker running on a non-Linux
//! host?).  A silent `0.0` makes "this worker claims nothing" impossible
//! to diagnose from logs alone.
9use anyhow::Result;
10use std::path::Path;
11use std::sync::OnceLock;
12
13pub fn machine_name() -> String {
14    let name = hostname::get()
15        .ok()
16        .and_then(|s| s.into_string().ok())
17        .unwrap_or_else(|| "unknown-host".to_string());
18    tracing::debug!(
19        target: "studio_worker::sys",
20        op = "machine_name",
21        value = %name,
22        "resolved host machine name"
23    );
24    name
25}
26
27pub fn username() -> String {
28    username_from_probe(whoami::username())
29}
30
31/// Resolve the OS-user probe into a username, logging the outcome so a
32/// silent fallback can't hide a failing probe.  `whoami::username`
33/// became fallible in whoami 2.x; on the error path we emit a `warn`
34/// breadcrumb naming the underlying error and fall back to
35/// `unknown-user`, mirroring `machine_name`'s `unknown-host` default.
36fn username_from_probe<E: std::fmt::Display>(probe: std::result::Result<String, E>) -> String {
37    let user = match probe {
38        Ok(user) => user,
39        Err(e) => {
40            tracing::warn!(
41                target: "studio_worker::sys",
42                op = "username",
43                error = %e,
44                "failed to resolve OS user; falling back to unknown-user"
45            );
46            "unknown-user".to_string()
47        }
48    };
49    tracing::debug!(
50        target: "studio_worker::sys",
51        op = "username",
52        value = %user,
53        "resolved OS user"
54    );
55    user
56}
57
58/// Cached result of the (relatively expensive) VRAM probe.  Total VRAM
59/// is a static hardware property, so we probe at most once per process —
60/// `build_capabilities` runs on every 5s heartbeat and must not spawn an
61/// `nvidia-smi` subprocess each tick.
62static VRAM_GB: OnceLock<f32> = OnceLock::new();
63
64/// Detect physical VRAM on the host, in GB.  Returns 0.0 when we can't
65/// probe (no NVIDIA GPU, no driver) — the engine still runs in synthetic
66/// mode for low-end / CI machines.
67///
68/// This intentionally avoids a hard dependency on `nvml-wrapper` because
69/// it brings a heavy NVML build dep that we don't want at the CI layer.
70/// On Linux we first try the dependency-free
71/// `/proc/driver/nvidia/gpus/*/information` sysfs probe; current NVIDIA
72/// drivers (5xx) dropped the `Video Memory` line from that file, so we
73/// fall back to `nvidia-smi` (which ships with every driver, on every
74/// platform, and whose `--query-gpu` interface is stable across
75/// versions).  The result is memoised since it can't change while the
76/// process runs.
77pub fn detect_vram_gb() -> Result<f32> {
78    Ok(*VRAM_GB.get_or_init(probe_vram_gb))
79}
80
81fn probe_vram_gb() -> f32 {
82    // Linux exposes a cheap, dependency-free sysfs probe; try it first
83    // so the common case never spawns a subprocess.
84    #[cfg(target_os = "linux")]
85    {
86        let from_sysfs = detect_vram_gb_from_sysfs(Path::new("/proc/driver/nvidia/gpus"));
87        if from_sysfs > 0.0 {
88            return from_sysfs;
89        }
90    }
91    // Fallback for every platform: `nvidia-smi`.  On a host with no
92    // NVIDIA tooling the command simply fails to spawn and we return 0.
93    detect_vram_gb_via_nvidia_smi().unwrap_or(0.0)
94}
95
96/// Probe VRAM via `nvidia-smi --query-gpu=memory.total`.  Returns `None`
97/// when the binary is absent (no driver / non-NVIDIA host) or exits
98/// non-zero, in which cases the caller defaults to 0 GB.
99///
100/// Coverage-off: spawning a real `nvidia-smi` is host-dependent (CI has
101/// none), so its success / non-zero-exit arms can't be exercised
102/// deterministically.  The parse + GB conversion + logging it delegates
103/// to ([`vram_gb_from_smi_stdout`], [`parse_nvidia_smi_mib`]) are
104/// unit-tested directly.
105#[cfg_attr(coverage_nightly, coverage(off))]
106fn detect_vram_gb_via_nvidia_smi() -> Option<f32> {
107    let output = std::process::Command::new("nvidia-smi")
108        .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
109        .output();
110    match output {
111        Ok(o) if o.status.success() => vram_gb_from_smi_stdout(&String::from_utf8_lossy(&o.stdout)),
112        Ok(o) => {
113            tracing::warn!(
114                target: "studio_worker::sys",
115                op = "probe_vram",
116                source = "nvidia_smi_failed",
117                code = ?o.status.code(),
118                "nvidia-smi exited non-zero while probing VRAM — defaulting to 0 GB"
119            );
120            None
121        }
122        Err(e) => {
123            tracing::info!(
124                target: "studio_worker::sys",
125                op = "probe_vram",
126                source = "nvidia_smi_absent",
127                error = %e,
128                "nvidia-smi not available — cannot probe VRAM; defaulting to 0 GB"
129            );
130            None
131        }
132    }
133}
134
/// Summed VRAM (MiB) from an `nvidia-smi` memory query plus the count of
/// GPU lines that were dropped from that total.
///
/// `dropped` is the number of non-empty lines whose leading token wasn't
/// a number — nvidia-smi emits `[N/A]` for `memory.total` when a card has
/// fallen off the bus, hit an ECC fault, or sits in a MIG state with no
/// resolvable total.  Carrying the count (rather than silently summing
/// the survivors) means a multi-GPU box that under-reports its VRAM — and
/// then refuses jobs it could actually run — leaves a breadcrumb instead
/// of vanishing the card without a trace.
///
/// Plain `Copy` data; `Debug` and `PartialEq` are derived so values can
/// be printed and compared directly in logs and assertions.
#[derive(Debug, Clone, Copy, PartialEq)]
struct SmiMemTotal {
    /// Sum, in MiB, of every line that parsed.
    mib: f64,
    /// Number of non-empty lines left out of `mib`.
    dropped: u32,
}
149
150/// Convert the stdout of an `nvidia-smi` memory query to GB and emit the
151/// probe breadcrumb.  Split out from the subprocess plumbing so the
152/// parse + conversion + logging are unit-testable without a real
153/// `nvidia-smi` on the box (CI has none).
154fn vram_gb_from_smi_stdout(stdout: &str) -> Option<f32> {
155    let SmiMemTotal { mib, dropped } = parse_nvidia_smi_mib(stdout)?;
156    let vram_gb = (mib / 1024.0) as f32;
157    tracing::info!(
158        target: "studio_worker::sys",
159        op = "probe_vram",
160        source = "nvidia_smi",
161        vram_gb = vram_gb,
162        dropped = dropped,
163        "detected NVIDIA VRAM via nvidia-smi fallback"
164    );
165    Some(vram_gb)
166}
167
168/// Sum the per-GPU MiB totals from
169/// `nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits`.
170/// One line per GPU, each a bare MiB integer (e.g. `24564`).  Tolerates
171/// a trailing unit token (if `nounits` is ever dropped) and ignores
172/// blank lines.  Every non-empty line that fails to parse (e.g. `[N/A]`)
173/// is warn-logged and counted in [`SmiMemTotal::dropped`] before being
174/// left out of the total.  Returns `None` when no line yielded a number.
175fn parse_nvidia_smi_mib(stdout: &str) -> Option<SmiMemTotal> {
176    let mut total: f64 = 0.0;
177    let mut any = false;
178    let mut dropped: u32 = 0;
179    for (idx, line) in stdout.lines().enumerate() {
180        let trimmed = line.trim();
181        if trimmed.is_empty() {
182            continue;
183        }
184        match trimmed
185            .split_whitespace()
186            .next()
187            .and_then(|tok| tok.parse::<f64>().ok())
188        {
189            Some(mib) => {
190                total += mib;
191                any = true;
192            }
193            None => {
194                dropped += 1;
195                tracing::warn!(
196                    target: "studio_worker::sys",
197                    op = "probe_vram",
198                    source = "nvidia_smi",
199                    line = idx,
200                    content = trimmed,
201                    "nvidia-smi VRAM line did not parse as MiB — dropping this GPU from the total"
202                );
203            }
204        }
205    }
206    any.then_some(SmiMemTotal {
207        mib: total,
208        dropped,
209    })
210}
211
212/// VRAM probe driven by a configurable sysfs root.  Public-in-crate so
213/// the integration tests can exercise both the "missing root" and
214/// "populated root" branches without a real `/proc/driver/nvidia` tree.
215///
216/// Emits a summary tracing event per call, plus a `WARN` for every GPU
217/// dropped from the total so a multi-GPU box never under-reports its
218/// VRAM silently:
219///
220/// - `INFO source="no_nvidia_sysfs"` — `root` is not a directory.  This
221///   is the normal case on CI runners / non-GPU hosts.
222/// - `INFO source="nvidia_sysfs"` — at least one GPU's `information`
223///   file was parseable.  `gpu_count` is how many contributed; `dropped`
224///   is how many were present but unreadable / had no parseable `Video
225///   Memory` line (each of those also gets its own `WARN` naming it).
226/// - `WARN source="sysfs_unparseable"` — directories were present but
227///   none parseable (current 5xx drivers dropped the `Video Memory`
228///   line).  The caller then falls back to `nvidia-smi`; the warn is the
229///   breadcrumb that the cheap sysfs path no longer works on this host.
230/// - `WARN source="nvidia_sysfs" reason="no_video_memory_line"|"video_memory_unparseable"|"info_unreadable"`
231///   — a specific GPU was dropped from the total while others survived.
232///   `video_memory_unparseable` means the `Video Memory` line was
233///   present but its value didn't parse (the warn echoes the offending
234///   `content`); `no_video_memory_line` means no such line at all.
235pub fn detect_vram_gb_from_sysfs(root: &Path) -> f32 {
236    let entries = match std::fs::read_dir(root) {
237        Ok(e) => e,
238        Err(_) => {
239            tracing::info!(
240                target: "studio_worker::sys",
241                op = "probe_vram",
242                source = "no_nvidia_sysfs",
243                vram_gb = 0.0,
244                root = %root.display(),
245                "no NVIDIA sysfs tree at probe root — defaulting to 0 GB VRAM"
246            );
247            return 0.0;
248        }
249    };
250
251    let mut total_mib: f64 = 0.0;
252    let mut gpu_count: u32 = 0;
253    let mut parseable: u32 = 0;
254    for entry in entries.flatten() {
255        gpu_count += 1;
256        let gpu_path = entry.path();
257        let info_path = gpu_path.join("information");
258        match std::fs::read_to_string(&info_path) {
259            Ok(content) => {
260                let mut found = false;
261                // A `Video Memory:` line that's present but whose value
262                // can't be parsed (e.g. `N/A` on a driver that stubbed
263                // the field) must be surfaced differently from a GPU
264                // with no such line at all — otherwise the operator is
265                // told the line is missing when it's right there.  Keep
266                // the first offending value to echo in the warn.
267                let mut unparseable: Option<String> = None;
268                for line in content.lines() {
269                    if let Some(rest) = line.trim().strip_prefix("Video Memory:") {
270                        if let Some(mib) = parse_mib(rest) {
271                            total_mib += mib;
272                            found = true;
273                        } else if unparseable.is_none() {
274                            unparseable = Some(rest.trim().to_string());
275                        }
276                    }
277                }
278                if found {
279                    parseable += 1;
280                } else if let Some(content) = unparseable {
281                    tracing::warn!(
282                        target: "studio_worker::sys",
283                        op = "probe_vram",
284                        source = "nvidia_sysfs",
285                        reason = "video_memory_unparseable",
286                        gpu = %gpu_path.display(),
287                        content = content.as_str(),
288                        "sysfs GPU Video Memory line did not parse as MiB — dropping it from the total"
289                    );
290                } else {
291                    tracing::warn!(
292                        target: "studio_worker::sys",
293                        op = "probe_vram",
294                        source = "nvidia_sysfs",
295                        reason = "no_video_memory_line",
296                        gpu = %gpu_path.display(),
297                        "sysfs GPU has no parseable Video Memory line — dropping it from the total"
298                    );
299                }
300            }
301            Err(e) => {
302                tracing::warn!(
303                    target: "studio_worker::sys",
304                    op = "probe_vram",
305                    source = "nvidia_sysfs",
306                    reason = "info_unreadable",
307                    gpu = %gpu_path.display(),
308                    error = %e,
309                    "could not read a sysfs GPU information file — dropping it from the total"
310                );
311            }
312        }
313    }
314
315    let vram_gb = (total_mib / 1024.0) as f32;
316    let dropped = gpu_count.saturating_sub(parseable);
317    if parseable > 0 {
318        tracing::info!(
319            target: "studio_worker::sys",
320            op = "probe_vram",
321            source = "nvidia_sysfs",
322            vram_gb = vram_gb,
323            gpu_count = parseable,
324            dropped = dropped,
325            "detected NVIDIA VRAM via sysfs"
326        );
327    } else {
328        tracing::warn!(
329            target: "studio_worker::sys",
330            op = "probe_vram",
331            source = "sysfs_unparseable",
332            vram_gb = 0.0,
333            gpu_count = gpu_count,
334            root = %root.display(),
335            "NVIDIA sysfs entries present but no Video Memory line (current 5xx drivers dropped it) — falling back to nvidia-smi"
336        );
337    }
338    vram_gb
339}
340
/// Parse a memory-size string such as `" 24576 MiB"` or `"24576 MB"`
/// into MiB.
///
/// The first whitespace-separated token is the number, the optional
/// second token the unit (matched case-insensitively):
///
/// - `MiB` / `MB`, or no unit at all — the value is already MiB.
/// - `GiB` / `GB` — the value is multiplied by 1024.
/// - any other unit — the number is kept as-is (best-effort MiB) rather
///   than discarded.
///
/// Returns `None` when there is no leading token, when it doesn't parse
/// as a number, or when the result is not a finite, non-negative size.
/// The last rule matters because `f64`'s parser accepts `nan`, `inf`
/// and negative numbers, and one such value added to a running total
/// would corrupt it.
fn parse_mib(s: &str) -> Option<f64> {
    let mut parts = s.split_whitespace();
    let value = parts.next()?.parse::<f64>().ok()?;
    let scale = match parts.next() {
        Some(unit) if unit.eq_ignore_ascii_case("gib") || unit.eq_ignore_ascii_case("gb") => {
            1024.0
        }
        _ => 1.0,
    };
    let mib = value * scale;
    // Checked after scaling so a GiB value that overflows to infinity
    // is rejected too.  `NaN >= 0.0` is false, so NaN falls out here.
    (mib.is_finite() && mib >= 0.0).then_some(mib)
}
353
#[cfg(test)]
mod tests {
    use super::*;

    /// Lay out one fake GPU under `root`: a `<bus>` directory holding an
    /// `information` file with the given `body`.
    fn write_gpu_information(root: &std::path::Path, bus: &str, body: &str) {
        let gpu = root.join(bus);
        std::fs::create_dir_all(&gpu).unwrap();
        std::fs::write(gpu.join("information"), body).unwrap();
    }

    #[test]
    fn parse_mib_handles_mib() {
        let cases = [
            (" 24576 MiB", 24576.0),
            ("12288 MB", 12288.0),
            ("24 GiB", 24576.0),
            ("8 GB", 8192.0),
        ];
        for (input, expected_mib) in cases {
            assert_eq!(parse_mib(input), Some(expected_mib));
        }
    }

    #[test]
    fn parse_mib_defaults_to_mib_when_the_unit_is_omitted() {
        // With no unit token the number is taken to be MiB already —
        // the shape nvidia-smi's `--units` output has when the suffix
        // is stripped.
        let bare = parse_mib("4096");
        assert_eq!(bare, Some(4096.0));
    }

    #[test]
    fn parse_mib_treats_an_unknown_unit_as_raw_mib() {
        // The worker claims jobs by VRAM, so a card zeroed out of the
        // total over an unfamiliar suffix would refuse work it can run.
        // The number is therefore kept untouched (best-effort MiB)
        // instead of becoming `None`.
        for (input, expected_mib) in [("2048 KiB", 2048.0), ("4 TB", 4.0)] {
            assert_eq!(parse_mib(input), Some(expected_mib));
        }
    }

    #[test]
    fn parse_mib_rejects_unparseable_or_empty_values() {
        // Placeholders like `N/A` and empty / whitespace-only input
        // give `None`, so the caller skips the line rather than adding
        // a bogus number to the total.
        for input in ["N/A MiB", "", "   "] {
            assert_eq!(parse_mib(input), None);
        }
    }

    #[test]
    fn machine_name_returns_non_empty() {
        let host = machine_name();
        assert!(!host.is_empty());
    }

    #[test]
    fn username_returns_non_empty() {
        let user = username();
        assert!(!user.is_empty());
    }

    #[test]
    fn username_from_probe_returns_the_resolved_value() {
        let probe: std::result::Result<String, std::io::Error> = Ok("alice".to_string());
        assert_eq!(username_from_probe(probe), "alice");
    }

    #[test]
    fn username_from_probe_falls_back_to_unknown_user_on_error() {
        let probe: std::result::Result<String, std::io::Error> =
            Err(std::io::Error::other("no entropy source"));
        assert_eq!(username_from_probe(probe), "unknown-user");
    }

    #[test]
    fn username_from_probe_warns_with_the_error_on_failure() {
        // The probe is fallible since whoami 2.x.  When it fails, the
        // logs must carry a breadcrumb naming the error; a quiet
        // fallback would hide why the user came back unknown.
        let logs = crate::test_support::capture(|| {
            let failed: std::result::Result<String, std::io::Error> =
                Err(std::io::Error::other("permission denied"));
            let _ = username_from_probe(failed);
        });
        let expectations = [
            ("WARN", "expected WARN level"),
            ("op=\"username\"", "expected username op"),
            ("permission denied", "expected underlying error"),
        ];
        for (needle, why) in expectations {
            assert!(logs.contains(needle), "{why}, got: {logs}");
        }
    }

    #[test]
    fn username_from_probe_emits_debug_value_on_success() {
        let logs = crate::test_support::capture(|| {
            let resolved: std::result::Result<String, std::io::Error> = Ok("bob".to_string());
            let _ = username_from_probe(resolved);
        });
        let expectations = [
            ("DEBUG", "expected DEBUG event"),
            ("value=bob", "expected resolved value"),
        ];
        for (needle, why) in expectations {
            assert!(logs.contains(needle), "{why}, got: {logs}");
        }
    }

    #[test]
    fn detect_vram_gb_from_sysfs_returns_zero_when_root_missing() {
        let scratch = tempfile::tempdir().unwrap();
        let absent_root = scratch.path().join("nope");
        assert_eq!(detect_vram_gb_from_sysfs(&absent_root), 0.0);
    }

    #[test]
    fn detect_vram_gb_from_sysfs_sums_parseable_gpus() {
        let scratch = tempfile::tempdir().unwrap();
        for (bus, mib) in [("0000:01:00.0", "12288"), ("0000:02:00.0", "24576")] {
            write_gpu_information(
                scratch.path(),
                bus,
                &format!("Model: x\nVideo Memory: {mib} MiB\n"),
            );
        }
        // 12288 MiB + 24576 MiB = 36864 MiB = 36 GiB.
        let gb = detect_vram_gb_from_sysfs(scratch.path());
        assert!((gb - 36.0).abs() < 1e-3, "got {gb}");
    }

    #[test]
    fn detect_vram_gb_from_sysfs_sums_only_survivors_when_one_gpu_is_unreadable() {
        // One healthy card sits beside one whose `information` is a
        // *directory*, which makes `read_to_string` fail on every
        // platform.  The healthy card must still be counted; the bad
        // one is dropped from the sum and must not zero the host out.
        let scratch = tempfile::tempdir().unwrap();
        write_gpu_information(scratch.path(), "0000:01:00.0", "Video Memory: 12288 MiB\n");
        let broken_info = scratch.path().join("0000:02:00.0").join("information");
        std::fs::create_dir_all(broken_info).unwrap();
        // Just the healthy card: 12288 MiB / 1024 = 12 GiB.
        let gb = detect_vram_gb_from_sysfs(scratch.path());
        assert!((gb - 12.0).abs() < 1e-3, "got {gb}");
    }

    // -----------------------------------------------------------------
    // nvidia-smi fallback.  Current NVIDIA drivers (5xx) no longer put
    // a "Video Memory" line in the sysfs `information` file, so the
    // sysfs probe reports 0 even on capable hosts.  `nvidia-smi` comes
    // with every driver and keeps a stable `--query-gpu` interface
    // across versions, which makes it the layout-proof fallback.
    // -----------------------------------------------------------------

    #[test]
    fn parse_nvidia_smi_mib_reads_a_single_bare_value() {
        let SmiMemTotal { mib, dropped } = parse_nvidia_smi_mib("24564\n").unwrap();
        assert_eq!(mib, 24564.0);
        assert_eq!(dropped, 0);
    }

    #[test]
    fn parse_nvidia_smi_mib_sums_multiple_gpus() {
        let SmiMemTotal { mib, dropped } = parse_nvidia_smi_mib("24564\n24564\n").unwrap();
        assert_eq!(mib, 49128.0);
        assert_eq!(dropped, 0);
    }

    #[test]
    fn parse_nvidia_smi_mib_tolerates_units_and_crlf_whitespace() {
        // Without `nounits` the line would read "24564 MiB".
        let SmiMemTotal { mib, dropped } = parse_nvidia_smi_mib("  24564 MiB \r\n").unwrap();
        assert_eq!(mib, 24564.0);
        assert_eq!(dropped, 0);
    }

    #[test]
    fn parse_nvidia_smi_mib_returns_none_on_empty_or_na() {
        for stdout in ["", "\n[N/A]\n"] {
            assert!(parse_nvidia_smi_mib(stdout).is_none());
        }
    }

    #[test]
    fn parse_nvidia_smi_mib_sums_survivors_and_counts_a_dropped_gpu() {
        // Two healthy cards around one that nvidia-smi reports as
        // `[N/A]` (off the bus / ECC fault): the healthy VRAM is still
        // summed, and the missing card shows up in the dropped count.
        let SmiMemTotal { mib, dropped } = parse_nvidia_smi_mib("24564\n[N/A]\n24564\n").unwrap();
        assert_eq!(mib, 49128.0);
        assert_eq!(dropped, 1);
    }

    #[test]
    fn parse_nvidia_smi_mib_warns_on_each_dropped_gpu_line() {
        // An under-reporting multi-GPU box (which then refuses jobs it
        // could run) has to leave a per-line breadcrumb that names the
        // offending value; the card must not disappear without trace.
        let logs = crate::test_support::capture(|| {
            let _ = parse_nvidia_smi_mib("24564\n[N/A]\n");
        });
        let expectations = [
            ("WARN", "expected WARN level"),
            ("op=\"probe_vram\"", "expected probe_vram op"),
            ("source=\"nvidia_smi\"", "expected source=nvidia_smi"),
            ("[N/A]", "the warning must name the unparseable value"),
            ("dropping this GPU", "the warning must explain the drop"),
        ];
        for (needle, why) in expectations {
            assert!(logs.contains(needle), "{why}, got: {logs}");
        }
    }

    #[test]
    fn vram_gb_from_smi_stdout_reports_dropped_count_in_breadcrumb() {
        // The success event has to state how many GPUs were dropped,
        // otherwise a truncated total reads like a complete one.
        let logs = crate::test_support::capture(|| {
            let gb = vram_gb_from_smi_stdout("24564\n[N/A]\n").unwrap();
            assert!((gb - 23.99).abs() < 0.05, "survivor still totals: {gb}");
        });
        assert!(
            logs.contains("dropped=1"),
            "the breadcrumb must report the dropped count, got: {logs}"
        );
    }

    #[test]
    fn vram_gb_from_smi_stdout_converts_mib_to_gb() {
        // 24564 MiB is just under 24 GiB (≈ 23.99).
        let gb = vram_gb_from_smi_stdout("24564\n").unwrap();
        assert!((gb - 23.99).abs() < 0.05, "got {gb}");
    }

    #[test]
    fn vram_gb_from_smi_stdout_is_none_when_unparseable() {
        let outcome = vram_gb_from_smi_stdout("\n[N/A]\n");
        assert_eq!(outcome, None);
    }

    #[test]
    fn vram_gb_from_smi_stdout_emits_info_breadcrumb_on_success() {
        let logs = crate::test_support::capture(|| {
            let _ = vram_gb_from_smi_stdout("24564\n");
        });
        let expectations = [
            ("INFO", "expected INFO level"),
            ("op=\"probe_vram\"", "expected probe_vram op"),
            ("source=\"nvidia_smi\"", "expected source=nvidia_smi"),
        ];
        for (needle, why) in expectations {
            assert!(logs.contains(needle), "{why}, got: {logs}");
        }
    }
}