studio-worker 0.4.6

Pull-based image-generation worker for the minis.gg studio.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
//! Host-system probes: hostname, OS user, VRAM.
//!
//! Every probe emits a structured tracing breadcrumb so an operator can
//! tell from the logs *why* a worker reports the values it does (in
//! particular, why VRAM came back as `0.0` — was the sysfs tree missing,
//! present-but-unparseable, or is the worker running on a non-Linux
//! host?).  Silent `0.0` makes "this worker claims nothing" impossible
//! to diagnose from logs alone.
use anyhow::Result;
use std::path::Path;
use std::sync::OnceLock;

/// Resolve the host's machine name.
///
/// Falls back to `unknown-host` when the hostname lookup fails or the
/// name cannot be converted to a `String`, so the caller always gets a
/// usable value.  The resolved name is recorded in a `debug` breadcrumb.
pub fn machine_name() -> String {
    let resolved = match hostname::get().map(|raw| raw.into_string()) {
        Ok(Ok(host)) => host,
        // Either the lookup itself failed or the OS string did not
        // convert; both collapse to the same placeholder.
        _ => String::from("unknown-host"),
    };
    tracing::debug!(
        target: "studio_worker::sys",
        op = "machine_name",
        value = %resolved,
        "resolved host machine name"
    );
    resolved
}

/// Resolve the current OS user's name.
///
/// Thin wrapper: runs the `whoami` probe and hands its result to
/// [`username_from_probe`], which owns the logging and the
/// `unknown-user` fallback.
pub fn username() -> String {
    let probe = whoami::username();
    username_from_probe(probe)
}

/// Turn the outcome of the OS-user probe into a username.
///
/// On success the probed name is returned unchanged.  On failure a
/// `warn` breadcrumb carrying the underlying error is emitted and the
/// placeholder `unknown-user` is returned instead (the same idea as
/// `machine_name`'s `unknown-host`), so a failing probe is never hidden
/// behind a silent default.  `whoami::username` became fallible in
/// whoami 2.x, which is why the error path exists at all.
///
/// Either way, the final value is logged at `debug`.  The error type is
/// generic over `Display` so the function can be driven with any error
/// in tests.
fn username_from_probe<E: std::fmt::Display>(probe: std::result::Result<String, E>) -> String {
    let resolved = probe.unwrap_or_else(|err| {
        tracing::warn!(
            target: "studio_worker::sys",
            op = "username",
            error = %err,
            "failed to resolve OS user; falling back to unknown-user"
        );
        String::from("unknown-user")
    });
    tracing::debug!(
        target: "studio_worker::sys",
        op = "username",
        value = %resolved,
        "resolved OS user"
    );
    resolved
}

/// Cached result of the (relatively expensive) VRAM probe.  Total VRAM
/// is a static hardware property, so we probe at most once per process —
/// `build_capabilities` runs on every 5s heartbeat and must not spawn an
/// `nvidia-smi` subprocess each tick.
///
/// The cell starts empty and is filled by the first call to
/// [`detect_vram_gb`] (via `get_or_init`).  The stored value is in GB.
/// Note that whatever the first probe returns is what gets cached —
/// including `0.0` when nothing could be probed.
static VRAM_GB: OnceLock<f32> = OnceLock::new();

/// Report the host's physical VRAM in GB, probing at most once.
///
/// The first call runs [`probe_vram_gb`] and stores its answer in
/// [`VRAM_GB`]; every later call returns the stored value without
/// touching the filesystem or spawning a subprocess.  A result of `0.0`
/// means the probe found nothing (no NVIDIA GPU, no driver) — the
/// engine still runs in synthetic mode for low-end / CI machines.
///
/// The probe deliberately has no hard dependency on `nvml-wrapper`,
/// which would pull a heavy NVML build dep into the CI layer.  On Linux
/// the dependency-free `/proc/driver/nvidia/gpus/*/information` probe
/// is tried first; current NVIDIA drivers (5xx) dropped the
/// `Video Memory` line from that file, so the probe falls back to
/// `nvidia-smi` (which ships with every driver, on every platform, and
/// whose `--query-gpu` interface is stable across versions).
///
/// # Errors
///
/// The current implementation always returns `Ok`; the `Result` wrapper
/// is part of the existing signature.
pub fn detect_vram_gb() -> Result<f32> {
    let cached_gb: &f32 = VRAM_GB.get_or_init(probe_vram_gb);
    Ok(*cached_gb)
}

/// Run the uncached VRAM probe and return the result in GB (`0.0` when
/// nothing could be detected).
fn probe_vram_gb() -> f32 {
    // On Linux, prefer the sysfs probe: it is cheap and dependency-free,
    // so the common case never has to spawn a subprocess.  Only a
    // strictly positive reading short-circuits the fallback below.
    #[cfg(target_os = "linux")]
    {
        let sysfs_gb = detect_vram_gb_from_sysfs(Path::new("/proc/driver/nvidia/gpus"));
        if sysfs_gb > 0.0 {
            return sysfs_gb;
        }
    }
    // Every platform ends up here otherwise: ask `nvidia-smi`.  When the
    // host has no NVIDIA tooling the command fails to spawn, the helper
    // yields `None`, and that collapses to the f32 default of 0.
    detect_vram_gb_via_nvidia_smi().unwrap_or_default()
}

/// Ask `nvidia-smi --query-gpu=memory.total` for the host's VRAM.
///
/// Returns `None` in two situations, both of which the caller maps to
/// 0 GB:
///
/// - the binary could not be spawned (no driver / non-NVIDIA host) —
///   logged at `info`, since that is routine on GPU-less machines;
/// - the process ran but exited non-zero — logged at `warn` together
///   with its exit code.
///
/// On a successful exit the captured stdout is handed to
/// [`vram_gb_from_smi_stdout`], which does the parsing, GB conversion
/// and success logging.
///
/// Coverage-off: spawning a real `nvidia-smi` is host-dependent (CI has
/// none), so the success / non-zero-exit arms can't be exercised
/// deterministically.  The delegated parse + conversion + logging
/// ([`vram_gb_from_smi_stdout`], [`parse_nvidia_smi_mib`]) are
/// unit-tested directly.
#[cfg_attr(coverage_nightly, coverage(off))]
fn detect_vram_gb_via_nvidia_smi() -> Option<f32> {
    let mut query = std::process::Command::new("nvidia-smi");
    query.args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"]);

    let finished = match query.output() {
        Ok(finished) => finished,
        Err(e) => {
            tracing::info!(
                target: "studio_worker::sys",
                op = "probe_vram",
                source = "nvidia_smi_absent",
                error = %e,
                "nvidia-smi not available — cannot probe VRAM; defaulting to 0 GB"
            );
            return None;
        }
    };

    if !finished.status.success() {
        tracing::warn!(
            target: "studio_worker::sys",
            op = "probe_vram",
            source = "nvidia_smi_failed",
            code = ?finished.status.code(),
            "nvidia-smi exited non-zero while probing VRAM — defaulting to 0 GB"
        );
        return None;
    }

    // Lossy decoding: stray non-UTF-8 bytes must not abort the probe.
    let stdout = String::from_utf8_lossy(&finished.stdout);
    vram_gb_from_smi_stdout(&stdout)
}

/// Summed VRAM (MiB) from an `nvidia-smi` memory query plus the count of
/// GPU lines that were dropped from that total.
///
/// nvidia-smi emits `[N/A]` for `memory.total` when a card has fallen
/// off the bus, hit an ECC fault, or sits in a MIG state with no
/// resolvable total.  Carrying the dropped count (rather than silently
/// summing the survivors) means a multi-GPU box that under-reports its
/// VRAM — and then refuses jobs it could actually run — leaves a
/// breadcrumb instead of vanishing the card without a trace.
///
/// The type is plain data, so it derives `Debug`/`PartialEq` (tests can
/// `assert_eq!` on it and print it on failure) and `Clone`/`Copy`.
#[derive(Debug, Clone, Copy, PartialEq)]
struct SmiMemTotal {
    /// Sum, in MiB, of every line that yielded a usable value.
    mib: f64,
    /// Number of non-empty lines the parser could not turn into a usable
    /// MiB value and therefore left out of `mib`.
    dropped: u32,
}

/// Turn the stdout of an `nvidia-smi` memory query into a GB figure and
/// log the probe breadcrumb.
///
/// Returns `None` when [`parse_nvidia_smi_mib`] found no usable line;
/// nothing is logged here in that case.  Otherwise the summed MiB total
/// is divided by 1024 and an `info` event records both the result and
/// how many GPU lines were dropped from it.
///
/// Kept separate from the subprocess plumbing so the parse + conversion
/// + logging are unit-testable without a real `nvidia-smi` on the box
/// (CI has none).
fn vram_gb_from_smi_stdout(stdout: &str) -> Option<f32> {
    parse_nvidia_smi_mib(stdout).map(|SmiMemTotal { mib, dropped }| {
        let gb = (mib / 1024.0) as f32;
        tracing::info!(
            target: "studio_worker::sys",
            op = "probe_vram",
            source = "nvidia_smi",
            vram_gb = gb,
            dropped = dropped,
            "detected NVIDIA VRAM via nvidia-smi fallback"
        );
        gb
    })
}

/// Sum the per-GPU MiB totals from
/// `nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits`.
///
/// One line per GPU, each a bare MiB number (e.g. `24564`).  Only the
/// first whitespace-separated token of a line is read, which tolerates a
/// trailing unit token (if `nounits` is ever dropped).  Blank lines are
/// ignored.
///
/// A non-empty line is *dropped* — warn-logged, counted in
/// [`SmiMemTotal::dropped`], and left out of the total — when its
/// leading token
///
/// - does not parse as a number (e.g. `[N/A]`), or
/// - parses to a value that cannot be a memory size: `f64`'s parser also
///   accepts `nan`, `inf`/`infinity` and negative numbers, and a single
///   such line would otherwise turn the whole total (which is cached for
///   the life of the process) into NaN, infinity, or an under-count.
///
/// Returns `None` when no line yielded a usable value.
fn parse_nvidia_smi_mib(stdout: &str) -> Option<SmiMemTotal> {
    let mut total: f64 = 0.0;
    let mut any = false;
    let mut dropped: u32 = 0;
    for (idx, line) in stdout.lines().enumerate() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let usable_mib = trimmed
            .split_whitespace()
            .next()
            .and_then(|tok| tok.parse::<f64>().ok())
            // Reject NaN / ±inf / negatives: they parse as f64 but are
            // not memory sizes and would poison the sum.
            .filter(|mib| mib.is_finite() && *mib >= 0.0);
        match usable_mib {
            Some(mib) => {
                total += mib;
                any = true;
            }
            None => {
                dropped += 1;
                // `line` is the zero-based index within `stdout`.
                tracing::warn!(
                    target: "studio_worker::sys",
                    op = "probe_vram",
                    source = "nvidia_smi",
                    line = idx,
                    content = trimmed,
                    "nvidia-smi VRAM line did not parse as MiB — dropping this GPU from the total"
                );
            }
        }
    }
    any.then_some(SmiMemTotal {
        mib: total,
        dropped,
    })
}

/// VRAM probe driven by a configurable sysfs root.  The root is a
/// parameter so tests can exercise both the "missing root" and
/// "populated root" branches without a real `/proc/driver/nvidia` tree.
///
/// Every entry under `root` is treated as one GPU directory; its
/// `information` file is scanned for `Video Memory:` lines, whose values
/// are summed (in MiB) and converted to GB.  Returns `0.0` when the root
/// cannot be listed or no GPU contributed a value.
///
/// Emits a summary tracing event per call, plus a `WARN` for every GPU
/// dropped from the total so a multi-GPU box never under-reports its
/// VRAM silently:
///
/// - `INFO source="no_nvidia_sysfs"` — `root` could not be listed.  This
///   is the normal case on CI runners / non-GPU hosts (the directory
///   simply doesn't exist); the event's `error` field carries the
///   underlying I/O error so a permission problem can be told apart from
///   a missing tree.
/// - `INFO source="nvidia_sysfs"` — at least one GPU's `information`
///   file was parseable.  `gpu_count` is how many contributed; `dropped`
///   is how many were present but unreadable / had no parseable `Video
///   Memory` line (each of those also gets its own `WARN` naming it).
/// - `WARN source="sysfs_unparseable"` — directories were present but
///   none parseable (current 5xx drivers dropped the `Video Memory`
///   line).  The caller then falls back to `nvidia-smi`; the warn is the
///   breadcrumb that the cheap sysfs path no longer works on this host.
/// - `WARN source="nvidia_sysfs" reason="no_video_memory_line"|"video_memory_unparseable"|"info_unreadable"`
///   — a specific GPU was dropped from the total.
///   `video_memory_unparseable` means the `Video Memory` line was
///   present but its value didn't parse (the warn echoes the offending
///   `content`); `no_video_memory_line` means no such line at all;
///   `info_unreadable` means the `information` file could not be read.
pub fn detect_vram_gb_from_sysfs(root: &Path) -> f32 {
    let entries = match std::fs::read_dir(root) {
        Ok(entries) => entries,
        Err(e) => {
            // Attach the error instead of discarding it: "not found" is
            // routine, but e.g. a permission failure would otherwise be
            // reported as a missing tree with no hint of the real cause.
            tracing::info!(
                target: "studio_worker::sys",
                op = "probe_vram",
                source = "no_nvidia_sysfs",
                vram_gb = 0.0,
                root = %root.display(),
                error = %e,
                "no NVIDIA sysfs tree at probe root — defaulting to 0 GB VRAM"
            );
            return 0.0;
        }
    };

    let mut total_mib: f64 = 0.0;
    // Entries seen under `root` vs. entries that contributed a value.
    let mut gpu_count: u32 = 0;
    let mut parseable: u32 = 0;
    // `flatten` skips directory entries whose enumeration itself failed.
    for entry in entries.flatten() {
        gpu_count += 1;
        let gpu_path = entry.path();
        let info_path = gpu_path.join("information");
        match std::fs::read_to_string(&info_path) {
            Ok(content) => {
                let mut found = false;
                // A `Video Memory:` line that's present but whose value
                // can't be parsed (e.g. `N/A` on a driver that stubbed
                // the field) must be surfaced differently from a GPU
                // with no such line at all — otherwise the operator is
                // told the line is missing when it's right there.  Keep
                // the first offending value to echo in the warn.
                let mut unparseable: Option<String> = None;
                for line in content.lines() {
                    if let Some(rest) = line.trim().strip_prefix("Video Memory:") {
                        if let Some(mib) = parse_mib(rest) {
                            total_mib += mib;
                            found = true;
                        } else if unparseable.is_none() {
                            unparseable = Some(rest.trim().to_string());
                        }
                    }
                }
                if found {
                    parseable += 1;
                } else if let Some(bad_value) = unparseable {
                    tracing::warn!(
                        target: "studio_worker::sys",
                        op = "probe_vram",
                        source = "nvidia_sysfs",
                        reason = "video_memory_unparseable",
                        gpu = %gpu_path.display(),
                        content = bad_value.as_str(),
                        "sysfs GPU Video Memory line did not parse as MiB — dropping it from the total"
                    );
                } else {
                    tracing::warn!(
                        target: "studio_worker::sys",
                        op = "probe_vram",
                        source = "nvidia_sysfs",
                        reason = "no_video_memory_line",
                        gpu = %gpu_path.display(),
                        "sysfs GPU has no parseable Video Memory line — dropping it from the total"
                    );
                }
            }
            Err(e) => {
                tracing::warn!(
                    target: "studio_worker::sys",
                    op = "probe_vram",
                    source = "nvidia_sysfs",
                    reason = "info_unreadable",
                    gpu = %gpu_path.display(),
                    error = %e,
                    "could not read a sysfs GPU information file — dropping it from the total"
                );
            }
        }
    }

    let vram_gb = (total_mib / 1024.0) as f32;
    let dropped = gpu_count.saturating_sub(parseable);
    if parseable > 0 {
        tracing::info!(
            target: "studio_worker::sys",
            op = "probe_vram",
            source = "nvidia_sysfs",
            vram_gb = vram_gb,
            gpu_count = parseable,
            dropped = dropped,
            "detected NVIDIA VRAM via sysfs"
        );
    } else {
        // Nothing contributed, so `total_mib` (and thus `vram_gb`) is 0.
        tracing::warn!(
            target: "studio_worker::sys",
            op = "probe_vram",
            source = "sysfs_unparseable",
            vram_gb = 0.0,
            gpu_count = gpu_count,
            root = %root.display(),
            "NVIDIA sysfs entries present but no Video Memory line (current 5xx drivers dropped it) — falling back to nvidia-smi"
        );
    }
    vram_gb
}

/// Parse a memory-size string such as `" 24576 MiB"` or `"24 GB"` into
/// a MiB count.
///
/// The first whitespace-separated token is the number, the optional
/// second token the unit (matched case-insensitively):
///
/// - `GiB` / `GB` — the value is multiplied by 1024;
/// - `MiB` / `MB`, a missing unit, or any unrecognised unit — the value
///   is taken as MiB as-is.  Unknown units are deliberately kept rather
///   than rejected so an odd suffix can't zero a GPU out of the total.
///
/// Returns `None` when there is no leading token, when it does not
/// parse as a number, or when the resulting MiB figure is not a finite,
/// non-negative number.  The last check matters because `f64`'s parser
/// also accepts `nan`, `inf`/`infinity` and negative values, none of
/// which is a memory size, and because scaling a huge GiB value can
/// overflow to infinity.
fn parse_mib(s: &str) -> Option<f64> {
    // `split_whitespace` already ignores leading/trailing whitespace.
    let mut parts = s.split_whitespace();
    let value = parts.next()?.parse::<f64>().ok()?;
    let unit = parts.next().unwrap_or("MiB");
    let is_gib = unit.eq_ignore_ascii_case("gib") || unit.eq_ignore_ascii_case("gb");
    let mib = if is_gib { value * 1024.0 } else { value };
    // `is_finite` is false for NaN and ±inf.
    (mib.is_finite() && mib >= 0.0).then_some(mib)
}

// Unit tests for the host probes.  Pure parsers are checked by value;
// logging behaviour is checked by capturing the emitted events with
// `crate::test_support::capture` and asserting on substrings of the
// rendered output.
#[cfg(test)]
mod tests {
    use super::*;

    // Recognised units: MiB/MB pass through, GiB/GB scale by 1024.
    #[test]
    fn parse_mib_handles_mib() {
        assert_eq!(parse_mib(" 24576 MiB"), Some(24576.0));
        assert_eq!(parse_mib("12288 MB"), Some(12288.0));
        assert_eq!(parse_mib("24 GiB"), Some(24576.0));
        assert_eq!(parse_mib("8 GB"), Some(8192.0));
    }

    #[test]
    fn parse_mib_defaults_to_mib_when_the_unit_is_omitted() {
        // A bare value (no unit token) is assumed to already be in MiB —
        // the same shape as the `--format=csv,noheader,nounits`
        // nvidia-smi query used by this module, where the unit suffix is
        // stripped.
        assert_eq!(parse_mib("4096"), Some(4096.0));
    }

    #[test]
    fn parse_mib_treats_an_unknown_unit_as_raw_mib() {
        // An unrecognised suffix must not silently zero the GPU out of
        // the VRAM total: the worker claims jobs by VRAM, so dropping a
        // card to 0 would make it refuse work it can actually run. We
        // keep the numeric value as-is (best-effort MiB) rather than
        // returning `None`.
        assert_eq!(parse_mib("2048 KiB"), Some(2048.0));
        assert_eq!(parse_mib("4 TB"), Some(4.0));
    }

    #[test]
    fn parse_mib_rejects_unparseable_or_empty_values() {
        // A non-numeric leading token (e.g. an `[N/A]` placeholder) or
        // an empty / whitespace-only line yields `None` so the caller
        // skips it instead of polluting the total with a bogus number.
        assert_eq!(parse_mib("N/A MiB"), None);
        assert_eq!(parse_mib(""), None);
        assert_eq!(parse_mib("   "), None);
    }

    // Smoke test against the real host: whatever the lookup does, the
    // `unknown-host` fallback guarantees a non-empty string.
    #[test]
    fn machine_name_returns_non_empty() {
        assert!(!machine_name().is_empty());
    }

    // Smoke test against the real host probe.
    #[test]
    fn username_returns_non_empty() {
        assert!(!username().is_empty());
    }

    // Success path: the probed value is passed through unchanged.
    #[test]
    fn username_from_probe_returns_the_resolved_value() {
        let user = username_from_probe(Ok::<_, std::io::Error>("alice".to_string()));
        assert_eq!(user, "alice");
    }

    // Error path: the placeholder replaces the missing value.
    #[test]
    fn username_from_probe_falls_back_to_unknown_user_on_error() {
        let user =
            username_from_probe(Err::<String, _>(std::io::Error::other("no entropy source")));
        assert_eq!(user, "unknown-user");
    }

    #[test]
    fn username_from_probe_warns_with_the_error_on_failure() {
        // whoami 2.x made the probe fallible; a failure must leave an
        // operator-visible breadcrumb naming the error rather than a
        // silent fallback that hides why the user came back unknown.
        let logs = crate::test_support::capture(|| {
            let _ =
                username_from_probe(Err::<String, _>(std::io::Error::other("permission denied")));
        });
        assert!(logs.contains("WARN"), "expected WARN level, got: {logs}");
        assert!(
            logs.contains("op=\"username\""),
            "expected username op, got: {logs}"
        );
        assert!(
            logs.contains("permission denied"),
            "expected underlying error, got: {logs}"
        );
    }

    // Success path also logs: a DEBUG event carrying the resolved value.
    #[test]
    fn username_from_probe_emits_debug_value_on_success() {
        let logs = crate::test_support::capture(|| {
            let _ = username_from_probe(Ok::<_, std::io::Error>("bob".to_string()));
        });
        assert!(logs.contains("DEBUG"), "expected DEBUG event, got: {logs}");
        assert!(
            logs.contains("value=bob"),
            "expected resolved value, got: {logs}"
        );
    }

    // A root that does not exist reads as "no GPU": 0 GB, no panic.
    #[test]
    fn detect_vram_gb_from_sysfs_returns_zero_when_root_missing() {
        let dir = tempfile::tempdir().unwrap();
        let missing = dir.path().join("nope");
        assert_eq!(detect_vram_gb_from_sysfs(&missing), 0.0);
    }

    // Two fake GPU directories, each with a parseable `information`
    // file: their MiB values are summed before the GB conversion.
    #[test]
    fn detect_vram_gb_from_sysfs_sums_parseable_gpus() {
        let dir = tempfile::tempdir().unwrap();
        for (bus, mib) in [("0000:01:00.0", "12288"), ("0000:02:00.0", "24576")] {
            let gpu = dir.path().join(bus);
            std::fs::create_dir_all(&gpu).unwrap();
            std::fs::write(
                gpu.join("information"),
                format!("Model: x\nVideo Memory: {mib} MiB\n"),
            )
            .unwrap();
        }
        // (12288 + 24576) / 1024 = 36 GiB
        let gb = detect_vram_gb_from_sysfs(dir.path());
        assert!((gb - 36.0).abs() < 1e-3, "got {gb}");
    }

    #[test]
    fn detect_vram_gb_from_sysfs_sums_only_survivors_when_one_gpu_is_unreadable() {
        // A healthy card next to one whose `information` can't be read
        // (here a *directory* named `information`, so `read_to_string`
        // fails on every platform): the survivor still totals, the bad
        // card is dropped from the sum rather than zeroing the host out.
        let dir = tempfile::tempdir().unwrap();
        let good = dir.path().join("0000:01:00.0");
        std::fs::create_dir_all(&good).unwrap();
        std::fs::write(good.join("information"), "Video Memory: 12288 MiB\n").unwrap();
        let bad = dir.path().join("0000:02:00.0");
        std::fs::create_dir_all(bad.join("information")).unwrap();
        // Only the healthy card's 12288 MiB / 1024 = 12 GiB counts.
        let gb = detect_vram_gb_from_sysfs(dir.path());
        assert!((gb - 12.0).abs() < 1e-3, "got {gb}");
    }

    // -----------------------------------------------------------------
    // nvidia-smi fallback — current NVIDIA drivers (5xx) dropped the
    // "Video Memory" line from the sysfs `information` file, so the
    // sysfs probe yields 0 on otherwise-capable hosts.  `nvidia-smi`
    // ships with every driver and its `--query-gpu` interface is stable
    // across versions, so it's the layout-proof fallback.
    // -----------------------------------------------------------------

    // Single GPU, bare MiB value, nothing dropped.
    #[test]
    fn parse_nvidia_smi_mib_reads_a_single_bare_value() {
        let total = parse_nvidia_smi_mib("24564\n").unwrap();
        assert_eq!(total.mib, 24564.0);
        assert_eq!(total.dropped, 0);
    }

    // One line per GPU: the values are added together.
    #[test]
    fn parse_nvidia_smi_mib_sums_multiple_gpus() {
        let total = parse_nvidia_smi_mib("24564\n24564\n").unwrap();
        assert_eq!(total.mib, 49128.0);
        assert_eq!(total.dropped, 0);
    }

    #[test]
    fn parse_nvidia_smi_mib_tolerates_units_and_crlf_whitespace() {
        // If `nounits` is ever dropped the value arrives as "24564 MiB".
        let total = parse_nvidia_smi_mib("  24564 MiB \r\n").unwrap();
        assert_eq!(total.mib, 24564.0);
        assert_eq!(total.dropped, 0);
    }

    // No usable line at all (empty output, or only `[N/A]`) is `None`,
    // not a zero total.
    #[test]
    fn parse_nvidia_smi_mib_returns_none_on_empty_or_na() {
        assert!(parse_nvidia_smi_mib("").is_none());
        assert!(parse_nvidia_smi_mib("\n[N/A]\n").is_none());
    }

    #[test]
    fn parse_nvidia_smi_mib_sums_survivors_and_counts_a_dropped_gpu() {
        // A healthy 24 GiB card next to one nvidia-smi reports `[N/A]`
        // for (fell off the bus / ECC fault): the survivor's VRAM still
        // totals, but the dropped card is counted, not silently lost.
        let total = parse_nvidia_smi_mib("24564\n[N/A]\n24564\n").unwrap();
        assert_eq!(total.mib, 49128.0);
        assert_eq!(total.dropped, 1);
    }

    #[test]
    fn parse_nvidia_smi_mib_warns_on_each_dropped_gpu_line() {
        // A multi-GPU box that under-reports its VRAM (and then refuses
        // jobs it can run) must leave a per-line breadcrumb naming the
        // offending value, not vanish the card without a trace.
        let logs = crate::test_support::capture(|| {
            let _ = parse_nvidia_smi_mib("24564\n[N/A]\n");
        });
        assert!(logs.contains("WARN"), "expected WARN level, got: {logs}");
        assert!(
            logs.contains("op=\"probe_vram\""),
            "expected probe_vram op, got: {logs}"
        );
        assert!(
            logs.contains("source=\"nvidia_smi\""),
            "expected source=nvidia_smi, got: {logs}"
        );
        assert!(
            logs.contains("[N/A]"),
            "the warning must name the unparseable value, got: {logs}"
        );
        assert!(
            logs.contains("dropping this GPU"),
            "the warning must explain the drop, got: {logs}"
        );
    }

    #[test]
    fn vram_gb_from_smi_stdout_reports_dropped_count_in_breadcrumb() {
        // The success breadcrumb must surface how many GPUs were dropped
        // so a truncated VRAM total can't pass for a complete one.
        let logs = crate::test_support::capture(|| {
            let gb = vram_gb_from_smi_stdout("24564\n[N/A]\n").unwrap();
            assert!((gb - 23.99).abs() < 0.05, "survivor still totals: {gb}");
        });
        assert!(
            logs.contains("dropped=1"),
            "the breadcrumb must report the dropped count, got: {logs}"
        );
    }

    #[test]
    fn vram_gb_from_smi_stdout_converts_mib_to_gb() {
        // 24564 MiB / 1024 = 23.99 GiB
        let gb = vram_gb_from_smi_stdout("24564\n").unwrap();
        assert!((gb - 23.99).abs() < 0.05, "got {gb}");
    }

    // Unusable output propagates as `None` rather than `Some(0.0)`.
    #[test]
    fn vram_gb_from_smi_stdout_is_none_when_unparseable() {
        assert_eq!(vram_gb_from_smi_stdout("\n[N/A]\n"), None);
    }

    // The success breadcrumb is an INFO event tagged with the probe op
    // and the nvidia-smi source.
    #[test]
    fn vram_gb_from_smi_stdout_emits_info_breadcrumb_on_success() {
        let logs = crate::test_support::capture(|| {
            let _ = vram_gb_from_smi_stdout("24564\n");
        });
        assert!(logs.contains("INFO"), "expected INFO level, got: {logs}");
        assert!(
            logs.contains("op=\"probe_vram\""),
            "expected probe_vram op, got: {logs}"
        );
        assert!(
            logs.contains("source=\"nvidia_smi\""),
            "expected source=nvidia_smi, got: {logs}"
        );
    }
}