Skip to main content

neuronbox_runtime/
vram_watch.rs

//! Soft monitoring: if VRAM usage reported by `nvidia-smi` for a PID exceeds
//! ~115% of the registered estimate, send SIGKILL (Unix only; a no-op on other targets).
3
4use std::process::Command;
5use std::time::Duration;
6
7use tokio::time::sleep;
8
9use crate::gpu_manager::GpuManager;
10use crate::host::compute_apps_pid_memory_mb;
11
12pub async fn run_soft_vram_enforcement(gm: GpuManager) {
13    loop {
14        sleep(Duration::from_secs(5)).await;
15        let usage =
16            match tokio::task::spawn_blocking(|| compute_apps_pid_memory_mb().ok_or(())).await {
17                Ok(Ok(m)) => m,
18                _ => continue,
19            };
20        let sessions = gm.list().await;
21        for s in sessions {
22            let Some(used_mb) = usage.get(&s.pid).copied() else {
23                continue;
24            };
25            let limit = s.estimated_vram_mb.saturating_mul(115) / 100;
26            if used_mb > limit && limit > 0 {
27                tracing::warn!(
28                    "VRAM soft limit: pid {} ({}) used {} MiB > {} MiB — SIGKILL",
29                    s.pid,
30                    s.name,
31                    used_mb,
32                    limit
33                );
34                kill_pid_hard(s.pid);
35                gm.unregister(s.pid).await;
36            }
37        }
38    }
39}
40
/// Sends SIGKILL to `pid` by running the external `kill` utility.
///
/// Best effort: a failure to spawn `kill`, or a non-zero exit status from it,
/// is ignored.
///
/// PIDs outside `1..=i32::MAX` are ignored without spawning anything. For
/// kill(2) a target of 0 addresses every process in the caller's process
/// group, and a value above `i32::MAX` is not a valid `pid_t` and may be
/// misparsed by `kill` as a negative (group / broadcast) target — either
/// could take down far more than the one offending session.
#[cfg(unix)]
fn kill_pid_hard(pid: u32) {
    if pid == 0 || pid > i32::MAX as u32 {
        return;
    }
    let _ = Command::new("kill")
        .args(["-KILL", &pid.to_string()])
        .status();
}
47
/// Fallback for non-Unix targets: intentionally does nothing with the PID.
#[cfg(not(unix))]
fn kill_pid_hard(_pid: u32) {
    // No hard-kill is implemented for this target; the argument is ignored.
}