supermachine 0.7.72

//! Process-wide memory admission control for microVM workers.
//!
//! Each worker subprocess reserves roughly its configured guest RAM
//! plus a fixed host-side overhead. When a pool (or an embedder loop)
//! spawns many workers at once, the committed total can exceed host RAM
//! and macOS jetsam starts killing workers — observed under load as
//! "agent closed connection before EXIT", `EUCLEAN`, or vanished VMs.
//!
//! This gate bounds the in-flight committed total. A spawn that would
//! push the total past the budget BLOCKS until a live worker is
//! released (its [`AdmissionGuard`] drops), instead of overcommitting.
//! It is a back-pressure valve, not a hard limit: a single worker whose
//! footprint alone exceeds the budget is still admitted (we can never do
//! better than running it alone), so the gate can never deadlock.
//!
//! Budget = `SUPERMACHINE_MEMORY_BUDGET_MIB` if set (`0` disables the
//! gate entirely — unlimited, legacy behaviour), else host RAM × 0.8.

use std::path::PathBuf;
use std::sync::{Condvar, Mutex, OnceLock};
use std::time::{Duration, Instant};

/// Fixed host-side overhead, in MiB, charged per worker on top of its
/// guest RAM: page tables, device DMA buffers, the muxer RX queue +
/// per-connection `pending_rx`, agent/exec buffers, and the worker's
/// thread stacks.
///
/// The big safety margin comes from the budget factor (host RAM × 0.8);
/// this just keeps each worker's charge a little above its bare guest
/// RAM so the count-of-workers estimate isn't optimistic.
pub const WORKER_OVERHEAD_MIB: u64 = 64;

/// Numerator of the default budget fraction of host RAM, used when
/// `SUPERMACHINE_MEMORY_BUDGET_MIB` is unset. Together with
/// [`DEFAULT_BUDGET_DEN`] it yields 80/100 = 0.8, leaving headroom for
/// the host OS, the parent process, and the lazy-fault slack between
/// "admitted" (worst-case resident) and actually-resident.
const DEFAULT_BUDGET_NUM: u64 = 80;
/// Denominator of the default budget fraction (see [`DEFAULT_BUDGET_NUM`]).
const DEFAULT_BUDGET_DEN: u64 = 100;

/// How long a blocked spawn waits for a release before admitting anyway
/// (overcommitting) to guarantee forward progress: 60 seconds.
///
/// The gate is a valve for *transient* spikes — concurrent acquires that
/// will release as their work finishes wake well within this window. The
/// timeout only bites when releases genuinely aren't coming, e.g. a pool
/// whose minimum size alone exceeds the budget: there we must not hang,
/// so we proceed with a loud warning instead. Override with
/// `SUPERMACHINE_MEMORY_ADMISSION_TIMEOUT_MS`.
const DEFAULT_ADMISSION_TIMEOUT: Duration = Duration::from_secs(60);

/// Mutable per-process counters, guarded by the `Mutex` in
/// [`MemoryAccountant`]. Both fields are updated together on every
/// admit / charge / release.
struct Inner {
    /// Sum of the footprints of all admitted-but-not-yet-released
    /// workers (MiB).
    committed_mib: u64,
    /// Count of live admissions (outstanding guards).
    live: u64,
}

/// The accounting state. Process-wide via [`accountant`]; constructed
/// standalone in tests via [`MemoryAccountant::with_budget_for_test`].
///
/// When `host` is `Some`, reserve / charge / release / snapshot are all
/// delegated to the [`HostCoord`] and the in-process `inner` counters and
/// `cv` are not touched.
pub struct MemoryAccountant {
    /// Per-process committed total + live count.
    inner: Mutex<Inner>,
    /// Signalled on every per-process release so blocked spawns re-check
    /// whether they now fit.
    cv: Condvar,
    /// `0` = gate disabled (unlimited / legacy behaviour).
    budget_mib: u64,
    /// How long a blocked spawn waits before admitting anyway (forward-
    /// progress guarantee — see [`DEFAULT_ADMISSION_TIMEOUT`]).
    timeout: Duration,
    /// `Some` when `SUPERMACHINE_MEMORY_BUDGET_SCOPE=host`: the budget is
    /// coordinated across processes via the shared dir, so N independent
    /// supermachine processes can't each assume the whole budget and
    /// collectively overcommit. `None` (default) = per-process budget.
    host: Option<HostCoord>,
}

/// Storage for the process-wide singleton; populated exactly once, on the
/// first call to [`accountant`].
static ACCOUNTANT: OnceLock<MemoryAccountant> = OnceLock::new();

/// Return the process-wide accountant, building it on the first call.
///
/// The budget and the admission timeout are read once (env vars / host
/// RAM) and then fixed for the life of the process.
pub fn accountant() -> &'static MemoryAccountant {
    ACCOUNTANT.get_or_init(|| {
        let budget = compute_budget_mib();
        let mut built = MemoryAccountant::new(budget, admission_timeout());

        // Cross-process coordination is opt-in: it needs an enabled gate
        // (budget != 0) AND an explicit request for host scope.
        let host_scope_wanted = budget != 0 && budget_scope_is_host();
        if !host_scope_wanted {
            return built;
        }

        // An unusable shared dir must never break spawning, so on error we
        // warn and keep the per-process budget.
        match HostCoord::new(host_budget_dir()) {
            Err(e) => eprintln!(
                "[memory-admission] host scope requested but shared dir is unusable \
                 ({e}); falling back to per-process budget"
            ),
            Ok(coord) => built.host = Some(coord),
        }
        built
    })
}

/// RAII receipt for an admitted worker. Holds `mib` of the budget until
/// dropped; on drop the budget is returned and any blocked spawn is
/// woken. Store it in the worker so the reservation tracks the worker's
/// lifetime exactly. `mib == 0` means the gate was disabled — a
/// zero-cost no-op guard.
#[must_use = "dropping the guard immediately releases the memory reservation"]
pub struct AdmissionGuard {
    /// The accountant the reservation was taken from and is returned to.
    accountant: &'static MemoryAccountant,
    /// MiB charged for this worker; `0` makes the drop a no-op.
    mib: u64,
}

impl AdmissionGuard {
    /// Amount of budget, in MiB, held by this guard. A guard issued while
    /// the gate is disabled reports 0.
    pub fn reserved_mib(&self) -> u64 {
        let Self { mib, .. } = self;
        *mib
    }
}

impl Drop for AdmissionGuard {
    /// Hand the reservation back to the accountant. Zero-MiB guards (gate
    /// disabled) never charged anything, so they skip the release.
    fn drop(&mut self) {
        if self.mib == 0 {
            return;
        }
        self.accountant.release(self.mib);
    }
}

impl std::fmt::Debug for AdmissionGuard {
    /// Prints only the reserved amount; the accountant reference is
    /// deliberately left out of the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("AdmissionGuard");
        builder.field("mib", &self.mib);
        builder.finish()
    }
}

/// Reserve `mib` against the process-wide budget, blocking until it fits.
///
/// The reservation is returned when the guard is dropped. When the gate
/// is disabled (budget 0) the call returns at once with a no-op guard.
pub fn admit(mib: u64) -> AdmissionGuard {
    let gate = accountant();
    admit_on(gate, mib)
}

/// Same as [`admit`], but against an explicit accountant: production code
/// passes the singleton, tests pass a leaked standalone instance.
fn admit_on(accountant: &'static MemoryAccountant, mib: u64) -> AdmissionGuard {
    // The guard records what was actually charged (0 when the gate is off),
    // not what was requested.
    AdmissionGuard {
        accountant,
        mib: accountant.reserve_blocking(mib),
    }
}

/// Account `mib` against the budget WITHOUT blocking.
///
/// Meant for workers that are ALREADY running (e.g. a bake-time warm
/// worker handed into a pool): their RAM is already spent, so gating them
/// would be pointless and could even deadlock. Recording the charge keeps
/// the committed total honest for later blocking [`admit`] calls.
pub fn charge(mib: u64) -> AdmissionGuard {
    let gate = accountant();
    AdmissionGuard {
        accountant: gate,
        mib: gate.charge(mib),
    }
}

impl MemoryAccountant {
    fn new(budget_mib: u64, timeout: Duration) -> Self {
        MemoryAccountant {
            inner: Mutex::new(Inner {
                committed_mib: 0,
                live: 0,
            }),
            cv: Condvar::new(),
            budget_mib,
            timeout,
            host: None,
        }
    }

    /// The configured budget in MiB (`0` = disabled).
    pub fn budget_mib(&self) -> u64 {
        self.budget_mib
    }

    /// `(committed_mib, live_workers)` snapshot — diagnostics / tests.
    /// In host scope, `committed_mib` is the host-wide total and the
    /// second element is this process's own contribution.
    pub fn snapshot(&self) -> (u64, u64) {
        if let Some(h) = &self.host {
            return h.snapshot();
        }
        let g = self.inner.lock().unwrap();
        (g.committed_mib, g.live)
    }

    /// Reserve `mib`, blocking until it fits OR the admission timeout
    /// elapses. Returns the MiB actually charged (0 when the gate is
    /// disabled, so the guard's drop is a no-op). Guaranteed to make
    /// forward progress — it never blocks forever:
    ///   - an over-budget worker is admitted immediately once nothing
    ///     else is committed (can't do better than running it solo);
    ///   - if releases simply aren't coming (e.g. a pool whose minimum
    ///     size alone exceeds the budget), it admits anyway after the
    ///     timeout with a loud warning, rather than hanging the spawn.
    fn reserve_blocking(&self, mib: u64) -> u64 {
        if self.budget_mib == 0 || mib == 0 {
            return 0;
        }
        if let Some(h) = &self.host {
            return h.reserve_blocking(mib, self.budget_mib, self.timeout);
        }
        let mut g = self.inner.lock().unwrap();
        let mut waited_from: Option<Instant> = None;
        loop {
            // Admit when it fits, OR when nothing else is committed (we
            // must never block the only worker — even one that alone
            // exceeds the budget; refusing would be pointless).
            if g.committed_mib == 0 || g.committed_mib + mib <= self.budget_mib {
                if mib > self.budget_mib && g.committed_mib == 0 {
                    eprintln!(
                        "[memory-admission] worker footprint {mib} MiB exceeds the entire \
                         budget {} MiB — admitting it alone; raise \
                         SUPERMACHINE_MEMORY_BUDGET_MIB or lower the VM's --memory",
                        self.budget_mib
                    );
                }
                g.committed_mib += mib;
                g.live += 1;
                if let Some(t0) = waited_from {
                    eprintln!(
                        "[memory-admission] admitted {mib} MiB after waiting {:?} \
                         (now {}/{} MiB across {} workers)",
                        t0.elapsed(),
                        g.committed_mib,
                        self.budget_mib,
                        g.live
                    );
                }
                return mib;
            }
            let t0 = *waited_from.get_or_insert_with(|| {
                eprintln!(
                    "[memory-admission] spawn needs {mib} MiB but {}/{} MiB already \
                     committed across {} workers — waiting for a release rather than \
                     overcommitting host RAM (set SUPERMACHINE_MEMORY_BUDGET_MIB=0 to disable)",
                    g.committed_mib, self.budget_mib, g.live
                );
                Instant::now()
            });
            // Forward-progress guarantee: if nothing has released within
            // the timeout, admit anyway rather than hang. This means an
            // oversized pool overcommits (with a warning) instead of
            // dead-locking on releases that will never come.
            let elapsed = t0.elapsed();
            if elapsed >= self.timeout {
                eprintln!(
                    "[memory-admission] waited {:?} for {mib} MiB with no release \
                     ({}/{} MiB committed across {} workers) — admitting anyway to avoid a \
                     hang; the host may now be overcommitted (lower pool size / VM --memory, \
                     or raise SUPERMACHINE_MEMORY_BUDGET_MIB)",
                    elapsed, g.committed_mib, self.budget_mib, g.live
                );
                g.committed_mib += mib;
                g.live += 1;
                return mib;
            }
            let (ng, _) = self.cv.wait_timeout(g, self.timeout - elapsed).unwrap();
            g = ng;
        }
    }

    /// Account `mib` without blocking — see the free [`charge`] fn.
    fn charge(&self, mib: u64) -> u64 {
        if self.budget_mib == 0 || mib == 0 {
            return 0;
        }
        if let Some(h) = &self.host {
            return h.charge(mib);
        }
        let mut g = self.inner.lock().unwrap();
        g.committed_mib += mib;
        g.live += 1;
        mib
    }

    fn release(&self, mib: u64) {
        if let Some(h) = &self.host {
            h.release(mib);
            return;
        }
        {
            let mut g = self.inner.lock().unwrap();
            g.committed_mib = g.committed_mib.saturating_sub(mib);
            g.live = g.live.saturating_sub(1);
        }
        // Wake every waiter: a freed slot may fit several small pending
        // spawns, and the ones that still don't fit re-block cheaply.
        self.cv.notify_all();
    }

    /// Build a standalone accountant with an explicit budget for tests.
    /// Leaked to `&'static` so its [`AdmissionGuard`]s satisfy the
    /// guard's `'static` accountant reference. Uses the default (long)
    /// timeout so block-then-release tests aren't races.
    #[cfg(test)]
    fn with_budget_for_test(budget_mib: u64) -> &'static MemoryAccountant {
        Box::leak(Box::new(MemoryAccountant::new(
            budget_mib,
            DEFAULT_ADMISSION_TIMEOUT,
        )))
    }

    /// As [`Self::with_budget_for_test`] but with an explicit (short)
    /// timeout to exercise the admit-anyway forward-progress path.
    #[cfg(test)]
    fn with_budget_and_timeout_for_test(
        budget_mib: u64,
        timeout: Duration,
    ) -> &'static MemoryAccountant {
        Box::leak(Box::new(MemoryAccountant::new(budget_mib, timeout)))
    }
}

/// Work out the budget in MiB.
///
/// An explicit `SUPERMACHINE_MEMORY_BUDGET_MIB` wins (including `0`,
/// which disables the gate); an unparseable value is reported and
/// ignored. Otherwise the budget is host RAM × 0.8. If host RAM can't be
/// probed (reported as 0) the gate is disabled rather than guessing a
/// bound that might serialise every spawn.
fn compute_budget_mib() -> u64 {
    if let Ok(v) = std::env::var("SUPERMACHINE_MEMORY_BUDGET_MIB") {
        match v.trim().parse::<u64>() {
            Ok(explicit) => return explicit,
            Err(_) => eprintln!(
                "[memory-admission] ignoring unparseable SUPERMACHINE_MEMORY_BUDGET_MIB={v:?}; \
                 falling back to the host-RAM default"
            ),
        }
    }
    match host_ram_mib() {
        0 => 0,
        host_mib => host_mib.saturating_mul(DEFAULT_BUDGET_NUM) / DEFAULT_BUDGET_DEN,
    }
}

/// The blocked-spawn timeout: `SUPERMACHINE_MEMORY_ADMISSION_TIMEOUT_MS`
/// (whole milliseconds) if set and parseable, else
/// [`DEFAULT_ADMISSION_TIMEOUT`].
///
/// An unparseable value is reported on stderr and ignored, mirroring how
/// `SUPERMACHINE_MEMORY_BUDGET_MIB` is handled, so a typo doesn't
/// silently leave the default in force.
fn admission_timeout() -> Duration {
    if let Ok(v) = std::env::var("SUPERMACHINE_MEMORY_ADMISSION_TIMEOUT_MS") {
        match v.trim().parse::<u64>() {
            Ok(ms) => return Duration::from_millis(ms),
            Err(_) => eprintln!(
                "[memory-admission] ignoring unparseable \
                 SUPERMACHINE_MEMORY_ADMISSION_TIMEOUT_MS={v:?}; falling back to the default \
                 admission timeout"
            ),
        }
    }
    DEFAULT_ADMISSION_TIMEOUT
}

/// macOS: total physical RAM in MiB, read from the `hw.memsize` sysctl.
/// Returns `0` when the sysctl call fails or reports zero bytes.
#[cfg(target_os = "macos")]
fn host_ram_mib() -> u64 {
    // sysctl `hw.memsize` → total physical RAM in bytes.
    let mut size: u64 = 0;
    let mut len = std::mem::size_of::<u64>();
    let name = c"hw.memsize";
    // SAFETY: `name` is a NUL-terminated C string literal; `size` is a
    // live u64 and `len` is initialised to its byte size, so the output
    // buffer the kernel is given matches the storage. The "new value"
    // pointer is null with length 0, i.e. nothing is written to the sysctl.
    let r = unsafe {
        libc::sysctlbyname(
            name.as_ptr(),
            &mut size as *mut _ as *mut libc::c_void,
            &mut len,
            std::ptr::null_mut(),
            0,
        )
    };
    if r == 0 && size > 0 {
        // bytes → MiB
        size / (1024 * 1024)
    } else {
        0
    }
}

/// Linux: total physical RAM in MiB, taken from the `MemTotal` line of
/// /proc/meminfo. Returns `0` when the file is unreadable or has no
/// parseable `MemTotal`; `compute_budget_mib` treats a zero result as
/// "cannot probe" and disables the gate.
#[cfg(not(target_os = "macos"))]
fn host_ram_mib() -> u64 {
    let Ok(meminfo) = std::fs::read_to_string("/proc/meminfo") else {
        return 0;
    };
    // kB → MiB
    parse_meminfo_total_kb(&meminfo).map_or(0, |kb| kb / 1024)
}

/// Extract the `MemTotal: <N> kB` figure (in kB) from /proc/meminfo
/// contents. Pure string parsing, so it can be unit-tested on any host.
///
/// Returns the first `MemTotal:` line whose first field parses as a
/// `u64`; `None` if there is no such line.
#[cfg_attr(target_os = "macos", allow(dead_code))]
fn parse_meminfo_total_kb(s: &str) -> Option<u64> {
    for line in s.lines() {
        let Some(rest) = line.strip_prefix("MemTotal:") else {
            continue;
        };
        let first_field = rest.split_whitespace().next();
        if let Some(kb) = first_field.and_then(|tok| tok.parse::<u64>().ok()) {
            return Some(kb);
        }
    }
    None
}

/// Extract the `VmRSS: <N> kB` figure (in kB) from the text of a
/// /proc/<pid>/status file.
///
/// Returns the first `VmRSS:` line whose first field parses as a `u64`;
/// `None` if there is no such line.
#[cfg_attr(target_os = "macos", allow(dead_code))]
fn parse_status_vmrss_kb(s: &str) -> Option<u64> {
    s.lines()
        .filter_map(|line| line.strip_prefix("VmRSS:"))
        .filter_map(|rest| rest.split_whitespace().next())
        .find_map(|field| field.parse::<u64>().ok())
}

/// Read the `avg10=<f>` value from the `some` line of
/// /proc/pressure/memory (PSI). `avg10` is the percentage of the last 10s
/// that at least one task stalled on memory.
///
/// Only the first line starting with `some ` is considered; `None` if
/// that line is missing, has no `avg10=` token, or the value isn't a
/// float.
#[cfg_attr(target_os = "macos", allow(dead_code))]
fn parse_psi_some_avg10(s: &str) -> Option<f64> {
    let some_fields = s.lines().find_map(|line| line.strip_prefix("some "))?;
    let raw_avg10 = some_fields
        .split_whitespace()
        .find_map(|tok| tok.strip_prefix("avg10="))?;
    raw_avg10.parse::<f64>().ok()
}

// ---------- runtime footprint + memory-pressure probes ----------
//
// These let the admission gate charge a worker's MEASURED resident
// footprint (phys_footprint — the metric macOS jetsam actually accounts)
// instead of its configured `--memory` cap, which is usually a big
// over-estimate. Both are cheap host syscalls used only off the hot path
// (per-spawn / pre-admit), never per-packet.

/// macOS: the resident footprint of process `pid` in MiB, via
/// `proc_pid_rusage(RUSAGE_INFO_V2).ri_phys_footprint` — the same
/// "memory footprint" Activity Monitor shows and jetsam uses to decide
/// what to kill. `None` if the call fails (e.g. the pid is gone).
#[cfg(target_os = "macos")]
pub fn phys_footprint_mib(pid: u32) -> Option<u64> {
    // SAFETY: `rusage_info_v2` is a plain C struct used purely as an
    // output buffer, so the all-zero bit pattern is a valid starting value.
    let mut info: libc::rusage_info_v2 = unsafe { std::mem::zeroed() };
    // proc_pid_rusage takes a `rusage_info_t *` (= `void **`); the
    // canonical call casts `&mut struct` to that. See <libproc.h>.
    // SAFETY: `info` is a live `rusage_info_v2` and the flavor passed is
    // `RUSAGE_INFO_V2`, so the buffer type matches what is requested.
    let rc = unsafe {
        libc::proc_pid_rusage(
            pid as libc::c_int,
            libc::RUSAGE_INFO_V2,
            &mut info as *mut libc::rusage_info_v2 as *mut libc::rusage_info_t,
        )
    };
    if rc == 0 {
        // bytes → MiB
        Some(info.ri_phys_footprint / (1024 * 1024))
    } else {
        None
    }
}

/// Linux: resident set size (`VmRSS`) of process `pid` in MiB, read from
/// /proc/<pid>/status. This is the closest analog to macOS
/// `phys_footprint` for charging a worker's MEASURED footprint instead of
/// its configured `--memory` cap. `None` if the status file can't be read
/// (e.g. the pid is gone) or carries no parseable `VmRSS` line.
#[cfg(not(target_os = "macos"))]
pub fn phys_footprint_mib(pid: u32) -> Option<u64> {
    let status_path = format!("/proc/{pid}/status");
    let status = match std::fs::read_to_string(status_path) {
        Ok(text) => text,
        Err(_) => return None,
    };
    let rss_kb = parse_status_vmrss_kb(&status)?;
    // kB → MiB
    Some(rss_kb / 1024)
}

/// macOS VM-pressure level: `1` = normal, `2` = warn, `4` = critical
/// (`kern.memorystatus_vm_pressure_level`). Used as a best-effort safety
/// backstop: when the host is genuinely under pressure we stop admitting
/// regardless of the budget arithmetic. Returns `1` (normal) if the
/// sysctl is unavailable — so a missing probe never adds spurious
/// back-pressure; the budget + margin remain the primary guard.
#[cfg(target_os = "macos")]
pub fn memory_pressure_level() -> u32 {
    // Pre-set to "normal"; the kernel overwrites it on success.
    let mut level: u32 = 1;
    let mut len = std::mem::size_of::<u32>();
    let name = c"kern.memorystatus_vm_pressure_level";
    // SAFETY: `name` is a NUL-terminated C string literal; `level` is a
    // live u32 and `len` is initialised to its byte size, so the output
    // buffer matches the storage. The "new value" pointer is null with
    // length 0, i.e. nothing is written to the sysctl.
    let rc = unsafe {
        libc::sysctlbyname(
            name.as_ptr(),
            &mut level as *mut u32 as *mut libc::c_void,
            &mut len,
            std::ptr::null_mut(),
            0,
        )
    };
    if rc == 0 {
        level
    } else {
        // Probe unavailable → report "normal".
        1
    }
}

/// Linux memory-pressure level on the macOS-compatible scale (`1` normal / `2`
/// warn / `4` critical), from PSI (`/proc/pressure/memory`, kernels ≥ 4.20 with
/// `CONFIG_PSI`). `some avg10` is the % of the last 10s ≥1 task stalled on
/// memory: >=20% → critical, >=5% → warn, else normal. Missing PSI (old kernel /
/// not mounted) → `1`, so a missing probe never adds spurious back-pressure —
/// the budget + margin stay the primary guard, exactly like the macOS fallback.
#[cfg(not(target_os = "macos"))]
pub fn memory_pressure_level() -> u32 {
    match std::fs::read_to_string("/proc/pressure/memory")
        .ok()
        .and_then(|s| parse_psi_some_avg10(&s))
    {
        Some(avg10) => psi_avg10_to_level(avg10),
        None => 1,
    }
}

/// Grade a PSI `some avg10` percentage on the 1/2/4 pressure scale:
/// `>= 20.0` → 4 (critical), `>= 5.0` → 2 (warn), anything else
/// (including NaN, which fails both comparisons) → 1 (normal).
#[cfg_attr(target_os = "macos", allow(dead_code))]
fn psi_avg10_to_level(avg10: f64) -> u32 {
    match avg10 {
        pct if pct >= 20.0 => 4,
        pct if pct >= 5.0 => 2,
        _ => 1,
    }
}

/// Best-effort safety backstop for the rss-charge overcommit mode: while
/// the host reports memory pressure (warn or critical), hold off the next
/// spawn until the pressure relieves.
///
/// The wait is bounded — after 10 seconds it proceeds regardless (forward
/// progress; the budget + per-worker margin remain the primary guard).
/// When pressure is normal this is a single probe on the (already slow)
/// spawn path. Only called in rss-charge mode; the default cap-charge
/// path never invokes it.
pub fn await_pressure_relief() {
    const WARN: u32 = 2;
    const POLL_INTERVAL: Duration = Duration::from_millis(250);
    let give_up_after = Duration::from_secs(10);
    let started = Instant::now();
    // Whether the "pausing" line was printed, so the "relieved" line is
    // only emitted for waits the operator was told about.
    let mut announced = false;
    loop {
        if memory_pressure_level() < WARN {
            break;
        }
        if started.elapsed() >= give_up_after {
            eprintln!(
                "[memory-admission] host under memory pressure for {:?} — admitting anyway \
                 to avoid a stall",
                started.elapsed()
            );
            return;
        }
        if !announced {
            eprintln!(
                "[memory-admission] host memory pressure elevated — pausing new spawns until \
                 it relieves"
            );
            announced = true;
        }
        std::thread::sleep(POLL_INTERVAL);
    }
    if announced {
        eprintln!(
            "[memory-admission] memory pressure relieved after {:?}",
            started.elapsed()
        );
    }
}

// ---------- host-wide (cross-process) coordination ----------

/// Whether `SUPERMACHINE_MEMORY_BUDGET_SCOPE` asks for host scope, i.e.
/// its trimmed value equals `host` (ASCII case-insensitive). Unset,
/// non-Unicode, or any other value means the default per-process scope,
/// so single-process embedders see zero change.
fn budget_scope_is_host() -> bool {
    match std::env::var("SUPERMACHINE_MEMORY_BUDGET_SCOPE") {
        Ok(scope) => scope.trim().eq_ignore_ascii_case("host"),
        Err(_) => false,
    }
}

/// Directory shared by all supermachine processes on the host, holding
/// the per-PID reservation files and the lock file.
///
/// `SUPERMACHINE_MEMORY_BUDGET_DIR` overrides it when set to something
/// other than blank; the value is used as given (not trimmed). Otherwise
/// it is `supermachine-mem-admission` under the system temp dir.
fn host_budget_dir() -> PathBuf {
    match std::env::var("SUPERMACHINE_MEMORY_BUDGET_DIR") {
        Ok(d) if !d.trim().is_empty() => PathBuf::from(d),
        _ => std::env::temp_dir().join("supermachine-mem-admission"),
    }
}

/// Cross-process memory-budget coordinator (opt-in `scope=host`).
///
/// Each process records its own committed MiB in `<dir>/<pid>`. The
/// host-wide committed total is the sum over **live** PID files; a file
/// whose PID is dead is reclaimed (unlinked) on the next sweep, so a
/// crashed process can never leak its reservation forever. An flock'd
/// lock file serialises the read-sum-decide-write critical section across
/// processes. All of this happens only on spawn/release — never on the
/// acquire/exec/dataplane hot paths — and spawn is already a multi-second
/// operation, so the file I/O + flock are negligible.
struct HostCoord {
    /// Shared directory scanned for per-PID reservation files.
    dir: PathBuf,
    /// `<dir>/.lock` — the file the advisory flock is taken on.
    lock_path: PathBuf,
    /// `<dir>/<pid>` — this process's own reservation file.
    my_path: PathBuf,
    /// This process's current contribution (mirror of `my_path`), kept in
    /// memory so we don't re-read our own file on every op.
    my_committed: Mutex<u64>,
}

impl HostCoord {
    /// Set up coordination in `dir`, creating the directory and the lock
    /// file if needed. Fails if either can't be created/opened, which
    /// lets [`accountant`] fall back to the per-process budget.
    fn new(dir: PathBuf) -> std::io::Result<Self> {
        std::fs::create_dir_all(&dir)?;
        let pid = std::process::id();
        // Probe writability up front so accountant() can fall back cleanly.
        let my_path = dir.join(pid.to_string());
        let lock_path = dir.join(".lock");
        std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false)
            .open(&lock_path)?;
        // NOTE(review): a leftover `<dir>/<pid>` file from a dead process
        // whose PID this process inherited is counted as ours until our
        // first write — consider clearing it here.
        Ok(HostCoord {
            dir,
            lock_path,
            my_path,
            my_committed: Mutex::new(0),
        })
    }

    /// Run `f` under an exclusive cross-process advisory lock (flock).
    /// The lock is dropped explicitly after `f` returns, and implicitly
    /// when the lock file handle is closed. Returns `None` if the lock
    /// file can't be opened or the lock can't be taken (caller degrades
    /// gracefully). A lock attempt interrupted by a signal is retried
    /// rather than reported as a failure.
    fn with_lock<T>(&self, f: impl FnOnce() -> T) -> Option<T> {
        use std::os::unix::io::AsRawFd;
        let file = std::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(false) // flock lock file — content is irrelevant
            .open(&self.lock_path)
            .ok()?;
        let fd = file.as_raw_fd();
        loop {
            // SAFETY: `fd` belongs to `file`, which stays open until the
            // end of this function.
            if unsafe { libc::flock(fd, libc::LOCK_EX) } == 0 {
                break;
            }
            // EINTR just means a signal arrived while we were blocked;
            // anything else is a genuine failure.
            if std::io::Error::last_os_error().kind() != std::io::ErrorKind::Interrupted {
                return None;
            }
        }
        let out = f();
        // SAFETY: same descriptor as above, still open.
        unsafe {
            libc::flock(fd, libc::LOCK_UN);
        }
        Some(out)
    }

    /// Sum committed MiB over all LIVE pid files; unlink dead ones. Must be
    /// called while holding the flock.
    ///
    /// Entries that are dotfiles, aren't named as a positive decimal PID,
    /// or whose content doesn't parse as a `u64` contribute nothing. An
    /// unreadable directory counts as 0 committed.
    fn host_committed_locked(&self) -> u64 {
        let mut total = 0u64;
        let Ok(rd) = std::fs::read_dir(&self.dir) else {
            return 0;
        };
        for ent in rd.flatten() {
            let name = ent.file_name();
            let name = name.to_string_lossy();
            if name.starts_with('.') {
                continue; // skip .lock and dotfiles
            }
            let Ok(pid) = name.parse::<i32>() else {
                continue;
            };
            // kill(2) treats pid 0 and negative pids as process-group /
            // broadcast targets, so a stray file with such a name would
            // look "alive" forever. Real reservation files are named after
            // a process id, which is always positive.
            if pid <= 0 {
                continue;
            }
            // Liveness: kill(pid, 0) → 0 (alive) or EPERM (alive, not ours).
            // ESRCH ⇒ dead ⇒ reclaim the stale reservation.
            // SAFETY: signal 0 performs only the existence/permission check;
            // no signal is delivered.
            let alive = unsafe { libc::kill(pid, 0) } == 0
                || std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM);
            if !alive {
                let _ = std::fs::remove_file(ent.path());
                continue;
            }
            if let Ok(s) = std::fs::read_to_string(ent.path()) {
                if let Ok(v) = s.trim().parse::<u64>() {
                    total = total.saturating_add(v);
                }
            }
        }
        total
    }

    /// Persist this process's contribution. Must hold the flock.
    /// A contribution of 0 removes the file instead of writing "0". File
    /// errors are ignored (best-effort); the in-memory mirror is always
    /// updated.
    fn write_my_locked(&self, mib: u64) {
        if mib == 0 {
            let _ = std::fs::remove_file(&self.my_path);
        } else {
            let _ = std::fs::write(&self.my_path, mib.to_string());
        }
        *self.my_committed.lock().unwrap() = mib;
    }

    /// Raise this process's contribution by `mib` (saturating) and persist
    /// it. Must hold the flock.
    fn add_my_locked(&self, mib: u64) {
        let mine = *self.my_committed.lock().unwrap();
        self.write_my_locked(mine.saturating_add(mib));
    }

    /// Host-scope counterpart of `MemoryAccountant::reserve_blocking`:
    /// reserve `mib` against the host-wide `budget`, polling every 200 ms
    /// until it fits, nothing at all is committed host-wide, or `timeout`
    /// elapses (then it admits anyway with a warning). Always returns
    /// `mib`. If the cross-process lock can't be taken the spawn is
    /// admitted immediately and only the in-memory mirror is updated.
    fn reserve_blocking(&self, mib: u64, budget: u64, timeout: Duration) -> u64 {
        let mut waited_from: Option<Instant> = None;
        loop {
            let decided = self.with_lock(|| {
                let host = self.host_committed_locked(); // includes our own file
                // Overflow-safe fit test: a sum that doesn't fit in u64
                // certainly doesn't fit in the budget.
                let fits = matches!(host.checked_add(mib), Some(total) if total <= budget);
                if host == 0 || fits {
                    self.add_my_locked(mib);
                    true
                } else {
                    false
                }
            });
            // If the lock itself failed, degrade to admitting (never hang
            // a spawn on a broken shared dir).
            match decided {
                Some(true) | None => {
                    if let Some(t0) = waited_from {
                        eprintln!(
                            "[memory-admission/host] admitted {mib} MiB after waiting {:?}",
                            t0.elapsed()
                        );
                    }
                    if decided.is_none() {
                        // Lock failed → still record our charge best-effort.
                        // One guard for the whole read-modify-write so
                        // concurrent threads can't lose an update.
                        let mut mine = self.my_committed.lock().unwrap();
                        *mine = mine.saturating_add(mib);
                    }
                    return mib;
                }
                Some(false) => {}
            }
            let t0 = *waited_from.get_or_insert_with(|| {
                eprintln!(
                    "[memory-admission/host] spawn needs {mib} MiB but the host-wide budget \
                     {budget} MiB is full across processes — waiting for a release"
                );
                Instant::now()
            });
            if t0.elapsed() >= timeout {
                eprintln!(
                    "[memory-admission/host] waited {:?} with no host-wide release — admitting \
                     anyway to avoid a hang (host may be overcommitted across processes)",
                    t0.elapsed()
                );
                let _ = self.with_lock(|| self.add_my_locked(mib));
                return mib;
            }
            // No cross-process wake-up mechanism exists, so poll.
            std::thread::sleep(Duration::from_millis(200));
        }
    }

    /// Record `mib` for an already-running worker without any fit check.
    /// Always returns `mib`; if the lock can't be taken nothing is
    /// recorded.
    fn charge(&self, mib: u64) -> u64 {
        let _ = self.with_lock(|| self.add_my_locked(mib));
        mib
    }

    /// Return `mib` from this process's contribution (saturating at 0).
    /// If the lock can't be taken nothing is changed.
    fn release(&self, mib: u64) {
        let _ = self.with_lock(|| {
            let mine = *self.my_committed.lock().unwrap();
            self.write_my_locked(mine.saturating_sub(mib));
        });
    }

    /// `(host_wide_committed_mib, this_process_committed_mib)`.
    /// The host-wide figure is reported as 0 if the lock can't be taken.
    fn snapshot(&self) -> (u64, u64) {
        let host = self.with_lock(|| self.host_committed_locked()).unwrap_or(0);
        let mine = *self.my_committed.lock().unwrap();
        (host, mine)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::sync::Arc;

    /// `MemTotal` is picked out of a meminfo blob; a blob without it
    /// yields `None`.
    #[test]
    fn parses_proc_meminfo_total() {
        let cases = [
            (
                "MemTotal:       32791528 kB\nMemFree:    1234 kB\nMemAvailable: 9 kB\n",
                Some(32791528),
            ),
            ("Buffers: 1 kB\n", None),
        ];
        for (meminfo, expected) in cases {
            assert_eq!(parse_meminfo_total_kb(meminfo), expected);
        }
    }

    /// `VmRSS` is picked out of a status blob (tab/space padded); a blob
    /// without it yields `None`.
    #[test]
    fn parses_proc_status_vmrss() {
        let cases = [
            (
                "Name:\tnginx\nVmPeak:\t  720896 kB\nVmRSS:\t   52428 kB\nThreads: 2\n",
                Some(52428),
            ),
            ("Name:\tx\n", None),
        ];
        for (status, expected) in cases {
            assert_eq!(parse_status_vmrss_kb(status), expected);
        }
    }

    /// The `some avg10` value is parsed from PSI text and graded onto the
    /// 1/2/4 scale at the 5% and 20% thresholds.
    #[test]
    fn parses_and_grades_psi_some_avg10() {
        let psi = "some avg10=12.34 avg60=4.00 avg300=1.00 total=999\n\
                   full avg10=2.00 avg60=1.00 avg300=0.50 total=10\n";
        assert_eq!(parse_psi_some_avg10(psi), Some(12.34));

        // Grading: <5 normal(1), 5..20 warn(2), >=20 critical(4).
        let grades = [
            (0.0, 1),
            (4.99, 1),
            (5.0, 2),
            (19.99, 2),
            (20.0, 4),
            (80.0, 4),
        ];
        for (avg10, level) in grades {
            assert_eq!(psi_avg10_to_level(avg10), level);
        }

        // Missing/garbage PSI parses to None (caller defaults to level 1).
        assert_eq!(parse_psi_some_avg10("garbage\n"), None);
    }

    /// Linux-only smoke test: the live /proc probes report real numbers
    /// (parity with the macOS proc_pid_rusage / pressure sysctls) rather
    /// than the old stub values (0 / None / 1).
    #[cfg(target_os = "linux")]
    #[test]
    fn linux_proc_probes_return_real_values() {
        let host_mib = host_ram_mib();
        assert!(host_mib > 0, "MemTotal should be readable on Linux");

        // The test process itself is resident, so its RSS is non-zero.
        let rss = phys_footprint_mib(std::process::id()).expect("VmRSS for self");
        assert!(rss > 0, "self VmRSS should be > 0 MiB, got {rss}");

        // Whatever the host's state, the level is one of the three valid
        // values (usually 1 on an idle box; absent PSI also gives 1).
        let level = memory_pressure_level();
        assert!(matches!(level, 1 | 2 | 4));
    }

    /// With budget 0 the gate is off: admissions return at once, charge
    /// nothing, and hand back zero-MiB guards.
    #[test]
    fn disabled_gate_is_a_noop() {
        let gate = MemoryAccountant::with_budget_for_test(0);
        let first = admit_on(gate, 100_000);
        let second = admit_on(gate, 100_000);
        // Nothing is charged; both "fit" instantly.
        assert_eq!(gate.snapshot(), (0, 0));
        for guard in [&first, &second] {
            assert_eq!(guard.reserved_mib(), 0);
        }
    }

    /// Admissions add to the committed total and live count; dropping
    /// each guard takes its share back out.
    #[test]
    fn accounting_charges_and_releases() {
        let gate = MemoryAccountant::with_budget_for_test(1000);
        let small = admit_on(gate, 300);
        let large = admit_on(gate, 400);
        assert_eq!(gate.snapshot(), (700, 2));

        drop(small);
        assert_eq!(gate.snapshot(), (400, 1));

        drop(large);
        assert_eq!(gate.snapshot(), (0, 0));
    }

    /// A lone worker larger than the whole budget must still be admitted
    /// (running it alone is the best the gate can do) and fully refunded on
    /// release.
    #[test]
    fn over_budget_single_worker_is_admitted_not_deadlocked() {
        let gate = MemoryAccountant::with_budget_for_test(512);
        {
            // 4 GiB request against a 512 MiB budget: admitted on its own.
            let _solo = admit_on(gate, 4096);
            assert_eq!(gate.snapshot(), (4096, 1));
        }
        // Guard dropped at the end of the scope above.
        assert_eq!(gate.snapshot(), (0, 0));
    }

    /// With the budget full, a further admission must park until a live
    /// guard is dropped, and then go through.
    #[test]
    fn third_spawn_blocks_until_a_release_then_proceeds() {
        // 1000 MiB holds two 400-MiB workers; a third does not fit.
        let gate = MemoryAccountant::with_budget_for_test(1000);
        let first = admit_on(gate, 400);
        let _second = admit_on(gate, 400);
        assert_eq!(gate.snapshot(), (800, 2));

        // Flag flipped to 1 by the background thread once it gets admitted.
        let admitted = Arc::new(AtomicU64::new(0));
        let waiter = {
            let admitted = Arc::clone(&admitted);
            // 800 + 400 > 1000, so this admission has to wait.
            std::thread::spawn(move || {
                let _third = admit_on(gate, 400);
                admitted.store(1, Ordering::SeqCst);
                // Keep the guard alive for a moment so the checks below can
                // observe the admitted state.
                std::thread::sleep(std::time::Duration::from_millis(20));
            })
        };

        // Let the thread reach the wait; it must still be parked while the
        // budget is full.
        std::thread::sleep(std::time::Duration::from_millis(50));
        let while_full = admitted.load(Ordering::SeqCst);
        assert_eq!(while_full, 0, "third spawn must block while full");
        assert_eq!(gate.snapshot().1, 2, "still only two live workers");

        // Freeing one slot lets the parked admission fit and complete.
        drop(first);
        waiter.join().unwrap();
        let after_release = admitted.load(Ordering::SeqCst);
        assert_eq!(after_release, 1, "third spawn admitted after release");
    }

    /// If the budget is full and no release ever arrives, the gate must not
    /// hang: once the (short) timeout elapses it admits anyway and
    /// overcommits. This is the safety net for a pool whose initial size
    /// exceeds the budget.
    #[test]
    fn admits_after_timeout_when_no_release_comes() {
        let timeout = std::time::Duration::from_millis(80);
        let gate = MemoryAccountant::with_budget_and_timeout_for_test(1000, timeout);

        // 800 of 1000 MiB committed and held for the rest of the test.
        let _held = admit_on(gate, 800);

        // A second 800 MiB cannot fit; it should wait out the timeout and
        // then be admitted regardless.
        let started = std::time::Instant::now();
        let late = admit_on(gate, 800);
        let waited = started.elapsed();

        let shortest_acceptable = std::time::Duration::from_millis(70);
        assert!(
            waited >= shortest_acceptable,
            "should have waited ~the timeout before admitting, waited {waited:?}"
        );
        assert_eq!(late.reserved_mib(), 800, "admitted anyway after timeout");

        let totals = gate.snapshot();
        assert_eq!(totals, (1600, 2), "overcommitted (1600 > 1000 budget)");
    }

    // macOS-only: this exercises the Darwin `proc_pid_rusage` FFI path
    // specifically. Linux has its own `/proc`-backed coverage in
    // `linux_proc_probes_return_real_values`, so the probe is no longer a
    // stub there.
    #[cfg(target_os = "macos")]
    #[test]
    fn phys_footprint_of_self_is_measurable() {
        // Validates the proc_pid_rusage FFI end-to-end: our own running
        // test process must have a measurable, non-zero footprint. If this
        // returned None/0 the whole rss-charge feature would be inert.
        let mib = phys_footprint_mib(std::process::id())
            .expect("proc_pid_rusage must succeed for our own pid");
        assert!(mib > 0, "our own phys_footprint should be > 0 MiB");

        // The pressure level must be one of the three defined grades
        // (1 normal, 2 warn, 4 critical). This is the same check the Linux
        // probe test applies, so both platforms pin the same contract.
        assert!(
            matches!(memory_pressure_level(), 1 | 2 | 4),
            "memory pressure level must be one of 1/2/4"
        );
    }

    /// Sanity guard only: keeps anyone from "fixing" the per-worker overhead
    /// to a value so large that an ordinary 512-MiB VM could no longer
    /// coexist with others on a typical budget.
    #[test]
    fn worker_overhead_is_modest() {
        const CEILING_MIB: u64 = 256;
        assert!(
            WORKER_OVERHEAD_MIB <= CEILING_MIB,
            "overhead should stay a small per-worker add"
        );
    }

    /// Exercises the cross-process coordinator within one process: reserve,
    /// reclaim of a reservation file left by a dead PID, and release down to
    /// zero (which removes our own pid file).
    #[test]
    fn host_coord_reserves_releases_and_reclaims_stale_pids() {
        // Use the shared helper rather than a private copy of the naming
        // scheme; with the "test" tag it yields the same
        // `sm-memadm-test-<pid>-<seq>` layout this test always used.
        let dir = unique_coord_dir("test");
        let coord = HostCoord::new(dir.clone()).expect("host coord");
        let budget = 1000u64;
        let timeout = Duration::from_millis(200);

        // Reserve 400 → host-wide and our own contribution both 400.
        assert_eq!(coord.reserve_blocking(400, budget, timeout), 400);
        assert_eq!(coord.snapshot(), (400, 400));

        // Plant a STALE reservation from a dead PID (i32::MAX is never a
        // live process). If it counted, 400 + 500 + 200 > 1000 would block
        // the next reserve; instead it must be reclaimed so 400 + 200 fits.
        let stale_path = dir.join(i32::MAX.to_string());
        std::fs::write(&stale_path, "500").unwrap();
        assert_eq!(coord.reserve_blocking(200, budget, timeout), 200);
        assert_eq!(
            coord.snapshot(),
            (600, 600),
            "stale entry reclaimed, not counted"
        );
        assert!(
            !stale_path.exists(),
            "dead-PID reservation file must be reclaimed"
        );

        // Release everything; our pid file is removed at zero.
        coord.release(600);
        assert_eq!(coord.snapshot(), (0, 0));
        assert!(
            !coord.my_path.exists(),
            "our pid file removed when contribution hits 0"
        );

        // Best-effort cleanup of the scratch directory.
        let _ = std::fs::remove_dir_all(&dir);
    }

    /// Builds a fresh scratch path under the system temp directory, named
    /// `sm-memadm-<tag>-<pid>-<seq>`. The per-process sequence number makes
    /// every call return a distinct path; the directory itself is not created
    /// here.
    fn unique_coord_dir(tag: &str) -> PathBuf {
        use std::sync::atomic::{AtomicU64, Ordering};
        static NEXT_SEQ: AtomicU64 = AtomicU64::new(0);

        let pid = std::process::id();
        let seq = NEXT_SEQ.fetch_add(1, Ordering::Relaxed);

        let mut path = std::env::temp_dir();
        path.push(format!("sm-memadm-{tag}-{pid}-{seq}"));
        path
    }

    /// A reservation file owned by a LIVE peer process counts toward the
    /// host-wide total (forcing an over-budget reserve to wait out its
    /// timeout), and is reclaimed once that peer has died.
    #[test]
    fn host_coord_counts_other_live_process_then_reclaims_when_it_dies() {
        /// Kills and reaps the wrapped child on drop, so a failed assertion
        /// (which unwinds past the explicit kill below) cannot leave a stray
        /// `sleep` process behind.
        struct ReapOnDrop(std::process::Child);
        impl Drop for ReapOnDrop {
            fn drop(&mut self) {
                // Best effort: on the happy path the child is already dead
                // and reaped, so both results are ignored.
                let _ = self.0.kill();
                let _ = self.0.wait();
            }
        }

        let dir = unique_coord_dir("xproc");
        let coord = HostCoord::new(dir.clone()).expect("coord");
        let budget = 1000u64;

        // A real, live "other process" holding a 700-MiB reservation.
        let mut peer = ReapOnDrop(
            std::process::Command::new("sleep")
                .arg("30")
                .spawn()
                .expect("spawn child"),
        );
        let peer_path = dir.join(peer.0.id().to_string());
        std::fs::write(&peer_path, "700").unwrap();

        // 700 (other) + 400 (us) = 1100 > 1000 → won't fit. With a short
        // timeout the coordinator must ADMIT ANYWAY (forward progress beats
        // a cross-process hang) while still recording our charge.
        let t0 = Instant::now();
        assert_eq!(
            coord.reserve_blocking(400, budget, Duration::from_millis(60)),
            400
        );
        assert!(
            t0.elapsed() >= Duration::from_millis(50),
            "should have waited out the timeout before admitting"
        );
        assert_eq!(
            coord.snapshot(),
            (1100, 400),
            "a live peer's reservation is counted in the host-wide total"
        );

        // Kill the peer; its now-dead reservation is reclaimed on the next
        // locked sweep, freeing the host-wide budget.
        peer.0.kill().ok();
        peer.0.wait().ok();
        assert_eq!(
            coord.snapshot(),
            (400, 400),
            "dead peer's reservation reclaimed"
        );
        assert!(!peer_path.exists(), "stale peer file unlinked");

        coord.release(400);
        // Best-effort cleanup of the scratch directory.
        let _ = std::fs::remove_dir_all(&dir);
    }

    /// A partial release leaves the remainder reserved (and the pid file in
    /// place); releasing more than is held clamps to zero and removes the
    /// file.
    #[test]
    fn host_coord_partial_release_and_saturates_at_zero() {
        let scratch = unique_coord_dir("release");
        let coord = HostCoord::new(scratch.clone()).expect("coord");

        // Plenty of budget, so this reserve is granted straight away.
        let granted = coord.reserve_blocking(600, 10_000, Duration::from_millis(50));
        assert_eq!(granted, 600);

        coord.release(200);
        let after_partial = coord.snapshot();
        assert_eq!(
            after_partial,
            (400, 400),
            "partial release leaves the remainder"
        );
        assert!(coord.my_path.exists(), "pid file persists while > 0");

        // Releasing far more than we hold must saturate at 0 rather than
        // underflow, and the pid file goes away with the last MiB.
        coord.release(99_999);
        let after_over_release = coord.snapshot();
        assert_eq!(after_over_release, (0, 0));
        assert!(!coord.my_path.exists(), "pid file removed at zero");

        std::fs::remove_dir_all(&scratch).ok();
    }

    /// Reserving up to exactly the budget is allowed (the bound is
    /// inclusive); anything beyond it waits for the timeout and is then
    /// admitted anyway.
    #[test]
    fn host_coord_budget_boundary_inclusive_then_admits_over() {
        let scratch = unique_coord_dir("boundary");
        let coord = HostCoord::new(scratch.clone()).expect("coord");
        let budget: u64 = 1000;
        let short_wait = Duration::from_millis(50);

        // Two reserves fill the budget; the second lands exactly on it and
        // must still be granted (the check is `host + mib <= budget`).
        let first = coord.reserve_blocking(600, budget, short_wait);
        assert_eq!(first, 600);
        let second = coord.reserve_blocking(400, budget, short_wait);
        assert_eq!(
            second, 400,
            "reserving up to exactly the budget is inclusive"
        );
        assert_eq!(coord.snapshot(), (1000, 1000));

        // One more MiB is over budget → waits, then admits anyway.
        let started = Instant::now();
        let over = coord.reserve_blocking(1, budget, Duration::from_millis(60));
        assert_eq!(over, 1);
        let waited = started.elapsed();
        assert!(
            waited >= Duration::from_millis(50),
            "over-budget reserve must wait out the timeout"
        );

        // Hand back everything reserved above (600 + 400 + 1).
        coord.release(1001);
        std::fs::remove_dir_all(&scratch).ok();
    }
}