cellos-host-firecracker 0.5.0

//! Pre-booted Firecracker VM pool — L2-06-2.
//!
//! Cold-booting a Firecracker microVM costs ~125 ms wall-clock on warm hosts
//! (kernel decompression + init + cellos-init handshake). For agentic
//! workloads that spawn many short-lived cells, that overhead dominates the
//! workload's actual runtime. The remedy is the snapshot-restore path:
//! pre-boot a VM to a known state (kernel up, vsock + virtio devices wired,
//! cellos-init parked waiting for a `cellos.argv` cmdline), take a snapshot,
//! and restore from the snapshot at cell-create time. Restore is ~10 ms.
//!
//! This module implements the *pool state machine* and the *fill* API. The
//! integration into [`FirecrackerCellBackend::create`] is gated behind the
//! `CELLOS_FIRECRACKER_POOL_SIZE` environment variable — default `0` means
//! the pool is disabled and `create` follows the cold-boot path verbatim.
//! When `>0`, a future commit wires `checkout` into `create` ahead of
//! `configure_vm` and `checkin` into `destroy`.
//!
//! # Why a skeleton?
//!
//! The full snapshot path needs:
//!   * a Firecracker child managed by [`tokio::process::Child`] long enough to
//!     accept `PUT /snapshot/create` and then exit cleanly;
//!   * disk space accounting for memory snapshots (the `--mem-file-path` blob
//!     is the same size as the VM's RAM allocation);
//!   * a separate restore code path inside `create` that calls
//!     `PUT /snapshot/load` instead of `PUT /machine-config` + `PUT /boot-source`.
//!
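//! This file pins the contract — the state machine, the `checkout`/`checkin`
//! API shape, and the gating env var — so the wiring change in the live
//! `create` path is a mechanical follow-up rather than a redesign. It also
//! already carries first cuts of two of the pieces above: the fill-side VMM
//! lifecycle (`fill_one_slot`, ending in `PUT /snapshot/create`) and the
//! `PUT /snapshot/load` helper (`restore_into`). Disk space accounting and
//! whatever wiring remains land in subsequent L2-06 commits.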
//!
//! # State machine
//!
//! Each slot transitions:
//!
//! ```text
//!   Empty ──fill()──▶ Available ──checkout()──▶ InUse ──checkin()──▶ Empty
//! ```
//!
//! `checkin` returns the slot to `Empty` (not `Available`) by design: a VM
//! that ran a cell is no longer at the parked-init snapshot state, so it
//! cannot be re-used without re-snapshotting from a fresh boot. A later
//! background filler re-populates the slot. This is the same lifecycle
//! AWS Lambda uses for warm-pool execution environments.

use std::path::PathBuf;

#[cfg(target_os = "linux")]
use std::time::Duration;

#[cfg(target_os = "linux")]
use crate::api_client::{
    BootSource, Drive, FirecrackerApiClient, InstanceAction, InstanceActionType, MachineConfig,
    MemBackend, MemBackendType, SnapshotCreate, SnapshotLoad, SnapshotType, VmState, VmStatePatch,
};
#[cfg(target_os = "linux")]
use cellos_core::CellosError;

/// Name of the environment variable that sizes the warm pool.
///
/// [`pool_size_from_env`] reads this variable, trims surrounding whitespace
/// and parses it as a `usize`. Unset, empty, or unparseable values (including
/// negative numbers) all resolve to `0`, which means "pool disabled"; any
/// positive integer is the requested number of slots.
pub const POOL_SIZE_ENV: &str = "CELLOS_FIRECRACKER_POOL_SIZE";

/// State of one slot in the warm pool.
///
/// Slots move `Empty → Available → InUse → Empty`; see the module-level
/// state-machine diagram. Only `Available` carries on-disk artifact paths —
/// once a slot is checked out the paths are no longer tracked by the pool.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PoolSlot {
    /// Slot has a snapshot on disk and is ready to be checked out.
    Available {
        /// Filesystem path to the Firecracker VM state file written by
        /// `PUT /snapshot/create`. `PUT /snapshot/load` consumes this as
        /// `snapshot_path` at restore time. This is the only field that
        /// `FirecrackerPool::checkout` hands back to the caller.
        snapshot_path: PathBuf,
        /// Filesystem path to the paired memory dump (`mem_file_path` in the
        /// `SnapshotCreate` body). Required at restore time via the
        /// `mem_backend.backend_path` field on `PUT /snapshot/load`.
        mem_file_path: PathBuf,
        /// Stable identifier for the pre-booted VM (used in logs / metrics).
        vm_id: String,
    },
    /// Slot has been handed to an active cell — not eligible for checkout.
    InUse {
        /// Cell id currently holding this slot. Used by `checkin` to validate
        /// the caller is releasing the slot they actually checked out.
        cell_id: String,
    },
    /// Slot has no snapshot yet (initial state, or post-`checkin` awaiting
    /// background re-fill).
    Empty,
}

/// Pre-booted Firecracker VM pool for fast cell startup.
///
/// Each slot is a VM that has booted to the kernel's init stage and been
/// snapshot'd — ready to restore in ~10 ms vs cold-boot ~125 ms.
///
/// **Thread-safety:** the pool is currently `&mut self`-driven for clarity.
/// The wiring inside [`FirecrackerCellBackend`] will wrap it in
/// `tokio::sync::Mutex<FirecrackerPool>` (same pattern as `running_vms`) so
/// concurrent `create` / `destroy` calls serialize on slot allocation.
///
/// `Debug` is derived so the pool (slot states, snapshot paths, holder cell
/// ids) can be dumped in logs and test failure output.
///
/// [`FirecrackerCellBackend`]: crate::FirecrackerCellBackend
#[derive(Debug)]
pub struct FirecrackerPool {
    /// Number of slots requested at construction; reported by `size()`.
    size: usize,
    /// Per-slot state, one entry per configured slot.
    slots: Vec<PoolSlot>,
}

impl FirecrackerPool {
    /// Construct an empty pool with `size` slots, all in [`PoolSlot::Empty`].
    /// `size==0` is valid and yields a pool whose `checkout` always returns
    /// `None` — the wiring code uses this to short-circuit when the env var
    /// is unset or zero.
    pub fn new(size: usize) -> Self {
        Self {
            size,
            slots: (0..size).map(|_| PoolSlot::Empty).collect(),
        }
    }

    /// Number of slots configured for this pool (any state).
    pub fn size(&self) -> usize {
        self.size
    }

    /// Number of [`PoolSlot::Available`] slots — the number of cells that can
    /// be served by the fast-path right now.
    pub fn available(&self) -> usize {
        self.slots
            .iter()
            .filter(|s| matches!(s, PoolSlot::Available { .. }))
            .count()
    }

    /// Number of [`PoolSlot::InUse`] slots.
    pub fn in_use(&self) -> usize {
        self.slots
            .iter()
            .filter(|s| matches!(s, PoolSlot::InUse { .. }))
            .count()
    }

    /// Reserve an available snapshot for `cell_id`, transitioning the slot
    /// from `Available` to `InUse`. Returns the snapshot path on success, or
    /// `None` if no `Available` slot exists (caller falls back to cold-boot).
    ///
    /// Marked `async` for symmetry with the future implementation that will
    /// hold a `tokio::sync::Mutex`. The body is currently synchronous.
    pub async fn checkout(&mut self, cell_id: &str) -> Option<PathBuf> {
        for slot in self.slots.iter_mut() {
            if let PoolSlot::Available { snapshot_path, .. } = slot {
                let path = snapshot_path.clone();
                *slot = PoolSlot::InUse {
                    cell_id: cell_id.to_string(),
                };
                return Some(path);
            }
        }
        None
    }

    /// Release the slot previously checked out by `cell_id`, transitioning it
    /// to [`PoolSlot::Empty`]. A background filler is expected to re-populate
    /// the slot via [`Self::fill`]; this is intentional — a VM that ran a
    /// real cell is no longer at the parked-init state, so re-using its
    /// snapshot would leak workload-side state into the next cell.
    ///
    /// Returns `true` if a matching `InUse { cell_id }` slot was found and
    /// reset, `false` otherwise (call was a no-op).
    pub async fn checkin(&mut self, cell_id: &str) -> bool {
        for slot in self.slots.iter_mut() {
            if let PoolSlot::InUse { cell_id: held } = slot {
                if held == cell_id {
                    *slot = PoolSlot::Empty;
                    return true;
                }
            }
        }
        false
    }

    /// Boot one VM per `Empty` slot, snapshot it, and transition the slot to
    /// [`PoolSlot::Available`]. No-op for slots already in `Available` or
    /// `InUse`.
    ///
    /// On Linux (the only platform Firecracker runs on) this spawns one VMM
    /// per empty slot, drives the configure → InstanceStart → wait-for-init
    /// → PATCH-Paused → PUT-snapshot/create sequence, then kills the child
    /// process. The pair of `(snapshot_path, mem_file_path)` files left
    /// behind on disk is the durable artifact a future `checkout` will load.
    ///
    /// Off-Linux this is a no-op — Firecracker is not available, so the
    /// pool stays empty and `checkout` returns `None`, falling
    /// `FirecrackerCellBackend::create` through to its cold-boot path.
    ///
    /// Failures during fill are logged and the slot is left `Empty` (so a
    /// subsequent fill can retry); we don't propagate errors out of `fill`
    /// because the pool is a best-effort latency optimisation, not a
    /// correctness gate.
    #[cfg(target_os = "linux")]
    pub async fn fill(&mut self, firecracker_bin: &str, kernel: &str, rootfs: &str) {
        for (idx, slot) in self.slots.iter_mut().enumerate() {
            if !matches!(slot, PoolSlot::Empty) {
                continue;
            }
            match fill_one_slot(firecracker_bin, kernel, rootfs, idx).await {
                Ok((snapshot_path, mem_file_path, vm_id)) => {
                    tracing::info!(
                        slot = idx,
                        snapshot = %snapshot_path.display(),
                        mem = %mem_file_path.display(),
                        "warm pool slot filled"
                    );
                    *slot = PoolSlot::Available {
                        snapshot_path,
                        mem_file_path,
                        vm_id,
                    };
                }
                Err(e) => {
                    tracing::warn!(slot = idx, error = %e, "warm pool fill failed; slot stays Empty");
                }
            }
        }
    }

    /// Off-Linux stub — Firecracker only runs on Linux/KVM.
    #[cfg(not(target_os = "linux"))]
    pub async fn fill(&mut self, _firecracker_bin: &str, _kernel: &str, _rootfs: &str) {
        tracing::debug!(
            pool_size = self.size,
            "FirecrackerPool::fill no-op: target_os != linux"
        );
    }
}

/// Load a previously captured snapshot into a Firecracker VMM by issuing a
/// single `PUT /snapshot/load` through `client`.
///
/// The caller owns the VMM process and its API socket; this helper only
/// sends the request. The request names `snapshot_path` as the VM state
/// file, uses `mem_file_path` as a `File`-type memory backend, disables diff
/// snapshots, and sets `resume_vm` so the load request itself asks for the
/// VM to be resumed.
///
/// Both paths are converted with `to_string_lossy`, so non-UTF-8 bytes are
/// replaced rather than rejected.
///
/// # Errors
///
/// Propagates any transport error from the API client, and returns
/// `CellosError::Host` when Firecracker answers with a non-success HTTP
/// status.
///
/// Compiled only for `target_os = "linux"`, like the rest of the
/// Firecracker-facing code in this file.
#[cfg(target_os = "linux")]
pub async fn restore_into(
    client: &FirecrackerApiClient,
    snapshot_path: &std::path::Path,
    mem_file_path: &std::path::Path,
) -> Result<(), CellosError> {
    let mem_backend = MemBackend {
        backend_type: MemBackendType::File,
        backend_path: mem_file_path.to_string_lossy().into_owned(),
    };
    let request = SnapshotLoad {
        snapshot_path: snapshot_path.to_string_lossy().into_owned(),
        mem_backend,
        enable_diff_snapshots: false,
        resume_vm: true,
    };

    let status = client.put("/snapshot/load", &request).await?;
    if status.is_success() {
        Ok(())
    } else {
        Err(CellosError::Host(format!(
            "firecracker /snapshot/load returned HTTP {status}"
        )))
    }
}

/// Boot one Firecracker VMM, snapshot it, kill the child, return the on-disk
/// paths plus a stable vm-id. Linux-only.
///
/// Returns `(snapshot_path, mem_file_path, vm_id)` on success.
///
/// Path discipline: snapshot files land at
/// `/tmp/cellos-pool-<vm_id>.snap` (state) and `/tmp/cellos-pool-<vm_id>.mem`
/// (memory dump). The VMM API socket lives at
/// `/tmp/cellos-pool-<vm_id>.socket`. We `remove_file` the socket on
/// teardown so re-fills don't `EEXIST` on `bind`.
///
/// # Errors
///
/// Returns `CellosError::Host` when the VMM cannot be spawned or when any
/// API step answers with a non-success HTTP status, and propagates transport
/// errors from the API client. On every error path the child is killed and
/// reaped, the socket is removed, and any partially written `.snap` / `.mem`
/// file is deleted (best-effort). Each attempt uses a fresh UUID-based
/// `vm_id`, so without that cleanup failed retries would pile up orphaned
/// memory dumps in `/tmp`.
#[cfg(target_os = "linux")]
async fn fill_one_slot(
    firecracker_bin: &str,
    kernel: &str,
    rootfs: &str,
    slot_idx: usize,
) -> Result<(PathBuf, PathBuf, String), CellosError> {
    use tokio::time::sleep;
    use uuid::Uuid;

    let vm_id = format!("pool-{}-{}", slot_idx, Uuid::new_v4().simple());
    let socket_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.socket"));
    let snapshot_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.snap"));
    let mem_file_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.mem"));

    // Stale socket from a crashed previous run would make Firecracker fail
    // to `bind()`. Best-effort remove (ignore NotFound).
    let _ = std::fs::remove_file(&socket_path);

    // Spawn the VMM. Same direct-invocation shape as `build_direct_argv`
    // in lib.rs — no jailer because the warm pool's VM never runs workload
    // code; it boots cellos-init, gets snapshotted, and dies. The chroot
    // boundary is therefore not load-bearing for the fill path.
    let socket_str = socket_path.to_string_lossy().into_owned();
    let mut child = tokio::process::Command::new(firecracker_bin)
        .args(["--api-sock", socket_str.as_str(), "--level", "Error"])
        // Safety net: if this future is dropped mid-fill the VMM still dies.
        .kill_on_drop(true)
        .spawn()
        .map_err(|e| CellosError::Host(format!("spawn firecracker for pool fill: {e}")))?;

    // From here on, any error path must kill the child + clean up sockets
    // and snapshot artifacts before surfacing. The API sequence is wrapped
    // in an async block so that `?` inside it does not skip the teardown
    // that follows.
    let fill = async {
        let client = FirecrackerApiClient::new(&socket_path);
        client.wait_for_ready().await?;

        // Minimal machine config — pool VMs are stamped out from a single
        // snapshot, so we use a small static footprint. The supervisor's
        // hot path can still attach a larger scratch image at restore-time
        // via a subsequent `PUT /drives/...`.
        let mc = client
            .put(
                "/machine-config",
                &MachineConfig {
                    vcpu_count: 1,
                    mem_size_mib: 128,
                    track_dirty_pages: false,
                },
            )
            .await?;
        if !mc.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker /machine-config returned HTTP {mc}"
            )));
        }

        let bs = client
            .put(
                "/boot-source",
                &BootSource {
                    kernel_image_path: kernel.to_string(),
                    // `reboot=k panic=1` is the standard Firecracker pair —
                    // we never expect to reboot, but if the kernel panics
                    // during snapshot prep we want a clean exit rather than
                    // a hung VMM.
                    boot_args: Some("console=ttyS0 reboot=k panic=1 pci=off nomodules".to_string()),
                },
            )
            .await?;
        if !bs.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker /boot-source returned HTTP {bs}"
            )));
        }

        // Root filesystem is attached read-only.
        let drv = client
            .put(
                "/drives/rootfs",
                &Drive {
                    drive_id: "rootfs".into(),
                    path_on_host: rootfs.to_string(),
                    is_root_device: true,
                    is_read_only: true,
                },
            )
            .await?;
        if !drv.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker /drives/rootfs returned HTTP {drv}"
            )));
        }

        let start = client
            .put(
                "/actions",
                &InstanceAction {
                    action_type: InstanceActionType::InstanceStart,
                },
            )
            .await?;
        if !start.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker InstanceStart returned HTTP {start}"
            )));
        }

        // Wait for cellos-init to reach the parked state. The robust
        // signal is a vsock readiness ping (see the lib.rs `boot_result`
        // block's exit-code listener), but for the warm-pool path we don't
        // yet require an init-side vsock dialog — the kernel-mode handoff
        // to userspace is what we want to capture in the snapshot, not the
        // full init handshake. A short fixed wait gives Firecracker enough
        // wall time to bring up the vCPU and reach the parked userspace
        // before we pause.
        // NOTE(review): this is a fixed 500 ms delay, not a readiness check;
        // an earlier comment likened it to AWS Lambda's pre-warm timing —
        // unverified, confirm before relying on it.
        sleep(Duration::from_millis(500)).await;

        // Pause the VM before snapshotting — Firecracker refuses to
        // snapshot a Running VM.
        let pause = client
            .patch(
                "/vm",
                &VmStatePatch {
                    state: VmState::Paused,
                },
            )
            .await?;
        if !pause.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker PATCH /vm Paused returned HTTP {pause}"
            )));
        }

        let snap = client
            .put(
                "/snapshot/create",
                &SnapshotCreate {
                    snapshot_type: SnapshotType::Full,
                    snapshot_path: snapshot_path.to_string_lossy().into_owned(),
                    mem_file_path: mem_file_path.to_string_lossy().into_owned(),
                },
            )
            .await?;
        if !snap.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker /snapshot/create returned HTTP {snap}"
            )));
        }

        Ok::<(), CellosError>(())
    };

    let result = fill.await;

    // Tear down the source VMM. The snapshot is the durable artifact; the
    // original Running-then-Paused process is no longer needed. `kill()`
    // sends SIGKILL; we then `wait()` so we don't leave a zombie.
    let _ = child.kill().await;
    let _ = child.wait().await;
    let _ = std::fs::remove_file(&socket_path);

    // A failed fill must not leave snapshot artifacts behind: the caller
    // never learns these paths (the slot stays `Empty`), so nothing else
    // would ever delete them. Done after the VMM is dead so it cannot still
    // be writing. Best-effort — the files may never have been created.
    if result.is_err() {
        let _ = std::fs::remove_file(&snapshot_path);
        let _ = std::fs::remove_file(&mem_file_path);
    }

    result.map(|()| (snapshot_path, mem_file_path, vm_id))
}

/// Resolve the configured warm-pool size from [`POOL_SIZE_ENV`].
///
/// The variable's value is trimmed and parsed as a `usize`. Anything that
/// does not yield a number — variable unset, not valid Unicode, empty,
/// negative, or otherwise unparseable — resolves to `0`, i.e. the pool stays
/// disabled (the fail-closed default). A non-zero result enables the pool.
pub fn pool_size_from_env() -> usize {
    match std::env::var(POOL_SIZE_ENV) {
        Ok(raw) => raw.trim().parse::<usize>().unwrap_or(0),
        Err(_) => 0,
    }
}

// ── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    // Unit tests for the pool state machine. Slots are placed by hand through
    // the private `slots` field so the transitions can be exercised without
    // spawning a real Firecracker process.
    use super::*;

    /// A fresh pool of size N has N `Empty` slots, zero `Available`, zero
    /// `InUse`. `checkout` returns `None` because nothing has been filled.
    #[tokio::test]
    async fn new_pool_starts_empty() {
        let mut pool = FirecrackerPool::new(3);
        assert_eq!(pool.size(), 3);
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 0);
        // checkout on an empty pool must yield None — the gate that lets
        // create() fall through to cold-boot.
        assert!(pool.checkout("cell-1").await.is_none());
    }

    /// `size=0` is the disabled-pool sentinel: `checkout` always yields
    /// `None`, `checkin` always yields `false`, no panics.
    #[tokio::test]
    async fn zero_size_pool_is_inert() {
        let mut pool = FirecrackerPool::new(0);
        assert_eq!(pool.size(), 0);
        assert!(pool.checkout("any-cell").await.is_none());
        assert!(!pool.checkin("any-cell").await);
    }

    /// State machine: an `Available` slot can be checked out (→ `InUse`),
    /// then checked in (→ `Empty`). Two cells trying to checkout from a
    /// one-slot pool: first wins, second gets `None`.
    #[tokio::test]
    async fn checkout_then_checkin_cycles_slot_through_states() {
        let mut pool = FirecrackerPool::new(1);
        // Hand-place an Available slot so we can exercise checkout without
        // running fill(), which on Linux would need a real firecracker
        // binary and off-Linux is a no-op.
        pool.slots[0] = PoolSlot::Available {
            snapshot_path: PathBuf::from("/tmp/snap-1"),
            mem_file_path: PathBuf::from("/tmp/snap-1.mem"),
            vm_id: "vm-1".to_string(),
        };
        assert_eq!(pool.available(), 1);

        let path = pool.checkout("cell-1").await;
        assert_eq!(path, Some(PathBuf::from("/tmp/snap-1")));
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 1);

        // Second checkout finds no Available slot (the only slot is InUse)
        // and returns None — the cold-boot fallback signal.
        assert!(pool.checkout("cell-2").await.is_none());

        // Checkin by the holding cell_id transitions the slot to Empty.
        assert!(pool.checkin("cell-1").await);
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 0);

        // Re-checkin is a no-op (returns false).
        assert!(!pool.checkin("cell-1").await);
    }

    /// `checkin` with a non-matching `cell_id` is a no-op. This protects
    /// against a stale destroy from another cell accidentally releasing
    /// someone else's slot.
    #[tokio::test]
    async fn checkin_wrong_cell_id_is_noop() {
        let mut pool = FirecrackerPool::new(1);
        pool.slots[0] = PoolSlot::InUse {
            cell_id: "real-cell".to_string(),
        };
        assert!(!pool.checkin("imposter-cell").await);
        // Slot still InUse with the real cell.
        assert_eq!(pool.in_use(), 1);
        // The real cell can still check in.
        assert!(pool.checkin("real-cell").await);
        assert_eq!(pool.in_use(), 0);
    }

    /// `fill` against a non-existent firecracker binary path is a soft
    /// failure: the spawn fails, the slot stays `Empty`, and the call does
    /// not propagate an error (the pool is best-effort latency optimisation,
    /// not a correctness gate). Off-Linux `fill` is a documented no-op so
    /// the assertion is the same on every platform.
    #[tokio::test]
    async fn fill_with_missing_binary_leaves_slots_empty() {
        let mut pool = FirecrackerPool::new(2);
        pool.fill(
            "/nonexistent/firecracker",
            "/nonexistent/vmlinux",
            "/nonexistent/rootfs.ext4",
        )
        .await;
        // Either Linux-spawn-failure or off-Linux-noop leaves the slots Empty.
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 0);
        assert_eq!(
            pool.slots
                .iter()
                .filter(|s| matches!(s, PoolSlot::Empty))
                .count(),
            2
        );
    }

    /// State-machine cycle test: hand-place two `Available` slots (one per
    /// snapshot pair on disk would be the production path; here we skip the
    /// firecracker spawn and pin the transition matrix directly). Drive
    /// `checkout` twice and confirm both succeed, the third returns `None`,
    /// then `checkin` cycles both back to `Empty` exactly once each.
    #[tokio::test]
    async fn checkout_checkin_cycle_two_slots() {
        let mut pool = FirecrackerPool::new(2);
        pool.slots[0] = PoolSlot::Available {
            snapshot_path: PathBuf::from("/tmp/snap-a"),
            mem_file_path: PathBuf::from("/tmp/snap-a.mem"),
            vm_id: "vm-a".into(),
        };
        pool.slots[1] = PoolSlot::Available {
            snapshot_path: PathBuf::from("/tmp/snap-b"),
            mem_file_path: PathBuf::from("/tmp/snap-b.mem"),
            vm_id: "vm-b".into(),
        };
        assert_eq!(pool.available(), 2);

        let p1 = pool.checkout("cell-1").await.expect("first checkout");
        let p2 = pool.checkout("cell-2").await.expect("second checkout");
        assert_ne!(p1, p2, "each cell got a distinct snapshot path");
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 2);

        // Third checkout from a fully in-use pool is the cold-boot signal.
        assert!(pool.checkout("cell-3").await.is_none());

        assert!(pool.checkin("cell-1").await);
        assert!(pool.checkin("cell-2").await);
        assert_eq!(pool.in_use(), 0);
        // checkin transitions to Empty (not Available) — the next fill()
        // re-populates from a fresh boot, because a VM that ran a workload
        // is no longer at the parked-init state.
        assert_eq!(
            pool.slots
                .iter()
                .filter(|s| matches!(s, PoolSlot::Empty))
                .count(),
            2
        );

        // Repeated checkin is a no-op (no slot in InUse matches).
        assert!(!pool.checkin("cell-1").await);
        assert!(!pool.checkin("cell-2").await);
    }

    /// `Available` slot carries the paired snapshot+mem paths verbatim
    /// through `checkout` — the caller needs the snapshot path to feed
    /// `restore_into`, and on the supervisor side the mem path is paired
    /// with it via the on-disk `<vm_id>.mem` convention. This pins the
    /// "snapshot path round-trips unchanged" contract that the
    /// `FirecrackerCellBackend::create` wiring relies on.
    ///
    /// Note that only the snapshot path is asserted here: `checkout` does
    /// not return the mem path.
    #[tokio::test]
    async fn checkout_returns_snapshot_path_verbatim() {
        let mut pool = FirecrackerPool::new(1);
        pool.slots[0] = PoolSlot::Available {
            snapshot_path: PathBuf::from("/tmp/cellos-pool-X.snap"),
            mem_file_path: PathBuf::from("/tmp/cellos-pool-X.mem"),
            vm_id: "X".into(),
        };
        let got = pool.checkout("cell-X").await;
        assert_eq!(got, Some(PathBuf::from("/tmp/cellos-pool-X.snap")));
        // After checkout the slot is InUse{cell-X}.
        match &pool.slots[0] {
            PoolSlot::InUse { cell_id } => assert_eq!(cell_id, "cell-X"),
            other => panic!("expected InUse after checkout, got {other:?}"),
        }
    }

    /// `pool_size_from_env` returns 0 when the env var is unset. We can't
    /// reliably test the *set* path here (env mutation is racy across tests
    /// in the same process), but pinning the unset default is the gate that
    /// matters: if the env reader regressed to a non-zero default the warm
    /// pool would activate accidentally and changes in `create()` would take
    /// a different code path than expected.
    #[test]
    fn pool_size_from_env_defaults_to_zero_when_unset() {
        // Best-effort: only assert when the var is genuinely unset in this
        // test process. If a parallel test set it, skip — we'd rather skip
        // than be flaky.
        if std::env::var(POOL_SIZE_ENV).is_err() {
            assert_eq!(pool_size_from_env(), 0);
        }
    }
}