Skip to main content

cellos_host_firecracker/
pool.rs

1//! Pre-booted Firecracker VM pool — L2-06-2.
2//!
3//! Cold-booting a Firecracker microVM costs ~125 ms wall-clock on warm hosts
4//! (kernel decompression + init + cellos-init handshake). For agentic
5//! workloads that spawn many short-lived cells, that overhead dominates the
6//! workload's actual runtime. The remedy is the snapshot-restore path:
7//! pre-boot a VM to a known state (kernel up, vsock + virtio devices wired,
8//! cellos-init parked waiting for a `cellos.argv` cmdline), take a snapshot,
9//! and restore from the snapshot at cell-create time. Restore is ~10 ms.
10//!
11//! This module implements the *pool state machine* and the *fill* API. The
12//! integration into [`FirecrackerCellBackend::create`] is gated behind the
13//! `CELLOS_FIRECRACKER_POOL_SIZE` environment variable — default `0` means
14//! the pool is disabled and `create` follows the cold-boot path verbatim.
15//! When `>0`, a future commit wires `checkout` into `create` ahead of
16//! `configure_vm` and `checkin` into `destroy`.
17//!
18//! # Why a skeleton?
19//!
20//! The full snapshot path needs:
21//!   * a Firecracker child managed by [`tokio::process::Child`] long enough to
22//!     accept `PUT /snapshot/create` and then exit cleanly;
23//!   * disk space accounting for memory snapshots (the `--mem-file-path` blob
24//!     is the same size as the VM's RAM allocation);
25//!   * a separate restore code path inside `create` that calls
26//!     `PUT /snapshot/load` instead of `PUT /machine-config` + `PUT /boot-source`.
27//!
28//! All three of those land in subsequent L2-06 commits. This file pins the
29//! contract — the state machine, the `checkout`/`checkin` API shape, and the
30//! gating env var — so the wiring change in the live `create` path is a
31//! mechanical follow-up rather than a redesign.
32//!
33//! # State machine
34//!
35//! Each slot transitions:
36//!
37//! ```text
38//!   Empty ──fill()──▶ Available ──checkout()──▶ InUse ──checkin()──▶ Empty
39//! ```
40//!
41//! `checkin` returns the slot to `Empty` (not `Available`) by design: a VM
42//! that ran a cell is no longer at the parked-init snapshot state, so it
43//! cannot be re-used without re-snapshotting from a fresh boot. A later
44//! background filler re-populates the slot. This is the same lifecycle
45//! AWS Lambda uses for warm-pool execution environments.
46
47use std::path::PathBuf;
48
49#[cfg(target_os = "linux")]
50use std::time::Duration;
51
52#[cfg(target_os = "linux")]
53use crate::api_client::{
54    BootSource, Drive, FirecrackerApiClient, InstanceAction, InstanceActionType, MachineConfig,
55    MemBackend, MemBackendType, SnapshotCreate, SnapshotLoad, SnapshotType, VmState, VmStatePatch,
56};
57#[cfg(target_os = "linux")]
58use cellos_core::CellosError;
59
/// Environment variable that toggles the warm pool. Default `0` (disabled);
/// any positive integer enables the pool with that many slots.
///
/// Read by [`pool_size_from_env`], which also falls back to `0` when the
/// value is unset or does not parse as a `usize`.
pub const POOL_SIZE_ENV: &str = "CELLOS_FIRECRACKER_POOL_SIZE";
63
/// Lifecycle state of a single warm-pool slot.
///
/// Slots move `Empty → Available → InUse → Empty`; see
/// [`FirecrackerPool::checkout`] and [`FirecrackerPool::checkin`] for the
/// transitions driven by callers.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum PoolSlot {
    /// A snapshot pair exists on disk and the slot may be checked out.
    Available {
        /// VM state file produced by `PUT /snapshot/create`; passed back to
        /// Firecracker as `snapshot_path` on `PUT /snapshot/load`.
        snapshot_path: PathBuf,
        /// Memory dump paired with `snapshot_path` (the `mem_file_path` of
        /// the `SnapshotCreate` body); supplied at restore time through
        /// `mem_backend.backend_path`.
        mem_file_path: PathBuf,
        /// Identifier of the pre-booted VM, intended for logs and metrics.
        vm_id: String,
    },
    /// The slot is held by a running cell and cannot be checked out again.
    InUse {
        /// Id of the cell holding the slot; `checkin` only releases the slot
        /// when the caller presents this same id.
        cell_id: String,
    },
    /// No snapshot is attached: either the slot was never filled, or it was
    /// checked in and is waiting for a re-fill.
    Empty,
}
90
91/// Pre-booted Firecracker VM pool for fast cell startup.
92///
93/// Each slot is a VM that has booted to the kernel's init stage and been
94/// snapshot'd — ready to restore in ~10 ms vs cold-boot ~125 ms.
95///
96/// **Thread-safety:** the pool is currently `&mut self`-driven for clarity.
97/// The wiring inside [`FirecrackerCellBackend`] will wrap it in
98/// `tokio::sync::Mutex<FirecrackerPool>` (same pattern as `running_vms`) so
99/// concurrent `create` / `destroy` calls serialize on slot allocation.
100///
101/// [`FirecrackerCellBackend`]: crate::FirecrackerCellBackend
102pub struct FirecrackerPool {
103    size: usize,
104    slots: Vec<PoolSlot>,
105}
106
107impl FirecrackerPool {
108    /// Construct an empty pool with `size` slots, all in [`PoolSlot::Empty`].
109    /// `size==0` is valid and yields a pool whose `checkout` always returns
110    /// `None` — the wiring code uses this to short-circuit when the env var
111    /// is unset or zero.
112    pub fn new(size: usize) -> Self {
113        Self {
114            size,
115            slots: (0..size).map(|_| PoolSlot::Empty).collect(),
116        }
117    }
118
119    /// Number of slots configured for this pool (any state).
120    pub fn size(&self) -> usize {
121        self.size
122    }
123
124    /// Number of [`PoolSlot::Available`] slots — the number of cells that can
125    /// be served by the fast-path right now.
126    pub fn available(&self) -> usize {
127        self.slots
128            .iter()
129            .filter(|s| matches!(s, PoolSlot::Available { .. }))
130            .count()
131    }
132
133    /// Number of [`PoolSlot::InUse`] slots.
134    pub fn in_use(&self) -> usize {
135        self.slots
136            .iter()
137            .filter(|s| matches!(s, PoolSlot::InUse { .. }))
138            .count()
139    }
140
141    /// Reserve an available snapshot for `cell_id`, transitioning the slot
142    /// from `Available` to `InUse`. Returns the snapshot path on success, or
143    /// `None` if no `Available` slot exists (caller falls back to cold-boot).
144    ///
145    /// Marked `async` for symmetry with the future implementation that will
146    /// hold a `tokio::sync::Mutex`. The body is currently synchronous.
147    pub async fn checkout(&mut self, cell_id: &str) -> Option<PathBuf> {
148        for slot in self.slots.iter_mut() {
149            if let PoolSlot::Available { snapshot_path, .. } = slot {
150                let path = snapshot_path.clone();
151                *slot = PoolSlot::InUse {
152                    cell_id: cell_id.to_string(),
153                };
154                return Some(path);
155            }
156        }
157        None
158    }
159
160    /// Release the slot previously checked out by `cell_id`, transitioning it
161    /// to [`PoolSlot::Empty`]. A background filler is expected to re-populate
162    /// the slot via [`Self::fill`]; this is intentional — a VM that ran a
163    /// real cell is no longer at the parked-init state, so re-using its
164    /// snapshot would leak workload-side state into the next cell.
165    ///
166    /// Returns `true` if a matching `InUse { cell_id }` slot was found and
167    /// reset, `false` otherwise (call was a no-op).
168    pub async fn checkin(&mut self, cell_id: &str) -> bool {
169        for slot in self.slots.iter_mut() {
170            if let PoolSlot::InUse { cell_id: held } = slot {
171                if held == cell_id {
172                    *slot = PoolSlot::Empty;
173                    return true;
174                }
175            }
176        }
177        false
178    }
179
180    /// Boot one VM per `Empty` slot, snapshot it, and transition the slot to
181    /// [`PoolSlot::Available`]. No-op for slots already in `Available` or
182    /// `InUse`.
183    ///
184    /// On Linux (the only platform Firecracker runs on) this spawns one VMM
185    /// per empty slot, drives the configure → InstanceStart → wait-for-init
186    /// → PATCH-Paused → PUT-snapshot/create sequence, then kills the child
187    /// process. The pair of `(snapshot_path, mem_file_path)` files left
188    /// behind on disk is the durable artifact a future `checkout` will load.
189    ///
190    /// Off-Linux this is a no-op — Firecracker is not available, so the
191    /// pool stays empty and `checkout` returns `None`, falling
192    /// `FirecrackerCellBackend::create` through to its cold-boot path.
193    ///
194    /// Failures during fill are logged and the slot is left `Empty` (so a
195    /// subsequent fill can retry); we don't propagate errors out of `fill`
196    /// because the pool is a best-effort latency optimisation, not a
197    /// correctness gate.
198    #[cfg(target_os = "linux")]
199    pub async fn fill(&mut self, firecracker_bin: &str, kernel: &str, rootfs: &str) {
200        for (idx, slot) in self.slots.iter_mut().enumerate() {
201            if !matches!(slot, PoolSlot::Empty) {
202                continue;
203            }
204            match fill_one_slot(firecracker_bin, kernel, rootfs, idx).await {
205                Ok((snapshot_path, mem_file_path, vm_id)) => {
206                    tracing::info!(
207                        slot = idx,
208                        snapshot = %snapshot_path.display(),
209                        mem = %mem_file_path.display(),
210                        "warm pool slot filled"
211                    );
212                    *slot = PoolSlot::Available {
213                        snapshot_path,
214                        mem_file_path,
215                        vm_id,
216                    };
217                }
218                Err(e) => {
219                    tracing::warn!(slot = idx, error = %e, "warm pool fill failed; slot stays Empty");
220                }
221            }
222        }
223    }
224
225    /// Off-Linux stub — Firecracker only runs on Linux/KVM.
226    #[cfg(not(target_os = "linux"))]
227    pub async fn fill(&mut self, _firecracker_bin: &str, _kernel: &str, _rootfs: &str) {
228        tracing::debug!(
229            pool_size = self.size,
230            "FirecrackerPool::fill no-op: target_os != linux"
231        );
232    }
233}
234
235/// Restore a previously-captured snapshot into a fresh Firecracker VMM via
236/// `PUT /snapshot/load`. The caller owns the VMM process and its API socket
237/// — this helper only drives the load + resume call sequence.
238///
239/// Linux-only because the API client transport (`UnixStream`) is Linux-only.
240/// `FirecrackerCellBackend::create` calls this with the path returned by
241/// [`FirecrackerPool::checkout`] when the pool produced a fast-path slot;
242/// off-Linux the pool is always empty so this helper is never reached.
243#[cfg(target_os = "linux")]
244pub async fn restore_into(
245    client: &FirecrackerApiClient,
246    snapshot_path: &std::path::Path,
247    mem_file_path: &std::path::Path,
248) -> Result<(), CellosError> {
249    let status = client
250        .put(
251            "/snapshot/load",
252            &SnapshotLoad {
253                snapshot_path: snapshot_path.to_string_lossy().into_owned(),
254                mem_backend: MemBackend {
255                    backend_type: MemBackendType::File,
256                    backend_path: mem_file_path.to_string_lossy().into_owned(),
257                },
258                enable_diff_snapshots: false,
259                resume_vm: true,
260            },
261        )
262        .await?;
263    if !status.is_success() {
264        return Err(CellosError::Host(format!(
265            "firecracker /snapshot/load returned HTTP {status}"
266        )));
267    }
268    Ok(())
269}
270
271/// Boot one Firecracker VMM, snapshot it, kill the child, return the on-disk
272/// paths plus a stable vm-id. Linux-only.
273///
274/// Path discipline: snapshot files land at
275/// `/tmp/cellos-pool-<vm_id>.snap` (state) and `/tmp/cellos-pool-<vm_id>.mem`
276/// (memory dump). The VMM API socket lives at
277/// `/tmp/cellos-pool-<vm_id>.socket`. We `remove_file` the socket on
278/// teardown so re-fills don't `EEXIST` on `bind`.
279#[cfg(target_os = "linux")]
280async fn fill_one_slot(
281    firecracker_bin: &str,
282    kernel: &str,
283    rootfs: &str,
284    slot_idx: usize,
285) -> Result<(PathBuf, PathBuf, String), CellosError> {
286    use tokio::time::sleep;
287    use uuid::Uuid;
288
289    let vm_id = format!("pool-{}-{}", slot_idx, Uuid::new_v4().simple());
290    let socket_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.socket"));
291    let snapshot_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.snap"));
292    let mem_file_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.mem"));
293
294    // Stale socket from a crashed previous run would make Firecracker fail
295    // to `bind()`. Best-effort remove (ignore NotFound).
296    let _ = std::fs::remove_file(&socket_path);
297
298    // Spawn the VMM. Same direct-invocation shape as `build_direct_argv`
299    // in lib.rs — no jailer because the warm pool's VM never runs workload
300    // code; it boots cellos-init, gets snapshotted, and dies. The chroot
301    // boundary is therefore not load-bearing for the fill path.
302    let socket_str = socket_path.to_string_lossy().into_owned();
303    let mut child = tokio::process::Command::new(firecracker_bin)
304        .args(["--api-sock", socket_str.as_str(), "--level", "Error"])
305        .kill_on_drop(true)
306        .spawn()
307        .map_err(|e| CellosError::Host(format!("spawn firecracker for pool fill: {e}")))?;
308
309    // From here on, any error path must kill the child + clean up sockets
310    // before surfacing.
311    let fill = async {
312        let client = FirecrackerApiClient::new(&socket_path);
313        client.wait_for_ready().await?;
314
315        // Minimal machine config — pool VMs are stamped out from a single
316        // snapshot, so we use a small static footprint. The supervisor's
317        // hot path can still attach a larger scratch image at restore-time
318        // via a subsequent `PUT /drives/...`.
319        let mc = client
320            .put(
321                "/machine-config",
322                &MachineConfig {
323                    vcpu_count: 1,
324                    mem_size_mib: 128,
325                    track_dirty_pages: false,
326                },
327            )
328            .await?;
329        if !mc.is_success() {
330            return Err(CellosError::Host(format!(
331                "firecracker /machine-config returned HTTP {mc}"
332            )));
333        }
334
335        let bs = client
336            .put(
337                "/boot-source",
338                &BootSource {
339                    kernel_image_path: kernel.to_string(),
340                    // `reboot=k panic=1` is the standard Firecracker pair —
341                    // we never expect to reboot, but if the kernel panics
342                    // during snapshot prep we want a clean exit rather than
343                    // a hung VMM.
344                    boot_args: Some("console=ttyS0 reboot=k panic=1 pci=off nomodules".to_string()),
345                },
346            )
347            .await?;
348        if !bs.is_success() {
349            return Err(CellosError::Host(format!(
350                "firecracker /boot-source returned HTTP {bs}"
351            )));
352        }
353
354        let drv = client
355            .put(
356                "/drives/rootfs",
357                &Drive {
358                    drive_id: "rootfs".into(),
359                    path_on_host: rootfs.to_string(),
360                    is_root_device: true,
361                    is_read_only: true,
362                },
363            )
364            .await?;
365        if !drv.is_success() {
366            return Err(CellosError::Host(format!(
367                "firecracker /drives/rootfs returned HTTP {drv}"
368            )));
369        }
370
371        let start = client
372            .put(
373                "/actions",
374                &InstanceAction {
375                    action_type: InstanceActionType::InstanceStart,
376                },
377            )
378            .await?;
379        if !start.is_success() {
380            return Err(CellosError::Host(format!(
381                "firecracker InstanceStart returned HTTP {start}"
382            )));
383        }
384
385        // Wait for cellos-init to reach the parked state. The robust
386        // signal is a vsock readiness ping (see the lib.rs `boot_result`
387        // block's exit-code listener), but for the warm-pool path we don't
388        // yet require an init-side vsock dialog — the kernel-mode handoff
389        // to userspace is what we want to capture in the snapshot, not the
390        // full init handshake. A short fixed wait gives Firecracker enough
391        // wall time to bring up the vCPU and reach the parked userspace
392        // before we pause. This matches the wall-clock that AWS Lambda's
393        // microVM warmer uses for its pre-warm pool.
394        sleep(Duration::from_millis(500)).await;
395
396        // Pause the VM before snapshotting — Firecracker refuses to
397        // snapshot a Running VM.
398        let pause = client
399            .patch(
400                "/vm",
401                &VmStatePatch {
402                    state: VmState::Paused,
403                },
404            )
405            .await?;
406        if !pause.is_success() {
407            return Err(CellosError::Host(format!(
408                "firecracker PATCH /vm Paused returned HTTP {pause}"
409            )));
410        }
411
412        let snap = client
413            .put(
414                "/snapshot/create",
415                &SnapshotCreate {
416                    snapshot_type: SnapshotType::Full,
417                    snapshot_path: snapshot_path.to_string_lossy().into_owned(),
418                    mem_file_path: mem_file_path.to_string_lossy().into_owned(),
419                },
420            )
421            .await?;
422        if !snap.is_success() {
423            return Err(CellosError::Host(format!(
424                "firecracker /snapshot/create returned HTTP {snap}"
425            )));
426        }
427
428        Ok::<(), CellosError>(())
429    };
430
431    let result = fill.await;
432
433    // Tear down the source VMM. The snapshot is the durable artifact; the
434    // original Running-then-Paused process is no longer needed. `kill()`
435    // sends SIGKILL; we then `wait()` so we don't leave a zombie.
436    let _ = child.kill().await;
437    let _ = child.wait().await;
438    let _ = std::fs::remove_file(&socket_path);
439
440    result.map(|()| (snapshot_path, mem_file_path, vm_id))
441}
442
443/// Read [`POOL_SIZE_ENV`] from the process environment and parse it.
444/// Returns `0` (pool disabled) when unset, empty, or unparseable — the
445/// fail-closed default. A non-zero value enables the pool.
446pub fn pool_size_from_env() -> usize {
447    std::env::var(POOL_SIZE_ENV)
448        .ok()
449        .and_then(|v| v.trim().parse::<usize>().ok())
450        .unwrap_or(0)
451}
452
453// ── Tests ────────────────────────────────────────────────────────────────────
454
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an `Available` slot from literal path / id strings so tests can
    /// seed the pool without spawning Firecracker.
    fn available_slot(snapshot: &str, mem: &str, vm_id: &str) -> PoolSlot {
        PoolSlot::Available {
            snapshot_path: PathBuf::from(snapshot),
            mem_file_path: PathBuf::from(mem),
            vm_id: vm_id.to_string(),
        }
    }

    /// Count the slots currently in `PoolSlot::Empty`.
    fn empty_slots(pool: &FirecrackerPool) -> usize {
        pool.slots
            .iter()
            .filter(|s| matches!(s, PoolSlot::Empty))
            .count()
    }

    /// A brand-new pool of size N reports N slots, none `Available`, none
    /// `InUse`, and `checkout` has nothing to hand out.
    #[tokio::test]
    async fn new_pool_starts_empty() {
        let mut pool = FirecrackerPool::new(3);
        assert_eq!(pool.size(), 3);
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 0);
        // Nothing was filled, so checkout must yield None — the signal that
        // lets create() fall through to cold-boot.
        assert!(pool.checkout("cell-1").await.is_none());
    }

    /// `size=0` is the disabled-pool sentinel: `checkout` is always `None`,
    /// `checkin` is always `false`, and nothing panics.
    #[tokio::test]
    async fn zero_size_pool_is_inert() {
        let mut pool = FirecrackerPool::new(0);
        assert_eq!(pool.size(), 0);
        assert!(pool.checkout("any-cell").await.is_none());
        assert!(!pool.checkin("any-cell").await);
    }

    /// Full cycle on a one-slot pool: `Available` → `InUse` via checkout,
    /// → `Empty` via checkin. A second cell asking while the slot is held
    /// gets `None`.
    #[tokio::test]
    async fn checkout_then_checkin_cycles_slot_through_states() {
        let mut pool = FirecrackerPool::new(1);
        // Seed the slot by hand so the test does not depend on fill(),
        // which needs a real Firecracker binary to produce a snapshot.
        pool.slots[0] = available_slot("/tmp/snap-1", "/tmp/snap-1.mem", "vm-1");
        assert_eq!(pool.available(), 1);

        let path = pool.checkout("cell-1").await;
        assert_eq!(path, Some(PathBuf::from("/tmp/snap-1")));
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 1);

        // No Available slot remains, so the next checkout is the cold-boot
        // fallback signal.
        assert!(pool.checkout("cell-2").await.is_none());

        // The holder checks in; the slot becomes Empty.
        assert!(pool.checkin("cell-1").await);
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 0);

        // A second checkin finds nothing to release.
        assert!(!pool.checkin("cell-1").await);
    }

    /// `checkin` with a `cell_id` that does not hold the slot changes
    /// nothing, so a stale destroy from another cell cannot release someone
    /// else's slot.
    #[tokio::test]
    async fn checkin_wrong_cell_id_is_noop() {
        let mut pool = FirecrackerPool::new(1);
        pool.slots[0] = PoolSlot::InUse {
            cell_id: "real-cell".to_string(),
        };
        assert!(!pool.checkin("imposter-cell").await);
        // Still held by the real cell.
        assert_eq!(pool.in_use(), 1);
        // And the real cell can release it.
        assert!(pool.checkin("real-cell").await);
        assert_eq!(pool.in_use(), 0);
    }

    /// `fill` pointed at a firecracker binary that does not exist is a soft
    /// failure: spawning fails, the slots stay `Empty`, and no error escapes
    /// (the pool is a best-effort latency optimisation, not a correctness
    /// gate). Off-Linux `fill` is a no-op, so the same assertions hold on
    /// every platform.
    #[tokio::test]
    async fn fill_with_missing_binary_leaves_slots_empty() {
        let mut pool = FirecrackerPool::new(2);
        pool.fill(
            "/nonexistent/firecracker",
            "/nonexistent/vmlinux",
            "/nonexistent/rootfs.ext4",
        )
        .await;
        // Linux spawn failure and the off-Linux no-op both leave Empty slots.
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 0);
        assert_eq!(empty_slots(&pool), 2);
    }

    /// Two-slot cycle: seed two `Available` slots by hand (production would
    /// have one snapshot pair on disk per slot), check both out, confirm a
    /// third checkout yields `None`, then check both back in and confirm
    /// each release happens exactly once.
    #[tokio::test]
    async fn checkout_checkin_cycle_two_slots() {
        let mut pool = FirecrackerPool::new(2);
        pool.slots[0] = available_slot("/tmp/snap-a", "/tmp/snap-a.mem", "vm-a");
        pool.slots[1] = available_slot("/tmp/snap-b", "/tmp/snap-b.mem", "vm-b");
        assert_eq!(pool.available(), 2);

        let p1 = pool.checkout("cell-1").await.expect("first checkout");
        let p2 = pool.checkout("cell-2").await.expect("second checkout");
        assert_ne!(p1, p2, "each cell got a distinct snapshot path");
        assert_eq!(pool.available(), 0);
        assert_eq!(pool.in_use(), 2);

        // Every slot is held, so a third checkout is the cold-boot signal.
        assert!(pool.checkout("cell-3").await.is_none());

        assert!(pool.checkin("cell-1").await);
        assert!(pool.checkin("cell-2").await);
        assert_eq!(pool.in_use(), 0);
        // checkin lands on Empty (not Available): a VM that ran a workload
        // is no longer at the parked-init state, so the next fill() must
        // re-populate from a fresh boot.
        assert_eq!(empty_slots(&pool), 2);

        // Nothing is InUse any more, so repeated checkins are no-ops.
        assert!(!pool.checkin("cell-1").await);
        assert!(!pool.checkin("cell-2").await);
    }

    /// `checkout` hands back the slot's snapshot path unchanged and leaves
    /// the slot `InUse` under the caller's id. The memory path is not
    /// returned; callers are expected to pair it with the snapshot through
    /// the on-disk `<vm_id>.mem` naming convention. This pins the "snapshot
    /// path round-trips unchanged" contract the
    /// `FirecrackerCellBackend::create` wiring relies on.
    #[tokio::test]
    async fn checkout_returns_snapshot_path_verbatim() {
        let mut pool = FirecrackerPool::new(1);
        pool.slots[0] = available_slot("/tmp/cellos-pool-X.snap", "/tmp/cellos-pool-X.mem", "X");

        let got = pool.checkout("cell-X").await;
        assert_eq!(got, Some(PathBuf::from("/tmp/cellos-pool-X.snap")));
        // The slot must now be InUse{cell-X}.
        match &pool.slots[0] {
            PoolSlot::InUse { cell_id } => assert_eq!(cell_id, "cell-X"),
            other => panic!("expected InUse after checkout, got {other:?}"),
        }
    }

    /// `pool_size_from_env` yields 0 when the env var is unset. The *set*
    /// path is not tested here because mutating the environment races with
    /// other tests in the same process. Pinning the unset default is the
    /// part that matters: a non-zero default would switch the warm pool on
    /// by accident and send `create()` down an unexpected code path.
    #[test]
    fn pool_size_from_env_defaults_to_zero_when_unset() {
        // Best-effort: assert only when the variable is really unset in this
        // process. If something else set it, skip rather than be flaky.
        if std::env::var(POOL_SIZE_ENV).is_err() {
            assert_eq!(pool_size_from_env(), 0);
        }
    }
}