cellos_host_firecracker/pool.rs
1//! Pre-booted Firecracker VM pool — L2-06-2.
2//!
3//! Cold-booting a Firecracker microVM costs ~125 ms wall-clock on warm hosts
4//! (kernel decompression + init + cellos-init handshake). For agentic
5//! workloads that spawn many short-lived cells, that overhead dominates the
6//! workload's actual runtime. The remedy is the snapshot-restore path:
7//! pre-boot a VM to a known state (kernel up, vsock + virtio devices wired,
8//! cellos-init parked waiting for a `cellos.argv` cmdline), take a snapshot,
9//! and restore from the snapshot at cell-create time. Restore is ~10 ms.
10//!
11//! This module implements the *pool state machine* and the *fill* API. The
12//! integration into [`FirecrackerCellBackend::create`] is gated behind the
13//! `CELLOS_FIRECRACKER_POOL_SIZE` environment variable — default `0` means
14//! the pool is disabled and `create` follows the cold-boot path verbatim.
15//! When `>0`, a future commit wires `checkout` into `create` ahead of
16//! `configure_vm` and `checkin` into `destroy`.
17//!
18//! # Why a skeleton?
19//!
20//! The full snapshot path needs:
21//! * a Firecracker child managed by [`tokio::process::Child`] long enough to
22//! accept `PUT /snapshot/create` and then exit cleanly;
23//! * disk space accounting for memory snapshots (the `--mem-file-path` blob
24//! is the same size as the VM's RAM allocation);
25//! * a separate restore code path inside `create` that calls
26//! `PUT /snapshot/load` instead of `PUT /machine-config` + `PUT /boot-source`.
27//!
28//! All three of those land in subsequent L2-06 commits. This file pins the
29//! contract — the state machine, the `checkout`/`checkin` API shape, and the
30//! gating env var — so the wiring change in the live `create` path is a
31//! mechanical follow-up rather than a redesign.
32//!
33//! # State machine
34//!
35//! Each slot transitions:
36//!
37//! ```text
38//! Empty ──fill()──▶ Available ──checkout()──▶ InUse ──checkin()──▶ Empty
39//! ```
40//!
41//! `checkin` returns the slot to `Empty` (not `Available`) by design: a VM
42//! that ran a cell is no longer at the parked-init snapshot state, so it
43//! cannot be re-used without re-snapshotting from a fresh boot. A later
44//! background filler re-populates the slot. This is the same lifecycle
45//! AWS Lambda uses for warm-pool execution environments.
46
47use std::path::PathBuf;
48
49#[cfg(target_os = "linux")]
50use std::time::Duration;
51
52#[cfg(target_os = "linux")]
53use crate::api_client::{
54 BootSource, Drive, FirecrackerApiClient, InstanceAction, InstanceActionType, MachineConfig,
55 MemBackend, MemBackendType, SnapshotCreate, SnapshotLoad, SnapshotType, VmState, VmStatePatch,
56};
57#[cfg(target_os = "linux")]
58use cellos_core::CellosError;
59
/// Name of the environment variable that sizes the warm pool.
///
/// [`pool_size_from_env`] reads and parses it: when the variable is unset,
/// empty, or not a valid `usize` the result is `0` (pool disabled); any
/// positive integer enables the pool with that many slots.
pub const POOL_SIZE_ENV: &str = "CELLOS_FIRECRACKER_POOL_SIZE";
63
/// Lifecycle state of a single warm-pool slot.
///
/// A slot moves `Empty` → `Available` → `InUse` → `Empty`; the transitions
/// are driven by the `fill`, `checkout` and `checkin` methods of
/// [`FirecrackerPool`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PoolSlot {
    /// A snapshot pair exists on disk and the slot can be handed out.
    Available {
        /// VM state file produced by `PUT /snapshot/create`; handed back to
        /// Firecracker as `snapshot_path` on `PUT /snapshot/load`.
        snapshot_path: PathBuf,
        /// Guest memory dump written alongside the state file
        /// (`mem_file_path` in the `SnapshotCreate` body). Restore supplies
        /// it as `mem_backend.backend_path` on `PUT /snapshot/load`.
        mem_file_path: PathBuf,
        /// Identifier assigned to the pre-booted VM when the slot was filled.
        vm_id: String,
    },
    /// The slot is held by a cell and cannot be checked out again.
    InUse {
        /// Id of the cell holding the slot; `checkin` releases the slot only
        /// when the caller presents the same id.
        cell_id: String,
    },
    /// No snapshot is present: the initial state, and the state a slot
    /// returns to after `checkin` until it is filled again.
    Empty,
}
90
91/// Pre-booted Firecracker VM pool for fast cell startup.
92///
93/// Each slot is a VM that has booted to the kernel's init stage and been
94/// snapshot'd — ready to restore in ~10 ms vs cold-boot ~125 ms.
95///
96/// **Thread-safety:** the pool is currently `&mut self`-driven for clarity.
97/// The wiring inside [`FirecrackerCellBackend`] will wrap it in
98/// `tokio::sync::Mutex<FirecrackerPool>` (same pattern as `running_vms`) so
99/// concurrent `create` / `destroy` calls serialize on slot allocation.
100///
101/// [`FirecrackerCellBackend`]: crate::FirecrackerCellBackend
102pub struct FirecrackerPool {
103 size: usize,
104 slots: Vec<PoolSlot>,
105}
106
107impl FirecrackerPool {
108 /// Construct an empty pool with `size` slots, all in [`PoolSlot::Empty`].
109 /// `size==0` is valid and yields a pool whose `checkout` always returns
110 /// `None` — the wiring code uses this to short-circuit when the env var
111 /// is unset or zero.
112 pub fn new(size: usize) -> Self {
113 Self {
114 size,
115 slots: (0..size).map(|_| PoolSlot::Empty).collect(),
116 }
117 }
118
119 /// Number of slots configured for this pool (any state).
120 pub fn size(&self) -> usize {
121 self.size
122 }
123
124 /// Number of [`PoolSlot::Available`] slots — the number of cells that can
125 /// be served by the fast-path right now.
126 pub fn available(&self) -> usize {
127 self.slots
128 .iter()
129 .filter(|s| matches!(s, PoolSlot::Available { .. }))
130 .count()
131 }
132
133 /// Number of [`PoolSlot::InUse`] slots.
134 pub fn in_use(&self) -> usize {
135 self.slots
136 .iter()
137 .filter(|s| matches!(s, PoolSlot::InUse { .. }))
138 .count()
139 }
140
141 /// Reserve an available snapshot for `cell_id`, transitioning the slot
142 /// from `Available` to `InUse`. Returns the snapshot path on success, or
143 /// `None` if no `Available` slot exists (caller falls back to cold-boot).
144 ///
145 /// Marked `async` for symmetry with the future implementation that will
146 /// hold a `tokio::sync::Mutex`. The body is currently synchronous.
147 pub async fn checkout(&mut self, cell_id: &str) -> Option<PathBuf> {
148 for slot in self.slots.iter_mut() {
149 if let PoolSlot::Available { snapshot_path, .. } = slot {
150 let path = snapshot_path.clone();
151 *slot = PoolSlot::InUse {
152 cell_id: cell_id.to_string(),
153 };
154 return Some(path);
155 }
156 }
157 None
158 }
159
160 /// Release the slot previously checked out by `cell_id`, transitioning it
161 /// to [`PoolSlot::Empty`]. A background filler is expected to re-populate
162 /// the slot via [`Self::fill`]; this is intentional — a VM that ran a
163 /// real cell is no longer at the parked-init state, so re-using its
164 /// snapshot would leak workload-side state into the next cell.
165 ///
166 /// Returns `true` if a matching `InUse { cell_id }` slot was found and
167 /// reset, `false` otherwise (call was a no-op).
168 pub async fn checkin(&mut self, cell_id: &str) -> bool {
169 for slot in self.slots.iter_mut() {
170 if let PoolSlot::InUse { cell_id: held } = slot {
171 if held == cell_id {
172 *slot = PoolSlot::Empty;
173 return true;
174 }
175 }
176 }
177 false
178 }
179
180 /// Boot one VM per `Empty` slot, snapshot it, and transition the slot to
181 /// [`PoolSlot::Available`]. No-op for slots already in `Available` or
182 /// `InUse`.
183 ///
184 /// On Linux (the only platform Firecracker runs on) this spawns one VMM
185 /// per empty slot, drives the configure → InstanceStart → wait-for-init
186 /// → PATCH-Paused → PUT-snapshot/create sequence, then kills the child
187 /// process. The pair of `(snapshot_path, mem_file_path)` files left
188 /// behind on disk is the durable artifact a future `checkout` will load.
189 ///
190 /// Off-Linux this is a no-op — Firecracker is not available, so the
191 /// pool stays empty and `checkout` returns `None`, falling
192 /// `FirecrackerCellBackend::create` through to its cold-boot path.
193 ///
194 /// Failures during fill are logged and the slot is left `Empty` (so a
195 /// subsequent fill can retry); we don't propagate errors out of `fill`
196 /// because the pool is a best-effort latency optimisation, not a
197 /// correctness gate.
198 #[cfg(target_os = "linux")]
199 pub async fn fill(&mut self, firecracker_bin: &str, kernel: &str, rootfs: &str) {
200 for (idx, slot) in self.slots.iter_mut().enumerate() {
201 if !matches!(slot, PoolSlot::Empty) {
202 continue;
203 }
204 match fill_one_slot(firecracker_bin, kernel, rootfs, idx).await {
205 Ok((snapshot_path, mem_file_path, vm_id)) => {
206 tracing::info!(
207 slot = idx,
208 snapshot = %snapshot_path.display(),
209 mem = %mem_file_path.display(),
210 "warm pool slot filled"
211 );
212 *slot = PoolSlot::Available {
213 snapshot_path,
214 mem_file_path,
215 vm_id,
216 };
217 }
218 Err(e) => {
219 tracing::warn!(slot = idx, error = %e, "warm pool fill failed; slot stays Empty");
220 }
221 }
222 }
223 }
224
225 /// Off-Linux stub — Firecracker only runs on Linux/KVM.
226 #[cfg(not(target_os = "linux"))]
227 pub async fn fill(&mut self, _firecracker_bin: &str, _kernel: &str, _rootfs: &str) {
228 tracing::debug!(
229 pool_size = self.size,
230 "FirecrackerPool::fill no-op: target_os != linux"
231 );
232 }
233}
234
235/// Restore a previously-captured snapshot into a fresh Firecracker VMM via
236/// `PUT /snapshot/load`. The caller owns the VMM process and its API socket
237/// — this helper only drives the load + resume call sequence.
238///
239/// Linux-only because the API client transport (`UnixStream`) is Linux-only.
240/// `FirecrackerCellBackend::create` calls this with the path returned by
241/// [`FirecrackerPool::checkout`] when the pool produced a fast-path slot;
242/// off-Linux the pool is always empty so this helper is never reached.
243#[cfg(target_os = "linux")]
244pub async fn restore_into(
245 client: &FirecrackerApiClient,
246 snapshot_path: &std::path::Path,
247 mem_file_path: &std::path::Path,
248) -> Result<(), CellosError> {
249 let status = client
250 .put(
251 "/snapshot/load",
252 &SnapshotLoad {
253 snapshot_path: snapshot_path.to_string_lossy().into_owned(),
254 mem_backend: MemBackend {
255 backend_type: MemBackendType::File,
256 backend_path: mem_file_path.to_string_lossy().into_owned(),
257 },
258 enable_diff_snapshots: false,
259 resume_vm: true,
260 },
261 )
262 .await?;
263 if !status.is_success() {
264 return Err(CellosError::Host(format!(
265 "firecracker /snapshot/load returned HTTP {status}"
266 )));
267 }
268 Ok(())
269}
270
271/// Boot one Firecracker VMM, snapshot it, kill the child, return the on-disk
272/// paths plus a stable vm-id. Linux-only.
273///
274/// Path discipline: snapshot files land at
275/// `/tmp/cellos-pool-<vm_id>.snap` (state) and `/tmp/cellos-pool-<vm_id>.mem`
276/// (memory dump). The VMM API socket lives at
277/// `/tmp/cellos-pool-<vm_id>.socket`. We `remove_file` the socket on
278/// teardown so re-fills don't `EEXIST` on `bind`.
279#[cfg(target_os = "linux")]
280async fn fill_one_slot(
281 firecracker_bin: &str,
282 kernel: &str,
283 rootfs: &str,
284 slot_idx: usize,
285) -> Result<(PathBuf, PathBuf, String), CellosError> {
286 use tokio::time::sleep;
287 use uuid::Uuid;
288
289 let vm_id = format!("pool-{}-{}", slot_idx, Uuid::new_v4().simple());
290 let socket_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.socket"));
291 let snapshot_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.snap"));
292 let mem_file_path = PathBuf::from(format!("/tmp/cellos-pool-{vm_id}.mem"));
293
294 // Stale socket from a crashed previous run would make Firecracker fail
295 // to `bind()`. Best-effort remove (ignore NotFound).
296 let _ = std::fs::remove_file(&socket_path);
297
298 // Spawn the VMM. Same direct-invocation shape as `build_direct_argv`
299 // in lib.rs — no jailer because the warm pool's VM never runs workload
300 // code; it boots cellos-init, gets snapshotted, and dies. The chroot
301 // boundary is therefore not load-bearing for the fill path.
302 let socket_str = socket_path.to_string_lossy().into_owned();
303 let mut child = tokio::process::Command::new(firecracker_bin)
304 .args(["--api-sock", socket_str.as_str(), "--level", "Error"])
305 .kill_on_drop(true)
306 .spawn()
307 .map_err(|e| CellosError::Host(format!("spawn firecracker for pool fill: {e}")))?;
308
309 // From here on, any error path must kill the child + clean up sockets
310 // before surfacing.
311 let fill = async {
312 let client = FirecrackerApiClient::new(&socket_path);
313 client.wait_for_ready().await?;
314
315 // Minimal machine config — pool VMs are stamped out from a single
316 // snapshot, so we use a small static footprint. The supervisor's
317 // hot path can still attach a larger scratch image at restore-time
318 // via a subsequent `PUT /drives/...`.
319 let mc = client
320 .put(
321 "/machine-config",
322 &MachineConfig {
323 vcpu_count: 1,
324 mem_size_mib: 128,
325 track_dirty_pages: false,
326 },
327 )
328 .await?;
329 if !mc.is_success() {
330 return Err(CellosError::Host(format!(
331 "firecracker /machine-config returned HTTP {mc}"
332 )));
333 }
334
335 let bs = client
336 .put(
337 "/boot-source",
338 &BootSource {
339 kernel_image_path: kernel.to_string(),
340 // `reboot=k panic=1` is the standard Firecracker pair —
341 // we never expect to reboot, but if the kernel panics
342 // during snapshot prep we want a clean exit rather than
343 // a hung VMM.
344 boot_args: Some("console=ttyS0 reboot=k panic=1 pci=off nomodules".to_string()),
345 },
346 )
347 .await?;
348 if !bs.is_success() {
349 return Err(CellosError::Host(format!(
350 "firecracker /boot-source returned HTTP {bs}"
351 )));
352 }
353
354 let drv = client
355 .put(
356 "/drives/rootfs",
357 &Drive {
358 drive_id: "rootfs".into(),
359 path_on_host: rootfs.to_string(),
360 is_root_device: true,
361 is_read_only: true,
362 },
363 )
364 .await?;
365 if !drv.is_success() {
366 return Err(CellosError::Host(format!(
367 "firecracker /drives/rootfs returned HTTP {drv}"
368 )));
369 }
370
371 let start = client
372 .put(
373 "/actions",
374 &InstanceAction {
375 action_type: InstanceActionType::InstanceStart,
376 },
377 )
378 .await?;
379 if !start.is_success() {
380 return Err(CellosError::Host(format!(
381 "firecracker InstanceStart returned HTTP {start}"
382 )));
383 }
384
385 // Wait for cellos-init to reach the parked state. The robust
386 // signal is a vsock readiness ping (see the lib.rs `boot_result`
387 // block's exit-code listener), but for the warm-pool path we don't
388 // yet require an init-side vsock dialog — the kernel-mode handoff
389 // to userspace is what we want to capture in the snapshot, not the
390 // full init handshake. A short fixed wait gives Firecracker enough
391 // wall time to bring up the vCPU and reach the parked userspace
392 // before we pause. This matches the wall-clock that AWS Lambda's
393 // microVM warmer uses for its pre-warm pool.
394 sleep(Duration::from_millis(500)).await;
395
396 // Pause the VM before snapshotting — Firecracker refuses to
397 // snapshot a Running VM.
398 let pause = client
399 .patch(
400 "/vm",
401 &VmStatePatch {
402 state: VmState::Paused,
403 },
404 )
405 .await?;
406 if !pause.is_success() {
407 return Err(CellosError::Host(format!(
408 "firecracker PATCH /vm Paused returned HTTP {pause}"
409 )));
410 }
411
412 let snap = client
413 .put(
414 "/snapshot/create",
415 &SnapshotCreate {
416 snapshot_type: SnapshotType::Full,
417 snapshot_path: snapshot_path.to_string_lossy().into_owned(),
418 mem_file_path: mem_file_path.to_string_lossy().into_owned(),
419 },
420 )
421 .await?;
422 if !snap.is_success() {
423 return Err(CellosError::Host(format!(
424 "firecracker /snapshot/create returned HTTP {snap}"
425 )));
426 }
427
428 Ok::<(), CellosError>(())
429 };
430
431 let result = fill.await;
432
433 // Tear down the source VMM. The snapshot is the durable artifact; the
434 // original Running-then-Paused process is no longer needed. `kill()`
435 // sends SIGKILL; we then `wait()` so we don't leave a zombie.
436 let _ = child.kill().await;
437 let _ = child.wait().await;
438 let _ = std::fs::remove_file(&socket_path);
439
440 result.map(|()| (snapshot_path, mem_file_path, vm_id))
441}
442
443/// Read [`POOL_SIZE_ENV`] from the process environment and parse it.
444/// Returns `0` (pool disabled) when unset, empty, or unparseable — the
445/// fail-closed default. A non-zero value enables the pool.
446pub fn pool_size_from_env() -> usize {
447 std::env::var(POOL_SIZE_ENV)
448 .ok()
449 .and_then(|v| v.trim().parse::<usize>().ok())
450 .unwrap_or(0)
451}
452
453// ── Tests ────────────────────────────────────────────────────────────────────
454
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an `Available` slot from literal paths and a vm id.
    fn available_slot(snapshot: &str, mem: &str, vm_id: &str) -> PoolSlot {
        PoolSlot::Available {
            snapshot_path: PathBuf::from(snapshot),
            mem_file_path: PathBuf::from(mem),
            vm_id: vm_id.to_string(),
        }
    }

    /// `(available, in_use)` counters of `pool`, for compact assertions.
    fn counts(pool: &FirecrackerPool) -> (usize, usize) {
        (pool.available(), pool.in_use())
    }

    /// Number of slots currently in the `Empty` state.
    fn empty_slots(pool: &FirecrackerPool) -> usize {
        pool.slots
            .iter()
            .filter(|slot| matches!(slot, PoolSlot::Empty))
            .count()
    }

    /// A new pool of N slots reports size N with nothing available and
    /// nothing in use, so `checkout` has nothing to hand out.
    #[tokio::test]
    async fn new_pool_starts_empty() {
        let mut pool = FirecrackerPool::new(3);
        assert_eq!(pool.size(), 3);
        assert_eq!(counts(&pool), (0, 0));
        // `None` here is what lets create() fall through to cold-boot.
        assert!(pool.checkout("cell-1").await.is_none());
    }

    /// A zero-slot pool is the disabled sentinel: `checkout` yields `None`,
    /// `checkin` yields `false`, and nothing panics.
    #[tokio::test]
    async fn zero_size_pool_is_inert() {
        let mut pool = FirecrackerPool::new(0);
        assert_eq!(pool.size(), 0);
        assert!(pool.checkout("any-cell").await.is_none());
        assert!(!pool.checkin("any-cell").await);
    }

    /// One-slot walk through the state machine: `Available` → `InUse` on
    /// checkout, → `Empty` on checkin. A second cell asking while the slot
    /// is held gets `None`.
    #[tokio::test]
    async fn checkout_then_checkin_cycles_slot_through_states() {
        let mut pool = FirecrackerPool::new(1);
        // Place the Available slot by hand so the test does not depend on
        // fill() being able to boot a VM.
        pool.slots[0] = available_slot("/tmp/snap-1", "/tmp/snap-1.mem", "vm-1");
        assert_eq!(pool.available(), 1);

        let path = pool.checkout("cell-1").await;
        assert_eq!(path, Some(PathBuf::from("/tmp/snap-1")));
        assert_eq!(counts(&pool), (0, 1));

        // The only slot is held, so a second checkout signals cold-boot.
        assert!(pool.checkout("cell-2").await.is_none());

        // The holder checks in; the slot ends up Empty.
        assert!(pool.checkin("cell-1").await);
        assert_eq!(counts(&pool), (0, 0));

        // Checking in again finds nothing to release.
        assert!(!pool.checkin("cell-1").await);
    }

    /// `checkin` with an id that does not hold the slot changes nothing, so
    /// a stale destroy from another cell cannot release someone else's slot.
    #[tokio::test]
    async fn checkin_wrong_cell_id_is_noop() {
        let mut pool = FirecrackerPool::new(1);
        pool.slots[0] = PoolSlot::InUse {
            cell_id: "real-cell".to_string(),
        };

        assert!(!pool.checkin("imposter-cell").await);
        // Still held by the real cell.
        assert_eq!(pool.in_use(), 1);

        // The real holder can release it.
        assert!(pool.checkin("real-cell").await);
        assert_eq!(pool.in_use(), 0);
    }

    /// `fill` pointed at a non-existent firecracker binary fails softly: the
    /// spawn fails, no error escapes, and every slot stays `Empty`. Off-Linux
    /// `fill` is a no-op, so the same assertions hold on every platform.
    #[tokio::test]
    async fn fill_with_missing_binary_leaves_slots_empty() {
        let mut pool = FirecrackerPool::new(2);
        pool.fill(
            "/nonexistent/firecracker",
            "/nonexistent/vmlinux",
            "/nonexistent/rootfs.ext4",
        )
        .await;

        // Spawn failure (Linux) and no-op (elsewhere) both leave slots Empty.
        assert_eq!(counts(&pool), (0, 0));
        assert_eq!(empty_slots(&pool), 2);
    }

    /// Two-slot cycle with hand-placed `Available` slots (no firecracker
    /// spawn): two checkouts succeed with distinct paths, a third gets
    /// `None`, and each `checkin` returns its slot to `Empty` exactly once.
    #[tokio::test]
    async fn checkout_checkin_cycle_two_slots() {
        let mut pool = FirecrackerPool::new(2);
        pool.slots[0] = available_slot("/tmp/snap-a", "/tmp/snap-a.mem", "vm-a");
        pool.slots[1] = available_slot("/tmp/snap-b", "/tmp/snap-b.mem", "vm-b");
        assert_eq!(pool.available(), 2);

        let p1 = pool.checkout("cell-1").await.expect("first checkout");
        let p2 = pool.checkout("cell-2").await.expect("second checkout");
        assert_ne!(p1, p2, "each cell got a distinct snapshot path");
        assert_eq!(counts(&pool), (0, 2));

        // Every slot is held, so a third checkout signals cold-boot.
        assert!(pool.checkout("cell-3").await.is_none());

        assert!(pool.checkin("cell-1").await);
        assert!(pool.checkin("cell-2").await);
        assert_eq!(pool.in_use(), 0);
        // checkin lands on Empty, not Available: a VM that ran a workload is
        // no longer at the parked-init state, so the next fill() must
        // re-populate the slot from a fresh boot.
        assert_eq!(empty_slots(&pool), 2);

        // Nothing is InUse any more, so further checkins release nothing.
        assert!(!pool.checkin("cell-1").await);
        assert!(!pool.checkin("cell-2").await);
    }

    /// `checkout` returns the slot's snapshot path unchanged and leaves the
    /// slot `InUse` by the requesting cell. The caller feeds that path to
    /// `restore_into`; the mem path is paired with it on the supervisor side
    /// via the on-disk `<vm_id>.mem` convention.
    #[tokio::test]
    async fn checkout_returns_snapshot_path_verbatim() {
        let mut pool = FirecrackerPool::new(1);
        pool.slots[0] = available_slot("/tmp/cellos-pool-X.snap", "/tmp/cellos-pool-X.mem", "X");

        let got = pool.checkout("cell-X").await;
        assert_eq!(got, Some(PathBuf::from("/tmp/cellos-pool-X.snap")));

        // The slot must now be held by cell-X.
        match &pool.slots[0] {
            PoolSlot::InUse { cell_id } => assert_eq!(cell_id, "cell-X"),
            other => panic!("expected InUse after checkout, got {other:?}"),
        }
    }

    /// `pool_size_from_env` yields 0 when the env var is unset. The *set*
    /// path is not exercised here because mutating the environment is racy
    /// across tests in one process. Pinning the unset default guards against
    /// the warm pool activating accidentally.
    #[test]
    fn pool_size_from_env_defaults_to_zero_when_unset() {
        // Only assert when the variable is not readable in this process; if
        // something set it, skip rather than risk a flaky failure.
        if std::env::var(POOL_SIZE_ENV).is_ok() {
            return;
        }
        assert_eq!(pool_size_from_env(), 0);
    }
}