Skip to main content

cellos_host_firecracker/
lib.rs

1//! Firecracker-backed host backend (L2-06).
2//!
3//! # Architecture
4//!
5//! `FirecrackerCellBackend` implements the [`CellBackend`] trait by managing
6//! one Firecracker VMM process per cell:
7//!
8//! 1. **`create`** — spawns `firecracker --api-sock <socket>`, waits for the
9//!    socket to appear, then calls the Firecracker Management API to configure
10//!    the machine (vCPUs, memory, kernel, rootfs) and boot it.
11//! 2. **`destroy`** — sends a graceful `SendCtrlAltDel` action, waits for the
12//!    process to exit, and cleans up the socket file.
13//!
14//! # Cell command execution
15//!
16//! The Firecracker path now runs `spec.run.argv` inside the guest via the
17//! `cellos-init` PID-1 binary and a vsock exit-code bridge:
18//!
19//! 1. The host encodes `spec.run.argv` into the kernel boot args as
20//!    `cellos.argv=<base64-json>`.
21//! 2. The VM is configured with a vsock device and the host starts a matching
22//!    Unix-socket listener.
23//! 3. `cellos-init` reads `/proc/cmdline`, forks and execs the workload inside
24//!    the guest, then writes the 4-byte little-endian exit code back to the
25//!    host over vsock before powering off the VM.
26//! 4. The supervisor calls `CellBackend::wait_for_in_vm_exit()` and skips the
27//!    host-side subprocess path when this backend reports an in-VM exit code.
28//!
29//! The host-side subprocess fallback still exists for backends that do not
30//! override `wait_for_in_vm_exit()`, but it is no longer the execution path for
31//! `FirecrackerCellBackend`.
32
33pub mod api_client;
34pub mod pool;
35
36use std::path::{Path, PathBuf};
37use std::sync::Arc;
38use std::time::Duration;
39
40#[cfg(target_os = "linux")]
41use std::collections::HashMap;
42
43use async_trait::async_trait;
44
45#[cfg(target_os = "linux")]
46use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
47#[cfg(target_os = "linux")]
48use base64::Engine;
49#[cfg(target_os = "linux")]
50use tokio::io::AsyncReadExt;
51#[cfg(target_os = "linux")]
52use tokio::io::AsyncWriteExt;
53#[cfg(target_os = "linux")]
54use tokio::net::UnixListener;
55#[cfg(target_os = "linux")]
56use tokio::process::Child;
57#[cfg(target_os = "linux")]
58use tokio::sync::Mutex;
59#[cfg(target_os = "linux")]
60use tracing::instrument;
61#[cfg(target_os = "linux")]
62use uuid::Uuid;
63
64use cellos_core::ports::{CellBackend, CellHandle, TeardownReport};
65#[cfg(target_os = "linux")]
66use cellos_core::EgressRule;
67use cellos_core::{CellosError, ExecutionCellDocument};
68
69#[cfg(target_os = "linux")]
70use api_client::{
71    BootSource, Drive, FirecrackerApiClient, InstanceAction, InstanceActionType, MachineConfig,
72    NetworkInterface, VsockDevice,
73};
74
75/// How long to wait for the Firecracker socket to appear after process start.
76#[cfg(target_os = "linux")]
77const SOCKET_READY_TIMEOUT: Duration = Duration::from_secs(10);
78/// How long to wait for graceful VM shutdown before SIGKILL.
79///
80/// FC-21: this is the *fallback* applied when `spec.run.limits.gracefulShutdownSeconds`
81/// is absent. Per-spec overrides land in [`VmRecord::graceful_shutdown_timeout`]
82/// at `create()` time and are read back in `destroy()`.
83#[cfg(target_os = "linux")]
84const GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(5);
85
86/// Resolve the graceful-shutdown timeout for a spec.
87///
88/// Reads `spec.run.limits.graceful_shutdown_seconds` (FC-21). When absent or
89/// `None`, returns [`GRACEFUL_SHUTDOWN_TIMEOUT`]. Schema validation already
90/// bounds the value to `[1, 120]` seconds, so we trust whatever made it past
91/// admission here.
92#[cfg(target_os = "linux")]
93fn resolve_graceful_shutdown_timeout(spec: &cellos_core::ExecutionCellSpec) -> Duration {
94    spec.run
95        .as_ref()
96        .and_then(|r| r.limits.as_ref())
97        .and_then(|l| l.graceful_shutdown_seconds)
98        .map(Duration::from_secs)
99        .unwrap_or(GRACEFUL_SHUTDOWN_TIMEOUT)
100}
101
/// Default vCPUs for a cell VM, used when `spec.run.limits.cpu_max` is absent.
103#[cfg(target_os = "linux")]
104const DEFAULT_VCPU_COUNT: u32 = 1;
105/// Default memory in MiB for a cell VM.
106#[cfg(target_os = "linux")]
107const DEFAULT_MEM_SIZE_MIB: u32 = 128;
108
109/// Derive vCPU count from spec run limits.
110///
111/// Maps `spec.run.limits.cpu_max` (quota_micros / period_micros) to a vCPU
112/// count by rounding UP. Clamped to [1, 32] (Firecracker's supported range).
113/// Fractional CPU allocations get 1 vCPU minimum — underprovisioning a build
114/// runner causes more latency regression than the 1-vCPU overhead.
115///
116/// Returns `DEFAULT_VCPU_COUNT` when no limits are declared.
117#[cfg(target_os = "linux")]
118fn derive_vcpu_count(spec: &cellos_core::ExecutionCellSpec) -> u32 {
119    let Some(cpu_max) = spec
120        .run
121        .as_ref()
122        .and_then(|r| r.limits.as_ref())
123        .and_then(|l| l.cpu_max.as_ref())
124    else {
125        return DEFAULT_VCPU_COUNT;
126    };
127    let period = cpu_max.period_micros.unwrap_or(100_000).max(1);
128    let vcpus = cpu_max.quota_micros.div_ceil(period) as u32;
129    vcpus.clamp(1, 32)
130}
131
132/// vsock port that `cellos-init` inside the VM connects to after the cell
133/// command exits.  Must match the constant in `cellos-init/src/main.rs`.
134pub const VSOCK_EXIT_PORT: u32 = 9000;
135
136/// FC-18: size in bytes of the per-cell HMAC-SHA256 authentication key.
137/// `pub(crate)` — tests reach this via the doc-hidden `__fc18` shim.
138pub(crate) const EXIT_HMAC_KEY_LEN: usize = 32;
139
140/// FC-18: size in bytes of the HMAC-SHA256 tag the guest appends to its
141/// exit-code report. Must match `compute_exit_hmac` in `cellos-init`.
142pub(crate) const EXIT_HMAC_TAG_LEN: usize = 32;
143
144/// FC-18: full authenticated exit-code wire format size — `4` bytes of
145/// little-endian `i32` followed by `EXIT_HMAC_TAG_LEN` bytes of MAC.
146#[cfg(target_os = "linux")]
147const EXIT_AUTHED_FRAME_LEN: usize = 4 + EXIT_HMAC_TAG_LEN;
148
149/// FC-18: generate a fresh 32-byte HMAC key for one cell by reading from
150/// `/dev/urandom`. Linux-only because Firecracker is Linux-only and the
151/// surrounding crate is `cfg(target_os = "linux")`-gated; no generic random
152/// source crate is in the workspace dependency tree, and we explicitly do
153/// not want to pull `getrandom` into cellos-host-firecracker for this single
154/// call site.
155///
156/// Returns `Err` only if `/dev/urandom` is unavailable or short-reads — both
157/// are catastrophic on a Linux host (the kernel guarantees /dev/urandom is
158/// always readable post-boot) so the caller should treat this as a fatal
159/// host configuration error and refuse to launch the cell.
160#[cfg(target_os = "linux")]
161fn generate_exit_hmac_key() -> Result<[u8; EXIT_HMAC_KEY_LEN], CellosError> {
162    use std::io::Read;
163    let mut key = [0u8; EXIT_HMAC_KEY_LEN];
164    let mut f = std::fs::File::open("/dev/urandom")
165        .map_err(|e| CellosError::Host(format!("open /dev/urandom: {e}")))?;
166    f.read_exact(&mut key)
167        .map_err(|e| CellosError::Host(format!("read /dev/urandom: {e}")))?;
168    Ok(key)
169}
170
171/// FC-18: pure-Rust verification helper. Recompute the HMAC-SHA256 tag the
172/// guest should have produced (`HMAC(key, exit_code_bytes ‖ cell_id_bytes)`)
173/// and compare it in constant time against the bytes the supervisor read
174/// off the wire.
175///
176/// `exit_code_bytes` is the raw 4-byte little-endian buffer the supervisor
177/// already received — passing the bytes directly (rather than the parsed
178/// `i32`) keeps this function trivially testable against arbitrary
179/// adversarial inputs without hard-coding the byte order in two places.
180///
181/// Returns `true` iff the recomputed tag matches the received tag. Note:
182/// uses `hmac::Mac::verify_slice` which performs a constant-time compare —
183/// avoid replacing with naive `==` on the byte arrays, which leaks timing
184/// information about the prefix match.
185///
186/// `pub(crate)`: tests reach this via the doc-hidden `__fc18` shim.
187pub(crate) fn verify_exit_hmac(
188    key: &[u8],
189    exit_code_bytes: &[u8; 4],
190    cell_id: &str,
191    received_tag: &[u8],
192) -> bool {
193    use hmac::{digest::KeyInit, Hmac, Mac};
194    use sha2::Sha256;
195    type HmacSha256 = Hmac<Sha256>;
196
197    if received_tag.len() != EXIT_HMAC_TAG_LEN {
198        return false;
199    }
200    let mut mac = match HmacSha256::new_from_slice(key) {
201        Ok(m) => m,
202        Err(_) => return false,
203    };
204    mac.update(exit_code_bytes);
205    mac.update(cell_id.as_bytes());
206    mac.verify_slice(received_tag).is_ok()
207}
208
209/// Guest CID assigned to every cell VM.  The host CID (2) is used by
210/// `cellos-init` to connect back; 3 is the conventional first guest CID.
211#[cfg(target_os = "linux")]
212const VSOCK_GUEST_CID: u32 = 3;
213
214/// MAC address handed to the guest virtio-net device.  Each cell runs in its
215/// own L2 segment (single-tap, point-to-point with the host bridge), so reusing
216/// one MAC across cells is safe — there is no shared broadcast domain.
217#[cfg(target_os = "linux")]
218const GUEST_NIC_MAC: &str = "AA:FC:00:00:00:01";
219
/// Name prefix for per-cell host TAP interfaces. Combined with an 8-character
/// sanitized cell-id slug to stay within the kernel's 15-character interface
/// name limit (`IFNAMSIZ` is 16 including the trailing NUL; `cfc-` = 4,
/// slug = 8, total = 12).
223#[cfg(target_os = "linux")]
224const TAP_NAME_PREFIX: &str = "cfc-";
225
226/// True when host TAP / nftables enforcement is supported. The runtime helpers
227/// for [`create_tap_device`], [`apply_network_policy`], etc. only function on
228/// Linux; on every other OS they short-circuit to a clear error so a developer
229/// running tests on macOS sees the same code paths but does not need root.
230#[cfg(target_os = "linux")]
231const NETWORK_DEFAULT_ENABLED: bool = true;
232#[cfg(not(target_os = "linux"))]
233const NETWORK_DEFAULT_ENABLED: bool = false;
234
235// ── Config ───────────────────────────────────────────────────────────────────
236
/// Static configuration for the Firecracker host backend.
///
/// Normally assembled by [`FirecrackerConfig::from_env`], which reads the
/// `CELLOS_FIRECRACKER_*` environment variables named on each field below and
/// rejects inconsistent or unsafe combinations before any VM is started.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FirecrackerConfig {
    /// Absolute path to the firecracker VMM binary. Required; read from
    /// `CELLOS_FIRECRACKER_BINARY`.
    pub binary_path: PathBuf,
    /// Absolute path to the Firecracker kernel image. Required; read from
    /// `CELLOS_FIRECRACKER_KERNEL_IMAGE`.
    pub kernel_image_path: PathBuf,
    /// Absolute path to the Firecracker rootfs image. Required; read from
    /// `CELLOS_FIRECRACKER_ROOTFS_IMAGE`.
    pub rootfs_image_path: PathBuf,
    /// Absolute path to the Firecracker jailer binary, when configured.
    /// Optional; read from `CELLOS_FIRECRACKER_JAILER_BINARY`. See
    /// `require_jailer` for what happens when this is `None`.
    pub jailer_binary_path: Option<PathBuf>,
    /// Firecracker chroot base directory. Read from
    /// `CELLOS_FIRECRACKER_CHROOT_BASE`; defaults to
    /// `/var/lib/cellos/firecracker`.
    pub chroot_base_dir: PathBuf,
    /// Directory for Firecracker API socket files; defaults to `/tmp`.
    /// Ignored when the jailer is active — the jailer places the API socket
    /// inside the chroot at `<chroot_base>/<fc_filename>/<cell_id>/root/run/firecracker.socket`.
    pub socket_dir: PathBuf,
    /// Numeric uid the jailer drops to before exec'ing firecracker. Defaults to 10002.
    /// This user must exist on the host and must own no sensitive files.
    pub jailer_uid: u32,
    /// Numeric gid the jailer drops to before exec'ing firecracker. Defaults to 10002.
    pub jailer_gid: u32,
    /// Directory where per-cell writable scratch ext4 images are created.
    /// When set, rootfs is mounted read-only and a writable scratch drive is
    /// attached as a second virtio-blk device. When None (default), rootfs is
    /// mounted read-write (v0.2 behaviour — not safe for concurrent cells sharing
    /// the same rootfs image).
    pub scratch_dir: Option<PathBuf>,
    /// Path to the artifact manifest file produced by the build pipeline.
    /// When set, `create()` verifies the SHA256 digest of each declared
    /// artifact (kernel, rootfs, optionally firecracker) before booting the
    /// VM. When `None` AND `allow_no_manifest` is `true`, verification is
    /// skipped with a loud warning — supported for development only.
    /// Otherwise, `from_env()` rejects the configuration outright.
    pub manifest_path: Option<PathBuf>,
    /// When `true` (the default), `create()` refuses to proceed unless the
    /// jailer binary is configured. Operators may opt out for development
    /// by setting `CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1`, which logs a loud
    /// warning and downgrades this flag to `false`.
    pub require_jailer: bool,
    /// When `true`, `create()` will permit booting a VM without a manifest
    /// (skipping pre-boot artifact digest verification). Defaults to `false`
    /// — a missing `CELLOS_FIRECRACKER_MANIFEST` is a hard error. Operators
    /// may opt out for development by setting BOTH
    /// `CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1` AND the second escape-hatch
    /// flag `CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1`; only the
    /// combination flips this flag to `true` and emits a loud `WARN` log
    /// containing the literal string `MANIFEST VERIFICATION DISABLED`.
    ///
    /// The two-flag handshake is deliberate. A single env var can be set in a
    /// shared base image, a Helm chart copied between environments, or an
    /// `.env` file leaking from dev to prod by mistake. Requiring a paired
    /// `_REALLY` flag forces the operator to make the trade-off explicit on
    /// the same line, in the same operation, so the dev opt-out cannot drift
    /// into a production deployment unnoticed.
    ///
    /// It is an error to set both `CELLOS_FIRECRACKER_MANIFEST` and the
    /// two-flag opt-out — that combination is inconsistent and `from_env()`
    /// rejects it. Setting `CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1` without
    /// the paired `_REALLY` flag (or vice-versa) is also rejected with a
    /// hard error so misconfigured deployments fail closed.
    pub allow_no_manifest: bool,
    /// When true, `create()` provisions a per-cell TAP interface, attaches it
    /// to the VM as virtio-net, and installs an nftables ruleset that drops
    /// all egress except destinations declared in `spec.authority.egressRules`.
    /// Linux-only: defaults to `true` on Linux and `false` on every other OS
    /// (the `ip` and `nft` commands and TAP devices do not exist there).
    /// Override with `CELLOS_FIRECRACKER_ENABLE_NETWORK=0|1`.
    pub enable_network: bool,
    /// When `true`, [`wait_for_command_exit`] uses a short bounded timeout
    /// instead of waiting indefinitely for the in-VM vsock exit code.
    ///
    /// Motivation: a kernel built with the wrong vsock symbol
    /// (`CONFIG_VIRTIO_VSOCK` is a typo for `CONFIG_VIRTIO_VSOCKETS`) silently
    /// has no vsock device. The supervisor then waits forever for a 4-byte
    /// exit code that no one will ever send, and the only signal the operator
    /// gets is a hung process. Set
    /// `CELLOS_FIRECRACKER_ALLOW_NO_VSOCK=1` (and optionally
    /// `CELLOS_FIRECRACKER_NO_VSOCK_TIMEOUT_SECS=<n>` — default 5) to fail
    /// fast and surface the misconfiguration in seconds.
    ///
    /// The terminal state for the cell will be `forced` (no authenticated
    /// in-VM exit was received), which is the correct audit signal.
    pub allow_no_vsock: bool,
    /// Wait budget when `allow_no_vsock` is true. Ignored otherwise.
    /// Default: 5 seconds. Override via
    /// `CELLOS_FIRECRACKER_NO_VSOCK_TIMEOUT_SECS`.
    pub no_vsock_timeout: Duration,
    /// When `true`, passes `--no-seccomp` to the Firecracker process.
    ///
    /// Firecracker's seccomp BPF filters are compiled with x86-64 syscall
    /// numbers. Under arm64 emulation (Rosetta/QEMU in Colima) the BPF
    /// program is rejected by the kernel with EINVAL because the syscall
    /// table doesn't match. Set `CELLOS_FIRECRACKER_NO_SECCOMP=1` to bypass
    /// seccomp for emulated development environments.
    ///
    /// **Never set this in production.** Seccomp is a critical attack-surface
    /// reduction — bypassing it removes a significant isolation layer.
    pub no_seccomp: bool,
}
331
332impl FirecrackerConfig {
333    pub fn from_env() -> Result<Self, CellosError> {
334        Self::from_lookup(|key| std::env::var(key).ok())
335    }
336
337    pub(crate) fn from_lookup<F>(lookup: F) -> Result<Self, CellosError>
338    where
339        F: Fn(&str) -> Option<String>,
340    {
341        let cfg = Self {
342            binary_path: required_absolute_path(
343                &lookup,
344                "CELLOS_FIRECRACKER_BINARY",
345                "firecracker VMM binary",
346            )?,
347            kernel_image_path: required_absolute_path(
348                &lookup,
349                "CELLOS_FIRECRACKER_KERNEL_IMAGE",
350                "Firecracker kernel image",
351            )?,
352            rootfs_image_path: required_absolute_path(
353                &lookup,
354                "CELLOS_FIRECRACKER_ROOTFS_IMAGE",
355                "Firecracker rootfs image",
356            )?,
357            jailer_binary_path: optional_absolute_path(
358                &lookup,
359                "CELLOS_FIRECRACKER_JAILER_BINARY",
360                "Firecracker jailer binary",
361            )?,
362            chroot_base_dir: optional_absolute_path(
363                &lookup,
364                "CELLOS_FIRECRACKER_CHROOT_BASE",
365                "Firecracker chroot base directory",
366            )?
367            .unwrap_or_else(|| PathBuf::from("/var/lib/cellos/firecracker")),
368            socket_dir: optional_absolute_path(
369                &lookup,
370                "CELLOS_FIRECRACKER_SOCKET_DIR",
371                "Firecracker socket directory",
372            )?
373            .unwrap_or_else(|| PathBuf::from("/tmp")),
374            jailer_uid: lookup("CELLOS_FIRECRACKER_JAILER_UID")
375                .and_then(|v| v.parse().ok())
376                .unwrap_or(10002),
377            jailer_gid: lookup("CELLOS_FIRECRACKER_JAILER_GID")
378                .and_then(|v| v.parse().ok())
379                .unwrap_or(10002),
380            scratch_dir: optional_absolute_path(
381                &lookup,
382                "CELLOS_FIRECRACKER_SCRATCH_DIR",
383                "Firecracker scratch image directory",
384            )?,
385            manifest_path: optional_absolute_path(
386                &lookup,
387                "CELLOS_FIRECRACKER_MANIFEST",
388                "Firecracker artifact manifest file",
389            )?,
390            require_jailer: {
391                // Default to require_jailer=true. Operators may opt out by
392                // setting CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1; this is a
393                // production safety knob and we log loudly when it is used.
394                let allow_no_jailer = lookup("CELLOS_FIRECRACKER_ALLOW_NO_JAILER")
395                    .map(|v| v.trim() == "1")
396                    .unwrap_or(false);
397                if allow_no_jailer {
398                    tracing::warn!(
399                        "CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1 is set — running Firecracker WITHOUT the jailer. \
400                         This is unsafe for production and should only be used for local development."
401                    );
402                    false
403                } else {
404                    // Explicit override via CELLOS_FIRECRACKER_REQUIRE_JAILER, otherwise default true.
405                    lookup("CELLOS_FIRECRACKER_REQUIRE_JAILER")
406                        .map(|v| {
407                            let t = v.trim();
408                            // Accept common truthy values; anything else flips to false.
409                            !matches!(t, "0" | "false" | "FALSE" | "no" | "NO")
410                        })
411                        .unwrap_or(true)
412                }
413            },
414            allow_no_manifest: {
415                // Two-flag handshake (FC-05 / SEAM-23 hardening): the
416                // bypass requires BOTH `CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1`
417                // AND the paired escape-hatch `CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1`.
418                // A single env var can leak from a dev `.env` into a
419                // production rollout; requiring a second, explicitly named
420                // flag forces the operator to make the trade-off on the
421                // same line. We compute the boolean here and validate the
422                // combination explicitly — when only one of the two flags
423                // is set, return a *specific* error ("you set the first
424                // flag but not the second") instead of falling through to
425                // the generic "manifest is mandatory" message.
426                let primary = lookup("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST")
427                    .map(|v| v.trim() == "1")
428                    .unwrap_or(false);
429                let secondary = lookup("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY")
430                    .map(|v| v.trim() == "1")
431                    .unwrap_or(false);
432                if primary && !secondary {
433                    return Err(CellosError::Host(
434                        "firecracker init: CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 \
435                         requires the paired escape-hatch flag \
436                         CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 to take \
437                         effect. Without both flags set, manifest verification \
438                         remains mandatory (production posture). The two-flag \
439                         handshake exists so a dev `.env` cannot accidentally \
440                         disable digest verification in production — set both \
441                         on the same line, on purpose, or set neither and \
442                         provide CELLOS_FIRECRACKER_MANIFEST instead."
443                            .into(),
444                    ));
445                }
446                if secondary && !primary {
447                    return Err(CellosError::Host(
448                        "firecracker init: CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 \
449                         is set but CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 is not. \
450                         The escape-hatch is two-flag by design: set both to opt \
451                         out of pre-boot artifact digest verification (development \
452                         only), or unset both for the production posture."
453                            .into(),
454                    ));
455                }
456                primary && secondary
457            },
458            enable_network: parse_enable_network(&lookup),
459            allow_no_vsock: lookup("CELLOS_FIRECRACKER_ALLOW_NO_VSOCK")
460                .map(|v| v.trim() == "1")
461                .unwrap_or(false),
462            no_vsock_timeout: lookup("CELLOS_FIRECRACKER_NO_VSOCK_TIMEOUT_SECS")
463                .and_then(|v| v.trim().parse::<u64>().ok())
464                .map(Duration::from_secs)
465                .unwrap_or_else(|| Duration::from_secs(5)),
466            no_seccomp: lookup("CELLOS_FIRECRACKER_NO_SECCOMP")
467                .map(|v| v.trim() == "1")
468                .unwrap_or(false),
469        };
470
471        // FC-41: reject jailer_uid/gid == 0. Without this gate an operator
472        // can set `CELLOS_FIRECRACKER_JAILER_UID=0` and silently run as root.
473        // Error carries "FC-41" for audit searchability.
474        if cfg.jailer_uid == 0 {
475            return Err(CellosError::Host(
476                "FirecrackerConfig: jailer_uid must be non-zero (running jailer as root defeats the privilege boundary) [FC-41]"
477                    .into(),
478            ));
479        }
480        if cfg.jailer_gid == 0 {
481            return Err(CellosError::Host(
482                "FirecrackerConfig: jailer_gid must be non-zero (running jailer in root group defeats the privilege boundary) [FC-41]"
483                    .into(),
484            ));
485        }
486
487        if cfg.allow_no_vsock {
488            tracing::warn!(
489                timeout_secs = cfg.no_vsock_timeout.as_secs(),
490                "CELLOS_FIRECRACKER_ALLOW_NO_VSOCK=1 is set — vsock exit-code wait \
491                 will time out after the configured budget instead of blocking. \
492                 Cell terminal state will be `forced` (no authenticated in-VM exit). \
493                 This is intended for development against kernels without vsock \
494                 support; production deployments MUST keep this off."
495            );
496        }
497
498        if cfg.no_seccomp {
499            tracing::warn!(
500                "CELLOS_FIRECRACKER_NO_SECCOMP=1 is set — Firecracker will start \
501                 with --no-seccomp. Seccomp syscall filtering is DISABLED. This is \
502                 only safe for emulated development environments (e.g. arm64 Rosetta) \
503                 where the x86-64 BPF filters are rejected by the host kernel. \
504                 NEVER set this in production."
505            );
506        }
507
508        // Manifest enforcement. Mirrors the jailer guard above: production
509        // deployments MUST verify kernel/rootfs/firecracker artifact digests
510        // before boot. Operators may opt out for development by setting BOTH
511        // CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 AND
512        // CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 (the two-flag
513        // handshake validated when `cfg.allow_no_manifest` was assembled
514        // above); we log loudly when that combination is used. Path-required
515        // errors (binary/kernel/rootfs missing) take precedence over the
516        // manifest gate so callers see the most specific misconfiguration
517        // first.
518        match (cfg.manifest_path.is_some(), cfg.allow_no_manifest) {
519            (true, true) => {
520                // Inconsistent config — fail closed rather than silently
521                // picking one interpretation.
522                return Err(CellosError::Host(
523                    "firecracker init: CELLOS_FIRECRACKER_MANIFEST is set AND \
524                     the two-flag opt-out (CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 \
525                     plus CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1) is also \
526                     set — these are mutually exclusive. Unset both opt-out flags \
527                     to perform manifest verification, or unset \
528                     CELLOS_FIRECRACKER_MANIFEST to run in dev mode without it."
529                        .into(),
530                ));
531            }
532            (false, false) => {
533                return Err(CellosError::Host(
534                    "firecracker init: CELLOS_FIRECRACKER_MANIFEST is not set \
535                     — pre-boot artifact digest verification is mandatory by \
536                     default. Set CELLOS_FIRECRACKER_MANIFEST to a v1 \
537                     manifest path, or set BOTH \
538                     CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 AND \
539                     CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 to opt \
540                     out (development only — the second flag is a deliberate \
541                     speed-bump to keep dev opt-outs from leaking into prod)."
542                        .into(),
543                ));
544            }
545            (false, true) => {
546                tracing::warn!(
547                    "MANIFEST VERIFICATION DISABLED — CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 \
548                     and CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 are both set. \
549                     Booting Firecracker cells WITHOUT pre-boot artifact digest verification. \
550                     This is unsafe for production and should only be used for local development."
551                );
552            }
553            (true, false) => {
554                // Standard production posture: manifest set, opt-out unset.
555            }
556        }
557
558        Ok(cfg)
559    }
560}
561
562/// Parse `CELLOS_FIRECRACKER_ENABLE_NETWORK`.  Accepts `1`/`true`/`yes`/`on`
563/// (case-insensitive) as true and `0`/`false`/`no`/`off` as false; any other
564/// value falls back to the platform default.  On non-Linux hosts where the
565/// default is `false`, log a warning the first time the value is read so the
566/// operator knows TAP/nftables are not active in their dev environment.
567fn parse_enable_network<F>(lookup: &F) -> bool
568where
569    F: Fn(&str) -> Option<String>,
570{
571    let raw = lookup("CELLOS_FIRECRACKER_ENABLE_NETWORK");
572    let parsed = raw
573        .as_deref()
574        .and_then(|v| match v.trim().to_ascii_lowercase().as_str() {
575            "1" | "true" | "yes" | "on" => Some(true),
576            "0" | "false" | "no" | "off" | "" => Some(false),
577            _ => None,
578        });
579    let enabled = parsed.unwrap_or(NETWORK_DEFAULT_ENABLED);
580    #[cfg(not(target_os = "linux"))]
581    {
582        if enabled {
583            tracing::warn!(
584                "CELLOS_FIRECRACKER_ENABLE_NETWORK requested on non-Linux host — \
585                 TAP and nftables enforcement are Linux-only and will fail at runtime"
586            );
587        } else if raw.is_none() {
588            tracing::warn!(
589                "firecracker network enforcement disabled by default on non-Linux host \
590                 (set CELLOS_FIRECRACKER_ENABLE_NETWORK=1 to override; runtime calls will error)"
591            );
592        }
593    }
594    enabled
595}
596
597// ── Live VM record ────────────────────────────────────────────────────────────
598
/// Per-cell bookkeeping for one live Firecracker VM.
///
/// Holds the handles and host-side resources (process, sockets, chroot,
/// scratch image, TAP interface) that the field docs below describe as being
/// set up in `create()` and torn down in `destroy()`.
#[cfg(target_os = "linux")]
struct VmRecord {
    // NOTE(review): presumably the Firecracker API socket path for this VM
    // (the one `destroy()` removes) — confirm against `create()`/`destroy()`.
    socket_path: PathBuf,
    /// Base path used for the vsock UDS (`<vsock_uds_path>_<port>`).
    vsock_uds_path: PathBuf,
    /// The spawned Firecracker process (kept alive for the lifetime of the cell).
    child: Child,
    /// Receives the `cellos-init` exit code reported over vsock.
    /// `None` until the guest connects and writes the 4-byte value; `Some(code)`
    /// once it is received.
    exit_rx: tokio::sync::watch::Receiver<Option<i32>>,
    /// When the jailer is in use, the chroot tree the jailer created for this
    /// cell (e.g. `<chroot_base>/firecracker/<cell_id>`). `destroy()` removes
    /// this tree on best-effort basis. `None` when the jailer is not in use.
    chroot_cell_dir: Option<PathBuf>,
    /// Path to the per-cell scratch ext4 image (when scratch_dir is configured).
    /// Set to `Some(path)` if a scratch image was created in `create()`; cleaned
    /// up in `destroy()`.
    scratch_image_path: Option<PathBuf>,
    /// Name of the host TAP interface created for this cell, when network
    /// enforcement is enabled.  `None` when `enable_network` is false.
    /// Used by `destroy()` to tear down the TAP and nftables table.
    tap_iface: Option<String>,
    /// Per-cell graceful-shutdown timeout (FC-21). Captured at `create()` from
    /// `spec.run.limits.graceful_shutdown_seconds`, falling back to
    /// [`GRACEFUL_SHUTDOWN_TIMEOUT`]. Read in `destroy()` when waiting for the
    /// VM to exit gracefully before SIGKILL.
    graceful_shutdown_timeout: Duration,
}
628
629// ── Backend ───────────────────────────────────────────────────────────────────
630
/// Firecracker-backed [`CellBackend`] (L2-06).
///
/// On non-Linux hosts the live-VM table collapses to an unused unit field —
/// the backend still constructs (so the supervisor's composition root keeps
/// compiling) but every [`CellBackend`] method short-circuits to an
/// `Unsupported`-shaped [`CellosError::Host`]. This shapes Windows/macOS
/// `cargo check` builds without dragging Linux-only kernel surface
/// (`tokio::net::UnixStream`, TAP, nftables) into the cross-platform side
/// of the workspace.
pub struct FirecrackerCellBackend {
    /// Backend configuration supplied to the constructor.
    config: FirecrackerConfig,
    /// Live-VM table (Linux only), shared behind an async mutex.
    /// NOTE(review): the `String` key is presumably the cell id — confirm
    /// against the insert site in `create()`.
    #[cfg(target_os = "linux")]
    running_vms: Arc<Mutex<HashMap<String, VmRecord>>>,
    /// L2-06-2 warm pool of pre-booted/snapshotted VMs. `checkout` is consulted
    /// at the head of `create()` so a populated pool slot bypasses the
    /// ~125ms cold-boot path in favour of a ~10ms `PUT /snapshot/load`.
    /// Constructed with [`pool::pool_size_from_env()`] slots — when the
    /// `CELLOS_FIRECRACKER_POOL_SIZE` env var is unset/zero, the pool is a
    /// zero-slot no-op and `checkout` always yields `None`, making the
    /// wiring an inert pass-through on the cold-boot path.
    #[cfg(target_os = "linux")]
    pool: Arc<Mutex<pool::FirecrackerPool>>,
    /// Optional CellOS event sink. When set, the backend emits a
    /// `dev.cellos.events.cell.firecracker.v1.pool_checkout` CloudEvent
    /// after every warm-pool checkout attempt in `create()` (best-effort —
    /// emit failures are logged but do not fail VM creation). Left `None`
    /// in tests and in compositions that do not opt in.
    event_sink: Option<Arc<dyn cellos_core::ports::EventSink>>,
}
660
661impl FirecrackerCellBackend {
662    #[cfg(target_os = "linux")]
663    pub fn new(config: FirecrackerConfig) -> Self {
664        let pool_size = pool::pool_size_from_env();
665        Self {
666            config,
667            running_vms: Arc::new(Mutex::new(HashMap::new())),
668            pool: Arc::new(Mutex::new(pool::FirecrackerPool::new(pool_size))),
669            event_sink: None,
670        }
671    }
672
673    /// Non-Linux constructor for the compile-time stub. The backend cannot be
674    /// driven on Windows/macOS — Firecracker is Linux-only — so any
675    /// [`CellBackend`] method returns a clear error. The constructor itself
676    /// succeeds so that the supervisor's composition root, which gates
677    /// runtime selection on the `CELLOS_CELL_BACKEND=firecracker` env var,
678    /// still type-checks on every host.
679    #[cfg(not(target_os = "linux"))]
680    pub fn new(config: FirecrackerConfig) -> Self {
681        Self {
682            config,
683            event_sink: None,
684        }
685    }
686
687    pub fn from_env() -> Result<Self, CellosError> {
688        Ok(Self::new(FirecrackerConfig::from_env()?))
689    }
690
691    /// Attach a CellOS [`EventSink`](cellos_core::ports::EventSink) for
692    /// best-effort emission of warm-pool checkout CloudEvents.
693    ///
694    /// When set, every `create()` call emits one
695    /// `dev.cellos.events.cell.firecracker.v1.pool_checkout` event after
696    /// consulting the warm pool, recording whether the boot took the
697    /// snapshot fast path (`poolHit`) and the pre-checkout `Available` slot
698    /// count. Emission failures are logged at `warn` and never abort VM
699    /// creation — the audit event must not become a critical-path
700    /// dependency.
701    pub fn with_event_sink(mut self, event_sink: Arc<dyn cellos_core::ports::EventSink>) -> Self {
702        self.event_sink = Some(event_sink);
703        self
704    }
705
706    pub fn config(&self) -> &FirecrackerConfig {
707        &self.config
708    }
709
710    /// Number of warm-pool slots configured (any state). Returns the value of
711    /// `CELLOS_FIRECRACKER_POOL_SIZE` resolved at backend construction. Useful
712    /// for the supervisor composition root to decide whether to spawn the
713    /// background fill task at all.
714    #[cfg(target_os = "linux")]
715    pub async fn pool_size(&self) -> usize {
716        self.pool.lock().await.size()
717    }
718
719    /// Off-Linux stub — Firecracker is Linux-only, so the pool is always empty.
720    #[cfg(not(target_os = "linux"))]
721    pub async fn pool_size(&self) -> usize {
722        0
723    }
724
725    /// Number of warm-pool slots currently in `Available` state (callable from
726    /// tests to observe that a fill cycle has run).
727    #[cfg(target_os = "linux")]
728    pub async fn pool_available(&self) -> usize {
729        self.pool.lock().await.available()
730    }
731
732    /// Off-Linux stub.
733    #[cfg(not(target_os = "linux"))]
734    pub async fn pool_available(&self) -> usize {
735        0
736    }
737
738    /// Drive one `fill()` cycle on the warm pool using the validated
739    /// firecracker binary / kernel / rootfs paths from [`FirecrackerConfig`].
740    ///
741    /// Best-effort: per-slot failures are logged and leave the slot `Empty`
742    /// (see [`pool::FirecrackerPool::fill`]). Intended to be called once at
743    /// supervisor startup from a detached `tokio::spawn` so that subsequent
744    /// `create()` calls can take the fast snapshot-restore path.
745    #[cfg(target_os = "linux")]
746    pub async fn fill_pool(&self) {
747        let binary = self.config.binary_path.to_string_lossy().into_owned();
748        let kernel = self.config.kernel_image_path.to_string_lossy().into_owned();
749        let rootfs = self.config.rootfs_image_path.to_string_lossy().into_owned();
750        let mut pool = self.pool.lock().await;
751        pool.fill(&binary, &kernel, &rootfs).await;
752    }
753
754    /// Off-Linux stub — Firecracker is Linux-only, so there is no pool to fill.
755    #[cfg(not(target_os = "linux"))]
756    pub async fn fill_pool(&self) {
757        tracing::debug!("FirecrackerCellBackend::fill_pool no-op: target_os != linux");
758    }
759
760    /// Number of VMs the backend currently tracks (for tests and operators).
761    #[cfg(target_os = "linux")]
762    pub async fn tracked_vm_count(&self) -> usize {
763        self.running_vms.lock().await.len()
764    }
765
766    /// Number of VMs the backend currently tracks (for tests and operators).
767    /// Always zero on non-Linux hosts — the backend cannot run there.
768    #[cfg(not(target_os = "linux"))]
769    pub async fn tracked_vm_count(&self) -> usize {
770        0
771    }
772
773    /// Wait until `cellos-init` inside the VM reports the cell command's exit
774    /// code over vsock, then return it.
775    ///
776    /// Returns `Err` if the cell is not tracked or the vsock channel closes
777    /// before the exit code arrives.  The lock is released before awaiting, so
778    /// other operations on the backend can proceed concurrently.
779    #[cfg(target_os = "linux")]
780    pub async fn wait_for_command_exit(&self, cell_id: &str) -> Result<i32, CellosError> {
781        let mut exit_rx = {
782            let vms = self.running_vms.lock().await;
783            let record = vms.get(cell_id).ok_or_else(|| {
784                CellosError::Host(format!(
785                    "wait_for_command_exit: no VM tracked for cell {cell_id}"
786                ))
787            })?;
788            record.exit_rx.clone()
789        };
790
791        let wait_loop = async {
792            // Wait until Some(code) is set by the vsock listener background task.
793            loop {
794                if let Some(code) = *exit_rx.borrow() {
795                    return Ok::<i32, CellosError>(code);
796                }
797                exit_rx.changed().await.map_err(|_| {
798                    CellosError::Host(format!(
799                        "vsock exit channel for cell {cell_id} closed without exit code"
800                    ))
801                })?;
802            }
803        };
804
805        if self.config.allow_no_vsock {
806            // Bounded wait. Surfaces "guest kernel has no vsock" in seconds
807            // instead of an indefinite hang. Caller maps the timeout error to
808            // a `forced` terminal state in the lifecycle event.
809            match tokio::time::timeout(self.config.no_vsock_timeout, wait_loop).await {
810                Ok(result) => result,
811                Err(_) => Err(CellosError::Host(format!(
812                    "vsock exit-code wait timed out after {}s for cell {} \
813                     (CELLOS_FIRECRACKER_ALLOW_NO_VSOCK=1). Most likely cause: \
814                     guest kernel has no virtio-vsock support — verify \
815                     CONFIG_VIRTIO_VSOCKETS=y (NOT VIRTIO_VSOCK — that's a \
816                     non-existent symbol) in scripts/firecracker/kernel.config \
817                     and rebuild. Set CELLOS_FIRECRACKER_ALLOW_NO_VSOCK=0 to \
818                     wait indefinitely.",
819                    self.config.no_vsock_timeout.as_secs(),
820                    cell_id,
821                ))),
822            }
823        } else {
824            wait_loop.await
825        }
826    }
827}
828
829#[cfg(target_os = "linux")]
830#[async_trait]
831impl CellBackend for FirecrackerCellBackend {
832    /// Boot a Firecracker microVM for the cell.
833    ///
834    /// The VM is configured with the image paths from [`FirecrackerConfig`].
835    /// If `spec.environment.imageDigest` is set it is recorded but not yet
836    /// verified by this crate (digest verification is a future L2-06 milestone).
837    #[instrument(skip(self, spec), fields(cell_id = %spec.spec.id))]
838    async fn create(&self, spec: &ExecutionCellDocument) -> Result<CellHandle, CellosError> {
839        if spec.spec.id.is_empty() {
840            return Err(CellosError::InvalidSpec("spec.id must be non-empty".into()));
841        }
842
843        // Mandatory jailer enforcement. Production deployments must run
844        // Firecracker under the jailer (chroot + uid/gid drop + seccomp).
845        // Operators may opt out for development by setting
846        // CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1 which flips this flag to false.
847        if self.config.require_jailer && self.config.jailer_binary_path.is_none() {
848            return Err(CellosError::Host(
849                "jailer is required for production use (set CELLOS_FIRECRACKER_JAILER_BINARY or CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1 to opt out)"
850                    .into(),
851            ));
852        }
853
854        // Refuse to start a cell that declares egress when the host backend
855        // cannot enforce it.  This is a fail-closed guard: silently ignoring
856        // egressRules would let a spec believe its network is locked down when
857        // in fact the VM has unrestricted host networking (or none, depending
858        // on the operator's TAP setup).
859        let declared_egress: &[EgressRule] = spec
860            .spec
861            .authority
862            .egress_rules
863            .as_deref()
864            .unwrap_or_default();
865        if !self.config.enable_network && !declared_egress.is_empty() {
866            return Err(CellosError::Host(
867                "spec declares egress_rules but network enforcement is disabled \
868                 (set CELLOS_FIRECRACKER_ENABLE_NETWORK=1)"
869                    .into(),
870            ));
871        }
872
873        // Unique run token per cell.
874        let run_token = Uuid::new_v4();
875
876        // Compute the API socket path. With jailer active this is inside the
877        // chroot; without the jailer it is a per-run file in `socket_dir`.
878        let socket_path = resolve_socket_path(&self.config, &spec.spec.id, &run_token);
879
880        // Log declared environment if present.
881        if let Some(env) = &spec.spec.environment {
882            tracing::info!(
883                image_reference = %env.image_reference,
884                image_digest = env.image_digest.as_deref().unwrap_or("(not pinned)"),
885                template_id = env.template_id.as_deref().unwrap_or("(none)"),
886                "cell environment declared"
887            );
888        }
889
890        // Build the spawn command. When the jailer is configured, invoke it
891        // instead of firecracker directly so the VMM ends up in a chroot under
892        // a dedicated uid/gid. Otherwise (development mode) spawn firecracker
893        // directly.
894        //
895        // String fields passed via `args(...)` must outlive the array, so we
896        // materialize the path strings first and then borrow them.
897        let mut cmd = if let Some(jailer_bin) = &self.config.jailer_binary_path {
898            let exec_file_str = self.config.binary_path.to_string_lossy().into_owned();
899            let uid_str = self.config.jailer_uid.to_string();
900            let gid_str = self.config.jailer_gid.to_string();
901            let chroot_str = self.config.chroot_base_dir.to_string_lossy().into_owned();
902            let mut c = tokio::process::Command::new(jailer_bin);
903            let argv = build_jailer_argv(
904                spec.spec.id.as_str(),
905                exec_file_str.as_str(),
906                uid_str.as_str(),
907                gid_str.as_str(),
908                chroot_str.as_str(),
909                self.config.no_seccomp,
910            );
911            c.args(&argv);
912            c
913        } else {
914            let socket_str = socket_path.to_string_lossy().into_owned();
915            let mut c = tokio::process::Command::new(&self.config.binary_path);
916            let argv = build_direct_argv(socket_str.as_str(), self.config.no_seccomp);
917            c.args(&argv);
918            c
919        };
920        cmd.kill_on_drop(true);
921
922        let child = cmd.spawn().map_err(|e| {
923            let bin = if let Some(j) = &self.config.jailer_binary_path {
924                j.display().to_string()
925            } else {
926                self.config.binary_path.display().to_string()
927            };
928            let label = if self.config.jailer_binary_path.is_some() {
929                "jailer"
930            } else {
931                "firecracker"
932            };
933            CellosError::Host(format!("spawn {label} ({bin}): {e}"))
934        })?;
935
936        tracing::info!(
937            cell_id = %spec.spec.id,
938            socket = %socket_path.display(),
939            "firecracker process spawned"
940        );
941
942        // Wait for the socket to become available.
943        let client = FirecrackerApiClient::new(&socket_path);
944        wait_for_socket_ready(&socket_path, SOCKET_READY_TIMEOUT).await?;
945
946        // Derive the vsock UDS base path.  Firecracker will connect to
947        // `<vsock_uds_path>_<port>` when the guest initiates a connection.
948        let vsock_uds_path = self.config.socket_dir.join(format!(
949            "cellos-vsock-{}-{}.socket",
950            spec.spec.id, run_token
951        ));
952
953        // FC-18: generate a fresh per-cell HMAC key BEFORE binding the
954        // listener, so the same key value flows into both the listener task
955        // (for verification) and the kernel cmdline (for the guest's tag
956        // computation). A short-read on /dev/urandom is fatal — refuse to
957        // launch a cell with weak/known authentication material.
958        let exit_hmac_key = generate_exit_hmac_key()?;
959
960        // Start listening for the exit-code report from cellos-init.
961        // We must bind the socket BEFORE booting the VM so no connection is missed.
962        let (exit_watch_tx, exit_watch_rx) = tokio::sync::watch::channel::<Option<i32>>(None);
963        let exit_socket_path = PathBuf::from(format!("{}_9000", vsock_uds_path.display()));
964        let exit_socket_path_bg = exit_socket_path.clone();
965        let listener_key = exit_hmac_key;
966        let listener_cell_id = spec.spec.id.clone();
967        tokio::spawn(async move {
968            match listen_for_exit_code(&exit_socket_path_bg, &listener_key, &listener_cell_id).await
969            {
970                Ok(code) => {
971                    let _ = exit_watch_tx.send(Some(code));
972                }
973                Err(e) => {
974                    tracing::warn!(error = %e, "vsock exit-code listener failed");
975                }
976            }
977            let _ = std::fs::remove_file(&exit_socket_path_bg);
978        });
979
980        // Optionally create a per-cell scratch ext4 image.
981        // When scratch_dir is set, the rootfs will be mounted read-only and this
982        // writable drive will be attached as /dev/vdb.
983        let scratch_image_path = if let Some(scratch_dir) = &self.config.scratch_dir {
984            std::fs::create_dir_all(scratch_dir).map_err(|e| {
985                CellosError::Host(format!("create scratch_dir {}: {e}", scratch_dir.display()))
986            })?;
987            let scratch_path = scratch_dir.join(format!(
988                "cellos-scratch-{}-{}.ext4",
989                spec.spec.id, run_token
990            ));
991            let scratch_mib = spec
992                .spec
993                .run
994                .as_ref()
995                .and_then(|r| r.limits.as_ref())
996                .and_then(|l| l.memory_max_bytes)
997                .map(|b| ((b / (1024 * 1024)) as u32).clamp(64, 2048))
998                .unwrap_or(512);
999            create_scratch_image(&scratch_path, scratch_mib).await?;
1000            Some(scratch_path)
1001        } else {
1002            None
1003        };
1004
1005        // Provision per-cell network isolation: TAP device + nftables ruleset.
1006        // The TAP is created BEFORE the API configuration call so Firecracker
1007        // can `open(/dev/net/tun)` and bind to a device that already exists and
1008        // is owned by the uid the VMM will drop to.  Failure here is fatal —
1009        // we do not boot a cell that declared egress without enforcement.
1010        let cell_short = cell_id_short(&spec.spec.id);
1011        let tap_iface = if self.config.enable_network {
1012            let name = create_tap_device(&cell_short, self.config.jailer_uid).await?;
1013            if let Err(e) = apply_network_policy(&cell_short, &name, declared_egress).await {
1014                // TAP was created; nftables were not applied. Clean up the TAP
1015                // before surfacing the error so we don't orphan the device.
1016                let _ = delete_tap_device(&name).await;
1017                return Err(e);
1018            }
1019            Some(name)
1020        } else {
1021            None
1022        };
1023
1024        // L2-06-2 warm-pool fast path. Consult the pool BEFORE the cold-boot
1025        // `configure_vm` + `InstanceStart` sequence. A populated slot lets us
1026        // skip ~125ms of kernel decompression + init handshake in favour of a
1027        // ~10ms `PUT /snapshot/load`. When the pool is disabled
1028        // (`CELLOS_FIRECRACKER_POOL_SIZE` unset/zero) or has no `Available`
1029        // slot, `checkout` returns `None` and we drop through to the
1030        // cold-boot path verbatim.
1031        //
1032        // Path discipline: `fill_one_slot` writes the paired snapshot files
1033        // at `/tmp/cellos-pool-<vm_id>.snap` (state) and
1034        // `/tmp/cellos-pool-<vm_id>.mem` (memory dump). `checkout` returns
1035        // only the state path; the mem path is derived by suffix-swap so
1036        // the contract stays a single `Option<PathBuf>` instead of widening
1037        // to a 2-tuple. If the on-disk convention diverges from this swap,
1038        // the restore will fail and the caller (this site) falls back to
1039        // cold boot — the warm pool is a best-effort latency optimisation,
1040        // not a correctness gate (mirrors the doc on `FirecrackerPool::fill`).
1041        let (pool_snapshot, pre_checkout_available): (Option<(PathBuf, PathBuf)>, usize) = {
1042            let mut pool = self.pool.lock().await;
1043            // Capture Available slot count *before* checkout — the audit
1044            // event records the supply observed at decision time, not the
1045            // post-decrement count.
1046            let pre_available = pool.available();
1047            let snap = pool.checkout(&spec.spec.id).await.map(|snap_path| {
1048                let mem_path = snap_path.with_extension("mem");
1049                (snap_path, mem_path)
1050            });
1051            (snap, pre_available)
1052        };
1053
1054        // FC-warm-pool audit: emit a single `pool_checkout` CloudEvent per
1055        // create() call, recording whether the fast path was taken and the
1056        // pool's pre-checkout supply. Best-effort: a sink failure is logged
1057        // at warn level and never converted into a create() failure. When
1058        // no event sink is wired (the default in tests and minimal
1059        // compositions) this branch is a no-op.
1060        if let Some(ref event_sink) = self.event_sink {
1061            let event = cellos_core::events::cloud_event_v1_firecracker_pool_checkout(
1062                "cellos-host-firecracker",
1063                &chrono::Utc::now().to_rfc3339(),
1064                &spec.spec.id,
1065                pool_snapshot.is_some(),
1066                pre_checkout_available,
1067            );
1068            if let Err(e) = event_sink.emit(&event).await {
1069                tracing::warn!(
1070                    target: "cellos.host.firecracker",
1071                    cell_id = %spec.spec.id,
1072                    error = %e,
1073                    "pool_checkout CloudEvent emit failed (best-effort)"
1074                );
1075            }
1076        }
1077
1078        // Configure the VM, verify artifact digests, and boot — all three can
1079        // fail.  If any step fails after TAP+nftables are provisioned, clean
1080        // them up before returning so resources track live cells only.
1081        let boot_result: Result<VerifiedDigests, CellosError> = async {
1082            if let Some((snap_path, mem_path)) = pool_snapshot.as_ref() {
1083                tracing::info!(
1084                    cell_id = %spec.spec.id,
1085                    snapshot = %snap_path.display(),
1086                    mem = %mem_path.display(),
1087                    "warm-pool fast path: attempting PUT /snapshot/load"
1088                );
1089                // Manifest digest verification still applies — a snapshotted
1090                // VM is only as trustworthy as the kernel/rootfs/firecracker
1091                // binaries it was captured from. We verify against the
1092                // current config's pinned digests so a swapped-out artifact
1093                // on disk still trips the manifest gate.
1094                let verified = verify_artifacts(&self.config).await?;
1095                // `restore_into` issues `PUT /snapshot/load` with
1096                // `resume_vm: true`, so the VM is running on return — no
1097                // separate `InstanceStart` call is needed on this path.
1098                pool::restore_into(&client, snap_path, mem_path).await?;
1099                return Ok(verified);
1100            }
1101
1102            configure_vm(
1103                &client,
1104                &self.config,
1105                spec,
1106                &vsock_uds_path,
1107                scratch_image_path.as_deref(),
1108                tap_iface.as_deref(),
1109                &exit_hmac_key,
1110            )
1111            .await?;
1112
1113            // Verify artifact digests BEFORE booting the VM. If any digest does
1114            // not match the manifest, refuse to start the cell — running an
1115            // unverified kernel/rootfs/firecracker binary in a production
1116            // deployment is exactly the kind of supply-chain failure mode the
1117            // manifest exists to prevent.
1118            //
1119            // FC-08: capture the verified digests so we can surface them on
1120            // the [`CellHandle`] for the supervisor to embed on
1121            // `cell.lifecycle.v1.started`.
1122            let verified = verify_artifacts(&self.config).await?;
1123
1124            // Boot the VM.
1125            let status = client
1126                .put(
1127                    "/actions",
1128                    &InstanceAction {
1129                        action_type: InstanceActionType::InstanceStart,
1130                    },
1131                )
1132                .await?;
1133
1134            if !status.is_success() {
1135                return Err(CellosError::Host(format!(
1136                    "firecracker InstanceStart returned HTTP {status}"
1137                )));
1138            }
1139
1140            Ok(verified)
1141        }
1142        .await;
1143
1144        let verified_digests = match boot_result {
1145            Ok(v) => v,
1146            Err(e) => {
1147                if let Some(ref tap) = tap_iface {
1148                    let _ = delete_tap_device(tap).await;
1149                    let _ = remove_network_policy(&cell_short).await;
1150                }
1151                // If the warm-pool fast path failed mid-restore, release the
1152                // slot back to `Empty` so a background filler can re-populate
1153                // it from a fresh boot. Leaving the slot in `InUse` would
1154                // permanently shrink the warm pool every time a restore
1155                // errors. `checkin` is a no-op when the slot wasn't held by
1156                // this cell, so the call is safe regardless of whether the
1157                // fast path was taken.
1158                if pool_snapshot.is_some() {
1159                    let _ = self.pool.lock().await.checkin(&spec.spec.id).await;
1160                }
1161                return Err(e);
1162            }
1163        };
1164
1165        tracing::info!(cell_id = %spec.spec.id, "firecracker VM booted");
1166
1167        // Track the live VM.
1168        let chroot_cell_dir = self.config.jailer_binary_path.as_ref().map(|_| {
1169            let fc_name = self
1170                .config
1171                .binary_path
1172                .file_name()
1173                .expect("firecracker binary path must have a filename")
1174                .to_string_lossy()
1175                .into_owned();
1176            self.config
1177                .chroot_base_dir
1178                .join(fc_name)
1179                .join(&spec.spec.id)
1180        });
1181        // Capture the nft enforcement signal *before* `tap_iface` moves into
1182        // `VmRecord`. Surfacing this on the returned [`CellHandle`] lets the
1183        // supervisor emit a `network_enforcement` CloudEvent on the in-VM exit
1184        // path with parity to the host-subprocess path: `Some(true)` when a TAP
1185        // was provisioned (which implies `apply_network_policy` ran), or
1186        // `Some(false)` when networking was explicitly disabled.
1187        let nft_rules_applied = Some(tap_iface.is_some());
1188
1189        let graceful_shutdown_timeout = resolve_graceful_shutdown_timeout(&spec.spec);
1190
1191        self.running_vms.lock().await.insert(
1192            spec.spec.id.clone(),
1193            VmRecord {
1194                socket_path,
1195                vsock_uds_path,
1196                child,
1197                exit_rx: exit_watch_rx,
1198                chroot_cell_dir,
1199                scratch_image_path,
1200                tap_iface,
1201                graceful_shutdown_timeout,
1202            },
1203        );
1204
1205        Ok(CellHandle {
1206            cell_id: spec.spec.id.clone(),
1207            cgroup_path: None,
1208            nft_rules_applied,
1209            // FC-08: surface the verified manifest digests so the supervisor
1210            // can stamp them onto cell.lifecycle.v1.started. When manifest
1211            // verification was skipped (allow_no_manifest opt-out), all three
1212            // are `None` and the started event omits the fields entirely.
1213            kernel_digest_sha256: verified_digests.kernel,
1214            rootfs_digest_sha256: verified_digests.rootfs,
1215            firecracker_digest_sha256: verified_digests.firecracker,
1216        })
1217    }
1218
1219    /// Wait for `cellos-init` inside the VM to report the cell command's exit
1220    /// code over vsock, then return it.
1221    ///
1222    /// This overrides the default `None` so the supervisor skips its host-side
1223    /// `run_cell_command` path and waits for the in-VM result instead.
1224    async fn wait_for_in_vm_exit(&self, cell_id: &str) -> Option<Result<i32, CellosError>> {
1225        Some(self.wait_for_command_exit(cell_id).await)
1226    }
1227
1228    /// Gracefully shut down the Firecracker VM, then SIGKILL if it does not
1229    /// exit within the cell's graceful-shutdown window.
1230    ///
1231    /// The window is the per-spec `run.limits.gracefulShutdownSeconds` (FC-21),
1232    /// captured into [`VmRecord::graceful_shutdown_timeout`] at `create()`,
1233    /// or [`GRACEFUL_SHUTDOWN_TIMEOUT`] when the spec omits the field.
1234    #[instrument(skip(self, handle), fields(cell_id = %handle.cell_id))]
1235    async fn destroy(&self, handle: &CellHandle) -> Result<TeardownReport, CellosError> {
1236        let mut vms = self.running_vms.lock().await;
1237        let Some(mut record) = vms.remove(&handle.cell_id) else {
1238            tracing::warn!(cell_id = %handle.cell_id, "destroy called on unknown cell");
1239            return Ok(TeardownReport {
1240                cell_id: handle.cell_id.clone(),
1241                destroyed: false,
1242                peers_tracked_after: vms.len(),
1243            });
1244        };
1245
1246        // Try graceful shutdown via the API.
1247        let client = FirecrackerApiClient::new(&record.socket_path);
1248        let graceful = client
1249            .put(
1250                "/actions",
1251                &InstanceAction {
1252                    action_type: InstanceActionType::SendCtrlAltDel,
1253                },
1254            )
1255            .await;
1256
1257        if let Err(e) = graceful {
1258            tracing::debug!(error = %e, "graceful shutdown request failed — will SIGKILL");
1259        }
1260
1261        // Wait for the process to exit, or SIGKILL after timeout. FC-21: prefer
1262        // the per-spec window captured at create() — falls back to the const.
1263        let exited = tokio::time::timeout(record.graceful_shutdown_timeout, record.child.wait())
1264            .await
1265            .ok();
1266
1267        if exited.is_none() {
1268            tracing::warn!(cell_id = %handle.cell_id, "VM did not exit gracefully — sending SIGKILL");
1269            let _ = record.child.kill().await;
1270            let _ = record.child.wait().await;
1271        }
1272
1273        // Clean up socket files (best effort).
1274        if record.socket_path.exists() {
1275            let _ = std::fs::remove_file(&record.socket_path);
1276        }
1277        // Best-effort cleanup of jailer chroot tree. Only present when the
1278        // jailer was active for this cell.
1279        if let Some(chroot_dir) = record.chroot_cell_dir {
1280            let _ = std::fs::remove_dir_all(&chroot_dir);
1281        }
1282        // Also clean up the vsock UDS base file and any per-port listener sockets.
1283        let _ = std::fs::remove_file(&record.vsock_uds_path);
1284        let vsock_exit_socket = PathBuf::from(format!(
1285            "{}_{VSOCK_EXIT_PORT}",
1286            record.vsock_uds_path.display()
1287        ));
1288        let _ = std::fs::remove_file(&vsock_exit_socket);
1289
1290        // Remove per-cell scratch image if one was created.
1291        if let Some(scratch) = record.scratch_image_path {
1292            let _ = std::fs::remove_file(&scratch);
1293        }
1294
1295        // Tear down per-cell network isolation.  Both calls are idempotent and
1296        // best-effort: a cell whose TAP/nftables setup half-failed during
1297        // `create()` may have only one of these resources, and we still want
1298        // `destroy()` to run to completion so the VmRecord stays out of the
1299        // map.
1300        if let Some(tap) = record.tap_iface.as_deref() {
1301            if let Err(e) = delete_tap_device(tap).await {
1302                tracing::warn!(error = %e, tap = %tap, "delete TAP device failed");
1303            }
1304        }
1305        let cell_short = cell_id_short(&handle.cell_id);
1306        if let Err(e) = remove_network_policy(&cell_short).await {
1307            tracing::warn!(error = %e, cell_short = %cell_short, "remove nftables policy failed");
1308        }
1309
1310        tracing::info!(cell_id = %handle.cell_id, "firecracker VM destroyed");
1311
1312        let peers_after = vms.len();
1313        Ok(TeardownReport {
1314            cell_id: handle.cell_id.clone(),
1315            destroyed: true,
1316            peers_tracked_after: peers_after,
1317        })
1318    }
1319}
1320
1321// ── Non-Linux stub `CellBackend` impl ────────────────────────────────────────
1322//
1323// On Windows/macOS the supervisor's composition root still depends on the
1324// `FirecrackerCellBackend` type so that `cargo check --workspace` compiles.
1325// At runtime the backend is only ever selected when
1326// `CELLOS_CELL_BACKEND=firecracker`, and on a non-Linux host that
1327// configuration is itself an operator error — every `CellBackend` method
1328// here returns a clear `Host` error instead of attempting Linux-specific
1329// I/O. The `from_env()` constructor still succeeds so a developer running
1330// `cargo check` doesn't need any Firecracker artefacts on disk.
1331
#[cfg(not(target_os = "linux"))]
#[async_trait]
impl CellBackend for FirecrackerCellBackend {
    /// Stub for non-Linux hosts: never creates a VM.
    ///
    /// # Errors
    ///
    /// Always returns [`CellosError::Host`] explaining that the backend is
    /// only supported on Linux.
    async fn create(&self, _spec: &ExecutionCellDocument) -> Result<CellHandle, CellosError> {
        let reason = String::from(
            "FirecrackerCellBackend is only supported on Linux \
             (Firecracker requires Linux/KVM); compiled as a stub on this host",
        );
        Err(CellosError::Host(reason))
    }

    /// Stub for non-Linux hosts: there is never a VM to tear down.
    ///
    /// # Errors
    ///
    /// Always returns [`CellosError::Host`] explaining that the backend is
    /// only supported on Linux.
    async fn destroy(&self, _handle: &CellHandle) -> Result<TeardownReport, CellosError> {
        let reason = String::from(
            "FirecrackerCellBackend is only supported on Linux \
             (Firecracker requires Linux/KVM); compiled as a stub on this host",
        );
        Err(CellosError::Host(reason))
    }
}
1351
1352// ── Helpers (Linux-only) ──────────────────────────────────────────────────────
1353//
1354// Everything below this banner depends on Linux-specific runtime surface:
1355// `tokio::net::{UnixStream, UnixListener}` (the Firecracker management API
1356// transport), the `ip`/`nft` host commands, and TAP devices. Gated as one
1357// block so the cross-platform compile only sees the `FirecrackerConfig`
// data type plus the error-returning stub `CellBackend` impl above.
1359
1360/// Wait for the Firecracker Unix socket to exist and accept connections.
1361#[cfg(target_os = "linux")]
1362async fn wait_for_socket_ready(
1363    socket_path: &Path,
1364    connect_timeout: Duration,
1365) -> Result<(), CellosError> {
1366    let deadline = tokio::time::Instant::now() + connect_timeout;
1367    loop {
1368        if socket_path.exists() && tokio::net::UnixStream::connect(socket_path).await.is_ok() {
1369            return Ok(());
1370        }
1371        if tokio::time::Instant::now() >= deadline {
1372            return Err(CellosError::Host(format!(
1373                "timed out waiting for Firecracker socket at {} ({}s)",
1374                socket_path.display(),
1375                connect_timeout.as_secs()
1376            )));
1377        }
1378        tokio::time::sleep(Duration::from_millis(50)).await;
1379    }
1380}
1381
1382/// Compute the API socket path for a cell.
1383///
1384/// When the jailer is active it creates the socket inside the chroot at
1385/// `<chroot_base>/<firecracker_filename>/<cell_id>/root/run/firecracker.socket`.
1386/// Without the jailer the simple per-run path in `socket_dir` is used.
1387#[cfg(target_os = "linux")]
1388fn resolve_socket_path(config: &FirecrackerConfig, cell_id: &str, run_token: &Uuid) -> PathBuf {
1389    if config.jailer_binary_path.is_some() {
1390        let fc_name = config
1391            .binary_path
1392            .file_name()
1393            .expect("firecracker binary path must have a filename")
1394            .to_string_lossy()
1395            .into_owned();
1396        config
1397            .chroot_base_dir
1398            .join(fc_name)
1399            .join(cell_id)
1400            .join("root/run/firecracker.socket")
1401    } else {
1402        config
1403            .socket_dir
1404            .join(format!("cellos-fc-{cell_id}-{run_token}.socket"))
1405    }
1406}
1407
1408/// Create a sparse ext4 scratch image at `path` of `size_mib` MiB.
1409///
1410/// Uses `dd` (sparse) to allocate a file then `mkfs.ext4` to format it.
1411/// Both are standard on any Linux host that can run Firecracker.
1412#[cfg(target_os = "linux")]
1413async fn create_scratch_image(path: &Path, size_mib: u32) -> Result<(), CellosError> {
1414    // Sparse file: write no bytes, just set the file size via seek.
1415    let dd = tokio::process::Command::new("dd")
1416        .args([
1417            "if=/dev/zero",
1418            &format!("of={}", path.display()),
1419            "bs=1M",
1420            "count=0",
1421            &format!("seek={size_mib}"),
1422        ])
1423        .output()
1424        .await
1425        .map_err(|e| CellosError::Host(format!("dd for scratch image: {e}")))?;
1426    if !dd.status.success() {
1427        return Err(CellosError::Host(format!(
1428            "dd failed creating scratch image at {}: exit {:?}",
1429            path.display(),
1430            dd.status.code()
1431        )));
1432    }
1433    // Format as ext4.
1434    let mkfs = tokio::process::Command::new("mkfs.ext4")
1435        .args(["-F", &path.to_string_lossy()])
1436        .output()
1437        .await
1438        .map_err(|e| CellosError::Host(format!("mkfs.ext4 for scratch image: {e}")))?;
1439    if !mkfs.status.success() {
1440        return Err(CellosError::Host(format!(
1441            "mkfs.ext4 failed on {}: exit {:?}",
1442            path.display(),
1443            mkfs.status.code()
1444        )));
1445    }
1446    Ok(())
1447}
1448
/// Send configuration PUT requests to the Firecracker API.
///
/// Steps, in the order they are performed:
///
/// 1. Validate the jailer security configuration (L2-06-4).
/// 2. `PUT /machine-config` — vCPU count and memory size derived from the spec.
/// 3. `PUT /boot-source` — kernel image path plus the boot args produced by
///    `build_boot_args` (which embed `exit_hmac_key`).
/// 4. If the spec declares `environment.image_digest`, hash the configured
///    rootfs on the blocking pool and compare it (L2-06-1).
/// 5. `PUT /drives/rootfs` — read-only exactly when a scratch image is given.
/// 6. `PUT /drives/scratch` — only when `scratch_image_path` is `Some`.
/// 7. `PUT /network-interfaces/eth0` — only when `tap_iface` is `Some`.
/// 8. `PUT /vsock` — guest CID plus the host-side UDS path.
///
/// This function only configures the VM; it does not issue the instance-start
/// action itself.
///
/// # Errors
///
/// Returns [`CellosError::Host`] when the jailer configuration is rejected,
/// when any PUT answers with a non-success HTTP status, or when the rootfs
/// digest check fails or its blocking task does not complete. Errors returned
/// by `client.put` itself are propagated unchanged.
#[cfg(target_os = "linux")]
async fn configure_vm(
    client: &FirecrackerApiClient,
    config: &FirecrackerConfig,
    spec: &ExecutionCellDocument,
    vsock_uds_path: &Path,
    scratch_image_path: Option<&Path>,
    tap_iface: Option<&str>,
    exit_hmac_key: &[u8],
) -> Result<(), CellosError> {
    // L2-06-4: validate the jailer security configuration BEFORE issuing any
    // boot-side API calls. If the operator has somehow landed here with uid=0,
    // gid=0, or chroot=/, refuse to configure the VM — better to fail loudly
    // than to silently boot a cell without the privilege boundary we claim to
    // enforce.
    validate_jailer_security_config(config)?;

    // L2-06-3: machine memory limit derives from `spec.run.limits.memoryMax`,
    // falling back to the static default. Extracted into `derive_mem_size_mib`
    // so the precedence is unit-tested.
    let mem_mib = derive_mem_size_mib(&spec.spec, DEFAULT_MEM_SIZE_MIB);

    let machine_status = client
        .put(
            "/machine-config",
            &MachineConfig {
                vcpu_count: derive_vcpu_count(&spec.spec),
                mem_size_mib: mem_mib,
                track_dirty_pages: false,
            },
        )
        .await?;

    if !machine_status.is_success() {
        return Err(CellosError::Host(format!(
            "firecracker PUT /machine-config returned HTTP {machine_status}"
        )));
    }

    // Boot source — kernel + boot args encoding the cell command.
    //
    // The rootfs boots into cellos-init, which parses `cellos.argv=<base64>`
    // from the kernel cmdline, executes the cell command in-VM, and forwards
    // the exit code to the host over vsock. Remaining L2-06 work is packaging
    // and production hardening rather than host-side subprocess fallback.
    let boot_args = build_boot_args(spec, Some(exit_hmac_key));
    let boot_status = client
        .put(
            "/boot-source",
            &BootSource {
                kernel_image_path: config.kernel_image_path.to_string_lossy().into_owned(),
                boot_args: Some(boot_args),
            },
        )
        .await?;

    if !boot_status.is_success() {
        return Err(CellosError::Host(format!(
            "firecracker PUT /boot-source returned HTTP {boot_status}"
        )));
    }

    // L2-06-1: if the spec pins a content-addressable image digest, hash the
    // configured rootfs file and refuse to attach it unless the on-disk bytes
    // match. The host-wide manifest (verify_artifacts) is the supply-chain
    // boundary (was THIS rootfs binary built by the trusted pipeline?); this
    // is the per-cell content-addressing boundary (is THIS rootfs the exact
    // image the spec asked for?). Both must hold for a production cell to
    // boot.
    //
    // sha256_file streams in 64 KiB chunks; even multi-hundred-MiB rootfs
    // images do not balloon RSS. Hash on the blocking pool so the runtime
    // stays responsive during large-image verifications.
    if let Some(env) = spec.spec.environment.as_ref() {
        if let Some(expected) = env.image_digest.as_ref() {
            // Owned copies are required because the blocking closure must be
            // `move` and cannot borrow from `config` / `spec`.
            let rootfs_owned = config.rootfs_image_path.clone();
            let expected_owned = expected.clone();
            tokio::task::spawn_blocking(move || {
                verify_rootfs_digest(&rootfs_owned, &expected_owned)
            })
            .await
            .map_err(|e| {
                CellosError::Host(format!(
                    "rootfs digest verification task panicked or was cancelled: {e}"
                ))
            })??;
            // The double `?` above unwraps the join result first, then the
            // verification result itself.
            tracing::info!(
                rootfs = %config.rootfs_image_path.display(),
                expected_digest = %expected,
                "rootfs content-addressing verified (L2-06-1)"
            );
        }
    }

    // Root drive.
    //
    // When a scratch image is configured, the rootfs is mounted read-only so
    // multiple concurrent cells can safely share the same base image; per-cell
    // writes go to the dedicated scratch drive.
    let drive_status = client
        .put(
            "/drives/rootfs",
            &Drive {
                drive_id: "rootfs".into(),
                path_on_host: config.rootfs_image_path.to_string_lossy().into_owned(),
                is_root_device: true,
                is_read_only: scratch_image_path.is_some(),
            },
        )
        .await?;

    if !drive_status.is_success() {
        return Err(CellosError::Host(format!(
            "firecracker PUT /drives/rootfs returned HTTP {drive_status}"
        )));
    }

    // Optional writable scratch drive (attached as /dev/vdb in the guest).
    if let Some(scratch) = scratch_image_path {
        let scratch_status = client
            .put(
                "/drives/scratch",
                &Drive {
                    drive_id: "scratch".into(),
                    path_on_host: scratch.to_string_lossy().into_owned(),
                    is_root_device: false,
                    is_read_only: false,
                },
            )
            .await?;
        if !scratch_status.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker PUT /drives/scratch returned HTTP {scratch_status}"
            )));
        }
    }

    // Virtio-net device, when a host TAP has been provisioned for this cell.
    // The TAP must already exist on the host and be owned by the uid the VMM
    // will run under (Firecracker calls `open(/dev/net/tun)` and binds via
    // `TUNSETIFF` — it does not create the device itself).
    if let Some(tap) = tap_iface {
        let net_status = client
            .put(
                "/network-interfaces/eth0",
                &NetworkInterface {
                    iface_id: "eth0".into(),
                    guest_mac: GUEST_NIC_MAC.into(),
                    host_dev_name: tap.to_owned(),
                },
            )
            .await?;
        if !net_status.is_success() {
            return Err(CellosError::Host(format!(
                "firecracker PUT /network-interfaces/eth0 returned HTTP {net_status}"
            )));
        }
    }

    // Vsock device — enables cellos-init to send the exit code back to the host.
    // Guest-initiated connections to port P arrive at `<vsock_uds_path>_P` on
    // the host.  The host listener for port VSOCK_EXIT_PORT is already bound
    // before this call.
    let vsock_status = client
        .put(
            "/vsock",
            &VsockDevice {
                guest_cid: VSOCK_GUEST_CID,
                uds_path: vsock_uds_path.to_string_lossy().into_owned(),
            },
        )
        .await?;

    if !vsock_status.is_success() {
        return Err(CellosError::Host(format!(
            "firecracker PUT /vsock returned HTTP {vsock_status}"
        )));
    }

    Ok(())
}
1631
1632/// Build kernel command-line boot args for the cell.
1633///
1634/// Standard Linux console args + `cellos.*` parameters consumed by
1635/// `cellos-init` running as PID 1 inside the guest:
1636///
1637/// - `cellos.cell_id` — cell identifier for log attribution
1638/// - `cellos.vsock_port` — port on the host to send the exit code to
1639/// - `cellos.argv` — base64-encoded JSON array of strings (the cell command)
1640/// - `cellos.exit_hmac_key` — FC-18: base64-encoded per-cell HMAC-SHA256 key,
1641///   plumbed only when `exit_hmac_key.is_some()`. The guest uses this key to
1642///   authenticate its exit-code report; the host verifies the tag in
1643///   `listen_for_exit_code`.
1644///
1645/// If `spec.run.argv` is empty the `cellos.argv` parameter is omitted; the
1646/// guest boots normally but `cellos-init` will log an error and exit.
1647#[cfg(target_os = "linux")]
1648pub(crate) fn build_boot_args(
1649    spec: &ExecutionCellDocument,
1650    exit_hmac_key: Option<&[u8]>,
1651) -> String {
1652    // root=/dev/vda mounts the virtio-blk root drive (the rootfs.ext4 image) as
1653    // the kernel's root filesystem. Without it the kernel panics with
1654    // "Unable to mount root fs on unknown-block(0,0)" after serial init.
1655    // ipv6.disable=1: prevent the guest kernel from bringing up an IPv6 stack.
1656    // The nftables ruleset uses `table ip` (IPv4-only); without this flag a
1657    // guest workload that pulls in an IPv6 default route bypasses the allowlist
1658    // entirely. Defense-in-depth: the ip6 table also enforces policy drop (see
1659    // build_nftables_ruleset), but disabling at boot is the primary control.
1660    let base = "console=ttyS0 reboot=k panic=1 pci=off ipv6.disable=1 root=/dev/vda rw";
1661    let cell_id = &spec.spec.id;
1662    let mut args = format!("{base} cellos.cell_id={cell_id} cellos.vsock_port={VSOCK_EXIT_PORT}");
1663
1664    // Encode spec.run.argv as base64(json) for cellos-init.
1665    if let Some(argv) = spec
1666        .spec
1667        .run
1668        .as_ref()
1669        .map(|r| &r.argv)
1670        .filter(|a| !a.is_empty())
1671    {
1672        if let Ok(json) = serde_json::to_string(argv) {
1673            let b64 = BASE64_STANDARD.encode(json.as_bytes());
1674            args.push_str(&format!(" cellos.argv={b64}"));
1675        }
1676    }
1677
1678    // FC-18: plumb the per-cell HMAC key when one was generated. Tokens on
1679    // the kernel cmdline are space-separated, so base64 (no padding, no
1680    // whitespace) is the natural encoding — same trick `cellos.argv` uses.
1681    if let Some(key) = exit_hmac_key {
1682        let b64 = BASE64_STANDARD.encode(key);
1683        args.push_str(&format!(" cellos.exit_hmac_key={b64}"));
1684    }
1685
1686    args
1687}
1688
/// Build the argument vector handed to the **jailer** binary when Firecracker
/// runs under it.
///
/// Layout: the jailer's own `--flag value` pairs (`--id`, `--exec-file`,
/// `--uid`, `--gid`, `--chroot-base-dir`), a `--` separator, then the
/// arguments forwarded to Firecracker (`--api-sock /run/firecracker.socket
/// --level Error`), with `--no-seccomp` appended last when `no_seccomp` is set.
///
/// Kept as a pure function so the wire-level args can be regression-tested.
/// Two silent reverts in commit d14f134 turned `--level` back into
/// `--log-level` and dropped `root=/dev/vda rw` from the boot args; both were
/// only caught at runtime (Firecracker rejecting `--log-level` with exit 153,
/// and the kernel panicking at root mount). Tests asserting that the literal
/// strings are present, and the wrong ones absent, close that regression hole.
#[cfg(target_os = "linux")]
pub(crate) fn build_jailer_argv<'a>(
    spec_id: &'a str,
    exec_file: &'a str,
    uid: &'a str,
    gid: &'a str,
    chroot: &'a str,
    no_seccomp: bool,
) -> Vec<&'a str> {
    // Options interpreted by the jailer itself, in emission order.
    let jailer_options = [
        ("--id", spec_id),
        ("--exec-file", exec_file),
        ("--uid", uid),
        ("--gid", gid),
        ("--chroot-base-dir", chroot),
    ];

    let mut argv: Vec<&'a str> = Vec::with_capacity(16);
    for (flag, value) in jailer_options {
        argv.push(flag);
        argv.push(value);
    }
    // Everything after `--` is passed through to the Firecracker process.
    argv.extend([
        "--",
        "--api-sock",
        "/run/firecracker.socket",
        "--level",
        "Error",
    ]);
    if no_seccomp {
        argv.push("--no-seccomp");
    }
    argv
}
1728
/// Build the argument vector for launching the Firecracker binary directly,
/// i.e. when the jailer is bypassed (development mode —
/// `CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1`).
///
/// Produces `--api-sock <socket_path> --level Error`, followed by
/// `--no-seccomp` when `no_seccomp` is set.
#[cfg(target_os = "linux")]
pub(crate) fn build_direct_argv(socket_path: &str, no_seccomp: bool) -> Vec<&str> {
    let seccomp_opt_out = if no_seccomp {
        Some("--no-seccomp")
    } else {
        None
    };

    let mut argv: Vec<&str> = Vec::with_capacity(5);
    argv.extend(["--api-sock", socket_path, "--level", "Error"]);
    argv.extend(seccomp_opt_out);
    argv
}
1739
1740// ── Path helpers ──────────────────────────────────────────────────────────────
1741
1742fn required_absolute_path<F>(
1743    lookup: &F,
1744    key: &str,
1745    description: &str,
1746) -> Result<PathBuf, CellosError>
1747where
1748    F: Fn(&str) -> Option<String>,
1749{
1750    let value = lookup(key)
1751        .ok_or_else(|| missing_env_error(key, description))?
1752        .trim()
1753        .to_owned();
1754    if value.is_empty() {
1755        return Err(missing_env_error(key, description));
1756    }
1757    parse_absolute_path(key, description, &value)
1758}
1759
1760fn optional_absolute_path<F>(
1761    lookup: &F,
1762    key: &str,
1763    description: &str,
1764) -> Result<Option<PathBuf>, CellosError>
1765where
1766    F: Fn(&str) -> Option<String>,
1767{
1768    let Some(value) = lookup(key) else {
1769        return Ok(None);
1770    };
1771    let value = value.trim();
1772    if value.is_empty() {
1773        return Ok(None);
1774    }
1775    Ok(Some(parse_absolute_path(key, description, value)?))
1776}
1777
1778fn parse_absolute_path(key: &str, description: &str, raw: &str) -> Result<PathBuf, CellosError> {
1779    let path = Path::new(raw);
1780    if !path.is_absolute() {
1781        return Err(CellosError::Host(format!(
1782            "{key} must be an absolute path to the {description} when CELLOS_CELL_BACKEND=firecracker"
1783        )));
1784    }
1785    Ok(path.to_path_buf())
1786}
1787
1788fn missing_env_error(key: &str, description: &str) -> CellosError {
1789    CellosError::Host(format!(
1790        "{key} must be set to an absolute path to the {description} when CELLOS_CELL_BACKEND=firecracker"
1791    ))
1792}
1793
1794// ── Artifact manifest verification ────────────────────────────────────────────
1795
/// One parsed entry from the artifact manifest file.
///
/// Lines look like:
///
/// ```text
/// sha256:<hex>  kernel       /opt/cellos/firecracker/vmlinux
/// sha256:<hex>  rootfs       /opt/cellos/firecracker/rootfs.ext4
/// sha256:<hex>  firecracker  /opt/cellos/firecracker/firecracker
/// ```
#[cfg(target_os = "linux")]
#[derive(Debug, Clone, PartialEq, Eq)]
struct ManifestEntry {
    /// SHA256 digest as hex, without the `sha256:` prefix. `parse_manifest`
    /// stores it lowercased after checking it is exactly 64 hex characters.
    sha256_hex: String,
    /// Second whitespace-delimited field of the line, stored verbatim
    /// (e.g. `kernel`, `rootfs`, `firecracker`).
    role: String,
    /// Third field of the line: the artifact path as written in the manifest.
    /// Because fields are whitespace-delimited, it cannot contain whitespace.
    path: PathBuf,
}
1812
1813/// Parse the manifest text. Comment lines (`#`) and blank lines are ignored.
1814/// Returns an error on any malformed entry — the manifest is a security
1815/// boundary, so silently skipping garbage lines is unsafe.
1816#[cfg(target_os = "linux")]
1817fn parse_manifest(text: &str) -> Result<Vec<ManifestEntry>, CellosError> {
1818    let mut out = Vec::new();
1819    for (lineno, raw) in text.lines().enumerate() {
1820        let line = raw.trim();
1821        if line.is_empty() || line.starts_with('#') {
1822            continue;
1823        }
1824        // Expect 3 whitespace-delimited fields: sha256:<hex>  <role>  <path>
1825        let mut parts = line.split_whitespace();
1826        let digest = parts.next();
1827        let role = parts.next();
1828        let path = parts.next();
1829        let extra = parts.next();
1830        let (digest, role, path) = match (digest, role, path) {
1831            (Some(d), Some(r), Some(p)) => (d, r, p),
1832            _ => {
1833                return Err(CellosError::Host(format!(
1834                    "manifest line {}: expected `sha256:<hex>  <role>  <path>`, got: {raw:?}",
1835                    lineno + 1
1836                )));
1837            }
1838        };
1839        if extra.is_some() {
1840            return Err(CellosError::Host(format!(
1841                "manifest line {}: unexpected trailing field after path",
1842                lineno + 1
1843            )));
1844        }
1845        let Some(hex) = digest.strip_prefix("sha256:") else {
1846            return Err(CellosError::Host(format!(
1847                "manifest line {}: digest field must start with `sha256:`, got: {digest:?}",
1848                lineno + 1
1849            )));
1850        };
1851        if hex.len() != 64 || !hex.chars().all(|c| c.is_ascii_hexdigit()) {
1852            return Err(CellosError::Host(format!(
1853                "manifest line {}: sha256 digest must be 64 hex chars, got: {hex:?}",
1854                lineno + 1
1855            )));
1856        }
1857        out.push(ManifestEntry {
1858            sha256_hex: hex.to_ascii_lowercase(),
1859            role: role.to_string(),
1860            path: PathBuf::from(path),
1861        });
1862    }
1863    Ok(out)
1864}
1865
1866/// Stream-hash a file with SHA256, returning the lowercase hex digest.
1867///
1868/// Reads the file in 64 KiB chunks so even multi-hundred-MiB rootfs images do
1869/// not balloon resident memory.
1870#[cfg(target_os = "linux")]
1871fn sha256_file(path: &Path) -> Result<String, CellosError> {
1872    use sha2::{Digest, Sha256};
1873    use std::io::Read;
1874
1875    let mut file = std::fs::File::open(path).map_err(|e| {
1876        CellosError::Host(format!(
1877            "open artifact for hashing at {}: {e}",
1878            path.display()
1879        ))
1880    })?;
1881    let mut hasher = Sha256::new();
1882    let mut buf = [0u8; 64 * 1024];
1883    loop {
1884        let n = file.read(&mut buf).map_err(|e| {
1885            CellosError::Host(format!(
1886                "read artifact at {} for hashing: {e}",
1887                path.display()
1888            ))
1889        })?;
1890        if n == 0 {
1891            break;
1892        }
1893        hasher.update(&buf[..n]);
1894    }
1895    let digest = hasher.finalize();
1896    let mut hex = String::with_capacity(64);
1897    for byte in digest {
1898        hex.push_str(&format!("{byte:02x}"));
1899    }
1900    Ok(hex)
1901}
1902
1903/// L2-06-1 — Verify the SHA256 digest of a rootfs file against an expected
1904/// value pulled from `spec.environment.imageDigest`. This is content
1905/// addressing at the *per-cell* layer: even when the host-wide manifest
1906/// (verify_artifacts) confirms the rootfs binary matches the deployment
1907/// stamp, a spec can demand a *specific* image digest and the backend must
1908/// refuse to boot otherwise. Without this gate a poisoned spec referencing
1909/// the wrong image silently boots the host's default rootfs.
1910///
1911/// `expected_sha256` accepts both the bare 64-char hex form and the
1912/// `sha256:<hex>` prefixed form (cellos-core's `EnvironmentSpec.imageDigest`
1913/// validates the prefixed form). The comparison is case-insensitive.
1914///
1915/// On mismatch returns a forensic-grade error naming the path and both
1916/// digests so operators can diff them against the spec.
1917#[cfg(target_os = "linux")]
1918fn verify_rootfs_digest(path: &Path, expected_sha256: &str) -> Result<String, CellosError> {
1919    // Normalise: strip optional `sha256:` prefix, lowercase, validate shape.
1920    let expected_hex = expected_sha256
1921        .trim()
1922        .strip_prefix("sha256:")
1923        .unwrap_or(expected_sha256.trim())
1924        .to_ascii_lowercase();
1925    if expected_hex.len() != 64 || !expected_hex.chars().all(|c| c.is_ascii_hexdigit()) {
1926        return Err(CellosError::Host(format!(
1927            "verify_rootfs_digest: expected_sha256 must be 64 hex chars (with optional `sha256:` prefix); got {expected_sha256:?}"
1928        )));
1929    }
1930    let actual_hex = sha256_file(path)?;
1931    if actual_hex != expected_hex {
1932        return Err(CellosError::Host(format!(
1933            "rootfs digest mismatch at {}: spec.environment.imageDigest declared sha256:{expected_hex}, on-disk image hashes to sha256:{actual_hex} \
1934             — refusing to boot a cell against an unverified rootfs (L2-06-1)",
1935            path.display()
1936        )));
1937    }
1938    Ok(actual_hex)
1939}
1940
1941/// L2-06-3 — Derive `mem_size_mib` for the Firecracker `PUT /machine-config`
1942/// call from the spec's `run.limits.memoryMax`, falling back to the
1943/// env-derived default when the spec does not declare a memory limit.
1944///
1945/// Extracted as a pure function so the precedence (spec → env default) is
1946/// covered by unit tests without standing up a full backend. The 64-MiB
1947/// minimum mirrors Firecracker's practical floor — smaller VMs panic during
1948/// init because the kernel cannot allocate a usable mem_init region.
1949///
1950/// Returns `mem_size_mib` clamped to `[64, u32::MAX]`. Callers that need
1951/// further policy clamping (e.g. fleet-wide max) apply it before passing the
1952/// spec in.
1953#[cfg(target_os = "linux")]
1954fn derive_mem_size_mib(spec: &cellos_core::ExecutionCellSpec, env_default: u32) -> u32 {
1955    spec.run
1956        .as_ref()
1957        .and_then(|r| r.limits.as_ref())
1958        .and_then(|l| l.memory_max_bytes)
1959        .map(|bytes| ((bytes / (1024 * 1024)) as u32).max(64))
1960        .unwrap_or(env_default)
1961}
1962
1963/// L2-06-4 — Validate that the live jailer configuration enforces the
1964/// privilege boundary we depend on for production isolation: non-root uid,
1965/// non-root gid, and a chroot base that is NOT the filesystem root.
1966///
1967/// The constructor for [`FirecrackerConfig`] already rejects uid=0/gid=0
1968/// (the FC-41 guard), but those checks run at *config-load* time and an
1969/// in-process mutation or a hand-built `FirecrackerConfig` (tests, future
1970/// dynamic reconfig) could land at `configure_vm` with insecure values. This
1971/// function is the second line of defence: called right before
1972/// `InstanceStart`, it refuses to boot the VM if any of the three invariants
1973/// fails.
1974///
1975/// Errors carry the literal token `L2-06-4` so audit log searches surface
1976/// them distinctly from the config-load-time FC-41 errors.
1977///
1978/// Returns `Ok(())` when the jailer is intentionally disabled (no
1979/// `jailer_binary_path`) — that path is the operator's explicit dev opt-out,
1980/// already gated by the loud `CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1` warning
1981/// in `FirecrackerConfig::from_env`.
1982#[cfg(target_os = "linux")]
1983fn validate_jailer_security_config(config: &FirecrackerConfig) -> Result<(), CellosError> {
1984    // No jailer configured → operator explicitly opted out at config load.
1985    // Don't double-error here; the config-load WARN is the audit signal.
1986    if config.jailer_binary_path.is_none() {
1987        return Ok(());
1988    }
1989    if config.jailer_uid == 0 {
1990        return Err(CellosError::Host(
1991            "validate_jailer_security_config: jailer_uid=0 — running the jailer as root \
1992             defeats the privilege boundary that isolates the VMM from the host. \
1993             [L2-06-4]"
1994                .into(),
1995        ));
1996    }
1997    if config.jailer_gid == 0 {
1998        return Err(CellosError::Host(
1999            "validate_jailer_security_config: jailer_gid=0 — running the jailer in the root \
2000             group defeats the privilege boundary that isolates the VMM from the host. \
2001             [L2-06-4]"
2002                .into(),
2003        ));
2004    }
2005    // chroot_base must be an absolute path AND must not be the filesystem
2006    // root. `/` as chroot is equivalent to "no chroot at all" — anything the
2007    // jailer mounts or writes appears in the host's real namespace.
2008    let chroot = &config.chroot_base_dir;
2009    if !chroot.is_absolute() {
2010        return Err(CellosError::Host(format!(
2011            "validate_jailer_security_config: chroot_base_dir must be an absolute path; got {} [L2-06-4]",
2012            chroot.display()
2013        )));
2014    }
2015    if chroot == Path::new("/") {
2016        return Err(CellosError::Host(
2017            "validate_jailer_security_config: chroot_base_dir=`/` — chroot to filesystem root \
2018             is functionally no chroot at all. Configure CELLOS_FIRECRACKER_CHROOT_BASE to a \
2019             dedicated directory like /var/lib/cellos/firecracker. [L2-06-4]"
2020                .into(),
2021        ));
2022    }
2023    Ok(())
2024}
2025
/// FC-08 — verified SHA256 hex digests of the boot artifacts that
/// [`verify_artifacts`] hashed and matched against the manifest.
///
/// Surfaced on the [`CellHandle`] so the supervisor can include them on
/// `cell.lifecycle.v1.started`. Lowercase hex (no `sha256:` prefix), exactly
/// 64 chars per field. `firecracker` is `None` when the manifest did not
/// declare a firecracker-role entry (it is optional).
///
/// The `Default` value (every field `None`) is what [`verify_artifacts`]
/// returns when manifest verification is explicitly skipped.
#[cfg(target_os = "linux")]
#[derive(Debug, Clone, Default)]
struct VerifiedDigests {
    /// Digest of the kernel image; `None` when verification was skipped.
    kernel: Option<String>,
    /// Digest of the rootfs image; `None` when verification was skipped.
    rootfs: Option<String>,
    /// Digest of the Firecracker binary; `None` when verification was skipped
    /// or the manifest has no firecracker-role entry.
    firecracker: Option<String>,
}
2040
2041/// Verify that the kernel, rootfs (and optionally firecracker) artifacts
2042/// referenced by the config match the SHA256 digests in the manifest.
2043///
2044/// - Returns `Ok(VerifiedDigests::default())` (all `None`) when no manifest is
2045///   configured AND `allow_no_manifest` is `true` (with a `tracing::warn!` so
2046///   operators notice they have skipped the verification step).
2047/// - Returns `Err(CellosError::Host(...))` when no manifest is configured and
2048///   `allow_no_manifest` is `false` (the default — production posture).
2049/// - Returns `Err(CellosError::Host(...))` on any parse error, missing role,
2050///   missing file, read error, or digest mismatch.
2051/// - Returns `Ok(VerifiedDigests { kernel: Some(_), rootfs: Some(_), firecracker: ... })`
2052///   on success, with the on-disk hex digests for the supervisor to surface
2053///   on `cell.lifecycle.v1.started` (FC-08).
2054#[cfg(target_os = "linux")]
2055async fn verify_artifacts(config: &FirecrackerConfig) -> Result<VerifiedDigests, CellosError> {
2056    let Some(manifest_path) = &config.manifest_path else {
2057        if config.allow_no_manifest {
2058            tracing::warn!(
2059                "MANIFEST VERIFICATION DISABLED — pre-boot artifact digest verification is being skipped \
2060                 because both CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 and \
2061                 CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 are set. \
2062                 This is unsafe for production and should only be used for local development."
2063            );
2064            return Ok(VerifiedDigests::default());
2065        }
2066        return Err(CellosError::Host(
2067            "firecracker init: CELLOS_FIRECRACKER_MANIFEST is not set \
2068             — pre-boot artifact digest verification is mandatory by default. \
2069             Set CELLOS_FIRECRACKER_MANIFEST to a v1 manifest path, or set BOTH \
2070             CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1 AND \
2071             CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY=1 to opt out \
2072             (development only — the second flag is a deliberate speed-bump)."
2073                .into(),
2074        ));
2075    };
2076
2077    // Manifest files are small (a handful of lines); reading synchronously
2078    // here is fine and avoids pulling in tokio's `fs` feature.
2079    let text = std::fs::read_to_string(manifest_path).map_err(|e| {
2080        CellosError::Host(format!(
2081            "read artifact manifest at {}: {e}",
2082            manifest_path.display()
2083        ))
2084    })?;
2085    let entries = parse_manifest(&text)?;
2086
2087    // Build the verification plan. kernel and rootfs are mandatory; firecracker
2088    // is optional (verified only when an entry is present).
2089    let kernel_entry = entries.iter().find(|e| e.role == "kernel").ok_or_else(|| {
2090        CellosError::Host(format!(
2091            "manifest at {} is missing a `kernel` role entry",
2092            manifest_path.display()
2093        ))
2094    })?;
2095    let rootfs_entry = entries.iter().find(|e| e.role == "rootfs").ok_or_else(|| {
2096        CellosError::Host(format!(
2097            "manifest at {} is missing a `rootfs` role entry",
2098            manifest_path.display()
2099        ))
2100    })?;
2101    let firecracker_entry = entries.iter().find(|e| e.role == "firecracker");
2102
2103    // Hash and compare. We intentionally hash the configured paths (not the
2104    // manifest paths) so a tampered manifest cannot redirect verification to
2105    // a known-good copy of the file. If the manifest path differs from the
2106    // configured path, we log it but still verify the configured one.
2107    let plan: Vec<(&str, &Path, &str, &Path)> = {
2108        let mut v: Vec<(&str, &Path, &str, &Path)> = vec![
2109            (
2110                "kernel",
2111                config.kernel_image_path.as_path(),
2112                kernel_entry.sha256_hex.as_str(),
2113                kernel_entry.path.as_path(),
2114            ),
2115            (
2116                "rootfs",
2117                config.rootfs_image_path.as_path(),
2118                rootfs_entry.sha256_hex.as_str(),
2119                rootfs_entry.path.as_path(),
2120            ),
2121        ];
2122        if let Some(fc) = firecracker_entry {
2123            v.push((
2124                "firecracker",
2125                config.binary_path.as_path(),
2126                fc.sha256_hex.as_str(),
2127                fc.path.as_path(),
2128            ));
2129        }
2130        v
2131    };
2132
2133    let mut verified = VerifiedDigests::default();
2134    for (role, configured_path, expected_hex, manifest_decl_path) in plan {
2135        if configured_path != manifest_decl_path {
2136            tracing::warn!(
2137                role,
2138                configured = %configured_path.display(),
2139                manifest = %manifest_decl_path.display(),
2140                "configured artifact path differs from manifest declaration; verifying configured path"
2141            );
2142        }
2143        // sha256_file is sync I/O; offload to the blocking pool so we do not
2144        // stall the runtime on multi-hundred-MiB rootfs hashes.
2145        let owned_path = configured_path.to_path_buf();
2146        let actual_hex = tokio::task::spawn_blocking(move || sha256_file(&owned_path))
2147            .await
2148            .map_err(|e| {
2149                CellosError::Host(format!(
2150                    "sha256 hashing task for role {role} panicked or was cancelled: {e}"
2151                ))
2152            })??;
2153        if actual_hex != expected_hex {
2154            // FC-51 + FC-09: emit manifest-failed CloudEvent onto pending buffer
2155            // before returning the existing host error (sink-unaware call site).
2156            push_manifest_failed_pending(
2157                role,
2158                expected_hex,
2159                actual_hex.as_str(),
2160                manifest_path.to_string_lossy().as_ref(),
2161            );
2162            return Err(CellosError::Host(format!(
2163                "artifact digest mismatch for role `{role}` at {}: expected sha256:{expected_hex}, got sha256:{actual_hex}",
2164                configured_path.display()
2165            )));
2166        }
2167        tracing::info!(
2168            role,
2169            path = %configured_path.display(),
2170            sha256 = %actual_hex,
2171            "artifact digest verified"
2172        );
2173        // FC-08: capture the verified digest so the supervisor can surface it
2174        // on cell.lifecycle.v1.started. We record only after the on-disk hash
2175        // matched the manifest, so any value reaching CellHandle is a verified
2176        // digest by construction.
2177        match role {
2178            "kernel" => verified.kernel = Some(actual_hex),
2179            "rootfs" => verified.rootfs = Some(actual_hex),
2180            "firecracker" => verified.firecracker = Some(actual_hex),
2181            _ => {
2182                // Unknown roles never appear in `plan`; this branch is defensive.
2183            }
2184        }
2185    }
2186
2187    Ok(verified)
2188}
2189
2190// FC-51 + FC-09 — manifest-failed emission seam. `verify_artifacts` is sink-
2191// unaware (it runs inside `create()`'s boot block); events accumulate on a
2192// process-wide buffer drained by the supervisor after backend `create()`
2193// returns Err. Long-term design wires `dyn EventSink` into `CellBackend`.
2194static MANIFEST_FAILED_PENDING: std::sync::OnceLock<
2195    std::sync::Mutex<Vec<cellos_core::CloudEventV1>>,
2196> = std::sync::OnceLock::new();
2197
2198/// Emit a manifest-failed CloudEvent onto the pending buffer. Public for
2199/// supervisor drain + FC-51 emission tests; the digest-mismatch branch in
2200/// `verify_artifacts` calls this with the live verification context.
2201pub fn push_manifest_failed_pending_for_test(
2202    role: &str,
2203    expected_sha256: &str,
2204    actual_sha256: &str,
2205    manifest_path: &str,
2206) {
2207    push_manifest_failed_pending(role, expected_sha256, actual_sha256, manifest_path);
2208}
2209
2210fn push_manifest_failed_pending(role: &str, expected: &str, actual: &str, manifest_path: &str) {
2211    let data = match cellos_core::manifest_failed_data_v1(role, expected, actual, manifest_path) {
2212        Ok(d) => d,
2213        Err(e) => {
2214            tracing::warn!(error = %e, role, "manifest_failed_data_v1 failed");
2215            return;
2216        }
2217    };
2218    let ev = cellos_core::CloudEventV1 {
2219        specversion: "1.0".into(),
2220        id: uuid::Uuid::new_v4().to_string(),
2221        source: "cellos-host-firecracker".into(),
2222        ty: cellos_core::LIFECYCLE_MANIFEST_FAILED_TYPE.into(),
2223        datacontenttype: Some("application/json".into()),
2224        data: Some(data),
2225        time: None,
2226        traceparent: None,
2227    };
2228    let buf = MANIFEST_FAILED_PENDING.get_or_init(|| std::sync::Mutex::new(Vec::new()));
2229    if let Ok(mut g) = buf.lock() {
2230        g.push(ev);
2231    }
2232}
2233
2234/// Drain pending manifest-failed events (consume-on-drain).
2235pub fn drain_pending_manifest_failed_events() -> Vec<cellos_core::CloudEventV1> {
2236    let buf = MANIFEST_FAILED_PENDING.get_or_init(|| std::sync::Mutex::new(Vec::new()));
2237    let mut g = buf.lock().unwrap_or_else(|p| p.into_inner());
2238    std::mem::take(&mut *g)
2239}
2240
2241/// Accept one connection on a Unix socket, read a 4-byte little-endian i32
2242/// exit code sent by `cellos-init` over vsock (Firecracker proxies the guest
2243/// vsock connection to this socket), then write a 1-byte ACK back.
2244///
2245/// The ACK is part of the exit-code wire protocol — `cellos-init` blocks on
2246/// reading it before calling `reboot()`. Without the ACK, the guest could
2247/// power off the VM (and Firecracker could tear down the vsock device)
2248/// before this read returns, leaving the supervisor to time out and SIGKILL
2249/// even though the workload exited cleanly. ACK write failures are logged
2250/// but not propagated: by the time we get here the 4-byte exit code is
2251/// already committed, and a half-broken ACK path is strictly less bad than
2252/// failing the whole run.
2253#[cfg(target_os = "linux")]
2254/// Bind a Unix-stream listener at `socket_path` and authenticate one
2255/// exit-code report from the guest's `cellos-init`.
2256///
2257/// FC-18: the wire format is `[code:4 LE i32][hmac:32]`. The supervisor
2258/// recomputes `HMAC-SHA256(hmac_key, code_bytes ‖ cell_id_bytes)` and
2259/// compares against the received tag. On mismatch (forged or replayed
2260/// frame) the supervisor logs a structured `vsock_exit_auth_rejected`
2261/// warning and returns `Err` — the caller treats that as "no exit code
2262/// received", which causes the supervisor to fall through to the
2263/// SIGKILL-on-timeout path. Any other code path (`Ok`) is the legitimate
2264/// cell's exit.
2265///
2266/// `hmac_key` is the per-cell key generated in `create()` and plumbed to
2267/// the guest via `cellos.exit_hmac_key=<base64>`. `cell_id` is bound into
2268/// the MAC input so a tag minted for one cell cannot be replayed against
2269/// another cell's listener.
2270#[cfg(target_os = "linux")]
2271async fn listen_for_exit_code(
2272    socket_path: &Path,
2273    hmac_key: &[u8],
2274    cell_id: &str,
2275) -> Result<i32, CellosError> {
2276    use tokio::io::AsyncWriteExt;
2277
2278    let listener = UnixListener::bind(socket_path).map_err(|e| {
2279        CellosError::Host(format!(
2280            "bind vsock exit listener at {}: {e}",
2281            socket_path.display()
2282        ))
2283    })?;
2284
2285    let (mut stream, _) = listener.accept().await.map_err(|e| {
2286        CellosError::Host(format!(
2287            "accept vsock exit connection at {}: {e}",
2288            socket_path.display()
2289        ))
2290    })?;
2291
2292    let mut frame = [0u8; EXIT_AUTHED_FRAME_LEN];
2293    stream.read_exact(&mut frame).await.map_err(|e| {
2294        CellosError::Host(format!(
2295            "read vsock exit frame from {}: {e}",
2296            socket_path.display()
2297        ))
2298    })?;
2299
2300    let mut code_bytes = [0u8; 4];
2301    code_bytes.copy_from_slice(&frame[..4]);
2302    let received_tag = &frame[4..];
2303
2304    if !verify_exit_hmac(hmac_key, &code_bytes, cell_id, received_tag) {
2305        // Structured rejection: a forged or replayed exit frame. Do NOT ACK
2306        // the sender — silence forces them to retry/timeout, which is the
2307        // intended behaviour against an attacker. Returning Err propagates
2308        // "no legitimate exit observed" up to the supervisor.
2309        tracing::warn!(
2310            cell_id = %cell_id,
2311            socket = %socket_path.display(),
2312            event = "vsock_exit_auth_rejected",
2313            "FC-18: rejecting unauthenticated vsock exit frame (HMAC mismatch)"
2314        );
2315        return Err(CellosError::Host(format!(
2316            "vsock_exit_auth_rejected: HMAC mismatch on exit frame for cell {cell_id}"
2317        )));
2318    }
2319
2320    // The ACK byte value is unused by the guest — its arrival is the signal.
2321    // 0x00 is chosen so any future protocol revision can repurpose non-zero
2322    // values without ambiguity.
2323    if let Err(e) = stream.write_all(&[0u8]).await {
2324        tracing::debug!(
2325            error = %e,
2326            socket = %socket_path.display(),
2327            "vsock ACK write failed (exit code already captured)"
2328        );
2329    }
2330
2331    Ok(i32::from_le_bytes(code_bytes))
2332}
2333
2334// ── Network isolation (TAP + nftables) ───────────────────────────────────────
2335
2336/// Sanitize and truncate a cell-id for use in interface and table names.
2337///
2338/// The kernel limits interface names to 15 bytes (`IFNAMSIZ - 1`); combined
2339/// with the `cfc-` prefix this leaves 11 bytes of slug.  We further constrain
2340/// to 8 lowercase hex chars derived from SHA-256 of the full cell id — this
2341/// avoids the zero-padding collision where `"a"` and `"a0000000"` would map
2342/// to the same slug under a naive right-pad scheme.
2343#[cfg(target_os = "linux")]
2344fn cell_id_short(cell_id: &str) -> String {
2345    use sha2::{Digest, Sha256};
2346    let digest = Sha256::digest(cell_id.as_bytes());
2347    format!(
2348        "{:08x}",
2349        u32::from_be_bytes([digest[0], digest[1], digest[2], digest[3]])
2350    )
2351}
2352
2353/// Compose the host TAP interface name from a sanitized cell-id slug.
2354#[cfg(target_os = "linux")]
2355fn tap_name_for(cell_short: &str) -> String {
2356    format!("{TAP_NAME_PREFIX}{cell_short}")
2357}
2358
/// Build the per-cell nftables table name: `cellos-` followed by the cell's
/// short slug.
#[cfg(target_os = "linux")]
fn nft_table_name(cell_short: &str) -> String {
    const PREFIX: &str = "cellos-";

    let mut table = String::with_capacity(PREFIX.len() + cell_short.len());
    table.push_str(PREFIX);
    table.push_str(cell_short);
    table
}
2364
2365/// Create a host TAP interface owned by `uid`, bring it up, and return its
2366/// name.  Linux-only — returns an error on every other OS.
2367///
2368/// The interface is created via `ip tuntap add dev <name> mode tap user <uid>`
2369/// followed by `ip link set dev <name> up`.  Both commands run via
2370/// `tokio::process::Command` with each argument passed individually — there is
2371/// no shell so no injection surface even if a future caller passes hostile
2372/// input.
2373#[cfg(target_os = "linux")]
2374async fn create_tap_device(cell_short: &str, uid: u32) -> Result<String, CellosError> {
2375    #[cfg(not(target_os = "linux"))]
2376    {
2377        let _ = (cell_short, uid);
2378        Err(CellosError::Host(
2379            "TAP device creation is only supported on Linux \
2380             (set CELLOS_FIRECRACKER_ENABLE_NETWORK=0 on this host)"
2381                .into(),
2382        ))
2383    }
2384    #[cfg(target_os = "linux")]
2385    {
2386        let name = tap_name_for(cell_short);
2387        // Defensive: should never trigger because cell_id_short caps at 8.
2388        if name.len() > 15 {
2389            return Err(CellosError::Host(format!(
2390                "computed TAP name {name:?} exceeds IFNAMSIZ (15)"
2391            )));
2392        }
2393
2394        let uid_str = uid.to_string();
2395        let add = tokio::process::Command::new("ip")
2396            .arg("tuntap")
2397            .arg("add")
2398            .arg("dev")
2399            .arg(&name)
2400            .arg("mode")
2401            .arg("tap")
2402            .arg("user")
2403            .arg(&uid_str)
2404            .output()
2405            .await
2406            .map_err(|e| CellosError::Host(format!("spawn `ip tuntap add` for {name}: {e}")))?;
2407        if !add.status.success() {
2408            return Err(CellosError::Host(format!(
2409                "`ip tuntap add dev {name}` failed: exit {:?} stderr={}",
2410                add.status.code(),
2411                String::from_utf8_lossy(&add.stderr).trim()
2412            )));
2413        }
2414
2415        let up = tokio::process::Command::new("ip")
2416            .arg("link")
2417            .arg("set")
2418            .arg("dev")
2419            .arg(&name)
2420            .arg("up")
2421            .output()
2422            .await
2423            .map_err(|e| CellosError::Host(format!("spawn `ip link set up` for {name}: {e}")))?;
2424        if !up.status.success() {
2425            // Best-effort rollback so we don't leak a half-configured TAP.
2426            let _ = delete_tap_device(&name).await;
2427            return Err(CellosError::Host(format!(
2428                "`ip link set dev {name} up` failed: exit {:?} stderr={}",
2429                up.status.code(),
2430                String::from_utf8_lossy(&up.stderr).trim()
2431            )));
2432        }
2433
2434        Ok(name)
2435    }
2436}
2437
2438/// Remove a host TAP interface created by [`create_tap_device`].  Idempotent —
2439/// returns `Ok(())` even when the device does not exist, so this is safe to
2440/// call from `destroy()` regardless of how far `create()` got.
2441#[cfg(target_os = "linux")]
2442async fn delete_tap_device(name: &str) -> Result<(), CellosError> {
2443    #[cfg(not(target_os = "linux"))]
2444    {
2445        let _ = name;
2446        Ok(())
2447    }
2448    #[cfg(target_os = "linux")]
2449    {
2450        let out = tokio::process::Command::new("ip")
2451            .arg("link")
2452            .arg("delete")
2453            .arg(name)
2454            .output()
2455            .await
2456            .map_err(|e| CellosError::Host(format!("spawn `ip link delete` for {name}: {e}")))?;
2457        if out.status.success() {
2458            return Ok(());
2459        }
2460        // Idempotent: missing device is not a failure.  `ip` writes
2461        // "Cannot find device" or "does not exist" to stderr in that case.
2462        let stderr = String::from_utf8_lossy(&out.stderr);
2463        if stderr.contains("Cannot find device") || stderr.contains("does not exist") {
2464            return Ok(());
2465        }
2466        Err(CellosError::Host(format!(
2467            "`ip link delete {name}` failed: exit {:?} stderr={}",
2468            out.status.code(),
2469            stderr.trim()
2470        )))
2471    }
2472}
2473
2474// FC-32 integration-test exposure: doc-hidden, Linux-only, additive only.
2475#[cfg(target_os = "linux")]
2476#[doc(hidden)]
2477pub mod __fc32 {
2478    pub fn cell_id_short(id: &str) -> String {
2479        super::cell_id_short(id)
2480    }
2481    pub fn tap_name_for(s: &str) -> String {
2482        super::tap_name_for(s)
2483    }
2484    pub async fn create_tap_device(s: &str, uid: u32) -> Result<String, super::CellosError> {
2485        super::create_tap_device(s, uid).await
2486    }
2487    pub async fn delete_tap_device(name: &str) -> Result<(), super::CellosError> {
2488        super::delete_tap_device(name).await
2489    }
2490}
2491
2492// FC-18 integration-test exposure: doc-hidden, additive. Mirrors `__fc32`.
2493// Platform-agnostic because the verification primitive is.
2494#[doc(hidden)]
2495pub mod __fc18 {
2496    pub const EXIT_HMAC_KEY_LEN: usize = super::EXIT_HMAC_KEY_LEN;
2497    pub const EXIT_HMAC_TAG_LEN: usize = super::EXIT_HMAC_TAG_LEN;
2498    pub fn verify_exit_hmac(
2499        key: &[u8],
2500        exit_code_bytes: &[u8; 4],
2501        cell_id: &str,
2502        received_tag: &[u8],
2503    ) -> bool {
2504        super::verify_exit_hmac(key, exit_code_bytes, cell_id, received_tag)
2505    }
2506}
2507
2508/// Build the nftables ruleset text for a cell.
2509///
2510/// The ruleset creates a fresh `ip` family table named `cellos-<cell_short>`
2511/// with two chains:
2512///
2513/// - `egress` (hooked at `forward`, priority `filter`): default DROP for
2514///   packets originating on the cell's TAP interface, with explicit ACCEPT
2515///   rules for each declared destination.
2516/// - `output` (hooked at `output`, priority `filter`): no-op accept — keeps
2517///   host-originated traffic unaffected.
2518///
2519/// The function is intentionally synchronous and free of process I/O so it
2520/// can be exercised by unit tests on any platform.  Hostnames in
2521/// `egress_rules` are emitted as inline comments only when they fail to parse
2522/// as IP literals; nftables itself does **not** resolve hostnames, so a
2523/// non-IP `host` becomes a dropped rule plus a `# unresolved` comment.
2524/// Resolution to IPs is the caller's responsibility (see
2525/// [`apply_network_policy`]).
2526#[cfg(target_os = "linux")]
2527fn build_nftables_ruleset(
2528    cell_short: &str,
2529    tap_iface: &str,
2530    egress_rules: &[EgressRule],
2531) -> String {
2532    use std::fmt::Write as _;
2533
2534    let table = nft_table_name(cell_short);
2535    let mut s = String::new();
2536    let _ = writeln!(s, "table ip {table} {{");
2537    let _ = writeln!(s, "    chain egress {{");
2538    let _ = writeln!(
2539        s,
2540        "        type filter hook forward priority filter; policy drop;"
2541    );
2542    let _ = writeln!(s, "        ct state established,related accept");
2543
2544    for rule in egress_rules {
2545        // Normalize protocol: tcp by default; "udp" → udp; anything else → tcp
2546        // (including "https" and "dns-acknowledged" which are application-level
2547        // and ride on tcp/udp respectively at the IP layer).
2548        let l4 = match rule.protocol.as_deref().map(|p| p.to_ascii_lowercase()) {
2549            Some(ref p) if p == "udp" => "udp",
2550            Some(ref p) if p == "dns-acknowledged" => "udp",
2551            _ => "tcp",
2552        };
2553
2554        match rule.host.parse::<std::net::IpAddr>() {
2555            Ok(std::net::IpAddr::V4(ip)) => {
2556                let _ = writeln!(
2557                    s,
2558                    "        iifname \"{tap_iface}\" ip daddr {ip} {l4} dport {port} accept",
2559                    port = rule.port
2560                );
2561            }
2562            Ok(std::net::IpAddr::V6(_)) => {
2563                // table ip is IPv4-only; IPv6 addresses cannot be expressed as
2564                // ip daddr rules.  Emit a comment so operators can see the rule
2565                // was declared but silently skipped rather than crashing nft.
2566                let _ = writeln!(
2567                    s,
2568                    "        # skipped IPv6 {host:?} port {port} {l4} — table ip is IPv4-only",
2569                    host = rule.host,
2570                    port = rule.port
2571                );
2572            }
2573            Err(_) => {
2574                // Hostname not pre-resolved: emit a comment so operators see
2575                // that the rule was declared but cannot be applied without DNS.
2576                let _ = writeln!(
2577                    s,
2578                    "        # unresolved host {host:?} port {port} {l4} — no accept rule",
2579                    host = rule.host,
2580                    port = rule.port
2581                );
2582            }
2583        }
2584    }
2585
2586    let _ = writeln!(s, "        iifname \"{tap_iface}\" drop");
2587    let _ = writeln!(s, "    }}");
2588    let _ = writeln!(s, "}}");
2589
2590    // IPv6 sibling table: policy drop by default, with per-rule accept entries
2591    // for declared IPv6 destinations. Defense-in-depth alongside ipv6.disable=1
2592    // in the kernel boot args — if the boot flag is ever removed, this table
2593    // ensures IPv6 traffic is still allowlist-filtered.
2594    let _ = writeln!(s, "table ip6 {table} {{");
2595    let _ = writeln!(s, "    chain egress {{");
2596    let _ = writeln!(
2597        s,
2598        "        type filter hook forward priority filter; policy drop;"
2599    );
2600    let _ = writeln!(s, "        ct state established,related accept");
2601
2602    for rule in egress_rules {
2603        let l4 = match rule.protocol.as_deref().map(|p| p.to_ascii_lowercase()) {
2604            Some(ref p) if p == "udp" => "udp",
2605            Some(ref p) if p == "dns-acknowledged" => "udp",
2606            _ => "tcp",
2607        };
2608        if let Ok(std::net::IpAddr::V6(ip)) = rule.host.parse::<std::net::IpAddr>() {
2609            let _ = writeln!(
2610                s,
2611                "        iifname \"{tap_iface}\" ip6 daddr {ip} {l4} dport {port} accept",
2612                port = rule.port
2613            );
2614        }
2615    }
2616
2617    let _ = writeln!(s, "        iifname \"{tap_iface}\" drop");
2618    let _ = writeln!(s, "    }}");
2619    let _ = writeln!(s, "}}");
2620    s
2621}
2622
2623/// Resolve hostnames in `egress_rules` to a flat list of `EgressRule` entries
2624/// whose `host` field is an IP literal.  Each input rule may expand to zero
2625/// or more output rules (zero on resolution failure, multiple when DNS
2626/// returns several addresses).
2627#[cfg(target_os = "linux")]
2628async fn resolve_egress_targets(egress_rules: &[EgressRule]) -> Vec<EgressRule> {
2629    let mut resolved = Vec::with_capacity(egress_rules.len());
2630    for rule in egress_rules {
2631        if rule.host.parse::<std::net::IpAddr>().is_ok() {
2632            resolved.push(rule.clone());
2633            continue;
2634        }
2635        match tokio::net::lookup_host((rule.host.as_str(), rule.port)).await {
2636            Ok(addrs) => {
2637                let mut any = false;
2638                for sa in addrs {
2639                    any = true;
2640                    resolved.push(EgressRule {
2641                        host: sa.ip().to_string(),
2642                        port: rule.port,
2643                        protocol: rule.protocol.clone(),
2644                        dns_egress_justification: rule.dns_egress_justification.clone(),
2645                    });
2646                }
2647                if !any {
2648                    tracing::warn!(host = %rule.host, "DNS returned no addresses; egress rule skipped");
2649                }
2650            }
2651            Err(e) => {
2652                tracing::warn!(error = %e, host = %rule.host, "DNS resolution failed; egress rule skipped");
2653            }
2654        }
2655    }
2656    resolved
2657}
2658
2659/// Apply the per-cell nftables ruleset by piping the text from
2660/// [`build_nftables_ruleset`] into `nft -f -`.  Linux-only.
2661///
2662/// Hostnames in `egress_rules` are resolved to IPs via the host's resolver
2663/// before the ruleset is generated; rules whose host fails to resolve are
2664/// dropped with a warning (the default-DROP policy still applies).
2665#[cfg(target_os = "linux")]
2666async fn apply_network_policy(
2667    cell_short: &str,
2668    tap_iface: &str,
2669    egress_rules: &[EgressRule],
2670) -> Result<(), CellosError> {
2671    #[cfg(not(target_os = "linux"))]
2672    {
2673        let _ = (cell_short, tap_iface, egress_rules);
2674        Err(CellosError::Host(
2675            "nftables policy enforcement is only supported on Linux".into(),
2676        ))
2677    }
2678    #[cfg(target_os = "linux")]
2679    {
2680        // Make sure no stale table from a previous run with the same cell-id
2681        // short slug is still present — `nft -f -` with `add table` would
2682        // append rather than replace.
2683        let _ = remove_network_policy(cell_short).await;
2684
2685        let resolved = resolve_egress_targets(egress_rules).await;
2686        let ruleset = build_nftables_ruleset(cell_short, tap_iface, &resolved);
2687
2688        let mut child = tokio::process::Command::new("nft")
2689            .arg("-f")
2690            .arg("-")
2691            .stdin(std::process::Stdio::piped())
2692            .stdout(std::process::Stdio::piped())
2693            .stderr(std::process::Stdio::piped())
2694            .spawn()
2695            .map_err(|e| CellosError::Host(format!("spawn nft: {e}")))?;
2696
2697        if let Some(mut stdin) = child.stdin.take() {
2698            stdin.write_all(ruleset.as_bytes()).await.map_err(|e| {
2699                CellosError::Host(format!("write nftables ruleset to nft stdin: {e}"))
2700            })?;
2701            stdin
2702                .shutdown()
2703                .await
2704                .map_err(|e| CellosError::Host(format!("close nft stdin: {e}")))?;
2705        }
2706
2707        let output = child
2708            .wait_with_output()
2709            .await
2710            .map_err(|e| CellosError::Host(format!("wait for nft: {e}")))?;
2711        if !output.status.success() {
2712            return Err(CellosError::Host(format!(
2713                "nft -f - rejected ruleset for {cell_short}: exit {:?} stderr={}",
2714                output.status.code(),
2715                String::from_utf8_lossy(&output.stderr).trim()
2716            )));
2717        }
2718        Ok(())
2719    }
2720}
2721
2722/// Drop the per-cell nftables table.  Idempotent — succeeds even when the
2723/// table does not exist, so it is safe to call from `destroy()` after a
2724/// half-failed `create()`.
2725#[cfg(target_os = "linux")]
2726async fn remove_network_policy(cell_short: &str) -> Result<(), CellosError> {
2727    #[cfg(not(target_os = "linux"))]
2728    {
2729        let _ = cell_short;
2730        Ok(())
2731    }
2732    #[cfg(target_os = "linux")]
2733    {
2734        let table = nft_table_name(cell_short);
2735        let out = tokio::process::Command::new("nft")
2736            .arg("delete")
2737            .arg("table")
2738            .arg("ip")
2739            .arg(&table)
2740            .output()
2741            .await
2742            .map_err(|e| CellosError::Host(format!("spawn `nft delete table {table}`: {e}")))?;
2743        if out.status.success() {
2744            return Ok(());
2745        }
2746        let stderr = String::from_utf8_lossy(&out.stderr);
2747        // nftables emits "No such file or directory" / "does not exist" when
2748        // the table is absent — that is the expected idempotent path.
2749        if stderr.contains("No such file or directory")
2750            || stderr.contains("does not exist")
2751            || stderr.contains("Could not process rule")
2752        {
2753            return Ok(());
2754        }
2755        Err(CellosError::Host(format!(
2756            "`nft delete table ip {table}` failed: exit {:?} stderr={}",
2757            out.status.code(),
2758            stderr.trim()
2759        )))
2760    }
2761}
2762
2763// ── Tests ─────────────────────────────────────────────────────────────────────
2764//
2765// The test module is Linux-only because nearly every helper it exercises
2766// (`listen_for_exit_code`, `build_jailer_argv`, `build_nftables_ruleset`,
2767// `verify_artifacts`, …) is itself Linux-only — Firecracker is a Linux/KVM
2768// VMM. Cross-platform `cargo check --workspace` only needs the public API
2769// surface to compile, not the test bodies.
2770
2771#[cfg(all(test, target_os = "linux"))]
2772mod tests {
2773    use super::*;
2774
2775    #[test]
2776    fn config_parses_required_paths() {
2777        let config = FirecrackerConfig::from_lookup(|key| match key {
2778            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/firecracker/firecracker".into()),
2779            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/firecracker/vmlinux.bin".into()),
2780            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/firecracker/rootfs.ext4".into()),
2781            "CELLOS_FIRECRACKER_JAILER_BINARY" => Some("/opt/firecracker/jailer".into()),
2782            "CELLOS_FIRECRACKER_CHROOT_BASE" => Some("/var/lib/cellos/firecracker".into()),
2783            // Opt out of mandatory manifest enforcement; this test exercises
2784            // path parsing, not the manifest guard.
2785            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
2786            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
2787            _ => None,
2788        })
2789        .unwrap();
2790
2791        assert_eq!(
2792            config.binary_path,
2793            PathBuf::from("/opt/firecracker/firecracker")
2794        );
2795        assert_eq!(
2796            config.kernel_image_path,
2797            PathBuf::from("/opt/firecracker/vmlinux.bin")
2798        );
2799        assert_eq!(
2800            config.rootfs_image_path,
2801            PathBuf::from("/opt/firecracker/rootfs.ext4")
2802        );
2803        assert_eq!(
2804            config.jailer_binary_path,
2805            Some(PathBuf::from("/opt/firecracker/jailer"))
2806        );
2807        assert_eq!(
2808            config.chroot_base_dir,
2809            PathBuf::from("/var/lib/cellos/firecracker")
2810        );
2811    }
2812
2813    #[test]
2814    fn config_socket_dir_defaults_to_tmp() {
2815        let config = FirecrackerConfig::from_lookup(|key| match key {
2816            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/firecracker/firecracker".into()),
2817            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/firecracker/vmlinux.bin".into()),
2818            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/firecracker/rootfs.ext4".into()),
2819            // Opt out of mandatory manifest enforcement; this test exercises
2820            // socket-dir defaulting, not the manifest guard.
2821            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
2822            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
2823            _ => None,
2824        })
2825        .unwrap();
2826        assert_eq!(config.socket_dir, PathBuf::from("/tmp"));
2827    }
2828
2829    #[test]
2830    fn config_requires_absolute_paths() {
2831        let err = FirecrackerConfig::from_lookup(|key| match key {
2832            "CELLOS_FIRECRACKER_BINARY" => Some("firecracker".into()),
2833            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/firecracker/vmlinux.bin".into()),
2834            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/firecracker/rootfs.ext4".into()),
2835            _ => None,
2836        })
2837        .unwrap_err();
2838
2839        assert!(err
2840            .to_string()
2841            .contains("CELLOS_FIRECRACKER_BINARY must be an absolute path"));
2842    }
2843
2844    #[test]
2845    fn config_requires_binary_path() {
2846        let err = FirecrackerConfig::from_lookup(|key| match key {
2847            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/firecracker/vmlinux.bin".into()),
2848            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/firecracker/rootfs.ext4".into()),
2849            _ => None,
2850        })
2851        .unwrap_err();
2852
2853        assert!(err
2854            .to_string()
2855            .contains("CELLOS_FIRECRACKER_BINARY must be set"));
2856    }
2857
2858    #[test]
2859    fn build_boot_args_includes_cell_id_and_vsock_port() {
2860        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
2861            "apiVersion": "cellos.io/v1",
2862            "kind": "ExecutionCell",
2863            "spec": {
2864                "id": "my-cell-001",
2865                "authority": { "secretRefs": [] },
2866                "lifetime": { "ttlSeconds": 60 }
2867            }
2868        }))
2869        .unwrap();
2870        let args = build_boot_args(&doc, None);
2871        assert!(args.contains("console=ttyS0"));
2872        assert!(args.contains("cellos.cell_id=my-cell-001"));
2873        assert!(
2874            args.contains("cellos.vsock_port=9000"),
2875            "vsock port must be encoded"
2876        );
2877    }
2878
2879    #[test]
2880    fn build_boot_args_encodes_argv_when_run_present() {
2881        use base64::engine::general_purpose::STANDARD as BASE64_STANDARD;
2882        use base64::Engine;
2883
2884        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
2885            "apiVersion": "cellos.io/v1",
2886            "kind": "ExecutionCell",
2887            "spec": {
2888                "id": "argv-cell",
2889                "authority": { "secretRefs": [] },
2890                "lifetime": { "ttlSeconds": 60 },
2891                "run": { "argv": ["echo", "hello world"] }
2892            }
2893        }))
2894        .unwrap();
2895        let args = build_boot_args(&doc, None);
2896
2897        // Extract cellos.argv=<b64> token and decode it.
2898        let b64 = args
2899            .split_ascii_whitespace()
2900            .find(|t| t.starts_with("cellos.argv="))
2901            .expect("cellos.argv not found in boot args")
2902            .strip_prefix("cellos.argv=")
2903            .unwrap();
2904
2905        let json = BASE64_STANDARD.decode(b64).expect("base64 decode");
2906        let decoded: Vec<String> = serde_json::from_slice(&json).expect("json decode");
2907        assert_eq!(decoded, vec!["echo", "hello world"]);
2908    }
2909
2910    /// Regression: commit d14f134 dropped the `root=/dev/vda rw` boot arg.
2911    /// The kernel then panicked with "Unable to mount root fs on
2912    /// unknown-block(0,0)" after serial init. The fix was re-applied in
2913    /// 95a1941. This test pins the requirement so the same revert cannot
2914    /// happen silently again.
2915    #[test]
2916    fn build_boot_args_includes_root_dev_vda_rw() {
2917        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
2918            "apiVersion": "cellos.io/v1",
2919            "kind": "ExecutionCell",
2920            "spec": {
2921                "id": "root-arg-cell",
2922                "authority": { "secretRefs": [] },
2923                "lifetime": { "ttlSeconds": 60 }
2924            }
2925        }))
2926        .unwrap();
2927        let args = build_boot_args(&doc, None);
2928        assert!(
2929            args.contains("root=/dev/vda"),
2930            "boot args MUST set root=/dev/vda — without it kernel panics at root mount. args={args:?}"
2931        );
2932        assert!(
2933            args.contains(" rw"),
2934            "rootfs must be mounted read-write (cellos-init writes to /proc, /sys mounts). args={args:?}"
2935        );
2936        assert!(
2937            args.contains("console=ttyS0"),
2938            "console=ttyS0 required for supervisor to read kernel stdout. args={args:?}"
2939        );
2940    }
2941
2942    /// Regression: commit d14f134 reverted the Firecracker arg name from
2943    /// `--level` to the (never-valid) `--log-level`. Firecracker exited at
2944    /// startup with `ParseArguments(UnexpectedArgument("log-level"))` and the
2945    /// supervisor timed out waiting for the API socket. Re-applied in 7f190d7.
2946    /// This test pins both the correct arg name AND the absence of the wrong
2947    /// one — a future agent's whole-file copy from a stale base would clobber
2948    /// the fix without this assertion.
2949    #[test]
2950    fn firecracker_argv_uses_level_not_log_level() {
2951        // Direct (no-jailer) path
2952        let direct = build_direct_argv("/tmp/fc.sock", false);
2953        assert!(
2954            direct.contains(&"--level"),
2955            "direct argv must contain --level: {direct:?}"
2956        );
2957        assert!(
2958            !direct.contains(&"--log-level"),
2959            "direct argv must NOT contain --log-level (Firecracker rejects it): {direct:?}"
2960        );
2961
2962        // Jailer path
2963        let jailer = build_jailer_argv(
2964            "cell-1",
2965            "/usr/bin/firecracker",
2966            "1000",
2967            "1000",
2968            "/tmp",
2969            false,
2970        );
2971        assert!(
2972            jailer.contains(&"--level"),
2973            "jailer argv must contain --level: {jailer:?}"
2974        );
2975        assert!(
2976            !jailer.contains(&"--log-level"),
2977            "jailer argv must NOT contain --log-level (Firecracker rejects it): {jailer:?}"
2978        );
2979    }
2980
2981    /// Pin the structural shape of the jailer argv (positional ordering, the
2982    /// `--` separator, the in-jail socket path). Catches accidental reorders
2983    /// or omissions that would still parse but produce wrong-looking VMs.
2984    #[test]
2985    fn build_jailer_argv_has_required_positionals_and_separator() {
2986        let argv = build_jailer_argv(
2987            "my-cell",
2988            "/usr/bin/firecracker",
2989            "1000",
2990            "1000",
2991            "/srv/fc",
2992            false,
2993        );
2994        // ID + exec-file + uid + gid + chroot + -- + api-sock + level
2995        assert_eq!(argv[0], "--id");
2996        assert_eq!(argv[1], "my-cell");
2997        assert_eq!(argv[2], "--exec-file");
2998        assert_eq!(argv[3], "/usr/bin/firecracker");
2999        // The `--` separator must precede the firecracker-side args.
3000        let dash_dash = argv.iter().position(|a| *a == "--").expect("missing --");
3001        // After --: api-sock + value + level + value
3002        assert_eq!(argv[dash_dash + 1], "--api-sock");
3003        assert_eq!(
3004            argv[dash_dash + 2],
3005            "/run/firecracker.socket",
3006            "in-jail socket path must be /run/firecracker.socket"
3007        );
3008    }
3009
3010    #[test]
3011    fn build_boot_args_omits_argv_when_run_absent() {
3012        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3013            "apiVersion": "cellos.io/v1",
3014            "kind": "ExecutionCell",
3015            "spec": {
3016                "id": "no-run-cell",
3017                "authority": { "secretRefs": [] },
3018                "lifetime": { "ttlSeconds": 60 }
3019            }
3020        }))
3021        .unwrap();
3022        let args = build_boot_args(&doc, None);
3023        assert!(
3024            !args.contains("cellos.argv="),
3025            "cellos.argv must be absent when spec.run is missing"
3026        );
3027    }
3028
3029    /// FC-18: when an HMAC key is provided, `build_boot_args` must emit a
3030    /// `cellos.exit_hmac_key=<base64>` token whose decoded bytes equal the
3031    /// provided key. Pin both the prefix and the round-trip so a future
3032    /// rename or accidental hex-encoding regresses loudly.
3033    #[test]
3034    fn build_boot_args_includes_exit_hmac_key_when_provided() {
3035        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3036            "apiVersion": "cellos.io/v1",
3037            "kind": "ExecutionCell",
3038            "spec": {
3039                "id": "fc18-cell",
3040                "authority": { "secretRefs": [] },
3041                "lifetime": { "ttlSeconds": 60 }
3042            }
3043        }))
3044        .unwrap();
3045        let key = [0x9Au8; 32];
3046        let args = build_boot_args(&doc, Some(&key));
3047        let token = args
3048            .split_ascii_whitespace()
3049            .find(|t| t.starts_with("cellos.exit_hmac_key="))
3050            .expect("FC-18 hmac key token missing");
3051        let b64 = token.strip_prefix("cellos.exit_hmac_key=").unwrap();
3052        let decoded = BASE64_STANDARD.decode(b64).expect("base64 decode");
3053        assert_eq!(decoded, key.to_vec());
3054    }
3055
3056    /// FC-18: when no key is provided, the token MUST be absent — the host
3057    /// listener will then fail-shut on the missing-tag wire shape.
3058    #[test]
3059    fn build_boot_args_omits_exit_hmac_key_when_absent() {
3060        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3061            "apiVersion": "cellos.io/v1",
3062            "kind": "ExecutionCell",
3063            "spec": {
3064                "id": "no-fc18-cell",
3065                "authority": { "secretRefs": [] },
3066                "lifetime": { "ttlSeconds": 60 }
3067            }
3068        }))
3069        .unwrap();
3070        let args = build_boot_args(&doc, None);
3071        assert!(
3072            !args.contains("cellos.exit_hmac_key="),
3073            "cellos.exit_hmac_key must be absent when no key is provided"
3074        );
3075    }
3076
3077    /// Pure helper: a tag minted under the wrong key MUST NOT verify.
3078    /// This is the FC-18 acceptance property — extracted as a unit test
3079    /// here so it locks the contract independent of any vsock plumbing.
3080    #[test]
3081    fn verify_exit_hmac_rejects_wrong_key() {
3082        let real_key = [0x01u8; 32];
3083        let attacker_key = [0x02u8; 32];
3084        let cell_id = "cell-x";
3085        let code: i32 = 0;
3086        let attacker_tag = fc18_compute_tag(&attacker_key, code, cell_id);
3087        assert!(!verify_exit_hmac(
3088            &real_key,
3089            &code.to_le_bytes(),
3090            cell_id,
3091            &attacker_tag
3092        ));
3093    }
3094
3095    /// Pure helper: a tag minted for a different cell-id MUST NOT verify
3096    /// even with the right key — this is the cross-cell replay defence.
3097    #[test]
3098    fn verify_exit_hmac_rejects_wrong_cell_id() {
3099        let key = [0x01u8; 32];
3100        let code: i32 = 0;
3101        let other_tag = fc18_compute_tag(&key, code, "other-cell");
3102        assert!(!verify_exit_hmac(
3103            &key,
3104            &code.to_le_bytes(),
3105            "this-cell",
3106            &other_tag
3107        ));
3108    }
3109
3110    /// Pure helper: legitimate tag verifies cleanly.
3111    #[test]
3112    fn verify_exit_hmac_accepts_legitimate_tag() {
3113        let key = [0x77u8; 32];
3114        let cell_id = "the-real-cell";
3115        let code: i32 = 137;
3116        let tag = fc18_compute_tag(&key, code, cell_id);
3117        assert!(verify_exit_hmac(&key, &code.to_le_bytes(), cell_id, &tag));
3118    }
3119
3120    /// Pure helper: a tag of the wrong length MUST NOT verify.
3121    /// Defends against truncation or padding attacks.
3122    #[test]
3123    fn verify_exit_hmac_rejects_wrong_length() {
3124        let key = [0x01u8; 32];
3125        let code: i32 = 0;
3126        let bogus = [0u8; 16];
3127        assert!(!verify_exit_hmac(
3128            &key,
3129            &code.to_le_bytes(),
3130            "any-cell",
3131            &bogus
3132        ));
3133    }
3134
3135    /// Helper for the listener tests below: precomputes the FC-18 tag a
3136    /// legitimate guest would produce. Mirrors `cellos-init::compute_exit_hmac`
3137    /// — kept inline so the test crate doesn't need to depend on
3138    /// `cellos-init`'s binary surface.
3139    fn fc18_compute_tag(key: &[u8], code: i32, cell_id: &str) -> [u8; 32] {
3140        use hmac::{digest::KeyInit, Hmac, Mac};
3141        use sha2::Sha256;
3142        type HmacSha256 = Hmac<Sha256>;
3143        let mut mac = HmacSha256::new_from_slice(key).expect("any key length");
3144        mac.update(&code.to_le_bytes());
3145        mac.update(cell_id.as_bytes());
3146        let tag = mac.finalize().into_bytes();
3147        let mut out = [0u8; 32];
3148        out.copy_from_slice(&tag);
3149        out
3150    }
3151
3152    /// Verifies the vsock exit-code listener round-trip: write the FC-18
3153    /// authenticated frame (4-byte LE code + 32-byte HMAC tag), read the
3154    /// 1-byte ACK back, verify the listener reads the correct i32. The ACK
3155    /// read mirrors what cellos-init does in production — the listener
3156    /// writes the byte unconditionally after a successful HMAC verify.
3157    #[tokio::test]
3158    async fn listen_for_exit_code_round_trip() {
3159        use tokio::io::{AsyncReadExt, AsyncWriteExt};
3160        use tokio::net::UnixStream;
3161
3162        let dir = tempfile::tempdir().expect("tmpdir");
3163        let socket_path = dir.path().join("test_exit.socket");
3164        let key = [0xAAu8; 32];
3165        let cell_id = "test-cell-rt";
3166
3167        // Spawn the listener first.
3168        let path_clone = socket_path.clone();
3169        let key_clone = key;
3170        let cell_id_clone = cell_id.to_string();
3171        let handle = tokio::spawn(async move {
3172            listen_for_exit_code(&path_clone, &key_clone, &cell_id_clone).await
3173        });
3174
3175        // Give the listener a moment to bind before connecting.
3176        tokio::time::sleep(Duration::from_millis(10)).await;
3177
3178        let mut stream = UnixStream::connect(&socket_path)
3179            .await
3180            .expect("connect to listener");
3181        let code = 42i32;
3182        let tag = fc18_compute_tag(&key, code, cell_id);
3183        stream
3184            .write_all(&code.to_le_bytes())
3185            .await
3186            .expect("write exit code");
3187        stream.write_all(&tag).await.expect("write hmac tag");
3188
3189        let mut ack = [0u8; 1];
3190        stream.read_exact(&mut ack).await.expect("read ACK");
3191        assert_eq!(ack[0], 0x00, "host writes 0x00 ACK after verifying frame");
3192
3193        let received = handle.await.expect("join").expect("listen_for_exit_code");
3194        assert_eq!(received, 42);
3195    }
3196
3197    #[tokio::test]
3198    async fn listen_for_exit_code_negative_exit_code() {
3199        use tokio::io::{AsyncReadExt, AsyncWriteExt};
3200        use tokio::net::UnixStream;
3201
3202        let dir = tempfile::tempdir().expect("tmpdir");
3203        let socket_path = dir.path().join("test_exit_neg.socket");
3204        let key = [0x55u8; 32];
3205        let cell_id = "test-cell-neg";
3206
3207        let path_clone = socket_path.clone();
3208        let key_clone = key;
3209        let cell_id_clone = cell_id.to_string();
3210        let handle = tokio::spawn(async move {
3211            listen_for_exit_code(&path_clone, &key_clone, &cell_id_clone).await
3212        });
3213
3214        tokio::time::sleep(Duration::from_millis(10)).await;
3215
3216        let mut stream = UnixStream::connect(&socket_path).await.expect("connect");
3217        let code = -1i32;
3218        let tag = fc18_compute_tag(&key, code, cell_id);
3219        stream.write_all(&code.to_le_bytes()).await.expect("write");
3220        stream.write_all(&tag).await.expect("write hmac tag");
3221
3222        let mut ack = [0u8; 1];
3223        stream.read_exact(&mut ack).await.expect("read ACK");
3224        assert_eq!(ack[0], 0x00);
3225
3226        let received = handle.await.expect("join").expect("listen");
3227        assert_eq!(received, -1);
3228    }
3229
3230    /// Asserts the ACK protocol contract specifically: after the
3231    /// authenticated frame is read AND verified, the listener writes
3232    /// exactly one byte and no more, and that byte is the agreed-upon 0x00
3233    /// sentinel. Reading 2 bytes must hit EOF on the second byte — the
3234    /// listener closes the stream after the ACK.
3235    #[tokio::test]
3236    async fn listen_for_exit_code_writes_exactly_one_ack_byte() {
3237        use tokio::io::{AsyncReadExt, AsyncWriteExt};
3238        use tokio::net::UnixStream;
3239
3240        let dir = tempfile::tempdir().expect("tmpdir");
3241        let socket_path = dir.path().join("test_exit_ack.socket");
3242        let key = [0x33u8; 32];
3243        let cell_id = "test-cell-ack";
3244
3245        let path_clone = socket_path.clone();
3246        let key_clone = key;
3247        let cell_id_clone = cell_id.to_string();
3248        let handle = tokio::spawn(async move {
3249            listen_for_exit_code(&path_clone, &key_clone, &cell_id_clone).await
3250        });
3251
3252        tokio::time::sleep(Duration::from_millis(10)).await;
3253
3254        let mut stream = UnixStream::connect(&socket_path).await.expect("connect");
3255        let code = 7i32;
3256        let tag = fc18_compute_tag(&key, code, cell_id);
3257        stream
3258            .write_all(&code.to_le_bytes())
3259            .await
3260            .expect("write exit code");
3261        stream.write_all(&tag).await.expect("write hmac tag");
3262
3263        // Drain everything the host sends. The contract is "exactly one byte
3264        // then EOF", so read_to_end should yield a single 0x00.
3265        let mut sink = Vec::new();
3266        stream.read_to_end(&mut sink).await.expect("drain ack");
3267        assert_eq!(
3268            sink,
3269            vec![0x00],
3270            "listener must write exactly one 0x00 ACK byte then close"
3271        );
3272
3273        let code_received = handle.await.expect("join").expect("listen");
3274        assert_eq!(code_received, 7);
3275    }
3276
3277    /// FC-18: the listener MUST reject a frame whose HMAC does not verify.
3278    /// Behavioural contract: no ACK is written, and the spawned task
3279    /// surfaces a `vsock_exit_auth_rejected` error rather than a clean
3280    /// exit code. This is the property that prevents a different process
3281    /// (CID 3, another cell) from forging the legitimate cell's exit.
3282    #[tokio::test]
3283    async fn listen_for_exit_code_rejects_forged_hmac() {
3284        use tokio::io::AsyncWriteExt;
3285        use tokio::net::UnixStream;
3286
3287        let dir = tempfile::tempdir().expect("tmpdir");
3288        let socket_path = dir.path().join("test_exit_forged.socket");
3289        let real_key = [0x01u8; 32];
3290        let attacker_key = [0x02u8; 32];
3291        let cell_id = "test-cell-forged";
3292
3293        let path_clone = socket_path.clone();
3294        let key_clone = real_key;
3295        let cell_id_clone = cell_id.to_string();
3296        let handle = tokio::spawn(async move {
3297            listen_for_exit_code(&path_clone, &key_clone, &cell_id_clone).await
3298        });
3299
3300        tokio::time::sleep(Duration::from_millis(10)).await;
3301
3302        let mut stream = UnixStream::connect(&socket_path).await.expect("connect");
3303        // Attacker sends a plausible exit code with a tag minted under the
3304        // wrong key — this is exactly the FC-18 acceptance scenario.
3305        let attacker_tag = fc18_compute_tag(&attacker_key, 0, cell_id);
3306        stream
3307            .write_all(&0i32.to_le_bytes())
3308            .await
3309            .expect("write code");
3310        stream.write_all(&attacker_tag).await.expect("write tag");
3311
3312        let result = handle.await.expect("join");
3313        match result {
3314            Err(CellosError::Host(msg)) => {
3315                assert!(
3316                    msg.contains("vsock_exit_auth_rejected"),
3317                    "expected FC-18 rejection marker in error, got: {msg}"
3318                );
3319            }
3320            other => panic!("expected Err(Host(vsock_exit_auth_rejected)), got {other:?}"),
3321        }
3322    }
3323
3324    #[test]
3325    fn config_parses_jailer_uid_and_gid() {
3326        let cfg = FirecrackerConfig::from_lookup(|key| match key {
3327            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
3328            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
3329            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
3330            "CELLOS_FIRECRACKER_JAILER_UID" => Some("10100".into()),
3331            "CELLOS_FIRECRACKER_JAILER_GID" => Some("10200".into()),
3332            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
3333            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
3334            _ => None,
3335        })
3336        .unwrap();
3337        assert_eq!(cfg.jailer_uid, 10100);
3338        assert_eq!(cfg.jailer_gid, 10200);
3339    }
3340
3341    #[test]
3342    fn config_jailer_uid_gid_default_to_10002() {
3343        let cfg = FirecrackerConfig::from_lookup(|key| match key {
3344            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
3345            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
3346            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
3347            // Tests that don't probe manifest behaviour opt out of the
3348            // mandatory-by-default manifest guard so they exercise their
3349            // own subject without tripping the unrelated check.
3350            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
3351            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
3352            _ => None,
3353        })
3354        .unwrap();
3355        assert_eq!(cfg.jailer_uid, 10002);
3356        assert_eq!(cfg.jailer_gid, 10002);
3357    }
3358
3359    #[test]
3360    fn resolve_socket_path_without_jailer_uses_socket_dir() {
3361        use uuid::Uuid;
3362        let cfg = FirecrackerConfig::from_lookup(|key| match key {
3363            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
3364            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
3365            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
3366            // Tests that don't probe manifest behaviour opt out of the
3367            // mandatory-by-default manifest guard so they exercise their
3368            // own subject without tripping the unrelated check.
3369            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
3370            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
3371            _ => None,
3372        })
3373        .unwrap();
3374        let token = Uuid::nil();
3375        let path = resolve_socket_path(&cfg, "test-cell", &token);
3376        assert!(
3377            path.starts_with(&cfg.socket_dir),
3378            "expected socket in socket_dir, got {path:?}"
3379        );
3380        assert!(path.to_string_lossy().contains("test-cell"));
3381    }
3382
3383    #[test]
3384    fn resolve_socket_path_with_jailer_uses_chroot_base() {
3385        use uuid::Uuid;
3386        let cfg = FirecrackerConfig::from_lookup(|key| match key {
3387            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
3388            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
3389            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
3390            "CELLOS_FIRECRACKER_JAILER_BINARY" => Some("/opt/fc/jailer".into()),
3391            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
3392            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
3393            _ => None,
3394        })
3395        .unwrap();
3396        let token = Uuid::nil();
3397        let path = resolve_socket_path(&cfg, "test-cell", &token);
3398        // Expected: <chroot_base>/firecracker/test-cell/root/run/firecracker.socket
3399        let expected = cfg
3400            .chroot_base_dir
3401            .join("firecracker")
3402            .join("test-cell")
3403            .join("root/run/firecracker.socket");
3404        assert_eq!(path, expected, "got: {path:?}");
3405    }
3406
3407    /// Verifies that `create()` now attempts to spawn the VMM binary and fails
3408    /// with a spawn error (not "not implemented") when the binary does not exist.
3409    /// This proves the L2-06 lifecycle path is wired, not just a stub.
3410    #[tokio::test]
3411    async fn create_fails_with_spawn_error_when_binary_missing() {
3412        let backend = FirecrackerCellBackend::new(FirecrackerConfig {
3413            binary_path: PathBuf::from("/nonexistent/firecracker"),
3414            kernel_image_path: PathBuf::from("/opt/firecracker/vmlinux.bin"),
3415            rootfs_image_path: PathBuf::from("/opt/firecracker/rootfs.ext4"),
3416            jailer_binary_path: None,
3417            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
3418            socket_dir: PathBuf::from("/tmp"),
3419            jailer_uid: 10002,
3420            jailer_gid: 10002,
3421            scratch_dir: None,
3422            manifest_path: None,
3423            // This pre-existing test validates spawn-error wiring without the
3424            // jailer; opt out of jailer enforcement so we still hit the spawn path.
3425            require_jailer: false,
3426            // Manifest verification opted out for the same reason — keep the
3427            // test exercising the spawn path, not the new pre-boot guard.
3428            allow_no_manifest: true,
3429            // Network enforcement off so this test exercises the spawn path
3430            // without requiring `ip` / `nft` / CAP_NET_ADMIN in CI.
3431            enable_network: false,
3432            allow_no_vsock: false,
3433            no_vsock_timeout: std::time::Duration::from_secs(5),
3434            no_seccomp: false,
3435        });
3436        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3437            "apiVersion": "cellos.io/v1",
3438            "kind": "ExecutionCell",
3439            "spec": {
3440                "id": "spawn-err-test",
3441                "authority": { "secretRefs": [] },
3442                "lifetime": { "ttlSeconds": 60 }
3443            }
3444        }))
3445        .unwrap();
3446
3447        let err = backend.create(&doc).await.unwrap_err();
3448        let msg = err.to_string();
3449        // Must NOT contain the old scaffold message.
3450        assert!(
3451            !msg.contains("not implemented"),
3452            "expected spawn error, got old scaffold message: {msg}"
3453        );
3454        // Must mention "spawn" or "nonexistent" (OS spawn failure).
3455        assert!(
3456            msg.contains("spawn") || msg.contains("nonexistent") || msg.contains("No such file"),
3457            "expected spawn error message, got: {msg}"
3458        );
3459    }
3460
3461    // ── derive_vcpu_count tests ─────────────────────────────────────────────
3462
3463    /// Build an `ExecutionCellSpec` whose `run.limits.cpu_max` matches the args.
3464    /// All other fields are populated from a minimal valid JSON document so
3465    /// this stays robust to future required fields on `ExecutionCellSpec`.
3466    fn make_spec_with_cpu(
3467        quota_micros: u64,
3468        period_micros: Option<u64>,
3469    ) -> cellos_core::ExecutionCellSpec {
3470        let mut doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3471            "apiVersion": "cellos.io/v1",
3472            "kind": "ExecutionCell",
3473            "spec": {
3474                "id": "vcpu-test",
3475                "authority": { "secretRefs": [] },
3476                "lifetime": { "ttlSeconds": 60 },
3477                "run": { "argv": [] }
3478            }
3479        }))
3480        .unwrap();
3481        let run = doc.spec.run.as_mut().expect("run present");
3482        run.limits = Some(cellos_core::RunLimits {
3483            memory_max_bytes: None,
3484            cpu_max: Some(cellos_core::RunCpuMax {
3485                quota_micros,
3486                period_micros,
3487            }),
3488            graceful_shutdown_seconds: None,
3489        });
3490        doc.spec
3491    }
3492
3493    #[test]
3494    fn derive_vcpu_count_no_limits_returns_default() {
3495        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3496            "apiVersion": "cellos.io/v1",
3497            "kind": "ExecutionCell",
3498            "spec": {
3499                "id": "no-limits",
3500                "authority": { "secretRefs": [] },
3501                "lifetime": { "ttlSeconds": 60 }
3502            }
3503        }))
3504        .unwrap();
3505        assert_eq!(derive_vcpu_count(&doc.spec), DEFAULT_VCPU_COUNT);
3506    }
3507
3508    #[test]
3509    fn derive_vcpu_count_one_full_core() {
3510        // quota=100_000, period=100_000 → 1 vCPU
3511        let spec = make_spec_with_cpu(100_000, Some(100_000));
3512        assert_eq!(derive_vcpu_count(&spec), 1);
3513    }
3514
3515    #[test]
3516    fn derive_vcpu_count_fractional_rounds_up() {
3517        // quota=50_000, period=100_000 → 0.5 core → 1 vCPU (round up)
3518        let spec = make_spec_with_cpu(50_000, Some(100_000));
3519        assert_eq!(derive_vcpu_count(&spec), 1);
3520    }
3521
3522    #[test]
3523    fn derive_vcpu_count_two_cores() {
3524        // quota=200_000, period=100_000 → 2 vCPU
3525        let spec = make_spec_with_cpu(200_000, Some(100_000));
3526        assert_eq!(derive_vcpu_count(&spec), 2);
3527    }
3528
3529    #[test]
3530    fn derive_vcpu_count_clamped_at_32() {
3531        // quota=10_000_000, period=100_000 → 100 vCPU → clamped to 32
3532        let spec = make_spec_with_cpu(10_000_000, Some(100_000));
3533        assert_eq!(derive_vcpu_count(&spec), 32);
3534    }
3535
3536    #[test]
3537    fn derive_vcpu_count_default_period() {
3538        // period=None defaults to 100_000; quota=150_000 → 2 vCPU (rounds up)
3539        let spec = make_spec_with_cpu(150_000, None);
3540        assert_eq!(derive_vcpu_count(&spec), 2);
3541    }
3542
3543    // ── scratch_dir config tests ───────────────────────────────────────────
3544
3545    #[test]
3546    fn config_parses_scratch_dir() {
3547        let cfg = FirecrackerConfig::from_lookup(|key| match key {
3548            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
3549            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
3550            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
3551            "CELLOS_FIRECRACKER_SCRATCH_DIR" => Some("/var/lib/cellos/scratch".into()),
3552            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
3553            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
3554            _ => None,
3555        })
3556        .unwrap();
3557        assert_eq!(
3558            cfg.scratch_dir,
3559            Some(PathBuf::from("/var/lib/cellos/scratch"))
3560        );
3561    }
3562
3563    #[test]
3564    fn config_scratch_dir_absent_when_not_set() {
3565        let cfg = FirecrackerConfig::from_lookup(|key| match key {
3566            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
3567            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
3568            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
3569            // Tests that don't probe manifest behaviour opt out of the
3570            // mandatory-by-default manifest guard so they exercise their
3571            // own subject without tripping the unrelated check.
3572            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
3573            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
3574            _ => None,
3575        })
3576        .unwrap();
3577        assert_eq!(cfg.scratch_dir, None);
3578    }
3579
3580    // ── manifest + verify_artifacts tests ──────────────────────────────────
3581
3582    /// Build a minimal `FirecrackerConfig` for verify_artifacts tests. All
3583    /// path fields point to the supplied tempdir so we can scribble files
3584    /// without hitting the host filesystem.
3585    fn cfg_for_verify(
3586        kernel: Option<PathBuf>,
3587        rootfs: Option<PathBuf>,
3588        manifest: Option<PathBuf>,
3589    ) -> FirecrackerConfig {
3590        FirecrackerConfig {
3591            binary_path: PathBuf::from("/opt/fc/firecracker"),
3592            kernel_image_path: kernel.unwrap_or_else(|| PathBuf::from("/opt/fc/vmlinux")),
3593            rootfs_image_path: rootfs.unwrap_or_else(|| PathBuf::from("/opt/fc/rootfs.ext4")),
3594            jailer_binary_path: None,
3595            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
3596            socket_dir: PathBuf::from("/tmp"),
3597            jailer_uid: 10002,
3598            jailer_gid: 10002,
3599            scratch_dir: None,
3600            manifest_path: manifest,
3601            require_jailer: false,
3602            // Default to dev opt-out so that existing tests that pass
3603            // `manifest=None` continue to verify the "skip" path. Tests that
3604            // exercise the mandatory-by-default behaviour flip this to false
3605            // explicitly.
3606            allow_no_manifest: true,
3607            enable_network: false,
3608            allow_no_vsock: false,
3609            no_vsock_timeout: std::time::Duration::from_secs(5),
3610            no_seccomp: false,
3611        }
3612    }
3613
3614    #[tokio::test]
3615    async fn verify_artifacts_skips_when_no_manifest_and_opt_out() {
3616        // No manifest configured AND allow_no_manifest=true → returns Ok(())
3617        // and does not touch any file.
3618        let cfg = cfg_for_verify(None, None, None);
3619        verify_artifacts(&cfg).await.expect(
3620            "verify_artifacts should succeed when manifest_path is None and allow_no_manifest=true",
3621        );
3622    }
3623
3624    #[tokio::test]
3625    async fn verify_artifacts_errors_when_no_manifest_and_no_opt_out() {
3626        // Mandatory-by-default posture: manifest unset and allow_no_manifest
3627        // unset → hard error from verify_artifacts. The error message must
3628        // also name the second escape-hatch flag because the dev opt-out
3629        // is two-flag (FC-05 / SEAM-23 hardening).
3630        let mut cfg = cfg_for_verify(None, None, None);
3631        cfg.allow_no_manifest = false;
3632        let err = verify_artifacts(&cfg)
3633            .await
3634            .expect_err("verify_artifacts must reject missing manifest by default");
3635        let msg = err.to_string();
3636        assert!(
3637            msg.contains("CELLOS_FIRECRACKER_MANIFEST is not set"),
3638            "expected manifest-missing error, got: {msg}"
3639        );
3640        assert!(
3641            msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST"),
3642            "error must mention the dev opt-out hint, got: {msg}"
3643        );
3644        assert!(
3645            msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY"),
3646            "error must name the paired escape-hatch flag (FC-05 hardening), got: {msg}"
3647        );
3648    }
3649
3650    #[tokio::test]
3651    async fn verify_artifacts_fails_on_wrong_hash() {
3652        use std::io::Write;
3653        let dir = tempfile::tempdir().expect("tmpdir");
3654        let kernel_path = dir.path().join("vmlinux");
3655        let rootfs_path = dir.path().join("rootfs.ext4");
3656        std::fs::File::create(&kernel_path)
3657            .expect("kernel")
3658            .write_all(b"hello kernel")
3659            .expect("write kernel");
3660        std::fs::File::create(&rootfs_path)
3661            .expect("rootfs")
3662            .write_all(b"hello rootfs")
3663            .expect("write rootfs");
3664
3665        // Manifest with a deliberately wrong hash for kernel (all zeroes).
3666        let manifest_path = dir.path().join("manifest.txt");
3667        let manifest = format!(
3668            "# CellOS Firecracker artifact manifest v1\n\
3669             sha256:{wrong_kernel}  kernel  {kernel}\n\
3670             sha256:{wrong_rootfs}  rootfs  {rootfs}\n",
3671            wrong_kernel = "0".repeat(64),
3672            wrong_rootfs = "0".repeat(64),
3673            kernel = kernel_path.display(),
3674            rootfs = rootfs_path.display(),
3675        );
3676        std::fs::write(&manifest_path, manifest).expect("write manifest");
3677
3678        let cfg = cfg_for_verify(
3679            Some(kernel_path.clone()),
3680            Some(rootfs_path),
3681            Some(manifest_path),
3682        );
3683
3684        let err = verify_artifacts(&cfg)
3685            .await
3686            .expect_err("expected digest mismatch error");
3687        let msg = err.to_string();
3688        assert!(
3689            msg.contains("digest mismatch"),
3690            "expected digest mismatch error, got: {msg}"
3691        );
3692        assert!(
3693            msg.contains("kernel"),
3694            "expected the failing role to be reported, got: {msg}"
3695        );
3696    }
3697
3698    #[tokio::test]
3699    async fn verify_artifacts_succeeds_on_correct_hash() {
3700        use std::io::Write;
3701        let dir = tempfile::tempdir().expect("tmpdir");
3702        let kernel_path = dir.path().join("vmlinux");
3703        let rootfs_path = dir.path().join("rootfs.ext4");
3704
3705        let kernel_bytes: &[u8] = b"hello kernel";
3706        let rootfs_bytes: &[u8] = b"hello rootfs";
3707        std::fs::File::create(&kernel_path)
3708            .expect("kernel")
3709            .write_all(kernel_bytes)
3710            .expect("write kernel");
3711        std::fs::File::create(&rootfs_path)
3712            .expect("rootfs")
3713            .write_all(rootfs_bytes)
3714            .expect("write rootfs");
3715
3716        // Compute the real digests via the same helper used in the code path.
3717        let kernel_hex = sha256_file(&kernel_path).expect("hash kernel");
3718        let rootfs_hex = sha256_file(&rootfs_path).expect("hash rootfs");
3719
3720        let manifest_path = dir.path().join("manifest.txt");
3721        let manifest = format!(
3722            "# good manifest\n\
3723             sha256:{kernel_hex}  kernel  {kernel}\n\
3724             sha256:{rootfs_hex}  rootfs  {rootfs}\n",
3725            kernel = kernel_path.display(),
3726            rootfs = rootfs_path.display(),
3727        );
3728        std::fs::write(&manifest_path, manifest).expect("write manifest");
3729
3730        let cfg = cfg_for_verify(Some(kernel_path), Some(rootfs_path), Some(manifest_path));
3731
3732        verify_artifacts(&cfg)
3733            .await
3734            .expect("verify_artifacts should succeed when digests match");
3735    }
3736
3737    /// E1 / FC-01 acceptance gate: when the on-disk SHA256 differs from the
3738    /// stamped manifest, `verify_artifacts` MUST fail closed with a forensic
3739    /// error message that names the role and echoes the manifest-declared
3740    /// digest. This is the hook every E2E test relies on — without it, a
3741    /// silently-mismatched kernel could boot.
3742    ///
3743    /// We can't reach `create()`'s `InstanceStart` path on non-Linux without a
3744    /// real KVM + firecracker binary, so this test pins the digest-mismatch
3745    /// error contract on the same `verify_artifacts` function `create()`
3746    /// invokes (lib.rs `boot_result` block, BEFORE `InstanceStart`). On
3747    /// Linux CI the firecracker-e2e job exercises the full end-to-end path.
3748    #[tokio::test]
3749    async fn verify_artifacts_fc01_digest_mismatch_names_role_and_expected_digest() {
3750        use std::io::Write;
3751        let dir = tempfile::tempdir().expect("tmpdir");
3752        let kernel_path = dir.path().join("vmlinux");
3753        let rootfs_path = dir.path().join("rootfs.ext4");
3754        std::fs::File::create(&kernel_path)
3755            .expect("kernel")
3756            .write_all(b"on-disk kernel bytes")
3757            .expect("write kernel");
3758        std::fs::File::create(&rootfs_path)
3759            .expect("rootfs")
3760            .write_all(b"on-disk rootfs bytes")
3761            .expect("write rootfs");
3762
3763        // Stamp the manifest with a deliberately wrong (but well-formed)
3764        // kernel digest. The rootfs digest is correct, so the failure must
3765        // attribute to the kernel role specifically.
3766        let real_rootfs_hex = sha256_file(&rootfs_path).expect("hash rootfs");
3767        let wrong_kernel_hex = "f".repeat(64);
3768
3769        let manifest_path = dir.path().join("manifest.txt");
3770        let manifest = format!(
3771            "# FC-01 fail-closed fixture\n\
3772             sha256:{wrong_kernel_hex}  kernel  {kernel}\n\
3773             sha256:{real_rootfs_hex}  rootfs  {rootfs}\n",
3774            kernel = kernel_path.display(),
3775            rootfs = rootfs_path.display(),
3776        );
3777        std::fs::write(&manifest_path, manifest).expect("write manifest");
3778
3779        let cfg = cfg_for_verify(
3780            Some(kernel_path.clone()),
3781            Some(rootfs_path),
3782            Some(manifest_path),
3783        );
3784        let err = verify_artifacts(&cfg)
3785            .await
3786            .expect_err("digest mismatch must fail closed");
3787        let msg = err.to_string();
3788
3789        // Forensic contract: error names the role and echoes the
3790        // manifest-declared (expected) digest. Operators reading supervisor
3791        // logs must be able to compare both digests without re-hashing.
3792        assert!(
3793            msg.contains("kernel"),
3794            "error must name the failing role; got: {msg}"
3795        );
3796        assert!(
3797            msg.contains(&wrong_kernel_hex),
3798            "error must echo the manifest-declared (expected) digest; got: {msg}"
3799        );
3800        assert!(
3801            msg.contains("digest mismatch"),
3802            "error must use the canonical `digest mismatch` phrase \
3803             (runbook + log-grep contract); got: {msg}"
3804        );
3805    }
3806
3807    /// E1 / FC-01 + FC-51 wiring gate: on a real digest mismatch,
3808    /// `verify_artifacts` MUST push a `cell.lifecycle.v1.manifest_failed`
3809    /// CloudEvent onto the process-wide pending buffer that the supervisor
3810    /// drains via `drain_pending_manifest_failed_events`. The
3811    /// `fc51_manifest_failed_emission.rs` integration test covers the
3812    /// test-seam (`push_manifest_failed_pending_for_test`) but NOT the live
3813    /// emission path inside `verify_artifacts`. Without this test the wiring
3814    /// is theoretical — a refactor that drops the
3815    /// `push_manifest_failed_pending(role, …)` call inside the mismatch
3816    /// branch would still pass the error-string assertions above while
3817    /// silently regressing the CloudEvent contract supervisor.rs depends
3818    /// on for FC-09 (visible-failure-on-mismatch).
3819    #[tokio::test]
3820    async fn verify_artifacts_fc01_digest_mismatch_emits_manifest_failed_event() {
3821        use std::io::Write;
3822        // Drain any events left behind by a previously-running test in the
3823        // same process. The pending buffer is process-wide (OnceLock<Mutex<…>>)
3824        // and cargo runs the module tests on a shared runtime, so we cannot
3825        // assume the buffer is empty at entry.
3826        let _pre = drain_pending_manifest_failed_events();
3827
3828        let dir = tempfile::tempdir().expect("tmpdir");
3829        let kernel_path = dir.path().join("vmlinux");
3830        let rootfs_path = dir.path().join("rootfs.ext4");
3831        std::fs::File::create(&kernel_path)
3832            .expect("kernel")
3833            .write_all(b"on-disk kernel bytes")
3834            .expect("write kernel");
3835        std::fs::File::create(&rootfs_path)
3836            .expect("rootfs")
3837            .write_all(b"on-disk rootfs bytes")
3838            .expect("write rootfs");
3839
3840        let real_rootfs_hex = sha256_file(&rootfs_path).expect("hash rootfs");
3841        let wrong_kernel_hex = "e".repeat(64);
3842
3843        let manifest_path = dir.path().join("manifest.txt");
3844        let manifest = format!(
3845            "# FC-51 emission fixture\n\
3846             sha256:{wrong_kernel_hex}  kernel  {kernel}\n\
3847             sha256:{real_rootfs_hex}  rootfs  {rootfs}\n",
3848            kernel = kernel_path.display(),
3849            rootfs = rootfs_path.display(),
3850        );
3851        std::fs::write(&manifest_path, &manifest).expect("write manifest");
3852
3853        let cfg = cfg_for_verify(
3854            Some(kernel_path),
3855            Some(rootfs_path),
3856            Some(manifest_path.clone()),
3857        );
3858
3859        // Must fail closed BEFORE any boot action could be taken.
3860        let _err = verify_artifacts(&cfg)
3861            .await
3862            .expect_err("digest mismatch must fail closed");
3863
3864        // …AND the live emission path must have pushed the event the
3865        // supervisor will drain for FC-09 (`cell.lifecycle.v1.manifest_failed`).
3866        let drained = drain_pending_manifest_failed_events();
3867        assert!(
3868            !drained.is_empty(),
3869            "verify_artifacts must emit a manifest_failed CloudEvent on \
3870             digest mismatch (FC-51 wiring); pending buffer was empty"
3871        );
3872        let ev = drained
3873            .iter()
3874            .find(|e| e.ty == cellos_core::LIFECYCLE_MANIFEST_FAILED_TYPE)
3875            .expect("at least one event must use LIFECYCLE_MANIFEST_FAILED_TYPE");
3876        assert_eq!(ev.source, "cellos-host-firecracker");
3877        let data_str = ev
3878            .data
3879            .as_ref()
3880            .and_then(|d| serde_json::to_string(d).ok())
3881            .unwrap_or_default();
3882        assert!(
3883            data_str.contains("kernel"),
3884            "event data must name the failing role; got: {data_str}"
3885        );
3886        assert!(
3887            data_str.contains(&wrong_kernel_hex),
3888            "event data must echo the manifest-declared (expected) digest; got: {data_str}"
3889        );
3890    }
3891
3892    /// E1 / FC-01: when no kernel role is declared in the manifest, the
3893    /// verification step fails closed with an explicit error. A manifest that
3894    /// silently skips the kernel role is just as dangerous as one with the
3895    /// wrong digest — both leave the host booting an unverified kernel image.
3896    #[tokio::test]
3897    async fn verify_artifacts_fc01_missing_kernel_role_fails_closed() {
3898        use std::io::Write;
3899        let dir = tempfile::tempdir().expect("tmpdir");
3900        let rootfs_path = dir.path().join("rootfs.ext4");
3901        std::fs::File::create(&rootfs_path)
3902            .expect("rootfs")
3903            .write_all(b"rootfs bytes")
3904            .expect("write rootfs");
3905        let real_rootfs_hex = sha256_file(&rootfs_path).expect("hash rootfs");
3906
3907        let manifest_path = dir.path().join("manifest.txt");
3908        let manifest = format!(
3909            "# missing kernel role\n\
3910             sha256:{real_rootfs_hex}  rootfs  {rootfs}\n",
3911            rootfs = rootfs_path.display(),
3912        );
3913        std::fs::write(&manifest_path, manifest).expect("write manifest");
3914
3915        let cfg = cfg_for_verify(None, Some(rootfs_path), Some(manifest_path));
3916        let err = verify_artifacts(&cfg)
3917            .await
3918            .expect_err("manifest without `kernel` role must fail closed");
3919        assert!(
3920            err.to_string().contains("kernel"),
3921            "error must name the missing role; got: {err}"
3922        );
3923    }
3924
3925    #[test]
3926    fn parse_manifest_rejects_short_digest() {
3927        let text = "sha256:deadbeef  kernel  /opt/fc/vmlinux\n";
3928        let err = parse_manifest(text).expect_err("expected parse error");
3929        assert!(err.to_string().contains("64 hex chars"), "got: {err}");
3930    }
3931
3932    #[test]
3933    fn parse_manifest_rejects_missing_prefix() {
3934        let text = "deadbeef  kernel  /opt/fc/vmlinux\n";
3935        let err = parse_manifest(text).expect_err("expected parse error");
3936        assert!(err.to_string().contains("sha256:"), "got: {err}");
3937    }
3938
3939    #[test]
3940    fn parse_manifest_skips_comments_and_blanks() {
3941        let hex = "a".repeat(64);
3942        let text = format!(
3943            "# header\n\
3944             \n\
3945             sha256:{hex}  kernel  /opt/fc/vmlinux\n\
3946             \n\
3947             # trailing comment\n"
3948        );
3949        let entries = parse_manifest(&text).expect("parse");
3950        assert_eq!(entries.len(), 1);
3951        assert_eq!(entries[0].role, "kernel");
3952        assert_eq!(entries[0].sha256_hex, hex);
3953        assert_eq!(entries[0].path, PathBuf::from("/opt/fc/vmlinux"));
3954    }
3955
3956    // ── jailer enforcement tests ───────────────────────────────────────────
3957
3958    #[tokio::test]
3959    async fn create_fails_when_jailer_required_but_not_configured() {
3960        let backend = FirecrackerCellBackend::new(FirecrackerConfig {
3961            binary_path: PathBuf::from("/nonexistent/firecracker"),
3962            kernel_image_path: PathBuf::from("/opt/firecracker/vmlinux.bin"),
3963            rootfs_image_path: PathBuf::from("/opt/firecracker/rootfs.ext4"),
3964            jailer_binary_path: None,
3965            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
3966            socket_dir: PathBuf::from("/tmp"),
3967            jailer_uid: 10002,
3968            jailer_gid: 10002,
3969            scratch_dir: None,
3970            manifest_path: None,
3971            require_jailer: true,
3972            // Tests for the jailer guard must short-circuit BEFORE the manifest
3973            // guard runs, so opt out of manifest enforcement here.
3974            allow_no_manifest: true,
3975            enable_network: false,
3976            allow_no_vsock: false,
3977            no_vsock_timeout: std::time::Duration::from_secs(5),
3978            no_seccomp: false,
3979        });
3980        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
3981            "apiVersion": "cellos.io/v1",
3982            "kind": "ExecutionCell",
3983            "spec": {
3984                "id": "jailer-required-test",
3985                "authority": { "secretRefs": [] },
3986                "lifetime": { "ttlSeconds": 60 }
3987            }
3988        }))
3989        .unwrap();
3990
3991        let err = backend.create(&doc).await.unwrap_err();
3992        let msg = err.to_string();
3993        assert!(
3994            msg.contains("jailer is required"),
3995            "expected jailer-required error, got: {msg}"
3996        );
3997        // We must fail BEFORE attempting to spawn — confirm no spawn-error keywords leak.
3998        assert!(
3999            !msg.contains("spawn"),
4000            "create() should reject before spawning, got: {msg}"
4001        );
4002    }
4003
4004    #[tokio::test]
4005    async fn create_allows_no_jailer_when_opt_out() {
4006        // require_jailer=false → create() proceeds past the jailer guard and
4007        // hits the spawn path, which then fails because /nonexistent does not
4008        // exist. The point is to prove we got past the new guard.
4009        let backend = FirecrackerCellBackend::new(FirecrackerConfig {
4010            binary_path: PathBuf::from("/nonexistent/firecracker"),
4011            kernel_image_path: PathBuf::from("/opt/firecracker/vmlinux.bin"),
4012            rootfs_image_path: PathBuf::from("/opt/firecracker/rootfs.ext4"),
4013            jailer_binary_path: None,
4014            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
4015            socket_dir: PathBuf::from("/tmp"),
4016            jailer_uid: 10002,
4017            jailer_gid: 10002,
4018            scratch_dir: None,
4019            manifest_path: None,
4020            require_jailer: false,
4021            // Test exercises the jailer opt-out path; opt out of manifest
4022            // enforcement too so we still hit the spawn path.
4023            allow_no_manifest: true,
4024            enable_network: false,
4025            allow_no_vsock: false,
4026            no_vsock_timeout: std::time::Duration::from_secs(5),
4027            no_seccomp: false,
4028        });
4029        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
4030            "apiVersion": "cellos.io/v1",
4031            "kind": "ExecutionCell",
4032            "spec": {
4033                "id": "jailer-optout-test",
4034                "authority": { "secretRefs": [] },
4035                "lifetime": { "ttlSeconds": 60 }
4036            }
4037        }))
4038        .unwrap();
4039
4040        let err = backend.create(&doc).await.unwrap_err();
4041        let msg = err.to_string();
4042        assert!(
4043            !msg.contains("jailer is required"),
4044            "should have bypassed jailer guard with require_jailer=false, got: {msg}"
4045        );
4046    }
4047
4048    /// Doc contract: when `enable_network=false`, the Firecracker backend must
4049    /// surface `nft_rules_applied = Some(false)` on the returned [`CellHandle`]
4050    /// so the supervisor emits a `network_enforcement` CloudEvent (with
4051    /// `applied=false`) on the in-VM exit path. Without this signal the event
4052    /// is silently dropped — the observability gap fixed by this commit.
4053    ///
4054    /// We can't reach the `Ok(CellHandle ..)` branch from a unit test on
4055    /// macOS (it requires KVM + a real firecracker binary), so this test
4056    /// pins the construction expression directly. If the production code
4057    /// stops using `Some(tap_iface.is_some())` this test must be updated in
4058    /// lockstep — that is the point.
4059    #[test]
4060    fn cell_handle_nft_signal_when_network_disabled() {
4061        // Mirrors the production expression in `create()` for the
4062        // `enable_network=false` path, where `tap_iface` is `None`.
4063        let tap_iface: Option<String> = None;
4064        let handle = CellHandle {
4065            cell_id: "doc-contract".to_string(),
4066            cgroup_path: None,
4067            nft_rules_applied: Some(tap_iface.is_some()),
4068            kernel_digest_sha256: None,
4069            rootfs_digest_sha256: None,
4070            firecracker_digest_sha256: None,
4071        };
4072        assert_eq!(
4073            handle.nft_rules_applied,
4074            Some(false),
4075            "enable_network=false must surface Some(false) for parity with the \
4076             host-subprocess path so network_enforcement is still observable"
4077        );
4078    }
4079
4080    /// Doc contract: when `enable_network=true` and TAP+nftables were
4081    /// provisioned, `nft_rules_applied` must be `Some(true)`.
4082    #[test]
4083    fn cell_handle_nft_signal_when_network_enabled_and_tap_provisioned() {
4084        let tap_iface: Option<String> = Some("tap-doc-contract".to_string());
4085        let handle = CellHandle {
4086            cell_id: "doc-contract".to_string(),
4087            cgroup_path: None,
4088            nft_rules_applied: Some(tap_iface.is_some()),
4089            kernel_digest_sha256: None,
4090            rootfs_digest_sha256: None,
4091            firecracker_digest_sha256: None,
4092        };
4093        assert_eq!(handle.nft_rules_applied, Some(true));
4094    }
4095
4096    #[test]
4097    fn config_require_jailer_defaults_to_true() {
4098        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4099            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4100            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4101            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4102            // Tests that don't probe manifest behaviour opt out of the
4103            // mandatory-by-default manifest guard so they exercise their
4104            // own subject without tripping the unrelated check.
4105            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4106            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4107            _ => None,
4108        })
4109        .unwrap();
4110        assert!(cfg.require_jailer, "require_jailer must default to true");
4111    }
4112
4113    #[test]
4114    fn config_require_jailer_flips_off_when_allow_no_jailer() {
4115        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4116            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4117            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4118            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4119            "CELLOS_FIRECRACKER_ALLOW_NO_JAILER" => Some("1".into()),
4120            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4121            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4122            _ => None,
4123        })
4124        .unwrap();
4125        assert!(
4126            !cfg.require_jailer,
4127            "ALLOW_NO_JAILER=1 must flip require_jailer to false"
4128        );
4129    }
4130
4131    #[test]
4132    fn config_manifest_path_parsed_when_set() {
4133        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4134            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4135            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4136            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4137            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4138            _ => None,
4139        })
4140        .unwrap();
4141        assert_eq!(
4142            cfg.manifest_path,
4143            Some(PathBuf::from("/etc/cellos/manifest.txt"))
4144        );
4145    }
4146
4147    #[test]
4148    fn config_manifest_path_absent_when_not_set() {
4149        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4150            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4151            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4152            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4153            // Tests that don't probe manifest behaviour opt out of the
4154            // mandatory-by-default manifest guard so they exercise their
4155            // own subject without tripping the unrelated check.
4156            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4157            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4158            _ => None,
4159        })
4160        .unwrap();
4161        assert_eq!(cfg.manifest_path, None);
4162    }
4163
4164    // ── manifest mandatory-by-default guard (SEAM-23) ───────────────────────
4165    //
4166    // These four tests cover every (manifest, allow_no_manifest) combination
4167    // the operator can produce via env vars:
4168    //
4169    //   manifest set,   opt-out unset → ok (production posture)
4170    //   manifest unset, opt-out unset → hard error (mandatory by default)
4171    //   manifest unset, opt-out=1     → ok with WARN (development)
4172    //   manifest set,   opt-out=1     → hard error (inconsistent config)
4173
4174    // ── ALLOW_NO_VSOCK opt-out (catches "kernel built without vsock"
4175    //    silently hanging the supervisor instead of failing fast) ────────────
4176
4177    #[test]
4178    fn config_allow_no_vsock_default_off() {
4179        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4180            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4181            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4182            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4183            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4184            _ => None,
4185        })
4186        .expect("base config must build");
4187        assert!(
4188            !cfg.allow_no_vsock,
4189            "allow_no_vsock must default to false (production posture: wait \
4190             for authenticated in-VM exit code, no timeout)"
4191        );
4192        // Default timeout is honoured even when opt-out is off; only consulted
4193        // when allow_no_vsock=true.
4194        assert_eq!(cfg.no_vsock_timeout, std::time::Duration::from_secs(5));
4195    }
4196
4197    #[test]
4198    fn config_allow_no_vsock_set_with_default_timeout() {
4199        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4200            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4201            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4202            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4203            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4204            "CELLOS_FIRECRACKER_ALLOW_NO_VSOCK" => Some("1".into()),
4205            _ => None,
4206        })
4207        .expect("opt-out must build");
4208        assert!(cfg.allow_no_vsock);
4209        assert_eq!(cfg.no_vsock_timeout, std::time::Duration::from_secs(5));
4210    }
4211
4212    #[test]
4213    fn config_allow_no_vsock_with_custom_timeout() {
4214        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4215            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4216            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4217            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4218            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4219            "CELLOS_FIRECRACKER_ALLOW_NO_VSOCK" => Some("1".into()),
4220            "CELLOS_FIRECRACKER_NO_VSOCK_TIMEOUT_SECS" => Some("30".into()),
4221            _ => None,
4222        })
4223        .expect("custom timeout must build");
4224        assert_eq!(cfg.no_vsock_timeout, std::time::Duration::from_secs(30));
4225    }
4226
4227    #[test]
4228    fn config_no_vsock_timeout_falls_back_on_garbage() {
4229        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4230            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4231            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4232            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4233            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4234            "CELLOS_FIRECRACKER_NO_VSOCK_TIMEOUT_SECS" => Some("not-a-number".into()),
4235            _ => None,
4236        })
4237        .expect("garbage timeout falls back, doesn't error");
4238        assert_eq!(cfg.no_vsock_timeout, std::time::Duration::from_secs(5));
4239    }
4240
4241    #[test]
4242    fn config_manifest_set_opt_out_unset_is_ok() {
4243        // Production posture: manifest pinned, no opt-out → from_lookup succeeds.
4244        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4245            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4246            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4247            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4248            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4249            _ => None,
4250        })
4251        .expect("manifest set + opt-out unset must succeed");
4252        assert_eq!(
4253            cfg.manifest_path,
4254            Some(PathBuf::from("/etc/cellos/manifest.txt"))
4255        );
4256        assert!(
4257            !cfg.allow_no_manifest,
4258            "allow_no_manifest must default to false"
4259        );
4260    }
4261
4262    #[test]
4263    fn config_manifest_unset_opt_out_unset_is_error() {
4264        // Mandatory-by-default: missing manifest with no opt-out is a hard
4265        // error from from_lookup. The error message must mention both env
4266        // var names so the operator knows how to fix it. The error must
4267        // also name the second escape-hatch flag because the dev opt-out
4268        // is two-flag (FC-05 / SEAM-23 hardening).
4269        let err = FirecrackerConfig::from_lookup(|key| match key {
4270            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4271            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4272            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4273            _ => None,
4274        })
4275        .expect_err("missing manifest must be rejected by default");
4276        let msg = err.to_string();
4277        assert!(
4278            msg.contains("CELLOS_FIRECRACKER_MANIFEST is not set"),
4279            "expected manifest-missing error, got: {msg}"
4280        );
4281        assert!(
4282            msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST"),
4283            "error must mention dev opt-out hint, got: {msg}"
4284        );
4285        assert!(
4286            msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY"),
4287            "error must mention the paired second escape-hatch flag, got: {msg}"
4288        );
4289        // Mirror the jailer guard: error is prefixed with `firecracker init:`
4290        // so it's easy to distinguish from runtime errors in supervisor logs.
4291        assert!(
4292            msg.contains("firecracker init"),
4293            "error must include `firecracker init` prefix, got: {msg}"
4294        );
4295    }
4296
4297    #[test]
4298    fn config_manifest_unset_opt_out_set_is_ok() {
4299        // Development opt-out: missing manifest is allowed when the operator
4300        // has explicitly opted out via CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1.
4301        // (The WARN log is emitted as a side effect; we don't assert on it
4302        // here, but the behaviour mirrors CELLOS_FIRECRACKER_ALLOW_NO_JAILER.)
4303        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4304            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4305            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4306            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4307            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4308            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4309            _ => None,
4310        })
4311        .expect("manifest unset + opt-out set must succeed (dev mode)");
4312        assert_eq!(cfg.manifest_path, None);
4313        assert!(
4314            cfg.allow_no_manifest,
4315            "ALLOW_NO_MANIFEST=1 must flip allow_no_manifest to true"
4316        );
4317    }
4318
4319    #[test]
4320    fn config_manifest_set_opt_out_set_is_inconsistent_error() {
4321        // Inconsistent config: both manifest and opt-out set. Almost certainly
4322        // an operator mistake (e.g. dev env file shipped to production), so
4323        // we fail closed with a clear error rather than silently picking one.
4324        let err = FirecrackerConfig::from_lookup(|key| match key {
4325            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4326            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4327            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4328            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4329            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4330            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4331            _ => None,
4332        })
4333        .expect_err("conflicting manifest + opt-out config must be rejected");
4334        let msg = err.to_string();
4335        assert!(
4336            msg.contains("mutually exclusive"),
4337            "error must explain conflict, got: {msg}"
4338        );
4339        assert!(
4340            msg.contains("CELLOS_FIRECRACKER_MANIFEST")
4341                && msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST"),
4342            "error must name both env vars, got: {msg}"
4343        );
4344        assert!(
4345            msg.contains("firecracker init"),
4346            "error must include `firecracker init` prefix, got: {msg}"
4347        );
4348    }
4349
4350    // ── two-flag escape-hatch handshake (FC-05 hardening / E2 brief) ───────
4351    //
4352    // The `_REALLY` flag exists because a single env var can leak from a dev
4353    // `.env` to a production rollout. Requiring both flags forces the
4354    // operator to make the trade-off on the same line, on purpose.
4355
4356    #[test]
4357    fn config_first_flag_alone_without_second_is_error() {
4358        // Operator sets `CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST=1` but forgot
4359        // (or is testing a leaked env var) — verification stays on, and
4360        // from_lookup MUST surface an explicit, named error.
4361        let err = FirecrackerConfig::from_lookup(|key| match key {
4362            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4363            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4364            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4365            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4366            // Note: NOT setting `_REALLY`. This is the escape-hatch leak.
4367            _ => None,
4368        })
4369        .expect_err("first flag alone must NOT be accepted");
4370        let msg = err.to_string();
4371        assert!(
4372            msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY"),
4373            "error must name the paired flag, got: {msg}"
4374        );
4375        assert!(
4376            msg.contains("firecracker init"),
4377            "error must include `firecracker init` prefix, got: {msg}"
4378        );
4379    }
4380
4381    #[test]
4382    fn config_second_flag_alone_without_first_is_error() {
4383        // Symmetric guard: setting `_REALLY=1` without the primary flag is
4384        // also rejected. This catches a different operator mistake (operator
4385        // remembered the second flag from a runbook but the first flag was
4386        // unset by an automation tool, or someone pre-set `_REALLY` thinking
4387        // it would help).
4388        let err = FirecrackerConfig::from_lookup(|key| match key {
4389            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4390            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4391            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4392            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4393            // Note: NOT setting the primary flag.
4394            _ => None,
4395        })
4396        .expect_err("second flag alone must NOT be accepted");
4397        let msg = err.to_string();
4398        assert!(
4399            msg.contains("two-flag"),
4400            "error must explain the two-flag handshake, got: {msg}"
4401        );
4402        assert!(
4403            msg.contains("CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST"),
4404            "error must name the primary flag, got: {msg}"
4405        );
4406        assert!(
4407            msg.contains("firecracker init"),
4408            "error must include `firecracker init` prefix, got: {msg}"
4409        );
4410    }
4411
4412    #[test]
4413    fn config_both_flags_set_flips_allow_no_manifest_to_true() {
4414        // The intended dev-mode posture: BOTH flags, no manifest path. This
4415        // is the only way to disable digest verification.
4416        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4417            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4418            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4419            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4420            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4421            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4422            _ => None,
4423        })
4424        .expect("both flags set + no manifest must succeed (dev mode)");
4425        assert!(
4426            cfg.allow_no_manifest,
4427            "both flags set must flip allow_no_manifest to true"
4428        );
4429    }
4430
4431    #[test]
4432    fn config_neither_flag_with_manifest_is_production_posture() {
4433        // Default production posture: manifest pinned, neither escape-hatch
4434        // flag set. allow_no_manifest must remain false.
4435        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4436            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4437            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4438            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4439            "CELLOS_FIRECRACKER_MANIFEST" => Some("/etc/cellos/manifest.txt".into()),
4440            _ => None,
4441        })
4442        .expect("manifest set + neither opt-out flag must succeed");
4443        assert!(
4444            !cfg.allow_no_manifest,
4445            "production posture must keep allow_no_manifest=false"
4446        );
4447    }
4448
4449    // ── network enforcement (TAP + nftables) ───────────────────────────────
4450
4451    /// `IFNAMSIZ` on Linux is 16 (15 usable bytes + NUL).  The TAP name is
4452    /// `cfc-` (4) + 8-char slug = 12 bytes — well under the limit.  This test
4453    /// guards the invariant against future prefix changes.
4454    #[test]
4455    fn tap_name_stays_within_ifnamsiz() {
4456        let short = cell_id_short("0123456789abcdef-extra-tail-noise");
4457        assert_eq!(short.len(), 8, "slug must be exactly 8 chars");
4458        let name = tap_name_for(&short);
4459        assert!(
4460            name.len() <= 15,
4461            "TAP name {name:?} exceeds IFNAMSIZ (15): len={}",
4462            name.len()
4463        );
4464        assert!(name.starts_with("cfc-"));
4465    }
4466
4467    /// `cell_id_short` must produce an 8-char hex slug and must NOT collide
4468    /// when two cell ids share the same alphanumeric prefix (the old zero-pad
4469    /// scheme made `"a"` and `"a0000000"` identical — this guards against that
4470    /// regression).
4471    #[test]
4472    fn cell_id_short_no_collision_on_short_input() {
4473        let s = cell_id_short("ab");
4474        assert_eq!(s, "fb8e20fc");
4475        assert_eq!(s.len(), 8);
4476        // Demonstrate no collision: "ab" and "ab000000" must differ.
4477        assert_ne!(s, cell_id_short("ab000000"));
4478    }
4479
4480    /// `cell_id_short` must produce a stable 8-char hex slug even when the
4481    /// cell id contains non-alphanumeric characters (dashes, slashes, etc.).
4482    #[test]
4483    fn cell_id_short_stable_for_non_alphanumeric_input() {
4484        let s = cell_id_short("MY-Cell/01!@#");
4485        assert_eq!(s, "485deb36");
4486        assert_eq!(s.len(), 8);
4487        assert!(s.chars().all(|c| c.is_ascii_hexdigit()));
4488    }
4489
4490    /// Refuse-to-start guard: a spec that declares any egress rule MUST be
4491    /// rejected when the backend was configured with `enable_network: false`.
4492    /// This prevents a fail-open mode where the spec believes its network is
4493    /// locked down but the host backend silently ignores the rules.
4494    #[tokio::test]
4495    async fn create_fails_when_network_disabled_but_egress_declared() {
4496        let backend = FirecrackerCellBackend::new(FirecrackerConfig {
4497            binary_path: PathBuf::from("/nonexistent/firecracker"),
4498            kernel_image_path: PathBuf::from("/opt/firecracker/vmlinux.bin"),
4499            rootfs_image_path: PathBuf::from("/opt/firecracker/rootfs.ext4"),
4500            jailer_binary_path: None,
4501            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
4502            socket_dir: PathBuf::from("/tmp"),
4503            jailer_uid: 10002,
4504            jailer_gid: 10002,
4505            scratch_dir: None,
4506            manifest_path: None,
4507            require_jailer: false,
4508            // Test asserts the egress guard fires before any other check; opt
4509            // out of manifest enforcement so we don't trip that one first.
4510            allow_no_manifest: true,
4511            enable_network: false,
4512            allow_no_vsock: false,
4513            no_vsock_timeout: std::time::Duration::from_secs(5),
4514            no_seccomp: false,
4515        });
4516        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
4517            "apiVersion": "cellos.io/v1",
4518            "kind": "ExecutionCell",
4519            "spec": {
4520                "id": "egress-disabled-test",
4521                "authority": {
4522                    "secretRefs": [],
4523                    "egressRules": [
4524                        { "host": "api.example.com", "port": 443, "protocol": "https" }
4525                    ]
4526                },
4527                "lifetime": { "ttlSeconds": 60 }
4528            }
4529        }))
4530        .unwrap();
4531
4532        let err = backend.create(&doc).await.expect_err("create must fail");
4533        let msg = err.to_string();
4534        assert!(
4535            msg.contains("egress_rules"),
4536            "error must mention egress_rules; got: {msg}"
4537        );
4538        // And critically: we must have failed BEFORE any spawn attempt, so
4539        // the message must NOT mention the missing firecracker binary.
4540        assert!(
4541            !msg.contains("/nonexistent/firecracker"),
4542            "guard must short-circuit before spawn; got: {msg}"
4543        );
4544    }
4545
4546    /// The nftables ruleset formatter is a pure function — exercise it on a
4547    /// known set of rules and assert the structurally important strings are
4548    /// present.  This test deliberately avoids invoking `nft` so it runs in
4549    /// CI on macOS as well as Linux.
4550    #[test]
4551    fn nftables_ruleset_format() {
4552        let rules = vec![
4553            EgressRule {
4554                host: "10.0.0.1".into(),
4555                port: 443,
4556                protocol: Some("https".into()),
4557                dns_egress_justification: None,
4558            },
4559            EgressRule {
4560                host: "192.168.5.5".into(),
4561                port: 53,
4562                protocol: Some("dns-acknowledged".into()),
4563                dns_egress_justification: Some("operator-approved DNS".into()),
4564            },
4565            EgressRule {
4566                host: "203.0.113.7".into(),
4567                port: 22,
4568                protocol: Some("tcp".into()),
4569                dns_egress_justification: None,
4570            },
4571        ];
4572        let ruleset = build_nftables_ruleset("abcd1234", "cfc-abcd1234", &rules);
4573
4574        // Table is per-cell.
4575        assert!(
4576            ruleset.contains("table ip cellos-abcd1234"),
4577            "missing per-cell table; got:\n{ruleset}"
4578        );
4579        // Default-DROP forward chain hooked at filter priority.
4580        assert!(
4581            ruleset.contains("type filter hook forward priority filter; policy drop;"),
4582            "missing default-drop policy; got:\n{ruleset}"
4583        );
4584        // Established/related must always be allowed for return traffic.
4585        assert!(
4586            ruleset.contains("ct state established,related accept"),
4587            "missing conntrack accept; got:\n{ruleset}"
4588        );
4589        // Each declared destination produces an explicit accept rule on the
4590        // cell's TAP interface, with the correct L4 protocol mapping.
4591        assert!(
4592            ruleset.contains("iifname \"cfc-abcd1234\" ip daddr 10.0.0.1 tcp dport 443 accept"),
4593            "missing https accept; got:\n{ruleset}"
4594        );
4595        assert!(
4596            ruleset.contains("iifname \"cfc-abcd1234\" ip daddr 192.168.5.5 udp dport 53 accept"),
4597            "missing dns-acknowledged accept (must map to udp); got:\n{ruleset}"
4598        );
4599        assert!(
4600            ruleset.contains("iifname \"cfc-abcd1234\" ip daddr 203.0.113.7 tcp dport 22 accept"),
4601            "missing tcp accept; got:\n{ruleset}"
4602        );
4603        // Trailing explicit drop on the TAP — defence-in-depth alongside the
4604        // chain-level policy drop in case future edits change the chain
4605        // policy without updating the rules.
4606        assert!(
4607            ruleset.contains("iifname \"cfc-abcd1234\" drop"),
4608            "missing per-iface drop; got:\n{ruleset}"
4609        );
4610    }
4611
4612    /// Hostnames that aren't IP literals must be emitted as comments — not as
4613    /// `accept` rules — because `nft` does not perform name resolution.  The
4614    /// caller (`apply_network_policy`) is responsible for pre-resolving;
4615    /// the formatter must be safe when given a non-IP host as a defensive
4616    /// fallback.
4617    #[test]
4618    fn nftables_ruleset_emits_comment_for_unresolved_hostname() {
4619        let rules = vec![EgressRule {
4620            host: "api.example.com".into(),
4621            port: 443,
4622            protocol: Some("https".into()),
4623            dns_egress_justification: None,
4624        }];
4625        let ruleset = build_nftables_ruleset("xyz12345", "cfc-xyz12345", &rules);
4626        assert!(
4627            ruleset.contains("# unresolved host \"api.example.com\""),
4628            "missing unresolved-host comment; got:\n{ruleset}"
4629        );
4630        assert!(
4631            !ruleset.contains("daddr api.example.com"),
4632            "must not emit hostname as IP literal; got:\n{ruleset}"
4633        );
4634    }
4635
4636    /// `enable_network` defaults to the platform default when no env var is
4637    /// set: true on Linux, false elsewhere.
4638    #[test]
4639    fn config_enable_network_default_matches_platform() {
4640        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4641            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4642            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4643            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4644            // Tests that don't probe manifest behaviour opt out of the
4645            // mandatory-by-default manifest guard so they exercise their
4646            // own subject without tripping the unrelated check.
4647            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4648            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4649            _ => None,
4650        })
4651        .unwrap();
4652        assert_eq!(cfg.enable_network, NETWORK_DEFAULT_ENABLED);
4653    }
4654
4655    /// `CELLOS_FIRECRACKER_ENABLE_NETWORK=0` forces network off regardless of
4656    /// platform default.
4657    #[test]
4658    fn config_enable_network_env_off_overrides_default() {
4659        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4660            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4661            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4662            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4663            "CELLOS_FIRECRACKER_ENABLE_NETWORK" => Some("0".into()),
4664            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4665            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4666            _ => None,
4667        })
4668        .unwrap();
4669        assert!(!cfg.enable_network);
4670    }
4671
4672    /// `CELLOS_FIRECRACKER_ENABLE_NETWORK=1` forces network on regardless of
4673    /// platform default.
4674    #[test]
4675    fn config_enable_network_env_on_overrides_default() {
4676        let cfg = FirecrackerConfig::from_lookup(|key| match key {
4677            "CELLOS_FIRECRACKER_BINARY" => Some("/opt/fc/firecracker".into()),
4678            "CELLOS_FIRECRACKER_KERNEL_IMAGE" => Some("/opt/fc/vmlinux".into()),
4679            "CELLOS_FIRECRACKER_ROOTFS_IMAGE" => Some("/opt/fc/rootfs.ext4".into()),
4680            "CELLOS_FIRECRACKER_ENABLE_NETWORK" => Some("true".into()),
4681            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST" => Some("1".into()),
4682            "CELLOS_FIRECRACKER_ALLOW_NO_MANIFEST_REALLY" => Some("1".into()),
4683            _ => None,
4684        })
4685        .unwrap();
4686        assert!(cfg.enable_network);
4687    }
4688
4689    // ── L2-06-1 — verify_rootfs_digest tests ───────────────────────────────
4690
4691    /// A rootfs file whose on-disk SHA256 matches the declared digest must
4692    /// be accepted. Tests both the bare-hex and `sha256:`-prefixed forms.
4693    #[test]
4694    fn verify_rootfs_digest_accepts_matching_hash() {
4695        use std::io::Write;
4696        let dir = tempfile::tempdir().expect("tmpdir");
4697        let path = dir.path().join("rootfs.ext4");
4698        let bytes: &[u8] = b"deterministic rootfs bytes for L2-06-1 test";
4699        std::fs::File::create(&path)
4700            .expect("create rootfs")
4701            .write_all(bytes)
4702            .expect("write rootfs");
4703        let expected = sha256_file(&path).expect("hash rootfs");
4704
4705        // Bare hex form (no prefix) → ok.
4706        verify_rootfs_digest(&path, &expected).expect("bare-hex digest must verify");
4707        // Prefixed `sha256:` form → ok (matches the EnvironmentSpec contract).
4708        let prefixed = format!("sha256:{expected}");
4709        verify_rootfs_digest(&path, &prefixed).expect("sha256:-prefixed digest must verify");
4710        // Mixed-case must round-trip after lowercasing.
4711        let upper = expected.to_ascii_uppercase();
4712        verify_rootfs_digest(&path, &upper)
4713            .expect("uppercase digest must verify (case-insensitive)");
4714    }
4715
4716    /// A rootfs file whose on-disk SHA256 does NOT match the declared digest
4717    /// must be rejected with a forensic error that names the path and both
4718    /// digests so an operator can diff them against the spec.
4719    #[test]
4720    fn verify_rootfs_digest_rejects_mismatched_hash() {
4721        use std::io::Write;
4722        let dir = tempfile::tempdir().expect("tmpdir");
4723        let path = dir.path().join("rootfs.ext4");
4724        std::fs::File::create(&path)
4725            .expect("create rootfs")
4726            .write_all(b"the on-disk bytes")
4727            .expect("write rootfs");
4728        // A well-formed but deliberately wrong digest.
4729        let wrong = "f".repeat(64);
4730
4731        let err =
4732            verify_rootfs_digest(&path, &wrong).expect_err("digest mismatch must fail closed");
4733        let msg = err.to_string();
4734        assert!(
4735            msg.contains("rootfs digest mismatch"),
4736            "error must use canonical phrasing for log-grep contract; got: {msg}"
4737        );
4738        assert!(
4739            msg.contains("L2-06-1"),
4740            "error must carry the L2-06-1 audit tag; got: {msg}"
4741        );
4742        assert!(
4743            msg.contains(&wrong),
4744            "error must echo the declared (expected) digest; got: {msg}"
4745        );
4746    }
4747
4748    /// A malformed expected digest (wrong length, non-hex chars) is rejected
4749    /// without touching the rootfs file. This is the input-validation guard
4750    /// at the API boundary — without it a typo'd spec produces a confusing
4751    /// "mismatch" error on a bogus hex string.
4752    #[test]
4753    fn verify_rootfs_digest_rejects_malformed_expected() {
4754        let dir = tempfile::tempdir().expect("tmpdir");
4755        let path = dir.path().join("rootfs.ext4");
4756        std::fs::write(&path, b"any bytes").expect("write");
4757
4758        // Too short.
4759        let err = verify_rootfs_digest(&path, "deadbeef").expect_err("short hex must reject");
4760        assert!(err.to_string().contains("64 hex chars"), "got: {err}");
4761
4762        // Right length, wrong charset.
4763        let bad_chars = "z".repeat(64);
4764        let err = verify_rootfs_digest(&path, &bad_chars).expect_err("non-hex chars must reject");
4765        assert!(err.to_string().contains("64 hex chars"), "got: {err}");
4766    }
4767
4768    // ── L2-06-3 — derive_mem_size_mib tests ────────────────────────────────
4769
4770    /// `spec.run.limits.memoryMax=512 MiB` → `mem_size_mib = 512`.
4771    /// This is the production happy-path: the supervisor admits a spec, the
4772    /// admitted memory cap flows directly into the Firecracker machine
4773    /// configuration, and the VM boots with the right RAM allocation.
4774    #[test]
4775    fn derive_mem_size_mib_uses_spec_when_present() {
4776        let mut doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
4777            "apiVersion": "cellos.io/v1",
4778            "kind": "ExecutionCell",
4779            "spec": {
4780                "id": "mem-test",
4781                "authority": { "secretRefs": [] },
4782                "lifetime": { "ttlSeconds": 60 },
4783                "run": { "argv": [] }
4784            }
4785        }))
4786        .unwrap();
4787        let run = doc.spec.run.as_mut().expect("run present");
4788        run.limits = Some(cellos_core::RunLimits {
4789            memory_max_bytes: Some(512 * 1024 * 1024),
4790            cpu_max: None,
4791            graceful_shutdown_seconds: None,
4792        });
4793        assert_eq!(derive_mem_size_mib(&doc.spec, DEFAULT_MEM_SIZE_MIB), 512);
4794    }
4795
4796    /// No `run.limits` → fall back to the env-derived default. The default
4797    /// argument is `DEFAULT_MEM_SIZE_MIB` (128 MiB) in production; pass a
4798    /// different sentinel here to prove the fallback is honoured rather than
4799    /// hardcoded.
4800    #[test]
4801    fn derive_mem_size_mib_falls_back_to_env_default() {
4802        let doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
4803            "apiVersion": "cellos.io/v1",
4804            "kind": "ExecutionCell",
4805            "spec": {
4806                "id": "mem-default",
4807                "authority": { "secretRefs": [] },
4808                "lifetime": { "ttlSeconds": 60 }
4809            }
4810        }))
4811        .unwrap();
4812        // Sentinel default (256) chosen so we'd notice if 128 leaked.
4813        assert_eq!(derive_mem_size_mib(&doc.spec, 256), 256);
4814    }
4815
4816    /// Sub-MiB allocations clamp UP to 64 MiB (the practical Firecracker
4817    /// floor below which the guest kernel panics during init).
4818    #[test]
4819    fn derive_mem_size_mib_clamps_to_64_minimum() {
4820        let mut doc: ExecutionCellDocument = serde_json::from_value(serde_json::json!({
4821            "apiVersion": "cellos.io/v1",
4822            "kind": "ExecutionCell",
4823            "spec": {
4824                "id": "tiny",
4825                "authority": { "secretRefs": [] },
4826                "lifetime": { "ttlSeconds": 60 },
4827                "run": { "argv": [] }
4828            }
4829        }))
4830        .unwrap();
4831        let run = doc.spec.run.as_mut().expect("run present");
4832        run.limits = Some(cellos_core::RunLimits {
4833            memory_max_bytes: Some(1024), // 1 KiB → would be 0 MiB
4834            cpu_max: None,
4835            graceful_shutdown_seconds: None,
4836        });
4837        assert_eq!(derive_mem_size_mib(&doc.spec, DEFAULT_MEM_SIZE_MIB), 64);
4838    }
4839
4840    // ── L2-06-4 — validate_jailer_security_config tests ────────────────────
4841
4842    /// A well-formed config (non-root uid/gid, non-`/` chroot, jailer set)
4843    /// passes validation. This is the production posture.
4844    #[test]
4845    fn validate_jailer_security_config_accepts_safe_defaults() {
4846        let cfg = FirecrackerConfig {
4847            binary_path: PathBuf::from("/opt/fc/firecracker"),
4848            kernel_image_path: PathBuf::from("/opt/fc/vmlinux"),
4849            rootfs_image_path: PathBuf::from("/opt/fc/rootfs.ext4"),
4850            jailer_binary_path: Some(PathBuf::from("/opt/fc/jailer")),
4851            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
4852            socket_dir: PathBuf::from("/tmp"),
4853            jailer_uid: 10002,
4854            jailer_gid: 10002,
4855            scratch_dir: None,
4856            manifest_path: None,
4857            require_jailer: true,
4858            allow_no_manifest: true,
4859            enable_network: false,
4860            allow_no_vsock: false,
4861            no_vsock_timeout: std::time::Duration::from_secs(5),
4862            no_seccomp: false,
4863        };
4864        validate_jailer_security_config(&cfg).expect("safe defaults must validate");
4865    }
4866
4867    /// uid=0 must be rejected with an `L2-06-4` audit tag.
4868    #[test]
4869    fn validate_jailer_security_config_rejects_root_uid() {
4870        let cfg = FirecrackerConfig {
4871            binary_path: PathBuf::from("/opt/fc/firecracker"),
4872            kernel_image_path: PathBuf::from("/opt/fc/vmlinux"),
4873            rootfs_image_path: PathBuf::from("/opt/fc/rootfs.ext4"),
4874            jailer_binary_path: Some(PathBuf::from("/opt/fc/jailer")),
4875            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
4876            socket_dir: PathBuf::from("/tmp"),
4877            jailer_uid: 0,
4878            jailer_gid: 10002,
4879            scratch_dir: None,
4880            manifest_path: None,
4881            require_jailer: true,
4882            allow_no_manifest: true,
4883            enable_network: false,
4884            allow_no_vsock: false,
4885            no_vsock_timeout: std::time::Duration::from_secs(5),
4886            no_seccomp: false,
4887        };
4888        let err = validate_jailer_security_config(&cfg).expect_err("uid=0 must be rejected");
4889        let msg = err.to_string();
4890        assert!(msg.contains("jailer_uid=0"), "got: {msg}");
4891        assert!(msg.contains("L2-06-4"), "audit tag missing: {msg}");
4892    }
4893
4894    /// gid=0 must be rejected with an `L2-06-4` audit tag.
4895    #[test]
4896    fn validate_jailer_security_config_rejects_root_gid() {
4897        let cfg = FirecrackerConfig {
4898            binary_path: PathBuf::from("/opt/fc/firecracker"),
4899            kernel_image_path: PathBuf::from("/opt/fc/vmlinux"),
4900            rootfs_image_path: PathBuf::from("/opt/fc/rootfs.ext4"),
4901            jailer_binary_path: Some(PathBuf::from("/opt/fc/jailer")),
4902            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
4903            socket_dir: PathBuf::from("/tmp"),
4904            jailer_uid: 10002,
4905            jailer_gid: 0,
4906            scratch_dir: None,
4907            manifest_path: None,
4908            require_jailer: true,
4909            allow_no_manifest: true,
4910            enable_network: false,
4911            allow_no_vsock: false,
4912            no_vsock_timeout: std::time::Duration::from_secs(5),
4913            no_seccomp: false,
4914        };
4915        let err = validate_jailer_security_config(&cfg).expect_err("gid=0 must be rejected");
4916        let msg = err.to_string();
4917        assert!(msg.contains("jailer_gid=0"), "got: {msg}");
4918        assert!(msg.contains("L2-06-4"), "audit tag missing: {msg}");
4919    }
4920
4921    /// chroot_base_dir=`/` must be rejected — that's equivalent to no chroot.
4922    #[test]
4923    fn validate_jailer_security_config_rejects_root_chroot() {
4924        let cfg = FirecrackerConfig {
4925            binary_path: PathBuf::from("/opt/fc/firecracker"),
4926            kernel_image_path: PathBuf::from("/opt/fc/vmlinux"),
4927            rootfs_image_path: PathBuf::from("/opt/fc/rootfs.ext4"),
4928            jailer_binary_path: Some(PathBuf::from("/opt/fc/jailer")),
4929            chroot_base_dir: PathBuf::from("/"),
4930            socket_dir: PathBuf::from("/tmp"),
4931            jailer_uid: 10002,
4932            jailer_gid: 10002,
4933            scratch_dir: None,
4934            manifest_path: None,
4935            require_jailer: true,
4936            allow_no_manifest: true,
4937            enable_network: false,
4938            allow_no_vsock: false,
4939            no_vsock_timeout: std::time::Duration::from_secs(5),
4940            no_seccomp: false,
4941        };
4942        let err = validate_jailer_security_config(&cfg).expect_err("chroot=/ must be rejected");
4943        let msg = err.to_string();
4944        assert!(msg.contains("chroot_base_dir=`/`"), "got: {msg}");
4945        assert!(msg.contains("L2-06-4"), "audit tag missing: {msg}");
4946    }
4947
4948    /// No jailer configured (dev opt-out via `CELLOS_FIRECRACKER_ALLOW_NO_JAILER=1`)
4949    /// → validation short-circuits with `Ok(())`. The config-load WARN is the
4950    /// audit signal for that posture; double-erroring here would prevent
4951    /// developers from running the backend at all.
4952    #[test]
4953    fn validate_jailer_security_config_passes_when_jailer_disabled() {
4954        let cfg = FirecrackerConfig {
4955            binary_path: PathBuf::from("/opt/fc/firecracker"),
4956            kernel_image_path: PathBuf::from("/opt/fc/vmlinux"),
4957            rootfs_image_path: PathBuf::from("/opt/fc/rootfs.ext4"),
4958            jailer_binary_path: None, // dev mode
4959            chroot_base_dir: PathBuf::from("/var/lib/cellos/firecracker"),
4960            socket_dir: PathBuf::from("/tmp"),
4961            jailer_uid: 10002,
4962            jailer_gid: 10002,
4963            scratch_dir: None,
4964            manifest_path: None,
4965            require_jailer: false,
4966            allow_no_manifest: true,
4967            enable_network: false,
4968            allow_no_vsock: false,
4969            no_vsock_timeout: std::time::Duration::from_secs(5),
4970            no_seccomp: false,
4971        };
4972        validate_jailer_security_config(&cfg).expect("jailer disabled must short-circuit Ok");
4973    }
4974
4975    /// `build_jailer_argv` includes `--uid`, `--gid`, `--chroot-base-dir` with
4976    /// the values handed to it. This is the wire-level pin for L2-06-4: the
4977    /// validation function above guards the *config*, this test guards the
4978    /// *argv* that the live jailer process actually sees. If a future refactor
4979    /// drops one of these flags, the jailer no longer enforces the privilege
4980    /// boundary even with a correct config — both tests must pass.
4981    #[test]
4982    fn build_jailer_argv_includes_uid_gid_chroot_flags() {
4983        let argv = build_jailer_argv(
4984            "cell-l2-06-4",
4985            "/opt/fc/firecracker",
4986            "10002",
4987            "10003",
4988            "/var/lib/cellos/firecracker",
4989            false,
4990        );
4991        // --uid must appear with its value immediately after.
4992        let uid_pos = argv
4993            .iter()
4994            .position(|a| *a == "--uid")
4995            .expect("--uid must be present");
4996        assert_eq!(argv[uid_pos + 1], "10002");
4997        // --gid must appear with its value immediately after.
4998        let gid_pos = argv
4999            .iter()
5000            .position(|a| *a == "--gid")
5001            .expect("--gid must be present");
5002        assert_eq!(argv[gid_pos + 1], "10003");
5003        // --chroot-base-dir must appear with its value immediately after.
5004        let chroot_pos = argv
5005            .iter()
5006            .position(|a| *a == "--chroot-base-dir")
5007            .expect("--chroot-base-dir must be present");
5008        assert_eq!(argv[chroot_pos + 1], "/var/lib/cellos/firecracker");
5009    }
5010}