bvisor 0.9.0

Sync-first boundary supervisor: platform-agnostic boundary contract (types + fail-closed planner) with real Linux (landlock/seccomp/cgroups) and Wasm (wasmi/WASI) confinement backends. ZERO OS code, ZERO BatPak writes in the Backend trait.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
// THE §4 CONTRACT ORACLE for `InheritedFds::None` (proof-spine S5) — dual-channel +
// fail-closed. Proves the COMPLETE path spec → admission → lowering → execution →
// INDEPENDENT observation, INCLUDING the fail-closed branches, so the production
// ceiling may advertise InheritedFds::None=Enforced and the S1 coupling gate couples it.
//
// Compiles only with the real Linux backend + the dangerous-test-hooks harness
// (real clone3 + fexecve through the launcher bin), on Linux.
#![cfg(all(
    feature = "backend-linux",
    feature = "dangerous-test-hooks",
    target_os = "linux"
))]
//! THE BACKEND NEVER GRADES ITSELF. Two independent channels witness the child's open
//! file descriptors:
//!   (A) HOST-SIDE, KERNEL-STATE: the host reads `/proc/<child_pid>/fd` (the kernel's
//!       own fd table) and asserts it contains ONLY the declared/allowlisted fds — never
//!       a workload claim. This is the strongest oracle (genuinely independent).
//!   (B) WORKLOAD SELF-REPORT: the workload tries to WRITE to a non-CLOEXEC SENTINEL fd
//!       the PARENT opened before launch; its write must FAIL (the fd was scrubbed),
//!       reported on the launcher-captured stdout.
//! NO LEAK: the parent opens an inheritable (non-CLOEXEC) sentinel fd before launch and
//! relocates it to a fixed number; the test asserts that fd number is ABSENT from the
//! child's `/proc/<pid>/fd` (scrubbed host-side) AND that nothing the workload wrote
//! crossed the pipe (no data leak).
//!
//! The lowering under test is the REAL contract: the admitted `FdPolicy::None` drives the
//! descriptor-table fd-scrub the launcher runs (every undeclared inherited fd closed
//! before `fexecve`). The host-side /proc observation drives `run_launcher` directly
//! (execute() exposes no child pid — the S4-blessed seam), and the FULL execute() path is
//! independently exercised by the contract-path witness + the fail-closed test below.
//!
//! FAIL-CLOSED: (i) an undeclared inherited fd is scrubbed BEFORE the workload (cited from
//! the launcher mechanism proof `launcher_inherited_fds_linux.rs`); (ii) an unrealized fd
//! policy (a setup failure) ⇒ the target NEVER runs, via the full execute() path.

use bvisor::linux::launch::{self, AuthorityFd};
use bvisor::linux::protocol::{
    DescriptorKind, DescriptorRole, DescriptorShape, DescriptorSlotV1, LinuxLaunchBodyV1,
    LinuxLaunchPlanV1, LoweringWireEntryV1, LoweringWireV1, TargetSpecV1,
};
use bvisor::{
    AdmissionProgramHash, AttemptId, Backend, BackendId, BackendProfileHash, BackendRegistry,
    BoundaryPlanHash, BoundaryPlanner, BoundaryReportBody, BoundarySpec, BudgetRequirements,
    Capability, EnvPolicy, EvidenceRequirements, FdPolicy, HostControl, LinuxBackend, MinGuarantee,
    Outcome, StdStreams, Workload,
};
use std::io::Read;
use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, Instant};

// Frozen ids/phase-codes the launcher serves (mirror the launcher's constants).
// Lowering-entry id paired with `PHASE_CODE_SCRUB` when the launch plan is built.
const ID_AMBIENT_SCRUB: &str = "linux.ambient.scrub.v1";
// Lowering-entry id paired with `PHASE_CODE_EXEC` when the launch plan is built.
const ID_EXEC: &str = "linux.exec.v1";
// Phase code carried by the ambient-scrub lowering entry.
const PHASE_CODE_SCRUB: u8 = 3;
// Phase code carried by the exec lowering entry.
const PHASE_CODE_EXEC: u8 = 5;
// Slot/fd number used for the target executable: it is the descriptor-table slot index,
// the `TargetSpecV1::exe_slot`, and the `AuthorityFd::slot_index` in this file.
const SLOT_EXE: RawFd = 10;

// The injected undeclared (non-CLOEXEC) SENTINEL fd lands here: above the launcher
// channel fds (<= 14) and below the launcher's own relocation base (FD_RELOCATE_BASE ==
// 100), so it can collide with neither the channel plumbing nor a relocated source. It is
// the no-leak proof: it genuinely survives the launcher's execve and reaches the clone3
// child, so its ABSENCE from the child's /proc/<pid>/fd proves the scrub closed it.
// NOTE(review): the "<= 14" and "== 100" figures are the launcher's values as quoted
// here; they are not defined in this file — confirm against the launcher if they change.
const SENTINEL_FD: RawFd = 50;

/// Location of the `bvisor-linux-launcher` binary under test, resolved at compile time
/// from the `CARGO_BIN_EXE_bvisor-linux-launcher` environment variable.
fn test_launcher_path() -> PathBuf {
    let launcher_bin: &str = env!("CARGO_BIN_EXE_bvisor-linux-launcher");
    PathBuf::from(launcher_bin)
}

/// Build the per-run marker string the host later searches for in `/proc/<pid>/cmdline`.
///
/// The marker joins this process's pid with the wall-clock nanoseconds elapsed since the
/// Unix epoch. The timestamp comes from `SystemTime`, so it is NOT a monotonic clock; if
/// the clock reads earlier than the epoch the nanos component falls back to `0` and only
/// the pid distinguishes the marker.
fn unique_marker() -> String {
    let nanos = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_nanos(),
        Err(_) => 0,
    };
    let pid = std::process::id();
    format!("BVISOR-FD-MARKER-{pid}-{nanos}")
}

/// Relocate a copy of `fd` into the sentinel band and hand back ownership of the copy.
///
/// The copy is made with `fcntl(F_DUPFD, SENTINEL_FD)`: the kernel picks the LOWEST free
/// descriptor at or above `SENTINEL_FD` (so no existing fd is clobbered) and, unlike
/// `F_DUPFD_CLOEXEC`, leaves close-on-exec CLEARED so the copy is inheritable across exec.
/// The caller must keep the returned `OwnedFd` alive for as long as the fd number is in
/// use, and reads the number from it for the workload script.
/// (The `place_inheritable_high` pattern from launcher_inherited_fds_linux.rs:115.)
///
/// # Panics
/// Panics when `fcntl` fails (returns -1) or the new fd falls outside `[SENTINEL_FD, 100)`.
fn place_inheritable_high(fd: RawFd) -> OwnedFd {
    let collision_free_band = SENTINEL_FD..100;
    // SAFETY: test-only. F_DUPFD either yields a brand-new descriptor >= SENTINEL_FD with
    // CLOEXEC cleared, or -1 on failure; it does not alter `fd` itself.
    let new = unsafe { libc::fcntl(fd, libc::F_DUPFD, SENTINEL_FD) };
    assert!(
        collision_free_band.contains(&new),
        "F_DUPFD must land in the collision-free band [{SENTINEL_FD},100); got {new}"
    );
    // SAFETY: the assert above proved `new` is a valid descriptor freshly returned by
    // F_DUPFD; nothing else owns it, and it is adopted exactly once here.
    unsafe { OwnedFd::from_raw_fd(new) }
}

// ── Channel A: the HOST-SIDE /proc/<pid>/fd oracle ──────────────────────────────────

/// Poll `/proc/*/cmdline` for the EXEC'd target — the process whose command line contains
/// `marker` — until `deadline`, and return its pid.
///
/// The scan always runs AT LEAST ONCE, even if `deadline` has already passed by the time
/// this is called, and the pause between scans never sleeps beyond `deadline`.
///
/// Returns `None` if no matching process was seen by the deadline, so the caller can fail
/// the test honestly rather than panic on a race.
fn host_find_child(marker: &str, deadline: Instant) -> Option<RawFd> {
    // Pause between two consecutive /proc scans.
    const POLL_INTERVAL: Duration = Duration::from_millis(10);
    loop {
        if let Some(pid) = scan_proc_cmdline(marker) {
            return Some(pid);
        }
        // Zero once the deadline is reached or passed (saturating, never panics).
        let remaining = deadline.saturating_duration_since(Instant::now());
        if remaining.is_zero() {
            return None;
        }
        std::thread::sleep(remaining.min(POLL_INTERVAL));
    }
}

/// Walk `/proc` once and return the pid of the first process whose `cmdline` (the
/// NUL-separated argv, decoded lossily as UTF-8) contains `marker`.
///
/// Only all-digit directory names that parse as a pid are considered; entries that cannot
/// be listed or whose `cmdline` cannot be read are skipped silently. Returns `None` when
/// `/proc` itself cannot be listed or nothing matches.
fn scan_proc_cmdline(marker: &str) -> Option<RawFd> {
    let proc_dir = std::fs::read_dir("/proc").ok()?;
    proc_dir.flatten().find_map(|proc_entry| {
        let file_name = proc_entry.file_name();
        let pid_str = file_name.to_str()?;
        // Reject names like "+1" that `parse` would otherwise accept.
        let all_digits = pid_str.bytes().all(|b| b.is_ascii_digit());
        if !all_digits {
            return None;
        }
        let pid: RawFd = pid_str.parse().ok()?;
        let raw_cmdline = std::fs::read(format!("/proc/{pid_str}/cmdline")).ok()?;
        let cmdline = String::from_utf8_lossy(&raw_cmdline);
        let matched = cmdline.contains(marker);
        matched.then_some(pid)
    })
}

/// List the fd numbers currently open in process `pid`, as reported by the kernel under
/// `/proc/<pid>/fd` — no workload claim is involved.
///
/// Entries whose name is not a valid fd number (or that fail to list) are skipped. The
/// result is sorted ascending. Returns `None` when the directory cannot be opened (for
/// example the process already exited), so the caller may retry within its deadline.
fn host_read_child_fds(pid: RawFd) -> Option<Vec<RawFd>> {
    let fd_dir = format!("/proc/{pid}/fd");
    let listing = std::fs::read_dir(fd_dir).ok()?;
    let mut open_fds: Vec<RawFd> = listing
        .flatten()
        .filter_map(|slot| slot.file_name().to_str()?.parse::<RawFd>().ok())
        .collect();
    open_fds.sort_unstable();
    Some(open_fds)
}

// ── Launcher plan plumbing (the scrub is the REAL descriptor-table-driven lowering) ──

/// Build one version-1 lowering wire entry for `id` / `phase_code`, with both the
/// parameter and declaration digests left all-zero.
fn entry(id: &str, phase_code: u8) -> LoweringWireEntryV1 {
    let zero_digest = [0u8; 32];
    LoweringWireEntryV1 {
        id: String::from(id),
        version: 1,
        phase_code,
        param_digest: zero_digest,
        decl_digest: zero_digest,
    }
}

/// The descriptor-table slot for the target executable: index `SLOT_EXE`, role
/// `TargetExe`, expected to be a regular, non-writable file.
fn exe_slot() -> DescriptorSlotV1 {
    let slot_index = u32::try_from(SLOT_EXE).expect("fd fits u32");
    let expected = DescriptorShape {
        kind: DescriptorKind::Regular,
        writable: false,
    };
    DescriptorSlotV1 {
        slot_index,
        role: DescriptorRole::TargetExe,
        expected,
    }
}

/// Assemble a launcher plan that lowers exactly two steps — ambient scrub, then exec —
/// and whose descriptor table declares ONLY the exe slot (so the scrub's allowlist is
/// exactly stdio + exe + the launcher's protocol fds — `FdPolicy::None`).
///
/// `argv` becomes the target's argument vector; the target environment is a single
/// `PATH=/usr/bin:/bin` entry, and no user/network namespace or seccomp is requested.
/// The lowering hash `h_l` is computed over the canonical encoding of the lowering; the
/// remaining ids/hashes are fixed filler bytes.
fn exec_only_plan(argv: Vec<String>) -> LinuxLaunchPlanV1 {
    let scrub_then_exec = vec![
        entry(ID_AMBIENT_SCRUB, PHASE_CODE_SCRUB),
        entry(ID_EXEC, PHASE_CODE_EXEC),
    ];
    let lowering = LoweringWireV1 {
        entries: scrub_then_exec,
    };
    // Hash the lowering BEFORE it is moved into the body below.
    let encoded = batpak::canonical::to_bytes(&lowering).expect("encode lowering");
    let h_l = batpak::event::hash::compute_hash(&encoded);

    let descriptor_table = vec![exe_slot()];
    let target = TargetSpecV1 {
        argv,
        envp: vec![(String::from("PATH"), String::from("/usr/bin:/bin"))],
        exe_slot: u32::try_from(SLOT_EXE).expect("fd fits u32"),
        user_namespace: None,
        network_namespace: None,
        seccomp: None,
    };

    LinuxLaunchPlanV1 {
        body: LinuxLaunchBodyV1 {
            attempt_id: AttemptId([7u8; 32]),
            plan_id: BoundaryPlanHash([1u8; 32]),
            h_a: AdmissionProgramHash([2u8; 32]),
            h_p: BackendProfileHash([3u8; 32]),
            h_l,
            lowering,
            descriptor_table,
            target,
        },
    }
}

/// Open `/bin/sh` and wrap it as the authority fd for slot `SLOT_EXE`, making the shell
/// the exec'd target (it can list its own fds + try the sentinel write).
///
/// # Panics
/// Panics if `/bin/sh` cannot be opened.
fn sh_authority() -> AuthorityFd {
    let shell = std::fs::File::open("/bin/sh").expect("open /bin/sh");
    let handle = OwnedFd::from(shell);
    AuthorityFd {
        slot_index: SLOT_EXE,
        handle,
    }
}

// ── THE GUARANTEE-HOLDS ORACLE (dual channel + no-leak sentinel) ─────────────────────

#[test]
fn child_inherits_only_the_declared_fds_no_sentinel_leak() {
    let marker = unique_marker();

    // NO-LEAK SETUP: the PARENT opens an inheritable (non-CLOEXEC) sentinel via a pipe
    // write end relocated high, so it genuinely survives the launcher's execve and reaches
    // the clone3 child. It is NOT declared in the descriptor table, so the scrub MUST
    // close it. Both `writer` and `sentinel` stay alive across the spawn.
    let (mut reader, writer) = std::io::pipe().expect("create pipe");
    let sentinel = place_inheritable_high(writer.as_raw_fd());
    let sentinel_fd = sentinel.as_raw_fd();

    // The workload: keep alive (so the host can read /proc while it runs), carry the unique
    // marker IN THE SCRIPT (so the host can find it via /proc/<pid>/cmdline — the `: MARKER`
    // no-op embeds it where the shell's `-c` argument is recorded), AND try to write a LEAK
    // marker to the sentinel fd — its OWN report (channel B). The trailing `true` keeps the
    // shell RESIDENT (without it, bash tail-call-execs `sleep` and loses the marker); the
    // `sleep` keeps it alive while the host reads /proc.
    let script = format!(
        ": {marker}; \
         if printf LEAK >&{sentinel_fd}; then printf WROTE; else printf SCRUBBED; fi; \
         sleep 3; true"
    );
    let argv = vec!["sh".to_string(), "-c".to_string(), script];
    let plan = exec_only_plan(argv);
    let launcher = test_launcher_path();
    // The host-side observation window; shorter than the workload's `sleep 3`.
    let deadline = Instant::now() + Duration::from_millis(2500);

    // `Builder::spawn` (not `thread::spawn`, which panics on failure).
    let handle = std::thread::Builder::new()
        .name("fd-oracle-launcher".to_string())
        .spawn(move || {
            launch::run_launcher(&launcher, &plan, vec![sh_authority()])
                .expect("the launcher runs the fd-scrub workload to a verdict")
        })
        .expect("spawn the launcher driver thread");

    // ── CHANNEL A: host-side /proc/<child_pid>/fd (kernel state) ─────────────────────
    // Find the child, then read its open fds from the kernel while it sleeps. The fd table
    // is read AT LEAST ONCE after the child is found: if the child only showed up as the
    // deadline passed, a check-the-clock-first loop would skip the read entirely and fail
    // Channel A even though the child is still alive. Retries stop at the deadline.
    let host_fds: Option<Vec<RawFd>> = host_find_child(&marker, deadline).and_then(|pid| loop {
        if let Some(fds) = host_read_child_fds(pid) {
            break Some(fds);
        }
        if Instant::now() >= deadline {
            break None;
        }
        std::thread::sleep(Duration::from_millis(10));
    });

    let obs = handle.join().expect("fd-oracle launcher thread joins");
    if launch::launch_confinement_unavailable(&obs) {
        use std::io::Write as _;
        let mut sink = std::io::stderr();
        let _ = writeln!(
            sink,
            "SKIP child_inherits_only_the_declared_fds_no_sentinel_leak: kernel/container lacks \
             landlock/userns/seccomp (ENOSYS); the launcher faulted before exec — exercised on \
             capable kernels + the bvisor-linux CI lane"
        );
        return;
    }
    // Drop every host-side WRITE end so the pipe read sees EOF.
    drop(sentinel);
    drop(writer);
    let mut leaked = Vec::new();
    reader
        .read_to_end(&mut leaked)
        .expect("read the pipe read end");

    // Collect-and-assert (panic! is banned even in tests): gather every failure, assert once.
    let mut failures: Vec<String> = Vec::new();

    if !obs.exec_succeeded() {
        failures.push(format!(
            "the workload must reach ExecSucceeded; terminal={:?} notes={:?}",
            obs.terminal, obs.notes
        ));
    }

    // CHANNEL A: the host must have observed the child's kernel fd table while it was
    // alive; if not, that is itself a failure (no panic — collect-and-assert).
    match host_fds {
        None => failures.push(
            "CHANNEL A: the host must observe the child's /proc/<pid>/fd while it is alive"
                .to_string(),
        ),
        Some(host_fds) => {
            // The child's open fds are EXACTLY the declared allowlist — stdio (0,1,2) plus
            // the declared TargetExe slot fd (SLOT_EXE == 10), which the workload inherits
            // as its own image fd. EVERY other low fd is undeclared and MUST have been
            // scrubbed; in particular the SENTINEL fd (50) MUST be absent. No fd in the
            // collision-free band below the launcher's relocation base (< 100), other than
            // stdio + the declared exe slot, may survive.
            let declared = [0, 1, 2, SLOT_EXE];
            let undeclared: Vec<RawFd> = host_fds
                .iter()
                .copied()
                .filter(|fd| !declared.contains(fd) && *fd < 100)
                .collect();
            if !undeclared.is_empty() {
                failures.push(format!(
                    "CHANNEL A: the child's /proc/<pid>/fd must contain ONLY the declared \
                     allowlist; undeclared low fds survived: {undeclared:?} (full set {host_fds:?})"
                ));
            }
            if host_fds.contains(&sentinel_fd) {
                failures.push(format!(
                    "CHANNEL A (no-leak): the undeclared sentinel fd {sentinel_fd} was NOT \
                     scrubbed — it survived into the child: {host_fds:?}"
                ));
            }
            // Stdio must still be present (the workload needs its inherited stdio).
            for std_fd in [0, 1, 2] {
                if !host_fds.contains(&std_fd) {
                    failures.push(format!(
                        "CHANNEL A: the declared stdio fd {std_fd} must survive the scrub: \
                         {host_fds:?}"
                    ));
                }
            }
        }
    }

    // CHANNEL B: the workload's OWN report — its write to the sentinel fd failed.
    let out = String::from_utf8_lossy(&obs.captured_stdout);
    if !out.contains("SCRUBBED") || out.contains("WROTE") {
        failures.push(format!(
            "CHANNEL B: the workload must report the sentinel fd was SCRUBBED; got stdout={out:?}"
        ));
    }
    // NO LEAK (independent, host-side): nothing the workload wrote crossed the pipe.
    if !leaked.is_empty() {
        failures.push(format!(
            "no-leak: the sentinel fd LEAKED across the boundary: host read {leaked:?} from the pipe"
        ));
    }

    assert!(
        failures.is_empty(),
        "fd-scrub oracle failures: {failures:#?}"
    );
}

// ── The full-execute()-path witness + the contract-level fail-closed branch ──────────

/// Build the boundary spec used by the execute()-path tests: `/bin/sh -c "exit 0"` with
/// an `InheritedFds { policy }` capability under test, alongside an exact-empty
/// `Environment` capability, plus the launch and stdout/stderr-capture host controls.
/// Budgets are `uniform(8, Mediated)` and evidence requirements are the defaults.
fn fds_spec(policy: FdPolicy) -> BoundarySpec {
    let workload = Workload::Process {
        exe: String::from("/bin/sh"),
        args: vec![String::from("-c"), String::from("exit 0")],
    };
    // An empty explicit env so the child gets a clean, declared environment.
    let clean_env = Capability::Environment {
        policy: EnvPolicy::Exact(Vec::new()),
    };
    let capabilities = vec![Capability::InheritedFds { policy }, clean_env];
    let controls = vec![
        HostControl::LaunchWorkload,
        HostControl::CaptureStreams {
            streams: StdStreams::capture_out_err(),
        },
    ];
    BoundarySpec {
        workload,
        capabilities,
        controls,
        budgets: BudgetRequirements::uniform(8, MinGuarantee::Mediated),
        evidence: EvidenceRequirements::default(),
    }
}

/// Drive `spec` through the LinuxBackend contract path — register the backend, plan, then
/// run — and return the body of the sealed report.
///
/// Returns `None` when `plan()` returns an error (the callers treat that as admission
/// refused and assert on it).
///
/// # Panics
/// Panics if planning succeeds but the runner fails to produce a report.
fn run_execute(spec: &BoundarySpec) -> Option<BoundaryReportBody> {
    let backend = Arc::new(LinuxBackend::with_launcher_path(test_launcher_path()));
    let id: BackendId = backend.id();
    let mut registry = BackendRegistry::new();
    registry.register(Arc::clone(&backend) as Arc<dyn Backend>);

    let planned = BoundaryPlanner::new(&registry).plan(spec, &id);
    let plan = match planned {
        Ok(plan) => plan,
        Err(_) => return None,
    };
    let sealed = bvisor::BoundaryRunner::new(&registry)
        .run(&plan)
        .expect("the run seals a terminal report");
    Some(sealed.body)
}

#[test]
fn a_none_policy_spec_runs_through_the_execute_path() {
    // Witness for the FULL execute()/BoundaryRunner contract path: a None-policy spec must
    // be admitted, run to a clean verdict, and leave the fd-lowering fact in the report —
    // i.e. the lowering rides the production contract, not only a launcher-direct plan.
    let spec = fds_spec(FdPolicy::None);
    let report = run_execute(&spec)
        .expect("an InheritedFds::None spec must ADMIT (the cell is Enforced)");

    // Hosts that cannot install confinement skip instead of failing.
    if launch::report_confinement_unavailable(&report.observed) {
        use std::io::Write as _;
        let _ = writeln!(
            std::io::stderr(),
            "SKIP a_none_policy_spec_runs_through_the_execute_path: kernel/container lacks \
             landlock/userns/seccomp (ENOSYS); confinement cannot install here — exercised on \
             capable kernels + the bvisor-linux CI lane"
        );
        return;
    }

    // Collect every failure first, then assert once.
    let mut failures: Vec<String> = Vec::new();

    let completed = report.outcome == Outcome::Completed;
    if !completed {
        failures.push(format!(
            "the None-policy workload must run to Completed: {:?} / {:?}",
            report.outcome, report.observed
        ));
    }

    // The lowering must have recorded its fact on the production execute() path.
    let lowering_recorded = report
        .observed
        .iter()
        .any(|fact| fact.kind == "inherited_fds_lowered");
    if !lowering_recorded {
        failures.push(format!(
            "the execute() path must record the fd lowering: {:?}",
            report.observed
        ));
    }

    assert!(
        failures.is_empty(),
        "execute()-path witness failures: {failures:#?}"
    );
}

#[test]
fn an_unrealized_fd_policy_fails_closed_and_the_target_never_runs() {
    // CONTRACT-LEVEL FAIL-CLOSED: this backend's scrub realizes only `FdPolicy::None`, so
    // an `FdPolicy::Only` request is absent from the ceiling and must be REFUSED before
    // execution. `run_execute` yields `None` exactly when planning fails, so no run — and
    // hence no target — happens. This exercises the fail-closed branch on the full contract
    // path (admission), not only a launcher-direct mechanism.
    let unrealized_policy = FdPolicy::Only(vec![7]);
    let spec = fds_spec(unrealized_policy);
    let report = run_execute(&spec);
    assert!(
        report.is_none(),
        "an InheritedFds::Only spec must FAIL CLOSED at admission (the cell is Unsupported) — \
         the target never runs; got a sealed report {report:?}"
    );
}