#[path = "imp/support.rs"]
mod support;
use crate::sys::{self, ChildExecPlan, LandlockRoot, UsernsSyncPipe};
use bvisor::linux::protocol::{
confinement_installed, phase_resolution_consistent, ready_to_exec, validate_table,
DescriptorRole, LauncherState, LinuxLaunchBodyV1, LinuxLaunchPlanV1, LoweringWireEntryV1,
PhaseResult, RefusalReason, SetupPhase,
};
use std::collections::BTreeSet;
use std::os::fd::RawFd;
use support::{
boot_fault, build_seccomp_filter, user_namespace_rendezvous, verify_handles, wait_for_child,
ChildOutcome, Transcript,
};
// Lowering entry ids recognised by `classify_schedule`; an entry with any
// other id (or a recognised id under the wrong phase code) is refused.
const ID_AMBIENT_SCRUB: &str = "linux.ambient.scrub.v1";
const ID_EXEC: &str = "linux.exec.v1";
const ID_LANDLOCK_APPLY: &str = "linux.landlock.apply.v1";
const ID_SECCOMP_APPLY: &str = "linux.seccomp.apply.v1";
// Phase codes that `classify_schedule` requires alongside the ids above:
// scrub entries use SCRUB, landlock and seccomp entries use CONFINE, and the
// exec entry uses EXEC.
const PHASE_CODE_SCRUB: u8 = 3;
const PHASE_CODE_EXEC: u8 = 5;
const PHASE_CODE_CONFINE: u8 = 4;
// Environment variables from which `fd_from_env` reads decimal fd numbers.
// The control fd is read by `run`; the other three by `read_launch_inputs`.
const ENV_PLAN_FD: &str = "BVISOR_LAUNCH_PLAN_FD";
const ENV_CONTROL_FD: &str = "BVISOR_CONTROL_FD";
const ENV_ERROR_FD: &str = "BVISOR_ERROR_FD";
const ENV_ERROR_READ_FD: &str = "BVISOR_ERROR_READ_FD";
/// Faults that abort launcher setup; `run` reports them on the control
/// transcript (or via `boot_fault` when no transcript exists yet).
#[derive(Debug)]
enum BootError {
/// An fd-carrying environment variable was missing or malformed
/// (produced by `fd_from_env`).
BadFdEnv {
/// Name of the offending environment variable.
var: &'static str,
},
/// Used by `run` when the control-channel fd variable cannot be read.
NoControlChannel,
/// An underlying I/O error, converted through `From<std::io::Error>`.
Os(std::io::Error),
/// `ensure_single_threaded` counted a number of tasks other than one.
NotSingleThreaded {
/// Number of task entries that were observed.
observed: usize,
},
}
impl std::fmt::Display for BootError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::BadFdEnv { var } => write!(f, "missing or malformed fd env var {var}"),
Self::NoControlChannel => write!(f, "no usable control channel fd"),
Self::Os(e) => write!(f, "launcher OS fault: {e}"),
Self::NotSingleThreaded { observed } => {
write!(f, "launcher is not single-threaded: {observed} tasks")
}
}
}
}
// Marker impl: relies on the `Debug` derive and the `Display` impl above,
// and overrides none of the trait's provided methods.
impl std::error::Error for BootError {}
impl From<std::io::Error> for BootError {
fn from(e: std::io::Error) -> Self {
Self::Os(e)
}
}
/// Final outcome of a launch attempt, mapped to the process exit code by `run`.
enum Verdict {
/// The child reached exec; `run` exits with `ExitCode::SUCCESS`.
ExecSucceeded,
/// The plan was refused for the given reason; `run` exits with code 3.
Refused(RefusalReason),
/// Setup failed; `run` exits with code 4.
Faulted,
}
/// Launcher entry point.
///
/// Adopts the control fd named by `BVISOR_CONTROL_FD`, drives the launch and
/// maps the outcome to an exit code: success, 3 for a refusal, 4 for a fault.
/// If the control fd cannot be read from the environment the result of
/// `boot_fault` is returned instead, since no transcript exists yet.
pub(crate) fn run() -> std::process::ExitCode {
    use std::process::ExitCode;

    let Ok(control_fd) = fd_from_env(ENV_CONTROL_FD) else {
        return boot_fault(&BootError::NoControlChannel);
    };
    let mut control = Transcript::new(sys::adopt_fd(control_fd));

    let fault = match drive(&mut control) {
        Ok(Verdict::ExecSucceeded) => return ExitCode::SUCCESS,
        Ok(Verdict::Refused(_)) => return ExitCode::from(3),
        Ok(Verdict::Faulted) => return ExitCode::from(4),
        Err(fault) => fault,
    };

    // Only the error path reports here; the `Ok` verdicts above are returned
    // without any further transcript output from this function.
    control.emit(LauncherState::SetupFaulted);
    let _ = control.note(&format!("fault: {fault}"));
    ExitCode::from(4)
}
fn drive(control: &mut Transcript) -> Result<Verdict, BootError> {
control.emit(LauncherState::LauncherStarted);
ensure_single_threaded()?;
let inputs = match read_launch_inputs(control)? {
DriveStep::Continue(inputs) => inputs,
DriveStep::Done(verdict) => return Ok(verdict),
};
let body = &inputs.plan.body;
let verified = match verify_launch_plan(control, body)? {
DriveStep::Continue(verified) => verified,
DriveStep::Done(verdict) => return Ok(verdict),
};
let mechanisms = match prepare_mechanisms(control, body, &verified)? {
DriveStep::Continue(mechanisms) => mechanisms,
DriveStep::Done(verdict) => return Ok(verdict),
};
let PreparedMechanisms {
confinement,
seccomp_program,
seccomp_built,
phases,
} = mechanisms;
let child_pid = match spawn_child(
control,
body,
inputs.error_fd,
confinement,
seccomp_program.as_ref(),
seccomp_built,
)? {
DriveStep::Continue(child_pid) => child_pid,
DriveStep::Done(verdict) => return Ok(verdict),
};
finish_child(
control,
inputs.error_fd,
inputs.error_read_fd,
child_pid,
&verified.schedule,
&phases,
)
}
/// Result of one pipeline stage: either a value for the next stage or a
/// final verdict that ends the launch.
enum DriveStep<T> {
/// The stage succeeded; carry this value forward.
Continue(T),
/// The launch is over; `drive` returns this verdict.
Done(Verdict),
}
/// Everything `read_launch_inputs` gathers before verification starts.
struct LaunchInputs {
/// Plan decoded from the bytes read off the `BVISOR_LAUNCH_PLAN_FD` descriptor.
plan: LinuxLaunchPlanV1,
/// Fd named by `BVISOR_ERROR_FD`; `finish_child` closes it before waiting.
error_fd: RawFd,
/// Fd named by `BVISOR_ERROR_READ_FD`; handed to `wait_for_child`.
error_read_fd: RawFd,
}
/// Output of `verify_launch_plan` once every check has passed.
struct VerifiedPlan {
/// Digest computed by `schedule_digest`; already compared equal to `body.h_l`.
observed_digest: [u8; 32],
/// Lowering entries bucketed by `classify_schedule`.
schedule: ClassifiedSchedule,
}
/// Confinement artefacts built by `prepare_mechanisms` ahead of the spawn.
struct PreparedMechanisms {
/// Landlock build result; `None` when no landlock entry was scheduled.
confinement: Option<BuiltConfinement>,
/// Seccomp program; `None` when no seccomp entry was scheduled.
seccomp_program: Option<sys::BpfProgram>,
/// Mirrors `seccomp_program.is_some()` at the time of construction.
seccomp_built: bool,
/// Phase results computed by `compute_phases`.
phases: Phases,
}
/// Succeeds only when `count_self_tasks` reports exactly one task.
///
/// # Errors
/// `BootError::NotSingleThreaded` carrying the observed count otherwise, or
/// whatever error `count_self_tasks` itself returns.
fn ensure_single_threaded() -> Result<(), BootError> {
    match count_self_tasks()? {
        1 => Ok(()),
        observed => Err(BootError::NotSingleThreaded { observed }),
    }
}
/// Reads the plan, error and error-read fds from the environment, slurps the
/// plan fd and decodes the launch plan.
///
/// A plan that fails to decode is refused with `RefusalReason::PlanInvalid`;
/// missing/malformed fd variables and read failures surface as `BootError`.
fn read_launch_inputs(control: &mut Transcript) -> Result<DriveStep<LaunchInputs>, BootError> {
    // The lookup order decides which variable is reported when several are bad.
    let plan_fd = fd_from_env(ENV_PLAN_FD)?;
    let error_fd = fd_from_env(ENV_ERROR_FD)?;
    let error_read_fd = fd_from_env(ENV_ERROR_READ_FD)?;

    let encoded = sys::read_fd_to_vec(plan_fd)?;
    let step = match LinuxLaunchPlanV1::decode(&encoded) {
        Err(_) => DriveStep::Done(refuse(control, RefusalReason::PlanInvalid)),
        Ok(plan) => DriveStep::Continue(LaunchInputs {
            plan,
            error_fd,
            error_read_fd,
        }),
    };
    Ok(step)
}
fn verify_launch_plan(
control: &mut Transcript,
body: &LinuxLaunchBodyV1,
) -> Result<DriveStep<VerifiedPlan>, BootError> {
let observed_digest = schedule_digest(body);
if observed_digest != body.h_l {
return Ok(DriveStep::Done(refuse(
control,
RefusalReason::IdentityMismatch,
)));
}
control.emit(LauncherState::IdentityVerified);
if validate_table(&body.descriptor_table).is_err() {
return Ok(DriveStep::Done(refuse(control, RefusalReason::PlanInvalid)));
}
let schedule = match classify_schedule(body) {
Ok(schedule) => schedule,
Err(reason) => return Ok(DriveStep::Done(refuse(control, reason))),
};
control.emit(LauncherState::PlanVerified);
if verify_handles(body)?.is_err() {
return Ok(DriveStep::Done(refuse(
control,
RefusalReason::HandleMismatch,
)));
}
control.emit(LauncherState::HandlesVerified);
Ok(DriveStep::Continue(VerifiedPlan {
observed_digest,
schedule,
}))
}
fn prepare_mechanisms(
control: &mut Transcript,
body: &LinuxLaunchBodyV1,
verified: &VerifiedPlan,
) -> Result<DriveStep<PreparedMechanisms>, BootError> {
let confinement = match build_landlock_if_scheduled(control, body, &verified.schedule)? {
DriveStep::Continue(confinement) => confinement,
DriveStep::Done(verdict) => return Ok(DriveStep::Done(verdict)),
};
let confine_built = confinement.is_some();
let seccomp_program = match build_seccomp_if_scheduled(control, body, &verified.schedule) {
DriveStep::Continue(program) => program,
DriveStep::Done(verdict) => return Ok(DriveStep::Done(verdict)),
};
let seccomp_built = seccomp_program.is_some();
let confinement_built =
confinement_actions_built(&verified.schedule, confine_built, seccomp_built);
let phases = compute_phases(&verified.schedule, confinement_built);
if !phases_are_honest(&verified.schedule, &phases) {
return Ok(DriveStep::Done(Verdict::Faulted));
}
if !ready_to_launch(
&verified.schedule,
&phases,
confinement_built,
verified,
body,
) {
return Ok(DriveStep::Done(refuse(control, RefusalReason::PlanInvalid)));
}
Ok(DriveStep::Continue(PreparedMechanisms {
confinement,
seccomp_program,
seccomp_built,
phases,
}))
}
/// Builds the landlock confinement when the schedule contains landlock
/// entries; yields `Continue(None)` when it does not.
///
/// The set of open fds is snapshotted first so `build_confinement` can tell
/// which descriptors the build itself opened. Build refusals are mapped to
/// `MissingPrimitive` (ABI below floor) or `HandleMismatch` (no usable root).
fn build_landlock_if_scheduled(
    control: &mut Transcript,
    body: &LinuxLaunchBodyV1,
    schedule: &ClassifiedSchedule,
) -> Result<DriveStep<Option<BuiltConfinement>>, BootError> {
    if schedule.confine.is_empty() {
        return Ok(DriveStep::Continue(None));
    }
    let open_before = open_fd_set()?;
    let reason = match build_confinement(body, &open_before) {
        Ok(built) => return Ok(DriveStep::Continue(Some(built))),
        Err(ConfineRefusal::AbiBelowFloor) => RefusalReason::MissingPrimitive,
        Err(ConfineRefusal::NoUsableRoot) => RefusalReason::HandleMismatch,
    };
    Ok(DriveStep::Done(refuse(control, reason)))
}
/// Builds the seccomp program when the schedule contains seccomp entries;
/// yields `Continue(None)` when it does not. A failed build is refused with
/// `RefusalReason::MissingPrimitive`.
fn build_seccomp_if_scheduled(
    control: &mut Transcript,
    body: &LinuxLaunchBodyV1,
    schedule: &ClassifiedSchedule,
) -> DriveStep<Option<sys::BpfProgram>> {
    if schedule.seccomp.is_empty() {
        return DriveStep::Continue(None);
    }
    if let Ok(program) = build_seccomp_filter(body) {
        DriveStep::Continue(Some(program))
    } else {
        DriveStep::Done(refuse(control, RefusalReason::MissingPrimitive))
    }
}
/// Asks the protocol's `ready_to_exec` whether the launch may proceed, given
/// the four phase results and the observed versus expected digest.
///
/// In debug builds it additionally asserts that `confinement_installed`
/// agrees with "confinement was scheduled and built".
fn ready_to_launch(
    schedule: &ClassifiedSchedule,
    phases: &Phases,
    confinement_built: bool,
    verified: &VerifiedPlan,
    body: &LinuxLaunchBodyV1,
) -> bool {
    let scheduled_actions = schedule.confinement_actions().len();
    debug_assert_eq!(
        confinement_installed(scheduled_actions, phases.confinement),
        scheduled_actions > 0 && confinement_built
    );

    let phase_results = [
        (SetupPhase::Identity, phases.identity),
        (SetupPhase::Visibility, phases.visibility),
        (SetupPhase::AmbientAuthority, phases.ambient),
        (SetupPhase::Confinement, phases.confinement),
    ];
    // NOTE(review): the meaning of the leading `true` is defined by
    // `ready_to_exec` in the protocol crate — confirm there.
    ready_to_exec(true, phase_results, verified.observed_digest, body.h_l)
}
/// Creates the confined child via `sys::clone3_child`.
///
/// Steps: resolve the target exe fd, decide which fds the child keeps,
/// build the child's exec plan, re-check that the launcher is single
/// threaded, clone, then (when a user namespace was requested) run the
/// user-namespace rendezvous.
///
/// Returns `Continue(child_pid)` on success and `Done(Verdict::Faulted)` on a
/// setup fault. Every fault exit writes a note and `SetupFaulted` to the
/// transcript, because `run` adds nothing for `Ok(Verdict::Faulted)`.
///
/// # Errors
/// Propagates `BootError` from `exe_slot_fd`, `scrub_close_list`,
/// `ensure_single_threaded` and `sys::clone3_child`.
fn spawn_child(
    control: &mut Transcript,
    body: &LinuxLaunchBodyV1,
    error_fd: RawFd,
    confinement: Option<BuiltConfinement>,
    seccomp_program: Option<&sys::BpfProgram>,
    seccomp_built: bool,
) -> Result<DriveStep<libc::pid_t>, BootError> {
    // Records why setup failed plus the terminal state, then yields the
    // faulted step.
    fn setup_fault<T>(control: &mut Transcript, reason: &str) -> DriveStep<T> {
        let _ = control.note(reason);
        control.emit(LauncherState::SetupFaulted);
        DriveStep::Done(Verdict::Faulted)
    }

    let exe_fd = exe_slot_fd(body)?;
    let ruleset_fds = confinement
        .as_ref()
        .map_or(&[][..], |built| built.ruleset_fds.as_slice());
    let known = KnownFds { error: error_fd };
    let userns_requested = body.target.user_namespace.is_some();
    let netns_requested = body.target.network_namespace.is_some();
    // A network namespace is only supported together with a user namespace.
    if netns_requested && !userns_requested {
        return Ok(setup_fault(
            control,
            "network_namespace=requested_without_userns fail_closed",
        ));
    }
    let sync_pipe = match make_optional_sync_pipe(userns_requested) {
        Ok(sync_pipe) => sync_pipe,
        Err(()) => {
            return Ok(setup_fault(
                control,
                "user_namespace=sync_pipe_failed fail_closed",
            ))
        }
    };
    let child_sync = sync_pipe.map(|(read, write)| UsernsSyncPipe { read, write });
    // The child keeps stdio, the exe fd, the error fd, the ruleset fds and
    // the sync pipe's read end; every other open fd goes on its close list.
    let allow = allowlist(&known, exe_fd, ruleset_fds, child_sync.map(|p| p.read));
    let close_fds = scrub_close_list(&allow)?;
    let child_plan = match ChildExecPlan::build(
        exe_fd,
        None,
        error_fd,
        child_sync,
        &body.target.argv,
        &body.target.envp,
        close_fds,
    ) {
        Ok(plan) => plan,
        Err(_) => return Ok(setup_fault(control, "child_plan=build_failed fail_closed")),
    };
    // Re-checked immediately before cloning.
    ensure_single_threaded()?;
    let cgroup_fd = cgroup_slot_fd(body);
    let child_pid = sys::clone3_child(
        &child_plan,
        confinement.map(|c| c.ruleset),
        seccomp_program,
        cgroup_fd,
        userns_requested,
        netns_requested,
    )?;
    control.emit(LauncherState::ChildCreated);
    let _ = control.note(&format!("mechanism=clone3 child_pid={child_pid}"));
    if cgroup_fd.is_some() {
        let _ = control.note("cgroup_placement=clone_into_cgroup");
    }
    if seccomp_built {
        let _ = control.note("seccomp=denylist_installed mode=filter");
    }
    // The rendezvous records its own note and `SetupFaulted` on failure.
    if rendezvous_user_namespace(control, child_pid, sync_pipe, netns_requested).is_err() {
        return Ok(DriveStep::Done(Verdict::Faulted));
    }
    Ok(DriveStep::Continue(child_pid))
}
/// Creates the user-namespace sync pipe only when one was requested.
///
/// Returns `Ok(None)` when not requested, `Ok(Some(pipe))` with the pair
/// produced by `sys::make_sync_pipe`, or `Err(())` if pipe creation failed
/// (the underlying error is discarded).
fn make_optional_sync_pipe(userns_requested: bool) -> Result<Option<(RawFd, RawFd)>, ()> {
    if !userns_requested {
        return Ok(None);
    }
    match sys::make_sync_pipe() {
        Ok(ends) => Ok(Some(ends)),
        Err(_) => Err(()),
    }
}
/// Completes the user-namespace handshake with the child when a sync pipe
/// exists; a no-op returning `Ok(())` when it does not.
///
/// On success the mapping (and, if requested, the empty network namespace)
/// is noted on the transcript. On failure the child is reaped, the failure
/// is noted, `SetupFaulted` is emitted and `Err(())` is returned.
fn rendezvous_user_namespace(
    control: &mut Transcript,
    child_pid: libc::pid_t,
    sync_pipe: Option<(RawFd, RawFd)>,
    netns_requested: bool,
) -> Result<(), ()> {
    // Only the write end is used here.
    let write = match sync_pipe {
        Some((_, write)) => write,
        None => return Ok(()),
    };

    if user_namespace_rendezvous(child_pid, write).is_err() {
        sys::reap_child(child_pid);
        let _ = control.note("user_namespace=map_write_failed fail_closed");
        control.emit(LauncherState::SetupFaulted);
        return Err(());
    }

    let _ = control.note("user_namespace=mapped child_uid0_egid0");
    if netns_requested {
        let _ = control.note("network_namespace=empty_netns child_isolated");
    }
    Ok(())
}
/// Closes the launcher's copy of the error fd, waits for the child and
/// records the final states on the transcript.
///
/// * `ChildOutcome::Errno` — notes the errno, emits `SetupFaulted`, returns
///   `Verdict::Faulted`.
/// * `ChildOutcome::ExecedToEof` — emits the phase-resolution states, notes
///   the confinement result, emits `ReadyToExec` and `ExecSucceeded`, and
///   returns `Verdict::ExecSucceeded`.
///
/// # Errors
/// Propagates `BootError` from `wait_for_child`.
fn finish_child(
    control: &mut Transcript,
    error_fd: RawFd,
    error_read_fd: RawFd,
    child_pid: libc::pid_t,
    schedule: &ClassifiedSchedule,
    phases: &Phases,
) -> Result<Verdict, BootError> {
    sys::close_fd(error_fd);

    match wait_for_child(error_read_fd, child_pid)? {
        ChildOutcome::Errno(errno) => {
            let _ = control.note(&format!("child errno={errno}"));
            control.emit(LauncherState::SetupFaulted);
            return Ok(Verdict::Faulted);
        }
        ChildOutcome::ExecedToEof => {}
    }

    for resolved in [
        LauncherState::IdentityPhaseResolved,
        LauncherState::VisibilityPhaseResolved,
        LauncherState::AmbientAuthorityPhaseResolved,
    ] {
        control.emit(resolved);
    }

    let scheduled_actions = schedule.confinement_actions().len();
    let installed = confinement_installed(scheduled_actions, phases.confinement);
    let _ = control.note(&format!(
        "confinement={:?} installed={installed}",
        phases.confinement
    ));

    control.emit(LauncherState::ConfinementPhaseResolved);
    control.emit(LauncherState::ReadyToExec);
    control.emit(LauncherState::ExecSucceeded);
    Ok(Verdict::ExecSucceeded)
}
/// Records a refusal on the transcript — a `refusal=<reason>` note followed
/// by the `SetupRefused` state — and returns the matching verdict.
fn refuse(control: &mut Transcript, reason: RefusalReason) -> Verdict {
    let line = format!("refusal={reason:?}");
    let _ = control.note(&line);
    control.emit(LauncherState::SetupRefused);
    Verdict::Refused(reason)
}
/// Lowering entries bucketed by mechanism, as produced by `classify_schedule`.
struct ClassifiedSchedule {
/// Entries with the ambient-scrub id and the scrub phase code.
scrub: Vec<LoweringWireEntryV1>,
/// Entries with the landlock-apply id and the confine phase code.
confine: Vec<LoweringWireEntryV1>,
/// Entries with the seccomp-apply id and the confine phase code.
seccomp: Vec<LoweringWireEntryV1>,
}
/// Sorts the plan's lowering entries into scrub / landlock / seccomp buckets.
///
/// # Errors
/// `RefusalReason::MissingPrimitive` when an entry's `(id, phase_code)` pair
/// is not one of the recognised combinations, or when no exec entry is
/// present at all.
fn classify_schedule(body: &LinuxLaunchBodyV1) -> Result<ClassifiedSchedule, RefusalReason> {
    let mut schedule = ClassifiedSchedule {
        scrub: Vec::new(),
        confine: Vec::new(),
        seccomp: Vec::new(),
    };
    let mut exec_seen = false;

    for entry in &body.lowering.entries {
        let bucket = match (entry.id.as_str(), entry.phase_code) {
            // The exec entry is only checked for presence, never stored.
            (ID_EXEC, PHASE_CODE_EXEC) => {
                exec_seen = true;
                continue;
            }
            (ID_AMBIENT_SCRUB, PHASE_CODE_SCRUB) => &mut schedule.scrub,
            (ID_LANDLOCK_APPLY, PHASE_CODE_CONFINE) => &mut schedule.confine,
            (ID_SECCOMP_APPLY, PHASE_CODE_CONFINE) => &mut schedule.seccomp,
            _ => return Err(RefusalReason::MissingPrimitive),
        };
        bucket.push(entry.clone());
    }

    if exec_seen {
        Ok(schedule)
    } else {
        Err(RefusalReason::MissingPrimitive)
    }
}
impl ClassifiedSchedule {
    /// Returns a fresh vector holding clones of the landlock entries followed
    /// by clones of the seccomp entries.
    fn confinement_actions(&self) -> Vec<LoweringWireEntryV1> {
        self.confine
            .iter()
            .chain(&self.seccomp)
            .cloned()
            .collect()
    }
}
/// Hashes the canonical byte encoding of the plan's lowering section.
///
/// If canonical encoding fails, a digest of 32 `0xFF` bytes is returned
/// instead of an error.
// NOTE(review): a plan whose `h_l` is itself all `0xFF` would compare equal
// to this fallback in `verify_launch_plan` — confirm that encoding cannot
// fail for a plan that decoded successfully.
fn schedule_digest(body: &LinuxLaunchBodyV1) -> [u8; 32] {
    let Ok(bytes) = batpak::canonical::to_bytes(&body.lowering) else {
        return [0xFFu8; 32];
    };
    batpak::event::hash::compute_hash(&bytes)
}
/// Per-phase results computed by `compute_phases`.
struct Phases {
/// Always `PhaseResult::NotRequired` as computed by `compute_phases`.
identity: PhaseResult,
/// Always `PhaseResult::NotRequired` as computed by `compute_phases`.
visibility: PhaseResult,
/// `Applied` when scrub entries are scheduled, otherwise `NotRequired`.
ambient: PhaseResult,
/// `NotRequired`, `Applied` or `Refused`, depending on whether confinement
/// actions were scheduled and built.
confinement: PhaseResult,
}
/// True when every scheduled confinement mechanism was actually built: a
/// mechanism with no scheduled entries counts as satisfied.
fn confinement_actions_built(
    schedule: &ClassifiedSchedule,
    confine_built: bool,
    seccomp_built: bool,
) -> bool {
    let landlock_missing = !schedule.confine.is_empty() && !confine_built;
    let seccomp_missing = !schedule.seccomp.is_empty() && !seccomp_built;
    !(landlock_missing || seccomp_missing)
}
/// Derives the four phase results from the schedule.
///
/// Identity and visibility are always `NotRequired`. Ambient is `Applied`
/// when scrub entries exist. Confinement is `NotRequired` with nothing
/// scheduled, otherwise `Applied` or `Refused` according to
/// `confinement_built`.
fn compute_phases(schedule: &ClassifiedSchedule, confinement_built: bool) -> Phases {
    let nothing_to_confine = schedule.confinement_actions().is_empty();
    let confinement = match (nothing_to_confine, confinement_built) {
        (true, _) => PhaseResult::NotRequired,
        (false, true) => PhaseResult::Applied,
        (false, false) => PhaseResult::Refused,
    };
    let ambient = if !schedule.scrub.is_empty() {
        PhaseResult::Applied
    } else {
        PhaseResult::NotRequired
    };
    Phases {
        identity: PhaseResult::NotRequired,
        visibility: PhaseResult::NotRequired,
        ambient,
        confinement,
    }
}
/// Checks every phase result against the entries scheduled for that phase
/// using `phase_resolution_consistent`; identity and visibility are checked
/// against an empty entry list. Stops at the first inconsistent phase.
fn phases_are_honest(schedule: &ClassifiedSchedule, phases: &Phases) -> bool {
    let none: [LoweringWireEntryV1; 0] = [];
    let confinement = schedule.confinement_actions();

    if !phase_resolution_consistent(&none, &none, phases.identity) {
        return false;
    }
    if !phase_resolution_consistent(&none, &none, phases.visibility) {
        return false;
    }
    if !phase_resolution_consistent(&schedule.scrub, &schedule.scrub, phases.ambient) {
        return false;
    }
    phase_resolution_consistent(&confinement, &confinement, phases.confinement)
}
/// Reasons `build_confinement` can decline to build the landlock ruleset.
enum ConfineRefusal {
/// `sys::probe_landlock_abi()` reported less than `sys::LANDLOCK_ABI_FLOOR_RAW`.
AbiBelowFloor,
/// No read/write root in the descriptor table, the ruleset build failed,
/// or the open-fd listing failed.
NoUsableRoot,
}
/// Successful result of `build_confinement`.
struct BuiltConfinement {
/// Ruleset returned by `sys::build_landlock_ruleset`.
ruleset: sys::RulesetCreated,
/// Fds that were open after the build but not in the earlier snapshot.
ruleset_fds: Vec<RawFd>,
}
/// Builds the landlock ruleset for the plan's read/write roots.
///
/// `open_before` is the set of fds that were open before the build; any fd
/// that is open afterwards and not in that set is treated as belonging to
/// the ruleset, is passed to `sys::set_cloexec`, and is reported in
/// `ruleset_fds`.
///
/// # Errors
/// `AbiBelowFloor` when the probed landlock ABI is under the floor;
/// `NoUsableRoot` when there are no roots, the ruleset build fails, or the
/// fd listing fails.
fn build_confinement(
    body: &LinuxLaunchBodyV1,
    open_before: &BTreeSet<RawFd>,
) -> Result<BuiltConfinement, ConfineRefusal> {
    if sys::probe_landlock_abi() < sys::LANDLOCK_ABI_FLOOR_RAW {
        return Err(ConfineRefusal::AbiBelowFloor);
    }

    let roots = landlock_roots(body);
    if roots.is_empty() {
        return Err(ConfineRefusal::NoUsableRoot);
    }
    let Ok(ruleset) = sys::build_landlock_ruleset(&roots) else {
        return Err(ConfineRefusal::NoUsableRoot);
    };
    let Ok(open_after) = list_open_fds() else {
        return Err(ConfineRefusal::NoUsableRoot);
    };

    let mut ruleset_fds: Vec<RawFd> = Vec::new();
    for fd in open_after {
        if open_before.contains(&fd) {
            continue;
        }
        sys::set_cloexec(fd);
        ruleset_fds.push(fd);
    }

    Ok(BuiltConfinement {
        ruleset,
        ruleset_fds,
    })
}
/// Collects the landlock roots from the descriptor table: `ReadRoot` slots
/// become read-only roots, `WriteRoot` slots become writable roots, and
/// slots with any other role are skipped. Table order is preserved.
fn landlock_roots(body: &LinuxLaunchBodyV1) -> Vec<LandlockRoot> {
    let mut roots = Vec::new();
    for slot in body.descriptor_table.iter() {
        let writable = match slot.role {
            DescriptorRole::ReadRoot => false,
            DescriptorRole::WriteRoot => true,
            // Target exe, cgroup dir, stdio, control channel and any other
            // role never contribute a root.
            _ => continue,
        };
        roots.push(LandlockRoot {
            fd: raw(slot.slot_index),
            writable,
        });
    }
    roots
}
/// Launcher-owned fds that `allowlist` always keeps open for the child.
struct KnownFds {
/// The error fd; `spawn_child` fills this from its `error_fd` argument.
error: RawFd,
}
/// Reads a file-descriptor number from the environment variable `var`.
///
/// Surrounding whitespace is ignored. The value must be a non-negative
/// decimal integer that fits `RawFd`.
///
/// # Errors
/// `BootError::BadFdEnv` naming `var` when the variable is unset, is not
/// valid Unicode, does not parse as an integer, or is negative.
fn fd_from_env(var: &'static str) -> Result<RawFd, BootError> {
    let malformed = || BootError::BadFdEnv { var };
    let text = std::env::var(var).map_err(|_| malformed())?;
    match text.trim().parse::<RawFd>() {
        // A negative number can never name an open descriptor; rejecting it
        // here keeps it away from `sys::adopt_fd` and the close lists.
        Ok(fd) if fd >= 0 => Ok(fd),
        _ => Err(malformed()),
    }
}
fn exe_slot_fd(body: &LinuxLaunchBodyV1) -> Result<RawFd, BootError> {
Ok(raw(body.target.exe_slot))
}
/// Returns the fd of the first descriptor-table slot whose role is
/// `CgroupDir`, or `None` when the table has no such slot.
fn cgroup_slot_fd(body: &LinuxLaunchBodyV1) -> Option<RawFd> {
    body.descriptor_table.iter().find_map(|slot| {
        (slot.role == DescriptorRole::CgroupDir).then(|| raw(slot.slot_index))
    })
}
/// Converts a slot index to a `RawFd`, yielding `-1` when the index does not
/// fit.
fn raw(slot_index: u32) -> RawFd {
    match RawFd::try_from(slot_index) {
        Ok(fd) => fd,
        Err(_) => -1,
    }
}
/// Builds the set of fds the child is allowed to keep: stdio (0, 1, 2), the
/// target exe fd, the error fd, every ruleset fd and, when present, the read
/// end of the sync pipe.
fn allowlist(
    known: &KnownFds,
    exe_fd: RawFd,
    ruleset_fds: &[RawFd],
    sync_read_fd: Option<RawFd>,
) -> BTreeSet<RawFd> {
    let mut keep = BTreeSet::from([0, 1, 2, exe_fd, known.error]);
    keep.extend(ruleset_fds.iter().copied());
    keep.extend(sync_read_fd);
    keep
}
/// Lists every currently open fd that is not in `allow`, in the order
/// `list_open_fds` reports them.
///
/// # Errors
/// Propagates the error from `list_open_fds`.
fn scrub_close_list(allow: &BTreeSet<RawFd>) -> Result<Vec<libc::c_int>, BootError> {
    let open = list_open_fds()?;
    Ok(open
        .into_iter()
        .filter(|fd| !allow.contains(fd))
        .collect())
}
/// Lists the process's open fds by reading `/proc/self/fd`.
///
/// Entries whose names are not valid UTF-8 or do not parse as an fd number
/// are skipped. After the directory has been fully read, candidates for
/// which `sys::fstat_shape` fails are dropped.
///
/// # Errors
/// Any I/O error from opening or iterating the directory.
fn list_open_fds() -> Result<Vec<RawFd>, BootError> {
    let mut candidates: Vec<RawFd> = Vec::new();
    for entry in std::fs::read_dir("/proc/self/fd")? {
        let name = entry?.file_name();
        if let Some(fd) = name.to_str().and_then(|text| text.parse::<RawFd>().ok()) {
            candidates.push(fd);
        }
    }
    // The directory iterator is gone by this point; the filter must run only
    // now — presumably so the listing's own descriptor no longer passes the
    // check (confirm against `sys::fstat_shape`).
    candidates.retain(|&fd| sys::fstat_shape(fd).is_ok());
    Ok(candidates)
}
/// Same listing as `list_open_fds`, collected into an ordered set.
fn open_fd_set() -> Result<BTreeSet<RawFd>, BootError> {
    list_open_fds().map(|fds| fds.into_iter().collect())
}
/// Counts the entries of `/proc/self/task`.
///
/// # Errors
/// Any I/O error from opening the directory or reading one of its entries;
/// counting stops at the first failing entry.
fn count_self_tasks() -> Result<usize, BootError> {
    let mut tasks = std::fs::read_dir("/proc/self/task")?;
    let total = tasks.try_fold(0usize, |seen, entry| entry.map(|_| seen + 1))?;
    Ok(total)
}