use crate::audit::{audit, audit_error, AuditEventType};
use crate::container::{
ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
ServiceMode,
};
use crate::error::{NucleusError, Result, StateTransition};
use crate::filesystem::{
audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
switch_root, validate_production_rootfs_path, verify_context_manifest,
verify_rootfs_attestation, FilesystemState, LazyContextPopulator, TmpfsMount,
};
use crate::isolation::{NamespaceManager, UserNamespaceMapper};
use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
use crate::resources::Cgroup;
use crate::security::{
CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
};
use nix::sys::signal::{kill, Signal};
use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
use nix::sys::stat::Mode;
use nix::sys::wait::{waitpid, WaitStatus};
use nix::unistd::{
chown, fork, pipe, read, setresgid, setresuid, write, ForkResult, Gid, Pid, Uid,
};
use std::os::fd::OwnedFd;
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread::JoinHandle;
use tempfile::Builder;
use tracing::{debug, error, info, info_span, warn};
pub struct Container {
pub(super) config: ContainerConfig,
pub(super) runsc_path: Option<String>,
}
pub struct CreatedContainer {
pub(super) config: ContainerConfig,
pub(super) state_mgr: ContainerStateManager,
pub(super) state: ContainerState,
pub(super) child: Pid,
pub(super) cgroup_opt: Option<Cgroup>,
pub(super) network_driver: Option<BridgeDriver>,
pub(super) trace_reader: Option<SeccompTraceReader>,
pub(super) deny_logger: Option<SeccompDenyLogger>,
pub(super) exec_fifo_path: Option<PathBuf>,
pub(super) _lifecycle_span: tracing::Span,
}
impl Container {
pub fn new(config: ContainerConfig) -> Self {
Self {
config,
runsc_path: None,
}
}
pub fn run(&self) -> Result<i32> {
self.create_internal(false)?.start()
}
pub fn create(&self) -> Result<CreatedContainer> {
self.create_internal(true)
}
fn sanitize_fds() {
const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
let ret =
unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
if ret == 0 {
return;
}
if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
let fds: Vec<i32> = entries
.flatten()
.filter_map(|entry| entry.file_name().into_string().ok())
.filter_map(|s| s.parse::<i32>().ok())
.filter(|&fd| fd > 2)
.collect();
for fd in fds {
unsafe { libc::close(fd) };
}
}
}
pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
let thread_count = std::fs::read_to_string("/proc/self/status")
.ok()
.and_then(|s| {
s.lines()
.find(|line| line.starts_with("Threads:"))
.and_then(|line| line.split_whitespace().nth(1))
.and_then(|count| count.parse::<u32>().ok())
});
if thread_count == Some(1) {
return Ok(());
}
Err(NucleusError::ExecError(format!(
"{} requires a single-threaded process before fork, found {:?} threads",
context, thread_count
)))
}
fn prepare_runtime_base_override(
config: &ContainerConfig,
host_is_root: bool,
needs_external_userns_mapping: bool,
) -> Result<Option<PathBuf>> {
if !needs_external_userns_mapping {
return Ok(None);
}
if !host_is_root {
return Ok(Some(
dirs::runtime_dir()
.map(|d| d.join("nucleus"))
.unwrap_or_else(std::env::temp_dir),
));
}
let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
NucleusError::ExecError("Missing user namespace configuration".to_string())
})?;
let host_uid =
Self::mapped_host_id_for_container_id(&user_config.uid_mappings, 0, "uid mappings")?;
let host_gid =
Self::mapped_host_id_for_container_id(&user_config.gid_mappings, 0, "gid mappings")?;
let root = PathBuf::from("/run/nucleus");
Self::ensure_runtime_parent_dir(&root)?;
let runtime_root = root.join("runtime");
Self::ensure_runtime_parent_dir(&runtime_root)?;
let base = runtime_root.join(&config.id);
std::fs::create_dir_all(&base).map_err(|e| {
NucleusError::FilesystemError(format!(
"Failed to create user namespace runtime base {:?}: {}",
base, e
))
})?;
chown(
&base,
Some(Uid::from_raw(host_uid)),
Some(Gid::from_raw(host_gid)),
)
.map_err(|e| {
NucleusError::FilesystemError(format!(
"Failed to chown user namespace runtime base {:?} to {}:{}: {}",
base, host_uid, host_gid, e
))
})?;
std::fs::set_permissions(&base, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
NucleusError::FilesystemError(format!(
"Failed to secure user namespace runtime base {:?}: {}",
base, e
))
})?;
Ok(Some(base))
}
fn ensure_runtime_parent_dir(path: &std::path::Path) -> Result<()> {
std::fs::create_dir_all(path).map_err(|e| {
NucleusError::FilesystemError(format!(
"Failed to create runtime parent dir {:?}: {}",
path, e
))
})?;
std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o711)).map_err(|e| {
NucleusError::FilesystemError(format!(
"Failed to secure runtime parent dir {:?}: {}",
path, e
))
})?;
Ok(())
}
fn mapped_host_id_for_container_id(
mappings: &[crate::isolation::IdMapping],
container_id: u32,
label: &str,
) -> Result<u32> {
for mapping in mappings {
let end = mapping
.container_id
.checked_add(mapping.count)
.ok_or_else(|| {
NucleusError::ConfigError(format!(
"{} overflow for container id {}",
label, container_id
))
})?;
if container_id >= mapping.container_id && container_id < end {
return mapping
.host_id
.checked_add(container_id - mapping.container_id)
.ok_or_else(|| {
NucleusError::ConfigError(format!(
"{} host id overflow for container id {}",
label, container_id
))
});
}
}
Err(NucleusError::ConfigError(format!(
"{} do not map container id {}",
label, container_id
)))
}
fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
let lifecycle_span = info_span!(
"container.lifecycle",
container.id = %self.config.id,
container.name = %self.config.name,
runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
);
let _enter = lifecycle_span.enter();
info!(
"Creating container: {} (ID: {})",
self.config.name, self.config.id
);
audit(
&self.config.id,
&self.config.name,
AuditEventType::ContainerStart,
format!(
"command={:?} mode={:?} runtime={}",
crate::audit::redact_command(&self.config.command),
self.config.service_mode,
if self.config.use_gvisor {
"gvisor"
} else {
"native"
}
),
);
let is_root = nix::unistd::Uid::effective().is_root();
let mut config = self.config.clone();
if !is_root && config.user_ns_config.is_none() {
info!("Not running as root, automatically enabling rootless mode");
config.namespaces.user = true;
config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
}
if is_root && !config.namespaces.user {
if config.service_mode == ServiceMode::Production {
info!("Running as root in production mode: enabling user namespace with UID remapping");
config.namespaces.user = true;
config.user_ns_config =
Some(crate::isolation::UserNamespaceConfig::root_remapped());
} else {
warn!(
"Running as root WITHOUT user namespace isolation. \
Container processes will run as real host UID 0. \
Use --user-ns or production mode for UID remapping."
);
}
}
if let Some(ref socket_path) = config.console_socket {
warn!(
"Console socket {} accepted but terminal forwarding is not yet implemented",
socket_path.display()
);
}
config.validate_production_mode()?;
if config.service_mode == ServiceMode::Production {
let rootfs_path = config.rootfs_path.as_ref().ok_or_else(|| {
NucleusError::ConfigError(
"Production mode requires explicit --rootfs path (no host bind mounts)"
.to_string(),
)
})?;
config.rootfs_path = Some(validate_production_rootfs_path(rootfs_path)?);
}
Self::assert_kernel_lockdown(&config)?;
Self::apply_network_mode_guards(&mut config, is_root)?;
Self::apply_trust_level_guards(&mut config)?;
config.validate_runtime_support()?;
if let NetworkMode::Bridge(ref bridge_config) = config.network {
let backend =
bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
if backend == NatBackend::Kernel && !is_root {
return Err(NucleusError::NetworkError(
"Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
.to_string(),
));
}
}
let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;
if let Ok(all_states) = state_mgr.list_states() {
if all_states.iter().any(|s| s.name == config.name) {
return Err(NucleusError::ConfigError(format!(
"A container named '{}' already exists; use a different --name, \
or remove the stale state with 'nucleus delete'",
config.name
)));
}
}
let exec_fifo = if defer_exec_until_start {
let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
NucleusError::ExecError(format!(
"Failed to create exec FIFO {:?}: {}",
exec_fifo, e
))
})?;
Some(exec_fifo)
} else {
None
};
let cgroup_name = format!("nucleus-{}", config.id);
let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
Ok(mut cgroup) => {
match cgroup.set_limits(&config.limits) {
Ok(_) => {
info!("Created cgroup with resource limits");
Some(cgroup)
}
Err(e) => {
if config.service_mode == ServiceMode::Production {
let _ = cgroup.cleanup();
return Err(NucleusError::CgroupError(format!(
"Production mode requires cgroup resource enforcement, but \
applying limits failed: {}",
e
)));
}
warn!("Failed to set cgroup limits: {}", e);
let _ = cgroup.cleanup();
None
}
}
}
Err(e) => {
if config.service_mode == ServiceMode::Production {
return Err(NucleusError::CgroupError(format!(
"Production mode requires cgroup resource enforcement, but \
cgroup creation failed: {}",
e
)));
}
if config.user_ns_config.is_some() {
if config.limits.memory_bytes.is_some()
|| config.limits.cpu_quota_us.is_some()
|| config.limits.pids_max.is_some()
{
warn!(
"Running in rootless mode: requested resource limits cannot be \
enforced – cgroup creation requires root ({})",
e
);
} else {
debug!("Running in rootless mode without cgroup resource limits");
}
} else {
warn!(
"Failed to create cgroup (running without resource limits): {}",
e
);
}
None
}
};
let runsc_path = if config.use_gvisor {
Some(GVisorRuntime::resolve_path().map_err(|e| {
NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
})?)
} else {
None
};
let gvisor_bridge_needs_userns_mapping = config.use_gvisor
&& !is_root
&& config.user_ns_config.is_some()
&& matches!(config.network, NetworkMode::Bridge(_));
let needs_external_userns_mapping = config.user_ns_config.is_some()
&& (!config.use_gvisor || gvisor_bridge_needs_userns_mapping);
let runtime_base_override =
Self::prepare_runtime_base_override(&config, is_root, needs_external_userns_mapping)?;
let (ready_read, ready_write) = pipe().map_err(|e| {
NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
})?;
let userns_sync = if needs_external_userns_mapping {
let (request_read, request_write) = pipe().map_err(|e| {
NucleusError::ExecError(format!(
"Failed to create user namespace request pipe: {}",
e
))
})?;
let (ack_read, ack_write) = pipe().map_err(|e| {
NucleusError::ExecError(format!("Failed to create user namespace ack pipe: {}", e))
})?;
Some((request_read, request_write, ack_read, ack_write))
} else {
None
};
let (parent_setup_read, parent_setup_write) = pipe().map_err(|e| {
NucleusError::ExecError(format!("Failed to create parent setup sync pipe: {}", e))
})?;
Self::assert_single_threaded_for_fork("container create fork")?;
match unsafe { fork() }? {
ForkResult::Parent { child } => {
drop(ready_write);
drop(parent_setup_read);
let (userns_request_read, userns_ack_write) =
if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
drop(request_write);
drop(ack_read);
(Some(request_read), Some(ack_write))
} else {
(None, None)
};
info!("Forked child process: {}", child);
let mut target_pid_for_cleanup: Option<u32> = None;
let parent_setup = || -> Result<CreatedContainer> {
if needs_external_userns_mapping {
let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace configuration in parent".to_string(),
)
})?;
let request_read = userns_request_read.as_ref().ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace request pipe in parent".to_string(),
)
})?;
let ack_write = userns_ack_write.as_ref().ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace ack pipe in parent".to_string(),
)
})?;
Self::wait_for_sync_byte(
request_read,
&format!(
"Child {} exited before requesting user namespace mappings",
child
),
"Failed waiting for child user namespace request",
)?;
UserNamespaceMapper::new(user_config.clone())
.write_mappings_for_pid(child.as_raw() as u32)?;
Self::send_sync_byte(
ack_write,
"Failed to notify child that user namespace mappings are ready",
)?;
}
let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
target_pid_for_cleanup = Some(target_pid);
let cgroup_path = cgroup_opt
.as_ref()
.map(|cgroup| cgroup.path().display().to_string());
let cpu_millicores = config
.limits
.cpu_quota_us
.map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
let mut state = ContainerState::new(ContainerStateParams {
id: config.id.clone(),
name: config.name.clone(),
pid: target_pid,
command: config.command.clone(),
memory_limit: config.limits.memory_bytes,
cpu_limit: cpu_millicores,
using_gvisor: config.use_gvisor,
rootless: config.user_ns_config.is_some(),
cgroup_path,
process_uid: config.process_identity.uid,
process_gid: config.process_identity.gid,
additional_gids: config.process_identity.additional_gids.clone(),
});
state.config_hash = config.config_hash;
state.bundle_path =
config.rootfs_path.as_ref().map(|p| p.display().to_string());
let mut network_driver: Option<BridgeDriver> = None;
let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
state.status = OciStatus::Created;
state_mgr.save_state(&state)?;
if let Some(ref pid_path) = config.pid_file {
std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
NucleusError::ConfigError(format!(
"Failed to write pid-file '{}': {}",
pid_path.display(),
e
))
})?;
info!("Wrote PID {} to {}", target_pid, pid_path.display());
}
if let Some(ref mut cgroup) = cgroup_opt {
cgroup.attach_process(target_pid)?;
}
let deny_logger = Self::maybe_start_seccomp_deny_logger(
&config,
target_pid,
cgroup_opt.as_ref().map(|cgroup| cgroup.path()),
)?;
if let NetworkMode::Bridge(ref bridge_config) = config.network {
match BridgeDriver::setup_with_id(
target_pid,
bridge_config,
&config.id,
is_root,
config.user_ns_config.is_some(),
) {
Ok(net) => {
if let Some(ref egress) = config.egress_policy {
if let Err(e) = net.apply_egress_policy(
target_pid,
egress,
config.user_ns_config.is_some(),
) {
if config.service_mode == ServiceMode::Production {
return Err(NucleusError::NetworkError(format!(
"Failed to apply egress policy: {}",
e
)));
}
warn!("Failed to apply egress policy: {}", e);
}
}
network_driver = Some(net);
}
Err(e) => {
if config.service_mode == ServiceMode::Production {
return Err(e);
}
warn!("Failed to set up bridge networking: {}", e);
}
}
}
Self::send_sync_byte(
&parent_setup_write,
"Failed to notify child that parent setup is complete",
)?;
info!(
"Container {} created (child pid {}), waiting for start",
config.id, target_pid
);
Ok(CreatedContainer {
config,
state_mgr,
state,
child,
cgroup_opt,
network_driver,
trace_reader,
deny_logger,
exec_fifo_path: exec_fifo,
_lifecycle_span: lifecycle_span.clone(),
})
};
parent_setup().map_err(|e| {
if let Some(target_pid) = target_pid_for_cleanup {
let _ = kill(Pid::from_raw(target_pid as i32), Signal::SIGKILL);
}
let _ = kill(child, Signal::SIGKILL);
let _ = waitpid(child, None);
e
})
}
ForkResult::Child => {
drop(ready_read);
drop(parent_setup_write);
let (userns_request_write, userns_ack_read) =
if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
drop(request_read);
drop(ack_write);
(Some(request_write), Some(ack_read))
} else {
(None, None)
};
Self::sanitize_fds();
let temp_container = Container { config, runsc_path };
match temp_container.setup_and_exec(
Some(ready_write),
userns_request_write,
userns_ack_read,
Some(parent_setup_read),
exec_fifo,
runtime_base_override,
) {
Ok(_) => unreachable!(),
Err(e) => {
error!("Container setup failed: {}", e);
std::process::exit(1);
}
}
}
}
}
pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
let state_mgr = ContainerStateManager::new_with_root(state_root)?;
let fifo_path = state_mgr.exec_fifo_path(container_id)?;
if !fifo_path.exists() {
return Err(NucleusError::ConfigError(format!(
"No exec FIFO found for container {}; is it in 'created' state?",
container_id
)));
}
let file = std::fs::File::open(&fifo_path)
.map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
let mut buf = [0u8; 1];
std::io::Read::read(&mut &file, &mut buf)
.map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
drop(file);
let _ = std::fs::remove_file(&fifo_path);
let mut state = state_mgr.resolve_container(container_id)?;
state.status = OciStatus::Running;
state_mgr.save_state(&state)?;
Ok(())
}
fn setup_and_exec(
&self,
ready_pipe: Option<OwnedFd>,
userns_request_pipe: Option<OwnedFd>,
userns_ack_pipe: Option<OwnedFd>,
parent_setup_pipe: Option<OwnedFd>,
exec_fifo: Option<PathBuf>,
runtime_base_override: Option<PathBuf>,
) -> Result<()> {
let is_rootless = self.config.user_ns_config.is_some();
let allow_degraded_security = Self::allow_degraded_security(&self.config);
let context_manifest = if self.config.verify_context_integrity {
self.config
.context_dir
.as_ref()
.map(|dir| snapshot_context_dir(dir))
.transpose()?
} else {
None
};
let mut fs_state = FilesystemState::Unmounted;
let mut sec_state = SecurityState::Privileged;
if self.config.use_gvisor {
let gvisor_bridge_precreated_userns =
if matches!(self.config.network, NetworkMode::Bridge(_)) {
self.prepare_gvisor_bridge_namespace(
userns_request_pipe.as_ref(),
userns_ack_pipe.as_ref(),
)?
} else {
false
};
if let Some(fd) = ready_pipe {
Self::notify_namespace_ready(&fd, std::process::id())?;
}
if let Some(fd) = parent_setup_pipe.as_ref() {
Self::wait_for_sync_byte(
fd,
"Parent closed setup pipe before signalling gVisor child",
"Failed waiting for parent setup acknowledgement",
)?;
}
return self.setup_and_exec_gvisor(gvisor_bridge_precreated_userns);
}
let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
namespace_mgr.unshare_namespaces()?;
if self.config.user_ns_config.is_some() {
let request_fd = userns_request_pipe.as_ref().ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace request pipe in container child".to_string(),
)
})?;
let ack_fd = userns_ack_pipe.as_ref().ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace acknowledgement pipe in container child".to_string(),
)
})?;
Self::send_sync_byte(
request_fd,
"Failed to request user namespace mappings from parent",
)?;
Self::wait_for_sync_byte(
ack_fd,
"Parent closed user namespace ack pipe before mappings were written",
"Failed waiting for parent to finish user namespace mappings",
)?;
Self::become_userns_root_for_setup()?;
}
if self.config.namespaces.pid {
Self::assert_single_threaded_for_fork("PID namespace init fork")?;
match unsafe { fork() }? {
ForkResult::Parent { child } => {
if let Some(fd) = ready_pipe {
Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
}
std::process::exit(Self::wait_for_pid_namespace_child(child));
}
ForkResult::Child => {
if let Some(fd) = parent_setup_pipe.as_ref() {
Self::wait_for_sync_byte(
fd,
"Parent closed setup pipe before signalling PID 1 child",
"Failed waiting for parent setup acknowledgement",
)?;
}
}
}
} else {
if let Some(fd) = ready_pipe {
Self::notify_namespace_ready(&fd, std::process::id())?;
}
if let Some(fd) = parent_setup_pipe.as_ref() {
Self::wait_for_sync_byte(
fd,
"Parent closed setup pipe before signalling container child",
"Failed waiting for parent setup acknowledgement",
)?;
}
}
namespace_mgr.enter()?;
self.enforce_no_new_privs()?;
audit(
&self.config.id,
&self.config.name,
AuditEventType::NoNewPrivsSet,
"prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
);
if let Some(hostname) = &self.config.hostname {
namespace_mgr.set_hostname(hostname)?;
}
let runtime_base = if let Some(path) = runtime_base_override {
path
} else if nix::unistd::Uid::effective().is_root() {
PathBuf::from("/run/nucleus")
} else {
dirs::runtime_dir()
.map(|d| d.join("nucleus"))
.unwrap_or_else(std::env::temp_dir)
};
let _ = std::fs::create_dir_all(&runtime_base);
let runtime_dir = Builder::new()
.prefix("nucleus-runtime-")
.tempdir_in(&runtime_base)
.map_err(|e| {
NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
})?;
let container_root = runtime_dir.path().to_path_buf();
let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); tmpfs.mount()?;
fs_state = fs_state.transition(FilesystemState::Mounted)?;
create_minimal_fs(&container_root)?;
let dev_path = container_root.join("dev");
create_dev_nodes(&dev_path, false)?;
let shm_path = dev_path.join("shm");
std::fs::create_dir_all(&shm_path).map_err(|e| {
NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
})?;
nix::mount::mount(
Some("shm"),
&shm_path,
Some("tmpfs"),
nix::mount::MsFlags::MS_NOSUID
| nix::mount::MsFlags::MS_NODEV
| nix::mount::MsFlags::MS_NOEXEC,
Some("mode=1777,size=64m"),
)
.map_err(|e| {
NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
})?;
debug!("Mounted tmpfs on /dev/shm");
if let Some(context_dir) = &self.config.context_dir {
let context_dest = container_root.join("context");
LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
if let Some(expected) = &context_manifest {
verify_context_manifest(expected, &context_dest)?;
}
}
fs_state = fs_state.transition(FilesystemState::Populated)?;
if let Some(ref rootfs_path) = self.config.rootfs_path {
let rootfs_path = if self.config.service_mode == ServiceMode::Production {
validate_production_rootfs_path(rootfs_path)?
} else {
rootfs_path.clone()
};
if self.config.verify_rootfs_attestation {
verify_rootfs_attestation(&rootfs_path)?;
}
bind_mount_rootfs(&container_root, &rootfs_path)?;
} else {
bind_mount_host_paths(&container_root, is_rootless)?;
}
mount_volumes(&container_root, &self.config.volumes)?;
if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
== NatBackend::Userspace
&& bridge_config.dns.is_empty()
{
vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
} else {
bridge_config.dns.clone()
};
if self.config.rootfs_path.is_some() {
BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
} else {
BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
}
}
mount_secrets_inmemory(
&container_root,
&self.config.secrets,
&self.config.process_identity,
)?;
let proc_path = container_root.join("proc");
let production_mode = self.config.service_mode == ServiceMode::Production;
let hide_pids = production_mode;
let procfs_best_effort = is_rootless && !production_mode;
mount_procfs(
&proc_path,
procfs_best_effort,
self.config.proc_readonly,
hide_pids,
)?;
mask_proc_paths(
&proc_path,
self.config.service_mode == ServiceMode::Production,
)?;
if let Some(ref hooks) = self.config.hooks {
if !hooks.create_runtime.is_empty() {
let hook_state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: self.config.id.clone(),
status: OciStatus::Creating,
pid: std::process::id(),
bundle: String::new(),
};
OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
}
}
switch_root(&container_root, self.config.allow_chroot_fallback)?;
fs_state = fs_state.transition(FilesystemState::Pivoted)?;
debug!("Filesystem state: {:?}", fs_state);
audit_mounts(self.config.service_mode == ServiceMode::Production)?;
audit(
&self.config.id,
&self.config.name,
AuditEventType::MountAuditPassed,
"all mount flags verified",
);
if let Some(ref hooks) = self.config.hooks {
if !hooks.create_container.is_empty() {
let hook_state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: self.config.id.clone(),
status: OciStatus::Created,
pid: std::process::id(),
bundle: String::new(),
};
OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
}
}
let mut cap_mgr = CapabilityManager::new();
if let Some(ref policy_path) = self.config.caps_policy {
let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
policy_path,
self.config.caps_policy_sha256.as_deref(),
)?;
if self.config.service_mode == ServiceMode::Production {
policy.validate_production()?;
}
policy.apply(&mut cap_mgr)?;
Self::apply_process_identity_to_current_process(
&self.config.process_identity,
self.config.user_ns_config.is_some(),
)?;
audit(
&self.config.id,
&self.config.name,
AuditEventType::CapabilitiesDropped,
format!("capability policy applied from {:?}", policy_path),
);
} else {
cap_mgr.drop_bounding_set()?;
Self::apply_process_identity_to_current_process(
&self.config.process_identity,
self.config.user_ns_config.is_some(),
)?;
cap_mgr.finalize_drop()?;
audit(
&self.config.id,
&self.config.name,
AuditEventType::CapabilitiesDropped,
"all capabilities dropped including bounding set",
);
}
sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
{
let is_production = self.config.service_mode == ServiceMode::Production;
if let Some(nproc_limit) = self.config.limits.pids_max {
let rlim_nproc = libc::rlimit {
rlim_cur: nproc_limit,
rlim_max: nproc_limit,
};
if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
let err = std::io::Error::last_os_error();
if is_production {
return Err(NucleusError::SeccompError(format!(
"Failed to set RLIMIT_NPROC to {} in production mode: {}",
nproc_limit, err
)));
}
warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
}
}
let rlim_nofile = libc::rlimit {
rlim_cur: 1024,
rlim_max: 1024,
};
if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
let err = std::io::Error::last_os_error();
if is_production {
return Err(NucleusError::SeccompError(format!(
"Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
err
)));
}
warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
}
let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
let rlim_memlock = libc::rlimit {
rlim_cur: memlock_limit,
rlim_max: memlock_limit,
};
if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
let err = std::io::Error::last_os_error();
if is_production {
return Err(NucleusError::SeccompError(format!(
"Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
memlock_limit, err
)));
}
warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
}
}
CapabilityManager::verify_no_namespace_caps(
self.config.service_mode == ServiceMode::Production,
)?;
use crate::container::config::SeccompMode;
let mut seccomp_mgr = SeccompManager::new();
let allow_network = !matches!(self.config.network, NetworkMode::None);
let seccomp_applied = match self.config.seccomp_mode {
SeccompMode::Trace => {
audit(
&self.config.id,
&self.config.name,
AuditEventType::SeccompApplied,
"seccomp trace mode: allow-all + LOG",
);
seccomp_mgr.apply_trace_filter()?
}
SeccompMode::Enforce => {
if let Some(ref profile_path) = self.config.seccomp_profile {
audit(
&self.config.id,
&self.config.name,
AuditEventType::SeccompProfileLoaded,
format!("path={:?}", profile_path),
);
seccomp_mgr.apply_profile_from_file(
profile_path,
self.config.seccomp_profile_sha256.as_deref(),
self.config.seccomp_log_denied,
)?
} else {
seccomp_mgr.apply_filter_for_network_mode(
allow_network,
allow_degraded_security,
self.config.seccomp_log_denied,
&self.config.seccomp_allow_syscalls,
)?
}
}
};
if seccomp_applied {
sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
audit(
&self.config.id,
&self.config.name,
AuditEventType::SeccompApplied,
format!("network={}", allow_network),
);
} else if !allow_degraded_security {
return Err(NucleusError::SeccompError(
"Seccomp filter is required but was not enforced".to_string(),
));
} else {
warn!("Seccomp not enforced; container is running with degraded hardening");
}
let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
policy_path,
self.config.landlock_policy_sha256.as_deref(),
)?;
if self.config.service_mode == ServiceMode::Production {
policy.validate_production()?;
}
policy.apply(allow_degraded_security)?
} else {
let mut landlock_mgr = LandlockManager::new();
landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
for vol in &self.config.volumes {
landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
}
landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
};
if seccomp_applied && landlock_applied {
sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
if self.config.seccomp_mode == SeccompMode::Trace {
warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
} else {
sec_state = sec_state.transition(SecurityState::Locked)?;
}
audit(
&self.config.id,
&self.config.name,
AuditEventType::LandlockApplied,
if self.config.seccomp_mode == SeccompMode::Trace {
"landlock applied, but seccomp in trace mode – not locked".to_string()
} else {
"security state locked: all hardening layers active".to_string()
},
);
} else if !allow_degraded_security {
return Err(NucleusError::LandlockError(
"Landlock policy is required but was not enforced".to_string(),
));
} else {
warn!("Security state not locked; one or more hardening controls are inactive");
}
debug!("Security state: {:?}", sec_state);
if let Some(ref fifo_path) = exec_fifo {
debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
let file = std::fs::OpenOptions::new()
.write(true)
.open(fifo_path)
.map_err(|e| {
NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
})?;
std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
})?;
drop(file);
debug!("Exec FIFO released, proceeding to exec");
}
if let Some(ref hooks) = self.config.hooks {
if !hooks.start_container.is_empty() {
let hook_state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: self.config.id.clone(),
status: OciStatus::Running,
pid: std::process::id(),
bundle: String::new(),
};
OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
}
}
if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
return self.run_as_init();
}
self.exec_command()?;
Ok(())
}
pub(super) fn setup_signal_forwarding_static(
child: Pid,
) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
let mut set = SigSet::empty();
for signal in [
Signal::SIGTERM,
Signal::SIGINT,
Signal::SIGHUP,
Signal::SIGQUIT,
Signal::SIGUSR1,
Signal::SIGUSR2,
] {
set.add(signal);
}
let unblock_set = set;
pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
})?;
let stop = Arc::new(AtomicBool::new(false));
let stop_clone = stop.clone();
let handle = std::thread::Builder::new()
.name("sig-forward".to_string())
.spawn(move || {
loop {
if let Ok(signal) = unblock_set.wait() {
if stop_clone.load(Ordering::Acquire) {
break;
}
let _ = kill(child, signal);
}
}
})
.map_err(|e| {
let mut restore = SigSet::empty();
for signal in [
Signal::SIGTERM,
Signal::SIGINT,
Signal::SIGHUP,
Signal::SIGQUIT,
Signal::SIGUSR1,
Signal::SIGUSR2,
] {
restore.add(signal);
}
let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
})?;
info!("Signal forwarding configured");
Ok((stop, handle))
}
pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
loop {
match waitpid(child, None) {
Ok(WaitStatus::Exited(_, code)) => {
return Ok(code);
}
Ok(WaitStatus::Signaled(_, signal, _)) => {
info!("Child killed by signal: {:?}", signal);
return Ok(128 + signal as i32);
}
Err(nix::errno::Errno::EINTR) => {
continue;
}
Err(e) => {
return Err(NucleusError::ExecError(format!(
"Failed to wait for child: {}",
e
)));
}
_ => {
continue;
}
}
}
}
fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
let mut pid_buf = [0u8; 4];
loop {
match read(ready_read, &mut pid_buf) {
Err(nix::errno::Errno::EINTR) => continue,
Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
Ok(0) => {
return Err(NucleusError::ExecError(format!(
"Child {} exited before namespace initialization",
child
)))
}
Ok(_) => {
return Err(NucleusError::ExecError(
"Invalid namespace sync payload from child".to_string(),
))
}
Err(e) => {
return Err(NucleusError::ExecError(format!(
"Failed waiting for child namespace setup: {}",
e
)))
}
}
}
}
fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
let payload = pid.to_ne_bytes();
let mut written = 0;
while written < payload.len() {
let n = write(fd, &payload[written..]).map_err(|e| {
NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
})?;
if n == 0 {
return Err(NucleusError::ExecError(
"Failed to notify namespace readiness: short write".to_string(),
));
}
written += n;
}
Ok(())
}
fn send_sync_byte(fd: &OwnedFd, error_context: &str) -> Result<()> {
let mut written = 0;
let payload = [1u8];
while written < payload.len() {
let n = write(fd, &payload[written..])
.map_err(|e| NucleusError::ExecError(format!("{}: {}", error_context, e)))?;
if n == 0 {
return Err(NucleusError::ExecError(format!(
"{}: short write",
error_context
)));
}
written += n;
}
Ok(())
}
fn wait_for_sync_byte(fd: &OwnedFd, eof_context: &str, error_context: &str) -> Result<()> {
let mut payload = [0u8; 1];
loop {
match read(fd, &mut payload) {
Err(nix::errno::Errno::EINTR) => continue,
Ok(1) => return Ok(()),
Ok(0) => return Err(NucleusError::ExecError(eof_context.to_string())),
Ok(_) => {
return Err(NucleusError::ExecError(format!(
"{}: invalid sync payload",
error_context
)))
}
Err(e) => return Err(NucleusError::ExecError(format!("{}: {}", error_context, e))),
}
}
}
fn become_userns_root_for_setup() -> Result<()> {
setresgid(Gid::from_raw(0), Gid::from_raw(0), Gid::from_raw(0)).map_err(|e| {
NucleusError::NamespaceError(format!(
"Failed to become gid 0 inside mapped user namespace: {}",
e
))
})?;
setresuid(Uid::from_raw(0), Uid::from_raw(0), Uid::from_raw(0)).map_err(|e| {
NucleusError::NamespaceError(format!(
"Failed to become uid 0 inside mapped user namespace: {}",
e
))
})?;
debug!("Switched setup process to uid/gid 0 inside mapped user namespace");
Ok(())
}
fn prepare_gvisor_bridge_namespace(
&self,
userns_request_pipe: Option<&OwnedFd>,
userns_ack_pipe: Option<&OwnedFd>,
) -> Result<bool> {
let mut precreated_userns = false;
if self.config.user_ns_config.is_some() && !Uid::effective().is_root() {
nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWUSER).map_err(|e| {
NucleusError::NamespaceError(format!(
"Failed to unshare gVisor bridge user namespace: {}",
e
))
})?;
let request_fd = userns_request_pipe.ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace request pipe in gVisor bridge child".to_string(),
)
})?;
let ack_fd = userns_ack_pipe.ok_or_else(|| {
NucleusError::ExecError(
"Missing user namespace acknowledgement pipe in gVisor bridge child"
.to_string(),
)
})?;
Self::send_sync_byte(
request_fd,
"Failed to request gVisor bridge user namespace mappings from parent",
)?;
Self::wait_for_sync_byte(
ack_fd,
"Parent closed user namespace ack pipe before gVisor bridge mappings were written",
"Failed waiting for parent to finish gVisor bridge user namespace mappings",
)?;
Self::become_userns_root_for_setup()?;
precreated_userns = true;
}
nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNET).map_err(|e| {
NucleusError::NamespaceError(format!(
"Failed to unshare gVisor bridge network namespace: {}",
e
))
})?;
Ok(precreated_userns)
}
fn wait_for_pid_namespace_child(child: Pid) -> i32 {
loop {
match waitpid(child, None) {
Ok(WaitStatus::Exited(_, code)) => return code,
Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
Err(nix::errno::Errno::EINTR) => continue,
Err(_) => return 1,
_ => continue,
}
}
}
}
impl CreatedContainer {
pub fn start(mut self) -> Result<i32> {
let config = &self.config;
let _enter = self._lifecycle_span.enter();
if let Some(exec_fifo_path) = &self.exec_fifo_path {
let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
})?;
let mut buf = [0u8; 1];
let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
})?;
if read != 1 {
return Err(NucleusError::ExecError(
"Exec FIFO closed before start signal was delivered".to_string(),
));
}
let _ = std::fs::remove_file(exec_fifo_path);
}
self.state.status = OciStatus::Running;
self.state_mgr.save_state(&self.state)?;
let target_pid = self.state.pid;
let child = self.child;
let (sig_stop, sig_handle) =
Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
let mut sig_guard = SignalThreadGuard {
stop: Some(sig_stop),
handle: Some(sig_handle),
};
if let Some(ref probe) = config.readiness_probe {
let notify_socket = if config.sd_notify {
std::env::var("NOTIFY_SOCKET").ok()
} else {
None
};
Container::run_readiness_probe(
target_pid,
&config.name,
probe,
config.user_ns_config.is_some(),
config.use_gvisor,
&config.process_identity,
notify_socket.as_deref(),
)?;
}
let cancel_flag = Arc::new(AtomicBool::new(false));
let health_handle = if let Some(ref hc) = config.health_check {
if !hc.command.is_empty() {
let hc = hc.clone();
let pid = target_pid;
let container_name = config.name.clone();
let rootless = config.user_ns_config.is_some();
let using_gvisor = config.use_gvisor;
let process_identity = config.process_identity.clone();
let cancel = cancel_flag.clone();
Some(std::thread::spawn(move || {
Container::health_check_loop(
pid,
&container_name,
rootless,
using_gvisor,
&hc,
&process_identity,
&cancel,
);
}))
} else {
None
}
} else {
None
};
let mut health_guard = HealthThreadGuard {
cancel: Some(cancel_flag),
handle: health_handle,
};
if let Some(ref hooks) = config.hooks {
if !hooks.poststart.is_empty() {
let hook_state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: config.id.clone(),
status: OciStatus::Running,
pid: target_pid,
bundle: String::new(),
};
OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
}
}
let mut child_waited = false;
let run_result: Result<i32> = (|| {
let exit_code = Container::wait_for_child_static(child)?;
self.state.status = OciStatus::Stopped;
let _ = self.state_mgr.save_state(&self.state);
child_waited = true;
Ok(exit_code)
})();
health_guard.stop();
sig_guard.stop();
if let Some(ref hooks) = config.hooks {
if !hooks.poststop.is_empty() {
let hook_state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: config.id.clone(),
status: OciStatus::Stopped,
pid: 0,
bundle: String::new(),
};
OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
}
}
if let Some(net) = self.network_driver.take() {
if let Err(e) = net.cleanup() {
warn!("Failed to cleanup container networking: {}", e);
}
}
if !child_waited {
let _ = kill(child, Signal::SIGKILL);
let _ = waitpid(child, None);
}
if let Some(reader) = self.trace_reader.take() {
reader.stop_and_flush();
}
if let Some(logger) = self.deny_logger.take() {
logger.stop();
}
if let Some(cgroup) = self.cgroup_opt.take() {
if let Err(e) = cgroup.cleanup() {
warn!("Failed to cleanup cgroup: {}", e);
}
}
if config.use_gvisor {
if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
warn!(
"Failed to cleanup gVisor artifacts for {}: {}",
config.id, e
);
}
}
if let Err(e) = self.state_mgr.delete_state(&config.id) {
warn!("Failed to delete state for {}: {}", config.id, e);
}
match run_result {
Ok(exit_code) => {
audit(
&config.id,
&config.name,
AuditEventType::ContainerStop,
format!("exit_code={}", exit_code),
);
info!(
"Container {} ({}) exited with code {}",
config.name, config.id, exit_code
);
Ok(exit_code)
}
Err(e) => {
audit_error(
&config.id,
&config.name,
AuditEventType::ContainerStop,
format!("error={}", e),
);
Err(e)
}
}
}
}
struct SignalThreadGuard {
stop: Option<Arc<AtomicBool>>,
handle: Option<JoinHandle<()>>,
}
impl SignalThreadGuard {
fn stop(&mut self) {
if let Some(flag) = self.stop.take() {
flag.store(true, Ordering::Release);
if let Some(handle) = self.handle.as_ref() {
super::signals::wake_sigwait_thread(handle, Signal::SIGUSR1);
}
}
if let Some(handle) = self.handle.take() {
let _ = handle.join();
}
}
}
impl Drop for SignalThreadGuard {
fn drop(&mut self) {
self.stop();
}
}
struct HealthThreadGuard {
cancel: Option<Arc<AtomicBool>>,
handle: Option<JoinHandle<()>>,
}
impl HealthThreadGuard {
fn stop(&mut self) {
if let Some(flag) = self.cancel.take() {
flag.store(true, Ordering::Relaxed);
}
if let Some(handle) = self.handle.take() {
let _ = handle.join();
}
}
}
impl Drop for HealthThreadGuard {
fn drop(&mut self) {
self.stop();
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::container::KernelLockdownMode;
use crate::network::NetworkMode;
use std::ffi::OsString;
use std::sync::{Mutex, MutexGuard};
static ENV_LOCK: Mutex<()> = Mutex::new(());
struct EnvLock {
_guard: MutexGuard<'static, ()>,
}
impl EnvLock {
fn acquire() -> Self {
Self {
_guard: ENV_LOCK.lock().unwrap(),
}
}
}
struct EnvVarGuard {
key: &'static str,
previous: Option<OsString>,
}
impl EnvVarGuard {
fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
let previous = std::env::var_os(key);
std::env::set_var(key, value);
Self { key, previous }
}
fn remove(key: &'static str) -> Self {
let previous = std::env::var_os(key);
std::env::remove_var(key);
Self { key, previous }
}
}
impl Drop for EnvVarGuard {
fn drop(&mut self) {
match &self.previous {
Some(value) => std::env::set_var(self.key, value),
None => std::env::remove_var(self.key),
}
}
}
fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
let fn_start = source
.find(fn_signature)
.unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
let after = &source[fn_start..];
let open = after
.find('{')
.unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
let mut depth = 0u32;
let mut end = open;
for (i, ch) in after[open..].char_indices() {
match ch {
'{' => depth += 1,
'}' => {
depth -= 1;
if depth == 0 {
end = open + i + 1;
break;
}
}
_ => {}
}
}
&after[..end]
}
#[test]
fn test_container_config() {
let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
assert!(!config.id.is_empty());
assert_eq!(config.command, vec!["/bin/sh"]);
assert!(config.use_gvisor);
}
#[test]
fn test_run_uses_immediate_start_path_with_parent_setup_gate() {
let source = include_str!("runtime.rs");
let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
let after = &source[fn_start..];
let open = after.find('{').unwrap();
let mut depth = 0u32;
let mut fn_end = open;
for (i, ch) in after[open..].char_indices() {
match ch {
'{' => depth += 1,
'}' => {
depth -= 1;
if depth == 0 {
fn_end = open + i + 1;
break;
}
}
_ => {}
}
}
let run_body = &after[..fn_end];
assert!(
run_body.contains("create_internal(false)"),
"run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
);
assert!(
!run_body.contains("self.create()?.start()"),
"run() must not route through create()+start()"
);
let create_body = extract_fn_body(source, "fn create_internal");
assert!(
create_body.contains("parent_setup_write"),
"immediate run() must still use a parent setup gate before child setup proceeds"
);
}
#[test]
fn test_container_config_with_name() {
let config =
ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
.unwrap();
assert_eq!(config.name, "mycontainer");
assert!(!config.id.is_empty());
assert_ne!(config.id, config.name);
}
#[test]
fn test_allow_degraded_security_requires_explicit_config() {
let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
assert!(!Container::allow_degraded_security(&strict));
let relaxed = strict.clone().with_allow_degraded_security(true);
assert!(Container::allow_degraded_security(&relaxed));
}
#[test]
fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
assert!(!Container::allow_degraded_security(&strict));
let explicit = strict.with_allow_degraded_security(true);
assert!(Container::allow_degraded_security(&explicit));
match prev {
Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
}
}
#[test]
fn test_host_network_requires_explicit_opt_in() {
let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
.unwrap()
.with_network(NetworkMode::Host)
.with_allow_host_network(false);
let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
assert!(matches!(err, NucleusError::NetworkError(_)));
}
#[test]
fn test_host_network_opt_in_disables_net_namespace() {
let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
.unwrap()
.with_network(NetworkMode::Host)
.with_allow_host_network(true);
assert!(config.namespaces.net);
Container::apply_network_mode_guards(&mut config, true).unwrap();
assert!(!config.namespaces.net);
}
#[test]
fn test_non_host_network_does_not_require_host_opt_in() {
let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
.unwrap()
.with_network(NetworkMode::None)
.with_allow_host_network(false);
assert!(config.namespaces.net);
Container::apply_network_mode_guards(&mut config, true).unwrap();
assert!(config.namespaces.net);
}
#[test]
fn test_parse_kernel_lockdown_mode() {
assert_eq!(
Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
Some(KernelLockdownMode::Integrity)
);
assert_eq!(
Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
Some(KernelLockdownMode::Confidentiality)
);
assert_eq!(
Container::parse_active_lockdown_mode("[none] integrity"),
None
);
}
#[test]
fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
let temp = tempfile::TempDir::new().unwrap();
let source = temp.path().join("source-secret");
std::fs::write(&source, "supersecret").unwrap();
let staged = Container::stage_gvisor_secret_files(
&temp.path().join("stage"),
&[crate::container::SecretMount {
source: source.clone(),
dest: std::path::PathBuf::from("/etc/app/secret.txt"),
mode: 0o400,
}],
&crate::container::ProcessIdentity::root(),
)
.unwrap();
assert_eq!(staged.len(), 1);
assert!(staged[0].source.starts_with(temp.path().join("stage")));
assert_eq!(
std::fs::read_to_string(&staged[0].source).unwrap(),
"supersecret"
);
}
#[test]
fn test_stage_gvisor_secret_files_rejects_symlink_source() {
use std::os::unix::fs::symlink;
let temp = tempfile::TempDir::new().unwrap();
let source = temp.path().join("source-secret");
let link = temp.path().join("source-link");
std::fs::write(&source, "supersecret").unwrap();
symlink(&source, &link).unwrap();
let err = Container::stage_gvisor_secret_files(
&temp.path().join("stage"),
&[crate::container::SecretMount {
source: link,
dest: std::path::PathBuf::from("/etc/app/secret.txt"),
mode: 0o400,
}],
&crate::container::ProcessIdentity::root(),
)
.unwrap_err();
assert!(
err.to_string().contains("O_NOFOLLOW"),
"gVisor secret staging must reject symlink sources"
);
}
#[test]
fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
let source = include_str!("runtime.rs");
let fn_body = extract_fn_body(source, "fn setup_and_exec");
assert!(
fn_body.contains("mount_secrets_inmemory("),
"setup_and_exec must use in-memory secret mounting"
);
assert!(
!fn_body.contains("mount_secrets(&"),
"setup_and_exec must not bind-mount secrets from the host"
);
}
#[test]
fn test_native_production_procfs_mount_is_not_rootless_best_effort() {
let source = include_str!("runtime.rs");
let fn_body = extract_fn_body(source, "fn setup_and_exec");
assert!(
fn_body.contains(
"let production_mode = self.config.service_mode == ServiceMode::Production;"
),
"setup_and_exec must derive an explicit production-mode guard for procfs hardening"
);
assert!(
fn_body.contains("let procfs_best_effort = is_rootless && !production_mode;"),
"rootless best-effort procfs fallback must be disabled in production mode"
);
assert!(
fn_body.contains(
"mount_procfs(\n &proc_path,\n procfs_best_effort,"
),
"mount_procfs must receive the production-aware best-effort flag"
);
}
#[test]
fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
let source = include_str!("gvisor_setup.rs");
let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
assert!(
fn_body.contains("with_inmemory_secret_mounts"),
"gVisor setup must use the tmpfs-backed secret staging path"
);
assert!(
!fn_body.contains("with_secret_mounts"),
"gVisor setup must not bind-mount host secret paths"
);
}
#[test]
fn test_gvisor_bridge_precreated_userns_skips_nested_oci_userns() {
let source = include_str!("gvisor_setup.rs");
let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
let precreated_check = fn_body.find("if precreated_userns").unwrap();
let oci_userns = fn_body.find("with_rootless_user_namespace").unwrap();
assert!(
precreated_check < oci_userns,
"pre-created rootless bridge userns must skip nested OCI user namespace setup"
);
}
#[test]
fn test_gvisor_bridge_precreated_userns_disables_oci_no_new_privileges() {
let source = include_str!("gvisor_setup.rs");
let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
assert!(
fn_body.contains("if precreated_userns")
&& fn_body.contains("with_no_new_privileges(false)"),
"pre-created rootless bridge userns must not pass OCI noNewPrivileges to runsc"
);
}
#[test]
fn test_gvisor_bridge_precreated_userns_selects_runsc_rootless() {
let source = include_str!("gvisor_setup.rs");
let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
assert!(
fn_body.contains("let runsc_rootless = precreated_userns"),
"pre-created rootless bridge userns must use runsc's rootless execution path"
);
}
#[test]
fn test_gvisor_bridge_rootless_requests_external_userns_mapping() {
let source = include_str!("runtime.rs");
let create_body = extract_fn_body(source, "fn create_internal");
assert!(
create_body.contains("let gvisor_bridge_needs_userns_mapping"),
"gVisor bridge rootless setup must request parent-written userns mappings"
);
assert!(
create_body.contains("matches!(config.network, NetworkMode::Bridge(_))"),
"external mapping request must be scoped to gVisor bridge networking"
);
}
#[test]
fn test_gvisor_bridge_namespace_creates_userns_before_netns() {
let source = include_str!("runtime.rs");
let fn_body = extract_fn_body(source, "fn prepare_gvisor_bridge_namespace");
let userns = fn_body.find("CLONE_NEWUSER").unwrap();
let request = fn_body.find("send_sync_byte").unwrap();
let become_root = fn_body.find("become_userns_root_for_setup").unwrap();
let netns = fn_body.find("CLONE_NEWNET").unwrap();
assert!(
userns < request && request < become_root && become_root < netns,
"rootless gVisor bridge setup must map userns before creating the netns"
);
}
#[test]
fn test_native_fork_sites_assert_single_threaded() {
let runtime_source = include_str!("runtime.rs");
let create_body = extract_fn_body(runtime_source, "fn create_internal");
assert!(
create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
"create_internal must assert single-threaded before fork"
);
let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
assert!(
setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
"PID namespace setup must assert single-threaded before fork"
);
let exec_source = include_str!("exec.rs");
let init_body = extract_fn_body(exec_source, "fn run_as_init");
assert!(
init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
"run_as_init must assert single-threaded before fork"
);
}
#[test]
fn test_parent_setup_gate_released_after_network_policy() {
let source = include_str!("runtime.rs");
let create_body = extract_fn_body(source, "fn create_internal");
let cgroup_attach = create_body.find("cgroup.attach_process").unwrap();
let deny_logger = create_body.find("maybe_start_seccomp_deny_logger").unwrap();
let bridge_setup = create_body.find("BridgeDriver::setup_with_id").unwrap();
let egress_policy = create_body.find("net.apply_egress_policy").unwrap();
let release = create_body
.find("Failed to notify child that parent setup is complete")
.unwrap();
let created = create_body.find("Ok(CreatedContainer").unwrap();
assert!(
cgroup_attach < bridge_setup,
"parent setup gate must not release before cgroup attachment"
);
assert!(
cgroup_attach < deny_logger && deny_logger < bridge_setup,
"seccomp deny logger must start after cgroup attachment and before workload release"
);
assert!(
create_body.contains("cgroup_opt.as_ref().map(|cgroup| cgroup.path())"),
"seccomp deny logger must receive the container cgroup scope"
);
assert!(
bridge_setup < egress_policy && egress_policy < release,
"parent setup gate must not release before bridge and egress policy setup"
);
assert!(
release < created,
"create_internal must release the child only after all fallible parent setup succeeds"
);
assert!(
!create_body.contains("cgroup attachment is complete"),
"child setup gate must not be released immediately after cgroup attachment"
);
}
#[test]
fn test_child_waits_for_parent_setup_before_exec_paths() {
let source = include_str!("runtime.rs");
let setup_body = extract_fn_body(source, "fn setup_and_exec");
let gvisor_wait = setup_body
.find("Parent closed setup pipe before signalling gVisor child")
.unwrap();
let gvisor_exec = setup_body.find("setup_and_exec_gvisor").unwrap();
assert!(
gvisor_wait < gvisor_exec,
"gVisor path must wait for parent setup before execing runsc"
);
let pid1_wait = setup_body
.find("Parent closed setup pipe before signalling PID 1 child")
.unwrap();
let namespace_enter = setup_body.find("namespace_mgr.enter()?").unwrap();
assert!(
pid1_wait < namespace_enter,
"PID namespace child must wait for parent setup before container setup continues"
);
let direct_wait = setup_body
.find("Parent closed setup pipe before signalling container child")
.unwrap();
assert!(
direct_wait < namespace_enter,
"non-PID namespace child must wait for parent setup before container setup continues"
);
}
#[test]
fn test_parent_setup_failure_kills_reported_target_pid() {
let source = include_str!("runtime.rs");
let create_body = extract_fn_body(source, "fn create_internal");
let record_target = create_body
.find("target_pid_for_cleanup = Some(target_pid)")
.unwrap();
let kill_target = create_body
.find("kill(Pid::from_raw(target_pid as i32), Signal::SIGKILL)")
.unwrap();
let kill_intermediate = create_body.find("kill(child, Signal::SIGKILL)").unwrap();
assert!(
record_target < kill_target,
"parent setup cleanup must remember the reported target PID"
);
assert!(
kill_target < kill_intermediate,
"cleanup must kill the target PID before reaping the intermediate fork"
);
}
#[test]
fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
let source = include_str!("exec.rs");
let fn_body = extract_fn_body(source, "fn run_as_init");
assert!(
!fn_body.contains("Self::apply_process_identity_to_current_process("),
"run_as_init must not drop identity before the supervisor fork"
);
assert!(
fn_body.contains("self.exec_command()?"),
"workload child must still route through exec_command for identity application"
);
}
#[test]
fn test_signal_thread_shutdown_uses_thread_directed_wakeup() {
let runtime_source = include_str!("runtime.rs");
let exec_source = include_str!("exec.rs");
let signal_helper_source = include_str!("signals.rs");
let process_directed_wakeup = ["kill(Pid::this()", ", Signal::SIGUSR1)"].concat();
assert!(
!runtime_source.contains(&process_directed_wakeup),
"CreatedContainer signal-thread shutdown must not send process-directed SIGUSR1"
);
assert!(
!exec_source.contains(&process_directed_wakeup),
"init supervisor signal-thread shutdown must not send process-directed SIGUSR1"
);
assert!(
signal_helper_source.contains("libc::pthread_kill"),
"signal-thread shutdown must wake the sigwait owner with a thread-directed signal"
);
}
#[test]
fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
let _env_lock = EnvLock::acquire();
let temp = tempfile::TempDir::new().unwrap();
let _artifact_base = EnvVarGuard::set(
"NUCLEUS_GVISOR_ARTIFACT_BASE",
temp.path().join("gvisor-artifacts"),
);
let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
std::fs::create_dir_all(&artifact_dir).unwrap();
std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
assert!(!artifact_dir.exists());
}
#[test]
fn test_gvisor_artifact_base_prefers_xdg_runtime_dir() {
let _env_lock = EnvLock::acquire();
let temp = tempfile::TempDir::new().unwrap();
let _artifact_override = EnvVarGuard::remove("NUCLEUS_GVISOR_ARTIFACT_BASE");
let _runtime = EnvVarGuard::set("XDG_RUNTIME_DIR", temp.path());
assert_eq!(
Container::gvisor_artifact_dir("xdg-test"),
temp.path().join("nucleus-gvisor").join("xdg-test")
);
}
#[test]
fn test_health_check_loop_supports_cancellation() {
let source = include_str!("health.rs");
let fn_start = source.find("fn health_check_loop").unwrap();
let fn_body = &source[fn_start..fn_start + 2500];
assert!(
fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
"health_check_loop must accept an AtomicBool cancellation flag"
);
assert!(
fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
"health_check_loop must check cancellation during sleep intervals"
);
}
#[test]
fn test_runtime_probes_do_not_spawn_host_nsenter() {
let source = include_str!("health.rs");
let readiness_start = source.find("fn run_readiness_probe").unwrap();
let readiness_body = &source[readiness_start..readiness_start + 2500];
assert!(
!readiness_body.contains("Command::new(&nsenter_bin)"),
"readiness probes must not execute via host nsenter"
);
let health_start = source.find("fn health_check_loop").unwrap();
let health_body = &source[health_start..health_start + 2200];
assert!(
!health_body.contains("Command::new(&nsenter_bin)"),
"health checks must not execute via host nsenter"
);
}
#[test]
fn test_oci_mount_strip_prefix_no_expect() {
let source = include_str!("gvisor_setup.rs");
let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
let fn_body = &source[fn_start..fn_start + 600];
assert!(
!fn_body.contains(".expect("),
"prepare_oci_mountpoints must not use expect() – return Err instead"
);
}
#[test]
fn test_notify_namespace_ready_validates_write_length() {
let source = include_str!("runtime.rs");
let fn_start = source.find("fn notify_namespace_ready").unwrap();
let fn_body = &source[fn_start..fn_start + 500];
assert!(
fn_body.contains("written")
|| fn_body.contains("4")
|| fn_body.contains("payload.len()"),
"notify_namespace_ready must validate complete write of all 4 bytes"
);
}
#[test]
fn test_rlimit_failures_fatal_in_production() {
let source = include_str!("runtime.rs");
let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
assert!(
rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
"RLIMIT failures must return Err in production mode"
);
}
#[test]
fn test_tcp_readiness_probe_uses_portable_check() {
let source = include_str!("health.rs");
let probe_fn = source.find("TcpPort(port)").unwrap();
let probe_body = &source[probe_fn..probe_fn + 500];
assert!(
!probe_body.contains("/dev/tcp"),
"TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
);
}
}