Skip to main content

nucleus/container/
runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4    ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10    switch_root, verify_context_manifest, verify_rootfs_attestation, FilesystemState,
11    LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
18    SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::OwnedFd;
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// Container runtime that orchestrates all isolation mechanisms
///
/// Execution flow matches the formal specifications:
/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
/// 5. Execute target process
pub struct Container {
    /// Configuration the container is created from. `create_internal` works
    /// on a clone of this value, so the stored config itself is not modified
    /// by the rootless / user-namespace adjustments made during creation.
    pub(super) config: ContainerConfig,
    /// Pre-resolved runsc path, resolved before fork so that user-namespace
    /// UID changes don't block PATH-based lookup.
    ///
    /// `Container::new` leaves this as `None`; the forked child rebuilds a
    /// `Container` carrying the resolved path when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
47
/// Handle returned by `Container::create()` representing a container whose
/// child process has been forked and is blocked on the exec FIFO, waiting for
/// `start()` to release it.
pub struct CreatedContainer {
    /// Effective configuration: the caller's config after the adjustments
    /// applied during creation (e.g. automatic rootless / user-namespace
    /// enablement).
    pub(super) config: ContainerConfig,
    /// State manager used to persist `state`; built from the configured
    /// state root.
    pub(super) state_mgr: ContainerStateManager,
    /// State record that was saved with status `Created` during creation.
    pub(super) state: ContainerState,
    /// PID of the directly forked child process.
    /// NOTE(review): the PID recorded in `state` is the one reported over the
    /// namespace-ready pipe, which presumably differs from this value when a
    /// PID namespace init process is forked — confirm.
    pub(super) child: Pid,
    /// Cgroup the container process was attached to, or `None` when cgroup
    /// creation or limit application failed outside production mode.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network driver; `Some` only when bridge networking was
    /// configured and its setup succeeded.
    pub(super) network_driver: Option<BridgeDriver>,
    /// Seccomp trace reader, if one was started for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Seccomp deny logger, if one was started for this container.
    pub(super) deny_logger: Option<SeccompDenyLogger>,
    /// Path of the exec FIFO. `Some` only for the two-phase `create()` path;
    /// `run()` creates no FIFO and leaves this `None`.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Clone of the `container.lifecycle` tracing span opened at creation.
    pub(super) _lifecycle_span: tracing::Span,
}
63
64impl Container {
65    pub fn new(config: ContainerConfig) -> Self {
66        Self {
67            config,
68            runsc_path: None,
69        }
70    }
71
72    /// Run the container (convenience wrapper: create + start)
73    pub fn run(&self) -> Result<i32> {
74        self.create_internal(false)?.start()
75    }
76
77    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
78    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
79    /// releases the child process.
80    pub fn create(&self) -> Result<CreatedContainer> {
81        self.create_internal(true)
82    }
83
84    /// H6: Close all file descriptors > 2 in the child process after fork.
85    ///
86    /// This prevents leaking host sockets, pipes, and state files into the
87    /// container. Uses close_range(2) when available, falls back to /proc/self/fd.
88    fn sanitize_fds() {
89        // Try close_range(3, u32::MAX, CLOSE_RANGE_CLOEXEC) first – it's
90        // O(1) on Linux 5.9+ and marks all FDs as close-on-exec.
91        const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
92        // SAFETY: close_range is a safe syscall that marks FDs as close-on-exec.
93        let ret =
94            unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
95        if ret == 0 {
96            return;
97        }
98        // Fallback: iterate /proc/self/fd and close individually.
99        // Collect fds first, then close – closing during iteration would
100        // invalidate the ReadDir's own directory fd.
101        if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
102            let fds: Vec<i32> = entries
103                .flatten()
104                .filter_map(|entry| entry.file_name().into_string().ok())
105                .filter_map(|s| s.parse::<i32>().ok())
106                .filter(|&fd| fd > 2)
107                .collect();
108            for fd in fds {
109                unsafe { libc::close(fd) };
110            }
111        }
112    }
113
114    pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
115        let thread_count = std::fs::read_to_string("/proc/self/status")
116            .ok()
117            .and_then(|s| {
118                s.lines()
119                    .find(|line| line.starts_with("Threads:"))
120                    .and_then(|line| line.split_whitespace().nth(1))
121                    .and_then(|count| count.parse::<u32>().ok())
122            });
123
124        if thread_count == Some(1) {
125            return Ok(());
126        }
127
128        Err(NucleusError::ExecError(format!(
129            "{} requires a single-threaded process before fork, found {:?} threads",
130            context, thread_count
131        )))
132    }
133
134    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
135        let lifecycle_span = info_span!(
136            "container.lifecycle",
137            container.id = %self.config.id,
138            container.name = %self.config.name,
139            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
140        );
141        let _enter = lifecycle_span.enter();
142
143        info!(
144            "Creating container: {} (ID: {})",
145            self.config.name, self.config.id
146        );
147        audit(
148            &self.config.id,
149            &self.config.name,
150            AuditEventType::ContainerStart,
151            format!(
152                "command={:?} mode={:?} runtime={}",
153                crate::audit::redact_command(&self.config.command),
154                self.config.service_mode,
155                if self.config.use_gvisor {
156                    "gvisor"
157                } else {
158                    "native"
159                }
160            ),
161        );
162
163        // Auto-detect if we need rootless mode
164        let is_root = nix::unistd::Uid::effective().is_root();
165        let mut config = self.config.clone();
166
167        if !is_root && config.user_ns_config.is_none() {
168            info!("Not running as root, automatically enabling rootless mode");
169            config.namespaces.user = true;
170            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
171        }
172
173        // C2: When running as root without user namespace, enable UID remapping
174        // in production mode (mandatory) or warn in other modes. Without user
175        // namespace, a container escape yields full host root.
176        if is_root && !config.namespaces.user {
177            if config.service_mode == ServiceMode::Production {
178                info!("Running as root in production mode: enabling user namespace with UID remapping");
179                config.namespaces.user = true;
180                config.user_ns_config =
181                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
182            } else {
183                warn!(
184                    "Running as root WITHOUT user namespace isolation. \
185                     Container processes will run as real host UID 0. \
186                     Use --user-ns or production mode for UID remapping."
187                );
188            }
189        }
190
191        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
192        if let Some(ref socket_path) = config.console_socket {
193            warn!(
194                "Console socket {} accepted but terminal forwarding is not yet implemented",
195                socket_path.display()
196            );
197        }
198
199        // Validate production mode invariants before anything else.
200        config.validate_production_mode()?;
201        Self::assert_kernel_lockdown(&config)?;
202
203        Self::apply_network_mode_guards(&mut config, is_root)?;
204        Self::apply_trust_level_guards(&mut config)?;
205        config.validate_runtime_support()?;
206
207        if let NetworkMode::Bridge(ref bridge_config) = config.network {
208            let backend =
209                bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
210            if backend == NatBackend::Kernel && !is_root {
211                return Err(NucleusError::NetworkError(
212                    "Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
213                        .to_string(),
214                ));
215            }
216        }
217
218        // Create state manager, honoring --root override if set
219        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;
220
221        // Enforce name uniqueness among running containers
222        if let Ok(all_states) = state_mgr.list_states() {
223            if all_states.iter().any(|s| s.name == config.name) {
224                return Err(NucleusError::ConfigError(format!(
225                    "A container named '{}' already exists; use a different --name, \
226                     or remove the stale state with 'nucleus delete'",
227                    config.name
228                )));
229            }
230        }
231
232        // Create exec FIFO only for the two-phase create/start lifecycle.
233        // `run()` starts immediately and avoids this cross-root-path sync.
234        let exec_fifo = if defer_exec_until_start {
235            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
236            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
237                NucleusError::ExecError(format!(
238                    "Failed to create exec FIFO {:?}: {}",
239                    exec_fifo, e
240                ))
241            })?;
242            Some(exec_fifo)
243        } else {
244            None
245        };
246
247        // Try to create cgroup (optional for rootless mode)
248        let cgroup_name = format!("nucleus-{}", config.id);
249        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
250            Ok(mut cgroup) => {
251                // Try to set limits
252                match cgroup.set_limits(&config.limits) {
253                    Ok(_) => {
254                        info!("Created cgroup with resource limits");
255                        Some(cgroup)
256                    }
257                    Err(e) => {
258                        if config.service_mode == ServiceMode::Production {
259                            let _ = cgroup.cleanup();
260                            return Err(NucleusError::CgroupError(format!(
261                                "Production mode requires cgroup resource enforcement, but \
262                                 applying limits failed: {}",
263                                e
264                            )));
265                        }
266                        warn!("Failed to set cgroup limits: {}", e);
267                        let _ = cgroup.cleanup();
268                        None
269                    }
270                }
271            }
272            Err(e) => {
273                if config.service_mode == ServiceMode::Production {
274                    return Err(NucleusError::CgroupError(format!(
275                        "Production mode requires cgroup resource enforcement, but \
276                         cgroup creation failed: {}",
277                        e
278                    )));
279                }
280
281                if config.user_ns_config.is_some() {
282                    if config.limits.memory_bytes.is_some()
283                        || config.limits.cpu_quota_us.is_some()
284                        || config.limits.pids_max.is_some()
285                    {
286                        warn!(
287                            "Running in rootless mode: requested resource limits cannot be \
288                             enforced – cgroup creation requires root ({})",
289                            e
290                        );
291                    } else {
292                        debug!("Running in rootless mode without cgroup resource limits");
293                    }
294                } else {
295                    warn!(
296                        "Failed to create cgroup (running without resource limits): {}",
297                        e
298                    );
299                }
300                None
301            }
302        };
303
304        // Resolve runsc path before fork, while still unprivileged.
305        let runsc_path = if config.use_gvisor {
306            Some(GVisorRuntime::resolve_path().map_err(|e| {
307                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
308            })?)
309        } else {
310            None
311        };
312
313        // Child notifies parent after namespaces are ready.
314        let (ready_read, ready_write) = pipe().map_err(|e| {
315            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
316        })?;
317
318        // M11: fork() in multi-threaded context. Flush log buffers and drop
319        // tracing guards before fork to minimize deadlock risk from locks held
320        // by other threads (tracing, allocator). The Tokio runtime is not yet
321        // started at this point, so async thread contention is not a concern.
322        Self::assert_single_threaded_for_fork("container create fork")?;
323        // SAFETY: fork() is called before any Tokio runtime is created.
324        // Only the main thread should be active at this point.
325        match unsafe { fork() }? {
326            ForkResult::Parent { child } => {
327                drop(ready_write);
328                info!("Forked child process: {}", child);
329
330                // Use a closure so that on any error we kill the child process
331                // instead of leaving it orphaned and blocked on the exec FIFO.
332                let parent_setup = || -> Result<CreatedContainer> {
333                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
334
335                    let cgroup_path = cgroup_opt
336                        .as_ref()
337                        .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
338                    let cpu_millicores = config
339                        .limits
340                        .cpu_quota_us
341                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
342                    let mut state = ContainerState::new(ContainerStateParams {
343                        id: config.id.clone(),
344                        name: config.name.clone(),
345                        pid: target_pid,
346                        command: config.command.clone(),
347                        memory_limit: config.limits.memory_bytes,
348                        cpu_limit: cpu_millicores,
349                        using_gvisor: config.use_gvisor,
350                        rootless: config.user_ns_config.is_some(),
351                        cgroup_path,
352                        process_uid: config.process_identity.uid,
353                        process_gid: config.process_identity.gid,
354                        additional_gids: config.process_identity.additional_gids.clone(),
355                    });
356                    state.config_hash = config.config_hash;
357                    state.bundle_path =
358                        config.rootfs_path.as_ref().map(|p| p.display().to_string());
359
360                    let mut network_driver: Option<BridgeDriver> = None;
361                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
362                    let deny_logger = Self::maybe_start_seccomp_deny_logger(&config, target_pid)?;
363
364                    // Transition: Creating -> Created
365                    state.status = OciStatus::Created;
366                    state_mgr.save_state(&state)?;
367
368                    // Write PID file (OCI --pid-file)
369                    if let Some(ref pid_path) = config.pid_file {
370                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
371                            NucleusError::ConfigError(format!(
372                                "Failed to write pid-file '{}': {}",
373                                pid_path.display(),
374                                e
375                            ))
376                        })?;
377                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
378                    }
379
380                    if let Some(ref mut cgroup) = cgroup_opt {
381                        cgroup.attach_process(target_pid)?;
382                    }
383
384                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
385                        match BridgeDriver::setup_with_id(
386                            target_pid,
387                            bridge_config,
388                            &config.id,
389                            is_root,
390                            config.user_ns_config.is_some(),
391                        ) {
392                            Ok(net) => {
393                                if let Some(ref egress) = config.egress_policy {
394                                    if let Err(e) = net.apply_egress_policy(
395                                        target_pid,
396                                        egress,
397                                        config.user_ns_config.is_some(),
398                                    ) {
399                                        if config.service_mode == ServiceMode::Production {
400                                            return Err(NucleusError::NetworkError(format!(
401                                                "Failed to apply egress policy: {}",
402                                                e
403                                            )));
404                                        }
405                                        warn!("Failed to apply egress policy: {}", e);
406                                    }
407                                }
408                                network_driver = Some(net);
409                            }
410                            Err(e) => {
411                                if config.service_mode == ServiceMode::Production {
412                                    return Err(e);
413                                }
414                                warn!("Failed to set up bridge networking: {}", e);
415                            }
416                        }
417                    }
418
419                    info!(
420                        "Container {} created (child pid {}), waiting for start",
421                        config.id, target_pid
422                    );
423
424                    Ok(CreatedContainer {
425                        config,
426                        state_mgr,
427                        state,
428                        child,
429                        cgroup_opt,
430                        network_driver,
431                        trace_reader,
432                        deny_logger,
433                        exec_fifo_path: exec_fifo,
434                        _lifecycle_span: lifecycle_span.clone(),
435                    })
436                };
437
438                parent_setup().map_err(|e| {
439                    // Kill the child so it doesn't remain orphaned and blocked
440                    // on the exec FIFO.
441                    let _ = kill(child, Signal::SIGKILL);
442                    let _ = waitpid(child, None);
443                    e
444                })
445            }
446            ForkResult::Child => {
447                drop(ready_read);
448                // H6: Close inherited FDs > 2 to prevent leaking host sockets/pipes
449                Self::sanitize_fds();
450                let temp_container = Container { config, runsc_path };
451                match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
452                    Ok(_) => unreachable!(),
453                    Err(e) => {
454                        error!("Container setup failed: {}", e);
455                        std::process::exit(1);
456                    }
457                }
458            }
459        }
460    }
461
462    /// Trigger a previously-created container to start by opening its exec FIFO.
463    /// Used by the CLI `start` command.
464    pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
465        let state_mgr = ContainerStateManager::new_with_root(state_root)?;
466        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
467        if !fifo_path.exists() {
468            return Err(NucleusError::ConfigError(format!(
469                "No exec FIFO found for container {}; is it in 'created' state?",
470                container_id
471            )));
472        }
473
474        // Opening the FIFO for reading unblocks the child's open-for-write.
475        let file = std::fs::File::open(&fifo_path)
476            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
477        let mut buf = [0u8; 1];
478        std::io::Read::read(&mut &file, &mut buf)
479            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
480        drop(file);
481
482        let _ = std::fs::remove_file(&fifo_path);
483
484        // Update state to Running
485        let mut state = state_mgr.resolve_container(container_id)?;
486        state.status = OciStatus::Running;
487        state_mgr.save_state(&state)?;
488
489        Ok(())
490    }
491
492    /// Set up container environment and exec target process
493    ///
494    /// This runs in the child process after fork.
495    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
496    fn setup_and_exec(
497        &self,
498        ready_pipe: Option<OwnedFd>,
499        exec_fifo: Option<PathBuf>,
500    ) -> Result<()> {
501        let is_rootless = self.config.user_ns_config.is_some();
502        let allow_degraded_security = Self::allow_degraded_security(&self.config);
503        let context_manifest = if self.config.verify_context_integrity {
504            self.config
505                .context_dir
506                .as_ref()
507                .map(|dir| snapshot_context_dir(dir))
508                .transpose()?
509        } else {
510            None
511        };
512
513        // Initialize state machines
514        let mut fs_state = FilesystemState::Unmounted;
515        let mut sec_state = SecurityState::Privileged;
516
517        // gVisor is the runtime that should create the container's namespaces.
518        // Running runsc after pre-unsharing our own namespaces breaks its gofer
519        // re-exec path on some systems and duplicates the OCI namespace config.
520        if self.config.use_gvisor {
521            if let Some(fd) = ready_pipe {
522                Self::notify_namespace_ready(&fd, std::process::id())?;
523            }
524            return self.setup_and_exec_gvisor();
525        }
526
527        // 1. Create namespaces in child and optionally configure user mapping.
528        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
529        if let Some(user_config) = &self.config.user_ns_config {
530            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
531        }
532        namespace_mgr.unshare_namespaces()?;
533
534        // CLONE_NEWPID only applies to children created after unshare().
535        // Create a child that will become PID 1 in the new namespace and exec the workload.
536        if self.config.namespaces.pid {
537            Self::assert_single_threaded_for_fork("PID namespace init fork")?;
538            match unsafe { fork() }? {
539                ForkResult::Parent { child } => {
540                    if let Some(fd) = ready_pipe {
541                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
542                    }
543                    std::process::exit(Self::wait_for_pid_namespace_child(child));
544                }
545                ForkResult::Child => {
546                    // Continue container setup as PID 1 in the new namespace.
547                }
548            }
549        } else if let Some(fd) = ready_pipe {
550            Self::notify_namespace_ready(&fd, std::process::id())?;
551        }
552
553        // Namespace: Unshared -> Entered (process is now inside all namespaces)
554        namespace_mgr.enter()?;
555
556        // 2. Ensure no_new_privs BEFORE any mount operations.
557        // This prevents exploitation of setuid binaries on bind-mounted paths
558        // even if a subsequent MS_NOSUID remount fails.
559        self.enforce_no_new_privs()?;
560        audit(
561            &self.config.id,
562            &self.config.name,
563            AuditEventType::NoNewPrivsSet,
564            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
565        );
566
567        // 3. Set hostname if UTS namespace is enabled
568        if let Some(hostname) = &self.config.hostname {
569            namespace_mgr.set_hostname(hostname)?;
570        }
571
572        // 4. Mount tmpfs as container root
573        // Filesystem: Unmounted -> Mounted
574        // Use a private runtime directory instead of /tmp to avoid symlink
575        // attacks and information disclosure on multi-user systems.
576        let runtime_base = if nix::unistd::Uid::effective().is_root() {
577            std::path::PathBuf::from("/run/nucleus")
578        } else {
579            dirs::runtime_dir()
580                .map(|d| d.join("nucleus"))
581                .unwrap_or_else(std::env::temp_dir)
582        };
583        let _ = std::fs::create_dir_all(&runtime_base);
584        let runtime_dir = Builder::new()
585            .prefix("nucleus-runtime-")
586            .tempdir_in(&runtime_base)
587            .map_err(|e| {
588                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
589            })?;
590        let container_root = runtime_dir.path().to_path_buf();
591        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
592        tmpfs.mount()?;
593        fs_state = fs_state.transition(FilesystemState::Mounted)?;
594
595        // 4. Create minimal filesystem structure
596        create_minimal_fs(&container_root)?;
597
598        // 5. Create device nodes and standard tmpfs mounts under /dev
599        let dev_path = container_root.join("dev");
600        create_dev_nodes(&dev_path, false)?;
601
602        // /dev/shm – POSIX shared memory (shm_open). Required by PostgreSQL,
603        // Redis, and other programs that use POSIX shared memory segments.
604        let shm_path = dev_path.join("shm");
605        std::fs::create_dir_all(&shm_path).map_err(|e| {
606            NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
607        })?;
608        nix::mount::mount(
609            Some("shm"),
610            &shm_path,
611            Some("tmpfs"),
612            nix::mount::MsFlags::MS_NOSUID
613                | nix::mount::MsFlags::MS_NODEV
614                | nix::mount::MsFlags::MS_NOEXEC,
615            Some("mode=1777,size=64m"),
616        )
617        .map_err(|e| {
618            NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
619        })?;
620        debug!("Mounted tmpfs on /dev/shm");
621
622        // 6. Populate context if provided
623        // Filesystem: Mounted -> Populated
624        if let Some(context_dir) = &self.config.context_dir {
625            let context_dest = container_root.join("context");
626            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
627            if let Some(expected) = &context_manifest {
628                verify_context_manifest(expected, &context_dest)?;
629            }
630        }
631        fs_state = fs_state.transition(FilesystemState::Populated)?;
632
633        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
634        if let Some(ref rootfs_path) = self.config.rootfs_path {
635            if self.config.verify_rootfs_attestation {
636                verify_rootfs_attestation(rootfs_path)?;
637            }
638            bind_mount_rootfs(&container_root, rootfs_path)?;
639        } else {
640            bind_mount_host_paths(&container_root, is_rootless)?;
641        }
642
643        // 7b. Mount persistent or ephemeral volumes over the base filesystem.
644        mount_volumes(&container_root, &self.config.volumes)?;
645
646        // 7c. Write resolv.conf for bridge networking.
647        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
648        // resolv.conf over the top (same technique as secrets).
649        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
650            let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
651                == NatBackend::Userspace
652                && bridge_config.dns.is_empty()
653            {
654                vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
655            } else {
656                bridge_config.dns.clone()
657            };
658            if self.config.rootfs_path.is_some() {
659                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
660            } else {
661                BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
662            }
663        }
664
665        // 7d. Mount secrets on an in-memory tmpfs in all modes.
666        mount_secrets_inmemory(
667            &container_root,
668            &self.config.secrets,
669            &self.config.process_identity,
670        )?;
671
672        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
673        let proc_path = container_root.join("proc");
674        let hide_pids = self.config.service_mode == ServiceMode::Production;
675        mount_procfs(
676            &proc_path,
677            is_rootless,
678            self.config.proc_readonly,
679            hide_pids,
680        )?;
681
682        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
683        // SEC-06: In production mode, failures to mask critical paths are fatal.
684        mask_proc_paths(
685            &proc_path,
686            self.config.service_mode == ServiceMode::Production,
687        )?;
688
689        // 9c. Run createRuntime hooks (after namespaces created, before pivot_root)
690        if let Some(ref hooks) = self.config.hooks {
691            if !hooks.create_runtime.is_empty() {
692                let hook_state = OciContainerState {
693                    oci_version: "1.0.2".to_string(),
694                    id: self.config.id.clone(),
695                    status: OciStatus::Creating,
696                    pid: std::process::id(),
697                    bundle: String::new(),
698                };
699                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
700            }
701        }
702
703        // 10. Switch root filesystem
704        // Filesystem: Populated -> Pivoted
705        switch_root(&container_root, self.config.allow_chroot_fallback)?;
706        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
707        debug!("Filesystem state: {:?}", fs_state);
708
709        // 10b. Audit mount flags to verify filesystem hardening invariants
710        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
711        audit(
712            &self.config.id,
713            &self.config.name,
714            AuditEventType::MountAuditPassed,
715            "all mount flags verified",
716        );
717
718        // 10c. Run createContainer hooks (after pivot_root, before start)
719        if let Some(ref hooks) = self.config.hooks {
720            if !hooks.create_container.is_empty() {
721                let hook_state = OciContainerState {
722                    oci_version: "1.0.2".to_string(),
723                    id: self.config.id.clone(),
724                    status: OciStatus::Created,
725                    pid: std::process::id(),
726                    bundle: String::new(),
727                };
728                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
729            }
730        }
731
732        // 11. Drop capabilities and switch identity (Docker/runc convention).
733        //
734        // The identity switch (setuid/setgid) must happen between two cap phases:
735        //   Phase 1: drop bounding set (needs CAP_SETPCAP), clear ambient/inheritable
736        //   Identity: setgroups/setgid/setuid (needs CAP_SETUID/CAP_SETGID)
737        //   Phase 2: clear permitted/effective (or kernel auto-clears on setuid)
738        //
739        // Custom cap policies (drop_except / apply_sets) do their own full drop,
740        // so the two-phase approach only applies to the default drop-all path.
741        let mut cap_mgr = CapabilityManager::new();
742        if let Some(ref policy_path) = self.config.caps_policy {
743            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
744                policy_path,
745                self.config.caps_policy_sha256.as_deref(),
746            )?;
747            // H3: Reject dangerous capabilities in production mode
748            if self.config.service_mode == ServiceMode::Production {
749                policy.validate_production()?;
750            }
751            policy.apply(&mut cap_mgr)?;
752            // Identity switch after custom policy (caps may already be restricted)
753            Self::apply_process_identity_to_current_process(
754                &self.config.process_identity,
755                self.config.user_ns_config.is_some(),
756            )?;
757            audit(
758                &self.config.id,
759                &self.config.name,
760                AuditEventType::CapabilitiesDropped,
761                format!("capability policy applied from {:?}", policy_path),
762            );
763        } else {
764            // Phase 1: drop bounding set while CAP_SETPCAP is still effective
765            cap_mgr.drop_bounding_set()?;
766
767            // Identity switch: setgroups/setgid/setuid while CAP_SETUID/CAP_SETGID
768            // are still in the effective set. For non-root target UIDs, the kernel
769            // auto-clears permitted/effective after setuid().
770            Self::apply_process_identity_to_current_process(
771                &self.config.process_identity,
772                self.config.user_ns_config.is_some(),
773            )?;
774
775            // Phase 2: explicitly clear any remaining caps (handles root-stays-root
776            // case where kernel doesn't auto-clear).
777            cap_mgr.finalize_drop()?;
778
779            audit(
780                &self.config.id,
781                &self.config.name,
782                AuditEventType::CapabilitiesDropped,
783                "all capabilities dropped including bounding set",
784            );
785        }
786        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
787
788        // 12b. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
789        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
790        // SEC-05: In production mode, RLIMIT failures are fatal – a container
791        // without resource limits is a privilege escalation vector.
792        {
793            let is_production = self.config.service_mode == ServiceMode::Production;
794
795            if let Some(nproc_limit) = self.config.limits.pids_max {
796                let rlim_nproc = libc::rlimit {
797                    rlim_cur: nproc_limit,
798                    rlim_max: nproc_limit,
799                };
800                // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
801                if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
802                    let err = std::io::Error::last_os_error();
803                    if is_production {
804                        return Err(NucleusError::SeccompError(format!(
805                            "Failed to set RLIMIT_NPROC to {} in production mode: {}",
806                            nproc_limit, err
807                        )));
808                    }
809                    warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
810                }
811            }
812
813            let rlim_nofile = libc::rlimit {
814                rlim_cur: 1024,
815                rlim_max: 1024,
816            };
817            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
818            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
819                let err = std::io::Error::last_os_error();
820                if is_production {
821                    return Err(NucleusError::SeccompError(format!(
822                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
823                        err
824                    )));
825                }
826                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
827            }
828
829            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
830            // memory via mlock(). Default 64KB matches unprivileged default, but
831            // in a user namespace the container appears as UID 0 and may have a
832            // higher inherited limit. Configurable via --memlock for io_uring etc.
833            let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
834            let rlim_memlock = libc::rlimit {
835                rlim_cur: memlock_limit,
836                rlim_max: memlock_limit,
837            };
838            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
839            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
840                let err = std::io::Error::last_os_error();
841                if is_production {
842                    return Err(NucleusError::SeccompError(format!(
843                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
844                        memlock_limit, err
845                    )));
846                }
847                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
848            }
849        }
850
851        // 12c. Verify that namespace-creating capabilities are truly gone before
852        // installing seccomp. clone3 is allowed without argument filtering, so this
853        // is the sole guard against namespace escape via clone3.
854        CapabilityManager::verify_no_namespace_caps(
855            self.config.service_mode == ServiceMode::Production,
856        )?;
857
858        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
859        // Security: CapabilitiesDropped -> SeccompApplied
860        use crate::container::config::SeccompMode;
861        let mut seccomp_mgr = SeccompManager::new();
862        let allow_network = !matches!(self.config.network, NetworkMode::None);
863        let seccomp_applied = match self.config.seccomp_mode {
864            SeccompMode::Trace => {
865                audit(
866                    &self.config.id,
867                    &self.config.name,
868                    AuditEventType::SeccompApplied,
869                    "seccomp trace mode: allow-all + LOG",
870                );
871                seccomp_mgr.apply_trace_filter()?
872            }
873            SeccompMode::Enforce => {
874                if let Some(ref profile_path) = self.config.seccomp_profile {
875                    audit(
876                        &self.config.id,
877                        &self.config.name,
878                        AuditEventType::SeccompProfileLoaded,
879                        format!("path={:?}", profile_path),
880                    );
881                    seccomp_mgr.apply_profile_from_file(
882                        profile_path,
883                        self.config.seccomp_profile_sha256.as_deref(),
884                        self.config.seccomp_log_denied,
885                    )?
886                } else {
887                    seccomp_mgr.apply_filter_for_network_mode(
888                        allow_network,
889                        allow_degraded_security,
890                        self.config.seccomp_log_denied,
891                        &self.config.seccomp_allow_syscalls,
892                    )?
893                }
894            }
895        };
896        if seccomp_applied {
897            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
898            audit(
899                &self.config.id,
900                &self.config.name,
901                AuditEventType::SeccompApplied,
902                format!("network={}", allow_network),
903            );
904        } else if !allow_degraded_security {
905            return Err(NucleusError::SeccompError(
906                "Seccomp filter is required but was not enforced".to_string(),
907            ));
908        } else {
909            warn!("Seccomp not enforced; container is running with degraded hardening");
910        }
911
912        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
913        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
914            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
915                policy_path,
916                self.config.landlock_policy_sha256.as_deref(),
917            )?;
918            // H4: Reject write+execute on same path in production
919            if self.config.service_mode == ServiceMode::Production {
920                policy.validate_production()?;
921            }
922            policy.apply(allow_degraded_security)?
923        } else {
924            let mut landlock_mgr = LandlockManager::new();
925            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
926            // Register volume mount destinations so Landlock permits access to them
927            for vol in &self.config.volumes {
928                landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
929            }
930            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
931        };
932        if seccomp_applied && landlock_applied {
933            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
934            if self.config.seccomp_mode == SeccompMode::Trace {
935                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
936            } else {
937                sec_state = sec_state.transition(SecurityState::Locked)?;
938            }
939            audit(
940                &self.config.id,
941                &self.config.name,
942                AuditEventType::LandlockApplied,
943                if self.config.seccomp_mode == SeccompMode::Trace {
944                    "landlock applied, but seccomp in trace mode – not locked".to_string()
945                } else {
946                    "security state locked: all hardening layers active".to_string()
947                },
948            );
949        } else if !allow_degraded_security {
950            return Err(NucleusError::LandlockError(
951                "Landlock policy is required but was not enforced".to_string(),
952            ));
953        } else {
954            warn!("Security state not locked; one or more hardening controls are inactive");
955        }
956        debug!("Security state: {:?}", sec_state);
957
958        // 14c. Block on exec FIFO until start() opens it for reading.
959        // This implements the OCI two-phase create/start: all container setup
960        // is complete, but the user process doesn't exec until explicitly started.
961        if let Some(ref fifo_path) = exec_fifo {
962            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
963            let file = std::fs::OpenOptions::new()
964                .write(true)
965                .open(fifo_path)
966                .map_err(|e| {
967                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
968                })?;
969            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
970                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
971            })?;
972            drop(file);
973            debug!("Exec FIFO released, proceeding to exec");
974        }
975
976        // 14d. Run startContainer hooks (after start signal, before user process exec)
977        if let Some(ref hooks) = self.config.hooks {
978            if !hooks.start_container.is_empty() {
979                let hook_state = OciContainerState {
980                    oci_version: "1.0.2".to_string(),
981                    id: self.config.id.clone(),
982                    status: OciStatus::Running,
983                    pid: std::process::id(),
984                    bundle: String::new(),
985                };
986                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
987            }
988        }
989
990        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
991        // that reaps zombies and forwards signals, rather than exec-ing directly.
992        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
993            return self.run_as_init();
994        }
995
996        // 15b. Agent mode: exec target process directly
997        self.exec_command()?;
998
999        // Should never reach here
1000        Ok(())
1001    }
1002
1003    /// Forward selected signals to child process using sigwait (no async signal handlers).
1004    ///
1005    /// Returns a stop flag and join handle. Set the flag to `true` and join
1006    /// the handle to cleanly shut down the forwarding thread.
1007    pub(super) fn setup_signal_forwarding_static(
1008        child: Pid,
1009    ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
1010        let mut set = SigSet::empty();
1011        for signal in [
1012            Signal::SIGTERM,
1013            Signal::SIGINT,
1014            Signal::SIGHUP,
1015            Signal::SIGQUIT,
1016            Signal::SIGUSR1,
1017            Signal::SIGUSR2,
1018        ] {
1019            set.add(signal);
1020        }
1021
1022        let unblock_set = set;
1023        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
1024            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
1025        })?;
1026
1027        let stop = Arc::new(AtomicBool::new(false));
1028        let stop_clone = stop.clone();
1029        let handle = std::thread::Builder::new()
1030            .name("sig-forward".to_string())
1031            .spawn(move || {
1032                // The thread owns unblock_set and uses it for sigwait.
1033                loop {
1034                    if let Ok(signal) = unblock_set.wait() {
1035                        // Check the stop flag *after* waking so that the
1036                        // wake-up signal (SIGUSR1) is not forwarded to the
1037                        // child during shutdown.
1038                        if stop_clone.load(Ordering::Relaxed) {
1039                            break;
1040                        }
1041                        let _ = kill(child, signal);
1042                    }
1043                }
1044            })
1045            .map_err(|e| {
1046                // Restore the signal mask so the caller isn't left with
1047                // signals permanently blocked.
1048                let mut restore = SigSet::empty();
1049                for signal in [
1050                    Signal::SIGTERM,
1051                    Signal::SIGINT,
1052                    Signal::SIGHUP,
1053                    Signal::SIGQUIT,
1054                    Signal::SIGUSR1,
1055                    Signal::SIGUSR2,
1056                ] {
1057                    restore.add(signal);
1058                }
1059                let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
1060                NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
1061            })?;
1062
1063        info!("Signal forwarding configured");
1064        Ok((stop, handle))
1065    }
1066
1067    /// Wait for child process to exit
1068    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
1069        loop {
1070            match waitpid(child, None) {
1071                Ok(WaitStatus::Exited(_, code)) => {
1072                    return Ok(code);
1073                }
1074                Ok(WaitStatus::Signaled(_, signal, _)) => {
1075                    info!("Child killed by signal: {:?}", signal);
1076                    return Ok(128 + signal as i32);
1077                }
1078                Err(nix::errno::Errno::EINTR) => {
1079                    continue;
1080                }
1081                Err(e) => {
1082                    return Err(NucleusError::ExecError(format!(
1083                        "Failed to wait for child: {}",
1084                        e
1085                    )));
1086                }
1087                _ => {
1088                    continue;
1089                }
1090            }
1091        }
1092    }
1093
1094    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1095        let mut pid_buf = [0u8; 4];
1096        loop {
1097            match read(ready_read, &mut pid_buf) {
1098                Err(nix::errno::Errno::EINTR) => continue,
1099                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1100                Ok(0) => {
1101                    return Err(NucleusError::ExecError(format!(
1102                        "Child {} exited before namespace initialization",
1103                        child
1104                    )))
1105                }
1106                Ok(_) => {
1107                    return Err(NucleusError::ExecError(
1108                        "Invalid namespace sync payload from child".to_string(),
1109                    ))
1110                }
1111                Err(e) => {
1112                    return Err(NucleusError::ExecError(format!(
1113                        "Failed waiting for child namespace setup: {}",
1114                        e
1115                    )))
1116                }
1117            }
1118        }
1119    }
1120
1121    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1122        let payload = pid.to_ne_bytes();
1123        let mut written = 0;
1124        while written < payload.len() {
1125            let n = write(fd, &payload[written..]).map_err(|e| {
1126                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1127            })?;
1128            if n == 0 {
1129                return Err(NucleusError::ExecError(
1130                    "Failed to notify namespace readiness: short write".to_string(),
1131                ));
1132            }
1133            written += n;
1134        }
1135        Ok(())
1136    }
1137
1138    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1139        loop {
1140            match waitpid(child, None) {
1141                Ok(WaitStatus::Exited(_, code)) => return code,
1142                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1143                Err(nix::errno::Errno::EINTR) => continue,
1144                Err(_) => return 1,
1145                _ => continue,
1146            }
1147        }
1148    }
1149}
1150
1151impl CreatedContainer {
1152    /// Start phase: release the child via the exec FIFO, transition to Running,
1153    /// then wait for the child to exit with full lifecycle management.
1154    pub fn start(mut self) -> Result<i32> {
1155        let config = &self.config;
1156        let _enter = self._lifecycle_span.enter();
1157
1158        // Open the exec FIFO for reading – this unblocks the child's
1159        // blocking open-for-write, allowing it to proceed to exec.
1160        if let Some(exec_fifo_path) = &self.exec_fifo_path {
1161            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
1162                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
1163            })?;
1164            let mut buf = [0u8; 1];
1165            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
1166                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
1167            })?;
1168            if read != 1 {
1169                return Err(NucleusError::ExecError(
1170                    "Exec FIFO closed before start signal was delivered".to_string(),
1171                ));
1172            }
1173            let _ = std::fs::remove_file(exec_fifo_path);
1174        }
1175
1176        // Transition: Created -> Running
1177        self.state.status = OciStatus::Running;
1178        self.state_mgr.save_state(&self.state)?;
1179
1180        let target_pid = self.state.pid;
1181        let child = self.child;
1182
1183        let (sig_stop, sig_handle) =
1184            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1185
1186        // Guard ensures signal thread is stopped on any exit path (including early ? returns).
1187        let mut sig_guard = SignalThreadGuard {
1188            stop: Some(sig_stop),
1189            handle: Some(sig_handle),
1190        };
1191
1192        // Run readiness probe before declaring service ready
1193        if let Some(ref probe) = config.readiness_probe {
1194            let notify_socket = if config.sd_notify {
1195                std::env::var("NOTIFY_SOCKET").ok()
1196            } else {
1197                None
1198            };
1199            Container::run_readiness_probe(
1200                target_pid,
1201                &config.name,
1202                probe,
1203                config.user_ns_config.is_some(),
1204                config.use_gvisor,
1205                &config.process_identity,
1206                notify_socket.as_deref(),
1207            )?;
1208        }
1209
1210        // Start health check thread if configured
1211        let cancel_flag = Arc::new(AtomicBool::new(false));
1212        let health_handle = if let Some(ref hc) = config.health_check {
1213            if !hc.command.is_empty() {
1214                let hc = hc.clone();
1215                let pid = target_pid;
1216                let container_name = config.name.clone();
1217                let rootless = config.user_ns_config.is_some();
1218                let using_gvisor = config.use_gvisor;
1219                let process_identity = config.process_identity.clone();
1220                let cancel = cancel_flag.clone();
1221                Some(std::thread::spawn(move || {
1222                    Container::health_check_loop(
1223                        pid,
1224                        &container_name,
1225                        rootless,
1226                        using_gvisor,
1227                        &hc,
1228                        &process_identity,
1229                        &cancel,
1230                    );
1231                }))
1232            } else {
1233                None
1234            }
1235        } else {
1236            None
1237        };
1238
1239        // Guard ensures health check thread is cancelled on any exit path.
1240        let mut health_guard = HealthThreadGuard {
1241            cancel: Some(cancel_flag),
1242            handle: health_handle,
1243        };
1244
1245        // Run poststart hooks (after user process started, in parent)
1246        if let Some(ref hooks) = config.hooks {
1247            if !hooks.poststart.is_empty() {
1248                let hook_state = OciContainerState {
1249                    oci_version: "1.0.2".to_string(),
1250                    id: config.id.clone(),
1251                    status: OciStatus::Running,
1252                    pid: target_pid,
1253                    bundle: String::new(),
1254                };
1255                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1256            }
1257        }
1258
1259        let mut child_waited = false;
1260        let run_result: Result<i32> = (|| {
1261            let exit_code = Container::wait_for_child_static(child)?;
1262
1263            // Transition: Running -> Stopped
1264            self.state.status = OciStatus::Stopped;
1265            let _ = self.state_mgr.save_state(&self.state);
1266
1267            child_waited = true;
1268            Ok(exit_code)
1269        })();
1270
1271        // Explicitly stop threads (guards would do this on drop too, but
1272        // explicit teardown keeps ordering visible).
1273        health_guard.stop();
1274        sig_guard.stop();
1275
1276        // Run poststop hooks (best-effort)
1277        if let Some(ref hooks) = config.hooks {
1278            if !hooks.poststop.is_empty() {
1279                let hook_state = OciContainerState {
1280                    oci_version: "1.0.2".to_string(),
1281                    id: config.id.clone(),
1282                    status: OciStatus::Stopped,
1283                    pid: 0,
1284                    bundle: String::new(),
1285                };
1286                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1287            }
1288        }
1289
1290        if let Some(net) = self.network_driver.take() {
1291            if let Err(e) = net.cleanup() {
1292                warn!("Failed to cleanup container networking: {}", e);
1293            }
1294        }
1295
1296        if !child_waited {
1297            let _ = kill(child, Signal::SIGKILL);
1298            let _ = waitpid(child, None);
1299        }
1300
1301        if let Some(reader) = self.trace_reader.take() {
1302            reader.stop_and_flush();
1303        }
1304
1305        if let Some(logger) = self.deny_logger.take() {
1306            logger.stop();
1307        }
1308
1309        if let Some(cgroup) = self.cgroup_opt.take() {
1310            if let Err(e) = cgroup.cleanup() {
1311                warn!("Failed to cleanup cgroup: {}", e);
1312            }
1313        }
1314
1315        if config.use_gvisor {
1316            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1317                warn!(
1318                    "Failed to cleanup gVisor artifacts for {}: {}",
1319                    config.id, e
1320                );
1321            }
1322        }
1323
1324        if let Err(e) = self.state_mgr.delete_state(&config.id) {
1325            warn!("Failed to delete state for {}: {}", config.id, e);
1326        }
1327
1328        match run_result {
1329            Ok(exit_code) => {
1330                audit(
1331                    &config.id,
1332                    &config.name,
1333                    AuditEventType::ContainerStop,
1334                    format!("exit_code={}", exit_code),
1335                );
1336                info!(
1337                    "Container {} ({}) exited with code {}",
1338                    config.name, config.id, exit_code
1339                );
1340                Ok(exit_code)
1341            }
1342            Err(e) => {
1343                audit_error(
1344                    &config.id,
1345                    &config.name,
1346                    AuditEventType::ContainerStop,
1347                    format!("error={}", e),
1348                );
1349                Err(e)
1350            }
1351        }
1352    }
1353}
1354
1355/// RAII guard that stops the signal-forwarding thread on drop.
1356struct SignalThreadGuard {
1357    stop: Option<Arc<AtomicBool>>,
1358    handle: Option<JoinHandle<()>>,
1359}
1360
1361impl SignalThreadGuard {
1362    fn stop(&mut self) {
1363        if let Some(flag) = self.stop.take() {
1364            flag.store(true, Ordering::Relaxed);
1365            // Unblock the sigwait() call so the thread can observe the stop flag.
1366            let _ = kill(Pid::this(), Signal::SIGUSR1);
1367        }
1368        if let Some(handle) = self.handle.take() {
1369            let _ = handle.join();
1370        }
1371    }
1372}
1373
1374impl Drop for SignalThreadGuard {
1375    fn drop(&mut self) {
1376        self.stop();
1377    }
1378}
1379
/// Holds the health-check thread's cancel flag and join handle, and cancels
/// the thread when dropped.
struct HealthThreadGuard {
    /// Cancel flag shared with the thread; `None` once cancellation was requested.
    cancel: Option<Arc<AtomicBool>>,
    /// Join handle of the thread, if one was started; `None` once joined.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Raise the cancel flag and wait for the thread (if any) to finish.
    ///
    /// Both fields are emptied, so calling this again (or dropping the guard
    /// afterwards) does nothing.
    fn stop(&mut self) {
        let cancel_flag = self.cancel.take();
        let worker = self.handle.take();

        if let Some(cancel_flag) = cancel_flag {
            cancel_flag.store(true, Ordering::Relaxed);
        }
        if let Some(worker) = worker {
            // A panic inside the worker is deliberately ignored here.
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Cancels the thread if `stop` has not already been called.
    fn drop(&mut self) {
        Self::stop(self);
    }
}
1402
1403#[cfg(test)]
1404mod tests {
1405    use super::*;
1406    use crate::container::KernelLockdownMode;
1407    use crate::network::NetworkMode;
1408    use std::ffi::OsString;
1409
1410    struct EnvVarGuard {
1411        key: &'static str,
1412        previous: Option<OsString>,
1413    }
1414
1415    impl EnvVarGuard {
1416        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
1417            let previous = std::env::var_os(key);
1418            std::env::set_var(key, value);
1419            Self { key, previous }
1420        }
1421    }
1422
1423    impl Drop for EnvVarGuard {
1424        fn drop(&mut self) {
1425            match &self.previous {
1426                Some(value) => std::env::set_var(self.key, value),
1427                None => std::env::remove_var(self.key),
1428            }
1429        }
1430    }
1431
1432    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
1433        let fn_start = source
1434            .find(fn_signature)
1435            .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
1436        let after = &source[fn_start..];
1437        let open = after
1438            .find('{')
1439            .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
1440        let mut depth = 0u32;
1441        let mut end = open;
1442        for (i, ch) in after[open..].char_indices() {
1443            match ch {
1444                '{' => depth += 1,
1445                '}' => {
1446                    depth -= 1;
1447                    if depth == 0 {
1448                        end = open + i + 1;
1449                        break;
1450                    }
1451                }
1452                _ => {}
1453            }
1454        }
1455        &after[..end]
1456    }
1457
1458    #[test]
1459    fn test_container_config() {
1460        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1461        assert!(!config.id.is_empty());
1462        assert_eq!(config.command, vec!["/bin/sh"]);
1463        assert!(config.use_gvisor);
1464    }
1465
1466    #[test]
1467    fn test_run_uses_immediate_start_path() {
1468        let source = include_str!("runtime.rs");
1469        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
1470        let after = &source[fn_start..];
1471        let open = after.find('{').unwrap();
1472        let mut depth = 0u32;
1473        let mut fn_end = open;
1474        for (i, ch) in after[open..].char_indices() {
1475            match ch {
1476                '{' => depth += 1,
1477                '}' => {
1478                    depth -= 1;
1479                    if depth == 0 {
1480                        fn_end = open + i + 1;
1481                        break;
1482                    }
1483                }
1484                _ => {}
1485            }
1486        }
1487        let run_body = &after[..fn_end];
1488        assert!(
1489            run_body.contains("create_internal(false)"),
1490            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
1491        );
1492        assert!(
1493            !run_body.contains("self.create()?.start()"),
1494            "run() must not route through create()+start()"
1495        );
1496    }
1497
1498    #[test]
1499    fn test_container_config_with_name() {
1500        let config =
1501            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1502                .unwrap();
1503        assert_eq!(config.name, "mycontainer");
1504        assert!(!config.id.is_empty());
1505        assert_ne!(config.id, config.name);
1506    }
1507
1508    #[test]
1509    fn test_allow_degraded_security_requires_explicit_config() {
1510        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1511        assert!(!Container::allow_degraded_security(&strict));
1512
1513        let relaxed = strict.clone().with_allow_degraded_security(true);
1514        assert!(Container::allow_degraded_security(&relaxed));
1515    }
1516
1517    #[test]
1518    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1519        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1520        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1521
1522        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1523        assert!(!Container::allow_degraded_security(&strict));
1524
1525        let explicit = strict.with_allow_degraded_security(true);
1526        assert!(Container::allow_degraded_security(&explicit));
1527
1528        match prev {
1529            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1530            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1531        }
1532    }
1533
1534    #[test]
1535    fn test_host_network_requires_explicit_opt_in() {
1536        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1537            .unwrap()
1538            .with_network(NetworkMode::Host)
1539            .with_allow_host_network(false);
1540        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1541        assert!(matches!(err, NucleusError::NetworkError(_)));
1542    }
1543
1544    #[test]
1545    fn test_host_network_opt_in_disables_net_namespace() {
1546        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1547            .unwrap()
1548            .with_network(NetworkMode::Host)
1549            .with_allow_host_network(true);
1550        assert!(config.namespaces.net);
1551        Container::apply_network_mode_guards(&mut config, true).unwrap();
1552        assert!(!config.namespaces.net);
1553    }
1554
1555    #[test]
1556    fn test_non_host_network_does_not_require_host_opt_in() {
1557        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1558            .unwrap()
1559            .with_network(NetworkMode::None)
1560            .with_allow_host_network(false);
1561        assert!(config.namespaces.net);
1562        Container::apply_network_mode_guards(&mut config, true).unwrap();
1563        assert!(config.namespaces.net);
1564    }
1565
1566    #[test]
1567    fn test_parse_kernel_lockdown_mode() {
1568        assert_eq!(
1569            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
1570            Some(KernelLockdownMode::Integrity)
1571        );
1572        assert_eq!(
1573            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
1574            Some(KernelLockdownMode::Confidentiality)
1575        );
1576        assert_eq!(
1577            Container::parse_active_lockdown_mode("[none] integrity"),
1578            None
1579        );
1580    }
1581
1582    #[test]
1583    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
1584        let temp = tempfile::TempDir::new().unwrap();
1585        let source = temp.path().join("source-secret");
1586        std::fs::write(&source, "supersecret").unwrap();
1587
1588        let staged = Container::stage_gvisor_secret_files(
1589            &temp.path().join("stage"),
1590            &[crate::container::SecretMount {
1591                source: source.clone(),
1592                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1593                mode: 0o400,
1594            }],
1595            &crate::container::ProcessIdentity::root(),
1596        )
1597        .unwrap();
1598
1599        assert_eq!(staged.len(), 1);
1600        assert!(staged[0].source.starts_with(temp.path().join("stage")));
1601        assert_eq!(
1602            std::fs::read_to_string(&staged[0].source).unwrap(),
1603            "supersecret"
1604        );
1605    }
1606
1607    #[test]
1608    fn test_stage_gvisor_secret_files_rejects_symlink_source() {
1609        use std::os::unix::fs::symlink;
1610
1611        let temp = tempfile::TempDir::new().unwrap();
1612        let source = temp.path().join("source-secret");
1613        let link = temp.path().join("source-link");
1614        std::fs::write(&source, "supersecret").unwrap();
1615        symlink(&source, &link).unwrap();
1616
1617        let err = Container::stage_gvisor_secret_files(
1618            &temp.path().join("stage"),
1619            &[crate::container::SecretMount {
1620                source: link,
1621                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1622                mode: 0o400,
1623            }],
1624            &crate::container::ProcessIdentity::root(),
1625        )
1626        .unwrap_err();
1627
1628        assert!(
1629            err.to_string().contains("O_NOFOLLOW"),
1630            "gVisor secret staging must reject symlink sources"
1631        );
1632    }
1633
1634    #[test]
1635    fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
1636        let source = include_str!("runtime.rs");
1637        let fn_body = extract_fn_body(source, "fn setup_and_exec");
1638        assert!(
1639            fn_body.contains("mount_secrets_inmemory("),
1640            "setup_and_exec must use in-memory secret mounting"
1641        );
1642        assert!(
1643            !fn_body.contains("mount_secrets(&"),
1644            "setup_and_exec must not bind-mount secrets from the host"
1645        );
1646    }
1647
1648    #[test]
1649    fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
1650        let source = include_str!("gvisor_setup.rs");
1651        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
1652        assert!(
1653            fn_body.contains("with_inmemory_secret_mounts"),
1654            "gVisor setup must use the tmpfs-backed secret staging path"
1655        );
1656        assert!(
1657            !fn_body.contains("with_secret_mounts"),
1658            "gVisor setup must not bind-mount host secret paths"
1659        );
1660    }
1661
1662    #[test]
1663    fn test_native_fork_sites_assert_single_threaded() {
1664        let runtime_source = include_str!("runtime.rs");
1665        let create_body = extract_fn_body(runtime_source, "fn create_internal");
1666        assert!(
1667            create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
1668            "create_internal must assert single-threaded before fork"
1669        );
1670
1671        let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
1672        assert!(
1673            setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
1674            "PID namespace setup must assert single-threaded before fork"
1675        );
1676
1677        let exec_source = include_str!("exec.rs");
1678        let init_body = extract_fn_body(exec_source, "fn run_as_init");
1679        assert!(
1680            init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
1681            "run_as_init must assert single-threaded before fork"
1682        );
1683    }
1684
1685    #[test]
1686    fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
1687        let source = include_str!("exec.rs");
1688        let fn_body = extract_fn_body(source, "fn run_as_init");
1689        assert!(
1690            !fn_body.contains("Self::apply_process_identity_to_current_process("),
1691            "run_as_init must not drop identity before the supervisor fork"
1692        );
1693        assert!(
1694            fn_body.contains("self.exec_command()?"),
1695            "workload child must still route through exec_command for identity application"
1696        );
1697    }
1698
1699    #[test]
1700    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
1701        let temp = tempfile::TempDir::new().unwrap();
1702        let _artifact_base = EnvVarGuard::set(
1703            "NUCLEUS_GVISOR_ARTIFACT_BASE",
1704            temp.path().join("gvisor-artifacts"),
1705        );
1706        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
1707        std::fs::create_dir_all(&artifact_dir).unwrap();
1708        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
1709
1710        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
1711        assert!(!artifact_dir.exists());
1712    }
1713
1714    #[test]
1715    fn test_health_check_loop_supports_cancellation() {
1716        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
1717        // and check it between iterations for prompt shutdown.
1718        // Function lives in health.rs after the runtime split.
1719        let source = include_str!("health.rs");
1720        let fn_start = source.find("fn health_check_loop").unwrap();
1721        let fn_body = &source[fn_start..fn_start + 2500];
1722        assert!(
1723            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
1724            "health_check_loop must accept an AtomicBool cancellation flag"
1725        );
1726        // Must also check cancellation during sleep
1727        assert!(
1728            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
1729            "health_check_loop must check cancellation during sleep intervals"
1730        );
1731    }
1732
1733    #[test]
1734    fn test_runtime_probes_do_not_spawn_host_nsenter() {
1735        // Both functions live in health.rs after the runtime split.
1736        let source = include_str!("health.rs");
1737
1738        let readiness_start = source.find("fn run_readiness_probe").unwrap();
1739        let readiness_body = &source[readiness_start..readiness_start + 2500];
1740        assert!(
1741            !readiness_body.contains("Command::new(&nsenter_bin)"),
1742            "readiness probes must not execute via host nsenter"
1743        );
1744
1745        let health_start = source.find("fn health_check_loop").unwrap();
1746        let health_body = &source[health_start..health_start + 2200];
1747        assert!(
1748            !health_body.contains("Command::new(&nsenter_bin)"),
1749            "health checks must not execute via host nsenter"
1750        );
1751    }
1752
1753    #[test]
1754    fn test_oci_mount_strip_prefix_no_expect() {
1755        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
1756        // Function lives in gvisor_setup.rs after the runtime split.
1757        let source = include_str!("gvisor_setup.rs");
1758        let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
1759        let fn_body = &source[fn_start..fn_start + 600];
1760        assert!(
1761            !fn_body.contains(".expect("),
1762            "prepare_oci_mountpoints must not use expect() – return Err instead"
1763        );
1764    }
1765
1766    #[test]
1767    fn test_notify_namespace_ready_validates_write_length() {
1768        // BUG-02: notify_namespace_ready must validate that all bytes were written
1769        let source = include_str!("runtime.rs");
1770        let fn_start = source.find("fn notify_namespace_ready").unwrap();
1771        let fn_body = &source[fn_start..fn_start + 500];
1772        // Must check the return value of write() for partial writes
1773        assert!(
1774            fn_body.contains("written")
1775                || fn_body.contains("4")
1776                || fn_body.contains("payload.len()"),
1777            "notify_namespace_ready must validate complete write of all 4 bytes"
1778        );
1779    }
1780
1781    #[test]
1782    fn test_rlimit_failures_fatal_in_production() {
1783        // SEC-05: RLIMIT failures must be fatal in production mode
1784        let source = include_str!("runtime.rs");
1785        let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
1786        let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
1787        assert!(
1788            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
1789            "RLIMIT failures must return Err in production mode"
1790        );
1791    }
1792
1793    #[test]
1794    fn test_tcp_readiness_probe_uses_portable_check() {
1795        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
1796        // Function lives in health.rs after the runtime split.
1797        let source = include_str!("health.rs");
1798        let probe_fn = source.find("TcpPort(port)").unwrap();
1799        let probe_body = &source[probe_fn..probe_fn + 500];
1800        assert!(
1801            !probe_body.contains("/dev/tcp"),
1802            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
1803        );
1804    }
1805}