Skip to main content

nucleus/container/
runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4    ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10    switch_root, verify_context_manifest, verify_rootfs_attestation, FilesystemState,
11    LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks, SeccompManager,
18    SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::OwnedFd;
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// Container runtime that orchestrates all isolation mechanisms
///
/// Execution flow matches the formal specifications:
/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
/// 5. Execute target process
pub struct Container {
    /// Full container configuration (namespaces, limits, network, security,
    /// service mode); cloned and normalized in `create_internal` before fork.
    pub(super) config: ContainerConfig,
    /// Pre-resolved runsc path, resolved before fork so that user-namespace
    /// UID changes don't block PATH-based lookup.
    pub(super) runsc_path: Option<String>,
}
47
48/// Handle returned by `Container::create()` representing a container whose
49/// child process has been forked and is blocked on the exec FIFO, waiting for
50/// `start()` to release it.
51pub struct CreatedContainer {
52    pub(super) config: ContainerConfig,
53    pub(super) state_mgr: ContainerStateManager,
54    pub(super) state: ContainerState,
55    pub(super) child: Pid,
56    pub(super) cgroup_opt: Option<Cgroup>,
57    pub(super) bridge_net: Option<BridgeNetwork>,
58    pub(super) trace_reader: Option<SeccompTraceReader>,
59    pub(super) exec_fifo_path: Option<PathBuf>,
60    pub(super) _lifecycle_span: tracing::Span,
61}
62
63impl Container {
64    pub fn new(config: ContainerConfig) -> Self {
65        Self {
66            config,
67            runsc_path: None,
68        }
69    }
70
71    /// Run the container (convenience wrapper: create + start)
72    pub fn run(&self) -> Result<i32> {
73        self.create_internal(false)?.start()
74    }
75
76    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
77    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
78    /// releases the child process.
79    pub fn create(&self) -> Result<CreatedContainer> {
80        self.create_internal(true)
81    }
82
83    /// H6: Close all file descriptors > 2 in the child process after fork.
84    ///
85    /// This prevents leaking host sockets, pipes, and state files into the
86    /// container. Uses close_range(2) when available, falls back to /proc/self/fd.
87    fn sanitize_fds() {
88        // Try close_range(3, u32::MAX, CLOSE_RANGE_CLOEXEC) first — it's
89        // O(1) on Linux 5.9+ and marks all FDs as close-on-exec.
90        const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
91        // SAFETY: close_range is a safe syscall that marks FDs as close-on-exec.
92        let ret =
93            unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
94        if ret == 0 {
95            return;
96        }
97        // Fallback: iterate /proc/self/fd and close individually
98        if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
99            for entry in entries.flatten() {
100                if let Ok(fd_str) = entry.file_name().into_string() {
101                    if let Ok(fd) = fd_str.parse::<i32>() {
102                        if fd > 2 {
103                            unsafe { libc::close(fd) };
104                        }
105                    }
106                }
107            }
108        }
109    }
110
111    pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
112        let thread_count = std::fs::read_to_string("/proc/self/status")
113            .ok()
114            .and_then(|s| {
115                s.lines()
116                    .find(|line| line.starts_with("Threads:"))
117                    .and_then(|line| line.split_whitespace().nth(1))
118                    .and_then(|count| count.parse::<u32>().ok())
119            });
120
121        if thread_count == Some(1) {
122            return Ok(());
123        }
124
125        Err(NucleusError::ExecError(format!(
126            "{} requires a single-threaded process before fork, found {:?} threads",
127            context, thread_count
128        )))
129    }
130
    /// Shared implementation behind `run()` and `create()`.
    ///
    /// Normalizes the config (rootless auto-detection, production UID
    /// remapping, network degradation), creates the cgroup, forks the child,
    /// then completes parent-side setup: state file, PID file, cgroup attach,
    /// and bridge networking. When `defer_exec_until_start` is true an exec
    /// FIFO is created so the child blocks until `start()`/`trigger_start()`.
    ///
    /// # Errors
    /// Any parent-side failure after fork kills the forked child before
    /// returning, so no orphaned process is left blocked on the exec FIFO.
    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
        let lifecycle_span = info_span!(
            "container.lifecycle",
            container.id = %self.config.id,
            container.name = %self.config.name,
            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
        );
        let _enter = lifecycle_span.enter();

        info!(
            "Creating container: {} (ID: {})",
            self.config.name, self.config.id
        );
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::ContainerStart,
            format!(
                "command={:?} mode={:?} runtime={}",
                crate::audit::redact_command(&self.config.command),
                self.config.service_mode,
                if self.config.use_gvisor {
                    "gvisor"
                } else {
                    "native"
                }
            ),
        );

        // Auto-detect if we need rootless mode
        let is_root = nix::unistd::Uid::effective().is_root();
        let mut config = self.config.clone();

        if !is_root && config.user_ns_config.is_none() {
            info!("Not running as root, automatically enabling rootless mode");
            config.namespaces.user = true;
            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
        }

        // C2: When running as root without user namespace, enable UID remapping
        // in production mode (mandatory) or warn in other modes. Without user
        // namespace, a container escape yields full host root.
        if is_root && !config.namespaces.user {
            if config.service_mode == ServiceMode::Production {
                info!("Running as root in production mode: enabling user namespace with UID remapping");
                config.namespaces.user = true;
                config.user_ns_config =
                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
            } else {
                warn!(
                    "Running as root WITHOUT user namespace isolation. \
                     Container processes will run as real host UID 0. \
                     Use --user-ns or production mode for UID remapping."
                );
            }
        }

        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
        if let Some(ref socket_path) = config.console_socket {
            warn!(
                "Console socket {} accepted but terminal forwarding is not yet implemented",
                socket_path.display()
            );
        }

        // Validate production mode invariants before anything else.
        config.validate_production_mode()?;
        Self::assert_kernel_lockdown(&config)?;

        Self::apply_network_mode_guards(&mut config, is_root)?;
        Self::apply_trust_level_guards(&mut config)?;
        config.validate_runtime_support()?;

        // Bridge networking requires root
        if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
            if config.service_mode == ServiceMode::Production {
                return Err(NucleusError::NetworkError(
                    "Production mode with bridge networking requires root (cannot silently \
                     degrade to no networking)"
                        .to_string(),
                ));
            }
            warn!("Bridge networking requires root, degrading to no networking");
            config.network = NetworkMode::None;
        }

        // Create state manager, honoring --root override if set
        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;

        // Enforce name uniqueness among running containers
        if let Ok(all_states) = state_mgr.list_states() {
            if all_states.iter().any(|s| s.name == config.name) {
                return Err(NucleusError::ConfigError(format!(
                    "A container named '{}' already exists; use a different --name, \
                     or remove the stale state with 'nucleus delete'",
                    config.name
                )));
            }
        }

        // Create exec FIFO only for the two-phase create/start lifecycle.
        // `run()` starts immediately and avoids this cross-root-path sync.
        // NOTE(review): if a later pre-fork step fails (e.g. cgroup creation
        // in production mode), this FIFO is not removed — confirm stale FIFOs
        // are cleaned up elsewhere (e.g. by 'nucleus delete').
        let exec_fifo = if defer_exec_until_start {
            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create exec FIFO {:?}: {}",
                    exec_fifo, e
                ))
            })?;
            Some(exec_fifo)
        } else {
            None
        };

        // Try to create cgroup (optional for rootless mode)
        let cgroup_name = format!("nucleus-{}", config.id);
        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
            Ok(mut cgroup) => {
                // Try to set limits
                match cgroup.set_limits(&config.limits) {
                    Ok(_) => {
                        info!("Created cgroup with resource limits");
                        Some(cgroup)
                    }
                    Err(e) => {
                        // Production mode treats unenforced limits as fatal;
                        // other modes degrade to running without a cgroup.
                        if config.service_mode == ServiceMode::Production {
                            let _ = cgroup.cleanup();
                            return Err(NucleusError::CgroupError(format!(
                                "Production mode requires cgroup resource enforcement, but \
                                 applying limits failed: {}",
                                e
                            )));
                        }
                        warn!("Failed to set cgroup limits: {}", e);
                        let _ = cgroup.cleanup();
                        None
                    }
                }
            }
            Err(e) => {
                if config.service_mode == ServiceMode::Production {
                    return Err(NucleusError::CgroupError(format!(
                        "Production mode requires cgroup resource enforcement, but \
                         cgroup creation failed: {}",
                        e
                    )));
                }

                // Rootless: cgroup creation is expected to fail; only warn if
                // the user explicitly asked for limits we now cannot enforce.
                if config.user_ns_config.is_some() {
                    if config.limits.memory_bytes.is_some()
                        || config.limits.cpu_quota_us.is_some()
                        || config.limits.pids_max.is_some()
                    {
                        warn!(
                            "Running in rootless mode: requested resource limits cannot be \
                             enforced – cgroup creation requires root ({})",
                            e
                        );
                    } else {
                        debug!("Running in rootless mode without cgroup resource limits");
                    }
                } else {
                    warn!(
                        "Failed to create cgroup (running without resource limits): {}",
                        e
                    );
                }
                None
            }
        };

        // Resolve runsc path before fork, while still unprivileged.
        let runsc_path = if config.use_gvisor {
            Some(GVisorRuntime::resolve_path().map_err(|e| {
                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
            })?)
        } else {
            None
        };

        // Child notifies parent after namespaces are ready.
        let (ready_read, ready_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
        })?;

        // M11: fork() in multi-threaded context. Flush log buffers and drop
        // tracing guards before fork to minimize deadlock risk from locks held
        // by other threads (tracing, allocator). The Tokio runtime is not yet
        // started at this point, so async thread contention is not a concern.
        Self::assert_single_threaded_for_fork("container create fork")?;
        // SAFETY: fork() is called before any Tokio runtime is created.
        // Only the main thread should be active at this point.
        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                drop(ready_write);
                info!("Forked child process: {}", child);

                // Use a closure so that on any error we kill the child process
                // instead of leaving it orphaned and blocked on the exec FIFO.
                let parent_setup = || -> Result<CreatedContainer> {
                    // Blocks until the child reports the workload PID over the
                    // sync pipe (may differ from `child` when a PID namespace
                    // init is forked inside).
                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;

                    let cgroup_path = cgroup_opt
                        .as_ref()
                        .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
                    // NOTE(review): assumes cpu_period_us != 0 — confirm this
                    // is validated upstream (e.g. in validate_production_mode).
                    let cpu_millicores = config
                        .limits
                        .cpu_quota_us
                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
                    let mut state = ContainerState::new(ContainerStateParams {
                        id: config.id.clone(),
                        name: config.name.clone(),
                        pid: target_pid,
                        command: config.command.clone(),
                        memory_limit: config.limits.memory_bytes,
                        cpu_limit: cpu_millicores,
                        using_gvisor: config.use_gvisor,
                        rootless: config.user_ns_config.is_some(),
                        cgroup_path,
                        process_uid: config.process_identity.uid,
                        process_gid: config.process_identity.gid,
                        additional_gids: config.process_identity.additional_gids.clone(),
                    });
                    state.config_hash = config.config_hash;
                    state.bundle_path =
                        config.rootfs_path.as_ref().map(|p| p.display().to_string());

                    let mut bridge_net: Option<BridgeNetwork> = None;
                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;

                    // Transition: Creating -> Created
                    state.status = OciStatus::Created;
                    state_mgr.save_state(&state)?;

                    // Write PID file (OCI --pid-file)
                    if let Some(ref pid_path) = config.pid_file {
                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                            NucleusError::ConfigError(format!(
                                "Failed to write pid-file '{}': {}",
                                pid_path.display(),
                                e
                            ))
                        })?;
                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
                    }

                    // Attach the workload to the cgroup before releasing it,
                    // so limits apply from the first instruction of the exec'd
                    // process.
                    if let Some(ref mut cgroup) = cgroup_opt {
                        cgroup.attach_process(target_pid)?;
                    }

                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
                        match BridgeNetwork::setup_with_id(target_pid, bridge_config, &config.id) {
                            Ok(net) => {
                                if let Some(ref egress) = config.egress_policy {
                                    if let Err(e) = net.apply_egress_policy(target_pid, egress) {
                                        if config.service_mode == ServiceMode::Production {
                                            return Err(NucleusError::NetworkError(format!(
                                                "Failed to apply egress policy: {}",
                                                e
                                            )));
                                        }
                                        warn!("Failed to apply egress policy: {}", e);
                                    }
                                }
                                bridge_net = Some(net);
                            }
                            Err(e) => {
                                if config.service_mode == ServiceMode::Production {
                                    return Err(e);
                                }
                                warn!("Failed to set up bridge networking: {}", e);
                            }
                        }
                    }

                    info!(
                        "Container {} created (child pid {}), waiting for start",
                        config.id, target_pid
                    );

                    Ok(CreatedContainer {
                        config,
                        state_mgr,
                        state,
                        child,
                        cgroup_opt,
                        bridge_net,
                        trace_reader,
                        exec_fifo_path: exec_fifo,
                        _lifecycle_span: lifecycle_span.clone(),
                    })
                };

                parent_setup().map_err(|e| {
                    // Kill the child so it doesn't remain orphaned and blocked
                    // on the exec FIFO.
                    let _ = kill(child, Signal::SIGKILL);
                    let _ = waitpid(child, None);
                    e
                })
            }
            ForkResult::Child => {
                drop(ready_read);
                // H6: Close inherited FDs > 2 to prevent leaking host sockets/pipes
                // NOTE(review): sanitize_fds's /proc fallback closes FDs
                // outright, which would include ready_write used just below —
                // confirm the close_range (CLOEXEC-only) path is what keeps
                // the sync pipe alive here.
                Self::sanitize_fds();
                let temp_container = Container { config, runsc_path };
                match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
                    // setup_and_exec only returns on failure; success exec()s.
                    Ok(_) => unreachable!(),
                    Err(e) => {
                        error!("Container setup failed: {}", e);
                        std::process::exit(1);
                    }
                }
            }
        }
    }
448
    /// Trigger a previously-created container to start by opening its exec FIFO.
    /// Used by the CLI `start` command.
    ///
    /// Opening the FIFO's read side unblocks the child (which is blocked on
    /// open-for-write); the 1-byte read then synchronizes with the child's
    /// write so the Running transition below is recorded only after the child
    /// actually proceeds.
    ///
    /// # Errors
    /// Returns `ConfigError` if no FIFO exists (container not in 'created'
    /// state), and `ExecError` if the FIFO cannot be opened or read.
    pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
        let state_mgr = ContainerStateManager::new_with_root(state_root)?;
        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
        if !fifo_path.exists() {
            return Err(NucleusError::ConfigError(format!(
                "No exec FIFO found for container {}; is it in 'created' state?",
                container_id
            )));
        }

        // Opening the FIFO for reading unblocks the child's open-for-write.
        let file = std::fs::File::open(&fifo_path)
            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
        let mut buf = [0u8; 1];
        // NOTE(review): a 0-byte read (writer closed without writing) is also
        // Ok and treated as success — confirm the child always writes a byte.
        std::io::Read::read(&mut &file, &mut buf)
            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
        drop(file);

        // Best-effort cleanup; a leftover FIFO is harmless after start.
        let _ = std::fs::remove_file(&fifo_path);

        // Update state to Running
        let mut state = state_mgr.resolve_container(container_id)?;
        state.status = OciStatus::Running;
        state_mgr.save_state(&state)?;

        Ok(())
    }
478
479    /// Set up container environment and exec target process
480    ///
481    /// This runs in the child process after fork.
482    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
483    fn setup_and_exec(
484        &self,
485        ready_pipe: Option<OwnedFd>,
486        exec_fifo: Option<PathBuf>,
487    ) -> Result<()> {
488        let is_rootless = self.config.user_ns_config.is_some();
489        let allow_degraded_security = Self::allow_degraded_security(&self.config);
490        let context_manifest = if self.config.verify_context_integrity {
491            self.config
492                .context_dir
493                .as_ref()
494                .map(|dir| snapshot_context_dir(dir))
495                .transpose()?
496        } else {
497            None
498        };
499
500        // Initialize state machines
501        let mut fs_state = FilesystemState::Unmounted;
502        let mut sec_state = SecurityState::Privileged;
503
504        // gVisor is the runtime that should create the container's namespaces.
505        // Running runsc after pre-unsharing our own namespaces breaks its gofer
506        // re-exec path on some systems and duplicates the OCI namespace config.
507        if self.config.use_gvisor {
508            if let Some(fd) = ready_pipe {
509                Self::notify_namespace_ready(&fd, std::process::id())?;
510            }
511            return self.setup_and_exec_gvisor();
512        }
513
514        // 1. Create namespaces in child and optionally configure user mapping.
515        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
516        if let Some(user_config) = &self.config.user_ns_config {
517            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
518        }
519        namespace_mgr.unshare_namespaces()?;
520
521        // CLONE_NEWPID only applies to children created after unshare().
522        // Create a child that will become PID 1 in the new namespace and exec the workload.
523        if self.config.namespaces.pid {
524            Self::assert_single_threaded_for_fork("PID namespace init fork")?;
525            match unsafe { fork() }? {
526                ForkResult::Parent { child } => {
527                    if let Some(fd) = ready_pipe {
528                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
529                    }
530                    std::process::exit(Self::wait_for_pid_namespace_child(child));
531                }
532                ForkResult::Child => {
533                    // Continue container setup as PID 1 in the new namespace.
534                }
535            }
536        } else if let Some(fd) = ready_pipe {
537            Self::notify_namespace_ready(&fd, std::process::id())?;
538        }
539
540        // Namespace: Unshared -> Entered (process is now inside all namespaces)
541        namespace_mgr.enter()?;
542
543        // 2. Ensure no_new_privs BEFORE any mount operations.
544        // This prevents exploitation of setuid binaries on bind-mounted paths
545        // even if a subsequent MS_NOSUID remount fails.
546        self.enforce_no_new_privs()?;
547        audit(
548            &self.config.id,
549            &self.config.name,
550            AuditEventType::NoNewPrivsSet,
551            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
552        );
553
554        // 3. Set hostname if UTS namespace is enabled
555        if let Some(hostname) = &self.config.hostname {
556            namespace_mgr.set_hostname(hostname)?;
557        }
558
559        // 4. Mount tmpfs as container root
560        // Filesystem: Unmounted -> Mounted
561        // Use a private runtime directory instead of /tmp to avoid symlink
562        // attacks and information disclosure on multi-user systems.
563        let runtime_base = if nix::unistd::Uid::effective().is_root() {
564            std::path::PathBuf::from("/run/nucleus")
565        } else {
566            dirs::runtime_dir()
567                .map(|d| d.join("nucleus"))
568                .unwrap_or_else(std::env::temp_dir)
569        };
570        let _ = std::fs::create_dir_all(&runtime_base);
571        let runtime_dir = Builder::new()
572            .prefix("nucleus-runtime-")
573            .tempdir_in(&runtime_base)
574            .map_err(|e| {
575                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
576            })?;
577        let container_root = runtime_dir.path().to_path_buf();
578        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
579        tmpfs.mount()?;
580        fs_state = fs_state.transition(FilesystemState::Mounted)?;
581
582        // 4. Create minimal filesystem structure
583        create_minimal_fs(&container_root)?;
584
585        // 5. Create device nodes
586        let dev_path = container_root.join("dev");
587        create_dev_nodes(&dev_path, false)?;
588
589        // 6. Populate context if provided
590        // Filesystem: Mounted -> Populated
591        if let Some(context_dir) = &self.config.context_dir {
592            let context_dest = container_root.join("context");
593            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
594            if let Some(expected) = &context_manifest {
595                verify_context_manifest(expected, &context_dest)?;
596            }
597        }
598        fs_state = fs_state.transition(FilesystemState::Populated)?;
599
600        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
601        if let Some(ref rootfs_path) = self.config.rootfs_path {
602            if self.config.verify_rootfs_attestation {
603                verify_rootfs_attestation(rootfs_path)?;
604            }
605            bind_mount_rootfs(&container_root, rootfs_path)?;
606        } else {
607            bind_mount_host_paths(&container_root, is_rootless)?;
608        }
609
610        // 7b. Mount persistent or ephemeral volumes over the base filesystem.
611        mount_volumes(&container_root, &self.config.volumes)?;
612
613        // 7c. Write resolv.conf for bridge networking.
614        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
615        // resolv.conf over the top (same technique as secrets).
616        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
617            if self.config.rootfs_path.is_some() {
618                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
619            } else {
620                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
621            }
622        }
623
624        // 7d. Mount secrets on an in-memory tmpfs in all modes.
625        mount_secrets_inmemory(
626            &container_root,
627            &self.config.secrets,
628            &self.config.process_identity,
629        )?;
630
631        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
632        let proc_path = container_root.join("proc");
633        let hide_pids = self.config.service_mode == ServiceMode::Production;
634        mount_procfs(
635            &proc_path,
636            is_rootless,
637            self.config.proc_readonly,
638            hide_pids,
639        )?;
640
641        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
642        // SEC-06: In production mode, failures to mask critical paths are fatal.
643        mask_proc_paths(
644            &proc_path,
645            self.config.service_mode == ServiceMode::Production,
646        )?;
647
648        // 9c. Run createRuntime hooks (after namespaces created, before pivot_root)
649        if let Some(ref hooks) = self.config.hooks {
650            if !hooks.create_runtime.is_empty() {
651                let hook_state = OciContainerState {
652                    oci_version: "1.0.2".to_string(),
653                    id: self.config.id.clone(),
654                    status: OciStatus::Creating,
655                    pid: std::process::id(),
656                    bundle: String::new(),
657                };
658                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
659            }
660        }
661
662        // 10. Switch root filesystem
663        // Filesystem: Populated -> Pivoted
664        switch_root(&container_root, self.config.allow_chroot_fallback)?;
665        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
666        debug!("Filesystem state: {:?}", fs_state);
667
668        // 10b. Audit mount flags to verify filesystem hardening invariants
669        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
670        audit(
671            &self.config.id,
672            &self.config.name,
673            AuditEventType::MountAuditPassed,
674            "all mount flags verified",
675        );
676
677        // 10c. Run createContainer hooks (after pivot_root, before start)
678        if let Some(ref hooks) = self.config.hooks {
679            if !hooks.create_container.is_empty() {
680                let hook_state = OciContainerState {
681                    oci_version: "1.0.2".to_string(),
682                    id: self.config.id.clone(),
683                    status: OciStatus::Created,
684                    pid: std::process::id(),
685                    bundle: String::new(),
686                };
687                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
688            }
689        }
690
691        // 11. Drop capabilities (from policy file or default drop-all)
692        // Security: Privileged -> CapabilitiesDropped
693        let mut cap_mgr = CapabilityManager::new();
694        if let Some(ref policy_path) = self.config.caps_policy {
695            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
696                policy_path,
697                self.config.caps_policy_sha256.as_deref(),
698            )?;
699            // H3: Reject dangerous capabilities in production mode
700            if self.config.service_mode == ServiceMode::Production {
701                policy.validate_production()?;
702            }
703            policy.apply(&mut cap_mgr)?;
704            audit(
705                &self.config.id,
706                &self.config.name,
707                AuditEventType::CapabilitiesDropped,
708                format!("capability policy applied from {:?}", policy_path),
709            );
710        } else {
711            cap_mgr.drop_all()?;
712            audit(
713                &self.config.id,
714                &self.config.name,
715                AuditEventType::CapabilitiesDropped,
716                "all capabilities dropped including bounding set",
717            );
718        }
719        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
720
721        // 12b. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
722        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
723        // SEC-05: In production mode, RLIMIT failures are fatal — a container
724        // without resource limits is a privilege escalation vector.
725        {
726            let is_production = self.config.service_mode == ServiceMode::Production;
727
728            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
729            let rlim_nproc = libc::rlimit {
730                rlim_cur: nproc_limit,
731                rlim_max: nproc_limit,
732            };
733            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
734            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
735                let err = std::io::Error::last_os_error();
736                if is_production {
737                    return Err(NucleusError::SeccompError(format!(
738                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
739                        nproc_limit, err
740                    )));
741                }
742                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
743            }
744
745            let rlim_nofile = libc::rlimit {
746                rlim_cur: 1024,
747                rlim_max: 1024,
748            };
749            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
750            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
751                let err = std::io::Error::last_os_error();
752                if is_production {
753                    return Err(NucleusError::SeccompError(format!(
754                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
755                        err
756                    )));
757                }
758                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
759            }
760
761            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
762            // memory via mlock(). Default 64KB matches unprivileged default, but
763            // in a user namespace the container appears as UID 0 and may have a
764            // higher inherited limit.
765            let memlock_limit: u64 = 64 * 1024; // 64KB
766            let rlim_memlock = libc::rlimit {
767                rlim_cur: memlock_limit,
768                rlim_max: memlock_limit,
769            };
770            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
771            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
772                let err = std::io::Error::last_os_error();
773                if is_production {
774                    return Err(NucleusError::SeccompError(format!(
775                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
776                        memlock_limit, err
777                    )));
778                }
779                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
780            }
781        }
782
783        // 12c. Verify that namespace-creating capabilities are truly gone before
784        // installing seccomp. clone3 is allowed without argument filtering, so this
785        // is the sole guard against namespace escape via clone3.
786        CapabilityManager::verify_no_namespace_caps(
787            self.config.service_mode == ServiceMode::Production,
788        )?;
789
790        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
791        // Security: CapabilitiesDropped -> SeccompApplied
792        use crate::container::config::SeccompMode;
793        let mut seccomp_mgr = SeccompManager::new();
794        let allow_network = !matches!(self.config.network, NetworkMode::None);
795        let seccomp_applied = match self.config.seccomp_mode {
796            SeccompMode::Trace => {
797                audit(
798                    &self.config.id,
799                    &self.config.name,
800                    AuditEventType::SeccompApplied,
801                    "seccomp trace mode: allow-all + LOG",
802                );
803                seccomp_mgr.apply_trace_filter()?
804            }
805            SeccompMode::Enforce => {
806                if let Some(ref profile_path) = self.config.seccomp_profile {
807                    audit(
808                        &self.config.id,
809                        &self.config.name,
810                        AuditEventType::SeccompProfileLoaded,
811                        format!("path={:?}", profile_path),
812                    );
813                    seccomp_mgr.apply_profile_from_file(
814                        profile_path,
815                        self.config.seccomp_profile_sha256.as_deref(),
816                        self.config.seccomp_log_denied,
817                    )?
818                } else {
819                    seccomp_mgr.apply_filter_for_network_mode(
820                        allow_network,
821                        allow_degraded_security,
822                        self.config.seccomp_log_denied,
823                    )?
824                }
825            }
826        };
827        if seccomp_applied {
828            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
829            audit(
830                &self.config.id,
831                &self.config.name,
832                AuditEventType::SeccompApplied,
833                format!("network={}", allow_network),
834            );
835        } else if !allow_degraded_security {
836            return Err(NucleusError::SeccompError(
837                "Seccomp filter is required but was not enforced".to_string(),
838            ));
839        } else {
840            warn!("Seccomp not enforced; container is running with degraded hardening");
841        }
842
843        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
844        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
845            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
846                policy_path,
847                self.config.landlock_policy_sha256.as_deref(),
848            )?;
849            // H4: Reject write+execute on same path in production
850            if self.config.service_mode == ServiceMode::Production {
851                policy.validate_production()?;
852            }
853            policy.apply(allow_degraded_security)?
854        } else {
855            let mut landlock_mgr = LandlockManager::new();
856            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
857            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
858        };
859        if seccomp_applied && landlock_applied {
860            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
861            if self.config.seccomp_mode == SeccompMode::Trace {
862                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
863            } else {
864                sec_state = sec_state.transition(SecurityState::Locked)?;
865            }
866            audit(
867                &self.config.id,
868                &self.config.name,
869                AuditEventType::LandlockApplied,
870                if self.config.seccomp_mode == SeccompMode::Trace {
871                    "landlock applied, but seccomp in trace mode — not locked".to_string()
872                } else {
873                    "security state locked: all hardening layers active".to_string()
874                },
875            );
876        } else if !allow_degraded_security {
877            return Err(NucleusError::LandlockError(
878                "Landlock policy is required but was not enforced".to_string(),
879            ));
880        } else {
881            warn!("Security state not locked; one or more hardening controls are inactive");
882        }
883        debug!("Security state: {:?}", sec_state);
884
885        // 14c. Block on exec FIFO until start() opens it for reading.
886        // This implements the OCI two-phase create/start: all container setup
887        // is complete, but the user process doesn't exec until explicitly started.
888        if let Some(ref fifo_path) = exec_fifo {
889            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
890            let file = std::fs::OpenOptions::new()
891                .write(true)
892                .open(fifo_path)
893                .map_err(|e| {
894                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
895                })?;
896            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
897                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
898            })?;
899            drop(file);
900            debug!("Exec FIFO released, proceeding to exec");
901        }
902
903        // 14d. Run startContainer hooks (after start signal, before user process exec)
904        if let Some(ref hooks) = self.config.hooks {
905            if !hooks.start_container.is_empty() {
906                let hook_state = OciContainerState {
907                    oci_version: "1.0.2".to_string(),
908                    id: self.config.id.clone(),
909                    status: OciStatus::Running,
910                    pid: std::process::id(),
911                    bundle: String::new(),
912                };
913                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
914            }
915        }
916
917        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
918        // that reaps zombies and forwards signals, rather than exec-ing directly.
919        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
920            return self.run_as_init();
921        }
922
923        // 15b. Agent mode: exec target process directly
924        self.exec_command()?;
925
926        // Should never reach here
927        Ok(())
928    }
929
930    /// Forward selected signals to child process using sigwait (no async signal handlers).
931    ///
932    /// Returns a stop flag and join handle. Set the flag to `true` and join
933    /// the handle to cleanly shut down the forwarding thread.
934    pub(super) fn setup_signal_forwarding_static(
935        child: Pid,
936    ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
937        let mut set = SigSet::empty();
938        for signal in [
939            Signal::SIGTERM,
940            Signal::SIGINT,
941            Signal::SIGHUP,
942            Signal::SIGQUIT,
943            Signal::SIGUSR1,
944            Signal::SIGUSR2,
945        ] {
946            set.add(signal);
947        }
948
949        let unblock_set = set;
950        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
951            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
952        })?;
953
954        let stop = Arc::new(AtomicBool::new(false));
955        let stop_clone = stop.clone();
956        let handle = std::thread::Builder::new()
957            .name("sig-forward".to_string())
958            .spawn(move || {
959                // The thread owns unblock_set and uses it for sigwait.
960                loop {
961                    if let Ok(signal) = unblock_set.wait() {
962                        // Check the stop flag *after* waking so that the
963                        // wake-up signal (SIGUSR1) is not forwarded to the
964                        // child during shutdown.
965                        if stop_clone.load(Ordering::Relaxed) {
966                            break;
967                        }
968                        let _ = kill(child, signal);
969                    }
970                }
971            })
972            .map_err(|e| {
973                // Restore the signal mask so the caller isn't left with
974                // signals permanently blocked.
975                let mut restore = SigSet::empty();
976                for signal in [
977                    Signal::SIGTERM,
978                    Signal::SIGINT,
979                    Signal::SIGHUP,
980                    Signal::SIGQUIT,
981                    Signal::SIGUSR1,
982                    Signal::SIGUSR2,
983                ] {
984                    restore.add(signal);
985                }
986                let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
987                NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
988            })?;
989
990        info!("Signal forwarding configured");
991        Ok((stop, handle))
992    }
993
994    /// Wait for child process to exit
995    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
996        loop {
997            match waitpid(child, None) {
998                Ok(WaitStatus::Exited(_, code)) => {
999                    return Ok(code);
1000                }
1001                Ok(WaitStatus::Signaled(_, signal, _)) => {
1002                    info!("Child killed by signal: {:?}", signal);
1003                    return Ok(128 + signal as i32);
1004                }
1005                Err(nix::errno::Errno::EINTR) => {
1006                    continue;
1007                }
1008                Err(e) => {
1009                    return Err(NucleusError::ExecError(format!(
1010                        "Failed to wait for child: {}",
1011                        e
1012                    )));
1013                }
1014                _ => {
1015                    continue;
1016                }
1017            }
1018        }
1019    }
1020
1021    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1022        let mut pid_buf = [0u8; 4];
1023        loop {
1024            match read(ready_read, &mut pid_buf) {
1025                Err(nix::errno::Errno::EINTR) => continue,
1026                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1027                Ok(0) => {
1028                    return Err(NucleusError::ExecError(format!(
1029                        "Child {} exited before namespace initialization",
1030                        child
1031                    )))
1032                }
1033                Ok(_) => {
1034                    return Err(NucleusError::ExecError(
1035                        "Invalid namespace sync payload from child".to_string(),
1036                    ))
1037                }
1038                Err(e) => {
1039                    return Err(NucleusError::ExecError(format!(
1040                        "Failed waiting for child namespace setup: {}",
1041                        e
1042                    )))
1043                }
1044            }
1045        }
1046    }
1047
1048    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1049        let payload = pid.to_ne_bytes();
1050        let mut written = 0;
1051        while written < payload.len() {
1052            let n = write(fd, &payload[written..]).map_err(|e| {
1053                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1054            })?;
1055            if n == 0 {
1056                return Err(NucleusError::ExecError(
1057                    "Failed to notify namespace readiness: short write".to_string(),
1058                ));
1059            }
1060            written += n;
1061        }
1062        Ok(())
1063    }
1064
1065    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1066        loop {
1067            match waitpid(child, None) {
1068                Ok(WaitStatus::Exited(_, code)) => return code,
1069                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1070                Err(nix::errno::Errno::EINTR) => continue,
1071                Err(_) => return 1,
1072                _ => continue,
1073            }
1074        }
1075    }
1076}
1077
1078impl CreatedContainer {
1079    /// Start phase: release the child via the exec FIFO, transition to Running,
1080    /// then wait for the child to exit with full lifecycle management.
1081    pub fn start(mut self) -> Result<i32> {
1082        let config = &self.config;
1083        let _enter = self._lifecycle_span.enter();
1084
1085        // Open the exec FIFO for reading — this unblocks the child's
1086        // blocking open-for-write, allowing it to proceed to exec.
1087        if let Some(exec_fifo_path) = &self.exec_fifo_path {
1088            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
1089                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
1090            })?;
1091            let mut buf = [0u8; 1];
1092            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
1093                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
1094            })?;
1095            if read != 1 {
1096                return Err(NucleusError::ExecError(
1097                    "Exec FIFO closed before start signal was delivered".to_string(),
1098                ));
1099            }
1100            let _ = std::fs::remove_file(exec_fifo_path);
1101        }
1102
1103        // Transition: Created -> Running
1104        self.state.status = OciStatus::Running;
1105        self.state_mgr.save_state(&self.state)?;
1106
1107        let target_pid = self.state.pid;
1108        let child = self.child;
1109
1110        let (sig_stop, sig_handle) =
1111            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1112
1113        // Guard ensures signal thread is stopped on any exit path (including early ? returns).
1114        let mut sig_guard = SignalThreadGuard {
1115            stop: Some(sig_stop),
1116            handle: Some(sig_handle),
1117        };
1118
1119        // Run readiness probe before declaring service ready
1120        if let Some(ref probe) = config.readiness_probe {
1121            let notify_socket = if config.sd_notify {
1122                std::env::var("NOTIFY_SOCKET").ok()
1123            } else {
1124                None
1125            };
1126            Container::run_readiness_probe(
1127                target_pid,
1128                &config.name,
1129                probe,
1130                config.user_ns_config.is_some(),
1131                config.use_gvisor,
1132                &config.process_identity,
1133                notify_socket.as_deref(),
1134            )?;
1135        }
1136
1137        // Start health check thread if configured
1138        let cancel_flag = Arc::new(AtomicBool::new(false));
1139        let health_handle = if let Some(ref hc) = config.health_check {
1140            if !hc.command.is_empty() {
1141                let hc = hc.clone();
1142                let pid = target_pid;
1143                let container_name = config.name.clone();
1144                let rootless = config.user_ns_config.is_some();
1145                let using_gvisor = config.use_gvisor;
1146                let process_identity = config.process_identity.clone();
1147                let cancel = cancel_flag.clone();
1148                Some(std::thread::spawn(move || {
1149                    Container::health_check_loop(
1150                        pid,
1151                        &container_name,
1152                        rootless,
1153                        using_gvisor,
1154                        &hc,
1155                        &process_identity,
1156                        &cancel,
1157                    );
1158                }))
1159            } else {
1160                None
1161            }
1162        } else {
1163            None
1164        };
1165
1166        // Guard ensures health check thread is cancelled on any exit path.
1167        let mut health_guard = HealthThreadGuard {
1168            cancel: Some(cancel_flag),
1169            handle: health_handle,
1170        };
1171
1172        // Run poststart hooks (after user process started, in parent)
1173        if let Some(ref hooks) = config.hooks {
1174            if !hooks.poststart.is_empty() {
1175                let hook_state = OciContainerState {
1176                    oci_version: "1.0.2".to_string(),
1177                    id: config.id.clone(),
1178                    status: OciStatus::Running,
1179                    pid: target_pid,
1180                    bundle: String::new(),
1181                };
1182                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1183            }
1184        }
1185
1186        let mut child_waited = false;
1187        let run_result: Result<i32> = (|| {
1188            let exit_code = Container::wait_for_child_static(child)?;
1189
1190            // Transition: Running -> Stopped
1191            self.state.status = OciStatus::Stopped;
1192            let _ = self.state_mgr.save_state(&self.state);
1193
1194            child_waited = true;
1195            Ok(exit_code)
1196        })();
1197
1198        // Explicitly stop threads (guards would do this on drop too, but
1199        // explicit teardown keeps ordering visible).
1200        health_guard.stop();
1201        sig_guard.stop();
1202
1203        // Run poststop hooks (best-effort)
1204        if let Some(ref hooks) = config.hooks {
1205            if !hooks.poststop.is_empty() {
1206                let hook_state = OciContainerState {
1207                    oci_version: "1.0.2".to_string(),
1208                    id: config.id.clone(),
1209                    status: OciStatus::Stopped,
1210                    pid: 0,
1211                    bundle: String::new(),
1212                };
1213                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1214            }
1215        }
1216
1217        if let Some(net) = self.bridge_net.take() {
1218            if let Err(e) = net.cleanup() {
1219                warn!("Failed to cleanup bridge networking: {}", e);
1220            }
1221        }
1222
1223        if !child_waited {
1224            let _ = kill(child, Signal::SIGKILL);
1225            let _ = waitpid(child, None);
1226        }
1227
1228        if let Some(reader) = self.trace_reader.take() {
1229            reader.stop_and_flush();
1230        }
1231
1232        if let Some(cgroup) = self.cgroup_opt.take() {
1233            if let Err(e) = cgroup.cleanup() {
1234                warn!("Failed to cleanup cgroup: {}", e);
1235            }
1236        }
1237
1238        if config.use_gvisor {
1239            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1240                warn!(
1241                    "Failed to cleanup gVisor artifacts for {}: {}",
1242                    config.id, e
1243                );
1244            }
1245        }
1246
1247        if let Err(e) = self.state_mgr.delete_state(&config.id) {
1248            warn!("Failed to delete state for {}: {}", config.id, e);
1249        }
1250
1251        match run_result {
1252            Ok(exit_code) => {
1253                audit(
1254                    &config.id,
1255                    &config.name,
1256                    AuditEventType::ContainerStop,
1257                    format!("exit_code={}", exit_code),
1258                );
1259                info!(
1260                    "Container {} ({}) exited with code {}",
1261                    config.name, config.id, exit_code
1262                );
1263                Ok(exit_code)
1264            }
1265            Err(e) => {
1266                audit_error(
1267                    &config.id,
1268                    &config.name,
1269                    AuditEventType::ContainerStop,
1270                    format!("error={}", e),
1271                );
1272                Err(e)
1273            }
1274        }
1275    }
1276}
1277
/// RAII guard that stops the signal-forwarding thread on drop.
struct SignalThreadGuard {
    // Stop flag shared with the forwarding thread; `take`n on first stop so
    // the wake-up signal is sent at most once.
    stop: Option<Arc<AtomicBool>>,
    // Join handle for the forwarding thread; `take`n on join so stop() is
    // idempotent.
    handle: Option<JoinHandle<()>>,
}
1283
1284impl SignalThreadGuard {
1285    fn stop(&mut self) {
1286        if let Some(flag) = self.stop.take() {
1287            flag.store(true, Ordering::Relaxed);
1288            // Unblock the sigwait() call so the thread can observe the stop flag.
1289            let _ = kill(Pid::this(), Signal::SIGUSR1);
1290        }
1291        if let Some(handle) = self.handle.take() {
1292            let _ = handle.join();
1293        }
1294    }
1295}
1296
impl Drop for SignalThreadGuard {
    // Ensures the forwarding thread is stopped even on early-return/error
    // paths; stop() is idempotent, so an explicit earlier call is safe.
    fn drop(&mut self) {
        self.stop();
    }
}
1302
/// RAII guard that cancels the health-check thread on drop.
struct HealthThreadGuard {
    // Cancellation flag polled by the health-check loop; `take`n on stop.
    cancel: Option<Arc<AtomicBool>>,
    // Join handle for the health-check thread; None when no health check
    // was configured.
    handle: Option<JoinHandle<()>>,
}
1308
1309impl HealthThreadGuard {
1310    fn stop(&mut self) {
1311        if let Some(flag) = self.cancel.take() {
1312            flag.store(true, Ordering::Relaxed);
1313        }
1314        if let Some(handle) = self.handle.take() {
1315            let _ = handle.join();
1316        }
1317    }
1318}
1319
impl Drop for HealthThreadGuard {
    // Ensures the health-check thread is cancelled even on early-return or
    // error paths; stop() is idempotent.
    fn drop(&mut self) {
        self.stop();
    }
}
1325
1326#[cfg(test)]
1327mod tests {
1328    use super::*;
1329    use crate::container::KernelLockdownMode;
1330    use crate::network::NetworkMode;
1331
1332    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
1333        let fn_start = source
1334            .find(fn_signature)
1335            .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
1336        let after = &source[fn_start..];
1337        let open = after
1338            .find('{')
1339            .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
1340        let mut depth = 0u32;
1341        let mut end = open;
1342        for (i, ch) in after[open..].char_indices() {
1343            match ch {
1344                '{' => depth += 1,
1345                '}' => {
1346                    depth -= 1;
1347                    if depth == 0 {
1348                        end = open + i + 1;
1349                        break;
1350                    }
1351                }
1352                _ => {}
1353            }
1354        }
1355        &after[..end]
1356    }
1357
1358    #[test]
1359    fn test_container_config() {
1360        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1361        assert!(!config.id.is_empty());
1362        assert_eq!(config.command, vec!["/bin/sh"]);
1363        assert!(config.use_gvisor);
1364    }
1365
1366    #[test]
1367    fn test_run_uses_immediate_start_path() {
1368        let source = include_str!("runtime.rs");
1369        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
1370        let after = &source[fn_start..];
1371        let open = after.find('{').unwrap();
1372        let mut depth = 0u32;
1373        let mut fn_end = open;
1374        for (i, ch) in after[open..].char_indices() {
1375            match ch {
1376                '{' => depth += 1,
1377                '}' => {
1378                    depth -= 1;
1379                    if depth == 0 {
1380                        fn_end = open + i + 1;
1381                        break;
1382                    }
1383                }
1384                _ => {}
1385            }
1386        }
1387        let run_body = &after[..fn_end];
1388        assert!(
1389            run_body.contains("create_internal(false)"),
1390            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
1391        );
1392        assert!(
1393            !run_body.contains("self.create()?.start()"),
1394            "run() must not route through create()+start()"
1395        );
1396    }
1397
1398    #[test]
1399    fn test_container_config_with_name() {
1400        let config =
1401            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1402                .unwrap();
1403        assert_eq!(config.name, "mycontainer");
1404        assert!(!config.id.is_empty());
1405        assert_ne!(config.id, config.name);
1406    }
1407
1408    #[test]
1409    fn test_allow_degraded_security_requires_explicit_config() {
1410        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1411        assert!(!Container::allow_degraded_security(&strict));
1412
1413        let relaxed = strict.clone().with_allow_degraded_security(true);
1414        assert!(Container::allow_degraded_security(&relaxed));
1415    }
1416
1417    #[test]
1418    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1419        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1420        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1421
1422        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1423        assert!(!Container::allow_degraded_security(&strict));
1424
1425        let explicit = strict.with_allow_degraded_security(true);
1426        assert!(Container::allow_degraded_security(&explicit));
1427
1428        match prev {
1429            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1430            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1431        }
1432    }
1433
1434    #[test]
1435    fn test_host_network_requires_explicit_opt_in() {
1436        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1437            .unwrap()
1438            .with_network(NetworkMode::Host)
1439            .with_allow_host_network(false);
1440        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1441        assert!(matches!(err, NucleusError::NetworkError(_)));
1442    }
1443
1444    #[test]
1445    fn test_host_network_opt_in_disables_net_namespace() {
1446        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1447            .unwrap()
1448            .with_network(NetworkMode::Host)
1449            .with_allow_host_network(true);
1450        assert!(config.namespaces.net);
1451        Container::apply_network_mode_guards(&mut config, true).unwrap();
1452        assert!(!config.namespaces.net);
1453    }
1454
1455    #[test]
1456    fn test_non_host_network_does_not_require_host_opt_in() {
1457        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1458            .unwrap()
1459            .with_network(NetworkMode::None)
1460            .with_allow_host_network(false);
1461        assert!(config.namespaces.net);
1462        Container::apply_network_mode_guards(&mut config, true).unwrap();
1463        assert!(config.namespaces.net);
1464    }
1465
1466    #[test]
1467    fn test_parse_kernel_lockdown_mode() {
1468        assert_eq!(
1469            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
1470            Some(KernelLockdownMode::Integrity)
1471        );
1472        assert_eq!(
1473            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
1474            Some(KernelLockdownMode::Confidentiality)
1475        );
1476        assert_eq!(
1477            Container::parse_active_lockdown_mode("[none] integrity"),
1478            None
1479        );
1480    }
1481
1482    #[test]
1483    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
1484        let temp = tempfile::TempDir::new().unwrap();
1485        let source = temp.path().join("source-secret");
1486        std::fs::write(&source, "supersecret").unwrap();
1487
1488        let staged = Container::stage_gvisor_secret_files(
1489            &temp.path().join("stage"),
1490            &[crate::container::SecretMount {
1491                source: source.clone(),
1492                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1493                mode: 0o400,
1494            }],
1495            &crate::container::ProcessIdentity::root(),
1496        )
1497        .unwrap();
1498
1499        assert_eq!(staged.len(), 1);
1500        assert!(staged[0].source.starts_with(temp.path().join("stage")));
1501        assert_eq!(
1502            std::fs::read_to_string(&staged[0].source).unwrap(),
1503            "supersecret"
1504        );
1505    }
1506
1507    #[test]
1508    fn test_stage_gvisor_secret_files_rejects_symlink_source() {
1509        use std::os::unix::fs::symlink;
1510
1511        let temp = tempfile::TempDir::new().unwrap();
1512        let source = temp.path().join("source-secret");
1513        let link = temp.path().join("source-link");
1514        std::fs::write(&source, "supersecret").unwrap();
1515        symlink(&source, &link).unwrap();
1516
1517        let err = Container::stage_gvisor_secret_files(
1518            &temp.path().join("stage"),
1519            &[crate::container::SecretMount {
1520                source: link,
1521                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1522                mode: 0o400,
1523            }],
1524            &crate::container::ProcessIdentity::root(),
1525        )
1526        .unwrap_err();
1527
1528        assert!(
1529            err.to_string().contains("O_NOFOLLOW"),
1530            "gVisor secret staging must reject symlink sources"
1531        );
1532    }
1533
1534    #[test]
1535    fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
1536        let source = include_str!("runtime.rs");
1537        let fn_body = extract_fn_body(source, "fn setup_and_exec");
1538        assert!(
1539            fn_body.contains("mount_secrets_inmemory("),
1540            "setup_and_exec must use in-memory secret mounting"
1541        );
1542        assert!(
1543            !fn_body.contains("mount_secrets(&"),
1544            "setup_and_exec must not bind-mount secrets from the host"
1545        );
1546    }
1547
1548    #[test]
1549    fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
1550        let source = include_str!("gvisor_setup.rs");
1551        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
1552        assert!(
1553            fn_body.contains("with_inmemory_secret_mounts"),
1554            "gVisor setup must use the tmpfs-backed secret staging path"
1555        );
1556        assert!(
1557            !fn_body.contains("with_secret_mounts"),
1558            "gVisor setup must not bind-mount host secret paths"
1559        );
1560    }
1561
1562    #[test]
1563    fn test_native_fork_sites_assert_single_threaded() {
1564        let runtime_source = include_str!("runtime.rs");
1565        let create_body = extract_fn_body(runtime_source, "fn create_internal");
1566        assert!(
1567            create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
1568            "create_internal must assert single-threaded before fork"
1569        );
1570
1571        let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
1572        assert!(
1573            setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
1574            "PID namespace setup must assert single-threaded before fork"
1575        );
1576
1577        let exec_source = include_str!("exec.rs");
1578        let init_body = extract_fn_body(exec_source, "fn run_as_init");
1579        assert!(
1580            init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
1581            "run_as_init must assert single-threaded before fork"
1582        );
1583    }
1584
1585    #[test]
1586    fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
1587        let source = include_str!("exec.rs");
1588        let fn_body = extract_fn_body(source, "fn run_as_init");
1589        assert!(
1590            !fn_body.contains("Self::apply_process_identity_to_current_process("),
1591            "run_as_init must not drop identity before the supervisor fork"
1592        );
1593        assert!(
1594            fn_body.contains("self.exec_command()?"),
1595            "workload child must still route through exec_command for identity application"
1596        );
1597    }
1598
1599    #[test]
1600    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
1601        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
1602        std::fs::create_dir_all(&artifact_dir).unwrap();
1603        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
1604
1605        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
1606        assert!(!artifact_dir.exists());
1607    }
1608
1609    #[test]
1610    fn test_health_check_loop_supports_cancellation() {
1611        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
1612        // and check it between iterations for prompt shutdown.
1613        // Function lives in health.rs after the runtime split.
1614        let source = include_str!("health.rs");
1615        let fn_start = source.find("fn health_check_loop").unwrap();
1616        let fn_body = &source[fn_start..fn_start + 2500];
1617        assert!(
1618            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
1619            "health_check_loop must accept an AtomicBool cancellation flag"
1620        );
1621        // Must also check cancellation during sleep
1622        assert!(
1623            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
1624            "health_check_loop must check cancellation during sleep intervals"
1625        );
1626    }
1627
1628    #[test]
1629    fn test_runtime_probes_do_not_spawn_host_nsenter() {
1630        // Both functions live in health.rs after the runtime split.
1631        let source = include_str!("health.rs");
1632
1633        let readiness_start = source.find("fn run_readiness_probe").unwrap();
1634        let readiness_body = &source[readiness_start..readiness_start + 2500];
1635        assert!(
1636            !readiness_body.contains("Command::new(&nsenter_bin)"),
1637            "readiness probes must not execute via host nsenter"
1638        );
1639
1640        let health_start = source.find("fn health_check_loop").unwrap();
1641        let health_body = &source[health_start..health_start + 2200];
1642        assert!(
1643            !health_body.contains("Command::new(&nsenter_bin)"),
1644            "health checks must not execute via host nsenter"
1645        );
1646    }
1647
1648    #[test]
1649    fn test_oci_mount_strip_prefix_no_expect() {
1650        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
1651        // Function lives in gvisor_setup.rs after the runtime split.
1652        let source = include_str!("gvisor_setup.rs");
1653        let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
1654        let fn_body = &source[fn_start..fn_start + 600];
1655        assert!(
1656            !fn_body.contains(".expect("),
1657            "prepare_oci_mountpoints must not use expect() — return Err instead"
1658        );
1659    }
1660
1661    #[test]
1662    fn test_notify_namespace_ready_validates_write_length() {
1663        // BUG-02: notify_namespace_ready must validate that all bytes were written
1664        let source = include_str!("runtime.rs");
1665        let fn_start = source.find("fn notify_namespace_ready").unwrap();
1666        let fn_body = &source[fn_start..fn_start + 500];
1667        // Must check the return value of write() for partial writes
1668        assert!(
1669            fn_body.contains("written")
1670                || fn_body.contains("4")
1671                || fn_body.contains("payload.len()"),
1672            "notify_namespace_ready must validate complete write of all 4 bytes"
1673        );
1674    }
1675
1676    #[test]
1677    fn test_rlimit_failures_fatal_in_production() {
1678        // SEC-05: RLIMIT failures must be fatal in production mode
1679        let source = include_str!("runtime.rs");
1680        let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
1681        let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
1682        assert!(
1683            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
1684            "RLIMIT failures must return Err in production mode"
1685        );
1686    }
1687
1688    #[test]
1689    fn test_tcp_readiness_probe_uses_portable_check() {
1690        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
1691        // Function lives in health.rs after the runtime split.
1692        let source = include_str!("health.rs");
1693        let probe_fn = source.find("TcpPort(port)").unwrap();
1694        let probe_body = &source[probe_fn..probe_fn + 500];
1695        assert!(
1696            !probe_body.contains("/dev/tcp"),
1697            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
1698        );
1699    }
1700}