Skip to main content

nucleus/container/
runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4    ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets, mount_secrets_inmemory, mount_volumes,
10    snapshot_context_dir, switch_root, verify_context_manifest, verify_rootfs_attestation,
11    FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks, SeccompManager,
18    SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::OwnedFd;
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// Container runtime that orchestrates all isolation mechanisms
///
/// Execution flow matches the formal specifications:
/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
/// 5. Execute target process
pub struct Container {
    /// Launch configuration as supplied by the caller. `create_internal`
    /// works on a clone of this (e.g. to auto-enable rootless mode), so the
    /// original is never mutated.
    pub(super) config: ContainerConfig,
    /// Pre-resolved runsc path, resolved before fork so that user-namespace
    /// UID changes don't block PATH-based lookup.
    pub(super) runsc_path: Option<String>,
}
47
/// Handle returned by `Container::create()` representing a container whose
/// child process has been forked and is blocked on the exec FIFO, waiting for
/// `start()` to release it.
pub struct CreatedContainer {
    /// Effective configuration (after rootless/production-mode adjustments
    /// applied during `create_internal`).
    pub(super) config: ContainerConfig,
    /// State manager used to persist and update the on-disk container state.
    pub(super) state_mgr: ContainerStateManager,
    /// Current container state; status is `OciStatus::Created` at this point.
    pub(super) state: ContainerState,
    /// PID of the forked child process (as seen by the parent).
    pub(super) child: Pid,
    /// Cgroup the child was attached to, if cgroup creation succeeded
    /// (None in rootless/degraded modes).
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network, if `NetworkMode::Bridge` setup succeeded.
    pub(super) bridge_net: Option<BridgeNetwork>,
    /// Seccomp trace reader, if one was started for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Path of the exec FIFO the child is blocked on; Some only for the
    /// two-phase create/start lifecycle (not for `run()`).
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Keeps the `container.lifecycle` tracing span alive for the lifetime
    /// of this handle.
    pub(super) _lifecycle_span: tracing::Span,
}
62
63impl Container {
64    pub fn new(config: ContainerConfig) -> Self {
65        Self {
66            config,
67            runsc_path: None,
68        }
69    }
70
71    /// Run the container (convenience wrapper: create + start)
72    pub fn run(&self) -> Result<i32> {
73        self.create_internal(false)?.start()
74    }
75
76    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
77    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
78    /// releases the child process.
79    pub fn create(&self) -> Result<CreatedContainer> {
80        self.create_internal(true)
81    }
82
83    /// H6: Close all file descriptors > 2 in the child process after fork.
84    ///
85    /// This prevents leaking host sockets, pipes, and state files into the
86    /// container. Uses close_range(2) when available, falls back to /proc/self/fd.
87    fn sanitize_fds() {
88        // Try close_range(3, u32::MAX, CLOSE_RANGE_CLOEXEC) first — it's
89        // O(1) on Linux 5.9+ and marks all FDs as close-on-exec.
90        const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
91        // SAFETY: close_range is a safe syscall that marks FDs as close-on-exec.
92        let ret =
93            unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
94        if ret == 0 {
95            return;
96        }
97        // Fallback: iterate /proc/self/fd and close individually
98        if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
99            for entry in entries.flatten() {
100                if let Ok(fd_str) = entry.file_name().into_string() {
101                    if let Ok(fd) = fd_str.parse::<i32>() {
102                        if fd > 2 {
103                            unsafe { libc::close(fd) };
104                        }
105                    }
106                }
107            }
108        }
109    }
110
    /// Shared implementation behind `run()` (`defer_exec_until_start == false`)
    /// and `create()` (`defer_exec_until_start == true`).
    ///
    /// Validates and adjusts the configuration, creates the exec FIFO (two-phase
    /// lifecycle only), sets up the cgroup, then forks. The parent performs
    /// state persistence, cgroup attach and bridge networking; the child
    /// proceeds into `setup_and_exec`. On any parent-side error the child is
    /// killed so it cannot remain orphaned on the exec FIFO.
    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
        let lifecycle_span = info_span!(
            "container.lifecycle",
            container.id = %self.config.id,
            container.name = %self.config.name,
            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
        );
        let _enter = lifecycle_span.enter();

        info!(
            "Creating container: {} (ID: {})",
            self.config.name, self.config.id
        );
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::ContainerStart,
            format!(
                "command={:?} mode={:?} runtime={}",
                crate::audit::redact_command(&self.config.command),
                self.config.service_mode,
                if self.config.use_gvisor {
                    "gvisor"
                } else {
                    "native"
                }
            ),
        );

        // Auto-detect if we need rootless mode
        let is_root = nix::unistd::Uid::effective().is_root();
        // Work on a clone so the caller's `self.config` is never mutated.
        let mut config = self.config.clone();

        if !is_root && config.user_ns_config.is_none() {
            info!("Not running as root, automatically enabling rootless mode");
            config.namespaces.user = true;
            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
        }

        // C2: When running as root without user namespace, enable UID remapping
        // in production mode (mandatory) or warn in other modes. Without user
        // namespace, a container escape yields full host root.
        if is_root && !config.namespaces.user {
            if config.service_mode == ServiceMode::Production {
                info!("Running as root in production mode: enabling user namespace with UID remapping");
                config.namespaces.user = true;
                config.user_ns_config =
                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
            } else {
                warn!(
                    "Running as root WITHOUT user namespace isolation. \
                     Container processes will run as real host UID 0. \
                     Use --user-ns or production mode for UID remapping."
                );
            }
        }

        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
        if let Some(ref socket_path) = config.console_socket {
            warn!(
                "Console socket {} accepted but terminal forwarding is not yet implemented",
                socket_path.display()
            );
        }

        // Validate production mode invariants before anything else.
        config.validate_production_mode()?;
        Self::assert_kernel_lockdown(&config)?;

        Self::apply_network_mode_guards(&mut config, is_root)?;
        Self::apply_trust_level_guards(&mut config)?;
        config.validate_runtime_support()?;

        // Bridge networking requires root
        if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
            if config.service_mode == ServiceMode::Production {
                return Err(NucleusError::NetworkError(
                    "Production mode with bridge networking requires root (cannot silently \
                     degrade to no networking)"
                        .to_string(),
                ));
            }
            warn!("Bridge networking requires root, degrading to no networking");
            config.network = NetworkMode::None;
        }

        // Create state manager, honoring --root override if set
        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;

        // Enforce name uniqueness among running containers
        if let Ok(all_states) = state_mgr.list_states() {
            if all_states.iter().any(|s| s.name == config.name) {
                return Err(NucleusError::ConfigError(format!(
                    "A container named '{}' already exists; use a different --name, \
                     or remove the stale state with 'nucleus delete'",
                    config.name
                )));
            }
        }

        // Create exec FIFO only for the two-phase create/start lifecycle.
        // `run()` starts immediately and avoids this cross-root-path sync.
        // NOTE: the FIFO must exist before fork so the child can block on it.
        let exec_fifo = if defer_exec_until_start {
            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create exec FIFO {:?}: {}",
                    exec_fifo, e
                ))
            })?;
            Some(exec_fifo)
        } else {
            None
        };

        // Try to create cgroup (optional for rootless mode)
        let cgroup_name = format!("nucleus-{}", config.id);
        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
            Ok(mut cgroup) => {
                // Try to set limits
                match cgroup.set_limits(&config.limits) {
                    Ok(_) => {
                        info!("Created cgroup with resource limits");
                        Some(cgroup)
                    }
                    Err(e) => {
                        // Production requires enforced limits; clean up the
                        // half-configured cgroup before bailing out.
                        if config.service_mode == ServiceMode::Production {
                            let _ = cgroup.cleanup();
                            return Err(NucleusError::CgroupError(format!(
                                "Production mode requires cgroup resource enforcement, but \
                                 applying limits failed: {}",
                                e
                            )));
                        }
                        warn!("Failed to set cgroup limits: {}", e);
                        let _ = cgroup.cleanup();
                        None
                    }
                }
            }
            Err(e) => {
                if config.service_mode == ServiceMode::Production {
                    return Err(NucleusError::CgroupError(format!(
                        "Production mode requires cgroup resource enforcement, but \
                         cgroup creation failed: {}",
                        e
                    )));
                }

                // Non-production: degrade gracefully, but warn loudly if the
                // user explicitly requested limits we cannot enforce.
                if config.user_ns_config.is_some() {
                    if config.limits.memory_bytes.is_some()
                        || config.limits.cpu_quota_us.is_some()
                        || config.limits.pids_max.is_some()
                    {
                        warn!(
                            "Running in rootless mode: requested resource limits cannot be \
                             enforced – cgroup creation requires root ({})",
                            e
                        );
                    } else {
                        debug!("Running in rootless mode without cgroup resource limits");
                    }
                } else {
                    warn!(
                        "Failed to create cgroup (running without resource limits): {}",
                        e
                    );
                }
                None
            }
        };

        // Resolve runsc path before fork, while still unprivileged.
        let runsc_path = if config.use_gvisor {
            Some(GVisorRuntime::resolve_path().map_err(|e| {
                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
            })?)
        } else {
            None
        };

        // Child notifies parent after namespaces are ready.
        let (ready_read, ready_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
        })?;

        // M11: fork() in multi-threaded context. Flush log buffers and drop
        // tracing guards before fork to minimize deadlock risk from locks held
        // by other threads (tracing, allocator). The Tokio runtime is not yet
        // started at this point, so async thread contention is not a concern.
        // SAFETY: fork() is called before any Tokio runtime is created.
        // Only the main thread should be active at this point.
        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                // Parent keeps only the read end of the sync pipe.
                drop(ready_write);
                info!("Forked child process: {}", child);

                // Use a closure so that on any error we kill the child process
                // instead of leaving it orphaned and blocked on the exec FIFO.
                let parent_setup = || -> Result<CreatedContainer> {
                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;

                    let cgroup_path = cgroup_opt
                        .as_ref()
                        .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
                    // Convert CPU quota (µs per period) into millicores for
                    // the state record: quota * 1000 / period.
                    let cpu_millicores = config
                        .limits
                        .cpu_quota_us
                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
                    let mut state = ContainerState::new(ContainerStateParams {
                        id: config.id.clone(),
                        name: config.name.clone(),
                        pid: target_pid,
                        command: config.command.clone(),
                        memory_limit: config.limits.memory_bytes,
                        cpu_limit: cpu_millicores,
                        using_gvisor: config.use_gvisor,
                        rootless: config.user_ns_config.is_some(),
                        cgroup_path,
                        process_uid: config.process_identity.uid,
                        process_gid: config.process_identity.gid,
                        additional_gids: config.process_identity.additional_gids.clone(),
                    });
                    state.config_hash = config.config_hash;
                    state.bundle_path =
                        config.rootfs_path.as_ref().map(|p| p.display().to_string());

                    let mut bridge_net: Option<BridgeNetwork> = None;
                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;

                    // Transition: Creating -> Created
                    state.status = OciStatus::Created;
                    state_mgr.save_state(&state)?;

                    // Write PID file (OCI --pid-file)
                    if let Some(ref pid_path) = config.pid_file {
                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                            NucleusError::ConfigError(format!(
                                "Failed to write pid-file '{}': {}",
                                pid_path.display(),
                                e
                            ))
                        })?;
                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
                    }

                    if let Some(ref mut cgroup) = cgroup_opt {
                        cgroup.attach_process(target_pid)?;
                    }

                    // Bridge networking failures are fatal in production,
                    // best-effort otherwise.
                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
                        match BridgeNetwork::setup_with_id(target_pid, bridge_config, &config.id) {
                            Ok(net) => {
                                if let Some(ref egress) = config.egress_policy {
                                    if let Err(e) = net.apply_egress_policy(target_pid, egress) {
                                        if config.service_mode == ServiceMode::Production {
                                            return Err(NucleusError::NetworkError(format!(
                                                "Failed to apply egress policy: {}",
                                                e
                                            )));
                                        }
                                        warn!("Failed to apply egress policy: {}", e);
                                    }
                                }
                                bridge_net = Some(net);
                            }
                            Err(e) => {
                                if config.service_mode == ServiceMode::Production {
                                    return Err(e);
                                }
                                warn!("Failed to set up bridge networking: {}", e);
                            }
                        }
                    }

                    info!(
                        "Container {} created (child pid {}), waiting for start",
                        config.id, target_pid
                    );

                    Ok(CreatedContainer {
                        config,
                        state_mgr,
                        state,
                        child,
                        cgroup_opt,
                        bridge_net,
                        trace_reader,
                        exec_fifo_path: exec_fifo,
                        _lifecycle_span: lifecycle_span.clone(),
                    })
                };

                parent_setup().map_err(|e| {
                    // Kill the child so it doesn't remain orphaned and blocked
                    // on the exec FIFO.
                    let _ = kill(child, Signal::SIGKILL);
                    let _ = waitpid(child, None);
                    e
                })
            }
            ForkResult::Child => {
                // Child keeps only the write end of the sync pipe.
                drop(ready_read);
                // H6: Close inherited FDs > 2 to prevent leaking host sockets/pipes
                Self::sanitize_fds();
                let temp_container = Container { config, runsc_path };
                match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
                    // setup_and_exec only returns on error (success execs).
                    Ok(_) => unreachable!(),
                    Err(e) => {
                        error!("Container setup failed: {}", e);
                        std::process::exit(1);
                    }
                }
            }
        }
    }
427
428    /// Trigger a previously-created container to start by opening its exec FIFO.
429    /// Used by the CLI `start` command.
430    pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
431        let state_mgr = ContainerStateManager::new_with_root(state_root)?;
432        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
433        if !fifo_path.exists() {
434            return Err(NucleusError::ConfigError(format!(
435                "No exec FIFO found for container {}; is it in 'created' state?",
436                container_id
437            )));
438        }
439
440        // Opening the FIFO for reading unblocks the child's open-for-write.
441        let file = std::fs::File::open(&fifo_path)
442            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
443        let mut buf = [0u8; 1];
444        std::io::Read::read(&mut &file, &mut buf)
445            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
446        drop(file);
447
448        let _ = std::fs::remove_file(&fifo_path);
449
450        // Update state to Running
451        let mut state = state_mgr.resolve_container(container_id)?;
452        state.status = OciStatus::Running;
453        state_mgr.save_state(&state)?;
454
455        Ok(())
456    }
457
458    /// Set up container environment and exec target process
459    ///
460    /// This runs in the child process after fork.
461    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
462    fn setup_and_exec(
463        &self,
464        ready_pipe: Option<OwnedFd>,
465        exec_fifo: Option<PathBuf>,
466    ) -> Result<()> {
467        let is_rootless = self.config.user_ns_config.is_some();
468        let allow_degraded_security = Self::allow_degraded_security(&self.config);
469        let context_manifest = if self.config.verify_context_integrity {
470            self.config
471                .context_dir
472                .as_ref()
473                .map(|dir| snapshot_context_dir(dir))
474                .transpose()?
475        } else {
476            None
477        };
478
479        // Initialize state machines
480        let mut fs_state = FilesystemState::Unmounted;
481        let mut sec_state = SecurityState::Privileged;
482
483        // gVisor is the runtime that should create the container's namespaces.
484        // Running runsc after pre-unsharing our own namespaces breaks its gofer
485        // re-exec path on some systems and duplicates the OCI namespace config.
486        if self.config.use_gvisor {
487            if let Some(fd) = ready_pipe {
488                Self::notify_namespace_ready(&fd, std::process::id())?;
489            }
490            return self.setup_and_exec_gvisor();
491        }
492
493        // 1. Create namespaces in child and optionally configure user mapping.
494        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
495        if let Some(user_config) = &self.config.user_ns_config {
496            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
497        }
498        namespace_mgr.unshare_namespaces()?;
499
500        // CLONE_NEWPID only applies to children created after unshare().
501        // Create a child that will become PID 1 in the new namespace and exec the workload.
502        if self.config.namespaces.pid {
503            match unsafe { fork() }? {
504                ForkResult::Parent { child } => {
505                    if let Some(fd) = ready_pipe {
506                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
507                    }
508                    std::process::exit(Self::wait_for_pid_namespace_child(child));
509                }
510                ForkResult::Child => {
511                    // Continue container setup as PID 1 in the new namespace.
512                }
513            }
514        } else if let Some(fd) = ready_pipe {
515            Self::notify_namespace_ready(&fd, std::process::id())?;
516        }
517
518        // Namespace: Unshared -> Entered (process is now inside all namespaces)
519        namespace_mgr.enter()?;
520
521        // 2. Ensure no_new_privs BEFORE any mount operations.
522        // This prevents exploitation of setuid binaries on bind-mounted paths
523        // even if a subsequent MS_NOSUID remount fails.
524        self.enforce_no_new_privs()?;
525        audit(
526            &self.config.id,
527            &self.config.name,
528            AuditEventType::NoNewPrivsSet,
529            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
530        );
531
532        // 3. Set hostname if UTS namespace is enabled
533        if let Some(hostname) = &self.config.hostname {
534            namespace_mgr.set_hostname(hostname)?;
535        }
536
537        // 4. Mount tmpfs as container root
538        // Filesystem: Unmounted -> Mounted
539        // Use a private runtime directory instead of /tmp to avoid symlink
540        // attacks and information disclosure on multi-user systems.
541        let runtime_base = if nix::unistd::Uid::effective().is_root() {
542            std::path::PathBuf::from("/run/nucleus")
543        } else {
544            dirs::runtime_dir()
545                .map(|d| d.join("nucleus"))
546                .unwrap_or_else(std::env::temp_dir)
547        };
548        let _ = std::fs::create_dir_all(&runtime_base);
549        let runtime_dir = Builder::new()
550            .prefix("nucleus-runtime-")
551            .tempdir_in(&runtime_base)
552            .map_err(|e| {
553                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
554            })?;
555        let container_root = runtime_dir.path().to_path_buf();
556        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
557        tmpfs.mount()?;
558        fs_state = fs_state.transition(FilesystemState::Mounted)?;
559
560        // 4. Create minimal filesystem structure
561        create_minimal_fs(&container_root)?;
562
563        // 5. Create device nodes
564        let dev_path = container_root.join("dev");
565        create_dev_nodes(&dev_path, false)?;
566
567        // 6. Populate context if provided
568        // Filesystem: Mounted -> Populated
569        if let Some(context_dir) = &self.config.context_dir {
570            let context_dest = container_root.join("context");
571            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
572            if let Some(expected) = &context_manifest {
573                verify_context_manifest(expected, &context_dest)?;
574            }
575        }
576        fs_state = fs_state.transition(FilesystemState::Populated)?;
577
578        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
579        if let Some(ref rootfs_path) = self.config.rootfs_path {
580            if self.config.verify_rootfs_attestation {
581                verify_rootfs_attestation(rootfs_path)?;
582            }
583            bind_mount_rootfs(&container_root, rootfs_path)?;
584        } else {
585            bind_mount_host_paths(&container_root, is_rootless)?;
586        }
587
588        // 7b. Mount persistent or ephemeral volumes over the base filesystem.
589        mount_volumes(&container_root, &self.config.volumes)?;
590
591        // 7c. Write resolv.conf for bridge networking.
592        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
593        // resolv.conf over the top (same technique as secrets).
594        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
595            if self.config.rootfs_path.is_some() {
596                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
597            } else {
598                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
599            }
600        }
601
602        // 7d. Mount secrets (in-memory tmpfs for production, bind-mount for agent mode)
603        if self.config.service_mode == ServiceMode::Production {
604            mount_secrets_inmemory(
605                &container_root,
606                &self.config.secrets,
607                &self.config.process_identity,
608            )?;
609        } else {
610            mount_secrets(&container_root, &self.config.secrets)?;
611        }
612
613        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
614        let proc_path = container_root.join("proc");
615        let hide_pids = self.config.service_mode == ServiceMode::Production;
616        mount_procfs(
617            &proc_path,
618            is_rootless,
619            self.config.proc_readonly,
620            hide_pids,
621        )?;
622
623        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
624        // SEC-06: In production mode, failures to mask critical paths are fatal.
625        mask_proc_paths(
626            &proc_path,
627            self.config.service_mode == ServiceMode::Production,
628        )?;
629
630        // 9c. Run createRuntime hooks (after namespaces created, before pivot_root)
631        if let Some(ref hooks) = self.config.hooks {
632            if !hooks.create_runtime.is_empty() {
633                let hook_state = OciContainerState {
634                    oci_version: "1.0.2".to_string(),
635                    id: self.config.id.clone(),
636                    status: OciStatus::Creating,
637                    pid: std::process::id(),
638                    bundle: String::new(),
639                };
640                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
641            }
642        }
643
644        // 10. Switch root filesystem
645        // Filesystem: Populated -> Pivoted
646        switch_root(&container_root, self.config.allow_chroot_fallback)?;
647        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
648        debug!("Filesystem state: {:?}", fs_state);
649
650        // 10b. Audit mount flags to verify filesystem hardening invariants
651        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
652        audit(
653            &self.config.id,
654            &self.config.name,
655            AuditEventType::MountAuditPassed,
656            "all mount flags verified",
657        );
658
659        // 10c. Run createContainer hooks (after pivot_root, before start)
660        if let Some(ref hooks) = self.config.hooks {
661            if !hooks.create_container.is_empty() {
662                let hook_state = OciContainerState {
663                    oci_version: "1.0.2".to_string(),
664                    id: self.config.id.clone(),
665                    status: OciStatus::Created,
666                    pid: std::process::id(),
667                    bundle: String::new(),
668                };
669                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
670            }
671        }
672
673        // 11. Drop capabilities (from policy file or default drop-all)
674        // Security: Privileged -> CapabilitiesDropped
675        let mut cap_mgr = CapabilityManager::new();
676        if let Some(ref policy_path) = self.config.caps_policy {
677            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
678                policy_path,
679                self.config.caps_policy_sha256.as_deref(),
680            )?;
681            // H3: Reject dangerous capabilities in production mode
682            if self.config.service_mode == ServiceMode::Production {
683                policy.validate_production()?;
684            }
685            policy.apply(&mut cap_mgr)?;
686            audit(
687                &self.config.id,
688                &self.config.name,
689                AuditEventType::CapabilitiesDropped,
690                format!("capability policy applied from {:?}", policy_path),
691            );
692        } else {
693            cap_mgr.drop_all()?;
694            audit(
695                &self.config.id,
696                &self.config.name,
697                AuditEventType::CapabilitiesDropped,
698                "all capabilities dropped including bounding set",
699            );
700        }
701        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
702
703        // 12b. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
704        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
705        // SEC-05: In production mode, RLIMIT failures are fatal — a container
706        // without resource limits is a privilege escalation vector.
707        {
708            let is_production = self.config.service_mode == ServiceMode::Production;
709
710            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
711            let rlim_nproc = libc::rlimit {
712                rlim_cur: nproc_limit,
713                rlim_max: nproc_limit,
714            };
715            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
716            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
717                let err = std::io::Error::last_os_error();
718                if is_production {
719                    return Err(NucleusError::SeccompError(format!(
720                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
721                        nproc_limit, err
722                    )));
723                }
724                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
725            }
726
727            let rlim_nofile = libc::rlimit {
728                rlim_cur: 1024,
729                rlim_max: 1024,
730            };
731            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
732            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
733                let err = std::io::Error::last_os_error();
734                if is_production {
735                    return Err(NucleusError::SeccompError(format!(
736                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
737                        err
738                    )));
739                }
740                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
741            }
742
743            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
744            // memory via mlock(). Default 64KB matches unprivileged default, but
745            // in a user namespace the container appears as UID 0 and may have a
746            // higher inherited limit.
747            let memlock_limit: u64 = 64 * 1024; // 64KB
748            let rlim_memlock = libc::rlimit {
749                rlim_cur: memlock_limit,
750                rlim_max: memlock_limit,
751            };
752            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
753            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
754                let err = std::io::Error::last_os_error();
755                if is_production {
756                    return Err(NucleusError::SeccompError(format!(
757                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
758                        memlock_limit, err
759                    )));
760                }
761                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
762            }
763        }
764
765        // 12c. Verify that namespace-creating capabilities are truly gone before
766        // installing seccomp. clone3 is allowed without argument filtering, so this
767        // is the sole guard against namespace escape via clone3.
768        CapabilityManager::verify_no_namespace_caps(
769            self.config.service_mode == ServiceMode::Production,
770        )?;
771
772        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
773        // Security: CapabilitiesDropped -> SeccompApplied
774        use crate::container::config::SeccompMode;
775        let mut seccomp_mgr = SeccompManager::new();
776        let allow_network = !matches!(self.config.network, NetworkMode::None);
777        let seccomp_applied = match self.config.seccomp_mode {
778            SeccompMode::Trace => {
779                audit(
780                    &self.config.id,
781                    &self.config.name,
782                    AuditEventType::SeccompApplied,
783                    "seccomp trace mode: allow-all + LOG",
784                );
785                seccomp_mgr.apply_trace_filter()?
786            }
787            SeccompMode::Enforce => {
788                if let Some(ref profile_path) = self.config.seccomp_profile {
789                    audit(
790                        &self.config.id,
791                        &self.config.name,
792                        AuditEventType::SeccompProfileLoaded,
793                        format!("path={:?}", profile_path),
794                    );
795                    seccomp_mgr.apply_profile_from_file(
796                        profile_path,
797                        self.config.seccomp_profile_sha256.as_deref(),
798                        self.config.seccomp_log_denied,
799                    )?
800                } else {
801                    seccomp_mgr.apply_filter_for_network_mode(
802                        allow_network,
803                        allow_degraded_security,
804                        self.config.seccomp_log_denied,
805                    )?
806                }
807            }
808        };
809        if seccomp_applied {
810            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
811            audit(
812                &self.config.id,
813                &self.config.name,
814                AuditEventType::SeccompApplied,
815                format!("network={}", allow_network),
816            );
817        } else if !allow_degraded_security {
818            return Err(NucleusError::SeccompError(
819                "Seccomp filter is required but was not enforced".to_string(),
820            ));
821        } else {
822            warn!("Seccomp not enforced; container is running with degraded hardening");
823        }
824
825        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
826        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
827            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
828                policy_path,
829                self.config.landlock_policy_sha256.as_deref(),
830            )?;
831            // H4: Reject write+execute on same path in production
832            if self.config.service_mode == ServiceMode::Production {
833                policy.validate_production()?;
834            }
835            policy.apply(allow_degraded_security)?
836        } else {
837            let mut landlock_mgr = LandlockManager::new();
838            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
839            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
840        };
841        if seccomp_applied && landlock_applied {
842            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
843            if self.config.seccomp_mode == SeccompMode::Trace {
844                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
845            } else {
846                sec_state = sec_state.transition(SecurityState::Locked)?;
847            }
848            audit(
849                &self.config.id,
850                &self.config.name,
851                AuditEventType::LandlockApplied,
852                if self.config.seccomp_mode == SeccompMode::Trace {
853                    "landlock applied, but seccomp in trace mode — not locked".to_string()
854                } else {
855                    "security state locked: all hardening layers active".to_string()
856                },
857            );
858        } else if !allow_degraded_security {
859            return Err(NucleusError::LandlockError(
860                "Landlock policy is required but was not enforced".to_string(),
861            ));
862        } else {
863            warn!("Security state not locked; one or more hardening controls are inactive");
864        }
865        debug!("Security state: {:?}", sec_state);
866
867        // 14c. Block on exec FIFO until start() opens it for reading.
868        // This implements the OCI two-phase create/start: all container setup
869        // is complete, but the user process doesn't exec until explicitly started.
870        if let Some(ref fifo_path) = exec_fifo {
871            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
872            let file = std::fs::OpenOptions::new()
873                .write(true)
874                .open(fifo_path)
875                .map_err(|e| {
876                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
877                })?;
878            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
879                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
880            })?;
881            drop(file);
882            debug!("Exec FIFO released, proceeding to exec");
883        }
884
885        // 14d. Run startContainer hooks (after start signal, before user process exec)
886        if let Some(ref hooks) = self.config.hooks {
887            if !hooks.start_container.is_empty() {
888                let hook_state = OciContainerState {
889                    oci_version: "1.0.2".to_string(),
890                    id: self.config.id.clone(),
891                    status: OciStatus::Running,
892                    pid: std::process::id(),
893                    bundle: String::new(),
894                };
895                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
896            }
897        }
898
899        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
900        // that reaps zombies and forwards signals, rather than exec-ing directly.
901        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
902            return self.run_as_init();
903        }
904
905        // 15b. Agent mode: exec target process directly
906        self.exec_command()?;
907
908        // Should never reach here
909        Ok(())
910    }
911
912    /// Forward selected signals to child process using sigwait (no async signal handlers).
913    ///
914    /// Returns a stop flag and join handle. Set the flag to `true` and join
915    /// the handle to cleanly shut down the forwarding thread.
916    pub(super) fn setup_signal_forwarding_static(
917        child: Pid,
918    ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
919        let mut set = SigSet::empty();
920        for signal in [
921            Signal::SIGTERM,
922            Signal::SIGINT,
923            Signal::SIGHUP,
924            Signal::SIGQUIT,
925            Signal::SIGUSR1,
926            Signal::SIGUSR2,
927        ] {
928            set.add(signal);
929        }
930
931        let unblock_set = set;
932        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
933            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
934        })?;
935
936        let stop = Arc::new(AtomicBool::new(false));
937        let stop_clone = stop.clone();
938        let handle = std::thread::Builder::new()
939            .name("sig-forward".to_string())
940            .spawn(move || {
941                // The thread owns unblock_set and uses it for sigwait.
942                loop {
943                    if let Ok(signal) = unblock_set.wait() {
944                        // Check the stop flag *after* waking so that the
945                        // wake-up signal (SIGUSR1) is not forwarded to the
946                        // child during shutdown.
947                        if stop_clone.load(Ordering::Relaxed) {
948                            break;
949                        }
950                        let _ = kill(child, signal);
951                    }
952                }
953            })
954            .map_err(|e| {
955                // Restore the signal mask so the caller isn't left with
956                // signals permanently blocked.
957                let mut restore = SigSet::empty();
958                for signal in [
959                    Signal::SIGTERM,
960                    Signal::SIGINT,
961                    Signal::SIGHUP,
962                    Signal::SIGQUIT,
963                    Signal::SIGUSR1,
964                    Signal::SIGUSR2,
965                ] {
966                    restore.add(signal);
967                }
968                let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
969                NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
970            })?;
971
972        info!("Signal forwarding configured");
973        Ok((stop, handle))
974    }
975
976    /// Wait for child process to exit
977    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
978        loop {
979            match waitpid(child, None) {
980                Ok(WaitStatus::Exited(_, code)) => {
981                    return Ok(code);
982                }
983                Ok(WaitStatus::Signaled(_, signal, _)) => {
984                    info!("Child killed by signal: {:?}", signal);
985                    return Ok(128 + signal as i32);
986                }
987                Err(nix::errno::Errno::EINTR) => {
988                    continue;
989                }
990                Err(e) => {
991                    return Err(NucleusError::ExecError(format!(
992                        "Failed to wait for child: {}",
993                        e
994                    )));
995                }
996                _ => {
997                    continue;
998                }
999            }
1000        }
1001    }
1002
1003    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1004        let mut pid_buf = [0u8; 4];
1005        loop {
1006            match read(ready_read, &mut pid_buf) {
1007                Err(nix::errno::Errno::EINTR) => continue,
1008                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1009                Ok(0) => {
1010                    return Err(NucleusError::ExecError(format!(
1011                        "Child {} exited before namespace initialization",
1012                        child
1013                    )))
1014                }
1015                Ok(_) => {
1016                    return Err(NucleusError::ExecError(
1017                        "Invalid namespace sync payload from child".to_string(),
1018                    ))
1019                }
1020                Err(e) => {
1021                    return Err(NucleusError::ExecError(format!(
1022                        "Failed waiting for child namespace setup: {}",
1023                        e
1024                    )))
1025                }
1026            }
1027        }
1028    }
1029
1030    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1031        let payload = pid.to_ne_bytes();
1032        let mut written = 0;
1033        while written < payload.len() {
1034            let n = write(fd, &payload[written..]).map_err(|e| {
1035                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1036            })?;
1037            if n == 0 {
1038                return Err(NucleusError::ExecError(
1039                    "Failed to notify namespace readiness: short write".to_string(),
1040                ));
1041            }
1042            written += n;
1043        }
1044        Ok(())
1045    }
1046
1047    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1048        loop {
1049            match waitpid(child, None) {
1050                Ok(WaitStatus::Exited(_, code)) => return code,
1051                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1052                Err(nix::errno::Errno::EINTR) => continue,
1053                Err(_) => return 1,
1054                _ => continue,
1055            }
1056        }
1057    }
1058}
1059
impl CreatedContainer {
    /// Start phase: release the child via the exec FIFO, transition to Running,
    /// then wait for the child to exit with full lifecycle management.
    ///
    /// Returns the child's exit code. On any exit path (including early `?`
    /// returns) the signal-forwarding and health-check threads are torn down
    /// by their RAII guards, and the cleanup cascade below still runs once
    /// the child has been reaped.
    pub fn start(mut self) -> Result<i32> {
        let config = &self.config;
        let _enter = self._lifecycle_span.enter();

        // Open the exec FIFO for reading — this unblocks the child's
        // blocking open-for-write, allowing it to proceed to exec.
        if let Some(exec_fifo_path) = &self.exec_fifo_path {
            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
            })?;
            let mut buf = [0u8; 1];
            // NOTE(review): a single read() is not retried on EINTR; if that
            // ever surfaces in practice, read_exact would retry — confirm.
            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
            })?;
            // The child writes exactly one sync byte; EOF (0) means it died
            // or closed the FIFO before signaling readiness to exec.
            if read != 1 {
                return Err(NucleusError::ExecError(
                    "Exec FIFO closed before start signal was delivered".to_string(),
                ));
            }
            // FIFO is single-use; best-effort removal.
            let _ = std::fs::remove_file(exec_fifo_path);
        }

        // Transition: Created -> Running
        self.state.status = OciStatus::Running;
        self.state_mgr.save_state(&self.state)?;

        // target_pid is the container process recorded in state (used for
        // signal forwarding and probes); child is the direct fork child we
        // must reap.
        let target_pid = self.state.pid;
        let child = self.child;

        let (sig_stop, sig_handle) =
            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;

        // Guard ensures signal thread is stopped on any exit path (including early ? returns).
        let mut sig_guard = SignalThreadGuard {
            stop: Some(sig_stop),
            handle: Some(sig_handle),
        };

        // Run readiness probe before declaring service ready
        if let Some(ref probe) = config.readiness_probe {
            // sd_notify integration: only pick up NOTIFY_SOCKET when the
            // config opts in, so we never signal systemd unintentionally.
            let notify_socket = if config.sd_notify {
                std::env::var("NOTIFY_SOCKET").ok()
            } else {
                None
            };
            Container::run_readiness_probe(
                target_pid,
                &config.name,
                probe,
                config.user_ns_config.is_some(),
                config.use_gvisor,
                &config.process_identity,
                notify_socket.as_deref(),
            )?;
        }

        // Start health check thread if configured
        let cancel_flag = Arc::new(AtomicBool::new(false));
        let health_handle = if let Some(ref hc) = config.health_check {
            if !hc.command.is_empty() {
                // Clone everything the thread needs; the loop runs until the
                // cancel flag is raised by HealthThreadGuard below.
                let hc = hc.clone();
                let pid = target_pid;
                let container_name = config.name.clone();
                let rootless = config.user_ns_config.is_some();
                let using_gvisor = config.use_gvisor;
                let process_identity = config.process_identity.clone();
                let cancel = cancel_flag.clone();
                Some(std::thread::spawn(move || {
                    Container::health_check_loop(
                        pid,
                        &container_name,
                        rootless,
                        using_gvisor,
                        &hc,
                        &process_identity,
                        &cancel,
                    );
                }))
            } else {
                None
            }
        } else {
            None
        };

        // Guard ensures health check thread is cancelled on any exit path.
        let mut health_guard = HealthThreadGuard {
            cancel: Some(cancel_flag),
            handle: health_handle,
        };

        // Run poststart hooks (after user process started, in parent)
        if let Some(ref hooks) = config.hooks {
            if !hooks.poststart.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Running,
                    pid: target_pid,
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
            }
        }

        // Wait for exit inside a closure so a failure mid-wait still falls
        // through to the teardown sequence below; child_waited tells the
        // cleanup code whether the child still needs to be reaped.
        let mut child_waited = false;
        let run_result: Result<i32> = (|| {
            let exit_code = Container::wait_for_child_static(child)?;

            // Transition: Running -> Stopped
            self.state.status = OciStatus::Stopped;
            let _ = self.state_mgr.save_state(&self.state);

            child_waited = true;
            Ok(exit_code)
        })();

        // Explicitly stop threads (guards would do this on drop too, but
        // explicit teardown keeps ordering visible).
        health_guard.stop();
        sig_guard.stop();

        // Run poststop hooks (best-effort)
        if let Some(ref hooks) = config.hooks {
            if !hooks.poststop.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Stopped,
                    // Process is gone at this point, so the OCI state
                    // reports pid 0.
                    pid: 0,
                    bundle: String::new(),
                };
                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
            }
        }

        // Cleanup cascade: each step is best-effort and logged, so a failure
        // in one does not prevent the others from running.
        if let Some(net) = self.bridge_net.take() {
            if let Err(e) = net.cleanup() {
                warn!("Failed to cleanup bridge networking: {}", e);
            }
        }

        // If the wait closure bailed early, make sure the child does not
        // outlive us and does not linger as a zombie.
        if !child_waited {
            let _ = kill(child, Signal::SIGKILL);
            let _ = waitpid(child, None);
        }

        if let Some(reader) = self.trace_reader.take() {
            reader.stop_and_flush();
        }

        if let Some(cgroup) = self.cgroup_opt.take() {
            if let Err(e) = cgroup.cleanup() {
                warn!("Failed to cleanup cgroup: {}", e);
            }
        }

        if config.use_gvisor {
            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
                warn!(
                    "Failed to cleanup gVisor artifacts for {}: {}",
                    config.id, e
                );
            }
        }

        if let Err(e) = self.state_mgr.delete_state(&config.id) {
            warn!("Failed to delete state for {}: {}", config.id, e);
        }

        // Audit the final outcome last, after all cleanup has been attempted.
        match run_result {
            Ok(exit_code) => {
                audit(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("exit_code={}", exit_code),
                );
                info!(
                    "Container {} ({}) exited with code {}",
                    config.name, config.id, exit_code
                );
                Ok(exit_code)
            }
            Err(e) => {
                audit_error(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("error={}", e),
                );
                Err(e)
            }
        }
    }
}
1259
1260/// RAII guard that stops the signal-forwarding thread on drop.
1261struct SignalThreadGuard {
1262    stop: Option<Arc<AtomicBool>>,
1263    handle: Option<JoinHandle<()>>,
1264}
1265
1266impl SignalThreadGuard {
1267    fn stop(&mut self) {
1268        if let Some(flag) = self.stop.take() {
1269            flag.store(true, Ordering::Relaxed);
1270            // Unblock the sigwait() call so the thread can observe the stop flag.
1271            let _ = kill(Pid::this(), Signal::SIGUSR1);
1272        }
1273        if let Some(handle) = self.handle.take() {
1274            let _ = handle.join();
1275        }
1276    }
1277}
1278
1279impl Drop for SignalThreadGuard {
1280    fn drop(&mut self) {
1281        self.stop();
1282    }
1283}
1284
/// RAII guard that cancels the health-check thread on drop.
struct HealthThreadGuard {
    cancel: Option<Arc<AtomicBool>>,
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Raise the cancellation flag and wait for the worker to exit.
    /// Both fields are taken, so repeated calls (including the one from
    /// Drop) are harmless no-ops.
    fn stop(&mut self) {
        if let Some(cancel_flag) = self.cancel.take() {
            cancel_flag.store(true, Ordering::Relaxed);
        }
        if let Some(worker) = self.handle.take() {
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    fn drop(&mut self) {
        self.stop();
    }
}
1307
1308#[cfg(test)]
1309mod tests {
1310    use super::*;
1311    use crate::container::KernelLockdownMode;
1312    use crate::network::NetworkMode;
1313
    #[test]
    fn test_container_config() {
        // Defaults: a random non-empty id is generated, the command is kept
        // verbatim, and gVisor sandboxing is enabled out of the box.
        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!config.id.is_empty());
        assert_eq!(config.command, vec!["/bin/sh"]);
        assert!(config.use_gvisor);
    }
1321
    #[test]
    fn test_run_uses_immediate_start_path() {
        // Source-level guard: scan this file's run() body and assert it calls
        // create_internal(false) rather than routing through create()+start(),
        // which would deadlock across roots via the exec FIFO.
        let source = include_str!("runtime.rs");
        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
        let after = &source[fn_start..];
        let open = after.find('{').unwrap();
        // Brace-count to find the end of run()'s body.
        // NOTE(review): this counter ignores braces inside string literals
        // and comments; good enough for a contains() check, but could
        // mis-scope the body if run() ever embeds unbalanced braces in text.
        let mut depth = 0u32;
        let mut fn_end = open;
        for (i, ch) in after[open..].char_indices() {
            match ch {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        fn_end = open + i + 1;
                        break;
                    }
                }
                _ => {}
            }
        }
        let run_body = &after[..fn_end];
        assert!(
            run_body.contains("create_internal(false)"),
            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
        );
        assert!(
            !run_body.contains("self.create()?.start()"),
            "run() must not route through create()+start()"
        );
    }
1353
    #[test]
    fn test_container_config_with_name() {
        // A user-supplied name is stored as-is, while the generated id stays
        // independent of (and distinct from) the name.
        let config =
            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
                .unwrap();
        assert_eq!(config.name, "mycontainer");
        assert!(!config.id.is_empty());
        assert_ne!(config.id, config.name);
    }
1363
    #[test]
    fn test_allow_degraded_security_requires_explicit_config() {
        // Degraded security must never be the default; only the explicit
        // builder opt-in enables it.
        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let relaxed = strict.clone().with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&relaxed));
    }
1372
    #[test]
    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
        // The env var alone must not weaken security; it only takes effect
        // alongside the explicit config opt-in.
        // NOTE(review): mutating process-global env can race with other tests
        // when the harness runs them in parallel — consider serializing
        // env-dependent tests or using a lock.
        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");

        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let explicit = strict.with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&explicit));

        // Restore the previous env state so other tests see a clean slate.
        match prev {
            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
        }
    }
1389
    #[test]
    fn test_host_network_requires_explicit_opt_in() {
        // Host networking without the explicit allow flag must be rejected
        // with a NetworkError.
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(false);
        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
        assert!(matches!(err, NucleusError::NetworkError(_)));
    }
1399
    #[test]
    fn test_host_network_opt_in_disables_net_namespace() {
        // Opting into host networking must turn off the network namespace,
        // since a netns would defeat host-network mode.
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(true);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(!config.namespaces.net);
    }
1410
    #[test]
    fn test_non_host_network_does_not_require_host_opt_in() {
        // Non-host modes pass the guards without the host opt-in, and the
        // network namespace stays enabled.
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::None)
            .with_allow_host_network(false);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(config.namespaces.net);
    }
1421
    #[test]
    fn test_parse_kernel_lockdown_mode() {
        // The kernel reports lockdown as e.g. "none [integrity] confidentiality",
        // with the active mode in brackets; "[none]" maps to None.
        assert_eq!(
            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
            Some(KernelLockdownMode::Integrity)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
            Some(KernelLockdownMode::Confidentiality)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("[none] integrity"),
            None
        );
    }
1437
    #[test]
    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
        // Staging must copy each secret under the stage dir (rewriting its
        // source path) while preserving the file contents byte-for-byte.
        let temp = tempfile::TempDir::new().unwrap();
        let source = temp.path().join("source-secret");
        std::fs::write(&source, "supersecret").unwrap();

        let staged = Container::stage_gvisor_secret_files(
            &temp.path().join("stage"),
            &[crate::container::SecretMount {
                source: source.clone(),
                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
                mode: 0o400,
            }],
            &crate::container::ProcessIdentity::root(),
        )
        .unwrap();

        assert_eq!(staged.len(), 1);
        // The rewritten source must live under the stage directory.
        assert!(staged[0].source.starts_with(temp.path().join("stage")));
        assert_eq!(
            std::fs::read_to_string(&staged[0].source).unwrap(),
            "supersecret"
        );
    }
1462
1463    #[test]
1464    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
1465        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
1466        std::fs::create_dir_all(&artifact_dir).unwrap();
1467        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
1468
1469        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
1470        assert!(!artifact_dir.exists());
1471    }
1472
1473    #[test]
1474    fn test_health_check_loop_supports_cancellation() {
1475        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
1476        // and check it between iterations for prompt shutdown.
1477        // Function lives in health.rs after the runtime split.
1478        let source = include_str!("health.rs");
1479        let fn_start = source.find("fn health_check_loop").unwrap();
1480        let fn_body = &source[fn_start..fn_start + 2500];
1481        assert!(
1482            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
1483            "health_check_loop must accept an AtomicBool cancellation flag"
1484        );
1485        // Must also check cancellation during sleep
1486        assert!(
1487            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
1488            "health_check_loop must check cancellation during sleep intervals"
1489        );
1490    }
1491
1492    #[test]
1493    fn test_runtime_probes_do_not_spawn_host_nsenter() {
1494        // Both functions live in health.rs after the runtime split.
1495        let source = include_str!("health.rs");
1496
1497        let readiness_start = source.find("fn run_readiness_probe").unwrap();
1498        let readiness_body = &source[readiness_start..readiness_start + 2500];
1499        assert!(
1500            !readiness_body.contains("Command::new(&nsenter_bin)"),
1501            "readiness probes must not execute via host nsenter"
1502        );
1503
1504        let health_start = source.find("fn health_check_loop").unwrap();
1505        let health_body = &source[health_start..health_start + 2200];
1506        assert!(
1507            !health_body.contains("Command::new(&nsenter_bin)"),
1508            "health checks must not execute via host nsenter"
1509        );
1510    }
1511
1512    #[test]
1513    fn test_oci_mount_strip_prefix_no_expect() {
1514        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
1515        // Function lives in gvisor_setup.rs after the runtime split.
1516        let source = include_str!("gvisor_setup.rs");
1517        let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
1518        let fn_body = &source[fn_start..fn_start + 600];
1519        assert!(
1520            !fn_body.contains(".expect("),
1521            "prepare_oci_mountpoints must not use expect() — return Err instead"
1522        );
1523    }
1524
1525    #[test]
1526    fn test_notify_namespace_ready_validates_write_length() {
1527        // BUG-02: notify_namespace_ready must validate that all bytes were written
1528        let source = include_str!("runtime.rs");
1529        let fn_start = source.find("fn notify_namespace_ready").unwrap();
1530        let fn_body = &source[fn_start..fn_start + 500];
1531        // Must check the return value of write() for partial writes
1532        assert!(
1533            fn_body.contains("written")
1534                || fn_body.contains("4")
1535                || fn_body.contains("payload.len()"),
1536            "notify_namespace_ready must validate complete write of all 4 bytes"
1537        );
1538    }
1539
1540    #[test]
1541    fn test_rlimit_failures_fatal_in_production() {
1542        // SEC-05: RLIMIT failures must be fatal in production mode
1543        let source = include_str!("runtime.rs");
1544        let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
1545        let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
1546        assert!(
1547            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
1548            "RLIMIT failures must return Err in production mode"
1549        );
1550    }
1551
1552    #[test]
1553    fn test_tcp_readiness_probe_uses_portable_check() {
1554        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
1555        // Function lives in health.rs after the runtime split.
1556        let source = include_str!("health.rs");
1557        let probe_fn = source.find("TcpPort(port)").unwrap();
1558        let probe_body = &source[probe_fn..probe_fn + 500];
1559        assert!(
1560            !probe_body.contains("/dev/tcp"),
1561            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
1562        );
1563    }
1564}