Skip to main content

nucleus/container/
runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4    ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets, mount_secrets_inmemory, mount_volumes,
10    snapshot_context_dir, switch_root, verify_context_manifest, verify_rootfs_attestation,
11    FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks, SeccompManager,
18    SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::{AsRawFd, OwnedFd};
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// Container runtime that orchestrates all isolation mechanisms
///
/// Execution flow matches the formal specifications:
/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
/// 5. Execute target process
///
/// A `Container` is only a description of what to launch: `new()` stores the
/// configuration, and `run()` / `create()` work on a clone of it, so the
/// value itself is never mutated by a launch.
pub struct Container {
    /// Configuration the container is launched from. The create path clones
    /// it before applying rootless/network adjustments.
    pub(super) config: ContainerConfig,
    /// Pre-resolved runsc path, resolved before fork so that user-namespace
    /// UID changes don't block PATH-based lookup.
    ///
    /// `None` for values built with `new()`; the create path fills it in on
    /// the copy handed to the forked child when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
47
/// Handle returned by `Container::create()` representing a container whose
/// child process has been forked and is blocked on the exec FIFO, waiting for
/// `start()` to release it.
///
/// The same handle is produced internally by `Container::run()`, in which
/// case no exec FIFO exists and `exec_fifo_path` is `None`.
pub struct CreatedContainer {
    /// Effective configuration, i.e. the caller's configuration after the
    /// create path applied its rootless / network adjustments.
    pub(super) config: ContainerConfig,
    /// State store used to persist `state`.
    pub(super) state_mgr: ContainerStateManager,
    /// Persisted container state; saved with status `Created` before this
    /// handle is returned.
    pub(super) state: ContainerState,
    /// PID returned by `fork()` in the creating process. This can differ from
    /// the PID recorded in `state`, which is the PID reported by the child
    /// over the namespace-ready pipe.
    pub(super) child: Pid,
    /// Cgroup holding the container process, or `None` when cgroup creation
    /// or limit configuration failed outside production mode.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network handle; `Some` only when bridge networking was
    /// requested and set up successfully.
    pub(super) bridge_net: Option<BridgeNetwork>,
    /// Seccomp trace reader, if the create path started one.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Path of the exec FIFO for the two-phase create/start lifecycle;
    /// `None` when the container was launched through `run()`.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Clone of the `container.lifecycle` tracing span, stored on the handle.
    pub(super) _lifecycle_span: tracing::Span,
}
62
63impl Container {
64    pub fn new(config: ContainerConfig) -> Self {
65        Self {
66            config,
67            runsc_path: None,
68        }
69    }
70
71    /// Run the container (convenience wrapper: create + start)
72    pub fn run(&self) -> Result<i32> {
73        self.create_internal(false)?.start()
74    }
75
76    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
77    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
78    /// releases the child process.
79    pub fn create(&self) -> Result<CreatedContainer> {
80        self.create_internal(true)
81    }
82
83    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
84        let lifecycle_span = info_span!(
85            "container.lifecycle",
86            container.id = %self.config.id,
87            container.name = %self.config.name,
88            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
89        );
90        let _enter = lifecycle_span.enter();
91
92        info!(
93            "Creating container: {} (ID: {})",
94            self.config.name, self.config.id
95        );
96        audit(
97            &self.config.id,
98            &self.config.name,
99            AuditEventType::ContainerStart,
100            format!(
101                "command={:?} mode={:?} runtime={}",
102                self.config.command,
103                self.config.service_mode,
104                if self.config.use_gvisor {
105                    "gvisor"
106                } else {
107                    "native"
108                }
109            ),
110        );
111
112        // Auto-detect if we need rootless mode
113        let is_root = nix::unistd::Uid::effective().is_root();
114        let mut config = self.config.clone();
115
116        if !is_root && config.user_ns_config.is_none() {
117            info!("Not running as root, automatically enabling rootless mode");
118            config.namespaces.user = true;
119            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
120        }
121
122        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
123        if let Some(ref socket_path) = config.console_socket {
124            warn!(
125                "Console socket {} accepted but terminal forwarding is not yet implemented",
126                socket_path.display()
127            );
128        }
129
130        // Validate production mode invariants before anything else.
131        config.validate_production_mode()?;
132        Self::assert_kernel_lockdown(&config)?;
133
134        Self::apply_network_mode_guards(&mut config, is_root)?;
135        Self::apply_trust_level_guards(&mut config)?;
136        config.validate_runtime_support()?;
137
138        // Bridge networking requires root
139        if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
140            if config.service_mode == ServiceMode::Production {
141                return Err(NucleusError::NetworkError(
142                    "Production mode with bridge networking requires root (cannot silently \
143                     degrade to no networking)"
144                        .to_string(),
145                ));
146            }
147            warn!("Bridge networking requires root, degrading to no networking");
148            config.network = NetworkMode::None;
149        }
150
151        // Create state manager
152        let state_mgr = ContainerStateManager::new()?;
153
154        // Enforce name uniqueness among running containers
155        if let Ok(all_states) = state_mgr.list_states() {
156            if all_states.iter().any(|s| s.name == config.name) {
157                return Err(NucleusError::ConfigError(format!(
158                    "A container named '{}' already exists; use a different --name, \
159                     or remove the stale state with 'nucleus delete'",
160                    config.name
161                )));
162            }
163        }
164
165        // Create exec FIFO only for the two-phase create/start lifecycle.
166        // `run()` starts immediately and avoids this cross-root-path sync.
167        let exec_fifo = if defer_exec_until_start {
168            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
169            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
170                NucleusError::ExecError(format!(
171                    "Failed to create exec FIFO {:?}: {}",
172                    exec_fifo, e
173                ))
174            })?;
175            Some(exec_fifo)
176        } else {
177            None
178        };
179
180        // Try to create cgroup (optional for rootless mode)
181        let cgroup_name = format!("nucleus-{}", config.id);
182        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
183            Ok(mut cgroup) => {
184                // Try to set limits
185                match cgroup.set_limits(&config.limits) {
186                    Ok(_) => {
187                        info!("Created cgroup with resource limits");
188                        Some(cgroup)
189                    }
190                    Err(e) => {
191                        if config.service_mode == ServiceMode::Production {
192                            let _ = cgroup.cleanup();
193                            return Err(NucleusError::CgroupError(format!(
194                                "Production mode requires cgroup resource enforcement, but \
195                                 applying limits failed: {}",
196                                e
197                            )));
198                        }
199                        warn!("Failed to set cgroup limits: {}", e);
200                        let _ = cgroup.cleanup();
201                        None
202                    }
203                }
204            }
205            Err(e) => {
206                if config.service_mode == ServiceMode::Production {
207                    return Err(NucleusError::CgroupError(format!(
208                        "Production mode requires cgroup resource enforcement, but \
209                         cgroup creation failed: {}",
210                        e
211                    )));
212                }
213
214                if config.user_ns_config.is_some() {
215                    if config.limits.memory_bytes.is_some()
216                        || config.limits.cpu_quota_us.is_some()
217                        || config.limits.pids_max.is_some()
218                    {
219                        warn!(
220                            "Running in rootless mode: requested resource limits cannot be \
221                             enforced – cgroup creation requires root ({})",
222                            e
223                        );
224                    } else {
225                        debug!("Running in rootless mode without cgroup resource limits");
226                    }
227                } else {
228                    warn!(
229                        "Failed to create cgroup (running without resource limits): {}",
230                        e
231                    );
232                }
233                None
234            }
235        };
236
237        // Resolve runsc path before fork, while still unprivileged.
238        let runsc_path = if config.use_gvisor {
239            Some(GVisorRuntime::resolve_path().map_err(|e| {
240                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
241            })?)
242        } else {
243            None
244        };
245
246        // Child notifies parent after namespaces are ready.
247        let (ready_read, ready_write) = pipe().map_err(|e| {
248            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
249        })?;
250
251        // Fork child process
252        match unsafe { fork() }? {
253            ForkResult::Parent { child } => {
254                drop(ready_write);
255                info!("Forked child process: {}", child);
256
257                // Use a closure so that on any error we kill the child process
258                // instead of leaving it orphaned and blocked on the exec FIFO.
259                let parent_setup = || -> Result<CreatedContainer> {
260                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
261
262                    let cgroup_path = cgroup_opt
263                        .as_ref()
264                        .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
265                    let cpu_millicores = config
266                        .limits
267                        .cpu_quota_us
268                        .map(|quota| (quota * 1000) / config.limits.cpu_period_us);
269                    let mut state = ContainerState::new(ContainerStateParams {
270                        id: config.id.clone(),
271                        name: config.name.clone(),
272                        pid: target_pid,
273                        command: config.command.clone(),
274                        memory_limit: config.limits.memory_bytes,
275                        cpu_limit: cpu_millicores,
276                        using_gvisor: config.use_gvisor,
277                        rootless: config.user_ns_config.is_some(),
278                        cgroup_path,
279                        process_uid: config.process_identity.uid,
280                        process_gid: config.process_identity.gid,
281                        additional_gids: config.process_identity.additional_gids.clone(),
282                    });
283                    state.config_hash = config.config_hash;
284                    state.bundle_path =
285                        config.rootfs_path.as_ref().map(|p| p.display().to_string());
286
287                    let mut bridge_net: Option<BridgeNetwork> = None;
288                    let trace_reader =
289                        Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
290
291                    // Transition: Creating -> Created
292                    state.status = OciStatus::Created;
293                    state_mgr.save_state(&state)?;
294
295                    // Write PID file (OCI --pid-file)
296                    if let Some(ref pid_path) = config.pid_file {
297                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
298                            NucleusError::ConfigError(format!(
299                                "Failed to write pid-file '{}': {}",
300                                pid_path.display(),
301                                e
302                            ))
303                        })?;
304                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
305                    }
306
307                    if let Some(ref mut cgroup) = cgroup_opt {
308                        cgroup.attach_process(target_pid)?;
309                    }
310
311                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
312                        match BridgeNetwork::setup_with_id(
313                            target_pid,
314                            bridge_config,
315                            &config.id,
316                        ) {
317                            Ok(net) => {
318                                if let Some(ref egress) = config.egress_policy {
319                                    if let Err(e) =
320                                        net.apply_egress_policy(target_pid, egress)
321                                    {
322                                        if config.service_mode == ServiceMode::Production {
323                                            return Err(NucleusError::NetworkError(format!(
324                                                "Failed to apply egress policy: {}",
325                                                e
326                                            )));
327                                        }
328                                        warn!("Failed to apply egress policy: {}", e);
329                                    }
330                                }
331                                bridge_net = Some(net);
332                            }
333                            Err(e) => {
334                                if config.service_mode == ServiceMode::Production {
335                                    return Err(e);
336                                }
337                                warn!("Failed to set up bridge networking: {}", e);
338                            }
339                        }
340                    }
341
342                    info!(
343                        "Container {} created (child pid {}), waiting for start",
344                        config.id, target_pid
345                    );
346
347                    Ok(CreatedContainer {
348                        config,
349                        state_mgr,
350                        state,
351                        child,
352                        cgroup_opt,
353                        bridge_net,
354                        trace_reader,
355                        exec_fifo_path: exec_fifo,
356                        _lifecycle_span: lifecycle_span.clone(),
357                    })
358                };
359
360                parent_setup().map_err(|e| {
361                    // Kill the child so it doesn't remain orphaned and blocked
362                    // on the exec FIFO.
363                    let _ = kill(child, Signal::SIGKILL);
364                    let _ = waitpid(child, None);
365                    e
366                })
367            }
368            ForkResult::Child => {
369                drop(ready_read);
370                let temp_container = Container { config, runsc_path };
371                match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
372                    Ok(_) => unreachable!(),
373                    Err(e) => {
374                        error!("Container setup failed: {}", e);
375                        std::process::exit(1);
376                    }
377                }
378            }
379        }
380    }
381
382    /// Trigger a previously-created container to start by opening its exec FIFO.
383    /// Used by the CLI `start` command.
384    pub fn trigger_start(container_id: &str) -> Result<()> {
385        let state_mgr = ContainerStateManager::new()?;
386        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
387        if !fifo_path.exists() {
388            return Err(NucleusError::ConfigError(format!(
389                "No exec FIFO found for container {}; is it in 'created' state?",
390                container_id
391            )));
392        }
393
394        // Opening the FIFO for reading unblocks the child's open-for-write.
395        let file = std::fs::File::open(&fifo_path)
396            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
397        let mut buf = [0u8; 1];
398        std::io::Read::read(&mut &file, &mut buf)
399            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
400        drop(file);
401
402        let _ = std::fs::remove_file(&fifo_path);
403
404        // Update state to Running
405        let mut state = state_mgr.resolve_container(container_id)?;
406        state.status = OciStatus::Running;
407        state_mgr.save_state(&state)?;
408
409        Ok(())
410    }
411
412    /// Set up container environment and exec target process
413    ///
414    /// This runs in the child process after fork.
415    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
416    fn setup_and_exec(
417        &self,
418        ready_pipe: Option<OwnedFd>,
419        exec_fifo: Option<PathBuf>,
420    ) -> Result<()> {
421        let is_rootless = self.config.user_ns_config.is_some();
422        let allow_degraded_security = Self::allow_degraded_security(&self.config);
423        let context_manifest = if self.config.verify_context_integrity {
424            self.config
425                .context_dir
426                .as_ref()
427                .map(|dir| snapshot_context_dir(dir))
428                .transpose()?
429        } else {
430            None
431        };
432
433        // Initialize state machines
434        let mut fs_state = FilesystemState::Unmounted;
435        let mut sec_state = SecurityState::Privileged;
436
437        // gVisor is the runtime that should create the container's namespaces.
438        // Running runsc after pre-unsharing our own namespaces breaks its gofer
439        // re-exec path on some systems and duplicates the OCI namespace config.
440        if self.config.use_gvisor {
441            if let Some(fd) = ready_pipe {
442                Self::notify_namespace_ready(&fd, std::process::id())?;
443            }
444            return self.setup_and_exec_gvisor();
445        }
446
447        // 1. Create namespaces in child and optionally configure user mapping.
448        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
449        if let Some(user_config) = &self.config.user_ns_config {
450            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
451        }
452        namespace_mgr.unshare_namespaces()?;
453
454        // CLONE_NEWPID only applies to children created after unshare().
455        // Create a child that will become PID 1 in the new namespace and exec the workload.
456        if self.config.namespaces.pid {
457            match unsafe { fork() }? {
458                ForkResult::Parent { child } => {
459                    if let Some(fd) = ready_pipe {
460                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
461                    }
462                    std::process::exit(Self::wait_for_pid_namespace_child(child));
463                }
464                ForkResult::Child => {
465                    // Continue container setup as PID 1 in the new namespace.
466                }
467            }
468        } else if let Some(fd) = ready_pipe {
469            Self::notify_namespace_ready(&fd, std::process::id())?;
470        }
471
472        // Namespace: Unshared -> Entered (process is now inside all namespaces)
473        namespace_mgr.enter()?;
474
475        // 2. Ensure no_new_privs BEFORE any mount operations.
476        // This prevents exploitation of setuid binaries on bind-mounted paths
477        // even if a subsequent MS_NOSUID remount fails.
478        self.enforce_no_new_privs()?;
479        audit(
480            &self.config.id,
481            &self.config.name,
482            AuditEventType::NoNewPrivsSet,
483            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
484        );
485
486        // 3. Set hostname if UTS namespace is enabled
487        if let Some(hostname) = &self.config.hostname {
488            namespace_mgr.set_hostname(hostname)?;
489        }
490
491        // 4. Mount tmpfs as container root
492        // Filesystem: Unmounted -> Mounted
493        let runtime_dir = Builder::new()
494            .prefix("nucleus-runtime-")
495            .tempdir_in("/tmp")
496            .map_err(|e| {
497                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
498            })?;
499        let container_root = runtime_dir.path().to_path_buf();
500        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
501        tmpfs.mount()?;
502        fs_state = fs_state.transition(FilesystemState::Mounted)?;
503
504        // 4. Create minimal filesystem structure
505        create_minimal_fs(&container_root)?;
506
507        // 5. Create device nodes
508        let dev_path = container_root.join("dev");
509        create_dev_nodes(&dev_path, false)?;
510
511        // 6. Populate context if provided
512        // Filesystem: Mounted -> Populated
513        if let Some(context_dir) = &self.config.context_dir {
514            let context_dest = container_root.join("context");
515            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
516            if let Some(expected) = &context_manifest {
517                verify_context_manifest(expected, &context_dest)?;
518            }
519        }
520        fs_state = fs_state.transition(FilesystemState::Populated)?;
521
522        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
523        if let Some(ref rootfs_path) = self.config.rootfs_path {
524            if self.config.verify_rootfs_attestation {
525                verify_rootfs_attestation(rootfs_path)?;
526            }
527            bind_mount_rootfs(&container_root, rootfs_path)?;
528        } else {
529            bind_mount_host_paths(&container_root, is_rootless)?;
530        }
531
532        // 7b. Mount persistent or ephemeral volumes over the base filesystem.
533        mount_volumes(&container_root, &self.config.volumes)?;
534
535        // 7c. Write resolv.conf for bridge networking.
536        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
537        // resolv.conf over the top (same technique as secrets).
538        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
539            if self.config.rootfs_path.is_some() {
540                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
541            } else {
542                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
543            }
544        }
545
546        // 7d. Mount secrets (in-memory tmpfs for production, bind-mount for agent mode)
547        if self.config.service_mode == ServiceMode::Production {
548            mount_secrets_inmemory(
549                &container_root,
550                &self.config.secrets,
551                &self.config.process_identity,
552            )?;
553        } else {
554            mount_secrets(&container_root, &self.config.secrets)?;
555        }
556
557        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
558        let proc_path = container_root.join("proc");
559        let hide_pids = self.config.service_mode == ServiceMode::Production;
560        mount_procfs(
561            &proc_path,
562            is_rootless,
563            self.config.proc_readonly,
564            hide_pids,
565        )?;
566
567        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
568        // SEC-06: In production mode, failures to mask critical paths are fatal.
569        mask_proc_paths(
570            &proc_path,
571            self.config.service_mode == ServiceMode::Production,
572        )?;
573
574        // 9c. Run createRuntime hooks (after namespaces created, before pivot_root)
575        if let Some(ref hooks) = self.config.hooks {
576            if !hooks.create_runtime.is_empty() {
577                let hook_state = OciContainerState {
578                    oci_version: "1.0.2".to_string(),
579                    id: self.config.id.clone(),
580                    status: OciStatus::Creating,
581                    pid: std::process::id(),
582                    bundle: String::new(),
583                };
584                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
585            }
586        }
587
588        // 10. Switch root filesystem
589        // Filesystem: Populated -> Pivoted
590        switch_root(&container_root, self.config.allow_chroot_fallback)?;
591        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
592        debug!("Filesystem state: {:?}", fs_state);
593
594        // 10b. Audit mount flags to verify filesystem hardening invariants
595        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
596        audit(
597            &self.config.id,
598            &self.config.name,
599            AuditEventType::MountAuditPassed,
600            "all mount flags verified",
601        );
602
603        // 10c. Run createContainer hooks (after pivot_root, before start)
604        if let Some(ref hooks) = self.config.hooks {
605            if !hooks.create_container.is_empty() {
606                let hook_state = OciContainerState {
607                    oci_version: "1.0.2".to_string(),
608                    id: self.config.id.clone(),
609                    status: OciStatus::Created,
610                    pid: std::process::id(),
611                    bundle: String::new(),
612                };
613                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
614            }
615        }
616
617        // 11. Drop capabilities (from policy file or default drop-all)
618        // Security: Privileged -> CapabilitiesDropped
619        let mut cap_mgr = CapabilityManager::new();
620        if let Some(ref policy_path) = self.config.caps_policy {
621            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
622                policy_path,
623                self.config.caps_policy_sha256.as_deref(),
624            )?;
625            policy.apply(&mut cap_mgr)?;
626            audit(
627                &self.config.id,
628                &self.config.name,
629                AuditEventType::CapabilitiesDropped,
630                format!("capability policy applied from {:?}", policy_path),
631            );
632        } else {
633            cap_mgr.drop_all()?;
634            audit(
635                &self.config.id,
636                &self.config.name,
637                AuditEventType::CapabilitiesDropped,
638                "all capabilities dropped including bounding set",
639            );
640        }
641        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
642
643        // 12b. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
644        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
645        // SEC-05: In production mode, RLIMIT failures are fatal — a container
646        // without resource limits is a privilege escalation vector.
647        {
648            let is_production = self.config.service_mode == ServiceMode::Production;
649
650            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
651            let rlim_nproc = libc::rlimit {
652                rlim_cur: nproc_limit,
653                rlim_max: nproc_limit,
654            };
655            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
656            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
657                let err = std::io::Error::last_os_error();
658                if is_production {
659                    return Err(NucleusError::SeccompError(format!(
660                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
661                        nproc_limit, err
662                    )));
663                }
664                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
665            }
666
667            let rlim_nofile = libc::rlimit {
668                rlim_cur: 1024,
669                rlim_max: 1024,
670            };
671            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
672            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
673                let err = std::io::Error::last_os_error();
674                if is_production {
675                    return Err(NucleusError::SeccompError(format!(
676                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
677                        err
678                    )));
679                }
680                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
681            }
682
683            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
684            // memory via mlock(). Default 64KB matches unprivileged default, but
685            // in a user namespace the container appears as UID 0 and may have a
686            // higher inherited limit.
687            let memlock_limit: u64 = 64 * 1024; // 64KB
688            let rlim_memlock = libc::rlimit {
689                rlim_cur: memlock_limit,
690                rlim_max: memlock_limit,
691            };
692            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
693            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
694                let err = std::io::Error::last_os_error();
695                if is_production {
696                    return Err(NucleusError::SeccompError(format!(
697                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
698                        memlock_limit, err
699                    )));
700                }
701                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
702            }
703        }
704
705        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
706        // Security: CapabilitiesDropped -> SeccompApplied
707        use crate::container::config::SeccompMode;
708        let mut seccomp_mgr = SeccompManager::new();
709        let allow_network = !matches!(self.config.network, NetworkMode::None);
710        let seccomp_applied = match self.config.seccomp_mode {
711            SeccompMode::Trace => {
712                audit(
713                    &self.config.id,
714                    &self.config.name,
715                    AuditEventType::SeccompApplied,
716                    "seccomp trace mode: allow-all + LOG",
717                );
718                seccomp_mgr.apply_trace_filter()?
719            }
720            SeccompMode::Enforce => {
721                if let Some(ref profile_path) = self.config.seccomp_profile {
722                    audit(
723                        &self.config.id,
724                        &self.config.name,
725                        AuditEventType::SeccompProfileLoaded,
726                        format!("path={:?}", profile_path),
727                    );
728                    seccomp_mgr.apply_profile_from_file(
729                        profile_path,
730                        self.config.seccomp_profile_sha256.as_deref(),
731                        self.config.seccomp_log_denied,
732                    )?
733                } else {
734                    seccomp_mgr.apply_filter_for_network_mode(
735                        allow_network,
736                        allow_degraded_security,
737                        self.config.seccomp_log_denied,
738                    )?
739                }
740            }
741        };
742        if seccomp_applied {
743            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
744            audit(
745                &self.config.id,
746                &self.config.name,
747                AuditEventType::SeccompApplied,
748                format!("network={}", allow_network),
749            );
750        } else if !allow_degraded_security {
751            return Err(NucleusError::SeccompError(
752                "Seccomp filter is required but was not enforced".to_string(),
753            ));
754        } else {
755            warn!("Seccomp not enforced; container is running with degraded hardening");
756        }
757
758        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
759        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
760            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
761                policy_path,
762                self.config.landlock_policy_sha256.as_deref(),
763            )?;
764            policy.apply(allow_degraded_security)?
765        } else {
766            let mut landlock_mgr = LandlockManager::new();
767            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
768            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
769        };
770        if seccomp_applied && landlock_applied {
771            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
772            if self.config.seccomp_mode == SeccompMode::Trace {
773                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
774            } else {
775                sec_state = sec_state.transition(SecurityState::Locked)?;
776            }
777            audit(
778                &self.config.id,
779                &self.config.name,
780                AuditEventType::LandlockApplied,
781                if self.config.seccomp_mode == SeccompMode::Trace {
782                    "landlock applied, but seccomp in trace mode — not locked".to_string()
783                } else {
784                    "security state locked: all hardening layers active".to_string()
785                },
786            );
787        } else if !allow_degraded_security {
788            return Err(NucleusError::LandlockError(
789                "Landlock policy is required but was not enforced".to_string(),
790            ));
791        } else {
792            warn!("Security state not locked; one or more hardening controls are inactive");
793        }
794        debug!("Security state: {:?}", sec_state);
795
796        // 14c. Block on exec FIFO until start() opens it for reading.
797        // This implements the OCI two-phase create/start: all container setup
798        // is complete, but the user process doesn't exec until explicitly started.
799        if let Some(ref fifo_path) = exec_fifo {
800            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
801            let file = std::fs::OpenOptions::new()
802                .write(true)
803                .open(fifo_path)
804                .map_err(|e| {
805                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
806                })?;
807            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
808                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
809            })?;
810            drop(file);
811            debug!("Exec FIFO released, proceeding to exec");
812        }
813
814        // 14d. Run startContainer hooks (after start signal, before user process exec)
815        if let Some(ref hooks) = self.config.hooks {
816            if !hooks.start_container.is_empty() {
817                let hook_state = OciContainerState {
818                    oci_version: "1.0.2".to_string(),
819                    id: self.config.id.clone(),
820                    status: OciStatus::Running,
821                    pid: std::process::id(),
822                    bundle: String::new(),
823                };
824                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
825            }
826        }
827
828        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
829        // that reaps zombies and forwards signals, rather than exec-ing directly.
830        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
831            return self.run_as_init();
832        }
833
834        // 15b. Agent mode: exec target process directly
835        self.exec_command()?;
836
837        // Should never reach here
838        Ok(())
839    }
840
841    /// Forward selected signals to child process using sigwait (no async signal handlers).
842    ///
843    /// Returns a stop flag and join handle. Set the flag to `true` and join
844    /// the handle to cleanly shut down the forwarding thread.
845    pub(super) fn setup_signal_forwarding_static(
846        child: Pid,
847    ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
848        let mut set = SigSet::empty();
849        for signal in [
850            Signal::SIGTERM,
851            Signal::SIGINT,
852            Signal::SIGHUP,
853            Signal::SIGQUIT,
854            Signal::SIGUSR1,
855            Signal::SIGUSR2,
856        ] {
857            set.add(signal);
858        }
859
860        let unblock_set = set;
861        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
862            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
863        })?;
864
865        let stop = Arc::new(AtomicBool::new(false));
866        let stop_clone = stop.clone();
867        let handle = std::thread::Builder::new()
868            .name("sig-forward".to_string())
869            .spawn(move || {
870                // The thread owns unblock_set and uses it for sigwait.
871                while !stop_clone.load(Ordering::Relaxed) {
872                    if let Ok(signal) = unblock_set.wait() {
873                        let _ = kill(child, signal);
874                    }
875                }
876            })
877            .map_err(|e| {
878                // Restore the signal mask so the caller isn't left with
879                // signals permanently blocked.
880                let mut restore = SigSet::empty();
881                for signal in [
882                    Signal::SIGTERM,
883                    Signal::SIGINT,
884                    Signal::SIGHUP,
885                    Signal::SIGQUIT,
886                    Signal::SIGUSR1,
887                    Signal::SIGUSR2,
888                ] {
889                    restore.add(signal);
890                }
891                let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
892                NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
893            })?;
894
895        info!("Signal forwarding configured");
896        Ok((stop, handle))
897    }
898
899    /// Wait for child process to exit
900    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
901        loop {
902            match waitpid(child, None) {
903                Ok(WaitStatus::Exited(_, code)) => {
904                    return Ok(code);
905                }
906                Ok(WaitStatus::Signaled(_, signal, _)) => {
907                    info!("Child killed by signal: {:?}", signal);
908                    return Ok(128 + signal as i32);
909                }
910                Err(nix::errno::Errno::EINTR) => {
911                    continue;
912                }
913                Err(e) => {
914                    return Err(NucleusError::ExecError(format!(
915                        "Failed to wait for child: {}",
916                        e
917                    )));
918                }
919                _ => {
920                    continue;
921                }
922            }
923        }
924    }
925
926    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
927        let mut pid_buf = [0u8; 4];
928        loop {
929            match read(ready_read.as_raw_fd(), &mut pid_buf) {
930                Err(nix::errno::Errno::EINTR) => continue,
931                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
932                Ok(0) => {
933                    return Err(NucleusError::ExecError(format!(
934                        "Child {} exited before namespace initialization",
935                        child
936                    )))
937                }
938                Ok(_) => {
939                    return Err(NucleusError::ExecError(
940                        "Invalid namespace sync payload from child".to_string(),
941                    ))
942                }
943                Err(e) => {
944                    return Err(NucleusError::ExecError(format!(
945                        "Failed waiting for child namespace setup: {}",
946                        e
947                    )))
948                }
949            }
950        }
951    }
952
953    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
954        let payload = pid.to_ne_bytes();
955        let mut written = 0;
956        while written < payload.len() {
957            let n = write(fd, &payload[written..]).map_err(|e| {
958                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
959            })?;
960            if n == 0 {
961                return Err(NucleusError::ExecError(
962                    "Failed to notify namespace readiness: short write".to_string(),
963                ));
964            }
965            written += n;
966        }
967        Ok(())
968    }
969
970    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
971        loop {
972            match waitpid(child, None) {
973                Ok(WaitStatus::Exited(_, code)) => return code,
974                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
975                Err(nix::errno::Errno::EINTR) => continue,
976                Err(_) => return 1,
977                _ => continue,
978            }
979        }
980    }
981}
982
983impl CreatedContainer {
984    /// Start phase: release the child via the exec FIFO, transition to Running,
985    /// then wait for the child to exit with full lifecycle management.
986    pub fn start(mut self) -> Result<i32> {
987        let config = &self.config;
988        let _enter = self._lifecycle_span.enter();
989
990        // Open the exec FIFO for reading — this unblocks the child's
991        // blocking open-for-write, allowing it to proceed to exec.
992        if let Some(exec_fifo_path) = &self.exec_fifo_path {
993            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
994                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
995            })?;
996            let mut buf = [0u8; 1];
997            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
998                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
999            })?;
1000            if read != 1 {
1001                return Err(NucleusError::ExecError(
1002                    "Exec FIFO closed before start signal was delivered".to_string(),
1003                ));
1004            }
1005            let _ = std::fs::remove_file(exec_fifo_path);
1006        }
1007
1008        // Transition: Created -> Running
1009        self.state.status = OciStatus::Running;
1010        self.state_mgr.save_state(&self.state)?;
1011
1012        let target_pid = self.state.pid;
1013        let child = self.child;
1014
1015        let (sig_stop, sig_handle) =
1016            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1017
1018        // Guard ensures signal thread is stopped on any exit path (including early ? returns).
1019        let mut sig_guard = SignalThreadGuard {
1020            stop: Some(sig_stop),
1021            handle: Some(sig_handle),
1022        };
1023
1024        // Run readiness probe before declaring service ready
1025        if let Some(ref probe) = config.readiness_probe {
1026            let notify_socket = if config.sd_notify {
1027                std::env::var("NOTIFY_SOCKET").ok()
1028            } else {
1029                None
1030            };
1031            Container::run_readiness_probe(
1032                target_pid,
1033                &config.name,
1034                probe,
1035                config.user_ns_config.is_some(),
1036                config.use_gvisor,
1037                &config.process_identity,
1038                notify_socket.as_deref(),
1039            )?;
1040        }
1041
1042        // Start health check thread if configured
1043        let cancel_flag = Arc::new(AtomicBool::new(false));
1044        let health_handle = if let Some(ref hc) = config.health_check {
1045            if !hc.command.is_empty() {
1046                let hc = hc.clone();
1047                let pid = target_pid;
1048                let container_name = config.name.clone();
1049                let rootless = config.user_ns_config.is_some();
1050                let using_gvisor = config.use_gvisor;
1051                let process_identity = config.process_identity.clone();
1052                let cancel = cancel_flag.clone();
1053                Some(std::thread::spawn(move || {
1054                    Container::health_check_loop(
1055                        pid,
1056                        &container_name,
1057                        rootless,
1058                        using_gvisor,
1059                        &hc,
1060                        &process_identity,
1061                        &cancel,
1062                    );
1063                }))
1064            } else {
1065                None
1066            }
1067        } else {
1068            None
1069        };
1070
1071        // Guard ensures health check thread is cancelled on any exit path.
1072        let mut health_guard = HealthThreadGuard {
1073            cancel: Some(cancel_flag),
1074            handle: health_handle,
1075        };
1076
1077        // Run poststart hooks (after user process started, in parent)
1078        if let Some(ref hooks) = config.hooks {
1079            if !hooks.poststart.is_empty() {
1080                let hook_state = OciContainerState {
1081                    oci_version: "1.0.2".to_string(),
1082                    id: config.id.clone(),
1083                    status: OciStatus::Running,
1084                    pid: target_pid,
1085                    bundle: String::new(),
1086                };
1087                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1088            }
1089        }
1090
1091        let mut child_waited = false;
1092        let run_result: Result<i32> = (|| {
1093            let exit_code = Container::wait_for_child_static(child)?;
1094
1095            // Transition: Running -> Stopped
1096            self.state.status = OciStatus::Stopped;
1097            let _ = self.state_mgr.save_state(&self.state);
1098
1099            child_waited = true;
1100            Ok(exit_code)
1101        })();
1102
1103        // Explicitly stop threads (guards would do this on drop too, but
1104        // explicit teardown keeps ordering visible).
1105        health_guard.stop();
1106        sig_guard.stop();
1107
1108        // Run poststop hooks (best-effort)
1109        if let Some(ref hooks) = config.hooks {
1110            if !hooks.poststop.is_empty() {
1111                let hook_state = OciContainerState {
1112                    oci_version: "1.0.2".to_string(),
1113                    id: config.id.clone(),
1114                    status: OciStatus::Stopped,
1115                    pid: 0,
1116                    bundle: String::new(),
1117                };
1118                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1119            }
1120        }
1121
1122        if let Some(net) = self.bridge_net.take() {
1123            if let Err(e) = net.cleanup() {
1124                warn!("Failed to cleanup bridge networking: {}", e);
1125            }
1126        }
1127
1128        if !child_waited {
1129            let _ = kill(child, Signal::SIGKILL);
1130            let _ = waitpid(child, None);
1131        }
1132
1133        if let Some(reader) = self.trace_reader.take() {
1134            reader.stop_and_flush();
1135        }
1136
1137        if let Some(cgroup) = self.cgroup_opt.take() {
1138            if let Err(e) = cgroup.cleanup() {
1139                warn!("Failed to cleanup cgroup: {}", e);
1140            }
1141        }
1142
1143        if config.use_gvisor {
1144            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1145                warn!(
1146                    "Failed to cleanup gVisor artifacts for {}: {}",
1147                    config.id, e
1148                );
1149            }
1150        }
1151
1152        if let Err(e) = self.state_mgr.delete_state(&config.id) {
1153            warn!("Failed to delete state for {}: {}", config.id, e);
1154        }
1155
1156        match run_result {
1157            Ok(exit_code) => {
1158                audit(
1159                    &config.id,
1160                    &config.name,
1161                    AuditEventType::ContainerStop,
1162                    format!("exit_code={}", exit_code),
1163                );
1164                info!(
1165                    "Container {} ({}) exited with code {}",
1166                    config.name, config.id, exit_code
1167                );
1168                Ok(exit_code)
1169            }
1170            Err(e) => {
1171                audit_error(
1172                    &config.id,
1173                    &config.name,
1174                    AuditEventType::ContainerStop,
1175                    format!("error={}", e),
1176                );
1177                Err(e)
1178            }
1179        }
1180    }
1181}
1182
1183/// RAII guard that stops the signal-forwarding thread on drop.
1184struct SignalThreadGuard {
1185    stop: Option<Arc<AtomicBool>>,
1186    handle: Option<JoinHandle<()>>,
1187}
1188
1189impl SignalThreadGuard {
1190    fn stop(&mut self) {
1191        if let Some(flag) = self.stop.take() {
1192            flag.store(true, Ordering::Relaxed);
1193            // Unblock the sigwait() call so the thread can observe the stop flag.
1194            let _ = kill(Pid::this(), Signal::SIGUSR1);
1195        }
1196        if let Some(handle) = self.handle.take() {
1197            let _ = handle.join();
1198        }
1199    }
1200}
1201
1202impl Drop for SignalThreadGuard {
1203    fn drop(&mut self) {
1204        self.stop();
1205    }
1206}
1207
/// RAII guard that cancels the health-check thread on drop.
///
/// Both fields are `Option`s so that `stop` can consume them exactly once;
/// a second call finds nothing left to do.
struct HealthThreadGuard {
    cancel: Option<Arc<AtomicBool>>,
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Set the cancellation flag (if still held) and join the thread (if any).
    fn stop(&mut self) {
        match self.cancel.take() {
            Some(cancel_flag) => cancel_flag.store(true, Ordering::Relaxed),
            None => {}
        }

        match self.handle.take() {
            Some(worker) => {
                // The join result is deliberately ignored.
                let _ = worker.join();
            }
            None => {}
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Ensure the thread is cancelled even when `stop` was never called explicitly.
    fn drop(&mut self) {
        self.stop();
    }
}
1230
#[cfg(test)]
mod tests {
    use super::*;
    use crate::container::KernelLockdownMode;
    use crate::network::NetworkMode;

    /// Return at most `len` bytes of `source` beginning at byte offset `start`.
    ///
    /// The end is clamped to the end of `source` and moved back to the nearest
    /// UTF-8 character boundary, so the slice cannot panic when the scanned
    /// file is short or contains multi-byte characters. `start` must itself be
    /// a character boundary (e.g. an offset returned by `str::find`).
    fn source_window(source: &str, start: usize, len: usize) -> &str {
        let mut end = start.saturating_add(len).min(source.len());
        while !source.is_char_boundary(end) {
            end -= 1;
        }
        &source[start..end]
    }

    #[test]
    fn test_container_config() {
        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!config.id.is_empty());
        assert_eq!(config.command, vec!["/bin/sh"]);
        assert!(config.use_gvisor);
    }

    #[test]
    fn test_run_uses_immediate_start_path() {
        // Locate the body of `run` by brace matching on the source text.
        let source = include_str!("runtime.rs");
        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
        let after = &source[fn_start..];
        let open = after.find('{').unwrap();
        let mut depth = 0u32;
        let mut fn_end = open;
        for (i, ch) in after[open..].char_indices() {
            match ch {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        fn_end = open + i + 1;
                        break;
                    }
                }
                _ => {}
            }
        }
        let run_body = &after[..fn_end];
        assert!(
            run_body.contains("create_internal(false)"),
            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
        );
        assert!(
            !run_body.contains("self.create()?.start()"),
            "run() must not route through create()+start()"
        );
    }

    #[test]
    fn test_container_config_with_name() {
        let config =
            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
                .unwrap();
        assert_eq!(config.name, "mycontainer");
        assert!(!config.id.is_empty());
        assert_ne!(config.id, config.name);
    }

    #[test]
    fn test_allow_degraded_security_requires_explicit_config() {
        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let relaxed = strict.clone().with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&relaxed));
    }

    #[test]
    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
        // Remember the previous value so it can be restored afterwards.
        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");

        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let explicit = strict.with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&explicit));

        match prev {
            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
        }
    }

    #[test]
    fn test_host_network_requires_explicit_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(false);
        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
        assert!(matches!(err, NucleusError::NetworkError(_)));
    }

    #[test]
    fn test_host_network_opt_in_disables_net_namespace() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(true);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(!config.namespaces.net);
    }

    #[test]
    fn test_non_host_network_does_not_require_host_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::None)
            .with_allow_host_network(false);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(config.namespaces.net);
    }

    #[test]
    fn test_parse_kernel_lockdown_mode() {
        assert_eq!(
            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
            Some(KernelLockdownMode::Integrity)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
            Some(KernelLockdownMode::Confidentiality)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("[none] integrity"),
            None
        );
    }

    #[test]
    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
        let temp = tempfile::TempDir::new().unwrap();
        let source = temp.path().join("source-secret");
        std::fs::write(&source, "supersecret").unwrap();

        let staged = Container::stage_gvisor_secret_files(
            &temp.path().join("stage"),
            &[crate::container::SecretMount {
                source: source.clone(),
                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
                mode: 0o400,
            }],
            &crate::container::ProcessIdentity::root(),
        )
        .unwrap();

        assert_eq!(staged.len(), 1);
        assert!(staged[0].source.starts_with(temp.path().join("stage")));
        assert_eq!(
            std::fs::read_to_string(&staged[0].source).unwrap(),
            "supersecret"
        );
    }

    #[test]
    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
        std::fs::create_dir_all(&artifact_dir).unwrap();
        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();

        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
        assert!(!artifact_dir.exists());
    }

    #[test]
    fn test_health_check_loop_supports_cancellation() {
        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
        // and check it between iterations for prompt shutdown.
        // Function lives in health.rs after the runtime split.
        let source = include_str!("health.rs");
        let fn_start = source.find("fn health_check_loop").unwrap();
        let fn_body = source_window(source, fn_start, 2500);
        assert!(
            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
            "health_check_loop must accept an AtomicBool cancellation flag"
        );
        // Must also check cancellation during sleep
        assert!(
            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
            "health_check_loop must check cancellation during sleep intervals"
        );
    }

    #[test]
    fn test_runtime_probes_do_not_spawn_host_nsenter() {
        // Both functions live in health.rs after the runtime split.
        let source = include_str!("health.rs");

        let readiness_start = source.find("fn run_readiness_probe").unwrap();
        let readiness_body = source_window(source, readiness_start, 2500);
        assert!(
            !readiness_body.contains("Command::new(&nsenter_bin)"),
            "readiness probes must not execute via host nsenter"
        );

        let health_start = source.find("fn health_check_loop").unwrap();
        let health_body = source_window(source, health_start, 2200);
        assert!(
            !health_body.contains("Command::new(&nsenter_bin)"),
            "health checks must not execute via host nsenter"
        );
    }

    #[test]
    fn test_oci_mount_strip_prefix_no_expect() {
        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
        // Function lives in gvisor_setup.rs after the runtime split.
        let source = include_str!("gvisor_setup.rs");
        let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
        let fn_body = source_window(source, fn_start, 600);
        assert!(
            !fn_body.contains(".expect("),
            "prepare_oci_mountpoints must not use expect() — return Err instead"
        );
    }

    #[test]
    fn test_notify_namespace_ready_validates_write_length() {
        // BUG-02: notify_namespace_ready must validate that all bytes were written
        let source = include_str!("runtime.rs");
        let fn_start = source.find("fn notify_namespace_ready").unwrap();
        let fn_body = source_window(source, fn_start, 500);
        // Must check the return value of write() for partial writes
        assert!(
            fn_body.contains("written")
                || fn_body.contains("4")
                || fn_body.contains("payload.len()"),
            "notify_namespace_ready must validate complete write of all 4 bytes"
        );
    }

    #[test]
    fn test_rlimit_failures_fatal_in_production() {
        // SEC-05: RLIMIT failures must be fatal in production mode
        let source = include_str!("runtime.rs");
        let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
        let rlimit_section = source_window(source, rlimit_start, 2000);
        assert!(
            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
            "RLIMIT failures must return Err in production mode"
        );
    }

    #[test]
    fn test_tcp_readiness_probe_uses_portable_check() {
        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
        // Function lives in health.rs after the runtime split.
        let source = include_str!("health.rs");
        let probe_fn = source.find("TcpPort(port)").unwrap();
        let probe_body = source_window(source, probe_fn, 500);
        assert!(
            !probe_body.contains("/dev/tcp"),
            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
        );
    }
}