nucleus/container/
runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4    ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10    switch_root, verify_context_manifest, verify_rootfs_attestation, FilesystemState,
11    LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::{NamespaceManager, UserNamespaceMapper};
14use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
18    SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{
25    chown, fork, pipe, read, setresgid, setresuid, write, ForkResult, Gid, Pid, Uid,
26};
27use std::os::fd::OwnedFd;
28use std::os::unix::fs::PermissionsExt;
29use std::path::PathBuf;
30use std::sync::atomic::{AtomicBool, Ordering};
31use std::sync::Arc;
32use std::thread::JoinHandle;
33use tempfile::Builder;
34use tracing::{debug, error, info, info_span, warn};
35
/// Container runtime that orchestrates all isolation mechanisms
///
/// Execution flow matches the formal specifications:
/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
/// 5. Execute target process
pub struct Container {
    /// Configuration this container was constructed with. The create path
    /// works on a clone of it, so adjustments made there (e.g. automatically
    /// enabling rootless mode) do not modify this value.
    pub(super) config: ContainerConfig,
    /// Pre-resolved runsc path, resolved before fork so that user-namespace
    /// UID changes don't block PATH-based lookup.
    ///
    /// `Container::new` leaves this as `None`; the instance built inside the
    /// forked child receives the resolved path when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
50
/// Handle returned by `Container::create()` representing a container whose
/// child process has been forked and is blocked on the exec FIFO, waiting for
/// `start()` to release it.
pub struct CreatedContainer {
    /// Effective configuration after create-time adjustments (automatic
    /// rootless / UID-remapping selection and the network/trust-level guards).
    pub(super) config: ContainerConfig,
    /// State manager used to persist `state`; honors the configured state root.
    pub(super) state_mgr: ContainerStateManager,
    /// State record that was saved with status `Created` during creation.
    pub(super) state: ContainerState,
    /// PID of the forked child as returned by `fork()`.
    pub(super) child: Pid,
    /// Cgroup the target process was attached to, or `None` when no cgroup
    /// could be created or its limits could not be applied.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge driver, present only when bridge networking was requested and
    /// its setup succeeded.
    pub(super) network_driver: Option<BridgeDriver>,
    /// Seccomp trace reader, if one was started for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Seccomp deny logger, if one was started for this container.
    pub(super) deny_logger: Option<SeccompDenyLogger>,
    /// Path of the exec FIFO; `Some` only for the two-phase create/start
    /// lifecycle, `None` when created through `run()`.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Clone of the `container.lifecycle` tracing span opened during creation.
    pub(super) _lifecycle_span: tracing::Span,
}
66
67impl Container {
68    pub fn new(config: ContainerConfig) -> Self {
69        Self {
70            config,
71            runsc_path: None,
72        }
73    }
74
75    /// Run the container (convenience wrapper: create + start)
76    pub fn run(&self) -> Result<i32> {
77        self.create_internal(false)?.start()
78    }
79
80    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
81    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
82    /// releases the child process.
83    pub fn create(&self) -> Result<CreatedContainer> {
84        self.create_internal(true)
85    }
86
87    /// H6: Close all file descriptors > 2 in the child process after fork.
88    ///
89    /// This prevents leaking host sockets, pipes, and state files into the
90    /// container. Uses close_range(2) when available, falls back to /proc/self/fd.
91    fn sanitize_fds() {
92        // Try close_range(3, u32::MAX, CLOSE_RANGE_CLOEXEC) first – it's
93        // O(1) on Linux 5.9+ and marks all FDs as close-on-exec.
94        const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
95        // SAFETY: close_range is a safe syscall that marks FDs as close-on-exec.
96        let ret =
97            unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
98        if ret == 0 {
99            return;
100        }
101        // Fallback: iterate /proc/self/fd and close individually.
102        // Collect fds first, then close – closing during iteration would
103        // invalidate the ReadDir's own directory fd.
104        if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
105            let fds: Vec<i32> = entries
106                .flatten()
107                .filter_map(|entry| entry.file_name().into_string().ok())
108                .filter_map(|s| s.parse::<i32>().ok())
109                .filter(|&fd| fd > 2)
110                .collect();
111            for fd in fds {
112                unsafe { libc::close(fd) };
113            }
114        }
115    }
116
117    pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
118        let thread_count = std::fs::read_to_string("/proc/self/status")
119            .ok()
120            .and_then(|s| {
121                s.lines()
122                    .find(|line| line.starts_with("Threads:"))
123                    .and_then(|line| line.split_whitespace().nth(1))
124                    .and_then(|count| count.parse::<u32>().ok())
125            });
126
127        if thread_count == Some(1) {
128            return Ok(());
129        }
130
131        Err(NucleusError::ExecError(format!(
132            "{} requires a single-threaded process before fork, found {:?} threads",
133            context, thread_count
134        )))
135    }
136
137    fn prepare_runtime_base_override(
138        config: &ContainerConfig,
139        host_is_root: bool,
140        needs_external_userns_mapping: bool,
141    ) -> Result<Option<PathBuf>> {
142        if !needs_external_userns_mapping {
143            return Ok(None);
144        }
145
146        if !host_is_root {
147            return Ok(Some(
148                dirs::runtime_dir()
149                    .map(|d| d.join("nucleus"))
150                    .unwrap_or_else(std::env::temp_dir),
151            ));
152        }
153
154        let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
155            NucleusError::ExecError("Missing user namespace configuration".to_string())
156        })?;
157        let host_uid =
158            Self::mapped_host_id_for_container_id(&user_config.uid_mappings, 0, "uid mappings")?;
159        let host_gid =
160            Self::mapped_host_id_for_container_id(&user_config.gid_mappings, 0, "gid mappings")?;
161
162        let root = PathBuf::from("/run/nucleus");
163        Self::ensure_runtime_parent_dir(&root)?;
164
165        let runtime_root = root.join("runtime");
166        Self::ensure_runtime_parent_dir(&runtime_root)?;
167
168        let base = runtime_root.join(&config.id);
169        std::fs::create_dir_all(&base).map_err(|e| {
170            NucleusError::FilesystemError(format!(
171                "Failed to create user namespace runtime base {:?}: {}",
172                base, e
173            ))
174        })?;
175        chown(
176            &base,
177            Some(Uid::from_raw(host_uid)),
178            Some(Gid::from_raw(host_gid)),
179        )
180        .map_err(|e| {
181            NucleusError::FilesystemError(format!(
182                "Failed to chown user namespace runtime base {:?} to {}:{}: {}",
183                base, host_uid, host_gid, e
184            ))
185        })?;
186        std::fs::set_permissions(&base, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
187            NucleusError::FilesystemError(format!(
188                "Failed to secure user namespace runtime base {:?}: {}",
189                base, e
190            ))
191        })?;
192
193        Ok(Some(base))
194    }
195
196    fn ensure_runtime_parent_dir(path: &std::path::Path) -> Result<()> {
197        std::fs::create_dir_all(path).map_err(|e| {
198            NucleusError::FilesystemError(format!(
199                "Failed to create runtime parent dir {:?}: {}",
200                path, e
201            ))
202        })?;
203        std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o711)).map_err(|e| {
204            NucleusError::FilesystemError(format!(
205                "Failed to secure runtime parent dir {:?}: {}",
206                path, e
207            ))
208        })?;
209        Ok(())
210    }
211
212    fn mapped_host_id_for_container_id(
213        mappings: &[crate::isolation::IdMapping],
214        container_id: u32,
215        label: &str,
216    ) -> Result<u32> {
217        for mapping in mappings {
218            let end = mapping
219                .container_id
220                .checked_add(mapping.count)
221                .ok_or_else(|| {
222                    NucleusError::ConfigError(format!(
223                        "{} overflow for container id {}",
224                        label, container_id
225                    ))
226                })?;
227            if container_id >= mapping.container_id && container_id < end {
228                return mapping
229                    .host_id
230                    .checked_add(container_id - mapping.container_id)
231                    .ok_or_else(|| {
232                        NucleusError::ConfigError(format!(
233                            "{} host id overflow for container id {}",
234                            label, container_id
235                        ))
236                    });
237            }
238        }
239
240        Err(NucleusError::ConfigError(format!(
241            "{} do not map container id {}",
242            label, container_id
243        )))
244    }
245
246    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
247        let lifecycle_span = info_span!(
248            "container.lifecycle",
249            container.id = %self.config.id,
250            container.name = %self.config.name,
251            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
252        );
253        let _enter = lifecycle_span.enter();
254
255        info!(
256            "Creating container: {} (ID: {})",
257            self.config.name, self.config.id
258        );
259        audit(
260            &self.config.id,
261            &self.config.name,
262            AuditEventType::ContainerStart,
263            format!(
264                "command={:?} mode={:?} runtime={}",
265                crate::audit::redact_command(&self.config.command),
266                self.config.service_mode,
267                if self.config.use_gvisor {
268                    "gvisor"
269                } else {
270                    "native"
271                }
272            ),
273        );
274
275        // Auto-detect if we need rootless mode
276        let is_root = nix::unistd::Uid::effective().is_root();
277        let mut config = self.config.clone();
278
279        if !is_root && config.user_ns_config.is_none() {
280            info!("Not running as root, automatically enabling rootless mode");
281            config.namespaces.user = true;
282            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
283        }
284
285        // C2: When running as root without user namespace, enable UID remapping
286        // in production mode (mandatory) or warn in other modes. Without user
287        // namespace, a container escape yields full host root.
288        if is_root && !config.namespaces.user {
289            if config.service_mode == ServiceMode::Production {
290                info!("Running as root in production mode: enabling user namespace with UID remapping");
291                config.namespaces.user = true;
292                config.user_ns_config =
293                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
294            } else {
295                warn!(
296                    "Running as root WITHOUT user namespace isolation. \
297                     Container processes will run as real host UID 0. \
298                     Use --user-ns or production mode for UID remapping."
299                );
300            }
301        }
302
303        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
304        if let Some(ref socket_path) = config.console_socket {
305            warn!(
306                "Console socket {} accepted but terminal forwarding is not yet implemented",
307                socket_path.display()
308            );
309        }
310
311        // Validate production mode invariants before anything else.
312        config.validate_production_mode()?;
313        Self::assert_kernel_lockdown(&config)?;
314
315        Self::apply_network_mode_guards(&mut config, is_root)?;
316        Self::apply_trust_level_guards(&mut config)?;
317        config.validate_runtime_support()?;
318
319        if let NetworkMode::Bridge(ref bridge_config) = config.network {
320            let backend =
321                bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
322            if backend == NatBackend::Kernel && !is_root {
323                return Err(NucleusError::NetworkError(
324                    "Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
325                        .to_string(),
326                ));
327            }
328        }
329
330        // Create state manager, honoring --root override if set
331        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;
332
333        // Enforce name uniqueness among running containers
334        if let Ok(all_states) = state_mgr.list_states() {
335            if all_states.iter().any(|s| s.name == config.name) {
336                return Err(NucleusError::ConfigError(format!(
337                    "A container named '{}' already exists; use a different --name, \
338                     or remove the stale state with 'nucleus delete'",
339                    config.name
340                )));
341            }
342        }
343
344        // Create exec FIFO only for the two-phase create/start lifecycle.
345        // `run()` starts immediately and avoids this cross-root-path sync.
346        let exec_fifo = if defer_exec_until_start {
347            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
348            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
349                NucleusError::ExecError(format!(
350                    "Failed to create exec FIFO {:?}: {}",
351                    exec_fifo, e
352                ))
353            })?;
354            Some(exec_fifo)
355        } else {
356            None
357        };
358
359        // Try to create cgroup (optional for rootless mode)
360        let cgroup_name = format!("nucleus-{}", config.id);
361        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
362            Ok(mut cgroup) => {
363                // Try to set limits
364                match cgroup.set_limits(&config.limits) {
365                    Ok(_) => {
366                        info!("Created cgroup with resource limits");
367                        Some(cgroup)
368                    }
369                    Err(e) => {
370                        if config.service_mode == ServiceMode::Production {
371                            let _ = cgroup.cleanup();
372                            return Err(NucleusError::CgroupError(format!(
373                                "Production mode requires cgroup resource enforcement, but \
374                                 applying limits failed: {}",
375                                e
376                            )));
377                        }
378                        warn!("Failed to set cgroup limits: {}", e);
379                        let _ = cgroup.cleanup();
380                        None
381                    }
382                }
383            }
384            Err(e) => {
385                if config.service_mode == ServiceMode::Production {
386                    return Err(NucleusError::CgroupError(format!(
387                        "Production mode requires cgroup resource enforcement, but \
388                         cgroup creation failed: {}",
389                        e
390                    )));
391                }
392
393                if config.user_ns_config.is_some() {
394                    if config.limits.memory_bytes.is_some()
395                        || config.limits.cpu_quota_us.is_some()
396                        || config.limits.pids_max.is_some()
397                    {
398                        warn!(
399                            "Running in rootless mode: requested resource limits cannot be \
400                             enforced – cgroup creation requires root ({})",
401                            e
402                        );
403                    } else {
404                        debug!("Running in rootless mode without cgroup resource limits");
405                    }
406                } else {
407                    warn!(
408                        "Failed to create cgroup (running without resource limits): {}",
409                        e
410                    );
411                }
412                None
413            }
414        };
415
416        // Resolve runsc path before fork, while still unprivileged.
417        let runsc_path = if config.use_gvisor {
418            Some(GVisorRuntime::resolve_path().map_err(|e| {
419                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
420            })?)
421        } else {
422            None
423        };
424        let gvisor_bridge_needs_userns_mapping = config.use_gvisor
425            && !is_root
426            && config.user_ns_config.is_some()
427            && matches!(config.network, NetworkMode::Bridge(_));
428        let needs_external_userns_mapping = config.user_ns_config.is_some()
429            && (!config.use_gvisor || gvisor_bridge_needs_userns_mapping);
430        let runtime_base_override =
431            Self::prepare_runtime_base_override(&config, is_root, needs_external_userns_mapping)?;
432
433        // Child notifies parent after namespaces are ready.
434        let (ready_read, ready_write) = pipe().map_err(|e| {
435            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
436        })?;
437        let userns_sync = if needs_external_userns_mapping {
438            let (request_read, request_write) = pipe().map_err(|e| {
439                NucleusError::ExecError(format!(
440                    "Failed to create user namespace request pipe: {}",
441                    e
442                ))
443            })?;
444            let (ack_read, ack_write) = pipe().map_err(|e| {
445                NucleusError::ExecError(format!("Failed to create user namespace ack pipe: {}", e))
446            })?;
447            Some((request_read, request_write, ack_read, ack_write))
448        } else {
449            None
450        };
451        let (attach_read, attach_write) = pipe().map_err(|e| {
452            NucleusError::ExecError(format!("Failed to create cgroup attach sync pipe: {}", e))
453        })?;
454
455        // M11: fork() in multi-threaded context. Flush log buffers and drop
456        // tracing guards before fork to minimize deadlock risk from locks held
457        // by other threads (tracing, allocator). The Tokio runtime is not yet
458        // started at this point, so async thread contention is not a concern.
459        Self::assert_single_threaded_for_fork("container create fork")?;
460        // SAFETY: fork() is called before any Tokio runtime is created.
461        // Only the main thread should be active at this point.
462        match unsafe { fork() }? {
463            ForkResult::Parent { child } => {
464                drop(ready_write);
465                drop(attach_read);
466                let (userns_request_read, userns_ack_write) =
467                    if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
468                        drop(request_write);
469                        drop(ack_read);
470                        (Some(request_read), Some(ack_write))
471                    } else {
472                        (None, None)
473                    };
474                info!("Forked child process: {}", child);
475
476                // Use a closure so that on any error we kill the child process
477                // instead of leaving it orphaned and blocked on the exec FIFO.
478                let parent_setup = || -> Result<CreatedContainer> {
479                    if needs_external_userns_mapping {
480                        let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
481                            NucleusError::ExecError(
482                                "Missing user namespace configuration in parent".to_string(),
483                            )
484                        })?;
485                        let request_read = userns_request_read.as_ref().ok_or_else(|| {
486                            NucleusError::ExecError(
487                                "Missing user namespace request pipe in parent".to_string(),
488                            )
489                        })?;
490                        let ack_write = userns_ack_write.as_ref().ok_or_else(|| {
491                            NucleusError::ExecError(
492                                "Missing user namespace ack pipe in parent".to_string(),
493                            )
494                        })?;
495
496                        Self::wait_for_sync_byte(
497                            request_read,
498                            &format!(
499                                "Child {} exited before requesting user namespace mappings",
500                                child
501                            ),
502                            "Failed waiting for child user namespace request",
503                        )?;
504                        UserNamespaceMapper::new(user_config.clone())
505                            .write_mappings_for_pid(child.as_raw() as u32)?;
506                        Self::send_sync_byte(
507                            ack_write,
508                            "Failed to notify child that user namespace mappings are ready",
509                        )?;
510                    }
511
512                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
513
514                    let cgroup_path = cgroup_opt
515                        .as_ref()
516                        .map(|cgroup| cgroup.path().display().to_string());
517                    let cpu_millicores = config
518                        .limits
519                        .cpu_quota_us
520                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
521                    let mut state = ContainerState::new(ContainerStateParams {
522                        id: config.id.clone(),
523                        name: config.name.clone(),
524                        pid: target_pid,
525                        command: config.command.clone(),
526                        memory_limit: config.limits.memory_bytes,
527                        cpu_limit: cpu_millicores,
528                        using_gvisor: config.use_gvisor,
529                        rootless: config.user_ns_config.is_some(),
530                        cgroup_path,
531                        process_uid: config.process_identity.uid,
532                        process_gid: config.process_identity.gid,
533                        additional_gids: config.process_identity.additional_gids.clone(),
534                    });
535                    state.config_hash = config.config_hash;
536                    state.bundle_path =
537                        config.rootfs_path.as_ref().map(|p| p.display().to_string());
538
539                    let mut network_driver: Option<BridgeDriver> = None;
540                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
541                    let deny_logger = Self::maybe_start_seccomp_deny_logger(&config, target_pid)?;
542
543                    // Transition: Creating -> Created
544                    state.status = OciStatus::Created;
545                    state_mgr.save_state(&state)?;
546
547                    // Write PID file (OCI --pid-file)
548                    if let Some(ref pid_path) = config.pid_file {
549                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
550                            NucleusError::ConfigError(format!(
551                                "Failed to write pid-file '{}': {}",
552                                pid_path.display(),
553                                e
554                            ))
555                        })?;
556                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
557                    }
558
559                    if let Some(ref mut cgroup) = cgroup_opt {
560                        cgroup.attach_process(target_pid)?;
561                    }
562                    Self::send_sync_byte(
563                        &attach_write,
564                        "Failed to notify child that cgroup attachment is complete",
565                    )?;
566
567                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
568                        match BridgeDriver::setup_with_id(
569                            target_pid,
570                            bridge_config,
571                            &config.id,
572                            is_root,
573                            config.user_ns_config.is_some(),
574                        ) {
575                            Ok(net) => {
576                                if let Some(ref egress) = config.egress_policy {
577                                    if let Err(e) = net.apply_egress_policy(
578                                        target_pid,
579                                        egress,
580                                        config.user_ns_config.is_some(),
581                                    ) {
582                                        if config.service_mode == ServiceMode::Production {
583                                            return Err(NucleusError::NetworkError(format!(
584                                                "Failed to apply egress policy: {}",
585                                                e
586                                            )));
587                                        }
588                                        warn!("Failed to apply egress policy: {}", e);
589                                    }
590                                }
591                                network_driver = Some(net);
592                            }
593                            Err(e) => {
594                                if config.service_mode == ServiceMode::Production {
595                                    return Err(e);
596                                }
597                                warn!("Failed to set up bridge networking: {}", e);
598                            }
599                        }
600                    }
601
602                    info!(
603                        "Container {} created (child pid {}), waiting for start",
604                        config.id, target_pid
605                    );
606
607                    Ok(CreatedContainer {
608                        config,
609                        state_mgr,
610                        state,
611                        child,
612                        cgroup_opt,
613                        network_driver,
614                        trace_reader,
615                        deny_logger,
616                        exec_fifo_path: exec_fifo,
617                        _lifecycle_span: lifecycle_span.clone(),
618                    })
619                };
620
621                parent_setup().map_err(|e| {
622                    // Kill the child so it doesn't remain orphaned and blocked
623                    // on the exec FIFO.
624                    let _ = kill(child, Signal::SIGKILL);
625                    let _ = waitpid(child, None);
626                    e
627                })
628            }
629            ForkResult::Child => {
630                drop(ready_read);
631                drop(attach_write);
632                let (userns_request_write, userns_ack_read) =
633                    if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
634                        drop(request_read);
635                        drop(ack_write);
636                        (Some(request_write), Some(ack_read))
637                    } else {
638                        (None, None)
639                    };
640                // H6: Close inherited FDs > 2 to prevent leaking host sockets/pipes
641                Self::sanitize_fds();
642                let temp_container = Container { config, runsc_path };
643                match temp_container.setup_and_exec(
644                    Some(ready_write),
645                    userns_request_write,
646                    userns_ack_read,
647                    Some(attach_read),
648                    exec_fifo,
649                    runtime_base_override,
650                ) {
651                    Ok(_) => unreachable!(),
652                    Err(e) => {
653                        error!("Container setup failed: {}", e);
654                        std::process::exit(1);
655                    }
656                }
657            }
658        }
659    }
660
661    /// Trigger a previously-created container to start by opening its exec FIFO.
662    /// Used by the CLI `start` command.
663    pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
664        let state_mgr = ContainerStateManager::new_with_root(state_root)?;
665        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
666        if !fifo_path.exists() {
667            return Err(NucleusError::ConfigError(format!(
668                "No exec FIFO found for container {}; is it in 'created' state?",
669                container_id
670            )));
671        }
672
673        // Opening the FIFO for reading unblocks the child's open-for-write.
674        let file = std::fs::File::open(&fifo_path)
675            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
676        let mut buf = [0u8; 1];
677        std::io::Read::read(&mut &file, &mut buf)
678            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
679        drop(file);
680
681        let _ = std::fs::remove_file(&fifo_path);
682
683        // Update state to Running
684        let mut state = state_mgr.resolve_container(container_id)?;
685        state.status = OciStatus::Running;
686        state_mgr.save_state(&state)?;
687
688        Ok(())
689    }
690
    /// Set up container environment and exec target process
    ///
    /// This runs in the child process after fork.
    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
    ///
    /// # Parameters
    ///
    /// * `ready_pipe` - if present, used to report a PID to the parent via
    ///   `notify_namespace_ready`: this process's PID, or the PID of the forked
    ///   PID-namespace child when `namespaces.pid` is set.
    /// * `userns_request_pipe` / `userns_ack_pipe` - handshake used when
    ///   `user_ns_config` is set: a sync byte is sent on the request pipe and a
    ///   sync byte is awaited on the ack pipe. Both are required in that case.
    /// * `cgroup_attach_pipe` - if present, setup waits for a sync byte on it
    ///   before continuing.
    /// * `exec_fifo` - if present, setup opens this FIFO for writing and writes
    ///   one byte after all hardening is applied and before the startContainer
    ///   hooks run.
    /// * `runtime_base_override` - if present, used as the base directory in
    ///   which the temporary container root is created.
    ///
    /// # Returns
    ///
    /// * With `use_gvisor`, the result of `setup_and_exec_gvisor`.
    /// * In production mode with a PID namespace, the result of `run_as_init`.
    /// * Otherwise `Ok(())` after `exec_command` returns (noted below as
    ///   "should never reach here").
    ///
    /// # Errors
    ///
    /// Every fallible step is propagated with `?`; the first failure aborts setup.
    fn setup_and_exec(
        &self,
        ready_pipe: Option<OwnedFd>,
        userns_request_pipe: Option<OwnedFd>,
        userns_ack_pipe: Option<OwnedFd>,
        cgroup_attach_pipe: Option<OwnedFd>,
        exec_fifo: Option<PathBuf>,
        runtime_base_override: Option<PathBuf>,
    ) -> Result<()> {
        let is_rootless = self.config.user_ns_config.is_some();
        let allow_degraded_security = Self::allow_degraded_security(&self.config);
        // Snapshot the context directory up front when integrity verification is
        // enabled; the snapshot is compared against the populated copy in step 6.
        let context_manifest = if self.config.verify_context_integrity {
            self.config
                .context_dir
                .as_ref()
                .map(|dir| snapshot_context_dir(dir))
                .transpose()?
        } else {
            None
        };

        // Initialize state machines
        let mut fs_state = FilesystemState::Unmounted;
        let mut sec_state = SecurityState::Privileged;

        // gVisor creates the container namespaces. Bridge mode is the exception:
        // Nucleus must hand slirp/port-forward setup a concrete target netns,
        // then runsc inherits that netns via --network host.
        if self.config.use_gvisor {
            let gvisor_bridge_precreated_userns =
                if matches!(self.config.network, NetworkMode::Bridge(_)) {
                    self.prepare_gvisor_bridge_namespace(
                        userns_request_pipe.as_ref(),
                        userns_ack_pipe.as_ref(),
                    )?
                } else {
                    false
                };

            // Report our own PID to the parent, then wait until the parent
            // signals on the cgroup attach pipe before handing over to gVisor.
            if let Some(fd) = ready_pipe {
                Self::notify_namespace_ready(&fd, std::process::id())?;
            }
            if let Some(fd) = cgroup_attach_pipe.as_ref() {
                Self::wait_for_sync_byte(
                    fd,
                    "Parent closed cgroup attach pipe before signalling gVisor child",
                    "Failed waiting for cgroup attach acknowledgement",
                )?;
            }
            return self.setup_and_exec_gvisor(gvisor_bridge_precreated_userns);
        }

        // 1. Create namespaces in child and optionally configure user mapping.
        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
        namespace_mgr.unshare_namespaces()?;
        if self.config.user_ns_config.is_some() {
            let request_fd = userns_request_pipe.as_ref().ok_or_else(|| {
                NucleusError::ExecError(
                    "Missing user namespace request pipe in container child".to_string(),
                )
            })?;
            let ack_fd = userns_ack_pipe.as_ref().ok_or_else(|| {
                NucleusError::ExecError(
                    "Missing user namespace acknowledgement pipe in container child".to_string(),
                )
            })?;

            // Handshake: ask the parent to write the mappings, then block until
            // it acknowledges.
            Self::send_sync_byte(
                request_fd,
                "Failed to request user namespace mappings from parent",
            )?;
            Self::wait_for_sync_byte(
                ack_fd,
                "Parent closed user namespace ack pipe before mappings were written",
                "Failed waiting for parent to finish user namespace mappings",
            )?;
            Self::become_userns_root_for_setup()?;
        }

        // CLONE_NEWPID only applies to children created after unshare().
        // Create a child that will become PID 1 in the new namespace and exec the workload.
        if self.config.namespaces.pid {
            Self::assert_single_threaded_for_fork("PID namespace init fork")?;
            match unsafe { fork() }? {
                ForkResult::Parent { child } => {
                    // The intermediate process reports the new child's PID and
                    // then only waits for it, exiting with the code returned by
                    // `wait_for_pid_namespace_child`.
                    if let Some(fd) = ready_pipe {
                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
                    }
                    std::process::exit(Self::wait_for_pid_namespace_child(child));
                }
                ForkResult::Child => {
                    if let Some(fd) = cgroup_attach_pipe.as_ref() {
                        Self::wait_for_sync_byte(
                            fd,
                            "Parent closed cgroup attach pipe before signalling PID 1 child",
                            "Failed waiting for cgroup attach acknowledgement",
                        )?;
                    }
                    // Continue container setup as PID 1 in the new namespace.
                }
            }
        } else {
            // No PID namespace: this process itself is the container process.
            if let Some(fd) = ready_pipe {
                Self::notify_namespace_ready(&fd, std::process::id())?;
            }
            if let Some(fd) = cgroup_attach_pipe.as_ref() {
                Self::wait_for_sync_byte(
                    fd,
                    "Parent closed cgroup attach pipe before signalling container child",
                    "Failed waiting for cgroup attach acknowledgement",
                )?;
            }
        }

        // Namespace: Unshared -> Entered (process is now inside all namespaces)
        namespace_mgr.enter()?;

        // 2. Ensure no_new_privs BEFORE any mount operations.
        // This prevents exploitation of setuid binaries on bind-mounted paths
        // even if a subsequent MS_NOSUID remount fails.
        self.enforce_no_new_privs()?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::NoNewPrivsSet,
            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
        );

        // 3. Set hostname if one is configured.
        // NOTE(review): only `config.hostname` is checked here; presumably config
        // validation guarantees the UTS namespace is enabled - confirm.
        if let Some(hostname) = &self.config.hostname {
            namespace_mgr.set_hostname(hostname)?;
        }

        // 4. Mount tmpfs as container root
        // Filesystem: Unmounted -> Mounted
        // Use a private runtime directory instead of /tmp to avoid symlink
        // attacks and information disclosure on multi-user systems.
        // NOTE(review): when no runtime dir is available for a non-root user the
        // fallback below is `std::env::temp_dir()` itself (no "nucleus" subdir).
        let runtime_base = if let Some(path) = runtime_base_override {
            path
        } else if nix::unistd::Uid::effective().is_root() {
            PathBuf::from("/run/nucleus")
        } else {
            dirs::runtime_dir()
                .map(|d| d.join("nucleus"))
                .unwrap_or_else(std::env::temp_dir)
        };
        // Best effort: a failure here is ignored and surfaces (without its root
        // cause) as a tempdir creation error just below.
        let _ = std::fs::create_dir_all(&runtime_base);
        let runtime_dir = Builder::new()
            .prefix("nucleus-runtime-")
            .tempdir_in(&runtime_base)
            .map_err(|e| {
                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
            })?;
        let container_root = runtime_dir.path().to_path_buf();
        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
        tmpfs.mount()?;
        fs_state = fs_state.transition(FilesystemState::Mounted)?;

        // 4b. Create minimal filesystem structure
        create_minimal_fs(&container_root)?;

        // 5. Create device nodes and standard tmpfs mounts under /dev
        let dev_path = container_root.join("dev");
        create_dev_nodes(&dev_path, false)?;

        // /dev/shm – POSIX shared memory (shm_open). Required by PostgreSQL,
        // Redis, and other programs that use POSIX shared memory segments.
        let shm_path = dev_path.join("shm");
        std::fs::create_dir_all(&shm_path).map_err(|e| {
            NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
        })?;
        nix::mount::mount(
            Some("shm"),
            &shm_path,
            Some("tmpfs"),
            nix::mount::MsFlags::MS_NOSUID
                | nix::mount::MsFlags::MS_NODEV
                | nix::mount::MsFlags::MS_NOEXEC,
            Some("mode=1777,size=64m"),
        )
        .map_err(|e| {
            NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
        })?;
        debug!("Mounted tmpfs on /dev/shm");

        // 6. Populate context if provided
        // Filesystem: Mounted -> Populated
        if let Some(context_dir) = &self.config.context_dir {
            let context_dest = container_root.join("context");
            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
            // Compare the populated copy against the snapshot taken at entry.
            if let Some(expected) = &context_manifest {
                verify_context_manifest(expected, &context_dest)?;
            }
        }
        // The transition is taken even when no context directory is configured.
        fs_state = fs_state.transition(FilesystemState::Populated)?;

        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
        if let Some(ref rootfs_path) = self.config.rootfs_path {
            if self.config.verify_rootfs_attestation {
                verify_rootfs_attestation(rootfs_path)?;
            }
            bind_mount_rootfs(&container_root, rootfs_path)?;
        } else {
            bind_mount_host_paths(&container_root, is_rootless)?;
        }

        // 7b. Mount persistent or ephemeral volumes over the base filesystem.
        mount_volumes(&container_root, &self.config.volumes)?;

        // 7c. Write resolv.conf for bridge networking.
        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
        // resolv.conf over the top (same technique as secrets).
        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
            // With the userspace NAT backend and no explicit DNS servers, fall
            // back to the backend's default DNS server for the bridge subnet.
            let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
                == NatBackend::Userspace
                && bridge_config.dns.is_empty()
            {
                vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
            } else {
                bridge_config.dns.clone()
            };
            if self.config.rootfs_path.is_some() {
                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
            } else {
                BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
            }
        }

        // 7d. Mount secrets on an in-memory tmpfs in all modes.
        mount_secrets_inmemory(
            &container_root,
            &self.config.secrets,
            &self.config.process_identity,
        )?;

        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
        let proc_path = container_root.join("proc");
        let hide_pids = self.config.service_mode == ServiceMode::Production;
        mount_procfs(
            &proc_path,
            is_rootless,
            self.config.proc_readonly,
            hide_pids,
        )?;

        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
        // SEC-06: In production mode, failures to mask critical paths are fatal.
        mask_proc_paths(
            &proc_path,
            self.config.service_mode == ServiceMode::Production,
        )?;

        // 9. Run createRuntime hooks (after namespaces created, before pivot_root)
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_runtime.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Creating,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
            }
        }

        // 10. Switch root filesystem
        // Filesystem: Populated -> Pivoted
        switch_root(&container_root, self.config.allow_chroot_fallback)?;
        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
        debug!("Filesystem state: {:?}", fs_state);

        // 10b. Audit mount flags to verify filesystem hardening invariants
        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::MountAuditPassed,
            "all mount flags verified",
        );

        // 10c. Run createContainer hooks (after pivot_root, before start)
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Created,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
            }
        }

        // 11. Drop capabilities and switch identity (Docker/runc convention).
        //
        // The identity switch (setuid/setgid) must happen between two cap phases:
        //   Phase 1: drop bounding set (needs CAP_SETPCAP), clear ambient/inheritable
        //   Identity: setgroups/setgid/setuid (needs CAP_SETUID/CAP_SETGID)
        //   Phase 2: clear permitted/effective (or kernel auto-clears on setuid)
        //
        // Custom cap policies (drop_except / apply_sets) do their own full drop,
        // so the two-phase approach only applies to the default drop-all path.
        let mut cap_mgr = CapabilityManager::new();
        if let Some(ref policy_path) = self.config.caps_policy {
            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.caps_policy_sha256.as_deref(),
            )?;
            // H3: Reject dangerous capabilities in production mode
            if self.config.service_mode == ServiceMode::Production {
                policy.validate_production()?;
            }
            policy.apply(&mut cap_mgr)?;
            // Identity switch after custom policy (caps may already be restricted)
            Self::apply_process_identity_to_current_process(
                &self.config.process_identity,
                self.config.user_ns_config.is_some(),
            )?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                format!("capability policy applied from {:?}", policy_path),
            );
        } else {
            // Phase 1: drop bounding set while CAP_SETPCAP is still effective
            cap_mgr.drop_bounding_set()?;

            // Identity switch: setgroups/setgid/setuid while CAP_SETUID/CAP_SETGID
            // are still in the effective set. For non-root target UIDs, the kernel
            // auto-clears permitted/effective after setuid().
            Self::apply_process_identity_to_current_process(
                &self.config.process_identity,
                self.config.user_ns_config.is_some(),
            )?;

            // Phase 2: explicitly clear any remaining caps (handles root-stays-root
            // case where kernel doesn't auto-clear).
            cap_mgr.finalize_drop()?;

            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                "all capabilities dropped including bounding set",
            );
        }
        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;

        // 12. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
        // SEC-05: In production mode, RLIMIT failures are fatal – a container
        // without resource limits is a privilege escalation vector.
        // NOTE(review): RLIMIT failures are reported as `NucleusError::SeccompError`;
        // confirm whether a more specific error variant exists.
        {
            let is_production = self.config.service_mode == ServiceMode::Production;

            // RLIMIT_NPROC is only set when a pids limit is configured.
            if let Some(nproc_limit) = self.config.limits.pids_max {
                let rlim_nproc = libc::rlimit {
                    rlim_cur: nproc_limit,
                    rlim_max: nproc_limit,
                };
                // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
                if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
                    let err = std::io::Error::last_os_error();
                    if is_production {
                        return Err(NucleusError::SeccompError(format!(
                            "Failed to set RLIMIT_NPROC to {} in production mode: {}",
                            nproc_limit, err
                        )));
                    }
                    warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
                }
            }

            // RLIMIT_NOFILE is always set to a fixed soft and hard limit of 1024.
            let rlim_nofile = libc::rlimit {
                rlim_cur: 1024,
                rlim_max: 1024,
            };
            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
                        err
                    )));
                }
                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
            }

            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
            // memory via mlock(). Default 64KB matches unprivileged default, but
            // in a user namespace the container appears as UID 0 and may have a
            // higher inherited limit. Configurable via --memlock for io_uring etc.
            let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
            let rlim_memlock = libc::rlimit {
                rlim_cur: memlock_limit,
                rlim_max: memlock_limit,
            };
            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
                        memlock_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
            }
        }

        // 12b. Verify that namespace-creating capabilities are truly gone before
        // installing seccomp. clone3 is allowed without argument filtering, so this
        // is the sole guard against namespace escape via clone3.
        CapabilityManager::verify_no_namespace_caps(
            self.config.service_mode == ServiceMode::Production,
        )?;

        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
        // Security: CapabilitiesDropped -> SeccompApplied
        use crate::container::config::SeccompMode;
        let mut seccomp_mgr = SeccompManager::new();
        let allow_network = !matches!(self.config.network, NetworkMode::None);
        let seccomp_applied = match self.config.seccomp_mode {
            SeccompMode::Trace => {
                audit(
                    &self.config.id,
                    &self.config.name,
                    AuditEventType::SeccompApplied,
                    "seccomp trace mode: allow-all + LOG",
                );
                seccomp_mgr.apply_trace_filter()?
            }
            SeccompMode::Enforce => {
                // An explicit profile file takes precedence over the built-in
                // network-mode-based filter.
                if let Some(ref profile_path) = self.config.seccomp_profile {
                    audit(
                        &self.config.id,
                        &self.config.name,
                        AuditEventType::SeccompProfileLoaded,
                        format!("path={:?}", profile_path),
                    );
                    seccomp_mgr.apply_profile_from_file(
                        profile_path,
                        self.config.seccomp_profile_sha256.as_deref(),
                        self.config.seccomp_log_denied,
                    )?
                } else {
                    seccomp_mgr.apply_filter_for_network_mode(
                        allow_network,
                        allow_degraded_security,
                        self.config.seccomp_log_denied,
                        &self.config.seccomp_allow_syscalls,
                    )?
                }
            }
        };
        if seccomp_applied {
            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::SeccompApplied,
                format!("network={}", allow_network),
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::SeccompError(
                "Seccomp filter is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Seccomp not enforced; container is running with degraded hardening");
        }

        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.landlock_policy_sha256.as_deref(),
            )?;
            // H4: Reject write+execute on same path in production
            if self.config.service_mode == ServiceMode::Production {
                policy.validate_production()?;
            }
            policy.apply(allow_degraded_security)?
        } else {
            let mut landlock_mgr = LandlockManager::new();
            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
            // Register volume mount destinations so Landlock permits access to them
            for vol in &self.config.volumes {
                landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
            }
            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
        };
        // The security state only advances past SeccompApplied when both seccomp
        // and Landlock were applied; it is only marked Locked when seccomp is not
        // in trace mode.
        if seccomp_applied && landlock_applied {
            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
            if self.config.seccomp_mode == SeccompMode::Trace {
                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
            } else {
                sec_state = sec_state.transition(SecurityState::Locked)?;
            }
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::LandlockApplied,
                if self.config.seccomp_mode == SeccompMode::Trace {
                    "landlock applied, but seccomp in trace mode – not locked".to_string()
                } else {
                    "security state locked: all hardening layers active".to_string()
                },
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::LandlockError(
                "Landlock policy is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Security state not locked; one or more hardening controls are inactive");
        }
        debug!("Security state: {:?}", sec_state);

        // 14b. Block on exec FIFO until start() opens it for reading.
        // This implements the OCI two-phase create/start: all container setup
        // is complete, but the user process doesn't exec until explicitly started.
        if let Some(ref fifo_path) = exec_fifo {
            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
            let file = std::fs::OpenOptions::new()
                .write(true)
                .open(fifo_path)
                .map_err(|e| {
                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
                })?;
            // A single zero byte is the start acknowledgement sent to the reader.
            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
            })?;
            drop(file);
            debug!("Exec FIFO released, proceeding to exec");
        }

        // 14c. Run startContainer hooks (after start signal, before user process exec)
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.start_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Running,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
            }
        }

        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
        // that reaps zombies and forwards signals, rather than exec-ing directly.
        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
            return self.run_as_init();
        }

        // 15b. Agent mode: exec target process directly
        self.exec_command()?;

        // Should never reach here
        Ok(())
    }
1260
1261    /// Forward selected signals to child process using sigwait (no async signal handlers).
1262    ///
1263    /// Returns a stop flag and join handle. Set the flag to `true` and join
1264    /// the handle to cleanly shut down the forwarding thread.
1265    pub(super) fn setup_signal_forwarding_static(
1266        child: Pid,
1267    ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
1268        let mut set = SigSet::empty();
1269        for signal in [
1270            Signal::SIGTERM,
1271            Signal::SIGINT,
1272            Signal::SIGHUP,
1273            Signal::SIGQUIT,
1274            Signal::SIGUSR1,
1275            Signal::SIGUSR2,
1276        ] {
1277            set.add(signal);
1278        }
1279
1280        let unblock_set = set;
1281        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
1282            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
1283        })?;
1284
1285        let stop = Arc::new(AtomicBool::new(false));
1286        let stop_clone = stop.clone();
1287        let handle = std::thread::Builder::new()
1288            .name("sig-forward".to_string())
1289            .spawn(move || {
1290                // The thread owns unblock_set and uses it for sigwait.
1291                loop {
1292                    if let Ok(signal) = unblock_set.wait() {
1293                        // Check the stop flag *after* waking so that the
1294                        // wake-up signal (SIGUSR1) is not forwarded to the
1295                        // child during shutdown.
1296                        if stop_clone.load(Ordering::Relaxed) {
1297                            break;
1298                        }
1299                        let _ = kill(child, signal);
1300                    }
1301                }
1302            })
1303            .map_err(|e| {
1304                // Restore the signal mask so the caller isn't left with
1305                // signals permanently blocked.
1306                let mut restore = SigSet::empty();
1307                for signal in [
1308                    Signal::SIGTERM,
1309                    Signal::SIGINT,
1310                    Signal::SIGHUP,
1311                    Signal::SIGQUIT,
1312                    Signal::SIGUSR1,
1313                    Signal::SIGUSR2,
1314                ] {
1315                    restore.add(signal);
1316                }
1317                let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
1318                NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
1319            })?;
1320
1321        info!("Signal forwarding configured");
1322        Ok((stop, handle))
1323    }
1324
1325    /// Wait for child process to exit
1326    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
1327        loop {
1328            match waitpid(child, None) {
1329                Ok(WaitStatus::Exited(_, code)) => {
1330                    return Ok(code);
1331                }
1332                Ok(WaitStatus::Signaled(_, signal, _)) => {
1333                    info!("Child killed by signal: {:?}", signal);
1334                    return Ok(128 + signal as i32);
1335                }
1336                Err(nix::errno::Errno::EINTR) => {
1337                    continue;
1338                }
1339                Err(e) => {
1340                    return Err(NucleusError::ExecError(format!(
1341                        "Failed to wait for child: {}",
1342                        e
1343                    )));
1344                }
1345                _ => {
1346                    continue;
1347                }
1348            }
1349        }
1350    }
1351
1352    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1353        let mut pid_buf = [0u8; 4];
1354        loop {
1355            match read(ready_read, &mut pid_buf) {
1356                Err(nix::errno::Errno::EINTR) => continue,
1357                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1358                Ok(0) => {
1359                    return Err(NucleusError::ExecError(format!(
1360                        "Child {} exited before namespace initialization",
1361                        child
1362                    )))
1363                }
1364                Ok(_) => {
1365                    return Err(NucleusError::ExecError(
1366                        "Invalid namespace sync payload from child".to_string(),
1367                    ))
1368                }
1369                Err(e) => {
1370                    return Err(NucleusError::ExecError(format!(
1371                        "Failed waiting for child namespace setup: {}",
1372                        e
1373                    )))
1374                }
1375            }
1376        }
1377    }
1378
1379    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1380        let payload = pid.to_ne_bytes();
1381        let mut written = 0;
1382        while written < payload.len() {
1383            let n = write(fd, &payload[written..]).map_err(|e| {
1384                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1385            })?;
1386            if n == 0 {
1387                return Err(NucleusError::ExecError(
1388                    "Failed to notify namespace readiness: short write".to_string(),
1389                ));
1390            }
1391            written += n;
1392        }
1393        Ok(())
1394    }
1395
1396    fn send_sync_byte(fd: &OwnedFd, error_context: &str) -> Result<()> {
1397        let mut written = 0;
1398        let payload = [1u8];
1399        while written < payload.len() {
1400            let n = write(fd, &payload[written..])
1401                .map_err(|e| NucleusError::ExecError(format!("{}: {}", error_context, e)))?;
1402            if n == 0 {
1403                return Err(NucleusError::ExecError(format!(
1404                    "{}: short write",
1405                    error_context
1406                )));
1407            }
1408            written += n;
1409        }
1410        Ok(())
1411    }
1412
1413    fn wait_for_sync_byte(fd: &OwnedFd, eof_context: &str, error_context: &str) -> Result<()> {
1414        let mut payload = [0u8; 1];
1415        loop {
1416            match read(fd, &mut payload) {
1417                Err(nix::errno::Errno::EINTR) => continue,
1418                Ok(1) => return Ok(()),
1419                Ok(0) => return Err(NucleusError::ExecError(eof_context.to_string())),
1420                Ok(_) => {
1421                    return Err(NucleusError::ExecError(format!(
1422                        "{}: invalid sync payload",
1423                        error_context
1424                    )))
1425                }
1426                Err(e) => return Err(NucleusError::ExecError(format!("{}: {}", error_context, e))),
1427            }
1428        }
1429    }
1430
1431    fn become_userns_root_for_setup() -> Result<()> {
1432        setresgid(Gid::from_raw(0), Gid::from_raw(0), Gid::from_raw(0)).map_err(|e| {
1433            NucleusError::NamespaceError(format!(
1434                "Failed to become gid 0 inside mapped user namespace: {}",
1435                e
1436            ))
1437        })?;
1438        setresuid(Uid::from_raw(0), Uid::from_raw(0), Uid::from_raw(0)).map_err(|e| {
1439            NucleusError::NamespaceError(format!(
1440                "Failed to become uid 0 inside mapped user namespace: {}",
1441                e
1442            ))
1443        })?;
1444        debug!("Switched setup process to uid/gid 0 inside mapped user namespace");
1445        Ok(())
1446    }
1447
1448    fn prepare_gvisor_bridge_namespace(
1449        &self,
1450        userns_request_pipe: Option<&OwnedFd>,
1451        userns_ack_pipe: Option<&OwnedFd>,
1452    ) -> Result<bool> {
1453        let mut precreated_userns = false;
1454        if self.config.user_ns_config.is_some() && !Uid::effective().is_root() {
1455            nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWUSER).map_err(|e| {
1456                NucleusError::NamespaceError(format!(
1457                    "Failed to unshare gVisor bridge user namespace: {}",
1458                    e
1459                ))
1460            })?;
1461
1462            let request_fd = userns_request_pipe.ok_or_else(|| {
1463                NucleusError::ExecError(
1464                    "Missing user namespace request pipe in gVisor bridge child".to_string(),
1465                )
1466            })?;
1467            let ack_fd = userns_ack_pipe.ok_or_else(|| {
1468                NucleusError::ExecError(
1469                    "Missing user namespace acknowledgement pipe in gVisor bridge child"
1470                        .to_string(),
1471                )
1472            })?;
1473
1474            Self::send_sync_byte(
1475                request_fd,
1476                "Failed to request gVisor bridge user namespace mappings from parent",
1477            )?;
1478            Self::wait_for_sync_byte(
1479                ack_fd,
1480                "Parent closed user namespace ack pipe before gVisor bridge mappings were written",
1481                "Failed waiting for parent to finish gVisor bridge user namespace mappings",
1482            )?;
1483            Self::become_userns_root_for_setup()?;
1484            precreated_userns = true;
1485        }
1486
1487        nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNET).map_err(|e| {
1488            NucleusError::NamespaceError(format!(
1489                "Failed to unshare gVisor bridge network namespace: {}",
1490                e
1491            ))
1492        })?;
1493        Ok(precreated_userns)
1494    }
1495
1496    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1497        loop {
1498            match waitpid(child, None) {
1499                Ok(WaitStatus::Exited(_, code)) => return code,
1500                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1501                Err(nix::errno::Errno::EINTR) => continue,
1502                Err(_) => return 1,
1503                _ => continue,
1504            }
1505        }
1506    }
1507}
1508
1509impl CreatedContainer {
1510    /// Start phase: release the child via the exec FIFO, transition to Running,
1511    /// then wait for the child to exit with full lifecycle management.
1512    pub fn start(mut self) -> Result<i32> {
1513        let config = &self.config;
1514        let _enter = self._lifecycle_span.enter();
1515
1516        // Open the exec FIFO for reading – this unblocks the child's
1517        // blocking open-for-write, allowing it to proceed to exec.
1518        if let Some(exec_fifo_path) = &self.exec_fifo_path {
1519            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
1520                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
1521            })?;
1522            let mut buf = [0u8; 1];
1523            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
1524                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
1525            })?;
1526            if read != 1 {
1527                return Err(NucleusError::ExecError(
1528                    "Exec FIFO closed before start signal was delivered".to_string(),
1529                ));
1530            }
1531            let _ = std::fs::remove_file(exec_fifo_path);
1532        }
1533
1534        // Transition: Created -> Running
1535        self.state.status = OciStatus::Running;
1536        self.state_mgr.save_state(&self.state)?;
1537
1538        let target_pid = self.state.pid;
1539        let child = self.child;
1540
1541        let (sig_stop, sig_handle) =
1542            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1543
1544        // Guard ensures signal thread is stopped on any exit path (including early ? returns).
1545        let mut sig_guard = SignalThreadGuard {
1546            stop: Some(sig_stop),
1547            handle: Some(sig_handle),
1548        };
1549
1550        // Run readiness probe before declaring service ready
1551        if let Some(ref probe) = config.readiness_probe {
1552            let notify_socket = if config.sd_notify {
1553                std::env::var("NOTIFY_SOCKET").ok()
1554            } else {
1555                None
1556            };
1557            Container::run_readiness_probe(
1558                target_pid,
1559                &config.name,
1560                probe,
1561                config.user_ns_config.is_some(),
1562                config.use_gvisor,
1563                &config.process_identity,
1564                notify_socket.as_deref(),
1565            )?;
1566        }
1567
1568        // Start health check thread if configured
1569        let cancel_flag = Arc::new(AtomicBool::new(false));
1570        let health_handle = if let Some(ref hc) = config.health_check {
1571            if !hc.command.is_empty() {
1572                let hc = hc.clone();
1573                let pid = target_pid;
1574                let container_name = config.name.clone();
1575                let rootless = config.user_ns_config.is_some();
1576                let using_gvisor = config.use_gvisor;
1577                let process_identity = config.process_identity.clone();
1578                let cancel = cancel_flag.clone();
1579                Some(std::thread::spawn(move || {
1580                    Container::health_check_loop(
1581                        pid,
1582                        &container_name,
1583                        rootless,
1584                        using_gvisor,
1585                        &hc,
1586                        &process_identity,
1587                        &cancel,
1588                    );
1589                }))
1590            } else {
1591                None
1592            }
1593        } else {
1594            None
1595        };
1596
1597        // Guard ensures health check thread is cancelled on any exit path.
1598        let mut health_guard = HealthThreadGuard {
1599            cancel: Some(cancel_flag),
1600            handle: health_handle,
1601        };
1602
1603        // Run poststart hooks (after user process started, in parent)
1604        if let Some(ref hooks) = config.hooks {
1605            if !hooks.poststart.is_empty() {
1606                let hook_state = OciContainerState {
1607                    oci_version: "1.0.2".to_string(),
1608                    id: config.id.clone(),
1609                    status: OciStatus::Running,
1610                    pid: target_pid,
1611                    bundle: String::new(),
1612                };
1613                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1614            }
1615        }
1616
1617        let mut child_waited = false;
1618        let run_result: Result<i32> = (|| {
1619            let exit_code = Container::wait_for_child_static(child)?;
1620
1621            // Transition: Running -> Stopped
1622            self.state.status = OciStatus::Stopped;
1623            let _ = self.state_mgr.save_state(&self.state);
1624
1625            child_waited = true;
1626            Ok(exit_code)
1627        })();
1628
1629        // Explicitly stop threads (guards would do this on drop too, but
1630        // explicit teardown keeps ordering visible).
1631        health_guard.stop();
1632        sig_guard.stop();
1633
1634        // Run poststop hooks (best-effort)
1635        if let Some(ref hooks) = config.hooks {
1636            if !hooks.poststop.is_empty() {
1637                let hook_state = OciContainerState {
1638                    oci_version: "1.0.2".to_string(),
1639                    id: config.id.clone(),
1640                    status: OciStatus::Stopped,
1641                    pid: 0,
1642                    bundle: String::new(),
1643                };
1644                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1645            }
1646        }
1647
1648        if let Some(net) = self.network_driver.take() {
1649            if let Err(e) = net.cleanup() {
1650                warn!("Failed to cleanup container networking: {}", e);
1651            }
1652        }
1653
1654        if !child_waited {
1655            let _ = kill(child, Signal::SIGKILL);
1656            let _ = waitpid(child, None);
1657        }
1658
1659        if let Some(reader) = self.trace_reader.take() {
1660            reader.stop_and_flush();
1661        }
1662
1663        if let Some(logger) = self.deny_logger.take() {
1664            logger.stop();
1665        }
1666
1667        if let Some(cgroup) = self.cgroup_opt.take() {
1668            if let Err(e) = cgroup.cleanup() {
1669                warn!("Failed to cleanup cgroup: {}", e);
1670            }
1671        }
1672
1673        if config.use_gvisor {
1674            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1675                warn!(
1676                    "Failed to cleanup gVisor artifacts for {}: {}",
1677                    config.id, e
1678                );
1679            }
1680        }
1681
1682        if let Err(e) = self.state_mgr.delete_state(&config.id) {
1683            warn!("Failed to delete state for {}: {}", config.id, e);
1684        }
1685
1686        match run_result {
1687            Ok(exit_code) => {
1688                audit(
1689                    &config.id,
1690                    &config.name,
1691                    AuditEventType::ContainerStop,
1692                    format!("exit_code={}", exit_code),
1693                );
1694                info!(
1695                    "Container {} ({}) exited with code {}",
1696                    config.name, config.id, exit_code
1697                );
1698                Ok(exit_code)
1699            }
1700            Err(e) => {
1701                audit_error(
1702                    &config.id,
1703                    &config.name,
1704                    AuditEventType::ContainerStop,
1705                    format!("error={}", e),
1706                );
1707                Err(e)
1708            }
1709        }
1710    }
1711}
1712
1713/// RAII guard that stops the signal-forwarding thread on drop.
1714struct SignalThreadGuard {
1715    stop: Option<Arc<AtomicBool>>,
1716    handle: Option<JoinHandle<()>>,
1717}
1718
1719impl SignalThreadGuard {
1720    fn stop(&mut self) {
1721        if let Some(flag) = self.stop.take() {
1722            flag.store(true, Ordering::Relaxed);
1723            // Unblock the sigwait() call so the thread can observe the stop flag.
1724            let _ = kill(Pid::this(), Signal::SIGUSR1);
1725        }
1726        if let Some(handle) = self.handle.take() {
1727            let _ = handle.join();
1728        }
1729    }
1730}
1731
1732impl Drop for SignalThreadGuard {
1733    fn drop(&mut self) {
1734        self.stop();
1735    }
1736}
1737
/// Cancels the health-check thread when dropped.
///
/// `cancel` is the flag handed to the thread and `handle` is its join handle
/// (absent when no health check was started). Both are `Option`s so that
/// shutting down twice is a no-op.
struct HealthThreadGuard {
    cancel: Option<Arc<AtomicBool>>,
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Raises the cancel flag and then joins the thread, if there is one.
    fn stop(&mut self) {
        let (flag, worker) = (self.cancel.take(), self.handle.take());

        if let Some(flag) = flag {
            flag.store(true, Ordering::Relaxed);
        }

        if let Some(worker) = worker {
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    fn drop(&mut self) {
        self.stop();
    }
}
1760
1761#[cfg(test)]
1762mod tests {
1763    use super::*;
1764    use crate::container::KernelLockdownMode;
1765    use crate::network::NetworkMode;
1766    use std::ffi::OsString;
1767    use std::sync::{Mutex, MutexGuard};
1768
1769    static ENV_LOCK: Mutex<()> = Mutex::new(());
1770
1771    struct EnvLock {
1772        _guard: MutexGuard<'static, ()>,
1773    }
1774
1775    impl EnvLock {
1776        fn acquire() -> Self {
1777            Self {
1778                _guard: ENV_LOCK.lock().unwrap(),
1779            }
1780        }
1781    }
1782
1783    struct EnvVarGuard {
1784        key: &'static str,
1785        previous: Option<OsString>,
1786    }
1787
1788    impl EnvVarGuard {
1789        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
1790            let previous = std::env::var_os(key);
1791            std::env::set_var(key, value);
1792            Self { key, previous }
1793        }
1794
1795        fn remove(key: &'static str) -> Self {
1796            let previous = std::env::var_os(key);
1797            std::env::remove_var(key);
1798            Self { key, previous }
1799        }
1800    }
1801
1802    impl Drop for EnvVarGuard {
1803        fn drop(&mut self) {
1804            match &self.previous {
1805                Some(value) => std::env::set_var(self.key, value),
1806                None => std::env::remove_var(self.key),
1807            }
1808        }
1809    }
1810
1811    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
1812        let fn_start = source
1813            .find(fn_signature)
1814            .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
1815        let after = &source[fn_start..];
1816        let open = after
1817            .find('{')
1818            .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
1819        let mut depth = 0u32;
1820        let mut end = open;
1821        for (i, ch) in after[open..].char_indices() {
1822            match ch {
1823                '{' => depth += 1,
1824                '}' => {
1825                    depth -= 1;
1826                    if depth == 0 {
1827                        end = open + i + 1;
1828                        break;
1829                    }
1830                }
1831                _ => {}
1832            }
1833        }
1834        &after[..end]
1835    }
1836
1837    #[test]
1838    fn test_container_config() {
1839        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1840        assert!(!config.id.is_empty());
1841        assert_eq!(config.command, vec!["/bin/sh"]);
1842        assert!(config.use_gvisor);
1843    }
1844
1845    #[test]
1846    fn test_run_uses_immediate_start_path() {
1847        let source = include_str!("runtime.rs");
1848        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
1849        let after = &source[fn_start..];
1850        let open = after.find('{').unwrap();
1851        let mut depth = 0u32;
1852        let mut fn_end = open;
1853        for (i, ch) in after[open..].char_indices() {
1854            match ch {
1855                '{' => depth += 1,
1856                '}' => {
1857                    depth -= 1;
1858                    if depth == 0 {
1859                        fn_end = open + i + 1;
1860                        break;
1861                    }
1862                }
1863                _ => {}
1864            }
1865        }
1866        let run_body = &after[..fn_end];
1867        assert!(
1868            run_body.contains("create_internal(false)"),
1869            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
1870        );
1871        assert!(
1872            !run_body.contains("self.create()?.start()"),
1873            "run() must not route through create()+start()"
1874        );
1875    }
1876
1877    #[test]
1878    fn test_container_config_with_name() {
1879        let config =
1880            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1881                .unwrap();
1882        assert_eq!(config.name, "mycontainer");
1883        assert!(!config.id.is_empty());
1884        assert_ne!(config.id, config.name);
1885    }
1886
1887    #[test]
1888    fn test_allow_degraded_security_requires_explicit_config() {
1889        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1890        assert!(!Container::allow_degraded_security(&strict));
1891
1892        let relaxed = strict.clone().with_allow_degraded_security(true);
1893        assert!(Container::allow_degraded_security(&relaxed));
1894    }
1895
1896    #[test]
1897    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1898        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1899        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1900
1901        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1902        assert!(!Container::allow_degraded_security(&strict));
1903
1904        let explicit = strict.with_allow_degraded_security(true);
1905        assert!(Container::allow_degraded_security(&explicit));
1906
1907        match prev {
1908            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1909            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1910        }
1911    }
1912
1913    #[test]
1914    fn test_host_network_requires_explicit_opt_in() {
1915        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1916            .unwrap()
1917            .with_network(NetworkMode::Host)
1918            .with_allow_host_network(false);
1919        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1920        assert!(matches!(err, NucleusError::NetworkError(_)));
1921    }
1922
1923    #[test]
1924    fn test_host_network_opt_in_disables_net_namespace() {
1925        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1926            .unwrap()
1927            .with_network(NetworkMode::Host)
1928            .with_allow_host_network(true);
1929        assert!(config.namespaces.net);
1930        Container::apply_network_mode_guards(&mut config, true).unwrap();
1931        assert!(!config.namespaces.net);
1932    }
1933
1934    #[test]
1935    fn test_non_host_network_does_not_require_host_opt_in() {
1936        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1937            .unwrap()
1938            .with_network(NetworkMode::None)
1939            .with_allow_host_network(false);
1940        assert!(config.namespaces.net);
1941        Container::apply_network_mode_guards(&mut config, true).unwrap();
1942        assert!(config.namespaces.net);
1943    }
1944
1945    #[test]
1946    fn test_parse_kernel_lockdown_mode() {
1947        assert_eq!(
1948            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
1949            Some(KernelLockdownMode::Integrity)
1950        );
1951        assert_eq!(
1952            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
1953            Some(KernelLockdownMode::Confidentiality)
1954        );
1955        assert_eq!(
1956            Container::parse_active_lockdown_mode("[none] integrity"),
1957            None
1958        );
1959    }
1960
1961    #[test]
1962    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
1963        let temp = tempfile::TempDir::new().unwrap();
1964        let source = temp.path().join("source-secret");
1965        std::fs::write(&source, "supersecret").unwrap();
1966
1967        let staged = Container::stage_gvisor_secret_files(
1968            &temp.path().join("stage"),
1969            &[crate::container::SecretMount {
1970                source: source.clone(),
1971                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1972                mode: 0o400,
1973            }],
1974            &crate::container::ProcessIdentity::root(),
1975        )
1976        .unwrap();
1977
1978        assert_eq!(staged.len(), 1);
1979        assert!(staged[0].source.starts_with(temp.path().join("stage")));
1980        assert_eq!(
1981            std::fs::read_to_string(&staged[0].source).unwrap(),
1982            "supersecret"
1983        );
1984    }
1985
1986    #[test]
1987    fn test_stage_gvisor_secret_files_rejects_symlink_source() {
1988        use std::os::unix::fs::symlink;
1989
1990        let temp = tempfile::TempDir::new().unwrap();
1991        let source = temp.path().join("source-secret");
1992        let link = temp.path().join("source-link");
1993        std::fs::write(&source, "supersecret").unwrap();
1994        symlink(&source, &link).unwrap();
1995
1996        let err = Container::stage_gvisor_secret_files(
1997            &temp.path().join("stage"),
1998            &[crate::container::SecretMount {
1999                source: link,
2000                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
2001                mode: 0o400,
2002            }],
2003            &crate::container::ProcessIdentity::root(),
2004        )
2005        .unwrap_err();
2006
2007        assert!(
2008            err.to_string().contains("O_NOFOLLOW"),
2009            "gVisor secret staging must reject symlink sources"
2010        );
2011    }
2012
2013    #[test]
2014    fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
2015        let source = include_str!("runtime.rs");
2016        let fn_body = extract_fn_body(source, "fn setup_and_exec");
2017        assert!(
2018            fn_body.contains("mount_secrets_inmemory("),
2019            "setup_and_exec must use in-memory secret mounting"
2020        );
2021        assert!(
2022            !fn_body.contains("mount_secrets(&"),
2023            "setup_and_exec must not bind-mount secrets from the host"
2024        );
2025    }
2026
2027    #[test]
2028    fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
2029        let source = include_str!("gvisor_setup.rs");
2030        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2031        assert!(
2032            fn_body.contains("with_inmemory_secret_mounts"),
2033            "gVisor setup must use the tmpfs-backed secret staging path"
2034        );
2035        assert!(
2036            !fn_body.contains("with_secret_mounts"),
2037            "gVisor setup must not bind-mount host secret paths"
2038        );
2039    }
2040
2041    #[test]
2042    fn test_gvisor_bridge_precreated_userns_skips_nested_oci_userns() {
2043        let source = include_str!("gvisor_setup.rs");
2044        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2045        let precreated_check = fn_body.find("if precreated_userns").unwrap();
2046        let oci_userns = fn_body.find("with_rootless_user_namespace").unwrap();
2047        assert!(
2048            precreated_check < oci_userns,
2049            "pre-created rootless bridge userns must skip nested OCI user namespace setup"
2050        );
2051    }
2052
2053    #[test]
2054    fn test_gvisor_bridge_precreated_userns_disables_oci_no_new_privileges() {
2055        let source = include_str!("gvisor_setup.rs");
2056        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2057        assert!(
2058            fn_body.contains("if precreated_userns")
2059                && fn_body.contains("with_no_new_privileges(false)"),
2060            "pre-created rootless bridge userns must not pass OCI noNewPrivileges to runsc"
2061        );
2062    }
2063
2064    #[test]
2065    fn test_gvisor_bridge_rootless_requests_external_userns_mapping() {
2066        let source = include_str!("runtime.rs");
2067        let create_body = extract_fn_body(source, "fn create_internal");
2068        assert!(
2069            create_body.contains("let gvisor_bridge_needs_userns_mapping"),
2070            "gVisor bridge rootless setup must request parent-written userns mappings"
2071        );
2072        assert!(
2073            create_body.contains("matches!(config.network, NetworkMode::Bridge(_))"),
2074            "external mapping request must be scoped to gVisor bridge networking"
2075        );
2076    }
2077
2078    #[test]
2079    fn test_gvisor_bridge_namespace_creates_userns_before_netns() {
2080        let source = include_str!("runtime.rs");
2081        let fn_body = extract_fn_body(source, "fn prepare_gvisor_bridge_namespace");
2082        let userns = fn_body.find("CLONE_NEWUSER").unwrap();
2083        let request = fn_body.find("send_sync_byte").unwrap();
2084        let become_root = fn_body.find("become_userns_root_for_setup").unwrap();
2085        let netns = fn_body.find("CLONE_NEWNET").unwrap();
2086        assert!(
2087            userns < request && request < become_root && become_root < netns,
2088            "rootless gVisor bridge setup must map userns before creating the netns"
2089        );
2090    }
2091
2092    #[test]
2093    fn test_native_fork_sites_assert_single_threaded() {
2094        let runtime_source = include_str!("runtime.rs");
2095        let create_body = extract_fn_body(runtime_source, "fn create_internal");
2096        assert!(
2097            create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
2098            "create_internal must assert single-threaded before fork"
2099        );
2100
2101        let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
2102        assert!(
2103            setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
2104            "PID namespace setup must assert single-threaded before fork"
2105        );
2106
2107        let exec_source = include_str!("exec.rs");
2108        let init_body = extract_fn_body(exec_source, "fn run_as_init");
2109        assert!(
2110            init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
2111            "run_as_init must assert single-threaded before fork"
2112        );
2113    }
2114
2115    #[test]
2116    fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
2117        let source = include_str!("exec.rs");
2118        let fn_body = extract_fn_body(source, "fn run_as_init");
2119        assert!(
2120            !fn_body.contains("Self::apply_process_identity_to_current_process("),
2121            "run_as_init must not drop identity before the supervisor fork"
2122        );
2123        assert!(
2124            fn_body.contains("self.exec_command()?"),
2125            "workload child must still route through exec_command for identity application"
2126        );
2127    }
2128
2129    #[test]
2130    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
2131        let _env_lock = EnvLock::acquire();
2132        let temp = tempfile::TempDir::new().unwrap();
2133        let _artifact_base = EnvVarGuard::set(
2134            "NUCLEUS_GVISOR_ARTIFACT_BASE",
2135            temp.path().join("gvisor-artifacts"),
2136        );
2137        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
2138        std::fs::create_dir_all(&artifact_dir).unwrap();
2139        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
2140
2141        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
2142        assert!(!artifact_dir.exists());
2143    }
2144
2145    #[test]
2146    fn test_gvisor_artifact_base_prefers_xdg_runtime_dir() {
2147        let _env_lock = EnvLock::acquire();
2148        let temp = tempfile::TempDir::new().unwrap();
2149        let _artifact_override = EnvVarGuard::remove("NUCLEUS_GVISOR_ARTIFACT_BASE");
2150        let _runtime = EnvVarGuard::set("XDG_RUNTIME_DIR", temp.path());
2151
2152        assert_eq!(
2153            Container::gvisor_artifact_dir("xdg-test"),
2154            temp.path().join("nucleus-gvisor").join("xdg-test")
2155        );
2156    }
2157
2158    #[test]
2159    fn test_health_check_loop_supports_cancellation() {
2160        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
2161        // and check it between iterations for prompt shutdown.
2162        // Function lives in health.rs after the runtime split.
2163        let source = include_str!("health.rs");
2164        let fn_start = source.find("fn health_check_loop").unwrap();
2165        let fn_body = &source[fn_start..fn_start + 2500];
2166        assert!(
2167            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
2168            "health_check_loop must accept an AtomicBool cancellation flag"
2169        );
2170        // Must also check cancellation during sleep
2171        assert!(
2172            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
2173            "health_check_loop must check cancellation during sleep intervals"
2174        );
2175    }
2176
2177    #[test]
2178    fn test_runtime_probes_do_not_spawn_host_nsenter() {
2179        // Both functions live in health.rs after the runtime split.
2180        let source = include_str!("health.rs");
2181
2182        let readiness_start = source.find("fn run_readiness_probe").unwrap();
2183        let readiness_body = &source[readiness_start..readiness_start + 2500];
2184        assert!(
2185            !readiness_body.contains("Command::new(&nsenter_bin)"),
2186            "readiness probes must not execute via host nsenter"
2187        );
2188
2189        let health_start = source.find("fn health_check_loop").unwrap();
2190        let health_body = &source[health_start..health_start + 2200];
2191        assert!(
2192            !health_body.contains("Command::new(&nsenter_bin)"),
2193            "health checks must not execute via host nsenter"
2194        );
2195    }
2196
2197    #[test]
2198    fn test_oci_mount_strip_prefix_no_expect() {
2199        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
2200        // Function lives in gvisor_setup.rs after the runtime split.
2201        let source = include_str!("gvisor_setup.rs");
2202        let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
2203        let fn_body = &source[fn_start..fn_start + 600];
2204        assert!(
2205            !fn_body.contains(".expect("),
2206            "prepare_oci_mountpoints must not use expect() – return Err instead"
2207        );
2208    }
2209
2210    #[test]
2211    fn test_notify_namespace_ready_validates_write_length() {
2212        // BUG-02: notify_namespace_ready must validate that all bytes were written
2213        let source = include_str!("runtime.rs");
2214        let fn_start = source.find("fn notify_namespace_ready").unwrap();
2215        let fn_body = &source[fn_start..fn_start + 500];
2216        // Must check the return value of write() for partial writes
2217        assert!(
2218            fn_body.contains("written")
2219                || fn_body.contains("4")
2220                || fn_body.contains("payload.len()"),
2221            "notify_namespace_ready must validate complete write of all 4 bytes"
2222        );
2223    }
2224
2225    #[test]
2226    fn test_rlimit_failures_fatal_in_production() {
2227        // SEC-05: RLIMIT failures must be fatal in production mode
2228        let source = include_str!("runtime.rs");
2229        let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
2230        let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
2231        assert!(
2232            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
2233            "RLIMIT failures must return Err in production mode"
2234        );
2235    }
2236
2237    #[test]
2238    fn test_tcp_readiness_probe_uses_portable_check() {
2239        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
2240        // Function lives in health.rs after the runtime split.
2241        let source = include_str!("health.rs");
2242        let probe_fn = source.find("TcpPort(port)").unwrap();
2243        let probe_body = &source[probe_fn..probe_fn + 500];
2244        assert!(
2245            !probe_body.contains("/dev/tcp"),
2246            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
2247        );
2248    }
2249}
nucleus/container/runtime.rs

nucleus/container/
runtime.rs