Skip to main content

nucleus/container/
runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4    ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10    switch_root, validate_production_rootfs_path, verify_context_manifest,
11    verify_rootfs_attestation, FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::{NamespaceManager, UserNamespaceMapper};
14use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
18    SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{
25    chown, fork, pipe, read, setresgid, setresuid, write, ForkResult, Gid, Pid, Uid,
26};
27use std::os::fd::OwnedFd;
28use std::os::unix::fs::PermissionsExt;
29use std::path::PathBuf;
30use std::sync::atomic::{AtomicBool, Ordering};
31use std::sync::Arc;
32use std::thread::JoinHandle;
33use tempfile::Builder;
34use tracing::{debug, error, info, info_span, warn};
35
/// Container runtime that orchestrates all isolation mechanisms
///
/// A `Container` only holds configuration. `run()` and `create()` clone that
/// configuration, fork a child process, and drive the setup from there.
///
/// Execution flow matches the formal specifications:
/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
/// 5. Execute target process
pub struct Container {
    /// Configuration the container is built from. `create_internal` works on
    /// a clone, so adjustments made during creation do not modify this copy.
    pub(super) config: ContainerConfig,
    /// Pre-resolved runsc path, resolved before fork so that user-namespace
    /// UID changes don't block PATH-based lookup.
    ///
    /// `None` on instances built by `new()`. It is filled in on the temporary
    /// `Container` constructed in the forked child when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
50
/// Handle returned by `Container::create()` representing a container whose
/// child process has been forked and is blocked on the exec FIFO, waiting for
/// `start()` to release it.
///
/// The same handle is produced internally by `Container::run()`; in that case
/// no exec FIFO is created and `exec_fifo_path` is `None`.
pub struct CreatedContainer {
    /// Effective configuration: the caller's config after the adjustments
    /// applied during creation (e.g. automatic user-namespace enablement).
    pub(super) config: ContainerConfig,
    /// State manager opened with the configured state root; used to persist `state`.
    pub(super) state_mgr: ContainerStateManager,
    /// Container state as last saved during creation (status `Created`).
    pub(super) state: ContainerState,
    /// PID of the directly forked child. The PID recorded in `state` is the
    /// one the child reported over the readiness pipe.
    pub(super) child: Pid,
    /// Cgroup holding the container process, or `None` when cgroup creation or
    /// limit application failed outside production mode.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network driver; `Some` only when bridge networking was requested
    /// and its setup succeeded.
    pub(super) network_driver: Option<BridgeDriver>,
    /// Seccomp trace reader, if `maybe_start_seccomp_trace_reader` started one.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Seccomp deny logger, if `maybe_start_seccomp_deny_logger` started one.
    pub(super) deny_logger: Option<SeccompDenyLogger>,
    /// Path of the exec FIFO; `Some` only for the two-phase create/start lifecycle.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Clone of the `container.lifecycle` tracing span opened during creation.
    pub(super) _lifecycle_span: tracing::Span,
}
66
67impl Container {
68    pub fn new(config: ContainerConfig) -> Self {
69        Self {
70            config,
71            runsc_path: None,
72        }
73    }
74
75    /// Run the container (convenience wrapper: create + start)
76    pub fn run(&self) -> Result<i32> {
77        self.create_internal(false)?.start()
78    }
79
80    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
81    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
82    /// releases the child process.
83    pub fn create(&self) -> Result<CreatedContainer> {
84        self.create_internal(true)
85    }
86
87    /// H6: Close all file descriptors > 2 in the child process after fork.
88    ///
89    /// This prevents leaking host sockets, pipes, and state files into the
90    /// container. Uses close_range(2) when available, falls back to /proc/self/fd.
91    fn sanitize_fds() {
92        // Try close_range(3, u32::MAX, CLOSE_RANGE_CLOEXEC) first – it's
93        // O(1) on Linux 5.9+ and marks all FDs as close-on-exec.
94        const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
95        // SAFETY: close_range is a safe syscall that marks FDs as close-on-exec.
96        let ret =
97            unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
98        if ret == 0 {
99            return;
100        }
101        // Fallback: iterate /proc/self/fd and close individually.
102        // Collect fds first, then close – closing during iteration would
103        // invalidate the ReadDir's own directory fd.
104        if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
105            let fds: Vec<i32> = entries
106                .flatten()
107                .filter_map(|entry| entry.file_name().into_string().ok())
108                .filter_map(|s| s.parse::<i32>().ok())
109                .filter(|&fd| fd > 2)
110                .collect();
111            for fd in fds {
112                unsafe { libc::close(fd) };
113            }
114        }
115    }
116
117    pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
118        let thread_count = std::fs::read_to_string("/proc/self/status")
119            .ok()
120            .and_then(|s| {
121                s.lines()
122                    .find(|line| line.starts_with("Threads:"))
123                    .and_then(|line| line.split_whitespace().nth(1))
124                    .and_then(|count| count.parse::<u32>().ok())
125            });
126
127        if thread_count == Some(1) {
128            return Ok(());
129        }
130
131        Err(NucleusError::ExecError(format!(
132            "{} requires a single-threaded process before fork, found {:?} threads",
133            context, thread_count
134        )))
135    }
136
137    fn prepare_runtime_base_override(
138        config: &ContainerConfig,
139        host_is_root: bool,
140        needs_external_userns_mapping: bool,
141    ) -> Result<Option<PathBuf>> {
142        if !needs_external_userns_mapping {
143            return Ok(None);
144        }
145
146        if !host_is_root {
147            return Ok(Some(
148                dirs::runtime_dir()
149                    .map(|d| d.join("nucleus"))
150                    .unwrap_or_else(std::env::temp_dir),
151            ));
152        }
153
154        let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
155            NucleusError::ExecError("Missing user namespace configuration".to_string())
156        })?;
157        let host_uid =
158            Self::mapped_host_id_for_container_id(&user_config.uid_mappings, 0, "uid mappings")?;
159        let host_gid =
160            Self::mapped_host_id_for_container_id(&user_config.gid_mappings, 0, "gid mappings")?;
161
162        let root = PathBuf::from("/run/nucleus");
163        Self::ensure_runtime_parent_dir(&root)?;
164
165        let runtime_root = root.join("runtime");
166        Self::ensure_runtime_parent_dir(&runtime_root)?;
167
168        let base = runtime_root.join(&config.id);
169        std::fs::create_dir_all(&base).map_err(|e| {
170            NucleusError::FilesystemError(format!(
171                "Failed to create user namespace runtime base {:?}: {}",
172                base, e
173            ))
174        })?;
175        chown(
176            &base,
177            Some(Uid::from_raw(host_uid)),
178            Some(Gid::from_raw(host_gid)),
179        )
180        .map_err(|e| {
181            NucleusError::FilesystemError(format!(
182                "Failed to chown user namespace runtime base {:?} to {}:{}: {}",
183                base, host_uid, host_gid, e
184            ))
185        })?;
186        std::fs::set_permissions(&base, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
187            NucleusError::FilesystemError(format!(
188                "Failed to secure user namespace runtime base {:?}: {}",
189                base, e
190            ))
191        })?;
192
193        Ok(Some(base))
194    }
195
196    fn ensure_runtime_parent_dir(path: &std::path::Path) -> Result<()> {
197        std::fs::create_dir_all(path).map_err(|e| {
198            NucleusError::FilesystemError(format!(
199                "Failed to create runtime parent dir {:?}: {}",
200                path, e
201            ))
202        })?;
203        std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o711)).map_err(|e| {
204            NucleusError::FilesystemError(format!(
205                "Failed to secure runtime parent dir {:?}: {}",
206                path, e
207            ))
208        })?;
209        Ok(())
210    }
211
212    fn mapped_host_id_for_container_id(
213        mappings: &[crate::isolation::IdMapping],
214        container_id: u32,
215        label: &str,
216    ) -> Result<u32> {
217        for mapping in mappings {
218            let end = mapping
219                .container_id
220                .checked_add(mapping.count)
221                .ok_or_else(|| {
222                    NucleusError::ConfigError(format!(
223                        "{} overflow for container id {}",
224                        label, container_id
225                    ))
226                })?;
227            if container_id >= mapping.container_id && container_id < end {
228                return mapping
229                    .host_id
230                    .checked_add(container_id - mapping.container_id)
231                    .ok_or_else(|| {
232                        NucleusError::ConfigError(format!(
233                            "{} host id overflow for container id {}",
234                            label, container_id
235                        ))
236                    });
237            }
238        }
239
240        Err(NucleusError::ConfigError(format!(
241            "{} do not map container id {}",
242            label, container_id
243        )))
244    }
245
    /// Shared implementation behind `run()` and `create()`.
    ///
    /// `defer_exec_until_start` selects the lifecycle: `true` creates an exec
    /// FIFO that is handed to the child (two-phase create/start), `false`
    /// creates none (immediate `run()`).
    ///
    /// Steps, in order:
    /// 1. Open the lifecycle span and emit the `ContainerStart` audit record.
    /// 2. Clone the config and adjust the clone: rootless user namespace when
    ///    not root; root-remapped user namespace when root in production mode.
    /// 3. Validate: production invariants and rootfs path, kernel lockdown,
    ///    network-mode and trust-level guards, runtime support, and the
    ///    bridge NAT backend.
    /// 4. Open the state manager and reject a duplicate container name.
    /// 5. Create the exec FIFO (if requested), the cgroup, resolve the runsc
    ///    path, prepare the runtime base directory, and create the sync pipes.
    /// 6. Verify the process is single-threaded, then fork.
    ///
    /// Parent after fork: optionally writes the user-namespace mappings for
    /// the child, waits for the child to report its target PID, saves state
    /// as `Created`, writes the pid file, attaches the PID to the cgroup,
    /// starts the seccomp helpers, sets up bridge networking, and finally
    /// signals the child over the parent-setup pipe. If any of these steps
    /// fails, the target PID (if known) and the forked child are killed and
    /// the child is reaped.
    ///
    /// Child after fork: sanitizes inherited file descriptors and calls
    /// `setup_and_exec`; if that returns an error the child logs it and exits
    /// with status 1.
    ///
    /// # Errors
    /// Returns the first error from validation, state-manager access, FIFO or
    /// pipe creation, cgroup setup (production mode only), runsc resolution,
    /// the single-thread check, `fork`, or any parent-side setup step.
    ///
    /// NOTE(review): nothing visible here removes the exec FIFO, cleans up
    /// the cgroup, or deletes the saved state / pid file when a later step
    /// fails — confirm whether callers or `Drop` impls handle that.
    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
        let lifecycle_span = info_span!(
            "container.lifecycle",
            container.id = %self.config.id,
            container.name = %self.config.name,
            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
        );
        let _enter = lifecycle_span.enter();

        info!(
            "Creating container: {} (ID: {})",
            self.config.name, self.config.id
        );
        // Audit record is emitted before validation; the command is passed
        // through `redact_command` before being formatted.
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::ContainerStart,
            format!(
                "command={:?} mode={:?} runtime={}",
                crate::audit::redact_command(&self.config.command),
                self.config.service_mode,
                if self.config.use_gvisor {
                    "gvisor"
                } else {
                    "native"
                }
            ),
        );

        // Auto-detect if we need rootless mode
        let is_root = nix::unistd::Uid::effective().is_root();
        // All adjustments below are made on this clone; `self.config` stays untouched.
        let mut config = self.config.clone();

        if !is_root && config.user_ns_config.is_none() {
            info!("Not running as root, automatically enabling rootless mode");
            config.namespaces.user = true;
            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
        }

        // C2: When running as root without user namespace, enable UID remapping
        // in production mode (mandatory) or warn in other modes. Without user
        // namespace, a container escape yields full host root.
        if is_root && !config.namespaces.user {
            if config.service_mode == ServiceMode::Production {
                info!("Running as root in production mode: enabling user namespace with UID remapping");
                config.namespaces.user = true;
                config.user_ns_config =
                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
            } else {
                warn!(
                    "Running as root WITHOUT user namespace isolation. \
                     Container processes will run as real host UID 0. \
                     Use --user-ns or production mode for UID remapping."
                );
            }
        }

        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
        if let Some(ref socket_path) = config.console_socket {
            warn!(
                "Console socket {} accepted but terminal forwarding is not yet implemented",
                socket_path.display()
            );
        }

        // Validate production mode invariants before anything else.
        config.validate_production_mode()?;
        if config.service_mode == ServiceMode::Production {
            let rootfs_path = config.rootfs_path.as_ref().ok_or_else(|| {
                NucleusError::ConfigError(
                    "Production mode requires explicit --rootfs path (no host bind mounts)"
                        .to_string(),
                )
            })?;
            // Replace the configured path with the validated one returned by the check.
            config.rootfs_path = Some(validate_production_rootfs_path(rootfs_path)?);
        }
        Self::assert_kernel_lockdown(&config)?;

        // The guards take `&mut config`, so they may rewrite parts of it.
        Self::apply_network_mode_guards(&mut config, is_root)?;
        Self::apply_trust_level_guards(&mut config)?;
        config.validate_runtime_support()?;

        // Fail early if the selected NAT backend needs root and we are not root.
        if let NetworkMode::Bridge(ref bridge_config) = config.network {
            let backend =
                bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
            if backend == NatBackend::Kernel && !is_root {
                return Err(NucleusError::NetworkError(
                    "Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
                        .to_string(),
                ));
            }
        }

        // Create state manager, honoring --root override if set
        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;

        // Enforce name uniqueness against every state returned by
        // `list_states()`. If listing fails, the check is skipped.
        if let Ok(all_states) = state_mgr.list_states() {
            if all_states.iter().any(|s| s.name == config.name) {
                return Err(NucleusError::ConfigError(format!(
                    "A container named '{}' already exists; use a different --name, \
                     or remove the stale state with 'nucleus delete'",
                    config.name
                )));
            }
        }

        // Create exec FIFO only for the two-phase create/start lifecycle.
        // The immediate `run()` lifecycle is still gated by parent_setup_pipe
        // below; it just does not need an externally-triggered start FIFO.
        let exec_fifo = if defer_exec_until_start {
            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
            // FIFO is readable/writable by the owner only (0600).
            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create exec FIFO {:?}: {}",
                    exec_fifo, e
                ))
            })?;
            Some(exec_fifo)
        } else {
            None
        };

        // Try to create the cgroup. Failure is fatal in production mode;
        // in every other mode the container continues without a cgroup.
        let cgroup_name = format!("nucleus-{}", config.id);
        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
            Ok(mut cgroup) => {
                // Try to set limits
                match cgroup.set_limits(&config.limits) {
                    Ok(_) => {
                        info!("Created cgroup with resource limits");
                        Some(cgroup)
                    }
                    Err(e) => {
                        // Either way the half-configured cgroup is cleaned up
                        // (best effort) before giving up on it.
                        if config.service_mode == ServiceMode::Production {
                            let _ = cgroup.cleanup();
                            return Err(NucleusError::CgroupError(format!(
                                "Production mode requires cgroup resource enforcement, but \
                                 applying limits failed: {}",
                                e
                            )));
                        }
                        warn!("Failed to set cgroup limits: {}", e);
                        let _ = cgroup.cleanup();
                        None
                    }
                }
            }
            Err(e) => {
                if config.service_mode == ServiceMode::Production {
                    return Err(NucleusError::CgroupError(format!(
                        "Production mode requires cgroup resource enforcement, but \
                         cgroup creation failed: {}",
                        e
                    )));
                }

                // Outside production the failure is only logged; the log level
                // depends on whether any limit was actually requested.
                if config.user_ns_config.is_some() {
                    if config.limits.memory_bytes.is_some()
                        || config.limits.cpu_quota_us.is_some()
                        || config.limits.pids_max.is_some()
                    {
                        warn!(
                            "Running in rootless mode: requested resource limits cannot be \
                             enforced – cgroup creation requires root ({})",
                            e
                        );
                    } else {
                        debug!("Running in rootless mode without cgroup resource limits");
                    }
                } else {
                    warn!(
                        "Failed to create cgroup (running without resource limits): {}",
                        e
                    );
                }
                None
            }
        };

        // Resolve runsc path before fork, while still unprivileged.
        let runsc_path = if config.use_gvisor {
            Some(GVisorRuntime::resolve_path().map_err(|e| {
                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
            })?)
        } else {
            None
        };
        // The parent writes the uid/gid mappings itself for every native
        // container with a user namespace, and for gVisor only in the
        // rootless + bridge-network combination.
        let gvisor_bridge_needs_userns_mapping = config.use_gvisor
            && !is_root
            && config.user_ns_config.is_some()
            && matches!(config.network, NetworkMode::Bridge(_));
        let needs_external_userns_mapping = config.user_ns_config.is_some()
            && (!config.use_gvisor || gvisor_bridge_needs_userns_mapping);
        let runtime_base_override =
            Self::prepare_runtime_base_override(&config, is_root, needs_external_userns_mapping)?;

        // Child notifies parent after namespaces are ready.
        let (ready_read, ready_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
        })?;
        // Two extra pipes for the mapping handshake: the child requests the
        // mappings on one, the parent acknowledges on the other.
        let userns_sync = if needs_external_userns_mapping {
            let (request_read, request_write) = pipe().map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create user namespace request pipe: {}",
                    e
                ))
            })?;
            let (ack_read, ack_write) = pipe().map_err(|e| {
                NucleusError::ExecError(format!("Failed to create user namespace ack pipe: {}", e))
            })?;
            Some((request_read, request_write, ack_read, ack_write))
        } else {
            None
        };
        // Parent notifies child once cgroup/network setup is complete.
        let (parent_setup_read, parent_setup_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create parent setup sync pipe: {}", e))
        })?;

        // M11: fork() from a multi-threaded process risks deadlock in the
        // child on locks held by other threads (tracing, allocator), so we
        // refuse to fork unless /proc/self/status reports exactly one thread.
        // The Tokio runtime is not yet started at this point, so async thread
        // contention is not a concern.
        Self::assert_single_threaded_for_fork("container create fork")?;
        // SAFETY: fork() is called before any Tokio runtime is created.
        // Only the main thread should be active at this point.
        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                // Close the pipe ends that belong to the child.
                drop(ready_write);
                drop(parent_setup_read);
                let (userns_request_read, userns_ack_write) =
                    if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
                        drop(request_write);
                        drop(ack_read);
                        (Some(request_read), Some(ack_write))
                    } else {
                        (None, None)
                    };
                info!("Forked child process: {}", child);

                // Use a closure so that on any error we kill the forked child.
                // If the PID-namespace child has already reported readiness,
                // also kill that target PID; it may otherwise be reparented
                // away from the intermediate process.
                let mut target_pid_for_cleanup: Option<u32> = None;
                let parent_setup = || -> Result<CreatedContainer> {
                    if needs_external_userns_mapping {
                        let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
                            NucleusError::ExecError(
                                "Missing user namespace configuration in parent".to_string(),
                            )
                        })?;
                        let request_read = userns_request_read.as_ref().ok_or_else(|| {
                            NucleusError::ExecError(
                                "Missing user namespace request pipe in parent".to_string(),
                            )
                        })?;
                        let ack_write = userns_ack_write.as_ref().ok_or_else(|| {
                            NucleusError::ExecError(
                                "Missing user namespace ack pipe in parent".to_string(),
                            )
                        })?;

                        // Handshake: wait for the child's request, write the
                        // mappings for the child's PID, then acknowledge.
                        Self::wait_for_sync_byte(
                            request_read,
                            &format!(
                                "Child {} exited before requesting user namespace mappings",
                                child
                            ),
                            "Failed waiting for child user namespace request",
                        )?;
                        UserNamespaceMapper::new(user_config.clone())
                            .write_mappings_for_pid(child.as_raw() as u32)?;
                        Self::send_sync_byte(
                            ack_write,
                            "Failed to notify child that user namespace mappings are ready",
                        )?;
                    }

                    // PID reported by the child; recorded so the error handler
                    // below can kill it in addition to `child`.
                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
                    target_pid_for_cleanup = Some(target_pid);

                    let cgroup_path = cgroup_opt
                        .as_ref()
                        .map(|cgroup| cgroup.path().display().to_string());
                    // quota / period scaled by 1000, i.e. CPU share in millicores.
                    // NOTE(review): if these are integers, a zero
                    // `cpu_period_us` would panic here — confirm it is
                    // validated upstream.
                    let cpu_millicores = config
                        .limits
                        .cpu_quota_us
                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
                    let mut state = ContainerState::new(ContainerStateParams {
                        id: config.id.clone(),
                        name: config.name.clone(),
                        pid: target_pid,
                        command: config.command.clone(),
                        memory_limit: config.limits.memory_bytes,
                        cpu_limit: cpu_millicores,
                        using_gvisor: config.use_gvisor,
                        rootless: config.user_ns_config.is_some(),
                        cgroup_path,
                        process_uid: config.process_identity.uid,
                        process_gid: config.process_identity.gid,
                        additional_gids: config.process_identity.additional_gids.clone(),
                    });
                    state.config_hash = config.config_hash;
                    // The rootfs path (if any) is what gets recorded as the bundle path.
                    state.bundle_path =
                        config.rootfs_path.as_ref().map(|p| p.display().to_string());

                    let mut network_driver: Option<BridgeDriver> = None;
                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;

                    // Transition: Creating -> Created
                    state.status = OciStatus::Created;
                    state_mgr.save_state(&state)?;

                    // Write PID file (OCI --pid-file)
                    if let Some(ref pid_path) = config.pid_file {
                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                            NucleusError::ConfigError(format!(
                                "Failed to write pid-file '{}': {}",
                                pid_path.display(),
                                e
                            ))
                        })?;
                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
                    }

                    // Attach before releasing the child (the release byte is sent
                    // further down).
                    if let Some(ref mut cgroup) = cgroup_opt {
                        cgroup.attach_process(target_pid)?;
                    }

                    let deny_logger = Self::maybe_start_seccomp_deny_logger(
                        &config,
                        target_pid,
                        cgroup_opt.as_ref().map(|cgroup| cgroup.path()),
                    )?;

                    // Bridge networking: failures (setup or egress policy) are
                    // fatal only in production mode; otherwise they are logged
                    // and the container continues.
                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
                        match BridgeDriver::setup_with_id(
                            target_pid,
                            bridge_config,
                            &config.id,
                            is_root,
                            config.user_ns_config.is_some(),
                        ) {
                            Ok(net) => {
                                if let Some(ref egress) = config.egress_policy {
                                    if let Err(e) = net.apply_egress_policy(
                                        target_pid,
                                        egress,
                                        config.user_ns_config.is_some(),
                                    ) {
                                        if config.service_mode == ServiceMode::Production {
                                            return Err(NucleusError::NetworkError(format!(
                                                "Failed to apply egress policy: {}",
                                                e
                                            )));
                                        }
                                        warn!("Failed to apply egress policy: {}", e);
                                    }
                                }
                                network_driver = Some(net);
                            }
                            Err(e) => {
                                if config.service_mode == ServiceMode::Production {
                                    return Err(e);
                                }
                                warn!("Failed to set up bridge networking: {}", e);
                            }
                        }
                    }

                    // Last step: tell the child that parent-side setup is done.
                    Self::send_sync_byte(
                        &parent_setup_write,
                        "Failed to notify child that parent setup is complete",
                    )?;

                    info!(
                        "Container {} created (child pid {}), waiting for start",
                        config.id, target_pid
                    );

                    Ok(CreatedContainer {
                        config,
                        state_mgr,
                        state,
                        child,
                        cgroup_opt,
                        network_driver,
                        trace_reader,
                        deny_logger,
                        exec_fifo_path: exec_fifo,
                        _lifecycle_span: lifecycle_span.clone(),
                    })
                };

                // On failure: kill the reported target PID (if any), kill and
                // reap the forked child, then propagate the original error.
                // Kill/wait results are ignored (best effort).
                parent_setup().map_err(|e| {
                    if let Some(target_pid) = target_pid_for_cleanup {
                        let _ = kill(Pid::from_raw(target_pid as i32), Signal::SIGKILL);
                    }
                    let _ = kill(child, Signal::SIGKILL);
                    let _ = waitpid(child, None);
                    e
                })
            }
            ForkResult::Child => {
                // Close the pipe ends that belong to the parent.
                drop(ready_read);
                drop(parent_setup_write);
                let (userns_request_write, userns_ack_read) =
                    if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
                        drop(request_read);
                        drop(ack_write);
                        (Some(request_write), Some(ack_read))
                    } else {
                        (None, None)
                    };
                // H6: Close inherited FDs > 2 to prevent leaking host sockets/pipes
                Self::sanitize_fds();
                // Fresh `Container` carrying the adjusted config and the
                // runsc path resolved before the fork.
                let temp_container = Container { config, runsc_path };
                match temp_container.setup_and_exec(
                    Some(ready_write),
                    userns_request_write,
                    userns_ack_read,
                    Some(parent_setup_read),
                    exec_fifo,
                    runtime_base_override,
                ) {
                    // `setup_and_exec` is not expected to return on success
                    // (presumably it replaces the process image).
                    Ok(_) => unreachable!(),
                    Err(e) => {
                        error!("Container setup failed: {}", e);
                        std::process::exit(1);
                    }
                }
            }
        }
    }
681
682    /// Trigger a previously-created container to start by opening its exec FIFO.
683    /// Used by the CLI `start` command.
684    pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
685        let state_mgr = ContainerStateManager::new_with_root(state_root)?;
686        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
687        if !fifo_path.exists() {
688            return Err(NucleusError::ConfigError(format!(
689                "No exec FIFO found for container {}; is it in 'created' state?",
690                container_id
691            )));
692        }
693
694        // Opening the FIFO for reading unblocks the child's open-for-write.
695        let file = std::fs::File::open(&fifo_path)
696            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
697        let mut buf = [0u8; 1];
698        std::io::Read::read(&mut &file, &mut buf)
699            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
700        drop(file);
701
702        let _ = std::fs::remove_file(&fifo_path);
703
704        // Update state to Running
705        let mut state = state_mgr.resolve_container(container_id)?;
706        state.status = OciStatus::Running;
707        state_mgr.save_state(&state)?;
708
709        Ok(())
710    }
711
712    /// Set up container environment and exec target process
713    ///
714    /// This runs in the child process after fork.
715    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
716    fn setup_and_exec(
717        &self,
718        ready_pipe: Option<OwnedFd>,
719        userns_request_pipe: Option<OwnedFd>,
720        userns_ack_pipe: Option<OwnedFd>,
721        parent_setup_pipe: Option<OwnedFd>,
722        exec_fifo: Option<PathBuf>,
723        runtime_base_override: Option<PathBuf>,
724    ) -> Result<()> {
725        let is_rootless = self.config.user_ns_config.is_some();
726        let allow_degraded_security = Self::allow_degraded_security(&self.config);
727        let context_manifest = if self.config.verify_context_integrity {
728            self.config
729                .context_dir
730                .as_ref()
731                .map(|dir| snapshot_context_dir(dir))
732                .transpose()?
733        } else {
734            None
735        };
736
737        // Initialize state machines
738        let mut fs_state = FilesystemState::Unmounted;
739        let mut sec_state = SecurityState::Privileged;
740
741        // gVisor creates the container namespaces. Bridge mode is the exception:
742        // Nucleus must hand slirp/port-forward setup a concrete target netns,
743        // then runsc inherits that netns via --network host.
744        if self.config.use_gvisor {
745            let gvisor_bridge_precreated_userns =
746                if matches!(self.config.network, NetworkMode::Bridge(_)) {
747                    self.prepare_gvisor_bridge_namespace(
748                        userns_request_pipe.as_ref(),
749                        userns_ack_pipe.as_ref(),
750                    )?
751                } else {
752                    false
753                };
754
755            if let Some(fd) = ready_pipe {
756                Self::notify_namespace_ready(&fd, std::process::id())?;
757            }
758            if let Some(fd) = parent_setup_pipe.as_ref() {
759                Self::wait_for_sync_byte(
760                    fd,
761                    "Parent closed setup pipe before signalling gVisor child",
762                    "Failed waiting for parent setup acknowledgement",
763                )?;
764            }
765            return self.setup_and_exec_gvisor(gvisor_bridge_precreated_userns);
766        }
767
768        // 1. Create namespaces in child and optionally configure user mapping.
769        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
770        namespace_mgr.unshare_namespaces()?;
771        if self.config.user_ns_config.is_some() {
772            let request_fd = userns_request_pipe.as_ref().ok_or_else(|| {
773                NucleusError::ExecError(
774                    "Missing user namespace request pipe in container child".to_string(),
775                )
776            })?;
777            let ack_fd = userns_ack_pipe.as_ref().ok_or_else(|| {
778                NucleusError::ExecError(
779                    "Missing user namespace acknowledgement pipe in container child".to_string(),
780                )
781            })?;
782
783            Self::send_sync_byte(
784                request_fd,
785                "Failed to request user namespace mappings from parent",
786            )?;
787            Self::wait_for_sync_byte(
788                ack_fd,
789                "Parent closed user namespace ack pipe before mappings were written",
790                "Failed waiting for parent to finish user namespace mappings",
791            )?;
792            Self::become_userns_root_for_setup()?;
793        }
794
795        // CLONE_NEWPID only applies to children created after unshare().
796        // Create a child that will become PID 1 in the new namespace and exec the workload.
797        if self.config.namespaces.pid {
798            Self::assert_single_threaded_for_fork("PID namespace init fork")?;
799            match unsafe { fork() }? {
800                ForkResult::Parent { child } => {
801                    if let Some(fd) = ready_pipe {
802                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
803                    }
804                    std::process::exit(Self::wait_for_pid_namespace_child(child));
805                }
806                ForkResult::Child => {
807                    if let Some(fd) = parent_setup_pipe.as_ref() {
808                        Self::wait_for_sync_byte(
809                            fd,
810                            "Parent closed setup pipe before signalling PID 1 child",
811                            "Failed waiting for parent setup acknowledgement",
812                        )?;
813                    }
814                    // Continue container setup as PID 1 in the new namespace.
815                }
816            }
817        } else {
818            if let Some(fd) = ready_pipe {
819                Self::notify_namespace_ready(&fd, std::process::id())?;
820            }
821            if let Some(fd) = parent_setup_pipe.as_ref() {
822                Self::wait_for_sync_byte(
823                    fd,
824                    "Parent closed setup pipe before signalling container child",
825                    "Failed waiting for parent setup acknowledgement",
826                )?;
827            }
828        }
829
830        // Namespace: Unshared -> Entered (process is now inside all namespaces)
831        namespace_mgr.enter()?;
832
833        // 2. Ensure no_new_privs BEFORE any mount operations.
834        // This prevents exploitation of setuid binaries on bind-mounted paths
835        // even if a subsequent MS_NOSUID remount fails.
836        self.enforce_no_new_privs()?;
837        audit(
838            &self.config.id,
839            &self.config.name,
840            AuditEventType::NoNewPrivsSet,
841            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
842        );
843
844        // 3. Set hostname if UTS namespace is enabled
845        if let Some(hostname) = &self.config.hostname {
846            namespace_mgr.set_hostname(hostname)?;
847        }
848
849        // 4. Mount tmpfs as container root
850        // Filesystem: Unmounted -> Mounted
851        // Use a private runtime directory instead of /tmp to avoid symlink
852        // attacks and information disclosure on multi-user systems.
853        let runtime_base = if let Some(path) = runtime_base_override {
854            path
855        } else if nix::unistd::Uid::effective().is_root() {
856            PathBuf::from("/run/nucleus")
857        } else {
858            dirs::runtime_dir()
859                .map(|d| d.join("nucleus"))
860                .unwrap_or_else(std::env::temp_dir)
861        };
862        let _ = std::fs::create_dir_all(&runtime_base);
863        let runtime_dir = Builder::new()
864            .prefix("nucleus-runtime-")
865            .tempdir_in(&runtime_base)
866            .map_err(|e| {
867                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
868            })?;
869        let container_root = runtime_dir.path().to_path_buf();
870        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
871        tmpfs.mount()?;
872        fs_state = fs_state.transition(FilesystemState::Mounted)?;
873
874        // 4. Create minimal filesystem structure
875        create_minimal_fs(&container_root)?;
876
877        // 5. Create device nodes and standard tmpfs mounts under /dev
878        let dev_path = container_root.join("dev");
879        create_dev_nodes(&dev_path, false)?;
880
881        // /dev/shm – POSIX shared memory (shm_open). Required by PostgreSQL,
882        // Redis, and other programs that use POSIX shared memory segments.
883        let shm_path = dev_path.join("shm");
884        std::fs::create_dir_all(&shm_path).map_err(|e| {
885            NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
886        })?;
887        nix::mount::mount(
888            Some("shm"),
889            &shm_path,
890            Some("tmpfs"),
891            nix::mount::MsFlags::MS_NOSUID
892                | nix::mount::MsFlags::MS_NODEV
893                | nix::mount::MsFlags::MS_NOEXEC,
894            Some("mode=1777,size=64m"),
895        )
896        .map_err(|e| {
897            NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
898        })?;
899        debug!("Mounted tmpfs on /dev/shm");
900
901        // 6. Populate context if provided
902        // Filesystem: Mounted -> Populated
903        if let Some(context_dir) = &self.config.context_dir {
904            let context_dest = container_root.join("context");
905            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
906            if let Some(expected) = &context_manifest {
907                verify_context_manifest(expected, &context_dest)?;
908            }
909        }
910        fs_state = fs_state.transition(FilesystemState::Populated)?;
911
912        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
913        if let Some(ref rootfs_path) = self.config.rootfs_path {
914            let rootfs_path = if self.config.service_mode == ServiceMode::Production {
915                validate_production_rootfs_path(rootfs_path)?
916            } else {
917                rootfs_path.clone()
918            };
919            if self.config.verify_rootfs_attestation {
920                verify_rootfs_attestation(&rootfs_path)?;
921            }
922            bind_mount_rootfs(&container_root, &rootfs_path)?;
923        } else {
924            bind_mount_host_paths(&container_root, is_rootless)?;
925        }
926
927        // 7b. Mount persistent or ephemeral volumes over the base filesystem.
928        mount_volumes(&container_root, &self.config.volumes)?;
929
930        // 7c. Write resolv.conf for bridge networking.
931        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
932        // resolv.conf over the top (same technique as secrets).
933        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
934            let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
935                == NatBackend::Userspace
936                && bridge_config.dns.is_empty()
937            {
938                vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
939            } else {
940                bridge_config.dns.clone()
941            };
942            if self.config.rootfs_path.is_some() {
943                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
944            } else {
945                BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
946            }
947        }
948
949        // 7d. Mount secrets on an in-memory tmpfs in all modes.
950        mount_secrets_inmemory(
951            &container_root,
952            &self.config.secrets,
953            &self.config.process_identity,
954        )?;
955
956        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
957        let proc_path = container_root.join("proc");
958        let production_mode = self.config.service_mode == ServiceMode::Production;
959        let hide_pids = production_mode;
960        let procfs_best_effort = is_rootless && !production_mode;
961        mount_procfs(
962            &proc_path,
963            procfs_best_effort,
964            self.config.proc_readonly,
965            hide_pids,
966        )?;
967
968        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
969        // SEC-06: In production mode, failures to mask critical paths are fatal.
970        mask_proc_paths(
971            &proc_path,
972            self.config.service_mode == ServiceMode::Production,
973        )?;
974
975        // 9c. Run createRuntime hooks (after namespaces created, before pivot_root)
976        if let Some(ref hooks) = self.config.hooks {
977            if !hooks.create_runtime.is_empty() {
978                let hook_state = OciContainerState {
979                    oci_version: "1.0.2".to_string(),
980                    id: self.config.id.clone(),
981                    status: OciStatus::Creating,
982                    pid: std::process::id(),
983                    bundle: String::new(),
984                };
985                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
986            }
987        }
988
989        // 10. Switch root filesystem
990        // Filesystem: Populated -> Pivoted
991        switch_root(&container_root, self.config.allow_chroot_fallback)?;
992        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
993        debug!("Filesystem state: {:?}", fs_state);
994
995        // 10b. Audit mount flags to verify filesystem hardening invariants
996        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
997        audit(
998            &self.config.id,
999            &self.config.name,
1000            AuditEventType::MountAuditPassed,
1001            "all mount flags verified",
1002        );
1003
1004        // 10c. Run createContainer hooks (after pivot_root, before start)
1005        if let Some(ref hooks) = self.config.hooks {
1006            if !hooks.create_container.is_empty() {
1007                let hook_state = OciContainerState {
1008                    oci_version: "1.0.2".to_string(),
1009                    id: self.config.id.clone(),
1010                    status: OciStatus::Created,
1011                    pid: std::process::id(),
1012                    bundle: String::new(),
1013                };
1014                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
1015            }
1016        }
1017
1018        // 11. Drop capabilities and switch identity (Docker/runc convention).
1019        //
1020        // The identity switch (setuid/setgid) must happen between two cap phases:
1021        //   Phase 1: drop bounding set (needs CAP_SETPCAP), clear ambient/inheritable
1022        //   Identity: setgroups/setgid/setuid (needs CAP_SETUID/CAP_SETGID)
1023        //   Phase 2: clear permitted/effective (or kernel auto-clears on setuid)
1024        //
1025        // Custom cap policies (drop_except / apply_sets) do their own full drop,
1026        // so the two-phase approach only applies to the default drop-all path.
1027        let mut cap_mgr = CapabilityManager::new();
1028        if let Some(ref policy_path) = self.config.caps_policy {
1029            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
1030                policy_path,
1031                self.config.caps_policy_sha256.as_deref(),
1032            )?;
1033            // H3: Reject dangerous capabilities in production mode
1034            if self.config.service_mode == ServiceMode::Production {
1035                policy.validate_production()?;
1036            }
1037            policy.apply(&mut cap_mgr)?;
1038            // Identity switch after custom policy (caps may already be restricted)
1039            Self::apply_process_identity_to_current_process(
1040                &self.config.process_identity,
1041                self.config.user_ns_config.is_some(),
1042            )?;
1043            audit(
1044                &self.config.id,
1045                &self.config.name,
1046                AuditEventType::CapabilitiesDropped,
1047                format!("capability policy applied from {:?}", policy_path),
1048            );
1049        } else {
1050            // Phase 1: drop bounding set while CAP_SETPCAP is still effective
1051            cap_mgr.drop_bounding_set()?;
1052
1053            // Identity switch: setgroups/setgid/setuid while CAP_SETUID/CAP_SETGID
1054            // are still in the effective set. For non-root target UIDs, the kernel
1055            // auto-clears permitted/effective after setuid().
1056            Self::apply_process_identity_to_current_process(
1057                &self.config.process_identity,
1058                self.config.user_ns_config.is_some(),
1059            )?;
1060
1061            // Phase 2: explicitly clear any remaining caps (handles root-stays-root
1062            // case where kernel doesn't auto-clear).
1063            cap_mgr.finalize_drop()?;
1064
1065            audit(
1066                &self.config.id,
1067                &self.config.name,
1068                AuditEventType::CapabilitiesDropped,
1069                "all capabilities dropped including bounding set",
1070            );
1071        }
1072        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
1073
1074        // 12b. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
1075        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
1076        // SEC-05: In production mode, RLIMIT failures are fatal – a container
1077        // without resource limits is a privilege escalation vector.
1078        {
1079            let is_production = self.config.service_mode == ServiceMode::Production;
1080
1081            if let Some(nproc_limit) = self.config.limits.pids_max {
1082                let rlim_nproc = libc::rlimit {
1083                    rlim_cur: nproc_limit,
1084                    rlim_max: nproc_limit,
1085                };
1086                // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
1087                if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
1088                    let err = std::io::Error::last_os_error();
1089                    if is_production {
1090                        return Err(NucleusError::SeccompError(format!(
1091                            "Failed to set RLIMIT_NPROC to {} in production mode: {}",
1092                            nproc_limit, err
1093                        )));
1094                    }
1095                    warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
1096                }
1097            }
1098
1099            let rlim_nofile = libc::rlimit {
1100                rlim_cur: 1024,
1101                rlim_max: 1024,
1102            };
1103            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
1104            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
1105                let err = std::io::Error::last_os_error();
1106                if is_production {
1107                    return Err(NucleusError::SeccompError(format!(
1108                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
1109                        err
1110                    )));
1111                }
1112                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
1113            }
1114
1115            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
1116            // memory via mlock(). Default 64KB matches unprivileged default, but
1117            // in a user namespace the container appears as UID 0 and may have a
1118            // higher inherited limit. Configurable via --memlock for io_uring etc.
1119            let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
1120            let rlim_memlock = libc::rlimit {
1121                rlim_cur: memlock_limit,
1122                rlim_max: memlock_limit,
1123            };
1124            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
1125            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
1126                let err = std::io::Error::last_os_error();
1127                if is_production {
1128                    return Err(NucleusError::SeccompError(format!(
1129                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
1130                        memlock_limit, err
1131                    )));
1132                }
1133                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
1134            }
1135        }
1136
1137        // 12c. Verify that namespace-creating capabilities are truly gone before
1138        // installing seccomp. Seccomp denies unfilterable clone3 and filters
1139        // clone namespace flags; capability dropping remains an independent guard.
1140        CapabilityManager::verify_no_namespace_caps(
1141            self.config.service_mode == ServiceMode::Production,
1142        )?;
1143
1144        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
1145        // Security: CapabilitiesDropped -> SeccompApplied
1146        use crate::container::config::SeccompMode;
1147        let mut seccomp_mgr = SeccompManager::new();
1148        let allow_network = !matches!(self.config.network, NetworkMode::None);
1149        let seccomp_applied = match self.config.seccomp_mode {
1150            SeccompMode::Trace => {
1151                audit(
1152                    &self.config.id,
1153                    &self.config.name,
1154                    AuditEventType::SeccompApplied,
1155                    "seccomp trace mode: allow-all + LOG",
1156                );
1157                seccomp_mgr.apply_trace_filter()?
1158            }
1159            SeccompMode::Enforce => {
1160                if let Some(ref profile_path) = self.config.seccomp_profile {
1161                    audit(
1162                        &self.config.id,
1163                        &self.config.name,
1164                        AuditEventType::SeccompProfileLoaded,
1165                        format!("path={:?}", profile_path),
1166                    );
1167                    seccomp_mgr.apply_profile_from_file(
1168                        profile_path,
1169                        self.config.seccomp_profile_sha256.as_deref(),
1170                        self.config.seccomp_log_denied,
1171                    )?
1172                } else {
1173                    seccomp_mgr.apply_filter_for_network_mode(
1174                        allow_network,
1175                        allow_degraded_security,
1176                        self.config.seccomp_log_denied,
1177                        &self.config.seccomp_allow_syscalls,
1178                    )?
1179                }
1180            }
1181        };
1182        if seccomp_applied {
1183            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
1184            audit(
1185                &self.config.id,
1186                &self.config.name,
1187                AuditEventType::SeccompApplied,
1188                format!("network={}", allow_network),
1189            );
1190        } else if !allow_degraded_security {
1191            return Err(NucleusError::SeccompError(
1192                "Seccomp filter is required but was not enforced".to_string(),
1193            ));
1194        } else {
1195            warn!("Seccomp not enforced; container is running with degraded hardening");
1196        }
1197
1198        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
1199        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
1200            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
1201                policy_path,
1202                self.config.landlock_policy_sha256.as_deref(),
1203            )?;
1204            // H4: Reject write+execute on same path in production
1205            if self.config.service_mode == ServiceMode::Production {
1206                policy.validate_production()?;
1207            }
1208            policy.apply(allow_degraded_security)?
1209        } else {
1210            let mut landlock_mgr = LandlockManager::new();
1211            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
1212            // Register volume mount destinations so Landlock permits access to them
1213            for vol in &self.config.volumes {
1214                landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
1215            }
1216            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
1217        };
1218        if seccomp_applied && landlock_applied {
1219            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
1220            if self.config.seccomp_mode == SeccompMode::Trace {
1221                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
1222            } else {
1223                sec_state = sec_state.transition(SecurityState::Locked)?;
1224            }
1225            audit(
1226                &self.config.id,
1227                &self.config.name,
1228                AuditEventType::LandlockApplied,
1229                if self.config.seccomp_mode == SeccompMode::Trace {
1230                    "landlock applied, but seccomp in trace mode – not locked".to_string()
1231                } else {
1232                    "security state locked: all hardening layers active".to_string()
1233                },
1234            );
1235        } else if !allow_degraded_security {
1236            return Err(NucleusError::LandlockError(
1237                "Landlock policy is required but was not enforced".to_string(),
1238            ));
1239        } else {
1240            warn!("Security state not locked; one or more hardening controls are inactive");
1241        }
1242        debug!("Security state: {:?}", sec_state);
1243
1244        // 14c. Block on exec FIFO until start() opens it for reading.
1245        // This implements the OCI two-phase create/start: all container setup
1246        // is complete, but the user process doesn't exec until explicitly started.
1247        if let Some(ref fifo_path) = exec_fifo {
1248            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
1249            let file = std::fs::OpenOptions::new()
1250                .write(true)
1251                .open(fifo_path)
1252                .map_err(|e| {
1253                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
1254                })?;
1255            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
1256                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
1257            })?;
1258            drop(file);
1259            debug!("Exec FIFO released, proceeding to exec");
1260        }
1261
1262        // 14d. Run startContainer hooks (after start signal, before user process exec)
1263        if let Some(ref hooks) = self.config.hooks {
1264            if !hooks.start_container.is_empty() {
1265                let hook_state = OciContainerState {
1266                    oci_version: "1.0.2".to_string(),
1267                    id: self.config.id.clone(),
1268                    status: OciStatus::Running,
1269                    pid: std::process::id(),
1270                    bundle: String::new(),
1271                };
1272                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
1273            }
1274        }
1275
1276        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
1277        // that reaps zombies and forwards signals, rather than exec-ing directly.
1278        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
1279            return self.run_as_init();
1280        }
1281
1282        // 15b. Agent mode: exec target process directly
1283        self.exec_command()?;
1284
1285        // Should never reach here
1286        Ok(())
1287    }
1288
1289    /// Forward selected signals to child process using sigwait (no async signal handlers).
1290    ///
1291    /// Returns a stop flag and join handle. Set the flag to `true` and join
1292    /// the handle to cleanly shut down the forwarding thread.
1293    pub(super) fn setup_signal_forwarding_static(
1294        child: Pid,
1295    ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
1296        let mut set = SigSet::empty();
1297        for signal in [
1298            Signal::SIGTERM,
1299            Signal::SIGINT,
1300            Signal::SIGHUP,
1301            Signal::SIGQUIT,
1302            Signal::SIGUSR1,
1303            Signal::SIGUSR2,
1304        ] {
1305            set.add(signal);
1306        }
1307
1308        let unblock_set = set;
1309        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
1310            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
1311        })?;
1312
1313        let stop = Arc::new(AtomicBool::new(false));
1314        let stop_clone = stop.clone();
1315        let handle = std::thread::Builder::new()
1316            .name("sig-forward".to_string())
1317            .spawn(move || {
1318                // The thread owns unblock_set and uses it for sigwait.
1319                loop {
1320                    if let Ok(signal) = unblock_set.wait() {
1321                        // Check the stop flag *after* waking so that the
1322                        // wake-up signal (SIGUSR1) is not forwarded to the
1323                        // child during shutdown.
1324                        if stop_clone.load(Ordering::Acquire) {
1325                            break;
1326                        }
1327                        let _ = kill(child, signal);
1328                    }
1329                }
1330            })
1331            .map_err(|e| {
1332                // Restore the signal mask so the caller isn't left with
1333                // signals permanently blocked.
1334                let mut restore = SigSet::empty();
1335                for signal in [
1336                    Signal::SIGTERM,
1337                    Signal::SIGINT,
1338                    Signal::SIGHUP,
1339                    Signal::SIGQUIT,
1340                    Signal::SIGUSR1,
1341                    Signal::SIGUSR2,
1342                ] {
1343                    restore.add(signal);
1344                }
1345                let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
1346                NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
1347            })?;
1348
1349        info!("Signal forwarding configured");
1350        Ok((stop, handle))
1351    }
1352
1353    /// Wait for child process to exit
1354    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
1355        loop {
1356            match waitpid(child, None) {
1357                Ok(WaitStatus::Exited(_, code)) => {
1358                    return Ok(code);
1359                }
1360                Ok(WaitStatus::Signaled(_, signal, _)) => {
1361                    info!("Child killed by signal: {:?}", signal);
1362                    return Ok(128 + signal as i32);
1363                }
1364                Err(nix::errno::Errno::EINTR) => {
1365                    continue;
1366                }
1367                Err(e) => {
1368                    return Err(NucleusError::ExecError(format!(
1369                        "Failed to wait for child: {}",
1370                        e
1371                    )));
1372                }
1373                _ => {
1374                    continue;
1375                }
1376            }
1377        }
1378    }
1379
1380    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1381        let mut pid_buf = [0u8; 4];
1382        loop {
1383            match read(ready_read, &mut pid_buf) {
1384                Err(nix::errno::Errno::EINTR) => continue,
1385                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1386                Ok(0) => {
1387                    return Err(NucleusError::ExecError(format!(
1388                        "Child {} exited before namespace initialization",
1389                        child
1390                    )))
1391                }
1392                Ok(_) => {
1393                    return Err(NucleusError::ExecError(
1394                        "Invalid namespace sync payload from child".to_string(),
1395                    ))
1396                }
1397                Err(e) => {
1398                    return Err(NucleusError::ExecError(format!(
1399                        "Failed waiting for child namespace setup: {}",
1400                        e
1401                    )))
1402                }
1403            }
1404        }
1405    }
1406
1407    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1408        let payload = pid.to_ne_bytes();
1409        let mut written = 0;
1410        while written < payload.len() {
1411            let n = write(fd, &payload[written..]).map_err(|e| {
1412                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1413            })?;
1414            if n == 0 {
1415                return Err(NucleusError::ExecError(
1416                    "Failed to notify namespace readiness: short write".to_string(),
1417                ));
1418            }
1419            written += n;
1420        }
1421        Ok(())
1422    }
1423
1424    fn send_sync_byte(fd: &OwnedFd, error_context: &str) -> Result<()> {
1425        let mut written = 0;
1426        let payload = [1u8];
1427        while written < payload.len() {
1428            let n = write(fd, &payload[written..])
1429                .map_err(|e| NucleusError::ExecError(format!("{}: {}", error_context, e)))?;
1430            if n == 0 {
1431                return Err(NucleusError::ExecError(format!(
1432                    "{}: short write",
1433                    error_context
1434                )));
1435            }
1436            written += n;
1437        }
1438        Ok(())
1439    }
1440
1441    fn wait_for_sync_byte(fd: &OwnedFd, eof_context: &str, error_context: &str) -> Result<()> {
1442        let mut payload = [0u8; 1];
1443        loop {
1444            match read(fd, &mut payload) {
1445                Err(nix::errno::Errno::EINTR) => continue,
1446                Ok(1) => return Ok(()),
1447                Ok(0) => return Err(NucleusError::ExecError(eof_context.to_string())),
1448                Ok(_) => {
1449                    return Err(NucleusError::ExecError(format!(
1450                        "{}: invalid sync payload",
1451                        error_context
1452                    )))
1453                }
1454                Err(e) => return Err(NucleusError::ExecError(format!("{}: {}", error_context, e))),
1455            }
1456        }
1457    }
1458
1459    fn become_userns_root_for_setup() -> Result<()> {
1460        setresgid(Gid::from_raw(0), Gid::from_raw(0), Gid::from_raw(0)).map_err(|e| {
1461            NucleusError::NamespaceError(format!(
1462                "Failed to become gid 0 inside mapped user namespace: {}",
1463                e
1464            ))
1465        })?;
1466        setresuid(Uid::from_raw(0), Uid::from_raw(0), Uid::from_raw(0)).map_err(|e| {
1467            NucleusError::NamespaceError(format!(
1468                "Failed to become uid 0 inside mapped user namespace: {}",
1469                e
1470            ))
1471        })?;
1472        debug!("Switched setup process to uid/gid 0 inside mapped user namespace");
1473        Ok(())
1474    }
1475
1476    fn prepare_gvisor_bridge_namespace(
1477        &self,
1478        userns_request_pipe: Option<&OwnedFd>,
1479        userns_ack_pipe: Option<&OwnedFd>,
1480    ) -> Result<bool> {
1481        let mut precreated_userns = false;
1482        if self.config.user_ns_config.is_some() && !Uid::effective().is_root() {
1483            nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWUSER).map_err(|e| {
1484                NucleusError::NamespaceError(format!(
1485                    "Failed to unshare gVisor bridge user namespace: {}",
1486                    e
1487                ))
1488            })?;
1489
1490            let request_fd = userns_request_pipe.ok_or_else(|| {
1491                NucleusError::ExecError(
1492                    "Missing user namespace request pipe in gVisor bridge child".to_string(),
1493                )
1494            })?;
1495            let ack_fd = userns_ack_pipe.ok_or_else(|| {
1496                NucleusError::ExecError(
1497                    "Missing user namespace acknowledgement pipe in gVisor bridge child"
1498                        .to_string(),
1499                )
1500            })?;
1501
1502            Self::send_sync_byte(
1503                request_fd,
1504                "Failed to request gVisor bridge user namespace mappings from parent",
1505            )?;
1506            Self::wait_for_sync_byte(
1507                ack_fd,
1508                "Parent closed user namespace ack pipe before gVisor bridge mappings were written",
1509                "Failed waiting for parent to finish gVisor bridge user namespace mappings",
1510            )?;
1511            Self::become_userns_root_for_setup()?;
1512            precreated_userns = true;
1513        }
1514
1515        nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNET).map_err(|e| {
1516            NucleusError::NamespaceError(format!(
1517                "Failed to unshare gVisor bridge network namespace: {}",
1518                e
1519            ))
1520        })?;
1521        Ok(precreated_userns)
1522    }
1523
1524    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1525        loop {
1526            match waitpid(child, None) {
1527                Ok(WaitStatus::Exited(_, code)) => return code,
1528                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1529                Err(nix::errno::Errno::EINTR) => continue,
1530                Err(_) => return 1,
1531                _ => continue,
1532            }
1533        }
1534    }
1535}
1536
impl CreatedContainer {
    /// Start phase: release the child via the exec FIFO, transition to Running,
    /// then wait for the child to exit with full lifecycle management.
    ///
    /// Consumes the container. Once the wait on the child has finished
    /// (successfully or not), teardown runs in this order: health-check and
    /// signal threads are stopped, poststop hooks run best-effort, networking
    /// is cleaned up, the child is killed and reaped if the wait failed, the
    /// trace reader and deny logger are stopped, the cgroup is cleaned up,
    /// gVisor artifacts are removed when gVisor is in use, the persisted state
    /// is deleted, and an audit event is emitted.
    ///
    /// # Returns
    /// The exit code produced by `Container::wait_for_child_static`.
    ///
    /// # Errors
    /// Returns early, before the teardown sequence, if the exec FIFO
    /// handshake, the state save, signal-forwarding setup, the readiness probe
    /// or a poststart hook fails. Otherwise returns the error from waiting on
    /// the child, after teardown has run.
    pub fn start(mut self) -> Result<i32> {
        let config = &self.config;
        let _enter = self._lifecycle_span.enter();

        // Open the exec FIFO for reading – this unblocks the child's
        // blocking open-for-write, allowing it to proceed to exec.
        if let Some(exec_fifo_path) = &self.exec_fifo_path {
            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
            })?;
            let mut buf = [0u8; 1];
            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
            })?;
            // Exactly one byte is expected; a zero-length read means the
            // writer closed the FIFO without sending it.
            if read != 1 {
                return Err(NucleusError::ExecError(
                    "Exec FIFO closed before start signal was delivered".to_string(),
                ));
            }
            // Best-effort: the FIFO is only removed after a successful handshake.
            let _ = std::fs::remove_file(exec_fifo_path);
        }

        // Transition: Created -> Running
        self.state.status = OciStatus::Running;
        self.state_mgr.save_state(&self.state)?;

        // `target_pid` is the pid recorded in the persisted state; `child` is
        // the process this parent waits on. Signals, probes and health checks
        // use the former, waiting and the fallback kill use the latter.
        let target_pid = self.state.pid;
        let child = self.child;

        let (sig_stop, sig_handle) =
            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;

        // Guard ensures signal thread is stopped on any exit path (including early ? returns).
        let mut sig_guard = SignalThreadGuard {
            stop: Some(sig_stop),
            handle: Some(sig_handle),
        };

        // Run readiness probe before declaring service ready
        if let Some(ref probe) = config.readiness_probe {
            // The notify socket is only looked up when sd_notify is enabled.
            let notify_socket = if config.sd_notify {
                std::env::var("NOTIFY_SOCKET").ok()
            } else {
                None
            };
            Container::run_readiness_probe(
                target_pid,
                &config.name,
                probe,
                config.user_ns_config.is_some(),
                config.use_gvisor,
                &config.process_identity,
                notify_socket.as_deref(),
            )?;
        }

        // Start health check thread if configured
        let cancel_flag = Arc::new(AtomicBool::new(false));
        let health_handle = if let Some(ref hc) = config.health_check {
            // A health check with an empty command is treated as "not configured".
            if !hc.command.is_empty() {
                // Everything the thread needs is cloned or copied so the
                // closure owns its data.
                let hc = hc.clone();
                let pid = target_pid;
                let container_name = config.name.clone();
                let rootless = config.user_ns_config.is_some();
                let using_gvisor = config.use_gvisor;
                let process_identity = config.process_identity.clone();
                let cancel = cancel_flag.clone();
                Some(std::thread::spawn(move || {
                    Container::health_check_loop(
                        pid,
                        &container_name,
                        rootless,
                        using_gvisor,
                        &hc,
                        &process_identity,
                        &cancel,
                    );
                }))
            } else {
                None
            }
        } else {
            None
        };

        // Guard ensures health check thread is cancelled on any exit path.
        let mut health_guard = HealthThreadGuard {
            cancel: Some(cancel_flag),
            handle: health_handle,
        };

        // Run poststart hooks (after user process started, in parent)
        // NOTE(review): a failing poststart hook returns from this function
        // here, before the teardown sequence below has run. The OCI runtime
        // spec describes poststart failures as warn-and-continue; confirm the
        // stricter behaviour is intended and that the child, cgroup and
        // network are reclaimed elsewhere on this path.
        if let Some(ref hooks) = config.hooks {
            if !hooks.poststart.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Running,
                    pid: target_pid,
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
            }
        }

        // `child_waited` records whether the wait completed, so the teardown
        // below knows whether the child still has to be killed and reaped.
        let mut child_waited = false;
        let run_result: Result<i32> = (|| {
            let exit_code = Container::wait_for_child_static(child)?;

            // Transition: Running -> Stopped
            self.state.status = OciStatus::Stopped;
            // Best-effort: the state is deleted further down anyway.
            let _ = self.state_mgr.save_state(&self.state);

            child_waited = true;
            Ok(exit_code)
        })();

        // Explicitly stop threads (guards would do this on drop too, but
        // explicit teardown keeps ordering visible).
        health_guard.stop();
        sig_guard.stop();

        // Run poststop hooks (best-effort)
        if let Some(ref hooks) = config.hooks {
            if !hooks.poststop.is_empty() {
                // The hook state reports pid 0 at this point.
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Stopped,
                    pid: 0,
                    bundle: String::new(),
                };
                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
            }
        }

        // From here on every cleanup failure is logged and otherwise ignored
        // so the remaining steps still run.
        if let Some(net) = self.network_driver.take() {
            if let Err(e) = net.cleanup() {
                warn!("Failed to cleanup container networking: {}", e);
            }
        }

        // The wait failed: force-kill the child and reap it, ignoring errors.
        if !child_waited {
            let _ = kill(child, Signal::SIGKILL);
            let _ = waitpid(child, None);
        }

        if let Some(reader) = self.trace_reader.take() {
            reader.stop_and_flush();
        }

        if let Some(logger) = self.deny_logger.take() {
            logger.stop();
        }

        if let Some(cgroup) = self.cgroup_opt.take() {
            if let Err(e) = cgroup.cleanup() {
                warn!("Failed to cleanup cgroup: {}", e);
            }
        }

        if config.use_gvisor {
            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
                warn!(
                    "Failed to cleanup gVisor artifacts for {}: {}",
                    config.id, e
                );
            }
        }

        if let Err(e) = self.state_mgr.delete_state(&config.id) {
            warn!("Failed to delete state for {}: {}", config.id, e);
        }

        // Emit the stop audit event and hand the wait result to the caller.
        match run_result {
            Ok(exit_code) => {
                audit(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("exit_code={}", exit_code),
                );
                info!(
                    "Container {} ({}) exited with code {}",
                    config.name, config.id, exit_code
                );
                Ok(exit_code)
            }
            Err(e) => {
                audit_error(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("error={}", e),
                );
                Err(e)
            }
        }
    }
}
1740
1741/// RAII guard that stops the signal-forwarding thread on drop.
1742struct SignalThreadGuard {
1743    stop: Option<Arc<AtomicBool>>,
1744    handle: Option<JoinHandle<()>>,
1745}
1746
1747impl SignalThreadGuard {
1748    fn stop(&mut self) {
1749        if let Some(flag) = self.stop.take() {
1750            flag.store(true, Ordering::Release);
1751            if let Some(handle) = self.handle.as_ref() {
1752                super::signals::wake_sigwait_thread(handle, Signal::SIGUSR1);
1753            }
1754        }
1755        if let Some(handle) = self.handle.take() {
1756            let _ = handle.join();
1757        }
1758    }
1759}
1760
1761impl Drop for SignalThreadGuard {
1762    fn drop(&mut self) {
1763        self.stop();
1764    }
1765}
1766
/// Owns the health-check thread and cancels it when dropped.
struct HealthThreadGuard {
    /// Cancellation flag shared with the thread; `None` once cancellation has
    /// been requested.
    cancel: Option<Arc<AtomicBool>>,
    /// Join handle of the thread; `None` when no thread was spawned or it has
    /// already been joined.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Raise the cancellation flag (first call only) and join the thread if a
    /// handle is still held. Safe to call more than once.
    fn stop(&mut self) {
        let cancel_flag = self.cancel.take();
        let worker = self.handle.take();

        if let Some(flag) = cancel_flag {
            flag.store(true, Ordering::Relaxed);
        }
        if let Some(thread) = worker {
            // A panic inside the thread is deliberately ignored.
            let _ = thread.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Makes sure the thread is cancelled on every exit path.
    fn drop(&mut self) {
        self.stop();
    }
}
1789
1790#[cfg(test)]
1791mod tests {
1792    use super::*;
1793    use crate::container::KernelLockdownMode;
1794    use crate::network::NetworkMode;
1795    use std::ffi::OsString;
1796    use std::sync::{Mutex, MutexGuard};
1797
1798    static ENV_LOCK: Mutex<()> = Mutex::new(());
1799
1800    struct EnvLock {
1801        _guard: MutexGuard<'static, ()>,
1802    }
1803
1804    impl EnvLock {
1805        fn acquire() -> Self {
1806            Self {
1807                _guard: ENV_LOCK.lock().unwrap(),
1808            }
1809        }
1810    }
1811
1812    struct EnvVarGuard {
1813        key: &'static str,
1814        previous: Option<OsString>,
1815    }
1816
1817    impl EnvVarGuard {
1818        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
1819            let previous = std::env::var_os(key);
1820            std::env::set_var(key, value);
1821            Self { key, previous }
1822        }
1823
1824        fn remove(key: &'static str) -> Self {
1825            let previous = std::env::var_os(key);
1826            std::env::remove_var(key);
1827            Self { key, previous }
1828        }
1829    }
1830
1831    impl Drop for EnvVarGuard {
1832        fn drop(&mut self) {
1833            match &self.previous {
1834                Some(value) => std::env::set_var(self.key, value),
1835                None => std::env::remove_var(self.key),
1836            }
1837        }
1838    }
1839
1840    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
1841        let fn_start = source
1842            .find(fn_signature)
1843            .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
1844        let after = &source[fn_start..];
1845        let open = after
1846            .find('{')
1847            .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
1848        let mut depth = 0u32;
1849        let mut end = open;
1850        for (i, ch) in after[open..].char_indices() {
1851            match ch {
1852                '{' => depth += 1,
1853                '}' => {
1854                    depth -= 1;
1855                    if depth == 0 {
1856                        end = open + i + 1;
1857                        break;
1858                    }
1859                }
1860                _ => {}
1861            }
1862        }
1863        &after[..end]
1864    }
1865
1866    #[test]
1867    fn test_container_config() {
1868        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1869        assert!(!config.id.is_empty());
1870        assert_eq!(config.command, vec!["/bin/sh"]);
1871        assert!(config.use_gvisor);
1872    }
1873
1874    #[test]
1875    fn test_run_uses_immediate_start_path_with_parent_setup_gate() {
1876        let source = include_str!("runtime.rs");
1877        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
1878        let after = &source[fn_start..];
1879        let open = after.find('{').unwrap();
1880        let mut depth = 0u32;
1881        let mut fn_end = open;
1882        for (i, ch) in after[open..].char_indices() {
1883            match ch {
1884                '{' => depth += 1,
1885                '}' => {
1886                    depth -= 1;
1887                    if depth == 0 {
1888                        fn_end = open + i + 1;
1889                        break;
1890                    }
1891                }
1892                _ => {}
1893            }
1894        }
1895        let run_body = &after[..fn_end];
1896        assert!(
1897            run_body.contains("create_internal(false)"),
1898            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
1899        );
1900        assert!(
1901            !run_body.contains("self.create()?.start()"),
1902            "run() must not route through create()+start()"
1903        );
1904
1905        let create_body = extract_fn_body(source, "fn create_internal");
1906        assert!(
1907            create_body.contains("parent_setup_write"),
1908            "immediate run() must still use a parent setup gate before child setup proceeds"
1909        );
1910    }
1911
1912    #[test]
1913    fn test_container_config_with_name() {
1914        let config =
1915            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1916                .unwrap();
1917        assert_eq!(config.name, "mycontainer");
1918        assert!(!config.id.is_empty());
1919        assert_ne!(config.id, config.name);
1920    }
1921
1922    #[test]
1923    fn test_allow_degraded_security_requires_explicit_config() {
1924        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1925        assert!(!Container::allow_degraded_security(&strict));
1926
1927        let relaxed = strict.clone().with_allow_degraded_security(true);
1928        assert!(Container::allow_degraded_security(&relaxed));
1929    }
1930
1931    #[test]
1932    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1933        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1934        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1935
1936        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1937        assert!(!Container::allow_degraded_security(&strict));
1938
1939        let explicit = strict.with_allow_degraded_security(true);
1940        assert!(Container::allow_degraded_security(&explicit));
1941
1942        match prev {
1943            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1944            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1945        }
1946    }
1947
1948    #[test]
1949    fn test_host_network_requires_explicit_opt_in() {
1950        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1951            .unwrap()
1952            .with_network(NetworkMode::Host)
1953            .with_allow_host_network(false);
1954        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1955        assert!(matches!(err, NucleusError::NetworkError(_)));
1956    }
1957
1958    #[test]
1959    fn test_host_network_opt_in_disables_net_namespace() {
1960        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1961            .unwrap()
1962            .with_network(NetworkMode::Host)
1963            .with_allow_host_network(true);
1964        assert!(config.namespaces.net);
1965        Container::apply_network_mode_guards(&mut config, true).unwrap();
1966        assert!(!config.namespaces.net);
1967    }
1968
1969    #[test]
1970    fn test_non_host_network_does_not_require_host_opt_in() {
1971        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1972            .unwrap()
1973            .with_network(NetworkMode::None)
1974            .with_allow_host_network(false);
1975        assert!(config.namespaces.net);
1976        Container::apply_network_mode_guards(&mut config, true).unwrap();
1977        assert!(config.namespaces.net);
1978    }
1979
1980    #[test]
1981    fn test_parse_kernel_lockdown_mode() {
1982        assert_eq!(
1983            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
1984            Some(KernelLockdownMode::Integrity)
1985        );
1986        assert_eq!(
1987            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
1988            Some(KernelLockdownMode::Confidentiality)
1989        );
1990        assert_eq!(
1991            Container::parse_active_lockdown_mode("[none] integrity"),
1992            None
1993        );
1994    }
1995
1996    #[test]
1997    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
1998        let temp = tempfile::TempDir::new().unwrap();
1999        let source = temp.path().join("source-secret");
2000        std::fs::write(&source, "supersecret").unwrap();
2001
2002        let staged = Container::stage_gvisor_secret_files(
2003            &temp.path().join("stage"),
2004            &[crate::container::SecretMount {
2005                source: source.clone(),
2006                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
2007                mode: 0o400,
2008            }],
2009            &crate::container::ProcessIdentity::root(),
2010        )
2011        .unwrap();
2012
2013        assert_eq!(staged.len(), 1);
2014        assert!(staged[0].source.starts_with(temp.path().join("stage")));
2015        assert_eq!(
2016            std::fs::read_to_string(&staged[0].source).unwrap(),
2017            "supersecret"
2018        );
2019    }
2020
2021    #[test]
2022    fn test_stage_gvisor_secret_files_rejects_symlink_source() {
2023        use std::os::unix::fs::symlink;
2024
2025        let temp = tempfile::TempDir::new().unwrap();
2026        let source = temp.path().join("source-secret");
2027        let link = temp.path().join("source-link");
2028        std::fs::write(&source, "supersecret").unwrap();
2029        symlink(&source, &link).unwrap();
2030
2031        let err = Container::stage_gvisor_secret_files(
2032            &temp.path().join("stage"),
2033            &[crate::container::SecretMount {
2034                source: link,
2035                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
2036                mode: 0o400,
2037            }],
2038            &crate::container::ProcessIdentity::root(),
2039        )
2040        .unwrap_err();
2041
2042        assert!(
2043            err.to_string().contains("O_NOFOLLOW"),
2044            "gVisor secret staging must reject symlink sources"
2045        );
2046    }
2047
2048    #[test]
2049    fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
2050        let source = include_str!("runtime.rs");
2051        let fn_body = extract_fn_body(source, "fn setup_and_exec");
2052        assert!(
2053            fn_body.contains("mount_secrets_inmemory("),
2054            "setup_and_exec must use in-memory secret mounting"
2055        );
2056        assert!(
2057            !fn_body.contains("mount_secrets(&"),
2058            "setup_and_exec must not bind-mount secrets from the host"
2059        );
2060    }
2061
2062    #[test]
2063    fn test_native_production_procfs_mount_is_not_rootless_best_effort() {
2064        let source = include_str!("runtime.rs");
2065        let fn_body = extract_fn_body(source, "fn setup_and_exec");
2066
2067        assert!(
2068            fn_body.contains(
2069                "let production_mode = self.config.service_mode == ServiceMode::Production;"
2070            ),
2071            "setup_and_exec must derive an explicit production-mode guard for procfs hardening"
2072        );
2073        assert!(
2074            fn_body.contains("let procfs_best_effort = is_rootless && !production_mode;"),
2075            "rootless best-effort procfs fallback must be disabled in production mode"
2076        );
2077        assert!(
2078            fn_body.contains(
2079                "mount_procfs(\n            &proc_path,\n            procfs_best_effort,"
2080            ),
2081            "mount_procfs must receive the production-aware best-effort flag"
2082        );
2083    }
2084
2085    #[test]
2086    fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
2087        let source = include_str!("gvisor_setup.rs");
2088        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2089        assert!(
2090            fn_body.contains("with_inmemory_secret_mounts"),
2091            "gVisor setup must use the tmpfs-backed secret staging path"
2092        );
2093        assert!(
2094            !fn_body.contains("with_secret_mounts"),
2095            "gVisor setup must not bind-mount host secret paths"
2096        );
2097    }
2098
2099    #[test]
2100    fn test_gvisor_bridge_precreated_userns_skips_nested_oci_userns() {
2101        let source = include_str!("gvisor_setup.rs");
2102        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2103        let precreated_check = fn_body.find("if precreated_userns").unwrap();
2104        let oci_userns = fn_body.find("with_rootless_user_namespace").unwrap();
2105        assert!(
2106            precreated_check < oci_userns,
2107            "pre-created rootless bridge userns must skip nested OCI user namespace setup"
2108        );
2109    }
2110
2111    #[test]
2112    fn test_gvisor_bridge_precreated_userns_disables_oci_no_new_privileges() {
2113        let source = include_str!("gvisor_setup.rs");
2114        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2115        assert!(
2116            fn_body.contains("if precreated_userns")
2117                && fn_body.contains("with_no_new_privileges(false)"),
2118            "pre-created rootless bridge userns must not pass OCI noNewPrivileges to runsc"
2119        );
2120    }
2121
2122    #[test]
2123    fn test_gvisor_bridge_precreated_userns_selects_runsc_rootless() {
2124        let source = include_str!("gvisor_setup.rs");
2125        let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2126        assert!(
2127            fn_body.contains("let runsc_rootless = precreated_userns"),
2128            "pre-created rootless bridge userns must use runsc's rootless execution path"
2129        );
2130    }
2131
2132    #[test]
2133    fn test_gvisor_bridge_rootless_requests_external_userns_mapping() {
2134        let source = include_str!("runtime.rs");
2135        let create_body = extract_fn_body(source, "fn create_internal");
2136        assert!(
2137            create_body.contains("let gvisor_bridge_needs_userns_mapping"),
2138            "gVisor bridge rootless setup must request parent-written userns mappings"
2139        );
2140        assert!(
2141            create_body.contains("matches!(config.network, NetworkMode::Bridge(_))"),
2142            "external mapping request must be scoped to gVisor bridge networking"
2143        );
2144    }
2145
2146    #[test]
2147    fn test_gvisor_bridge_namespace_creates_userns_before_netns() {
2148        let source = include_str!("runtime.rs");
2149        let fn_body = extract_fn_body(source, "fn prepare_gvisor_bridge_namespace");
2150        let userns = fn_body.find("CLONE_NEWUSER").unwrap();
2151        let request = fn_body.find("send_sync_byte").unwrap();
2152        let become_root = fn_body.find("become_userns_root_for_setup").unwrap();
2153        let netns = fn_body.find("CLONE_NEWNET").unwrap();
2154        assert!(
2155            userns < request && request < become_root && become_root < netns,
2156            "rootless gVisor bridge setup must map userns before creating the netns"
2157        );
2158    }
2159
2160    #[test]
2161    fn test_native_fork_sites_assert_single_threaded() {
2162        let runtime_source = include_str!("runtime.rs");
2163        let create_body = extract_fn_body(runtime_source, "fn create_internal");
2164        assert!(
2165            create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
2166            "create_internal must assert single-threaded before fork"
2167        );
2168
2169        let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
2170        assert!(
2171            setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
2172            "PID namespace setup must assert single-threaded before fork"
2173        );
2174
2175        let exec_source = include_str!("exec.rs");
2176        let init_body = extract_fn_body(exec_source, "fn run_as_init");
2177        assert!(
2178            init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
2179            "run_as_init must assert single-threaded before fork"
2180        );
2181    }
2182
2183    #[test]
2184    fn test_parent_setup_gate_released_after_network_policy() {
2185        let source = include_str!("runtime.rs");
2186        let create_body = extract_fn_body(source, "fn create_internal");
2187
2188        let cgroup_attach = create_body.find("cgroup.attach_process").unwrap();
2189        let deny_logger = create_body.find("maybe_start_seccomp_deny_logger").unwrap();
2190        let bridge_setup = create_body.find("BridgeDriver::setup_with_id").unwrap();
2191        let egress_policy = create_body.find("net.apply_egress_policy").unwrap();
2192        let release = create_body
2193            .find("Failed to notify child that parent setup is complete")
2194            .unwrap();
2195        let created = create_body.find("Ok(CreatedContainer").unwrap();
2196
2197        assert!(
2198            cgroup_attach < bridge_setup,
2199            "parent setup gate must not release before cgroup attachment"
2200        );
2201        assert!(
2202            cgroup_attach < deny_logger && deny_logger < bridge_setup,
2203            "seccomp deny logger must start after cgroup attachment and before workload release"
2204        );
2205        assert!(
2206            create_body.contains("cgroup_opt.as_ref().map(|cgroup| cgroup.path())"),
2207            "seccomp deny logger must receive the container cgroup scope"
2208        );
2209        assert!(
2210            bridge_setup < egress_policy && egress_policy < release,
2211            "parent setup gate must not release before bridge and egress policy setup"
2212        );
2213        assert!(
2214            release < created,
2215            "create_internal must release the child only after all fallible parent setup succeeds"
2216        );
2217        assert!(
2218            !create_body.contains("cgroup attachment is complete"),
2219            "child setup gate must not be released immediately after cgroup attachment"
2220        );
2221    }
2222
2223    #[test]
2224    fn test_child_waits_for_parent_setup_before_exec_paths() {
2225        let source = include_str!("runtime.rs");
2226        let setup_body = extract_fn_body(source, "fn setup_and_exec");
2227
2228        let gvisor_wait = setup_body
2229            .find("Parent closed setup pipe before signalling gVisor child")
2230            .unwrap();
2231        let gvisor_exec = setup_body.find("setup_and_exec_gvisor").unwrap();
2232        assert!(
2233            gvisor_wait < gvisor_exec,
2234            "gVisor path must wait for parent setup before execing runsc"
2235        );
2236
2237        let pid1_wait = setup_body
2238            .find("Parent closed setup pipe before signalling PID 1 child")
2239            .unwrap();
2240        let namespace_enter = setup_body.find("namespace_mgr.enter()?").unwrap();
2241        assert!(
2242            pid1_wait < namespace_enter,
2243            "PID namespace child must wait for parent setup before container setup continues"
2244        );
2245
2246        let direct_wait = setup_body
2247            .find("Parent closed setup pipe before signalling container child")
2248            .unwrap();
2249        assert!(
2250            direct_wait < namespace_enter,
2251            "non-PID namespace child must wait for parent setup before container setup continues"
2252        );
2253    }
2254
2255    #[test]
2256    fn test_parent_setup_failure_kills_reported_target_pid() {
2257        let source = include_str!("runtime.rs");
2258        let create_body = extract_fn_body(source, "fn create_internal");
2259
2260        let record_target = create_body
2261            .find("target_pid_for_cleanup = Some(target_pid)")
2262            .unwrap();
2263        let kill_target = create_body
2264            .find("kill(Pid::from_raw(target_pid as i32), Signal::SIGKILL)")
2265            .unwrap();
2266        let kill_intermediate = create_body.find("kill(child, Signal::SIGKILL)").unwrap();
2267
2268        assert!(
2269            record_target < kill_target,
2270            "parent setup cleanup must remember the reported target PID"
2271        );
2272        assert!(
2273            kill_target < kill_intermediate,
2274            "cleanup must kill the target PID before reaping the intermediate fork"
2275        );
2276    }
2277
2278    #[test]
2279    fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
2280        let source = include_str!("exec.rs");
2281        let fn_body = extract_fn_body(source, "fn run_as_init");
2282        assert!(
2283            !fn_body.contains("Self::apply_process_identity_to_current_process("),
2284            "run_as_init must not drop identity before the supervisor fork"
2285        );
2286        assert!(
2287            fn_body.contains("self.exec_command()?"),
2288            "workload child must still route through exec_command for identity application"
2289        );
2290    }
2291
2292    #[test]
2293    fn test_signal_thread_shutdown_uses_thread_directed_wakeup() {
2294        let runtime_source = include_str!("runtime.rs");
2295        let exec_source = include_str!("exec.rs");
2296        let signal_helper_source = include_str!("signals.rs");
2297        let process_directed_wakeup = ["kill(Pid::this()", ", Signal::SIGUSR1)"].concat();
2298
2299        assert!(
2300            !runtime_source.contains(&process_directed_wakeup),
2301            "CreatedContainer signal-thread shutdown must not send process-directed SIGUSR1"
2302        );
2303        assert!(
2304            !exec_source.contains(&process_directed_wakeup),
2305            "init supervisor signal-thread shutdown must not send process-directed SIGUSR1"
2306        );
2307        assert!(
2308            signal_helper_source.contains("libc::pthread_kill"),
2309            "signal-thread shutdown must wake the sigwait owner with a thread-directed signal"
2310        );
2311    }
2312
2313    #[test]
2314    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
2315        let _env_lock = EnvLock::acquire();
2316        let temp = tempfile::TempDir::new().unwrap();
2317        let _artifact_base = EnvVarGuard::set(
2318            "NUCLEUS_GVISOR_ARTIFACT_BASE",
2319            temp.path().join("gvisor-artifacts"),
2320        );
2321        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
2322        std::fs::create_dir_all(&artifact_dir).unwrap();
2323        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
2324
2325        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
2326        assert!(!artifact_dir.exists());
2327    }
2328
2329    #[test]
2330    fn test_gvisor_artifact_base_prefers_xdg_runtime_dir() {
2331        let _env_lock = EnvLock::acquire();
2332        let temp = tempfile::TempDir::new().unwrap();
2333        let _artifact_override = EnvVarGuard::remove("NUCLEUS_GVISOR_ARTIFACT_BASE");
2334        let _runtime = EnvVarGuard::set("XDG_RUNTIME_DIR", temp.path());
2335
2336        assert_eq!(
2337            Container::gvisor_artifact_dir("xdg-test"),
2338            temp.path().join("nucleus-gvisor").join("xdg-test")
2339        );
2340    }
2341
2342    #[test]
2343    fn test_health_check_loop_supports_cancellation() {
2344        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
2345        // and check it between iterations for prompt shutdown.
2346        // Function lives in health.rs after the runtime split.
2347        let source = include_str!("health.rs");
2348        let fn_start = source.find("fn health_check_loop").unwrap();
2349        let fn_body = &source[fn_start..fn_start + 2500];
2350        assert!(
2351            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
2352            "health_check_loop must accept an AtomicBool cancellation flag"
2353        );
2354        // Must also check cancellation during sleep
2355        assert!(
2356            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
2357            "health_check_loop must check cancellation during sleep intervals"
2358        );
2359    }
2360
2361    #[test]
2362    fn test_runtime_probes_do_not_spawn_host_nsenter() {
2363        // Both functions live in health.rs after the runtime split.
2364        let source = include_str!("health.rs");
2365
2366        let readiness_start = source.find("fn run_readiness_probe").unwrap();
2367        let readiness_body = &source[readiness_start..readiness_start + 2500];
2368        assert!(
2369            !readiness_body.contains("Command::new(&nsenter_bin)"),
2370            "readiness probes must not execute via host nsenter"
2371        );
2372
2373        let health_start = source.find("fn health_check_loop").unwrap();
2374        let health_body = &source[health_start..health_start + 2200];
2375        assert!(
2376            !health_body.contains("Command::new(&nsenter_bin)"),
2377            "health checks must not execute via host nsenter"
2378        );
2379    }
2380
2381    #[test]
2382    fn test_oci_mount_strip_prefix_no_expect() {
2383        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
2384        // Function lives in gvisor_setup.rs after the runtime split.
2385        let source = include_str!("gvisor_setup.rs");
2386        let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
2387        let fn_body = &source[fn_start..fn_start + 600];
2388        assert!(
2389            !fn_body.contains(".expect("),
2390            "prepare_oci_mountpoints must not use expect() – return Err instead"
2391        );
2392    }
2393
2394    #[test]
2395    fn test_notify_namespace_ready_validates_write_length() {
2396        // BUG-02: notify_namespace_ready must validate that all bytes were written
2397        let source = include_str!("runtime.rs");
2398        let fn_start = source.find("fn notify_namespace_ready").unwrap();
2399        let fn_body = &source[fn_start..fn_start + 500];
2400        // Must check the return value of write() for partial writes
2401        assert!(
2402            fn_body.contains("written")
2403                || fn_body.contains("4")
2404                || fn_body.contains("payload.len()"),
2405            "notify_namespace_ready must validate complete write of all 4 bytes"
2406        );
2407    }
2408
2409    #[test]
2410    fn test_rlimit_failures_fatal_in_production() {
2411        // SEC-05: RLIMIT failures must be fatal in production mode
2412        let source = include_str!("runtime.rs");
2413        let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
2414        let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
2415        assert!(
2416            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
2417            "RLIMIT failures must return Err in production mode"
2418        );
2419    }
2420
2421    #[test]
2422    fn test_tcp_readiness_probe_uses_portable_check() {
2423        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
2424        // Function lives in health.rs after the runtime split.
2425        let source = include_str!("health.rs");
2426        let probe_fn = source.find("TcpPort(port)").unwrap();
2427        let probe_body = &source[probe_fn..probe_fn + 500];
2428        assert!(
2429            !probe_body.contains("/dev/tcp"),
2430            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
2431        );
2432    }
2433}