// nucleus/container/runtime.rs

1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3    ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams,
4    OciStatus, ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8    audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9    mask_proc_paths, mount_procfs, mount_secrets, mount_secrets_inmemory,
10    snapshot_context_dir, switch_root, verify_context_manifest,
11    verify_rootfs_attestation, FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::{NamespaceManager};
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17    CapabilityManager, GVisorRuntime, LandlockManager,
18    OciContainerState, OciHooks, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::{AsRawFd, OwnedFd};
26use std::path::PathBuf;
27use tempfile::Builder;
28use tracing::{debug, error, info, info_span, warn};
29
30/// Container runtime that orchestrates all isolation mechanisms
31///
32/// Execution flow matches the formal specifications:
33/// 1. Create namespaces (Nucleus_Isolation_NamespaceLifecycle.tla)
34/// 2. Create and configure cgroups (Nucleus_Resources_CgroupLifecycle.tla)
35/// 3. Mount tmpfs and populate context (Nucleus_Filesystem_FilesystemLifecycle.tla)
36/// 4. Drop capabilities and apply seccomp (Nucleus_Security_SecurityEnforcement.tla)
37/// 5. Execute target process
38pub struct Container {
39    pub(super) config: ContainerConfig,
40    /// Pre-resolved runsc path, resolved before fork so that user-namespace
41    /// UID changes don't block PATH-based lookup.
42    pub(super) runsc_path: Option<String>,
43}
44
45/// Handle returned by `Container::create()` representing a container whose
46/// child process has been forked and is blocked on the exec FIFO, waiting for
47/// `start()` to release it.
48pub struct CreatedContainer {
49    pub(super) config: ContainerConfig,
50    pub(super) state_mgr: ContainerStateManager,
51    pub(super) state: ContainerState,
52    pub(super) child: Pid,
53    pub(super) cgroup_opt: Option<Cgroup>,
54    pub(super) bridge_net: Option<BridgeNetwork>,
55    pub(super) trace_reader: Option<SeccompTraceReader>,
56    pub(super) exec_fifo_path: PathBuf,
57    pub(super) _lifecycle_span: tracing::Span,
58}
59
60impl Container {
61    pub fn new(config: ContainerConfig) -> Self {
62        Self {
63            config,
64            runsc_path: None,
65        }
66    }
67
68    /// Run the container (convenience wrapper: create + start)
69    pub fn run(&self) -> Result<i32> {
70        self.create()?.start()
71    }
72
73    /// Create phase: fork the child, set up cgroup/bridge, leave child blocked
74    /// on the exec FIFO. Returns a `CreatedContainer` whose `start()` method
75    /// releases the child process.
76    pub fn create(&self) -> Result<CreatedContainer> {
77        let lifecycle_span = info_span!(
78            "container.lifecycle",
79            container.id = %self.config.id,
80            container.name = %self.config.name,
81            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
82        );
83        let _enter = lifecycle_span.enter();
84
85        info!(
86            "Creating container: {} (ID: {})",
87            self.config.name, self.config.id
88        );
89        audit(
90            &self.config.id,
91            &self.config.name,
92            AuditEventType::ContainerStart,
93            format!(
94                "command={:?} mode={:?} runtime={}",
95                self.config.command,
96                self.config.service_mode,
97                if self.config.use_gvisor {
98                    "gvisor"
99                } else {
100                    "native"
101                }
102            ),
103        );
104
105        // Auto-detect if we need rootless mode
106        let is_root = nix::unistd::Uid::effective().is_root();
107        let mut config = self.config.clone();
108
109        if !is_root && config.user_ns_config.is_none() {
110            info!("Not running as root, automatically enabling rootless mode");
111            config.namespaces.user = true;
112            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
113        }
114
115        // Log console-socket acceptance (OCI interface; PTY forwarding is a future enhancement)
116        if let Some(ref socket_path) = config.console_socket {
117            warn!(
118                "Console socket {} accepted but terminal forwarding is not yet implemented",
119                socket_path.display()
120            );
121        }
122
123        // Validate production mode invariants before anything else.
124        config.validate_production_mode()?;
125        Self::assert_kernel_lockdown(&config)?;
126
127        Self::apply_network_mode_guards(&mut config, is_root)?;
128        Self::apply_trust_level_guards(&mut config)?;
129        config.validate_runtime_support()?;
130
131        // Bridge networking requires root
132        if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
133            if config.service_mode == ServiceMode::Production {
134                return Err(NucleusError::NetworkError(
135                    "Production mode with bridge networking requires root (cannot silently \
136                     degrade to no networking)"
137                        .to_string(),
138                ));
139            }
140            warn!("Bridge networking requires root, degrading to no networking");
141            config.network = NetworkMode::None;
142        }
143
144        // Create state manager
145        let state_mgr = ContainerStateManager::new()?;
146
147        // Enforce name uniqueness among running containers
148        if let Ok(all_states) = state_mgr.list_states() {
149            if all_states.iter().any(|s| s.name == config.name) {
150                return Err(NucleusError::ConfigError(format!(
151                    "A container named '{}' already exists; use a different --name, \
152                     or remove the stale state with 'nucleus delete'",
153                    config.name
154                )));
155            }
156        }
157
158        // Create exec FIFO for two-phase create/start synchronization.
159        // The child will block on open-for-write until start() opens for read.
160        let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
161        nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
162            NucleusError::ExecError(format!("Failed to create exec FIFO {:?}: {}", exec_fifo, e))
163        })?;
164
165        // Try to create cgroup (optional for rootless mode)
166        let cgroup_name = format!("nucleus-{}", config.id);
167        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
168            Ok(mut cgroup) => {
169                // Try to set limits
170                match cgroup.set_limits(&config.limits) {
171                    Ok(_) => {
172                        info!("Created cgroup with resource limits");
173                        Some(cgroup)
174                    }
175                    Err(e) => {
176                        if config.service_mode == ServiceMode::Production {
177                            let _ = cgroup.cleanup();
178                            return Err(NucleusError::CgroupError(format!(
179                                "Production mode requires cgroup resource enforcement, but \
180                                 applying limits failed: {}",
181                                e
182                            )));
183                        }
184                        warn!("Failed to set cgroup limits: {}", e);
185                        let _ = cgroup.cleanup();
186                        None
187                    }
188                }
189            }
190            Err(e) => {
191                if config.service_mode == ServiceMode::Production {
192                    return Err(NucleusError::CgroupError(format!(
193                        "Production mode requires cgroup resource enforcement, but \
194                         cgroup creation failed: {}",
195                        e
196                    )));
197                }
198
199                if config.user_ns_config.is_some() {
200                    if config.limits.memory_bytes.is_some()
201                        || config.limits.cpu_quota_us.is_some()
202                        || config.limits.pids_max.is_some()
203                    {
204                        warn!(
205                            "Running in rootless mode: requested resource limits cannot be \
206                             enforced – cgroup creation requires root ({})",
207                            e
208                        );
209                    } else {
210                        debug!("Running in rootless mode without cgroup resource limits");
211                    }
212                } else {
213                    warn!(
214                        "Failed to create cgroup (running without resource limits): {}",
215                        e
216                    );
217                }
218                None
219            }
220        };
221
222        // Resolve runsc path before fork, while still unprivileged.
223        let runsc_path = if config.use_gvisor {
224            Some(GVisorRuntime::resolve_path().map_err(|e| {
225                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
226            })?)
227        } else {
228            None
229        };
230
231        // Child notifies parent after namespaces are ready.
232        let (ready_read, ready_write) = pipe().map_err(|e| {
233            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
234        })?;
235
236        // Fork child process
237        match unsafe { fork() }? {
238            ForkResult::Parent { child } => {
239                drop(ready_write);
240                info!("Forked child process: {}", child);
241
242                let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
243
244                let cgroup_path = cgroup_opt
245                    .as_ref()
246                    .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
247                let cpu_millicores = config
248                    .limits
249                    .cpu_quota_us
250                    .map(|quota| (quota * 1000) / config.limits.cpu_period_us);
251                let mut state = ContainerState::new(ContainerStateParams {
252                    id: config.id.clone(),
253                    name: config.name.clone(),
254                    pid: target_pid,
255                    command: config.command.clone(),
256                    memory_limit: config.limits.memory_bytes,
257                    cpu_limit: cpu_millicores,
258                    using_gvisor: config.use_gvisor,
259                    rootless: config.user_ns_config.is_some(),
260                    cgroup_path,
261                });
262                state.config_hash = config.config_hash;
263                state.bundle_path = config.rootfs_path.as_ref().map(|p| p.display().to_string());
264
265                let mut bridge_net: Option<BridgeNetwork> = None;
266                let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
267
268                // Transition: Creating -> Created
269                state.status = OciStatus::Created;
270                state_mgr.save_state(&state)?;
271
272                // Write PID file (OCI --pid-file)
273                if let Some(ref pid_path) = config.pid_file {
274                    std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
275                        NucleusError::ConfigError(format!(
276                            "Failed to write pid-file '{}': {}",
277                            pid_path.display(),
278                            e
279                        ))
280                    })?;
281                    info!("Wrote PID {} to {}", target_pid, pid_path.display());
282                }
283
284                if let Some(ref mut cgroup) = cgroup_opt {
285                    cgroup.attach_process(target_pid)?;
286                }
287
288                if let NetworkMode::Bridge(ref bridge_config) = config.network {
289                    match BridgeNetwork::setup_with_id(target_pid, bridge_config, &config.id) {
290                        Ok(net) => {
291                            if let Some(ref egress) = config.egress_policy {
292                                if let Err(e) = net.apply_egress_policy(target_pid, egress) {
293                                    if config.service_mode == ServiceMode::Production {
294                                        return Err(NucleusError::NetworkError(format!(
295                                            "Failed to apply egress policy: {}",
296                                            e
297                                        )));
298                                    }
299                                    warn!("Failed to apply egress policy: {}", e);
300                                }
301                            }
302                            bridge_net = Some(net);
303                        }
304                        Err(e) => {
305                            if config.service_mode == ServiceMode::Production {
306                                return Err(e);
307                            }
308                            warn!("Failed to set up bridge networking: {}", e);
309                        }
310                    }
311                }
312
313                info!(
314                    "Container {} created (child pid {}), waiting for start",
315                    config.id, target_pid
316                );
317
318                Ok(CreatedContainer {
319                    config,
320                    state_mgr,
321                    state,
322                    child,
323                    cgroup_opt,
324                    bridge_net,
325                    trace_reader,
326                    exec_fifo_path: exec_fifo,
327                    _lifecycle_span: lifecycle_span.clone(),
328                })
329            }
330            ForkResult::Child => {
331                drop(ready_read);
332                let temp_container = Container { config, runsc_path };
333                match temp_container.setup_and_exec(Some(ready_write), Some(exec_fifo)) {
334                    Ok(_) => unreachable!(),
335                    Err(e) => {
336                        error!("Container setup failed: {}", e);
337                        std::process::exit(1);
338                    }
339                }
340            }
341        }
342    }
343
344    /// Trigger a previously-created container to start by opening its exec FIFO.
345    /// Used by the CLI `start` command.
346    pub fn trigger_start(container_id: &str) -> Result<()> {
347        let state_mgr = ContainerStateManager::new()?;
348        let fifo_path = state_mgr.exec_fifo_path(container_id)?;
349        if !fifo_path.exists() {
350            return Err(NucleusError::ConfigError(format!(
351                "No exec FIFO found for container {}; is it in 'created' state?",
352                container_id
353            )));
354        }
355
356        // Opening the FIFO for reading unblocks the child's open-for-write.
357        let file = std::fs::File::open(&fifo_path)
358            .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
359        let mut buf = [0u8; 1];
360        std::io::Read::read(&mut &file, &mut buf)
361            .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
362        drop(file);
363
364        let _ = std::fs::remove_file(&fifo_path);
365
366        // Update state to Running
367        let mut state = state_mgr.resolve_container(container_id)?;
368        state.status = OciStatus::Running;
369        state_mgr.save_state(&state)?;
370
371        Ok(())
372    }
373
374    /// Set up container environment and exec target process
375    ///
376    /// This runs in the child process after fork.
377    /// Tracks FilesystemState and SecurityState machines to enforce correct ordering.
378    fn setup_and_exec(
379        &self,
380        ready_pipe: Option<OwnedFd>,
381        exec_fifo: Option<PathBuf>,
382    ) -> Result<()> {
383        let is_rootless = self.config.user_ns_config.is_some();
384        let allow_degraded_security = Self::allow_degraded_security(&self.config);
385        let context_manifest = if self.config.verify_context_integrity {
386            self.config
387                .context_dir
388                .as_ref()
389                .map(|dir| snapshot_context_dir(dir))
390                .transpose()?
391        } else {
392            None
393        };
394
395        // Initialize state machines
396        let mut fs_state = FilesystemState::Unmounted;
397        let mut sec_state = SecurityState::Privileged;
398
399        // gVisor is the runtime that should create the container's namespaces.
400        // Running runsc after pre-unsharing our own namespaces breaks its gofer
401        // re-exec path on some systems and duplicates the OCI namespace config.
402        if self.config.use_gvisor {
403            if let Some(fd) = ready_pipe {
404                Self::notify_namespace_ready(&fd, std::process::id())?;
405            }
406            return self.setup_and_exec_gvisor();
407        }
408
409        // 1. Create namespaces in child and optionally configure user mapping.
410        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
411        if let Some(user_config) = &self.config.user_ns_config {
412            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
413        }
414        namespace_mgr.unshare_namespaces()?;
415
416        // CLONE_NEWPID only applies to children created after unshare().
417        // Create a child that will become PID 1 in the new namespace and exec the workload.
418        if self.config.namespaces.pid {
419            match unsafe { fork() }? {
420                ForkResult::Parent { child } => {
421                    if let Some(fd) = ready_pipe {
422                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
423                    }
424                    std::process::exit(Self::wait_for_pid_namespace_child(child));
425                }
426                ForkResult::Child => {
427                    // Continue container setup as PID 1 in the new namespace.
428                }
429            }
430        } else if let Some(fd) = ready_pipe {
431            Self::notify_namespace_ready(&fd, std::process::id())?;
432        }
433
434        // Namespace: Unshared -> Entered (process is now inside all namespaces)
435        namespace_mgr.enter()?;
436
437        // 2. Ensure no_new_privs BEFORE any mount operations.
438        // This prevents exploitation of setuid binaries on bind-mounted paths
439        // even if a subsequent MS_NOSUID remount fails.
440        self.enforce_no_new_privs()?;
441        audit(
442            &self.config.id,
443            &self.config.name,
444            AuditEventType::NoNewPrivsSet,
445            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
446        );
447
448        // 3. Set hostname if UTS namespace is enabled
449        if let Some(hostname) = &self.config.hostname {
450            namespace_mgr.set_hostname(hostname)?;
451        }
452
453        // 4. Mount tmpfs as container root
454        // Filesystem: Unmounted -> Mounted
455        let runtime_dir = Builder::new()
456            .prefix("nucleus-runtime-")
457            .tempdir_in("/tmp")
458            .map_err(|e| {
459                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
460            })?;
461        let container_root = runtime_dir.path().to_path_buf();
462        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); // 1GB default
463        tmpfs.mount()?;
464        fs_state = fs_state.transition(FilesystemState::Mounted)?;
465
466        // 4. Create minimal filesystem structure
467        create_minimal_fs(&container_root)?;
468
469        // 5. Create device nodes
470        let dev_path = container_root.join("dev");
471        create_dev_nodes(&dev_path, false)?;
472
473        // 6. Populate context if provided
474        // Filesystem: Mounted -> Populated
475        if let Some(context_dir) = &self.config.context_dir {
476            let context_dest = container_root.join("context");
477            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
478            if let Some(expected) = &context_manifest {
479                verify_context_manifest(expected, &context_dest)?;
480            }
481        }
482        fs_state = fs_state.transition(FilesystemState::Populated)?;
483
484        // 7. Mount runtime paths: either a pre-built rootfs or host bind mounts
485        if let Some(ref rootfs_path) = self.config.rootfs_path {
486            if self.config.verify_rootfs_attestation {
487                verify_rootfs_attestation(rootfs_path)?;
488            }
489            bind_mount_rootfs(&container_root, rootfs_path)?;
490        } else {
491            bind_mount_host_paths(&container_root, is_rootless)?;
492        }
493
494        // 7b. Write resolv.conf for bridge networking.
495        // When rootfs is mounted, /etc is read-only, so we bind-mount a writable
496        // resolv.conf over the top (same technique as secrets).
497        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
498            if self.config.rootfs_path.is_some() {
499                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
500            } else {
501                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
502            }
503        }
504
505        // 7c. Mount secrets (in-memory tmpfs for production, bind-mount for agent mode)
506        if self.config.service_mode == ServiceMode::Production {
507            mount_secrets_inmemory(&container_root, &self.config.secrets)?;
508        } else {
509            mount_secrets(&container_root, &self.config.secrets)?;
510        }
511
512        // 8. Mount procfs (hidepid=2 in production mode to prevent PID enumeration)
513        let proc_path = container_root.join("proc");
514        let hide_pids = self.config.service_mode == ServiceMode::Production;
515        mount_procfs(
516            &proc_path,
517            is_rootless,
518            self.config.proc_readonly,
519            hide_pids,
520        )?;
521
522        // 8b. Mask sensitive /proc paths to reduce kernel info leakage
523        // SEC-06: In production mode, failures to mask critical paths are fatal.
524        mask_proc_paths(
525            &proc_path,
526            self.config.service_mode == ServiceMode::Production,
527        )?;
528
529        // 9c. Run createRuntime hooks (after namespaces created, before pivot_root)
530        if let Some(ref hooks) = self.config.hooks {
531            if !hooks.create_runtime.is_empty() {
532                let hook_state = OciContainerState {
533                    oci_version: "1.0.2".to_string(),
534                    id: self.config.id.clone(),
535                    status: OciStatus::Creating,
536                    pid: std::process::id(),
537                    bundle: String::new(),
538                };
539                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
540            }
541        }
542
543        // 10. Switch root filesystem
544        // Filesystem: Populated -> Pivoted
545        switch_root(&container_root, self.config.allow_chroot_fallback)?;
546        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
547        debug!("Filesystem state: {:?}", fs_state);
548
549        // 10b. Audit mount flags to verify filesystem hardening invariants
550        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
551        audit(
552            &self.config.id,
553            &self.config.name,
554            AuditEventType::MountAuditPassed,
555            "all mount flags verified",
556        );
557
558        // 10c. Run createContainer hooks (after pivot_root, before start)
559        if let Some(ref hooks) = self.config.hooks {
560            if !hooks.create_container.is_empty() {
561                let hook_state = OciContainerState {
562                    oci_version: "1.0.2".to_string(),
563                    id: self.config.id.clone(),
564                    status: OciStatus::Created,
565                    pid: std::process::id(),
566                    bundle: String::new(),
567                };
568                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
569            }
570        }
571
572        // 11. Drop capabilities (from policy file or default drop-all)
573        // Security: Privileged -> CapabilitiesDropped
574        let mut cap_mgr = CapabilityManager::new();
575        if let Some(ref policy_path) = self.config.caps_policy {
576            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
577                policy_path,
578                self.config.caps_policy_sha256.as_deref(),
579            )?;
580            policy.apply(&mut cap_mgr)?;
581            audit(
582                &self.config.id,
583                &self.config.name,
584                AuditEventType::CapabilitiesDropped,
585                format!("capability policy applied from {:?}", policy_path),
586            );
587        } else {
588            cap_mgr.drop_all()?;
589            audit(
590                &self.config.id,
591                &self.config.name,
592                AuditEventType::CapabilitiesDropped,
593                "all capabilities dropped including bounding set",
594            );
595        }
596        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
597
598        // 12b. RLIMIT backstop: defense-in-depth against fork bombs and fd exhaustion.
599        // Must be applied BEFORE seccomp, since SYS_setrlimit is not in the allowlist.
600        // SEC-05: In production mode, RLIMIT failures are fatal — a container
601        // without resource limits is a privilege escalation vector.
602        {
603            let is_production = self.config.service_mode == ServiceMode::Production;
604
605            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
606            let rlim_nproc = libc::rlimit {
607                rlim_cur: nproc_limit,
608                rlim_max: nproc_limit,
609            };
610            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
611            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
612                let err = std::io::Error::last_os_error();
613                if is_production {
614                    return Err(NucleusError::SeccompError(format!(
615                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
616                        nproc_limit, err
617                    )));
618                }
619                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
620            }
621
622            let rlim_nofile = libc::rlimit {
623                rlim_cur: 1024,
624                rlim_max: 1024,
625            };
626            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
627            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
628                let err = std::io::Error::last_os_error();
629                if is_production {
630                    return Err(NucleusError::SeccompError(format!(
631                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
632                        err
633                    )));
634                }
635                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
636            }
637
638            // RLIMIT_MEMLOCK: prevent container from pinning excessive physical
639            // memory via mlock(). Default 64KB matches unprivileged default, but
640            // in a user namespace the container appears as UID 0 and may have a
641            // higher inherited limit.
642            let memlock_limit: u64 = 64 * 1024; // 64KB
643            let rlim_memlock = libc::rlimit {
644                rlim_cur: memlock_limit,
645                rlim_max: memlock_limit,
646            };
647            // SAFETY: setrlimit is a standard POSIX call with no memory safety concerns.
648            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
649                let err = std::io::Error::last_os_error();
650                if is_production {
651                    return Err(NucleusError::SeccompError(format!(
652                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
653                        memlock_limit, err
654                    )));
655                }
656                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
657            }
658        }
659
660        // 13. Apply seccomp filter (trace, profile-from-file, or built-in allowlist)
661        // Security: CapabilitiesDropped -> SeccompApplied
662        use crate::container::config::SeccompMode;
663        let mut seccomp_mgr = SeccompManager::new();
664        let allow_network = !matches!(self.config.network, NetworkMode::None);
665        let seccomp_applied = match self.config.seccomp_mode {
666            SeccompMode::Trace => {
667                audit(
668                    &self.config.id,
669                    &self.config.name,
670                    AuditEventType::SeccompApplied,
671                    "seccomp trace mode: allow-all + LOG",
672                );
673                seccomp_mgr.apply_trace_filter()?
674            }
675            SeccompMode::Enforce => {
676                if let Some(ref profile_path) = self.config.seccomp_profile {
677                    audit(
678                        &self.config.id,
679                        &self.config.name,
680                        AuditEventType::SeccompProfileLoaded,
681                        format!("path={:?}", profile_path),
682                    );
683                    seccomp_mgr.apply_profile_from_file(
684                        profile_path,
685                        self.config.seccomp_profile_sha256.as_deref(),
686                        self.config.seccomp_log_denied,
687                    )?
688                } else {
689                    seccomp_mgr.apply_filter_for_network_mode(
690                        allow_network,
691                        allow_degraded_security,
692                        self.config.seccomp_log_denied,
693                    )?
694                }
695            }
696        };
697        if seccomp_applied {
698            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
699            audit(
700                &self.config.id,
701                &self.config.name,
702                AuditEventType::SeccompApplied,
703                format!("network={}", allow_network),
704            );
705        } else if !allow_degraded_security {
706            return Err(NucleusError::SeccompError(
707                "Seccomp filter is required but was not enforced".to_string(),
708            ));
709        } else {
710            warn!("Seccomp not enforced; container is running with degraded hardening");
711        }
712
713        // 14. Apply Landlock policy (from policy file or default hardcoded rules)
714        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
715            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
716                policy_path,
717                self.config.landlock_policy_sha256.as_deref(),
718            )?;
719            policy.apply(allow_degraded_security)?
720        } else {
721            let mut landlock_mgr = LandlockManager::new();
722            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
723            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
724        };
725        if seccomp_applied && landlock_applied {
726            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
727            if self.config.seccomp_mode == SeccompMode::Trace {
728                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
729            } else {
730                sec_state = sec_state.transition(SecurityState::Locked)?;
731            }
732            audit(
733                &self.config.id,
734                &self.config.name,
735                AuditEventType::LandlockApplied,
736                if self.config.seccomp_mode == SeccompMode::Trace {
737                    "landlock applied, but seccomp in trace mode — not locked".to_string()
738                } else {
739                    "security state locked: all hardening layers active".to_string()
740                },
741            );
742        } else if !allow_degraded_security {
743            return Err(NucleusError::LandlockError(
744                "Landlock policy is required but was not enforced".to_string(),
745            ));
746        } else {
747            warn!("Security state not locked; one or more hardening controls are inactive");
748        }
749        debug!("Security state: {:?}", sec_state);
750
751        // 14c. Block on exec FIFO until start() opens it for reading.
752        // This implements the OCI two-phase create/start: all container setup
753        // is complete, but the user process doesn't exec until explicitly started.
754        if let Some(ref fifo_path) = exec_fifo {
755            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
756            let file = std::fs::OpenOptions::new()
757                .write(true)
758                .open(fifo_path)
759                .map_err(|e| {
760                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
761                })?;
762            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
763                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
764            })?;
765            drop(file);
766            debug!("Exec FIFO released, proceeding to exec");
767        }
768
769        // 14d. Run startContainer hooks (after start signal, before user process exec)
770        if let Some(ref hooks) = self.config.hooks {
771            if !hooks.start_container.is_empty() {
772                let hook_state = OciContainerState {
773                    oci_version: "1.0.2".to_string(),
774                    id: self.config.id.clone(),
775                    status: OciStatus::Running,
776                    pid: std::process::id(),
777                    bundle: String::new(),
778                };
779                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
780            }
781        }
782
783        // 15. In production mode with PID namespace, run as a mini-init (PID 1)
784        // that reaps zombies and forwards signals, rather than exec-ing directly.
785        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
786            return self.run_as_init();
787        }
788
789        // 15b. Agent mode: exec target process directly
790        self.exec_command()?;
791
792        // Should never reach here
793        Ok(())
794    }
795
796    /// Forward selected signals to child process using sigwait (no async signal handlers).
797    pub(super) fn setup_signal_forwarding_static(child: Pid) -> Result<()> {
798        let mut set = SigSet::empty();
799        for signal in [
800            Signal::SIGTERM,
801            Signal::SIGINT,
802            Signal::SIGHUP,
803            Signal::SIGQUIT,
804            Signal::SIGUSR1,
805            Signal::SIGUSR2,
806        ] {
807            set.add(signal);
808        }
809
810        pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&set), None).map_err(|e| {
811            NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
812        })?;
813
814        std::thread::spawn(move || {
815            while let Ok(signal) = set.wait() {
816                let _ = kill(child, signal);
817            }
818        });
819
820        info!("Signal forwarding configured");
821        Ok(())
822    }
823
824    /// Wait for child process to exit
825    pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
826        loop {
827            match waitpid(child, None) {
828                Ok(WaitStatus::Exited(_, code)) => {
829                    return Ok(code);
830                }
831                Ok(WaitStatus::Signaled(_, signal, _)) => {
832                    info!("Child killed by signal: {:?}", signal);
833                    return Ok(128 + signal as i32);
834                }
835                Err(nix::errno::Errno::EINTR) => {
836                    continue;
837                }
838                Err(e) => {
839                    return Err(NucleusError::ExecError(format!(
840                        "Failed to wait for child: {}",
841                        e
842                    )));
843                }
844                _ => {
845                    continue;
846                }
847            }
848        }
849    }
850
851    fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
852        let mut pid_buf = [0u8; 4];
853        loop {
854            match read(ready_read.as_raw_fd(), &mut pid_buf) {
855                Err(nix::errno::Errno::EINTR) => continue,
856                Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
857                Ok(0) => {
858                    return Err(NucleusError::ExecError(format!(
859                        "Child {} exited before namespace initialization",
860                        child
861                    )))
862                }
863                Ok(_) => {
864                    return Err(NucleusError::ExecError(
865                        "Invalid namespace sync payload from child".to_string(),
866                    ))
867                }
868                Err(e) => {
869                    return Err(NucleusError::ExecError(format!(
870                        "Failed waiting for child namespace setup: {}",
871                        e
872                    )))
873                }
874            }
875        }
876    }
877
878    fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
879        let payload = pid.to_ne_bytes();
880        let mut written = 0;
881        while written < payload.len() {
882            let n = write(fd, &payload[written..]).map_err(|e| {
883                NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
884            })?;
885            if n == 0 {
886                return Err(NucleusError::ExecError(
887                    "Failed to notify namespace readiness: short write".to_string(),
888                ));
889            }
890            written += n;
891        }
892        Ok(())
893    }
894
895    fn wait_for_pid_namespace_child(child: Pid) -> i32 {
896        loop {
897            match waitpid(child, None) {
898                Ok(WaitStatus::Exited(_, code)) => return code,
899                Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
900                Err(nix::errno::Errno::EINTR) => continue,
901                Err(_) => return 1,
902                _ => continue,
903            }
904        }
905    }
906}
907
908impl CreatedContainer {
909    /// Start phase: release the child via the exec FIFO, transition to Running,
910    /// then wait for the child to exit with full lifecycle management.
911    pub fn start(mut self) -> Result<i32> {
912        let config = &self.config;
913        let _enter = self._lifecycle_span.enter();
914
915        // Open the exec FIFO for reading — this unblocks the child's
916        // blocking open-for-write, allowing it to proceed to exec.
917        {
918            let file = std::fs::File::open(&self.exec_fifo_path).map_err(|e| {
919                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
920            })?;
921            let mut buf = [0u8; 1];
922            std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
923                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
924            })?;
925        }
926        let _ = std::fs::remove_file(&self.exec_fifo_path);
927
928        // Transition: Created -> Running
929        self.state.status = OciStatus::Running;
930        self.state_mgr.save_state(&self.state)?;
931
932        let target_pid = self.state.pid;
933        let child = self.child;
934
935        Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
936
937        // Run readiness probe before declaring service ready
938        if let Some(ref probe) = config.readiness_probe {
939            let notify_socket = if config.sd_notify {
940                std::env::var("NOTIFY_SOCKET").ok()
941            } else {
942                None
943            };
944            Container::run_readiness_probe(
945                target_pid,
946                &config.name,
947                probe,
948                config.user_ns_config.is_some(),
949                config.use_gvisor,
950                notify_socket.as_deref(),
951            )?;
952        }
953
954        // Start health check thread if configured
955        let cancel_flag = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
956        let health_handle = if let Some(ref hc) = config.health_check {
957            if !hc.command.is_empty() {
958                let hc = hc.clone();
959                let pid = target_pid;
960                let container_name = config.name.clone();
961                let rootless = config.user_ns_config.is_some();
962                let using_gvisor = config.use_gvisor;
963                let cancel = cancel_flag.clone();
964                Some(std::thread::spawn(move || {
965                    Container::health_check_loop(
966                        pid,
967                        &container_name,
968                        rootless,
969                        using_gvisor,
970                        &hc,
971                        &cancel,
972                    );
973                }))
974            } else {
975                None
976            }
977        } else {
978            None
979        };
980
981        // Run poststart hooks (after user process started, in parent)
982        if let Some(ref hooks) = config.hooks {
983            if !hooks.poststart.is_empty() {
984                let hook_state = OciContainerState {
985                    oci_version: "1.0.2".to_string(),
986                    id: config.id.clone(),
987                    status: OciStatus::Running,
988                    pid: target_pid,
989                    bundle: String::new(),
990                };
991                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
992            }
993        }
994
995        let mut child_waited = false;
996        let run_result: Result<i32> = (|| {
997            let exit_code = Container::wait_for_child_static(child)?;
998
999            // Transition: Running -> Stopped
1000            self.state.status = OciStatus::Stopped;
1001            let _ = self.state_mgr.save_state(&self.state);
1002
1003            child_waited = true;
1004            Ok(exit_code)
1005        })();
1006
1007        // Cancel health check thread immediately regardless of run_result.
1008        // This ensures the health thread stops even if readiness_probe or
1009        // wait_for_child_static fails, preventing nsenter calls into a dead container.
1010        cancel_flag.store(true, std::sync::atomic::Ordering::Relaxed);
1011        if let Some(handle) = health_handle {
1012            let _ = handle.join();
1013        }
1014
1015        // Run poststop hooks (best-effort)
1016        if let Some(ref hooks) = config.hooks {
1017            if !hooks.poststop.is_empty() {
1018                let hook_state = OciContainerState {
1019                    oci_version: "1.0.2".to_string(),
1020                    id: config.id.clone(),
1021                    status: OciStatus::Stopped,
1022                    pid: 0,
1023                    bundle: String::new(),
1024                };
1025                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1026            }
1027        }
1028
1029        if let Some(net) = self.bridge_net.take() {
1030            if let Err(e) = net.cleanup() {
1031                warn!("Failed to cleanup bridge networking: {}", e);
1032            }
1033        }
1034
1035        if !child_waited {
1036            let _ = kill(child, Signal::SIGKILL);
1037            let _ = waitpid(child, None);
1038        }
1039
1040        if let Some(reader) = self.trace_reader.take() {
1041            reader.stop_and_flush();
1042        }
1043
1044        if let Some(cgroup) = self.cgroup_opt.take() {
1045            if let Err(e) = cgroup.cleanup() {
1046                warn!("Failed to cleanup cgroup: {}", e);
1047            }
1048        }
1049
1050        if config.use_gvisor {
1051            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1052                warn!(
1053                    "Failed to cleanup gVisor artifacts for {}: {}",
1054                    config.id, e
1055                );
1056            }
1057        }
1058
1059        if let Err(e) = self.state_mgr.delete_state(&config.id) {
1060            warn!("Failed to delete state for {}: {}", config.id, e);
1061        }
1062
1063        match run_result {
1064            Ok(exit_code) => {
1065                audit(
1066                    &config.id,
1067                    &config.name,
1068                    AuditEventType::ContainerStop,
1069                    format!("exit_code={}", exit_code),
1070                );
1071                info!(
1072                    "Container {} ({}) exited with code {}",
1073                    config.name, config.id, exit_code
1074                );
1075                Ok(exit_code)
1076            }
1077            Err(e) => {
1078                audit_error(
1079                    &config.id,
1080                    &config.name,
1081                    AuditEventType::ContainerStop,
1082                    format!("error={}", e),
1083                );
1084                Err(e)
1085            }
1086        }
1087    }
1088}
1089
#[cfg(test)]
mod tests {
    use super::*;
    use crate::container::KernelLockdownMode;
    use crate::network::NetworkMode;

    /// Return at most `max_len` bytes of `source`, starting at the first
    /// occurrence of `marker`.
    ///
    /// The end of the window is clamped to the end of `source` and moved back
    /// to the nearest UTF-8 character boundary, so inspecting a short file or a
    /// file containing multi-byte characters cannot panic on slicing.
    ///
    /// # Panics
    ///
    /// Panics if `marker` does not occur in `source`.
    fn source_window<'a>(source: &'a str, marker: &str, max_len: usize) -> &'a str {
        let start = source
            .find(marker)
            .unwrap_or_else(|| panic!("marker {:?} not found in inspected source", marker));
        let mut end = start.saturating_add(max_len).min(source.len());
        // `start` is always a boundary, so this loop stops at `start` at the latest.
        while !source.is_char_boundary(end) {
            end -= 1;
        }
        &source[start..end]
    }

    #[test]
    fn test_source_window_is_clamped_and_char_boundary_safe() {
        let text = "abc fn demo — body";
        assert_eq!(source_window(text, "fn demo", 7), "fn demo");
        // A window longer than the remaining text is clamped to the end.
        assert_eq!(source_window(text, "fn demo", 10_000), "fn demo — body");
        // An end offset inside the multi-byte dash backs off to the previous boundary.
        assert_eq!(source_window(text, "fn demo", 9), "fn demo ");
    }

    #[test]
    fn test_container_config() {
        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!config.id.is_empty());
        assert_eq!(config.command, vec!["/bin/sh"]);
        assert!(config.use_gvisor);
    }

    #[test]
    fn test_container_config_with_name() {
        let config =
            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
                .unwrap();
        assert_eq!(config.name, "mycontainer");
        assert!(!config.id.is_empty());
        assert_ne!(config.id, config.name);
    }

    #[test]
    fn test_allow_degraded_security_requires_explicit_config() {
        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let relaxed = strict.clone().with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&relaxed));
    }

    #[test]
    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
        // Save and restore the variable so other tests see the original environment.
        let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
        std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");

        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let explicit = strict.with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&explicit));

        match prev {
            Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
            None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
        }
    }

    #[test]
    fn test_host_network_requires_explicit_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(false);
        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
        assert!(matches!(err, NucleusError::NetworkError(_)));
    }

    #[test]
    fn test_host_network_opt_in_disables_net_namespace() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(true);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(!config.namespaces.net);
    }

    #[test]
    fn test_non_host_network_does_not_require_host_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::None)
            .with_allow_host_network(false);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(config.namespaces.net);
    }

    #[test]
    fn test_parse_kernel_lockdown_mode() {
        assert_eq!(
            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
            Some(KernelLockdownMode::Integrity)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
            Some(KernelLockdownMode::Confidentiality)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("[none] integrity"),
            None
        );
    }

    #[test]
    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
        let temp = tempfile::TempDir::new().unwrap();
        let source = temp.path().join("source-secret");
        std::fs::write(&source, "supersecret").unwrap();

        let staged = Container::stage_gvisor_secret_files(
            &temp.path().join("stage"),
            &[crate::container::SecretMount {
                source: source.clone(),
                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
                mode: 0o400,
            }],
        )
        .unwrap();

        assert_eq!(staged.len(), 1);
        assert!(staged[0].source.starts_with(temp.path().join("stage")));
        assert_eq!(
            std::fs::read_to_string(&staged[0].source).unwrap(),
            "supersecret"
        );
    }

    #[test]
    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
        std::fs::create_dir_all(&artifact_dir).unwrap();
        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();

        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
        assert!(!artifact_dir.exists());
    }

    #[test]
    fn test_health_check_loop_supports_cancellation() {
        // BUG-18: health_check_loop must accept an AtomicBool cancel flag
        // and check it between iterations for prompt shutdown.
        // Function lives in health.rs after the runtime split.
        let source = include_str!("health.rs");
        let fn_body = source_window(source, "fn health_check_loop", 2500);
        assert!(
            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
            "health_check_loop must accept an AtomicBool cancellation flag"
        );
        // Must also check cancellation during sleep
        assert!(
            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
            "health_check_loop must check cancellation during sleep intervals"
        );
    }

    #[test]
    fn test_runtime_probes_do_not_spawn_host_nsenter() {
        // Both functions live in health.rs after the runtime split.
        let source = include_str!("health.rs");

        let readiness_body = source_window(source, "fn run_readiness_probe", 2500);
        assert!(
            !readiness_body.contains("Command::new(&nsenter_bin)"),
            "readiness probes must not execute via host nsenter"
        );

        let health_body = source_window(source, "fn health_check_loop", 2200);
        assert!(
            !health_body.contains("Command::new(&nsenter_bin)"),
            "health checks must not execute via host nsenter"
        );
    }

    #[test]
    fn test_oci_mount_strip_prefix_no_expect() {
        // BUG-08: prepare_oci_mountpoints must not use expect() - use ? instead
        // Function lives in gvisor_setup.rs after the runtime split.
        let source = include_str!("gvisor_setup.rs");
        let fn_body = source_window(source, "fn prepare_oci_mountpoints", 600);
        assert!(
            !fn_body.contains(".expect("),
            "prepare_oci_mountpoints must not use expect() — return Err instead"
        );
    }

    #[test]
    fn test_notify_namespace_ready_validates_write_length() {
        // BUG-02: notify_namespace_ready must validate that all bytes were written
        let source = include_str!("runtime.rs");
        let fn_body = source_window(source, "fn notify_namespace_ready", 500);
        // Must check the return value of write() for partial writes
        assert!(
            fn_body.contains("written")
                || fn_body.contains("4")
                || fn_body.contains("payload.len()"),
            "notify_namespace_ready must validate complete write of all 4 bytes"
        );
    }

    #[test]
    fn test_rlimit_failures_fatal_in_production() {
        // SEC-05: RLIMIT failures must be fatal in production mode
        let source = include_str!("runtime.rs");
        let rlimit_section = source_window(source, "12b. RLIMIT backstop", 2000);
        assert!(
            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
            "RLIMIT failures must return Err in production mode"
        );
    }

    #[test]
    fn test_tcp_readiness_probe_uses_portable_check() {
        // BUG-14: TCP readiness probe must not use /dev/tcp (bash-only)
        // Function lives in health.rs after the runtime split.
        let source = include_str!("health.rs");
        let probe_body = source_window(source, "TcpPort(port)", 500);
        assert!(
            !probe_body.contains("/dev/tcp"),
            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
        );
    }
}
1309            !probe_body.contains("/dev/tcp"),
1310            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
1311        );
1312    }
1313}