1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4 ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10 switch_root, verify_context_manifest, verify_rootfs_attestation, FilesystemState,
11 LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::{NamespaceManager, UserNamespaceMapper};
14use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
18 SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{
25 chown, fork, pipe, read, setresgid, setresuid, write, ForkResult, Gid, Pid, Uid,
26};
27use std::os::fd::OwnedFd;
28use std::os::unix::fs::PermissionsExt;
29use std::path::PathBuf;
30use std::sync::atomic::{AtomicBool, Ordering};
31use std::sync::Arc;
32use std::thread::JoinHandle;
33use tempfile::Builder;
34use tracing::{debug, error, info, info_span, warn};
35
/// Launcher for a single container, driven by a [`ContainerConfig`].
///
/// Instances are built with [`Container::new`]; the forked child in
/// `create_internal` also constructs one directly from the adjusted config.
pub struct Container {
    /// Configuration the container is created from. `create_internal` clones
    /// this and adjusts the clone, leaving this field untouched.
    pub(super) config: ContainerConfig,
    /// Path to the `runsc` binary. `new` leaves this as `None`; the forked
    /// child sets it to the resolved path when `config.use_gvisor` is true.
    pub(super) runsc_path: Option<String>,
}
50
/// Parent-side handle for a container whose child process has been forked and
/// prepared by `Container::create_internal`.
pub struct CreatedContainer {
    /// Effective configuration (the adjusted clone made during creation).
    pub(super) config: ContainerConfig,
    /// State manager used to persist `state`.
    pub(super) state_mgr: ContainerStateManager,
    /// Container state record; saved with status `Created` during creation.
    pub(super) state: ContainerState,
    /// PID returned by `fork()` for the directly forked child.
    pub(super) child: Pid,
    /// Cgroup for the container, or `None` when creation or limit application
    /// failed outside production mode.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network driver, present only when bridge networking was
    /// configured and its setup succeeded.
    pub(super) network_driver: Option<BridgeDriver>,
    /// Result of `maybe_start_seccomp_trace_reader` for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Result of `maybe_start_seccomp_deny_logger` for this container.
    pub(super) deny_logger: Option<SeccompDenyLogger>,
    /// Path of the exec FIFO when exec was deferred until start, else `None`.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Clone of the `container.lifecycle` tracing span opened during creation.
    pub(super) _lifecycle_span: tracing::Span,
}
66
67impl Container {
68 pub fn new(config: ContainerConfig) -> Self {
69 Self {
70 config,
71 runsc_path: None,
72 }
73 }
74
75 pub fn run(&self) -> Result<i32> {
77 self.create_internal(false)?.start()
78 }
79
80 pub fn create(&self) -> Result<CreatedContainer> {
84 self.create_internal(true)
85 }
86
87 fn sanitize_fds() {
92 const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
95 let ret =
97 unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
98 if ret == 0 {
99 return;
100 }
101 if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
105 let fds: Vec<i32> = entries
106 .flatten()
107 .filter_map(|entry| entry.file_name().into_string().ok())
108 .filter_map(|s| s.parse::<i32>().ok())
109 .filter(|&fd| fd > 2)
110 .collect();
111 for fd in fds {
112 unsafe { libc::close(fd) };
113 }
114 }
115 }
116
117 pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
118 let thread_count = std::fs::read_to_string("/proc/self/status")
119 .ok()
120 .and_then(|s| {
121 s.lines()
122 .find(|line| line.starts_with("Threads:"))
123 .and_then(|line| line.split_whitespace().nth(1))
124 .and_then(|count| count.parse::<u32>().ok())
125 });
126
127 if thread_count == Some(1) {
128 return Ok(());
129 }
130
131 Err(NucleusError::ExecError(format!(
132 "{} requires a single-threaded process before fork, found {:?} threads",
133 context, thread_count
134 )))
135 }
136
137 fn prepare_runtime_base_override(
138 config: &ContainerConfig,
139 host_is_root: bool,
140 needs_external_userns_mapping: bool,
141 ) -> Result<Option<PathBuf>> {
142 if !needs_external_userns_mapping {
143 return Ok(None);
144 }
145
146 if !host_is_root {
147 return Ok(Some(
148 dirs::runtime_dir()
149 .map(|d| d.join("nucleus"))
150 .unwrap_or_else(std::env::temp_dir),
151 ));
152 }
153
154 let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
155 NucleusError::ExecError("Missing user namespace configuration".to_string())
156 })?;
157 let host_uid =
158 Self::mapped_host_id_for_container_id(&user_config.uid_mappings, 0, "uid mappings")?;
159 let host_gid =
160 Self::mapped_host_id_for_container_id(&user_config.gid_mappings, 0, "gid mappings")?;
161
162 let root = PathBuf::from("/run/nucleus");
163 Self::ensure_runtime_parent_dir(&root)?;
164
165 let runtime_root = root.join("runtime");
166 Self::ensure_runtime_parent_dir(&runtime_root)?;
167
168 let base = runtime_root.join(&config.id);
169 std::fs::create_dir_all(&base).map_err(|e| {
170 NucleusError::FilesystemError(format!(
171 "Failed to create user namespace runtime base {:?}: {}",
172 base, e
173 ))
174 })?;
175 chown(
176 &base,
177 Some(Uid::from_raw(host_uid)),
178 Some(Gid::from_raw(host_gid)),
179 )
180 .map_err(|e| {
181 NucleusError::FilesystemError(format!(
182 "Failed to chown user namespace runtime base {:?} to {}:{}: {}",
183 base, host_uid, host_gid, e
184 ))
185 })?;
186 std::fs::set_permissions(&base, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
187 NucleusError::FilesystemError(format!(
188 "Failed to secure user namespace runtime base {:?}: {}",
189 base, e
190 ))
191 })?;
192
193 Ok(Some(base))
194 }
195
196 fn ensure_runtime_parent_dir(path: &std::path::Path) -> Result<()> {
197 std::fs::create_dir_all(path).map_err(|e| {
198 NucleusError::FilesystemError(format!(
199 "Failed to create runtime parent dir {:?}: {}",
200 path, e
201 ))
202 })?;
203 std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o711)).map_err(|e| {
204 NucleusError::FilesystemError(format!(
205 "Failed to secure runtime parent dir {:?}: {}",
206 path, e
207 ))
208 })?;
209 Ok(())
210 }
211
212 fn mapped_host_id_for_container_id(
213 mappings: &[crate::isolation::IdMapping],
214 container_id: u32,
215 label: &str,
216 ) -> Result<u32> {
217 for mapping in mappings {
218 let end = mapping
219 .container_id
220 .checked_add(mapping.count)
221 .ok_or_else(|| {
222 NucleusError::ConfigError(format!(
223 "{} overflow for container id {}",
224 label, container_id
225 ))
226 })?;
227 if container_id >= mapping.container_id && container_id < end {
228 return mapping
229 .host_id
230 .checked_add(container_id - mapping.container_id)
231 .ok_or_else(|| {
232 NucleusError::ConfigError(format!(
233 "{} host id overflow for container id {}",
234 label, container_id
235 ))
236 });
237 }
238 }
239
240 Err(NucleusError::ConfigError(format!(
241 "{} do not map container id {}",
242 label, container_id
243 )))
244 }
245
    /// Shared implementation behind `run` and `create`: validates the
    /// configuration, allocates host-side resources, forks the container
    /// child, and performs the parent half of the setup handshake.
    ///
    /// When `defer_exec_until_start` is true an exec FIFO is created and its
    /// path is handed to the child and stored in the returned
    /// [`CreatedContainer`]; otherwise no FIFO is used.
    ///
    /// In the parent this returns the [`CreatedContainer`] handle. In the
    /// forked child it never returns: the child either continues inside
    /// `setup_and_exec` or exits with status 1 on failure.
    ///
    /// # Errors
    /// Returns the first error from validation, state manager creation, FIFO
    /// or cgroup creation (cgroup errors are fatal only in production mode),
    /// pipe creation, the single-thread check, `fork`, or the parent-side
    /// setup. If the parent-side setup fails, the child is sent `SIGKILL` and
    /// reaped before the error is returned.
    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
        let lifecycle_span = info_span!(
            "container.lifecycle",
            container.id = %self.config.id,
            container.name = %self.config.name,
            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
        );
        let _enter = lifecycle_span.enter();

        info!(
            "Creating container: {} (ID: {})",
            self.config.name, self.config.id
        );
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::ContainerStart,
            format!(
                "command={:?} mode={:?} runtime={}",
                crate::audit::redact_command(&self.config.command),
                self.config.service_mode,
                if self.config.use_gvisor {
                    "gvisor"
                } else {
                    "native"
                }
            ),
        );

        let is_root = nix::unistd::Uid::effective().is_root();
        // Work on a private copy so the automatic adjustments below do not
        // mutate `self.config`.
        let mut config = self.config.clone();

        // Non-root callers without an explicit user namespace config get the
        // rootless configuration automatically.
        if !is_root && config.user_ns_config.is_none() {
            info!("Not running as root, automatically enabling rootless mode");
            config.namespaces.user = true;
            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
        }

        // Root without a user namespace: production mode forces remapping,
        // any other mode only warns.
        if is_root && !config.namespaces.user {
            if config.service_mode == ServiceMode::Production {
                info!("Running as root in production mode: enabling user namespace with UID remapping");
                config.namespaces.user = true;
                config.user_ns_config =
                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
            } else {
                warn!(
                    "Running as root WITHOUT user namespace isolation. \
                     Container processes will run as real host UID 0. \
                     Use --user-ns or production mode for UID remapping."
                );
            }
        }

        if let Some(ref socket_path) = config.console_socket {
            warn!(
                "Console socket {} accepted but terminal forwarding is not yet implemented",
                socket_path.display()
            );
        }

        // Validation and guard helpers (defined elsewhere). Each may fail,
        // which aborts creation before any state, FIFO or cgroup exists.
        config.validate_production_mode()?;
        Self::assert_kernel_lockdown(&config)?;

        Self::apply_network_mode_guards(&mut config, is_root)?;
        Self::apply_trust_level_guards(&mut config)?;
        config.validate_runtime_support()?;

        // Reject a bridge configuration that resolves to the kernel NAT
        // backend when the caller is not root.
        if let NetworkMode::Bridge(ref bridge_config) = config.network {
            let backend =
                bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
            if backend == NatBackend::Kernel && !is_root {
                return Err(NucleusError::NetworkError(
                    "Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
                        .to_string(),
                ));
            }
        }

        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;

        // Enforce unique container names. If listing states fails, the error
        // is ignored and the check is skipped.
        if let Ok(all_states) = state_mgr.list_states() {
            if all_states.iter().any(|s| s.name == config.name) {
                return Err(NucleusError::ConfigError(format!(
                    "A container named '{}' already exists; use a different --name, \
                     or remove the stale state with 'nucleus delete'",
                    config.name
                )));
            }
        }

        // Deferred exec: create a FIFO readable/writable by the owner only.
        // NOTE(review): nothing in this function removes the FIFO on the error
        // paths below — confirm cleanup happens elsewhere.
        let exec_fifo = if defer_exec_until_start {
            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create exec FIFO {:?}: {}",
                    exec_fifo, e
                ))
            })?;
            Some(exec_fifo)
        } else {
            None
        };

        let cgroup_name = format!("nucleus-{}", config.id);
        // Cgroup creation and limit application are mandatory in production
        // mode; in other modes a failure degrades to running without a cgroup.
        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
            Ok(mut cgroup) => {
                match cgroup.set_limits(&config.limits) {
                    Ok(_) => {
                        info!("Created cgroup with resource limits");
                        Some(cgroup)
                    }
                    Err(e) => {
                        if config.service_mode == ServiceMode::Production {
                            let _ = cgroup.cleanup();
                            return Err(NucleusError::CgroupError(format!(
                                "Production mode requires cgroup resource enforcement, but \
                                 applying limits failed: {}",
                                e
                            )));
                        }
                        warn!("Failed to set cgroup limits: {}", e);
                        let _ = cgroup.cleanup();
                        None
                    }
                }
            }
            Err(e) => {
                if config.service_mode == ServiceMode::Production {
                    return Err(NucleusError::CgroupError(format!(
                        "Production mode requires cgroup resource enforcement, but \
                         cgroup creation failed: {}",
                        e
                    )));
                }

                // With a user namespace config, only warn when the user
                // actually requested limits that now cannot be enforced.
                if config.user_ns_config.is_some() {
                    if config.limits.memory_bytes.is_some()
                        || config.limits.cpu_quota_us.is_some()
                        || config.limits.pids_max.is_some()
                    {
                        warn!(
                            "Running in rootless mode: requested resource limits cannot be \
                             enforced – cgroup creation requires root ({})",
                            e
                        );
                    } else {
                        debug!("Running in rootless mode without cgroup resource limits");
                    }
                } else {
                    warn!(
                        "Failed to create cgroup (running without resource limits): {}",
                        e
                    );
                }
                None
            }
        };

        let runsc_path = if config.use_gvisor {
            Some(GVisorRuntime::resolve_path().map_err(|e| {
                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
            })?)
        } else {
            None
        };
        // The parent writes the uid/gid mappings for the child when a user
        // namespace config exists and either gVisor is not used, or gVisor is
        // used non-root with bridge networking.
        let gvisor_bridge_needs_userns_mapping = config.use_gvisor
            && !is_root
            && config.user_ns_config.is_some()
            && matches!(config.network, NetworkMode::Bridge(_));
        let needs_external_userns_mapping = config.user_ns_config.is_some()
            && (!config.use_gvisor || gvisor_bridge_needs_userns_mapping);
        let runtime_base_override =
            Self::prepare_runtime_base_override(&config, is_root, needs_external_userns_mapping)?;

        // Sync pipes, all created before fork so both sides inherit them:
        //  * ready:   child -> parent, namespace-ready notification.
        //  * request: child -> parent, "please write my userns mappings".
        //  * ack:     parent -> child, "mappings written".
        //  * attach:  parent -> child, "cgroup attachment complete".
        let (ready_read, ready_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
        })?;
        let userns_sync = if needs_external_userns_mapping {
            let (request_read, request_write) = pipe().map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create user namespace request pipe: {}",
                    e
                ))
            })?;
            let (ack_read, ack_write) = pipe().map_err(|e| {
                NucleusError::ExecError(format!("Failed to create user namespace ack pipe: {}", e))
            })?;
            Some((request_read, request_write, ack_read, ack_write))
        } else {
            None
        };
        let (attach_read, attach_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create cgroup attach sync pipe: {}", e))
        })?;

        // The fork below relies on the process being single-threaded; this
        // check fails creation otherwise.
        Self::assert_single_threaded_for_fork("container create fork")?;
        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                // Close the pipe ends that belong to the child.
                drop(ready_write);
                drop(attach_read);
                let (userns_request_read, userns_ack_write) =
                    if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
                        drop(request_write);
                        drop(ack_read);
                        (Some(request_read), Some(ack_write))
                    } else {
                        (None, None)
                    };
                info!("Forked child process: {}", child);

                // All fallible parent-side work lives in this closure so that
                // a single error handler below can kill and reap the child.
                let parent_setup = || -> Result<CreatedContainer> {
                    if needs_external_userns_mapping {
                        let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
                            NucleusError::ExecError(
                                "Missing user namespace configuration in parent".to_string(),
                            )
                        })?;
                        let request_read = userns_request_read.as_ref().ok_or_else(|| {
                            NucleusError::ExecError(
                                "Missing user namespace request pipe in parent".to_string(),
                            )
                        })?;
                        let ack_write = userns_ack_write.as_ref().ok_or_else(|| {
                            NucleusError::ExecError(
                                "Missing user namespace ack pipe in parent".to_string(),
                            )
                        })?;

                        // Handshake: wait for the child's request, write its
                        // mappings, then acknowledge.
                        Self::wait_for_sync_byte(
                            request_read,
                            &format!(
                                "Child {} exited before requesting user namespace mappings",
                                child
                            ),
                            "Failed waiting for child user namespace request",
                        )?;
                        UserNamespaceMapper::new(user_config.clone())
                            .write_mappings_for_pid(child.as_raw() as u32)?;
                        Self::send_sync_byte(
                            ack_write,
                            "Failed to notify child that user namespace mappings are ready",
                        )?;
                    }

                    // PID reported over the ready pipe; used below for state,
                    // cgroup attachment and networking instead of `child`.
                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;

                    let cgroup_path = cgroup_opt
                        .as_ref()
                        .map(|cgroup| cgroup.path().display().to_string());
                    // NOTE(review): divides by `cpu_period_us`; a zero period
                    // would panic here — confirm it is validated upstream.
                    let cpu_millicores = config
                        .limits
                        .cpu_quota_us
                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
                    let mut state = ContainerState::new(ContainerStateParams {
                        id: config.id.clone(),
                        name: config.name.clone(),
                        pid: target_pid,
                        command: config.command.clone(),
                        memory_limit: config.limits.memory_bytes,
                        cpu_limit: cpu_millicores,
                        using_gvisor: config.use_gvisor,
                        rootless: config.user_ns_config.is_some(),
                        cgroup_path,
                        process_uid: config.process_identity.uid,
                        process_gid: config.process_identity.gid,
                        additional_gids: config.process_identity.additional_gids.clone(),
                    });
                    state.config_hash = config.config_hash;
                    state.bundle_path =
                        config.rootfs_path.as_ref().map(|p| p.display().to_string());

                    let mut network_driver: Option<BridgeDriver> = None;
                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
                    let deny_logger = Self::maybe_start_seccomp_deny_logger(&config, target_pid)?;

                    // Persist the state as `Created` before writing the pid
                    // file and before releasing the child.
                    state.status = OciStatus::Created;
                    state_mgr.save_state(&state)?;

                    if let Some(ref pid_path) = config.pid_file {
                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                            NucleusError::ConfigError(format!(
                                "Failed to write pid-file '{}': {}",
                                pid_path.display(),
                                e
                            ))
                        })?;
                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
                    }

                    // Attach to the cgroup first, then signal the child over
                    // the attach pipe. The byte is sent even with no cgroup.
                    if let Some(ref mut cgroup) = cgroup_opt {
                        cgroup.attach_process(target_pid)?;
                    }
                    Self::send_sync_byte(
                        &attach_write,
                        "Failed to notify child that cgroup attachment is complete",
                    )?;

                    // Bridge networking and egress policy: failures are fatal
                    // in production mode and downgraded to warnings otherwise.
                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
                        match BridgeDriver::setup_with_id(
                            target_pid,
                            bridge_config,
                            &config.id,
                            is_root,
                            config.user_ns_config.is_some(),
                        ) {
                            Ok(net) => {
                                if let Some(ref egress) = config.egress_policy {
                                    if let Err(e) = net.apply_egress_policy(
                                        target_pid,
                                        egress,
                                        config.user_ns_config.is_some(),
                                    ) {
                                        if config.service_mode == ServiceMode::Production {
                                            return Err(NucleusError::NetworkError(format!(
                                                "Failed to apply egress policy: {}",
                                                e
                                            )));
                                        }
                                        warn!("Failed to apply egress policy: {}", e);
                                    }
                                }
                                network_driver = Some(net);
                            }
                            Err(e) => {
                                if config.service_mode == ServiceMode::Production {
                                    return Err(e);
                                }
                                warn!("Failed to set up bridge networking: {}", e);
                            }
                        }
                    }

                    info!(
                        "Container {} created (child pid {}), waiting for start",
                        config.id, target_pid
                    );

                    Ok(CreatedContainer {
                        config,
                        state_mgr,
                        state,
                        child,
                        cgroup_opt,
                        network_driver,
                        trace_reader,
                        deny_logger,
                        exec_fifo_path: exec_fifo,
                        _lifecycle_span: lifecycle_span.clone(),
                    })
                };

                // On any parent-side failure, kill and reap the child so it is
                // not left running or as a zombie; both results are ignored.
                parent_setup().map_err(|e| {
                    let _ = kill(child, Signal::SIGKILL);
                    let _ = waitpid(child, None);
                    e
                })
            }
            ForkResult::Child => {
                // Close the pipe ends that belong to the parent.
                drop(ready_read);
                drop(attach_write);
                let (userns_request_write, userns_ack_read) =
                    if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
                        drop(request_read);
                        drop(ack_write);
                        (Some(request_write), Some(ack_read))
                    } else {
                        (None, None)
                    };
                Self::sanitize_fds();
                let temp_container = Container { config, runsc_path };
                match temp_container.setup_and_exec(
                    Some(ready_write),
                    userns_request_write,
                    userns_ack_read,
                    Some(attach_read),
                    exec_fifo,
                    runtime_base_override,
                ) {
                    // `setup_and_exec` is treated as never returning `Ok`
                    // (presumably it execs on success — confirm in its body).
                    Ok(_) => unreachable!(),
                    Err(e) => {
                        error!("Container setup failed: {}", e);
                        std::process::exit(1);
                    }
                }
            }
        }
    }
660
661 pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
664 let state_mgr = ContainerStateManager::new_with_root(state_root)?;
665 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
666 if !fifo_path.exists() {
667 return Err(NucleusError::ConfigError(format!(
668 "No exec FIFO found for container {}; is it in 'created' state?",
669 container_id
670 )));
671 }
672
673 let file = std::fs::File::open(&fifo_path)
675 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
676 let mut buf = [0u8; 1];
677 std::io::Read::read(&mut &file, &mut buf)
678 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
679 drop(file);
680
681 let _ = std::fs::remove_file(&fifo_path);
682
683 let mut state = state_mgr.resolve_container(container_id)?;
685 state.status = OciStatus::Running;
686 state_mgr.save_state(&state)?;
687
688 Ok(())
689 }
690
691 fn setup_and_exec(
696 &self,
697 ready_pipe: Option<OwnedFd>,
698 userns_request_pipe: Option<OwnedFd>,
699 userns_ack_pipe: Option<OwnedFd>,
700 cgroup_attach_pipe: Option<OwnedFd>,
701 exec_fifo: Option<PathBuf>,
702 runtime_base_override: Option<PathBuf>,
703 ) -> Result<()> {
704 let is_rootless = self.config.user_ns_config.is_some();
705 let allow_degraded_security = Self::allow_degraded_security(&self.config);
706 let context_manifest = if self.config.verify_context_integrity {
707 self.config
708 .context_dir
709 .as_ref()
710 .map(|dir| snapshot_context_dir(dir))
711 .transpose()?
712 } else {
713 None
714 };
715
716 let mut fs_state = FilesystemState::Unmounted;
718 let mut sec_state = SecurityState::Privileged;
719
720 if self.config.use_gvisor {
724 let gvisor_bridge_precreated_userns =
725 if matches!(self.config.network, NetworkMode::Bridge(_)) {
726 self.prepare_gvisor_bridge_namespace(
727 userns_request_pipe.as_ref(),
728 userns_ack_pipe.as_ref(),
729 )?
730 } else {
731 false
732 };
733
734 if let Some(fd) = ready_pipe {
735 Self::notify_namespace_ready(&fd, std::process::id())?;
736 }
737 if let Some(fd) = cgroup_attach_pipe.as_ref() {
738 Self::wait_for_sync_byte(
739 fd,
740 "Parent closed cgroup attach pipe before signalling gVisor child",
741 "Failed waiting for cgroup attach acknowledgement",
742 )?;
743 }
744 return self.setup_and_exec_gvisor(gvisor_bridge_precreated_userns);
745 }
746
747 let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
749 namespace_mgr.unshare_namespaces()?;
750 if self.config.user_ns_config.is_some() {
751 let request_fd = userns_request_pipe.as_ref().ok_or_else(|| {
752 NucleusError::ExecError(
753 "Missing user namespace request pipe in container child".to_string(),
754 )
755 })?;
756 let ack_fd = userns_ack_pipe.as_ref().ok_or_else(|| {
757 NucleusError::ExecError(
758 "Missing user namespace acknowledgement pipe in container child".to_string(),
759 )
760 })?;
761
762 Self::send_sync_byte(
763 request_fd,
764 "Failed to request user namespace mappings from parent",
765 )?;
766 Self::wait_for_sync_byte(
767 ack_fd,
768 "Parent closed user namespace ack pipe before mappings were written",
769 "Failed waiting for parent to finish user namespace mappings",
770 )?;
771 Self::become_userns_root_for_setup()?;
772 }
773
774 if self.config.namespaces.pid {
777 Self::assert_single_threaded_for_fork("PID namespace init fork")?;
778 match unsafe { fork() }? {
779 ForkResult::Parent { child } => {
780 if let Some(fd) = ready_pipe {
781 Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
782 }
783 std::process::exit(Self::wait_for_pid_namespace_child(child));
784 }
785 ForkResult::Child => {
786 if let Some(fd) = cgroup_attach_pipe.as_ref() {
787 Self::wait_for_sync_byte(
788 fd,
789 "Parent closed cgroup attach pipe before signalling PID 1 child",
790 "Failed waiting for cgroup attach acknowledgement",
791 )?;
792 }
793 }
795 }
796 } else {
797 if let Some(fd) = ready_pipe {
798 Self::notify_namespace_ready(&fd, std::process::id())?;
799 }
800 if let Some(fd) = cgroup_attach_pipe.as_ref() {
801 Self::wait_for_sync_byte(
802 fd,
803 "Parent closed cgroup attach pipe before signalling container child",
804 "Failed waiting for cgroup attach acknowledgement",
805 )?;
806 }
807 }
808
809 namespace_mgr.enter()?;
811
812 self.enforce_no_new_privs()?;
816 audit(
817 &self.config.id,
818 &self.config.name,
819 AuditEventType::NoNewPrivsSet,
820 "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
821 );
822
823 if let Some(hostname) = &self.config.hostname {
825 namespace_mgr.set_hostname(hostname)?;
826 }
827
828 let runtime_base = if let Some(path) = runtime_base_override {
833 path
834 } else if nix::unistd::Uid::effective().is_root() {
835 PathBuf::from("/run/nucleus")
836 } else {
837 dirs::runtime_dir()
838 .map(|d| d.join("nucleus"))
839 .unwrap_or_else(std::env::temp_dir)
840 };
841 let _ = std::fs::create_dir_all(&runtime_base);
842 let runtime_dir = Builder::new()
843 .prefix("nucleus-runtime-")
844 .tempdir_in(&runtime_base)
845 .map_err(|e| {
846 NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
847 })?;
848 let container_root = runtime_dir.path().to_path_buf();
849 let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); tmpfs.mount()?;
851 fs_state = fs_state.transition(FilesystemState::Mounted)?;
852
853 create_minimal_fs(&container_root)?;
855
856 let dev_path = container_root.join("dev");
858 create_dev_nodes(&dev_path, false)?;
859
860 let shm_path = dev_path.join("shm");
863 std::fs::create_dir_all(&shm_path).map_err(|e| {
864 NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
865 })?;
866 nix::mount::mount(
867 Some("shm"),
868 &shm_path,
869 Some("tmpfs"),
870 nix::mount::MsFlags::MS_NOSUID
871 | nix::mount::MsFlags::MS_NODEV
872 | nix::mount::MsFlags::MS_NOEXEC,
873 Some("mode=1777,size=64m"),
874 )
875 .map_err(|e| {
876 NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
877 })?;
878 debug!("Mounted tmpfs on /dev/shm");
879
880 if let Some(context_dir) = &self.config.context_dir {
883 let context_dest = container_root.join("context");
884 LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
885 if let Some(expected) = &context_manifest {
886 verify_context_manifest(expected, &context_dest)?;
887 }
888 }
889 fs_state = fs_state.transition(FilesystemState::Populated)?;
890
891 if let Some(ref rootfs_path) = self.config.rootfs_path {
893 if self.config.verify_rootfs_attestation {
894 verify_rootfs_attestation(rootfs_path)?;
895 }
896 bind_mount_rootfs(&container_root, rootfs_path)?;
897 } else {
898 bind_mount_host_paths(&container_root, is_rootless)?;
899 }
900
901 mount_volumes(&container_root, &self.config.volumes)?;
903
904 if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
908 let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
909 == NatBackend::Userspace
910 && bridge_config.dns.is_empty()
911 {
912 vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
913 } else {
914 bridge_config.dns.clone()
915 };
916 if self.config.rootfs_path.is_some() {
917 BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
918 } else {
919 BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
920 }
921 }
922
923 mount_secrets_inmemory(
925 &container_root,
926 &self.config.secrets,
927 &self.config.process_identity,
928 )?;
929
930 let proc_path = container_root.join("proc");
932 let hide_pids = self.config.service_mode == ServiceMode::Production;
933 mount_procfs(
934 &proc_path,
935 is_rootless,
936 self.config.proc_readonly,
937 hide_pids,
938 )?;
939
940 mask_proc_paths(
943 &proc_path,
944 self.config.service_mode == ServiceMode::Production,
945 )?;
946
947 if let Some(ref hooks) = self.config.hooks {
949 if !hooks.create_runtime.is_empty() {
950 let hook_state = OciContainerState {
951 oci_version: "1.0.2".to_string(),
952 id: self.config.id.clone(),
953 status: OciStatus::Creating,
954 pid: std::process::id(),
955 bundle: String::new(),
956 };
957 OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
958 }
959 }
960
961 switch_root(&container_root, self.config.allow_chroot_fallback)?;
964 fs_state = fs_state.transition(FilesystemState::Pivoted)?;
965 debug!("Filesystem state: {:?}", fs_state);
966
967 audit_mounts(self.config.service_mode == ServiceMode::Production)?;
969 audit(
970 &self.config.id,
971 &self.config.name,
972 AuditEventType::MountAuditPassed,
973 "all mount flags verified",
974 );
975
976 if let Some(ref hooks) = self.config.hooks {
978 if !hooks.create_container.is_empty() {
979 let hook_state = OciContainerState {
980 oci_version: "1.0.2".to_string(),
981 id: self.config.id.clone(),
982 status: OciStatus::Created,
983 pid: std::process::id(),
984 bundle: String::new(),
985 };
986 OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
987 }
988 }
989
990 let mut cap_mgr = CapabilityManager::new();
1000 if let Some(ref policy_path) = self.config.caps_policy {
1001 let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
1002 policy_path,
1003 self.config.caps_policy_sha256.as_deref(),
1004 )?;
1005 if self.config.service_mode == ServiceMode::Production {
1007 policy.validate_production()?;
1008 }
1009 policy.apply(&mut cap_mgr)?;
1010 Self::apply_process_identity_to_current_process(
1012 &self.config.process_identity,
1013 self.config.user_ns_config.is_some(),
1014 )?;
1015 audit(
1016 &self.config.id,
1017 &self.config.name,
1018 AuditEventType::CapabilitiesDropped,
1019 format!("capability policy applied from {:?}", policy_path),
1020 );
1021 } else {
1022 cap_mgr.drop_bounding_set()?;
1024
1025 Self::apply_process_identity_to_current_process(
1029 &self.config.process_identity,
1030 self.config.user_ns_config.is_some(),
1031 )?;
1032
1033 cap_mgr.finalize_drop()?;
1036
1037 audit(
1038 &self.config.id,
1039 &self.config.name,
1040 AuditEventType::CapabilitiesDropped,
1041 "all capabilities dropped including bounding set",
1042 );
1043 }
1044 sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
1045
1046 {
1051 let is_production = self.config.service_mode == ServiceMode::Production;
1052
1053 if let Some(nproc_limit) = self.config.limits.pids_max {
1054 let rlim_nproc = libc::rlimit {
1055 rlim_cur: nproc_limit,
1056 rlim_max: nproc_limit,
1057 };
1058 if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
1060 let err = std::io::Error::last_os_error();
1061 if is_production {
1062 return Err(NucleusError::SeccompError(format!(
1063 "Failed to set RLIMIT_NPROC to {} in production mode: {}",
1064 nproc_limit, err
1065 )));
1066 }
1067 warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
1068 }
1069 }
1070
1071 let rlim_nofile = libc::rlimit {
1072 rlim_cur: 1024,
1073 rlim_max: 1024,
1074 };
1075 if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
1077 let err = std::io::Error::last_os_error();
1078 if is_production {
1079 return Err(NucleusError::SeccompError(format!(
1080 "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
1081 err
1082 )));
1083 }
1084 warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
1085 }
1086
1087 let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
1092 let rlim_memlock = libc::rlimit {
1093 rlim_cur: memlock_limit,
1094 rlim_max: memlock_limit,
1095 };
1096 if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
1098 let err = std::io::Error::last_os_error();
1099 if is_production {
1100 return Err(NucleusError::SeccompError(format!(
1101 "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
1102 memlock_limit, err
1103 )));
1104 }
1105 warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
1106 }
1107 }
1108
1109 CapabilityManager::verify_no_namespace_caps(
1113 self.config.service_mode == ServiceMode::Production,
1114 )?;
1115
1116 use crate::container::config::SeccompMode;
1119 let mut seccomp_mgr = SeccompManager::new();
1120 let allow_network = !matches!(self.config.network, NetworkMode::None);
1121 let seccomp_applied = match self.config.seccomp_mode {
1122 SeccompMode::Trace => {
1123 audit(
1124 &self.config.id,
1125 &self.config.name,
1126 AuditEventType::SeccompApplied,
1127 "seccomp trace mode: allow-all + LOG",
1128 );
1129 seccomp_mgr.apply_trace_filter()?
1130 }
1131 SeccompMode::Enforce => {
1132 if let Some(ref profile_path) = self.config.seccomp_profile {
1133 audit(
1134 &self.config.id,
1135 &self.config.name,
1136 AuditEventType::SeccompProfileLoaded,
1137 format!("path={:?}", profile_path),
1138 );
1139 seccomp_mgr.apply_profile_from_file(
1140 profile_path,
1141 self.config.seccomp_profile_sha256.as_deref(),
1142 self.config.seccomp_log_denied,
1143 )?
1144 } else {
1145 seccomp_mgr.apply_filter_for_network_mode(
1146 allow_network,
1147 allow_degraded_security,
1148 self.config.seccomp_log_denied,
1149 &self.config.seccomp_allow_syscalls,
1150 )?
1151 }
1152 }
1153 };
1154 if seccomp_applied {
1155 sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
1156 audit(
1157 &self.config.id,
1158 &self.config.name,
1159 AuditEventType::SeccompApplied,
1160 format!("network={}", allow_network),
1161 );
1162 } else if !allow_degraded_security {
1163 return Err(NucleusError::SeccompError(
1164 "Seccomp filter is required but was not enforced".to_string(),
1165 ));
1166 } else {
1167 warn!("Seccomp not enforced; container is running with degraded hardening");
1168 }
1169
1170 let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
1172 let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
1173 policy_path,
1174 self.config.landlock_policy_sha256.as_deref(),
1175 )?;
1176 if self.config.service_mode == ServiceMode::Production {
1178 policy.validate_production()?;
1179 }
1180 policy.apply(allow_degraded_security)?
1181 } else {
1182 let mut landlock_mgr = LandlockManager::new();
1183 landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
1184 for vol in &self.config.volumes {
1186 landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
1187 }
1188 landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
1189 };
1190 if seccomp_applied && landlock_applied {
1191 sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
1192 if self.config.seccomp_mode == SeccompMode::Trace {
1193 warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
1194 } else {
1195 sec_state = sec_state.transition(SecurityState::Locked)?;
1196 }
1197 audit(
1198 &self.config.id,
1199 &self.config.name,
1200 AuditEventType::LandlockApplied,
1201 if self.config.seccomp_mode == SeccompMode::Trace {
1202 "landlock applied, but seccomp in trace mode – not locked".to_string()
1203 } else {
1204 "security state locked: all hardening layers active".to_string()
1205 },
1206 );
1207 } else if !allow_degraded_security {
1208 return Err(NucleusError::LandlockError(
1209 "Landlock policy is required but was not enforced".to_string(),
1210 ));
1211 } else {
1212 warn!("Security state not locked; one or more hardening controls are inactive");
1213 }
1214 debug!("Security state: {:?}", sec_state);
1215
1216 if let Some(ref fifo_path) = exec_fifo {
1220 debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
1221 let file = std::fs::OpenOptions::new()
1222 .write(true)
1223 .open(fifo_path)
1224 .map_err(|e| {
1225 NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
1226 })?;
1227 std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
1228 NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
1229 })?;
1230 drop(file);
1231 debug!("Exec FIFO released, proceeding to exec");
1232 }
1233
1234 if let Some(ref hooks) = self.config.hooks {
1236 if !hooks.start_container.is_empty() {
1237 let hook_state = OciContainerState {
1238 oci_version: "1.0.2".to_string(),
1239 id: self.config.id.clone(),
1240 status: OciStatus::Running,
1241 pid: std::process::id(),
1242 bundle: String::new(),
1243 };
1244 OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
1245 }
1246 }
1247
1248 if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
1251 return self.run_as_init();
1252 }
1253
1254 self.exec_command()?;
1256
1257 Ok(())
1259 }
1260
1261 pub(super) fn setup_signal_forwarding_static(
1266 child: Pid,
1267 ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
1268 let mut set = SigSet::empty();
1269 for signal in [
1270 Signal::SIGTERM,
1271 Signal::SIGINT,
1272 Signal::SIGHUP,
1273 Signal::SIGQUIT,
1274 Signal::SIGUSR1,
1275 Signal::SIGUSR2,
1276 ] {
1277 set.add(signal);
1278 }
1279
1280 let unblock_set = set;
1281 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
1282 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
1283 })?;
1284
1285 let stop = Arc::new(AtomicBool::new(false));
1286 let stop_clone = stop.clone();
1287 let handle = std::thread::Builder::new()
1288 .name("sig-forward".to_string())
1289 .spawn(move || {
1290 loop {
1292 if let Ok(signal) = unblock_set.wait() {
1293 if stop_clone.load(Ordering::Relaxed) {
1297 break;
1298 }
1299 let _ = kill(child, signal);
1300 }
1301 }
1302 })
1303 .map_err(|e| {
1304 let mut restore = SigSet::empty();
1307 for signal in [
1308 Signal::SIGTERM,
1309 Signal::SIGINT,
1310 Signal::SIGHUP,
1311 Signal::SIGQUIT,
1312 Signal::SIGUSR1,
1313 Signal::SIGUSR2,
1314 ] {
1315 restore.add(signal);
1316 }
1317 let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
1318 NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
1319 })?;
1320
1321 info!("Signal forwarding configured");
1322 Ok((stop, handle))
1323 }
1324
1325 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
1327 loop {
1328 match waitpid(child, None) {
1329 Ok(WaitStatus::Exited(_, code)) => {
1330 return Ok(code);
1331 }
1332 Ok(WaitStatus::Signaled(_, signal, _)) => {
1333 info!("Child killed by signal: {:?}", signal);
1334 return Ok(128 + signal as i32);
1335 }
1336 Err(nix::errno::Errno::EINTR) => {
1337 continue;
1338 }
1339 Err(e) => {
1340 return Err(NucleusError::ExecError(format!(
1341 "Failed to wait for child: {}",
1342 e
1343 )));
1344 }
1345 _ => {
1346 continue;
1347 }
1348 }
1349 }
1350 }
1351
1352 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1353 let mut pid_buf = [0u8; 4];
1354 loop {
1355 match read(ready_read, &mut pid_buf) {
1356 Err(nix::errno::Errno::EINTR) => continue,
1357 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1358 Ok(0) => {
1359 return Err(NucleusError::ExecError(format!(
1360 "Child {} exited before namespace initialization",
1361 child
1362 )))
1363 }
1364 Ok(_) => {
1365 return Err(NucleusError::ExecError(
1366 "Invalid namespace sync payload from child".to_string(),
1367 ))
1368 }
1369 Err(e) => {
1370 return Err(NucleusError::ExecError(format!(
1371 "Failed waiting for child namespace setup: {}",
1372 e
1373 )))
1374 }
1375 }
1376 }
1377 }
1378
1379 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1380 let payload = pid.to_ne_bytes();
1381 let mut written = 0;
1382 while written < payload.len() {
1383 let n = write(fd, &payload[written..]).map_err(|e| {
1384 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1385 })?;
1386 if n == 0 {
1387 return Err(NucleusError::ExecError(
1388 "Failed to notify namespace readiness: short write".to_string(),
1389 ));
1390 }
1391 written += n;
1392 }
1393 Ok(())
1394 }
1395
1396 fn send_sync_byte(fd: &OwnedFd, error_context: &str) -> Result<()> {
1397 let mut written = 0;
1398 let payload = [1u8];
1399 while written < payload.len() {
1400 let n = write(fd, &payload[written..])
1401 .map_err(|e| NucleusError::ExecError(format!("{}: {}", error_context, e)))?;
1402 if n == 0 {
1403 return Err(NucleusError::ExecError(format!(
1404 "{}: short write",
1405 error_context
1406 )));
1407 }
1408 written += n;
1409 }
1410 Ok(())
1411 }
1412
1413 fn wait_for_sync_byte(fd: &OwnedFd, eof_context: &str, error_context: &str) -> Result<()> {
1414 let mut payload = [0u8; 1];
1415 loop {
1416 match read(fd, &mut payload) {
1417 Err(nix::errno::Errno::EINTR) => continue,
1418 Ok(1) => return Ok(()),
1419 Ok(0) => return Err(NucleusError::ExecError(eof_context.to_string())),
1420 Ok(_) => {
1421 return Err(NucleusError::ExecError(format!(
1422 "{}: invalid sync payload",
1423 error_context
1424 )))
1425 }
1426 Err(e) => return Err(NucleusError::ExecError(format!("{}: {}", error_context, e))),
1427 }
1428 }
1429 }
1430
1431 fn become_userns_root_for_setup() -> Result<()> {
1432 setresgid(Gid::from_raw(0), Gid::from_raw(0), Gid::from_raw(0)).map_err(|e| {
1433 NucleusError::NamespaceError(format!(
1434 "Failed to become gid 0 inside mapped user namespace: {}",
1435 e
1436 ))
1437 })?;
1438 setresuid(Uid::from_raw(0), Uid::from_raw(0), Uid::from_raw(0)).map_err(|e| {
1439 NucleusError::NamespaceError(format!(
1440 "Failed to become uid 0 inside mapped user namespace: {}",
1441 e
1442 ))
1443 })?;
1444 debug!("Switched setup process to uid/gid 0 inside mapped user namespace");
1445 Ok(())
1446 }
1447
1448 fn prepare_gvisor_bridge_namespace(
1449 &self,
1450 userns_request_pipe: Option<&OwnedFd>,
1451 userns_ack_pipe: Option<&OwnedFd>,
1452 ) -> Result<bool> {
1453 let mut precreated_userns = false;
1454 if self.config.user_ns_config.is_some() && !Uid::effective().is_root() {
1455 nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWUSER).map_err(|e| {
1456 NucleusError::NamespaceError(format!(
1457 "Failed to unshare gVisor bridge user namespace: {}",
1458 e
1459 ))
1460 })?;
1461
1462 let request_fd = userns_request_pipe.ok_or_else(|| {
1463 NucleusError::ExecError(
1464 "Missing user namespace request pipe in gVisor bridge child".to_string(),
1465 )
1466 })?;
1467 let ack_fd = userns_ack_pipe.ok_or_else(|| {
1468 NucleusError::ExecError(
1469 "Missing user namespace acknowledgement pipe in gVisor bridge child"
1470 .to_string(),
1471 )
1472 })?;
1473
1474 Self::send_sync_byte(
1475 request_fd,
1476 "Failed to request gVisor bridge user namespace mappings from parent",
1477 )?;
1478 Self::wait_for_sync_byte(
1479 ack_fd,
1480 "Parent closed user namespace ack pipe before gVisor bridge mappings were written",
1481 "Failed waiting for parent to finish gVisor bridge user namespace mappings",
1482 )?;
1483 Self::become_userns_root_for_setup()?;
1484 precreated_userns = true;
1485 }
1486
1487 nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNET).map_err(|e| {
1488 NucleusError::NamespaceError(format!(
1489 "Failed to unshare gVisor bridge network namespace: {}",
1490 e
1491 ))
1492 })?;
1493 Ok(precreated_userns)
1494 }
1495
1496 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1497 loop {
1498 match waitpid(child, None) {
1499 Ok(WaitStatus::Exited(_, code)) => return code,
1500 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1501 Err(nix::errno::Errno::EINTR) => continue,
1502 Err(_) => return 1,
1503 _ => continue,
1504 }
1505 }
1506 }
1507}
1508
impl CreatedContainer {
    /// Starts the created container, waits for it to exit, tears down its
    /// resources and returns the exit code.
    ///
    /// Sequence, as implemented below:
    /// 1. If an exec FIFO path is set, block until one sync byte is read
    ///    from it, then remove the FIFO.
    /// 2. Persist the `Running` status.
    /// 3. Start signal forwarding to `state.pid`, run the optional readiness
    ///    probe, start the optional health-check thread and run the
    ///    `poststart` hooks.
    /// 4. Wait for `child` and persist `Stopped` (best effort).
    /// 5. Stop the helper threads, run `poststop` hooks (best effort), then
    ///    clean up networking, trace reader, deny logger, cgroup, gVisor
    ///    artifacts and the stored state.
    ///
    /// # Errors
    /// Returns the error of whichever step fails with `?` in phases 1-3, or
    /// the error from waiting on the child. Cleanup failures in phase 5 are
    /// only logged.
    ///
    /// NOTE(review): the `?` returns in phases 1-3 leave this function before
    /// the cleanup sequence runs; only the two thread guards are stopped via
    /// `Drop`. Confirm whether the child process, cgroup and network are
    /// released elsewhere on those paths.
    pub fn start(mut self) -> Result<i32> {
        let config = &self.config;
        let _enter = self._lifecycle_span.enter();

        if let Some(exec_fifo_path) = &self.exec_fifo_path {
            // Blocks until a single sync byte can be read from the FIFO; a
            // read of zero bytes means the writer closed without signalling.
            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
            })?;
            let mut buf = [0u8; 1];
            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
            })?;
            if read != 1 {
                return Err(NucleusError::ExecError(
                    "Exec FIFO closed before start signal was delivered".to_string(),
                ));
            }
            // The FIFO is single-use; removal failure is ignored.
            let _ = std::fs::remove_file(exec_fifo_path);
        }

        self.state.status = OciStatus::Running;
        self.state_mgr.save_state(&self.state)?;

        // Signals are forwarded to `state.pid`, while the wait below is on
        // `child`.
        // NOTE(review): these two may refer to different processes - confirm
        // against the code that fills in `state.pid`.
        let target_pid = self.state.pid;
        let child = self.child;

        let (sig_stop, sig_handle) =
            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;

        // Guard stops and joins the forwarding thread on every exit path.
        let mut sig_guard = SignalThreadGuard {
            stop: Some(sig_stop),
            handle: Some(sig_handle),
        };

        if let Some(ref probe) = config.readiness_probe {
            // NOTIFY_SOCKET is only consulted when sd_notify is enabled.
            let notify_socket = if config.sd_notify {
                std::env::var("NOTIFY_SOCKET").ok()
            } else {
                None
            };
            Container::run_readiness_probe(
                target_pid,
                &config.name,
                probe,
                config.user_ns_config.is_some(),
                config.use_gvisor,
                &config.process_identity,
                notify_socket.as_deref(),
            )?;
        }

        // Shared flag used to cancel the health-check thread.
        let cancel_flag = Arc::new(AtomicBool::new(false));
        // A health-check thread is spawned only for a non-empty command.
        let health_handle = if let Some(ref hc) = config.health_check {
            if !hc.command.is_empty() {
                // Clone everything the thread needs so it owns its inputs.
                let hc = hc.clone();
                let pid = target_pid;
                let container_name = config.name.clone();
                let rootless = config.user_ns_config.is_some();
                let using_gvisor = config.use_gvisor;
                let process_identity = config.process_identity.clone();
                let cancel = cancel_flag.clone();
                Some(std::thread::spawn(move || {
                    Container::health_check_loop(
                        pid,
                        &container_name,
                        rootless,
                        using_gvisor,
                        &hc,
                        &process_identity,
                        &cancel,
                    );
                }))
            } else {
                None
            }
        } else {
            None
        };

        // Guard cancels and joins the health-check thread on every exit path.
        let mut health_guard = HealthThreadGuard {
            cancel: Some(cancel_flag),
            handle: health_handle,
        };

        if let Some(ref hooks) = config.hooks {
            if !hooks.poststart.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Running,
                    pid: target_pid,
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
            }
        }

        // Set only when the wait succeeded; if it stays false the child is
        // killed and reaped during cleanup.
        let mut child_waited = false;
        // The wait runs in a closure so that its error is captured in
        // `run_result` and the cleanup below still executes.
        let run_result: Result<i32> = (|| {
            let exit_code = Container::wait_for_child_static(child)?;

            self.state.status = OciStatus::Stopped;
            // Best effort: the state is deleted further down anyway.
            let _ = self.state_mgr.save_state(&self.state);

            child_waited = true;
            Ok(exit_code)
        })();

        // Stop helper threads before tearing anything else down.
        health_guard.stop();
        sig_guard.stop();

        if let Some(ref hooks) = config.hooks {
            if !hooks.poststop.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Stopped,
                    pid: 0,
                    bundle: String::new(),
                };
                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
            }
        }

        if let Some(net) = self.network_driver.take() {
            if let Err(e) = net.cleanup() {
                warn!("Failed to cleanup container networking: {}", e);
            }
        }

        // The wait failed: force the child down and reap it.
        if !child_waited {
            let _ = kill(child, Signal::SIGKILL);
            let _ = waitpid(child, None);
        }

        if let Some(reader) = self.trace_reader.take() {
            reader.stop_and_flush();
        }

        if let Some(logger) = self.deny_logger.take() {
            logger.stop();
        }

        if let Some(cgroup) = self.cgroup_opt.take() {
            if let Err(e) = cgroup.cleanup() {
                warn!("Failed to cleanup cgroup: {}", e);
            }
        }

        if config.use_gvisor {
            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
                warn!(
                    "Failed to cleanup gVisor artifacts for {}: {}",
                    config.id, e
                );
            }
        }

        if let Err(e) = self.state_mgr.delete_state(&config.id) {
            warn!("Failed to delete state for {}: {}", config.id, e);
        }

        // Emit the audit record last, then hand the wait outcome back.
        match run_result {
            Ok(exit_code) => {
                audit(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("exit_code={}", exit_code),
                );
                info!(
                    "Container {} ({}) exited with code {}",
                    config.name, config.id, exit_code
                );
                Ok(exit_code)
            }
            Err(e) => {
                audit_error(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("error={}", e),
                );
                Err(e)
            }
        }
    }
}
1712
1713struct SignalThreadGuard {
1715 stop: Option<Arc<AtomicBool>>,
1716 handle: Option<JoinHandle<()>>,
1717}
1718
1719impl SignalThreadGuard {
1720 fn stop(&mut self) {
1721 if let Some(flag) = self.stop.take() {
1722 flag.store(true, Ordering::Relaxed);
1723 let _ = kill(Pid::this(), Signal::SIGUSR1);
1725 }
1726 if let Some(handle) = self.handle.take() {
1727 let _ = handle.join();
1728 }
1729 }
1730}
1731
1732impl Drop for SignalThreadGuard {
1733 fn drop(&mut self) {
1734 self.stop();
1735 }
1736}
1737
/// Owns the cancellation flag and join handle of the health-check thread and
/// shuts the thread down when stopped or dropped.
struct HealthThreadGuard {
    /// Cancellation flag shared with the thread; `None` once consumed.
    cancel: Option<Arc<AtomicBool>>,
    /// Join handle of the thread, if one was spawned; `None` once joined.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Sets the cancellation flag and joins the thread. Later calls do
    /// nothing.
    fn stop(&mut self) {
        let (cancel_flag, worker) = (self.cancel.take(), self.handle.take());

        if let Some(cancel_flag) = cancel_flag {
            cancel_flag.store(true, Ordering::Relaxed);
        }

        if let Some(worker) = worker {
            // A panic inside the thread is deliberately ignored here.
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Guarantees the thread is cancelled and joined on early returns.
    fn drop(&mut self) {
        self.stop();
    }
}
1760
1761#[cfg(test)]
1762mod tests {
1763 use super::*;
1764 use crate::container::KernelLockdownMode;
1765 use crate::network::NetworkMode;
1766 use std::ffi::OsString;
1767 use std::sync::{Mutex, MutexGuard};
1768
/// Serializes tests that read or mutate process-wide environment variables.
static ENV_LOCK: Mutex<()> = Mutex::new(());

/// RAII handle on `ENV_LOCK`; the lock is released when the value is dropped.
struct EnvLock {
    _guard: MutexGuard<'static, ()>,
}

impl EnvLock {
    /// Blocks until the environment lock is held.
    ///
    /// A poisoned lock is recovered instead of panicking. The mutex protects
    /// no data of its own, so a test that panicked while holding it must not
    /// make every later environment test fail as well.
    fn acquire() -> Self {
        let guard = ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        Self { _guard: guard }
    }
}
1782
/// Changes one environment variable for the lifetime of the guard and puts
/// the previous state back on drop.
struct EnvVarGuard {
    /// Name of the variable being managed.
    key: &'static str,
    /// Value the variable had before the guard changed it, if any.
    previous: Option<OsString>,
}

impl EnvVarGuard {
    /// Records the current value of `key`, applies `mutate`, and returns the
    /// guard that will undo the change.
    fn swap(key: &'static str, mutate: impl FnOnce(&'static str)) -> Self {
        let previous = std::env::var_os(key);
        mutate(key);
        Self { key, previous }
    }

    /// Sets `key` to `value` until the guard is dropped.
    fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
        Self::swap(key, |name| std::env::set_var(name, value))
    }

    /// Removes `key` from the environment until the guard is dropped.
    fn remove(key: &'static str) -> Self {
        Self::swap(key, |name| std::env::remove_var(name))
    }
}

impl Drop for EnvVarGuard {
    /// Restores the recorded value, or removes the variable if it was unset.
    fn drop(&mut self) {
        if let Some(value) = &self.previous {
            std::env::set_var(self.key, value);
        } else {
            std::env::remove_var(self.key);
        }
    }
}
1810
/// Returns the slice of `source` that starts at the first occurrence of
/// `fn_signature` and ends at the closing brace matching the first opening
/// brace found after it.
///
/// Braces are counted textually, so braces inside string literals or
/// comments take part in the matching.
///
/// # Panics
/// Panics if `fn_signature` does not occur in `source`, if no opening brace
/// follows it, or if the braces never balance. The last case used to return
/// only the signature, which let negative `contains` assertions pass without
/// inspecting any code.
fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
    let fn_start = source
        .find(fn_signature)
        .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
    let after = &source[fn_start..];
    let open = after
        .find('{')
        .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));

    // The scan starts on the opening brace, so depth is at least 1 before
    // any closing brace is seen and the subtraction cannot underflow.
    let mut depth = 0usize;
    for (offset, ch) in after[open..].char_indices() {
        match ch {
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    return &after[..open + offset + 1];
                }
            }
            _ => {}
        }
    }
    panic!("unbalanced braces in body of '{}'", fn_signature)
}
1836
/// A config built without a name gets a non-empty id, keeps the command and
/// enables gVisor.
#[test]
fn test_container_config() {
    let argv = vec!["/bin/sh".to_string()];
    let cfg = ContainerConfig::try_new(None, argv).unwrap();

    assert!(!cfg.id.is_empty());
    assert_eq!(cfg.command, vec!["/bin/sh"]);
    assert!(cfg.use_gvisor);
}
1844
/// Source-level check that `run()` calls `create_internal(false)` and does
/// not go through `create()` followed by `start()`.
///
/// Uses the shared `extract_fn_body` helper instead of a hand-copied brace
/// matcher.
#[test]
fn test_run_uses_immediate_start_path() {
    let source = include_str!("runtime.rs");
    let run_body = extract_fn_body(source, "pub fn run(&self) -> Result<i32>");
    assert!(
        run_body.contains("create_internal(false)"),
        "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
    );
    assert!(
        !run_body.contains("self.create()?.start()"),
        "run() must not route through create()+start()"
    );
}
1876
/// An explicit name is kept, and the generated id is non-empty and differs
/// from the name.
#[test]
fn test_container_config_with_name() {
    let requested_name = Some("mycontainer".to_string());
    let argv = vec!["/bin/sh".to_string()];
    let cfg = ContainerConfig::try_new(requested_name, argv).unwrap();

    assert_eq!(cfg.name, "mycontainer");
    assert!(!cfg.id.is_empty());
    assert_ne!(cfg.id, cfg.name);
}
1886
/// Degraded security is off for a fresh config and on only after the
/// explicit builder call.
#[test]
fn test_allow_degraded_security_requires_explicit_config() {
    let default_cfg = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    assert!(!Container::allow_degraded_security(&default_cfg));

    let opted_in = default_cfg.clone().with_allow_degraded_security(true);
    assert!(Container::allow_degraded_security(&opted_in));
}
1895
/// Setting `NUCLEUS_ALLOW_DEGRADED_SECURITY` in the environment must not
/// enable degraded security; only the explicit config flag does.
///
/// The environment is changed under `EnvLock` and through `EnvVarGuard`, so
/// the test cannot race other environment tests and the variable is restored
/// even if an assertion fails.
#[test]
fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
    let _env_lock = EnvLock::acquire();
    let _degraded_env = EnvVarGuard::set("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");

    let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    assert!(!Container::allow_degraded_security(&strict));

    let explicit = strict.with_allow_degraded_security(true);
    assert!(Container::allow_degraded_security(&explicit));
}
1912
/// Host networking without the host-network opt-in is rejected with a
/// `NetworkError`.
#[test]
fn test_host_network_requires_explicit_opt_in() {
    let base = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let mut cfg = base
        .with_network(NetworkMode::Host)
        .with_allow_host_network(false);

    let outcome = Container::apply_network_mode_guards(&mut cfg, true);
    let err = outcome.unwrap_err();
    assert!(matches!(err, NucleusError::NetworkError(_)));
}
1922
/// With the host-network opt-in, applying the guards turns the network
/// namespace off.
#[test]
fn test_host_network_opt_in_disables_net_namespace() {
    let base = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let mut cfg = base
        .with_network(NetworkMode::Host)
        .with_allow_host_network(true);

    // Enabled before the guards run ...
    assert!(cfg.namespaces.net);
    Container::apply_network_mode_guards(&mut cfg, true).unwrap();
    // ... and disabled afterwards.
    assert!(!cfg.namespaces.net);
}
1933
/// `NetworkMode::None` passes the guards without the host opt-in and keeps
/// the network namespace enabled.
#[test]
fn test_non_host_network_does_not_require_host_opt_in() {
    let base = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let mut cfg = base
        .with_network(NetworkMode::None)
        .with_allow_host_network(false);

    assert!(cfg.namespaces.net);
    Container::apply_network_mode_guards(&mut cfg, true).unwrap();
    assert!(cfg.namespaces.net);
}
1944
/// The bracketed entry of the lockdown string selects the active mode;
/// `[none]` maps to `None`.
#[test]
fn test_parse_kernel_lockdown_mode() {
    let cases = [
        (
            "none [integrity] confidentiality",
            Some(KernelLockdownMode::Integrity),
        ),
        (
            "none integrity [confidentiality]",
            Some(KernelLockdownMode::Confidentiality),
        ),
        ("[none] integrity", None),
    ];

    for (raw, expected) in cases {
        assert_eq!(Container::parse_active_lockdown_mode(raw), expected);
    }
}
1960
/// Staging a secret yields one mount whose source lives under the stage
/// directory and holds the original content.
#[test]
fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
    let temp = tempfile::TempDir::new().unwrap();
    let stage_dir = temp.path().join("stage");
    let secret_path = temp.path().join("source-secret");
    std::fs::write(&secret_path, "supersecret").unwrap();

    let mounts = [crate::container::SecretMount {
        source: secret_path.clone(),
        dest: std::path::PathBuf::from("/etc/app/secret.txt"),
        mode: 0o400,
    }];
    let identity = crate::container::ProcessIdentity::root();
    let staged = Container::stage_gvisor_secret_files(&stage_dir, &mounts, &identity).unwrap();

    assert_eq!(staged.len(), 1);
    assert!(staged[0].source.starts_with(temp.path().join("stage")));
    let staged_content = std::fs::read_to_string(&staged[0].source).unwrap();
    assert_eq!(staged_content, "supersecret");
}
1985
/// Staging a secret whose source is a symlink fails with an error that
/// mentions `O_NOFOLLOW`.
#[test]
fn test_stage_gvisor_secret_files_rejects_symlink_source() {
    use std::os::unix::fs::symlink;

    let temp = tempfile::TempDir::new().unwrap();
    let real_secret = temp.path().join("source-secret");
    let secret_link = temp.path().join("source-link");
    std::fs::write(&real_secret, "supersecret").unwrap();
    symlink(&real_secret, &secret_link).unwrap();

    let stage_dir = temp.path().join("stage");
    let mounts = [crate::container::SecretMount {
        source: secret_link,
        dest: std::path::PathBuf::from("/etc/app/secret.txt"),
        mode: 0o400,
    }];
    let identity = crate::container::ProcessIdentity::root();
    let err = Container::stage_gvisor_secret_files(&stage_dir, &mounts, &identity).unwrap_err();

    let message = err.to_string();
    assert!(
        message.contains("O_NOFOLLOW"),
        "gVisor secret staging must reject symlink sources"
    );
}
2012
/// Source-level check: `setup_and_exec` mounts secrets in memory and never
/// calls the bind-mounting `mount_secrets`.
#[test]
fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
    let setup_body = extract_fn_body(include_str!("runtime.rs"), "fn setup_and_exec");

    let uses_inmemory = setup_body.contains("mount_secrets_inmemory(");
    let uses_bind_mount = setup_body.contains("mount_secrets(&");

    assert!(
        uses_inmemory,
        "setup_and_exec must use in-memory secret mounting"
    );
    assert!(
        !uses_bind_mount,
        "setup_and_exec must not bind-mount secrets from the host"
    );
}
2026
/// Source-level check: the gVisor OCI setup uses
/// `with_inmemory_secret_mounts` and never `with_secret_mounts`.
#[test]
fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
    let setup_body = extract_fn_body(
        include_str!("gvisor_setup.rs"),
        "fn setup_and_exec_gvisor_oci",
    );

    let uses_inmemory = setup_body.contains("with_inmemory_secret_mounts");
    let uses_bind_mount = setup_body.contains("with_secret_mounts");

    assert!(
        uses_inmemory,
        "gVisor setup must use the tmpfs-backed secret staging path"
    );
    assert!(
        !uses_bind_mount,
        "gVisor setup must not bind-mount host secret paths"
    );
}
2040
/// Source-level check: in the gVisor OCI setup, the `if precreated_userns`
/// test appears before the rootless user-namespace builder call.
#[test]
fn test_gvisor_bridge_precreated_userns_skips_nested_oci_userns() {
    let setup_body = extract_fn_body(
        include_str!("gvisor_setup.rs"),
        "fn setup_and_exec_gvisor_oci",
    );

    let check_at = setup_body.find("if precreated_userns").unwrap();
    let nested_userns_at = setup_body.find("with_rootless_user_namespace").unwrap();

    assert!(
        check_at < nested_userns_at,
        "pre-created rootless bridge userns must skip nested OCI user namespace setup"
    );
}
2052
/// Source-level check: the gVisor OCI setup contains both the
/// `if precreated_userns` branch and a `with_no_new_privileges(false)` call.
#[test]
fn test_gvisor_bridge_precreated_userns_disables_oci_no_new_privileges() {
    let setup_body = extract_fn_body(
        include_str!("gvisor_setup.rs"),
        "fn setup_and_exec_gvisor_oci",
    );

    let has_branch = setup_body.contains("if precreated_userns");
    let disables_nnp = has_branch && setup_body.contains("with_no_new_privileges(false)");

    assert!(
        disables_nnp,
        "pre-created rootless bridge userns must not pass OCI noNewPrivileges to runsc"
    );
}
2063
/// Source-level check: `create_internal` computes
/// `gvisor_bridge_needs_userns_mapping` and matches on bridge networking.
#[test]
fn test_gvisor_bridge_rootless_requests_external_userns_mapping() {
    let create_body = extract_fn_body(include_str!("runtime.rs"), "fn create_internal");

    let requests_mapping = create_body.contains("let gvisor_bridge_needs_userns_mapping");
    let scoped_to_bridge =
        create_body.contains("matches!(config.network, NetworkMode::Bridge(_))");

    assert!(
        requests_mapping,
        "gVisor bridge rootless setup must request parent-written userns mappings"
    );
    assert!(
        scoped_to_bridge,
        "external mapping request must be scoped to gVisor bridge networking"
    );
}
2077
/// Source-level ordering check on `prepare_gvisor_bridge_namespace`: user
/// namespace, mapping request, switch to root, then network namespace.
#[test]
fn test_gvisor_bridge_namespace_creates_userns_before_netns() {
    let body = extract_fn_body(
        include_str!("runtime.rs"),
        "fn prepare_gvisor_bridge_namespace",
    );

    // Byte offsets of the first occurrence of each step, in expected order.
    let userns_at = body.find("CLONE_NEWUSER").unwrap();
    let request_at = body.find("send_sync_byte").unwrap();
    let become_root_at = body.find("become_userns_root_for_setup").unwrap();
    let netns_at = body.find("CLONE_NEWNET").unwrap();

    let in_order =
        userns_at < request_at && request_at < become_root_at && become_root_at < netns_at;
    assert!(
        in_order,
        "rootless gVisor bridge setup must map userns before creating the netns"
    );
}
2091
/// Source-level check: each native fork site calls
/// `assert_single_threaded_for_fork` with its own label.
#[test]
fn test_native_fork_sites_assert_single_threaded() {
    let runtime_src = include_str!("runtime.rs");
    let exec_src = include_str!("exec.rs");

    let create_fn = extract_fn_body(runtime_src, "fn create_internal");
    assert!(
        create_fn.contains("assert_single_threaded_for_fork(\"container create fork\")"),
        "create_internal must assert single-threaded before fork"
    );

    let setup_fn = extract_fn_body(runtime_src, "fn setup_and_exec");
    assert!(
        setup_fn.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
        "PID namespace setup must assert single-threaded before fork"
    );

    let init_fn = extract_fn_body(exec_src, "fn run_as_init");
    assert!(
        init_fn.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
        "run_as_init must assert single-threaded before fork"
    );
}
2114
/// Source-level check: `run_as_init` does not apply the process identity
/// itself and still calls `self.exec_command()?`.
#[test]
fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
    let init_fn = extract_fn_body(include_str!("exec.rs"), "fn run_as_init");

    let drops_identity_early =
        init_fn.contains("Self::apply_process_identity_to_current_process(");
    let routes_through_exec = init_fn.contains("self.exec_command()?");

    assert!(
        !drops_identity_early,
        "run_as_init must not drop identity before the supervisor fork"
    );
    assert!(
        routes_through_exec,
        "workload child must still route through exec_command for identity application"
    );
}
2128
/// With the artifact base pointed at a temp directory, cleanup removes the
/// container's artifact directory.
#[test]
fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
    let _env_lock = EnvLock::acquire();
    let temp = tempfile::TempDir::new().unwrap();
    let base_dir = temp.path().join("gvisor-artifacts");
    let _artifact_base = EnvVarGuard::set("NUCLEUS_GVISOR_ARTIFACT_BASE", base_dir);

    let container_id = "cleanup-test";
    let artifact_dir = Container::gvisor_artifact_dir(container_id);
    std::fs::create_dir_all(&artifact_dir).unwrap();
    std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();

    Container::cleanup_gvisor_artifacts(container_id).unwrap();
    assert!(!artifact_dir.exists());
}
2144
/// Without the explicit override, the artifact directory is
/// `$XDG_RUNTIME_DIR/nucleus-gvisor/<id>`.
#[test]
fn test_gvisor_artifact_base_prefers_xdg_runtime_dir() {
    let _env_lock = EnvLock::acquire();
    let temp = tempfile::TempDir::new().unwrap();
    let _artifact_override = EnvVarGuard::remove("NUCLEUS_GVISOR_ARTIFACT_BASE");
    let _runtime = EnvVarGuard::set("XDG_RUNTIME_DIR", temp.path());

    let expected = temp.path().join("nucleus-gvisor").join("xdg-test");
    let actual = Container::gvisor_artifact_dir("xdg-test");
    assert_eq!(actual, expected);
}
2157
/// Source-level check: the first 2500 bytes from `fn health_check_loop` in
/// `health.rs` mention an `AtomicBool` cancel flag and either
/// `cancellable_sleep` or `cancel.load`.
///
/// The window is clamped to the file length and backed off to a character
/// boundary, so a short file or a multi-byte character at the cut cannot
/// panic the slice.
#[test]
fn test_health_check_loop_supports_cancellation() {
    let source = include_str!("health.rs");
    let fn_start = source.find("fn health_check_loop").unwrap();
    let mut window_end = (fn_start + 2500).min(source.len());
    while !source.is_char_boundary(window_end) {
        window_end -= 1;
    }
    let fn_body = &source[fn_start..window_end];
    assert!(
        fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
        "health_check_loop must accept an AtomicBool cancellation flag"
    );
    assert!(
        fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
        "health_check_loop must check cancellation during sleep intervals"
    );
}
2176
/// Source-level check: neither the readiness probe nor the health-check loop
/// in `health.rs` builds a `Command` from `nsenter_bin`.
///
/// Each inspection window (2500 and 2200 bytes) is clamped to the file
/// length and backed off to a character boundary so the slices cannot panic.
#[test]
fn test_runtime_probes_do_not_spawn_host_nsenter() {
    let source = include_str!("health.rs");

    let readiness_start = source.find("fn run_readiness_probe").unwrap();
    let mut readiness_end = (readiness_start + 2500).min(source.len());
    while !source.is_char_boundary(readiness_end) {
        readiness_end -= 1;
    }
    let readiness_body = &source[readiness_start..readiness_end];
    assert!(
        !readiness_body.contains("Command::new(&nsenter_bin)"),
        "readiness probes must not execute via host nsenter"
    );

    let health_start = source.find("fn health_check_loop").unwrap();
    let mut health_end = (health_start + 2200).min(source.len());
    while !source.is_char_boundary(health_end) {
        health_end -= 1;
    }
    let health_body = &source[health_start..health_end];
    assert!(
        !health_body.contains("Command::new(&nsenter_bin)"),
        "health checks must not execute via host nsenter"
    );
}
2196
/// Source-level check: the first 600 bytes from `fn prepare_oci_mountpoints`
/// in `gvisor_setup.rs` contain no `.expect(` call.
///
/// The window is clamped to the file length and backed off to a character
/// boundary so the slice cannot panic.
#[test]
fn test_oci_mount_strip_prefix_no_expect() {
    let source = include_str!("gvisor_setup.rs");
    let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
    let mut window_end = (fn_start + 600).min(source.len());
    while !source.is_char_boundary(window_end) {
        window_end -= 1;
    }
    let fn_body = &source[fn_start..window_end];
    assert!(
        !fn_body.contains(".expect("),
        "prepare_oci_mountpoints must not use expect() – return Err instead"
    );
}
2209
/// Source-level check: the first 500 bytes from the definition of
/// `notify_namespace_ready` track how much of the payload has been written.
///
/// The former `contains("4")` alternative is dropped: any digit 4 in the
/// window satisfied it, so the assertion could not fail in practice. The
/// window is also clamped to the file length and backed off to a character
/// boundary so the slice cannot panic.
#[test]
fn test_notify_namespace_ready_validates_write_length() {
    let source = include_str!("runtime.rs");
    let fn_start = source.find("fn notify_namespace_ready").unwrap();
    let mut window_end = (fn_start + 500).min(source.len());
    while !source.is_char_boundary(window_end) {
        window_end -= 1;
    }
    let fn_body = &source[fn_start..window_end];
    assert!(
        fn_body.contains("written") || fn_body.contains("payload.len()"),
        "notify_namespace_ready must validate complete write of all 4 bytes"
    );
}
2224
/// Source-level check: the 2000 bytes following the `12b. RLIMIT backstop`
/// marker in `runtime.rs` mention `is_production` and `return Err`.
///
/// The window is clamped to the file length and backed off to a character
/// boundary so the slice cannot panic.
#[test]
fn test_rlimit_failures_fatal_in_production() {
    let source = include_str!("runtime.rs");
    let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
    let mut window_end = (rlimit_start + 2000).min(source.len());
    while !source.is_char_boundary(window_end) {
        window_end -= 1;
    }
    let rlimit_section = &source[rlimit_start..window_end];
    assert!(
        rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
        "RLIMIT failures must return Err in production mode"
    );
}
2236
/// Source-level check: the 500 bytes following the first `TcpPort(port)` in
/// `health.rs` do not reference `/dev/tcp`.
///
/// The window is clamped to the file length and backed off to a character
/// boundary so the slice cannot panic.
#[test]
fn test_tcp_readiness_probe_uses_portable_check() {
    let source = include_str!("health.rs");
    let probe_fn = source.find("TcpPort(port)").unwrap();
    let mut window_end = (probe_fn + 500).min(source.len());
    while !source.is_char_boundary(window_end) {
        window_end -= 1;
    }
    let probe_body = &source[probe_fn..window_end];
    assert!(
        !probe_body.contains("/dev/tcp"),
        "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
    );
}
2249}