1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4 ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10 switch_root, validate_production_rootfs_path, verify_context_manifest,
11 verify_rootfs_attestation, FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::{NamespaceManager, UserNamespaceMapper};
14use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
18 SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{
25 chown, fork, pipe, read, setresgid, setresuid, write, ForkResult, Gid, Pid, Uid,
26};
27use std::os::fd::OwnedFd;
28use std::os::unix::fs::PermissionsExt;
29use std::path::PathBuf;
30use std::sync::atomic::{AtomicBool, Ordering};
31use std::sync::Arc;
32use std::thread::JoinHandle;
33use tempfile::Builder;
34use tracing::{debug, error, info, info_span, warn};
35
36pub struct Container {
45 pub(super) config: ContainerConfig,
46 pub(super) runsc_path: Option<String>,
49}
50
51pub struct CreatedContainer {
55 pub(super) config: ContainerConfig,
56 pub(super) state_mgr: ContainerStateManager,
57 pub(super) state: ContainerState,
58 pub(super) child: Pid,
59 pub(super) cgroup_opt: Option<Cgroup>,
60 pub(super) network_driver: Option<BridgeDriver>,
61 pub(super) trace_reader: Option<SeccompTraceReader>,
62 pub(super) deny_logger: Option<SeccompDenyLogger>,
63 pub(super) exec_fifo_path: Option<PathBuf>,
64 pub(super) _lifecycle_span: tracing::Span,
65}
66
67impl Container {
68 pub fn new(config: ContainerConfig) -> Self {
69 Self {
70 config,
71 runsc_path: None,
72 }
73 }
74
75 pub fn run(&self) -> Result<i32> {
77 self.create_internal(false)?.start()
78 }
79
80 pub fn create(&self) -> Result<CreatedContainer> {
84 self.create_internal(true)
85 }
86
87 fn sanitize_fds() {
92 const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
95 let ret =
97 unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
98 if ret == 0 {
99 return;
100 }
101 if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
105 let fds: Vec<i32> = entries
106 .flatten()
107 .filter_map(|entry| entry.file_name().into_string().ok())
108 .filter_map(|s| s.parse::<i32>().ok())
109 .filter(|&fd| fd > 2)
110 .collect();
111 for fd in fds {
112 unsafe { libc::close(fd) };
113 }
114 }
115 }
116
117 pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
118 let thread_count = std::fs::read_to_string("/proc/self/status")
119 .ok()
120 .and_then(|s| {
121 s.lines()
122 .find(|line| line.starts_with("Threads:"))
123 .and_then(|line| line.split_whitespace().nth(1))
124 .and_then(|count| count.parse::<u32>().ok())
125 });
126
127 if thread_count == Some(1) {
128 return Ok(());
129 }
130
131 Err(NucleusError::ExecError(format!(
132 "{} requires a single-threaded process before fork, found {:?} threads",
133 context, thread_count
134 )))
135 }
136
137 fn prepare_runtime_base_override(
138 config: &ContainerConfig,
139 host_is_root: bool,
140 needs_external_userns_mapping: bool,
141 ) -> Result<Option<PathBuf>> {
142 if !needs_external_userns_mapping {
143 return Ok(None);
144 }
145
146 if !host_is_root {
147 return Ok(Some(
148 dirs::runtime_dir()
149 .map(|d| d.join("nucleus"))
150 .unwrap_or_else(std::env::temp_dir),
151 ));
152 }
153
154 let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
155 NucleusError::ExecError("Missing user namespace configuration".to_string())
156 })?;
157 let host_uid =
158 Self::mapped_host_id_for_container_id(&user_config.uid_mappings, 0, "uid mappings")?;
159 let host_gid =
160 Self::mapped_host_id_for_container_id(&user_config.gid_mappings, 0, "gid mappings")?;
161
162 let root = PathBuf::from("/run/nucleus");
163 Self::ensure_runtime_parent_dir(&root)?;
164
165 let runtime_root = root.join("runtime");
166 Self::ensure_runtime_parent_dir(&runtime_root)?;
167
168 let base = runtime_root.join(&config.id);
169 std::fs::create_dir_all(&base).map_err(|e| {
170 NucleusError::FilesystemError(format!(
171 "Failed to create user namespace runtime base {:?}: {}",
172 base, e
173 ))
174 })?;
175 chown(
176 &base,
177 Some(Uid::from_raw(host_uid)),
178 Some(Gid::from_raw(host_gid)),
179 )
180 .map_err(|e| {
181 NucleusError::FilesystemError(format!(
182 "Failed to chown user namespace runtime base {:?} to {}:{}: {}",
183 base, host_uid, host_gid, e
184 ))
185 })?;
186 std::fs::set_permissions(&base, std::fs::Permissions::from_mode(0o700)).map_err(|e| {
187 NucleusError::FilesystemError(format!(
188 "Failed to secure user namespace runtime base {:?}: {}",
189 base, e
190 ))
191 })?;
192
193 Ok(Some(base))
194 }
195
196 fn ensure_runtime_parent_dir(path: &std::path::Path) -> Result<()> {
197 std::fs::create_dir_all(path).map_err(|e| {
198 NucleusError::FilesystemError(format!(
199 "Failed to create runtime parent dir {:?}: {}",
200 path, e
201 ))
202 })?;
203 std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o711)).map_err(|e| {
204 NucleusError::FilesystemError(format!(
205 "Failed to secure runtime parent dir {:?}: {}",
206 path, e
207 ))
208 })?;
209 Ok(())
210 }
211
212 fn mapped_host_id_for_container_id(
213 mappings: &[crate::isolation::IdMapping],
214 container_id: u32,
215 label: &str,
216 ) -> Result<u32> {
217 for mapping in mappings {
218 let end = mapping
219 .container_id
220 .checked_add(mapping.count)
221 .ok_or_else(|| {
222 NucleusError::ConfigError(format!(
223 "{} overflow for container id {}",
224 label, container_id
225 ))
226 })?;
227 if container_id >= mapping.container_id && container_id < end {
228 return mapping
229 .host_id
230 .checked_add(container_id - mapping.container_id)
231 .ok_or_else(|| {
232 NucleusError::ConfigError(format!(
233 "{} host id overflow for container id {}",
234 label, container_id
235 ))
236 });
237 }
238 }
239
240 Err(NucleusError::ConfigError(format!(
241 "{} do not map container id {}",
242 label, container_id
243 )))
244 }
245
246 fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
247 let lifecycle_span = info_span!(
248 "container.lifecycle",
249 container.id = %self.config.id,
250 container.name = %self.config.name,
251 runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
252 );
253 let _enter = lifecycle_span.enter();
254
255 info!(
256 "Creating container: {} (ID: {})",
257 self.config.name, self.config.id
258 );
259 audit(
260 &self.config.id,
261 &self.config.name,
262 AuditEventType::ContainerStart,
263 format!(
264 "command={:?} mode={:?} runtime={}",
265 crate::audit::redact_command(&self.config.command),
266 self.config.service_mode,
267 if self.config.use_gvisor {
268 "gvisor"
269 } else {
270 "native"
271 }
272 ),
273 );
274
275 let is_root = nix::unistd::Uid::effective().is_root();
277 let mut config = self.config.clone();
278
279 if !is_root && config.user_ns_config.is_none() {
280 info!("Not running as root, automatically enabling rootless mode");
281 config.namespaces.user = true;
282 config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
283 }
284
285 if is_root && !config.namespaces.user {
289 if config.service_mode == ServiceMode::Production {
290 info!("Running as root in production mode: enabling user namespace with UID remapping");
291 config.namespaces.user = true;
292 config.user_ns_config =
293 Some(crate::isolation::UserNamespaceConfig::root_remapped());
294 } else {
295 warn!(
296 "Running as root WITHOUT user namespace isolation. \
297 Container processes will run as real host UID 0. \
298 Use --user-ns or production mode for UID remapping."
299 );
300 }
301 }
302
303 if let Some(ref socket_path) = config.console_socket {
305 warn!(
306 "Console socket {} accepted but terminal forwarding is not yet implemented",
307 socket_path.display()
308 );
309 }
310
311 config.validate_production_mode()?;
313 if config.service_mode == ServiceMode::Production {
314 let rootfs_path = config.rootfs_path.as_ref().ok_or_else(|| {
315 NucleusError::ConfigError(
316 "Production mode requires explicit --rootfs path (no host bind mounts)"
317 .to_string(),
318 )
319 })?;
320 config.rootfs_path = Some(validate_production_rootfs_path(rootfs_path)?);
321 }
322 Self::assert_kernel_lockdown(&config)?;
323
324 Self::apply_network_mode_guards(&mut config, is_root)?;
325 Self::apply_trust_level_guards(&mut config)?;
326 config.validate_runtime_support()?;
327
328 if let NetworkMode::Bridge(ref bridge_config) = config.network {
329 let backend =
330 bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
331 if backend == NatBackend::Kernel && !is_root {
332 return Err(NucleusError::NetworkError(
333 "Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
334 .to_string(),
335 ));
336 }
337 }
338
339 let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;
341
342 if let Ok(all_states) = state_mgr.list_states() {
344 if all_states.iter().any(|s| s.name == config.name) {
345 return Err(NucleusError::ConfigError(format!(
346 "A container named '{}' already exists; use a different --name, \
347 or remove the stale state with 'nucleus delete'",
348 config.name
349 )));
350 }
351 }
352
353 let exec_fifo = if defer_exec_until_start {
357 let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
358 nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
359 NucleusError::ExecError(format!(
360 "Failed to create exec FIFO {:?}: {}",
361 exec_fifo, e
362 ))
363 })?;
364 Some(exec_fifo)
365 } else {
366 None
367 };
368
369 let cgroup_name = format!("nucleus-{}", config.id);
371 let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
372 Ok(mut cgroup) => {
373 match cgroup.set_limits(&config.limits) {
375 Ok(_) => {
376 info!("Created cgroup with resource limits");
377 Some(cgroup)
378 }
379 Err(e) => {
380 if config.service_mode == ServiceMode::Production {
381 let _ = cgroup.cleanup();
382 return Err(NucleusError::CgroupError(format!(
383 "Production mode requires cgroup resource enforcement, but \
384 applying limits failed: {}",
385 e
386 )));
387 }
388 warn!("Failed to set cgroup limits: {}", e);
389 let _ = cgroup.cleanup();
390 None
391 }
392 }
393 }
394 Err(e) => {
395 if config.service_mode == ServiceMode::Production {
396 return Err(NucleusError::CgroupError(format!(
397 "Production mode requires cgroup resource enforcement, but \
398 cgroup creation failed: {}",
399 e
400 )));
401 }
402
403 if config.user_ns_config.is_some() {
404 if config.limits.memory_bytes.is_some()
405 || config.limits.cpu_quota_us.is_some()
406 || config.limits.pids_max.is_some()
407 {
408 warn!(
409 "Running in rootless mode: requested resource limits cannot be \
410 enforced – cgroup creation requires root ({})",
411 e
412 );
413 } else {
414 debug!("Running in rootless mode without cgroup resource limits");
415 }
416 } else {
417 warn!(
418 "Failed to create cgroup (running without resource limits): {}",
419 e
420 );
421 }
422 None
423 }
424 };
425
426 let runsc_path = if config.use_gvisor {
428 Some(GVisorRuntime::resolve_path().map_err(|e| {
429 NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
430 })?)
431 } else {
432 None
433 };
434 let gvisor_bridge_needs_userns_mapping = config.use_gvisor
435 && !is_root
436 && config.user_ns_config.is_some()
437 && matches!(config.network, NetworkMode::Bridge(_));
438 let needs_external_userns_mapping = config.user_ns_config.is_some()
439 && (!config.use_gvisor || gvisor_bridge_needs_userns_mapping);
440 let runtime_base_override =
441 Self::prepare_runtime_base_override(&config, is_root, needs_external_userns_mapping)?;
442
443 let (ready_read, ready_write) = pipe().map_err(|e| {
445 NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
446 })?;
447 let userns_sync = if needs_external_userns_mapping {
448 let (request_read, request_write) = pipe().map_err(|e| {
449 NucleusError::ExecError(format!(
450 "Failed to create user namespace request pipe: {}",
451 e
452 ))
453 })?;
454 let (ack_read, ack_write) = pipe().map_err(|e| {
455 NucleusError::ExecError(format!("Failed to create user namespace ack pipe: {}", e))
456 })?;
457 Some((request_read, request_write, ack_read, ack_write))
458 } else {
459 None
460 };
461 let (parent_setup_read, parent_setup_write) = pipe().map_err(|e| {
462 NucleusError::ExecError(format!("Failed to create parent setup sync pipe: {}", e))
463 })?;
464
465 Self::assert_single_threaded_for_fork("container create fork")?;
470 match unsafe { fork() }? {
473 ForkResult::Parent { child } => {
474 drop(ready_write);
475 drop(parent_setup_read);
476 let (userns_request_read, userns_ack_write) =
477 if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
478 drop(request_write);
479 drop(ack_read);
480 (Some(request_read), Some(ack_write))
481 } else {
482 (None, None)
483 };
484 info!("Forked child process: {}", child);
485
486 let mut target_pid_for_cleanup: Option<u32> = None;
491 let parent_setup = || -> Result<CreatedContainer> {
492 if needs_external_userns_mapping {
493 let user_config = config.user_ns_config.as_ref().ok_or_else(|| {
494 NucleusError::ExecError(
495 "Missing user namespace configuration in parent".to_string(),
496 )
497 })?;
498 let request_read = userns_request_read.as_ref().ok_or_else(|| {
499 NucleusError::ExecError(
500 "Missing user namespace request pipe in parent".to_string(),
501 )
502 })?;
503 let ack_write = userns_ack_write.as_ref().ok_or_else(|| {
504 NucleusError::ExecError(
505 "Missing user namespace ack pipe in parent".to_string(),
506 )
507 })?;
508
509 Self::wait_for_sync_byte(
510 request_read,
511 &format!(
512 "Child {} exited before requesting user namespace mappings",
513 child
514 ),
515 "Failed waiting for child user namespace request",
516 )?;
517 UserNamespaceMapper::new(user_config.clone())
518 .write_mappings_for_pid(child.as_raw() as u32)?;
519 Self::send_sync_byte(
520 ack_write,
521 "Failed to notify child that user namespace mappings are ready",
522 )?;
523 }
524
525 let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
526 target_pid_for_cleanup = Some(target_pid);
527
528 let cgroup_path = cgroup_opt
529 .as_ref()
530 .map(|cgroup| cgroup.path().display().to_string());
531 let cpu_millicores = config
532 .limits
533 .cpu_quota_us
534 .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
535 let mut state = ContainerState::new(ContainerStateParams {
536 id: config.id.clone(),
537 name: config.name.clone(),
538 pid: target_pid,
539 command: config.command.clone(),
540 memory_limit: config.limits.memory_bytes,
541 cpu_limit: cpu_millicores,
542 using_gvisor: config.use_gvisor,
543 rootless: config.user_ns_config.is_some(),
544 cgroup_path,
545 process_uid: config.process_identity.uid,
546 process_gid: config.process_identity.gid,
547 additional_gids: config.process_identity.additional_gids.clone(),
548 });
549 state.config_hash = config.config_hash;
550 state.bundle_path =
551 config.rootfs_path.as_ref().map(|p| p.display().to_string());
552
553 let mut network_driver: Option<BridgeDriver> = None;
554 let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
555
556 state.status = OciStatus::Created;
558 state_mgr.save_state(&state)?;
559
560 if let Some(ref pid_path) = config.pid_file {
562 std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
563 NucleusError::ConfigError(format!(
564 "Failed to write pid-file '{}': {}",
565 pid_path.display(),
566 e
567 ))
568 })?;
569 info!("Wrote PID {} to {}", target_pid, pid_path.display());
570 }
571
572 if let Some(ref mut cgroup) = cgroup_opt {
573 cgroup.attach_process(target_pid)?;
574 }
575
576 let deny_logger = Self::maybe_start_seccomp_deny_logger(
577 &config,
578 target_pid,
579 cgroup_opt.as_ref().map(|cgroup| cgroup.path()),
580 )?;
581
582 if let NetworkMode::Bridge(ref bridge_config) = config.network {
583 match BridgeDriver::setup_with_id(
584 target_pid,
585 bridge_config,
586 &config.id,
587 is_root,
588 config.user_ns_config.is_some(),
589 ) {
590 Ok(net) => {
591 if let Some(ref egress) = config.egress_policy {
592 if let Err(e) = net.apply_egress_policy(
593 target_pid,
594 egress,
595 config.user_ns_config.is_some(),
596 ) {
597 if config.service_mode == ServiceMode::Production {
598 return Err(NucleusError::NetworkError(format!(
599 "Failed to apply egress policy: {}",
600 e
601 )));
602 }
603 warn!("Failed to apply egress policy: {}", e);
604 }
605 }
606 network_driver = Some(net);
607 }
608 Err(e) => {
609 if config.service_mode == ServiceMode::Production {
610 return Err(e);
611 }
612 warn!("Failed to set up bridge networking: {}", e);
613 }
614 }
615 }
616
617 Self::send_sync_byte(
618 &parent_setup_write,
619 "Failed to notify child that parent setup is complete",
620 )?;
621
622 info!(
623 "Container {} created (child pid {}), waiting for start",
624 config.id, target_pid
625 );
626
627 Ok(CreatedContainer {
628 config,
629 state_mgr,
630 state,
631 child,
632 cgroup_opt,
633 network_driver,
634 trace_reader,
635 deny_logger,
636 exec_fifo_path: exec_fifo,
637 _lifecycle_span: lifecycle_span.clone(),
638 })
639 };
640
641 parent_setup().map_err(|e| {
642 if let Some(target_pid) = target_pid_for_cleanup {
643 let _ = kill(Pid::from_raw(target_pid as i32), Signal::SIGKILL);
644 }
645 let _ = kill(child, Signal::SIGKILL);
646 let _ = waitpid(child, None);
647 e
648 })
649 }
650 ForkResult::Child => {
651 drop(ready_read);
652 drop(parent_setup_write);
653 let (userns_request_write, userns_ack_read) =
654 if let Some((request_read, request_write, ack_read, ack_write)) = userns_sync {
655 drop(request_read);
656 drop(ack_write);
657 (Some(request_write), Some(ack_read))
658 } else {
659 (None, None)
660 };
661 Self::sanitize_fds();
663 let temp_container = Container { config, runsc_path };
664 match temp_container.setup_and_exec(
665 Some(ready_write),
666 userns_request_write,
667 userns_ack_read,
668 Some(parent_setup_read),
669 exec_fifo,
670 runtime_base_override,
671 ) {
672 Ok(_) => unreachable!(),
673 Err(e) => {
674 error!("Container setup failed: {}", e);
675 std::process::exit(1);
676 }
677 }
678 }
679 }
680 }
681
682 pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
685 let state_mgr = ContainerStateManager::new_with_root(state_root)?;
686 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
687 if !fifo_path.exists() {
688 return Err(NucleusError::ConfigError(format!(
689 "No exec FIFO found for container {}; is it in 'created' state?",
690 container_id
691 )));
692 }
693
694 let file = std::fs::File::open(&fifo_path)
696 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
697 let mut buf = [0u8; 1];
698 std::io::Read::read(&mut &file, &mut buf)
699 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
700 drop(file);
701
702 let _ = std::fs::remove_file(&fifo_path);
703
704 let mut state = state_mgr.resolve_container(container_id)?;
706 state.status = OciStatus::Running;
707 state_mgr.save_state(&state)?;
708
709 Ok(())
710 }
711
712 fn setup_and_exec(
717 &self,
718 ready_pipe: Option<OwnedFd>,
719 userns_request_pipe: Option<OwnedFd>,
720 userns_ack_pipe: Option<OwnedFd>,
721 parent_setup_pipe: Option<OwnedFd>,
722 exec_fifo: Option<PathBuf>,
723 runtime_base_override: Option<PathBuf>,
724 ) -> Result<()> {
725 let is_rootless = self.config.user_ns_config.is_some();
726 let allow_degraded_security = Self::allow_degraded_security(&self.config);
727 let context_manifest = if self.config.verify_context_integrity {
728 self.config
729 .context_dir
730 .as_ref()
731 .map(|dir| snapshot_context_dir(dir))
732 .transpose()?
733 } else {
734 None
735 };
736
737 let mut fs_state = FilesystemState::Unmounted;
739 let mut sec_state = SecurityState::Privileged;
740
741 if self.config.use_gvisor {
745 let gvisor_bridge_precreated_userns =
746 if matches!(self.config.network, NetworkMode::Bridge(_)) {
747 self.prepare_gvisor_bridge_namespace(
748 userns_request_pipe.as_ref(),
749 userns_ack_pipe.as_ref(),
750 )?
751 } else {
752 false
753 };
754
755 if let Some(fd) = ready_pipe {
756 Self::notify_namespace_ready(&fd, std::process::id())?;
757 }
758 if let Some(fd) = parent_setup_pipe.as_ref() {
759 Self::wait_for_sync_byte(
760 fd,
761 "Parent closed setup pipe before signalling gVisor child",
762 "Failed waiting for parent setup acknowledgement",
763 )?;
764 }
765 return self.setup_and_exec_gvisor(gvisor_bridge_precreated_userns);
766 }
767
768 let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
770 namespace_mgr.unshare_namespaces()?;
771 if self.config.user_ns_config.is_some() {
772 let request_fd = userns_request_pipe.as_ref().ok_or_else(|| {
773 NucleusError::ExecError(
774 "Missing user namespace request pipe in container child".to_string(),
775 )
776 })?;
777 let ack_fd = userns_ack_pipe.as_ref().ok_or_else(|| {
778 NucleusError::ExecError(
779 "Missing user namespace acknowledgement pipe in container child".to_string(),
780 )
781 })?;
782
783 Self::send_sync_byte(
784 request_fd,
785 "Failed to request user namespace mappings from parent",
786 )?;
787 Self::wait_for_sync_byte(
788 ack_fd,
789 "Parent closed user namespace ack pipe before mappings were written",
790 "Failed waiting for parent to finish user namespace mappings",
791 )?;
792 Self::become_userns_root_for_setup()?;
793 }
794
795 if self.config.namespaces.pid {
798 Self::assert_single_threaded_for_fork("PID namespace init fork")?;
799 match unsafe { fork() }? {
800 ForkResult::Parent { child } => {
801 if let Some(fd) = ready_pipe {
802 Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
803 }
804 std::process::exit(Self::wait_for_pid_namespace_child(child));
805 }
806 ForkResult::Child => {
807 if let Some(fd) = parent_setup_pipe.as_ref() {
808 Self::wait_for_sync_byte(
809 fd,
810 "Parent closed setup pipe before signalling PID 1 child",
811 "Failed waiting for parent setup acknowledgement",
812 )?;
813 }
814 }
816 }
817 } else {
818 if let Some(fd) = ready_pipe {
819 Self::notify_namespace_ready(&fd, std::process::id())?;
820 }
821 if let Some(fd) = parent_setup_pipe.as_ref() {
822 Self::wait_for_sync_byte(
823 fd,
824 "Parent closed setup pipe before signalling container child",
825 "Failed waiting for parent setup acknowledgement",
826 )?;
827 }
828 }
829
830 namespace_mgr.enter()?;
832
833 self.enforce_no_new_privs()?;
837 audit(
838 &self.config.id,
839 &self.config.name,
840 AuditEventType::NoNewPrivsSet,
841 "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
842 );
843
844 if let Some(hostname) = &self.config.hostname {
846 namespace_mgr.set_hostname(hostname)?;
847 }
848
849 let runtime_base = if let Some(path) = runtime_base_override {
854 path
855 } else if nix::unistd::Uid::effective().is_root() {
856 PathBuf::from("/run/nucleus")
857 } else {
858 dirs::runtime_dir()
859 .map(|d| d.join("nucleus"))
860 .unwrap_or_else(std::env::temp_dir)
861 };
862 let _ = std::fs::create_dir_all(&runtime_base);
863 let runtime_dir = Builder::new()
864 .prefix("nucleus-runtime-")
865 .tempdir_in(&runtime_base)
866 .map_err(|e| {
867 NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
868 })?;
869 let container_root = runtime_dir.path().to_path_buf();
870 let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); tmpfs.mount()?;
872 fs_state = fs_state.transition(FilesystemState::Mounted)?;
873
874 create_minimal_fs(&container_root)?;
876
877 let dev_path = container_root.join("dev");
879 create_dev_nodes(&dev_path, false)?;
880
881 let shm_path = dev_path.join("shm");
884 std::fs::create_dir_all(&shm_path).map_err(|e| {
885 NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
886 })?;
887 nix::mount::mount(
888 Some("shm"),
889 &shm_path,
890 Some("tmpfs"),
891 nix::mount::MsFlags::MS_NOSUID
892 | nix::mount::MsFlags::MS_NODEV
893 | nix::mount::MsFlags::MS_NOEXEC,
894 Some("mode=1777,size=64m"),
895 )
896 .map_err(|e| {
897 NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
898 })?;
899 debug!("Mounted tmpfs on /dev/shm");
900
901 if let Some(context_dir) = &self.config.context_dir {
904 let context_dest = container_root.join("context");
905 LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
906 if let Some(expected) = &context_manifest {
907 verify_context_manifest(expected, &context_dest)?;
908 }
909 }
910 fs_state = fs_state.transition(FilesystemState::Populated)?;
911
912 if let Some(ref rootfs_path) = self.config.rootfs_path {
914 let rootfs_path = if self.config.service_mode == ServiceMode::Production {
915 validate_production_rootfs_path(rootfs_path)?
916 } else {
917 rootfs_path.clone()
918 };
919 if self.config.verify_rootfs_attestation {
920 verify_rootfs_attestation(&rootfs_path)?;
921 }
922 bind_mount_rootfs(&container_root, &rootfs_path)?;
923 } else {
924 bind_mount_host_paths(&container_root, is_rootless)?;
925 }
926
927 mount_volumes(&container_root, &self.config.volumes)?;
929
930 if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
934 let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
935 == NatBackend::Userspace
936 && bridge_config.dns.is_empty()
937 {
938 vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
939 } else {
940 bridge_config.dns.clone()
941 };
942 if self.config.rootfs_path.is_some() {
943 BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
944 } else {
945 BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
946 }
947 }
948
949 mount_secrets_inmemory(
951 &container_root,
952 &self.config.secrets,
953 &self.config.process_identity,
954 )?;
955
956 let proc_path = container_root.join("proc");
958 let production_mode = self.config.service_mode == ServiceMode::Production;
959 let hide_pids = production_mode;
960 let procfs_best_effort = is_rootless && !production_mode;
961 mount_procfs(
962 &proc_path,
963 procfs_best_effort,
964 self.config.proc_readonly,
965 hide_pids,
966 )?;
967
968 mask_proc_paths(
971 &proc_path,
972 self.config.service_mode == ServiceMode::Production,
973 )?;
974
975 if let Some(ref hooks) = self.config.hooks {
977 if !hooks.create_runtime.is_empty() {
978 let hook_state = OciContainerState {
979 oci_version: "1.0.2".to_string(),
980 id: self.config.id.clone(),
981 status: OciStatus::Creating,
982 pid: std::process::id(),
983 bundle: String::new(),
984 };
985 OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
986 }
987 }
988
989 switch_root(&container_root, self.config.allow_chroot_fallback)?;
992 fs_state = fs_state.transition(FilesystemState::Pivoted)?;
993 debug!("Filesystem state: {:?}", fs_state);
994
995 audit_mounts(self.config.service_mode == ServiceMode::Production)?;
997 audit(
998 &self.config.id,
999 &self.config.name,
1000 AuditEventType::MountAuditPassed,
1001 "all mount flags verified",
1002 );
1003
1004 if let Some(ref hooks) = self.config.hooks {
1006 if !hooks.create_container.is_empty() {
1007 let hook_state = OciContainerState {
1008 oci_version: "1.0.2".to_string(),
1009 id: self.config.id.clone(),
1010 status: OciStatus::Created,
1011 pid: std::process::id(),
1012 bundle: String::new(),
1013 };
1014 OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
1015 }
1016 }
1017
1018 let mut cap_mgr = CapabilityManager::new();
1028 if let Some(ref policy_path) = self.config.caps_policy {
1029 let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
1030 policy_path,
1031 self.config.caps_policy_sha256.as_deref(),
1032 )?;
1033 if self.config.service_mode == ServiceMode::Production {
1035 policy.validate_production()?;
1036 }
1037 policy.apply(&mut cap_mgr)?;
1038 Self::apply_process_identity_to_current_process(
1040 &self.config.process_identity,
1041 self.config.user_ns_config.is_some(),
1042 )?;
1043 audit(
1044 &self.config.id,
1045 &self.config.name,
1046 AuditEventType::CapabilitiesDropped,
1047 format!("capability policy applied from {:?}", policy_path),
1048 );
1049 } else {
1050 cap_mgr.drop_bounding_set()?;
1052
1053 Self::apply_process_identity_to_current_process(
1057 &self.config.process_identity,
1058 self.config.user_ns_config.is_some(),
1059 )?;
1060
1061 cap_mgr.finalize_drop()?;
1064
1065 audit(
1066 &self.config.id,
1067 &self.config.name,
1068 AuditEventType::CapabilitiesDropped,
1069 "all capabilities dropped including bounding set",
1070 );
1071 }
1072 sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
1073
1074 {
1079 let is_production = self.config.service_mode == ServiceMode::Production;
1080
1081 if let Some(nproc_limit) = self.config.limits.pids_max {
1082 let rlim_nproc = libc::rlimit {
1083 rlim_cur: nproc_limit,
1084 rlim_max: nproc_limit,
1085 };
1086 if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
1088 let err = std::io::Error::last_os_error();
1089 if is_production {
1090 return Err(NucleusError::SeccompError(format!(
1091 "Failed to set RLIMIT_NPROC to {} in production mode: {}",
1092 nproc_limit, err
1093 )));
1094 }
1095 warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
1096 }
1097 }
1098
1099 let rlim_nofile = libc::rlimit {
1100 rlim_cur: 1024,
1101 rlim_max: 1024,
1102 };
1103 if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
1105 let err = std::io::Error::last_os_error();
1106 if is_production {
1107 return Err(NucleusError::SeccompError(format!(
1108 "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
1109 err
1110 )));
1111 }
1112 warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
1113 }
1114
1115 let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
1120 let rlim_memlock = libc::rlimit {
1121 rlim_cur: memlock_limit,
1122 rlim_max: memlock_limit,
1123 };
1124 if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
1126 let err = std::io::Error::last_os_error();
1127 if is_production {
1128 return Err(NucleusError::SeccompError(format!(
1129 "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
1130 memlock_limit, err
1131 )));
1132 }
1133 warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
1134 }
1135 }
1136
1137 CapabilityManager::verify_no_namespace_caps(
1141 self.config.service_mode == ServiceMode::Production,
1142 )?;
1143
1144 use crate::container::config::SeccompMode;
1147 let mut seccomp_mgr = SeccompManager::new();
1148 let allow_network = !matches!(self.config.network, NetworkMode::None);
1149 let seccomp_applied = match self.config.seccomp_mode {
1150 SeccompMode::Trace => {
1151 audit(
1152 &self.config.id,
1153 &self.config.name,
1154 AuditEventType::SeccompApplied,
1155 "seccomp trace mode: allow-all + LOG",
1156 );
1157 seccomp_mgr.apply_trace_filter()?
1158 }
1159 SeccompMode::Enforce => {
1160 if let Some(ref profile_path) = self.config.seccomp_profile {
1161 audit(
1162 &self.config.id,
1163 &self.config.name,
1164 AuditEventType::SeccompProfileLoaded,
1165 format!("path={:?}", profile_path),
1166 );
1167 seccomp_mgr.apply_profile_from_file(
1168 profile_path,
1169 self.config.seccomp_profile_sha256.as_deref(),
1170 self.config.seccomp_log_denied,
1171 )?
1172 } else {
1173 seccomp_mgr.apply_filter_for_network_mode(
1174 allow_network,
1175 allow_degraded_security,
1176 self.config.seccomp_log_denied,
1177 &self.config.seccomp_allow_syscalls,
1178 )?
1179 }
1180 }
1181 };
1182 if seccomp_applied {
1183 sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
1184 audit(
1185 &self.config.id,
1186 &self.config.name,
1187 AuditEventType::SeccompApplied,
1188 format!("network={}", allow_network),
1189 );
1190 } else if !allow_degraded_security {
1191 return Err(NucleusError::SeccompError(
1192 "Seccomp filter is required but was not enforced".to_string(),
1193 ));
1194 } else {
1195 warn!("Seccomp not enforced; container is running with degraded hardening");
1196 }
1197
1198 let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
1200 let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
1201 policy_path,
1202 self.config.landlock_policy_sha256.as_deref(),
1203 )?;
1204 if self.config.service_mode == ServiceMode::Production {
1206 policy.validate_production()?;
1207 }
1208 policy.apply(allow_degraded_security)?
1209 } else {
1210 let mut landlock_mgr = LandlockManager::new();
1211 landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
1212 for vol in &self.config.volumes {
1214 landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
1215 }
1216 landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
1217 };
1218 if seccomp_applied && landlock_applied {
1219 sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
1220 if self.config.seccomp_mode == SeccompMode::Trace {
1221 warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
1222 } else {
1223 sec_state = sec_state.transition(SecurityState::Locked)?;
1224 }
1225 audit(
1226 &self.config.id,
1227 &self.config.name,
1228 AuditEventType::LandlockApplied,
1229 if self.config.seccomp_mode == SeccompMode::Trace {
1230 "landlock applied, but seccomp in trace mode – not locked".to_string()
1231 } else {
1232 "security state locked: all hardening layers active".to_string()
1233 },
1234 );
1235 } else if !allow_degraded_security {
1236 return Err(NucleusError::LandlockError(
1237 "Landlock policy is required but was not enforced".to_string(),
1238 ));
1239 } else {
1240 warn!("Security state not locked; one or more hardening controls are inactive");
1241 }
1242 debug!("Security state: {:?}", sec_state);
1243
1244 if let Some(ref fifo_path) = exec_fifo {
1248 debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
1249 let file = std::fs::OpenOptions::new()
1250 .write(true)
1251 .open(fifo_path)
1252 .map_err(|e| {
1253 NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
1254 })?;
1255 std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
1256 NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
1257 })?;
1258 drop(file);
1259 debug!("Exec FIFO released, proceeding to exec");
1260 }
1261
1262 if let Some(ref hooks) = self.config.hooks {
1264 if !hooks.start_container.is_empty() {
1265 let hook_state = OciContainerState {
1266 oci_version: "1.0.2".to_string(),
1267 id: self.config.id.clone(),
1268 status: OciStatus::Running,
1269 pid: std::process::id(),
1270 bundle: String::new(),
1271 };
1272 OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
1273 }
1274 }
1275
1276 if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
1279 return self.run_as_init();
1280 }
1281
1282 self.exec_command()?;
1284
1285 Ok(())
1287 }
1288
1289 pub(super) fn setup_signal_forwarding_static(
1294 child: Pid,
1295 ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
1296 let mut set = SigSet::empty();
1297 for signal in [
1298 Signal::SIGTERM,
1299 Signal::SIGINT,
1300 Signal::SIGHUP,
1301 Signal::SIGQUIT,
1302 Signal::SIGUSR1,
1303 Signal::SIGUSR2,
1304 ] {
1305 set.add(signal);
1306 }
1307
1308 let unblock_set = set;
1309 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
1310 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
1311 })?;
1312
1313 let stop = Arc::new(AtomicBool::new(false));
1314 let stop_clone = stop.clone();
1315 let handle = std::thread::Builder::new()
1316 .name("sig-forward".to_string())
1317 .spawn(move || {
1318 loop {
1320 if let Ok(signal) = unblock_set.wait() {
1321 if stop_clone.load(Ordering::Acquire) {
1325 break;
1326 }
1327 let _ = kill(child, signal);
1328 }
1329 }
1330 })
1331 .map_err(|e| {
1332 let mut restore = SigSet::empty();
1335 for signal in [
1336 Signal::SIGTERM,
1337 Signal::SIGINT,
1338 Signal::SIGHUP,
1339 Signal::SIGQUIT,
1340 Signal::SIGUSR1,
1341 Signal::SIGUSR2,
1342 ] {
1343 restore.add(signal);
1344 }
1345 let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
1346 NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
1347 })?;
1348
1349 info!("Signal forwarding configured");
1350 Ok((stop, handle))
1351 }
1352
1353 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
1355 loop {
1356 match waitpid(child, None) {
1357 Ok(WaitStatus::Exited(_, code)) => {
1358 return Ok(code);
1359 }
1360 Ok(WaitStatus::Signaled(_, signal, _)) => {
1361 info!("Child killed by signal: {:?}", signal);
1362 return Ok(128 + signal as i32);
1363 }
1364 Err(nix::errno::Errno::EINTR) => {
1365 continue;
1366 }
1367 Err(e) => {
1368 return Err(NucleusError::ExecError(format!(
1369 "Failed to wait for child: {}",
1370 e
1371 )));
1372 }
1373 _ => {
1374 continue;
1375 }
1376 }
1377 }
1378 }
1379
1380 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1381 let mut pid_buf = [0u8; 4];
1382 loop {
1383 match read(ready_read, &mut pid_buf) {
1384 Err(nix::errno::Errno::EINTR) => continue,
1385 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1386 Ok(0) => {
1387 return Err(NucleusError::ExecError(format!(
1388 "Child {} exited before namespace initialization",
1389 child
1390 )))
1391 }
1392 Ok(_) => {
1393 return Err(NucleusError::ExecError(
1394 "Invalid namespace sync payload from child".to_string(),
1395 ))
1396 }
1397 Err(e) => {
1398 return Err(NucleusError::ExecError(format!(
1399 "Failed waiting for child namespace setup: {}",
1400 e
1401 )))
1402 }
1403 }
1404 }
1405 }
1406
1407 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1408 let payload = pid.to_ne_bytes();
1409 let mut written = 0;
1410 while written < payload.len() {
1411 let n = write(fd, &payload[written..]).map_err(|e| {
1412 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1413 })?;
1414 if n == 0 {
1415 return Err(NucleusError::ExecError(
1416 "Failed to notify namespace readiness: short write".to_string(),
1417 ));
1418 }
1419 written += n;
1420 }
1421 Ok(())
1422 }
1423
1424 fn send_sync_byte(fd: &OwnedFd, error_context: &str) -> Result<()> {
1425 let mut written = 0;
1426 let payload = [1u8];
1427 while written < payload.len() {
1428 let n = write(fd, &payload[written..])
1429 .map_err(|e| NucleusError::ExecError(format!("{}: {}", error_context, e)))?;
1430 if n == 0 {
1431 return Err(NucleusError::ExecError(format!(
1432 "{}: short write",
1433 error_context
1434 )));
1435 }
1436 written += n;
1437 }
1438 Ok(())
1439 }
1440
1441 fn wait_for_sync_byte(fd: &OwnedFd, eof_context: &str, error_context: &str) -> Result<()> {
1442 let mut payload = [0u8; 1];
1443 loop {
1444 match read(fd, &mut payload) {
1445 Err(nix::errno::Errno::EINTR) => continue,
1446 Ok(1) => return Ok(()),
1447 Ok(0) => return Err(NucleusError::ExecError(eof_context.to_string())),
1448 Ok(_) => {
1449 return Err(NucleusError::ExecError(format!(
1450 "{}: invalid sync payload",
1451 error_context
1452 )))
1453 }
1454 Err(e) => return Err(NucleusError::ExecError(format!("{}: {}", error_context, e))),
1455 }
1456 }
1457 }
1458
1459 fn become_userns_root_for_setup() -> Result<()> {
1460 setresgid(Gid::from_raw(0), Gid::from_raw(0), Gid::from_raw(0)).map_err(|e| {
1461 NucleusError::NamespaceError(format!(
1462 "Failed to become gid 0 inside mapped user namespace: {}",
1463 e
1464 ))
1465 })?;
1466 setresuid(Uid::from_raw(0), Uid::from_raw(0), Uid::from_raw(0)).map_err(|e| {
1467 NucleusError::NamespaceError(format!(
1468 "Failed to become uid 0 inside mapped user namespace: {}",
1469 e
1470 ))
1471 })?;
1472 debug!("Switched setup process to uid/gid 0 inside mapped user namespace");
1473 Ok(())
1474 }
1475
1476 fn prepare_gvisor_bridge_namespace(
1477 &self,
1478 userns_request_pipe: Option<&OwnedFd>,
1479 userns_ack_pipe: Option<&OwnedFd>,
1480 ) -> Result<bool> {
1481 let mut precreated_userns = false;
1482 if self.config.user_ns_config.is_some() && !Uid::effective().is_root() {
1483 nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWUSER).map_err(|e| {
1484 NucleusError::NamespaceError(format!(
1485 "Failed to unshare gVisor bridge user namespace: {}",
1486 e
1487 ))
1488 })?;
1489
1490 let request_fd = userns_request_pipe.ok_or_else(|| {
1491 NucleusError::ExecError(
1492 "Missing user namespace request pipe in gVisor bridge child".to_string(),
1493 )
1494 })?;
1495 let ack_fd = userns_ack_pipe.ok_or_else(|| {
1496 NucleusError::ExecError(
1497 "Missing user namespace acknowledgement pipe in gVisor bridge child"
1498 .to_string(),
1499 )
1500 })?;
1501
1502 Self::send_sync_byte(
1503 request_fd,
1504 "Failed to request gVisor bridge user namespace mappings from parent",
1505 )?;
1506 Self::wait_for_sync_byte(
1507 ack_fd,
1508 "Parent closed user namespace ack pipe before gVisor bridge mappings were written",
1509 "Failed waiting for parent to finish gVisor bridge user namespace mappings",
1510 )?;
1511 Self::become_userns_root_for_setup()?;
1512 precreated_userns = true;
1513 }
1514
1515 nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNET).map_err(|e| {
1516 NucleusError::NamespaceError(format!(
1517 "Failed to unshare gVisor bridge network namespace: {}",
1518 e
1519 ))
1520 })?;
1521 Ok(precreated_userns)
1522 }
1523
1524 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1525 loop {
1526 match waitpid(child, None) {
1527 Ok(WaitStatus::Exited(_, code)) => return code,
1528 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1529 Err(nix::errno::Errno::EINTR) => continue,
1530 Err(_) => return 1,
1531 _ => continue,
1532 }
1533 }
1534 }
1535}
1536
impl CreatedContainer {
    /// Starts the created container, supervises it until the child exits, and tears
    /// down the resources held by this handle.
    ///
    /// Sequence: read the exec FIFO start byte (if a FIFO is configured), persist the
    /// `Running` state, start signal forwarding, run the readiness probe, start the
    /// health-check thread, run `poststart` hooks, wait for the child, then stop the
    /// helper threads, run `poststop` hooks and clean up networking, tracing, the
    /// cgroup, gVisor artifacts and the persisted state.
    ///
    /// Returns the value produced by `Container::wait_for_child_static`.
    ///
    /// # Errors
    /// Propagates failures from the exec FIFO handshake, saving state, signal
    /// forwarding setup, the readiness probe, `poststart` hooks, and waiting for the
    /// child. Cleanup failures after the wait are only logged.
    ///
    /// NOTE(review): every `?` before the wait returns without running the teardown
    /// below (child, network, cgroup, state). Confirm a `Drop` impl or the caller
    /// covers those paths.
    pub fn start(mut self) -> Result<i32> {
        let config = &self.config;
        let _enter = self._lifecycle_span.enter();

        // Exec FIFO handshake: exactly one byte must be read from the FIFO before the
        // container counts as started; the FIFO is removed afterwards (best effort).
        if let Some(exec_fifo_path) = &self.exec_fifo_path {
            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
            })?;
            let mut buf = [0u8; 1];
            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
            })?;
            if read != 1 {
                return Err(NucleusError::ExecError(
                    "Exec FIFO closed before start signal was delivered".to_string(),
                ));
            }
            let _ = std::fs::remove_file(exec_fifo_path);
        }

        // Persist the Running status before supervision begins.
        self.state.status = OciStatus::Running;
        self.state_mgr.save_state(&self.state)?;

        // Signals and probes target the PID recorded in the state; the wait below
        // uses the directly forked child.
        let target_pid = self.state.pid;
        let child = self.child;

        let (sig_stop, sig_handle) =
            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;

        // Guard stops and joins the forwarding thread on every exit path.
        let mut sig_guard = SignalThreadGuard {
            stop: Some(sig_stop),
            handle: Some(sig_handle),
        };

        if let Some(ref probe) = config.readiness_probe {
            // NOTIFY_SOCKET is only consulted when sd_notify is enabled in the config.
            let notify_socket = if config.sd_notify {
                std::env::var("NOTIFY_SOCKET").ok()
            } else {
                None
            };
            Container::run_readiness_probe(
                target_pid,
                &config.name,
                probe,
                config.user_ns_config.is_some(),
                config.use_gvisor,
                &config.process_identity,
                notify_socket.as_deref(),
            )?;
        }

        // The health-check thread is only spawned for a non-empty command; the cancel
        // flag is created either way so the guard below is uniform.
        let cancel_flag = Arc::new(AtomicBool::new(false));
        let health_handle = if let Some(ref hc) = config.health_check {
            if !hc.command.is_empty() {
                let hc = hc.clone();
                let pid = target_pid;
                let container_name = config.name.clone();
                let rootless = config.user_ns_config.is_some();
                let using_gvisor = config.use_gvisor;
                let process_identity = config.process_identity.clone();
                let cancel = cancel_flag.clone();
                Some(std::thread::spawn(move || {
                    Container::health_check_loop(
                        pid,
                        &container_name,
                        rootless,
                        using_gvisor,
                        &hc,
                        &process_identity,
                        &cancel,
                    );
                }))
            } else {
                None
            }
        } else {
            None
        };

        // Guard cancels and joins the health-check thread on every exit path.
        let mut health_guard = HealthThreadGuard {
            cancel: Some(cancel_flag),
            handle: health_handle,
        };

        if let Some(ref hooks) = config.hooks {
            if !hooks.poststart.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Running,
                    pid: target_pid,
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
            }
        }

        // The wait runs inside a closure so a failure is captured in `run_result`
        // instead of returning early; teardown below then runs in both cases.
        let mut child_waited = false;
        let run_result: Result<i32> = (|| {
            let exit_code = Container::wait_for_child_static(child)?;

            // Saving the Stopped status is best effort.
            self.state.status = OciStatus::Stopped;
            let _ = self.state_mgr.save_state(&self.state);

            child_waited = true;
            Ok(exit_code)
        })();

        // Stop the helper threads before the resources they may touch are removed.
        health_guard.stop();
        sig_guard.stop();

        if let Some(ref hooks) = config.hooks {
            if !hooks.poststop.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Stopped,
                    pid: 0,
                    bundle: String::new(),
                };
                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
            }
        }

        if let Some(net) = self.network_driver.take() {
            if let Err(e) = net.cleanup() {
                warn!("Failed to cleanup container networking: {}", e);
            }
        }

        // The wait failed, so the child may still be alive: kill and reap it.
        if !child_waited {
            let _ = kill(child, Signal::SIGKILL);
            let _ = waitpid(child, None);
        }

        if let Some(reader) = self.trace_reader.take() {
            reader.stop_and_flush();
        }

        if let Some(logger) = self.deny_logger.take() {
            logger.stop();
        }

        if let Some(cgroup) = self.cgroup_opt.take() {
            if let Err(e) = cgroup.cleanup() {
                warn!("Failed to cleanup cgroup: {}", e);
            }
        }

        if config.use_gvisor {
            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
                warn!(
                    "Failed to cleanup gVisor artifacts for {}: {}",
                    config.id, e
                );
            }
        }

        if let Err(e) = self.state_mgr.delete_state(&config.id) {
            warn!("Failed to delete state for {}: {}", config.id, e);
        }

        // Emit the stop audit event last, once teardown has been attempted.
        match run_result {
            Ok(exit_code) => {
                audit(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("exit_code={}", exit_code),
                );
                info!(
                    "Container {} ({}) exited with code {}",
                    config.name, config.id, exit_code
                );
                Ok(exit_code)
            }
            Err(e) => {
                audit_error(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("error={}", e),
                );
                Err(e)
            }
        }
    }
}
1740
/// Owns the signal-forwarding thread and shuts it down when stopped or dropped.
struct SignalThreadGuard {
    /// Stop flag shared with the forwarding thread; `None` once a stop was requested.
    stop: Option<Arc<AtomicBool>>,
    /// Join handle of the forwarding thread; `None` once the thread was joined.
    handle: Option<JoinHandle<()>>,
}

impl SignalThreadGuard {
    /// Requests the forwarding thread to stop and joins it.
    ///
    /// Safe to call more than once: both fields are taken on the first call, so
    /// later calls (including the one from `Drop`) do nothing.
    fn stop(&mut self) {
        if let Some(flag) = self.stop.take() {
            // Publish the flag first, then wake the thread so it observes it.
            flag.store(true, Ordering::Release);
            if let Some(handle) = self.handle.as_ref() {
                // Presumably signals only this thread so its blocking wait returns;
                // confirm against `signals::wake_sigwait_thread`.
                super::signals::wake_sigwait_thread(handle, Signal::SIGUSR1);
            }
        }
        if let Some(handle) = self.handle.take() {
            // A panic inside the forwarding thread is deliberately ignored.
            let _ = handle.join();
        }
    }
}

impl Drop for SignalThreadGuard {
    /// Stops the forwarding thread if `stop` was not called explicitly.
    fn drop(&mut self) {
        self.stop();
    }
}
1766
/// Owns the optional health-check thread and cancels it when stopped or dropped.
struct HealthThreadGuard {
    /// Cancellation flag handed to the health-check thread; `None` after cancelling.
    cancel: Option<Arc<AtomicBool>>,
    /// Join handle of the health-check thread, if one was spawned.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Raises the cancellation flag and joins the thread.
    ///
    /// Both fields are emptied, so repeated calls are no-ops.
    fn stop(&mut self) {
        let cancel = self.cancel.take();
        let worker = self.handle.take();

        if let Some(cancel) = cancel {
            cancel.store(true, Ordering::Relaxed);
        }
        if let Some(worker) = worker {
            // A panic inside the health-check thread is deliberately ignored.
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Cancels and joins the thread if `stop` was not called explicitly.
    fn drop(&mut self) {
        self.stop();
    }
}
1789
1790#[cfg(test)]
1791mod tests {
1792 use super::*;
1793 use crate::container::KernelLockdownMode;
1794 use crate::network::NetworkMode;
1795 use std::ffi::OsString;
1796 use std::sync::{Mutex, MutexGuard};
1797
1798 static ENV_LOCK: Mutex<()> = Mutex::new(());
1799
1800 struct EnvLock {
1801 _guard: MutexGuard<'static, ()>,
1802 }
1803
1804 impl EnvLock {
1805 fn acquire() -> Self {
1806 Self {
1807 _guard: ENV_LOCK.lock().unwrap(),
1808 }
1809 }
1810 }
1811
1812 struct EnvVarGuard {
1813 key: &'static str,
1814 previous: Option<OsString>,
1815 }
1816
1817 impl EnvVarGuard {
1818 fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
1819 let previous = std::env::var_os(key);
1820 std::env::set_var(key, value);
1821 Self { key, previous }
1822 }
1823
1824 fn remove(key: &'static str) -> Self {
1825 let previous = std::env::var_os(key);
1826 std::env::remove_var(key);
1827 Self { key, previous }
1828 }
1829 }
1830
1831 impl Drop for EnvVarGuard {
1832 fn drop(&mut self) {
1833 match &self.previous {
1834 Some(value) => std::env::set_var(self.key, value),
1835 None => std::env::remove_var(self.key),
1836 }
1837 }
1838 }
1839
1840 fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
1841 let fn_start = source
1842 .find(fn_signature)
1843 .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
1844 let after = &source[fn_start..];
1845 let open = after
1846 .find('{')
1847 .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
1848 let mut depth = 0u32;
1849 let mut end = open;
1850 for (i, ch) in after[open..].char_indices() {
1851 match ch {
1852 '{' => depth += 1,
1853 '}' => {
1854 depth -= 1;
1855 if depth == 0 {
1856 end = open + i + 1;
1857 break;
1858 }
1859 }
1860 _ => {}
1861 }
1862 }
1863 &after[..end]
1864 }
1865
1866 #[test]
1867 fn test_container_config() {
1868 let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1869 assert!(!config.id.is_empty());
1870 assert_eq!(config.command, vec!["/bin/sh"]);
1871 assert!(config.use_gvisor);
1872 }
1873
1874 #[test]
1875 fn test_run_uses_immediate_start_path_with_parent_setup_gate() {
1876 let source = include_str!("runtime.rs");
1877 let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
1878 let after = &source[fn_start..];
1879 let open = after.find('{').unwrap();
1880 let mut depth = 0u32;
1881 let mut fn_end = open;
1882 for (i, ch) in after[open..].char_indices() {
1883 match ch {
1884 '{' => depth += 1,
1885 '}' => {
1886 depth -= 1;
1887 if depth == 0 {
1888 fn_end = open + i + 1;
1889 break;
1890 }
1891 }
1892 _ => {}
1893 }
1894 }
1895 let run_body = &after[..fn_end];
1896 assert!(
1897 run_body.contains("create_internal(false)"),
1898 "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
1899 );
1900 assert!(
1901 !run_body.contains("self.create()?.start()"),
1902 "run() must not route through create()+start()"
1903 );
1904
1905 let create_body = extract_fn_body(source, "fn create_internal");
1906 assert!(
1907 create_body.contains("parent_setup_write"),
1908 "immediate run() must still use a parent setup gate before child setup proceeds"
1909 );
1910 }
1911
1912 #[test]
1913 fn test_container_config_with_name() {
1914 let config =
1915 ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1916 .unwrap();
1917 assert_eq!(config.name, "mycontainer");
1918 assert!(!config.id.is_empty());
1919 assert_ne!(config.id, config.name);
1920 }
1921
1922 #[test]
1923 fn test_allow_degraded_security_requires_explicit_config() {
1924 let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1925 assert!(!Container::allow_degraded_security(&strict));
1926
1927 let relaxed = strict.clone().with_allow_degraded_security(true);
1928 assert!(Container::allow_degraded_security(&relaxed));
1929 }
1930
1931 #[test]
1932 fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1933 let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1934 std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1935
1936 let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1937 assert!(!Container::allow_degraded_security(&strict));
1938
1939 let explicit = strict.with_allow_degraded_security(true);
1940 assert!(Container::allow_degraded_security(&explicit));
1941
1942 match prev {
1943 Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1944 None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1945 }
1946 }
1947
1948 #[test]
1949 fn test_host_network_requires_explicit_opt_in() {
1950 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1951 .unwrap()
1952 .with_network(NetworkMode::Host)
1953 .with_allow_host_network(false);
1954 let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1955 assert!(matches!(err, NucleusError::NetworkError(_)));
1956 }
1957
1958 #[test]
1959 fn test_host_network_opt_in_disables_net_namespace() {
1960 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1961 .unwrap()
1962 .with_network(NetworkMode::Host)
1963 .with_allow_host_network(true);
1964 assert!(config.namespaces.net);
1965 Container::apply_network_mode_guards(&mut config, true).unwrap();
1966 assert!(!config.namespaces.net);
1967 }
1968
1969 #[test]
1970 fn test_non_host_network_does_not_require_host_opt_in() {
1971 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1972 .unwrap()
1973 .with_network(NetworkMode::None)
1974 .with_allow_host_network(false);
1975 assert!(config.namespaces.net);
1976 Container::apply_network_mode_guards(&mut config, true).unwrap();
1977 assert!(config.namespaces.net);
1978 }
1979
1980 #[test]
1981 fn test_parse_kernel_lockdown_mode() {
1982 assert_eq!(
1983 Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
1984 Some(KernelLockdownMode::Integrity)
1985 );
1986 assert_eq!(
1987 Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
1988 Some(KernelLockdownMode::Confidentiality)
1989 );
1990 assert_eq!(
1991 Container::parse_active_lockdown_mode("[none] integrity"),
1992 None
1993 );
1994 }
1995
1996 #[test]
1997 fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
1998 let temp = tempfile::TempDir::new().unwrap();
1999 let source = temp.path().join("source-secret");
2000 std::fs::write(&source, "supersecret").unwrap();
2001
2002 let staged = Container::stage_gvisor_secret_files(
2003 &temp.path().join("stage"),
2004 &[crate::container::SecretMount {
2005 source: source.clone(),
2006 dest: std::path::PathBuf::from("/etc/app/secret.txt"),
2007 mode: 0o400,
2008 }],
2009 &crate::container::ProcessIdentity::root(),
2010 )
2011 .unwrap();
2012
2013 assert_eq!(staged.len(), 1);
2014 assert!(staged[0].source.starts_with(temp.path().join("stage")));
2015 assert_eq!(
2016 std::fs::read_to_string(&staged[0].source).unwrap(),
2017 "supersecret"
2018 );
2019 }
2020
2021 #[test]
2022 fn test_stage_gvisor_secret_files_rejects_symlink_source() {
2023 use std::os::unix::fs::symlink;
2024
2025 let temp = tempfile::TempDir::new().unwrap();
2026 let source = temp.path().join("source-secret");
2027 let link = temp.path().join("source-link");
2028 std::fs::write(&source, "supersecret").unwrap();
2029 symlink(&source, &link).unwrap();
2030
2031 let err = Container::stage_gvisor_secret_files(
2032 &temp.path().join("stage"),
2033 &[crate::container::SecretMount {
2034 source: link,
2035 dest: std::path::PathBuf::from("/etc/app/secret.txt"),
2036 mode: 0o400,
2037 }],
2038 &crate::container::ProcessIdentity::root(),
2039 )
2040 .unwrap_err();
2041
2042 assert!(
2043 err.to_string().contains("O_NOFOLLOW"),
2044 "gVisor secret staging must reject symlink sources"
2045 );
2046 }
2047
2048 #[test]
2049 fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
2050 let source = include_str!("runtime.rs");
2051 let fn_body = extract_fn_body(source, "fn setup_and_exec");
2052 assert!(
2053 fn_body.contains("mount_secrets_inmemory("),
2054 "setup_and_exec must use in-memory secret mounting"
2055 );
2056 assert!(
2057 !fn_body.contains("mount_secrets(&"),
2058 "setup_and_exec must not bind-mount secrets from the host"
2059 );
2060 }
2061
    /// Source check: `setup_and_exec` must derive `production_mode`, compute
    /// `procfs_best_effort` from it, and pass that flag to `mount_procfs`.
    ///
    /// The last needle spans several source lines, so it depends on the exact
    /// line breaks and indentation of the `mount_procfs` call.
    #[test]
    fn test_native_production_procfs_mount_is_not_rootless_best_effort() {
        let source = include_str!("runtime.rs");
        let fn_body = extract_fn_body(source, "fn setup_and_exec");

        assert!(
            fn_body.contains(
                "let production_mode = self.config.service_mode == ServiceMode::Production;"
            ),
            "setup_and_exec must derive an explicit production-mode guard for procfs hardening"
        );
        assert!(
            fn_body.contains("let procfs_best_effort = is_rootless && !production_mode;"),
            "rootless best-effort procfs fallback must be disabled in production mode"
        );
        // Whitespace-sensitive: matches the call together with its first two arguments.
        assert!(
            fn_body.contains(
                "mount_procfs(\n &proc_path,\n procfs_best_effort,"
            ),
            "mount_procfs must receive the production-aware best-effort flag"
        );
    }
2084
2085 #[test]
2086 fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
2087 let source = include_str!("gvisor_setup.rs");
2088 let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2089 assert!(
2090 fn_body.contains("with_inmemory_secret_mounts"),
2091 "gVisor setup must use the tmpfs-backed secret staging path"
2092 );
2093 assert!(
2094 !fn_body.contains("with_secret_mounts"),
2095 "gVisor setup must not bind-mount host secret paths"
2096 );
2097 }
2098
2099 #[test]
2100 fn test_gvisor_bridge_precreated_userns_skips_nested_oci_userns() {
2101 let source = include_str!("gvisor_setup.rs");
2102 let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2103 let precreated_check = fn_body.find("if precreated_userns").unwrap();
2104 let oci_userns = fn_body.find("with_rootless_user_namespace").unwrap();
2105 assert!(
2106 precreated_check < oci_userns,
2107 "pre-created rootless bridge userns must skip nested OCI user namespace setup"
2108 );
2109 }
2110
2111 #[test]
2112 fn test_gvisor_bridge_precreated_userns_disables_oci_no_new_privileges() {
2113 let source = include_str!("gvisor_setup.rs");
2114 let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2115 assert!(
2116 fn_body.contains("if precreated_userns")
2117 && fn_body.contains("with_no_new_privileges(false)"),
2118 "pre-created rootless bridge userns must not pass OCI noNewPrivileges to runsc"
2119 );
2120 }
2121
2122 #[test]
2123 fn test_gvisor_bridge_precreated_userns_selects_runsc_rootless() {
2124 let source = include_str!("gvisor_setup.rs");
2125 let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
2126 assert!(
2127 fn_body.contains("let runsc_rootless = precreated_userns"),
2128 "pre-created rootless bridge userns must use runsc's rootless execution path"
2129 );
2130 }
2131
2132 #[test]
2133 fn test_gvisor_bridge_rootless_requests_external_userns_mapping() {
2134 let source = include_str!("runtime.rs");
2135 let create_body = extract_fn_body(source, "fn create_internal");
2136 assert!(
2137 create_body.contains("let gvisor_bridge_needs_userns_mapping"),
2138 "gVisor bridge rootless setup must request parent-written userns mappings"
2139 );
2140 assert!(
2141 create_body.contains("matches!(config.network, NetworkMode::Bridge(_))"),
2142 "external mapping request must be scoped to gVisor bridge networking"
2143 );
2144 }
2145
2146 #[test]
2147 fn test_gvisor_bridge_namespace_creates_userns_before_netns() {
2148 let source = include_str!("runtime.rs");
2149 let fn_body = extract_fn_body(source, "fn prepare_gvisor_bridge_namespace");
2150 let userns = fn_body.find("CLONE_NEWUSER").unwrap();
2151 let request = fn_body.find("send_sync_byte").unwrap();
2152 let become_root = fn_body.find("become_userns_root_for_setup").unwrap();
2153 let netns = fn_body.find("CLONE_NEWNET").unwrap();
2154 assert!(
2155 userns < request && request < become_root && become_root < netns,
2156 "rootless gVisor bridge setup must map userns before creating the netns"
2157 );
2158 }
2159
2160 #[test]
2161 fn test_native_fork_sites_assert_single_threaded() {
2162 let runtime_source = include_str!("runtime.rs");
2163 let create_body = extract_fn_body(runtime_source, "fn create_internal");
2164 assert!(
2165 create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
2166 "create_internal must assert single-threaded before fork"
2167 );
2168
2169 let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
2170 assert!(
2171 setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
2172 "PID namespace setup must assert single-threaded before fork"
2173 );
2174
2175 let exec_source = include_str!("exec.rs");
2176 let init_body = extract_fn_body(exec_source, "fn run_as_init");
2177 assert!(
2178 init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
2179 "run_as_init must assert single-threaded before fork"
2180 );
2181 }
2182
2183 #[test]
2184 fn test_parent_setup_gate_released_after_network_policy() {
2185 let source = include_str!("runtime.rs");
2186 let create_body = extract_fn_body(source, "fn create_internal");
2187
2188 let cgroup_attach = create_body.find("cgroup.attach_process").unwrap();
2189 let deny_logger = create_body.find("maybe_start_seccomp_deny_logger").unwrap();
2190 let bridge_setup = create_body.find("BridgeDriver::setup_with_id").unwrap();
2191 let egress_policy = create_body.find("net.apply_egress_policy").unwrap();
2192 let release = create_body
2193 .find("Failed to notify child that parent setup is complete")
2194 .unwrap();
2195 let created = create_body.find("Ok(CreatedContainer").unwrap();
2196
2197 assert!(
2198 cgroup_attach < bridge_setup,
2199 "parent setup gate must not release before cgroup attachment"
2200 );
2201 assert!(
2202 cgroup_attach < deny_logger && deny_logger < bridge_setup,
2203 "seccomp deny logger must start after cgroup attachment and before workload release"
2204 );
2205 assert!(
2206 create_body.contains("cgroup_opt.as_ref().map(|cgroup| cgroup.path())"),
2207 "seccomp deny logger must receive the container cgroup scope"
2208 );
2209 assert!(
2210 bridge_setup < egress_policy && egress_policy < release,
2211 "parent setup gate must not release before bridge and egress policy setup"
2212 );
2213 assert!(
2214 release < created,
2215 "create_internal must release the child only after all fallible parent setup succeeds"
2216 );
2217 assert!(
2218 !create_body.contains("cgroup attachment is complete"),
2219 "child setup gate must not be released immediately after cgroup attachment"
2220 );
2221 }
2222
2223 #[test]
2224 fn test_child_waits_for_parent_setup_before_exec_paths() {
2225 let source = include_str!("runtime.rs");
2226 let setup_body = extract_fn_body(source, "fn setup_and_exec");
2227
2228 let gvisor_wait = setup_body
2229 .find("Parent closed setup pipe before signalling gVisor child")
2230 .unwrap();
2231 let gvisor_exec = setup_body.find("setup_and_exec_gvisor").unwrap();
2232 assert!(
2233 gvisor_wait < gvisor_exec,
2234 "gVisor path must wait for parent setup before execing runsc"
2235 );
2236
2237 let pid1_wait = setup_body
2238 .find("Parent closed setup pipe before signalling PID 1 child")
2239 .unwrap();
2240 let namespace_enter = setup_body.find("namespace_mgr.enter()?").unwrap();
2241 assert!(
2242 pid1_wait < namespace_enter,
2243 "PID namespace child must wait for parent setup before container setup continues"
2244 );
2245
2246 let direct_wait = setup_body
2247 .find("Parent closed setup pipe before signalling container child")
2248 .unwrap();
2249 assert!(
2250 direct_wait < namespace_enter,
2251 "non-PID namespace child must wait for parent setup before container setup continues"
2252 );
2253 }
2254
2255 #[test]
2256 fn test_parent_setup_failure_kills_reported_target_pid() {
2257 let source = include_str!("runtime.rs");
2258 let create_body = extract_fn_body(source, "fn create_internal");
2259
2260 let record_target = create_body
2261 .find("target_pid_for_cleanup = Some(target_pid)")
2262 .unwrap();
2263 let kill_target = create_body
2264 .find("kill(Pid::from_raw(target_pid as i32), Signal::SIGKILL)")
2265 .unwrap();
2266 let kill_intermediate = create_body.find("kill(child, Signal::SIGKILL)").unwrap();
2267
2268 assert!(
2269 record_target < kill_target,
2270 "parent setup cleanup must remember the reported target PID"
2271 );
2272 assert!(
2273 kill_target < kill_intermediate,
2274 "cleanup must kill the target PID before reaping the intermediate fork"
2275 );
2276 }
2277
2278 #[test]
2279 fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
2280 let source = include_str!("exec.rs");
2281 let fn_body = extract_fn_body(source, "fn run_as_init");
2282 assert!(
2283 !fn_body.contains("Self::apply_process_identity_to_current_process("),
2284 "run_as_init must not drop identity before the supervisor fork"
2285 );
2286 assert!(
2287 fn_body.contains("self.exec_command()?"),
2288 "workload child must still route through exec_command for identity application"
2289 );
2290 }
2291
2292 #[test]
2293 fn test_signal_thread_shutdown_uses_thread_directed_wakeup() {
2294 let runtime_source = include_str!("runtime.rs");
2295 let exec_source = include_str!("exec.rs");
2296 let signal_helper_source = include_str!("signals.rs");
2297 let process_directed_wakeup = ["kill(Pid::this()", ", Signal::SIGUSR1)"].concat();
2298
2299 assert!(
2300 !runtime_source.contains(&process_directed_wakeup),
2301 "CreatedContainer signal-thread shutdown must not send process-directed SIGUSR1"
2302 );
2303 assert!(
2304 !exec_source.contains(&process_directed_wakeup),
2305 "init supervisor signal-thread shutdown must not send process-directed SIGUSR1"
2306 );
2307 assert!(
2308 signal_helper_source.contains("libc::pthread_kill"),
2309 "signal-thread shutdown must wake the sigwait owner with a thread-directed signal"
2310 );
2311 }
2312
2313 #[test]
2314 fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
2315 let _env_lock = EnvLock::acquire();
2316 let temp = tempfile::TempDir::new().unwrap();
2317 let _artifact_base = EnvVarGuard::set(
2318 "NUCLEUS_GVISOR_ARTIFACT_BASE",
2319 temp.path().join("gvisor-artifacts"),
2320 );
2321 let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
2322 std::fs::create_dir_all(&artifact_dir).unwrap();
2323 std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
2324
2325 Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
2326 assert!(!artifact_dir.exists());
2327 }
2328
2329 #[test]
2330 fn test_gvisor_artifact_base_prefers_xdg_runtime_dir() {
2331 let _env_lock = EnvLock::acquire();
2332 let temp = tempfile::TempDir::new().unwrap();
2333 let _artifact_override = EnvVarGuard::remove("NUCLEUS_GVISOR_ARTIFACT_BASE");
2334 let _runtime = EnvVarGuard::set("XDG_RUNTIME_DIR", temp.path());
2335
2336 assert_eq!(
2337 Container::gvisor_artifact_dir("xdg-test"),
2338 temp.path().join("nucleus-gvisor").join("xdg-test")
2339 );
2340 }
2341
2342 #[test]
2343 fn test_health_check_loop_supports_cancellation() {
2344 let source = include_str!("health.rs");
2348 let fn_start = source.find("fn health_check_loop").unwrap();
2349 let fn_body = &source[fn_start..fn_start + 2500];
2350 assert!(
2351 fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
2352 "health_check_loop must accept an AtomicBool cancellation flag"
2353 );
2354 assert!(
2356 fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
2357 "health_check_loop must check cancellation during sleep intervals"
2358 );
2359 }
2360
2361 #[test]
2362 fn test_runtime_probes_do_not_spawn_host_nsenter() {
2363 let source = include_str!("health.rs");
2365
2366 let readiness_start = source.find("fn run_readiness_probe").unwrap();
2367 let readiness_body = &source[readiness_start..readiness_start + 2500];
2368 assert!(
2369 !readiness_body.contains("Command::new(&nsenter_bin)"),
2370 "readiness probes must not execute via host nsenter"
2371 );
2372
2373 let health_start = source.find("fn health_check_loop").unwrap();
2374 let health_body = &source[health_start..health_start + 2200];
2375 assert!(
2376 !health_body.contains("Command::new(&nsenter_bin)"),
2377 "health checks must not execute via host nsenter"
2378 );
2379 }
2380
2381 #[test]
2382 fn test_oci_mount_strip_prefix_no_expect() {
2383 let source = include_str!("gvisor_setup.rs");
2386 let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
2387 let fn_body = &source[fn_start..fn_start + 600];
2388 assert!(
2389 !fn_body.contains(".expect("),
2390 "prepare_oci_mountpoints must not use expect() – return Err instead"
2391 );
2392 }
2393
2394 #[test]
2395 fn test_notify_namespace_ready_validates_write_length() {
2396 let source = include_str!("runtime.rs");
2398 let fn_start = source.find("fn notify_namespace_ready").unwrap();
2399 let fn_body = &source[fn_start..fn_start + 500];
2400 assert!(
2402 fn_body.contains("written")
2403 || fn_body.contains("4")
2404 || fn_body.contains("payload.len()"),
2405 "notify_namespace_ready must validate complete write of all 4 bytes"
2406 );
2407 }
2408
2409 #[test]
2410 fn test_rlimit_failures_fatal_in_production() {
2411 let source = include_str!("runtime.rs");
2413 let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
2414 let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
2415 assert!(
2416 rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
2417 "RLIMIT failures must return Err in production mode"
2418 );
2419 }
2420
2421 #[test]
2422 fn test_tcp_readiness_probe_uses_portable_check() {
2423 let source = include_str!("health.rs");
2426 let probe_fn = source.find("TcpPort(port)").unwrap();
2427 let probe_body = &source[probe_fn..probe_fn + 500];
2428 assert!(
2429 !probe_body.contains("/dev/tcp"),
2430 "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
2431 );
2432 }
2433}