1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4 ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10 switch_root, verify_context_manifest, verify_rootfs_attestation, FilesystemState,
11 LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks, SeccompManager,
18 SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::OwnedFd;
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// A container that has been configured but not yet created.
///
/// Holds the configuration used by [`Container::run`] and
/// [`Container::create`] to fork and set up the container process.
pub struct Container {
    /// Configuration this container is built from.
    pub(super) config: ContainerConfig,
    /// Path to the `runsc` binary; `None` until it has been resolved
    /// (it is only resolved when the configuration requests gVisor).
    pub(super) runsc_path: Option<String>,
}
47
/// A container whose child process has been forked and registered, and which
/// is waiting to be started via [`CreatedContainer::start`].
pub struct CreatedContainer {
    /// Effective configuration (after rootless / production adjustments).
    pub(super) config: ContainerConfig,
    /// Persists and removes the on-disk container state.
    pub(super) state_mgr: ContainerStateManager,
    /// In-memory copy of the persisted state record.
    pub(super) state: ContainerState,
    /// PID returned by `fork()` in the creating process (the process that is
    /// waited on); `state.pid` holds the PID reported over the readiness pipe.
    pub(super) child: Pid,
    /// Cgroup holding the container, if one could be created.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network attached to the container, if any.
    pub(super) bridge_net: Option<BridgeNetwork>,
    /// Seccomp trace reader, if one was started for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// FIFO used to hold the child back until start; `None` when the child
    /// was not asked to defer its exec.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Tracing span covering the container's whole lifecycle.
    pub(super) _lifecycle_span: tracing::Span,
}
62
63impl Container {
64 pub fn new(config: ContainerConfig) -> Self {
65 Self {
66 config,
67 runsc_path: None,
68 }
69 }
70
71 pub fn run(&self) -> Result<i32> {
73 self.create_internal(false)?.start()
74 }
75
76 pub fn create(&self) -> Result<CreatedContainer> {
80 self.create_internal(true)
81 }
82
83 fn sanitize_fds() {
88 const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
91 let ret =
93 unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
94 if ret == 0 {
95 return;
96 }
97 if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
99 for entry in entries.flatten() {
100 if let Ok(fd_str) = entry.file_name().into_string() {
101 if let Ok(fd) = fd_str.parse::<i32>() {
102 if fd > 2 {
103 unsafe { libc::close(fd) };
104 }
105 }
106 }
107 }
108 }
109 }
110
111 pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
112 let thread_count = std::fs::read_to_string("/proc/self/status")
113 .ok()
114 .and_then(|s| {
115 s.lines()
116 .find(|line| line.starts_with("Threads:"))
117 .and_then(|line| line.split_whitespace().nth(1))
118 .and_then(|count| count.parse::<u32>().ok())
119 });
120
121 if thread_count == Some(1) {
122 return Ok(());
123 }
124
125 Err(NucleusError::ExecError(format!(
126 "{} requires a single-threaded process before fork, found {:?} threads",
127 context, thread_count
128 )))
129 }
130
131 fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
132 let lifecycle_span = info_span!(
133 "container.lifecycle",
134 container.id = %self.config.id,
135 container.name = %self.config.name,
136 runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
137 );
138 let _enter = lifecycle_span.enter();
139
140 info!(
141 "Creating container: {} (ID: {})",
142 self.config.name, self.config.id
143 );
144 audit(
145 &self.config.id,
146 &self.config.name,
147 AuditEventType::ContainerStart,
148 format!(
149 "command={:?} mode={:?} runtime={}",
150 crate::audit::redact_command(&self.config.command),
151 self.config.service_mode,
152 if self.config.use_gvisor {
153 "gvisor"
154 } else {
155 "native"
156 }
157 ),
158 );
159
160 let is_root = nix::unistd::Uid::effective().is_root();
162 let mut config = self.config.clone();
163
164 if !is_root && config.user_ns_config.is_none() {
165 info!("Not running as root, automatically enabling rootless mode");
166 config.namespaces.user = true;
167 config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
168 }
169
170 if is_root && !config.namespaces.user {
174 if config.service_mode == ServiceMode::Production {
175 info!("Running as root in production mode: enabling user namespace with UID remapping");
176 config.namespaces.user = true;
177 config.user_ns_config =
178 Some(crate::isolation::UserNamespaceConfig::root_remapped());
179 } else {
180 warn!(
181 "Running as root WITHOUT user namespace isolation. \
182 Container processes will run as real host UID 0. \
183 Use --user-ns or production mode for UID remapping."
184 );
185 }
186 }
187
188 if let Some(ref socket_path) = config.console_socket {
190 warn!(
191 "Console socket {} accepted but terminal forwarding is not yet implemented",
192 socket_path.display()
193 );
194 }
195
196 config.validate_production_mode()?;
198 Self::assert_kernel_lockdown(&config)?;
199
200 Self::apply_network_mode_guards(&mut config, is_root)?;
201 Self::apply_trust_level_guards(&mut config)?;
202 config.validate_runtime_support()?;
203
204 if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
206 if config.service_mode == ServiceMode::Production {
207 return Err(NucleusError::NetworkError(
208 "Production mode with bridge networking requires root (cannot silently \
209 degrade to no networking)"
210 .to_string(),
211 ));
212 }
213 warn!("Bridge networking requires root, degrading to no networking");
214 config.network = NetworkMode::None;
215 }
216
217 let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;
219
220 if let Ok(all_states) = state_mgr.list_states() {
222 if all_states.iter().any(|s| s.name == config.name) {
223 return Err(NucleusError::ConfigError(format!(
224 "A container named '{}' already exists; use a different --name, \
225 or remove the stale state with 'nucleus delete'",
226 config.name
227 )));
228 }
229 }
230
231 let exec_fifo = if defer_exec_until_start {
234 let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
235 nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
236 NucleusError::ExecError(format!(
237 "Failed to create exec FIFO {:?}: {}",
238 exec_fifo, e
239 ))
240 })?;
241 Some(exec_fifo)
242 } else {
243 None
244 };
245
246 let cgroup_name = format!("nucleus-{}", config.id);
248 let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
249 Ok(mut cgroup) => {
250 match cgroup.set_limits(&config.limits) {
252 Ok(_) => {
253 info!("Created cgroup with resource limits");
254 Some(cgroup)
255 }
256 Err(e) => {
257 if config.service_mode == ServiceMode::Production {
258 let _ = cgroup.cleanup();
259 return Err(NucleusError::CgroupError(format!(
260 "Production mode requires cgroup resource enforcement, but \
261 applying limits failed: {}",
262 e
263 )));
264 }
265 warn!("Failed to set cgroup limits: {}", e);
266 let _ = cgroup.cleanup();
267 None
268 }
269 }
270 }
271 Err(e) => {
272 if config.service_mode == ServiceMode::Production {
273 return Err(NucleusError::CgroupError(format!(
274 "Production mode requires cgroup resource enforcement, but \
275 cgroup creation failed: {}",
276 e
277 )));
278 }
279
280 if config.user_ns_config.is_some() {
281 if config.limits.memory_bytes.is_some()
282 || config.limits.cpu_quota_us.is_some()
283 || config.limits.pids_max.is_some()
284 {
285 warn!(
286 "Running in rootless mode: requested resource limits cannot be \
287 enforced – cgroup creation requires root ({})",
288 e
289 );
290 } else {
291 debug!("Running in rootless mode without cgroup resource limits");
292 }
293 } else {
294 warn!(
295 "Failed to create cgroup (running without resource limits): {}",
296 e
297 );
298 }
299 None
300 }
301 };
302
303 let runsc_path = if config.use_gvisor {
305 Some(GVisorRuntime::resolve_path().map_err(|e| {
306 NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
307 })?)
308 } else {
309 None
310 };
311
312 let (ready_read, ready_write) = pipe().map_err(|e| {
314 NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
315 })?;
316
317 Self::assert_single_threaded_for_fork("container create fork")?;
322 match unsafe { fork() }? {
325 ForkResult::Parent { child } => {
326 drop(ready_write);
327 info!("Forked child process: {}", child);
328
329 let parent_setup = || -> Result<CreatedContainer> {
332 let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
333
334 let cgroup_path = cgroup_opt
335 .as_ref()
336 .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
337 let cpu_millicores = config
338 .limits
339 .cpu_quota_us
340 .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
341 let mut state = ContainerState::new(ContainerStateParams {
342 id: config.id.clone(),
343 name: config.name.clone(),
344 pid: target_pid,
345 command: config.command.clone(),
346 memory_limit: config.limits.memory_bytes,
347 cpu_limit: cpu_millicores,
348 using_gvisor: config.use_gvisor,
349 rootless: config.user_ns_config.is_some(),
350 cgroup_path,
351 process_uid: config.process_identity.uid,
352 process_gid: config.process_identity.gid,
353 additional_gids: config.process_identity.additional_gids.clone(),
354 });
355 state.config_hash = config.config_hash;
356 state.bundle_path =
357 config.rootfs_path.as_ref().map(|p| p.display().to_string());
358
359 let mut bridge_net: Option<BridgeNetwork> = None;
360 let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
361
362 state.status = OciStatus::Created;
364 state_mgr.save_state(&state)?;
365
366 if let Some(ref pid_path) = config.pid_file {
368 std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
369 NucleusError::ConfigError(format!(
370 "Failed to write pid-file '{}': {}",
371 pid_path.display(),
372 e
373 ))
374 })?;
375 info!("Wrote PID {} to {}", target_pid, pid_path.display());
376 }
377
378 if let Some(ref mut cgroup) = cgroup_opt {
379 cgroup.attach_process(target_pid)?;
380 }
381
382 if let NetworkMode::Bridge(ref bridge_config) = config.network {
383 match BridgeNetwork::setup_with_id(target_pid, bridge_config, &config.id) {
384 Ok(net) => {
385 if let Some(ref egress) = config.egress_policy {
386 if let Err(e) = net.apply_egress_policy(target_pid, egress) {
387 if config.service_mode == ServiceMode::Production {
388 return Err(NucleusError::NetworkError(format!(
389 "Failed to apply egress policy: {}",
390 e
391 )));
392 }
393 warn!("Failed to apply egress policy: {}", e);
394 }
395 }
396 bridge_net = Some(net);
397 }
398 Err(e) => {
399 if config.service_mode == ServiceMode::Production {
400 return Err(e);
401 }
402 warn!("Failed to set up bridge networking: {}", e);
403 }
404 }
405 }
406
407 info!(
408 "Container {} created (child pid {}), waiting for start",
409 config.id, target_pid
410 );
411
412 Ok(CreatedContainer {
413 config,
414 state_mgr,
415 state,
416 child,
417 cgroup_opt,
418 bridge_net,
419 trace_reader,
420 exec_fifo_path: exec_fifo,
421 _lifecycle_span: lifecycle_span.clone(),
422 })
423 };
424
425 parent_setup().map_err(|e| {
426 let _ = kill(child, Signal::SIGKILL);
429 let _ = waitpid(child, None);
430 e
431 })
432 }
433 ForkResult::Child => {
434 drop(ready_read);
435 Self::sanitize_fds();
437 let temp_container = Container { config, runsc_path };
438 match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
439 Ok(_) => unreachable!(),
440 Err(e) => {
441 error!("Container setup failed: {}", e);
442 std::process::exit(1);
443 }
444 }
445 }
446 }
447 }
448
449 pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
452 let state_mgr = ContainerStateManager::new_with_root(state_root)?;
453 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
454 if !fifo_path.exists() {
455 return Err(NucleusError::ConfigError(format!(
456 "No exec FIFO found for container {}; is it in 'created' state?",
457 container_id
458 )));
459 }
460
461 let file = std::fs::File::open(&fifo_path)
463 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
464 let mut buf = [0u8; 1];
465 std::io::Read::read(&mut &file, &mut buf)
466 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
467 drop(file);
468
469 let _ = std::fs::remove_file(&fifo_path);
470
471 let mut state = state_mgr.resolve_container(container_id)?;
473 state.status = OciStatus::Running;
474 state_mgr.save_state(&state)?;
475
476 Ok(())
477 }
478
    /// Child-side container setup: builds the sandbox and then executes the
    /// workload.
    ///
    /// * `ready_pipe` - write end of the pipe over which the PID to track is
    ///   reported to the creating process.
    /// * `exec_fifo` - when present, the workload is not started until a
    ///   reader opens this FIFO and the sync byte has been written to it.
    ///
    /// The gVisor path reports readiness and delegates to
    /// `setup_and_exec_gvisor`. The native path runs, in order: namespace
    /// setup (with an extra fork when a PID namespace is requested),
    /// no-new-privs, root filesystem construction on a tmpfs, `switch_root`,
    /// capability policy, rlimits, seccomp, Landlock, the optional exec-FIFO
    /// wait, and finally `run_as_init` or `exec_command`.
    ///
    /// # Errors
    ///
    /// Returns the first error raised by any setup step. Hardening steps that
    /// report "not applied" are only tolerated when
    /// `allow_degraded_security` is true for this configuration.
    fn setup_and_exec(
        &self,
        ready_pipe: Option<OwnedFd>,
        exec_fifo: Option<PathBuf>,
    ) -> Result<()> {
        let is_rootless = self.config.user_ns_config.is_some();
        let allow_degraded_security = Self::allow_degraded_security(&self.config);
        // Snapshot the context directory up front so the populated copy can be
        // compared against it later.
        let context_manifest = if self.config.verify_context_integrity {
            self.config
                .context_dir
                .as_ref()
                .map(|dir| snapshot_context_dir(dir))
                .transpose()?
        } else {
            None
        };

        // State trackers; each `transition` call below is fallible.
        let mut fs_state = FilesystemState::Unmounted;
        let mut sec_state = SecurityState::Privileged;

        if self.config.use_gvisor {
            // gVisor path: report this process's PID and hand over.
            if let Some(fd) = ready_pipe {
                Self::notify_namespace_ready(&fd, std::process::id())?;
            }
            return self.setup_and_exec_gvisor();
        }

        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
        if let Some(user_config) = &self.config.user_ns_config {
            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
        }
        namespace_mgr.unshare_namespaces()?;

        if self.config.namespaces.pid {
            // With a PID namespace, fork again: the forked child continues the
            // setup, while this process reports the child's PID and then
            // only mirrors its exit status.
            Self::assert_single_threaded_for_fork("PID namespace init fork")?;
            match unsafe { fork() }? {
                ForkResult::Parent { child } => {
                    if let Some(fd) = ready_pipe {
                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
                    }
                    std::process::exit(Self::wait_for_pid_namespace_child(child));
                }
                ForkResult::Child => {
                    // Falls through to the remaining setup below.
                }
            }
        } else if let Some(fd) = ready_pipe {
            Self::notify_namespace_ready(&fd, std::process::id())?;
        }

        namespace_mgr.enter()?;

        // Applied before any mount work (see the audit message below).
        self.enforce_no_new_privs()?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::NoNewPrivsSet,
            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
        );

        if let Some(hostname) = &self.config.hostname {
            namespace_mgr.set_hostname(hostname)?;
        }

        // Root uses /run/nucleus; otherwise the user's runtime dir, falling
        // back to the system temp dir.
        let runtime_base = if nix::unistd::Uid::effective().is_root() {
            std::path::PathBuf::from("/run/nucleus")
        } else {
            dirs::runtime_dir()
                .map(|d| d.join("nucleus"))
                .unwrap_or_else(std::env::temp_dir)
        };
        // A failure here is ignored; creating the temp dir below reports it.
        let _ = std::fs::create_dir_all(&runtime_base);
        let runtime_dir = Builder::new()
            .prefix("nucleus-runtime-")
            .tempdir_in(&runtime_base)
            .map_err(|e| {
                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
            })?;
        let container_root = runtime_dir.path().to_path_buf();
        // Size argument is 1024^3 (1 GiB if interpreted as bytes).
        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024));
        tmpfs.mount()?;
        fs_state = fs_state.transition(FilesystemState::Mounted)?;

        create_minimal_fs(&container_root)?;

        let dev_path = container_root.join("dev");
        create_dev_nodes(&dev_path, false)?;

        if let Some(context_dir) = &self.config.context_dir {
            let context_dest = container_root.join("context");
            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
            // Compare the populated copy with the snapshot taken at the top.
            if let Some(expected) = &context_manifest {
                verify_context_manifest(expected, &context_dest)?;
            }
        }
        fs_state = fs_state.transition(FilesystemState::Populated)?;

        // Either a caller-supplied rootfs or selected host paths are bound in.
        if let Some(ref rootfs_path) = self.config.rootfs_path {
            if self.config.verify_rootfs_attestation {
                verify_rootfs_attestation(rootfs_path)?;
            }
            bind_mount_rootfs(&container_root, rootfs_path)?;
        } else {
            bind_mount_host_paths(&container_root, is_rootless)?;
        }

        mount_volumes(&container_root, &self.config.volumes)?;

        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
            // With a supplied rootfs resolv.conf is bind-mounted; otherwise it
            // is written into the container root.
            if self.config.rootfs_path.is_some() {
                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
            } else {
                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
            }
        }

        mount_secrets_inmemory(
            &container_root,
            &self.config.secrets,
            &self.config.process_identity,
        )?;

        let proc_path = container_root.join("proc");
        // PID hiding is requested only in production mode.
        let hide_pids = self.config.service_mode == ServiceMode::Production;
        mount_procfs(
            &proc_path,
            is_rootless,
            self.config.proc_readonly,
            hide_pids,
        )?;

        mask_proc_paths(
            &proc_path,
            self.config.service_mode == ServiceMode::Production,
        )?;

        // createRuntime hooks run before the root is switched.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_runtime.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Creating,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
            }
        }

        switch_root(&container_root, self.config.allow_chroot_fallback)?;
        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
        debug!("Filesystem state: {:?}", fs_state);

        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::MountAuditPassed,
            "all mount flags verified",
        );

        // createContainer hooks run after the root has been switched.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Created,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
            }
        }

        // Capabilities: apply the configured policy, or drop everything when
        // no policy file is configured.
        let mut cap_mgr = CapabilityManager::new();
        if let Some(ref policy_path) = self.config.caps_policy {
            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.caps_policy_sha256.as_deref(),
            )?;
            if self.config.service_mode == ServiceMode::Production {
                policy.validate_production()?;
            }
            policy.apply(&mut cap_mgr)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                format!("capability policy applied from {:?}", policy_path),
            );
        } else {
            cap_mgr.drop_all()?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                "all capabilities dropped including bounding set",
            );
        }
        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;

        {
            // Resource limits: a failed setrlimit is fatal in production mode
            // and only logged otherwise.
            let is_production = self.config.service_mode == ServiceMode::Production;

            // Process limit: configured pids_max, defaulting to 512.
            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
            let rlim_nproc = libc::rlimit {
                rlim_cur: nproc_limit,
                rlim_max: nproc_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
                        nproc_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
            }

            // Open-file limit: fixed at 1024.
            let rlim_nofile = libc::rlimit {
                rlim_cur: 1024,
                rlim_max: 1024,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
                        err
                    )));
                }
                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
            }

            // Locked-memory limit: 64 * 1024.
            let memlock_limit: u64 = 64 * 1024;
            let rlim_memlock = libc::rlimit {
                rlim_cur: memlock_limit,
                rlim_max: memlock_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
                        memlock_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
            }
        }

        CapabilityManager::verify_no_namespace_caps(
            self.config.service_mode == ServiceMode::Production,
        )?;

        use crate::container::config::SeccompMode;
        let mut seccomp_mgr = SeccompManager::new();
        // Any network mode other than `None` counts as "network allowed".
        let allow_network = !matches!(self.config.network, NetworkMode::None);
        let seccomp_applied = match self.config.seccomp_mode {
            SeccompMode::Trace => {
                audit(
                    &self.config.id,
                    &self.config.name,
                    AuditEventType::SeccompApplied,
                    "seccomp trace mode: allow-all + LOG",
                );
                seccomp_mgr.apply_trace_filter()?
            }
            SeccompMode::Enforce => {
                // A configured profile file takes precedence over the
                // filter selected by network mode.
                if let Some(ref profile_path) = self.config.seccomp_profile {
                    audit(
                        &self.config.id,
                        &self.config.name,
                        AuditEventType::SeccompProfileLoaded,
                        format!("path={:?}", profile_path),
                    );
                    seccomp_mgr.apply_profile_from_file(
                        profile_path,
                        self.config.seccomp_profile_sha256.as_deref(),
                        self.config.seccomp_log_denied,
                    )?
                } else {
                    seccomp_mgr.apply_filter_for_network_mode(
                        allow_network,
                        allow_degraded_security,
                        self.config.seccomp_log_denied,
                    )?
                }
            }
        };
        if seccomp_applied {
            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::SeccompApplied,
                format!("network={}", allow_network),
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::SeccompError(
                "Seccomp filter is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Seccomp not enforced; container is running with degraded hardening");
        }

        // Landlock: a configured policy file, or the built-in container policy.
        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.landlock_policy_sha256.as_deref(),
            )?;
            if self.config.service_mode == ServiceMode::Production {
                policy.validate_production()?;
            }
            policy.apply(allow_degraded_security)?
        } else {
            let mut landlock_mgr = LandlockManager::new();
            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
        };
        // The security state only reaches `Locked` when both seccomp and
        // Landlock were applied and seccomp is not in trace mode.
        if seccomp_applied && landlock_applied {
            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
            if self.config.seccomp_mode == SeccompMode::Trace {
                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
            } else {
                sec_state = sec_state.transition(SecurityState::Locked)?;
            }
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::LandlockApplied,
                if self.config.seccomp_mode == SeccompMode::Trace {
                    "landlock applied, but seccomp in trace mode — not locked".to_string()
                } else {
                    "security state locked: all hardening layers active".to_string()
                },
            );
        } else if !allow_degraded_security {
            // NOTE(review): this branch is also reached when only seccomp was
            // not applied, yet the message always names Landlock.
            return Err(NucleusError::LandlockError(
                "Landlock policy is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Security state not locked; one or more hardening controls are inactive");
        }
        debug!("Security state: {:?}", sec_state);

        if let Some(ref fifo_path) = exec_fifo {
            // Opening a FIFO write-only blocks until a reader opens it; the
            // single byte written afterwards is the start handshake.
            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
            let file = std::fs::OpenOptions::new()
                .write(true)
                .open(fifo_path)
                .map_err(|e| {
                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
                })?;
            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
            })?;
            drop(file);
            debug!("Exec FIFO released, proceeding to exec");
        }

        // startContainer hooks run after the start handshake.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.start_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Running,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
            }
        }

        // Production mode with a PID namespace runs via `run_as_init`.
        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
            return self.run_as_init();
        }

        self.exec_command()?;

        Ok(())
    }
929
930 pub(super) fn setup_signal_forwarding_static(
935 child: Pid,
936 ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
937 let mut set = SigSet::empty();
938 for signal in [
939 Signal::SIGTERM,
940 Signal::SIGINT,
941 Signal::SIGHUP,
942 Signal::SIGQUIT,
943 Signal::SIGUSR1,
944 Signal::SIGUSR2,
945 ] {
946 set.add(signal);
947 }
948
949 let unblock_set = set;
950 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
951 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
952 })?;
953
954 let stop = Arc::new(AtomicBool::new(false));
955 let stop_clone = stop.clone();
956 let handle = std::thread::Builder::new()
957 .name("sig-forward".to_string())
958 .spawn(move || {
959 loop {
961 if let Ok(signal) = unblock_set.wait() {
962 if stop_clone.load(Ordering::Relaxed) {
966 break;
967 }
968 let _ = kill(child, signal);
969 }
970 }
971 })
972 .map_err(|e| {
973 let mut restore = SigSet::empty();
976 for signal in [
977 Signal::SIGTERM,
978 Signal::SIGINT,
979 Signal::SIGHUP,
980 Signal::SIGQUIT,
981 Signal::SIGUSR1,
982 Signal::SIGUSR2,
983 ] {
984 restore.add(signal);
985 }
986 let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
987 NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
988 })?;
989
990 info!("Signal forwarding configured");
991 Ok((stop, handle))
992 }
993
994 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
996 loop {
997 match waitpid(child, None) {
998 Ok(WaitStatus::Exited(_, code)) => {
999 return Ok(code);
1000 }
1001 Ok(WaitStatus::Signaled(_, signal, _)) => {
1002 info!("Child killed by signal: {:?}", signal);
1003 return Ok(128 + signal as i32);
1004 }
1005 Err(nix::errno::Errno::EINTR) => {
1006 continue;
1007 }
1008 Err(e) => {
1009 return Err(NucleusError::ExecError(format!(
1010 "Failed to wait for child: {}",
1011 e
1012 )));
1013 }
1014 _ => {
1015 continue;
1016 }
1017 }
1018 }
1019 }
1020
1021 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1022 let mut pid_buf = [0u8; 4];
1023 loop {
1024 match read(ready_read, &mut pid_buf) {
1025 Err(nix::errno::Errno::EINTR) => continue,
1026 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1027 Ok(0) => {
1028 return Err(NucleusError::ExecError(format!(
1029 "Child {} exited before namespace initialization",
1030 child
1031 )))
1032 }
1033 Ok(_) => {
1034 return Err(NucleusError::ExecError(
1035 "Invalid namespace sync payload from child".to_string(),
1036 ))
1037 }
1038 Err(e) => {
1039 return Err(NucleusError::ExecError(format!(
1040 "Failed waiting for child namespace setup: {}",
1041 e
1042 )))
1043 }
1044 }
1045 }
1046 }
1047
1048 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1049 let payload = pid.to_ne_bytes();
1050 let mut written = 0;
1051 while written < payload.len() {
1052 let n = write(fd, &payload[written..]).map_err(|e| {
1053 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1054 })?;
1055 if n == 0 {
1056 return Err(NucleusError::ExecError(
1057 "Failed to notify namespace readiness: short write".to_string(),
1058 ));
1059 }
1060 written += n;
1061 }
1062 Ok(())
1063 }
1064
1065 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1066 loop {
1067 match waitpid(child, None) {
1068 Ok(WaitStatus::Exited(_, code)) => return code,
1069 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1070 Err(nix::errno::Errno::EINTR) => continue,
1071 Err(_) => return 1,
1072 _ => continue,
1073 }
1074 }
1075 }
1076}
1077
1078impl CreatedContainer {
1079 pub fn start(mut self) -> Result<i32> {
1082 let config = &self.config;
1083 let _enter = self._lifecycle_span.enter();
1084
1085 if let Some(exec_fifo_path) = &self.exec_fifo_path {
1088 let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
1089 NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
1090 })?;
1091 let mut buf = [0u8; 1];
1092 let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
1093 NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
1094 })?;
1095 if read != 1 {
1096 return Err(NucleusError::ExecError(
1097 "Exec FIFO closed before start signal was delivered".to_string(),
1098 ));
1099 }
1100 let _ = std::fs::remove_file(exec_fifo_path);
1101 }
1102
1103 self.state.status = OciStatus::Running;
1105 self.state_mgr.save_state(&self.state)?;
1106
1107 let target_pid = self.state.pid;
1108 let child = self.child;
1109
1110 let (sig_stop, sig_handle) =
1111 Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1112
1113 let mut sig_guard = SignalThreadGuard {
1115 stop: Some(sig_stop),
1116 handle: Some(sig_handle),
1117 };
1118
1119 if let Some(ref probe) = config.readiness_probe {
1121 let notify_socket = if config.sd_notify {
1122 std::env::var("NOTIFY_SOCKET").ok()
1123 } else {
1124 None
1125 };
1126 Container::run_readiness_probe(
1127 target_pid,
1128 &config.name,
1129 probe,
1130 config.user_ns_config.is_some(),
1131 config.use_gvisor,
1132 &config.process_identity,
1133 notify_socket.as_deref(),
1134 )?;
1135 }
1136
1137 let cancel_flag = Arc::new(AtomicBool::new(false));
1139 let health_handle = if let Some(ref hc) = config.health_check {
1140 if !hc.command.is_empty() {
1141 let hc = hc.clone();
1142 let pid = target_pid;
1143 let container_name = config.name.clone();
1144 let rootless = config.user_ns_config.is_some();
1145 let using_gvisor = config.use_gvisor;
1146 let process_identity = config.process_identity.clone();
1147 let cancel = cancel_flag.clone();
1148 Some(std::thread::spawn(move || {
1149 Container::health_check_loop(
1150 pid,
1151 &container_name,
1152 rootless,
1153 using_gvisor,
1154 &hc,
1155 &process_identity,
1156 &cancel,
1157 );
1158 }))
1159 } else {
1160 None
1161 }
1162 } else {
1163 None
1164 };
1165
1166 let mut health_guard = HealthThreadGuard {
1168 cancel: Some(cancel_flag),
1169 handle: health_handle,
1170 };
1171
1172 if let Some(ref hooks) = config.hooks {
1174 if !hooks.poststart.is_empty() {
1175 let hook_state = OciContainerState {
1176 oci_version: "1.0.2".to_string(),
1177 id: config.id.clone(),
1178 status: OciStatus::Running,
1179 pid: target_pid,
1180 bundle: String::new(),
1181 };
1182 OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1183 }
1184 }
1185
1186 let mut child_waited = false;
1187 let run_result: Result<i32> = (|| {
1188 let exit_code = Container::wait_for_child_static(child)?;
1189
1190 self.state.status = OciStatus::Stopped;
1192 let _ = self.state_mgr.save_state(&self.state);
1193
1194 child_waited = true;
1195 Ok(exit_code)
1196 })();
1197
1198 health_guard.stop();
1201 sig_guard.stop();
1202
1203 if let Some(ref hooks) = config.hooks {
1205 if !hooks.poststop.is_empty() {
1206 let hook_state = OciContainerState {
1207 oci_version: "1.0.2".to_string(),
1208 id: config.id.clone(),
1209 status: OciStatus::Stopped,
1210 pid: 0,
1211 bundle: String::new(),
1212 };
1213 OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1214 }
1215 }
1216
1217 if let Some(net) = self.bridge_net.take() {
1218 if let Err(e) = net.cleanup() {
1219 warn!("Failed to cleanup bridge networking: {}", e);
1220 }
1221 }
1222
1223 if !child_waited {
1224 let _ = kill(child, Signal::SIGKILL);
1225 let _ = waitpid(child, None);
1226 }
1227
1228 if let Some(reader) = self.trace_reader.take() {
1229 reader.stop_and_flush();
1230 }
1231
1232 if let Some(cgroup) = self.cgroup_opt.take() {
1233 if let Err(e) = cgroup.cleanup() {
1234 warn!("Failed to cleanup cgroup: {}", e);
1235 }
1236 }
1237
1238 if config.use_gvisor {
1239 if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1240 warn!(
1241 "Failed to cleanup gVisor artifacts for {}: {}",
1242 config.id, e
1243 );
1244 }
1245 }
1246
1247 if let Err(e) = self.state_mgr.delete_state(&config.id) {
1248 warn!("Failed to delete state for {}: {}", config.id, e);
1249 }
1250
1251 match run_result {
1252 Ok(exit_code) => {
1253 audit(
1254 &config.id,
1255 &config.name,
1256 AuditEventType::ContainerStop,
1257 format!("exit_code={}", exit_code),
1258 );
1259 info!(
1260 "Container {} ({}) exited with code {}",
1261 config.name, config.id, exit_code
1262 );
1263 Ok(exit_code)
1264 }
1265 Err(e) => {
1266 audit_error(
1267 &config.id,
1268 &config.name,
1269 AuditEventType::ContainerStop,
1270 format!("error={}", e),
1271 );
1272 Err(e)
1273 }
1274 }
1275 }
1276}
1277
1278struct SignalThreadGuard {
1280 stop: Option<Arc<AtomicBool>>,
1281 handle: Option<JoinHandle<()>>,
1282}
1283
1284impl SignalThreadGuard {
1285 fn stop(&mut self) {
1286 if let Some(flag) = self.stop.take() {
1287 flag.store(true, Ordering::Relaxed);
1288 let _ = kill(Pid::this(), Signal::SIGUSR1);
1290 }
1291 if let Some(handle) = self.handle.take() {
1292 let _ = handle.join();
1293 }
1294 }
1295}
1296
1297impl Drop for SignalThreadGuard {
1298 fn drop(&mut self) {
1299 self.stop();
1300 }
1301}
1302
/// Guard that cancels and joins a background thread (by its name, the
/// health-check one) when asked to or when dropped.
///
/// Both fields are `Option`s that are taken on first use, so a second
/// shutdown (explicit call followed by `Drop`) does nothing.
struct HealthThreadGuard {
    /// Cancellation flag shared with the thread; `None` once cancellation was requested.
    cancel: Option<Arc<AtomicBool>>,
    /// Handle used to wait for the thread; `None` once it has been joined.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Sets the cancellation flag and then blocks until the thread has exited.
    ///
    /// The result of `join` is deliberately ignored.
    fn stop(&mut self) {
        let cancel_flag = self.cancel.take();
        let worker = self.handle.take();

        // Raise the flag before joining, otherwise the join could wait on a
        // thread that was never told to stop.
        if let Some(cancel_flag) = cancel_flag {
            cancel_flag.store(true, Ordering::Relaxed);
        }
        if let Some(worker) = worker {
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Stops the thread if `stop` was not already called explicitly.
    fn drop(&mut self) {
        HealthThreadGuard::stop(self);
    }
}
1325
1326#[cfg(test)]
1327mod tests {
1328 use super::*;
1329 use crate::container::KernelLockdownMode;
1330 use crate::network::NetworkMode;
1331
/// Returns the slice of `source` that starts at the first occurrence of
/// `fn_signature` and ends at the brace closing the first `{` after it.
///
/// Brace matching is purely textual: every `{` and `}` character is counted,
/// including any inside string literals or comments.
///
/// # Panics
///
/// Panics if `fn_signature` does not occur in `source`, if no `{` follows it,
/// or if the opening brace is never closed. The last case used to return just
/// the text before the opening brace, which let negative assertions such as
/// `!body.contains(..)` pass without inspecting any code.
fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
    let fn_start = source
        .find(fn_signature)
        .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
    let after = &source[fn_start..];
    let open = after
        .find('{')
        .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));

    // The scan starts on the opening brace, so `depth` is at least 1 before
    // the first decrement and cannot underflow.
    let mut depth = 0u32;
    for (offset, ch) in after[open..].char_indices() {
        match ch {
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    // `}` is one byte wide, so `+ 1` lands just past it.
                    return &after[..open + offset + 1];
                }
            }
            _ => {}
        }
    }

    panic!(
        "unbalanced braces: no closing brace found for '{}'",
        fn_signature
    )
}
1357
/// A config built without a name has a non-empty id, keeps the given command,
/// and has gVisor enabled.
#[test]
fn test_container_config() {
    let argv = vec!["/bin/sh".to_string()];
    let cfg = ContainerConfig::try_new(None, argv).unwrap();

    assert!(cfg.use_gvisor);
    assert_eq!(cfg.command, vec!["/bin/sh"]);
    assert!(!cfg.id.is_empty());
}
1365
/// Source-level check that `run()` calls `create_internal(false)` and does not
/// go through `create()` followed by `start()`.
///
/// Uses the shared `extract_fn_body` helper rather than a private copy of the
/// brace-matching loop, so a missing signature or opening brace fails with the
/// helper's descriptive panic message.
#[test]
fn test_run_uses_immediate_start_path() {
    let source = include_str!("runtime.rs");
    let run_body = extract_fn_body(source, "pub fn run(&self) -> Result<i32>");

    assert!(
        run_body.contains("create_internal(false)"),
        "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
    );
    assert!(
        !run_body.contains("self.create()?.start()"),
        "run() must not route through create()+start()"
    );
}
1397
/// A config built with an explicit name keeps that name and has a separate,
/// non-empty id.
#[test]
fn test_container_config_with_name() {
    let requested_name = Some("mycontainer".to_string());
    let cfg = ContainerConfig::try_new(requested_name, vec!["/bin/sh".to_string()]).unwrap();

    assert!(!cfg.id.is_empty());
    assert_eq!(cfg.name, "mycontainer");
    assert_ne!(cfg.id, cfg.name);
}
1407
/// Degraded security is off for a fresh config and on only after
/// `with_allow_degraded_security(true)`.
#[test]
fn test_allow_degraded_security_requires_explicit_config() {
    let default_cfg = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let opted_in = default_cfg.clone().with_allow_degraded_security(true);

    assert!(!Container::allow_degraded_security(&default_cfg));
    assert!(Container::allow_degraded_security(&opted_in));
}
1416
/// Setting `NUCLEUS_ALLOW_DEGRADED_SECURITY` in the environment must not turn
/// on degraded security; only the explicit config opt-in does.
#[test]
fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
    const VAR: &str = "NUCLEUS_ALLOW_DEGRADED_SECURITY";

    /// Puts the variable back to its pre-test value when dropped. Because
    /// drops also run while a failed assertion unwinds, the override cannot
    /// leak into the rest of the test process.
    struct RestoreEnv(Option<std::ffi::OsString>);

    impl Drop for RestoreEnv {
        fn drop(&mut self) {
            match self.0.take() {
                Some(previous) => std::env::set_var(VAR, previous),
                None => std::env::remove_var(VAR),
            }
        }
    }

    let _restore = RestoreEnv(std::env::var_os(VAR));
    std::env::set_var(VAR, "1");

    let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    assert!(!Container::allow_degraded_security(&strict));

    let explicit = strict.with_allow_degraded_security(true);
    assert!(Container::allow_degraded_security(&explicit));
}
1433
/// Host networking without the host-network opt-in is rejected with a
/// `NetworkError`.
#[test]
fn test_host_network_requires_explicit_opt_in() {
    let base = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let mut cfg = base
        .with_network(NetworkMode::Host)
        .with_allow_host_network(false);

    let outcome = Container::apply_network_mode_guards(&mut cfg, true);
    let rejection = outcome.unwrap_err();
    assert!(matches!(rejection, NucleusError::NetworkError(_)));
}
1443
/// With host networking explicitly allowed, the guards switch the network
/// namespace flag from on to off.
#[test]
fn test_host_network_opt_in_disables_net_namespace() {
    let base = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let mut cfg = base
        .with_network(NetworkMode::Host)
        .with_allow_host_network(true);

    let net_ns_before = cfg.namespaces.net;
    assert!(net_ns_before);

    Container::apply_network_mode_guards(&mut cfg, true).unwrap();

    let net_ns_after = cfg.namespaces.net;
    assert!(!net_ns_after);
}
1454
/// A non-host network mode passes the guards without the host opt-in and the
/// network namespace flag stays on.
#[test]
fn test_non_host_network_does_not_require_host_opt_in() {
    let base = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
    let mut cfg = base
        .with_network(NetworkMode::None)
        .with_allow_host_network(false);

    let net_ns_before = cfg.namespaces.net;
    assert!(net_ns_before);

    Container::apply_network_mode_guards(&mut cfg, true).unwrap();

    let net_ns_after = cfg.namespaces.net;
    assert!(net_ns_after);
}
1465
/// The bracketed entry selects the lockdown mode; a bracketed `none` parses to
/// `None`.
#[test]
fn test_parse_kernel_lockdown_mode() {
    let cases = [
        (
            "none [integrity] confidentiality",
            Some(KernelLockdownMode::Integrity),
        ),
        (
            "none integrity [confidentiality]",
            Some(KernelLockdownMode::Confidentiality),
        ),
        ("[none] integrity", None),
    ];

    for (raw, expected) in cases {
        assert_eq!(Container::parse_active_lockdown_mode(raw), expected);
    }
}
1481
/// Staging one secret yields one mount whose source lies under the stage
/// directory and holds the original contents.
#[test]
fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
    let workdir = tempfile::TempDir::new().unwrap();
    let stage_dir = workdir.path().join("stage");
    let secret_path = workdir.path().join("source-secret");
    std::fs::write(&secret_path, "supersecret").unwrap();

    let mounts = [crate::container::SecretMount {
        source: secret_path.clone(),
        dest: std::path::PathBuf::from("/etc/app/secret.txt"),
        mode: 0o400,
    }];
    let identity = crate::container::ProcessIdentity::root();
    let staged = Container::stage_gvisor_secret_files(&stage_dir, &mounts, &identity).unwrap();

    assert_eq!(staged.len(), 1);
    assert!(staged[0].source.starts_with(&stage_dir));

    let staged_contents = std::fs::read_to_string(&staged[0].source).unwrap();
    assert_eq!(staged_contents, "supersecret");
}
1506
/// Staging a secret whose source is a symlink fails with an error that
/// mentions `O_NOFOLLOW`.
#[test]
fn test_stage_gvisor_secret_files_rejects_symlink_source() {
    let workdir = tempfile::TempDir::new().unwrap();
    let target = workdir.path().join("source-secret");
    let symlink_path = workdir.path().join("source-link");
    std::fs::write(&target, "supersecret").unwrap();
    std::os::unix::fs::symlink(&target, &symlink_path).unwrap();

    let mounts = [crate::container::SecretMount {
        source: symlink_path,
        dest: std::path::PathBuf::from("/etc/app/secret.txt"),
        mode: 0o400,
    }];
    let outcome = Container::stage_gvisor_secret_files(
        &workdir.path().join("stage"),
        &mounts,
        &crate::container::ProcessIdentity::root(),
    );

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("O_NOFOLLOW"),
        "gVisor secret staging must reject symlink sources"
    );
}
1533
/// Source-level check: `setup_and_exec` calls `mount_secrets_inmemory(` and
/// never `mount_secrets(&`.
#[test]
fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
    let setup_body = extract_fn_body(include_str!("runtime.rs"), "fn setup_and_exec");

    let uses_inmemory = setup_body.contains("mount_secrets_inmemory(");
    assert!(
        uses_inmemory,
        "setup_and_exec must use in-memory secret mounting"
    );

    let uses_bind_mount = setup_body.contains("mount_secrets(&");
    assert!(
        !uses_bind_mount,
        "setup_and_exec must not bind-mount secrets from the host"
    );
}
1547
/// Source-level check: `setup_and_exec_gvisor_oci` mentions
/// `with_inmemory_secret_mounts` and never `with_secret_mounts`.
#[test]
fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
    let gvisor_body = extract_fn_body(
        include_str!("gvisor_setup.rs"),
        "fn setup_and_exec_gvisor_oci",
    );

    let uses_inmemory = gvisor_body.contains("with_inmemory_secret_mounts");
    assert!(
        uses_inmemory,
        "gVisor setup must use the tmpfs-backed secret staging path"
    );

    let uses_bind_mount = gvisor_body.contains("with_secret_mounts");
    assert!(
        !uses_bind_mount,
        "gVisor setup must not bind-mount host secret paths"
    );
}
1561
/// Source-level check that each listed function contains its
/// `assert_single_threaded_for_fork(..)` call.
#[test]
fn test_native_fork_sites_assert_single_threaded() {
    let runtime_source = include_str!("runtime.rs");
    let exec_source = include_str!("exec.rs");

    // (source text, function signature, required call, failure message)
    let fork_sites = [
        (
            runtime_source,
            "fn create_internal",
            "assert_single_threaded_for_fork(\"container create fork\")",
            "create_internal must assert single-threaded before fork",
        ),
        (
            runtime_source,
            "fn setup_and_exec",
            "assert_single_threaded_for_fork(\"PID namespace init fork\")",
            "PID namespace setup must assert single-threaded before fork",
        ),
        (
            exec_source,
            "fn run_as_init",
            "assert_single_threaded_for_fork(\"init supervisor fork\")",
            "run_as_init must assert single-threaded before fork",
        ),
    ];

    for (source, signature, required_call, message) in fork_sites {
        let body = extract_fn_body(source, signature);
        assert!(body.contains(required_call), "{}", message);
    }
}
1584
/// Source-level check: `run_as_init` does not call
/// `Self::apply_process_identity_to_current_process(` and does contain
/// `self.exec_command()?`.
#[test]
fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
    let init_body = extract_fn_body(include_str!("exec.rs"), "fn run_as_init");

    let drops_identity_early =
        init_body.contains("Self::apply_process_identity_to_current_process(");
    assert!(
        !drops_identity_early,
        "run_as_init must not drop identity before the supervisor fork"
    );

    let routes_through_exec = init_body.contains("self.exec_command()?");
    assert!(
        routes_through_exec,
        "workload child must still route through exec_command for identity application"
    );
}
1598
/// After `cleanup_gvisor_artifacts`, the artifact directory created for the
/// same id no longer exists.
#[test]
fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
    let container_id = "cleanup-test";
    let dir = Container::gvisor_artifact_dir(container_id);
    std::fs::create_dir_all(&dir).unwrap();
    std::fs::write(dir.join("config.json"), "{}").unwrap();

    Container::cleanup_gvisor_artifacts(container_id).unwrap();

    assert!(!dir.exists());
}
1608
/// Source-level check that the text following `fn health_check_loop` mentions
/// an `AtomicBool` cancel flag and consults it while sleeping.
///
/// Only a window of at most 2500 bytes after the signature is inspected. The
/// window is clamped to the end of the file and to a character boundary, so a
/// short file or a multi-byte character at the cut cannot trigger a slice
/// panic in place of a real assertion result.
#[test]
fn test_health_check_loop_supports_cancellation() {
    let source = include_str!("health.rs");
    let fn_start = source.find("fn health_check_loop").unwrap();

    let mut fn_end = source.len().min(fn_start + 2500);
    while !source.is_char_boundary(fn_end) {
        fn_end -= 1;
    }
    let fn_body = &source[fn_start..fn_end];

    assert!(
        fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
        "health_check_loop must accept an AtomicBool cancellation flag"
    );
    assert!(
        fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
        "health_check_loop must check cancellation during sleep intervals"
    );
}
1627
/// Source-level check that neither the readiness probe nor the health-check
/// loop contains `Command::new(&nsenter_bin)`.
///
/// Each check looks at a bounded window after the function signature (2500
/// and 2200 bytes). The windows are clamped to the end of the file and to a
/// character boundary so slicing cannot panic.
#[test]
fn test_runtime_probes_do_not_spawn_host_nsenter() {
    /// Returns at most `len` bytes of `source` starting at `start`, shortened
    /// to stay inside the string and end on a character boundary.
    fn window(source: &str, start: usize, len: usize) -> &str {
        let mut end = source.len().min(start + len);
        while !source.is_char_boundary(end) {
            end -= 1;
        }
        &source[start..end]
    }

    let source = include_str!("health.rs");

    let readiness_start = source.find("fn run_readiness_probe").unwrap();
    let readiness_body = window(source, readiness_start, 2500);
    assert!(
        !readiness_body.contains("Command::new(&nsenter_bin)"),
        "readiness probes must not execute via host nsenter"
    );

    let health_start = source.find("fn health_check_loop").unwrap();
    let health_body = window(source, health_start, 2200);
    assert!(
        !health_body.contains("Command::new(&nsenter_bin)"),
        "health checks must not execute via host nsenter"
    );
}
1647
/// Source-level check that the first 600 bytes (at most) after
/// `fn prepare_oci_mountpoints` contain no `.expect(` call.
///
/// The window is clamped to the end of the file and to a character boundary
/// so slicing cannot panic.
#[test]
fn test_oci_mount_strip_prefix_no_expect() {
    let source = include_str!("gvisor_setup.rs");
    let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();

    let mut fn_end = source.len().min(fn_start + 600);
    while !source.is_char_boundary(fn_end) {
        fn_end -= 1;
    }
    let fn_body = &source[fn_start..fn_end];

    assert!(
        !fn_body.contains(".expect("),
        "prepare_oci_mountpoints must not use expect() — return Err instead"
    );
}
1660
/// Source-level check on the first 500 bytes (at most) after
/// `fn notify_namespace_ready`.
///
/// The window is clamped to the end of the file and to a character boundary
/// so slicing cannot panic.
#[test]
fn test_notify_namespace_ready_validates_write_length() {
    let source = include_str!("runtime.rs");
    let fn_start = source.find("fn notify_namespace_ready").unwrap();

    let mut fn_end = source.len().min(fn_start + 500);
    while !source.is_char_boundary(fn_end) {
        fn_end -= 1;
    }
    let fn_body = &source[fn_start..fn_end];

    // NOTE(review): `contains("4")` is satisfied by any digit 4 in the window,
    // so this assertion is close to vacuous. Tighten it once the intended
    // pattern in notify_namespace_ready is confirmed.
    assert!(
        fn_body.contains("written")
            || fn_body.contains("4")
            || fn_body.contains("payload.len()"),
        "notify_namespace_ready must validate complete write of all 4 bytes"
    );
}
1675
/// Source-level check that the 2000 bytes (at most) after the
/// `12b. RLIMIT backstop` marker mention `is_production` and `return Err`.
///
/// The window is clamped to the end of the file and to a character boundary
/// so slicing cannot panic.
#[test]
fn test_rlimit_failures_fatal_in_production() {
    let source = include_str!("runtime.rs");
    // NOTE(review): if this test module is itself part of runtime.rs, the
    // marker literal below also occurs in the included text; the check relies
    // on the real marker appearing earlier in the file -- confirm.
    let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();

    let mut rlimit_end = source.len().min(rlimit_start + 2000);
    while !source.is_char_boundary(rlimit_end) {
        rlimit_end -= 1;
    }
    let rlimit_section = &source[rlimit_start..rlimit_end];

    assert!(
        rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
        "RLIMIT failures must return Err in production mode"
    );
}
1687
/// Source-level check that the 500 bytes (at most) after the first
/// `TcpPort(port)` occurrence do not mention `/dev/tcp`.
///
/// The window is clamped to the end of the file and to a character boundary
/// so slicing cannot panic.
#[test]
fn test_tcp_readiness_probe_uses_portable_check() {
    let source = include_str!("health.rs");
    let probe_start = source.find("TcpPort(port)").unwrap();

    let mut probe_end = source.len().min(probe_start + 500);
    while !source.is_char_boundary(probe_end) {
        probe_end -= 1;
    }
    let probe_body = &source[probe_start..probe_end];

    assert!(
        !probe_body.contains("/dev/tcp"),
        "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
    );
}
1700}