1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4 ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets_inmemory, mount_volumes, snapshot_context_dir,
10 switch_root, verify_context_manifest, verify_rootfs_attestation, FilesystemState,
11 LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeDriver, BridgeNetwork, NatBackend, NetworkMode, UserspaceNetwork};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks,
18 SeccompDenyLogger, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::OwnedFd;
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// Launcher for a single container described by a [`ContainerConfig`].
pub struct Container {
    /// Configuration the container is created from.
    pub(super) config: ContainerConfig,
    /// Path to the `runsc` binary. `None` when constructed through `new`; the forked
    /// child in `create_internal` receives the resolved path when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
47
/// A container whose child process has been forked and registered but whose lifecycle
/// has not yet been driven to completion; produced by `Container::create_internal`.
pub struct CreatedContainer {
    /// Effective configuration (the caller's configuration after create-time adjustments).
    pub(super) config: ContainerConfig,
    /// State manager used to persist `state`.
    pub(super) state_mgr: ContainerStateManager,
    /// Persisted state record; saved with status `Created` during creation.
    pub(super) state: ContainerState,
    /// PID returned by `fork()` in the creating process. This may differ from the PID
    /// recorded in `state`, which is the one the child reports over the sync pipe.
    pub(super) child: Pid,
    /// Cgroup holding the container, if one could be created and configured.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network driver, present only when bridge networking was set up.
    pub(super) network_driver: Option<BridgeDriver>,
    /// Optional seccomp trace reader started during creation.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Optional seccomp deny logger started during creation.
    pub(super) deny_logger: Option<SeccompDenyLogger>,
    /// Path of the exec FIFO when the child's exec is deferred until start.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Clone of the lifecycle tracing span opened during creation.
    pub(super) _lifecycle_span: tracing::Span,
}
63
64impl Container {
65 pub fn new(config: ContainerConfig) -> Self {
66 Self {
67 config,
68 runsc_path: None,
69 }
70 }
71
72 pub fn run(&self) -> Result<i32> {
74 self.create_internal(false)?.start()
75 }
76
77 pub fn create(&self) -> Result<CreatedContainer> {
81 self.create_internal(true)
82 }
83
84 fn sanitize_fds() {
89 const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
92 let ret =
94 unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
95 if ret == 0 {
96 return;
97 }
98 if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
102 let fds: Vec<i32> = entries
103 .flatten()
104 .filter_map(|entry| entry.file_name().into_string().ok())
105 .filter_map(|s| s.parse::<i32>().ok())
106 .filter(|&fd| fd > 2)
107 .collect();
108 for fd in fds {
109 unsafe { libc::close(fd) };
110 }
111 }
112 }
113
114 pub(crate) fn assert_single_threaded_for_fork(context: &str) -> Result<()> {
115 let thread_count = std::fs::read_to_string("/proc/self/status")
116 .ok()
117 .and_then(|s| {
118 s.lines()
119 .find(|line| line.starts_with("Threads:"))
120 .and_then(|line| line.split_whitespace().nth(1))
121 .and_then(|count| count.parse::<u32>().ok())
122 });
123
124 if thread_count == Some(1) {
125 return Ok(());
126 }
127
128 Err(NucleusError::ExecError(format!(
129 "{} requires a single-threaded process before fork, found {:?} threads",
130 context, thread_count
131 )))
132 }
133
/// Shared implementation behind `run` and `create`.
///
/// Works on a clone of `self.config`: enables user namespaces where required, runs the
/// validation and guard helpers, rejects duplicate container names, optionally creates
/// the exec FIFO, sets up the cgroup, and then forks.
///
/// * Parent: waits for the child to report the container PID over a pipe, persists the
///   state as `Created`, writes the optional pid-file, attaches the cgroup, sets up
///   bridge networking, and returns the [`CreatedContainer`]. If any of those steps
///   fails the forked child is killed and reaped before the error is returned.
/// * Child: never returns; it runs `setup_and_exec` and exits with status 1 on failure.
///
/// `defer_exec_until_start` controls whether an exec FIFO is created and handed to the
/// child so that it waits for a start request before running the workload.
///
/// # Errors
/// Propagates validation, state-manager, FIFO, cgroup (production mode), gVisor path
/// resolution, pipe, fork and parent-side setup errors.
fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
    // Span for the whole lifecycle; a clone of it is stored in the returned container.
    let lifecycle_span = info_span!(
        "container.lifecycle",
        container.id = %self.config.id,
        container.name = %self.config.name,
        runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
    );
    let _enter = lifecycle_span.enter();

    info!(
        "Creating container: {} (ID: {})",
        self.config.name, self.config.id
    );
    audit(
        &self.config.id,
        &self.config.name,
        AuditEventType::ContainerStart,
        format!(
            "command={:?} mode={:?} runtime={}",
            crate::audit::redact_command(&self.config.command),
            self.config.service_mode,
            if self.config.use_gvisor {
                "gvisor"
            } else {
                "native"
            }
        ),
    );

    let is_root = nix::unistd::Uid::effective().is_root();
    // All adjustments below are applied to a private copy, never to `self.config`.
    let mut config = self.config.clone();

    // Non-root callers without an explicit user-namespace config get rootless mode.
    if !is_root && config.user_ns_config.is_none() {
        info!("Not running as root, automatically enabling rootless mode");
        config.namespaces.user = true;
        config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
    }

    // Root without a user namespace: production mode turns on UID remapping, any other
    // mode only warns.
    if is_root && !config.namespaces.user {
        if config.service_mode == ServiceMode::Production {
            info!("Running as root in production mode: enabling user namespace with UID remapping");
            config.namespaces.user = true;
            config.user_ns_config =
                Some(crate::isolation::UserNamespaceConfig::root_remapped());
        } else {
            warn!(
                "Running as root WITHOUT user namespace isolation. \
                 Container processes will run as real host UID 0. \
                 Use --user-ns or production mode for UID remapping."
            );
        }
    }

    // A console socket is accepted but, per the warning text, not acted on.
    if let Some(ref socket_path) = config.console_socket {
        warn!(
            "Console socket {} accepted but terminal forwarding is not yet implemented",
            socket_path.display()
        );
    }

    // Validation and guards; the two guard helpers may mutate `config`.
    config.validate_production_mode()?;
    Self::assert_kernel_lockdown(&config)?;

    Self::apply_network_mode_guards(&mut config, is_root)?;
    Self::apply_trust_level_guards(&mut config)?;
    config.validate_runtime_support()?;

    // The kernel NAT backend for bridge networking is refused for non-root callers.
    if let NetworkMode::Bridge(ref bridge_config) = config.network {
        let backend =
            bridge_config.selected_nat_backend(is_root, config.user_ns_config.is_some());
        if backend == NatBackend::Kernel && !is_root {
            return Err(NucleusError::NetworkError(
                "Kernel bridge networking requires root. Use --nat-backend userspace or leave the default auto selection for rootless/native containers."
                    .to_string(),
            ));
        }
    }

    let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;

    // Reject a name that is already present in the state store. A failure to list
    // states is ignored and skips this check.
    if let Ok(all_states) = state_mgr.list_states() {
        if all_states.iter().any(|s| s.name == config.name) {
            return Err(NucleusError::ConfigError(format!(
                "A container named '{}' already exists; use a different --name, \
                 or remove the stale state with 'nucleus delete'",
                config.name
            )));
        }
    }

    // With deferred exec, create the FIFO (owner read/write only) that the child later
    // opens to wait for the start request.
    let exec_fifo = if defer_exec_until_start {
        let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
        nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
            NucleusError::ExecError(format!(
                "Failed to create exec FIFO {:?}: {}",
                exec_fifo, e
            ))
        })?;
        Some(exec_fifo)
    } else {
        None
    };

    // Cgroup setup: any failure is fatal in production mode; otherwise the container
    // continues without a cgroup.
    let cgroup_name = format!("nucleus-{}", config.id);
    let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
        Ok(mut cgroup) => {
            match cgroup.set_limits(&config.limits) {
                Ok(_) => {
                    info!("Created cgroup with resource limits");
                    Some(cgroup)
                }
                Err(e) => {
                    if config.service_mode == ServiceMode::Production {
                        let _ = cgroup.cleanup();
                        return Err(NucleusError::CgroupError(format!(
                            "Production mode requires cgroup resource enforcement, but \
                             applying limits failed: {}",
                            e
                        )));
                    }
                    warn!("Failed to set cgroup limits: {}", e);
                    let _ = cgroup.cleanup();
                    None
                }
            }
        }
        Err(e) => {
            if config.service_mode == ServiceMode::Production {
                return Err(NucleusError::CgroupError(format!(
                    "Production mode requires cgroup resource enforcement, but \
                     cgroup creation failed: {}",
                    e
                )));
            }

            // With a user-namespace config, only warn if limits were actually requested.
            if config.user_ns_config.is_some() {
                if config.limits.memory_bytes.is_some()
                    || config.limits.cpu_quota_us.is_some()
                    || config.limits.pids_max.is_some()
                {
                    warn!(
                        "Running in rootless mode: requested resource limits cannot be \
                         enforced – cgroup creation requires root ({})",
                        e
                    );
                } else {
                    debug!("Running in rootless mode without cgroup resource limits");
                }
            } else {
                warn!(
                    "Failed to create cgroup (running without resource limits): {}",
                    e
                );
            }
            None
        }
    };

    // The runsc path is resolved here, before the fork, and handed to the child.
    let runsc_path = if config.use_gvisor {
        Some(GVisorRuntime::resolve_path().map_err(|e| {
            NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
        })?)
    } else {
        None
    };

    // Pipe over which the child reports the PID of the container process.
    let (ready_read, ready_write) = pipe().map_err(|e| {
        NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
    })?;

    // The fork below is only attempted after confirming the process has one thread.
    Self::assert_single_threaded_for_fork("container create fork")?;
    match unsafe { fork() }? {
        ForkResult::Parent { child } => {
            // The parent only reads from the pipe; drop the write end.
            drop(ready_write);
            info!("Forked child process: {}", child);

            // Parent-side setup is wrapped in a closure so that any failure funnels
            // into the single kill-and-reap handler below.
            let parent_setup = || -> Result<CreatedContainer> {
                // PID reported by the child; `setup_and_exec` may fork again, so this
                // can differ from `child`.
                let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;

                let cgroup_path = cgroup_opt
                    .as_ref()
                    .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
                // NOTE(review): divides by `cpu_period_us`; presumably validated as
                // non-zero elsewhere — confirm.
                let cpu_millicores = config
                    .limits
                    .cpu_quota_us
                    .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
                let mut state = ContainerState::new(ContainerStateParams {
                    id: config.id.clone(),
                    name: config.name.clone(),
                    pid: target_pid,
                    command: config.command.clone(),
                    memory_limit: config.limits.memory_bytes,
                    cpu_limit: cpu_millicores,
                    using_gvisor: config.use_gvisor,
                    rootless: config.user_ns_config.is_some(),
                    cgroup_path,
                    process_uid: config.process_identity.uid,
                    process_gid: config.process_identity.gid,
                    additional_gids: config.process_identity.additional_gids.clone(),
                });
                state.config_hash = config.config_hash;
                state.bundle_path =
                    config.rootfs_path.as_ref().map(|p| p.display().to_string());

                let mut network_driver: Option<BridgeDriver> = None;
                let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
                let deny_logger = Self::maybe_start_seccomp_deny_logger(&config, target_pid)?;

                // Persist the record as `Created` before the remaining setup steps.
                state.status = OciStatus::Created;
                state_mgr.save_state(&state)?;

                if let Some(ref pid_path) = config.pid_file {
                    std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                        NucleusError::ConfigError(format!(
                            "Failed to write pid-file '{}': {}",
                            pid_path.display(),
                            e
                        ))
                    })?;
                    info!("Wrote PID {} to {}", target_pid, pid_path.display());
                }

                if let Some(ref mut cgroup) = cgroup_opt {
                    cgroup.attach_process(target_pid)?;
                }

                // Bridge networking and egress policy: failures are fatal in production
                // mode and downgraded to warnings otherwise.
                if let NetworkMode::Bridge(ref bridge_config) = config.network {
                    match BridgeDriver::setup_with_id(
                        target_pid,
                        bridge_config,
                        &config.id,
                        is_root,
                        config.user_ns_config.is_some(),
                    ) {
                        Ok(net) => {
                            if let Some(ref egress) = config.egress_policy {
                                if let Err(e) = net.apply_egress_policy(
                                    target_pid,
                                    egress,
                                    config.user_ns_config.is_some(),
                                ) {
                                    if config.service_mode == ServiceMode::Production {
                                        return Err(NucleusError::NetworkError(format!(
                                            "Failed to apply egress policy: {}",
                                            e
                                        )));
                                    }
                                    warn!("Failed to apply egress policy: {}", e);
                                }
                            }
                            network_driver = Some(net);
                        }
                        Err(e) => {
                            if config.service_mode == ServiceMode::Production {
                                return Err(e);
                            }
                            warn!("Failed to set up bridge networking: {}", e);
                        }
                    }
                }

                info!(
                    "Container {} created (child pid {}), waiting for start",
                    config.id, target_pid
                );

                Ok(CreatedContainer {
                    config,
                    state_mgr,
                    state,
                    child,
                    cgroup_opt,
                    network_driver,
                    trace_reader,
                    deny_logger,
                    exec_fifo_path: exec_fifo,
                    _lifecycle_span: lifecycle_span.clone(),
                })
            };

            // On any parent-side failure, kill the forked child and reap it so no
            // zombie is left, then return the original error.
            parent_setup().map_err(|e| {
                let _ = kill(child, Signal::SIGKILL);
                let _ = waitpid(child, None);
                e
            })
        }
        ForkResult::Child => {
            // The child only writes to the pipe; drop the read end.
            drop(ready_read);
            Self::sanitize_fds();
            // Fresh `Container` carrying the adjusted config and the resolved runsc path.
            let temp_container = Container { config, runsc_path };
            match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
                // This code treats a successful return as impossible.
                Ok(_) => unreachable!(),
                Err(e) => {
                    error!("Container setup failed: {}", e);
                    std::process::exit(1);
                }
            }
        }
    }
}
461
462 pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
465 let state_mgr = ContainerStateManager::new_with_root(state_root)?;
466 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
467 if !fifo_path.exists() {
468 return Err(NucleusError::ConfigError(format!(
469 "No exec FIFO found for container {}; is it in 'created' state?",
470 container_id
471 )));
472 }
473
474 let file = std::fs::File::open(&fifo_path)
476 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
477 let mut buf = [0u8; 1];
478 std::io::Read::read(&mut &file, &mut buf)
479 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
480 drop(file);
481
482 let _ = std::fs::remove_file(&fifo_path);
483
484 let mut state = state_mgr.resolve_container(container_id)?;
486 state.status = OciStatus::Running;
487 state_mgr.save_state(&state)?;
488
489 Ok(())
490 }
491
/// Child-side setup: builds the isolated environment and finally runs the workload.
///
/// Order of operations for the native (non-gVisor) path: optional context snapshot,
/// namespace unshare (plus an extra fork when a PID namespace is requested), PID
/// notification to the parent, `no_new_privs`, hostname, filesystem construction and
/// root switch, OCI hooks, capability and identity handling, rlimits, seccomp, Landlock,
/// the optional exec-FIFO handshake, and then `run_as_init` or `exec_command`.
///
/// * `ready_pipe` - write end of the sync pipe used to send the container PID to the
///   parent.
/// * `exec_fifo` - when set, the FIFO through which the start request is awaited.
///
/// gVisor configurations report the current PID and delegate to `setup_and_exec_gvisor`.
///
/// # Errors
/// Propagates the first failure from any step; in production mode several otherwise
/// tolerated failures (rlimits, missing hardening) are errors as well.
fn setup_and_exec(
    &self,
    ready_pipe: Option<OwnedFd>,
    exec_fifo: Option<PathBuf>,
) -> Result<()> {
    let is_rootless = self.config.user_ns_config.is_some();
    let allow_degraded_security = Self::allow_degraded_security(&self.config);
    // Snapshot the context directory up front so the populated copy can be verified
    // against it later.
    let context_manifest = if self.config.verify_context_integrity {
        self.config
            .context_dir
            .as_ref()
            .map(|dir| snapshot_context_dir(dir))
            .transpose()?
    } else {
        None
    };

    // State trackers advanced via `transition` as each phase completes.
    let mut fs_state = FilesystemState::Unmounted;
    let mut sec_state = SecurityState::Privileged;

    // gVisor path: report this process's PID and hand over to the gVisor setup.
    if self.config.use_gvisor {
        if let Some(fd) = ready_pipe {
            Self::notify_namespace_ready(&fd, std::process::id())?;
        }
        return self.setup_and_exec_gvisor();
    }

    let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
    if let Some(user_config) = &self.config.user_ns_config {
        namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
    }
    namespace_mgr.unshare_namespaces()?;

    if self.config.namespaces.pid {
        // Fork once more after the unshare; the new child continues the setup while
        // this process only reports the child's PID and mirrors its exit status.
        Self::assert_single_threaded_for_fork("PID namespace init fork")?;
        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                if let Some(fd) = ready_pipe {
                    Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
                }
                std::process::exit(Self::wait_for_pid_namespace_child(child));
            }
            ForkResult::Child => {
                // Falls through to the rest of the setup below.
            }
        }
    } else if let Some(fd) = ready_pipe {
        // No extra fork: this process is the container process.
        Self::notify_namespace_ready(&fd, std::process::id())?;
    }

    namespace_mgr.enter()?;

    // no_new_privs is applied before any mount work (see the audit message).
    self.enforce_no_new_privs()?;
    audit(
        &self.config.id,
        &self.config.name,
        AuditEventType::NoNewPrivsSet,
        "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
    );

    if let Some(hostname) = &self.config.hostname {
        namespace_mgr.set_hostname(hostname)?;
    }

    // Base directory for the per-container runtime dir: /run/nucleus for root,
    // otherwise `<runtime dir>/nucleus`, falling back to the system temp dir.
    let runtime_base = if nix::unistd::Uid::effective().is_root() {
        std::path::PathBuf::from("/run/nucleus")
    } else {
        dirs::runtime_dir()
            .map(|d| d.join("nucleus"))
            .unwrap_or_else(std::env::temp_dir)
    };
    // Creation errors are ignored here; `tempdir_in` below fails if the base is unusable.
    let _ = std::fs::create_dir_all(&runtime_base);
    let runtime_dir = Builder::new()
        .prefix("nucleus-runtime-")
        .tempdir_in(&runtime_base)
        .map_err(|e| {
            NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
        })?;
    let container_root = runtime_dir.path().to_path_buf();
    // Second argument is presumably a size limit in bytes (1 GiB) — confirm in TmpfsMount.
    let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024));
    tmpfs.mount()?;
    fs_state = fs_state.transition(FilesystemState::Mounted)?;

    create_minimal_fs(&container_root)?;

    let dev_path = container_root.join("dev");
    create_dev_nodes(&dev_path, false)?;

    // Dedicated 64 MiB tmpfs for /dev/shm, mounted nosuid/nodev/noexec.
    let shm_path = dev_path.join("shm");
    std::fs::create_dir_all(&shm_path).map_err(|e| {
        NucleusError::FilesystemError(format!("Failed to create /dev/shm: {}", e))
    })?;
    nix::mount::mount(
        Some("shm"),
        &shm_path,
        Some("tmpfs"),
        nix::mount::MsFlags::MS_NOSUID
            | nix::mount::MsFlags::MS_NODEV
            | nix::mount::MsFlags::MS_NOEXEC,
        Some("mode=1777,size=64m"),
    )
    .map_err(|e| {
        NucleusError::FilesystemError(format!("Failed to mount tmpfs on /dev/shm: {}", e))
    })?;
    debug!("Mounted tmpfs on /dev/shm");

    // Populate <root>/context and, if a snapshot was taken, verify the result against it.
    if let Some(context_dir) = &self.config.context_dir {
        let context_dest = container_root.join("context");
        LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
        if let Some(expected) = &context_manifest {
            verify_context_manifest(expected, &context_dest)?;
        }
    }
    fs_state = fs_state.transition(FilesystemState::Populated)?;

    // Either bind-mount the configured rootfs (optionally attested first) or bind-mount
    // host paths.
    if let Some(ref rootfs_path) = self.config.rootfs_path {
        if self.config.verify_rootfs_attestation {
            verify_rootfs_attestation(rootfs_path)?;
        }
        bind_mount_rootfs(&container_root, rootfs_path)?;
    } else {
        bind_mount_host_paths(&container_root, is_rootless)?;
    }

    mount_volumes(&container_root, &self.config.volumes)?;

    // Bridge networking: provide resolv.conf. With the userspace NAT backend and no
    // configured DNS servers, the userspace network's default DNS server is used.
    if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
        let bridge_dns = if bridge_config.selected_nat_backend(!is_rootless, is_rootless)
            == NatBackend::Userspace
            && bridge_config.dns.is_empty()
        {
            vec![UserspaceNetwork::default_dns_server(&bridge_config.subnet)?]
        } else {
            bridge_config.dns.clone()
        };
        if self.config.rootfs_path.is_some() {
            BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_dns)?;
        } else {
            BridgeNetwork::write_resolv_conf(&container_root, &bridge_dns)?;
        }
    }

    mount_secrets_inmemory(
        &container_root,
        &self.config.secrets,
        &self.config.process_identity,
    )?;

    // procfs: the `hide_pids` argument is enabled only in production mode.
    let proc_path = container_root.join("proc");
    let hide_pids = self.config.service_mode == ServiceMode::Production;
    mount_procfs(
        &proc_path,
        is_rootless,
        self.config.proc_readonly,
        hide_pids,
    )?;

    mask_proc_paths(
        &proc_path,
        self.config.service_mode == ServiceMode::Production,
    )?;

    // createRuntime hooks run before the root switch, with status `Creating`.
    if let Some(ref hooks) = self.config.hooks {
        if !hooks.create_runtime.is_empty() {
            let hook_state = OciContainerState {
                oci_version: "1.0.2".to_string(),
                id: self.config.id.clone(),
                status: OciStatus::Creating,
                pid: std::process::id(),
                bundle: String::new(),
            };
            OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
        }
    }

    switch_root(&container_root, self.config.allow_chroot_fallback)?;
    fs_state = fs_state.transition(FilesystemState::Pivoted)?;
    debug!("Filesystem state: {:?}", fs_state);

    audit_mounts(self.config.service_mode == ServiceMode::Production)?;
    audit(
        &self.config.id,
        &self.config.name,
        AuditEventType::MountAuditPassed,
        "all mount flags verified",
    );

    // createContainer hooks run after the root switch, with status `Created`.
    if let Some(ref hooks) = self.config.hooks {
        if !hooks.create_container.is_empty() {
            let hook_state = OciContainerState {
                oci_version: "1.0.2".to_string(),
                id: self.config.id.clone(),
                status: OciStatus::Created,
                pid: std::process::id(),
                bundle: String::new(),
            };
            OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
        }
    }

    // Capabilities: apply the configured policy file if present, otherwise drop
    // everything. The process identity is applied in both branches.
    let mut cap_mgr = CapabilityManager::new();
    if let Some(ref policy_path) = self.config.caps_policy {
        let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
            policy_path,
            self.config.caps_policy_sha256.as_deref(),
        )?;
        if self.config.service_mode == ServiceMode::Production {
            policy.validate_production()?;
        }
        policy.apply(&mut cap_mgr)?;
        Self::apply_process_identity_to_current_process(
            &self.config.process_identity,
            self.config.user_ns_config.is_some(),
        )?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::CapabilitiesDropped,
            format!("capability policy applied from {:?}", policy_path),
        );
    } else {
        // Bounding set first, then the identity switch, then the final drop.
        cap_mgr.drop_bounding_set()?;

        Self::apply_process_identity_to_current_process(
            &self.config.process_identity,
            self.config.user_ns_config.is_some(),
        )?;

        cap_mgr.finalize_drop()?;

        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::CapabilitiesDropped,
            "all capabilities dropped including bounding set",
        );
    }
    sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;

    // Resource limits: failures are fatal in production mode and warnings otherwise.
    {
        let is_production = self.config.service_mode == ServiceMode::Production;

        if let Some(nproc_limit) = self.config.limits.pids_max {
            let rlim_nproc = libc::rlimit {
                rlim_cur: nproc_limit,
                rlim_max: nproc_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
                        nproc_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
            }
        }

        // Fixed open-file limit of 1024 (soft and hard).
        let rlim_nofile = libc::rlimit {
            rlim_cur: 1024,
            rlim_max: 1024,
        };
        if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
            let err = std::io::Error::last_os_error();
            if is_production {
                return Err(NucleusError::SeccompError(format!(
                    "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
                    err
                )));
            }
            warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
        }

        // Locked-memory limit defaults to 64 KiB when not configured.
        let memlock_limit: u64 = self.config.limits.memlock_bytes.unwrap_or(64 * 1024);
        let rlim_memlock = libc::rlimit {
            rlim_cur: memlock_limit,
            rlim_max: memlock_limit,
        };
        if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
            let err = std::io::Error::last_os_error();
            if is_production {
                return Err(NucleusError::SeccompError(format!(
                    "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
                    memlock_limit, err
                )));
            }
            warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
        }
    }

    CapabilityManager::verify_no_namespace_caps(
        self.config.service_mode == ServiceMode::Production,
    )?;

    // Seccomp: trace mode installs the trace filter; enforce mode uses the configured
    // profile file or a filter selected from the network mode.
    use crate::container::config::SeccompMode;
    let mut seccomp_mgr = SeccompManager::new();
    let allow_network = !matches!(self.config.network, NetworkMode::None);
    let seccomp_applied = match self.config.seccomp_mode {
        SeccompMode::Trace => {
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::SeccompApplied,
                "seccomp trace mode: allow-all + LOG",
            );
            seccomp_mgr.apply_trace_filter()?
        }
        SeccompMode::Enforce => {
            if let Some(ref profile_path) = self.config.seccomp_profile {
                audit(
                    &self.config.id,
                    &self.config.name,
                    AuditEventType::SeccompProfileLoaded,
                    format!("path={:?}", profile_path),
                );
                seccomp_mgr.apply_profile_from_file(
                    profile_path,
                    self.config.seccomp_profile_sha256.as_deref(),
                    self.config.seccomp_log_denied,
                )?
            } else {
                seccomp_mgr.apply_filter_for_network_mode(
                    allow_network,
                    allow_degraded_security,
                    self.config.seccomp_log_denied,
                    &self.config.seccomp_allow_syscalls,
                )?
            }
        }
    };
    // An unapplied filter is only tolerated when degraded security is allowed.
    if seccomp_applied {
        sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::SeccompApplied,
            format!("network={}", allow_network),
        );
    } else if !allow_degraded_security {
        return Err(NucleusError::SeccompError(
            "Seccomp filter is required but was not enforced".to_string(),
        ));
    } else {
        warn!("Seccomp not enforced; container is running with degraded hardening");
    }

    // Landlock: a policy file if configured, otherwise the built-in container policy
    // with every volume destination added as a read-write path.
    let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
        let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
            policy_path,
            self.config.landlock_policy_sha256.as_deref(),
        )?;
        if self.config.service_mode == ServiceMode::Production {
            policy.validate_production()?;
        }
        policy.apply(allow_degraded_security)?
    } else {
        let mut landlock_mgr = LandlockManager::new();
        landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
        for vol in &self.config.volumes {
            landlock_mgr.add_rw_path(&vol.dest.to_string_lossy());
        }
        landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
    };
    // The state only reaches `Locked` when both seccomp and Landlock are active and
    // seccomp is not in trace mode.
    if seccomp_applied && landlock_applied {
        sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
        if self.config.seccomp_mode == SeccompMode::Trace {
            warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
        } else {
            sec_state = sec_state.transition(SecurityState::Locked)?;
        }
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::LandlockApplied,
            if self.config.seccomp_mode == SeccompMode::Trace {
                "landlock applied, but seccomp in trace mode – not locked".to_string()
            } else {
                "security state locked: all hardening layers active".to_string()
            },
        );
    } else if !allow_degraded_security {
        return Err(NucleusError::LandlockError(
            "Landlock policy is required but was not enforced".to_string(),
        ));
    } else {
        warn!("Security state not locked; one or more hardening controls are inactive");
    }
    debug!("Security state: {:?}", sec_state);

    // Deferred start: open the exec FIFO for writing and send one sync byte. Per the
    // debug messages this is where the child waits for the start request; the reading
    // side is `trigger_start` / `CreatedContainer::start`.
    if let Some(ref fifo_path) = exec_fifo {
        debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
        let file = std::fs::OpenOptions::new()
            .write(true)
            .open(fifo_path)
            .map_err(|e| {
                NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
            })?;
        std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
            NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
        })?;
        drop(file);
        debug!("Exec FIFO released, proceeding to exec");
    }

    // startContainer hooks run last before the workload, with status `Running`.
    if let Some(ref hooks) = self.config.hooks {
        if !hooks.start_container.is_empty() {
            let hook_state = OciContainerState {
                oci_version: "1.0.2".to_string(),
                id: self.config.id.clone(),
                status: OciStatus::Running,
                pid: std::process::id(),
                bundle: String::new(),
            };
            OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
        }
    }

    // Production mode with a PID namespace goes through `run_as_init`; everything else
    // calls `exec_command` directly.
    if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
        return self.run_as_init();
    }

    self.exec_command()?;

    Ok(())
}
1002
1003 pub(super) fn setup_signal_forwarding_static(
1008 child: Pid,
1009 ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
1010 let mut set = SigSet::empty();
1011 for signal in [
1012 Signal::SIGTERM,
1013 Signal::SIGINT,
1014 Signal::SIGHUP,
1015 Signal::SIGQUIT,
1016 Signal::SIGUSR1,
1017 Signal::SIGUSR2,
1018 ] {
1019 set.add(signal);
1020 }
1021
1022 let unblock_set = set;
1023 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
1024 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
1025 })?;
1026
1027 let stop = Arc::new(AtomicBool::new(false));
1028 let stop_clone = stop.clone();
1029 let handle = std::thread::Builder::new()
1030 .name("sig-forward".to_string())
1031 .spawn(move || {
1032 loop {
1034 if let Ok(signal) = unblock_set.wait() {
1035 if stop_clone.load(Ordering::Relaxed) {
1039 break;
1040 }
1041 let _ = kill(child, signal);
1042 }
1043 }
1044 })
1045 .map_err(|e| {
1046 let mut restore = SigSet::empty();
1049 for signal in [
1050 Signal::SIGTERM,
1051 Signal::SIGINT,
1052 Signal::SIGHUP,
1053 Signal::SIGQUIT,
1054 Signal::SIGUSR1,
1055 Signal::SIGUSR2,
1056 ] {
1057 restore.add(signal);
1058 }
1059 let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
1060 NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
1061 })?;
1062
1063 info!("Signal forwarding configured");
1064 Ok((stop, handle))
1065 }
1066
1067 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
1069 loop {
1070 match waitpid(child, None) {
1071 Ok(WaitStatus::Exited(_, code)) => {
1072 return Ok(code);
1073 }
1074 Ok(WaitStatus::Signaled(_, signal, _)) => {
1075 info!("Child killed by signal: {:?}", signal);
1076 return Ok(128 + signal as i32);
1077 }
1078 Err(nix::errno::Errno::EINTR) => {
1079 continue;
1080 }
1081 Err(e) => {
1082 return Err(NucleusError::ExecError(format!(
1083 "Failed to wait for child: {}",
1084 e
1085 )));
1086 }
1087 _ => {
1088 continue;
1089 }
1090 }
1091 }
1092 }
1093
1094 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1095 let mut pid_buf = [0u8; 4];
1096 loop {
1097 match read(ready_read, &mut pid_buf) {
1098 Err(nix::errno::Errno::EINTR) => continue,
1099 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1100 Ok(0) => {
1101 return Err(NucleusError::ExecError(format!(
1102 "Child {} exited before namespace initialization",
1103 child
1104 )))
1105 }
1106 Ok(_) => {
1107 return Err(NucleusError::ExecError(
1108 "Invalid namespace sync payload from child".to_string(),
1109 ))
1110 }
1111 Err(e) => {
1112 return Err(NucleusError::ExecError(format!(
1113 "Failed waiting for child namespace setup: {}",
1114 e
1115 )))
1116 }
1117 }
1118 }
1119 }
1120
1121 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1122 let payload = pid.to_ne_bytes();
1123 let mut written = 0;
1124 while written < payload.len() {
1125 let n = write(fd, &payload[written..]).map_err(|e| {
1126 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1127 })?;
1128 if n == 0 {
1129 return Err(NucleusError::ExecError(
1130 "Failed to notify namespace readiness: short write".to_string(),
1131 ));
1132 }
1133 written += n;
1134 }
1135 Ok(())
1136 }
1137
1138 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1139 loop {
1140 match waitpid(child, None) {
1141 Ok(WaitStatus::Exited(_, code)) => return code,
1142 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1143 Err(nix::errno::Errno::EINTR) => continue,
1144 Err(_) => return 1,
1145 _ => continue,
1146 }
1147 }
1148 }
1149}
1150
1151impl CreatedContainer {
1152 pub fn start(mut self) -> Result<i32> {
1155 let config = &self.config;
1156 let _enter = self._lifecycle_span.enter();
1157
1158 if let Some(exec_fifo_path) = &self.exec_fifo_path {
1161 let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
1162 NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
1163 })?;
1164 let mut buf = [0u8; 1];
1165 let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
1166 NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
1167 })?;
1168 if read != 1 {
1169 return Err(NucleusError::ExecError(
1170 "Exec FIFO closed before start signal was delivered".to_string(),
1171 ));
1172 }
1173 let _ = std::fs::remove_file(exec_fifo_path);
1174 }
1175
1176 self.state.status = OciStatus::Running;
1178 self.state_mgr.save_state(&self.state)?;
1179
1180 let target_pid = self.state.pid;
1181 let child = self.child;
1182
1183 let (sig_stop, sig_handle) =
1184 Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1185
1186 let mut sig_guard = SignalThreadGuard {
1188 stop: Some(sig_stop),
1189 handle: Some(sig_handle),
1190 };
1191
1192 if let Some(ref probe) = config.readiness_probe {
1194 let notify_socket = if config.sd_notify {
1195 std::env::var("NOTIFY_SOCKET").ok()
1196 } else {
1197 None
1198 };
1199 Container::run_readiness_probe(
1200 target_pid,
1201 &config.name,
1202 probe,
1203 config.user_ns_config.is_some(),
1204 config.use_gvisor,
1205 &config.process_identity,
1206 notify_socket.as_deref(),
1207 )?;
1208 }
1209
1210 let cancel_flag = Arc::new(AtomicBool::new(false));
1212 let health_handle = if let Some(ref hc) = config.health_check {
1213 if !hc.command.is_empty() {
1214 let hc = hc.clone();
1215 let pid = target_pid;
1216 let container_name = config.name.clone();
1217 let rootless = config.user_ns_config.is_some();
1218 let using_gvisor = config.use_gvisor;
1219 let process_identity = config.process_identity.clone();
1220 let cancel = cancel_flag.clone();
1221 Some(std::thread::spawn(move || {
1222 Container::health_check_loop(
1223 pid,
1224 &container_name,
1225 rootless,
1226 using_gvisor,
1227 &hc,
1228 &process_identity,
1229 &cancel,
1230 );
1231 }))
1232 } else {
1233 None
1234 }
1235 } else {
1236 None
1237 };
1238
1239 let mut health_guard = HealthThreadGuard {
1241 cancel: Some(cancel_flag),
1242 handle: health_handle,
1243 };
1244
1245 if let Some(ref hooks) = config.hooks {
1247 if !hooks.poststart.is_empty() {
1248 let hook_state = OciContainerState {
1249 oci_version: "1.0.2".to_string(),
1250 id: config.id.clone(),
1251 status: OciStatus::Running,
1252 pid: target_pid,
1253 bundle: String::new(),
1254 };
1255 OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1256 }
1257 }
1258
1259 let mut child_waited = false;
1260 let run_result: Result<i32> = (|| {
1261 let exit_code = Container::wait_for_child_static(child)?;
1262
1263 self.state.status = OciStatus::Stopped;
1265 let _ = self.state_mgr.save_state(&self.state);
1266
1267 child_waited = true;
1268 Ok(exit_code)
1269 })();
1270
1271 health_guard.stop();
1274 sig_guard.stop();
1275
1276 if let Some(ref hooks) = config.hooks {
1278 if !hooks.poststop.is_empty() {
1279 let hook_state = OciContainerState {
1280 oci_version: "1.0.2".to_string(),
1281 id: config.id.clone(),
1282 status: OciStatus::Stopped,
1283 pid: 0,
1284 bundle: String::new(),
1285 };
1286 OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1287 }
1288 }
1289
1290 if let Some(net) = self.network_driver.take() {
1291 if let Err(e) = net.cleanup() {
1292 warn!("Failed to cleanup container networking: {}", e);
1293 }
1294 }
1295
1296 if !child_waited {
1297 let _ = kill(child, Signal::SIGKILL);
1298 let _ = waitpid(child, None);
1299 }
1300
1301 if let Some(reader) = self.trace_reader.take() {
1302 reader.stop_and_flush();
1303 }
1304
1305 if let Some(logger) = self.deny_logger.take() {
1306 logger.stop();
1307 }
1308
1309 if let Some(cgroup) = self.cgroup_opt.take() {
1310 if let Err(e) = cgroup.cleanup() {
1311 warn!("Failed to cleanup cgroup: {}", e);
1312 }
1313 }
1314
1315 if config.use_gvisor {
1316 if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1317 warn!(
1318 "Failed to cleanup gVisor artifacts for {}: {}",
1319 config.id, e
1320 );
1321 }
1322 }
1323
1324 if let Err(e) = self.state_mgr.delete_state(&config.id) {
1325 warn!("Failed to delete state for {}: {}", config.id, e);
1326 }
1327
1328 match run_result {
1329 Ok(exit_code) => {
1330 audit(
1331 &config.id,
1332 &config.name,
1333 AuditEventType::ContainerStop,
1334 format!("exit_code={}", exit_code),
1335 );
1336 info!(
1337 "Container {} ({}) exited with code {}",
1338 config.name, config.id, exit_code
1339 );
1340 Ok(exit_code)
1341 }
1342 Err(e) => {
1343 audit_error(
1344 &config.id,
1345 &config.name,
1346 AuditEventType::ContainerStop,
1347 format!("error={}", e),
1348 );
1349 Err(e)
1350 }
1351 }
1352 }
1353}
1354
1355struct SignalThreadGuard {
1357 stop: Option<Arc<AtomicBool>>,
1358 handle: Option<JoinHandle<()>>,
1359}
1360
1361impl SignalThreadGuard {
1362 fn stop(&mut self) {
1363 if let Some(flag) = self.stop.take() {
1364 flag.store(true, Ordering::Relaxed);
1365 let _ = kill(Pid::this(), Signal::SIGUSR1);
1367 }
1368 if let Some(handle) = self.handle.take() {
1369 let _ = handle.join();
1370 }
1371 }
1372}
1373
1374impl Drop for SignalThreadGuard {
1375 fn drop(&mut self) {
1376 self.stop();
1377 }
1378}
1379
/// Owns the cancellation flag and join handle of the health-check thread.
///
/// Dropping the guard cancels and joins the thread exactly as
/// [`HealthThreadGuard::stop`] does.
struct HealthThreadGuard {
    cancel: Option<Arc<AtomicBool>>,
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Sets the cancellation flag, then blocks until the worker thread has exited.
    ///
    /// Both fields are taken on first use, so repeated calls are no-ops.
    fn stop(&mut self) {
        self.raise_cancel_flag();
        self.join_worker();
    }

    /// Marks the shared flag as cancelled, if it has not been consumed yet.
    fn raise_cancel_flag(&mut self) {
        if let Some(cancel_flag) = self.cancel.take() {
            cancel_flag.store(true, Ordering::Relaxed);
        }
    }

    /// Joins the worker thread, ignoring a panic result from it.
    fn join_worker(&mut self) {
        if let Some(worker) = self.handle.take() {
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    fn drop(&mut self) {
        self.stop();
    }
}
1402
1403#[cfg(test)]
1404mod tests {
1405 use super::*;
1406 use crate::container::KernelLockdownMode;
1407 use crate::network::NetworkMode;
1408 use std::ffi::OsString;
1409
1410 struct EnvVarGuard {
1411 key: &'static str,
1412 previous: Option<OsString>,
1413 }
1414
1415 impl EnvVarGuard {
1416 fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
1417 let previous = std::env::var_os(key);
1418 std::env::set_var(key, value);
1419 Self { key, previous }
1420 }
1421 }
1422
1423 impl Drop for EnvVarGuard {
1424 fn drop(&mut self) {
1425 match &self.previous {
1426 Some(value) => std::env::set_var(self.key, value),
1427 None => std::env::remove_var(self.key),
1428 }
1429 }
1430 }
1431
1432 fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
1433 let fn_start = source
1434 .find(fn_signature)
1435 .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
1436 let after = &source[fn_start..];
1437 let open = after
1438 .find('{')
1439 .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
1440 let mut depth = 0u32;
1441 let mut end = open;
1442 for (i, ch) in after[open..].char_indices() {
1443 match ch {
1444 '{' => depth += 1,
1445 '}' => {
1446 depth -= 1;
1447 if depth == 0 {
1448 end = open + i + 1;
1449 break;
1450 }
1451 }
1452 _ => {}
1453 }
1454 }
1455 &after[..end]
1456 }
1457
1458 #[test]
1459 fn test_container_config() {
1460 let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1461 assert!(!config.id.is_empty());
1462 assert_eq!(config.command, vec!["/bin/sh"]);
1463 assert!(config.use_gvisor);
1464 }
1465
1466 #[test]
1467 fn test_run_uses_immediate_start_path() {
1468 let source = include_str!("runtime.rs");
1469 let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
1470 let after = &source[fn_start..];
1471 let open = after.find('{').unwrap();
1472 let mut depth = 0u32;
1473 let mut fn_end = open;
1474 for (i, ch) in after[open..].char_indices() {
1475 match ch {
1476 '{' => depth += 1,
1477 '}' => {
1478 depth -= 1;
1479 if depth == 0 {
1480 fn_end = open + i + 1;
1481 break;
1482 }
1483 }
1484 _ => {}
1485 }
1486 }
1487 let run_body = &after[..fn_end];
1488 assert!(
1489 run_body.contains("create_internal(false)"),
1490 "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
1491 );
1492 assert!(
1493 !run_body.contains("self.create()?.start()"),
1494 "run() must not route through create()+start()"
1495 );
1496 }
1497
1498 #[test]
1499 fn test_container_config_with_name() {
1500 let config =
1501 ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1502 .unwrap();
1503 assert_eq!(config.name, "mycontainer");
1504 assert!(!config.id.is_empty());
1505 assert_ne!(config.id, config.name);
1506 }
1507
1508 #[test]
1509 fn test_allow_degraded_security_requires_explicit_config() {
1510 let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1511 assert!(!Container::allow_degraded_security(&strict));
1512
1513 let relaxed = strict.clone().with_allow_degraded_security(true);
1514 assert!(Container::allow_degraded_security(&relaxed));
1515 }
1516
1517 #[test]
1518 fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1519 let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1520 std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1521
1522 let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1523 assert!(!Container::allow_degraded_security(&strict));
1524
1525 let explicit = strict.with_allow_degraded_security(true);
1526 assert!(Container::allow_degraded_security(&explicit));
1527
1528 match prev {
1529 Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1530 None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1531 }
1532 }
1533
1534 #[test]
1535 fn test_host_network_requires_explicit_opt_in() {
1536 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1537 .unwrap()
1538 .with_network(NetworkMode::Host)
1539 .with_allow_host_network(false);
1540 let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1541 assert!(matches!(err, NucleusError::NetworkError(_)));
1542 }
1543
1544 #[test]
1545 fn test_host_network_opt_in_disables_net_namespace() {
1546 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1547 .unwrap()
1548 .with_network(NetworkMode::Host)
1549 .with_allow_host_network(true);
1550 assert!(config.namespaces.net);
1551 Container::apply_network_mode_guards(&mut config, true).unwrap();
1552 assert!(!config.namespaces.net);
1553 }
1554
1555 #[test]
1556 fn test_non_host_network_does_not_require_host_opt_in() {
1557 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1558 .unwrap()
1559 .with_network(NetworkMode::None)
1560 .with_allow_host_network(false);
1561 assert!(config.namespaces.net);
1562 Container::apply_network_mode_guards(&mut config, true).unwrap();
1563 assert!(config.namespaces.net);
1564 }
1565
1566 #[test]
1567 fn test_parse_kernel_lockdown_mode() {
1568 assert_eq!(
1569 Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
1570 Some(KernelLockdownMode::Integrity)
1571 );
1572 assert_eq!(
1573 Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
1574 Some(KernelLockdownMode::Confidentiality)
1575 );
1576 assert_eq!(
1577 Container::parse_active_lockdown_mode("[none] integrity"),
1578 None
1579 );
1580 }
1581
1582 #[test]
1583 fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
1584 let temp = tempfile::TempDir::new().unwrap();
1585 let source = temp.path().join("source-secret");
1586 std::fs::write(&source, "supersecret").unwrap();
1587
1588 let staged = Container::stage_gvisor_secret_files(
1589 &temp.path().join("stage"),
1590 &[crate::container::SecretMount {
1591 source: source.clone(),
1592 dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1593 mode: 0o400,
1594 }],
1595 &crate::container::ProcessIdentity::root(),
1596 )
1597 .unwrap();
1598
1599 assert_eq!(staged.len(), 1);
1600 assert!(staged[0].source.starts_with(temp.path().join("stage")));
1601 assert_eq!(
1602 std::fs::read_to_string(&staged[0].source).unwrap(),
1603 "supersecret"
1604 );
1605 }
1606
1607 #[test]
1608 fn test_stage_gvisor_secret_files_rejects_symlink_source() {
1609 use std::os::unix::fs::symlink;
1610
1611 let temp = tempfile::TempDir::new().unwrap();
1612 let source = temp.path().join("source-secret");
1613 let link = temp.path().join("source-link");
1614 std::fs::write(&source, "supersecret").unwrap();
1615 symlink(&source, &link).unwrap();
1616
1617 let err = Container::stage_gvisor_secret_files(
1618 &temp.path().join("stage"),
1619 &[crate::container::SecretMount {
1620 source: link,
1621 dest: std::path::PathBuf::from("/etc/app/secret.txt"),
1622 mode: 0o400,
1623 }],
1624 &crate::container::ProcessIdentity::root(),
1625 )
1626 .unwrap_err();
1627
1628 assert!(
1629 err.to_string().contains("O_NOFOLLOW"),
1630 "gVisor secret staging must reject symlink sources"
1631 );
1632 }
1633
1634 #[test]
1635 fn test_native_runtime_uses_inmemory_secrets_for_all_modes() {
1636 let source = include_str!("runtime.rs");
1637 let fn_body = extract_fn_body(source, "fn setup_and_exec");
1638 assert!(
1639 fn_body.contains("mount_secrets_inmemory("),
1640 "setup_and_exec must use in-memory secret mounting"
1641 );
1642 assert!(
1643 !fn_body.contains("mount_secrets(&"),
1644 "setup_and_exec must not bind-mount secrets from the host"
1645 );
1646 }
1647
1648 #[test]
1649 fn test_gvisor_uses_inmemory_secret_staging_for_all_modes() {
1650 let source = include_str!("gvisor_setup.rs");
1651 let fn_body = extract_fn_body(source, "fn setup_and_exec_gvisor_oci");
1652 assert!(
1653 fn_body.contains("with_inmemory_secret_mounts"),
1654 "gVisor setup must use the tmpfs-backed secret staging path"
1655 );
1656 assert!(
1657 !fn_body.contains("with_secret_mounts"),
1658 "gVisor setup must not bind-mount host secret paths"
1659 );
1660 }
1661
1662 #[test]
1663 fn test_native_fork_sites_assert_single_threaded() {
1664 let runtime_source = include_str!("runtime.rs");
1665 let create_body = extract_fn_body(runtime_source, "fn create_internal");
1666 assert!(
1667 create_body.contains("assert_single_threaded_for_fork(\"container create fork\")"),
1668 "create_internal must assert single-threaded before fork"
1669 );
1670
1671 let setup_body = extract_fn_body(runtime_source, "fn setup_and_exec");
1672 assert!(
1673 setup_body.contains("assert_single_threaded_for_fork(\"PID namespace init fork\")"),
1674 "PID namespace setup must assert single-threaded before fork"
1675 );
1676
1677 let exec_source = include_str!("exec.rs");
1678 let init_body = extract_fn_body(exec_source, "fn run_as_init");
1679 assert!(
1680 init_body.contains("assert_single_threaded_for_fork(\"init supervisor fork\")"),
1681 "run_as_init must assert single-threaded before fork"
1682 );
1683 }
1684
1685 #[test]
1686 fn test_run_as_init_keeps_identity_drop_in_workload_child_path() {
1687 let source = include_str!("exec.rs");
1688 let fn_body = extract_fn_body(source, "fn run_as_init");
1689 assert!(
1690 !fn_body.contains("Self::apply_process_identity_to_current_process("),
1691 "run_as_init must not drop identity before the supervisor fork"
1692 );
1693 assert!(
1694 fn_body.contains("self.exec_command()?"),
1695 "workload child must still route through exec_command for identity application"
1696 );
1697 }
1698
1699 #[test]
1700 fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
1701 let temp = tempfile::TempDir::new().unwrap();
1702 let _artifact_base = EnvVarGuard::set(
1703 "NUCLEUS_GVISOR_ARTIFACT_BASE",
1704 temp.path().join("gvisor-artifacts"),
1705 );
1706 let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
1707 std::fs::create_dir_all(&artifact_dir).unwrap();
1708 std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();
1709
1710 Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
1711 assert!(!artifact_dir.exists());
1712 }
1713
1714 #[test]
1715 fn test_health_check_loop_supports_cancellation() {
1716 let source = include_str!("health.rs");
1720 let fn_start = source.find("fn health_check_loop").unwrap();
1721 let fn_body = &source[fn_start..fn_start + 2500];
1722 assert!(
1723 fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
1724 "health_check_loop must accept an AtomicBool cancellation flag"
1725 );
1726 assert!(
1728 fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
1729 "health_check_loop must check cancellation during sleep intervals"
1730 );
1731 }
1732
1733 #[test]
1734 fn test_runtime_probes_do_not_spawn_host_nsenter() {
1735 let source = include_str!("health.rs");
1737
1738 let readiness_start = source.find("fn run_readiness_probe").unwrap();
1739 let readiness_body = &source[readiness_start..readiness_start + 2500];
1740 assert!(
1741 !readiness_body.contains("Command::new(&nsenter_bin)"),
1742 "readiness probes must not execute via host nsenter"
1743 );
1744
1745 let health_start = source.find("fn health_check_loop").unwrap();
1746 let health_body = &source[health_start..health_start + 2200];
1747 assert!(
1748 !health_body.contains("Command::new(&nsenter_bin)"),
1749 "health checks must not execute via host nsenter"
1750 );
1751 }
1752
1753 #[test]
1754 fn test_oci_mount_strip_prefix_no_expect() {
1755 let source = include_str!("gvisor_setup.rs");
1758 let fn_start = source.find("fn prepare_oci_mountpoints").unwrap();
1759 let fn_body = &source[fn_start..fn_start + 600];
1760 assert!(
1761 !fn_body.contains(".expect("),
1762 "prepare_oci_mountpoints must not use expect() – return Err instead"
1763 );
1764 }
1765
1766 #[test]
1767 fn test_notify_namespace_ready_validates_write_length() {
1768 let source = include_str!("runtime.rs");
1770 let fn_start = source.find("fn notify_namespace_ready").unwrap();
1771 let fn_body = &source[fn_start..fn_start + 500];
1772 assert!(
1774 fn_body.contains("written")
1775 || fn_body.contains("4")
1776 || fn_body.contains("payload.len()"),
1777 "notify_namespace_ready must validate complete write of all 4 bytes"
1778 );
1779 }
1780
1781 #[test]
1782 fn test_rlimit_failures_fatal_in_production() {
1783 let source = include_str!("runtime.rs");
1785 let rlimit_start = source.find("12b. RLIMIT backstop").unwrap();
1786 let rlimit_section = &source[rlimit_start..rlimit_start + 2000];
1787 assert!(
1788 rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
1789 "RLIMIT failures must return Err in production mode"
1790 );
1791 }
1792
1793 #[test]
1794 fn test_tcp_readiness_probe_uses_portable_check() {
1795 let source = include_str!("health.rs");
1798 let probe_fn = source.find("TcpPort(port)").unwrap();
1799 let probe_body = &source[probe_fn..probe_fn + 500];
1800 assert!(
1801 !probe_body.contains("/dev/tcp"),
1802 "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
1803 );
1804 }
1805}