1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4 ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets, mount_secrets_inmemory, mount_volumes,
10 snapshot_context_dir, switch_root, verify_context_manifest, verify_rootfs_attestation,
11 FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks, SeccompManager,
18 SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::{AsRawFd, OwnedFd};
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// Entry point for creating and running a container from a [`ContainerConfig`].
pub struct Container {
    /// Configuration the container is built from.
    pub(super) config: ContainerConfig,
    /// Path to the `runsc` binary; `None` unless it was resolved for a gVisor run.
    pub(super) runsc_path: Option<String>,
}
47
/// A container whose child process has been forked and registered, but whose lifecycle
/// has not yet been driven to completion by [`CreatedContainer::start`].
pub struct CreatedContainer {
    /// Effective configuration (after rootless / network adjustments made during creation).
    pub(super) config: ContainerConfig,
    /// Handle used to persist and delete the container's state entry.
    pub(super) state_mgr: ContainerStateManager,
    /// Last state written for this container.
    pub(super) state: ContainerState,
    /// PID returned by the initial `fork()`; this is the process the parent waits on.
    pub(super) child: Pid,
    /// Cgroup holding the container process, when one could be created.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network attached to the container, when bridge mode was set up.
    pub(super) bridge_net: Option<BridgeNetwork>,
    /// Seccomp trace reader, when one was started for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Exec FIFO used to release the child on `start`; `None` on the immediate-run path.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Tracing span entered for the duration of `start`.
    pub(super) _lifecycle_span: tracing::Span,
}
62
63impl Container {
64 pub fn new(config: ContainerConfig) -> Self {
65 Self {
66 config,
67 runsc_path: None,
68 }
69 }
70
71 pub fn run(&self) -> Result<i32> {
73 self.create_internal(false)?.start()
74 }
75
76 pub fn create(&self) -> Result<CreatedContainer> {
80 self.create_internal(true)
81 }
82
83 fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
84 let lifecycle_span = info_span!(
85 "container.lifecycle",
86 container.id = %self.config.id,
87 container.name = %self.config.name,
88 runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
89 );
90 let _enter = lifecycle_span.enter();
91
92 info!(
93 "Creating container: {} (ID: {})",
94 self.config.name, self.config.id
95 );
96 audit(
97 &self.config.id,
98 &self.config.name,
99 AuditEventType::ContainerStart,
100 format!(
101 "command={:?} mode={:?} runtime={}",
102 self.config.command,
103 self.config.service_mode,
104 if self.config.use_gvisor {
105 "gvisor"
106 } else {
107 "native"
108 }
109 ),
110 );
111
112 let is_root = nix::unistd::Uid::effective().is_root();
114 let mut config = self.config.clone();
115
116 if !is_root && config.user_ns_config.is_none() {
117 info!("Not running as root, automatically enabling rootless mode");
118 config.namespaces.user = true;
119 config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
120 }
121
122 if let Some(ref socket_path) = config.console_socket {
124 warn!(
125 "Console socket {} accepted but terminal forwarding is not yet implemented",
126 socket_path.display()
127 );
128 }
129
130 config.validate_production_mode()?;
132 Self::assert_kernel_lockdown(&config)?;
133
134 Self::apply_network_mode_guards(&mut config, is_root)?;
135 Self::apply_trust_level_guards(&mut config)?;
136 config.validate_runtime_support()?;
137
138 if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
140 if config.service_mode == ServiceMode::Production {
141 return Err(NucleusError::NetworkError(
142 "Production mode with bridge networking requires root (cannot silently \
143 degrade to no networking)"
144 .to_string(),
145 ));
146 }
147 warn!("Bridge networking requires root, degrading to no networking");
148 config.network = NetworkMode::None;
149 }
150
151 let state_mgr = ContainerStateManager::new()?;
153
154 if let Ok(all_states) = state_mgr.list_states() {
156 if all_states.iter().any(|s| s.name == config.name) {
157 return Err(NucleusError::ConfigError(format!(
158 "A container named '{}' already exists; use a different --name, \
159 or remove the stale state with 'nucleus delete'",
160 config.name
161 )));
162 }
163 }
164
165 let exec_fifo = if defer_exec_until_start {
168 let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
169 nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
170 NucleusError::ExecError(format!(
171 "Failed to create exec FIFO {:?}: {}",
172 exec_fifo, e
173 ))
174 })?;
175 Some(exec_fifo)
176 } else {
177 None
178 };
179
180 let cgroup_name = format!("nucleus-{}", config.id);
182 let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
183 Ok(mut cgroup) => {
184 match cgroup.set_limits(&config.limits) {
186 Ok(_) => {
187 info!("Created cgroup with resource limits");
188 Some(cgroup)
189 }
190 Err(e) => {
191 if config.service_mode == ServiceMode::Production {
192 let _ = cgroup.cleanup();
193 return Err(NucleusError::CgroupError(format!(
194 "Production mode requires cgroup resource enforcement, but \
195 applying limits failed: {}",
196 e
197 )));
198 }
199 warn!("Failed to set cgroup limits: {}", e);
200 let _ = cgroup.cleanup();
201 None
202 }
203 }
204 }
205 Err(e) => {
206 if config.service_mode == ServiceMode::Production {
207 return Err(NucleusError::CgroupError(format!(
208 "Production mode requires cgroup resource enforcement, but \
209 cgroup creation failed: {}",
210 e
211 )));
212 }
213
214 if config.user_ns_config.is_some() {
215 if config.limits.memory_bytes.is_some()
216 || config.limits.cpu_quota_us.is_some()
217 || config.limits.pids_max.is_some()
218 {
219 warn!(
220 "Running in rootless mode: requested resource limits cannot be \
221 enforced – cgroup creation requires root ({})",
222 e
223 );
224 } else {
225 debug!("Running in rootless mode without cgroup resource limits");
226 }
227 } else {
228 warn!(
229 "Failed to create cgroup (running without resource limits): {}",
230 e
231 );
232 }
233 None
234 }
235 };
236
237 let runsc_path = if config.use_gvisor {
239 Some(GVisorRuntime::resolve_path().map_err(|e| {
240 NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
241 })?)
242 } else {
243 None
244 };
245
246 let (ready_read, ready_write) = pipe().map_err(|e| {
248 NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
249 })?;
250
251 match unsafe { fork() }? {
253 ForkResult::Parent { child } => {
254 drop(ready_write);
255 info!("Forked child process: {}", child);
256
257 let parent_setup = || -> Result<CreatedContainer> {
260 let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;
261
262 let cgroup_path = cgroup_opt
263 .as_ref()
264 .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
265 let cpu_millicores = config
266 .limits
267 .cpu_quota_us
268 .map(|quota| (quota * 1000) / config.limits.cpu_period_us);
269 let mut state = ContainerState::new(ContainerStateParams {
270 id: config.id.clone(),
271 name: config.name.clone(),
272 pid: target_pid,
273 command: config.command.clone(),
274 memory_limit: config.limits.memory_bytes,
275 cpu_limit: cpu_millicores,
276 using_gvisor: config.use_gvisor,
277 rootless: config.user_ns_config.is_some(),
278 cgroup_path,
279 process_uid: config.process_identity.uid,
280 process_gid: config.process_identity.gid,
281 additional_gids: config.process_identity.additional_gids.clone(),
282 });
283 state.config_hash = config.config_hash;
284 state.bundle_path =
285 config.rootfs_path.as_ref().map(|p| p.display().to_string());
286
287 let mut bridge_net: Option<BridgeNetwork> = None;
288 let trace_reader =
289 Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;
290
291 state.status = OciStatus::Created;
293 state_mgr.save_state(&state)?;
294
295 if let Some(ref pid_path) = config.pid_file {
297 std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
298 NucleusError::ConfigError(format!(
299 "Failed to write pid-file '{}': {}",
300 pid_path.display(),
301 e
302 ))
303 })?;
304 info!("Wrote PID {} to {}", target_pid, pid_path.display());
305 }
306
307 if let Some(ref mut cgroup) = cgroup_opt {
308 cgroup.attach_process(target_pid)?;
309 }
310
311 if let NetworkMode::Bridge(ref bridge_config) = config.network {
312 match BridgeNetwork::setup_with_id(
313 target_pid,
314 bridge_config,
315 &config.id,
316 ) {
317 Ok(net) => {
318 if let Some(ref egress) = config.egress_policy {
319 if let Err(e) =
320 net.apply_egress_policy(target_pid, egress)
321 {
322 if config.service_mode == ServiceMode::Production {
323 return Err(NucleusError::NetworkError(format!(
324 "Failed to apply egress policy: {}",
325 e
326 )));
327 }
328 warn!("Failed to apply egress policy: {}", e);
329 }
330 }
331 bridge_net = Some(net);
332 }
333 Err(e) => {
334 if config.service_mode == ServiceMode::Production {
335 return Err(e);
336 }
337 warn!("Failed to set up bridge networking: {}", e);
338 }
339 }
340 }
341
342 info!(
343 "Container {} created (child pid {}), waiting for start",
344 config.id, target_pid
345 );
346
347 Ok(CreatedContainer {
348 config,
349 state_mgr,
350 state,
351 child,
352 cgroup_opt,
353 bridge_net,
354 trace_reader,
355 exec_fifo_path: exec_fifo,
356 _lifecycle_span: lifecycle_span.clone(),
357 })
358 };
359
360 parent_setup().map_err(|e| {
361 let _ = kill(child, Signal::SIGKILL);
364 let _ = waitpid(child, None);
365 e
366 })
367 }
368 ForkResult::Child => {
369 drop(ready_read);
370 let temp_container = Container { config, runsc_path };
371 match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
372 Ok(_) => unreachable!(),
373 Err(e) => {
374 error!("Container setup failed: {}", e);
375 std::process::exit(1);
376 }
377 }
378 }
379 }
380 }
381
382 pub fn trigger_start(container_id: &str) -> Result<()> {
385 let state_mgr = ContainerStateManager::new()?;
386 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
387 if !fifo_path.exists() {
388 return Err(NucleusError::ConfigError(format!(
389 "No exec FIFO found for container {}; is it in 'created' state?",
390 container_id
391 )));
392 }
393
394 let file = std::fs::File::open(&fifo_path)
396 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
397 let mut buf = [0u8; 1];
398 std::io::Read::read(&mut &file, &mut buf)
399 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
400 drop(file);
401
402 let _ = std::fs::remove_file(&fifo_path);
403
404 let mut state = state_mgr.resolve_container(container_id)?;
406 state.status = OciStatus::Running;
407 state_mgr.save_state(&state)?;
408
409 Ok(())
410 }
411
412 fn setup_and_exec(
417 &self,
418 ready_pipe: Option<OwnedFd>,
419 exec_fifo: Option<PathBuf>,
420 ) -> Result<()> {
421 let is_rootless = self.config.user_ns_config.is_some();
422 let allow_degraded_security = Self::allow_degraded_security(&self.config);
423 let context_manifest = if self.config.verify_context_integrity {
424 self.config
425 .context_dir
426 .as_ref()
427 .map(|dir| snapshot_context_dir(dir))
428 .transpose()?
429 } else {
430 None
431 };
432
433 let mut fs_state = FilesystemState::Unmounted;
435 let mut sec_state = SecurityState::Privileged;
436
437 if self.config.use_gvisor {
441 if let Some(fd) = ready_pipe {
442 Self::notify_namespace_ready(&fd, std::process::id())?;
443 }
444 return self.setup_and_exec_gvisor();
445 }
446
447 let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
449 if let Some(user_config) = &self.config.user_ns_config {
450 namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
451 }
452 namespace_mgr.unshare_namespaces()?;
453
454 if self.config.namespaces.pid {
457 match unsafe { fork() }? {
458 ForkResult::Parent { child } => {
459 if let Some(fd) = ready_pipe {
460 Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
461 }
462 std::process::exit(Self::wait_for_pid_namespace_child(child));
463 }
464 ForkResult::Child => {
465 }
467 }
468 } else if let Some(fd) = ready_pipe {
469 Self::notify_namespace_ready(&fd, std::process::id())?;
470 }
471
472 namespace_mgr.enter()?;
474
475 self.enforce_no_new_privs()?;
479 audit(
480 &self.config.id,
481 &self.config.name,
482 AuditEventType::NoNewPrivsSet,
483 "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
484 );
485
486 if let Some(hostname) = &self.config.hostname {
488 namespace_mgr.set_hostname(hostname)?;
489 }
490
491 let runtime_dir = Builder::new()
494 .prefix("nucleus-runtime-")
495 .tempdir_in("/tmp")
496 .map_err(|e| {
497 NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
498 })?;
499 let container_root = runtime_dir.path().to_path_buf();
500 let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); tmpfs.mount()?;
502 fs_state = fs_state.transition(FilesystemState::Mounted)?;
503
504 create_minimal_fs(&container_root)?;
506
507 let dev_path = container_root.join("dev");
509 create_dev_nodes(&dev_path, false)?;
510
511 if let Some(context_dir) = &self.config.context_dir {
514 let context_dest = container_root.join("context");
515 LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
516 if let Some(expected) = &context_manifest {
517 verify_context_manifest(expected, &context_dest)?;
518 }
519 }
520 fs_state = fs_state.transition(FilesystemState::Populated)?;
521
522 if let Some(ref rootfs_path) = self.config.rootfs_path {
524 if self.config.verify_rootfs_attestation {
525 verify_rootfs_attestation(rootfs_path)?;
526 }
527 bind_mount_rootfs(&container_root, rootfs_path)?;
528 } else {
529 bind_mount_host_paths(&container_root, is_rootless)?;
530 }
531
532 mount_volumes(&container_root, &self.config.volumes)?;
534
535 if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
539 if self.config.rootfs_path.is_some() {
540 BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
541 } else {
542 BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
543 }
544 }
545
546 if self.config.service_mode == ServiceMode::Production {
548 mount_secrets_inmemory(
549 &container_root,
550 &self.config.secrets,
551 &self.config.process_identity,
552 )?;
553 } else {
554 mount_secrets(&container_root, &self.config.secrets)?;
555 }
556
557 let proc_path = container_root.join("proc");
559 let hide_pids = self.config.service_mode == ServiceMode::Production;
560 mount_procfs(
561 &proc_path,
562 is_rootless,
563 self.config.proc_readonly,
564 hide_pids,
565 )?;
566
567 mask_proc_paths(
570 &proc_path,
571 self.config.service_mode == ServiceMode::Production,
572 )?;
573
574 if let Some(ref hooks) = self.config.hooks {
576 if !hooks.create_runtime.is_empty() {
577 let hook_state = OciContainerState {
578 oci_version: "1.0.2".to_string(),
579 id: self.config.id.clone(),
580 status: OciStatus::Creating,
581 pid: std::process::id(),
582 bundle: String::new(),
583 };
584 OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
585 }
586 }
587
588 switch_root(&container_root, self.config.allow_chroot_fallback)?;
591 fs_state = fs_state.transition(FilesystemState::Pivoted)?;
592 debug!("Filesystem state: {:?}", fs_state);
593
594 audit_mounts(self.config.service_mode == ServiceMode::Production)?;
596 audit(
597 &self.config.id,
598 &self.config.name,
599 AuditEventType::MountAuditPassed,
600 "all mount flags verified",
601 );
602
603 if let Some(ref hooks) = self.config.hooks {
605 if !hooks.create_container.is_empty() {
606 let hook_state = OciContainerState {
607 oci_version: "1.0.2".to_string(),
608 id: self.config.id.clone(),
609 status: OciStatus::Created,
610 pid: std::process::id(),
611 bundle: String::new(),
612 };
613 OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
614 }
615 }
616
617 let mut cap_mgr = CapabilityManager::new();
620 if let Some(ref policy_path) = self.config.caps_policy {
621 let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
622 policy_path,
623 self.config.caps_policy_sha256.as_deref(),
624 )?;
625 policy.apply(&mut cap_mgr)?;
626 audit(
627 &self.config.id,
628 &self.config.name,
629 AuditEventType::CapabilitiesDropped,
630 format!("capability policy applied from {:?}", policy_path),
631 );
632 } else {
633 cap_mgr.drop_all()?;
634 audit(
635 &self.config.id,
636 &self.config.name,
637 AuditEventType::CapabilitiesDropped,
638 "all capabilities dropped including bounding set",
639 );
640 }
641 sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;
642
643 {
648 let is_production = self.config.service_mode == ServiceMode::Production;
649
650 let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
651 let rlim_nproc = libc::rlimit {
652 rlim_cur: nproc_limit,
653 rlim_max: nproc_limit,
654 };
655 if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
657 let err = std::io::Error::last_os_error();
658 if is_production {
659 return Err(NucleusError::SeccompError(format!(
660 "Failed to set RLIMIT_NPROC to {} in production mode: {}",
661 nproc_limit, err
662 )));
663 }
664 warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
665 }
666
667 let rlim_nofile = libc::rlimit {
668 rlim_cur: 1024,
669 rlim_max: 1024,
670 };
671 if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
673 let err = std::io::Error::last_os_error();
674 if is_production {
675 return Err(NucleusError::SeccompError(format!(
676 "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
677 err
678 )));
679 }
680 warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
681 }
682
683 let memlock_limit: u64 = 64 * 1024; let rlim_memlock = libc::rlimit {
689 rlim_cur: memlock_limit,
690 rlim_max: memlock_limit,
691 };
692 if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
694 let err = std::io::Error::last_os_error();
695 if is_production {
696 return Err(NucleusError::SeccompError(format!(
697 "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
698 memlock_limit, err
699 )));
700 }
701 warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
702 }
703 }
704
705 use crate::container::config::SeccompMode;
708 let mut seccomp_mgr = SeccompManager::new();
709 let allow_network = !matches!(self.config.network, NetworkMode::None);
710 let seccomp_applied = match self.config.seccomp_mode {
711 SeccompMode::Trace => {
712 audit(
713 &self.config.id,
714 &self.config.name,
715 AuditEventType::SeccompApplied,
716 "seccomp trace mode: allow-all + LOG",
717 );
718 seccomp_mgr.apply_trace_filter()?
719 }
720 SeccompMode::Enforce => {
721 if let Some(ref profile_path) = self.config.seccomp_profile {
722 audit(
723 &self.config.id,
724 &self.config.name,
725 AuditEventType::SeccompProfileLoaded,
726 format!("path={:?}", profile_path),
727 );
728 seccomp_mgr.apply_profile_from_file(
729 profile_path,
730 self.config.seccomp_profile_sha256.as_deref(),
731 self.config.seccomp_log_denied,
732 )?
733 } else {
734 seccomp_mgr.apply_filter_for_network_mode(
735 allow_network,
736 allow_degraded_security,
737 self.config.seccomp_log_denied,
738 )?
739 }
740 }
741 };
742 if seccomp_applied {
743 sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
744 audit(
745 &self.config.id,
746 &self.config.name,
747 AuditEventType::SeccompApplied,
748 format!("network={}", allow_network),
749 );
750 } else if !allow_degraded_security {
751 return Err(NucleusError::SeccompError(
752 "Seccomp filter is required but was not enforced".to_string(),
753 ));
754 } else {
755 warn!("Seccomp not enforced; container is running with degraded hardening");
756 }
757
758 let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
760 let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
761 policy_path,
762 self.config.landlock_policy_sha256.as_deref(),
763 )?;
764 policy.apply(allow_degraded_security)?
765 } else {
766 let mut landlock_mgr = LandlockManager::new();
767 landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
768 landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
769 };
770 if seccomp_applied && landlock_applied {
771 sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
772 if self.config.seccomp_mode == SeccompMode::Trace {
773 warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
774 } else {
775 sec_state = sec_state.transition(SecurityState::Locked)?;
776 }
777 audit(
778 &self.config.id,
779 &self.config.name,
780 AuditEventType::LandlockApplied,
781 if self.config.seccomp_mode == SeccompMode::Trace {
782 "landlock applied, but seccomp in trace mode — not locked".to_string()
783 } else {
784 "security state locked: all hardening layers active".to_string()
785 },
786 );
787 } else if !allow_degraded_security {
788 return Err(NucleusError::LandlockError(
789 "Landlock policy is required but was not enforced".to_string(),
790 ));
791 } else {
792 warn!("Security state not locked; one or more hardening controls are inactive");
793 }
794 debug!("Security state: {:?}", sec_state);
795
796 if let Some(ref fifo_path) = exec_fifo {
800 debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
801 let file = std::fs::OpenOptions::new()
802 .write(true)
803 .open(fifo_path)
804 .map_err(|e| {
805 NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
806 })?;
807 std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
808 NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
809 })?;
810 drop(file);
811 debug!("Exec FIFO released, proceeding to exec");
812 }
813
814 if let Some(ref hooks) = self.config.hooks {
816 if !hooks.start_container.is_empty() {
817 let hook_state = OciContainerState {
818 oci_version: "1.0.2".to_string(),
819 id: self.config.id.clone(),
820 status: OciStatus::Running,
821 pid: std::process::id(),
822 bundle: String::new(),
823 };
824 OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
825 }
826 }
827
828 if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
831 return self.run_as_init();
832 }
833
834 self.exec_command()?;
836
837 Ok(())
839 }
840
841 pub(super) fn setup_signal_forwarding_static(
846 child: Pid,
847 ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
848 let mut set = SigSet::empty();
849 for signal in [
850 Signal::SIGTERM,
851 Signal::SIGINT,
852 Signal::SIGHUP,
853 Signal::SIGQUIT,
854 Signal::SIGUSR1,
855 Signal::SIGUSR2,
856 ] {
857 set.add(signal);
858 }
859
860 let unblock_set = set;
861 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
862 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
863 })?;
864
865 let stop = Arc::new(AtomicBool::new(false));
866 let stop_clone = stop.clone();
867 let handle = std::thread::Builder::new()
868 .name("sig-forward".to_string())
869 .spawn(move || {
870 while !stop_clone.load(Ordering::Relaxed) {
872 if let Ok(signal) = unblock_set.wait() {
873 let _ = kill(child, signal);
874 }
875 }
876 })
877 .map_err(|e| {
878 let mut restore = SigSet::empty();
881 for signal in [
882 Signal::SIGTERM,
883 Signal::SIGINT,
884 Signal::SIGHUP,
885 Signal::SIGQUIT,
886 Signal::SIGUSR1,
887 Signal::SIGUSR2,
888 ] {
889 restore.add(signal);
890 }
891 let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
892 NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
893 })?;
894
895 info!("Signal forwarding configured");
896 Ok((stop, handle))
897 }
898
899 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
901 loop {
902 match waitpid(child, None) {
903 Ok(WaitStatus::Exited(_, code)) => {
904 return Ok(code);
905 }
906 Ok(WaitStatus::Signaled(_, signal, _)) => {
907 info!("Child killed by signal: {:?}", signal);
908 return Ok(128 + signal as i32);
909 }
910 Err(nix::errno::Errno::EINTR) => {
911 continue;
912 }
913 Err(e) => {
914 return Err(NucleusError::ExecError(format!(
915 "Failed to wait for child: {}",
916 e
917 )));
918 }
919 _ => {
920 continue;
921 }
922 }
923 }
924 }
925
926 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
927 let mut pid_buf = [0u8; 4];
928 loop {
929 match read(ready_read.as_raw_fd(), &mut pid_buf) {
930 Err(nix::errno::Errno::EINTR) => continue,
931 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
932 Ok(0) => {
933 return Err(NucleusError::ExecError(format!(
934 "Child {} exited before namespace initialization",
935 child
936 )))
937 }
938 Ok(_) => {
939 return Err(NucleusError::ExecError(
940 "Invalid namespace sync payload from child".to_string(),
941 ))
942 }
943 Err(e) => {
944 return Err(NucleusError::ExecError(format!(
945 "Failed waiting for child namespace setup: {}",
946 e
947 )))
948 }
949 }
950 }
951 }
952
953 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
954 let payload = pid.to_ne_bytes();
955 let mut written = 0;
956 while written < payload.len() {
957 let n = write(fd, &payload[written..]).map_err(|e| {
958 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
959 })?;
960 if n == 0 {
961 return Err(NucleusError::ExecError(
962 "Failed to notify namespace readiness: short write".to_string(),
963 ));
964 }
965 written += n;
966 }
967 Ok(())
968 }
969
970 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
971 loop {
972 match waitpid(child, None) {
973 Ok(WaitStatus::Exited(_, code)) => return code,
974 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
975 Err(nix::errno::Errno::EINTR) => continue,
976 Err(_) => return 1,
977 _ => continue,
978 }
979 }
980 }
981}
982
983impl CreatedContainer {
984 pub fn start(mut self) -> Result<i32> {
987 let config = &self.config;
988 let _enter = self._lifecycle_span.enter();
989
990 if let Some(exec_fifo_path) = &self.exec_fifo_path {
993 let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
994 NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
995 })?;
996 let mut buf = [0u8; 1];
997 let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
998 NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
999 })?;
1000 if read != 1 {
1001 return Err(NucleusError::ExecError(
1002 "Exec FIFO closed before start signal was delivered".to_string(),
1003 ));
1004 }
1005 let _ = std::fs::remove_file(exec_fifo_path);
1006 }
1007
1008 self.state.status = OciStatus::Running;
1010 self.state_mgr.save_state(&self.state)?;
1011
1012 let target_pid = self.state.pid;
1013 let child = self.child;
1014
1015 let (sig_stop, sig_handle) =
1016 Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
1017
1018 let mut sig_guard = SignalThreadGuard {
1020 stop: Some(sig_stop),
1021 handle: Some(sig_handle),
1022 };
1023
1024 if let Some(ref probe) = config.readiness_probe {
1026 let notify_socket = if config.sd_notify {
1027 std::env::var("NOTIFY_SOCKET").ok()
1028 } else {
1029 None
1030 };
1031 Container::run_readiness_probe(
1032 target_pid,
1033 &config.name,
1034 probe,
1035 config.user_ns_config.is_some(),
1036 config.use_gvisor,
1037 &config.process_identity,
1038 notify_socket.as_deref(),
1039 )?;
1040 }
1041
1042 let cancel_flag = Arc::new(AtomicBool::new(false));
1044 let health_handle = if let Some(ref hc) = config.health_check {
1045 if !hc.command.is_empty() {
1046 let hc = hc.clone();
1047 let pid = target_pid;
1048 let container_name = config.name.clone();
1049 let rootless = config.user_ns_config.is_some();
1050 let using_gvisor = config.use_gvisor;
1051 let process_identity = config.process_identity.clone();
1052 let cancel = cancel_flag.clone();
1053 Some(std::thread::spawn(move || {
1054 Container::health_check_loop(
1055 pid,
1056 &container_name,
1057 rootless,
1058 using_gvisor,
1059 &hc,
1060 &process_identity,
1061 &cancel,
1062 );
1063 }))
1064 } else {
1065 None
1066 }
1067 } else {
1068 None
1069 };
1070
1071 let mut health_guard = HealthThreadGuard {
1073 cancel: Some(cancel_flag),
1074 handle: health_handle,
1075 };
1076
1077 if let Some(ref hooks) = config.hooks {
1079 if !hooks.poststart.is_empty() {
1080 let hook_state = OciContainerState {
1081 oci_version: "1.0.2".to_string(),
1082 id: config.id.clone(),
1083 status: OciStatus::Running,
1084 pid: target_pid,
1085 bundle: String::new(),
1086 };
1087 OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
1088 }
1089 }
1090
1091 let mut child_waited = false;
1092 let run_result: Result<i32> = (|| {
1093 let exit_code = Container::wait_for_child_static(child)?;
1094
1095 self.state.status = OciStatus::Stopped;
1097 let _ = self.state_mgr.save_state(&self.state);
1098
1099 child_waited = true;
1100 Ok(exit_code)
1101 })();
1102
1103 health_guard.stop();
1106 sig_guard.stop();
1107
1108 if let Some(ref hooks) = config.hooks {
1110 if !hooks.poststop.is_empty() {
1111 let hook_state = OciContainerState {
1112 oci_version: "1.0.2".to_string(),
1113 id: config.id.clone(),
1114 status: OciStatus::Stopped,
1115 pid: 0,
1116 bundle: String::new(),
1117 };
1118 OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1119 }
1120 }
1121
1122 if let Some(net) = self.bridge_net.take() {
1123 if let Err(e) = net.cleanup() {
1124 warn!("Failed to cleanup bridge networking: {}", e);
1125 }
1126 }
1127
1128 if !child_waited {
1129 let _ = kill(child, Signal::SIGKILL);
1130 let _ = waitpid(child, None);
1131 }
1132
1133 if let Some(reader) = self.trace_reader.take() {
1134 reader.stop_and_flush();
1135 }
1136
1137 if let Some(cgroup) = self.cgroup_opt.take() {
1138 if let Err(e) = cgroup.cleanup() {
1139 warn!("Failed to cleanup cgroup: {}", e);
1140 }
1141 }
1142
1143 if config.use_gvisor {
1144 if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1145 warn!(
1146 "Failed to cleanup gVisor artifacts for {}: {}",
1147 config.id, e
1148 );
1149 }
1150 }
1151
1152 if let Err(e) = self.state_mgr.delete_state(&config.id) {
1153 warn!("Failed to delete state for {}: {}", config.id, e);
1154 }
1155
1156 match run_result {
1157 Ok(exit_code) => {
1158 audit(
1159 &config.id,
1160 &config.name,
1161 AuditEventType::ContainerStop,
1162 format!("exit_code={}", exit_code),
1163 );
1164 info!(
1165 "Container {} ({}) exited with code {}",
1166 config.name, config.id, exit_code
1167 );
1168 Ok(exit_code)
1169 }
1170 Err(e) => {
1171 audit_error(
1172 &config.id,
1173 &config.name,
1174 AuditEventType::ContainerStop,
1175 format!("error={}", e),
1176 );
1177 Err(e)
1178 }
1179 }
1180 }
1181}
1182
1183struct SignalThreadGuard {
1185 stop: Option<Arc<AtomicBool>>,
1186 handle: Option<JoinHandle<()>>,
1187}
1188
1189impl SignalThreadGuard {
1190 fn stop(&mut self) {
1191 if let Some(flag) = self.stop.take() {
1192 flag.store(true, Ordering::Relaxed);
1193 let _ = kill(Pid::this(), Signal::SIGUSR1);
1195 }
1196 if let Some(handle) = self.handle.take() {
1197 let _ = handle.join();
1198 }
1199 }
1200}
1201
1202impl Drop for SignalThreadGuard {
1203 fn drop(&mut self) {
1204 self.stop();
1205 }
1206}
1207
/// Owns the health-check thread and cancels it when stopped or dropped.
struct HealthThreadGuard {
    /// Cancellation flag shared with the thread; `None` once cancellation was requested.
    cancel: Option<Arc<AtomicBool>>,
    /// Join handle of the thread, if one was spawned; `None` once it has been joined.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Sets the cancellation flag and joins the thread. Safe to call more than once.
    fn stop(&mut self) {
        let pending_flag = self.cancel.take();
        let pending_thread = self.handle.take();

        if let Some(flag) = pending_flag {
            flag.store(true, Ordering::Relaxed);
        }
        if let Some(thread) = pending_thread {
            let _ = thread.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    fn drop(&mut self) {
        Self::stop(self);
    }
}
1230
#[cfg(test)]
mod tests {
    use super::*;
    use crate::container::KernelLockdownMode;
    use crate::network::NetworkMode;

    /// Returns the part of `source` that starts at the first occurrence of
    /// `marker` and spans at most `max_len` bytes.
    ///
    /// The window is clamped to the end of `source` and shrunk to the nearest
    /// UTF-8 character boundary, so a marker close to the end of a file (or a
    /// multi-byte character at the cut point) cannot make the slice panic.
    ///
    /// # Panics
    ///
    /// Panics with a message naming `marker` when it does not occur in `source`.
    fn source_window<'a>(source: &'a str, marker: &str, max_len: usize) -> &'a str {
        let start = source
            .find(marker)
            .unwrap_or_else(|| panic!("marker {:?} not found in included source", marker));
        let tail = &source[start..];
        let mut end = tail.len().min(max_len);
        // Index 0 is always a character boundary, so this loop terminates.
        while !tail.is_char_boundary(end) {
            end -= 1;
        }
        &tail[..end]
    }

    /// A default config gets a non-empty id, keeps the command and enables gVisor.
    #[test]
    fn test_container_config() {
        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!config.id.is_empty());
        assert_eq!(config.command, vec!["/bin/sh"]);
        assert!(config.use_gvisor);
    }

    /// Source-level check that `run()` calls `create_internal(false)` rather
    /// than chaining `create()` and `start()`.
    #[test]
    fn test_run_uses_immediate_start_path() {
        let source = include_str!("runtime.rs");
        let fn_start = source
            .find("pub fn run(&self) -> Result<i32>")
            .expect("run() signature not found in runtime.rs");
        let after = &source[fn_start..];
        let open = after
            .find('{')
            .expect("run() has no opening brace in runtime.rs");

        // Walk the braces from the opening one until the depth returns to
        // zero; that position is the end of the function body.
        let mut depth = 0u32;
        let mut fn_end = open;
        for (i, ch) in after[open..].char_indices() {
            match ch {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        fn_end = open + i + 1;
                        break;
                    }
                }
                _ => {}
            }
        }
        let run_body = &after[..fn_end];
        assert!(
            run_body.contains("create_internal(false)"),
            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
        );
        assert!(
            !run_body.contains("self.create()?.start()"),
            "run() must not route through create()+start()"
        );
    }

    /// An explicit name is stored as given and is distinct from the generated id.
    #[test]
    fn test_container_config_with_name() {
        let config =
            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
                .unwrap();
        assert_eq!(config.name, "mycontainer");
        assert!(!config.id.is_empty());
        assert_ne!(config.id, config.name);
    }

    /// Degraded security is off by default and on only after the explicit builder call.
    #[test]
    fn test_allow_degraded_security_requires_explicit_config() {
        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let relaxed = strict.clone().with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&relaxed));
    }

    /// Setting the environment variable alone must not enable degraded security.
    #[test]
    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
        const KEY: &str = "NUCLEUS_ALLOW_DEGRADED_SECURITY";

        /// Puts the variable back to its previous state on drop, so a failing
        /// assertion below cannot leak the modified environment into the rest
        /// of the test process.
        struct RestoreEnv(Option<std::ffi::OsString>);

        impl Drop for RestoreEnv {
            fn drop(&mut self) {
                match self.0.take() {
                    Some(value) => std::env::set_var(KEY, value),
                    None => std::env::remove_var(KEY),
                }
            }
        }

        let _restore = RestoreEnv(std::env::var_os(KEY));
        std::env::set_var(KEY, "1");

        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let explicit = strict.with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&explicit));
    }

    /// Host networking without the opt-in is rejected with a network error.
    #[test]
    fn test_host_network_requires_explicit_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(false);
        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
        assert!(matches!(err, NucleusError::NetworkError(_)));
    }

    /// Host networking with the opt-in turns the network namespace off.
    #[test]
    fn test_host_network_opt_in_disables_net_namespace() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(true);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(!config.namespaces.net);
    }

    /// Non-host network modes pass the guard and keep the network namespace.
    #[test]
    fn test_non_host_network_does_not_require_host_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::None)
            .with_allow_host_network(false);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(config.namespaces.net);
    }

    /// The bracketed entry selects the active lockdown mode; `[none]` maps to `None`.
    #[test]
    fn test_parse_kernel_lockdown_mode() {
        assert_eq!(
            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
            Some(KernelLockdownMode::Integrity)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
            Some(KernelLockdownMode::Confidentiality)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("[none] integrity"),
            None
        );
    }

    /// Staged secrets point into the stage directory and keep their content.
    #[test]
    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
        let temp = tempfile::TempDir::new().unwrap();
        let source = temp.path().join("source-secret");
        std::fs::write(&source, "supersecret").unwrap();

        let staged = Container::stage_gvisor_secret_files(
            &temp.path().join("stage"),
            &[crate::container::SecretMount {
                source: source.clone(),
                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
                mode: 0o400,
            }],
            &crate::container::ProcessIdentity::root(),
        )
        .unwrap();

        assert_eq!(staged.len(), 1);
        assert!(staged[0].source.starts_with(temp.path().join("stage")));
        assert_eq!(
            std::fs::read_to_string(&staged[0].source).unwrap(),
            "supersecret"
        );
    }

    /// Cleanup removes the artifact directory created for the given id.
    #[test]
    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
        std::fs::create_dir_all(&artifact_dir).unwrap();
        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();

        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
        assert!(!artifact_dir.exists());
    }

    /// Source-level check that `health_check_loop` mentions an `AtomicBool`
    /// cancellation flag and consults it while waiting.
    #[test]
    fn test_health_check_loop_supports_cancellation() {
        let source = include_str!("health.rs");
        let fn_body = source_window(source, "fn health_check_loop", 2500);
        assert!(
            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
            "health_check_loop must accept an AtomicBool cancellation flag"
        );
        assert!(
            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
            "health_check_loop must check cancellation during sleep intervals"
        );
    }

    /// Source-level check that neither probe path spawns `nsenter` via `Command`.
    #[test]
    fn test_runtime_probes_do_not_spawn_host_nsenter() {
        let source = include_str!("health.rs");

        let readiness_body = source_window(source, "fn run_readiness_probe", 2500);
        assert!(
            !readiness_body.contains("Command::new(&nsenter_bin)"),
            "readiness probes must not execute via host nsenter"
        );

        let health_body = source_window(source, "fn health_check_loop", 2200);
        assert!(
            !health_body.contains("Command::new(&nsenter_bin)"),
            "health checks must not execute via host nsenter"
        );
    }

    /// Source-level check that the start of `prepare_oci_mountpoints` has no `.expect(`.
    #[test]
    fn test_oci_mount_strip_prefix_no_expect() {
        let source = include_str!("gvisor_setup.rs");
        let fn_body = source_window(source, "fn prepare_oci_mountpoints", 600);
        assert!(
            !fn_body.contains(".expect("),
            "prepare_oci_mountpoints must not use expect() — return Err instead"
        );
    }

    /// Source-level check that `notify_namespace_ready` looks at the write length.
    #[test]
    fn test_notify_namespace_ready_validates_write_length() {
        let source = include_str!("runtime.rs");
        let fn_body = source_window(source, "fn notify_namespace_ready", 500);
        // NOTE(review): the bare "4" alternative matches any digit 4 inside
        // the window, which makes this check weak; consider tightening it.
        assert!(
            fn_body.contains("written")
                || fn_body.contains("4")
                || fn_body.contains("payload.len()"),
            "notify_namespace_ready must validate complete write of all 4 bytes"
        );
    }

    /// Source-level check that the RLIMIT section returns an error in production mode.
    #[test]
    fn test_rlimit_failures_fatal_in_production() {
        let source = include_str!("runtime.rs");
        let rlimit_section = source_window(source, "12b. RLIMIT backstop", 2000);
        assert!(
            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
            "RLIMIT failures must return Err in production mode"
        );
    }

    /// Source-level check that the TCP readiness probe does not rely on `/dev/tcp`.
    #[test]
    fn test_tcp_readiness_probe_uses_portable_check() {
        let source = include_str!("health.rs");
        let probe_body = source_window(source, "TcpPort(port)", 500);
        assert!(
            !probe_body.contains("/dev/tcp"),
            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
        );
    }
}