1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams, OciStatus,
4 ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets, mount_secrets_inmemory, mount_volumes,
10 snapshot_context_dir, switch_root, verify_context_manifest, verify_rootfs_attestation,
11 FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::NamespaceManager;
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager, OciContainerState, OciHooks, SeccompManager,
18 SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::OwnedFd;
26use std::path::PathBuf;
27use std::sync::atomic::{AtomicBool, Ordering};
28use std::sync::Arc;
29use std::thread::JoinHandle;
30use tempfile::Builder;
31use tracing::{debug, error, info, info_span, warn};
32
/// A configured container that has not been created yet.
///
/// Entry points are [`Container::run`] (create and start in one call) and
/// [`Container::create`] (create only, start later).
pub struct Container {
    /// Configuration the container is built from.
    pub(super) config: ContainerConfig,
    /// Path to the `runsc` binary. `None` when built through `new`; filled in
    /// for the forked child when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
47
/// A container whose child process has been forked and registered, but whose
/// lifecycle has not yet been driven to completion by [`CreatedContainer::start`].
pub struct CreatedContainer {
    /// Effective configuration after the creation-time guards were applied.
    pub(super) config: ContainerConfig,
    /// State store used to persist and later delete the container state.
    pub(super) state_mgr: ContainerStateManager,
    /// Persisted state record; saved with status `Created` during creation.
    pub(super) state: ContainerState,
    /// PID of the directly forked child (the process that is waited on).
    pub(super) child: Pid,
    /// Cgroup holding the container process, if one could be created.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network handle, if bridge networking was set up.
    pub(super) bridge_net: Option<BridgeNetwork>,
    /// Seccomp trace reader, if one was started for this container.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Path of the exec FIFO when exec was deferred until start.
    pub(super) exec_fifo_path: Option<PathBuf>,
    /// Tracing span entered again while the container is started.
    pub(super) _lifecycle_span: tracing::Span,
}
62
63impl Container {
64 pub fn new(config: ContainerConfig) -> Self {
65 Self {
66 config,
67 runsc_path: None,
68 }
69 }
70
71 pub fn run(&self) -> Result<i32> {
73 self.create_internal(false)?.start()
74 }
75
76 pub fn create(&self) -> Result<CreatedContainer> {
80 self.create_internal(true)
81 }
82
83 fn sanitize_fds() {
88 const CLOSE_RANGE_CLOEXEC: libc::c_uint = 4;
91 let ret =
93 unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, CLOSE_RANGE_CLOEXEC) };
94 if ret == 0 {
95 return;
96 }
97 if let Ok(entries) = std::fs::read_dir("/proc/self/fd") {
99 for entry in entries.flatten() {
100 if let Ok(fd_str) = entry.file_name().into_string() {
101 if let Ok(fd) = fd_str.parse::<i32>() {
102 if fd > 2 {
103 unsafe { libc::close(fd) };
104 }
105 }
106 }
107 }
108 }
109 }
110
    /// Shared implementation behind [`Container::run`] and [`Container::create`].
    ///
    /// Validates and adjusts a clone of the configuration, prepares host-side
    /// resources (state manager, optional exec FIFO, cgroup, sync pipe), then
    /// forks. The parent records state and wires up cgroup/network for the
    /// child; the child runs `setup_and_exec` and never returns from it.
    ///
    /// When `defer_exec_until_start` is true an exec FIFO is created and its
    /// path is passed to the child and stored in the returned value.
    ///
    /// # Errors
    /// Returns an error if validation fails, a container with the same name
    /// already has state, a required resource cannot be created, or parent-side
    /// setup fails (in which case the child is killed and reaped).
    fn create_internal(&self, defer_exec_until_start: bool) -> Result<CreatedContainer> {
        let lifecycle_span = info_span!(
            "container.lifecycle",
            container.id = %self.config.id,
            container.name = %self.config.name,
            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
        );
        let _enter = lifecycle_span.enter();

        info!(
            "Creating container: {} (ID: {})",
            self.config.name, self.config.id
        );
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::ContainerStart,
            format!(
                "command={:?} mode={:?} runtime={}",
                crate::audit::redact_command(&self.config.command),
                self.config.service_mode,
                if self.config.use_gvisor {
                    "gvisor"
                } else {
                    "native"
                }
            ),
        );

        let is_root = nix::unistd::Uid::effective().is_root();
        // Work on a clone: the guards below mutate the effective configuration.
        let mut config = self.config.clone();

        // Non-root callers without an explicit user namespace config get the
        // rootless mapping automatically.
        if !is_root && config.user_ns_config.is_none() {
            info!("Not running as root, automatically enabling rootless mode");
            config.namespaces.user = true;
            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
        }

        // Root without a user namespace: remap in production, only warn otherwise.
        if is_root && !config.namespaces.user {
            if config.service_mode == ServiceMode::Production {
                info!("Running as root in production mode: enabling user namespace with UID remapping");
                config.namespaces.user = true;
                config.user_ns_config =
                    Some(crate::isolation::UserNamespaceConfig::root_remapped());
            } else {
                warn!(
                    "Running as root WITHOUT user namespace isolation. \
                     Container processes will run as real host UID 0. \
                     Use --user-ns or production mode for UID remapping."
                );
            }
        }

        if let Some(ref socket_path) = config.console_socket {
            warn!(
                "Console socket {} accepted but terminal forwarding is not yet implemented",
                socket_path.display()
            );
        }

        config.validate_production_mode()?;
        Self::assert_kernel_lockdown(&config)?;

        Self::apply_network_mode_guards(&mut config, is_root)?;
        Self::apply_trust_level_guards(&mut config)?;
        config.validate_runtime_support()?;

        // Bridge networking without root: hard error in production, otherwise
        // fall back to no networking.
        if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
            if config.service_mode == ServiceMode::Production {
                return Err(NucleusError::NetworkError(
                    "Production mode with bridge networking requires root (cannot silently \
                     degrade to no networking)"
                        .to_string(),
                ));
            }
            warn!("Bridge networking requires root, degrading to no networking");
            config.network = NetworkMode::None;
        }

        let state_mgr = ContainerStateManager::new_with_root(config.state_root.clone())?;

        // Reject duplicate names. A failure to list states is ignored here and
        // the uniqueness check is simply skipped.
        if let Ok(all_states) = state_mgr.list_states() {
            if all_states.iter().any(|s| s.name == config.name) {
                return Err(NucleusError::ConfigError(format!(
                    "A container named '{}' already exists; use a different --name, \
                     or remove the stale state with 'nucleus delete'",
                    config.name
                )));
            }
        }

        // NOTE(review): once the FIFO exists, the early returns further down
        // (cgroup, runsc resolution, pipe, fork) do not remove it in this
        // function — confirm whether it is cleaned up elsewhere.
        let exec_fifo = if defer_exec_until_start {
            let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
            nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
                NucleusError::ExecError(format!(
                    "Failed to create exec FIFO {:?}: {}",
                    exec_fifo, e
                ))
            })?;
            Some(exec_fifo)
        } else {
            None
        };

        let cgroup_name = format!("nucleus-{}", config.id);
        // Cgroup failures are fatal in production; otherwise the container
        // continues without a cgroup (`None`).
        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
            Ok(mut cgroup) => {
                match cgroup.set_limits(&config.limits) {
                    Ok(_) => {
                        info!("Created cgroup with resource limits");
                        Some(cgroup)
                    }
                    Err(e) => {
                        if config.service_mode == ServiceMode::Production {
                            let _ = cgroup.cleanup();
                            return Err(NucleusError::CgroupError(format!(
                                "Production mode requires cgroup resource enforcement, but \
                                 applying limits failed: {}",
                                e
                            )));
                        }
                        warn!("Failed to set cgroup limits: {}", e);
                        let _ = cgroup.cleanup();
                        None
                    }
                }
            }
            Err(e) => {
                if config.service_mode == ServiceMode::Production {
                    return Err(NucleusError::CgroupError(format!(
                        "Production mode requires cgroup resource enforcement, but \
                         cgroup creation failed: {}",
                        e
                    )));
                }

                if config.user_ns_config.is_some() {
                    // Only warn when the user actually asked for limits.
                    if config.limits.memory_bytes.is_some()
                        || config.limits.cpu_quota_us.is_some()
                        || config.limits.pids_max.is_some()
                    {
                        warn!(
                            "Running in rootless mode: requested resource limits cannot be \
                             enforced – cgroup creation requires root ({})",
                            e
                        );
                    } else {
                        debug!("Running in rootless mode without cgroup resource limits");
                    }
                } else {
                    warn!(
                        "Failed to create cgroup (running without resource limits): {}",
                        e
                    );
                }
                None
            }
        };

        let runsc_path = if config.use_gvisor {
            Some(GVisorRuntime::resolve_path().map_err(|e| {
                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
            })?)
        } else {
            None
        };

        // Pipe over which the child reports the PID of the container process.
        let (ready_read, ready_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
        })?;

        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                // Close the parent's write end so a dead child shows up as EOF.
                drop(ready_write);
                info!("Forked child process: {}", child);

                let parent_setup = || -> Result<CreatedContainer> {
                    // `target_pid` is the PID reported by the child; it may
                    // differ from `child` (see the PID-namespace fork in
                    // `setup_and_exec`).
                    let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;

                    let cgroup_path = cgroup_opt
                        .as_ref()
                        .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
                    // quota * 1000 / period, i.e. the CPU limit in millicores.
                    let cpu_millicores = config
                        .limits
                        .cpu_quota_us
                        .map(|quota| quota.saturating_mul(1000) / config.limits.cpu_period_us);
                    let mut state = ContainerState::new(ContainerStateParams {
                        id: config.id.clone(),
                        name: config.name.clone(),
                        pid: target_pid,
                        command: config.command.clone(),
                        memory_limit: config.limits.memory_bytes,
                        cpu_limit: cpu_millicores,
                        using_gvisor: config.use_gvisor,
                        rootless: config.user_ns_config.is_some(),
                        cgroup_path,
                        process_uid: config.process_identity.uid,
                        process_gid: config.process_identity.gid,
                        additional_gids: config.process_identity.additional_gids.clone(),
                    });
                    state.config_hash = config.config_hash;
                    state.bundle_path =
                        config.rootfs_path.as_ref().map(|p| p.display().to_string());

                    let mut bridge_net: Option<BridgeNetwork> = None;
                    let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;

                    state.status = OciStatus::Created;
                    state_mgr.save_state(&state)?;

                    if let Some(ref pid_path) = config.pid_file {
                        std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                            NucleusError::ConfigError(format!(
                                "Failed to write pid-file '{}': {}",
                                pid_path.display(),
                                e
                            ))
                        })?;
                        info!("Wrote PID {} to {}", target_pid, pid_path.display());
                    }

                    if let Some(ref mut cgroup) = cgroup_opt {
                        cgroup.attach_process(target_pid)?;
                    }

                    // Bridge setup and egress policy failures are fatal only in
                    // production mode; otherwise they are logged and skipped.
                    if let NetworkMode::Bridge(ref bridge_config) = config.network {
                        match BridgeNetwork::setup_with_id(target_pid, bridge_config, &config.id) {
                            Ok(net) => {
                                if let Some(ref egress) = config.egress_policy {
                                    if let Err(e) = net.apply_egress_policy(target_pid, egress) {
                                        if config.service_mode == ServiceMode::Production {
                                            return Err(NucleusError::NetworkError(format!(
                                                "Failed to apply egress policy: {}",
                                                e
                                            )));
                                        }
                                        warn!("Failed to apply egress policy: {}", e);
                                    }
                                }
                                bridge_net = Some(net);
                            }
                            Err(e) => {
                                if config.service_mode == ServiceMode::Production {
                                    return Err(e);
                                }
                                warn!("Failed to set up bridge networking: {}", e);
                            }
                        }
                    }

                    info!(
                        "Container {} created (child pid {}), waiting for start",
                        config.id, target_pid
                    );

                    Ok(CreatedContainer {
                        config,
                        state_mgr,
                        state,
                        child,
                        cgroup_opt,
                        bridge_net,
                        trace_reader,
                        exec_fifo_path: exec_fifo,
                        _lifecycle_span: lifecycle_span.clone(),
                    })
                };

                // On any parent-side failure, kill and reap the child before
                // propagating the error.
                parent_setup().map_err(|e| {
                    let _ = kill(child, Signal::SIGKILL);
                    let _ = waitpid(child, None);
                    e
                })
            }
            ForkResult::Child => {
                drop(ready_read);
                Self::sanitize_fds();
                let temp_container = Container { config, runsc_path };
                // `setup_and_exec` is not expected to return on success; on
                // failure the child exits with status 1.
                match temp_container.setup_and_exec(Some(ready_write), exec_fifo) {
                    Ok(_) => unreachable!(),
                    Err(e) => {
                        error!("Container setup failed: {}", e);
                        std::process::exit(1);
                    }
                }
            }
        }
    }
427
428 pub fn trigger_start(container_id: &str, state_root: Option<PathBuf>) -> Result<()> {
431 let state_mgr = ContainerStateManager::new_with_root(state_root)?;
432 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
433 if !fifo_path.exists() {
434 return Err(NucleusError::ConfigError(format!(
435 "No exec FIFO found for container {}; is it in 'created' state?",
436 container_id
437 )));
438 }
439
440 let file = std::fs::File::open(&fifo_path)
442 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
443 let mut buf = [0u8; 1];
444 std::io::Read::read(&mut &file, &mut buf)
445 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
446 drop(file);
447
448 let _ = std::fs::remove_file(&fifo_path);
449
450 let mut state = state_mgr.resolve_container(container_id)?;
452 state.status = OciStatus::Running;
453 state_mgr.save_state(&state)?;
454
455 Ok(())
456 }
457
    /// Child-side setup: builds the container environment and hands control to
    /// the container command.
    ///
    /// Steps, in order: optional context snapshot, namespaces (with an extra
    /// fork when a PID namespace is requested), `no_new_privs`, filesystem
    /// construction and root switch, OCI hooks, capabilities, rlimits, seccomp,
    /// Landlock, optional exec-FIFO rendezvous, and finally exec (or the init
    /// supervisor in production mode with a PID namespace).
    ///
    /// * `ready_pipe` – write end of the sync pipe; used to report the PID of
    ///   the container process to the parent.
    /// * `exec_fifo` – when set, the process blocks on this FIFO until a reader
    ///   opens it, then sends one sync byte before continuing.
    ///
    /// With gVisor enabled, only the PID notification happens here and the rest
    /// is delegated to `setup_and_exec_gvisor`.
    fn setup_and_exec(
        &self,
        ready_pipe: Option<OwnedFd>,
        exec_fifo: Option<PathBuf>,
    ) -> Result<()> {
        let is_rootless = self.config.user_ns_config.is_some();
        let allow_degraded_security = Self::allow_degraded_security(&self.config);
        // Snapshot the context directory up front so the populated copy can be
        // verified against it later.
        let context_manifest = if self.config.verify_context_integrity {
            self.config
                .context_dir
                .as_ref()
                .map(|dir| snapshot_context_dir(dir))
                .transpose()?
        } else {
            None
        };

        // State machines tracking filesystem and security setup progress.
        let mut fs_state = FilesystemState::Unmounted;
        let mut sec_state = SecurityState::Privileged;

        if self.config.use_gvisor {
            // gVisor path: report our own PID and delegate everything else.
            if let Some(fd) = ready_pipe {
                Self::notify_namespace_ready(&fd, std::process::id())?;
            }
            return self.setup_and_exec_gvisor();
        }

        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
        if let Some(user_config) = &self.config.user_ns_config {
            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
        }
        namespace_mgr.unshare_namespaces()?;

        if self.config.namespaces.pid {
            // With a PID namespace, fork once more. The intermediate process
            // reports the new child's PID to the parent, waits for it, and
            // exits with its status; the new child continues with the setup.
            match unsafe { fork() }? {
                ForkResult::Parent { child } => {
                    if let Some(fd) = ready_pipe {
                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
                    }
                    std::process::exit(Self::wait_for_pid_namespace_child(child));
                }
                ForkResult::Child => {
                }
            }
        } else if let Some(fd) = ready_pipe {
            Self::notify_namespace_ready(&fd, std::process::id())?;
        }

        namespace_mgr.enter()?;

        self.enforce_no_new_privs()?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::NoNewPrivsSet,
            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
        );

        if let Some(hostname) = &self.config.hostname {
            namespace_mgr.set_hostname(hostname)?;
        }

        // Base directory for the per-container runtime dir: /run/nucleus for
        // root, otherwise the user's runtime dir (or the temp dir as fallback).
        let runtime_base = if nix::unistd::Uid::effective().is_root() {
            std::path::PathBuf::from("/run/nucleus")
        } else {
            dirs::runtime_dir()
                .map(|d| d.join("nucleus"))
                .unwrap_or_else(std::env::temp_dir)
        };
        // Best effort; a failure surfaces when the temp dir is created below.
        let _ = std::fs::create_dir_all(&runtime_base);
        let runtime_dir = Builder::new()
            .prefix("nucleus-runtime-")
            .tempdir_in(&runtime_base)
            .map_err(|e| {
                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
            })?;
        let container_root = runtime_dir.path().to_path_buf();
        // The size argument is 1024 * 1024 * 1024 (1 GiB).
        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024));
        tmpfs.mount()?;
        fs_state = fs_state.transition(FilesystemState::Mounted)?;

        create_minimal_fs(&container_root)?;

        let dev_path = container_root.join("dev");
        create_dev_nodes(&dev_path, false)?;

        if let Some(context_dir) = &self.config.context_dir {
            let context_dest = container_root.join("context");
            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
            if let Some(expected) = &context_manifest {
                verify_context_manifest(expected, &context_dest)?;
            }
        }
        fs_state = fs_state.transition(FilesystemState::Populated)?;

        // Either bind-mount the supplied rootfs (optionally attested first) or
        // bind-mount host paths.
        if let Some(ref rootfs_path) = self.config.rootfs_path {
            if self.config.verify_rootfs_attestation {
                verify_rootfs_attestation(rootfs_path)?;
            }
            bind_mount_rootfs(&container_root, rootfs_path)?;
        } else {
            bind_mount_host_paths(&container_root, is_rootless)?;
        }

        mount_volumes(&container_root, &self.config.volumes)?;

        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
            // With a rootfs, resolv.conf is bind-mounted; otherwise it is written.
            if self.config.rootfs_path.is_some() {
                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
            } else {
                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
            }
        }

        if self.config.service_mode == ServiceMode::Production {
            mount_secrets_inmemory(
                &container_root,
                &self.config.secrets,
                &self.config.process_identity,
            )?;
        } else {
            mount_secrets(&container_root, &self.config.secrets)?;
        }

        let proc_path = container_root.join("proc");
        let hide_pids = self.config.service_mode == ServiceMode::Production;
        mount_procfs(
            &proc_path,
            is_rootless,
            self.config.proc_readonly,
            hide_pids,
        )?;

        mask_proc_paths(
            &proc_path,
            self.config.service_mode == ServiceMode::Production,
        )?;

        // OCI createRuntime hooks run before the root switch.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_runtime.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Creating,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
            }
        }

        switch_root(&container_root, self.config.allow_chroot_fallback)?;
        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
        debug!("Filesystem state: {:?}", fs_state);

        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::MountAuditPassed,
            "all mount flags verified",
        );

        // OCI createContainer hooks run after the root switch.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Created,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
            }
        }

        // Capabilities: apply the configured policy, or drop everything.
        let mut cap_mgr = CapabilityManager::new();
        if let Some(ref policy_path) = self.config.caps_policy {
            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.caps_policy_sha256.as_deref(),
            )?;
            if self.config.service_mode == ServiceMode::Production {
                policy.validate_production()?;
            }
            policy.apply(&mut cap_mgr)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                format!("capability policy applied from {:?}", policy_path),
            );
        } else {
            cap_mgr.drop_all()?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                "all capabilities dropped including bounding set",
            );
        }
        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;

        // Resource limits. Failures are fatal in production and warnings otherwise.
        {
            let is_production = self.config.service_mode == ServiceMode::Production;

            // RLIMIT_NPROC follows pids_max, defaulting to 512.
            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
            let rlim_nproc = libc::rlimit {
                rlim_cur: nproc_limit,
                rlim_max: nproc_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
                        nproc_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
            }

            let rlim_nofile = libc::rlimit {
                rlim_cur: 1024,
                rlim_max: 1024,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
                        err
                    )));
                }
                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
            }

            // 64 * 1024 bytes (64 KiB) of lockable memory.
            let memlock_limit: u64 = 64 * 1024;
            let rlim_memlock = libc::rlimit {
                rlim_cur: memlock_limit,
                rlim_max: memlock_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
                        memlock_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
            }
        }

        CapabilityManager::verify_no_namespace_caps(
            self.config.service_mode == ServiceMode::Production,
        )?;

        use crate::container::config::SeccompMode;
        // Seccomp: trace filter, an explicit profile file, or the built-in
        // filter selected by whether networking is enabled.
        let mut seccomp_mgr = SeccompManager::new();
        let allow_network = !matches!(self.config.network, NetworkMode::None);
        let seccomp_applied = match self.config.seccomp_mode {
            SeccompMode::Trace => {
                audit(
                    &self.config.id,
                    &self.config.name,
                    AuditEventType::SeccompApplied,
                    "seccomp trace mode: allow-all + LOG",
                );
                seccomp_mgr.apply_trace_filter()?
            }
            SeccompMode::Enforce => {
                if let Some(ref profile_path) = self.config.seccomp_profile {
                    audit(
                        &self.config.id,
                        &self.config.name,
                        AuditEventType::SeccompProfileLoaded,
                        format!("path={:?}", profile_path),
                    );
                    seccomp_mgr.apply_profile_from_file(
                        profile_path,
                        self.config.seccomp_profile_sha256.as_deref(),
                        self.config.seccomp_log_denied,
                    )?
                } else {
                    seccomp_mgr.apply_filter_for_network_mode(
                        allow_network,
                        allow_degraded_security,
                        self.config.seccomp_log_denied,
                    )?
                }
            }
        };
        if seccomp_applied {
            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::SeccompApplied,
                format!("network={}", allow_network),
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::SeccompError(
                "Seccomp filter is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Seccomp not enforced; container is running with degraded hardening");
        }

        // Landlock: configured policy file, or the default container policy.
        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.landlock_policy_sha256.as_deref(),
            )?;
            if self.config.service_mode == ServiceMode::Production {
                policy.validate_production()?;
            }
            policy.apply(allow_degraded_security)?
        } else {
            let mut landlock_mgr = LandlockManager::new();
            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
        };
        // The state is only locked when both layers are active and seccomp is
        // enforcing rather than tracing.
        if seccomp_applied && landlock_applied {
            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
            if self.config.seccomp_mode == SeccompMode::Trace {
                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
            } else {
                sec_state = sec_state.transition(SecurityState::Locked)?;
            }
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::LandlockApplied,
                if self.config.seccomp_mode == SeccompMode::Trace {
                    "landlock applied, but seccomp in trace mode — not locked".to_string()
                } else {
                    "security state locked: all hardening layers active".to_string()
                },
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::LandlockError(
                "Landlock policy is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Security state not locked; one or more hardening controls are inactive");
        }
        debug!("Security state: {:?}", sec_state);

        if let Some(ref fifo_path) = exec_fifo {
            // Opening a FIFO write-only blocks until a reader opens it, so this
            // is where a deferred start waits.
            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
            let file = std::fs::OpenOptions::new()
                .write(true)
                .open(fifo_path)
                .map_err(|e| {
                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
                })?;
            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
            })?;
            drop(file);
            debug!("Exec FIFO released, proceeding to exec");
        }

        if let Some(ref hooks) = self.config.hooks {
            if !hooks.start_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Running,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
            }
        }

        // Production mode inside a PID namespace runs via `run_as_init`
        // instead of exec'ing the command directly.
        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
            return self.run_as_init();
        }

        self.exec_command()?;

        Ok(())
    }
911
912 pub(super) fn setup_signal_forwarding_static(
917 child: Pid,
918 ) -> Result<(Arc<AtomicBool>, JoinHandle<()>)> {
919 let mut set = SigSet::empty();
920 for signal in [
921 Signal::SIGTERM,
922 Signal::SIGINT,
923 Signal::SIGHUP,
924 Signal::SIGQUIT,
925 Signal::SIGUSR1,
926 Signal::SIGUSR2,
927 ] {
928 set.add(signal);
929 }
930
931 let unblock_set = set;
932 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&unblock_set), None).map_err(|e| {
933 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
934 })?;
935
936 let stop = Arc::new(AtomicBool::new(false));
937 let stop_clone = stop.clone();
938 let handle = std::thread::Builder::new()
939 .name("sig-forward".to_string())
940 .spawn(move || {
941 loop {
943 if let Ok(signal) = unblock_set.wait() {
944 if stop_clone.load(Ordering::Relaxed) {
948 break;
949 }
950 let _ = kill(child, signal);
951 }
952 }
953 })
954 .map_err(|e| {
955 let mut restore = SigSet::empty();
958 for signal in [
959 Signal::SIGTERM,
960 Signal::SIGINT,
961 Signal::SIGHUP,
962 Signal::SIGQUIT,
963 Signal::SIGUSR1,
964 Signal::SIGUSR2,
965 ] {
966 restore.add(signal);
967 }
968 let _ = pthread_sigmask(SigmaskHow::SIG_UNBLOCK, Some(&restore), None);
969 NucleusError::ExecError(format!("Failed to spawn signal thread: {}", e))
970 })?;
971
972 info!("Signal forwarding configured");
973 Ok((stop, handle))
974 }
975
976 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
978 loop {
979 match waitpid(child, None) {
980 Ok(WaitStatus::Exited(_, code)) => {
981 return Ok(code);
982 }
983 Ok(WaitStatus::Signaled(_, signal, _)) => {
984 info!("Child killed by signal: {:?}", signal);
985 return Ok(128 + signal as i32);
986 }
987 Err(nix::errno::Errno::EINTR) => {
988 continue;
989 }
990 Err(e) => {
991 return Err(NucleusError::ExecError(format!(
992 "Failed to wait for child: {}",
993 e
994 )));
995 }
996 _ => {
997 continue;
998 }
999 }
1000 }
1001 }
1002
1003 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
1004 let mut pid_buf = [0u8; 4];
1005 loop {
1006 match read(ready_read, &mut pid_buf) {
1007 Err(nix::errno::Errno::EINTR) => continue,
1008 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
1009 Ok(0) => {
1010 return Err(NucleusError::ExecError(format!(
1011 "Child {} exited before namespace initialization",
1012 child
1013 )))
1014 }
1015 Ok(_) => {
1016 return Err(NucleusError::ExecError(
1017 "Invalid namespace sync payload from child".to_string(),
1018 ))
1019 }
1020 Err(e) => {
1021 return Err(NucleusError::ExecError(format!(
1022 "Failed waiting for child namespace setup: {}",
1023 e
1024 )))
1025 }
1026 }
1027 }
1028 }
1029
1030 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
1031 let payload = pid.to_ne_bytes();
1032 let mut written = 0;
1033 while written < payload.len() {
1034 let n = write(fd, &payload[written..]).map_err(|e| {
1035 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
1036 })?;
1037 if n == 0 {
1038 return Err(NucleusError::ExecError(
1039 "Failed to notify namespace readiness: short write".to_string(),
1040 ));
1041 }
1042 written += n;
1043 }
1044 Ok(())
1045 }
1046
1047 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
1048 loop {
1049 match waitpid(child, None) {
1050 Ok(WaitStatus::Exited(_, code)) => return code,
1051 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
1052 Err(nix::errno::Errno::EINTR) => continue,
1053 Err(_) => return 1,
1054 _ => continue,
1055 }
1056 }
1057 }
1058}
1059
1060impl CreatedContainer {
    /// Starts the created container, waits for it to finish and tears
    /// everything down.
    ///
    /// Sequence: release the exec FIFO (if exec was deferred), persist status
    /// `Running`, set up signal forwarding, run the readiness probe, start the
    /// health-check thread, run poststart hooks, wait for the child, then stop
    /// helper threads, run poststop hooks and clean up network, cgroup, gVisor
    /// artifacts and persisted state.
    ///
    /// Returns the exit code produced by `wait_for_child_static`.
    ///
    /// NOTE(review): every `?` before the wait (FIFO, state save, signal
    /// forwarding, readiness probe, poststart hooks) returns without reaching
    /// the cleanup section below — confirm whether the caller handles teardown
    /// in those cases.
    pub fn start(mut self) -> Result<i32> {
        let config = &self.config;
        let _enter = self._lifecycle_span.enter();

        if let Some(exec_fifo_path) = &self.exec_fifo_path {
            // Read the single sync byte the container process writes once it
            // is ready to exec.
            let file = std::fs::File::open(exec_fifo_path).map_err(|e| {
                NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
            })?;
            let mut buf = [0u8; 1];
            let read = std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
                NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
            })?;
            // Anything other than one byte means the writer went away first.
            if read != 1 {
                return Err(NucleusError::ExecError(
                    "Exec FIFO closed before start signal was delivered".to_string(),
                ));
            }
            let _ = std::fs::remove_file(exec_fifo_path);
        }

        self.state.status = OciStatus::Running;
        self.state_mgr.save_state(&self.state)?;

        // `target_pid` is the PID recorded in the state (signal/probe target);
        // `child` is the directly forked process that is waited on.
        let target_pid = self.state.pid;
        let child = self.child;

        let (sig_stop, sig_handle) =
            Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;

        // Guard stops and joins the forwarding thread on drop.
        let mut sig_guard = SignalThreadGuard {
            stop: Some(sig_stop),
            handle: Some(sig_handle),
        };

        if let Some(ref probe) = config.readiness_probe {
            // NOTIFY_SOCKET is only consulted when sd_notify is enabled.
            let notify_socket = if config.sd_notify {
                std::env::var("NOTIFY_SOCKET").ok()
            } else {
                None
            };
            Container::run_readiness_probe(
                target_pid,
                &config.name,
                probe,
                config.user_ns_config.is_some(),
                config.use_gvisor,
                &config.process_identity,
                notify_socket.as_deref(),
            )?;
        }

        let cancel_flag = Arc::new(AtomicBool::new(false));
        // The health-check thread is only spawned for a non-empty command.
        let health_handle = if let Some(ref hc) = config.health_check {
            if !hc.command.is_empty() {
                let hc = hc.clone();
                let pid = target_pid;
                let container_name = config.name.clone();
                let rootless = config.user_ns_config.is_some();
                let using_gvisor = config.use_gvisor;
                let process_identity = config.process_identity.clone();
                let cancel = cancel_flag.clone();
                Some(std::thread::spawn(move || {
                    Container::health_check_loop(
                        pid,
                        &container_name,
                        rootless,
                        using_gvisor,
                        &hc,
                        &process_identity,
                        &cancel,
                    );
                }))
            } else {
                None
            }
        } else {
            None
        };

        // Guard cancels and joins the health-check thread on drop.
        let mut health_guard = HealthThreadGuard {
            cancel: Some(cancel_flag),
            handle: health_handle,
        };

        if let Some(ref hooks) = config.hooks {
            if !hooks.poststart.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Running,
                    pid: target_pid,
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
            }
        }

        // Tracks whether the child was reaped, so cleanup knows whether it
        // still has to kill and wait for it.
        let mut child_waited = false;
        let run_result: Result<i32> = (|| {
            let exit_code = Container::wait_for_child_static(child)?;

            self.state.status = OciStatus::Stopped;
            // Best effort: the state is deleted further down anyway.
            let _ = self.state_mgr.save_state(&self.state);

            child_waited = true;
            Ok(exit_code)
        })();

        // From here on everything is cleanup and runs regardless of `run_result`.
        health_guard.stop();
        sig_guard.stop();

        if let Some(ref hooks) = config.hooks {
            if !hooks.poststop.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: config.id.clone(),
                    status: OciStatus::Stopped,
                    pid: 0,
                    bundle: String::new(),
                };
                OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
            }
        }

        if let Some(net) = self.bridge_net.take() {
            if let Err(e) = net.cleanup() {
                warn!("Failed to cleanup bridge networking: {}", e);
            }
        }

        // Waiting failed: make sure the child is gone and reaped.
        if !child_waited {
            let _ = kill(child, Signal::SIGKILL);
            let _ = waitpid(child, None);
        }

        if let Some(reader) = self.trace_reader.take() {
            reader.stop_and_flush();
        }

        if let Some(cgroup) = self.cgroup_opt.take() {
            if let Err(e) = cgroup.cleanup() {
                warn!("Failed to cleanup cgroup: {}", e);
            }
        }

        if config.use_gvisor {
            if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
                warn!(
                    "Failed to cleanup gVisor artifacts for {}: {}",
                    config.id, e
                );
            }
        }

        if let Err(e) = self.state_mgr.delete_state(&config.id) {
            warn!("Failed to delete state for {}: {}", config.id, e);
        }

        match run_result {
            Ok(exit_code) => {
                audit(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("exit_code={}", exit_code),
                );
                info!(
                    "Container {} ({}) exited with code {}",
                    config.name, config.id, exit_code
                );
                Ok(exit_code)
            }
            Err(e) => {
                audit_error(
                    &config.id,
                    &config.name,
                    AuditEventType::ContainerStop,
                    format!("error={}", e),
                );
                Err(e)
            }
        }
    }
1258}
1259
1260struct SignalThreadGuard {
1262 stop: Option<Arc<AtomicBool>>,
1263 handle: Option<JoinHandle<()>>,
1264}
1265
1266impl SignalThreadGuard {
1267 fn stop(&mut self) {
1268 if let Some(flag) = self.stop.take() {
1269 flag.store(true, Ordering::Relaxed);
1270 let _ = kill(Pid::this(), Signal::SIGUSR1);
1272 }
1273 if let Some(handle) = self.handle.take() {
1274 let _ = handle.join();
1275 }
1276 }
1277}
1278
1279impl Drop for SignalThreadGuard {
1280 fn drop(&mut self) {
1281 self.stop();
1282 }
1283}
1284
/// Shutdown handle for the background health-check thread.
///
/// Holds the cancellation flag shared with the thread and the thread's join handle.
/// Dropping the guard performs the same shutdown as calling `stop` explicitly.
struct HealthThreadGuard {
    /// Cancellation flag shared with the thread. `None` once cancellation has been requested.
    cancel: Option<Arc<AtomicBool>>,
    /// Join handle of the thread. `None` when no thread was spawned or it was already joined.
    handle: Option<JoinHandle<()>>,
}

impl HealthThreadGuard {
    /// Sets the cancellation flag and blocks until the thread has exited.
    ///
    /// Both fields are emptied, so repeated calls (including the one made from `Drop` after
    /// an explicit `stop`) do nothing. A panic inside the thread is discarded.
    fn stop(&mut self) {
        let pending_cancel = self.cancel.take();
        let pending_join = self.handle.take();

        // Raise the flag before joining, otherwise the join could wait forever.
        if let Some(cancel) = pending_cancel {
            cancel.store(true, Ordering::Relaxed);
        }
        if let Some(worker) = pending_join {
            let _ = worker.join();
        }
    }
}

impl Drop for HealthThreadGuard {
    /// Ensures the thread is cancelled and joined even on early-return or unwinding paths.
    fn drop(&mut self) {
        self.stop();
    }
}
1307
#[cfg(test)]
mod tests {
    use super::*;
    use crate::container::KernelLockdownMode;
    use crate::network::NetworkMode;

    /// Environment variable exercised by the degraded-security tests.
    const DEGRADED_SECURITY_ENV: &str = "NUCLEUS_ALLOW_DEGRADED_SECURITY";

    /// Returns at most `max_len` bytes of `source`, starting at the first occurrence of `anchor`.
    ///
    /// The window is clamped to the end of `source` and shortened to the nearest UTF-8
    /// character boundary, so a short file or non-ASCII text shortens the window instead of
    /// panicking inside the slice expression.
    ///
    /// # Panics
    ///
    /// Panics, naming the anchor, when `anchor` does not occur in `source`.
    fn source_window<'a>(source: &'a str, anchor: &str, max_len: usize) -> &'a str {
        let start = source
            .find(anchor)
            .unwrap_or_else(|| panic!("anchor {:?} not found in source", anchor));
        let mut end = start.saturating_add(max_len).min(source.len());
        // `start` is always a character boundary, so this loop stops at `start` at the latest.
        while !source.is_char_boundary(end) {
            end -= 1;
        }
        &source[start..end]
    }

    /// Sets an environment variable for the lifetime of the guard and puts the previous value
    /// back (or removes the variable) on drop, which also covers a panicking assertion.
    struct EnvVarGuard {
        key: &'static str,
        previous: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        /// Remembers the current value of `key`, then sets it to `value`.
        fn set(key: &'static str, value: &str) -> Self {
            let previous = std::env::var_os(key);
            std::env::set_var(key, value);
            Self { key, previous }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            match self.previous.take() {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// The window helper must clamp instead of panicking on short or non-ASCII sources.
    #[test]
    fn test_source_window_clamps_to_source_end_and_char_boundary() {
        assert_eq!(source_window("fn a() {}", "fn a", 4), "fn a");
        assert_eq!(source_window("fn a() {}", "fn a", 500), "fn a() {}");
        // 'é' occupies two bytes; a window that would end inside it is shortened.
        assert_eq!(source_window("fn é", "fn", 4), "fn ");
    }

    /// A config built without a name gets a non-empty id, keeps its command, and enables gVisor.
    #[test]
    fn test_container_config() {
        let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!config.id.is_empty());
        assert_eq!(config.command, vec!["/bin/sh"]);
        assert!(config.use_gvisor);
    }

    /// Inspects the text of `run()` to make sure it calls `create_internal(false)` and does
    /// not chain `create()` and `start()`.
    #[test]
    fn test_run_uses_immediate_start_path() {
        let source = include_str!("runtime.rs");
        let fn_start = source.find("pub fn run(&self) -> Result<i32>").unwrap();
        let after = &source[fn_start..];
        let open = after.find('{').unwrap();

        // Purely textual brace matching: braces inside string literals or comments are
        // counted too. If the braces never balance, only the signature is inspected.
        let mut depth = 0u32;
        let mut fn_end = open;
        for (i, ch) in after[open..].char_indices() {
            match ch {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        fn_end = open + i + 1;
                        break;
                    }
                }
                _ => {}
            }
        }

        let run_body = &after[..fn_end];
        assert!(
            run_body.contains("create_internal(false)"),
            "run() must bypass deferred exec FIFO startup to avoid cross-root deadlocks"
        );
        assert!(
            !run_body.contains("self.create()?.start()"),
            "run() must not route through create()+start()"
        );
    }

    /// An explicit name is stored as given and is distinct from the generated id.
    #[test]
    fn test_container_config_with_name() {
        let config =
            ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
                .unwrap();
        assert_eq!(config.name, "mycontainer");
        assert!(!config.id.is_empty());
        assert_ne!(config.id, config.name);
    }

    /// Degraded security is off by default and on only after the explicit builder call.
    #[test]
    fn test_allow_degraded_security_requires_explicit_config() {
        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let relaxed = strict.clone().with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&relaxed));
    }

    /// Setting the environment variable alone must not enable degraded security.
    #[test]
    fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
        // Restored on drop, so a failing assertion cannot leak the variable into other tests.
        let _env = EnvVarGuard::set(DEGRADED_SECURITY_ENV, "1");

        let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
        assert!(!Container::allow_degraded_security(&strict));

        let explicit = strict.with_allow_degraded_security(true);
        assert!(Container::allow_degraded_security(&explicit));
    }

    /// Host networking without the opt-in flag is rejected with a network error.
    #[test]
    fn test_host_network_requires_explicit_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(false);
        let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
        assert!(matches!(err, NucleusError::NetworkError(_)));
    }

    /// Host networking with the opt-in flag turns the network namespace off.
    #[test]
    fn test_host_network_opt_in_disables_net_namespace() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::Host)
            .with_allow_host_network(true);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(!config.namespaces.net);
    }

    /// Non-host network modes pass the guard and keep the network namespace enabled.
    #[test]
    fn test_non_host_network_does_not_require_host_opt_in() {
        let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
            .unwrap()
            .with_network(NetworkMode::None)
            .with_allow_host_network(false);
        assert!(config.namespaces.net);
        Container::apply_network_mode_guards(&mut config, true).unwrap();
        assert!(config.namespaces.net);
    }

    /// The bracketed entry selects the active lockdown mode; `[none]` maps to `None`.
    #[test]
    fn test_parse_kernel_lockdown_mode() {
        assert_eq!(
            Container::parse_active_lockdown_mode("none [integrity] confidentiality"),
            Some(KernelLockdownMode::Integrity)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("none integrity [confidentiality]"),
            Some(KernelLockdownMode::Confidentiality)
        );
        assert_eq!(
            Container::parse_active_lockdown_mode("[none] integrity"),
            None
        );
    }

    /// Staged secrets point below the stage directory and keep the original contents.
    #[test]
    fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
        let temp = tempfile::TempDir::new().unwrap();
        let source = temp.path().join("source-secret");
        std::fs::write(&source, "supersecret").unwrap();

        let staged = Container::stage_gvisor_secret_files(
            &temp.path().join("stage"),
            &[crate::container::SecretMount {
                source,
                dest: std::path::PathBuf::from("/etc/app/secret.txt"),
                mode: 0o400,
            }],
            &crate::container::ProcessIdentity::root(),
        )
        .unwrap();

        assert_eq!(staged.len(), 1);
        assert!(staged[0].source.starts_with(temp.path().join("stage")));
        assert_eq!(
            std::fs::read_to_string(&staged[0].source).unwrap(),
            "supersecret"
        );
    }

    /// Cleanup removes the whole artifact directory of the given container id.
    #[test]
    fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
        let artifact_dir = Container::gvisor_artifact_dir("cleanup-test");
        std::fs::create_dir_all(&artifact_dir).unwrap();
        std::fs::write(artifact_dir.join("config.json"), "{}").unwrap();

        Container::cleanup_gvisor_artifacts("cleanup-test").unwrap();
        assert!(!artifact_dir.exists());
    }

    /// Textual check that `health_check_loop` takes and polls a cancellation flag.
    #[test]
    fn test_health_check_loop_supports_cancellation() {
        let source = include_str!("health.rs");
        let fn_body = source_window(source, "fn health_check_loop", 2500);
        assert!(
            fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
            "health_check_loop must accept an AtomicBool cancellation flag"
        );
        assert!(
            fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
            "health_check_loop must check cancellation during sleep intervals"
        );
    }

    /// Textual check that neither probe path spawns the host `nsenter` binary.
    #[test]
    fn test_runtime_probes_do_not_spawn_host_nsenter() {
        let source = include_str!("health.rs");

        let readiness_body = source_window(source, "fn run_readiness_probe", 2500);
        assert!(
            !readiness_body.contains("Command::new(&nsenter_bin)"),
            "readiness probes must not execute via host nsenter"
        );

        let health_body = source_window(source, "fn health_check_loop", 2200);
        assert!(
            !health_body.contains("Command::new(&nsenter_bin)"),
            "health checks must not execute via host nsenter"
        );
    }

    /// Textual check that the start of `prepare_oci_mountpoints` contains no `.expect(`.
    #[test]
    fn test_oci_mount_strip_prefix_no_expect() {
        let source = include_str!("gvisor_setup.rs");
        let fn_body = source_window(source, "fn prepare_oci_mountpoints", 600);
        assert!(
            !fn_body.contains(".expect("),
            "prepare_oci_mountpoints must not use expect() — return Err instead"
        );
    }

    /// Textual check that `notify_namespace_ready` looks at the number of bytes written.
    #[test]
    fn test_notify_namespace_ready_validates_write_length() {
        let source = include_str!("runtime.rs");
        let fn_body = source_window(source, "fn notify_namespace_ready", 500);
        // NOTE(review): `contains("4")` matches any digit 4 in the window, which makes this
        // assertion close to vacuous — consider matching the actual length comparison.
        assert!(
            fn_body.contains("written")
                || fn_body.contains("4")
                || fn_body.contains("payload.len()"),
            "notify_namespace_ready must validate complete write of all 4 bytes"
        );
    }

    /// Textual check that the RLIMIT section returns an error in production mode.
    #[test]
    fn test_rlimit_failures_fatal_in_production() {
        let source = include_str!("runtime.rs");
        let rlimit_section = source_window(source, "12b. RLIMIT backstop", 2000);
        assert!(
            rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
            "RLIMIT failures must return Err in production mode"
        );
    }

    /// Textual check that the TCP readiness probe does not rely on bash's `/dev/tcp`.
    #[test]
    fn test_tcp_readiness_probe_uses_portable_check() {
        let source = include_str!("health.rs");
        let probe_body = source_window(source, "TcpPort(port)", 500);
        assert!(
            !probe_body.contains("/dev/tcp"),
            "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
        );
    }
}