1use crate::audit::{audit, audit_error, AuditEventType};
2use crate::container::{
3 ContainerConfig, ContainerState, ContainerStateManager, ContainerStateParams,
4 OciStatus, ServiceMode,
5};
6use crate::error::{NucleusError, Result, StateTransition};
7use crate::filesystem::{
8 audit_mounts, bind_mount_host_paths, bind_mount_rootfs, create_dev_nodes, create_minimal_fs,
9 mask_proc_paths, mount_procfs, mount_secrets, mount_secrets_inmemory,
10 snapshot_context_dir, switch_root, verify_context_manifest,
11 verify_rootfs_attestation, FilesystemState, LazyContextPopulator, TmpfsMount,
12};
13use crate::isolation::{NamespaceManager};
14use crate::network::{BridgeNetwork, NetworkMode};
15use crate::resources::Cgroup;
16use crate::security::{
17 CapabilityManager, GVisorRuntime, LandlockManager,
18 OciContainerState, OciHooks, SeccompManager, SeccompTraceReader, SecurityState,
19};
20use nix::sys::signal::{kill, Signal};
21use nix::sys::signal::{pthread_sigmask, SigSet, SigmaskHow};
22use nix::sys::stat::Mode;
23use nix::sys::wait::{waitpid, WaitStatus};
24use nix::unistd::{fork, pipe, read, write, ForkResult, Pid};
25use std::os::fd::{AsRawFd, OwnedFd};
26use std::path::PathBuf;
27use tempfile::Builder;
28use tracing::{debug, error, info, info_span, warn};
29
/// Entry point for creating and running a container from a [`ContainerConfig`].
pub struct Container {
    /// Configuration the container is created from; `create` works on a clone of it.
    pub(super) config: ContainerConfig,
    /// Path to the `runsc` binary. `new` leaves this as `None`; `create` fills it in
    /// for the forked child when gVisor is enabled.
    pub(super) runsc_path: Option<String>,
}
44
/// A container that has been created (child forked and set up) but not yet started.
///
/// Returned by `Container::create` and consumed by `start`.
pub struct CreatedContainer {
    /// Effective configuration after the adjustments made during `create`.
    pub(super) config: ContainerConfig,
    /// State manager used to persist and delete the container's state record.
    pub(super) state_mgr: ContainerStateManager,
    /// State record that is saved on each status change.
    pub(super) state: ContainerState,
    /// PID of the directly forked child; this is the process `start` waits on.
    pub(super) child: Pid,
    /// Cgroup holding the container, if one could be created.
    pub(super) cgroup_opt: Option<Cgroup>,
    /// Bridge network, if bridge networking was set up successfully.
    pub(super) bridge_net: Option<BridgeNetwork>,
    /// Seccomp trace reader, if one was started during `create`.
    pub(super) trace_reader: Option<SeccompTraceReader>,
    /// Path of the exec FIFO used for the create/start handshake with the child.
    pub(super) exec_fifo_path: PathBuf,
    /// Lifecycle tracing span created in `create`; entered again while `start` runs.
    pub(super) _lifecycle_span: tracing::Span,
}
59
60impl Container {
61 pub fn new(config: ContainerConfig) -> Self {
62 Self {
63 config,
64 runsc_path: None,
65 }
66 }
67
68 pub fn run(&self) -> Result<i32> {
70 self.create()?.start()
71 }
72
    /// Creates the container without starting its workload.
    ///
    /// Validates and adjusts a clone of the configuration, creates the exec FIFO and,
    /// when possible, a cgroup, then forks. The child runs `setup_and_exec`. The parent
    /// waits for the child to report the target PID, persists the state as `Created`,
    /// attaches the cgroup, optionally sets up bridge networking, and returns a
    /// [`CreatedContainer`].
    ///
    /// # Errors
    ///
    /// Returns an error when validation, state/FIFO setup, forking or any mandatory
    /// parent-side step fails. In `ServiceMode::Production`, cgroup, bridge-network and
    /// egress-policy failures are errors; otherwise they are logged as warnings.
    ///
    /// NOTE(review): the `?`/`return Err` paths after `fork` do not kill the child or
    /// remove the exec FIFO in this function — confirm cleanup happens elsewhere.
    pub fn create(&self) -> Result<CreatedContainer> {
        // Span for the whole lifecycle; a clone of it is stored in the returned value.
        let lifecycle_span = info_span!(
            "container.lifecycle",
            container.id = %self.config.id,
            container.name = %self.config.name,
            runtime = if self.config.use_gvisor { "gvisor" } else { "native" }
        );
        let _enter = lifecycle_span.enter();

        info!(
            "Creating container: {} (ID: {})",
            self.config.name, self.config.id
        );
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::ContainerStart,
            format!(
                "command={:?} mode={:?} runtime={}",
                self.config.command,
                self.config.service_mode,
                if self.config.use_gvisor {
                    "gvisor"
                } else {
                    "native"
                }
            ),
        );

        let is_root = nix::unistd::Uid::effective().is_root();
        // All adjustments below are made on a private copy; `self.config` is untouched.
        let mut config = self.config.clone();

        // Without root and without an explicit user-namespace config, fall back to
        // rootless mode automatically.
        if !is_root && config.user_ns_config.is_none() {
            info!("Not running as root, automatically enabling rootless mode");
            config.namespaces.user = true;
            config.user_ns_config = Some(crate::isolation::UserNamespaceConfig::rootless());
        }

        // The console socket is accepted but only produces a warning here.
        if let Some(ref socket_path) = config.console_socket {
            warn!(
                "Console socket {} accepted but terminal forwarding is not yet implemented",
                socket_path.display()
            );
        }

        // Validation and guard passes; the two guard calls may modify `config`.
        config.validate_production_mode()?;
        Self::assert_kernel_lockdown(&config)?;

        Self::apply_network_mode_guards(&mut config, is_root)?;
        Self::apply_trust_level_guards(&mut config)?;
        config.validate_runtime_support()?;

        // Bridge networking without root: hard error in production, otherwise
        // degrade to no networking.
        if matches!(config.network, NetworkMode::Bridge(_)) && !is_root {
            if config.service_mode == ServiceMode::Production {
                return Err(NucleusError::NetworkError(
                    "Production mode with bridge networking requires root (cannot silently \
                     degrade to no networking)"
                        .to_string(),
                ));
            }
            warn!("Bridge networking requires root, degrading to no networking");
            config.network = NetworkMode::None;
        }

        let state_mgr = ContainerStateManager::new()?;

        // Reject duplicate names. If listing the states fails, the check is skipped.
        if let Ok(all_states) = state_mgr.list_states() {
            if all_states.iter().any(|s| s.name == config.name) {
                return Err(NucleusError::ConfigError(format!(
                    "A container named '{}' already exists; use a different --name, \
                     or remove the stale state with 'nucleus delete'",
                    config.name
                )));
            }
        }

        // FIFO used for the create/start handshake, readable and writable by the owner only.
        let exec_fifo = state_mgr.exec_fifo_path(&config.id)?;
        nix::unistd::mkfifo(&exec_fifo, Mode::S_IRUSR | Mode::S_IWUSR).map_err(|e| {
            NucleusError::ExecError(format!("Failed to create exec FIFO {:?}: {}", exec_fifo, e))
        })?;

        // Create the cgroup and apply limits. Failures are fatal only in production
        // mode; otherwise the container continues without a cgroup.
        let cgroup_name = format!("nucleus-{}", config.id);
        let mut cgroup_opt = match Cgroup::create(&cgroup_name) {
            Ok(mut cgroup) => {
                match cgroup.set_limits(&config.limits) {
                    Ok(_) => {
                        info!("Created cgroup with resource limits");
                        Some(cgroup)
                    }
                    Err(e) => {
                        if config.service_mode == ServiceMode::Production {
                            let _ = cgroup.cleanup();
                            return Err(NucleusError::CgroupError(format!(
                                "Production mode requires cgroup resource enforcement, but \
                                 applying limits failed: {}",
                                e
                            )));
                        }
                        warn!("Failed to set cgroup limits: {}", e);
                        let _ = cgroup.cleanup();
                        None
                    }
                }
            }
            Err(e) => {
                if config.service_mode == ServiceMode::Production {
                    return Err(NucleusError::CgroupError(format!(
                        "Production mode requires cgroup resource enforcement, but \
                         cgroup creation failed: {}",
                        e
                    )));
                }

                // In rootless mode only warn when limits were actually requested.
                if config.user_ns_config.is_some() {
                    if config.limits.memory_bytes.is_some()
                        || config.limits.cpu_quota_us.is_some()
                        || config.limits.pids_max.is_some()
                    {
                        warn!(
                            "Running in rootless mode: requested resource limits cannot be \
                             enforced – cgroup creation requires root ({})",
                            e
                        );
                    } else {
                        debug!("Running in rootless mode without cgroup resource limits");
                    }
                } else {
                    warn!(
                        "Failed to create cgroup (running without resource limits): {}",
                        e
                    );
                }
                None
            }
        };

        // Resolve the runsc binary before forking so a failure is reported by the parent.
        let runsc_path = if config.use_gvisor {
            Some(GVisorRuntime::resolve_path().map_err(|e| {
                NucleusError::GVisorError(format!("Failed to resolve runsc path: {}", e))
            })?)
        } else {
            None
        };

        // Pipe over which the child reports the PID of the process that will run the workload.
        let (ready_read, ready_write) = pipe().map_err(|e| {
            NucleusError::ExecError(format!("Failed to create namespace sync pipe: {}", e))
        })?;

        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                // Close the write end so a dying child shows up as EOF on the read end.
                drop(ready_write);
                info!("Forked child process: {}", child);

                let target_pid = Self::wait_for_namespace_ready(&ready_read, child)?;

                let cgroup_path = cgroup_opt
                    .as_ref()
                    .map(|_| format!("/sys/fs/cgroup/{}", cgroup_name));
                // Quota/period ratio scaled by 1000.
                // NOTE(review): no overflow or zero-period guard here — confirm the
                // limits are validated before this point.
                let cpu_millicores = config
                    .limits
                    .cpu_quota_us
                    .map(|quota| (quota * 1000) / config.limits.cpu_period_us);
                let mut state = ContainerState::new(ContainerStateParams {
                    id: config.id.clone(),
                    name: config.name.clone(),
                    pid: target_pid,
                    command: config.command.clone(),
                    memory_limit: config.limits.memory_bytes,
                    cpu_limit: cpu_millicores,
                    using_gvisor: config.use_gvisor,
                    rootless: config.user_ns_config.is_some(),
                    cgroup_path,
                });
                state.config_hash = config.config_hash;
                state.bundle_path = config.rootfs_path.as_ref().map(|p| p.display().to_string());

                let mut bridge_net: Option<BridgeNetwork> = None;
                let trace_reader = Self::maybe_start_seccomp_trace_reader(&config, target_pid)?;

                // Persist the state as `Created` before the remaining parent-side setup.
                state.status = OciStatus::Created;
                state_mgr.save_state(&state)?;

                if let Some(ref pid_path) = config.pid_file {
                    std::fs::write(pid_path, target_pid.to_string()).map_err(|e| {
                        NucleusError::ConfigError(format!(
                            "Failed to write pid-file '{}': {}",
                            pid_path.display(),
                            e
                        ))
                    })?;
                    info!("Wrote PID {} to {}", target_pid, pid_path.display());
                }

                if let Some(ref mut cgroup) = cgroup_opt {
                    cgroup.attach_process(target_pid)?;
                }

                // Bridge networking and egress policy: fatal in production, warnings otherwise.
                if let NetworkMode::Bridge(ref bridge_config) = config.network {
                    match BridgeNetwork::setup_with_id(target_pid, bridge_config, &config.id) {
                        Ok(net) => {
                            if let Some(ref egress) = config.egress_policy {
                                if let Err(e) = net.apply_egress_policy(target_pid, egress) {
                                    if config.service_mode == ServiceMode::Production {
                                        return Err(NucleusError::NetworkError(format!(
                                            "Failed to apply egress policy: {}",
                                            e
                                        )));
                                    }
                                    warn!("Failed to apply egress policy: {}", e);
                                }
                            }
                            bridge_net = Some(net);
                        }
                        Err(e) => {
                            if config.service_mode == ServiceMode::Production {
                                return Err(e);
                            }
                            warn!("Failed to set up bridge networking: {}", e);
                        }
                    }
                }

                info!(
                    "Container {} created (child pid {}), waiting for start",
                    config.id, target_pid
                );

                Ok(CreatedContainer {
                    config,
                    state_mgr,
                    state,
                    child,
                    cgroup_opt,
                    bridge_net,
                    trace_reader,
                    exec_fifo_path: exec_fifo,
                    _lifecycle_span: lifecycle_span.clone(),
                })
            }
            ForkResult::Child => {
                // The child only writes to the pipe; it never returns from this branch.
                drop(ready_read);
                let temp_container = Container { config, runsc_path };
                match temp_container.setup_and_exec(Some(ready_write), Some(exec_fifo)) {
                    Ok(_) => unreachable!(),
                    Err(e) => {
                        error!("Container setup failed: {}", e);
                        std::process::exit(1);
                    }
                }
            }
        }
    }
343
344 pub fn trigger_start(container_id: &str) -> Result<()> {
347 let state_mgr = ContainerStateManager::new()?;
348 let fifo_path = state_mgr.exec_fifo_path(container_id)?;
349 if !fifo_path.exists() {
350 return Err(NucleusError::ConfigError(format!(
351 "No exec FIFO found for container {}; is it in 'created' state?",
352 container_id
353 )));
354 }
355
356 let file = std::fs::File::open(&fifo_path)
358 .map_err(|e| NucleusError::ExecError(format!("Failed to open exec FIFO: {}", e)))?;
359 let mut buf = [0u8; 1];
360 std::io::Read::read(&mut &file, &mut buf)
361 .map_err(|e| NucleusError::ExecError(format!("Failed to read exec FIFO: {}", e)))?;
362 drop(file);
363
364 let _ = std::fs::remove_file(&fifo_path);
365
366 let mut state = state_mgr.resolve_container(container_id)?;
368 state.status = OciStatus::Running;
369 state_mgr.save_state(&state)?;
370
371 Ok(())
372 }
373
    /// Child-side setup: isolates the process, builds the root filesystem, applies the
    /// security layers, waits for the start signal and then runs the workload.
    ///
    /// * `ready_pipe` — when present, used to report the PID of the process that will
    ///   run the workload back to the parent.
    /// * `exec_fifo` — when present, the FIFO this process writes one byte to before
    ///   it proceeds to the start hooks and the workload.
    ///
    /// The caller in `create` treats an `Ok` return as unreachable.
    ///
    /// # Errors
    ///
    /// Propagates the first failure of any setup step. Several steps are only fatal
    /// in `ServiceMode::Production` or when degraded security is not allowed.
    fn setup_and_exec(
        &self,
        ready_pipe: Option<OwnedFd>,
        exec_fifo: Option<PathBuf>,
    ) -> Result<()> {
        let is_rootless = self.config.user_ns_config.is_some();
        let allow_degraded_security = Self::allow_degraded_security(&self.config);
        // Snapshot the context directory up front so the populated copy can be
        // verified against it later.
        let context_manifest = if self.config.verify_context_integrity {
            self.config
                .context_dir
                .as_ref()
                .map(|dir| snapshot_context_dir(dir))
                .transpose()?
        } else {
            None
        };

        // State trackers advanced through `transition` as each stage completes.
        let mut fs_state = FilesystemState::Unmounted;
        let mut sec_state = SecurityState::Privileged;

        // gVisor path: report this process's PID and hand over to the gVisor setup.
        // NOTE(review): `exec_fifo` is not used on this path in the visible code —
        // confirm the start handshake is handled inside `setup_and_exec_gvisor`.
        if self.config.use_gvisor {
            if let Some(fd) = ready_pipe {
                Self::notify_namespace_ready(&fd, std::process::id())?;
            }
            return self.setup_and_exec_gvisor();
        }

        let mut namespace_mgr = NamespaceManager::new(self.config.namespaces.clone());
        if let Some(user_config) = &self.config.user_ns_config {
            namespace_mgr = namespace_mgr.with_user_mapping(user_config.clone());
        }
        namespace_mgr.unshare_namespaces()?;

        if self.config.namespaces.pid {
            // Fork again after unsharing. The intermediate process reports the new
            // child's PID, waits for it and exits with its exit code; only the new
            // child continues with the setup below.
            match unsafe { fork() }? {
                ForkResult::Parent { child } => {
                    if let Some(fd) = ready_pipe {
                        Self::notify_namespace_ready(&fd, child.as_raw() as u32)?;
                    }
                    std::process::exit(Self::wait_for_pid_namespace_child(child));
                }
                ForkResult::Child => {
                    // Falls through to the rest of the setup.
                }
            }
        } else if let Some(fd) = ready_pipe {
            Self::notify_namespace_ready(&fd, std::process::id())?;
        }

        namespace_mgr.enter()?;

        // Applied before any mount work, as the audit message records.
        self.enforce_no_new_privs()?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::NoNewPrivsSet,
            "prctl(PR_SET_NO_NEW_PRIVS, 1) applied (early, before mounts)",
        );

        if let Some(hostname) = &self.config.hostname {
            namespace_mgr.set_hostname(hostname)?;
        }

        // Fresh temporary directory under /tmp that becomes the container root.
        let runtime_dir = Builder::new()
            .prefix("nucleus-runtime-")
            .tempdir_in("/tmp")
            .map_err(|e| {
                NucleusError::FilesystemError(format!("Failed to create runtime dir: {}", e))
            })?;
        let container_root = runtime_dir.path().to_path_buf();
        // tmpfs on the container root; the second argument is presumably a 1 GiB size cap — TODO confirm.
        let mut tmpfs = TmpfsMount::new(&container_root, Some(1024 * 1024 * 1024)); tmpfs.mount()?;
        fs_state = fs_state.transition(FilesystemState::Mounted)?;

        create_minimal_fs(&container_root)?;

        let dev_path = container_root.join("dev");
        create_dev_nodes(&dev_path, false)?;

        // Populate /context and, if a manifest was captured, verify the copy against it.
        if let Some(context_dir) = &self.config.context_dir {
            let context_dest = container_root.join("context");
            LazyContextPopulator::populate(&self.config.context_mode, context_dir, &context_dest)?;
            if let Some(expected) = &context_manifest {
                verify_context_manifest(expected, &context_dest)?;
            }
        }
        fs_state = fs_state.transition(FilesystemState::Populated)?;

        // Use the configured rootfs (optionally attested) or fall back to host paths.
        if let Some(ref rootfs_path) = self.config.rootfs_path {
            if self.config.verify_rootfs_attestation {
                verify_rootfs_attestation(rootfs_path)?;
            }
            bind_mount_rootfs(&container_root, rootfs_path)?;
        } else {
            bind_mount_host_paths(&container_root, is_rootless)?;
        }

        // DNS configuration for bridge networking: bind-mounted when a rootfs is in
        // use, written directly otherwise.
        if let NetworkMode::Bridge(ref bridge_config) = self.config.network {
            if self.config.rootfs_path.is_some() {
                BridgeNetwork::bind_mount_resolv_conf(&container_root, &bridge_config.dns)?;
            } else {
                BridgeNetwork::write_resolv_conf(&container_root, &bridge_config.dns)?;
            }
        }

        // Production mode uses the in-memory secrets mount.
        if self.config.service_mode == ServiceMode::Production {
            mount_secrets_inmemory(&container_root, &self.config.secrets)?;
        } else {
            mount_secrets(&container_root, &self.config.secrets)?;
        }

        let proc_path = container_root.join("proc");
        // PID hiding is requested only in production mode.
        let hide_pids = self.config.service_mode == ServiceMode::Production;
        mount_procfs(
            &proc_path,
            is_rootless,
            self.config.proc_readonly,
            hide_pids,
        )?;

        mask_proc_paths(
            &proc_path,
            self.config.service_mode == ServiceMode::Production,
        )?;

        // createRuntime hooks run before the root switch, with status `Creating`.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_runtime.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Creating,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_runtime, &hook_state, "createRuntime")?;
            }
        }

        switch_root(&container_root, self.config.allow_chroot_fallback)?;
        fs_state = fs_state.transition(FilesystemState::Pivoted)?;
        debug!("Filesystem state: {:?}", fs_state);

        audit_mounts(self.config.service_mode == ServiceMode::Production)?;
        audit(
            &self.config.id,
            &self.config.name,
            AuditEventType::MountAuditPassed,
            "all mount flags verified",
        );

        // createContainer hooks run after the root switch, with status `Created`.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.create_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Created,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.create_container, &hook_state, "createContainer")?;
            }
        }

        // Capabilities: apply the configured policy file, or drop everything.
        let mut cap_mgr = CapabilityManager::new();
        if let Some(ref policy_path) = self.config.caps_policy {
            let policy: crate::security::CapsPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.caps_policy_sha256.as_deref(),
            )?;
            policy.apply(&mut cap_mgr)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                format!("capability policy applied from {:?}", policy_path),
            );
        } else {
            cap_mgr.drop_all()?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::CapabilitiesDropped,
                "all capabilities dropped including bounding set",
            );
        }
        sec_state = sec_state.transition(SecurityState::CapabilitiesDropped)?;

        // Resource limits set before seccomp. A failure is fatal in production mode
        // and a warning otherwise.
        // NOTE(review): rlimit failures are reported as `SeccompError` — confirm intended.
        {
            let is_production = self.config.service_mode == ServiceMode::Production;

            // Process-count limit: the configured `pids_max`, defaulting to 512.
            let nproc_limit = self.config.limits.pids_max.unwrap_or(512);
            let rlim_nproc = libc::rlimit {
                rlim_cur: nproc_limit,
                rlim_max: nproc_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NPROC to {} in production mode: {}",
                        nproc_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_NPROC to {}: {}", nproc_limit, err);
            }

            // Open-file limit fixed at 1024.
            let rlim_nofile = libc::rlimit {
                rlim_cur: 1024,
                rlim_max: 1024,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_NOFILE to 1024 in production mode: {}",
                        err
                    )));
                }
                warn!("Failed to set RLIMIT_NOFILE to 1024: {}", err);
            }

            // Locked-memory limit of 64 * 1024.
            let memlock_limit: u64 = 64 * 1024; let rlim_memlock = libc::rlimit {
                rlim_cur: memlock_limit,
                rlim_max: memlock_limit,
            };
            if unsafe { libc::setrlimit(libc::RLIMIT_MEMLOCK, &rlim_memlock) } != 0 {
                let err = std::io::Error::last_os_error();
                if is_production {
                    return Err(NucleusError::SeccompError(format!(
                        "Failed to set RLIMIT_MEMLOCK to {} in production mode: {}",
                        memlock_limit, err
                    )));
                }
                warn!("Failed to set RLIMIT_MEMLOCK to {}: {}", memlock_limit, err);
            }
        }

        use crate::container::config::SeccompMode;
        // Seccomp: trace filter, a profile file, or the built-in filter chosen by
        // whether any networking is configured.
        let mut seccomp_mgr = SeccompManager::new();
        let allow_network = !matches!(self.config.network, NetworkMode::None);
        let seccomp_applied = match self.config.seccomp_mode {
            SeccompMode::Trace => {
                audit(
                    &self.config.id,
                    &self.config.name,
                    AuditEventType::SeccompApplied,
                    "seccomp trace mode: allow-all + LOG",
                );
                seccomp_mgr.apply_trace_filter()?
            }
            SeccompMode::Enforce => {
                if let Some(ref profile_path) = self.config.seccomp_profile {
                    audit(
                        &self.config.id,
                        &self.config.name,
                        AuditEventType::SeccompProfileLoaded,
                        format!("path={:?}", profile_path),
                    );
                    seccomp_mgr.apply_profile_from_file(
                        profile_path,
                        self.config.seccomp_profile_sha256.as_deref(),
                        self.config.seccomp_log_denied,
                    )?
                } else {
                    seccomp_mgr.apply_filter_for_network_mode(
                        allow_network,
                        allow_degraded_security,
                        self.config.seccomp_log_denied,
                    )?
                }
            }
        };
        // An unapplied filter is tolerated only when degraded security is allowed.
        if seccomp_applied {
            sec_state = sec_state.transition(SecurityState::SeccompApplied)?;
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::SeccompApplied,
                format!("network={}", allow_network),
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::SeccompError(
                "Seccomp filter is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Seccomp not enforced; container is running with degraded hardening");
        }

        // Landlock: a policy file if configured, otherwise the default container policy.
        let landlock_applied = if let Some(ref policy_path) = self.config.landlock_policy {
            let policy: crate::security::LandlockPolicy = crate::security::load_toml_policy(
                policy_path,
                self.config.landlock_policy_sha256.as_deref(),
            )?;
            policy.apply(allow_degraded_security)?
        } else {
            let mut landlock_mgr = LandlockManager::new();
            landlock_mgr.assert_minimum_abi(self.config.service_mode == ServiceMode::Production)?;
            landlock_mgr.apply_container_policy_with_mode(allow_degraded_security)?
        };
        // The state is locked only when both seccomp and Landlock are active and
        // seccomp is not in trace mode.
        if seccomp_applied && landlock_applied {
            sec_state = sec_state.transition(SecurityState::LandlockApplied)?;
            if self.config.seccomp_mode == SeccompMode::Trace {
                warn!("Security state NOT locked: seccomp in trace mode (allow-all)");
            } else {
                sec_state = sec_state.transition(SecurityState::Locked)?;
            }
            audit(
                &self.config.id,
                &self.config.name,
                AuditEventType::LandlockApplied,
                if self.config.seccomp_mode == SeccompMode::Trace {
                    "landlock applied, but seccomp in trace mode — not locked".to_string()
                } else {
                    "security state locked: all hardening layers active".to_string()
                },
            );
        } else if !allow_degraded_security {
            return Err(NucleusError::LandlockError(
                "Landlock policy is required but was not enforced".to_string(),
            ));
        } else {
            warn!("Security state not locked; one or more hardening controls are inactive");
        }
        debug!("Security state: {:?}", sec_state);

        // Start handshake: open the exec FIFO for writing and send a single byte
        // before continuing.
        if let Some(ref fifo_path) = exec_fifo {
            debug!("Waiting on exec FIFO {:?} for start signal", fifo_path);
            let file = std::fs::OpenOptions::new()
                .write(true)
                .open(fifo_path)
                .map_err(|e| {
                    NucleusError::ExecError(format!("Failed to open exec FIFO for writing: {}", e))
                })?;
            std::io::Write::write_all(&mut &file, &[0u8]).map_err(|e| {
                NucleusError::ExecError(format!("Failed to write exec FIFO sync byte: {}", e))
            })?;
            drop(file);
            debug!("Exec FIFO released, proceeding to exec");
        }

        // startContainer hooks run after the handshake, with status `Running`.
        if let Some(ref hooks) = self.config.hooks {
            if !hooks.start_container.is_empty() {
                let hook_state = OciContainerState {
                    oci_version: "1.0.2".to_string(),
                    id: self.config.id.clone(),
                    status: OciStatus::Running,
                    pid: std::process::id(),
                    bundle: String::new(),
                };
                OciHooks::run_hooks(&hooks.start_container, &hook_state, "startContainer")?;
            }
        }

        // Production mode with a PID namespace goes through `run_as_init`.
        if self.config.service_mode == ServiceMode::Production && self.config.namespaces.pid {
            return self.run_as_init();
        }

        self.exec_command()?;

        Ok(())
    }
795
796 pub(super) fn setup_signal_forwarding_static(child: Pid) -> Result<()> {
798 let mut set = SigSet::empty();
799 for signal in [
800 Signal::SIGTERM,
801 Signal::SIGINT,
802 Signal::SIGHUP,
803 Signal::SIGQUIT,
804 Signal::SIGUSR1,
805 Signal::SIGUSR2,
806 ] {
807 set.add(signal);
808 }
809
810 pthread_sigmask(SigmaskHow::SIG_BLOCK, Some(&set), None).map_err(|e| {
811 NucleusError::ExecError(format!("Failed to block forwarded signals: {}", e))
812 })?;
813
814 std::thread::spawn(move || {
815 while let Ok(signal) = set.wait() {
816 let _ = kill(child, signal);
817 }
818 });
819
820 info!("Signal forwarding configured");
821 Ok(())
822 }
823
824 pub(super) fn wait_for_child_static(child: Pid) -> Result<i32> {
826 loop {
827 match waitpid(child, None) {
828 Ok(WaitStatus::Exited(_, code)) => {
829 return Ok(code);
830 }
831 Ok(WaitStatus::Signaled(_, signal, _)) => {
832 info!("Child killed by signal: {:?}", signal);
833 return Ok(128 + signal as i32);
834 }
835 Err(nix::errno::Errno::EINTR) => {
836 continue;
837 }
838 Err(e) => {
839 return Err(NucleusError::ExecError(format!(
840 "Failed to wait for child: {}",
841 e
842 )));
843 }
844 _ => {
845 continue;
846 }
847 }
848 }
849 }
850
851 fn wait_for_namespace_ready(ready_read: &OwnedFd, child: Pid) -> Result<u32> {
852 let mut pid_buf = [0u8; 4];
853 loop {
854 match read(ready_read.as_raw_fd(), &mut pid_buf) {
855 Err(nix::errno::Errno::EINTR) => continue,
856 Ok(4) => return Ok(u32::from_ne_bytes(pid_buf)),
857 Ok(0) => {
858 return Err(NucleusError::ExecError(format!(
859 "Child {} exited before namespace initialization",
860 child
861 )))
862 }
863 Ok(_) => {
864 return Err(NucleusError::ExecError(
865 "Invalid namespace sync payload from child".to_string(),
866 ))
867 }
868 Err(e) => {
869 return Err(NucleusError::ExecError(format!(
870 "Failed waiting for child namespace setup: {}",
871 e
872 )))
873 }
874 }
875 }
876 }
877
878 fn notify_namespace_ready(fd: &OwnedFd, pid: u32) -> Result<()> {
879 let payload = pid.to_ne_bytes();
880 let mut written = 0;
881 while written < payload.len() {
882 let n = write(fd, &payload[written..]).map_err(|e| {
883 NucleusError::ExecError(format!("Failed to notify namespace readiness: {}", e))
884 })?;
885 if n == 0 {
886 return Err(NucleusError::ExecError(
887 "Failed to notify namespace readiness: short write".to_string(),
888 ));
889 }
890 written += n;
891 }
892 Ok(())
893 }
894
895 fn wait_for_pid_namespace_child(child: Pid) -> i32 {
896 loop {
897 match waitpid(child, None) {
898 Ok(WaitStatus::Exited(_, code)) => return code,
899 Ok(WaitStatus::Signaled(_, signal, _)) => return 128 + signal as i32,
900 Err(nix::errno::Errno::EINTR) => continue,
901 Err(_) => return 1,
902 _ => continue,
903 }
904 }
905 }
906}
907
908impl CreatedContainer {
909 pub fn start(mut self) -> Result<i32> {
912 let config = &self.config;
913 let _enter = self._lifecycle_span.enter();
914
915 {
918 let file = std::fs::File::open(&self.exec_fifo_path).map_err(|e| {
919 NucleusError::ExecError(format!("Failed to open exec FIFO for reading: {}", e))
920 })?;
921 let mut buf = [0u8; 1];
922 std::io::Read::read(&mut &file, &mut buf).map_err(|e| {
923 NucleusError::ExecError(format!("Failed to read exec FIFO sync byte: {}", e))
924 })?;
925 }
926 let _ = std::fs::remove_file(&self.exec_fifo_path);
927
928 self.state.status = OciStatus::Running;
930 self.state_mgr.save_state(&self.state)?;
931
932 let target_pid = self.state.pid;
933 let child = self.child;
934
935 Container::setup_signal_forwarding_static(Pid::from_raw(target_pid as i32))?;
936
937 if let Some(ref probe) = config.readiness_probe {
939 let notify_socket = if config.sd_notify {
940 std::env::var("NOTIFY_SOCKET").ok()
941 } else {
942 None
943 };
944 Container::run_readiness_probe(
945 target_pid,
946 &config.name,
947 probe,
948 config.user_ns_config.is_some(),
949 config.use_gvisor,
950 notify_socket.as_deref(),
951 )?;
952 }
953
954 let cancel_flag = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
956 let health_handle = if let Some(ref hc) = config.health_check {
957 if !hc.command.is_empty() {
958 let hc = hc.clone();
959 let pid = target_pid;
960 let container_name = config.name.clone();
961 let rootless = config.user_ns_config.is_some();
962 let using_gvisor = config.use_gvisor;
963 let cancel = cancel_flag.clone();
964 Some(std::thread::spawn(move || {
965 Container::health_check_loop(
966 pid,
967 &container_name,
968 rootless,
969 using_gvisor,
970 &hc,
971 &cancel,
972 );
973 }))
974 } else {
975 None
976 }
977 } else {
978 None
979 };
980
981 if let Some(ref hooks) = config.hooks {
983 if !hooks.poststart.is_empty() {
984 let hook_state = OciContainerState {
985 oci_version: "1.0.2".to_string(),
986 id: config.id.clone(),
987 status: OciStatus::Running,
988 pid: target_pid,
989 bundle: String::new(),
990 };
991 OciHooks::run_hooks(&hooks.poststart, &hook_state, "poststart")?;
992 }
993 }
994
995 let mut child_waited = false;
996 let run_result: Result<i32> = (|| {
997 let exit_code = Container::wait_for_child_static(child)?;
998
999 self.state.status = OciStatus::Stopped;
1001 let _ = self.state_mgr.save_state(&self.state);
1002
1003 child_waited = true;
1004 Ok(exit_code)
1005 })();
1006
1007 cancel_flag.store(true, std::sync::atomic::Ordering::Relaxed);
1011 if let Some(handle) = health_handle {
1012 let _ = handle.join();
1013 }
1014
1015 if let Some(ref hooks) = config.hooks {
1017 if !hooks.poststop.is_empty() {
1018 let hook_state = OciContainerState {
1019 oci_version: "1.0.2".to_string(),
1020 id: config.id.clone(),
1021 status: OciStatus::Stopped,
1022 pid: 0,
1023 bundle: String::new(),
1024 };
1025 OciHooks::run_hooks_best_effort(&hooks.poststop, &hook_state, "poststop");
1026 }
1027 }
1028
1029 if let Some(net) = self.bridge_net.take() {
1030 if let Err(e) = net.cleanup() {
1031 warn!("Failed to cleanup bridge networking: {}", e);
1032 }
1033 }
1034
1035 if !child_waited {
1036 let _ = kill(child, Signal::SIGKILL);
1037 let _ = waitpid(child, None);
1038 }
1039
1040 if let Some(reader) = self.trace_reader.take() {
1041 reader.stop_and_flush();
1042 }
1043
1044 if let Some(cgroup) = self.cgroup_opt.take() {
1045 if let Err(e) = cgroup.cleanup() {
1046 warn!("Failed to cleanup cgroup: {}", e);
1047 }
1048 }
1049
1050 if config.use_gvisor {
1051 if let Err(e) = Container::cleanup_gvisor_artifacts(&config.id) {
1052 warn!(
1053 "Failed to cleanup gVisor artifacts for {}: {}",
1054 config.id, e
1055 );
1056 }
1057 }
1058
1059 if let Err(e) = self.state_mgr.delete_state(&config.id) {
1060 warn!("Failed to delete state for {}: {}", config.id, e);
1061 }
1062
1063 match run_result {
1064 Ok(exit_code) => {
1065 audit(
1066 &config.id,
1067 &config.name,
1068 AuditEventType::ContainerStop,
1069 format!("exit_code={}", exit_code),
1070 );
1071 info!(
1072 "Container {} ({}) exited with code {}",
1073 config.name, config.id, exit_code
1074 );
1075 Ok(exit_code)
1076 }
1077 Err(e) => {
1078 audit_error(
1079 &config.id,
1080 &config.name,
1081 AuditEventType::ContainerStop,
1082 format!("error={}", e),
1083 );
1084 Err(e)
1085 }
1086 }
1087 }
1088}
1089
1090#[cfg(test)]
1091mod tests {
1092 use super::*;
1093 use crate::container::KernelLockdownMode;
1094 use crate::network::NetworkMode;
1095
1096 #[test]
1097 fn test_container_config() {
1098 let config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1099 assert!(!config.id.is_empty());
1100 assert_eq!(config.command, vec!["/bin/sh"]);
1101 assert!(config.use_gvisor);
1102 }
1103
1104 #[test]
1105 fn test_container_config_with_name() {
1106 let config =
1107 ContainerConfig::try_new(Some("mycontainer".to_string()), vec!["/bin/sh".to_string()])
1108 .unwrap();
1109 assert_eq!(config.name, "mycontainer");
1110 assert!(!config.id.is_empty());
1111 assert_ne!(config.id, config.name);
1112 }
1113
1114 #[test]
1115 fn test_allow_degraded_security_requires_explicit_config() {
1116 let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1117 assert!(!Container::allow_degraded_security(&strict));
1118
1119 let relaxed = strict.clone().with_allow_degraded_security(true);
1120 assert!(Container::allow_degraded_security(&relaxed));
1121 }
1122
1123 #[test]
1124 fn test_env_var_cannot_force_degraded_security_without_explicit_opt_in() {
1125 let prev = std::env::var_os("NUCLEUS_ALLOW_DEGRADED_SECURITY");
1126 std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", "1");
1127
1128 let strict = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()]).unwrap();
1129 assert!(!Container::allow_degraded_security(&strict));
1130
1131 let explicit = strict.with_allow_degraded_security(true);
1132 assert!(Container::allow_degraded_security(&explicit));
1133
1134 match prev {
1135 Some(v) => std::env::set_var("NUCLEUS_ALLOW_DEGRADED_SECURITY", v),
1136 None => std::env::remove_var("NUCLEUS_ALLOW_DEGRADED_SECURITY"),
1137 }
1138 }
1139
1140 #[test]
1141 fn test_host_network_requires_explicit_opt_in() {
1142 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1143 .unwrap()
1144 .with_network(NetworkMode::Host)
1145 .with_allow_host_network(false);
1146 let err = Container::apply_network_mode_guards(&mut config, true).unwrap_err();
1147 assert!(matches!(err, NucleusError::NetworkError(_)));
1148 }
1149
1150 #[test]
1151 fn test_host_network_opt_in_disables_net_namespace() {
1152 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1153 .unwrap()
1154 .with_network(NetworkMode::Host)
1155 .with_allow_host_network(true);
1156 assert!(config.namespaces.net);
1157 Container::apply_network_mode_guards(&mut config, true).unwrap();
1158 assert!(!config.namespaces.net);
1159 }
1160
1161 #[test]
1162 fn test_non_host_network_does_not_require_host_opt_in() {
1163 let mut config = ContainerConfig::try_new(None, vec!["/bin/sh".to_string()])
1164 .unwrap()
1165 .with_network(NetworkMode::None)
1166 .with_allow_host_network(false);
1167 assert!(config.namespaces.net);
1168 Container::apply_network_mode_guards(&mut config, true).unwrap();
1169 assert!(config.namespaces.net);
1170 }
1171
/// Table-driven check of `parse_active_lockdown_mode`: the bracketed entry
/// determines the result, and a bracketed `none` yields `None`.
#[test]
fn test_parse_kernel_lockdown_mode() {
    let cases = [
        ("none [integrity] confidentiality", Some(KernelLockdownMode::Integrity)),
        ("none integrity [confidentiality]", Some(KernelLockdownMode::Confidentiality)),
        ("[none] integrity", None),
    ];

    for (status_line, expected) in cases {
        assert_eq!(Container::parse_active_lockdown_mode(status_line), expected);
    }
}
1187
/// Staging a secret must yield exactly one entry whose source path lives
/// under the stage directory and whose contents match the original file.
#[test]
fn test_stage_gvisor_secret_files_rewrites_sources_under_stage_dir() {
    let workspace = tempfile::TempDir::new().unwrap();
    let stage_dir = workspace.path().join("stage");

    // Create the secret file that will be staged.
    let secret_path = workspace.path().join("source-secret");
    std::fs::write(&secret_path, "supersecret").unwrap();

    let mounts = [crate::container::SecretMount {
        source: secret_path.clone(),
        dest: std::path::PathBuf::from("/etc/app/secret.txt"),
        mode: 0o400,
    }];
    let staged = Container::stage_gvisor_secret_files(&stage_dir, &mounts).unwrap();

    assert_eq!(staged.len(), 1);

    let staged_source = &staged[0].source;
    assert!(staged_source.starts_with(&stage_dir));

    let staged_contents = std::fs::read_to_string(staged_source).unwrap();
    assert_eq!(staged_contents, "supersecret");
}
1211
/// Populates the artifact directory for a fixed id, runs the cleanup, and
/// checks that the directory no longer exists afterwards.
#[test]
fn test_cleanup_gvisor_artifacts_removes_artifact_dir() {
    let id = "cleanup-test";

    // Arrange: a directory holding a single file.
    let dir = Container::gvisor_artifact_dir(id);
    std::fs::create_dir_all(&dir).unwrap();
    let config_file = dir.join("config.json");
    std::fs::write(config_file, "{}").unwrap();

    // Act.
    Container::cleanup_gvisor_artifacts(id).unwrap();

    // Assert.
    let still_present = dir.exists();
    assert!(!still_present);
}
1221
/// Source-level check: the text following the `health_check_loop` definition
/// in `health.rs` must mention an `AtomicBool` cancellation flag and must
/// consult it (directly or via `cancellable_sleep`).
#[test]
fn test_health_check_loop_supports_cancellation() {
    let source = include_str!("health.rs");
    let fn_start = source
        .find("fn health_check_loop")
        .expect("health_check_loop definition not found in health.rs");

    // Inspect at most 2500 bytes after the definition. The window is clamped
    // to the end of the file and backed off to a UTF-8 character boundary so
    // the slice cannot panic on a short file or a multi-byte character.
    let tail = &source[fn_start..];
    let mut window_len = tail.len().min(2500);
    while !tail.is_char_boundary(window_len) {
        window_len -= 1;
    }
    let fn_body = &tail[..window_len];

    assert!(
        fn_body.contains("AtomicBool") && fn_body.contains("cancel"),
        "health_check_loop must accept an AtomicBool cancellation flag"
    );
    assert!(
        fn_body.contains("cancellable_sleep") || fn_body.contains("cancel.load"),
        "health_check_loop must check cancellation during sleep intervals"
    );
}
1240
/// Source-level check: neither the readiness probe nor the health-check loop
/// in `health.rs` may contain `Command::new(&nsenter_bin)` near its
/// definition.
#[test]
fn test_runtime_probes_do_not_spawn_host_nsenter() {
    /// Returns up to `max_len` bytes of `text` starting at `start`, clamped to
    /// the end of the text and backed off to a UTF-8 character boundary so the
    /// slice cannot panic on a short file or a multi-byte character.
    fn bounded_window(text: &str, start: usize, max_len: usize) -> &str {
        let tail = &text[start..];
        let mut len = tail.len().min(max_len);
        while !tail.is_char_boundary(len) {
            len -= 1;
        }
        &tail[..len]
    }

    let source = include_str!("health.rs");

    let readiness_start = source
        .find("fn run_readiness_probe")
        .expect("run_readiness_probe definition not found in health.rs");
    let readiness_body = bounded_window(source, readiness_start, 2500);
    assert!(
        !readiness_body.contains("Command::new(&nsenter_bin)"),
        "readiness probes must not execute via host nsenter"
    );

    let health_start = source
        .find("fn health_check_loop")
        .expect("health_check_loop definition not found in health.rs");
    let health_body = bounded_window(source, health_start, 2200);
    assert!(
        !health_body.contains("Command::new(&nsenter_bin)"),
        "health checks must not execute via host nsenter"
    );
}
1260
/// Source-level check: the text following the `prepare_oci_mountpoints`
/// definition in `gvisor_setup.rs` must not contain an `.expect(` call.
#[test]
fn test_oci_mount_strip_prefix_no_expect() {
    let source = include_str!("gvisor_setup.rs");
    let fn_start = source
        .find("fn prepare_oci_mountpoints")
        .expect("prepare_oci_mountpoints definition not found in gvisor_setup.rs");

    // Inspect at most 600 bytes after the definition. The window is clamped
    // to the end of the file and backed off to a UTF-8 character boundary so
    // the slice cannot panic on a short file or a multi-byte character.
    let tail = &source[fn_start..];
    let mut window_len = tail.len().min(600);
    while !tail.is_char_boundary(window_len) {
        window_len -= 1;
    }
    let fn_body = &tail[..window_len];

    assert!(
        !fn_body.contains(".expect("),
        "prepare_oci_mountpoints must not use expect() — return Err instead"
    );
}
1273
/// Source-level check: the text following the namespace-ready notification
/// function in `runtime.rs` must show some sign of validating the write
/// length.
#[test]
fn test_notify_namespace_ready_validates_write_length() {
    let source = include_str!("runtime.rs");
    let fn_start = source
        .find("fn notify_namespace_ready")
        .expect("namespace-ready notification function not found in runtime.rs");

    // Inspect at most 500 bytes after the definition. The window is clamped
    // to the end of the file and backed off to a UTF-8 character boundary so
    // the slice cannot panic on a short file or a multi-byte character.
    let tail = &source[fn_start..];
    let mut window_len = tail.len().min(500);
    while !tail.is_char_boundary(window_len) {
        window_len -= 1;
    }
    let fn_body = &tail[..window_len];

    // NOTE(review): the `"4"` alternative matches any occurrence of that digit
    // inside the window, which makes this assertion very permissive — consider
    // tightening it once the expected implementation shape is settled.
    assert!(
        fn_body.contains("written")
            || fn_body.contains("4")
            || fn_body.contains("payload.len()"),
        "notify_namespace_ready must validate complete write of all 4 bytes"
    );
}
1288
/// Source-level check: the section of `runtime.rs` that follows the RLIMIT
/// backstop marker must reference `is_production` and contain a `return Err`.
#[test]
fn test_rlimit_failures_fatal_in_production() {
    let source = include_str!("runtime.rs");
    let rlimit_start = source
        .find("12b. RLIMIT backstop")
        .expect("RLIMIT section marker not found in runtime.rs");

    // Inspect at most 2000 bytes after the marker. The window is clamped to
    // the end of the file and backed off to a UTF-8 character boundary so the
    // slice cannot panic on a short file or a multi-byte character.
    let tail = &source[rlimit_start..];
    let mut window_len = tail.len().min(2000);
    while !tail.is_char_boundary(window_len) {
        window_len -= 1;
    }
    let rlimit_section = &tail[..window_len];

    assert!(
        rlimit_section.contains("is_production") && rlimit_section.contains("return Err"),
        "RLIMIT failures must return Err in production mode"
    );
}
1300
/// Source-level check: the text following the first `TcpPort(port)` occurrence
/// in `health.rs` must not reference `/dev/tcp`.
#[test]
fn test_tcp_readiness_probe_uses_portable_check() {
    let source = include_str!("health.rs");
    let probe_fn = source
        .find("TcpPort(port)")
        .expect("TcpPort(port) probe arm not found in health.rs");

    // Inspect at most 500 bytes after the match. The window is clamped to the
    // end of the file and backed off to a UTF-8 character boundary so the
    // slice cannot panic on a short file or a multi-byte character.
    let tail = &source[probe_fn..];
    let mut window_len = tail.len().min(500);
    while !tail.is_char_boundary(window_len) {
        window_len -= 1;
    }
    let probe_body = &tail[..window_len];

    assert!(
        !probe_body.contains("/dev/tcp"),
        "TCP readiness probe must not use /dev/tcp (bash-specific, fails on dash/ash)"
    );
}
1313}