use crate::container::OciStatus;
use crate::error::{NucleusError, Result};
use crate::filesystem::normalize_container_destination;
use crate::isolation::{IdMapping, NamespaceConfig, UserNamespaceConfig};
use crate::resources::ResourceLimits;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeSet, HashMap};
use std::fs;
use std::fs::OpenOptions;
use std::io::Write;
use std::os::unix::fs::{OpenOptionsExt, PermissionsExt};
use std::path::{Path, PathBuf};
use tracing::{debug, info, warn};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciConfig {
#[serde(rename = "ociVersion")]
pub oci_version: String,
pub root: OciRoot,
pub process: OciProcess,
pub hostname: Option<String>,
pub mounts: Vec<OciMount>,
pub linux: Option<OciLinux>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hooks: Option<OciHooks>,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub annotations: HashMap<String, String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciRoot {
pub path: String,
pub readonly: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciProcess {
pub terminal: bool,
pub user: OciUser,
pub args: Vec<String>,
pub env: Vec<String>,
pub cwd: String,
#[serde(rename = "noNewPrivileges")]
pub no_new_privileges: bool,
pub capabilities: Option<OciCapabilities>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub rlimits: Vec<OciRlimit>,
#[serde(
rename = "consoleSize",
default,
skip_serializing_if = "Option::is_none"
)]
pub console_size: Option<OciConsoleSize>,
#[serde(
rename = "apparmorProfile",
default,
skip_serializing_if = "Option::is_none"
)]
pub apparmor_profile: Option<String>,
#[serde(
rename = "selinuxLabel",
default,
skip_serializing_if = "Option::is_none"
)]
pub selinux_label: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciUser {
pub uid: u32,
pub gid: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub additional_gids: Option<Vec<u32>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciCapabilities {
pub bounding: Vec<String>,
pub effective: Vec<String>,
pub inheritable: Vec<String>,
pub permitted: Vec<String>,
pub ambient: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciMount {
pub destination: String,
pub source: String,
#[serde(rename = "type")]
pub mount_type: String,
pub options: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciLinux {
#[serde(skip_serializing_if = "Option::is_none")]
pub namespaces: Option<Vec<OciNamespace>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub resources: Option<OciResources>,
#[serde(rename = "uidMappings", skip_serializing_if = "Vec::is_empty", default)]
pub uid_mappings: Vec<OciIdMapping>,
#[serde(rename = "gidMappings", skip_serializing_if = "Vec::is_empty", default)]
pub gid_mappings: Vec<OciIdMapping>,
#[serde(rename = "maskedPaths", skip_serializing_if = "Vec::is_empty", default)]
pub masked_paths: Vec<String>,
#[serde(
rename = "readonlyPaths",
skip_serializing_if = "Vec::is_empty",
default
)]
pub readonly_paths: Vec<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub devices: Vec<OciDevice>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub seccomp: Option<OciSeccomp>,
#[serde(
rename = "rootfsPropagation",
default,
skip_serializing_if = "Option::is_none"
)]
pub rootfs_propagation: Option<String>,
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub sysctl: HashMap<String, String>,
#[serde(
rename = "cgroupsPath",
default,
skip_serializing_if = "Option::is_none"
)]
pub cgroups_path: Option<String>,
#[serde(rename = "intelRdt", default, skip_serializing_if = "Option::is_none")]
pub intel_rdt: Option<OciIntelRdt>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciNamespace {
#[serde(rename = "type")]
pub namespace_type: String,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OciIdMapping {
#[serde(rename = "containerID")]
pub container_id: u32,
#[serde(rename = "hostID")]
pub host_id: u32,
pub size: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciResources {
#[serde(skip_serializing_if = "Option::is_none")]
pub memory: Option<OciMemory>,
#[serde(skip_serializing_if = "Option::is_none")]
pub cpu: Option<OciCpu>,
#[serde(skip_serializing_if = "Option::is_none")]
pub pids: Option<OciPids>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciMemory {
#[serde(skip_serializing_if = "Option::is_none")]
pub limit: Option<i64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciCpu {
#[serde(skip_serializing_if = "Option::is_none")]
pub quota: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub period: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciPids {
pub limit: i64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciRlimit {
#[serde(rename = "type")]
pub limit_type: String,
pub hard: u64,
pub soft: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciConsoleSize {
pub height: u32,
pub width: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciDevice {
#[serde(rename = "type")]
pub device_type: String,
pub path: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub major: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub minor: Option<i64>,
#[serde(rename = "fileMode", skip_serializing_if = "Option::is_none")]
pub file_mode: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub uid: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gid: Option<u32>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciSeccomp {
#[serde(rename = "defaultAction")]
pub default_action: String,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub architectures: Vec<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub syscalls: Vec<OciSeccompSyscall>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciSeccompSyscall {
pub names: Vec<String>,
pub action: String,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub args: Vec<OciSeccompArg>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciSeccompArg {
pub index: u32,
pub value: u64,
#[serde(rename = "valueTwo", default, skip_serializing_if = "is_zero")]
pub value_two: u64,
pub op: String,
}
fn is_zero(v: &u64) -> bool {
*v == 0
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciIntelRdt {
#[serde(rename = "closID", default, skip_serializing_if = "Option::is_none")]
pub clos_id: Option<String>,
#[serde(
rename = "l3CacheSchema",
default,
skip_serializing_if = "Option::is_none"
)]
pub l3_cache_schema: Option<String>,
#[serde(
rename = "memBwSchema",
default,
skip_serializing_if = "Option::is_none"
)]
pub mem_bw_schema: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciHook {
pub path: String,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub args: Vec<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub env: Vec<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub timeout: Option<u32>,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OciHooks {
#[serde(
rename = "createRuntime",
default,
skip_serializing_if = "Vec::is_empty"
)]
pub create_runtime: Vec<OciHook>,
#[serde(
rename = "createContainer",
default,
skip_serializing_if = "Vec::is_empty"
)]
pub create_container: Vec<OciHook>,
#[serde(
rename = "startContainer",
default,
skip_serializing_if = "Vec::is_empty"
)]
pub start_container: Vec<OciHook>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub poststart: Vec<OciHook>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub poststop: Vec<OciHook>,
}
#[derive(Debug, Clone, Serialize)]
pub struct OciContainerState {
#[serde(rename = "ociVersion")]
pub oci_version: String,
pub id: String,
pub status: OciStatus,
pub pid: u32,
pub bundle: String,
}
impl OciHooks {
pub fn is_empty(&self) -> bool {
self.create_runtime.is_empty()
&& self.create_container.is_empty()
&& self.start_container.is_empty()
&& self.poststart.is_empty()
&& self.poststop.is_empty()
}
pub fn run_hooks(hooks: &[OciHook], state: &OciContainerState, phase: &str) -> Result<()> {
let state_json = serde_json::to_string(state).map_err(|e| {
NucleusError::HookError(format!(
"Failed to serialize container state for hook: {}",
e
))
})?;
for (i, hook) in hooks.iter().enumerate() {
info!(
"Running {} hook [{}/{}]: {}",
phase,
i + 1,
hooks.len(),
hook.path
);
Self::execute_hook(hook, &state_json, phase)?;
}
Ok(())
}
pub fn run_hooks_best_effort(hooks: &[OciHook], state: &OciContainerState, phase: &str) {
let state_json = match serde_json::to_string(state) {
Ok(json) => json,
Err(e) => {
warn!(
"Failed to serialize container state for {} hooks: {}",
phase, e
);
return;
}
};
for (i, hook) in hooks.iter().enumerate() {
info!(
"Running {} hook [{}/{}]: {}",
phase,
i + 1,
hooks.len(),
hook.path
);
if let Err(e) = Self::execute_hook(hook, &state_json, phase) {
warn!("{} hook [{}] failed (continuing): {}", phase, i + 1, e);
}
}
}
fn execute_hook(hook: &OciHook, state_json: &str, phase: &str) -> Result<()> {
#[cfg(not(test))]
use std::os::unix::process::CommandExt;
use std::process::{Command, Stdio};
let hook_path = Path::new(&hook.path);
if !hook_path.is_absolute() {
return Err(NucleusError::HookError(format!(
"{} hook path must be absolute: {}",
phase, hook.path
)));
}
#[cfg(not(test))]
{
const TRUSTED_HOOK_PREFIXES: &[&str] = &[
"/usr/bin/",
"/usr/sbin/",
"/usr/lib/",
"/usr/libexec/",
"/usr/local/bin/",
"/usr/local/sbin/",
"/usr/local/libexec/",
"/bin/",
"/sbin/",
"/nix/store/",
"/opt/",
];
if !TRUSTED_HOOK_PREFIXES
.iter()
.any(|prefix| hook.path.starts_with(prefix))
{
return Err(NucleusError::HookError(format!(
"{} hook path '{}' is not under a trusted directory ({:?})",
phase, hook.path, TRUSTED_HOOK_PREFIXES
)));
}
}
match std::fs::symlink_metadata(hook_path) {
Ok(meta) if meta.file_type().is_symlink() => {
return Err(NucleusError::HookError(format!(
"{} hook path is a symlink (refusing to follow): {}",
phase, hook.path
)));
}
Err(_) => {
return Err(NucleusError::HookError(format!(
"{} hook binary not found: {}",
phase, hook.path
)));
}
Ok(_) => {}
}
Self::validate_hook_binary(hook_path, phase)?;
let mut cmd = Command::new(&hook.path);
if !hook.args.is_empty() {
cmd.args(&hook.args[1..]);
}
if !hook.env.is_empty() {
cmd.env_clear();
for entry in &hook.env {
if let Some((key, value)) = entry.split_once('=') {
cmd.env(key, value);
}
}
}
cmd.stdin(Stdio::piped());
cmd.stdout(Stdio::piped());
cmd.stderr(Stdio::piped());
#[cfg(not(test))]
unsafe {
cmd.pre_exec(|| {
if libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0 {
return Err(std::io::Error::last_os_error());
}
let rlim_nproc = libc::rlimit {
rlim_cur: 1024,
rlim_max: 1024,
};
if libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) != 0 {
return Err(std::io::Error::last_os_error());
}
let rlim_nofile = libc::rlimit {
rlim_cur: 1024,
rlim_max: 1024,
};
if libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) != 0 {
return Err(std::io::Error::last_os_error());
}
Ok(())
});
}
let mut child = cmd.spawn().map_err(|e| {
NucleusError::HookError(format!(
"Failed to spawn {} hook {}: {}",
phase, hook.path, e
))
})?;
if let Some(mut stdin) = child.stdin.take() {
use std::io::Write as IoWrite;
let _ = stdin.write_all(state_json.as_bytes());
}
let timeout_secs = hook.timeout.unwrap_or(30) as u64;
let start = std::time::Instant::now();
let timeout = std::time::Duration::from_secs(timeout_secs);
loop {
match child.try_wait() {
Ok(Some(status)) => {
if status.success() {
debug!("{} hook {} completed successfully", phase, hook.path);
return Ok(());
} else {
let stderr = child
.stderr
.take()
.map(|mut e| {
let mut buf = String::new();
use std::io::Read;
let _ = e.read_to_string(&mut buf);
buf
})
.unwrap_or_default();
return Err(NucleusError::HookError(format!(
"{} hook {} exited with status: {}{}",
phase,
hook.path,
status,
if stderr.is_empty() {
String::new()
} else {
format!(" (stderr: {})", stderr.trim())
}
)));
}
}
Ok(None) => {
if start.elapsed() >= timeout {
let _ = child.kill();
let _ = child.wait();
return Err(NucleusError::HookError(format!(
"{} hook {} timed out after {}s",
phase, hook.path, timeout_secs
)));
}
std::thread::sleep(std::time::Duration::from_millis(50));
}
Err(e) => {
return Err(NucleusError::HookError(format!(
"Failed to wait for {} hook {}: {}",
phase, hook.path, e
)));
}
}
}
}
fn validate_hook_binary(hook_path: &Path, phase: &str) -> Result<()> {
let metadata = std::fs::symlink_metadata(hook_path).map_err(|e| {
NucleusError::HookError(format!(
"Failed to stat {} hook {}: {}",
phase,
hook_path.display(),
e
))
})?;
use std::os::unix::fs::MetadataExt;
let mode = metadata.mode();
let uid = metadata.uid();
let gid = metadata.gid();
let effective_uid = nix::unistd::Uid::effective().as_raw();
if mode & 0o002 != 0 {
return Err(NucleusError::HookError(format!(
"{} hook {} is world-writable (mode {:04o}) – refusing to execute",
phase,
hook_path.display(),
mode & 0o7777
)));
}
if mode & 0o020 != 0 && uid != 0 {
return Err(NucleusError::HookError(format!(
"{} hook {} is group-writable and not owned by root (mode {:04o}, uid {}) – refusing to execute",
phase,
hook_path.display(),
mode & 0o7777,
uid
)));
}
if uid != 0 && uid != effective_uid {
return Err(NucleusError::HookError(format!(
"{} hook {} is owned by UID {} (expected 0 or {}) – refusing to execute",
phase,
hook_path.display(),
uid,
effective_uid
)));
}
if mode & 0o6000 != 0 {
return Err(NucleusError::HookError(format!(
"{} hook {} has setuid/setgid bits (mode {:04o}) – refusing to execute",
phase,
hook_path.display(),
mode & 0o7777
)));
}
debug!(
"{} hook {} validation passed (uid={}, gid={}, mode={:04o})",
phase,
hook_path.display(),
uid,
gid,
mode & 0o7777
);
Ok(())
}
}
impl OciConfig {
pub fn new(command: Vec<String>, hostname: Option<String>) -> Self {
Self {
oci_version: "1.0.2".to_string(),
root: OciRoot {
path: "rootfs".to_string(),
readonly: true,
},
process: OciProcess {
terminal: false,
user: OciUser {
uid: 0,
gid: 0,
additional_gids: None,
},
args: command,
env: vec![
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
],
cwd: "/".to_string(),
no_new_privileges: true,
capabilities: Some(OciCapabilities {
bounding: vec![],
effective: vec![],
inheritable: vec![],
permitted: vec![],
ambient: vec![],
}),
rlimits: vec![],
console_size: None,
apparmor_profile: None,
selinux_label: None,
},
hostname,
mounts: vec![
OciMount {
destination: "/proc".to_string(),
source: "proc".to_string(),
mount_type: "proc".to_string(),
options: vec![
"nosuid".to_string(),
"noexec".to_string(),
"nodev".to_string(),
],
},
OciMount {
destination: "/dev".to_string(),
source: "tmpfs".to_string(),
mount_type: "tmpfs".to_string(),
options: vec![
"nosuid".to_string(),
"noexec".to_string(),
"strictatime".to_string(),
"mode=755".to_string(),
"size=65536k".to_string(),
],
},
OciMount {
destination: "/dev/shm".to_string(),
source: "shm".to_string(),
mount_type: "tmpfs".to_string(),
options: vec![
"nosuid".to_string(),
"noexec".to_string(),
"nodev".to_string(),
"mode=1777".to_string(),
"size=65536k".to_string(),
],
},
OciMount {
destination: "/tmp".to_string(),
source: "tmpfs".to_string(),
mount_type: "tmpfs".to_string(),
options: vec![
"nosuid".to_string(),
"nodev".to_string(),
"noexec".to_string(),
"mode=1777".to_string(),
"size=65536k".to_string(),
],
},
OciMount {
destination: "/sys".to_string(),
source: "sysfs".to_string(),
mount_type: "sysfs".to_string(),
options: vec![
"nosuid".to_string(),
"noexec".to_string(),
"nodev".to_string(),
"ro".to_string(),
],
},
],
hooks: None,
annotations: HashMap::new(),
linux: Some(OciLinux {
namespaces: Some(vec![
OciNamespace {
namespace_type: "pid".to_string(),
},
OciNamespace {
namespace_type: "network".to_string(),
},
OciNamespace {
namespace_type: "ipc".to_string(),
},
OciNamespace {
namespace_type: "uts".to_string(),
},
OciNamespace {
namespace_type: "mount".to_string(),
},
]),
resources: None,
uid_mappings: vec![],
gid_mappings: vec![],
masked_paths: vec![
"/proc/acpi".to_string(),
"/proc/asound".to_string(),
"/proc/kcore".to_string(),
"/proc/keys".to_string(),
"/proc/latency_stats".to_string(),
"/proc/sched_debug".to_string(),
"/proc/scsi".to_string(),
"/proc/timer_list".to_string(),
"/proc/timer_stats".to_string(),
"/proc/sysrq-trigger".to_string(), "/proc/kpagecount".to_string(),
"/proc/kpageflags".to_string(),
"/proc/kpagecgroup".to_string(),
"/proc/config.gz".to_string(),
"/proc/kallsyms".to_string(),
"/sys/firmware".to_string(),
],
readonly_paths: vec![
"/proc/bus".to_string(),
"/proc/fs".to_string(),
"/proc/irq".to_string(),
"/proc/sys".to_string(),
],
devices: vec![
OciDevice {
device_type: "c".to_string(),
path: "/dev/null".to_string(),
major: Some(1),
minor: Some(3),
file_mode: Some(0o666),
uid: Some(0),
gid: Some(0),
},
OciDevice {
device_type: "c".to_string(),
path: "/dev/zero".to_string(),
major: Some(1),
minor: Some(5),
file_mode: Some(0o666),
uid: Some(0),
gid: Some(0),
},
OciDevice {
device_type: "c".to_string(),
path: "/dev/full".to_string(),
major: Some(1),
minor: Some(7),
file_mode: Some(0o666),
uid: Some(0),
gid: Some(0),
},
OciDevice {
device_type: "c".to_string(),
path: "/dev/random".to_string(),
major: Some(1),
minor: Some(8),
file_mode: Some(0o666),
uid: Some(0),
gid: Some(0),
},
OciDevice {
device_type: "c".to_string(),
path: "/dev/urandom".to_string(),
major: Some(1),
minor: Some(9),
file_mode: Some(0o666),
uid: Some(0),
gid: Some(0),
},
],
seccomp: None,
rootfs_propagation: Some("rprivate".to_string()),
sysctl: HashMap::new(),
cgroups_path: None,
intel_rdt: None,
}),
}
}
pub fn with_resources(mut self, limits: &ResourceLimits) -> Self {
let mut resources = OciResources {
memory: None,
cpu: None,
pids: None,
};
if let Some(memory_bytes) = limits.memory_bytes {
resources.memory = Some(OciMemory {
limit: Some(memory_bytes as i64),
});
}
if let Some(quota_us) = limits.cpu_quota_us {
resources.cpu = Some(OciCpu {
quota: Some(quota_us as i64),
period: Some(limits.cpu_period_us),
});
}
if let Some(pids_max) = limits.pids_max {
resources.pids = Some(OciPids {
limit: pids_max as i64,
});
}
if let Some(linux) = &mut self.linux {
linux.resources = Some(resources);
}
self
}
pub fn with_env(mut self, vars: &[(String, String)]) -> Self {
for (key, value) in vars {
self.process.env.push(format!("{}={}", key, value));
}
self
}
pub fn with_sd_notify(mut self) -> Self {
if let Ok(notify_socket) = std::env::var("NOTIFY_SOCKET") {
self.process
.env
.push(format!("NOTIFY_SOCKET={}", notify_socket));
}
self
}
pub fn with_secret_mounts(mut self, secrets: &[crate::container::SecretMount]) -> Self {
for secret in secrets {
self.mounts.push(OciMount {
destination: secret.dest.to_string_lossy().to_string(),
source: secret.source.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options: vec![
"bind".to_string(),
"ro".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
"noexec".to_string(),
],
});
}
self
}
pub fn with_process_identity(mut self, identity: &crate::container::ProcessIdentity) -> Self {
self.process.user.uid = identity.uid;
self.process.user.gid = identity.gid;
self.process.user.additional_gids = if identity.additional_gids.is_empty() {
None
} else {
Some(identity.additional_gids.clone())
};
self
}
pub fn with_inmemory_secret_mounts(
mut self,
stage_dir: &Path,
secrets: &[crate::container::SecretMount],
) -> Result<Self> {
self.mounts.push(OciMount {
destination: "/run/secrets".to_string(),
source: stage_dir.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options: vec![
"bind".to_string(),
"ro".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
"noexec".to_string(),
],
});
for secret in secrets {
let dest = normalize_container_destination(&secret.dest)?;
if !secret.source.starts_with(stage_dir) {
return Err(NucleusError::ConfigError(format!(
"Staged secret source {:?} must live under {:?}",
secret.source, stage_dir
)));
}
self.mounts.push(OciMount {
destination: dest.to_string_lossy().to_string(),
source: secret.source.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options: vec![
"bind".to_string(),
"ro".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
"noexec".to_string(),
],
});
}
Ok(self)
}
pub fn with_volume_mounts(mut self, volumes: &[crate::container::VolumeMount]) -> Result<Self> {
use crate::container::VolumeSource;
for volume in volumes {
let dest = normalize_container_destination(&volume.dest)?;
match &volume.source {
VolumeSource::Bind { source } => {
crate::filesystem::validate_bind_mount_source(source)?;
let mut options = vec![
"bind".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
];
if volume.read_only {
options.push("ro".to_string());
}
self.mounts.push(OciMount {
destination: dest.to_string_lossy().to_string(),
source: source.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options,
});
}
VolumeSource::Tmpfs { size } => {
let mut options = vec![
"nosuid".to_string(),
"nodev".to_string(),
"mode=0755".to_string(),
];
if volume.read_only {
options.push("ro".to_string());
}
if let Some(size) = size {
options.push(format!("size={}", size));
}
self.mounts.push(OciMount {
destination: dest.to_string_lossy().to_string(),
source: "tmpfs".to_string(),
mount_type: "tmpfs".to_string(),
options,
});
}
}
}
Ok(self)
}
pub fn with_context_bind(mut self, context_dir: &std::path::Path) -> Self {
self.mounts.push(OciMount {
destination: "/context".to_string(),
source: context_dir.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options: vec![
"bind".to_string(),
"ro".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
],
});
self
}
pub fn with_rootfs_binds(mut self, rootfs_path: &std::path::Path) -> Self {
let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
for subdir in &subdirs {
let source = rootfs_path.join(subdir);
if source.exists() {
self.mounts.push(OciMount {
destination: format!("/{}", subdir),
source: source.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options: vec![
"bind".to_string(),
"ro".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
],
});
}
}
self
}
pub fn with_namespace_config(mut self, config: &NamespaceConfig) -> Self {
let mut namespaces = Vec::new();
if config.pid {
namespaces.push(OciNamespace {
namespace_type: "pid".to_string(),
});
}
if config.net {
namespaces.push(OciNamespace {
namespace_type: "network".to_string(),
});
}
if config.ipc {
namespaces.push(OciNamespace {
namespace_type: "ipc".to_string(),
});
}
if config.uts {
namespaces.push(OciNamespace {
namespace_type: "uts".to_string(),
});
}
if config.mnt {
namespaces.push(OciNamespace {
namespace_type: "mount".to_string(),
});
}
if config.cgroup {
namespaces.push(OciNamespace {
namespace_type: "cgroup".to_string(),
});
}
if config.time {
namespaces.push(OciNamespace {
namespace_type: "time".to_string(),
});
}
if config.user {
namespaces.push(OciNamespace {
namespace_type: "user".to_string(),
});
}
if let Some(linux) = &mut self.linux {
linux.namespaces = Some(namespaces);
}
self
}
pub fn with_host_runtime_binds(mut self) -> Self {
let host_paths: BTreeSet<String> =
["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"]
.iter()
.map(|s| s.to_string())
.collect();
for host_path in host_paths {
let source = Path::new(&host_path);
if !source.exists() {
continue;
}
self.mounts.push(OciMount {
destination: host_path.clone(),
source: source.to_string_lossy().to_string(),
mount_type: "bind".to_string(),
options: vec![
"bind".to_string(),
"ro".to_string(),
"nosuid".to_string(),
"nodev".to_string(),
],
});
}
self
}
pub fn with_user_namespace(mut self) -> Self {
if let Some(linux) = &mut self.linux {
if let Some(namespaces) = &mut linux.namespaces {
namespaces.push(OciNamespace {
namespace_type: "user".to_string(),
});
}
}
self
}
pub fn with_rootless_user_namespace(mut self, config: &UserNamespaceConfig) -> Self {
if let Some(linux) = &mut self.linux {
if let Some(namespaces) = &mut linux.namespaces {
namespaces.retain(|ns| ns.namespace_type != "network");
if !namespaces.iter().any(|ns| ns.namespace_type == "user") {
namespaces.push(OciNamespace {
namespace_type: "user".to_string(),
});
}
}
linux.uid_mappings = config.uid_mappings.iter().map(OciIdMapping::from).collect();
linux.gid_mappings = config.gid_mappings.iter().map(OciIdMapping::from).collect();
}
self
}
pub fn with_hooks(mut self, hooks: OciHooks) -> Self {
if hooks.is_empty() {
self.hooks = None;
} else {
self.hooks = Some(hooks);
}
self
}
pub fn with_rlimits(mut self, limits: &ResourceLimits) -> Self {
let mut rlimits = Vec::with_capacity(3);
if let Some(nproc_limit) = limits.pids_max {
rlimits.push(OciRlimit {
limit_type: "RLIMIT_NPROC".to_string(),
hard: nproc_limit,
soft: nproc_limit,
});
}
rlimits.push(OciRlimit {
limit_type: "RLIMIT_NOFILE".to_string(),
hard: 1024,
soft: 1024,
});
let memlock_limit = limits.memlock_bytes.unwrap_or(64 * 1024);
rlimits.push(OciRlimit {
limit_type: "RLIMIT_MEMLOCK".to_string(),
hard: memlock_limit,
soft: memlock_limit,
});
self.process.rlimits = rlimits;
self
}
pub fn with_seccomp(mut self, seccomp: OciSeccomp) -> Self {
if let Some(linux) = &mut self.linux {
linux.seccomp = Some(seccomp);
}
self
}
pub fn with_cgroups_path(mut self, path: String) -> Self {
if let Some(linux) = &mut self.linux {
linux.cgroups_path = Some(path);
}
self
}
pub fn with_sysctl(mut self, sysctl: HashMap<String, String>) -> Self {
if let Some(linux) = &mut self.linux {
linux.sysctl = sysctl;
}
self
}
pub fn with_annotations(mut self, annotations: HashMap<String, String>) -> Self {
self.annotations = annotations;
self
}
}
impl From<&IdMapping> for OciIdMapping {
fn from(mapping: &IdMapping) -> Self {
Self {
container_id: mapping.container_id,
host_id: mapping.host_id,
size: mapping.count,
}
}
}
pub struct OciBundle {
bundle_path: PathBuf,
config: OciConfig,
}
impl OciBundle {
pub fn new(bundle_path: PathBuf, config: OciConfig) -> Self {
Self {
bundle_path,
config,
}
}
pub fn create(&self) -> Result<()> {
info!("Creating OCI bundle at {:?}", self.bundle_path);
fs::create_dir_all(&self.bundle_path).map_err(|e| {
NucleusError::GVisorError(format!(
"Failed to create bundle directory {:?}: {}",
self.bundle_path, e
))
})?;
fs::set_permissions(&self.bundle_path, fs::Permissions::from_mode(0o700)).map_err(|e| {
NucleusError::GVisorError(format!(
"Failed to secure bundle directory permissions {:?}: {}",
self.bundle_path, e
))
})?;
let rootfs = self.bundle_path.join("rootfs");
fs::create_dir_all(&rootfs).map_err(|e| {
NucleusError::GVisorError(format!("Failed to create rootfs directory: {}", e))
})?;
fs::set_permissions(&rootfs, fs::Permissions::from_mode(0o755)).map_err(|e| {
NucleusError::GVisorError(format!(
"Failed to set rootfs directory permissions {:?}: {}",
rootfs, e
))
})?;
let config_path = self.bundle_path.join("config.json");
let config_json = serde_json::to_string_pretty(&self.config).map_err(|e| {
NucleusError::GVisorError(format!("Failed to serialize OCI config: {}", e))
})?;
let mut file = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.mode(0o600)
.custom_flags(libc::O_NOFOLLOW)
.open(&config_path)
.map_err(|e| NucleusError::GVisorError(format!("Failed to open config.json: {}", e)))?;
file.write_all(config_json.as_bytes()).map_err(|e| {
NucleusError::GVisorError(format!("Failed to write config.json: {}", e))
})?;
file.sync_all()
.map_err(|e| NucleusError::GVisorError(format!("Failed to sync config.json: {}", e)))?;
debug!("Created OCI bundle structure at {:?}", self.bundle_path);
Ok(())
}
pub fn rootfs_path(&self) -> PathBuf {
self.bundle_path.join("rootfs")
}
pub fn bundle_path(&self) -> &Path {
&self.bundle_path
}
pub fn cleanup(&self) -> Result<()> {
if self.bundle_path.exists() {
fs::remove_dir_all(&self.bundle_path).map_err(|e| {
NucleusError::GVisorError(format!("Failed to cleanup bundle: {}", e))
})?;
debug!("Cleaned up OCI bundle at {:?}", self.bundle_path);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
#[test]
fn test_oci_config_new() {
let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
assert_eq!(config.oci_version, "1.0.2");
assert_eq!(config.root.path, "rootfs");
assert_eq!(config.process.args, vec!["/bin/sh"]);
assert_eq!(config.hostname, Some("test".to_string()));
}
#[test]
fn test_oci_config_with_resources() {
let limits = ResourceLimits::unlimited()
.with_memory("512M")
.unwrap()
.with_cpu_cores(2.0)
.unwrap();
let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_resources(&limits);
assert!(config.linux.is_some());
let linux = config.linux.unwrap();
assert!(linux.resources.is_some());
let resources = linux.resources.unwrap();
assert!(resources.memory.is_some());
assert!(resources.cpu.is_some());
}
#[test]
fn test_oci_bundle_create() {
let temp_dir = TempDir::new().unwrap();
let bundle_path = temp_dir.path().join("test-bundle");
let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
let bundle = OciBundle::new(bundle_path.clone(), config);
bundle.create().unwrap();
assert!(bundle_path.exists());
assert!(bundle_path.join("rootfs").exists());
assert!(bundle_path.join("config.json").exists());
bundle.cleanup().unwrap();
assert!(!bundle_path.exists());
}
#[test]
fn test_oci_config_serialization() {
let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
let json = serde_json::to_string_pretty(&config).unwrap();
assert!(json.contains("ociVersion"));
assert!(json.contains("1.0.2"));
assert!(json.contains("/bin/sh"));
let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
assert_eq!(deserialized.oci_version, config.oci_version);
assert_eq!(deserialized.process.args, config.process.args);
}
#[test]
fn test_host_runtime_binds_uses_fixed_paths_not_host_path() {
std::env::set_var("PATH", "/tmp/evil-inject-path/bin:/opt/attacker/sbin");
let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_host_runtime_binds();
let mount_dests: Vec<&str> = config
.mounts
.iter()
.map(|m| m.destination.as_str())
.collect();
let mount_srcs: Vec<&str> = config.mounts.iter().map(|m| m.source.as_str()).collect();
for path in &["/tmp/evil-inject-path", "/opt/attacker"] {
assert!(
!mount_dests.iter().any(|d| d.contains(path)),
"with_host_runtime_binds must not use host $PATH – found {:?} in mount destinations",
path
);
assert!(
!mount_srcs.iter().any(|s| s.contains(path)),
"with_host_runtime_binds must not use host $PATH – found {:?} in mount sources",
path
);
}
let allowed_prefixes = ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"];
for mount in &config.mounts {
if mount.mount_type == "bind" {
assert!(
allowed_prefixes
.iter()
.any(|p| mount.destination.starts_with(p)),
"unexpected bind mount destination: {} – only FHS paths allowed",
mount.destination
);
}
}
}
#[test]
fn test_volume_mounts_include_bind_and_tmpfs_options() {
let tmp = tempfile::TempDir::new().unwrap();
let config = OciConfig::new(vec!["/bin/sh".to_string()], None)
.with_volume_mounts(&[
crate::container::VolumeMount {
source: crate::container::VolumeSource::Bind {
source: tmp.path().to_path_buf(),
},
dest: std::path::PathBuf::from("/var/lib/app"),
read_only: true,
},
crate::container::VolumeMount {
source: crate::container::VolumeSource::Tmpfs {
size: Some("64M".to_string()),
},
dest: std::path::PathBuf::from("/var/cache/app"),
read_only: false,
},
])
.unwrap();
assert!(config.mounts.iter().any(|mount| {
mount.destination == "/var/lib/app"
&& mount.mount_type == "bind"
&& mount.options.contains(&"ro".to_string())
}));
assert!(config.mounts.iter().any(|mount| {
mount.destination == "/var/cache/app"
&& mount.mount_type == "tmpfs"
&& mount.options.contains(&"size=64M".to_string())
}));
}
#[test]
fn test_volume_mounts_reject_sensitive_host_sources() {
let err = OciConfig::new(vec!["/bin/sh".to_string()], None)
.with_volume_mounts(&[crate::container::VolumeMount {
source: crate::container::VolumeSource::Bind {
source: std::path::PathBuf::from("/proc/sys"),
},
dest: std::path::PathBuf::from("/host-proc"),
read_only: true,
}])
.unwrap_err();
assert!(err.to_string().contains("sensitive host path"));
}
#[test]
fn test_oci_config_with_process_identity() {
let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_process_identity(
&crate::container::ProcessIdentity {
uid: 1001,
gid: 1002,
additional_gids: vec![1003, 1004],
},
);
assert_eq!(config.process.user.uid, 1001);
assert_eq!(config.process.user.gid, 1002);
assert_eq!(config.process.user.additional_gids, Some(vec![1003, 1004]));
}
#[test]
fn test_oci_config_with_rlimits_uses_configured_memlock() {
let limits = ResourceLimits::default()
.with_pids(99)
.unwrap()
.with_memlock("8M")
.unwrap();
let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
assert!(config.process.rlimits.iter().any(|limit| {
limit.limit_type == "RLIMIT_NPROC" && limit.soft == 99 && limit.hard == 99
}));
assert!(config.process.rlimits.iter().any(|limit| {
limit.limit_type == "RLIMIT_MEMLOCK"
&& limit.soft == 8 * 1024 * 1024
&& limit.hard == 8 * 1024 * 1024
}));
}
#[test]
fn test_oci_config_with_rlimits_omits_nproc_when_unlimited() {
let limits = ResourceLimits {
pids_max: None,
..ResourceLimits::default()
};
let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
assert!(
!config
.process
.rlimits
.iter()
.any(|limit| limit.limit_type == "RLIMIT_NPROC"),
"RLIMIT_NPROC must be omitted when pids_max is unlimited"
);
}
#[test]
fn test_oci_config_uses_hardcoded_path_not_host() {
std::env::set_var("PATH", "/nix/store/secret-hash/bin:/home/user/.local/bin");
let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
let path_env = config
.process
.env
.iter()
.find(|e| e.starts_with("PATH="))
.expect("PATH env must be set");
assert_eq!(
path_env, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"OCI config must not leak host PATH"
);
assert!(
!path_env.contains("/nix/store/secret"),
"Host PATH must not leak into container"
);
}
#[test]
fn test_oci_hooks_serialization_roundtrip() {
let hooks = OciHooks {
create_runtime: vec![OciHook {
path: "/usr/bin/hook1".to_string(),
args: vec!["hook1".to_string(), "--arg1".to_string()],
env: vec!["FOO=bar".to_string()],
timeout: Some(10),
}],
create_container: vec![],
start_container: vec![],
poststart: vec![OciHook {
path: "/usr/bin/hook2".to_string(),
args: vec![],
env: vec![],
timeout: None,
}],
poststop: vec![],
};
let json = serde_json::to_string_pretty(&hooks).unwrap();
assert!(json.contains("createRuntime"));
assert!(json.contains("/usr/bin/hook1"));
assert!(!json.contains("createContainer"));
let deserialized: OciHooks = serde_json::from_str(&json).unwrap();
assert_eq!(deserialized.create_runtime.len(), 1);
assert_eq!(deserialized.create_runtime[0].path, "/usr/bin/hook1");
assert_eq!(deserialized.create_runtime[0].timeout, Some(10));
assert_eq!(deserialized.poststart.len(), 1);
assert!(deserialized.create_container.is_empty());
}
#[test]
fn test_oci_hooks_is_empty() {
let empty = OciHooks::default();
assert!(empty.is_empty());
let not_empty = OciHooks {
poststop: vec![OciHook {
path: "/bin/cleanup".to_string(),
args: vec![],
env: vec![],
timeout: None,
}],
..Default::default()
};
assert!(!not_empty.is_empty());
}
#[test]
fn test_oci_config_with_hooks() {
let hooks = OciHooks {
create_runtime: vec![OciHook {
path: "/usr/bin/setup".to_string(),
args: vec![],
env: vec![],
timeout: None,
}],
..Default::default()
};
let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(hooks);
assert!(config.hooks.is_some());
let json = serde_json::to_string_pretty(&config).unwrap();
assert!(json.contains("hooks"));
assert!(json.contains("createRuntime"));
let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
assert!(deserialized.hooks.is_some());
assert_eq!(deserialized.hooks.unwrap().create_runtime.len(), 1);
}
#[test]
fn test_oci_config_with_empty_hooks_serializes_without_hooks() {
let config =
OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(OciHooks::default());
assert!(config.hooks.is_none());
let json = serde_json::to_string_pretty(&config).unwrap();
assert!(!json.contains("hooks"));
}
#[test]
fn test_oci_hook_rejects_relative_path() {
let hook = OciHook {
path: "relative/path".to_string(),
args: vec![],
env: vec![],
timeout: None,
};
let state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: "test".to_string(),
status: OciStatus::Creating,
pid: 1234,
bundle: "/tmp/bundle".to_string(),
};
let result = OciHooks::run_hooks(&[hook], &state, "test");
assert!(result.is_err());
let err_msg = result.unwrap_err().to_string();
assert!(err_msg.contains("absolute"), "error: {}", err_msg);
}
fn original_path() -> String {
if let Ok(environ) = std::fs::read("/proc/self/environ") {
for entry in environ.split(|&b| b == 0) {
if let Ok(s) = std::str::from_utf8(entry) {
if let Some(val) = s.strip_prefix("PATH=") {
return val.to_string();
}
}
}
}
String::new()
}
fn find_bash() -> String {
let candidates = ["/bin/bash", "/usr/bin/bash"];
for c in &candidates {
if std::path::Path::new(c).exists() {
return c.to_string();
}
}
for dir in original_path().split(':') {
let candidate = std::path::PathBuf::from(dir).join("bash");
if candidate.exists() {
return candidate.to_string_lossy().to_string();
}
}
panic!("Cannot find bash binary for test");
}
fn write_script(path: &std::path::Path, body: &str) {
use std::io::Write as IoWrite;
let bash = find_bash();
let orig_path = original_path();
let content = format!("#!{}\nexport PATH='{}'\n{}", bash, orig_path, body);
let mut f = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.mode(0o755)
.open(path)
.unwrap();
f.write_all(content.as_bytes()).unwrap();
f.sync_all().unwrap();
drop(f);
}
#[test]
fn test_oci_hook_executes_successfully() {
let temp_dir = TempDir::new().unwrap();
let hook_script = temp_dir.path().join("hook.sh");
let output_file = temp_dir.path().join("output.json");
write_script(
&hook_script,
&format!("cat > {}\n", output_file.to_string_lossy()),
);
let hook = OciHook {
path: hook_script.to_string_lossy().to_string(),
args: vec![],
env: vec![],
timeout: Some(5),
};
let state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: "test-container".to_string(),
status: OciStatus::Creating,
pid: 12345,
bundle: "/tmp/test-bundle".to_string(),
};
OciHooks::run_hooks(&[hook], &state, "createRuntime").unwrap();
let written = std::fs::read_to_string(&output_file).unwrap();
let parsed: serde_json::Value = serde_json::from_str(&written).unwrap();
assert_eq!(parsed["id"], "test-container");
assert_eq!(parsed["pid"], 12345);
assert_eq!(parsed["status"], "creating");
}
#[test]
fn test_oci_hook_nonzero_exit_is_error() {
let temp_dir = TempDir::new().unwrap();
let hook_script = temp_dir.path().join("fail.sh");
write_script(&hook_script, "exit 1\n");
let hook = OciHook {
path: hook_script.to_string_lossy().to_string(),
args: vec![],
env: vec![],
timeout: Some(5),
};
let state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: "test".to_string(),
status: OciStatus::Creating,
pid: 1,
bundle: "".to_string(),
};
let result = OciHooks::run_hooks(&[hook], &state, "test");
assert!(result.is_err());
assert!(result
.unwrap_err()
.to_string()
.contains("exited with status"));
}
#[test]
fn test_oci_hooks_best_effort_continues_on_failure() {
let temp_dir = TempDir::new().unwrap();
let fail_script = temp_dir.path().join("fail.sh");
write_script(&fail_script, "exit 1\n");
let marker = temp_dir.path().join("ran");
let ok_script = temp_dir.path().join("ok.sh");
write_script(&ok_script, &format!("touch {}\n", marker.to_string_lossy()));
let hooks = vec![
OciHook {
path: fail_script.to_string_lossy().to_string(),
args: vec![],
env: vec![],
timeout: Some(5),
},
OciHook {
path: ok_script.to_string_lossy().to_string(),
args: vec![],
env: vec![],
timeout: Some(5),
},
];
let state = OciContainerState {
oci_version: "1.0.2".to_string(),
id: "test".to_string(),
status: OciStatus::Stopped,
pid: 0,
bundle: "".to_string(),
};
OciHooks::run_hooks_best_effort(&hooks, &state, "poststop");
assert!(marker.exists(), "second hook should run after first fails");
}
}