#![expect(clippy::disallowed_types)]
use std::{
collections::{HashMap, HashSet},
env,
ffi::{OsStr, OsString},
fmt::Write as FmtWrite,
fs::{self, DirBuilder},
io::{BufReader, BufWriter, Write as IOWrite},
os::{
fd::{AsRawFd, OwnedFd, RawFd},
linux::fs::MetadataExt,
unix::fs::{DirBuilderExt, PermissionsExt},
},
path::{Path, PathBuf},
process::{exit, ExitCode},
rc::Rc,
str::FromStr,
};
use clap::Parser;
use libcgroups::common::{CgroupManager, ControllerOpt};
use libcontainer::{
apparmor,
config::YoukiConfig,
container::{builder::ContainerBuilder, Container, ContainerStatus},
error::{ErrInvalidSpec, LibcontainerError, MissingSpecError},
hooks,
notify_socket::{NotifyListener, NotifySocket, NOTIFY_FILE},
process,
process::{args::ContainerArgs, intel_rdt::delete_resctrl_subdirectory},
signal::Signal,
syscall::{
linux::MountOption,
syscall::{create_syscall, SyscallType},
},
tty,
user_ns::UserNamespaceConfig,
utils,
utils::{rootless_required, PathBufExt},
workload::{Executor, ExecutorError, ExecutorValidationError},
};
use liboci_cli::{
Checkpoint, CommonCmd, Create, Delete, Events, Exec, Features, GlobalOpts, Kill, List, Pause,
Ps, Resume, Run, StandardCmd, Start, State, Update,
};
use nix::{
errno::Errno,
fcntl::OFlag,
sys::{
signal,
signal::kill,
signalfd::SigSet,
stat::{fchmod, Mode},
wait::{Id, WaitPidFlag},
},
unistd::{pipe2, read, Gid, Pid, Uid},
};
use oci_spec::runtime::{
ApparmorBuilder, Capabilities as SpecCapabilities, Capability, CgroupBuilder, FeaturesBuilder,
IDMapBuilder, IntelRdtBuilder, LinuxBuilder, LinuxCapabilities, LinuxCapabilitiesBuilder,
LinuxFeatureBuilder, LinuxIdMappingBuilder, LinuxNamespace, LinuxNamespaceBuilder,
LinuxNamespaceType, LinuxPidsBuilder, LinuxResources, LinuxResourcesBuilder,
LinuxSchedulerPolicy, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscall,
LinuxSyscallBuilder, Mount, MountExtensionsBuilder, Process, ProcessBuilder, SelinuxBuilder,
Spec, VERSION,
};
use procfs::process::Namespace;
use serde_json::to_writer_pretty;
use syd::{
compat::{openat2, set_name, set_no_new_privs, waitid, ResolveFlag, WaitStatus},
config::ENV_SKIP_SCMP,
confine::secure_getenv,
err::{err2io, SydError, SydResult},
fd::AT_BADFD,
hook::Supervisor,
ignore_signals,
log::log_init,
lookup::{safe_open_file2, safe_open_how},
path::{XPath, XPathBuf},
retry::retry_on_eintr,
sandbox::Sandbox,
syslog::LogLevel,
IgnoreSignalOpts,
};
use tabwriter::TabWriter;
use tracing_subscriber::layer::SubscriberExt;
// Global allocator selection.
// The hardened allocator is used only on 64-bit targets with 4k pages that
// are neither Android nor riscv64, and only when the build is not a coverage
// or profiling build.
#[cfg(all(
not(coverage),
not(feature = "prof"),
not(target_os = "android"),
not(target_arch = "riscv64"),
target_page_size_4k,
target_pointer_width = "64"
))]
#[global_allocator]
static GLOBAL: hardened_malloc::HardenedMalloc = hardened_malloc::HardenedMalloc;
// Builds with the "prof" feature use tcmalloc as the global allocator instead.
#[cfg(feature = "prof")]
#[global_allocator]
static GLOBAL: tcmalloc::TCMalloc = tcmalloc::TCMalloc;
/// `tracing_subscriber` layer that forwards `tracing` events to Syd's debug log.
pub struct SydLayer;
impl<S> tracing_subscriber::layer::Layer<S> for SydLayer
where
S: tracing::Subscriber + for<'a> tracing_subscriber::registry::LookupSpan<'a>,
{
/// Events are enabled only when Syd's logger accepts `LogLevel::Debug`.
fn event_enabled(
&self,
_event: &tracing::Event,
_ctx: tracing_subscriber::layer::Context<S>,
) -> bool {
syd::log_enabled!(LogLevel::Debug)
}
/// Logs the `Debug` rendering of `event` under the "oci_trace" context.
fn on_event(&self, event: &tracing::Event, _ctx: tracing_subscriber::layer::Context<S>) {
syd::debug!("ctx": "oci_trace", "event": format!("{event:?}"));
}
}
/// Workload executor that runs the container process under the Syd
/// supervisor (see its `Executor` implementation).
#[derive(Clone)]
struct SydExecutor {}
impl Executor for SydExecutor {
/// Runs the container process described by `spec` under the Syd supervisor.
///
/// Sets no-new-privs, exports spec environment variables that are not
/// already present in the current environment, builds a `Sandbox` from the
/// spec, adjusts signal dispositions and hands control to `Supervisor::run`.
/// On success the process exits with the supervisor's exit code, so this
/// function only ever returns on error.
///
/// # Panics
///
/// Panics when the spec has no process, no args, or an empty args list.
fn exec(&self, spec: &Spec) -> Result<(), ExecutorError> {
    if set_no_new_privs().is_err() {
        return Err(ExecutorError::CantHandle(
            "Failed to set no-new-privs attribute!",
        ));
    }
    // Renaming the process is best-effort: a failure is ignored.
    let _ = set_name(c"syd_oci");
    syd::t!(
        "Syd-OCI-Spec: {}",
        serde_json::to_string(spec).unwrap_or("?".to_string())
    );
    #[expect(clippy::disallowed_methods)]
    let oci_process = spec
        .process()
        .as_ref()
        .expect("oci_spec::runtime::Spec::process");
    #[expect(clippy::disallowed_methods)]
    let full_argv: Vec<OsString> = oci_process
        .args()
        .as_ref()
        .expect("oci_spec::runtime::Process::args!None")
        .iter()
        .map(OsString::from)
        .collect();
    // First element is the command, the rest are its arguments.
    #[expect(clippy::disallowed_methods)]
    let (comm, argv) = full_argv
        .split_first()
        .map(|(head, tail)| (head, tail.to_vec()))
        .expect("oci_spec::runtime::Process::args!Empty");
    // Export `KEY=VALUE` entries from the spec; existing variables win and
    // entries without '=' are skipped.
    for entry in oci_process.env().iter().flatten() {
        let Some((key, value)) = entry.split_once('=') else {
            continue;
        };
        let key = OsString::from(key);
        if env::var_os(&key).is_none() {
            env::set_var(key, OsString::from(value));
        }
    }
    let sandbox = Sandbox::try_from(spec)
        .map_err(|_| ExecutorError::CantHandle("Failed to initialize Syd!"))?;
    syd::t!(
        "Syd-OCI-Sandbox: {}",
        serde_json::to_string(&sandbox).unwrap_or("?".to_string())
    );
    let mut opts = IgnoreSignalOpts::SkipIgnoreAlarm;
    if sandbox.options.allow_unsafe_prlimit() {
        opts.insert(IgnoreSignalOpts::SkipIgnoreCoreDump);
    }
    ignore_signals(opts).map_err(|_| ExecutorError::CantHandle("Failed to ignore signals!"))?;
    match Supervisor::run(sandbox, None, comm, argv, None) {
        // `exit` never returns, so the success arm diverges.
        Ok(code) => exit(i32::from(code)),
        Err(err) => Err(ExecutorError::Execution(err.into())),
    }
}
/// Validates that the process in `spec` can be executed.
///
/// When the process declares arguments, this requires a `PATH=` entry in
/// the process environment, a non-empty argument list, and that the first
/// argument resolves (via `get_executable_path`) to a file which
/// `is_executable` accepts. A process without `args` is accepted as is.
///
/// # Errors
///
/// Returns `ExecutorValidationError::ArgValidationError` when the spec has
/// no process, `PATH` is missing, the argument list is empty, the
/// executable cannot be found, or its permissions cannot be
/// verified or are insufficient.
fn validate(&self, spec: &Spec) -> Result<(), ExecutorValidationError> {
    let proc = spec.process().as_ref().ok_or_else(|| {
        ExecutorValidationError::ArgValidationError("spec did not contain process".into())
    })?;
    let Some(args) = proc.args() else {
        return Ok(());
    };
    // Use the first `PATH=` entry. `strip_prefix` removes the prefix exactly
    // once, so a value that itself starts with "PATH=" is kept intact.
    let Some(path_var) = proc
        .env()
        .iter()
        .flatten()
        .find_map(|entry| entry.strip_prefix("PATH="))
    else {
        syd::t!("PATH environment variable is not set");
        return Err(ExecutorValidationError::ArgValidationError(
            "PATH environment variable is not set".into(),
        ));
    };
    // An empty argument list is a validation error rather than a panic.
    let Some(arg0) = args.first() else {
        syd::t!("container process arguments are empty");
        return Err(ExecutorValidationError::ArgValidationError(
            "container process arguments are empty".into(),
        ));
    };
    let Some(path) = get_executable_path(arg0, path_var) else {
        syd::t!("executable for container process not found in PATH");
        return Err(ExecutorValidationError::ArgValidationError(format!(
            "executable '{}' not found in $PATH",
            arg0
        )));
    };
    match is_executable(&path) {
        Ok(true) => {
            syd::t!("found executable in executor");
            Ok(())
        }
        Ok(false) => {
            syd::t!("executable does not have the correct permission set");
            Err(ExecutorValidationError::ArgValidationError(format!(
                "executable '{}' at path '{:?}' does not have correct permissions",
                arg0, path
            )))
        }
        Err(err) => {
            syd::t!("failed to check permissions for executable: {err}");
            Err(ExecutorValidationError::ArgValidationError(format!(
                "failed to check permissions for executable '{}' at path '{:?}' : {}",
                arg0, path, err
            )))
        }
    }
}
}
// Namespace names looked up in the init process' namespace map when building
// the namespace list for a tenant container.
const NAMESPACE_TYPES: &[&str] = &["ipc", "uts", "net", "pid", "mnt", "cgroup"];
// Name prefix of the per-tenant notify socket inside the container directory.
const TENANT_NOTIFY: &str = "not-";
// Name prefix passed to the console socket setup for a tenant container.
const TENANT_TTY: &str = "tty-";
/// Builder for a tenant container: an additional process that joins the
/// namespaces of an already existing container.
struct SydTenantContainerBuilder {
/// Underlying libcontainer builder; `build` takes its stdin/stdout/stderr.
base: ContainerBuilder,
/// Environment for the tenant process, rendered as `KEY=VALUE` entries.
env: HashMap<String, String>,
/// Working directory of the tenant process; must be absolute when set.
cwd: Option<PathBuf>,
/// Arguments of the tenant process; required when `process` is not set.
args: Vec<String>,
/// Optional no-new-privileges setting for the generated process.
no_new_privs: Option<bool>,
/// Capability names to add to the generated process.
capabilities: Vec<String>,
/// Path to a JSON file holding the process definition; when set it is used
/// instead of building a process from the fields above.
process: Option<PathBuf>,
/// Passed through to the container arguments as `detached`.
detached: bool,
/// Passed through to the container arguments as `as_sibling`.
as_sibling: bool,
/// Syscall implementation selector handed to libcontainer.
syscall: SyscallType,
/// Identifier of the existing container to join.
container_id: String,
/// Optional file the tenant process id is written to.
pid_file: Option<PathBuf>,
/// Number of file descriptors to preserve (one more when a log file is configured).
preserve_fds: i32,
/// Executor used to run the workload.
executor: Box<dyn Executor>,
/// Root directory under which container state directories live.
root_path: PathBuf,
/// Optional console socket path used for the tenant's tty.
console_socket: Option<PathBuf>,
}
impl SydTenantContainerBuilder {
fn new(opt: GlobalOpts, args: Exec) -> SydResult<Self> {
let syscall = SyscallType::default();
let container_id = args.container_id.clone();
let pid_file = if let Some(ref p) = args.pid_file {
Some(p.canonicalize_safely()?)
} else {
None
};
let executor = Box::new(SydExecutor {});
let mut preserve_fds = args.preserve_fds;
if opt.log.is_some() {
preserve_fds += 1; }
#[expect(clippy::disallowed_methods)]
let builder = ContainerBuilder::new(container_id.clone(), syscall)
.with_executor(SydExecutor {})
.with_root_path(opt.root.clone().unwrap())?
.with_console_socket(args.console_socket.as_ref())
.with_pid_file(pid_file.clone())?
.validate_id()?;
#[expect(clippy::disallowed_methods)]
Ok(Self {
base: builder,
env: HashMap::new(),
cwd: None,
args: Vec::new(),
no_new_privs: None,
capabilities: Vec::new(),
process: None,
detached: false,
as_sibling: false,
syscall,
container_id,
pid_file,
preserve_fds,
executor,
root_path: opt.root.unwrap(),
console_socket: args.console_socket,
})
}
pub fn with_env(mut self, env: HashMap<String, String>) -> Self {
self.env = env;
self
}
pub fn with_cwd<P: Into<PathBuf>>(mut self, path: Option<P>) -> Self {
self.cwd = path.map(|p| p.into());
self
}
pub fn with_container_args(mut self, args: Vec<String>) -> Self {
self.args = args;
self
}
pub fn with_no_new_privs(mut self, no_new_privs: bool) -> Self {
self.no_new_privs = Some(no_new_privs);
self
}
#[expect(dead_code)]
pub fn with_capabilities(mut self, capabilities: Vec<String>) -> Self {
self.capabilities = capabilities;
self
}
pub fn with_process<P: Into<PathBuf>>(mut self, path: Option<P>) -> Self {
self.process = path.map(|p| p.into());
self
}
pub fn with_detach(mut self, detached: bool) -> Self {
self.detached = detached;
self
}
pub fn build(self) -> Result<Pid, LibcontainerError> {
let container_dir = self.lookup_container_dir()?;
let container = self.load_container_state(container_dir.clone())?;
let mut spec = self.load_init_spec(&container)?;
self.adapt_spec_for_tenant(&mut spec, &container)?;
syd::t!("{spec:?}");
let notify_path = Self::setup_notify_listener(&container_dir)?;
let rootfs = fs::canonicalize(spec.root().as_ref().ok_or(MissingSpecError::Root)?.path())
.map_err(LibcontainerError::OtherIO)?;
let csocketfd = self.setup_tty_socket(&container_dir)?;
let use_systemd = self.should_use_systemd(&container);
let user_ns_config = UserNamespaceConfig::new(&spec)?;
let (read_end, write_end) = pipe2(OFlag::O_CLOEXEC)
.map_err(|e| LibcontainerError::OtherIO(std::io::Error::from_raw_os_error(e as i32)))?;
let mut builder_impl = SydContainerBuilderImpl {
container_type: ContainerType::SydTenantContainer {
exec_notify_fd: write_end.as_raw_fd(),
},
syscall: self.syscall,
container_id: self.container_id,
pid_file: self.pid_file,
console_socket: csocketfd,
use_systemd,
spec: Rc::new(spec),
rootfs,
user_ns_config,
notify_path: notify_path.clone(),
container: None,
preserve_fds: self.preserve_fds,
detached: self.detached,
executor: self.executor,
no_pivot: false,
stdin: self.base.stdin,
stdout: self.base.stdout,
stderr: self.base.stderr,
as_sibling: self.as_sibling,
};
let pid = builder_impl.create()?;
let mut notify_socket = NotifySocket::new(notify_path);
notify_socket.notify_container_start()?;
drop(write_end);
let mut err_str_buf = Vec::new();
loop {
let mut buf = [0; 3];
match read(&read_end, &mut buf).map_err(|e| {
LibcontainerError::OtherIO(std::io::Error::from_raw_os_error(e as i32))
})? {
0 => {
if err_str_buf.is_empty() {
return Ok(pid);
} else {
return Err(LibcontainerError::Other(
String::from_utf8_lossy(&err_str_buf).to_string(),
));
}
}
_ => {
err_str_buf.extend(buf);
}
}
}
}
/// Returns the state directory of the target container.
///
/// # Errors
///
/// Returns `LibcontainerError::NoDirectory` when the directory does not exist.
fn lookup_container_dir(&self) -> Result<PathBuf, LibcontainerError> {
    let dir = self.root_path.join(&self.container_id);
    if XPath::new(&dir).exists(true) {
        return Ok(dir);
    }
    syd::t!("container dir does not exist");
    Err(LibcontainerError::NoDirectory)
}
/// Loads, validates and canonicalizes `config.json` from the container's bundle.
fn load_init_spec(&self, container: &Container) -> Result<Spec, LibcontainerError> {
    let bundle = container.bundle();
    let mut init_spec = syd_spec_load(bundle.join("config.json"))?;
    Self::validate_spec(&init_spec)?;
    init_spec.canonicalize_rootfs(bundle)?;
    Ok(init_spec)
}
/// Validates the parts of `spec` that matter for a tenant container.
///
/// Checks the spec version (must start with "1."), the process I/O
/// priority (0 through 7 and a serializable class), the scheduler
/// settings (nice range, priority only for FIFO/RR, runtime, deadline and
/// period only for the deadline policy), and finally the user namespace
/// configuration.
fn validate_spec(spec: &Spec) -> Result<(), LibcontainerError> {
    if !spec.version().starts_with("1.") {
        syd::t!(
            "runtime spec has incompatible version '{}'. Only 1.X.Y is supported",
            spec.version()
        );
        return Err(ErrInvalidSpec::UnsupportedVersion.into());
    }
    if let Some(process) = spec.process() {
        if let Some(io_priority) = process.io_priority() {
            let priority = io_priority.priority();
            match serde_json::to_string(&io_priority.class()) {
                Ok(_iop_class) if !(0..=7).contains(&priority) => {
                    syd::t!("io priority '{}' not between 0 and 7 (inclusive), class '{}' not in (IO_PRIO_CLASS_RT,IO_PRIO_CLASS_BE,IO_PRIO_CLASS_IDLE)",
                        priority, _iop_class);
                    return Err(ErrInvalidSpec::IoPriority.into());
                }
                Ok(_) => {}
                Err(_e) => {
                    syd::t!("failed to parse io priority class: {_e}");
                    return Err(ErrInvalidSpec::IoPriority.into());
                }
            }
        }
        if let Some(sc) = process.scheduler() {
            let policy = sc.policy();
            if let Some(nice) = sc.nice() {
                // Nice values are only range-checked for the batch/other policies.
                let nice_checked = *policy == LinuxSchedulerPolicy::SchedBatch
                    || *policy == LinuxSchedulerPolicy::SchedOther;
                if nice_checked && !(-20..=19).contains(nice) {
                    syd::t!("invalid scheduler.nice: '{nice}', must be within -20 to 19");
                    return Err(ErrInvalidSpec::Scheduler.into());
                }
            }
            if let Some(priority) = sc.priority() {
                let realtime = *policy == LinuxSchedulerPolicy::SchedFifo
                    || *policy == LinuxSchedulerPolicy::SchedRr;
                if *priority != 0 && !realtime {
                    syd::t!("scheduler.priority can only be specified for SchedFIFO or SchedRR policy");
                    return Err(ErrInvalidSpec::Scheduler.into());
                }
            }
            if *policy != LinuxSchedulerPolicy::SchedDeadline {
                // Deadline-only parameters must be absent or zero.
                if matches!(sc.runtime(), Some(runtime) if *runtime != 0) {
                    syd::t!(
                        "scheduler runtime can only be specified for SchedDeadline policy"
                    );
                    return Err(ErrInvalidSpec::Scheduler.into());
                }
                if matches!(sc.deadline(), Some(deadline) if *deadline != 0) {
                    syd::t!(
                        "scheduler deadline can only be specified for SchedDeadline policy"
                    );
                    return Err(ErrInvalidSpec::Scheduler.into());
                }
                if matches!(sc.period(), Some(period) if *period != 0) {
                    syd::t!(
                        "scheduler period can only be specified for SchedDeadline policy"
                    );
                    return Err(ErrInvalidSpec::Scheduler.into());
                }
            }
        }
    }
    utils::validate_spec_for_new_user_ns(spec, &*create_syscall())?;
    Ok(())
}
/// Loads the container state from `container_dir` and checks that the
/// container currently accepts `exec`.
///
/// # Errors
///
/// Returns `LibcontainerError::IncorrectStatus` with the current status
/// when the container cannot exec.
fn load_container_state(&self, container_dir: PathBuf) -> Result<Container, LibcontainerError> {
    let loaded = Container::load(container_dir)?;
    if loaded.can_exec() {
        return Ok(loaded);
    }
    syd::t!("cannot exec as container");
    Err(LibcontainerError::IncorrectStatus(loaded.status()))
}
fn adapt_spec_for_tenant(
&self,
spec: &mut Spec,
container: &Container,
) -> Result<(), LibcontainerError> {
let process = if let Some(process) = &self.process {
self.get_process(process)?
} else {
let mut process_builder = ProcessBuilder::default()
.args(self.get_args()?)
.env(self.get_environment());
if let Some(cwd) = self.get_working_dir()? {
process_builder = process_builder.cwd(cwd);
}
if let Some(no_new_priv) = self.get_no_new_privileges() {
process_builder = process_builder.no_new_privileges(no_new_priv);
}
if let Some(caps) = self.get_capabilities(spec)? {
process_builder = process_builder.capabilities(caps);
}
process_builder.build()?
};
let container_pid = container.pid().ok_or(LibcontainerError::Other(
"could not retrieve container init pid".into(),
))?;
let init_process = procfs::process::Process::new(container_pid.as_raw()).map_err(|_| {
LibcontainerError::OtherIO(std::io::Error::from_raw_os_error(nix::libc::ESRCH))
})?;
let ns = self.get_namespaces(
init_process
.namespaces()
.map_err(|_| {
LibcontainerError::OtherIO(std::io::Error::from_raw_os_error(nix::libc::ESRCH))
})?
.0,
)?;
#[expect(clippy::disallowed_methods)]
let spec_linux = spec.linux().as_ref().unwrap();
let mut linux_builder = LinuxBuilder::default().namespaces(ns);
if let Some(ref cgroup_path) = spec_linux.cgroups_path() {
linux_builder = linux_builder.cgroups_path(cgroup_path.clone());
}
let linux = linux_builder.build()?;
spec.set_process(Some(process)).set_linux(Some(linux));
Ok(())
}
/// Reads a process definition from the JSON file at `process`.
///
/// # Errors
///
/// Returns `LibcontainerError::Other` when the file does not exist,
/// `OtherIO` when it cannot be opened and `OtherSerialization` when it
/// cannot be parsed.
fn get_process(&self, process: &Path) -> Result<Process, LibcontainerError> {
    if XPath::new(process).exists(true) {
        let file = utils::open(process).map_err(LibcontainerError::OtherIO)?;
        return serde_json::from_reader(BufReader::new(file))
            .map_err(LibcontainerError::OtherSerialization);
    }
    syd::t!("process.json file does not exist");
    Err(LibcontainerError::Other(
        "process.json file does not exist".into(),
    ))
}
/// Returns the configured working directory, if any.
///
/// # Errors
///
/// Returns `LibcontainerError::Other` when the directory is a relative path.
fn get_working_dir(&self) -> Result<Option<PathBuf>, LibcontainerError> {
    match &self.cwd {
        None => Ok(None),
        Some(dir) if dir.is_relative() => {
            syd::t!("current working directory must be an absolute path");
            Err(LibcontainerError::Other(
                "current working directory must be an absolute path".into(),
            ))
        }
        Some(dir) => Ok(Some(dir.clone())),
    }
}
/// Returns a copy of the tenant's arguments.
///
/// # Errors
///
/// Returns `MissingSpecError::Args` (converted) when no arguments were set.
fn get_args(&self) -> Result<Vec<String>, LibcontainerError> {
    if self.args.is_empty() {
        return Err(MissingSpecError::Args.into());
    }
    Ok(self.args.clone())
}
/// Renders the environment map as `KEY=VALUE` strings, in map iteration order.
fn get_environment(&self) -> Vec<String> {
    let mut entries = Vec::with_capacity(self.env.len());
    for (key, value) in &self.env {
        entries.push(format!("{key}={value}"));
    }
    entries
}
/// Returns the configured no-new-privileges flag, or `None` when it was never set.
fn get_no_new_privileges(&self) -> Option<bool> {
self.no_new_privs
}
/// Computes the capability sets for the tenant process.
///
/// Returns `Ok(None)` when no extra capabilities were requested. Otherwise
/// the requested capabilities are parsed and, when the spec's process
/// already defines capabilities, merged (set union) into each set that the
/// spec defines. When the spec's process defines no capabilities, every
/// set is filled with exactly the requested capabilities.
///
/// # Errors
///
/// Returns an error when a capability name cannot be parsed, when the spec
/// has no process section, or when building the result fails.
fn get_capabilities(
    &self,
    spec: &Spec,
) -> Result<Option<LinuxCapabilities>, LibcontainerError> {
    if self.capabilities.is_empty() {
        return Ok(None);
    }
    let parsed = self
        .capabilities
        .iter()
        .map(|name| {
            syd::caps::Capability::from_str(name)
                .map_err(|e| LibcontainerError::Other(e.to_string()))
        })
        .collect::<Result<Vec<syd::caps::Capability>, LibcontainerError>>()?;
    let caps: SpecCapabilities = parsed.iter().map(|c| c.spec()).collect();
    let spec_process = spec.process().as_ref().ok_or(MissingSpecError::Process)?;
    let Some(spec_caps) = spec_process.capabilities() else {
        // Nothing to merge with: every set is exactly the requested one.
        return Ok(Some(
            LinuxCapabilitiesBuilder::default()
                .bounding(caps.clone())
                .effective(caps.clone())
                .inheritable(caps.clone())
                .permitted(caps.clone())
                .ambient(caps)
                .build()?,
        ));
    };
    // Only sets present in the spec are carried over, each extended with
    // the requested capabilities.
    let mut merged = LinuxCapabilitiesBuilder::default();
    if let Some(ambient) = spec_caps.ambient() {
        let ambient: SpecCapabilities = ambient.union(&caps).copied().collect();
        merged = merged.ambient(ambient);
    }
    if let Some(bounding) = spec_caps.bounding() {
        let bounding: SpecCapabilities = bounding.union(&caps).copied().collect();
        merged = merged.bounding(bounding);
    }
    if let Some(effective) = spec_caps.effective() {
        let effective: SpecCapabilities = effective.union(&caps).copied().collect();
        merged = merged.effective(effective);
    }
    if let Some(inheritable) = spec_caps.inheritable() {
        let inheritable: SpecCapabilities = inheritable.union(&caps).copied().collect();
        merged = merged.inheritable(inheritable);
    }
    if let Some(permitted) = spec_caps.permitted() {
        let permitted: SpecCapabilities = permitted.union(&caps).copied().collect();
        merged = merged.permitted(permitted);
    }
    Ok(Some(merged.build()?))
}
/// Builds the tenant's namespace list from the init process' namespaces.
///
/// For every name in `NAMESPACE_TYPES` that is present in
/// `init_namespaces`, a namespace entry pointing at the init process'
/// namespace path is produced, in `NAMESPACE_TYPES` order.
fn get_namespaces(
    &self,
    init_namespaces: HashMap<OsString, Namespace>,
) -> Result<Vec<LinuxNamespace>, LibcontainerError> {
    NAMESPACE_TYPES
        .iter()
        .filter_map(|&ns_type| {
            init_namespaces
                .get(OsStr::new(ns_type))
                .map(|init_ns| (ns_type, init_ns))
        })
        .map(
            |(ns_type, init_ns)| -> Result<LinuxNamespace, LibcontainerError> {
                let tenant_ns = LinuxNamespaceType::try_from(ns_type)?;
                Ok(LinuxNamespaceBuilder::default()
                    .typ(tenant_ns)
                    .path(init_ns.path.clone())
                    .build()?)
            },
        )
        .collect()
}
/// Returns the systemd flag recorded in the existing container's state.
fn should_use_systemd(&self, container: &Container) -> bool {
container.systemd()
}
/// Picks a fresh, unused notify socket path inside `container_dir`.
///
/// Only the path is computed here; nothing is created on disk.
fn setup_notify_listener(container_dir: &Path) -> Result<PathBuf, LibcontainerError> {
    Ok(container_dir.join(Self::generate_name(container_dir, TENANT_NOTIFY)))
}
/// Sets up the console socket for the tenant, when one was requested.
///
/// A fresh tty name is always generated; it is only used when a console
/// socket path is configured. Returns `Ok(None)` without a console socket.
fn setup_tty_socket(&self, container_dir: &Path) -> Result<Option<OwnedFd>, LibcontainerError> {
    let tty_name = Self::generate_name(container_dir, TENANT_TTY);
    let console_fd = self
        .console_socket
        .as_ref()
        .map(|socket_path| tty::setup_console_socket(container_dir, socket_path, &tty_name))
        .transpose()?;
    Ok(console_fd)
}
/// Generates a name of the form `<prefix><hex>` that does not yet exist in `dir`.
///
/// Two random bytes are drawn per attempt and rendered as lowercase hex;
/// attempts repeat until the resulting path does not exist.
///
/// # Panics
///
/// Panics when `getrandom` fails with an error other than `EINTR`.
fn generate_name(dir: &Path, prefix: &str) -> String {
    loop {
        let mut rand_buf = [0u8; 2];
        let mut filled = 0usize;
        // getrandom may be interrupted or return fewer bytes than requested,
        // so keep asking until the whole buffer is filled.
        while filled < rand_buf.len() {
            let tail = &mut rand_buf[filled..];
            // SAFETY: `tail` is a live, writable slice of `rand_buf`; the
            // pointer and length passed describe exactly that slice.
            let nread = unsafe {
                nix::libc::getrandom(
                    tail.as_mut_ptr() as *mut nix::libc::c_void,
                    tail.len(),
                    nix::libc::GRND_RANDOM,
                )
            };
            if nread < 0 {
                match Errno::last() {
                    Errno::EINTR => continue,
                    errno => panic!("getrandom: {}", errno),
                }
            }
            // `nread` is non-negative here and at most `tail.len()`, so the
            // cast cannot truncate or wrap.
            filled += nread as usize;
        }
        let rand = i16::from_be_bytes(rand_buf);
        let name = format!("{prefix}{rand:x}");
        if !XPath::new(&dir.join(&name)).exists(true) {
            return name;
        }
    }
}
}
/// Builder for an init container, i.e. the first process of a new container.
struct SydInitContainerBuilder {
/// Underlying libcontainer builder; `build` takes its stdin/stdout/stderr.
base: ContainerBuilder,
/// Bundle directory that holds `config.json`.
bundle: PathBuf,
/// Whether the systemd cgroup driver was requested.
use_systemd: bool,
/// Passed through to the container arguments as `detached`.
detached: bool,
/// Passed through to the container arguments as `no_pivot`.
no_pivot: bool,
/// Passed through to the container arguments as `as_sibling`.
as_sibling: bool,
/// Optional console socket path used for the container's tty.
console_socket: Option<PathBuf>,
/// Syscall implementation selector handed to libcontainer.
syscall: SyscallType,
/// Identifier of the container to create.
container_id: String,
/// Optional file the init process id is written to.
pid_file: Option<PathBuf>,
/// Number of file descriptors to preserve (one more when a log file is configured).
preserve_fds: i32,
/// Executor used to run the workload.
executor: Box<dyn Executor>,
/// Root directory under which container state directories live.
root_path: PathBuf,
}
impl TryFrom<(GlobalOpts, Create)> for SydInitContainerBuilder {
type Error = SydError;
fn try_from(options: (GlobalOpts, Create)) -> SydResult<Self> {
let (opt, args) = options;
let syscall = SyscallType::default();
let container_id = args.container_id.clone();
let pid_file = if let Some(ref p) = args.pid_file {
Some(p.canonicalize_safely()?)
} else {
None
};
let executor = Box::new(SydExecutor {});
let mut preserve_fds = args.preserve_fds;
if opt.log.is_some() {
preserve_fds += 1; }
#[expect(clippy::disallowed_methods)]
let builder = ContainerBuilder::new(container_id.clone(), syscall)
.with_executor(SydExecutor {})
.with_pid_file(pid_file.clone())?
.with_console_socket(args.console_socket.as_ref())
.with_root_path(opt.root.clone().unwrap())?
.with_preserved_fds(preserve_fds)
.validate_id()?;
#[expect(clippy::disallowed_methods)]
Ok(Self {
base: builder,
bundle: args.bundle,
use_systemd: opt.systemd_cgroup,
detached: true,
no_pivot: false,
as_sibling: false,
container_id,
executor,
pid_file,
syscall,
console_socket: args.console_socket,
preserve_fds,
root_path: opt.root.unwrap(),
})
}
}
impl TryFrom<(GlobalOpts, Run)> for SydInitContainerBuilder {
type Error = SydError;
fn try_from(options: (GlobalOpts, Run)) -> SydResult<Self> {
let (opt, args) = options;
let syscall = SyscallType::default();
let container_id = args.container_id.clone();
let pid_file = if let Some(ref p) = args.pid_file {
Some(p.canonicalize_safely()?)
} else {
None
};
let executor = Box::new(SydExecutor {});
let mut preserve_fds = args.preserve_fds;
if opt.log.is_some() {
preserve_fds += 1; }
#[expect(clippy::disallowed_methods)]
let builder = ContainerBuilder::new(container_id.clone(), syscall)
.with_executor(SydExecutor {})
.with_pid_file(pid_file.clone())?
.with_console_socket(args.console_socket.as_ref())
.with_root_path(opt.root.clone().unwrap())?
.with_preserved_fds(preserve_fds)
.validate_id()?;
#[expect(clippy::disallowed_methods)]
Ok(Self {
base: builder,
bundle: args.bundle,
use_systemd: opt.systemd_cgroup,
detached: true,
no_pivot: false,
as_sibling: false,
container_id,
executor,
pid_file,
syscall,
console_socket: args.console_socket,
preserve_fds,
root_path: opt.root.unwrap(),
})
}
}
impl SydInitContainerBuilder {
pub fn with_systemd(mut self, should_use: bool) -> Self {
self.use_systemd = should_use;
self
}
pub fn with_detach(mut self, detached: bool) -> Self {
self.detached = detached;
self
}
/// Creates the init container and returns its refreshed state.
///
/// Loads the spec from the bundle, creates the container directory and
/// the initial state, sets up the optional console socket, saves the
/// runtime configuration into the container directory and finally creates
/// the container process.
pub fn build(self) -> Result<Container, LibcontainerError> {
    let spec = self.load_spec()?;
    let container_dir = self.create_container_dir()?;
    let mut container = self.create_container_state(&container_dir)?;
    container
        .set_systemd(self.use_systemd)
        .set_annotations(spec.annotations().clone());
    let notify_path = container_dir.join(NOTIFY_FILE);
    let root = spec.root().as_ref().ok_or(MissingSpecError::Root)?;
    let rootfs = fs::canonicalize(root.path()).map_err(LibcontainerError::OtherIO)?;
    let console_fd = self
        .console_socket
        .as_ref()
        .map(|socket_path| tty::setup_console_socket(&container_dir, socket_path, "tty"))
        .transpose()?;
    syd::t!("parsing user namespace config");
    let user_ns_config = UserNamespaceConfig::new(&spec)?;
    syd::t!("parsing youki config");
    let mut config = YoukiConfig::from_spec(&spec, container.id())?;
    let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
    config.cgroup_path = get_cgroup_path(linux.cgroups_path(), &self.container_id);
    config.save(&container_dir).inspect_err(|err| {
        syd::t!("failed to save config: {err}");
    })?;
    let mut builder_impl = SydContainerBuilderImpl {
        container_type: ContainerType::SydInitContainer,
        container: Some(container.clone()),
        spec: Rc::new(spec),
        console_socket: console_fd,
        rootfs,
        user_ns_config,
        notify_path,
        syscall: self.syscall,
        container_id: self.container_id,
        pid_file: self.pid_file,
        use_systemd: self.use_systemd,
        preserve_fds: self.preserve_fds,
        detached: self.detached,
        executor: self.executor,
        no_pivot: self.no_pivot,
        as_sibling: self.as_sibling,
        stdin: self.base.stdin,
        stdout: self.base.stdout,
        stderr: self.base.stderr,
    };
    builder_impl.create()?;
    // Pick up the state written while the container was being created.
    container.refresh_state()?;
    Ok(container)
}
/// Creates the state directory for the new container and returns its path.
///
/// # Errors
///
/// Returns `LibcontainerError::Exist` when the path already exists, or
/// `OtherIO` when the directory cannot be created.
fn create_container_dir(&self) -> Result<PathBuf, LibcontainerError> {
    let container_dir = self.root_path.join(&self.container_id);
    syd::t!("container directory will be {container_dir:?}");
    if XPath::new(&container_dir).exists(false) {
        syd::t!("container already exists");
        return Err(LibcontainerError::Exist);
    }
    match std::fs::create_dir_all(&container_dir) {
        Ok(()) => Ok(container_dir),
        Err(err) => {
            syd::t!("failed to create container directory: {err}");
            Err(LibcontainerError::OtherIO(err))
        }
    }
}
/// Loads and validates the OCI spec from the bundle and installs the Syd
/// configuration file into the container's root filesystem.
///
/// After loading `config.json`, validating it and canonicalizing the
/// rootfs, a Syd configuration directory is selected:
///
/// * none when the `ENV_OCI_NO_CONFIG` environment variable is set,
/// * `/etc/syd/oci` when rootless mode is not required,
/// * `$XDG_CONFIG_HOME/syd/oci`, else `$HOME/.syd/oci`, in rootless mode.
///
/// Inside that directory, candidates are tried in this order and the
/// first one that opens is used: `<domainname>.syd-3`,
/// `<hostname>.<domainname>.syd-3`, `<hostname>.syd-3` (each only when
/// the spec provides the respective names) and finally `default.syd-3`.
/// The chosen file is copied to `<rootfs>/.oci.syd-3`, which must not yet
/// exist, and that copy is made read-only (mode 0444).
///
/// # Errors
///
/// Returns an error when the spec cannot be loaded or validated, when the
/// spec has no `root` section (`MissingSpecError::Root`), when a hostname
/// or domainname fails the name check, when a candidate fails to open for
/// a reason other than `ENOENT`, or when copying the configuration fails.
fn load_spec(&self) -> Result<Spec, LibcontainerError> {
    let source_spec_path = self.bundle.join("config.json");
    let mut spec = syd_spec_load(source_spec_path).inspect_err(|err| {
        syd::t!("failed to load OCI spec: {err}");
    })?;
    Self::validate_spec(&spec).inspect_err(|err| {
        syd::t!("failed to validate OCI spec: {err}");
    })?;
    spec.canonicalize_rootfs(&self.bundle).inspect_err(|err| {
        syd::t!("failed to canonicalize rootfs: {err}");
    })?;
    let dot_oci = if let Some(root) = spec.root() {
        XPathBuf::from(root.path().clone()).join(b".oci.syd-3")
    } else {
        // A missing root section is reported as such, not as a version problem.
        return Err(MissingSpecError::Root.into());
    };
    let is_rootless = rootless_required(&*create_syscall())
        .map_err(LibcontainerError::OtherIO)
        .inspect_err(|err| {
            syd::t!("failed to determine rootless required: {err}");
        })?;
    let syd_dir = if env::var_os(syd::config::ENV_OCI_NO_CONFIG).is_some() {
        None
    } else if !is_rootless {
        Some(XPathBuf::from("/etc/syd/oci"))
    } else if let Some(path) = env::var_os("XDG_CONFIG_HOME") {
        Some(XPathBuf::from(path).join(b"syd").join(b"oci"))
    } else if let Ok(path) = env::var("HOME") {
        Some(XPathBuf::from(path).join(b".syd").join(b"oci"))
    } else {
        None
    };
    if let Some(syd_dir) = syd_dir {
        // Candidate configuration files, in lookup order.
        let mut sources = vec![];
        match (spec.hostname(), spec.domainname()) {
            (Some(hostname), Some(domainname)) => {
                let hname = XPathBuf::from(hostname.clone());
                let dname = XPathBuf::from(domainname.clone());
                hname.check_name().map_err(err2io).inspect_err(|err| {
                    syd::t!("detected unsafe hostname in OCI spec: {err}");
                })?;
                dname.check_name().map_err(err2io).inspect_err(|err| {
                    syd::t!("detected unsafe domainname in OCI spec: {err}");
                })?;
                let mut name = dname.clone();
                name.append_bytes(b".syd-3");
                sources.push(syd_dir.join(name.as_bytes()));
                let mut name = hname.clone();
                name.append_byte(b'.');
                name.append_bytes(dname.as_bytes());
                name.append_bytes(b".syd-3");
                sources.push(syd_dir.join(name.as_bytes()));
                let mut name = hname.clone();
                name.append_bytes(b".syd-3");
                sources.push(syd_dir.join(name.as_bytes()));
            }
            (None, Some(domainname)) => {
                let dname = XPathBuf::from(domainname.clone());
                dname.check_name().map_err(err2io).inspect_err(|err| {
                    syd::t!("detected unsafe domainname in OCI spec: {err}");
                })?;
                let mut name = dname.clone();
                name.append_bytes(b".syd-3");
                sources.push(syd_dir.join(name.as_bytes()));
            }
            (Some(hostname), None) => {
                let hname = XPathBuf::from(hostname.clone());
                hname.check_name().map_err(err2io).inspect_err(|err| {
                    syd::t!("detected unsafe hostname in OCI spec: {err}");
                })?;
                let mut name = hname.clone();
                name.append_bytes(b".syd-3");
                sources.push(syd_dir.join(name.as_bytes()));
            }
            _ => {}
        };
        sources.push(syd_dir.join(b"default.syd-3"));
        // Use the first candidate that opens; missing files are skipped,
        // any other error aborts.
        let mut config_fd = None;
        for path in sources {
            match safe_open_file2(AT_BADFD, &path) {
                Ok((fd, _)) => {
                    config_fd = Some(fd);
                    break;
                }
                Err(Errno::ENOENT) => {}
                Err(errno) => {
                    syd::t!("error opening Syd configuration file `{path}' for read: {errno}");
                    return Err(err2io(errno));
                }
            };
        }
        if let Some(mut config_fd) = config_fd {
            // O_EXCL: the destination must not already exist; symlinks and
            // magic links are not followed during resolution.
            let how = safe_open_how(OFlag::O_WRONLY | OFlag::O_CREAT | OFlag::O_EXCL, ResolveFlag::empty())
                .resolve(ResolveFlag::RESOLVE_NO_MAGICLINKS | ResolveFlag::RESOLVE_NO_SYMLINKS);
            #[expect(clippy::disallowed_methods)]
            let mut oci_fd = retry_on_eintr(|| openat2(AT_BADFD, &dot_oci, how))
                .map_err(err2io)
                .inspect_err(|err| {
                    syd::t!(
                        "error opening Syd configuration file `{dot_oci}' for write: {err}"
                    );
                })?;
            syd::io::copy(&mut config_fd, &mut oci_fd)
                .map_err(err2io)
                .inspect_err(|err| {
                    syd::t!("error writing Syd configuration file `{dot_oci}': {err}");
                })?;
            fchmod(oci_fd, Mode::from_bits_retain(0o444))
                .map_err(err2io)
                .inspect_err(|err| {
                    syd::t!("error changing mode of Syd configuration file `{dot_oci}': {err}");
                })?;
        }
    }
    Ok(spec)
}
/// Validates the parts of `spec` that matter for an init container.
///
/// Checks the spec version (must start with "1."), that AppArmor is
/// enabled when the process names an AppArmor profile, the process I/O
/// priority (0 through 7 and a serializable class), and finally the user
/// namespace configuration.
fn validate_spec(spec: &Spec) -> Result<(), LibcontainerError> {
    if !spec.version().starts_with("1.") {
        syd::t!(
            "runtime spec has incompatible version '{}'. Only 1.X.Y is supported",
            spec.version()
        );
        return Err(ErrInvalidSpec::UnsupportedVersion.into());
    }
    if let Some(process) = spec.process() {
        if process.apparmor_profile().is_some() {
            let apparmor_is_enabled = apparmor::is_enabled().map_err(|err| {
                syd::t!("failed to check if apparmor is enabled");
                LibcontainerError::OtherIO(err)
            })?;
            if !apparmor_is_enabled {
                syd::t!("apparmor profile exists in the spec, but apparmor is not activated on this system");
                return Err(ErrInvalidSpec::AppArmorNotEnabled.into());
            }
        }
        if let Some(io_priority) = process.io_priority() {
            let priority = io_priority.priority();
            match serde_json::to_string(&io_priority.class()) {
                Ok(_iop_class) if !(0..=7).contains(&priority) => {
                    syd::t!("io priority '{}' not between 0 and 7 (inclusive), class '{}' not in (IO_PRIO_CLASS_RT,IO_PRIO_CLASS_BE,IO_PRIO_CLASS_IDLE)",
                        priority, _iop_class);
                    return Err(ErrInvalidSpec::IoPriority.into());
                }
                Ok(_) => {}
                Err(_e) => {
                    syd::t!("failed to parse io priority class: {_e}");
                    return Err(ErrInvalidSpec::IoPriority.into());
                }
            }
        }
    }
    utils::validate_spec_for_new_user_ns(spec, &*create_syscall())?;
    Ok(())
}
/// Creates the initial container state (status `Creating`, no pid yet)
/// for `container_dir` and saves it.
fn create_container_state(&self, container_dir: &Path) -> Result<Container, LibcontainerError> {
    let initial_status = ContainerStatus::Creating;
    let state = Container::new(
        &self.container_id,
        initial_status,
        None,
        &self.bundle,
        container_dir,
    )?;
    state.save()?;
    Ok(state)
}
}
/// Kind of container process being created.
#[derive(Debug, Copy, Clone)]
enum ContainerType {
/// The first (init) process of a new container.
SydInitContainer,
/// An additional process joining an existing container; `exec_notify_fd`
/// is the write end of the pipe created by the tenant builder.
SydTenantContainer { exec_notify_fd: RawFd },
}
/// Fully resolved parameters shared by the init and tenant builders; this
/// is what actually creates the container process.
struct SydContainerBuilderImpl {
/// Whether an init or a tenant container is being created.
pub container_type: ContainerType,
/// Syscall implementation selector handed to libcontainer.
pub syscall: SyscallType,
/// Whether the systemd cgroup driver was requested.
pub use_systemd: bool,
/// Identifier of the container.
pub container_id: String,
/// The OCI spec the process is created from.
pub spec: Rc<Spec>,
/// Canonicalized root filesystem path.
pub rootfs: PathBuf,
/// Optional file the process id is written to.
pub pid_file: Option<PathBuf>,
/// Console socket descriptor, when a console socket was configured.
pub console_socket: Option<OwnedFd>,
/// User namespace configuration derived from the spec, if any.
pub user_ns_config: Option<UserNamespaceConfig>,
/// Path of the notify socket the listener is bound to.
pub notify_path: PathBuf,
/// Container state to update after creation (`None` for tenants).
pub container: Option<Container>,
/// Number of file descriptors to preserve.
pub preserve_fds: i32,
/// Passed through to the container arguments as `detached`.
pub detached: bool,
/// Executor used to run the workload.
pub executor: Box<dyn Executor>,
/// Passed through to the container arguments as `no_pivot`.
pub no_pivot: bool,
/// Optional descriptor passed on as the process' stdin.
pub stdin: Option<OwnedFd>,
/// Optional descriptor passed on as the process' stdout.
pub stdout: Option<OwnedFd>,
/// Optional descriptor passed on as the process' stderr.
pub stderr: Option<OwnedFd>,
/// Passed through to the container arguments as `as_sibling`.
pub as_sibling: bool,
}
impl SydContainerBuilderImpl {
/// Runs the container and returns the pid of the created process.
///
/// When creation fails for an init container, the container is cleaned
/// up before the original error is returned; a cleanup failure is
/// returned instead of the original error. Tenant containers are never
/// cleaned up here.
fn create(&mut self) -> Result<Pid, LibcontainerError> {
    let outer = match self.run_container() {
        Ok(pid) => return Ok(pid),
        Err(err) => err,
    };
    if let ContainerType::SydInitContainer = self.container_type {
        self.cleanup_container()?;
    }
    Err(outer)
}
/// Creates the container process and records the result.
///
/// Steps, in order: derive the cgroup configuration, run the
/// `create_runtime` hooks (init containers only), bind the notify
/// listener, apply the process' OOM score adjustment to the current
/// process, start the container main process, write the pid file and
/// update the saved container state.
///
/// Returns the pid of the created process.
fn run_container(&mut self) -> Result<Pid, LibcontainerError> {
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
let cgroups_path = get_cgroup_path(linux.cgroups_path(), &self.container_id);
let cgroup_config = libcgroups::common::CgroupConfig {
cgroup_path: cgroups_path,
// The systemd flag is also set whenever a user namespace config is present.
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
container_name: self.container_id.to_owned(),
};
let process = self
.spec
.process()
.as_ref()
.ok_or(MissingSpecError::Process)?;
// The create_runtime hooks run only for init containers.
if matches!(self.container_type, ContainerType::SydInitContainer) {
if let Some(hooks) = self.spec.hooks() {
hooks::run_hooks(
hooks.create_runtime().as_ref(),
self.container.as_ref().map(|c| &c.state),
None,
None,
)?
}
}
let notify_listener = NotifyListener::new(&self.notify_path)?;
// Write the requested OOM score adjustment for the current process.
#[expect(clippy::disallowed_methods)]
if let Some(oom_score_adj) = process.oom_score_adj() {
syd::t!("Set OOM score to {oom_score_adj}");
let mut f = fs::File::create("/proc/self/oom_score_adj").map_err(|err| {
syd::t!("failed to open /proc/self/oom_score_adj: {err}");
LibcontainerError::OtherIO(err)
})?;
f.write_all(oom_score_adj.to_string().as_bytes())
.map_err(|err| {
syd::t!("failed to write to /proc/self/oom_score_adj: {err}");
LibcontainerError::OtherIO(err)
})?;
}
// Only raw descriptors are handed over; the owned descriptors stay in `self`.
let container_args = ContainerArgs {
container_type: match self.container_type {
ContainerType::SydInitContainer => process::args::ContainerType::InitContainer,
ContainerType::SydTenantContainer { exec_notify_fd } => {
process::args::ContainerType::TenantContainer { exec_notify_fd }
}
},
syscall: self.syscall,
spec: Rc::clone(&self.spec),
rootfs: self.rootfs.to_owned(),
console_socket: self.console_socket.as_ref().map(|c| c.as_raw_fd()),
notify_listener,
preserve_fds: self.preserve_fds,
container: self.container.to_owned(),
user_ns_config: self.user_ns_config.to_owned(),
cgroup_config,
detached: self.detached,
executor: self.executor.clone(),
no_pivot: self.no_pivot,
stdin: self.stdin.as_ref().map(|x| x.as_raw_fd()),
stdout: self.stdout.as_ref().map(|x| x.as_raw_fd()),
stderr: self.stderr.as_ref().map(|x| x.as_raw_fd()),
as_sibling: self.as_sibling,
pid_file: self.pid_file.clone(),
};
let (init_pid, need_to_clean_up_intel_rdt_dir) =
process::container_main_process::container_main_process(&container_args).map_err(
|err| {
syd::t!("failed to run container process: {err}");
LibcontainerError::MainProcess(err)
},
)?;
// Record the pid in the pid file, when one was requested.
if let Some(pid_file) = &self.pid_file {
fs::write(pid_file, format!("{init_pid}")).map_err(|err| {
syd::t!("failed to write pid to file: {err}");
LibcontainerError::OtherIO(err)
})?;
}
// Update and save the container state, when there is one.
if let Some(container) = &mut self.container {
container
.set_status(ContainerStatus::Created)
.set_creator(nix::unistd::geteuid().as_raw())
.set_pid(init_pid.as_raw())
.set_clean_up_intel_rdt_directory(need_to_clean_up_intel_rdt_dir)
.save()?;
}
Ok(Pid::from_raw(init_pid.as_raw()))
}
/// Removes what a failed container creation may have left behind: the
/// cgroup, the Intel RDT resctrl subdirectory (when flagged in the container
/// state) and the container root directory.
///
/// Every step is attempted even if an earlier one fails; the failure
/// messages are collected and reported together.
///
/// # Errors
///
/// Fails when the spec has no `linux` section, when the cgroup manager
/// cannot be created, or with `LibcontainerError::Other` listing every
/// cleanup step that failed.
fn cleanup_container(&self) -> Result<(), LibcontainerError> {
    let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
    let config = libcgroups::common::CgroupConfig {
        cgroup_path: get_cgroup_path(linux.cgroups_path(), &self.container_id),
        systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
        container_name: self.container_id.to_string(),
    };
    let cmanager = libcgroups::common::create_cgroup_manager(config)?;

    let mut errors: Vec<String> = Vec::new();

    if let Err(e) = cmanager.remove() {
        syd::t!("failed to remove cgroup manager: {e}");
        errors.push(e.to_string());
    }

    if let Some(container) = &self.container {
        if matches!(container.clean_up_intel_rdt_subdirectory(), Some(true)) {
            if let Err(e) = delete_resctrl_subdirectory(container.id()) {
                syd::t!("failed to delete resctrl subdirectory: {e}");
                errors.push(e.to_string());
            }
        }

        let root_exists = XPath::new(&container.root).exists(true);
        if root_exists {
            if let Err(e) = fs::remove_dir_all(&container.root) {
                syd::t!("failed to delete container root: {e}");
                errors.push(e.to_string());
            }
        }
    }

    if errors.is_empty() {
        return Ok(());
    }
    Err(LibcontainerError::Other(format!(
        "failed to cleanup container: {}",
        errors.join(";")
    )))
}
}
/// Expands at compile time to the version string
/// `"version <CARGO_PKG_VERSION>\ncommit: <SYD_GIT_COMMIT>"`.
///
/// Both values are read with `env!`, so `SYD_GIT_COMMIT` has to be set in
/// the build environment or compilation fails.
#[macro_export]
macro_rules! syd_oci_version {
() => {
concat!(
"version ",
env!("CARGO_PKG_VERSION"),
"\ncommit: ",
env!("SYD_GIT_COMMIT"),
)
};
}
// Top-level subcommand set: the `liboci_cli` standard commands and common
// commands, both flattened into a single command namespace.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Parser, Debug)]
enum SubCommand {
// Commands from `liboci_cli::StandardCmd` (create, start, state, kill, delete).
#[clap(flatten)]
Standard(Box<StandardCmd>),
// Commands from `liboci_cli::CommonCmd` (features, ps, list, spec, ...).
#[clap(flatten)]
Common(Box<CommonCmd>),
}
// Command line of the `syd-oci` binary: the global options from `liboci_cli`
// followed by one subcommand. The reported version comes from
// `syd_oci_version!()`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text.
#[derive(Parser, Debug)]
#[clap(
name = "syd-oci",
version = syd_oci_version!(),
about = "Syd's OCI container runtime",
author = "Ali Polatel <alip@chesswob.org>",
)]
struct Opts {
// Options shared by every subcommand (root directory, debug flag, ...).
#[clap(flatten)]
global: GlobalOpts,
// The subcommand to execute.
#[clap(subcommand)]
subcmd: SubCommand,
}
// Program entry point, wrapped by the `syd::main!` macro.
//
// Order of operations: process hardening / environment setup, command line
// parsing, logging and tracing initialisation, creation of the runtime root
// directory, then dispatch to the `cmd_*` handler of the chosen subcommand.
syd::main! {
// Unless the quick-boot environment variable is set, go through
// `syd::seal::ensure_sealed()`.
// NOTE(review): presumably this re-executes from a sealed copy of the
// binary; confirm against `syd::seal`.
if env::var_os(syd::config::ENV_QUICK_BOOT).is_none() {
syd::seal::ensure_sealed()?;
} else {
// Mirror RUST_BACKTRACE into SYD_RUST_BACKTRACE (or clear it when unset).
match env::var_os("RUST_BACKTRACE") {
Some(val) => env::set_var("SYD_RUST_BACKTRACE", val),
None => env::remove_var("SYD_RUST_BACKTRACE"),
};
// Force backtraces off unless ENV_SKIP_SCMP is present in the environment.
if secure_getenv(ENV_SKIP_SCMP).is_none() {
env::set_var("RUST_BACKTRACE", "0");
}
}
// Always drop the seccomp-dump variable from the environment.
env::remove_var(syd::config::ENV_DUMP_SCMP);
let mut opts = Opts::parse();
// `--debug` raises both the Syd log level and the tracing level.
let (level, trace_level) = if opts.global.debug {
(LogLevel::Debug, tracing::Level::DEBUG)
} else {
(LogLevel::Info, tracing::Level::INFO)
};
log_init(level, None)?;
let log_level_filter = tracing_subscriber::filter::LevelFilter::from(trace_level);
// The fmt layer writes into a sink, so it produces no output of its own;
// `SydLayer` is registered alongside it.
let format_layer = tracing_subscriber::fmt::layer()
.with_writer(std::io::sink) .with_span_events(tracing_subscriber::fmt::format::FmtSpan::NONE);
let subscriber = tracing_subscriber::registry()
.with(format_layer)
.with(log_level_filter)
.with(SydLayer);
tracing::subscriber::set_global_default(subscriber)?;
// After this call `opts.global.root` is always `Some`, which the `cmd_*`
// handlers rely on when they unwrap it.
make_root(&mut opts.global)?;
match opts.subcmd {
SubCommand::Standard(cmd) => match *cmd {
StandardCmd::Create(subopts) => cmd_create(opts.global, subopts),
StandardCmd::Start(subopts) => cmd_start(opts.global, subopts),
StandardCmd::State(subopts) => cmd_state(opts.global, subopts),
StandardCmd::Kill(subopts) => cmd_kill(opts.global, subopts),
StandardCmd::Delete(subopts) => cmd_delete(opts.global, subopts),
},
SubCommand::Common(cmd) => match *cmd {
CommonCmd::Features(subopts) => cmd_features(opts.global, subopts),
CommonCmd::Ps(subopts) => cmd_ps(opts.global, subopts),
CommonCmd::List(subopts) => cmd_list(opts.global, subopts),
CommonCmd::Spec(subopts) => cmd_spec(opts.global, subopts),
CommonCmd::Pause(subopts) => cmd_pause(opts.global, subopts),
CommonCmd::Resume(subopts) => cmd_resume(opts.global, subopts),
CommonCmd::Events(subopts) => cmd_events(opts.global, subopts),
CommonCmd::Update(subopts) => cmd_update(opts.global, subopts),
// `Checkpointt` (double "t") is the variant name as spelled in `liboci_cli`.
CommonCmd::Checkpointt(subopts) => cmd_checkpoint(opts.global, subopts),
CommonCmd::Exec(subopts) => cmd_exec(opts.global, subopts),
CommonCmd::Run(subopts) => cmd_run(opts.global, subopts),
},
}
}
/// Handles `create`: builds an init container in detached mode.
///
/// The systemd cgroup flag is copied out of the global options before they
/// are consumed by the builder conversion.
fn cmd_create(opt: GlobalOpts, args: Create) -> SydResult<ExitCode> {
    let use_systemd = opt.systemd_cgroup;
    let outcome = SydInitContainerBuilder::try_from((opt, args))?
        .with_systemd(use_systemd)
        .with_detach(true)
        .build();
    match outcome {
        Ok(_) => Ok(ExitCode::SUCCESS),
        Err(err) => Err(err.into()),
    }
}
/// Handles `start`: loads the container state and starts the container.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, or
/// the error from loading or starting the container.
fn cmd_start(opt: GlobalOpts, args: Start) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if XPath::new(&state_dir).exists(true) {
        let mut container = Container::load(state_dir)?;
        container.start()?;
        Ok(ExitCode::SUCCESS)
    } else {
        Err(Errno::ENOENT.into())
    }
}
/// Handles `state`: prints the container state as pretty-printed JSON.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, or
/// the error from loading or serializing the state.
fn cmd_state(opt: GlobalOpts, args: State) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if !XPath::new(&state_dir).exists(true) {
        return Err(Errno::ENOENT.into());
    }
    let container = Container::load(state_dir)?;
    let json = serde_json::to_string_pretty(&container.state)?;
    println!("{json}");
    Ok(ExitCode::SUCCESS)
}
fn cmd_kill(opt: GlobalOpts, args: Kill) -> SydResult<ExitCode> {
#[expect(clippy::disallowed_methods)]
let container_root = opt.root.unwrap().join(args.container_id.clone());
if !XPath::new(&container_root).exists(true) {
return Err(Errno::ENOENT.into());
};
let mut container = Container::load(container_root)?;
let signal: Signal = args.signal.as_str().try_into()?;
container.kill(signal, args.all)?;
Ok(ExitCode::SUCCESS)
}
/// Handles `delete`: removes the container and its state.
///
/// With `--force`, deleting a container whose state directory does not exist
/// is treated as success.
///
/// # Errors
///
/// Returns the error from loading or deleting the container.
fn cmd_delete(opt: GlobalOpts, args: Delete) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let container_root = opt.root.unwrap().join(args.container_id.clone());
    // Use the same existence check (`exists(true)`) as every other command
    // handler in this file, so that all of them agree on whether a container
    // exists.
    // NOTE(review): the flag presumably selects symlink following; confirm
    // against `XPath::exists`.
    if !XPath::new(&container_root).exists(true) && args.force {
        return Ok(ExitCode::SUCCESS);
    }
    let mut container = Container::load(container_root)?;
    container.delete(args.force)?;
    Ok(ExitCode::SUCCESS)
}
/// Handles `features`: prints the runtime's feature description as
/// pretty-printed JSON on stdout.
///
/// A failure to query namespaces or capabilities is reported on stderr and
/// the corresponding list is left empty.
///
/// # Panics
///
/// Panics if one of the feature builders fails to build.
#[expect(clippy::disallowed_methods)]
fn cmd_features(_opt: GlobalOpts, _args: Features) -> SydResult<ExitCode> {
    let namespaces = query_supported_namespaces().unwrap_or_else(|e| {
        eprintln!("Error querying supported namespaces: {e}");
        Vec::new()
    });
    let capabilities = query_caps().unwrap_or_else(|e| {
        eprintln!("Error querying available capabilities: {e}");
        Vec::new()
    });

    // Individual feature sections, built in the order they are attached below.
    let cgroup = CgroupBuilder::default()
        .v1(true)
        .v2(true)
        .systemd(true)
        .systemd_user(true)
        .rdma(false)
        .build()
        .unwrap();
    let apparmor = ApparmorBuilder::default().enabled(true).build().unwrap();
    let idmap = IDMapBuilder::default().enabled(false).build().unwrap();
    let mount_extensions = MountExtensionsBuilder::default()
        .idmap(idmap)
        .build()
        .unwrap();
    let selinux = SelinuxBuilder::default().enabled(false).build().unwrap();
    let intel_rdt = IntelRdtBuilder::default().enabled(true).build().unwrap();

    let linux = LinuxFeatureBuilder::default()
        .namespaces(namespaces)
        .capabilities(capabilities)
        .cgroup(cgroup)
        .apparmor(apparmor)
        .mount_extensions(mount_extensions)
        .selinux(selinux)
        .intel_rdt(intel_rdt)
        .build()
        .unwrap();

    let features = FeaturesBuilder::default()
        .oci_version_max(VERSION)
        .oci_version_min(String::from("1.0.0"))
        .hooks(known_hooks())
        .mount_options(MountOption::known_options())
        .linux(linux)
        .build()
        .unwrap();

    println!("{}", serde_json::to_string_pretty(&features)?);
    Ok(ExitCode::SUCCESS)
}
/// Handles `ps`: lists the processes that belong to the container's cgroup.
///
/// With `--format json` the PIDs are printed as a JSON array. With
/// `--format table` the output of `ps` (default options: `-ef`) is filtered
/// down to the rows whose PID column matches one of the container's PIDs.
/// Any other format prints nothing.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist or
/// the `ps` header has no `PID` column, and propagates errors from loading
/// the container, querying the cgroup, running `ps`, or parsing its output.
fn cmd_ps(opt: GlobalOpts, args: Ps) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let container_root = opt.root.unwrap().join(args.container_id.clone());
    if !XPath::new(&container_root).exists(true) {
        return Err(Errno::ENOENT.into());
    };
    let container = Container::load(container_root)?;
    let cmanager = libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig {
        cgroup_path: container.spec()?.cgroup_path,
        systemd_cgroup: container.systemd(),
        container_name: container.id().to_string(),
    })?;
    let pids: Vec<i32> = cmanager
        .get_all_pids()?
        .iter()
        .map(|pid| pid.as_raw())
        .collect();
    if args.format == "json" {
        println!("{}", serde_json::to_string(&pids)?);
    } else if args.format == "table" {
        let default_ps_options = vec![String::from("-ef")];
        let ps_options = if args.ps_options.is_empty() {
            &default_ps_options
        } else {
            &args.ps_options
        };
        let output = std::process::Command::new("ps").args(ps_options).output()?;
        if !output.status.success() {
            println!("{}", std::str::from_utf8(&output.stderr)?);
        } else {
            let lines = std::str::from_utf8(&output.stdout)?;
            // `split` always yields at least one item, so `lines[0]` is safe.
            let lines: Vec<&str> = lines.split('\n').collect();
            let pid_index = get_pid_index(lines[0])?;
            println!("{}", &lines[0]);
            for line in &lines[1..] {
                if line.is_empty() {
                    continue;
                }
                // A row with fewer columns than the header has no PID field
                // and cannot belong to the container; skip it rather than
                // panic on an out-of-bounds index.
                let Some(field) = line.split_whitespace().nth(pid_index) else {
                    continue;
                };
                let pid: i32 = field.parse()?;
                if pids.contains(&pid) {
                    println!("{line}");
                }
            }
        }
    }
    Ok(ExitCode::SUCCESS)
}
/// Handles `list`: prints a tab-aligned table of every container found in
/// the runtime root directory.
///
/// Directory entries without a `state.json` file are skipped. A missing PID
/// or creation time is shown as an empty column.
///
/// # Errors
///
/// Propagates errors from reading the root directory, loading a container,
/// or writing to stdout.
fn cmd_list(opt: GlobalOpts, _args: List) -> SydResult<ExitCode> {
    let mut rows = String::new();
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    for entry in fs::read_dir(opt.root.unwrap())? {
        let container_dir = entry?.path();
        if !XPath::new(&container_dir.join("state.json")).exists(true) {
            continue;
        }
        let container = Container::load(container_dir)?;
        let pid = container
            .pid()
            .map(|pid| pid.to_string())
            .unwrap_or_default();
        let user_name = container.creator().unwrap_or_default();
        let created = container
            .created()
            .map(|utc| utc.to_rfc3339())
            .unwrap_or_default();
        // Formatting into a `String`; the result is deliberately ignored.
        let _ = writeln!(
            rows,
            "{}\t{}\t{}\t{}\t{}\t{}",
            container.id(),
            pid,
            container.status(),
            container.bundle().display(),
            created,
            user_name.to_string_lossy()
        );
    }

    let mut table = TabWriter::new(std::io::stdout());
    writeln!(&mut table, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tCREATOR")?;
    write!(&mut table, "{rows}")?;
    table.flush()?;
    Ok(ExitCode::SUCCESS)
}
/// Handles `spec`: writes a default OCI spec as pretty-printed JSON to
/// `config.json` in the current directory.
///
/// With `--rootless` the spec from `get_rootless_spec` is written instead of
/// `Spec::default()`.
///
/// # Errors
///
/// Propagates errors from building the rootless spec, creating the file,
/// serializing, or flushing.
fn cmd_spec(_opt: GlobalOpts, args: liboci_cli::Spec) -> SydResult<ExitCode> {
    let spec = if !args.rootless {
        Spec::default()
    } else {
        get_rootless_spec()?
    };
    #[expect(clippy::disallowed_methods)]
    let mut writer = BufWriter::new(fs::File::create("config.json")?);
    to_writer_pretty(&mut writer, &spec)?;
    // Flush explicitly so that write errors are reported instead of being
    // lost when the writer is dropped.
    writer.flush()?;
    Ok(ExitCode::SUCCESS)
}
/// Handles `pause`: loads the container state and pauses the container.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, or
/// the error from loading or pausing the container.
fn cmd_pause(opt: GlobalOpts, args: Pause) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if XPath::new(&state_dir).exists(true) {
        let mut container = Container::load(state_dir)?;
        container.pause()?;
        Ok(ExitCode::SUCCESS)
    } else {
        Err(Errno::ENOENT.into())
    }
}
/// Handles `resume`: loads the container state and resumes the container.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, or
/// the error from loading or resuming the container.
fn cmd_resume(opt: GlobalOpts, args: Resume) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if XPath::new(&state_dir).exists(true) {
        let mut container = Container::load(state_dir)?;
        container.resume()?;
        Ok(ExitCode::SUCCESS)
    } else {
        Err(Errno::ENOENT.into())
    }
}
/// Handles `events`: loads the container state and calls
/// `Container::events` with the requested interval and stats flag.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, or
/// the error from loading the container or from `Container::events`.
fn cmd_events(opt: GlobalOpts, args: Events) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if !XPath::new(&state_dir).exists(true) {
        return Err(Errno::ENOENT.into());
    }
    let mut container = Container::load(state_dir)?;
    container.events(args.interval, args.stats)?;
    Ok(ExitCode::SUCCESS)
}
/// Handles `update`: applies new resource limits to the container's cgroup.
///
/// The limits are read as JSON from the `--resources` file (`-` means
/// stdin). Without `--resources`, they are built from the individual flags;
/// only the PID limit is considered here.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, and
/// propagates errors from loading the container, creating the cgroup
/// manager, reading or parsing the resources, or applying them.
fn cmd_update(opt: GlobalOpts, args: Update) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if !XPath::new(&state_dir).exists(true) {
        return Err(Errno::ENOENT.into());
    }
    let container = Container::load(state_dir)?;
    let cgroup_config = libcgroups::common::CgroupConfig {
        cgroup_path: container.spec()?.cgroup_path,
        systemd_cgroup: container.systemd(),
        container_name: container.id().to_string(),
    };
    let cmanager = libcgroups::common::create_cgroup_manager(cgroup_config)?;

    #[expect(clippy::disallowed_methods)]
    let linux_res: LinuxResources = match args.resources {
        // "-" selects standard input as the JSON source.
        Some(resources_path) if resources_path.to_string_lossy() == "-" => {
            serde_json::from_reader(std::io::stdin())?
        }
        Some(resources_path) => {
            let file = fs::File::open(resources_path)?;
            serde_json::from_reader(BufReader::new(file))?
        }
        None => {
            let mut builder = LinuxResourcesBuilder::default();
            if let Some(new_pids_limit) = args.pids_limit {
                let pids = LinuxPidsBuilder::default().limit(new_pids_limit).build()?;
                builder = builder.pids(pids);
            }
            builder.build()?
        }
    };

    let controller_opt = ControllerOpt {
        resources: &linux_res,
        disable_oom_killer: false,
        oom_score_adj: None,
        freezer_state: None,
    };
    cmanager.apply(&controller_opt)?;
    Ok(ExitCode::SUCCESS)
}
/// Handles `checkpoint`: forwards the command line options to
/// `Container::checkpoint`.
///
/// # Errors
///
/// Returns `ENOENT` when the container's state directory does not exist, or
/// the error from loading or checkpointing the container.
fn cmd_checkpoint(opt: GlobalOpts, args: Checkpoint) -> SydResult<ExitCode> {
    // `make_root` has filled in `opt.root` before any handler runs.
    #[expect(clippy::disallowed_methods)]
    let state_dir = opt.root.unwrap().join(&args.container_id);
    if !XPath::new(&state_dir).exists(true) {
        return Err(Errno::ENOENT.into());
    }
    let mut container = Container::load(state_dir)?;
    // One-to-one copy of the CLI flags into libcontainer's option struct.
    let checkpoint_opts = libcontainer::container::CheckpointOptions {
        ext_unix_sk: args.ext_unix_sk,
        file_locks: args.file_locks,
        image_path: args.image_path,
        leave_running: args.leave_running,
        shell_job: args.shell_job,
        tcp_established: args.tcp_established,
        work_path: args.work_path,
    };
    container.checkpoint(&checkpoint_opts)?;
    Ok(ExitCode::SUCCESS)
}
/// Handles `exec`: runs a new process inside an existing container.
///
/// In detached mode the function returns as soon as the process has been
/// created. Otherwise it waits for the process and maps its termination to
/// an exit code: the exit status for a normal exit, `128 + signal` when
/// killed by a signal, and success for any other wait status.
///
/// # Errors
///
/// Propagates errors from building the tenant container and from `waitid`
/// (`EINTR` is retried).
fn cmd_exec(opt: GlobalOpts, args: Exec) -> SydResult<ExitCode> {
    // Copy out everything the builder needs before `args` is moved into it.
    let cwd = args.cwd.clone();
    let env = args.env.clone().into_iter().collect();
    let detach = args.detach;
    let no_new_privs = args.no_new_privs;
    let command = args.command.clone();
    let process = args.process.clone();

    let pid = SydTenantContainerBuilder::new(opt, args)?
        .with_detach(detach)
        .with_cwd(cwd)
        .with_env(env)
        .with_process(process)
        .with_no_new_privs(no_new_privs)
        .with_container_args(command)
        .build()?;
    if detach {
        return Ok(ExitCode::SUCCESS);
    }

    let child = Pid::from_raw(pid.as_raw());
    loop {
        let wait_status = match waitid(Id::Pid(child), WaitPidFlag::WEXITED) {
            Ok(wait_status) => wait_status,
            // Interrupted by a signal: wait again.
            Err(Errno::EINTR) => continue,
            Err(errno) => return Err(errno.into()),
        };
        return Ok(match wait_status {
            WaitStatus::Exited(_, status) => ExitCode::from(status as u8),
            WaitStatus::Signaled(_, sig, _) => ExitCode::from(128 + (sig as u8)),
            _ => ExitCode::SUCCESS,
        });
    }
}
/// Handles `run`: creates and starts a container in one step.
///
/// In detached mode the function returns right after the start. Otherwise it
/// stays in the foreground until the init process terminates, deletes the
/// container, and reports the value from `handle_foreground` as exit code
/// (`ExitCode::FAILURE` if waiting failed).
///
/// # Errors
///
/// Propagates errors from building, starting, or deleting the container.
fn cmd_run(opt: GlobalOpts, args: Run) -> SydResult<ExitCode> {
    let detach = args.detach;
    let use_systemd = opt.systemd_cgroup;
    let mut container = SydInitContainerBuilder::try_from((opt, args))?
        .with_systemd(use_systemd)
        .with_detach(detach)
        .build()?;
    container.start()?;
    if detach {
        return Ok(ExitCode::SUCCESS);
    }

    debug_assert!(
        container.pid().is_some(),
        "expects a container init pid in the container state"
    );
    #[expect(clippy::disallowed_methods)]
    let foreground_result = handle_foreground(Pid::from_raw(container.pid().unwrap().as_raw()));
    // The container is deleted whether or not waiting succeeded.
    container.delete(true)?;
    let exit_code = match foreground_result {
        Ok(code) => ExitCode::from(code as u8),
        Err(_) => ExitCode::FAILURE,
    };
    Ok(exit_code)
}
/// Loads an OCI spec from `config` and adjusts it for running under Syd.
///
/// Two adjustments are made:
///
/// 1. Seccomp (only if the spec has a seccomp section): the names listed in
///    `syd::config::OCI_SYSCALLS` are removed from every rule that is not a
///    plain allow rule, and one allow rule covering all of them is appended.
/// 2. Capabilities (skipped when `rootless_required` reports true):
///    `SysPtrace` is added to the bounding, effective and permitted sets;
///    the inheritable and ambient sets are kept, or set to empty if absent.
///
/// # Errors
///
/// Fails when the spec cannot be loaded, when a builder fails, or when the
/// rootless check fails.
fn syd_spec_load<P: AsRef<Path>>(config: P) -> Result<Spec, LibcontainerError> {
let mut spec = Spec::load(&config)?;
if let Some(linux) = spec.linux() {
if let Some(seccomp) = linux.seccomp() {
// Log the profile as found in the spec.
syd::t!(
"Syd-OCI-Seccomp-Pre: {}",
serde_json::to_string(&seccomp).unwrap_or("?".to_string())
);
let mut syscalls = if let Some(syscalls) = seccomp.syscalls() {
syscalls
.iter()
.cloned()
.map(|mut entry| {
// Rules that are not a plain allow (different action, or an
// errno return value) must not name any syscall from
// OCI_SYSCALLS; strip those names from the rule.
if entry.action() != LinuxSeccompAction::ScmpActAllow
|| entry.errno_ret().is_some()
{
let filtered = entry
.names()
.iter()
.filter(|n| {
// Keep only names NOT found in OCI_SYSCALLS.
// The binary search assumes the list is sorted.
syd::config::OCI_SYSCALLS
.binary_search(&n.as_str())
.is_err()
})
.cloned()
.collect::<Vec<String>>();
entry.set_names(filtered);
}
entry
})
.collect::<Vec<LinuxSyscall>>()
} else {
Vec::new()
};
// Append a rule that allows every syscall in OCI_SYSCALLS.
let sydallowlist = LinuxSyscallBuilder::default()
.action(LinuxSeccompAction::ScmpActAllow)
.names(
syd::config::OCI_SYSCALLS
.iter()
.map(ToString::to_string)
.collect::<Vec<_>>(),
)
.build()?;
syscalls.push(sydallowlist);
// Rebuild the profile with the new rule list, carrying every other
// setting over from the original profile.
let mut builder = LinuxSeccompBuilder::default()
.default_action(seccomp.default_action())
.syscalls(syscalls);
if let Some(default_errno_ret) = seccomp.default_errno_ret() {
builder = builder.default_errno_ret(default_errno_ret)
}
if let Some(flags) = seccomp.flags() {
builder = builder.flags(flags.clone());
}
if let Some(architectures) = seccomp.architectures() {
builder = builder.architectures(architectures.clone());
}
if let Some(listener_path) = seccomp.listener_path() {
builder = builder.listener_path(listener_path);
}
if let Some(listener_metadata) = seccomp.listener_metadata() {
builder = builder.listener_metadata(listener_metadata);
}
let seccomp = builder.build()?;
// Log the rewritten profile.
syd::t!(
"Syd-OCI-Seccomp-Post: {}",
serde_json::to_string(&seccomp).unwrap_or("?".to_string())
);
let mut linux = linux.clone();
linux.set_seccomp(Some(seccomp));
spec.set_linux(Some(linux));
}
}
if let Some(process) = spec.process() {
let syscall = create_syscall();
// When rootless mode is required, the capability sets are left as they
// are; the seccomp changes above are still part of the returned spec.
if rootless_required(&*syscall).map_err(LibcontainerError::OtherIO)? {
return Ok(spec);
}
let mut p = process.clone();
if let Some(capabilities) = process.capabilities() {
let mut caps = LinuxCapabilitiesBuilder::default();
// Bounding, effective and permitted: existing set plus SysPtrace.
if let Some(c) = capabilities.bounding() {
let mut c = c.clone();
c.insert(Capability::SysPtrace);
caps = caps.bounding(c);
} else {
caps = caps.bounding(HashSet::from([Capability::SysPtrace]));
}
if let Some(c) = capabilities.effective() {
let mut c = c.clone();
c.insert(Capability::SysPtrace);
caps = caps.effective(c);
} else {
caps = caps.effective(HashSet::from([Capability::SysPtrace]));
}
if let Some(c) = capabilities.permitted() {
let mut c = c.clone();
c.insert(Capability::SysPtrace);
caps = caps.permitted(c);
} else {
caps = caps.permitted(HashSet::from([Capability::SysPtrace]));
}
// Inheritable and ambient: copied unchanged, or empty when absent.
if let Some(c) = capabilities.inheritable() {
caps = caps.inheritable(c.clone());
} else {
caps = caps.inheritable(HashSet::new());
}
if let Some(c) = capabilities.ambient() {
caps = caps.ambient(c.clone());
} else {
caps = caps.ambient(HashSet::new());
}
let caps = caps.build()?;
p.set_capabilities(Some(caps));
} else {
// No capabilities in the spec: grant SysPtrace only.
let caps = LinuxCapabilitiesBuilder::default()
.bounding(HashSet::from([Capability::SysPtrace]))
.effective(HashSet::from([Capability::SysPtrace]))
.permitted(HashSet::from([Capability::SysPtrace]))
.inheritable(HashSet::new())
.ambient(HashSet::new())
.build()?;
p.set_capabilities(Some(caps));
}
spec.set_process(Some(p));
}
Ok(spec)
}
/// Waits in the foreground until the container's init process terminates.
///
/// All signals are blocked on the calling thread and received synchronously.
/// `SIGCHLD` triggers reaping of terminated children; `SIGURG` and
/// `SIGWINCH` are ignored; every other signal is forwarded to `init_pid`.
///
/// # Returns
///
/// The exit status of the init process, or the number of the signal that
/// killed it.
///
/// # Errors
///
/// Propagates errors from blocking signals, waiting for a signal, and
/// `waitid` (`EINTR` is retried).
fn handle_foreground(init_pid: Pid) -> SydResult<i32> {
    syd::t!("waiting for container init process to exit");
    // Block every signal on this thread so they can be collected with `wait()`.
    let signal_set = SigSet::all();
    signal_set.thread_block()?;
    loop {
        match signal_set.wait()? {
            signal::SIGCHLD => {
                syd::t!("reaping child processes");
                // Reap children until the init process is found or no
                // further child has terminated.
                loop {
                    // waitid(2) requires at least one of WEXITED, WSTOPPED or
                    // WCONTINUED in its options; WNOHANG alone is rejected
                    // with EINVAL. WEXITED selects terminated children.
                    match waitid(Id::All, WaitPidFlag::WEXITED | WaitPidFlag::WNOHANG) {
                        Ok(WaitStatus::Exited(pid, status)) => {
                            if pid.eq(&init_pid) {
                                return Ok(status);
                            }
                            // Some other child exited: keep reaping.
                        }
                        Ok(WaitStatus::Signaled(pid, signal, _)) => {
                            if pid.eq(&init_pid) {
                                return Ok(signal);
                            }
                        }
                        Ok(WaitStatus::StillAlive) => {
                            // Nothing left to reap for now.
                            break;
                        }
                        Ok(_) | Err(Errno::EINTR) => {}
                        Err(errno) => return Err(errno.into()),
                    }
                }
            }
            // These two signals are neither handled nor forwarded.
            signal::SIGURG => {}
            signal::SIGWINCH => {}
            signal => {
                syd::t!("forwarding signal {}", signal as i32);
                // Best effort: a failed forward is only traced.
                let _ = kill(init_pid, Some(signal)).map_err(|_err| {
                    syd::t!("failed to forward signal to container init process: {_err}")
                });
            }
        }
    }
}
/// Builds the default spec used for rootless containers.
///
/// Compared to `Spec::default()`:
/// - the default namespaces are used without the network namespace, and with
///   a user namespace without extra settings;
/// - the effective UID and GID of the caller are mapped to ID 0 inside the
///   container (mapping size 1);
/// - `/sys` becomes a read-only recursive bind mount of the host's `/sys`;
/// - `uid=` and `gid=` options are dropped from all other default mounts.
///
/// # Errors
///
/// Fails when one of the spec builders fails.
fn get_rootless_spec() -> SydResult<Spec> {
    // Default namespaces minus network and user ...
    let mut namespaces: Vec<LinuxNamespace> =
        libcontainer::oci_spec::runtime::get_default_namespaces();
    namespaces.retain(|ns| {
        !matches!(
            ns.typ(),
            LinuxNamespaceType::Network | LinuxNamespaceType::User
        )
    });
    // ... plus a fresh user namespace entry.
    let user_ns = LinuxNamespaceBuilder::default()
        .typ(LinuxNamespaceType::User)
        .build()?;
    namespaces.push(user_ns);

    let uid = Uid::effective().as_raw();
    let gid = Gid::effective().as_raw();
    // Maps a single host ID to ID 0 in the container.
    let map_to_root = |host_id: u32| {
        LinuxIdMappingBuilder::default()
            .host_id(host_id)
            .container_id(0_u32)
            .size(1_u32)
            .build()
    };
    let uid_mappings = vec![map_to_root(uid)?];
    let gid_mappings = vec![map_to_root(gid)?];
    let linux = LinuxBuilder::default()
        .namespaces(namespaces)
        .uid_mappings(uid_mappings)
        .gid_mappings(gid_mappings)
        .build()?;

    let mut mounts: Vec<Mount> = libcontainer::oci_spec::runtime::get_default_mounts();
    for mount in mounts.iter_mut() {
        if mount.destination().eq(Path::new("/sys")) {
            let sys_options: Vec<String> = ["rbind", "nosuid", "noexec", "nodev", "ro"]
                .iter()
                .map(|option| option.to_string())
                .collect();
            mount
                .set_source(Some(PathBuf::from("/sys")))
                .set_typ(Some(String::from("none")))
                .set_options(Some(sys_options));
        } else {
            let kept: Vec<String> = mount
                .options()
                .iter()
                .flatten()
                .filter(|option| !(option.starts_with("gid=") || option.starts_with("uid=")))
                .cloned()
                .collect();
            mount.set_options(Some(kept));
        }
    }

    let mut spec = Spec::default();
    spec.set_linux(Some(linux)).set_mounts(Some(mounts));
    Ok(spec)
}
/// Returns the zero-based index of the `PID` column in a whitespace-separated
/// header line.
///
/// # Errors
///
/// Returns `ENOENT` when the header has no column named exactly `PID`.
fn get_pid_index(title: &str) -> SydResult<usize> {
    title
        .split_whitespace()
        .position(|name| name == "PID")
        .ok_or_else(|| Errno::ENOENT.into())
}
fn make_root(opt: &mut GlobalOpts) -> SydResult<()> {
let uid = Uid::current();
#[expect(clippy::disallowed_methods)]
if opt.root.is_none() {
let syscall = create_syscall();
let is_rootless_required = rootless_required(&*syscall)?;
opt.root = Some(if !is_rootless_required {
PathBuf::from("/run/syd")
} else if let Ok(path) = env::var("XDG_RUNTIME_DIR") {
PathBuf::from(format!("{path}/syd"))
} else {
PathBuf::from(format!("/run/user/{uid}/syd"))
});
};
let path = match opt.root {
Some(ref path) => path,
_ => unreachable!(),
};
mkdir_p(path, Mode::S_IRWXU | Mode::S_ISVTX)?;
let path = path.canonicalize()?;
assert_eq!(path_uid(&path)?, uid, "UID mismatch on root directory!");
opt.root = Some(path);
Ok(())
}
/// Creates `dir` and any missing parent directories, using `mode` for the
/// directories it creates.
///
/// # Errors
///
/// Propagates the I/O error from directory creation.
fn mkdir_p<P: AsRef<Path>>(dir: P, mode: Mode) -> SydResult<()> {
    let mut builder = DirBuilder::new();
    builder.recursive(true).mode(mode.bits());
    builder.create(&dir)?;
    Ok(())
}
/// Returns the user ID that owns `path`.
///
/// # Errors
///
/// Propagates the I/O error from reading the file metadata.
fn path_uid<P: AsRef<Path>>(path: P) -> SydResult<Uid> {
    let metadata = fs::metadata(&path)?;
    let owner = metadata.st_uid();
    Ok(Uid::from_raw(owner))
}
/// Resolves `name` to the path of an existing file.
///
/// A name containing `/` that exists is returned as is. Otherwise each
/// directory of the colon-separated `path_var` is tried in order and the
/// first existing `<dir>/<name>` is returned.
///
/// Returns `None` when nothing matches.
fn get_executable_path(name: &str, path_var: &str) -> Option<PathBuf> {
    if name.contains('/') && XPath::new(name).exists(true) {
        return Some(PathBuf::from(name));
    }
    path_var
        .split(':')
        .map(|dir| PathBuf::from(dir).join(name))
        .find(|candidate| XPath::new(candidate).exists(true))
}
/// Reports whether `path` is a regular file with the "others" execute bit
/// (`0o001`) set.
///
/// Directories are never reported as executable, whatever their mode.
///
/// NOTE(review): only the "others" bit is tested; a file that is executable
/// solely by its owner or group is reported as not executable.
///
/// # Errors
///
/// Propagates the I/O error from reading the file metadata.
fn is_executable(path: &Path) -> std::result::Result<bool, std::io::Error> {
    let metadata = path.metadata()?;
    if !metadata.is_file() {
        return Ok(false);
    }
    let mode = metadata.permissions().mode();
    Ok(mode & 0o001 != 0)
}
/// Returns the cgroup path to use for a container.
///
/// An explicitly configured path is returned unchanged; otherwise the
/// default `:syd:<container_id>` is used.
fn get_cgroup_path(cgroups_path: &Option<PathBuf>, container_id: &str) -> PathBuf {
    cgroups_path
        .clone()
        .unwrap_or_else(|| PathBuf::from(format!(":syd:{container_id}")))
}
/// Returns the `Debug` name of every capability yielded by
/// `syd::caps::Capabilities::all()`.
///
/// The current implementation never fails.
fn query_caps() -> SydResult<Vec<String>> {
    let mut names = Vec::new();
    for cap in syd::caps::Capabilities::all().iter() {
        names.push(format!("{cap:?}"));
    }
    Ok(names)
}
/// Returns the namespace types advertised by the `features` command.
///
/// The list is fixed; the current implementation never fails.
fn query_supported_namespaces() -> SydResult<Vec<LinuxNamespaceType>> {
    let supported = Vec::from([
        LinuxNamespaceType::Pid,
        LinuxNamespaceType::Network,
        LinuxNamespaceType::Uts,
        LinuxNamespaceType::Ipc,
        LinuxNamespaceType::Mount,
        LinuxNamespaceType::User,
        LinuxNamespaceType::Cgroup,
        LinuxNamespaceType::Time,
    ]);
    Ok(supported)
}
/// Returns the names of the OCI lifecycle hooks advertised by the `features`
/// command, in a fixed order.
fn known_hooks() -> Vec<String> {
    const HOOKS: [&str; 6] = [
        "prestart",
        "createRuntime",
        "createContainer",
        "startContainer",
        "poststart",
        "poststop",
    ];
    let mut names = Vec::with_capacity(HOOKS.len());
    for hook in HOOKS.iter() {
        names.push(String::from(*hook));
    }
    names
}