use std::fs;
use std::io;
use std::os::unix::fs::{FileTypeExt, MetadataExt};
use std::path::{Path, PathBuf};
use std::process::Command;
use nix::sys::stat::{Mode, SFlag, mknod};
use nix::unistd::{Gid, Uid, chown};
use crate::error::{VmRuntimeError, VmRuntimeResult};
// Values used by `JailerConfig::default()`, and by `JailerConfig::from_env()`
// for any field whose environment variable is missing or unusable.
const DEFAULT_JAILER_BIN: &str = "/usr/bin/jailer";
const DEFAULT_CHROOT_BASE: &str = "/srv/jailer";
const DEFAULT_CGROUP_PARENT: &str = "microvm.slice";
const DEFAULT_UID: u32 = 123;
const DEFAULT_GID: u32 = 100;
// File names given to the staged artifacts at the top level of a VM's chroot.
const KERNEL_BASENAME: &str = "vmlinux";
const ROOTFS_BASENAME: &str = "rootfs.ext4";
const API_SOCKET_BASENAME: &str = "api.sock";
// Device numbers used for the `dev/kvm` character device created in the chroot.
const KVM_MAJOR: u64 = 10;
const KVM_MINOR: u64 = 232;
// Device numbers used for the `dev/net/tun` character device created in the chroot.
const TUN_MAJOR: u64 = 10;
const TUN_MINOR: u64 = 200;
/// Settings that decide how the jailer binary is invoked and where per-VM
/// chroots are placed on the host.
#[derive(Debug, Clone)]
pub struct JailerConfig {
/// Program launched by `Jailer::build_command`.
pub jailer_bin: PathBuf,
/// User id passed as `--uid`; `Jailer::prepare` also chowns the chroot tree to it.
pub uid: u32,
/// Group id passed as `--gid`; `Jailer::prepare` also chowns the chroot tree to it.
pub gid: u32,
/// Directory passed as `--chroot-base-dir`; VM chroots are created beneath it.
pub chroot_base: PathBuf,
/// Value passed as `--parent-cgroup`; `Jailer::teardown` also uses it to locate
/// the VM's cgroup directory under `/sys/fs/cgroup`.
pub cgroup_parent: String,
/// Selects the `--cgroup-version` argument: `true` emits `2`, `false` emits `1`.
pub cgroup_v2: bool,
/// When `Some`, the value is passed as `--numa-node`; when `None` the flag is omitted.
pub numa_node: Option<u32>,
}
impl Default for JailerConfig {
fn default() -> Self {
Self {
jailer_bin: PathBuf::from(DEFAULT_JAILER_BIN),
uid: DEFAULT_UID,
gid: DEFAULT_GID,
chroot_base: PathBuf::from(DEFAULT_CHROOT_BASE),
cgroup_parent: DEFAULT_CGROUP_PARENT.to_string(),
cgroup_v2: true,
numa_node: None,
}
}
}
impl JailerConfig {
    /// Builds a configuration from `MICROVM_JAILER_*` environment variables,
    /// falling back to [`JailerConfig::default`] one field at a time.
    ///
    /// Variables read:
    /// - `MICROVM_JAILER_BIN`, `MICROVM_JAILER_CHROOT_BASE` — paths.
    /// - `MICROVM_JAILER_UID`, `MICROVM_JAILER_GID`, `MICROVM_JAILER_NUMA_NODE` —
    ///   unsigned integers; surrounding whitespace is ignored.
    /// - `MICROVM_JAILER_CGROUP_PARENT` — string.
    /// - `MICROVM_JAILER_CGROUP_VERSION` — `1` selects cgroup v1, any other
    ///   non-blank value selects v2.
    ///
    /// A variable that is unset, not valid Unicode, blank, or (for numbers)
    /// unparsable is ignored and the default is used instead. Blank values are
    /// ignored on purpose: an empty `chroot_base` would otherwise turn every
    /// chroot path into one relative to the current working directory.
    pub fn from_env() -> Self {
        /// Returns the variable's value unless it is missing or blank.
        fn text(name: &str) -> Option<String> {
            std::env::var(name)
                .ok()
                .filter(|value| !value.trim().is_empty())
        }

        /// Parses the variable as `u32`, tolerating surrounding whitespace.
        fn number(name: &str) -> Option<u32> {
            text(name).and_then(|value| value.trim().parse::<u32>().ok())
        }

        let default = Self::default();
        Self {
            jailer_bin: text("MICROVM_JAILER_BIN")
                .map(PathBuf::from)
                .unwrap_or(default.jailer_bin),
            uid: number("MICROVM_JAILER_UID").unwrap_or(default.uid),
            gid: number("MICROVM_JAILER_GID").unwrap_or(default.gid),
            chroot_base: text("MICROVM_JAILER_CHROOT_BASE")
                .map(PathBuf::from)
                .unwrap_or(default.chroot_base),
            cgroup_parent: text("MICROVM_JAILER_CGROUP_PARENT").unwrap_or(default.cgroup_parent),
            cgroup_v2: text("MICROVM_JAILER_CGROUP_VERSION")
                .map(|value| value.trim() != "1")
                .unwrap_or(default.cgroup_v2),
            numa_node: number("MICROVM_JAILER_NUMA_NODE"),
        }
    }
}
/// Locations produced by `Jailer::prepare` for one VM.
#[derive(Debug, Clone)]
pub struct VmJail {
/// Host path of the directory that becomes the VM's chroot root.
pub chroot_path: PathBuf,
/// Path of the API socket as seen from inside the chroot (`/api.sock`).
pub api_socket_in_chroot: PathBuf,
/// Path of the same API socket as seen from the host (`<chroot_path>/api.sock`).
pub api_socket_on_host: PathBuf,
}
/// Stages per-VM chroots on the host and builds the jailer command line for them.
#[derive(Debug, Clone)]
pub struct Jailer {
// Settings applied to every VM handled by this instance.
config: JailerConfig,
}
impl Jailer {
pub fn new(config: JailerConfig) -> Self {
Self { config }
}
pub fn from_env() -> Self {
Self::new(JailerConfig::from_env())
}
pub fn config(&self) -> &JailerConfig {
&self.config
}
pub fn chroot_for(&self, vm_id: &str) -> PathBuf {
self.config
.chroot_base
.join("firecracker")
.join(safe_vm_id(vm_id))
.join("root")
}
pub fn prepare(
&self,
vm_id: &str,
kernel: &Path,
rootfs: &Path,
extra_drives: &[PathBuf],
) -> VmRuntimeResult<VmJail> {
let safe_id = safe_vm_id(vm_id);
if safe_id.is_empty() {
return Err(VmRuntimeError::Jailer(format!(
"vm id '{vm_id}' is empty after sanitisation"
)));
}
let chroot_path = self
.config
.chroot_base
.join("firecracker")
.join(&safe_id)
.join("root");
if !kernel.exists() {
return Err(VmRuntimeError::Jailer(format!(
"kernel image not found: {}",
kernel.display()
)));
}
if !rootfs.exists() {
return Err(VmRuntimeError::Jailer(format!(
"rootfs image not found: {}",
rootfs.display()
)));
}
for drive in extra_drives {
if !drive.exists() {
return Err(VmRuntimeError::Jailer(format!(
"extra drive not found: {}",
drive.display()
)));
}
}
create_dir_all(&chroot_path)?;
create_dir_all(&chroot_path.join("dev"))?;
create_dir_all(&chroot_path.join("dev").join("net"))?;
link_or_copy(kernel, &chroot_path.join(KERNEL_BASENAME))?;
link_or_copy(rootfs, &chroot_path.join(ROOTFS_BASENAME))?;
for (idx, drive) in extra_drives.iter().enumerate() {
let basename = drive
.file_name()
.map(|n| n.to_string_lossy().into_owned())
.unwrap_or_else(|| format!("drive-{idx}.img"));
link_or_copy(drive, &chroot_path.join(basename))?;
}
ensure_char_device(&chroot_path.join("dev").join("kvm"), KVM_MAJOR, KVM_MINOR)?;
ensure_char_device(
&chroot_path.join("dev").join("net").join("tun"),
TUN_MAJOR,
TUN_MINOR,
)?;
chown_tree(&chroot_path, self.config.uid, self.config.gid)?;
Ok(VmJail {
chroot_path: chroot_path.clone(),
api_socket_in_chroot: PathBuf::from("/").join(API_SOCKET_BASENAME),
api_socket_on_host: chroot_path.join(API_SOCKET_BASENAME),
})
}
pub fn build_command(
&self,
vm_id: &str,
jail: &VmJail,
firecracker_bin: &Path,
) -> VmRuntimeResult<Command> {
let safe_id = safe_vm_id(vm_id);
if safe_id.is_empty() {
return Err(VmRuntimeError::Jailer(format!(
"vm id '{vm_id}' is empty after sanitisation"
)));
}
if !firecracker_bin.is_absolute() {
return Err(VmRuntimeError::Jailer(format!(
"firecracker binary must be an absolute path: {}",
firecracker_bin.display()
)));
}
let expected_chroot = self
.config
.chroot_base
.join("firecracker")
.join(&safe_id)
.join("root");
if jail.chroot_path != expected_chroot {
return Err(VmRuntimeError::Jailer(format!(
"jail chroot {} does not match expected {} for vm '{vm_id}'",
jail.chroot_path.display(),
expected_chroot.display()
)));
}
let mut cmd = Command::new(&self.config.jailer_bin);
cmd.arg("--id")
.arg(&safe_id)
.arg("--exec-file")
.arg(firecracker_bin)
.arg("--uid")
.arg(self.config.uid.to_string())
.arg("--gid")
.arg(self.config.gid.to_string())
.arg("--chroot-base-dir")
.arg(&self.config.chroot_base);
cmd.arg("--cgroup-version")
.arg(if self.config.cgroup_v2 { "2" } else { "1" })
.arg("--parent-cgroup")
.arg(&self.config.cgroup_parent);
if let Some(node) = self.config.numa_node {
cmd.arg("--numa-node").arg(node.to_string());
}
cmd.arg("--new-pid-ns");
cmd.arg("--");
cmd.arg("--api-sock").arg(&jail.api_socket_in_chroot);
Ok(cmd)
}
pub fn teardown(&self, vm_id: &str) -> VmRuntimeResult<()> {
let safe_id = safe_vm_id(vm_id);
if safe_id.is_empty() {
return Err(VmRuntimeError::Jailer(format!(
"vm id '{vm_id}' is empty after sanitisation"
)));
}
let vm_dir = self.config.chroot_base.join("firecracker").join(&safe_id);
if vm_dir.exists() {
fs::remove_dir_all(&vm_dir).map_err(|e| {
VmRuntimeError::Jailer(format!("failed removing chroot {}: {e}", vm_dir.display()))
})?;
}
let cgroup_path = PathBuf::from("/sys/fs/cgroup")
.join(&self.config.cgroup_parent)
.join(&safe_id);
let _ = fs::remove_dir(&cgroup_path);
Ok(())
}
}
/// Maps a VM id onto the character set used for directory names and the
/// jailer `--id` argument.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept; every other character
/// (including each non-ASCII character) is replaced by a single `_`. The
/// result has one character per input character, so it is empty only when
/// the input is empty.
///
/// NOTE(review): the Firecracker jailer documentation describes `--id` as
/// alphanumeric characters and hyphens with a length limit; `_` and long ids
/// pass through here unchanged — confirm the jailer in use accepts them.
pub fn safe_vm_id(vm_id: &str) -> String {
    let mut sanitised = String::with_capacity(vm_id.len());
    for ch in vm_id.chars() {
        let kept = match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch,
            _ => '_',
        };
        sanitised.push(kept);
    }
    sanitised
}
/// Creates `path` and any missing parents, converting an I/O failure into a
/// `VmRuntimeError::Jailer` that names the directory.
fn create_dir_all(path: &Path) -> VmRuntimeResult<()> {
    match fs::create_dir_all(path) {
        Ok(()) => Ok(()),
        Err(e) => Err(VmRuntimeError::Jailer(format!(
            "failed creating directory {}: {e}",
            path.display()
        ))),
    }
}
/// Makes `dst` refer to the same content as `src`, preferring a hard link
/// and falling back to a copy when the two paths are on different
/// filesystems (`EXDEV`).
///
/// `src` is resolved through any symlinks first, because `fs::hard_link`
/// does not follow a symlinked source on Linux and would otherwise place a
/// link to the symlink itself in the chroot, where its target cannot resolve.
///
/// An existing `dst` is left alone only when it already is a real hard link
/// to the resolved source. Anything else found there — a stale file, or a
/// symlink, dangling or not — is removed and replaced.
///
/// # Errors
///
/// Returns `VmRuntimeError::Jailer` when `src` cannot be resolved, a stale
/// `dst` cannot be removed, or both linking and copying fail.
fn link_or_copy(src: &Path, dst: &Path) -> VmRuntimeResult<()> {
    let resolved = fs::canonicalize(src).map_err(|e| {
        VmRuntimeError::Jailer(format!("failed resolving {}: {e}", src.display()))
    })?;

    // lstat the destination so a symlink is seen as itself; `exists()` would
    // report a dangling symlink as absent and the link below would then fail
    // with EEXIST.
    if let Ok(existing) = fs::symlink_metadata(dst) {
        let already_linked =
            !existing.file_type().is_symlink() && same_inode(&resolved, dst).unwrap_or(false);
        if already_linked {
            return Ok(());
        }
        fs::remove_file(dst).map_err(|e| {
            VmRuntimeError::Jailer(format!(
                "failed removing stale chroot artifact {}: {e}",
                dst.display()
            ))
        })?;
    }

    match fs::hard_link(&resolved, dst) {
        Ok(()) => Ok(()),
        // Hard links cannot cross filesystems; copy the bytes instead.
        Err(e) if e.raw_os_error() == Some(libc_exdev()) => fs::copy(&resolved, dst)
            .map(|_| ())
            .map_err(|e| {
                VmRuntimeError::Jailer(format!(
                    "failed copying {} -> {}: {e}",
                    src.display(),
                    dst.display()
                ))
            }),
        Err(e) => Err(VmRuntimeError::Jailer(format!(
            "failed hardlinking {} -> {}: {e}",
            src.display(),
            dst.display()
        ))),
    }
}
/// Reports whether `a` and `b` resolve to the same file, i.e. the same
/// (device, inode) pair. Both paths are stat'ed with symlinks followed; the
/// first stat failure is returned as the error.
fn same_inode(a: &Path, b: &Path) -> io::Result<bool> {
    let identity = |path: &Path| fs::metadata(path).map(|md| (md.dev(), md.ino()));
    let first = identity(a)?;
    let second = identity(b)?;
    Ok(first == second)
}
/// Numeric value of `EXDEV`, in the form `link_or_copy` compares against
/// `io::Error::raw_os_error`.
#[inline]
fn libc_exdev() -> i32 {
    let cross_device = nix::errno::Errno::EXDEV;
    cross_device as i32
}
fn ensure_char_device(path: &Path, major: u64, minor: u64) -> VmRuntimeResult<()> {
if path.exists() {
let md = fs::metadata(path).map_err(|e| {
VmRuntimeError::Jailer(format!(
"failed stat'ing existing chroot node {}: {e}",
path.display()
))
})?;
let file_type = md.file_type();
if !file_type.is_char_device() {
return Err(VmRuntimeError::Jailer(format!(
"chroot path {} exists but is not a character device",
path.display()
)));
}
let rdev = md.rdev();
let expected = makedev(major, minor);
if rdev != expected {
return Err(VmRuntimeError::Jailer(format!(
"chroot device {} has rdev {rdev:#x}, expected {expected:#x}",
path.display()
)));
}
return Ok(());
}
let mode = Mode::S_IRUSR | Mode::S_IWUSR;
let dev = makedev(major, minor);
match mknod(path, SFlag::S_IFCHR, mode, dev) {
Ok(()) => Ok(()),
Err(nix::errno::Errno::EEXIST) => Ok(()),
Err(nix::errno::Errno::EPERM) | Err(nix::errno::Errno::EACCES) => {
Err(VmRuntimeError::Jailer(format!(
"mknod({}, c {major} {minor}) denied — process needs CAP_MKNOD",
path.display()
)))
}
Err(e) => Err(VmRuntimeError::Jailer(format!(
"mknod({}, c {major} {minor}) failed: {e}",
path.display()
))),
}
}
/// Packs a major/minor pair into a single 64-bit device number.
///
/// Bit layout of the result: the low 12 bits of `major` sit at bits 8..20,
/// the next 20 bits of `major` at bits 44..64, the low 8 bits of `minor` at
/// bits 0..8 and the next 24 bits of `minor` at bits 20..44. Bits of either
/// argument above bit 31 are discarded.
fn makedev(major: u64, minor: u64) -> u64 {
    let major_low = (major & 0x0000_0fff) << 8;
    let major_high = (major & 0xffff_f000) << 32;
    let minor_low = minor & 0x0000_00ff;
    let minor_high = (minor & 0xffff_ff00) << 12;
    major_high | major_low | minor_high | minor_low
}
/// Recursively changes the owner of `root` and everything below it to
/// `uid:gid`.
///
/// Symlinks found inside the tree are skipped rather than chowned: `chown`
/// follows links, and the tree belongs to the unprivileged jail user after a
/// first run, so a planted link would otherwise redirect this (typically
/// root-run) chown to an arbitrary host file.
///
/// NOTE(review): entries are still addressed by path, so an entry swapped for
/// a symlink between the directory listing and the chown is not detected.
///
/// # Errors
///
/// Returns `VmRuntimeError::Jailer` on the first chown, directory read or
/// entry stat that fails; entries processed before that keep their new owner.
fn chown_tree(root: &Path, uid: u32, gid: u32) -> VmRuntimeResult<()> {
    let owner = Some(Uid::from_raw(uid));
    let group = Some(Gid::from_raw(gid));
    let chown_one = |target: &Path| -> VmRuntimeResult<()> {
        chown(target, owner, group).map_err(|e| {
            VmRuntimeError::Jailer(format!(
                "chown {} to {uid}:{gid} failed: {e}",
                target.display()
            ))
        })
    };

    chown_one(root)?;

    let entries = fs::read_dir(root).map_err(|e| {
        VmRuntimeError::Jailer(format!("failed reading chroot dir {}: {e}", root.display()))
    })?;
    for entry in entries {
        let entry = entry.map_err(|e| {
            VmRuntimeError::Jailer(format!(
                "failed iterating chroot dir {}: {e}",
                root.display()
            ))
        })?;
        let path = entry.path();
        // `DirEntry::file_type` does not follow symlinks, so a link shows up
        // here as a link and not as whatever it points to.
        let file_type = entry.file_type().map_err(|e| {
            VmRuntimeError::Jailer(format!("failed stat'ing {}: {e}", path.display()))
        })?;
        if file_type.is_symlink() {
            continue;
        }
        if file_type.is_dir() {
            chown_tree(&path, uid, gid)?;
        } else {
            chown_one(&path)?;
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed configuration rooted at `base`, independent of the environment.
    fn cfg(base: &Path) -> JailerConfig {
        JailerConfig {
            jailer_bin: PathBuf::from("/usr/bin/jailer"),
            uid: 123,
            gid: 100,
            chroot_base: base.to_path_buf(),
            cgroup_parent: "microvm.slice".to_string(),
            cgroup_v2: true,
            numa_node: None,
        }
    }

    /// Jailer using `cfg` with the `/srv/jailer` chroot base.
    fn srv_jailer() -> Jailer {
        Jailer::new(cfg(Path::new("/srv/jailer")))
    }

    /// Hand-built `VmJail` whose sockets are derived from `chroot`.
    fn jail_at(chroot: &str) -> VmJail {
        let chroot_path = PathBuf::from(chroot);
        VmJail {
            api_socket_in_chroot: PathBuf::from("/api.sock"),
            api_socket_on_host: chroot_path.join("api.sock"),
            chroot_path,
        }
    }

    /// The jail that belongs to `vm-1` under `/srv/jailer`.
    fn vm1_jail() -> VmJail {
        jail_at("/srv/jailer/firecracker/vm-1/root")
    }

    /// Command arguments as owned strings, in emission order.
    fn args_of(cmd: &Command) -> Vec<String> {
        cmd.get_args()
            .map(|a| a.to_string_lossy().into_owned())
            .collect()
    }

    /// Index of the first argument equal to `flag`; panics when absent.
    fn index_of(args: &[String], flag: &str) -> usize {
        args.iter().position(|s| s == flag).unwrap()
    }

    #[test]
    fn safe_vm_id_matches_adapter_convention() {
        let cases = [
            ("vm-1_a", "vm-1_a"),
            ("vm/with/slash", "vm_with_slash"),
            ("../etc/passwd", "___etc_passwd"),
            ("ünicode", "_nicode"),
            ("ok123", "ok123"),
        ];
        for (input, expected) in cases {
            assert_eq!(safe_vm_id(input), expected);
        }
    }

    #[test]
    fn chroot_path_computation() {
        let j = srv_jailer();
        assert_eq!(
            j.chroot_for("vm-1"),
            PathBuf::from("/srv/jailer/firecracker/vm-1/root")
        );
        assert_eq!(
            j.chroot_for("../etc"),
            PathBuf::from("/srv/jailer/firecracker/___etc/root")
        );
    }

    #[test]
    fn build_command_basic_args_in_expected_order() {
        let cmd = srv_jailer()
            .build_command("vm-1", &vm1_jail(), Path::new("/usr/bin/firecracker"))
            .expect("command builds");
        assert_eq!(cmd.get_program(), "/usr/bin/jailer");
        assert_eq!(
            args_of(&cmd),
            [
                "--id",
                "vm-1",
                "--exec-file",
                "/usr/bin/firecracker",
                "--uid",
                "123",
                "--gid",
                "100",
                "--chroot-base-dir",
                "/srv/jailer",
                "--cgroup-version",
                "2",
                "--parent-cgroup",
                "microvm.slice",
                "--new-pid-ns",
                "--",
                "--api-sock",
                "/api.sock",
            ]
        );
    }

    #[test]
    fn build_command_cgroup_v1_flag() {
        let mut c = cfg(Path::new("/srv/jailer"));
        c.cgroup_v2 = false;
        let cmd = Jailer::new(c)
            .build_command("vm-1", &vm1_jail(), Path::new("/usr/bin/firecracker"))
            .expect("command builds");
        let args = args_of(&cmd);
        let i = index_of(&args, "--cgroup-version");
        assert_eq!(args[i + 1], "1");
    }

    #[test]
    fn build_command_includes_numa_when_configured() {
        let mut c = cfg(Path::new("/srv/jailer"));
        c.numa_node = Some(3);
        let cmd = Jailer::new(c)
            .build_command("vm-1", &vm1_jail(), Path::new("/usr/bin/firecracker"))
            .expect("command builds");
        let args = args_of(&cmd);
        let i = index_of(&args, "--numa-node");
        assert_eq!(args[i + 1], "3");
        // The flag must be a jailer argument, i.e. appear before the separator.
        let sep = index_of(&args, "--");
        assert!(i < sep);
    }

    #[test]
    fn build_command_sanitises_vm_id() {
        let jail = jail_at("/srv/jailer/firecracker/___etc/root");
        let cmd = srv_jailer()
            .build_command("../etc", &jail, Path::new("/usr/bin/firecracker"))
            .expect("command builds");
        let args = args_of(&cmd);
        let i = index_of(&args, "--id");
        assert_eq!(args[i + 1], "___etc");
    }

    #[test]
    fn build_command_rejects_relative_firecracker_bin() {
        let err = srv_jailer()
            .build_command("vm-1", &vm1_jail(), Path::new("firecracker"))
            .unwrap_err();
        match err {
            VmRuntimeError::Jailer(msg) => assert!(msg.contains("absolute"), "{msg}"),
            other => panic!("expected Jailer error, got {other:?}"),
        }
    }

    #[test]
    fn build_command_rejects_chroot_mismatch() {
        let jail = jail_at("/srv/jailer/firecracker/other/root");
        let err = srv_jailer()
            .build_command("vm-1", &jail, Path::new("/usr/bin/firecracker"))
            .unwrap_err();
        assert!(matches!(err, VmRuntimeError::Jailer(_)));
    }

    #[test]
    fn teardown_is_idempotent_without_prepare() {
        let tmp = tempfile::tempdir().unwrap();
        let j = Jailer::new(cfg(tmp.path()));
        for _ in 0..2 {
            j.teardown("vm-1").expect("teardown ok");
        }
    }

    #[test]
    fn teardown_removes_chroot_tree() {
        let tmp = tempfile::tempdir().unwrap();
        let j = Jailer::new(cfg(tmp.path()));
        let vm_dir = tmp.path().join("firecracker").join("vm-1");
        let root = vm_dir.join("root");
        fs::create_dir_all(root.join("dev")).unwrap();
        fs::write(root.join("vmlinux"), b"fake").unwrap();
        assert!(vm_dir.exists());
        j.teardown("vm-1").expect("teardown ok");
        assert!(!vm_dir.exists());
    }

    #[test]
    fn makedev_matches_glibc_for_kvm_and_tun() {
        assert_eq!(makedev(10, 232), 0xae8);
        assert_eq!(makedev(10, 200), 0xac8);
    }

    #[test]
    #[ignore = "requires root: CAP_MKNOD + CAP_CHOWN"]
    fn prepare_is_idempotent() {
        let tmp = tempfile::tempdir().unwrap();
        let kernel = tmp.path().join("vmlinux.src");
        let rootfs = tmp.path().join("rootfs.src");
        fs::write(&kernel, b"fake kernel").unwrap();
        fs::write(&rootfs, b"fake rootfs").unwrap();
        let j = Jailer::new(cfg(tmp.path()));
        let jail1 = j.prepare("vm-1", &kernel, &rootfs, &[]).expect("prepare 1");
        let jail2 = j.prepare("vm-1", &kernel, &rootfs, &[]).expect("prepare 2");
        assert_eq!(jail1.chroot_path, jail2.chroot_path);
        for node in ["dev/kvm", "dev/net/tun"] {
            let md = fs::metadata(jail1.chroot_path.join(node)).unwrap();
            assert!(md.file_type().is_char_device());
        }
    }
}