use std::collections::HashSet;
use std::io::{Seek, SeekFrom, Write};
use std::os::unix::io::{AsRawFd, RawFd};
use std::sync::Arc;
use tokio::sync::Mutex;
use crate::seccomp::notif::{read_child_mem, write_child_mem, NotifAction, NotifPolicy, SupervisorState};
use crate::sys::structs::{SeccompNotif, EACCES};
use crate::sys::syscall;
/// Paths under `/proc` and `/sys` that a sandboxed process must never open.
///
/// Each entry blocks the exact path as well as everything below it
/// (see [`is_sensitive_proc`]).
const SENSITIVE_PATHS: &[&str] = &[
    "/proc/kcore",
    "/proc/kmsg",
    "/proc/kallsyms",
    "/proc/keys",
    "/proc/key-users",
    "/proc/sysrq-trigger",
    "/sys/firmware",
    "/sys/kernel/security",
];

/// Reports whether `path` is a blocked entry or lies beneath one.
///
/// The comparison is purely textual: `path` matches when it equals an entry
/// of [`SENSITIVE_PATHS`] or continues that entry with a `/` separator.
/// No normalization is performed.
// NOTE(review): spellings such as `/proc//kcore` or `/proc/self/../kcore`
// do not match textually; confirm whether callers canonicalize first.
pub(crate) fn is_sensitive_proc(path: &str) -> bool {
    SENSITIVE_PATHS.iter().any(|&blocked| {
        path.strip_prefix(blocked)
            .map_or(false, |rest| rest.is_empty() || rest.starts_with('/'))
    })
}
/// Renders a synthetic `/proc/cpuinfo` describing `num_cpus` virtual CPUs.
///
/// Every CPU gets a three-line stanza (`processor`, `model name`, `cpu MHz`);
/// stanzas are separated by a single blank line. With `num_cpus == 0` the
/// result is empty.
pub(crate) fn generate_cpuinfo(num_cpus: u32) -> Vec<u8> {
    let stanzas: Vec<String> = (0..num_cpus)
        .map(|cpu| {
            format!(
                "processor\t: {}\nmodel name\t: Virtual CPU\ncpu MHz\t\t: 2400.000\n",
                cpu
            )
        })
        .collect();
    // Each stanza already ends in '\n', so joining with '\n' yields the blank
    // separator line between consecutive CPUs.
    stanzas.join("\n").into_bytes()
}
/// Renders a synthetic `/proc/meminfo` with `MemTotal`, `MemFree` and
/// `MemAvailable` lines, all expressed in kB.
///
/// `used_bytes` is capped at `total_bytes`, so the reported free amount never
/// underflows. `MemAvailable` always equals `MemFree`.
pub(crate) fn generate_meminfo(total_bytes: u64, used_bytes: u64) -> Vec<u8> {
    let total_kb = total_bytes / 1024;
    let capped_used_kb = used_bytes.min(total_bytes) / 1024;
    let free_kb = total_kb.saturating_sub(capped_used_kb);

    let report = format!(
        "MemTotal: {} kB\nMemFree: {} kB\nMemAvailable: {} kB\n",
        total_kb, free_kb, free_kb,
    );
    report.into_bytes()
}
/// Produces a filtered copy of the supervisor's `/proc/net/tcp` (or
/// `/proc/net/tcp6` when `is_v6` is set).
///
/// The first line (the column header) is always kept. Every following row is
/// kept only when its local port parses and is a member of `bound_ports`.
/// If the file cannot be read, an empty buffer is returned.
pub(crate) fn generate_proc_net_tcp(bound_ports: &HashSet<u16>, is_v6: bool) -> Vec<u8> {
    let source = if is_v6 { "/proc/net/tcp6" } else { "/proc/net/tcp" };
    let table = match std::fs::read_to_string(source) {
        Ok(text) => text,
        Err(_) => return Vec::new(),
    };

    let mut filtered = String::new();
    let mut rows = table.lines();

    // Header row is passed through untouched.
    if let Some(header) = rows.next() {
        filtered.push_str(header);
        filtered.push('\n');
    }

    let visible_rows = rows.filter(|row| {
        parse_proc_net_tcp_port(row).map_or(false, |port| bound_ports.contains(&port))
    });
    for row in visible_rows {
        filtered.push_str(row);
        filtered.push('\n');
    }

    filtered.into_bytes()
}
/// Extracts the local port from one row of `/proc/net/tcp{,6}`.
///
/// The second whitespace-separated field has the form `<hex-addr>:<hex-port>`;
/// the text after the last `:` is parsed as a hexadecimal `u16`. Returns
/// `None` when the field is missing, has no `:`, or the port does not parse.
fn parse_proc_net_tcp_port(line: &str) -> Option<u16> {
    let local_addr = line.split_whitespace().nth(1)?;
    let (_, port_hex) = local_addr.rsplit_once(':')?;
    u16::from_str_radix(port_hex, 16).ok()
}
/// Stages `content` in a fresh memfd and asks for it to be injected into the
/// child in place of the file it tried to open.
///
/// Returns `NotifAction::InjectFdSend` carrying the raw descriptor on success.
/// If the memfd cannot be created, written, or rewound, the syscall is allowed
/// to proceed unchanged (`NotifAction::Continue`).
fn inject_memfd(content: &[u8]) -> NotifAction {
    let memfd = match syscall::memfd_create("sandlock", 0) {
        Ok(fd) => fd,
        Err(_) => return NotifAction::Continue,
    };
    let raw = memfd.as_raw_fd();

    // SAFETY: `raw` was just obtained from `memfd`, which is still alive, so it
    // refers to an open descriptor. The `File` is only a temporary view used
    // for writing; `ManuallyDrop` guarantees it never closes `raw` itself.
    let mut view = std::mem::ManuallyDrop::new(unsafe { std::fs::File::from_raw_fd(raw) });

    let staged = view.write_all(content).is_ok() && view.seek(SeekFrom::Start(0)).is_ok();
    if !staged {
        // `memfd` goes out of scope here and runs its own cleanup.
        return NotifAction::Continue;
    }

    // Deliberately give up ownership so the descriptor stays open for the
    // consumer of the returned action.
    // NOTE(review): presumably the InjectFdSend handler closes `srcfd` once it
    // has been sent — confirm against the dispatcher.
    std::mem::forget(memfd);
    NotifAction::InjectFdSend { srcfd: raw }
}
use std::os::unix::io::FromRawFd;
/// Reads a path string from the child's memory at `addr`.
///
/// Up to 256 bytes are fetched; the path is the prefix before the first NUL
/// byte, or the whole buffer when no NUL is present. Returns `None` for a
/// null pointer, a failed memory read, or bytes that are not valid UTF-8.
fn read_path(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option<String> {
    if addr == 0 {
        return None;
    }
    let raw = read_child_mem(notif_fd, notif.id, notif.pid, addr, 256).ok()?;
    let end = raw.iter().position(|&byte| byte == 0).unwrap_or(raw.len());
    std::str::from_utf8(&raw[..end]).ok().map(str::to_owned)
}
/// Handles an intercepted open whose path argument is `args[1]`, applying the
/// `/proc` virtualization rules.
///
/// * Sensitive paths (see `is_sensitive_proc`) are refused with `EACCES`.
/// * `/proc/cpuinfo` is replaced by a synthetic file when `policy.num_cpus`
///   is set.
/// * `/proc/meminfo` is replaced when `policy.max_memory_bytes` is non-zero.
/// * `/proc/net/tcp` and `/proc/net/tcp6` are filtered to the sandbox's bound
///   ports when `policy.port_remap` is enabled.
///
/// Everything else, including unreadable paths, is passed through with
/// `NotifAction::Continue`.
pub(crate) async fn handle_proc_open(
    notif: &SeccompNotif,
    state: &Arc<Mutex<SupervisorState>>,
    policy: &NotifPolicy,
    notif_fd: RawFd,
) -> NotifAction {
    let path = match read_path(notif, notif.data.args[1], notif_fd) {
        Some(p) => p,
        None => return NotifAction::Continue,
    };

    if is_sensitive_proc(&path) {
        return NotifAction::Errno(EACCES);
    }

    match path.as_str() {
        "/proc/cpuinfo" => {
            if let Some(num_cpus) = policy.num_cpus {
                return inject_memfd(&generate_cpuinfo(num_cpus));
            }
        }
        "/proc/meminfo" if policy.max_memory_bytes > 0 => {
            let guard = state.lock().await;
            let report = generate_meminfo(policy.max_memory_bytes, guard.mem_used);
            return inject_memfd(&report);
        }
        "/proc/net/tcp" | "/proc/net/tcp6" if policy.port_remap => {
            // The IPv6 table is the only candidate path ending in '6'.
            let is_v6 = path.ends_with('6');
            let guard = state.lock().await;
            let table = generate_proc_net_tcp(&guard.port_map.bound_ports, is_v6);
            return inject_memfd(&table);
        }
        _ => {}
    }

    NotifAction::Continue
}
/// Emulates `sched_getaffinity` so the child sees exactly `num_cpus` CPUs.
///
/// `args[1]` is the caller-supplied mask size in bytes and `args[2]` the
/// address of the mask buffer. The low `num_cpus` bits of the mask are set and
/// the mask is written into the child's memory.
///
/// Returns `NotifAction::ReturnValue` with the number of bytes written, or
/// `NotifAction::Continue` when the arguments are null/zero or the write into
/// the child fails.
pub(crate) fn handle_sched_getaffinity(
    notif: &SeccompNotif,
    num_cpus: u32,
    notif_fd: RawFd,
) -> NotifAction {
    /// Upper bound on the mask we build (8192 CPUs). `cpusetsize` is chosen by
    /// the sandboxed process, so it must never be used as an allocation size
    /// unchecked: a huge value would otherwise panic or exhaust the supervisor.
    const MAX_MASK_BYTES: usize = 1024;

    let cpusetsize = notif.data.args[1] as usize;
    let mask_addr = notif.data.args[2];
    if mask_addr == 0 || cpusetsize == 0 {
        return NotifAction::Continue;
    }

    // The raw syscall reports how many bytes it copied, which may be fewer
    // than requested, so writing (and reporting) a clamped length is valid.
    let mask_len = cpusetsize.min(MAX_MASK_BYTES);
    let mut mask = vec![0u8; mask_len];

    // Bound the loop by the mask capacity as well as by `num_cpus`, so an
    // oversized CPU count cannot spin through bits that do not fit.
    let visible_cpus = (num_cpus as usize).min(mask_len * 8);
    for cpu in 0..visible_cpus {
        mask[cpu / 8] |= 1 << (cpu % 8);
    }

    match write_child_mem(notif_fd, notif.id, notif.pid, mask_addr, &mask) {
        Ok(()) => NotifAction::ReturnValue(mask_len as i64),
        Err(_) => NotifAction::Continue,
    }
}
/// Emulates `uname` for the child, substituting `hostname` for the node name.
///
/// The supervisor's own `utsname` is used as the template; only `nodename` is
/// replaced. `hostname` is truncated to fit the field with its terminating
/// NUL. The resulting struct is written to the buffer at `args[0]`.
///
/// Returns `NotifAction::ReturnValue(0)` on success and
/// `NotifAction::Continue` when the buffer pointer is null, the local
/// `uname` call fails, or the write into the child fails.
pub(crate) fn handle_uname(
    notif: &SeccompNotif,
    hostname: &str,
    notif_fd: RawFd,
) -> NotifAction {
    let buf_addr = notif.data.args[0];
    if buf_addr == 0 {
        return NotifAction::Continue;
    }

    // SAFETY: `utsname` consists solely of fixed-size character arrays, for
    // which the all-zero bit pattern is a valid value.
    let mut uts: libc::utsname = unsafe { std::mem::zeroed() };
    // SAFETY: `uts` is a valid, exclusively borrowed `utsname` for the call.
    if unsafe { libc::uname(&mut uts) } != 0 {
        return NotifAction::Continue;
    }

    let name_bytes = hostname.as_bytes();
    let len = name_bytes.len().min(uts.nodename.len() - 1);

    // Wipe the whole field first. Overwriting only the prefix would leave the
    // tail of the real host name in the bytes after the NUL, and the entire
    // struct is copied into the child below.
    uts.nodename.fill(0);
    for (slot, &byte) in uts.nodename.iter_mut().zip(&name_bytes[..len]) {
        *slot = byte as libc::c_char;
    }

    // SAFETY: `uts` is a live, fully initialized value and the slice covers
    // exactly `size_of::<utsname>()` bytes of it for the duration of the call.
    let bytes = unsafe {
        std::slice::from_raw_parts(
            &uts as *const _ as *const u8,
            std::mem::size_of::<libc::utsname>(),
        )
    };
    match write_child_mem(notif_fd, notif.id, notif.pid, buf_addr, bytes) {
        Ok(()) => NotifAction::ReturnValue(0),
        Err(_) => NotifAction::Continue,
    }
}
/// Serves a virtual `/etc/hostname` when the child opens exactly that path.
///
/// The path is read from `args[1]`. Returns `None` when the path cannot be
/// read or is anything other than `/etc/hostname`, so the caller can fall
/// through to other handlers. Otherwise returns the action produced by
/// injecting a memfd containing `hostname` followed by a newline.
pub(crate) fn handle_hostname_open(
    notif: &SeccompNotif,
    hostname: &str,
    notif_fd: RawFd,
) -> Option<NotifAction> {
    let opened = read_path(notif, notif.data.args[1], notif_fd)?;
    if opened == "/etc/hostname" {
        let mut content = String::with_capacity(hostname.len() + 1);
        content.push_str(hostname);
        content.push('\n');
        Some(inject_memfd(content.as_bytes()))
    } else {
        None
    }
}
/// Emulates `getdents64` so that directory entries are returned in name order.
///
/// `args[0]` is the directory fd, `args[1]` the destination buffer and
/// `args[2]` its size. On the first call for a `(pid, fd)` pair the directory
/// behind the fd is listed, sorted by name, encoded as dirent records and
/// cached in `state.getdents_cache`. Each call then hands out as many whole
/// records as fit in the buffer and removes them from the cache.
///
/// Returns the number of bytes written (0 once the cached listing is
/// exhausted), or `NotifAction::Continue` if the directory cannot be resolved
/// or listed, or the write into the child fails.
pub(crate) async fn handle_sorted_getdents(
    notif: &SeccompNotif,
    state: &Arc<Mutex<SupervisorState>>,
    notif_fd: RawFd,
) -> NotifAction {
    /// Lists the directory that `link_path` (a `/proc/<pid>/fd/<n>` symlink)
    /// points to and returns its entries as name-sorted dirent records.
    /// Yields `None` when the link or the directory cannot be read.
    fn load_sorted_records(link_path: &str) -> Option<Vec<Vec<u8>>> {
        use std::os::linux::fs::MetadataExt;

        let dir_path = std::fs::read_link(link_path).ok()?;
        let mut listing: Vec<(String, u8, u64)> = std::fs::read_dir(&dir_path)
            .ok()?
            .filter_map(Result::ok)
            .map(|item| {
                let label = item.file_name().to_string_lossy().into_owned();
                let kind = match item.file_type() {
                    Ok(ft) if ft.is_dir() => DT_DIR,
                    Ok(ft) if ft.is_symlink() => DT_LNK,
                    _ => DT_REG,
                };
                let inode = item.metadata().map(|m| m.st_ino()).unwrap_or(0);
                (label, kind, inode)
            })
            .collect();
        listing.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));

        // d_off is simply the 1-based position within the sorted listing.
        let records = listing
            .iter()
            .zip(1i64..)
            .map(|((label, kind, inode), d_off)| build_dirent64(*inode, d_off, *kind, label))
            .collect();
        Some(records)
    }

    let pid = notif.pid;
    let child_fd = (notif.data.args[0] & 0xFFFF_FFFF) as u32;
    let buf_addr = notif.data.args[1];
    let buf_size = (notif.data.args[2] & 0xFFFF_FFFF) as usize;
    let cache_key = (pid as i32, child_fd);

    let mut st = state.lock().await;

    if !st.getdents_cache.contains_key(&cache_key) {
        let link_path = format!("/proc/{}/fd/{}", pid, child_fd);
        match load_sorted_records(&link_path) {
            Some(records) => {
                st.getdents_cache.insert(cache_key, records);
            }
            None => return NotifAction::Continue,
        }
    }

    let pending = match st.getdents_cache.get_mut(&cache_key) {
        Some(records) => records,
        None => return NotifAction::Continue,
    };
    if pending.is_empty() {
        return NotifAction::ReturnValue(0);
    }

    // Count how many leading records fit into the caller's buffer as a whole.
    let mut used = 0usize;
    let fitting = pending
        .iter()
        .take_while(|record| {
            let next = used + record.len();
            let fits = next <= buf_size;
            if fits {
                used = next;
            }
            fits
        })
        .count();
    let payload: Vec<u8> = pending.drain(..fitting).flatten().collect();

    // Release the lock before touching the child's memory.
    drop(st);

    if !payload.is_empty()
        && write_child_mem(notif_fd, notif.id, pid, buf_addr, &payload).is_err()
    {
        return NotifAction::Continue;
    }
    NotifAction::ReturnValue(payload.len() as i64)
}
/// `d_type` value stored in a dirent record for a directory.
pub(crate) const DT_DIR: u8 = 4;
/// `d_type` value stored in a dirent record for a regular file; also used as
/// the fallback when an entry's type is neither directory nor symlink or
/// cannot be determined.
pub(crate) const DT_REG: u8 = 8;
/// `d_type` value stored in a dirent record for a symbolic link.
pub(crate) const DT_LNK: u8 = 10;
/// Encodes one directory entry as a `getdents64`-style record.
///
/// Layout (native endianness): bytes 0..8 `d_ino`, 8..16 `d_off`,
/// 16..18 the record length, byte 18 `d_type`, then the name starting at
/// byte 19 followed by at least one NUL. The record is zero-padded so its
/// total length is a multiple of 8.
pub(crate) fn build_dirent64(d_ino: u64, d_off: i64, d_type: u8, name: &str) -> Vec<u8> {
    /// Size of the fixed-width fields that precede the name.
    const HEADER_LEN: usize = 19;

    let name_bytes = name.as_bytes();
    // Header + name + terminating NUL, rounded up to the next multiple of 8.
    let unpadded = HEADER_LEN + name_bytes.len() + 1;
    let reclen = (unpadded + 7) & !7;

    let mut record = Vec::with_capacity(reclen);
    record.extend_from_slice(&d_ino.to_ne_bytes());
    record.extend_from_slice(&d_off.to_ne_bytes());
    record.extend_from_slice(&(reclen as u16).to_ne_bytes());
    record.push(d_type);
    record.extend_from_slice(name_bytes);
    // Supplies the NUL terminator and the alignment padding in one step.
    record.resize(reclen, 0);
    record
}
/// Lists the supervisor's `/proc` as dirent records, hiding foreign processes.
///
/// Entries whose name parses as an `i32` PID are kept only when that PID is in
/// `sandbox_pids`; all non-numeric entries are kept. `d_off` is the 1-based
/// position among the entries that were kept. Returns an empty list when
/// `/proc` cannot be read; unreadable individual entries are skipped.
fn build_filtered_dirents(sandbox_pids: &HashSet<i32>) -> Vec<Vec<u8>> {
    use std::os::linux::fs::MetadataExt;

    let listing = match std::fs::read_dir("/proc") {
        Ok(iter) => iter,
        Err(_) => return Vec::new(),
    };

    listing
        .filter_map(Result::ok)
        .filter(|item| {
            // Non-numeric names are not PIDs and are always visible.
            item.file_name()
                .to_string_lossy()
                .parse::<i32>()
                .map_or(true, |pid| sandbox_pids.contains(&pid))
        })
        .zip(1i64..)
        .map(|(item, d_off)| {
            let kind = match item.file_type() {
                Ok(ft) if ft.is_dir() => DT_DIR,
                Ok(ft) if ft.is_symlink() => DT_LNK,
                _ => DT_REG,
            };
            let inode = item.metadata().map(|m| m.st_ino()).unwrap_or(0);
            build_dirent64(inode, d_off, kind, &item.file_name().to_string_lossy())
        })
        .collect()
}
/// Emulates `getdents64` on `/proc` so the child only sees sandbox PIDs.
///
/// `args[0]` is the directory fd, `args[1]` the destination buffer and
/// `args[2]` its size. Calls on any fd that does not resolve to `/proc` are
/// passed through with `NotifAction::Continue`. For `/proc`, the filtered
/// listing is built on first use, cached per `(pid, fd)` in
/// `state.getdents_cache`, and handed out in buffer-sized portions.
///
/// Returns the number of bytes written; 0 signals end-of-directory. Falls
/// back to `NotifAction::Continue` if the fd cannot be resolved or the write
/// into the child fails.
pub(crate) async fn handle_getdents(
    notif: &SeccompNotif,
    state: &Arc<Mutex<SupervisorState>>,
    _policy: &NotifPolicy,
    notif_fd: RawFd,
) -> NotifAction {
    let pid = notif.pid;
    let child_fd = (notif.data.args[0] & 0xFFFF_FFFF) as u32;
    let buf_addr = notif.data.args[1];
    let buf_size = (notif.data.args[2] & 0xFFFF_FFFF) as usize;

    let link_path = format!("/proc/{}/fd/{}", pid, child_fd);
    let target = match std::fs::read_link(&link_path) {
        Ok(t) => t,
        Err(_) => return NotifAction::Continue,
    };
    if target.to_str() != Some("/proc") {
        return NotifAction::Continue;
    }

    let cache_key = (pid as i32, child_fd);
    let mut st = state.lock().await;
    if !st.getdents_cache.contains_key(&cache_key) {
        let entries = build_filtered_dirents(&st.proc_pids);
        st.getdents_cache.insert(cache_key, entries);
    }
    let entries = match st.getdents_cache.get_mut(&cache_key) {
        Some(e) => e,
        None => return NotifAction::Continue,
    };

    // A drained listing means the previous call handed out the last records:
    // report end-of-directory now and only then evict the cache entry.
    // Evicting right after draining (before the child has seen a 0 return)
    // would make the next call rebuild the listing and loop forever.
    if entries.is_empty() {
        st.getdents_cache.remove(&cache_key);
        return NotifAction::ReturnValue(0);
    }

    // Copy out as many whole records as fit into the caller's buffer.
    let mut result = Vec::new();
    let mut consumed = 0;
    for entry in entries.iter() {
        if result.len() + entry.len() > buf_size {
            break;
        }
        result.extend_from_slice(entry);
        consumed += 1;
    }
    entries.drain(..consumed);

    // Release the lock before touching the child's memory.
    drop(st);

    if !result.is_empty()
        && write_child_mem(notif_fd, notif.id, pid, buf_addr, &result).is_err()
    {
        return NotifAction::Continue;
    }
    NotifAction::ReturnValue(result.len() as i64)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes generated file content, which is expected to be valid UTF-8.
    fn as_text(bytes: Vec<u8>) -> String {
        String::from_utf8(bytes).unwrap()
    }

    /// Blocked entries match exactly and as a directory prefix; unrelated
    /// paths are left alone.
    #[test]
    fn test_is_sensitive_proc() {
        let blocked = [
            "/proc/kcore",
            "/proc/kmsg",
            "/proc/kallsyms",
            "/proc/keys",
            "/proc/key-users",
            "/proc/sysrq-trigger",
            "/sys/firmware",
            "/sys/firmware/efi",
            "/sys/kernel/security",
            "/sys/kernel/security/apparmor",
        ];
        for path in blocked.iter() {
            assert!(is_sensitive_proc(path));
        }

        let allowed = [
            "/proc/cpuinfo",
            "/proc/meminfo",
            "/proc/1/status",
            "/sys/class/net",
        ];
        for path in allowed.iter() {
            assert!(!is_sensitive_proc(path));
        }
    }

    /// A single CPU yields exactly one stanza with all three fields.
    #[test]
    fn test_generate_cpuinfo_single() {
        let text = as_text(generate_cpuinfo(1));
        for expected in [
            "processor\t: 0",
            "model name\t: Virtual CPU",
            "cpu MHz\t\t: 2400.000",
        ]
        .iter()
        {
            assert!(text.contains(expected));
        }
        assert!(!text.contains("processor\t: 1"));
    }

    /// Four CPUs are numbered 0 through 3 and nothing beyond.
    #[test]
    fn test_generate_cpuinfo_multiple() {
        let text = as_text(generate_cpuinfo(4));
        for cpu in 0..4 {
            assert!(text.contains(&format!("processor\t: {}", cpu)));
        }
        assert!(!text.contains("processor\t: 4"));
    }

    /// Free and available memory are both total minus used, in kB.
    #[test]
    fn test_generate_meminfo() {
        let total = 1024 * 1024 * 1024u64;
        let used = 256 * 1024 * 1024u64;
        let text = as_text(generate_meminfo(total, used));

        let total_kb = total / 1024;
        let free_kb = total_kb - used / 1024;
        assert!(text.contains(&format!("MemTotal: {} kB", total_kb)));
        assert!(text.contains(&format!("MemFree: {} kB", free_kb)));
        assert!(text.contains(&format!("MemAvailable: {} kB", free_kb)));
    }

    /// With nothing used, free memory equals the total.
    #[test]
    fn test_generate_meminfo_zero_used() {
        let total = 512 * 1024 * 1024u64;
        let text = as_text(generate_meminfo(total, 0));

        let total_kb = total / 1024;
        assert!(text.contains(&format!("MemTotal: {} kB", total_kb)));
        assert!(text.contains(&format!("MemFree: {} kB", total_kb)));
    }

    /// Usage above the limit is clamped instead of underflowing.
    #[test]
    fn test_generate_meminfo_over_used() {
        let text = as_text(generate_meminfo(100 * 1024u64, 200 * 1024u64));
        assert!(text.contains("MemFree: 0 kB"));
    }

    /// Field offsets and NUL termination of an encoded record.
    #[test]
    fn test_build_dirent64() {
        let entry = build_dirent64(12345, 1, DT_DIR, "1234");
        assert_eq!(entry.len(), 24);

        let d_ino = u64::from_ne_bytes(entry[0..8].try_into().unwrap());
        let d_reclen = u16::from_ne_bytes(entry[16..18].try_into().unwrap());
        assert_eq!(d_ino, 12345);
        assert_eq!(d_reclen, 24);
        assert_eq!(entry[18], DT_DIR);
        assert_eq!(&entry[19..23], b"1234");
        assert_eq!(entry[23], 0);
    }

    /// Short names are padded up to the next multiple of eight bytes.
    #[test]
    fn test_build_dirent64_alignment() {
        assert_eq!(build_dirent64(1, 1, DT_REG, "ab").len(), 24);
    }

    /// Listing `/proc` always yields at least the non-PID entries.
    #[test]
    fn test_build_filtered_dirents() {
        let sandbox_pids: HashSet<i32> = std::iter::once(1_i32).collect();
        let entries = build_filtered_dirents(&sandbox_pids);
        assert!(!entries.is_empty());
    }
}