use crate::error::{NucleusError, Result};
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, BTreeSet};
use std::io::{BufRead, BufReader, Write};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use tracing::{debug, info, warn};
const DENY_SCOPE_REFRESH_INTERVAL: Duration = Duration::from_millis(250);
const DENY_SCOPE_STALE_PID_TTL: Duration = Duration::from_secs(5);
const DENY_SCOPE_POLL_TIMEOUT_MS: libc::c_int = 250;
const PROC_ROOT: &str = "/proc";
const CGROUP_V2_ROOT: &str = "/sys/fs/cgroup";
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceRecord {
pub syscall: i64,
pub name: Option<String>,
pub count: u64,
}
pub struct SeccompTraceReader {
pid: u32,
output_path: PathBuf,
stop: Arc<AtomicBool>,
handle: Option<JoinHandle<()>>,
}
impl SeccompTraceReader {
pub fn new(pid: u32, output_path: &Path) -> Self {
Self {
pid,
output_path: output_path.to_path_buf(),
stop: Arc::new(AtomicBool::new(false)),
handle: None,
}
}
pub fn start_recording(&mut self) -> Result<()> {
let pid = self.pid;
let output_path = self.output_path.clone();
let stop = self.stop.clone();
let handle = std::thread::spawn(move || {
if let Err(e) = record_loop(pid, &output_path, &stop) {
warn!("Seccomp trace reader error: {}", e);
}
});
self.handle = Some(handle);
info!("Seccomp trace reader started for PID {}", self.pid);
Ok(())
}
pub fn stop_and_flush(mut self) {
self.stop.store(true, Ordering::Release);
if let Some(handle) = self.handle.take() {
let _ = handle.join();
}
info!(
"Seccomp trace reader stopped, output at {:?}",
self.output_path
);
}
}
impl Drop for SeccompTraceReader {
fn drop(&mut self) {
self.stop.store(true, Ordering::Release);
if let Some(handle) = self.handle.take() {
let _ = handle.join();
}
}
}
fn record_loop(pid: u32, output_path: &Path, stop: &AtomicBool) -> Result<()> {
let mut syscalls: BTreeMap<i64, u64> = BTreeMap::new();
let kmsg_path = std::path::Path::new("/dev/kmsg");
if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
if meta.file_type().is_symlink() {
warn!("/dev/kmsg is a symlink – refusing to open for seccomp tracing");
write_trace_file(output_path, &syscalls)?;
return Ok(());
}
}
let file = match std::fs::File::open(kmsg_path) {
Ok(f) => f,
Err(e) => {
warn!(
"Cannot open /dev/kmsg for seccomp tracing: {} \
(requires root or CAP_SYSLOG). Falling back to no-trace mode.",
e
);
write_trace_file(output_path, &syscalls)?;
return Ok(());
}
};
use std::os::unix::io::AsRawFd;
let fd = file.as_raw_fd();
unsafe {
let flags = libc::fcntl(fd, libc::F_GETFL);
if flags >= 0 {
libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
}
}
let reader = BufReader::new(file);
let pid_pattern = format!("pid={}", pid);
for line in reader.lines() {
if stop.load(Ordering::Acquire) {
break;
}
let line = match line {
Ok(l) => l,
Err(e) => {
if e.kind() == std::io::ErrorKind::WouldBlock {
let mut pfd = libc::pollfd {
fd,
events: libc::POLLIN,
revents: 0,
};
unsafe { libc::poll(&mut pfd, 1, 2000) };
continue;
}
debug!("kmsg read error: {}", e);
continue;
}
};
if line.contains("type=1326") && line.contains(&pid_pattern) {
if let Some(nr) = extract_syscall_nr(&line) {
*syscalls.entry(nr).or_insert(0) += 1;
}
}
}
write_trace_file(output_path, &syscalls)?;
info!("Seccomp trace: recorded {} unique syscalls", syscalls.len());
Ok(())
}
fn extract_syscall_nr(line: &str) -> Option<i64> {
line.split_whitespace()
.find(|s| s.starts_with("syscall="))
.and_then(|s| s.strip_prefix("syscall="))
.and_then(|s| s.parse().ok())
}
fn extract_audit_pid(line: &str) -> Option<u32> {
line.split_whitespace()
.find(|s| s.starts_with("pid="))
.and_then(|s| s.strip_prefix("pid="))
.and_then(|s| s.parse().ok())
}
fn write_trace_file(path: &Path, syscalls: &BTreeMap<i64, u64>) -> Result<()> {
let mut file = std::fs::File::create(path).map_err(|e| {
NucleusError::ConfigError(format!("Failed to create trace file {:?}: {}", path, e))
})?;
for (&nr, &count) in syscalls {
let record = TraceRecord {
syscall: nr,
name: super::seccomp_generate::syscall_number_to_name(nr).map(String::from),
count,
};
let line =
serde_json::to_string(&record).unwrap_or_else(|e| format!("{{\"error\":\"{}\"}}", e));
writeln!(file, "{}", line).map_err(|e| {
NucleusError::ConfigError(format!("Failed to write trace record: {}", e))
})?;
}
Ok(())
}
pub struct SeccompDenyLogger {
pid: u32,
cgroup_path: Option<PathBuf>,
stop: Arc<AtomicBool>,
handle: Option<JoinHandle<()>>,
}
impl SeccompDenyLogger {
pub fn new(pid: u32, cgroup_path: Option<PathBuf>) -> Self {
Self {
pid,
cgroup_path,
stop: Arc::new(AtomicBool::new(false)),
handle: None,
}
}
pub fn start(&mut self) -> Result<()> {
let pid = self.pid;
let cgroup_path = self.cgroup_path.clone();
let stop = self.stop.clone();
let handle = std::thread::spawn(move || {
if let Err(e) = deny_log_loop(pid, cgroup_path, &stop) {
warn!("Seccomp deny logger error: {}", e);
}
});
self.handle = Some(handle);
debug!(
cgroup = self
.cgroup_path
.as_ref()
.map(|path| path.display().to_string()),
"Seccomp deny logger started for PID {}", self.pid
);
Ok(())
}
pub fn stop(mut self) {
self.stop.store(true, Ordering::Release);
if let Some(handle) = self.handle.take() {
let _ = handle.join();
}
}
}
impl Drop for SeccompDenyLogger {
fn drop(&mut self) {
self.stop.store(true, Ordering::Release);
if let Some(handle) = self.handle.take() {
let _ = handle.join();
}
}
}
#[derive(Debug)]
struct SeccompDenyScope {
target_pid: u32,
proc_root: PathBuf,
cgroup_path: Option<PathBuf>,
cgroup_relative_path: Option<String>,
target_pid_namespace: Option<String>,
known_pids: BTreeMap<u32, Instant>,
last_refresh: Option<Instant>,
}
impl SeccompDenyScope {
fn new(target_pid: u32, cgroup_path: Option<PathBuf>) -> Self {
Self::with_proc_root(target_pid, PathBuf::from(PROC_ROOT), cgroup_path, None)
}
fn with_proc_root(
target_pid: u32,
proc_root: PathBuf,
cgroup_path: Option<PathBuf>,
cgroup_relative_path: Option<String>,
) -> Self {
let cgroup_relative_path = cgroup_relative_path.or_else(|| {
cgroup_path
.as_deref()
.and_then(cgroup_relative_path_from_host_path)
});
Self {
target_pid,
proc_root,
cgroup_path,
cgroup_relative_path,
target_pid_namespace: None,
known_pids: BTreeMap::new(),
last_refresh: None,
}
}
fn matches_pid(&mut self, pid: u32, now: Instant) -> bool {
if pid == self.target_pid {
self.remember_pid(pid, now);
return true;
}
self.refresh_if_stale(now);
if self.has_recent_pid(pid, now) {
return true;
}
self.refresh(now);
if self.has_recent_pid(pid, now) {
return true;
}
if self.process_matches_cgroup(pid) || self.process_matches_pid_namespace(pid) {
self.remember_pid(pid, now);
return true;
}
false
}
fn refresh_if_stale(&mut self, now: Instant) {
let should_refresh = self
.last_refresh
.and_then(|last| now.checked_duration_since(last))
.map(|age| age >= DENY_SCOPE_REFRESH_INTERVAL)
.unwrap_or(true);
if should_refresh {
self.refresh(now);
}
}
fn refresh(&mut self, now: Instant) {
self.expire_stale_pids(now);
self.remember_pid(self.target_pid, now);
if self.target_pid_namespace.is_none() {
self.target_pid_namespace = read_pid_namespace(&self.proc_root, self.target_pid);
}
let mut scoped_pids = BTreeSet::new();
collect_process_tree_pids(&self.proc_root, self.target_pid, &mut scoped_pids);
for pid in scoped_pids {
self.remember_pid(pid, now);
}
if let Some(cgroup_path) = &self.cgroup_path {
for pid in read_pids_from_file(&cgroup_path.join("cgroup.procs")) {
self.remember_pid(pid, now);
}
}
self.last_refresh = Some(now);
}
fn remember_pid(&mut self, pid: u32, now: Instant) {
self.known_pids.insert(pid, now);
}
fn has_recent_pid(&self, pid: u32, now: Instant) -> bool {
self.known_pids
.get(&pid)
.map(|seen| is_recent(*seen, now))
.unwrap_or(false)
}
fn expire_stale_pids(&mut self, now: Instant) {
self.known_pids.retain(|_, seen| is_recent(*seen, now));
}
fn process_matches_cgroup(&self, pid: u32) -> bool {
let Some(expected) = self.cgroup_relative_path.as_deref() else {
return false;
};
let cgroup_file = self.proc_root.join(pid.to_string()).join("cgroup");
let Ok(content) = std::fs::read_to_string(cgroup_file) else {
return false;
};
cgroup_content_matches_path(&content, expected)
}
fn process_matches_pid_namespace(&self, pid: u32) -> bool {
let Some(target_ns) = self.target_pid_namespace.as_deref() else {
return false;
};
read_pid_namespace(&self.proc_root, pid)
.as_deref()
.map(|pid_ns| pid_ns == target_ns)
.unwrap_or(false)
}
}
fn is_recent(seen: Instant, now: Instant) -> bool {
now.checked_duration_since(seen)
.map(|age| age <= DENY_SCOPE_STALE_PID_TTL)
.unwrap_or(true)
}
fn collect_process_tree_pids(proc_root: &Path, root_pid: u32, out: &mut BTreeSet<u32>) {
let mut stack = vec![root_pid];
let mut visited = BTreeSet::new();
while let Some(pid) = stack.pop() {
if !visited.insert(pid) {
continue;
}
out.insert(pid);
stack.extend(read_child_pids(proc_root, pid));
}
}
fn read_child_pids(proc_root: &Path, pid: u32) -> Vec<u32> {
let task_dir = proc_root.join(pid.to_string()).join("task");
let Ok(entries) = std::fs::read_dir(task_dir) else {
return Vec::new();
};
let mut children = Vec::new();
for entry in entries.flatten() {
let children_path = entry.path().join("children");
if let Ok(content) = std::fs::read_to_string(children_path) {
children.extend(parse_pid_list(&content));
}
}
children
}
fn read_pids_from_file(path: &Path) -> Vec<u32> {
std::fs::read_to_string(path)
.map(|content| parse_pid_list(&content))
.unwrap_or_default()
}
fn parse_pid_list(content: &str) -> Vec<u32> {
content
.split_whitespace()
.filter_map(|pid| pid.parse::<u32>().ok())
.collect()
}
fn read_pid_namespace(proc_root: &Path, pid: u32) -> Option<String> {
std::fs::read_link(proc_root.join(pid.to_string()).join("ns").join("pid"))
.ok()
.map(|path| path.to_string_lossy().into_owned())
}
fn cgroup_relative_path_from_host_path(cgroup_path: &Path) -> Option<String> {
let root = Path::new(CGROUP_V2_ROOT);
let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
let canonical_path =
std::fs::canonicalize(cgroup_path).unwrap_or_else(|_| cgroup_path.to_path_buf());
let relative = canonical_path.strip_prefix(canonical_root).ok()?;
Some(normalize_cgroup_path(&format!(
"/{}",
relative.to_string_lossy()
)))
}
fn cgroup_content_matches_path(content: &str, expected: &str) -> bool {
let expected = normalize_cgroup_path(expected);
content
.lines()
.filter_map(|line| line.rsplit_once(':').map(|(_, path)| path.trim()))
.any(|actual| cgroup_path_contains(&normalize_cgroup_path(actual), &expected))
}
fn cgroup_path_contains(actual: &str, expected: &str) -> bool {
if expected == "/" {
return actual == "/";
}
actual == expected
|| actual
.strip_prefix(expected)
.map(|suffix| suffix.starts_with('/'))
.unwrap_or(false)
}
fn normalize_cgroup_path(path: &str) -> String {
let trimmed = path.trim().trim_end_matches('/');
if trimmed.is_empty() {
return "/".to_string();
}
if trimmed.starts_with('/') {
trimmed.to_string()
} else {
format!("/{}", trimmed)
}
}
fn deny_log_loop(pid: u32, cgroup_path: Option<PathBuf>, stop: &AtomicBool) -> Result<()> {
let kmsg_path = std::path::Path::new("/dev/kmsg");
if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
if meta.file_type().is_symlink() {
warn!("/dev/kmsg is a symlink – refusing to open for seccomp deny logging");
return Ok(());
}
}
let file = match std::fs::File::open(kmsg_path) {
Ok(f) => f,
Err(e) => {
warn!(
"Cannot open /dev/kmsg for seccomp deny logging: {} \
(requires root or CAP_SYSLOG)",
e
);
return Ok(());
}
};
use std::os::unix::io::AsRawFd;
let fd = file.as_raw_fd();
unsafe {
let flags = libc::fcntl(fd, libc::F_GETFL);
if flags >= 0 {
libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
}
}
let reader = BufReader::new(file);
let mut scope = SeccompDenyScope::new(pid, cgroup_path);
scope.refresh(Instant::now());
for line in reader.lines() {
if stop.load(Ordering::Acquire) {
break;
}
let line = match line {
Ok(l) => l,
Err(e) => {
if e.kind() == std::io::ErrorKind::WouldBlock {
scope.refresh_if_stale(Instant::now());
let mut pfd = libc::pollfd {
fd,
events: libc::POLLIN,
revents: 0,
};
unsafe { libc::poll(&mut pfd, 1, DENY_SCOPE_POLL_TIMEOUT_MS) };
continue;
}
debug!("kmsg read error: {}", e);
continue;
}
};
if let Some((audit_pid, nr)) =
denied_syscall_record_for_scope(&line, &mut scope, Instant::now())
{
let name = super::seccomp_generate::syscall_number_to_name(nr).unwrap_or("unknown");
warn!(
syscall = nr,
name = name,
pid = audit_pid,
target_pid = pid,
"seccomp denied syscall"
);
}
}
Ok(())
}
fn denied_syscall_record_for_scope(
line: &str,
scope: &mut SeccompDenyScope,
now: Instant,
) -> Option<(u32, i64)> {
if !line.contains("type=1326") {
return None;
}
let pid = extract_audit_pid(line)?;
if !scope.matches_pid(pid, now) {
return None;
}
extract_syscall_nr(line).map(|nr| (pid, nr))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_syscall_nr() {
let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=42 comm=\"test\" exe=\"/bin/test\" sig=0 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x7fff0000";
assert_eq!(extract_syscall_nr(line), Some(257));
}
#[test]
fn test_extract_syscall_nr_missing() {
assert_eq!(extract_syscall_nr("no syscall here"), None);
}
#[test]
fn test_extract_audit_pid_ignores_ppid() {
let line = "audit: type=1326 audit(123:456): ppid=7 pid=42 comm=\"test\" syscall=257";
assert_eq!(extract_audit_pid(line), Some(42));
}
#[test]
fn test_deny_scope_matches_forked_child_audit_pid() {
let temp = tempfile::tempdir().unwrap();
let target_task = temp.path().join("42/task/42");
std::fs::create_dir_all(&target_task).unwrap();
std::fs::write(target_task.join("children"), "43\n").unwrap();
let mut scope = SeccompDenyScope::with_proc_root(42, temp.path().to_path_buf(), None, None);
let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"probe\" exe=\"/bin/probe\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";
assert_eq!(
denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
Some((43, 257))
);
}
#[test]
fn test_deny_scope_rejects_unrelated_seccomp_pid() {
let temp = tempfile::tempdir().unwrap();
let target_task = temp.path().join("42/task/42");
std::fs::create_dir_all(&target_task).unwrap();
std::fs::write(target_task.join("children"), "").unwrap();
let mut scope = SeccompDenyScope::with_proc_root(42, temp.path().to_path_buf(), None, None);
let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"other\" exe=\"/bin/other\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";
assert_eq!(
denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
None
);
}
#[test]
fn test_deny_scope_matches_cgroup_member_audit_pid() {
let temp = tempfile::tempdir().unwrap();
let proc_root = temp.path().join("proc");
let cgroup_dir = temp.path().join("cgroup");
std::fs::create_dir_all(proc_root.join("42/task/42")).unwrap();
std::fs::create_dir_all(proc_root.join("43")).unwrap();
std::fs::create_dir_all(&cgroup_dir).unwrap();
std::fs::write(proc_root.join("42/task/42/children"), "").unwrap();
std::fs::write(cgroup_dir.join("cgroup.procs"), "43\n").unwrap();
let mut scope = SeccompDenyScope::with_proc_root(
42,
proc_root,
Some(cgroup_dir),
Some("/nucleus-test".to_string()),
);
let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"probe\" exe=\"/bin/probe\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";
assert_eq!(
denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
Some((43, 257))
);
}
#[test]
fn test_cgroup_content_matches_subgroup_membership() {
assert!(cgroup_content_matches_path(
"0::/nucleus-test/workers\n",
"/nucleus-test"
));
assert!(!cgroup_content_matches_path(
"0::/nucleus-other\n",
"/nucleus-test"
));
}
fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
let fn_start = source
.find(fn_signature)
.unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
let after = &source[fn_start..];
let open = after
.find('{')
.unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
let mut depth = 0u32;
let mut end = open;
for (i, ch) in after[open..].char_indices() {
match ch {
'{' => depth += 1,
'}' => {
depth -= 1;
if depth == 0 {
end = open + i + 1;
break;
}
}
_ => {}
}
}
&after[..end]
}
#[test]
fn test_reader_uses_nonblocking_io() {
let source = include_str!("seccomp_trace.rs");
let fn_body = extract_fn_body(source, "fn record_loop");
assert!(
fn_body.contains("O_NONBLOCK"),
"record_loop must use O_NONBLOCK for non-blocking reads on /dev/kmsg"
);
assert!(
fn_body.contains("libc::poll"),
"record_loop must use poll() for timed waits on /dev/kmsg"
);
let setsockopt_lines: Vec<&str> = fn_body
.lines()
.filter(|l| !l.trim().starts_with("//"))
.filter(|l| l.contains("setsockopt"))
.collect();
assert!(
setsockopt_lines.is_empty(),
"record_loop must not call setsockopt on /dev/kmsg"
);
}
#[test]
fn test_trace_record_serialization() {
let record = TraceRecord {
syscall: 0,
name: Some("read".to_string()),
count: 42,
};
let json = serde_json::to_string(&record).unwrap();
assert!(json.contains("\"syscall\":0"));
assert!(json.contains("\"name\":\"read\""));
assert!(json.contains("\"count\":42"));
}
}