Skip to main content

nucleus/security/
seccomp_trace.rs

1//! Seccomp trace mode: record syscalls for profile generation.
2//!
3//! In trace mode, an allow-all seccomp filter is installed with
4//! `SECCOMP_FILTER_FLAG_LOG`, causing the kernel to log every syscall to
5//! the audit subsystem. A reader thread monitors `/dev/kmsg` for
6//! SECCOMP audit records matching the container PID and writes unique
7//! syscalls to an NDJSON trace file.
8//!
9//! This is a development tool – requires root or CAP_SYSLOG for
10//! `/dev/kmsg` access.
11
12use crate::error::{NucleusError, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::{BTreeMap, BTreeSet};
15use std::io::{BufRead, BufReader, Write};
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18use std::sync::Arc;
19use std::thread::JoinHandle;
20use std::time::{Duration, Instant};
21use tracing::{debug, info, warn};
22
/// Minimum age of the last deny-scope refresh before `refresh_if_stale`
/// triggers another one.
const DENY_SCOPE_REFRESH_INTERVAL: Duration = Duration::from_millis(250);
/// How long a PID stays in the deny scope's known-PID cache after it was last
/// recorded (see `is_recent`).
const DENY_SCOPE_STALE_PID_TTL: Duration = Duration::from_secs(5);
/// Timeout, in milliseconds, passed to `poll()` by the deny-log loop when
/// `/dev/kmsg` has no data.
const DENY_SCOPE_POLL_TIMEOUT_MS: libc::c_int = 250;
/// Default procfs root used by `SeccompDenyScope::new`.
const PROC_ROOT: &str = "/proc";
/// Host mount point that cgroup paths are made relative to in
/// `cgroup_relative_path_from_host_path`.
const CGROUP_V2_ROOT: &str = "/sys/fs/cgroup";
28
/// A single trace record in the NDJSON output.
///
/// `write_trace_file` emits one record per distinct syscall number, each
/// serialized as one JSON object on its own line.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceRecord {
    /// Syscall number (e.g. 0 for read on x86_64).
    pub syscall: i64,
    /// Syscall name if known.
    pub name: Option<String>,
    /// Number of times this syscall was observed.
    pub count: u64,
}
39
/// Reads `/dev/kmsg` for SECCOMP audit records and collects unique syscalls.
///
/// The actual reading happens on a background thread spawned by
/// `start_recording`; the thread is signalled through `stop` and joined by
/// `stop_and_flush` or on drop.
pub struct SeccompTraceReader {
    /// PID whose SECCOMP audit records are collected.
    pid: u32,
    /// Destination of the NDJSON trace file.
    output_path: PathBuf,
    /// Shared flag; set to `true` to ask the reader thread to exit.
    stop: Arc<AtomicBool>,
    /// Join handle of the reader thread; `None` until `start_recording` runs
    /// and again after the thread has been joined.
    handle: Option<JoinHandle<()>>,
}
47
48impl SeccompTraceReader {
49    /// Create a new trace reader for the given child PID.
50    pub fn new(pid: u32, output_path: &Path) -> Self {
51        Self {
52            pid,
53            output_path: output_path.to_path_buf(),
54            stop: Arc::new(AtomicBool::new(false)),
55            handle: None,
56        }
57    }
58
59    /// Start the background reader thread.
60    ///
61    /// Opens `/dev/kmsg` and filters for `audit: type=1326` (SECCOMP)
62    /// messages matching the target PID.
63    pub fn start_recording(&mut self) -> Result<()> {
64        let pid = self.pid;
65        let output_path = self.output_path.clone();
66        let stop = self.stop.clone();
67
68        let handle = std::thread::spawn(move || {
69            if let Err(e) = record_loop(pid, &output_path, &stop) {
70                warn!("Seccomp trace reader error: {}", e);
71            }
72        });
73
74        self.handle = Some(handle);
75        info!("Seccomp trace reader started for PID {}", self.pid);
76        Ok(())
77    }
78
79    /// Signal the reader to stop and wait for it to flush.
80    pub fn stop_and_flush(mut self) {
81        self.stop.store(true, Ordering::Release);
82        if let Some(handle) = self.handle.take() {
83            let _ = handle.join();
84        }
85        info!(
86            "Seccomp trace reader stopped, output at {:?}",
87            self.output_path
88        );
89    }
90}
91
92impl Drop for SeccompTraceReader {
93    fn drop(&mut self) {
94        self.stop.store(true, Ordering::Release);
95        if let Some(handle) = self.handle.take() {
96            let _ = handle.join();
97        }
98    }
99}
100
101/// Main recording loop – reads /dev/kmsg and extracts SECCOMP records.
102fn record_loop(pid: u32, output_path: &Path, stop: &AtomicBool) -> Result<()> {
103    let mut syscalls: BTreeMap<i64, u64> = BTreeMap::new();
104
105    // Verify /dev/kmsg is not a symlink before opening
106    let kmsg_path = std::path::Path::new("/dev/kmsg");
107    if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
108        if meta.file_type().is_symlink() {
109            warn!("/dev/kmsg is a symlink – refusing to open for seccomp tracing");
110            write_trace_file(output_path, &syscalls)?;
111            return Ok(());
112        }
113    }
114
115    // Open /dev/kmsg for reading (requires CAP_SYSLOG or root)
116    let file = match std::fs::File::open(kmsg_path) {
117        Ok(f) => f,
118        Err(e) => {
119            warn!(
120                "Cannot open /dev/kmsg for seccomp tracing: {} \
121                 (requires root or CAP_SYSLOG). Falling back to no-trace mode.",
122                e
123            );
124            // Write empty trace file
125            write_trace_file(output_path, &syscalls)?;
126            return Ok(());
127        }
128    };
129
130    // Set O_NONBLOCK so reads don't block indefinitely. We use poll() with a
131    // timeout to periodically check the stop flag. The previous setsockopt(SO_RCVTIMEO)
132    // approach was incorrect: /dev/kmsg is a character device, not a socket, so
133    // setsockopt silently fails with ENOTSOCK.
134    use std::os::unix::io::AsRawFd;
135    let fd = file.as_raw_fd();
136    // SAFETY: fd is a valid file descriptor from File::open("/dev/kmsg").
137    // F_GETFL/F_SETFL only modify the file status flags; O_NONBLOCK is safe
138    // to set and required for poll-based reading.
139    unsafe {
140        let flags = libc::fcntl(fd, libc::F_GETFL);
141        if flags >= 0 {
142            libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
143        }
144    }
145
146    let reader = BufReader::new(file);
147    let pid_pattern = format!("pid={}", pid);
148
149    for line in reader.lines() {
150        if stop.load(Ordering::Acquire) {
151            break;
152        }
153
154        let line = match line {
155            Ok(l) => l,
156            Err(e) => {
157                if e.kind() == std::io::ErrorKind::WouldBlock {
158                    // No data available – poll with 2s timeout, then check stop flag
159                    let mut pfd = libc::pollfd {
160                        fd,
161                        events: libc::POLLIN,
162                        revents: 0,
163                    };
164                    // SAFETY: pfd is a valid stack-allocated pollfd with a valid fd.
165                    // poll with nfds=1 and timeout=2000ms is safe; it only blocks.
166                    unsafe { libc::poll(&mut pfd, 1, 2000) };
167                    continue;
168                }
169                debug!("kmsg read error: {}", e);
170                continue;
171            }
172        };
173
174        // SECCOMP audit lines look like:
175        // audit: type=1326 ... pid=<PID> ... syscall=<NR> ...
176        if line.contains("type=1326") && line.contains(&pid_pattern) {
177            if let Some(nr) = extract_syscall_nr(&line) {
178                *syscalls.entry(nr).or_insert(0) += 1;
179            }
180        }
181    }
182
183    write_trace_file(output_path, &syscalls)?;
184    info!("Seccomp trace: recorded {} unique syscalls", syscalls.len());
185    Ok(())
186}
187
/// Pull the `syscall=<NR>` field out of an audit SECCOMP line.
///
/// Only the first whitespace-separated token that begins with `syscall=` is
/// considered. Returns `None` when no such token exists or when its value
/// does not parse as an `i64`.
fn extract_syscall_nr(line: &str) -> Option<i64> {
    const KEY: &str = "syscall=";
    for token in line.split_whitespace() {
        if let Some(value) = token.strip_prefix(KEY) {
            // First match decides the outcome, even if it fails to parse.
            return value.parse::<i64>().ok();
        }
    }
    None
}
196
/// Pull the PID of the process that made the syscall out of an audit
/// SECCOMP line.
///
/// Only the first whitespace-separated token that begins with `pid=` is
/// considered, so a `ppid=` token is never mistaken for it. Returns `None`
/// when no such token exists or when its value does not parse as a `u32`.
fn extract_audit_pid(line: &str) -> Option<u32> {
    const KEY: &str = "pid=";
    for token in line.split_whitespace() {
        if let Some(value) = token.strip_prefix(KEY) {
            // First match decides the outcome, even if it fails to parse.
            return value.parse::<u32>().ok();
        }
    }
    None
}
204
205/// Write the accumulated trace data as NDJSON.
206fn write_trace_file(path: &Path, syscalls: &BTreeMap<i64, u64>) -> Result<()> {
207    let mut file = std::fs::File::create(path).map_err(|e| {
208        NucleusError::ConfigError(format!("Failed to create trace file {:?}: {}", path, e))
209    })?;
210
211    for (&nr, &count) in syscalls {
212        let record = TraceRecord {
213            syscall: nr,
214            name: super::seccomp_generate::syscall_number_to_name(nr).map(String::from),
215            count,
216        };
217        let line =
218            serde_json::to_string(&record).unwrap_or_else(|e| format!("{{\"error\":\"{}\"}}", e));
219        writeln!(file, "{}", line).map_err(|e| {
220            NucleusError::ConfigError(format!("Failed to write trace record: {}", e))
221        })?;
222    }
223
224    Ok(())
225}
226
/// Reads `/dev/kmsg` for SECCOMP deny records and emits WARN-level logs.
///
/// When `--seccomp-log-denied` is set with `SECCOMP_FILTER_FLAG_LOG`, the
/// kernel logs denied syscalls to the audit subsystem. This reader runs in
/// the parent process (which survives the child kill) and surfaces those
/// records as application-level warnings. The audit record PID is matched
/// against the container's target process, descendants, cgroup, and PID
/// namespace so forked workload denials are not silently dropped.
pub struct SeccompDenyLogger {
    /// PID of the container's target process.
    pid: u32,
    /// Optional cgroup directory of the container; handed to the deny-log
    /// loop to widen the set of PIDs considered in scope.
    cgroup_path: Option<PathBuf>,
    /// Shared flag; set to `true` to ask the logger thread to exit.
    stop: Arc<AtomicBool>,
    /// Join handle of the logger thread; `None` until `start` runs and again
    /// after the thread has been joined.
    handle: Option<JoinHandle<()>>,
}
241
242impl SeccompDenyLogger {
243    pub fn new(pid: u32, cgroup_path: Option<PathBuf>) -> Self {
244        Self {
245            pid,
246            cgroup_path,
247            stop: Arc::new(AtomicBool::new(false)),
248            handle: None,
249        }
250    }
251
252    /// Start the background reader thread.
253    pub fn start(&mut self) -> Result<()> {
254        let pid = self.pid;
255        let cgroup_path = self.cgroup_path.clone();
256        let stop = self.stop.clone();
257
258        let handle = std::thread::spawn(move || {
259            if let Err(e) = deny_log_loop(pid, cgroup_path, &stop) {
260                warn!("Seccomp deny logger error: {}", e);
261            }
262        });
263
264        self.handle = Some(handle);
265        debug!(
266            cgroup = self
267                .cgroup_path
268                .as_ref()
269                .map(|path| path.display().to_string()),
270            "Seccomp deny logger started for PID {}", self.pid
271        );
272        Ok(())
273    }
274
275    /// Signal the logger to stop and join the thread.
276    pub fn stop(mut self) {
277        self.stop.store(true, Ordering::Release);
278        if let Some(handle) = self.handle.take() {
279            let _ = handle.join();
280        }
281    }
282}
283
284impl Drop for SeccompDenyLogger {
285    fn drop(&mut self) {
286        self.stop.store(true, Ordering::Release);
287        if let Some(handle) = self.handle.take() {
288            let _ = handle.join();
289        }
290    }
291}
292
/// Tracks which PIDs count as belonging to the container whose seccomp
/// denials are being logged.
///
/// A PID is in scope when it is the target process, was found during a
/// refresh (process tree of the target or the cgroup's `cgroup.procs`), or
/// matches the target's cgroup path or PID namespace when looked up directly.
#[derive(Debug)]
struct SeccompDenyScope {
    /// PID of the container's target process; always in scope.
    target_pid: u32,
    /// Root of the procfs tree to inspect (`/proc` via `new`; overridable
    /// through `with_proc_root`).
    proc_root: PathBuf,
    /// Host cgroup directory whose `cgroup.procs` is read on refresh.
    cgroup_path: Option<PathBuf>,
    /// Cgroup path compared against the entries of `<proc_root>/<pid>/cgroup`.
    cgroup_relative_path: Option<String>,
    /// Link target of `<proc_root>/<target_pid>/ns/pid`, kept once it has
    /// been read successfully.
    target_pid_namespace: Option<String>,
    /// In-scope PIDs mapped to the instant they were last recorded; entries
    /// older than `DENY_SCOPE_STALE_PID_TTL` are dropped on refresh.
    known_pids: BTreeMap<u32, Instant>,
    /// Instant of the last refresh; `None` before the first one.
    last_refresh: Option<Instant>,
}
303
304impl SeccompDenyScope {
305    fn new(target_pid: u32, cgroup_path: Option<PathBuf>) -> Self {
306        Self::with_proc_root(target_pid, PathBuf::from(PROC_ROOT), cgroup_path, None)
307    }
308
309    fn with_proc_root(
310        target_pid: u32,
311        proc_root: PathBuf,
312        cgroup_path: Option<PathBuf>,
313        cgroup_relative_path: Option<String>,
314    ) -> Self {
315        let cgroup_relative_path = cgroup_relative_path.or_else(|| {
316            cgroup_path
317                .as_deref()
318                .and_then(cgroup_relative_path_from_host_path)
319        });
320        Self {
321            target_pid,
322            proc_root,
323            cgroup_path,
324            cgroup_relative_path,
325            target_pid_namespace: None,
326            known_pids: BTreeMap::new(),
327            last_refresh: None,
328        }
329    }
330
331    fn matches_pid(&mut self, pid: u32, now: Instant) -> bool {
332        if pid == self.target_pid {
333            self.remember_pid(pid, now);
334            return true;
335        }
336
337        self.refresh_if_stale(now);
338        if self.has_recent_pid(pid, now) {
339            return true;
340        }
341
342        // A deny line for an unknown PID is exactly the forked-workload case.
343        // Force a fresh scope scan before deciding that the audit record belongs
344        // to some other process on the host.
345        self.refresh(now);
346        if self.has_recent_pid(pid, now) {
347            return true;
348        }
349
350        if self.process_matches_cgroup(pid) || self.process_matches_pid_namespace(pid) {
351            self.remember_pid(pid, now);
352            return true;
353        }
354
355        false
356    }
357
358    fn refresh_if_stale(&mut self, now: Instant) {
359        let should_refresh = self
360            .last_refresh
361            .and_then(|last| now.checked_duration_since(last))
362            .map(|age| age >= DENY_SCOPE_REFRESH_INTERVAL)
363            .unwrap_or(true);
364        if should_refresh {
365            self.refresh(now);
366        }
367    }
368
369    fn refresh(&mut self, now: Instant) {
370        self.expire_stale_pids(now);
371        self.remember_pid(self.target_pid, now);
372
373        if self.target_pid_namespace.is_none() {
374            self.target_pid_namespace = read_pid_namespace(&self.proc_root, self.target_pid);
375        }
376
377        let mut scoped_pids = BTreeSet::new();
378        collect_process_tree_pids(&self.proc_root, self.target_pid, &mut scoped_pids);
379        for pid in scoped_pids {
380            self.remember_pid(pid, now);
381        }
382
383        if let Some(cgroup_path) = &self.cgroup_path {
384            for pid in read_pids_from_file(&cgroup_path.join("cgroup.procs")) {
385                self.remember_pid(pid, now);
386            }
387        }
388
389        self.last_refresh = Some(now);
390    }
391
392    fn remember_pid(&mut self, pid: u32, now: Instant) {
393        self.known_pids.insert(pid, now);
394    }
395
396    fn has_recent_pid(&self, pid: u32, now: Instant) -> bool {
397        self.known_pids
398            .get(&pid)
399            .map(|seen| is_recent(*seen, now))
400            .unwrap_or(false)
401    }
402
403    fn expire_stale_pids(&mut self, now: Instant) {
404        self.known_pids.retain(|_, seen| is_recent(*seen, now));
405    }
406
407    fn process_matches_cgroup(&self, pid: u32) -> bool {
408        let Some(expected) = self.cgroup_relative_path.as_deref() else {
409            return false;
410        };
411        let cgroup_file = self.proc_root.join(pid.to_string()).join("cgroup");
412        let Ok(content) = std::fs::read_to_string(cgroup_file) else {
413            return false;
414        };
415        cgroup_content_matches_path(&content, expected)
416    }
417
418    fn process_matches_pid_namespace(&self, pid: u32) -> bool {
419        let Some(target_ns) = self.target_pid_namespace.as_deref() else {
420            return false;
421        };
422        read_pid_namespace(&self.proc_root, pid)
423            .as_deref()
424            .map(|pid_ns| pid_ns == target_ns)
425            .unwrap_or(false)
426    }
427}
428
429fn is_recent(seen: Instant, now: Instant) -> bool {
430    now.checked_duration_since(seen)
431        .map(|age| age <= DENY_SCOPE_STALE_PID_TTL)
432        .unwrap_or(true)
433}
434
435fn collect_process_tree_pids(proc_root: &Path, root_pid: u32, out: &mut BTreeSet<u32>) {
436    let mut stack = vec![root_pid];
437    let mut visited = BTreeSet::new();
438
439    while let Some(pid) = stack.pop() {
440        if !visited.insert(pid) {
441            continue;
442        }
443        out.insert(pid);
444        stack.extend(read_child_pids(proc_root, pid));
445    }
446}
447
448fn read_child_pids(proc_root: &Path, pid: u32) -> Vec<u32> {
449    let task_dir = proc_root.join(pid.to_string()).join("task");
450    let Ok(entries) = std::fs::read_dir(task_dir) else {
451        return Vec::new();
452    };
453
454    let mut children = Vec::new();
455    for entry in entries.flatten() {
456        let children_path = entry.path().join("children");
457        if let Ok(content) = std::fs::read_to_string(children_path) {
458            children.extend(parse_pid_list(&content));
459        }
460    }
461    children
462}
463
464fn read_pids_from_file(path: &Path) -> Vec<u32> {
465    std::fs::read_to_string(path)
466        .map(|content| parse_pid_list(&content))
467        .unwrap_or_default()
468}
469
/// Parse a whitespace-separated list of PIDs, silently skipping any token
/// that is not a valid `u32`.
fn parse_pid_list(content: &str) -> Vec<u32> {
    let mut pids = Vec::new();
    for token in content.split_whitespace() {
        if let Ok(pid) = token.parse::<u32>() {
            pids.push(pid);
        }
    }
    pids
}
476
/// Read the target of the `<proc_root>/<pid>/ns/pid` symlink as a string.
///
/// Returns `None` when the link cannot be read. Non-UTF-8 bytes in the link
/// target are replaced lossily.
fn read_pid_namespace(proc_root: &Path, pid: u32) -> Option<String> {
    let ns_link = proc_root.join(pid.to_string()).join("ns").join("pid");
    match std::fs::read_link(ns_link) {
        Ok(target) => Some(target.to_string_lossy().into_owned()),
        Err(_) => None,
    }
}
482
483fn cgroup_relative_path_from_host_path(cgroup_path: &Path) -> Option<String> {
484    let root = Path::new(CGROUP_V2_ROOT);
485    let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
486    let canonical_path =
487        std::fs::canonicalize(cgroup_path).unwrap_or_else(|_| cgroup_path.to_path_buf());
488    let relative = canonical_path.strip_prefix(canonical_root).ok()?;
489    Some(normalize_cgroup_path(&format!(
490        "/{}",
491        relative.to_string_lossy()
492    )))
493}
494
495fn cgroup_content_matches_path(content: &str, expected: &str) -> bool {
496    let expected = normalize_cgroup_path(expected);
497    content
498        .lines()
499        .filter_map(|line| line.rsplit_once(':').map(|(_, path)| path.trim()))
500        .any(|actual| cgroup_path_contains(&normalize_cgroup_path(actual), &expected))
501}
502
/// Whether cgroup path `actual` equals `expected` or lies beneath it.
///
/// Matching respects path-component boundaries (`/a` does not contain
/// `/ab`). The root path `/` only matches itself.
fn cgroup_path_contains(actual: &str, expected: &str) -> bool {
    if expected == "/" {
        return actual == "/";
    }
    match actual.strip_prefix(expected) {
        // An empty remainder means the two paths are identical.
        Some(rest) => rest.is_empty() || rest.starts_with('/'),
        None => false,
    }
}
513
/// Normalize a cgroup path: trim surrounding whitespace, drop trailing
/// slashes and ensure a single leading `/`.
///
/// An empty or all-slash input normalizes to `/`.
fn normalize_cgroup_path(path: &str) -> String {
    match path.trim().trim_end_matches('/') {
        "" => String::from("/"),
        absolute if absolute.starts_with('/') => absolute.to_owned(),
        relative => format!("/{}", relative),
    }
}
525
526/// Main deny-log loop – reads /dev/kmsg and emits WARN for denied syscalls.
527fn deny_log_loop(pid: u32, cgroup_path: Option<PathBuf>, stop: &AtomicBool) -> Result<()> {
528    let kmsg_path = std::path::Path::new("/dev/kmsg");
529    if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
530        if meta.file_type().is_symlink() {
531            warn!("/dev/kmsg is a symlink – refusing to open for seccomp deny logging");
532            return Ok(());
533        }
534    }
535
536    let file = match std::fs::File::open(kmsg_path) {
537        Ok(f) => f,
538        Err(e) => {
539            warn!(
540                "Cannot open /dev/kmsg for seccomp deny logging: {} \
541                 (requires root or CAP_SYSLOG)",
542                e
543            );
544            return Ok(());
545        }
546    };
547
548    use std::os::unix::io::AsRawFd;
549    let fd = file.as_raw_fd();
550    // SAFETY: fd is a valid file descriptor from File::open("/dev/kmsg").
551    // F_GETFL/F_SETFL only modify the file status flags; O_NONBLOCK is safe
552    // to set and required for poll-based reading.
553    unsafe {
554        let flags = libc::fcntl(fd, libc::F_GETFL);
555        if flags >= 0 {
556            libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
557        }
558    }
559
560    let reader = BufReader::new(file);
561    let mut scope = SeccompDenyScope::new(pid, cgroup_path);
562    scope.refresh(Instant::now());
563
564    for line in reader.lines() {
565        if stop.load(Ordering::Acquire) {
566            break;
567        }
568
569        let line = match line {
570            Ok(l) => l,
571            Err(e) => {
572                if e.kind() == std::io::ErrorKind::WouldBlock {
573                    scope.refresh_if_stale(Instant::now());
574                    let mut pfd = libc::pollfd {
575                        fd,
576                        events: libc::POLLIN,
577                        revents: 0,
578                    };
579                    // SAFETY: pfd is a valid stack-allocated pollfd with a valid fd.
580                    // poll with nfds=1 and a bounded timeout is safe; it only blocks.
581                    unsafe { libc::poll(&mut pfd, 1, DENY_SCOPE_POLL_TIMEOUT_MS) };
582                    continue;
583                }
584                debug!("kmsg read error: {}", e);
585                continue;
586            }
587        };
588
589        if let Some((audit_pid, nr)) =
590            denied_syscall_record_for_scope(&line, &mut scope, Instant::now())
591        {
592            let name = super::seccomp_generate::syscall_number_to_name(nr).unwrap_or("unknown");
593            warn!(
594                syscall = nr,
595                name = name,
596                pid = audit_pid,
597                target_pid = pid,
598                "seccomp denied syscall"
599            );
600        }
601    }
602
603    Ok(())
604}
605
606fn denied_syscall_record_for_scope(
607    line: &str,
608    scope: &mut SeccompDenyScope,
609    now: Instant,
610) -> Option<(u32, i64)> {
611    if !line.contains("type=1326") {
612        return None;
613    }
614    let pid = extract_audit_pid(line)?;
615    if !scope.matches_pid(pid, now) {
616        return None;
617    }
618    extract_syscall_nr(line).map(|nr| (pid, nr))
619}
620
// Unit tests for the audit-line parsers, the deny-scope PID matching (run
// against a fake procfs tree in a temp directory) and trace serialization.
#[cfg(test)]
mod tests {
    use super::*;

    // A full kmsg SECCOMP record yields its `syscall=` number.
    #[test]
    fn test_extract_syscall_nr() {
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=42 comm=\"test\" exe=\"/bin/test\" sig=0 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x7fff0000";
        assert_eq!(extract_syscall_nr(line), Some(257));
    }

    // A line without a `syscall=` token yields `None`.
    #[test]
    fn test_extract_syscall_nr_missing() {
        assert_eq!(extract_syscall_nr("no syscall here"), None);
    }

    // `ppid=` must not be mistaken for the `pid=` field.
    #[test]
    fn test_extract_audit_pid_ignores_ppid() {
        let line = "audit: type=1326 audit(123:456): ppid=7 pid=42 comm=\"test\" syscall=257";
        assert_eq!(extract_audit_pid(line), Some(42));
    }

    // PID 43 is listed as a child of target 42 in the fake procfs tree, so
    // its deny record is in scope.
    #[test]
    fn test_deny_scope_matches_forked_child_audit_pid() {
        let temp = tempfile::tempdir().unwrap();
        let target_task = temp.path().join("42/task/42");
        std::fs::create_dir_all(&target_task).unwrap();
        std::fs::write(target_task.join("children"), "43\n").unwrap();

        let mut scope = SeccompDenyScope::with_proc_root(42, temp.path().to_path_buf(), None, None);
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"probe\" exe=\"/bin/probe\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";

        assert_eq!(
            denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
            Some((43, 257))
        );
    }

    // Target 42 has no children and no cgroup is configured, so a record for
    // PID 43 is out of scope.
    #[test]
    fn test_deny_scope_rejects_unrelated_seccomp_pid() {
        let temp = tempfile::tempdir().unwrap();
        let target_task = temp.path().join("42/task/42");
        std::fs::create_dir_all(&target_task).unwrap();
        std::fs::write(target_task.join("children"), "").unwrap();

        let mut scope = SeccompDenyScope::with_proc_root(42, temp.path().to_path_buf(), None, None);
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"other\" exe=\"/bin/other\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";

        assert_eq!(
            denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
            None
        );
    }

    // PID 43 is not a child of 42 but is listed in the cgroup's
    // `cgroup.procs`, so its deny record is in scope.
    #[test]
    fn test_deny_scope_matches_cgroup_member_audit_pid() {
        let temp = tempfile::tempdir().unwrap();
        let proc_root = temp.path().join("proc");
        let cgroup_dir = temp.path().join("cgroup");
        std::fs::create_dir_all(proc_root.join("42/task/42")).unwrap();
        std::fs::create_dir_all(proc_root.join("43")).unwrap();
        std::fs::create_dir_all(&cgroup_dir).unwrap();
        std::fs::write(proc_root.join("42/task/42/children"), "").unwrap();
        std::fs::write(cgroup_dir.join("cgroup.procs"), "43\n").unwrap();

        let mut scope = SeccompDenyScope::with_proc_root(
            42,
            proc_root,
            Some(cgroup_dir),
            Some("/nucleus-test".to_string()),
        );
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"probe\" exe=\"/bin/probe\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";

        assert_eq!(
            denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
            Some((43, 257))
        );
    }

    // A sub-cgroup of the expected path matches; a sibling cgroup that merely
    // shares a name prefix does not.
    #[test]
    fn test_cgroup_content_matches_subgroup_membership() {
        assert!(cgroup_content_matches_path(
            "0::/nucleus-test/workers\n",
            "/nucleus-test"
        ));
        assert!(!cgroup_content_matches_path(
            "0::/nucleus-other\n",
            "/nucleus-test"
        ));
    }

    /// Extract the body of a function from source text by brace-matching,
    /// avoiding fragile hardcoded character-window offsets (SEC-MED-03).
    ///
    /// The returned slice starts at the first occurrence of `fn_signature`
    /// and ends just after the brace that closes the first `{` following it.
    /// Braces are counted textually, so braces inside string literals or
    /// comments are counted as well.
    ///
    /// Panics when the signature or an opening brace cannot be found.
    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
        let fn_start = source
            .find(fn_signature)
            .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
        let after = &source[fn_start..];
        let open = after
            .find('{')
            .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
        let mut depth = 0u32;
        // If no balancing brace is found, `end` stays at `open` and only the
        // signature part is returned.
        let mut end = open;
        for (i, ch) in after[open..].char_indices() {
            match ch {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        end = open + i + 1;
                        break;
                    }
                }
                _ => {}
            }
        }
        &after[..end]
    }

    // Source-level guard: inspects this file's own text rather than running
    // the reader.
    #[test]
    fn test_reader_uses_nonblocking_io() {
        // Verify record_loop uses O_NONBLOCK + poll, not socket-only APIs.
        // /dev/kmsg is a character device; socket APIs like SO_RCVTIMEO silently fail.
        // NOTE: Uses brace-matched function body extraction (SEC-MED-03).
        let source = include_str!("seccomp_trace.rs");
        let fn_body = extract_fn_body(source, "fn record_loop");
        assert!(
            fn_body.contains("O_NONBLOCK"),
            "record_loop must use O_NONBLOCK for non-blocking reads on /dev/kmsg"
        );
        assert!(
            fn_body.contains("libc::poll"),
            "record_loop must use poll() for timed waits on /dev/kmsg"
        );
        // setsockopt must not appear in the function (socket API doesn't work on char devices)
        let setsockopt_lines: Vec<&str> = fn_body
            .lines()
            .filter(|l| !l.trim().starts_with("//"))
            .filter(|l| l.contains("setsockopt"))
            .collect();
        assert!(
            setsockopt_lines.is_empty(),
            "record_loop must not call setsockopt on /dev/kmsg"
        );
    }

    // A record serializes to JSON carrying all three fields.
    #[test]
    fn test_trace_record_serialization() {
        let record = TraceRecord {
            syscall: 0,
            name: Some("read".to_string()),
            count: 42,
        };
        let json = serde_json::to_string(&record).unwrap();
        assert!(json.contains("\"syscall\":0"));
        assert!(json.contains("\"name\":\"read\""));
        assert!(json.contains("\"count\":42"));
    }
}
778}