Skip to main content

nucleus/security/
seccomp_trace.rs

1//! Seccomp trace mode: record syscalls for profile generation.
2//!
3//! In trace mode, an allow-all seccomp filter is installed with
4//! `SECCOMP_FILTER_FLAG_LOG`, causing the kernel to log every syscall to
5//! the audit subsystem. A reader thread monitors `/dev/kmsg` for
6//! SECCOMP audit records matching the container PID and writes unique
7//! syscalls to an NDJSON trace file.
8//!
9//! This is a development tool – requires root or CAP_SYSLOG for
10//! `/dev/kmsg` access.
11
12use crate::error::{NucleusError, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::BTreeMap;
15use std::io::{BufRead, BufReader, Write};
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18use std::sync::Arc;
19use std::thread::JoinHandle;
20use tracing::{debug, info, warn};
21
22/// A single trace record in the NDJSON output.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct TraceRecord {
25    /// Syscall number (e.g. 0 for read on x86_64).
26    pub syscall: i64,
27    /// Syscall name if known.
28    pub name: Option<String>,
29    /// Number of times this syscall was observed.
30    pub count: u64,
31}
32
/// Reads `/dev/kmsg` for SECCOMP audit records and collects unique syscalls.
///
/// The actual reading happens on a background thread that is started with
/// `start_recording` and joined by `stop_and_flush` (or on drop).
#[derive(Debug)]
pub struct SeccompTraceReader {
    /// PID whose SECCOMP audit records are collected.
    pid: u32,
    /// Destination of the NDJSON trace file.
    output_path: PathBuf,
    /// Shutdown flag shared with the reader thread; set to `true` to stop it.
    stop: Arc<AtomicBool>,
    /// Join handle of the reader thread; `None` before start and after join.
    handle: Option<JoinHandle<()>>,
}
40
41impl SeccompTraceReader {
42    /// Create a new trace reader for the given child PID.
43    pub fn new(pid: u32, output_path: &Path) -> Self {
44        Self {
45            pid,
46            output_path: output_path.to_path_buf(),
47            stop: Arc::new(AtomicBool::new(false)),
48            handle: None,
49        }
50    }
51
52    /// Start the background reader thread.
53    ///
54    /// Opens `/dev/kmsg` and filters for `audit: type=1326` (SECCOMP)
55    /// messages matching the target PID.
56    pub fn start_recording(&mut self) -> Result<()> {
57        let pid = self.pid;
58        let output_path = self.output_path.clone();
59        let stop = self.stop.clone();
60
61        let handle = std::thread::spawn(move || {
62            if let Err(e) = record_loop(pid, &output_path, &stop) {
63                warn!("Seccomp trace reader error: {}", e);
64            }
65        });
66
67        self.handle = Some(handle);
68        info!("Seccomp trace reader started for PID {}", self.pid);
69        Ok(())
70    }
71
72    /// Signal the reader to stop and wait for it to flush.
73    pub fn stop_and_flush(mut self) {
74        self.stop.store(true, Ordering::Release);
75        if let Some(handle) = self.handle.take() {
76            let _ = handle.join();
77        }
78        info!(
79            "Seccomp trace reader stopped, output at {:?}",
80            self.output_path
81        );
82    }
83}
84
85impl Drop for SeccompTraceReader {
86    fn drop(&mut self) {
87        self.stop.store(true, Ordering::Release);
88        if let Some(handle) = self.handle.take() {
89            let _ = handle.join();
90        }
91    }
92}
93
94/// Main recording loop – reads /dev/kmsg and extracts SECCOMP records.
95fn record_loop(pid: u32, output_path: &Path, stop: &AtomicBool) -> Result<()> {
96    let mut syscalls: BTreeMap<i64, u64> = BTreeMap::new();
97
98    // Verify /dev/kmsg is not a symlink before opening
99    let kmsg_path = std::path::Path::new("/dev/kmsg");
100    if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
101        if meta.file_type().is_symlink() {
102            warn!("/dev/kmsg is a symlink – refusing to open for seccomp tracing");
103            write_trace_file(output_path, &syscalls)?;
104            return Ok(());
105        }
106    }
107
108    // Open /dev/kmsg for reading (requires CAP_SYSLOG or root)
109    let file = match std::fs::File::open(kmsg_path) {
110        Ok(f) => f,
111        Err(e) => {
112            warn!(
113                "Cannot open /dev/kmsg for seccomp tracing: {} \
114                 (requires root or CAP_SYSLOG). Falling back to no-trace mode.",
115                e
116            );
117            // Write empty trace file
118            write_trace_file(output_path, &syscalls)?;
119            return Ok(());
120        }
121    };
122
123    // Set O_NONBLOCK so reads don't block indefinitely. We use poll() with a
124    // timeout to periodically check the stop flag. The previous setsockopt(SO_RCVTIMEO)
125    // approach was incorrect: /dev/kmsg is a character device, not a socket, so
126    // setsockopt silently fails with ENOTSOCK.
127    use std::os::unix::io::AsRawFd;
128    let fd = file.as_raw_fd();
129    // SAFETY: fd is a valid file descriptor from File::open("/dev/kmsg").
130    // F_GETFL/F_SETFL only modify the file status flags; O_NONBLOCK is safe
131    // to set and required for poll-based reading.
132    unsafe {
133        let flags = libc::fcntl(fd, libc::F_GETFL);
134        if flags >= 0 {
135            libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
136        }
137    }
138
139    let reader = BufReader::new(file);
140    let pid_pattern = format!("pid={}", pid);
141
142    for line in reader.lines() {
143        if stop.load(Ordering::Acquire) {
144            break;
145        }
146
147        let line = match line {
148            Ok(l) => l,
149            Err(e) => {
150                if e.kind() == std::io::ErrorKind::WouldBlock {
151                    // No data available – poll with 2s timeout, then check stop flag
152                    let mut pfd = libc::pollfd {
153                        fd,
154                        events: libc::POLLIN,
155                        revents: 0,
156                    };
157                    // SAFETY: pfd is a valid stack-allocated pollfd with a valid fd.
158                    // poll with nfds=1 and timeout=2000ms is safe; it only blocks.
159                    unsafe { libc::poll(&mut pfd, 1, 2000) };
160                    continue;
161                }
162                debug!("kmsg read error: {}", e);
163                continue;
164            }
165        };
166
167        // SECCOMP audit lines look like:
168        // audit: type=1326 ... pid=<PID> ... syscall=<NR> ...
169        if line.contains("type=1326") && line.contains(&pid_pattern) {
170            if let Some(nr) = extract_syscall_nr(&line) {
171                *syscalls.entry(nr).or_insert(0) += 1;
172            }
173        }
174    }
175
176    write_trace_file(output_path, &syscalls)?;
177    info!("Seccomp trace: recorded {} unique syscalls", syscalls.len());
178    Ok(())
179}
180
/// Pull the syscall number out of a SECCOMP audit line.
///
/// Scans the whitespace-separated fields for the first one of the form
/// `syscall=NNN` and parses `NNN` as an `i64`. Returns `None` when there is
/// no such field or when the first one does not parse as a number.
fn extract_syscall_nr(line: &str) -> Option<i64> {
    for field in line.split_whitespace() {
        if let Some(value) = field.strip_prefix("syscall=") {
            // Only the first `syscall=` field counts, even if it is malformed.
            return value.parse().ok();
        }
    }
    None
}
189
190/// Write the accumulated trace data as NDJSON.
191fn write_trace_file(path: &Path, syscalls: &BTreeMap<i64, u64>) -> Result<()> {
192    let mut file = std::fs::File::create(path).map_err(|e| {
193        NucleusError::ConfigError(format!("Failed to create trace file {:?}: {}", path, e))
194    })?;
195
196    for (&nr, &count) in syscalls {
197        let record = TraceRecord {
198            syscall: nr,
199            name: super::seccomp_generate::syscall_number_to_name(nr).map(String::from),
200            count,
201        };
202        let line =
203            serde_json::to_string(&record).unwrap_or_else(|e| format!("{{\"error\":\"{}\"}}", e));
204        writeln!(file, "{}", line).map_err(|e| {
205            NucleusError::ConfigError(format!("Failed to write trace record: {}", e))
206        })?;
207    }
208
209    Ok(())
210}
211
/// Reads `/dev/kmsg` for SECCOMP deny records and emits WARN-level logs.
///
/// When `--seccomp-log-denied` is set with `SECCOMP_FILTER_FLAG_LOG`, the
/// kernel logs denied syscalls to the audit subsystem. This reader runs in
/// the parent process (which survives the child kill) and surfaces those
/// records as application-level warnings.
#[derive(Debug)]
pub struct SeccompDenyLogger {
    /// PID whose SECCOMP audit records are reported.
    pid: u32,
    /// Shutdown flag shared with the logger thread; set to `true` to stop it.
    stop: Arc<AtomicBool>,
    /// Join handle of the logger thread; `None` before start and after join.
    handle: Option<JoinHandle<()>>,
}
223
224impl SeccompDenyLogger {
225    pub fn new(pid: u32) -> Self {
226        Self {
227            pid,
228            stop: Arc::new(AtomicBool::new(false)),
229            handle: None,
230        }
231    }
232
233    /// Start the background reader thread.
234    pub fn start(&mut self) -> Result<()> {
235        let pid = self.pid;
236        let stop = self.stop.clone();
237
238        let handle = std::thread::spawn(move || {
239            if let Err(e) = deny_log_loop(pid, &stop) {
240                warn!("Seccomp deny logger error: {}", e);
241            }
242        });
243
244        self.handle = Some(handle);
245        debug!("Seccomp deny logger started for PID {}", self.pid);
246        Ok(())
247    }
248
249    /// Signal the logger to stop and join the thread.
250    pub fn stop(mut self) {
251        self.stop.store(true, Ordering::Release);
252        if let Some(handle) = self.handle.take() {
253            let _ = handle.join();
254        }
255    }
256}
257
258impl Drop for SeccompDenyLogger {
259    fn drop(&mut self) {
260        self.stop.store(true, Ordering::Release);
261        if let Some(handle) = self.handle.take() {
262            let _ = handle.join();
263        }
264    }
265}
266
267/// Main deny-log loop – reads /dev/kmsg and emits WARN for denied syscalls.
268fn deny_log_loop(pid: u32, stop: &AtomicBool) -> Result<()> {
269    let kmsg_path = std::path::Path::new("/dev/kmsg");
270    if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
271        if meta.file_type().is_symlink() {
272            warn!("/dev/kmsg is a symlink – refusing to open for seccomp deny logging");
273            return Ok(());
274        }
275    }
276
277    let file = match std::fs::File::open(kmsg_path) {
278        Ok(f) => f,
279        Err(e) => {
280            warn!(
281                "Cannot open /dev/kmsg for seccomp deny logging: {} \
282                 (requires root or CAP_SYSLOG)",
283                e
284            );
285            return Ok(());
286        }
287    };
288
289    use std::os::unix::io::AsRawFd;
290    let fd = file.as_raw_fd();
291    // SAFETY: fd is a valid file descriptor from File::open("/dev/kmsg").
292    // F_GETFL/F_SETFL only modify the file status flags; O_NONBLOCK is safe
293    // to set and required for poll-based reading.
294    unsafe {
295        let flags = libc::fcntl(fd, libc::F_GETFL);
296        if flags >= 0 {
297            libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
298        }
299    }
300
301    let reader = BufReader::new(file);
302    let pid_pattern = format!("pid={}", pid);
303
304    for line in reader.lines() {
305        if stop.load(Ordering::Acquire) {
306            break;
307        }
308
309        let line = match line {
310            Ok(l) => l,
311            Err(e) => {
312                if e.kind() == std::io::ErrorKind::WouldBlock {
313                    let mut pfd = libc::pollfd {
314                        fd,
315                        events: libc::POLLIN,
316                        revents: 0,
317                    };
318                    // SAFETY: pfd is a valid stack-allocated pollfd with a valid fd.
319                    // poll with nfds=1 and timeout=2000ms is safe; it only blocks.
320                    unsafe { libc::poll(&mut pfd, 1, 2000) };
321                    continue;
322                }
323                debug!("kmsg read error: {}", e);
324                continue;
325            }
326        };
327
328        if line.contains("type=1326") && line.contains(&pid_pattern) {
329            if let Some(nr) = extract_syscall_nr(&line) {
330                let name = super::seccomp_generate::syscall_number_to_name(nr).unwrap_or("unknown");
331                warn!(
332                    syscall = nr,
333                    name = name,
334                    pid = pid,
335                    "seccomp denied syscall"
336                );
337            }
338        }
339    }
340
341    Ok(())
342}
343
#[cfg(test)]
mod tests {
    use super::*;

    /// A realistic kmsg SECCOMP record yields its syscall number.
    #[test]
    fn test_extract_syscall_nr() {
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=42 comm=\"test\" exe=\"/bin/test\" sig=0 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x7fff0000";
        let parsed = extract_syscall_nr(line);
        assert_eq!(parsed, Some(257));
    }

    /// A line without a `syscall=` field yields nothing.
    #[test]
    fn test_extract_syscall_nr_missing() {
        let parsed = extract_syscall_nr("no syscall here");
        assert_eq!(parsed, None);
    }

    /// Return the text of a function, from its signature through the brace
    /// that closes its body, located by counting braces rather than by
    /// fixed character offsets (SEC-MED-03).
    ///
    /// Panics if the signature or an opening brace after it is not found.
    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
        let tail = match source.find(fn_signature) {
            Some(pos) => &source[pos..],
            None => panic!("function '{}' not found in source", fn_signature),
        };
        let first_brace = match tail.find('{') {
            Some(pos) => pos,
            None => panic!("no opening brace found for '{}'", fn_signature),
        };

        let mut nesting = 0u32;
        let mut cut = first_brace;
        for (offset, c) in tail[first_brace..].char_indices() {
            if c == '{' {
                nesting += 1;
            } else if c == '}' {
                nesting -= 1;
                if nesting == 0 {
                    // Include the closing brace itself.
                    cut = first_brace + offset + 1;
                    break;
                }
            }
        }
        &tail[..cut]
    }

    /// Source-level guard: `record_loop` must rely on O_NONBLOCK + poll.
    ///
    /// /dev/kmsg is a character device, so socket-only APIs such as
    /// SO_RCVTIMEO do not work on it. The function text is extracted by
    /// brace matching (SEC-MED-03).
    #[test]
    fn test_reader_uses_nonblocking_io() {
        let source = include_str!("seccomp_trace.rs");
        let fn_body = extract_fn_body(source, "fn record_loop");

        let sets_nonblocking = fn_body.contains("O_NONBLOCK");
        assert!(
            sets_nonblocking,
            "record_loop must use O_NONBLOCK for non-blocking reads on /dev/kmsg"
        );

        let uses_poll = fn_body.contains("libc::poll");
        assert!(
            uses_poll,
            "record_loop must use poll() for timed waits on /dev/kmsg"
        );

        // Mentions inside `//` comment lines are allowed; real calls are not.
        let calls_setsockopt = fn_body
            .lines()
            .any(|l| !l.trim().starts_with("//") && l.contains("setsockopt"));
        assert!(
            !calls_setsockopt,
            "record_loop must not call setsockopt on /dev/kmsg"
        );
    }

    /// A record serializes to JSON carrying all three fields.
    #[test]
    fn test_trace_record_serialization() {
        let record = TraceRecord {
            syscall: 0,
            name: Some("read".to_string()),
            count: 42,
        };
        let json = serde_json::to_string(&record).unwrap();
        for expected in ["\"syscall\":0", "\"name\":\"read\"", "\"count\":42"] {
            assert!(json.contains(expected));
        }
    }
}