Skip to main content

nucleus/security/
seccomp_generate.rs

//! Seccomp profile generator: create minimal profiles from trace data.
//!
//! Reads NDJSON trace files produced by `--seccomp-mode trace` and
//! generates a minimal OCI-format seccomp profile containing only
//! the syscalls actually used by the workload.
6
7use crate::error::{NucleusError, Result};
8use crate::security::seccomp_trace::TraceRecord;
9use serde::{Deserialize, Serialize};
10use std::collections::HashSet;
11use std::io::BufRead;
12use std::path::Path;
13use tracing::info;
14
15/// OCI-format seccomp profile (subset).
16#[derive(Debug, Clone, Serialize, Deserialize)]
17#[serde(rename_all = "camelCase")]
18pub struct SeccompProfile {
19    /// Default action for unlisted syscalls.
20    pub default_action: String,
21
22    /// Target architectures.
23    #[serde(default)]
24    pub architectures: Vec<String>,
25
26    /// Syscall groups with their action.
27    #[serde(default)]
28    pub syscalls: Vec<SeccompSyscallGroup>,
29}
30
31/// A group of syscalls sharing the same action.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct SeccompSyscallGroup {
34    /// Syscall names.
35    pub names: Vec<String>,
36
37    /// Action: typically "SCMP_ACT_ALLOW".
38    pub action: String,
39
40    /// Optional argument filters (not generated, but preserved).
41    #[serde(default, skip_serializing_if = "Vec::is_empty")]
42    pub args: Vec<SeccompArgFilter>,
43}
44
45/// Argument-level filter for a syscall.
46#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct SeccompArgFilter {
48    /// Argument index (0-5).
49    pub index: u32,
50    /// Comparison operator.
51    pub op: String,
52    /// Comparison value.
53    pub value: u64,
54}
55
/// Name of the seccomp architecture constant matching the compilation target.
///
/// The choice is made entirely at compile time through `cfg!(target_arch)`,
/// which keeps generated profiles aligned with the architecture this binary
/// was built for. Targets without an explicit entry fall back to
/// `"SCMP_ARCH_NATIVE"`.
fn native_scmp_arch() -> &'static str {
    // Every flag is a compile-time boolean; `target_arch` has a single value,
    // so at most one row can be true for a given build.
    let candidates = [
        (cfg!(target_arch = "x86_64"), "SCMP_ARCH_X86_64"),
        (cfg!(target_arch = "aarch64"), "SCMP_ARCH_AARCH64"),
        (cfg!(target_arch = "x86"), "SCMP_ARCH_X86"),
        (cfg!(target_arch = "arm"), "SCMP_ARCH_ARM"),
        (cfg!(target_arch = "riscv64"), "SCMP_ARCH_RISCV64"),
        (cfg!(target_arch = "s390x"), "SCMP_ARCH_S390X"),
    ];

    candidates
        .iter()
        .find(|(is_target, _)| *is_target)
        .map_or("SCMP_ARCH_NATIVE", |(_, constant)| *constant)
}
77
78/// Generate a minimal seccomp profile from a trace file.
79///
80/// Reads NDJSON records, collects unique syscalls, and produces
81/// an OCI-format JSON profile that allows exactly those syscalls.
82pub fn generate_from_trace(trace_path: &Path) -> Result<SeccompProfile> {
83    let file = std::fs::File::open(trace_path).map_err(|e| {
84        NucleusError::ConfigError(format!("Failed to open trace file {:?}: {}", trace_path, e))
85    })?;
86
87    let reader = std::io::BufReader::new(file);
88    let mut syscall_set: HashSet<String> = HashSet::new();
89
90    for line in reader.lines() {
91        let line = line
92            .map_err(|e| NucleusError::ConfigError(format!("Failed to read trace line: {}", e)))?;
93
94        if line.trim().is_empty() {
95            continue;
96        }
97
98        let record: TraceRecord = serde_json::from_str(&line).map_err(|e| {
99            NucleusError::ConfigError(format!(
100                "Failed to parse trace record: {}: line='{}'",
101                e, line
102            ))
103        })?;
104
105        let name = record.name.unwrap_or_else(|| {
106            syscall_number_to_name(record.syscall)
107                .map(String::from)
108                .unwrap_or_else(|| format!("__NR_{}", record.syscall))
109        });
110
111        syscall_set.insert(name);
112    }
113
114    let mut syscall_names: Vec<String> = syscall_set.into_iter().collect();
115
116    syscall_names.sort();
117
118    info!(
119        "Generated seccomp profile with {} syscalls from {:?}",
120        syscall_names.len(),
121        trace_path
122    );
123
124    Ok(SeccompProfile {
125        default_action: "SCMP_ACT_KILL_PROCESS".to_string(),
126        architectures: vec![native_scmp_arch().to_string()],
127        syscalls: vec![SeccompSyscallGroup {
128            names: syscall_names,
129            action: "SCMP_ACT_ALLOW".to_string(),
130            args: vec![],
131        }],
132    })
133}
134
135/// Map a syscall number back to its name (inverse of syscall_name_to_number).
136pub fn syscall_number_to_name(nr: i64) -> Option<&'static str> {
137    syscall_table()
138        .into_iter()
139        .find(|(_, n)| *n == nr)
140        .map(|(name, _)| name)
141}
142
143/// (name, number) pairs for all mapped syscalls.
144///
145/// Built as a function rather than a static to support cfg-gated x86_64-only
146/// legacy syscalls that don't exist on aarch64.
147fn syscall_table() -> Vec<(&'static str, i64)> {
148    let mut table = vec![
149        ("read", libc::SYS_read),
150        ("write", libc::SYS_write),
151        ("openat", libc::SYS_openat),
152        ("close", libc::SYS_close),
153        ("fstat", libc::SYS_fstat),
154        ("lseek", libc::SYS_lseek),
155        ("fcntl", libc::SYS_fcntl),
156        ("readv", libc::SYS_readv),
157        ("writev", libc::SYS_writev),
158        ("pread64", libc::SYS_pread64),
159        ("pwrite64", libc::SYS_pwrite64),
160        ("readlinkat", libc::SYS_readlinkat),
161        ("newfstatat", libc::SYS_newfstatat),
162        ("statx", libc::SYS_statx),
163        ("faccessat", libc::SYS_faccessat),
164        ("faccessat2", libc::SYS_faccessat2),
165        ("dup", libc::SYS_dup),
166        ("dup3", libc::SYS_dup3),
167        ("pipe2", libc::SYS_pipe2),
168        ("unlinkat", libc::SYS_unlinkat),
169        ("renameat", libc::SYS_renameat),
170        ("renameat2", libc::SYS_renameat2),
171        ("linkat", libc::SYS_linkat),
172        ("symlinkat", libc::SYS_symlinkat),
173        ("fchmod", libc::SYS_fchmod),
174        ("fchmodat", libc::SYS_fchmodat),
175        ("truncate", libc::SYS_truncate),
176        ("ftruncate", libc::SYS_ftruncate),
177        ("fallocate", libc::SYS_fallocate),
178        ("fsync", libc::SYS_fsync),
179        ("fdatasync", libc::SYS_fdatasync),
180        ("flock", libc::SYS_flock),
181        ("copy_file_range", libc::SYS_copy_file_range),
182        ("splice", libc::SYS_splice),
183        ("tee", libc::SYS_tee),
184        ("mmap", libc::SYS_mmap),
185        ("munmap", libc::SYS_munmap),
186        ("mprotect", libc::SYS_mprotect),
187        ("brk", libc::SYS_brk),
188        ("mremap", libc::SYS_mremap),
189        ("madvise", libc::SYS_madvise),
190        ("msync", libc::SYS_msync),
191        ("mlock", libc::SYS_mlock),
192        ("munlock", libc::SYS_munlock),
193        ("clone", libc::SYS_clone),
194        ("clone3", libc::SYS_clone3),
195        ("execve", libc::SYS_execve),
196        ("execveat", libc::SYS_execveat),
197        ("wait4", libc::SYS_wait4),
198        ("waitid", libc::SYS_waitid),
199        ("exit", libc::SYS_exit),
200        ("exit_group", libc::SYS_exit_group),
201        ("getpid", libc::SYS_getpid),
202        ("gettid", libc::SYS_gettid),
203        ("getuid", libc::SYS_getuid),
204        ("getgid", libc::SYS_getgid),
205        ("geteuid", libc::SYS_geteuid),
206        ("getegid", libc::SYS_getegid),
207        ("getppid", libc::SYS_getppid),
208        ("setsid", libc::SYS_setsid),
209        ("getgroups", libc::SYS_getgroups),
210        ("rt_sigaction", libc::SYS_rt_sigaction),
211        ("rt_sigprocmask", libc::SYS_rt_sigprocmask),
212        ("rt_sigreturn", libc::SYS_rt_sigreturn),
213        ("rt_sigsuspend", libc::SYS_rt_sigsuspend),
214        ("sigaltstack", libc::SYS_sigaltstack),
215        ("kill", libc::SYS_kill),
216        ("tgkill", libc::SYS_tgkill),
217        ("clock_gettime", libc::SYS_clock_gettime),
218        ("clock_getres", libc::SYS_clock_getres),
219        ("clock_nanosleep", libc::SYS_clock_nanosleep),
220        ("gettimeofday", libc::SYS_gettimeofday),
221        ("nanosleep", libc::SYS_nanosleep),
222        ("getcwd", libc::SYS_getcwd),
223        ("chdir", libc::SYS_chdir),
224        ("fchdir", libc::SYS_fchdir),
225        ("mkdirat", libc::SYS_mkdirat),
226        ("getdents64", libc::SYS_getdents64),
227        ("socket", libc::SYS_socket),
228        ("connect", libc::SYS_connect),
229        ("sendto", libc::SYS_sendto),
230        ("recvfrom", libc::SYS_recvfrom),
231        ("sendmsg", libc::SYS_sendmsg),
232        ("recvmsg", libc::SYS_recvmsg),
233        ("shutdown", libc::SYS_shutdown),
234        ("bind", libc::SYS_bind),
235        ("listen", libc::SYS_listen),
236        ("accept", libc::SYS_accept),
237        ("accept4", libc::SYS_accept4),
238        ("setsockopt", libc::SYS_setsockopt),
239        ("getsockopt", libc::SYS_getsockopt),
240        ("getsockname", libc::SYS_getsockname),
241        ("getpeername", libc::SYS_getpeername),
242        ("socketpair", libc::SYS_socketpair),
243        ("ppoll", libc::SYS_ppoll),
244        ("pselect6", libc::SYS_pselect6),
245        ("epoll_create1", libc::SYS_epoll_create1),
246        ("epoll_ctl", libc::SYS_epoll_ctl),
247        ("epoll_pwait", libc::SYS_epoll_pwait),
248        ("eventfd2", libc::SYS_eventfd2),
249        ("signalfd4", libc::SYS_signalfd4),
250        ("timerfd_create", libc::SYS_timerfd_create),
251        ("timerfd_settime", libc::SYS_timerfd_settime),
252        ("timerfd_gettime", libc::SYS_timerfd_gettime),
253        ("uname", libc::SYS_uname),
254        ("getrandom", libc::SYS_getrandom),
255        ("futex", libc::SYS_futex),
256        ("set_tid_address", libc::SYS_set_tid_address),
257        ("set_robust_list", libc::SYS_set_robust_list),
258        ("get_robust_list", libc::SYS_get_robust_list),
259        ("sysinfo", libc::SYS_sysinfo),
260        ("umask", libc::SYS_umask),
261        ("prlimit64", libc::SYS_prlimit64),
262        ("getrusage", libc::SYS_getrusage),
263        ("times", libc::SYS_times),
264        ("sched_yield", libc::SYS_sched_yield),
265        ("sched_getaffinity", libc::SYS_sched_getaffinity),
266        ("getcpu", libc::SYS_getcpu),
267        ("rseq", libc::SYS_rseq),
268        ("close_range", libc::SYS_close_range),
269        ("memfd_create", libc::SYS_memfd_create),
270        ("ioctl", libc::SYS_ioctl),
271        ("prctl", libc::SYS_prctl),
272        ("landlock_create_ruleset", libc::SYS_landlock_create_ruleset),
273        ("landlock_add_rule", libc::SYS_landlock_add_rule),
274        ("landlock_restrict_self", libc::SYS_landlock_restrict_self),
275    ];
276
277    // Legacy syscalls only available on x86_64
278    #[cfg(target_arch = "x86_64")]
279    table.extend_from_slice(&[
280        ("open", libc::SYS_open),
281        ("stat", libc::SYS_stat),
282        ("lstat", libc::SYS_lstat),
283        ("access", libc::SYS_access),
284        ("readlink", libc::SYS_readlink),
285        ("dup2", libc::SYS_dup2),
286        ("pipe", libc::SYS_pipe),
287        ("unlink", libc::SYS_unlink),
288        ("rename", libc::SYS_rename),
289        ("link", libc::SYS_link),
290        ("symlink", libc::SYS_symlink),
291        ("chmod", libc::SYS_chmod),
292        ("fadvise64", libc::SYS_fadvise64),
293        ("sendfile", libc::SYS_sendfile),
294        ("fork", libc::SYS_fork),
295        ("getpgrp", libc::SYS_getpgrp),
296        ("mkdir", libc::SYS_mkdir),
297        ("rmdir", libc::SYS_rmdir),
298        ("getdents", libc::SYS_getdents),
299        ("poll", libc::SYS_poll),
300        ("select", libc::SYS_select),
301        ("epoll_create", libc::SYS_epoll_create),
302        ("epoll_wait", libc::SYS_epoll_wait),
303        ("eventfd", libc::SYS_eventfd),
304        ("signalfd", libc::SYS_signalfd),
305        ("arch_prctl", libc::SYS_arch_prctl),
306        ("getrlimit", libc::SYS_getrlimit),
307    ]);
308
309    table
310}
311
#[cfg(test)]
mod tests {
    use super::*;

    /// Write `body` to `trace.ndjson` inside a fresh temporary directory.
    ///
    /// The directory handle is returned alongside the path so the caller
    /// keeps it alive for as long as the file is needed.
    fn trace_fixture(body: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let trace_path = dir.path().join("trace.ndjson");
        std::fs::write(&trace_path, body).unwrap();
        (dir, trace_path)
    }

    #[test]
    fn test_syscall_number_to_name() {
        let known = [
            (libc::SYS_read, "read"),
            (libc::SYS_write, "write"),
            (libc::SYS_openat, "openat"),
        ];
        for (number, expected) in known {
            assert_eq!(syscall_number_to_name(number), Some(expected));
        }

        // A number outside the table has no name.
        assert_eq!(syscall_number_to_name(99999), None);
    }

    #[test]
    fn test_generate_from_trace() {
        // Three records, each carrying its own name.
        let (_dir, trace_path) = trace_fixture(concat!(
            r#"{"syscall":0,"name":"read","count":10}"#,
            "\n",
            r#"{"syscall":1,"name":"write","count":5}"#,
            "\n",
            r#"{"syscall":257,"name":"openat","count":3}"#,
            "\n",
        ));

        let profile = generate_from_trace(&trace_path).unwrap();
        assert_eq!(profile.default_action, "SCMP_ACT_KILL_PROCESS");
        assert_eq!(profile.syscalls.len(), 1);

        let names = &profile.syscalls[0].names;
        assert_eq!(names.len(), 3);
        for expected in ["read", "write", "openat"] {
            assert!(names.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_profile_serialization() {
        let allow_group = SeccompSyscallGroup {
            names: vec!["read".to_string(), "write".to_string()],
            action: "SCMP_ACT_ALLOW".to_string(),
            args: vec![],
        };
        let profile = SeccompProfile {
            default_action: "SCMP_ACT_KILL_PROCESS".to_string(),
            architectures: vec!["SCMP_ARCH_X86_64".to_string()],
            syscalls: vec![allow_group],
        };

        let json = serde_json::to_string_pretty(&profile).unwrap();
        for fragment in ["\"defaultAction\"", "SCMP_ACT_KILL_PROCESS", "\"read\""] {
            assert!(json.contains(fragment));
        }

        // Parsing the emitted JSON must give back both syscall names.
        let parsed: SeccompProfile = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.syscalls[0].names.len(), 2);
    }

    #[test]
    fn test_native_scmp_arch_matches_target() {
        let arch = native_scmp_arch();

        // Holds on every target, including ones without an explicit mapping.
        assert!(arch.starts_with("SCMP_ARCH_"));

        #[cfg(target_arch = "x86_64")]
        assert_eq!(arch, "SCMP_ARCH_X86_64");
        #[cfg(target_arch = "aarch64")]
        assert_eq!(arch, "SCMP_ARCH_AARCH64");
    }

    #[test]
    fn test_generated_profile_uses_native_arch() {
        let (_dir, trace_path) = trace_fixture(concat!(
            r#"{"syscall":0,"name":"read","count":1}"#,
            "\n",
        ));

        let profile = generate_from_trace(&trace_path).unwrap();
        assert_eq!(profile.architectures.len(), 1);
        assert_eq!(profile.architectures[0], native_scmp_arch());
    }
}