Skip to main content

nucleus/security/
seccomp_generate.rs

1//! Seccomp profile generator: create minimal profiles from trace data.
2//!
3//! Reads NDJSON trace files produced by `--seccomp-mode trace` and
4//! generates a minimal OCI-format seccomp profile containing only
5//! the syscalls actually used by the workload.
6
7use crate::error::{NucleusError, Result};
8use crate::security::seccomp_trace::TraceRecord;
9#[cfg(any(
10    target_arch = "x86_64",
11    target_arch = "aarch64",
12    target_arch = "riscv64"
13))]
14use crate::security::syscall_numbers::{SYS_FADVISE64, SYS_SENDFILE};
15use serde::{Deserialize, Serialize};
16use std::collections::HashSet;
17use std::io::BufRead;
18use std::path::Path;
19use tracing::info;
20
21/// OCI-format seccomp profile (subset).
22#[derive(Debug, Clone, Serialize, Deserialize)]
23#[serde(rename_all = "camelCase")]
24pub struct SeccompProfile {
25    /// Default action for unlisted syscalls.
26    pub default_action: String,
27
28    /// Target architectures.
29    #[serde(default)]
30    pub architectures: Vec<String>,
31
32    /// Syscall groups with their action.
33    #[serde(default)]
34    pub syscalls: Vec<SeccompSyscallGroup>,
35}
36
37/// A group of syscalls sharing the same action.
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct SeccompSyscallGroup {
40    /// Syscall names.
41    pub names: Vec<String>,
42
43    /// Action: typically "SCMP_ACT_ALLOW".
44    pub action: String,
45
46    /// Optional argument filters (not generated, but preserved).
47    #[serde(default, skip_serializing_if = "Vec::is_empty")]
48    pub args: Vec<SeccompArgFilter>,
49}
50
51/// Argument-level filter for a syscall.
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct SeccompArgFilter {
54    /// Argument index (0-5).
55    pub index: u32,
56    /// Comparison operator.
57    pub op: String,
58    /// Comparison value.
59    pub value: u64,
60}
61
/// Name the OCI seccomp architecture constant for the compilation target.
///
/// Each `cfg!(target_arch = ...)` check is resolved at compile time, so the
/// returned constant always describes the architecture this binary was built
/// for. Targets without a dedicated entry yield `"SCMP_ARCH_NATIVE"`.
fn native_scmp_arch() -> &'static str {
    // Ordered (is-current-target, constant) pairs; at most one flag is true.
    const KNOWN_TARGETS: [(bool, &str); 6] = [
        (cfg!(target_arch = "x86_64"), "SCMP_ARCH_X86_64"),
        (cfg!(target_arch = "aarch64"), "SCMP_ARCH_AARCH64"),
        (cfg!(target_arch = "x86"), "SCMP_ARCH_X86"),
        (cfg!(target_arch = "arm"), "SCMP_ARCH_ARM"),
        (cfg!(target_arch = "riscv64"), "SCMP_ARCH_RISCV64"),
        (cfg!(target_arch = "s390x"), "SCMP_ARCH_S390X"),
    ];

    KNOWN_TARGETS
        .iter()
        .find(|entry| entry.0)
        .map_or("SCMP_ARCH_NATIVE", |entry| entry.1)
}
83
84/// Generate a minimal seccomp profile from a trace file.
85///
86/// Reads NDJSON records, collects unique syscalls, and produces
87/// an OCI-format JSON profile that allows exactly those syscalls.
88pub fn generate_from_trace(trace_path: &Path) -> Result<SeccompProfile> {
89    let file = std::fs::File::open(trace_path).map_err(|e| {
90        NucleusError::ConfigError(format!("Failed to open trace file {:?}: {}", trace_path, e))
91    })?;
92
93    let reader = std::io::BufReader::new(file);
94    let mut syscall_set: HashSet<String> = HashSet::new();
95
96    for line in reader.lines() {
97        let line = line
98            .map_err(|e| NucleusError::ConfigError(format!("Failed to read trace line: {}", e)))?;
99
100        if line.trim().is_empty() {
101            continue;
102        }
103
104        let record: TraceRecord = serde_json::from_str(&line).map_err(|e| {
105            NucleusError::ConfigError(format!(
106                "Failed to parse trace record: {}: line='{}'",
107                e, line
108            ))
109        })?;
110
111        let name = record.name.unwrap_or_else(|| {
112            syscall_number_to_name(record.syscall)
113                .map(String::from)
114                .unwrap_or_else(|| format!("__NR_{}", record.syscall))
115        });
116
117        syscall_set.insert(name);
118    }
119
120    let mut syscall_names: Vec<String> = syscall_set.into_iter().collect();
121
122    syscall_names.sort();
123
124    info!(
125        "Generated seccomp profile with {} syscalls from {:?}",
126        syscall_names.len(),
127        trace_path
128    );
129
130    Ok(SeccompProfile {
131        default_action: "SCMP_ACT_KILL_PROCESS".to_string(),
132        architectures: vec![native_scmp_arch().to_string()],
133        syscalls: vec![SeccompSyscallGroup {
134            names: syscall_names,
135            action: "SCMP_ACT_ALLOW".to_string(),
136            args: vec![],
137        }],
138    })
139}
140
141/// Map a syscall number back to its name (inverse of syscall_name_to_number).
142pub fn syscall_number_to_name(nr: i64) -> Option<&'static str> {
143    syscall_table()
144        .into_iter()
145        .find(|(_, n)| *n == nr)
146        .map(|(name, _)| name)
147}
148
149/// (name, number) pairs for all mapped syscalls.
150///
151/// Built as a function rather than a static to support cfg-gated x86_64-only
152/// legacy syscalls that don't exist on aarch64.
153fn syscall_table() -> Vec<(&'static str, i64)> {
154    let mut table = vec![
155        ("read", libc::SYS_read),
156        ("write", libc::SYS_write),
157        ("openat", libc::SYS_openat),
158        ("close", libc::SYS_close),
159        ("fstat", libc::SYS_fstat),
160        ("lseek", libc::SYS_lseek),
161        ("fcntl", libc::SYS_fcntl),
162        ("readv", libc::SYS_readv),
163        ("writev", libc::SYS_writev),
164        ("preadv", libc::SYS_preadv),
165        ("pwritev", libc::SYS_pwritev),
166        ("preadv2", libc::SYS_preadv2),
167        ("pwritev2", libc::SYS_pwritev2),
168        ("pread64", libc::SYS_pread64),
169        ("pwrite64", libc::SYS_pwrite64),
170        ("readlinkat", libc::SYS_readlinkat),
171        ("newfstatat", libc::SYS_newfstatat),
172        ("statx", libc::SYS_statx),
173        ("faccessat", libc::SYS_faccessat),
174        ("faccessat2", libc::SYS_faccessat2),
175        ("dup", libc::SYS_dup),
176        ("dup3", libc::SYS_dup3),
177        ("pipe2", libc::SYS_pipe2),
178        ("unlinkat", libc::SYS_unlinkat),
179        ("renameat", libc::SYS_renameat),
180        ("renameat2", libc::SYS_renameat2),
181        ("linkat", libc::SYS_linkat),
182        ("symlinkat", libc::SYS_symlinkat),
183        ("fchmod", libc::SYS_fchmod),
184        ("fchmodat", libc::SYS_fchmodat),
185        ("truncate", libc::SYS_truncate),
186        ("ftruncate", libc::SYS_ftruncate),
187        ("fallocate", libc::SYS_fallocate),
188        #[cfg(any(
189            target_arch = "x86_64",
190            target_arch = "aarch64",
191            target_arch = "riscv64"
192        ))]
193        ("fadvise64", SYS_FADVISE64),
194        ("fsync", libc::SYS_fsync),
195        ("fdatasync", libc::SYS_fdatasync),
196        ("flock", libc::SYS_flock),
197        #[cfg(any(
198            target_arch = "x86_64",
199            target_arch = "aarch64",
200            target_arch = "riscv64"
201        ))]
202        ("sendfile", SYS_SENDFILE),
203        ("copy_file_range", libc::SYS_copy_file_range),
204        ("splice", libc::SYS_splice),
205        ("tee", libc::SYS_tee),
206        ("mmap", libc::SYS_mmap),
207        ("munmap", libc::SYS_munmap),
208        ("mprotect", libc::SYS_mprotect),
209        ("brk", libc::SYS_brk),
210        ("mremap", libc::SYS_mremap),
211        ("madvise", libc::SYS_madvise),
212        ("msync", libc::SYS_msync),
213        ("mlock", libc::SYS_mlock),
214        ("munlock", libc::SYS_munlock),
215        ("clone", libc::SYS_clone),
216        ("clone3", libc::SYS_clone3),
217        ("execve", libc::SYS_execve),
218        ("execveat", libc::SYS_execveat),
219        ("wait4", libc::SYS_wait4),
220        ("waitid", libc::SYS_waitid),
221        ("exit", libc::SYS_exit),
222        ("exit_group", libc::SYS_exit_group),
223        ("getpid", libc::SYS_getpid),
224        ("gettid", libc::SYS_gettid),
225        ("getuid", libc::SYS_getuid),
226        ("getgid", libc::SYS_getgid),
227        ("geteuid", libc::SYS_geteuid),
228        ("getegid", libc::SYS_getegid),
229        ("getppid", libc::SYS_getppid),
230        ("setsid", libc::SYS_setsid),
231        ("getgroups", libc::SYS_getgroups),
232        ("rt_sigaction", libc::SYS_rt_sigaction),
233        ("rt_sigprocmask", libc::SYS_rt_sigprocmask),
234        ("rt_sigreturn", libc::SYS_rt_sigreturn),
235        ("rt_sigsuspend", libc::SYS_rt_sigsuspend),
236        ("sigaltstack", libc::SYS_sigaltstack),
237        ("kill", libc::SYS_kill),
238        ("tgkill", libc::SYS_tgkill),
239        ("clock_gettime", libc::SYS_clock_gettime),
240        ("clock_getres", libc::SYS_clock_getres),
241        ("clock_nanosleep", libc::SYS_clock_nanosleep),
242        ("gettimeofday", libc::SYS_gettimeofday),
243        ("nanosleep", libc::SYS_nanosleep),
244        ("getcwd", libc::SYS_getcwd),
245        ("chdir", libc::SYS_chdir),
246        ("fchdir", libc::SYS_fchdir),
247        ("mkdirat", libc::SYS_mkdirat),
248        ("getdents64", libc::SYS_getdents64),
249        ("socket", libc::SYS_socket),
250        ("connect", libc::SYS_connect),
251        ("sendto", libc::SYS_sendto),
252        ("recvfrom", libc::SYS_recvfrom),
253        ("sendmsg", libc::SYS_sendmsg),
254        ("recvmsg", libc::SYS_recvmsg),
255        ("shutdown", libc::SYS_shutdown),
256        ("bind", libc::SYS_bind),
257        ("listen", libc::SYS_listen),
258        ("accept", libc::SYS_accept),
259        ("accept4", libc::SYS_accept4),
260        ("setsockopt", libc::SYS_setsockopt),
261        ("getsockopt", libc::SYS_getsockopt),
262        ("getsockname", libc::SYS_getsockname),
263        ("getpeername", libc::SYS_getpeername),
264        ("socketpair", libc::SYS_socketpair),
265        ("ppoll", libc::SYS_ppoll),
266        ("pselect6", libc::SYS_pselect6),
267        ("epoll_create1", libc::SYS_epoll_create1),
268        ("epoll_ctl", libc::SYS_epoll_ctl),
269        ("epoll_pwait", libc::SYS_epoll_pwait),
270        ("eventfd2", libc::SYS_eventfd2),
271        ("signalfd4", libc::SYS_signalfd4),
272        ("timerfd_create", libc::SYS_timerfd_create),
273        ("timerfd_settime", libc::SYS_timerfd_settime),
274        ("timerfd_gettime", libc::SYS_timerfd_gettime),
275        ("uname", libc::SYS_uname),
276        ("getrandom", libc::SYS_getrandom),
277        ("futex", libc::SYS_futex),
278        ("set_tid_address", libc::SYS_set_tid_address),
279        ("set_robust_list", libc::SYS_set_robust_list),
280        ("get_robust_list", libc::SYS_get_robust_list),
281        ("sysinfo", libc::SYS_sysinfo),
282        ("umask", libc::SYS_umask),
283        ("prlimit64", libc::SYS_prlimit64),
284        ("getrusage", libc::SYS_getrusage),
285        ("times", libc::SYS_times),
286        ("sched_yield", libc::SYS_sched_yield),
287        ("sched_getaffinity", libc::SYS_sched_getaffinity),
288        ("getcpu", libc::SYS_getcpu),
289        ("rseq", libc::SYS_rseq),
290        ("close_range", libc::SYS_close_range),
291        ("memfd_create", libc::SYS_memfd_create),
292        ("ioctl", libc::SYS_ioctl),
293        ("prctl", libc::SYS_prctl),
294        ("landlock_create_ruleset", libc::SYS_landlock_create_ruleset),
295        ("landlock_add_rule", libc::SYS_landlock_add_rule),
296        ("landlock_restrict_self", libc::SYS_landlock_restrict_self),
297    ];
298
299    // Legacy syscalls only available on x86_64
300    #[cfg(target_arch = "x86_64")]
301    table.extend_from_slice(&[
302        ("open", libc::SYS_open),
303        ("stat", libc::SYS_stat),
304        ("lstat", libc::SYS_lstat),
305        ("access", libc::SYS_access),
306        ("readlink", libc::SYS_readlink),
307        ("dup2", libc::SYS_dup2),
308        ("pipe", libc::SYS_pipe),
309        ("unlink", libc::SYS_unlink),
310        ("rename", libc::SYS_rename),
311        ("link", libc::SYS_link),
312        ("symlink", libc::SYS_symlink),
313        ("chmod", libc::SYS_chmod),
314        ("fork", libc::SYS_fork),
315        ("getpgrp", libc::SYS_getpgrp),
316        ("mkdir", libc::SYS_mkdir),
317        ("rmdir", libc::SYS_rmdir),
318        ("getdents", libc::SYS_getdents),
319        ("poll", libc::SYS_poll),
320        ("select", libc::SYS_select),
321        ("epoll_create", libc::SYS_epoll_create),
322        ("epoll_wait", libc::SYS_epoll_wait),
323        ("eventfd", libc::SYS_eventfd),
324        ("signalfd", libc::SYS_signalfd),
325        ("arch_prctl", libc::SYS_arch_prctl),
326        ("getrlimit", libc::SYS_getrlimit),
327    ]);
328
329    table
330}
331
#[cfg(test)]
mod tests {
    use super::*;

    /// Write `contents` to `trace.ndjson` inside a fresh temporary directory
    /// and return the profile generated from that file.
    fn profile_from_trace(contents: &str) -> SeccompProfile {
        let dir = tempfile::tempdir().unwrap();
        let trace_path = dir.path().join("trace.ndjson");
        std::fs::write(&trace_path, contents).unwrap();
        generate_from_trace(&trace_path).unwrap()
    }

    /// Known numbers resolve to their names; an unmapped number yields `None`.
    #[test]
    fn test_syscall_number_to_name() {
        let known = [
            (libc::SYS_read, "read"),
            (libc::SYS_write, "write"),
            (libc::SYS_openat, "openat"),
        ];
        for (number, expected) in known {
            assert_eq!(syscall_number_to_name(number), Some(expected));
        }
        assert_eq!(syscall_number_to_name(99999), None);
    }

    /// The crate-provided constants for `fadvise64` and `sendfile` resolve.
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "riscv64"
    ))]
    #[test]
    fn test_generic_file_syscall_numbers_to_name() {
        assert_eq!(syscall_number_to_name(SYS_FADVISE64), Some("fadvise64"));
        assert_eq!(syscall_number_to_name(SYS_SENDFILE), Some("sendfile"));
    }

    /// Three named records produce one allow group holding those three names.
    #[test]
    fn test_generate_from_trace() {
        // Test trace data: one NDJSON record per line.
        let trace = concat!(
            r#"{"syscall":0,"name":"read","count":10}"#,
            "\n",
            r#"{"syscall":1,"name":"write","count":5}"#,
            "\n",
            r#"{"syscall":257,"name":"openat","count":3}"#,
            "\n",
        );

        let profile = profile_from_trace(trace);
        assert_eq!(profile.default_action, "SCMP_ACT_KILL_PROCESS");
        assert_eq!(profile.syscalls.len(), 1);

        let names = &profile.syscalls[0].names;
        assert_eq!(names.len(), 3);
        for expected in ["read", "write", "openat"] {
            assert!(names.contains(&expected.to_string()));
        }
    }

    /// A profile serializes with camelCase keys and survives a JSON roundtrip.
    #[test]
    fn test_profile_serialization() {
        let allow_group = SeccompSyscallGroup {
            names: vec!["read".to_string(), "write".to_string()],
            action: "SCMP_ACT_ALLOW".to_string(),
            args: vec![],
        };
        let profile = SeccompProfile {
            default_action: "SCMP_ACT_KILL_PROCESS".to_string(),
            architectures: vec!["SCMP_ARCH_X86_64".to_string()],
            syscalls: vec![allow_group],
        };

        let json = serde_json::to_string_pretty(&profile).unwrap();
        for fragment in ["\"defaultAction\"", "SCMP_ACT_KILL_PROCESS", "\"read\""] {
            assert!(json.contains(fragment));
        }

        // Roundtrip
        let parsed: SeccompProfile = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.syscalls[0].names.len(), 2);
    }

    /// The architecture constant agrees with the compilation target.
    #[test]
    fn test_native_scmp_arch_matches_target() {
        let arch = native_scmp_arch();
        if cfg!(target_arch = "x86_64") {
            assert_eq!(arch, "SCMP_ARCH_X86_64");
        }
        if cfg!(target_arch = "aarch64") {
            assert_eq!(arch, "SCMP_ARCH_AARCH64");
        }
        // Always starts with SCMP_ARCH_
        assert!(arch.starts_with("SCMP_ARCH_"));
    }

    /// Generated profiles list exactly the native architecture.
    #[test]
    fn test_generated_profile_uses_native_arch() {
        let trace = concat!(r#"{"syscall":0,"name":"read","count":1}"#, "\n");

        let profile = profile_from_trace(trace);
        assert_eq!(profile.architectures.len(), 1);
        assert_eq!(profile.architectures[0], native_scmp_arch());
    }
}