1use crate::error::{NucleusError, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::{BTreeMap, BTreeSet};
15use std::io::{BufRead, BufReader, Write};
16use std::path::{Path, PathBuf};
17use std::sync::atomic::{AtomicBool, Ordering};
18use std::sync::Arc;
19use std::thread::JoinHandle;
20use std::time::{Duration, Instant};
21use tracing::{debug, info, warn};
22
23const DENY_SCOPE_REFRESH_INTERVAL: Duration = Duration::from_millis(250);
24const DENY_SCOPE_STALE_PID_TTL: Duration = Duration::from_secs(5);
25const DENY_SCOPE_POLL_TIMEOUT_MS: libc::c_int = 250;
26const PROC_ROOT: &str = "/proc";
27const CGROUP_V2_ROOT: &str = "/sys/fs/cgroup";
28
29#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct TraceRecord {
32 pub syscall: i64,
34 pub name: Option<String>,
36 pub count: u64,
38}
39
40pub struct SeccompTraceReader {
42 pid: u32,
43 output_path: PathBuf,
44 stop: Arc<AtomicBool>,
45 handle: Option<JoinHandle<()>>,
46}
47
48impl SeccompTraceReader {
49 pub fn new(pid: u32, output_path: &Path) -> Self {
51 Self {
52 pid,
53 output_path: output_path.to_path_buf(),
54 stop: Arc::new(AtomicBool::new(false)),
55 handle: None,
56 }
57 }
58
59 pub fn start_recording(&mut self) -> Result<()> {
64 let pid = self.pid;
65 let output_path = self.output_path.clone();
66 let stop = self.stop.clone();
67
68 let handle = std::thread::spawn(move || {
69 if let Err(e) = record_loop(pid, &output_path, &stop) {
70 warn!("Seccomp trace reader error: {}", e);
71 }
72 });
73
74 self.handle = Some(handle);
75 info!("Seccomp trace reader started for PID {}", self.pid);
76 Ok(())
77 }
78
79 pub fn stop_and_flush(mut self) {
81 self.stop.store(true, Ordering::Release);
82 if let Some(handle) = self.handle.take() {
83 let _ = handle.join();
84 }
85 info!(
86 "Seccomp trace reader stopped, output at {:?}",
87 self.output_path
88 );
89 }
90}
91
92impl Drop for SeccompTraceReader {
93 fn drop(&mut self) {
94 self.stop.store(true, Ordering::Release);
95 if let Some(handle) = self.handle.take() {
96 let _ = handle.join();
97 }
98 }
99}
100
101fn record_loop(pid: u32, output_path: &Path, stop: &AtomicBool) -> Result<()> {
103 let mut syscalls: BTreeMap<i64, u64> = BTreeMap::new();
104
105 let kmsg_path = std::path::Path::new("/dev/kmsg");
107 if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
108 if meta.file_type().is_symlink() {
109 warn!("/dev/kmsg is a symlink – refusing to open for seccomp tracing");
110 write_trace_file(output_path, &syscalls)?;
111 return Ok(());
112 }
113 }
114
115 let file = match std::fs::File::open(kmsg_path) {
117 Ok(f) => f,
118 Err(e) => {
119 warn!(
120 "Cannot open /dev/kmsg for seccomp tracing: {} \
121 (requires root or CAP_SYSLOG). Falling back to no-trace mode.",
122 e
123 );
124 write_trace_file(output_path, &syscalls)?;
126 return Ok(());
127 }
128 };
129
130 use std::os::unix::io::AsRawFd;
135 let fd = file.as_raw_fd();
136 unsafe {
140 let flags = libc::fcntl(fd, libc::F_GETFL);
141 if flags >= 0 {
142 libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
143 }
144 }
145
146 let reader = BufReader::new(file);
147 let pid_pattern = format!("pid={}", pid);
148
149 for line in reader.lines() {
150 if stop.load(Ordering::Acquire) {
151 break;
152 }
153
154 let line = match line {
155 Ok(l) => l,
156 Err(e) => {
157 if e.kind() == std::io::ErrorKind::WouldBlock {
158 let mut pfd = libc::pollfd {
160 fd,
161 events: libc::POLLIN,
162 revents: 0,
163 };
164 unsafe { libc::poll(&mut pfd, 1, 2000) };
167 continue;
168 }
169 debug!("kmsg read error: {}", e);
170 continue;
171 }
172 };
173
174 if line.contains("type=1326") && line.contains(&pid_pattern) {
177 if let Some(nr) = extract_syscall_nr(&line) {
178 *syscalls.entry(nr).or_insert(0) += 1;
179 }
180 }
181 }
182
183 write_trace_file(output_path, &syscalls)?;
184 info!("Seccomp trace: recorded {} unique syscalls", syscalls.len());
185 Ok(())
186}
187
/// Parses the value of the first whitespace-separated `syscall=` field.
///
/// Returns `None` when no such field exists or when the first one does not
/// hold a valid `i64`.
fn extract_syscall_nr(line: &str) -> Option<i64> {
    let raw = line
        .split_whitespace()
        .find_map(|token| token.strip_prefix("syscall="))?;
    raw.parse().ok()
}
196
/// Parses the value of the first whitespace-separated `pid=` field.
///
/// Tokens such as `ppid=` are not considered because the token must begin
/// with `pid=`. Returns `None` when no such field exists or when the first
/// one does not hold a valid `u32`.
fn extract_audit_pid(line: &str) -> Option<u32> {
    let raw = line
        .split_whitespace()
        .find_map(|token| token.strip_prefix("pid="))?;
    raw.parse().ok()
}
204
205fn write_trace_file(path: &Path, syscalls: &BTreeMap<i64, u64>) -> Result<()> {
207 let mut file = std::fs::File::create(path).map_err(|e| {
208 NucleusError::ConfigError(format!("Failed to create trace file {:?}: {}", path, e))
209 })?;
210
211 for (&nr, &count) in syscalls {
212 let record = TraceRecord {
213 syscall: nr,
214 name: super::seccomp_generate::syscall_number_to_name(nr).map(String::from),
215 count,
216 };
217 let line =
218 serde_json::to_string(&record).unwrap_or_else(|e| format!("{{\"error\":\"{}\"}}", e));
219 writeln!(file, "{}", line).map_err(|e| {
220 NucleusError::ConfigError(format!("Failed to write trace record: {}", e))
221 })?;
222 }
223
224 Ok(())
225}
226
227pub struct SeccompDenyLogger {
236 pid: u32,
237 cgroup_path: Option<PathBuf>,
238 stop: Arc<AtomicBool>,
239 handle: Option<JoinHandle<()>>,
240}
241
242impl SeccompDenyLogger {
243 pub fn new(pid: u32, cgroup_path: Option<PathBuf>) -> Self {
244 Self {
245 pid,
246 cgroup_path,
247 stop: Arc::new(AtomicBool::new(false)),
248 handle: None,
249 }
250 }
251
252 pub fn start(&mut self) -> Result<()> {
254 let pid = self.pid;
255 let cgroup_path = self.cgroup_path.clone();
256 let stop = self.stop.clone();
257
258 let handle = std::thread::spawn(move || {
259 if let Err(e) = deny_log_loop(pid, cgroup_path, &stop) {
260 warn!("Seccomp deny logger error: {}", e);
261 }
262 });
263
264 self.handle = Some(handle);
265 debug!(
266 cgroup = self
267 .cgroup_path
268 .as_ref()
269 .map(|path| path.display().to_string()),
270 "Seccomp deny logger started for PID {}", self.pid
271 );
272 Ok(())
273 }
274
275 pub fn stop(mut self) {
277 self.stop.store(true, Ordering::Release);
278 if let Some(handle) = self.handle.take() {
279 let _ = handle.join();
280 }
281 }
282}
283
284impl Drop for SeccompDenyLogger {
285 fn drop(&mut self) {
286 self.stop.store(true, Ordering::Release);
287 if let Some(handle) = self.handle.take() {
288 let _ = handle.join();
289 }
290 }
291}
292
293#[derive(Debug)]
294struct SeccompDenyScope {
295 target_pid: u32,
296 proc_root: PathBuf,
297 cgroup_path: Option<PathBuf>,
298 cgroup_relative_path: Option<String>,
299 target_pid_namespace: Option<String>,
300 known_pids: BTreeMap<u32, Instant>,
301 last_refresh: Option<Instant>,
302}
303
304impl SeccompDenyScope {
305 fn new(target_pid: u32, cgroup_path: Option<PathBuf>) -> Self {
306 Self::with_proc_root(target_pid, PathBuf::from(PROC_ROOT), cgroup_path, None)
307 }
308
309 fn with_proc_root(
310 target_pid: u32,
311 proc_root: PathBuf,
312 cgroup_path: Option<PathBuf>,
313 cgroup_relative_path: Option<String>,
314 ) -> Self {
315 let cgroup_relative_path = cgroup_relative_path.or_else(|| {
316 cgroup_path
317 .as_deref()
318 .and_then(cgroup_relative_path_from_host_path)
319 });
320 Self {
321 target_pid,
322 proc_root,
323 cgroup_path,
324 cgroup_relative_path,
325 target_pid_namespace: None,
326 known_pids: BTreeMap::new(),
327 last_refresh: None,
328 }
329 }
330
331 fn matches_pid(&mut self, pid: u32, now: Instant) -> bool {
332 if pid == self.target_pid {
333 self.remember_pid(pid, now);
334 return true;
335 }
336
337 self.refresh_if_stale(now);
338 if self.has_recent_pid(pid, now) {
339 return true;
340 }
341
342 self.refresh(now);
346 if self.has_recent_pid(pid, now) {
347 return true;
348 }
349
350 if self.process_matches_cgroup(pid) || self.process_matches_pid_namespace(pid) {
351 self.remember_pid(pid, now);
352 return true;
353 }
354
355 false
356 }
357
358 fn refresh_if_stale(&mut self, now: Instant) {
359 let should_refresh = self
360 .last_refresh
361 .and_then(|last| now.checked_duration_since(last))
362 .map(|age| age >= DENY_SCOPE_REFRESH_INTERVAL)
363 .unwrap_or(true);
364 if should_refresh {
365 self.refresh(now);
366 }
367 }
368
369 fn refresh(&mut self, now: Instant) {
370 self.expire_stale_pids(now);
371 self.remember_pid(self.target_pid, now);
372
373 if self.target_pid_namespace.is_none() {
374 self.target_pid_namespace = read_pid_namespace(&self.proc_root, self.target_pid);
375 }
376
377 let mut scoped_pids = BTreeSet::new();
378 collect_process_tree_pids(&self.proc_root, self.target_pid, &mut scoped_pids);
379 for pid in scoped_pids {
380 self.remember_pid(pid, now);
381 }
382
383 if let Some(cgroup_path) = &self.cgroup_path {
384 for pid in read_pids_from_file(&cgroup_path.join("cgroup.procs")) {
385 self.remember_pid(pid, now);
386 }
387 }
388
389 self.last_refresh = Some(now);
390 }
391
392 fn remember_pid(&mut self, pid: u32, now: Instant) {
393 self.known_pids.insert(pid, now);
394 }
395
396 fn has_recent_pid(&self, pid: u32, now: Instant) -> bool {
397 self.known_pids
398 .get(&pid)
399 .map(|seen| is_recent(*seen, now))
400 .unwrap_or(false)
401 }
402
403 fn expire_stale_pids(&mut self, now: Instant) {
404 self.known_pids.retain(|_, seen| is_recent(*seen, now));
405 }
406
407 fn process_matches_cgroup(&self, pid: u32) -> bool {
408 let Some(expected) = self.cgroup_relative_path.as_deref() else {
409 return false;
410 };
411 let cgroup_file = self.proc_root.join(pid.to_string()).join("cgroup");
412 let Ok(content) = std::fs::read_to_string(cgroup_file) else {
413 return false;
414 };
415 cgroup_content_matches_path(&content, expected)
416 }
417
418 fn process_matches_pid_namespace(&self, pid: u32) -> bool {
419 let Some(target_ns) = self.target_pid_namespace.as_deref() else {
420 return false;
421 };
422 read_pid_namespace(&self.proc_root, pid)
423 .as_deref()
424 .map(|pid_ns| pid_ns == target_ns)
425 .unwrap_or(false)
426 }
427}
428
429fn is_recent(seen: Instant, now: Instant) -> bool {
430 now.checked_duration_since(seen)
431 .map(|age| age <= DENY_SCOPE_STALE_PID_TTL)
432 .unwrap_or(true)
433}
434
435fn collect_process_tree_pids(proc_root: &Path, root_pid: u32, out: &mut BTreeSet<u32>) {
436 let mut stack = vec![root_pid];
437 let mut visited = BTreeSet::new();
438
439 while let Some(pid) = stack.pop() {
440 if !visited.insert(pid) {
441 continue;
442 }
443 out.insert(pid);
444 stack.extend(read_child_pids(proc_root, pid));
445 }
446}
447
448fn read_child_pids(proc_root: &Path, pid: u32) -> Vec<u32> {
449 let task_dir = proc_root.join(pid.to_string()).join("task");
450 let Ok(entries) = std::fs::read_dir(task_dir) else {
451 return Vec::new();
452 };
453
454 let mut children = Vec::new();
455 for entry in entries.flatten() {
456 let children_path = entry.path().join("children");
457 if let Ok(content) = std::fs::read_to_string(children_path) {
458 children.extend(parse_pid_list(&content));
459 }
460 }
461 children
462}
463
464fn read_pids_from_file(path: &Path) -> Vec<u32> {
465 std::fs::read_to_string(path)
466 .map(|content| parse_pid_list(&content))
467 .unwrap_or_default()
468}
469
/// Parses a whitespace-separated list of PIDs, silently skipping tokens that
/// are not valid `u32` values.
fn parse_pid_list(content: &str) -> Vec<u32> {
    let mut pids = Vec::new();
    for token in content.split_whitespace() {
        if let Ok(pid) = token.parse::<u32>() {
            pids.push(pid);
        }
    }
    pids
}
476
/// Returns the target text of the `<proc_root>/<pid>/ns/pid` symlink, or
/// `None` when the link cannot be read.
fn read_pid_namespace(proc_root: &Path, pid: u32) -> Option<String> {
    let ns_link = proc_root.join(pid.to_string()).join("ns").join("pid");
    match std::fs::read_link(ns_link) {
        Ok(target) => Some(target.to_string_lossy().into_owned()),
        Err(_) => None,
    }
}
482
483fn cgroup_relative_path_from_host_path(cgroup_path: &Path) -> Option<String> {
484 let root = Path::new(CGROUP_V2_ROOT);
485 let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
486 let canonical_path =
487 std::fs::canonicalize(cgroup_path).unwrap_or_else(|_| cgroup_path.to_path_buf());
488 let relative = canonical_path.strip_prefix(canonical_root).ok()?;
489 Some(normalize_cgroup_path(&format!(
490 "/{}",
491 relative.to_string_lossy()
492 )))
493}
494
495fn cgroup_content_matches_path(content: &str, expected: &str) -> bool {
496 let expected = normalize_cgroup_path(expected);
497 content
498 .lines()
499 .filter_map(|line| line.rsplit_once(':').map(|(_, path)| path.trim()))
500 .any(|actual| cgroup_path_contains(&normalize_cgroup_path(actual), &expected))
501}
502
/// Returns `true` when `actual` is `expected` itself or lies beneath it.
///
/// The prefix must end on a path-component boundary, so `/a` does not contain
/// `/ab`. The root path `/` only matches itself.
fn cgroup_path_contains(actual: &str, expected: &str) -> bool {
    match actual.strip_prefix(expected) {
        _ if expected == "/" => actual == "/",
        Some(rest) => rest.is_empty() || rest.starts_with('/'),
        None => false,
    }
}
513
/// Normalizes a cgroup path: surrounding whitespace and trailing slashes are
/// removed and a leading slash is ensured. An empty result becomes `/`.
fn normalize_cgroup_path(path: &str) -> String {
    let core = path.trim().trim_end_matches('/');
    match core.chars().next() {
        None => String::from("/"),
        Some('/') => core.to_owned(),
        Some(_) => format!("/{}", core),
    }
}
525
526fn deny_log_loop(pid: u32, cgroup_path: Option<PathBuf>, stop: &AtomicBool) -> Result<()> {
528 let kmsg_path = std::path::Path::new("/dev/kmsg");
529 if let Ok(meta) = std::fs::symlink_metadata(kmsg_path) {
530 if meta.file_type().is_symlink() {
531 warn!("/dev/kmsg is a symlink – refusing to open for seccomp deny logging");
532 return Ok(());
533 }
534 }
535
536 let file = match std::fs::File::open(kmsg_path) {
537 Ok(f) => f,
538 Err(e) => {
539 warn!(
540 "Cannot open /dev/kmsg for seccomp deny logging: {} \
541 (requires root or CAP_SYSLOG)",
542 e
543 );
544 return Ok(());
545 }
546 };
547
548 use std::os::unix::io::AsRawFd;
549 let fd = file.as_raw_fd();
550 unsafe {
554 let flags = libc::fcntl(fd, libc::F_GETFL);
555 if flags >= 0 {
556 libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK);
557 }
558 }
559
560 let reader = BufReader::new(file);
561 let mut scope = SeccompDenyScope::new(pid, cgroup_path);
562 scope.refresh(Instant::now());
563
564 for line in reader.lines() {
565 if stop.load(Ordering::Acquire) {
566 break;
567 }
568
569 let line = match line {
570 Ok(l) => l,
571 Err(e) => {
572 if e.kind() == std::io::ErrorKind::WouldBlock {
573 scope.refresh_if_stale(Instant::now());
574 let mut pfd = libc::pollfd {
575 fd,
576 events: libc::POLLIN,
577 revents: 0,
578 };
579 unsafe { libc::poll(&mut pfd, 1, DENY_SCOPE_POLL_TIMEOUT_MS) };
582 continue;
583 }
584 debug!("kmsg read error: {}", e);
585 continue;
586 }
587 };
588
589 if let Some((audit_pid, nr)) =
590 denied_syscall_record_for_scope(&line, &mut scope, Instant::now())
591 {
592 let name = super::seccomp_generate::syscall_number_to_name(nr).unwrap_or("unknown");
593 warn!(
594 syscall = nr,
595 name = name,
596 pid = audit_pid,
597 target_pid = pid,
598 "seccomp denied syscall"
599 );
600 }
601 }
602
603 Ok(())
604}
605
606fn denied_syscall_record_for_scope(
607 line: &str,
608 scope: &mut SeccompDenyScope,
609 now: Instant,
610) -> Option<(u32, i64)> {
611 if !line.contains("type=1326") {
612 return None;
613 }
614 let pid = extract_audit_pid(line)?;
615 if !scope.matches_pid(pid, now) {
616 return None;
617 }
618 extract_syscall_nr(line).map(|nr| (pid, nr))
619}
620
// Unit tests for audit-line parsing, deny-scope membership, cgroup path
// matching, and trace-record serialization.
#[cfg(test)]
mod tests {
    use super::*;

    // A full kmsg-style audit line yields the value of its `syscall=` field.
    #[test]
    fn test_extract_syscall_nr() {
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=42 comm=\"test\" exe=\"/bin/test\" sig=0 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x7fff0000";
        assert_eq!(extract_syscall_nr(line), Some(257));
    }

    // A line without a `syscall=` token yields `None`.
    #[test]
    fn test_extract_syscall_nr_missing() {
        assert_eq!(extract_syscall_nr("no syscall here"), None);
    }

    // `ppid=` precedes `pid=` here; only the token starting with `pid=` counts.
    #[test]
    fn test_extract_audit_pid_ignores_ppid() {
        let line = "audit: type=1326 audit(123:456): ppid=7 pid=42 comm=\"test\" syscall=257";
        assert_eq!(extract_audit_pid(line), Some(42));
    }

    // PID 43 is listed as a child of target 42 in a fake procfs tree, so its
    // audit record is accepted.
    #[test]
    fn test_deny_scope_matches_forked_child_audit_pid() {
        let temp = tempfile::tempdir().unwrap();
        let target_task = temp.path().join("42/task/42");
        std::fs::create_dir_all(&target_task).unwrap();
        std::fs::write(target_task.join("children"), "43\n").unwrap();

        let mut scope = SeccompDenyScope::with_proc_root(42, temp.path().to_path_buf(), None, None);
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"probe\" exe=\"/bin/probe\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";

        assert_eq!(
            denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
            Some((43, 257))
        );
    }

    // Target 42 has no children and no cgroup is configured, so a record for
    // PID 43 is rejected.
    #[test]
    fn test_deny_scope_rejects_unrelated_seccomp_pid() {
        let temp = tempfile::tempdir().unwrap();
        let target_task = temp.path().join("42/task/42");
        std::fs::create_dir_all(&target_task).unwrap();
        std::fs::write(target_task.join("children"), "").unwrap();

        let mut scope = SeccompDenyScope::with_proc_root(42, temp.path().to_path_buf(), None, None);
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"other\" exe=\"/bin/other\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";

        assert_eq!(
            denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
            None
        );
    }

    // PID 43 is not a child of 42 but is listed in the fake `cgroup.procs`
    // file, which is enough to accept its record.
    #[test]
    fn test_deny_scope_matches_cgroup_member_audit_pid() {
        let temp = tempfile::tempdir().unwrap();
        let proc_root = temp.path().join("proc");
        let cgroup_dir = temp.path().join("cgroup");
        std::fs::create_dir_all(proc_root.join("42/task/42")).unwrap();
        std::fs::create_dir_all(proc_root.join("43")).unwrap();
        std::fs::create_dir_all(&cgroup_dir).unwrap();
        std::fs::write(proc_root.join("42/task/42/children"), "").unwrap();
        std::fs::write(cgroup_dir.join("cgroup.procs"), "43\n").unwrap();

        let mut scope = SeccompDenyScope::with_proc_root(
            42,
            proc_root,
            Some(cgroup_dir),
            Some("/nucleus-test".to_string()),
        );
        let line = "6,1234,5678,-;audit: type=1326 audit(123:456): auid=0 uid=0 gid=0 ses=1 pid=43 comm=\"probe\" exe=\"/bin/probe\" sig=31 arch=c000003e syscall=257 compat=0 ip=0x7f action=0x80000000";

        assert_eq!(
            denied_syscall_record_for_scope(line, &mut scope, Instant::now()),
            Some((43, 257))
        );
    }

    // A sub-cgroup of the expected path matches; a sibling that merely shares
    // a name prefix does not.
    #[test]
    fn test_cgroup_content_matches_subgroup_membership() {
        assert!(cgroup_content_matches_path(
            "0::/nucleus-test/workers\n",
            "/nucleus-test"
        ));
        assert!(!cgroup_content_matches_path(
            "0::/nucleus-other\n",
            "/nucleus-test"
        ));
    }

    // Returns the slice of `source` from the first occurrence of
    // `fn_signature` through the brace that closes the first brace opened
    // after it. Panics when the signature or an opening brace is missing.
    // Every brace character is counted, including any inside string literals
    // or comments.
    fn extract_fn_body<'a>(source: &'a str, fn_signature: &str) -> &'a str {
        let fn_start = source
            .find(fn_signature)
            .unwrap_or_else(|| panic!("function '{}' not found in source", fn_signature));
        let after = &source[fn_start..];
        let open = after
            .find('{')
            .unwrap_or_else(|| panic!("no opening brace found for '{}'", fn_signature));
        let mut depth = 0u32;
        let mut end = open;
        for (i, ch) in after[open..].char_indices() {
            match ch {
                '{' => depth += 1,
                '}' => {
                    depth -= 1;
                    if depth == 0 {
                        // `i` is relative to `open`; +1 includes the brace.
                        end = open + i + 1;
                        break;
                    }
                }
                _ => {}
            }
        }
        &after[..end]
    }

    // Source-level guard: inspects this file's own text to check that the
    // trace reader's loop mentions non-blocking mode and poll(), and never
    // calls setsockopt outside of `//` comment lines.
    #[test]
    fn test_reader_uses_nonblocking_io() {
        let source = include_str!("seccomp_trace.rs");
        let fn_body = extract_fn_body(source, "fn record_loop");
        assert!(
            fn_body.contains("O_NONBLOCK"),
            "record_loop must use O_NONBLOCK for non-blocking reads on /dev/kmsg"
        );
        assert!(
            fn_body.contains("libc::poll"),
            "record_loop must use poll() for timed waits on /dev/kmsg"
        );
        // Lines that start with `//` are skipped so a comment may mention it.
        let setsockopt_lines: Vec<&str> = fn_body
            .lines()
            .filter(|l| !l.trim().starts_with("//"))
            .filter(|l| l.contains("setsockopt"))
            .collect();
        assert!(
            setsockopt_lines.is_empty(),
            "record_loop must not call setsockopt on /dev/kmsg"
        );
    }

    // The JSON encoding of a record contains all three fields by name.
    #[test]
    fn test_trace_record_serialization() {
        let record = TraceRecord {
            syscall: 0,
            name: Some("read".to_string()),
            count: 42,
        };
        let json = serde_json::to_string(&record).unwrap();
        assert!(json.contains("\"syscall\":0"));
        assert!(json.contains("\"name\":\"read\""));
        assert!(json.contains("\"count\":42"));
    }
}