1use crate::{
2 common::non_nul_string::NonNulString,
3 npk::manifest::capabilities::Capability,
4 seccomp::{profiles::default, Profile, SyscallArgRule, SyscallRule},
5};
6use anyhow::{bail, Result};
7use bindings::{
8 seccomp_data, sock_filter, sock_fprog, BPF_ABS, BPF_ALU, BPF_AND, BPF_IMM, BPF_JEQ, BPF_JMP,
9 BPF_K, BPF_LD, BPF_MAXINSNS, BPF_MEM, BPF_NEG, BPF_OR, BPF_RET, BPF_ST, BPF_W, SYSCALL_MAP,
10};
11use log::trace;
12use nix::errno::Errno;
13use serde::{Deserialize, Deserializer, Serialize, Serializer};
14use std::{
15 collections::{HashMap, HashSet},
16 mem::size_of,
17};
18
/// Generated bindings: syscall numbers/name map and seccomp/BPF constants
/// produced by the build script into `OUT_DIR`.
#[allow(unused, non_snake_case, non_camel_case_types, non_upper_case_globals)]
mod bindings {
    include!(concat!(env!("OUT_DIR"), "/syscall_bindings.rs"));
    include!(concat!(env!("OUT_DIR"), "/seccomp_bindings.rs"));
}
24
/// Audit architecture token compared against `seccomp_data.arch` when the
/// filter runs, so a filter built for one ABI cannot be applied to another.
#[cfg(target_arch = "aarch64")]
const AUDIT_ARCH: u32 = bindings::AUDIT_ARCH_AARCH64;
#[cfg(target_arch = "x86_64")]
const AUDIT_ARCH: u32 = bindings::AUDIT_ARCH_X86_64;

/// Syscalls every filter must allow; added unconditionally in `Builder::new`.
const REQUIRED_SYSCALLS: &[u32] = &[bindings::SYS_execve];

/// BPF relative jump offsets: 0 falls through to the next instruction,
/// 1 skips exactly one instruction.
const EVAL_NEXT: u8 = 0;
const SKIP_NEXT: u8 = 1;
37
38pub fn seccomp_filter(
40 profile: Option<&Profile>,
41 rules: Option<&HashMap<NonNulString, SyscallRule>>,
42 caps: &HashSet<Capability>,
43) -> AllowList {
44 check_platform_requirements();
45
46 let mut builder = Builder::new();
47 if let Some(profile) = profile {
48 builder.extend(builder_from_profile(profile, caps));
49 }
50 if let Some(rules) = rules {
51 builder.extend(builder_from_rules(rules));
52 }
53 builder.build()
54}
55
56pub(crate) fn builder_from_rules(rules: &HashMap<NonNulString, SyscallRule>) -> Builder {
58 let mut builder = Builder::new();
59 for (name, call_rule) in rules {
60 let arg_rule = match call_rule {
61 SyscallRule::Any => None,
62 SyscallRule::Args(a) => Some(a),
63 };
64 if let Err(e) = builder.allow_syscall_name(name, arg_rule.cloned()) {
65 trace!("failed to allow syscall {}: {}", &name.to_string(), e);
67 }
68 }
69 builder
70}
71
/// Create a [`Builder`] for a predefined profile, extended with the extra
/// syscalls each of the container's capabilities needs.
fn builder_from_profile(profile: &Profile, caps: &HashSet<Capability>) -> Builder {
    match profile {
        Profile::Default => {
            // Start from the base default profile.
            let mut builder = default::BASE.clone();

            if !caps.is_empty() {
                let mut cap_sys_admin = false;
                for cap in caps {
                    match cap {
                        // Capabilities with empty arms need no syscalls beyond
                        // the base profile.
                        Capability::CAP_CHOWN => {}
                        Capability::CAP_DAC_OVERRIDE => {}
                        Capability::CAP_DAC_READ_SEARCH => {
                            builder.extend(default::CAP_DAC_READ_SEARCH.clone());
                        }
                        Capability::CAP_FOWNER => {}
                        Capability::CAP_FSETID => {}
                        Capability::CAP_KILL => {}
                        Capability::CAP_SETGID => {}
                        Capability::CAP_SETUID => {}
                        Capability::CAP_SETPCAP => {}
                        Capability::CAP_LINUX_IMMUTABLE => {}
                        Capability::CAP_NET_BIND_SERVICE => {}
                        Capability::CAP_NET_BROADCAST => {}
                        Capability::CAP_NET_ADMIN => {}
                        Capability::CAP_NET_RAW => {}
                        Capability::CAP_IPC_LOCK => {}
                        Capability::CAP_IPC_OWNER => {}
                        Capability::CAP_SYS_MODULE => {
                            builder.extend(default::CAP_SYS_MODULE.clone());
                        }
                        Capability::CAP_SYS_RAWIO => {
                            builder.extend(default::CAP_SYS_RAWIO.clone());
                        }
                        Capability::CAP_SYS_CHROOT => {
                            builder.extend(default::CAP_SYS_CHROOT.clone());
                        }
                        Capability::CAP_SYS_PTRACE => {
                            builder.extend(default::CAP_SYS_PTRACE.clone());
                        }
                        Capability::CAP_SYS_PACCT => {
                            builder.extend(default::CAP_SYS_PACCT.clone());
                        }
                        Capability::CAP_SYS_ADMIN => {
                            // Remember that CAP_SYS_ADMIN is present so the
                            // NON_CAP_SYS_ADMIN extension below is skipped.
                            cap_sys_admin = true;
                            builder.extend(default::CAP_SYS_ADMIN.clone());
                        }
                        Capability::CAP_SYS_BOOT => {
                            builder.extend(default::CAP_SYS_BOOT.clone());
                        }
                        Capability::CAP_SYS_NICE => {
                            builder.extend(default::CAP_SYS_NICE.clone());
                        }
                        Capability::CAP_SYS_RESOURCE => {}
                        Capability::CAP_SYS_TIME => {
                            builder.extend(default::CAP_SYS_TIME.clone());
                        }
                        Capability::CAP_SYS_TTY_CONFIG => {
                            builder.extend(default::CAP_SYS_TTY_CONFIG.clone());
                        }
                        Capability::CAP_MKNOD => {}
                        Capability::CAP_LEASE => {}
                        Capability::CAP_AUDIT_WRITE => {}
                        Capability::CAP_AUDIT_CONTROL => {}
                        Capability::CAP_SETFCAP => {}
                        Capability::CAP_MAC_OVERRIDE => {}
                        Capability::CAP_MAC_ADMIN => {}
                        Capability::CAP_SYSLOG => {
                            builder.extend(default::CAP_SYSLOG.clone());
                        }
                        Capability::CAP_WAKE_ALARM => {}
                        Capability::CAP_BLOCK_SUSPEND => {}
                        Capability::CAP_AUDIT_READ => {}
                        Capability::CAP_PERFMON => {}
                        Capability::CAP_BPF => {}
                        Capability::CAP_CHECKPOINT_RESTORE => {}
                    };
                }
                // Without CAP_SYS_ADMIN, apply the NON_CAP_SYS_ADMIN profile
                // extension instead of the CAP_SYS_ADMIN set.
                if !cap_sys_admin {
                    builder.extend(default::NON_CAP_SYS_ADMIN.clone());
                }
            }
            builder
        }
    }
}
159
/// Compile-time guards: this seccomp implementation only supports 64 bit
/// little-endian aarch64 and x86_64 targets. The function body is empty at
/// runtime — unsupported targets fail to compile.
fn check_platform_requirements() {
    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
    compile_error!("seccomp is only supported on aarch64 and x86_64");
    #[cfg(target_pointer_width = "32")]
    compile_error!("seccomp is not supported on 32 Bit architectures");
    #[cfg(target_endian = "big")]
    compile_error!("seccomp is not supported on Big Endian architectures");
}
169
/// One BPF instruction, mirroring the kernel's `sock_filter` layout
/// (see the `From<&SockFilter> for sock_filter` impl below).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SockFilter {
    /// Opcode (BPF_LD, BPF_JMP, … combinations).
    pub code: u16,
    /// Relative jump offset taken when a jump condition is true.
    pub jt: u8,
    /// Relative jump offset taken when a jump condition is false.
    pub jf: u8,
    /// Generic constant operand (immediate, offset or return value).
    pub k: u32,
}
177
178impl Serialize for SockFilter {
179 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
180 where
181 S: Serializer,
182 {
183 let a = (self.code as u32) << 16 | (self.jt as u32) << 8 | self.jf as u32;
184 let value = (a as u64) << 32 | self.k as u64;
185 serializer.serialize_u64(value)
186 }
187}
188
189impl<'de> Deserialize<'de> for SockFilter {
190 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
191 where
192 D: Deserializer<'de>,
193 {
194 let value = u64::deserialize(deserializer)?;
195 let a = (value >> 32) as u32;
196 let code = ((a & 0xFFFF0000) >> 16) as u16;
197 let jt = ((a & 0xFF00) >> 8) as u8;
198 let jf = (a & 0xFF) as u8;
199 let k = (value & 0xFFFFFFFF) as u32;
200 Ok(SockFilter { code, jt, jf, k })
201 }
202}
203
204impl From<&SockFilter> for sock_filter {
205 fn from(s: &SockFilter) -> sock_filter {
206 sock_filter {
207 code: s.code,
208 jt: s.jt,
209 jf: s.jf,
210 k: s.k,
211 }
212 }
213}
214
/// A compiled seccomp program: a flat list of BPF instructions ready to be
/// installed with [`AllowList::apply`].
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct AllowList {
    list: Vec<SockFilter>,
}
220
impl AllowList {
    /// Install this filter for the calling thread via
    /// `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)`.
    ///
    /// # Errors
    ///
    /// Fails if the program exceeds `BPF_MAXINSNS` instructions or if the
    /// kernel rejects the `prctl` call.
    pub fn apply(&self) -> Result<()> {
        // NOTE(review): presumably Android's libc bindings do not export
        // these constants, hence the manual definitions — verify against
        // bionic.
        #[cfg(target_os = "android")]
        const PR_SET_SECCOMP: nix::libc::c_int = 22;
        #[cfg(target_os = "android")]
        const SECCOMP_MODE_FILTER: nix::libc::c_int = 2;

        #[cfg(not(target_os = "android"))]
        use nix::libc::{PR_SET_SECCOMP, SECCOMP_MODE_FILTER};

        // Reject over-long programs early with a readable error.
        if self.list.len() > BPF_MAXINSNS as usize {
            bail!("seccomp filter list exceeds maximum number of BPF statements");
        }

        // Convert to the C ABI representation.
        let list = self
            .list
            .iter()
            .map(Into::into)
            .collect::<Vec<sock_filter>>();

        let sf_prog = sock_fprog {
            len: list.len() as u16,
            filter: list.as_ptr() as *mut bindings::sock_filter,
        };
        let sf_prog_ptr = &sf_prog as *const sock_fprog;
        // `list` and `sf_prog` both outlive the prctl call below.
        let result = unsafe { nix::libc::prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, sf_prog_ptr) };
        Errno::result(result)?;
        Ok(())
    }
}
253
/// A syscall rule resolved to its numeric syscall id, with an optional
/// restriction on the syscall's arguments.
#[derive(Clone, Eq, PartialEq, Debug)]
struct NumericSyscallRule {
    /// Syscall number on the target architecture.
    nr: u32,
    /// Optional argument value/mask restriction; `None` allows any arguments.
    arg_rule: Option<SyscallArgRule>,
}
261
/// Collects allowed syscalls and compiles them into an [`AllowList`].
#[derive(Default, Clone)]
pub struct Builder {
    /// Accumulated rules; sorted and deduplicated in `build`.
    allowlist: Vec<NumericSyscallRule>,
    /// If true, violations are logged instead of killing the process.
    log_only: bool,
}
268
impl Builder {
    /// Create a builder preseeded with `REQUIRED_SYSCALLS`.
    pub fn new() -> Self {
        let mut builder: Builder = Default::default();

        // These syscalls must always be allowed.
        for syscall in REQUIRED_SYSCALLS {
            builder.allow_syscall_nr(*syscall, None);
        }
        builder
    }

    /// Allow syscall number `nr`, optionally restricted by `arg_rule`.
    pub(crate) fn allow_syscall_nr(
        &mut self,
        nr: u32,
        arg_rule: Option<SyscallArgRule>,
    ) -> &mut Builder {
        self.allowlist.push(NumericSyscallRule { nr, arg_rule });
        self
    }

    /// Allow the syscall called `name`, optionally restricted by `arg_rule`.
    ///
    /// # Errors
    ///
    /// Fails if `name` is not a known syscall on the target architecture.
    pub(crate) fn allow_syscall_name(
        &mut self,
        name: &str,
        arg_rule: Option<SyscallArgRule>,
    ) -> Result<&mut Builder> {
        match translate_syscall(name) {
            Some(nr) => Ok(self.allow_syscall_nr(nr, arg_rule)),
            None => bail!("unknown system call {}", name),
        }
    }

    /// Log violations instead of killing the process (debugging aid).
    #[allow(unused)]
    pub(crate) fn log_only(&mut self) -> &mut Builder {
        self.log_only = true;
        self
    }

    /// Merge another builder's rules into this one.
    // NOTE(review): `&=` keeps log-only mode only if *both* builders have it
    // set; since `log_only` defaults to false, extending a freshly created
    // builder always clears it — confirm this precedence is intended.
    pub(crate) fn extend(&mut self, other: Builder) -> &mut Builder {
        self.allowlist.extend(other.allowlist);
        self.log_only &= other.log_only;
        self
    }

    /// Compile the collected rules into a BPF program: an architecture check,
    /// one block per rule, then a final deny (kill or log).
    pub(crate) fn build(mut self) -> AllowList {
        // Sort by syscall number and drop duplicate rules.
        self.allowlist.sort_unstable_by_key(|rule| rule.nr);
        self.allowlist.dedup();

        let mut filter = AllowList { list: vec![] };

        // Kill the process if the running architecture does not match the one
        // this filter was built for.
        load_arch_into_acc(&mut filter);
        jump_if_acc_is_equal(&mut filter, AUDIT_ARCH, SKIP_NEXT, EVAL_NEXT);
        filter
            .list
            .push(bpf_ret(nix::libc::SECCOMP_RET_KILL_PROCESS));

        // The accumulator holds the syscall number for the rule checks below.
        load_syscall_nr_into_acc(&mut filter);

        for rule in &self.allowlist {
            if let Some(arg_rule) = &rule.arg_rule {
                if let Some(values) = &arg_rule.values {
                    trace!("Adding seccomp argument block (nr={})", rule.nr);

                    // Block layout: 4 instructions to load the argument into
                    // scratch, 4 per compared value, 1 to return "allow".
                    // The total must fit into an 8 bit jump offset.
                    assert!(values.len() <= ((u8::MAX - 5) / 4) as usize);
                    let skip_if_no_match: u8 = (4 + 4 * values.len() + 1) as u8;

                    // Skip this whole block if the syscall number differs.
                    jump_if_acc_is_equal(&mut filter, rule.nr, EVAL_NEXT, skip_if_no_match);
                    let mut insts = 0;
                    insts += load_syscall_arg_into_scratch(&mut filter, arg_rule);
                    insts += jump_if_scratch_matches(&mut filter, values, EVAL_NEXT, SKIP_NEXT);
                    insts += return_success(&mut filter);
                    // The emitted count must match the announced skip offset.
                    assert_eq!(skip_if_no_match as u32, insts);
                    // The argument checks clobbered the accumulator — reload
                    // the syscall number for the next rule.
                    load_syscall_nr_into_acc(&mut filter);

                    trace!("Finished seccomp argument block (nr={})", rule.nr);
                }
                if let Some(mask) = arg_rule.mask {
                    trace!(
                        "Adding seccomp argument block (nr={}, mask={})",
                        rule.nr,
                        mask
                    );

                    // 4 instructions to load the argument, 6 for the masked
                    // compare, 1 to return "allow".
                    let skip_if_no_match: u8 = (4 + 6 + 1) as u8;

                    jump_if_acc_is_equal(&mut filter, rule.nr, EVAL_NEXT, skip_if_no_match);
                    let mut insts = 0;
                    insts += load_syscall_arg_into_scratch(&mut filter, arg_rule);
                    insts += jump_if_scratch_matches_mask(&mut filter, mask, EVAL_NEXT, SKIP_NEXT);
                    insts += return_success(&mut filter);
                    assert_eq!(skip_if_no_match as u32, insts);
                    load_syscall_nr_into_acc(&mut filter);

                    trace!(
                        "Finished seccomp arg. block (nr={}, mask={})",
                        rule.nr,
                        mask
                    );
                }
            } else {
                trace!("Adding seccomp syscall block (nr={})", rule.nr);

                // No argument rule: allow the syscall unconditionally.
                jump_if_acc_is_equal(&mut filter, rule.nr, EVAL_NEXT, SKIP_NEXT);
                return_success(&mut filter);
                trace!("Finished seccomp syscall block (nr={})", rule.nr);
            }
        }

        // No rule matched: kill the process (or only log the violation).
        return_fail(&mut filter, self.log_only);

        filter
    }
}
411
412fn translate_syscall(name: &str) -> Option<u32> {
414 SYSCALL_MAP.get(name).cloned()
415}
416
417fn load_arch_into_acc(filter: &mut AllowList) -> u32 {
419 filter.list.push(bpf_stmt(
420 BPF_LD | BPF_W | BPF_ABS,
421 memoffset::offset_of!(seccomp_data, arch) as u32,
422 ));
423 1
424}
425
426fn load_syscall_nr_into_acc(filter: &mut AllowList) -> u32 {
428 filter.list.push(bpf_stmt(
429 BPF_LD | BPF_W | BPF_ABS,
430 memoffset::offset_of!(seccomp_data, nr) as u32,
431 ));
432 1
433}
434
435fn load_syscall_arg_into_scratch(filter: &mut AllowList, arg_rule: &SyscallArgRule) -> u32 {
437 let mut insts = 0;
439 insts += load_arg_low_into_acc(filter, arg_rule);
440 insts += store_acc_in_scratch_low(filter);
441 insts += load_arg_high_into_acc(filter, arg_rule);
442 insts += store_acc_in_scratch_high(filter);
443 insts
444}
445
446fn load_arg_low_into_acc(filter: &mut AllowList, arg_rule: &SyscallArgRule) -> u32 {
448 filter.list.push(bpf_stmt(
449 BPF_LD | BPF_W | BPF_ABS,
450 arg_low_array_offset(arg_rule.index) as u32,
451 ));
452 1
453}
454
455fn load_arg_high_into_acc(filter: &mut AllowList, arg_rule: &SyscallArgRule) -> u32 {
457 filter.list.push(bpf_stmt(
458 BPF_LD | BPF_W | BPF_ABS,
459 arg_high_array_offset(arg_rule.index) as u32,
460 ));
461 1
462}
463
/// Size of one entry of `seccomp_data.args` (each argument is a `u64`).
const SECCOMP_DATA_ARGS_SIZE: usize = size_of::<u64>();
473
474fn arg_low_array_offset(index: usize) -> usize {
476 memoffset::offset_of!(seccomp_data, args) + (index * SECCOMP_DATA_ARGS_SIZE)
477}
478
479fn arg_high_array_offset(index: usize) -> usize {
481 memoffset::offset_of!(seccomp_data, args)
482 + (index * SECCOMP_DATA_ARGS_SIZE)
483 + (SECCOMP_DATA_ARGS_SIZE / 2)
484}
485
486fn _load_into_acc(filter: &mut AllowList, value: u32) -> u32 {
488 filter.list.push(bpf_stmt(BPF_LD | BPF_IMM, value));
489 1
490}
491
/// BPF scratch memory slots holding the 64 bit syscall argument under test:
/// slot 0 keeps the low 32 bits, slot 1 the high 32 bits.
const SCRATCH_LOW_INDEX: u32 = 0;
const SCRATCH_HIGH_INDEX: u32 = 1;
494
495fn load_scratch_low_into_acc(filter: &mut AllowList) -> u32 {
497 filter
498 .list
499 .push(bpf_stmt(BPF_LD | BPF_MEM, SCRATCH_LOW_INDEX));
500 1
501}
502
503fn load_scratch_high_into_acc(filter: &mut AllowList) -> u32 {
505 filter
506 .list
507 .push(bpf_stmt(BPF_LD | BPF_MEM, SCRATCH_HIGH_INDEX));
508 1
509}
510
511fn store_acc_in_scratch_low(filter: &mut AllowList) -> u32 {
513 filter.list.push(bpf_stmt(BPF_ST, SCRATCH_LOW_INDEX));
514 1
515}
516
517fn store_acc_in_scratch_high(filter: &mut AllowList) -> u32 {
519 filter.list.push(bpf_stmt(BPF_ST, SCRATCH_HIGH_INDEX));
520 1
521}
522
523fn jump_if_scratch_matches(
525 filter: &mut AllowList,
526 values: &[u64],
527 jump_true: u8,
528 jump_false: u8,
529) -> u32 {
530 assert!(values.len() <= u8::MAX as usize);
531 let mut insts = 0;
532
533 for (iteration, value) in values.iter().enumerate() {
534 const INSTS_PER_ITER: u8 = 4; assert!(values.len() > iteration);
538 let offset_adjust = INSTS_PER_ITER
539 .checked_mul((values.len() - iteration - 1) as u8)
540 .expect("BCP offset overflow");
541
542 let jump_true = jump_true + offset_adjust;
544 let jump_false = jump_false + offset_adjust;
545
546 let insts_before = insts;
548 insts += jump_if_scratch_is_equal(filter, *value, jump_true, jump_false);
549 assert_eq!(insts_before + INSTS_PER_ITER as u32, insts);
550 }
551 insts
552}
553
554fn jump_if_acc_is_equal(filter: &mut AllowList, value: u32, jump_true: u8, jump_false: u8) -> u32 {
556 filter.list.push(bpf_jump(
557 BPF_JMP | BPF_JEQ | BPF_K,
558 value,
559 jump_true,
560 jump_false,
561 ));
562 1
563}
564
/// Emit a check that the accumulator has no bits set outside `mask`
/// (i.e. `acc & !mask == 0`). Emits 2 instructions; note the AND destroys
/// the accumulator contents.
fn jump_if_acc_matches_mask(
    filter: &mut AllowList,
    mask: u32,
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    let mut insts = 0;
    // Clear the allowed bits; any remaining bit violates the mask.
    filter.list.push(bpf_and(!mask));
    insts += 1;
    insts += jump_if_acc_is_equal(filter, 0, jump_true, jump_false);
    insts
}
578
/// Emit a 64 bit equality check of the scratch slots against `value`.
/// Emits exactly 4 instructions (load low, compare, load high, compare);
/// `jump_true`/`jump_false` are relative to the end of these 4 instructions.
fn jump_if_scratch_is_equal(
    filter: &mut AllowList,
    value: u64,
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    // Split the value into the two 32 bit halves kept in scratch.
    let low: u32 = value as u32;
    let high: u32 = (value >> 32) as u32;
    let mut insts = 0;
    insts += load_scratch_low_into_acc(filter);
    // On a low-word mismatch, also skip the remaining 2 instructions of this
    // check (load high + compare) before applying `jump_false`.
    insts += jump_if_acc_is_equal(filter, low, EVAL_NEXT, jump_false + 2);
    insts += load_scratch_high_into_acc(filter);
    insts += jump_if_acc_is_equal(filter, high, jump_true, jump_false);
    insts
}
596
/// Emit a 64 bit mask check of the scratch slots: taken (`jump_true`) when
/// neither half has bits set outside `mask`. Emits 6 instructions (2 × 3);
/// jump targets are relative to the end of the emitted sequence.
fn jump_if_scratch_matches_mask(
    filter: &mut AllowList,
    mask: u64,
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    // `jump_if_acc_matches_mask` plus the preceding load is 3 instructions.
    const INSTS_PER_CHECK: u8 = 3;

    // Split the mask into the halves matching the two scratch slots.
    let low: u32 = mask as u32;
    let high: u32 = (mask >> 32) as u32;
    let mut insts = 0;
    let insts_before = insts;
    insts += load_scratch_low_into_acc(filter);
    // On a low-half violation, also skip the 3 instructions of the high check.
    insts += jump_if_acc_matches_mask(filter, low, EVAL_NEXT, jump_false + INSTS_PER_CHECK);
    assert_eq!(insts_before + INSTS_PER_CHECK as u32, insts);
    insts += load_scratch_high_into_acc(filter);
    insts += jump_if_acc_matches_mask(filter, high, jump_true, jump_false);
    assert_eq!(insts_before + 2 * INSTS_PER_CHECK as u32, insts);
    insts
}
619
620fn return_fail(filter: &mut AllowList, log_only: bool) -> u32 {
622 if log_only {
623 filter.list.push(bpf_ret(nix::libc::SECCOMP_RET_LOG));
624 } else {
625 filter
626 .list
627 .push(bpf_ret(nix::libc::SECCOMP_RET_KILL_PROCESS));
628 }
629 1
630}
631
632fn return_success(filter: &mut AllowList) -> u32 {
634 trace!("add_success");
635 filter.list.push(bpf_ret(nix::libc::SECCOMP_RET_ALLOW));
636 1
637}
638
/// Build an ALU negate instruction (unused helper).
fn _bpf_neg() -> SockFilter {
    trace!("bpf_neg");
    bpf_stmt(BPF_ALU | BPF_NEG, 0)
}
644
645fn bpf_and(k: u32) -> SockFilter {
647 trace!("bpf_and({})", k);
648 bpf_stmt(BPF_ALU | BPF_AND | BPF_K, k)
649}
650
651fn _bpf_or(k: u32) -> SockFilter {
653 trace!("bpf_or({})", k);
654 bpf_stmt(BPF_ALU | BPF_OR | BPF_K, k)
655}
656
657fn bpf_ret(k: u32) -> SockFilter {
659 trace!("bpf_ret({})", k);
660 bpf_stmt(BPF_RET | BPF_K, k)
661}
662
663fn bpf_stmt(code: u32, k: u32) -> SockFilter {
665 trace!("bpf_stmt({}, {})", code, k);
666 bpf_jump(code, k, 0, 0)
667}
668
669fn bpf_jump(code: u32, k: u32, jt: u8, jf: u8) -> SockFilter {
671 trace!("*bpf_jump({}, {}, {}, {})", code, k, jt, jf);
672 SockFilter {
673 code: code as u16,
674 k,
675 jt,
676 jf,
677 }
678}
679
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod test {
    use super::SockFilter;
    use proptest::prelude::*;

    proptest! {
        // Round trip: serializing a SockFilter to JSON and deserializing it
        // again yields an identical value (exercises the u64 bit packing).
        #[test]
        fn sock_filter_serialize_deserialize(a in 0..100, b in 0i32..10) {
            let filter = SockFilter {
                code: (a + b) as u16,
                jt: a as u8,
                jf: b as u8,
                k: (a * b) as u32,
            };
            let serialized = serde_json::to_string(&filter).unwrap();
            let deserialized: SockFilter = serde_json::from_str(&serialized).unwrap();
            prop_assert_eq!(filter, deserialized);
        }
    }
}