use crate::{
common::non_nul_string::NonNulString,
npk::manifest::capabilities::Capability,
seccomp::{profiles::default, Profile, SyscallArgRule, SyscallRule},
};
use anyhow::{bail, Result};
use bindings::{
seccomp_data, sock_filter, sock_fprog, BPF_ABS, BPF_ALU, BPF_AND, BPF_IMM, BPF_JEQ, BPF_JMP,
BPF_K, BPF_LD, BPF_MAXINSNS, BPF_MEM, BPF_NEG, BPF_OR, BPF_RET, BPF_ST, BPF_W, SYSCALL_MAP,
};
use log::trace;
use nix::errno::Errno;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::{
collections::{HashMap, HashSet},
mem::size_of,
};
#[allow(unused, non_snake_case, non_camel_case_types, non_upper_case_globals)]
mod bindings {
    // Generated at build time (see OUT_DIR): syscall numbers and
    // seccomp/BPF constants for the current target architecture.
    include!(concat!(env!("OUT_DIR"), "/syscall_bindings.rs"));
    include!(concat!(env!("OUT_DIR"), "/seccomp_bindings.rs"));
}
// Architecture tag checked by the generated BPF program: syscall numbers are
// architecture specific, so a filter must only run on the arch it was built for.
#[cfg(target_arch = "aarch64")]
const AUDIT_ARCH: u32 = bindings::AUDIT_ARCH_AARCH64;
#[cfg(target_arch = "x86_64")]
const AUDIT_ARCH: u32 = bindings::AUDIT_ARCH_X86_64;
// Syscalls that are always allowed. `execve` is needed to start the container
// init after the filter is installed.
const REQUIRED_SYSCALLS: &[u32] = &[bindings::SYS_execve];
// Relative BPF jump offsets: continue with the next instruction / skip one.
const EVAL_NEXT: u8 = 0;
const SKIP_NEXT: u8 = 1;
/// Build a seccomp [`AllowList`] from an optional predefined profile, optional
/// explicit per-syscall rules and the set of capabilities of the container.
///
/// Profile derived rules and explicit rules are merged; the result is compiled
/// into a cBPF program.
pub fn seccomp_filter(
    profile: Option<&Profile>,
    rules: Option<&HashMap<NonNulString, SyscallRule>>,
    caps: &HashSet<Capability>,
) -> AllowList {
    check_platform_requirements();

    let mut builder = Builder::new();
    // Profile first…
    if let Some(p) = profile {
        builder.extend(builder_from_profile(p, caps));
    }
    // …then the manifest's explicit syscall rules on top.
    if let Some(r) = rules {
        builder.extend(builder_from_rules(r));
    }
    builder.build()
}
/// Create a [`Builder`] from explicit syscall rules.
///
/// Syscall names that cannot be resolved on the current architecture are
/// skipped (logged at trace level) instead of failing the whole filter.
pub(crate) fn builder_from_rules(rules: &HashMap<NonNulString, SyscallRule>) -> Builder {
    let mut builder = Builder::new();
    for (name, call_rule) in rules {
        let arg_rule = match call_rule {
            SyscallRule::Any => None,
            SyscallRule::Args(args) => Some(args.clone()),
        };
        if let Err(e) = builder.allow_syscall_name(name, arg_rule) {
            // `name` implements Display; the previous `&name.to_string()`
            // allocated a temporary String for no benefit.
            trace!("failed to allow syscall {}: {}", name, e);
        }
    }
    builder
}
/// Create a [`Builder`] from a predefined profile, widened by the syscalls the
/// given capabilities require.
fn builder_from_profile(profile: &Profile, caps: &HashSet<Capability>) -> Builder {
    match profile {
        Profile::Default => {
            let mut builder = default::BASE.clone();

            // Without any capability the base profile is used unchanged.
            if caps.is_empty() {
                return builder;
            }

            let cap_sys_admin = caps.contains(&Capability::CAP_SYS_ADMIN);

            for cap in caps {
                // Map each capability to the additional allowlist it needs;
                // most capabilities require no syscalls beyond BASE.
                let extra = match cap {
                    Capability::CAP_DAC_READ_SEARCH => Some(default::CAP_DAC_READ_SEARCH.clone()),
                    Capability::CAP_SYS_MODULE => Some(default::CAP_SYS_MODULE.clone()),
                    Capability::CAP_SYS_RAWIO => Some(default::CAP_SYS_RAWIO.clone()),
                    Capability::CAP_SYS_CHROOT => Some(default::CAP_SYS_CHROOT.clone()),
                    Capability::CAP_SYS_PTRACE => Some(default::CAP_SYS_PTRACE.clone()),
                    Capability::CAP_SYS_PACCT => Some(default::CAP_SYS_PACCT.clone()),
                    Capability::CAP_SYS_ADMIN => Some(default::CAP_SYS_ADMIN.clone()),
                    Capability::CAP_SYS_BOOT => Some(default::CAP_SYS_BOOT.clone()),
                    Capability::CAP_SYS_NICE => Some(default::CAP_SYS_NICE.clone()),
                    Capability::CAP_SYS_TIME => Some(default::CAP_SYS_TIME.clone()),
                    Capability::CAP_SYS_TTY_CONFIG => Some(default::CAP_SYS_TTY_CONFIG.clone()),
                    Capability::CAP_SYSLOG => Some(default::CAP_SYSLOG.clone()),
                    Capability::CAP_CHOWN
                    | Capability::CAP_DAC_OVERRIDE
                    | Capability::CAP_FOWNER
                    | Capability::CAP_FSETID
                    | Capability::CAP_KILL
                    | Capability::CAP_SETGID
                    | Capability::CAP_SETUID
                    | Capability::CAP_SETPCAP
                    | Capability::CAP_LINUX_IMMUTABLE
                    | Capability::CAP_NET_BIND_SERVICE
                    | Capability::CAP_NET_BROADCAST
                    | Capability::CAP_NET_ADMIN
                    | Capability::CAP_NET_RAW
                    | Capability::CAP_IPC_LOCK
                    | Capability::CAP_IPC_OWNER
                    | Capability::CAP_SYS_RESOURCE
                    | Capability::CAP_MKNOD
                    | Capability::CAP_LEASE
                    | Capability::CAP_AUDIT_WRITE
                    | Capability::CAP_AUDIT_CONTROL
                    | Capability::CAP_SETFCAP
                    | Capability::CAP_MAC_OVERRIDE
                    | Capability::CAP_MAC_ADMIN
                    | Capability::CAP_WAKE_ALARM
                    | Capability::CAP_BLOCK_SUSPEND
                    | Capability::CAP_AUDIT_READ
                    | Capability::CAP_PERFMON
                    | Capability::CAP_BPF
                    | Capability::CAP_CHECKPOINT_RESTORE => None,
                };
                if let Some(extra) = extra {
                    builder.extend(extra);
                }
            }

            // Without CAP_SYS_ADMIN an extra (restricted) set applies.
            if !cap_sys_admin {
                builder.extend(default::NON_CAP_SYS_ADMIN.clone());
            }

            builder
        }
    }
}
/// Compile time guards for platform assumptions baked into the generated BPF.
fn check_platform_requirements() {
    // Syscall numbers and AUDIT_ARCH are only generated for these targets.
    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
    compile_error!("seccomp is only supported on aarch64 and x86_64");
    // The argument matching code assumes 64 bit syscall arguments split into
    // two 32 bit words.
    #[cfg(target_pointer_width = "32")]
    compile_error!("seccomp is not supported on 32 Bit architectures");
    // The low/high word offsets assume little endian layout of seccomp_data.
    #[cfg(target_endian = "big")]
    compile_error!("seccomp is not supported on Big Endian architectures");
}
/// One cBPF instruction, mirroring the kernel's `sock_filter` struct but with
/// serde support (serialized as a single packed `u64`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SockFilter {
    // Opcode (BPF_LD, BPF_JMP, ... combinations).
    pub code: u16,
    // Relative jump offset when the condition is true.
    pub jt: u8,
    // Relative jump offset when the condition is false.
    pub jf: u8,
    // Generic operand (immediate value, offset or scratch index).
    pub k: u32,
}
impl Serialize for SockFilter {
    /// Pack the instruction into one `u64`:
    /// bits 63..48 = `code`, 47..40 = `jt`, 39..32 = `jf`, 31..0 = `k`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let packed = (self.code as u64) << 48
            | (self.jt as u64) << 40
            | (self.jf as u64) << 32
            | self.k as u64;
        serializer.serialize_u64(packed)
    }
}
impl<'de> Deserialize<'de> for SockFilter {
    /// Inverse of `Serialize`: unpack the fields from the packed `u64`
    /// (code in bits 63..48, jt 47..40, jf 39..32, k 31..0).
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let packed = u64::deserialize(deserializer)?;
        Ok(SockFilter {
            code: (packed >> 48) as u16,
            jt: (packed >> 40) as u8,
            jf: (packed >> 32) as u8,
            k: packed as u32,
        })
    }
}
impl From<&SockFilter> for sock_filter {
fn from(s: &SockFilter) -> sock_filter {
sock_filter {
code: s.code,
jt: s.jt,
jf: s.jf,
k: s.k,
}
}
}
/// A compiled seccomp allowlist: the cBPF program as a list of instructions.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct AllowList {
    // Program instructions in execution order.
    list: Vec<SockFilter>,
}
impl AllowList {
    /// Install this filter for the calling thread via
    /// `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)`.
    ///
    /// # Errors
    /// Fails if the program exceeds `BPF_MAXINSNS` or if the kernel rejects
    /// the prctl call (errno).
    pub fn apply(&self) -> Result<()> {
        // Bionic (Android libc) does not export these constants; the values
        // match the Linux UAPI.
        #[cfg(target_os = "android")]
        const PR_SET_SECCOMP: nix::libc::c_int = 22;
        #[cfg(target_os = "android")]
        const SECCOMP_MODE_FILTER: nix::libc::c_int = 2;
        #[cfg(not(target_os = "android"))]
        use nix::libc::{PR_SET_SECCOMP, SECCOMP_MODE_FILTER};
        // The kernel refuses programs longer than BPF_MAXINSNS anyway; fail
        // early with a readable error.
        if self.list.len() > BPF_MAXINSNS as usize {
            bail!("seccomp filter list exceeds maximum number of BPF statements");
        }
        // Convert into the kernel ABI representation.
        let list = self
            .list
            .iter()
            .map(Into::into)
            .collect::<Vec<sock_filter>>();
        let sf_prog = sock_fprog {
            len: list.len() as u16,
            filter: list.as_ptr() as *mut bindings::sock_filter,
        };
        let sf_prog_ptr = &sf_prog as *const sock_fprog;
        // SAFETY (review note): `sf_prog` and the backing `list` are alive for
        // the duration of the call; the kernel copies the program during prctl.
        let result = unsafe { nix::libc::prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, sf_prog_ptr) };
        Errno::result(result)?;
        Ok(())
    }
}
/// A resolved allowlist entry: syscall number plus an optional argument rule.
#[derive(Clone, Eq, PartialEq, Debug)]
struct NumericSyscallRule {
    // Syscall number on the current target architecture.
    nr: u32,
    // If `None`, the syscall is allowed unconditionally.
    arg_rule: Option<SyscallArgRule>,
}
/// Accumulates syscall rules and compiles them into an [`AllowList`].
#[derive(Default, Clone)]
pub struct Builder {
    // Collected rules; sorted and deduplicated in `build`.
    allowlist: Vec<NumericSyscallRule>,
    // If set, violations are logged instead of killing the process.
    log_only: bool,
}
impl Builder {
    /// Create a builder preseeded with `REQUIRED_SYSCALLS` (currently only
    /// `execve`, needed to start the container).
    pub fn new() -> Self {
        let mut builder: Builder = Default::default();
        for syscall in REQUIRED_SYSCALLS {
            builder.allow_syscall_nr(*syscall, None);
        }
        builder
    }

    /// Allow syscall number `nr`, optionally restricted by `arg_rule`.
    pub(crate) fn allow_syscall_nr(
        &mut self,
        nr: u32,
        arg_rule: Option<SyscallArgRule>,
    ) -> &mut Builder {
        self.allowlist.push(NumericSyscallRule { nr, arg_rule });
        self
    }

    /// Allow syscall `name`. Fails if the name is unknown on this target
    /// architecture.
    pub(crate) fn allow_syscall_name(
        &mut self,
        name: &str,
        arg_rule: Option<SyscallArgRule>,
    ) -> Result<&mut Builder> {
        match translate_syscall(name) {
            Some(nr) => Ok(self.allow_syscall_nr(nr, arg_rule)),
            None => bail!("unknown system call {}", name),
        }
    }

    /// Debugging aid: log filter violations instead of killing the process.
    #[allow(unused)]
    pub(crate) fn log_only(&mut self) -> &mut Builder {
        self.log_only = true;
        self
    }

    /// Merge `other`'s rules into `self`.
    pub(crate) fn extend(&mut self, other: Builder) -> &mut Builder {
        self.allowlist.extend(other.allowlist);
        // NOTE(review): `&=` keeps `log_only` set only if *both* builders set
        // it. Since `Builder::new()` starts with `false`, the flag of `other`
        // can never survive an `extend`. If the intent is "log if any part
        // requests logging", this should be `|=` — confirm with the callers.
        self.log_only &= other.log_only;
        self
    }

    /// Compile the collected rules into a cBPF seccomp program.
    ///
    /// Program layout: arch check, then per rule either an unconditional
    /// allow block or argument-matching blocks, finally the fail action.
    pub(crate) fn build(mut self) -> AllowList {
        // Sort and deduplicate so identical rules emit only one block.
        self.allowlist.sort_unstable_by_key(|rule| rule.nr);
        self.allowlist.dedup();
        let mut filter = AllowList { list: vec![] };
        // Syscall numbers are arch specific: kill if the program runs on a
        // different architecture than it was built for.
        load_arch_into_acc(&mut filter);
        jump_if_acc_is_equal(&mut filter, AUDIT_ARCH, SKIP_NEXT, EVAL_NEXT);
        filter
            .list
            .push(bpf_ret(nix::libc::SECCOMP_RET_KILL_PROCESS));
        // Accumulator holds the syscall number for the rule comparisons below.
        load_syscall_nr_into_acc(&mut filter);
        for rule in &self.allowlist {
            if let Some(arg_rule) = &rule.arg_rule {
                if let Some(values) = &arg_rule.values {
                    trace!("Adding seccomp argument block (nr={})", rule.nr);
                    // Block size: 4 insts to load the argument into scratch,
                    // 4 per compared value, 1 for the allow return. The skip
                    // offset must fit into the 8 bit BPF jump field.
                    assert!(values.len() <= ((u8::MAX - 5) / 4) as usize);
                    let skip_if_no_match: u8 = (4 + 4 * values.len() + 1) as u8;
                    jump_if_acc_is_equal(&mut filter, rule.nr, EVAL_NEXT, skip_if_no_match);
                    let mut insts = 0;
                    insts += load_syscall_arg_into_scratch(&mut filter, arg_rule);
                    insts += jump_if_scratch_matches(&mut filter, values, EVAL_NEXT, SKIP_NEXT);
                    insts += return_success(&mut filter);
                    // Verify the precomputed skip offset matches what was emitted.
                    assert_eq!(skip_if_no_match as u32, insts);
                    // Restore the syscall number clobbered by the arg load.
                    load_syscall_nr_into_acc(&mut filter);
                    trace!("Finished seccomp argument block (nr={})", rule.nr);
                }
                if let Some(mask) = arg_rule.mask {
                    trace!(
                        "Adding seccomp argument block (nr={}, mask={})",
                        rule.nr,
                        mask
                    );
                    // 4 insts arg load + 6 insts mask check + 1 allow return.
                    let skip_if_no_match: u8 = (4 + 6 + 1) as u8;
                    jump_if_acc_is_equal(&mut filter, rule.nr, EVAL_NEXT, skip_if_no_match);
                    let mut insts = 0;
                    insts += load_syscall_arg_into_scratch(&mut filter, arg_rule);
                    insts += jump_if_scratch_matches_mask(&mut filter, mask, EVAL_NEXT, SKIP_NEXT);
                    insts += return_success(&mut filter);
                    assert_eq!(skip_if_no_match as u32, insts);
                    load_syscall_nr_into_acc(&mut filter);
                    trace!(
                        "Finished seccomp arg. block (nr={}, mask={})",
                        rule.nr,
                        mask
                    );
                }
            } else {
                // No argument rule: allow the syscall unconditionally.
                trace!("Adding seccomp syscall block (nr={})", rule.nr);
                jump_if_acc_is_equal(&mut filter, rule.nr, EVAL_NEXT, SKIP_NEXT);
                return_success(&mut filter);
                trace!("Finished seccomp syscall block (nr={})", rule.nr);
            }
        }
        // No rule matched: log or kill depending on `log_only`.
        return_fail(&mut filter, self.log_only);
        filter
    }
}
/// Resolve a syscall name to its number on the current target architecture.
fn translate_syscall(name: &str) -> Option<u32> {
    // `copied` instead of `cloned`: the value is a plain `u32`.
    SYSCALL_MAP.get(name).copied()
}
/// Emit `A := seccomp_data.arch`. Returns the number of emitted instructions.
fn load_arch_into_acc(filter: &mut AllowList) -> u32 {
    let arch_offset = memoffset::offset_of!(seccomp_data, arch) as u32;
    filter
        .list
        .push(bpf_stmt(BPF_LD | BPF_W | BPF_ABS, arch_offset));
    1
}
/// Emit `A := seccomp_data.nr`. Returns the number of emitted instructions.
fn load_syscall_nr_into_acc(filter: &mut AllowList) -> u32 {
    let nr_offset = memoffset::offset_of!(seccomp_data, nr) as u32;
    filter
        .list
        .push(bpf_stmt(BPF_LD | BPF_W | BPF_ABS, nr_offset));
    1
}
/// Copy both 32 bit halves of the 64 bit syscall argument selected by
/// `arg_rule.index` into the scratch slots. Returns the instruction count.
fn load_syscall_arg_into_scratch(filter: &mut AllowList, arg_rule: &SyscallArgRule) -> u32 {
    load_arg_low_into_acc(filter, arg_rule)
        + store_acc_in_scratch_low(filter)
        + load_arg_high_into_acc(filter, arg_rule)
        + store_acc_in_scratch_high(filter)
}
/// Emit `A := low word of seccomp_data.args[index]`.
fn load_arg_low_into_acc(filter: &mut AllowList, arg_rule: &SyscallArgRule) -> u32 {
    let offset = arg_low_array_offset(arg_rule.index) as u32;
    filter.list.push(bpf_stmt(BPF_LD | BPF_W | BPF_ABS, offset));
    1
}
/// Emit `A := high word of seccomp_data.args[index]`.
fn load_arg_high_into_acc(filter: &mut AllowList, arg_rule: &SyscallArgRule) -> u32 {
    let offset = arg_high_array_offset(arg_rule.index) as u32;
    filter.list.push(bpf_stmt(BPF_LD | BPF_W | BPF_ABS, offset));
    1
}
// Each entry of `seccomp_data.args` is a 64 bit value.
const SECCOMP_DATA_ARGS_SIZE: usize = size_of::<u64>();
/// Byte offset of the low 32 bit word of `seccomp_data.args[index]`
/// (the low word comes first on little endian targets — big endian is
/// rejected in `check_platform_requirements`).
fn arg_low_array_offset(index: usize) -> usize {
    let args_base = memoffset::offset_of!(seccomp_data, args);
    args_base + index * SECCOMP_DATA_ARGS_SIZE
}
fn arg_high_array_offset(index: usize) -> usize {
memoffset::offset_of!(seccomp_data, args)
+ (index * SECCOMP_DATA_ARGS_SIZE)
+ (SECCOMP_DATA_ARGS_SIZE / 2)
}
/// Emit `A := value` (immediate load). Currently unused.
fn _load_into_acc(filter: &mut AllowList, value: u32) -> u32 {
    let stmt = bpf_stmt(BPF_LD | BPF_IMM, value);
    filter.list.push(stmt);
    1
}
// BPF scratch memory slots holding the low/high word of a syscall argument.
const SCRATCH_LOW_INDEX: u32 = 0;
const SCRATCH_HIGH_INDEX: u32 = 1;
/// Emit `A := M[SCRATCH_LOW_INDEX]`.
fn load_scratch_low_into_acc(filter: &mut AllowList) -> u32 {
    let stmt = bpf_stmt(BPF_LD | BPF_MEM, SCRATCH_LOW_INDEX);
    filter.list.push(stmt);
    1
}
/// Emit `A := M[SCRATCH_HIGH_INDEX]`.
fn load_scratch_high_into_acc(filter: &mut AllowList) -> u32 {
    let stmt = bpf_stmt(BPF_LD | BPF_MEM, SCRATCH_HIGH_INDEX);
    filter.list.push(stmt);
    1
}
/// Emit `M[SCRATCH_LOW_INDEX] := A`.
fn store_acc_in_scratch_low(filter: &mut AllowList) -> u32 {
    let stmt = bpf_stmt(BPF_ST, SCRATCH_LOW_INDEX);
    filter.list.push(stmt);
    1
}
/// Emit `M[SCRATCH_HIGH_INDEX] := A`.
fn store_acc_in_scratch_high(filter: &mut AllowList) -> u32 {
    let stmt = bpf_stmt(BPF_ST, SCRATCH_HIGH_INDEX);
    filter.list.push(stmt);
    1
}
/// Emit a 64 bit equality check of the scratch value against each of `values`.
/// The `jump_true`/`jump_false` targets of earlier checks are adjusted to skip
/// over the checks that still follow. Returns the instruction count.
fn jump_if_scratch_matches(
    filter: &mut AllowList,
    values: &[u64],
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    assert!(values.len() <= u8::MAX as usize);
    let mut insts = 0;
    for (iteration, value) in values.iter().enumerate() {
        // Each 64 bit comparison emits exactly 4 instructions.
        const INSTS_PER_ITER: u8 = 4;
        assert!(values.len() > iteration);
        let offset_adjust = INSTS_PER_ITER
            .checked_mul((values.len() - iteration - 1) as u8)
            // Fixed typo in the panic message: "BCP" -> "BPF".
            .expect("BPF offset overflow");
        let jump_true = jump_true + offset_adjust;
        let jump_false = jump_false + offset_adjust;
        let insts_before = insts;
        insts += jump_if_scratch_is_equal(filter, *value, jump_true, jump_false);
        assert_eq!(insts_before + INSTS_PER_ITER as u32, insts);
    }
    insts
}
/// Emit `if A == value then jump(jump_true) else jump(jump_false)`.
fn jump_if_acc_is_equal(filter: &mut AllowList, value: u32, jump_true: u8, jump_false: u8) -> u32 {
    let jump = bpf_jump(BPF_JMP | BPF_JEQ | BPF_K, value, jump_true, jump_false);
    filter.list.push(jump);
    1
}
/// Emit a check that `A` has no bit outside `mask` set:
/// `A &= !mask` — any remaining bit means a bit outside the mask was set.
fn jump_if_acc_matches_mask(
    filter: &mut AllowList,
    mask: u32,
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    filter.list.push(bpf_and(!mask));
    1 + jump_if_acc_is_equal(filter, 0, jump_true, jump_false)
}
/// Compare the 64 bit scratch value against `value` using two 32 bit
/// comparisons. Emits exactly 4 instructions.
fn jump_if_scratch_is_equal(
    filter: &mut AllowList,
    value: u64,
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    let low: u32 = value as u32;
    let high: u32 = (value >> 32) as u32;
    let mut insts = 0;
    insts += load_scratch_low_into_acc(filter);
    // On a low-word mismatch the remaining two instructions of this check
    // (high-word load + compare) must be skipped as well, hence `+ 2`.
    insts += jump_if_acc_is_equal(filter, low, EVAL_NEXT, jump_false + 2);
    insts += load_scratch_high_into_acc(filter);
    insts += jump_if_acc_is_equal(filter, high, jump_true, jump_false);
    insts
}
/// Check that the 64 bit scratch value has no bit outside `mask` set, using
/// two 32 bit mask checks (3 instructions each, 6 total).
fn jump_if_scratch_matches_mask(
    filter: &mut AllowList,
    mask: u64,
    jump_true: u8,
    jump_false: u8,
) -> u32 {
    const INSTS_PER_CHECK: u8 = 3;
    let low: u32 = mask as u32;
    let high: u32 = (mask >> 32) as u32;
    let mut insts = 0;
    let insts_before = insts;
    insts += load_scratch_low_into_acc(filter);
    // On a low-word failure the whole high-word check must be skipped, so the
    // false target is offset by one full check.
    insts += jump_if_acc_matches_mask(filter, low, EVAL_NEXT, jump_false + INSTS_PER_CHECK);
    assert_eq!(insts_before + INSTS_PER_CHECK as u32, insts);
    insts += load_scratch_high_into_acc(filter);
    insts += jump_if_acc_matches_mask(filter, high, jump_true, jump_false);
    assert_eq!(insts_before + 2 * INSTS_PER_CHECK as u32, insts);
    insts
}
/// Emit the filter's fail action: log the violation when `log_only` is set,
/// otherwise kill the process.
fn return_fail(filter: &mut AllowList, log_only: bool) -> u32 {
    let action = if log_only {
        nix::libc::SECCOMP_RET_LOG
    } else {
        nix::libc::SECCOMP_RET_KILL_PROCESS
    };
    filter.list.push(bpf_ret(action));
    1
}
/// Emit `return SECCOMP_RET_ALLOW`.
fn return_success(filter: &mut AllowList) -> u32 {
    trace!("add_success");
    let ret = bpf_ret(nix::libc::SECCOMP_RET_ALLOW);
    filter.list.push(ret);
    1
}
/// `A := -A`. Currently unused; kept for completeness.
fn _bpf_neg() -> SockFilter {
    trace!("bpf_neg");
    let code = BPF_ALU | BPF_NEG;
    bpf_stmt(code, 0)
}
/// `A := A & k`.
fn bpf_and(k: u32) -> SockFilter {
    trace!("bpf_and({})", k);
    let code = BPF_ALU | BPF_AND | BPF_K;
    bpf_stmt(code, k)
}
/// `A := A | k`. Currently unused; kept for completeness.
fn _bpf_or(k: u32) -> SockFilter {
    trace!("bpf_or({})", k);
    let code = BPF_ALU | BPF_OR | BPF_K;
    bpf_stmt(code, k)
}
/// Terminate the program returning the seccomp action `k`.
fn bpf_ret(k: u32) -> SockFilter {
    trace!("bpf_ret({})", k);
    let code = BPF_RET | BPF_K;
    bpf_stmt(code, k)
}
/// A non-branching instruction: a jump with both offsets zero.
fn bpf_stmt(code: u32, k: u32) -> SockFilter {
    trace!("bpf_stmt({}, {})", code, k);
    bpf_jump(code, k, 0, 0)
}
/// Construct a raw instruction with jump offsets.
fn bpf_jump(code: u32, k: u32, jt: u8, jf: u8) -> SockFilter {
    trace!("*bpf_jump({}, {}, {}, {})", code, k, jt, jf);
    SockFilter {
        // BPF opcodes fit into 16 bit; the bindings expose them as u32.
        code: code as u16,
        jt,
        jf,
        k,
    }
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod test {
    use super::SockFilter;
    use proptest::prelude::*;
    proptest! {
        // Round trip: serializing a SockFilter to JSON (one packed u64) and
        // deserializing it again must yield the original instruction.
        #[test]
        fn sock_filter_serialize_deserialize(a in 0..100, b in 0i32..10) {
            let filter = SockFilter {
                code: (a + b) as u16,
                jt: a as u8,
                jf: b as u8,
                k: (a * b) as u32,
            };
            let serialized = serde_json::to_string(&filter).unwrap();
            let deserialized: SockFilter = serde_json::from_str(&serialized).unwrap();
            prop_assert_eq!(filter, deserialized);
        }
    }
}