Skip to main content

libcontainer/seccomp/
mod.rs

1use std::collections::HashSet;
2use std::num::TryFromIntError;
3use std::os::unix::io;
4
5use libseccomp::{
6    ScmpAction, ScmpArch, ScmpArgCompare, ScmpCompareOp, ScmpFilterContext, ScmpSyscall,
7};
8use oci_spec::runtime::{
9    Arch, LinuxSeccomp, LinuxSeccompAction, LinuxSeccompFilterFlag, LinuxSeccompOperator,
10};
11
12#[derive(Debug, thiserror::Error)]
13pub enum SeccompError {
14    #[error("failed to translate trace action due to failed to convert errno {errno} into i16")]
15    TraceAction { source: TryFromIntError, errno: i32 },
16    #[error("SCMP_ACT_NOTIFY cannot be used as default action")]
17    NotifyAsDefaultAction,
18    #[error("SCMP_ACT_NOTIFY cannot be used for the write syscall")]
19    NotifyWriteSyscall,
20    #[error("failed to add arch to seccomp")]
21    AddArch {
22        source: libseccomp::error::SeccompError,
23        arch: Arch,
24    },
25    #[error("failed to load seccomp context")]
26    LoadContext {
27        source: libseccomp::error::SeccompError,
28    },
29    #[error("failed to get seccomp notify id")]
30    GetNotifyId {
31        source: libseccomp::error::SeccompError,
32    },
33    #[error("failed to add rule to seccomp")]
34    AddRule {
35        source: libseccomp::error::SeccompError,
36    },
37    #[error("failed to create new seccomp filter")]
38    NewFilter {
39        source: libseccomp::error::SeccompError,
40        default: LinuxSeccompAction,
41    },
42    #[error("failed to set filter flag")]
43    SetFilterFlag {
44        source: libseccomp::error::SeccompError,
45        flag: LinuxSeccompFilterFlag,
46    },
47    #[error("failed to set SCMP_FLTATR_CTL_NNP")]
48    SetCtlNnp {
49        source: libseccomp::error::SeccompError,
50    },
51}
52
53type Result<T> = std::result::Result<T, SeccompError>;
54
55fn translate_arch(arch: Arch) -> ScmpArch {
56    match arch {
57        Arch::ScmpArchNative => ScmpArch::Native,
58        Arch::ScmpArchX86 => ScmpArch::X86,
59        Arch::ScmpArchX86_64 => ScmpArch::X8664,
60        Arch::ScmpArchX32 => ScmpArch::X32,
61        Arch::ScmpArchArm => ScmpArch::Arm,
62        Arch::ScmpArchAarch64 => ScmpArch::Aarch64,
63        Arch::ScmpArchMips => ScmpArch::Mips,
64        Arch::ScmpArchMips64 => ScmpArch::Mips64,
65        Arch::ScmpArchMips64n32 => ScmpArch::Mips64N32,
66        Arch::ScmpArchMipsel => ScmpArch::Mipsel,
67        Arch::ScmpArchMipsel64 => ScmpArch::Mipsel64,
68        Arch::ScmpArchMipsel64n32 => ScmpArch::Mipsel64N32,
69        Arch::ScmpArchPpc => ScmpArch::Ppc,
70        Arch::ScmpArchPpc64 => ScmpArch::Ppc64,
71        Arch::ScmpArchPpc64le => ScmpArch::Ppc64Le,
72        Arch::ScmpArchS390 => ScmpArch::S390,
73        Arch::ScmpArchS390x => ScmpArch::S390X,
74        Arch::ScmpArchRiscv64 => ScmpArch::Riscv64,
75    }
76}
77
78fn translate_action(action: LinuxSeccompAction, errno: Option<u32>) -> Result<ScmpAction> {
79    tracing::trace!(?action, ?errno, "translating action");
80    let errno = errno.map(|e| e as i32).unwrap_or(libc::EPERM);
81    let action = match action {
82        LinuxSeccompAction::ScmpActKill => ScmpAction::KillThread,
83        LinuxSeccompAction::ScmpActTrap => ScmpAction::Trap,
84        LinuxSeccompAction::ScmpActErrno => ScmpAction::Errno(errno),
85        LinuxSeccompAction::ScmpActTrace => ScmpAction::Trace(
86            errno
87                .try_into()
88                .map_err(|err| SeccompError::TraceAction { source: err, errno })?,
89        ),
90        LinuxSeccompAction::ScmpActAllow => ScmpAction::Allow,
91        LinuxSeccompAction::ScmpActKillProcess => ScmpAction::KillProcess,
92        LinuxSeccompAction::ScmpActNotify => ScmpAction::Notify,
93        LinuxSeccompAction::ScmpActLog => ScmpAction::Log,
94        LinuxSeccompAction::ScmpActKillThread => ScmpAction::KillThread,
95    };
96
97    tracing::trace!(?action, "translated action");
98    Ok(action)
99}
100
101fn translate_op(op: LinuxSeccompOperator, datum_b: Option<u64>) -> ScmpCompareOp {
102    match op {
103        LinuxSeccompOperator::ScmpCmpNe => ScmpCompareOp::NotEqual,
104        LinuxSeccompOperator::ScmpCmpLt => ScmpCompareOp::Less,
105        LinuxSeccompOperator::ScmpCmpLe => ScmpCompareOp::LessOrEqual,
106        LinuxSeccompOperator::ScmpCmpEq => ScmpCompareOp::Equal,
107        LinuxSeccompOperator::ScmpCmpGe => ScmpCompareOp::GreaterEqual,
108        LinuxSeccompOperator::ScmpCmpGt => ScmpCompareOp::Greater,
109        LinuxSeccompOperator::ScmpCmpMaskedEq => ScmpCompareOp::MaskedEqual(datum_b.unwrap_or(0)),
110    }
111}
112
113fn check_seccomp(seccomp: &LinuxSeccomp) -> Result<()> {
114    // We don't support notify as default action. After the seccomp filter is
115    // created with notify, the container process will have to communicate the
116    // returned fd to another process. Therefore, we need the write syscall or
117    // otherwise, the write syscall will be block by the seccomp filter causing
118    // the container process to hang. `runc` also disallow notify as default
119    // action.
120    // Note: read and close syscall are also used, because if we can
121    // successfully write fd to another process, the other process can choose to
122    // handle read/close syscall and allow read and close to proceed as
123    // expected.
124    if seccomp.default_action() == LinuxSeccompAction::ScmpActNotify {
125        return Err(SeccompError::NotifyAsDefaultAction);
126    }
127
128    if let Some(syscalls) = seccomp.syscalls() {
129        for syscall in syscalls {
130            if syscall.action() == LinuxSeccompAction::ScmpActNotify {
131                for name in syscall.names() {
132                    if name == "write" {
133                        return Err(SeccompError::NotifyWriteSyscall);
134                    }
135                }
136            }
137        }
138    }
139
140    Ok(())
141}
142
143#[tracing::instrument(level = "trace", skip(seccomp))]
144pub fn initialize_seccomp(seccomp: &LinuxSeccomp) -> Result<Option<io::RawFd>> {
145    check_seccomp(seccomp)?;
146
147    tracing::trace!(default_action = ?seccomp.default_action(), errno = ?seccomp.default_errno_ret(), "initializing seccomp");
148    let default_action = translate_action(seccomp.default_action(), seccomp.default_errno_ret())?;
149    let mut ctx =
150        ScmpFilterContext::new(default_action).map_err(|err| SeccompError::NewFilter {
151            source: err,
152            default: seccomp.default_action(),
153        })?;
154
155    if let Some(flags) = seccomp.flags() {
156        for flag in flags {
157            match flag {
158                LinuxSeccompFilterFlag::SeccompFilterFlagLog => ctx.set_ctl_log(true),
159                LinuxSeccompFilterFlag::SeccompFilterFlagTsync => ctx.set_ctl_tsync(true),
160                LinuxSeccompFilterFlag::SeccompFilterFlagSpecAllow => ctx.set_ctl_ssb(true),
161                LinuxSeccompFilterFlag::SeccompFilterFlagWaitKillableRecv => {
162                    ctx.set_ctl_waitkill(true)
163                }
164            }
165            .map_err(|err| SeccompError::SetFilterFlag {
166                source: err,
167                flag: *flag,
168            })?;
169        }
170    }
171
172    if let Some(architectures) = seccomp.architectures() {
173        for &arch in architectures {
174            tracing::trace!(?arch, "adding architecture");
175            ctx.add_arch(translate_arch(arch))
176                .map_err(|err| SeccompError::AddArch { source: err, arch })?;
177        }
178    }
179
180    // The SCMP_FLTATR_CTL_NNP controls if the seccomp load function will set
181    // the new privilege bit automatically in prctl. Normally this is a good
182    // thing, but for us we need better control. Based on the spec, if OCI
183    // runtime spec doesn't set the no new privileges in Process, we should not
184    // set it here.  If the seccomp load operation fails without enough
185    // privilege, so be it. To prevent this automatic behavior, we unset the
186    // value here.
187    ctx.set_ctl_nnp(false)
188        .map_err(|err| SeccompError::SetCtlNnp { source: err })?;
189
190    if let Some(syscalls) = seccomp.syscalls() {
191        for syscall in syscalls {
192            let action = translate_action(syscall.action(), syscall.errno_ret())?;
193            if action == default_action {
194                // When the action is the same as the default action, the rule is redundant. We can
195                // skip this here to avoid failing when we add the rules.
196                tracing::warn!(
197                    "detect a seccomp action that is the same as the default action: {:?}",
198                    syscall
199                );
200                continue;
201            }
202
203            for name in syscall.names() {
204                let sc = match ScmpSyscall::from_name(name) {
205                    Ok(x) => x,
206                    Err(_) => {
207                        // If we failed to resolve the syscall by name, likely the kernel
208                        // doeesn't support this syscall. So it is safe to skip...
209                        tracing::warn!(
210                            "failed to resolve syscall, likely kernel doesn't support this. {:?}",
211                            name
212                        );
213                        continue;
214                    }
215                };
216                match syscall.args() {
217                    // libseccomp allows multiple argument comparisons in a single rule,
218                    // but each syscall argument can only be compared once per rule.
219                    // When multiple comparisons target the same argument index,
220                    // we follow runc's behavior and add each condition as a separate rule.
221                    // Ref: libseccomp seccomp_rule_add(3)
222                    // https://github.com/seccomp/libseccomp/blob/9d7a3cd937e7841ece62ac19f0f06aafd0fdaaa9/doc/man/man3/seccomp_rule_add.3#L137
223                    // Ref: runc seccomp_linux.go
224                    // https://github.com/opencontainers/runc/blob/4b97e12fccdfca981a296d9ef82df5f3ae95e288/libcontainer/seccomp/seccomp_linux.go#L327
225                    Some(args) => {
226                        let mut comparators = Vec::<ScmpArgCompare>::with_capacity(args.len());
227                        let mut seen = HashSet::new();
228                        let mut has_duplicate_index = false;
229
230                        for arg in args {
231                            let index = arg.index() as u32;
232                            let comparator = ScmpArgCompare::new(
233                                index,
234                                translate_op(arg.op(), arg.value_two()),
235                                arg.value(),
236                            );
237                            if !seen.insert(index) {
238                                has_duplicate_index = true;
239                            }
240                            comparators.push(comparator);
241                        }
242
243                        if has_duplicate_index {
244                            for comparator in &comparators {
245                                tracing::trace!(
246                                    ?name,
247                                    ?action,
248                                    ?comparator,
249                                    "add seccomp conditional rule separately"
250                                );
251                                ctx.add_rule_conditional(action, sc, std::slice::from_ref(comparator))
252                                    .map_err(|err| {
253                                        tracing::error!(
254                                            "failed to add seccomp action: {:?}. Comparator: {:?} Syscall: {name}",
255                                            &action,
256                                            comparator,
257                                        );
258                                        SeccompError::AddRule { source: err }
259                                    })?;
260                            }
261                        } else {
262                            tracing::trace!(
263                                ?name,
264                                ?action,
265                                ?comparators,
266                                "add seccomp conditional rule"
267                            );
268                            ctx.add_rule_conditional(action, sc, &comparators)
269                                .map_err(|err| {
270                                    tracing::error!(
271                                        "failed to add seccomp action: {:?}. Comparators: {:?} Syscall: {name}",
272                                        &action,
273                                        comparators,
274                                    );
275                                    SeccompError::AddRule { source: err }
276                                })?;
277                        }
278                    }
279                    None => {
280                        tracing::trace!(?name, ?action, "add seccomp rule");
281                        ctx.add_rule(action, sc).map_err(|err| {
282                            tracing::error!(
283                                "failed to add seccomp rule: {:?}. Syscall: {name}",
284                                &sc
285                            );
286                            SeccompError::AddRule { source: err }
287                        })?;
288                    }
289                }
290            }
291        }
292    }
293
294    // In order to use the SECCOMP_SET_MODE_FILTER operation, either the calling
295    // thread must have the CAP_SYS_ADMIN capability in its user namespace, or
296    // the thread must already have the no_new_privs bit set.
297    // Ref: https://man7.org/linux/man-pages/man2/seccomp.2.html
298    ctx.load()
299        .map_err(|err| SeccompError::LoadContext { source: err })?;
300
301    let fd = if is_notify(seccomp) {
302        Some(
303            ctx.get_notify_fd()
304                .map_err(|err| SeccompError::GetNotifyId { source: err })?,
305        )
306    } else {
307        None
308    };
309
310    Ok(fd)
311}
312
313pub fn is_notify(seccomp: &LinuxSeccomp) -> bool {
314    seccomp
315        .syscalls()
316        .iter()
317        .flatten()
318        .any(|syscall| syscall.action() == LinuxSeccompAction::ScmpActNotify)
319}
320
#[cfg(test)]
mod tests {
    use std::path;

    use anyhow::{Context, Result};
    use oci_spec::runtime::{
        Arch, LinuxSeccompArgBuilder, LinuxSeccompBuilder, LinuxSyscallBuilder,
    };
    use serial_test::serial;

    use super::*;
    use crate::test_utils::{self, TestCallbackError};

    /// Builds an allow-by-default profile for the native architecture whose
    /// rules are `syscalls`.
    fn allow_by_default(syscalls: Vec<oci_spec::runtime::LinuxSyscall>) -> Result<LinuxSeccomp> {
        let profile = LinuxSeccompBuilder::default()
            .default_action(LinuxSeccompAction::ScmpActAllow)
            .architectures(vec![Arch::ScmpArchNative])
            .syscalls(syscalls)
            .build()?;
        Ok(profile)
    }

    /// Builds one argument comparison for a conditional rule.
    fn compare_arg(
        index: usize,
        value: u64,
        op: LinuxSeccompOperator,
    ) -> Result<oci_spec::runtime::LinuxSeccompArg> {
        let arg = LinuxSeccompArgBuilder::default()
            .index(index)
            .value(value)
            .op(op)
            .build()?;
        Ok(arg)
    }

    /// Builds a rule that makes `socket` fail with `EAGAIN` under `args`.
    fn socket_eagain_rule(
        args: Vec<oci_spec::runtime::LinuxSeccompArg>,
    ) -> Result<oci_spec::runtime::LinuxSyscall> {
        let rule = LinuxSyscallBuilder::default()
            .names(vec![String::from("socket")])
            .action(LinuxSeccompAction::ScmpActErrno)
            .errno_ret(libc::EAGAIN as u32)
            .args(args)
            .build()?;
        Ok(rule)
    }

    /// Loads `profile` inside a child process and expects that to succeed.
    fn load_in_child(profile: &LinuxSeccomp) -> Result<()> {
        test_utils::test_in_child_process(|| {
            let _ = prctl::set_no_new_privileges(true);
            initialize_seccomp(profile).expect("failed to initialize seccomp");

            Ok(())
        })?;

        Ok(())
    }

    #[test]
    #[serial]
    fn test_basic() -> Result<()> {
        // Note: seccomp profiles are really hard to unit test. First, we can't
        // really test a default error or kill action, because the rust test
        // harness itself relies on certain syscalls. Second, some syscalls do
        // not return an errno; they just raise an abort signal or even
        // segfault. We use the `getcwd` syscall here, since it correctly
        // returns an error under a seccomp rule. This is more of a sanity
        // check.

        // Pick an error that getcwd would never return on its own, so that a
        // match proves the failure came from the seccomp rule.
        let expect_error = libc::EAGAIN;

        let getcwd_rule = LinuxSyscallBuilder::default()
            .names(vec![String::from("getcwd")])
            .action(LinuxSeccompAction::ScmpActErrno)
            .errno_ret(expect_error as u32)
            .build()?;
        let seccomp_profile = allow_by_default(vec![getcwd_rule])?;

        test_utils::test_in_child_process(|| {
            let _ = prctl::set_no_new_privileges(true);
            initialize_seccomp(&seccomp_profile).expect("failed to initialize seccomp");

            match nix::unistd::getcwd() {
                Ok(_) => Err(TestCallbackError::Custom(
                    "getcwd didn't error out as seccomp profile specified".to_string(),
                ))?,
                Err(errno) if errno != nix::errno::Errno::from_raw(expect_error) => {
                    Err(TestCallbackError::Custom(format!(
                        "getcwd failed but we didn't get the expected error from seccomp profile: {}",
                        errno
                    )))?
                }
                Err(_) => {}
            }

            Ok(())
        })?;

        Ok(())
    }

    #[test]
    #[serial]
    fn test_moby() -> Result<()> {
        let fixture_path =
            path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json");
        let spec = oci_spec::runtime::Spec::load(fixture_path)
            .context("Failed to load test spec for seccomp")?;

        // The fixture is known to contain both linux and seccomp sections, so
        // unwrapping is fine here.
        let seccomp_profile = spec.linux().as_ref().unwrap().seccomp().as_ref().unwrap();
        load_in_child(seccomp_profile)
    }

    #[test]
    #[serial]
    fn test_seccomp_notify() -> Result<()> {
        let notify_rule = LinuxSyscallBuilder::default()
            .names(vec![String::from("getcwd")])
            .action(LinuxSeccompAction::ScmpActNotify)
            .build()?;
        let seccomp_profile = allow_by_default(vec![notify_rule])?;

        test_utils::test_in_child_process(|| {
            let _ = prctl::set_no_new_privileges(true);
            let fd =
                initialize_seccomp(&seccomp_profile).expect("failed to initialize seccomp profile");
            match fd {
                Some(_) => {}
                None => Err(TestCallbackError::Custom(
                    "failed to get a seccomp notify fd with notify seccomp profile".to_string(),
                ))?,
            }

            Ok(())
        })?;

        Ok(())
    }

    #[test]
    #[serial]
    fn test_seccomp_conditional_rule_multiple_distinct_args() -> Result<()> {
        // Two comparisons on different argument indices.
        let rule = socket_eagain_rule(vec![
            compare_arg(0, libc::AF_INET as u64, LinuxSeccompOperator::ScmpCmpEq)?,
            compare_arg(1, libc::SOCK_STREAM as u64, LinuxSeccompOperator::ScmpCmpEq)?,
        ])?;
        let seccomp_profile = allow_by_default(vec![rule])?;

        load_in_child(&seccomp_profile)
    }

    #[test]
    #[serial]
    fn test_seccomp_conditional_rule_duplicate_arg_index() -> Result<()> {
        // Two comparisons on the same argument index.
        let rule = socket_eagain_rule(vec![
            compare_arg(0, libc::AF_INET as u64, LinuxSeccompOperator::ScmpCmpEq)?,
            compare_arg(0, libc::AF_UNIX as u64, LinuxSeccompOperator::ScmpCmpNe)?,
        ])?;
        let seccomp_profile = allow_by_default(vec![rule])?;

        load_in_child(&seccomp_profile)
    }

    #[test]
    #[serial]
    fn test_seccomp_multiple_syscall_entries_for_same_name() -> Result<()> {
        let rule1 = socket_eagain_rule(vec![
            compare_arg(0, libc::AF_NETLINK as u64, LinuxSeccompOperator::ScmpCmpEq)?,
            compare_arg(2, libc::NETLINK_AUDIT as u64, LinuxSeccompOperator::ScmpCmpNe)?,
        ])?;
        let rule2 = socket_eagain_rule(vec![compare_arg(
            0,
            libc::AF_INET as u64,
            LinuxSeccompOperator::ScmpCmpNe,
        )?])?;
        let profile = allow_by_default(vec![rule1, rule2])?;

        load_in_child(&profile)
    }
}