syd 3.52.0

rock-solid application kernel
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
#![allow(clippy::undocumented_unsafe_blocks)]

use std::{
    env, mem,
    os::{
        fd::{AsRawFd, FromRawFd, RawFd},
        unix::ffi::OsStrExt,
    },
    ptr,
};

use btoi::btoi;
use memchr::arch::all::is_prefix;
use nix::{
    errno::Errno,
    libc,
    sys::{
        resource::{setrlimit, Resource},
        signal::{kill, sigprocmask, SigSet, SigmaskHow, Signal},
    },
    unistd::{execvp, getpid, read, setsid, tcsetpgrp, write},
};

use crate::{
    caps::Capability,
    compat::{seccomp_load_listener, set_dumpable, set_name, set_pdeathsig},
    config::*,
    confine::{
        confine_landlock_scope, confine_scmp_ioctl_cld, confine_scmp_kptr, confine_scmp_pwritev2,
        safe_drop_cap,
    },
    fd::{close, close_static_files, SafeOwnedFd, ROOT_FILE},
    retry::retry_on_eintr,
    unshare::{config::Config, error::ErrorCode as Err, run::ChildInfo},
};

unsafe fn fail_errno(code: Err, errno: i32) -> ! {
    let msg = match code {
        Err::CapSet => c"syd: capset error".as_ptr(),
        Err::Exec => c"syd: exec error".as_ptr(),
        Err::ParentDeathSignal => c"syd: parent-death-signal error".as_ptr(),
        Err::PreExec => c"syd: pre-exec error".as_ptr(),
        Err::ProcessStop => c"syd: error stopping process".as_ptr(),
        Err::ResetSignal => c"syd: error resetting signals".as_ptr(),
        Err::SetResourceLimits => c"syd: error setting resource limits".as_ptr(),
        Err::LandlockFilterScopedSignals => c"syd: error scoping signals with landlock".as_ptr(),
        Err::Seccomp => c"syd: seccomp error".as_ptr(),
        Err::SeccompFilterIoctl => c"syd: seccomp filter ioctl error".as_ptr(),
        Err::SeccompFilterAppendOnly => c"syd: seccomp filter pwritev2 error".as_ptr(),
        Err::SeccompFilterKptr => c"syd: seccomp filter kernel pointer error".as_ptr(),
        Err::SeccompSendFd => c"syd: seccomp send notify-fd error".as_ptr(),
        Err::SeccompWaitFd => c"syd: seccomp wait for notify-fd error".as_ptr(),
        Err::SetDumpable => c"syd: error resetting process dumpable attribute".as_ptr(),
        Err::SetSid => c"syd: setsid error".as_ptr(),
        Err::SetPty => c"syd: error setting pty as controlling terminal".as_ptr(),
        Err::DupPty => c"syd: error duplicating pty onto stdio fds".as_ptr(),
        Err::SetPgrp => c"syd: error setting foreground process group".as_ptr(),
        Err::SetTSC => c"syd: set-tsc error".as_ptr(),
    };
    Errno::set_raw(errno);
    libc::perror(msg as *const libc::c_char);
    libc::_exit(errno);
}

/// Abort the child through [`fail_errno`], reporting the calling thread's
/// current `errno` (as returned by `Errno::last_raw()`) for error `$error`.
///
/// The `$child` argument is accepted but is not used by the expansion.
macro_rules! fail_safe {
    ($child:expr, $error:expr) => {
        // Snapshot the current errno before handing over to fail_errno.
        let errno = Errno::last_raw();
        unsafe { fail_errno($error, errno) }
    };
}

/// Abort the child through [`fail_errno`] with an explicitly supplied error
/// number `$errno` for error `$error`.
///
/// The expansion is a single `unsafe` block whose value is the diverging
/// call, so the macro can be used both as a statement and as a match-arm
/// expression. The `$child` argument is accepted but is not used by the
/// expansion.
macro_rules! fail_errno_safe {
    ($child:expr, $error:expr, $errno:expr) => {
        unsafe { fail_errno($error, $errno) }
    };
}

/// Child-side entry point after clone(2): prepares the process and finally
/// executes the target program. This function never returns.
///
/// Every failure path goes through `fail_errno` (via the `fail_safe!` and
/// `fail_errno_safe!` macros), which prints a diagnostic and exits with the
/// error number as status. The successful paths end in `execvp`, or in
/// `_exit(0)` when export mode is in effect.
///
/// Steps, in the order performed below:
///
/// 1. Name the process `syd_exec`, take ownership of the two pipe ends
///    this side uses and close the other two.
/// 2. Set the parent-death signal if `cfg.death_sig` is set.
/// 3. If a PTY fd was supplied: `setsid`, `TIOCSCTTY`, `tcsetpgrp`, and
///    `dup2` the PTY onto stdin/stdout/stderr.
/// 4. If `cfg.restore_sigmask`: clear the signal mask and reset signal
///    dispositions.
/// 5. Apply the optional seccomp(2) helper filters (ioctl denylist,
///    pwritev2 append-only, kernel-pointer restriction) and the
///    landlock(7) scope.
/// 6. Close static files and the fds named by environment variables, then
///    scrub Syd-related variables from the environment.
/// 7. Run the `pre_exec` callback, optionally set `PR_SET_TSC`, set resource
///    limits (here only if `cfg.restrict_prlimit`), reset the dumpable
///    attribute, and optionally `SIGSTOP` ourselves.
/// 8. If a seccomp filter is present: load it, send the notify fd number to
///    the parent over the pipe and wait for the one-byte acknowledgement.
/// 9. Drop capabilities, set resource limits (if not done in step 7), exit
///    in export mode, otherwise `execvp` the program.
#[expect(clippy::cognitive_complexity)]
pub fn child_after_clone(mut child: Box<ChildInfo>) -> ! {
    // Set process name, ignore errors.
    let _ = set_name(c"syd_exec");

    // We'll write seccomp(2) notify fd to the second pipe, and
    // read the acknowledgement notification from the first pipe.
    // SAFETY: Raw fds are valid in the child's fd table after clone(2) without CLONE_FILES.
    let (pipe_ro, pipe_rw) = unsafe {
        (
            SafeOwnedFd::from_raw_fd(child.seccomp_pipefd.0 .0),
            SafeOwnedFd::from_raw_fd(child.seccomp_pipefd.1 .1),
        )
    };

    // Close the unused ends of the pipes.
    // Errors are deliberately ignored here.
    let _ = close(child.seccomp_pipefd.0 .1);
    let _ = close(child.seccomp_pipefd.1 .0);

    // Set the parent-death signal, if one is configured.
    if let Some(&sig) = child.cfg.death_sig.as_ref() {
        if let Err(errno) = set_pdeathsig(Some(sig)) {
            fail_errno_safe!(child, Err::ParentDeathSignal, errno as i32);
        }
    }

    // Restriction 0: Change controlling terminal to PTY as necessary.
    // `take()` leaves `None` behind in `child.pty_fd`.
    if let Some(pty_fd) = child.pty_fd.take() {
        // SAFETY: pty_fd is a valid FD.
        let pty_fd = unsafe { SafeOwnedFd::from_raw_fd(pty_fd) };

        // Become session leader so we can take a controlling TTY.
        if let Err(errno) = setsid() {
            fail_errno_safe!(child, Err::SetSid, errno as i32);
        }

        // Make the PTY fd our controlling terminal.
        // SAFETY: pty_fd is owned by this scope and stays open across the call.
        if let Err(errno) =
            Errno::result(unsafe { libc::ioctl(pty_fd.as_raw_fd(), libc::TIOCSCTTY, 0) })
        {
            fail_errno_safe!(child, Err::SetPty, errno as i32);
        }

        // Make us the foreground process group.
        if let Err(errno) = tcsetpgrp(&pty_fd, getpid()) {
            fail_errno_safe!(child, Err::SetPgrp, errno as i32);
        }

        // Duplicate PTY fd onto stdio(3) fds.
        for std_fd in [libc::STDIN_FILENO, libc::STDOUT_FILENO, libc::STDERR_FILENO] {
            // SAFETY: pty_fd is owned by this scope and stays open across the call.
            if let Err(errno) = Errno::result(unsafe { libc::dup2(pty_fd.as_raw_fd(), std_fd) }) {
                fail_errno_safe!(child, Err::DupPty, errno as i32);
            }
        }

        // Close the original PTY fd.
        drop(pty_fd);
    }

    // This must happen after ^^PTY handling above,
    // because we want to unignore SIGTTOU.
    if child.cfg.restore_sigmask {
        // Reset blocking signals.
        // Step 1: Reset the signal mask using pthread_sigmask.
        // The return values of these libc calls are not checked.
        // SAFETY: sigmask is a local that sigemptyset initialises before
        // it is passed to pthread_sigmask; the old-mask pointer is null.
        unsafe {
            let mut sigmask: libc::sigset_t = mem::zeroed();
            libc::sigemptyset(&raw mut sigmask);
            libc::pthread_sigmask(libc::SIG_SETMASK, &raw const sigmask, ptr::null_mut());
        }
        // Step 2: Unblock all signals using sigprocmask.
        let sigmask = SigSet::all();
        if let Err(errno) = sigprocmask(SigmaskHow::SIG_UNBLOCK, Some(&sigmask), None) {
            fail_errno_safe!(child, Err::ResetSignal, errno as i32);
        }

        // Reset all signals to their default dispositions.
        if let Err(errno) = crate::reset_signals() {
            fail_errno_safe!(child, Err::ResetSignal, errno as i32);
        }
    }

    // Restriction 1:
    //
    // Add per-architecture seccomp(2) filters to deny unsafe ioctl(2) requests.
    if let Some(denylist) = child.ioctl_denylist.take() {
        if let Err(error) = confine_scmp_ioctl_cld(&denylist, child.cfg.ssb) {
            // Report ENOSYS when the error carries no errno of its own.
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterIoctl, errno as i32);
        }
    }

    // Restriction 2:
    //
    // Deny RWF_NOAPPEND for pwritev2(2) if append-only is enabled.
    if child.cfg.append_only {
        if let Err(error) = confine_scmp_pwritev2(child.cfg.ssb) {
            // Report ENOSYS when the error carries no errno of its own.
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterAppendOnly, errno as i32);
        }
    }

    // Restriction 3:
    //
    // Restrict kernel pointers in syscall arguments unless trace/allow_unsafe_kptr:1 is set.
    if child.cfg.restrict_kptr {
        if let Err(error) = confine_scmp_kptr(child.cfg.ssb) {
            // Report ENOSYS when the error carries no errno of its own.
            let errno = error.errno().unwrap_or(Errno::ENOSYS);
            fail_errno_safe!(child, Err::SeccompFilterKptr, errno as i32);
        }
    }

    // Apply a landlock(7) scope sandbox to restrict
    // 1. Ptrace attach outside Landlock.
    // 2. Signal send outside Landlock.
    //
    // Leave path and network restrictions for landlock(7)
    // to be configured by the user using Lock sandboxing.
    //
    // This must happen before close_static_files() which will
    // invalidate ROOT_FILE().
    if let Err(errno) = confine_landlock_scope(
        Some(ROOT_FILE()),
        child.cfg.landlock_access_fs,
        child.cfg.landlock_access_net,
        child.cfg.landlock_scoped_abs,
    ) {
        fail_errno_safe!(child, Err::LandlockFilterScopedSignals, errno as i32);
    }

    // Do not leak the static file descriptors to the sandbox process.
    close_static_files();

    // Do not leak the following FDs to the sandbox process:
    // 1. Log file descriptor.
    // 2. IPC epoll file descriptor.
    // 3. IPC UNIX socket descriptor.
    // TODO: Move this to config.rs.
    const CLOSE_FD_ENVS: &[&str] = &[ENV_LOG_FD, ENV_IPC_POLL_FD, ENV_IPC_UNIX_FD];
    for env in CLOSE_FD_ENVS {
        // Parse the fd number from the variable's value; a missing
        // variable or an unparsable value yields `None` and is skipped.
        let fd = if let Some(fd) = env::var_os(env) {
            btoi::<RawFd>(fd.as_bytes()).ok()
        } else {
            None
        };
        if let Some(fd) = fd {
            // Negative values are ignored; close errors are ignored too.
            if fd >= 0 {
                let _ = close(fd);
            }
        }
    }

    // Check if export mode is in effect.
    // NOTE(review): presumably sampled here because the environment scrub
    // below may remove this variable -- confirm against ENV_DUMP_SCMP's name.
    let is_export = env::var_os(ENV_DUMP_SCMP).is_some();

    // Passthrough RUST_BACKTRACE to the sandbox process.
    // RUST_BACKTRACE is set from SYD_RUST_BACKTRACE when that is present,
    // and removed otherwise.
    match env::var_os("SYD_RUST_BACKTRACE") {
        Some(val) => env::set_var("RUST_BACKTRACE", val),
        None => env::remove_var("RUST_BACKTRACE"),
    }

    // Clean Syd environment variables from process environment.
    // Removed: names starting with `CARGO_BIN_EXE_syd`, and names starting
    // with `SYD_` unless they start with `SYD_TEST_`.
    for (key, _) in env::vars_os() {
        if is_prefix(key.as_bytes(), b"CARGO_BIN_EXE_syd")
            || (is_prefix(key.as_bytes(), b"SYD_") && !is_prefix(key.as_bytes(), b"SYD_TEST_"))
        {
            env::remove_var(key);
        }
    }

    // Run the caller-supplied pre-exec callback, if any.
    if let Some(callback) = &child.pre_exec {
        if let Err(errno) = callback() {
            fail_errno_safe!(child, Err::PreExec, errno as i32);
        }
    }

    // Apply prctl(PR_SET_TSC, PR_TSC_SIGSEGV) when TSC access is to be denied.
    if child.cfg.deny_tsc {
        // SAFETY: prctl is called with plain integer constants only.
        if let Err(errno) =
            Errno::result(unsafe { libc::prctl(libc::PR_SET_TSC, libc::PR_TSC_SIGSEGV) })
        {
            fail_errno_safe!(child, Err::SetTSC, errno as i32);
        }
    }

    if child.cfg.restrict_prlimit {
        // Set resource limits before seccomp(2), because it will deny prlimit(2).
        if let Err(errno) = set_resource_limits(&child.cfg) {
            fail_errno_safe!(child, Err::SetResourceLimits, errno as i32);
        }
    }

    // Reset dumpable attribute to allow per-proc(5) access.
    if let Err(errno) = set_dumpable(true) {
        fail_errno_safe!(child, Err::SetDumpable, errno as i32);
    }

    if child.cfg.stop {
        // Stop the process to give the parent a chance to seize us and
        // set ptrace(2) options. This must happen _before_ loading the
        // seccomp(2) filter.
        if let Err(errno) = kill(getpid(), Signal::SIGSTOP) {
            fail_errno_safe!(child, Err::ProcessStop, errno as i32);
        }
    }

    // The filter is moved out of `child` here; the remaining fields of
    // `child` are still used further below.
    if let Some(seccomp_filter) = child.seccomp_filter {
        // Load the seccomp(2) filter, get seccomp(2) notification fd.
        let seccomp_fd = match seccomp_load_listener(&seccomp_filter) {
            Ok(fd) => fd,
            Err(errno) => fail_errno_safe!(child, Err::Seccomp, errno as i32),
        };

        // Write the value of the seccomp notify fd to the pipe.
        // The fd number is sent as the little-endian bytes of the raw fd.
        // Handle partial writes and interrupts.
        // EOF means parent died before reading.
        let fd = seccomp_fd.as_raw_fd().to_le_bytes();
        let mut nwrite = 0;
        while nwrite < fd.len() {
            #[expect(clippy::arithmetic_side_effects)]
            match retry_on_eintr(|| write(&pipe_rw, &fd[nwrite..])) {
                Ok(0) => {
                    // Parent died before reading.
                    // This should ideally never happen.
                    fail_errno_safe!(child, Err::SeccompSendFd, Errno::EIO as i32);
                }
                // Advance past the bytes already written and retry the rest.
                Ok(n) => nwrite += n,
                Err(errno) => fail_errno_safe!(child, Err::SeccompSendFd, errno as i32),
            }
        }

        // Close the write end of the pipe.
        drop(pipe_rw);

        // Wait for the parent to get the file descriptor.
        // Handle interrupts.
        // Partial read is not possible.
        // EOF means parent died before writing to the pipe.
        let mut buf = [0u8; 1];
        match retry_on_eintr(|| read(&pipe_ro, &mut buf[..])) {
            Ok(0) => {
                // Parent died before writing.
                // This should ideally never happen.
                fail_errno_safe!(child, Err::SeccompWaitFd, Errno::EIO as i32);
            }
            // The acknowledgement is a single byte with the value 42.
            Ok(1) if buf[0] == 42 => {
                // Parent received seccomp fd successfully.
                // We can go ahead and close our copy now.
            }
            // Any other byte is treated as a bug and panics.
            Ok(_) => unreachable!("BUG: The meaning of life is not {:#x}!", buf[0]),
            Err(errno) => fail_errno_safe!(child, Err::SeccompWaitFd, errno as i32),
        }

        // Close our copy of the seccomp-notify fd.
        // Parent process has already acknowledged that
        // it has received a copy of this fd.
        drop(seccomp_fd);

        // Release resources for seccomp BPF filter.
        // Memory allocation/deallocation is OK here
        // now that we have transferred over the
        // seccomp-notify fd to the parent process.
        // Otherwise we'd risk breaking Memory sandboxing.
        drop(seccomp_filter);

        // Close the read end of the pipe.
        drop(pipe_ro);
    } else {
        // Close unused ends of the pipes.
        drop(pipe_ro);
        drop(pipe_rw);
    }

    // Drop the following capabilities unconditionally.
    // 1. CAP_CHOWN: for privileged chown(2)
    // 2. CAP_MKNOD: for privileged mknod(2)
    // 3. CAP_NET_BIND_SERVICE: for privileged bind(2)
    // 4. CAP_NET_RAW: for privileged socket(2)
    // These system calls happen in syd-emulator threads even if the
    // respective unsafe options are set, therefore dropping the caps
    // here ensures this.
    const CAP_DROP: &[Capability] = &[
        Capability::CAP_CHOWN,
        Capability::CAP_MKNOD,
        Capability::CAP_NET_BIND_SERVICE,
        Capability::CAP_NET_RAW,
    ];
    for cap in CAP_DROP {
        // The error value is discarded by is_err(); fail_safe! reports
        // whatever errno is current at this point.
        if safe_drop_cap(*cap).is_err() {
            fail_safe!(child, Err::CapSet);
        }
    }

    // Drop CAP_SYS_PTRACE late as Syd may need it.
    // Skipped when `cfg.keep` is set.
    if !child.cfg.keep && safe_drop_cap(Capability::CAP_SYS_PTRACE).is_err() {
        fail_safe!(child, Err::CapSet);
    }

    if !child.cfg.restrict_prlimit {
        // Set resource limits after seccomp(2) with trace/allow_unsafe_prlimit:1.
        if let Err(errno) = set_resource_limits(&child.cfg) {
            fail_errno_safe!(child, Err::SetResourceLimits, errno as i32);
        }
    }

    // Exit immediately if export mode is in effect.
    if is_export {
        // SAFETY: _exit takes a plain integer status and does not return.
        unsafe { libc::_exit(0) };
    }

    // execvp only comes back on failure, hence the irrefutable `Err` pattern.
    let Err(errno) = execvp(&child.exe_file, &child.exe_args);
    fail_errno_safe!(child, Err::Exec, errno as i32);
}

/// Apply every resource limit configured in `cfg` to the calling process.
///
/// Each configured value is installed as both the soft and the hard limit
/// of its resource. Limits left as `None` are not touched.
///
/// # Errors
///
/// Returns the `Errno` of the first `setrlimit` call that fails; limits
/// later in the table are then not attempted.
fn set_resource_limits(cfg: &Config) -> Result<(), Errno> {
    // (resource, configured value) pairs, listed in application order.
    let table = [
        (Resource::RLIMIT_AS, cfg.rlimit_as),
        (Resource::RLIMIT_CORE, cfg.rlimit_core),
        (Resource::RLIMIT_CPU, cfg.rlimit_cpu),
        (Resource::RLIMIT_DATA, cfg.rlimit_data),
        (Resource::RLIMIT_FSIZE, cfg.rlimit_fsize),
        (Resource::RLIMIT_MEMLOCK, cfg.rlimit_memlock),
        (Resource::RLIMIT_MSGQUEUE, cfg.rlimit_msgqueue),
        (Resource::RLIMIT_NICE, cfg.rlimit_nice),
        (Resource::RLIMIT_NOFILE, cfg.rlimit_nofile),
        (Resource::RLIMIT_NPROC, cfg.rlimit_nproc),
        (Resource::RLIMIT_RTPRIO, cfg.rlimit_rtprio),
        (Resource::RLIMIT_RTTIME, cfg.rlimit_rttime),
        (Resource::RLIMIT_SIGPENDING, cfg.rlimit_sigpending),
        (Resource::RLIMIT_STACK, cfg.rlimit_stack),
    ];

    for (resource, configured) in table {
        // Unset limits are skipped.
        if let Some(value) = configured {
            setrlimit(resource, value, value)?;
        }
    }

    Ok(())
}