1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
use std::{
ffi::CString,
os::{
fd::{AsFd, AsRawFd, FromRawFd, IntoRawFd},
unix::io::RawFd,
},
};
use libseccomp::ScmpFilterContext;
use nix::{
errno::Errno,
fcntl::OFlag,
sched::CloneFlags,
sys::{
ptrace::{cont, Options},
signal::{kill, Signal},
wait::{Id, WaitPidFlag},
},
unistd::{read, write, Pid},
};
use crate::{
compat::{waitid, WaitStatus, PTRACE_SEIZE},
fd::{fdclone, pidfd_getfd, SafeOwnedFd},
log::LOG_FD,
retry::retry_on_eintr,
rng::duprand,
sandbox::RawIoctlMap,
unshare::{child::child_after_clone, config::Config, Child, Command},
};
/// Boxed callback stored in `ChildInfo::pre_exec`.
///
/// Takes no arguments and reports failure as an `Errno`.
/// NOTE(review): presumably invoked in the child right before exec --
/// confirm against `child_after_clone`.
type ChildPreExecFunc = Box<dyn Fn() -> Result<(), Errno>>;
/// Two pairs of raw file descriptors used for the seccomp fd handshake.
///
/// On the parent side (`Command::after_start`) the number of the seccomp
/// notify fd is read from `.1 .0`, the one-byte acknowledgement is written
/// to `.0 .1`, and `.0 .0` and `.1 .1` are closed unused.
type PipePair = ((RawFd, RawFd), (RawFd, RawFd));
/// Everything handed over to the freshly cloned child.
///
/// `Command::spawn` builds this from the fields of the `Command`, boxes
/// it and passes it to `child_after_clone`.
pub struct ChildInfo {
/// Configuration taken from `Command::config`.
pub cfg: Config,
/// Taken from `Command::exe_file`.
/// NOTE(review): presumably the path of the program to execute -- confirm.
pub exe_file: CString,
/// Taken from `Command::exe_args`.
/// NOTE(review): presumably the argument vector for the exec -- confirm.
pub exe_args: Vec<CString>,
/// Optional callback taken from `Command::pre_exec`.
pub pre_exec: Option<ChildPreExecFunc>,
/// Optional raw file descriptor taken from `Command::pty_fd`.
/// NOTE(review): presumably a pseudo-terminal for the child -- confirm.
pub pty_fd: Option<RawFd>,
/// Optional ioctl map taken from `Command::ioctl_denylist`.
pub ioctl_denylist: Option<RawIoctlMap>,
/// Optional libseccomp filter context taken from `Command::seccomp_filter`.
/// NOTE(review): presumably loaded by the child -- confirm.
pub seccomp_filter: Option<ScmpFilterContext>,
/// Pipe pairs over which the seccomp notify fd number and the parent's
/// acknowledgement are exchanged; copied from `Command::seccomp_pipefd`.
pub seccomp_pipefd: PipePair,
}
impl Command {
/// Spawn the command and return a handle that can be waited for
pub fn spawn(mut self) -> Result<Child, Errno> {
let exe_file = self.exe_file.take().ok_or(Errno::EFAULT)?;
let exe_args = self.exe_args.take().ok_or(Errno::EFAULT)?;
// Prepare information for the Syd child.
let child_info = Box::new(ChildInfo {
exe_file,
exe_args,
cfg: self.config,
pre_exec: std::mem::take(&mut self.pre_exec),
pty_fd: std::mem::take(&mut self.pty_fd),
ioctl_denylist: std::mem::take(&mut self.ioctl_denylist),
seccomp_filter: std::mem::take(&mut self.seccomp_filter),
seccomp_pipefd: self.seccomp_pipefd,
});
// Call clone(2), child_after_clone never returns.
let (pid_fd, child) = fdclone(
move || {
child_after_clone(child_info);
},
CloneFlags::empty(),
Some(libc::SIGCHLD),
)?;
// SAFETY: Randomize the pid FD for hardening.
let pid_fd_rand = duprand(pid_fd.as_raw_fd(), OFlag::O_CLOEXEC)?;
drop(pid_fd);
let pid_fd = pid_fd_rand;
// SAFETY: Randomize the log FD for hardening.
// O_EXCL closes oldfd on success.
let log_fd = LOG_FD.load(std::sync::atomic::Ordering::Relaxed);
if log_fd >= 0 {
let log_fd = duprand(log_fd, OFlag::O_CLOEXEC | OFlag::O_EXCL)?;
LOG_FD.store(log_fd.into_raw_fd(), std::sync::atomic::Ordering::Relaxed);
} // else logging is disabled.
let seccomp_fd = match self.after_start(child, &pid_fd) {
Ok(seccomp_fd) => seccomp_fd,
Err(e) => loop {
match waitid(Id::PIDFd(pid_fd.as_fd()), WaitPidFlag::WEXITED) {
Ok(WaitStatus::Exited(_, errno)) => return Err(Errno::from_raw(errno)),
Err(Errno::EINTR) => {}
_ => return Err(e),
}
},
};
Ok(Child {
pid: child.into(),
pid_fd: pid_fd.into_raw_fd(),
seccomp_fd: seccomp_fd.into_raw_fd(),
status: None,
})
}
fn after_start<Fd: AsFd>(mut self, pid: Pid, pid_fd: Fd) -> Result<SafeOwnedFd, Errno> {
if self.config.stop {
// Seize the process for tracing.
// This must happen before reading the seccomp fd.
// TODO: Make ptrace options configurable.
let ptrace_options: Options = Options::PTRACE_O_TRACEFORK
| Options::PTRACE_O_TRACEVFORK
| Options::PTRACE_O_TRACECLONE
| Options::PTRACE_O_TRACEEXEC // used by Exec TOCTOU mitigator.
| Options::PTRACE_O_TRACEEXIT // used by SegvGuard.
| Options::PTRACE_O_TRACESECCOMP // used by chdir and exec hooks.
| Options::PTRACE_O_TRACESYSGOOD // ditto.
| Options::PTRACE_O_EXITKILL; // we also set PDEATHSIG so this is the second layer.
// Step 1: Wait for the process to stop itself.
// Note, we also wait for EXITED so that if the process is
// interrupted, and the wait will fall through to the assert
// to fail.
let status = waitid(
Id::PIDFd(pid_fd.as_fd()),
WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
)?;
assert_eq!(status, WaitStatus::Stopped(pid, libc::SIGSTOP));
// Step 2: Seize the process.
// 1. We use PTRACE_SEIZE in the parent rather than
// PTRACE_TRACEME in the child for its improved
// behaviour/API. This also gives us the chance to deny
// PTRACE_TRACEME and further confine the sandbox against
// e.g. trivial ptrace detectors.
// 2. Panic if PTRACE_SEIZE fails as otherwise we will leave
// the sandbox process in an uninterruptible, broken state.
// The typical error case is EPERM which means parent is
// strace or YAMA is active.
assert_eq!(
// SAFETY: `pid` is a valid child pid from `fdclone`;
// `PTRACE_SEIZE` with valid option flags.
Errno::result(unsafe {
libc::ptrace(
PTRACE_SEIZE,
pid.as_raw(),
0,
ptrace_options.bits() as *mut libc::c_void,
)
})
.map(drop),
Ok(()),
"YAMA or strace? Use with strace -f syd -pD ..."
);
let status = waitid(
Id::PIDFd(pid_fd.as_fd()),
WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
)?;
assert_eq!(
status,
WaitStatus::PtraceEvent(pid, libc::SIGSTOP, libc::PTRACE_EVENT_STOP)
);
// SAFETY: nix does not have a wrapper for PTRACE_LISTEN.
Errno::result(unsafe {
libc::ptrace(crate::compat::PTRACE_LISTEN, pid.as_raw(), 0, 0)
})?;
// Step 3: Successfully attached, resume the process.
// We have to do a simple signal ping-pong here but
// it's done once and it's worth the trouble.
kill(pid, Signal::SIGCONT)?;
let status = waitid(
Id::PIDFd(pid_fd.as_fd()),
WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
)?;
assert_eq!(
status,
WaitStatus::PtraceEvent(pid, libc::SIGTRAP, libc::PTRACE_EVENT_STOP)
);
cont(pid, None)?;
let status = waitid(
Id::PIDFd(pid_fd.as_fd()),
WaitPidFlag::WEXITED | WaitPidFlag::WSTOPPED | WaitPidFlag::__WNOTHREAD,
)?;
assert_eq!(status, WaitStatus::PtraceEvent(pid, libc::SIGCONT, 0));
cont(pid, Some(Signal::SIGCONT))?;
}
if let Some(ref mut callback) = self.before_unfreeze {
#[expect(clippy::cast_sign_loss)]
callback(i32::from(pid) as u32)?;
}
// SAFETY: Parent owns its copy of the pipes,
// and is responsible for closing them.
let seccomp_pipefd = unsafe {
(
(
SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.0 .0),
SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.0 .1),
),
(
SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.1 .0),
SafeOwnedFd::from_raw_fd(self.seccomp_pipefd.1 .1),
),
)
};
// We'll read seccomp notify fd from the second pipe,
// and write the acknowledgement notification to
// the first pipe.
let (pipe_ro, pipe_rw) = (seccomp_pipefd.1 .0, seccomp_pipefd.0 .1);
// Close the unused ends of the pipes.
drop(seccomp_pipefd.0 .0);
drop(seccomp_pipefd.1 .1);
// Read the value of the file descriptor from the pipe.
// Handle interrupts and partial reads.
// EOF means process died before writing to the pipe.
let mut buf = vec![0u8; size_of::<RawFd>()];
let mut nread = 0;
while nread < buf.len() {
#[expect(clippy::arithmetic_side_effects)]
match read(&pipe_ro, &mut buf[nread..]) {
Ok(0) => return Err(Errno::EIO),
Ok(n) => nread += n,
Err(Errno::EINTR | Errno::EAGAIN) => continue,
Err(errno) => return Err(errno),
}
}
// Close the read end of the pipe.
drop(pipe_ro);
let remote_seccomp_fd = match buf.as_slice().try_into() {
Ok(buf) => RawFd::from_le_bytes(buf),
Err(_) => return Err(Errno::EINVAL),
};
// Get the seccomp notify fd using pidfd_getfd(2).
// The child is waiting on the read end of the pipe,
// for us to safely transfer the file descriptor.
let seccomp_fd = pidfd_getfd(pid_fd, remote_seccomp_fd)?;
// Unblock the child to safely continue and close
// their copy of the seccomp notify file descriptor.
// Handle interrupts.
// Partial write is not possible.
// EOF means process died before reading from the pipe.
let buf = [42u8; 1];
match retry_on_eintr(|| write(&pipe_rw, &buf))? {
0 => return Err(Errno::EIO),
1 => {}
n => unreachable!("BUG: invalid pipe write of size {n}!"),
};
// Close the write end of the pipe.
drop(pipe_rw);
// SAFETY: Randomize the seccomp(2) fd for hardening.
// Old seccomp fd will be closed by Drop on function exit.
duprand(seccomp_fd.as_raw_fd(), OFlag::O_CLOEXEC)
}
}