kastellan_sandbox/lib.rs
1//! kastellan-sandbox: declarative, cross-platform sandbox for tool workers.
2//!
3//! One [`SandboxPolicy`] drives all backends. Backend selection is automatic
4//! per OS, with an optional micro-VM backend for stronger isolation.
5//!
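//! Backends (Phase 0/0b):
//! - `linux_bwrap` — bubblewrap + Landlock + seccomp-bpf (Linux default),
//!   with cgroup limits applied by `linux_cgroup`
//! - `macos_seatbelt` — sandbox-exec (Seatbelt) + setrlimit (macOS default)
//! - micro-VM — Apple `container` CLI (macOS Tahoe+), implemented by
//!   `macos_container`; the Firecracker (Linux) variant has no module in
//!   this crate yet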
10
11#[cfg(target_os = "linux")]
12pub mod linux_bwrap;
13#[cfg(target_os = "linux")]
14pub mod linux_cgroup;
15#[cfg(target_os = "macos")]
16pub mod macos_container;
17#[cfg(target_os = "macos")]
18pub mod macos_seatbelt;
19
20use std::path::PathBuf;
21use std::sync::Arc;
22
23use serde::{Deserialize, Serialize};
24use thiserror::Error;
25
/// Coarse profile presets that map to backend-specific defaults.
///
/// The `Default` is [`Profile::WorkerStrict`], so a policy that does not
/// name a profile explicitly gets the strictest preset.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub enum Profile {
    /// Strictest: no net by default, scratch FS only, minimal syscall set.
    #[default]
    WorkerStrict,
    /// Slightly relaxed for workers that need outbound HTTPS via the egress proxy.
    WorkerNetClient,
}
35
/// Network policy for a sandboxed worker.
///
/// The `Default` is [`Net::Deny`].
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub enum Net {
    /// Deny all network access.
    #[default]
    Deny,
    /// Allowlist of "host:port" entries. Egress still flows through the egress proxy.
    Allowlist(Vec<String>),
    /// The egress proxy itself: real outbound + DNS, self-enforcing. Maps to
    /// the same "share the host network namespace" behaviour as `Allowlist`
    /// *today*, but names the proxy-vs-worker distinction explicitly. Slice #2
    /// diverges them: `Allowlist` workers get a private netns whose only route
    /// out is the proxy UDS, while `ProxyEgress` keeps the real netns.
    ProxyEgress,
}
50
/// Declarative description of what one sandboxed worker may access and how
/// many resources it may consume. A single policy value is handed to
/// whichever backend runs the worker.
///
/// When deserializing, the fields marked `#[serde(default)]`
/// (`cpu_quota_pct`, `tasks_max`, `env`) may be omitted; all other fields
/// must be present. Use [`SandboxPolicy::default`] with struct-update
/// syntax to build a policy in code without listing every field.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SandboxPolicy {
    /// Read-only mounts/paths.
    pub fs_read: Vec<PathBuf>,
    /// Writable paths (typically a per-worker scratch dir).
    pub fs_write: Vec<PathBuf>,
    /// Network policy.
    pub net: Net,
    /// Hard CPU-time limit (milliseconds). Enforced via
    /// `setrlimit(RLIMIT_CPU)` from the worker prelude (POSIX, so applies
    /// on Linux and macOS). `0` means "unset, no rlimit applied".
    pub cpu_ms: u64,
    /// Hard memory limit (megabytes).
    ///
    /// * **Linux:** enforced via cgroup `MemoryMax` by [`crate::linux_cgroup`].
    /// * **macOS Seatbelt:** **not enforced** (Seatbelt has no memory
    ///   primitive; `RLIMIT_AS` has high false-positive risk for
    ///   malloc-heavy workers and is intentionally deferred).
    /// * **macOS Apple `container` backend** ([`crate::macos_container`]):
    ///   enforced via `container run -m <N>M` with SIGKILL on overrun.
    ///   Note the **200 MiB floor** — `container` rejects smaller values;
    ///   the backend clamps and emits a `tracing::warn!` so operators see
    ///   the silent widening. Opt-in per-worker (Slice 2 wiring), not the
    ///   default macOS backend.
    pub mem_mb: u64,
    /// Profile preset.
    pub profile: Profile,
    /// Per-worker CPU bandwidth ceiling (percent of one CPU). `None`
    /// falls back to the backend's defense-in-depth default.
    ///
    /// * **Linux cgroup:** enforced; default 200%, hardcoded in
    ///   [`crate::linux_cgroup`].
    /// * **macOS Seatbelt:** no effect (no equivalent primitive).
    /// * **macOS Apple `container` backend:** enforced via
    ///   `container run -c <fractional vCPUs>`; `None` lets `container`
    ///   pick up its host `--default-cpus` configuration (no
    ///   backend-emitted default, deliberately diverging from
    ///   `linux_cgroup` to avoid silently capping the per-host setting).
    #[serde(default)]
    pub cpu_quota_pct: Option<u32>,
    /// Per-worker max task count. `None` falls back to the backend's
    /// defense-in-depth default.
    ///
    /// * **Linux cgroup:** enforced via `pids.max` (per-cgroup process
    ///   count, kernel-enforced); default 64.
    /// * **macOS Seatbelt:** no effect (no equivalent primitive).
    /// * **macOS Apple `container` backend:** enforced via `container
    ///   run --ulimit nproc=<N>:<N>`, which becomes per-real-UID
    ///   `RLIMIT_NPROC` inside the Linux VM. **Semantic gap worth
    ///   knowing:** the Linux cgroup form is per-cgroup; the container
    ///   form is per-UID across the VM. Inside a one-worker container
    ///   running as a single UID the practical effect is similar, but
    ///   the guarantees are not identical.
    #[serde(default)]
    pub tasks_max: Option<u64>,
    /// Environment variables to set inside the jail, as `(name, value)`
    /// pairs. Empty by default — the host environment is **always**
    /// cleared before this is applied, so the jail sees only what's
    /// listed here.
    #[serde(default)]
    pub env: Vec<(String, String)>,
}
112
113impl Default for SandboxPolicy {
114 /// Conservative defaults: no FS access, no network, strict profile,
115 /// 1-second CPU budget, 64 MiB memory, no cgroup overrides. Production
116 /// callers (e.g. `shell_exec_entry`) override the limits explicitly;
117 /// the `Default` impl exists so tests and future field additions can
118 /// use `..Default::default()` without churning every fixture.
119 fn default() -> Self {
120 Self {
121 fs_read: Vec::new(),
122 fs_write: Vec::new(),
123 net: Net::default(),
124 cpu_ms: 1_000,
125 mem_mb: 64,
126 profile: Profile::default(),
127 cpu_quota_pct: None,
128 tasks_max: None,
129 env: Vec::new(),
130 }
131 }
132}
133
/// Error returned by [`SandboxBackend::spawn_under_policy`].
#[derive(Debug, Error)]
pub enum SandboxError {
    /// Backend-specific failure. The payload is a free-form message and is
    /// rendered as `backend error: <message>`.
    #[error("backend error: {0}")]
    Backend(String),
}
139
/// Operator-facing identifier for selecting a specific sandbox backend
/// per-worker. Cfg-gated per-OS so cross-OS mis-config (e.g. declaring
/// `Container` on Linux) is a compile-time error rather than a runtime
/// surprise.
///
/// `None` on a `ToolEntry.sandbox_backend` means "use the per-OS
/// default" — today darwin → `Seatbelt`, linux → `Bwrap`. Only opt in
/// here when a worker has a concrete reason to diverge (e.g. needs
/// memory enforcement on macOS, which `Seatbelt` can't provide).
///
/// `Serialize + Deserialize` derives are for future operator-config
/// plumbing (e.g. surfacing `sandbox_backend` in a manifest file or
/// CLI subcommand). No current call-site serialises this; the derives
/// are forward-looking so a later config slice doesn't need to revisit
/// every `ToolEntry` constructor.
///
/// `Container` is deliberately bound to the macOS Apple `container`
/// CLI under `#[cfg(target_os = "macos")]`. A future Linux micro-VM
/// backend (Firecracker, Kata, gVisor, etc.) would add a
/// linux-cfg-gated variant with its own name (e.g. `FirecrackerVm`)
/// rather than overloading `Container` — the cfg-gating prevents
/// ambiguity today.
///
/// On targets other than Linux and macOS every variant is compiled out,
/// so the enum has no values there.
///
/// See `docs/superpowers/specs/2026-05-21-macos-container-slice-2-design.md`
/// for the rationale behind OS-specific variant names vs an abstract
/// `MicroVm` category.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum SandboxBackendKind {
    /// Linux only: the bubblewrap backend (`linux_bwrap`), which is also
    /// the Linux default.
    #[cfg(target_os = "linux")]
    Bwrap,
    /// macOS only: the Seatbelt backend (`macos_seatbelt`), which is also
    /// the macOS default.
    #[cfg(target_os = "macos")]
    Seatbelt,
    /// macOS only: the Apple `container` CLI backend (`macos_container`),
    /// opt-in per worker.
    #[cfg(target_os = "macos")]
    Container,
}
175
/// Common backend interface.
///
/// The implementations constructed in this file are
/// `linux_bwrap::LinuxBwrap` (Linux), `macos_seatbelt::MacosSeatbelt` and
/// `macos_container::MacosContainer` (macOS), plus a private stub for every
/// other OS that always returns an error.
///
/// `Send + Sync` are required because backends are shared via `Arc<dyn SandboxBackend>`
/// across async tasks in the scheduler (one `Arc` per lane runner). `LinuxBwrap`
/// and `MacosSeatbelt` hold no mutable state and satisfy these bounds
/// automatically; any other implementation must satisfy them as well to be
/// usable as a trait object here.
pub trait SandboxBackend: Send + Sync {
    /// Run `program` with `args` under `policy` and return the handle of the
    /// spawned child process. How the invocation is assembled (argv wrapping
    /// or an equivalent mechanism) is an implementation detail of the
    /// backend; not stable yet.
    ///
    /// # Errors
    ///
    /// Returns [`SandboxError`] when the backend cannot start the process.
    fn spawn_under_policy(
        &self,
        policy: &SandboxPolicy,
        program: &str,
        args: &[&str],
    ) -> Result<std::process::Child, SandboxError>;
}
193
194/// Pick the default backend for the current OS.
195///
196/// Kept for direct-spawn callers (e.g. `tests-common::sandbox::backend()`)
197/// that don't need per-entry selection. Daemon-backed call sites
198/// construct [`SandboxBackends::default_for_current_os`] instead — that
199/// bundle supports the per-worker `sandbox_backend` opt-in introduced
200/// by Slice 2.
201pub fn default_backend() -> Box<dyn SandboxBackend> {
202 #[cfg(target_os = "linux")]
203 {
204 Box::new(linux_bwrap::LinuxBwrap::new())
205 }
206 #[cfg(target_os = "macos")]
207 {
208 Box::new(macos_seatbelt::MacosSeatbelt::new())
209 }
210 #[cfg(not(any(target_os = "linux", target_os = "macos")))]
211 {
212 Box::new(NotYetImplemented)
213 }
214}
215
216/// Per-OS bundle of constructed sandbox backends, used by the lifecycle
217/// managers to resolve a per-worker [`SandboxBackendKind`] to a concrete
218/// `Arc<dyn SandboxBackend>`.
219///
220/// Fields are cfg-gated to match `SandboxBackendKind` — every variant
221/// of the enum that exists at compile time has a backing field, so
222/// [`SandboxBackends::resolve`] is total (no runtime panic path for
223/// "unknown variant").
224///
225/// Constructed once at daemon startup via
226/// [`SandboxBackends::default_for_current_os`] (cheap — backends hold
227/// no mutable state) and threaded through the lifecycle managers as
228/// `Arc<SandboxBackends>`. Tests build a custom instance directly via
229/// struct-literal syntax with their own counter / stub backends.
230///
231/// `Clone` is provided so consumers that thread the bundle through
232/// async boundaries can copy the per-field `Arc`s cheaply.
233#[derive(Clone)]
234pub struct SandboxBackends {
235 #[cfg(target_os = "linux")]
236 pub bwrap: Arc<dyn SandboxBackend>,
237 #[cfg(target_os = "macos")]
238 pub seatbelt: Arc<dyn SandboxBackend>,
239 #[cfg(target_os = "macos")]
240 pub container: Arc<dyn SandboxBackend>,
241}
242
243impl SandboxBackends {
244 /// Construct the per-OS default bundle. On Linux this is a single
245 /// `LinuxBwrap`; on darwin it is `MacosSeatbelt` (the per-OS
246 /// default) plus a `MacosContainer` for opt-in workers. Cheap —
247 /// each backend is a unit-like struct with no I/O at construction.
248 pub fn default_for_current_os() -> Self {
249 #[cfg(target_os = "linux")]
250 {
251 Self {
252 bwrap: Arc::new(linux_bwrap::LinuxBwrap::new()),
253 }
254 }
255 #[cfg(target_os = "macos")]
256 {
257 Self {
258 seatbelt: Arc::new(macos_seatbelt::MacosSeatbelt::new()),
259 container: Arc::new(macos_container::MacosContainer::new()),
260 }
261 }
262 }
263
264 /// Resolve a per-worker [`SandboxBackendKind`] (+ optional container
265 /// image tag) to a concrete backend.
266 ///
267 /// Visible arms vary by OS via cfg-gating on the enum variants:
268 ///
269 /// * `(None, _)` — per-OS default. Linux → `bwrap`; darwin → `seatbelt`.
270 /// * `(Some(Bwrap), _)` — Linux only. Cached `bwrap` slot;
271 /// `image` is ignored (bwrap doesn't use container images).
272 /// * `(Some(Seatbelt), _)` — darwin only. Cached `seatbelt` slot;
273 /// `image` is ignored (Seatbelt isn't a container backend).
274 /// * `(Some(Container), None)` — darwin only. Cached default-image
275 /// container backend (the Slice 1 / smoke-test posture; `alpine:3.20`).
276 /// * `(Some(Container), Some(tag))` — darwin only. Per-call
277 /// `Arc::new(MacosContainer::with_image(tag))`. Cheap (String +
278 /// Arc); `MacosContainer::probe()` was called once at construction
279 /// against the default image, and `probe` is image-independent
280 /// (it checks `container --version` + `container system status`),
281 /// so no re-probe needed here.
282 ///
283 /// The returned `Arc` is held for the lifetime of one acquire call
284 /// (single-use lifecycle) or one warm-slot fill (idle-timeout
285 /// lifecycle).
286 pub fn resolve(
287 &self,
288 kind: Option<SandboxBackendKind>,
289 image: Option<&str>,
290 ) -> Arc<dyn SandboxBackend> {
291 match (kind, image) {
292 (None, _) => {
293 #[cfg(target_os = "linux")]
294 {
295 Arc::clone(&self.bwrap)
296 }
297 #[cfg(target_os = "macos")]
298 {
299 Arc::clone(&self.seatbelt)
300 }
301 }
302 #[cfg(target_os = "linux")]
303 (Some(SandboxBackendKind::Bwrap), _) => Arc::clone(&self.bwrap),
304 #[cfg(target_os = "macos")]
305 (Some(SandboxBackendKind::Seatbelt), _) => Arc::clone(&self.seatbelt),
306 #[cfg(target_os = "macos")]
307 (Some(SandboxBackendKind::Container), None) => Arc::clone(&self.container),
308 #[cfg(target_os = "macos")]
309 (Some(SandboxBackendKind::Container), Some(tag)) => {
310 Arc::new(macos_container::MacosContainer::with_image(tag))
311 }
312 }
313 }
314}
315
/// Stand-in backend for operating systems that have no sandbox
/// implementation. It never spawns anything.
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
struct NotYetImplemented;

#[cfg(not(any(target_os = "linux", target_os = "macos")))]
impl SandboxBackend for NotYetImplemented {
    /// Ignores every argument and always returns
    /// [`SandboxError::Backend`] explaining that the OS is unsupported.
    fn spawn_under_policy(
        &self,
        _policy: &SandboxPolicy,
        _program: &str,
        _args: &[&str],
    ) -> Result<std::process::Child, SandboxError> {
        let reason =
            String::from("no sandbox backend for this OS — only Linux and macOS are supported");
        Err(SandboxError::Backend(reason))
    }
}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the conservative baseline of `SandboxPolicy::default()`: empty
    /// path lists and environment, denied network, `WorkerStrict` profile,
    /// 1-second CPU budget, 64 MiB of memory. Fixtures lean on
    /// `..Default::default()`, so any change to these values should be a
    /// deliberate, reviewed edit that also updates this test.
    #[test]
    fn sandbox_policy_default_is_strict_deny_with_one_second_budget() {
        let policy = SandboxPolicy::default();

        // Nothing on the filesystem and nothing in the environment.
        assert!(policy.fs_read.is_empty());
        assert!(policy.fs_write.is_empty());
        assert!(policy.env.is_empty());

        // Strictest network and profile settings.
        assert!(matches!(policy.net, Net::Deny));
        assert_eq!(policy.profile, Profile::WorkerStrict);

        // Resource budgets.
        assert_eq!(policy.cpu_ms, 1_000);
        assert_eq!(policy.mem_mb, 64);
    }

    /// The CPU-quota and task-count overrides start out unset, which
    /// leaves the choice of ceiling to the backend. Policies that need
    /// tighter caps set them explicitly.
    #[test]
    fn sandbox_policy_default_leaves_cpu_quota_and_tasks_max_unset() {
        let policy = SandboxPolicy::default();
        assert!(policy.cpu_quota_pct.is_none());
        assert!(policy.tasks_max.is_none());
    }

    #[test]
    fn net_default_is_deny() {
        let net = Net::default();
        assert!(matches!(net, Net::Deny));
    }

    #[test]
    fn profile_default_is_worker_strict() {
        let profile = Profile::default();
        assert_eq!(profile, Profile::WorkerStrict);
    }

    /// `SandboxBackendKind` must stay `Copy + Eq` so it can be passed by
    /// value through per-call dispatch. The variant set differs per OS by
    /// design, hence the cfg-gated bodies.
    #[test]
    fn sandbox_backend_kind_is_copy_and_eq() {
        #[cfg(target_os = "linux")]
        {
            let original = SandboxBackendKind::Bwrap;
            // Using `original` after this line only compiles if the type is `Copy`.
            let copied = original;
            assert_eq!(original, copied);
        }
        #[cfg(target_os = "macos")]
        {
            let original = SandboxBackendKind::Seatbelt;
            // Using `original` after this line only compiles if the type is `Copy`.
            let copied = original;
            assert_eq!(original, copied);
            let other = SandboxBackendKind::Container;
            assert_ne!(original, other);
        }
    }

    /// `resolve(None, _)` hands back the per-OS default. Checked by pointer
    /// identity against the bundle's own slot, so moving the default to a
    /// different slot makes this test fail on purpose.
    #[test]
    fn sandbox_backends_resolve_none_returns_per_os_default() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(None, None);
        #[cfg(target_os = "linux")]
        assert!(Arc::ptr_eq(&resolved, &bundle.bwrap));
        #[cfg(target_os = "macos")]
        assert!(Arc::ptr_eq(&resolved, &bundle.seatbelt));
    }

    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_some_seatbelt_on_darwin() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(Some(SandboxBackendKind::Seatbelt), None);
        assert!(Arc::ptr_eq(&resolved, &bundle.seatbelt));
    }

    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_some_container_on_darwin() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(Some(SandboxBackendKind::Container), None);
        assert!(Arc::ptr_eq(&resolved, &bundle.container));
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn sandbox_backends_resolve_some_bwrap_on_linux() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(Some(SandboxBackendKind::Bwrap), None);
        assert!(Arc::ptr_eq(&resolved, &bundle.bwrap));
    }

    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_with_custom_image_returns_fresh_container() {
        // A worker opted into container mode with its own image tag
        // (Slice 2.5: gliner-relex flips to kastellan/gliner-relex:dev)
        // must not be handed the cached default-image backend.
        //
        // `dyn SandboxBackend` exposes no way to read the image tag back,
        // so the check is indirect: the custom-image path has to return an
        // `Arc` that is not pointer-equal to the cached default slot.
        let bundle = SandboxBackends::default_for_current_os();
        let custom = bundle.resolve(
            Some(SandboxBackendKind::Container),
            Some("kastellan/gliner-relex:dev"),
        );
        let cached = bundle.resolve(Some(SandboxBackendKind::Container), None);
        let same_allocation = Arc::ptr_eq(&custom, &cached);
        assert!(
            !same_allocation,
            "resolve with custom image must return a fresh backend, not the cached default-image slot"
        );
    }

    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_with_none_image_returns_cached_default() {
        // With no image tag (the smoke-test / Slice 1 posture) every call
        // must return the same cached default-image slot; switching this
        // path to per-call construction would be a behaviour change.
        let bundle = SandboxBackends::default_for_current_os();
        let kind = Some(SandboxBackendKind::Container);
        let first = bundle.resolve(kind, None);
        let second = bundle.resolve(kind, None);
        assert!(
            Arc::ptr_eq(&first, &second),
            "resolve with image=None must return the cached default-image slot (Arc-pointer identity)"
        );
    }
}
478}