Skip to main content

kastellan_sandbox/
lib.rs

//! kastellan-sandbox: declarative, cross-platform sandbox for tool workers.
//!
//! One [`SandboxPolicy`] drives all backends. The default backend is selected
//! automatically per OS, with an optional micro-VM backend that a worker can
//! opt into for stronger isolation.
//!
//! Backends (Phase 0/0b):
//!   - `linux_bwrap`     — bubblewrap + Landlock + seccomp-bpf
//!   - `linux_cgroup`    — cgroup resource limits used on Linux (memory, CPU quota, task count)
//!   - `macos_seatbelt`  — sandbox-exec (Seatbelt) + setrlimit
//!   - `macos_container` — micro-VM via the Apple `container` CLI (macOS Tahoe+)
//!   - micro-VM on Linux — Firecracker; no module for it is declared in this file yet
10
11#[cfg(target_os = "linux")]
12pub mod linux_bwrap;
13#[cfg(target_os = "linux")]
14pub mod linux_cgroup;
15#[cfg(target_os = "macos")]
16pub mod macos_container;
17#[cfg(target_os = "macos")]
18pub mod macos_seatbelt;
19
20use std::path::PathBuf;
21use std::sync::Arc;
22
23use serde::{Deserialize, Serialize};
24use thiserror::Error;
25
26/// Coarse profile presets that map to backend-specific defaults.
27#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
28pub enum Profile {
29    /// Strictest: no net by default, scratch FS only, minimal syscall set.
30    #[default]
31    WorkerStrict,
32    /// Slightly relaxed for workers that need outbound HTTPS via the egress proxy.
33    WorkerNetClient,
34}
35
36#[derive(Clone, Debug, Default, Serialize, Deserialize)]
37pub enum Net {
38    /// Deny all network access.
39    #[default]
40    Deny,
41    /// Allowlist of "host:port" entries. Egress still flows through the egress proxy.
42    Allowlist(Vec<String>),
43    /// The egress proxy itself: real outbound + DNS, self-enforcing. Maps to
44    /// the same "share the host network namespace" behaviour as `Allowlist`
45    /// *today*, but names the proxy-vs-worker distinction explicitly. Slice #2
46    /// diverges them: `Allowlist` workers get a private netns whose only route
47    /// out is the proxy UDS, while `ProxyEgress` keeps the real netns.
48    ProxyEgress,
49}
50
51#[derive(Clone, Debug, Serialize, Deserialize)]
52pub struct SandboxPolicy {
53    /// Read-only mounts/paths.
54    pub fs_read: Vec<PathBuf>,
55    /// Writable paths (typically a per-worker scratch dir).
56    pub fs_write: Vec<PathBuf>,
57    /// Network policy.
58    pub net: Net,
59    /// Hard CPU-time limit (milliseconds). Enforced via
60    /// `setrlimit(RLIMIT_CPU)` from the worker prelude (POSIX, so applies
61    /// on Linux and macOS). `0` means "unset, no rlimit applied".
62    pub cpu_ms: u64,
63    /// Hard memory limit (megabytes).
64    ///
65    /// * **Linux:** enforced via cgroup `MemoryMax` by [`crate::linux_cgroup`].
66    /// * **macOS Seatbelt:** **not enforced** (Seatbelt has no memory
67    ///   primitive; `RLIMIT_AS` has high false-positive risk for
68    ///   malloc-heavy workers and is intentionally deferred).
69    /// * **macOS Apple `container` backend** ([`crate::macos_container`]):
70    ///   enforced via `container run -m <N>M` with SIGKILL on overrun.
71    ///   Note the **200 MiB floor** — `container` rejects smaller values;
72    ///   the backend clamps and emits a `tracing::warn!` so operators see
73    ///   the silent widening. Opt-in per-worker (Slice 2 wiring), not the
74    ///   default macOS backend.
75    pub mem_mb: u64,
76    /// Profile preset.
77    pub profile: Profile,
78    /// Per-worker CPU bandwidth ceiling (percent of one CPU). `None`
79    /// falls back to the backend's defense-in-depth default.
80    ///
81    /// * **Linux cgroup:** enforced; default 200%, hardcoded in
82    ///   [`crate::linux_cgroup`].
83    /// * **macOS Seatbelt:** no effect (no equivalent primitive).
84    /// * **macOS Apple `container` backend:** enforced via
85    ///   `container run -c <fractional vCPUs>`; `None` lets `container`
86    ///   pick up its host `--default-cpus` configuration (no
87    ///   backend-emitted default, deliberately diverging from
88    ///   `linux_cgroup` to avoid silently capping the per-host setting).
89    #[serde(default)]
90    pub cpu_quota_pct: Option<u32>,
91    /// Per-worker max task count. `None` falls back to the backend's
92    /// defense-in-depth default.
93    ///
94    /// * **Linux cgroup:** enforced via `pids.max` (per-cgroup process
95    ///   count, kernel-enforced); default 64.
96    /// * **macOS Seatbelt:** no effect (no equivalent primitive).
97    /// * **macOS Apple `container` backend:** enforced via `container
98    ///   run --ulimit nproc=<N>:<N>`, which becomes per-real-UID
99    ///   `RLIMIT_NPROC` inside the Linux VM. **Semantic gap worth
100    ///   knowing:** the Linux cgroup form is per-cgroup; the container
101    ///   form is per-UID across the VM. Inside a one-worker container
102    ///   running as a single UID the practical effect is similar, but
103    ///   the guarantees are not identical.
104    #[serde(default)]
105    pub tasks_max: Option<u64>,
106    /// Environment variables to set inside the jail. Empty by default
107    /// — the host environment is **always** cleared before this is
108    /// applied, so the jail sees only what's listed here.
109    #[serde(default)]
110    pub env: Vec<(String, String)>,
111}
112
113impl Default for SandboxPolicy {
114    /// Conservative defaults: no FS access, no network, strict profile,
115    /// 1-second CPU budget, 64 MiB memory, no cgroup overrides. Production
116    /// callers (e.g. `shell_exec_entry`) override the limits explicitly;
117    /// the `Default` impl exists so tests and future field additions can
118    /// use `..Default::default()` without churning every fixture.
119    fn default() -> Self {
120        Self {
121            fs_read: Vec::new(),
122            fs_write: Vec::new(),
123            net: Net::default(),
124            cpu_ms: 1_000,
125            mem_mb: 64,
126            profile: Profile::default(),
127            cpu_quota_pct: None,
128            tasks_max: None,
129            env: Vec::new(),
130        }
131    }
132}
133
134#[derive(Debug, Error)]
135pub enum SandboxError {
136    #[error("backend error: {0}")]
137    Backend(String),
138}
139
140/// Operator-facing identifier for selecting a specific sandbox backend
141/// per-worker. Cfg-gated per-OS so cross-OS mis-config (e.g. declaring
142/// `Container` on Linux) is a compile-time error rather than a runtime
143/// surprise.
144///
145/// `None` on a `ToolEntry.sandbox_backend` means "use the per-OS
146/// default" — today darwin → `Seatbelt`, linux → `Bwrap`. Only opt in
147/// here when a worker has a concrete reason to diverge (e.g. needs
148/// memory enforcement on macOS, which `Seatbelt` can't provide).
149///
150/// `Serialize + Deserialize` derives are for future operator-config
151/// plumbing (e.g. surfacing `sandbox_backend` in a manifest file or
152/// CLI subcommand). No current call-site serialises this; the derives
153/// are forward-looking so a later config slice doesn't need to revisit
154/// every `ToolEntry` constructor.
155///
156/// `Container` is deliberately bound to the macOS Apple `container`
157/// CLI under `#[cfg(target_os = "macos")]`. A future Linux micro-VM
158/// backend (Firecracker, Kata, gVisor, etc.) would add a
159/// linux-cfg-gated variant with its own name (e.g. `FirecrackerVm`)
160/// rather than overloading `Container` — the cfg-gating prevents
161/// ambiguity today.
162///
163/// See `docs/superpowers/specs/2026-05-21-macos-container-slice-2-design.md`
164/// for the rationale behind OS-specific variant names vs an abstract
165/// `MicroVm` category.
166#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
167pub enum SandboxBackendKind {
168    #[cfg(target_os = "linux")]
169    Bwrap,
170    #[cfg(target_os = "macos")]
171    Seatbelt,
172    #[cfg(target_os = "macos")]
173    Container,
174}
175
176/// Common backend interface. To be implemented by [`linux_bwrap`], [`macos_seatbelt`],
177/// and [`microvm`] in subsequent phases.
178///
179/// `Send + Sync` are required because backends are shared via `Arc<dyn SandboxBackend>`
180/// across async tasks in the scheduler (one `Arc` per lane runner). Both concrete
181/// implementations (`LinuxBwrap`, `MacosSeatbelt`) hold no mutable state and
182/// satisfy these bounds automatically.
183pub trait SandboxBackend: Send + Sync {
184    /// Build the argv (or equivalent invocation) that runs `program` with `args`
185    /// under `policy`. Implementation detail of the backend; not stable yet.
186    fn spawn_under_policy(
187        &self,
188        policy: &SandboxPolicy,
189        program: &str,
190        args: &[&str],
191    ) -> Result<std::process::Child, SandboxError>;
192}
193
194/// Pick the default backend for the current OS.
195///
196/// Kept for direct-spawn callers (e.g. `tests-common::sandbox::backend()`)
197/// that don't need per-entry selection. Daemon-backed call sites
198/// construct [`SandboxBackends::default_for_current_os`] instead — that
199/// bundle supports the per-worker `sandbox_backend` opt-in introduced
200/// by Slice 2.
201pub fn default_backend() -> Box<dyn SandboxBackend> {
202    #[cfg(target_os = "linux")]
203    {
204        Box::new(linux_bwrap::LinuxBwrap::new())
205    }
206    #[cfg(target_os = "macos")]
207    {
208        Box::new(macos_seatbelt::MacosSeatbelt::new())
209    }
210    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
211    {
212        Box::new(NotYetImplemented)
213    }
214}
215
216/// Per-OS bundle of constructed sandbox backends, used by the lifecycle
217/// managers to resolve a per-worker [`SandboxBackendKind`] to a concrete
218/// `Arc<dyn SandboxBackend>`.
219///
220/// Fields are cfg-gated to match `SandboxBackendKind` — every variant
221/// of the enum that exists at compile time has a backing field, so
222/// [`SandboxBackends::resolve`] is total (no runtime panic path for
223/// "unknown variant").
224///
225/// Constructed once at daemon startup via
226/// [`SandboxBackends::default_for_current_os`] (cheap — backends hold
227/// no mutable state) and threaded through the lifecycle managers as
228/// `Arc<SandboxBackends>`. Tests build a custom instance directly via
229/// struct-literal syntax with their own counter / stub backends.
230///
231/// `Clone` is provided so consumers that thread the bundle through
232/// async boundaries can copy the per-field `Arc`s cheaply.
233#[derive(Clone)]
234pub struct SandboxBackends {
235    #[cfg(target_os = "linux")]
236    pub bwrap: Arc<dyn SandboxBackend>,
237    #[cfg(target_os = "macos")]
238    pub seatbelt: Arc<dyn SandboxBackend>,
239    #[cfg(target_os = "macos")]
240    pub container: Arc<dyn SandboxBackend>,
241}
242
243impl SandboxBackends {
244    /// Construct the per-OS default bundle. On Linux this is a single
245    /// `LinuxBwrap`; on darwin it is `MacosSeatbelt` (the per-OS
246    /// default) plus a `MacosContainer` for opt-in workers. Cheap —
247    /// each backend is a unit-like struct with no I/O at construction.
248    pub fn default_for_current_os() -> Self {
249        #[cfg(target_os = "linux")]
250        {
251            Self {
252                bwrap: Arc::new(linux_bwrap::LinuxBwrap::new()),
253            }
254        }
255        #[cfg(target_os = "macos")]
256        {
257            Self {
258                seatbelt: Arc::new(macos_seatbelt::MacosSeatbelt::new()),
259                container: Arc::new(macos_container::MacosContainer::new()),
260            }
261        }
262    }
263
264    /// Resolve a per-worker [`SandboxBackendKind`] (+ optional container
265    /// image tag) to a concrete backend.
266    ///
267    /// Visible arms vary by OS via cfg-gating on the enum variants:
268    ///
269    /// * `(None, _)` — per-OS default. Linux → `bwrap`; darwin → `seatbelt`.
270    /// * `(Some(Bwrap), _)` — Linux only. Cached `bwrap` slot;
271    ///   `image` is ignored (bwrap doesn't use container images).
272    /// * `(Some(Seatbelt), _)` — darwin only. Cached `seatbelt` slot;
273    ///   `image` is ignored (Seatbelt isn't a container backend).
274    /// * `(Some(Container), None)` — darwin only. Cached default-image
275    ///   container backend (the Slice 1 / smoke-test posture; `alpine:3.20`).
276    /// * `(Some(Container), Some(tag))` — darwin only. Per-call
277    ///   `Arc::new(MacosContainer::with_image(tag))`. Cheap (String +
278    ///   Arc); `MacosContainer::probe()` was called once at construction
279    ///   against the default image, and `probe` is image-independent
280    ///   (it checks `container --version` + `container system status`),
281    ///   so no re-probe needed here.
282    ///
283    /// The returned `Arc` is held for the lifetime of one acquire call
284    /// (single-use lifecycle) or one warm-slot fill (idle-timeout
285    /// lifecycle).
286    pub fn resolve(
287        &self,
288        kind: Option<SandboxBackendKind>,
289        image: Option<&str>,
290    ) -> Arc<dyn SandboxBackend> {
291        match (kind, image) {
292            (None, _) => {
293                #[cfg(target_os = "linux")]
294                {
295                    Arc::clone(&self.bwrap)
296                }
297                #[cfg(target_os = "macos")]
298                {
299                    Arc::clone(&self.seatbelt)
300                }
301            }
302            #[cfg(target_os = "linux")]
303            (Some(SandboxBackendKind::Bwrap), _) => Arc::clone(&self.bwrap),
304            #[cfg(target_os = "macos")]
305            (Some(SandboxBackendKind::Seatbelt), _) => Arc::clone(&self.seatbelt),
306            #[cfg(target_os = "macos")]
307            (Some(SandboxBackendKind::Container), None) => Arc::clone(&self.container),
308            #[cfg(target_os = "macos")]
309            (Some(SandboxBackendKind::Container), Some(tag)) => {
310                Arc::new(macos_container::MacosContainer::with_image(tag))
311            }
312        }
313    }
314}
315
/// Placeholder backend compiled only for operating systems that have no
/// real sandbox implementation. Every spawn attempt is rejected.
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
struct NotYetImplemented;

#[cfg(not(any(target_os = "linux", target_os = "macos")))]
impl SandboxBackend for NotYetImplemented {
    /// Always fails with [`SandboxError::Backend`]; the policy, program and
    /// arguments are ignored and nothing is spawned.
    fn spawn_under_policy(
        &self,
        _policy: &SandboxPolicy,
        _program: &str,
        _args: &[&str],
    ) -> Result<std::process::Child, SandboxError> {
        let reason = "no sandbox backend for this OS — only Linux and macOS are supported";
        Err(SandboxError::Backend(String::from(reason)))
    }
}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the values `Default` produces: empty FS lists, network denied,
    /// `WorkerStrict`, a 1-second CPU budget and 64 MiB of memory. The
    /// point of having `Default` is that a new [`SandboxPolicy`] field
    /// should not force edits to every fixture, while production callers
    /// still set limits explicitly. Pinning the values makes any future
    /// change to them a deliberate, reviewable edit.
    #[test]
    fn sandbox_policy_default_is_strict_deny_with_one_second_budget() {
        let SandboxPolicy {
            fs_read,
            fs_write,
            net,
            cpu_ms,
            mem_mb,
            profile,
            env,
            ..
        } = SandboxPolicy::default();

        assert!(fs_read.is_empty());
        assert!(fs_write.is_empty());
        assert!(matches!(net, Net::Deny));
        assert_eq!(cpu_ms, 1_000);
        assert_eq!(mem_mb, 64);
        assert_eq!(profile, Profile::WorkerStrict);
        assert!(env.is_empty());
    }

    /// The two cgroup tunables start out as `None`, meaning the hardcoded
    /// defense-in-depth ceilings in `linux_cgroup` apply. Production
    /// policies set them explicitly when tighter caps are needed.
    #[test]
    fn sandbox_policy_default_leaves_cpu_quota_and_tasks_max_unset() {
        let policy = SandboxPolicy::default();
        assert!(policy.cpu_quota_pct.is_none());
        assert!(policy.tasks_max.is_none());
    }

    /// The default network policy is `Deny`.
    #[test]
    fn net_default_is_deny() {
        let net = Net::default();
        assert!(matches!(net, Net::Deny));
    }

    /// The default profile is `WorkerStrict`.
    #[test]
    fn profile_default_is_worker_strict() {
        let profile = Profile::default();
        assert_eq!(profile, Profile::WorkerStrict);
    }

    /// `SandboxBackendKind` must be `Copy + Eq` so per-call dispatch can
    /// pass it around without lifetime gymnastics. The variant set differs
    /// per OS on purpose (cfg-gating), turning cross-OS mis-config into a
    /// compile-time error rather than a runtime surprise.
    #[test]
    fn sandbox_backend_kind_is_copy_and_eq() {
        #[cfg(target_os = "linux")]
        {
            let original = SandboxBackendKind::Bwrap;
            let copied = original;
            // `original` is still usable after the copy above.
            assert_eq!(original, copied);
        }
        #[cfg(target_os = "macos")]
        {
            let original = SandboxBackendKind::Seatbelt;
            let copied = original;
            // `original` is still usable after the copy above.
            assert_eq!(original, copied);
            let other = SandboxBackendKind::Container;
            assert_ne!(original, other);
        }
    }

    /// `resolve(None, _)` must yield the per-OS default backend. Pointer
    /// identity is checked against the bundle's own default slot, so a
    /// refactor that moves the default to another slot fails this test on
    /// purpose.
    #[test]
    fn sandbox_backends_resolve_none_returns_per_os_default() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(None, None);
        #[cfg(target_os = "linux")]
        assert!(Arc::ptr_eq(&resolved, &bundle.bwrap));
        #[cfg(target_os = "macos")]
        assert!(Arc::ptr_eq(&resolved, &bundle.seatbelt));
    }

    /// Asking for `Seatbelt` explicitly returns the cached seatbelt slot.
    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_some_seatbelt_on_darwin() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(Some(SandboxBackendKind::Seatbelt), None);
        assert!(Arc::ptr_eq(&resolved, &bundle.seatbelt));
    }

    /// Asking for `Container` without an image returns the cached
    /// container slot.
    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_some_container_on_darwin() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(Some(SandboxBackendKind::Container), None);
        assert!(Arc::ptr_eq(&resolved, &bundle.container));
    }

    /// Asking for `Bwrap` explicitly returns the cached bwrap slot.
    #[cfg(target_os = "linux")]
    #[test]
    fn sandbox_backends_resolve_some_bwrap_on_linux() {
        let bundle = SandboxBackends::default_for_current_os();
        let resolved = bundle.resolve(Some(SandboxBackendKind::Bwrap), None);
        assert!(Arc::ptr_eq(&resolved, &bundle.bwrap));
    }

    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_with_custom_image_returns_fresh_container() {
        // A worker opted into container mode with a custom image tag
        // (Slice 2.5: gliner-relex flips to kastellan/gliner-relex:dev) must
        // not be handed the cached default-image backend
        // (DEFAULT_IMAGE = alpine:3.20).
        //
        // `dyn SandboxBackend` does not expose the image tag, and a downcast
        // via Any would be overkill. So this test checks a proxy property:
        // the per-call `MacosContainer::with_image(tag)` path produces a
        // fresh Arc that is NOT pointer-equal to the cached default slot.
        let bundle = SandboxBackends::default_for_current_os();
        let custom = bundle.resolve(
            Some(SandboxBackendKind::Container),
            Some("kastellan/gliner-relex:dev"),
        );
        let cached = bundle.resolve(Some(SandboxBackendKind::Container), None);
        let same_allocation = Arc::ptr_eq(&custom, &cached);
        assert!(
            !same_allocation,
            "resolve with custom image must return a fresh backend, not the cached default-image slot"
        );
    }

    #[cfg(target_os = "macos")]
    #[test]
    fn sandbox_backends_resolve_with_none_image_returns_cached_default() {
        // `resolve(Some(Container), None)` is the smoke-test / Slice 1
        // posture and must return the cached default-image slot, verified
        // by Arc-pointer identity across two calls. Slice 1's tests pass no
        // custom image and depend on this; switching them to the per-call
        // construction path would be a behaviour change.
        let bundle = SandboxBackends::default_for_current_os();
        let first_call = bundle.resolve(Some(SandboxBackendKind::Container), None);
        let second_call = bundle.resolve(Some(SandboxBackendKind::Container), None);
        assert!(
            Arc::ptr_eq(&first_call, &second_call),
            "resolve with image=None must return the cached default-image slot (Arc-pointer identity)"
        );
    }
}