swink_agent_eval/evaluators/code/sandbox/
mod.rs

//! Sandboxed execution evaluator (T080–T083, behind `evaluator-sandbox`).
//!
//! Wraps a child process with POSIX `rlimit`s + (on Linux) a fresh network
//! namespace so extracted code can be executed under deterministic resource
//! bounds without spinning up a container. Windows builds ship a stub that
//! surfaces [`EvaluatorError::UnsupportedPlatform`] at evaluation time per
//! FR-017.
//!
//! The public surface is stable across platforms:
//!
//! * [`SandboxLimits`] — resource caps (wall-clock / CPU / address space /
//!   FDs / network). Default values are pinned by FR-017.
//! * [`SandboxOutcome`] — structured return type from [`run_sandboxed`]
//!   capturing success, stderr, and which limit (if any) was exceeded.
//! * [`run_sandboxed`] — lower-level primitive used by the integration tests
//!   (T083) to exercise each limit in isolation.
//! * [`SandboxedExecutionEvaluator`] + [`SandboxRunner`] — [`crate::Evaluator`]
//!   binding that extracts a code block, dispatches to a [`SandboxRunner`]
//!   (default: shell) to build the child `Command`, and folds the
//!   [`SandboxOutcome`] into an [`crate::EvalMetricResult`].
//!
//! ## Unsafe scope
//!
//! Per FR-049, unsafe is denied workspace-wide and narrowed further at the
//! `swink-agent-eval` crate root. The single authorised carve-out is the
//! `cfg(target_family = "unix")` [`posix`] submodule, which relaxes to
//! `#![allow(unsafe_code)]` — every `unsafe` block inside it carries a
//! `// SAFETY:` comment explaining the invariant being upheld. Nothing in this
//! parent module uses `unsafe`.
30
31use std::process::Command;
32use std::sync::Arc;
33use std::time::Duration;
34
35use crate::evaluator::Evaluator;
36use crate::evaluators::EvaluatorError;
37use crate::evaluators::code::extractor::CodeExtractor;
38use crate::score::Score;
39use crate::types::{EvalCase, EvalMetricResult, Invocation};
40
41#[cfg(target_family = "unix")]
42pub(crate) mod posix;
43
/// Resource ceilings applied to the sandboxed child process (T080 / FR-017).
///
/// The values produced by [`Default::default`] are pinned by FR-017:
///
/// | Limit             | Default    | Rationale                                |
/// |-------------------|------------|------------------------------------------|
/// | `wall_clock`      | 120 s      | Real-time deadline enforced by parent.   |
/// | `cpu`             | 60 s       | `RLIMIT_CPU` seconds.                    |
/// | `memory_bytes`    | 1 GiB      | `RLIMIT_AS` address space cap.           |
/// | `max_open_files`  | 256        | `RLIMIT_NOFILE` hard + soft.             |
/// | `allow_network`   | `false`    | Linux: `unshare(CLONE_NEWNET)`.          |
///
/// `unshare` does not exist on macOS, so there the network-off invariant
/// weakens to "child has no configured provider". This is recorded as a known
/// limitation in `specs/043-evals-adv-features/research.md` §R-006.
#[derive(Debug, Clone)]
pub struct SandboxLimits {
    /// Real-time deadline; once it passes the parent SIGKILLs the child.
    pub wall_clock: Duration,
    /// CPU-time budget applied through `RLIMIT_CPU`; the child gets SIGXCPU
    /// when it runs out.
    pub cpu: Duration,
    /// Upper bound on virtual address space, applied through `RLIMIT_AS`.
    pub memory_bytes: u64,
    /// Upper bound on open file descriptors, applied through `RLIMIT_NOFILE`.
    pub max_open_files: u64,
    /// `true` lets the child open external network connections.
    pub allow_network: bool,
}

impl Default for SandboxLimits {
    /// Builds the FR-017 default limits listed in the type-level table.
    fn default() -> Self {
        // 1 GiB expressed as a power of two.
        const ONE_GIB: u64 = 1 << 30;

        let wall_clock = Duration::from_secs(120);
        let cpu = Duration::from_secs(60);

        Self {
            wall_clock,
            cpu,
            memory_bytes: ONE_GIB,
            max_open_files: 256,
            allow_network: false,
        }
    }
}
84
/// What happened to a child process executed by [`run_sandboxed`].
#[derive(Debug, Clone)]
pub struct SandboxOutcome {
    /// `true` when the child exited with status 0 and stayed within every limit.
    pub success: bool,
    /// Exit code of the child, present only when it exited normally.
    pub exit_code: Option<i32>,
    /// Number of the signal that terminated the child, if it was signalled.
    pub signal: Option<i32>,
    /// Captured stderr (trimmed and truncated to the first few lines).
    pub stderr: String,
    /// Name of the limit that was exceeded, if any.
    pub limit_exceeded: Option<String>,
}

impl SandboxOutcome {
    /// Produces a short, reporter-friendly description of the outcome.
    ///
    /// The first matching rule wins:
    ///
    /// 1. A limit was exceeded: `"sandbox limit exceeded: <limit>"`.
    /// 2. The run succeeded: `"ok"`.
    /// 3. Otherwise, up to eight non-blank stderr lines joined by newlines.
    /// 4. With no usable stderr: the exit status, else the signal number,
    ///    else the literal `"non-zero exit"`.
    #[must_use]
    pub fn summary(&self) -> String {
        // Upper bound on how many stderr lines are echoed back.
        const MAX_STDERR_LINES: usize = 8;

        if let Some(limit) = &self.limit_exceeded {
            return format!("sandbox limit exceeded: {limit}");
        }
        if self.success {
            return "ok".to_string();
        }

        // Keep lines verbatim; whitespace-only lines are skipped entirely and
        // do not count towards the cap.
        let mut kept: Vec<&str> = Vec::new();
        for line in self.stderr.lines() {
            if line.trim().is_empty() {
                continue;
            }
            kept.push(line);
            if kept.len() == MAX_STDERR_LINES {
                break;
            }
        }
        if !kept.is_empty() {
            return kept.join("\n");
        }

        // Nothing on stderr: fall back to how the process ended, preferring
        // the exit code over the signal.
        if let Some(code) = self.exit_code {
            format!("exit status {code}")
        } else if let Some(sig) = self.signal {
            format!("signal {sig}")
        } else {
            "non-zero exit".to_string()
        }
    }
}
128
129/// Execute `command` under the configured [`SandboxLimits`] (T081).
130///
131/// On Unix this installs `rlimit`s via a `pre_exec` hook inside the
132/// [`posix`] submodule and enforces wall-clock by SIGKILL-ing the child after
133/// the deadline. On Windows this returns
134/// [`EvaluatorError::UnsupportedPlatform`] (T082) without spawning.
135///
136/// When a limit is exceeded the returned [`SandboxOutcome`] has
137/// `limit_exceeded = Some(<name>)` and callers may synthesise
138/// [`EvaluatorError::SandboxLimitExceeded`] from it; the lower-level
139/// [`SandboxOutcome`] shape is preserved so callers that want to inspect
140/// stderr can do so before mapping to the typed error.
141pub fn run_sandboxed(
142    command: Command,
143    limits: &SandboxLimits,
144) -> Result<SandboxOutcome, EvaluatorError> {
145    #[cfg(target_family = "unix")]
146    {
147        posix::run_sandboxed_unix(command, limits)
148    }
149    #[cfg(target_family = "windows")]
150    {
151        // Touch the parameters to silence the unused-warning on stub builds.
152        let _ = (command, limits);
153        Err(EvaluatorError::UnsupportedPlatform {
154            reason: "SandboxedExecutionEvaluator is Unix-only (Linux/macOS); \
155                     FR-017 defines Windows as unsupported for this evaluator"
156                .to_string(),
157        })
158    }
159}
160
/// Strategy for turning an extracted snippet into the child `Command` that a
/// [`SandboxedExecutionEvaluator`] will execute.
///
/// An implementation writes whatever auxiliary files it needs into `workdir`
/// and hands back a `Command` whose `current_dir` is inside `workdir`. The
/// evaluator guarantees that `workdir` outlives the child process and is
/// deleted afterwards.
pub trait SandboxRunner: Send + Sync {
    /// Build the `Command` that executes `code` inside `workdir`.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error when the command cannot be prepared.
    fn command(
        &self,
        code: &str,
        workdir: &std::path::Path,
    ) -> std::io::Result<Command>;
}
171
172/// Default [`SandboxRunner`]: runs the extracted snippet verbatim via
173/// `/bin/sh -c`. Intended for smoke tests and shell-style snippets.
174///
175/// Most real deployments will want to plug a custom runner in (e.g. scaffold
176/// a Rust crate and `cargo run`), but the shell runner keeps the evaluator
177/// useful out of the box and — crucially — self-contained for tests (no
178/// compilers, no `cc`).
179#[derive(Debug, Default, Clone)]
180pub struct ShellRunner;
181
182impl SandboxRunner for ShellRunner {
183    fn command(&self, code: &str, workdir: &std::path::Path) -> std::io::Result<Command> {
184        let script = workdir.join("snippet.sh");
185        std::fs::write(&script, code)?;
186        let mut cmd = Command::new("/bin/sh");
187        cmd.arg(script);
188        cmd.current_dir(workdir);
189        Ok(cmd)
190    }
191}
192
193/// Sandboxed execution evaluator (T081 — evaluator wiring).
194///
195/// Extracts a code block via the configured [`CodeExtractor`], writes it
196/// into a fresh tempdir, and invokes a [`SandboxRunner`] to produce the
197/// child `Command`. The child runs under [`SandboxLimits`]; the outcome is
198/// folded into an [`EvalMetricResult`].
199///
200/// When no code is extractable this evaluator returns `None` to match the
201/// FR-020 "criterion not set" semantics used by the sibling
202/// [`crate::CargoCheckEvaluator`].
203pub struct SandboxedExecutionEvaluator {
204    name: &'static str,
205    extractor: Arc<CodeExtractor>,
206    limits: SandboxLimits,
207    runner: Arc<dyn SandboxRunner>,
208}
209
210impl SandboxedExecutionEvaluator {
211    /// Construct an evaluator with the default shell runner + default limits.
212    #[must_use]
213    pub fn new(extractor: Arc<CodeExtractor>) -> Self {
214        Self {
215            name: "sandboxed_execution",
216            extractor,
217            limits: SandboxLimits::default(),
218            runner: Arc::new(ShellRunner),
219        }
220    }
221
222    /// Override the reported evaluator name.
223    #[must_use]
224    pub const fn with_name(mut self, name: &'static str) -> Self {
225        self.name = name;
226        self
227    }
228
229    /// Override the resource limits.
230    #[must_use]
231    pub fn with_limits(mut self, limits: SandboxLimits) -> Self {
232        self.limits = limits;
233        self
234    }
235
236    /// Override the runner used to build the child `Command`.
237    #[must_use]
238    pub fn with_runner(mut self, runner: Arc<dyn SandboxRunner>) -> Self {
239        self.runner = runner;
240        self
241    }
242}
243
244impl Evaluator for SandboxedExecutionEvaluator {
245    fn name(&self) -> &'static str {
246        self.name
247    }
248
249    fn evaluate(&self, _case: &EvalCase, invocation: &Invocation) -> Option<EvalMetricResult> {
250        let response = invocation.final_response.as_ref()?;
251        let code = crate::evaluators::block_on(self.extractor.extract(response))?;
252
253        let tempdir = match tempfile::TempDir::new() {
254            Ok(dir) => dir,
255            Err(err) => {
256                return Some(EvalMetricResult {
257                    evaluator_name: self.name.to_string(),
258                    score: Score::fail(),
259                    details: Some(format!("tempdir creation failed: {err}")),
260                });
261            }
262        };
263
264        let command = match self.runner.command(&code, tempdir.path()) {
265            Ok(cmd) => cmd,
266            Err(err) => {
267                return Some(EvalMetricResult {
268                    evaluator_name: self.name.to_string(),
269                    score: Score::fail(),
270                    details: Some(format!("runner failed: {err}")),
271                });
272            }
273        };
274
275        let (score, details) = match run_sandboxed(command, &self.limits) {
276            Ok(outcome) => {
277                let score = if outcome.success {
278                    Score::pass()
279                } else {
280                    Score::fail()
281                };
282                (score, outcome.summary())
283            }
284            Err(EvaluatorError::UnsupportedPlatform { reason }) => {
285                (Score::fail(), format!("unsupported platform: {reason}"))
286            }
287            Err(EvaluatorError::SandboxLimitExceeded { limit }) => {
288                (Score::fail(), format!("sandbox limit exceeded: {limit}"))
289            }
290            Err(err) => (Score::fail(), err.into_metric_details()),
291        };
292
293        Some(EvalMetricResult {
294            evaluator_name: self.name.to_string(),
295            score,
296            details: Some(details),
297        })
298    }
299}
swink_agent_eval/evaluators/code/sandbox/mod.rs

swink_agent_eval/evaluators/code/sandbox/
mod.rs