swink_agent_eval/evaluators/code/sandbox/mod.rs
1//! Sandboxed execution evaluator (T080–T083, behind `evaluator-sandbox`).
2//!
3//! Wraps a child process with POSIX `rlimit`s + (on Linux) a fresh network
4//! namespace so extracted code can be executed under deterministic resource
5//! bounds without spinning up a container. Windows builds ship a stub that
6//! surfaces [`EvaluatorError::UnsupportedPlatform`] at evaluation time per
7//! FR-017.
8//!
9//! The public surface is stable across platforms:
10//!
11//! * [`SandboxLimits`] — resource caps (wall-clock / CPU / RSS / FDs / network).
12//! Default values are pinned by FR-017.
13//! * [`SandboxOutcome`] — structured return type from [`run_sandboxed`]
14//! capturing success, stderr, and which limit (if any) was exceeded.
15//! * [`run_sandboxed`] — lower-level primitive used by the integration tests
16//! (T083) to exercise each limit in isolation.
17//! * [`SandboxedExecutionEvaluator`] + [`SandboxRunner`] — [`crate::Evaluator`]
18//! binding that extracts a code block, dispatches to a [`SandboxRunner`]
19//! (default: shell) to build the child `Command`, and folds the
20//! [`SandboxOutcome`] into an [`crate::EvalMetricResult`].
21//!
22//! ## Unsafe scope
23//!
24//! Per FR-049, unsafe is denied workspace-wide and narrowed further at the
25//! `swink-agent-eval` crate root. The single authorised carve-out is the
26//! `cfg(target_family = "unix")` [`posix`] submodule, which relaxes to
27//! `#![allow(unsafe_code)]` — every `unsafe` block inside it carries a
28//! `// SAFETY:` comment explaining the invariant being upheld. Nothing in this
29//! parent module uses `unsafe`.
30
31use std::process::Command;
32use std::sync::Arc;
33use std::time::Duration;
34
35use crate::evaluator::Evaluator;
36use crate::evaluators::EvaluatorError;
37use crate::evaluators::code::extractor::CodeExtractor;
38use crate::score::Score;
39use crate::types::{EvalCase, EvalMetricResult, Invocation};
40
41#[cfg(target_family = "unix")]
42pub(crate) mod posix;
43
/// Resource limits enforced on the child process (T080 / FR-017).
///
/// Defaults are pinned by FR-017:
///
/// | Limit            | Default | Rationale                              |
/// |------------------|---------|----------------------------------------|
/// | `wall_clock`     | 120 s   | Real-time deadline enforced by parent. |
/// | `cpu`            | 60 s    | `RLIMIT_CPU` seconds.                  |
/// | `memory_bytes`   | 1 GiB   | `RLIMIT_AS` address space cap.         |
/// | `max_open_files` | 256     | `RLIMIT_NOFILE` hard + soft.           |
/// | `allow_network`  | `false` | Linux: `unshare(CLONE_NEWNET)`.        |
///
/// On macOS `unshare` is unavailable and the network-off invariant degrades to
/// "child has no configured provider" — documented as a known limitation in
/// `specs/043-evals-adv-features/research.md` §R-006.
///
/// The type implements `PartialEq`/`Eq` so a whole configuration can be
/// compared or asserted on in one go instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SandboxLimits {
    /// Real-time deadline. The parent SIGKILLs the child on expiry.
    pub wall_clock: Duration,
    /// CPU seconds via `RLIMIT_CPU`. Child receives SIGXCPU on expiry.
    pub cpu: Duration,
    /// Virtual address space ceiling via `RLIMIT_AS`.
    pub memory_bytes: u64,
    /// File-descriptor ceiling via `RLIMIT_NOFILE`.
    pub max_open_files: u64,
    /// Whether the child may open external network connections.
    pub allow_network: bool,
}
72
73impl Default for SandboxLimits {
74 fn default() -> Self {
75 Self {
76 wall_clock: Duration::from_secs(120),
77 cpu: Duration::from_secs(60),
78 memory_bytes: 1024 * 1024 * 1024,
79 max_open_files: 256,
80 allow_network: false,
81 }
82 }
83}
84
/// Structured outcome of [`run_sandboxed`].
///
/// Implements `PartialEq`/`Eq` so tests and callers can compare a complete
/// outcome against an expected value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SandboxOutcome {
    /// The child exited with status 0 and no limit was exceeded.
    pub success: bool,
    /// Raw exit code, if the child exited normally.
    pub exit_code: Option<i32>,
    /// Terminating signal number, if the child was signalled.
    pub signal: Option<i32>,
    /// Captured stderr (trimmed and truncated to the first few lines).
    pub stderr: String,
    /// Which limit (if any) was exceeded.
    pub limit_exceeded: Option<String>,
}
99
100impl SandboxOutcome {
101 /// Short label describing the outcome for reporter consumption.
102 #[must_use]
103 pub fn summary(&self) -> String {
104 match &self.limit_exceeded {
105 Some(limit) => format!("sandbox limit exceeded: {limit}"),
106 None if self.success => "ok".to_string(),
107 None => {
108 let detail = self
109 .stderr
110 .lines()
111 .filter(|line| !line.trim().is_empty())
112 .take(8)
113 .collect::<Vec<_>>()
114 .join("\n");
115 if detail.is_empty() {
116 match (self.exit_code, self.signal) {
117 (Some(code), _) => format!("exit status {code}"),
118 (_, Some(sig)) => format!("signal {sig}"),
119 _ => "non-zero exit".to_string(),
120 }
121 } else {
122 detail
123 }
124 }
125 }
126 }
127}
128
129/// Execute `command` under the configured [`SandboxLimits`] (T081).
130///
131/// On Unix this installs `rlimit`s via a `pre_exec` hook inside the
132/// [`posix`] submodule and enforces wall-clock by SIGKILL-ing the child after
133/// the deadline. On Windows this returns
134/// [`EvaluatorError::UnsupportedPlatform`] (T082) without spawning.
135///
136/// When a limit is exceeded the returned [`SandboxOutcome`] has
137/// `limit_exceeded = Some(<name>)` and callers may synthesise
138/// [`EvaluatorError::SandboxLimitExceeded`] from it; the lower-level
139/// [`SandboxOutcome`] shape is preserved so callers that want to inspect
140/// stderr can do so before mapping to the typed error.
141pub fn run_sandboxed(
142 command: Command,
143 limits: &SandboxLimits,
144) -> Result<SandboxOutcome, EvaluatorError> {
145 #[cfg(target_family = "unix")]
146 {
147 posix::run_sandboxed_unix(command, limits)
148 }
149 #[cfg(target_family = "windows")]
150 {
151 // Touch the parameters to silence the unused-warning on stub builds.
152 let _ = (command, limits);
153 Err(EvaluatorError::UnsupportedPlatform {
154 reason: "SandboxedExecutionEvaluator is Unix-only (Linux/macOS); \
155 FR-017 defines Windows as unsupported for this evaluator"
156 .to_string(),
157 })
158 }
159}
160
/// Factory for the child `Command` run by a [`SandboxedExecutionEvaluator`].
///
/// An implementation writes whatever auxiliary files it needs into `workdir`
/// and hands back a `Command` whose `current_dir` is inside `workdir`. The
/// evaluator keeps `workdir` alive for as long as the child process runs and
/// deletes it afterwards.
pub trait SandboxRunner: Send + Sync {
    /// Build the `Command` that executes `code` inside `workdir`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error hit while preparing the command, if any.
    fn command(&self, code: &str, workdir: &std::path::Path) -> std::io::Result<Command>;
}
171
172/// Default [`SandboxRunner`]: runs the extracted snippet verbatim via
173/// `/bin/sh -c`. Intended for smoke tests and shell-style snippets.
174///
175/// Most real deployments will want to plug a custom runner in (e.g. scaffold
176/// a Rust crate and `cargo run`), but the shell runner keeps the evaluator
177/// useful out of the box and — crucially — self-contained for tests (no
178/// compilers, no `cc`).
179#[derive(Debug, Default, Clone)]
180pub struct ShellRunner;
181
182impl SandboxRunner for ShellRunner {
183 fn command(&self, code: &str, workdir: &std::path::Path) -> std::io::Result<Command> {
184 let script = workdir.join("snippet.sh");
185 std::fs::write(&script, code)?;
186 let mut cmd = Command::new("/bin/sh");
187 cmd.arg(script);
188 cmd.current_dir(workdir);
189 Ok(cmd)
190 }
191}
192
193/// Sandboxed execution evaluator (T081 — evaluator wiring).
194///
195/// Extracts a code block via the configured [`CodeExtractor`], writes it
196/// into a fresh tempdir, and invokes a [`SandboxRunner`] to produce the
197/// child `Command`. The child runs under [`SandboxLimits`]; the outcome is
198/// folded into an [`EvalMetricResult`].
199///
200/// When no code is extractable this evaluator returns `None` to match the
201/// FR-020 "criterion not set" semantics used by the sibling
202/// [`crate::CargoCheckEvaluator`].
203pub struct SandboxedExecutionEvaluator {
204 name: &'static str,
205 extractor: Arc<CodeExtractor>,
206 limits: SandboxLimits,
207 runner: Arc<dyn SandboxRunner>,
208}
209
210impl SandboxedExecutionEvaluator {
211 /// Construct an evaluator with the default shell runner + default limits.
212 #[must_use]
213 pub fn new(extractor: Arc<CodeExtractor>) -> Self {
214 Self {
215 name: "sandboxed_execution",
216 extractor,
217 limits: SandboxLimits::default(),
218 runner: Arc::new(ShellRunner),
219 }
220 }
221
222 /// Override the reported evaluator name.
223 #[must_use]
224 pub const fn with_name(mut self, name: &'static str) -> Self {
225 self.name = name;
226 self
227 }
228
229 /// Override the resource limits.
230 #[must_use]
231 pub fn with_limits(mut self, limits: SandboxLimits) -> Self {
232 self.limits = limits;
233 self
234 }
235
236 /// Override the runner used to build the child `Command`.
237 #[must_use]
238 pub fn with_runner(mut self, runner: Arc<dyn SandboxRunner>) -> Self {
239 self.runner = runner;
240 self
241 }
242}
243
244impl Evaluator for SandboxedExecutionEvaluator {
245 fn name(&self) -> &'static str {
246 self.name
247 }
248
249 fn evaluate(&self, _case: &EvalCase, invocation: &Invocation) -> Option<EvalMetricResult> {
250 let response = invocation.final_response.as_ref()?;
251 let code = crate::evaluators::block_on(self.extractor.extract(response))?;
252
253 let tempdir = match tempfile::TempDir::new() {
254 Ok(dir) => dir,
255 Err(err) => {
256 return Some(EvalMetricResult {
257 evaluator_name: self.name.to_string(),
258 score: Score::fail(),
259 details: Some(format!("tempdir creation failed: {err}")),
260 });
261 }
262 };
263
264 let command = match self.runner.command(&code, tempdir.path()) {
265 Ok(cmd) => cmd,
266 Err(err) => {
267 return Some(EvalMetricResult {
268 evaluator_name: self.name.to_string(),
269 score: Score::fail(),
270 details: Some(format!("runner failed: {err}")),
271 });
272 }
273 };
274
275 let (score, details) = match run_sandboxed(command, &self.limits) {
276 Ok(outcome) => {
277 let score = if outcome.success {
278 Score::pass()
279 } else {
280 Score::fail()
281 };
282 (score, outcome.summary())
283 }
284 Err(EvaluatorError::UnsupportedPlatform { reason }) => {
285 (Score::fail(), format!("unsupported platform: {reason}"))
286 }
287 Err(EvaluatorError::SandboxLimitExceeded { limit }) => {
288 (Score::fail(), format!("sandbox limit exceeded: {limit}"))
289 }
290 Err(err) => (Score::fail(), err.into_metric_details()),
291 };
292
293 Some(EvalMetricResult {
294 evaluator_name: self.name.to_string(),
295 score,
296 details: Some(details),
297 })
298 }
299}