oharness-tools 0.1.0

ToolSet trait and bundled tool kits (bash, filesystem) for open-harness
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
//! `bash` tool kit (§7.5). Runs shell commands in the workspace directory.
//!
//! M1a: straightforward subprocess exec with stdout/stderr capture and optional
//! timeout. Not sandboxed — callers should pair this with `ApprovalChannel` or a
//! custom `ToolPolicy` for anything beyond local research. The full threat
//! model, what's mitigated here vs. what isn't, and deployer
//! recommendations live in `docs/security.md`.

use crate::context::ToolContext;
use crate::toolset::{ToolOutcome, ToolSet};
use async_trait::async_trait;
use oharness_core::message::{Content, ToolOutput};
use oharness_core::ToolSpec;
use serde::Deserialize;
use serde_json::{json, Value};
use std::process::Stdio;
use std::sync::OnceLock;
use std::time::Duration;
use tokio::process::Command;

/// Timeout, in seconds, that `BashTool::new` installs as the tool-wide default.
const DEFAULT_TIMEOUT_SECS: u64 = 60;
/// Size threshold (64 KiB) above which the combined stdout/stderr text is truncated.
const MAX_OUTPUT_BYTES: usize = 64 * 1024;

/// A single-tool `ToolSet` exposing `bash`.
///
/// Built with `BashTool::new` (or `Default`, which uses the name `"bash"`)
/// and tuned with the `with_*` builder methods.
pub struct BashTool {
    /// Tool name: advertised in the spec and the only name `execute` accepts.
    name: String,
    /// Timeout applied when a call does not supply its own `timeout_secs`.
    timeout: Duration,
    /// Optional env-var allowlist. When `Some(names)`, the
    /// subprocess starts with a cleared environment and only the
    /// listed variables are copied over from the parent. When
    /// `None`, the subprocess inherits the full parent environment
    /// (backwards-compatible default). See `docs/security.md` §3.3
    /// for recommended allowlists.
    env_allowlist: Option<Vec<String>>,
    /// Specs handed out by `ToolSet::specs`; `new` fills this with one entry.
    specs: Vec<ToolSpec>,
}

impl Default for BashTool {
    fn default() -> Self {
        Self::new("bash")
    }
}

impl BashTool {
    pub fn new(name: impl Into<String>) -> Self {
        let name = name.into();
        let specs = vec![ToolSpec {
            name: name.clone(),
            description: "Execute a shell command via `/bin/bash -c <command>`. Returns \
                          combined stdout/stderr. Commands run in the configured \
                          workspace directory, or the current directory if no workspace \
                          is set. Output is truncated at 64KiB."
                .to_string(),
            input_schema: default_schema(),
        }];
        Self {
            name,
            timeout: Duration::from_secs(DEFAULT_TIMEOUT_SECS),
            env_allowlist: None,
            specs,
        }
    }

    pub fn with_timeout(mut self, d: Duration) -> Self {
        self.timeout = d;
        self
    }

    /// Restrict the subprocess environment to the listed variables.
    /// The child starts with no env, then the allowlisted names are
    /// copied over from the parent's env (silently skipped if
    /// unset). Recommended for eval / CI / untrusted-LLM work:
    ///
    /// ```ignore
    /// let tool = BashTool::default()
    ///     .with_env_allowlist(["PATH", "HOME", "USER", "SHELL", "LANG"]);
    /// ```
    ///
    /// This hides `*_API_KEY`, `AWS_*`, `ANTHROPIC_*`, SSH agent
    /// sockets, and similar secrets from the subprocess. Without
    /// an allowlist the subprocess inherits the full environment.
    pub fn with_env_allowlist<I, S>(mut self, names: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        self.env_allowlist = Some(names.into_iter().map(Into::into).collect());
        self
    }
}

#[async_trait]
impl ToolSet for BashTool {
    /// Returns the tool specs built in `BashTool::new` (a single `bash` spec).
    fn specs(&self) -> &[ToolSpec] {
        &self.specs
    }

    /// Runs `input.command` through `/bin/bash -c` and reports what it printed.
    ///
    /// `input` must deserialize into `BashInput` (`command`, optional
    /// `timeout_secs`). The command runs in `ctx.workspace_path()` when one
    /// is set, with stdin closed and stdout/stderr captured.
    ///
    /// Outcomes:
    /// - `Success`: the process finished. The text holds `STDOUT:` /
    ///   `STDERR:` sections (empty streams are omitted), is cut at
    ///   `MAX_OUTPUT_BYTES` with a marker when longer, and ends with the
    ///   exit code when it is non-zero or a note when a signal killed it.
    /// - `Cancelled`: `ctx.cancellation` fired before spawn or while running.
    /// - error outcome: wrong tool name, undecodable input, spawn/wait
    ///   failure, or timeout.
    async fn execute(&self, name: &str, input: Value, ctx: &ToolContext) -> ToolOutcome {
        if name != self.name {
            return ToolOutcome::error(format!("tool `{name}` not handled by BashTool"), false);
        }
        if ctx.cancellation.is_cancelled() {
            return ToolOutcome::Cancelled;
        }

        let parsed: BashInput = match serde_json::from_value(input) {
            Ok(v) => v,
            Err(e) => return ToolOutcome::error(format!("invalid bash input: {e}"), false),
        };

        let mut cmd = Command::new("/bin/bash");
        cmd.arg("-c").arg(&parsed.command);
        if let Some(ws) = ctx.workspace_path() {
            cmd.current_dir(ws);
        }

        // Env-allowlist filtering. Default (`None`) keeps the
        // full parent env for backwards-compat; an allowlist
        // clears env first, then copies over the named vars.
        if let Some(allowed) = &self.env_allowlist {
            cmd.env_clear();
            for var in allowed {
                if let Ok(val) = std::env::var(var) {
                    cmd.env(var, val);
                }
            }
        }

        // Capture stdout + stderr via pipes (`Command::spawn` by
        // default inherits the parent's, which leaks output into
        // the harness's own stdio).
        cmd.stdout(Stdio::piped());
        cmd.stderr(Stdio::piped());
        // Don't let the subprocess read from the parent stdin —
        // some commands (e.g. `cat`) would otherwise block waiting
        // for input that never arrives.
        cmd.stdin(Stdio::null());

        // Ensure the child dies if the enclosing future is dropped —
        // e.g. on timeout, cancellation, or the caller drop-completing
        // a task that spawned the tool call. Without this, a timed-out
        // bash command leaks a background process. Audit notes:
        // `docs/security.md` §3.1.
        cmd.kill_on_drop(true);

        // A per-call `timeout_secs` overrides the tool-wide default.
        let timeout_dur = parsed
            .timeout_secs
            .map(Duration::from_secs)
            .unwrap_or(self.timeout);

        // Run the child, racing three things:
        // 1. the child finishing normally,
        // 2. the timeout firing,
        // 3. `ctx.cancellation` being signalled.
        // Because `kill_on_drop(true)` is set, dropping the Child on
        // any of (2)/(3) sends SIGKILL; we don't need a manual kill.
        let cancellation = ctx.cancellation.clone();
        let output = {
            let child = match cmd.spawn() {
                Ok(c) => c,
                Err(e) => return ToolOutcome::error(format!("bash spawn: {e}"), true),
            };
            // NOTE(review): the child's output is collected in full before
            // truncation, so the 64 KiB cap bounds the reply, not memory use.
            tokio::select! {
                res = child.wait_with_output() => match res {
                    Ok(o) => o,
                    Err(e) => return ToolOutcome::error(format!("bash: {e}"), true),
                },
                _ = tokio::time::sleep(timeout_dur) => {
                    // The wait future (which owns the child) is dropped here;
                    // kill_on_drop takes care of the process.
                    // `Duration`'s Debug form prints whole seconds as e.g. `60s`
                    // (same text as before) and keeps sub-second timeouts
                    // readable (`500ms` instead of `0s`).
                    return ToolOutcome::error(
                        format!("bash: timed out after {timeout_dur:?}"),
                        true,
                    );
                }
                _ = cancellation.cancelled() => {
                    return ToolOutcome::Cancelled;
                }
            }
        };

        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        let code = output.status.code();

        let mut combined = String::new();
        if !stdout.is_empty() {
            combined.push_str("STDOUT:\n");
            combined.push_str(&stdout);
        }
        if !stderr.is_empty() {
            if !combined.is_empty() {
                combined.push_str("\n\n");
            }
            combined.push_str("STDERR:\n");
            combined.push_str(&stderr);
        }

        // Cut on a char boundary: slicing at the raw byte offset panics when
        // the limit lands inside a multi-byte UTF-8 character.
        let truncated = combined.len() > MAX_OUTPUT_BYTES;
        let combined = if truncated {
            format!(
                "{}\n\n[truncated at {MAX_OUTPUT_BYTES} bytes]",
                truncate_at_char_boundary(&combined, MAX_OUTPUT_BYTES)
            )
        } else {
            combined
        };

        // `code()` is `None` when the process was terminated by a signal.
        let tail = match code {
            Some(0) => String::new(),
            Some(c) => format!("\n\n[exit code: {c}]"),
            None => "\n\n[exit: killed by signal]".to_string(),
        };

        ToolOutcome::Success(ToolOutput {
            content: vec![Content::text(format!("{combined}{tail}"))],
            truncated,
        })
    }
}

/// Returns the longest prefix of `text` that is at most `max_bytes` long and
/// ends on a UTF-8 character boundary.
fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> &str {
    if text.len() <= max_bytes {
        return text;
    }
    let mut end = max_bytes;
    // Offset 0 is always a boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}

/// Arguments of one `bash` call, deserialized from the tool's JSON input.
#[derive(Debug, Deserialize)]
struct BashInput {
    /// Command line handed to `/bin/bash -c`.
    command: String,
    /// Per-call timeout in seconds; when absent the tool-wide timeout applies.
    #[serde(default)]
    timeout_secs: Option<u64>,
}

/// Returns the JSON Schema describing the `bash` tool input.
///
/// The schema value is built once per process and cached; every call
/// returns a fresh clone of the cached value.
fn default_schema() -> Value {
    static CACHED: OnceLock<Value> = OnceLock::new();

    /// Assembles the schema: a required `command` string plus an optional
    /// `timeout_secs` integer, with no other properties allowed.
    fn build() -> Value {
        let command = json!({
            "type": "string",
            "description": "The shell command to execute."
        });
        let timeout_secs = json!({
            "type": "integer",
            "description": "Optional per-call timeout in seconds.",
            "minimum": 1
        });
        json!({
            "type": "object",
            "required": ["command"],
            "properties": {
                "command": command,
                "timeout_secs": timeout_secs
            },
            "additionalProperties": false
        })
    }

    CACHED.get_or_init(build).clone()
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Instant;

    /// Shorthand for `ToolContext::null()`, the context used by tests that
    /// need no special setup (presumably no workspace configured — confirm
    /// against `ToolContext`).
    fn context() -> ToolContext {
        ToolContext::null()
    }

    /// Flattens an outcome into one string for assertions.
    ///
    /// `Success` yields its text parts joined by newlines (non-text
    /// content is skipped); errors and denials yield their message;
    /// cancellation yields the literal `<cancelled>`.
    fn outcome_text(outcome: &ToolOutcome) -> String {
        let output = match outcome {
            ToolOutcome::Cancelled => return "<cancelled>".to_string(),
            ToolOutcome::Denied { reason } => return reason.clone(),
            ToolOutcome::ExecutionError { message, .. } => return message.clone(),
            ToolOutcome::Success(output) => output,
        };

        let mut pieces: Vec<&str> = Vec::new();
        for part in &output.content {
            if let Content::Text { text } = part {
                pieces.push(text.as_str());
            }
        }
        pieces.join("\n")
    }

    /// A plain `echo` succeeds and its stdout shows up in the result text.
    #[tokio::test]
    async fn happy_path_captures_stdout() {
        let input = json!({"command": "echo hello world"});
        let outcome = BashTool::default()
            .execute("bash", input, &context())
            .await;

        assert!(matches!(outcome, ToolOutcome::Success(_)));
        let text = outcome_text(&outcome);
        assert!(text.contains("hello world"), "missing stdout: {text}");
    }

    /// Regression test for the timeout path. Before the audit a
    /// timed-out command was dropped without being killed and kept
    /// running in the background; `kill_on_drop(true)` now covers that.
    /// What this test observes directly: with a 1s tool timeout,
    /// `sleep 30` comes back within 3s as a "timed out" error.
    #[tokio::test]
    async fn timeout_kills_subprocess_not_leaks_it() {
        let one_second = Duration::from_secs(1);
        let tool = BashTool::default().with_timeout(one_second);

        let started = Instant::now();
        let outcome = tool
            .execute("bash", json!({"command": "sleep 30"}), &context())
            .await;
        let elapsed = started.elapsed();

        assert!(
            elapsed < Duration::from_secs(3),
            "bash did not return promptly on timeout: took {elapsed:?}"
        );
        let ToolOutcome::ExecutionError { message, .. } = &outcome else {
            panic!("expected ExecutionError, got {outcome:?}");
        };
        assert!(message.contains("timed out"), "{message}");
    }

    /// Cancellation via `ctx.cancellation` fires mid-execution
    /// (not just pre-spawn) — the tool polls the token concurrently
    /// via `tokio::select!`.
    #[tokio::test]
    async fn cancellation_interrupts_running_command() {
        let tool = BashTool::default().with_timeout(Duration::from_secs(30));
        let ctx = ToolContext::null();

        // The clone moves into the background task. It is expected to share
        // state with `ctx.cancellation`, so cancelling it is what `execute`
        // observes; no reassignment on `ctx` is needed.
        let token = ctx.cancellation.clone();
        // Cancel after 200ms — mid-sleep.
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(200)).await;
            token.cancel();
        });

        let start = Instant::now();
        let outcome = tool
            .execute("bash", json!({"command": "sleep 30"}), &ctx)
            .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed < Duration::from_secs(3),
            "cancellation was not prompt: took {elapsed:?}"
        );
        assert!(matches!(outcome, ToolOutcome::Cancelled), "got {outcome:?}");
    }

    /// Env allowlist: when set, the subprocess starts with a
    /// cleared env and only the listed vars propagate. Use a
    /// distinctive var that the parent sets but we don't allow —
    /// `env` should NOT show it in the child's output.
    #[tokio::test]
    async fn env_allowlist_hides_unlisted_vars() {
        // Set a parent env var the subprocess shouldn't see.
        // Safety note: modifying the process env isn't ideal in
        // parallel tests, but the child observes a snapshot at
        // spawn. Other tests don't read this key.
        // SAFETY: in Rust 2024+ `std::env::set_var` is unsafe; on
        // our pinned toolchain it's still safe.
        std::env::set_var("OHARNESS_BASH_TEST_SECRET", "should-not-leak");

        let tool = BashTool::default().with_env_allowlist(["PATH", "HOME"]);
        let outcome = tool
            .execute("bash", json!({"command": "env"}), &context())
            .await;
        let text = outcome_text(&outcome);
        assert!(
            !text.contains("OHARNESS_BASH_TEST_SECRET"),
            "secret env var leaked through allowlist: {text}"
        );

        // Clean up.
        std::env::remove_var("OHARNESS_BASH_TEST_SECRET");
    }

    /// Without an allowlist, the subprocess inherits the full
    /// env (backwards-compatible default).
    #[tokio::test]
    async fn no_allowlist_inherits_env() {
        // Other tests don't read this key, so setting it process-wide is
        // tolerable even when tests run in parallel.
        std::env::set_var("OHARNESS_BASH_PASSTHROUGH", "visible");

        let tool = BashTool::default();
        let outcome = tool
            .execute("bash", json!({"command": "env"}), &context())
            .await;

        // Clean up before asserting so a failed assertion cannot leave
        // the variable set for the rest of the test process.
        std::env::remove_var("OHARNESS_BASH_PASSTHROUGH");

        let text = outcome_text(&outcome);
        assert!(
            text.contains("OHARNESS_BASH_PASSTHROUGH"),
            "expected env var to passthrough without allowlist: {text}"
        );
    }

    /// Output beyond the 64KiB cap is cut short: the `ToolOutput`
    /// carries `truncated = true` and the text includes the
    /// truncation marker.
    #[tokio::test]
    async fn large_output_is_truncated_flagged() {
        // ~200KB of output — well past the 64KB cap.
        let input = json!({"command": "yes foo | head -c 200000"});
        let outcome = BashTool::default()
            .execute("bash", input, &context())
            .await;

        let ToolOutcome::Success(output) = &outcome else {
            panic!("expected Success, got {outcome:?}");
        };
        assert!(output.truncated, "truncated flag not set");

        // Concatenate every text part; non-text content is ignored.
        let mut text = String::new();
        for part in &output.content {
            if let Content::Text { text: chunk } = part {
                text.push_str(chunk);
            }
        }
        assert!(text.contains("truncated at"), "missing truncation marker");
    }
}