Skip to main content

koda_core/tools/
shell.rs

1//! Shell command execution tool (Bash).
2//!
3//! Runs commands as child processes with timeout protection.
4//! Output line cap is set by `OutputCaps` (context-scaled).
5//!
6//! ## Parameters
7//!
8//! - **`command`** (required) — The shell command to execute
//! - **`timeout`** (optional; defaults to the sandbox policy's wall-time limit, else 60; capped at 300) — Timeout in seconds
10//! - **`background`** (optional, default false) — Run in background, return PID
11//!
12//! ## Background mode
13//!
14//! When `background: true` the command is spawned detached and control returns
15//! immediately with the PID. Use for dev servers, file watchers, and other
16//! long-running processes. Background processes are tracked in `BgRegistry`.
17//!
18//! ## Safety
19//!
20//! - Commands are classified by `bash_safety::classify_bash_command`
21//! - Destructive commands (`rm -rf`, `git push --force`) always need confirmation
22//! - Path escapes outside the project root are flagged by `bash_path_lint`
23//! - Output is capped to prevent context overflow (verbose output is truncated)
24//!
25//! ## Best practices (sent to the model)
26//!
27//! - Use Bash only for builds, tests, git, and commands without a dedicated tool
28//! - Never use Bash for file ops — use Read/Write/Edit/Grep/List instead
29//! - Suppress verbose output: pipe to `tail`, use `--quiet`, avoid `-v` flags
30
31use crate::engine::{EngineEvent, EngineSink};
32use crate::providers::ToolDefinition;
33use crate::tools::bg_process::BgRegistry;
34use anyhow::Result;
35use serde_json::{Value, json};
36use std::path::Path;
37use tokio::io::{AsyncBufReadExt, BufReader};
38
/// Timeout (seconds) used when neither the call nor the sandbox policy supplies one.
const DEFAULT_TIMEOUT_SECS: u64 = 60;
/// Upper bound (seconds) on any timeout, so an LLM-supplied value cannot
/// turn a single tool call into a denial of service.
const MAX_TIMEOUT_SECS: u64 = 300;
/// Number of trailing stderr lines kept in the summary (stderr is high-signal).
const SUMMARY_STDERR_LINES: usize = 50;
/// Number of trailing stdout lines kept in the summary.
const SUMMARY_STDOUT_TAIL: usize = 20;
/// Byte budget for in-memory line collection. Pathological commands (`yes`,
/// `cat /dev/urandom | base64`) can emit gigabytes before the timeout fires.
/// Once this many bytes have been collected, further lines are still
/// streamed to the TUI but are no longer pushed into the in-memory Vecs.
/// What actually gets persisted is bounded separately by the full-output
/// cap (`DEFAULT_MAX_FULL_OUTPUT_BYTES` or the policy override).
const MAX_COLLECT_BYTES: usize = 10 * 1024 * 1024; // 10 MB
52
/// Outcome of one shell invocation: a short model-facing digest plus the
/// complete text kept for later retrieval.
#[derive(Clone, Debug)]
pub struct ShellOutput {
    /// Condensed report that goes into the model's context window.
    pub summary: String,
    /// Complete output destined for DB storage / RecallContext lookup.
    /// Background commands set this to `None` because nothing is captured.
    pub full_output: Option<String>,
}
62
63/// Return tool definitions for the LLM.
64pub fn definitions() -> Vec<ToolDefinition> {
65    vec![ToolDefinition {
66        name: "Bash".to_string(),
67        description: "Execute a shell command. Use ONLY for builds, tests, git, \
68            and commands without a dedicated tool. Never use for file ops \
69            (use Read/Write/Edit/Grep/List instead). Suppress verbose output: \
70            pipe to tail, use --quiet, avoid -v flags. \
71            Set background=true for long-running processes (dev servers, watchers) \
72            — returns immediately with the PID."
73            .to_string(),
74        parameters: json!({
75            "type": "object",
76            "properties": {
77                "command": {
78                    "type": "string",
79                    "description": "The shell command to execute"
80                },
81                "timeout": {
82                    "type": "integer",
83                    "description": "Timeout in seconds (default: 60, ignored when background=true)"
84                },
85                "background": {
86                    "type": "boolean",
87                    "description": "Run in background and return immediately with PID (default: false). \
88                        Use for dev servers, file watchers, and other long-running processes."
89                }
90            },
91            "required": ["command"]
92        }),
93    }]
94}
95
96/// Execute a shell command with timeout, output capping, and optional streaming.
97///
98/// When `sink` is provided, each line of stdout/stderr is emitted as a
99/// `ToolOutputLine` event as it arrives — giving the TUI a live terminal feel.
100/// The full output is still collected and returned as the tool result.
101///
102/// When `args["background"]` is `true`, the process is spawned detached and
103/// this function returns immediately with the PID.  The `BgRegistry` tracks
104/// the child so it is cleaned up (SIGTERM) when the session ends.
105//
106// 9 args is over clippy's default 7 — every one is load-bearing context
107// (project root, args, output cap, bg registry, streaming sink, trust
108// mode, two proxy ports). Bundling into a struct just to placate a lint
109// would obscure the call sites; the named-keyword feel comes for free
110// from Rust's positional-with-types discipline.
111#[allow(clippy::too_many_arguments)]
112pub async fn run_shell_command(
113    project_root: &Path,
114    args: &Value,
115    max_lines: usize,
116    bg: &BgRegistry,
117    sink: Option<(&dyn EngineSink, &str)>,
118    trust: &crate::trust::TrustMode,
119    policy: &koda_sandbox::SandboxPolicy,
120    proxy_port: Option<u16>,
121    socks5_port: Option<u16>,
122    // Phase E of #996: who invoked this Bash call? Threaded down to
123    // `spawn_background` so the bg-process registry entry is tagged
124    // with the right `spawner`. Without this, a sub-agent that calls
125    // `Bash{background:true}` and later tries to `CancelTask` its
126    // own process gets `Forbidden` because the entry is tagged with
127    // `None` while the caller is `Some(my_invocation_id)`.
128    //
129    // Top-level callers pass `None` and the entry is also tagged
130    // with `None` — the existing top-level happy path is preserved.
131    caller_spawner: Option<u32>,
132) -> Result<ShellOutput> {
133    let command = args["command"]
134        .as_str()
135        .ok_or_else(|| anyhow::anyhow!("Missing 'command' argument"))?;
136    let background = args["background"].as_bool().unwrap_or(false);
137
138    tracing::info!(
139        "Running shell command (background={background}): [{} chars]",
140        command.len()
141    );
142
143    if background {
144        let msg = spawn_background(
145            project_root,
146            command,
147            bg,
148            trust,
149            policy,
150            proxy_port,
151            socks5_port,
152            caller_spawner,
153        )?;
154        return Ok(ShellOutput {
155            summary: msg,
156            full_output: None,
157        });
158    }
159
160    // Phase 5 PR-3 of #934: timeout precedence is
161    //   1. explicit `timeout` arg (model-supplied per-call)
162    //   2. `policy.limits.wall_time_secs` (per-agent default)
163    //   3. `DEFAULT_TIMEOUT_SECS` legacy constant fallback
164    // Hard ceiling `MAX_TIMEOUT_SECS` clamps all three — keeps the
165    // DoS protection that the LLM can't widen its own deadline by
166    // asking for a 1-hour run.
167    let timeout_secs = args["timeout"]
168        .as_u64()
169        .or(policy.limits.wall_time_secs)
170        .unwrap_or(DEFAULT_TIMEOUT_SECS)
171        .min(MAX_TIMEOUT_SECS);
172
173    // Spawn via sandbox wrapper (enforced for all trust modes).
174    // Phase 5 PR-2 of #934: policy is now threaded in from the
175    // ToolRegistry (set via [`ToolRegistry::with_sandbox_policy`] in
176    // sub-agent dispatch; defaults to `strict_default()` for the main
177    // agent). PR-3 will start populating the policy with non-default
178    // values via [`crate::sandbox::policy_for_agent`].
179    let mut child = crate::sandbox::build(
180        command,
181        project_root,
182        trust,
183        policy,
184        proxy_port,
185        socks5_port,
186    )?
187    .stdout(std::process::Stdio::piped())
188    .stderr(std::process::Stdio::piped())
189    .spawn()
190    .map_err(|e| anyhow::anyhow!("Failed to execute command: {e}"))?;
191
192    let stdout = child.stdout.take().unwrap();
193    let stderr = child.stderr.take().unwrap();
194
195    let mut stdout_lines: Vec<String> = Vec::new();
196    let mut stderr_lines: Vec<String> = Vec::new();
197
198    // Read stdout and stderr concurrently, streaming lines as they arrive.
199    // Lines are always streamed to the TUI, but collection into Vec stops
200    // once max_lines or MAX_COLLECT_BYTES is reached (OOM protection).
201    let sink_info = sink.map(|(s, id)| (s, id.to_string()));
202    let result = tokio::time::timeout(
203        std::time::Duration::from_secs(timeout_secs),
204        read_streams(
205            stdout,
206            stderr,
207            &mut stdout_lines,
208            &mut stderr_lines,
209            max_lines,
210            &sink_info,
211        ),
212    )
213    .await;
214
215    match result {
216        Ok(Ok(())) => {
217            // Wait for exit status after streams are drained.
218            let status = child
219                .wait()
220                .await
221                .map_err(|e| anyhow::anyhow!("wait: {e}"))?;
222            let exit_code = status.code().unwrap_or(-1);
223
224            // Phase 1 of #934: surface kernel-sandbox denials to the model.
225            // The kernel makes the syscall fail with EACCES/EPERM and the
226            // child shell prints a libc error to stderr; parse those lines
227            // into structured violations and append a CC-style block.
228            // Always-on (informational only — doesn't change exit codes or
229            // change which lines we already collected).
230            annotate_violations(exit_code, command, &mut stderr_lines);
231
232            let summary = format_summary(exit_code, &stdout_lines, &stderr_lines);
233            // Phase 5 PR-8 of #934: thread the policy-supplied output cap
234            // into format_full_output. `None` keeps the historical 2 MB
235            // default so today's behavior is byte-identical until a
236            // future PR populates `policy.limits.max_output_bytes`.
237            let max_bytes = policy
238                .limits
239                .max_output_bytes
240                .map(|n| usize::try_from(n).unwrap_or(usize::MAX))
241                .unwrap_or(DEFAULT_MAX_FULL_OUTPUT_BYTES);
242            let full = format_full_output(exit_code, &stdout_lines, &stderr_lines, max_bytes);
243
244            Ok(ShellOutput {
245                summary,
246                full_output: Some(full),
247            })
248        }
249        Ok(Err(e)) => Err(anyhow::anyhow!("Stream read error: {e}")),
250        Err(_) => {
251            // Timeout — kill the child.
252            let _ = child.kill().await;
253            let msg = format!("Command timed out after {timeout_secs}s: {command}");
254            Ok(ShellOutput {
255                summary: msg.clone(),
256                full_output: Some(msg),
257            })
258        }
259    }
260}
261
262/// Read stdout and stderr concurrently, collecting lines and optionally streaming them.
263///
264/// Lines are always streamed to the TUI sink (if present), but collection into
265/// the Vecs is gated by two caps:
266///   - `max_lines` — total stdout + stderr lines collected
267///   - `MAX_COLLECT_BYTES` — total bytes collected (OOM protection)
268///
269/// Once either cap is hit, new lines are still streamed to the TUI but silently
270/// dropped from the Vecs. This keeps the TUI responsive while bounding memory
271/// for pathological commands.
272async fn read_streams(
273    stdout: tokio::process::ChildStdout,
274    stderr: tokio::process::ChildStderr,
275    stdout_lines: &mut Vec<String>,
276    stderr_lines: &mut Vec<String>,
277    max_lines: usize,
278    sink_info: &Option<(&dyn EngineSink, String)>,
279) -> std::io::Result<()> {
280    let mut stdout_reader = BufReader::new(stdout).lines();
281    let mut stderr_reader = BufReader::new(stderr).lines();
282
283    let mut stdout_done = false;
284    let mut stderr_done = false;
285    let mut collected_bytes: usize = 0;
286    let mut collected_lines: usize = 0;
287
288    while !stdout_done || !stderr_done {
289        tokio::select! {
290            line = stdout_reader.next_line(), if !stdout_done => {
291                match line? {
292                    Some(l) => {
293                        if let Some((sink, id)) = sink_info {
294                            sink.emit(EngineEvent::ToolOutputLine {
295                                id: id.clone(),
296                                line: l.clone(),
297                                is_stderr: false,
298                            });
299                        }
300                        if collected_lines < max_lines
301                            && collected_bytes < MAX_COLLECT_BYTES
302                        {
303                            collected_bytes += l.len();
304                            collected_lines += 1;
305                            stdout_lines.push(l);
306                        }
307                    }
308                    None => stdout_done = true,
309                }
310            }
311            line = stderr_reader.next_line(), if !stderr_done => {
312                match line? {
313                    Some(l) => {
314                        if let Some((sink, id)) = sink_info {
315                            sink.emit(EngineEvent::ToolOutputLine {
316                                id: id.clone(),
317                                line: l.clone(),
318                                is_stderr: true,
319                            });
320                        }
321                        if collected_lines < max_lines
322                            && collected_bytes < MAX_COLLECT_BYTES
323                        {
324                            collected_bytes += l.len();
325                            collected_lines += 1;
326                            stderr_lines.push(l);
327                        }
328                    }
329                    None => stderr_done = true,
330                }
331            }
332        }
333    }
334    Ok(())
335}
336
337/// Spawn a command in the background and register it.
338///
339/// Returns immediately with PID + instructions. Sync because `spawn()` doesn't
340/// need to await — only `output()` / `wait()` block.
341#[allow(clippy::too_many_arguments)]
342fn spawn_background(
343    project_root: &Path,
344    command: &str,
345    bg: &BgRegistry,
346    trust: &crate::trust::TrustMode,
347    policy: &koda_sandbox::SandboxPolicy,
348    proxy_port: Option<u16>,
349    socks5_port: Option<u16>,
350    caller_spawner: Option<u32>,
351) -> Result<String> {
352    // Spawn via sandbox wrapper (enforced for all trust modes).
353    // Detach stdio so the process doesn't block on terminal I/O.
354    // Phase 5 PR-2 of #934: policy threaded in from the registry
355    // (see comment on `run_shell_command` above for the threading rationale).
356    let child = crate::sandbox::build(
357        command,
358        project_root,
359        trust,
360        policy,
361        proxy_port,
362        socks5_port,
363    )?
364    .stdin(std::process::Stdio::null())
365    .stdout(std::process::Stdio::null())
366    .stderr(std::process::Stdio::null())
367    .spawn()
368    .map_err(|e| anyhow::anyhow!("Failed to spawn background command: {e}"))?;
369
370    let pid = child
371        .id()
372        .ok_or_else(|| anyhow::anyhow!("Spawned process has no PID (already exited)"))?;
373
374    bg.insert(pid, command.to_string(), child, caller_spawner);
375
376    Ok(format!(
377        "Background process started.\n  PID:     {pid}\n  Command: {command}\n\
378         To stop:  Bash{{command: \"kill {pid}\"}}\n\
379         To force: Bash{{command: \"kill -9 {pid}\"}}\n\
380         Note: process will be stopped automatically when the session ends."
381    ))
382}
383
384/// Build a compact summary for the model's context window.
385///
386/// Includes all stderr (high-signal — errors/warnings) and only the tail
387/// of stdout (low-signal — build progress noise).  Line counts let the
388/// model decide whether to retrieve the full output via RecallContext.
389fn format_summary(exit_code: i32, stdout_lines: &[String], stderr_lines: &[String]) -> String {
390    let mut out = format!(
391        "Exit code: {exit_code} | stdout: {} lines | stderr: {} lines",
392        stdout_lines.len(),
393        stderr_lines.len(),
394    );
395
396    // Stderr first — always include (capped at SUMMARY_STDERR_LINES).
397    if !stderr_lines.is_empty() {
398        let (label, text) = if stderr_lines.len() > SUMMARY_STDERR_LINES {
399            let skipped = stderr_lines.len() - SUMMARY_STDERR_LINES;
400            (
401                format!(
402                    "\n\n--- stderr (last {} of {}, {skipped} skipped) ---",
403                    SUMMARY_STDERR_LINES,
404                    stderr_lines.len(),
405                ),
406                stderr_lines[stderr_lines.len() - SUMMARY_STDERR_LINES..].join("\n"),
407            )
408        } else {
409            (
410                format!("\n\n--- stderr ({} lines) ---", stderr_lines.len()),
411                stderr_lines.join("\n"),
412            )
413        };
414        out.push_str(&label);
415        out.push('\n');
416        out.push_str(&text);
417    }
418
419    // Stdout tail — only last N lines.
420    if !stdout_lines.is_empty() {
421        let (label, text) = if stdout_lines.len() > SUMMARY_STDOUT_TAIL {
422            (
423                format!(
424                    "\n\n--- stdout (last {} of {}) ---",
425                    SUMMARY_STDOUT_TAIL,
426                    stdout_lines.len(),
427                ),
428                stdout_lines[stdout_lines.len() - SUMMARY_STDOUT_TAIL..].join("\n"),
429            )
430        } else {
431            (
432                format!("\n\n--- stdout ({} lines) ---", stdout_lines.len()),
433                stdout_lines.join("\n"),
434            )
435        };
436        out.push_str(&label);
437        out.push('\n');
438        out.push_str(&text);
439    }
440
441    // Hint for the model.
442    if stdout_lines.len() > SUMMARY_STDOUT_TAIL || stderr_lines.len() > SUMMARY_STDERR_LINES {
443        out.push_str("\n\nFull output stored. Use RecallContext to search if needed.");
444    }
445
446    out
447}
448
/// Fallback cap on the stored full output of a single command, used when
/// the active sandbox policy does not set its own limit. 2 MB is the
/// historical value: large enough for RecallContext to find errors deep in
/// build/test output, small enough that a pathological command cannot
/// bloat the SQLite DB.
///
/// A policy overrides it through `policy.limits.max_output_bytes`, which
/// the caller passes on to [`format_full_output`] (Phase 5 PR-8 of #934).
const DEFAULT_MAX_FULL_OUTPUT_BYTES: usize = 2 * 1024 * 1024; // 2 MB
458
/// Turn a byte count into the label shown in the truncation marker.
///
/// Exact multiples of a mebibyte or kibibyte get a short label ("2MB",
/// "512KB"), trying the larger unit first; any other value is printed as
/// raw bytes so the reader can always see exactly where the cap landed.
fn format_byte_cap(bytes: usize) -> String {
    // Largest unit first so 2 MiB renders as "2MB", not "2048KB".
    const UNITS: [(usize, &str); 2] = [(1024 * 1024, "MB"), (1024, "KB")];

    UNITS
        .iter()
        .find(|(unit, _)| bytes >= *unit && bytes.is_multiple_of(*unit))
        .map(|(unit, label)| format!("{}{label}", bytes / unit))
        .unwrap_or_else(|| format!("{bytes} bytes"))
}
474
475/// Build the full output for DB storage.
476///
477/// Stored in `messages.full_content` and searchable via RecallContext.
478///
479/// `max_bytes` is the hard cap. The caller computes it from
480/// `policy.limits.max_output_bytes`, falling back to
481/// [`DEFAULT_MAX_FULL_OUTPUT_BYTES`] when the policy is silent. Threading
482/// the cap as a parameter (rather than reading the policy here) keeps
483/// this function trivially testable without spinning up a `SandboxPolicy`.
484fn format_full_output(
485    exit_code: i32,
486    stdout_lines: &[String],
487    stderr_lines: &[String],
488    max_bytes: usize,
489) -> String {
490    let mut out = format!("Exit code: {exit_code}\n");
491    if !stdout_lines.is_empty() {
492        out.push_str("\n--- stdout ---\n");
493        out.push_str(&stdout_lines.join("\n"));
494    }
495    if !stderr_lines.is_empty() {
496        out.push_str("\n\n--- stderr ---\n");
497        out.push_str(&stderr_lines.join("\n"));
498    }
499
500    // Hard cap to prevent DB bloat from pathological commands. Cap is
501    // policy-controlled (see `max_bytes` doc above); the message names
502    // the actual byte count so a future per-trust-mode override is
503    // legible to the model and the human reading the DB.
504    if out.len() > max_bytes {
505        out.truncate(max_bytes);
506        // Find safe char boundary
507        while !out.is_char_boundary(out.len()) {
508            out.pop();
509        }
510        out.push_str(&format!(
511            "\n\n[... output truncated at {} ...]",
512            format_byte_cap(max_bytes)
513        ));
514    }
515
516    out
517}
518
519/// Phase 1 of #934: parse stderr for kernel-sandbox denials and append a
520/// `<sandbox_violations>` block (CC pattern) so the model can react.
521///
522/// We only annotate when the command exited non-zero — a successful
523/// command with `Permission denied` in its stderr is almost always a
524/// false positive (e.g. `find / 2>/dev/null` swallows pre-sandbox
525/// errors that aren't sandbox denials at all).
526///
527/// The block is appended *to* `stderr_lines` (not stdout) so existing
528/// formatters carry it through unchanged. Violations are also recorded
529/// in the process-wide [`koda_sandbox::global_store`] for `/sandbox
530/// status` to surface later.
531fn annotate_violations(exit_code: i32, command: &str, stderr_lines: &mut Vec<String>) {
532    if exit_code == 0 {
533        return;
534    }
535    let joined = stderr_lines.join("\n");
536    let violations = koda_sandbox::monitor::parse_stderr(&joined, Some(command));
537    if violations.is_empty() {
538        return;
539    }
540    let store = koda_sandbox::global_store();
541    for v in &violations {
542        store.record(v.clone());
543    }
544    if let Some(block) = koda_sandbox::render_block(&violations) {
545        // Push each line of the block as its own entry so the line-count
546        // accounting in format_summary stays accurate.
547        for line in block.lines() {
548            stderr_lines.push(line.to_string());
549        }
550    }
551}
552
553#[cfg(test)]
554mod tests {
555    use super::*;
556    use crate::tools::bg_process::BgRegistry;
557
558    fn bg() -> BgRegistry {
559        BgRegistry::new()
560    }
561
562    #[tokio::test]
563    async fn shell_timeout_returns_timeout_message() {
564        let tmp = tempfile::tempdir().unwrap();
565        let args = serde_json::json!({"command": "sleep 5", "timeout": 1});
566        let result = run_shell_command(
567            tmp.path(),
568            &args,
569            256,
570            &bg(),
571            None,
572            &crate::trust::TrustMode::Safe,
573            &koda_sandbox::SandboxPolicy::strict_default(),
574            None,
575            None,
576            None,
577        )
578        .await
579        .unwrap();
580        assert!(
581            result.summary.contains("timed out"),
582            "Expected timeout message, got: {}",
583            result.summary
584        );
585    }
586
587    #[tokio::test]
588    async fn shell_respects_custom_timeout_parameter() {
589        let tmp = tempfile::tempdir().unwrap();
590        let args = serde_json::json!({"command": "echo hello", "timeout": 5});
591        let result = run_shell_command(
592            tmp.path(),
593            &args,
594            256,
595            &bg(),
596            None,
597            &crate::trust::TrustMode::Safe,
598            &koda_sandbox::SandboxPolicy::strict_default(),
599            None,
600            None,
601            None,
602        )
603        .await
604        .unwrap();
605        assert!(
606            result.summary.contains("hello"),
607            "Fast command should succeed: {}",
608            result.summary
609        );
610    }
611
612    #[tokio::test]
613    async fn shell_default_timeout_is_applied_when_not_specified() {
614        let tmp = tempfile::tempdir().unwrap();
615        let args = serde_json::json!({"command": "echo world"});
616        let result = run_shell_command(
617            tmp.path(),
618            &args,
619            256,
620            &bg(),
621            None,
622            &crate::trust::TrustMode::Safe,
623            &koda_sandbox::SandboxPolicy::strict_default(),
624            None,
625            None,
626            None,
627        )
628        .await
629        .unwrap();
630        assert!(
631            result.summary.contains("world"),
632            "Command without explicit timeout should work: {}",
633            result.summary
634        );
635    }
636
637    /// Phase 5 PR-8 of #934: end-to-end test that
638    /// `policy.limits.max_output_bytes` actually flows through
639    /// `run_shell_command` to truncate `full_output`. Pins the
640    /// integration that the unit tests on `format_full_output` can't
641    /// see — without this, a future refactor could silently drop the
642    /// policy lookup at the call site and only the unit tests would
643    /// stay green (because they call `format_full_output` directly).
644    ///
645    /// Generates ~10KB of stdout under a 1KB cap and asserts the
646    /// stored full_output respects the cap + carries the policy-aware
647    /// truncation marker.
648    #[tokio::test]
649    async fn run_shell_command_honors_policy_max_output_bytes() {
650        let tmp = tempfile::tempdir().unwrap();
651        // ~10KB of output: yes printed 1000 times of a 10-char string.
652        let args = serde_json::json!({
653            "command": "yes 'aaaaaaaaa' | head -1000"
654        });
655        let mut policy = koda_sandbox::SandboxPolicy::strict_default();
656        policy.limits.max_output_bytes = Some(1024); // 1 KB cap
657
658        let result = run_shell_command(
659            tmp.path(),
660            &args,
661            2000,
662            &bg(),
663            None,
664            &crate::trust::TrustMode::Safe,
665            &policy,
666            None,
667            None,
668            None,
669        )
670        .await
671        .expect("shell command must succeed");
672
673        let full = result.full_output.expect("full_output should be populated");
674        assert!(
675            full.len() <= 1024 + 50,
676            "full_output ({} bytes) must respect policy cap (1024) + small marker overhead",
677            full.len()
678        );
679        assert!(
680            full.contains("truncated at 1KB"),
681            "truncation marker should name the policy-supplied cap, got tail: {:?}",
682            full.lines().last()
683        );
684    }
685
686    #[tokio::test]
687    async fn background_spawn_returns_pid() {
688        let tmp = tempfile::tempdir().unwrap();
689        let registry = BgRegistry::new();
690        let args = serde_json::json!({"command": "sleep 60", "background": true});
691        let result = run_shell_command(
692            tmp.path(),
693            &args,
694            256,
695            &registry,
696            None,
697            &crate::trust::TrustMode::Safe,
698            &koda_sandbox::SandboxPolicy::strict_default(),
699            None,
700            None,
701            None,
702        )
703        .await
704        .unwrap();
705        assert!(
706            result.summary.contains("Background process started"),
707            "{}",
708            result.summary
709        );
710        assert!(result.summary.contains("PID:"), "{}", result.summary);
711        assert!(result.summary.contains("kill"), "{}", result.summary);
712        assert!(
713            result.full_output.is_none(),
714            "background has no full_output"
715        );
716        assert_eq!(registry.len(), 1);
717    }
718
719    /// Phase E of #996 regression: when a sub-agent (caller_spawner =
720    /// `Some(N)`) does `Bash{background:true}`, the bg-process entry
721    /// must be tagged with the same `Some(N)`. Without this fix, the
722    /// entry was hard-coded `None`, which meant the sub-agent's later
723    /// `CancelTask` / `WaitTask` (also `Some(N)`) would get
724    /// `Forbidden` because `None != Some(N)`.
725    ///
726    /// We assert at the registry level (`snapshot()` exposes the
727    /// `spawner` field) rather than driving CancelTask, because the
728    /// scoping check is what this PR fixes — the cancel mechanics are
729    /// already covered by `bg_process::tests::kill_as_caller_*`.
730    #[tokio::test]
731    async fn background_spawn_tags_entry_with_caller_spawner() {
732        let tmp = tempfile::tempdir().unwrap();
733        let registry = BgRegistry::new();
734        let args = serde_json::json!({"command": "sleep 60", "background": true});
735        let _ = run_shell_command(
736            tmp.path(),
737            &args,
738            256,
739            &registry,
740            None,
741            &crate::trust::TrustMode::Safe,
742            &koda_sandbox::SandboxPolicy::strict_default(),
743            None,
744            None,
745            Some(42), // sub-agent invocation id
746        )
747        .await
748        .unwrap();
749
750        let snap = registry.snapshot();
751        assert_eq!(snap.len(), 1, "one bg process expected");
752        assert_eq!(
753            snap[0].spawner,
754            Some(42),
755            "bg-process entry must carry the caller's spawner id so a \
756             same-spawner CancelTask doesn't get Forbidden"
757        );
758    }
759
760    #[tokio::test]
761    async fn background_false_runs_synchronously() {
762        let tmp = tempfile::tempdir().unwrap();
763        let args = serde_json::json!({"command": "echo sync", "background": false});
764        let result = run_shell_command(
765            tmp.path(),
766            &args,
767            256,
768            &bg(),
769            None,
770            &crate::trust::TrustMode::Safe,
771            &koda_sandbox::SandboxPolicy::strict_default(),
772            None,
773            None,
774            None,
775        )
776        .await
777        .unwrap();
778        assert!(result.summary.contains("sync"), "{}", result.summary);
779        assert!(
780            !result.summary.contains("PID:"),
781            "foreground should not have PID line: {}",
782            result.summary
783        );
784    }
785
/// A two-line stdout with empty stderr is summarised in full: exit code,
/// stdout line count and both lines, without the `RecallContext` hint.
#[test]
fn test_format_summary_short_output() {
    let out_lines = vec![String::from("hello"), String::from("world")];
    let err_lines: Vec<String> = Vec::new();

    let rendered = format_summary(0, &out_lines, &err_lines);

    // Header metadata.
    assert!(rendered.contains("Exit code: 0"));
    assert!(rendered.contains("stdout: 2 lines"));
    // Both stdout lines are shown.
    assert!(rendered.contains("hello"));
    assert!(rendered.contains("world"));
    // Short output must not carry the RecallContext hint.
    let mentions_recall = rendered.contains("RecallContext");
    assert!(!mentions_recall);
}
801
/// A 100-line stdout is reduced to its tail in the summary, while the single
/// stderr line is kept and the truncation is reported together with the
/// `RecallContext` hint.
#[test]
fn test_format_summary_long_stdout_truncated() {
    let mut out_lines: Vec<String> = Vec::with_capacity(100);
    for n in 0..100 {
        out_lines.push(format!("line {n}"));
    }
    let err_lines = vec![String::from("warning: something")];

    let rendered = format_summary(0, &out_lines, &err_lines);

    // Both ends of the last-20 window are present.
    assert!(rendered.contains("line 99"));
    assert!(rendered.contains("line 80"));
    // The very first line is gone (the trailing newline keeps "line 0"
    // from being confused with other text).
    assert!(!rendered.contains("line 0\n"));
    // The summary states how much of stdout was kept.
    assert!(rendered.contains("last 20 of 100"));
    // The stderr line is included.
    assert!(rendered.contains("warning: something"));
    // Truncated output carries the RecallContext hint.
    assert!(rendered.contains("RecallContext"));
}
819
/// Under the default byte cap, the full rendering of 100 short stdout lines
/// keeps the exit code, the first and last stdout lines and both stderr lines.
#[test]
fn test_format_full_output_includes_everything() {
    let mut out_lines: Vec<String> = Vec::with_capacity(100);
    out_lines.extend((0..100).map(|n| format!("line {n}")));
    let err_lines = vec![String::from("err1"), String::from("err2")];

    let rendered = format_full_output(1, &out_lines, &err_lines, DEFAULT_MAX_FULL_OUTPUT_BYTES);

    assert!(rendered.contains("Exit code: 1"));
    // First and last stdout lines.
    assert!(rendered.contains("line 0"));
    assert!(rendered.contains("line 99"));
    // Every stderr line.
    assert!(rendered.contains("err1"));
    assert!(rendered.contains("err2"));
}
831
/// With the default cap, oversized output is cut off near 2 MB and the
/// truncation marker names that cap.
#[test]
fn test_format_full_output_capped_at_2mb() {
    // Each generated line is 15-20 bytes of text, so 200K of them add up to
    // well over 3 MB, comfortably past the cap being tested.
    let out_lines: Vec<String> = (0..200_000)
        .map(|n| format!("line {n}: padding"))
        .collect();

    let rendered = format_full_output(0, &out_lines, &[], DEFAULT_MAX_FULL_OUTPUT_BYTES);

    let ceiling = 2 * 1024 * 1024 + 50; // 2MB + truncation message
    assert!(rendered.len() <= ceiling);
    assert!(rendered.contains("truncated at 2MB"));
}
840
/// Phase 5 PR-8 of #934: a caller-supplied `max_bytes` replaces the
/// historical 2 MB default. This pins two things about
/// `format_full_output`: the rendered output stays within the requested
/// cap (plus a small marker allowance), and the truncation marker names
/// the cap that was actually applied, so anyone reading the stored output
/// can tell where it stopped.
#[test]
fn format_full_output_honors_explicit_cap() {
    // A tight 4KB cap against 2000 generated lines guarantees truncation.
    let cap = 4 * 1024;
    let out_lines: Vec<String> = (0..2000)
        .map(|n| format!("line {n}: padding"))
        .collect();

    let rendered = format_full_output(0, &out_lines, &[], cap);

    let rendered_len = rendered.len();
    assert!(
        rendered_len <= cap + 50,
        "output {} should fit within cap {} + marker",
        rendered_len,
        cap
    );

    // Only used for the failure message: shows what the marker line looked like.
    let final_line = rendered.lines().last().unwrap_or("");
    assert!(
        rendered.contains("truncated at 4KB"),
        "truncation marker should name the actual cap, got: {}",
        final_line
    );
}
865
/// `format_byte_cap` renders whole-MB and whole-KB caps with a unit suffix
/// so the truncation marker stays readable for the common round values, and
/// falls back to an exact byte count for anything else (including zero) so
/// the marker is never lossy.
#[test]
fn format_byte_cap_renders_round_units() {
    // (cap in bytes, expected label)
    let cases = [
        (2 * 1024 * 1024, "2MB"),
        (4 * 1024, "4KB"),
        (1500, "1500 bytes"),
        (0, "0 bytes"),
    ];
    for (bytes, label) in cases {
        assert_eq!(format_byte_cap(bytes), label);
    }
}
878
/// Smoke test for `ShellOutput`: both fields hold the values they were
/// constructed with.
#[test]
fn test_shell_output_has_full_output() {
    let summary_text = "Exit code: 0";
    let full_text = "full output here";

    let output = ShellOutput {
        summary: summary_text.into(),
        full_output: Some(full_text.into()),
    };

    assert_eq!(output.summary, summary_text);
    assert_eq!(output.full_output.unwrap(), full_text);
}
889
890    #[tokio::test]
891    async fn collection_stops_at_max_lines() {
892        let tmp = tempfile::tempdir().unwrap();
893        // Generate 50 lines of output but cap collection at 10.
894        let args = serde_json::json!({
895            "command": "seq 1 50"
896        });
897        let result = run_shell_command(
898            tmp.path(),
899            &args,
900            10,
901            &bg(),
902            None,
903            &crate::trust::TrustMode::Safe,
904            &koda_sandbox::SandboxPolicy::strict_default(),
905            None,
906            None,
907            None,
908        )
909        .await
910        .unwrap();
911        // Summary should reflect that we only collected 10 lines.
912        assert!(
913            result.summary.contains("stdout: 10 lines"),
914            "Expected 10 collected lines, got: {}",
915            result.summary
916        );
917        // Full output should NOT contain lines beyond the cap.
918        let full = result.full_output.unwrap();
919        assert!(full.contains("1"), "Should contain first line");
920        assert!(!full.contains("\n50\n"), "Should NOT contain line 50");
921    }
922
/// An oversized `timeout` argument is clamped to `MAX_TIMEOUT_SECS`.
///
/// The computation below is a hand-written copy of the arg -> policy ->
/// default -> clamp chain that `run_shell_command` is expected to use; keep
/// the two in sync when either changes.
#[test]
fn test_timeout_capped_at_max() {
    let policy = koda_sandbox::SandboxPolicy::strict_default();
    let args = serde_json::json!({"command": "echo hi", "timeout": 99999});

    let requested = args["timeout"].as_u64();
    let unclamped = requested
        .or(policy.limits.wall_time_secs)
        .unwrap_or(DEFAULT_TIMEOUT_SECS);
    let t = unclamped.min(MAX_TIMEOUT_SECS);

    assert_eq!(t, MAX_TIMEOUT_SECS);
}
936
937    // Phase 5 PR-3 of #934: timeout precedence is
938    //   arg > policy.limits.wall_time_secs > DEFAULT_TIMEOUT_SECS
939    // Each rung is pinned by its own test so a regression in one
940    // (e.g. swapping arg and policy precedence — letting a per-agent
941    // policy override an explicit per-call deadline) names the bug.
942
/// Top rung of the timeout precedence chain: an explicit `timeout` argument
/// (42) is chosen over the policy's `wall_time_secs` (120).
#[test]
fn timeout_precedence_arg_beats_policy() {
    let mut policy = koda_sandbox::SandboxPolicy::strict_default();
    policy.limits.wall_time_secs = Some(120);
    let args = serde_json::json!({"command": "x", "timeout": 42});

    let requested = args["timeout"].as_u64();
    let unclamped = requested
        .or(policy.limits.wall_time_secs)
        .unwrap_or(DEFAULT_TIMEOUT_SECS);
    let t = unclamped.min(MAX_TIMEOUT_SECS);

    assert_eq!(
        t, 42,
        "explicit per-call timeout must win over policy default"
    );
}
958
/// Middle rung of the timeout precedence chain: with no `timeout` argument,
/// the policy's `wall_time_secs` (45) is used instead of
/// `DEFAULT_TIMEOUT_SECS`.
#[test]
fn timeout_precedence_policy_beats_legacy_default() {
    let mut policy = koda_sandbox::SandboxPolicy::strict_default();
    policy.limits.wall_time_secs = Some(45);
    // Deliberately no "timeout" key.
    let args = serde_json::json!({"command": "x"});

    let requested = args["timeout"].as_u64();
    let unclamped = requested
        .or(policy.limits.wall_time_secs)
        .unwrap_or(DEFAULT_TIMEOUT_SECS);
    let t = unclamped.min(MAX_TIMEOUT_SECS);

    assert_eq!(
        t, 45,
        "policy-supplied wall_time_secs must beat the legacy DEFAULT_TIMEOUT_SECS const"
    );
}
974
/// Bottom rung of the timeout precedence chain: with neither a `timeout`
/// argument nor a policy wall time, the result is `DEFAULT_TIMEOUT_SECS`.
#[test]
fn timeout_precedence_legacy_default_when_arg_and_policy_absent() {
    // The assertion relies on `strict_default()` leaving `wall_time_secs` unset.
    let policy = koda_sandbox::SandboxPolicy::strict_default();
    let args = serde_json::json!({"command": "x"});

    let requested = args["timeout"].as_u64();
    let unclamped = requested
        .or(policy.limits.wall_time_secs)
        .unwrap_or(DEFAULT_TIMEOUT_SECS);
    let t = unclamped.min(MAX_TIMEOUT_SECS);

    assert_eq!(t, DEFAULT_TIMEOUT_SECS, "fallback chain bottom rung");
}
986
/// The `MAX_TIMEOUT_SECS` ceiling applies to policy-supplied values as well
/// as explicit arguments.
///
/// Guards against a sub-agent being handed a generous wall-time policy and
/// thereby sidestepping the global DoS ceiling: every source of the timeout
/// must be clamped.
#[test]
fn timeout_max_ceiling_clamps_policy_too() {
    let mut policy = koda_sandbox::SandboxPolicy::strict_default();
    policy.limits.wall_time_secs = Some(99_999);
    let args = serde_json::json!({"command": "x"});

    let requested = args["timeout"].as_u64();
    let unclamped = requested
        .or(policy.limits.wall_time_secs)
        .unwrap_or(DEFAULT_TIMEOUT_SECS);
    let t = unclamped.min(MAX_TIMEOUT_SECS);

    assert_eq!(
        t, MAX_TIMEOUT_SECS,
        "policy can't widen its own deadline past the hard DoS ceiling"
    );
}
1005
1006    #[tokio::test]
1007    async fn streaming_emits_lines_to_sink() {
1008        use std::sync::{Arc, Mutex};
1009
1010        /// Collects ToolOutputLine events for testing.
1011        #[derive(Debug, Default)]
1012        struct CaptureSink {
1013            lines: Mutex<Vec<(String, bool)>>,
1014        }
1015        impl crate::engine::EngineSink for CaptureSink {
1016            fn emit(&self, event: EngineEvent) {
1017                if let EngineEvent::ToolOutputLine {
1018                    line, is_stderr, ..
1019                } = event
1020                {
1021                    self.lines.lock().unwrap().push((line, is_stderr));
1022                }
1023            }
1024        }
1025
1026        let tmp = tempfile::tempdir().unwrap();
1027        let sink = Arc::new(CaptureSink::default());
1028        let args = serde_json::json!({
1029            "command": "echo alpha && echo bravo && echo charlie >&2"
1030        });
1031        let result = run_shell_command(
1032            tmp.path(),
1033            &args,
1034            256,
1035            &bg(),
1036            Some((sink.as_ref(), "test_id")),
1037            &crate::trust::TrustMode::Safe,
1038            &koda_sandbox::SandboxPolicy::strict_default(),
1039            None,
1040            None,
1041            None,
1042        )
1043        .await
1044        .unwrap();
1045
1046        // Summary should contain the output
1047        assert!(result.summary.contains("alpha"));
1048        assert!(result.summary.contains("bravo"));
1049        assert!(result.summary.contains("charlie"));
1050
1051        // Full output should contain everything
1052        let full = result.full_output.unwrap();
1053        assert!(full.contains("alpha"));
1054        assert!(full.contains("bravo"));
1055        assert!(full.contains("charlie"));
1056
1057        // Streaming lines should have been emitted
1058        let lines = sink.lines.lock().unwrap();
1059        assert!(
1060            lines.len() >= 3,
1061            "Expected at least 3 streamed lines, got {}: {lines:?}",
1062            lines.len()
1063        );
1064
1065        // At least one stdout and one stderr line
1066        assert!(lines.iter().any(|(_, is_stderr)| !is_stderr));
1067        assert!(lines.iter().any(|(_, is_stderr)| *is_stderr));
1068    }
1069
1070    // ── Phase 1 of #934: violation annotation ────────────────────────────
1071
/// A command that exited with code 0 is never annotated, even if its stderr
/// looks like a denial.
///
/// Without this rule, something like `find / 2>/dev/null` would be flagged
/// every time it walks into an unreadable directory.
#[test]
fn annotate_violations_skips_when_exit_code_zero() {
    let mut stderr = vec![String::from("touch: /etc/x: Permission denied")];

    annotate_violations(0, "find /", &mut stderr);

    assert_eq!(stderr.len(), 1, "no extra lines should be appended");
    let tag_free = stderr
        .iter()
        .all(|line| !line.contains("<sandbox_violations>"));
    assert!(tag_free);
}
1082
/// A failed command whose stderr reports "Operation not permitted" gets a
/// `<sandbox_violations>` block appended: an opening tag, a
/// `deny file-write*` line naming the path, and a closing tag.
#[test]
fn annotate_violations_appends_block_on_real_denial() {
    let mut stderr = vec![String::from(
        "touch: /etc/passwd: Operation not permitted",
    )];

    annotate_violations(1, "touch /etc/passwd", &mut stderr);

    let has_open_tag = stderr.iter().any(|l| l == "<sandbox_violations>");
    let has_violation = stderr
        .iter()
        .any(|l| l.contains("deny file-write* /etc/passwd"));
    let has_close_tag = stderr.iter().any(|l| l == "</sandbox_violations>");

    assert!(has_open_tag, "open tag must be appended: {stderr:?}");
    assert!(has_violation, "violation line must be appended: {stderr:?}");
    assert!(has_close_tag, "close tag must be appended: {stderr:?}");
}
1102
/// A non-zero exit code alone does not trigger annotation: stderr that does
/// not look like a sandbox denial (here, a compiler error from a failed
/// `cargo build`) is left untouched.
#[test]
fn annotate_violations_noop_when_no_denial_markers() {
    let mut stderr = vec![String::from("error[E0277]: trait bound not satisfied")];

    annotate_violations(101, "cargo build", &mut stderr);

    let line_count = stderr.len();
    assert_eq!(line_count, 1, "no annotation expected: {stderr:?}");
}
1111
/// Phase 1 acceptance criterion of #934: a real sandboxed bash command
/// that touches ~/.ssh returns annotated stderr.
///
/// Goes through the full `run_shell_command` pipeline — sandbox build,
/// process spawn, stream capture, kernel denial, stderr parse,
/// `<sandbox_violations>` annotation. This is the proof-of-life test
/// for the whole Phase 1 stack.
///
/// The test is skipped (with a message on stderr) when `$HOME/.ssh` does
/// not exist. It never leaves its canary file behind: a stale canary is
/// removed before the run, and a freshly created one is removed before any
/// assertion can panic.
#[cfg(target_os = "macos")]
#[tokio::test]
async fn run_shell_command_annotates_ssh_write_denial() {
    use serde_json::json;

    let home = std::env::var("HOME").unwrap_or_else(|_| "/Users/test".into());
    let ssh_dir = format!("{home}/.ssh");
    if !std::path::Path::new(&ssh_dir).exists() {
        eprintln!("skip: {ssh_dir} does not exist");
        return;
    }
    let project = tempfile::tempdir().unwrap();
    let canary = format!("{ssh_dir}/koda_phase1_annotation_canary");

    // A canary left over from an earlier failed run would make the final
    // assertion fail forever; start from a clean slate. Errors (typically
    // "not found") are irrelevant here.
    let _ = std::fs::remove_file(&canary);

    let outcome = run_shell_command(
        project.path(),
        &json!({"command": format!("touch {canary}")}),
        500,
        &bg(),
        None,
        &crate::trust::TrustMode::Safe,
        &koda_sandbox::SandboxPolicy::strict_default(),
        None,
        None,
        None,
    )
    .await;

    // Record whether the sandbox let the write through, then clean up
    // BEFORE asserting anything, so a regression never leaves a stray file
    // in the user's ~/.ssh.
    let canary_created = std::path::Path::new(&canary).exists();
    if canary_created {
        let _ = std::fs::remove_file(&canary);
    }

    let result = outcome.expect("run_shell_command should succeed even when child fails");

    let full = result.full_output.expect("full output expected");
    assert!(
        full.contains("<sandbox_violations>"),
        "missing open tag in full output:\n{full}"
    );
    assert!(
        full.contains("deny file-write*"),
        "missing violation kind in full output:\n{full}"
    );
    assert!(
        full.contains("</sandbox_violations>"),
        "missing close tag in full output:\n{full}"
    );
    // Acceptance: file was not created (sandbox actually enforced).
    assert!(
        !canary_created,
        "canary file should NOT have been created: {canary}"
    );
}
1167}