// std/agent/command_capture
//
// Preserve the full, unfiltered output of `producer | cheap-filter`
// shell pipelines so an agent never has to re-run a slow command just to
// see what the trailing filter discarded.
//
// When an agent runs `slow_cmd | tail -5`, the *shell* applies `tail`
// before Harn's command runner ever sees the bytes, so only the 5
// surviving lines are captured and the full `slow_cmd` output is lost.
// This module recognizes that shape and rewrites it to
//
// slow_cmd | tee '<capture>' 2>/dev/null | tail -5
//
// `tee` is transparent (the filter sees identical bytes, the exit status
// is unchanged) and the `2>/dev/null` + POSIX "tee keeps copying to stdout
// even when a file operand fails to open" guarantee means the rewrite can
// never break the agent's command. After the run we attach an
// `output_capture` field pointing at the captured file — but only if it
// actually materialized, so a sandbox that blocks the temp write produces
// no misleading hint.
//
// Everything here is a pure, conservative, bail-preferring recognizer:
// it rewrites ONLY a clean single pipeline (optionally behind a verbatim
// `cd … && ` / `… ; ` prefix) and refuses the moment it sees anything it
// cannot reason about (command/process substitution, here-docs, subshell
// grouping, background jobs, unbalanced quotes, …). False negatives are
// always preferred over false positives.
// -------------------------------------------------------------------------------------------------
// Filter allowlist
// -------------------------------------------------------------------------------------------------
/**
 * Allowlist of trailing filters that drain their whole stdin before they
 * finish, so a `tee` spliced in directly upstream records the producer's
 * COMPLETE output. Early-exit filters (`head`, `grep -m/-q/-l`,
 * `sed …q`) are left out on purpose: `tee` would only ever see a partial
 * stream, and the agent picked them precisely to cut a slow producer
 * short — honouring that intent is the safe choice.
 *
 * Returns a fresh list of bare command names (no paths, no arguments).
 */
fn __cc_consume_all_filters() {
  let allowlist = [
    "tail", "wc", "sort", "uniq", "cat", "nl", "tac", "rev",
    "column", "fold", "fmt", "cut", "tr",
    "grep", "egrep", "fgrep",
    "md5sum", "sha1sum", "sha256sum", "shasum",
    "base64", "od", "hexdump", "xxd",
  ]
  return allowlist
}
/** Linear membership test: true when some element of `items` equals `needle`. */
fn __cc_list_has(items, needle) {
  var idx = 0
  while idx < len(items) {
    if items[idx] == needle {
      return true
    }
    idx = idx + 1
  }
  return false
}
/**
 * Return the text after the last `/` in `word`, or `word` itself when it
 * contains no slash.
 */
fn __cc_basename(word) {
  let cut = word.last_index_of("/")
  if cut >= 0 {
    return word.substring(cut + 1, word.len())
  }
  return word
}
/**
 * Return the first non-blank, space-separated token of `segment`
 * (trimmed), or "" when every token is blank.
 */
fn __cc_first_token(segment) {
  let pieces = segment.split(" ")
  var idx = 0
  while idx < len(pieces) {
    let candidate = pieces[idx].trim()
    if candidate != "" {
      return candidate
    }
    idx = idx + 1
  }
  return ""
}
/**
 * Report whether a grep / egrep / fgrep invocation carries a flag that
 * makes it stop reading early (and SIGPIPE the producer): max-count,
 * quiet/silent, files-with-matches, files-without-match, or their short
 * forms `-m`, `-q`, `-l`, `-L`.
 *
 * `segment` is the whole filter text; it is split on single spaces and
 * every token is inspected, including the command word and any pattern.
 * We over-match on purpose: a false bail is harmless, a missed
 * short-circuit would make the "full output" hint a lie.
 *
 * Returns true when any token looks like a short-circuit flag.
 */
fn __cc_grep_short_circuits(segment) {
  for raw in segment.split(" ") {
    let token = raw.trim()
    if token.starts_with("--") {
      // GNU getopt_long accepts any unambiguous prefix of a long option
      // (`--max=1`, `--qui`, `--sil`, `--files-with-m`), so match by
      // prefix rather than by exact spelling. A bare `--` matches none.
      if token.starts_with("--m")
        || token.starts_with("--q")
        || token.starts_with("--s")
        || token.starts_with("--files-w") {
        return true
      }
    } else if token.starts_with("-") && token.len() > 1 {
      // Short-option cluster such as `-qi` or `-m1`: any of the
      // early-exit letters anywhere in it is enough to bail.
      let cluster = token.substring(1, token.len())
      if cluster.contains("q") || cluster.contains("m") || cluster.contains("l")
        || cluster.contains("L") {
        return true
      }
    }
  }
  return false
}
/**
 * od / hexdump / xxd stop reading stdin once a byte limit is reached
 * (`od -N` / `--read-bytes`, `hexdump -n` / `--length`, `xxd -l` /
 * `-len`), so with such a flag they are early-terminating filters, not
 * consume-all ones. `name` is the lower-cased command basename and
 * `filter` the whole filter text, split on single spaces. Over-matches
 * on purpose: a false bail is harmless.
 */
fn __cc_dump_stops_early(name, filter) {
  if name != "od" && name != "hexdump" && name != "xxd" {
    return false
  }
  for raw in filter.split(" ") {
    let token = raw.trim()
    if name == "xxd" {
      // xxd spells the limit `-l <len>` or `-len <len>`.
      if token.starts_with("-l") || token.starts_with("--l") {
        return true
      }
    } else if token.starts_with("--") {
      // Long options may be abbreviated, so match by prefix.
      if name == "od" && token.starts_with("--r") {
        return true
      }
      if name == "hexdump" && token.starts_with("--l") {
        return true
      }
    } else if token.starts_with("-") && token.len() > 1 {
      let cluster = token.substring(1, token.len())
      if name == "od" && cluster.contains("N") {
        return true
      }
      if name == "hexdump" && cluster.contains("n") {
        return true
      }
    }
  }
  return false
}
/**
 * Decide whether `filter` (the text after the last pipe) is a filter we
 * may put a `tee` in front of.
 *
 * The first space-separated token must be an allowlisted consume-all
 * command, compared by lower-cased basename, and the invocation must not
 * carry a flag that makes it stop reading early. Returns false for an
 * empty filter, for a first word containing `=`, for any command outside
 * the allowlist, for a short-circuiting grep, and for a byte-limited
 * od / hexdump / xxd.
 */
fn __cc_filter_allowed(filter) {
  let word = __cc_first_token(filter)
  if word == "" {
    return false
  }
  // A leading `VAR=val` assignment or env prefix is unusual in this slot;
  // refuse rather than guess.
  if word.contains("=") {
    return false
  }
  let name = __cc_basename(word).lowercase()
  if !__cc_list_has(__cc_consume_all_filters(), name) {
    return false
  }
  if name == "grep" || name == "egrep" || name == "fgrep" {
    if __cc_grep_short_circuits(filter) {
      return false
    }
  }
  // A byte-limited dump exits before the producer finishes, so the
  // capture would be partial and the "full output" hint untrue.
  if __cc_dump_stops_early(name, filter) {
    return false
  }
  return true
}
// -------------------------------------------------------------------------------------------------
// Conservative top-level scanner
// -------------------------------------------------------------------------------------------------
/**
 * Blunt pre-check: returns true when the command text contains any
 * construct we refuse to reason about, so the caller can bail before
 * scanning. Matching is plain substring search, so it also fires when a
 * pattern sits inside single quotes (where it would be literal) — an
 * over-bail, which is safe.
 *
 * Covered: command substitution (`$(`, backtick), process substitution
 * (`<(`, `>(`), here-docs / here-strings (`<<`), embedded newlines or
 * carriage returns, `|&` and `&>`, an existing `tee` stage, and any
 * mention of `pipefail`.
 */
fn __cc_hazardous(command) {
  // `|tee` (no space after the pipe) is the same shell construct as
  // `| tee`, so it is matched as well.
  //
  // Under `set -o pipefail` the pipeline's status includes the inserted
  // `tee`, which exits non-zero when it cannot open the capture file; the
  // rewrite could then turn a passing command into a failing one.
  return command.contains("$(")
    || command.contains("`")
    || command.contains("<(")
    || command.contains(">(")
    || command.contains("<<")
    || command.contains("\n")
    || command.contains("\r")
    || command.contains("|&")
    || command.contains("&>")
    || command.contains(" tee ")
    || command.contains("| tee")
    || command.contains("|tee")
    || command.contains("pipefail")
}
/**
 * Walk the command tracking single/double quote and backslash-escape
 * state, recording top-level statement separators (`;`, `&&`, `||`) and
 * pipes (`|`).
 *
 * Returns a list of `{pos, len, kind}` records in order of appearance,
 * where `kind` is "stmt" or "pipe", `pos` is the index of the operator's
 * first character and `len` its width. Returns nil to bail on a subshell
 * group `(`, a background `&`, a `|&` pipe, a `>|` redirection, or
 * unbalanced quotes.
 */
fn __cc_scan_cuts(command) {
  let n = command.len()
  var i = 0
  var squote = false
  var dquote = false
  var cuts = []
  while i < n {
    let ch = command.char_at(i)
    if squote {
      // Inside single quotes everything is literal up to the closing quote.
      if ch == "'" {
        squote = false
      }
      i = i + 1
    } else if dquote {
      if ch == "\\" {
        // Skip the escaped character (correct for locating the close quote).
        i = i + 2
      } else if ch == "\"" {
        dquote = false
        i = i + 1
      } else {
        i = i + 1
      }
    } else if ch == "\\" {
      // Unquoted backslash: the next character is literal, never an operator.
      i = i + 2
    } else if ch == "'" {
      squote = true
      i = i + 1
    } else if ch == "\"" {
      dquote = true
      i = i + 1
    } else if ch == "(" {
      return nil
    } else if ch == ";" {
      cuts = cuts.push({pos: i, len: 1, kind: "stmt"})
      i = i + 1
    } else if ch == "&" {
      if command.char_at(i + 1) == "&" {
        cuts = cuts.push({pos: i, len: 2, kind: "stmt"})
        i = i + 2
      } else if i > 0 && (command.char_at(i - 1) == ">" || command.char_at(i - 1) == "<") {
        // fd-duplication redirection (e.g. `2>&1`, `>&2`), not a
        // background job — leave it verbatim inside the segment.
        i = i + 1
      } else {
        return nil
      }
    } else if ch == "|" {
      if i > 0 && command.char_at(i - 1) == ">" {
        // `>|` is the noclobber-override redirection, not a pipe. Cutting
        // here would split `cmd >| file` into `cmd >` and `file` and yield
        // a rewrite that is a shell syntax error, so bail.
        return nil
      }
      let next = command.char_at(i + 1)
      if next == "|" {
        cuts = cuts.push({pos: i, len: 2, kind: "stmt"})
        i = i + 2
      } else if next == "&" {
        return nil
      } else {
        cuts = cuts.push({pos: i, len: 1, kind: "pipe"})
        i = i + 1
      }
    } else {
      i = i + 1
    }
  }
  if squote || dquote {
    return nil
  }
  return cuts
}
/**
 * Detect a `producer | consume-all-filter` pipeline that is safe to rewrite.
 *
 * On success the result is `{prefix, producer, filter}`: `prefix` is the
 * untouched leading text through the last top-level statement separator
 * (`cd foo && cmd | tail` yields `cd foo &&`), `producer` is the trimmed
 * text between that point and the last top-level pipe, and `filter` is
 * the trimmed text after that pipe. The result is nil for a nil or blank
 * command, for anything the hazard pre-check or the scanner rejects, when
 * the last statement has no pipe, when either side of the pipe is empty,
 * or when the filter is not allowed.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_detect("seq 100 | tail -3")
 */
pub fn command_capture_detect(command) {
  if command == nil {
    return nil
  }
  if command.trim() == "" {
    return nil
  }
  if __cc_hazardous(command) {
    return nil
  }
  let marks = __cc_scan_cuts(command)
  if marks == nil {
    return nil
  }
  // Single pass over the position-ordered marks: each statement separator
  // moves the statement start forward and forgets pipes seen before it,
  // so we end with the last pipe of the last statement.
  var stmt_start = 0
  var last_pipe = -1
  for mark in marks {
    if mark.kind == "stmt" {
      stmt_start = mark.pos + mark.len
      last_pipe = -1
    } else if mark.kind == "pipe" {
      last_pipe = mark.pos
    }
  }
  if last_pipe < 0 {
    return nil
  }
  let producer = command.substring(stmt_start, last_pipe).trim()
  let filter = command.substring(last_pipe + 1, command.len()).trim()
  if producer != "" && filter != "" {
    if __cc_filter_allowed(filter) {
      return {prefix: command.substring(0, stmt_start), producer: producer, filter: filter}
    }
  }
  return nil
}
// -------------------------------------------------------------------------------------------------
// Rewrite + plan
// -------------------------------------------------------------------------------------------------
/**
 * Wrap `value` in single quotes for POSIX sh. Each embedded single quote
 * becomes `'\''` (close the quote, emit an escaped quote, reopen).
 */
fn __cc_shquote(value) {
  let escaped = value.replace("'", "'\\''")
  return "'" + escaped + "'"
}
/**
 * Turn a raw shell command plus a chosen capture path into a capture plan.
 *
 * The result is `{rewritten, producer, filter, capture_path}`, where
 * `rewritten` splices `| tee '<capture_path>' 2>/dev/null |` in front of
 * the final filter and keeps any leading statement prefix (right-trimmed,
 * followed by one space). The result is nil when detection finds no safe
 * rewrite target.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_plan("seq 100 | tail -3", "/tmp/out")
 */
pub fn command_capture_plan(command, capture_path) {
  let found = command_capture_detect(command)
  if found == nil {
    return nil
  }
  var lead = ""
  if found.prefix.trim() != "" {
    lead = found.prefix.trim_end() + " "
  }
  let tee_stage = " | tee " + __cc_shquote(capture_path) + " 2>/dev/null | "
  return {
    rewritten: lead + found.producer + tee_stage + found.filter,
    producer: found.producer,
    filter: found.filter,
    capture_path: capture_path,
  }
}
/**
 * Shorthand for callers that only need the rewritten command string:
 * builds the capture plan and returns its `rewritten` field, or nil when
 * no plan applies.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_rewrite("ls | wc -l", "/tmp/out")
 */
pub fn command_capture_rewrite(command, capture_path) {
  let plan = command_capture_plan(command, capture_path)
  if plan == nil {
    return nil
  }
  return plan.rewritten
}
/**
 * Build a capture file path of the form `harn-capture-<uuid>.out` under
 * the harness temp directory. Only the path string is built here.
 */
fn __cc_new_capture_path() {
  let dir = harness.fs.temp_dir()
  let file_name = "harn-capture-" + uuid() + ".out"
  return path_join(dir, file_name)
}
/**
 * True when the lower-cased basename of `word` is one of the recognised
 * POSIX-style shells: sh, bash, zsh, dash or ksh.
 */
fn __cc_posix_shell(word) {
  let shell = __cc_basename(word).lowercase()
  return __cc_list_has(["sh", "bash", "zsh", "dash", "ksh"], shell)
}
/**
 * Plan a capture rewrite for a `run_command` request dict.
 *
 * Two request shapes are recognised: `{mode: "shell", command}` (skipped
 * when `platform()` reports "windows") and
 * `{mode: "argv", argv: [sh|bash|…, "-c", cmd, …]}`, where only the
 * script at index 2 is rewritten and every other argument is kept. The
 * result is `{run_request, plan}` — the request with its command or argv
 * replaced by a version that tees the producer's full output to a fresh
 * temp file — or nil when no safe rewrite applies.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_plan_request({mode: "shell", command: "ls | wc -l"})
 */
pub fn command_capture_plan_request(request) {
  if request == nil {
    return nil
  }
  let mode = request?.mode
  if mode == "shell" {
    if platform() == "windows" {
      return nil
    }
    let capture_path = __cc_new_capture_path()
    let plan = command_capture_plan(request?.command ?? "", capture_path)
    if plan == nil {
      return nil
    }
    return {run_request: request + {command: plan.rewritten}, plan: plan}
  }
  if mode != "argv" {
    return nil
  }
  let argv = request?.argv ?? []
  // Only the `<shell> -c <script> …` shape is understood.
  let wraps_script = len(argv) >= 3 && argv[1] == "-c" && __cc_posix_shell(argv[0])
  if !wraps_script {
    return nil
  }
  let capture_path = __cc_new_capture_path()
  let plan = command_capture_plan(argv[2], capture_path)
  if plan == nil {
    return nil
  }
  // Copy argv, swapping the script (slot 2) for the rewritten command.
  var patched = []
  var slot = 0
  for arg in argv {
    if slot == 2 {
      patched = patched.push(plan.rewritten)
    } else {
      patched = patched.push(arg)
    }
    slot = slot + 1
  }
  return {run_request: request + {argv: patched}, plan: plan}
}
/**
 * Add an `output_capture` entry to a command result so the agent knows
 * where the full output was saved. The entry holds `full_output_path`,
 * `producer`, `filter` and a human-readable `hint`. The result is
 * returned unchanged when either argument is nil, when the plan has no
 * capture path, or when the capture file does not exist — so a blocked
 * temp write never produces a misleading hint.
 *
 * @effects: [fs]
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_annotate(result, plan)
 */
pub fn command_capture_annotate(result, plan) {
  if result == nil || plan == nil {
    return result
  }
  let path = plan?.capture_path
  if path != nil {
    if harness.fs.exists(path) {
      let hint = "Filtered output is shown above. The full, unfiltered output of `"
        + plan.producer
        + "` was captured to "
        + path
        + " — read it (read_command_output {path: \""
        + path
        + "\"} or read_file) instead of re-running the command."
      let capture = {full_output_path: path, producer: plan.producer, filter: plan.filter, hint: hint}
      return result + {output_capture: capture}
    }
  }
  return result
}