harn-stdlib 0.8.51

// std/agent/command_capture
//
// Preserve the full, unfiltered output of `producer | cheap-filter`
// shell pipelines so an agent never has to re-run a slow command just to
// see what the trailing filter discarded.
//
// When an agent runs `slow_cmd | tail -5`, the *shell* applies `tail`
// before Harn's command runner ever sees the bytes, so only the 5
// surviving lines are captured and the full `slow_cmd` output is lost.
// This module recognizes that shape and rewrites it to
//
//     slow_cmd | tee '<capture>' 2>/dev/null | tail -5
//
// `tee` is transparent (the filter sees identical bytes, the exit status
// is unchanged) and the `2>/dev/null` + POSIX "tee keeps copying to stdout
// even when a file operand fails to open" guarantee means the rewrite can
// never break the agent's command. After the run we attach an
// `output_capture` field pointing at the captured file — but only if it
// actually materialized, so a sandbox that blocks the temp write produces
// no misleading hint.
//
// Everything here is a pure, conservative, bail-preferring recognizer:
// it rewrites ONLY a clean single pipeline (optionally behind a verbatim
// `cd … && ` / `… ; ` prefix) and refuses the moment it sees anything it
// cannot reason about (command/process substitution, here-docs, subshell
// grouping, background jobs, unbalanced quotes, …). False negatives are
// always preferred over false positives.

// -------------------------------------------------------------------------------------------------
// Filter allowlist
// -------------------------------------------------------------------------------------------------

/**
 * Allowlist of trailing filters that never stop reading early: each one
 * drains its stdin to EOF, so a `tee` spliced in directly upstream records
 * the producer's COMPLETE output.
 *
 * Early-terminating filters (`head`, `grep -m/-q/-l`, `sed …q`) are left
 * out on purpose. With them `tee` would only observe a partial stream, and
 * an agent that picks such a filter usually wants the slow producer to be
 * cut short — so we respect that intent instead of capturing.
 */
fn __cc_consume_all_filters() {
  let filters = [
    "tail", "wc", "sort", "uniq",
    "cat", "nl", "tac", "rev",
    "column", "fold", "fmt", "cut", "tr",
    "grep", "egrep", "fgrep",
    "md5sum", "sha1sum", "sha256sum", "shasum",
    "base64", "od", "hexdump", "xxd",
  ]
  return filters
}

// Linear membership test: true when at least one element of `items`
// compares equal to `needle`, false otherwise (including for an empty list).
fn __cc_list_has(items, needle) {
  var found = false
  for candidate in items {
    if !found && candidate == needle {
      found = true
    }
  }
  return found
}

// Drop any directory part of a command word: returns the text after the
// last `/`, or `word` unchanged when it contains no slash.
fn __cc_basename(word) {
  let cut = word.last_index_of("/")
  let name = if cut < 0 {
    word
  } else {
    word.substring(cut + 1, word.len())
  }
  return name
}

// First non-blank, space-separated word of `segment` (trimmed), or "" when
// the segment holds nothing but whitespace.
fn __cc_first_token(segment) {
  let pieces = segment.split(" ")
  var idx = 0
  while idx < len(pieces) {
    let word = pieces[idx].trim()
    if word != "" {
      return word
    }
    idx = idx + 1
  }
  return ""
}

/**
 * Report whether a grep / egrep / fgrep invocation carries a flag that
 * makes it stop reading early (and SIGPIPE the producer): `-m` /
 * `--max-count`, `-q` / `--quiet` / `--silent`, `-l` /
 * `--files-with-matches`, `-L` / `--files-without-match`.
 *
 * We over-match on purpose: a false bail is harmless, a missed
 * short-circuit would make the "full output" hint a lie. To that end the
 * scan
 *   - splits on tabs as well as spaces (the shell treats both as blanks),
 *   - ignores quote characters and backslashes inside a word, because the
 *     shell strips them before grep sees it (`"-q"`, `\-l`),
 *   - accepts any long option starting with `--m`, `--q`, `--s` or
 *     `--files-w`, which covers the abbreviated spellings getopt_long
 *     accepts (`--quie`, `--max=1`, …).
 *
 * Returns true when any word of `segment` looks like such a flag.
 */
fn __cc_grep_short_circuits(segment) {
  let words = segment.replace("\t", " ").split(" ")
  for raw in words {
    // Compare against the word as grep would receive it, minus quoting.
    let token = raw.trim().replace("\"", "").replace("'", "").replace("\\", "")
    if token != "" {
      if token.starts_with("--m")
        || token.starts_with("--q")
        || token.starts_with("--s")
        || token.starts_with("--files-w") {
        return true
      }
      // Short-option cluster such as `-q`, `-iq`, `-m1`.
      if token.starts_with("-") && !token.starts_with("--") && token.len() > 1 {
        let cluster = token.substring(1, token.len())
        if cluster.contains("q") || cluster.contains("m") || cluster.contains("l")
          || cluster.contains("L") {
          return true
        }
      }
    }
  }
  return false
}

// Decide whether the text after the final pipe is a filter we may splice a
// `tee` in front of: its command word must be on the consume-all allowlist
// and, for the grep family, must not carry a short-circuiting flag.
fn __cc_filter_allowed(filter) {
  let lead = __cc_first_token(filter)
  // An empty filter, or a leading `VAR=val` assignment / env prefix, is
  // unusual in this slot; refuse rather than guess.
  if lead == "" || lead.contains("=") {
    return false
  }
  let cmd = __cc_basename(lead).lowercase()
  if !__cc_list_has(__cc_consume_all_filters(), cmd) {
    return false
  }
  let is_grep = cmd == "grep" || cmd == "egrep" || cmd == "fgrep"
  return !(is_grep && __cc_grep_short_circuits(filter))
}

// -------------------------------------------------------------------------------------------------
// Conservative top-level scanner
// -------------------------------------------------------------------------------------------------

/**
 * Blunt pre-check: bail outright if the command contains any construct we
 * refuse to reason about. Over-bails when these appear inside single
 * quotes (where they would be literal) — which is safe.
 *
 * Rejected substrings:
 *   - `$(`, backtick, `<(`, `>(`  command / process substitution
 *   - `<<`                         here-docs and here-strings
 *   - newline / carriage return    multi-line scripts
 *   - `|&`, `&>`                   stderr-merging pipe / redirect shorthands
 *   - ` tee `, `| tee`, `|tee`     the command already tees (with or
 *                                  without a space after the pipe)
 *   - `pipefail`                   with pipefail the pipeline status
 *                                  reflects every stage, so a `tee` that
 *                                  cannot open its capture file would turn
 *                                  a passing command into a failing one
 *
 * Returns true when the command must not be rewritten.
 */
fn __cc_hazardous(command) {
  return command.contains("$(")
    || command.contains("`")
    || command.contains("<(")
    || command.contains(">(")
    || command.contains("<<")
    || command.contains("\n")
    || command.contains("\r")
    || command.contains("|&")
    || command.contains("&>")
    || command.contains(" tee ")
    || command.contains("| tee")
    || command.contains("|tee")
    || command.contains("pipefail")
}

/**
 * Walk the command tracking single/double quote and backslash-escape state,
 * recording top-level statement separators (`;`, `&&`, `||`) and pipes
 * (`|`) as `{pos, len, kind}` entries, where `kind` is "stmt" or "pipe".
 *
 * Redirection operators that merely contain `&` or `|` are left verbatim
 * inside their segment and produce no cut: `>&` / `<&` (fd duplication)
 * and `>|` (noclobber override). Only a genuine, unquoted and unescaped
 * `>` / `<` immediately before the `&` / `|` counts, so `\>&` is still
 * seen as a background job.
 *
 * Returns nil to bail on a subshell group `(`, a background `&`, a
 * `|&` pipe, or unbalanced quotes; otherwise the list of cuts in source
 * order (possibly empty).
 */
fn __cc_scan_cuts(command) {
  let n = command.len()
  var i = 0
  var squote = false
  var dquote = false
  // Index of the latest top-level `>` or `<`. Starts at -2 so that it can
  // never equal `i - 1` for the first character.
  var redir_at = -2
  var cuts = []
  while i < n {
    let ch = command.char_at(i)
    if squote {
      // Inside '...': everything is literal until the closing quote.
      if ch == "'" {
        squote = false
      }
      i = i + 1
    } else if dquote {
      if ch == "\\" {
        // Skip the escaped character (correct for locating the close quote).
        i = i + 2
      } else if ch == "\"" {
        dquote = false
        i = i + 1
      } else {
        i = i + 1
      }
    } else if ch == "\\" {
      // Top-level escape: the next character is literal, never an operator.
      i = i + 2
    } else if ch == "'" {
      squote = true
      i = i + 1
    } else if ch == "\"" {
      dquote = true
      i = i + 1
    } else if ch == "(" {
      return nil
    } else if ch == ";" {
      cuts = cuts.push({pos: i, len: 1, kind: "stmt"})
      i = i + 1
    } else if ch == "&" {
      if command.char_at(i + 1) == "&" {
        cuts = cuts.push({pos: i, len: 2, kind: "stmt"})
        i = i + 2
      } else if redir_at == i - 1 {
        // fd-duplication redirection (e.g. `2>&1`, `>&2`), not a
        // background job — leave it verbatim inside the segment.
        i = i + 1
      } else {
        return nil
      }
    } else if ch == "|" {
      let next = command.char_at(i + 1)
      if redir_at == i - 1 && command.char_at(i - 1) == ">" {
        // `>|` is the noclobber-override redirection, not a pipe; cutting
        // here would split `cmd >| file` into a bogus producer/filter.
        i = i + 1
      } else if next == "|" {
        cuts = cuts.push({pos: i, len: 2, kind: "stmt"})
        i = i + 2
      } else if next == "&" {
        return nil
      } else {
        cuts = cuts.push({pos: i, len: 1, kind: "pipe"})
        i = i + 1
      }
    } else {
      if ch == ">" || ch == "<" {
        redir_at = i
      }
      i = i + 1
    }
  }
  if squote || dquote {
    return nil
  }
  return cuts
}

/**
 * Recognize a rewritable `producer | consume-all-filter` pipeline.
 *
 * Returns `{prefix, producer, filter}` where `prefix` is the verbatim
 * leading text up to and including the last top-level statement separator
 * (so `cd foo && cmd | tail` keeps `cd foo && `), or nil when the command
 * is not a safe rewrite target.
 *
 * Bails (nil) when: the command is nil or blank; it contains a hazardous
 * construct; the scanner refuses it; the last statement has no top-level
 * pipe; either side of that pipe is empty; the producer side starts with a
 * compound-command closer (`}`, `done`, `fi`, `esac`), meaning the real
 * producer is a brace group / loop / conditional that began in the prefix
 * and cannot be named as a single command; or the filter is not on the
 * consume-all allowlist.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_detect("seq 100 | tail -3")
 */
pub fn command_capture_detect(command) {
  if command == nil {
    return nil
  }
  if command.trim() == "" {
    return nil
  }
  if __cc_hazardous(command) {
    return nil
  }
  let cuts = __cc_scan_cuts(command)
  if cuts == nil {
    return nil
  }
  // Start of the last top-level statement.
  var start = 0
  for cut in cuts {
    if cut.kind == "stmt" {
      start = cut.pos + cut.len
    }
  }
  // Last top-level pipe within that statement.
  var pipe_pos = -1
  for cut in cuts {
    if cut.kind == "pipe" && cut.pos >= start {
      pipe_pos = cut.pos
    }
  }
  if pipe_pos < 0 {
    return nil
  }
  let n = command.len()
  let prefix = command.substring(0, start)
  let producer = command.substring(start, pipe_pos).trim()
  let filter = command.substring(pipe_pos + 1, n).trim()
  if producer == "" || filter == "" {
    return nil
  }
  // `for …; do …; done | sort` or `{ a; b; } | tail` would otherwise report
  // `done` / `}` as the producer: the pipeline's real left-hand side is a
  // compound command split across the prefix. That is not the clean single
  // pipeline this recognizer promises, so refuse.
  let lead = __cc_first_token(producer)
  if lead == "}" || lead == "done" || lead == "fi" || lead == "esac" {
    return nil
  }
  if !__cc_filter_allowed(filter) {
    return nil
  }
  return {prefix: prefix, producer: producer, filter: filter}
}

// -------------------------------------------------------------------------------------------------
// Rewrite + plan
// -------------------------------------------------------------------------------------------------

/**
 * Quote `value` for POSIX sh: wrap it in single quotes and rewrite every
 * embedded single quote as `'\''` (close, escaped quote, reopen).
 */
fn __cc_shquote(value) {
  let escaped = value.replace("'", "'\\''")
  return "'" + escaped + "'"
}

/**
 * Build a capture plan for a raw shell command and a chosen capture path.
 *
 * Returns `{rewritten, producer, filter, capture_path}` (the `rewritten`
 * command inserts `| tee '<capture_path>' 2>/dev/null |` before the final
 * filter) or nil when the command is not a safe rewrite target or no
 * capture path was supplied.
 *
 * A `capture_path` that begins with `-` is handed to `tee` as
 * `./<capture_path>` so it cannot be parsed as an option; the returned
 * `capture_path` field is always the caller's original value.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_plan("seq 100 | tail -3", "/tmp/out")
 */
pub fn command_capture_plan(command, capture_path) {
  // Without a path there is nothing to tee into (and quoting nil would fail).
  if capture_path == nil {
    return nil
  }
  let detected = command_capture_detect(command)
  if detected == nil {
    return nil
  }
  // Re-attach the verbatim prefix with exactly one space before the producer.
  let head = if detected.prefix.trim() == "" {
    ""
  } else {
    detected.prefix.trim_end() + " "
  }
  // Quoting protects against word splitting but not against option parsing.
  let operand = if capture_path.starts_with("-") {
    "./" + capture_path
  } else {
    capture_path
  }
  let rewritten = head + detected.producer + " | tee " + __cc_shquote(operand)
    + " 2>/dev/null | "
    + detected.filter
  return {
    rewritten: rewritten,
    producer: detected.producer,
    filter: detected.filter,
    capture_path: capture_path,
  }
}

/**
 * Shorthand for callers that only need the rewritten command text: builds
 * the capture plan and returns its `rewritten` string, or nil when no plan
 * could be built.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_rewrite("ls | wc -l", "/tmp/out")
 */
pub fn command_capture_rewrite(command, capture_path) {
  let plan = command_capture_plan(command, capture_path)
  if plan == nil {
    return nil
  }
  return plan.rewritten
}

// Fresh capture file path: `harn-capture-<uuid>.out` inside the harness
// temp directory. Only the path is built here; nothing is created on disk.
fn __cc_new_capture_path() {
  let dir = harness.fs.temp_dir()
  let file_name = "harn-capture-" + uuid() + ".out"
  return path_join(dir, file_name)
}

// True when `word` (a bare name or a path, any letter case) names one of
// the POSIX-style shells whose `-c` script we know how to rewrite.
fn __cc_posix_shell(word) {
  let shell = __cc_basename(word).lowercase()
  for known in ["sh", "bash", "zsh", "dash", "ksh"] {
    if shell == known {
      return true
    }
  }
  return false
}

/**
 * Plan a capture rewrite for a `run_command` request dict.
 *
 * Two request shapes are understood:
 *   - `{mode: "shell", command}` — rewritten unless `platform()` reports
 *     "windows";
 *   - `{mode: "argv", argv: [sh|bash|…, "-c", cmd, …]}` — only `argv[2]`
 *     (the script) is rewritten, every other element is copied through.
 *
 * Returns `{run_request, plan}`, where `run_request` is the original
 * request with its command / argv replaced and `plan` targets a freshly
 * generated temp path, or nil when no safe rewrite applies.
 *
 * @effects: []
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_plan_request({mode: "shell", command: "ls | wc -l"})
 */
pub fn command_capture_plan_request(request) {
  if request == nil {
    return nil
  }
  let mode = request?.mode
  if mode == "shell" {
    if platform() == "windows" {
      return nil
    }
    let capture_path = __cc_new_capture_path()
    let plan = command_capture_plan(request?.command ?? "", capture_path)
    if plan == nil {
      return nil
    }
    return {run_request: request + {command: plan.rewritten}, plan: plan}
  }
  if mode != "argv" {
    return nil
  }
  let argv = request?.argv ?? []
  // Only the exact `<shell> -c <script> …` shape is recognized.
  let wraps_shell = len(argv) >= 3 && argv[1] == "-c" && __cc_posix_shell(argv[0])
  if !wraps_shell {
    return nil
  }
  let capture_path = __cc_new_capture_path()
  let plan = command_capture_plan(argv[2], capture_path)
  if plan == nil {
    return nil
  }
  // Copy argv, swapping in the rewritten script at index 2.
  var patched = []
  var idx = 0
  for arg in argv {
    let slot = if idx == 2 {
      plan.rewritten
    } else {
      arg
    }
    patched = patched.push(slot)
    idx = idx + 1
  }
  return {run_request: request + {argv: patched}, plan: plan}
}

/**
 * Decorate a command result with an `output_capture` entry that tells the
 * agent where the producer's full output was saved.
 *
 * The result is returned untouched when `result` or `plan` is nil, when
 * the plan has no `capture_path`, or when that file does not exist — so a
 * capture that never materialized (e.g. a blocked temp write) yields no
 * misleading hint. Otherwise the result gains
 * `output_capture: {full_output_path, producer, filter, hint}`.
 *
 * @effects: [fs]
 * @allocation: heap
 * @errors: []
 * @api_stability: experimental
 * @example: command_capture_annotate(result, plan)
 */
pub fn command_capture_annotate(result, plan) {
  if result == nil || plan == nil {
    return result
  }
  let path = plan?.capture_path
  // The existence probe only runs once we know there is a path to probe.
  if path == nil || !harness.fs.exists(path) {
    return result
  }
  let producer = plan.producer
  let hint = "Filtered output is shown above. The full, unfiltered output of `"
    + producer
    + "` was captured to "
    + path
    + " — read it (read_command_output {path: \""
    + path
    + "\"} or read_file) instead of re-running the command."
  let capture = {full_output_path: path, producer: producer, filter: plan.filter, hint: hint}
  return result + {output_capture: capture}
}