harn-stdlib 0.8.163

// std/agent/judge_internals — private helpers shared by `std/agent/judge`
// (verify_completion + done_judge) and `std/agent/step_judge`. NOT
// intended as a public surface: every export is prefixed `__judge_` and
// the file is excluded from the docs sweep. Keep shared judge plumbing here:
// LLM-option overrides, structured-output schema field names, and the
// "raw_verdict -> {vetoed, feedback}" classifier.
/**
 * Keys on `judge_cfg` that override `opts.llm_options` when present.
 * `__judge_apply_llm_overrides` walks this list in order and copies every
 * key whose `judge_cfg` value is non-nil. Listed once here so adding (or
 * renaming) an LLM tuning knob is a single-line change instead of an edit
 * at every call site that applies judge overrides.
 */
const __JUDGE_LLM_OVERRIDE_KEYS = ["temperature", "max_tokens", "top_p", "tool_format", "reasoning_effort"]

/**
 * Layer per-judge LLM tuning knobs over an already-built `llm_opts` dict.
 * Every key named in `__JUDGE_LLM_OVERRIDE_KEYS` whose `judge_cfg` value is
 * non-nil replaces (or adds) the same key in the result; keys that are nil
 * in `judge_cfg` leave `llm_opts` untouched. When no override applies the
 * input `llm_opts` value is returned as-is. Replaces the `for key in [...]`
 * block that lived inline at both call sites before v0.8.43.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 */
pub fn __judge_apply_llm_overrides(llm_opts, judge_cfg) {
  var merged = llm_opts
  for knob in __JUDGE_LLM_OVERRIDE_KEYS {
    let value = judge_cfg[knob]
    if value != nil {
      // Dict `+` merges right over left, so the judge-level value wins.
      merged = merged + {[knob]: value}
    }
  }
  return merged
}

/**
 * JSON structural characters (double quote, comma, braces, colon and
 * backslash) can never be part of a legitimate verdict token, so a
 * captured verdict containing one was mangled upstream.
 * `__judge_verdict_token` truncates the verdict at the earliest occurrence
 * of any entry in this list.
 */
const __JUDGE_VERDICT_JSON_JUNK = ["\"", ",", "{", "}", ":", "\\"]

/**
 * Reduce a captured judge verdict to the text that precedes any JSON junk.
 * Structured judges occasionally emit sloppy JSON (double commas, run-on
 * key/value pairs), and the structured-call repair layer salvages it by
 * capturing the trailing junk into the verdict string — observed live in
 * `judge_decision` events as `continue",,` and `continue",  "reasoning":`.
 *
 * The raw value is coerced to a string (nil becomes ""), trimmed and
 * lowercased. If it contains any character from
 * `__JUDGE_VERDICT_JSON_JUNK`, everything from the earliest such character
 * onward is dropped and the remainder is trimmed again. Verdicts without
 * JSON junk (including multi-word prose verdicts) keep all of their
 * trimmed, lowercased text.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 * @example: __judge_verdict_token("continue\",,")
 */
pub fn __judge_verdict_token(raw_verdict) {
  let text = lowercase(trim(to_string(raw_verdict ?? "")))
  let full = len(text)
  // Position of the earliest junk character; `full` means "none found".
  var boundary = full
  for marker in __JUDGE_VERDICT_JSON_JUNK {
    let at = text.index_of(marker)
    if at >= 0 && at < boundary {
      boundary = at
    }
  }
  if boundary < full {
    return trim(text[0:boundary])
  }
  return text
}

/**
 * Punctuation that cheap models hang off the leading verdict word
 * (`"done."`, `"done!"`, `"pass:"`, `"(done)"`, quotes/asterisks from
 * markdown emphasis). Stripped from both ends of the leading token before
 * the allow-list compare so a decorated `done` still classifies as DONE.
 * Kept as a single string so membership is a per-character `.contains`
 * check. The ASCII hyphen is included; the em dash is not.
 */
const __JUDGE_TOKEN_PUNCT = ".,;:!?\"'`*_-)("

/**
 * Reduce a normalized verdict to its leading whitespace-delimited word with
 * surrounding punctuation stripped. Cheap models decorate enum values
 * (`"done."`, `"yes, complete"`, `"done — all tests pass"`); the verdict's
 * *intent* is its first word, so classification keys on that word instead of
 * requiring whole-string equality. `"not done yet"` reduces to `"not"`
 * (still a veto), while `"done."` reduces to `"done"` (a pass).
 *
 * Words are delimited by spaces, tabs, newlines and carriage returns, so a
 * verdict whose explanation starts on the next line (`"done.\nall tests
 * pass"`) still reduces to `"done"`. Returns "" when the input has no
 * non-blank piece or when the first word consists only of punctuation.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 * @example: __judge_leading_word("done.")
 */
pub fn __judge_leading_word(normalized) {
  // First non-blank space-separated piece, trimmed of outer whitespace.
  var first = ""
  for piece in normalized.split(" ") {
    if first == "" && trim(piece) != "" {
      first = trim(piece)
    }
  }
  // A space split alone leaves "done\nall" glued together; cut the piece at
  // the earliest interior tab / newline / carriage return. `first` is
  // already trimmed, so any such character sits after real content.
  var word_end = len(first)
  for separator in ["\t", "\n", "\r"] {
    let at = first.index_of(separator)
    if at >= 0 && at < word_end {
      word_end = at
    }
  }
  let word = first[0:word_end]
  // Strip leading punctuation.
  var start = 0
  while start < len(word) && __JUDGE_TOKEN_PUNCT.contains(word.char_at(start)) {
    start = start + 1
  }
  // Strip trailing punctuation; never crosses `start`, so an
  // all-punctuation word collapses to the empty string.
  var stop = len(word)
  while stop > start && __JUDGE_TOKEN_PUNCT.contains(word.char_at(stop - 1)) {
    stop = stop - 1
  }
  return word[start:stop]
}

/**
 * Turn a raw judge verdict into a `{vetoed, feedback?, verdict}` outcome.
 * The raw verdict is first cleaned by `__judge_verdict_token` and then
 * reduced to its leading word by `__judge_leading_word`, so a decorated
 * enum value (`"done."`, `"done — all tests pass"`) still classifies.
 * If that word is in `pass_tokens` the result is a pass with no
 * `feedback` key. Otherwise the result is a veto whose feedback is the
 * first entry of `feedback_candidates` that is non-nil and stringifies to
 * a non-empty string, or `feedback_default` when no entry qualifies.
 *
 * Used by `agent_step_judge` (pass tokens like "pass"/"yes"/"approve") and
 * by `agent_verify_or_continue` (pass tokens like "done"/"complete").
 * The stored/emitted `verdict` field carries the clean leading word.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 * @example: __judge_classify_verdict("pass", ["pass", "yes"], [critique], default)
 */
pub fn __judge_classify_verdict(raw_verdict, pass_tokens, feedback_candidates, feedback_default) {
  let token = __judge_verdict_token(raw_verdict)
  let word = __judge_leading_word(token)
  if contains(pass_tokens, word) {
    return {vetoed: false, verdict: word}
  }
  // Veto path: the first usable candidate wins; later ones are ignored.
  var chosen = ""
  for candidate in feedback_candidates {
    if chosen == "" && candidate != nil {
      let text = to_string(candidate)
      if text != "" {
        chosen = text
      }
    }
  }
  if chosen != "" {
    return {vetoed: true, feedback: chosen, verdict: word}
  }
  return {vetoed: true, feedback: feedback_default, verdict: word}
}