// std/agent/judge_internals — private helpers shared by `std/agent/judge`
// (verify_completion + done_judge) and `std/agent/step_judge`. NOT
// intended as a public surface: every export is prefixed `__judge_` and
// the file is excluded from the docs sweep. Keep shared judge plumbing here:
// LLM-option overrides, structured-output schema field names, and the
// "raw_verdict -> {vetoed, feedback}" classifier.
/**
 * Keys on `judge_cfg` that override `opts.llm_options` when present.
 *
 * `__judge_apply_llm_overrides` walks this list in order and copies every
 * key whose value in `judge_cfg` is not `nil` onto the LLM options dict.
 * Listed once here so adding (or renaming) an LLM tuning knob is a
 * single-line change instead of a sweep over every call site.
 */
const __JUDGE_LLM_OVERRIDE_KEYS = ["temperature", "max_tokens", "top_p", "tool_format", "reasoning_effort"]
/**
 * Layer per-judge LLM settings over an already-built `llm_opts` dict.
 *
 * For every key named in `__JUDGE_LLM_OVERRIDE_KEYS`, a non-nil value in
 * `judge_cfg` replaces (or adds) the entry of the same name in the
 * result; keys that are `nil` in `judge_cfg` leave `llm_opts` untouched.
 * This replaces the `for key in [...]` block that lived inline at both
 * call sites before v0.8.43.
 *
 * Returns the merged dict.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 */
pub fn __judge_apply_llm_overrides(llm_opts, judge_cfg) {
  var merged = llm_opts
  for knob in __JUDGE_LLM_OVERRIDE_KEYS {
    let value = judge_cfg[knob]
    if value != nil {
      // Right-hand dict wins on merge, so the judge's setting takes precedence.
      merged = merged + {[knob]: value}
    }
  }
  return merged
}
/**
 * JSON structural characters (double quote, comma, braces, colon and
 * backslash) can never be part of a legitimate verdict token, so a
 * captured verdict containing one was mangled upstream.
 *
 * `__judge_verdict_token` searches a verdict for each entry and uses the
 * earliest match as the point where the real token ends.
 */
const __JUDGE_VERDICT_JSON_JUNK = ["\"", ",", "{", "}", ":", "\\"]
/**
 * Normalize a captured judge verdict to its leading token. Structured
 * judges occasionally emit sloppy JSON (double commas, run-on key/value
 * pairs) that the structured-call repair layer salvages by capturing
 * JSON junk into the verdict string — observed live in `judge_decision`
 * events as `continue",,` and `continue", "reasoning":`.
 *
 * Steps:
 *   1. nil-coalesce to "", stringify, trim and lowercase;
 *   2. skip any JSON structural characters (and the whitespace between
 *      them) that sit *in front of* the token, so a quoted `"done"`
 *      yields `done` instead of collapsing to the empty string;
 *   3. cut at the first remaining JSON structural character and trim.
 *
 * Verdicts without JSON junk (including multi-word prose verdicts) pass
 * through unchanged apart from the trim/lowercase in step 1.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 * @example: __judge_verdict_token("continue\",,")
 */
pub fn __judge_verdict_token(raw_verdict) {
  var normalized = lowercase(trim(to_string(raw_verdict ?? "")))
  // Peel junk off the front. Without this a verdict that merely starts
  // with a quote or brace would be cut at index 0 and become "", which
  // can never match a pass token. Each pass removes at least one
  // character, so the loop terminates.
  var stripping = true
  while stripping {
    stripping = false
    for junk in __JUDGE_VERDICT_JSON_JUNK {
      if normalized.index_of(junk) == 0 {
        normalized = trim(normalized[len(junk):len(normalized)])
        stripping = true
      }
    }
  }
  // Find the earliest junk character after the token starts.
  var cut = len(normalized)
  for junk in __JUDGE_VERDICT_JSON_JUNK {
    let idx = normalized.index_of(junk)
    if idx >= 0 && idx < cut {
      cut = idx
    }
  }
  if cut == len(normalized) {
    // No junk left: the (already trimmed) verdict is returned as-is.
    return normalized
  }
  return trim(normalized[0:cut])
}
/**
 * Punctuation that cheap models hang off the leading verdict word
 * (`"done."`, `"done!"`, `"pass:"`, `"(done)"`, quotes/asterisks from
 * markdown emphasis).
 *
 * The string is used as a character set: `__judge_leading_word` tests one
 * character at a time against it and strips matching characters from
 * both ends of the leading token, so the order of characters here is
 * irrelevant.
 */
const __JUDGE_TOKEN_PUNCT = ".,;:!?\"'`*_-)("
/**
 * Reduce a normalized verdict to its leading whitespace-delimited word with
 * surrounding punctuation stripped. Cheap models decorate enum values
 * (`"done."`, `"yes, complete"`, `"done — all tests pass"`); the verdict's
 * *intent* is its first word, so classification keys on that word instead of
 * requiring whole-string equality. `"not done yet"` reduces to `"not"`
 * (still a veto), while `"done."` reduces to `"done"` (a pass).
 *
 * Spaces, tabs and newlines all end the word, so a multi-line verdict
 * such as `"done\nall tests pass"` reduces to `"done"`. Returns `""`
 * when the input holds no word, or only punctuation.
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 * @example: __judge_leading_word("done.")
 */
pub fn __judge_leading_word(normalized) {
  // First space-separated piece that is not blank.
  var first = ""
  for piece in normalized.split(" ") {
    if first == "" && trim(piece) != "" {
      first = trim(piece)
    }
  }
  // `split(" ")` only breaks on spaces; also end the word at the first
  // tab or newline so a line break after the verdict does not glue the
  // next line's text onto it. The trailing trim drops a stray carriage
  // return left behind by a CRLF line ending.
  var word_end = len(first)
  for ws in ["\t", "\n"] {
    let idx = first.index_of(ws)
    if idx >= 0 && idx < word_end {
      word_end = idx
    }
  }
  first = trim(first[0:word_end])
  // Strip leading punctuation.
  var start = 0
  while start < len(first) && __JUDGE_TOKEN_PUNCT.contains(first.char_at(start)) {
    start = start + 1
  }
  // Strip trailing punctuation; `stop > start` keeps the two scans from
  // crossing when the word is punctuation only.
  var stop = len(first)
  while stop > start && __JUDGE_TOKEN_PUNCT.contains(first.char_at(stop - 1)) {
    stop = stop - 1
  }
  return first[start:stop]
}
/**
 * Decide whether a judge's raw verdict is a pass or a veto and build the
 * outcome dict.
 *
 * The raw verdict is first normalized by `__judge_verdict_token` and then
 * reduced to its leading word by `__judge_leading_word`; that word is
 * compared against `pass_tokens` and is also what the returned `verdict`
 * field carries.
 *
 *   - word is in `pass_tokens`  -> `{vetoed: false, verdict}`
 *   - otherwise                 -> `{vetoed: true, feedback, verdict}`
 *
 * On a veto, `feedback` is the stringified first entry of
 * `feedback_candidates` that is neither `nil` nor the empty string;
 * when no entry qualifies, `feedback_default` is used as given.
 *
 * Used by `agent_step_judge` (pass tokens like "pass"/"yes"/"approve") and
 * by `agent_verify_or_continue` (pass tokens like "done"/"complete").
 *
 * @effects: []
 * @errors: []
 * @api_stability: internal
 * @example: __judge_classify_verdict("pass", ["pass", "yes"], [critique], default)
 */
pub fn __judge_classify_verdict(raw_verdict, pass_tokens, feedback_candidates, feedback_default) {
  let token = __judge_verdict_token(raw_verdict)
  let word = __judge_leading_word(token)
  if contains(pass_tokens, word) {
    return {vetoed: false, verdict: word}
  }
  // Veto path: pick the first usable feedback string.
  var chosen = ""
  for entry in feedback_candidates {
    if chosen == "" && entry != nil {
      let text = to_string(entry)
      if text != "" {
        chosen = text
      }
    }
  }
  if chosen == "" {
    chosen = feedback_default
  }
  return {vetoed: true, feedback: chosen, verdict: word}
}