harn-stdlib 0.8.163

// @harn-entrypoint-category llm.stdlib
//
// std/llm/safe — DRY consolidations for envelope-shaped llm_call results,
// case-insensitive dict access, and judge-payload reconstruction.
import { agent_session_messages } from "std/agent/state"

/**
 * Run llm_call inside a try block and report the outcome as an envelope
 * instead of letting the failure propagate:
 * {ok: true, value: <llm dict>} when the call succeeds
 * {ok: false, status: <"budget_exhausted" | "exception">, error} when it fails.
 *
 * The status is "budget_exhausted" only when the error is a dict whose
 * `reason` is "budget_exceeded"; every other failure maps to "exception".
 * The original error value is always carried in `error`.
 *
 * @effects: [llm.call]
 * @errors: []
 */
pub fn safe_call(prompt, system, options) {
  let outcome = try {
    llm_call(prompt, system, options)
  }
  if is_err(outcome) {
    let failure = unwrap_err(outcome)
    // Only dict-shaped errors can carry a machine-readable reason.
    var failure_reason = ""
    if type_of(failure) == "dict" {
      failure_reason = failure?.reason ?? ""
    }
    let failure_status = if failure_reason == "budget_exceeded" {
      "budget_exhausted"
    } else {
      "exception"
    }
    return {ok: false, status: failure_status, error: failure}
  }
  return {ok: true, value: unwrap(outcome)}
}

/**
 * Look up `key` in `d` ignoring letter case. The requested key and every dict
 * key are stringified and lowercased before comparison; the value stored under
 * the first matching key (in `d.keys()` order) is returned. Returns nil when
 * `d` is not a dict or when no key matches.
 *
 * @effects: []
 * @errors: []
 */
pub fn dict_get_ci(d, key) {
  if type_of(d) == "dict" {
    let wanted = lowercase(to_string(key))
    for candidate in d.keys() {
      let folded = lowercase(to_string(candidate))
      if folded == wanted {
        return d[candidate]
      }
    }
  }
  return nil
}

/**
 * Report whether `value` counts as "present". nil, the empty string, an empty
 * list, and a dict without keys are absent; every other value (including 0
 * and false) is present.
 */
fn __value_is_present(value) {
  if value == nil {
    return false
  }
  let value_type = type_of(value)
  if value_type == "string" {
    return value != ""
  } else if value_type == "list" {
    return len(value) > 0
  } else if value_type == "dict" {
    return len(value.keys()) > 0
  }
  return true
}

/**
 * Case-insensitive lookup over the top-level keys of `envelope`. Each entry of
 * `names` is tried in order via dict_get_ci; the first value that is neither
 * nil nor empty (empty string, empty list, empty dict) is returned. Falls back
 * to `default` when nothing qualifies, when `envelope` is not a dict, or when
 * `names` is not a list. Nested keys are never searched.
 *
 * @effects: []
 * @errors: []
 */
pub fn safe_field(envelope, names, default) {
  let usable = type_of(envelope) == "dict" && type_of(names) == "list"
  if !usable {
    return default
  }
  for candidate in names {
    let found = dict_get_ci(envelope, candidate)
    if __value_is_present(found) {
      return found
    }
  }
  return default
}

/**
 * Return a copy of `envelope` in which every dict key, at any depth, has been
 * stringified and lowercased. Lists are rebuilt element by element with the
 * same normalization applied to each element, so dicts nested inside lists are
 * normalized too. Any other value is returned as-is. Idempotent.
 *
 * @effects: []
 * @errors: []
 */
pub fn with_case_insensitive_keys(envelope) {
  let kind = type_of(envelope)
  if kind == "list" {
    var items = []
    for element in envelope {
      items = items.push(with_case_insensitive_keys(element))
    }
    return items
  }
  if kind != "dict" {
    return envelope
  }
  var lowered = {}
  for original_key in envelope.keys() {
    let lowered_key = lowercase(to_string(original_key))
    let lowered_value = with_case_insensitive_keys(envelope[original_key])
    lowered = lowered + {[lowered_key]: lowered_value}
  }
  return lowered
}

/**
 * Merge `defaults` UNDER a structured envelope.
 *
 * - `envelope` is nil or not a dict: returns the defaults with `ok: false`.
 * - `envelope.ok` is missing or falsy: returns the defaults overlaid by the
 *   envelope's own keys, with `ok` forced to false.
 * - `envelope.ok` is truthy: returns the envelope with `data` replaced by the
 *   defaults overlaid by `envelope.data` (the envelope wins per-key; a
 *   non-dict `data` is treated as empty).
 *
 * A non-dict `defaults` is treated as {}. On the failure paths `ok: false` is
 * merged LAST, so a stray `ok` key inside `defaults` can never make a failed
 * envelope look successful.
 *
 * @effects: []
 * @errors: []
 */
pub fn structured_envelope_or_default(envelope, defaults) {
  let base = if type_of(defaults) == "dict" {
    defaults
  } else {
    {}
  }
  if envelope == nil || type_of(envelope) != "dict" {
    return base + {ok: false}
  }
  if !(envelope?.ok ?? false) {
    // The right-hand operand wins in a dict merge, so the failure flag goes
    // last to override both the defaults and a nil/falsy `ok` on the envelope.
    return base + envelope + {ok: false}
  }
  let data = if type_of(envelope?.data) == "dict" {
    envelope.data
  } else {
    {}
  }
  return envelope + {data: base + data}
}

/**
 * Return `names` with empty strings dropped and duplicates removed, keeping
 * the first occurrence of each name in its original relative order.
 */
fn __unique_names(names) {
  var kept = []
  for candidate in names {
    if !(candidate == "" || contains(kept, candidate)) {
      kept = kept.push(candidate)
    }
  }
  return kept
}

/**
 * Collect the `name` of every message whose `role` is "tool" (a missing name
 * is recorded as "") and return the result of __unique_names over that list,
 * i.e. the distinct non-empty tool names in first-seen order.
 */
fn __session_tool_names(messages) {
  var collected = []
  for entry in messages {
    let is_tool_message = entry?.role == "tool"
    if is_tool_message {
      let tool_name = entry?.name ?? ""
      collected = collected.push(tool_name)
    }
  }
  return __unique_names(collected)
}

/**
 * Assemble the canonical judge payload. Mirrors agent/judge.__judge_payload
 * but can be called outside agent_loop (e.g. parallel_judge in ensemble).
 *
 * The transcript is loaded with agent_session_messages when the session has a
 * non-empty `session_id`; a missing id or a failed load yields an empty
 * message list rather than an error. `_opts` is accepted but not read.
 *
 * Returns {session_id, task, stop_reason, text, visible_text, last_text,
 * transcript, all_tools_used, successful_tools_used, iteration}. As written,
 * `text`, `visible_text` and `last_text` all carry the `text` argument, and
 * `all_tools_used` and `successful_tools_used` carry the same comma-separated
 * list of tool names found in the transcript.
 *
 * @effects: []
 * @errors: []
 */
pub fn judge_payload(session, _opts, stop_reason, text, iteration) {
  let sid = session?.session_id ?? ""
  var history = []
  if sid != "" {
    let fetched = try {
      agent_session_messages(sid)
    }
    // Best effort: a failed transcript load leaves the history empty.
    if !is_err(fetched) {
      history = unwrap(fetched)
    }
  }
  let tools_used = join(__session_tool_names(history), ", ")
  return {
    session_id: sid,
    task: session?.task ?? "",
    stop_reason: stop_reason,
    text: text,
    visible_text: text,
    last_text: text,
    transcript: json_stringify(history),
    all_tools_used: tools_used,
    successful_tools_used: tools_used,
    iteration: iteration,
  }
}

/**
 * Normalize a judge verdict string. The input is stringified (nil becomes ""),
 * trimmed and lowercased. When `alias_groups` is a list of
 * {canonical, aliases} dicts, the first group whose aliases contain the
 * normalized text yields that group's `canonical`; otherwise the normalized
 * text itself is returned.
 *
 * String aliases are trimmed and lowercased before comparison, so an alias
 * written as "Yes" or " ok " still matches (the input is always compared in
 * its normalized form). Groups that are not dicts, have an empty `canonical`,
 * or have a non-list `aliases` are skipped.
 *
 * @effects: []
 * @errors: []
 */
pub fn verdict_normalize(text, alias_groups) {
  let normalized = lowercase(trim(to_string(text ?? "")))
  if type_of(alias_groups) != "list" {
    return normalized
  }
  for group in alias_groups {
    // Skip malformed entries instead of failing the whole normalization.
    if type_of(group) == "dict" {
      let canonical = group?.canonical ?? ""
      let aliases = group?.aliases ?? []
      if canonical != "" && type_of(aliases) == "list" {
        for alias in aliases {
          // Normalize only strings; other values are compared verbatim.
          let candidate = if type_of(alias) == "string" {
            lowercase(trim(alias))
          } else {
            alias
          }
          if candidate == normalized {
            return canonical
          }
        }
      }
    }
  }
  return normalized
}

/**
 * Produce a deterministic schema-retry nudge from a JSON Schema. The message
 * tells the model to re-emit JSON only (lowercase keys, no markdown fences, no
 * prose) and lists the schema's `required` keys in sorted order, or states
 * that none are declared when `schema` is not a dict or has no `required`
 * list. A non-nil, non-empty `hint` is appended as a final "Hint: ..." line.
 *
 * @effects: []
 * @errors: []
 */
pub fn schema_retry_nudge_for(schema, hint) {
  var required_keys = []
  if type_of(schema) == "dict" && type_of(schema?.required) == "list" {
    required_keys = schema.required
  }
  let ordered = required_keys.sort()
  var required_sentence = "No required keys are declared."
  if len(ordered) > 0 {
    required_sentence = "Required keys (lowercase): " + join(ordered, ", ") + "."
  }
  let body = [
    "Your previous response did not pass schema validation.",
    "Re-emit valid JSON only:",
    "- Use lowercase keys exactly as specified.",
    "- Do not wrap in markdown fences.",
    "- Do not include prose, commentary, or trailing text.",
    required_sentence,
  ]
  let hint_text = if hint == nil {
    ""
  } else {
    to_string(hint)
  }
  if hint_text == "" {
    return join(body, "\n")
  }
  return join(body.push("Hint: " + hint_text), "\n")
}

/**
 * One-shot structured-output preset: applies judge-friendly option defaults,
 * delegates to `llm_call_structured_result` (the structured base caller, which
 * owns the schema retries and the repair pass), and augments the resulting
 * envelope.
 *
 * Meant to replace the recurring 80-120 LOC structured-output dance in judges
 * and analyzers. The result is the canonical envelope from
 * `llm_call_structured_result` with `ok` already populated, plus:
 *   value  -> `data` with every key recursively lowercased ({} when `data` is
 *             not a dict)
 *   status -> "ok", "context_overflow", the envelope's `error_category`, or
 *             "error"
 * Callers dispatch on `result.ok` and read structured fields off
 * `result.value`. A non-dict `options` is treated as {}.
 *
 * Conceptually equivalent to the structured-output preset
 *
 *     compose([with_coerce({})])(__structured_caller(schema))
 *
 * with `__apply_judge_defaults` baking in judge-friendly options before the
 * call. This function is the canonical preset; it lives in `std/llm/safe`
 * rather than `std/llm/handlers` because callers consume it as a one-shot,
 * not as a caller-seam middleware.
 *
 * Defaults applied when the corresponding option is unset:
 *   temperature      -> 0.0
 *   schema_retries   -> 2
 *   repair.enabled   -> true
 *   repair.max_tokens-> floored reasoning-awarely: `max(600, structured floor)`
 *                       (600 baseline on a non-reasoning route; reasoning budget
 *                       + verdict headroom on a reasoning route)
 *   repair.temperature -> 0.0
 *   schema_retry_nudge -> derived via `schema_retry_nudge_for`
 *   max_tokens       -> floored to `STRUCTURED_MIN_MAX_TOKENS` (512) on a
 *                       non-reasoning route, higher on a reasoning route (see
 *                       `structured_min_max_tokens`)
 *
 * The `max_tokens` FLOOR is a measurement-integrity guard, not a tuning knob.
 * A judge/router structured call sent with a tiny budget (e.g. the historical
 * `max_tokens: 180` completion-judge default) truncates mid-object on a
 * reasoning model, which spends part of the same output budget on its
 * reasoning channel. The outcome is unparseable JSON and a silent dead-judge
 * fall-through. Flooring (never lowering) the budget keeps the JSON bounded by
 * the schema yet large enough to finish. Provider-generic: this function never
 * inspects the provider or model.
 *
 * NOTE(review): `@effects` is declared empty although this delegates to
 * `llm_call_structured_result`, while `safe_call` declares `[llm.call]` for
 * `llm_call`. Confirm whether the annotation should match.
 *
 * @effects: []
 * @errors: []
 */
pub fn safe_structured_call(prompt, schema, options) {
  var caller_options = {}
  if type_of(options) == "dict" {
    caller_options = options
  }
  let call_options = __apply_judge_defaults(schema, caller_options)
  let raw_envelope = llm_call_structured_result(prompt, schema, call_options)
  return __augment_envelope(raw_envelope)
}

/**
 * Minimum output-token budget for a structured judge/router call on a
 * NON-reasoning route. A call that goes out under this floor risks truncating
 * its JSON mid-object, which masquerades as a dead-judge abstention. The floor
 * only RAISES an unset or too-small budget; an explicit larger `max_tokens`
 * (e.g. a 1200-token rubric judge) is left untouched.
 *
 * `structured_min_max_tokens` returns this value when no reasoning budget
 * resolves, and also uses it as the lower clamp of the reasoning-aware floor.
 */
let STRUCTURED_MIN_MAX_TOKENS = 512

/**
 * Minimum output-token budget for the structured REPAIR side-call on a
 * NON-reasoning route. The repair pass re-emits the verdict JSON after a
 * schema-validation failure; historically it ran with a flat 600-token budget.
 * That baseline is kept (never lowered), so the reasoning-aware floor only
 * changes anything on a reasoning route. Non-reasoning callers keep their
 * exact behavior, while the reasoning-route starvation twin of the #3598
 * judge bug is fixed.
 *
 * `repair_min_max_tokens` returns the larger of this value and the structured
 * floor.
 */
let REPAIR_MIN_MAX_TOKENS = 600

/**
 * Headroom (output tokens) reserved for the visible JSON verdict ON TOP of a
 * reasoning model's hidden reasoning spend. A judge/router verdict object is
 * small, but it must finish AFTER the reasoning channel has drained part of the
 * shared `max_tokens` budget, so the floor reserves this much beyond the
 * resolved reasoning budget.
 *
 * `structured_min_max_tokens` adds this value to the resolved reasoning budget.
 */
let STRUCTURED_REASONING_VERDICT_HEADROOM = 768

/**
 * Hard ceiling for the reasoning-aware structured floor, mirroring the Rust
 * `MAX_TOKENS_RETRY_CEILING` so a high-effort reasoning route never inflates a
 * single judge call without bound.
 *
 * `structured_min_max_tokens` never returns a floor above this value.
 */
let STRUCTURED_MAX_MAX_TOKENS = 32768

/**
 * Approximate reasoning-channel output budget (tokens) that Harn's reasoning
 * policy allocates for an effort level. `level` is stringified (nil becomes
 * "") and handed to the `llm_reasoning_effort_budget` builtin, which is backed
 * by the canonical `budget_for_reasoning_level` in
 * `crates/harn-vm/src/llm/reasoning_policy.rs`. That builtin is the single
 * source of truth: this helper does NOT re-hardcode the level -> token
 * numbers. Used only as a fallback when the resolved `thinking` dict carries
 * no explicit `budget_tokens` (e.g. an `effort`-mode route, which exposes
 * `{mode, level}` but no budget).
 */
fn __reasoning_effort_budget(level) {
  let level_name = to_string(level ?? "")
  return llm_reasoning_effort_budget(level_name)
}

/**
 * Resolve the reasoning-channel output budget (tokens) for a structured call's
 * options, or 0 when reasoning is disabled / absent. Provider-aware: it asks
 * Harn's own reasoning policy (`llm_apply_reasoning_policy`) to resolve the
 * route's `thinking`; if that call fails, the caller's options are used as-is.
 *
 * Resolution order:
 *   - no `thinking` dict: an explicit `reasoning_effort` other than
 *     "off" / "none" / "disabled" is mapped to its effort budget; otherwise 0.
 *   - `thinking.mode` is "disabled": 0.
 *   - positive `thinking.budget_tokens` (Anthropic-style numeric budget): that
 *     number.
 *   - `thinking.mode` is "effort" (gpt-oss/Harmony, o-series): the budget for
 *     `thinking.level`.
 *   - anything else (`enabled` / `adaptive` without a numeric budget): the
 *     "medium" budget.
 *
 * A non-dict `options` is treated as {}.
 */
fn __structured_reasoning_budget(options) {
  var opts = {}
  if type_of(options) == "dict" {
    opts = options
  }
  let policy_result = try {
    llm_apply_reasoning_policy(opts)
  }
  var effective = opts
  if !is_err(policy_result) {
    effective = unwrap(policy_result)
  }
  let thinking = effective?.thinking
  if type_of(thinking) == "dict" {
    let mode = to_string(thinking?.mode ?? "")
    if mode == "disabled" {
      return 0
    }
    let budget_tokens = to_int(thinking?.budget_tokens ?? 0) ?? 0
    if budget_tokens > 0 {
      return budget_tokens
    }
    // No numeric budget: `effort` mode maps its own level, while
    // `enabled`/`adaptive` use the medium fallback so a reasoning route still
    // clears the verdict-starvation floor.
    let fallback_level = if mode == "effort" {
      thinking?.level
    } else {
      "medium"
    }
    return __reasoning_effort_budget(fallback_level)
  }
  // Caller set reasoning_effort explicitly -> policy returns opts unchanged
  // (no `thinking`), so fall back to the caller's effort level.
  let requested_effort = opts?.reasoning_effort
  if requested_effort == nil {
    return 0
  }
  let effort = to_string(requested_effort)
  let switched_off = effort == "off" || effort == "none" || effort == "disabled"
  if switched_off {
    return 0
  }
  return __reasoning_effort_budget(effort)
}

/**
 * Reasoning-aware minimum output-token budget for a structured judge/router
 * call. On a non-reasoning route (resolved reasoning budget <= 0) this is the
 * flat `STRUCTURED_MIN_MAX_TOKENS`. On a reasoning route the hidden reasoning
 * channel shares the same `max_tokens` budget as the visible JSON, so a flat
 * floor can be consumed entirely by reasoning, re-truncating the verdict to
 * empty. The floor therefore reserves the resolved reasoning budget PLUS
 * `STRUCTURED_REASONING_VERDICT_HEADROOM` for the verdict, clamped to the range
 * [`STRUCTURED_MIN_MAX_TOKENS`, `STRUCTURED_MAX_MAX_TOKENS`]. Generalizable: it
 * scales with the route's actual reasoning effort rather than hardcoding a
 * per-model value.
 *
 * @effects: []
 * @errors: []
 */
pub fn structured_min_max_tokens(options) {
  let reasoning_tokens = __structured_reasoning_budget(options)
  if reasoning_tokens <= 0 {
    return STRUCTURED_MIN_MAX_TOKENS
  }
  let reserved = reasoning_tokens + STRUCTURED_REASONING_VERDICT_HEADROOM
  let clamped = if reserved > STRUCTURED_MAX_MAX_TOKENS {
    STRUCTURED_MAX_MAX_TOKENS
  } else if reserved < STRUCTURED_MIN_MAX_TOKENS {
    STRUCTURED_MIN_MAX_TOKENS
  } else {
    reserved
  }
  return clamped
}

/**
 * Reasoning-aware minimum output-token budget for the structured REPAIR
 * side-call. The repair pass re-emits the verdict JSON on the SAME route after
 * a schema-validation failure, so it shares the reasoning channel's
 * `max_tokens` budget exactly like the main call. A flat 600-token repair
 * floor is the twin of the pre-#3598 judge starvation: on a reasoning route
 * the hidden analysis channel drains the shared budget and truncates the
 * repaired JSON to empty, producing a silent dead-repair.
 *
 * Returns `max(REPAIR_MIN_MAX_TOKENS, structured floor)`: a non-reasoning
 * route keeps the historical 600 baseline (unchanged) while a reasoning route
 * is raised to reasoning_budget + verdict headroom. The structured floor comes
 * from `structured_min_max_tokens`, so the repair floor never drifts from the
 * judge floor. Only ever RAISES; an explicit larger repair budget is left
 * untouched by callers of this floor.
 *
 * @effects: []
 * @errors: []
 */
pub fn repair_min_max_tokens(options) {
  let structured_floor = structured_min_max_tokens(options)
  let chosen = if structured_floor > REPAIR_MIN_MAX_TOKENS {
    structured_floor
  } else {
    REPAIR_MIN_MAX_TOKENS
  }
  return chosen
}

/**
 * Return `options` with the judge-friendly defaults filled in:
 *   - `max_tokens` raised to `structured_min_max_tokens(options)` when unset
 *     or smaller than that floor;
 *   - `temperature` 0.0, `schema_retries` 2 and a `schema_retry_nudge` derived
 *     from `schema`, each only when currently nil;
 *   - a `repair` dict (a non-dict `repair` is replaced by {}) with `enabled`
 *     true and `temperature` 0.0 when nil, and `max_tokens` raised to
 *     `repair_min_max_tokens` when unset or below that floor.
 * Caller-supplied values are never lowered.
 */
fn __apply_judge_defaults(schema, options) {
  var merged = options
  let wanted_tokens = to_int(merged?.max_tokens ?? 0) ?? 0
  let token_floor = structured_min_max_tokens(merged)
  if wanted_tokens < token_floor {
    merged = merged + {max_tokens: token_floor}
  }
  // Collect the scalar defaults that are still unset, then merge them at once.
  var missing = {}
  if merged?.temperature == nil {
    missing = missing + {temperature: 0.0}
  }
  if merged?.schema_retries == nil {
    missing = missing + {schema_retries: 2}
  }
  if merged?.schema_retry_nudge == nil {
    missing = missing + {schema_retry_nudge: schema_retry_nudge_for(schema, nil)}
  }
  merged = merged + missing
  var repair_options = {}
  if type_of(merged?.repair) == "dict" {
    repair_options = merged.repair
  }
  if repair_options?.enabled == nil {
    repair_options = repair_options + {enabled: true}
  }
  // Floor the repair budget reasoning-awarely (see `repair_min_max_tokens`).
  // A non-reasoning route keeps the historical 600 baseline. A reasoning route
  // gets reasoning_budget + verdict headroom so the repaired JSON verdict is
  // not starved by the route's hidden reasoning channel (twin of the #3598
  // judge bug). The floor is computed from the already-defaulted main options.
  // Only ever RAISES.
  let repair_token_floor = repair_min_max_tokens(merged)
  let wanted_repair_tokens = to_int(repair_options?.max_tokens ?? 0) ?? 0
  if wanted_repair_tokens < repair_token_floor {
    repair_options = repair_options + {max_tokens: repair_token_floor}
  }
  if repair_options?.temperature == nil {
    repair_options = repair_options + {temperature: 0.0}
  }
  return merged + {repair: repair_options}
}

/**
 * Single-shot context-overflow detection on a structured/llm error.
 *
 * A single-shot structured call (judge, router, classifier, cached lookup) has
 * NO live session transcript, so it cannot emergency-compact-and-retry the way
 * the agent loop does: there is nothing to mask. What it CAN do is surface the
 * condition TRANSPARENTLY, so a harness author sees "this call overflowed the
 * model's window" instead of a generic `error` that looks like a dead judge or
 * a flaky 400.
 *
 * Detection rules:
 *   - dict error: true when `reason` or `error_category` equals
 *     "context_overflow", or when the stringified `message` / `error` fields
 *     contain the `[context_overflow]` tag;
 *   - any other value: true when its string form contains the tag.
 * This mirrors the Rust classifier's `[context_overflow]` reason tag and the
 * structured `reason`/`error_category` fields, keeping detection consistent
 * with the agent-loop recovery path and provider-agnostic.
 *
 * @effects: []
 * @errors: []
 */
pub fn is_context_overflow_error(err) {
  let overflow_tag = "[context_overflow]"
  if type_of(err) != "dict" {
    return contains(to_string(err), overflow_tag)
  }
  let reason = to_string(err?.reason ?? "")
  let category = to_string(err?.error_category ?? "")
  if reason == "context_overflow" || category == "context_overflow" {
    return true
  }
  let haystack = to_string(err?.message ?? "") + " " + to_string(err?.error ?? "")
  return contains(haystack, overflow_tag)
}

/**
 * Add `value` and `status` to a structured-call envelope.
 *
 * A non-dict envelope is wrapped as {ok: false, status, value: {}, error},
 * where status is "context_overflow" when the value is recognized as an
 * overflow error and "exception" otherwise. For a dict envelope, `value` is
 * `data` with all keys lowercased ({} when `data` is not a dict) and `status`
 * is "ok" when `ok` is truthy, else "context_overflow", else the stringified
 * `error_category` when present, else "error".
 */
fn __augment_envelope(envelope) {
  if type_of(envelope) == "dict" {
    let succeeded = envelope?.ok ?? false
    var payload = {}
    if type_of(envelope?.data) == "dict" {
      payload = envelope.data
    }
    let lowered_payload = with_case_insensitive_keys(payload)
    let outcome = if succeeded {
      "ok"
    } else if is_context_overflow_error(envelope) {
      "context_overflow"
    } else if envelope?.error_category != nil {
      to_string(envelope?.error_category)
    } else {
      "error"
    }
    return envelope + {value: lowered_payload, status: outcome}
  }
  // Anything that is not a dict is treated as a raw failure value.
  let overflowed = is_context_overflow_error(envelope)
  let failure_status = if overflowed {
    "context_overflow"
  } else {
    "exception"
  }
  return {ok: false, status: failure_status, value: {}, error: envelope}
}
// `__augment_envelope` reports overflow as a distinct, machine-readable
// `context_overflow` status so callers/harness authors can SEE that a
// no-session single-shot call hit the context window (vs. a generic 400 or a
// dead judge). The agent loop recovers overflow via compaction; this path
// cannot, but it must never be silent.