harn-stdlib 0.8.20

// @harn-entrypoint-category llm.stdlib
//
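// std/llm/ensemble — multi-sample helpers (best_of_n, self_consistency,
// parallel_judge, debate, tree_of_thoughts). The sampling helpers (best_of_n,
// self_consistency) call safe_call from std/llm/safe, so a failed sample
// surfaces as an {ok: false, status} record rather than a throw. debate calls
// llm_call directly, and invalid options are reported with throw/require.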
//
// Citations:
//   - Wang et al. 2022 "Self-Consistency Improves Chain of Thought Reasoning"
//     (arxiv:2203.11171).
//   - Du et al. 2023 "Improving Factuality and Reasoning in Language Models
//     through Multiagent Debate" (arxiv:2305.14325).
//   - OpenAI best-of-n / reranking patterns (cookbook).
import { agent_emit_event } from "std/agent/state"
import { safe_call, with_case_insensitive_keys } from "std/llm/safe"

// Coerce `n` with to_int and pin the result inside [lo, hi]. The lower bound
// is checked first, matching the order callers have always observed.
fn __clamp_int(n, lo, hi) {
  let coerced = to_int(n)
  let clamped = if coerced < lo {
    lo
  } else if coerced > hi {
    hi
  } else {
    coerced
  }
  return clamped
}

// Resolve sampler options: `opts.sampler_opts` when it is a dict, otherwise
// the default {temperature: 1.0}.
fn __sampler_opts_for(opts) {
  let candidate = opts?.sampler_opts
  if type_of(candidate) != "dict" {
    return {temperature: 1.0}
  }
  return candidate
}

// Overlay `sampler_opts` on top of `base`. A non-dict `base` is treated as an
// empty dict, so the result is then just the sampler options.
fn __merge_call_opts(base, sampler_opts) {
  if type_of(base) != "dict" {
    return {} + sampler_opts
  }
  return base + sampler_opts
}

// -------------------------------------------------------------------------------------------------
// best_of_n
// -------------------------------------------------------------------------------------------------

// Draw one sample through safe_call and flatten the envelope into a record
// tagged with `index`. Failures keep the envelope's status/error and carry an
// empty `text`; successes carry the stringified `value.text` plus the value.
fn __sample_one(prompt, system, call_opts, index) {
  let result = safe_call(prompt, system, call_opts)
  let succeeded = result?.ok ?? false
  if succeeded {
    let body = to_string(result.value?.text ?? "")
    return {ok: true, index: index, text: body, value: result.value}
  }
  return {ok: false, index: index, status: result?.status, error: result?.error, text: ""}
}

// Produce `n` sample records (indices 0..n-1) using the merged call options.
// With `run_parallel` the samples are drawn via `parallel each`; otherwise
// they are drawn one after another in index order.
fn __collect_samples(prompt, system, base_opts, sampler_opts, n, run_parallel) {
  let merged_opts = __merge_call_opts(base_opts, sampler_opts)
  var slots = []
  var next = 0
  while next < n {
    slots = slots.push(next)
    next += 1
  }
  if run_parallel {
    return parallel each slots { slot ->
      __sample_one(prompt, system, merged_opts, slot)
    }
  }
  var collected = []
  for slot in slots {
    let sample = __sample_one(prompt, system, merged_opts, slot)
    collected = collected.push(sample)
  }
  return collected
}

// Attach a `reward` score to every successful sample. Without a reward
// closure the samples are returned untouched. Failed samples are dropped from
// the scored list, and a reward closure that throws scores the sample 0.0.
fn __reward_filter(samples, reward) {
  if reward == nil {
    return samples
  }
  var scored = []
  for sample in samples {
    if sample?.ok ?? false {
      let outcome = try {
        reward(sample.text)
      }
      let score = if is_err(outcome) {
        0.0
      } else {
        to_float(unwrap(outcome))
      }
      scored = scored.push(sample + {reward: score})
    }
  }
  return scored
}

// Ask an LLM judge to pick the best successful sample.
//
// Successful samples are rendered as "[<index>] <text>", where <index> is the
// sample's position in `samples`, so the judge's `best_index` refers to the
// full list rather than to the filtered one. `opts.judge_prompt`, when
// non-empty, replaces the default instruction; `opts.judge_system`,
// `opts.judge_opts` and `opts.judge_model` configure the call.
//
// Returns {ok: true, best_index, scores, reasoning} on success, or
// {ok: false, status, error?} when there is nothing to judge, the call
// throws, or the structured result reports failure.
fn __structured_judge(samples, opts) {
  var labelled = []
  let total = len(samples)
  var i = 0
  while i < total {
    let s = samples[i]
    if s?.ok ?? false {
      labelled = labelled.push("[" + to_string(i) + "] " + to_string(s.text))
    }
    i = i + 1
  }
  // Nothing succeeded, so there is nothing to show the judge.
  if len(labelled) == 0 {
    return {ok: false, status: "no_valid_samples"}
  }
  let prompt = if opts?.judge_prompt != nil && to_string(opts.judge_prompt) != "" {
    to_string(opts.judge_prompt) + "\n\n" + join(labelled, "\n\n")
  } else {
    "Given these candidate answers, pick the best one. Reply ONLY with"
      + " a JSON object {\"best_index\": <int>, \"scores\": [..],"
      + " \"reasoning\": \"<brief>\"}.\n\n"
      + join(labelled, "\n\n")
  }
  let system = opts?.judge_system ?? ""
  var jopts = if type_of(opts?.judge_opts) == "dict" {
    opts.judge_opts
  } else {
    {}
  }
  // An explicit judge_model overrides any model set inside judge_opts.
  if opts?.judge_model != nil && to_string(opts.judge_model) != "" {
    jopts = jopts + {model: to_string(opts.judge_model)}
  }
  // Only best_index is required; scores and reasoning are optional extras.
  let schema = {
    type: "object",
    properties: {
      best_index: {type: "integer"},
      scores: {type: "array", items: {type: "number"}},
      reasoning: {type: "string"},
    },
    required: ["best_index"],
  }
  let envelope = try {
    llm_call_structured_result(prompt, schema, jopts + {system: system})
  }
  if is_err(envelope) {
    return {ok: false, status: "exception", error: unwrap_err(envelope)}
  }
  let env = unwrap(envelope)
  if !(env?.ok ?? false) {
    return {ok: false, status: env?.status ?? "judge_failed", error: env?.error}
  }
  // Wrap the payload with with_case_insensitive_keys before reading fields.
  let data = with_case_insensitive_keys(env?.data ?? {})
  let best = to_int(data?.best_index ?? 0)
  return {
    ok: true,
    best_index: best,
    scores: data?.scores ?? [],
    reasoning: to_string(data?.reasoning ?? ""),
  }
}

// Turn a judge-supplied index into one that is safe to read from `samples`.
// The index is accepted only when it is non-nil, in range, and points at a
// successful sample; otherwise the first successful sample's index is used.
// Callers guarantee `valid` is non-empty.
fn __resolve_best_index(samples, valid, chosen_idx) {
  if chosen_idx != nil {
    if chosen_idx >= 0 && chosen_idx < len(samples) {
      if samples[chosen_idx]?.ok ?? false {
        return chosen_idx
      }
    }
  }
  return valid[0].index
}

/**
 * best_of_n(prompt, system, opts) -> dict
 *
 * Sample N candidates (in parallel by default) and ask a judge to pick the
 * best.
 *
 * Options:
 *   - n: int (default 5; clamped to [2, 32])
 *   - sampler_opts: dict (default {temperature: 1.0})
 *   - call_opts: dict of base call options; sampler_opts is merged on top
 *   - judge: "structured" | closure (default "structured"). A closure
 *     receives the list of successful samples (each carries its original
 *     `index`) and returns {best_index, score?, reasoning?}; best_index is
 *     read as an index into the full candidates list.
 *   - judge_opts, judge_model, judge_prompt, judge_system: optional
 *   - parallel: bool (default true)
 *   - reward: closure(text) -> float (optional). Scored samples are only
 *     surfaced as `reward_filtered` when the structured judge fails.
 *
 * Returns: {ok, best: {text, index, score?}, candidates, judge, reasoning},
 * plus `scores` for the structured judge. A judge pick that is missing, out
 * of range, or points at a failed sample falls back to the first successful
 * sample.
 * On all-fail: {ok: false, status: "all_samples_failed", candidates}.
 * On judge failure: {ok: false, status, candidates} (the structured judge
 * also includes `reward_filtered`).
 */
pub fn best_of_n(prompt, system, opts = nil) {
  let cfg = if type_of(opts) == "dict" {
    opts
  } else {
    {}
  }
  let n = __clamp_int(cfg?.n ?? 5, 2, 32)
  let sampler_opts = __sampler_opts_for(cfg)
  let run_parallel = cfg?.parallel ?? true
  let base_opts = if type_of(cfg?.call_opts) == "dict" {
    cfg.call_opts
  } else {
    {}
  }
  let samples = __collect_samples(prompt, system, base_opts, sampler_opts, n, run_parallel)
  let valid = samples.filter({ s -> s?.ok ?? false })
  if len(valid) == 0 {
    return {ok: false, status: "all_samples_failed", candidates: samples}
  }
  let reward_filtered = __reward_filter(samples, cfg?.reward)
  let judge_kind = cfg?.judge ?? "structured"
  if type_of(judge_kind) == "closure" || type_of(judge_kind) == "function" {
    let r = try {
      judge_kind(valid)
    }
    if is_err(r) {
      return {ok: false, status: "exception", error: unwrap_err(r), candidates: samples}
    }
    let verdict = unwrap(r)
    // Never report a failed sample as the winner, even if the judge names it.
    let safe_idx = __resolve_best_index(samples, valid, to_int(verdict?.best_index ?? 0))
    let best = samples[safe_idx]
    return {
      ok: true,
      best: {text: best?.text ?? "", index: safe_idx, score: verdict?.score ?? nil},
      candidates: samples,
      judge: "closure",
      reasoning: to_string(verdict?.reasoning ?? ""),
    }
  }
  let verdict = __structured_judge(samples, cfg)
  if !(verdict?.ok ?? false) {
    return {
      ok: false,
      status: to_string(verdict?.status ?? "judge_failed"),
      candidates: samples,
      reward_filtered: reward_filtered,
    }
  }
  // The LLM may answer with an index that was never shown to it (a failed
  // sample's slot), so validate before reading.
  let safe_idx = __resolve_best_index(samples, valid, verdict.best_index)
  let best = samples[safe_idx]
  return {
    ok: true,
    best: {text: best?.text ?? "", index: safe_idx},
    candidates: samples,
    judge: "structured",
    reasoning: verdict.reasoning,
    scores: verdict.scores,
  }
}

// -------------------------------------------------------------------------------------------------
// self_consistency
// -------------------------------------------------------------------------------------------------

// Base-2 logarithm computed by change of base: log2(x) = ln(x) / ln(2).
fn __log2(x) {
  return ln(x) / ln(2.0)
}

// Shannon entropy (in bits) of the distribution given by `counts`, where each
// count is divided by `total`. Non-positive counts contribute nothing, and a
// non-positive total yields 0.0.
fn __shannon_entropy(counts, total) {
  if total <= 0 {
    return 0.0
  }
  let denominator = to_float(total)
  var entropy = 0.0
  for raw in counts {
    let weight = to_float(raw)
    if weight > 0.0 {
      let probability = weight / denominator
      entropy = entropy - probability * __log2(probability)
    }
  }
  return entropy
}

// Build the string key used to group equal answers: strings are used as-is,
// dicts and lists are JSON-encoded, everything else goes through to_string.
fn __answer_key(value) {
  let kind = type_of(value)
  if kind == "dict" || kind == "list" {
    return json_stringify(value)
  }
  if kind == "string" {
    return value
  }
  return to_string(value)
}

// Pull an answer out of one sample. Returns nil for failed samples and when
// `extract` throws. When `normalize` is supplied and the answer is non-nil,
// the normalized value is used unless `normalize` throws, in which case the
// raw extracted answer is kept.
fn __extract_answer(sample, extract, normalize) {
  let usable = sample?.ok ?? false
  if !usable {
    return nil
  }
  let extracted = try {
    extract(sample.text)
  }
  if is_err(extracted) {
    return nil
  }
  let answer = unwrap(extracted)
  if normalize == nil || answer == nil {
    return answer
  }
  let normalized = try {
    normalize(answer)
  }
  if is_err(normalized) {
    return answer
  }
  return unwrap(normalized)
}

// Vote weight for one sample. Outside "weighted" mode the weight is 0.0 and
// `confidence_fn` is never called. In weighted mode the weight is
// to_float(confidence_fn(sample)), or 1.0 when the closure throws.
fn __weight_for(sample, vote_mode, confidence_fn) {
  if vote_mode == "weighted" {
    let confidence = try {
      confidence_fn(sample)
    }
    if is_err(confidence) {
      return 1.0
    }
    return to_float(unwrap(confidence))
  }
  return 0.0
}

// Group samples by extracted answer. For every distinct answer key this
// records how often it occurred, the position of the sample that first
// produced it, and (in weighted mode) the summed confidence. `extracted`
// holds one entry per sample, nil where no answer could be extracted.
fn __tally_samples(samples, extract, normalize, vote_mode, confidence_fn) {
  var order = []
  var tallies = {}
  var earliest = {}
  var weight_sums = {}
  var answers = []
  var position = 0
  for sample in samples {
    let answer = __extract_answer(sample, extract, normalize)
    if answer != nil {
      let key = __answer_key(answer)
      // First sighting of this key: register it in every table.
      if tallies[key] == nil {
        order = order.push(key)
        tallies = tallies + {[key]: 0}
        earliest = earliest + {[key]: position}
        weight_sums = weight_sums + {[key]: 0.0}
      }
      tallies = tallies + {[key]: tallies[key] + 1}
      let weight = __weight_for(sample, vote_mode, confidence_fn)
      if weight != 0.0 {
        weight_sums = weight_sums + {[key]: weight_sums[key] + weight}
      }
    }
    answers = answers.push(answer)
    position += 1
  }
  return {
    keys_in_order: order,
    counts: tallies,
    first_seen: earliest,
    weights: weight_sums,
    extracted: answers,
  }
}

// Score of an answer key: its summed weight in "weighted" mode, otherwise its
// occurrence count as a float.
fn __score_for(key, counts, weights, vote_mode) {
  if vote_mode != "weighted" {
    return to_float(counts[key])
  }
  return weights[key]
}

// Choose the highest-scoring answer key from a tally. On equal scores the key
// whose first sample came earlier wins. `tie` is true when the final leader
// shares its score with a key examined after it took the lead.
// Expects `tally.keys_in_order` to be non-empty.
fn __pick_winner(tally, vote_mode) {
  let ordered = tally.keys_in_order
  var leader = ordered[0]
  var leader_score = __score_for(leader, tally.counts, tally.weights, vote_mode)
  var leader_seen = tally.first_seen[leader]
  var tied = false
  var cursor = 1
  while cursor < len(ordered) {
    let contender = ordered[cursor]
    let contender_score = __score_for(contender, tally.counts, tally.weights, vote_mode)
    if contender_score == leader_score {
      let contender_seen = tally.first_seen[contender]
      if contender_seen < leader_seen {
        leader = contender
        leader_seen = contender_seen
      }
      tied = true
    } else if contender_score > leader_score {
      // A strictly better score replaces the leader and clears any tie.
      leader = contender
      leader_score = contender_score
      leader_seen = tally.first_seen[contender]
      tied = false
    }
    cursor += 1
  }
  return {key: leader, tie: tied}
}

// Return the first non-nil extracted answer whose key equals `key`, or nil
// when none matches.
fn __canonical_answer(extracted, key) {
  for candidate in extracted {
    if candidate != nil {
      if __answer_key(candidate) == key {
        return candidate
      }
    }
  }
  return nil
}

// Turn the tally into a list of {answer, count} rows (in first-seen order)
// together with the sum of all counts.
fn __build_distribution(keys_in_order, counts) {
  var rows = []
  var sum = 0
  for key in keys_in_order {
    let occurrences = counts[key]
    rows = rows.push({answer: key, count: occurrences})
    sum = sum + occurrences
  }
  return {distribution: rows, total: sum}
}

/**
 * self_consistency(prompt, system, opts) -> dict
 *
 * Draw N samples, extract one answer from each, and vote on the result
 * (Wang et al. 2022; arxiv:2203.11171).
 *
 * Options:
 *   - n: int (default 8; clamped to [2, 64])
 *   - sampler_opts: dict (default {temperature: 1.2})
 *   - call_opts: dict of base call options; sampler_opts is merged on top
 *   - extract: closure(text) -> any (REQUIRED; throws when missing)
 *   - normalize: closure(answer) -> any (optional)
 *   - vote: "majority" | "weighted" (default "majority")
 *   - confidence_fn: closure(sample) -> float (REQUIRED for "weighted";
 *     throws when missing)
 *   - parallel: bool (default true)
 *   - _session_id: string (optional; enables the "self_consistency_tie" event)
 *
 * Returns: {ok, answer, answer_count, total, distribution, candidates, entropy}.
 * `total` counts only samples that produced an answer.
 * Ties: the answer seen first wins; "self_consistency_tie" is emitted when a
 * session id was supplied.
 * No extractable answers: {ok: false, status: "no_extractable_answers"}.
 */
pub fn self_consistency(prompt, system, opts = nil) {
  let cfg = if type_of(opts) != "dict" {
    {}
  } else {
    opts
  }
  let extract = cfg?.extract
  if extract == nil {
    throw "self_consistency: opts.extract is required"
  }
  let vote_mode = cfg?.vote ?? "majority"
  let confidence_fn = cfg?.confidence_fn
  if vote_mode == "weighted" && confidence_fn == nil {
    throw "self_consistency: opts.confidence_fn is required for weighted voting"
  }
  let n = __clamp_int(cfg?.n ?? 8, 2, 64)
  let sampler_opts = if type_of(cfg?.sampler_opts) != "dict" {
    {temperature: 1.2}
  } else {
    cfg.sampler_opts
  }
  let run_parallel = cfg?.parallel ?? true
  let base_opts = if type_of(cfg?.call_opts) != "dict" {
    {}
  } else {
    cfg.call_opts
  }
  let samples = __collect_samples(prompt, system, base_opts, sampler_opts, n, run_parallel)
  let tally = __tally_samples(samples, extract, cfg?.normalize, vote_mode, confidence_fn)
  if len(tally.keys_in_order) == 0 {
    return {ok: false, status: "no_extractable_answers", candidates: samples}
  }
  let winner = __pick_winner(tally, vote_mode)
  let dist = __build_distribution(tally.keys_in_order, tally.counts)
  // Entropy only needs the bare counts, in distribution order.
  var bare_counts = []
  for row in dist.distribution {
    bare_counts = bare_counts.push(row.count)
  }
  let session_id = to_string(cfg?._session_id ?? "")
  if winner.tie {
    if session_id != "" {
      // Best effort: a failure to emit must not fail the vote.
      let _ = try {
        agent_emit_event(
          session_id,
          "self_consistency_tie",
          {answer: winner.key, total: dist.total, distribution: dist.distribution},
        )
      }
    }
  }
  return {
    ok: true,
    answer: __canonical_answer(tally.extracted, winner.key),
    answer_count: tally.counts[winner.key],
    total: dist.total,
    distribution: dist.distribution,
    candidates: samples,
    entropy: __shannon_entropy(bare_counts, dist.total),
  }
}

// -------------------------------------------------------------------------------------------------
// parallel_judge
// -------------------------------------------------------------------------------------------------

/**
 * parallel_judge(items, judge_fn, opts) -> list<dict>
 *
 * Run `judge_fn(item) -> verdict` for each item. Output preserves input
 * order. Each entry: {index, item, verdict?, ok, error?, duration_ms}, where
 * `index` is the item's position in `items`. An empty `items` returns [].
 *
 * Options:
 *   - max_concurrent: int (default 4; clamped to [1, 64])
 *   - on_error: "skip" | "fail_fast" | "collect" (default "collect")
 *       "collect" keeps failed entries in the output, "skip" drops them, and
 *       "fail_fast" throws the first failed entry's error (in input order).
 *       The throw happens after the parallel pass has returned, so every
 *       item has already been judged by then.
 *   - on_progress: closure({index, total, item, verdict}) -> nil
 *       Called once per item with `verdict: nil` for failures; anything it
 *       throws is ignored.
 */
pub fn parallel_judge(items, judge_fn, opts = nil) {
  let cfg = if type_of(opts) == "dict" {
    opts
  } else {
    {}
  }
  let on_error = cfg?.on_error ?? "collect"
  let on_progress = cfg?.on_progress
  let total = len(items)
  if total == 0 {
    return []
  }
  // index pairs to preserve order through parallel each
  var indexed = []
  var i = 0
  for item in items {
    indexed = indexed.push({index: i, item: item})
    i = i + 1
  }
  let raw = parallel each indexed with { max_concurrent: __clamp_int(cfg?.max_concurrent ?? 4, 1, 64) } { entry ->
    // duration_ms covers only the judge_fn call, not the progress callback.
    let started = now_ms()
    let r = try {
      judge_fn(entry.item)
    }
    let duration = now_ms() - started
    if is_err(r) {
      let err = unwrap_err(r)
      let record = {index: entry.index, item: entry.item, ok: false, error: err, duration_ms: duration}
      if on_progress != nil {
        let _ = try {
          on_progress({index: entry.index, total: total, item: entry.item, verdict: nil})
        }
      }
      record
    } else {
      let verdict = unwrap(r)
      let record = {index: entry.index, item: entry.item, ok: true, verdict: verdict, duration_ms: duration}
      if on_progress != nil {
        let _ = try {
          on_progress({index: entry.index, total: total, item: entry.item, verdict: verdict})
        }
      }
      record
    }
  }
  // Sort by index to preserve input order. parallel each currently returns
  // results in input-list order, but we sort defensively in case that ever
  // changes.
  let sorted = raw.sort({ a, b -> a.index - b.index })
  // Apply the on_error policy to the ordered records.
  var out = []
  for r in sorted {
    let failed = !(r?.ok ?? false)
    if failed && on_error == "fail_fast" {
      throw r?.error ?? "parallel_judge: judge_fn errored"
    }
    if !(failed && on_error == "skip") {
      out = out.push(r)
    }
  }
  return out
}

// A debater is either a bare name or a dict with an optional name, extra
// system text, a perspective instruction, and per-debater LLM options.
type DebateDebater = string | {name: string?, system: string?, instruction: string?, llm_options: dict?}

// One debater's reply in a round. The provider/model/token fields are only
// present when the underlying call result carried them.
type DebateResponse = {
  debater: string,
  text: string,
  provider: string?,
  model: string?,
  input_tokens: int?,
  output_tokens: int?,
}

// All responses of one round. The drift fields are filled in from the second
// round on, when there is a previous round to compare against.
type DebateRound = {
  round: int,
  responses: list<DebateResponse>,
  max_round_drift: float?,
  stable: bool?,
  drifts: list<dict>?,
}

// Result of debate(): the resolved configuration, every completed round, the
// per-round stability records, and whether the run stopped on stability.
type DebateResult = {
  prompt: string,
  debaters: list<string>,
  requested_rounds: int,
  n_rounds: int,
  completed_rounds: int,
  stopped_early: bool,
  stop_reason: string?,
  rounds: list<DebateRound>,
  stability: list<dict>,
  final_responses: list<DebateResponse>,
  short_circuit_event_id: int?,
}

// Read the debate question from opts.prompt, falling back to opts.question
// and then opts.task. Throws unless the result is a non-blank string.
fn __debate_prompt_value(opts) {
  let candidate = opts?.prompt ?? opts?.question ?? opts?.task
  if type_of(candidate) == "string" && trim(candidate) != "" {
    return candidate
  }
  throw "debate: opts.prompt must be a non-empty string"
}

// Coerce `value` (or `fallback` when value is nil) to an int and require it
// to be at least 1; `label` names the option in the error message.
fn __debate_int(value, fallback, label) {
  let parsed = to_int(value ?? fallback)
  if parsed != nil && parsed >= 1 {
    return parsed
  }
  throw "debate: " + label + " must be a positive integer"
}

// Coerce `value` (or `fallback` when value is nil) to a float; throws with
// `label` in the message when the coercion yields nil.
fn __debate_float(value, fallback, label) {
  let parsed = to_float(value ?? fallback)
  if parsed != nil {
    return parsed
  }
  throw "debate: " + label + " must be numeric"
}

// Resolve the debater roster from opts.debaters, then opts.agents, then the
// two-debater default. Throws unless the result is a non-empty list.
fn __debate_debaters(opts) {
  let roster = opts?.debaters ?? opts?.agents ?? ["debater_1", "debater_2"]
  if type_of(roster) == "list" && len(roster) > 0 {
    return roster
  }
  throw "debate: opts.debaters must be a non-empty list"
}

// Display name for a debater. A non-blank string debater is its own trimmed
// name. A dict debater uses its trimmed `name`, or "debater_<index + 1>" when
// the name is missing or blank. Anything else (including a blank string)
// throws.
fn __debater_name(debater, index) {
  let kind = type_of(debater)
  if kind == "string" {
    let trimmed = trim(debater)
    if trimmed != "" {
      return trimmed
    }
  }
  if kind != "dict" {
    throw "debate: each debater must be a string or dict"
  }
  let declared = debater?.name
  if type_of(declared) == "string" && trim(declared) != "" {
    return trim(declared)
  }
  return "debater_" + to_string(index + 1)
}

// Trimmed `instruction` of a dict debater, or "" when the debater is not a
// dict or has no string instruction.
fn __debater_instruction(debater) {
  if type_of(debater) != "dict" {
    return ""
  }
  if type_of(debater?.instruction) != "string" {
    return ""
  }
  return trim(debater.instruction)
}

// Resolve the display name of every debater, in roster order.
fn __debater_names(debaters) {
  var resolved = []
  var position = 0
  for debater in debaters {
    resolved = resolved.push(__debater_name(debater, position))
    position += 1
  }
  return resolved
}

// Validate the raw debate options and return them with the resolved fields
// layered on top: prompt, debaters, n_rounds (from n_rounds / rounds /
// max_rounds, default 3), adaptive_stop (default false), stability_threshold
// (from stability_threshold / threshold, default 0.15, must be in (0, 1]),
// and a fixed stability_patience of 2.
fn __debate_options(raw) {
  if type_of(raw) != "dict" {
    throw "debate: opts must be a dict"
  }
  if contains(raw.keys(), "adaptive_stop") && type_of(raw.adaptive_stop) != "bool" {
    throw "debate: opts.adaptive_stop must be a bool"
  }
  let threshold = __debate_float(raw?.stability_threshold ?? raw?.threshold, 0.15, "stability_threshold")
  if threshold <= 0.0 || threshold > 1.0 {
    throw "debate: stability_threshold must be > 0 and <= 1"
  }
  // Resolved in the same order the validation errors are reported.
  let prompt = __debate_prompt_value(raw)
  let debaters = __debate_debaters(raw)
  let n_rounds = __debate_int(raw?.n_rounds ?? raw?.rounds ?? raw?.max_rounds, 3, "n_rounds")
  let resolved = {
    prompt: prompt,
    debaters: debaters,
    n_rounds: n_rounds,
    adaptive_stop: raw?.adaptive_stop ?? false,
    stability_threshold: threshold,
    stability_patience: 2,
  }
  return raw + resolved
}

// Copy the recognised LLM option keys that are present at the top level of
// `opts` onto a copy of `base`; top-level values override those in `base`.
fn __copy_top_level_llm_options(opts, base) {
  let forwarded_keys = [
    "provider",
    "model",
    "temperature",
    "max_tokens",
    "timeout_ms",
    "llm_retries",
    "llm_backoff_ms",
    "budget",
    "response_format",
    "schema_retries",
    "session_id",
  ]
  let present = opts.keys()
  var merged = {} + base
  for name in forwarded_keys {
    if contains(present, name) {
      merged = merged + {[name]: opts[name]}
    }
  }
  return merged
}

// Build the LLM options for one debater. Later layers override earlier ones:
// opts.llm_options, then recognised top-level keys of `opts`, then the
// debater's own llm_options. Throws when either llm_options is not a dict.
fn __debate_llm_options(opts, debater) {
  let shared = opts?.llm_options ?? {}
  if type_of(shared) != "dict" {
    throw "debate: opts.llm_options must be a dict when provided"
  }
  let merged = __copy_top_level_llm_options(opts, shared)
  if type_of(debater) != "dict" {
    return merged
  }
  let overrides = debater?.llm_options ?? {}
  if type_of(overrides) != "dict" {
    throw "debate: debater.llm_options must be a dict when provided"
  }
  return merged + overrides
}

// Compose the system prompt for one debater: the shared opts.system, then the
// debater's own system text (both only when non-blank), then the fixed debate
// instructions, joined by blank lines.
fn __debate_system(opts, debater) {
  var sections = []
  let shared = opts?.system
  if type_of(shared) == "string" && trim(shared) != "" {
    sections = sections.push(trim(shared))
  }
  if type_of(debater) == "dict" {
    let own = debater?.system
    if type_of(own) == "string" && trim(own) != "" {
      sections = sections.push(trim(own))
    }
  }
  let ground_rules = "You are participating in a multi-agent debate. Answer from your assigned perspective, revise when prior rounds change your view, and keep the response concise."
  sections = sections.push(ground_rules)
  return join(sections, "\n\n")
}

// Render earlier rounds as plain text: a "Round N:" heading followed by one
// "<debater>: <text>" line per response. Returns "" when there are no rounds.
fn __debate_history(rounds) {
  var transcript = []
  for entry in rounds {
    let heading = "Round " + to_string(entry.round) + ":"
    transcript = transcript.push(heading)
    for reply in entry.responses {
      transcript = transcript.push(reply.debater + ": " + reply.text)
    }
  }
  return join(transcript, "\n")
}

// Assemble the user prompt for one debater turn: question, debater name and
// round counter, then the debater's perspective and the transcript of earlier
// rounds when they are non-empty, and finally the reply instruction.
fn __debate_call_prompt(opts, debater, debater_name, round_number, previous_rounds) {
  let question_line = "Question:\n" + opts.prompt
  let debater_line = "Debater: " + debater_name
  let round_line = "Round: " + to_string(round_number) + " of " + to_string(opts.n_rounds)
  var sections = [question_line, debater_line, round_line]
  let perspective = __debater_instruction(debater)
  if perspective != "" {
    sections = sections.push("Perspective:\n" + perspective)
  }
  let transcript = __debate_history(previous_rounds)
  if transcript != "" {
    sections = sections.push("Previous rounds:\n" + transcript)
  }
  sections = sections.push("Return only this debater's next response.")
  return join(sections, "\n\n")
}

// Extract response text from a call result: a dict's `text`, else its `data`,
// else the stringified result itself.
fn __debate_response_text(result) {
  if type_of(result) != "dict" {
    return to_string(result)
  }
  if result?.text != nil {
    return to_string(result.text)
  }
  if result?.data != nil {
    return to_string(result.data)
  }
  return to_string(result)
}

// Build a response record for one debater: name and text, plus whichever of
// provider/model/input_tokens/output_tokens the call result carried.
fn __debate_response(debater_name, result) {
  var record = {debater: debater_name, text: __debate_response_text(result)}
  if type_of(result) != "dict" {
    return record
  }
  let metadata_keys = ["provider", "model", "input_tokens", "output_tokens"]
  for field in metadata_keys {
    let carried = result[field]
    if carried != nil {
      record = record + {[field]: carried}
    }
  }
  return record
}

// Run one debate round: call the LLM once per debater, in roster order, and
// collect the response records. Debaters within a round all see the same
// `previous_rounds`, not each other's replies from this round.
fn __debate_run_round(opts, round_number, previous_rounds) {
  var collected = []
  var position = 0
  for debater in opts.debaters {
    let speaker = __debater_name(debater, position)
    let user_prompt = __debate_call_prompt(opts, debater, speaker, round_number, previous_rounds)
    let system_prompt = __debate_system(opts, debater)
    let call_options = __debate_llm_options(opts, debater)
    let reply = llm_call(user_prompt, system_prompt, call_options)
    collected = collected.push(__debate_response(speaker, reply))
    position += 1
  }
  return collected
}

// Tokenize text for drift scoring: lowercase it, replace every run of
// characters outside [A-Za-z0-9_] with a space, and split on spaces.
fn __debate_tokens(text) {
  let lowered = lowercase(to_string(text))
  let cleaned = trim(regex_replace("[^A-Za-z0-9_]+", " ", lowered))
  if cleaned == "" {
    return []
  }
  let pieces = split(cleaned, " ")
  return pieces.filter({ piece -> piece != "" })
}

// Count every contiguous n-gram of `width` tokens, keyed by the tokens joined
// with single spaces. Returns an empty dict when there are too few tokens.
fn __ngram_counts(tokens, width) {
  var tally = {}
  if len(tokens) < width {
    return tally
  }
  let last_start = len(tokens) - width
  var start = 0
  while start <= last_start {
    let gram = join(tokens[start:start + width], " ")
    tally = tally + {[gram]: (tally[gram] ?? 0) + 1}
    start += 1
  }
  return tally
}

// Clipped n-gram precision: the share of the candidate's n-grams that also
// occur in the reference, each n-gram credited at most as often as the
// reference contains it. Returns 0.0 when the candidate has no n-grams.
fn __ngram_precision(reference_tokens, candidate_tokens, width) {
  let in_candidate = __ngram_counts(candidate_tokens, width)
  let in_reference = __ngram_counts(reference_tokens, width)
  var matched = 0
  var seen = 0
  for gram in entries(in_candidate) {
    let allowed = in_reference[gram.key] ?? 0
    matched += min(gram.value, allowed)
    seen += gram.value
  }
  if seen == 0 {
    return 0.0
  }
  return matched * 1.0 / seen
}

// Simplified BLEU-style similarity in [0, 1]: the mean of unigram precision
// and (when both texts have at least two tokens) bigram precision, scaled by
// a linear brevity penalty when the candidate is shorter than the reference.
// Two empty texts score 1.0; exactly one empty text scores 0.0.
fn __bleu_lite(reference_text, candidate_text) {
  let reference_tokens = __debate_tokens(reference_text)
  let candidate_tokens = __debate_tokens(candidate_text)
  let reference_len = len(reference_tokens)
  let candidate_len = len(candidate_tokens)
  if reference_len == 0 && candidate_len == 0 {
    return 1.0
  }
  if reference_len == 0 || candidate_len == 0 {
    return 0.0
  }
  var precision_total = __ngram_precision(reference_tokens, candidate_tokens, 1)
  var orders_used = 1.0
  if reference_len >= 2 && candidate_len >= 2 {
    precision_total += __ngram_precision(reference_tokens, candidate_tokens, 2)
    orders_used += 1.0
  }
  var brevity = 1.0
  if candidate_len < reference_len {
    brevity = candidate_len * 1.0 / reference_len
  }
  return brevity * precision_total / orders_used
}

// Drift between two texts as 1 - similarity, with the similarity treated as
// if clamped to [0, 1] so the drift itself always lies in [0, 1].
fn __text_drift(previous_text, current_text) {
  let score = __bleu_lite(previous_text, current_text)
  let drift = if score < 0.0 {
    1.0
  } else if score > 1.0 {
    0.0
  } else {
    1.0 - score
  }
  return drift
}

// Compare a round with the one before it, pairing responses by position.
// Reports each debater's drift, the largest drift, and `stable`, which is
// true when the largest drift is strictly below `threshold`.
fn __round_stability(previous_round, current_round, threshold) {
  let now_replies = current_round.responses
  let earlier_replies = previous_round.responses
  var per_debater = []
  var worst = 0.0
  var position = 0
  while position < len(now_replies) {
    let latest = now_replies[position]
    let earlier = earlier_replies[position]
    let moved = __text_drift(earlier?.text ?? "", latest?.text ?? "")
    per_debater = per_debater.push({debater: latest.debater, drift: moved})
    worst = max(worst, moved)
    position += 1
  }
  let is_stable = worst < threshold
  return {
    round: current_round.round,
    max_round_drift: worst,
    threshold: threshold,
    stable: is_stable,
    drifts: per_debater,
  }
}

// Record that the debate stopped early on stability by emitting a
// "debate_stability_short_circuit" event on "llm.ensemble.debate", and return
// whatever event_log.emit returns.
fn __emit_stability_short_circuit(opts, stability) {
  let payload = {
    round: stability.round,
    requested_rounds: opts.n_rounds,
    max_round_drift: stability.max_round_drift,
    threshold: opts.stability_threshold,
    consecutive_stable_rounds: opts.stability_patience,
    drifts: stability.drifts,
  }
  // Fourth argument: stringified round info (presumably event labels — confirm against event_log.emit).
  let labels = {round: to_string(stability.round), requested_rounds: to_string(opts.n_rounds)}
  return event_log.emit("llm.ensemble.debate", "debate_stability_short_circuit", payload, labels)
}

/**
 * Run a multi-debater LLM debate. Set `adaptive_stop: true` to stop after
 * two consecutive stable rounds when every debater's response drift is below
 * `stability_threshold` (default `0.15`).
 *
 * Options are validated by __debate_options, which throws on invalid input.
 * Each round calls the LLM once per debater; from the second round on, the
 * round is compared with the previous one and the comparison is appended to
 * `stability`.
 *
 * Returns a DebateResult. `stopped_early` and `stop_reason` reflect whether
 * the stability short-circuit fired, independent of whether emitting the
 * event produced an id; `short_circuit_event_id` carries that id when there
 * is one.
 */
pub fn debate(opts) -> DebateResult {
  let config = __debate_options(opts)
  var rounds = []
  var stability = []
  var consecutive_stable = 0
  var short_circuit_event_id = nil
  // Tracked separately from the event id so the result stays accurate even
  // when the event log yields no id for the short-circuit event.
  var stopped_early = false
  var round_number = 1
  while round_number <= config.n_rounds {
    let responses = __debate_run_round(config, round_number, rounds)
    var round = {round: round_number, responses: responses}
    // The first round has nothing to compare against, so it has no drift.
    if len(rounds) > 0 {
      let round_stability = __round_stability(rounds[-1], round, config.stability_threshold)
      stability = stability.push(round_stability)
      round = round
        + {
        max_round_drift: round_stability.max_round_drift,
        stable: round_stability.stable,
        drifts: round_stability.drifts,
      }
      if config.adaptive_stop {
        if round_stability.stable {
          consecutive_stable += 1
        } else {
          // An unstable round resets the streak.
          consecutive_stable = 0
        }
      }
    }
    rounds = rounds.push(round)
    if config.adaptive_stop && consecutive_stable >= config.stability_patience {
      stopped_early = true
      short_circuit_event_id = __emit_stability_short_circuit(config, stability[-1])
      break
    }
    round_number += 1
  }
  return {
    prompt: config.prompt,
    debaters: __debater_names(config.debaters),
    requested_rounds: config.n_rounds,
    n_rounds: config.n_rounds,
    completed_rounds: len(rounds),
    stopped_early: stopped_early,
    stop_reason: if stopped_early {
      "stability"
    } else {
      nil
    },
    rounds: rounds,
    stability: stability,
    final_responses: rounds[-1].responses,
    short_circuit_event_id: short_circuit_event_id,
  }
}

// True when type_of(value) is one of the callable kinds this module accepts:
// "function", "closure" or "fn".
fn __tot_is_callable(value) {
  let callable_kinds = ["function", "closure", "fn"]
  return contains(callable_kinds, type_of(value))
}

// True when the value's type is "int" or "float".
fn __tot_is_number(value) {
  let numeric_kinds = ["int", "float"]
  return contains(numeric_kinds, type_of(value))
}

// Fetch opts[name] and require it to be callable; returns the callable.
fn __tot_require_callable(opts, name) {
  let candidate = opts[name]
  let complaint = "tree_of_thoughts: opts." + name + " must be callable"
  require __tot_is_callable(candidate), complaint
  return candidate
}

// Require `value` to be a whole number >= 1 and return it as an int. `label`
// names the option in the failure messages.
fn __tot_require_positive_int(value, label) {
  let subject = "tree_of_thoughts: " + label
  require __tot_is_number(value), subject + " must be numeric"
  require floor(value) == value, subject + " must be an integer"
  require value >= 1, subject + " must be at least 1"
  return to_int(value)
}

// Require `value` to be a whole number >= 0 and return it as an int. `label`
// names the option in the failure messages.
fn __tot_require_non_negative_int(value, label) {
  let subject = "tree_of_thoughts: " + label
  require __tot_is_number(value), subject + " must be numeric"
  require floor(value) == value, subject + " must be an integer"
  require value >= 0, subject + " must be non-negative"
  return to_int(value)
}

// Score a state with the caller's `evaluate` closure and require the result
// to be numeric.
fn __tot_score(evaluate, state) {
  let rating = evaluate(state)
  require __tot_is_number(rating), "tree_of_thoughts: evaluate(state) must return a number"
  return rating
}

// Ask the caller's `is_terminal` closure whether a state ends the search, and
// require a bool answer.
fn __tot_terminal(is_terminal, state) {
  let finished = is_terminal(state)
  require type_of(finished) == "bool", "tree_of_thoughts: is_terminal(state) must return a bool"
  return finished
}

// Consult the optional `stop` closure for a node. Without a closure the
// search never stops here; with one, its answer must be a bool.
fn __tot_stop(stop, node) {
  if stop != nil {
    let halt = stop(node)
    require type_of(halt) == "bool", "tree_of_thoughts: stop(node) must return a bool"
    return halt
  }
  return false
}

// Bundle the fields of one search-tree node into a dict.
fn __tot_node(id, parent_id, state, path, depth, score, terminal) {
  let node = {
    id: id,
    parent_id: parent_id,
    state: state,
    path: path,
    depth: depth,
    score: score,
    terminal: terminal,
  }
  return node
}

// True when `candidate` is terminal and either no terminal node has been
// recorded yet or the candidate scores strictly higher than the recorded one.
fn __tot_better_terminal(best_terminal, candidate) {
  if !candidate.terminal {
    return false
  }
  if best_terminal == nil {
    return true
  }
  return candidate.score > best_terminal.score
}

// True when no node has been recorded yet or `candidate` scores strictly
// higher than the recorded one.
fn __tot_better_seen(best_seen, candidate) {
  if best_seen == nil {
    return true
  }
  return candidate.score > best_seen.score
}

// Fold one node into the running bests: `terminal` tracks the best terminal
// node, `seen` the best node of any kind. Returns a new record.
fn __tot_update_best(best, node) {
  let next_terminal = if __tot_better_terminal(best.terminal, node) {
    node
  } else {
    best.terminal
  }
  let next_seen = if __tot_better_seen(best.seen, node) {
    node
  } else {
    best.seen
  }
  return {terminal: next_terminal, seen: next_seen}
}

// Keep the `limit` highest-scoring nodes, ordered by descending score.
fn __tot_top(nodes, limit) {
  let ranked = nodes.sort_by({ node -> 0 - node.score })
  return ranked.take(limit)
}

// Expand a parent node into at most `k` child nodes. Children take
// consecutive ids starting at `next_id`, extend the parent's path with their
// state, and are scored and terminal-checked as they are built.
fn __tot_children(parent, expand, evaluate, is_terminal, k, next_id) {
  let proposals = expand(parent.state, k)
  require type_of(proposals) == "list", "tree_of_thoughts: expand(state, k) must return a list"
  // expand may return more than k states; only the first k are used.
  let keep = min(len(proposals), k)
  var built = []
  var offset = 0
  while offset < keep {
    let state = proposals[offset]
    let score = __tot_score(evaluate, state)
    let terminal = __tot_terminal(is_terminal, state)
    let child = __tot_node(
      next_id + offset,
      parent.id,
      state,
      parent.path.push(state),
      parent.depth + 1,
      score,
      terminal,
    )
    built = built.push(child)
    offset += 1
  }
  return built
}

// Assemble the search result. The best node is the best terminal node when
// one exists, otherwise the best node seen. `ok` is true only when a terminal
// node was found. An explicit `stop_reason` is kept; otherwise it becomes
// "terminal" or "exhausted".
fn __tot_finalize(search, k, max_depth, beam_width, nodes, best, stop_reason) {
  let found_terminal = best.terminal != nil
  let winner = best.terminal ?? best.seen
  var reason = stop_reason
  if reason == nil {
    reason = if found_terminal {
      "terminal"
    } else {
      "exhausted"
    }
  }
  let tree = {
    root_id: 0,
    best_id: winner?.id,
    nodes: nodes,
    search: search,
    k: k,
    max_depth: max_depth,
    beam_width: beam_width,
  }
  return {
    ok: found_terminal,
    best_path: winner?.path ?? [],
    best_score: winner?.score,
    best_node: winner,
    stop_reason: reason,
    tree: tree,
  }
}

// Breadth-first or depth-first search over the thought tree. "dfs" takes the
// next node from the back of the frontier and any other mode from the front.
// Every generated child is recorded and folded into the running bests. The
// search ends when the frontier is empty or the `stop` closure fires.
fn __tot_search_frontier(
  search,
  root,
  expand,
  evaluate,
  is_terminal,
  stop,
  k,
  max_depth,
  beam_width,
) {
  let depth_first = search == "dfs"
  let breadth_first = search == "bfs"
  var nodes = [root]
  var frontier = [root]
  var next_id = 1
  var best = __tot_update_best({terminal: nil, seen: nil}, root)
  var stop_reason = nil
  while len(frontier) > 0 && stop_reason == nil {
    let current = if depth_first {
      frontier.last()
    } else {
      frontier[0]
    }
    frontier = if depth_first {
      frontier.pop()
    } else {
      frontier.slice(1)
    }
    if __tot_stop(stop, current) {
      stop_reason = "stop"
    } else if !current.terminal && current.depth < max_depth {
      let children = __tot_children(current, expand, evaluate, is_terminal, k, next_id)
      next_id = next_id + len(children)
      for child in children {
        nodes = nodes.push(child)
        best = __tot_update_best(best, child)
        if breadth_first {
          frontier = frontier.push(child)
        }
      }
      if depth_first {
        // Reversed so the first child ends up last and is taken next.
        frontier = frontier + children.reverse()
      }
    }
  }
  return __tot_finalize(search, k, max_depth, beam_width, nodes, best, stop_reason)
}

// Beam search over the thought tree. Each level expands every node in the
// beam, then keeps the `beam_width` highest-scoring children as the next
// beam. Ends when the beam is empty, `max_depth` levels have been expanded,
// or the `stop` closure fires for a beam node.
fn __tot_search_beam(root, expand, evaluate, is_terminal, stop, k, max_depth, beam_width) {
  var nodes = [root]
  var beam = [root]
  var next_id = 1
  var best = __tot_update_best({terminal: nil, seen: nil}, root)
  var stop_reason = nil
  var level = 0
  while len(beam) > 0 && level < max_depth && stop_reason == nil {
    var next_level = []
    for member in beam {
      // Once stop has fired, the remaining beam members are skipped.
      if stop_reason == nil {
        if __tot_stop(stop, member) {
          stop_reason = "stop"
        } else if !member.terminal && member.depth < max_depth {
          let offspring = __tot_children(member, expand, evaluate, is_terminal, k, next_id)
          next_id = next_id + len(offspring)
          for child in offspring {
            nodes = nodes.push(child)
            next_level = next_level.push(child)
            best = __tot_update_best(best, child)
          }
        }
      }
    }
    beam = __tot_top(next_level, beam_width)
    level += 1
  }
  return __tot_finalize("beam", k, max_depth, beam_width, nodes, best, stop_reason)
}

/**
 * tree_of_thoughts searches explicitly expanded reasoning states with BFS,
 * DFS, or beam search.
 *
 * Required options: initial_state, expand(state, k) -> list,
 * evaluate(state) -> number, is_terminal(state) -> bool.
 * Optional: stop(node) -> bool, search ("bfs" default, "dfs", "beam"),
 * k (default 1), max_depth (default 8), beam_width (default k).
 *
 * Returns {ok, best_path, best_score, best_node, stop_reason, tree}.
 */
pub fn tree_of_thoughts(opts) {
  require type_of(opts) == "dict", "tree_of_thoughts: opts must be a dict"
  require opts.has("initial_state"), "tree_of_thoughts: opts.initial_state is required"
  let expand = __tot_require_callable(opts, "expand")
  let evaluate = __tot_require_callable(opts, "evaluate")
  let is_terminal = __tot_require_callable(opts, "is_terminal")
  let stop = opts?.stop
  if stop != nil {
    require __tot_is_callable(stop), "tree_of_thoughts: opts.stop must be callable"
  }
  let search = opts?.search ?? "bfs"
  require search == "bfs" || search == "dfs" || search == "beam", "tree_of_thoughts: opts.search must be \"bfs\", \"dfs\", or \"beam\""
  let k = __tot_require_positive_int(opts?.k ?? 1, "opts.k")
  let max_depth = __tot_require_non_negative_int(opts?.max_depth ?? 8, "opts.max_depth")
  let beam_width = __tot_require_positive_int(opts?.beam_width ?? k, "opts.beam_width")
  // The root is scored and terminal-checked like any other node.
  let start = opts.initial_state
  let start_score = __tot_score(evaluate, start)
  let start_terminal = __tot_terminal(is_terminal, start)
  let root = __tot_node(0, nil, start, [start], 0, start_score, start_terminal)
  if search != "beam" {
    return __tot_search_frontier(search, root, expand, evaluate, is_terminal, stop, k, max_depth, beam_width)
  }
  return __tot_search_beam(root, expand, evaluate, is_terminal, stop, k, max_depth, beam_width)
}