harn-stdlib 0.8.18

/**
 * @harn-entrypoint-category llm.stdlib
 *
 * std/llm/budget — model-aware token / context helpers for sizing prompts and
 * recommending max_output_tokens. Uses Rust tiktoken builtins for known model
 * families and a deterministic heuristic fallback for local/unknown models.
 *
 * Note: the spec calls for a one-shot warn agent_emit_event when the model's
 * context window is unknown. We skip the event emit here because no agent
 * session is guaranteed at these call sites. Callers that want the signal
 * should observe `budget_summary().assumptions`.
 */
/**
 * Heuristic token estimate used when no tokenizer is known for the model.
 * Divides len(text) by an assumed length-per-token ratio and rounds up:
 *   - 1.0 when the text contains any Han, Hiragana, Katakana or Hangul character,
 *   - 3.5 when it contains a code-like marker (triple backtick, =>, ->, {, } or ;),
 *   - 4.0 otherwise.
 * Returns 0 for the empty string; the result is always an int.
 */
fn __llm_budget_heuristic_tokens(text) {
  if text == "" {
    return 0
  }
  var per_token = 4.0
  if regex_match("[\\p{Han}\\p{Hiragana}\\p{Katakana}\\p{Hangul}]", text) {
    per_token = 1.0
  } else {
    if contains(text, "```") || contains(text, "=>") || contains(text, "->")
      || contains(text, "{")
      || contains(text, "}")
      || contains(text, ";") {
      per_token = 3.5
    }
  }
  return to_int(ceil(len(text) * 1.0 / per_token))
}

/** Normalizes a model argument to a string id: nil becomes "", anything else goes through to_string. */
fn __llm_budget_model(model) {
  let model_id = if model == nil {
    ""
  } else {
    to_string(model)
  }
  return model_id
}

/** Normalizes a text argument to a string: nil becomes "", anything else goes through to_string. */
fn __llm_budget_text(text) {
  let normalized = if text == nil {
    ""
  } else {
    to_string(text)
  }
  return normalized
}

/**
 * Rounds x up to the nearest integer and returns it as an int.
 * Built on floor(): a value that already equals its floor is returned as-is,
 * any other value is bumped to floor + 1.
 */
fn __ceil_int(x) {
  let base = floor(x)
  let bump = if x == base {
    0
  } else {
    1
  }
  return to_int(base) + bump
}

/**
 * token_count_encoder returns the tokenizer metadata that
 * tiktoken_tokenizer_info reports for a model. A nil model is looked up as "".
 */
pub fn token_count_encoder(model) {
  let model_id = __llm_budget_model(model)
  return tiktoken_tokenizer_info(model_id)
}

/**
 * estimate_text_tokens returns a model-aware token estimate for text.
 * When the tokenizer info for the model reports a known model family the count
 * comes from tiktoken_count_tokens; otherwise the heuristic estimate is used.
 * nil text and nil model are both treated as "".
 */
pub fn estimate_text_tokens(text, model = nil) {
  let body = __llm_budget_text(text)
  let id = __llm_budget_model(model)
  let encoder = tiktoken_tokenizer_info(id)
  let count = if encoder.known_model_family {
    tiktoken_count_tokens(body, id)
  } else {
    __llm_budget_heuristic_tokens(body)
  }
  return count
}

/**
 * estimate_text_tokens_detail returns the tokenizer info for the model combined
 * (via +) with a `tokens` field holding the estimate. The estimate comes from
 * tiktoken_count_tokens for a known model family and from the heuristic
 * otherwise. nil text and nil model are both treated as "".
 */
pub fn estimate_text_tokens_detail(text, model = nil) {
  let body = __llm_budget_text(text)
  let id = __llm_budget_model(model)
  let encoder = tiktoken_tokenizer_info(id)
  let count = if encoder.known_model_family {
    tiktoken_count_tokens(body, id)
  } else {
    __llm_budget_heuristic_tokens(body)
  }
  return encoder + {tokens: count}
}

/**
 * Resolve the effective runtime context window for a model, as an int.
 *
 * The catalog entry's `runtime_context_window` wins when present; otherwise
 * the advertised `context_window` is used. Configurable local providers may
 * advertise a larger `context_window` while serving the safer runtime value.
 *
 * Returns the 8192 fallback when the model lookup fails, when the lookup
 * result or its `catalog` is not a dict, or when neither field is set.
 */
pub fn context_window_for(model) {
  let fallback = 8192
  let lookup = try {
    llm_model_info(model)
  }
  if is_err(lookup) {
    return fallback
  }
  let entry = unwrap(lookup)
  let catalog = if type_of(entry) == "dict" {
    entry?.catalog
  } else {
    nil
  }
  if type_of(catalog) != "dict" {
    return fallback
  }
  let window = catalog?.runtime_context_window ?? catalog?.context_window
  if window != nil {
    return to_int(window)
  }
  return fallback
}

/**
 * Look up the catalog's `max_output_tokens` for a model, as an int.
 * Returns the 16384 fallback when the model lookup fails, when the lookup
 * result or its `catalog` is not a dict, or when the field is not set.
 */
fn __model_max_output_tokens(model) {
  let fallback = 16384
  let lookup = try {
    llm_model_info(model)
  }
  if is_err(lookup) {
    return fallback
  }
  let entry = unwrap(lookup)
  let catalog = if type_of(entry) == "dict" {
    entry?.catalog
  } else {
    nil
  }
  if type_of(catalog) != "dict" {
    return fallback
  }
  let limit = catalog?.max_output_tokens
  if limit != nil {
    return to_int(limit)
  }
  return fallback
}

/**
 * Apply the per-task ceiling to an already-capped output budget.
 *
 *   - "summarize": min(capped, max(256, ceil(used * summary_ratio)))
 *   - "chat":      min(capped, 1024)
 *   - "plan":      min(capped, 2048)
 *   - "json" and "agent": min(capped, 4096)
 *   - "code":      min(capped, 8192)
 *   - any other kind: capped is returned unchanged
 *
 * `used` is the input token count the summary ratio is applied to.
 */
fn __task_clamp(kind, capped, used, summary_ratio) {
  if kind == "summarize" {
    let scaled = __ceil_int(used * summary_ratio)
    if scaled > 256 {
      return min(capped, scaled)
    }
    return min(capped, 256)
  }
  if kind == "json" || kind == "agent" {
    return min(capped, 4096)
  }
  if kind == "code" {
    return min(capped, 8192)
  }
  if kind == "plan" {
    return min(capped, 2048)
  }
  if kind == "chat" {
    return min(capped, 1024)
  }
  return capped
}

/**
 * Recommend max_output_tokens for a given prompt/system/model/task_kind.
 *
 * Required: opts.prompt, opts.model (non-empty).
 * Optional: opts.system (default ""), opts.headroom (default 0.10),
 * opts.task_kind (default "chat"), opts.summary_ratio (default 0.30).
 *
 * The budget is the context window minus the estimated prompt + system tokens
 * and minus the reserved headroom, capped by the model's max output tokens and
 * then by the task-kind ceiling. The result is never below 64.
 *
 * Throws when opts is not a dict or a required field is missing.
 */
pub fn recommend_max_output_tokens(opts) {
  if type_of(opts) != "dict" {
    throw "recommend_max_output_tokens: opts must be a dict"
  }
  if opts?.prompt == nil {
    throw "recommend_max_output_tokens: opts.prompt is required"
  }
  if opts?.model == nil || opts.model == "" {
    throw "recommend_max_output_tokens: opts.model is required"
  }
  let model = opts.model
  let reserve_fraction = opts?.headroom ?? 0.1
  let task_kind = opts?.task_kind ?? "chat"
  let ratio = opts?.summary_ratio ?? 0.3
  let window = context_window_for(model)
  let input_tokens = estimate_text_tokens(opts.prompt, model) + estimate_text_tokens(opts?.system ?? "", model)
  let reserve = __ceil_int(window * reserve_fraction)
  let available = window - input_tokens - reserve
  let ceiling = min(available, __model_max_output_tokens(model))
  let suggestion = __task_clamp(task_kind, ceiling, input_tokens, ratio)
  let result = if suggestion < 64 {
    64
  } else {
    suggestion
  }
  return result
}

/**
 * Debug helper. Returns a dict with all intermediate values used by
 * recommend_max_output_tokens, plus an `assumptions` list of strings.
 *
 * Unlike recommend_max_output_tokens, nothing besides `opts` itself is
 * required: a missing opts.model, opts.prompt or opts.system is treated as "".
 * Optional opts.headroom (default 0.10), opts.task_kind (default "chat") and
 * opts.summary_ratio (default 0.30) are honoured the same way.
 *
 * The context window is resolved like context_window_for does it:
 * `runtime_context_window` first, then `context_window`, then the 8192
 * fallback. When the fallback is used, `assumptions` records
 * "context_window unknown - using 8192 fallback".
 *
 * Returned keys: model, provider, context_window, prompt_tokens_est,
 * system_tokens_est, headroom_tokens, remaining_tokens,
 * recommended_max_output, task_kind, assumptions.
 *
 * Throws when opts is not a dict.
 */
pub fn budget_summary(opts) {
  if type_of(opts) != "dict" {
    throw "budget_summary: opts must be a dict"
  }
  let model = opts?.model ?? ""
  // Guard the lookup like the other catalog readers in this module, so a
  // failing lookup degrades to the fallbacks instead of aborting the summary.
  let lookup = try {
    llm_model_info(model)
  }
  let info = if is_err(lookup) {
    nil
  } else {
    unwrap(lookup)
  }
  let provider = if type_of(info) == "dict" {
    to_string(info?.provider ?? "")
  } else {
    ""
  }
  let catalog = if type_of(info) == "dict" {
    info?.catalog
  } else {
    nil
  }
  // Prefer the runtime window so the reported numbers match what
  // recommend_max_output_tokens actually budgets against.
  let window = if type_of(catalog) == "dict" {
    catalog?.runtime_context_window ?? catalog?.context_window
  } else {
    nil
  }
  var assumptions = []
  let ctx = if window != nil {
    to_int(window)
  } else {
    assumptions = assumptions.push("context_window unknown - using 8192 fallback")
    8192
  }
  let headroom = opts?.headroom ?? 0.1
  let kind = opts?.task_kind ?? "chat"
  let summary_ratio = opts?.summary_ratio ?? 0.3
  let prompt_tokens = estimate_text_tokens(opts?.prompt ?? "", model)
  let system_tokens = estimate_text_tokens(opts?.system ?? "", model)
  let headroom_tokens = __ceil_int(ctx * headroom)
  let remaining = ctx - prompt_tokens - system_tokens - headroom_tokens
  let model_cap = __model_max_output_tokens(model)
  let capped = min(remaining, model_cap)
  var clamped = __task_clamp(kind, capped, prompt_tokens + system_tokens, summary_ratio)
  // Same floor as recommend_max_output_tokens: never recommend fewer than 64.
  if clamped < 64 {
    clamped = 64
  }
  return {
    model: model,
    provider: provider,
    context_window: ctx,
    prompt_tokens_est: prompt_tokens,
    system_tokens_est: system_tokens,
    headroom_tokens: headroom_tokens,
    remaining_tokens: remaining,
    recommended_max_output: clamped,
    task_kind: kind,
    assumptions: assumptions,
  }
}

/**
 * Returns a bool: does the rendered text fit in the model's context window
 * after reserving `headroom` (a fraction of the window, default 0.10)?
 *
 * The reserved share is rounded up to whole tokens, and the text fits when
 * its estimated tokens plus that reserve do not exceed the window.
 */
pub fn fits_in_context(text, model, headroom = 0.1) {
  let window = context_window_for(model)
  let needed = estimate_text_tokens(text, model) + __ceil_int(window * headroom)
  return needed <= window
}