# harn-vm 0.8.113

# ---------- Cerebras ----------------------------------------------------------

# Cerebras-hosted GPT-OSS accepts top-level `reasoning_effort` only for
# {low, medium, high}. It rejects `none` and `minimal` with
# wrong_api_format. Harn's provider-neutral `reasoning_policy: "off"` therefore
# floors to the lowest accepted effort (`low`) instead of sending a value the
# endpoint cannot template.
#
# gpt-oss on Cerebras uses NATIVE tool calls. The prior `json`/text pin was a
# defensive workaround for an "empty native streaming payload" defect that no
# longer reproduces: 2026-06-12 live probes (raw curl + Harn's streaming path)
# confirm Cerebras gpt-oss-120b returns clean OpenAI-shape `tool_calls` in both
# non-streaming and SSE modes, and Harn's index-keyed SSE reassembler
# (transport.rs `oai_tool_map`) accumulates the arg-fragment deltas correctly
# (3/3 tool_calls=1 end-to-end). Conversely, BOTH `json` and `text` formats
# yield ZERO parsed calls because gpt-oss does not emit Harn's ```tool/name/args
# contract — it emits a bare `{"tool":..,"arguments":..}` dialect that the
# fenced-JSON parser does not recognize. native is therefore the measured-best
# (and only working) channel. Pinned EXPLICITLY because `gpt-oss-120b` is a
# catalogued model and the catalog invariant requires every catalogued model's
# rule to set native_tools + preferred_tool_format. (groq / together gpt-oss
# rows are now native too.)
# To change, set `preferred_tool_format` ("native" | "json" | "text").
#
# `reasoning_required_for_tools = true`: gpt-oss (Harmony) calls tools INSIDE
# the chain-of-thought channel, so reasoning-off breaks tool calling. Combined
# with this rule's `reasoning_effort_levels`, reasoning_policy floors an auto
# "off" to the lowest accepted effort ("low") for tool tasks instead of
# disabling reasoning. Must NOT carry a Qwen-style
# `auto_reasoning_overrides = "off"`.
[[provider.cerebras]]
model_match = "gpt-oss-*"
# Tool calling: native channel, pinned explicitly.
native_tools = true
preferred_tool_format = "native"
prefers_xml_tools = false
# Structured output.
structured_output = "native"
structured_output_mode = "native_json"
# Reasoning: effort-style thinking limited to the three listed levels; tool
# use is marked as requiring reasoning.
thinking_modes = ["effort"]
reasoning_effort_supported = true
reasoning_effort_levels = ["low", "medium", "high"]
reasoning_required_for_tools = true
thinking_block_style = "reasoning_summary"
# Prompt shaping.
prefers_markdown_scaffolding = true
prefers_xml_scaffolding = false
prefers_role_developer = false
supports_assistant_prefill = false
# Serving precision: Cerebras serves gpt-oss-120b at FULL precision, so the
# OUTPUTS are trustworthy and the precision canary should not reject this route
# on quality grounds. The *timing* is what is unusable for a durable N>=5 meter
# baseline: request-rate throttling (~57K tok/req capping to ~4.4 req/min
# during the 2026-06 effort) imposes a throughput ceiling that the meter must
# budget for.
serving_precision = "throttled"

# Cerebras GLM 4.7 is a public preview model with native tools, native
# structured output, and top-level `reasoning_effort`; `none` is the documented
# no-thinking value, while the older `disable_reasoning` knob is deprecated.
[[provider.cerebras]]
model_match = "zai-glm-*"
# Tool calling.
native_tools = true
preferred_tool_format = "native"
prefers_xml_tools = false
# Structured output.
structured_output = "native"
structured_output_mode = "native_json"
# Reasoning: effort-style thinking, with the `none` value flagged as supported.
thinking_modes = ["effort"]
reasoning_effort_supported = true
reasoning_none_supported = true
thinking_block_style = "inline"
# Prompt shaping.
prefers_markdown_scaffolding = true
prefers_xml_scaffolding = false
prefers_role_developer = false
supports_assistant_prefill = false

[[provider.cerebras]]
model_match = "llama-*"
# Tool calling.
native_tools = true
preferred_tool_format = "native"
prefers_xml_tools = false
# Structured output.
structured_output = "native"
structured_output_mode = "native_json"
# Reasoning: no thinking-block style; this rule sets no reasoning-effort keys.
thinking_block_style = "none"
# Prompt shaping.
prefers_markdown_scaffolding = true
prefers_xml_scaffolding = false
prefers_role_developer = false
supports_assistant_prefill = false

[[provider.cerebras]]
model_match = "qwen-*"
# Tool calling.
native_tools = true
preferred_tool_format = "native"
prefers_xml_tools = false
# Structured output.
structured_output = "native"
structured_output_mode = "native_json"
# Reasoning: no thinking-block style; this rule sets no reasoning-effort keys.
thinking_block_style = "none"
# Prompt shaping.
prefers_markdown_scaffolding = true
prefers_xml_scaffolding = false
prefers_role_developer = false
supports_assistant_prefill = false