harn-vm 0.7.49

Async bytecode virtual machine for the Harn programming language

# Harn provider capability matrix.
#
# One `[[provider.<name>]]` array entry per rule; first match wins per
# (provider, model). Place more specific `model_match` patterns before
# wildcards. `version_min = [major, minor]` narrows the match to a model
# ID whose `(major, minor)` version (parsed from the Anthropic / OpenAI
# naming schemes) is greater than or equal to the given tuple. Rules
# whose `version_min` is unparseable for the given model are skipped.
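# (For instance, an Anthropic-scheme ID like `claude-opus-4-1` parses
# as (4, 1): it satisfies `version_min = [4, 0]` but not `[4, 5]`.)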
#
# `[provider_family]` declares the sibling providers that inherit rules
# from a canonical family when they have no rule of their own (OpenRouter
# et al. speak the same OpenAI-compatible wire format and forward
# `tool_search` / `defer_loading` unchanged, so they fall through to
# `[[provider.openai]]` by default).
#
# Users override or extend this table per-project via
# `[[capabilities.provider.<name>]]` entries in `harn.toml`; see the
# example at the end of this header. Project overrides are checked
# before the built-in rules for the same provider name and are
# authoritative on overlap.
#
# Supported per-rule fields:
#   model_match     : glob pattern matched against the lowercased model ID.
#   version_min     : [major, minor] lower bound, provider-aware parse.
#   native_tools    : whether the model accepts native tool-call wire shape.
#   defer_loading   : whether `defer_loading: true` is honored on tool defs.
#   tool_search     : list of native tool-search variants, preferred first.
#                     Anthropic = ["bm25", "regex"];
#                     OpenAI    = ["hosted", "client"].
#   max_tools       : cap on tool-definition count the provider will accept.
#                     Used by harn-lint to warn about oversized registries.
#   prompt_caching  : whether the provider honors cache_control blocks.
#   thinking        : whether extended / adaptive thinking is available.
#   preserve_thinking: whether prior <think> blocks should be carried forward.
#   server_parser   : server-side response parser that transforms model output.
#   honors_chat_template_kwargs: whether chat_template_kwargs are honored.
#   recommended_endpoint: preferred endpoint family for this route.
#   text_tool_wire_format_supported: whether Harn text tool calls survive.
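#
# For example, a project-local `harn.toml` override granting native
# tools and thinking to a hypothetical `acme/acme-large-*` model on
# OpenRouter (illustrative ID, not a real route):
#
#   [[capabilities.provider.openrouter]]
#   model_match = "acme/acme-large-*"
#   native_tools = true
#   thinking = true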

# ---------- Anthropic (Claude) ------------------------------------------------

# Haiku 4.5+ supports server-side tool search.
[[provider.anthropic]]
model_match = "claude-haiku-*"
version_min = [4, 5]
native_tools = true
defer_loading = true
tool_search = ["bm25", "regex"]
max_tools = 10000
prompt_caching = true
thinking = true

# Opus 4.0+ supports tool search.
[[provider.anthropic]]
model_match = "claude-opus-*"
version_min = [4, 0]
native_tools = true
defer_loading = true
tool_search = ["bm25", "regex"]
max_tools = 10000
prompt_caching = true
thinking = true

# Sonnet 4.0+ supports tool search.
[[provider.anthropic]]
model_match = "claude-sonnet-*"
version_min = [4, 0]
native_tools = true
defer_loading = true
tool_search = ["bm25", "regex"]
max_tools = 10000
prompt_caching = true
thinking = true

# OpenRouter-style `anthropic/claude-...` prefixes.
[[provider.anthropic]]
model_match = "anthropic/claude-haiku-*"
version_min = [4, 5]
native_tools = true
defer_loading = true
tool_search = ["bm25", "regex"]
max_tools = 10000
prompt_caching = true
thinking = true

[[provider.anthropic]]
model_match = "anthropic/claude-opus-*"
version_min = [4, 0]
native_tools = true
defer_loading = true
tool_search = ["bm25", "regex"]
max_tools = 10000
prompt_caching = true
thinking = true

[[provider.anthropic]]
model_match = "anthropic/claude-sonnet-*"
version_min = [4, 0]
native_tools = true
defer_loading = true
tool_search = ["bm25", "regex"]
max_tools = 10000
prompt_caching = true
thinking = true

# Catch-all for older Claude models — native tools + prompt caching +
# thinking, but no progressive tool disclosure.
[[provider.anthropic]]
model_match = "claude-*"
native_tools = true
prompt_caching = true
thinking = true

# ---------- OpenAI family -----------------------------------------------------
#
# `provider.openai` rules are inherited by the sibling providers declared
# in `[provider_family]` below (OpenRouter, Together, Groq, DeepSeek,
# Fireworks, HuggingFace, local vLLM/SGLang). Siblings may still add their
# own `[[provider.<name>]]` rules and those win over the openai fallback.

# gpt-5.4+ exposes native `tool_search` on the Responses API.
[[provider.openai]]
model_match = "gpt-*"
version_min = [5, 4]
native_tools = true
defer_loading = true
tool_search = ["hosted", "client"]

# Legacy GPT: native tool calls only.
[[provider.openai]]
model_match = "gpt-*"
native_tools = true
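
# Ordering note: a hypothetical `gpt-5.4` ID clears the
# `version_min = [5, 4]` bound and takes the tool_search rule above,
# while `gpt-4.1` parses as (4, 1), fails the bound, and lands here.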

# Reasoning family (o1, o3, o4, ...).
[[provider.openai]]
model_match = "o1*"
native_tools = true

[[provider.openai]]
model_match = "o3*"
native_tools = true

[[provider.openai]]
model_match = "o4*"
native_tools = true

# OpenRouter-style provider-prefixed IDs.
[[provider.openai]]
model_match = "openai/gpt-*"
version_min = [5, 4]
native_tools = true
defer_loading = true
tool_search = ["hosted", "client"]

[[provider.openai]]
model_match = "openai/gpt-*"
native_tools = true

# ---------- Local / Ollama ----------------------------------------------------
#
# Local providers don't advertise native tool_search or prompt caching.
# Native-tools coverage depends on the specific model. Per the official
# docs and model pages, the Qwen 3 / 3.5 packages on Ollama expose
# native tool calling and thinking. Qwen 3.6 adds `preserve_thinking`,
# which Alibaba recommends for multi-turn agentic / coding workflows
# (the flagship use case we're building for), so keep it on by default.

[[provider.ollama]]
model_match = "qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "ollama_qwen3coder"
honors_chat_template_kwargs = false
recommended_endpoint = "/api/generate-raw"
text_tool_wire_format_supported = false

[[provider.ollama]]
model_match = "qwen3*"
native_tools = true
thinking = true
server_parser = "ollama_qwen3coder"
honors_chat_template_kwargs = false
recommended_endpoint = "/api/generate-raw"
text_tool_wire_format_supported = false

# llama.cpp / llama-server speaks OpenAI-compat but is a separate
# provider entry from Ollama. Give it the same Qwen3.6 wiring so the
# local GGUF path gets `preserve_thinking` by default too.
[[provider.llamacpp]]
model_match = "*qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.llamacpp]]
model_match = "*qwen3*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# Local vLLM / SGLang — same story. The capability rules here gate
# chat_template_kwargs emission in openai_compat; the actual model has
# to have been launched with the matching --chat-template.
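# When `honors_chat_template_kwargs = true`, requests may carry an
# extra body field along these lines (illustrative shape, not the
# exact Harn serialization):
#   "chat_template_kwargs": {"preserve_thinking": true}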
[[provider.local]]
model_match = "*qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.local]]
model_match = "*qwen3*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# Apple Silicon MLX server (mlx-vlm). OpenAI-compat with vision; honors
# the same `chat_template_kwargs.preserve_thinking` knob as vLLM, since
# mlx-vlm reuses HF tokenizers / chat templates.
[[provider.mlx]]
model_match = "*qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.mlx]]
model_match = "*qwen3*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# ---------- Qwen3.6 family (routed providers) --------------------------------
#
# DashScope is the authoritative Qwen host; Fireworks and the local
# OpenAI-compatible servers honor Qwen `chat_template_kwargs`.
# OpenRouter forwards thinking through its `reasoning` field instead.
# The sibling fallthrough to `[[provider.openai]]` carries only gpt-*
# and o* rules, none of which match Qwen model IDs, so we declare Qwen
# explicitly here to prevent a silent fallthrough to empty capabilities.

[[provider.dashscope]]
model_match = "qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.dashscope]]
model_match = "qwen*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.fireworks]]
model_match = "*qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# Fireworks's own model IDs squash dots to `p` (`qwen3p6`, `qwen3p5`),
# so we need a second rule to catch their canonical naming.
[[provider.fireworks]]
model_match = "*qwen3p6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.fireworks]]
model_match = "*qwen*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# OpenRouter: forwards Qwen thinking via its `reasoning` field (not
# chat_template_kwargs — transform_request strips that). Preservation
# works transparently since thinking blocks round-trip as assistant
# content; the reasoning enablement lives in `openrouter_reasoning_config`.
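# An enablement fragment on the wire might look like (assumed shape;
# the real payload is assembled by `openrouter_reasoning_config`):
#   "reasoning": {"enabled": true}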
[[provider.openrouter]]
model_match = "qwen/qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = false
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.openrouter]]
model_match = "qwen/*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = false
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# HuggingFace router + Together pass Qwen chat_template_kwargs through
# unchanged. Match the qwen3.6 model card shape (`Qwen/Qwen3.6-*`).
[[provider.huggingface]]
model_match = "qwen/qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.huggingface]]
model_match = "qwen/*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.together]]
model_match = "qwen/qwen3.6*"
native_tools = true
thinking = true
preserve_thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

[[provider.together]]
model_match = "qwen/*"
native_tools = true
thinking = true
server_parser = "none"
honors_chat_template_kwargs = true
recommended_endpoint = "/v1/chat/completions"
text_tool_wire_format_supported = true

# ---------- DeepSeek reasoning (across host providers) -----------------------
#
# DeepSeek V3.1+ exposes reasoning via `chat_template_kwargs.enable_thinking`
# when routed through vLLM-style hosts (HuggingFace router, Together). On
# OpenRouter it surfaces as the `reasoning` field like other models on
# that platform. Prompt caching is DeepSeek-native (automatic, billed as
# cache_read tokens), so declare it wherever the host forwards the usage.

[[provider.together]]
model_match = "deepseek-ai/deepseek-v3*"
native_tools = true
thinking = true
prompt_caching = true
server_parser = "none"
honors_chat_template_kwargs = true

[[provider.huggingface]]
model_match = "deepseek-ai/deepseek-v3*"
native_tools = true
thinking = true
prompt_caching = true
server_parser = "none"
honors_chat_template_kwargs = true

[[provider.openrouter]]
model_match = "deepseek/deepseek-v3*"
native_tools = true
thinking = true
prompt_caching = true
server_parser = "none"
honors_chat_template_kwargs = false

# ---------- Google Gemini + Gemma 4 (via OpenRouter) -------------------------
#
# Gemini 2.5 exposes thinking budgets through OpenRouter's `reasoning`
# field. Gemma 4 does not; it's a plain instruction-tuned dense / MoE
# model with native tool calling.

[[provider.openrouter]]
model_match = "google/gemini-2.5*"
native_tools = true
thinking = true
prompt_caching = true

[[provider.openrouter]]
model_match = "google/gemma-4*"
native_tools = true

# ---------- Moonshot Kimi K2.5 (via Together) --------------------------------
#
# Kimi K2.5 does OpenAI-compatible native tool calls; no thinking surface.

[[provider.together]]
model_match = "moonshotai/*"
native_tools = true

# ---------- Mock --------------------------------------------------------------
#
# Mock spoofs either Anthropic or OpenAI shape depending on the model ID.
# Handled specially in the loader (see `capabilities::lookup`): Claude-
# shape model strings route to the `anthropic` rule list first, otherwise
# fall through to the `openai` rule list.
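#
# e.g. a mock ID like `claude-mock` (illustrative) resolves against the
# `[[provider.anthropic]]` rules above, while `gpt-mock` falls through
# to the `[[provider.openai]]` list.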

# ---------- provider_family aliases ------------------------------------------

[provider_family]
openrouter = "openai"
together = "openai"
groq = "openai"
deepseek = "openai"
fireworks = "openai"
huggingface = "openai"
local = "openai"
# New April 2026 routes for Qwen3.6:
dashscope = "openai"
llamacpp = "openai"
mlx = "openai"
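
# e.g. with no `[[provider.groq]]` rules declared above, a groq route
# for a `gpt-*` model inherits the `[[provider.openai]]` rules, while
# an unmatched model ID on groq resolves to empty capabilities.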