# harn-vm 0.8.79 - Docs.rs

# @generated by `harn providers build-config`; do not edit directly.
# Edit crates/harn-vm/src/llm/catalog_sources/**/*.toml instead.

# --- source: 00-base.toml ---
# Harn's built-in LLM provider/model catalog source fragments.
#
# The files under catalog_sources/ are the source of truth for Harn's
# bundled defaults:
# providers, model aliases, inference + tier routing rules, canonical
# model metadata + pricing, qc defaults, and per-pattern hyperparameter
# overrides. `harn providers build-config` concatenates these fragments into
# llm/providers.toml, which deserializes into `ProvidersConfig` via the same
# Serde pipeline that loads HARN_PROVIDERS_CONFIG /
# ~/.config/harn/providers.toml / harn.toml [providers] /
# package-manifest [llm] sections at runtime.
#
# Resolution order at startup (later overlays win on a per-key basis):
#   1. Generated llm/providers.toml (embedded into the VM via include_str!)
#   2. ~/.config/harn/providers.toml (user-global override)
#   3. HARN_PROVIDERS_CONFIG env var (explicit per-process override)
#   4. Per-run programmatic overlays installed by hosts via
#      llm_config::set_user_overrides()
#
# Edit these fragments to change defaults, then run
# `harn providers build-config` and `harn providers export`. Do not re-add
# equivalent data as Rust literals in llm_config.rs — that creates the
# parallel system this catalog exists to eliminate.

# Root-level key naming the [providers.anthropic] table defined below.
# NOTE(review): presumably the provider chosen when neither the caller nor an
# inference rule selects one — confirm against llm_config.rs.
default_provider = "anthropic"

# --- source: 10-providers/all.toml ---
# ── Providers ────────────────────────────────────────────────────────────────
# Each [providers.X] block defines an LLM endpoint Harn can dial. The
# `auth_env` field can be a single string or an array (tried in order).
# `cost_per_1k_in/out` are coarse provider-level fallbacks used when a
# specific [models.X] entry has no `pricing` table.

# Anthropic direct API: key from ANTHROPIC_API_KEY is sent in the `x-api-key`
# header, together with a pinned `anthropic-version` header.
[providers.anthropic]
base_url = "https://api.anthropic.com/v1"
chat_endpoint = "/messages"
auth_style = "header"
auth_header = "x-api-key"
auth_env = "ANTHROPIC_API_KEY"
extra_headers.anthropic-version = "2023-06-01"
features = ["prompt_caching", "thinking"]
cost_per_1k_in = 0.003
cost_per_1k_out = 0.015
latency_p50_ms = 2500
# Healthcheck POSTs a one-character message to the token-counting endpoint.
healthcheck.method = "POST"
healthcheck.path = "/messages/count_tokens"
healthcheck.body = '{"model":"claude-sonnet-4-6","messages":[{"role":"user","content":"x"}]}'

# OpenAI direct API; bearer auth with the key named by OPENAI_API_KEY.
[providers.openai]
base_url = "https://api.openai.com/v1"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "OPENAI_API_KEY"
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/models"

# OpenRouter; bearer auth with OPENROUTER_API_KEY. Healthcheck hits the
# key-introspection path rather than the model list.
[providers.openrouter]
base_url = "https://openrouter.ai/api/v1"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "OPENROUTER_API_KEY"
cost_per_1k_in = 0.003
cost_per_1k_out = 0.015
latency_p50_ms = 2200
healthcheck.method = "GET"
healthcheck.path = "/auth/key"

# Hugging Face router; bearer auth, trying HF_TOKEN then HUGGINGFACE_API_KEY.
# The healthcheck uses an absolute `url` (different host from `base_url`).
[providers.huggingface]
base_url = "https://router.huggingface.co/v1"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = ["HF_TOKEN", "HUGGINGFACE_API_KEY"]
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 2400
healthcheck.method = "GET"
healthcheck.url = "https://huggingface.co/api/whoami-v2"

# Ollama defaults to /api/chat (native NDJSON) so the test stubs keep
# working; hosts can flip to /v1/chat/completions via a providers.toml
# overlay to bypass Ollama's per-model tool-call post-processors
# (qwen3coder.go, qwen35.go) that raise HTTP 500s on text-mode responses
# for the Qwen3.5 family.
# Unauthenticated local daemon; `base_url_env` names OLLAMA_HOST.
[providers.ollama]
base_url = "http://localhost:11434"
base_url_env = "OLLAMA_HOST"
chat_endpoint = "/api/chat"
completion_endpoint = "/api/generate"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 1200
healthcheck.method = "GET"
healthcheck.path = "/api/tags"

# Google Gemini API; key sent in the `x-goog-api-key` header, read from
# GEMINI_API_KEY or, failing that, GOOGLE_API_KEY.
[providers.gemini]
base_url = "https://generativelanguage.googleapis.com"
base_url_env = "GEMINI_BASE_URL"
chat_endpoint = "/v1beta/models"
auth_style = "header"
auth_header = "x-goog-api-key"
auth_env = ["GEMINI_API_KEY", "GOOGLE_API_KEY"]
cost_per_1k_in = 0.00125
cost_per_1k_out = 0.005
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/v1beta/models"

# Mistral direct API; bearer auth with MISTRAL_API_KEY.
[providers.mistral]
base_url = "https://api.mistral.ai/v1"
base_url_env = "MISTRAL_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "MISTRAL_API_KEY"
features = ["native_tools"]
cost_per_1k_in = 0.0005
cost_per_1k_out = 0.0015
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/models"

# Cohere via its `/compatibility/v1` base path; bearer auth with COHERE_API_KEY.
[providers.cohere]
base_url = "https://api.cohere.ai/compatibility/v1"
base_url_env = "COHERE_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "COHERE_API_KEY"
features = ["native_tools", "reasoning"]
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1900
healthcheck.method = "GET"
healthcheck.path = "/models"

# xAI direct API; bearer auth with XAI_API_KEY.
[providers.xai]
base_url = "https://api.x.ai/v1"
base_url_env = "XAI_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "XAI_API_KEY"
features = ["responses_api", "native_tools", "reasoning"]
cost_per_1k_in = 0.001
cost_per_1k_out = 0.002
latency_p50_ms = 1600
healthcheck.method = "GET"
healthcheck.path = "/models"

# Together AI; bearer auth with TOGETHER_AI_API_KEY.
[providers.together]
base_url = "https://api.together.xyz/v1"
base_url_env = "TOGETHER_AI_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "TOGETHER_AI_API_KEY"
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1600
healthcheck.method = "GET"
healthcheck.path = "/models"

# Groq — OpenAI-compatible LPU-hosted fast inference. Headline ~840 tok/s
# on Llama 3.1 8B, ~594 tok/s on Llama 4 Scout. Useful executor target
# when sub-100ms TTFT matters more than raw quality.
# Bearer auth with GROQ_API_KEY; base path is Groq's `/openai/v1` surface.
[providers.groq]
base_url = "https://api.groq.com/openai/v1"
base_url_env = "GROQ_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "GROQ_API_KEY"
cost_per_1k_in = 0.0001
cost_per_1k_out = 0.0003
latency_p50_ms = 450
healthcheck.method = "GET"
healthcheck.path = "/models"

# Cerebras — OpenAI-compatible wafer-scale inference. High token throughput
# makes it a strong fit for latency-budgeted binder workloads; end-to-end
# p50 still includes client/provider round-trip time, so callers should keep
# their own wall-clock budget. Provider-level pricing here is a coarse
# default; per-model rows under [models.X] hold the authoritative numbers
# from Cerebras's public model discovery endpoint.
# Bearer auth with CEREBRAS_API_KEY. Costs below are the coarse provider-level
# fallback; per-model rows carry the precise figures.
[providers.cerebras]
base_url = "https://api.cerebras.ai/v1"
base_url_env = "CEREBRAS_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "CEREBRAS_API_KEY"
features = ["native_tools"]
cost_per_1k_in = 0.00035
cost_per_1k_out = 0.00075
latency_p50_ms = 150
healthcheck.method = "GET"
healthcheck.path = "/models"

# DeepSeek direct API; bearer auth with DEEPSEEK_API_KEY.
[providers.deepseek]
base_url = "https://api.deepseek.com/v1"
base_url_env = "DEEPSEEK_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "DEEPSEEK_API_KEY"
cost_per_1k_in = 0.00014
cost_per_1k_out = 0.00028
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/models"

# Fireworks AI; bearer auth with FIREWORKS_API_KEY.
[providers.fireworks]
base_url = "https://api.fireworks.ai/inference/v1"
base_url_env = "FIREWORKS_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "FIREWORKS_API_KEY"
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1400
healthcheck.method = "GET"
healthcheck.path = "/models"

# DashScope international endpoint in `compatible-mode`; bearer auth with
# DASHSCOPE_API_KEY.
[providers.dashscope]
base_url = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
base_url_env = "DASHSCOPE_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "DASHSCOPE_API_KEY"
cost_per_1k_in = 0.0003
cost_per_1k_out = 0.0012
latency_p50_ms = 1600
healthcheck.method = "GET"
healthcheck.path = "/models"

# MiniMax — OpenAI-compatible endpoint for MiniMax M2/M3 models. The direct
# API serves MiniMax-M3 plus the open-weight M2 family on the same
# /v1/chat/completions URL; the wire format mirrors OpenAI chat completions
# with native tool calls and model-specific thinking controls.
# Bearer auth with MINIMAX_API_KEY.
[providers.minimax]
base_url = "https://api.minimax.io/v1"
base_url_env = "MINIMAX_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "MINIMAX_API_KEY"
cost_per_1k_in = 0.0006
cost_per_1k_out = 0.0024
latency_p50_ms = 1700
healthcheck.method = "GET"
healthcheck.path = "/models"

# Z.AI — host of the GLM family (GLM-4.6, GLM-4.7). Z.AI publishes both a
# native PaaS endpoint (`/api/paas/v4`) and an OpenAI-compatible endpoint
# (`/v1`); Harn dials the OpenAI-compatible one so the existing
# openai_chat_completions wire format Just Works. The provider also
# accepts the legacy `ZHIPU_API_KEY` env var; callers can rely on the
# `auth_env` array trying each in order.
# Bearer auth; ZAI_API_KEY is tried first, then the legacy ZHIPU_API_KEY.
[providers.zai]
base_url = "https://api.z.ai/v1"
base_url_env = "ZAI_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = ["ZAI_API_KEY", "ZHIPU_API_KEY"]
cost_per_1k_in = 0.0004
cost_per_1k_out = 0.0017
latency_p50_ms = 1900
healthcheck.method = "GET"
healthcheck.path = "/models"

# AWS Bedrock — resolves credentials through env, profile, container, or
# EC2 instance roles, then signs Converse API calls with SigV4.
# `base_url` is intentionally empty here and `auth_env` is absent; this row
# declares no provider-level cost fallback and no healthcheck.
[providers.bedrock]
base_url = ""
base_url_env = "BEDROCK_BASE_URL"
chat_endpoint = "/model/{model}/converse"
auth_style = "aws_sigv4"
features = ["native_tools"]
latency_p50_ms = 2600

# Azure OpenAI — deployment name is routed in the URL; callers can
# either pass the deployment as the Harn model field or set
# AZURE_OPENAI_DEPLOYMENT.
# URL templates carry `{resource}`, `{deployment}` and `{api_version}`
# placeholders; credentials are tried in the listed order.
[providers.azure_openai]
base_url = "https://{resource}.openai.azure.com"
base_url_env = "AZURE_OPENAI_ENDPOINT"
chat_endpoint = "/openai/deployments/{deployment}/chat/completions?api-version={api_version}"
auth_style = "azure_openai"
auth_env = [
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_AD_TOKEN",
    "AZURE_OPENAI_BEARER_TOKEN",
]
features = ["native_tools"]
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1900

# Google Vertex AI `generateContent`; the endpoint template carries
# `{project}`, `{location}` and `{model}` placeholders. Bearer auth, with
# credentials tried in the listed order.
[providers.vertex]
base_url = "https://aiplatform.googleapis.com/v1"
base_url_env = "VERTEX_AI_BASE_URL"
chat_endpoint = "/projects/{project}/locations/{location}/publishers/google/models/{model}:generateContent"
auth_style = "bearer"
auth_env = [
    "VERTEX_AI_ACCESS_TOKEN",
    "GOOGLE_OAUTH_ACCESS_TOKEN",
    "GOOGLE_APPLICATION_CREDENTIALS",
]
features = ["native_tools"]
cost_per_1k_in = 0.00125
cost_per_1k_out = 0.005
latency_p50_ms = 2100

# Generic unauthenticated local server exposing `/v1/...` routes.
# NOTE(review): shares the default http://localhost:8000 with [providers.vllm].
[providers.local]
base_url = "http://localhost:8000"
base_url_env = "LOCAL_LLM_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900
healthcheck.method = "GET"
healthcheck.path = "/v1/models"

# llama.cpp — separate from `local` so capability rules can isolate Qwen
# chat-template thinking quirks from other local OpenAI-compatible hosts.
# Unauthenticated; default port 8001 matches `local_runtime.default_port`.
[providers.llamacpp]
base_url = "http://127.0.0.1:8001"
base_url_env = "LLAMACPP_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900
healthcheck.method = "GET"
healthcheck.path = "/v1/models"

# Apple Silicon MLX. Harn owns readiness probing; hosts that want
# script-based auto-start should launch the process first, then call
# Harn again to verify readiness.
# Unauthenticated; default port 8002 matches `local_runtime.default_port`.
[providers.mlx]
base_url = "http://127.0.0.1:8002"
base_url_env = "MLX_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900
healthcheck.method = "GET"
healthcheck.path = "/v1/models"

# Unauthenticated vLLM server exposing `/v1/...` routes.
# NOTE(review): shares the default http://localhost:8000 with [providers.local].
[providers.vllm]
base_url = "http://localhost:8000"
base_url_env = "VLLM_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 800
healthcheck.method = "GET"
healthcheck.path = "/v1/models"

# Unauthenticated TGI server; unlike the other local hosts its healthcheck
# probes `/health` instead of `/v1/models`.
[providers.tgi]
base_url = "http://localhost:8080"
base_url_env = "TGI_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 950
healthcheck.method = "GET"
healthcheck.path = "/health"

# --- source: 12-local-runtime/lifecycle.toml ---
# Local runtime lifecycle metadata for `harn local`.
# These rows describe provider mechanics, not machine-specific model paths.

# Ollama is driven through its daemon API rather than a process Harn launches.
[providers.ollama.local_runtime]
# How the runtime is reached.
kind = "daemon_api"
command = "ollama"
default_port = 11434
# How a model is released.
stop = "keep_alive_zero"
notes = "Load via Ollama generate/chat warmup; unload by posting an empty prompt with keep_alive=0."
# Provenance of the facts above.
source_url = "https://github.com/ollama/ollama/blob/main/docs/api.md"
last_verified = "2026-06-05"

# llama-server launched as a managed process and stopped by PID.
[providers.llamacpp.local_runtime]
kind = "managed_process"
command = "llama-server"
default_port = 8001
stop = "pid"
model_source_env = "LLAMACPP_MODEL"
# CLI flag names for each launch parameter.
model_arg = "--model"
served_model_arg = "--alias"
host_arg = "--host"
port_arg = "--port"
ctx_arg = "--ctx-size"
parallel_arg = "--parallel"
gpu_layers_arg = "--n-gpu-layers"
cache_type_k_arg = "--cache-type-k"
cache_type_v_arg = "--cache-type-v"
cache_ram_arg = "--cache-ram"
# Always-on arguments; each flag that takes a value is kept on one line with it.
default_args = [
    "--jinja",
    "--reasoning", "off",
    "--reasoning-format", "deepseek",
    "--metrics",
    "--flash-attn", "on",
]
notes = "OpenAI-compatible HTTP server. Use --model-source or LLAMACPP_MODEL for the GGUF path; Harn records the launched PID for local stop."
source_url = "https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md"
last_verified = "2026-06-05"

# mlx_lm.server launched as a managed process and stopped by PID.
[providers.mlx.local_runtime]
kind = "managed_process"
command = "mlx_lm.server"
default_port = 8002
stop = "pid"
model_source_env = "MLX_MODEL"
# CLI flag names for each launch parameter.
model_arg = "--model"
host_arg = "--host"
port_arg = "--port"
notes = "OpenAI-like MLX-LM server. Use --model-source or MLX_MODEL for an MLX-compatible path or Hugging Face repo id."
source_url = "https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md"
last_verified = "2026-06-05"

# --- source: 20-routing/inference.toml ---
# ── Inference rules ──────────────────────────────────────────────────────────
# Map a model ID shape to a default provider when the caller doesn't
# specify one. First match wins. User overlays prepend, so they can
# preempt these defaults without removing them.

# Direct vendor IDs: `claude-*` → anthropic; `gpt-*` and the o1/o3/o4
# families → openai. Entry order is significant (first match wins), so keep
# new rules in the intended precedence position.
[[inference_rules]]
pattern = "claude-*"
provider = "anthropic"

[[inference_rules]]
pattern = "gpt-*"
provider = "openai"

# The o-series patterns have no dash before the wildcard.
[[inference_rules]]
pattern = "o1*"
provider = "openai"

[[inference_rules]]
pattern = "o3*"
provider = "openai"

[[inference_rules]]
pattern = "o4*"
provider = "openai"

# Vendor-dotted IDs route to bedrock. Note `mistral.*` and `cohere.*` (dot)
# are distinct from the `mistral-*` / `command-*` rules for the direct APIs.
[[inference_rules]]
pattern = "anthropic.claude-*"
provider = "bedrock"

[[inference_rules]]
pattern = "meta.llama*"
provider = "bedrock"

[[inference_rules]]
pattern = "amazon.*"
provider = "bedrock"

[[inference_rules]]
pattern = "mistral.*"
provider = "bedrock"

[[inference_rules]]
pattern = "cohere.*"
provider = "bedrock"

# Dash-prefixed family names route to each vendor's direct API.
[[inference_rules]]
pattern = "gemini-*"
provider = "gemini"

# Both `mistral-*` and `devstral-*` resolve to the mistral provider.
[[inference_rules]]
pattern = "mistral-*"
provider = "mistral"

[[inference_rules]]
pattern = "devstral-*"
provider = "mistral"

[[inference_rules]]
pattern = "command-*"
provider = "cohere"

[[inference_rules]]
pattern = "grok-*"
provider = "xai"

# Slash-prefixed form: callers write `groq/<model>`.
[[inference_rules]]
pattern = "groq/*"
provider = "groq"

# Cerebras model IDs come back as bare names ("gpt-oss-120b",
# "llama-3.3-70b") from /v1/models, so callers slash-prefix
# them as "cerebras/<model>" to disambiguate from OpenRouter's
# one-slash convention. Match the prefix before the generic
# single-slash rule routes it elsewhere.
# NOTE(review): the "generic single-slash rule" referenced in the comments
# around here is not one of the [[inference_rules]] entries in this section —
# presumably it is implemented elsewhere; confirm before relying on ordering.
[[inference_rules]]
pattern = "cerebras/*"
provider = "cerebras"

# MiniMax — released-name canonical IDs (capital-M) sit on the direct
# API. OpenRouter mirrors the same family under `minimax/*` slugs; that
# routing is handled by the generic slash-prefix rule.
# The pattern is written with the capital-M `MiniMax-` prefix.
# NOTE(review): presumably matched case-sensitively — confirm in the matcher.
[[inference_rules]]
pattern = "MiniMax-*"
provider = "minimax"

# Z.AI — GLM family. Bare IDs ("glm-5", "glm-5.1") dial the direct
# OpenAI-compatible Z.AI endpoint; `zhipu/*` is accepted as a legacy
# prefix some packagers still use.
# Two patterns, one provider: bare `glm-*` IDs and the legacy `zhipu/` prefix.
[[inference_rules]]
pattern = "glm-*"
provider = "zai"

[[inference_rules]]
pattern = "zhipu/*"
provider = "zai"

# DeepSeek — V4 family. The direct API uses bare IDs while OpenRouter
# slugs them as `deepseek/*`; only the bare form needs an inference
# rule because slash-prefixed IDs hit the generic OpenRouter rule.
# `deepseek-v4*` carries a wildcard; `deepseek-chat` and `deepseek-reasoner`
# are written without one.
[[inference_rules]]
pattern = "deepseek-v4*"
provider = "deepseek"

[[inference_rules]]
pattern = "deepseek-chat"
provider = "deepseek"

[[inference_rules]]
pattern = "deepseek-reasoner"
provider = "deepseek"

# --- source: 20-routing/tier-defaults.toml ---
# ── Tier defaults ────────────────────────────────────────────────────────────
# Tier is self-declared on each model row via `tier = "small" | "mid" |
# "frontier" | "reasoning"`. The legacy pattern-based [[tier_rules]] table
# has been removed; the catalog is the single source of truth. Models
# without an explicit `tier` resolve to `tier_defaults.default`.

# Fallback tier applied to model rows that do not declare their own `tier`.
[tier_defaults]
default = "mid"

# --- source: 30-aliases/aliases.toml ---
# ── Aliases ──────────────────────────────────────────────────────────────────
# Short symbolic names → (model id, provider, optional tool_format). The
# tier-resolution path (`resolve_tier_model("frontier", None)`) reads
# `frontier`, `mid`, `small`; provider-scoped tiers like `tier/mid` let
# callers force a specific resolution per provider.

# Short flagship aliases — these track whatever the current
# generation is. Bump these when a successor lands.
# NOTE(review): `sonnet`, `frontier` and `tier/frontier` all resolve to
# claude-sonnet-4-6, while the model catalog also lists a non-deprecated
# claude-sonnet-4-7 row — confirm whether these aliases are due a bump.
[aliases.sonnet]
id = "claude-sonnet-4-6"
provider = "anthropic"

[aliases.opus]
id = "claude-opus-4-8"
provider = "anthropic"

[aliases.haiku]
id = "claude-haiku-4-5-20251001"
provider = "anthropic"

# Tier aliases: every bare tier name is mirrored by a `tier/<name>` alias with
# the same target. The `tier/...` keys must stay quoted because of the slash.
[aliases.frontier]
id = "claude-sonnet-4-6"
provider = "anthropic"

[aliases."tier/frontier"]
id = "claude-sonnet-4-6"
provider = "anthropic"

[aliases.mid]
id = "gpt-4o-mini"
provider = "openai"

[aliases."tier/mid"]
id = "gpt-4o-mini"
provider = "openai"

[aliases.small]
id = "Qwen/Qwen3.5-9B"
provider = "openrouter"

[aliases."tier/small"]
id = "Qwen/Qwen3.5-9B"
provider = "openrouter"

# Local Gemma 4 variants (vLLM / OpenAI-compat backend at `providers.local`).
# `local` provider: the unsuffixed alias and `-26b` share one target.
[aliases.local-gemma4]
id = "gemma-4-26b-a4b-it"
provider = "local"

[aliases.local-gemma4-26b]
id = "gemma-4-26b-a4b-it"
provider = "local"

[aliases.local-gemma4-31b]
id = "gemma-4-31b-it"
provider = "local"

[aliases.local-gemma4-12b]
id = "gemma-4-12b-it"
provider = "local"

[aliases.local-gemma4-e4b]
id = "gemma-4-e4b-it"
provider = "local"

[aliases.local-gemma4-e2b]
id = "gemma-4-e2b-it"
provider = "local"

# `ollama` provider: every Gemma 4 alias pins text-mode tool calling.
[aliases.ollama-gemma4]
id = "gemma4:26b"
provider = "ollama"
tool_format = "text"

[aliases.ollama-gemma4-26b]
id = "gemma4:26b"
provider = "ollama"
tool_format = "text"

[aliases.ollama-gemma4-12b]
id = "gemma4:12b-mlx"
provider = "ollama"
tool_format = "text"

[aliases.ollama-gemma4-12b-nvfp4]
id = "gemma4:12b-nvfp4"
provider = "ollama"
tool_format = "text"

# Gemma 4 26B/31B via hosted APIs (the 12B is on-device only). The Gemini API
# serves Gemma under `models/`-prefixed ids; OpenRouter/Together use
# org-prefixed (`google/`) ids.
# Gemini API targets.
[aliases.gemini-gemma4-31b]
id = "models/gemma-4-31b-it"
provider = "gemini"

[aliases.gemini-gemma4-26b]
id = "models/gemma-4-26b-a4b-it"
provider = "gemini"

# OpenRouter targets.
[aliases.openrouter-gemma4-31b]
id = "google/gemma-4-31b-it"
provider = "openrouter"

[aliases.openrouter-gemma4-26b]
id = "google/gemma-4-26b-a4b-it"
provider = "openrouter"

# NOTE(review): this id spells `31B` in uppercase, unlike the OpenRouter id
# above — presumably mirrors Together's published id; confirm.
[aliases.together-gemma4-31b]
id = "google/gemma-4-31B-it"
provider = "together"

# qwen3.6 has no working Ollama route — Ollama's qwen3.5-family server-side
# tool-call parser 500s on text-tool output (ollama/ollama#14986, #14570).
# Use the llamacpp provider for local qwen3.x.

# llama.cpp — Unsloth Dynamic 2.0 GGUF served by llama-server.
# Alias keys here contain a dot, so they must stay quoted.
# NOTE(review): this unsuffixed alias uses tool_format = "text" while the three
# aliases below (which share a different, quantized id) use "native" — confirm
# the difference is intentional.
[aliases."llamacpp-qwen3.6"]
id = "qwen3.6-35b-a3b"
provider = "llamacpp"
tool_format = "text"

# The next three aliases are synonyms for the same quantized model id.
[aliases."llamacpp-qwen3.6-q4"]
id = "qwen3.6-35b-a3b-ud-q4-k-xl"
provider = "llamacpp"
tool_format = "native"

[aliases."local-qwen3.6"]
id = "qwen3.6-35b-a3b-ud-q4-k-xl"
provider = "llamacpp"
tool_format = "native"

[aliases."local-qwen3.6-gguf"]
id = "qwen3.6-35b-a3b-ud-q4-k-xl"
provider = "llamacpp"
tool_format = "native"

# MLX (Apple Silicon).
# All four aliases resolve to the same MLX model id.
# NOTE(review): this dot-free spelling omits `tool_format`, whereas the three
# dotted spellings below set it to "native" — confirm whether that is intended.
[aliases.mlx-qwen36-27b]
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
provider = "mlx"

[aliases."mlx-qwen3.6-27b"]
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
provider = "mlx"
tool_format = "native"

[aliases."mlx-qwen3.6-27b-q4"]
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
provider = "mlx"
tool_format = "native"

[aliases."local-qwen3.6-27b"]
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
provider = "mlx"
tool_format = "native"

# MiniMax direct API aliases.
# The unversioned alias and `minimax-m3` share one target. Only the keys that
# contain a dot need quoting.
[aliases.minimax]
id = "MiniMax-M3"
provider = "minimax"

[aliases.minimax-m3]
id = "MiniMax-M3"
provider = "minimax"

[aliases.minimax-m2]
id = "MiniMax-M2"
provider = "minimax"

[aliases."minimax-m2.5"]
id = "MiniMax-M2.5"
provider = "minimax"

[aliases."minimax-m2.7"]
id = "MiniMax-M2.7"
provider = "minimax"

# Z.AI GLM aliases.
# The unversioned alias and `glm-5.1` share one target; `glm-5.1` keeps its
# quotes because the key contains a dot.
[aliases.glm]
id = "glm-5.1"
provider = "zai"

[aliases."glm-5.1"]
id = "glm-5.1"
provider = "zai"

[aliases.glm-5]
id = "glm-5"
provider = "zai"

# DeepSeek V4 direct API aliases.
# Aliases resolving to deepseek-v4-flash (including the unversioned name).
[aliases.deepseek]
id = "deepseek-v4-flash"
provider = "deepseek"

[aliases.deepseek-flash]
id = "deepseek-v4-flash"
provider = "deepseek"

[aliases.deepseek-v4-flash]
id = "deepseek-v4-flash"
provider = "deepseek"

# Aliases resolving to deepseek-v4-pro.
[aliases.deepseek-pro]
id = "deepseek-v4-pro"
provider = "deepseek"

[aliases.deepseek-v4-pro]
id = "deepseek-v4-pro"
provider = "deepseek"

# Cohere: both aliases resolve to the same dated model id.
[aliases.cohere]
id = "command-a-plus-05-2026"
provider = "cohere"

[aliases.command-a-plus]
id = "command-a-plus-05-2026"
provider = "cohere"

# xAI: both aliases resolve to the same model id.
[aliases.grok-code]
id = "grok-build-0.1"
provider = "xai"

[aliases.grok-code-fast]
id = "grok-build-0.1"
provider = "xai"

# Devstral (Mistral's agentic-coding tune).
# All three aliases resolve to the same Ollama tag; they differ only in
# `tool_format`. Each alias sets provider = "ollama" explicitly.
# NOTE(review): the `devstral-*` inference rule maps to mistral — presumably
# an alias's explicit provider takes precedence; confirm.
[aliases.devstral-small-2]
id = "devstral-small-2:24b"
provider = "ollama"
tool_format = "text"

[aliases.ollama-devstral-small-2]
id = "devstral-small-2:24b"
provider = "ollama"
tool_format = "text"

# Same model, but opting into native tool calls.
[aliases.ollama-devstral-small-2-native]
id = "devstral-small-2:24b"
provider = "ollama"
tool_format = "native"

# --- source: 30-aliases/tool-calling.toml ---
# ── Alias tool-calling probe state ───────────────────────────────────────────
# Per-alias overrides recording the last-observed native vs. text vs.
# streaming tool-call probe outcome and the desired fallback. Hosts may
# update these via providers.toml overlays as they re-probe a model.

# Unprobed aliases: every probe result is "unknown", fallback is disabled, and
# the recorded reason is that a tool probe is still required.
[alias_tool_calling.ollama-gemma4]
native = "unknown"
text = "unknown"
streaming_native = "unknown"
fallback_mode = "disabled"
failure_reason = "requires_tool_probe"

[alias_tool_calling.ollama-gemma4-12b]
native = "unknown"
text = "unknown"
streaming_native = "unknown"
fallback_mode = "disabled"
failure_reason = "requires_tool_probe"

# llama.cpp Qwen 3.6 aliases: native and streaming-native probes recorded as
# passing; text mode was not probed. `last_probe_at` is stored as a string.
[alias_tool_calling."llamacpp-qwen3.6-q4"]
native = "pass"
text = "unknown"
streaming_native = "pass"
fallback_mode = "native"
last_probe_at = "2026-06-05"

[alias_tool_calling."local-qwen3.6"]
native = "pass"
text = "unknown"
streaming_native = "pass"
fallback_mode = "native"
last_probe_at = "2026-06-05"

[alias_tool_calling."local-qwen3.6-gguf"]
native = "pass"
text = "unknown"
streaming_native = "pass"
fallback_mode = "native"
last_probe_at = "2026-06-05"

# MLX alias: nothing probed yet, but the fallback is still set to "native"
# (unlike the unprobed Ollama rows, which disable fallback).
[alias_tool_calling."mlx-qwen3.6-27b"]
native = "unknown"
text = "unknown"
streaming_native = "unknown"
fallback_mode = "native"
failure_reason = "requires_served_identity_and_tool_probe"

# --- source: 40-defaults/qc.toml ---
# ── QC defaults ──────────────────────────────────────────────────────────────
# Default low-cost model per provider for cheap quality-check / repair
# passes. Scripts read these via `qc_default_model(provider)`.

# One entry per provider name → model id. Providers declared in this file but
# absent here (e.g. gemini, together, cerebras) have no QC default.
[qc_defaults]
anthropic = "claude-haiku-4-5-20251001"
openai = "gpt-4o-mini"
openrouter = "google/gemini-2.5-flash"
ollama = "llama3.2"
# NOTE(review): an OpenAI model id for the self-hosted `local` provider —
# presumably a placeholder/test stub; confirm this is intentional.
local = "gpt-4o"
mistral = "mistral-small-2603"
cohere = "command-a-plus-05-2026"
xai = "grok-build-0.1"
groq = "llama-3.1-8b-instant"
minimax = "MiniMax-M2.5-highspeed"
zai = "glm-5"
deepseek = "deepseek-v4-flash"

# --- source: 60-models/00-anthropic.toml ---
# ── Models ───────────────────────────────────────────────────────────────────
# Canonical model metadata: display name, provider, context window,
# capabilities, pricing (USD per 1M tokens), and deprecation status.
# Pricing reflects public provider pages snapshotted at the comment
# beside each section; edit the literal here and the change shows up in
# `git blame`.

# Anthropic ─ pricing pages: https://www.anthropic.com/pricing &
# https://platform.claude.com/docs/en/about-claude/model-deprecations.
# Sonnet 4.5 retired 2026-05-15; Sonnet 4 and Opus 4 retire 2026-06-15.

# Claude Haiku 3.5 on the direct Anthropic API; small tier.
[models."claude-3-5-haiku-20241022"]
name = "Claude Haiku 3.5"
provider = "anthropic"
tier = "small"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["speed", "cheap", "summarization", "tool_use"]
# USD per 1M tokens.
pricing.input_per_mtok = 0.80
pricing.output_per_mtok = 4.00
pricing.cache_read_per_mtok = 0.08
pricing.cache_write_per_mtok = 1.00

# Claude Haiku 4.5 on the direct Anthropic API; mid tier.
[models."claude-haiku-4-5-20251001"]
name = "Claude Haiku 4.5"
provider = "anthropic"
tier = "mid"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["speed", "cheap", "coding", "tool_use", "summarization"]
# USD per 1M tokens.
pricing.input_per_mtok = 1.00
pricing.output_per_mtok = 5.00
pricing.cache_read_per_mtok = 0.10
pricing.cache_write_per_mtok = 1.25

# Claude Sonnet 3.5, June 2024 snapshot; frontier tier.
[models."claude-3-5-sonnet-20240620"]
name = "Claude Sonnet 3.5 (2024-06-20)"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context"]
# USD per 1M tokens.
pricing.input_per_mtok = 3.00
pricing.output_per_mtok = 15.00
pricing.cache_read_per_mtok = 0.30
pricing.cache_write_per_mtok = 3.75

# Claude Sonnet 3.5, October 2024 snapshot; frontier tier.
[models."claude-3-5-sonnet-20241022"]
name = "Claude Sonnet 3.5 (2024-10-22)"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context"]
# USD per 1M tokens.
pricing.input_per_mtok = 3.00
pricing.output_per_mtok = 15.00
pricing.cache_read_per_mtok = 0.30
pricing.cache_write_per_mtok = 3.75

# Claude Sonnet 4 — deprecated. `superseded_by` mirrors the successor named in
# `deprecation_note`, matching how the deprecated Opus rows are recorded.
[models."claude-sonnet-4-20250514"]
name = "Claude Sonnet 4"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
deprecated = true
deprecation_note = "Sunset 2026-06-15 per Anthropic deprecations page. Replaced by claude-sonnet-4-6."
superseded_by = "claude-sonnet-4-6"
tier = "frontier"
open_weight = false
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 49.0 }

# Claude Sonnet 4.5 — deprecated. `superseded_by` mirrors the successor named
# in `deprecation_note`, matching how the deprecated Opus rows are recorded.
[models."claude-sonnet-4-5"]
name = "Claude Sonnet 4.5"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
deprecated = true
deprecation_note = "Sunset 2026-05-15 per Anthropic deprecations page. Replaced by claude-sonnet-4-6."
superseded_by = "claude-sonnet-4-6"
tier = "frontier"
open_weight = false
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 77.2 }

# Claude Sonnet 4.6 on the direct Anthropic API; frontier tier.
[models."claude-sonnet-4-6"]
name = "Claude Sonnet 4.6"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
complementary_with = ["openai-gpt", "google-gemini", "qwen", "deepseek", "kimi"]
benchmarks.swe_bench_verified = 79.6
# USD per 1M tokens.
pricing.input_per_mtok = 3.00
pricing.output_per_mtok = 15.00
pricing.cache_read_per_mtok = 0.30
pricing.cache_write_per_mtok = 3.75

# Claude Sonnet 4.7 on the direct Anthropic API. All keys of this row are kept
# together; the OpenRouter section comment follows the row instead of
# splitting it.
[models."claude-sonnet-4-7"]
name = "Claude Sonnet 4.7"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
tier = "frontier"
open_weight = false
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 81.0 }

# OpenRouter-routed Anthropic models. Kept as distinct catalog entries
# so `openrouter:anthropic/claude-*` resolves with the right capability
# matrix — without these, native-tools requests fail with "option `tools`
# is not supported by ... (provider openrouter)" because the lookup
# falls back to a no-tools shape. OpenRouter passes Anthropic's
# native-tools API through verbatim, including prompt-caching headers
# (cache attribution surface differs — tracked separately in #2320).
# Pricing matches the direct Anthropic API; OpenRouter adds its own
# margin at request time.
# Claude Haiku 4.5 routed through OpenRouter; this row does not list the
# `thinking` capability.
[models."anthropic/claude-haiku-4-5"]
name = "Claude Haiku 4.5 (via OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching"]
strengths = ["speed", "cheap", "coding", "tool_use", "summarization"]
# USD per 1M tokens.
pricing.input_per_mtok = 1.00
pricing.output_per_mtok = 5.00
pricing.cache_read_per_mtok = 0.10
pricing.cache_write_per_mtok = 1.25

# Claude Sonnet 4.6 routed through OpenRouter; this row does not list the
# `thinking` capability and carries no benchmarks.
[models."anthropic/claude-sonnet-4-6"]
name = "Claude Sonnet 4.6 (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching"]
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
complementary_with = ["openai-gpt", "google-gemini", "qwen", "deepseek", "kimi"]
# USD per 1M tokens.
pricing.input_per_mtok = 3.00
pricing.output_per_mtok = 15.00
pricing.cache_read_per_mtok = 0.30
pricing.cache_write_per_mtok = 3.75

# Claude Opus 3 on the direct Anthropic API; frontier tier.
[models."claude-3-opus-20240229"]
name = "Claude Opus 3"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "long_context"]
# USD per 1M tokens.
pricing.input_per_mtok = 15.00
pricing.output_per_mtok = 75.00
pricing.cache_read_per_mtok = 1.50
pricing.cache_write_per_mtok = 18.75

# Claude Opus 4 — deprecated, with a dated sunset and a named successor.
[models."claude-opus-4-20250514"]
name = "Claude Opus 4"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
deprecated = true
deprecation_note = "Sunset 2026-06-15 per Anthropic deprecations page. Replaced by claude-opus-4-8."
superseded_by = "claude-opus-4-8"
benchmarks.swe_bench_verified = 77.6
# USD per 1M tokens.
pricing.input_per_mtok = 15.00
pricing.output_per_mtok = 75.00
pricing.cache_read_per_mtok = 1.50
pricing.cache_write_per_mtok = 18.75

# Claude Opus 4.1 — deprecated in favor of a named successor; no sunset date.
[models."claude-opus-4-1-20250805"]
name = "Claude Opus 4.1"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
deprecated = true
deprecation_note = "Superseded by claude-opus-4-8. No formal sunset yet; switch when convenient."
superseded_by = "claude-opus-4-8"
benchmarks.swe_bench_verified = 78.9
# USD per 1M tokens.
pricing.input_per_mtok = 15.00
pricing.output_per_mtok = 75.00
pricing.cache_read_per_mtok = 1.50
pricing.cache_write_per_mtok = 18.75

[models."claude-opus-4-6"]
name = "Claude Opus 4.6"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 25.00, cache_read_per_mtok = 0.50, cache_write_per_mtok = 6.25 }
deprecated = true
deprecation_note = "Superseded by claude-opus-4-8. No formal sunset yet; switch when convenient."
superseded_by = "claude-opus-4-8"
tier = "frontier"
open_weight = false
strengths = ["reasoning", "coding", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 80.8, swe_bench_pro = 53.4 }
# Fast mode DEPRECATED at the Opus 4.8 launch; removed ~30 days later.
fast_mode = { param = "speed", value = "fast", beta_header = "fast-mode-2026-02-01", otps_speedup = 2.5, status = "deprecated", pricing = { input_per_mtok = 30.00, output_per_mtok = 150.00, cache_read_per_mtok = 3.00, cache_write_per_mtok = 37.50 }, note = "Deprecated at the Opus 4.8 launch; removed ~30 days later, after which speed=fast silently falls back to standard speed/pricing. Migrate to Opus 4.8 or 4.7 fast mode." }
[models."claude-opus-4-7"]
name = "Claude Opus 4.7"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 25.00, cache_read_per_mtok = 0.50, cache_write_per_mtok = 6.25 }
deprecated = true
deprecation_note = "Superseded by claude-opus-4-8. No formal sunset yet; switch when convenient."
superseded_by = "claude-opus-4-8"
tier = "frontier"
open_weight = false
strengths = ["reasoning", "coding", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 87.6, swe_bench_pro = 64.3 }
# Fast mode (research preview): 6x standard pricing, ~2.5x output tok/s.
fast_mode = { param = "speed", value = "fast", beta_header = "fast-mode-2026-02-01", otps_speedup = 2.5, status = "research_preview", pricing = { input_per_mtok = 30.00, output_per_mtok = 150.00, cache_read_per_mtok = 3.00, cache_write_per_mtok = 37.50 }, note = "Claude API + Managed Agents only. Migrate to Opus 4.8 fast mode for the cheaper 2x rate." }

# Claude Opus 4.8 (released 2026-05-28) — Anthropic's most capable model
# for complex reasoning, long-horizon agentic coding, and high-autonomy
# work; the current `opus` alias target. Adaptive thinking only (extended
# thinking budgets return 400); the `effort` parameter controls reasoning
# depth and defaults to `high` on every surface. Sampling params
# (temperature/top_p/top_k) are rejected — steer via prompting instead.
# Natively a 1M-token context window (via the long-context beta); the
# catalog keeps the standard-tier 200k convention shared by every other
# Claude row. Pricing per the Anthropic models overview: $5 / $25 per
# MTok in/out (cache read 0.1x, 5-min cache write 1.25x).
[models."claude-opus-4-8"]
name = "Claude Opus 4.8"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 25.00, cache_read_per_mtok = 0.50, cache_write_per_mtok = 6.25 }
tier = "frontier"
open_weight = false
strengths = ["reasoning", "coding", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 88.6, swe_bench_pro = 69.2 }
complementary_with = ["openai-gpt", "google-gemini", "qwen", "deepseek", "kimi"]
# Fast mode (research preview): 2x standard pricing, ~2.5x output tok/s.
fast_mode = { param = "speed", value = "fast", beta_header = "fast-mode-2026-02-01", otps_speedup = 2.5, status = "research_preview", pricing = { input_per_mtok = 10.00, output_per_mtok = 50.00, cache_read_per_mtok = 1.00, cache_write_per_mtok = 12.50 }, note = "Claude API + Managed Agents only (not Bedrock/Vertex/Foundry); excluded from Batch and Priority Tier. Switching speed invalidates the prompt cache. Waitlist/account-manager gated." }

# --- source: 60-models/10-openai-gemini-mistral.toml ---
# OpenAI ─ pricing pages: https://platform.openai.com/docs/pricing.

# GPT-5.5 — current OpenAI frontier (Responses + Chat Completions).
# Reasoning model driven by `reasoning_effort`. Standard $5 / $30 per MTok
# (cached input $0.50); short-context rates shown — long-context (>200k)
# bills higher per OpenAI's pricing page. Accelerated serving rides the
# `service_tier` knob: Codex exposes it as "Fast mode"
# (service_tier = "fast", ~1.5x faster output) and the API as priority
# processing (service_tier = "priority"); both bill at 2.5x standard.
# Off by default.
[models."gpt-5.5"]
name = "GPT-5.5"
provider = "openai"
context_window = 400000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 30.00, cache_read_per_mtok = 0.50 }
tier = "frontier"
open_weight = false
strengths = ["reasoning", "coding", "tool_use", "long_context", "agentic"]
complementary_with = ["anthropic-claude", "google-gemini", "qwen", "deepseek", "kimi"]
fast_mode = { param = "service_tier", value = "fast", otps_speedup = 1.5, status = "ga", pricing = { input_per_mtok = 12.50, output_per_mtok = 75.00, cache_read_per_mtok = 1.25 }, note = "Codex \"Fast mode\" (service_tier=\"fast\", ~1.5x faster output) and API priority processing (service_tier=\"priority\") both bill at 2.5x standard. Not offered for long-context, fine-tuned models, or embeddings." }

# GPT-4o retired from ChatGPT 2026-02-13; chatgpt-4o-latest removed
# from API 2026-02-17 (Enterprise/Edu grace until 2026-04-03).
[models."gpt-4o"]
name = "GPT-4o"
provider = "openai"
context_window = 128000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 2.50, output_per_mtok = 10.00, cache_read_per_mtok = 1.25 }
deprecated = true
deprecation_note = "API sunset 2026-02-17 per OpenAI deprecations page. Switch to gpt-5-mini for cheap routing or gpt-5 for frontier."
tier = "frontier"
open_weight = false
strengths = ["coding", "vision", "tool_use"]
[models."gpt-4o-mini"]
name = "GPT-4o Mini"
provider = "openai"
context_window = 128000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60 }
# Not yet deprecated as of 2026-05 — OpenAI's deprecation page lists
# gpt-4o (Feb 17 2026 API sunset) but gpt-4o-mini has no announced
# sunset. Still the canonical `mid` tier default until gpt-5-mini ships
# with confirmed pricing.
tier = "mid"
open_weight = false
strengths = ["speed", "cheap", "summarization", "tool_use"]
complementary_with = ["anthropic-claude", "google-gemini", "qwen", "deepseek", "kimi"]
[models."gpt-4-turbo"]
name = "GPT-4 Turbo"
provider = "openai"
context_window = 128000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 10.00, output_per_mtok = 30.00 }
deprecated = true
deprecation_note = "Superseded by gpt-5 family. Listed for cost-attribution backfill only."
tier = "frontier"
open_weight = false
strengths = ["coding", "tool_use"]
[models.o1]
name = "OpenAI o1"
provider = "openai"
context_window = 200000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 60.00, cache_read_per_mtok = 7.50 }
tier = "reasoning"
open_weight = false
strengths = ["reasoning"]
[models."o1-mini"]
name = "OpenAI o1-mini"
provider = "openai"
context_window = 128000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 12.00, cache_read_per_mtok = 1.50 }
tier = "reasoning"
open_weight = false
strengths = ["reasoning", "cheap"]
[models.o3]
name = "OpenAI o3"
provider = "openai"
context_window = 200000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 60.00, cache_read_per_mtok = 7.50 }
tier = "reasoning"
open_weight = false
strengths = ["reasoning", "coding"]
benchmarks = { swe_bench_verified = 69.1 }
[models."o3-mini"]
name = "OpenAI o3-mini"
provider = "openai"
context_window = 200000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 1.10, output_per_mtok = 4.40, cache_read_per_mtok = 0.55 }
tier = "reasoning"
open_weight = false
strengths = ["reasoning", "coding", "cheap"]
benchmarks = { swe_bench_verified = 49.3 }

# Google Gemini ─ pricing: https://ai.google.dev/pricing.
# Gemini 1.0 / 1.5 already retired. Gemini 2.0 Flash + Flash-Lite shut
# down 2026-06-01 per the deprecations page.
[models."gemini-2.5-flash"]
name = "Gemini 2.5 Flash"
provider = "gemini"
context_window = 1048576
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.10, output_per_mtok = 0.40, cache_read_per_mtok = 0.025 }
tier = "mid"
open_weight = false
strengths = ["speed", "long_context", "vision", "cheap", "tool_use"]
complementary_with = ["anthropic-claude", "openai-gpt", "qwen", "deepseek", "kimi"]

# OpenRouter-routed variant of the same model — kept as a distinct
# catalog entry so the `qc_defaults.openrouter` lookup resolves to a
# registered ID. Pricing matches the native Gemini API; OpenRouter adds
# its own margin at request time.
[models."google/gemini-2.5-flash"]
name = "Gemini 2.5 Flash (via OpenRouter)"
provider = "openrouter"
context_window = 1048576
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.10, output_per_mtok = 0.40, cache_read_per_mtok = 0.025 }
tier = "mid"
open_weight = false
strengths = ["speed", "long_context", "vision", "cheap", "tool_use"]
complementary_with = ["anthropic-claude", "openai-gpt", "qwen", "deepseek", "kimi"]
[models."gemini-2.5-pro"]
name = "Gemini 2.5 Pro"
provider = "gemini"
# NOTE(review): 2097152 is double the 1048576 window used by the
# gemini-2.5-flash rows; confirm against Google's model card.
context_window = 2097152
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 1.25, output_per_mtok = 5.00, cache_read_per_mtok = 0.3125 }
tier = "frontier"
open_weight = false
strengths = ["long_context", "vision", "reasoning", "coding"]
benchmarks = { swe_bench_verified = 63.8 }
complementary_with = ["anthropic-claude", "openai-gpt", "qwen", "deepseek", "kimi"]

# Mistral Large 3 / Small 4 — direct Mistral API rows first, followed by
# their OpenRouter-hosted mirrors.
[models."mistral-large-2512"]
name = "Mistral Large 3 2512"
provider = "mistral"
context_window = 262144
logical_model = "mistral-large-3-2512"
equivalence_group = "mistral-large-3-2512"
api_dialect = "openai_chat_compat"
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.50, output_per_mtok = 1.50, cache_read_per_mtok = 0.05 }
architecture = { parameter_count_b = 675.0, active_parameter_count_b = 41.0, moe = true, license = "Apache-2.0", source_url = "https://docs.mistral.ai/models/model-cards/mistral-large-3-25-12", last_verified = "2026-06-05" }
tier = "frontier"
open_weight = true
strengths = ["coding", "tool_use", "long_context", "vision"]
[models."mistral-small-2603"]
name = "Mistral Small 4"
provider = "mistral"
context_window = 262144
logical_model = "mistral-small-4-2603"
equivalence_group = "mistral-small-4-2603"
api_dialect = "openai_chat_compat"
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60, cache_read_per_mtok = 0.015 }
architecture = { parameter_count_b = 119.0, active_parameter_count_b = 6.5, moe = true, license = "Apache-2.0", source_url = "https://docs.mistral.ai/models/model-cards/mistral-small-4-0-26-03", last_verified = "2026-06-05" }
tier = "mid"
open_weight = true
strengths = ["cheap", "coding", "speed", "tool_use", "long_context"]
[models."mistralai/mistral-large-2512"]
name = "Mistral Large 3 2512"
provider = "openrouter"
context_window = 262144
logical_model = "mistral-large-3-2512"
equivalence_group = "mistral-large-3-2512"
served_variant = "openrouter"
api_dialect = "openai_chat"
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.50, output_per_mtok = 1.50, cache_read_per_mtok = 0.05 }
architecture = { parameter_count_b = 675.0, active_parameter_count_b = 41.0, moe = true, license = "Apache-2.0", source_url = "https://docs.mistral.ai/models/model-cards/mistral-large-3-25-12", last_verified = "2026-06-05" }
tier = "frontier"
open_weight = true
strengths = ["coding", "tool_use", "long_context"]
[models."mistralai/mistral-small-2603"]
name = "Mistral Small 4"
provider = "openrouter"
context_window = 262144
logical_model = "mistral-small-4-2603"
equivalence_group = "mistral-small-4-2603"
served_variant = "openrouter"
api_dialect = "openai_chat"
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60, cache_read_per_mtok = 0.015 }
architecture = { parameter_count_b = 119.0, active_parameter_count_b = 6.5, moe = true, license = "Apache-2.0", source_url = "https://docs.mistral.ai/models/model-cards/mistral-small-4-0-26-03", last_verified = "2026-06-05" }
tier = "mid"
open_weight = true
strengths = ["cheap", "coding", "speed"]

# --- source: 60-models/20-open-weight-openrouter.toml ---
# Open-weight executor candidates (<$2/Mtok with function calling). Use
# these via OpenRouter or Fireworks for fast secondary-model dispatch.
# Pricing snapshot 2026-05 from OpenRouter / Artificial Analysis.
[models."qwen/qwen3-coder"]
name = "Qwen3 Coder 480B A35B"
provider = "openrouter"
context_window = 262144
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.22, output_per_mtok = 1.80 }
availability = "serverless"
tier = "frontier"
open_weight = true
strengths = ["coding", "long_context", "agentic", "tool_use"]
benchmarks = { swe_bench_verified = 67.0 }

# Together lists Qwen3-Coder-Next-FP8 in GET /v1/models alongside its
# serverless catalog, but normal chat-completion calls fail with
# `model_not_available` and instruct the caller to create a dedicated
# endpoint. Carry the catalog row so price/route metadata is preserved,
# but mark `availability = "dedicated"` so hosts don't surface it as a
# one-click serverless option.
[models."Qwen/Qwen3-Coder-Next-FP8"]
name = "Qwen3 Coder Next FP8 (Together, dedicated)"
provider = "together"
context_window = 262144
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.18, output_per_mtok = 0.18 }
availability = "dedicated"
tier = "frontier"
open_weight = true
strengths = ["coding", "long_context", "agentic"]
complementary_with = ["anthropic-claude", "openai-gpt", "google-gemini", "deepseek", "kimi"]
[models."deepseek/deepseek-v3.2"]
name = "DeepSeek V3.2"
provider = "openrouter"
context_window = 131072
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.28, output_per_mtok = 0.42 }
tier = "mid"
open_weight = true
strengths = ["coding", "tool_use", "cheap"]
complementary_with = ["anthropic-claude", "openai-gpt", "google-gemini", "qwen", "kimi"]
[models."moonshotai/kimi-k2.6"]
name = "Kimi K2.6"
provider = "openrouter"
context_window = 262144
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.73, output_per_mtok = 3.49 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "long_context", "tool_use", "reasoning"]
benchmarks = { swe_bench_pro = 58.6, humanitys_last_exam_with_tools = 54.0 }
complementary_with = ["anthropic-claude", "openai-gpt", "google-gemini", "qwen", "deepseek"]
[models."openai/gpt-oss-120b"]
name = "GPT-OSS 120B"
provider = "openrouter"
context_window = 131072
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "openrouter"
api_dialect = "openai_chat"
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60 }
architecture = { parameter_count_b = 117.0, active_parameter_count_b = 5.1, moe = true, license = "Apache-2.0", source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b", last_verified = "2026-06-05" }
tier = "mid"
open_weight = true
strengths = ["cheap", "tool_use"]

# --- source: 60-models/30-cerebras.toml ---
# Cerebras-hosted open-weight models. Serverless rows mirror
# https://api.cerebras.ai/public/v1/models; dedicated-endpoint families are
# intentionally not added as one-click routes unless Cerebras exposes a stable
# public wire ID for the standard endpoint. The headline binder-substrate
# candidate is gpt-oss-120b at very high token throughput; GLM 4.7 is the
# public preview coding/agentic route.
#
# Catalog keys are bare wire IDs (Cerebras's /v1/chat/completions wants
# the raw model name). Users routing via `model: "cerebras/<name>"` get
# the slash-prefixed selector stripped by `normalize_model_id` while
# `infer_provider` routes them to this provider.
[models."gpt-oss-120b"]
name = "GPT-OSS 120B (Cerebras)"
provider = "cerebras"
context_window = 131072
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "cerebras-wafer-scale"
api_dialect = "openai_chat"
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.35, output_per_mtok = 0.75 }
rate_limits = { rpm = 5, tpm = 30000, tph = 1000000, tpd = 1000000, tier = "free", source_url = "https://inference-docs.cerebras.ai/support/rate-limits", last_verified = "2026-06-05", notes = "Published Free Trial row; Developer (Pay as You Go) lists 1K RPM and 1M TPM with no hourly/daily cap." }
architecture = { parameter_count_b = 117.0, active_parameter_count_b = 5.1, moe = true, license = "Apache-2.0", source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b", last_verified = "2026-06-05" }
tier = "frontier"
open_weight = true
strengths = ["speed", "cheap", "tool_use"]
[models."zai-glm-4.7"]
name = "Z.ai GLM 4.7 (Cerebras)"
provider = "cerebras"
context_window = 131072
capabilities = ["tools", "streaming", "thinking"]
pricing = { input_per_mtok = 2.25, output_per_mtok = 2.75 }
tier = "frontier"
open_weight = true
strengths = ["speed", "coding", "agentic", "tool_use", "reasoning"]
[models."llama-3.3-70b"]
name = "Llama 3.3 70B (Cerebras, dedicated legacy)"
provider = "cerebras"
context_window = 131072
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.85, output_per_mtok = 1.20 }
availability = "dedicated"
deprecated = true
deprecation_note = "Cerebras no longer returns this model from public discovery; use a provisioned dedicated endpoint alias if your organization still serves these weights."
tier = "mid"
open_weight = true
strengths = ["speed", "tool_use"]

# --- source: 60-models/40-minimax.toml ---
# MiniMax family ─ pricing pages:
#   https://platform.minimax.io/docs/guides/pricing-paygo
#   https://platform.minimax.io/docs/guides/model-invocation
#   https://platform.minimax.io/docs/api-reference/text-openai-api
#   llm-stats.com/models/minimax-m2-7.
# M3 standard pricing below uses the non-promotional <=512K-input rate:
# $0.60/M input, $2.40/M output, $0.12/M cache-read. MiniMax publishes a
# higher standard tier for >512K input ($1.20/M input, $4.80/M output,
# $0.24/M cache-read), but this catalog's ModelPricing shape is a single
# rate card, so the base standard tier is the source of truth for now.
# OpenRouter's June 2026 launch promotion is intentionally not copied into
# static TOML.
#
# MiniMax M2: 230B total / 10B active MoE shared across M2/M2.5/M2.7.
# Context windows from artificialanalysis.ai/models/minimax-m2-7 (205K).
# Pricing reflects the direct MiniMax API surface; OpenRouter mirrors are
# listed separately below.
#
# Tool calls + thinking-mode are supported (release notes call out
# "agentic harness" support); structured output is delimited (no native
# JSON schema mode), and prompt caching is hit-priced at 20% of input.
[models."MiniMax-M3"]
name = "MiniMax M3"
provider = "minimax"
context_window = 1000000
capabilities = ["tools", "vision", "video", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.60, output_per_mtok = 2.40, cache_read_per_mtok = 0.12 }
tier = "frontier"
open_weight = false
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context", "vision"]
[models."MiniMax-M2"]
name = "MiniMax M2"
provider = "minimax"
context_window = 204800
capabilities = ["tools", "streaming", "thinking"]
pricing = { input_per_mtok = 0.255, output_per_mtok = 1.00, cache_read_per_mtok = 0.051 }
tier = "mid"
open_weight = true
strengths = ["coding", "agentic", "cheap", "tool_use"]
benchmarks = { aa_intelligence_index = 45.0 }
[models."MiniMax-M2.5"]
name = "MiniMax M2.5"
provider = "minimax"
context_window = 204800
capabilities = ["tools", "streaming", "thinking"]
pricing = { input_per_mtok = 0.28, output_per_mtok = 1.10, cache_read_per_mtok = 0.056 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "tool_use", "long_context"]
[models."MiniMax-M2.5-highspeed"]
name = "MiniMax M2.5 (highspeed)"
provider = "minimax"
context_window = 204800
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.28, output_per_mtok = 1.10, cache_read_per_mtok = 0.056 }
tier = "mid"
open_weight = true
strengths = ["speed", "coding", "agentic"]
[models."MiniMax-M2.7"]
name = "MiniMax M2.7"
provider = "minimax"
context_window = 204800
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.30, output_per_mtok = 1.20, cache_read_per_mtok = 0.06 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
benchmarks = { aa_intelligence_index = 50.0 }
[models."MiniMax-M2.7-highspeed"]
name = "MiniMax M2.7 (highspeed)"
provider = "minimax"
context_window = 204800
capabilities = ["tools", "streaming", "prompt_caching"]
pricing = { input_per_mtok = 0.30, output_per_mtok = 1.20, cache_read_per_mtok = 0.06 }
tier = "mid"
open_weight = true
strengths = ["speed", "coding", "agentic"]
[models."MiniMax-Text-01"]
name = "MiniMax Text 01"
provider = "minimax"
context_window = 1000000
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.20, output_per_mtok = 1.10 }
tier = "mid"
open_weight = true
strengths = ["long_context"]

# MiniMax mirror on OpenRouter — same family, OpenRouter adds margin and
# bundles native-tools passthrough so the openai_chat_completions wire
# format Just Works for callers without a direct MiniMax key. MiniMax M3
# launch-promo rates shown by OpenRouter are excluded from this static
# rate card; the M3 row uses the standard post-promo rate.
[models."minimax/minimax-m3"]
name = "MiniMax M3 (via OpenRouter)"
provider = "openrouter"
context_window = 1048576
capabilities = ["tools", "vision", "video", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.60, output_per_mtok = 2.40, cache_read_per_mtok = 0.12 }
tier = "frontier"
open_weight = false
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context", "vision"]
[models."minimax/minimax-m2.7"]
name = "MiniMax M2.7 (via OpenRouter)"
provider = "openrouter"
context_window = 204800
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.40, output_per_mtok = 1.50 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
[models."minimax/minimax-m2"]
name = "MiniMax M2 (via OpenRouter)"
provider = "openrouter"
context_window = 204800
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.33, output_per_mtok = 1.20 }
tier = "mid"
open_weight = true
strengths = ["coding", "agentic", "cheap"]

# --- source: 60-models/50-zai.toml ---
# Z.AI GLM-5 family. GLM-5.1 (released 2026-04-07) is the 754B open-weight
# flagship; GLM-5 is the prior generation. Direct Z.AI tariff via the
# OpenAI-compatible /v1 endpoint. OpenRouter mirrors live below.
[models."glm-5"]
name = "GLM 5"
provider = "zai"
context_window = 202752
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.98, output_per_mtok = 3.08, cache_read_per_mtok = 0.20 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "tool_use"]
[models."glm-5.1"]
name = "GLM 5.1"
provider = "zai"
context_window = 202752
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 1.40, output_per_mtok = 4.40, cache_read_per_mtok = 0.26 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
benchmarks = { swe_bench_pro_lead = 1.0 }

# OpenRouter mirror of GLM-5 family so callers without a Z.AI key still
# resolve a route. OR doesn't list GLM-4.6/4.7 — the canonical OR slugs
# are the GLM-5 generation.
[models."z-ai/glm-5"]
name = "GLM 5 (via OpenRouter)"
provider = "openrouter"
context_window = 202752
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 1.20, output_per_mtok = 4.00 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic"]
[models."z-ai/glm-5.1"]
name = "GLM 5.1 (via OpenRouter)"
provider = "openrouter"
context_window = 202752
capabilities = ["tools", "streaming"]
# NOTE(review): 0.98 / 3.08 equals the direct `glm-5` rate and is lower
# than the `z-ai/glm-5` mirror (1.20 / 4.00), while direct `glm-5.1` is
# 1.40 / 4.40 — confirm this row against OpenRouter's rate card.
pricing = { input_per_mtok = 0.98, output_per_mtok = 3.08 }
tier = "frontier"
open_weight = true
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
[models."z-ai/glm-5v-turbo"]
name = "GLM 5V Turbo (via OpenRouter)"
provider = "openrouter"
context_window = 202752
capabilities = ["tools", "streaming", "vision"]
pricing = { input_per_mtok = 1.20, output_per_mtok = 4.00 }
tier = "mid"
open_weight = true
strengths = ["vision", "speed"]

# --- source: 60-models/60-deepseek-openrouter-qwen.toml ---
# DeepSeek V4 family ─ pricing pages: api-docs.deepseek.com/quick_start/pricing.
# Both V4 models share a 1M-token context window and 384K-token output
# cap. `deepseek-chat`/`deepseek-reasoner` are retained as deprecated
# aliases on V4-Flash (non-thinking) and V4-Flash thinking-mode
# respectively; per provider notes they retire 2026-07-24.
[models."deepseek-v4-flash"]
name = "DeepSeek V4 Flash"
provider = "deepseek"
context_window = 1000000
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.14, output_per_mtok = 0.28, cache_read_per_mtok = 0.0028 }
tier = "mid"
open_weight = true
strengths = ["speed", "cheap", "tool_use", "reasoning", "long_context"]
benchmarks = { aa_intelligence_index = 58.0 }
[models."deepseek-v4-pro"]
name = "DeepSeek V4 Pro"
provider = "deepseek"
context_window = 1000000
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.435, output_per_mtok = 0.87, cache_read_per_mtok = 0.003625 }
tier = "frontier"
open_weight = true
strengths = ["reasoning", "coding", "tool_use", "long_context"]
benchmarks = { aa_intelligence_index = 68.0 }
[models."deepseek-chat"]
name = "DeepSeek Chat (legacy → V4 Flash, non-thinking)"
provider = "deepseek"
context_window = 1000000
capabilities = ["tools", "streaming", "prompt_caching"]
pricing = { input_per_mtok = 0.14, output_per_mtok = 0.28, cache_read_per_mtok = 0.0028 }
deprecated = true
deprecation_note = "Maps to deepseek-v4-flash non-thinking mode; retirement 2026-07-24 15:59 UTC per provider docs."
tier = "mid"
open_weight = true
strengths = ["coding", "tool_use"]
[models."deepseek-reasoner"]
name = "DeepSeek Reasoner (legacy → V4 Flash, thinking)"
provider = "deepseek"
context_window = 1000000
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
pricing = { input_per_mtok = 0.14, output_per_mtok = 0.28, cache_read_per_mtok = 0.0028 }
deprecated = true
deprecation_note = "Maps to deepseek-v4-flash thinking mode; retirement 2026-07-24 15:59 UTC per provider docs."
tier = "reasoning"
open_weight = true
strengths = ["reasoning", "coding"]

# DeepSeek V4 OpenRouter mirrors. OpenRouter publishes an independent
# rate card and a binary 1M-token (1,048,576) context window for these
# routes.
[models."deepseek/deepseek-v4-flash"]
name = "DeepSeek V4 Flash (via OpenRouter)"
provider = "openrouter"
context_window = 1048576
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.0983, output_per_mtok = 0.1966, cache_read_per_mtok = 0.0197 }
tier = "mid"
open_weight = true
strengths = ["speed", "cheap", "tool_use", "reasoning", "long_context"]
[models."deepseek/deepseek-v4-pro"]
name = "DeepSeek V4 Pro (via OpenRouter)"
provider = "openrouter"
context_window = 1048576
capabilities = ["tools", "streaming"]
pricing = { input_per_mtok = 0.435, output_per_mtok = 0.87, cache_read_per_mtok = 0.003625 }
tier = "frontier"
open_weight = true
strengths = ["reasoning", "coding", "tool_use", "long_context"]

# OpenRouter Qwen3.5 9B (kept for the `small` tier alias).
[models."Qwen/Qwen3.5-9B"]
name = "Qwen3.5 9B"
provider = "openrouter"
context_window = 131072
capabilities = ["tools", "streaming"]
tier = "small"
open_weight = true
strengths = ["cheap", "speed"]

# --- source: 60-models/70-local-ollama.toml ---
# Ollama / local models — no `pricing` (free); context_window reflects
# model card ceiling. `runtime_context_window` caps what Harn will
# actually feed the runtime (host memory budget).
[models."llama3.2"]
name = "Llama 3.2"
provider = "ollama"
context_window = 32000
stream_timeout = 300.0
capabilities = ["tools", "streaming"]
tier = "small"
open_weight = true
strengths = ["cheap", "speed"]
[models."gemma4:26b"]
name = "Gemma 4 26B MoE"
provider = "ollama"
context_window = 262144
runtime_context_window = 32768
stream_timeout = 300.0
capabilities = ["tools", "vision", "streaming", "thinking"]
tier = "mid"
open_weight = true
strengths = ["vision", "tool_use"]
# Gemma 4 12B — encoder-free unified multimodal model (Apache 2.0), built to run
# on a 16GB laptop. Published on Ollama only as quantized variants (no bare
# `gemma4:12b` tag). Verified via `ollama show`: the quantized 12B builds expose
# `tools` + native `thinking` but are TEXT-ONLY at 128K context — the vision
# projector is dropped in these conversions, unlike the larger 26b/31b builds.
# MLX is the Apple-Silicon path; nvfp4 targets NVIDIA Blackwell; mxfp8 is the
# higher-fidelity quant.
[models."gemma4:12b-mlx"]
name = "Gemma 4 12B (MLX)"
provider = "ollama"
context_window = 131072
runtime_context_window = 32768
stream_timeout = 240.0
capabilities = ["tools", "streaming", "thinking"]
tier = "mid"
open_weight = true
strengths = ["speed", "cheap"]
[models."gemma4:12b-nvfp4"]
name = "Gemma 4 12B (NVFP4)"
provider = "ollama"
context_window = 131072
runtime_context_window = 32768
stream_timeout = 240.0
capabilities = ["tools", "streaming", "thinking"]
tier = "mid"
open_weight = true
strengths = ["cheap", "tool_use"]
[models."gemma4:12b-mxfp8"]
name = "Gemma 4 12B (MXFP8)"
provider = "ollama"
context_window = 131072
runtime_context_window = 32768
stream_timeout = 240.0
capabilities = ["tools", "streaming", "thinking"]
tier = "mid"
open_weight = true
strengths = ["cheap", "tool_use"]
[models."devstral-small-2:24b"]
name = "Devstral Small 2 24B"
provider = "ollama"
context_window = 262144
runtime_context_window = 32768
stream_timeout = 600.0
capabilities = ["tools", "streaming"]
tier = "mid"
open_weight = true
strengths = ["coding", "agentic"]

# --- source: 60-models/80-local-runtimes.toml ---
# llama.cpp — Unsloth Dynamic 2.0 GGUF served by llama-server.
[models."qwen3.6-35b-a3b-ud-q4-k-xl"]
name = "Qwen3.6 35B (Unsloth Q4_K_XL, llama.cpp)"
provider = "llamacpp"
context_window = 262144
runtime_context_window = 65536
stream_timeout = 900.0
capabilities = ["tools", "streaming", "thinking"]
tier = "mid"
open_weight = true
strengths = ["coding"]
[models."qwen3.6-35b-a3b-ud-q4-k-xl".local_memory]
measured_resident_gib = 19.5
measured_context_window = 8192
measured_cache_type = "q8_0"
base_resident_gib = 19.0
kv_cache_gib_per_1k_ctx = 0.10
default_cache_type = "q8_0"
safety_margin_gib = 4.0
max_recommended_context = 65536
cache_type_multipliers = { q8_0 = 1.0, f16 = 2.0, q4_0 = 0.5, q4_1 = 0.5, q5_0 = 0.625, q5_1 = 0.625 }
last_verified = "2026-06-05"
notes = "Empirical llama-server RSS was about 19.5 GiB at ctx=8192 with q8_0 KV on Apple Silicon. Treat as a conservative launch guard, not an exact allocator model."
[models."qwen3.6-35b-a3b-ud-q5-k-xl"]
# Unsloth Q5_K_XL build of Qwen3.6 35B on the `llamacpp` provider. This row
# carries no local_memory sub-table, unlike the Q4_K_XL row.
provider = "llamacpp"
name = "Qwen3.6 35B (Unsloth Q5_K_XL, llama.cpp)"
# Routing metadata.
tier = "mid"
strengths = ["coding"]
open_weight = true
capabilities = ["tools", "streaming", "thinking"]
# Declared window vs. the smaller window used at runtime.
context_window = 262144
runtime_context_window = 65536
# NOTE(review): unit is presumably seconds — confirm.
stream_timeout = 900.0
[models."qwen3.6-35b-a3b"]
# Generic Qwen3.6 35B row for the `llamacpp` provider (no quantization named).
name = "Qwen3.6 35B (llama.cpp)"
provider = "llamacpp"
# Declared window is 262144; the runtime window is set lower at 65536.
context_window = 262144
runtime_context_window = 65536
stream_timeout = 900.0
capabilities = ["tools", "streaming", "thinking"]
# Classification keys stay with the rest of this table so the section
# comment below cannot be misread as introducing them.
tier = "mid"
open_weight = true
strengths = ["coding"]

# Apple Silicon MLX.
[models."unsloth/Qwen3.6-27B-UD-MLX-4bit"]
# 4-bit MLX build of Qwen3.6 27B on the `mlx` provider. No
# runtime_context_window is set on this row.
name = "Qwen3.6 27B (MLX 4-bit)"
provider = "mlx"
context_window = 262144
stream_timeout = 900.0
capabilities = ["tools", "vision", "streaming", "thinking"]
# Classification keys stay with the rest of this table so the section
# comment below cannot be misread as introducing them.
tier = "mid"
open_weight = true
strengths = ["coding", "vision"]

# Local OpenAI-compatible servers (vLLM / bring-your-own).
[models."gemma-4-e2b-it"]
# Gemma 4 E2B on the bring-your-own `local` provider.
provider = "local"
name = "Gemma 4 E2B (local)"
# Routing metadata.
tier = "small"
open_weight = true
strengths = ["cheap", "speed"]
# No "tools" capability is advertised for the local Gemma rows.
capabilities = ["streaming", "thinking"]
# Limits. NOTE(review): stream_timeout is presumably in seconds — confirm.
context_window = 131072
stream_timeout = 300.0
[models."gemma-4-e4b-it"]
# Gemma 4 E4B on the `local` provider; tagged "cheap" only, without "speed".
provider = "local"
name = "Gemma 4 E4B (local)"
# Routing metadata.
strengths = ["cheap"]
tier = "small"
open_weight = true
capabilities = ["streaming", "thinking"]
# Limits.
stream_timeout = 300.0
context_window = 131072
[models."gemma-4-26b-a4b-it"]
# Gemma 4 26B MoE on the `local` provider.
provider = "local"
name = "Gemma 4 26B MoE (local)"
# Routing metadata.
tier = "mid"
strengths = ["coding"]
open_weight = true
capabilities = ["streaming", "thinking"]
# Limits; this row uses a longer stream_timeout than the E2B/E4B rows.
context_window = 131072
stream_timeout = 600.0
[models."gemma-4-31b-it"]
# Gemma 4 31B on the `local` provider; the only local Gemma row tiered "frontier".
provider = "local"
name = "Gemma 4 31B (local)"
# Routing metadata.
open_weight = true
tier = "frontier"
strengths = ["coding", "long_context"]
capabilities = ["streaming", "thinking"]
# Limits.
context_window = 131072
stream_timeout = 600.0
[models."gemma-4-12b-it"]
# Gemma 4 12B on the `local` provider.
provider = "local"
name = "Gemma 4 12B (local)"
# Routing metadata.
strengths = ["cheap", "speed"]
open_weight = true
tier = "mid"
capabilities = ["streaming", "thinking"]
# Limits. NOTE(review): stream_timeout is presumably in seconds — confirm.
stream_timeout = 300.0
context_window = 131072

# --- source: 60-models/90-hosted-gemma-cohere-xai-groq.toml ---
# Gemma 4 — hosted (Apache 2.0, multimodal text+image, 256K context, native
# thinking). The 12B is on-device only; the 26B MoE and 31B dense are also served
# directly by hosted APIs. Each provider route is registered as its own catalog
# row keyed by that provider's wire id (the alias-validation contract requires a
# matching row): OpenRouter ids are org-prefixed and the Gemini API rows use the
# `models/` REST resource name — both collision-free with the bare `local` keys.
[models."google/gemma-4-31b-it"]
# Gemma 4 31B via OpenRouter, keyed by OpenRouter's org-prefixed id.
provider = "openrouter"
name = "Gemma 4 31B (OpenRouter)"
# Routing metadata.
tier = "frontier"
open_weight = true
strengths = ["vision", "reasoning", "coding", "cheap"]
capabilities = ["tools", "vision", "streaming", "thinking"]
context_window = 262144
# Per-model pricing, keyed per million tokens by the field names.
pricing = { input_per_mtok = 0.12, output_per_mtok = 0.37 }
[models."google/gemma-4-26b-a4b-it"]
# Gemma 4 26B MoE via OpenRouter, keyed by OpenRouter's org-prefixed id.
provider = "openrouter"
name = "Gemma 4 26B MoE (OpenRouter)"
# Routing metadata.
strengths = ["vision", "cheap", "speed"]
tier = "mid"
open_weight = true
capabilities = ["tools", "vision", "streaming", "thinking"]
context_window = 262144
# Per-model pricing, keyed per million tokens by the field names.
pricing = { input_per_mtok = 0.06, output_per_mtok = 0.33 }
[models."models/gemma-4-31b-it"]
# Gemma 4 31B via the Gemini API, keyed by the `models/` resource name.
provider = "gemini"
name = "Gemma 4 31B (Gemini API)"
# Routing metadata.
tier = "frontier"
open_weight = true
strengths = ["vision", "reasoning", "coding", "cheap"]
capabilities = ["tools", "vision", "streaming", "thinking"]
# No `pricing` table on this row, so the provider-level cost fallback applies.
context_window = 262144
[models."models/gemma-4-26b-a4b-it"]
# Gemma 4 26B MoE via the Gemini API, keyed by the `models/` resource name.
provider = "gemini"
name = "Gemma 4 26B MoE (Gemini API)"
# Routing metadata.
open_weight = true
tier = "mid"
strengths = ["vision", "cheap", "speed"]
capabilities = ["tools", "vision", "streaming", "thinking"]
# No `pricing` table on this row, so the provider-level cost fallback applies.
context_window = 262144
[models."google/gemma-4-31B-it"]
# Gemma 4 31B via Together.
# NOTE(review): this key differs from the OpenRouter row "google/gemma-4-31b-it"
# only by letter case. TOML keys are case-sensitive, so the rows are distinct,
# but any case-insensitive lookup would collide — confirm.
provider = "together"
name = "Gemma 4 31B (Together)"
# Routing metadata; "cheap" is not listed here, unlike the OpenRouter 31B row.
tier = "frontier"
open_weight = true
strengths = ["vision", "reasoning", "coding"]
capabilities = ["tools", "vision", "streaming", "thinking"]
context_window = 262144
# Per-model pricing, keyed per million tokens by the field names.
pricing = { input_per_mtok = 0.20, output_per_mtok = 0.50 }

[models."groq/openai/gpt-oss-120b"]
# GPT-OSS 120B served by Groq. The catalog key carries a `groq/` prefix, while
# wire_model holds the unprefixed id.
provider = "groq"
name = "GPT-OSS 120B (Groq)"
wire_model = "openai/gpt-oss-120b"
api_dialect = "openai_chat"
# Cross-provider identity for this model.
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "groq-lpu"
# Routing metadata.
tier = "frontier"
open_weight = true
strengths = ["speed", "cheap", "tool_use", "reasoning"]
capabilities = ["tools", "streaming", "thinking"]
context_window = 131072
# Economics and limits; each inline table records its own source_url where given.
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60 }
rate_limits = { rpm = 1000, tpm = 250000, tier = "developer", source_url = "https://console.groq.com/docs/models", last_verified = "2026-06-05" }
# Model architecture facts (MoE: 117.0B total, 5.1B active parameters).
architecture = { parameter_count_b = 117.0, active_parameter_count_b = 5.1, moe = true, license = "Apache-2.0", source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b", last_verified = "2026-06-05" }

[models."llama-3.3-70b-versatile"]
# Llama 3.3 70B Versatile served by Groq. No wire_model is set on this row.
provider = "groq"
name = "Llama 3.3 70B Versatile (Groq)"
api_dialect = "openai_chat"
# Cross-provider identity for this model.
logical_model = "llama-3.3-70b-instruct"
equivalence_group = "llama-3.3-70b-instruct"
served_variant = "groq-lpu"
# Routing metadata.
tier = "mid"
open_weight = true
strengths = ["speed", "cheap", "tool_use"]
capabilities = ["tools", "streaming"]
context_window = 131072
# Economics and limits.
pricing = { input_per_mtok = 0.59, output_per_mtok = 0.79 }
rate_limits = { rpm = 1000, tpm = 300000, tier = "developer", source_url = "https://console.groq.com/docs/models", last_verified = "2026-06-05" }
# Model architecture facts (dense, 70.0B parameters).
architecture = { parameter_count_b = 70.0, moe = false, license = "Llama 3.3 Community", source_url = "https://console.groq.com/docs/models", last_verified = "2026-06-05" }

[models."llama-3.1-8b-instant"]
# Llama 3.1 8B Instant served by Groq. No wire_model is set on this row.
provider = "groq"
name = "Llama 3.1 8B Instant (Groq)"
api_dialect = "openai_chat"
# Cross-provider identity for this model.
served_variant = "groq-lpu"
logical_model = "llama-3.1-8b-instruct"
equivalence_group = "llama-3.1-8b-instruct"
# Routing metadata.
strengths = ["speed", "cheap"]
tier = "small"
open_weight = true
capabilities = ["tools", "streaming"]
context_window = 131072
# Economics and limits.
pricing = { input_per_mtok = 0.05, output_per_mtok = 0.08 }
rate_limits = { rpm = 1000, tpm = 250000, tier = "developer", source_url = "https://console.groq.com/docs/models", last_verified = "2026-06-05" }
# Model architecture facts (dense, 8.0B parameters).
architecture = { parameter_count_b = 8.0, moe = false, license = "Llama 3.1 Community", source_url = "https://console.groq.com/docs/models", last_verified = "2026-06-05" }

# Cohere and xAI first-party routes. Both are OpenAI-compatible enough for
# Harn's chat-completions adapter, but their economics are unusual: Command A+
# trial keys are free up to account limits (production access is sales-gated
# and priced at the public tariff), while xAI's coding model exposes very high
# TPM on regional clusters.
[models."command-a-plus-05-2026"]
# Cohere Command A+ on the `cohere` provider.
provider = "cohere"
name = "Command A+"
api_dialect = "openai_chat_compat"
# Cross-provider identity; both fields repeat the catalog key.
logical_model = "command-a-plus-05-2026"
equivalence_group = "command-a-plus-05-2026"
# Routing metadata.
# NOTE(review): open_weight is true but `architecture` below carries no
# `license` field, unlike the Groq rows — confirm the weights' license.
tier = "frontier"
open_weight = true
strengths = ["agentic", "tool_use", "reasoning", "multilingual", "vision"]
capabilities = ["tools", "streaming", "thinking", "vision"]
context_window = 256000
# Economics and limits; rate_limits describes the trial tier (see its notes).
pricing = { input_per_mtok = 2.50, output_per_mtok = 10.00 }
rate_limits = { rpm = 20, tier = "trial", source_url = "https://docs.cohere.com/docs/rate-limits", last_verified = "2026-06-05", notes = "Command A+ trial keys are 20 RPM and 1,000 calls/month; production is sales-gated. Token pricing is the public Command A+ API tariff." }
# Model architecture facts (MoE, 111.0B parameters).
architecture = { parameter_count_b = 111.0, moe = true, source_url = "https://docs.cohere.com/docs/command-a-plus", last_verified = "2026-06-05" }

[models."grok-build-0.1"]
# xAI Grok Build 0.1 on the `xai` provider.
provider = "xai"
name = "Grok Build 0.1"
api_dialect = "openai_chat_compat"
# NOTE(review): the key and name say "grok-build-0.1", while logical_model,
# equivalence_group, and the rate_limits source_url all reference
# "grok-code-fast-1" — confirm this mapping is intentional.
logical_model = "grok-code-fast-1"
equivalence_group = "grok-code-fast-1"
# Routing metadata; the only closed-weight row in this section.
tier = "frontier"
open_weight = false
strengths = ["coding", "agentic", "tool_use", "reasoning", "vision"]
capabilities = ["tools", "streaming", "thinking", "vision", "prompt_caching"]
context_window = 256000
# Economics and limits; pricing includes a cache-read rate. The rate_limits
# `tier` value here is a region-style name ("us-east-1").
pricing = { input_per_mtok = 1.00, output_per_mtok = 2.00, cache_read_per_mtok = 0.20 }
rate_limits = { rpm = 1800, tpm = 10000000, tier = "us-east-1", source_url = "https://docs.x.ai/developers/models/grok-code-fast-1", last_verified = "2026-06-05" }