# Name of a `[providers.*]` table below.
# NOTE(review): presumably the provider used when a request names none — confirm against the loader.
default_provider = "anthropic"
# Anthropic: API key sent in the `x-api-key` header, plus a pinned
# `anthropic-version` header.
[providers.anthropic]
base_url = "https://api.anthropic.com/v1"
chat_endpoint = "/messages"
auth_style = "header"
auth_header = "x-api-key"
auth_env = "ANTHROPIC_API_KEY"
extra_headers."anthropic-version" = "2023-06-01"
features = ["prompt_caching", "thinking"]
cost_per_1k_in = 0.003
cost_per_1k_out = 0.015
latency_p50_ms = 2500
# Health probe: POST a one-message token-count request.
healthcheck.method = "POST"
healthcheck.path = "/messages/count_tokens"
healthcheck.body = '{"model":"claude-sonnet-4-6","messages":[{"role":"user","content":"x"}]}'
# OpenAI: bearer-token auth; health probe lists models.
[providers.openai]
base_url = "https://api.openai.com/v1"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "OPENAI_API_KEY"
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/models"
# OpenRouter: bearer-token auth; health probe hits the key-info endpoint.
[providers.openrouter]
base_url = "https://openrouter.ai/api/v1"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "OPENROUTER_API_KEY"
cost_per_1k_in = 0.003
cost_per_1k_out = 0.015
latency_p50_ms = 2200
healthcheck.method = "GET"
healthcheck.path = "/auth/key"
# Hugging Face router. `auth_env` lists two candidate token variables.
[providers.huggingface]
base_url = "https://router.huggingface.co/v1"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = ["HF_TOKEN", "HUGGINGFACE_API_KEY"]
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 2400
# The probe uses an absolute `url` (a different host from base_url), not a `path`.
healthcheck.method = "GET"
healthcheck.url = "https://huggingface.co/api/whoami-v2"
# Ollama daemon on localhost; no auth, zero cost.
[providers.ollama]
base_url = "http://localhost:11434"
base_url_env = "OLLAMA_HOST"
chat_endpoint = "/api/chat"
completion_endpoint = "/api/generate"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 1200
healthcheck.method = "GET"
healthcheck.path = "/api/tags"
# Google Gemini: API key sent in the `x-goog-api-key` header.
[providers.gemini]
base_url = "https://generativelanguage.googleapis.com"
base_url_env = "GEMINI_BASE_URL"
chat_endpoint = "/v1beta/models"
auth_style = "header"
auth_header = "x-goog-api-key"
auth_env = ["GEMINI_API_KEY", "GOOGLE_API_KEY"]
cost_per_1k_in = 0.00125
cost_per_1k_out = 0.005
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/v1beta/models"
# Mistral: bearer-token auth, native tool calling.
[providers.mistral]
base_url = "https://api.mistral.ai/v1"
base_url_env = "MISTRAL_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "MISTRAL_API_KEY"
features = ["native_tools"]
cost_per_1k_in = 0.0005
cost_per_1k_out = 0.0015
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/models"
# Cohere, via its `/compatibility/v1` base path.
[providers.cohere]
base_url = "https://api.cohere.ai/compatibility/v1"
base_url_env = "COHERE_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "COHERE_API_KEY"
features = ["native_tools", "reasoning"]
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1900
healthcheck.method = "GET"
healthcheck.path = "/models"
# xAI: bearer-token auth.
[providers.xai]
base_url = "https://api.x.ai/v1"
base_url_env = "XAI_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "XAI_API_KEY"
features = ["responses_api", "native_tools", "reasoning"]
cost_per_1k_in = 0.001
cost_per_1k_out = 0.002
latency_p50_ms = 1600
healthcheck.method = "GET"
healthcheck.path = "/models"
# Together AI: bearer-token auth.
[providers.together]
base_url = "https://api.together.xyz/v1"
base_url_env = "TOGETHER_AI_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "TOGETHER_AI_API_KEY"
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1600
healthcheck.method = "GET"
healthcheck.path = "/models"
# Groq, via its `/openai/v1` base path.
[providers.groq]
base_url = "https://api.groq.com/openai/v1"
base_url_env = "GROQ_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "GROQ_API_KEY"
cost_per_1k_in = 0.0001
cost_per_1k_out = 0.0003
latency_p50_ms = 450
healthcheck.method = "GET"
healthcheck.path = "/models"
# Cerebras: bearer-token auth, native tool calling.
[providers.cerebras]
base_url = "https://api.cerebras.ai/v1"
base_url_env = "CEREBRAS_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "CEREBRAS_API_KEY"
features = ["native_tools"]
cost_per_1k_in = 0.00035
cost_per_1k_out = 0.00075
latency_p50_ms = 150
healthcheck.method = "GET"
healthcheck.path = "/models"
# DeepSeek: bearer-token auth.
[providers.deepseek]
base_url = "https://api.deepseek.com/v1"
base_url_env = "DEEPSEEK_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "DEEPSEEK_API_KEY"
cost_per_1k_in = 0.00014
cost_per_1k_out = 0.00028
latency_p50_ms = 1800
healthcheck.method = "GET"
healthcheck.path = "/models"
# Fireworks AI, via its `/inference/v1` base path.
[providers.fireworks]
base_url = "https://api.fireworks.ai/inference/v1"
base_url_env = "FIREWORKS_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "FIREWORKS_API_KEY"
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1400
healthcheck.method = "GET"
healthcheck.path = "/models"
# DashScope (international host), via its `/compatible-mode/v1` base path.
[providers.dashscope]
base_url = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
base_url_env = "DASHSCOPE_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "DASHSCOPE_API_KEY"
cost_per_1k_in = 0.0003
cost_per_1k_out = 0.0012
latency_p50_ms = 1600
healthcheck.method = "GET"
healthcheck.path = "/models"
# MiniMax: bearer-token auth.
[providers.minimax]
base_url = "https://api.minimax.io/v1"
base_url_env = "MINIMAX_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = "MINIMAX_API_KEY"
cost_per_1k_in = 0.0006
cost_per_1k_out = 0.0024
latency_p50_ms = 1700
healthcheck.method = "GET"
healthcheck.path = "/models"
# Z.ai. `auth_env` lists two candidate key variables.
[providers.zai]
base_url = "https://api.z.ai/v1"
base_url_env = "ZAI_BASE_URL"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
auth_style = "bearer"
auth_env = ["ZAI_API_KEY", "ZHIPU_API_KEY"]
cost_per_1k_in = 0.0004
cost_per_1k_out = 0.0017
latency_p50_ms = 1900
healthcheck.method = "GET"
healthcheck.path = "/models"
# AWS Bedrock (Converse endpoint, SigV4 auth). The default base_url is empty;
# BEDROCK_BASE_URL is the only source named here. This table declares no
# cost keys and no healthcheck.
[providers.bedrock]
base_url = ""
base_url_env = "BEDROCK_BASE_URL"
chat_endpoint = "/model/{model}/converse"
auth_style = "aws_sigv4"
features = ["native_tools"]
latency_p50_ms = 2600
# Azure OpenAI. base_url and chat_endpoint contain `{resource}`,
# `{deployment}` and `{api_version}` placeholders; TOML does not interpolate,
# so the consumer must fill them in.
[providers.azure_openai]
base_url = "https://{resource}.openai.azure.com"
base_url_env = "AZURE_OPENAI_ENDPOINT"
chat_endpoint = "/openai/deployments/{deployment}/chat/completions?api-version={api_version}"
auth_style = "azure_openai"
auth_env = ["AZURE_OPENAI_API_KEY", "AZURE_OPENAI_AD_TOKEN", "AZURE_OPENAI_BEARER_TOKEN"]
features = ["native_tools"]
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1900
# Google Vertex AI. chat_endpoint contains `{project}`, `{location}` and
# `{model}` placeholders that the consumer must fill in.
[providers.vertex]
base_url = "https://aiplatform.googleapis.com/v1"
base_url_env = "VERTEX_AI_BASE_URL"
chat_endpoint = "/projects/{project}/locations/{location}/publishers/google/models/{model}:generateContent"
auth_style = "bearer"
auth_env = ["VERTEX_AI_ACCESS_TOKEN", "GOOGLE_OAUTH_ACCESS_TOKEN", "GOOGLE_APPLICATION_CREDENTIALS"]
features = ["native_tools"]
cost_per_1k_in = 0.00125
cost_per_1k_out = 0.005
latency_p50_ms = 2100
# Generic local server on port 8000; no auth, zero cost.
[providers.local]
base_url = "http://localhost:8000"
base_url_env = "LOCAL_LLM_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900
healthcheck.method = "GET"
healthcheck.path = "/v1/models"
# llama.cpp server on port 8001; no auth, zero cost.
[providers.llamacpp]
base_url = "http://127.0.0.1:8001"
base_url_env = "LLAMACPP_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900
healthcheck.method = "GET"
healthcheck.path = "/v1/models"
# MLX server on port 8002; no auth, zero cost.
[providers.mlx]
base_url = "http://127.0.0.1:8002"
base_url_env = "MLX_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900
healthcheck.method = "GET"
healthcheck.path = "/v1/models"
# vLLM server; shares the default localhost:8000 address with
# `[providers.local]`.
[providers.vllm]
base_url = "http://localhost:8000"
base_url_env = "VLLM_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 800
healthcheck.method = "GET"
healthcheck.path = "/v1/models"
# TGI server on port 8080. Unlike the other local servers in this file,
# its probe is `/health` rather than a model listing.
[providers.tgi]
base_url = "http://localhost:8080"
base_url_env = "TGI_BASE_URL"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
auth_style = "none"
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 950
healthcheck.method = "GET"
healthcheck.path = "/health"
# How the Ollama runtime is driven locally: through the daemon's API, not a
# spawned process (see `notes`).
[providers.ollama.local_runtime]
kind = "daemon_api"
command = "ollama"
default_port = 11434
stop = "keep_alive_zero"
notes = "Load via Ollama generate/chat warmup; unload by posting an empty prompt with keep_alive=0."
source_url = "https://github.com/ollama/ollama/blob/main/docs/api.md"
last_verified = "2026-06-05"
# Launch recipe for a managed `llama-server` process. The `*_arg` keys name
# the command-line flag used for each setting.
[providers.llamacpp.local_runtime]
kind = "managed_process"
command = "llama-server"
default_port = 8001
stop = "pid"
model_source_env = "LLAMACPP_MODEL"

# Flag names.
model_arg = "--model"
served_model_arg = "--alias"
host_arg = "--host"
port_arg = "--port"
ctx_arg = "--ctx-size"
parallel_arg = "--parallel"
gpu_layers_arg = "--n-gpu-layers"
cache_type_k_arg = "--cache-type-k"
cache_type_v_arg = "--cache-type-v"
cache_ram_arg = "--cache-ram"

# Flags and their values are separate, consecutive elements; keep the order.
default_args = [
    "--jinja",
    "--reasoning",
    "off",
    "--reasoning-format",
    "deepseek",
    "--metrics",
    "--flash-attn",
    "on",
]

notes = "OpenAI-compatible HTTP server. Use --model-source or LLAMACPP_MODEL for the GGUF path; Harn records the launched PID for local stop."
source_url = "https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md"
last_verified = "2026-06-05"
# Launch recipe for a managed `mlx_lm.server` process.
[providers.mlx.local_runtime]
kind = "managed_process"
command = "mlx_lm.server"
default_port = 8002
stop = "pid"
model_source_env = "MLX_MODEL"

# Flag names.
model_arg = "--model"
host_arg = "--host"
port_arg = "--port"

notes = "OpenAI-like MLX-LM server. Use --model-source or MLX_MODEL for an MLX-compatible path or Hugging Face repo id."
source_url = "https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/SERVER.md"
last_verified = "2026-06-05"
# Model-id -> provider inference rules. Each entry pairs a `pattern` (most
# use a `*` wildcard) with the `provider` to select.
# NOTE(review): presumably evaluated in file order with the first match
# winning — confirm against the loader before reordering entries.
[[inference_rules]]
pattern = "claude-*"
provider = "anthropic"
[[inference_rules]]
pattern = "gpt-*"
provider = "openai"
[[inference_rules]]
pattern = "o1*"
provider = "openai"
[[inference_rules]]
pattern = "o3*"
provider = "openai"
[[inference_rules]]
pattern = "o4*"
provider = "openai"
# Dot-separated vendor prefixes route to Bedrock.
[[inference_rules]]
pattern = "anthropic.claude-*"
provider = "bedrock"
[[inference_rules]]
pattern = "meta.llama*"
provider = "bedrock"
[[inference_rules]]
pattern = "amazon.*"
provider = "bedrock"
[[inference_rules]]
pattern = "mistral.*"
provider = "bedrock"
[[inference_rules]]
pattern = "cohere.*"
provider = "bedrock"
[[inference_rules]]
pattern = "gemini-*"
provider = "gemini"
# Hyphenated `mistral-*` goes to Mistral itself; dotted `mistral.*` above is Bedrock.
[[inference_rules]]
pattern = "mistral-*"
provider = "mistral"
[[inference_rules]]
pattern = "devstral-*"
provider = "mistral"
[[inference_rules]]
pattern = "command-*"
provider = "cohere"
[[inference_rules]]
pattern = "grok-*"
provider = "xai"
# Slash-prefixed ids.
[[inference_rules]]
pattern = "groq/*"
provider = "groq"
[[inference_rules]]
pattern = "cerebras/*"
provider = "cerebras"
[[inference_rules]]
pattern = "MiniMax-*"
provider = "minimax"
[[inference_rules]]
pattern = "glm-*"
provider = "zai"
[[inference_rules]]
pattern = "zhipu/*"
provider = "zai"
# DeepSeek: one wildcard rule plus two patterns with no wildcard.
[[inference_rules]]
pattern = "deepseek-v4*"
provider = "deepseek"
[[inference_rules]]
pattern = "deepseek-chat"
provider = "deepseek"
[[inference_rules]]
pattern = "deepseek-reasoner"
provider = "deepseek"
# Default tier name; "mid" is also defined as an alias (`[aliases.mid]`).
# NOTE(review): presumably applied when a request names no tier — confirm against the loader.
[tier_defaults]
default = "mid"
# Short names for the three Anthropic model families.
[aliases.sonnet]
provider = "anthropic"
id = "claude-sonnet-4-6"

[aliases.opus]
provider = "anthropic"
id = "claude-opus-4-8"

[aliases.haiku]
provider = "anthropic"
id = "claude-haiku-4-5-20251001"
# Tier aliases. Each tier has a bare name and a `tier/`-prefixed twin that
# point at the same model; keep each pair in sync.
[aliases.frontier]
provider = "anthropic"
id = "claude-sonnet-4-6"

[aliases."tier/frontier"]
provider = "anthropic"
id = "claude-sonnet-4-6"

[aliases.mid]
provider = "openai"
id = "gpt-4o-mini"

[aliases."tier/mid"]
provider = "openai"
id = "gpt-4o-mini"

[aliases.small]
provider = "openrouter"
id = "Qwen/Qwen3.5-9B"

[aliases."tier/small"]
provider = "openrouter"
id = "Qwen/Qwen3.5-9B"
# Gemma 4 on the `local` provider. The unsuffixed alias resolves to the same
# id as the 26b alias.
[aliases.local-gemma4]
provider = "local"
id = "gemma-4-26b-a4b-it"

[aliases.local-gemma4-26b]
provider = "local"
id = "gemma-4-26b-a4b-it"

[aliases.local-gemma4-31b]
provider = "local"
id = "gemma-4-31b-it"

[aliases.local-gemma4-e4b]
provider = "local"
id = "gemma-4-e4b-it"

[aliases.local-gemma4-e2b]
provider = "local"
id = "gemma-4-e2b-it"
# Gemma 4 on Ollama; every alias here uses the text tool format.
[aliases.ollama-gemma4]
provider = "ollama"
id = "gemma4:26b"
tool_format = "text"

[aliases.ollama-gemma4-26b]
provider = "ollama"
id = "gemma4:26b"
tool_format = "text"

[aliases.ollama-gemma4-12b]
provider = "ollama"
id = "gemma4:12b-mlx"
tool_format = "text"

[aliases.ollama-gemma4-12b-nvfp4]
provider = "ollama"
id = "gemma4:12b-nvfp4"
tool_format = "text"
# Gemma 4 aliases for the local 12b build and the hosted providers. The id
# spelling differs per provider (prefix and letter case), so copy ids exactly.
[aliases.local-gemma4-12b]
provider = "local"
id = "gemma-4-12b-it"

[aliases.gemini-gemma4-31b]
provider = "gemini"
id = "models/gemma-4-31b-it"

[aliases.gemini-gemma4-26b]
provider = "gemini"
id = "models/gemma-4-26b-a4b-it"

[aliases.openrouter-gemma4-31b]
provider = "openrouter"
id = "google/gemma-4-31b-it"

[aliases.openrouter-gemma4-26b]
provider = "openrouter"
id = "google/gemma-4-26b-a4b-it"

[aliases.together-gemma4-31b]
provider = "together"
id = "google/gemma-4-31B-it"
# Qwen 3.6 35B on llama.cpp. Keys stay quoted because they contain a dot.
# The first alias uses a different id and the text tool format; the other
# three share the q4 id with native tool calls.
[aliases."llamacpp-qwen3.6"]
provider = "llamacpp"
id = "qwen3.6-35b-a3b"
tool_format = "text"

[aliases."llamacpp-qwen3.6-q4"]
provider = "llamacpp"
id = "qwen3.6-35b-a3b-ud-q4-k-xl"
tool_format = "native"

[aliases."local-qwen3.6"]
provider = "llamacpp"
id = "qwen3.6-35b-a3b-ud-q4-k-xl"
tool_format = "native"

[aliases."local-qwen3.6-gguf"]
provider = "llamacpp"
id = "qwen3.6-35b-a3b-ud-q4-k-xl"
tool_format = "native"
# Qwen 3.6 27B on MLX; all four aliases resolve to the same id.
# NOTE(review): `mlx-qwen36-27b` sets no tool_format while its three siblings
# set "native" — confirm whether that omission is intentional.
[aliases.mlx-qwen36-27b]
provider = "mlx"
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"

[aliases."mlx-qwen3.6-27b"]
provider = "mlx"
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
tool_format = "native"

[aliases."mlx-qwen3.6-27b-q4"]
provider = "mlx"
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
tool_format = "native"

[aliases."local-qwen3.6-27b"]
provider = "mlx"
id = "unsloth/Qwen3.6-27B-UD-MLX-4bit"
tool_format = "native"
# MiniMax aliases. The bare `minimax` alias resolves to the same id as
# `minimax-m3`. Only keys containing a dot need quoting.
[aliases.minimax]
provider = "minimax"
id = "MiniMax-M3"

[aliases.minimax-m2]
provider = "minimax"
id = "MiniMax-M2"

[aliases."minimax-m2.5"]
provider = "minimax"
id = "MiniMax-M2.5"

[aliases."minimax-m2.7"]
provider = "minimax"
id = "MiniMax-M2.7"

[aliases.minimax-m3]
provider = "minimax"
id = "MiniMax-M3"
# GLM aliases on Z.ai. The bare `glm` alias resolves to glm-5.1.
[aliases.glm]
provider = "zai"
id = "glm-5.1"

[aliases.glm-5]
provider = "zai"
id = "glm-5"

[aliases."glm-5.1"]
provider = "zai"
id = "glm-5.1"
# DeepSeek aliases. The bare `deepseek` alias resolves to the flash model.
[aliases.deepseek]
provider = "deepseek"
id = "deepseek-v4-flash"

[aliases.deepseek-flash]
provider = "deepseek"
id = "deepseek-v4-flash"

[aliases.deepseek-pro]
provider = "deepseek"
id = "deepseek-v4-pro"

[aliases.deepseek-v4-flash]
provider = "deepseek"
id = "deepseek-v4-flash"

[aliases.deepseek-v4-pro]
provider = "deepseek"
id = "deepseek-v4-pro"
# Cohere aliases; both resolve to the same dated model id.
[aliases.cohere]
provider = "cohere"
id = "command-a-plus-05-2026"

[aliases.command-a-plus]
provider = "cohere"
id = "command-a-plus-05-2026"
# xAI aliases; both resolve to the same model id.
[aliases.grok-code]
provider = "xai"
id = "grok-build-0.1"

[aliases.grok-code-fast]
provider = "xai"
id = "grok-build-0.1"
# Devstral Small 2 on Ollama. All three share one id; only the `-native`
# alias switches tool_format from "text" to "native".
[aliases.devstral-small-2]
provider = "ollama"
id = "devstral-small-2:24b"
tool_format = "text"

[aliases.ollama-devstral-small-2]
provider = "ollama"
id = "devstral-small-2:24b"
tool_format = "text"

[aliases.ollama-devstral-small-2-native]
provider = "ollama"
id = "devstral-small-2:24b"
tool_format = "native"
# Tool-calling status for the Ollama Gemma 4 aliases: nothing probed yet,
# fallback disabled.
[alias_tool_calling.ollama-gemma4]
native = "unknown"
streaming_native = "unknown"
text = "unknown"
fallback_mode = "disabled"
failure_reason = "requires_tool_probe"

[alias_tool_calling.ollama-gemma4-12b]
native = "unknown"
streaming_native = "unknown"
text = "unknown"
fallback_mode = "disabled"
failure_reason = "requires_tool_probe"
# Tool-calling status for the llama.cpp Qwen 3.6 q4 aliases: native and
# streaming-native recorded as "pass"; text recorded as "unknown".
[alias_tool_calling."llamacpp-qwen3.6-q4"]
native = "pass"
streaming_native = "pass"
text = "unknown"
fallback_mode = "native"
last_probe_at = "2026-06-05"

[alias_tool_calling."local-qwen3.6"]
native = "pass"
streaming_native = "pass"
text = "unknown"
fallback_mode = "native"
last_probe_at = "2026-06-05"

[alias_tool_calling."local-qwen3.6-gguf"]
native = "pass"
streaming_native = "pass"
text = "unknown"
fallback_mode = "native"
last_probe_at = "2026-06-05"
# Tool-calling status for the MLX Qwen 3.6 alias: nothing probed yet, but
# fallback_mode is "native".
[alias_tool_calling."mlx-qwen3.6-27b"]
native = "unknown"
streaming_native = "unknown"
text = "unknown"
fallback_mode = "native"
failure_reason = "requires_served_identity_and_tool_probe"
# Per-provider default model ids, keyed by provider name (alphabetical).
# NOTE(review): `local` points at "gpt-4o", which `[models."gpt-4o"]` marks
# deprecated — confirm this is an intended placeholder for local servers.
[qc_defaults]
anthropic = "claude-haiku-4-5-20251001"
cohere = "command-a-plus-05-2026"
deepseek = "deepseek-v4-flash"
groq = "llama-3.1-8b-instant"
local = "gpt-4o"
minimax = "MiniMax-M2.5-highspeed"
mistral = "mistral-small-2603"
ollama = "llama3.2"
openai = "gpt-4o-mini"
openrouter = "google/gemini-2.5-flash"
xai = "grok-build-0.1"
zai = "glm-5"
# Anthropic Haiku models (direct API). Prices are per million tokens.
[models."claude-3-5-haiku-20241022"]
name = "Claude Haiku 3.5"
provider = "anthropic"
tier = "small"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["speed", "cheap", "summarization", "tool_use"]
pricing = { input_per_mtok = 0.80, output_per_mtok = 4.00, cache_read_per_mtok = 0.08, cache_write_per_mtok = 1.00 }

[models."claude-haiku-4-5-20251001"]
name = "Claude Haiku 4.5"
provider = "anthropic"
tier = "mid"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["speed", "cheap", "coding", "tool_use", "summarization"]
pricing = { input_per_mtok = 1.00, output_per_mtok = 5.00, cache_read_per_mtok = 0.10, cache_write_per_mtok = 1.25 }
# The two Claude Sonnet 3.5 snapshots; they differ only in id and name.
[models."claude-3-5-sonnet-20240620"]
name = "Claude Sonnet 3.5 (2024-06-20)"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }

[models."claude-3-5-sonnet-20241022"]
name = "Claude Sonnet 3.5 (2024-10-22)"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
# Claude Sonnet 4 (2025-05-14 snapshot). Deprecated entry.
[models."claude-sonnet-4-20250514"]
name = "Claude Sonnet 4"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
# Prices are per million tokens.
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
deprecated = true
deprecation_note = "Sunset 2026-06-15 per Anthropic deprecations page. Replaced by claude-sonnet-4-6."
# Machine-readable successor. It matches the replacement named in
# deprecation_note and mirrors the `superseded_by` key that the deprecated
# Opus entries in this file already carry.
superseded_by = "claude-sonnet-4-6"
tier = "frontier"
open_weight = false
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 49.0 }
# Claude Sonnet 4.5. Deprecated entry.
[models."claude-sonnet-4-5"]
name = "Claude Sonnet 4.5"
provider = "anthropic"
context_window = 200000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
# Prices are per million tokens.
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
deprecated = true
deprecation_note = "Sunset 2026-05-15 per Anthropic deprecations page. Replaced by claude-sonnet-4-6."
# Machine-readable successor. It matches the replacement named in
# deprecation_note and mirrors the `superseded_by` key that the deprecated
# Opus entries in this file already carry.
superseded_by = "claude-sonnet-4-6"
tier = "frontier"
open_weight = false
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
benchmarks = { swe_bench_verified = 77.2 }
# Claude Sonnet 4.6: target of the `sonnet` and `frontier` aliases.
[models."claude-sonnet-4-6"]
name = "Claude Sonnet 4.6"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
complementary_with = ["openai-gpt", "google-gemini", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
benchmarks = { swe_bench_verified = 79.6 }
# Claude Sonnet 4.7 (direct API). Prices are per million tokens.
[models."claude-sonnet-4-7"]
name = "Claude Sonnet 4.7"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
benchmarks = { swe_bench_verified = 81.0 }
# Claude models routed through OpenRouter. Unlike the direct-API entries,
# these capability lists omit "thinking".
[models."anthropic/claude-haiku-4-5"]
name = "Claude Haiku 4.5 (via OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching"]
strengths = ["speed", "cheap", "coding", "tool_use", "summarization"]
pricing = { input_per_mtok = 1.00, output_per_mtok = 5.00, cache_read_per_mtok = 0.10, cache_write_per_mtok = 1.25 }

[models."anthropic/claude-sonnet-4-6"]
name = "Claude Sonnet 4.6 (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching"]
strengths = ["coding", "reasoning", "tool_use", "long_context", "agentic"]
complementary_with = ["openai-gpt", "google-gemini", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 15.00, cache_read_per_mtok = 0.30, cache_write_per_mtok = 3.75 }
# Claude Opus 3 (direct API). Prices are per million tokens.
[models."claude-3-opus-20240229"]
name = "Claude Opus 3"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "long_context"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 75.00, cache_read_per_mtok = 1.50, cache_write_per_mtok = 18.75 }
# Claude Opus 4 (2025-05-14 snapshot). Deprecated with a dated sunset.
[models."claude-opus-4-20250514"]
name = "Claude Opus 4"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 75.00, cache_read_per_mtok = 1.50, cache_write_per_mtok = 18.75 }
benchmarks = { swe_bench_verified = 77.6 }
deprecated = true
superseded_by = "claude-opus-4-8"
deprecation_note = "Sunset 2026-06-15 per Anthropic deprecations page. Replaced by claude-opus-4-8."
# Claude Opus 4.1. Deprecated, with no sunset date recorded.
[models."claude-opus-4-1-20250805"]
name = "Claude Opus 4.1"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 75.00, cache_read_per_mtok = 1.50, cache_write_per_mtok = 18.75 }
benchmarks = { swe_bench_verified = 78.9 }
deprecated = true
superseded_by = "claude-opus-4-8"
deprecation_note = "Superseded by claude-opus-4-8. No formal sunset yet; switch when convenient."
# Claude Opus 4.6. Deprecated; its fast mode is also marked deprecated.
[models."claude-opus-4-6"]
name = "Claude Opus 4.6"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 25.00, cache_read_per_mtok = 0.50, cache_write_per_mtok = 6.25 }
benchmarks = { swe_bench_verified = 80.8, swe_bench_pro = 53.4 }
deprecated = true
superseded_by = "claude-opus-4-8"
deprecation_note = "Superseded by claude-opus-4-8. No formal sunset yet; switch when convenient."

# Fast-mode variant: the request parameter that enables it, and its own price table.
[models."claude-opus-4-6".fast_mode]
param = "speed"
value = "fast"
beta_header = "fast-mode-2026-02-01"
otps_speedup = 2.5
status = "deprecated"
pricing = { input_per_mtok = 30.00, output_per_mtok = 150.00, cache_read_per_mtok = 3.00, cache_write_per_mtok = 37.50 }
note = "Deprecated at the Opus 4.8 launch; removed ~30 days later, after which speed=fast silently falls back to standard speed/pricing. Migrate to Opus 4.8 or 4.7 fast mode."
# Claude Opus 4.7. Deprecated; its fast mode is a research preview.
[models."claude-opus-4-7"]
name = "Claude Opus 4.7"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 25.00, cache_read_per_mtok = 0.50, cache_write_per_mtok = 6.25 }
benchmarks = { swe_bench_verified = 87.6, swe_bench_pro = 64.3 }
deprecated = true
superseded_by = "claude-opus-4-8"
deprecation_note = "Superseded by claude-opus-4-8. No formal sunset yet; switch when convenient."

# Fast-mode variant: the request parameter that enables it, and its own price table.
[models."claude-opus-4-7".fast_mode]
param = "speed"
value = "fast"
beta_header = "fast-mode-2026-02-01"
otps_speedup = 2.5
status = "research_preview"
pricing = { input_per_mtok = 30.00, output_per_mtok = 150.00, cache_read_per_mtok = 3.00, cache_write_per_mtok = 37.50 }
note = "Claude API + Managed Agents only. Migrate to Opus 4.8 fast mode for the cheaper 2x rate."
# Claude Opus 4.8: target of the `opus` alias and of the older Opus entries'
# `superseded_by`.
[models."claude-opus-4-8"]
name = "Claude Opus 4.8"
provider = "anthropic"
tier = "frontier"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "long_context", "agentic"]
complementary_with = ["openai-gpt", "google-gemini", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 25.00, cache_read_per_mtok = 0.50, cache_write_per_mtok = 6.25 }
benchmarks = { swe_bench_verified = 88.6, swe_bench_pro = 69.2 }

# Fast-mode variant; every price is 2x the standard table above.
[models."claude-opus-4-8".fast_mode]
param = "speed"
value = "fast"
beta_header = "fast-mode-2026-02-01"
otps_speedup = 2.5
status = "research_preview"
pricing = { input_per_mtok = 10.00, output_per_mtok = 50.00, cache_read_per_mtok = 1.00, cache_write_per_mtok = 12.50 }
note = "Claude API + Managed Agents only (not Bedrock/Vertex/Foundry); excluded from Batch and Priority Tier. Switching speed invalidates the prompt cache. Waitlist/account-manager gated."
# GPT-5.5. The pricing table has no cache-write price.
[models."gpt-5.5"]
name = "GPT-5.5"
provider = "openai"
tier = "frontier"
open_weight = false
context_window = 400_000
capabilities = ["tools", "streaming", "prompt_caching", "thinking"]
strengths = ["reasoning", "coding", "tool_use", "long_context", "agentic"]
complementary_with = ["anthropic-claude", "google-gemini", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 5.00, output_per_mtok = 30.00, cache_read_per_mtok = 0.50 }

# Fast-mode variant; every price is 2.5x the standard table above.
[models."gpt-5.5".fast_mode]
param = "service_tier"
value = "fast"
otps_speedup = 1.5
status = "ga"
pricing = { input_per_mtok = 12.50, output_per_mtok = 75.00, cache_read_per_mtok = 1.25 }
note = "Codex \"Fast mode\" (service_tier=\"fast\", ~1.5x faster output) and API priority processing (service_tier=\"priority\") both bill at 2.5x standard. Not offered for long-context, fine-tuned models, or embeddings."
# GPT-4o. Deprecated; the note records an API sunset date.
[models."gpt-4o"]
name = "GPT-4o"
provider = "openai"
tier = "frontier"
open_weight = false
context_window = 128_000
capabilities = ["tools", "streaming"]
strengths = ["coding", "vision", "tool_use"]
pricing = { input_per_mtok = 2.50, output_per_mtok = 10.00, cache_read_per_mtok = 1.25 }
deprecated = true
deprecation_note = "API sunset 2026-02-17 per OpenAI deprecations page. Switch to gpt-5-mini for cheap routing or gpt-5 for frontier."
# GPT-4o Mini: target of the `mid` tier aliases. No cache prices listed.
[models."gpt-4o-mini"]
name = "GPT-4o Mini"
provider = "openai"
tier = "mid"
open_weight = false
context_window = 128_000
capabilities = ["tools", "streaming"]
strengths = ["speed", "cheap", "summarization", "tool_use"]
complementary_with = ["anthropic-claude", "google-gemini", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60 }
# GPT-4 Turbo. Deprecated; kept for cost-attribution backfill (see note).
[models."gpt-4-turbo"]
name = "GPT-4 Turbo"
provider = "openai"
tier = "frontier"
open_weight = false
context_window = 128_000
capabilities = ["tools", "streaming"]
strengths = ["coding", "tool_use"]
pricing = { input_per_mtok = 10.00, output_per_mtok = 30.00 }
deprecated = true
deprecation_note = "Superseded by gpt-5 family. Listed for cost-attribution backfill only."
# OpenAI o1 family; both use the "reasoning" tier.
[models.o1]
name = "OpenAI o1"
provider = "openai"
tier = "reasoning"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming"]
strengths = ["reasoning"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 60.00, cache_read_per_mtok = 7.50 }

[models."o1-mini"]
name = "OpenAI o1-mini"
provider = "openai"
tier = "reasoning"
open_weight = false
context_window = 128_000
capabilities = ["tools", "streaming"]
strengths = ["reasoning", "cheap"]
pricing = { input_per_mtok = 3.00, output_per_mtok = 12.00, cache_read_per_mtok = 1.50 }
# OpenAI o3 family; both use the "reasoning" tier.
[models.o3]
name = "OpenAI o3"
provider = "openai"
tier = "reasoning"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming"]
strengths = ["reasoning", "coding"]
pricing = { input_per_mtok = 15.00, output_per_mtok = 60.00, cache_read_per_mtok = 7.50 }
benchmarks = { swe_bench_verified = 69.1 }

[models."o3-mini"]
name = "OpenAI o3-mini"
provider = "openai"
tier = "reasoning"
open_weight = false
context_window = 200_000
capabilities = ["tools", "streaming"]
strengths = ["reasoning", "coding", "cheap"]
pricing = { input_per_mtok = 1.10, output_per_mtok = 4.40, cache_read_per_mtok = 0.55 }
benchmarks = { swe_bench_verified = 49.3 }
# Gemini 2.5 Flash (direct Gemini API).
[models."gemini-2.5-flash"]
name = "Gemini 2.5 Flash"
provider = "gemini"
tier = "mid"
open_weight = false
context_window = 1_048_576
capabilities = ["tools", "streaming"]
strengths = ["speed", "long_context", "vision", "cheap", "tool_use"]
complementary_with = ["anthropic-claude", "openai-gpt", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 0.10, output_per_mtok = 0.40, cache_read_per_mtok = 0.025 }
# Gemini 2.5 Flash routed through OpenRouter; this id is also the
# `qc_defaults.openrouter` value.
[models."google/gemini-2.5-flash"]
name = "Gemini 2.5 Flash (via OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = false
context_window = 1_048_576
capabilities = ["tools", "streaming"]
strengths = ["speed", "long_context", "vision", "cheap", "tool_use"]
complementary_with = ["anthropic-claude", "openai-gpt", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 0.10, output_per_mtok = 0.40, cache_read_per_mtok = 0.025 }
# Gemini 2.5 Pro (direct Gemini API).
[models."gemini-2.5-pro"]
name = "Gemini 2.5 Pro"
provider = "gemini"
tier = "frontier"
open_weight = false
context_window = 2_097_152
capabilities = ["tools", "streaming"]
strengths = ["long_context", "vision", "reasoning", "coding"]
complementary_with = ["anthropic-claude", "openai-gpt", "qwen", "deepseek", "kimi"]
pricing = { input_per_mtok = 1.25, output_per_mtok = 5.00, cache_read_per_mtok = 0.3125 }
benchmarks = { swe_bench_verified = 63.8 }
# Mistral Large 3 on Mistral's own API. It shares `logical_model` and
# `equivalence_group` with the OpenRouter-served entry.
[models."mistral-large-2512"]
name = "Mistral Large 3 2512"
provider = "mistral"
logical_model = "mistral-large-3-2512"
equivalence_group = "mistral-large-3-2512"
api_dialect = "openai_chat_compat"
tier = "frontier"
open_weight = true
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["coding", "tool_use", "long_context", "vision"]
pricing = { input_per_mtok = 0.50, output_per_mtok = 1.50, cache_read_per_mtok = 0.05 }

# Parameter counts are in billions (per the `_b` key suffix).
[models."mistral-large-2512".architecture]
parameter_count_b = 675.0
active_parameter_count_b = 41.0
moe = true
license = "Apache-2.0"
source_url = "https://docs.mistral.ai/models/model-cards/mistral-large-3-25-12"
last_verified = "2026-06-05"
# Mistral Small 4 on Mistral's own API; this id is also the
# `qc_defaults.mistral` value.
[models."mistral-small-2603"]
name = "Mistral Small 4"
provider = "mistral"
logical_model = "mistral-small-4-2603"
equivalence_group = "mistral-small-4-2603"
api_dialect = "openai_chat_compat"
tier = "mid"
open_weight = true
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["cheap", "coding", "speed", "tool_use", "long_context"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60, cache_read_per_mtok = 0.015 }

# Parameter counts are in billions (per the `_b` key suffix).
[models."mistral-small-2603".architecture]
parameter_count_b = 119.0
active_parameter_count_b = 6.5
moe = true
license = "Apache-2.0"
source_url = "https://docs.mistral.ai/models/model-cards/mistral-small-4-0-26-03"
last_verified = "2026-06-05"
# Mistral Large 3 (2512) as served through OpenRouter; shares the
# "mistral-large-3-2512" equivalence group.
[models."mistralai/mistral-large-2512"]
name = "Mistral Large 3 2512"
provider = "openrouter"
logical_model = "mistral-large-3-2512"
equivalence_group = "mistral-large-3-2512"
served_variant = "openrouter"
api_dialect = "openai_chat"
tier = "frontier"
open_weight = true
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["coding", "tool_use", "long_context"]
pricing = { input_per_mtok = 0.50, output_per_mtok = 1.50, cache_read_per_mtok = 0.05 }
# Model-card facts, with the page they were taken from.
[models."mistralai/mistral-large-2512".architecture]
parameter_count_b = 675.0
active_parameter_count_b = 41.0
moe = true
license = "Apache-2.0"
source_url = "https://docs.mistral.ai/models/model-cards/mistral-large-3-25-12"
last_verified = "2026-06-05"
# Mistral Small 4 (2603) as served through OpenRouter; shares the
# "mistral-small-4-2603" equivalence group.
[models."mistralai/mistral-small-2603"]
name = "Mistral Small 4"
provider = "openrouter"
logical_model = "mistral-small-4-2603"
equivalence_group = "mistral-small-4-2603"
served_variant = "openrouter"
api_dialect = "openai_chat"
tier = "mid"
open_weight = true
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["cheap", "coding", "speed"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60, cache_read_per_mtok = 0.015 }
# Model-card facts, with the page they were taken from.
[models."mistralai/mistral-small-2603".architecture]
parameter_count_b = 119.0
active_parameter_count_b = 6.5
moe = true
license = "Apache-2.0"
source_url = "https://docs.mistral.ai/models/model-cards/mistral-small-4-0-26-03"
last_verified = "2026-06-05"
# Qwen3 Coder 480B A35B via OpenRouter, serverless availability.
[models."qwen/qwen3-coder"]
name = "Qwen3 Coder 480B A35B"
provider = "openrouter"
tier = "frontier"
open_weight = true
availability = "serverless"
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["coding", "long_context", "agentic", "tool_use"]
pricing = { input_per_mtok = 0.22, output_per_mtok = 1.80 }
benchmarks = { swe_bench_verified = 67.0 }
# Qwen3 Coder Next (FP8) on Together; marked as a dedicated-endpoint model.
[models."Qwen/Qwen3-Coder-Next-FP8"]
name = "Qwen3 Coder Next FP8 (Together, dedicated)"
provider = "together"
tier = "frontier"
open_weight = true
availability = "dedicated"
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["coding", "long_context", "agentic"]
complementary_with = ["anthropic-claude", "openai-gpt", "google-gemini", "deepseek", "kimi"]
pricing = { input_per_mtok = 0.18, output_per_mtok = 0.18 }
# DeepSeek V3.2 via OpenRouter.
[models."deepseek/deepseek-v3.2"]
name = "DeepSeek V3.2"
provider = "openrouter"
tier = "mid"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["coding", "tool_use", "cheap"]
complementary_with = ["anthropic-claude", "openai-gpt", "google-gemini", "qwen", "kimi"]
pricing = { input_per_mtok = 0.28, output_per_mtok = 0.42 }
# Kimi K2.6 via OpenRouter.
[models."moonshotai/kimi-k2.6"]
name = "Kimi K2.6"
provider = "openrouter"
tier = "frontier"
open_weight = true
context_window = 262_144
capabilities = ["tools", "streaming"]
strengths = ["coding", "agentic", "long_context", "tool_use", "reasoning"]
complementary_with = ["anthropic-claude", "openai-gpt", "google-gemini", "qwen", "deepseek"]
pricing = { input_per_mtok = 0.73, output_per_mtok = 3.49 }
benchmarks = { swe_bench_pro = 58.6, humanitys_last_exam_with_tools = 54.0 }
# GPT-OSS 120B as served through OpenRouter.
# NOTE(review): tier is "mid" here, while the Cerebras and Groq entries in the same
# equivalence_group ("openai-gpt-oss-120b") are "frontier" — confirm the split is intended.
[models."openai/gpt-oss-120b"]
name = "GPT-OSS 120B"
provider = "openrouter"
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "openrouter"
api_dialect = "openai_chat"
tier = "mid"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["cheap", "tool_use"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60 }
# Model-card facts, with the page they were taken from.
[models."openai/gpt-oss-120b".architecture]
parameter_count_b = 117.0
active_parameter_count_b = 5.1
moe = true
license = "Apache-2.0"
source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b"
last_verified = "2026-06-05"
# GPT-OSS 120B on Cerebras (served_variant "cerebras-wafer-scale").
[models."gpt-oss-120b"]
name = "GPT-OSS 120B (Cerebras)"
provider = "cerebras"
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "cerebras-wafer-scale"
api_dialect = "openai_chat"
tier = "frontier"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["speed", "cheap", "tool_use"]
pricing = { input_per_mtok = 0.35, output_per_mtok = 0.75 }
# Recorded limits are the free-trial row; `tier` below names the account tier,
# not the model tier above.
[models."gpt-oss-120b".rate_limits]
rpm = 5
tpm = 30_000
tph = 1_000_000
tpd = 1_000_000
tier = "free"
source_url = "https://inference-docs.cerebras.ai/support/rate-limits"
last_verified = "2026-06-05"
notes = "Published Free Trial row; Developer (Pay as You Go) lists 1K RPM and 1M TPM with no hourly/daily cap."
# Model-card facts, with the page they were taken from.
[models."gpt-oss-120b".architecture]
parameter_count_b = 117.0
active_parameter_count_b = 5.1
moe = true
license = "Apache-2.0"
source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b"
last_verified = "2026-06-05"
# Z.ai GLM 4.7 on Cerebras.
[models."zai-glm-4.7"]
name = "Z.ai GLM 4.7 (Cerebras)"
provider = "cerebras"
tier = "frontier"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming", "thinking"]
strengths = ["speed", "coding", "agentic", "tool_use", "reasoning"]
pricing = { input_per_mtok = 2.25, output_per_mtok = 2.75 }
# Llama 3.3 70B on Cerebras. Deprecated and dedicated-only; see deprecation_note.
[models."llama-3.3-70b"]
name = "Llama 3.3 70B (Cerebras, dedicated legacy)"
provider = "cerebras"
tier = "mid"
open_weight = true
availability = "dedicated"
deprecated = true
deprecation_note = "Cerebras no longer returns this model from public discovery; use a provisioned dedicated endpoint alias if your organization still serves these weights."
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["speed", "tool_use"]
pricing = { input_per_mtok = 0.85, output_per_mtok = 1.20 }
# MiniMax M3 on the first-party `minimax` provider; closed weights.
[models."MiniMax-M3"]
name = "MiniMax M3"
provider = "minimax"
tier = "frontier"
open_weight = false
context_window = 1_000_000
capabilities = ["tools", "vision", "video", "streaming", "thinking", "prompt_caching"]
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context", "vision"]
pricing = { input_per_mtok = 0.60, output_per_mtok = 2.40, cache_read_per_mtok = 0.12 }
# MiniMax M2 on the first-party `minimax` provider.
[models."MiniMax-M2"]
name = "MiniMax M2"
provider = "minimax"
tier = "mid"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming", "thinking"]
strengths = ["coding", "agentic", "cheap", "tool_use"]
pricing = { input_per_mtok = 0.255, output_per_mtok = 1.00, cache_read_per_mtok = 0.051 }
benchmarks = { aa_intelligence_index = 45.0 }
# MiniMax M2.5 on the first-party `minimax` provider.
[models."MiniMax-M2.5"]
name = "MiniMax M2.5"
provider = "minimax"
tier = "frontier"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming", "thinking"]
strengths = ["coding", "agentic", "tool_use", "long_context"]
pricing = { input_per_mtok = 0.28, output_per_mtok = 1.10, cache_read_per_mtok = 0.056 }
# Highspeed variant of MiniMax M2.5; no "thinking" capability listed.
[models."MiniMax-M2.5-highspeed"]
name = "MiniMax M2.5 (highspeed)"
provider = "minimax"
tier = "mid"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming"]
strengths = ["speed", "coding", "agentic"]
pricing = { input_per_mtok = 0.28, output_per_mtok = 1.10, cache_read_per_mtok = 0.056 }
# MiniMax M2.7 on the first-party `minimax` provider.
[models."MiniMax-M2.7"]
name = "MiniMax M2.7"
provider = "minimax"
tier = "frontier"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
pricing = { input_per_mtok = 0.30, output_per_mtok = 1.20, cache_read_per_mtok = 0.06 }
benchmarks = { aa_intelligence_index = 50.0 }
# Highspeed variant of MiniMax M2.7; no "thinking" capability listed.
[models."MiniMax-M2.7-highspeed"]
name = "MiniMax M2.7 (highspeed)"
provider = "minimax"
tier = "mid"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming", "prompt_caching"]
strengths = ["speed", "coding", "agentic"]
pricing = { input_per_mtok = 0.30, output_per_mtok = 1.20, cache_read_per_mtok = 0.06 }
# MiniMax Text 01 on the first-party `minimax` provider; long-context entry.
[models."MiniMax-Text-01"]
name = "MiniMax Text 01"
provider = "minimax"
tier = "mid"
open_weight = true
context_window = 1_000_000
capabilities = ["tools", "streaming"]
strengths = ["long_context"]
pricing = { input_per_mtok = 0.20, output_per_mtok = 1.10 }
# MiniMax M3 as listed on OpenRouter; closed weights.
[models."minimax/minimax-m3"]
name = "MiniMax M3 (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = false
context_window = 1_048_576
capabilities = ["tools", "vision", "video", "streaming", "thinking", "prompt_caching"]
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context", "vision"]
pricing = { input_per_mtok = 0.60, output_per_mtok = 2.40, cache_read_per_mtok = 0.12 }
# MiniMax M2.7 as listed on OpenRouter.
[models."minimax/minimax-m2.7"]
name = "MiniMax M2.7 (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming"]
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
pricing = { input_per_mtok = 0.40, output_per_mtok = 1.50 }
# MiniMax M2 as listed on OpenRouter.
[models."minimax/minimax-m2"]
name = "MiniMax M2 (via OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = true
context_window = 204_800
capabilities = ["tools", "streaming"]
strengths = ["coding", "agentic", "cheap"]
pricing = { input_per_mtok = 0.33, output_per_mtok = 1.20 }
# GLM 5 on the first-party `zai` provider.
[models."glm-5"]
name = "GLM 5"
provider = "zai"
tier = "frontier"
open_weight = true
context_window = 202_752
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
strengths = ["coding", "agentic", "tool_use"]
pricing = { input_per_mtok = 0.98, output_per_mtok = 3.08, cache_read_per_mtok = 0.20 }
# GLM 5.1 on the first-party `zai` provider.
[models."glm-5.1"]
name = "GLM 5.1"
provider = "zai"
tier = "frontier"
open_weight = true
context_window = 202_752
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
pricing = { input_per_mtok = 1.40, output_per_mtok = 4.40, cache_read_per_mtok = 0.26 }
benchmarks = { swe_bench_pro_lead = 1.0 }
# GLM 5 as listed on OpenRouter.
[models."z-ai/glm-5"]
name = "GLM 5 (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = true
context_window = 202_752
capabilities = ["tools", "streaming"]
strengths = ["coding", "agentic"]
pricing = { input_per_mtok = 1.20, output_per_mtok = 4.00 }
# GLM 5.1 as listed on OpenRouter.
[models."z-ai/glm-5.1"]
name = "GLM 5.1 (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = true
context_window = 202_752
capabilities = ["tools", "streaming"]
strengths = ["coding", "agentic", "tool_use", "reasoning", "long_context"]
pricing = { input_per_mtok = 0.98, output_per_mtok = 3.08 }
# GLM 5V Turbo as listed on OpenRouter; the vision-capable GLM entry.
[models."z-ai/glm-5v-turbo"]
name = "GLM 5V Turbo (via OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = true
context_window = 202_752
capabilities = ["tools", "streaming", "vision"]
strengths = ["vision", "speed"]
pricing = { input_per_mtok = 1.20, output_per_mtok = 4.00 }
# DeepSeek V4 Flash on the first-party `deepseek` provider.
[models."deepseek-v4-flash"]
name = "DeepSeek V4 Flash"
provider = "deepseek"
tier = "mid"
open_weight = true
context_window = 1_000_000
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
strengths = ["speed", "cheap", "tool_use", "reasoning", "long_context"]
pricing = { input_per_mtok = 0.14, output_per_mtok = 0.28, cache_read_per_mtok = 0.0028 }
benchmarks = { aa_intelligence_index = 58.0 }
# DeepSeek V4 Pro on the first-party `deepseek` provider.
[models."deepseek-v4-pro"]
name = "DeepSeek V4 Pro"
provider = "deepseek"
tier = "frontier"
open_weight = true
context_window = 1_000_000
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
strengths = ["reasoning", "coding", "tool_use", "long_context"]
pricing = { input_per_mtok = 0.435, output_per_mtok = 0.87, cache_read_per_mtok = 0.003625 }
benchmarks = { aa_intelligence_index = 68.0 }
# Legacy `deepseek-chat` alias. Deprecated; deprecation_note records what it maps to
# and the retirement date.
[models."deepseek-chat"]
name = "DeepSeek Chat (legacy → V4 Flash, non-thinking)"
provider = "deepseek"
tier = "mid"
open_weight = true
deprecated = true
deprecation_note = "Maps to deepseek-v4-flash non-thinking mode; retirement 2026-07-24 15:59 UTC per provider docs."
context_window = 1_000_000
capabilities = ["tools", "streaming", "prompt_caching"]
strengths = ["coding", "tool_use"]
pricing = { input_per_mtok = 0.14, output_per_mtok = 0.28, cache_read_per_mtok = 0.0028 }
# Legacy `deepseek-reasoner` alias. Deprecated; deprecation_note records what it maps to
# and the retirement date.
# NOTE(review): tier is "reasoning", whereas the deepseek-v4-flash entry it maps to is "mid" —
# confirm "reasoning" is an accepted tier value.
[models."deepseek-reasoner"]
name = "DeepSeek Reasoner (legacy → V4 Flash, thinking)"
provider = "deepseek"
tier = "reasoning"
open_weight = true
deprecated = true
deprecation_note = "Maps to deepseek-v4-flash thinking mode; retirement 2026-07-24 15:59 UTC per provider docs."
context_window = 1_000_000
capabilities = ["tools", "streaming", "thinking", "prompt_caching"]
strengths = ["reasoning", "coding"]
pricing = { input_per_mtok = 0.14, output_per_mtok = 0.28, cache_read_per_mtok = 0.0028 }
# DeepSeek V4 Flash as listed on OpenRouter.
[models."deepseek/deepseek-v4-flash"]
name = "DeepSeek V4 Flash (via OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = true
context_window = 1_048_576
capabilities = ["tools", "streaming"]
strengths = ["speed", "cheap", "tool_use", "reasoning", "long_context"]
pricing = { input_per_mtok = 0.0983, output_per_mtok = 0.1966, cache_read_per_mtok = 0.0197 }
# DeepSeek V4 Pro as listed on OpenRouter.
[models."deepseek/deepseek-v4-pro"]
name = "DeepSeek V4 Pro (via OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = true
context_window = 1_048_576
capabilities = ["tools", "streaming"]
strengths = ["reasoning", "coding", "tool_use", "long_context"]
pricing = { input_per_mtok = 0.435, output_per_mtok = 0.87, cache_read_per_mtok = 0.003625 }
# Qwen3.5 9B, small tier.
# NOTE(review): this entry names OpenRouter as its provider but carries no pricing,
# unlike the other OpenRouter entries nearby — confirm the provider and add pricing if it is billed.
[models."Qwen/Qwen3.5-9B"]
name = "Qwen3.5 9B"
provider = "openrouter"
tier = "small"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["cheap", "speed"]
# Llama 3.2 served locally through Ollama; no pricing entry.
[models."llama3.2"]
name = "Llama 3.2"
provider = "ollama"
tier = "small"
open_weight = true
context_window = 32_000
stream_timeout = 300.0
capabilities = ["tools", "streaming"]
strengths = ["cheap", "speed"]
# Gemma 4 26B MoE through Ollama. runtime_context_window is smaller than context_window.
[models."gemma4:26b"]
name = "Gemma 4 26B MoE"
provider = "ollama"
tier = "mid"
open_weight = true
context_window = 262_144
runtime_context_window = 32_768
stream_timeout = 300.0
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["vision", "tool_use"]
# Gemma 4 12B, MLX build, through Ollama.
[models."gemma4:12b-mlx"]
name = "Gemma 4 12B (MLX)"
provider = "ollama"
tier = "mid"
open_weight = true
context_window = 131_072
runtime_context_window = 32_768
stream_timeout = 240.0
capabilities = ["tools", "streaming", "thinking"]
strengths = ["speed", "cheap"]
# Gemma 4 12B, NVFP4 build, through Ollama.
[models."gemma4:12b-nvfp4"]
name = "Gemma 4 12B (NVFP4)"
provider = "ollama"
tier = "mid"
open_weight = true
context_window = 131_072
runtime_context_window = 32_768
stream_timeout = 240.0
capabilities = ["tools", "streaming", "thinking"]
strengths = ["cheap", "tool_use"]
# Gemma 4 12B, MXFP8 build, through Ollama.
[models."gemma4:12b-mxfp8"]
name = "Gemma 4 12B (MXFP8)"
provider = "ollama"
tier = "mid"
open_weight = true
context_window = 131_072
runtime_context_window = 32_768
stream_timeout = 240.0
capabilities = ["tools", "streaming", "thinking"]
strengths = ["cheap", "tool_use"]
# Devstral Small 2 24B through Ollama; longest stream_timeout of the Ollama entries shown here.
[models."devstral-small-2:24b"]
name = "Devstral Small 2 24B"
provider = "ollama"
tier = "mid"
open_weight = true
context_window = 262_144
runtime_context_window = 32_768
stream_timeout = 600.0
capabilities = ["tools", "streaming"]
strengths = ["coding", "agentic"]
# Qwen3.6 35B, Unsloth Q4_K_XL quantization, served by llama.cpp.
[models."qwen3.6-35b-a3b-ud-q4-k-xl"]
name = "Qwen3.6 35B (Unsloth Q4_K_XL, llama.cpp)"
provider = "llamacpp"
tier = "mid"
open_weight = true
context_window = 262_144
runtime_context_window = 65_536
stream_timeout = 900.0
capabilities = ["tools", "streaming", "thinking"]
strengths = ["coding"]
# Memory figures for launching this model locally; `notes` describes how the
# measurement was taken and that it is a conservative guard rather than an exact model.
[models."qwen3.6-35b-a3b-ud-q4-k-xl".local_memory]
# The observed data point.
measured_resident_gib = 19.5
measured_context_window = 8_192
measured_cache_type = "q8_0"
# Estimation parameters.
base_resident_gib = 19.0
kv_cache_gib_per_1k_ctx = 0.10
safety_margin_gib = 4.0
max_recommended_context = 65_536
default_cache_type = "q8_0"
# q8_0 is the 1.0 baseline; presumably these scale the KV-cache term per cache type — confirm against the consumer.
cache_type_multipliers = { q8_0 = 1.0, f16 = 2.0, q4_0 = 0.5, q4_1 = 0.5, q5_0 = 0.625, q5_1 = 0.625 }
last_verified = "2026-06-05"
notes = "Empirical llama-server RSS was about 19.5 GiB at ctx=8192 with q8_0 KV on Apple Silicon. Treat as a conservative launch guard, not an exact allocator model."
# Qwen3.6 35B, Unsloth Q5_K_XL quantization, served by llama.cpp.
[models."qwen3.6-35b-a3b-ud-q5-k-xl"]
name = "Qwen3.6 35B (Unsloth Q5_K_XL, llama.cpp)"
provider = "llamacpp"
tier = "mid"
open_weight = true
context_window = 262_144
runtime_context_window = 65_536
stream_timeout = 900.0
capabilities = ["tools", "streaming", "thinking"]
strengths = ["coding"]
# Qwen3.6 35B served by llama.cpp, without a quantization suffix in its id.
[models."qwen3.6-35b-a3b"]
name = "Qwen3.6 35B (llama.cpp)"
provider = "llamacpp"
tier = "mid"
open_weight = true
context_window = 262_144
runtime_context_window = 65_536
stream_timeout = 900.0
capabilities = ["tools", "streaming", "thinking"]
strengths = ["coding"]
# Qwen3.6 27B, MLX 4-bit build, on the `mlx` provider.
[models."unsloth/Qwen3.6-27B-UD-MLX-4bit"]
name = "Qwen3.6 27B (MLX 4-bit)"
provider = "mlx"
tier = "mid"
open_weight = true
context_window = 262_144
stream_timeout = 900.0
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["coding", "vision"]
# Gemma 4 E2B on the `local` provider; no "tools" capability listed.
[models."gemma-4-e2b-it"]
name = "Gemma 4 E2B (local)"
provider = "local"
tier = "small"
open_weight = true
context_window = 131_072
stream_timeout = 300.0
capabilities = ["streaming", "thinking"]
strengths = ["cheap", "speed"]
# Gemma 4 E4B on the `local` provider; no "tools" capability listed.
[models."gemma-4-e4b-it"]
name = "Gemma 4 E4B (local)"
provider = "local"
tier = "small"
open_weight = true
context_window = 131_072
stream_timeout = 300.0
capabilities = ["streaming", "thinking"]
strengths = ["cheap"]
# Gemma 4 26B MoE on the `local` provider; no "tools" capability listed.
[models."gemma-4-26b-a4b-it"]
name = "Gemma 4 26B MoE (local)"
provider = "local"
tier = "mid"
open_weight = true
context_window = 131_072
stream_timeout = 600.0
capabilities = ["streaming", "thinking"]
strengths = ["coding"]
# Gemma 4 31B on the `local` provider; no "tools" capability listed.
[models."gemma-4-31b-it"]
name = "Gemma 4 31B (local)"
provider = "local"
tier = "frontier"
open_weight = true
context_window = 131_072
stream_timeout = 600.0
capabilities = ["streaming", "thinking"]
strengths = ["coding", "long_context"]
# Gemma 4 12B on the `local` provider; no "tools" capability listed.
[models."gemma-4-12b-it"]
name = "Gemma 4 12B (local)"
provider = "local"
tier = "mid"
open_weight = true
context_window = 131_072
stream_timeout = 300.0
capabilities = ["streaming", "thinking"]
strengths = ["cheap", "speed"]
# Gemma 4 31B as listed on OpenRouter.
[models."google/gemma-4-31b-it"]
name = "Gemma 4 31B (OpenRouter)"
provider = "openrouter"
tier = "frontier"
open_weight = true
context_window = 262_144
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["vision", "reasoning", "coding", "cheap"]
pricing = { input_per_mtok = 0.12, output_per_mtok = 0.37 }
# Gemma 4 26B MoE as listed on OpenRouter.
[models."google/gemma-4-26b-a4b-it"]
name = "Gemma 4 26B MoE (OpenRouter)"
provider = "openrouter"
tier = "mid"
open_weight = true
context_window = 262_144
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["vision", "cheap", "speed"]
pricing = { input_per_mtok = 0.06, output_per_mtok = 0.33 }
# Gemma 4 31B through the Gemini API; no pricing entry.
[models."models/gemma-4-31b-it"]
name = "Gemma 4 31B (Gemini API)"
provider = "gemini"
tier = "frontier"
open_weight = true
context_window = 262_144
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["vision", "reasoning", "coding", "cheap"]
# Gemma 4 26B MoE through the Gemini API; no pricing entry.
[models."models/gemma-4-26b-a4b-it"]
name = "Gemma 4 26B MoE (Gemini API)"
provider = "gemini"
tier = "mid"
open_weight = true
context_window = 262_144
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["vision", "cheap", "speed"]
# Gemma 4 31B on Together. The id differs from the OpenRouter entry only by the
# capital "B"; TOML keys are case-sensitive, so the two tables are distinct.
[models."google/gemma-4-31B-it"]
name = "Gemma 4 31B (Together)"
provider = "together"
tier = "frontier"
open_weight = true
context_window = 262_144
capabilities = ["tools", "vision", "streaming", "thinking"]
strengths = ["vision", "reasoning", "coding"]
pricing = { input_per_mtok = 0.20, output_per_mtok = 0.50 }
# GPT-OSS 120B on Groq. The catalog id carries a "groq/" prefix; wire_model holds
# the id without it.
[models."groq/openai/gpt-oss-120b"]
name = "GPT-OSS 120B (Groq)"
provider = "groq"
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "groq-lpu"
wire_model = "openai/gpt-oss-120b"
api_dialect = "openai_chat"
tier = "frontier"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming", "thinking"]
strengths = ["speed", "cheap", "tool_use", "reasoning"]
pricing = { input_per_mtok = 0.15, output_per_mtok = 0.60 }
# `tier` below names the account tier the limits apply to, not the model tier above.
[models."groq/openai/gpt-oss-120b".rate_limits]
rpm = 1000
tpm = 250_000
tier = "developer"
source_url = "https://console.groq.com/docs/models"
last_verified = "2026-06-05"
# Model-card facts, with the page they were taken from.
[models."groq/openai/gpt-oss-120b".architecture]
parameter_count_b = 117.0
active_parameter_count_b = 5.1
moe = true
license = "Apache-2.0"
source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b"
last_verified = "2026-06-05"
# Llama 3.3 70B Versatile on Groq.
[models."llama-3.3-70b-versatile"]
name = "Llama 3.3 70B Versatile (Groq)"
provider = "groq"
logical_model = "llama-3.3-70b-instruct"
equivalence_group = "llama-3.3-70b-instruct"
served_variant = "groq-lpu"
api_dialect = "openai_chat"
tier = "mid"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["speed", "cheap", "tool_use"]
pricing = { input_per_mtok = 0.59, output_per_mtok = 0.79 }
# `tier` below names the account tier the limits apply to, not the model tier above.
[models."llama-3.3-70b-versatile".rate_limits]
rpm = 1000
tpm = 300_000
tier = "developer"
source_url = "https://console.groq.com/docs/models"
last_verified = "2026-06-05"
# Dense model: no active_parameter_count_b is recorded.
[models."llama-3.3-70b-versatile".architecture]
parameter_count_b = 70.0
moe = false
license = "Llama 3.3 Community"
source_url = "https://console.groq.com/docs/models"
last_verified = "2026-06-05"
# Llama 3.1 8B Instant on Groq.
[models."llama-3.1-8b-instant"]
name = "Llama 3.1 8B Instant (Groq)"
provider = "groq"
logical_model = "llama-3.1-8b-instruct"
equivalence_group = "llama-3.1-8b-instruct"
served_variant = "groq-lpu"
api_dialect = "openai_chat"
tier = "small"
open_weight = true
context_window = 131_072
capabilities = ["tools", "streaming"]
strengths = ["speed", "cheap"]
pricing = { input_per_mtok = 0.05, output_per_mtok = 0.08 }
# `tier` below names the account tier the limits apply to, not the model tier above.
[models."llama-3.1-8b-instant".rate_limits]
rpm = 1000
tpm = 250_000
tier = "developer"
source_url = "https://console.groq.com/docs/models"
last_verified = "2026-06-05"
# Dense model: no active_parameter_count_b is recorded.
[models."llama-3.1-8b-instant".architecture]
parameter_count_b = 8.0
moe = false
license = "Llama 3.1 Community"
source_url = "https://console.groq.com/docs/models"
last_verified = "2026-06-05"
# Command A+ on Cohere. The recorded rate limit is the trial-key figure; see the notes field.
[models."command-a-plus-05-2026"]
name = "Command A+"
provider = "cohere"
logical_model = "command-a-plus-05-2026"
equivalence_group = "command-a-plus-05-2026"
api_dialect = "openai_chat_compat"
tier = "frontier"
open_weight = true
context_window = 256_000
capabilities = ["tools", "streaming", "thinking", "vision"]
strengths = ["agentic", "tool_use", "reasoning", "multilingual", "vision"]
pricing = { input_per_mtok = 2.50, output_per_mtok = 10.00 }
# `tier` below names the account tier the limits apply to, not the model tier above.
[models."command-a-plus-05-2026".rate_limits]
rpm = 20
tier = "trial"
source_url = "https://docs.cohere.com/docs/rate-limits"
last_verified = "2026-06-05"
notes = "Command A+ trial keys are 20 RPM and 1,000 calls/month; production is sales-gated. Token pricing is the public Command A+ API tariff."
# No license or active-parameter figure is recorded for this entry.
[models."command-a-plus-05-2026".architecture]
parameter_count_b = 111.0
moe = true
source_url = "https://docs.cohere.com/docs/command-a-plus"
last_verified = "2026-06-05"
# Grok Build 0.1 on the `xai` provider; closed weights.
[models."grok-build-0.1"]
name = "Grok Build 0.1"
provider = "xai"
context_window = 256000
# The logical model and equivalence group are recorded as "grok-code-fast-1",
# matching the docs page cited in rate_limits.source_url.
# NOTE(review): presumably this id is a rename/alias of grok-code-fast-1 — confirm.
logical_model = "grok-code-fast-1"
equivalence_group = "grok-code-fast-1"
api_dialect = "openai_chat_compat"
capabilities = ["tools", "streaming", "thinking", "vision", "prompt_caching"]
pricing = { input_per_mtok = 1.00, output_per_mtok = 2.00, cache_read_per_mtok = 0.20 }
# rate_limits.tier here holds a region-style label ("us-east-1"), unlike the
# account-tier labels ("free", "developer", "trial") on other entries.
rate_limits = { rpm = 1800, tpm = 10000000, tier = "us-east-1", source_url = "https://docs.x.ai/developers/models/grok-code-fast-1", last_verified = "2026-06-05" }
tier = "frontier"
open_weight = false
strengths = ["coding", "agentic", "tool_use", "reasoning", "vision"]