# harn-vm 0.8.113

# ── Providers ────────────────────────────────────────────────────────────────
# Each [providers.X] block defines an LLM endpoint Harn can dial. The
# `auth_env` field can be a single string or an array (tried in order).
# `cost_per_1k_in/out` are coarse provider-level fallbacks used when a
# specific [models.X] entry has no `pricing` table.

# Anthropic — the key from ANTHROPIC_API_KEY is configured to travel in the
# `x-api-key` header (auth_style = "header") instead of the bearer scheme
# most other providers in this file use. Only a chat route is declared;
# there is no `completion_endpoint`.
[providers.anthropic]
base_url = "https://api.anthropic.com/v1"
auth_style = "header"
auth_header = "x-api-key"
auth_env = "ANTHROPIC_API_KEY"
chat_endpoint = "/messages"
features = ["prompt_caching", "thinking"]
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.003
cost_per_1k_out = 0.015
latency_p50_ms = 2500
# Static extra header pinning the `anthropic-version` value.
extra_headers = { "anthropic-version" = "2023-06-01" }

# Probe: POST a minimal one-message JSON body to the count_tokens route.
# NOTE(review): the body hard-codes a model id — confirm it is kept in sync
# with the model catalog so the probe does not fail for an unrelated reason.
[providers.anthropic.healthcheck]
method = "POST"
path = "/messages/count_tokens"
body = '{"model":"claude-sonnet-4-6","messages":[{"role":"user","content":"x"}]}'

# OpenAI — bearer auth with the key read from OPENAI_API_KEY. Declares both
# a chat route and a completion route. No `base_url_env` override is defined
# for this provider.
[providers.openai]
base_url = "https://api.openai.com/v1"
auth_style = "bearer"
auth_env = "OPENAI_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1800

# Probe: GET on `/models` (presumably resolved against base_url — confirm).
[providers.openai.healthcheck]
method = "GET"
path = "/models"

# OpenRouter — bearer auth with the key read from OPENROUTER_API_KEY; same
# chat/completion route names as [providers.openai]. No `base_url_env`
# override is defined for this provider.
[providers.openrouter]
base_url = "https://openrouter.ai/api/v1"
auth_style = "bearer"
auth_env = "OPENROUTER_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.003
cost_per_1k_out = 0.015
latency_p50_ms = 2200

# Probe: GET on `/auth/key` rather than the `/models` route most other
# providers in this file use.
[providers.openrouter.healthcheck]
method = "GET"
path = "/auth/key"

# Hugging Face router — bearer auth. `auth_env` is an array, so HF_TOKEN is
# tried first and HUGGINGFACE_API_KEY second (array order, per the section
# header comment).
[providers.huggingface]
base_url = "https://router.huggingface.co/v1"
auth_style = "bearer"
auth_env = ["HF_TOKEN", "HUGGINGFACE_API_KEY"]
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 2400

# Probe uses an absolute `url` (a different host from base_url) instead of
# the relative `path` key the other healthchecks use.
[providers.huggingface.healthcheck]
method = "GET"
url = "https://huggingface.co/api/whoami-v2"

# Ollama defaults to /api/chat (native NDJSON) so the test stubs keep
# working; hosts can flip to /v1/chat/completions via a providers.toml
# overlay to bypass Ollama's per-model tool-call post-processors
# (qwen3coder.go, qwen35.go) that raise HTTP 500s on text-mode responses
# for the Qwen3.5 family.
#
# No credentials are configured (auth_style = "none", no `auth_env`).
# NOTE(review): OLLAMA_HOST is commonly set without a scheme (for example
# `0.0.0.0:11434`) for the Ollama server itself — confirm the loader
# normalizes such values before treating them as a base URL.
[providers.ollama]
base_url = "http://localhost:11434"
base_url_env = "OLLAMA_HOST"
auth_style = "none"
chat_endpoint = "/api/chat"
completion_endpoint = "/api/generate"
# Zero cost fallback for this local default.
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 1200

# Probe: GET on `/api/tags`.
[providers.ollama.healthcheck]
method = "GET"
path = "/api/tags"

# Google Gemini — header-style auth: the key is configured for the
# `x-goog-api-key` header, read from GEMINI_API_KEY first and GOOGLE_API_KEY
# second. The base URL carries no version segment; the `/v1beta` prefix
# lives in the endpoint paths instead.
# NOTE(review): chat_endpoint is the bare models collection — presumably the
# client appends the model id and action at request time; confirm.
[providers.gemini]
base_url = "https://generativelanguage.googleapis.com"
base_url_env = "GEMINI_BASE_URL"
auth_style = "header"
auth_header = "x-goog-api-key"
auth_env = ["GEMINI_API_KEY", "GOOGLE_API_KEY"]
chat_endpoint = "/v1beta/models"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.00125
cost_per_1k_out = 0.005
latency_p50_ms = 1800

# Probe: GET on the same `/v1beta/models` path used as chat_endpoint.
[providers.gemini.healthcheck]
method = "GET"
path = "/v1beta/models"

# Mistral — bearer auth from MISTRAL_API_KEY; base URL overridable through
# MISTRAL_BASE_URL. Uses the same chat/completion route names as
# [providers.openai] and advertises the `native_tools` feature.
[providers.mistral]
base_url = "https://api.mistral.ai/v1"
base_url_env = "MISTRAL_BASE_URL"
auth_style = "bearer"
auth_env = "MISTRAL_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0005
cost_per_1k_out = 0.0015
latency_p50_ms = 1800
features = ["native_tools"]

# Probe: GET on `/models`.
[providers.mistral.healthcheck]
method = "GET"
path = "/models"

# Cohere — dialed through the `/compatibility/v1` base path with bearer auth
# from COHERE_API_KEY, using the same chat/completion route names as
# [providers.openai]. Base URL overridable through COHERE_BASE_URL.
[providers.cohere]
base_url = "https://api.cohere.ai/compatibility/v1"
base_url_env = "COHERE_BASE_URL"
auth_style = "bearer"
auth_env = "COHERE_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1900
features = ["native_tools", "reasoning"]

# Probe: GET on `/models`.
[providers.cohere.healthcheck]
method = "GET"
path = "/models"

# xAI — bearer auth from XAI_API_KEY; base URL overridable through
# XAI_BASE_URL. The only provider in this section that lists the
# `responses_api` feature, alongside `native_tools` and `reasoning`.
[providers.xai]
base_url = "https://api.x.ai/v1"
base_url_env = "XAI_BASE_URL"
auth_style = "bearer"
auth_env = "XAI_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.001
cost_per_1k_out = 0.002
latency_p50_ms = 1600
features = ["responses_api", "native_tools", "reasoning"]

# Probe: GET on `/models`.
[providers.xai.healthcheck]
method = "GET"
path = "/models"

# Together AI — OpenAI-compatible host with bearer auth; base URL overridable
# through TOGETHER_AI_BASE_URL.
#
# `auth_env` is an array so that both the TOGETHER_AI_API_KEY name this file
# has always used and the TOGETHER_API_KEY name used by Together's own SDK
# and documentation are accepted. The existing name is listed first, so any
# setup that already exports it resolves exactly as before.
[providers.together]
base_url = "https://api.together.xyz/v1"
base_url_env = "TOGETHER_AI_BASE_URL"
auth_style = "bearer"
auth_env = ["TOGETHER_AI_API_KEY", "TOGETHER_API_KEY"]
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1600

# Probe: GET on `/models`.
[providers.together.healthcheck]
method = "GET"
path = "/models"

# Groq — OpenAI-compatible LPU-hosted fast inference. Headline ~840 tok/s
# on Llama 3.1 8B, ~594 tok/s on Llama 4 Scout. Useful executor target
# when sub-100ms TTFT matters more than raw quality.
#
# The base URL carries the `/openai/v1` prefix, so the endpoint paths stay
# the same as for [providers.openai]. Bearer auth from GROQ_API_KEY.
[providers.groq]
base_url = "https://api.groq.com/openai/v1"
base_url_env = "GROQ_BASE_URL"
auth_style = "bearer"
auth_env = "GROQ_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0001
cost_per_1k_out = 0.0003
latency_p50_ms = 450

# Probe: GET on `/models`.
[providers.groq.healthcheck]
method = "GET"
path = "/models"

# Cerebras — OpenAI-compatible wafer-scale inference. High token throughput
# makes it a strong fit for latency-budgeted binder workloads; end-to-end
# p50 still includes client/provider round-trip time, so callers should keep
# their own wall-clock budget. Provider-level pricing here is a coarse
# default; per-model rows under [models.X] hold the authoritative numbers
# from Cerebras's public model discovery endpoint.
#
# Bearer auth from CEREBRAS_API_KEY; base URL overridable through
# CEREBRAS_BASE_URL. Carries the lowest `latency_p50_ms` in this section.
[providers.cerebras]
base_url = "https://api.cerebras.ai/v1"
base_url_env = "CEREBRAS_BASE_URL"
auth_style = "bearer"
auth_env = "CEREBRAS_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
cost_per_1k_in = 0.00035
cost_per_1k_out = 0.00075
latency_p50_ms = 150
features = ["native_tools"]

# Probe: GET on `/models`.
[providers.cerebras.healthcheck]
method = "GET"
path = "/models"

# DeepSeek — bearer auth from DEEPSEEK_API_KEY; base URL overridable through
# DEEPSEEK_BASE_URL. Same chat/completion route names as [providers.openai].
# No `features` list is declared.
[providers.deepseek]
base_url = "https://api.deepseek.com/v1"
base_url_env = "DEEPSEEK_BASE_URL"
auth_style = "bearer"
auth_env = "DEEPSEEK_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.00014
cost_per_1k_out = 0.00028
latency_p50_ms = 1800

# Probe: GET on `/models`.
[providers.deepseek.healthcheck]
method = "GET"
path = "/models"

# Fireworks — bearer auth from FIREWORKS_API_KEY; base URL overridable
# through FIREWORKS_BASE_URL. The base URL carries the `/inference/v1`
# prefix, so the endpoint paths match those of [providers.openai].
[providers.fireworks]
base_url = "https://api.fireworks.ai/inference/v1"
base_url_env = "FIREWORKS_BASE_URL"
auth_style = "bearer"
auth_env = "FIREWORKS_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1400

# Probe: GET on `/models`.
[providers.fireworks.healthcheck]
method = "GET"
path = "/models"

# DashScope — defaults to the `dashscope-intl` host and its
# `/compatible-mode/v1` base path; set DASHSCOPE_BASE_URL to point at a
# different host. Bearer auth from DASHSCOPE_API_KEY.
[providers.dashscope]
base_url = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
base_url_env = "DASHSCOPE_BASE_URL"
auth_style = "bearer"
auth_env = "DASHSCOPE_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0003
cost_per_1k_out = 0.0012
latency_p50_ms = 1600

# Probe: GET on `/models`.
[providers.dashscope.healthcheck]
method = "GET"
path = "/models"

# MiniMax — OpenAI-compatible endpoint for MiniMax M2/M3 models. The direct
# API serves MiniMax-M3 plus the open-weight M2 family on the same
# /v1/chat/completions URL; the wire format mirrors OpenAI chat completions
# with native tool calls and model-specific thinking controls.
#
# Bearer auth from MINIMAX_API_KEY; base URL overridable through
# MINIMAX_BASE_URL. No `features` list is declared on this table even though
# the description above mentions native tool calls.
[providers.minimax]
base_url = "https://api.minimax.io/v1"
base_url_env = "MINIMAX_BASE_URL"
auth_style = "bearer"
auth_env = "MINIMAX_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0006
cost_per_1k_out = 0.0024
latency_p50_ms = 1700

# Probe: GET on `/models`.
[providers.minimax.healthcheck]
method = "GET"
path = "/models"

# Z.AI — host of the GLM family (GLM-4.6, GLM-4.7). Z.AI publishes both a
# native PaaS endpoint (`/api/paas/v4`) and an OpenAI-compatible endpoint
# (`/v1`); Harn dials the OpenAI-compatible one so the existing
# openai_chat_completions wire format Just Works. The provider also
# accepts the legacy `ZHIPU_API_KEY` env var; callers can rely on the
# `auth_env` array trying each in order.
#
# Base URL overridable through ZAI_BASE_URL.
[providers.zai]
base_url = "https://api.z.ai/v1"
base_url_env = "ZAI_BASE_URL"
auth_style = "bearer"
# ZAI_API_KEY is tried first; ZHIPU_API_KEY is the legacy fallback.
auth_env = ["ZAI_API_KEY", "ZHIPU_API_KEY"]
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0004
cost_per_1k_out = 0.0017
latency_p50_ms = 1900

# Probe: GET on `/models`.
[providers.zai.healthcheck]
method = "GET"
path = "/models"

# Moonshot AI — first-party host of the Kimi K2 family. OpenAI-compatible
# `/v1/chat/completions`; the same models were previously only reachable
# through OpenRouter/Together aggregator routes. Moonshot also exposes an
# Anthropic-compatible `/anthropic` surface, but Harn dials the
# OpenAI-compatible endpoint so the shared openai_chat_completions wire
# format Just Works. Use the `.cn` base via MOONSHOT_BASE_URL for the
# China region.
[providers.moonshot]
base_url = "https://api.moonshot.ai/v1"
base_url_env = "MOONSHOT_BASE_URL"
auth_style = "bearer"
# MOONSHOT_API_KEY is tried first, then KIMI_API_KEY (array order).
auth_env = ["MOONSHOT_API_KEY", "KIMI_API_KEY"]
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0006
cost_per_1k_out = 0.0025
latency_p50_ms = 1900
features = ["native_tools", "reasoning", "prompt_caching"]

# Probe: GET on `/models`.
[providers.moonshot.healthcheck]
method = "GET"
path = "/models"

# DeepInfra — OpenAI-compatible host for open-weight models (DeepSeek,
# Qwen, Llama, Kimi, GPT-OSS, Gemma). The compat surface lives under the
# `/v1/openai` path prefix, so the base_url carries it and chat_endpoint
# stays the standard `/chat/completions`. Catalog rows are keyed with a
# `deepinfra/<hf-id>` prefix and carry `wire_model` so they stay
# collision-free with the bare/openrouter ids for the same weights.
[providers.deepinfra]
base_url = "https://api.deepinfra.com/v1/openai"
base_url_env = "DEEPINFRA_BASE_URL"
auth_style = "bearer"
# DEEPINFRA_API_KEY is tried first, then DEEPINFRA_TOKEN (array order).
auth_env = ["DEEPINFRA_API_KEY", "DEEPINFRA_TOKEN"]
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0002
cost_per_1k_out = 0.0006
latency_p50_ms = 1500
features = ["native_tools"]

# Probe: GET on `/models`.
[providers.deepinfra.healthcheck]
method = "GET"
path = "/models"

# SambaNova Cloud — OpenAI-compatible RDU-hosted inference with very high
# token throughput on large open-weight models (DeepSeek, Llama 4, Qwen).
# A latency-budgeted executor target alongside Groq/Cerebras with a
# different model mix. Catalog rows use a `sambanova/<wire-id>` prefix +
# `wire_model` so they don't collide with other hosts of the same weights.
#
# Bearer auth from SAMBANOVA_API_KEY; base URL overridable through
# SAMBANOVA_BASE_URL.
[providers.sambanova]
base_url = "https://api.sambanova.ai/v1"
base_url_env = "SAMBANOVA_BASE_URL"
auth_style = "bearer"
auth_env = "SAMBANOVA_API_KEY"
chat_endpoint = "/chat/completions"
completion_endpoint = "/completions"
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0006
cost_per_1k_out = 0.0012
latency_p50_ms = 350
features = ["native_tools"]

# Probe: GET on `/models`.
[providers.sambanova.healthcheck]
method = "GET"
path = "/models"

# AWS Bedrock — resolves credentials through env, profile, container, or
# EC2 instance roles, then signs Converse API calls with SigV4.
#
# This table declares no `auth_env`, no `cost_per_1k_*` fallback and no
# healthcheck sub-table. `chat_endpoint` contains a `{model}` placeholder.
# NOTE(review): `base_url` is empty — presumably the endpoint is derived at
# runtime unless BEDROCK_BASE_URL is set; confirm against the loader.
[providers.bedrock]
base_url = ""
base_url_env = "BEDROCK_BASE_URL"
auth_style = "aws_sigv4"
chat_endpoint = "/model/{model}/converse"
features = ["native_tools"]
latency_p50_ms = 2600

# Azure OpenAI — deployment name is routed in the URL; callers can
# either pass the deployment as the Harn model field or set
# AZURE_OPENAI_DEPLOYMENT.
#
# `base_url` and `chat_endpoint` are templates with `{resource}`,
# `{deployment}` and `{api_version}` placeholders. No healthcheck sub-table
# is declared for this provider.
[providers.azure_openai]
base_url = "https://{resource}.openai.azure.com"
base_url_env = "AZURE_OPENAI_ENDPOINT"
auth_style = "azure_openai"
# Tried in array order: API key first, then the two token variables.
auth_env = ["AZURE_OPENAI_API_KEY", "AZURE_OPENAI_AD_TOKEN", "AZURE_OPENAI_BEARER_TOKEN"]
chat_endpoint = "/openai/deployments/{deployment}/chat/completions?api-version={api_version}"
features = ["native_tools"]
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.0025
cost_per_1k_out = 0.010
latency_p50_ms = 1900

# Google Vertex AI — bearer auth against the aiplatform host; base URL
# overridable through VERTEX_AI_BASE_URL. `chat_endpoint` is a template with
# `{project}`, `{location}` and `{model}` placeholders. No healthcheck
# sub-table is declared for this provider.
[providers.vertex]
base_url = "https://aiplatform.googleapis.com/v1"
base_url_env = "VERTEX_AI_BASE_URL"
auth_style = "bearer"
# Tried in array order.
# NOTE(review): GOOGLE_APPLICATION_CREDENTIALS conventionally holds a path to
# a credentials file, not a token — confirm the loader exchanges it for an
# access token instead of sending the value verbatim as the bearer token.
auth_env = ["VERTEX_AI_ACCESS_TOKEN", "GOOGLE_OAUTH_ACCESS_TOKEN", "GOOGLE_APPLICATION_CREDENTIALS"]
chat_endpoint = "/projects/{project}/locations/{location}/publishers/google/models/{model}:generateContent"
features = ["native_tools"]
# Coarse provider-level pricing fallback (see the section header comment).
cost_per_1k_in = 0.00125
cost_per_1k_out = 0.005
latency_p50_ms = 2100

# Generic local server — no auth, defaulting to localhost:8000 with the base
# URL overridable through LOCAL_LLM_BASE_URL. The `/v1` prefix lives in the
# endpoint paths rather than in base_url. This default host/port is the same
# as the one declared for [providers.vllm].
[providers.local]
base_url = "http://localhost:8000"
base_url_env = "LOCAL_LLM_BASE_URL"
auth_style = "none"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
# Zero cost fallback for a locally hosted endpoint.
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900

# Probe: GET on `/v1/models`.
[providers.local.healthcheck]
method = "GET"
path = "/v1/models"

# llama.cpp — separate from `local` so capability rules can isolate Qwen
# chat-template thinking quirks from other local OpenAI-compatible hosts.
#
# No auth; defaults to 127.0.0.1:8001, overridable through LLAMACPP_BASE_URL.
[providers.llamacpp]
base_url = "http://127.0.0.1:8001"
base_url_env = "LLAMACPP_BASE_URL"
auth_style = "none"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
# Zero cost fallback for a locally hosted endpoint.
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900

# Probe: GET on `/v1/models`.
[providers.llamacpp.healthcheck]
method = "GET"
path = "/v1/models"

# Apple Silicon MLX. Harn owns readiness probing; hosts that want
# script-based auto-start should launch the process first, then call
# Harn again to verify readiness.
#
# No auth; defaults to 127.0.0.1:8002, overridable through MLX_BASE_URL.
[providers.mlx]
base_url = "http://127.0.0.1:8002"
base_url_env = "MLX_BASE_URL"
auth_style = "none"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
# Zero cost fallback for a locally hosted endpoint.
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 900

# Probe: GET on `/v1/models`.
[providers.mlx.healthcheck]
method = "GET"
path = "/v1/models"

# vLLM — no auth; base URL overridable through VLLM_BASE_URL. The default
# host/port (localhost:8000) is identical to the one declared for
# [providers.local], so set one of the two override variables when both
# servers run on the same machine.
[providers.vllm]
base_url = "http://localhost:8000"
base_url_env = "VLLM_BASE_URL"
auth_style = "none"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
# Zero cost fallback for a locally hosted endpoint.
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 800

# Probe: GET on `/v1/models`.
[providers.vllm.healthcheck]
method = "GET"
path = "/v1/models"

# TGI — no auth; defaults to localhost:8080, overridable through
# TGI_BASE_URL. Uses the same `/v1/...` chat/completion routes as the other
# local providers in this file.
[providers.tgi]
base_url = "http://localhost:8080"
base_url_env = "TGI_BASE_URL"
auth_style = "none"
chat_endpoint = "/v1/chat/completions"
completion_endpoint = "/v1/completions"
# Zero cost fallback for a locally hosted endpoint.
cost_per_1k_in = 0.0
cost_per_1k_out = 0.0
latency_p50_ms = 950

# Probe: GET on `/health` instead of the `/v1/models` route the other local
# providers use.
[providers.tgi.healthcheck]
method = "GET"
path = "/health"