# harn-vm 0.8.113

# Cerebras-hosted open-weight models. Serverless rows mirror
# https://api.cerebras.ai/public/v1/models; dedicated-endpoint families are
# intentionally not added as one-click routes unless Cerebras exposes a stable
# public wire ID for the standard endpoint. The headline binder-substrate
# candidate is gpt-oss-120b at very high token throughput; GLM 4.7 is the
# public preview coding/agentic route.
#
# Catalog keys are bare wire IDs (Cerebras's /v1/chat/completions wants
# the raw model name). Users routing via `model: "cerebras/<name>"` get
# the slash-prefixed selector stripped by `normalize_model_id` while
# `infer_provider` routes them to this provider.
tier = "mid"
open_weight = true
strengths = ["cheap", "tool_use"]
# GPT-OSS 120B on Cerebras. The file header names this row the headline
# candidate because of its very high token throughput.
[models."gpt-oss-120b"]
name = "GPT-OSS 120B (Cerebras)"
provider = "cerebras"
tier = "frontier"
open_weight = true
strengths = ["speed", "cheap", "tool_use"]
capabilities = ["tools", "streaming"]
context_window = 131_072
api_dialect = "openai_chat"
logical_model = "openai-gpt-oss-120b"
equivalence_group = "openai-gpt-oss-120b"
served_variant = "cerebras-wafer-scale"
pricing.input_per_mtok = 0.35
pricing.output_per_mtok = 0.75

# Limits recorded for the "developer" tier; `notes` records how they were
# confirmed and which environment variables override them per deployment.
[models."gpt-oss-120b".rate_limits]
tier = "developer"
rpm = 250
tpm = 250_000
# NOTE(review): tph and tpd carry the same value — confirm against source_url.
tph = 1_000_000
tpd = 1_000_000
source_url = "https://inference-docs.cerebras.ai/support/rate-limits"
last_verified = "2026-06-12"
# Line-ending backslashes fold this into the original single-line string.
notes = """\
Developer (Pay as You Go) tier confirmed live from \
x-ratelimit-limit-requests/tokens response headers 2026-06-12 \
(250 RPM / 250K TPM). The prior Free Trial row (5 RPM / 30K TPM) \
needlessly throttled the proactive sliding-window limiter to a crawl. \
Override per-deployment with HARN_RATE_LIMIT_CEREBRAS_RPM / \
HARN_RATE_LIMIT_CEREBRAS_TPM.\
"""

# Architecture facts together with the page they were taken from.
[models."gpt-oss-120b".architecture]
parameter_count_b = 117.0
active_parameter_count_b = 5.1
moe = true
license = "Apache-2.0"
source_url = "https://developers.openai.com/api/docs/models/gpt-oss-120b"
last_verified = "2026-06-05"
# Z.ai GLM 4.7 on Cerebras — described in the file header as the public
# preview coding/agentic route. Lists "thinking" alongside tools/streaming.
[models."zai-glm-4.7"]
name = "Z.ai GLM 4.7 (Cerebras)"
provider = "cerebras"
tier = "frontier"
open_weight = true
strengths = [
    "speed",
    "coding",
    "agentic",
    "tool_use",
    "reasoning",
]
capabilities = ["tools", "streaming", "thinking"]
context_window = 131_072
pricing.input_per_mtok = 2.25
pricing.output_per_mtok = 2.75
# Llama 3.3 70B is kept only as a deprecated row with "dedicated"
# availability; deprecation_note states why and what to use instead.
[models."llama-3.3-70b"]
name = "Llama 3.3 70B (Cerebras, dedicated legacy)"
provider = "cerebras"
availability = "dedicated"
deprecated = true
# Line-ending backslashes fold this into the original single-line string.
deprecation_note = """\
Cerebras no longer returns this model from public discovery; \
use a provisioned dedicated endpoint alias if your organization \
still serves these weights.\
"""
capabilities = ["tools", "streaming"]
context_window = 131_072
pricing.input_per_mtok = 0.85
pricing.output_per_mtok = 1.20