# tt-shared 0.1.2

# TokenTrimmer pricing catalog — versioned, release-decoupled rate data.
#
# This file is the single source of truth for per-model token rates. It is
# embedded at build time via `include_str!` (see `pricing.rs`) and parsed once
# into the `PricingCatalog`. Editing rates means editing DATA, not Rust source
# — so a rate refresh no longer requires touching a provider crate.
#
# Schema — one `[[entry]]` per (provider, model, effective_at):
#   provider                  = registry provider id ("openai", "anthropic", …)
#   model                     = exact model id the provider matches on
#   input_per_million         = USD per 1M input tokens
#   output_per_million        = USD per 1M output tokens
#   cached_input_per_million  = USD per 1M cached-READ input tokens (omit if N/A)
#   cache_write_per_million   = USD per 1M cache-WRITE input tokens (omit if N/A)
#   batch_input_per_million   = USD per 1M batch (async) input tokens (omit if N/A)
#   batch_output_per_million  = USD per 1M batch (async) output tokens (omit if N/A)
#   flex_input_per_million    = USD per 1M Flex-tier input tokens (omit if the
#                               model is not flex-eligible; presence = eligibility)
#   flex_output_per_million   = USD per 1M Flex-tier output tokens (same rule)
#   prompt_cache_min_tokens   = min prompt-prefix tokens before prompt caching
#                               applies (omit if N/A)
#   effective_at              = RFC3339 UTC instant the rate took effect,
#                               written as a quoted string
#
# Price HISTORY: multiple entries for the same (provider, model) form a
# time-series. Live pricing uses the most recent `effective_at`; historical
# replay (`PricingCatalog::at`) selects the most recent entry with
# `effective_at <= request_ts`. When a rate changes, APPEND a new entry with
# the new `effective_at` — never edit a historical row, or replay drifts.
#
# Local / self-hosted models are always free and intentionally absent here
# (the local provider returns a zero rate directly).
#
# Snapshot: May 2026 baseline. effective_at = 2026-05-01T00:00:00Z for every
# row in this initial import (the date this catalog was first externalized);
# subsequent refreshes append rows with their real effective dates.
#
# Flagship verification — 2026-05-31:
#   OpenAI gpt-5.5 ($5/$30/$0.50) + gpt-5.4 ($2.50/$15/$0.25) confirmed vs
#   developers.openai.com/api/docs/pricing; added current gpt-5.5-pro,
#   gpt-5.4-mini, gpt-5.4-pro. gpt-4o/o3/o4-mini/text-embedding-3-* are off
#   OpenAI's current pricing page (legacy rows retained for replay).
#   Anthropic claude-haiku-4-5 ($1/$5), claude-sonnet-4-6 ($3/$15) confirmed
#   vs platform.claude.com/docs/.../pricing; added current claude-opus-4-8.
#   Gemini gemini-3.5-flash ($1.50/$9/$0.15), gemini-3.1-pro ($2/$12/$0.20),
#   gemini-3.1-flash-lite ($0.25/$1.50/$0.025) confirmed vs
#   ai.google.dev/gemini-api/docs/pricing — all exact matches.
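#   Groq, Mistral, Together, OpenRouter: only partially re-checked this pass.
#   Rows in those sections that carry their own dated 2026-05-31 note
#   ("Updated", "DEPRECATED", "UNVERIFIED") were looked at; every other row
#   there is carried from the May snapshot — verify in the next refresh.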

# ── OpenAI ──────────────────────────────────────────────────────────────────
# batch_{input,output}_per_million is OpenAI's Batch-API rate — a flat 50% of
# standard. prompt_cache_min_tokens = 1024: OpenAI auto-caches prompts ≥1024
# tokens (developers.openai.com/api/docs/prompt-caching).
#
# flex_{input,output}_per_million is OpenAI's Flex service tier
# (`service_tier: "flex"`) — a synchronous-but-slower tier billed at Batch-API
# rates (so flex == batch here). PRESENCE = flex eligibility: OpenAI lists Flex
# prices only for supported models (the gpt-5.x family); o3 / o4-mini are
# batch-only "specialized models" and intentionally carry NO flex rate, so a
# route opting them into flex is gated off + warned. Verified 2026-06 vs
# developers.openai.com/api/docs/guides/flex-processing + .../pricing.
#
# NOTE(review): of the OpenAI rows visible in this file, only gpt-5.5 and
# gpt-5.4 carry batch_* / flex_* / prompt_cache_min_tokens. The gpt-5.x
# pro/mini rows and the "batch-only" o3 / o4-mini rows carry none of them —
# confirm each omission is deliberate rather than a row not yet backfilled.
#
# gpt-5.5 — cached read is 10% of input (0.50 of 5.00); batch and flex are both
# exactly half of the standard input/output rate.
[[entry]]
provider = "openai"
model = "gpt-5.5"
input_per_million = 5.00
output_per_million = 30.00
cached_input_per_million = 0.50
batch_input_per_million = 2.50
batch_output_per_million = 15.00
flex_input_per_million = 2.50
flex_output_per_million = 15.00
prompt_cache_min_tokens = 1024
effective_at = "2026-05-01T00:00:00Z"

# gpt-5.4 — every rate is half of the matching gpt-5.5 rate; the same
# 10% cached-read and 50% batch/flex relationships hold.
[[entry]]
provider = "openai"
model = "gpt-5.4"
input_per_million = 2.50
output_per_million = 15.00
cached_input_per_million = 0.25
batch_input_per_million = 1.25
batch_output_per_million = 7.50
flex_input_per_million = 1.25
flex_output_per_million = 7.50
prompt_cache_min_tokens = 1024
effective_at = "2026-05-01T00:00:00Z"

# Added 2026-05-31 — verified vs developers.openai.com/api/docs/pricing.
# gpt-5.x variants now on OpenAI's current pricing page (pro variants do not
# list a cached-input rate, so cached_input_per_million is omitted on them).
#
# effective_at on these three rows is the date they were added (2026-05-31),
# not the 2026-05-01 baseline. Historical replay selects rows with
# `effective_at <= request_ts`, so requests before that instant have no
# matching row for these models.
# NOTE(review): confirm how a replay miss is handled for these models.
[[entry]]
provider = "openai"
model = "gpt-5.5-pro"
input_per_million = 30.00
output_per_million = 180.00
effective_at = "2026-05-31T00:00:00Z"

# Cached read is 10% of input (0.075 of 0.75). No batch_* / flex_* /
# prompt_cache_min_tokens on this row.
[[entry]]
provider = "openai"
model = "gpt-5.4-mini"
input_per_million = 0.75
output_per_million = 4.50
cached_input_per_million = 0.075
effective_at = "2026-05-31T00:00:00Z"

# NOTE(review): identical to the gpt-5.5-pro rates (30/180), although base
# gpt-5.4 is priced at half of base gpt-5.5 — confirm this is not a copy-paste.
[[entry]]
provider = "openai"
model = "gpt-5.4-pro"
input_per_million = 30.00
output_per_million = 180.00
effective_at = "2026-05-31T00:00:00Z"

# Legacy: gpt-4o / gpt-4o-mini / o3 / o4-mini / text-embedding-3-* are no longer
# on OpenAI's current pricing page (2026-05-31). Rows retained for historical
# replay (PricingCatalog::at) — do NOT delete; their effective_at predates any
# delisting so past requests still price correctly.
# None of the rows in this group carry batch_* / flex_* /
# prompt_cache_min_tokens.
[[entry]]
provider = "openai"
model = "gpt-4o"
input_per_million = 2.50
output_per_million = 10.00
cached_input_per_million = 1.25
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "openai"
model = "gpt-4o-mini"
input_per_million = 0.15
output_per_million = 0.60
cached_input_per_million = 0.075
effective_at = "2026-05-01T00:00:00Z"

# May-snapshot o3 rate. Superseded by the 2026-05-31 row that follows; kept
# unedited so replay of requests in [2026-05-01, 2026-05-31) is unchanged.
[[entry]]
provider = "openai"
model = "o3"
input_per_million = 60.00
output_per_million = 240.00
cached_input_per_million = 15.00
effective_at = "2026-05-01T00:00:00Z"

# Updated 2026-05-31 — o3's list price was cut sharply since the May snapshot.
# New rate: $2/$8, cache-read $0.50. The figure is OpenRouter's list price for
# o3, not a price read from OpenAI's own pricing page (o3 is no longer listed
# there as of 2026-05-31). Appended per the versioning rule: the $60/$240 row
# above is retained for historical replay, and this row, having the most
# recent effective_at, is the one live pricing uses.
[[entry]]
provider = "openai"
model = "o3"
input_per_million = 2.00
output_per_million = 8.00
cached_input_per_million = 0.50
effective_at = "2026-05-31T00:00:00Z"

# Cached read is 25% of input (0.275 of 1.10).
[[entry]]
provider = "openai"
model = "o4-mini"
input_per_million = 1.10
output_per_million = 4.40
cached_input_per_million = 0.275
effective_at = "2026-05-01T00:00:00Z"

# Embedding models — no completion tokens, so output rate is 0.
# output_per_million is written as an explicit 0.00 rather than omitted.
[[entry]]
provider = "openai"
model = "text-embedding-3-small"
input_per_million = 0.02
output_per_million = 0.00
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "openai"
model = "text-embedding-3-large"
input_per_million = 0.13
output_per_million = 0.00
effective_at = "2026-05-01T00:00:00Z"

# ── Anthropic ───────────────────────────────────────────────────────────────
# cached_input_per_million is the cache-READ rate (~10% of standard input).
# cache_write_per_million is the cache-WRITE (creation) rate (~1.25× standard
# input). Anthropic charges a write premium for tokens written to the prompt
# cache; this is distinct from the lower read rate on cache hits.
# batch_{input,output}_per_million are the async Batches-API rates — a flat 50%
# of the standard input/output rate (platform.claude.com/docs/.../batch-processing).
# prompt_cache_min_tokens is the minimum prefix length before a cache_control
# breakpoint actually caches: 4096 tokens on Opus 4.x / Haiku 4.5, 2048 on
# Sonnet 4.6 (platform.claude.com/docs/.../prompt-caching).
#
# Every row below applies those multipliers exactly: cached = 0.1× input,
# cache write = 1.25× input, batch = 0.5× input/output.
# NOTE(review): the schema carries a single cache-write rate per row. If
# Anthropic bills different write multipliers by cache TTL, this is presumably
# the default/short-TTL rate — confirm against the prompt-caching docs.
[[entry]]
provider = "anthropic"
model = "claude-haiku-4-5"
input_per_million = 1.00
output_per_million = 5.00
cached_input_per_million = 0.10
cache_write_per_million = 1.25
batch_input_per_million = 0.50
batch_output_per_million = 2.50
prompt_cache_min_tokens = 4096
effective_at = "2026-05-01T00:00:00Z"

# Sonnet 4.6 is the only row here with the lower 2048-token cache minimum.
[[entry]]
provider = "anthropic"
model = "claude-sonnet-4-6"
input_per_million = 3.00
output_per_million = 15.00
cached_input_per_million = 0.30
cache_write_per_million = 3.75
batch_input_per_million = 1.50
batch_output_per_million = 7.50
prompt_cache_min_tokens = 2048
effective_at = "2026-05-01T00:00:00Z"

# May-snapshot row. claude-opus-4-7 is not named in the 2026-05-31 flagship
# verification note at the top of this file, so treat this rate as carried
# over rather than re-confirmed.
[[entry]]
provider = "anthropic"
model = "claude-opus-4-7"
input_per_million = 5.00
output_per_million = 25.00
cached_input_per_million = 0.50
cache_write_per_million = 6.25
batch_input_per_million = 2.50
batch_output_per_million = 12.50
prompt_cache_min_tokens = 4096
effective_at = "2026-05-01T00:00:00Z"

# Added 2026-05-31 — Claude Opus 4.8 is now Anthropic's current Opus flagship
# (anthropic.com/claude/opus), $5/$25. Cache read/write follow Anthropic's
# documented 0.1× / 1.25× input formula (cached 0.50, write 6.25); batch is the
# flat 50% (2.50/12.50); 4096-token cache minimum like the other Opus models.
# Only the $5/$25 headline is cited to a source; the cache and batch figures
# are derived from the formulas. All rates equal the claude-opus-4-7 row; only
# the model id and effective_at differ.
[[entry]]
provider = "anthropic"
model = "claude-opus-4-8"
input_per_million = 5.00
output_per_million = 25.00
cached_input_per_million = 0.50
cache_write_per_million = 6.25
batch_input_per_million = 2.50
batch_output_per_million = 12.50
prompt_cache_min_tokens = 4096
effective_at = "2026-05-31T00:00:00Z"

# ── Google Gemini ─────────────────────────────────────────────────────────────
# Headline rate is the ≤200K-token bracket; the >200K-token bracket is not
# modeled by this schema yet (tracked as a v2 item), so long-context requests
# are priced at the ≤200K rate here.
# batch_{input,output}_per_million is the Batch-mode rate — a flat 50% of
# standard (ai.google.dev/gemini-api/docs/pricing). prompt_cache_min_tokens is
# left unset: Gemini implicit caching minimums vary by model and aren't pinned
# here yet (the provider's reported usage remains authoritative for billing).
#
# In every row below, cached_input_per_million is 10% of input and the batch
# rates are exactly half of the standard input/output rates.
[[entry]]
provider = "gemini"
model = "gemini-3.1-flash-lite"
input_per_million = 0.25
output_per_million = 1.50
cached_input_per_million = 0.025
batch_input_per_million = 0.125
batch_output_per_million = 0.75
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "gemini"
model = "gemini-3.5-flash"
input_per_million = 1.50
output_per_million = 9.00
cached_input_per_million = 0.15
batch_input_per_million = 0.75
batch_output_per_million = 4.50
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "gemini"
model = "gemini-3.1-pro"
input_per_million = 2.00
output_per_million = 12.00
cached_input_per_million = 0.20
batch_input_per_million = 1.00
batch_output_per_million = 6.00
effective_at = "2026-05-01T00:00:00Z"

# ── Groq ──────────────────────────────────────────────────────────────────────
# Groq rows carry only input/output rates (no cached, batch, flex, or
# cache-minimum fields). The two active-model rates below have no dated
# verification note of their own; they are May-snapshot values.
[[entry]]
provider = "groq"
model = "llama-3.3-70b-versatile"
input_per_million = 0.59
output_per_million = 0.79
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "groq"
model = "llama-3.1-8b-instant"
input_per_million = 0.05
output_per_million = 0.08
effective_at = "2026-05-01T00:00:00Z"

# DEPRECATED by Groq (verified 2026-05-31): deepseek-r1-distill-llama-70b was
# retired 2025-09-02, mixtral-8x7b-32768 on 2025-03-05. Rows retained for
# historical replay, but the gateway's Groq registry should NOT offer these for
# routing. (Groq console: console.groq.com/docs/deprecations)
# NOTE(review): both retirement dates precede these rows' effective_at
# (2026-05-01, the import baseline), and replay only matches rows with
# `effective_at <= request_ts`. Requests made while the models were still
# served would therefore not match these rows — confirm whether earlier-dated
# rows are needed for replay to be useful here.
[[entry]]
provider = "groq"
model = "deepseek-r1-distill-llama-70b"
input_per_million = 0.75
output_per_million = 0.99
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "groq"
model = "mixtral-8x7b-32768"
input_per_million = 0.24
output_per_million = 0.24
effective_at = "2026-05-01T00:00:00Z"

# ── Mistral ───────────────────────────────────────────────────────────────────
# Mistral rows carry only input/output rates. Every model id here is a
# `-latest` alias, so a new row can reflect the alias being repointed to a
# different underlying model (as in the medium/small updates below), not only
# a price change on the same model.
# mistral-large-latest and codestral-latest have no dated verification note:
# their rates are May-snapshot values.
[[entry]]
provider = "mistral"
model = "mistral-large-latest"
input_per_million = 2.00
output_per_million = 6.00
effective_at = "2026-05-01T00:00:00Z"

# May-snapshot rate. Superseded by the 2026-05-31 row that follows; kept
# unedited for historical replay.
[[entry]]
provider = "mistral"
model = "mistral-medium-latest"
input_per_million = 0.70
output_per_million = 2.10
effective_at = "2026-05-01T00:00:00Z"

# Updated 2026-05-31 — mistral-medium-latest now resolves to Mistral Medium 3
# at $0.40/$2.00 (mistral.ai/news/mistral-medium-3, confirmed via OpenRouter
# mistralai/mistral-medium-3.1). Was $0.70/$2.10.
[[entry]]
provider = "mistral"
model = "mistral-medium-latest"
input_per_million = 0.40
output_per_million = 2.00
effective_at = "2026-05-31T00:00:00Z"

# May-snapshot rate. Superseded by the 2026-05-31 row that follows; kept
# unedited for historical replay.
[[entry]]
provider = "mistral"
model = "mistral-small-latest"
input_per_million = 0.20
output_per_million = 0.60
effective_at = "2026-05-01T00:00:00Z"

# Updated 2026-05-31 — mistral-small-latest now resolves to Mistral Small 4 at
# $0.10/$0.30 (mistral.ai/pricing). Was $0.20/$0.60.
[[entry]]
provider = "mistral"
model = "mistral-small-latest"
input_per_million = 0.10
output_per_million = 0.30
effective_at = "2026-05-31T00:00:00Z"

[[entry]]
provider = "mistral"
model = "codestral-latest"
input_per_million = 0.30
output_per_million = 0.90
effective_at = "2026-05-01T00:00:00Z"

# UNVERIFIED 2026-05-31 — pixtral-large is no longer on Mistral's current
# pricing page (mistral.ai/pricing) and isn't on OpenRouter; likely superseded
# by Mistral Small 4 (multimodal). Rate below is the May snapshot, unconfirmed —
# verify in the Mistral console or drop if the model is retired.
# The rate equals the mistral-large-latest row (2.00/6.00).
[[entry]]
provider = "mistral"
model = "pixtral-large-latest"
input_per_million = 2.00
output_per_million = 6.00
effective_at = "2026-05-01T00:00:00Z"

# ── Together ────────────────────────────────────────────────────────────────────
# Together rows carry only input/output rates, and every row prices input and
# output identically. Model ids are Together's `org/Model-Name` identifiers.
#
# May-snapshot rate. Superseded by the 2026-05-31 row that follows; kept
# unedited for historical replay.
[[entry]]
provider = "together"
model = "meta-llama/Meta-Llama-3.3-70B-Instruct-Turbo"
input_per_million = 0.88
output_per_million = 0.88
effective_at = "2026-05-01T00:00:00Z"

# Updated 2026-05-31 — Together's hosted rate for this model is now $1.04/$1.04
# (confirmed via OpenRouter's per-endpoint pricing: Together endpoint on
# meta-llama/llama-3.3-70b-instruct). Was $0.88/$0.88.
[[entry]]
provider = "together"
model = "meta-llama/Meta-Llama-3.3-70B-Instruct-Turbo"
input_per_million = 1.04
output_per_million = 1.04
effective_at = "2026-05-31T00:00:00Z"

# UNVERIFIED 2026-05-31 — the next three (405B-Turbo, Qwen2.5-72B-Turbo,
# DeepSeek-V3) are NOT on Together's current serverless models catalog
# (docs.together.ai/docs/serverless-models) and have no Together endpoint on
# OpenRouter; likely retired/repriced. Rates below are the May snapshot,
# unconfirmed — verify in the Together console or drop if retired.
[[entry]]
provider = "together"
model = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
input_per_million = 3.50
output_per_million = 3.50
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "together"
model = "Qwen/Qwen2.5-72B-Instruct-Turbo"
input_per_million = 1.20
output_per_million = 1.20
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "together"
model = "deepseek-ai/DeepSeek-V3"
input_per_million = 1.25
output_per_million = 1.25
effective_at = "2026-05-01T00:00:00Z"

# ── OpenRouter ──────────────────────────────────────────────────────────────────
# BYOK aggregator; rates mirror upstream list price (OpenRouter's 5% fee is
# applied separately via the provider's fee_multiplier, not baked in here).
# Model ids are OpenRouter's `vendor/model` slugs, which are spelled
# differently from the direct-provider ids in this file (e.g.
# "anthropic/claude-sonnet-4.6" here vs "claude-sonnet-4-6" under anthropic).
# Rows carry only input/output rates — no cached, batch, or cache-minimum
# fields are recorded for OpenRouter.
[[entry]]
provider = "openrouter"
model = "anthropic/claude-sonnet-4.6"
input_per_million = 3.00
output_per_million = 15.00
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "openrouter"
model = "openai/gpt-5.5"
input_per_million = 5.00
output_per_million = 30.00
effective_at = "2026-05-01T00:00:00Z"

[[entry]]
provider = "openrouter"
model = "google/gemini-3.1-pro-preview"
input_per_million = 2.00
output_per_million = 12.00
effective_at = "2026-05-01T00:00:00Z"

# May-snapshot rate. Superseded by the 2026-05-31 row that follows; kept
# unedited for historical replay.
[[entry]]
provider = "openrouter"
model = "meta-llama/llama-3.3-70b-instruct"
input_per_million = 0.59
output_per_million = 0.79
effective_at = "2026-05-01T00:00:00Z"

# Updated 2026-05-31 — OpenRouter list price for this model dropped to
# $0.10/$0.32 (openrouter:* rows mirror OpenRouter list price by definition).
[[entry]]
provider = "openrouter"
model = "meta-llama/llama-3.3-70b-instruct"
input_per_million = 0.10
output_per_million = 0.32
effective_at = "2026-05-31T00:00:00Z"

[[entry]]
provider = "openrouter"
model = "mistralai/mistral-large"
input_per_million = 2.00
output_per_million = 6.00
effective_at = "2026-05-01T00:00:00Z"