Skip to main content

minillmlib/provider/
response.rs

1//! Response types from LLM APIs
2
3use serde::{Deserialize, Serialize};
4use std::sync::Arc;
5
/// Normalized token usage + cost, the common currency every provider's
/// [`Provider`](super::Provider) parses its native wire shape into.
///
/// Input tokens are split into THREE DISJOINT, ADDITIVE buckets so caching is
/// priced correctly across every provider's differing wire conventions:
/// - `uncached_input_tokens`: full-price prompt tokens (no cache involved),
/// - `cache_read_tokens`: served from a warm cache (cheap, ~0.1× input),
/// - `cache_write_tokens`: written to the cache this request (a premium, ~1.25×).
///
/// They never overlap, so total input = the sum of the three (see
/// [`Usage::prompt_tokens`]), and cost is a clean weighted sum with no
/// subtraction (the old single `cached_tokens` field forced a subtract that was
/// correct for OpenAI's "cached is a subset of prompt_tokens" wire but WRONG for
/// Anthropic's "input_tokens already excludes cached" wire). Each provider's
/// parser maps its native fields into these disjoint buckets.
///
/// Built by the provider (the nested per-provider wire shapes don't match these
/// flat fields), and serialized into node metadata for diagnostics. Deliberately
/// NOT `Deserialize`: a derived flat-field deserializer would silently produce
/// all-zero/`None` fields against the real nested payloads.
///
/// `Default` yields all-zero counts and `None` for every optional field, which
/// is the "nothing reported yet" state that [`Usage::merge_from`] folds later
/// reports into.
#[derive(Debug, Clone, Default, Serialize)]
pub struct Usage {
    /// Full-price input tokens (NOT read from nor written to cache this request).
    pub uncached_input_tokens: u32,

    /// Input tokens served from a warm cache (priced at the cache-read rate).
    pub cache_read_tokens: u32,

    /// Input tokens written to the cache this request (priced at the cache-write
    /// premium). Non-zero only on the request that creates/refreshes a cache entry.
    pub cache_write_tokens: u32,

    /// Number of tokens in the completion (output).
    pub completion_tokens: u32,

    /// Cost in USD (for OpenRouter, the fee; may be 0 on a BYOK free tier or when
    /// the provider returns no native cost). `None` if the wire carried no cost.
    pub cost: Option<f64>,

    /// Upstream inference cost (only for BYOK requests, the actual
    /// cost charged by the provider like Google Vertex or Bedrock).
    /// `None` when the wire carried no such figure.
    pub upstream_inference_cost: Option<f64>,

    /// Reasoning tokens (for models that support it); `None` when not reported.
    // NOTE(review): whether these are already counted inside `completion_tokens`
    // is provider-defined and not visible from this file; confirm before summing.
    pub reasoning_tokens: Option<u32>,
}
51
52impl Usage {
53    /// Total input tokens processed = the three disjoint input buckets summed.
54    pub fn prompt_tokens(&self) -> u32 {
55        self.uncached_input_tokens + self.cache_read_tokens + self.cache_write_tokens
56    }
57
58    /// Total tokens (input + output).
59    pub fn total_tokens(&self) -> u32 {
60        self.prompt_tokens() + self.completion_tokens
61    }
62
63    /// Fold a later usage report into this one, keeping the non-zero/`Some` value
64    /// of each field. Needed for providers that split usage across streaming
65    /// events (Anthropic sends input tokens in `message_start` and output tokens
66    /// in `message_delta`); for single-usage-chunk providers (OpenAI) this is a
67    /// plain overwrite since the prior usage is all-zero/`None`.
68    pub(crate) fn merge_from(&mut self, other: &Usage) {
69        if other.uncached_input_tokens != 0 {
70            self.uncached_input_tokens = other.uncached_input_tokens;
71        }
72        if other.cache_read_tokens != 0 {
73            self.cache_read_tokens = other.cache_read_tokens;
74        }
75        if other.cache_write_tokens != 0 {
76            self.cache_write_tokens = other.cache_write_tokens;
77        }
78        if other.completion_tokens != 0 {
79            self.completion_tokens = other.completion_tokens;
80        }
81        self.cost = other.cost.or(self.cost);
82        self.upstream_inference_cost = other
83            .upstream_inference_cost
84            .or(self.upstream_inference_cost);
85        self.reasoning_tokens = other.reasoning_tokens.or(self.reasoning_tokens);
86    }
87}
88
/// Whether the cost in a `CostInfo` was actually determined. Consumers must
/// check this before treating the reported amount as truth: only `Resolved`
/// carries an authoritative USD cost. Neither `Unpriced` nor `Unknown` may be
/// silently counted as a real zero.
///
/// The derived `Default` is `Resolved`.
// NOTE(review): because `CostInfo` also derives `Default`, `CostInfo::default()`
// reports `cost == 0.0` together with `Resolved`, i.e. an "authoritative zero".
// Confirm no caller relies on a defaulted `CostInfo` as a placeholder.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum CostResolution {
    /// The USD cost is authoritative (returned natively by the provider, or
    /// derived from real token counts and a configured `TokenPrice`).
    #[default]
    Resolved,
    /// Token counts are real, but the provider returns no native cost and no
    /// `TokenPrice` was configured for this generator/request, so the USD amount
    /// is unknown. Set a `TokenPrice` (on the generator or per-request) to resolve
    /// it. The `cost` field is 0.0 and must NOT be treated as a free request.
    Unpriced,
    /// Cost could not be determined at all (no usage was returned and any
    /// out-of-band query failed). Numeric fields are best-effort.
    Unknown,
}
108
/// Detailed cost information from a completion.
///
/// This is the by-value payload handed to a [`CostCallback`]. Always read
/// `resolution` before trusting `cost`: only [`CostResolution::Resolved`] marks
/// the amount as authoritative.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CostInfo {
    /// Total cost in credits charged to your account
    // NOTE(review): described as "credits" here but as USD on `CostResolution`
    // and `Usage::cost`; confirm the unit and align the wording.
    pub cost: f64,

    /// Total prompt (input) tokens = uncached + cache-read + cache-write.
    pub prompt_tokens: u32,

    /// Number of completion (output) tokens
    pub completion_tokens: u32,

    /// Total tokens (input + output)
    pub total_tokens: u32,

    /// Input tokens served from a warm cache (priced at the cache-read rate).
    pub cache_read_tokens: u32,

    /// Input tokens written to the cache this request (priced at the cache-write
    /// premium, a one-time cost when the cache entry is created/refreshed).
    pub cache_write_tokens: u32,

    /// Reasoning tokens (if any)
    pub reasoning_tokens: Option<u32>,

    /// The model used
    pub model: String,

    /// Response ID for tracking
    pub response_id: String,

    /// Whether `cost` was actually determined or could not be resolved.
    pub resolution: CostResolution,
}
143
/// Callback function type for cost ingestion.
/// Called with CostInfo after each successful completion.
///
/// The callback is reference-counted (`Arc`) and bound by `Send + Sync`, so one
/// instance can be cloned cheaply and invoked from any thread. It receives the
/// `CostInfo` by value and returns nothing.
pub type CostCallback = Arc<dyn Fn(CostInfo) + Send + Sync>;
147
/// A complete response from an LLM API.
///
/// Serialize-only: it is built in-code from parsed responses, never deserialized
/// (and it embeds `Usage`, which is not `Deserialize` by design).
///
/// Both wire parsers in this file (`parse_openai_response`,
/// `parse_anthropic_response`) produce this same normalized shape.
#[derive(Debug, Clone, Serialize)]
pub struct CompletionResponse {
    /// Unique identifier for this completion (empty string when the wire
    /// carried no string `id`).
    pub id: String,

    /// The model that generated this response (empty string when the wire
    /// carried no string `model`).
    pub model: String,

    /// The generated text content. Empty for a tool-call-only response.
    pub content: String,

    /// Finish reason (e.g., "stop", "length", "tool_calls")
    pub finish_reason: Option<String>,

    /// Token usage statistics
    pub usage: Option<Usage>,

    /// Tool calls made by the model (if any)
    pub tool_calls: Option<Vec<serde_json::Value>>,

    /// Raw response for debugging. Excluded from serialization via
    /// `#[serde(skip)]`, so it never reaches serialized output.
    #[serde(skip)]
    pub raw_response: Option<serde_json::Value>,
}
176
177impl CompletionResponse {
178    /// Create a new completion response
179    pub fn new(
180        id: impl Into<String>,
181        model: impl Into<String>,
182        content: impl Into<String>,
183    ) -> Self {
184        Self {
185            id: id.into(),
186            model: model.into(),
187            content: content.into(),
188            finish_reason: None,
189            usage: None,
190            tool_calls: None,
191            raw_response: None,
192        }
193    }
194
195    /// Check if the completion finished normally
196    pub fn is_complete(&self) -> bool {
197        self.finish_reason.as_deref() == Some("stop")
198    }
199
200    /// Check if the completion was truncated due to length
201    pub fn is_truncated(&self) -> bool {
202        self.finish_reason.as_deref() == Some("length")
203    }
204
205    /// Check if the model made tool calls
206    pub fn has_tool_calls(&self) -> bool {
207        self.tool_calls.as_ref().is_some_and(|tc| !tc.is_empty())
208    }
209}
210
/// A chunk from a streaming response.
///
/// `Default` is the empty chunk (no id, empty delta, nothing else set); the
/// constructors on `StreamChunk` start from it and fill in a single field.
#[derive(Debug, Clone, Default)]
pub struct StreamChunk {
    /// The provider's generation id, if this chunk carried one (every OpenAI-wire
    /// chunk does). Threaded into the stream so out-of-band cost resolution can
    /// query the REAL generation, not a locally-minted placeholder.
    pub id: Option<String>,

    /// The delta content in this chunk (empty when the chunk carries no text).
    pub delta: String,

    /// Finish reason (only present in final chunk)
    pub finish_reason: Option<String>,

    /// Usage info (only present in final chunk for some providers)
    pub usage: Option<Usage>,

    /// Tool call deltas
    pub tool_calls: Option<Vec<serde_json::Value>>,
}
231
232impl StreamChunk {
233    /// Create a new stream chunk with content
234    pub fn content(delta: impl Into<String>) -> Self {
235        Self {
236            delta: delta.into(),
237            ..Default::default()
238        }
239    }
240
241    /// Create a final chunk with finish reason
242    pub fn finished(finish_reason: impl Into<String>) -> Self {
243        Self {
244            finish_reason: Some(finish_reason.into()),
245            ..Default::default()
246        }
247    }
248
249    /// Check if this is the final chunk
250    pub fn is_final(&self) -> bool {
251        self.finish_reason.is_some()
252    }
253}
254
/// Shorten a response body before it is embedded in an error or log line.
///
/// Bodies of up to 200 characters are returned unchanged. Longer bodies are cut
/// after their first 200 characters (always on a `char` boundary, never inside a
/// multi-byte sequence) and suffixed with `…`, so an error string cannot balloon
/// with a huge (and possibly prompt-bearing) body.
pub(crate) fn preview_str(body: &str) -> String {
    const MAX: usize = 200;
    // The byte offset of character number MAX (0-based) is where the cut goes;
    // if there is no such character the body is already short enough.
    if let Some((cut, _)) = body.char_indices().nth(MAX) {
        let mut preview = String::with_capacity(cut + '…'.len_utf8());
        preview.push_str(&body[..cut]);
        preview.push('…');
        preview
    } else {
        body.to_owned()
    }
}
264
265/// A real error envelope: a NON-EMPTY object under `error`. A `null`, `{}`, or a
266/// falsy scalar (`false`/`0`/`""`) is not a failure and must not trip error
267/// handling (which would fail an otherwise-good response/stream and silently lose
268/// an accepted generation's cost). Every provider emits a populated object on a
269/// genuine error and omits the field entirely on success.
270fn error_object(raw: &serde_json::Value) -> Option<&serde_json::Value> {
271    raw.get("error")
272        .filter(|e| e.as_object().is_some_and(|o| !o.is_empty()))
273}
274
275/// If `raw` carries an OpenAI-wire `error` object, map it to a typed `Api` error.
276/// The single place the OpenAI-wire error envelope is decoded, so a 200-with-error
277/// body is surfaced identically whether it arrives as a full response or as an
278/// in-band streaming chunk. `None` when there is no error object.
279fn openai_error_in(raw: &serde_json::Value) -> Option<crate::error::MiniLLMError> {
280    let error = error_object(raw)?;
281    let message = error["message"]
282        .as_str()
283        .map(String::from)
284        .unwrap_or_else(|| preview_str(&error.to_string()));
285    // Use the error code only when it's a genuine numeric HTTP status in range;
286    // providers also send string codes (e.g. "rate_limit_exceeded") or values
287    // outside u16. Anything else is an upstream failure -> 502 (retryable), so a
288    // transient overload is never misclassified as a non-retryable success.
289    let status = error["code"]
290        .as_u64()
291        .filter(|&c| (100..=599).contains(&c))
292        .map(|c| c as u16)
293        .unwrap_or(502);
294    Some(crate::error::MiniLLMError::Api { status, message })
295}
296
297/// If `raw` carries an Anthropic `error` object, map it to a typed `Api` error
298/// (status 502: an upstream Anthropic failure is treated as retryable). The single
299/// place the Anthropic error envelope is decoded, so a 200-with-error body is
300/// surfaced identically from a full response and an in-band stream `error` event.
301fn anthropic_error_in(raw: &serde_json::Value) -> Option<crate::error::MiniLLMError> {
302    let error = error_object(raw)?;
303    let message = error["message"]
304        .as_str()
305        .map(String::from)
306        .unwrap_or_else(|| preview_str(&error.to_string()));
307    Some(crate::error::MiniLLMError::Api {
308        status: 502,
309        message,
310    })
311}
312
313/// Parse a raw OpenAI-wire response into a CompletionResponse (the default
314/// `Provider::parse_response`). `provider.parse_usage` extracts the usage so a
315/// provider with native cost fields (OpenRouter) reads them.
316///
317/// Many OpenAI-compatible providers (OpenRouter included) return HTTP 200 with
318/// an error body and no `choices`. We must surface that as a loud error instead
319/// of silently producing an empty completion, so callers never mistake an error
320/// for a successful empty response.
321pub fn parse_openai_response<P: super::Provider + ?Sized>(
322    raw: serde_json::Value,
323    provider: &P,
324) -> crate::error::Result<CompletionResponse> {
325    // A 200 response carrying an `error` object is a failure, not a completion.
326    if let Some(err) = openai_error_in(&raw) {
327        return Err(err);
328    }
329
330    let id = raw["id"].as_str().unwrap_or("").to_string();
331    let model = raw["model"].as_str().unwrap_or("").to_string();
332
333    // A well-formed completion must carry a first choice with a message. If it
334    // does not (and there was no error object above), the response is malformed.
335    let choice = raw["choices"]
336        .get(0)
337        .filter(|c| c.get("message").is_some())
338        .ok_or_else(|| {
339            crate::error::MiniLLMError::MalformedResponse(preview_str(&raw.to_string()))
340        })?;
341    let message = &choice["message"];
342
343    // `content` may legitimately be null/absent for a tool-call-only response.
344    let content = message["content"].as_str().unwrap_or("").to_string();
345    let tool_calls = message["tool_calls"].as_array().cloned();
346    let finish_reason = choice["finish_reason"].as_str().map(String::from);
347
348    // Usage parsing is provider-specific (field names, native cost fields).
349    let usage = provider.parse_usage(&raw);
350
351    Ok(CompletionResponse {
352        id,
353        model,
354        content,
355        finish_reason,
356        usage,
357        tool_calls,
358        raw_response: Some(raw),
359    })
360}
361
362/// Merge streaming tool-call deltas into an accumulator.
363///
364/// OpenAI-style streaming sends tool calls as deltas keyed by `index`: the first
365/// delta for an index carries `id`/`type`/`function.name`, and subsequent deltas
366/// append `function.arguments` fragments. This folds each delta into the slot at
367/// its `index`, concatenating argument fragments, so the accumulator ends up with
368/// the fully assembled tool calls.
369pub fn accumulate_tool_call_deltas(acc: &mut Vec<serde_json::Value>, deltas: &[serde_json::Value]) {
370    for delta in deltas {
371        // `index` de-multiplexes the deltas and is structurally required. A
372        // missing/non-numeric index would silently merge distinct calls into
373        // slot 0; an out-of-order high index would let a malicious stream size an
374        // arbitrary allocation (OOM). Real indices are small and contiguous, so we
375        // only accept "update an existing slot" or "append exactly one", and skip
376        // anything else loudly.
377        let Some(index) = delta["index"].as_u64().map(|i| i as usize) else {
378            tracing::warn!("tool_call delta missing numeric index, skipping");
379            continue;
380        };
381        if index > acc.len() {
382            tracing::warn!(
383                index,
384                len = acc.len(),
385                "tool_call delta index out of order, skipping"
386            );
387            continue;
388        }
389        if index == acc.len() {
390            acc.push(serde_json::json!({}));
391        }
392        let slot = &mut acc[index];
393
394        for key in ["id", "type"] {
395            if let Some(v) = delta.get(key).filter(|v| !v.is_null()) {
396                slot[key] = v.clone();
397            }
398        }
399
400        if let Some(func) = delta.get("function") {
401            if !slot["function"].is_object() {
402                slot["function"] = serde_json::json!({});
403            }
404            let slot_func = &mut slot["function"];
405            if let Some(name) = func.get("name").filter(|v| !v.is_null()) {
406                slot_func["name"] = name.clone();
407            }
408            if let Some(frag) = func.get("arguments").and_then(|v| v.as_str()) {
409                let existing = slot_func["arguments"].as_str().unwrap_or("").to_string();
410                slot_func["arguments"] = serde_json::json!(format!("{}{}", existing, frag));
411            }
412        }
413    }
414}
415
416/// Parse an OpenAI-wire streaming chunk from SSE data (the default
417/// `Provider::parse_chunk`). `provider.parse_usage` reads provider-specific usage
418/// out of the chunk.
419pub fn parse_openai_chunk<P: super::Provider + ?Sized>(
420    data: &str,
421    provider: &P,
422) -> Option<crate::error::Result<StreamChunk>> {
423    // Handle [DONE] marker
424    if data.trim() == "[DONE]" {
425        return Some(Ok(StreamChunk::finished("stop")));
426    }
427
428    // Parse JSON
429    let json: serde_json::Value = serde_json::from_str(data).ok()?;
430
431    // An in-band error frame on a 200 stream is a FAILURE, surfaced loudly through
432    // the channel (same path as a transport error) so it is never billed as an
433    // accepted generation. Mirrors `parse_openai_response`'s error handling.
434    if let Some(err) = openai_error_in(&json) {
435        return Some(Err(err));
436    }
437
438    // The provider's real generation id (every OpenAI-wire chunk carries it);
439    // threaded so out-of-band cost resolution targets the actual generation.
440    let id = json["id"]
441        .as_str()
442        .filter(|s| !s.is_empty())
443        .map(String::from);
444
445    // Provider-specific usage (OpenRouter/OpenAI send it in the last chunk).
446    let usage = provider.parse_usage(&json);
447
448    // Try to get choice (may not be present in usage-only chunks)
449    let choice = json["choices"].get(0);
450
451    let delta = choice
452        .and_then(|c| c["delta"]["content"].as_str())
453        .unwrap_or("")
454        .to_string();
455
456    let finish_reason = choice
457        .and_then(|c| c["finish_reason"].as_str())
458        .filter(|s| !s.is_empty())
459        .map(String::from);
460
461    let tool_calls = choice.and_then(|c| c["delta"]["tool_calls"].as_array().cloned());
462
463    // Return a chunk if it carries anything we track (id alone is not enough to
464    // surface, but it rides along with whatever else is present).
465    if delta.is_empty() && finish_reason.is_none() && usage.is_none() && tool_calls.is_none() {
466        return None;
467    }
468
469    Some(Ok(StreamChunk {
470        id,
471        delta,
472        finish_reason,
473        usage,
474        tool_calls,
475    }))
476}
477
478// =============================================================================
479// Anthropic `/v1/messages` envelope (`content[]`)
480// =============================================================================
481
482/// Parse Anthropic's usage object into the normalized DISJOINT buckets.
483///
484/// Anthropic's wire is ALREADY disjoint: `input_tokens` is the non-cached input
485/// only (tokens after the last cache breakpoint), and `cache_read_input_tokens` /
486/// `cache_creation_input_tokens` are SEPARATE additive counts. So the mapping is
487/// direct, no subtraction. Anthropic returns NO dollar cost, only token counts.
488/// Streaming `message_delta` carries only `output_tokens` (input folded from the
489/// earlier `message_start` via [`Usage::merge_from`]).
490fn parse_anthropic_usage(u: &serde_json::Value) -> Option<Usage> {
491    if u.is_null() {
492        return None;
493    }
494    Some(Usage {
495        uncached_input_tokens: u["input_tokens"].as_u64().unwrap_or(0) as u32,
496        cache_read_tokens: u["cache_read_input_tokens"].as_u64().unwrap_or(0) as u32,
497        cache_write_tokens: u["cache_creation_input_tokens"].as_u64().unwrap_or(0) as u32,
498        completion_tokens: u["output_tokens"].as_u64().unwrap_or(0) as u32,
499        cost: None,
500        upstream_inference_cost: None,
501        reasoning_tokens: None,
502    })
503}
504
505/// Parse a completed Anthropic `/v1/messages` response. The envelope is
506/// `content[]` blocks (text + optional tool_use), a top-level `stop_reason`, and
507/// a token-only `usage`. A 200 carrying an `error` object is surfaced loudly.
508pub fn parse_anthropic_response(
509    raw: serde_json::Value,
510) -> crate::error::Result<CompletionResponse> {
511    if let Some(err) = anthropic_error_in(&raw) {
512        return Err(err);
513    }
514
515    let content_blocks = raw["content"].as_array().ok_or_else(|| {
516        crate::error::MiniLLMError::MalformedResponse(preview_str(&raw.to_string()))
517    })?;
518
519    // Join every text block; collect tool_use blocks into the normalized shape.
520    let mut text = String::new();
521    let mut tool_calls: Vec<serde_json::Value> = Vec::new();
522    for block in content_blocks {
523        match block["type"].as_str() {
524            Some("text") => text.push_str(block["text"].as_str().unwrap_or("")),
525            Some("tool_use") => tool_calls.push(serde_json::json!({
526                "id": block["id"],
527                "type": "function",
528                "function": {
529                    "name": block["name"],
530                    "arguments": block["input"].to_string(),
531                },
532            })),
533            _ => {}
534        }
535    }
536
537    Ok(CompletionResponse {
538        id: raw["id"].as_str().unwrap_or("").to_string(),
539        model: raw["model"].as_str().unwrap_or("").to_string(),
540        content: text,
541        finish_reason: raw["stop_reason"].as_str().map(String::from),
542        usage: parse_anthropic_usage(&raw["usage"]),
543        tool_calls: (!tool_calls.is_empty()).then_some(tool_calls),
544        raw_response: Some(raw),
545    })
546}
547
548/// Parse one Anthropic SSE event payload into a [`StreamChunk`]. Anthropic streams
549/// a sequence of typed events; each maps to at most one chunk:
550/// - `message_start` → carries the message `id` + initial usage (input tokens),
551/// - `content_block_delta` (`delta.text_delta`) → the text delta,
552/// - `message_delta` → final usage (output tokens) + `stop_reason`,
553/// - `message_stop` → terminal marker.
554///
555/// Other events (`content_block_start/stop`, `ping`) carry nothing trackable.
556pub fn parse_anthropic_chunk(data: &str) -> Option<crate::error::Result<StreamChunk>> {
557    let json: serde_json::Value = serde_json::from_str(data).ok()?;
558    match json["type"].as_str()? {
559        // An in-band `error` event on a 200 stream is a FAILURE (e.g.
560        // `overloaded_error` mid-generation). Surface it loudly through the channel,
561        // same as the non-streaming `parse_anthropic_response`, so a failed stream
562        // is never billed as an accepted generation.
563        "error" => Some(Err(anthropic_error_in(&json).unwrap_or_else(|| {
564            crate::error::MiniLLMError::Api {
565                status: 502,
566                message: preview_str(&json.to_string()),
567            }
568        }))),
569        "message_start" => {
570            let msg = &json["message"];
571            let id = msg["id"]
572                .as_str()
573                .filter(|s| !s.is_empty())
574                .map(String::from);
575            let usage = parse_anthropic_usage(&msg["usage"]);
576            (id.is_some() || usage.is_some()).then(|| {
577                Ok(StreamChunk {
578                    id,
579                    usage,
580                    ..Default::default()
581                })
582            })
583        }
584        "content_block_delta" => {
585            let delta = json["delta"]["text"].as_str().unwrap_or("").to_string();
586            (!delta.is_empty()).then(|| {
587                Ok(StreamChunk {
588                    delta,
589                    ..Default::default()
590                })
591            })
592        }
593        "message_delta" => {
594            let finish_reason = json["delta"]["stop_reason"].as_str().map(String::from);
595            let usage = parse_anthropic_usage(&json["usage"]);
596            (finish_reason.is_some() || usage.is_some()).then(|| {
597                Ok(StreamChunk {
598                    finish_reason,
599                    usage,
600                    ..Default::default()
601                })
602            })
603        }
604        "message_stop" => Some(Ok(StreamChunk::finished("stop"))),
605        _ => None,
606    }
607}
608
609#[cfg(test)]
610mod tests {
611    use super::*;
612    use crate::provider::{OpenRouterProvider, Provider, TokenPrice};
613
614    /// The accounting used to parse usage in these tests (OpenAI-wire shape).
615    fn acct() -> OpenRouterProvider {
616        OpenRouterProvider
617    }
618
619    #[test]
620    fn parse_response_threads_tool_calls_and_finish_reason() {
621        let raw = serde_json::json!({
622            "id": "gen-1",
623            "model": "test-model",
624            "choices": [{
625                "finish_reason": "tool_calls",
626                "message": {
627                    "content": null,
628                    "tool_calls": [{"id": "call_1", "type": "function",
629                        "function": {"name": "get_weather", "arguments": "{}"}}]
630                }
631            }]
632        });
633        let resp = acct().parse_response(raw).unwrap();
634        assert_eq!(resp.id, "gen-1");
635        assert_eq!(resp.content, "");
636        assert_eq!(resp.finish_reason.as_deref(), Some("tool_calls"));
637        let tc = resp.tool_calls.expect("tool_calls threaded through");
638        assert_eq!(tc[0]["function"]["name"], "get_weather");
639    }
640
641    #[test]
642    fn parse_response_surfaces_200_error_body_loudly() {
643        // OpenRouter/OpenAI 200-with-error-body must become an Api error, not an
644        // empty success.
645        let raw = serde_json::json!({
646            "error": {"message": "model overloaded", "code": 503}
647        });
648        let err = acct().parse_response(raw).unwrap_err();
649        match err {
650            crate::error::MiniLLMError::Api { status, message } => {
651                assert_eq!(status, 503);
652                assert_eq!(message, "model overloaded");
653            }
654            other => panic!("expected Api error, got {other:?}"),
655        }
656    }
657
658    #[test]
659    fn parse_response_error_with_string_code_defaults_to_retryable_502() {
660        // A non-numeric error code (e.g. "rate_limit_exceeded") must NOT collapse
661        // to 200 (a fake non-retryable success); it becomes 502 (retryable).
662        let raw = serde_json::json!({
663            "error": {"message": "slow down", "code": "rate_limit_exceeded"}
664        });
665        match acct().parse_response(raw).unwrap_err() {
666            crate::error::MiniLLMError::Api { status, .. } => assert_eq!(status, 502),
667            other => panic!("expected Api error, got {other:?}"),
668        }
669        // An out-of-range numeric code also defaults to 502 (no u16 truncation).
670        let raw = serde_json::json!({ "error": {"message": "x", "code": 999_999} });
671        match acct().parse_response(raw).unwrap_err() {
672            crate::error::MiniLLMError::Api { status, .. } => assert_eq!(status, 502),
673            other => panic!("expected Api error, got {other:?}"),
674        }
675    }
676
677    #[test]
678    fn parse_response_rejects_malformed_missing_choices() {
679        let raw = serde_json::json!({ "id": "gen-1", "model": "m" });
680        assert!(acct().parse_response(raw).is_err());
681    }
682
683    #[test]
684    fn openrouter_parses_usage_and_aggregates_byok_cost() {
685        // The OpenRouter accounting parses its nested usage shape and sums the
686        // fee + BYOK upstream charge in its own cost_of (the aggregation that must
687        // stay provider-specific).
688        let raw = serde_json::json!({
689            "usage": {
690                "prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15,
691                "cost": 0.001,
692                "cost_details": {"upstream_inference_cost": 0.009},
693                "prompt_tokens_details": {"cached_tokens": 4},
694                "completion_tokens_details": {"reasoning_tokens": 2}
695            }
696        });
697        let usage = acct().parse_usage(&raw).expect("usage parsed");
698        assert_eq!(usage.prompt_tokens(), 10, "total input = sum of buckets");
699        assert_eq!(
700            usage.cache_read_tokens, 4,
701            "cached_tokens → cache_read bucket"
702        );
703        assert_eq!(
704            usage.uncached_input_tokens, 6,
705            "10 total − 4 cached = 6 uncached"
706        );
707        assert_eq!(usage.upstream_inference_cost, Some(0.009));
708        assert_eq!(usage.reasoning_tokens, Some(2));
709
710        let outcome = acct().cost_of(usage, None);
711        assert_eq!(outcome.resolution, CostResolution::Resolved);
712        assert!((outcome.usd - 0.010).abs() < 1e-9);
713    }
714
#[test]
fn openai_wire_splits_cache_read_as_subset_and_cache_write_as_additive() {
    use serde_json::json;

    // On this wire the two cache counters relate to prompt_tokens in opposite
    // ways, and mixing them up mis-bills cache-heavy requests:
    //   - cached_tokens (READ) is already counted inside prompt_tokens, so it
    //     is subtracted to obtain the uncached bucket;
    //   - cache_write_tokens (WRITE) is billed on top of prompt_tokens, so it
    //     stays out of that subtraction.
    // Fixture: prompt_tokens 10000 = 8000 uncached + 2000 cache-read, with a
    // further 5000 tokens written to the cache.
    let payload = json!({
        "usage": {
            "prompt_tokens": 10000,
            "completion_tokens": 100,
            "prompt_tokens_details": {
                "cached_tokens": 2000,
                "cache_write_tokens": 5000
            }
        }
    });
    let parsed = acct().parse_usage(&payload);
    let usage = parsed.expect("usage parsed");

    // Each bucket is checked on its own before the aggregate.
    assert_eq!(usage.cache_read_tokens, 2000);
    assert_eq!(
        usage.cache_write_tokens, 5000,
        "write read from cache_write_tokens"
    );
    assert_eq!(
        usage.uncached_input_tokens, 8000,
        "subtract only the cache-read subset (10000 − 2000), NOT the write"
    );

    // The disjoint buckets add up: 8000 + 2000 + 5000 = 15000.
    let total_input = usage.prompt_tokens();
    assert_eq!(
        total_input, 15000,
        "writes are additive, so total input exceeds prompt_tokens"
    );

    // Every bucket is priced at its own $/Mtok rate (input 3, read 0.3,
    // write 3.75, output 15):
    //   8000×3 + 2000×0.3 + 5000×3.75 + 100×15
    //     = 24000 + 600 + 18750 + 1500 = 44850 micro-$  ⇒  $0.04485.
    // Subtracting read+write instead would leave uncached = 3000 and
    // undercharge the input.
    let rates = TokenPrice::new(3.0, 15.0).with_cache_rates(0.3, 3.75);
    let usd = rates.cost_of(&usage);
    assert!((usd - 0.04485).abs() < 1e-9, "got {usd}");
}
758
#[test]
fn openai_wire_cached_exceeding_prompt_reports_unknown_not_a_fabricated_split() {
    // Both scenarios share one wire shape (prompt 10, completion 5) and differ
    // only in how many tokens the provider claims were cache reads.
    let wire_with_cached = |cached: u64| {
        serde_json::json!({
            "usage": {
                "prompt_tokens": 10,
                "completion_tokens": 5,
                "prompt_tokens_details": {"cached_tokens": cached}
            }
        })
    };

    // The disjoint split relies on cache READS being a subset of
    // prompt_tokens. A wire reporting cached > prompt breaks that assumption,
    // and any derived split would be a silently-wrong cost, so parse_usage has
    // to refuse (None → Unknown) rather than clamp.
    let impossible = wire_with_cached(15);
    let rejected = acct().parse_usage(&impossible);
    assert!(
        rejected.is_none(),
        "cached > prompt must yield no usage (Unknown cost), not a clamped split"
    );

    // Boundary: cached == prompt is still a legal subset (the whole prompt was
    // a cache hit), so it parses with zero uncached tokens.
    let fully_cached = wire_with_cached(10);
    let usage = acct()
        .parse_usage(&fully_cached)
        .expect("cached == prompt is valid");
    assert_eq!(usage.uncached_input_tokens, 0);
    assert_eq!(usage.cache_read_tokens, 10);
}
789
#[test]
fn error_object_ignores_benign_falsy_error_fields() {
    // A clean response/chunk carrying a benign `error` value (null, {}, false,
    // 0, "") must NOT be treated as a failure: that would fail a good stream
    // and lose an accepted generation's cost. Only a non-empty error OBJECT
    // counts as an error, for both the OpenAI and the Anthropic detector.
    for benign in [
        serde_json::json!({"error": null}),
        serde_json::json!({"error": {}}),
        serde_json::json!({"error": false}),
        serde_json::json!({"error": 0}),
        serde_json::json!({"error": ""}),
        // A string/array `error` is not the real error envelope (every provider
        // sends a non-empty OBJECT); treat it as non-error, matching the wire.
        serde_json::json!({"error": "some string"}),
        serde_json::json!({"error": ["a", "b"]}),
        // No `error` key at all.
        serde_json::json!({"id": "gen-1"}),
    ] {
        assert!(
            openai_error_in(&benign).is_none(),
            "benign error field must not be an error: {benign}"
        );
        // Name the payload here too: inside a loop over eight fixtures a bare
        // assertion failure would not say which one tripped the detector.
        assert!(
            anthropic_error_in(&benign).is_none(),
            "benign error field must not be an Anthropic error: {benign}"
        );
    }

    // A real (non-empty) error object IS detected by both detectors.
    let real = serde_json::json!({"error": {"message": "boom"}});
    assert!(
        openai_error_in(&real).is_some(),
        "non-empty error object must be detected (OpenAI): {real}"
    );
    assert!(
        anthropic_error_in(&real).is_some(),
        "non-empty error object must be detected (Anthropic): {real}"
    );
}
818
#[test]
fn accumulate_tool_call_deltas_merges_by_index() {
    use serde_json::json;

    // Two streamed fragments of the SAME call (both index 0): the first
    // carries id, type, name and the start of the arguments string; the second
    // carries only the rest of the arguments.
    let opening = json!({
        "index": 0, "id": "call_1", "type": "function",
        "function": {"name": "search", "arguments": "{\"q\":"}
    });
    let continuation = json!({
        "index": 0, "function": {"arguments": "\"rust\"}"}
    });

    // Feed them one delta batch at a time, as a stream would.
    let mut merged = Vec::new();
    for fragment in [opening, continuation] {
        accumulate_tool_call_deltas(&mut merged, &[fragment]);
    }

    // One slot results, with identity from the first fragment and the
    // argument pieces concatenated in arrival order.
    assert_eq!(merged.len(), 1);
    let call = &merged[0];
    assert_eq!(call["id"], "call_1");
    assert_eq!(call["function"]["name"], "search");
    assert_eq!(call["function"]["arguments"], "{\"q\":\"rust\"}");
}
840
#[test]
fn accumulate_tool_call_deltas_appends_two_distinct_calls() {
    use serde_json::json;

    // A single delta batch announcing two different calls (index 0 and 1).
    let batch = [
        json!({"index": 0, "id": "c0", "function": {"name": "a"}}),
        json!({"index": 1, "id": "c1", "function": {"name": "b"}}),
    ];

    let mut calls = Vec::new();
    accumulate_tool_call_deltas(&mut calls, &batch);

    // Each index gets its own slot, in index order.
    assert_eq!(calls.len(), 2);
    assert_eq!(calls[0]["id"], "c0");
    assert_eq!(calls[1]["id"], "c1");
}
855
#[test]
fn accumulate_tool_call_deltas_rejects_out_of_range_and_missing_index() {
    use serde_json::json;

    let mut calls = Vec::new();

    // An enormous index must be skipped outright rather than resizing the
    // accumulator to reach it.
    let far_away = json!({"index": 4_000_000_000u64, "id": "x"});
    accumulate_tool_call_deltas(&mut calls, &[far_away]);
    assert!(calls.is_empty(), "out-of-order high index must be skipped");

    // A delta with no index at all must not be folded into slot 0.
    let unindexed = json!({"id": "y"});
    accumulate_tool_call_deltas(&mut calls, &[unindexed]);
    assert!(
        calls.is_empty(),
        "missing index must be skipped, not coerced to 0"
    );
}
872
#[test]
fn parse_stream_chunk_done_marker() {
    // The literal `[DONE]` sentinel parses to a chunk whose finish reason is
    // "stop".
    let parsed = acct().parse_chunk("[DONE]");
    let terminal = parsed.unwrap().unwrap();
    assert_eq!(terminal.finish_reason.as_deref(), Some("stop"));
}
878
#[test]
fn parse_stream_chunk_extracts_real_generation_id() {
    // The top-level `id` of a chunk has to be carried through, so that cost
    // resolution on cancellation targets the real generation rather than a
    // placeholder.
    let frame = r#"{"id":"gen-abc","choices":[{"delta":{"content":"hi"}}]}"#;
    let parsed = acct().parse_chunk(frame);
    let chunk = parsed.unwrap().unwrap();

    assert_eq!(chunk.id.as_deref(), Some("gen-abc"));
    assert_eq!(chunk.delta, "hi");
}
890
#[test]
fn openai_in_band_error_chunk_surfaces_as_err() {
    use crate::error::MiniLLMError;

    // A 200 stream can still emit a top-level `{"error":...}` frame. The chunk
    // path must turn that into a loud Err instead of swallowing it as None, so
    // a failed generation is never billed as accepted.
    let error_frame = r#"{"error":{"message":"overloaded","code":503}}"#;
    let out = acct()
        .parse_chunk(error_frame)
        .expect("error frame must produce Some(Err), not None");

    // The frame's code and message are expected in the Api error.
    match out {
        Err(MiniLLMError::Api { status, message }) => {
            assert_eq!(status, 503);
            assert_eq!(message, "overloaded");
        }
        other => panic!("expected Some(Err(Api)), got {other:?}"),
    }
}
907
908    // ---- Anthropic envelope ---------------------------------------------------
909
#[test]
fn anthropic_response_joins_text_blocks_and_parses_usage() {
    use serde_json::json;

    // A complete Messages-style body: two text blocks, a stop reason, and a
    // usage object that includes cache reads.
    let body = json!({
        "id": "msg_1",
        "model": "claude-haiku-4-5",
        "content": [{"type": "text", "text": "Hello "}, {"type": "text", "text": "world"}],
        "stop_reason": "end_turn",
        "usage": {"input_tokens": 9, "output_tokens": 4, "cache_read_input_tokens": 2}
    });
    let response = parse_anthropic_response(body).unwrap();

    // Envelope fields: the text blocks are concatenated in order.
    assert_eq!(response.id, "msg_1");
    assert_eq!(response.content, "Hello world");
    assert_eq!(response.finish_reason.as_deref(), Some("end_turn"));

    // Usage: on this wire input_tokens (9) does NOT include cached tokens, so
    // cache_read (2) is an additional bucket rather than a subset.
    let usage = response.usage.expect("usage parsed");
    assert_eq!(usage.uncached_input_tokens, 9);
    assert_eq!(usage.cache_read_tokens, 2);
    assert_eq!(usage.cache_write_tokens, 0);
    let total_input = usage.prompt_tokens();
    assert_eq!(total_input, 11, "total input = 9 uncached + 2 cache-read");
    assert_eq!(usage.completion_tokens, 4);
    assert_eq!(usage.total_tokens(), 15);
    assert!(usage.cost.is_none(), "Anthropic never returns a dollar cost");
}
937
#[test]
fn anthropic_response_threads_tool_use_blocks() {
    use serde_json::json;

    // A body mixing a text block with a tool_use block.
    let body = json!({
        "id": "msg_2", "model": "m",
        "content": [
            {"type": "text", "text": "calling"},
            {"type": "tool_use", "id": "tu_1", "name": "get_weather",
             "input": {"city": "Paris"}}
        ],
        "stop_reason": "tool_use",
        "usage": {"input_tokens": 5, "output_tokens": 2}
    });
    let response = parse_anthropic_response(body).unwrap();

    // Only the text block contributes to `content`.
    assert_eq!(response.content, "calling");

    // The tool_use block surfaces as a tool call in the OpenAI-normalized
    // shape: the structured `input` becomes a JSON *string* argument.
    let tool_calls = response.tool_calls.expect("tool_use threaded");
    let function = &tool_calls[0]["function"];
    assert_eq!(function["name"], "get_weather");
    assert_eq!(function["arguments"], r#"{"city":"Paris"}"#);
}
957
#[test]
fn anthropic_response_surfaces_error_body_loudly() {
    use crate::error::MiniLLMError;

    // An error envelope in place of a message body must come back as an Api
    // error carrying the provider's message.
    let error_body = serde_json::json!({"type": "error",
        "error": {"type": "overloaded_error", "message": "overloaded"}});
    let failure = parse_anthropic_response(error_body).unwrap_err();

    match failure {
        MiniLLMError::Api { message, .. } => assert_eq!(message, "overloaded"),
        other => panic!("expected Api error, got {other:?}"),
    }
}
967
#[test]
fn anthropic_response_rejects_missing_content() {
    // A body with an id and model but no `content` is not a usable response.
    let contentless = serde_json::json!({"id": "x", "model": "m"});
    let outcome = parse_anthropic_response(contentless);
    assert!(outcome.is_err());
}
973
#[test]
fn anthropic_chunk_message_start_carries_id_and_input_usage() {
    // `message_start` nests both the message id and the input-side usage
    // under `message`; both must reach the parsed chunk.
    let event = r#"{"type":"message_start","message":{"id":"msg_9","usage":{"input_tokens":15,"output_tokens":1}}}"#;
    let chunk = parse_anthropic_chunk(event).unwrap().unwrap();

    assert_eq!(chunk.id.as_deref(), Some("msg_9"));
    let usage = chunk.usage.as_ref().unwrap();
    assert_eq!(usage.uncached_input_tokens, 15);
}
984
#[test]
fn anthropic_chunk_content_delta_carries_text() {
    // A text_delta inside content_block_delta yields its text as the chunk
    // delta.
    let text_event = r#"{"type":"content_block_delta","delta":{"type":"text_delta","text":"hi"}}"#;
    let chunk = parse_anthropic_chunk(text_event).unwrap().unwrap();
    assert_eq!(chunk.delta, "hi");

    // Events that carry no text produce no chunk at all.
    let silent_events = [r#"{"type":"content_block_start"}"#, r#"{"type":"ping"}"#];
    for silent in silent_events {
        assert!(parse_anthropic_chunk(silent).is_none());
    }
}
997
#[test]
fn anthropic_chunk_message_delta_carries_stop_and_output_usage() {
    // `message_delta` delivers the stop reason and the output-side usage.
    let delta_event =
        r#"{"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":9}}"#;
    let delta_chunk = parse_anthropic_chunk(delta_event).unwrap().unwrap();
    assert_eq!(delta_chunk.finish_reason.as_deref(), Some("end_turn"));
    let usage = delta_chunk.usage.as_ref().unwrap();
    assert_eq!(usage.completion_tokens, 9);

    // `message_stop` is the terminator and reports a "stop" finish reason.
    let stop_event = r#"{"type":"message_stop"}"#;
    let stop = parse_anthropic_chunk(stop_event).unwrap().unwrap();
    assert_eq!(stop.finish_reason.as_deref(), Some("stop"));
}
1013
#[test]
fn anthropic_in_band_error_event_surfaces_as_err() {
    use crate::error::MiniLLMError;

    // Regression guard for a production failure: a 200 stream that emits an
    // `error` event used to fall through a silent `_ => None`. It must now be
    // a loud Err so cost accounting sees the failure and books nothing, the
    // same way parse_anthropic_response treats an error body.
    let error_event =
        r#"{"type":"error","error":{"type":"overloaded_error","message":"overloaded"}}"#;
    let out = parse_anthropic_chunk(error_event)
        .expect("error event must produce Some(Err), not None");

    match out {
        Err(MiniLLMError::Api { message, .. }) => {
            assert_eq!(message, "overloaded");
        }
        other => panic!("expected Some(Err(Api)), got {other:?}"),
    }
}
1030
#[test]
fn usage_merge_accumulates_split_input_and_output() {
    // Anthropic reports usage in two pieces: the input side arrives with
    // message_start and the output side with message_delta. Merging the second
    // into the first must keep the input, take the new output, and yield a
    // total that reflects both.
    let from_message_start = Usage {
        uncached_input_tokens: 15,
        completion_tokens: 1,
        ..Usage::default()
    };
    let from_message_delta = Usage {
        uncached_input_tokens: 0,
        completion_tokens: 9,
        ..Usage::default()
    };

    let mut acc = from_message_start;
    acc.merge_from(&from_message_delta);

    assert_eq!(
        acc.uncached_input_tokens, 15,
        "input from message_start preserved"
    );
    assert_eq!(
        acc.completion_tokens, 9,
        "output from message_delta applied"
    );
    let grand_total = acc.total_tokens();
    assert_eq!(grand_total, 24, "total recomputed from merged buckets");
}
1060}