phi-core 0.7.0

Simple, effective agent loop with tool execution and event streaming
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
use crate::types::*;
use async_trait::async_trait;
use tokio::sync::mpsc;

use super::model::ModelConfig;

/*
ARCHITECTURE: The Provider Layer

This module defines the core abstraction for ALL LLM providers:

  StreamProvider trait  — the interface every provider must implement
  StreamEvent enum      — the event protocol sent through the channel
  StreamConfig struct   — the input to every provider call
  ProviderError enum    — the error taxonomy

Why streaming via a channel instead of returning a Vec of events?
Because streaming gives real-time UI updates. The user sees tokens as they arrive,
not after the entire response. An mpsc channel is the natural async Rust primitive
for this producer-consumer split.

The dual-output pattern:
  provider.stream(config, tx, cancel) → Future<Result<Message, Error>>
                          ↑                     ↑
               sends StreamEvents        returns final Message
               in real-time            after stream completes

The channel carries partial deltas; the return value carries the complete message.
*/

/// Events emitted during LLM streaming.
/*
ARCHITECTURE: `content_index` in delta events

LLM responses can contain MULTIPLE content blocks in one message:
  [Thinking("..."), Text("Hello"), ToolCall({id: "x", name: "bash", args: {...}})]

`content_index` identifies WHICH block a delta belongs to.
Without it, interleaved deltas from parallel content blocks would be ambiguous.

Example for an extended-thinking response:
  ThinkingDelta { content_index: 0, delta: "Let me " }
  ThinkingDelta { content_index: 0, delta: "think..." }
  TextDelta     { content_index: 1, delta: "Here's " }
  TextDelta     { content_index: 1, delta: "my answer." }
  ToolCallStart { content_index: 2, id: "call_1", name: "bash" }
  ToolCallDelta { content_index: 2, delta: "{\"cmd\":" }
  ToolCallEnd   { content_index: 2 }
  Done          { message: (complete Message) }
*/
#[derive(Debug, Clone)]
pub enum StreamEvent {
    /// Stream started — the LLM has begun generating. Consumers should create a placeholder.
    /// Providers are expected to send this when the stream begins.
    Start,
    /// A text token from the response text.
    /// `content_index` identifies the content block the fragment belongs to;
    /// `delta` is the newly arrived text.
    TextDelta { content_index: usize, delta: String },
    /// A chunk from the model's chain-of-thought (extended thinking mode only).
    /// Same field meaning as `TextDelta`, but for a thinking block.
    ThinkingDelta { content_index: usize, delta: String },
    /// The LLM began a tool call — id and name are now known.
    /// Argument JSON follows as `ToolCallDelta` events with the same `content_index`.
    ToolCallStart {
        content_index: usize,
        id: String,
        name: String,
    },
    /// A JSON fragment for a tool call's arguments (accumulate until ToolCallEnd).
    /// A single fragment is not necessarily valid JSON on its own.
    ToolCallDelta { content_index: usize, delta: String },
    /// The tool call's argument JSON is complete.
    /// The fragments accumulated for `content_index` can now be treated as a whole.
    ToolCallEnd { content_index: usize },
    /// Stream completed successfully. `message` is the final complete Message.
    /// Sent at the end of a stream (instead of `Error`).
    Done { message: Message },
    /// Stream failed. `message` is a synthetic error Message with stop_reason=Error.
    /// Sent at the end of a stream (instead of `Done`).
    Error { message: Message },
}

/// Configuration for a streaming LLM call
/*
ARCHITECTURE: StreamConfig — the "envelope" passed into every provider call

Every `StreamProvider::stream()` call receives exactly one `StreamConfig`.
It bundles everything the provider needs to make one API request:
  - model_config — the complete provider identity: id, api_key, base_url, compat flags
  - messages / system_prompt / tools — the conversation payload
  - thinking_level / max_tokens / temperature — per-call generation overrides
  - cache_config — whether to send prompt-caching headers

`model_config` is required (non-optional). Every provider reads at minimum
`model_config.id` (model name) and `model_config.api_key` (auth credential).
Providers with custom endpoints also read `model_config.base_url`, `model_config.headers`,
and (for OpenAI-compat) `model_config.compat`.

Why not pass individual arguments?
  If `stream()` took 10 positional parameters it would be unergonomic and break
  callers every time we added a field. A config struct is extensible: adding a
  field is backward-compatible if the caller can use `Default::default()` for it.
  Python analogy: kwargs dict passed to a function, or a dataclass payload.

RUST QUIRK: `Option<u32>` and `Option<f32>` — "nullable" fields
  Rust has no null. `Option<T>` is an explicit "maybe absent" wrapper:
    `None`    → caller didn't set a value; provider uses its own default
    `Some(v)` → caller explicitly overrides the value
  Python analogy: `max_tokens: int | None = None`
*/
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Complete provider identity: model id, api_key, base_url, compat flags, cost rates.
    /// All providers read `model_config.id` and `model_config.api_key`; most also read
    /// `model_config.base_url` and `model_config.headers`.
    pub model_config: ModelConfig,
    /// System prompt for this call (part of the conversation payload).
    pub system_prompt: String,
    /// Conversation messages for this call (part of the conversation payload).
    pub messages: Vec<Message>,
    /// Schemas of the tools offered to the LLM; schema only, nothing executable.
    pub tools: Vec<ToolDefinition>,
    /// Per-call thinking level (a generation override, like `max_tokens` / `temperature`).
    pub thinking_level: ThinkingLevel,
    /// Per-call output token limit. `None` means the caller did not set a value.
    pub max_tokens: Option<u32>, // overrides model_config.max_tokens when Some
    /// Per-call sampling temperature. `None` leaves the choice to the provider's default.
    pub temperature: Option<f32>,
    /// Prompt caching configuration. Default: enabled with auto strategy.
    pub cache_config: CacheConfig,
    /// Desired output shape. `Text` (the default) preserves the historical behaviour;
    /// `JsonObject` / `JsonSchema` request constrained JSON output from providers that
    /// support it natively (OpenAI, Google) or via tool-call emulation (Anthropic).
    /// Bedrock surfaces `ProviderError::SchemaMismatch` when set on a non-Anthropic
    /// foundation model that lacks structured-output support. See the capability matrix
    /// in `docs/specs/developer/provider.md` for per-provider coverage.
    pub response_format: ResponseFormat,
}

/// Desired output shape for an LLM call.
///
/// Default `Text` matches the historical free-form text behaviour. `JsonObject`
/// constrains output to syntactically valid JSON with no schema enforcement;
/// `JsonSchema` adds strict-shape enforcement when the provider supports it.
///
/// `Message::extract_json::<T>()` is the recommended way to parse the resulting
/// assistant message back into a typed value — it handles both native JSON-mode
/// output (text content is JSON) and tool-call emulation (arguments JSON of a
/// well-known synthetic tool) uniformly.
#[derive(Debug, Clone, Default)]
pub enum ResponseFormat {
    /// Free-form text. Default; providers ignore the field entirely.
    // `#[default]` makes this the variant produced by `ResponseFormat::default()`.
    #[default]
    Text,
    /// Constrain output to valid JSON; no schema enforcement.
    ///
    /// Maps to:
    /// - OpenAI Completions / Responses / Azure: `response_format: { type: "json_object" }`
    /// - Google GenAI / Vertex: `responseMimeType: "application/json"`
    /// - Anthropic / Bedrock-Anthropic: a synthetic `respond_json` tool with an
    ///   empty-shape schema; the LLM is forced to call it with its answer
    /// - Bedrock non-Anthropic: not supported — provider returns `SchemaMismatch`
    JsonObject,
    /// Strict JSON Schema enforcement. The schema is forwarded to the provider when
    /// supported natively; otherwise emulated via tool-call shape (Anthropic).
    JsonSchema {
        /// JSON Schema (Draft 2020-12 compatible) describing the expected output.
        /// Held as a dynamic `serde_json::Value` because the shape varies per call.
        schema: serde_json::Value,
        /// Human-readable schema name (some providers use this in error messages).
        name: String,
        /// Whether the provider should enforce strict shape (no extra fields).
        /// Some providers' strict mode disables defaults — fall back to non-strict
        /// if unsupported.
        strict: bool,
    },
}

/// Tool definition sent to the LLM (schema only, no execute fn)
/*
ARCHITECTURE: ToolDefinition — the schema half of a tool

Every tool has two sides:
  1. `AgentTool` (types.rs) — the Rust struct that EXECUTES the tool (has code)
  2. `ToolDefinition` (here)  — the JSON schema that gets SENT TO THE LLM

When we call `provider.stream(config, ...)`, only `ToolDefinition` goes to the API.
The LLM never sees executable code — it only sees name/description/parameters so it
can decide whether to call the tool and how to format the arguments.

The separation exists because:
  - The provider layer is pure I/O; it doesn't execute tools
  - ToolDefinition is serializable (goes over the wire); AgentTool is not
  - `agent_loop.rs` bridges them: it converts AgentTool → ToolDefinition before
    calling stream(), then receives ToolCall content and finds the matching AgentTool

RUST QUIRK: `serde_json::Value` — a dynamically typed JSON tree
  JSON doesn't map to a fixed Rust type. `serde_json::Value` is an enum that
  can hold any valid JSON structure:
    Value::Object(Map<String, Value>)
    Value::Array(Vec<Value>)
    Value::String(String)
    Value::Number(Number)  — wraps i64/u64/f64
    Value::Bool(bool)
    Value::Null

  Tool parameters are represented as a JSON Schema object — a dynamic shape
  that varies per tool — so `serde_json::Value` is the right type here.

RUST QUIRK: `#[derive(Serialize, Deserialize)]`
  Requires the `serde` + `serde_json` crates.
  `Serialize`   → can convert this struct TO JSON (for sending to APIs)
  `Deserialize` → can reconstruct this struct FROM JSON (for round-tripping)
  Python analogy: combining json.dumps() and json.loads() support automatically.
*/
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolDefinition {
    /// Tool name shown to the LLM; the LLM uses it to decide which tool to call.
    pub name: String,
    /// Description shown to the LLM so it can decide whether to call the tool.
    pub description: String,
    /// JSON Schema object describing the tool's parameters.
    /// LLMs use this schema to know what arguments to pass when calling the tool.
    pub parameters: serde_json::Value,
}

use serde::{Deserialize, Serialize};

/// The core provider trait. Implement this for each LLM backend.
/*
ARCHITECTURE: StreamProvider — the single extension point for ALL LLM backends

Every LLM backend (Anthropic, OpenAI, Google, Bedrock, Azure, ...) implements
this one trait. The rest of the codebase interacts only with `&dyn StreamProvider`
— it never knows which concrete backend is being used at runtime.

This is the "Strategy" pattern: swap the provider, keep everything else constant.

The dual-output contract:
  1. `tx` (mpsc channel) — sends StreamEvents in real time as they arrive
     Consumers subscribe to this channel to update the UI with partial tokens.
  2. Return value `Result<Message, ProviderError>` — the fully assembled Message
     Only available after the stream completes. Contains the complete response.

Why both? Because `Message` is only complete when the stream ends, but the UI
needs to show tokens as they arrive (low latency). The channel handles the
"streaming display" concern; the return value handles the "final record" concern.

RUST QUIRK: `Send + Sync` trait bounds — thread safety requirements
  `Send`  → values of this type can be transferred across thread boundaries
  `Sync`  → references (&T) can be shared across thread boundaries simultaneously

  Why required on StreamProvider?
    The provider is stored as `Arc<dyn StreamProvider>` and accessed from
    async tasks that may run on different OS threads in the tokio thread pool.
    Without `Send + Sync`, the compiler would reject this as unsafe.

  What do they PREVENT?
    `Rc<T>` is not `Send` (non-atomic reference count, unsafe to move between threads)
    `RefCell<T>` is not `Sync` (non-atomic borrow flag, unsafe to share between threads)
    The bounds ensure implementations can't accidentally use these.

RUST QUIRK: `#[async_trait]` — async methods in traits
  Native `async fn` in traits (stable since Rust 1.75) cannot be used here: a trait
  with an `async fn` is not dyn-compatible, because `async fn` returns an anonymous
  `impl Future<Output=T>` — each implementation would return a DIFFERENT type,
  violating the uniform vtable layout required by `dyn Trait`.

  `#[async_trait]` is a procedural macro from the `async-trait` crate that desugars:
    async fn stream(&self, ...) -> Result<...>
  into:
    fn stream(&self, ...) -> Pin<Box<dyn Future<Output=Result<...>> + Send + '_>>

  The `Pin<Box<dyn Future...>>` is a heap-allocated, type-erased future — same type
  for every implementation, so the vtable works. The `Send` bound ensures the future
  itself is thread-safe (can be awaited on any tokio thread).
  Python analogy: an abstract async method that subclasses override.
*/
#[async_trait]
pub trait StreamProvider: Send + Sync {
    /// Short, stable identifier for this provider type.
    ///
    /// Used as the `provider_id` component of auto-derived `loop_id` signatures:
    ///   `loop_id = "{session_id}.{provider_id}.{model_slug}.{N}"`
    ///
    /// Return a lowercase ASCII string with no spaces (e.g. `"anthropic"`, `"openai"`, `"google"`).
    /// Custom providers should return a unique, stable string.
    fn provider_id(&self) -> &str;

    /// Stream a completion. Send events through `tx` in real time.
    /// Returns the final, fully-assembled assistant `Message` after the stream ends.
    ///
    /// Arguments:
    /// - `config` — everything needed for one request (model, messages, tools, auth),
    ///   taken by value.
    /// - `tx` — unbounded channel sender on which `StreamEvent`s are pushed as they arrive.
    /// - `cancel` — cancellation token the implementation is expected to check.
    ///
    /// Implementors must:
    /// - Send `StreamEvent::Start` when the stream begins
    /// - Send `StreamEvent::TextDelta` / `ThinkingDelta` / `ToolCall*` as tokens arrive
    /// - Send `StreamEvent::Done { message }` or `StreamEvent::Error { message }` at the end
    /// - Honor `cancel` — stop early and return `Err(ProviderError::Cancelled)`
    async fn stream(
        &self,
        config: StreamConfig, // ALL REQUEST PARAMS — model, messages, tools, auth (bundled to avoid 10-arg signature)
        tx: mpsc::UnboundedSender<StreamEvent>, // OBSERVER — push StreamEvents here in real-time as tokens arrive
        cancel: tokio_util::sync::CancellationToken, // ABORT — check this; return Err(Cancelled) if triggered
    ) -> Result<Message, ProviderError>; // final fully-assembled Message (only available after stream ends)
}

/*
RUST QUIRK: `thiserror::Error` derive — auto-implementing `std::error::Error`

`std::error::Error` is the standard Rust error trait. Manually implementing it
requires also implementing `Display` and optionally `source()`. Boilerplate.

`thiserror` is a macro crate that generates all three from annotations:
  `#[error("API error: {0}")]` on a tuple variant:
    → Display impl: format!("API error: {}", self.0)
    → The {0} refers to the first (unnamed) field of the tuple variant.

  `#[error("Rate limited, retry after {retry_after_ms:?}ms")]` on a struct variant:
    → Display impl using the named field `retry_after_ms`
    → {:?} uses Debug formatting on the Option<u64> → "Some(60000)" or "None"

  `#[derive(thiserror::Error)]` also requires `#[derive(Debug)]` (already present).

Python analogy:
  class ProviderError(Exception):
      pass
  class ApiError(ProviderError):
      def __str__(self): return f"API error: {self.message}"

ARCHITECTURE: ProviderError variants — the error taxonomy

Variants map to HTTP status codes + semantic categories:
  `Api`            — 4xx/5xx errors that are NOT special (bad request, server error)
  `Network`        — Transport failures: connection refused, timeout, TLS error
  `Auth`           — 401/403 — bad or missing API key
  `RateLimited`    — 429 — too many requests; includes optional server-specified delay
  `ContextOverflow`— input too long for the model's context window
  `Cancelled`      — CancellationToken was triggered by the caller
  `Other`          — catch-all for anything that doesn't fit
  `SchemaMismatch` — requested `ResponseFormat` unsupported, or JSON extraction failed

Why a flat enum rather than a hierarchy?
  The agent loop has a simple decision tree:
    is_retryable() → retry (RateLimited, Network)
    is_context_overflow() → try compaction, then give up
    is Cancelled → clean shutdown
    everything else → surface to caller as failure
  A flat enum with methods makes this dispatch cheap and exhaustive.
*/
/// Error type returned by `StreamProvider::stream` and produced by
/// `ProviderError::classify` from an HTTP status code and response body.
///
/// The `#[error("...")]` attribute on each variant is its `Display` text.
#[derive(Debug, thiserror::Error)]
pub enum ProviderError {
    /// A non-transient API error (bad request, server error, etc.).
    #[error("API error: {0}")]
    Api(String),
    /// Network/transport failure — connection refused, timeout, TLS error, etc.
    #[error("Network error: {0}")]
    Network(String),
    /// Authentication failure — bad or missing API key (HTTP 401/403).
    #[error("Auth error: {0}")]
    Auth(String),
    /// Rate limit hit (HTTP 429). `retry_after_ms` is the server-specified delay if present.
    // NOTE: the `Option` is rendered with `{:?}`, so the text reads
    // "retry after Some(60000)ms" or "retry after Nonems".
    #[error("Rate limited, retry after {retry_after_ms:?}ms")]
    RateLimited { retry_after_ms: Option<u64> },
    /// Input exceeds the model's context window. Caller should compact and retry.
    /// `message` holds the response body text that triggered the classification.
    #[error("Context overflow: {message}")]
    ContextOverflow { message: String },
    /// The caller cancelled the request via `CancellationToken`.
    #[error("Cancelled")]
    Cancelled,
    /// Catch-all for errors that don't fit another category.
    #[error("{0}")]
    Other(String),
    /// Returned by structured-output paths when the requested `ResponseFormat` is
    /// unsupported by the provider, or when extracting JSON from a response fails
    /// (`Message::extract_json::<T>()` returns this on parse / deserialise errors).
    #[error("Schema mismatch: {reason}")]
    SchemaMismatch { reason: String },
}

impl ProviderError {
    /// Classify an HTTP error response into the appropriate error variant.
    ///
    /// Precedence (first match wins):
    /// 1. `429` → [`ProviderError::RateLimited`]. Checked first so that throttling
    ///    bodies which happen to contain an overflow phrase (e.g. "Too many tokens,
    ///    please wait before trying again.") are retried instead of being treated
    ///    as context overflow.
    /// 2. Context overflow (a 400/413 with a blank body, or a known overflow phrase
    ///    in the body) → [`ProviderError::ContextOverflow`].
    /// 3. `401` / `403` → [`ProviderError::Auth`].
    /// 4. Anything else → [`ProviderError::Api`].
    ///
    /// `retry_after_ms` is always `None` here: only the status code and the body are
    /// passed in, not the response headers.
    pub fn classify(
        status: u16,   // HTTP status code — 429, 401, 403, 400, 413, 5xx
        message: &str, // response body text — checked for overflow phrases; may be empty (Cerebras quirk)
    ) -> Self {
        match status {
            // Rate limiting is decided by status alone; the body must not override it.
            429 => Self::RateLimited {
                retry_after_ms: None,
            },
            _ if is_context_overflow(status, message) => Self::ContextOverflow {
                message: message.to_string(),
            },
            401 | 403 => Self::Auth(message.to_string()),
            _ => Self::Api(message.to_string()),
        }
    }

    /// Returns true if this error is the [`ProviderError::ContextOverflow`] variant.
    pub fn is_context_overflow(&self) -> bool {
        matches!(self, Self::ContextOverflow { .. })
    }
}

/// Known phrases that indicate context overflow across LLM providers.
///
/// Covers: Anthropic, OpenAI, Google Gemini, AWS Bedrock, xAI, Groq,
/// OpenRouter, llama.cpp, LM Studio, MiniMax, Kimi, GitHub Copilot,
/// and generic patterns.
/*
ARCHITECTURE: Centralised overflow detection — one place, all providers

Context overflow is a universal problem: every LLM has a finite token window.
But every provider expresses overflow differently:
  Anthropic: "prompt is too long: 213462 tokens > 200000 maximum"
  OpenAI:    "Your input exceeds the context window of this model"
  Gemini:    "The input token count (1196265) exceeds the maximum number of tokens allowed"
  Groq:      "Please reduce the length of the messages or completion"
  ...

Centralising these phrases in ONE constant means:
  1. Every provider uses `ProviderError::classify()` — no duplication
  2. Adding a new provider = adding one phrase to this array
  3. The agent loop only checks `is_context_overflow()` — doesn't know which provider

RUST QUIRK: `const OVERFLOW_PHRASES: &[&str]` — a compile-time constant

`const` — value is inlined at compile time (not a runtime allocation).
  The array lives in the binary's read-only data segment (`.rodata`).
  Python analogy: a module-level tuple of strings, but truly immutable.

`&[&str]` — a slice of string slices (two levels of reference):
  `&str`  — a reference to a string (UTF-8 bytes, stored somewhere)
  `&[T]`  — a "fat pointer" to a contiguous sequence of T (pointer + length)
  `&[&str]` — a reference to a sequence of `&str` items

  The string literals ("prompt is too long") are `&'static str` — they live
  forever in the binary, so no allocation, no lifetime issues.

Why not `Vec<String>`?
  `Vec<String>` is heap-allocated and built at runtime. A `const &[&str]` is
  zero runtime cost — the data is baked into the binary at compile time.

RUST QUIRK: `&[&str]` as the type for array literals
  You might expect `const X: [&str; 16] = [...]` (fixed-size array), but
  `&[&str]` (slice reference) is more ergonomic — the length is encoded in the
  fat pointer, not the type. Functions that iterate over it don't need to be
  generic over the array length.
*/
// Every entry must be lowercase: `is_context_overflow_message` lowercases the incoming
// message (not the phrases) before doing a plain substring search.
// The trailing comment on each line names the provider the phrase was taken from.
const OVERFLOW_PHRASES: &[&str] = &[
    "prompt is too long",                 // Anthropic
    "input is too long",                  // AWS Bedrock
    "exceeds the context window",         // OpenAI (Completions & Responses)
    "exceeds the maximum",                // Google Gemini ("input token count exceeds the maximum")
    "maximum prompt length",              // xAI
    "reduce the length of the messages",  // Groq
    "maximum context length",             // OpenRouter
    "exceeds the limit of",               // GitHub Copilot
    "exceeds the available context size", // llama.cpp
    "greater than the context length",    // LM Studio
    "context window exceeds limit",       // MiniMax
    "exceeded model token limit",         // Kimi
    "context length exceeded",            // Generic
    "context_length_exceeded",            // Generic (underscore variant)
    "too many tokens",                    // Generic
    "token limit exceeded",               // Generic
];

/// Check if an error message indicates context overflow (for use by types.rs).
/*
RUST QUIRK: `pub(crate)` — "public within this crate only"

`pub(crate)` sits between fully public (`pub`) and private (default).
  - `pub`         → anyone importing this crate can call it
  - `pub(crate)`  → only modules within THIS crate can call it
  - (no modifier) → only this module can call it

`is_context_overflow_message` is needed by `types.rs` (to classify SSE errors
embedded in the stream — not just HTTP status errors) but shouldn't be part of
the public library API. `pub(crate)` is the right scope.

RUST QUIRK: `.iter().any(|phrase| lower.contains(phrase))`
  `.iter()` — returns an iterator over `&&str` (references to the `&str` elements)
  `.any(predicate)` — short-circuits: returns `true` as soon as predicate is true
  `lower.contains(phrase)` — substring search (case-sensitive, but `lower` is already
    lowercased so we get case-insensitive matching for free)
  Python analogy: `any(phrase in lower for phrase in OVERFLOW_PHRASES)`
*/
pub(crate) fn is_context_overflow_message(message: &str) -> bool {
    // The phrase table is stored lowercase, so lowercasing the input once is enough
    // to make the substring search case-insensitive.
    let haystack = message.to_lowercase();
    for needle in OVERFLOW_PHRASES {
        if haystack.contains(*needle) {
            return true; // first hit decides; no need to scan the rest
        }
    }
    false
}

/// Check if an HTTP error response indicates context overflow.
/*
ARCHITECTURE: Two-path overflow detection

Path 1 — Empty body (Cerebras, Mistral quirk):
  Some providers return HTTP 400/413 with an EMPTY body when the input is too long.
  We can't match a phrase, so we infer overflow from (status=400|413) + empty body.

Path 2 — Phrase matching:
  All other providers include a descriptive message. Delegate to is_context_overflow_message().

The two paths are checked in order: empty-body first (cheaper), phrase-match second.

RUST QUIRK: `message.trim().is_empty()`
  `.trim()` removes leading/trailing whitespace, returning a `&str` slice of the original.
  `.is_empty()` returns true if the slice has length 0.
  Together: "is this message blank (or just whitespace)?"
  Python analogy: `not message.strip()`
*/
fn is_context_overflow(
    status: u16,   // HTTP status of the failed response
    message: &str, // response body; may be blank
) -> bool {
    // Path 1: some providers (Cerebras, Mistral) signal overflow with a bare 400/413
    // and no body, so there is no phrase to match.
    let blank_body_overflow = matches!(status, 400 | 413) && message.trim().is_empty();

    // Path 2: otherwise rely on the phrase table.
    blank_body_overflow || is_context_overflow_message(message)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `ProviderError::classify` and reports whether the result counts as overflow.
    fn overflows(status: u16, body: &str) -> bool {
        ProviderError::classify(status, body).is_context_overflow()
    }

    // --- provider-specific overflow bodies -------------------------------------------

    #[test]
    fn classify_anthropic_overflow() {
        assert!(overflows(
            400,
            "prompt is too long: 213462 tokens > 200000 maximum"
        ));
    }

    #[test]
    fn classify_openai_overflow() {
        assert!(overflows(
            400,
            "Your input exceeds the context window of this model"
        ));
    }

    #[test]
    fn classify_google_overflow() {
        assert!(overflows(
            400,
            "The input token count (1196265) exceeds the maximum number of tokens allowed",
        ));
    }

    #[test]
    fn classify_bedrock_overflow() {
        assert!(overflows(400, "input is too long for requested model"));
    }

    #[test]
    fn classify_xai_overflow() {
        assert!(overflows(
            400,
            "This model's maximum prompt length is 131072 but request contains 537812 tokens",
        ));
    }

    #[test]
    fn classify_groq_overflow() {
        assert!(overflows(
            400,
            "Please reduce the length of the messages or completion",
        ));
    }

    #[test]
    fn classify_empty_body_overflow() {
        // Cerebras/Mistral return 400/413 with empty body
        for (status, body) in [(413, ""), (400, "  ")] {
            assert!(overflows(status, body));
        }
    }

    // --- non-overflow classifications -------------------------------------------------

    #[test]
    fn classify_rate_limit() {
        assert!(matches!(
            ProviderError::classify(429, "rate limit exceeded"),
            ProviderError::RateLimited { .. }
        ));
    }

    #[test]
    fn classify_auth_error() {
        for (status, body) in [(401, "invalid api key"), (403, "forbidden")] {
            assert!(matches!(
                ProviderError::classify(status, body),
                ProviderError::Auth(_)
            ));
        }
    }

    #[test]
    fn classify_regular_api_error() {
        let classified = ProviderError::classify(400, "invalid request format");
        assert!(!classified.is_context_overflow());
        assert!(matches!(classified, ProviderError::Api(_)));
    }

    // --- phrase matcher ---------------------------------------------------------------

    #[test]
    fn overflow_message_case_insensitive() {
        for body in ["PROMPT IS TOO LONG", "Too Many Tokens in request"] {
            assert!(is_context_overflow_message(body));
        }
    }

    #[test]
    fn non_overflow_messages() {
        for body in ["invalid api key", "internal server error", ""] {
            assert!(!is_context_overflow_message(body));
        }
    }
}