Skip to main content

sqlite_graphrag/
chat_api.rs

1//! HTTP client for the OpenRouter chat-completions API.
2//!
3//! Sends structured-output chat requests to the OpenAI-compatible endpoint
4//! at `openrouter.ai/api/v1/chat/completions` and returns the parsed JSON
5//! object the model produced under a strict `json_schema` `response_format`.
6//!
7//! This mirrors [`crate::embedding_api`] for the embeddings endpoint: same
8//! retry/backoff policy (immediate abort on 401/400/404, `retry-after` on
9//! 429, exponential backoff + jitter on 5xx) and the same minimal headers
10//! (only `Authorization: Bearer`, no `HTTP-Referer`/`X-Title`). The shared
11//! error envelope and backoff helper live in [`crate::openrouter_http`]
12//! (GAP-SG-74).
13//!
14//! v1.0.95 (ADR-0054): adds an OpenRouter REST transport for the `enrich`
15//! JUDGE so structured extraction no longer requires a locally installed
16//! `claude` / `codex` / `opencode` CLI subprocess.
17//!
18//! v1.1.00 (GAP-SG-70/72-chat): the OpenAI-compatible contract surfaces
19//! `choices[].finish_reason` and `usage.{prompt_tokens,completion_tokens}`.
20//! `finish_reason == "length"` means the response was truncated because
21//! `max_tokens` was too small — not a malformed generation.
22//! [`OpenRouterChatClient::complete`](crate::chat_api::OpenRouterChatClient::complete)
23//! now detects this BEFORE attempting JSON repair, grows `max_tokens` and
24//! re-issues the request (bounded by
25//! [`crate::constants::ENRICH_MAX_LENGTH_RETRIES`]), and always reports the
26//! diagnostics (`finish_reason`, token counts) to the caller via
27//! [`ChatCompletion`](crate::chat_api::ChatCompletion) on success or
28//! [`ChatError`](crate::chat_api::ChatError) on failure.
29
30use crate::errors::AppError;
31use crate::retry::AttemptOutcome;
32use secrecy::{ExposeSecret, SecretBox};
33use serde::{Deserialize, Serialize};
34use std::time::Duration;
35
/// Production endpoint that `OpenRouterChatClient::new` stores as the
/// client's `base_url`.
const OPENROUTER_CHAT_URL: &str = "https://openrouter.ai/api/v1/chat/completions";
// GAP-SG-17: raised from 300 to 600 — the per-request fallback budget when a
// caller passes `0`. Dense bodies near the model's ~32K-token context ceiling
// regularly need more than five minutes to generate.
const DEFAULT_TIMEOUT_SECS: u64 = 600;
/// Connect timeout, in seconds, applied to the `reqwest` client built by
/// `OpenRouterChatClient::new`; unlike the total budget it is not
/// caller-configurable.
const DEFAULT_CONNECT_TIMEOUT_SECS: u64 = 10;

/// Fixed `json_schema` name sent in the `response_format`. OpenRouter only
/// requires a short identifier; the actual contract is carried by `schema`.
const SCHEMA_NAME: &str = "enrich_output";
46
/// JSON body of one chat-completions request, assembled by `build_request`
/// and serialized by `execute_with_retry`.
///
/// `reasoning` and `max_tokens` are omitted from the serialized body entirely
/// when they are `None` (see the `skip_serializing_if` attributes).
#[derive(Serialize)]
struct ChatRequest<'a> {
    /// Model identifier, borrowed from the owning client.
    model: &'a str,
    /// Conversation sent to the model: a system message, optionally followed
    /// by one user message.
    messages: Vec<ChatMessage<'a>>,
    /// Structured-output contract (`json_schema`, strict).
    response_format: ResponseFormat,
    /// Provider routing preferences.
    provider: ProviderPrefs,
    /// Reasoning toggle; `None` leaves the field out of the request.
    #[serde(skip_serializing_if = "Option::is_none")]
    reasoning: Option<ReasoningPrefs>,
    /// Completion token budget; `None` leaves the field out of the request.
    #[serde(skip_serializing_if = "Option::is_none")]
    max_tokens: Option<u32>,
}
58
/// One entry of the request's `messages` array.
#[derive(Serialize)]
struct ChatMessage<'a> {
    /// Speaker role; `build_request` only ever uses `"system"` and `"user"`.
    role: &'a str,
    /// Message text, owned so the request does not borrow the caller's input.
    content: String,
}
64
/// `response_format` object of the request body.
#[derive(Serialize)]
struct ResponseFormat {
    /// Serialized under the key `type`; `build_request` always sets it to
    /// `"json_schema"`.
    #[serde(rename = "type")]
    format_type: &'static str,
    /// Named schema the model output must conform to.
    json_schema: JsonSchemaSpec,
}
71
/// `json_schema` payload nested inside [`ResponseFormat`].
#[derive(Serialize)]
struct JsonSchemaSpec {
    /// Schema identifier; `build_request` always sends `SCHEMA_NAME`.
    name: &'static str,
    /// Strict-mode flag; `build_request` always sends `true`.
    strict: bool,
    /// The JSON Schema itself, parsed from the caller's `schema_str`.
    schema: serde_json::Value,
}
78
/// `provider` object of the request body.
#[derive(Serialize)]
struct ProviderPrefs {
    /// Always sent as `true` by `build_request`.
    // NOTE(review): presumably restricts routing to providers that support
    // every parameter in the request — confirm against the OpenRouter
    // provider-routing documentation.
    require_parameters: bool,
}
83
/// `reasoning` object of the request body. Sent with `enabled: false` on the
/// primary attempt and omitted altogether on the mandatory-reasoning fallback.
#[derive(Serialize)]
struct ReasoningPrefs {
    /// Whether the model should spend tokens on reasoning.
    enabled: bool,
}
88
/// Decoded body of a chat-completions response.
///
/// Every field is `#[serde(default)]`, so a body lacking any of them still
/// decodes.
#[derive(Deserialize)]
struct ChatResponse {
    /// Generated alternatives; only the first entry is ever read.
    #[serde(default)]
    choices: Vec<Choice>,
    /// Token and cost accounting, when the provider reported it.
    #[serde(default)]
    usage: Option<Usage>,
    /// Structured provider error. OpenRouter may return this inside an HTTP 200
    /// body (e.g. token/context-length overflow); without it the response would
    /// parse into empty `choices` and surface the misleading "no structured
    /// content" error instead of the real cause (GAP-SG-03).
    #[serde(default)]
    error: Option<crate::openrouter_http::ApiError>,
}
102
/// One entry of the response's `choices` array.
#[derive(Deserialize)]
struct Choice {
    /// Assistant message holding the generated text. Has no serde default, so
    /// a choice without it fails to decode.
    message: RespMessage,
    /// Why the model stopped generating: `"stop"` on a normal completion,
    /// `"length"` when `max_tokens` cut the response short (GAP-SG-70/72-chat).
    /// Absent from providers that omit it, hence `#[serde(default)]`.
    #[serde(default)]
    finish_reason: Option<String>,
}
112
/// Assistant message inside a [`Choice`].
#[derive(Deserialize)]
struct RespMessage {
    /// Generated text; `None` when the field is missing or `null`. A
    /// whitespace-only value is later treated the same as a missing one.
    #[serde(default)]
    content: Option<String>,
}
118
/// `usage` object of the response. All fields are optional because providers
/// may omit any of them.
#[derive(Deserialize)]
struct Usage {
    /// Request cost as reported under `usage.cost`; surfaced to callers as
    /// `ChatCompletion::cost_usd` (defaulting to `0.0` when absent).
    #[serde(default)]
    cost: Option<f64>,
    /// Prompt token count reported by OpenRouter (GAP-SG-72-chat). Diagnostic
    /// only — never used to gate control flow, so a missing value stays `None`.
    #[serde(default)]
    prompt_tokens: Option<u32>,
    /// Completion token count reported by OpenRouter (GAP-SG-72-chat), used
    /// alongside `finish_reason` to explain a truncated response.
    #[serde(default)]
    completion_tokens: Option<u32>,
}
132
/// Successful [`OpenRouterChatClient::complete`] result (GAP-SG-72-chat).
///
/// `finish_reason`, `prompt_tokens` and `completion_tokens` are the raw
/// diagnostics OpenRouter attached to the response that ultimately produced
/// [`Self::value`] — i.e. the last response received, after any `max_tokens`
/// growth retries. They are `None` only when the provider omitted them.
#[derive(Debug)]
pub struct ChatCompletion {
    /// Model output parsed as JSON (guaranteed to be a JSON object).
    pub value: serde_json::Value,
    /// Cost in USD read from `usage.cost`, or `0.0` when the provider omitted it.
    pub cost_usd: f64,
    /// `choices[0].finish_reason` from the response that produced `value`.
    pub finish_reason: Option<String>,
    /// `usage.prompt_tokens` from the response that produced `value`.
    pub prompt_tokens: Option<u32>,
    /// `usage.completion_tokens` from the response that produced `value`.
    pub completion_tokens: Option<u32>,
}
152
/// [`OpenRouterChatClient::complete`] failure (GAP-SG-72-chat / GAP-SG-72
/// reauditor addendum).
///
/// Wraps the underlying [`AppError`] with whatever truncation diagnostics were
/// available at the point of failure. `finish_reason`/token fields are `None`
/// when the failure happened before a response was parsed (network error, a
/// permanent 4xx, or exhausted retries) — only failures that occur AFTER a
/// `ChatResponse` was successfully decoded (JSON-repair or shape-guard
/// failures) carry them.
///
/// `retry_class` is the retry verdict computed AT THE ORIGIN (the exact HTTP
/// status, or the provider's structured error `code`), never inferred
/// downstream from `source.to_string()`. The enrich queue consumes this field
/// directly instead of pattern-matching the formatted message.
///
/// The `Display` output is exactly that of `source`; the diagnostics and
/// `retry_class` are only reachable through the public fields.
#[derive(Debug)]
pub struct ChatError {
    /// Underlying cause, preserved via `source()` rather than restated.
    pub source: AppError,
    /// `choices[0].finish_reason` from the response that led to this error,
    /// when one was decoded.
    pub finish_reason: Option<String>,
    /// `usage.prompt_tokens` from the response that led to this error, when
    /// one was decoded.
    pub prompt_tokens: Option<u32>,
    /// `usage.completion_tokens` from the response that led to this error,
    /// when one was decoded.
    pub completion_tokens: Option<u32>,
    /// Typed retry verdict computed where the failure originated (HTTP
    /// status / provider code), not by matching `source`'s message.
    pub retry_class: AttemptOutcome,
}
184
185impl ChatError {
186    /// Wraps `source` with no diagnostics attached (used when no
187    /// `ChatResponse` was decoded before the failure) and the `retry_class`
188    /// computed by the caller at the exact HTTP status / provider code.
189    fn new(source: AppError, retry_class: AttemptOutcome) -> Self {
190        Self {
191            source,
192            finish_reason: None,
193            prompt_tokens: None,
194            completion_tokens: None,
195            retry_class,
196        }
197    }
198
199    /// Wraps `source` with the diagnostics captured from a decoded
200    /// `ChatResponse` that nonetheless failed downstream (repair or
201    /// shape-guard), plus its `retry_class`.
202    fn with_diagnostics(
203        source: AppError,
204        finish_reason: Option<String>,
205        prompt_tokens: Option<u32>,
206        completion_tokens: Option<u32>,
207        retry_class: AttemptOutcome,
208    ) -> Self {
209        Self {
210            source,
211            finish_reason,
212            prompt_tokens,
213            completion_tokens,
214            retry_class,
215        }
216    }
217}
218
219impl std::fmt::Display for ChatError {
220    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
221        std::fmt::Display::fmt(&self.source, f)
222    }
223}
224
225impl std::error::Error for ChatError {
226    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
227        Some(&self.source)
228    }
229}
230
/// Process-wide OpenRouter chat client. Holds the model name so that callers
/// only thread the per-item prompt/schema/input through [`Self::complete`].
pub struct OpenRouterChatClient {
    /// Underlying HTTP client, built once in [`Self::new`] with the total and
    /// connect timeouts and the crate's user-agent.
    client: reqwest::Client,
    /// OpenRouter API key; exposed only while building the `Authorization`
    /// header of each request.
    api_key: SecretBox<String>,
    /// Model identifier sent as `model` in every request body.
    model: String,
    /// Endpoint each request is POSTed to. Always [`OPENROUTER_CHAT_URL`] in
    /// production; only the test-only [`Self::new_with_url`] constructor
    /// repoints it at a local mock server.
    base_url: String,
}
242
243impl OpenRouterChatClient {
244    /// Builds a chat client bound to `model`, applying `timeout_secs` as the
245    /// total per-request budget (wired from `--openrouter-timeout`). A value of
246    /// `0` falls back to `DEFAULT_TIMEOUT_SECS` so a missing or zero flag never
247    /// degrades into reqwest`'s immediate-timeout behaviour.
248    pub fn new(
249        api_key: SecretBox<String>,
250        model: String,
251        timeout_secs: u64,
252    ) -> Result<Self, AppError> {
253        let timeout_secs = if timeout_secs == 0 {
254            DEFAULT_TIMEOUT_SECS
255        } else {
256            timeout_secs
257        };
258        let client = reqwest::Client::builder()
259            .timeout(Duration::from_secs(timeout_secs))
260            .connect_timeout(Duration::from_secs(DEFAULT_CONNECT_TIMEOUT_SECS))
261            // Derived from the crate version so the UA never drifts again
262            // (GAP-SG-75 recurrence caught at the v1.1.01 release gate).
263            .user_agent(concat!("sqlite-graphrag/", env!("CARGO_PKG_VERSION")))
264            .build()
265            .map_err(|e| AppError::Validation(format!("failed to build HTTP client: {e}")))?;
266
267        Ok(Self {
268            client,
269            api_key,
270            model,
271            base_url: OPENROUTER_CHAT_URL.to_string(),
272        })
273    }
274
275    /// Test-only constructor that POSTs to an arbitrary `base_url` (such as a
276    /// `wiremock::MockServer`) instead of the public OpenRouter endpoint.
277    /// Behaviour is otherwise identical to [`Self::new`].
278    #[cfg(test)]
279    pub fn new_with_url(
280        api_key: SecretBox<String>,
281        model: String,
282        base_url: String,
283        timeout_secs: u64,
284    ) -> Result<Self, AppError> {
285        let mut client = Self::new(api_key, model, timeout_secs)?;
286        client.base_url = base_url;
287        Ok(client)
288    }
289
290    /// Returns the model bound to this client.
291    pub fn model(&self) -> &str {
292        &self.model
293    }
294
295    /// Runs a single structured-output completion, transparently growing
296    /// `max_tokens` and re-issuing the request when the model truncates its
297    /// output (GAP-SG-70).
298    ///
299    /// `schema_str` is the JSON Schema (as a string) the model must honour
300    /// under `strict: true`. When `input_text` is empty only the system
301    /// message is sent. `max_tokens` seeds the first attempt; `None` lets the
302    /// provider apply its own default.
303    ///
304    /// Returns [`ChatCompletion`] on success or [`ChatError`] on failure; both
305    /// carry `finish_reason`/token diagnostics when a response was decoded.
306    ///
307    /// # Errors
308    ///
309    /// Returns [`ChatError`] when: the schema is invalid JSON; the HTTP
310    /// request fails or exhausts retries; the provider returns a permanent
311    /// error (401/400/404, or a structured `error` object in a 2xx body); the
312    /// response carries no usable content; the content cannot be parsed as
313    /// JSON even after repair; the parsed JSON is not an object; or the
314    /// response is truncated (`finish_reason: "length"`) after
315    /// [`crate::constants::ENRICH_MAX_LENGTH_RETRIES`] `max_tokens` growth
316    /// attempts are exhausted.
317    pub async fn complete(
318        &self,
319        system_prompt: &str,
320        input_text: &str,
321        schema_str: &str,
322        max_tokens: Option<u32>,
323    ) -> Result<ChatCompletion, ChatError> {
324        // A malformed schema is a permanent caller/config error — classified
325        // explicitly (no blanket `From<AppError>` conversion exists for this
326        // type; every `ChatError` states its `retry_class` at construction).
327        let schema: serde_json::Value = serde_json::from_str(schema_str).map_err(|e| {
328            ChatError::new(
329                AppError::Validation(format!("invalid JSON schema for OpenRouter request: {e}")),
330                AttemptOutcome::HardFailure,
331            )
332        })?;
333
334        let mut current_max_tokens = max_tokens;
335
336        for length_attempt in 0..=crate::constants::ENRICH_MAX_LENGTH_RETRIES {
337            let response = self
338                .complete_one_attempt(&schema, system_prompt, input_text, current_max_tokens)
339                .await?;
340
341            let finish_reason = response
342                .choices
343                .first()
344                .and_then(|c| c.finish_reason.clone());
345            let prompt_tokens = response.usage.as_ref().and_then(|u| u.prompt_tokens);
346            let completion_tokens = response.usage.as_ref().and_then(|u| u.completion_tokens);
347
348            let truncated = finish_reason.as_deref() == Some("length");
349            let retries_left = length_attempt < crate::constants::ENRICH_MAX_LENGTH_RETRIES;
350
351            if truncated && retries_left {
352                let next_max_tokens = grow_max_tokens(current_max_tokens);
353                tracing::warn!(
354                    model = %self.model,
355                    attempt = length_attempt,
356                    previous_max_tokens = ?current_max_tokens,
357                    next_max_tokens,
358                    "OpenRouter completion truncated (finish_reason=length); \
359                     retrying with a larger max_tokens budget"
360                );
361                current_max_tokens = Some(next_max_tokens);
362                continue;
363            }
364
365            if truncated {
366                tracing::warn!(
367                    model = %self.model,
368                    max_length_retries = crate::constants::ENRICH_MAX_LENGTH_RETRIES,
369                    max_tokens = ?current_max_tokens,
370                    "OpenRouter completion still truncated after exhausting \
371                     max_tokens growth"
372                );
373            }
374
375            return self.finish_completion(
376                response,
377                finish_reason,
378                prompt_tokens,
379                completion_tokens,
380            );
381        }
382
383        unreachable!("loop always returns within ENRICH_MAX_LENGTH_RETRIES + 1 iterations")
384    }
385
386    /// Runs one HTTP attempt (including the mandatory-reasoning fallback) and
387    /// returns the decoded [`ChatResponse`] without inspecting `finish_reason`
388    /// or extracting content — that happens in [`Self::complete`] so the
389    /// `max_tokens` growth loop can re-issue the request first.
390    async fn complete_one_attempt(
391        &self,
392        schema: &serde_json::Value,
393        system_prompt: &str,
394        input_text: &str,
395        max_tokens: Option<u32>,
396    ) -> Result<ChatResponse, ChatError> {
397        // First attempt sends reasoning.enabled=false (token savings on the
398        // ~9 models that allow disabling). The ~4 reasoning-mandatory models
399        // (e.g. minimax-m2.7, gpt-oss-120b) reject it with HTTP 400 mentioning
400        // "reasoning"; on that specific failure we retry ONCE with the
401        // reasoning field omitted so the model uses its mandatory default. Any
402        // other error, or a second failure, propagates the original error.
403        let primary = self.build_request(
404            schema.clone(),
405            system_prompt,
406            input_text,
407            max_tokens,
408            Some(ReasoningPrefs { enabled: false }),
409        );
410        match self.execute_with_retry(&primary).await {
411            Ok(r) => Ok(r),
412            Err(first_err) => {
413                if reasoning_disable_rejected(&first_err) {
414                    tracing::warn!(
415                        model = %self.model,
416                        "model rejected reasoning.enabled=false (mandatory); \
417                         retrying once with reasoning omitted"
418                    );
419                    let fallback = self.build_request(
420                        schema.clone(),
421                        system_prompt,
422                        input_text,
423                        max_tokens,
424                        None,
425                    );
426                    match self.execute_with_retry(&fallback).await {
427                        Ok(r) => Ok(r),
428                        Err(_) => Err(first_err),
429                    }
430                } else {
431                    Err(first_err)
432                }
433            }
434        }
435    }
436
437    /// Extracts content, repairs/parses it as JSON, and enforces the
438    /// object-shape guard, attaching `finish_reason`/token diagnostics to any
439    /// failure.
440    ///
441    /// Every failure branch below (missing content, JSON-repair failure,
442    /// non-object shape) classifies as `AttemptOutcome::Transient`. This is a
443    /// deliberate, acknowledged tension with `rules_rust_retry_com_backoff.md`
444    /// ("NUNCA retentar erros de parsing ou deserialização" / "NUNCA retentar
445    /// erros de deserialização"): those rules target DETERMINISTIC parse
446    /// errors, where retrying the identical input reproduces the identical
447    /// failure. Here the "input" is `deepseek-v4-flash:nitro` sampling
448    /// variance — the SAME prompt can legitimately produce well-formed JSON
449    /// on the next generation (see GAP-SG-10). So this is a typed, bounded
450    /// hiccup, not a retry-forever loophole: it is capped by `--max-attempts`
451    /// (GAP-SG-09/GAP-SG-21) and dead-letters once attempts are exhausted.
452    fn finish_completion(
453        &self,
454        response: ChatResponse,
455        finish_reason: Option<String>,
456        prompt_tokens: Option<u32>,
457        completion_tokens: Option<u32>,
458    ) -> Result<ChatCompletion, ChatError> {
459        let content = response
460            .choices
461            .into_iter()
462            .next()
463            .and_then(|c| c.message.content)
464            .filter(|c| !c.trim().is_empty())
465            .ok_or_else(|| {
466                AppError::Validation(format!(
467                    "model '{}' returned no structured content (incompatible with \
468                     structured outputs, or refused the request)",
469                    self.model
470                ))
471            })
472            .map_err(|e| {
473                ChatError::with_diagnostics(
474                    e,
475                    finish_reason.clone(),
476                    prompt_tokens,
477                    completion_tokens,
478                    AttemptOutcome::Transient,
479                )
480            })?;
481
482        // GAP-SG-10: deepseek-v4-flash:nitro and similar models do not honour
483        // `json_schema` strict mode reliably — they wrap output in markdown
484        // fences, add trailing commas, or omit quotes around keys. Try a strict
485        // parse first (zero cost for well-formed JSON), then fall back to the
486        // repair pass (a Rust port of `json_repair`) before giving up.
487        let value = crate::json_repair::repair_to_value(&content).map_err(|e| {
488            ChatError::with_diagnostics(
489                AppError::Validation(format!(
490                    "model '{}' returned content that could not be parsed even after \
491                     JSON repair: {e}",
492                    self.model
493                )),
494                finish_reason.clone(),
495                prompt_tokens,
496                completion_tokens,
497                AttemptOutcome::Transient,
498            )
499        })?;
500
501        // GAP-SG-10: `llm_json` coerces aggressively — free text becomes a JSON
502        // string, empty input becomes `{}`, a lone delimiter becomes `null`. The
503        // enrich JUDGE contract is ALWAYS a JSON object, so a non-object result
504        // here is a malformed/refused generation, NOT a usable value. Reject it
505        // (the enrich classifier reclassifies this as a transient model hiccup,
506        // GAP-SG-09) instead of letting a coerced scalar masquerade as a
507        // valid-but-empty result downstream.
508        if !value.is_object() {
509            return Err(ChatError::with_diagnostics(
510                AppError::Validation(format!(
511                    "model '{}' returned non-object JSON after repair (got {}); \
512                     likely a refusal or malformed structured output",
513                    self.model,
514                    json_shape_name(&value)
515                )),
516                finish_reason,
517                prompt_tokens,
518                completion_tokens,
519                AttemptOutcome::Transient,
520            ));
521        }
522
523        let cost = response.usage.and_then(|u| u.cost).unwrap_or(0.0);
524
525        Ok(ChatCompletion {
526            value,
527            cost_usd: cost,
528            finish_reason,
529            prompt_tokens,
530            completion_tokens,
531        })
532    }
533
534    /// Builds a `ChatRequest` for one attempt. `reasoning` is `Some` on the
535    /// primary attempt (`enabled:false`) and `None` on the mandatory-reasoning
536    /// fallback, where the field is omitted entirely.
537    fn build_request<'a>(
538        &'a self,
539        schema: serde_json::Value,
540        system_prompt: &str,
541        input_text: &str,
542        max_tokens: Option<u32>,
543        reasoning: Option<ReasoningPrefs>,
544    ) -> ChatRequest<'a> {
545        let mut messages = Vec::with_capacity(2);
546        messages.push(ChatMessage {
547            role: "system",
548            content: system_prompt.to_string(),
549        });
550        if !input_text.is_empty() {
551            messages.push(ChatMessage {
552                role: "user",
553                content: input_text.to_string(),
554            });
555        }
556        ChatRequest {
557            model: &self.model,
558            messages,
559            response_format: ResponseFormat {
560                format_type: "json_schema",
561                json_schema: JsonSchemaSpec {
562                    name: SCHEMA_NAME,
563                    strict: true,
564                    schema,
565                },
566            },
567            provider: ProviderPrefs {
568                require_parameters: true,
569            },
570            reasoning,
571            max_tokens,
572        }
573    }
574
575    /// Runs the request/retry loop, classifying every failure into a
576    /// [`ChatError`] with `retry_class` set AT THE ORIGIN (the exact HTTP
577    /// status, or the provider's structured error code) — never inferred
578    /// downstream from a formatted message (reauditor addendum to
579    /// GAP-SG-72-chat).
580    async fn execute_with_retry(
581        &self,
582        request: &ChatRequest<'_>,
583    ) -> Result<ChatResponse, ChatError> {
584        let mut last_err: Option<ChatError> = None;
585
586        for attempt in 0..crate::openrouter_http::MAX_RETRIES {
587            let result = self
588                .client
589                .post(&self.base_url)
590                .header(
591                    "Authorization",
592                    format!("Bearer {}", self.api_key.expose_secret()),
593                )
594                .json(request)
595                .send()
596                .await;
597
598            let resp = match result {
599                Ok(r) => r,
600                Err(e) if e.is_timeout() => {
601                    return Err(ChatError::new(
602                        AppError::Validation("OpenRouter chat request timed out".into()),
603                        AttemptOutcome::Transient,
604                    ));
605                }
606                Err(e) => {
607                    last_err = Some(ChatError::new(
608                        AppError::Validation(format!("HTTP request failed: {e}")),
609                        AttemptOutcome::Transient,
610                    ));
611                    crate::openrouter_http::backoff(attempt).await;
612                    continue;
613                }
614            };
615
616            let status = resp.status();
617
618            if status.is_success() {
619                let body = resp.text().await.map_err(|e| {
620                    ChatError::new(
621                        AppError::Validation(format!("failed to read response body: {e}")),
622                        AttemptOutcome::Transient,
623                    )
624                })?;
625                match serde_json::from_str::<ChatResponse>(&body) {
626                    Ok(parsed) => {
627                        // A structured error object inside a 2xx body is
628                        // classified by its own `code` (GAP-SG-03 surfaces
629                        // the real code/message instead of letting empty
630                        // choices masquerade as no-structured-content).
631                        if let Some(api_err) = parsed.error {
632                            let retry_class =
633                                crate::openrouter_http::provider_error_retry_class(&api_err);
634                            return Err(ChatError::new(
635                                AppError::ProviderError {
636                                    code: api_err.code_string(),
637                                    message: api_err.message,
638                                },
639                                retry_class,
640                            ));
641                        }
642                        return Ok(parsed);
643                    }
644                    Err(e) => {
645                        tracing::warn!(
646                            attempt,
647                            body_len = body.len(),
648                            "HTTP 200 but parse failed (retrying): {e}"
649                        );
650                        last_err = Some(ChatError::new(
651                            AppError::Validation(format!("failed to parse chat response: {e}")),
652                            AttemptOutcome::Transient,
653                        ));
654                        crate::openrouter_http::backoff(attempt).await;
655                        continue;
656                    }
657                }
658            }
659
660            if status.as_u16() == 401 {
661                return Err(ChatError::new(
662                    AppError::Validation("invalid OpenRouter API key (HTTP 401)".into()),
663                    AttemptOutcome::HardFailure,
664                ));
665            }
666
667            if status.as_u16() == 400 || status.as_u16() == 404 {
668                let body = resp.text().await.unwrap_or_default();
669                return Err(ChatError::new(
670                    AppError::Validation(format!(
671                        "OpenRouter returned {status} for model '{}': {body}",
672                        self.model
673                    )),
674                    AttemptOutcome::HardFailure,
675                ));
676            }
677
678            if status.as_u16() == 429 {
679                let retry_after = resp
680                    .headers()
681                    .get("retry-after")
682                    .and_then(|v| v.to_str().ok())
683                    .and_then(|v| v.parse::<u64>().ok())
684                    .unwrap_or(2);
685                tracing::warn!(
686                    attempt,
687                    retry_after_secs = retry_after,
688                    "OpenRouter rate limited, waiting"
689                );
690                // GAP-SG-56: surface the Retry-After delay to the caller. If
691                // every attempt is rate limited, the loop exits with this
692                // RateLimited error (retryable) carrying the server-advised
693                // wait, instead of a generic max-retries-exceeded message.
694                last_err = Some(ChatError::new(
695                    AppError::RateLimited {
696                        detail: format!("OpenRouter HTTP 429 (retry-after {retry_after}s)"),
697                    },
698                    AttemptOutcome::Transient,
699                ));
700                tokio::time::sleep(Duration::from_secs(retry_after)).await;
701                continue;
702            }
703
704            if status.is_server_error() {
705                tracing::warn!(attempt, status = %status, "OpenRouter server error, retrying");
706                last_err = Some(ChatError::new(
707                    AppError::Validation(format!("OpenRouter server error: {status}")),
708                    AttemptOutcome::Transient,
709                ));
710                crate::openrouter_http::backoff(attempt).await;
711                continue;
712            }
713
714            let body = resp.text().await.unwrap_or_default();
715            return Err(ChatError::new(
716                AppError::Validation(format!("unexpected HTTP {status}: {body}")),
717                crate::openrouter_http::status_retry_class(status),
718            ));
719        }
720
721        // GAP-SG-72-chat addendum: exhausting every retry against a
722        // transient condition (429/5xx/timeout/network) is ITSELF transient
723        // — it is exactly the case the queue's `--max-attempts` backoff
724        // covers, and must never be reclassified as a permanent failure.
725        Err(last_err.unwrap_or_else(|| {
726            ChatError::new(
727                AppError::Validation("max retries exceeded for OpenRouter chat request".into()),
728                AttemptOutcome::Transient,
729            )
730        }))
731    }
732}
733
734/// Grows `current` for the next `max_tokens` retry after a truncated
735/// (`finish_reason: "length"`) response (GAP-SG-70/71). When `current` is
736/// `None` the caller left the provider default in place, so growth starts
737/// from [`crate::constants::ENRICH_INITIAL_MAX_TOKENS`] instead of an unknown
738/// base. The result is always capped at
739/// [`crate::constants::ENRICH_MAX_TOKENS_CEILING`].
740fn grow_max_tokens(current: Option<u32>) -> u32 {
741    let base = current.unwrap_or(crate::constants::ENRICH_INITIAL_MAX_TOKENS);
742    base.saturating_mul(crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR)
743        .min(crate::constants::ENRICH_MAX_TOKENS_CEILING)
744}
745
746/// True when an error from `execute_with_retry` indicates the model rejected
747/// `reasoning.enabled=false` because reasoning is mandatory: an HTTP 400 whose
748/// body mentions "reasoning" (case-insensitive). Triggers the one-shot retry
749/// with the `reasoning` field omitted.
750///
751/// This IS a legitimate, narrowly-scoped substring check on the underlying
752/// `AppError`'s message — not a retry-classification decision (that lives in
753/// `ChatError.retry_class`, computed at the origin). It only decides whether
754/// to attempt the mandatory-reasoning fallback shape, an orthogonal concern.
755fn reasoning_disable_rejected(err: &ChatError) -> bool {
756    let msg = err.source.to_string().to_lowercase();
757    msg.contains("400") && msg.contains("reasoning")
758}
759
760/// Names the JSON shape of `value` for diagnostics (GAP-SG-10). Used when the
761/// repaired model output is not the object the enrich JUDGE contract requires.
762fn json_shape_name(value: &serde_json::Value) -> &'static str {
763    match value {
764        serde_json::Value::Null => "null",
765        serde_json::Value::Bool(_) => "boolean",
766        serde_json::Value::Number(_) => "number",
767        serde_json::Value::String(_) => "string",
768        serde_json::Value::Array(_) => "array",
769        serde_json::Value::Object(_) => "object",
770    }
771}
772
773#[cfg(test)]
774mod tests {
775    use super::*;
776    use serde_json::json;
777    use wiremock::matchers::{body_partial_json, method, path};
778    use wiremock::{Mock, MockServer, ResponseTemplate};
779
780    const TEST_SCHEMA: &str = r#"{"type":"object"}"#;
781
782    fn key() -> SecretBox<String> {
783        SecretBox::new(Box::new("test-key".to_string()))
784    }
785
786    /// Builds a chat-completions success body whose single choice carries the
787    /// model output as a JSON *string* (the double-encoding the real API uses
788    /// under structured outputs), optionally attaching `usage.cost` and a
789    /// `finish_reason` (defaults to `"stop"`).
790    fn success_body(content: &str, cost: Option<f64>) -> serde_json::Value {
791        success_body_with_finish(content, cost, "stop")
792    }
793
794    /// Same as [`success_body`] but with an explicit `finish_reason`, used by
795    /// the GAP-SG-70 truncation tests.
796    fn success_body_with_finish(
797        content: &str,
798        cost: Option<f64>,
799        finish_reason: &str,
800    ) -> serde_json::Value {
801        let mut body = json!({
802            "choices": [{ "message": { "content": content }, "finish_reason": finish_reason }]
803        });
804        if let Some(c) = cost {
805            body["usage"] = json!({ "cost": c });
806        }
807        body
808    }
809
810    async fn client_for(server: &MockServer, model: &str) -> OpenRouterChatClient {
811        OpenRouterChatClient::new_with_url(
812            key(),
813            model.to_string(),
814            format!("{}/chat/completions", server.uri()),
815            30,
816        )
817        .expect("test client builds")
818    }
819
820    #[test]
821    fn new_builds_client_and_binds_model() {
822        let client = OpenRouterChatClient::new(key(), "z-ai/glm-5.2".to_string(), 30)
823            .expect("client builds");
824        assert_eq!(client.model(), "z-ai/glm-5.2");
825    }
826
827    #[test]
828    fn new_defaults_base_url_to_public_endpoint() {
829        let client = OpenRouterChatClient::new(key(), "z-ai/glm-5.2".to_string(), 30)
830            .expect("client builds");
831        assert_eq!(client.base_url, OPENROUTER_CHAT_URL);
832    }
833
834    #[test]
835    fn request_serializes_with_strict_schema_and_disabled_reasoning() {
836        let request = ChatRequest {
837            model: "deepseek/deepseek-v4-flash",
838            messages: vec![ChatMessage {
839                role: "system",
840                content: "extract".to_string(),
841            }],
842            response_format: ResponseFormat {
843                format_type: "json_schema",
844                json_schema: JsonSchemaSpec {
845                    name: SCHEMA_NAME,
846                    strict: true,
847                    schema: serde_json::json!({"type": "object"}),
848                },
849            },
850            provider: ProviderPrefs {
851                require_parameters: true,
852            },
853            reasoning: Some(ReasoningPrefs { enabled: false }),
854            max_tokens: None,
855        };
856        let json = serde_json::to_value(&request).expect("serializes");
857        assert_eq!(json["response_format"]["type"], "json_schema");
858        assert_eq!(json["response_format"]["json_schema"]["strict"], true);
859        assert_eq!(json["provider"]["require_parameters"], true);
860        assert_eq!(json["reasoning"]["enabled"], false);
861        // max_tokens omitted when None
862        assert!(json.get("max_tokens").is_none());
863    }
864
865    #[test]
866    fn grow_max_tokens_uses_initial_default_when_current_is_none() {
867        assert_eq!(
868            grow_max_tokens(None),
869            crate::constants::ENRICH_INITIAL_MAX_TOKENS
870                * crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR
871        );
872    }
873
874    #[test]
875    fn grow_max_tokens_caps_at_ceiling() {
876        assert_eq!(
877            grow_max_tokens(Some(crate::constants::ENRICH_MAX_TOKENS_CEILING)),
878            crate::constants::ENRICH_MAX_TOKENS_CEILING
879        );
880        assert_eq!(
881            grow_max_tokens(Some(u32::MAX)),
882            crate::constants::ENRICH_MAX_TOKENS_CEILING
883        );
884    }
885
886    #[tokio::test]
887    async fn complete_sends_wellformed_request_and_parses_content() {
888        let server = MockServer::start().await;
889        Mock::given(method("POST"))
890            .and(path("/chat/completions"))
891            .and(body_partial_json(json!({
892                "model": "deepseek/deepseek-v4-flash",
893                "response_format": {
894                    "type": "json_schema",
895                    "json_schema": { "name": "enrich_output", "strict": true }
896                },
897                "provider": { "require_parameters": true },
898                "reasoning": { "enabled": false }
899            })))
900            .respond_with(ResponseTemplate::new(200).set_body_json(success_body(
901                r#"{"entities":[],"relationships":[]}"#,
902                Some(0.0023),
903            )))
904            .expect(1)
905            .mount(&server)
906            .await;
907
908        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
909        let completion = client
910            .complete("system", "input", TEST_SCHEMA, None)
911            .await
912            .expect("completion succeeds");
913
914        assert_eq!(
915            completion.value,
916            json!({"entities": [], "relationships": []})
917        );
918        assert!((completion.cost_usd - 0.0023).abs() < f64::EPSILON);
919        assert_eq!(completion.finish_reason.as_deref(), Some("stop"));
920    }
921
922    #[tokio::test]
923    async fn complete_defaults_cost_to_zero_when_usage_absent() {
924        let server = MockServer::start().await;
925        Mock::given(method("POST"))
926            .respond_with(
927                ResponseTemplate::new(200).set_body_json(success_body(r#"{"entities":[]}"#, None)),
928            )
929            .mount(&server)
930            .await;
931
932        let client = client_for(&server, "z-ai/glm-5.2").await;
933        let completion = client
934            .complete("system", "", TEST_SCHEMA, Some(4096))
935            .await
936            .expect("completion succeeds");
937        assert_eq!(completion.cost_usd, 0.0);
938    }
939
940    #[tokio::test]
941    async fn complete_retries_on_429_honouring_retry_after() {
942        let server = MockServer::start().await;
943        Mock::given(method("POST"))
944            .respond_with(ResponseTemplate::new(429).insert_header("retry-after", "1"))
945            .up_to_n_times(1)
946            .expect(1)
947            .mount(&server)
948            .await;
949        Mock::given(method("POST"))
950            .respond_with(
951                ResponseTemplate::new(200).set_body_json(success_body(r#"{"ok":true}"#, Some(0.0))),
952            )
953            .expect(1)
954            .mount(&server)
955            .await;
956
957        let client = client_for(&server, "minimax/minimax-m3").await;
958        let completion = client
959            .complete("system", "input", TEST_SCHEMA, None)
960            .await
961            .expect("retried completion succeeds");
962        assert_eq!(completion.value, json!({"ok": true}));
963    }
964
965    #[tokio::test]
966    async fn complete_retries_on_5xx_with_backoff() {
967        let server = MockServer::start().await;
968        Mock::given(method("POST"))
969            .respond_with(ResponseTemplate::new(503))
970            .up_to_n_times(1)
971            .expect(1)
972            .mount(&server)
973            .await;
974        Mock::given(method("POST"))
975            .respond_with(
976                ResponseTemplate::new(200).set_body_json(success_body(r#"{"ok":1}"#, Some(0.0))),
977            )
978            .expect(1)
979            .mount(&server)
980            .await;
981
982        let client = client_for(&server, "openai/gpt-oss-120b").await;
983        let completion = client
984            .complete("system", "input", TEST_SCHEMA, None)
985            .await
986            .expect("retried completion succeeds");
987        assert_eq!(completion.value, json!({"ok": 1}));
988    }
989
990    #[tokio::test]
991    async fn complete_401_is_permanent_without_retry() {
992        let server = MockServer::start().await;
993        Mock::given(method("POST"))
994            .respond_with(ResponseTemplate::new(401))
995            .expect(1)
996            .mount(&server)
997            .await;
998
999        let client = client_for(&server, "z-ai/glm-5.2").await;
1000        let err = client
1001            .complete("system", "input", TEST_SCHEMA, None)
1002            .await
1003            .expect_err("401 is an error");
1004        assert!(err.to_string().contains("401"), "got: {err}");
1005        assert_eq!(err.retry_class, AttemptOutcome::HardFailure);
1006    }
1007
1008    #[tokio::test]
1009    async fn complete_400_returns_body_and_model_without_retry() {
1010        let server = MockServer::start().await;
1011        Mock::given(method("POST"))
1012            .respond_with(ResponseTemplate::new(400).set_body_string("schema not supported"))
1013            .expect(1)
1014            .mount(&server)
1015            .await;
1016
1017        let client = client_for(&server, "xiaomi/mimo-v2.5").await;
1018        let err = client
1019            .complete("system", "input", TEST_SCHEMA, None)
1020            .await
1021            .expect_err("400 is an error");
1022        let msg = err.to_string();
1023        assert!(msg.contains("400"), "got: {msg}");
1024        assert!(msg.contains("xiaomi/mimo-v2.5"), "got: {msg}");
1025        assert!(msg.contains("schema not supported"), "got: {msg}");
1026        assert_eq!(err.retry_class, AttemptOutcome::HardFailure);
1027    }
1028
1029    #[tokio::test]
1030    async fn complete_empty_choices_errors_citing_model() {
1031        let server = MockServer::start().await;
1032        Mock::given(method("POST"))
1033            .respond_with(ResponseTemplate::new(200).set_body_json(json!({ "choices": [] })))
1034            .mount(&server)
1035            .await;
1036
1037        let client = client_for(&server, "minimax/minimax-m2.7").await;
1038        let err = client
1039            .complete("system", "input", TEST_SCHEMA, None)
1040            .await
1041            .expect_err("empty choices is an error");
1042        let msg = err.to_string();
1043        assert!(msg.contains("minimax/minimax-m2.7"), "got: {msg}");
1044        assert!(msg.contains("no structured content"), "got: {msg}");
1045        assert_eq!(err.finish_reason, None);
1046        assert_eq!(
1047            err.retry_class,
1048            AttemptOutcome::Transient,
1049            "no-content is a model hiccup, not a permanent rejection"
1050        );
1051    }
1052
1053    #[tokio::test]
1054    async fn complete_empty_content_errors() {
1055        let server = MockServer::start().await;
1056        Mock::given(method("POST"))
1057            .respond_with(ResponseTemplate::new(200).set_body_json(success_body("   ", Some(0.0))))
1058            .mount(&server)
1059            .await;
1060
1061        let client = client_for(&server, "z-ai/glm-5.2:nitro").await;
1062        let err = client
1063            .complete("system", "input", TEST_SCHEMA, None)
1064            .await
1065            .expect_err("blank content is an error");
1066        assert!(
1067            err.to_string().contains("no structured content"),
1068            "got: {err}"
1069        );
1070    }
1071
1072    #[tokio::test]
1073    async fn complete_non_json_content_errors_as_incompatible() {
1074        // GAP-SG-10: free text is coerced by the repair pass into a JSON string
1075        // (not an object), so it is rejected by the shape guard rather than the
1076        // strict-parse error. The message names the offending shape + model.
1077        let server = MockServer::start().await;
1078        Mock::given(method("POST"))
1079            .respond_with(
1080                ResponseTemplate::new(200)
1081                    .set_body_json(success_body("this is not json", Some(0.0))),
1082            )
1083            .mount(&server)
1084            .await;
1085
1086        let client = client_for(&server, "google/gemini-3.1-flash-lite").await;
1087        let err = client
1088            .complete("system", "input", TEST_SCHEMA, None)
1089            .await
1090            .expect_err("non-json content is an error");
1091        let msg = err.to_string();
1092        assert!(msg.contains("non-object JSON after repair"), "got: {msg}");
1093        assert!(msg.contains("google/gemini-3.1-flash-lite"), "got: {msg}");
1094    }
1095
1096    #[tokio::test]
1097    async fn complete_repairs_markdown_fenced_object() {
1098        // GAP-SG-10: a model that wraps a valid object in a ```json fence (a
1099        // common deepseek-v4-flash:nitro defect) is repaired and parsed instead
1100        // of being rejected as non-JSON.
1101        let server = MockServer::start().await;
1102        Mock::given(method("POST"))
1103            .respond_with(ResponseTemplate::new(200).set_body_json(success_body(
1104                "```json\n{\"entities\":[\"rust\"],\"relationships\":[]}\n```",
1105                Some(0.0),
1106            )))
1107            .mount(&server)
1108            .await;
1109
1110        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
1111        let completion = client
1112            .complete("system", "input", TEST_SCHEMA, None)
1113            .await
1114            .expect("fenced object is repaired");
1115        assert_eq!(
1116            completion.value,
1117            json!({"entities": ["rust"], "relationships": []})
1118        );
1119    }
1120
1121    #[tokio::test]
1122    async fn complete_rejects_invalid_schema_before_network() {
1123        // No mock mounted: an unreachable URL proves we never hit the network.
1124        let client = OpenRouterChatClient::new_with_url(
1125            key(),
1126            "z-ai/glm-5.2".to_string(),
1127            "http://127.0.0.1:1/chat/completions".to_string(),
1128            30,
1129        )
1130        .expect("client builds");
1131        let err = client
1132            .complete("system", "input", "{not valid json", None)
1133            .await
1134            .expect_err("invalid schema is rejected");
1135        assert!(
1136            err.to_string().contains("invalid JSON schema"),
1137            "got: {err}"
1138        );
1139        assert_eq!(
1140            err.retry_class,
1141            AttemptOutcome::HardFailure,
1142            "a malformed schema is a permanent caller error"
1143        );
1144    }
1145
1146    #[tokio::test]
1147    async fn complete_retries_with_reasoning_omitted_when_mandatory() {
1148        let server = MockServer::start().await;
1149        // Primary attempt (reasoning.enabled=false) is rejected with a 400 whose
1150        // body mentions "reasoning" — the mandatory-reasoning signal that drives
1151        // the one-shot fallback.
1152        Mock::given(method("POST"))
1153            .respond_with(
1154                ResponseTemplate::new(400).set_body_string(
1155                    "reasoning is mandatory for this model and cannot be disabled",
1156                ),
1157            )
1158            .up_to_n_times(1)
1159            .expect(1)
1160            .mount(&server)
1161            .await;
1162        // Fallback attempt (reasoning field omitted) succeeds.
1163        Mock::given(method("POST"))
1164            .respond_with(ResponseTemplate::new(200).set_body_json(success_body(
1165                r#"{"entities":[],"relationships":[]}"#,
1166                Some(0.0),
1167            )))
1168            .expect(1)
1169            .mount(&server)
1170            .await;
1171
1172        let client = client_for(&server, "minimax/minimax-m2.7").await;
1173        let completion = client
1174            .complete("system", "input", TEST_SCHEMA, None)
1175            .await
1176            .expect("fallback completion succeeds");
1177        assert_eq!(
1178            completion.value,
1179            json!({"entities": [], "relationships": []})
1180        );
1181
1182        // Exactly two requests were sent: the FIRST carries reasoning.enabled=false,
1183        // the SECOND (fallback) OMITS the reasoning field entirely.
1184        let requests = server
1185            .received_requests()
1186            .await
1187            .expect("request recording is enabled");
1188        assert_eq!(requests.len(), 2, "expected primary + fallback requests");
1189        let first: serde_json::Value =
1190            serde_json::from_slice(&requests[0].body).expect("first request body is JSON");
1191        let second: serde_json::Value =
1192            serde_json::from_slice(&requests[1].body).expect("second request body is JSON");
1193        assert_eq!(
1194            first["reasoning"]["enabled"],
1195            json!(false),
1196            "primary request must send reasoning.enabled=false"
1197        );
1198        assert!(
1199            second.get("reasoning").is_none(),
1200            "fallback request must omit the reasoning field, got: {second}"
1201        );
1202    }
1203
1204    #[tokio::test]
1205    async fn complete_honours_configured_timeout() {
1206        // A 1s client timeout against a server that delays 2s proves the
1207        // --openrouter-timeout value is wired into the reqwest builder instead
1208        // of the fixed 300s default (regression: the flag was silently ignored).
1209        let server = MockServer::start().await;
1210        Mock::given(method("POST"))
1211            .respond_with(
1212                ResponseTemplate::new(200)
1213                    .set_delay(std::time::Duration::from_secs(2))
1214                    .set_body_json(success_body(r#"{"ok":1}"#, Some(0.0))),
1215            )
1216            .mount(&server)
1217            .await;
1218
1219        let client = OpenRouterChatClient::new_with_url(
1220            key(),
1221            "z-ai/glm-5.2".to_string(),
1222            format!("{}/chat/completions", server.uri()),
1223            1,
1224        )
1225        .expect("client builds");
1226        let err = client
1227            .complete("system", "input", TEST_SCHEMA, None)
1228            .await
1229            .expect_err("request exceeds the 1s timeout");
1230        assert!(err.to_string().contains("timed out"), "got: {err}");
1231    }
1232
1233    #[tokio::test]
1234    async fn complete_surfaces_provider_error_in_200_body() {
1235        // GAP-SG-03: an HTTP 200 whose body is a structured OpenRouter error
1236        // (token/context-length overflow) must surface the REAL message, not
1237        // the misleading no-structured-content from empty choices.
1238        let server = MockServer::start().await;
1239        Mock::given(method("POST"))
1240            .respond_with(ResponseTemplate::new(200).set_body_json(json!({
1241                "error": { "code": 400, "message": "context length exceeded" }
1242            })))
1243            .mount(&server)
1244            .await;
1245
1246        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
1247        let err = client
1248            .complete("system", "input", TEST_SCHEMA, None)
1249            .await
1250            .expect_err("provider error must surface");
1251        let msg = err.to_string();
1252        assert!(msg.contains("context length exceeded"), "got: {msg}");
1253        assert!(
1254            !msg.contains("no structured content"),
1255            "must not mask as empty choices: {msg}"
1256        );
1257        assert!(
1258            !msg.contains("missing field"),
1259            "must not mask as a missing field: {msg}"
1260        );
1261        assert_eq!(
1262            err.retry_class,
1263            AttemptOutcome::HardFailure,
1264            "code 400 (context length exceeded) is a permanent provider rejection"
1265        );
1266    }
1267
1268    #[tokio::test]
1269    async fn complete_classifies_provider_error_429_code_as_transient() {
1270        // Reauditor addendum: the provider error's structured `code` — never
1271        // its message — decides the retry verdict. A 429 code inside an
1272        // otherwise-2xx body is exactly as transient as an HTTP-level 429.
1273        let server = MockServer::start().await;
1274        Mock::given(method("POST"))
1275            .respond_with(ResponseTemplate::new(200).set_body_json(json!({
1276                "error": { "code": 429, "message": "rate limited" }
1277            })))
1278            .mount(&server)
1279            .await;
1280
1281        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
1282        let err = client
1283            .complete("system", "input", TEST_SCHEMA, None)
1284            .await
1285            .expect_err("provider rate-limit error must surface");
1286        assert_eq!(err.retry_class, AttemptOutcome::Transient);
1287    }
1288
1289    #[tokio::test]
1290    async fn complete_classifies_exhausted_5xx_retries_as_transient() {
1291        // GAP-SG-72-chat addendum: exhausting every retry against a
1292        // persistent 5xx is a TRANSIENT outcome (the queue's --max-attempts
1293        // is what eventually dead-letters it), never a HardFailure.
1294        let server = MockServer::start().await;
1295        Mock::given(method("POST"))
1296            .respond_with(ResponseTemplate::new(503))
1297            .mount(&server)
1298            .await;
1299
1300        let client = client_for(&server, "openai/gpt-oss-120b").await;
1301        let err = client
1302            .complete("system", "input", TEST_SCHEMA, None)
1303            .await
1304            .expect_err("persistent 5xx exhausts retries");
1305        assert_eq!(err.retry_class, AttemptOutcome::Transient);
1306    }
1307
1308    #[tokio::test]
1309    async fn complete_regrows_max_tokens_and_retries_on_length_truncation() {
1310        // GAP-SG-70: the first response is truncated (finish_reason="length")
1311        // with content that is NOT valid JSON on its own (proving the retry
1312        // happens BEFORE json_repair, not as a repair fallback). The second
1313        // response (after max_tokens grows) is well-formed and finishes
1314        // normally.
1315        let server = MockServer::start().await;
1316        Mock::given(method("POST"))
1317            .respond_with(
1318                ResponseTemplate::new(200).set_body_json(success_body_with_finish(
1319                    r#"{"entities":["trunc"#,
1320                    Some(0.001),
1321                    "length",
1322                )),
1323            )
1324            .up_to_n_times(1)
1325            .expect(1)
1326            .mount(&server)
1327            .await;
1328        Mock::given(method("POST"))
1329            .respond_with(
1330                ResponseTemplate::new(200).set_body_json(success_body_with_finish(
1331                    r#"{"entities":["rust"],"relationships":[]}"#,
1332                    Some(0.002),
1333                    "stop",
1334                )),
1335            )
1336            .expect(1)
1337            .mount(&server)
1338            .await;
1339
1340        let client = client_for(&server, "deepseek/deepseek-v4-flash:nitro").await;
1341        let completion = client
1342            .complete("system", "input", TEST_SCHEMA, Some(64))
1343            .await
1344            .expect("second attempt with grown max_tokens succeeds");
1345
1346        assert_eq!(
1347            completion.value,
1348            json!({"entities": ["rust"], "relationships": []})
1349        );
1350        assert_eq!(completion.finish_reason.as_deref(), Some("stop"));
1351
1352        let requests = server
1353            .received_requests()
1354            .await
1355            .expect("request recording is enabled");
1356        assert_eq!(requests.len(), 2, "expected exactly one regrowth retry");
1357        let first: serde_json::Value =
1358            serde_json::from_slice(&requests[0].body).expect("first request body is JSON");
1359        let second: serde_json::Value =
1360            serde_json::from_slice(&requests[1].body).expect("second request body is JSON");
1361        assert_eq!(first["max_tokens"], json!(64));
1362        assert_eq!(
1363            second["max_tokens"],
1364            json!(64 * crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR),
1365            "max_tokens must grow by ENRICH_MAX_TOKENS_GROWTH_FACTOR before the retry"
1366        );
1367    }
1368
1369    #[tokio::test]
1370    async fn complete_captures_finish_reason_and_tokens_on_success() {
1371        // GAP-SG-72-chat: a normal (non-truncated) completion still reports
1372        // finish_reason and both token counts to the caller.
1373        let server = MockServer::start().await;
1374        Mock::given(method("POST"))
1375            .respond_with(ResponseTemplate::new(200).set_body_json(json!({
1376                "choices": [{
1377                    "message": { "content": r#"{"ok":true}"# },
1378                    "finish_reason": "stop"
1379                }],
1380                "usage": { "cost": 0.001, "prompt_tokens": 120, "completion_tokens": 30 }
1381            })))
1382            .mount(&server)
1383            .await;
1384
1385        let client = client_for(&server, "z-ai/glm-5.2").await;
1386        let completion = client
1387            .complete("system", "input", TEST_SCHEMA, None)
1388            .await
1389            .expect("completion succeeds");
1390
1391        assert_eq!(completion.finish_reason.as_deref(), Some("stop"));
1392        assert_eq!(completion.prompt_tokens, Some(120));
1393        assert_eq!(completion.completion_tokens, Some(30));
1394    }
1395
1396    #[tokio::test]
1397    async fn complete_gives_up_after_exhausting_length_retries() {
1398        // GAP-SG-70: every attempt (primary + ENRICH_MAX_LENGTH_RETRIES
1399        // regrowth retries) reports finish_reason="length" with unparsable
1400        // content, so complete() must give up and return a ChatError instead
1401        // of retrying forever.
1402        let server = MockServer::start().await;
1403        Mock::given(method("POST"))
1404            .respond_with(
1405                ResponseTemplate::new(200).set_body_json(success_body_with_finish(
1406                    r#"[1, 2, 3"#,
1407                    Some(0.0),
1408                    "length",
1409                )),
1410            )
1411            .mount(&server)
1412            .await;
1413
1414        let client = client_for(&server, "deepseek/deepseek-v4-flash:nitro").await;
1415        let err = client
1416            .complete("system", "input", TEST_SCHEMA, Some(64))
1417            .await
1418            .expect_err("exhausted length retries must fail");
1419        assert_eq!(err.finish_reason.as_deref(), Some("length"));
1420        assert_eq!(
1421            err.retry_class,
1422            AttemptOutcome::Transient,
1423            "a repeatedly truncated response is a bounded-retry hiccup, not permanent"
1424        );
1425
1426        let requests = server
1427            .received_requests()
1428            .await
1429            .expect("request recording is enabled");
1430        assert_eq!(
1431            requests.len() as u32,
1432            crate::constants::ENRICH_MAX_LENGTH_RETRIES + 1,
1433            "expected the primary attempt plus every regrowth retry"
1434        );
1435    }
1436}