Skip to main content

sqlite_graphrag/
chat_api.rs

1//! HTTP client for the OpenRouter chat-completions API.
2//!
3//! Sends structured-output chat requests to the OpenAI-compatible endpoint
4//! at `openrouter.ai/api/v1/chat/completions` and returns the parsed JSON
5//! object the model produced under a strict `json_schema` `response_format`.
6//!
7//! This mirrors [`crate::embedding_api`] for the embeddings endpoint: same
8//! retry/backoff policy (immediate abort on 401/400/404, `retry-after` on
9//! 429, exponential backoff + jitter on 5xx) and the same minimal headers
10//! (only `Authorization: Bearer`, no `HTTP-Referer`/`X-Title`). The shared
11//! error envelope and backoff helper live in [`crate::openrouter_http`]
12//! (GAP-SG-74).
13//!
14//! v1.0.95 (ADR-0054): adds an OpenRouter REST transport for the `enrich`
15//! JUDGE so structured extraction no longer requires a locally installed
16//! `claude` / `codex` / `opencode` CLI subprocess.
17//!
//! v1.1.00 (GAP-SG-70/72-chat): the OpenAI-compatible contract surfaces
//! `choices[].finish_reason` and `usage.{prompt_tokens,completion_tokens}`.
//! `finish_reason == "length"` means the response was truncated because
//! `max_tokens` was too small — not a malformed generation.
//! [`OpenRouterChatClient::complete`] now detects this BEFORE attempting JSON
//! repair, grows `max_tokens` and re-issues the request (bounded by
//! [`crate::constants::ENRICH_MAX_LENGTH_RETRIES`]), and always reports the
//! diagnostics (`finish_reason`, token counts) to the caller via
//! [`ChatCompletion`] on success or [`ChatError`] on failure.
27
28use crate::errors::AppError;
29use crate::retry::AttemptOutcome;
30use secrecy::{ExposeSecret, SecretBox};
31use serde::{Deserialize, Serialize};
32use std::time::Duration;
33
/// OpenAI-compatible chat-completions endpoint used by the production
/// constructor.
const OPENROUTER_CHAT_URL: &str = "https://openrouter.ai/api/v1/chat/completions";

/// Total per-request budget, in seconds, used when a caller passes `0`.
///
/// GAP-SG-17: raised from 300 to 600 (ten minutes). Dense bodies near the
/// model's ~32K-token context ceiling regularly need more than five minutes
/// to generate.
const DEFAULT_TIMEOUT_SECS: u64 = 10 * 60;

/// Connection-establishment budget, in seconds.
const DEFAULT_CONNECT_TIMEOUT_SECS: u64 = 10;

/// Fixed `json_schema` name sent in the `response_format`. OpenRouter only
/// needs a short identifier here; the real contract travels in `schema`.
const SCHEMA_NAME: &str = "enrich_output";
44
45#[derive(Serialize)]
46struct ChatRequest<'a> {
47    model: &'a str,
48    messages: Vec<ChatMessage<'a>>,
49    response_format: ResponseFormat,
50    provider: ProviderPrefs,
51    #[serde(skip_serializing_if = "Option::is_none")]
52    reasoning: Option<ReasoningPrefs>,
53    #[serde(skip_serializing_if = "Option::is_none")]
54    max_tokens: Option<u32>,
55}
56
57#[derive(Serialize)]
58struct ChatMessage<'a> {
59    role: &'a str,
60    content: String,
61}
62
63#[derive(Serialize)]
64struct ResponseFormat {
65    #[serde(rename = "type")]
66    format_type: &'static str,
67    json_schema: JsonSchemaSpec,
68}
69
70#[derive(Serialize)]
71struct JsonSchemaSpec {
72    name: &'static str,
73    strict: bool,
74    schema: serde_json::Value,
75}
76
77#[derive(Serialize)]
78struct ProviderPrefs {
79    require_parameters: bool,
80}
81
82#[derive(Serialize)]
83struct ReasoningPrefs {
84    enabled: bool,
85}
86
87#[derive(Deserialize)]
88struct ChatResponse {
89    #[serde(default)]
90    choices: Vec<Choice>,
91    #[serde(default)]
92    usage: Option<Usage>,
93    /// Structured provider error. OpenRouter may return this inside an HTTP 200
94    /// body (e.g. token/context-length overflow); without it the response would
95    /// parse into empty `choices` and surface the misleading "no structured
96    /// content" error instead of the real cause (GAP-SG-03).
97    #[serde(default)]
98    error: Option<crate::openrouter_http::ApiError>,
99}
100
101#[derive(Deserialize)]
102struct Choice {
103    message: RespMessage,
104    /// Why the model stopped generating: `"stop"` on a normal completion,
105    /// `"length"` when `max_tokens` cut the response short (GAP-SG-70/72-chat).
106    /// Absent from providers that omit it, hence `#[serde(default)]`.
107    #[serde(default)]
108    finish_reason: Option<String>,
109}
110
111#[derive(Deserialize)]
112struct RespMessage {
113    #[serde(default)]
114    content: Option<String>,
115}
116
117#[derive(Deserialize)]
118struct Usage {
119    #[serde(default)]
120    cost: Option<f64>,
121    /// Prompt token count reported by OpenRouter (GAP-SG-72-chat). Diagnostic
122    /// only — never used to gate control flow, so a missing value stays `None`.
123    #[serde(default)]
124    prompt_tokens: Option<u32>,
125    /// Completion token count reported by OpenRouter (GAP-SG-72-chat), used
126    /// alongside `finish_reason` to explain a truncated response.
127    #[serde(default)]
128    completion_tokens: Option<u32>,
129}
130
131/// Successful [`OpenRouterChatClient::complete`] result (GAP-SG-72-chat).
132///
133/// `finish_reason`, `prompt_tokens` and `completion_tokens` are the raw
134/// diagnostics OpenRouter attached to the response that ultimately succeeded
135/// (after any `max_tokens` growth retries — see [`Self::value`] and the
136/// module docs). They are `None` only when the provider omitted them.
137#[derive(Debug)]
138pub struct ChatCompletion {
139    /// Model output parsed as JSON (guaranteed to be a JSON object).
140    pub value: serde_json::Value,
141    /// Cost in USD read from `usage.cost`, or `0.0` when the provider omitted it.
142    pub cost_usd: f64,
143    /// `choices[0].finish_reason` from the response that produced `value`.
144    pub finish_reason: Option<String>,
145    /// `usage.prompt_tokens` from the response that produced `value`.
146    pub prompt_tokens: Option<u32>,
147    /// `usage.completion_tokens` from the response that produced `value`.
148    pub completion_tokens: Option<u32>,
149}
150
151/// [`OpenRouterChatClient::complete`] failure (GAP-SG-72-chat / GAP-SG-72
152/// reauditor addendum).
153///
154/// Wraps the underlying [`AppError`] with whatever truncation diagnostics were
155/// available at the point of failure. `finish_reason`/token fields are `None`
156/// when the failure happened before a response was parsed (network error, a
157/// permanent 4xx, or exhausted retries) — only failures that occur AFTER a
158/// `ChatResponse` was successfully decoded (JSON-repair or shape-guard
159/// failures) carry them.
160///
161/// `retry_class` is the retry verdict computed AT THE ORIGIN (the exact HTTP
162/// status, or the provider's structured error `code`), never inferred
163/// downstream from `source.to_string()`. The enrich queue consumes this field
164/// directly instead of pattern-matching the formatted message.
165#[derive(Debug)]
166pub struct ChatError {
167    /// Underlying cause, preserved via `source()` rather than restated.
168    pub source: AppError,
169    /// `choices[0].finish_reason` from the response that led to this error,
170    /// when one was decoded.
171    pub finish_reason: Option<String>,
172    /// `usage.prompt_tokens` from the response that led to this error, when
173    /// one was decoded.
174    pub prompt_tokens: Option<u32>,
175    /// `usage.completion_tokens` from the response that led to this error,
176    /// when one was decoded.
177    pub completion_tokens: Option<u32>,
178    /// Typed retry verdict computed where the failure originated (HTTP
179    /// status / provider code), not by matching `source`'s message.
180    pub retry_class: AttemptOutcome,
181}
182
183impl ChatError {
184    /// Wraps `source` with no diagnostics attached (used when no
185    /// `ChatResponse` was decoded before the failure) and the `retry_class`
186    /// computed by the caller at the exact HTTP status / provider code.
187    fn new(source: AppError, retry_class: AttemptOutcome) -> Self {
188        Self {
189            source,
190            finish_reason: None,
191            prompt_tokens: None,
192            completion_tokens: None,
193            retry_class,
194        }
195    }
196
197    /// Wraps `source` with the diagnostics captured from a decoded
198    /// `ChatResponse` that nonetheless failed downstream (repair or
199    /// shape-guard), plus its `retry_class`.
200    fn with_diagnostics(
201        source: AppError,
202        finish_reason: Option<String>,
203        prompt_tokens: Option<u32>,
204        completion_tokens: Option<u32>,
205        retry_class: AttemptOutcome,
206    ) -> Self {
207        Self {
208            source,
209            finish_reason,
210            prompt_tokens,
211            completion_tokens,
212            retry_class,
213        }
214    }
215}
216
217impl std::fmt::Display for ChatError {
218    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
219        std::fmt::Display::fmt(&self.source, f)
220    }
221}
222
223impl std::error::Error for ChatError {
224    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
225        Some(&self.source)
226    }
227}
228
229/// Process-wide OpenRouter chat client. Holds the model name so that callers
230/// only thread the per-item prompt/schema/input through [`Self::complete`].
231pub struct OpenRouterChatClient {
232    client: reqwest::Client,
233    api_key: SecretBox<String>,
234    model: String,
235    /// Endpoint each request is POSTed to. Always [`OPENROUTER_CHAT_URL`] in
236    /// production; only the test-only [`Self::new_with_url`] constructor
237    /// repoints it at a local mock server.
238    base_url: String,
239}
240
241impl OpenRouterChatClient {
242    /// Builds a chat client bound to `model`, applying `timeout_secs` as the
243    /// total per-request budget (wired from `--openrouter-timeout`). A value of
244    /// `0` falls back to `DEFAULT_TIMEOUT_SECS` so a missing or zero flag never
245    /// degrades into reqwest`'s immediate-timeout behaviour.
246    pub fn new(
247        api_key: SecretBox<String>,
248        model: String,
249        timeout_secs: u64,
250    ) -> Result<Self, AppError> {
251        let timeout_secs = if timeout_secs == 0 {
252            DEFAULT_TIMEOUT_SECS
253        } else {
254            timeout_secs
255        };
256        let client = reqwest::Client::builder()
257            .timeout(Duration::from_secs(timeout_secs))
258            .connect_timeout(Duration::from_secs(DEFAULT_CONNECT_TIMEOUT_SECS))
259            .user_agent("sqlite-graphrag/1.1.00")
260            .build()
261            .map_err(|e| AppError::Validation(format!("failed to build HTTP client: {e}")))?;
262
263        Ok(Self {
264            client,
265            api_key,
266            model,
267            base_url: OPENROUTER_CHAT_URL.to_string(),
268        })
269    }
270
271    /// Test-only constructor that POSTs to an arbitrary `base_url` (such as a
272    /// `wiremock::MockServer`) instead of the public OpenRouter endpoint.
273    /// Behaviour is otherwise identical to [`Self::new`].
274    #[cfg(test)]
275    pub fn new_with_url(
276        api_key: SecretBox<String>,
277        model: String,
278        base_url: String,
279        timeout_secs: u64,
280    ) -> Result<Self, AppError> {
281        let mut client = Self::new(api_key, model, timeout_secs)?;
282        client.base_url = base_url;
283        Ok(client)
284    }
285
286    /// Returns the model bound to this client.
287    pub fn model(&self) -> &str {
288        &self.model
289    }
290
291    /// Runs a single structured-output completion, transparently growing
292    /// `max_tokens` and re-issuing the request when the model truncates its
293    /// output (GAP-SG-70).
294    ///
295    /// `schema_str` is the JSON Schema (as a string) the model must honour
296    /// under `strict: true`. When `input_text` is empty only the system
297    /// message is sent. `max_tokens` seeds the first attempt; `None` lets the
298    /// provider apply its own default.
299    ///
300    /// Returns [`ChatCompletion`] on success or [`ChatError`] on failure; both
301    /// carry `finish_reason`/token diagnostics when a response was decoded.
302    ///
303    /// # Errors
304    ///
305    /// Returns [`ChatError`] when: the schema is invalid JSON; the HTTP
306    /// request fails or exhausts retries; the provider returns a permanent
307    /// error (401/400/404, or a structured `error` object in a 2xx body); the
308    /// response carries no usable content; the content cannot be parsed as
309    /// JSON even after repair; the parsed JSON is not an object; or the
310    /// response is truncated (`finish_reason: "length"`) after
311    /// [`crate::constants::ENRICH_MAX_LENGTH_RETRIES`] `max_tokens` growth
312    /// attempts are exhausted.
313    pub async fn complete(
314        &self,
315        system_prompt: &str,
316        input_text: &str,
317        schema_str: &str,
318        max_tokens: Option<u32>,
319    ) -> Result<ChatCompletion, ChatError> {
320        // A malformed schema is a permanent caller/config error — classified
321        // explicitly (no blanket `From<AppError>` conversion exists for this
322        // type; every `ChatError` states its `retry_class` at construction).
323        let schema: serde_json::Value = serde_json::from_str(schema_str).map_err(|e| {
324            ChatError::new(
325                AppError::Validation(format!("invalid JSON schema for OpenRouter request: {e}")),
326                AttemptOutcome::HardFailure,
327            )
328        })?;
329
330        let mut current_max_tokens = max_tokens;
331
332        for length_attempt in 0..=crate::constants::ENRICH_MAX_LENGTH_RETRIES {
333            let response = self
334                .complete_one_attempt(&schema, system_prompt, input_text, current_max_tokens)
335                .await?;
336
337            let finish_reason = response
338                .choices
339                .first()
340                .and_then(|c| c.finish_reason.clone());
341            let prompt_tokens = response.usage.as_ref().and_then(|u| u.prompt_tokens);
342            let completion_tokens = response.usage.as_ref().and_then(|u| u.completion_tokens);
343
344            let truncated = finish_reason.as_deref() == Some("length");
345            let retries_left = length_attempt < crate::constants::ENRICH_MAX_LENGTH_RETRIES;
346
347            if truncated && retries_left {
348                let next_max_tokens = grow_max_tokens(current_max_tokens);
349                tracing::warn!(
350                    model = %self.model,
351                    attempt = length_attempt,
352                    previous_max_tokens = ?current_max_tokens,
353                    next_max_tokens,
354                    "OpenRouter completion truncated (finish_reason=length); \
355                     retrying with a larger max_tokens budget"
356                );
357                current_max_tokens = Some(next_max_tokens);
358                continue;
359            }
360
361            if truncated {
362                tracing::warn!(
363                    model = %self.model,
364                    max_length_retries = crate::constants::ENRICH_MAX_LENGTH_RETRIES,
365                    max_tokens = ?current_max_tokens,
366                    "OpenRouter completion still truncated after exhausting \
367                     max_tokens growth"
368                );
369            }
370
371            return self.finish_completion(
372                response,
373                finish_reason,
374                prompt_tokens,
375                completion_tokens,
376            );
377        }
378
379        unreachable!("loop always returns within ENRICH_MAX_LENGTH_RETRIES + 1 iterations")
380    }
381
382    /// Runs one HTTP attempt (including the mandatory-reasoning fallback) and
383    /// returns the decoded [`ChatResponse`] without inspecting `finish_reason`
384    /// or extracting content — that happens in [`Self::complete`] so the
385    /// `max_tokens` growth loop can re-issue the request first.
386    async fn complete_one_attempt(
387        &self,
388        schema: &serde_json::Value,
389        system_prompt: &str,
390        input_text: &str,
391        max_tokens: Option<u32>,
392    ) -> Result<ChatResponse, ChatError> {
393        // First attempt sends reasoning.enabled=false (token savings on the
394        // ~9 models that allow disabling). The ~4 reasoning-mandatory models
395        // (e.g. minimax-m2.7, gpt-oss-120b) reject it with HTTP 400 mentioning
396        // "reasoning"; on that specific failure we retry ONCE with the
397        // reasoning field omitted so the model uses its mandatory default. Any
398        // other error, or a second failure, propagates the original error.
399        let primary = self.build_request(
400            schema.clone(),
401            system_prompt,
402            input_text,
403            max_tokens,
404            Some(ReasoningPrefs { enabled: false }),
405        );
406        match self.execute_with_retry(&primary).await {
407            Ok(r) => Ok(r),
408            Err(first_err) => {
409                if reasoning_disable_rejected(&first_err) {
410                    tracing::warn!(
411                        model = %self.model,
412                        "model rejected reasoning.enabled=false (mandatory); \
413                         retrying once with reasoning omitted"
414                    );
415                    let fallback = self.build_request(
416                        schema.clone(),
417                        system_prompt,
418                        input_text,
419                        max_tokens,
420                        None,
421                    );
422                    match self.execute_with_retry(&fallback).await {
423                        Ok(r) => Ok(r),
424                        Err(_) => Err(first_err),
425                    }
426                } else {
427                    Err(first_err)
428                }
429            }
430        }
431    }
432
433    /// Extracts content, repairs/parses it as JSON, and enforces the
434    /// object-shape guard, attaching `finish_reason`/token diagnostics to any
435    /// failure.
436    ///
437    /// Every failure branch below (missing content, JSON-repair failure,
438    /// non-object shape) classifies as `AttemptOutcome::Transient`. This is a
439    /// deliberate, acknowledged tension with `rules_rust_retry_com_backoff.md`
440    /// ("NUNCA retentar erros de parsing ou deserialização" / "NUNCA retentar
441    /// erros de deserialização"): those rules target DETERMINISTIC parse
442    /// errors, where retrying the identical input reproduces the identical
443    /// failure. Here the "input" is `deepseek-v4-flash:nitro` sampling
444    /// variance — the SAME prompt can legitimately produce well-formed JSON
445    /// on the next generation (see GAP-SG-10). So this is a typed, bounded
446    /// hiccup, not a retry-forever loophole: it is capped by `--max-attempts`
447    /// (GAP-SG-09/GAP-SG-21) and dead-letters once attempts are exhausted.
448    fn finish_completion(
449        &self,
450        response: ChatResponse,
451        finish_reason: Option<String>,
452        prompt_tokens: Option<u32>,
453        completion_tokens: Option<u32>,
454    ) -> Result<ChatCompletion, ChatError> {
455        let content = response
456            .choices
457            .into_iter()
458            .next()
459            .and_then(|c| c.message.content)
460            .filter(|c| !c.trim().is_empty())
461            .ok_or_else(|| {
462                AppError::Validation(format!(
463                    "model '{}' returned no structured content (incompatible with \
464                     structured outputs, or refused the request)",
465                    self.model
466                ))
467            })
468            .map_err(|e| {
469                ChatError::with_diagnostics(
470                    e,
471                    finish_reason.clone(),
472                    prompt_tokens,
473                    completion_tokens,
474                    AttemptOutcome::Transient,
475                )
476            })?;
477
478        // GAP-SG-10: deepseek-v4-flash:nitro and similar models do not honour
479        // `json_schema` strict mode reliably — they wrap output in markdown
480        // fences, add trailing commas, or omit quotes around keys. Try a strict
481        // parse first (zero cost for well-formed JSON), then fall back to the
482        // repair pass (a Rust port of `json_repair`) before giving up.
483        let value = crate::json_repair::repair_to_value(&content).map_err(|e| {
484            ChatError::with_diagnostics(
485                AppError::Validation(format!(
486                    "model '{}' returned content that could not be parsed even after \
487                     JSON repair: {e}",
488                    self.model
489                )),
490                finish_reason.clone(),
491                prompt_tokens,
492                completion_tokens,
493                AttemptOutcome::Transient,
494            )
495        })?;
496
497        // GAP-SG-10: `llm_json` coerces aggressively — free text becomes a JSON
498        // string, empty input becomes `{}`, a lone delimiter becomes `null`. The
499        // enrich JUDGE contract is ALWAYS a JSON object, so a non-object result
500        // here is a malformed/refused generation, NOT a usable value. Reject it
501        // (the enrich classifier reclassifies this as a transient model hiccup,
502        // GAP-SG-09) instead of letting a coerced scalar masquerade as a
503        // valid-but-empty result downstream.
504        if !value.is_object() {
505            return Err(ChatError::with_diagnostics(
506                AppError::Validation(format!(
507                    "model '{}' returned non-object JSON after repair (got {}); \
508                     likely a refusal or malformed structured output",
509                    self.model,
510                    json_shape_name(&value)
511                )),
512                finish_reason,
513                prompt_tokens,
514                completion_tokens,
515                AttemptOutcome::Transient,
516            ));
517        }
518
519        let cost = response.usage.and_then(|u| u.cost).unwrap_or(0.0);
520
521        Ok(ChatCompletion {
522            value,
523            cost_usd: cost,
524            finish_reason,
525            prompt_tokens,
526            completion_tokens,
527        })
528    }
529
530    /// Builds a `ChatRequest` for one attempt. `reasoning` is `Some` on the
531    /// primary attempt (`enabled:false`) and `None` on the mandatory-reasoning
532    /// fallback, where the field is omitted entirely.
533    fn build_request<'a>(
534        &'a self,
535        schema: serde_json::Value,
536        system_prompt: &str,
537        input_text: &str,
538        max_tokens: Option<u32>,
539        reasoning: Option<ReasoningPrefs>,
540    ) -> ChatRequest<'a> {
541        let mut messages = Vec::with_capacity(2);
542        messages.push(ChatMessage {
543            role: "system",
544            content: system_prompt.to_string(),
545        });
546        if !input_text.is_empty() {
547            messages.push(ChatMessage {
548                role: "user",
549                content: input_text.to_string(),
550            });
551        }
552        ChatRequest {
553            model: &self.model,
554            messages,
555            response_format: ResponseFormat {
556                format_type: "json_schema",
557                json_schema: JsonSchemaSpec {
558                    name: SCHEMA_NAME,
559                    strict: true,
560                    schema,
561                },
562            },
563            provider: ProviderPrefs {
564                require_parameters: true,
565            },
566            reasoning,
567            max_tokens,
568        }
569    }
570
571    /// Runs the request/retry loop, classifying every failure into a
572    /// [`ChatError`] with `retry_class` set AT THE ORIGIN (the exact HTTP
573    /// status, or the provider's structured error code) — never inferred
574    /// downstream from a formatted message (reauditor addendum to
575    /// GAP-SG-72-chat).
576    async fn execute_with_retry(
577        &self,
578        request: &ChatRequest<'_>,
579    ) -> Result<ChatResponse, ChatError> {
580        let mut last_err: Option<ChatError> = None;
581
582        for attempt in 0..crate::openrouter_http::MAX_RETRIES {
583            let result = self
584                .client
585                .post(&self.base_url)
586                .header(
587                    "Authorization",
588                    format!("Bearer {}", self.api_key.expose_secret()),
589                )
590                .json(request)
591                .send()
592                .await;
593
594            let resp = match result {
595                Ok(r) => r,
596                Err(e) if e.is_timeout() => {
597                    return Err(ChatError::new(
598                        AppError::Validation("OpenRouter chat request timed out".into()),
599                        AttemptOutcome::Transient,
600                    ));
601                }
602                Err(e) => {
603                    last_err = Some(ChatError::new(
604                        AppError::Validation(format!("HTTP request failed: {e}")),
605                        AttemptOutcome::Transient,
606                    ));
607                    crate::openrouter_http::backoff(attempt).await;
608                    continue;
609                }
610            };
611
612            let status = resp.status();
613
614            if status.is_success() {
615                let body = resp.text().await.map_err(|e| {
616                    ChatError::new(
617                        AppError::Validation(format!("failed to read response body: {e}")),
618                        AttemptOutcome::Transient,
619                    )
620                })?;
621                match serde_json::from_str::<ChatResponse>(&body) {
622                    Ok(parsed) => {
623                        // A structured error object inside a 2xx body is
624                        // classified by its own `code` (GAP-SG-03 surfaces
625                        // the real code/message instead of letting empty
626                        // choices masquerade as no-structured-content).
627                        if let Some(api_err) = parsed.error {
628                            let retry_class =
629                                crate::openrouter_http::provider_error_retry_class(&api_err);
630                            return Err(ChatError::new(
631                                AppError::ProviderError {
632                                    code: api_err.code_string(),
633                                    message: api_err.message,
634                                },
635                                retry_class,
636                            ));
637                        }
638                        return Ok(parsed);
639                    }
640                    Err(e) => {
641                        tracing::warn!(
642                            attempt,
643                            body_len = body.len(),
644                            "HTTP 200 but parse failed (retrying): {e}"
645                        );
646                        last_err = Some(ChatError::new(
647                            AppError::Validation(format!("failed to parse chat response: {e}")),
648                            AttemptOutcome::Transient,
649                        ));
650                        crate::openrouter_http::backoff(attempt).await;
651                        continue;
652                    }
653                }
654            }
655
656            if status.as_u16() == 401 {
657                return Err(ChatError::new(
658                    AppError::Validation("invalid OpenRouter API key (HTTP 401)".into()),
659                    AttemptOutcome::HardFailure,
660                ));
661            }
662
663            if status.as_u16() == 400 || status.as_u16() == 404 {
664                let body = resp.text().await.unwrap_or_default();
665                return Err(ChatError::new(
666                    AppError::Validation(format!(
667                        "OpenRouter returned {status} for model '{}': {body}",
668                        self.model
669                    )),
670                    AttemptOutcome::HardFailure,
671                ));
672            }
673
674            if status.as_u16() == 429 {
675                let retry_after = resp
676                    .headers()
677                    .get("retry-after")
678                    .and_then(|v| v.to_str().ok())
679                    .and_then(|v| v.parse::<u64>().ok())
680                    .unwrap_or(2);
681                tracing::warn!(
682                    attempt,
683                    retry_after_secs = retry_after,
684                    "OpenRouter rate limited, waiting"
685                );
686                // GAP-SG-56: surface the Retry-After delay to the caller. If
687                // every attempt is rate limited, the loop exits with this
688                // RateLimited error (retryable) carrying the server-advised
689                // wait, instead of a generic max-retries-exceeded message.
690                last_err = Some(ChatError::new(
691                    AppError::RateLimited {
692                        detail: format!("OpenRouter HTTP 429 (retry-after {retry_after}s)"),
693                    },
694                    AttemptOutcome::Transient,
695                ));
696                tokio::time::sleep(Duration::from_secs(retry_after)).await;
697                continue;
698            }
699
700            if status.is_server_error() {
701                tracing::warn!(attempt, status = %status, "OpenRouter server error, retrying");
702                last_err = Some(ChatError::new(
703                    AppError::Validation(format!("OpenRouter server error: {status}")),
704                    AttemptOutcome::Transient,
705                ));
706                crate::openrouter_http::backoff(attempt).await;
707                continue;
708            }
709
710            let body = resp.text().await.unwrap_or_default();
711            return Err(ChatError::new(
712                AppError::Validation(format!("unexpected HTTP {status}: {body}")),
713                crate::openrouter_http::status_retry_class(status),
714            ));
715        }
716
717        // GAP-SG-72-chat addendum: exhausting every retry against a
718        // transient condition (429/5xx/timeout/network) is ITSELF transient
719        // — it is exactly the case the queue's `--max-attempts` backoff
720        // covers, and must never be reclassified as a permanent failure.
721        Err(last_err.unwrap_or_else(|| {
722            ChatError::new(
723                AppError::Validation("max retries exceeded for OpenRouter chat request".into()),
724                AttemptOutcome::Transient,
725            )
726        }))
727    }
728}
729
730/// Grows `current` for the next `max_tokens` retry after a truncated
731/// (`finish_reason: "length"`) response (GAP-SG-70/71). When `current` is
732/// `None` the caller left the provider default in place, so growth starts
733/// from [`crate::constants::ENRICH_INITIAL_MAX_TOKENS`] instead of an unknown
734/// base. The result is always capped at
735/// [`crate::constants::ENRICH_MAX_TOKENS_CEILING`].
736fn grow_max_tokens(current: Option<u32>) -> u32 {
737    let base = current.unwrap_or(crate::constants::ENRICH_INITIAL_MAX_TOKENS);
738    base.saturating_mul(crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR)
739        .min(crate::constants::ENRICH_MAX_TOKENS_CEILING)
740}
741
742/// True when an error from `execute_with_retry` indicates the model rejected
743/// `reasoning.enabled=false` because reasoning is mandatory: an HTTP 400 whose
744/// body mentions "reasoning" (case-insensitive). Triggers the one-shot retry
745/// with the `reasoning` field omitted.
746///
747/// This IS a legitimate, narrowly-scoped substring check on the underlying
748/// `AppError`'s message — not a retry-classification decision (that lives in
749/// `ChatError.retry_class`, computed at the origin). It only decides whether
750/// to attempt the mandatory-reasoning fallback shape, an orthogonal concern.
751fn reasoning_disable_rejected(err: &ChatError) -> bool {
752    let msg = err.source.to_string().to_lowercase();
753    msg.contains("400") && msg.contains("reasoning")
754}
755
756/// Names the JSON shape of `value` for diagnostics (GAP-SG-10). Used when the
757/// repaired model output is not the object the enrich JUDGE contract requires.
758fn json_shape_name(value: &serde_json::Value) -> &'static str {
759    match value {
760        serde_json::Value::Null => "null",
761        serde_json::Value::Bool(_) => "boolean",
762        serde_json::Value::Number(_) => "number",
763        serde_json::Value::String(_) => "string",
764        serde_json::Value::Array(_) => "array",
765        serde_json::Value::Object(_) => "object",
766    }
767}
768
769#[cfg(test)]
770mod tests {
771    use super::*;
772    use serde_json::json;
773    use wiremock::matchers::{body_partial_json, method, path};
774    use wiremock::{Mock, MockServer, ResponseTemplate};
775
    /// Minimal JSON Schema (a bare `object` type) that the tests in this
    /// module pass as the schema argument of `complete`.
    const TEST_SCHEMA: &str = r#"{"type":"object"}"#;
777
778    fn key() -> SecretBox<String> {
779        SecretBox::new(Box::new("test-key".to_string()))
780    }
781
782    /// Builds a chat-completions success body whose single choice carries the
783    /// model output as a JSON *string* (the double-encoding the real API uses
784    /// under structured outputs), optionally attaching `usage.cost` and a
785    /// `finish_reason` (defaults to `"stop"`).
786    fn success_body(content: &str, cost: Option<f64>) -> serde_json::Value {
787        success_body_with_finish(content, cost, "stop")
788    }
789
790    /// Same as [`success_body`] but with an explicit `finish_reason`, used by
791    /// the GAP-SG-70 truncation tests.
792    fn success_body_with_finish(
793        content: &str,
794        cost: Option<f64>,
795        finish_reason: &str,
796    ) -> serde_json::Value {
797        let mut body = json!({
798            "choices": [{ "message": { "content": content }, "finish_reason": finish_reason }]
799        });
800        if let Some(c) = cost {
801            body["usage"] = json!({ "cost": c });
802        }
803        body
804    }
805
806    async fn client_for(server: &MockServer, model: &str) -> OpenRouterChatClient {
807        OpenRouterChatClient::new_with_url(
808            key(),
809            model.to_string(),
810            format!("{}/chat/completions", server.uri()),
811            30,
812        )
813        .expect("test client builds")
814    }
815
816    #[test]
817    fn new_builds_client_and_binds_model() {
818        let client = OpenRouterChatClient::new(key(), "z-ai/glm-5.2".to_string(), 30)
819            .expect("client builds");
820        assert_eq!(client.model(), "z-ai/glm-5.2");
821    }
822
823    #[test]
824    fn new_defaults_base_url_to_public_endpoint() {
825        let client = OpenRouterChatClient::new(key(), "z-ai/glm-5.2".to_string(), 30)
826            .expect("client builds");
827        assert_eq!(client.base_url, OPENROUTER_CHAT_URL);
828    }
829
830    #[test]
831    fn request_serializes_with_strict_schema_and_disabled_reasoning() {
832        let request = ChatRequest {
833            model: "deepseek/deepseek-v4-flash",
834            messages: vec![ChatMessage {
835                role: "system",
836                content: "extract".to_string(),
837            }],
838            response_format: ResponseFormat {
839                format_type: "json_schema",
840                json_schema: JsonSchemaSpec {
841                    name: SCHEMA_NAME,
842                    strict: true,
843                    schema: serde_json::json!({"type": "object"}),
844                },
845            },
846            provider: ProviderPrefs {
847                require_parameters: true,
848            },
849            reasoning: Some(ReasoningPrefs { enabled: false }),
850            max_tokens: None,
851        };
852        let json = serde_json::to_value(&request).expect("serializes");
853        assert_eq!(json["response_format"]["type"], "json_schema");
854        assert_eq!(json["response_format"]["json_schema"]["strict"], true);
855        assert_eq!(json["provider"]["require_parameters"], true);
856        assert_eq!(json["reasoning"]["enabled"], false);
857        // max_tokens omitted when None
858        assert!(json.get("max_tokens").is_none());
859    }
860
861    #[test]
862    fn grow_max_tokens_uses_initial_default_when_current_is_none() {
863        assert_eq!(
864            grow_max_tokens(None),
865            crate::constants::ENRICH_INITIAL_MAX_TOKENS
866                * crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR
867        );
868    }
869
870    #[test]
871    fn grow_max_tokens_caps_at_ceiling() {
872        assert_eq!(
873            grow_max_tokens(Some(crate::constants::ENRICH_MAX_TOKENS_CEILING)),
874            crate::constants::ENRICH_MAX_TOKENS_CEILING
875        );
876        assert_eq!(
877            grow_max_tokens(Some(u32::MAX)),
878            crate::constants::ENRICH_MAX_TOKENS_CEILING
879        );
880    }
881
882    #[tokio::test]
883    async fn complete_sends_wellformed_request_and_parses_content() {
884        let server = MockServer::start().await;
885        Mock::given(method("POST"))
886            .and(path("/chat/completions"))
887            .and(body_partial_json(json!({
888                "model": "deepseek/deepseek-v4-flash",
889                "response_format": {
890                    "type": "json_schema",
891                    "json_schema": { "name": "enrich_output", "strict": true }
892                },
893                "provider": { "require_parameters": true },
894                "reasoning": { "enabled": false }
895            })))
896            .respond_with(ResponseTemplate::new(200).set_body_json(success_body(
897                r#"{"entities":[],"relationships":[]}"#,
898                Some(0.0023),
899            )))
900            .expect(1)
901            .mount(&server)
902            .await;
903
904        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
905        let completion = client
906            .complete("system", "input", TEST_SCHEMA, None)
907            .await
908            .expect("completion succeeds");
909
910        assert_eq!(
911            completion.value,
912            json!({"entities": [], "relationships": []})
913        );
914        assert!((completion.cost_usd - 0.0023).abs() < f64::EPSILON);
915        assert_eq!(completion.finish_reason.as_deref(), Some("stop"));
916    }
917
918    #[tokio::test]
919    async fn complete_defaults_cost_to_zero_when_usage_absent() {
920        let server = MockServer::start().await;
921        Mock::given(method("POST"))
922            .respond_with(
923                ResponseTemplate::new(200).set_body_json(success_body(r#"{"entities":[]}"#, None)),
924            )
925            .mount(&server)
926            .await;
927
928        let client = client_for(&server, "z-ai/glm-5.2").await;
929        let completion = client
930            .complete("system", "", TEST_SCHEMA, Some(4096))
931            .await
932            .expect("completion succeeds");
933        assert_eq!(completion.cost_usd, 0.0);
934    }
935
936    #[tokio::test]
937    async fn complete_retries_on_429_honouring_retry_after() {
938        let server = MockServer::start().await;
939        Mock::given(method("POST"))
940            .respond_with(ResponseTemplate::new(429).insert_header("retry-after", "1"))
941            .up_to_n_times(1)
942            .expect(1)
943            .mount(&server)
944            .await;
945        Mock::given(method("POST"))
946            .respond_with(
947                ResponseTemplate::new(200).set_body_json(success_body(r#"{"ok":true}"#, Some(0.0))),
948            )
949            .expect(1)
950            .mount(&server)
951            .await;
952
953        let client = client_for(&server, "minimax/minimax-m3").await;
954        let completion = client
955            .complete("system", "input", TEST_SCHEMA, None)
956            .await
957            .expect("retried completion succeeds");
958        assert_eq!(completion.value, json!({"ok": true}));
959    }
960
961    #[tokio::test]
962    async fn complete_retries_on_5xx_with_backoff() {
963        let server = MockServer::start().await;
964        Mock::given(method("POST"))
965            .respond_with(ResponseTemplate::new(503))
966            .up_to_n_times(1)
967            .expect(1)
968            .mount(&server)
969            .await;
970        Mock::given(method("POST"))
971            .respond_with(
972                ResponseTemplate::new(200).set_body_json(success_body(r#"{"ok":1}"#, Some(0.0))),
973            )
974            .expect(1)
975            .mount(&server)
976            .await;
977
978        let client = client_for(&server, "openai/gpt-oss-120b").await;
979        let completion = client
980            .complete("system", "input", TEST_SCHEMA, None)
981            .await
982            .expect("retried completion succeeds");
983        assert_eq!(completion.value, json!({"ok": 1}));
984    }
985
986    #[tokio::test]
987    async fn complete_401_is_permanent_without_retry() {
988        let server = MockServer::start().await;
989        Mock::given(method("POST"))
990            .respond_with(ResponseTemplate::new(401))
991            .expect(1)
992            .mount(&server)
993            .await;
994
995        let client = client_for(&server, "z-ai/glm-5.2").await;
996        let err = client
997            .complete("system", "input", TEST_SCHEMA, None)
998            .await
999            .expect_err("401 is an error");
1000        assert!(err.to_string().contains("401"), "got: {err}");
1001        assert_eq!(err.retry_class, AttemptOutcome::HardFailure);
1002    }
1003
1004    #[tokio::test]
1005    async fn complete_400_returns_body_and_model_without_retry() {
1006        let server = MockServer::start().await;
1007        Mock::given(method("POST"))
1008            .respond_with(ResponseTemplate::new(400).set_body_string("schema not supported"))
1009            .expect(1)
1010            .mount(&server)
1011            .await;
1012
1013        let client = client_for(&server, "xiaomi/mimo-v2.5").await;
1014        let err = client
1015            .complete("system", "input", TEST_SCHEMA, None)
1016            .await
1017            .expect_err("400 is an error");
1018        let msg = err.to_string();
1019        assert!(msg.contains("400"), "got: {msg}");
1020        assert!(msg.contains("xiaomi/mimo-v2.5"), "got: {msg}");
1021        assert!(msg.contains("schema not supported"), "got: {msg}");
1022        assert_eq!(err.retry_class, AttemptOutcome::HardFailure);
1023    }
1024
1025    #[tokio::test]
1026    async fn complete_empty_choices_errors_citing_model() {
1027        let server = MockServer::start().await;
1028        Mock::given(method("POST"))
1029            .respond_with(ResponseTemplate::new(200).set_body_json(json!({ "choices": [] })))
1030            .mount(&server)
1031            .await;
1032
1033        let client = client_for(&server, "minimax/minimax-m2.7").await;
1034        let err = client
1035            .complete("system", "input", TEST_SCHEMA, None)
1036            .await
1037            .expect_err("empty choices is an error");
1038        let msg = err.to_string();
1039        assert!(msg.contains("minimax/minimax-m2.7"), "got: {msg}");
1040        assert!(msg.contains("no structured content"), "got: {msg}");
1041        assert_eq!(err.finish_reason, None);
1042        assert_eq!(
1043            err.retry_class,
1044            AttemptOutcome::Transient,
1045            "no-content is a model hiccup, not a permanent rejection"
1046        );
1047    }
1048
1049    #[tokio::test]
1050    async fn complete_empty_content_errors() {
1051        let server = MockServer::start().await;
1052        Mock::given(method("POST"))
1053            .respond_with(ResponseTemplate::new(200).set_body_json(success_body("   ", Some(0.0))))
1054            .mount(&server)
1055            .await;
1056
1057        let client = client_for(&server, "z-ai/glm-5.2:nitro").await;
1058        let err = client
1059            .complete("system", "input", TEST_SCHEMA, None)
1060            .await
1061            .expect_err("blank content is an error");
1062        assert!(
1063            err.to_string().contains("no structured content"),
1064            "got: {err}"
1065        );
1066    }
1067
1068    #[tokio::test]
1069    async fn complete_non_json_content_errors_as_incompatible() {
1070        // GAP-SG-10: free text is coerced by the repair pass into a JSON string
1071        // (not an object), so it is rejected by the shape guard rather than the
1072        // strict-parse error. The message names the offending shape + model.
1073        let server = MockServer::start().await;
1074        Mock::given(method("POST"))
1075            .respond_with(
1076                ResponseTemplate::new(200)
1077                    .set_body_json(success_body("this is not json", Some(0.0))),
1078            )
1079            .mount(&server)
1080            .await;
1081
1082        let client = client_for(&server, "google/gemini-3.1-flash-lite").await;
1083        let err = client
1084            .complete("system", "input", TEST_SCHEMA, None)
1085            .await
1086            .expect_err("non-json content is an error");
1087        let msg = err.to_string();
1088        assert!(msg.contains("non-object JSON after repair"), "got: {msg}");
1089        assert!(msg.contains("google/gemini-3.1-flash-lite"), "got: {msg}");
1090    }
1091
1092    #[tokio::test]
1093    async fn complete_repairs_markdown_fenced_object() {
1094        // GAP-SG-10: a model that wraps a valid object in a ```json fence (a
1095        // common deepseek-v4-flash:nitro defect) is repaired and parsed instead
1096        // of being rejected as non-JSON.
1097        let server = MockServer::start().await;
1098        Mock::given(method("POST"))
1099            .respond_with(ResponseTemplate::new(200).set_body_json(success_body(
1100                "```json\n{\"entities\":[\"rust\"],\"relationships\":[]}\n```",
1101                Some(0.0),
1102            )))
1103            .mount(&server)
1104            .await;
1105
1106        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
1107        let completion = client
1108            .complete("system", "input", TEST_SCHEMA, None)
1109            .await
1110            .expect("fenced object is repaired");
1111        assert_eq!(
1112            completion.value,
1113            json!({"entities": ["rust"], "relationships": []})
1114        );
1115    }
1116
1117    #[tokio::test]
1118    async fn complete_rejects_invalid_schema_before_network() {
1119        // No mock mounted: an unreachable URL proves we never hit the network.
1120        let client = OpenRouterChatClient::new_with_url(
1121            key(),
1122            "z-ai/glm-5.2".to_string(),
1123            "http://127.0.0.1:1/chat/completions".to_string(),
1124            30,
1125        )
1126        .expect("client builds");
1127        let err = client
1128            .complete("system", "input", "{not valid json", None)
1129            .await
1130            .expect_err("invalid schema is rejected");
1131        assert!(
1132            err.to_string().contains("invalid JSON schema"),
1133            "got: {err}"
1134        );
1135        assert_eq!(
1136            err.retry_class,
1137            AttemptOutcome::HardFailure,
1138            "a malformed schema is a permanent caller error"
1139        );
1140    }
1141
1142    #[tokio::test]
1143    async fn complete_retries_with_reasoning_omitted_when_mandatory() {
1144        let server = MockServer::start().await;
1145        // Primary attempt (reasoning.enabled=false) is rejected with a 400 whose
1146        // body mentions "reasoning" — the mandatory-reasoning signal that drives
1147        // the one-shot fallback.
1148        Mock::given(method("POST"))
1149            .respond_with(
1150                ResponseTemplate::new(400).set_body_string(
1151                    "reasoning is mandatory for this model and cannot be disabled",
1152                ),
1153            )
1154            .up_to_n_times(1)
1155            .expect(1)
1156            .mount(&server)
1157            .await;
1158        // Fallback attempt (reasoning field omitted) succeeds.
1159        Mock::given(method("POST"))
1160            .respond_with(ResponseTemplate::new(200).set_body_json(success_body(
1161                r#"{"entities":[],"relationships":[]}"#,
1162                Some(0.0),
1163            )))
1164            .expect(1)
1165            .mount(&server)
1166            .await;
1167
1168        let client = client_for(&server, "minimax/minimax-m2.7").await;
1169        let completion = client
1170            .complete("system", "input", TEST_SCHEMA, None)
1171            .await
1172            .expect("fallback completion succeeds");
1173        assert_eq!(
1174            completion.value,
1175            json!({"entities": [], "relationships": []})
1176        );
1177
1178        // Exactly two requests were sent: the FIRST carries reasoning.enabled=false,
1179        // the SECOND (fallback) OMITS the reasoning field entirely.
1180        let requests = server
1181            .received_requests()
1182            .await
1183            .expect("request recording is enabled");
1184        assert_eq!(requests.len(), 2, "expected primary + fallback requests");
1185        let first: serde_json::Value =
1186            serde_json::from_slice(&requests[0].body).expect("first request body is JSON");
1187        let second: serde_json::Value =
1188            serde_json::from_slice(&requests[1].body).expect("second request body is JSON");
1189        assert_eq!(
1190            first["reasoning"]["enabled"],
1191            json!(false),
1192            "primary request must send reasoning.enabled=false"
1193        );
1194        assert!(
1195            second.get("reasoning").is_none(),
1196            "fallback request must omit the reasoning field, got: {second}"
1197        );
1198    }
1199
1200    #[tokio::test]
1201    async fn complete_honours_configured_timeout() {
1202        // A 1s client timeout against a server that delays 2s proves the
1203        // --openrouter-timeout value is wired into the reqwest builder instead
1204        // of the fixed 300s default (regression: the flag was silently ignored).
1205        let server = MockServer::start().await;
1206        Mock::given(method("POST"))
1207            .respond_with(
1208                ResponseTemplate::new(200)
1209                    .set_delay(std::time::Duration::from_secs(2))
1210                    .set_body_json(success_body(r#"{"ok":1}"#, Some(0.0))),
1211            )
1212            .mount(&server)
1213            .await;
1214
1215        let client = OpenRouterChatClient::new_with_url(
1216            key(),
1217            "z-ai/glm-5.2".to_string(),
1218            format!("{}/chat/completions", server.uri()),
1219            1,
1220        )
1221        .expect("client builds");
1222        let err = client
1223            .complete("system", "input", TEST_SCHEMA, None)
1224            .await
1225            .expect_err("request exceeds the 1s timeout");
1226        assert!(err.to_string().contains("timed out"), "got: {err}");
1227    }
1228
1229    #[tokio::test]
1230    async fn complete_surfaces_provider_error_in_200_body() {
1231        // GAP-SG-03: an HTTP 200 whose body is a structured OpenRouter error
1232        // (token/context-length overflow) must surface the REAL message, not
1233        // the misleading no-structured-content from empty choices.
1234        let server = MockServer::start().await;
1235        Mock::given(method("POST"))
1236            .respond_with(ResponseTemplate::new(200).set_body_json(json!({
1237                "error": { "code": 400, "message": "context length exceeded" }
1238            })))
1239            .mount(&server)
1240            .await;
1241
1242        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
1243        let err = client
1244            .complete("system", "input", TEST_SCHEMA, None)
1245            .await
1246            .expect_err("provider error must surface");
1247        let msg = err.to_string();
1248        assert!(msg.contains("context length exceeded"), "got: {msg}");
1249        assert!(
1250            !msg.contains("no structured content"),
1251            "must not mask as empty choices: {msg}"
1252        );
1253        assert!(
1254            !msg.contains("missing field"),
1255            "must not mask as a missing field: {msg}"
1256        );
1257        assert_eq!(
1258            err.retry_class,
1259            AttemptOutcome::HardFailure,
1260            "code 400 (context length exceeded) is a permanent provider rejection"
1261        );
1262    }
1263
1264    #[tokio::test]
1265    async fn complete_classifies_provider_error_429_code_as_transient() {
1266        // Reauditor addendum: the provider error's structured `code` — never
1267        // its message — decides the retry verdict. A 429 code inside an
1268        // otherwise-2xx body is exactly as transient as an HTTP-level 429.
1269        let server = MockServer::start().await;
1270        Mock::given(method("POST"))
1271            .respond_with(ResponseTemplate::new(200).set_body_json(json!({
1272                "error": { "code": 429, "message": "rate limited" }
1273            })))
1274            .mount(&server)
1275            .await;
1276
1277        let client = client_for(&server, "deepseek/deepseek-v4-flash").await;
1278        let err = client
1279            .complete("system", "input", TEST_SCHEMA, None)
1280            .await
1281            .expect_err("provider rate-limit error must surface");
1282        assert_eq!(err.retry_class, AttemptOutcome::Transient);
1283    }
1284
1285    #[tokio::test]
1286    async fn complete_classifies_exhausted_5xx_retries_as_transient() {
1287        // GAP-SG-72-chat addendum: exhausting every retry against a
1288        // persistent 5xx is a TRANSIENT outcome (the queue's --max-attempts
1289        // is what eventually dead-letters it), never a HardFailure.
1290        let server = MockServer::start().await;
1291        Mock::given(method("POST"))
1292            .respond_with(ResponseTemplate::new(503))
1293            .mount(&server)
1294            .await;
1295
1296        let client = client_for(&server, "openai/gpt-oss-120b").await;
1297        let err = client
1298            .complete("system", "input", TEST_SCHEMA, None)
1299            .await
1300            .expect_err("persistent 5xx exhausts retries");
1301        assert_eq!(err.retry_class, AttemptOutcome::Transient);
1302    }
1303
1304    #[tokio::test]
1305    async fn complete_regrows_max_tokens_and_retries_on_length_truncation() {
1306        // GAP-SG-70: the first response is truncated (finish_reason="length")
1307        // with content that is NOT valid JSON on its own (proving the retry
1308        // happens BEFORE json_repair, not as a repair fallback). The second
1309        // response (after max_tokens grows) is well-formed and finishes
1310        // normally.
1311        let server = MockServer::start().await;
1312        Mock::given(method("POST"))
1313            .respond_with(
1314                ResponseTemplate::new(200).set_body_json(success_body_with_finish(
1315                    r#"{"entities":["trunc"#,
1316                    Some(0.001),
1317                    "length",
1318                )),
1319            )
1320            .up_to_n_times(1)
1321            .expect(1)
1322            .mount(&server)
1323            .await;
1324        Mock::given(method("POST"))
1325            .respond_with(
1326                ResponseTemplate::new(200).set_body_json(success_body_with_finish(
1327                    r#"{"entities":["rust"],"relationships":[]}"#,
1328                    Some(0.002),
1329                    "stop",
1330                )),
1331            )
1332            .expect(1)
1333            .mount(&server)
1334            .await;
1335
1336        let client = client_for(&server, "deepseek/deepseek-v4-flash:nitro").await;
1337        let completion = client
1338            .complete("system", "input", TEST_SCHEMA, Some(64))
1339            .await
1340            .expect("second attempt with grown max_tokens succeeds");
1341
1342        assert_eq!(
1343            completion.value,
1344            json!({"entities": ["rust"], "relationships": []})
1345        );
1346        assert_eq!(completion.finish_reason.as_deref(), Some("stop"));
1347
1348        let requests = server
1349            .received_requests()
1350            .await
1351            .expect("request recording is enabled");
1352        assert_eq!(requests.len(), 2, "expected exactly one regrowth retry");
1353        let first: serde_json::Value =
1354            serde_json::from_slice(&requests[0].body).expect("first request body is JSON");
1355        let second: serde_json::Value =
1356            serde_json::from_slice(&requests[1].body).expect("second request body is JSON");
1357        assert_eq!(first["max_tokens"], json!(64));
1358        assert_eq!(
1359            second["max_tokens"],
1360            json!(64 * crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR),
1361            "max_tokens must grow by ENRICH_MAX_TOKENS_GROWTH_FACTOR before the retry"
1362        );
1363    }
1364
1365    #[tokio::test]
1366    async fn complete_captures_finish_reason_and_tokens_on_success() {
1367        // GAP-SG-72-chat: a normal (non-truncated) completion still reports
1368        // finish_reason and both token counts to the caller.
1369        let server = MockServer::start().await;
1370        Mock::given(method("POST"))
1371            .respond_with(ResponseTemplate::new(200).set_body_json(json!({
1372                "choices": [{
1373                    "message": { "content": r#"{"ok":true}"# },
1374                    "finish_reason": "stop"
1375                }],
1376                "usage": { "cost": 0.001, "prompt_tokens": 120, "completion_tokens": 30 }
1377            })))
1378            .mount(&server)
1379            .await;
1380
1381        let client = client_for(&server, "z-ai/glm-5.2").await;
1382        let completion = client
1383            .complete("system", "input", TEST_SCHEMA, None)
1384            .await
1385            .expect("completion succeeds");
1386
1387        assert_eq!(completion.finish_reason.as_deref(), Some("stop"));
1388        assert_eq!(completion.prompt_tokens, Some(120));
1389        assert_eq!(completion.completion_tokens, Some(30));
1390    }
1391
1392    #[tokio::test]
1393    async fn complete_gives_up_after_exhausting_length_retries() {
1394        // GAP-SG-70: every attempt (primary + ENRICH_MAX_LENGTH_RETRIES
1395        // regrowth retries) reports finish_reason="length" with unparsable
1396        // content, so complete() must give up and return a ChatError instead
1397        // of retrying forever.
1398        let server = MockServer::start().await;
1399        Mock::given(method("POST"))
1400            .respond_with(
1401                ResponseTemplate::new(200).set_body_json(success_body_with_finish(
1402                    r#"[1, 2, 3"#,
1403                    Some(0.0),
1404                    "length",
1405                )),
1406            )
1407            .mount(&server)
1408            .await;
1409
1410        let client = client_for(&server, "deepseek/deepseek-v4-flash:nitro").await;
1411        let err = client
1412            .complete("system", "input", TEST_SCHEMA, Some(64))
1413            .await
1414            .expect_err("exhausted length retries must fail");
1415        assert_eq!(err.finish_reason.as_deref(), Some("length"));
1416        assert_eq!(
1417            err.retry_class,
1418            AttemptOutcome::Transient,
1419            "a repeatedly truncated response is a bounded-retry hiccup, not permanent"
1420        );
1421
1422        let requests = server
1423            .received_requests()
1424            .await
1425            .expect("request recording is enabled");
1426        assert_eq!(
1427            requests.len() as u32,
1428            crate::constants::ENRICH_MAX_LENGTH_RETRIES + 1,
1429            "expected the primary attempt plus every regrowth retry"
1430        );
1431    }
1432}