sqlite_graphrag/
chat_api.rs

1//! HTTP client for the OpenRouter chat-completions API.
2//!
3//! Sends structured-output chat requests to the OpenAI-compatible endpoint
4//! at `openrouter.ai/api/v1/chat/completions` and returns the parsed JSON
5//! object the model produced under a strict `json_schema` `response_format`.
6//!
7//! This mirrors [`crate::embedding_api`] for the embeddings endpoint: same
8//! retry/backoff policy (immediate abort on 401/400/404, `retry-after` on
9//! 429, exponential backoff + jitter on 5xx) and the same minimal headers
10//! (only `Authorization: Bearer`, no `HTTP-Referer`/`X-Title`). The shared
11//! error envelope and backoff helper live in [`crate::openrouter_http`]
12//! (GAP-SG-74).
13//!
14//! v1.0.95 (ADR-0054): adds an OpenRouter REST transport for the `enrich`
15//! JUDGE so structured extraction no longer requires a locally installed
16//! `claude` / `codex` / `opencode` CLI subprocess.
17//!
18//! v1.1.00 (GAP-SG-70/72-chat): the OpenAI-compatible contract surfaces
19//! `choices[].finish_reason` and `usage.{prompt_tokens,completion_tokens}`.
20//! `finish_reason == "length"` means the response was truncated because
21//! `max_tokens` was too small — not a malformed generation.
22//! [`OpenRouterChatClient::complete`](crate::chat_api::OpenRouterChatClient::complete)
23//! now detects this BEFORE attempting JSON repair, grows `max_tokens` and
24//! re-issues the request (bounded by
25//! [`crate::constants::ENRICH_MAX_LENGTH_RETRIES`]), and always reports the
26//! diagnostics (`finish_reason`, token counts) to the caller via
27//! [`ChatCompletion`](crate::chat_api::ChatCompletion) on success or
28//! [`ChatError`](crate::chat_api::ChatError) on failure.
29
30use crate::errors::AppError;
31use crate::retry::AttemptOutcome;
32use secrecy::{ExposeSecret, SecretBox};
33use serde::{Deserialize, Serialize};
34use std::time::Duration;
35
36use crate::constants::DEFAULT_OPENROUTER_CHAT_URL;
// GAP-SG-17: raised from 300 to 600 — the per-request fallback budget when a
// caller passes `0`. Dense bodies near the model's ~32K-token context ceiling
// regularly need more than five minutes to generate.
/// Total per-request timeout, in seconds, substituted when the constructor
/// receives `timeout_secs == 0`.
const DEFAULT_TIMEOUT_SECS: u64 = 600;
/// Connect timeout, in seconds, handed to the reqwest client builder. It is
/// applied unconditionally; no constructor argument overrides it.
const DEFAULT_CONNECT_TIMEOUT_SECS: u64 = 10;

/// Fixed `json_schema` name sent in the `response_format`. OpenRouter only
/// requires a short identifier; the actual contract is carried by `schema`.
const SCHEMA_NAME: &str = "enrich_output";
46
/// JSON body of one chat-completions request. Built by `build_request` and
/// serialized onto the wire by the request builder's `.json(..)` call.
#[derive(Serialize)]
struct ChatRequest<'a> {
    /// Model identifier, borrowed from the owning client.
    model: &'a str,
    /// Conversation turns: always a `system` message, followed by a `user`
    /// message only when the caller supplied non-empty input text.
    messages: Vec<ChatMessage<'a>>,
    /// Structured-output contract (`json_schema` with the caller's schema).
    response_format: ResponseFormat,
    /// Provider routing preferences sent with every request.
    provider: ProviderPrefs,
    /// Reasoning toggle. Left out of the serialized JSON entirely when `None`
    /// (the mandatory-reasoning fallback shape).
    #[serde(skip_serializing_if = "Option::is_none")]
    reasoning: Option<ReasoningPrefs>,
    /// Completion-token budget. Left out of the serialized JSON when `None`,
    /// so the provider applies its own default.
    #[serde(skip_serializing_if = "Option::is_none")]
    max_tokens: Option<u32>,
}
58
/// One conversation turn inside [`ChatRequest::messages`].
#[derive(Serialize)]
struct ChatMessage<'a> {
    /// Speaker role; this module only ever sends `"system"` and `"user"`.
    role: &'a str,
    /// Message text, owned so the request does not borrow the caller's
    /// prompt/input strings.
    content: String,
}
64
/// `response_format` object of the request body.
#[derive(Serialize)]
struct ResponseFormat {
    /// Serialized under the JSON key `type`; this module always sends
    /// `"json_schema"`.
    #[serde(rename = "type")]
    format_type: &'static str,
    /// Name, strictness flag and schema the model output must follow.
    json_schema: JsonSchemaSpec,
}
71
/// `json_schema` payload nested inside [`ResponseFormat`].
#[derive(Serialize)]
struct JsonSchemaSpec {
    /// Short schema identifier; always [`SCHEMA_NAME`] in this module.
    name: &'static str,
    /// Strict-mode flag; `build_request` always sets it to `true`.
    strict: bool,
    /// The caller-supplied JSON Schema, already parsed from its string form.
    schema: serde_json::Value,
}
78
/// `provider` object of the request body.
#[derive(Serialize)]
struct ProviderPrefs {
    /// Always sent as `true` by `build_request`.
    // NOTE(review): presumably restricts routing to providers that support
    // every parameter in the request (e.g. `response_format`) — confirm
    // against the OpenRouter provider-routing documentation.
    require_parameters: bool,
}
83
/// `reasoning` object of the request body.
#[derive(Serialize)]
struct ReasoningPrefs {
    /// The primary attempt sends `false`; when a model rejects that, the
    /// fallback attempt omits the whole `reasoning` object instead of
    /// sending `true`.
    enabled: bool,
}
88
/// Decoded body of a 2xx chat-completions response. Every field is optional
/// on the wire (`#[serde(default)]`), so a body carrying only `error` still
/// decodes.
#[derive(Deserialize)]
struct ChatResponse {
    /// Generated choices; only the first one is read by this module. Empty
    /// when the provider sent none.
    #[serde(default)]
    choices: Vec<Choice>,
    /// Token/cost accounting, when the provider reported it.
    #[serde(default)]
    usage: Option<Usage>,
    /// Structured provider error. OpenRouter may return this inside an HTTP 200
    /// body (e.g. token/context-length overflow); without it the response would
    /// parse into empty `choices` and surface the misleading "no structured
    /// content" error instead of the real cause (GAP-SG-03).
    #[serde(default)]
    error: Option<crate::openrouter_http::ApiError>,
}
102
/// One entry of `choices[]` in the response body.
#[derive(Deserialize)]
struct Choice {
    /// The assistant message holding the generated content. Required: a
    /// choice without `message` fails to decode.
    message: RespMessage,
    /// Why the model stopped generating: `"stop"` on a normal completion,
    /// `"length"` when `max_tokens` cut the response short (GAP-SG-70/72-chat).
    /// Absent from providers that omit it, hence `#[serde(default)]`.
    #[serde(default)]
    finish_reason: Option<String>,
}
112
/// `choices[].message` in the response body.
#[derive(Deserialize)]
struct RespMessage {
    /// Raw model output text. Decodes to `None` when the field is absent;
    /// blank text is later rejected as "no structured content".
    #[serde(default)]
    content: Option<String>,
}
118
/// `usage` object of the response body. Every field is optional on the wire.
#[derive(Deserialize)]
struct Usage {
    /// Request cost as reported by the provider; surfaced as
    /// `ChatCompletion::cost_usd`, where a missing value becomes `0.0`.
    #[serde(default)]
    cost: Option<f64>,
    /// Prompt token count reported by OpenRouter (GAP-SG-72-chat). Diagnostic
    /// only — never used to gate control flow, so a missing value stays `None`.
    #[serde(default)]
    prompt_tokens: Option<u32>,
    /// Completion token count reported by OpenRouter (GAP-SG-72-chat), used
    /// alongside `finish_reason` to explain a truncated response.
    #[serde(default)]
    completion_tokens: Option<u32>,
}
132
/// Successful [`OpenRouterChatClient::complete`] result (GAP-SG-72-chat).
///
/// `finish_reason`, `prompt_tokens` and `completion_tokens` are the raw
/// diagnostics OpenRouter attached to the response that ultimately succeeded
/// (after any `max_tokens` growth retries — see [`Self::value`] and the
/// module docs). They are `None` only when the provider omitted them.
///
/// Note that `finish_reason` may still be `"length"` here: a truncated
/// response whose content nevertheless repairs into a JSON object is
/// returned as a success once growth retries are exhausted.
#[derive(Debug)]
pub struct ChatCompletion {
    /// Model output parsed as JSON (guaranteed to be a JSON object).
    pub value: serde_json::Value,
    /// Cost in USD read from `usage.cost`, or `0.0` when the provider omitted it.
    pub cost_usd: f64,
    /// `choices[0].finish_reason` from the response that produced `value`.
    pub finish_reason: Option<String>,
    /// `usage.prompt_tokens` from the response that produced `value`.
    pub prompt_tokens: Option<u32>,
    /// `usage.completion_tokens` from the response that produced `value`.
    pub completion_tokens: Option<u32>,
}
152
/// [`OpenRouterChatClient::complete`] failure (GAP-SG-72-chat / GAP-SG-72
/// reauditor addendum).
///
/// Wraps the underlying [`AppError`] with whatever truncation diagnostics were
/// available at the point of failure. `finish_reason`/token fields are `None`
/// when the error was raised before a response was post-processed: an invalid
/// schema, a network error or timeout, a permanent 4xx, exhausted retries, or
/// a structured provider `error` object inside a 2xx body. Only failures that
/// occur while turning a decoded `ChatResponse` into a value (missing
/// content, JSON-repair failure, or the object-shape guard) carry them.
///
/// `retry_class` is the retry verdict computed AT THE ORIGIN (the exact HTTP
/// status, or the provider's structured error `code`), never inferred
/// downstream from `source.to_string()`. The enrich queue consumes this field
/// directly instead of pattern-matching the formatted message.
#[derive(Debug)]
pub struct ChatError {
    /// Underlying cause, preserved via `source()` rather than restated.
    /// `Display` for this type forwards to it verbatim.
    pub source: AppError,
    /// `choices[0].finish_reason` from the response that led to this error,
    /// when one was decoded.
    pub finish_reason: Option<String>,
    /// `usage.prompt_tokens` from the response that led to this error, when
    /// one was decoded.
    pub prompt_tokens: Option<u32>,
    /// `usage.completion_tokens` from the response that led to this error,
    /// when one was decoded.
    pub completion_tokens: Option<u32>,
    /// Typed retry verdict computed where the failure originated (HTTP
    /// status / provider code), not by matching `source`'s message.
    pub retry_class: AttemptOutcome,
}
184
185impl ChatError {
186    /// Wraps `source` with no diagnostics attached (used when no
187    /// `ChatResponse` was decoded before the failure) and the `retry_class`
188    /// computed by the caller at the exact HTTP status / provider code.
189    fn new(source: AppError, retry_class: AttemptOutcome) -> Self {
190        Self {
191            source,
192            finish_reason: None,
193            prompt_tokens: None,
194            completion_tokens: None,
195            retry_class,
196        }
197    }
198
199    /// Wraps `source` with the diagnostics captured from a decoded
200    /// `ChatResponse` that nonetheless failed downstream (repair or
201    /// shape-guard), plus its `retry_class`.
202    fn with_diagnostics(
203        source: AppError,
204        finish_reason: Option<String>,
205        prompt_tokens: Option<u32>,
206        completion_tokens: Option<u32>,
207        retry_class: AttemptOutcome,
208    ) -> Self {
209        Self {
210            source,
211            finish_reason,
212            prompt_tokens,
213            completion_tokens,
214            retry_class,
215        }
216    }
217}
218
219impl std::fmt::Display for ChatError {
220    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
221        std::fmt::Display::fmt(&self.source, f)
222    }
223}
224
225impl std::error::Error for ChatError {
226    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
227        Some(&self.source)
228    }
229}
230
/// Process-wide OpenRouter chat client. Holds the model name so that callers
/// only thread the per-item prompt/schema/input through [`Self::complete`].
pub struct OpenRouterChatClient {
    /// HTTP client carrying the total/connect timeouts and the
    /// `sqlite-graphrag/<version>` user agent configured at construction.
    client: reqwest::Client,
    /// API key sent as the bearer token; only exposed while building the
    /// `Authorization` header of a request.
    api_key: SecretBox<String>,
    /// Model identifier placed in every request body.
    model: String,
    /// Endpoint each request is POSTed to. Resolved from XDG/config at
    /// construction (default: [`DEFAULT_OPENROUTER_CHAT_URL`]).
    base_url: String,
}
241
242impl OpenRouterChatClient {
243    /// Builds a chat client bound to `model`, applying `timeout_secs` as the
244    /// total per-request budget (wired from `--openrouter-timeout`). A value of
245    /// `0` falls back to `DEFAULT_TIMEOUT_SECS` so a missing or zero flag never
246    /// degrades into reqwest`'s immediate-timeout behaviour.
247    pub fn new(
248        api_key: SecretBox<String>,
249        model: String,
250        timeout_secs: u64,
251    ) -> Result<Self, AppError> {
252        let base_url = crate::runtime_config::openrouter_chat_url(DEFAULT_OPENROUTER_CHAT_URL);
253        Self::new_with_base_url(api_key, model, timeout_secs, base_url)
254    }
255
256    /// Build a client posting to an explicit `base_url` (XDG override, tests, gateways).
257    pub fn new_with_base_url(
258        api_key: SecretBox<String>,
259        model: String,
260        timeout_secs: u64,
261        base_url: String,
262    ) -> Result<Self, AppError> {
263        let timeout_secs = if timeout_secs == 0 {
264            DEFAULT_TIMEOUT_SECS
265        } else {
266            timeout_secs
267        };
268        let client = reqwest::Client::builder()
269            .timeout(Duration::from_secs(timeout_secs))
270            .connect_timeout(Duration::from_secs(DEFAULT_CONNECT_TIMEOUT_SECS))
271            .user_agent(concat!("sqlite-graphrag/", env!("CARGO_PKG_VERSION")))
272            .build()
273            .map_err(|e| {
274                AppError::Validation(crate::i18n::validation::http_client_build_failed(&e))
275            })?;
276
277        Ok(Self {
278            client,
279            api_key,
280            model,
281            base_url,
282        })
283    }
284
    /// Test-only constructor that POSTs to an arbitrary `base_url`.
    ///
    /// Pure delegation to [`Self::new_with_base_url`]; note the different
    /// argument order (`base_url` comes BEFORE `timeout_secs` here).
    #[cfg(test)]
    pub fn new_with_url(
        api_key: SecretBox<String>,
        model: String,
        base_url: String,
        timeout_secs: u64,
    ) -> Result<Self, AppError> {
        Self::new_with_base_url(api_key, model, timeout_secs, base_url)
    }
295
296    /// Returns the model bound to this client.
297    pub fn model(&self) -> &str {
298        &self.model
299    }
300
301    /// Runs a single structured-output completion, transparently growing
302    /// `max_tokens` and re-issuing the request when the model truncates its
303    /// output (GAP-SG-70).
304    ///
305    /// `schema_str` is the JSON Schema (as a string) the model must honour
306    /// under `strict: true`. When `input_text` is empty only the system
307    /// message is sent. `max_tokens` seeds the first attempt; `None` lets the
308    /// provider apply its own default.
309    ///
310    /// Returns [`ChatCompletion`] on success or [`ChatError`] on failure; both
311    /// carry `finish_reason`/token diagnostics when a response was decoded.
312    ///
313    /// # Errors
314    ///
315    /// Returns [`ChatError`] when: the schema is invalid JSON; the HTTP
316    /// request fails or exhausts retries; the provider returns a permanent
317    /// error (401/400/404, or a structured `error` object in a 2xx body); the
318    /// response carries no usable content; the content cannot be parsed as
319    /// JSON even after repair; the parsed JSON is not an object; or the
320    /// response is truncated (`finish_reason: "length"`) after
321    /// [`crate::constants::ENRICH_MAX_LENGTH_RETRIES`] `max_tokens` growth
322    /// attempts are exhausted.
323    pub async fn complete(
324        &self,
325        system_prompt: &str,
326        input_text: &str,
327        schema_str: &str,
328        max_tokens: Option<u32>,
329    ) -> Result<ChatCompletion, ChatError> {
330        // A malformed schema is a permanent caller/config error — classified
331        // explicitly (no blanket `From<AppError>` conversion exists for this
332        // type; every `ChatError` states its `retry_class` at construction).
333        let schema: serde_json::Value = serde_json::from_str(schema_str).map_err(|e| {
334            ChatError::new(
335                AppError::Validation(crate::i18n::validation::invalid_json_schema_for_request(&e)),
336                AttemptOutcome::HardFailure,
337            )
338        })?;
339
340        let mut current_max_tokens = max_tokens;
341
342        for length_attempt in 0..=crate::constants::ENRICH_MAX_LENGTH_RETRIES {
343            let response = self
344                .complete_one_attempt(&schema, system_prompt, input_text, current_max_tokens)
345                .await?;
346
347            let finish_reason = response
348                .choices
349                .first()
350                .and_then(|c| c.finish_reason.clone());
351            let prompt_tokens = response.usage.as_ref().and_then(|u| u.prompt_tokens);
352            let completion_tokens = response.usage.as_ref().and_then(|u| u.completion_tokens);
353
354            let truncated = finish_reason.as_deref() == Some("length");
355            let retries_left = length_attempt < crate::constants::ENRICH_MAX_LENGTH_RETRIES;
356
357            if truncated && retries_left {
358                let next_max_tokens = grow_max_tokens(current_max_tokens);
359                tracing::warn!(
360                    model = %self.model,
361                    attempt = length_attempt,
362                    previous_max_tokens = ?current_max_tokens,
363                    next_max_tokens,
364                    "OpenRouter completion truncated (finish_reason=length); \
365                     retrying with a larger max_tokens budget"
366                );
367                current_max_tokens = Some(next_max_tokens);
368                continue;
369            }
370
371            if truncated {
372                tracing::warn!(
373                    model = %self.model,
374                    max_length_retries = crate::constants::ENRICH_MAX_LENGTH_RETRIES,
375                    max_tokens = ?current_max_tokens,
376                    "OpenRouter completion still truncated after exhausting \
377                     max_tokens growth"
378                );
379            }
380
381            return self.finish_completion(
382                response,
383                finish_reason,
384                prompt_tokens,
385                completion_tokens,
386            );
387        }
388
389        unreachable!("loop always returns within ENRICH_MAX_LENGTH_RETRIES + 1 iterations")
390    }
391
392    /// Runs one HTTP attempt (including the mandatory-reasoning fallback) and
393    /// returns the decoded [`ChatResponse`] without inspecting `finish_reason`
394    /// or extracting content — that happens in [`Self::complete`] so the
395    /// `max_tokens` growth loop can re-issue the request first.
396    async fn complete_one_attempt(
397        &self,
398        schema: &serde_json::Value,
399        system_prompt: &str,
400        input_text: &str,
401        max_tokens: Option<u32>,
402    ) -> Result<ChatResponse, ChatError> {
403        // First attempt sends reasoning.enabled=false (token savings on the
404        // ~9 models that allow disabling). The ~4 reasoning-mandatory models
405        // (e.g. minimax-m2.7, gpt-oss-120b) reject it with HTTP 400 mentioning
406        // "reasoning"; on that specific failure we retry ONCE with the
407        // reasoning field omitted so the model uses its mandatory default. Any
408        // other error, or a second failure, propagates the original error.
409        let primary = self.build_request(
410            schema.clone(),
411            system_prompt,
412            input_text,
413            max_tokens,
414            Some(ReasoningPrefs { enabled: false }),
415        );
416        match self.execute_with_retry(&primary).await {
417            Ok(r) => Ok(r),
418            Err(first_err) => {
419                if reasoning_disable_rejected(&first_err) {
420                    tracing::warn!(
421                        model = %self.model,
422                        "model rejected reasoning.enabled=false (mandatory); \
423                         retrying once with reasoning omitted"
424                    );
425                    let fallback = self.build_request(
426                        schema.clone(),
427                        system_prompt,
428                        input_text,
429                        max_tokens,
430                        None,
431                    );
432                    match self.execute_with_retry(&fallback).await {
433                        Ok(r) => Ok(r),
434                        Err(_) => Err(first_err),
435                    }
436                } else {
437                    Err(first_err)
438                }
439            }
440        }
441    }
442
443    /// Extracts content, repairs/parses it as JSON, and enforces the
444    /// object-shape guard, attaching `finish_reason`/token diagnostics to any
445    /// failure.
446    ///
447    /// Every failure branch below (missing content, JSON-repair failure,
448    /// non-object shape) classifies as `AttemptOutcome::Transient`. This is a
449    /// deliberate, acknowledged tension with `rules_rust_retry_com_backoff.md`
450    /// ("NUNCA retentar erros de parsing ou deserialização" / "NUNCA retentar
451    /// erros de deserialização"): those rules target DETERMINISTIC parse
452    /// errors, where retrying the identical input reproduces the identical
453    /// failure. Here the "input" is `deepseek-v4-flash:nitro` sampling
454    /// variance — the SAME prompt can legitimately produce well-formed JSON
455    /// on the next generation (see GAP-SG-10). So this is a typed, bounded
456    /// hiccup, not a retry-forever loophole: it is capped by `--max-attempts`
457    /// (GAP-SG-09/GAP-SG-21) and dead-letters once attempts are exhausted.
458    fn finish_completion(
459        &self,
460        response: ChatResponse,
461        finish_reason: Option<String>,
462        prompt_tokens: Option<u32>,
463        completion_tokens: Option<u32>,
464    ) -> Result<ChatCompletion, ChatError> {
465        let content = response
466            .choices
467            .into_iter()
468            .next()
469            .and_then(|c| c.message.content)
470            .filter(|c| !c.trim().is_empty())
471            .ok_or_else(|| {
472                AppError::Validation(crate::i18n::validation::model_no_structured_content(
473                    &self.model,
474                ))
475            })
476            .map_err(|e| {
477                ChatError::with_diagnostics(
478                    e,
479                    finish_reason.clone(),
480                    prompt_tokens,
481                    completion_tokens,
482                    AttemptOutcome::Transient,
483                )
484            })?;
485
486        // GAP-SG-10: deepseek-v4-flash:nitro and similar models do not honour
487        // `json_schema` strict mode reliably — they wrap output in markdown
488        // fences, add trailing commas, or omit quotes around keys. Try a strict
489        // parse first (zero cost for well-formed JSON), then fall back to the
490        // repair pass (a Rust port of `json_repair`) before giving up.
491        let value = crate::json_repair::repair_to_value(&content).map_err(|e| {
492            ChatError::with_diagnostics(
493                AppError::Validation(crate::i18n::validation::model_json_parse_failed(
494                    &self.model,
495                    &e,
496                )),
497                finish_reason.clone(),
498                prompt_tokens,
499                completion_tokens,
500                AttemptOutcome::Transient,
501            )
502        })?;
503
504        // GAP-SG-10: `llm_json` coerces aggressively — free text becomes a JSON
505        // string, empty input becomes `{}`, a lone delimiter becomes `null`. The
506        // enrich JUDGE contract is ALWAYS a JSON object, so a non-object result
507        // here is a malformed/refused generation, NOT a usable value. Reject it
508        // (the enrich classifier reclassifies this as a transient model hiccup,
509        // GAP-SG-09) instead of letting a coerced scalar masquerade as a
510        // valid-but-empty result downstream.
511        if !value.is_object() {
512            return Err(ChatError::with_diagnostics(
513                AppError::Validation(crate::i18n::validation::model_non_object_json(
514                    &self.model,
515                    json_shape_name(&value),
516                )),
517                finish_reason,
518                prompt_tokens,
519                completion_tokens,
520                AttemptOutcome::Transient,
521            ));
522        }
523
524        let cost = response.usage.and_then(|u| u.cost).unwrap_or(0.0);
525
526        Ok(ChatCompletion {
527            value,
528            cost_usd: cost,
529            finish_reason,
530            prompt_tokens,
531            completion_tokens,
532        })
533    }
534
535    /// Builds a `ChatRequest` for one attempt. `reasoning` is `Some` on the
536    /// primary attempt (`enabled:false`) and `None` on the mandatory-reasoning
537    /// fallback, where the field is omitted entirely.
538    fn build_request<'a>(
539        &'a self,
540        schema: serde_json::Value,
541        system_prompt: &str,
542        input_text: &str,
543        max_tokens: Option<u32>,
544        reasoning: Option<ReasoningPrefs>,
545    ) -> ChatRequest<'a> {
546        let mut messages = Vec::with_capacity(2);
547        messages.push(ChatMessage {
548            role: "system",
549            content: system_prompt.to_string(),
550        });
551        if !input_text.is_empty() {
552            messages.push(ChatMessage {
553                role: "user",
554                content: input_text.to_string(),
555            });
556        }
557        ChatRequest {
558            model: &self.model,
559            messages,
560            response_format: ResponseFormat {
561                format_type: "json_schema",
562                json_schema: JsonSchemaSpec {
563                    name: SCHEMA_NAME,
564                    strict: true,
565                    schema,
566                },
567            },
568            provider: ProviderPrefs {
569                require_parameters: true,
570            },
571            reasoning,
572            max_tokens,
573        }
574    }
575
576    /// Runs the request/retry loop, classifying every failure into a
577    /// [`ChatError`] with `retry_class` set AT THE ORIGIN (the exact HTTP
578    /// status, or the provider's structured error code) — never inferred
579    /// downstream from a formatted message (reauditor addendum to
580    /// GAP-SG-72-chat).
581    async fn execute_with_retry(
582        &self,
583        request: &ChatRequest<'_>,
584    ) -> Result<ChatResponse, ChatError> {
585        let mut last_err: Option<ChatError> = None;
586
587        for attempt in 0..crate::openrouter_http::MAX_RETRIES {
588            let result = self
589                .client
590                .post(&self.base_url)
591                .header(
592                    "Authorization",
593                    format!("Bearer {}", self.api_key.expose_secret()),
594                )
595                .json(request)
596                .send()
597                .await;
598
599            let resp = match result {
600                Ok(r) => r,
601                Err(e) if e.is_timeout() => {
602                    return Err(ChatError::new(
603                        AppError::Validation(crate::i18n::validation::openrouter_chat_timed_out()),
604                        AttemptOutcome::Transient,
605                    ));
606                }
607                Err(e) => {
608                    last_err = Some(ChatError::new(
609                        AppError::Validation(crate::i18n::validation::http_request_failed(&e)),
610                        AttemptOutcome::Transient,
611                    ));
612                    crate::openrouter_http::backoff(attempt).await;
613                    continue;
614                }
615            };
616
617            let status = resp.status();
618
619            if status.is_success() {
620                let body = resp.text().await.map_err(|e| {
621                    ChatError::new(
622                        AppError::Validation(
623                            crate::i18n::validation::failed_to_read_response_body(&e),
624                        ),
625                        AttemptOutcome::Transient,
626                    )
627                })?;
628                match serde_json::from_str::<ChatResponse>(&body) {
629                    Ok(parsed) => {
630                        // A structured error object inside a 2xx body is
631                        // classified by its own `code` (GAP-SG-03 surfaces
632                        // the real code/message instead of letting empty
633                        // choices masquerade as no-structured-content).
634                        if let Some(api_err) = parsed.error {
635                            let retry_class =
636                                crate::openrouter_http::provider_error_retry_class(&api_err);
637                            return Err(ChatError::new(
638                                AppError::ProviderError {
639                                    code: api_err.code_string(),
640                                    message: api_err.message,
641                                },
642                                retry_class,
643                            ));
644                        }
645                        return Ok(parsed);
646                    }
647                    Err(e) => {
648                        tracing::warn!(
649                            attempt,
650                            body_len = body.len(),
651                            "HTTP 200 but parse failed (retrying): {e}"
652                        );
653                        last_err = Some(ChatError::new(
654                            AppError::Validation(
655                                crate::i18n::validation::failed_to_parse_chat_response(&e),
656                            ),
657                            AttemptOutcome::Transient,
658                        ));
659                        crate::openrouter_http::backoff(attempt).await;
660                        continue;
661                    }
662                }
663            }
664
665            if status.as_u16() == 401 {
666                return Err(ChatError::new(
667                    AppError::Validation(crate::i18n::validation::openrouter_invalid_api_key_401()),
668                    AttemptOutcome::HardFailure,
669                ));
670            }
671
672            if status.as_u16() == 400 || status.as_u16() == 404 {
673                let body = resp.text().await.unwrap_or_default();
674                return Err(ChatError::new(
675                    AppError::Validation(crate::i18n::validation::openrouter_status_error(
676                        &status,
677                        &self.model,
678                        &body,
679                    )),
680                    AttemptOutcome::HardFailure,
681                ));
682            }
683
684            if status.as_u16() == 429 {
685                let retry_after = resp
686                    .headers()
687                    .get("retry-after")
688                    .and_then(|v| v.to_str().ok())
689                    .and_then(|v| v.parse::<u64>().ok())
690                    .unwrap_or(2);
691                tracing::warn!(
692                    attempt,
693                    retry_after_secs = retry_after,
694                    "OpenRouter rate limited, waiting"
695                );
696                // GAP-SG-56: surface the Retry-After delay to the caller. If
697                // every attempt is rate limited, the loop exits with this
698                // RateLimited error (retryable) carrying the server-advised
699                // wait, instead of a generic max-retries-exceeded message.
700                last_err = Some(ChatError::new(
701                    AppError::RateLimited {
702                        detail: format!("OpenRouter HTTP 429 (retry-after {retry_after}s)"),
703                    },
704                    AttemptOutcome::Transient,
705                ));
706                tokio::time::sleep(Duration::from_secs(retry_after)).await;
707                continue;
708            }
709
710            if status.is_server_error() {
711                tracing::warn!(attempt, status = %status, "OpenRouter server error, retrying");
712                last_err = Some(ChatError::new(
713                    AppError::Validation(crate::i18n::validation::openrouter_server_error(&status)),
714                    AttemptOutcome::Transient,
715                ));
716                crate::openrouter_http::backoff(attempt).await;
717                continue;
718            }
719
720            let body = resp.text().await.unwrap_or_default();
721            return Err(ChatError::new(
722                AppError::Validation(crate::i18n::validation::unexpected_http_status(
723                    &status, &body,
724                )),
725                crate::openrouter_http::status_retry_class(status),
726            ));
727        }
728
729        // GAP-SG-72-chat addendum: exhausting every retry against a
730        // transient condition (429/5xx/timeout/network) is ITSELF transient
731        // — it is exactly the case the queue's `--max-attempts` backoff
732        // covers, and must never be reclassified as a permanent failure.
733        Err(last_err.unwrap_or_else(|| {
734            ChatError::new(
735                AppError::Validation(crate::i18n::validation::openrouter_chat_max_retries()),
736                AttemptOutcome::Transient,
737            )
738        }))
739    }
740}
741
742/// Grows `current` for the next `max_tokens` retry after a truncated
743/// (`finish_reason: "length"`) response (GAP-SG-70/71). When `current` is
744/// `None` the caller left the provider default in place, so growth starts
745/// from [`crate::constants::ENRICH_INITIAL_MAX_TOKENS`] instead of an unknown
746/// base. The result is always capped at
747/// [`crate::constants::ENRICH_MAX_TOKENS_CEILING`].
748fn grow_max_tokens(current: Option<u32>) -> u32 {
749    let base = current.unwrap_or(crate::constants::ENRICH_INITIAL_MAX_TOKENS);
750    base.saturating_mul(crate::constants::ENRICH_MAX_TOKENS_GROWTH_FACTOR)
751        .min(crate::constants::ENRICH_MAX_TOKENS_CEILING)
752}
753
754/// True when an error from `execute_with_retry` indicates the model rejected
755/// `reasoning.enabled=false` because reasoning is mandatory: an HTTP 400 whose
756/// body mentions "reasoning" (case-insensitive). Triggers the one-shot retry
757/// with the `reasoning` field omitted.
758///
759/// This IS a legitimate, narrowly-scoped substring check on the underlying
760/// `AppError`'s message — not a retry-classification decision (that lives in
761/// `ChatError.retry_class`, computed at the origin). It only decides whether
762/// to attempt the mandatory-reasoning fallback shape, an orthogonal concern.
763fn reasoning_disable_rejected(err: &ChatError) -> bool {
764    let msg = err.source.to_string().to_lowercase();
765    msg.contains("400") && msg.contains("reasoning")
766}
767
768/// Names the JSON shape of `value` for diagnostics (GAP-SG-10). Used when the
769/// repaired model output is not the object the enrich JUDGE contract requires.
770fn json_shape_name(value: &serde_json::Value) -> &'static str {
771    match value {
772        serde_json::Value::Null => "null",
773        serde_json::Value::Bool(_) => "boolean",
774        serde_json::Value::Number(_) => "number",
775        serde_json::Value::String(_) => "string",
776        serde_json::Value::Array(_) => "array",
777        serde_json::Value::Object(_) => "object",
778    }
779}
780#[cfg(test)]
781#[path = "chat_api_tests.rs"]
782mod tests;
sqlite_graphrag/chat_api.rs

sqlite_graphrag/
chat_api.rs