Skip to main content

sqlite_graphrag/llm/
exit_code_hints.rs

1//! GAP-005 (v1.0.82): Mapeamento de exit codes de subprocesso LLM para diagnósticos acionáveis.
2
3use std::collections::HashMap;
4use std::sync::OnceLock;
5
/// Process-wide table mapping well-known subprocess exit codes to actionable
/// hints.
///
/// The cell starts empty and is filled on the first call to
/// `exit_code_hints_map`; reading it directly with `EXIT_CODE_HINTS.get()`
/// before that first call yields `None`.
pub static EXIT_CODE_HINTS: OnceLock<HashMap<i32, &'static str>> = OnceLock::new();

/// Returns the exit-code hint table, building it on first use.
///
/// Codes above 128 follow the shell convention `128 + signal number`
/// (134 = SIGABRT, 137 = SIGKILL, 139 = SIGSEGV, 143 = SIGTERM).
fn exit_code_hints_map() -> &'static HashMap<i32, &'static str> {
    EXIT_CODE_HINTS.get_or_init(|| {
        HashMap::from([
            (1, "subprocesso retornou erro genérico; verificar logs em ~/.local/share/sqlite-graphrag/llm-backend.log"),
            (2, "uso incorreto do CLI do subprocesso; rever flags passadas"),
            // 101 is the exit status of a Rust program that panicked; it is
            // not a signal (SIGABRT surfaces as 134, listed below).
            (101, "exit 101 é o código de panic do runtime Rust; panic no código do subprocesso — reportar bug upstream com stderr preservado"),
            (126, "binary não executável; executar chmod +x no binário"),
            (127, "binary não encontrado no PATH; verificar which codex ou which claude"),
            (134, "SIGABRT; abort interno do subprocesso — reportar bug upstream"),
            (137, "SIGKILL do OOM killer ou externo; verificar dmesg | grep -i kill e reduzir --llm-parallelism"),
            (139, "SIGSEGV; reportar bug upstream com stderr preservado"),
            (143, "SIGTERM externo; hook PreToolUse ou timeout cascateou"),
        ])
    })
}
24
25/// Retorna diagnóstico acionável baseado no exit code.
26pub fn diagnose_exit_code(code: Option<i32>, signal: Option<i32>) -> String {
27    if let Some(sig) = signal {
28        return match sig {
29            2 => "SIGINT recebido; usuário cancelou operação".to_string(),
30            9 => "SIGKILL externo; OOM killer do kernel".to_string(),
31            15 => "SIGTERM externo; hook PreToolUse ou timeout cascateou".to_string(),
32            other => format!("signal Unix {other} não mapeado; consultar `kill -l`"),
33        };
34    }
35    let code = code.unwrap_or(-1);
36    exit_code_hints_map()
37        .get(&code)
38        .map(|s| s.to_string())
39        .unwrap_or_else(|| {
40            format!("exit code {code} desconhecido; consultar upstream docs do binary")
41        })
42}
43
#[cfg(test)]
mod tests {
    use super::*;

    /// Diagnoses a plain exit status, i.e. one with no signal involved.
    fn hint_for_code(code: i32) -> String {
        diagnose_exit_code(Some(code), None)
    }

    /// Exit code 137 must point the operator at the OOM killer.
    #[test]
    fn oom_killer_hint_contains_oom() {
        let diagnosis = hint_for_code(137);
        assert!(diagnosis.contains("OOM"), "expected OOM in: {diagnosis}");
    }

    /// Exit code 127 must mention the PATH lookup.
    #[test]
    fn not_found_hint_contains_path() {
        let diagnosis = hint_for_code(127);
        assert!(diagnosis.contains("PATH"), "expected PATH in: {diagnosis}");
    }

    /// A process killed by signal 15 is reported as SIGTERM.
    #[test]
    fn sigterm_signal_hint() {
        let diagnosis = diagnose_exit_code(None, Some(15));
        assert!(
            diagnosis.contains("SIGTERM"),
            "expected SIGTERM in: {diagnosis}"
        );
    }

    /// Codes missing from the table still echo the numeric code back.
    #[test]
    fn unknown_code_returns_generic() {
        let diagnosis = hint_for_code(42);
        assert!(diagnosis.contains("42"), "expected 42 in: {diagnosis}");
    }

    /// Guards against entries being silently added to or dropped from the table.
    #[test]
    fn nine_exit_codes_mapped() {
        let table = exit_code_hints_map();
        assert_eq!(table.len(), 9);
    }
}
77
78// =============================================================================
79// v1.0.82 (GAP-005): LlmBackendError — diagnostic error for LLM subprocess
80// failures with captured stderr/stdout tails and an actionable hint.
81// =============================================================================
82
/// Byte budget for the diagnostic excerpt captured from each subprocess
/// stream (stdout and stderr): 1 KiB per stream.
///
/// NOTE(review): the previous comment stated that this matches the limit
/// used by the `tracing::log` macros and keeps the JSON envelope under
/// 4 KiB; neither can be confirmed from this file alone.
pub const DIAG_TAIL_BYTES: usize = 1 << 10;
87
/// Typed failure of a single LLM subprocess invocation.
///
/// The variants separate the failure categories (crashed, never started,
/// timed out, nothing left to try) so callers can `match` on them rather
/// than parse a message string. `NonZeroExit` additionally carries the
/// binary, the exit status and excerpts of stdout/stderr so the failure can
/// be diagnosed without re-running the subprocess.
///
/// This type is separate from the legacy `AppError::Embedding(String)`
/// shape; its `Display` impl renders a single-line message intended for
/// consumers that still expect a plain string.
#[derive(Clone, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum LlmBackendError {
    /// The subprocess ran and finished with a failing status.
    NonZeroExit {
        /// Exit code of the process; `None` if it was killed by a signal.
        exit_code: Option<i32>,
        /// Unix signal that terminated the process (2 = SIGINT,
        /// 9 = SIGKILL, 15 = SIGTERM); `None` for a normal exit.
        signal: Option<i32>,
        /// Bounded excerpt of the subprocess stdout, UTF-8 lossy-decoded.
        /// NOTE(review): earlier docs call this the "last 1 KiB", while
        /// `truncate_tail` keeps the leading bytes; confirm how callers
        /// fill this field.
        stdout_tail: String,
        /// Bounded excerpt of the subprocess stderr, UTF-8 lossy-decoded.
        /// The same caveat as for `stdout_tail` applies.
        stderr_tail: String,
        /// Path of the spawned binary (e.g. `/usr/bin/codex`).
        binary: String,
        /// Actionable diagnostic for the exit status, telling the operator
        /// what to try next; presumably produced from [`EXIT_CODE_HINTS`].
        hint: String,
    },
    /// The subprocess never started (binary missing, not executable, or the
    /// OS refused to create the process). Kept apart from `NonZeroExit` so
    /// callers can tell "never started" from "started and crashed".
    SpawnFailed {
        /// Path of the binary that should have been spawned.
        binary: String,
        /// Message of the underlying `io::Error`
        /// (e.g. "No such file or directory").
        source: String,
    },
    /// The subprocess ran longer than the per-call timeout. The limit can
    /// be overridden through `SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS`; earlier
    /// docs state a default of 300 seconds.
    Timeout {
        /// Timeout that was in effect, in seconds.
        secs: u64,
        /// Path of the binary that was running when the timeout fired.
        binary: String,
    },
    /// Every backend in the fallback chain failed and nothing is left to
    /// try. Callers are expected to honour `--skip-embedding-on-failure`
    /// (recording a `pending_embeddings` row) rather than propagate this.
    NoBackendsAvailable,
}
145
146impl LlmBackendError {
147    /// Returns the human-readable diagnostic for this error.
148    pub fn hint(&self) -> String {
149        match self {
150            Self::NonZeroExit { hint, .. } => hint.clone(),
151            Self::SpawnFailed { binary, source } => {
152                format!(
153                    "spawn of '{binary}' failed: {source}; check that the binary exists, is executable, and required env vars (PATH, HOME, ...) are set"
154                )
155            }
156            Self::Timeout { secs, binary } => {
157                format!(
158                    "subprocess '{binary}' exceeded the {secs}s timeout; \
159                     override via SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS"
160                )
161            }
162            Self::NoBackendsAvailable => "no backends succeeded and no fallback was configured; \
163                 pass --llm-fallback=codex,claude or --skip-embedding-on-failure"
164                .to_string(),
165        }
166    }
167
168    /// Truncates the tail of a UTF-8 string to `max_bytes`, breaking
169    /// on a char boundary so the result is always valid UTF-8.
170    pub fn truncate_tail(raw: &[u8], max_bytes: usize) -> String {
171        if raw.len() <= max_bytes {
172            return String::from_utf8_lossy(raw).into_owned();
173        }
174        // Find the last char boundary at or before `max_bytes`.
175        // `[u8]::is_char_boundary` is only on `str`, not `[u8]`, so we
176        // hand-roll the boundary check: a UTF-8 continuation byte has
177        // its top 2 bits set to 10, while a boundary byte has 0xxxxxxx
178        // (ASCII) or 11xxxxxx (start of multi-byte).
179        let mut cut = max_bytes.min(raw.len());
180        while cut > 0 && (raw[cut] >= 0x80 && raw[cut] < 0xC0) {
181            cut -= 1;
182        }
183        let mut s = String::from_utf8_lossy(&raw[..cut]).into_owned();
184        s.push_str("...[truncated]");
185        s
186    }
187}
188
189impl std::fmt::Display for LlmBackendError {
190    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
191        match self {
192            Self::NonZeroExit {
193                exit_code,
194                signal,
195                stdout_tail,
196                stderr_tail,
197                binary,
198                ..
199            } => {
200                let code_repr = match (exit_code, signal) {
201                    (Some(c), _) => format!("exit {c}"),
202                    (None, Some(s)) => format!("signal {s}"),
203                    _ => "unknown status".to_string(),
204                };
205                write!(
206                    f,
207                    "{binary} subprocess failed: {code_repr}; \
208                     stdout_tail={stdout_tail:?}; stderr_tail={stderr_tail:?}"
209                )
210            }
211            Self::SpawnFailed { binary, source } => {
212                write!(f, "{binary} spawn failed: {source}")
213            }
214            Self::Timeout { secs, binary } => {
215                write!(f, "{binary} timed out after {secs}s")
216            }
217            Self::NoBackendsAvailable => {
218                write!(f, "no LLM backends available; fallback chain exhausted")
219            }
220        }
221    }
222}
223
224impl std::error::Error for LlmBackendError {}
225
226/// Converts an `LlmBackendError` to a legacy `AppError::Embedding(String)`
227/// so call sites that still return the old shape keep compiling during
228/// the migration window. Once all call sites are migrated to return
229/// `LlmBackendError` directly, this helper can be deleted.
230pub fn into_legacy_embedding(err: &LlmBackendError) -> crate::errors::AppError {
231    crate::errors::AppError::Embedding(err.to_string())
232}
233
#[cfg(test)]
mod llm_backend_error_tests {
    use super::*;

    /// Marker appended by `truncate_tail` whenever it drops bytes.
    const MARKER: &str = "...[truncated]";

    /// Input that fits the budget comes back untouched, without a marker.
    #[test]
    fn truncate_tail_short_returns_input() {
        assert_eq!(LlmBackendError::truncate_tail(b"hello", 1024), "hello");
    }

    /// Oversized ASCII input keeps its first `max_bytes` bytes and gains the marker.
    #[test]
    fn truncate_tail_long_appends_marker() {
        let oversized = [b'a'; 2048];
        let shortened = LlmBackendError::truncate_tail(&oversized, 1024);
        assert!(shortened.ends_with(MARKER));
        // Everything before the marker is the original leading bytes.
        let expected_prefix = "a".repeat(1024);
        assert!(shortened.starts_with(&expected_prefix));
    }

    /// A cut that would land inside a multi-byte character backs off to the
    /// previous boundary: 600 'é' are 1200 bytes at 2 bytes each, so a budget
    /// of 1023 (odd, mid-character) must shrink to 1022.
    #[test]
    fn truncate_tail_respects_utf8_boundary() {
        let accented = "é".repeat(600).into_bytes();
        let shortened = LlmBackendError::truncate_tail(&accented, 1023);
        assert_eq!(shortened.len(), 1022 + MARKER.len());
        assert!(shortened.ends_with(MARKER));
        // Whatever precedes the marker must itself decode as UTF-8.
        let kept_len = shortened.trim_end_matches(MARKER).len();
        let kept = &shortened[..kept_len];
        assert!(std::str::from_utf8(kept.as_bytes()).is_ok());
    }

    /// The exhausted-chain hint tells the operator about `--llm-fallback`.
    #[test]
    fn no_backends_hint_mentions_fallback() {
        let hint = LlmBackendError::NoBackendsAvailable.hint();
        assert!(hint.contains("--llm-fallback"));
    }

    /// The spawn-failure hint names both the binary and the OS error text.
    #[test]
    fn spawn_failed_hint_mentions_binary() {
        let hint = LlmBackendError::SpawnFailed {
            binary: "claude".into(),
            source: "No such file or directory".into(),
        }
        .hint();
        assert!(hint.contains("claude"));
        assert!(hint.contains("No such file or directory"));
    }

    /// The timeout hint names the environment variable that overrides it.
    #[test]
    fn timeout_hint_mentions_env_var() {
        let hint = LlmBackendError::Timeout {
            secs: 300,
            binary: "codex".into(),
        }
        .hint();
        assert!(hint.contains("SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS"));
    }

    /// The `Display` rendering of a crash shows the binary, the exit code and the stderr excerpt.
    #[test]
    fn non_zero_exit_display_includes_stderr_tail() {
        let rendered = LlmBackendError::NonZeroExit {
            exit_code: Some(1),
            signal: None,
            stdout_tail: "out-1k".into(),
            stderr_tail: "err-1k".into(),
            binary: "codex".into(),
            hint: "diagnostic".into(),
        }
        .to_string();
        for needle in ["codex", "exit 1", "err-1k"] {
            assert!(rendered.contains(needle));
        }
    }
}
307        assert!(s.contains("codex"));
308        assert!(s.contains("exit 1"));
309        assert!(s.contains("err-1k"));
310    }
311}