sqlite_graphrag/commands/dry_run_backend.rs
1//! v1.0.84 (ADR-0042 / GAP-002): resolve and emit the LLM backend that
2//! WOULD be invoked for embedding without actually spawning the
3//! subprocess. Used by `--dry-run-backend` for CI audit and pre-flight
4//! sanity-check of `--llm-backend` before long ingestion sessions.
5//!
6//! The output is a compact JSON envelope on stdout. stderr carries the
7//! human-friendly summary so operators can run `sqlite-graphrag --dry-run-backend`
8//! without piping through `jaq`.
9//!
10//! ## Schema (`dry-run-backend.schema.json`)
11//!
12//! ```json
13//! {
14//! "action": "dry_run_backend",
15//! "backend": "codex|claude|none",
16//! "binary": "/usr/local/bin/codex",
17//! "model": "gpt-5.5",
18//! "flavour": "codex|claude|none",
19//! "chain": "claude",
20//! "strict_env_clear": false
21//! }
22//! ```
23//!
24//! ## Implementation notes
25//!
26//! - We deliberately do NOT depend on the private fields of
27//! `LlmEmbedding`. The struct's `binary` and `flavour` fields are
28//! private to `crate::extract::llm_embedding`, so we re-probe the
29//! PATH here (cheap, idempotent) instead of forcing the core to add
30//! `pub(crate)` getters just for this audit path.
31//! - `model` comes from `LlmEmbedding::model_label()` which already
32//! exposes a stable public string of the form `<flavour>:<model>`.
33//! We strip the `<flavour>:` prefix to keep the schema flat.
34//! - When `--llm-backend none` is selected the envelope still emits
35//! the same shape with empty `binary` and `model`, so downstream
36//! pipelines can parse a single schema unconditionally.
37
38use crate::cli::{Cli, LlmBackendChoice};
39use crate::errors::AppError;
40use crate::extract::llm_embedding::LlmEmbedding;
41use crate::output::emit_json_compact;
42use crate::spawn::env_whitelist::is_strict_env_clear;
43use serde::Serialize;
44
45/// Compact JSON envelope emitted by `--dry-run-backend`.
46///
47/// Field order matches the documented schema. `chain` reflects
48/// `--llm-fallback` so operators can audit the fallback order without
49/// spawning `embedder::embed_with_fallback`.
50#[derive(Serialize)]
51pub struct DryRunBackendOutput {
52 pub action: &'static str,
53 pub backend: &'static str,
54 pub binary: String,
55 pub model: String,
56 pub flavour: &'static str,
57 pub chain: String,
58 pub strict_env_clear: bool,
59}
60
61/// Resolve the LLM backend that would be used for embedding and emit
62/// the JSON envelope. Returns `Err(AppError::Embedding)` when the
63/// requested backend CLI is missing from PATH.
64pub fn emit_dry_run_backend(cli: &Cli) -> Result<(), AppError> {
65 let payload = match cli.llm_backend {
66 LlmBackendChoice::None => DryRunBackendOutput {
67 action: "dry_run_backend",
68 backend: "none",
69 binary: String::new(),
70 model: String::new(),
71 flavour: "none",
72 chain: cli.llm_fallback.clone(),
73 strict_env_clear: is_strict_env_clear(),
74 },
75 LlmBackendChoice::Auto => {
76 // ADR-0038: codex is preferred; claude is the fallback when codex
77 // is absent. Mirrors `LlmEmbedding::detect_available()` exactly
78 // so the audit output never disagrees with the real spawn path.
79 let resolved = LlmEmbedding::detect_available()?;
80 backend_payload(&resolved, "codex-first-then-claude", cli, true)
81 }
82 LlmBackendChoice::Codex => {
83 let resolved = LlmEmbedding::detect_available()?;
84 let flavour = resolved.model_label();
85 // Guard: the user explicitly asked for codex. If detect_available
86 // returned a claude-backed client (no codex on PATH), we MUST
87 // surface that as an error rather than silently substitute.
88 // v1.0.84 (ADR-0042): claude must NOT silently replace codex
89 // when the user opts in via `--llm-backend codex`.
90 if flavour.starts_with("claude:") {
91 return Err(AppError::Embedding(
92 "`--llm-backend codex` requested but `codex` was not found on PATH \
93 (a `claude` binary was detected; refusing silent fallback per ADR-0042). \
94 Install `codex` (>= 0.130) or pass `--llm-backend claude` explicitly."
95 .to_string(),
96 ));
97 }
98 backend_payload(&resolved, "codex-explicit", cli, false)
99 }
100 LlmBackendChoice::Claude => {
101 let resolved = LlmEmbedding::detect_available()?;
102 let flavour = resolved.model_label();
103 // Symmetric guard for `--llm-backend claude`.
104 if flavour.starts_with("codex:") {
105 return Err(AppError::Embedding(
106 "`--llm-backend claude` requested but `claude` was not found on PATH \
107 (a `codex` binary was detected; refusing silent fallback per ADR-0042). \
108 Install `claude` (Claude Code >= 2.1) or pass `--llm-backend codex` explicitly."
109 .to_string(),
110 ));
111 }
112 backend_payload(&resolved, "claude-explicit", cli, false)
113 }
114 };
115
116 emit_json_compact(&payload)?;
117 Ok(())
118}
119
120/// Build the envelope from a successfully-resolved `LlmEmbedding`.
121///
122/// `chain_label` documents which CLI knob produced this payload
123/// (e.g. `codex-explicit` vs `codex-first-then-claude`) so the audit
124/// output is self-describing.
125fn backend_payload(
126 resolved: &LlmEmbedding,
127 chain_label: &str,
128 cli: &Cli,
129 is_auto: bool,
130) -> DryRunBackendOutput {
131 // `model_label()` returns `<flavour>:<model>` — split on the FIRST
132 // colon so model names with colons (rare but possible) survive.
133 // `flavour` must be a `&'static str` (the struct field type), so we
134 // leak the slice into a `Box<str>` to obtain a `'static` reference.
135 let label = resolved.model_label();
136 let (flavour, model) = match label.split_once(':') {
137 Some((f, m)) => (f, m.to_string()),
138 None => ("unknown", label.to_string()),
139 };
140 let flavour: &'static str = Box::leak(flavour.to_string().into_boxed_str());
141
142 // Re-probe PATH to surface the binary path the audit envelope
143 // promises. We prefer `which::which` over the private `LlmEmbedding`
144 // field so this file compiles independently of the `extract`
145 // module's internal layout. The result is canonicalized when
146 // possible so symlinks and shim wrappers don't leak location.
147 let binary = which::which(if is_auto {
148 // For Auto, prefer whichever the real spawn would pick first.
149 if which::which("codex").is_ok() {
150 "codex"
151 } else {
152 "claude"
153 }
154 } else {
155 flavour
156 })
157 .ok()
158 .and_then(|p| std::fs::canonicalize(&p).ok().or(Some(p)))
159 .map(|p| p.display().to_string())
160 .unwrap_or_default();
161
162 // Backend string is the `LlmBackendChoice` name for clarity in CI
163 // logs (operators filter on `backend == "codex"` etc.).
164 let backend = match cli.llm_backend {
165 LlmBackendChoice::Auto => {
166 if flavour == "codex" {
167 "codex"
168 } else {
169 "claude"
170 }
171 }
172 LlmBackendChoice::Codex => "codex",
173 LlmBackendChoice::Claude => "claude",
174 LlmBackendChoice::None => "none",
175 };
176
177 DryRunBackendOutput {
178 action: "dry_run_backend",
179 backend,
180 binary,
181 model,
182 flavour,
183 chain: if chain_label == "codex-first-then-claude" {
184 cli.llm_fallback.clone()
185 } else {
186 chain_label.to_string()
187 },
188 strict_env_clear: is_strict_env_clear(),
189 }
190}