sqlite_graphrag/extract/
llm_embedding.rs

1//! LLM-based embedding backend (v1.0.76 default; reworked in v1.0.79 G42).
2//!
3//! `LlmEmbedding` is the production embedding client. It wraps headless
4//! invocations of `claude code` or `codex` and returns f32 vectors of the
5//! active dimensionality (`crate::constants::embedding_dim()`, default 64).
6//!
7//! v1.0.79 (G42) changes:
8//! - S1: the dimensionality is no longer hardcoded here — the single
9//!   source of truth lives in `crate::constants` and the JSON schemas
10//!   are generated dynamically.
11//! - S2: `embed_batch` embeds N numbered texts per LLM call with the
12//!   `{items:[{i,v}]}` schema, collapsing 39 subprocess spawns into 4-5.
13//! - S4: the codex `--output-schema` file is a `tempfile::NamedTempFile`
14//!   with a randomised name created once per client and shared across
15//!   clones via `Arc` — no per-call write+delete, no PID-path races.
16//! - S5: the claude model honours `SQLITE_GRAPHRAG_CLAUDE_EMBED_MODEL`
17//!   (symmetric to the codex env var). ZERO hardcoded models without
18//!   an env override.
19//! - S6: `CLAUDE_CONFIG_DIR` points at an empty managed directory BY
20//!   DEFAULT, because `--strict-mcp-config`/`--mcp-config '{}'` are
21//!   silently ignored upstream (anthropics/claude-code#10787) and a
22//!   full `~/.claude` costs ~223k cache-creation tokens per call.
23//! - S7: the codex `request_user_input` failure mode maps to an
24//!   actionable error instead of an opaque exit 11.
25//! - BLOCO 4: every subprocess uses `kill_on_drop(true)` plus an
26//!   explicit `tokio::time::timeout`, so cancellation never leaks a
27//!   child and a hung LLM cannot stall the pipeline forever.
28//!
29//! OAuth is the only supported credential path. The constructor rejects
30//! `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` in the environment — see
31//! `v1.0.69 (G31) OAuth-Only Enforcement`.
32
33use crate::errors::AppError;
34use serde::Deserialize;
35use std::process::Stdio;
36use std::sync::Arc;
37use tokio::io::AsyncWriteExt;
38use tokio::process::Command;
39
/// Fallback per-LLM-call timeout, in seconds, used when the
/// `SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS` override is absent or invalid.
/// Kept in line with the `--claude-timeout` / `--codex-timeout`
/// defaults used by ingest.
const DEFAULT_EMBED_TIMEOUT_SECS: u64 = 60;

/// Resolves the per-call timeout.
///
/// Reads `SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS`; the value is honoured only
/// when it parses as an integer in `10..=3600` seconds. Anything else
/// (unset, non-numeric, out of range) yields the default.
fn embed_timeout() -> std::time::Duration {
    let configured = match std::env::var("SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS") {
        Ok(raw) => raw.parse::<u64>().ok(),
        Err(_) => None,
    };
    let secs = match configured {
        Some(n) if (10..=3_600).contains(&n) => n,
        _ => DEFAULT_EMBED_TIMEOUT_SECS,
    };
    std::time::Duration::from_secs(secs)
}
53
54/// v1.0.89 (GAP-4): scales the per-call timeout with batch size.
55/// A single-item batch uses the base timeout (60s default).
56/// Each additional item adds 15s to account for the LLM generating
57/// more embedding vectors in the same call.
58fn embed_timeout_for_batch(batch_size: usize) -> std::time::Duration {
59    let base = embed_timeout();
60    let extra = std::time::Duration::from_secs(15) * batch_size.saturating_sub(1) as u32;
61    base + extra
62}
63
/// G42/S1: JSON schema for a single-vector response, generated from the
/// active dimensionality.
///
/// The schema describes an object with exactly one required property,
/// `embedding`, an array of numbers whose length is pinned to `dim`.
fn build_single_schema(dim: usize) -> String {
    // Build the inner array schema first, then embed it in the envelope.
    let vector = format!(
        r#"{{"type":"array","items":{{"type":"number"}},"minItems":{dim},"maxItems":{dim}}}"#
    );
    format!(
        r#"{{"type":"object","properties":{{"embedding":{vector}}},"required":["embedding"],"additionalProperties":false}}"#
    )
}
70
/// G42/S2: JSON schema for the batch response `{items:[{i,v}]}`.
///
/// Each item carries an integer `i` and a `dim`-long number array `v`.
/// The length of the outer `items` array is deliberately left
/// unconstrained so ONE schema serves every batch size; index coverage
/// is validated in Rust after parsing.
fn build_batch_schema(dim: usize) -> String {
    // Compose from the innermost schema outwards.
    let vector = format!(
        r#"{{"type":"array","items":{{"type":"number"}},"minItems":{dim},"maxItems":{dim}}}"#
    );
    let item = format!(
        r#"{{"type":"object","properties":{{"i":{{"type":"integer"}},"v":{vector}}},"required":["i","v"],"additionalProperties":false}}"#
    );
    format!(
        r#"{{"type":"object","properties":{{"items":{{"type":"array","items":{item}}}}},"required":["items"],"additionalProperties":false}}"#
    )
}
79
/// Embedding client that spawns a headless LLM CLI (`claude` or `codex`,
/// selected by `flavour`) and parses f32 vectors from its output.
///
/// The schema-file cache sits behind an `Arc`, so clones of a client
/// share the same cached codex schema tempfiles.
#[derive(Clone, Debug)]
pub struct LlmEmbedding {
    /// Which LLM headless binary to spawn. `claude` or `codex`.
    flavour: EmbeddingFlavour,
    /// Cached path to the binary to avoid PATH lookups on every call.
    binary: std::path::PathBuf,
    /// Model name. Resolved from env overrides at construction time.
    model: String,
    /// G42/S4: lazily-created codex `--output-schema` tempfiles, shared
    /// across clones. Keyed by dim so an env change between tests cannot
    /// serve a stale schema.
    codex_schemas: Arc<parking_lot::Mutex<CodexSchemaFiles>>,
}
93
/// Cache of the codex `--output-schema` tempfiles, one slot per request
/// mode. Each slot remembers the dimensionality the file was generated
/// for, so a lookup with a different dim regenerates the file.
#[derive(Debug, Default)]
struct CodexSchemaFiles {
    /// Schema file for single-vector requests: `(dim, file)`.
    single: Option<(usize, Arc<tempfile::NamedTempFile>)>,
    /// Schema file for batch requests: `(dim, file)`.
    batch: Option<(usize, Arc<tempfile::NamedTempFile>)>,
}
99
/// Which headless LLM CLI backs an embedding client.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize)]
pub enum EmbeddingFlavour {
    /// The `claude` CLI.
    Claude,
    /// The `codex` CLI.
    Codex,
}
105
/// ADR-0042 / GAP-002: builder for [`LlmEmbedding`] that lets callers
/// override the binary path and model without having to remember the
/// env-var names per flavour. Replaces the duplicated `with_codex` /
/// `with_claude` bodies that diverged in v1.0.82 (GAP-002: the Claude
/// arm of `embed_via_backend` re-did the PATH probe via
/// `LlmEmbedding::detect_available` and could silently pick `codex`).
#[derive(Clone, Debug)]
pub struct LlmEmbeddingBuilder {
    /// Backend the built client will spawn.
    flavour: EmbeddingFlavour,
    /// Explicit binary path; when `None`, `build` falls back to the
    /// per-flavour env var and then a PATH lookup.
    binary_override: Option<std::path::PathBuf>,
    /// Explicit model name; when `None`, `build` uses the per-flavour
    /// model resolution.
    model_override: Option<String>,
}
118
119impl LlmEmbeddingBuilder {
120    /// Convenience: produce a Claude-backed builder pre-configured with
121    /// the canonical default binary + model.
122    /// Convenience: produce a Claude-backed builder pre-configured with
123    /// the canonical default binary + model.
124    pub fn claude_default() -> Self {
125        Self {
126            flavour: EmbeddingFlavour::Claude,
127            binary_override: None,
128            model_override: None,
129        }
130    }
131
132    /// Convenience: produce a Codex-backed builder pre-configured with
133    /// the canonical default binary + model.
134    pub fn codex_default() -> Self {
135        Self {
136            flavour: EmbeddingFlavour::Codex,
137            binary_override: None,
138            model_override: None,
139        }
140    }
141    /// Override the binary path (skips the `which::which` PATH probe).
142    pub fn override_binary(mut self, binary: std::path::PathBuf) -> Self {
143        self.binary_override = Some(binary);
144        self
145    }
146
147    /// Override the model name (skips the env-var lookup).
148    pub fn override_model(mut self, model: String) -> Self {
149        self.model_override = Some(model);
150        self
151    }
152
153    /// Build the [`LlmEmbedding`]. Enforces OAuth-only and resolves the
154    /// binary/model via the override or the env-var defaults.
155    pub fn build(self) -> Result<LlmEmbedding, AppError> {
156        LlmEmbedding::oauth_only_enforce()?;
157        let binary = match self.binary_override {
158            Some(path) => resolve_real_binary(&path),
159            None => {
160                let (env_var, which_name) = match self.flavour {
161                    EmbeddingFlavour::Codex => ("SQLITE_GRAPHRAG_CODEX_BINARY", "codex"),
162                    EmbeddingFlavour::Claude => ("SQLITE_GRAPHRAG_CLAUDE_BINARY", "claude"),
163                };
164                let path = std::env::var_os(env_var)
165                    .map(std::path::PathBuf::from)
166                    .or_else(|| which::which(which_name).ok())
167                    .ok_or_else(|| {
168                        AppError::Embedding(format!("`{which_name}` not found on PATH"))
169                    })?;
170                resolve_real_binary(&path)
171            }
172        };
173        let model = match self.model_override {
174            Some(m) => m,
175            None => match self.flavour {
176                EmbeddingFlavour::Codex => codex_embed_model(),
177                EmbeddingFlavour::Claude => claude_embed_model(),
178            },
179        };
180        Ok(LlmEmbedding {
181            flavour: self.flavour,
182            binary,
183            model,
184            codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
185        })
186    }
187}
188
189impl EmbeddingFlavour {
190    pub fn as_str(self) -> &'static str {
191        match self {
192            Self::Claude => "claude",
193            Self::Codex => "codex",
194        }
195    }
196}
197
/// Deserialized single-vector response: `{"embedding": [...]}`.
#[derive(Debug, Deserialize)]
struct EmbeddingResponse {
    /// The returned vector; its length is checked against the active
    /// dim by the caller after parsing.
    embedding: Vec<f32>,
}
202
/// Deserialized batch response: `{"items": [{"i": .., "v": [..]}, ..]}`.
#[derive(Debug, Deserialize)]
struct BatchEmbeddingResponse {
    /// One entry per embedded text, in whatever order the LLM returned.
    items: Vec<BatchEmbeddingItem>,
}
207
/// One entry of a batch response.
#[derive(Debug, Deserialize)]
struct BatchEmbeddingItem {
    /// 1-based position of the text within the batch prompt.
    i: usize,
    /// The embedding vector for that text.
    v: Vec<f32>,
}
213
214/// Follows symlinks and shell-script shim `exec` targets to find
215/// the real ELF binary. Shim wrappers (like `~/.graphrag-shim/codex`)
216/// can strip hardening flags; bypassing them is a security requirement.
217pub fn resolve_real_binary(path: &std::path::Path) -> std::path::PathBuf {
218    if let Ok(canonical) = std::fs::canonicalize(path) {
219        if is_elf_binary(&canonical) {
220            return canonical;
221        }
222        if let Some(exec_target) = extract_exec_target_from_shim(&canonical) {
223            if exec_target.exists() && is_elf_binary(&exec_target) {
224                return exec_target;
225            }
226        }
227        return canonical;
228    }
229    path.to_path_buf()
230}
231
/// Returns `true` when the file at `path` starts with the 4-byte ELF
/// magic (`0x7f 'E' 'L' 'F'`).
///
/// Only the first four bytes are read, so probing a large CLI binary
/// does not load the whole file into memory. Any I/O failure (missing
/// file, unreadable file, file shorter than four bytes) yields `false`.
fn is_elf_binary(path: &std::path::Path) -> bool {
    use std::io::Read;

    const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F'];
    let mut header = [0u8; 4];
    match std::fs::File::open(path) {
        Ok(mut file) => file.read_exact(&mut header).is_ok() && header == ELF_MAGIC,
        Err(_) => false,
    }
}
237
/// Extracts the program a shell shim hands off to.
///
/// The file must be readable as UTF-8 and start with a `#!` shebang;
/// otherwise `None` is returned. The LAST line that (after trimming)
/// starts with `exec ` wins, and its first word is returned as a path.
///
/// A target wrapped in single or double quotes — the usual
/// `exec "/path/to/bin" "$@"` form — is returned without the quotes, and
/// whitespace inside the quotes is kept. Without this the quoted path
/// never matches a file on disk and the shim is not bypassed. An
/// unterminated quote falls back to the plain first-word behaviour.
///
/// No shell expansion is attempted: a target such as `$REAL_BIN` is
/// returned verbatim.
fn extract_exec_target_from_shim(path: &std::path::Path) -> Option<std::path::PathBuf> {
    let content = std::fs::read_to_string(path).ok()?;
    if !content.starts_with("#!") {
        return None;
    }
    let command = content
        .lines()
        .rev()
        .find_map(|line| line.trim().strip_prefix("exec "))?
        .trim_start();
    let target = match command.chars().next() {
        Some(quote @ ('"' | '\'')) => {
            // Quote characters are one byte, so slicing at 1 is safe.
            let unquoted = &command[1..];
            match unquoted.find(quote) {
                Some(end) => &unquoted[..end],
                None => command.split_whitespace().next()?,
            }
        }
        _ => command.split_whitespace().next()?,
    };
    Some(std::path::PathBuf::from(target))
}
253
254/// G42/S5: claude embedding model with env override, symmetric to the
255/// codex `SQLITE_GRAPHRAG_CODEX_EMBED_MODEL` introduced in v1.0.78.
256fn claude_embed_model() -> String {
257    // Precedence: SQLITE_GRAPHRAG_CLAUDE_EMBED_MODEL > SQLITE_GRAPHRAG_LLM_MODEL > default
258    std::env::var("SQLITE_GRAPHRAG_CLAUDE_EMBED_MODEL")
259        .or_else(|_| std::env::var("SQLITE_GRAPHRAG_LLM_MODEL"))
260        .unwrap_or_else(|_| {
261            tracing::info!(
262                target: "llm_embedding",
263                "no model specified; defaulting to claude-sonnet-4-6"
264            );
265            "claude-sonnet-4-6".to_string()
266        })
267}
268
269fn codex_embed_model() -> String {
270    // Precedence: SQLITE_GRAPHRAG_CODEX_EMBED_MODEL > SQLITE_GRAPHRAG_LLM_MODEL > default
271    std::env::var("SQLITE_GRAPHRAG_CODEX_EMBED_MODEL")
272        .or_else(|_| std::env::var("SQLITE_GRAPHRAG_LLM_MODEL"))
273        .unwrap_or_else(|_| {
274            tracing::info!(
275                target: "llm_embedding",
276                "no model specified; defaulting to gpt-5.5"
277            );
278            "gpt-5.5".to_string()
279        })
280}
281
282impl LlmEmbedding {
283    /// Detects which LLM CLI is available on PATH and returns the
284    /// matching embedding client.
285    ///
286    /// v1.0.76: PREFERS `codex` over `claude` because:
287    /// - Claude Code 2.1+ ships a 180k+ token system context (plugins,
288    ///   skills, agents, MCP) that overflows the 200k context window
289    ///   for even trivial embedding prompts and returns "Prompt is too
290    ///   long". (v1.0.79/S6 mitigates this with an empty
291    ///   `CLAUDE_CONFIG_DIR`, but codex stays the lighter default.)
292    /// - Codex 0.134+ is lightweight (~5k system context) and the
293    ///   `StructuredOutput` tool reliably returns the requested vectors.
294    pub fn detect_available() -> Result<Self, AppError> {
295        Self::oauth_only_enforce()?;
296
297        // v1.0.89 (GAP-1): honour SQLITE_GRAPHRAG_CODEX_BINARY for the
298        // embedding pipeline, symmetric with SQLITE_GRAPHRAG_CLAUDE_BINARY.
299        let codex_path = std::env::var_os("SQLITE_GRAPHRAG_CODEX_BINARY")
300            .map(std::path::PathBuf::from)
301            .or_else(|| which::which("codex").ok());
302        if let Some(path) = codex_path {
303            return Ok(Self {
304                flavour: EmbeddingFlavour::Codex,
305                binary: resolve_real_binary(&path),
306                model: codex_embed_model(),
307                codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
308            });
309        }
310        // v1.0.89: honour SQLITE_GRAPHRAG_CLAUDE_BINARY for the embedding
311        // pipeline, not just ingest/enrich. This lets operators override the
312        // symlink-resolved path (e.g. a stale multi-instance binary).
313        let claude_path = std::env::var_os("SQLITE_GRAPHRAG_CLAUDE_BINARY")
314            .map(std::path::PathBuf::from)
315            .or_else(|| which::which("claude").ok());
316        if let Some(path) = claude_path {
317            return Ok(Self {
318                flavour: EmbeddingFlavour::Claude,
319                binary: resolve_real_binary(&path),
320                model: claude_embed_model(),
321                codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
322            });
323        }
324        Err(AppError::Embedding(
325            "no LLM CLI found on PATH: install `codex` (0.130+) or `claude` (Claude Code 2.1+)"
326                .to_string(),
327        ))
328    }
329
330    pub fn with_codex() -> Result<Self, AppError> {
331        Self::with_codex_builder().build()
332    }
333
334    pub fn with_claude() -> Result<Self, AppError> {
335        Self::with_claude_builder().build()
336    }
337
338    /// ADR-0042 / GAP-002: builder entry point for a codex-backed
339    /// embedder with default model resolution.
340    pub fn with_codex_builder() -> LlmEmbeddingBuilder {
341        LlmEmbeddingBuilder {
342            flavour: EmbeddingFlavour::Codex,
343            binary_override: None,
344            model_override: None,
345        }
346    }
347
348    /// ADR-0042 / GAP-002: builder entry point for a claude-backed
349    /// embedder with default model resolution.
350    pub fn with_claude_builder() -> LlmEmbeddingBuilder {
351        LlmEmbeddingBuilder {
352            flavour: EmbeddingFlavour::Claude,
353            binary_override: None,
354            model_override: None,
355        }
356    }
357    /// v1.0.69 (G31): refuse to spawn if an API key is set. The CLI
358    /// must use OAuth. The two API-key env vars are NOT in the
359    /// env-clear whitelist, so a parent process that exports them
360    /// will see this error.
361    fn oauth_only_enforce() -> Result<(), AppError> {
362        if std::env::var("ANTHROPIC_API_KEY").is_ok() {
363            return Err(AppError::Validation(
364                "ANTHROPIC_API_KEY is set; v1.0.76 requires OAuth. \
365                 unset it and use `claude login` instead."
366                    .into(),
367            ));
368        }
369        if std::env::var("OPENAI_API_KEY").is_ok() {
370            return Err(AppError::Validation(
371                "OPENAI_API_KEY is set; v1.0.76 requires OAuth. \
372                 unset it and use `codex login` instead."
373                    .into(),
374            ));
375        }
376        Ok(())
377    }
378
379    /// Embeds a single passage (chunk of a memory body). Returns an
380    /// f32 vector of the active dimensionality.
381    pub fn embed_passage(&self, text: &str) -> Result<Vec<f32>, AppError> {
382        self.invoke_with_prefix(crate::constants::PASSAGE_PREFIX, text)
383    }
384
385    /// Embeds a single query. The LLM uses a different prompt prefix
386    /// to disambiguate query from passage.
387    pub fn embed_query(&self, text: &str) -> Result<Vec<f32>, AppError> {
388        self.invoke_with_prefix(crate::constants::QUERY_PREFIX, text)
389    }
390
391    /// G56: returns a stable label for the active embedding model so the
392    /// in-process entity-embedding cache can key by `(model, text)`.
393    /// Embeddings produced by different models are not interchangeable,
394    /// so a cache entry from one model must never satisfy a request
395    /// served by another.
396    pub fn model_label(&self) -> String {
397        format!("{}:{}", self.flavour.as_str(), self.model)
398    }
399
    /// ADR-0042 / BUG-003 fix: returns the resolved
    /// [`EmbeddingFlavour`] of this embedder. Callers use it to report
    /// the backend that ACTUALLY executed the embedding (not the one
    /// requested in the chain). When a fallback substitutes claude for
    /// a missing codex, the operator sees the truth in the reported
    /// output.
    ///
    /// NOTE(review): the original comment named the specific callers
    /// and the fallback routine, but those link targets were lost
    /// (they appeared as empty spans) — restore them from ADR-0042.
    pub fn flavour(&self) -> EmbeddingFlavour {
        self.flavour
    }
410
411    /// G42/S2: embeds a batch of `(global_index, text)` pairs in ONE
412    /// LLM call. Returns `(global_index, vector)` pairs. Async — this
413    /// is the unit of work scheduled by the bounded fan-out in
414    /// `crate::embedder`.
415    ///
416    /// Cancel safety: the future owns its subprocess via
417    /// `kill_on_drop(true)`, so dropping it (e.g. losing a
418    /// `tokio::select!` race against a cancellation token) kills the
419    /// child and leaks nothing.
420    pub async fn embed_batch_async(
421        &self,
422        prefix: &str,
423        batch: &[(usize, String)],
424    ) -> Result<Vec<(usize, Vec<f32>)>, AppError> {
425        let dim = crate::constants::embedding_dim();
426        if batch.is_empty() {
427            return Ok(Vec::new());
428        }
429        if batch.len() == 1 {
430            let (idx, text) = (&batch[0].0, &batch[0].1);
431            let v = self.invoke_single_async(prefix, text, dim).await?;
432            return Ok(vec![(*idx, v)]);
433        }
434
435        let mut prompt = format!(
436            "Generate {dim}-dimensional semantic embedding vectors for each numbered text below.\n\
437             Return a JSON object with an \"items\" array containing EXACTLY {n} items.\n\
438             Each item has \"i\" (the 1-based index) and \"v\" (the {dim}-float vector, values between -1 and 1).\n\n",
439            n = batch.len()
440        );
441        for (pos, (_, text)) in batch.iter().enumerate() {
442            prompt.push_str(&format!("{}: {prefix}{text}\n", pos + 1));
443        }
444
445        // v1.0.89 (GAP-4): scale timeout with batch size via env var override.
446        // embed_timeout() reads SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS; we set it
447        // to the batch-scaled value before the LLM call and restore after.
448        let batch_timeout = embed_timeout_for_batch(batch.len());
449        let prev_timeout = std::env::var("SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS").ok();
450        std::env::set_var(
451            "SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS",
452            batch_timeout.as_secs().to_string(),
453        );
454        let stdout = match self.flavour {
455            EmbeddingFlavour::Claude => {
456                self.invoke_claude(&prompt, &build_batch_schema(dim))
457                    .await?
458            }
459            EmbeddingFlavour::Codex => {
460                let schema = self.codex_schema_file(dim, true)?;
461                self.invoke_codex(&prompt, schema.path()).await?
462            }
463        };
464        // Restore previous timeout value.
465        match prev_timeout {
466            Some(v) => std::env::set_var("SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS", v),
467            None => std::env::remove_var("SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS"),
468        }
469
470        let parsed: BatchEmbeddingResponse = parse_llm_json(&stdout).map_err(|e| {
471            AppError::Embedding(format!(
472                "LLM batch embedding response parse failed: {e}; raw={stdout}"
473            ))
474        })?;
475        if parsed.items.len() != batch.len() {
476            return Err(AppError::Embedding(format!(
477                "LLM batch returned {} items, expected {} (G42/S2 coverage check)",
478                parsed.items.len(),
479                batch.len()
480            )));
481        }
482        let mut out: Vec<Option<Vec<f32>>> = vec![None; batch.len()];
483        for item in parsed.items {
484            if item.i == 0 || item.i > batch.len() {
485                return Err(AppError::Embedding(format!(
486                    "LLM batch item index {} out of range 1..={}",
487                    item.i,
488                    batch.len()
489                )));
490            }
491            if item.v.len() != dim {
492                return Err(AppError::Embedding(format!(
493                    "LLM batch item {} returned {} dims, expected {dim}; \
494                     refusing to truncate or pad silently (G42/C5)",
495                    item.i,
496                    item.v.len()
497                )));
498            }
499            out[item.i - 1] = Some(item.v);
500        }
501        let mut result = Vec::with_capacity(batch.len());
502        for (pos, slot) in out.into_iter().enumerate() {
503            let v = slot.ok_or_else(|| {
504                AppError::Embedding(format!(
505                    "LLM batch response is missing item index {} (G42/S2 coverage check)",
506                    pos + 1
507                ))
508            })?;
509            result.push((batch[pos].0, v));
510        }
511        Ok(result)
512    }
513
514    fn invoke_with_prefix(&self, prefix: &str, text: &str) -> Result<Vec<f32>, AppError> {
515        let dim = crate::constants::embedding_dim();
516        let inner = self.invoke_single_async(prefix, text, dim);
517        // v1.0.79 (G42/A2): reuse the process-wide multi-thread runtime
518        // instead of building a current-thread runtime PER CALL. Inside
519        // an existing runtime (tests, async commands) block_in_place
520        // keeps the worker pool healthy.
521        match tokio::runtime::Handle::try_current() {
522            Ok(handle) => tokio::task::block_in_place(|| handle.block_on(inner)),
523            Err(_) => crate::embedder::shared_runtime()?.block_on(inner),
524        }
525    }
526
527    async fn invoke_single_async(
528        &self,
529        prefix: &str,
530        text: &str,
531        dim: usize,
532    ) -> Result<Vec<f32>, AppError> {
533        let prompt = format!("{prefix}{text}");
534        let stdout = match self.flavour {
535            EmbeddingFlavour::Claude => {
536                self.invoke_claude(&prompt, &build_single_schema(dim))
537                    .await?
538            }
539            EmbeddingFlavour::Codex => {
540                let schema = self.codex_schema_file(dim, false)?;
541                self.invoke_codex(&prompt, schema.path()).await?
542            }
543        };
544        let parsed: EmbeddingResponse = parse_llm_json(&stdout).map_err(|e| {
545            AppError::Embedding(format!(
546                "LLM embedding response parse failed: {e}; raw={stdout}"
547            ))
548        })?;
549        if parsed.embedding.len() != dim {
550            return Err(AppError::Embedding(format!(
551                "LLM returned {} dims, expected {dim}; \
552                 refusing to truncate or pad silently (G42/C5)",
553                parsed.embedding.len()
554            )));
555        }
556        Ok(parsed.embedding)
557    }
558
559    /// G42/S4: returns the lazily-created, process-shared codex schema
560    /// tempfile for the requested mode. `NamedTempFile` randomises the
561    /// filename (no PID-based collisions) and removes the file on drop
562    /// of the last `Arc` clone.
563    fn codex_schema_file(
564        &self,
565        dim: usize,
566        batch: bool,
567    ) -> Result<Arc<tempfile::NamedTempFile>, AppError> {
568        let mut guard = self.codex_schemas.lock();
569        let slot = if batch {
570            &mut guard.batch
571        } else {
572            &mut guard.single
573        };
574        if let Some((cached_dim, file)) = slot {
575            if *cached_dim == dim {
576                return Ok(Arc::clone(file));
577            }
578        }
579        let content = if batch {
580            build_batch_schema(dim)
581        } else {
582            build_single_schema(dim)
583        };
584        let file = tempfile::Builder::new()
585            .prefix("sqlite-graphrag-embed-schema-")
586            .suffix(".json")
587            .tempfile()
588            .map_err(|e| AppError::Embedding(format!("schema tempfile create failed: {e}")))?;
589        std::fs::write(file.path(), content)
590            .map_err(|e| AppError::Embedding(format!("schema tempfile write failed: {e}")))?;
591        let file = Arc::new(file);
592        *slot = Some((dim, Arc::clone(&file)));
593        Ok(file)
594    }
595
596    async fn invoke_claude(&self, prompt: &str, schema: &str) -> Result<String, AppError> {
597        // v1.0.69 hardening: --strict-mcp-config --mcp-config <PATH> --settings
598        // '{"hooks":{}}' --dangerously-skip-permissions.
599        //
600        // v1.0.76 hardening: Claude Code 2.1+ renamed --output-schema to
601        // --json-schema and accepts the schema as an inline JSON string
602        // (NOT a file path). Also pass --output-format json so the
603        // response is a single JSON object on stdout.
604        //
605        // v1.0.79 (G42/S6): CLAUDE_CONFIG_DIR points at an empty managed
606        // directory BY DEFAULT — the MCP-isolation flags above are
607        // silently ignored upstream (anthropics/claude-code#10787) and a
608        // populated ~/.claude costs ~223k cache-creation tokens per call.
609        //
610        // v1.0.88 (BUG-2 fix, ADR-0046): the inline `--mcp-config '{}'`
611        // form was rejected by Claude Code 2.1.177 (ADR-0045 Bug 2).
612        // Substitute a tempfile path produced by
613        // `write_empty_mcp_config_tempfile()` and run the full
614        // preflight gate BEFORE `Command::spawn()`, mirroring what
615        // `invoke_codex` already does for the codex backend.
616        let mcp_config_path = crate::spawn::preflight::write_empty_mcp_config_tempfile()?;
617        let argv_refs: [std::ffi::OsString; 0] = [];
618        let preflight_args = crate::spawn::preflight::PreFlightArgs {
619            binary_path: &self.binary,
620            argv: &argv_refs,
621            workspace_root: std::path::Path::new("."),
622            mcp_config_inline_json: None,
623            expected_output_bytes: 65_536,
624            spawner_name: "llm_embedding",
625        };
626        crate::spawn::preflight::preflight_check(&preflight_args)?;
627        let mut cmd = Command::new(&self.binary);
628        cmd.arg("-p")
629            .arg(prompt)
630            .arg("--model")
631            .arg(&self.model)
632            .arg("--json-schema")
633            .arg(schema)
634            .arg("--output-format")
635            .arg("json")
636            .arg("--strict-mcp-config")
637            .arg("--mcp-config")
638            .arg(mcp_config_path.as_os_str())
639            .arg("--settings")
640            .arg(r#"{"hooks":{}}"#)
641            .arg("--dangerously-skip-permissions")
642            .env_clear()
643            .env("PATH", std::env::var("PATH").unwrap_or_default())
644            .env("HOME", std::env::var("HOME").unwrap_or_default())
645            .stdin(Stdio::null())
646            .stdout(Stdio::piped())
647            .stderr(Stdio::piped())
648            // BLOCO 4: cancellation (dropped future) must kill the child.
649            .kill_on_drop(true);
650        if let Some(config_dir) = claude_embedding_config_dir() {
651            cmd.env("CLAUDE_CONFIG_DIR", &config_dir);
652        }
653        let binary_str = self.binary.to_string_lossy().into_owned();
654        let output = match tokio::time::timeout(embed_timeout(), cmd.output()).await {
655            Err(_elapsed) => {
656                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
657                    &crate::llm::exit_code_hints::LlmBackendError::Timeout {
658                        secs: embed_timeout().as_secs(),
659                        binary: binary_str.clone(),
660                    },
661                ));
662            }
663            Ok(Err(e)) => {
664                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
665                    &crate::llm::exit_code_hints::LlmBackendError::SpawnFailed {
666                        binary: binary_str.clone(),
667                        source: e.to_string(),
668                    },
669                ));
670            }
671            Ok(Ok(o)) => o,
672        };
673        // G45-CR5 / ADR-0043 (v1.0.85): parse the JSON envelope from
674        // `claude -p --output-format json` and detect OAuth quota
675        // exhaustion by looking for the `rate_limit_error` or
676        // `usage` overflow markers before checking the subprocess
677        // exit status. This lets the deterministic fallback in
678        // hybrid-search and recall swap to codex immediately.
679        let stdout_str = String::from_utf8_lossy(&output.stdout);
680        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&stdout_str) {
681            let is_rate_limited = parsed
682                .get("is_error")
683                .and_then(|v| v.as_bool())
684                .unwrap_or(false)
685                && parsed
686                    .get("result")
687                    .and_then(|v| v.as_str())
688                    .map(|s| {
689                        s.contains("rate limit")
690                            || s.contains("quota")
691                            || s.contains("anthropic-ratelimit")
692                    })
693                    .unwrap_or(false);
694            if is_rate_limited {
695                return Err(AppError::Embedding(format!(
696                    "OAuth usage quota exhausted: claude rate_limit detected in stdout: {}",
697                    parsed
698                        .get("result")
699                        .and_then(|v| v.as_str())
700                        .unwrap_or("")
701                        .chars()
702                        .take(120)
703                        .collect::<String>()
704                )));
705            }
706        }
707        if !output.status.success() {
708            let (exit_code, signal) = if let Some(code) = output.status.code() {
709                (Some(code), None)
710            } else {
711                use std::os::unix::process::ExitStatusExt;
712                (None, output.status.signal())
713            };
714            let stdout_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
715                &output.stdout,
716                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
717            );
718            let stderr_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
719                &output.stderr,
720                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
721            );
722            let mut hint = crate::llm::exit_code_hints::diagnose_exit_code(exit_code, signal);
723            // v1.0.89 (GAP-5): detect expired OAuth and suggest actionable fix.
724            if stderr_tail.contains("401")
725                || stderr_tail.contains("Unauthorized")
726                || stderr_tail.contains("expired")
727                || stderr_tail.contains("login")
728                || stdout_tail.contains("401")
729                || stdout_tail.contains("Unauthorized")
730            {
731                hint.push_str(
732                    " | Claude OAuth token may be expired; run `claude login` to renew",
733                );
734            }
735            return Err(crate::llm::exit_code_hints::into_legacy_embedding(
736                &crate::llm::exit_code_hints::LlmBackendError::NonZeroExit {
737                    exit_code,
738                    signal,
739                    stdout_tail,
740                    stderr_tail,
741                    binary: binary_str,
742                    hint,
743                },
744            ));
745        }
746        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
747    }
748
749    async fn invoke_codex(
750        &self,
751        prompt: &str,
752        schema_path: &std::path::Path,
753    ) -> Result<String, AppError> {
754        let binary_str = self.binary.to_string_lossy().into_owned();
755        let mut cmd = build_codex_embedding_command(&self.binary, &self.model, schema_path);
756
757        // GAP-META-005 (v1.0.87, ADR-0045): pre-flight gate before spawn.
758        // `tokio::process::Command` does not expose `get_args()`, so we
759        // skip the argv-size check here and rely on binary + workspace
760        // root + output buffer guards. Embedding prompts are bounded by
761        // the schema validator so argv overflow is not a real risk here.
762        //
763        // v1.0.88 (BUG-7 fix, ADR-0046): propagate the preflight error
764        // directly via `AppError::PreFlightFailed` (via the `From`
765        // impl added in `errors.rs`) so callers and operators see the
766        // structured `PreFlightError` variant and the canonical exit
767        // code 16. The previous implementation wrapped the error in
768        // `LlmBackendError::SpawnFailed`, which mapped to a different
769        // exit code and masked the preflight signal.
770        let argv_refs: [std::ffi::OsString; 0] = [];
771        let preflight_args = crate::spawn::preflight::PreFlightArgs {
772            binary_path: &self.binary,
773            argv: &argv_refs,
774            workspace_root: std::path::Path::new("."),
775            mcp_config_inline_json: None,
776            expected_output_bytes: 65_536,
777            spawner_name: "llm_embedding",
778        };
779        crate::spawn::preflight::preflight_check(&preflight_args)?;
780        let _ = binary_str; // silenced: preflight does not need it
781
782        let mut child = match cmd.spawn() {
783            Ok(c) => c,
784            Err(e) => {
785                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
786                    &crate::llm::exit_code_hints::LlmBackendError::SpawnFailed {
787                        binary: binary_str,
788                        source: e.to_string(),
789                    },
790                ));
791            }
792        };
793        if let Some(mut stdin) = child.stdin.take() {
794            stdin
795                .write_all(prompt.as_bytes())
796                .await
797                .map_err(|e| AppError::Embedding(format!("codex stdin write failed: {e}")))?;
798            drop(stdin);
799        }
800        let output = match tokio::time::timeout(embed_timeout(), child.wait_with_output()).await {
801            Err(_elapsed) => {
802                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
803                    &crate::llm::exit_code_hints::LlmBackendError::Timeout {
804                        secs: embed_timeout().as_secs(),
805                        binary: binary_str,
806                    },
807                ));
808            }
809            Ok(Err(e)) => {
810                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
811                    &crate::llm::exit_code_hints::LlmBackendError::SpawnFailed {
812                        binary: binary_str,
813                        source: format!("codex wait failed: {e}"),
814                    },
815                ));
816            }
817            Ok(Ok(o)) => o,
818        };
819        if !output.status.success() {
820            let (exit_code, signal) = if let Some(code) = output.status.code() {
821                (Some(code), None)
822            } else {
823                use std::os::unix::process::ExitStatusExt;
824                (None, output.status.signal())
825            };
826            let stdout_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
827                &output.stdout,
828                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
829            );
830            let stderr_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
831                &output.stderr,
832                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
833            );
834            let hint = crate::llm::exit_code_hints::diagnose_exit_code(exit_code, signal);
835            // G42/S7: the headless spawn can still hit interactive
836            // prompts on some codex builds; keep the legacy request_user_input
837            // branch as a special-case hint, and stamp the diagnostic
838            // tail on top of the canonical NonZeroExit envelope.
839            let mut combined_hint = hint;
840            if stderr_tail.contains("request_user_input") {
841                combined_hint.push_str(
842                    " | codex requested interactive input in a headless embedding call; \
843                     upgrade codex (>= 0.134) or switch the embedding backend to claude",
844                );
845            }
846            return Err(crate::llm::exit_code_hints::into_legacy_embedding(
847                &crate::llm::exit_code_hints::LlmBackendError::NonZeroExit {
848                    exit_code,
849                    signal,
850                    stdout_tail,
851                    stderr_tail,
852                    binary: binary_str,
853                    hint: combined_hint,
854                },
855            ));
856        }
857        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
858    }
859}
860
861/// G42/S6: resolves the empty `CLAUDE_CONFIG_DIR` used for embedding
862/// subprocesses.
863///
864/// - `SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR` is honoured when set and
865///   pointing at a directory (same contract as G28-A in claude_runner);
866/// - otherwise a managed directory is created at
867///   `~/.local/state/sqlite-graphrag/claude-empty-config` (mode 0700).
868///   If `~/.claude/.credentials.json` exists (Linux OAuth storage) it is
869///   copied in so authentication still works; on macOS credentials live
870///   in the Keychain and the empty dir is sufficient.
871///
872/// Returns `None` only when HOME is unset AND no override is given —
873/// in that case the subprocess falls back to claude's own default.
874fn claude_embedding_config_dir() -> Option<std::path::PathBuf> {
875    if let Ok(dir) = std::env::var("SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR") {
876        let path = std::path::PathBuf::from(dir);
877        if path.is_dir() {
878            return Some(path);
879        }
880        tracing::warn!(
881            target: "embedding",
882            path = %path.display(),
883            "SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR is set but not a directory; \
884             falling back to the managed empty config dir"
885        );
886    }
887    let home = std::env::var("HOME").ok()?;
888    let dir = std::path::Path::new(&home)
889        .join(".local/state/sqlite-graphrag")
890        .join("claude-empty-config");
891    if std::fs::create_dir_all(&dir).is_err() {
892        return None;
893    }
894    #[cfg(unix)]
895    {
896        use std::os::unix::fs::PermissionsExt;
897        let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700));
898    }
899    // Linux stores OAuth credentials on disk; copy them so the isolated
900    // config dir still authenticates. Best-effort: macOS uses Keychain.
901    // v1.0.89: ALWAYS copy (was: skip if target exists). OAuth tokens
902    // expire and the stale copy causes 401 until manually deleted.
903    let creds = std::path::Path::new(&home).join(".claude/.credentials.json");
904    if creds.exists() {
905        let target = dir.join(".credentials.json");
906        let _ = std::fs::copy(&creds, &target);
907    }
908    Some(dir)
909}
910
911fn build_codex_embedding_command(
912    binary: &std::path::Path,
913    model: &str,
914    schema_path: &std::path::Path,
915) -> Command {
916    let mut cmd = Command::new(binary);
917    // v1.0.77: `-c` TOML overrides bypass the codex exec --sandbox propagation
918    // bug (openai/codex#18113). CLI flags alone are insufficient — the exec
919    // subcommand may not inherit --sandbox from the parent codex command.
920    cmd.arg("exec")
921        .arg("-c")
922        .arg("sandbox_mode='read-only'")
923        .arg("-c")
924        .arg("approval_policy='never'")
925        .arg("--json")
926        .arg("--output-schema")
927        .arg(schema_path)
928        .arg("--ephemeral")
929        .arg("--skip-git-repo-check")
930        .arg("--sandbox")
931        .arg("read-only")
932        .arg("--ignore-user-config")
933        .arg("--ignore-rules");
934    if crate::extract::codex_compat::codex_supports_ask_for_approval() {
935        cmd.arg("--ask-for-approval").arg("never");
936    }
937    // v1.0.89: use the real CODEX_HOME (~/.codex) instead of an isolated
938    // per-PID directory. The isolated dir caused cold-start overhead (codex
939    // creates ~6 SQLite databases on first run) that regularly exceeded
940    // the 30s embedding timeout. The --ignore-user-config + --ephemeral
941    // flags already prevent config pollution; CODEX_HOME only needs auth.
942    cmd.arg("--model")
943        .arg(model)
944        .arg("-")
945        .env_clear()
946        .env("PATH", std::env::var("PATH").unwrap_or_default())
947        .env("HOME", std::env::var("HOME").unwrap_or_default());
948    if let Ok(codex_home) = std::env::var("CODEX_HOME") {
949        cmd.env("CODEX_HOME", codex_home);
950    } else if let Ok(home) = std::env::var("HOME") {
951        let default_home = std::path::Path::new(&home).join(".codex");
952        if default_home.exists() {
953            cmd.env("CODEX_HOME", &default_home);
954        }
955    }
956    cmd.stdin(Stdio::piped())
957        .stdout(Stdio::piped())
958        .stderr(Stdio::piped())
959        // BLOCO 4: cancellation (dropped future) must kill the child.
960        .kill_on_drop(true);
961    cmd
962}
963
964// prepare_isolated_codex_home removed in v1.0.89: the per-PID isolated
965// CODEX_HOME caused cold-start overhead that exceeded the 30s embedding
966// timeout. The real ~/.codex is now used directly (see build_codex_embedding_command).
967
968/// Parse an LLM JSON response of type `T`. The two backends emit
969/// different shapes:
970/// - Claude (with `--output-format json`): single JSON object on stdout.
971/// - Codex (with `--json`): JSONL stream with one event per line; the
972///   `agent_message` event's `text` field is the JSON payload.
973///
974/// This helper accepts both shapes and returns the parsed value (or an
975/// error describing the first mismatch).
976fn parse_llm_json<T: serde::de::DeserializeOwned>(stdout: &str) -> Result<T, String> {
977    // Strategy 1: try the whole stdout as JSON (Claude path).
978    if let Ok(parsed) = serde_json::from_str::<T>(stdout) {
979        return Ok(parsed);
980    }
981    // Strategy 2: walk the JSONL line by line and pick the last
982    // `item.completed` of type `agent_message` (Codex path).
983    let mut last_agent_text: Option<String> = None;
984    for line in stdout.lines() {
985        let line = line.trim();
986        if line.is_empty() {
987            continue;
988        }
989        let Ok(event) = serde_json::from_str::<serde_json::Value>(line) else {
990            continue;
991        };
992        if event.get("type").and_then(|t| t.as_str()) != Some("item.completed") {
993            continue;
994        }
995        let item = match event.get("item") {
996            Some(i) => i,
997            None => continue,
998        };
999        if item.get("type").and_then(|t| t.as_str()) != Some("agent_message") {
1000            continue;
1001        }
1002        if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
1003            last_agent_text = Some(text.to_string());
1004        }
1005    }
1006    let text = last_agent_text
1007        .ok_or_else(|| "no agent_message found in codex JSONL output".to_string())?;
1008    serde_json::from_str::<T>(&text)
1009        .map_err(|e| format!("codex agent_message text does not match schema: {e}; raw={text}"))
1010}
1011
#[cfg(test)]
mod tests {
    use super::*;

    /// Sets or clears one environment variable and puts the previous
    /// value back when dropped, so a panicking assertion cannot leak
    /// state into later tests and ambient values are not destroyed.
    /// Only use inside `#[serial_test::serial(env)]` tests.
    struct EnvVarGuard {
        key: &'static str,
        previous: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        /// Sets `key` to `value`, remembering what was there before.
        fn set(key: &'static str, value: &str) -> Self {
            let previous = std::env::var_os(key);
            // SAFETY: callers run under `serial(env)`, which prevents
            // cross-test interference on the process environment.
            unsafe {
                std::env::set_var(key, value);
            }
            Self { key, previous }
        }

        /// Removes `key`, remembering what was there before.
        fn unset(key: &'static str) -> Self {
            let previous = std::env::var_os(key);
            // SAFETY: callers run under `serial(env)`.
            unsafe {
                std::env::remove_var(key);
            }
            Self { key, previous }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            // SAFETY: the guard lives inside a `serial(env)` test.
            unsafe {
                match self.previous.take() {
                    Some(value) => std::env::set_var(self.key, value),
                    None => std::env::remove_var(self.key),
                }
            }
        }
    }

    /// Builds a client directly from its fields, bypassing the builder
    /// and its binary lookup.
    fn test_client(flavour: EmbeddingFlavour, binary: std::path::PathBuf) -> LlmEmbedding {
        LlmEmbedding {
            flavour,
            binary,
            model: "gpt-5.4".to_string(),
            codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
        }
    }

    #[test]
    fn embed_timeout_default_is_60() {
        assert_eq!(DEFAULT_EMBED_TIMEOUT_SECS, 60);
    }

    #[test]
    #[serial_test::serial(env)]
    fn oauth_only_enforce_blocks_api_keys() {
        // Start with both keys absent; ambient values come back on drop.
        let _anthropic = EnvVarGuard::unset("ANTHROPIC_API_KEY");
        let _openai = EnvVarGuard::unset("OPENAI_API_KEY");

        {
            let _key = EnvVarGuard::set("ANTHROPIC_API_KEY", "test");
            assert!(LlmEmbedding::oauth_only_enforce().is_err());
        }
        {
            let _key = EnvVarGuard::set("OPENAI_API_KEY", "test");
            assert!(LlmEmbedding::oauth_only_enforce().is_err());
        }
        assert!(LlmEmbedding::oauth_only_enforce().is_ok());
    }

    #[test]
    fn flavour_as_str_is_stable() {
        assert_eq!(EmbeddingFlavour::Claude.as_str(), "claude");
        assert_eq!(EmbeddingFlavour::Codex.as_str(), "codex");
    }

    #[test]
    fn single_schema_embeds_active_dim() {
        let schema = build_single_schema(64);
        assert!(schema.contains(r#""minItems":64"#));
        assert!(schema.contains(r#""maxItems":64"#));
        let parsed: serde_json::Value =
            serde_json::from_str(&schema).expect("single schema must be valid JSON");
        assert_eq!(parsed["properties"]["embedding"]["minItems"], 64);
    }

    #[test]
    fn batch_schema_is_valid_json_and_unbounded_items() {
        let schema = build_batch_schema(64);
        let parsed: serde_json::Value =
            serde_json::from_str(&schema).expect("batch schema must be valid JSON");
        // The items array must NOT constrain its length so one schema
        // file serves every batch size (G42/S4).
        assert!(parsed["properties"]["items"].get("minItems").is_none());
        assert_eq!(
            parsed["properties"]["items"]["items"]["properties"]["v"]["minItems"],
            64
        );
    }

    #[test]
    fn parse_llm_json_accepts_claude_json() {
        let stdout = r#"{"embedding":[0.0,1.0,2.0]}"#;

        let parsed: EmbeddingResponse = parse_llm_json(stdout).expect("claude JSON must parse");

        assert_eq!(parsed.embedding, vec![0.0, 1.0, 2.0]);
    }

    #[test]
    fn parse_llm_json_accepts_codex_jsonl() {
        let stdout = r#"{"type":"thread.started","thread_id":"mock-thread-0"}
{"type":"item.completed","item":{"type":"agent_message","text":"{\"embedding\":[0.0,1.0,2.0]}"}}
{"type":"turn.completed","usage":{"input_tokens":1,"output_tokens":1}}"#;

        let parsed: EmbeddingResponse = parse_llm_json(stdout).expect("codex JSONL must parse");

        assert_eq!(parsed.embedding, vec![0.0, 1.0, 2.0]);
    }

    #[test]
    fn parse_llm_json_rejects_jsonl_without_agent_message() {
        let stdout = r#"{"type":"thread.started","thread_id":"mock-thread-0"}"#;

        let err = parse_llm_json::<EmbeddingResponse>(stdout)
            .expect_err("missing agent_message must fail");

        assert!(err.contains("no agent_message"));
    }

    #[test]
    fn parse_llm_json_accepts_batch_response() {
        let stdout = r#"{"items":[{"i":1,"v":[0.0,1.0]},{"i":2,"v":[2.0,3.0]}]}"#;

        let parsed: BatchEmbeddingResponse = parse_llm_json(stdout).expect("batch JSON must parse");

        assert_eq!(parsed.items.len(), 2);
        assert_eq!(parsed.items[0].i, 1);
        assert_eq!(parsed.items[1].v, vec![2.0, 3.0]);
    }

    #[test]
    fn codex_schema_file_is_created_once_and_reused() {
        let client = test_client(
            EmbeddingFlavour::Codex,
            std::path::PathBuf::from("/bin/true"),
        );
        let first = client
            .codex_schema_file(64, false)
            .expect("schema file must be created");
        let second = client
            .codex_schema_file(64, false)
            .expect("schema file must be reused");
        assert_eq!(first.path(), second.path(), "same dim must reuse the file");

        let batch = client
            .codex_schema_file(64, true)
            .expect("batch schema file must be created");
        assert_ne!(
            first.path(),
            batch.path(),
            "single and batch schemas are distinct files"
        );

        let content = std::fs::read_to_string(first.path()).expect("schema file must be readable");
        assert!(content.contains(r#""minItems":64"#));
    }

    #[test]
    fn codex_embedding_command_reads_prompt_from_stdin() {
        let schema_path = std::env::temp_dir().join("sqlite-graphrag-embed-schema-test.json");
        let cmd = build_codex_embedding_command(
            std::path::Path::new("/bin/true"),
            "gpt-5.4",
            &schema_path,
        );
        let argv: Vec<String> = cmd
            .as_std()
            .get_args()
            .filter_map(|arg| arg.to_str().map(|s| s.to_string()))
            .collect();

        assert!(
            argv.iter().any(|arg| arg == "-"),
            "codex embedding command must read prompt from stdin: {argv:?}"
        );
        assert!(
            !argv.iter().any(|arg| arg.starts_with("passage: ")),
            "prompt text must not be passed as argv: {argv:?}"
        );
        for required in &[
            "exec",
            "-c",
            "sandbox_mode='read-only'",
            "approval_policy='never'",
            "--json",
            "--output-schema",
            "--ephemeral",
            "--skip-git-repo-check",
            "--sandbox",
            "read-only",
            "--ignore-user-config",
            "--ignore-rules",
            "--model",
            "gpt-5.4",
        ] {
            assert!(
                argv.iter().any(|arg| arg == required),
                "missing flag {required} in {argv:?}"
            );
        }
    }

    #[cfg(unix)]
    #[test]
    #[serial_test::serial(env)]
    fn embed_passage_sends_prompt_to_codex_stdin() {
        use std::os::unix::fs::PermissionsExt;

        // Pin the dimensionality so the mock script and the validation
        // agree regardless of test execution order. The guard undoes the
        // change even when an `expect` below panics.
        let _dim = EnvVarGuard::set("SQLITE_GRAPHRAG_EMBEDDING_DIM", "64");

        let temp = tempfile::tempdir().expect("tempdir must exist");
        let binary = temp.path().join("codex-stdin-check");
        // Mock codex: fails with exit 41 unless stdin carries the exact
        // prompt, otherwise prints one agent_message with 64 zeros.
        let script = r#"#!/usr/bin/env bash
set -euo pipefail

prompt="$(cat)"
if [[ "$prompt" != "passage: codex-cli" ]]; then
  echo "unexpected stdin: $prompt" >&2
  exit 41
fi

vals="0.0"
for _ in $(seq 2 64); do
  vals="$vals,0.0"
done
payload="{\"embedding\":[$vals]}"
escaped="${payload//\"/\\\"}"
echo "{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"$escaped\"}}"
"#;
        std::fs::write(&binary, script).expect("mock codex script must be written");
        let mut perms = std::fs::metadata(&binary)
            .expect("mock codex metadata must exist")
            .permissions();
        perms.set_mode(0o755);
        std::fs::set_permissions(&binary, perms).expect("mock codex must be executable");

        let embedding = test_client(EmbeddingFlavour::Codex, binary);

        let vector = embedding
            .embed_passage("codex-cli")
            .expect("stdin-backed codex embedding must succeed");

        assert_eq!(vector.len(), 64);
        assert!(vector.iter().all(|value| *value == 0.0));
    }

    // ---------------------------------------------------------------
    // ADR-0042 / GAP-002: LlmEmbeddingBuilder unit tests
    // ---------------------------------------------------------------

    /// `claude_default` must yield a builder set to the Claude flavour
    /// with no binary or model override. `build()` is deliberately not
    /// called, so the test does not need `claude` on PATH.
    #[test]
    fn claude_default_resolves_path() {
        let builder = LlmEmbeddingBuilder::claude_default();
        assert_eq!(builder.flavour, EmbeddingFlavour::Claude);
        assert!(builder.binary_override.is_none());
        assert!(builder.model_override.is_none());
    }

    /// `override_binary` must store the given path verbatim on the
    /// builder.
    #[test]
    fn override_binary_uses_provided() {
        let path = std::path::PathBuf::from("/tmp/fake-claude-binary");
        let builder = LlmEmbeddingBuilder::claude_default().override_binary(path.clone());
        assert_eq!(builder.binary_override.as_ref(), Some(&path));
    }

    /// `override_model` must store the given model name verbatim on the
    /// builder.
    #[test]
    fn override_model_uses_provided() {
        let builder =
            LlmEmbeddingBuilder::codex_default().override_model("gpt-5.4-custom".to_string());
        assert_eq!(builder.model_override.as_deref(), Some("gpt-5.4-custom"));
    }

    // ---------------------------------------------------------------
    // v1.0.89 GAP tests
    // ---------------------------------------------------------------

    #[test]
    fn embed_timeout_for_batch_scales_with_size() {
        let t1 = embed_timeout_for_batch(1);
        let t4 = embed_timeout_for_batch(4);
        let t8 = embed_timeout_for_batch(8);
        assert!(t1 < t4, "batch of 4 must have longer timeout than batch of 1");
        assert!(t4 < t8, "batch of 8 must have longer timeout than batch of 4");
        // 15 extra seconds per additional item: 7 items between 1 and 8.
        assert_eq!(t8 - t1, std::time::Duration::from_secs(15 * 7));
    }

    #[test]
    fn embed_timeout_for_batch_single_equals_base() {
        let base = embed_timeout();
        let single = embed_timeout_for_batch(1);
        assert_eq!(base, single);
    }
}
sqlite_graphrag/extract/llm_embedding.rs

sqlite_graphrag/extract/
llm_embedding.rs