sqlite_graphrag/extract/
llm_embedding.rs

1//! LLM-based embedding backend (v1.0.76 default; reworked in v1.0.79 G42).
2//!
3//! `LlmEmbedding` is the production embedding client. It wraps headless
4//! invocations of `claude code` or `codex` and returns f32 vectors of the
5//! active dimensionality (`crate::constants::embedding_dim()`, default 64).
6//!
7//! v1.0.79 (G42) changes:
8//! - S1: the dimensionality is no longer hardcoded here — the single
9//!   source of truth lives in `crate::constants` and the JSON schemas
10//!   are generated dynamically.
11//! - S2: `embed_batch` embeds N numbered texts per LLM call with the
12//!   `{items:[{i,v}]}` schema, collapsing 39 subprocess spawns into 4-5.
13//! - S4: the codex `--output-schema` file is a `tempfile::NamedTempFile`
14//!   with a randomised name created once per client and shared across
15//!   clones via `Arc` — no per-call write+delete, no PID-path races.
16//! - S5: the claude model honours `SQLITE_GRAPHRAG_CLAUDE_EMBED_MODEL`
17//!   (symmetric to the codex env var). ZERO hardcoded models without
18//!   an env override.
19//! - S6: `CLAUDE_CONFIG_DIR` points at an empty managed directory BY
20//!   DEFAULT, because `--strict-mcp-config`/`--mcp-config '{}'` are
21//!   silently ignored upstream (anthropics/claude-code#10787) and a
22//!   full `~/.claude` costs ~223k cache-creation tokens per call.
23//! - S7: the codex `request_user_input` failure mode maps to an
24//!   actionable error instead of an opaque exit 11.
25//! - BLOCO 4: every subprocess uses `kill_on_drop(true)` plus an
26//!   explicit `tokio::time::timeout`, so cancellation never leaks a
27//!   child and a hung LLM cannot stall the pipeline forever.
28//!
29//! OAuth is the only supported credential path. The constructor rejects
30//! `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` in the environment — see
31//! `v1.0.69 (G31) OAuth-Only Enforcement`.
32
33use crate::errors::AppError;
34use serde::Deserialize;
35use std::process::Stdio;
36use std::sync::Arc;
37use tokio::io::AsyncWriteExt;
38use tokio::process::Command;
39
/// Fallback per-LLM-call timeout, in seconds. Consistent with the
/// `--claude-timeout` / `--codex-timeout` defaults used by ingest.
/// Override via `SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS`.
const DEFAULT_EMBED_TIMEOUT_SECS: u64 = 300;

/// Resolves the timeout applied to a single LLM subprocess call.
///
/// `SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS` is honoured only when it parses as
/// an unsigned integer in `10..=3600`; an unset, unparsable or out-of-range
/// value yields [`DEFAULT_EMBED_TIMEOUT_SECS`].
fn embed_timeout() -> std::time::Duration {
    let accepted = 10..=3_600_u64;
    let secs = match std::env::var("SQLITE_GRAPHRAG_EMBED_TIMEOUT_SECS") {
        Ok(raw) => match raw.parse::<u64>() {
            Ok(n) if accepted.contains(&n) => n,
            _ => DEFAULT_EMBED_TIMEOUT_SECS,
        },
        Err(_) => DEFAULT_EMBED_TIMEOUT_SECS,
    };
    std::time::Duration::from_secs(secs)
}
53
/// G42/S1: renders the single-vector JSON schema for the active dim.
///
/// The schema describes an object whose only (required) property is
/// `embedding`: an array of numbers with exactly `dim` entries.
fn build_single_schema(dim: usize) -> String {
    // Fixed-length numeric array; `minItems == maxItems` pins the length.
    let vector = format!(
        r#"{{"type":"array","items":{{"type":"number"}},"minItems":{dim},"maxItems":{dim}}}"#
    );
    format!(
        r#"{{"type":"object","properties":{{"embedding":{vector}}},"required":["embedding"],"additionalProperties":false}}"#
    )
}
60
/// G42/S2: renders the batch JSON schema `{items:[{i,v}]}`.
///
/// Only the per-item vector length is constrained (exactly `dim` numbers).
/// The `items` array length is deliberately left open so ONE schema file
/// serves every batch size; index coverage is validated in Rust after
/// parsing.
fn build_batch_schema(dim: usize) -> String {
    // Fixed-length numeric array used for each item's `v`.
    let vector = format!(
        r#"{{"type":"array","items":{{"type":"number"}},"minItems":{dim},"maxItems":{dim}}}"#
    );
    // One `{i, v}` entry of the batch.
    let entry = format!(
        r#"{{"type":"object","properties":{{"i":{{"type":"integer"}},"v":{vector}}},"required":["i","v"],"additionalProperties":false}}"#
    );
    format!(
        r#"{{"type":"object","properties":{{"items":{{"type":"array","items":{entry}}}}},"required":["items"],"additionalProperties":false}}"#
    )
}
69
/// Embedding client that spawns a headless LLM CLI (`claude` or `codex`)
/// and parses the vectors it returns.
///
/// The type is `Clone`; `codex_schemas` sits behind an `Arc`, so every
/// clone shares the same cached schema tempfiles.
#[derive(Clone, Debug)]
pub struct LlmEmbedding {
    /// Which LLM headless binary to spawn. `claude` or `codex`.
    flavour: EmbeddingFlavour,
    /// Cached path to the binary to avoid PATH lookups on every call.
    binary: std::path::PathBuf,
    /// Model name. Resolved from env overrides at construction time.
    model: String,
    /// G42/S4: lazily-created codex `--output-schema` tempfiles, shared
    /// across clones. Keyed by dim so an env change between tests cannot
    /// serve a stale schema.
    codex_schemas: Arc<parking_lot::Mutex<CodexSchemaFiles>>,
}
83
/// Cache of the codex `--output-schema` tempfiles, one slot per request
/// shape. Each slot remembers the dimensionality the file was rendered
/// for, so a lookup with a different dim regenerates the file instead of
/// reusing it. Both slots start out empty (`Default`).
#[derive(Debug, Default)]
struct CodexSchemaFiles {
    /// Schema file for single-vector requests, tagged with its dim.
    single: Option<(usize, Arc<tempfile::NamedTempFile>)>,
    /// Schema file for batch (`{items:[{i,v}]}`) requests, tagged with its dim.
    batch: Option<(usize, Arc<tempfile::NamedTempFile>)>,
}
89
/// Which headless LLM CLI backs an [`LlmEmbedding`] client.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize)]
pub enum EmbeddingFlavour {
    /// The `claude` CLI.
    Claude,
    /// The `codex` CLI.
    Codex,
}
95
96impl EmbeddingFlavour {
97    pub fn as_str(self) -> &'static str {
98        match self {
99            Self::Claude => "claude",
100            Self::Codex => "codex",
101        }
102    }
103}
104
/// Deserialization target for a single-vector reply: an object with one
/// `embedding` array, matching the schema from `build_single_schema`.
#[derive(Debug, Deserialize)]
struct EmbeddingResponse {
    /// The returned vector; its length is validated by the caller.
    embedding: Vec<f32>,
}
109
/// Deserialization target for a batch reply: `{items:[{i,v}]}`, matching
/// the schema from `build_batch_schema`.
#[derive(Debug, Deserialize)]
struct BatchEmbeddingResponse {
    /// One entry per embedded text; count and indices are validated by
    /// the caller.
    items: Vec<BatchEmbeddingItem>,
}
114
/// One `{i, v}` entry of a batch reply.
#[derive(Debug, Deserialize)]
struct BatchEmbeddingItem {
    /// 1-based position of the text inside the batch prompt.
    i: usize,
    /// The vector returned for that text.
    v: Vec<f32>,
}
120
121/// Follows symlinks and shell-script shim `exec` targets to find
122/// the real ELF binary. Shim wrappers (like `~/.graphrag-shim/codex`)
123/// can strip hardening flags; bypassing them is a security requirement.
124pub fn resolve_real_binary(path: &std::path::Path) -> std::path::PathBuf {
125    if let Ok(canonical) = std::fs::canonicalize(path) {
126        if is_elf_binary(&canonical) {
127            return canonical;
128        }
129        if let Some(exec_target) = extract_exec_target_from_shim(&canonical) {
130            if exec_target.exists() && is_elf_binary(&exec_target) {
131                return exec_target;
132            }
133        }
134        return canonical;
135    }
136    path.to_path_buf()
137}
138
/// Reports whether the file at `path` starts with the ELF magic number
/// (`0x7f 'E' 'L' 'F'`).
///
/// Only the first four bytes are read, so probing a large executable does
/// not load it into memory. Any I/O failure (missing file, directory,
/// permission error, file shorter than four bytes) yields `false`.
fn is_elf_binary(path: &std::path::Path) -> bool {
    const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F'];
    let Ok(mut file) = std::fs::File::open(path) else {
        return false;
    };
    let mut header = [0u8; 4];
    // `read_exact` fails on short files, which is the desired "not ELF".
    std::io::Read::read_exact(&mut file, &mut header).is_ok() && header == ELF_MAGIC
}
144
/// Extracts the program named by the last `exec ...` line of a shell
/// shim.
///
/// Returns `None` when the file cannot be read as UTF-8 text, does not
/// start with a shebang (`#!`), or contains no line beginning with
/// `exec `. The returned path is taken verbatim from the script: it is
/// not checked for existence and shell variables are not expanded.
///
/// A target wrapped in single or double quotes (the common
/// `exec "/path/to/bin" "$@"` form) is returned without the quotes and
/// may contain spaces. An unterminated quote falls back to the first
/// whitespace-delimited token.
fn extract_exec_target_from_shim(path: &std::path::Path) -> Option<std::path::PathBuf> {
    let content = std::fs::read_to_string(path).ok()?;
    if !content.starts_with("#!") {
        return None;
    }
    // Scan bottom-up: a shim's final `exec` is the one that hands over control.
    let command = content
        .lines()
        .rev()
        .find_map(|line| line.trim().strip_prefix("exec "))?
        .trim_start();
    // Quoted target: take everything up to the matching closing quote.
    let quoted = command
        .chars()
        .next()
        .filter(|c| matches!(c, '"' | '\''))
        .and_then(|quote| {
            let rest = &command[1..];
            rest.find(quote).map(|end| &rest[..end])
        });
    let target = match quoted {
        Some(inner) => inner,
        None => command.split_whitespace().next()?,
    };
    Some(std::path::PathBuf::from(target))
}
160
/// G42/S5: claude embedding model with env override, symmetric to the
/// codex `SQLITE_GRAPHRAG_CODEX_EMBED_MODEL` introduced in v1.0.78.
///
/// `SQLITE_GRAPHRAG_CLAUDE_EMBED_MODEL` is used when it holds a non-blank
/// value (surrounding whitespace is trimmed). An unset, non-Unicode, empty
/// or whitespace-only value falls back to `claude-sonnet-4-6`, so a blank
/// export can never turn into an empty `--model` argument.
fn claude_embed_model() -> String {
    std::env::var("SQLITE_GRAPHRAG_CLAUDE_EMBED_MODEL")
        .ok()
        .map(|raw| raw.trim().to_owned())
        .filter(|name| !name.is_empty())
        .unwrap_or_else(|| "claude-sonnet-4-6".to_owned())
}
167
/// Codex embedding model with env override.
///
/// `SQLITE_GRAPHRAG_CODEX_EMBED_MODEL` is used when it holds a non-blank
/// value (surrounding whitespace is trimmed). An unset, non-Unicode, empty
/// or whitespace-only value falls back to `gpt-5.5`, so a blank export can
/// never turn into an empty `--model` argument.
fn codex_embed_model() -> String {
    std::env::var("SQLITE_GRAPHRAG_CODEX_EMBED_MODEL")
        .ok()
        .map(|raw| raw.trim().to_owned())
        .filter(|name| !name.is_empty())
        .unwrap_or_else(|| "gpt-5.5".to_owned())
}
171
172impl LlmEmbedding {
173    /// Detects which LLM CLI is available on PATH and returns the
174    /// matching embedding client.
175    ///
176    /// v1.0.76: PREFERS `codex` over `claude` because:
177    /// - Claude Code 2.1+ ships a 180k+ token system context (plugins,
178    ///   skills, agents, MCP) that overflows the 200k context window
179    ///   for even trivial embedding prompts and returns "Prompt is too
180    ///   long". (v1.0.79/S6 mitigates this with an empty
181    ///   `CLAUDE_CONFIG_DIR`, but codex stays the lighter default.)
182    /// - Codex 0.134+ is lightweight (~5k system context) and the
183    ///   `StructuredOutput` tool reliably returns the requested vectors.
184    pub fn detect_available() -> Result<Self, AppError> {
185        Self::oauth_only_enforce()?;
186
187        if let Ok(path) = which::which("codex") {
188            return Ok(Self {
189                flavour: EmbeddingFlavour::Codex,
190                binary: resolve_real_binary(&path),
191                model: codex_embed_model(),
192                codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
193            });
194        }
195        if let Ok(path) = which::which("claude") {
196            return Ok(Self {
197                flavour: EmbeddingFlavour::Claude,
198                binary: resolve_real_binary(&path),
199                model: claude_embed_model(),
200                codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
201            });
202        }
203        Err(AppError::Embedding(
204            "no LLM CLI found on PATH: install `codex` (0.130+) or `claude` (Claude Code 2.1+)"
205                .to_string(),
206        ))
207    }
208
209    pub fn with_codex() -> Result<Self, AppError> {
210        Self::oauth_only_enforce()?;
211        let path = which::which("codex")
212            .map_err(|_| AppError::Embedding("`codex` not found on PATH".to_string()))?;
213        Ok(Self {
214            flavour: EmbeddingFlavour::Codex,
215            binary: resolve_real_binary(&path),
216            model: codex_embed_model(),
217            codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
218        })
219    }
220
221    pub fn with_claude() -> Result<Self, AppError> {
222        Self::oauth_only_enforce()?;
223        let path = which::which("claude")
224            .map_err(|_| AppError::Embedding("`claude` not found on PATH".to_string()))?;
225        Ok(Self {
226            flavour: EmbeddingFlavour::Claude,
227            binary: resolve_real_binary(&path),
228            model: claude_embed_model(),
229            codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
230        })
231    }
232
233    /// v1.0.69 (G31): refuse to spawn if an API key is set. The CLI
234    /// must use OAuth. The two API-key env vars are NOT in the
235    /// env-clear whitelist, so a parent process that exports them
236    /// will see this error.
237    fn oauth_only_enforce() -> Result<(), AppError> {
238        if std::env::var("ANTHROPIC_API_KEY").is_ok() {
239            return Err(AppError::Validation(
240                "ANTHROPIC_API_KEY is set; v1.0.76 requires OAuth. \
241                 unset it and use `claude login` instead."
242                    .into(),
243            ));
244        }
245        if std::env::var("OPENAI_API_KEY").is_ok() {
246            return Err(AppError::Validation(
247                "OPENAI_API_KEY is set; v1.0.76 requires OAuth. \
248                 unset it and use `codex login` instead."
249                    .into(),
250            ));
251        }
252        Ok(())
253    }
254
255    /// Embeds a single passage (chunk of a memory body). Returns an
256    /// f32 vector of the active dimensionality.
257    pub fn embed_passage(&self, text: &str) -> Result<Vec<f32>, AppError> {
258        self.invoke_with_prefix(crate::constants::PASSAGE_PREFIX, text)
259    }
260
261    /// Embeds a single query. The LLM uses a different prompt prefix
262    /// to disambiguate query from passage.
263    pub fn embed_query(&self, text: &str) -> Result<Vec<f32>, AppError> {
264        self.invoke_with_prefix(crate::constants::QUERY_PREFIX, text)
265    }
266
267    /// G56: returns a stable label for the active embedding model so the
268    /// in-process entity-embedding cache can key by `(model, text)`.
269    /// Embeddings produced by different models are not interchangeable,
270    /// so a cache entry from one model must never satisfy a request
271    /// served by another.
272    pub fn model_label(&self) -> String {
273        format!("{}:{}", self.flavour.as_str(), self.model)
274    }
275
276    /// G42/S2: embeds a batch of `(global_index, text)` pairs in ONE
277    /// LLM call. Returns `(global_index, vector)` pairs. Async — this
278    /// is the unit of work scheduled by the bounded fan-out in
279    /// `crate::embedder`.
280    ///
281    /// Cancel safety: the future owns its subprocess via
282    /// `kill_on_drop(true)`, so dropping it (e.g. losing a
283    /// `tokio::select!` race against a cancellation token) kills the
284    /// child and leaks nothing.
285    pub async fn embed_batch_async(
286        &self,
287        prefix: &str,
288        batch: &[(usize, String)],
289    ) -> Result<Vec<(usize, Vec<f32>)>, AppError> {
290        let dim = crate::constants::embedding_dim();
291        if batch.is_empty() {
292            return Ok(Vec::new());
293        }
294        if batch.len() == 1 {
295            let (idx, text) = (&batch[0].0, &batch[0].1);
296            let v = self.invoke_single_async(prefix, text, dim).await?;
297            return Ok(vec![(*idx, v)]);
298        }
299
300        let mut prompt = format!(
301            "Generate {dim}-dimensional semantic embedding vectors for each numbered text below.\n\
302             Return a JSON object with an \"items\" array containing EXACTLY {n} items.\n\
303             Each item has \"i\" (the 1-based index) and \"v\" (the {dim}-float vector, values between -1 and 1).\n\n",
304            n = batch.len()
305        );
306        for (pos, (_, text)) in batch.iter().enumerate() {
307            prompt.push_str(&format!("{}: {prefix}{text}\n", pos + 1));
308        }
309
310        let stdout = match self.flavour {
311            EmbeddingFlavour::Claude => {
312                self.invoke_claude(&prompt, &build_batch_schema(dim))
313                    .await?
314            }
315            EmbeddingFlavour::Codex => {
316                let schema = self.codex_schema_file(dim, true)?;
317                self.invoke_codex(&prompt, schema.path()).await?
318            }
319        };
320
321        let parsed: BatchEmbeddingResponse = parse_llm_json(&stdout).map_err(|e| {
322            AppError::Embedding(format!(
323                "LLM batch embedding response parse failed: {e}; raw={stdout}"
324            ))
325        })?;
326        if parsed.items.len() != batch.len() {
327            return Err(AppError::Embedding(format!(
328                "LLM batch returned {} items, expected {} (G42/S2 coverage check)",
329                parsed.items.len(),
330                batch.len()
331            )));
332        }
333        let mut out: Vec<Option<Vec<f32>>> = vec![None; batch.len()];
334        for item in parsed.items {
335            if item.i == 0 || item.i > batch.len() {
336                return Err(AppError::Embedding(format!(
337                    "LLM batch item index {} out of range 1..={}",
338                    item.i,
339                    batch.len()
340                )));
341            }
342            if item.v.len() != dim {
343                return Err(AppError::Embedding(format!(
344                    "LLM batch item {} returned {} dims, expected {dim}; \
345                     refusing to truncate or pad silently (G42/C5)",
346                    item.i,
347                    item.v.len()
348                )));
349            }
350            out[item.i - 1] = Some(item.v);
351        }
352        let mut result = Vec::with_capacity(batch.len());
353        for (pos, slot) in out.into_iter().enumerate() {
354            let v = slot.ok_or_else(|| {
355                AppError::Embedding(format!(
356                    "LLM batch response is missing item index {} (G42/S2 coverage check)",
357                    pos + 1
358                ))
359            })?;
360            result.push((batch[pos].0, v));
361        }
362        Ok(result)
363    }
364
365    fn invoke_with_prefix(&self, prefix: &str, text: &str) -> Result<Vec<f32>, AppError> {
366        let dim = crate::constants::embedding_dim();
367        let inner = self.invoke_single_async(prefix, text, dim);
368        // v1.0.79 (G42/A2): reuse the process-wide multi-thread runtime
369        // instead of building a current-thread runtime PER CALL. Inside
370        // an existing runtime (tests, async commands) block_in_place
371        // keeps the worker pool healthy.
372        match tokio::runtime::Handle::try_current() {
373            Ok(handle) => tokio::task::block_in_place(|| handle.block_on(inner)),
374            Err(_) => crate::embedder::shared_runtime()?.block_on(inner),
375        }
376    }
377
378    async fn invoke_single_async(
379        &self,
380        prefix: &str,
381        text: &str,
382        dim: usize,
383    ) -> Result<Vec<f32>, AppError> {
384        let prompt = format!("{prefix}{text}");
385        let stdout = match self.flavour {
386            EmbeddingFlavour::Claude => {
387                self.invoke_claude(&prompt, &build_single_schema(dim))
388                    .await?
389            }
390            EmbeddingFlavour::Codex => {
391                let schema = self.codex_schema_file(dim, false)?;
392                self.invoke_codex(&prompt, schema.path()).await?
393            }
394        };
395        let parsed: EmbeddingResponse = parse_llm_json(&stdout).map_err(|e| {
396            AppError::Embedding(format!(
397                "LLM embedding response parse failed: {e}; raw={stdout}"
398            ))
399        })?;
400        if parsed.embedding.len() != dim {
401            return Err(AppError::Embedding(format!(
402                "LLM returned {} dims, expected {dim}; \
403                 refusing to truncate or pad silently (G42/C5)",
404                parsed.embedding.len()
405            )));
406        }
407        Ok(parsed.embedding)
408    }
409
410    /// G42/S4: returns the lazily-created, process-shared codex schema
411    /// tempfile for the requested mode. `NamedTempFile` randomises the
412    /// filename (no PID-based collisions) and removes the file on drop
413    /// of the last `Arc` clone.
414    fn codex_schema_file(
415        &self,
416        dim: usize,
417        batch: bool,
418    ) -> Result<Arc<tempfile::NamedTempFile>, AppError> {
419        let mut guard = self.codex_schemas.lock();
420        let slot = if batch {
421            &mut guard.batch
422        } else {
423            &mut guard.single
424        };
425        if let Some((cached_dim, file)) = slot {
426            if *cached_dim == dim {
427                return Ok(Arc::clone(file));
428            }
429        }
430        let content = if batch {
431            build_batch_schema(dim)
432        } else {
433            build_single_schema(dim)
434        };
435        let file = tempfile::Builder::new()
436            .prefix("sqlite-graphrag-embed-schema-")
437            .suffix(".json")
438            .tempfile()
439            .map_err(|e| AppError::Embedding(format!("schema tempfile create failed: {e}")))?;
440        std::fs::write(file.path(), content)
441            .map_err(|e| AppError::Embedding(format!("schema tempfile write failed: {e}")))?;
442        let file = Arc::new(file);
443        *slot = Some((dim, Arc::clone(&file)));
444        Ok(file)
445    }
446
447    async fn invoke_claude(&self, prompt: &str, schema: &str) -> Result<String, AppError> {
448        // v1.0.69 hardening: --strict-mcp-config --mcp-config '{}' --settings
449        // '{"hooks":{}}' --dangerously-skip-permissions.
450        //
451        // v1.0.76 hardening: Claude Code 2.1+ renamed --output-schema to
452        // --json-schema and accepts the schema as an inline JSON string
453        // (NOT a file path). Also pass --output-format json so the
454        // response is a single JSON object on stdout.
455        //
456        // v1.0.79 (G42/S6): CLAUDE_CONFIG_DIR points at an empty managed
457        // directory BY DEFAULT — the MCP-isolation flags above are
458        // silently ignored upstream (anthropics/claude-code#10787) and a
459        // populated ~/.claude costs ~223k cache-creation tokens per call.
460        let mut cmd = Command::new(&self.binary);
461        cmd.arg("-p")
462            .arg(prompt)
463            .arg("--model")
464            .arg(&self.model)
465            .arg("--json-schema")
466            .arg(schema)
467            .arg("--output-format")
468            .arg("json")
469            .arg("--strict-mcp-config")
470            .arg("--mcp-config")
471            .arg(r#"{"mcpServers":{}}"#)
472            .arg("--settings")
473            .arg(r#"{"hooks":{}}"#)
474            .arg("--dangerously-skip-permissions")
475            .env_clear()
476            .env("PATH", std::env::var("PATH").unwrap_or_default())
477            .env("HOME", std::env::var("HOME").unwrap_or_default())
478            .stdin(Stdio::null())
479            .stdout(Stdio::piped())
480            .stderr(Stdio::piped())
481            // BLOCO 4: cancellation (dropped future) must kill the child.
482            .kill_on_drop(true);
483        if let Some(config_dir) = claude_embedding_config_dir() {
484            cmd.env("CLAUDE_CONFIG_DIR", &config_dir);
485        }
486        let binary_str = self.binary.to_string_lossy().into_owned();
487        let output = match tokio::time::timeout(embed_timeout(), cmd.output()).await {
488            Err(_elapsed) => {
489                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
490                    &crate::llm::exit_code_hints::LlmBackendError::Timeout {
491                        secs: embed_timeout().as_secs(),
492                        binary: binary_str.clone(),
493                    },
494                ));
495            }
496            Ok(Err(e)) => {
497                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
498                    &crate::llm::exit_code_hints::LlmBackendError::SpawnFailed {
499                        binary: binary_str.clone(),
500                        source: e.to_string(),
501                    },
502                ));
503            }
504            Ok(Ok(o)) => o,
505        };
506        if !output.status.success() {
507            let (exit_code, signal) = if let Some(code) = output.status.code() {
508                (Some(code), None)
509            } else {
510                use std::os::unix::process::ExitStatusExt;
511                (None, output.status.signal())
512            };
513            let stdout_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
514                &output.stdout,
515                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
516            );
517            let stderr_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
518                &output.stderr,
519                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
520            );
521            let hint = crate::llm::exit_code_hints::diagnose_exit_code(exit_code, signal);
522            return Err(crate::llm::exit_code_hints::into_legacy_embedding(
523                &crate::llm::exit_code_hints::LlmBackendError::NonZeroExit {
524                    exit_code,
525                    signal,
526                    stdout_tail,
527                    stderr_tail,
528                    binary: binary_str,
529                    hint,
530                },
531            ));
532        }
533        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
534    }
535
536    async fn invoke_codex(
537        &self,
538        prompt: &str,
539        schema_path: &std::path::Path,
540    ) -> Result<String, AppError> {
541        let binary_str = self.binary.to_string_lossy().into_owned();
542        let mut child =
543            match build_codex_embedding_command(&self.binary, &self.model, schema_path).spawn() {
544                Ok(c) => c,
545                Err(e) => {
546                    return Err(crate::llm::exit_code_hints::into_legacy_embedding(
547                        &crate::llm::exit_code_hints::LlmBackendError::SpawnFailed {
548                            binary: binary_str,
549                            source: e.to_string(),
550                        },
551                    ));
552                }
553            };
554        if let Some(mut stdin) = child.stdin.take() {
555            stdin
556                .write_all(prompt.as_bytes())
557                .await
558                .map_err(|e| AppError::Embedding(format!("codex stdin write failed: {e}")))?;
559        }
560        let output = match tokio::time::timeout(embed_timeout(), child.wait_with_output()).await {
561            Err(_elapsed) => {
562                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
563                    &crate::llm::exit_code_hints::LlmBackendError::Timeout {
564                        secs: embed_timeout().as_secs(),
565                        binary: binary_str,
566                    },
567                ));
568            }
569            Ok(Err(e)) => {
570                return Err(crate::llm::exit_code_hints::into_legacy_embedding(
571                    &crate::llm::exit_code_hints::LlmBackendError::SpawnFailed {
572                        binary: binary_str,
573                        source: format!("codex wait failed: {e}"),
574                    },
575                ));
576            }
577            Ok(Ok(o)) => o,
578        };
579        if !output.status.success() {
580            let (exit_code, signal) = if let Some(code) = output.status.code() {
581                (Some(code), None)
582            } else {
583                use std::os::unix::process::ExitStatusExt;
584                (None, output.status.signal())
585            };
586            let stdout_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
587                &output.stdout,
588                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
589            );
590            let stderr_tail = crate::llm::exit_code_hints::LlmBackendError::truncate_tail(
591                &output.stderr,
592                crate::llm::exit_code_hints::DIAG_TAIL_BYTES,
593            );
594            let hint = crate::llm::exit_code_hints::diagnose_exit_code(exit_code, signal);
595            // G42/S7: the headless spawn can still hit interactive
596            // prompts on some codex builds; keep the legacy request_user_input
597            // branch as a special-case hint, and stamp the diagnostic
598            // tail on top of the canonical NonZeroExit envelope.
599            let mut combined_hint = hint;
600            if stderr_tail.contains("request_user_input") {
601                combined_hint.push_str(
602                    " | codex requested interactive input in a headless embedding call; \
603                     upgrade codex (>= 0.134) or switch the embedding backend to claude",
604                );
605            }
606            return Err(crate::llm::exit_code_hints::into_legacy_embedding(
607                &crate::llm::exit_code_hints::LlmBackendError::NonZeroExit {
608                    exit_code,
609                    signal,
610                    stdout_tail,
611                    stderr_tail,
612                    binary: binary_str,
613                    hint: combined_hint,
614                },
615            ));
616        }
617        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
618    }
619}
620
621/// G42/S6: resolves the empty `CLAUDE_CONFIG_DIR` used for embedding
622/// subprocesses.
623///
624/// - `SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR` is honoured when set and
625///   pointing at a directory (same contract as G28-A in claude_runner);
626/// - otherwise a managed directory is created at
627///   `~/.local/state/sqlite-graphrag/claude-empty-config` (mode 0700).
628///   If `~/.claude/.credentials.json` exists (Linux OAuth storage) it is
629///   copied in so authentication still works; on macOS credentials live
630///   in the Keychain and the empty dir is sufficient.
631///
632/// Returns `None` only when HOME is unset AND no override is given —
633/// in that case the subprocess falls back to claude's own default.
634fn claude_embedding_config_dir() -> Option<std::path::PathBuf> {
635    if let Ok(dir) = std::env::var("SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR") {
636        let path = std::path::PathBuf::from(dir);
637        if path.is_dir() {
638            return Some(path);
639        }
640        tracing::warn!(
641            target: "embedding",
642            path = %path.display(),
643            "SQLITE_GRAPHRAG_CLAUDE_EMPTY_CONFIG_DIR is set but not a directory; \
644             falling back to the managed empty config dir"
645        );
646    }
647    let home = std::env::var("HOME").ok()?;
648    let dir = std::path::Path::new(&home)
649        .join(".local/state/sqlite-graphrag")
650        .join("claude-empty-config");
651    if std::fs::create_dir_all(&dir).is_err() {
652        return None;
653    }
654    #[cfg(unix)]
655    {
656        use std::os::unix::fs::PermissionsExt;
657        let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700));
658    }
659    // Linux stores OAuth credentials on disk; copy them so the isolated
660    // config dir still authenticates. Best-effort: macOS uses Keychain.
661    let creds = std::path::Path::new(&home).join(".claude/.credentials.json");
662    if creds.exists() {
663        let target = dir.join(".credentials.json");
664        if !target.exists() {
665            let _ = std::fs::copy(&creds, &target);
666        }
667    }
668    Some(dir)
669}
670
671fn build_codex_embedding_command(
672    binary: &std::path::Path,
673    model: &str,
674    schema_path: &std::path::Path,
675) -> Command {
676    let mut cmd = Command::new(binary);
677    // v1.0.77: `-c` TOML overrides bypass the codex exec --sandbox propagation
678    // bug (openai/codex#18113). CLI flags alone are insufficient — the exec
679    // subcommand may not inherit --sandbox from the parent codex command.
680    cmd.arg("exec")
681        .arg("-c")
682        .arg("sandbox_mode='read-only'")
683        .arg("-c")
684        .arg("approval_policy='never'")
685        .arg("--json")
686        .arg("--output-schema")
687        .arg(schema_path)
688        .arg("--ephemeral")
689        .arg("--skip-git-repo-check")
690        .arg("--sandbox")
691        .arg("read-only")
692        .arg("--ignore-user-config")
693        .arg("--ignore-rules");
694    if crate::extract::codex_compat::codex_supports_ask_for_approval() {
695        cmd.arg("--ask-for-approval").arg("never");
696    }
697    // v1.0.77: isolate codex from user config by pointing CODEX_HOME at a
698    // minimal directory containing only auth.json (OAuth credentials).
699    let codex_home = prepare_isolated_codex_home();
700    cmd.arg("--model")
701        .arg(model)
702        .arg("-")
703        .env_clear()
704        .env("PATH", std::env::var("PATH").unwrap_or_default())
705        .env("HOME", std::env::var("HOME").unwrap_or_default());
706    if let Some(ref ch) = codex_home {
707        cmd.env("CODEX_HOME", ch);
708    }
709    cmd.stdin(Stdio::piped())
710        .stdout(Stdio::piped())
711        .stderr(Stdio::piped())
712        // BLOCO 4: cancellation (dropped future) must kill the child.
713        .kill_on_drop(true);
714    cmd
715}
716
/// Prepares a minimal per-process `CODEX_HOME` containing only a copy of
/// the user's `auth.json`.
///
/// The directory is `~/.local/share/sqlite-graphrag/codex-home-<pid>`
/// (mode 0700 on unix, since it holds OAuth credentials). `auth.json` is
/// copied when it is missing or when the source in `~/.codex` has a newer
/// modification time — a directory left behind by an earlier process with
/// the same PID therefore cannot serve stale credentials.
///
/// Returns `None` — meaning "do not set `CODEX_HOME`" — when HOME is
/// unset, `~/.codex/auth.json` does not exist, or the isolated directory
/// or its `auth.json` cannot be created. Returning `None` on failure
/// avoids pointing codex at an empty or missing home.
fn prepare_isolated_codex_home() -> Option<std::path::PathBuf> {
    let home = std::env::var("HOME").ok()?;
    let real_auth = std::path::Path::new(&home).join(".codex/auth.json");
    if !real_auth.exists() {
        return None;
    }
    let base = std::path::Path::new(&home).join(".local/share/sqlite-graphrag");
    let isolated = base.join(format!("codex-home-{}", std::process::id()));
    std::fs::create_dir_all(&isolated).ok()?;
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        // Best-effort tightening; the copy below is what must succeed.
        let _ = std::fs::set_permissions(&isolated, std::fs::Permissions::from_mode(0o700));
    }
    let target = isolated.join("auth.json");
    let modified =
        |p: &std::path::Path| std::fs::metadata(p).and_then(|meta| meta.modified()).ok();
    // Stale only when both timestamps are known and the source is newer.
    let stale = match (modified(&real_auth), modified(&target)) {
        (Some(source_time), Some(copy_time)) => source_time > copy_time,
        _ => false,
    };
    if !target.exists() || stale {
        std::fs::copy(&real_auth, &target).ok()?;
    }
    Some(isolated)
}
732
733/// Parse an LLM JSON response of type `T`. The two backends emit
734/// different shapes:
735/// - Claude (with `--output-format json`): single JSON object on stdout.
736/// - Codex (with `--json`): JSONL stream with one event per line; the
737///   `agent_message` event's `text` field is the JSON payload.
738///
739/// This helper accepts both shapes and returns the parsed value (or an
740/// error describing the first mismatch).
741fn parse_llm_json<T: serde::de::DeserializeOwned>(stdout: &str) -> Result<T, String> {
742    // Strategy 1: try the whole stdout as JSON (Claude path).
743    if let Ok(parsed) = serde_json::from_str::<T>(stdout) {
744        return Ok(parsed);
745    }
746    // Strategy 2: walk the JSONL line by line and pick the last
747    // `item.completed` of type `agent_message` (Codex path).
748    let mut last_agent_text: Option<String> = None;
749    for line in stdout.lines() {
750        let line = line.trim();
751        if line.is_empty() {
752            continue;
753        }
754        let Ok(event) = serde_json::from_str::<serde_json::Value>(line) else {
755            continue;
756        };
757        if event.get("type").and_then(|t| t.as_str()) != Some("item.completed") {
758            continue;
759        }
760        let item = match event.get("item") {
761            Some(i) => i,
762            None => continue,
763        };
764        if item.get("type").and_then(|t| t.as_str()) != Some("agent_message") {
765            continue;
766        }
767        if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
768            last_agent_text = Some(text.to_string());
769        }
770    }
771    let text = last_agent_text
772        .ok_or_else(|| "no agent_message found in codex JSONL output".to_string())?;
773    serde_json::from_str::<T>(&text)
774        .map_err(|e| format!("codex agent_message text does not match schema: {e}; raw={text}"))
775}
776
#[cfg(test)]
mod tests {
    use super::*;

    /// Sets an environment variable for the lifetime of the guard and
    /// removes it on drop.
    ///
    /// Cleanup happens during unwinding too, so a failing assertion cannot
    /// leak the variable into the next test of the `serial(env)` group.
    struct EnvVarGuard(&'static str);

    impl EnvVarGuard {
        fn set(key: &'static str, value: &str) -> Self {
            // SAFETY: only used by tests in the `serial(env)` group, which
            // prevents cross-test interference on the process environment.
            unsafe {
                std::env::set_var(key, value);
            }
            Self(key)
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            // SAFETY: see `EnvVarGuard::set`.
            unsafe {
                std::env::remove_var(self.0);
            }
        }
    }

    /// Build a client directly from its fields, bypassing the constructor.
    fn test_client(flavour: EmbeddingFlavour, binary: std::path::PathBuf) -> LlmEmbedding {
        LlmEmbedding {
            flavour,
            binary,
            model: "gpt-5.4".to_string(),
            codex_schemas: Arc::new(parking_lot::Mutex::new(CodexSchemaFiles::default())),
        }
    }

    /// Either API key in the environment must be rejected; with both
    /// removed the check passes.
    #[test]
    #[serial_test::serial(env)]
    fn oauth_only_enforce_blocks_api_keys() {
        for key in ["ANTHROPIC_API_KEY", "OPENAI_API_KEY"] {
            // The guard removes the key at the end of each iteration, even
            // when the assertion below panics.
            let _guard = EnvVarGuard::set(key, "test");
            assert!(LlmEmbedding::oauth_only_enforce().is_err());
        }
        assert!(LlmEmbedding::oauth_only_enforce().is_ok());
    }

    #[test]
    fn flavour_as_str_is_stable() {
        assert_eq!(EmbeddingFlavour::Claude.as_str(), "claude");
        assert_eq!(EmbeddingFlavour::Codex.as_str(), "codex");
    }

    /// The single-text schema pins the embedding array to the given length.
    #[test]
    fn single_schema_embeds_active_dim() {
        let schema = build_single_schema(64);
        assert!(schema.contains(r#""minItems":64"#));
        assert!(schema.contains(r#""maxItems":64"#));
        let parsed: serde_json::Value =
            serde_json::from_str(&schema).expect("single schema must be valid JSON");
        assert_eq!(parsed["properties"]["embedding"]["minItems"], 64);
    }

    #[test]
    fn batch_schema_is_valid_json_and_unbounded_items() {
        let schema = build_batch_schema(64);
        let parsed: serde_json::Value =
            serde_json::from_str(&schema).expect("batch schema must be valid JSON");
        // The items array must NOT constrain its length so one schema
        // file serves every batch size (G42/S4).
        assert!(parsed["properties"]["items"].get("minItems").is_none());
        assert_eq!(
            parsed["properties"]["items"]["items"]["properties"]["v"]["minItems"],
            64
        );
    }

    #[test]
    fn parse_llm_json_accepts_claude_json() {
        let stdout = r#"{"embedding":[0.0,1.0,2.0]}"#;

        let parsed: EmbeddingResponse = parse_llm_json(stdout).expect("claude JSON must parse");

        assert_eq!(parsed.embedding, vec![0.0, 1.0, 2.0]);
    }

    #[test]
    fn parse_llm_json_accepts_codex_jsonl() {
        let stdout = r#"{"type":"thread.started","thread_id":"mock-thread-0"}
{"type":"item.completed","item":{"type":"agent_message","text":"{\"embedding\":[0.0,1.0,2.0]}"}}
{"type":"turn.completed","usage":{"input_tokens":1,"output_tokens":1}}"#;

        let parsed: EmbeddingResponse = parse_llm_json(stdout).expect("codex JSONL must parse");

        assert_eq!(parsed.embedding, vec![0.0, 1.0, 2.0]);
    }

    #[test]
    fn parse_llm_json_rejects_jsonl_without_agent_message() {
        let stdout = r#"{"type":"thread.started","thread_id":"mock-thread-0"}"#;

        let err = parse_llm_json::<EmbeddingResponse>(stdout)
            .expect_err("missing agent_message must fail");

        assert!(err.contains("no agent_message"));
    }

    #[test]
    fn parse_llm_json_accepts_batch_response() {
        let stdout = r#"{"items":[{"i":1,"v":[0.0,1.0]},{"i":2,"v":[2.0,3.0]}]}"#;

        let parsed: BatchEmbeddingResponse = parse_llm_json(stdout).expect("batch JSON must parse");

        assert_eq!(parsed.items.len(), 2);
        assert_eq!(parsed.items[0].i, 1);
        assert_eq!(parsed.items[1].v, vec![2.0, 3.0]);
    }

    /// Requesting the same (dim, kind) twice yields the same file; the
    /// single and batch schemas live in different files.
    #[test]
    fn codex_schema_file_is_created_once_and_reused() {
        let client = test_client(
            EmbeddingFlavour::Codex,
            std::path::PathBuf::from("/bin/true"),
        );
        let first = client
            .codex_schema_file(64, false)
            .expect("schema file must be created");
        let second = client
            .codex_schema_file(64, false)
            .expect("schema file must be reused");
        assert_eq!(first.path(), second.path(), "same dim must reuse the file");

        let batch = client
            .codex_schema_file(64, true)
            .expect("batch schema file must be created");
        assert_ne!(
            first.path(),
            batch.path(),
            "single and batch schemas are distinct files"
        );

        let content = std::fs::read_to_string(first.path()).expect("schema file must be readable");
        assert!(content.contains(r#""minItems":64"#));
    }

    /// The codex command line must carry `-` (prompt on stdin), must not
    /// carry the prompt text itself, and must include every hardening flag.
    #[test]
    fn codex_embedding_command_reads_prompt_from_stdin() {
        let schema_path = std::env::temp_dir().join("sqlite-graphrag-embed-schema-test.json");
        let cmd = build_codex_embedding_command(
            std::path::Path::new("/bin/true"),
            "gpt-5.4",
            &schema_path,
        );
        let argv: Vec<String> = cmd
            .as_std()
            .get_args()
            .filter_map(|arg| arg.to_str().map(str::to_owned))
            .collect();

        assert!(
            argv.iter().any(|arg| arg == "-"),
            "codex embedding command must read prompt from stdin: {argv:?}"
        );
        assert!(
            !argv.iter().any(|arg| arg.starts_with("passage: ")),
            "prompt text must not be passed as argv: {argv:?}"
        );
        for required in &[
            "exec",
            "-c",
            "sandbox_mode='read-only'",
            "approval_policy='never'",
            "--json",
            "--output-schema",
            "--ephemeral",
            "--skip-git-repo-check",
            "--sandbox",
            "read-only",
            "--ignore-user-config",
            "--ignore-rules",
            "--model",
            "gpt-5.4",
        ] {
            assert!(
                argv.iter().any(|arg| arg == required),
                "missing flag {required} in {argv:?}"
            );
        }
    }

    /// End-to-end check against a mock `codex` shell script: the script
    /// exits 41 unless stdin is exactly `passage: codex-cli`, and otherwise
    /// prints a 64-zero embedding wrapped in an `agent_message` event.
    #[cfg(unix)]
    #[test]
    #[serial_test::serial(env)]
    fn embed_passage_sends_prompt_to_codex_stdin() {
        use std::os::unix::fs::PermissionsExt;

        // Pin the dimensionality so the mock script and the validation
        // agree regardless of test execution order. The guard removes the
        // variable again even if an `expect` below panics.
        let _dim = EnvVarGuard::set("SQLITE_GRAPHRAG_EMBEDDING_DIM", "64");

        let temp = tempfile::tempdir().expect("tempdir must exist");
        let binary = temp.path().join("codex-stdin-check");
        let script = r#"#!/usr/bin/env bash
set -euo pipefail

prompt="$(cat)"
if [[ "$prompt" != "passage: codex-cli" ]]; then
  echo "unexpected stdin: $prompt" >&2
  exit 41
fi

vals="0.0"
for _ in $(seq 2 64); do
  vals="$vals,0.0"
done
payload="{\"embedding\":[$vals]}"
escaped="${payload//\"/\\\"}"
echo "{\"type\":\"item.completed\",\"item\":{\"type\":\"agent_message\",\"text\":\"$escaped\"}}"
"#;
        std::fs::write(&binary, script).expect("mock codex script must be written");
        let mut perms = std::fs::metadata(&binary)
            .expect("mock codex metadata must exist")
            .permissions();
        perms.set_mode(0o755);
        std::fs::set_permissions(&binary, perms).expect("mock codex must be executable");

        let embedding = test_client(EmbeddingFlavour::Codex, binary);

        let vector = embedding
            .embed_passage("codex-cli")
            .expect("stdin-backed codex embedding must succeed");

        assert_eq!(vector.len(), 64);
        assert!(vector.iter().all(|value| *value == 0.0));
    }
}
sqlite_graphrag/extract/llm_embedding.rs

sqlite_graphrag/extract/
llm_embedding.rs