Skip to main content

sqlite_graphrag/extract/
llm_embedding.rs

1//! LLM-based embedding backend (v1.0.76 default).
2//!
3//! `LlmEmbedding` is the production embedding client. It wraps a single
4//! headless invocation of `claude code` or `codex` and returns a 384-dim
5//! f32 vector parsed from the LLM's JSONL response.
6//!
7//! The embedding model is the same `multilingual-e5-small` from before, but
8//! the call now goes through the LLM's tool-use protocol (no MCP, no hooks).
9//! This is the single reason the binary is now one-shot: there is no daemon
10//! to keep the model loaded, the LLM subprocess is spawned on demand and
11//! killed when the response is parsed.
12//!
13//! OAuth is the only supported credential path. The constructor rejects
14//! `ANTHROPIC_API_KEY` / `OPENAI_API_KEY` in the environment — see
15//! `v1.0.69 (G31) OAuth-Only Enforcement`.
16
17use crate::errors::AppError;
18use serde::Deserialize;
19use std::process::Stdio;
20use tokio::io::AsyncWriteExt;
21use tokio::process::Command;
22
/// Number of `f32` components in every embedding vector this module
/// returns. It equals the output width of the earlier
/// `multilingual-e5-small` model and the size of the
/// `memory_embeddings.embedding` BLOB column.
pub const EMBEDDING_DIM: usize = 384;
27
28#[derive(Clone, Debug)]
29pub struct LlmEmbedding {
30    /// Which LLM headless binary to spawn. `claude` or `codex`.
31    flavour: EmbeddingFlavour,
32    /// Cached path to the binary to avoid PATH lookups on every call.
33    binary: std::path::PathBuf,
34    /// Optional model name passed via `--model`. Defaults are pinned.
35    model: String,
36}
37
38#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize)]
39pub enum EmbeddingFlavour {
40    Claude,
41    Codex,
42}
43
44impl EmbeddingFlavour {
45    pub fn as_str(self) -> &'static str {
46        match self {
47            Self::Claude => "claude",
48            Self::Codex => "codex",
49        }
50    }
51}
52
53#[derive(Debug, Deserialize)]
54struct EmbeddingResponse {
55    embedding: Vec<f32>,
56}
57
58/// Follows symlinks and shell-script shim `exec` targets to find
59/// the real ELF binary. Shim wrappers (like `~/.graphrag-shim/codex`)
60/// can strip hardening flags; bypassing them is a security requirement.
61pub fn resolve_real_binary(path: &std::path::Path) -> std::path::PathBuf {
62    if let Ok(canonical) = std::fs::canonicalize(path) {
63        if is_elf_binary(&canonical) {
64            return canonical;
65        }
66        if let Some(exec_target) = extract_exec_target_from_shim(&canonical) {
67            if exec_target.exists() && is_elf_binary(&exec_target) {
68                return exec_target;
69            }
70        }
71        return canonical;
72    }
73    path.to_path_buf()
74}
75
/// Reports whether the file at `path` starts with the 4-byte ELF magic
/// number (`0x7f 'E' 'L' 'F'`).
///
/// Only the first four bytes are read, so checking a large executable does
/// not load the whole file into memory. Returns `false` when the file
/// cannot be opened, cannot be read, or is shorter than four bytes.
fn is_elf_binary(path: &std::path::Path) -> bool {
    const ELF_MAGIC: [u8; 4] = [0x7f, b'E', b'L', b'F'];

    let mut file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return false,
    };
    let mut header = [0u8; 4];
    // `read_exact` fails on short files and on directories, both of which
    // must be reported as "not ELF".
    std::io::Read::read_exact(&mut file, &mut header).is_ok() && header == ELF_MAGIC
}
81
/// Extracts the program named by the last `exec …` line of a shell-script
/// shim.
///
/// Returns `None` when the file cannot be read as UTF-8 text, does not
/// start with a `#!` shebang, has no line beginning with `exec `, or names
/// an empty target.
///
/// The target may be bare (`exec /usr/bin/codex "$@"`) or wrapped in
/// matching single or double quotes (`exec "/opt/my tools/codex" "$@"`).
/// Surrounding quotes are removed and whitespace inside them is kept, so
/// the returned path is usable as-is. No shell expansion is performed: a
/// target such as `"$REAL_BIN"` is returned literally as `$REAL_BIN`.
fn extract_exec_target_from_shim(path: &std::path::Path) -> Option<std::path::PathBuf> {
    let content = std::fs::read_to_string(path).ok()?;
    if !content.starts_with("#!") {
        return None;
    }
    // Scan bottom-up: the final `exec` is the one that hands over control.
    let command = content
        .lines()
        .rev()
        .find_map(|line| line.trim().strip_prefix("exec "))?
        .trim_start();
    let target = match command.chars().next()? {
        quote @ ('"' | '\'') => {
            // The quote is a single ASCII byte, so slicing at 1 is safe.
            let body = &command[1..];
            match body.find(quote) {
                Some(end) => &body[..end],
                // Unterminated quote: fall back to the raw first token.
                None => command.split_whitespace().next()?,
            }
        }
        _ => command.split_whitespace().next()?,
    };
    if target.is_empty() {
        return None;
    }
    Some(std::path::PathBuf::from(target))
}
97
98impl LlmEmbedding {
99    /// Detects which LLM CLI is available on PATH and returns the
100    /// matching embedding client.
101    ///
102    /// v1.0.76: PREFERS `codex` over `claude` because:
103    /// - Claude Code 2.1+ ships a 180k+ token system context (plugins,
104    ///   skills, agents, MCP) that overflows the 200k context window
105    ///   for even trivial embedding prompts and returns "Prompt is too
106    ///   long".
107    /// - Codex 0.134+ is lightweight (~5k system context) and the
108    ///   `StructuredOutput` tool reliably returns 384-dim vectors.
109    pub fn detect_available() -> Result<Self, AppError> {
110        Self::oauth_only_enforce()?;
111
112        if let Ok(path) = which::which("codex") {
113            return Ok(Self {
114                flavour: EmbeddingFlavour::Codex,
115                binary: resolve_real_binary(&path),
116                model: "gpt-5.4".to_string(),
117            });
118        }
119        if let Ok(path) = which::which("claude") {
120            return Ok(Self {
121                flavour: EmbeddingFlavour::Claude,
122                binary: resolve_real_binary(&path),
123                model: "claude-sonnet-4-6".to_string(),
124            });
125        }
126        Err(AppError::Embedding(
127            "no LLM CLI found on PATH: install `codex` (0.130+) or `claude` (Claude Code 2.1+)"
128                .to_string(),
129        ))
130    }
131
132    pub fn with_codex() -> Result<Self, AppError> {
133        Self::oauth_only_enforce()?;
134        let path = which::which("codex")
135            .map_err(|_| AppError::Embedding("`codex` not found on PATH".to_string()))?;
136        Ok(Self {
137            flavour: EmbeddingFlavour::Codex,
138            binary: resolve_real_binary(&path),
139            model: "gpt-5.4".to_string(),
140        })
141    }
142
143    pub fn with_claude() -> Result<Self, AppError> {
144        Self::oauth_only_enforce()?;
145        let path = which::which("claude")
146            .map_err(|_| AppError::Embedding("`claude` not found on PATH".to_string()))?;
147        Ok(Self {
148            flavour: EmbeddingFlavour::Claude,
149            binary: resolve_real_binary(&path),
150            model: "claude-sonnet-4-6".to_string(),
151        })
152    }
153
154    /// v1.0.69 (G31): refuse to spawn if an API key is set. The CLI
155    /// must use OAuth. The two API-key env vars are NOT in the
156    /// env-clear whitelist, so a parent process that exports them
157    /// will see this error.
158    fn oauth_only_enforce() -> Result<(), AppError> {
159        if std::env::var("ANTHROPIC_API_KEY").is_ok() {
160            return Err(AppError::Validation(
161                "ANTHROPIC_API_KEY is set; v1.0.76 requires OAuth. \
162                 unset it and use `claude login` instead."
163                    .into(),
164            ));
165        }
166        if std::env::var("OPENAI_API_KEY").is_ok() {
167            return Err(AppError::Validation(
168                "OPENAI_API_KEY is set; v1.0.76 requires OAuth. \
169                 unset it and use `codex login` instead."
170                    .into(),
171            ));
172        }
173        Ok(())
174    }
175
176    /// Embeds a single passage (chunk of a memory body). Returns a
177    /// 384-dim f32 vector suitable for cosine similarity.
178    pub fn embed_passage(&mut self, text: &str) -> Result<Vec<f32>, AppError> {
179        self.invoke_with_prefix(crate::constants::PASSAGE_PREFIX, text)
180    }
181
182    /// Embeds a single query. The LLM uses a different prompt prefix
183    /// to disambiguate query from passage.
184    pub fn embed_query(&mut self, text: &str) -> Result<Vec<f32>, AppError> {
185        self.invoke_with_prefix(crate::constants::QUERY_PREFIX, text)
186    }
187
188    fn invoke_with_prefix(&mut self, prefix: &str, text: &str) -> Result<Vec<f32>, AppError> {
189        // v1.0.76: tolerate being called from inside an existing tokio
190        // runtime (e.g. a test marked `#[tokio::test]`) by reusing the
191        // current Handle via block_in_place. When no runtime is in scope
192        // we build a one-shot current-thread runtime.
193        let prompt = format!("{prefix}{text}");
194        let inner = async {
195            match self.flavour {
196                EmbeddingFlavour::Claude => self.invoke_claude(&prompt).await,
197                EmbeddingFlavour::Codex => self.invoke_codex(&prompt).await,
198            }
199        };
200        let stdout: String = match tokio::runtime::Handle::try_current() {
201            Ok(handle) => tokio::task::block_in_place(|| handle.block_on(inner))?,
202            Err(_) => {
203                let rt = tokio::runtime::Builder::new_current_thread()
204                    .enable_all()
205                    .build()
206                    .map_err(|e| AppError::Embedding(format!("tokio runtime init failed: {e}")))?;
207                rt.block_on(inner)?
208            }
209        };
210
211        let parsed: EmbeddingResponse = parse_embedding_response(&stdout).map_err(|e| {
212            AppError::Embedding(format!(
213                "LLM embedding response parse failed: {e}; raw={stdout}"
214            ))
215        })?;
216        if parsed.embedding.len() != EMBEDDING_DIM {
217            return Err(AppError::Embedding(format!(
218                "LLM returned {} dims, expected {EMBEDDING_DIM}",
219                parsed.embedding.len()
220            )));
221        }
222        Ok(parsed.embedding)
223    }
224
225    async fn invoke_claude(&self, prompt: &str) -> Result<String, AppError> {
226        // v1.0.69 hardening: --strict-mcp-config --mcp-config '{}' --settings
227        // '{"hooks":{}}' --dangerously-skip-permissions.
228        //
229        // v1.0.76 hardening: Claude Code 2.1+ renamed --output-schema to
230        // --json-schema and accepts the schema as an inline JSON string
231        // (NOT a file path). Also pass --output-format json so the
232        // response is a single JSON object on stdout (the default text
233        // mode returns prose which fails the `embedding` field check).
234        const SCHEMA: &str = r#"{"type":"object","properties":{"embedding":{"type":"array","items":{"type":"number"},"minItems":384,"maxItems":384}},"required":["embedding"],"additionalProperties":false}"#;
235        let output = Command::new(&self.binary)
236            .arg("-p")
237            .arg(prompt)
238            .arg("--model")
239            .arg(&self.model)
240            .arg("--json-schema")
241            .arg(SCHEMA)
242            .arg("--output-format")
243            .arg("json")
244            .arg("--strict-mcp-config")
245            .arg("--mcp-config")
246            .arg(r#"{"mcpServers":{}}"#)
247            .arg("--settings")
248            .arg(r#"{"hooks":{}}"#)
249            .arg("--dangerously-skip-permissions")
250            .env_clear()
251            .env("PATH", std::env::var("PATH").unwrap_or_default())
252            .env("HOME", std::env::var("HOME").unwrap_or_default())
253            .stdin(Stdio::null())
254            .stdout(Stdio::piped())
255            .stderr(Stdio::piped())
256            .output()
257            .await
258            .map_err(|e| AppError::Embedding(format!("claude spawn failed: {e}")))?;
259        if !output.status.success() {
260            return Err(AppError::Embedding(format!(
261                "claude exited with {}: stderr={}",
262                output.status,
263                String::from_utf8_lossy(&output.stderr)
264            )));
265        }
266        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
267    }
268
269    async fn invoke_codex(&self, prompt: &str) -> Result<String, AppError> {
270        // v1.0.69 hardening: --json --output-schema --ephemeral --skip-git-repo-check
271        // --sandbox read-only --ignore-user-config --ignore-rules
272        //
273        // v1.0.76 hardening (G31 + G33 + codex 0.134+ compat):
274        // - --ask-for-approval removed in 0.134+ (Issue #26602) — gated
275        //   by the codex_compat helper
276        // - -c mcp_servers='{}' removed — value is parsed as string and
277        //   rejected ("expected a map"). --ignore-user-config already
278        //   covers the MCP isolation requirement.
279        // - --output-schema is a FILE PATH (not inline JSON like
280        //   claude's --json-schema). Write to a temp file in the
281        //   cache dir (matches the trusted-schema-path pattern used by
282        //   codex_spawn).
283        const SCHEMA: &str = r#"{"type":"object","properties":{"embedding":{"type":"array","items":{"type":"number"},"minItems":384,"maxItems":384}},"required":["embedding"],"additionalProperties":false}"#;
284        let schema_path = std::env::temp_dir().join(format!(
285            "sqlite-graphrag-embed-schema-{}.json",
286            std::process::id()
287        ));
288        std::fs::write(&schema_path, SCHEMA)
289            .map_err(|e| AppError::Embedding(format!("failed to write schema file: {e}")))?;
290        let mut child = build_codex_embedding_command(&self.binary, &self.model, &schema_path)
291            .spawn()
292            .map_err(|e| AppError::Embedding(format!("codex spawn failed: {e}")))?;
293        if let Some(mut stdin) = child.stdin.take() {
294            stdin
295                .write_all(prompt.as_bytes())
296                .await
297                .map_err(|e| AppError::Embedding(format!("codex stdin write failed: {e}")))?;
298        }
299        let output = child
300            .wait_with_output()
301            .await
302            .map_err(|e| AppError::Embedding(format!("codex wait failed: {e}")))?;
303        let _ = std::fs::remove_file(&schema_path);
304        if !output.status.success() {
305            return Err(AppError::Embedding(format!(
306                "codex exited with {}: stderr={}",
307                output.status,
308                String::from_utf8_lossy(&output.stderr)
309            )));
310        }
311        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
312    }
313}
314
315fn build_codex_embedding_command(
316    binary: &std::path::Path,
317    model: &str,
318    schema_path: &std::path::Path,
319) -> Command {
320    let mut cmd = Command::new(binary);
321    // v1.0.77: `-c` TOML overrides bypass the codex exec --sandbox propagation
322    // bug (openai/codex#18113). CLI flags alone are insufficient — the exec
323    // subcommand may not inherit --sandbox from the parent codex command.
324    cmd.arg("exec")
325        .arg("-c")
326        .arg("sandbox_mode='read-only'")
327        .arg("-c")
328        .arg("approval_policy='never'")
329        .arg("--json")
330        .arg("--output-schema")
331        .arg(schema_path)
332        .arg("--ephemeral")
333        .arg("--skip-git-repo-check")
334        .arg("--sandbox")
335        .arg("read-only")
336        .arg("--ignore-user-config")
337        .arg("--ignore-rules");
338    if crate::extract::codex_compat::codex_supports_ask_for_approval() {
339        cmd.arg("--ask-for-approval").arg("never");
340    }
341    // v1.0.77: isolate codex from user config by pointing CODEX_HOME at a
342    // minimal directory containing only auth.json (OAuth credentials).
343    let codex_home = prepare_isolated_codex_home();
344    cmd.arg("--model")
345        .arg(model)
346        .arg("-")
347        .env_clear()
348        .env("PATH", std::env::var("PATH").unwrap_or_default())
349        .env("HOME", std::env::var("HOME").unwrap_or_default());
350    if let Some(ref ch) = codex_home {
351        cmd.env("CODEX_HOME", ch);
352    }
353    cmd.stdin(Stdio::piped())
354        .stdout(Stdio::piped())
355        .stderr(Stdio::piped());
356    cmd
357}
358
/// Prepares a per-process directory that holds nothing but a copy of the
/// user's `~/.codex/auth.json`, for use as `CODEX_HOME`.
///
/// Returns `None` when `HOME` is unset (or not valid Unicode), when
/// `$HOME/.codex/auth.json` does not exist, or when the directory or the
/// copy cannot be created. In those cases the caller leaves `CODEX_HOME`
/// unset rather than pointing it at a directory without credentials.
///
/// The copy is refreshed whenever its bytes differ from the source, so a
/// leftover directory from an earlier process with the same pid, or a
/// token that was renewed after the first call, never leaves stale
/// credentials in place. An up-to-date copy is left untouched.
fn prepare_isolated_codex_home() -> Option<std::path::PathBuf> {
    let home = std::env::var("HOME").ok()?;
    let real_auth = std::path::Path::new(&home).join(".codex/auth.json");
    if !real_auth.exists() {
        return None;
    }
    let isolated =
        std::env::temp_dir().join(format!("sqlite-graphrag-codex-home-{}", std::process::id()));
    std::fs::create_dir_all(&isolated).ok()?;

    let target = isolated.join("auth.json");
    let current = std::fs::read(&real_auth).ok()?;
    let up_to_date = std::fs::read(&target)
        .map(|existing| existing == current)
        .unwrap_or(false);
    if !up_to_date {
        // `fs::copy` also carries over the source file's permission bits.
        std::fs::copy(&real_auth, &target).ok()?;
    }
    Some(isolated)
}
374
375/// Parse the LLM embedding response. The two backends emit different
376/// shapes:
377/// - Claude (with `--output-format json`): single JSON object on stdout.
378/// - Codex (with `--json`): JSONL stream with one event per line; the
379///   `agent_message` event's `text` field is the JSON payload.
380///
381/// This helper accepts both shapes and returns the parsed
382/// `EmbeddingResponse` (or an error describing the first mismatch).
383fn parse_embedding_response(stdout: &str) -> Result<EmbeddingResponse, String> {
384    // Strategy 1: try the whole stdout as JSON (Claude path).
385    if let Ok(parsed) = serde_json::from_str::<EmbeddingResponse>(stdout) {
386        return Ok(parsed);
387    }
388    // Strategy 2: walk the JSONL line by line and pick the last
389    // `item.completed` of type `agent_message` (Codex path).
390    let mut last_agent_text: Option<String> = None;
391    for line in stdout.lines() {
392        let line = line.trim();
393        if line.is_empty() {
394            continue;
395        }
396        let Ok(event) = serde_json::from_str::<serde_json::Value>(line) else {
397            continue;
398        };
399        if event.get("type").and_then(|t| t.as_str()) != Some("item.completed") {
400            continue;
401        }
402        let item = match event.get("item") {
403            Some(i) => i,
404            None => continue,
405        };
406        if item.get("type").and_then(|t| t.as_str()) != Some("agent_message") {
407            continue;
408        }
409        if let Some(text) = item.get("text").and_then(|t| t.as_str()) {
410            last_agent_text = Some(text.to_string());
411        }
412    }
413    let text = last_agent_text
414        .ok_or_else(|| "no agent_message found in codex JSONL output".to_string())?;
415    serde_json::from_str::<EmbeddingResponse>(&text)
416        .map_err(|e| format!("codex agent_message text is not EmbeddingResponse: {e}; raw={text}"))
417}
418
#[cfg(test)]
mod tests {
    use super::*;

    /// Setting either API-key variable must trip the OAuth gate; with both
    /// removed the gate passes.
    #[test]
    fn oauth_only_enforce_blocks_api_keys() {
        for key in ["ANTHROPIC_API_KEY", "OPENAI_API_KEY"] {
            // SAFETY: this test only sets and unsets env vars; it assumes no
            // other test relies on the global env state.
            unsafe {
                std::env::set_var(key, "test");
            }
            assert!(LlmEmbedding::oauth_only_enforce().is_err());
            // SAFETY: same assumption as above.
            unsafe {
                std::env::remove_var(key);
            }
        }
        assert!(LlmEmbedding::oauth_only_enforce().is_ok());
    }

    /// The string form of each flavour is part of the contract.
    #[test]
    fn flavour_as_str_is_stable() {
        let expectations = [
            (EmbeddingFlavour::Claude, "claude"),
            (EmbeddingFlavour::Codex, "codex"),
        ];
        for (flavour, name) in expectations {
            assert_eq!(flavour.as_str(), name);
        }
    }

    /// A bare `{"embedding": [...]}` object is accepted as-is.
    #[test]
    fn parse_embedding_response_accepts_claude_json() {
        let raw = r#"{"embedding":[0.0,1.0,2.0]}"#;

        let response = parse_embedding_response(raw).expect("claude JSON must parse");

        assert_eq!(response.embedding, vec![0.0, 1.0, 2.0]);
    }

    /// The payload is pulled out of the `agent_message` event of a codex
    /// JSONL stream.
    #[test]
    fn parse_embedding_response_accepts_codex_jsonl() {
        let raw = r#"{"type":"thread.started","thread_id":"mock-thread-0"}
{"type":"item.completed","item":{"type":"agent_message","text":"{\"embedding\":[0.0,1.0,2.0]}"}}
{"type":"turn.completed","usage":{"input_tokens":1,"output_tokens":1}}"#;

        let response = parse_embedding_response(raw).expect("codex JSONL must parse");

        assert_eq!(response.embedding, vec![0.0, 1.0, 2.0]);
    }

    /// A stream without any `agent_message` event is an error.
    #[test]
    fn parse_embedding_response_rejects_jsonl_without_agent_message() {
        let raw = r#"{"type":"thread.started","thread_id":"mock-thread-0"}"#;

        let message =
            parse_embedding_response(raw).expect_err("missing agent_message must fail");

        assert!(message.contains("no agent_message"));
    }

    /// The codex command takes its prompt from stdin and carries every
    /// hardening flag.
    #[test]
    fn codex_embedding_command_reads_prompt_from_stdin() {
        let schema_path = std::env::temp_dir().join("sqlite-graphrag-embed-schema-test.json");
        let command = build_codex_embedding_command(
            std::path::Path::new("/bin/true"),
            "gpt-5.4",
            &schema_path,
        );
        let argv: Vec<String> = command
            .as_std()
            .get_args()
            .filter_map(|arg| arg.to_str())
            .map(str::to_owned)
            .collect();

        let has_stdin_marker = argv.iter().any(|arg| arg == "-");
        assert!(
            has_stdin_marker,
            "codex embedding command must read prompt from stdin: {argv:?}"
        );
        let leaks_prompt = argv.iter().any(|arg| arg.starts_with("passage: "));
        assert!(
            !leaks_prompt,
            "prompt text must not be passed as argv: {argv:?}"
        );

        let required_flags = [
            "exec",
            "-c",
            "sandbox_mode='read-only'",
            "approval_policy='never'",
            "--json",
            "--output-schema",
            "--ephemeral",
            "--skip-git-repo-check",
            "--sandbox",
            "read-only",
            "--ignore-user-config",
            "--ignore-rules",
            "--model",
            "gpt-5.4",
        ];
        for required in required_flags {
            assert!(
                argv.iter().any(|arg| arg == required),
                "missing flag {required} in {argv:?}"
            );
        }
    }

    /// End-to-end check against a mock `codex` script: the prefixed prompt
    /// must arrive on stdin, and the JSONL reply must come back as a
    /// 384-dim vector. Requires `bash` and `python3` on PATH.
    #[cfg(unix)]
    #[test]
    fn embed_passage_sends_prompt_to_codex_stdin() {
        use std::os::unix::fs::PermissionsExt;

        let workdir = tempfile::tempdir().expect("tempdir must exist");
        let mock_codex = workdir.path().join("codex-stdin-check");
        let script = r#"#!/usr/bin/env bash
set -euo pipefail

prompt="$(cat)"
if [[ "$prompt" != "passage: codex-cli" ]]; then
  echo "unexpected stdin: $prompt" >&2
  exit 41
fi

python3 - <<'PY'
import json
payload = json.dumps({"embedding": [0.0] * 384})
print(json.dumps({
    "type": "item.completed",
    "item": {
        "type": "agent_message",
        "text": payload,
    },
}))
PY
"#;
        std::fs::write(&mock_codex, script).expect("mock codex script must be written");
        let mut mode = std::fs::metadata(&mock_codex)
            .expect("mock codex metadata must exist")
            .permissions();
        mode.set_mode(0o755);
        std::fs::set_permissions(&mock_codex, mode).expect("mock codex must be executable");

        let mut client = LlmEmbedding {
            flavour: EmbeddingFlavour::Codex,
            binary: mock_codex,
            model: "gpt-5.4".to_string(),
        };

        let vector = client
            .embed_passage("codex-cli")
            .expect("stdin-backed codex embedding must succeed");

        assert_eq!(vector.len(), EMBEDDING_DIM);
        assert!(vector.iter().all(|component| *component == 0.0));
    }
}