Skip to main content

sqlite_graphrag/
embedder.rs

1//! Embedding generation for the GraphRAG memory.
2//!
3//! v1.0.76: the default build is **LLM-only** — the binary does NOT bundle
4//! fastembed / ort / ndarray / tokenizers. All embeddings are produced
5//! by a headless invocation of `claude code` or `codex` (OAuth, no MCP,
6//! no hooks) and stored as a BLOB in `memory_embeddings(memory_id, embedding,
7//! source)`. Vector similarity is computed in pure Rust at query time.
8//!
9//! # Workload classification (G42/S3, BLOCK 1 — MANDATORY)
10//!
11//! LLM embedding is **I/O-bound + subprocess-bound**: each call waits
12//! 5-60s on a network round-trip through a headless `claude -p` /
13//! `codex exec` subprocess while the local CPU stays idle. Concurrency
14//! therefore uses **tokio** (async I/O concurrency) and NEVER rayon
15//! (reserved for CPU-bound work).
16//!
//! # Permit formula (G42/S3, BLOCK 2)
18//!
19//! ```text
20//! permits = clamp(--llm-parallelism, 1, 32)
21//!           .min(available_parallelism())
22//!           .min(available_ram_mb * 0.5 / LLM_WORKER_RSS_MB)
23//! ```
24//!
25//! `LLM_WORKER_RSS_MB = 350` (`crate::constants`): `claude -p` and
26//! `codex exec` are node processes with a typical Maximum RSS of
27//! 200-400 MB (measured via `/usr/bin/time -l` on macOS /
28//! `/usr/bin/time -v` on Linux), so the RAM bound is pertinent.
29//!
30//! # Locking contract (G42/A3 fix)
31//!
32//! The process-wide `Mutex<LlmEmbedding>` protects ONLY the cheap clone
33//! of the client configuration (flavour + binary path + model + shared
34//! schema tempfiles). It is NEVER held across network I/O — the
35//! v1.0.76-v1.0.78 `flush_group` held it for the whole sequential
36//! embedding loop, which is why `--llm-parallelism 8` measured an
37//! effective parallelism of 1.
38
39use crate::errors::AppError;
40use crate::extract::llm_embedding::LlmEmbedding;
41use parking_lot::Mutex;
42use std::path::Path;
43use std::sync::Arc;
44use std::sync::OnceLock;
45use tokio::sync::{mpsc, Semaphore};
46use tokio::task::JoinSet;
47use tokio_util::sync::CancellationToken;
48
/// Process-wide Claude-backed LLM-embedding client behind a `Mutex`.
///
/// The lock guards configuration cloning only (see module docs); the
/// actual LLM I/O happens on clones, outside the lock.
///
/// ADR-0042 / GAP-002: distinct from `EMBEDDER` so the Claude path of
/// `embed_via_backend` no longer re-probes PATH via `detect_available`
/// (the v1.0.82 bug where requesting Claude could resolve to Codex).
/// Populated on first use by [`get_claude_embedder`].
static CLAUDE_EMBEDDER: OnceLock<Mutex<LlmEmbedding>> = OnceLock::new();
/// Process-wide OpenCode-backed LLM-embedding client behind a `Mutex`.
/// Populated on first use by [`get_opencode_embedder`].
static OPENCODE_EMBEDDER: OnceLock<Mutex<LlmEmbedding>> = OnceLock::new();
/// Process-wide OpenRouter embeddings client. Populated on first use by
/// [`get_openrouter_embedder`]; probed by [`is_openrouter_initialized`].
static OPENROUTER_CLIENT: OnceLock<crate::embedding_api::OpenRouterClient> = OnceLock::new();

/// v1.0.95 (ADR-0054): process-wide OpenRouter chat-completions client for
/// the `enrich` JUDGE. Distinct from `OPENROUTER_CLIENT` (embeddings) because
/// the chat client binds a text model, not an embedding model.
static OPENROUTER_CHAT_CLIENT: OnceLock<crate::chat_api::OpenRouterChatClient> = OnceLock::new();
66
/// v1.0.93: check whether the OpenRouter client has been initialised.
///
/// Returns `true` once `OPENROUTER_CLIENT` holds a value (it is stored by
/// [`get_openrouter_embedder`]); never initialises the client itself.
pub fn is_openrouter_initialized() -> bool {
    OPENROUTER_CLIENT.get().is_some()
}
/// Process-wide default LLM-embedding client behind a `Mutex`. Populated on
/// first use by [`get_embedder`] with the backend that
/// `LlmEmbedding::detect_available` resolves.
static EMBEDDER: OnceLock<Mutex<LlmEmbedding>> = OnceLock::new();

/// Process-wide multi-thread tokio runtime for embedding I/O.
///
/// G42/A2 fix: v1.0.76-v1.0.78 built a current-thread runtime PER CALL.
/// One runtime per process amortises the setup and hosts the bounded
/// fan-out of `embed_texts_parallel`. Built lazily by [`shared_runtime`].
static RUNTIME: OnceLock<tokio::runtime::Runtime> = OnceLock::new();
79
/// Calibration base: chunk (long-text) batch size per LLM call at the
/// calibration dimensionality (G42/S2). Use [`chunk_embed_batch_size`]
/// for the dim-adaptive value (G44).
pub const CHUNK_EMBED_BATCH_SIZE: usize = 8;

/// Calibration base: entity-name (short-text) batch size per LLM call at
/// the calibration dimensionality (G42/S2). Use [`entity_embed_batch_size`]
/// for the dim-adaptive value (G44).
pub const ENTITY_EMBED_BATCH_SIZE: usize = 25;

/// Dimensionality the batch bases above were calibrated against (G44).
pub const EMBED_BATCH_CALIBRATION_DIM: usize = 64;

/// G44: scales a calibration-base batch size to the active dimensionality,
/// keeping the float budget per LLM call constant (~512 floats for chunks,
/// ~1600 for entity names — the budgets empirically validated at dim 64).
/// Fixed batches of 8 at 384 dims asked for ~3072 floats per response:
/// claude returned partial coverage (3 of 8 items, caught by the G42/C5
/// check) and codex timed out at 300s.
///
/// The result is always in `1..=max(base, 1)`: dimensions at or below the
/// calibration dimensionality keep the full `base`, larger dimensions
/// shrink the batch proportionally (integer division, floor), never below 1.
///
/// The function is total:
/// - `base.max(1)` keeps `clamp` from panicking (it panics when the upper
///   bound is below the lower one);
/// - `dim.max(1)` rules out division by zero;
/// - the product is computed in `u128`, so it cannot overflow `usize`
///   even for extreme `base` values.
fn adaptive_batch_for_dim(base: usize, dim: usize) -> usize {
    let base = base.max(1);
    let dim = dim.max(1);
    // Widening casts are lossless: usize is at most 64 bits on supported targets.
    let scaled = (base as u128 * EMBED_BATCH_CALIBRATION_DIM as u128) / dim as u128;
    // The clamp caps the value at `base`, so narrowing back to usize is lossless.
    scaled.clamp(1, base as u128) as usize
}
104
105/// Dim-adaptive batch size for chunk (long-text) embedding calls (G44).
106pub fn chunk_embed_batch_size() -> usize {
107    let dim = crate::constants::embedding_dim();
108    let batch = adaptive_batch_for_dim(CHUNK_EMBED_BATCH_SIZE, dim);
109    tracing::debug!(
110        dim,
111        base = CHUNK_EMBED_BATCH_SIZE,
112        batch,
113        "adaptive chunk batch size (G44)"
114    );
115    batch
116}
117
118/// Dim-adaptive batch size for entity-name (short-text) embedding calls (G44).
119pub fn entity_embed_batch_size() -> usize {
120    let dim = crate::constants::embedding_dim();
121    let batch = adaptive_batch_for_dim(ENTITY_EMBED_BATCH_SIZE, dim);
122    tracing::debug!(
123        dim,
124        base = ENTITY_EMBED_BATCH_SIZE,
125        batch,
126        "adaptive entity batch size (G44)"
127    );
128    batch
129}
130
131/// Returns the process-wide multi-thread runtime, building it on first use.
132pub(crate) fn shared_runtime() -> Result<&'static tokio::runtime::Runtime, AppError> {
133    if let Some(rt) = RUNTIME.get() {
134        return Ok(rt);
135    }
136    let rt = tokio::runtime::Builder::new_multi_thread()
137        .worker_threads(2)
138        .enable_all()
139        .build()
140        .map_err(|e| AppError::Embedding(format!("tokio runtime init failed: {e}")))?;
141    let _ = RUNTIME.set(rt);
142    RUNTIME.get().ok_or_else(|| {
143        AppError::Embedding("tokio runtime unavailable after initialisation".to_string())
144    })
145}
146
147/// Initialises the LLM-embedding client on first use and returns it.
148pub fn get_embedder(_models_dir: &Path) -> Result<&'static Mutex<LlmEmbedding>, AppError> {
149    if let Some(e) = EMBEDDER.get() {
150        return Ok(e);
151    }
152    let backend = LlmEmbedding::detect_available()?;
153    let _ = EMBEDDER.set(Mutex::new(backend));
154    EMBEDDER
155        .get()
156        .ok_or_else(|| AppError::Embedding("embedder unavailable after initialisation".to_string()))
157}
158
159/// ADR-0042 / GAP-002: returns the process-wide Claude embedder, lazily
160/// initialising it on first use. Binary and model overrides come from
161/// the explicit arguments; `None` falls back to PATH/env defaults via
162/// the builder.
163pub fn get_claude_embedder(
164    claude_binary: Option<&Path>,
165    claude_model: Option<&str>,
166) -> Result<&'static Mutex<LlmEmbedding>, AppError> {
167    if let Some(e) = CLAUDE_EMBEDDER.get() {
168        return Ok(e);
169    }
170    let mut builder = LlmEmbedding::with_claude_builder();
171    if let Some(b) = claude_binary {
172        builder = builder.override_binary(b.to_path_buf());
173    }
174    if let Some(m) = claude_model {
175        builder = builder.override_model(m.to_string());
176    }
177    let backend = builder.build()?;
178    let _ = CLAUDE_EMBEDDER.set(Mutex::new(backend));
179    CLAUDE_EMBEDDER.get().ok_or_else(|| {
180        AppError::Embedding("claude embedder unavailable after initialisation".to_string())
181    })
182}
183
184/// GAP-OPENCODE-001 / v1.0.90: returns the process-wide OpenCode embedder,
185/// lazily initialising it on first use. Binary and model overrides come
186/// from the explicit arguments; `None` falls back to PATH/env defaults via
187/// the builder.
188pub fn get_opencode_embedder(
189    opencode_binary: Option<&Path>,
190    opencode_model: Option<&str>,
191) -> Result<&'static Mutex<LlmEmbedding>, AppError> {
192    if let Some(e) = OPENCODE_EMBEDDER.get() {
193        return Ok(e);
194    }
195    let mut builder = LlmEmbedding::with_opencode_builder();
196    if let Some(b) = opencode_binary {
197        builder = builder.override_binary(b.to_path_buf());
198    }
199    if let Some(m) = opencode_model {
200        builder = builder.override_model(m.to_string());
201    }
202    let backend = builder.build()?;
203    let _ = OPENCODE_EMBEDDER.set(Mutex::new(backend));
204    OPENCODE_EMBEDDER.get().ok_or_else(|| {
205        AppError::Embedding("opencode embedder unavailable after initialisation".to_string())
206    })
207}
208
209pub fn get_openrouter_embedder(
210    api_key: secrecy::SecretBox<String>,
211    model: &str,
212    dim: usize,
213) -> Result<&'static crate::embedding_api::OpenRouterClient, AppError> {
214    if let Some(c) = OPENROUTER_CLIENT.get() {
215        return Ok(c);
216    }
217    let client = crate::embedding_api::OpenRouterClient::new(api_key, model.to_string(), dim)?;
218    let _ = OPENROUTER_CLIENT.set(client);
219    OPENROUTER_CLIENT.get().ok_or_else(|| {
220        AppError::Embedding("openrouter client unavailable after initialisation".to_string())
221    })
222}
223
224/// v1.0.95 (ADR-0054): initialises the process-wide OpenRouter chat client on
225/// first use and returns it. `model` is the text model the enrich JUDGE will
226/// call (no default; the caller validates presence upfront).
227pub fn get_openrouter_chat_client(
228    api_key: secrecy::SecretBox<String>,
229    model: &str,
230    timeout_secs: u64,
231) -> Result<&'static crate::chat_api::OpenRouterChatClient, AppError> {
232    if let Some(c) = OPENROUTER_CHAT_CLIENT.get() {
233        return Ok(c);
234    }
235    let client =
236        crate::chat_api::OpenRouterChatClient::new(api_key, model.to_string(), timeout_secs)?;
237    let _ = OPENROUTER_CHAT_CLIENT.set(client);
238    OPENROUTER_CHAT_CLIENT.get().ok_or_else(|| {
239        AppError::Embedding("openrouter chat client unavailable after initialisation".to_string())
240    })
241}
242
/// v1.0.95: returns the process-wide OpenRouter chat client if it has already
/// been initialised via [`get_openrouter_chat_client`]. Used by the enrich
/// JUDGE dispatch, which initialises the singleton once at startup and then
/// fetches it per item without re-threading the API key.
///
/// Returns `None` when the client has not been initialised yet; this
/// accessor never constructs it.
pub fn openrouter_chat_client() -> Option<&'static crate::chat_api::OpenRouterChatClient> {
    OPENROUTER_CHAT_CLIENT.get()
}
250
251/// ADR-0042 / GAP-002: route a single passage through the Claude
252/// embedder. Used by the Claude arm of `embed_via_backend` so the
253/// fallback chain stops treating Claude as a synonym for codex.
254pub fn embed_via_claude_local(
255    _models_dir: &Path,
256    text: &str,
257    claude_binary: Option<&Path>,
258    claude_model: Option<&str>,
259) -> Result<Vec<f32>, AppError> {
260    let _slot_guard = acquire_llm_slot_for_embedding()?;
261    let embedder = get_claude_embedder(claude_binary, claude_model)?;
262    embed_passage(embedder, text)
263}
264
265/// BUG-003 / v1.0.85: split of  that also
266/// reports the resolved []. Always  because
267/// this path constructs a Claude-flavoured embedder via
268///  (no PATH probe, no silent substitution).
269pub fn embed_via_claude_local_resolved(
270    _models_dir: &Path,
271    text: &str,
272    claude_binary: Option<&Path>,
273    claude_model: Option<&str>,
274) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
275    let _slot_guard = acquire_llm_slot_for_embedding()?;
276    let embedder = get_claude_embedder(claude_binary, claude_model)?;
277    let v = embed_passage(embedder, text)?;
278    Ok((v, LlmBackendKind::Claude))
279}
280
281/// GAP-OPENCODE-001 / v1.0.90: route a single passage through the OpenCode
282/// embedder, reporting the resolved [`LlmBackendKind::Opencode`]. Constructs
283/// an OpenCode-flavoured embedder via `with_opencode_builder` (no PATH probe,
284/// no silent substitution).
285pub fn embed_via_opencode_local_resolved(
286    _models_dir: &Path,
287    text: &str,
288    opencode_binary: Option<&Path>,
289    opencode_model: Option<&str>,
290) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
291    let _slot_guard = acquire_llm_slot_for_embedding()?;
292    let embedder = get_opencode_embedder(opencode_binary, opencode_model)?;
293    let v = embed_passage(embedder, text)?;
294    Ok((v, LlmBackendKind::Opencode))
295}
/// Clones the embedding-client configuration. The lock is held only for
/// the duration of the clone — NEVER across I/O (G42/A3).
///
/// The guard returned by `lock()` is a temporary of the single expression
/// below, so it is released before the clone is handed to the caller.
fn clone_client(embedder: &Mutex<LlmEmbedding>) -> LlmEmbedding {
    embedder.lock().clone()
}
301
302/// Embeds a single passage for storage. Delegates to the configured LLM
303/// headless (claude code / codex). Returns a vector of the active
304/// dimensionality.
305pub fn embed_passage(embedder: &Mutex<LlmEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
306    let client = clone_client(embedder);
307    let result = client.embed_passage(text)?;
308    validate_dim(result)
309}
310
311/// Embeds a single query for similarity search. Same model and dim as
312/// `embed_passage`; the only difference is the LLM-side prompt prefix
313/// that the headless invocation uses to disambiguate.
314pub fn embed_query(embedder: &Mutex<LlmEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
315    let client = clone_client(embedder);
316    let result = client.embed_query(text)?;
317    validate_dim(result)
318}
319
320/// Embeds a batch of passages with token-count-aware batching.
321///
322/// Kept for API compatibility; since v1.0.79 it routes through the
323/// bounded parallel fan-out with conservative defaults.
324pub fn embed_passages_controlled(
325    embedder: &Mutex<LlmEmbedding>,
326    texts: &[&str],
327    _token_counts: &[usize],
328) -> Result<Vec<Vec<f32>>, AppError> {
329    if texts.is_empty() {
330        return Ok(Vec::new());
331    }
332    let owned: Vec<String> = texts.iter().map(|t| t.to_string()).collect();
333    embed_texts_parallel(embedder, &owned, 1, chunk_embed_batch_size())
334}
335
336pub fn embed_passage_local(models_dir: &Path, text: &str) -> Result<Vec<f32>, AppError> {
337    let _slot_guard = acquire_llm_slot_for_embedding()?;
338    let embedder = get_embedder(models_dir)?;
339    embed_passage(embedder, text)
340}
341
/// v1.0.89 (BUG-SKIP-EMBED): reads the
/// `SQLITE_GRAPHRAG_SKIP_EMBEDDING_ON_FAILURE` env var (set by
/// `--skip-embedding-on-failure` via main.rs propagation).
///
/// Returns `true` only when the variable is exactly `1` or `true`; an
/// unset, non-Unicode or differently valued variable yields `false`.
pub fn should_skip_embedding_on_failure() -> bool {
    std::env::var("SQLITE_GRAPHRAG_SKIP_EMBEDDING_ON_FAILURE")
        .map(|raw| raw == "1" || raw == "true")
        .unwrap_or(false)
}
351
352/// v1.0.89 (BUG-SKIP-EMBED + GAP-EMBED-PROPAGATION): embed a passage
353/// honouring both `--llm-backend` and `--skip-embedding-on-failure`.
354///
355/// On success returns `Ok(Some(vec))`. On failure:
356/// - if `--skip-embedding-on-failure` is active, logs a warning and returns `Ok(None)`
357/// - otherwise propagates the error (exit 11)
358pub fn embed_passage_or_skip(
359    models_dir: &Path,
360    text: &str,
361    choice: Option<crate::cli::LlmBackendChoice>,
362) -> Result<Option<Vec<f32>>, AppError> {
363    match embed_passage_with_choice(models_dir, text, choice) {
364        Ok((v, _backend)) => Ok(Some(v)),
365        Err(AppError::Validation(msg)) => Err(AppError::Validation(msg)),
366        Err(e) => {
367            if should_skip_embedding_on_failure() {
368                tracing::warn!(
369                    error = %e,
370                    "embedding failed but --skip-embedding-on-failure is active; persisting with NULL embedding"
371                );
372                Ok(None)
373            } else {
374                Err(e)
375            }
376        }
377    }
378}
379
380/// BUG-003 / v1.0.85: split of `embed_passage_local` that reports the
381/// resolved [`LlmBackendKind`] based on the ACTUAL
382/// [`LlmEmbedding::flavour`] of the embedder constructed. When
383/// `LlmEmbedding::detect_available` substitutes claude for a missing
384/// codex, the operator sees the truth in `envelope.backend_invoked`.
385pub fn embed_passage_local_resolved(
386    models_dir: &Path,
387    text: &str,
388) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
389    let _slot_guard = acquire_llm_slot_for_embedding()?;
390    let embedder = get_embedder(models_dir)?;
391    let v = embed_passage(embedder, text)?;
392    let kind = match embedder.lock().flavour() {
393        crate::extract::llm_embedding::EmbeddingFlavour::Codex => LlmBackendKind::Codex,
394        crate::extract::llm_embedding::EmbeddingFlavour::Claude => LlmBackendKind::Claude,
395        crate::extract::llm_embedding::EmbeddingFlavour::Opencode => LlmBackendKind::Opencode,
396    };
397    Ok((v, kind))
398}
399
400pub fn embed_query_local(models_dir: &Path, text: &str) -> Result<Vec<f32>, AppError> {
401    let _slot_guard = acquire_llm_slot_for_embedding()?;
402    let embedder = get_embedder(models_dir)?;
403    embed_query(embedder, text)
404}
405
// =============================================================================
// v1.0.82 (GAP-003): wrappers that accept the CLI choice
// (`crate::cli::LlmBackendChoice`) and translate it into a chain for
// `embed_with_fallback`. They centralise the propagation of the
// `--llm-backend` flag across the 6 commands that produce embeddings
// (`remember`, `edit`, `ingest`, `enrich`, `recall`, `hybrid-search`).
// =============================================================================
413
414/// Embed a single passage using the LLM backend selected by the user via
415/// `--llm-backend`. Routes to `embed_with_fallback` so failures fall
416/// through to the next backend in the chain before giving up.
417///
418/// When `choice` is `None` (e.g. a sub-command that does not yet
419/// expose the flag), behaviour matches `embed_passage_local` — the
420/// active embedder from `LlmEmbedding::detect_available` decides the
421/// backend.
422pub fn embed_passage_with_choice(
423    models_dir: &Path,
424    text: &str,
425    choice: Option<crate::cli::LlmBackendChoice>,
426) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
427    let _slot_guard = acquire_llm_slot_for_embedding()?;
428    match choice {
429        None => {
430            let embedder = get_embedder(models_dir)?;
431            embed_passage(embedder, text).map(|v| (v, LlmBackendKind::None))
432        }
433        Some(choice) => embed_with_fallback(models_dir, text, &choice.to_chain(), false),
434    }
435}
436
437/// v1.0.93: embedding with `EmbeddingBackendChoice` awareness. When the
438/// embedding backend is `Openrouter` or `Auto` with a live client, the
439/// chain prepends `OpenRouter` before the LLM subprocess backends.
440pub fn embed_passage_with_embedding_choice(
441    models_dir: &Path,
442    text: &str,
443    embedding_backend: crate::cli::EmbeddingBackendChoice,
444    llm_backend: crate::cli::LlmBackendChoice,
445) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
446    let _slot_guard = acquire_llm_slot_for_embedding()?;
447    let chain = embedding_backend.to_chain(llm_backend);
448    embed_with_fallback(models_dir, text, &chain, false)
449}
450
451/// failure, returns a structured `FallbackReason` so the caller can
452/// surface `vec_degraded` instead of a hard exit 11.
453///
454/// `None` matches the legacy `try_embed_query_with_fallback` path
455/// (uses the active embedder without an explicit chain).
456pub fn try_embed_query_with_choice(
457    models_dir: &Path,
458    text: &str,
459    choice: Option<crate::cli::LlmBackendChoice>,
460) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
461    match embed_passage_with_choice(models_dir, text, choice) {
462        // GAP-004 / v1.0.85.1: when the chain terminates on
463        //  (i.e. user passed
464        // or every preceding backend failed),  returns
465        //  instead of an error. Without this guard the
466        // empty vector would propagate to  which
467        // aborts with exit 11 ("embedding has 0 dims, expected 64").
468        // The caller's contract is to surface a typed
469        // so  and  can route to FTS5-puro via
470        // the existing  /  envelope.
471        // Intercept the empty-vector success path and surface it as
472        //  (introduced at v1.0.85 / ADR-0043
473        // for the symmetric LLM-returned-zero-dim case).
474        Ok((v, _backend)) if v.is_empty() => Err(FallbackReason::DimZero),
475        Ok((v, backend)) => Ok((v, backend)),
476        Err(e) => Err(classify_embedding_error(e)),
477    }
478}
479/// v1.0.93 (GAP-OR-INGEST): query embedding with `EmbeddingBackendChoice`
480/// awareness. Mirrors `try_embed_query_with_choice` but routes through
481/// `embed_passage_with_embedding_choice` so OpenRouter API is used when
482/// configured.
483pub fn try_embed_query_with_embedding_choice(
484    models_dir: &Path,
485    text: &str,
486    embedding_backend: crate::cli::EmbeddingBackendChoice,
487    llm_backend: crate::cli::LlmBackendChoice,
488) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
489    match embed_passage_with_embedding_choice(models_dir, text, embedding_backend, llm_backend) {
490        Ok((v, _backend)) if v.is_empty() => Err(FallbackReason::DimZero),
491        Ok((v, backend)) => Ok((v, backend)),
492        Err(e) => Err(classify_embedding_error(e)),
493    }
494}
495
496/// call. Reads the max-concurrency from
497/// `SQLITE_GRAPHRAG_LLM_MAX_HOST_CONCURRENCY` (default derived from
498/// `LLM_WORKER_RSS_MB` and available memory), and the wait timeout
499/// from `SQLITE_GRAPHRAG_LLM_SLOT_WAIT_SECS` (default 30s).
500///
501/// Returns `Ok(guard)` for happy path, `AppError::LockBusy` (exit 75)
502/// when no slot is available within the wait window, and
503/// `AppError::Validation` when the concurrency is 0.
504///
505/// The `LLM_SLOT_NO_WAIT` env var (or its CLI flag equivalent) sets
506/// `wait_secs = 0` to fail fast in tests.
507fn acquire_llm_slot_for_embedding() -> Result<crate::llm_slots::LlmSlotGuard, AppError> {
508    use crate::constants::{CLI_LOCK_DEFAULT_WAIT_SECS, LLM_WORKER_RSS_MB};
509    let max = std::env::var("SQLITE_GRAPHRAG_LLM_MAX_HOST_CONCURRENCY")
510        .ok()
511        .and_then(|s| s.parse::<u32>().ok())
512        .filter(|n| *n >= 1)
513        .unwrap_or_else(crate::llm_slots::default_max_concurrency);
514    let wait_secs = if std::env::var("SQLITE_GRAPHRAG_LLM_SLOT_NO_WAIT").is_ok() {
515        0
516    } else {
517        std::env::var("SQLITE_GRAPHRAG_LLM_SLOT_WAIT_SECS")
518            .ok()
519            .and_then(|s| s.parse::<u64>().ok())
520            .unwrap_or(CLI_LOCK_DEFAULT_WAIT_SECS)
521    };
522    let _ = LLM_WORKER_RSS_MB; // silence the unused import (used in default_max_concurrency)
523                               // GAP-003 / ADR-0043: when the slot semaphore is contended beyond the
524                               // backoff window (50 + 100 + 200 + 400 = 750ms total), return a
525                               // marker message that `classify_embedding_error` maps to
526                               // `FallbackReason::SlotExhausted` (discriminator `slot_exhausted`).
527                               // The window is shorter than the legacy 30s timeout, so the operator
528                               // observes FTS5-puro fallback quickly instead of after 30s of silence.
529    match crate::llm_slots::acquire_llm_slot(max, wait_secs) {
530        Ok(guard) => Ok(guard),
531        Err(e @ AppError::LockBusy { .. }) if wait_secs > 0 => Err(AppError::Embedding(format!(
532            "slot exhausted: {e} (fall back to FTS5)"
533        ))),
534        Err(e) => Err(e),
535    }
536}
/// GAP-004 (v1.0.88): typed classifier for embedding error messages.
///
/// Decomposes the legacy `AppError::Embedding(String)` payload into a
/// small enum so the call sites can branch on the cause instead of
/// repeating `msg.contains(...)` literals. The classification is purely
/// lexical (case-insensitive substring match on the error message) — no
/// I/O, no retries, no telemetry, deterministic and safe under
/// `#[serial_test::serial(env)]`.
///
/// 6 variants cover the 5 known discriminators from v1.0.85 (ADR-0043)
/// plus an `Unknown` fallback for messages that do not match any marker.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmbeddingErrorKind {
    /// OAuth token expired or absent; no backend can authenticate.
    OAuth,
    /// OAuth usage quota exhausted on the named backend.
    Quota,
    /// LLM slot semaphore exhausted after the backoff window.
    SlotExhausted,
    /// User-requested backend differs from the one that actually executed.
    BackendMismatch,
    /// Embedding returned a zero-dimensional vector (structural bug).
    ZeroDimension,
    /// Message did not match any of the 5 markers above.
    Unknown,
}

impl EmbeddingErrorKind {
    /// Classify an embedding error message into a typed kind.
    ///
    /// Order of checks matters, first match wins:
    /// 1. `OAuth` — matched before `Quota` because both substrings can
    ///    co-occur in the same message.
    /// 2. `SlotExhausted` — matched before `Quota` because the slot-sema
    ///    path is more specific (the LLM never even tried to authenticate).
    /// 3. `Quota`, 4. `BackendMismatch`, 5. `ZeroDimension` (requires both
    ///    `dim` and `zero`), otherwise `Unknown`.
    ///
    /// The checks are case-insensitive so `OAuth` and `oauth` both classify
    /// to `EmbeddingErrorKind::OAuth`.
    ///
    /// NOTE(review): the zero-dimension marker needs the literal word
    /// `zero`; a message such as "embedding has 0 dims, expected 64" is
    /// classified as `Unknown` — confirm that is acceptable.
    pub fn classify(msg: &str) -> Self {
        let m = msg.to_lowercase();
        if m.contains("oauth") {
            Self::OAuth
        } else if m.contains("slot exhausted") {
            Self::SlotExhausted
        } else if m.contains("quota") {
            Self::Quota
        } else if m.contains("backend mismatch") {
            Self::BackendMismatch
        } else if m.contains("dim") && m.contains("zero") {
            Self::ZeroDimension
        } else {
            Self::Unknown
        }
    }

    /// Stable, machine-friendly discriminator code (lowercase, kebab-safe).
    pub fn code(&self) -> &'static str {
        match self {
            Self::OAuth => "oauth",
            Self::Quota => "quota",
            Self::SlotExhausted => "slot-exhausted",
            Self::BackendMismatch => "backend-mismatch",
            Self::ZeroDimension => "zero-dimension",
            Self::Unknown => "unknown",
        }
    }
}

/// Displays the kind as its stable [`EmbeddingErrorKind::code`].
impl std::fmt::Display for EmbeddingErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.code())
    }
}
608
/// G58/S1: why an embedding call could not be completed, forcing the caller
/// onto a non-vector retrieval path (FTS5 prefix + LIKE).
///
/// Returned by [`try_embed_query_with_fallback`] so the `recall` and
/// `hybrid-search` handlers can surface a structured `vec_degraded` /
/// `warning` envelope instead of a hard `AppError::Embedding` exit 11.
#[derive(Debug, Clone, PartialEq)]
pub enum FallbackReason {
    /// The LLM subprocess failed (rate limit, OAuth contention, quota
    /// exhausted, model unparsable response, divergent dim, etc.).
    /// Carries the original error message for observability.
    EmbeddingFailed(String),
    /// The LLM slot semaphore was exhausted: 8+ concurrent LLM
    /// subprocesses blocked the acquire beyond the backoff window
    /// (50ms + 100ms + 200ms + 400ms = 750ms total). Resolved at v1.0.85
    /// (GAP-003 / ADR-0043).
    SlotExhausted,
    /// OAuth usage quota exhausted on the named backend. The caller
    /// should retry with an alternative backend (codex ↔ claude)
    /// before falling back to the FTS5-only path.
    OAuthQuota { backend: &'static str },
    /// The user requested a backend that differs from the one that
    /// actually executed the embedding (legacy "synonym for codex"
    /// bug from v1.0.83). Resolved at v1.0.84 (GAP-002).
    BackendMismatch {
        requested: &'static str,
        resolved: &'static str,
    },
    /// The embedding returned a zero-dimensional vector, signalling a
    /// structural bug (the LLM did not produce any floats). Distinct
    /// from OAuthQuota (quota exhausted) and EmbeddingFailed
    /// (subprocess error).
    DimZero,
    /// The embedding was cancelled by an external signal (SIGTERM, etc.).
    Cancelled,
    /// The embedding exceeded its time budget. Carries the operation name
    /// and the elapsed seconds for diagnostic logging.
    Timeout {
        operation: String,
        duration_secs: u64,
    },
}

impl FallbackReason {
    /// Stable, machine-friendly reason code used by JSON envelopes
    /// (`vec_degraded_reason`). Mirrors the v1.0.84 contract extended
    /// at v1.0.85 with 4 new variants (GAP-003 / ADR-0043).
    pub fn reason_code(&self) -> &'static str {
        match *self {
            FallbackReason::EmbeddingFailed(_) => "embedding_failed",
            FallbackReason::SlotExhausted => "slot_exhausted",
            FallbackReason::OAuthQuota { .. } => "oauth_quota",
            FallbackReason::BackendMismatch { .. } => "backend_mismatch",
            FallbackReason::DimZero => "dim_zero",
            FallbackReason::Cancelled => "cancelled",
            FallbackReason::Timeout { .. } => "timeout",
        }
    }
}

/// Human-readable rendering of each reason, one sentence per variant.
impl std::fmt::Display for FallbackReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Variants without payload render a fixed sentence.
            FallbackReason::SlotExhausted => f.write_str(
                "slot exhausted: failed to acquire LLM slot after backoff window (max=8 concurrent, total backoff=750ms)",
            ),
            FallbackReason::DimZero => f.write_str("embedding returned zero-dimensional vector"),
            FallbackReason::Cancelled => f.write_str("embedding cancelled by external signal"),
            // Variants with payload interpolate it into the sentence.
            FallbackReason::EmbeddingFailed(detail) => write!(f, "embedding failed: {detail}"),
            FallbackReason::OAuthQuota { backend } => {
                write!(f, "OAuth usage quota exhausted on backend '{backend}'")
            }
            FallbackReason::BackendMismatch {
                requested,
                resolved,
            } => write!(
                f,
                "backend mismatch: user requested '{requested}' but '{resolved}' was invoked"
            ),
            FallbackReason::Timeout {
                operation,
                duration_secs,
            } => write!(
                f,
                "embedding timed out after {duration_secs}s during {operation}"
            ),
        }
    }
}

impl std::error::Error for FallbackReason {}
705
706/// G58/S1: try to embed a query, mapping any failure to a structured
707/// [`FallbackReason`] so callers can route to FTS5 + LIKE fallback instead
708/// of returning exit 11 to the user.
709///
710/// This is the bridge between the hard-fail `embed_query_local` (used by
711/// write paths where embedding failure aborts the operation) and the
712/// graceful-degradation contract of `recall` / `hybrid-search` in v1.0.80.
713pub fn try_embed_query_with_fallback(
714    models_dir: &Path,
715    query: &str,
716) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
717    match embed_query_local(models_dir, query) {
718        Ok(v) => Ok((v, LlmBackendKind::None)),
719        Err(e) => Err(classify_embedding_error(e)),
720    }
721}
722
723/// G58 / ADR-0043 (v1.0.85): deterministic fallback for `recall` and
724/// `hybrid-search`.
725///
726/// - On `OAuthQuota { backend }`, retry once with the alternative backend
727///   (codex ↔ claude) before giving up.
728/// - On `SlotExhausted`, sleep 750ms and retry once (gives the slot
729///   semaphore time to release a permit from a sibling subprocess).
730/// - On any other `FallbackReason`, return immediately (deterministic).
731pub fn try_embed_query_with_deterministic_fallback(
732    models_dir: &Path,
733    query: &str,
734    choice: Option<crate::cli::LlmBackendChoice>,
735) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
736    match try_embed_query_with_choice(models_dir, query, choice) {
737        Ok(t) => Ok(t),
738        Err(reason @ FallbackReason::OAuthQuota { backend }) => {
739            let alt = match backend {
740                "codex" => Some(crate::cli::LlmBackendChoice::Claude),
741                "claude" => Some(crate::cli::LlmBackendChoice::Codex),
742                "opencode" => Some(crate::cli::LlmBackendChoice::Codex),
743                "openrouter" => Some(crate::cli::LlmBackendChoice::Codex),
744                _ => None,
745            };
746            if let Some(alt_choice) = alt {
747                try_embed_query_with_choice(models_dir, query, Some(alt_choice))
748            } else {
749                Err(reason)
750            }
751        }
752        Err(reason @ FallbackReason::SlotExhausted) => {
753            std::thread::sleep(std::time::Duration::from_millis(750));
754            try_embed_query_with_choice(models_dir, query, choice).or(Err(reason))
755        }
756        Err(other) => Err(other),
757    }
758}
759
760/// Classify an embedding [`AppError`] into a typed [`FallbackReason`].
761///
762/// v1.0.85 (ADR-0043): discriminates the 4 new causes (SlotExhausted,
763/// OAuthQuota, BackendMismatch, DimZero) from the legacy generic
764/// EmbeddingFailed bucket. The classification is purely lexical
765/// (substring match on the message) — no I/O, no retries, no
766/// telemetry, deterministic and `#[serial_test::serial(env)]`-safe.
767pub fn classify_embedding_error(err: AppError) -> FallbackReason {
768    match err {
769        AppError::Timeout {
770            operation,
771            duration_secs,
772        } => FallbackReason::Timeout {
773            operation,
774            duration_secs,
775        },
776        AppError::Embedding(msg) => match EmbeddingErrorKind::classify(&msg) {
777            // GAP-004 (v1.0.88): typed-discriminator dispatch.
778            // The lexical classifier picks the discriminator; the arms below
779            // enrich the result with the backend name and the
780            // requested/resolved pair that the JSON envelope needs.
781            //
782            // Note: `Cancelled` and `EmbeddingFailed(msg)` are not in the
783            // 6-variant enum (they have no lexical marker) so we keep them
784            // as explicit guards at the head of the match.
785            EmbeddingErrorKind::SlotExhausted => FallbackReason::SlotExhausted,
786            EmbeddingErrorKind::OAuth => {
787                let backend = if msg.contains("codex") {
788                    "codex"
789                } else if msg.contains("claude") || msg.contains("anthropic-ratelimit") {
790                    // G45-CR5: anthropic-ratelimit-* headers are emitted only by
791                    // the Claude CLI subprocess; treat them as claude quota
792                    // signals even when the message text omits the word
793                    // "claude" explicitly.
794                    "claude"
795                } else if msg.contains("opencode") {
796                    "opencode"
797                } else {
798                    "unknown"
799                };
800                FallbackReason::OAuthQuota { backend }
801            }
802            EmbeddingErrorKind::Quota => {
803                let backend = if msg.contains("codex") {
804                    "codex"
805                } else if msg.contains("claude") || msg.contains("anthropic-ratelimit") {
806                    "claude"
807                } else if msg.contains("opencode") {
808                    "opencode"
809                } else {
810                    "unknown"
811                };
812                FallbackReason::OAuthQuota { backend }
813            }
814            EmbeddingErrorKind::BackendMismatch => {
815                // The `msg.contains("claude")` arm is intentionally
816                // placed BEFORE the OAuth arm so that a backend-mismatch
817                // message that mentions both "claude" and "codex" maps to
818                // BackendMismatch (the more specific failure mode).
819                let (requested, resolved) =
820                    if msg.contains("requested claude") && msg.contains("but codex") {
821                        ("claude", "codex")
822                    } else if msg.contains("requested codex") && msg.contains("but claude") {
823                        ("codex", "claude")
824                    } else if msg.contains("requested claude") {
825                        ("claude", "unknown")
826                    } else if msg.contains("requested codex") {
827                        ("codex", "unknown")
828                    } else {
829                        ("unknown", "unknown")
830                    };
831                FallbackReason::BackendMismatch {
832                    requested,
833                    resolved,
834                }
835            }
836            EmbeddingErrorKind::ZeroDimension => FallbackReason::DimZero,
837            EmbeddingErrorKind::Unknown => {
838                if msg.contains("cancelled") {
839                    FallbackReason::Cancelled
840                } else {
841                    FallbackReason::EmbeddingFailed(msg)
842                }
843            }
844        },
845        e => FallbackReason::EmbeddingFailed(e.to_string()),
846    }
847}
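// =============================================================================
// Fallback chain: `embed_with_fallback` tries each configured LLM
// backend in order before giving up. The chain order matches the
// user-supplied `--llm-fallback` list (default: codex, claude, opencode,
// none).
// =============================================================================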
851
852/// Tries each LLM backend in `chain` in order, returning the first
853/// successful embedding. On failure, the diagnostic tail of the last
854/// error is preserved in the returned `AppError::Embedding` so the
855/// operator can see WHY every backend failed.
856///
857/// If `skip_on_failure` is `true` AND every backend fails, the function
858/// returns `Ok(Vec::new())` (an empty vector) to signal "persist
859/// without embedding" — the call site is then responsible for writing
860/// a `pending_embeddings` row that can be retried later by the
861/// `embedding retry` subcommand.
862///
863/// Defaults the chain to `[codex, claude, none]` when `chain` is
864/// empty, matching the v1.0.81 behaviour where codex was the
865/// implicit default and claude was the implicit fallback.
866pub fn embed_with_fallback(
867    models_dir: &Path,
868    text: &str,
869    chain: &[LlmBackendKind],
870    skip_on_failure: bool,
871) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
872    use crate::llm::exit_code_hints::LlmBackendError;
873    let effective: Vec<LlmBackendKind> = if chain.is_empty() {
874        vec![
875            LlmBackendKind::Codex,
876            LlmBackendKind::Claude,
877            LlmBackendKind::Opencode,
878            LlmBackendKind::None,
879        ]
880    } else {
881        chain.to_vec()
882    };
883
884    let mut last_err: Option<AppError> = None;
885    for backend in &effective {
886        // BUG-003 / v1.0.85: propagar o backend REAL retornado por
887        // embed_via_backend (que pode diferir do chain position quando
888        // LlmEmbedding::detect_available substitui codex por claude).
889        // O tuple `(_, requested_kind)` é descartado — só queremos o
890        // backend resolvido na primeira posição.
891        // ADR-0046 / BUG-11 v1.0.88: use `embed_via_backend_strict` so the
892        // sentinel `None` backend propagates the last real error instead
893        // of silently degrading to `Ok((Vec::new(), None))`. This is the
894        // path that caused preflight rejections to be swallowed by the
895        // chain's default trailing `None`.
896        match embed_via_backend_strict(
897            models_dir,
898            text,
899            backend,
900            last_err.as_ref(),
901            skip_on_failure,
902        ) {
903            Ok((v, resolved_kind)) => return Ok((v, resolved_kind)),
904            Err(e) => {
905                // ADR-0011: Validation errors (OAuth-only enforcement) are
906                // FATAL — propagate immediately without trying the next
907                // backend. This prevents the fallback chain from swallowing
908                // OAuth violations via the trailing `None` sentinel.
909                if matches!(e, AppError::Validation(_)) {
910                    return Err(e);
911                }
912                tracing::warn!(
913                    target: "embedding",
914                    backend = ?backend,
915                    error = %e,
916                    "embed_with_fallback: backend failed, trying next"
917                );
918                last_err = Some(e);
919            }
920        }
921    }
922    if skip_on_failure {
923        // Signal "persist with no embedding" via an empty vector paired
924        // with `None` so callers know the chain exhausted without a hit.
925        // Caller is responsible for writing a `pending_embeddings` row
926        // that can be retried later by the `embedding retry` subcommand.
927        return Ok((Vec::new(), LlmBackendKind::None));
928    }
929    Err(last_err
930        .unwrap_or_else(|| AppError::Embedding(LlmBackendError::NoBackendsAvailable.to_string())))
931}
932
/// Backend selector for the embedding fallback chain.
///
/// Mirrors the CLI `--llm-backend` enum so the same values can be passed
/// to `--llm-fallback` without translation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LlmBackendKind {
    /// `codex exec` (default for v1.0.76+).
    Codex,
    /// `claude -p` (fallback for ChatGPT Pro OAuth unavailability).
    Claude,
    /// `opencode run` (v1.0.90).
    Opencode,
    /// OpenRouter HTTP API (v1.0.93).
    OpenRouter,
    /// No embedding — empty vector returned.
    None,
}

impl LlmBackendKind {
    /// Returns the stable lowercase label for this backend.
    ///
    /// The labels appear in tracing output and JSON envelopes and are part
    /// of the public contract for `envelope.backend_invoked`.
    pub fn as_str(self) -> &'static str {
        match self {
            LlmBackendKind::None => "none",
            LlmBackendKind::OpenRouter => "openrouter",
            LlmBackendKind::Opencode => "opencode",
            LlmBackendKind::Claude => "claude",
            LlmBackendKind::Codex => "codex",
        }
    }
}
963
964/// Embeds a single text via the given backend. Used by
965/// `embed_with_fallback` and exposed to allow direct one-shot
966/// selection without a chain.
967/// Embeds a single text via the given backend. Used by
968/// `embed_with_fallback` and exposed to allow direct one-shot
969/// selection without a chain.
970///
971/// BUG-003 / v1.0.85: returns `(Vec<f32>, LlmBackendKind)`. The
972/// second element reports the backend that ACTUALLY executed the
973/// embedding, not the chain position requested by the caller. When
974/// `LlmBackendKind::Codex` is requested but `codex` is absent from
975/// PATH, `LlmEmbedding::detect_available` substitutes claude and the
976/// tuple carries `LlmBackendKind::Claude` so the operator sees the
977/// truth in `envelope.backend_invoked`.
978pub fn embed_via_backend(
979    models_dir: &Path,
980    text: &str,
981    backend: &LlmBackendKind,
982) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
983    match backend {
984        LlmBackendKind::None => Ok((Vec::new(), LlmBackendKind::None)),
985        LlmBackendKind::Codex => embed_passage_local_resolved(models_dir, text),
986        LlmBackendKind::Claude => {
987            // ADR-0042 / GAP-002: route Claude through its own static
988            // embedder instead of re-using the Codex path (which used
989            // to silently pick Codex if PATH ordered it first).
990            tracing::debug!(
991                target: "embedder",
992                backend = "claude",
993                "embed_via_backend: forcing claude (ADR-0042 / GAP-002 fix)"
994            );
995            embed_via_claude_local_resolved(models_dir, text, None, None)
996        }
997        LlmBackendKind::Opencode => {
998            tracing::debug!(
999                target: "embedder",
1000                backend = "opencode",
1001                "embed_via_backend: forcing opencode (GAP-OPENCODE-001)"
1002            );
1003            embed_via_opencode_local_resolved(models_dir, text, None, None)
1004        }
1005        LlmBackendKind::OpenRouter => {
1006            tracing::debug!(
1007                target: "embedder",
1008                backend = "openrouter",
1009                "embed_via_backend: using OpenRouter API (v1.0.93)"
1010            );
1011            let client = OPENROUTER_CLIENT.get().ok_or_else(|| {
1012                AppError::Embedding(
1013                    "OpenRouter client not initialised; call get_openrouter_embedder first".into(),
1014                )
1015            })?;
1016            let rt = shared_runtime()?;
1017            let vec = rt.block_on(client.embed_single(text, client.default_input_type()))?;
1018            Ok((vec, LlmBackendKind::OpenRouter))
1019        }
1020    }
1021}
1022
1023// ADR-0046 / BUG-11 v1.0.88: specialisation of `embed_via_backend` that
1024// refuses to SILENTLY DEGRADE to `LlmBackendKind::None` after all real
1025// backends (Codex, Claude) have failed. The previous behaviour
1026// (`Ok((Vec::new(), None))`) caused the `remember` write path to persist
1027// memories with zero-dimensional embeddings — breaking `recall` and
1028// `hybrid-search` while returning exit 0 (BUG-11 CRITICAL).
1029//
1030// When `--llm-backend none` is explicitly requested (i.e. `last_err` is
1031// None AND the chain was a single-element `[None]`), pass
1032// `skip_on_failure = true` to `embed_with_fallback` to consume the empty
1033// vector via the pending-embeddings retry queue instead of persisting
1034// directly. This helper is the right hook for `remember`/`edit`/`ingest`.
1035pub fn embed_via_backend_strict(
1036    models_dir: &Path,
1037    text: &str,
1038    backend: &LlmBackendKind,
1039    last_err: Option<&AppError>,
1040    skip_on_failure: bool,
1041) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
1042    use crate::llm::exit_code_hints::LlmBackendError;
1043    match backend {
1044        LlmBackendKind::None => {
1045            // If the caller opted into skip_on_failure AND no prior
1046            // backend has recorded an error, the empty vector is
1047            // intentional (chain of only [None]).
1048            if skip_on_failure && last_err.is_none() {
1049                Ok((Vec::new(), LlmBackendKind::None))
1050            } else if last_err.is_some() {
1051                // The chain reached `None` after Codex/Claude failed.
1052                // Propagate the most recent error so `remember` aborts
1053                // instead of persisting a memory without an embedding.
1054                Err(match last_err {
1055                    Some(e) => AppError::Embedding(format!("{e}")),
1056                    None => AppError::Embedding(LlmBackendError::NoBackendsAvailable.to_string()),
1057                })
1058            } else {
1059                // Empty chain with no skip_on_failure — treat as a
1060                // configuration error (no backends available).
1061                Err(AppError::Embedding(
1062                    LlmBackendError::NoBackendsAvailable.to_string(),
1063                ))
1064            }
1065        }
1066        LlmBackendKind::Codex => embed_passage_local_resolved(models_dir, text),
1067        LlmBackendKind::Claude => {
1068            tracing::debug!(
1069                target: "embedder",
1070                backend = "claude",
1071                "embed_via_backend_strict: forcing claude (ADR-0042 / GAP-002 fix)"
1072            );
1073            embed_via_claude_local_resolved(models_dir, text, None, None)
1074        }
1075        LlmBackendKind::Opencode => {
1076            tracing::debug!(
1077                target: "embedder",
1078                backend = "opencode",
1079                "embed_via_backend_strict: forcing opencode (GAP-OPENCODE-001)"
1080            );
1081            embed_via_opencode_local_resolved(models_dir, text, None, None)
1082        }
1083        LlmBackendKind::OpenRouter => embed_via_backend(models_dir, text, backend),
1084    }
1085}
1086
1087/// Legacy one-shot wrapper around `embed_via_backend` that discards
1088/// the resolved backend. Kept for call sites that only care about
1089/// the vector and ignore the executed-backend signal. New code
1090/// should prefer `embed_via_backend` directly.
1091pub fn embed_via_backend_legacy(
1092    models_dir: &Path,
1093    text: &str,
1094    backend: &LlmBackendKind,
1095) -> Result<Vec<f32>, AppError> {
1096    embed_via_backend(models_dir, text, backend).map(|(v, _)| v)
1097}
1098
1099pub fn embed_passages_controlled_local(
1100    models_dir: &Path,
1101    texts: &[&str],
1102    token_counts: &[usize],
1103) -> Result<Vec<Vec<f32>>, AppError> {
1104    let embedder = get_embedder(models_dir)?;
1105    embed_passages_controlled(embedder, texts, token_counts)
1106}
1107
1108/// G42/S3: embeds `texts` through the bounded parallel fan-out and
1109/// returns vectors in input order.
1110pub fn embed_passages_parallel_local(
1111    models_dir: &Path,
1112    texts: &[String],
1113    parallelism: usize,
1114    batch_size: usize,
1115) -> Result<Vec<Vec<f32>>, AppError> {
1116    let embedder = get_embedder(models_dir)?;
1117    embed_texts_parallel(embedder, texts, parallelism, batch_size)
1118}
1119
/// GAP-OPENROUTER-REST-CONCURRENCY: outcome of one chunk of the bounded
/// fan-out.
///
/// The first element is the chunk index (position of the chunk in the
/// input), the second is the batch embedding result for that chunk. The
/// index is what lets the caller restore input order after out-of-order
/// `JoinSet` completion.
type EmbedChunkResult = (usize, Result<Vec<Vec<f32>>, AppError>);
1124
/// GAP-OPENROUTER-REST-CONCURRENCY: rebuilds the flat, input-ordered list
/// of vectors from chunk parts that completed out of order.
///
/// Each part is `(chunk_index, vectors_of_that_chunk)`. Parts are sorted
/// by chunk index (stable sort) and their vectors are concatenated, so the
/// output matches the original `texts` order.
fn reassemble_ordered(mut parts: Vec<(usize, Vec<Vec<f32>>)>) -> Vec<Vec<f32>> {
    parts.sort_by(|left, right| left.0.cmp(&right.0));
    let mut ordered = Vec::new();
    for (_, vectors) in parts {
        ordered.extend(vectors);
    }
    ordered
}
1133
/// v1.0.93 (GAP-OR-INGEST): embeds multiple passages with
/// `EmbeddingBackendChoice` awareness. When the resolved chain starts
/// with `OpenRouter` and the client is initialised, uses the HTTP batch
/// API (`embed_batch`) instead of subprocess fan-out — no LLM slot
/// consumed, ~200ms per batch vs ~15s per subprocess cold-start.
/// Falls back to `embed_passages_parallel_local` for LLM backends.
///
/// Notes on the OpenRouter path:
/// - `batch_size` is not used; it is only forwarded to the local
///   fallback. Inputs are split into fixed chunks of 32 texts.
/// - Inputs of at most 32 texts, or an effective fan-out width of 1, are
///   sent as a single `embed_batch` call.
/// - Otherwise at most `parallelism.clamp(1, 16)` chunk requests are in
///   flight at once, and the result is reassembled in input order.
/// - The first chunk error (or task join error) aborts the whole call and
///   is returned to the caller.
pub fn embed_passages_parallel_with_embedding_choice(
    models_dir: &Path,
    texts: &[String],
    parallelism: usize,
    batch_size: usize,
    embedding_backend: crate::cli::EmbeddingBackendChoice,
    llm_backend: crate::cli::LlmBackendChoice,
) -> Result<Vec<Vec<f32>>, AppError> {
    let chain = embedding_backend.to_chain(llm_backend);
    if chain.first() == Some(&LlmBackendKind::OpenRouter) && is_openrouter_initialized() {
        let client = OPENROUTER_CLIENT.get().ok_or_else(|| {
            AppError::Embedding(
                "OpenRouter client not initialised; call get_openrouter_embedder first".into(),
            )
        })?;
        let rt = shared_runtime()?;

        // GAP-OPENROUTER-REST-CONCURRENCY: reuse the caller's `parallelism`
        // as a bounded fan-out width, clamped to a Cloudflare-safe range.
        // Small inputs stay serial — a single batch is one REST call, so the
        // JoinSet overhead would only add latency.
        let k = parallelism.clamp(1, 16);
        if texts.len() <= 32 || k == 1 {
            let refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
            let vecs = rt.block_on(client.embed_batch(&refs, client.default_input_type()))?;
            return Ok(vecs);
        }

        // `client` is a `&'static OpenRouterClient` (OPENROUTER_CLIENT is a
        // static OnceLock), so it is Copy + Send + 'static and moves freely
        // into each spawned task. Chunk contents are cloned into owned
        // `Vec<String>` because `texts` is only borrowed.
        let vecs = rt.block_on(async move {
            let mut set: JoinSet<EmbedChunkResult> = JoinSet::new();
            let mut parts: Vec<(usize, Vec<Vec<f32>>)> = Vec::new();

            for (idx, chunk) in texts.chunks(32).enumerate() {
                // Window is full: wait for one in-flight chunk to finish
                // before spawning the next, keeping at most `k` tasks alive.
                if set.len() >= k {
                    if let Some(joined) = set.join_next().await {
                        let (cidx, res) = joined.map_err(|e| {
                            AppError::Embedding(format!("embedding task join error: {e}"))
                        })?;
                        // `res?` returns early on the first failed chunk.
                        parts.push((cidx, res?));
                    }
                }
                let owned: Vec<String> = chunk.to_vec();
                set.spawn(async move {
                    let refs: Vec<&str> = owned.iter().map(|s| s.as_str()).collect();
                    let r = client.embed_batch(&refs, client.default_input_type()).await;
                    // Tag the result with its chunk index for reordering.
                    (idx, r)
                });
            }

            // Drain whatever is still in flight after the last spawn.
            while let Some(joined) = set.join_next().await {
                let (cidx, res) = joined
                    .map_err(|e| AppError::Embedding(format!("embedding task join error: {e}")))?;
                parts.push((cidx, res?));
            }

            Ok::<Vec<Vec<f32>>, AppError>(reassemble_ordered(parts))
        })?;
        Ok(vecs)
    } else {
        embed_passages_parallel_local(models_dir, texts, parallelism, batch_size)
    }
}
1206
/// G56: in-process cache for entity embeddings keyed by `(model, text)`.
///
/// Schema v13 is immutable: `entity_embeddings` does not have a `text`
/// column, so a pure DB-side cache would require a schema bump. Instead
/// we keep a process-wide map that survives within one CLI invocation.
/// The hit rate is high in `ingest` (re-embedding the same canonical
/// entity across thousands of memories) and modest in `remember`
/// (typical single-memory invocations).
///
/// NOTE(review): the map is a plain `HashMap`; no eviction or size bound
/// is visible in this part of the file, so it should not be assumed to
/// behave like an LRU.
///
/// Key: a `u64` taken from the first 8 bytes (little-endian) of
/// `blake3(model || "\0" || text)` — see `entity_cache_key`. Value:
/// `Arc<Vec<f32>>` so the collector can drop the map entry while a `Vec`
/// is still in flight.
type EntityEmbedCacheMap = std::collections::HashMap<u64, Arc<Vec<f32>>>;

// Process-wide storage for the entity embedding cache; initialised lazily
// on first access through `entity_embed_cache`.
static ENTITY_EMBED_CACHE: OnceLock<parking_lot::Mutex<EntityEmbedCacheMap>> = OnceLock::new();
1221
1222fn entity_embed_cache() -> &'static parking_lot::Mutex<EntityEmbedCacheMap> {
1223    ENTITY_EMBED_CACHE.get_or_init(|| parking_lot::Mutex::new(std::collections::HashMap::new()))
1224}
1225
1226fn entity_cache_key(model: &str, text: &str) -> u64 {
1227    let mut hasher = blake3::Hasher::new();
1228    hasher.update(model.as_bytes());
1229    hasher.update(b"\0");
1230    hasher.update(text.as_bytes());
1231    let h = hasher.finalize();
1232    let bytes = h.as_bytes();
1233    u64::from_le_bytes([
1234        bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
1235    ])
1236}
1237
1238/// G56: embeds entity-name texts through a process-wide cache.
1239///
1240/// Skips any `(model, text)` pair already produced in this CLI invocation
1241/// and only spawns subprocesses for the cache misses. Returns vectors in
1242/// the same order as `texts`.
1243///
1244/// Designed for entity-name batches (short texts). For chunk embeds use
1245/// [`embed_passages_parallel_local`] directly — chunks are unique per
1246/// memory and cache hit rate is negligible.
1247pub fn embed_entity_texts_cached(
1248    models_dir: &Path,
1249    texts: &[String],
1250    parallelism: usize,
1251    embedding_backend: crate::cli::EmbeddingBackendChoice,
1252    llm_backend: crate::cli::LlmBackendChoice,
1253) -> Result<(Vec<Vec<f32>>, EmbedCacheStats), AppError> {
1254    if texts.is_empty() {
1255        return Ok((Vec::new(), EmbedCacheStats::default()));
1256    }
1257    // GAP-OR-ENTITY-EMBED: resolve the SAME chain the chunk path uses so the
1258    // entity embedding honours `--embedding-backend`/`--llm-backend` instead
1259    // of always forcing the codex subprocess (the old G56 code path).
1260    let chain = embedding_backend.to_chain(llm_backend);
1261
1262    // `none` short-circuit: when the resolved chain is exactly `[None]`
1263    // (`--embedding-backend llm --llm-backend none`) skip every backend and
1264    // return empty vectors WITHOUT spawning a subprocess. Empties are never
1265    // cached so a later call with a real backend in the same process is not
1266    // poisoned; they count as misses for stats parity with the chunk path.
1267    if chain.as_slice() == [LlmBackendKind::None] {
1268        let out: Vec<Vec<f32>> = texts.iter().map(|_| Vec::new()).collect();
1269        return Ok((
1270            out,
1271            EmbedCacheStats {
1272                requested: texts.len(),
1273                hits: 0,
1274                misses: texts.len(),
1275            },
1276        ));
1277    }
1278
1279    // Cache model label reflects the EFFECTIVE embedding backend. When the
1280    // chain actually routes through OpenRouter, vectors carry that model's
1281    // dim/MRL profile and must never collide with codex-produced vectors;
1282    // for the local path we keep the prior `model_label()` so the in-process
1283    // cache key is unchanged (no regression — this cache is process-local).
1284    let routed_openrouter =
1285        chain.first() == Some(&LlmBackendKind::OpenRouter) && is_openrouter_initialized();
1286    let model = if routed_openrouter {
1287        format!("openrouter:{}", crate::constants::embedding_dim())
1288    } else {
1289        get_embedder(models_dir)?.lock().model_label()
1290    };
1291    let cache = entity_embed_cache();
1292    let mut hits: Vec<Option<Arc<Vec<f32>>>> = vec![None; texts.len()];
1293    let mut miss_indices: Vec<usize> = Vec::with_capacity(texts.len());
1294    {
1295        let guard = cache.lock();
1296        for (i, text) in texts.iter().enumerate() {
1297            let key = entity_cache_key(&model, text);
1298            if let Some(v) = guard.get(&key) {
1299                hits[i] = Some(Arc::clone(v));
1300            } else {
1301                miss_indices.push(i);
1302            }
1303        }
1304    }
1305    let miss_count = miss_indices.len();
1306    if miss_count > 0 {
1307        let miss_texts: Vec<String> = miss_indices.iter().map(|&i| texts[i].clone()).collect();
1308        // GAP-OR-ENTITY-EMBED: route misses through the backend-aware batch
1309        // helper (same one the chunk path uses). With OpenRouter this hits the
1310        // REST `embed_batch` (~200ms) instead of the codex subprocess (~120s).
1311        let miss_vecs = embed_passages_parallel_with_embedding_choice(
1312            models_dir,
1313            &miss_texts,
1314            parallelism,
1315            entity_embed_batch_size(),
1316            embedding_backend,
1317            llm_backend,
1318        )?;
1319        let mut guard = cache.lock();
1320        for (slot, &orig_idx) in miss_indices.iter().enumerate() {
1321            let vec = Arc::new(miss_vecs[slot].clone());
1322            let key = entity_cache_key(&model, &texts[orig_idx]);
1323            guard.insert(key, Arc::clone(&vec));
1324            hits[orig_idx] = Some(vec);
1325        }
1326    }
1327    let mut out = Vec::with_capacity(texts.len());
1328    for hit in hits.into_iter() {
1329        let v = hit.ok_or_else(|| {
1330            AppError::Embedding("entity embed cache produced null result".to_string())
1331        })?;
1332        out.push((*v).clone());
1333    }
1334    Ok((
1335        out,
1336        EmbedCacheStats {
1337            requested: texts.len(),
1338            hits: texts.len() - miss_count,
1339            misses: miss_count,
1340        },
1341    ))
1342}
1343
/// G56: stats snapshot returned by [`embed_entity_texts_cached`].
///
/// As filled in by that function, `hits + misses == requested`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, serde::Serialize)]
pub struct EmbedCacheStats {
    /// Number of texts passed to the call.
    pub requested: usize,
    /// Number of texts served from the in-process cache.
    pub hits: usize,
    /// Number of texts that were not found in the cache.
    pub misses: usize,
}
1351
1352impl EmbedCacheStats {
1353    /// Hit rate as a fraction in `[0.0, 1.0]`. Returns 0.0 when nothing was requested.
1354    pub fn hit_rate(&self) -> f64 {
1355        if self.requested == 0 {
1356            0.0
1357        } else {
1358            self.hits as f64 / self.requested as f64
1359        }
1360    }
1361}
1362
1363/// G42/S3 core: bounded parallel batch embedding.
1364///
1365/// - texts are grouped into batches of `batch_size` (one LLM call per
1366///   batch, G42/S2);
1367/// - at most `effective_permits(parallelism)` LLM subprocesses run
1368///   simultaneously (`Arc<Semaphore>` + `acquire_owned`, BLOCO 2);
1369/// - results stream through a BOUNDED mpsc channel so the caller-side
1370///   collector applies backpressure and can persist incrementally
1371///   (BLOCO 5);
1372/// - the global `CancellationToken` aborts in-flight work on the first
1373///   signal; subprocesses die with their futures via `kill_on_drop`
1374///   (BLOCO 6).
1375pub fn embed_texts_parallel(
1376    embedder: &Mutex<LlmEmbedding>,
1377    texts: &[String],
1378    parallelism: usize,
1379    batch_size: usize,
1380) -> Result<Vec<Vec<f32>>, AppError> {
1381    let mut slots: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
1382    embed_texts_parallel_with(embedder, texts, parallelism, batch_size, |idx, v| {
1383        slots[idx] = Some(v.to_vec());
1384        Ok(())
1385    })?;
1386    let mut out = Vec::with_capacity(slots.len());
1387    for (idx, slot) in slots.into_iter().enumerate() {
1388        out.push(slot.ok_or_else(|| {
1389            AppError::Embedding(format!("embedding fan-out lost item index {idx}"))
1390        })?);
1391    }
1392    Ok(out)
1393}
1394
/// Like [`embed_texts_parallel`] but invokes `on_result` as soon as each
/// embedding arrives (BLOCK 5: incremental persistence — a kill loses at
/// most the in-flight batches, never the already-delivered items).
///
/// `on_result` receives the index of the text within `texts` and its
/// vector.
///
/// Fast paths:
/// - empty `texts` returns `Ok(())` without calling `on_result`;
/// - a single text is embedded directly via `embed_passage` and delivered
///   with index 0, bypassing the batch fan-out.
///
/// Otherwise the texts are grouped with `build_batches` and handed to
/// `run_bounded` together with the permit count from
/// `effective_permits(parallelism)` and the global cancellation token.
pub fn embed_texts_parallel_with(
    embedder: &Mutex<LlmEmbedding>,
    texts: &[String],
    parallelism: usize,
    batch_size: usize,
    mut on_result: impl FnMut(usize, &[f32]) -> Result<(), AppError>,
) -> Result<(), AppError> {
    if texts.is_empty() {
        return Ok(());
    }
    let dim = crate::constants::embedding_dim();
    if texts.len() == 1 {
        let v = embed_passage(embedder, &texts[0])?;
        return on_result(0, &v);
    }

    // Take a clone of the client up front so the embedder mutex is not
    // needed while the batches run.
    let client = clone_client(embedder);
    let permits = effective_permits(parallelism);
    // `.max(1)` guards against a zero batch size.
    let batches = build_batches(texts, batch_size.max(1));
    let token = crate::cancel_token().clone();

    // Per-batch unit of work: each invocation gets its own client clone so
    // the returned future owns everything it needs.
    let work = move |batch: Vec<(usize, String)>| {
        let client = client.clone();
        async move {
            client
                .embed_batch_async(crate::constants::PASSAGE_PREFIX, &batch)
                .await
        }
    };

    let fan_out = run_bounded(batches, permits, dim, token, work, &mut on_result);
    // Drive the future on the ambient tokio runtime when one exists,
    // otherwise on the shared runtime.
    // NOTE(review): `block_in_place` is documented to panic on a
    // current-thread runtime — confirm callers only reach this from a
    // multi-thread runtime or from outside any runtime.
    match tokio::runtime::Handle::try_current() {
        Ok(handle) => tokio::task::block_in_place(|| handle.block_on(fan_out)),
        Err(_) => shared_runtime()?.block_on(fan_out),
    }
}
1434
/// Groups `(global_index, text)` pairs into batches of `batch_size`.
///
/// `global_index` is the position of the text in `texts`, so results can be
/// mapped back to their input regardless of which batch carried them. The
/// last batch may be shorter than `batch_size`. A `batch_size` of 0 is
/// treated as 1 (`slice::chunks` would panic on 0).
fn build_batches(texts: &[String], batch_size: usize) -> Vec<Vec<(usize, String)>> {
    let batch_size = batch_size.max(1);
    texts
        .chunks(batch_size)
        .enumerate()
        .map(|(batch_no, chunk)| {
            // First global index covered by this batch.
            let base = batch_no * batch_size;
            chunk
                .iter()
                .enumerate()
                // Each text is cloned exactly once, straight into its batch.
                .map(|(offset, text)| (base + offset, text.clone()))
                .collect()
        })
        .collect()
}
1446
1447/// G42/S3 BLOCO 2: effective permit count.
1448///
1449/// `permits = clamp(requested, 1, 32) ∧ cpus ∧ ram_livre*0.5/RSS` — see
1450/// the module docs for the measured RSS rationale.
1451pub fn effective_permits(requested: usize) -> usize {
1452    let cpus = std::thread::available_parallelism()
1453        .map(|n| n.get())
1454        .unwrap_or(4);
1455    let by_ram = ((crate::memory_guard::available_memory_mb() / 2)
1456        / crate::constants::LLM_WORKER_RSS_MB)
1457        .max(1) as usize;
1458    requested.clamp(1, 32).min(cpus).min(by_ram).max(1)
1459}
1460
1461/// Bounded fan-out engine. Generic over the per-batch work so the
1462/// concurrency contract is testable without spawning real LLMs.
1463///
1464/// Cancel safety (BLOCO 6/10): every task races its work against
1465/// `token.cancelled()` inside `tokio::select!`; both branches are
1466/// cancel-safe (the work future owns its subprocess via `kill_on_drop`,
1467/// and `cancelled()` is pure). On collector-side errors the `JoinSet`
1468/// is shut down, which drops in-flight futures and kills their
1469/// subprocesses.
1470async fn run_bounded<F, Fut>(
1471    batches: Vec<Vec<(usize, String)>>,
1472    permits: usize,
1473    dim: usize,
1474    token: CancellationToken,
1475    work: F,
1476    on_result: &mut impl FnMut(usize, &[f32]) -> Result<(), AppError>,
1477) -> Result<(), AppError>
1478where
1479    F: Fn(Vec<(usize, String)>) -> Fut + Clone + Send + 'static,
1480    Fut: std::future::Future<Output = Result<Vec<(usize, Vec<f32>)>, AppError>> + Send,
1481{
1482    let total_batches = batches.len();
1483    let semaphore = Arc::new(Semaphore::new(permits));
1484    // BLOCO 5: bounded channel — producers block when the collector is
1485    // behind (backpressure); PROIBIDO unbounded_channel between stages.
1486    let (tx, mut rx) = mpsc::channel::<Result<Vec<(usize, Vec<f32>)>, AppError>>(permits * 2);
1487    let mut set: JoinSet<()> = JoinSet::new();
1488
1489    for (batch_idx, batch) in batches.into_iter().enumerate() {
1490        let sem = Arc::clone(&semaphore);
1491        let token = token.clone();
1492        let tx = tx.clone();
1493        let work = work.clone();
1494        set.spawn(async move {
1495            let wait_start = std::time::Instant::now();
1496            // acquire_owned: RAII permit moved into the task; returned
1497            // on every exit path INCLUDING panic (BLOCO 2).
1498            let Ok(_permit) = sem.acquire_owned().await else {
1499                let _ = tx
1500                    .send(Err(AppError::Embedding("semaphore closed".to_string())))
1501                    .await;
1502                return;
1503            };
1504            let permit_wait_ms = wait_start.elapsed().as_millis() as u64;
1505            let work_start = std::time::Instant::now();
1506            // ADR-0034: when `SQLITE_GRAPHRAG_IGNORE_SHUTDOWN=1` is set the
1507            // cancellation arm is dropped and the batch runs to completion.
1508            // This unblocks audit/test invocations whose `SHUTDOWN` flag was
1509            // contaminated by an earlier signal handler in the same process
1510            // tree. Production code never sees this branch.
1511            let outcome = if crate::should_obey_shutdown() {
1512                tokio::select! {
1513                    res = work(batch) => res,
1514                    _ = token.cancelled() => Err(AppError::Embedding(
1515                        "embedding cancelled by shutdown signal".to_string(),
1516                    )),
1517                }
1518            } else {
1519                work(batch).await
1520            };
1521            // BLOCO 8: permit wait time logged SEPARATELY from work time.
1522            tracing::debug!(
1523                target: "embedding",
1524                batch_idx,
1525                permit_wait_ms,
1526                work_ms = work_start.elapsed().as_millis() as u64,
1527                ok = outcome.is_ok(),
1528                "embedding batch finished"
1529            );
1530            let _ = tx.send(outcome).await;
1531        });
1532    }
1533    drop(tx);
1534
1535    let mut completed = 0usize;
1536    let mut failed = 0usize;
1537    let mut cancelled = 0usize;
1538    let mut first_error: Option<AppError> = None;
1539
1540    while let Some(message) = rx.recv().await {
1541        match message {
1542            Ok(items) => {
1543                completed += 1;
1544                if first_error.is_none() {
1545                    for (idx, v) in items {
1546                        if v.len() != dim {
1547                            first_error = Some(AppError::Embedding(format!(
1548                                "LLM returned {} dims for item {idx}, expected {dim}; \
1549                                 refusing to truncate or pad silently (G42/C5)",
1550                                v.len()
1551                            )));
1552                            break;
1553                        }
1554                        if let Err(e) = on_result(idx, &v) {
1555                            first_error = Some(e);
1556                            break;
1557                        }
1558                    }
1559                    if first_error.is_some() {
1560                        // Abort remaining work: dropped futures kill
1561                        // their subprocesses via kill_on_drop (BLOCO 6).
1562                        set.shutdown().await;
1563                    }
1564                }
1565            }
1566            Err(e) => {
1567                if matches!(&e, AppError::Embedding(msg) if msg.contains("cancelled")) {
1568                    cancelled += 1;
1569                } else {
1570                    failed += 1;
1571                }
1572                if first_error.is_none() {
1573                    first_error = Some(e);
1574                    set.shutdown().await;
1575                }
1576            }
1577        }
1578    }
1579
1580    // Drain the JoinSet: surface panics distinctly (panic handling —
1581    // JoinError::is_panic tratado em todo join_next, BLOCO 9).
1582    while let Some(join_result) = set.join_next().await {
1583        if let Err(join_err) = join_result {
1584            if join_err.is_panic() {
1585                failed += 1;
1586                if first_error.is_none() {
1587                    first_error = Some(AppError::Embedding(format!(
1588                        "embedding task panicked: {join_err}"
1589                    )));
1590                }
1591            } else {
1592                cancelled += 1;
1593            }
1594        }
1595    }
1596
1597    // v1.0.85 (ADR-0043 hygiene): the fan-out summary event moved
1598    // from `tracing::info!` to `tracing::debug!` and the
1599    // `available_permits` field was removed — the user prohibited
1600    // pool-state telemetry (slot_pool_stats / slot_wait_ms) and
1601    // decorative `tracing::info!` events. The remaining counters
1602    // (total_batches / completed / failed / cancelled) describe the
1603    // progress of the operation itself, not the slot pool, and
1604    // remain visible to operators running with `RUST_LOG=debug` or
1605    // `-vvv`.
1606    tracing::debug!(
1607        target: "embedding",
1608        total_batches,
1609        completed,
1610        failed,
1611        cancelled,
1612        "embedding fan-out finished"
1613    );
1614
1615    match first_error {
1616        Some(e) => Err(e),
1617        None => Ok(()),
1618    }
1619}
1620
/// Serialises a vector of `f32` into a flat byte buffer, 4 little-endian
/// bytes per value, in input order. The inverse is [`bytes_to_f32`].
pub fn f32_to_bytes(v: &[f32]) -> Vec<u8> {
    // Exact final size is known up front: 4 bytes per float.
    let mut out = Vec::with_capacity(v.len() * 4);
    out.extend(v.iter().flat_map(|f| f.to_le_bytes()));
    out
}
1628
/// Decodes a byte buffer produced by [`f32_to_bytes`]: every 4 bytes are
/// read as one little-endian `f32`.
///
/// Trailing bytes that do not form a complete 4-byte group are ignored,
/// so a buffer whose length is not a multiple of 4 yields
/// `bytes.len() / 4` values rather than an error.
pub fn bytes_to_f32(bytes: &[u8]) -> Vec<f32> {
    bytes
        .chunks_exact(4)
        // `chunks_exact(4)` guarantees each chunk has exactly 4 bytes.
        .map(|quad| f32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]))
        .collect()
}
1636
1637/// Returns the dimensionality of the embedding space. Used to
1638/// validate LLM responses and to size the in-memory cache.
1639pub fn embedding_dim() -> usize {
1640    crate::constants::embedding_dim()
1641}
1642
1643/// G42/C5: a vector with a divergent dimensionality is an ERROR, never
1644/// silently truncated or zero-padded (the pre-v1.0.79 `normalise_dim`
1645/// masked malformed LLM responses).
1646fn validate_dim(v: Vec<f32>) -> Result<Vec<f32>, AppError> {
1647    let dim = crate::constants::embedding_dim();
1648    if v.len() != dim {
1649        return Err(AppError::Embedding(format!(
1650            "embedding has {} dims, expected {dim}; \
1651             refusing to truncate or pad silently (G42/C5)",
1652            v.len()
1653        )));
1654    }
1655    Ok(v)
1656}
1657
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Builds the multi-thread runtime the fan-out tests drive
    /// `run_bounded` on.
    fn test_runtime(workers: usize) -> tokio::runtime::Runtime {
        tokio::runtime::Builder::new_multi_thread()
            .worker_threads(workers)
            .enable_all()
            .build()
            .expect("test runtime")
    }

    #[test]
    fn reassemble_ordered_restores_input_order() {
        // GAP-OPENROUTER-REST-CONCURRENCY: the bounded JoinSet fan-out
        // completes chunks out of order, so parts arrive shuffled. The
        // reassembly MUST restore the exact input order by chunk index.
        let parts = vec![
            (2, vec![vec![2.0_f32], vec![2.1]]),
            (0, vec![vec![0.0], vec![0.1]]),
            (1, vec![vec![1.0], vec![1.1]]),
        ];
        let out = reassemble_ordered(parts);
        assert_eq!(
            out,
            vec![
                vec![0.0_f32],
                vec![0.1],
                vec![1.0],
                vec![1.1],
                vec![2.0],
                vec![2.1],
            ]
        );
    }

    #[test]
    fn f32_to_bytes_roundtrip() {
        let input = vec![0.0_f32, 1.5, -2.25, f32::MIN, f32::MAX];
        let bytes = f32_to_bytes(&input);
        assert_eq!(bytes.len(), input.len() * 4);
        let out = bytes_to_f32(&bytes);
        assert_eq!(out, input);
    }

    #[test]
    fn validate_dim_rejects_divergent_vectors() {
        // G42/C5 acceptance criterion: a divergent vector MUST fail —
        // never be silently normalised.
        let dim = crate::constants::embedding_dim();
        let long = vec![0.0; dim + 10];
        assert!(validate_dim(long).is_err(), "longer vector must error");
        // NOTE(review): with dim == 1 the `.max(1)` makes `short` exactly
        // `dim` long and this assertion would fail — confirm the minimum
        // supported dim is above 1.
        let short = vec![0.0; dim.saturating_sub(1).max(1)];
        assert!(validate_dim(short).is_err(), "shorter vector must error");
        let exact = vec![0.0; dim];
        assert_eq!(validate_dim(exact).expect("exact dim must pass").len(), dim);
    }

    #[test]
    fn embedding_dim_matches_constants_source() {
        assert_eq!(embedding_dim(), crate::constants::embedding_dim());
    }

    #[test]
    fn build_batches_preserves_global_indices() {
        let texts: Vec<String> = (0..10).map(|i| format!("t{i}")).collect();
        let batches = build_batches(&texts, 4);
        assert_eq!(batches.len(), 3);
        assert_eq!(batches[0].len(), 4);
        assert_eq!(batches[2].len(), 2);
        assert_eq!(batches[2][1].0, 9);
        assert_eq!(batches[2][1].1, "t9");
    }

    #[test]
    fn effective_permits_clamps_to_bounds() {
        assert!(effective_permits(0) >= 1);
        assert!(effective_permits(1000) <= 32);
    }

    /// `n` single-item batches whose global index equals the batch number.
    fn test_batches(n: usize) -> Vec<Vec<(usize, String)>> {
        (0..n).map(|i| vec![(i, format!("t{i}"))]).collect()
    }

    /// All-zero vector of the requested dimensionality.
    fn dummy_vec(dim: usize) -> Vec<f32> {
        vec![0.0; dim]
    }

    /// G42 acceptance criterion: with N permits the measured peak of
    /// concurrent workers NEVER exceeds N, even with 10x more batches.
    #[test]
    fn concurrency_peak_never_exceeds_permits() {
        let permits = 4usize;
        let batches = test_batches(permits * 10);
        let dim = crate::constants::embedding_dim();
        let current = Arc::new(AtomicUsize::new(0));
        let peak = Arc::new(AtomicUsize::new(0));

        let current_c = Arc::clone(&current);
        let peak_c = Arc::clone(&peak);
        let work = move |batch: Vec<(usize, String)>| {
            let current = Arc::clone(&current_c);
            let peak = Arc::clone(&peak_c);
            async move {
                // Track how many workers are inside the critical window.
                let now = current.fetch_add(1, Ordering::SeqCst) + 1;
                peak.fetch_max(now, Ordering::SeqCst);
                tokio::time::sleep(std::time::Duration::from_millis(20)).await;
                current.fetch_sub(1, Ordering::SeqCst);
                Ok(batch
                    .into_iter()
                    .map(|(i, _)| (i, dummy_vec(dim)))
                    .collect())
            }
        };

        let mut delivered = 0usize;
        let rt = test_runtime(4);
        rt.block_on(run_bounded(
            batches,
            permits,
            dim,
            CancellationToken::new(),
            work,
            &mut |_idx, _v| {
                delivered += 1;
                Ok(())
            },
        ))
        .expect("fan-out must succeed");

        assert_eq!(delivered, permits * 10, "every item must be delivered");
        assert!(
            peak.load(Ordering::SeqCst) <= permits,
            "peak concurrency {} exceeded permits {permits}",
            peak.load(Ordering::SeqCst)
        );
    }

    /// G42 acceptance criterion: a panicking task returns its permit via
    /// RAII and surfaces as JoinError::is_panic, not a hang.
    #[test]
    fn panicking_task_returns_permit_and_surfaces_error() {
        let permits = 2usize;
        let batches = test_batches(4);
        let dim = crate::constants::embedding_dim();

        let work = move |batch: Vec<(usize, String)>| async move {
            if batch[0].0 == 1 {
                panic!("intentional test panic");
            }
            Ok(batch
                .into_iter()
                .map(|(i, _)| (i, dummy_vec(dim)))
                .collect())
        };

        let rt = test_runtime(2);
        let result = rt.block_on(run_bounded(
            batches,
            permits,
            dim,
            CancellationToken::new(),
            work,
            &mut |_idx, _v| Ok(()),
        ));

        let err = result.expect_err("panic must surface as an error");
        assert!(
            err.to_string().contains("panicked"),
            "error must mention the panic: {err}"
        );
    }

    /// G42 acceptance criterion: cancellation aborts in-flight work and
    /// the fan-out terminates within the shutdown timeout.
    #[test]
    fn cancellation_terminates_fan_out_quickly() {
        let permits = 2usize;
        let batches = test_batches(8);
        let dim = crate::constants::embedding_dim();
        let token = CancellationToken::new();

        let work = move |batch: Vec<(usize, String)>| async move {
            // Long enough that only cancellation can finish the test fast.
            tokio::time::sleep(std::time::Duration::from_secs(30)).await;
            Ok(batch
                .into_iter()
                .map(|(i, _)| (i, dummy_vec(dim)))
                .collect())
        };

        let rt = test_runtime(2);
        let cancel = token.clone();
        let start = std::time::Instant::now();
        let result = rt.block_on(async move {
            // Fire the cancellation shortly after the fan-out starts.
            tokio::spawn(async move {
                tokio::time::sleep(std::time::Duration::from_millis(50)).await;
                cancel.cancel();
            });
            run_bounded(batches, permits, dim, token, work, &mut |_idx, _v| Ok(())).await
        });

        assert!(result.is_err(), "cancelled fan-out must report an error");
        assert!(
            start.elapsed() < std::time::Duration::from_secs(10),
            "graceful shutdown must finish well under the work duration"
        );
    }

    /// G42 acceptance criterion: a divergent dim coming out of the work
    /// stage fails the fan-out instead of being silently accepted.
    #[test]
    fn fan_out_rejects_divergent_dim() {
        let permits = 2usize;
        let batches = test_batches(2);
        let dim = crate::constants::embedding_dim();

        let work = move |batch: Vec<(usize, String)>| async move {
            Ok(batch
                .into_iter()
                .map(|(i, _)| (i, vec![0.0f32; 3]))
                .collect::<Vec<(usize, Vec<f32>)>>())
        };

        let rt = test_runtime(2);
        let result = rt.block_on(run_bounded(
            batches,
            permits,
            dim,
            CancellationToken::new(),
            work,
            &mut |_idx, _v| Ok(()),
        ));

        let err = result.expect_err("divergent dim must fail the fan-out");
        assert!(err.to_string().contains("G42/C5"), "error cites C5: {err}");
    }

    /// G44: the calibration bases stay intact at the calibration dim.
    #[test]
    fn adaptive_batch_dim64_keeps_calibrated_sizes() {
        assert_eq!(adaptive_batch_for_dim(CHUNK_EMBED_BATCH_SIZE, 64), 8);
        assert_eq!(adaptive_batch_for_dim(ENTITY_EMBED_BATCH_SIZE, 64), 25);
    }

    /// G44: legacy 384-dim databases shrink to reliable batch sizes.
    #[test]
    fn adaptive_batch_dim384_shrinks() {
        assert_eq!(adaptive_batch_for_dim(CHUNK_EMBED_BATCH_SIZE, 384), 1);
        assert_eq!(adaptive_batch_for_dim(ENTITY_EMBED_BATCH_SIZE, 384), 4);
    }

    /// G44: intermediate dims scale proportionally to the float budget.
    #[test]
    fn adaptive_batch_intermediate_dims() {
        assert_eq!(adaptive_batch_for_dim(8, 128), 4);
        assert_eq!(adaptive_batch_for_dim(8, 256), 2);
    }

    /// G44: dims below the calibration dim never exceed the base.
    #[test]
    fn adaptive_batch_small_dim_clamps_to_base() {
        assert_eq!(adaptive_batch_for_dim(8, 8), 8);
    }

    /// G44: the function is total — no division by zero, no clamp panic.
    #[test]
    fn adaptive_batch_total_function() {
        assert_eq!(adaptive_batch_for_dim(8, 4096), 1);
        assert_eq!(adaptive_batch_for_dim(8, 0), 8);
        assert_eq!(adaptive_batch_for_dim(0, 64), 1);
    }

    /// G44 end-to-end: the public wrappers follow the env-dim override.
    #[test]
    #[serial_test::serial(env)]
    fn adaptive_wrappers_follow_env_dim() {
        std::env::set_var("SQLITE_GRAPHRAG_EMBEDDING_DIM", "384");
        let chunk = chunk_embed_batch_size();
        let entity = entity_embed_batch_size();
        // Restore the environment BEFORE asserting so a failure cannot
        // leak the override into other tests.
        std::env::remove_var("SQLITE_GRAPHRAG_EMBEDDING_DIM");
        crate::constants::set_active_embedding_dim(crate::constants::DEFAULT_EMBEDDING_DIM);
        assert_eq!(chunk, 1, "384-dim chunk batch must shrink to 1 (G44)");
        assert_eq!(entity, 4, "384-dim entity batch must shrink to 4 (G44)");
    }

    // ---------------------------------------------------------------
    // G58/S1: FallbackReason + try_embed_query_with_fallback tests
    // ---------------------------------------------------------------

    /// GAP-004 (v1.0.88): EmbeddingErrorKind::classify maps an OAuth
    /// error message to the OAuth variant regardless of case or
    /// surrounding text.
    #[test]
    fn embedding_error_kind_classify_oauth_message() {
        assert_eq!(
            EmbeddingErrorKind::classify("OAuth token expired for claude"),
            EmbeddingErrorKind::OAuth,
        );
        assert_eq!(
            EmbeddingErrorKind::classify("oauth authentication failed"),
            EmbeddingErrorKind::OAuth,
        );
    }

    /// GAP-004 (v1.0.88): EmbeddingErrorKind::classify maps a quota
    /// message to the Quota variant (without "OAuth" substring).
    #[test]
    fn embedding_error_kind_classify_quota_message() {
        assert_eq!(
            EmbeddingErrorKind::classify("quota exhausted on backend"),
            EmbeddingErrorKind::Quota,
        );
        assert_eq!(
            EmbeddingErrorKind::classify("Usage quota limit reached"),
            EmbeddingErrorKind::Quota,
        );
    }

    /// GAP-004 (v1.0.88): EmbeddingErrorKind::classify maps a slot-sema
    /// message to the SlotExhausted variant (matched BEFORE Quota so
    /// the more specific LLM-never-tried path wins).
    #[test]
    fn embedding_error_kind_classify_slot_exhausted_message() {
        assert_eq!(
            EmbeddingErrorKind::classify(
                "slot exhausted: failed to acquire LLM slot after backoff"
            ),
            EmbeddingErrorKind::SlotExhausted,
        );
    }

    /// GAP-004 (v1.0.88): EmbeddingErrorKind::classify maps a
    /// zero-dimensional vector error to the ZeroDimension variant.
    #[test]
    fn embedding_error_kind_classify_zero_dimension_message() {
        assert_eq!(
            EmbeddingErrorKind::classify("embedding returned dim=zero"),
            EmbeddingErrorKind::ZeroDimension,
        );
        assert_eq!(
            EmbeddingErrorKind::classify("got zero-dim vector from LLM"),
            EmbeddingErrorKind::ZeroDimension,
        );
    }

    /// GAP-004 (v1.0.88): EmbeddingErrorKind::classify falls back to
    /// the Unknown variant when no marker matches, and the code()
    /// accessor returns the kebab-safe discriminator string.
    #[test]
    fn embedding_error_kind_classify_unknown_fallback() {
        assert_eq!(
            EmbeddingErrorKind::classify("unrelated subprocess error"),
            EmbeddingErrorKind::Unknown,
        );
        assert_eq!(
            EmbeddingErrorKind::classify("rate limit hit"),
            EmbeddingErrorKind::Unknown,
        );
        // code() returns the stable discriminator string.
        assert_eq!(EmbeddingErrorKind::OAuth.code(), "oauth");
        assert_eq!(EmbeddingErrorKind::Quota.code(), "quota");
        assert_eq!(EmbeddingErrorKind::SlotExhausted.code(), "slot-exhausted");
        assert_eq!(
            EmbeddingErrorKind::BackendMismatch.code(),
            "backend-mismatch"
        );
        assert_eq!(EmbeddingErrorKind::ZeroDimension.code(), "zero-dimension");
        assert_eq!(EmbeddingErrorKind::Unknown.code(), "unknown");
    }

    /// Display impl formats the EmbeddingFailed, Cancelled and Timeout
    /// variants without panicking (the remaining variants are not
    /// exercised here).
    #[test]
    fn fallback_reason_display_does_not_panic() {
        let _ = FallbackReason::EmbeddingFailed("rate limit".into()).to_string();
        let _ = FallbackReason::Cancelled.to_string();
        let _ = FallbackReason::Timeout {
            operation: "embed_query".into(),
            duration_secs: 30,
        }
        .to_string();
    }

    /// FallbackReason is PartialEq — used in test assertions to verify
    /// the mapping rules.
    #[test]
    fn fallback_reason_is_partial_eq() {
        assert_eq!(
            FallbackReason::EmbeddingFailed("a".into()),
            FallbackReason::EmbeddingFailed("a".into())
        );
        assert_eq!(FallbackReason::Cancelled, FallbackReason::Cancelled);
        assert_ne!(
            FallbackReason::EmbeddingFailed("a".into()),
            FallbackReason::EmbeddingFailed("b".into())
        );
        assert_ne!(
            FallbackReason::Cancelled,
            FallbackReason::Timeout {
                operation: "x".into(),
                duration_secs: 1
            }
        );
    }

    /// Timeout variant preserves the operation name and duration from the
    /// original AppError::Timeout for observability.
    #[test]
    fn fallback_reason_timeout_preserves_fields() {
        let r = FallbackReason::Timeout {
            operation: "embed_query_local".into(),
            duration_secs: 300,
        };
        match r {
            FallbackReason::Timeout {
                operation,
                duration_secs,
            } => {
                assert_eq!(operation, "embed_query_local");
                assert_eq!(duration_secs, 300);
            }
            other => panic!("expected Timeout, got {other:?}"),
        }
    }

    /// try_embed_query_with_fallback surfaces an EmbeddingFailed variant
    /// when the LLM subprocess errors. Uses a path that surely does not
    /// contain any embedder configuration (the binary is invoked as
    /// `codex` / `claude` via PATH which, in tests, defaults to nothing
    /// in scope, so `LlmEmbedding::detect_available()` returns Err).
    #[test]
    #[ignore = "G58 S1 stub: requires env without codex/claude on PATH; tracked as T5 of Fase 2"]
    fn try_embed_query_with_fallback_surfaces_embedding_failed_for_missing_binary() {
        // Pointing at a models dir that does not exist forces the embedder
        // init to fail; the error is mapped to EmbeddingFailed.
        let bogus = std::path::Path::new("/nonexistent-models-dir-for-g58-fallback-test");
        let result = try_embed_query_with_fallback(bogus, "hello world");
        match result {
            Err(FallbackReason::EmbeddingFailed(msg)) => {
                // The original error must survive in the message for ops triage.
                assert!(!msg.is_empty(), "fallback message must not be empty");
            }
            // Any other failure reason is the wrong mapping for this setup.
            Err(other) => panic!("expected EmbeddingFailed, got {other:?}"),
            Ok(_) => {
                panic!("expected an error, got Ok — embedder must fail for bogus path");
            }
        }
    }

    // G56: entity embed cache — unit tests
    #[test]
    fn g56_entity_cache_key_is_stable_and_distinct() {
        let k1 = entity_cache_key("codex:default", "sqlite-graphrag");
        let k2 = entity_cache_key("codex:default", "sqlite-graphrag");
        let k3 = entity_cache_key("codex:default", "claude-code");
        let k4 = entity_cache_key("claude:default", "sqlite-graphrag");
        assert_eq!(k1, k2, "same model+text must hash identically");
        assert_ne!(k1, k3, "different text must hash differently");
        assert_ne!(k1, k4, "different model must hash differently");
    }

    #[test]
    fn g56_entity_embed_cache_stats_hit_rate() {
        let zero = EmbedCacheStats::default();
        assert_eq!(zero.hit_rate(), 0.0);
        let half = EmbedCacheStats {
            requested: 4,
            hits: 2,
            misses: 2,
        };
        assert!((half.hit_rate() - 0.5).abs() < 1e-9);
        let all = EmbedCacheStats {
            requested: 7,
            hits: 7,
            misses: 0,
        };
        assert!((all.hit_rate() - 1.0).abs() < 1e-9);
    }

    #[test]
    fn g56_entity_embed_cache_populates_and_hits() {
        // Manually populate the cache: bypasses the LLM by writing a
        // known vector under a chosen (model, text) key, then verifies
        // the cache is consulted before any LLM call would happen.
        let cache = entity_embed_cache();
        let model = "test-model";
        let text = "sqlite-graphrag";
        let key = entity_cache_key(model, text);
        let stored = Arc::new(vec![0.42_f32; crate::constants::embedding_dim()]);
        cache.lock().insert(key, Arc::clone(&stored));
        let guard = cache.lock();
        let hit = guard.get(&key).expect("cache must return stored value");
        assert_eq!(hit.len(), crate::constants::embedding_dim());
        assert!((hit[0] - 0.42).abs() < 1e-6);
    }

    #[test]
    fn g56_empty_texts_short_circuits_with_zero_stats() {
        // Cannot call embed_entity_texts_cached without an LLM on PATH,
        // so we only verify the empty-input contract via the stats struct.
        let stats = EmbedCacheStats::default();
        assert_eq!(stats.requested, 0);
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.misses, 0);
        assert_eq!(stats.hit_rate(), 0.0);
    }
}
2191
2192// =============================================================================
2193// v1.0.82 (GAP-005) — embed_with_fallback tests
2194// =============================================================================
#[cfg(test)]
mod embed_with_fallback_tests {
    // Unit tests for the LLM backend fallback chain: the `None` backend
    // short-circuit, chain exhaustion with and without `skip_on_failure`,
    // and the stable string forms of the backend / fallback-reason enums.
    use super::*;
    use crate::llm::exit_code_hints::LlmBackendError;

    #[test]
    fn none_backend_returns_empty_vector_without_calling_llm() {
        // The `None` backend short-circuits to `Ok(vec![])` without
        // touching the LLM at all. This is the signal the caller uses
        // to insert a `pending_embeddings` row.
        let (v, kind) = embed_via_backend(
            std::path::Path::new("/nonexistent"),
            "any text",
            &LlmBackendKind::None,
        )
        .expect("None backend never fails");
        assert!(v.is_empty());
        assert_eq!(kind, LlmBackendKind::None, "None backend must report None");
    }

    #[test]
    fn empty_chain_defaults_to_codex_claude_none() {
        // Internal invariant: the default chain order is the v1.0.81
        // implicit order (codex first, then claude, then None as
        // graceful-degradation fallback).
        //
        // NOTE(review): this only inspects a locally built array; it does
        // not read the chain the production code constructs — TODO confirm
        // whether it should be wired to the real default.
        let defaults = [
            LlmBackendKind::Codex,
            LlmBackendKind::Claude,
            LlmBackendKind::None,
        ];
        assert_eq!(defaults.len(), 3);
    }

    // ---------------------------------------------------------------
    // ADR-0042: as_str + reason_code unit tests
    // ---------------------------------------------------------------

    // These two tests used to be nested inside the body of
    // `empty_chain_defaults_to_codex_claude_none` behind
    // `#[allow(dead_code)]`, so the harness never executed them.

    #[test]
    fn llm_backend_kind_as_str_is_stable() {
        assert_eq!(LlmBackendKind::Codex.as_str(), "codex");
        assert_eq!(LlmBackendKind::Claude.as_str(), "claude");
        assert_eq!(LlmBackendKind::None.as_str(), "none");
    }

    #[test]
    fn fallback_reason_reason_code_is_stable() {
        assert_eq!(
            FallbackReason::EmbeddingFailed("any".into()).reason_code(),
            "embedding_failed"
        );
        assert_eq!(FallbackReason::Cancelled.reason_code(), "cancelled");
        assert_eq!(
            FallbackReason::Timeout {
                operation: "embed_query".into(),
                duration_secs: 30
            }
            .reason_code(),
            "timeout"
        );
    }

    #[test]
    fn embed_with_fallback_chain_of_only_none_aborts_without_skip_on_failure_v1088() {
        // ADR-0046 / BUG-11 v1.0.88: a fallback chain of only `[None]`
        // without `skip_on_failure=true` MUST abort with
        // `AppError::Embedding("no LLM backends available; fallback chain exhausted")`.
        //
        // Before BUG-11, the `None` tail returned `Ok((vec![], None))`
        // silently, which let `remember` persist a memory with a
        // zero-dimensional embedding (invisible to recall). The fix
        // routes the chain exhaustion through `embed_via_backend_strict`
        // so the caller can distinguish between "chain intentionally
        // degrades to skip" (skip_on_failure=true) and "chain has no
        // viable backend at all" (this test).
        let chain = vec![LlmBackendKind::None];
        let err = embed_with_fallback(
            std::path::Path::new("/nonexistent-models-dir-for-gap005-test"),
            "hello",
            &chain,
            false,
        )
        .expect_err("chain of only [None] without skip_on_failure MUST abort");
        let msg = format!("{err}");
        assert!(
            msg.contains("no LLM backends available"),
            "error must mention exhausted chain, got: {msg}"
        );
    }

    #[test]
    fn embed_with_fallback_skip_on_failure_with_only_none_returns_empty() {
        // skip_on_failure=true + a chain of only `None` returns
        // `Ok((vec![], LlmBackendKind::None))`: the caller has opted in to
        // graceful degradation. This is the counterpart of the strict
        // test above, where the same chain with skip_on_failure=false
        // must abort.
        let chain = vec![LlmBackendKind::None];
        let v = embed_with_fallback(
            std::path::Path::new("/nonexistent-models-dir-for-gap005-test"),
            "hello",
            &chain,
            true,
        )
        .expect("[None] chain with skip_on_failure=true must be Ok");
        assert!(v.0.is_empty(), "vector must be empty");
        assert_eq!(v.1, LlmBackendKind::None);
    }

    #[test]
    fn llm_backend_error_no_backends_default_message() {
        // The fallback chain exhaustion error must mention
        // `--llm-fallback` in its hint so the operator knows the
        // remediation. Previously marked `#[allow(dead_code)]` instead of
        // `#[test]`, so it was compiled but never run.
        let e = LlmBackendError::NoBackendsAvailable;
        let h = e.hint();
        assert!(h.contains("--llm-fallback"));
    }

    #[test]
    fn llm_backend_error_nonzero_exit_carries_stderr_tail() {
        // The rendered error must name the binary, include the stderr
        // tail, and surface either the signal or the exit code.
        let e = LlmBackendError::NonZeroExit {
            exit_code: Some(137),
            signal: Some(9),
            stdout_tail: "out".into(),
            stderr_tail: "OOM killed".into(),
            binary: "codex".into(),
            hint: "OOM".into(),
        };
        let s = e.to_string();
        assert!(s.contains("codex"));
        assert!(s.contains("OOM killed"));
        assert!(s.contains("signal 9") || s.contains("exit 137"));
    }
}