sqlite_graphrag/
embedder.rs

1//! Embedding generation for the GraphRAG memory.
2//!
3//! v1.0.76: the default build is **LLM-only** — the binary does NOT bundle
4//! fastembed / ort / ndarray / tokenizers. All embeddings are produced
5//! by a headless invocation of `claude code` or `codex` (OAuth, no MCP,
6//! no hooks) and stored as a BLOB in `memory_embeddings(memory_id, embedding,
7//! source)`. Vector similarity is computed in pure Rust at query time.
8//!
9//! # Workload classification (G42/S3, BLOCK 1 — MANDATORY)
10//!
11//! LLM embedding is **I/O-bound + subprocess-bound**: each call waits
12//! 5-60s on a network round-trip through a headless `claude -p` /
13//! `codex exec` subprocess while the local CPU stays idle. Concurrency
14//! therefore uses **tokio** (async I/O concurrency) and NEVER rayon
15//! (reserved for CPU-bound work).
16//!
//! # Permit formula (G42/S3, BLOCK 2)
18//!
19//! ```text
20//! permits = clamp(--llm-parallelism, 1, 32)
21//!           .min(available_parallelism())
22//!           .min(available_ram_mb * 0.5 / LLM_WORKER_RSS_MB)
23//! ```
24//!
25//! `LLM_WORKER_RSS_MB = 350` (`crate::constants`): `claude -p` and
26//! `codex exec` are node processes with a typical Maximum RSS of
27//! 200-400 MB (measured via `/usr/bin/time -l` on macOS /
28//! `/usr/bin/time -v` on Linux), so the RAM bound is pertinent.
29//!
30//! # Locking contract (G42/A3 fix)
31//!
32//! The process-wide `Mutex<LlmEmbedding>` protects ONLY the cheap clone
33//! of the client configuration (flavour + binary path + model + shared
34//! schema tempfiles). It is NEVER held across network I/O — the
35//! v1.0.76-v1.0.78 `flush_group` held it for the whole sequential
36//! embedding loop, which is why `--llm-parallelism 8` measured an
37//! effective parallelism of 1.
38
39use crate::errors::AppError;
40use crate::extract::llm_embedding::LlmEmbedding;
41use parking_lot::Mutex;
42use std::path::Path;
43use std::sync::Arc;
44use std::sync::OnceLock;
45use tokio::sync::{mpsc, Semaphore};
46use tokio::task::JoinSet;
47use tokio_util::sync::CancellationToken;
48
/// Process-wide Claude-backed LLM-embedding client behind a `Mutex`.
///
/// The lock guards configuration cloning only (see module docs); the
/// actual LLM I/O happens on clones, outside the lock.
///
/// ADR-0042 / GAP-002: process-wide Claude-backed LLM-embedding client
/// behind a `Mutex`. Distinct from `EMBEDDER` so the Claude path of
/// `embed_via_backend` no longer re-probes PATH via `detect_available`
/// (the v1.0.82 bug where requesting Claude could resolve to Codex).
static CLAUDE_EMBEDDER: OnceLock<Mutex<LlmEmbedding>> = OnceLock::new();
/// Process-wide OpenCode-backed LLM-embedding client, set once by
/// `get_opencode_embedder`.
static OPENCODE_EMBEDDER: OnceLock<Mutex<LlmEmbedding>> = OnceLock::new();
/// Process-wide OpenRouter embeddings client, set once by
/// `get_openrouter_embedder`.
static OPENROUTER_CLIENT: OnceLock<crate::embedding_api::OpenRouterClient> = OnceLock::new();

/// v1.0.95 (ADR-0054): process-wide OpenRouter chat-completions client for
/// the `enrich` JUDGE. Distinct from `OPENROUTER_CLIENT` (embeddings) because
/// the chat client binds a text model, not an embedding model.
static OPENROUTER_CHAT_CLIENT: OnceLock<crate::chat_api::OpenRouterChatClient> = OnceLock::new();
66
67/// v1.0.93: check whether the OpenRouter client has been initialised.
68pub fn is_openrouter_initialized() -> bool {
69    OPENROUTER_CLIENT.get().is_some()
70}
/// Process-wide default LLM-embedding client, populated on first use by
/// `get_embedder` with the backend returned by
/// `LlmEmbedding::detect_available`.
static EMBEDDER: OnceLock<Mutex<LlmEmbedding>> = OnceLock::new();

/// Process-wide multi-thread tokio runtime for embedding I/O.
///
/// G42/A2 fix: v1.0.76-v1.0.78 built a current-thread runtime PER CALL.
/// One runtime per process amortises the setup and hosts the bounded
/// fan-out of `embed_texts_parallel`.
static RUNTIME: OnceLock<tokio::runtime::Runtime> = OnceLock::new();
79
/// Calibration base: chunk (long-text) batch size per LLM call at the
/// calibration dimensionality (G42/S2). Use [`chunk_embed_batch_size`]
/// for the dim-adaptive value (G44).
pub const CHUNK_EMBED_BATCH_SIZE: usize = 8;

/// Calibration base: entity-name (short-text) batch size per LLM call at
/// the calibration dimensionality (G42/S2). Use [`entity_embed_batch_size`]
/// for the dim-adaptive value (G44).
pub const ENTITY_EMBED_BATCH_SIZE: usize = 25;

/// Dimensionality the batch bases above were calibrated against (G44).
pub const EMBED_BATCH_CALIBRATION_DIM: usize = 64;

/// G44: scales a calibration-base batch size to the active dimensionality,
/// keeping the float budget per LLM call constant (~512 floats for chunks,
/// ~1600 for entity names — the budgets empirically validated at dim 64).
/// Fixed batches of 8 at 384 dims asked for ~3072 floats per response:
/// claude returned partial coverage (3 of 8 items, caught by the G42/C5
/// check) and codex timed out at 300s.
///
/// The function is total:
/// - `base` and `dim` are both floored at 1, so there is no division by
///   zero and `clamp` never sees an upper bound below its lower bound
///   (which would panic);
/// - the product is computed in `u128`, so `base * EMBED_BATCH_CALIBRATION_DIM`
///   cannot overflow `usize` (panic in debug builds, silent wrap in release).
///
/// Returns a value in `1..=max(base, 1)`.
fn adaptive_batch_for_dim(base: usize, dim: usize) -> usize {
    let base = base.max(1);
    let dim = dim.max(1);
    // usize -> u128 is a lossless widening on every supported target.
    let scaled = (base as u128) * (EMBED_BATCH_CALIBRATION_DIM as u128) / (dim as u128);
    // The clamp caps the value at `base`, so narrowing back to usize is exact.
    scaled.clamp(1, base as u128) as usize
}
104
105/// Dim-adaptive batch size for chunk (long-text) embedding calls (G44).
106pub fn chunk_embed_batch_size() -> usize {
107    let dim = crate::constants::embedding_dim();
108    let batch = adaptive_batch_for_dim(CHUNK_EMBED_BATCH_SIZE, dim);
109    tracing::debug!(
110        dim,
111        base = CHUNK_EMBED_BATCH_SIZE,
112        batch,
113        "adaptive chunk batch size (G44)"
114    );
115    batch
116}
117
118/// Dim-adaptive batch size for entity-name (short-text) embedding calls (G44).
119pub fn entity_embed_batch_size() -> usize {
120    let dim = crate::constants::embedding_dim();
121    let batch = adaptive_batch_for_dim(ENTITY_EMBED_BATCH_SIZE, dim);
122    tracing::debug!(
123        dim,
124        base = ENTITY_EMBED_BATCH_SIZE,
125        batch,
126        "adaptive entity batch size (G44)"
127    );
128    batch
129}
130
131/// Returns the process-wide multi-thread runtime, building it on first use.
132pub(crate) fn shared_runtime() -> Result<&'static tokio::runtime::Runtime, AppError> {
133    if let Some(rt) = RUNTIME.get() {
134        return Ok(rt);
135    }
136    let rt = tokio::runtime::Builder::new_multi_thread()
137        .worker_threads(2)
138        .enable_all()
139        .build()
140        .map_err(|e| AppError::Embedding(format!("tokio runtime init failed: {e}")))?;
141    let _ = RUNTIME.set(rt);
142    RUNTIME.get().ok_or_else(|| {
143        AppError::Embedding("tokio runtime unavailable after initialisation".to_string())
144    })
145}
146
147/// Initialises the LLM-embedding client on first use and returns it.
148pub fn get_embedder(_models_dir: &Path) -> Result<&'static Mutex<LlmEmbedding>, AppError> {
149    if let Some(e) = EMBEDDER.get() {
150        return Ok(e);
151    }
152    let backend = LlmEmbedding::detect_available()?;
153    let _ = EMBEDDER.set(Mutex::new(backend));
154    EMBEDDER
155        .get()
156        .ok_or_else(|| AppError::Embedding("embedder unavailable after initialisation".to_string()))
157}
158
159/// ADR-0042 / GAP-002: returns the process-wide Claude embedder, lazily
160/// initialising it on first use. Binary and model overrides come from
161/// the explicit arguments; `None` falls back to PATH/env defaults via
162/// the builder.
163pub fn get_claude_embedder(
164    claude_binary: Option<&Path>,
165    claude_model: Option<&str>,
166) -> Result<&'static Mutex<LlmEmbedding>, AppError> {
167    if let Some(e) = CLAUDE_EMBEDDER.get() {
168        return Ok(e);
169    }
170    let mut builder = LlmEmbedding::with_claude_builder();
171    if let Some(b) = claude_binary {
172        builder = builder.override_binary(b.to_path_buf());
173    }
174    if let Some(m) = claude_model {
175        builder = builder.override_model(m.to_string());
176    }
177    let backend = builder.build()?;
178    let _ = CLAUDE_EMBEDDER.set(Mutex::new(backend));
179    CLAUDE_EMBEDDER.get().ok_or_else(|| {
180        AppError::Embedding("claude embedder unavailable after initialisation".to_string())
181    })
182}
183
184/// GAP-OPENCODE-001 / v1.0.90: returns the process-wide OpenCode embedder,
185/// lazily initialising it on first use. Binary and model overrides come
186/// from the explicit arguments; `None` falls back to PATH/env defaults via
187/// the builder.
188pub fn get_opencode_embedder(
189    opencode_binary: Option<&Path>,
190    opencode_model: Option<&str>,
191) -> Result<&'static Mutex<LlmEmbedding>, AppError> {
192    if let Some(e) = OPENCODE_EMBEDDER.get() {
193        return Ok(e);
194    }
195    let mut builder = LlmEmbedding::with_opencode_builder();
196    if let Some(b) = opencode_binary {
197        builder = builder.override_binary(b.to_path_buf());
198    }
199    if let Some(m) = opencode_model {
200        builder = builder.override_model(m.to_string());
201    }
202    let backend = builder.build()?;
203    let _ = OPENCODE_EMBEDDER.set(Mutex::new(backend));
204    OPENCODE_EMBEDDER.get().ok_or_else(|| {
205        AppError::Embedding("opencode embedder unavailable after initialisation".to_string())
206    })
207}
208
209pub fn get_openrouter_embedder(
210    api_key: secrecy::SecretBox<String>,
211    model: &str,
212    dim: usize,
213) -> Result<&'static crate::embedding_api::OpenRouterClient, AppError> {
214    if let Some(c) = OPENROUTER_CLIENT.get() {
215        return Ok(c);
216    }
217    let client = crate::embedding_api::OpenRouterClient::new(api_key, model.to_string(), dim)?;
218    let _ = OPENROUTER_CLIENT.set(client);
219    OPENROUTER_CLIENT.get().ok_or_else(|| {
220        AppError::Embedding("openrouter client unavailable after initialisation".to_string())
221    })
222}
223
224/// v1.0.95 (ADR-0054): initialises the process-wide OpenRouter chat client on
225/// first use and returns it. `model` is the text model the enrich JUDGE will
226/// call (no default; the caller validates presence upfront).
227pub fn get_openrouter_chat_client(
228    api_key: secrecy::SecretBox<String>,
229    model: &str,
230    timeout_secs: u64,
231) -> Result<&'static crate::chat_api::OpenRouterChatClient, AppError> {
232    if let Some(c) = OPENROUTER_CHAT_CLIENT.get() {
233        return Ok(c);
234    }
235    let client =
236        crate::chat_api::OpenRouterChatClient::new(api_key, model.to_string(), timeout_secs)?;
237    let _ = OPENROUTER_CHAT_CLIENT.set(client);
238    OPENROUTER_CHAT_CLIENT.get().ok_or_else(|| {
239        AppError::Embedding("openrouter chat client unavailable after initialisation".to_string())
240    })
241}
242
243/// v1.0.95: returns the process-wide OpenRouter chat client if it has already
244/// been initialised via [`get_openrouter_chat_client`]. Used by the enrich
245/// JUDGE dispatch, which initialises the singleton once at startup and then
246/// fetches it per item without re-threading the API key.
247pub fn openrouter_chat_client() -> Option<&'static crate::chat_api::OpenRouterChatClient> {
248    OPENROUTER_CHAT_CLIENT.get()
249}
250
251/// ADR-0042 / GAP-002: route a single passage through the Claude
252/// embedder. Used by the Claude arm of `embed_via_backend` so the
253/// fallback chain stops treating Claude as a synonym for codex.
254pub fn embed_via_claude_local(
255    _models_dir: &Path,
256    text: &str,
257    claude_binary: Option<&Path>,
258    claude_model: Option<&str>,
259) -> Result<Vec<f32>, AppError> {
260    let _slot_guard = acquire_llm_slot_for_embedding()?;
261    let embedder = get_claude_embedder(claude_binary, claude_model)?;
262    embed_passage(embedder, text)
263}
264
265/// BUG-003 / v1.0.85: split of  that also
266/// reports the resolved []. Always  because
267/// this path constructs a Claude-flavoured embedder via
268///  (no PATH probe, no silent substitution).
269pub fn embed_via_claude_local_resolved(
270    _models_dir: &Path,
271    text: &str,
272    claude_binary: Option<&Path>,
273    claude_model: Option<&str>,
274) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
275    let _slot_guard = acquire_llm_slot_for_embedding()?;
276    let embedder = get_claude_embedder(claude_binary, claude_model)?;
277    let v = embed_passage(embedder, text)?;
278    Ok((v, LlmBackendKind::Claude))
279}
280
281/// GAP-OPENCODE-001 / v1.0.90: route a single passage through the OpenCode
282/// embedder, reporting the resolved [`LlmBackendKind::Opencode`]. Constructs
283/// an OpenCode-flavoured embedder via `with_opencode_builder` (no PATH probe,
284/// no silent substitution).
285pub fn embed_via_opencode_local_resolved(
286    _models_dir: &Path,
287    text: &str,
288    opencode_binary: Option<&Path>,
289    opencode_model: Option<&str>,
290) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
291    let _slot_guard = acquire_llm_slot_for_embedding()?;
292    let embedder = get_opencode_embedder(opencode_binary, opencode_model)?;
293    let v = embed_passage(embedder, text)?;
294    Ok((v, LlmBackendKind::Opencode))
295}
296/// Clones the embedding-client configuration. The lock is held only for
297/// the duration of the clone — NEVER across I/O (G42/A3).
298fn clone_client(embedder: &Mutex<LlmEmbedding>) -> LlmEmbedding {
299    embedder.lock().clone()
300}
301
// When true, embed_passage/embed_query use the short query timeout so Auto
// chains fail fast into FTS (GAP-E2E-06).
thread_local! {
    static QUERY_EMBED_FAST: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
}

/// Runs `f` with the calling thread's `QUERY_EMBED_FAST` flag set to `true`
/// and returns whatever `f` returns.
///
/// The previous flag value is restored when `f` finishes, so nested calls
/// compose. Restoration happens in a drop guard, which means it also runs
/// when `f` panics: a caught panic can no longer leave the thread stuck in
/// fast-timeout mode for later, unrelated embeddings.
fn with_query_embed_fast<T>(f: impl FnOnce() -> T) -> T {
    /// Puts the saved flag value back when dropped (normal exit or unwind).
    struct Restore(bool);

    impl Drop for Restore {
        fn drop(&mut self) {
            // `try_with` so a destructor never panics, even during thread
            // teardown when the thread-local may already be gone.
            let _ = QUERY_EMBED_FAST.try_with(|flag| flag.set(self.0));
        }
    }

    let _restore = Restore(QUERY_EMBED_FAST.with(|flag| flag.replace(true)));
    f()
}
316
317fn apply_query_timeout_if_needed(client: LlmEmbedding) -> LlmEmbedding {
318    if QUERY_EMBED_FAST.with(|c| c.get()) {
319        let secs = crate::runtime_config::resolve_u64(
320            None,
321            "llm.query_embed_timeout_secs",
322            crate::constants::DEFAULT_QUERY_EMBED_TIMEOUT_SECS,
323        );
324        client.with_timeout_secs(secs)
325    } else {
326        client
327    }
328}
329
330/// Embeds a single passage for storage. Delegates to the configured LLM
331/// headless (claude code / codex). Returns a vector of the active
332/// dimensionality.
333pub fn embed_passage(embedder: &Mutex<LlmEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
334    let client = apply_query_timeout_if_needed(clone_client(embedder));
335    let result = client.embed_passage(text)?;
336    validate_dim(result)
337}
338
339/// Embeds a single query for similarity search. Same model and dim as
340/// `embed_passage`; the only difference is the LLM-side prompt prefix
341/// that the headless invocation uses to disambiguate.
342pub fn embed_query(embedder: &Mutex<LlmEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
343    let client = apply_query_timeout_if_needed(clone_client(embedder));
344    let result = client.embed_query(text)?;
345    validate_dim(result)
346}
347
348/// Embeds a batch of passages with token-count-aware batching.
349///
350/// Kept for API compatibility; since v1.0.79 it routes through the
351/// bounded parallel fan-out with conservative defaults.
352pub fn embed_passages_controlled(
353    embedder: &Mutex<LlmEmbedding>,
354    texts: &[&str],
355    _token_counts: &[usize],
356) -> Result<Vec<Vec<f32>>, AppError> {
357    if texts.is_empty() {
358        return Ok(Vec::new());
359    }
360    let owned: Vec<String> = texts.iter().map(|t| t.to_string()).collect();
361    embed_texts_parallel(embedder, &owned, 1, chunk_embed_batch_size())
362}
363
364pub fn embed_passage_local(models_dir: &Path, text: &str) -> Result<Vec<f32>, AppError> {
365    let _slot_guard = acquire_llm_slot_for_embedding()?;
366    let embedder = get_embedder(models_dir)?;
367    embed_passage(embedder, text)
368}
369
370/// v1.0.89 (BUG-SKIP-EMBED): reads `SQLITE_GRAPHRAG_SKIP_EMBEDDING_ON_FAILURE`
371/// env var (set by `--skip-embedding-on-failure` via main.rs propagation).
372/// Returns `true` when the user opted to persist with NULL embedding on failure.
373pub fn should_skip_embedding_on_failure() -> bool {
374    crate::runtime_config::skip_embedding_on_failure()
375}
376
377/// v1.0.89 (BUG-SKIP-EMBED + GAP-EMBED-PROPAGATION): embed a passage
378/// honouring both `--llm-backend` and `--skip-embedding-on-failure`.
379///
380/// On success returns `Ok(Some(vec))`. On failure:
381/// - if `--skip-embedding-on-failure` is active, logs a warning and returns `Ok(None)`
382/// - otherwise propagates the error (exit 11)
383pub fn embed_passage_or_skip(
384    models_dir: &Path,
385    text: &str,
386    choice: Option<crate::cli::LlmBackendChoice>,
387) -> Result<Option<Vec<f32>>, AppError> {
388    match embed_passage_with_choice(models_dir, text, choice) {
389        Ok((v, _backend)) => Ok(Some(v)),
390        Err(AppError::Validation(msg)) => Err(AppError::Validation(msg)),
391        Err(e) => {
392            if should_skip_embedding_on_failure() {
393                tracing::warn!(
394                    error = %e,
395                    "embedding failed but --skip-embedding-on-failure is active; persisting with NULL embedding"
396                );
397                Ok(None)
398            } else {
399                Err(e)
400            }
401        }
402    }
403}
404
405/// BUG-003 / v1.0.85: split of `embed_passage_local` that reports the
406/// resolved [`LlmBackendKind`] based on the ACTUAL
407/// [`LlmEmbedding::flavour`] of the embedder constructed. When
408/// `LlmEmbedding::detect_available` substitutes claude for a missing
409/// codex, the operator sees the truth in `envelope.backend_invoked`.
410pub fn embed_passage_local_resolved(
411    models_dir: &Path,
412    text: &str,
413) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
414    let _slot_guard = acquire_llm_slot_for_embedding()?;
415    let embedder = get_embedder(models_dir)?;
416    let v = embed_passage(embedder, text)?;
417    let kind = match embedder.lock().flavour() {
418        crate::extract::llm_embedding::EmbeddingFlavour::Codex => LlmBackendKind::Codex,
419        crate::extract::llm_embedding::EmbeddingFlavour::Claude => LlmBackendKind::Claude,
420        crate::extract::llm_embedding::EmbeddingFlavour::Opencode => LlmBackendKind::Opencode,
421    };
422    Ok((v, kind))
423}
424
425pub fn embed_query_local(models_dir: &Path, text: &str) -> Result<Vec<f32>, AppError> {
426    let _slot_guard = acquire_llm_slot_for_embedding()?;
427    let embedder = get_embedder(models_dir)?;
428    embed_query(embedder, text)
429}
430
// =============================================================================
// v1.0.82 (GAP-003): wrappers that accept the CLI choice
// (`crate::cli::LlmBackendChoice`) and translate it into a chain for
// `embed_with_fallback`. They centralise the propagation of the
// `--llm-backend` flag across the 6 commands that produce embeddings
// (`remember`, `edit`, `ingest`, `enrich`, `recall`, `hybrid-search`).
// =============================================================================
438
439/// Embed a single passage using the LLM backend selected by the user via
440/// `--llm-backend`. Routes to `embed_with_fallback` so failures fall
441/// through to the next backend in the chain before giving up.
442///
443/// When `choice` is `None` (e.g. a sub-command that does not yet
444/// expose the flag), behaviour matches `embed_passage_local` — the
445/// active embedder from `LlmEmbedding::detect_available` decides the
446/// backend.
447pub fn embed_passage_with_choice(
448    models_dir: &Path,
449    text: &str,
450    choice: Option<crate::cli::LlmBackendChoice>,
451) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
452    let _slot_guard = acquire_llm_slot_for_embedding()?;
453    match choice {
454        None => {
455            let embedder = get_embedder(models_dir)?;
456            embed_passage(embedder, text).map(|v| (v, LlmBackendKind::None))
457        }
458        Some(choice) => embed_with_fallback(models_dir, text, &choice.to_chain(), false),
459    }
460}
461
462/// v1.0.93: embedding with `EmbeddingBackendChoice` awareness. When the
463/// embedding backend is `Openrouter` or `Auto` with a live client, the
464/// chain prepends `OpenRouter` before the LLM subprocess backends.
465pub fn embed_passage_with_embedding_choice(
466    models_dir: &Path,
467    text: &str,
468    embedding_backend: crate::cli::EmbeddingBackendChoice,
469    llm_backend: crate::cli::LlmBackendChoice,
470) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
471    let _slot_guard = acquire_llm_slot_for_embedding()?;
472    let chain = embedding_backend.to_chain(llm_backend);
473    embed_with_fallback(models_dir, text, &chain, false)
474}
475
476/// failure, returns a structured `FallbackReason` so the caller can
477/// surface `vec_degraded` instead of a hard exit 11.
478///
479/// `None` matches the legacy `try_embed_query_with_fallback` path
480/// (uses the active embedder without an explicit chain).
481pub fn try_embed_query_with_choice(
482    models_dir: &Path,
483    text: &str,
484    choice: Option<crate::cli::LlmBackendChoice>,
485) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
486    match with_query_embed_fast(|| embed_passage_with_choice(models_dir, text, choice)) {
487        // GAP-004 / v1.0.85.1: when the chain terminates on
488        //  (i.e. user passed
489        // or every preceding backend failed),  returns
490        //  instead of an error. Without this guard the
491        // empty vector would propagate to  which
492        // aborts with exit 11 ("embedding has 0 dims, expected 64").
493        // The caller's contract is to surface a typed
494        // so  and  can route to FTS5-puro via
495        // the existing  /  envelope.
496        // Intercept the empty-vector success path and surface it as
497        //  (introduced at v1.0.85 / ADR-0043
498        // for the symmetric LLM-returned-zero-dim case).
499        Ok((v, _backend)) if v.is_empty() => Err(FallbackReason::DimZero),
500        Ok((v, backend)) => Ok((v, backend)),
501        Err(e) => Err(classify_embedding_error(e)),
502    }
503}
504/// v1.0.93 (GAP-OR-INGEST): query embedding with `EmbeddingBackendChoice`
505/// awareness. Mirrors `try_embed_query_with_choice` but routes through
506/// `embed_passage_with_embedding_choice` so OpenRouter API is used when
507/// configured.
508pub fn try_embed_query_with_embedding_choice(
509    models_dir: &Path,
510    text: &str,
511    embedding_backend: crate::cli::EmbeddingBackendChoice,
512    llm_backend: crate::cli::LlmBackendChoice,
513) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
514    match with_query_embed_fast(|| {
515        embed_passage_with_embedding_choice(models_dir, text, embedding_backend, llm_backend)
516    }) {
517        Ok((v, _backend)) if v.is_empty() => Err(FallbackReason::DimZero),
518        Ok((v, backend)) => Ok((v, backend)),
519        Err(e) => Err(classify_embedding_error(e)),
520    }
521}
522
523/// call. Reads the max-concurrency from
524/// `SQLITE_GRAPHRAG_LLM_MAX_HOST_CONCURRENCY` (default derived from
525/// `LLM_WORKER_RSS_MB` and available memory), and the wait timeout
526/// from `SQLITE_GRAPHRAG_LLM_SLOT_WAIT_SECS` (default 30s).
527///
528/// Returns `Ok(guard)` for happy path, `AppError::LockBusy` (exit 75)
529/// when no slot is available within the wait window, and
530/// `AppError::Validation` when the concurrency is 0.
531///
532/// The `LLM_SLOT_NO_WAIT` env var (or its CLI flag equivalent) sets
533/// `wait_secs = 0` to fail fast in tests.
534fn acquire_llm_slot_for_embedding() -> Result<crate::llm_slots::LlmSlotGuard, AppError> {
535    use crate::constants::{CLI_LOCK_DEFAULT_WAIT_SECS, LLM_WORKER_RSS_MB};
536    let default_max = crate::llm_slots::default_max_concurrency() as usize;
537    let max = crate::runtime_config::llm_max_host_concurrency(default_max).max(1) as u32;
538    let wait_secs = if crate::runtime_config::llm_slot_no_wait() {
539        0
540    } else {
541        crate::runtime_config::llm_slot_wait_secs(CLI_LOCK_DEFAULT_WAIT_SECS)
542    };
543    let _ = LLM_WORKER_RSS_MB; // silence the unused import (used in default_max_concurrency)
544                               // GAP-003 / ADR-0043: when the slot semaphore is contended beyond the
545                               // backoff window (50 + 100 + 200 + 400 = 750ms total), return a
546                               // marker message that `classify_embedding_error` maps to
547                               // `FallbackReason::SlotExhausted` (discriminator `slot_exhausted`).
548                               // The window is shorter than the legacy 30s timeout, so the operator
549                               // observes FTS5-puro fallback quickly instead of after 30s of silence.
550    match crate::llm_slots::acquire_llm_slot(max, wait_secs) {
551        Ok(guard) => Ok(guard),
552        Err(e @ AppError::LockBusy { .. }) if wait_secs > 0 => Err(AppError::Embedding(format!(
553            "slot exhausted: {e} (fall back to FTS5)"
554        ))),
555        Err(e) => Err(e),
556    }
557}
/// GAP-004 (v1.0.88): typed classifier for embedding error messages.
///
/// Decomposes the legacy `AppError::Embedding(String)` payload into a
/// small enum so the call sites can branch on the cause instead of
/// repeating `msg.contains(...)` literals. The classification is purely
/// lexical (case-insensitive substring match on the error message) — no
/// I/O, no retries, no telemetry, deterministic and safe under
/// `#[serial_test::serial(env)]`.
///
/// 6 variants cover the 5 known discriminators from v1.0.85 (ADR-0043)
/// plus an `Unknown` fallback for messages that do not match any marker.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmbeddingErrorKind {
    /// OAuth token expired or absent; no backend can authenticate.
    OAuth,
    /// OAuth usage quota exhausted on the named backend.
    Quota,
    /// LLM slot semaphore exhausted after the backoff window.
    SlotExhausted,
    /// User-requested backend differs from the one that actually executed.
    BackendMismatch,
    /// Embedding returned a zero-dimensional vector (structural bug).
    ZeroDimension,
    /// Message did not match any of the 5 markers above.
    Unknown,
}

impl EmbeddingErrorKind {
    /// Classify an embedding error message into a typed kind.
    ///
    /// Markers are tested in this order; the first hit wins:
    ///
    /// 1. `"oauth"` → [`Self::OAuth`] — before `Quota`, because both
    ///    substrings can co-occur in the same message.
    /// 2. `"slot exhausted"` → [`Self::SlotExhausted`] — before `Quota`,
    ///    because the slot-semaphore path is more specific (the LLM never
    ///    even tried to authenticate).
    /// 3. `"quota"` → [`Self::Quota`].
    /// 4. `"backend mismatch"` → [`Self::BackendMismatch`].
    /// 5. both `"dim"` and `"zero"` → [`Self::ZeroDimension`].
    /// 6. anything else → [`Self::Unknown`].
    ///
    /// The checks are case-insensitive, so `OAuth` and `oauth` both
    /// classify to `EmbeddingErrorKind::OAuth`.
    pub fn classify(msg: &str) -> Self {
        let m = msg.to_lowercase();
        if m.contains("oauth") {
            Self::OAuth
        } else if m.contains("slot exhausted") {
            // Must precede the quota check: a slot-exhausted message that
            // merely mentions a quota is still a slot problem.
            Self::SlotExhausted
        } else if m.contains("quota") {
            Self::Quota
        } else if m.contains("backend mismatch") {
            Self::BackendMismatch
        } else if m.contains("dim") && m.contains("zero") {
            Self::ZeroDimension
        } else {
            Self::Unknown
        }
    }

    /// Stable, machine-friendly discriminator code (lowercase, kebab-safe).
    pub fn code(&self) -> &'static str {
        match self {
            Self::OAuth => "oauth",
            Self::Quota => "quota",
            Self::SlotExhausted => "slot-exhausted",
            Self::BackendMismatch => "backend-mismatch",
            Self::ZeroDimension => "zero-dimension",
            Self::Unknown => "unknown",
        }
    }
}

/// Displays the kind as its stable [`EmbeddingErrorKind::code`].
impl std::fmt::Display for EmbeddingErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.code())
    }
}
629
630/// G58/S1: reason an embedding call could not be completed and the caller
631/// must fall back to a non-vector retrieval path (FTS5 prefix + LIKE).
632///
633/// Returned by [`try_embed_query_with_fallback`] so the `recall` and
634/// `hybrid-search` handlers can surface a structured `vec_degraded` /
635/// `warning` envelope instead of a hard `AppError::Embedding` exit 11.
636#[derive(Debug, Clone, PartialEq)]
637pub enum FallbackReason {
638    /// The LLM subprocess failed (rate limit, OAuth contention, quota
639    /// exhausted, model unparsable response, divergent dim, etc.).
640    /// Carries the original error message for observability.
641    EmbeddingFailed(String),
642    /// The LLM slot semaphore was exhausted: 8+ concurrent LLM
643    /// subprocesses blocked the acquire beyond the backoff window
644    /// (50ms + 100ms + 200ms + 400ms = 750ms total). Resolved at v1.0.85
645    /// (GAP-003 / ADR-0043).
646    SlotExhausted,
647    /// OAuth usage quota exhausted on the named backend. The caller
648    /// should retry with an alternative backend (codex ↔ claude)
649    /// before falling back to FTS5-puro.
650    OAuthQuota { backend: &'static str },
651    /// The user requested a backend that differs from the one that
652    /// actually executed the embedding (legacy "synonym for codex"
653    /// bug from v1.0.83). Resolved at v1.0.84 (GAP-002).
654    BackendMismatch {
655        requested: &'static str,
656        resolved: &'static str,
657    },
658    /// The embedding returned a zero-dimensional vector, signalling a
659    /// structural bug (the LLM did not produce any floats). Distinct
660    /// from OAuthQuota (quota exhausted) and EmbeddingFailed
661    /// (subprocess error).
662    DimZero,
663    /// The embedding was cancelled by an external signal (SIGTERM, etc.).
664    Cancelled,
665    /// The embedding exceeded its time budget. Carries the operation name
666    /// and the elapsed seconds for diagnostic logging.
667    Timeout {
668        operation: String,
669        duration_secs: u64,
670    },
671}
672
673impl FallbackReason {
674    /// Stable, machine-friendly reason code used by JSON envelopes
675    /// (`vec_degraded_reason`). Mirrors the v1.0.84 contract extended
676    /// at v1.0.85 with 4 new variants (GAP-003 / ADR-0043).
677    pub fn reason_code(&self) -> &'static str {
678        match self {
679            Self::EmbeddingFailed(_) => "embedding_failed",
680            Self::SlotExhausted => "slot_exhausted",
681            Self::OAuthQuota { .. } => "oauth_quota",
682            Self::BackendMismatch { .. } => "backend_mismatch",
683            Self::DimZero => "dim_zero",
684            Self::Cancelled => "cancelled",
685            Self::Timeout { .. } => "timeout",
686        }
687    }
688}
689
690impl std::fmt::Display for FallbackReason {
691    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
692        match self {
693            Self::EmbeddingFailed(msg) => write!(f, "embedding failed: {msg}"),
694            Self::SlotExhausted => write!(
695                f,
696                "slot exhausted: failed to acquire LLM slot after backoff window (max=8 concurrent, total backoff=750ms)"
697            ),
698            Self::OAuthQuota { backend } => {
699                write!(f, "OAuth usage quota exhausted on backend '{backend}'")
700            }
701            Self::BackendMismatch {
702                requested,
703                resolved,
704            } => {
705                write!(
706                    f,
707                    "backend mismatch: user requested '{requested}' but '{resolved}' was invoked"
708                )
709            }
710            Self::DimZero => write!(f, "embedding returned zero-dimensional vector"),
711            Self::Cancelled => write!(f, "embedding cancelled by external signal"),
712            Self::Timeout {
713                operation,
714                duration_secs,
715            } => {
716                write!(
717                    f,
718                    "embedding timed out after {duration_secs}s during {operation}"
719                )
720            }
721        }
722    }
723}
724
// Marker impl: lets `FallbackReason` be used wherever a `std::error::Error`
// is expected. The body is empty, so every trait method keeps its default
// implementation (in particular no `source` is reported).
impl std::error::Error for FallbackReason {}
726
727/// G58/S1: try to embed a query, mapping any failure to a structured
728/// [`FallbackReason`] so callers can route to FTS5 + LIKE fallback instead
729/// of returning exit 11 to the user.
730///
731/// This is the bridge between the hard-fail `embed_query_local` (used by
732/// write paths where embedding failure aborts the operation) and the
733/// graceful-degradation contract of `recall` / `hybrid-search` in v1.0.80.
734pub fn try_embed_query_with_fallback(
735    models_dir: &Path,
736    query: &str,
737) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
738    match embed_query_local(models_dir, query) {
739        Ok(v) => Ok((v, LlmBackendKind::None)),
740        Err(e) => Err(classify_embedding_error(e)),
741    }
742}
743
744/// G58 / ADR-0043 (v1.0.85): deterministic fallback for `recall` and
745/// `hybrid-search`.
746///
747/// - On `OAuthQuota { backend }`, retry once with the alternative backend
748///   (codex ↔ claude) before giving up.
749/// - On `SlotExhausted`, sleep 750ms and retry once (gives the slot
750///   semaphore time to release a permit from a sibling subprocess).
751/// - On any other `FallbackReason`, return immediately (deterministic).
752pub fn try_embed_query_with_deterministic_fallback(
753    models_dir: &Path,
754    query: &str,
755    choice: Option<crate::cli::LlmBackendChoice>,
756) -> Result<(Vec<f32>, LlmBackendKind), FallbackReason> {
757    match try_embed_query_with_choice(models_dir, query, choice) {
758        Ok(t) => Ok(t),
759        Err(reason @ FallbackReason::OAuthQuota { backend }) => {
760            let alt = match backend {
761                "codex" => Some(crate::cli::LlmBackendChoice::Claude),
762                "claude" => Some(crate::cli::LlmBackendChoice::Codex),
763                "opencode" => Some(crate::cli::LlmBackendChoice::Codex),
764                "openrouter" => Some(crate::cli::LlmBackendChoice::Codex),
765                _ => None,
766            };
767            if let Some(alt_choice) = alt {
768                try_embed_query_with_choice(models_dir, query, Some(alt_choice))
769            } else {
770                Err(reason)
771            }
772        }
773        Err(reason @ FallbackReason::SlotExhausted) => {
774            std::thread::sleep(std::time::Duration::from_millis(750));
775            try_embed_query_with_choice(models_dir, query, choice).or(Err(reason))
776        }
777        Err(other) => Err(other),
778    }
779}
780
781/// Classify an embedding [`AppError`] into a typed [`FallbackReason`].
782///
783/// v1.0.85 (ADR-0043): discriminates the 4 new causes (SlotExhausted,
784/// OAuthQuota, BackendMismatch, DimZero) from the legacy generic
785/// EmbeddingFailed bucket. The classification is purely lexical
786/// (substring match on the message) — no I/O, no retries, no
787/// telemetry, deterministic and `#[serial_test::serial(env)]`-safe.
788pub fn classify_embedding_error(err: AppError) -> FallbackReason {
789    match err {
790        AppError::Timeout {
791            operation,
792            duration_secs,
793        } => FallbackReason::Timeout {
794            operation,
795            duration_secs,
796        },
797        AppError::Embedding(msg) => match EmbeddingErrorKind::classify(&msg) {
798            // GAP-004 (v1.0.88): typed-discriminator dispatch.
799            // The lexical classifier picks the discriminator; the arms below
800            // enrich the result with the backend name and the
801            // requested/resolved pair that the JSON envelope needs.
802            //
803            // Note: `Cancelled` and `EmbeddingFailed(msg)` are not in the
804            // 6-variant enum (they have no lexical marker) so we keep them
805            // as explicit guards at the head of the match.
806            EmbeddingErrorKind::SlotExhausted => FallbackReason::SlotExhausted,
807            EmbeddingErrorKind::OAuth => {
808                let backend = if msg.contains("codex") {
809                    "codex"
810                } else if msg.contains("claude") || msg.contains("anthropic-ratelimit") {
811                    // G45-CR5: anthropic-ratelimit-* headers are emitted only by
812                    // the Claude CLI subprocess; treat them as claude quota
813                    // signals even when the message text omits the word
814                    // "claude" explicitly.
815                    "claude"
816                } else if msg.contains("opencode") {
817                    "opencode"
818                } else {
819                    "unknown"
820                };
821                FallbackReason::OAuthQuota { backend }
822            }
823            EmbeddingErrorKind::Quota => {
824                let backend = if msg.contains("codex") {
825                    "codex"
826                } else if msg.contains("claude") || msg.contains("anthropic-ratelimit") {
827                    "claude"
828                } else if msg.contains("opencode") {
829                    "opencode"
830                } else {
831                    "unknown"
832                };
833                FallbackReason::OAuthQuota { backend }
834            }
835            EmbeddingErrorKind::BackendMismatch => {
836                // The `msg.contains("claude")` arm is intentionally
837                // placed BEFORE the OAuth arm so that a backend-mismatch
838                // message that mentions both "claude" and "codex" maps to
839                // BackendMismatch (the more specific failure mode).
840                let (requested, resolved) =
841                    if msg.contains("requested claude") && msg.contains("but codex") {
842                        ("claude", "codex")
843                    } else if msg.contains("requested codex") && msg.contains("but claude") {
844                        ("codex", "claude")
845                    } else if msg.contains("requested claude") {
846                        ("claude", "unknown")
847                    } else if msg.contains("requested codex") {
848                        ("codex", "unknown")
849                    } else {
850                        ("unknown", "unknown")
851                    };
852                FallbackReason::BackendMismatch {
853                    requested,
854                    resolved,
855                }
856            }
857            EmbeddingErrorKind::ZeroDimension => FallbackReason::DimZero,
858            EmbeddingErrorKind::Unknown => {
859                if msg.contains("cancelled") {
860                    FallbackReason::Cancelled
861                } else {
862                    FallbackReason::EmbeddingFailed(msg)
863                }
864            }
865        },
866        e => FallbackReason::EmbeddingFailed(e.to_string()),
867    }
868}
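
// =============================================================================
// LLM backend fallback chain: `embed_with_fallback` tries a list of
// backends before giving up. The chain order matches the user-supplied
// `--llm-fallback` list (default: codex, claude, none).
// =============================================================================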
872
873/// Tries each LLM backend in `chain` in order, returning the first
874/// successful embedding. On failure, the diagnostic tail of the last
875/// error is preserved in the returned `AppError::Embedding` so the
876/// operator can see WHY every backend failed.
877///
878/// If `skip_on_failure` is `true` AND every backend fails, the function
879/// returns `Ok(Vec::new())` (an empty vector) to signal "persist
880/// without embedding" — the call site is then responsible for writing
881/// a `pending_embeddings` row that can be retried later by the
882/// `embedding retry` subcommand.
883///
884/// Defaults the chain to `[codex, claude, none]` when `chain` is
885/// empty, matching the v1.0.81 behaviour where codex was the
886/// implicit default and claude was the implicit fallback.
887pub fn embed_with_fallback(
888    models_dir: &Path,
889    text: &str,
890    chain: &[LlmBackendKind],
891    skip_on_failure: bool,
892) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
893    use crate::llm::exit_code_hints::LlmBackendError;
894    let effective: Vec<LlmBackendKind> = if chain.is_empty() {
895        vec![
896            LlmBackendKind::Codex,
897            LlmBackendKind::Claude,
898            LlmBackendKind::Opencode,
899            LlmBackendKind::None,
900        ]
901    } else {
902        chain.to_vec()
903    };
904
905    let mut last_err: Option<AppError> = None;
906    for backend in &effective {
907        // GAP-E2E-06 / v1.1.8: fail-fast credential/binary probe so Auto
908        // does not burn ~20s on a dead Codex/Claude before FTS fallback.
909        if let Err(probe_err) = backend_ready_probe(backend) {
910            tracing::warn!(
911                target: "embedding",
912                backend = ?backend,
913                error = %probe_err,
914                "embed_with_fallback: backend probe failed, skipping"
915            );
916            last_err = Some(probe_err);
917            continue;
918        }
919        // BUG-003 / v1.0.85: propagar o backend REAL retornado por
920        // embed_via_backend (que pode diferir do chain position quando
921        // LlmEmbedding::detect_available substitui codex por claude).
922        // O tuple `(_, requested_kind)` é descartado — só queremos o
923        // backend resolvido na primeira posição.
924        // ADR-0046 / BUG-11 v1.0.88: use `embed_via_backend_strict` so the
925        // sentinel `None` backend propagates the last real error instead
926        // of silently degrading to `Ok((Vec::new(), None))`. This is the
927        // path that caused preflight rejections to be swallowed by the
928        // chain's default trailing `None`.
929        match embed_via_backend_strict(
930            models_dir,
931            text,
932            backend,
933            last_err.as_ref(),
934            skip_on_failure,
935        ) {
936            Ok((v, resolved_kind)) => return Ok((v, resolved_kind)),
937            Err(e) => {
938                // ADR-0011: Validation errors (OAuth-only enforcement) are
939                // FATAL — propagate immediately without trying the next
940                // backend. This prevents the fallback chain from swallowing
941                // OAuth violations via the trailing `None` sentinel.
942                if matches!(e, AppError::Validation(_)) {
943                    return Err(e);
944                }
945                tracing::warn!(
946                    target: "embedding",
947                    backend = ?backend,
948                    error = %e,
949                    "embed_with_fallback: backend failed, trying next"
950                );
951                last_err = Some(e);
952            }
953        }
954    }
955    if skip_on_failure {
956        // Signal "persist with no embedding" via an empty vector paired
957        // with `None` so callers know the chain exhausted without a hit.
958        // Caller is responsible for writing a `pending_embeddings` row
959        // that can be retried later by the `embedding retry` subcommand.
960        return Ok((Vec::new(), LlmBackendKind::None));
961    }
962    Err(last_err
963        .unwrap_or_else(|| AppError::Embedding(LlmBackendError::NoBackendsAvailable.to_string())))
964}
965
/// LLM backend kind for the fallback chain. Mirrors the CLI
/// `--llm-backend` enum so users can pass the same value to
/// `--llm-fallback` without translation.
///
/// The type is `Copy`, so it is passed and returned by value; the stable
/// lowercase label of each variant is produced by `as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LlmBackendKind {
    /// `codex exec` (default for v1.0.76+).
    Codex,
    /// `claude -p` (fallback for ChatGPT Pro OAuth unavailability).
    Claude,
    /// `opencode run` (v1.0.90).
    Opencode,
    /// OpenRouter HTTP API (v1.0.93).
    OpenRouter,
    /// No embedding — empty vector returned.
    None,
}
982
983impl LlmBackendKind {
984    /// Stable string label used in tracing and JSON envelopes. The
985    /// string values are part of the public contract for `envelope.backend_invoked`.
986    pub fn as_str(self) -> &'static str {
987        match self {
988            Self::Codex => "codex",
989            Self::Claude => "claude",
990            Self::Opencode => "opencode",
991            Self::OpenRouter => "openrouter",
992            Self::None => "none",
993        }
994    }
995}
996
997/// Cheap readiness probe before spawning an LLM subprocess.
998///
999/// Checks binary presence on PATH and credential material on disk.
1000/// Does **not** perform network I/O. Failures are non-fatal for the
1001/// fallback chain — the caller skips to the next backend.
1002fn backend_ready_probe(backend: &LlmBackendKind) -> Result<(), AppError> {
1003    match backend {
1004        LlmBackendKind::None => Ok(()),
1005        LlmBackendKind::OpenRouter => {
1006            if OPENROUTER_CLIENT.get().is_some() {
1007                Ok(())
1008            } else {
1009                Err(AppError::Embedding(
1010                    "openrouter probe: client not initialised (skip)".into(),
1011                ))
1012            }
1013        }
1014        LlmBackendKind::Codex => {
1015            let bin = crate::runtime_config::codex_binary()
1016                .unwrap_or_else(|| "codex".into());
1017            if which::which(&bin).is_err() && which::which("codex").is_err() {
1018                return Err(AppError::Embedding(
1019                    "codex probe: binary not on PATH (skip)".into(),
1020                ));
1021            }
1022            // OAuth material: ~/.codex/auth.json or CODEX_HOME/auth.json
1023            let auth = std::env::var_os("CODEX_HOME")
1024                .map(std::path::PathBuf::from)
1025                .or_else(|| {
1026                    std::env::var_os("HOME").map(|h| {
1027                        std::path::PathBuf::from(h).join(".codex")
1028                    })
1029                })
1030                .map(|p| p.join("auth.json"));
1031            match auth {
1032                Some(p) if p.is_file() => Ok(()),
1033                _ => Err(AppError::Embedding(
1034                    "codex probe: auth.json missing (skip; use --llm-backend none or login)".into(),
1035                )),
1036            }
1037        }
1038        LlmBackendKind::Claude => {
1039            let bin = crate::runtime_config::claude_binary()
1040                .unwrap_or_else(|| "claude".into());
1041            if which::which(&bin).is_err() && which::which("claude").is_err() {
1042                return Err(AppError::Embedding(
1043                    "claude probe: binary not on PATH (skip)".into(),
1044                ));
1045            }
1046            Ok(())
1047        }
1048        LlmBackendKind::Opencode => {
1049            let bin = crate::runtime_config::opencode_binary()
1050                .unwrap_or_else(|| "opencode".into());
1051            if which::which(&bin).is_err() && which::which("opencode").is_err() {
1052                return Err(AppError::Embedding(
1053                    "opencode probe: binary not on PATH (skip)".into(),
1054                ));
1055            }
1056            Ok(())
1057        }
1058    }
1059}
1060
1061/// Embeds a single text via the given backend. Used by
1062/// `embed_with_fallback` and exposed to allow direct one-shot
1063/// selection without a chain.
1064/// Embeds a single text via the given backend. Used by
1065/// `embed_with_fallback` and exposed to allow direct one-shot
1066/// selection without a chain.
1067///
1068/// BUG-003 / v1.0.85: returns `(Vec<f32>, LlmBackendKind)`. The
1069/// second element reports the backend that ACTUALLY executed the
1070/// embedding, not the chain position requested by the caller. When
1071/// `LlmBackendKind::Codex` is requested but `codex` is absent from
1072/// PATH, `LlmEmbedding::detect_available` substitutes claude and the
1073/// tuple carries `LlmBackendKind::Claude` so the operator sees the
1074/// truth in `envelope.backend_invoked`.
1075pub fn embed_via_backend(
1076    models_dir: &Path,
1077    text: &str,
1078    backend: &LlmBackendKind,
1079) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
1080    match backend {
1081        LlmBackendKind::None => Ok((Vec::new(), LlmBackendKind::None)),
1082        LlmBackendKind::Codex => embed_passage_local_resolved(models_dir, text),
1083        LlmBackendKind::Claude => {
1084            // ADR-0042 / GAP-002: route Claude through its own static
1085            // embedder instead of re-using the Codex path (which used
1086            // to silently pick Codex if PATH ordered it first).
1087            tracing::debug!(
1088                target: "embedder",
1089                backend = "claude",
1090                "embed_via_backend: forcing claude (ADR-0042 / GAP-002 fix)"
1091            );
1092            embed_via_claude_local_resolved(models_dir, text, None, None)
1093        }
1094        LlmBackendKind::Opencode => {
1095            tracing::debug!(
1096                target: "embedder",
1097                backend = "opencode",
1098                "embed_via_backend: forcing opencode (GAP-OPENCODE-001)"
1099            );
1100            embed_via_opencode_local_resolved(models_dir, text, None, None)
1101        }
1102        LlmBackendKind::OpenRouter => {
1103            tracing::debug!(
1104                target: "embedder",
1105                backend = "openrouter",
1106                "embed_via_backend: using OpenRouter API (v1.0.93)"
1107            );
1108            let client = OPENROUTER_CLIENT.get().ok_or_else(|| {
1109                AppError::Embedding(
1110                    "OpenRouter client not initialised; call get_openrouter_embedder first".into(),
1111                )
1112            })?;
1113            // GAP-001 (v1.1.04): canonical nested-runtime guard. When called
1114            // from inside an existing tokio runtime (e.g. deep-research fan-out),
1115            // `block_in_place` parks the current worker thread and drives the
1116            // future via the existing handle instead of building a nested
1117            // runtime, which would panic with "Cannot start a runtime from
1118            // within a runtime".
1119            let vec = match tokio::runtime::Handle::try_current() {
1120                Ok(handle) => tokio::task::block_in_place(|| {
1121                    handle.block_on(client.embed_single(text, client.default_input_type()))
1122                })?,
1123                Err(_) => shared_runtime()?
1124                    .block_on(client.embed_single(text, client.default_input_type()))?,
1125            };
1126            Ok((vec, LlmBackendKind::OpenRouter))
1127        }
1128    }
1129}
1130
1131// ADR-0046 / BUG-11 v1.0.88: specialisation of `embed_via_backend` that
1132// refuses to SILENTLY DEGRADE to `LlmBackendKind::None` after all real
1133// backends (Codex, Claude) have failed. The previous behaviour
1134// (`Ok((Vec::new(), None))`) caused the `remember` write path to persist
1135// memories with zero-dimensional embeddings — breaking `recall` and
1136// `hybrid-search` while returning exit 0 (BUG-11 CRITICAL).
1137//
1138// When `--llm-backend none` is explicitly requested (i.e. `last_err` is
1139// None AND the chain was a single-element `[None]`), pass
1140// `skip_on_failure = true` to `embed_with_fallback` to consume the empty
1141// vector via the pending-embeddings retry queue instead of persisting
1142// directly. This helper is the right hook for `remember`/`edit`/`ingest`.
1143pub fn embed_via_backend_strict(
1144    models_dir: &Path,
1145    text: &str,
1146    backend: &LlmBackendKind,
1147    last_err: Option<&AppError>,
1148    skip_on_failure: bool,
1149) -> Result<(Vec<f32>, LlmBackendKind), AppError> {
1150    use crate::llm::exit_code_hints::LlmBackendError;
1151    match backend {
1152        LlmBackendKind::None => {
1153            // GAP-CLI-EMBED-NONE (v1.1.8): an intentional chain of only
1154            // `[None]` (`--llm-backend none`) MUST skip embedding with an
1155            // empty vector — matching the CLI help contract "skips embedding;
1156            // useful for tests". When `None` is reached *after* a real
1157            // backend failed (`last_err.is_some()`), honour
1158            // `skip_on_failure` or propagate the prior error (BUG-11).
1159            // Intentional none-only chain, or skip-on-failure after a prior error.
1160            if last_err.is_none() || skip_on_failure {
1161                Ok((Vec::new(), LlmBackendKind::None))
1162            } else {
1163                Err(match last_err {
1164                    Some(e) => AppError::Embedding(format!("{e}")),
1165                    None => AppError::Embedding(LlmBackendError::NoBackendsAvailable.to_string()),
1166                })
1167            }
1168        }
1169        LlmBackendKind::Codex => embed_passage_local_resolved(models_dir, text),
1170        LlmBackendKind::Claude => {
1171            tracing::debug!(
1172                target: "embedder",
1173                backend = "claude",
1174                "embed_via_backend_strict: forcing claude (ADR-0042 / GAP-002 fix)"
1175            );
1176            embed_via_claude_local_resolved(models_dir, text, None, None)
1177        }
1178        LlmBackendKind::Opencode => {
1179            tracing::debug!(
1180                target: "embedder",
1181                backend = "opencode",
1182                "embed_via_backend_strict: forcing opencode (GAP-OPENCODE-001)"
1183            );
1184            embed_via_opencode_local_resolved(models_dir, text, None, None)
1185        }
1186        LlmBackendKind::OpenRouter => embed_via_backend(models_dir, text, backend),
1187    }
1188}
1189
1190/// Legacy one-shot wrapper around `embed_via_backend` that discards
1191/// the resolved backend. Kept for call sites that only care about
1192/// the vector and ignore the executed-backend signal. New code
1193/// should prefer `embed_via_backend` directly.
1194pub fn embed_via_backend_legacy(
1195    models_dir: &Path,
1196    text: &str,
1197    backend: &LlmBackendKind,
1198) -> Result<Vec<f32>, AppError> {
1199    embed_via_backend(models_dir, text, backend).map(|(v, _)| v)
1200}
1201
1202pub fn embed_passages_controlled_local(
1203    models_dir: &Path,
1204    texts: &[&str],
1205    token_counts: &[usize],
1206) -> Result<Vec<Vec<f32>>, AppError> {
1207    let embedder = get_embedder(models_dir)?;
1208    embed_passages_controlled(embedder, texts, token_counts)
1209}
1210
1211/// G42/S3: embeds `texts` through the bounded parallel fan-out and
1212/// returns vectors in input order.
1213pub fn embed_passages_parallel_local(
1214    models_dir: &Path,
1215    texts: &[String],
1216    parallelism: usize,
1217    batch_size: usize,
1218) -> Result<Vec<Vec<f32>>, AppError> {
1219    let embedder = get_embedder(models_dir)?;
1220    embed_texts_parallel(embedder, texts, parallelism, batch_size)
1221}
1222
/// GAP-OPENROUTER-REST-CONCURRENCY: result of one bounded fan-out chunk —
/// `(chunk index, batch embedding result)`.
///
/// The index lets the collector restore input order after out-of-order
/// `JoinSet` completion; the inner `Result` holds either the vectors of
/// that chunk or the `AppError` that failed it.
type EmbedChunkResult = (usize, Result<Vec<Vec<f32>>, AppError>);
1227
/// GAP-OPENROUTER-REST-CONCURRENCY: rebuilds the flat vector list in input
/// order from chunk parts that the bounded `JoinSet` fan-out produced out
/// of order.
///
/// Parts are sorted by chunk index (stable, so parts sharing an index keep
/// their arrival order) and their vectors are then concatenated, so the
/// result matches the original `texts` order exactly.
fn reassemble_ordered(mut parts: Vec<(usize, Vec<Vec<f32>>)>) -> Vec<Vec<f32>> {
    parts.sort_by(|left, right| left.0.cmp(&right.0));

    let total: usize = parts.iter().map(|(_, vectors)| vectors.len()).sum();
    let mut ordered = Vec::with_capacity(total);
    for (_, vectors) in parts {
        ordered.extend(vectors);
    }
    ordered
}
1236
1237/// v1.0.93 (GAP-OR-INGEST): embeds multiple passages with
1238/// `EmbeddingBackendChoice` awareness. When the resolved chain starts
1239/// with `OpenRouter` and the client is initialised, uses the HTTP batch
1240/// API (`embed_batch`) instead of subprocess fan-out — no LLM slot
1241/// consumed, ~200ms per batch vs ~15s per subprocess cold-start.
1242/// Falls back to `embed_passages_parallel_local` for LLM backends.
1243pub fn embed_passages_parallel_with_embedding_choice(
1244    models_dir: &Path,
1245    texts: &[String],
1246    parallelism: usize,
1247    batch_size: usize,
1248    embedding_backend: crate::cli::EmbeddingBackendChoice,
1249    llm_backend: crate::cli::LlmBackendChoice,
1250) -> Result<Vec<Vec<f32>>, AppError> {
1251    let chain = embedding_backend.to_chain(llm_backend);
1252    if chain.first() == Some(&LlmBackendKind::OpenRouter) && is_openrouter_initialized() {
1253        let client = OPENROUTER_CLIENT.get().ok_or_else(|| {
1254            AppError::Embedding(
1255                "OpenRouter client not initialised; call get_openrouter_embedder first".into(),
1256            )
1257        })?;
1258
1259        // GAP-OPENROUTER-REST-CONCURRENCY: reuse the caller's `parallelism`
1260        // as a bounded fan-out width, clamped to a Cloudflare-safe range.
1261        // Small inputs stay serial — a single batch is one REST call, so the
1262        // JoinSet overhead would only add latency.
1263        let k = parallelism.clamp(1, 16);
1264        if texts.len() <= 32 || k == 1 {
1265            let refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
1266            // GAP-001 (v1.1.04): canonical nested-runtime guard.
1267            let vecs = match tokio::runtime::Handle::try_current() {
1268                Ok(handle) => tokio::task::block_in_place(|| {
1269                    handle.block_on(client.embed_batch(&refs, client.default_input_type()))
1270                })?,
1271                Err(_) => shared_runtime()?
1272                    .block_on(client.embed_batch(&refs, client.default_input_type()))?,
1273            };
1274            return Ok(vecs);
1275        }
1276
1277        // `client` is a `&'static OpenRouterClient` (OPENROUTER_CLIENT is a
1278        // static OnceLock), so it is Copy + Send + 'static and moves freely
1279        // into each spawned task. Chunk contents are cloned into owned
1280        // `Vec<String>` because `texts` is only borrowed.
1281        //
1282        // GAP-001 (v1.1.04): canonical nested-runtime guard. The async block
1283        // borrows `client`, `texts` and `k`, all of which remain valid for
1284        // both branches.
1285        let fan_out = async move {
1286            let mut set: JoinSet<EmbedChunkResult> = JoinSet::new();
1287            let mut parts: Vec<(usize, Vec<Vec<f32>>)> = Vec::new();
1288
1289            for (idx, chunk) in texts.chunks(32).enumerate() {
1290                if set.len() >= k {
1291                    if let Some(joined) = set.join_next().await {
1292                        let (cidx, res) = joined.map_err(|e| {
1293                            AppError::Embedding(format!("embedding task join error: {e}"))
1294                        })?;
1295                        parts.push((cidx, res?));
1296                    }
1297                }
1298                let owned: Vec<String> = chunk.to_vec();
1299                set.spawn(async move {
1300                    let refs: Vec<&str> = owned.iter().map(|s| s.as_str()).collect();
1301                    // `EmbedChunkResult` carries `AppError` (retry_class is
1302                    // only consumed by callers that match `EmbedError`
1303                    // directly, e.g. the enrich re-embed path).
1304                    let r = client
1305                        .embed_batch(&refs, client.default_input_type())
1306                        .await
1307                        .map_err(AppError::from);
1308                    (idx, r)
1309                });
1310            }
1311
1312            while let Some(joined) = set.join_next().await {
1313                let (cidx, res) = joined
1314                    .map_err(|e| AppError::Embedding(format!("embedding task join error: {e}")))?;
1315                parts.push((cidx, res?));
1316            }
1317
1318            Ok::<Vec<Vec<f32>>, AppError>(reassemble_ordered(parts))
1319        };
1320        let vecs = match tokio::runtime::Handle::try_current() {
1321            Ok(handle) => tokio::task::block_in_place(|| handle.block_on(fan_out))?,
1322            Err(_) => shared_runtime()?.block_on(fan_out)?,
1323        };
1324        Ok(vecs)
1325    } else {
1326        embed_passages_parallel_local(models_dir, texts, parallelism, batch_size)
1327    }
1328}
1329
/// G56: in-process cache for entity embeddings keyed by `(model, text)`.
///
/// Schema v13 is immutable: `entity_embeddings` does not have a `text`
/// column, so a pure DB-side cache would require a schema bump. Instead
/// we keep a process-wide map that survives within one CLI
/// invocation. The hit rate is high in `ingest` (re-embedding the same
/// canonical entity across thousands of memories) and modest in `remember`
/// (typical single-memory invocations).
///
/// Key: the first 8 bytes of `blake3(model || "\0" || text)` read as a
/// little-endian `u64` (see `entity_cache_key`). Value: `Arc<Vec<f32>>` so
/// the collector can drop the map entry while a `Vec` is still in flight.
///
/// NOTE(review): this map was previously described as "LRU-style", but no
/// eviction is visible next to this definition — confirm whether entries
/// are ever pruned.
type EntityEmbedCacheMap = std::collections::HashMap<u64, Arc<Vec<f32>>>;

// Backing storage for the entity-embedding cache. Starts empty
// (`OnceLock::new()`); `entity_embed_cache` initialises it on first use.
static ENTITY_EMBED_CACHE: OnceLock<parking_lot::Mutex<EntityEmbedCacheMap>> = OnceLock::new();
1344
1345fn entity_embed_cache() -> &'static parking_lot::Mutex<EntityEmbedCacheMap> {
1346    ENTITY_EMBED_CACHE.get_or_init(|| parking_lot::Mutex::new(std::collections::HashMap::new()))
1347}
1348
1349fn entity_cache_key(model: &str, text: &str) -> u64 {
1350    let mut hasher = blake3::Hasher::new();
1351    hasher.update(model.as_bytes());
1352    hasher.update(b"\0");
1353    hasher.update(text.as_bytes());
1354    let h = hasher.finalize();
1355    let bytes = h.as_bytes();
1356    u64::from_le_bytes([
1357        bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7],
1358    ])
1359}
1360
1361/// G56: embeds entity-name texts through a process-wide cache.
1362///
1363/// Skips any `(model, text)` pair already produced in this CLI invocation
1364/// and only spawns subprocesses for the cache misses. Returns vectors in
1365/// the same order as `texts`.
1366///
1367/// Designed for entity-name batches (short texts). For chunk embeds use
1368/// [`embed_passages_parallel_local`] directly — chunks are unique per
1369/// memory and cache hit rate is negligible.
1370pub fn embed_entity_texts_cached(
1371    models_dir: &Path,
1372    texts: &[String],
1373    parallelism: usize,
1374    embedding_backend: crate::cli::EmbeddingBackendChoice,
1375    llm_backend: crate::cli::LlmBackendChoice,
1376) -> Result<(Vec<Vec<f32>>, EmbedCacheStats), AppError> {
1377    if texts.is_empty() {
1378        return Ok((Vec::new(), EmbedCacheStats::default()));
1379    }
1380    // GAP-OR-ENTITY-EMBED: resolve the SAME chain the chunk path uses so the
1381    // entity embedding honours `--embedding-backend`/`--llm-backend` instead
1382    // of always forcing the codex subprocess (the old G56 code path).
1383    let chain = embedding_backend.to_chain(llm_backend);
1384
1385    // `none` short-circuit: when the resolved chain is exactly `[None]`
1386    // (`--embedding-backend llm --llm-backend none`) skip every backend and
1387    // return empty vectors WITHOUT spawning a subprocess. Empties are never
1388    // cached so a later call with a real backend in the same process is not
1389    // poisoned; they count as misses for stats parity with the chunk path.
1390    if chain.as_slice() == [LlmBackendKind::None] {
1391        let out: Vec<Vec<f32>> = texts.iter().map(|_| Vec::new()).collect();
1392        return Ok((
1393            out,
1394            EmbedCacheStats {
1395                requested: texts.len(),
1396                hits: 0,
1397                misses: texts.len(),
1398            },
1399        ));
1400    }
1401
1402    // Cache model label reflects the EFFECTIVE embedding backend. When the
1403    // chain actually routes through OpenRouter, vectors carry that model's
1404    // dim/MRL profile and must never collide with codex-produced vectors;
1405    // for the local path we keep the prior `model_label()` so the in-process
1406    // cache key is unchanged (no regression — this cache is process-local).
1407    let routed_openrouter =
1408        chain.first() == Some(&LlmBackendKind::OpenRouter) && is_openrouter_initialized();
1409    let model = if routed_openrouter {
1410        format!("openrouter:{}", crate::constants::embedding_dim())
1411    } else {
1412        get_embedder(models_dir)?.lock().model_label()
1413    };
1414    let cache = entity_embed_cache();
1415    let mut hits: Vec<Option<Arc<Vec<f32>>>> = vec![None; texts.len()];
1416    let mut miss_indices: Vec<usize> = Vec::with_capacity(texts.len());
1417    {
1418        let guard = cache.lock();
1419        for (i, text) in texts.iter().enumerate() {
1420            let key = entity_cache_key(&model, text);
1421            if let Some(v) = guard.get(&key) {
1422                hits[i] = Some(Arc::clone(v));
1423            } else {
1424                miss_indices.push(i);
1425            }
1426        }
1427    }
1428    let miss_count = miss_indices.len();
1429    if miss_count > 0 {
1430        let miss_texts: Vec<String> = miss_indices.iter().map(|&i| texts[i].clone()).collect();
1431        // GAP-OR-ENTITY-EMBED: route misses through the backend-aware batch
1432        // helper (same one the chunk path uses). With OpenRouter this hits the
1433        // REST `embed_batch` (~200ms) instead of the codex subprocess (~120s).
1434        let miss_vecs = embed_passages_parallel_with_embedding_choice(
1435            models_dir,
1436            &miss_texts,
1437            parallelism,
1438            entity_embed_batch_size(),
1439            embedding_backend,
1440            llm_backend,
1441        )?;
1442        let mut guard = cache.lock();
1443        for (slot, &orig_idx) in miss_indices.iter().enumerate() {
1444            let vec = Arc::new(miss_vecs[slot].clone());
1445            let key = entity_cache_key(&model, &texts[orig_idx]);
1446            guard.insert(key, Arc::clone(&vec));
1447            hits[orig_idx] = Some(vec);
1448        }
1449    }
1450    let mut out = Vec::with_capacity(texts.len());
1451    for hit in hits.into_iter() {
1452        let v = hit.ok_or_else(|| {
1453            AppError::Embedding("entity embed cache produced null result".to_string())
1454        })?;
1455        out.push((*v).clone());
1456    }
1457    Ok((
1458        out,
1459        EmbedCacheStats {
1460            requested: texts.len(),
1461            hits: texts.len() - miss_count,
1462            misses: miss_count,
1463        },
1464    ))
1465}
1466
/// G56: stats snapshot returned by [`embed_entity_texts_cached`].
///
/// Describes a single call: how many texts were asked for and how they
/// split between cache hits and misses.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, serde::Serialize)]
pub struct EmbedCacheStats {
    /// Number of texts passed to the call.
    pub requested: usize,
    /// Texts whose vector was already present in the cache.
    pub hits: usize,
    /// Texts that were not found in the cache (all of them when the
    /// backend chain is `none`).
    pub misses: usize,
}
1474
1475impl EmbedCacheStats {
1476    /// Hit rate as a fraction in `[0.0, 1.0]`. Returns 0.0 when nothing was requested.
1477    pub fn hit_rate(&self) -> f64 {
1478        if self.requested == 0 {
1479            0.0
1480        } else {
1481            self.hits as f64 / self.requested as f64
1482        }
1483    }
1484}
1485
1486/// G42/S3 core: bounded parallel batch embedding.
1487///
1488/// - texts are grouped into batches of `batch_size` (one LLM call per
1489///   batch, G42/S2);
1490/// - at most `effective_permits(parallelism)` LLM subprocesses run
1491///   simultaneously (`Arc<Semaphore>` + `acquire_owned`, BLOCO 2);
1492/// - results stream through a BOUNDED mpsc channel so the caller-side
1493///   collector applies backpressure and can persist incrementally
1494///   (BLOCO 5);
1495/// - the global `CancellationToken` aborts in-flight work on the first
1496///   signal; subprocesses die with their futures via `kill_on_drop`
1497///   (BLOCO 6).
1498pub fn embed_texts_parallel(
1499    embedder: &Mutex<LlmEmbedding>,
1500    texts: &[String],
1501    parallelism: usize,
1502    batch_size: usize,
1503) -> Result<Vec<Vec<f32>>, AppError> {
1504    let mut slots: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
1505    embed_texts_parallel_with(embedder, texts, parallelism, batch_size, |idx, v| {
1506        slots[idx] = Some(v.to_vec());
1507        Ok(())
1508    })?;
1509    let mut out = Vec::with_capacity(slots.len());
1510    for (idx, slot) in slots.into_iter().enumerate() {
1511        out.push(slot.ok_or_else(|| {
1512            AppError::Embedding(format!("embedding fan-out lost item index {idx}"))
1513        })?);
1514    }
1515    Ok(out)
1516}
1517
1518/// Like [`embed_texts_parallel`] but invokes `on_result` as soon as each
1519/// embedding arrives (BLOCO 5: incremental persistence — a kill loses at
1520/// most the in-flight batches, never the already-delivered items).
1521pub fn embed_texts_parallel_with(
1522    embedder: &Mutex<LlmEmbedding>,
1523    texts: &[String],
1524    parallelism: usize,
1525    batch_size: usize,
1526    mut on_result: impl FnMut(usize, &[f32]) -> Result<(), AppError>,
1527) -> Result<(), AppError> {
1528    if texts.is_empty() {
1529        return Ok(());
1530    }
1531    let dim = crate::constants::embedding_dim();
1532    if texts.len() == 1 {
1533        let v = embed_passage(embedder, &texts[0])?;
1534        return on_result(0, &v);
1535    }
1536
1537    let client = clone_client(embedder);
1538    let permits = effective_permits(parallelism);
1539    let batches = build_batches(texts, batch_size.max(1));
1540    let token = crate::cancel_token().clone();
1541
1542    let work = move |batch: Vec<(usize, String)>| {
1543        let client = client.clone();
1544        async move {
1545            client
1546                .embed_batch_async(crate::constants::PASSAGE_PREFIX, &batch)
1547                .await
1548        }
1549    };
1550
1551    let fan_out = run_bounded(batches, permits, dim, token, work, &mut on_result);
1552    match tokio::runtime::Handle::try_current() {
1553        Ok(handle) => tokio::task::block_in_place(|| handle.block_on(fan_out)),
1554        Err(_) => shared_runtime()?.block_on(fan_out),
1555    }
1556}
1557
/// Groups `(global_index, text)` pairs into batches of `batch_size`.
///
/// `global_index` is the position of the text in `texts`, so results can
/// be matched back to their input regardless of which batch carried them.
/// Every batch holds `batch_size` items except possibly the last one. A
/// `batch_size` of zero is treated as one instead of panicking. Each text
/// is cloned exactly once.
fn build_batches(texts: &[String], batch_size: usize) -> Vec<Vec<(usize, String)>> {
    let batch_size = batch_size.max(1);
    texts
        .chunks(batch_size)
        .enumerate()
        .map(|(batch_no, chunk)| {
            let base = batch_no * batch_size;
            chunk
                .iter()
                .enumerate()
                .map(|(offset, text)| (base + offset, text.clone()))
                .collect()
        })
        .collect()
}
1569
1570/// G42/S3 BLOCO 2: effective permit count.
1571///
1572/// `permits = clamp(requested, 1, 32) ∧ cpus ∧ ram_livre*0.5/RSS` — see
1573/// the module docs for the measured RSS rationale.
1574pub fn effective_permits(requested: usize) -> usize {
1575    let cpus = std::thread::available_parallelism()
1576        .map(|n| n.get())
1577        .unwrap_or(4);
1578    let by_ram = ((crate::memory_guard::available_memory_mb() / 2)
1579        / crate::constants::LLM_WORKER_RSS_MB)
1580        .max(1) as usize;
1581    requested.clamp(1, 32).min(cpus).min(by_ram).max(1)
1582}
1583
/// Bounded fan-out engine. Generic over the per-batch work so the
/// concurrency contract is testable without spawning real LLMs.
///
/// One task per batch is spawned on a `JoinSet`. Each task waits for a
/// semaphore permit (the semaphore is created with `permits` permits),
/// runs `work(batch)` and sends the outcome through a bounded channel of
/// capacity `permits * 2`. The collector loop in this function checks
/// every returned vector against `dim` and passes it to `on_result`.
///
/// The first error — a failed batch, a dimension mismatch or an
/// `on_result` failure — is remembered, the `JoinSet` is shut down, and
/// that error is returned once the channel has drained. Batches received
/// after the first error are counted but their items are not delivered.
///
/// Cancel safety (BLOCK 6/10): every task races its work against
/// `token.cancelled()` inside `tokio::select!`; both branches are
/// cancel-safe (the work future owns its subprocess via `kill_on_drop`,
/// and `cancelled()` is pure). On collector-side errors the `JoinSet`
/// is shut down, which drops in-flight futures and kills their
/// subprocesses.
// NOTE(review): `permits` is assumed to be >= 1 (the visible caller passes
// `effective_permits(..)`); zero would request a zero-capacity channel and
// a semaphore without permits — confirm no other caller can pass 0.
async fn run_bounded<F, Fut>(
    batches: Vec<Vec<(usize, String)>>,
    permits: usize,
    dim: usize,
    token: CancellationToken,
    work: F,
    on_result: &mut impl FnMut(usize, &[f32]) -> Result<(), AppError>,
) -> Result<(), AppError>
where
    F: Fn(Vec<(usize, String)>) -> Fut + Clone + Send + 'static,
    Fut: std::future::Future<Output = Result<Vec<(usize, Vec<f32>)>, AppError>> + Send,
{
    let total_batches = batches.len();
    let semaphore = Arc::new(Semaphore::new(permits));
    // BLOCK 5: bounded channel — producers block when the collector is
    // behind (backpressure); an unbounded_channel between stages is
    // FORBIDDEN.
    let (tx, mut rx) = mpsc::channel::<Result<Vec<(usize, Vec<f32>)>, AppError>>(permits * 2);
    let mut set: JoinSet<()> = JoinSet::new();

    for (batch_idx, batch) in batches.into_iter().enumerate() {
        // Per-task handles: each spawned future owns its own clones.
        let sem = Arc::clone(&semaphore);
        let token = token.clone();
        let tx = tx.clone();
        let work = work.clone();
        set.spawn(async move {
            let wait_start = std::time::Instant::now();
            // acquire_owned: RAII permit moved into the task; returned
            // on every exit path INCLUDING panic (BLOCK 2).
            let Ok(_permit) = sem.acquire_owned().await else {
                let _ = tx
                    .send(Err(AppError::Embedding("semaphore closed".to_string())))
                    .await;
                return;
            };
            let permit_wait_ms = wait_start.elapsed().as_millis() as u64;
            let work_start = std::time::Instant::now();
            // ADR-0034: when `SQLITE_GRAPHRAG_IGNORE_SHUTDOWN=1` is set the
            // cancellation arm is dropped and the batch runs to completion.
            // This unblocks audit/test invocations whose `SHUTDOWN` flag was
            // contaminated by an earlier signal handler in the same process
            // tree. Production code never sees this branch.
            let outcome = if crate::should_obey_shutdown() {
                tokio::select! {
                    res = work(batch) => res,
                    _ = token.cancelled() => Err(AppError::Embedding(
                        "embedding cancelled by shutdown signal".to_string(),
                    )),
                }
            } else {
                work(batch).await
            };
            // BLOCK 8: permit wait time logged SEPARATELY from work time.
            tracing::debug!(
                target: "embedding",
                batch_idx,
                permit_wait_ms,
                work_ms = work_start.elapsed().as_millis() as u64,
                ok = outcome.is_ok(),
                "embedding batch finished"
            );
            // A send error means the receiver is gone; nothing is left to
            // notify, so the result is deliberately ignored.
            let _ = tx.send(outcome).await;
        });
    }
    // Drop the original sender so `rx.recv()` yields `None` once every
    // task-owned clone has been dropped.
    drop(tx);

    // Counters feed the summary log event only; `first_error` decides the
    // return value.
    let mut completed = 0usize;
    let mut failed = 0usize;
    let mut cancelled = 0usize;
    let mut first_error: Option<AppError> = None;

    while let Some(message) = rx.recv().await {
        match message {
            Ok(items) => {
                completed += 1;
                // After the first error, later batches are only counted.
                if first_error.is_none() {
                    for (idx, v) in items {
                        if v.len() != dim {
                            first_error = Some(AppError::Embedding(format!(
                                "LLM returned {} dims for item {idx}, expected {dim}; \
                                 refusing to truncate or pad silently (G42/C5)",
                                v.len()
                            )));
                            break;
                        }
                        if let Err(e) = on_result(idx, &v) {
                            first_error = Some(e);
                            break;
                        }
                    }
                    if first_error.is_some() {
                        // Abort remaining work: dropped futures kill
                        // their subprocesses via kill_on_drop (BLOCK 6).
                        set.shutdown().await;
                    }
                }
            }
            Err(e) => {
                // Cancellation is recognised by its message text, matching
                // the "cancelled" wording produced in the task above.
                if matches!(&e, AppError::Embedding(msg) if msg.contains("cancelled")) {
                    cancelled += 1;
                } else {
                    failed += 1;
                }
                if first_error.is_none() {
                    first_error = Some(e);
                    set.shutdown().await;
                }
            }
        }
    }

    // Drain the JoinSet: surface panics distinctly (panic handling —
    // JoinError::is_panic is handled on every join_next, BLOCK 9).
    while let Some(join_result) = set.join_next().await {
        if let Err(join_err) = join_result {
            if join_err.is_panic() {
                failed += 1;
                if first_error.is_none() {
                    first_error = Some(AppError::Embedding(format!(
                        "embedding task panicked: {join_err}"
                    )));
                }
            } else {
                cancelled += 1;
            }
        }
    }

    // v1.0.85 (ADR-0043 hygiene): the fan-out summary event moved
    // from `tracing::info!` to `tracing::debug!` and the
    // `available_permits` field was removed — the user prohibited
    // pool-state telemetry (slot_pool_stats / slot_wait_ms) and
    // decorative `tracing::info!` events. The remaining counters
    // (total_batches / completed / failed / cancelled) describe the
    // progress of the operation itself, not the slot pool, and
    // remain visible to operators running with `RUST_LOG=debug` or
    // `-vvv`.
    tracing::debug!(
        target: "embedding",
        total_batches,
        completed,
        failed,
        cancelled,
        "embedding fan-out finished"
    );

    match first_error {
        Some(e) => Err(e),
        None => Ok(()),
    }
}
1743
/// Serialises a vector of `f32` into a flat byte buffer.
///
/// Each value contributes its four little-endian bytes, in input order,
/// so the output length is `v.len() * 4`.
pub fn f32_to_bytes(v: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(v.len() * 4);
    bytes.extend(v.iter().flat_map(|value| value.to_le_bytes()));
    bytes
}
1751
/// Decodes a flat byte buffer into `f32` values.
///
/// Every complete group of four bytes is read as one little-endian
/// `f32`. Trailing bytes that do not fill a whole group are ignored.
pub fn bytes_to_f32(bytes: &[u8]) -> Vec<f32> {
    bytes
        .chunks_exact(4)
        .map(|group| {
            let mut word = [0u8; 4];
            word.copy_from_slice(group);
            f32::from_le_bytes(word)
        })
        .collect()
}
1759
/// Returns the dimensionality of the embedding space. Used to
/// validate LLM responses and to size the in-memory cache.
///
/// Delegates to `crate::constants::embedding_dim`; this function adds no
/// logic of its own.
pub fn embedding_dim() -> usize {
    crate::constants::embedding_dim()
}
1765
1766/// G42/C5: a vector with a divergent dimensionality is an ERROR, never
1767/// silently truncated or zero-padded (the pre-v1.0.79 `normalise_dim`
1768/// masked malformed LLM responses).
1769fn validate_dim(v: Vec<f32>) -> Result<Vec<f32>, AppError> {
1770    let dim = crate::constants::embedding_dim();
1771    if v.len() != dim {
1772        return Err(AppError::Embedding(format!(
1773            "embedding has {} dims, expected {dim}; \
1774             refusing to truncate or pad silently (G42/C5)",
1775            v.len()
1776        )));
1777    }
1778    Ok(v)
1779}
1780
1781#[cfg(test)]
1782#[path = "embedder_tests.rs"]
1783mod tests;
1784
1785
1786// =============================================================================
1787// v1.0.82 (GAP-005) — embed_with_fallback tests
1788// =============================================================================
1789#[cfg(test)]
1790#[path = "embedder_fallback_tests.rs"]
1791mod embed_with_fallback_tests;
sqlite_graphrag/embedder.rs

sqlite_graphrag/
embedder.rs