Skip to main content

lunaris_embed/
fastembed.rs

1//! `FastembedEmbedder` — ONNX-backed EmbeddingGemma 300M via fastembed-rs.
2//!
3//! ## v0 forward-pass strategy
4//!
5//! Where [`crate::candle_gemma::CandleEmbeddingGemma`] reads the
6//! `embed_tokens.weight` matrix and mean-pools the first-layer token
7//! embeddings (a pragmatic "lexical" shortcut), this backend runs the
8//! **full ONNX forward pass** of `EmbeddingGemma300M` via `fastembed::TextEmbedding`
9//! (which sits on top of `ort` 2.x — the ONNX Runtime Rust binding). The graph
10//! emits `sentence_embedding` already mean-pooled inside the ONNX model; we
11//! defensively L2-normalise on the way out because the graph is NOT guaranteed
12//! to emit unit vectors across all model variants, and Moon `FT.SEARCH` cosine
13//! distance requires unit-norm rows.
14//!
15//! Weights auto-download on first call to `FastembedEmbedder::new` via
16//! `hf-hub` (TLS-enforced — `hf-hub-native-tls` feature). Cache directory
17//! defaults to `~/.cache/lunaris/models/fastembed/` (shares parent with the
18//! candle path's `~/.cache/lunaris/models/embedding-gemma-300m/`, so
19//! `rm -rf ~/.cache/lunaris/models/` wipes both backends in one go).
20//!
21//! ## `&mut self` -> `&self` adapter
22//!
23//! `fastembed::TextEmbedding::embed` is `&mut self` and synchronous (CPU-bound
24//! ORT call). The [`Embedder`] trait is `&self` and async. We bridge with
25//! `Arc<Inner { Mutex<TextEmbedding> }>`:
26//! - The Mutex is `parking_lot::Mutex` (CLAUDE.md lock discipline — never
27//!   `std::sync::Mutex` for new code).
28//! - The lock is acquired **inside** `tokio::task::spawn_blocking`, never held
29//!   across `.await` (CLAUDE.md: "never hold a lock across `.await`").
30//! - This serializes concurrent `embed_batch` calls per `FastembedEmbedder`
31//!   instance. That's fine: fastembed batches internally at `batch_size = 256`
32//!   and Lunaris ingest is single-writer-per-scope, so the Mutex never
33//!   meaningfully contends. Concurrent readers wanting parallelism construct
34//!   multiple `FastembedEmbedder` instances (one ORT session per instance).
35//!
36//! ## Defensive L2-normalize
37//!
//! Each output row is L2-normalised on the host side, matching the candle
//! path's invariant (and the trait-level expectation). If `l2 <= f64::EPSILON`
//! (a degenerate all-zeros graph output for an empty/pad-only input) we return
//! the row unchanged — same behaviour as `candle_gemma.rs`.
42//!
43//! ## Failure modes
44//!
45//! | Condition                                              | Returned error                                                                     |
46//! |--------------------------------------------------------|-------------------------------------------------------------------------------------|
47//! | HF Hub download failure (no network, 4xx, TLS)         | `LunarisError::Storage(StorageError::Backend("fastembed: ..."))` (anyhow rewrap)    |
48//! | ORT session init failure (corrupt cache, bad ONNX)     | `LunarisError::Storage(StorageError::Backend("fastembed: ..."))`                    |
49//! | `TextEmbedding::embed` call failure (tokenizer, ORT)   | `LunarisError::Storage(StorageError::Backend("fastembed: ..."))`                    |
50//! | `tokio::task::spawn_blocking` join failure (panic)     | `LunarisError::Storage(StorageError::Backend("fastembed join: ..."))`               |
//! | Any output row width ≠ the embedder's `dim()`          | `LunarisError::Storage(StorageError::Backend("fastembed: dim mismatch ..."))`       |
52//! | Mutex poisoned                                         | Cannot occur — `parking_lot::Mutex` is poison-free by design.                       |
53
54use std::path::PathBuf;
55use std::sync::Arc;
56
57use async_trait::async_trait;
58use fastembed::{
59    EmbeddingModel, InitOptions, InitOptionsUserDefined, Pooling, QuantizationMode, TextEmbedding,
60    TokenizerFiles, UserDefinedEmbeddingModel,
61};
62use lunaris_core::{Embedder, LunarisError, StorageError};
63use parking_lot::Mutex;
64
/// Width of every vector emitted by `EmbeddingGemma300M`: always 768. Kept
/// equal to [`crate::candle_gemma::EMBEDDING_GEMMA_DIM`] so either backend can
/// stand in for the other behind the `Embedder` trait.
pub const FASTEMBED_GEMMA_DIM: usize = 768;

/// Token budget per input (the EmbeddingGemma context window). Same value as
/// [`crate::candle_gemma::EMBEDDING_GEMMA_MAX_TOKENS`]; unlike the candle
/// path, no host-side truncation is performed here because fastembed's
/// tokenizer wrapper truncates for us.
pub const FASTEMBED_GEMMA_MAX_TOKENS: usize = 2048;

/// Name of the environment variable that replaces the default fastembed
/// cache directory. Follows the `LUNARIS_OLLAMA_URL` / `LUNARIS_OLLAMA_MODEL`
/// override convention from the feature-gated `crate::ollama` module.
pub const FASTEMBED_CACHE_DIR_ENV: &str = "LUNARIS_FASTEMBED_CACHE_DIR";
80
81// Phase 20 Plan 20-01 — execution-provider plumbing lives in a sibling module
82// to keep this file under the project's split threshold. Re-exported so the
83// public API surface (`lunaris_embed::fastembed::ExecutionPreference`) stays
84// unchanged for downstream callers.
85pub use crate::fastembed_exec::{
86    ExecutionPreference, FASTEMBED_EXECUTION_ENV, execution_from_env, parse_execution,
87};
88use crate::fastembed_exec::{build_execution_providers, requests_accelerator};
89
90/// Construction options for [`FastembedEmbedder`].
91///
92/// `Default` resolves `cache_dir` in priority order:
93/// 1. `$LUNARIS_FASTEMBED_CACHE_DIR` if set (operator-controllable for CI / sandboxes);
94/// 2. `~/.cache/lunaris/models/fastembed/` (shares parent with the candle cache);
95/// 3. `./lunaris/models/fastembed/` as a last-ditch fallback when `dirs::cache_dir`
96///    returns `None` (rare — only on platforms without a HOME concept).
97///
98/// `show_download_progress` defaults to `false` so server processes don't
99/// spew progress bars into structured logs. Set `true` for local CLI use.
100#[derive(Clone, Debug)]
101pub struct FastembedEmbedderOpts {
102    /// Filesystem path where fastembed stores auto-downloaded ONNX weights.
103    /// `None` means "resolve via the env-override → `dirs::cache_dir()` chain
104    /// at `Default` time"; once `Default` runs this is always `Some(...)`.
105    pub cache_dir: Option<PathBuf>,
106    /// Forwarded to `fastembed::InitOptions::with_show_download_progress`.
107    /// Default `false` to keep server logs clean.
108    pub show_download_progress: bool,
109    /// ORT execution-provider preference (Phase 20 Plan 20-01). `Default`
110    /// reads `$LUNARIS_FASTEMBED_EXECUTION`; unknown values resolve to `Cpu`
111    /// with a `tracing::warn`. Set programmatically when callers want to
112    /// override the environment.
113    pub execution: ExecutionPreference,
114}
115
116impl Default for FastembedEmbedderOpts {
117    fn default() -> Self {
118        Self {
119            cache_dir: Some(resolve_default_cache_dir()),
120            show_download_progress: false,
121            execution: execution_from_env(),
122        }
123    }
124}
125
126/// Resolve the default fastembed cache directory. See
127/// [`FastembedEmbedderOpts`] doc for the precedence chain.
128fn resolve_default_cache_dir() -> PathBuf {
129    if let Ok(env_dir) = std::env::var(FASTEMBED_CACHE_DIR_ENV)
130        && !env_dir.is_empty()
131    {
132        return PathBuf::from(env_dir);
133    }
134    let cache_root = dirs::cache_dir().unwrap_or_else(|| PathBuf::from("."));
135    cache_root.join("lunaris").join("models").join("fastembed")
136}
137
138/// ONNX-backed `EmbeddingGemma 300M` embedder. See module-level doc for the
139/// adapter strategy and failure-mode table.
140#[derive(Clone)]
141pub struct FastembedEmbedder {
142    /// Cheap-to-clone handle; the heavy ORT session lives inside the `Arc`.
143    inner: Arc<Inner>,
144}
145
146impl std::fmt::Debug for FastembedEmbedder {
147    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
148        f.debug_struct("FastembedEmbedder")
149            .field("dim", &self.inner.dim)
150            .field("cache_dir", &self.inner.cache_dir)
151            .finish()
152    }
153}
154
155struct Inner {
156    /// The ONNX session. `embed` is `&mut self` so the lock IS the
157    /// serialisation point — see module doc.
158    model: Mutex<TextEmbedding>,
159    /// Retained for `Debug` + future operator triage tracing. Not used in the
160    /// hot path. For the user-defined-model path this is `PathBuf::new()`
161    /// (empty) since the operator hands us bytes directly — there is no
162    /// on-disk cache by definition.
163    cache_dir: PathBuf,
164    /// Embedding dimensionality. For the default path this is
165    /// [`FASTEMBED_GEMMA_DIM`]; for the user-defined path it is the
166    /// operator-declared `dim` from [`FastembedUserDefinedOpts`].
167    ///
168    /// Made runtime (rather than the compile-time constant the Phase 19
169    /// implementation read) by Plan 20-01 Task 3 so bring-your-own-model
170    /// callers see their own dim through the [`Embedder`] trait surface.
171    dim: usize,
172}
173
174impl FastembedEmbedder {
175    /// Construct a real ONNX-backed embedder. On first call this triggers an
176    /// HF Hub download of the EmbeddingGemma 300M weights (~600 MB) into
177    /// `opts.cache_dir`; subsequent calls hit the cache.
178    ///
179    /// Construction is **synchronous** because fastembed's `try_new` is
180    /// itself synchronous — the I/O happens inline. Callers that need to
181    /// avoid stalling the runtime should wrap this in
182    /// `tokio::task::spawn_blocking` at the call site; we deliberately do
183    /// NOT wrap inside `new` so the error mapping stays straightforward and
184    /// the caller controls the spawn context.
185    pub fn new(opts: FastembedEmbedderOpts) -> Result<Self, LunarisError> {
186        let cache_dir = opts.cache_dir.unwrap_or_else(resolve_default_cache_dir);
187        let execution = opts.execution.clone();
188
189        // T-19-01-03 mitigation: log model + cache_dir at INFO so operators can
190        // diff env-to-env. Do NOT log inputs anywhere in this module
191        // (T-19-01-04).
192        tracing::info!(
193            backend = "fastembed",
194            model = "EmbeddingGemma300M",
195            cache_dir = %cache_dir.display(),
196            execution = ?execution,
197            "fastembed embedder constructing"
198        );
199
200        let build = |providers_enabled: bool| -> Result<TextEmbedding, anyhow::Error> {
201            let mut init = InitOptions::new(EmbeddingModel::EmbeddingGemma300M)
202                .with_cache_dir(cache_dir.clone())
203                .with_show_download_progress(opts.show_download_progress);
204            if providers_enabled {
205                init = init.with_execution_providers(build_execution_providers(&execution));
206            }
207            TextEmbedding::try_new(init)
208        };
209
210        let model = try_with_fallback(&execution, build)?;
211
212        // Best-effort label: fastembed's `Session` doesn't expose the active
213        // EP, so we report the requested preference here. The fallback path
214        // emits its own `warn` if it kicked in, which is the durable signal
215        // for "you asked for accelerator but got CPU".
216        let resolved = execution.clone();
217        tracing::info!(
218            backend = "fastembed",
219            model = "EmbeddingGemma300M",
220            execution = ?resolved,
221            "fastembed embedder initialized"
222        );
223
224        Ok(Self {
225            inner: Arc::new(Inner {
226                model: Mutex::new(model),
227                cache_dir,
228                dim: FASTEMBED_GEMMA_DIM,
229            }),
230        })
231    }
232
233    /// Bring-your-own ONNX model (Phase 20 Plan 20-01). The operator supplies
234    /// the model bytes + tokenizer bytes in [`FastembedUserDefinedOpts`] and
235    /// declares the output dimensionality (`dim`); the constructor wires
236    /// fastembed's [`UserDefinedEmbeddingModel`] / `InitOptionsUserDefined`
237    /// and returns a ready embedder.
238    ///
239    /// # Trust requirement
240    ///
241    /// The ONNX bytes execute in-process through ONNX Runtime. They MUST come
242    /// from a trusted source (operator-controlled model registry, not
243    /// user-uploaded content) — Lunaris performs no graph validation. See
244    /// `.planning/phases/20-fastembed-adoption/20-01-PLAN.md` threat
245    /// `T-20-01-01`.
246    ///
247    /// # Storage-dim constraint
248    ///
249    /// Lunaris's default storage schema is **768-d** (Moon FT index + Postgres
250    /// `vector(768)` column). Operators bringing a model whose `dim != 768`
251    /// MUST also reindex storage — this is the storage-side migration covered
252    /// by Plan 20-03. Lunaris does NOT enforce dim parity between embedder
253    /// and storage on the hot path; a mismatch surfaces as a backend insert
254    /// error at first ingest.
255    ///
256    /// # Example
257    ///
258    /// ```no_run
259    /// use std::sync::Arc;
260    /// use lunaris_embed::fastembed::{
261    ///     FastembedEmbedder, FastembedUserDefinedOpts, PoolingMode, ExecutionPreference,
262    /// };
263    ///
264    /// # fn demo() -> Result<(), Box<dyn std::error::Error>> {
265    /// let onnx = std::fs::read("models/helios-finetuned.onnx")?;
266    /// let tok = std::fs::read("models/helios-finetuned/tokenizer.json")?;
267    /// let embedder = FastembedEmbedder::from_user_defined(FastembedUserDefinedOpts {
268    ///     onnx_file: onnx,
269    ///     tokenizer_file: tok,
270    ///     tokenizer_config_file: None,
271    ///     special_tokens_map_file: None,
272    ///     config_file: None,
273    ///     dim: 1024, // MUST match the ONNX model's output dim
274    ///     pooling: PoolingMode::Mean,
275    ///     execution: ExecutionPreference::Cpu,
276    ///     max_length: 2048,
277    /// })?;
278    /// // let lunaris = Lunaris::open(url).await?.with_embedder(Arc::new(embedder));
279    /// let _ = Arc::new(embedder);
280    /// # Ok(()) }
281    /// ```
282    pub fn from_user_defined(opts: FastembedUserDefinedOpts) -> Result<Self, LunarisError> {
283        if opts.onnx_file.is_empty() {
284            return Err(LunarisError::Storage(StorageError::Backend(
285                "fastembed: from_user_defined called with empty onnx_file bytes".to_string(),
286            )));
287        }
288        if opts.tokenizer_file.is_empty() {
289            return Err(LunarisError::Storage(StorageError::Backend(
290                "fastembed: from_user_defined called with empty tokenizer_file bytes".to_string(),
291            )));
292        }
293        if opts.dim == 0 {
294            return Err(LunarisError::Storage(StorageError::Backend(
295                "fastembed: from_user_defined called with dim = 0".to_string(),
296            )));
297        }
298
299        let execution = opts.execution.clone();
300        let dim = opts.dim;
301        let max_length = opts.max_length;
302
303        tracing::info!(
304            backend = "fastembed",
305            model = "user-defined",
306            dim,
307            execution = ?execution,
308            "fastembed user-defined embedder constructing"
309        );
310
311        // The struct is non-`Clone` once we move bytes in. Construct once;
312        // fallback retry below requires a second model — for the user-defined
313        // path we keep buffers around in `Option<...>` so the fallback path
314        // can reuse them without double-copying multi-MB onnx blobs.
315        let user_model = UserDefinedEmbeddingModel {
316            onnx_file: opts.onnx_file,
317            external_initializers: Vec::new(),
318            tokenizer_files: TokenizerFiles {
319                tokenizer_file: opts.tokenizer_file,
320                config_file: opts.config_file.unwrap_or_default(),
321                special_tokens_map_file: opts.special_tokens_map_file.unwrap_or_default(),
322                tokenizer_config_file: opts.tokenizer_config_file.unwrap_or_default(),
323            },
324            pooling: Some(opts.pooling.into()),
325            quantization: QuantizationMode::None,
326            output_key: None,
327        };
328
329        let model = try_user_defined_with_fallback(&execution, user_model, max_length)?;
330
331        // Best-effort label: fastembed's `Session` doesn't expose the active
332        // EP, so we report the requested preference here. The fallback path
333        // emits its own `warn` if it kicked in, which is the durable signal
334        // for "you asked for accelerator but got CPU".
335        let resolved = execution.clone();
336        tracing::info!(
337            backend = "fastembed",
338            model = "user-defined",
339            dim,
340            execution = ?resolved,
341            "fastembed user-defined embedder initialized"
342        );
343
344        Ok(Self {
345            inner: Arc::new(Inner { model: Mutex::new(model), cache_dir: PathBuf::new(), dim }),
346        })
347    }
348}
349
350/// Options for [`FastembedEmbedder::from_user_defined`]. All byte buffers are
351/// moved into the constructor — they aren't retained inside the embedder once
352/// the ONNX session has been built (the session owns its parsed graph).
353///
354/// **Storage-side dim invariant:** see the constructor's rustdoc — `dim` must
355/// match the ONNX model's output AND should match Lunaris's storage schema
356/// (default 768) unless storage is reindexed.
357#[derive(Clone, Debug)]
358pub struct FastembedUserDefinedOpts {
359    /// Raw bytes of the ONNX graph (e.g., `model.onnx`).
360    pub onnx_file: Vec<u8>,
361    /// Raw bytes of the HF-format `tokenizer.json`.
362    pub tokenizer_file: Vec<u8>,
363    /// Optional `tokenizer_config.json` bytes. Empty if `None`.
364    pub tokenizer_config_file: Option<Vec<u8>>,
365    /// Optional `special_tokens_map.json` bytes.
366    pub special_tokens_map_file: Option<Vec<u8>>,
367    /// Optional model `config.json` bytes (architecture metadata).
368    pub config_file: Option<Vec<u8>>,
369    /// Output dimensionality declared by the operator. MUST match what the
370    /// ONNX graph actually emits; a mismatch surfaces as a vector-index
371    /// rejection at the first ingest call.
372    pub dim: usize,
373    /// Pooling strategy applied to token-level embeddings to produce the
374    /// sentence vector. Mirrors fastembed's [`Pooling`] enum.
375    pub pooling: PoolingMode,
376    /// ORT execution provider preference (same enum as the default path).
377    pub execution: ExecutionPreference,
378    /// Token context window. Defaults to 2048 to match `EmbeddingGemma300M`.
379    pub max_length: usize,
380}
381
382impl Default for FastembedUserDefinedOpts {
383    fn default() -> Self {
384        Self {
385            onnx_file: Vec::new(),
386            tokenizer_file: Vec::new(),
387            tokenizer_config_file: None,
388            special_tokens_map_file: None,
389            config_file: None,
390            dim: 0,
391            pooling: PoolingMode::Mean,
392            execution: execution_from_env(),
393            max_length: FASTEMBED_GEMMA_MAX_TOKENS,
394        }
395    }
396}
397
/// Pooling choice exposed by Lunaris, so that callers do not need to depend
/// on the [`fastembed::Pooling`] type directly.
///
/// `Mean` is the recommended setting for sentence-similarity models
/// (EmbeddingGemma and most BGE variants); `Cls` corresponds to fastembed's
/// BERT-style first-token pooling.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum PoolingMode {
    /// BERT-style CLS-token pooling; converts to [`fastembed::Pooling::Cls`].
    Cls,
    /// Attention-mask-weighted mean pooling; converts to
    /// [`fastembed::Pooling::Mean`]. This is the default variant.
    #[default]
    Mean,
}
413
414impl From<PoolingMode> for Pooling {
415    fn from(m: PoolingMode) -> Self {
416        match m {
417            PoolingMode::Cls => Pooling::Cls,
418            PoolingMode::Mean => Pooling::Mean,
419        }
420    }
421}
422
423/// Try the construction closure with execution providers; on failure when an
424/// accelerator was requested, retry once with CPU only and a `tracing::warn`.
425fn try_with_fallback<F>(
426    pref: &ExecutionPreference,
427    mut build: F,
428) -> Result<TextEmbedding, LunarisError>
429where
430    F: FnMut(bool) -> Result<TextEmbedding, anyhow::Error>,
431{
432    let want_accelerator = requests_accelerator(pref);
433    match build(want_accelerator) {
434        Ok(m) => Ok(m),
435        Err(e) if want_accelerator => {
436            // T-20-01-03 mitigation: %e (Display) — don't dump full provider
437            // debug context (which may include driver paths) into logs.
438            tracing::warn!(
439                error = %e,
440                requested = ?pref,
441                "fastembed execution provider init failed, falling back to CPU"
442            );
443            build(false).map_err(anyhow_to_lunaris)
444        }
445        Err(e) => Err(anyhow_to_lunaris(e)),
446    }
447}
448
449/// User-defined variant of [`try_with_fallback`]. Owns the
450/// `UserDefinedEmbeddingModel` so the fallback retry doesn't have to clone
451/// multi-MB byte buffers — fastembed's struct is `Clone`, so we keep the
452/// owned copy in scope and pass clones in.
453fn try_user_defined_with_fallback(
454    pref: &ExecutionPreference,
455    user_model: UserDefinedEmbeddingModel,
456    max_length: usize,
457) -> Result<TextEmbedding, LunarisError> {
458    let want_accelerator = requests_accelerator(pref);
459    let build = |providers_enabled: bool, m: UserDefinedEmbeddingModel| {
460        let mut init = InitOptionsUserDefined::new().with_max_length(max_length);
461        if providers_enabled {
462            init = init.with_execution_providers(build_execution_providers(pref));
463        }
464        TextEmbedding::try_new_from_user_defined(m, init)
465    };
466
467    if want_accelerator {
468        // Keep an unconsumed clone to retry on the CPU path if the accelerator
469        // session-build fails.
470        let retry_model = user_model.clone();
471        match build(true, user_model) {
472            Ok(m) => Ok(m),
473            Err(e) => {
474                tracing::warn!(
475                    error = %e,
476                    requested = ?pref,
477                    "fastembed (user-defined) execution provider init failed, falling back to CPU"
478                );
479                build(false, retry_model).map_err(anyhow_to_lunaris)
480            }
481        }
482    } else {
483        build(false, user_model).map_err(anyhow_to_lunaris)
484    }
485}
486
487#[async_trait]
488impl Embedder for FastembedEmbedder {
489    fn dim(&self) -> usize {
490        // Phase 20 Plan 20-01 Task 3 — read runtime dim from Inner. For the
491        // default `new()` path this is `FASTEMBED_GEMMA_DIM` (768); for the
492        // `from_user_defined` path it is operator-declared.
493        self.inner.dim
494    }
495
496    async fn embed_batch(&self, inputs: &[&str]) -> Result<Vec<Vec<f32>>, LunarisError> {
497        if inputs.is_empty() {
498            return Ok(Vec::new());
499        }
500
501        // Move owned inputs across the spawn_blocking boundary — `&str`
502        // borrows are not `'static` so we have to materialise `String`s.
503        let owned: Vec<String> = inputs.iter().map(|s| (*s).to_string()).collect();
504        let inner = self.inner.clone();
505        let expected_dim = inner.dim;
506
507        tokio::task::spawn_blocking(move || -> Result<Vec<Vec<f32>>, LunarisError> {
508            // Acquire the Mutex INSIDE the blocking closure. CLAUDE.md lock
509            // discipline: never across `.await`. `parking_lot::Mutex` is
510            // poison-free so the unwrap-like `lock()` cannot fail.
511            let raw: Vec<Vec<f32>> = {
512                let mut guard = inner.model.lock();
513                // `None` -> use fastembed's default batch size (256).
514                guard.embed(owned, None).map_err(anyhow_to_lunaris)?
515            }; // guard drops here; subsequent normalisation is lock-free.
516
517            let mut out: Vec<Vec<f32>> = Vec::with_capacity(raw.len());
518            for row in raw.into_iter() {
519                if row.len() != expected_dim {
520                    return Err(LunarisError::Storage(StorageError::Backend(format!(
521                        "fastembed: dim mismatch — model returned {} dims, expected {expected_dim}",
522                        row.len()
523                    ))));
524                }
525                out.push(l2_normalize_row(row, expected_dim));
526            }
527            Ok(out)
528        })
529        .await
530        .map_err(|e| LunarisError::Storage(StorageError::Backend(format!("fastembed join: {e}"))))?
531    }
532}
533
/// Scale `row` to unit L2 length and return it. The norm is accumulated in
/// `f64`. A degenerate row — one whose norm is not greater than
/// `f64::EPSILON`, e.g. all zeros — is handed back untouched, which matches
/// [`crate::candle_gemma`]'s behaviour and avoids dividing by zero.
///
/// `expected_dim` only feeds a debug assertion on the normalised row's
/// length; the function itself works for any width (post Phase 20 Plan
/// 20-01 the user-defined model path may have `dim != 768`).
#[inline]
fn l2_normalize_row(row: Vec<f32>, expected_dim: usize) -> Vec<f32> {
    let norm = row.iter().map(|&x| f64::from(x).powi(2)).sum::<f64>().sqrt();

    if norm > f64::EPSILON {
        let scaled: Vec<f32> =
            row.into_iter().map(|component| (f64::from(component) / norm) as f32).collect();
        debug_assert_eq!(scaled.len(), expected_dim);
        return scaled;
    }

    // Degenerate norm: return the input as-is.
    row
}
555
556/// Bridge `anyhow::Error` (fastembed's error surface) to `LunarisError`.
557/// Mirrors the candle path's `candle_err` helper.
558#[inline]
559fn anyhow_to_lunaris(e: anyhow::Error) -> LunarisError {
560    LunarisError::Storage(StorageError::Backend(format!("fastembed: {e}")))
561}
562
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `from_user_defined` on `opts`, requires it to fail, and returns
    /// the rendered error message. `label` is the `expect_err` panic text.
    fn user_defined_failure(opts: FastembedUserDefinedOpts, label: &str) -> String {
        let err = FastembedEmbedder::from_user_defined(opts).expect_err(label);
        format!("{err}")
    }

    #[test]
    fn opts_default_resolves_to_cache_subdir() {
        // An externally set override (shell or sibling test) is honoured by
        // `Default` by contract, so there is nothing to check in that case.
        // The variable is not cleared here because `std::env::remove_var` is
        // unsafe in edition 2024; only the path's expected components are
        // asserted when no override is active.
        if std::env::var(FASTEMBED_CACHE_DIR_ENV).is_ok() {
            return;
        }
        let path = FastembedEmbedderOpts::default().cache_dir.expect("default sets a cache_dir");
        let s = path.to_string_lossy().into_owned();
        let has_layout = ["lunaris", "models", "fastembed"].iter().all(|part| s.contains(*part));
        assert!(has_layout, "default cache_dir should include the v0 cache layout, got: {s}");
    }

    #[test]
    fn dim_constant_is_768() {
        assert_eq!(FASTEMBED_GEMMA_DIM, 768);
    }

    #[test]
    fn l2_normalize_unit_vector() {
        // A 3-4-0-… row has norm 5; after normalisation the norm must be ≈ 1
        // and the two non-zero components ≈ 0.6 and 0.8.
        let mut row = vec![0.0_f32; FASTEMBED_GEMMA_DIM];
        row[..2].copy_from_slice(&[3.0, 4.0]);
        let out = l2_normalize_row(row, FASTEMBED_GEMMA_DIM);
        let l2 = out.iter().map(|&x| f64::from(x).powi(2)).sum::<f64>().sqrt();
        assert!((l2 - 1.0).abs() < 1e-6, "expected unit norm, got {l2}");
        assert!((out[0] - 0.6).abs() < 1e-6);
        assert!((out[1] - 0.8).abs() < 1e-6);
    }

    #[test]
    fn l2_normalize_degenerate_row_returned_as_is() {
        // An all-zero row has a degenerate norm and must come back unchanged
        // (same as candle_gemma).
        let out = l2_normalize_row(vec![0.0_f32; FASTEMBED_GEMMA_DIM], FASTEMBED_GEMMA_DIM);
        assert_eq!(out.len(), FASTEMBED_GEMMA_DIM);
        assert!(out.iter().all(|&x| x == 0.0));
    }

    // ---- Phase 20 Plan 20-01 ------------------------------------------------
    // ExecutionPreference + parse_execution are tested next to their
    // implementation in `crate::fastembed_exec`. The tests below cover the
    // parts of Plan 20-01 specific to the embedder construction surface:
    // from_user_defined error paths + PoolingMode mapping.

    #[test]
    fn from_user_defined_empty_onnx_returns_actionable_error() {
        // Empty ONNX bytes are rejected BEFORE fastembed/ORT is called, so
        // this runs offline. The message MUST contain `"fastembed"` so that
        // operators can grep for it.
        let opts = FastembedUserDefinedOpts {
            onnx_file: Vec::new(),
            tokenizer_file: vec![0u8; 4],
            dim: 768,
            ..Default::default()
        };
        let msg = user_defined_failure(opts, "empty onnx");
        assert!(
            msg.contains("fastembed") && msg.contains("onnx_file"),
            "unexpected error message: {msg}"
        );
    }

    #[test]
    fn from_user_defined_empty_tokenizer_returns_actionable_error() {
        let opts = FastembedUserDefinedOpts {
            onnx_file: vec![0u8; 4],
            tokenizer_file: Vec::new(),
            dim: 768,
            ..Default::default()
        };
        let msg = user_defined_failure(opts, "empty tokenizer");
        assert!(
            msg.contains("fastembed") && msg.contains("tokenizer_file"),
            "unexpected error message: {msg}"
        );
    }

    #[test]
    fn from_user_defined_zero_dim_returns_actionable_error() {
        let opts = FastembedUserDefinedOpts {
            onnx_file: vec![0u8; 4],
            tokenizer_file: vec![0u8; 4],
            dim: 0,
            ..Default::default()
        };
        let msg = user_defined_failure(opts, "zero dim");
        assert!(msg.contains("fastembed") && msg.contains("dim"), "unexpected: {msg}");
    }

    #[test]
    fn from_user_defined_bad_onnx_bytes_surfaces_fastembed_error() {
        // Non-empty garbage gets past the front-door validation and is
        // rejected by fastembed/ORT itself. The resulting error MUST be a
        // `LunarisError::Storage(StorageError::Backend(..))` whose message
        // contains `"fastembed"`.
        let opts = FastembedUserDefinedOpts {
            onnx_file: b"not-a-real-onnx-graph".to_vec(),
            tokenizer_file: b"not-a-real-tokenizer".to_vec(),
            dim: 768,
            ..Default::default()
        };
        let msg = user_defined_failure(opts, "bad bytes");
        assert!(msg.contains("fastembed"), "expected fastembed-prefixed error, got: {msg}");
    }

    #[test]
    fn pooling_mode_maps_to_fastembed_pooling() {
        assert!(matches!(Pooling::from(PoolingMode::Cls), Pooling::Cls));
        assert!(matches!(Pooling::from(PoolingMode::Mean), Pooling::Mean));
    }
}
697
// -----------------------------------------------------------------------------
// Real-model smoke test, compiled only with the `embedder-it` feature. The
// first run auto-downloads ~600 MB of ONNX weights (30-90s cold); later runs
// hit the cache in `~/.cache/lunaris/models/fastembed/
// embeddinggemma-300m-onnx/`. To verify, delete that subdir and re-run —
// fastembed re-downloads transparently. Excluded from the default test run;
// CI's existing `embedder-it` job picks this up automatically and Plan 19-02
// expands the matrix.
// -----------------------------------------------------------------------------
#[cfg(all(test, feature = "embedder-it"))]
mod live_tests {
    use super::*;

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn fastembed_loads_real_model_and_embeds_one_batch() {
        let embedder = FastembedEmbedder::new(FastembedEmbedderOpts::default())
            .expect("real model load — auto-download to ~/.cache/lunaris/models/fastembed/");
        assert_eq!(embedder.dim(), FASTEMBED_GEMMA_DIM);

        let vecs = embedder
            .embed_batch(&["hello world", "lunaris memory engine"])
            .await
            .expect("embed_batch");
        assert_eq!(vecs.len(), 2);

        // Every row must have the model's width and (approximately) unit norm.
        for v in &vecs {
            assert_eq!(v.len(), FASTEMBED_GEMMA_DIM);
            let l2 = v.iter().map(|&x| f64::from(x).powi(2)).sum::<f64>().sqrt();
            assert!((l2 - 1.0).abs() < 1e-3, "L2 norm = {l2}, expected ~ 1.0");
        }
    }
}