// solo-storage 0.10.2

// SPDX-License-Identifier: Apache-2.0

//! [`BundledEmbedder`] — bundled CPU sentence-transformer embedder
//! (v0.9.0 P3 / plan §3 Decision 1 / BLOCKER 1 strategy c).
//!
//! ## What this is
//!
//! A real CPU-only semantic embedder (`all-MiniLM-L6-v2`, 384-dim, ~22 MB
//! quantized ONNX, Apache-2.0) that runs without any external service
//! (no Ollama, no API key, no network at recall time). First use on a
//! fresh install pulls the model files via fastembed's hf-hub cache
//! (~22 MB) to `~/.fastembed_cache` (or the platform equivalent);
//! subsequent runs load from disk.
//!
//! ## Build vs runtime cost
//!
//! - **Build time**: enabling the `bundled-embedder` Cargo feature
//!   transitively pulls `fastembed` + `pykeio/ort`. ort's
//!   `download-binaries` feature fetches a prebuilt `libonnxruntime`
//!   from pyke's CDN during `cargo build`, cached at `~/.cache/ort/`
//!   (or platform equivalent). Adds **~22 MB** to the final binary on
//!   supported targets (x86_64/aarch64 glibc Linux, macOS, Windows
//!   MSVC).
//!
//! - **First-use runtime**: the first model load — an explicit
//!   [`BundledEmbedder::try_new`] or the first lazy `embed_batch` — on a
//!   host that hasn't run this before downloads the all-MiniLM-L6-v2
//!   model files from HuggingFace (~22 MB) into `~/.fastembed_cache/`.
//!   This is a one-time cost; each `BundledEmbedder` handle (and its
//!   clones) also caches the loaded `TextEmbedding` in-process via
//!   [`tokio::sync::OnceCell`] so subsequent calls reuse it.
//!
//! ## Plan deviation: hf-hub cache vs `include_bytes!`
//!
//! The v0.9.0 plan §6 (`BundledEmbedder` sketch) called for vendoring
//! the ONNX + tokenizer assets under `assets/` and embedding them via
//! `include_bytes!`. Two practical pressures pushed P3 toward
//! fastembed's built-in `EmbeddingModel::AllMiniLML6V2` + hf-hub cache
//! instead:
//!
//!   1. **Repo bloat**: vendoring 22 MB ONNX + 1 MB tokenizer JSONs +
//!      1 MB tokenizer config across the model variants would put
//!      ~25 MB of binary blobs into git history without LFS. The
//!      repo's NOTICE file does not currently flag the LFS pattern,
//!      and Solo's release pipeline has no LFS integration.
//!   2. **fastembed's `try_new(InitOptions::new(model))` IS the
//!      supported path**: fastembed-rs 5.13 ships a built-in
//!      `EmbeddingModel` enum (44 known models including
//!      `AllMiniLML6V2`) with first-class hf-hub caching. The
//!      `try_new_from_user_defined` API supporting `Vec<u8>` bytes is
//!      available too — but for a model that fastembed already
//!      registers, going through the registry path is the
//!      conventional choice.
//!
//! Functional outcome matches the plan: zero-API-key install,
//! semantic recall without Ollama, and the same fallback to
//! `StubEmbedder` when init fails. The runtime contract changes:
//! "works fully offline forever after install" → "works fully offline
//! after the first successful model load". This is an acceptable trade
//! for v0.9.0 and is flagged for v0.9.1+ if zero-network-ever becomes a
//! requirement (the `try_new_from_user_defined` path is
//! forward-compatible — same embedder identity, no schema change).
//!
//! ## Identity
//!
//! `Embedder::name() == "bundled:all-MiniLM-L6-v2"`,
//! `version() == "v1"`, `dim() == 384`, `dtype() == F32`.
//!
//! Matches plan §6's locked identity exactly. The `solo reembed`
//! migration tool keys on `(name, version)` so a future move to the
//! `try_new_from_user_defined` path will reuse the same identity (no
//! schema migration, no automatic reembed prompt).
//!
//! ## Thread safety
//!
//! `BundledEmbedder` holds an `Arc<tokio::sync::OnceCell<Arc<Mutex<
//! TextEmbedding>>>>`. The OnceCell guarantees single initialisation
//! across concurrent first-use calls. The inner `tokio::sync::Mutex`
//! serialises `TextEmbedding::embed` (which takes `&mut self`); ort's
//! inference is non-reentrant on a single session, so per-batch
//! serialisation matches the upstream contract.
//!
//! ## Fallback semantics
//!
//! `try_new` is fallible. If construction fails (model file download
//! failure, ort init panic-mapped-to-Err, missing system-level glibc
//! version, etc.) the caller is expected to fall back to
//! [`crate::embedder::StubEmbedder::default_stub`] and emit a
//! `tracing::warn!` line — `build_embedder_from_env` in `mod.rs`
//! implements this fallback path explicitly.

use std::sync::Arc;

use async_trait::async_trait;
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
use solo_core::{Embedder, Embedding, EmbeddingDtype, Error, Result};
use tokio::sync::{Mutex, OnceCell};

/// Stable embedder name, returned verbatim by `Embedder::name()` for
/// [`BundledEmbedder`].
pub const BUNDLED_EMBEDDER_NAME: &str = "bundled:all-MiniLM-L6-v2";
/// Solo-side wrapper version, returned by `Embedder::version()`. Bump on
/// any change to embedding output shape so `solo reembed` regenerates
/// affected vectors.
pub const BUNDLED_EMBEDDER_VERSION: &str = "v1";
/// Vector width: 384 — fixed for all-MiniLM-L6-v2; baked into the model.
/// `embed_batch` rejects any returned vector whose length differs.
pub const BUNDLED_EMBEDDER_DIM: usize = 384;

/// Bundled CPU sentence-transformer embedder.
///
/// A handle around a lazily-initialised fastembed `TextEmbedding`.
/// Cheap to clone — the underlying model handle is `Arc`-wrapped, so
/// every clone shares the same cell and therefore the same loaded model.
#[derive(Clone)]
pub struct BundledEmbedder {
    /// Lazy-loaded model handle. First call to [`Self::ensure_model`]
    /// runs the constructor under the OnceCell guard; subsequent calls
    /// share the loaded model. The cell stays empty until then, which is
    /// what the `Debug` impl reports as `loaded`.
    ///
    /// `Arc<Mutex<TextEmbedding>>` (not `Arc<TextEmbedding>`) because
    /// fastembed's `TextEmbedding::embed` takes `&mut self`. Locking
    /// per batch matches ort's non-reentrant inference contract.
    model: Arc<OnceCell<Arc<Mutex<TextEmbedding>>>>,
}

impl Default for BundledEmbedder {
    fn default() -> Self {
        Self::new()
    }
}

/// Diagnostic formatting that never touches the model itself: prints the
/// fixed identity (`name`, `dim`) plus whether the lazy cell has been
/// populated yet (`loaded`).
impl std::fmt::Debug for BundledEmbedder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Peek at the cell without triggering initialisation.
        let is_loaded = self.model.get().is_some();

        let mut builder = f.debug_struct("BundledEmbedder");
        builder.field("name", &BUNDLED_EMBEDDER_NAME);
        builder.field("dim", &BUNDLED_EMBEDDER_DIM);
        builder.field("loaded", &is_loaded);
        builder.finish()
    }
}

impl BundledEmbedder {
    /// Build an unloaded handle. Nothing is downloaded or initialised
    /// here — the model is loaded by the first `embed_batch` (or an
    /// explicit [`Self::try_new`]).
    ///
    /// Deferring the load keeps daemon startup fast (no ~22 MB HF
    /// download blocking boot on a fresh install) and lets `solo doctor`
    /// test config validity without spinning up ort.
    pub fn new() -> Self {
        let model = Arc::new(OnceCell::new());
        Self { model }
    }

    /// Force the model to load now rather than on first use. Useful at
    /// `solo daemon` boot time so the first user query doesn't pay the
    /// model-download tail latency (~22 MB pull + ~500 ms ort session
    /// init on cold cache).
    ///
    /// Idempotent: once the cell is populated (by an earlier `try_new`
    /// or by a lazy `embed_batch`) this returns the cached handle
    /// without re-initialising.
    ///
    /// # Errors
    ///
    /// Returns an embedder error when the blocking init task fails to
    /// join or when fastembed/ort initialisation itself fails.
    ///
    /// Returns a clone of the cached `Arc` so callers can verify (e.g.
    /// via `Arc::ptr_eq`) that a single init occurred.
    pub async fn try_new(&self) -> Result<Arc<Mutex<TextEmbedding>>> {
        let handle = self.ensure_model().await?;
        Ok(Arc::clone(handle))
    }

    /// Internal: lazy model init. Hands back a reference into the
    /// OnceCell; the cell guarantees the loader runs at most once to
    /// success even under concurrent first-use callers. A failed load
    /// leaves the cell empty, so a later call retries.
    async fn ensure_model(&self) -> Result<&Arc<Mutex<TextEmbedding>>> {
        self.model.get_or_try_init(Self::load_model).await
    }

    /// Internal: the one-time loader run under the OnceCell guard.
    /// Builds the fastembed `TextEmbedding` for all-MiniLM-L6-v2 and
    /// wraps it for shared, serialised use.
    async fn load_model() -> Result<Arc<Mutex<TextEmbedding>>> {
        tracing::info!(
            model = BUNDLED_EMBEDDER_NAME,
            "loading bundled embedder (first-use download from hf-hub if not cached)"
        );

        // fastembed's `TextEmbedding::try_new` is synchronous and may run
        // ort's session-init plus an hf-hub fetch (~22 MB on a cold
        // cache), so it goes to the blocking pool to keep the tokio
        // runtime responsive during the one-time pull.
        let joined = tokio::task::spawn_blocking(|| {
            // Quiet by default — fastembed otherwise emits stdout
            // progress bars that show up mid-test on the developer's
            // terminal.
            let options = InitOptions::new(EmbeddingModel::AllMiniLML6V2)
                .with_show_download_progress(false);
            TextEmbedding::try_new(options)
        })
        .await;

        // Outer layer: the blocking task itself failed to complete.
        let init_outcome = joined.map_err(|e| {
            Error::embedder(format!(
                "bundled embedder init task panicked or was cancelled: {e}"
            ))
        })?;

        // Inner layer: fastembed/ort reported an init failure.
        let session = init_outcome.map_err(|e| {
            Error::embedder(format!(
                "bundled embedder init failed (fastembed/ort): {e}. \
                 Fall back to SOLO_EMBEDDER=ollama or set [embedder] \
                 name = \"stub\" in solo.config.toml."
            ))
        })?;

        Ok(Arc::new(Mutex::new(session)))
    }
}

#[async_trait]
impl Embedder for BundledEmbedder {
    fn name(&self) -> &str {
        BUNDLED_EMBEDDER_NAME
    }

    fn version(&self) -> &str {
        BUNDLED_EMBEDDER_VERSION
    }

    fn dim(&self) -> usize {
        BUNDLED_EMBEDDER_DIM
    }

    fn dtype(&self) -> EmbeddingDtype {
        EmbeddingDtype::F32
    }

    async fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Embedding>> {
        if texts.is_empty() {
            return Ok(Vec::new());
        }

        let model = self.ensure_model().await?.clone();

        // Copy texts to owned strings so the spawn_blocking closure
        // can move them — &[&str] borrows can't cross the boundary.
        let owned: Vec<String> = texts.iter().map(|t| (*t).to_string()).collect();

        let vectors: Vec<Vec<f32>> = tokio::task::spawn_blocking(move || {
            let mut guard = model.blocking_lock();
            guard.embed(&owned, None)
        })
        .await
        .map_err(|e| {
            Error::embedder(format!(
                "bundled embedder inference task panicked or was cancelled: {e}"
            ))
        })?
        .map_err(|e| {
            Error::embedder(format!("bundled embedder embed_batch failed: {e}"))
        })?;

        // Convert each Vec<f32> into our Embedding wire shape. Validate
        // dim per vector — defends against fastembed returning a
        // truncated batch on partial-failure (we'd rather error than
        // silently miscompare in HNSW recall).
        let mut out = Vec::with_capacity(vectors.len());
        for v in vectors {
            if v.len() != BUNDLED_EMBEDDER_DIM {
                return Err(Error::embedder(format!(
                    "bundled embedder returned dim {} (expected {})",
                    v.len(),
                    BUNDLED_EMBEDDER_DIM
                )));
            }
            let mut data = Vec::with_capacity(v.len() * 4);
            for f in &v {
                data.extend_from_slice(&f.to_le_bytes());
            }
            out.push(Embedding {
                dtype: EmbeddingDtype::F32,
                dim: BUNDLED_EMBEDDER_DIM,
                data,
            });
        }
        Ok(out)
    }
}

#[cfg(test)]
mod tests {
    //! Tests for [`BundledEmbedder`]. Gated by `#[cfg(feature =
    //! "bundled-embedder")]` at the module-include site in `mod.rs`,
    //! so this whole file only compiles + runs when the feature is on.
    //!
    //! On the first test run in a fresh build environment fastembed
    //! downloads ~22 MB from HuggingFace; subsequent runs reuse the
    //! on-disk cache. Every test that actually runs the model goes
    //! through [`shared`], which hands out one process-wide
    //! `BundledEmbedder` stored in the `SHARED_EMBEDDER` static. That
    //! serialises model init across the parallel cargo-test runners:
    //! without the share, concurrent first-use calls each try to
    //! download the model and hf-hub's per-file lock surfaces flaky
    //! "Failed to retrieve model.onnx" errors when two writers race on
    //! the same partial download.
    //!
    //! Only the tests that never load the model (the identity check and
    //! `is_lazy_at_construction`) construct their own fresh handle.

    use super::*;
    use std::sync::OnceLock;

    /// Process-wide embedder reused by every model-running test so
    /// fastembed's `try_new` runs once per test binary rather than once
    /// per test (see the module docs for the download race this avoids).
    fn shared() -> &'static BundledEmbedder {
        static SHARED_EMBEDDER: OnceLock<BundledEmbedder> = OnceLock::new();
        SHARED_EMBEDDER.get_or_init(BundledEmbedder::new)
    }

    /// Cosine similarity helper for the semantic-sanity tests. The
    /// denominator is floored at `1e-9` so a zero vector cannot divide
    /// by zero.
    fn cosine(a: &Embedding, b: &Embedding) -> f32 {
        let av = a.as_f32_slice().expect("a is f32");
        let bv = b.as_f32_slice().expect("b is f32");
        assert_eq!(av.len(), bv.len(), "dim mismatch");

        let na: f32 = av.iter().map(|x| x * x).sum::<f32>().sqrt();
        let nb: f32 = bv.iter().map(|x| x * x).sum::<f32>().sqrt();
        let dot: f32 = av.iter().zip(bv.iter()).map(|(x, y)| x * y).sum();

        let denominator = (na * nb).max(1e-9);
        dot / denominator
    }

    #[tokio::test]
    async fn bundled_embedder_produces_384_dim_vectors() {
        let embedding = shared()
            .embed("hello world")
            .await
            .expect("embed should succeed");

        assert_eq!(embedding.dim, BUNDLED_EMBEDDER_DIM);
        assert_eq!(embedding.dtype, EmbeddingDtype::F32);
        // f32 wire format: four bytes per component.
        assert_eq!(embedding.data.len(), BUNDLED_EMBEDDER_DIM * 4);
        embedding.validate().expect("embedding length invariant");
    }

    #[tokio::test]
    async fn bundled_embedder_emits_expected_identity() {
        // Identity accessors only — nothing here loads the model, so a
        // private handle is fine.
        let embedder = BundledEmbedder::new();
        assert_eq!(embedder.name(), "bundled:all-MiniLM-L6-v2");
        assert_eq!(embedder.version(), "v1");
        assert_eq!(embedder.dim(), 384);
        assert_eq!(embedder.dtype(), EmbeddingDtype::F32);
    }

    #[tokio::test]
    async fn bundled_embedder_is_deterministic_across_calls() {
        let first = shared().embed("the quick brown fox").await.unwrap();
        let second = shared().embed("the quick brown fox").await.unwrap();
        assert_eq!(
            first.data, second.data,
            "same input must produce identical bytes"
        );
    }

    #[tokio::test]
    async fn bundled_embedder_distinct_inputs_produce_distinct_vectors() {
        let alpha = shared().embed("alpha").await.unwrap();
        let beta = shared().embed("beta").await.unwrap();
        assert_ne!(alpha.data, beta.data);
    }

    #[tokio::test]
    async fn bundled_embedder_does_semantic_work() {
        // The reason a real model is bundled at all: a paraphrase pair
        // must score higher than an unrelated pair. A regression here
        // means either the model changed under us or the embed pipeline
        // is returning nonsense.
        let cat = shared().embed("the cat sat on the mat").await.unwrap();
        let feline = shared().embed("a feline rested on the rug").await.unwrap();
        let rust = shared()
            .embed("Rust's borrow checker enforces aliasing rules")
            .await
            .unwrap();

        let sim_ab = cosine(&cat, &feline);
        let sim_ac = cosine(&cat, &rust);
        assert!(
            sim_ab > sim_ac,
            "semantically similar pair (cat/feline) should beat dissimilar \
             (cat/Rust): sim_ab={sim_ab} sim_ac={sim_ac}"
        );
        assert!(sim_ab > 0.0, "semantic similarity should be positive");
    }

    #[tokio::test]
    async fn bundled_embedder_handles_utf8_multi_byte() {
        // Multi-byte input: CJK, emoji, RTL Arabic. The tokenizer is
        // sentencepiece/wordpiece — UTF-8-safe.
        let embedding = shared()
            .embed("こんにちは 🦀 مرحبا")
            .await
            .expect("multi-byte UTF-8 must embed cleanly");
        assert_eq!(embedding.dim, BUNDLED_EMBEDDER_DIM);
        embedding.validate().unwrap();
    }

    #[tokio::test]
    async fn bundled_embedder_empty_input_returns_empty_batch() {
        // Documented choice: an empty batch yields an empty output, not
        // an error, so callers need not special-case empty input lists.
        // A single empty STRING is a different case and is covered by
        // `bundled_embedder_empty_string_returns_valid_vector`.
        let out = shared().embed_batch(&[]).await.unwrap();
        assert_eq!(out.len(), 0);
    }

    #[tokio::test]
    async fn bundled_embedder_empty_string_returns_valid_vector() {
        // Empty string: fastembed/tokenizer produces a [CLS]-only
        // embedding. It is returned as-is — callers can filter if they
        // want.
        let embedding = shared()
            .embed("")
            .await
            .expect("empty string is valid input");
        assert_eq!(embedding.dim, BUNDLED_EMBEDDER_DIM);
        embedding.validate().unwrap();
    }

    #[tokio::test]
    async fn bundled_embedder_batch_preserves_input_order() {
        let inputs = ["one", "two", "three", "four"];
        let batch = shared().embed_batch(&inputs).await.unwrap();
        assert_eq!(batch.len(), inputs.len());

        // Every batch slot must equal the single-call embedding of the
        // text at the same position. Input order is a load-bearing
        // contract for HNSW row alignment, so guard against any
        // reordering inside the batch path.
        for (i, (text, from_batch)) in inputs.iter().zip(&batch).enumerate() {
            let single = shared().embed(text).await.unwrap();
            assert_eq!(from_batch.data, single.data, "batch[{i}] != single({text})");
        }
    }

    #[tokio::test]
    async fn bundled_embedder_concurrent_calls_do_not_deadlock() {
        // Eight tasks embed through the shared handle at once. The
        // OnceCell serialises init and the inner Mutex serialises
        // inference; every task must eventually return.
        let handles: Vec<_> = (0..8)
            .map(|i| {
                tokio::spawn(async move {
                    let text = format!("concurrent call number {i}");
                    shared().embed(&text).await
                })
            })
            .collect();

        for handle in handles {
            let embedding = handle.await.expect("join").expect("embed");
            assert_eq!(embedding.dim, BUNDLED_EMBEDDER_DIM);
        }
    }

    #[tokio::test]
    async fn bundled_embedder_is_lazy_at_construction() {
        // `new()` must return before any hf-hub download or ort
        // session-init runs; verified by inspecting the OnceCell.
        //
        // A FRESH embedder (not `shared()`) is required so the cell is
        // guaranteed to be untouched. No embed call is made: the
        // emptiness assertion is the whole claim, and skipping the call
        // also avoids a second model download racing the shared handle
        // in a parallel test.
        let fresh = BundledEmbedder::new();
        let cell_is_empty = fresh.model.get().is_none();
        assert!(
            cell_is_empty,
            "OnceCell must be empty before any embed/try_new call"
        );
    }

    #[tokio::test]
    async fn bundled_embedder_try_new_loads_eagerly_and_is_idempotent() {
        // Runs against the shared handle so no second download race is
        // triggered. Whichever call populates the cell, both results
        // must come from the same OnceCell slot.
        let model1 = shared().try_new().await.expect("eager init");
        let model2 = shared().try_new().await.expect("second eager init");
        let same_slot = Arc::ptr_eq(&model1, &model2);
        assert!(same_slot, "try_new should be idempotent");
    }

    #[tokio::test]
    async fn bundled_embedder_normalised_or_valid_floats() {
        // Sanity: every component must be finite (no NaN/Inf). Unit
        // length is deliberately NOT asserted here.
        // NOTE(review): an earlier comment stated that fastembed leaves
        // normalisation of all-MiniLM-L6-v2 output to the caller —
        // confirm against fastembed before relying on it downstream.
        let embedding = shared().embed("finite floats only").await.unwrap();
        let components = embedding.as_f32_slice().unwrap();
        for (i, f) in components.iter().enumerate() {
            assert!(f.is_finite(), "non-finite component at index {i}: {f}");
        }
    }
}