Skip to main content

ski/embed/
mod.rs

1//! Embedding backends behind a single trait.
2//!
3//! - [`bow::BowEmbedder`] — deterministic hashed bag-of-words. No deps, no
4//!   network, no model. Always available; used for tests and offline fallback.
5//! - `fast::FastEmbedder` — real bge-small / MiniLM via fastembed (ONNX). Behind
6//!   the `fastembed` cargo feature.
7
8pub mod bow;
9#[cfg(feature = "fastembed")]
10pub mod fast;
11
/// Role a piece of text plays when it is embedded.
///
/// Asymmetric models such as bge treat the two roles differently (a query
/// gets an instruction prefix); symmetric models simply ignore the distinction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmbedKind {
    /// The text is a search query.
    Query,
    /// The text is a document being indexed.
    Document,
}
19
20pub trait Embedder {
21    /// Stable id used as the index's `model` tag (changing it forces reindex).
22    fn id(&self) -> String;
23    fn embed(&self, texts: &[String], kind: EmbedKind) -> anyhow::Result<Vec<Vec<f32>>>;
24
25    /// Score floor under which the top match is treated as "nothing relevant"
26    /// and the hook injects nothing. Embedder-specific because cosine
27    /// distributions differ sharply: the hashed bag-of-words space is sparse, so
28    /// unrelated text scores near 0 and a low floor works; bge is anisotropic, so
29    /// even unrelated text cosines ~0.5 and the floor must sit well above that.
30    /// The default is calibrated for bag-of-words; embedders override it.
31    fn min_similarity(&self) -> f32 {
32        0.30
33    }
34
35    /// Max score gap below the single best match for a co-relevant peer to still
36    /// be injected. Tighter spaces (bge) need a smaller margin. Default is for
37    /// bag-of-words.
38    fn score_margin(&self) -> f32 {
39        0.15
40    }
41}
42
/// Reports whether [`build`] would resolve `model` to the real dense backend
/// rather than the bag-of-words fallback.
///
/// The backend itself is never constructed here, since doing so on a cold
/// cache triggers the model download. That lets `ski doctor` describe the
/// active backend and decide whether a smoke test is safe to run.
#[cfg(feature = "fastembed")]
pub fn is_dense(model: &str) -> bool {
    fast::FastEmbedder::recognized(model)
}

/// Reports whether [`build`] would resolve `model` to the real dense backend
/// rather than the bag-of-words fallback.
///
/// This build lacks the `fastembed` feature, so no dense backend exists and
/// the answer is always `false`.
#[cfg(not(feature = "fastembed"))]
pub fn is_dense(model: &str) -> bool {
    // Only the dense path inspects the id; discard it to avoid an unused warning.
    let _ = model;
    false
}
58
59/// The `id()` the backend picked by [`build`] for `model` will report — what a
60/// matching index's `model` tag must equal. Also construction-free.
61pub fn expected_id(model: &str) -> String {
62    if is_dense(model) {
63        model.to_string()
64    } else {
65        bow::BowEmbedder::new().id()
66    }
67}
68
69/// Pick a backend for `model`. With the `fastembed` feature and a recognized
70/// model id, returns the real embedder; otherwise the offline bag-of-words one.
71pub fn build(model: &str) -> anyhow::Result<Box<dyn Embedder>> {
72    #[cfg(feature = "fastembed")]
73    {
74        if let Some(e) = fast::FastEmbedder::try_for(model)? {
75            return Ok(Box::new(e));
76        }
77    }
78    let _ = model;
79    Ok(Box::new(bow::BowEmbedder::new()))
80}