ski/embed/mod.rs
1//! Embedding backends behind a single trait.
2//!
3//! - [`bow::BowEmbedder`] — deterministic hashed bag-of-words. No deps, no
4//! network, no model. Always available; used for tests and offline fallback.
5//! - `fast::FastEmbedder` — real bge-small / MiniLM via fastembed (ONNX). Behind
6//! the `fastembed` cargo feature.
7
8pub mod bow;
9#[cfg(feature = "fastembed")]
10pub mod fast;
11
/// The role a piece of text plays when it is embedded.
///
/// Asymmetric models such as bge embed the two roles differently (a query
/// gets an instruction prefix); symmetric models ignore the distinction.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EmbedKind {
    /// Text used as a search query.
    Query,
    /// Text that is being indexed.
    Document,
}
19
20pub trait Embedder {
21 /// Stable id used as the index's `model` tag (changing it forces reindex).
22 fn id(&self) -> String;
23 fn embed(&self, texts: &[String], kind: EmbedKind) -> anyhow::Result<Vec<Vec<f32>>>;
24
25 /// Score floor under which the top match is treated as "nothing relevant"
26 /// and the hook injects nothing. Embedder-specific because cosine
27 /// distributions differ sharply: the hashed bag-of-words space is sparse, so
28 /// unrelated text scores near 0 and a low floor works; bge is anisotropic, so
29 /// even unrelated text cosines ~0.5 and the floor must sit well above that.
30 /// The default is calibrated for bag-of-words; embedders override it.
31 fn min_similarity(&self) -> f32 {
32 0.30
33 }
34
35 /// Max score gap below the single best match for a co-relevant peer to still
36 /// be injected. Tighter spaces (bge) need a smaller margin. Default is for
37 /// bag-of-words.
38 fn score_margin(&self) -> f32 {
39 0.15
40 }
41}
42
/// Reports whether [`build`] would select the real dense backend for `model`
/// rather than the bag-of-words fallback.
///
/// Nothing is constructed here: building the dense embedder on a cold cache
/// triggers the model download. This lets `ski doctor` describe the active
/// backend and decide whether a smoke test is safe to run.
#[cfg(feature = "fastembed")]
pub fn is_dense(model: &str) -> bool {
    fast::FastEmbedder::recognized(model)
}

/// Reports whether [`build`] would select the real dense backend for `model`
/// rather than the bag-of-words fallback.
///
/// This build has no `fastembed` feature, so the answer is always `false`.
/// Nothing is constructed, which lets `ski doctor` describe the active
/// backend and decide whether a smoke test is safe to run.
#[cfg(not(feature = "fastembed"))]
pub fn is_dense(model: &str) -> bool {
    // The model id cannot matter when no dense backend is compiled in.
    let _ = model;
    false
}
58
59/// The `id()` the backend picked by [`build`] for `model` will report — what a
60/// matching index's `model` tag must equal. Also construction-free.
61pub fn expected_id(model: &str) -> String {
62 if is_dense(model) {
63 model.to_string()
64 } else {
65 bow::BowEmbedder::new().id()
66 }
67}
68
69/// Pick a backend for `model`. With the `fastembed` feature and a recognized
70/// model id, returns the real embedder; otherwise the offline bag-of-words one.
71pub fn build(model: &str) -> anyhow::Result<Box<dyn Embedder>> {
72 #[cfg(feature = "fastembed")]
73 {
74 if let Some(e) = fast::FastEmbedder::try_for(model)? {
75 return Ok(Box::new(e));
76 }
77 }
78 let _ = model;
79 Ok(Box::new(bow::BowEmbedder::new()))
80}