// sqlite_graphrag/embedder.rs
1//! fastembed wrapper and per-process embedding cache.
2//!
3//! Owns the in-process `TextEmbedding` model and exposes batch encode/query
4//! helpers used by remember, recall, and related commands.
5
6use crate::constants::{
7    EMBEDDING_DIM, EMBEDDING_MAX_TOKENS, FASTEMBED_BATCH_SIZE, PASSAGE_PREFIX, QUERY_PREFIX,
8    REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS, REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS,
9};
10use crate::errors::AppError;
11use fastembed::{EmbeddingModel, ExecutionProviderDispatch, TextEmbedding, TextInitOptions};
12use ort::execution_providers::CPU;
13use std::path::Path;
14use std::sync::{Mutex, OnceLock};
15
16static EMBEDDER: OnceLock<Mutex<TextEmbedding>> = OnceLock::new();
17
/// Returns the process-wide singleton embedder, initializing it on first call.
/// Subsequent calls return the cached instance regardless of `models_dir`:
/// only the first initialization to complete decides which model directory
/// is used for the lifetime of the process.
///
/// Concurrency note: threads that race past the fast path below may each
/// build their own model; all but the winning instance are dropped (see the
/// comment above `EMBEDDER.set`). Wasteful on a race, but correct.
///
/// # Errors
/// Returns `Err` when dynamic ONNX Runtime setup fails or the fastembed
/// model cannot be constructed (e.g. download or load failure).
pub fn get_embedder(models_dir: &Path) -> Result<&'static Mutex<TextEmbedding>, AppError> {
    // Fast path: already initialized (possibly by another thread).
    if let Some(m) = EMBEDDER.get() {
        return Ok(m);
    }

    // Must run before the first ort session is created so the dynamically
    // loaded runtime (aarch64-linux-gnu only) is committed first; it is a
    // no-op on every other target.
    maybe_init_dynamic_ort(models_dir)?;

    // Multi-layer mitigation of the explosive RSS observed with variable-shape
    // payloads. The three current layers are:
    //   1. `with_arena_allocator(false)` on the CPU execution provider (line below)
    //   2. env var `ORT_DISABLE_CPU_MEM_ARENA=1` in `main.rs` (default since v1.0.18)
    //   3. env var `ORT_NUM_THREADS=1` + `ORT_INTRA_OP_NUM_THREADS=1` in `main.rs`
    // The `with_memory_pattern(false)` flag exists in ort 2.0 (`SessionBuilder`)
    // but fastembed 5.13.2 does NOT expose access to a custom SessionBuilder via
    // `TextInitOptions`. If RSS grows again in real corpora, the next
    // mitigation requires one of the following paths:
    //   - Fork fastembed to expose `SessionBuilder::with_memory_pattern(false)`
    //   - Bypass fastembed and use ort directly with a custom SessionBuilder
    //   - Fixed padding in `plan_controlled_batches` to eliminate variable shapes
    // References:
    //   https://onnxruntime.ai/docs/performance/tune-performance/memory.html
    //   https://github.com/qdrant/fastembed/issues/570
    let cpu_ep: ExecutionProviderDispatch = CPU::default().with_arena_allocator(false).build();

    let model = TextEmbedding::try_new(
        TextInitOptions::new(EmbeddingModel::MultilingualE5Small)
            .with_execution_providers(vec![cpu_ep])
            .with_max_length(EMBEDDING_MAX_TOKENS)
            .with_show_download_progress(true)
            .with_cache_dir(models_dir.to_path_buf()),
    )
    .map_err(|e| AppError::Embedding(e.to_string()))?;
    // If another thread raced and won, discard our instance and return theirs.
    let _ = EMBEDDER.set(Mutex::new(model));
    // After `set`, `get()` should always be `Some` (filled by us or by the
    // racing winner); the error branch exists only to avoid a panic path.
    EMBEDDER.get().ok_or_else(|| {
        AppError::Embedding(
            "embedder OnceLock unexpectedly empty after set() (likely a racing initializer aborted before completion)"
                .into(),
        )
    })
}
61
/// Locates and commits a dynamically loaded `libonnxruntime.so` before the
/// first ort session is created. This variant is compiled only on
/// aarch64-linux-gnu — presumably that build uses ort's dynamic-loading mode;
/// confirm against the crate's feature flags.
///
/// Search order: `ORT_DYLIB_PATH` env override, next to the executable,
/// `<exe dir>/lib/`, then `models_dir`. The first existing candidate wins;
/// if none exists this is a no-op and ort falls back to its default lookup.
///
/// # Errors
/// Returns `Err` when a candidate library is found but ort fails to
/// initialize from it or to commit the environment.
#[cfg(all(target_arch = "aarch64", target_os = "linux", target_env = "gnu"))]
fn maybe_init_dynamic_ort(models_dir: &Path) -> Result<(), AppError> {
    let mut candidates = Vec::new();

    // 1. Explicit override always takes priority.
    if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
        if !path.is_empty() {
            candidates.push(std::path::PathBuf::from(path));
        }
    }

    // 2. Library shipped alongside the binary (flat or under `lib/`).
    if let Ok(exe) = std::env::current_exe() {
        if let Some(dir) = exe.parent() {
            candidates.push(dir.join("libonnxruntime.so"));
            candidates.push(dir.join("lib").join("libonnxruntime.so"));
        }
    }

    // 3. Fall back to the models directory.
    candidates.push(models_dir.join("libonnxruntime.so"));

    for path in candidates {
        if !path.exists() {
            continue;
        }

        // Keep the env var in sync so any later ort lookup agrees with us.
        std::env::set_var("ORT_DYLIB_PATH", &path);
        // Fix: the previous code discarded `commit()`'s Result with `let _ =`,
        // so a dylib that loaded but failed to commit was silently ignored and
        // the function reported success. Propagate both failure points.
        ort::init_from(&path)
            .map_err(|e| AppError::Embedding(e.to_string()))?
            .commit()
            .map_err(|e| AppError::Embedding(e.to_string()))?;
        return Ok(());
    }

    Ok(())
}
95
/// No-op fallback for every target other than aarch64-linux-gnu, where no
/// dynamic ONNX Runtime library needs to be located before session creation.
#[cfg(not(all(target_arch = "aarch64", target_os = "linux", target_env = "gnu")))]
fn maybe_init_dynamic_ort(_models_dir: &Path) -> Result<(), AppError> {
    Ok(())
}
100
101/// Embeds a single passage using the `passage:` prefix required by E5 models.
102///
103/// # Errors
104/// Returns `Err` when the mutex is poisoned or the model returns an unexpected result.
105pub fn embed_passage(embedder: &Mutex<TextEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
106    let prefixed = format!("{PASSAGE_PREFIX}{text}");
107    let results = embedder
108        .lock()
109        .map_err(|e| AppError::Embedding(format!("embedder mutex poisoned: {e}")))?
110        .embed(vec![prefixed.as_str()], Some(1))
111        .map_err(|e| AppError::Embedding(e.to_string()))?;
112    let emb = results
113        .into_iter()
114        .next()
115        .ok_or_else(|| AppError::Embedding("empty embedding result".into()))?;
116    assert_eq!(emb.len(), EMBEDDING_DIM, "unexpected embedding dimension");
117    Ok(emb)
118}
119
120/// Embeds a search query using the `query:` prefix required by E5 models.
121///
122/// # Errors
123/// Returns `Err` when the mutex is poisoned or the model returns an unexpected result.
124pub fn embed_query(embedder: &Mutex<TextEmbedding>, text: &str) -> Result<Vec<f32>, AppError> {
125    let prefixed = format!("{QUERY_PREFIX}{text}");
126    let results = embedder
127        .lock()
128        .map_err(|e| AppError::Embedding(format!("embedder mutex poisoned: {e}")))?
129        .embed(vec![prefixed.as_str()], Some(1))
130        .map_err(|e| AppError::Embedding(e.to_string()))?;
131    let emb = results
132        .into_iter()
133        .next()
134        .ok_or_else(|| AppError::Embedding("empty embedding result".into()))?;
135    Ok(emb)
136}
137
138/// Embeds multiple passages in a single ONNX batch call.
139///
140/// `batch_size` is capped at `FASTEMBED_BATCH_SIZE`. All texts receive the `passage:` prefix.
141///
142/// # Errors
143/// Returns `Err` when the mutex is poisoned or the model inference fails.
144pub fn embed_passages_batch(
145    embedder: &Mutex<TextEmbedding>,
146    texts: &[&str],
147    batch_size: usize,
148) -> Result<Vec<Vec<f32>>, AppError> {
149    let prefixed: Vec<String> = texts
150        .iter()
151        .map(|t| format!("{PASSAGE_PREFIX}{t}"))
152        .collect();
153    let strs: Vec<&str> = prefixed.iter().map(String::as_str).collect();
154    let results = embedder
155        .lock()
156        .map_err(|e| AppError::Embedding(format!("embedder mutex poisoned: {e}")))?
157        .embed(strs, Some(batch_size.min(FASTEMBED_BATCH_SIZE)))
158        .map_err(|e| AppError::Embedding(e.to_string()))?;
159    for emb in &results {
160        assert_eq!(emb.len(), EMBEDDING_DIM, "unexpected embedding dimension");
161    }
162    Ok(results)
163}
164
/// Returns the number of batches that [`embed_passages_controlled`] would produce
/// for the given `token_counts` slice without running inference.
///
/// Delegates to the same planner ([`plan_controlled_batches`]), so the count
/// always matches what a real embedding run would do.
pub fn controlled_batch_count(token_counts: &[usize]) -> usize {
    plan_controlled_batches(token_counts).len()
}
170
171/// Embeds passages grouped into token-budget-aware batches to avoid OOM on variable-length inputs.
172///
173/// `texts` and `token_counts` must have the same length. Batches are planned using an
174/// internal budget algorithm and single-item batches fall back to [`embed_passage`].
175///
176/// # Errors
177/// Returns `Err` when lengths differ, the mutex is poisoned, or inference fails.
178pub fn embed_passages_controlled(
179    embedder: &Mutex<TextEmbedding>,
180    texts: &[&str],
181    token_counts: &[usize],
182) -> Result<Vec<Vec<f32>>, AppError> {
183    if texts.len() != token_counts.len() {
184        return Err(AppError::Internal(anyhow::anyhow!(
185            "texts/token_counts length mismatch in controlled embedding"
186        )));
187    }
188
189    let mut results = Vec::with_capacity(texts.len());
190    for (start, end) in plan_controlled_batches(token_counts) {
191        if end - start == 1 {
192            results.push(embed_passage(embedder, texts[start])?);
193            continue;
194        }
195
196        results.extend(embed_passages_batch(
197            embedder,
198            &texts[start..end],
199            end - start,
200        )?);
201    }
202
203    Ok(results)
204}
205
206/// Embed multiple passages serially.
207///
208/// This path intentionally avoids ONNX batch inference for robustness when
209/// real-world Markdown chunks trigger pathological runtime behavior.
210pub fn embed_passages_serial<'a, I>(
211    embedder: &Mutex<TextEmbedding>,
212    texts: I,
213) -> Result<Vec<Vec<f32>>, AppError>
214where
215    I: IntoIterator<Item = &'a str>,
216{
217    let iter = texts.into_iter();
218    let (lower, _) = iter.size_hint();
219    let mut results = Vec::with_capacity(lower);
220    for text in iter {
221        results.push(embed_passage(embedder, text)?);
222    }
223    Ok(results)
224}
225
226fn plan_controlled_batches(token_counts: &[usize]) -> Vec<(usize, usize)> {
227    let mut batches = Vec::new();
228    let mut start = 0usize;
229
230    while start < token_counts.len() {
231        let mut end = start + 1;
232        let mut max_tokens = token_counts[start].max(1);
233
234        while end < token_counts.len() && end - start < REMEMBER_MAX_CONTROLLED_BATCH_CHUNKS {
235            let candidate_max = max_tokens.max(token_counts[end].max(1));
236            let candidate_len = end + 1 - start;
237            if candidate_max * candidate_len > REMEMBER_MAX_CONTROLLED_BATCH_PADDED_TOKENS {
238                break;
239            }
240            max_tokens = candidate_max;
241            end += 1;
242        }
243
244        batches.push((start, end));
245        start = end;
246    }
247
248    batches
249}
250
/// Convert &[f32] to &[u8] for sqlite-vec storage (zero-copy view).
///
/// The returned slice borrows the same memory as `v`, so the bytes are in
/// the platform's native order.
pub fn f32_to_bytes(v: &[f32]) -> &[u8] {
    let byte_len = std::mem::size_of_val(v);
    // SAFETY: `v` is a valid, initialized slice covering `byte_len` bytes;
    // `f32` has no padding and any byte is a valid `u8`, so reinterpreting
    // the identical memory region as `&[u8]` with `v`'s lifetime is sound.
    unsafe { std::slice::from_raw_parts(v.as_ptr().cast::<u8>(), byte_len) }
}
257
#[cfg(test)]
mod tests {
    use super::*;
    use crate::constants::{EMBEDDING_DIM, PASSAGE_PREFIX, QUERY_PREFIX};

    // --- f32_to_bytes tests (pure function, no model) ---

    #[test]
    fn f32_to_bytes_empty_slice_returns_empty() {
        let v: Vec<f32> = vec![];
        assert_eq!(f32_to_bytes(&v), &[] as &[u8]);
    }

    #[test]
    fn f32_to_bytes_one_element_returns_4_bytes() {
        let v = vec![1.0_f32];
        let bytes = f32_to_bytes(&v);
        assert_eq!(bytes.len(), 4);
        // Round-trip with NATIVE endianness: f32_to_bytes reinterprets memory
        // in place, so `from_le_bytes` (used previously) would fail on a
        // big-endian target.
        let recovered = f32::from_ne_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
        assert_eq!(recovered, 1.0_f32);
    }

    #[test]
    fn f32_to_bytes_length_is_4x_elements() {
        let v = vec![0.0_f32, 1.0, 2.0, 3.0];
        assert_eq!(f32_to_bytes(&v).len(), v.len() * 4);
    }

    #[test]
    fn f32_to_bytes_zero_encoded_as_4_zeros() {
        // 0.0_f32 is all-zero bytes regardless of endianness.
        let v = vec![0.0_f32];
        assert_eq!(f32_to_bytes(&v), &[0u8, 0, 0, 0]);
    }

    #[test]
    fn f32_to_bytes_roundtrip_vector_embedding_dim() {
        let v: Vec<f32> = (0..EMBEDDING_DIM).map(|i| i as f32 * 0.001).collect();
        let bytes = f32_to_bytes(&v);
        assert_eq!(bytes.len(), EMBEDDING_DIM * 4);
        // Reconstructs and compares first and last element (native byte order).
        let first = f32::from_ne_bytes(bytes[0..4].try_into().unwrap());
        assert!((first - 0.0_f32).abs() < 1e-6);
        let last_start = (EMBEDDING_DIM - 1) * 4;
        let last = f32::from_ne_bytes(bytes[last_start..last_start + 4].try_into().unwrap());
        assert!((last - (EMBEDDING_DIM - 1) as f32 * 0.001).abs() < 1e-4);
    }

    // --- verifies prefixes used by the embedder (no model) ---

    // Renamed from `*_prefix_not_empty`: these pin the exact E5 prefix
    // strings, not mere non-emptiness.
    #[test]
    fn passage_prefix_matches_e5_convention() {
        assert_eq!(PASSAGE_PREFIX, "passage: ");
    }

    #[test]
    fn query_prefix_matches_e5_convention() {
        assert_eq!(QUERY_PREFIX, "query: ");
    }

    #[test]
    fn embedding_dim_is_384() {
        assert_eq!(EMBEDDING_DIM, 384);
    }

    // --- tests against the real model (ignored in normal CI runs) ---

    #[test]
    #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
    fn embed_passage_returns_vector_with_correct_dimension() {
        let dir = tempfile::tempdir().unwrap();
        let embedder = get_embedder(dir.path()).unwrap();
        let result = embed_passage(embedder, "test text").unwrap();
        assert_eq!(result.len(), EMBEDDING_DIM);
    }

    #[test]
    #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
    fn embed_query_returns_vector_with_correct_dimension() {
        let dir = tempfile::tempdir().unwrap();
        let embedder = get_embedder(dir.path()).unwrap();
        let result = embed_query(embedder, "test query").unwrap();
        assert_eq!(result.len(), EMBEDDING_DIM);
    }

    #[test]
    #[ignore = "requires ~600 MB model on disk; run with --include-ignored"]
    fn embed_passages_batch_returns_one_vector_per_text() {
        let dir = tempfile::tempdir().unwrap();
        let embedder = get_embedder(dir.path()).unwrap();
        let texts = ["primeiro", "segundo"];
        let results = embed_passages_batch(embedder, &texts, 2).unwrap();
        assert_eq!(results.len(), 2);
        for emb in &results {
            assert_eq!(emb.len(), EMBEDDING_DIM);
        }
    }

    #[test]
    fn controlled_batch_plan_respects_budget() {
        // With the current constants, the four 100-token chunks fit a single
        // batch while each 300-token chunk forces its own batch.
        assert_eq!(
            plan_controlled_batches(&[100, 100, 100, 100, 300, 300]),
            vec![(0, 4), (4, 5), (5, 6)]
        );
    }

    #[test]
    fn controlled_batch_count_returns_one_for_single_chunk() {
        assert_eq!(controlled_batch_count(&[350]), 1);
    }
}