prollytree 0.4.0

A prolly (probabilistic) tree for efficient storage, retrieval, and modification of ordered data.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

//! MiniLM-L6-v2 sentence embedder backed by Candle (PR 4b).
//!
//! Wires the `Embedder` trait to `sentence-transformers/all-MiniLM-L6-v2`:
//! a 384-dimensional BERT-based sentence encoder. Pure Rust via Candle, no
//! native deps (other than what `tokenizers` already brings in via `onig`).
//!
//! # Lifecycle
//!
//! The model and tokenizer are downloaded lazily on the first `embed()` call
//! into a local cache (`~/.cache/prollytree/embedders/<model_id>/<revision>/`
//! by default; override via `$PROLLYTREE_EMBEDDER_CACHE`). Subsequent calls
//! reuse the cached files. The download is ~90 MB total (tokenizer +
//! safetensors weights).
//!
//! # Determinism
//!
//! The version string is `"sentence-transformers/all-MiniLM-L6-v2@<revision>"`,
//! so a [`crate::proximity::TextIndex`] built under one revision refuses to
//! reopen under another (unless you call `reindex_from_texts` to re-embed).

use crate::proximity::embedder::{EmbedError, Embedder};
use candle_core::{DType, Device, IndexOp, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::models::bert::{BertModel, Config, DTYPE};
use parking_lot::Mutex;
use std::path::PathBuf;
use tokenizers::Tokenizer;

/// Default HuggingFace model id.
pub const DEFAULT_MODEL_ID: &str = "sentence-transformers/all-MiniLM-L6-v2";
/// Default revision — pinned to an immutable HuggingFace commit SHA so the
/// persisted version string (`"{model_id}@{revision}"`) cannot silently
/// accept reopens after upstream republishes `main`. Update this constant
/// (and bump the persisted version) only when intentionally moving to a new
/// model build. The SHA below corresponds to the snapshot used to bring up
/// this integration; to refresh, fetch the latest commit on the model's
/// `main` branch and pin its hash.
pub const DEFAULT_REVISION: &str = "8b3219a92973c328a8e22fadcfa821b5dc75636a";
/// MiniLM-L6-v2 produces 384-d embeddings.
pub const MINILM_DIM: u16 = 384;

/// Sentence embedder using all-MiniLM-L6-v2 via Candle.
pub struct MiniLmEmbedder {
    model_id: String,
    revision: String,
    /// Cached version string: `"{model_id}@{revision}"`.
    version: String,
    /// Lazily-initialised model + tokenizer + device. Locked behind a Mutex
    /// so `embed()` can take `&self` and remain `Send + Sync`.
    state: Mutex<Option<LoadedModel>>,
}

struct LoadedModel {
    tokenizer: Tokenizer,
    bert: BertModel,
    device: Device,
}

impl std::fmt::Debug for MiniLmEmbedder {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("MiniLmEmbedder")
            .field("model_id", &self.model_id)
            .field("revision", &self.revision)
            .field("loaded", &self.state.lock().is_some())
            .finish()
    }
}

impl Default for MiniLmEmbedder {
    fn default() -> Self {
        Self::new(DEFAULT_MODEL_ID, DEFAULT_REVISION)
    }
}

impl MiniLmEmbedder {
    /// Build an embedder pointing at the given HuggingFace model id and
    /// revision. Use [`MiniLmEmbedder::default`] for the recommended
    /// `sentence-transformers/all-MiniLM-L6-v2@main`.
    pub fn new(model_id: &str, revision: &str) -> Self {
        let version = format!("{model_id}@{revision}");
        Self {
            model_id: model_id.to_string(),
            revision: revision.to_string(),
            version,
            state: Mutex::new(None),
        }
    }

    /// Eagerly download + load the model. Useful if the caller wants to pay
    /// the cold-start cost up-front (e.g., during server startup) rather
    /// than during the first `embed()` call.
    pub fn warm_up(&self) -> Result<(), EmbedError> {
        self.ensure_loaded()
    }

    fn ensure_loaded(&self) -> Result<(), EmbedError> {
        let mut state = self.state.lock();
        if state.is_some() {
            return Ok(());
        }
        let loaded = load_model(&self.model_id, &self.revision)
            .map_err(|e| EmbedError::Failure(format!("MiniLM load: {e}")))?;
        *state = Some(loaded);
        Ok(())
    }
}

impl Embedder for MiniLmEmbedder {
    fn id(&self) -> &str {
        "candle:minilm-l6-v2"
    }

    fn version(&self) -> &str {
        &self.version
    }

    fn dim(&self) -> u16 {
        MINILM_DIM
    }

    fn embed(&self, text: &str) -> Result<Vec<f32>, EmbedError> {
        self.ensure_loaded()?;
        let state = self.state.lock();
        let loaded = state.as_ref().expect("loaded by ensure_loaded");

        embed_one(&loaded.tokenizer, &loaded.bert, &loaded.device, text)
            .map_err(|e| EmbedError::Failure(format!("MiniLM embed: {e}")))
    }
}

// ---------------------------------------------------------------------------
// Implementation helpers
// ---------------------------------------------------------------------------

fn load_model(
    model_id: &str,
    revision: &str,
) -> Result<LoadedModel, Box<dyn std::error::Error + Send + Sync>> {
    let cache_dir = embedder_cache_dir(model_id, revision)?;
    std::fs::create_dir_all(&cache_dir)?;

    let config_path = fetch_with_cache(&cache_dir, model_id, revision, "config.json")?;
    let tokenizer_path = fetch_with_cache(&cache_dir, model_id, revision, "tokenizer.json")?;
    let weights_path = fetch_with_cache(&cache_dir, model_id, revision, "model.safetensors")?;

    let config_json = std::fs::read_to_string(&config_path)?;
    let config: Config = serde_json::from_str(&config_json)?;

    let tokenizer = Tokenizer::from_file(&tokenizer_path).map_err(
        |e| -> Box<dyn std::error::Error + Send + Sync> { format!("tokenizer: {e}").into() },
    )?;

    let device = Device::Cpu;
    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[weights_path], DTYPE, &device)? };
    let bert = BertModel::load(vb, &config)?;

    Ok(LoadedModel {
        tokenizer,
        bert,
        device,
    })
}

/// Resolve the on-disk cache directory for a given (model_id, revision).
///
/// Honoured precedence:
///
/// 1. `$PROLLYTREE_EMBEDDER_CACHE/<model_id>/<revision>/`
/// 2. `<dirs::cache_dir()>/prollytree/embedders/<model_id>/<revision>/`
fn embedder_cache_dir(
    model_id: &str,
    revision: &str,
) -> Result<PathBuf, Box<dyn std::error::Error + Send + Sync>> {
    let base = if let Ok(custom) = std::env::var("PROLLYTREE_EMBEDDER_CACHE") {
        PathBuf::from(custom)
    } else {
        dirs::cache_dir()
            .ok_or("could not resolve a user cache directory")?
            .join("prollytree")
            .join("embedders")
    };
    // `model_id` and `revision` come from the public `MiniLmEmbedder::new`
    // constructor — so they're user-controlled. Reject absolute paths or
    // `..` segments outright, and sanitise other separators, to make sure the
    // resulting cache path stays under `base` rather than escaping it (which
    // `PathBuf::join` would silently allow for an absolute argument).
    Ok(base
        .join(sanitise_cache_component(model_id)?)
        .join(sanitise_cache_component(revision)?))
}

/// Validate a path component that came from user-controlled input. The
/// component is rejected if it would let the join escape its base, and
/// otherwise has its forward/backslashes replaced with `_` so the result is
/// always a single safe directory name.
fn sanitise_cache_component(s: &str) -> Result<String, Box<dyn std::error::Error + Send + Sync>> {
    if s.is_empty() {
        return Err("embedder cache component must not be empty".into());
    }
    if std::path::Path::new(s).is_absolute() {
        return Err(format!("embedder cache component must not be absolute: {s:?}").into());
    }
    if s.split(['/', '\\']).any(|seg| seg == ".." || seg == ".") {
        return Err(format!("embedder cache component must not contain `..` or `.`: {s:?}").into());
    }
    // Replace path separators with `_` so a name like
    // `sentence-transformers/all-MiniLM-L6-v2` becomes a single directory
    // rather than two nested levels (keeps the layout predictable across
    // platforms where one separator might be more privileged than another).
    Ok(s.replace(['/', '\\'], "_"))
}

/// Fetch a single file from HuggingFace into the cache directory, returning
/// the local path. If the file is already present, the download is skipped.
fn fetch_with_cache(
    cache_dir: &std::path::Path,
    model_id: &str,
    revision: &str,
    filename: &str,
) -> Result<PathBuf, Box<dyn std::error::Error + Send + Sync>> {
    let local_path = cache_dir.join(filename);
    if local_path.exists() {
        return Ok(local_path);
    }

    let url = format!("https://huggingface.co/{model_id}/resolve/{revision}/{filename}");
    let agent = ureq::AgentBuilder::new()
        .redirects(10)
        .timeout_connect(std::time::Duration::from_secs(30))
        .build();
    let response =
        agent
            .get(&url)
            .call()
            .map_err(|e| -> Box<dyn std::error::Error + Send + Sync> {
                format!("download {url}: {e}").into()
            })?;
    if response.status() != 200 {
        return Err(format!("download {url}: unexpected status {}", response.status()).into());
    }

    // Write to a temp file first, then rename atomically so a partial
    // download never leaves a corrupted cache file behind.
    let tmp_path = cache_dir.join(format!("{filename}.partial"));
    {
        let mut reader = response.into_reader();
        let mut writer = std::fs::File::create(&tmp_path)?;
        std::io::copy(&mut reader, &mut writer)?;
    }
    std::fs::rename(&tmp_path, &local_path)?;
    Ok(local_path)
}

fn embed_one(
    tokenizer: &Tokenizer,
    bert: &BertModel,
    device: &Device,
    text: &str,
) -> Result<Vec<f32>, Box<dyn std::error::Error + Send + Sync>> {
    let encoding =
        tokenizer
            .encode(text, true)
            .map_err(|e| -> Box<dyn std::error::Error + Send + Sync> {
                format!("encode: {e}").into()
            })?;
    let input_ids: Vec<u32> = encoding.get_ids().to_vec();
    let attention_mask: Vec<u32> = encoding.get_attention_mask().to_vec();
    let seq_len = input_ids.len();

    if seq_len == 0 {
        // Edge case: empty input. Return a zero vector that's still
        // L2-normalised (i.e., zero remains zero — downstream distance
        // calls handle zero gracefully).
        return Ok(vec![0.0_f32; MINILM_DIM as usize]);
    }

    let input_ids = Tensor::new(input_ids.as_slice(), device)?.unsqueeze(0)?;
    let attention_mask_t = Tensor::new(attention_mask.as_slice(), device)?.unsqueeze(0)?;
    let token_type_ids = input_ids.zeros_like()?;

    // Forward pass — token-level embeddings of shape [1, seq_len, hidden].
    let token_embeddings = bert.forward(&input_ids, &token_type_ids, Some(&attention_mask_t))?;

    // Mean-pool over the sequence axis, weighted by the attention mask.
    let mask_f = attention_mask_t.to_dtype(DType::F32)?.unsqueeze(2)?;
    let mask_expanded = mask_f.broadcast_as(token_embeddings.shape())?;
    let weighted = token_embeddings.broadcast_mul(&mask_expanded)?;
    let summed = weighted.sum(1)?; // [1, hidden]
    let counts = attention_mask_t
        .to_dtype(DType::F32)?
        .sum_keepdim(1)?
        .clamp(1e-9_f32, f32::MAX)?; // [1, 1]
    let mean = summed.broadcast_div(&counts)?; // [1, hidden]

    // L2-normalise.
    let norm = mean
        .sqr()?
        .sum_keepdim(1)?
        .sqrt()?
        .clamp(1e-12_f32, f32::MAX)?; // [1, 1]
    let normalised = mean.broadcast_div(&norm)?;

    let vector: Vec<f32> = normalised.i(0)?.to_vec1::<f32>()?;
    if vector.len() != MINILM_DIM as usize {
        return Err(format!("expected {} dims, got {}", MINILM_DIM, vector.len()).into());
    }
    Ok(vector)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: skip embed-running tests when the network or HF cache isn't
    /// available. We attempt one warm-up; if it fails, mark the test as
    /// not-run rather than failing CI on offline machines.
    fn require_loaded_or_skip(e: &MiniLmEmbedder, test_name: &str) -> bool {
        match e.warm_up() {
            Ok(()) => true,
            Err(err) => {
                eprintln!(
                    "[{test_name}] skipping: MiniLM model not loadable ({err}); \
                     run with network access + HF cache to validate."
                );
                false
            }
        }
    }

    // ---- Pure metadata tests (no model load required) --------------------

    #[test]
    fn metadata_is_stable() {
        let e = MiniLmEmbedder::default();
        assert_eq!(e.id(), "candle:minilm-l6-v2");
        assert_eq!(e.dim(), 384);
        assert!(e.version().contains(DEFAULT_MODEL_ID));
        assert!(e.version().contains(DEFAULT_REVISION));
    }

    #[test]
    fn version_distinguishes_revisions() {
        let a = MiniLmEmbedder::new(DEFAULT_MODEL_ID, "main");
        let b = MiniLmEmbedder::new(DEFAULT_MODEL_ID, "refs/pr/1");
        assert_ne!(a.version(), b.version());
    }

    #[test]
    fn version_distinguishes_models() {
        let a = MiniLmEmbedder::new("foo/bar", "main");
        let b = MiniLmEmbedder::new("baz/qux", "main");
        assert_ne!(a.version(), b.version());
    }

    // ---- Network-dependent tests (mark ignored so default test runs skip
    //      them; pass `--include-ignored` to run with model download). -----

    #[test]
    #[ignore = "downloads model from HuggingFace; run with --include-ignored"]
    fn embed_returns_correct_dim() {
        let e = MiniLmEmbedder::default();
        if !require_loaded_or_skip(&e, "embed_returns_correct_dim") {
            return;
        }
        let v = e.embed("hello world").unwrap();
        assert_eq!(v.len(), 384);
    }

    #[test]
    #[ignore = "downloads model from HuggingFace; run with --include-ignored"]
    fn embed_is_deterministic() {
        let e = MiniLmEmbedder::default();
        if !require_loaded_or_skip(&e, "embed_is_deterministic") {
            return;
        }
        let a = e.embed("the quick brown fox").unwrap();
        let b = e.embed("the quick brown fox").unwrap();
        assert_eq!(a, b);
    }

    #[test]
    #[ignore = "downloads model from HuggingFace; run with --include-ignored"]
    fn embed_distinguishes_different_text() {
        let e = MiniLmEmbedder::default();
        if !require_loaded_or_skip(&e, "embed_distinguishes_different_text") {
            return;
        }
        let a = e.embed("the quick brown fox").unwrap();
        let b = e
            .embed("an entirely unrelated sentence about ducks")
            .unwrap();
        assert_ne!(a, b);
    }

    #[test]
    #[ignore = "downloads model from HuggingFace; run with --include-ignored"]
    fn embed_semantically_similar_texts_are_close() {
        let e = MiniLmEmbedder::default();
        if !require_loaded_or_skip(&e, "embed_semantically_similar_texts_are_close") {
            return;
        }
        let a = e.embed("the cat sat on the mat").unwrap();
        let b = e.embed("a cat is sitting on a mat").unwrap();
        let c = e.embed("quantum field theory and renormalization").unwrap();

        // Cosine distance: 1 - cosine_similarity (smaller is closer).
        let cos = |x: &[f32], y: &[f32]| -> f32 {
            let dot: f32 = x.iter().zip(y).map(|(a, b)| a * b).sum();
            let nx: f32 = x.iter().map(|v| v * v).sum::<f32>().sqrt();
            let ny: f32 = y.iter().map(|v| v * v).sum::<f32>().sqrt();
            1.0 - dot / (nx * ny + 1e-9)
        };
        let close = cos(&a, &b);
        let far = cos(&a, &c);
        assert!(
            close < far,
            "expected '{}' / '{}' (cos {:.3}) closer than '{}' / '{}' (cos {:.3})",
            "the cat sat on the mat",
            "a cat is sitting on a mat",
            close,
            "the cat sat on the mat",
            "quantum field theory and renormalization",
            far
        );
    }
}