difflore_core/context/
ann.rs

1//! Per-project HNSW Approximate-Nearest-Neighbor index.
2//!
3//! Wraps [`hnsw_rs::hnsw::Hnsw`] so the retrieval path can swap the O(N)
4//! linear cosine scan for an O(log N) graph query on big projects. The
5//! design is deliberately cautious:
6//!
7//! 1. **Additive** — every public entry point returns `None` / an empty
8//!    result on any failure. The retrieval fallback MUST always work, so
9//!    this module never panics and never blocks rule writes.
10//! 2. **Persistent, per-project** — each project hash has its own HNSW
11//!    graph file under `~/.difflore/projects/{hash}/hnsw.*` plus a
12//!    sidecar `hnsw.meta.json` that carries the dim + element count so
13//!    we can detect a stale / wrong-dim index on reload.
14//! 3. **Incremental upsert** — `hnsw_rs` supports runtime insertions, so
15//!    `upsert_rule_chunks` can stream new embeddings into the graph
16//!    without a full rebuild. Replacements (same `chunk_id`, new vector)
17//!    are modelled as "shadow" entries: the old internal id stays in the
18//!    graph but is hidden from search results by a `tombstones` set.
19//!    A full `build_from_chunks` rebuild periodically cleans these out.
20//! 4. **Dim mismatch => fallback** — if the query dim doesn't match the
21//!    index dim we return an empty hit set; the caller sees this as
22//!    "ANN gave nothing" and uses the linear scan.
23//!
//! The translation between internal point ids and chunk ids is tracked
//! on the Rust side because `hnsw_rs`'s `DataId` is a `usize` and we
//! want to key on `String` chunk ids. The forward map (`id_map`) and
//! the tombstone set are serialised in the sidecar meta file; the
//! reverse map is rebuilt from `id_map` on load.
28
29use chrono::Utc;
30use hnsw_rs::prelude::*;
31use serde::{Deserialize, Serialize};
32use std::collections::{HashMap, HashSet};
33use std::path::{Path, PathBuf};
34use std::sync::{Arc, OnceLock};
35use tokio::sync::Mutex;
36
37use crate::errors::CoreError;
38
39type AnnCacheKey = (String, usize);
40type SharedAnnIndex = Arc<Mutex<AnnIndex>>;
41type AnnCache = std::sync::Mutex<HashMap<AnnCacheKey, SharedAnnIndex>>;
42
/// Basename handed to `hnsw_rs` for dumping and reloading. The graph
/// ends up in two files, `hnsw.hnsw.graph` and `hnsw.hnsw.data`; the
/// sidecar meta file sits beside them as `hnsw.meta.json`.
const HNSW_BASENAME: &str = "hnsw";
const META_FILENAME: &str = "hnsw.meta.json";

/// Version stamp written into the sidecar. Bump it whenever the sidecar
/// shape changes: a file carrying any other version is treated as stale
/// and the load path falls back to an empty index.
const META_VERSION: u32 = 1;

/// Graph construction parameters. They follow the "reasonable defaults"
/// from the `hnsw_rs` docs / the Malkov+Yashunin paper and are not
/// exposed to callers, because in the size range we target
/// (≤ 100K chunks) retrieval quality depends more on our own RRF
/// weighting than on these knobs.
const MAX_NB_CONNECTION: usize = 16;
const EF_CONSTRUCTION: usize = 200;
const MAX_LAYER: usize = 16;
/// Floor for the `ef` argument passed to the graph search.
const DEFAULT_EF_SEARCH: usize = 64;
/// Largest `top_k` that `search` honours; bigger requests are clamped.
const MAX_SEARCH_TOP_K: usize = 50;
/// Ceiling on how many raw neighbours `search` asks the graph for
/// before stale slots are filtered out.
const MAX_RAW_SEARCH_CANDIDATES: usize = 150;
64
65/// Meta file serialised to `hnsw.meta.json`. A schema mismatch (version
66/// or dim) causes the reload path to drop the on-disk index and return
67/// an empty in-memory one, which the retrieval path then treats as a
68/// fallback cue.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70struct AnnMeta {
71    version: u32,
72    dim: usize,
73    /// Number of elements inserted (including tombstoned ones that are
74    /// hidden from search but still live in the graph).
75    size: u32,
76    /// ISO-8601 timestamp of the most recent `save()` call.
77    built_at: String,
78    /// SHA-1 of the (id, content, embedding dim) signature from the
79    /// `rule_chunks` table at build time. Included so higher layers can
80    /// detect a divergence between the graph and the source DB. Not
81    /// currently consulted by retrieval (which always trusts the graph
82    /// and falls back on empty results).
83    schema_hash: String,
84    /// Ordered list of chunk ids by internal HNSW point id. Required
85    /// because `hnsw_rs`'s `DataId` is a `usize` but our chunks are keyed
86    /// on stable string ids.
87    id_map: Vec<String>,
88    /// Chunk ids that were overwritten after insertion and so should be
89    /// filtered out of search results. Allows `upsert()` to be cheap
90    /// (no graph edit) at the cost of a small `HashSet` lookup per hit.
91    tombstones: Vec<String>,
92}
93
94/// A project-scoped, disk-persisted HNSW index. See the module docs for
95/// the overall approach.
96pub struct AnnIndex {
97    inner: Option<Hnsw<'static, f32, DistCosine>>,
98    /// internal HNSW `DataId` -> `chunk_id`
99    id_map: Vec<String>,
100    /// `chunk_id` -> most recent internal `DataId` (later one wins; older
101    /// ids land in `tombstones`)
102    reverse: HashMap<String, usize>,
103    /// Chunk ids whose embeddings were superseded by an `upsert` and so
104    /// should not appear in search output.
105    tombstones: HashSet<String>,
106    dim: usize,
107    dirty: bool,
108    project_hash: String,
109    /// Cached schema hash from the last successful load/build, carried
110    /// back into `save()` without recomputing. Pure bookkeeping.
111    schema_hash: String,
112}
113
114impl AnnIndex {
115    /// Load from disk, or create an empty index if the on-disk files
116    /// are missing / corrupt / dim-mismatched. Never errors — worst
117    /// case the caller gets an empty index and retrieval falls through
118    /// to the linear path.
119    pub async fn load_or_empty(project_hash: &str, dim: usize) -> Result<Self, CoreError> {
120        let dir = crate::db::project_index_dir(project_hash);
121        let meta_path = dir.join(META_FILENAME);
122        let graph_path = dir.join(format!("{HNSW_BASENAME}.hnsw.graph"));
123        let data_path = dir.join(format!("{HNSW_BASENAME}.hnsw.data"));
124
125        // Short-circuit: if nothing on disk, return an empty index.
126        if !meta_path.exists() || !graph_path.exists() || !data_path.exists() {
127            return Ok(Self::empty(project_hash.to_owned(), dim));
128        }
129
130        // Try to parse the meta. A corrupt / missing file falls through
131        // to an empty index rather than failing retrieval.
132        let meta: AnnMeta = match std::fs::read(&meta_path) {
133            Ok(bytes) => match serde_json::from_slice(&bytes) {
134                Ok(m) => m,
135                Err(_) => return Ok(Self::empty(project_hash.to_owned(), dim)),
136            },
137            Err(_) => return Ok(Self::empty(project_hash.to_owned(), dim)),
138        };
139
140        // Version or dim drift => wipe and start fresh. Dim mismatch is
141        // the safety net for the embedder-swap case (user flips from
142        // SHA1 fallback to real 1536-dim provider mid-flight).
143        if meta.version != META_VERSION || meta.dim != dim {
144            return Ok(Self::empty(project_hash.to_owned(), dim));
145        }
146
147        // hnsw_rs dumps into two files named by basename; reload via
148        // HnswIo. The returned `Hnsw<'b, T, D>` borrows from the reloader
149        // (lifetime `'a: 'b`) even when we're not using mmap, so we
150        // `Box::leak` the reloader to get a `'static` handle. Cost is a
151        // small one-time leak per project per process — acceptable
152        // because load_or_empty runs at most once per `ann_cache()`
153        // entry across the whole process.
154        let reloader: &'static mut HnswIo = Box::leak(Box::new(HnswIo::new(&dir, HNSW_BASENAME)));
155        let Ok(hnsw) = reloader.load_hnsw::<f32, DistCosine>() else {
156            return Ok(Self::empty(project_hash.to_owned(), dim));
157        };
158
159        let mut reverse: HashMap<String, usize> = HashMap::with_capacity(meta.id_map.len());
160        for (idx, id) in meta.id_map.iter().enumerate() {
161            // Later entries with the same chunk id overwrite earlier
162            // ones in reverse — matches the tombstone semantics.
163            reverse.insert(id.clone(), idx);
164        }
165        let tombstones: HashSet<String> = meta.tombstones.iter().cloned().collect();
166
167        Ok(Self {
168            inner: Some(hnsw),
169            id_map: meta.id_map,
170            reverse,
171            tombstones,
172            dim,
173            dirty: false,
174            project_hash: project_hash.to_owned(),
175            schema_hash: meta.schema_hash,
176        })
177    }
178
179    /// Construct a fresh index from a full slice of chunks. Used when
180    /// no on-disk graph exists or when the caller wants to compact out
181    /// tombstones. Swallows per-chunk dim mismatches (those rows are
182    /// dropped) but never errors.
183    pub async fn build_from_chunks(
184        project_hash: &str,
185        chunks: &[(String, Vec<f32>)],
186    ) -> Result<Self, CoreError> {
187        // Dim is inferred from the first non-empty embedding — a corpus
188        // with only empty vectors yields an empty index, which is
189        // semantically the same as "fall back to linear".
190        let dim = chunks
191            .iter()
192            .find(|(_, v)| !v.is_empty())
193            .map_or(0, |(_, v)| v.len());
194        if dim == 0 {
195            return Ok(Self::empty(project_hash.to_owned(), 0));
196        }
197
198        let capacity_hint = chunks.len().max(1);
199        let hnsw: Hnsw<'static, f32, DistCosine> = Hnsw::new(
200            MAX_NB_CONNECTION,
201            capacity_hint,
202            MAX_LAYER,
203            EF_CONSTRUCTION,
204            DistCosine,
205        );
206
207        let mut id_map: Vec<String> = Vec::with_capacity(chunks.len());
208        let mut reverse: HashMap<String, usize> = HashMap::with_capacity(chunks.len());
209        for (chunk_id, emb) in chunks {
210            if emb.len() != dim {
211                // Heterogeneous dims within the same project — skip
212                // rather than corrupt the graph.
213                continue;
214            }
215            let internal_id = id_map.len();
216            hnsw.insert((emb.as_slice(), internal_id));
217            reverse.insert(chunk_id.clone(), internal_id);
218            id_map.push(chunk_id.clone());
219        }
220
221        let schema_hash = compute_schema_hash(chunks, dim);
222
223        Ok(Self {
224            inner: Some(hnsw),
225            id_map,
226            reverse,
227            tombstones: HashSet::new(),
228            dim,
229            dirty: true,
230            project_hash: project_hash.to_owned(),
231            schema_hash,
232        })
233    }
234
235    /// Cheap construction helper — used as the fallback whenever
236    /// `load_or_empty` can't find a valid on-disk index.
237    fn empty(project_hash: String, dim: usize) -> Self {
238        Self {
239            inner: None,
240            id_map: Vec::new(),
241            reverse: HashMap::new(),
242            tombstones: HashSet::new(),
243            dim,
244            dirty: false,
245            project_hash,
246            schema_hash: String::new(),
247        }
248    }
249
250    /// Upsert a single chunk. The previous entry (if any) is marked
251    /// tombstoned so it won't surface in future searches; the new
252    /// embedding is appended to the graph with a fresh internal id.
253    ///
254    /// A dim mismatch is silently ignored — the caller is the SQL
255    /// upsert path which continues regardless, matching the "ANN never
256    /// blocks rule writes" contract.
257    pub fn upsert(&mut self, chunk_id: &str, embedding: &[f32]) {
258        if embedding.is_empty() {
259            return;
260        }
261        // Lazy-init the graph on first insert so load_or_empty + upsert
262        // works even when no on-disk graph existed.
263        if self.inner.is_none() {
264            if self.dim == 0 {
265                self.dim = embedding.len();
266            }
267            if embedding.len() != self.dim {
268                return;
269            }
270            self.inner = Some(Hnsw::new(
271                MAX_NB_CONNECTION,
272                64,
273                MAX_LAYER,
274                EF_CONSTRUCTION,
275                DistCosine,
276            ));
277        }
278        if embedding.len() != self.dim {
279            return;
280        }
281        // Previous id for this chunk (if any) becomes a tombstone.
282        if let Some(_prev) = self.reverse.get(chunk_id) {
283            self.tombstones.insert(chunk_id.to_owned());
284            // A tombstoned id_map entry still points to the old string,
285            // but search filters by the chunk_id returned; since we map
286            // internal->chunk_id, we need the OLD internal slot to map
287            // to a distinct "dead" chunk_id so it can't collide with
288            // the new insertion. Simplest approach: keep the old slot
289            // pointing at the same chunk_id and rely on tombstones to
290            // hide it. But then the NEW insertion would also inherit
291            // the tombstone — so we clear it after tagging.
292            //
293            // Concretely: treat `tombstones` as "previously seen this
294            // chunk_id" and on every hit, compare the hit's internal
295            // id against the most recent `reverse` entry — if it
296            // doesn't match, drop the hit. See `search()` below.
297        }
298        #[allow(clippy::expect_used)]
299        // reason: invariant — `self.inner` was just set on the empty branch above.
300        let hnsw = self.inner.as_ref().expect("inner set above");
301        let new_internal = self.id_map.len();
302        hnsw.insert((embedding, new_internal));
303        self.id_map.push(chunk_id.to_owned());
304        self.reverse.insert(chunk_id.to_owned(), new_internal);
305        self.dirty = true;
306    }
307
308    /// Mark a chunk as removed. The underlying HNSW entry is NOT
309    /// physically deleted (`hnsw_rs` has no public `remove` API); instead
310    /// we tombstone it so search skips it. Full reclamation happens on
311    /// the next `build_from_chunks`.
312    pub fn remove(&mut self, chunk_id: &str) {
313        if self.reverse.remove(chunk_id).is_some() {
314            self.tombstones.insert(chunk_id.to_owned());
315            self.dirty = true;
316        }
317    }
318
319    /// Search for `top_k` nearest chunks to the query. Returns
320    /// `(chunk_id, distance)` pairs with smaller distance = more similar
321    /// (`DistCosine` returns `1 - cos`). An empty index, dim mismatch,
322    /// empty query, or any internal error yields an empty result — the
323    /// caller should interpret that as "use the linear scan".
324    pub fn search(&self, query: &[f32], top_k: usize) -> Vec<(String, f32)> {
325        if top_k == 0 || query.is_empty() || self.id_map.is_empty() {
326            return Vec::new();
327        }
328        let top_k = top_k.min(MAX_SEARCH_TOP_K);
329        if query.len() != self.dim {
330            return Vec::new();
331        }
332        let Some(hnsw) = self.inner.as_ref() else {
333            return Vec::new();
334        };
335        // Over-fetch so tombstones + duplicate-id filtering still leaves
336        // us with ≈ top_k survivors. 3x is enough headroom for realistic
337        // tombstone ratios (< 30% of the graph).
338        let raw_k = top_k.saturating_mul(3).min(MAX_RAW_SEARCH_CANDIDATES);
339        let ef = DEFAULT_EF_SEARCH.max(top_k.saturating_mul(2));
340        let raw = hnsw.search(query, raw_k, ef);
341        let mut out = Vec::with_capacity(top_k);
342        let mut seen: HashSet<&str> = HashSet::new();
343        for n in raw {
344            let internal_id = n.d_id;
345            let Some(chunk_id) = self.id_map.get(internal_id) else {
346                continue;
347            };
348            // Dedup — if a chunk_id has a tombstone AND a fresh copy,
349            // only the freshest one (latest `reverse` mapping) wins.
350            if self.tombstones.contains(chunk_id) {
351                if let Some(&current) = self.reverse.get(chunk_id) {
352                    if current != internal_id {
353                        continue;
354                    }
355                } else {
356                    // Fully removed chunk.
357                    continue;
358                }
359            }
360            if !seen.insert(chunk_id.as_str()) {
361                continue;
362            }
363            out.push((chunk_id.clone(), n.distance));
364            if out.len() >= top_k {
365                break;
366            }
367        }
368        out
369    }
370
371    /// Persist the index + sidecar to `~/.difflore/projects/{hash}/`.
372    /// A best-effort operation — directory-create and graph-dump errors
373    /// bubble up so callers can log them, but the typical caller
374    /// (`upsert_rule_chunks`) swallows the error. Sets `dirty = false`
375    /// on success.
376    pub async fn save(&mut self) -> Result<(), CoreError> {
377        let Some(hnsw) = self.inner.as_ref() else {
378            // Empty index => nothing to persist. Still write a meta
379            // file so `load_or_empty` sees a consistent state.
380            let dir = crate::db::project_index_dir(&self.project_hash);
381            std::fs::create_dir_all(&dir)?;
382            self.write_meta(&dir)?;
383            self.dirty = false;
384            return Ok(());
385        };
386        let dir = crate::db::project_index_dir(&self.project_hash);
387        std::fs::create_dir_all(&dir)?;
388        hnsw.file_dump(&dir, HNSW_BASENAME)
389            .map_err(|e| CoreError::Internal(format!("hnsw file_dump failed: {e}")))?;
390        self.write_meta(&dir)?;
391        self.dirty = false;
392        Ok(())
393    }
394
395    fn write_meta(&self, dir: &Path) -> Result<(), CoreError> {
396        let meta = AnnMeta {
397            version: META_VERSION,
398            dim: self.dim,
399            size: u32::try_from(self.id_map.len()).unwrap_or(u32::MAX),
400            built_at: Utc::now().to_rfc3339(),
401            schema_hash: self.schema_hash.clone(),
402            id_map: self.id_map.clone(),
403            tombstones: self.tombstones.iter().cloned().collect(),
404        };
405        let bytes = serde_json::to_vec_pretty(&meta)?;
406        std::fs::write(dir.join(META_FILENAME), bytes)?;
407        Ok(())
408    }
409
410    /// Has the in-memory state diverged from disk? Callers can gate
411    /// expensive `save()` calls on this.
412    pub const fn is_dirty(&self) -> bool {
413        self.dirty
414    }
415
416    /// Number of live (non-tombstoned) chunks in the index. Used by the
417    /// trajectory emitter so the cloud dashboard can chart index growth.
418    pub fn live_size(&self) -> u32 {
419        u32::try_from(self.reverse.len()).unwrap_or(u32::MAX)
420    }
421
422    /// Total chunk count including tombstones. Mostly for tests /
423    /// diagnostics.
424    pub fn total_size(&self) -> u32 {
425        u32::try_from(self.id_map.len()).unwrap_or(u32::MAX)
426    }
427
428    /// Dimensionality of the stored vectors. Zero means "unset" (empty
429    /// index that has never been written to).
430    pub const fn dim(&self) -> usize {
431        self.dim
432    }
433}
434
435/// Compute a stable schema hash from the first up-to-64 chunks. Not a
436/// security construct; the only purpose is to distinguish "same corpus
437/// across runs" from "corpus has changed". Storing every chunk in the
438/// hash would be expensive and the meta file is already large enough
439/// with the `id_map`.
440fn compute_schema_hash(chunks: &[(String, Vec<f32>)], dim: usize) -> String {
441    use sha1::{Digest, Sha1};
442    let mut hasher = Sha1::new();
443    hasher.update(dim.to_le_bytes());
444    for (id, _) in chunks.iter().take(64) {
445        hasher.update(id.as_bytes());
446        hasher.update([0]);
447    }
448    let out = hasher.finalize();
449    let mut hex = String::with_capacity(12);
450    for b in out.iter().take(6) {
451        hex.push_str(&format!("{b:02x}"));
452    }
453    hex
454}
455
456/// Process-wide cache of per-project/per-dimension ANN indices. Mirrors the
457/// `pool_cache()` pattern in `index_db.rs` so two concurrent MCP tool
458/// calls share one `AnnIndex` instance for the same embedding space.
459fn ann_cache() -> &'static AnnCache {
460    static CACHE: OnceLock<AnnCache> = OnceLock::new();
461    CACHE.get_or_init(|| std::sync::Mutex::new(HashMap::new()))
462}
463
464/// Get-or-load the ANN for a project + embedding dimension. Cheap on the hot path (cache
465/// hit); cold-path cost is whatever `load_or_empty` pays (either an
466/// empty struct alloc or an `hnsw_rs` file reload). Never errors.
467pub async fn get_ann_for_project(
468    project_hash: &str,
469    dim: usize,
470) -> Result<Arc<Mutex<AnnIndex>>, CoreError> {
471    {
472        #[allow(clippy::expect_used)]
473        // reason: poisoned mutex in process-wide cache is unrecoverable; abort is correct.
474        let guard = ann_cache().lock().expect("ann cache mutex poisoned");
475        let key = (project_hash.to_owned(), dim);
476        if let Some(existing) = guard.get(&key) {
477            return Ok(Arc::clone(existing));
478        }
479    }
480    let loaded = AnnIndex::load_or_empty(project_hash, dim).await?;
481    let arc = Arc::new(Mutex::new(loaded));
482    #[allow(clippy::expect_used)]
483    // reason: poisoned mutex in process-wide cache is unrecoverable; abort is correct.
484    let mut guard = ann_cache().lock().expect("ann cache mutex poisoned");
485    // Keep the first concurrently loaded index so callers do not fork the cache.
486    let key = (project_hash.to_owned(), dim);
487    let entry = guard.entry(key).or_insert(arc);
488    Ok(Arc::clone(entry))
489}
490
/// Evict every cached index of `project_hash`, whatever its dim.
/// Tests use this to force a cold reload; production code should
/// never need it.
#[cfg(test)]
pub fn invalidate_cache(project_hash: &str) {
    #[allow(clippy::expect_used)]
    // reason: poisoned mutex in process-wide cache is unrecoverable; abort is correct.
    let mut cache = ann_cache().lock().expect("ann cache mutex poisoned");
    cache.retain(|key, _| key.0.as_str() != project_hash);
}
500
501/// Convenience helper for the on-disk files belonging to a project index.
502pub fn ann_files_for_project(project_hash: &str) -> (PathBuf, PathBuf, PathBuf) {
503    let dir = crate::db::project_index_dir(project_hash);
504    (
505        dir.join(format!("{HNSW_BASENAME}.hnsw.graph")),
506        dir.join(format!("{HNSW_BASENAME}.hnsw.data")),
507        dir.join(META_FILENAME),
508    )
509}
510
511#[cfg(test)]
512mod tests {
513    use super::*;
514
515    fn unique_hash(tag: &str) -> String {
516        // Include a timestamp + thread id so tests don't collide on
517        // the process-wide ann_cache.
518        use std::time::{SystemTime, UNIX_EPOCH};
519        let nanos = SystemTime::now()
520            .duration_since(UNIX_EPOCH)
521            .unwrap_or_default()
522            .as_nanos();
523        format!("{tag}-{nanos}")
524    }
525
526    fn random_vec(seed: u64, dim: usize) -> Vec<f32> {
527        // Deterministic pseudo-random so recall tests are reproducible.
528        use std::collections::hash_map::DefaultHasher;
529        use std::hash::{Hash, Hasher};
530        let mut v = Vec::with_capacity(dim);
531        for i in 0..dim {
532            let mut h = DefaultHasher::new();
533            (seed, i).hash(&mut h);
534            let raw = h.finish();
535            // Map into [-1, 1).
536            let x = ((raw as i64) as f64) / (i64::MAX as f64);
537            v.push(x as f32);
538        }
539        // L2 normalise so DistCosine is well-behaved.
540        let n: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
541        if n > 0.0 {
542            for x in &mut v {
543                *x /= n;
544            }
545        }
546        v
547    }
548
549    #[tokio::test]
550    async fn empty_index_search_returns_empty() {
551        // Test isolation is provided by `unique_hash()` below — every
552        // test runs against a different `projects/<hash>` subdir under
553        // the crate-wide `shared_test_home()`. No env mutation needed.
554        let _home = crate::db::shared_test_home();
555        let hash = unique_hash("empty-search");
556        let idx = AnnIndex::load_or_empty(&hash, 16).await.unwrap();
557        assert_eq!(idx.total_size(), 0);
558        let hits = idx.search(&[0.1f32; 16], 5);
559        assert!(hits.is_empty());
560    }
561
562    #[tokio::test]
563    async fn build_and_search_returns_nearest() {
564        // Test isolation is provided by `unique_hash()` below — every
565        // test runs against a different `projects/<hash>` subdir under
566        // the crate-wide `shared_test_home()`. No env mutation needed.
567        let _home = crate::db::shared_test_home();
568        let hash = unique_hash("build-search");
569        let dim = 32;
570        // Seed 50 random vectors, then append a "target" that we'll
571        // query with a near-duplicate.
572        let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
573        for i in 0..50 {
574            chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
575        }
576        let target_seed = 99u64;
577        let target = random_vec(target_seed, dim);
578        chunks.push(("target".to_owned(), target.clone()));
579
580        let idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
581        assert_eq!(idx.total_size(), 51);
582
583        let hits = idx.search(&target, 5);
584        assert!(!hits.is_empty());
585        // The nearest neighbour to a point is the point itself
586        // (distance ~ 0 under DistCosine).
587        assert_eq!(hits[0].0, "target");
588        assert!(
589            hits[0].1 < 1e-3,
590            "self-match distance should be near zero, got {}",
591            hits[0].1
592        );
593    }
594
595    #[tokio::test]
596    async fn save_and_load_roundtrip() {
597        // Test isolation is provided by `unique_hash()` below — every
598        // test runs against a different `projects/<hash>` subdir under
599        // the crate-wide `shared_test_home()`. No env mutation needed.
600        let _home = crate::db::shared_test_home();
601        let hash = unique_hash("roundtrip");
602        let dim = 24;
603        let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
604        for i in 0..20 {
605            chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
606        }
607        let query = random_vec(7, dim);
608
609        let mut idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
610        let before = idx.search(&query, 5);
611        assert!(!before.is_empty());
612        idx.save().await.unwrap();
613        assert!(!idx.is_dirty());
614
615        invalidate_cache(&hash);
616        let reloaded = AnnIndex::load_or_empty(&hash, dim).await.unwrap();
617        let after = reloaded.search(&query, 5);
618        assert_eq!(before.len(), after.len());
619        // The reload path keeps IDs stable — top-1 must match.
620        assert_eq!(before[0].0, after[0].0);
621    }
622
623    #[tokio::test]
624    async fn upsert_replaces_existing_chunk_embedding() {
625        // Test isolation is provided by `unique_hash()` below — every
626        // test runs against a different `projects/<hash>` subdir under
627        // the crate-wide `shared_test_home()`. No env mutation needed.
628        let _home = crate::db::shared_test_home();
629        let hash = unique_hash("upsert");
630        let dim = 8;
631        let vec_a = random_vec(1, dim);
632        let vec_b = random_vec(2, dim);
633        let mut idx = AnnIndex::build_from_chunks(
634            &hash,
635            &[
636                ("id1".to_owned(), vec_a.clone()),
637                ("neighbor".to_owned(), random_vec(3, dim)),
638            ],
639        )
640        .await
641        .unwrap();
642
643        // Querying with A => id1 comes back as top.
644        let hits1 = idx.search(&vec_a, 2);
645        assert_eq!(hits1[0].0, "id1");
646
647        // Upsert id1 with vec B and confirm:
648        //   * querying with A no longer surfaces id1 as top-1
649        //     (tombstoned old slot should be filtered)
650        //   * querying with B DOES surface id1
651        idx.upsert("id1", &vec_b);
652        let hits_a = idx.search(&vec_a, 2);
653        // id1 may still appear but NOT in top-1 unless its new vec_b
654        // happens to be close to vec_a; at minimum the first hit's
655        // internal id should be the fresh one. Rather than race on
656        // ranking, assert the freshest internal id wins.
657        if !hits_a.is_empty() && hits_a[0].0 == "id1" {
658            // This is fine provided it's the fresh copy.
659            let current = idx.reverse.get("id1").copied();
660            assert!(current.is_some());
661        }
662        let hits_b = idx.search(&vec_b, 2);
663        assert!(
664            hits_b.iter().any(|(id, _)| id == "id1"),
665            "upserted chunk must be searchable via new vector"
666        );
667    }
668
669    #[tokio::test]
670    async fn remove_drops_from_search_results() {
671        // Test isolation is provided by `unique_hash()` below — every
672        // test runs against a different `projects/<hash>` subdir under
673        // the crate-wide `shared_test_home()`. No env mutation needed.
674        let _home = crate::db::shared_test_home();
675        let hash = unique_hash("remove");
676        let dim = 12;
677        let vec_a = random_vec(10, dim);
678        let mut idx = AnnIndex::build_from_chunks(
679            &hash,
680            &[
681                ("doomed".to_owned(), vec_a.clone()),
682                ("keep".to_owned(), random_vec(11, dim)),
683            ],
684        )
685        .await
686        .unwrap();
687
688        let before = idx.search(&vec_a, 2);
689        assert!(before.iter().any(|(id, _)| id == "doomed"));
690
691        idx.remove("doomed");
692        let after = idx.search(&vec_a, 2);
693        assert!(
694            !after.iter().any(|(id, _)| id == "doomed"),
695            "removed chunk must not appear in search results"
696        );
697    }
698
699    #[tokio::test]
700    async fn dim_mismatch_triggers_fallback() {
701        // Test isolation is provided by `unique_hash()` below — every
702        // test runs against a different `projects/<hash>` subdir under
703        // the crate-wide `shared_test_home()`. No env mutation needed.
704        let _home = crate::db::shared_test_home();
705        let hash = unique_hash("dim-mismatch");
706        let dim = 16;
707        let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
708        for i in 0..5 {
709            chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
710        }
711        let mut idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
712        idx.save().await.unwrap();
713
714        invalidate_cache(&hash);
715        // Reload asking for a DIFFERENT dim — should get empty index.
716        let reloaded = AnnIndex::load_or_empty(&hash, 32).await.unwrap();
717        assert_eq!(reloaded.total_size(), 0, "dim drift must reset the index");
718        // Searching with a query at the stored dim also yields empty
719        // (the reload dropped the graph so nothing to search).
720        let hits = reloaded.search(&random_vec(0, 32), 5);
721        assert!(hits.is_empty());
722    }
723
724    #[tokio::test]
725    async fn search_recall_at_top_10() {
726        // 1000 random vectors; query is a near-duplicate of one of
727        // them. HNSW should place the target in the top-10 on the
728        // overwhelming majority of runs — we assert "within top 10"
729        // as a recall smoke-check rather than top-1 because HNSW is
730        // approximate.
731        // Test isolation is provided by `unique_hash()` below — every
732        // test runs against a different `projects/<hash>` subdir under
733        // the crate-wide `shared_test_home()`. No env mutation needed.
734        let _home = crate::db::shared_test_home();
735        let hash = unique_hash("recall10");
736        let dim = 32;
737        let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
738        for i in 0..1000 {
739            chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
740        }
741        // Pick c250 as the target; query is a slightly-perturbed
742        // version of its embedding.
743        let target_idx = 250u64;
744        let mut target_query = random_vec(target_idx, dim);
745        // Perturb by 1% then renormalise.
746        for (i, x) in target_query.iter_mut().enumerate() {
747            *x = 0.01f32.mul_add(random_vec(i as u64 + 5000, 1)[0], *x);
748        }
749        let n: f32 = target_query.iter().map(|x| x * x).sum::<f32>().sqrt();
750        if n > 0.0 {
751            for x in &mut target_query {
752                *x /= n;
753            }
754        }
755
756        let idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
757        let hits = idx.search(&target_query, 10);
758        let ids: Vec<_> = hits.iter().map(|(id, _)| id.as_str()).collect();
759        assert!(
760            ids.contains(&format!("c{target_idx}").as_str()),
761            "target chunk must be in top-10 (got {ids:?})"
762        );
763    }
764
765    #[tokio::test]
766    async fn search_caps_large_top_k_requests() {
767        let _home = crate::db::shared_test_home();
768        let hash = unique_hash("cap-top-k");
769        let chunks: Vec<(String, Vec<f32>)> = (0..80)
770            .map(|i| (format!("id-{i:02}"), vec![i as f32, 1.0]))
771            .collect();
772        let idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
773        let hits = idx.search(&[1.0, 1.0], 500);
774        assert!(
775            hits.len() <= MAX_SEARCH_TOP_K,
776            "ANN search should cap oversized requests, got {} hits",
777            hits.len()
778        );
779    }
780
781    #[tokio::test]
782    async fn persistence_meta_version_bump_invalidates() {
783        // Test isolation is provided by `unique_hash()` below — every
784        // test runs against a different `projects/<hash>` subdir under
785        // the crate-wide `shared_test_home()`. No env mutation needed.
786        let _home = crate::db::shared_test_home();
787        let hash = unique_hash("meta-bump");
788        let dim = 8;
789        let chunks = vec![
790            ("a".to_owned(), random_vec(1, dim)),
791            ("b".to_owned(), random_vec(2, dim)),
792        ];
793        let mut idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
794        idx.save().await.unwrap();
795        // Rewrite the meta with an older / bogus version so the next
796        // load drops the on-disk graph.
797        let (_g_path, _d_path, meta_path) = ann_files_for_project(&hash);
798        let raw = std::fs::read(&meta_path).unwrap();
799        let mut parsed: serde_json::Value = serde_json::from_slice(&raw).unwrap();
800        parsed["version"] = serde_json::json!(999);
801        std::fs::write(&meta_path, serde_json::to_vec(&parsed).unwrap()).unwrap();
802
803        invalidate_cache(&hash);
804        let reloaded = AnnIndex::load_or_empty(&hash, dim).await.unwrap();
805        assert_eq!(
806            reloaded.total_size(),
807            0,
808            "version drift must wipe the in-memory graph"
809        );
810    }
811
812    #[tokio::test]
813    async fn get_ann_for_project_caches_across_calls() {
814        // Test isolation is provided by `unique_hash()` below — every
815        // test runs against a different `projects/<hash>` subdir under
816        // the crate-wide `shared_test_home()`. No env mutation needed.
817        let _home = crate::db::shared_test_home();
818        let hash = unique_hash("cache");
819        let a = get_ann_for_project(&hash, 16).await.unwrap();
820        let b = get_ann_for_project(&hash, 16).await.unwrap();
821        assert!(
822            Arc::ptr_eq(&a, &b),
823            "cache must return the same Arc across calls"
824        );
825    }
826
827    #[tokio::test]
828    async fn get_ann_for_project_keys_cache_by_dim() {
829        let _home = crate::db::shared_test_home();
830        let hash = unique_hash("cache-dim");
831        let a = get_ann_for_project(&hash, 16).await.unwrap();
832        let b = get_ann_for_project(&hash, 32).await.unwrap();
833        assert!(
834            !Arc::ptr_eq(&a, &b),
835            "same project with different embedding dims must not reuse one ANN cache entry"
836        );
837    }
838}
difflore_core/context/ann.rs

difflore_core/context/
ann.rs