difflore_core/context/ann.rs
1//! Per-project HNSW Approximate-Nearest-Neighbor index.
2//!
3//! Wraps [`hnsw_rs::hnsw::Hnsw`] so the retrieval path can swap the O(N)
4//! linear cosine scan for an O(log N) graph query on big projects. The
5//! design is deliberately cautious:
6//!
7//! 1. **Additive** — every public entry point returns `None` / an empty
8//! result on any failure. The retrieval fallback MUST always work, so
9//! this module never panics and never blocks rule writes.
10//! 2. **Persistent, per-project** — each project hash has its own HNSW
11//! graph file under `~/.difflore/projects/{hash}/hnsw.*` plus a
12//! sidecar `hnsw.meta.json` that carries the dim + element count so
13//! we can detect a stale / wrong-dim index on reload.
14//! 3. **Incremental upsert** — `hnsw_rs` supports runtime insertions, so
15//! `upsert_rule_chunks` can stream new embeddings into the graph
16//! without a full rebuild. Replacements (same `chunk_id`, new vector)
17//! are modelled as "shadow" entries: the old internal id stays in the
18//! graph but is hidden from search results by a `tombstones` set.
19//! A full `build_from_chunks` rebuild periodically cleans these out.
20//! 4. **Dim mismatch => fallback** — if the query dim doesn't match the
21//! index dim we return an empty hit set; the caller sees this as
22//! "ANN gave nothing" and uses the linear scan.
23//!
//! The internal-id <-> chunk-id translation is tracked on the Rust side
//! because `hnsw_rs`'s `DataId` is a `usize` and we want to key on
//! `String` chunk ids. The forward map (`id_map`) and the tombstone set
//! are serialised in the sidecar meta file next to the graph; the
//! reverse map is not stored and is rebuilt from `id_map` on load.
28
29use chrono::Utc;
30use hnsw_rs::prelude::*;
31use serde::{Deserialize, Serialize};
32use std::collections::{HashMap, HashSet};
33use std::path::{Path, PathBuf};
34use std::sync::{Arc, OnceLock};
35use tokio::sync::Mutex;
36
37use crate::errors::CoreError;
38
39type AnnCacheKey = (String, usize);
40type SharedAnnIndex = Arc<Mutex<AnnIndex>>;
41type AnnCache = std::sync::Mutex<HashMap<AnnCacheKey, SharedAnnIndex>>;
42
/// Basename handed to `hnsw_rs` when dumping / reloading. The dump is
/// two files, `hnsw.hnsw.graph` and `hnsw.hnsw.data`.
const HNSW_BASENAME: &str = "hnsw";
/// Sidecar meta file, stored in the same directory as the dump.
const META_FILENAME: &str = "hnsw.meta.json";

/// Schema version of the sidecar. Bump it whenever the sidecar shape
/// changes: a sidecar carrying any other version is treated as stale and
/// the loader falls back to an empty index.
const META_VERSION: u32 = 1;

/// Graph construction parameters passed to `Hnsw::new`. They are the
/// "reasonable defaults" from the `hnsw_rs` docs / the Malkov+Yashunin
/// paper and are intentionally not exposed to callers: in the targeted
/// size range (≤ 100K chunks) retrieval quality depends more on our own
/// RRF weighting than on these knobs.
const MAX_NB_CONNECTION: usize = 16;
const EF_CONSTRUCTION: usize = 200;
const MAX_LAYER: usize = 16;
/// Lower bound for the `ef` value used at query time.
const DEFAULT_EF_SEARCH: usize = 64;
/// Hard cap on the `top_k` a single search will honour.
const MAX_SEARCH_TOP_K: usize = 50;
/// Hard cap on raw graph candidates per search: the 3x over-fetch factor
/// applied to `MAX_SEARCH_TOP_K`.
const MAX_RAW_SEARCH_CANDIDATES: usize = 3 * MAX_SEARCH_TOP_K;
64
65/// Meta file serialised to `hnsw.meta.json`. A schema mismatch (version
66/// or dim) causes the reload path to drop the on-disk index and return
67/// an empty in-memory one, which the retrieval path then treats as a
68/// fallback cue.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70struct AnnMeta {
71 version: u32,
72 dim: usize,
73 /// Number of elements inserted (including tombstoned ones that are
74 /// hidden from search but still live in the graph).
75 size: u32,
76 /// ISO-8601 timestamp of the most recent `save()` call.
77 built_at: String,
78 /// SHA-1 of the (id, content, embedding dim) signature from the
79 /// `rule_chunks` table at build time. Included so higher layers can
80 /// detect a divergence between the graph and the source DB. Not
81 /// currently consulted by retrieval (which always trusts the graph
82 /// and falls back on empty results).
83 schema_hash: String,
84 /// Ordered list of chunk ids by internal HNSW point id. Required
85 /// because `hnsw_rs`'s `DataId` is a `usize` but our chunks are keyed
86 /// on stable string ids.
87 id_map: Vec<String>,
88 /// Chunk ids that were overwritten after insertion and so should be
89 /// filtered out of search results. Allows `upsert()` to be cheap
90 /// (no graph edit) at the cost of a small `HashSet` lookup per hit.
91 tombstones: Vec<String>,
92}
93
94/// A project-scoped, disk-persisted HNSW index. See the module docs for
95/// the overall approach.
96pub struct AnnIndex {
97 inner: Option<Hnsw<'static, f32, DistCosine>>,
98 /// internal HNSW `DataId` -> `chunk_id`
99 id_map: Vec<String>,
100 /// `chunk_id` -> most recent internal `DataId` (later one wins; older
101 /// ids land in `tombstones`)
102 reverse: HashMap<String, usize>,
103 /// Chunk ids whose embeddings were superseded by an `upsert` and so
104 /// should not appear in search output.
105 tombstones: HashSet<String>,
106 dim: usize,
107 dirty: bool,
108 project_hash: String,
109 /// Cached schema hash from the last successful load/build, carried
110 /// back into `save()` without recomputing. Pure bookkeeping.
111 schema_hash: String,
112}
113
114impl AnnIndex {
115 /// Load from disk, or create an empty index if the on-disk files
116 /// are missing / corrupt / dim-mismatched. Never errors — worst
117 /// case the caller gets an empty index and retrieval falls through
118 /// to the linear path.
119 pub async fn load_or_empty(project_hash: &str, dim: usize) -> Result<Self, CoreError> {
120 let dir = crate::db::project_index_dir(project_hash);
121 let meta_path = dir.join(META_FILENAME);
122 let graph_path = dir.join(format!("{HNSW_BASENAME}.hnsw.graph"));
123 let data_path = dir.join(format!("{HNSW_BASENAME}.hnsw.data"));
124
125 // Short-circuit: if nothing on disk, return an empty index.
126 if !meta_path.exists() || !graph_path.exists() || !data_path.exists() {
127 return Ok(Self::empty(project_hash.to_owned(), dim));
128 }
129
130 // Try to parse the meta. A corrupt / missing file falls through
131 // to an empty index rather than failing retrieval.
132 let meta: AnnMeta = match std::fs::read(&meta_path) {
133 Ok(bytes) => match serde_json::from_slice(&bytes) {
134 Ok(m) => m,
135 Err(_) => return Ok(Self::empty(project_hash.to_owned(), dim)),
136 },
137 Err(_) => return Ok(Self::empty(project_hash.to_owned(), dim)),
138 };
139
140 // Version or dim drift => wipe and start fresh. Dim mismatch is
141 // the safety net for the embedder-swap case (user flips from
142 // SHA1 fallback to real 1536-dim provider mid-flight).
143 if meta.version != META_VERSION || meta.dim != dim {
144 return Ok(Self::empty(project_hash.to_owned(), dim));
145 }
146
147 // hnsw_rs dumps into two files named by basename; reload via
148 // HnswIo. The returned `Hnsw<'b, T, D>` borrows from the reloader
149 // (lifetime `'a: 'b`) even when we're not using mmap, so we
150 // `Box::leak` the reloader to get a `'static` handle. Cost is a
151 // small one-time leak per project per process — acceptable
152 // because load_or_empty runs at most once per `ann_cache()`
153 // entry across the whole process.
154 let reloader: &'static mut HnswIo = Box::leak(Box::new(HnswIo::new(&dir, HNSW_BASENAME)));
155 let Ok(hnsw) = reloader.load_hnsw::<f32, DistCosine>() else {
156 return Ok(Self::empty(project_hash.to_owned(), dim));
157 };
158
159 let mut reverse: HashMap<String, usize> = HashMap::with_capacity(meta.id_map.len());
160 for (idx, id) in meta.id_map.iter().enumerate() {
161 // Later entries with the same chunk id overwrite earlier
162 // ones in reverse — matches the tombstone semantics.
163 reverse.insert(id.clone(), idx);
164 }
165 let tombstones: HashSet<String> = meta.tombstones.iter().cloned().collect();
166
167 Ok(Self {
168 inner: Some(hnsw),
169 id_map: meta.id_map,
170 reverse,
171 tombstones,
172 dim,
173 dirty: false,
174 project_hash: project_hash.to_owned(),
175 schema_hash: meta.schema_hash,
176 })
177 }
178
179 /// Construct a fresh index from a full slice of chunks. Used when
180 /// no on-disk graph exists or when the caller wants to compact out
181 /// tombstones. Swallows per-chunk dim mismatches (those rows are
182 /// dropped) but never errors.
183 pub async fn build_from_chunks(
184 project_hash: &str,
185 chunks: &[(String, Vec<f32>)],
186 ) -> Result<Self, CoreError> {
187 // Dim is inferred from the first non-empty embedding — a corpus
188 // with only empty vectors yields an empty index, which is
189 // semantically the same as "fall back to linear".
190 let dim = chunks
191 .iter()
192 .find(|(_, v)| !v.is_empty())
193 .map_or(0, |(_, v)| v.len());
194 if dim == 0 {
195 return Ok(Self::empty(project_hash.to_owned(), 0));
196 }
197
198 let capacity_hint = chunks.len().max(1);
199 let hnsw: Hnsw<'static, f32, DistCosine> = Hnsw::new(
200 MAX_NB_CONNECTION,
201 capacity_hint,
202 MAX_LAYER,
203 EF_CONSTRUCTION,
204 DistCosine,
205 );
206
207 let mut id_map: Vec<String> = Vec::with_capacity(chunks.len());
208 let mut reverse: HashMap<String, usize> = HashMap::with_capacity(chunks.len());
209 for (chunk_id, emb) in chunks {
210 if emb.len() != dim {
211 // Heterogeneous dims within the same project — skip
212 // rather than corrupt the graph.
213 continue;
214 }
215 let internal_id = id_map.len();
216 hnsw.insert((emb.as_slice(), internal_id));
217 reverse.insert(chunk_id.clone(), internal_id);
218 id_map.push(chunk_id.clone());
219 }
220
221 let schema_hash = compute_schema_hash(chunks, dim);
222
223 Ok(Self {
224 inner: Some(hnsw),
225 id_map,
226 reverse,
227 tombstones: HashSet::new(),
228 dim,
229 dirty: true,
230 project_hash: project_hash.to_owned(),
231 schema_hash,
232 })
233 }
234
235 /// Cheap construction helper — used as the fallback whenever
236 /// `load_or_empty` can't find a valid on-disk index.
237 fn empty(project_hash: String, dim: usize) -> Self {
238 Self {
239 inner: None,
240 id_map: Vec::new(),
241 reverse: HashMap::new(),
242 tombstones: HashSet::new(),
243 dim,
244 dirty: false,
245 project_hash,
246 schema_hash: String::new(),
247 }
248 }
249
250 /// Upsert a single chunk. The previous entry (if any) is marked
251 /// tombstoned so it won't surface in future searches; the new
252 /// embedding is appended to the graph with a fresh internal id.
253 ///
254 /// A dim mismatch is silently ignored — the caller is the SQL
255 /// upsert path which continues regardless, matching the "ANN never
256 /// blocks rule writes" contract.
257 pub fn upsert(&mut self, chunk_id: &str, embedding: &[f32]) {
258 if embedding.is_empty() {
259 return;
260 }
261 // Lazy-init the graph on first insert so load_or_empty + upsert
262 // works even when no on-disk graph existed.
263 if self.inner.is_none() {
264 if self.dim == 0 {
265 self.dim = embedding.len();
266 }
267 if embedding.len() != self.dim {
268 return;
269 }
270 self.inner = Some(Hnsw::new(
271 MAX_NB_CONNECTION,
272 64,
273 MAX_LAYER,
274 EF_CONSTRUCTION,
275 DistCosine,
276 ));
277 }
278 if embedding.len() != self.dim {
279 return;
280 }
281 // Previous id for this chunk (if any) becomes a tombstone.
282 if let Some(_prev) = self.reverse.get(chunk_id) {
283 self.tombstones.insert(chunk_id.to_owned());
284 // A tombstoned id_map entry still points to the old string,
285 // but search filters by the chunk_id returned; since we map
286 // internal->chunk_id, we need the OLD internal slot to map
287 // to a distinct "dead" chunk_id so it can't collide with
288 // the new insertion. Simplest approach: keep the old slot
289 // pointing at the same chunk_id and rely on tombstones to
290 // hide it. But then the NEW insertion would also inherit
291 // the tombstone — so we clear it after tagging.
292 //
293 // Concretely: treat `tombstones` as "previously seen this
294 // chunk_id" and on every hit, compare the hit's internal
295 // id against the most recent `reverse` entry — if it
296 // doesn't match, drop the hit. See `search()` below.
297 }
298 #[allow(clippy::expect_used)]
299 // reason: invariant — `self.inner` was just set on the empty branch above.
300 let hnsw = self.inner.as_ref().expect("inner set above");
301 let new_internal = self.id_map.len();
302 hnsw.insert((embedding, new_internal));
303 self.id_map.push(chunk_id.to_owned());
304 self.reverse.insert(chunk_id.to_owned(), new_internal);
305 self.dirty = true;
306 }
307
308 /// Mark a chunk as removed. The underlying HNSW entry is NOT
309 /// physically deleted (`hnsw_rs` has no public `remove` API); instead
310 /// we tombstone it so search skips it. Full reclamation happens on
311 /// the next `build_from_chunks`.
312 pub fn remove(&mut self, chunk_id: &str) {
313 if self.reverse.remove(chunk_id).is_some() {
314 self.tombstones.insert(chunk_id.to_owned());
315 self.dirty = true;
316 }
317 }
318
319 /// Search for `top_k` nearest chunks to the query. Returns
320 /// `(chunk_id, distance)` pairs with smaller distance = more similar
321 /// (`DistCosine` returns `1 - cos`). An empty index, dim mismatch,
322 /// empty query, or any internal error yields an empty result — the
323 /// caller should interpret that as "use the linear scan".
324 pub fn search(&self, query: &[f32], top_k: usize) -> Vec<(String, f32)> {
325 if top_k == 0 || query.is_empty() || self.id_map.is_empty() {
326 return Vec::new();
327 }
328 let top_k = top_k.min(MAX_SEARCH_TOP_K);
329 if query.len() != self.dim {
330 return Vec::new();
331 }
332 let Some(hnsw) = self.inner.as_ref() else {
333 return Vec::new();
334 };
335 // Over-fetch so tombstones + duplicate-id filtering still leaves
336 // us with ≈ top_k survivors. 3x is enough headroom for realistic
337 // tombstone ratios (< 30% of the graph).
338 let raw_k = top_k.saturating_mul(3).min(MAX_RAW_SEARCH_CANDIDATES);
339 let ef = DEFAULT_EF_SEARCH.max(top_k.saturating_mul(2));
340 let raw = hnsw.search(query, raw_k, ef);
341 let mut out = Vec::with_capacity(top_k);
342 let mut seen: HashSet<&str> = HashSet::new();
343 for n in raw {
344 let internal_id = n.d_id;
345 let Some(chunk_id) = self.id_map.get(internal_id) else {
346 continue;
347 };
348 // Dedup — if a chunk_id has a tombstone AND a fresh copy,
349 // only the freshest one (latest `reverse` mapping) wins.
350 if self.tombstones.contains(chunk_id) {
351 if let Some(¤t) = self.reverse.get(chunk_id) {
352 if current != internal_id {
353 continue;
354 }
355 } else {
356 // Fully removed chunk.
357 continue;
358 }
359 }
360 if !seen.insert(chunk_id.as_str()) {
361 continue;
362 }
363 out.push((chunk_id.clone(), n.distance));
364 if out.len() >= top_k {
365 break;
366 }
367 }
368 out
369 }
370
371 /// Persist the index + sidecar to `~/.difflore/projects/{hash}/`.
372 /// A best-effort operation — directory-create and graph-dump errors
373 /// bubble up so callers can log them, but the typical caller
374 /// (`upsert_rule_chunks`) swallows the error. Sets `dirty = false`
375 /// on success.
376 pub async fn save(&mut self) -> Result<(), CoreError> {
377 let Some(hnsw) = self.inner.as_ref() else {
378 // Empty index => nothing to persist. Still write a meta
379 // file so `load_or_empty` sees a consistent state.
380 let dir = crate::db::project_index_dir(&self.project_hash);
381 std::fs::create_dir_all(&dir)?;
382 self.write_meta(&dir)?;
383 self.dirty = false;
384 return Ok(());
385 };
386 let dir = crate::db::project_index_dir(&self.project_hash);
387 std::fs::create_dir_all(&dir)?;
388 hnsw.file_dump(&dir, HNSW_BASENAME)
389 .map_err(|e| CoreError::Internal(format!("hnsw file_dump failed: {e}")))?;
390 self.write_meta(&dir)?;
391 self.dirty = false;
392 Ok(())
393 }
394
395 fn write_meta(&self, dir: &Path) -> Result<(), CoreError> {
396 let meta = AnnMeta {
397 version: META_VERSION,
398 dim: self.dim,
399 size: u32::try_from(self.id_map.len()).unwrap_or(u32::MAX),
400 built_at: Utc::now().to_rfc3339(),
401 schema_hash: self.schema_hash.clone(),
402 id_map: self.id_map.clone(),
403 tombstones: self.tombstones.iter().cloned().collect(),
404 };
405 let bytes = serde_json::to_vec_pretty(&meta)?;
406 std::fs::write(dir.join(META_FILENAME), bytes)?;
407 Ok(())
408 }
409
410 /// Has the in-memory state diverged from disk? Callers can gate
411 /// expensive `save()` calls on this.
412 pub const fn is_dirty(&self) -> bool {
413 self.dirty
414 }
415
416 /// Number of live (non-tombstoned) chunks in the index. Used by the
417 /// trajectory emitter so the cloud dashboard can chart index growth.
418 pub fn live_size(&self) -> u32 {
419 u32::try_from(self.reverse.len()).unwrap_or(u32::MAX)
420 }
421
422 /// Total chunk count including tombstones. Mostly for tests /
423 /// diagnostics.
424 pub fn total_size(&self) -> u32 {
425 u32::try_from(self.id_map.len()).unwrap_or(u32::MAX)
426 }
427
428 /// Dimensionality of the stored vectors. Zero means "unset" (empty
429 /// index that has never been written to).
430 pub const fn dim(&self) -> usize {
431 self.dim
432 }
433}
434
435/// Compute a stable schema hash from the first up-to-64 chunks. Not a
436/// security construct; the only purpose is to distinguish "same corpus
437/// across runs" from "corpus has changed". Storing every chunk in the
438/// hash would be expensive and the meta file is already large enough
439/// with the `id_map`.
440fn compute_schema_hash(chunks: &[(String, Vec<f32>)], dim: usize) -> String {
441 use sha1::{Digest, Sha1};
442 let mut hasher = Sha1::new();
443 hasher.update(dim.to_le_bytes());
444 for (id, _) in chunks.iter().take(64) {
445 hasher.update(id.as_bytes());
446 hasher.update([0]);
447 }
448 let out = hasher.finalize();
449 let mut hex = String::with_capacity(12);
450 for b in out.iter().take(6) {
451 hex.push_str(&format!("{b:02x}"));
452 }
453 hex
454}
455
456/// Process-wide cache of per-project/per-dimension ANN indices. Mirrors the
457/// `pool_cache()` pattern in `index_db.rs` so two concurrent MCP tool
458/// calls share one `AnnIndex` instance for the same embedding space.
459fn ann_cache() -> &'static AnnCache {
460 static CACHE: OnceLock<AnnCache> = OnceLock::new();
461 CACHE.get_or_init(|| std::sync::Mutex::new(HashMap::new()))
462}
463
464/// Get-or-load the ANN for a project + embedding dimension. Cheap on the hot path (cache
465/// hit); cold-path cost is whatever `load_or_empty` pays (either an
466/// empty struct alloc or an `hnsw_rs` file reload). Never errors.
467pub async fn get_ann_for_project(
468 project_hash: &str,
469 dim: usize,
470) -> Result<Arc<Mutex<AnnIndex>>, CoreError> {
471 {
472 #[allow(clippy::expect_used)]
473 // reason: poisoned mutex in process-wide cache is unrecoverable; abort is correct.
474 let guard = ann_cache().lock().expect("ann cache mutex poisoned");
475 let key = (project_hash.to_owned(), dim);
476 if let Some(existing) = guard.get(&key) {
477 return Ok(Arc::clone(existing));
478 }
479 }
480 let loaded = AnnIndex::load_or_empty(project_hash, dim).await?;
481 let arc = Arc::new(Mutex::new(loaded));
482 #[allow(clippy::expect_used)]
483 // reason: poisoned mutex in process-wide cache is unrecoverable; abort is correct.
484 let mut guard = ann_cache().lock().expect("ann cache mutex poisoned");
485 // Keep the first concurrently loaded index so callers do not fork the cache.
486 let key = (project_hash.to_owned(), dim);
487 let entry = guard.entry(key).or_insert(arc);
488 Ok(Arc::clone(entry))
489}
490
/// Evict every cached index of `project_hash`, whatever its dimension.
/// Test-only: it forces the next lookup to take the cold reload path.
/// Production code should never need it.
#[cfg(test)]
pub fn invalidate_cache(project_hash: &str) {
    #[allow(clippy::expect_used)]
    // reason: poisoned mutex in process-wide cache is unrecoverable; abort is correct.
    let mut registry = ann_cache().lock().expect("ann cache mutex poisoned");
    registry.retain(|key, _| key.0.as_str() != project_hash);
}
500
501/// Convenience helper for the on-disk files belonging to a project index.
502pub fn ann_files_for_project(project_hash: &str) -> (PathBuf, PathBuf, PathBuf) {
503 let dir = crate::db::project_index_dir(project_hash);
504 (
505 dir.join(format!("{HNSW_BASENAME}.hnsw.graph")),
506 dir.join(format!("{HNSW_BASENAME}.hnsw.data")),
507 dir.join(META_FILENAME),
508 )
509}
510
511#[cfg(test)]
512mod tests {
513 use super::*;
514
/// Produce a project hash that no other call in this process returns, so
/// tests never collide on the process-wide `ann_cache` or on disk.
///
/// The hash is `{tag}-{nanos}-{seq}`: wall-clock nanoseconds keep hashes
/// from different runs apart, and the process-wide counter guarantees
/// uniqueness within a run even on a coarse clock or when the clock
/// reads before the epoch (`nanos` falls back to `0` then).
fn unique_hash(tag: &str) -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    static SEQ: AtomicU64 = AtomicU64::new(0);

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();
    // Relaxed is enough: only the uniqueness of the value matters.
    let seq = SEQ.fetch_add(1, Ordering::Relaxed);
    format!("{tag}-{nanos}-{seq}")
}
525
/// Deterministic pseudo-random unit vector of length `dim`, so recall
/// tests are reproducible within a build.
///
/// Component `i` is derived from hashing `(seed, i)` with
/// `DefaultHasher`: the 64-bit hash is reinterpreted as a signed integer
/// and scaled by `i64::MAX` into roughly `[-1, 1]`. The vector is then
/// L2-normalised (unless its norm is zero) so `DistCosine` is
/// well-behaved.
fn random_vec(seed: u64, dim: usize) -> Vec<f32> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut components: Vec<f32> = (0..dim)
        .map(|i| {
            let mut hasher = DefaultHasher::new();
            (seed, i).hash(&mut hasher);
            let signed = hasher.finish() as i64;
            ((signed as f64) / (i64::MAX as f64)) as f32
        })
        .collect();

    let norm: f32 = components.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm > 0.0 {
        components.iter_mut().for_each(|x| *x /= norm);
    }
    components
}
548
549 #[tokio::test]
550 async fn empty_index_search_returns_empty() {
551 // Test isolation is provided by `unique_hash()` below — every
552 // test runs against a different `projects/<hash>` subdir under
553 // the crate-wide `shared_test_home()`. No env mutation needed.
554 let _home = crate::db::shared_test_home();
555 let hash = unique_hash("empty-search");
556 let idx = AnnIndex::load_or_empty(&hash, 16).await.unwrap();
557 assert_eq!(idx.total_size(), 0);
558 let hits = idx.search(&[0.1f32; 16], 5);
559 assert!(hits.is_empty());
560 }
561
562 #[tokio::test]
563 async fn build_and_search_returns_nearest() {
564 // Test isolation is provided by `unique_hash()` below — every
565 // test runs against a different `projects/<hash>` subdir under
566 // the crate-wide `shared_test_home()`. No env mutation needed.
567 let _home = crate::db::shared_test_home();
568 let hash = unique_hash("build-search");
569 let dim = 32;
570 // Seed 50 random vectors, then append a "target" that we'll
571 // query with a near-duplicate.
572 let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
573 for i in 0..50 {
574 chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
575 }
576 let target_seed = 99u64;
577 let target = random_vec(target_seed, dim);
578 chunks.push(("target".to_owned(), target.clone()));
579
580 let idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
581 assert_eq!(idx.total_size(), 51);
582
583 let hits = idx.search(&target, 5);
584 assert!(!hits.is_empty());
585 // The nearest neighbour to a point is the point itself
586 // (distance ~ 0 under DistCosine).
587 assert_eq!(hits[0].0, "target");
588 assert!(
589 hits[0].1 < 1e-3,
590 "self-match distance should be near zero, got {}",
591 hits[0].1
592 );
593 }
594
595 #[tokio::test]
596 async fn save_and_load_roundtrip() {
597 // Test isolation is provided by `unique_hash()` below — every
598 // test runs against a different `projects/<hash>` subdir under
599 // the crate-wide `shared_test_home()`. No env mutation needed.
600 let _home = crate::db::shared_test_home();
601 let hash = unique_hash("roundtrip");
602 let dim = 24;
603 let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
604 for i in 0..20 {
605 chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
606 }
607 let query = random_vec(7, dim);
608
609 let mut idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
610 let before = idx.search(&query, 5);
611 assert!(!before.is_empty());
612 idx.save().await.unwrap();
613 assert!(!idx.is_dirty());
614
615 invalidate_cache(&hash);
616 let reloaded = AnnIndex::load_or_empty(&hash, dim).await.unwrap();
617 let after = reloaded.search(&query, 5);
618 assert_eq!(before.len(), after.len());
619 // The reload path keeps IDs stable — top-1 must match.
620 assert_eq!(before[0].0, after[0].0);
621 }
622
623 #[tokio::test]
624 async fn upsert_replaces_existing_chunk_embedding() {
625 // Test isolation is provided by `unique_hash()` below — every
626 // test runs against a different `projects/<hash>` subdir under
627 // the crate-wide `shared_test_home()`. No env mutation needed.
628 let _home = crate::db::shared_test_home();
629 let hash = unique_hash("upsert");
630 let dim = 8;
631 let vec_a = random_vec(1, dim);
632 let vec_b = random_vec(2, dim);
633 let mut idx = AnnIndex::build_from_chunks(
634 &hash,
635 &[
636 ("id1".to_owned(), vec_a.clone()),
637 ("neighbor".to_owned(), random_vec(3, dim)),
638 ],
639 )
640 .await
641 .unwrap();
642
643 // Querying with A => id1 comes back as top.
644 let hits1 = idx.search(&vec_a, 2);
645 assert_eq!(hits1[0].0, "id1");
646
647 // Upsert id1 with vec B and confirm:
648 // * querying with A no longer surfaces id1 as top-1
649 // (tombstoned old slot should be filtered)
650 // * querying with B DOES surface id1
651 idx.upsert("id1", &vec_b);
652 let hits_a = idx.search(&vec_a, 2);
653 // id1 may still appear but NOT in top-1 unless its new vec_b
654 // happens to be close to vec_a; at minimum the first hit's
655 // internal id should be the fresh one. Rather than race on
656 // ranking, assert the freshest internal id wins.
657 if !hits_a.is_empty() && hits_a[0].0 == "id1" {
658 // This is fine provided it's the fresh copy.
659 let current = idx.reverse.get("id1").copied();
660 assert!(current.is_some());
661 }
662 let hits_b = idx.search(&vec_b, 2);
663 assert!(
664 hits_b.iter().any(|(id, _)| id == "id1"),
665 "upserted chunk must be searchable via new vector"
666 );
667 }
668
669 #[tokio::test]
670 async fn remove_drops_from_search_results() {
671 // Test isolation is provided by `unique_hash()` below — every
672 // test runs against a different `projects/<hash>` subdir under
673 // the crate-wide `shared_test_home()`. No env mutation needed.
674 let _home = crate::db::shared_test_home();
675 let hash = unique_hash("remove");
676 let dim = 12;
677 let vec_a = random_vec(10, dim);
678 let mut idx = AnnIndex::build_from_chunks(
679 &hash,
680 &[
681 ("doomed".to_owned(), vec_a.clone()),
682 ("keep".to_owned(), random_vec(11, dim)),
683 ],
684 )
685 .await
686 .unwrap();
687
688 let before = idx.search(&vec_a, 2);
689 assert!(before.iter().any(|(id, _)| id == "doomed"));
690
691 idx.remove("doomed");
692 let after = idx.search(&vec_a, 2);
693 assert!(
694 !after.iter().any(|(id, _)| id == "doomed"),
695 "removed chunk must not appear in search results"
696 );
697 }
698
699 #[tokio::test]
700 async fn dim_mismatch_triggers_fallback() {
701 // Test isolation is provided by `unique_hash()` below — every
702 // test runs against a different `projects/<hash>` subdir under
703 // the crate-wide `shared_test_home()`. No env mutation needed.
704 let _home = crate::db::shared_test_home();
705 let hash = unique_hash("dim-mismatch");
706 let dim = 16;
707 let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
708 for i in 0..5 {
709 chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
710 }
711 let mut idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
712 idx.save().await.unwrap();
713
714 invalidate_cache(&hash);
715 // Reload asking for a DIFFERENT dim — should get empty index.
716 let reloaded = AnnIndex::load_or_empty(&hash, 32).await.unwrap();
717 assert_eq!(reloaded.total_size(), 0, "dim drift must reset the index");
718 // Searching with a query at the stored dim also yields empty
719 // (the reload dropped the graph so nothing to search).
720 let hits = reloaded.search(&random_vec(0, 32), 5);
721 assert!(hits.is_empty());
722 }
723
724 #[tokio::test]
725 async fn search_recall_at_top_10() {
726 // 1000 random vectors; query is a near-duplicate of one of
727 // them. HNSW should place the target in the top-10 on the
728 // overwhelming majority of runs — we assert "within top 10"
729 // as a recall smoke-check rather than top-1 because HNSW is
730 // approximate.
731 // Test isolation is provided by `unique_hash()` below — every
732 // test runs against a different `projects/<hash>` subdir under
733 // the crate-wide `shared_test_home()`. No env mutation needed.
734 let _home = crate::db::shared_test_home();
735 let hash = unique_hash("recall10");
736 let dim = 32;
737 let mut chunks: Vec<(String, Vec<f32>)> = Vec::new();
738 for i in 0..1000 {
739 chunks.push((format!("c{i}"), random_vec(i as u64, dim)));
740 }
741 // Pick c250 as the target; query is a slightly-perturbed
742 // version of its embedding.
743 let target_idx = 250u64;
744 let mut target_query = random_vec(target_idx, dim);
745 // Perturb by 1% then renormalise.
746 for (i, x) in target_query.iter_mut().enumerate() {
747 *x = 0.01f32.mul_add(random_vec(i as u64 + 5000, 1)[0], *x);
748 }
749 let n: f32 = target_query.iter().map(|x| x * x).sum::<f32>().sqrt();
750 if n > 0.0 {
751 for x in &mut target_query {
752 *x /= n;
753 }
754 }
755
756 let idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
757 let hits = idx.search(&target_query, 10);
758 let ids: Vec<_> = hits.iter().map(|(id, _)| id.as_str()).collect();
759 assert!(
760 ids.contains(&format!("c{target_idx}").as_str()),
761 "target chunk must be in top-10 (got {ids:?})"
762 );
763 }
764
765 #[tokio::test]
766 async fn search_caps_large_top_k_requests() {
767 let _home = crate::db::shared_test_home();
768 let hash = unique_hash("cap-top-k");
769 let chunks: Vec<(String, Vec<f32>)> = (0..80)
770 .map(|i| (format!("id-{i:02}"), vec![i as f32, 1.0]))
771 .collect();
772 let idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
773 let hits = idx.search(&[1.0, 1.0], 500);
774 assert!(
775 hits.len() <= MAX_SEARCH_TOP_K,
776 "ANN search should cap oversized requests, got {} hits",
777 hits.len()
778 );
779 }
780
781 #[tokio::test]
782 async fn persistence_meta_version_bump_invalidates() {
783 // Test isolation is provided by `unique_hash()` below — every
784 // test runs against a different `projects/<hash>` subdir under
785 // the crate-wide `shared_test_home()`. No env mutation needed.
786 let _home = crate::db::shared_test_home();
787 let hash = unique_hash("meta-bump");
788 let dim = 8;
789 let chunks = vec![
790 ("a".to_owned(), random_vec(1, dim)),
791 ("b".to_owned(), random_vec(2, dim)),
792 ];
793 let mut idx = AnnIndex::build_from_chunks(&hash, &chunks).await.unwrap();
794 idx.save().await.unwrap();
795 // Rewrite the meta with an older / bogus version so the next
796 // load drops the on-disk graph.
797 let (_g_path, _d_path, meta_path) = ann_files_for_project(&hash);
798 let raw = std::fs::read(&meta_path).unwrap();
799 let mut parsed: serde_json::Value = serde_json::from_slice(&raw).unwrap();
800 parsed["version"] = serde_json::json!(999);
801 std::fs::write(&meta_path, serde_json::to_vec(&parsed).unwrap()).unwrap();
802
803 invalidate_cache(&hash);
804 let reloaded = AnnIndex::load_or_empty(&hash, dim).await.unwrap();
805 assert_eq!(
806 reloaded.total_size(),
807 0,
808 "version drift must wipe the in-memory graph"
809 );
810 }
811
812 #[tokio::test]
813 async fn get_ann_for_project_caches_across_calls() {
814 // Test isolation is provided by `unique_hash()` below — every
815 // test runs against a different `projects/<hash>` subdir under
816 // the crate-wide `shared_test_home()`. No env mutation needed.
817 let _home = crate::db::shared_test_home();
818 let hash = unique_hash("cache");
819 let a = get_ann_for_project(&hash, 16).await.unwrap();
820 let b = get_ann_for_project(&hash, 16).await.unwrap();
821 assert!(
822 Arc::ptr_eq(&a, &b),
823 "cache must return the same Arc across calls"
824 );
825 }
826
827 #[tokio::test]
828 async fn get_ann_for_project_keys_cache_by_dim() {
829 let _home = crate::db::shared_test_home();
830 let hash = unique_hash("cache-dim");
831 let a = get_ann_for_project(&hash, 16).await.unwrap();
832 let b = get_ann_for_project(&hash, 32).await.unwrap();
833 assert!(
834 !Arc::ptr_eq(&a, &b),
835 "same project with different embedding dims must not reuse one ANN cache entry"
836 );
837 }
838}