sqlrite/sql/
hnsw.rs

1//! HNSW (Hierarchical Navigable Small World) approximate-nearest-neighbor
2//! index. Pure algorithm; no SQL integration in this module.
3//!
4//! HNSW is the industry-standard ANN algorithm for in-memory vector search:
5//! a multi-layer graph where each node lives at some randomly-assigned max
6//! layer; higher layers are sparser, layer 0 contains every node. Search
7//! starts at the entry point (the node at the current top layer), greedily
8//! descends layer-by-layer, then does a beam search at layer 0.
9//!
10//! ```text
11//!     layer 2:   [A] -- [E]                    sparse
12//!                 |       |
13//!     layer 1:   [A] -- [E] -- [G] -- [J]      mid
14//!                 |  /  |  \   |  \   |
15//!     layer 0:   [A,B,C,D,E,F,G,H,I,J,...]     dense (every node)
16//! ```
17//!
18//! ## What this module is responsible for
19//!
20//! - The graph (per-node, per-layer neighbor lists)
21//! - Layer assignment for new nodes (geometric distribution)
22//! - Insertion: greedy descent + beam search + neighbor pruning
23//! - Query: greedy descent + beam search at layer 0, return top-k
24//!
25//! ## What it is NOT responsible for (yet)
26//!
//! - **Storing vectors.** The algorithm calls a caller-supplied
//!   `get_vec(node_id) -> Vec<f32>` closure to fetch the vector for any
//!   node it touches. In Phase 7d.2 that closure will read from the SQL
//!   table holding the indexed column; in tests it reads from an
//!   in-memory `Vec<Vec<f32>>`.
31//! - **Persistence.** The graph lives in `HashMap<i64, Node>` for now.
32//!   Phase 7d.3 wires it into the cell-encoded page format.
33//! - **DELETE / UPDATE.** Pre-existing nodes can't be removed today.
34//!   Soft-delete + lazy rebuild is the planned approach for 7d.2/7d.3.
35//!
36//! ## Parameters (per Phase 7 plan Q2 — fixed defaults)
37//!
38//! - `M = 16`              — max neighbors per node at layers > 0
39//! - `m_max0 = 32` (= 2·M) — max neighbors at layer 0
40//! - `ef_construction = 200` — beam width during INSERT
41//! - `ef_search = 50`      — default beam width during query
42//! - `m_l = 1/ln(M) ≈ 0.36`  — layer-assignment scale
43//!
44//! ## Invariants
45//!
46//! - Every `node.layers` Vec has length `node_max_layer + 1` for that node.
47//! - `node.layers[i]` contains node_ids of neighbors at layer i. Each
48//!   neighbor is itself a node in `nodes`; symmetrical (if A → B at layer i
49//!   then B → A at layer i, modulo pruning).
50//! - `entry_point` is `Some(id)` iff `nodes` is non-empty. The entry node
51//!   has the highest max-layer of any node currently in the graph.
52
53use std::cmp::Ordering;
54use std::collections::{BinaryHeap, HashMap, HashSet};
55
/// Distance function the HNSW graph is built and queried with.
///
/// The value produced here has to agree with the matching
/// `vec_distance_*` SQL function (`vec_distance_l2` / `_cosine` / `_dot`
/// in `src/sql/executor.rs` are the canonical implementations). If the
/// two diverged, an index probe and the brute-force fallback would rank
/// rows differently.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DistanceMetric {
    /// Euclidean distance.
    L2,
    /// Cosine distance, `1 - cos(a, b)`.
    Cosine,
    /// Negated dot product, so a larger dot product ranks as "closer".
    Dot,
}

impl DistanceMetric {
    /// Evaluates this metric on `a` and `b`; a smaller result means closer.
    ///
    /// Both slices are expected to have the same dimension. That is only
    /// asserted in debug builds.
    ///
    /// For `Cosine`, a zero denominator (at least one zero-magnitude
    /// input) yields `f32::INFINITY` rather than an error, so the HNSW
    /// search ranks such a node behind every finite candidate.
    pub fn compute(self, a: &[f32], b: &[f32]) -> f32 {
        debug_assert_eq!(a.len(), b.len(), "vector dim mismatch in HNSW distance");
        // Pair every component of `a` with the same-index component of
        // `b`. Taking the prefix up front keeps the out-of-bounds panic
        // for a too-short `b` while letting the loops run check-free.
        let b = &b[..a.len()];
        let pairs = a.iter().zip(b);
        match self {
            DistanceMetric::L2 => pairs
                .fold(0.0f32, |acc, (x, y)| {
                    let diff = x - y;
                    acc + diff * diff
                })
                .sqrt(),
            DistanceMetric::Cosine => {
                // One pass accumulates the dot product and both squared norms.
                let (dot, norm_a_sq, norm_b_sq) = pairs.fold(
                    (0.0f32, 0.0f32, 0.0f32),
                    |(dot, na, nb), (x, y)| (dot + x * y, na + x * x, nb + y * y),
                );
                let denom = (norm_a_sq * norm_b_sq).sqrt();
                if denom == 0.0 {
                    f32::INFINITY
                } else {
                    1.0 - dot / denom
                }
            }
            DistanceMetric::Dot => -pairs.fold(0.0f32, |acc, (x, y)| acc + x * y),
        }
    }
}
113
/// Per-node graph metadata: one neighbor-id list for every layer the
/// node lives in. `layers[0]` is layer 0 (the densest layer) and the last
/// element is the highest layer this node reaches.
#[derive(Debug, Clone, Default)]
pub struct Node {
    /// Neighbor lists indexed by layer (0 = dense). `layers[i]` holds the
    /// ids of this node's neighbors at layer `i`. The order of ids within
    /// a list is not a guaranteed invariant: pruning rewrites a list in
    /// distance order, but plain appends during insert do not re-sort it.
    pub layers: Vec<Vec<i64>>,
}

impl Node {
    /// Highest layer this node reaches, i.e. `layers.len() - 1`.
    ///
    /// A node with no layer lists at all (for example `Node::default()`)
    /// reports layer 0 instead of underflowing, which is the same
    /// convention `HnswIndex::from_persisted_nodes` applies when it
    /// derives a persisted node's max layer.
    pub fn max_layer(&self) -> usize {
        self.layers.len().saturating_sub(1)
    }
}
133
/// Tunable knobs of the HNSW algorithm. Phase 7 ships the fixed defaults
/// below (Q2 in the plan). The type is `Clone + Copy` so a caller can
/// fork an experimental tuning without touching the index itself.
#[derive(Debug, Clone, Copy)]
pub struct HnswParams {
    /// Max neighbors per node at layers above 0.
    pub m: usize,
    /// Max neighbors per node at layer 0.
    pub m_max0: usize,
    /// Beam width used while inserting.
    pub ef_construction: usize,
    /// Default beam width used while querying.
    pub ef_search: usize,
    /// Scale factor of the layer-assignment distribution.
    pub m_l: f32,
}

impl Default for HnswParams {
    /// `M = 16`, `m_max0 = 2·M`, `ef_construction = 200`,
    /// `ef_search = 50`, `m_l = 1 / ln(M)`.
    fn default() -> Self {
        const M: usize = 16;
        Self {
            m: M,
            m_max0: M * 2,
            ef_construction: 200,
            ef_search: 50,
            m_l: (M as f32).ln().recip(),
        }
    }
}
158
159/// In-memory HNSW graph. See module docs for the model.
160#[derive(Debug, Clone)]
161pub struct HnswIndex {
162    pub params: HnswParams,
163    pub distance: DistanceMetric,
164    /// Node id of the entry point. `None` iff the index is empty.
165    /// At all times this is the id of the node with the highest
166    /// max-layer; if multiple nodes tie for the top layer, the
167    /// most-recently-promoted one wins.
168    pub entry_point: Option<i64>,
169    /// Highest layer currently populated. 0 when the index has at
170    /// most one node, grows as new nodes get assigned higher layers.
171    pub top_layer: usize,
172    /// Node id → its per-layer neighbor lists.
173    pub nodes: HashMap<i64, Node>,
174    /// xorshift64 RNG state for layer assignment. Seeded explicitly via
175    /// `new` so tests can pin a known sequence.
176    rng_state: u64,
177}
178
179impl HnswIndex {
180    /// Builds an empty HNSW index with default parameters and the given
181    /// distance metric + RNG seed. A seed of 0 is mapped to a small
182    /// nonzero constant — xorshift gets stuck at zero.
183    pub fn new(distance: DistanceMetric, seed: u64) -> Self {
184        let seed = if seed == 0 { 0x9E3779B97F4A7C15 } else { seed };
185        Self {
186            params: HnswParams::default(),
187            distance,
188            entry_point: None,
189            top_layer: 0,
190            nodes: HashMap::new(),
191            rng_state: seed,
192        }
193    }
194
195    /// True if no nodes have been inserted yet.
196    pub fn is_empty(&self) -> bool {
197        self.nodes.is_empty()
198    }
199
200    /// Number of nodes currently in the index.
201    pub fn len(&self) -> usize {
202        self.nodes.len()
203    }
204
205    /// Phase 7d.3 — produces (node_id, layers) pairs in ascending node_id
206    /// order, suitable for serializing the graph to disk via the
207    /// `HnswNodeCell` wire format. The graph's metadata
208    /// (entry_point + top_layer) is recoverable from the nodes alone:
209    /// top_layer = max(max_layer); entry_point = any node at top_layer.
210    /// So we don't ship a separate metadata cell.
211    pub fn serialize_nodes(&self) -> Vec<(i64, Vec<Vec<i64>>)> {
212        let mut out: Vec<(i64, Vec<Vec<i64>>)> = self
213            .nodes
214            .iter()
215            .map(|(id, n)| (*id, n.layers.clone()))
216            .collect();
217        out.sort_by_key(|(id, _)| *id);
218        out
219    }
220
221    /// Phase 7d.3 — rebuilds an HnswIndex from a stream of (node_id, layers)
222    /// pairs as produced by `serialize_nodes` and round-tripped through
223    /// `HnswNodeCell` encode/decode. The rebuilt index has the same nodes,
224    /// same neighbor lists, same entry_point + top_layer as the source.
225    /// `seed` is fresh; the deserialized index is never inserted into via
226    /// the algorithmic `insert` path so the seed only matters if a caller
227    /// later calls `insert` after deserializing (then it controls layer
228    /// assignment for the appended node).
229    pub fn from_persisted_nodes<I>(distance: DistanceMetric, seed: u64, nodes: I) -> Self
230    where
231        I: IntoIterator<Item = (i64, Vec<Vec<i64>>)>,
232    {
233        let mut idx = Self::new(distance, seed);
234        let mut top_layer = 0usize;
235        let mut entry_point: Option<i64> = None;
236        for (id, layers) in nodes {
237            let max_layer = layers.len().saturating_sub(1);
238            if max_layer > top_layer || entry_point.is_none() {
239                top_layer = max_layer;
240                entry_point = Some(id);
241            }
242            idx.nodes.insert(id, Node { layers });
243        }
244        idx.top_layer = top_layer;
245        idx.entry_point = entry_point;
246        idx
247    }
248
249    /// Inserts a node into the graph. The node id must be unique;
250    /// re-inserting an existing id is a no-op (returns without error).
251    /// `vec` is the new node's vector; `get_vec` looks up the vector
252    /// for any other node id the algorithm touches.
253    pub fn insert<F>(&mut self, node_id: i64, vec: &[f32], get_vec: F)
254    where
255        F: Fn(i64) -> Vec<f32>,
256    {
257        if self.nodes.contains_key(&node_id) {
258            return;
259        }
260
261        // First node: trivial case. Becomes entry point at layer 0.
262        if self.is_empty() {
263            self.nodes.insert(
264                node_id,
265                Node {
266                    layers: vec![Vec::new()],
267                },
268            );
269            self.entry_point = Some(node_id);
270            self.top_layer = 0;
271            return;
272        }
273
274        // Pick a layer for this new node.
275        let target_layer = self.pick_layer();
276
277        // Pre-allocate the new node's layer lists (empty for now;
278        // populated below).
279        let new_node = Node {
280            layers: vec![Vec::new(); target_layer + 1],
281        };
282        self.nodes.insert(node_id, new_node);
283
284        // Greedy descent from top down to (target_layer + 1) — at each
285        // layer above our target, advance the entry point to the
286        // single closest node. We don't add edges at these layers
287        // because the new node doesn't live there.
288        let mut entry = self.entry_point.expect("non-empty index has entry point");
289        for layer in (target_layer + 1..=self.top_layer).rev() {
290            let nearest = self.search_layer(vec, &[entry], 1, layer, &get_vec);
291            if let Some((_, id)) = nearest.into_iter().next() {
292                entry = id;
293            }
294        }
295
296        // Beam search + connect at each layer the new node lives in.
297        // We work top-down; the entry point for each layer is the best
298        // candidate found at the layer above.
299        let mut entries = vec![entry];
300        for layer in (0..=target_layer).rev() {
301            let candidates =
302                self.search_layer(vec, &entries, self.params.ef_construction, layer, &get_vec);
303
304            // Pick up to M neighbors from candidates (M_max0 at layer 0
305            // since we allow more connections at the dense layer).
306            let m_max = if layer == 0 {
307                self.params.m_max0
308            } else {
309                self.params.m
310            };
311            let neighbors: Vec<i64> = candidates
312                .iter()
313                .take(self.params.m)
314                .map(|(_, id)| *id)
315                .collect();
316
317            // Wire up the bidirectional edges.
318            self.nodes.get_mut(&node_id).expect("just inserted").layers[layer] = neighbors.clone();
319
320            for &nb in &neighbors {
321                let nb_layers = &mut self.nodes.get_mut(&nb).expect("neighbor must exist").layers;
322                if layer >= nb_layers.len() {
323                    // Neighbor doesn't actually live at this layer — shouldn't
324                    // happen because search_layer only returns nodes at this
325                    // layer, but defend against it.
326                    continue;
327                }
328                nb_layers[layer].push(node_id);
329
330                // Prune the neighbor's edge list if it's now over its M_max
331                // budget. Pruning policy: keep the closest M_max nodes
332                // by distance. (Distance recomputed; no precomputed values.)
333                if nb_layers[layer].len() > m_max {
334                    let nb_vec = get_vec(nb);
335                    let mut by_dist: Vec<(f32, i64)> = nb_layers[layer]
336                        .iter()
337                        .map(|id| (self.distance.compute(&nb_vec, &get_vec(*id)), *id))
338                        .collect();
339                    by_dist
340                        .sort_by(|(da, _), (db, _)| da.partial_cmp(db).unwrap_or(Ordering::Equal));
341                    by_dist.truncate(m_max);
342                    nb_layers[layer] = by_dist.into_iter().map(|(_, id)| id).collect();
343                }
344            }
345
346            // Carry the candidate set forward as entry points for the
347            // next (lower) layer.
348            entries = candidates.into_iter().map(|(_, id)| id).collect();
349        }
350
351        // If this new node lives higher than the current top, promote it.
352        if target_layer > self.top_layer {
353            self.top_layer = target_layer;
354            self.entry_point = Some(node_id);
355        }
356    }
357
358    /// Returns the k nearest node ids to `query`, in distance-ascending
359    /// order (closest first). Empty index returns an empty Vec.
360    pub fn search<F>(&self, query: &[f32], k: usize, get_vec: F) -> Vec<i64>
361    where
362        F: Fn(i64) -> Vec<f32>,
363    {
364        if self.is_empty() || k == 0 {
365            return Vec::new();
366        }
367
368        // Greedy descent from the top down to layer 1.
369        let mut entry = self.entry_point.expect("non-empty index has entry point");
370        for layer in (1..=self.top_layer).rev() {
371            let nearest = self.search_layer(query, &[entry], 1, layer, &get_vec);
372            if let Some((_, id)) = nearest.into_iter().next() {
373                entry = id;
374            }
375        }
376
377        // Beam search at layer 0 with width = max(ef_search, k).
378        let ef = self.params.ef_search.max(k);
379        let candidates = self.search_layer(query, &[entry], ef, 0, &get_vec);
380
381        candidates.into_iter().take(k).map(|(_, id)| id).collect()
382    }
383
384    /// Runs a beam search at one layer starting from `entries`, returning
385    /// the top-`ef` nearest nodes to `query` found, sorted by distance
386    /// ascending.
387    ///
388    /// This is the workhorse of both insert and search. The two priority
389    /// queues — "candidates" (nodes still to expand) and "results"
390    /// (current best ef found) — terminate when the closest unexpanded
391    /// candidate is farther than the worst kept result.
392    fn search_layer<F>(
393        &self,
394        query: &[f32],
395        entries: &[i64],
396        ef: usize,
397        layer: usize,
398        get_vec: &F,
399    ) -> Vec<(f32, i64)>
400    where
401        F: Fn(i64) -> Vec<f32>,
402    {
403        let mut visited: HashSet<i64> = HashSet::with_capacity(ef * 2);
404        // candidates: min-heap of (distance, id) — pop closest first.
405        let mut candidates: BinaryHeap<MinHeapItem> = BinaryHeap::with_capacity(ef * 2);
406        // results: max-heap of (distance, id) — top is the worst kept.
407        let mut results: BinaryHeap<MaxHeapItem> = BinaryHeap::with_capacity(ef);
408
409        for &id in entries {
410            if !visited.insert(id) {
411                continue;
412            }
413            let d = self.distance.compute(query, &get_vec(id));
414            candidates.push(MinHeapItem { dist: d, id });
415            results.push(MaxHeapItem { dist: d, id });
416        }
417
418        while let Some(MinHeapItem {
419            dist: c_dist,
420            id: c_id,
421        }) = candidates.pop()
422        {
423            // If the closest unexpanded candidate is worse than the
424            // worst kept result, no further expansion can improve the
425            // result set. Bail.
426            if let Some(worst) = results.peek() {
427                if results.len() >= ef && c_dist > worst.dist {
428                    break;
429                }
430            }
431
432            // Expand: visit each neighbor of c_id at this layer.
433            let neighbors = self
434                .nodes
435                .get(&c_id)
436                .and_then(|n| n.layers.get(layer))
437                .cloned()
438                .unwrap_or_default();
439            for nb in neighbors {
440                if !visited.insert(nb) {
441                    continue;
442                }
443                let d = self.distance.compute(query, &get_vec(nb));
444                let admit = if results.len() < ef {
445                    true
446                } else {
447                    d < results.peek().unwrap().dist
448                };
449                if admit {
450                    candidates.push(MinHeapItem { dist: d, id: nb });
451                    results.push(MaxHeapItem { dist: d, id: nb });
452                    if results.len() > ef {
453                        results.pop();
454                    }
455                }
456            }
457        }
458
459        // Drain results into a sorted vec. results is a max-heap, so
460        // popping gives descending order; reverse for ascending.
461        let mut out: Vec<(f32, i64)> = Vec::with_capacity(results.len());
462        while let Some(item) = results.pop() {
463            out.push((item.dist, item.id));
464        }
465        out.reverse();
466        out
467    }
468
469    /// Picks a layer for a new node using the standard HNSW geometric
470    /// distribution: `L = floor(-ln(uniform) * m_l)`. With M=16, mL ≈ 0.36,
471    /// so:
472    ///   - P(L=0) ≈ 1 - 1/M = 15/16
473    ///   - P(L=1) ≈ 1/16 - 1/256
474    ///   - P(L=2) ≈ 1/256 - …
475    /// i.e., most new nodes live only at layer 0; a few percolate up.
476    fn pick_layer(&mut self) -> usize {
477        let u = self.next_uniform().max(1e-6); // guard log(0)
478        let layer = (-u.ln() * self.params.m_l).floor() as usize;
479        // Cap at top_layer + 1 to keep the graph from sprouting empty
480        // layers above the current top — matches the original HNSW
481        // paper's recommendation.
482        layer.min(self.top_layer + 1)
483    }
484
485    /// Pulls a uniform-on-(0, 1] f32 from the internal xorshift state.
486    /// Top 24 bits of the next u64, divided by 2^24 — gives 24-bit
487    /// uniform precision, plenty for layer assignment.
488    fn next_uniform(&mut self) -> f32 {
489        let mut x = self.rng_state;
490        x ^= x << 13;
491        x ^= x >> 7;
492        x ^= x << 17;
493        self.rng_state = x;
494        ((x >> 40) as u32) as f32 / (1u32 << 24) as f32
495    }
496}
497
498// -----------------------------------------------------------------
499// Heap items
500//
501// Rust's BinaryHeap is a max-heap that uses Ord. f32 doesn't impl Ord
502// (NaN), so we wrap (distance, id) pairs and provide custom Ord that
503// uses partial_cmp with NaN treated as Greater (NaN sorts as worst).
504//
505// MinHeapItem inverts the comparison so BinaryHeap<MinHeapItem> behaves
506// as a min-heap — top is the smallest distance, popping gives ascending
507// order.
508//
509// MaxHeapItem uses the natural ordering — top is the largest distance.
510
/// `(distance, id)` entry for a `BinaryHeap` that should behave as a
/// min-heap: the item with the smallest distance is the heap's maximum
/// and is popped first. Equal distances pop in ascending id order.
///
/// NaN distances rank as the worst (largest) distance, so they are
/// popped last. All NaNs compare equal to each other on distance, which
/// keeps `Ord` a genuine total order as `BinaryHeap` requires.
#[derive(Debug, Clone, Copy)]
struct MinHeapItem {
    dist: f32,
    id: i64,
}

impl PartialEq for MinHeapItem {
    // Defined through `cmp` so `Eq` is reflexive (NaN == NaN here) and
    // always agrees with `Ord`.
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for MinHeapItem {}
impl PartialOrd for MinHeapItem {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for MinHeapItem {
    fn cmp(&self, other: &Self) -> Ordering {
        // Ascending distance order with NaN above every number...
        let by_dist = self
            .dist
            .partial_cmp(&other.dist)
            .unwrap_or_else(|| self.dist.is_nan().cmp(&other.dist.is_nan()));
        // ...then reversed so the smallest distance bubbles to the top.
        by_dist.reverse().then_with(|| other.id.cmp(&self.id))
    }
}
538
/// `(distance, id)` entry for a `BinaryHeap` used as a plain max-heap:
/// the item with the largest distance is on top. Equal distances are
/// ordered by id, larger id on top.
///
/// NaN distances rank as the worst (largest) distance, so a NaN item
/// rises to the top. All NaNs compare equal to each other on distance,
/// which keeps `Ord` a genuine total order as `BinaryHeap` requires.
#[derive(Debug, Clone, Copy)]
struct MaxHeapItem {
    dist: f32,
    id: i64,
}

impl PartialEq for MaxHeapItem {
    // Defined through `cmp` so `Eq` is reflexive (NaN == NaN here) and
    // always agrees with `Ord`.
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for MaxHeapItem {}
impl PartialOrd for MaxHeapItem {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for MaxHeapItem {
    fn cmp(&self, other: &Self) -> Ordering {
        // Natural ascending order, NaN above every number, so the
        // largest distance bubbles to the top.
        self.dist
            .partial_cmp(&other.dist)
            .unwrap_or_else(|| self.dist.is_nan().cmp(&other.dist.is_nan()))
            .then_with(|| self.id.cmp(&other.id))
    }
}
565
566// -----------------------------------------------------------------
567// Tests
568// -----------------------------------------------------------------
569
570#[cfg(test)]
571mod tests {
572    use super::*;
573
574    /// Deterministic xorshift to generate test vectors.
575    fn random_vec(state: &mut u64, dim: usize) -> Vec<f32> {
576        (0..dim)
577            .map(|_| {
578                let mut x = *state;
579                x ^= x << 13;
580                x ^= x >> 7;
581                x ^= x << 17;
582                *state = x;
583                ((x >> 40) as u32) as f32 / (1u32 << 24) as f32
584            })
585            .collect()
586    }
587
588    /// Brute-force nearest-neighbors baseline for recall comparison.
589    fn brute_force_topk(
590        vectors: &[Vec<f32>],
591        query: &[f32],
592        k: usize,
593        metric: DistanceMetric,
594    ) -> Vec<i64> {
595        let mut by_dist: Vec<(f32, i64)> = vectors
596            .iter()
597            .enumerate()
598            .map(|(i, v)| (metric.compute(query, v), i as i64))
599            .collect();
600        by_dist.sort_by(|(a, _), (b, _)| a.partial_cmp(b).unwrap_or(Ordering::Equal));
601        by_dist.into_iter().take(k).map(|(_, id)| id).collect()
602    }
603
604    /// recall@k — fraction of the brute-force top-k that the HNSW
605    /// search also returned (in any order).
606    fn recall_at_k(hnsw_result: &[i64], baseline: &[i64]) -> f32 {
607        let baseline_set: HashSet<i64> = baseline.iter().copied().collect();
608        let hits = hnsw_result
609            .iter()
610            .filter(|id| baseline_set.contains(id))
611            .count();
612        hits as f32 / baseline.len() as f32
613    }
614
615    #[test]
616    fn empty_index_returns_empty_search() {
617        let idx = HnswIndex::new(DistanceMetric::L2, 42);
618        let vectors: Vec<Vec<f32>> = vec![];
619        let result = idx.search(&[0.0; 4], 5, |id| vectors[id as usize].clone());
620        assert!(result.is_empty());
621    }
622
623    #[test]
624    fn single_node_returns_only_itself() {
625        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
626        let v0 = vec![1.0, 2.0, 3.0];
627        let vectors = vec![v0.clone()];
628        idx.insert(0, &v0, |id| vectors[id as usize].clone());
629        let result = idx.search(&[0.0; 3], 5, |id| vectors[id as usize].clone());
630        assert_eq!(result, vec![0]);
631    }
632
633    #[test]
634    fn duplicate_insert_is_noop() {
635        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
636        let v0 = vec![1.0, 2.0];
637        let vectors = vec![v0.clone()];
638        idx.insert(0, &v0, |id| vectors[id as usize].clone());
639        idx.insert(0, &v0, |id| vectors[id as usize].clone());
640        assert_eq!(idx.len(), 1);
641    }
642
643    #[test]
644    fn k_zero_returns_empty() {
645        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
646        let vectors = vec![vec![1.0, 0.0], vec![0.0, 1.0]];
647        for (i, v) in vectors.iter().enumerate() {
648            idx.insert(i as i64, v, |id| vectors[id as usize].clone());
649        }
650        let result = idx.search(&[0.5, 0.5], 0, |id| vectors[id as usize].clone());
651        assert!(result.is_empty());
652    }
653
654    #[test]
655    fn small_graph_finds_exact_nearest() {
656        // 5 well-separated points in 2D — HNSW should find the exact
657        // nearest with no recall loss for k=1 and k=3.
658        let vectors: Vec<Vec<f32>> = vec![
659            vec![0.0, 0.0],
660            vec![10.0, 0.0],
661            vec![0.0, 10.0],
662            vec![10.0, 10.0],
663            vec![5.0, 5.0],
664        ];
665        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
666        for (i, v) in vectors.iter().enumerate() {
667            idx.insert(i as i64, v, |id| vectors[id as usize].clone());
668        }
669
670        // Query at (1, 1): nearest is (0, 0).
671        let result = idx.search(&[1.0, 1.0], 1, |id| vectors[id as usize].clone());
672        assert_eq!(result, vec![0]);
673
674        // Query at (5.5, 5.5): top-3 should be id=4 (5,5), then any
675        // two of the corners at distance ~7.78.
676        let result = idx.search(&[5.5, 5.5], 3, |id| vectors[id as usize].clone());
677        assert_eq!(result.len(), 3);
678        assert_eq!(result[0], 4, "closest to (5.5,5.5) should be id=4");
679    }
680
681    #[test]
682    fn recall_at_10_is_high_on_random_vectors_l2() {
683        // Standard recall test: 1000 random vectors in 8D, query for
684        // top-10 with HNSW, compare to brute-force ground truth.
685        // Modern HNSW papers target recall@10 > 0.95; we should clear
686        // that comfortably on this small benchmark.
687        let mut state: u64 = 0xDEADBEEF;
688        let dim = 8;
689        let n = 1000;
690        let queries = 20;
691        let k = 10;
692
693        let vectors: Vec<Vec<f32>> = (0..n).map(|_| random_vec(&mut state, dim)).collect();
694
695        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
696        for (i, v) in vectors.iter().enumerate() {
697            idx.insert(i as i64, v, |id| vectors[id as usize].clone());
698        }
699
700        let mut total_recall = 0.0f32;
701        for _ in 0..queries {
702            let q = random_vec(&mut state, dim);
703            let hnsw_top = idx.search(&q, k, |id| vectors[id as usize].clone());
704            let baseline = brute_force_topk(&vectors, &q, k, DistanceMetric::L2);
705            total_recall += recall_at_k(&hnsw_top, &baseline);
706        }
707        let avg_recall = total_recall / queries as f32;
708        assert!(
709            avg_recall >= 0.95,
710            "recall@{k} dropped below 0.95: avg={avg_recall:.3}"
711        );
712    }
713
714    #[test]
715    fn recall_at_10_is_high_on_random_vectors_cosine() {
716        // Same shape as the L2 test but with cosine distance, to
717        // exercise the alternative metric through the same pipeline.
718        let mut state: u64 = 0xC0FFEE;
719        let dim = 16;
720        let n = 500;
721        let queries = 20;
722        let k = 10;
723
724        let vectors: Vec<Vec<f32>> = (0..n).map(|_| random_vec(&mut state, dim)).collect();
725
726        let mut idx = HnswIndex::new(DistanceMetric::Cosine, 42);
727        for (i, v) in vectors.iter().enumerate() {
728            idx.insert(i as i64, v, |id| vectors[id as usize].clone());
729        }
730
731        let mut total_recall = 0.0f32;
732        for _ in 0..queries {
733            let q = random_vec(&mut state, dim);
734            let hnsw_top = idx.search(&q, k, |id| vectors[id as usize].clone());
735            let baseline = brute_force_topk(&vectors, &q, k, DistanceMetric::Cosine);
736            total_recall += recall_at_k(&hnsw_top, &baseline);
737        }
738        let avg_recall = total_recall / queries as f32;
739        assert!(
740            avg_recall >= 0.95,
741            "cosine recall@{k} dropped below 0.95: avg={avg_recall:.3}"
742        );
743    }
744
745    #[test]
746    fn entry_point_promotes_when_higher_layer_node_inserted() {
747        // The graph's entry point should always be a node at the
748        // current top layer. Insert two nodes; if the second lands at
749        // a higher layer, it becomes the entry point.
750        // We can't easily force a particular layer (it's randomized),
751        // so check the invariant: after every insert, the entry node's
752        // max_layer == top_layer.
753        let mut state: u64 = 0xABCDEF;
754        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
755        let dim = 4;
756        let mut vectors: Vec<Vec<f32>> = Vec::new();
757        for i in 0..50 {
758            vectors.push(random_vec(&mut state, dim));
759            let v = vectors[i].clone();
760            idx.insert(i as i64, &v, |id| vectors[id as usize].clone());
761
762            // Check invariant.
763            let entry = idx.entry_point.expect("non-empty");
764            let entry_max = idx.nodes[&entry].max_layer();
765            assert_eq!(
766                entry_max, idx.top_layer,
767                "entry-point invariant broken at step {i}: entry {entry} has max_layer {entry_max}, top_layer is {}",
768                idx.top_layer
769            );
770        }
771    }
772
773    #[test]
774    fn neighbor_lists_respect_m_max() {
775        // After inserting 200 points with M=16 (so M_max0 = 32), no
776        // node should have more than 32 neighbors at layer 0 or more
777        // than 16 at any higher layer.
778        let mut state: u64 = 0x123456;
779        let mut idx = HnswIndex::new(DistanceMetric::L2, 42);
780        let dim = 4;
781        let mut vectors: Vec<Vec<f32>> = Vec::new();
782        for i in 0..200 {
783            vectors.push(random_vec(&mut state, dim));
784            let v = vectors[i].clone();
785            idx.insert(i as i64, &v, |id| vectors[id as usize].clone());
786        }
787
788        for (id, node) in &idx.nodes {
789            for (layer, neighbors) in node.layers.iter().enumerate() {
790                let cap = if layer == 0 {
791                    idx.params.m_max0
792                } else {
793                    idx.params.m
794                };
795                assert!(
796                    neighbors.len() <= cap,
797                    "node {id} layer {layer} has {} > cap {cap}",
798                    neighbors.len()
799                );
800            }
801        }
802    }
803
804    #[test]
805    fn deterministic_with_fixed_seed() {
806        // Same seed + same insert order → same graph topology.
807        // Catches accidental sources of nondeterminism (HashMap
808        // iteration order, etc.).
809        let mut state: u64 = 0x999;
810        let dim = 4;
811        let n = 50;
812        let vectors: Vec<Vec<f32>> = (0..n).map(|_| random_vec(&mut state, dim)).collect();
813
814        let mut idx_a = HnswIndex::new(DistanceMetric::L2, 42);
815        let mut idx_b = HnswIndex::new(DistanceMetric::L2, 42);
816        for (i, v) in vectors.iter().enumerate() {
817            idx_a.insert(i as i64, v, |id| vectors[id as usize].clone());
818            idx_b.insert(i as i64, v, |id| vectors[id as usize].clone());
819        }
820
821        // Same top layer.
822        assert_eq!(idx_a.top_layer, idx_b.top_layer);
823        // Same entry point.
824        assert_eq!(idx_a.entry_point, idx_b.entry_point);
825        // Same node count and same per-node max-layer for every id.
826        // (Neighbor list contents may differ trivially if HashMap
827        // iteration sneaked in; if this fails, fix the source first.)
828        assert_eq!(idx_a.nodes.len(), idx_b.nodes.len());
829        for (id, node_a) in &idx_a.nodes {
830            let node_b = idx_b.nodes.get(id).expect("missing id");
831            assert_eq!(node_a.max_layer(), node_b.max_layer(), "id={id}");
832        }
833    }
834}
sqlrite/sql/hnsw.rs

sqlrite/sql/
hnsw.rs