// nodedb_vector/hnsw/graph.rs
//! HNSW graph structure — nodes, parameters, core index operations.
//!
//! Production implementation per Malkov & Yashunin (2018).
//! FP32 construction for structural integrity; heuristic neighbor selection.

use crate::distance::distance;

// Re-export shared params from nodedb-types.
pub use nodedb_types::hnsw::HnswParams;

/// Result of a k-NN search.
///
/// `PartialEq` is derived so callers and tests can compare expected and
/// actual results directly (no `Eq`/`Hash`: `distance` is an `f32`).
#[derive(Debug, Clone, PartialEq)]
pub struct SearchResult {
    /// Internal node identifier (insertion order).
    pub id: u32,
    /// Distance from the query vector.
    pub distance: f32,
}

/// A node in the HNSW graph.
///
/// `Debug` and `Clone` are derived: all fields are plain data, and public
/// types should be debuggable; cloning supports snapshot/export paths.
#[derive(Debug, Clone)]
pub struct Node {
    /// Full-precision vector data.
    pub vector: Vec<f32>,
    /// Neighbors at each layer this node participates in.
    /// `neighbors[layer]` holds internal node IDs; `neighbors.len()` is the
    /// number of layers the node is a member of.
    pub neighbors: Vec<Vec<u32>>,
    /// Tombstone flag for soft-deletion.
    pub deleted: bool,
}

/// Hierarchical Navigable Small World graph index.
///
/// - FP32 construction for structural integrity
/// - Heuristic neighbor selection (Algorithm 4)
/// - Beam search with configurable ef parameter
pub struct HnswIndex {
    /// Construction/search parameters (`m`, `m0`, `ef_construction`, metric).
    pub(crate) params: HnswParams,
    /// Dimensionality of stored vectors.
    pub(crate) dim: usize,
    /// All nodes in insertion order; a node's index is its internal ID.
    pub(crate) nodes: Vec<Node>,
    /// Search entry point (internal node ID); `None` until first insert.
    pub(crate) entry_point: Option<u32>,
    /// Highest layer currently present in the graph.
    pub(crate) max_layer: usize,
    /// PRNG used for random layer assignment.
    pub(crate) rng: Xorshift64,
    /// Flat neighbor storage for zero-copy access after checkpoint restore.
    /// When present, `neighbors_at()` reads from here instead of per-node Vecs.
    /// Cleared on first mutation (insert/delete).
    pub(crate) flat_neighbors: Option<crate::hnsw::flat_neighbors::FlatNeighborStore>,
}
48impl HnswIndex {
49    /// Get neighbors of a node at a specific layer.
50    /// Uses flat zero-copy storage if available, otherwise per-node Vec.
51    #[inline]
52    pub(crate) fn neighbors_at(&self, node_id: u32, layer: usize) -> &[u32] {
53        if let Some(ref flat) = self.flat_neighbors {
54            return flat.neighbors_at(node_id, layer);
55        }
56        let node = &self.nodes[node_id as usize];
57        if layer < node.neighbors.len() {
58            &node.neighbors[layer]
59        } else {
60            &[]
61        }
62    }
63
64    /// Number of layers a node participates in.
65    #[inline]
66    pub(crate) fn node_num_layers(&self, node_id: u32) -> usize {
67        if let Some(ref flat) = self.flat_neighbors {
68            return flat.num_layers(node_id);
69        }
70        self.nodes[node_id as usize].neighbors.len()
71    }
72
73    /// Ensure mutable per-node neighbor Vecs are available.
74    /// Materializes flat storage back to per-node Vecs if needed.
75    pub(crate) fn ensure_mutable_neighbors(&mut self) {
76        if let Some(flat) = self.flat_neighbors.take() {
77            let nested = flat.to_nested(self.nodes.len());
78            for (i, layers) in nested.into_iter().enumerate() {
79                self.nodes[i].neighbors = layers;
80            }
81        }
82    }
83}
84
/// Lightweight xorshift64 PRNG for layer assignment.
pub struct Xorshift64(pub u64);

impl Xorshift64 {
    /// Build a PRNG from `seed`; a zero seed is coerced to 1 because a
    /// zero xorshift state would remain zero forever.
    pub fn new(seed: u64) -> Self {
        Self(if seed == 0 { 1 } else { seed })
    }

    /// Advance the state with the standard 13/7/17 xorshift steps and
    /// scale into (0, 1]. Never yields 0.0 since the state is never zero.
    pub fn next_f64(&mut self) -> f64 {
        let mut x = self.0;
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        self.0 = x;
        x as f64 / u64::MAX as f64
    }
}

/// Ordered candidate for priority queues during search and construction.
#[derive(Clone, Copy, PartialEq)]
pub struct Candidate {
    pub dist: f32,
    pub id: u32,
}

impl Eq for Candidate {}

impl PartialOrd for Candidate {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Candidate {
    /// Total order: ascending distance first, `id` as tiebreaker.
    /// An incomparable distance pair (NaN) falls through to the id tiebreak.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        match self.dist.partial_cmp(&other.dist) {
            Some(Ordering::Equal) | None => self.id.cmp(&other.id),
            Some(ord) => ord,
        }
    }
}

125impl HnswIndex {
126    /// Create a new empty HNSW index.
127    pub fn new(dim: usize, params: HnswParams) -> Self {
128        Self {
129            dim,
130            nodes: Vec::new(),
131            entry_point: None,
132            max_layer: 0,
133            rng: Xorshift64::new(42),
134            flat_neighbors: None,
135            params,
136        }
137    }
138
139    /// Create with a specific RNG seed (for deterministic testing).
140    pub fn with_seed(dim: usize, params: HnswParams, seed: u64) -> Self {
141        Self {
142            dim,
143            nodes: Vec::new(),
144            entry_point: None,
145            max_layer: 0,
146            rng: Xorshift64::new(seed),
147            flat_neighbors: None,
148            params,
149        }
150    }
151
152    pub fn len(&self) -> usize {
153        self.nodes.len()
154    }
155
156    pub fn live_count(&self) -> usize {
157        self.nodes.len() - self.tombstone_count()
158    }
159
160    pub fn tombstone_count(&self) -> usize {
161        self.nodes.iter().filter(|n| n.deleted).count()
162    }
163
164    /// Tombstone ratio: fraction of nodes that are deleted.
165    pub fn tombstone_ratio(&self) -> f64 {
166        if self.nodes.is_empty() {
167            0.0
168        } else {
169            self.tombstone_count() as f64 / self.nodes.len() as f64
170        }
171    }
172
173    pub fn is_empty(&self) -> bool {
174        self.live_count() == 0
175    }
176
177    /// Soft-delete a vector by internal node ID.
178    pub fn delete(&mut self, id: u32) -> bool {
179        if let Some(node) = self.nodes.get_mut(id as usize) {
180            if node.deleted {
181                return false;
182            }
183            node.deleted = true;
184            true
185        } else {
186            false
187        }
188    }
189
190    pub fn is_deleted(&self, id: u32) -> bool {
191        self.nodes.get(id as usize).is_none_or(|n| n.deleted)
192    }
193
194    pub fn undelete(&mut self, id: u32) -> bool {
195        if let Some(node) = self.nodes.get_mut(id as usize)
196            && node.deleted
197        {
198            node.deleted = false;
199            return true;
200        }
201        false
202    }
203
204    pub fn dim(&self) -> usize {
205        self.dim
206    }
207
208    pub fn get_vector(&self, id: u32) -> Option<&[f32]> {
209        self.nodes.get(id as usize).map(|n| n.vector.as_slice())
210    }
211
212    pub fn params(&self) -> &HnswParams {
213        &self.params
214    }
215
216    pub fn entry_point(&self) -> Option<u32> {
217        self.entry_point
218    }
219
220    pub fn max_layer(&self) -> usize {
221        self.max_layer
222    }
223
224    /// Current RNG state (for snapshot reproducibility).
225    pub fn rng_state(&self) -> u64 {
226        self.rng.0
227    }
228
229    /// Approximate memory usage in bytes (vector data + neighbor lists).
230    pub fn memory_usage_bytes(&self) -> usize {
231        let vector_bytes = self.nodes.len() * self.dim * std::mem::size_of::<f32>();
232        let neighbor_bytes: usize = self
233            .nodes
234            .iter()
235            .map(|n| {
236                n.neighbors
237                    .iter()
238                    .map(|layer| layer.len() * 4)
239                    .sum::<usize>()
240            })
241            .sum();
242        let node_overhead = self.nodes.len() * std::mem::size_of::<Node>();
243        vector_bytes + neighbor_bytes + node_overhead
244    }
245
246    /// Export all vectors for snapshot transfer.
247    pub fn export_vectors(&self) -> Vec<Vec<f32>> {
248        self.nodes.iter().map(|n| n.vector.clone()).collect()
249    }
250
251    /// Export all neighbor lists for snapshot transfer.
252    pub fn export_neighbors(&self) -> Vec<Vec<Vec<u32>>> {
253        self.nodes.iter().map(|n| n.neighbors.clone()).collect()
254    }
255
256    /// Assign a random layer using the exponential distribution.
257    pub(crate) fn random_layer(&mut self) -> usize {
258        let ml = 1.0 / (self.params.m as f64).ln();
259        let r = self.rng.next_f64().max(f64::MIN_POSITIVE);
260        (-r.ln() * ml).floor() as usize
261    }
262
263    /// Compute distance between a query vector and a stored node.
264    pub(crate) fn dist_to_node(&self, query: &[f32], node_id: u32) -> f32 {
265        distance(
266            query,
267            &self.nodes[node_id as usize].vector,
268            self.params.metric,
269        )
270    }
271
272    /// Max neighbors allowed at a given layer.
273    pub(crate) fn max_neighbors(&self, layer: usize) -> usize {
274        if layer == 0 {
275            self.params.m0
276        } else {
277            self.params.m
278        }
279    }
280
281    /// Compact the index by removing all tombstoned nodes.
282    pub fn compact(&mut self) -> usize {
283        let tombstone_count = self.tombstone_count();
284        if tombstone_count == 0 {
285            return 0;
286        }
287        self.ensure_mutable_neighbors();
288
289        let mut id_map: Vec<u32> = Vec::with_capacity(self.nodes.len());
290        let mut new_id = 0u32;
291        for node in &self.nodes {
292            if node.deleted {
293                id_map.push(u32::MAX);
294            } else {
295                id_map.push(new_id);
296                new_id += 1;
297            }
298        }
299
300        let mut new_nodes: Vec<Node> = Vec::with_capacity(new_id as usize);
301        for node in self.nodes.drain(..) {
302            if node.deleted {
303                continue;
304            }
305            let remapped_neighbors: Vec<Vec<u32>> = node
306                .neighbors
307                .into_iter()
308                .map(|layer_neighbors| {
309                    layer_neighbors
310                        .into_iter()
311                        .filter_map(|old_nid| {
312                            let new_nid = id_map[old_nid as usize];
313                            if new_nid == u32::MAX {
314                                None
315                            } else {
316                                Some(new_nid)
317                            }
318                        })
319                        .collect()
320                })
321                .collect();
322            new_nodes.push(Node {
323                vector: node.vector,
324                neighbors: remapped_neighbors,
325                deleted: false,
326            });
327        }
328
329        self.entry_point = if let Some(old_ep) = self.entry_point {
330            let new_ep = id_map[old_ep as usize];
331            if new_ep == u32::MAX {
332                new_nodes
333                    .iter()
334                    .enumerate()
335                    .max_by_key(|(_, n)| n.neighbors.len())
336                    .map(|(i, _)| i as u32)
337            } else {
338                Some(new_ep)
339            }
340        } else {
341            None
342        };
343
344        self.max_layer = new_nodes
345            .iter()
346            .map(|n| n.neighbors.len().saturating_sub(1))
347            .max()
348            .unwrap_or(0);
349
350        self.nodes = new_nodes;
351        tombstone_count
352    }
353}
354
#[cfg(test)]
mod tests {
    use super::*;
    use crate::distance::DistanceMetric;

    /// A freshly created index has no nodes and no entry point.
    #[test]
    fn create_empty_index() {
        let index = HnswIndex::new(3, HnswParams::default());
        assert_eq!(index.len(), 0);
        assert!(index.entry_point().is_none());
        assert!(index.is_empty());
    }

    /// Default construction parameters match the documented values.
    #[test]
    fn params_default() {
        let params = HnswParams::default();
        assert_eq!((params.m, params.m0), (16, 32));
        assert_eq!(params.ef_construction, 200);
        assert_eq!(params.metric, DistanceMetric::Cosine);
    }

    /// Candidates order by ascending distance.
    #[test]
    fn candidate_ordering() {
        let near = Candidate { dist: 0.1, id: 1 };
        let far = Candidate { dist: 0.5, id: 2 };
        assert!(near < far);
    }
}