1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
//! BFS-based graph reordering for improved cache locality during HNSW search.
//!
//! After index construction, vectors are stored in insertion order. Reordering
//! them in BFS traversal order from the entry point improves spatial locality
//! when following graph edges during search, reducing cache misses by 15-30%.
//!
//! Reference: "Graph Reordering for Cache-Efficient Near Neighbor Search"
//! (arXiv:2104.03221, NeurIPS 2022).
use super::super::distance::DistanceEngine;
use super::super::layer::NodeId;
use super::{NativeHnsw, NO_ENTRY_POINT};
use std::collections::VecDeque;
use std::sync::atomic::Ordering;
/// Minimum element count below which reordering provides no measurable benefit.
/// At <1000 vectors, the entire working set fits in L2 cache.
const REORDER_THRESHOLD: usize = 1000;
impl<D: DistanceEngine> NativeHnsw<D> {
/// Reorders graph nodes in BFS traversal order for improved cache locality.
///
/// After reordering, vectors that are close in the graph are also close
/// in memory, reducing cache misses during search traversal.
///
/// # When to call
///
/// - After `build()` completes for a static index
/// - After compaction of a dynamic index
/// - Not needed for small indices (< 1000 vectors)
///
/// # Errors
///
/// Returns an error if vector storage reordering fails.
pub fn reorder_for_locality(&self) -> crate::error::Result<()> {
let count = self.count.load(Ordering::Relaxed);
if count < REORDER_THRESHOLD {
return Ok(());
}
let entry = self.entry_point.load(Ordering::Acquire);
if entry == NO_ENTRY_POINT {
return Ok(());
}
let permutation = self.compute_bfs_order(entry, count);
if permutation.is_empty() {
return Ok(());
}
self.apply_permutation(&permutation)?;
// Reordering rewrites both the vector buffer AND every neighbour
// list, so any cached CSR / flat-vector snapshot built from the
// pre-reorder topology is now stale. The contract on
// `invalidate_gpu_caches` is "call after every mutation that
// changes the set of active nodes or their vector data" — an
// in-place permutation qualifies. This was a pre-existing gap
// (PR #626 never invalidated here either); folding the fix into
// PR-B of #634 since the whole point of the unified version
// counter is to close this class of bug.
#[cfg(feature = "gpu")]
self.invalidate_gpu_caches();
Ok(())
}
/// Computes BFS traversal order starting from the entry point on layer 0.
///
/// Returns a permutation where `result[i]` is the old node ID that should
/// occupy position `i` after reordering. Disconnected nodes are appended
/// in their original order after the BFS component.
fn compute_bfs_order(&self, entry: NodeId, count: usize) -> Vec<NodeId> {
let layers = self.layers.read();
if layers.is_empty() {
return Vec::new();
}
let mut order = Vec::with_capacity(count);
let mut visited = vec![false; count];
let mut queue = VecDeque::with_capacity(count);
if entry < count {
visited[entry] = true;
queue.push_back(entry);
}
self.bfs_walk(&layers[0], &mut queue, &mut visited, &mut order, count);
self.append_unvisited(&visited, &mut order);
order
}
/// Runs BFS on the given layer, draining the queue and appending nodes to `order`.
#[allow(clippy::unused_self)] // Reason: method receiver for future per-graph config
fn bfs_walk(
&self,
layer: &super::super::layer::Layer,
queue: &mut VecDeque<NodeId>,
visited: &mut [bool],
order: &mut Vec<NodeId>,
count: usize,
) {
while let Some(node) = queue.pop_front() {
order.push(node);
let _ = layer.with_neighbors(node, |neighbors| {
for &neighbor in neighbors {
if neighbor < count && !visited[neighbor] {
visited[neighbor] = true;
queue.push_back(neighbor);
}
}
});
}
}
/// Appends any unvisited nodes (disconnected components) to `order`.
#[allow(clippy::unused_self)] // Reason: method receiver for future per-graph config
fn append_unvisited(&self, visited: &[bool], order: &mut Vec<NodeId>) {
for (node, &was_visited) in visited.iter().enumerate() {
if !was_visited {
order.push(node);
}
}
}
/// Applies a permutation to vectors, neighbor lists, and the entry point.
///
/// After reordering, builds a PDX columnar layout from the reordered
/// vectors for SIMD-parallel distance computation.
fn apply_permutation(&self, new_order: &[NodeId]) -> crate::error::Result<()> {
let count = new_order.len();
let old_to_new = Self::build_reverse_mapping(new_order, count);
self.reorder_vectors(new_order)?;
self.remap_neighbor_ids(&old_to_new);
self.update_entry_point(&old_to_new, count);
self.build_columnar_layout();
Ok(())
}
/// Builds a PDX block-columnar layout from the current vector storage.
///
/// This transposes row-major vectors into 64-vector blocks where each
/// dimension is contiguous, enabling SIMD-parallel distance computation.
fn build_columnar_layout(&self) {
let vectors_guard = self.vectors.read();
if let Some(vectors) = vectors_guard.as_ref() {
let pdx = super::super::columnar_vectors::ColumnarVectors::from_contiguous(vectors);
*self.columnar.write() = Some(pdx);
}
}
/// Builds a reverse mapping: `result[old_id] = new_id`.
fn build_reverse_mapping(new_order: &[NodeId], count: usize) -> Vec<usize> {
let mut old_to_new = vec![0usize; count];
for (new_id, &old_id) in new_order.iter().enumerate() {
if old_id < count {
old_to_new[old_id] = new_id;
}
}
old_to_new
}
/// Reorders vector storage according to the given permutation.
fn reorder_vectors(&self, new_order: &[NodeId]) -> crate::error::Result<()> {
let mut guard = self.vectors.write();
if let Some(storage) = guard.as_mut() {
storage.reorder(new_order)?;
}
Ok(())
}
/// Remaps all neighbor IDs in all layers according to the mapping.
fn remap_neighbor_ids(&self, old_to_new: &[usize]) {
let mut layers = self.layers.write();
for layer in layers.iter_mut() {
layer.remap_ids(old_to_new);
}
}
/// Updates the entry point to its new ID after permutation.
///
/// Called during `reorder_for_locality()` which runs single-threaded
/// after all inserts complete. No concurrent promotions are possible,
/// so a direct atomic store with `Release` ordering is sufficient.
fn update_entry_point(&self, old_to_new: &[usize], count: usize) {
let old_ep = self.entry_point.load(Ordering::Acquire);
if old_ep != NO_ENTRY_POINT && old_ep < count {
self.entry_point
.store(old_to_new[old_ep], Ordering::Release);
}
}
}
#[cfg(test)]
mod tests {
use super::super::super::distance::CpuDistance;
use super::*;
use crate::distance::DistanceMetric;
/// Creates a small test index with `n` vectors of dimension `dim`.
fn build_test_index(n: usize, dim: usize) -> NativeHnsw<CpuDistance> {
let distance = CpuDistance::new(DistanceMetric::Euclidean);
let hnsw = NativeHnsw::new(distance, 8, 32, n);
for i in 0..n {
// Reason: cast_precision_loss acceptable for test data generation.
#[allow(clippy::cast_precision_loss)]
let v: Vec<f32> = (0..dim).map(|d| (i * dim + d) as f32).collect();
hnsw.insert(&v).unwrap();
}
hnsw
}
#[test]
fn reorder_skips_small_index() {
let hnsw = build_test_index(50, 4);
// Should be a no-op for <1000 vectors
assert!(hnsw.reorder_for_locality().is_ok());
}
#[test]
fn reorder_preserves_search_results() {
let hnsw = build_test_index(1200, 4);
// Search before reorder
let query = vec![1.0, 2.0, 3.0, 4.0];
let before = hnsw.search(&query, 5, 64);
let _before_ids: Vec<NodeId> = before.iter().map(|(id, _)| *id).collect();
// Reorder
hnsw.reorder_for_locality().unwrap();
// Search after reorder — results should contain the same vectors
// (IDs change but the distances should be identical)
let after = hnsw.search(&query, 5, 64);
assert_eq!(before.len(), after.len(), "Result count changed");
// Distances should match (order may differ by tie-breaking)
let mut before_dists: Vec<f32> = before.iter().map(|(_, d)| *d).collect();
let mut after_dists: Vec<f32> = after.iter().map(|(_, d)| *d).collect();
before_dists.sort_by(f32::total_cmp);
after_dists.sort_by(f32::total_cmp);
for (b, a) in before_dists.iter().zip(after_dists.iter()) {
assert!(
(b - a).abs() < 1e-5,
"Distance mismatch: before={b}, after={a}"
);
}
// Verify entry point is still valid
let ep_id = hnsw.entry_point.load(Ordering::Acquire);
assert_ne!(ep_id, NO_ENTRY_POINT, "Entry point lost after reorder");
assert!(
ep_id < hnsw.count.load(Ordering::Relaxed),
"Entry point out of bounds"
);
// Verify all IDs in results are within bounds
for (id, _) in &after {
assert!(*id < hnsw.count.load(Ordering::Relaxed));
}
}
#[test]
fn bfs_order_covers_all_nodes() {
let hnsw = build_test_index(1200, 4);
let count = hnsw.count.load(Ordering::Relaxed);
let ep = hnsw.entry_point.load(Ordering::Acquire);
assert_ne!(ep, NO_ENTRY_POINT, "entry_point must be set");
let order = hnsw.compute_bfs_order(ep, count);
assert_eq!(order.len(), count, "BFS order must cover all nodes");
// Verify it's a valid permutation (each node appears exactly once)
let mut seen = vec![false; count];
for &id in &order {
assert!(id < count, "BFS order contains out-of-bounds ID: {id}");
assert!(!seen[id], "BFS order contains duplicate ID: {id}");
seen[id] = true;
}
}
#[test]
fn reverse_mapping_is_inverse() {
let new_order = vec![3, 1, 4, 0, 2];
let old_to_new = NativeHnsw::<CpuDistance>::build_reverse_mapping(&new_order, 5);
// new_order[0] = 3 => old_to_new[3] = 0
// new_order[1] = 1 => old_to_new[1] = 1
// new_order[2] = 4 => old_to_new[4] = 2
// new_order[3] = 0 => old_to_new[0] = 3
// new_order[4] = 2 => old_to_new[2] = 4
assert_eq!(old_to_new, vec![3, 1, 4, 0, 2]);
}
}