1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
// HNSW Index Internal Helpers
//
// This module contains internal helper methods for HnswIndex.
// Includes layer management, ID translation, validation, and layer insertion.
//
// Note: This file is included via include! macro in index.rs
// All imports are inherited from the parent module
// Note: HnswIndex is defined in the parent (index.rs)
// This file is included via include! macro, so types are available in scope
impl HnswIndex {
/// Validate the HNSW configuration
pub(crate) fn validate_config(config: &crate::hnsw::config::HnswConfig) -> Result<(), crate::hnsw::errors::HnswError> {
use crate::hnsw::errors::{HnswError, HnswIndexError};
if config.dimension == 0 {
return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
}
if config.m == 0 {
return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
}
if config.ef_construction < config.m {
return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
}
if config.ef_search == 0 {
return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
}
if config.ml == 0 {
return Err(HnswError::Index(HnswIndexError::InvalidNodeId(0)));
}
Ok(())
}
/// Determine which layer to insert a vector into using exponential distribution
pub(crate) fn determine_insertion_level(&mut self) -> usize {
if self.config.enable_multilayer {
if let Some(distributor) = &mut self.level_distributor {
distributor.sample_level_internal()
} else {
0 // Fallback to single-layer if distributor not initialized
}
} else {
0 // Single-layer mode for backward compatibility
}
}
/// Insert a vector into a specific layer
pub(crate) fn insert_into_layer(&mut self, vector_id: u64, level: usize) -> Result<(), crate::hnsw::errors::HnswError> {
use crate::hnsw::errors::{HnswError, HnswIndexError};
// Determine local node ID based on mode
// In multi-layer mode: use LayerMappings for ID translation
// In single-layer mode: direct conversion (vector_id - 1)
let node_id = if let Some(manager) = &mut self.multi_layer_manager {
// Multi-layer mode: use LayerMappings to get local ID
manager.get_local_id(vector_id, level)
.ok_or(HnswError::Index(HnswIndexError::NodeNotFound(vector_id)))?
} else {
// Single-layer mode: direct 1-based to 0-based conversion
vector_id - 1
};
// Ensure layer exists, create if necessary
while level >= self.layers.len() {
let new_level = self.layers.len() as u8;
let max_connections = (self.config.m / 2_usize.pow(new_level as u32)).max(1);
self.layers.push(crate::hnsw::layer::HnswLayer::new(new_level, max_connections));
}
// Add the node to the layer first (this makes it an entry point if it's one of the first nodes)
{
let layer = &mut self.layers[level];
layer.add_node(node_id)?;
}
// For base layer, no need to connect to existing entry points if this is the first node
if level == 0 && self.layers[level].node_count() == 1 {
return Ok(());
}
// PROPER HNSW INSERTION: Search for nearest neighbors, then connect to them
// 1. Load vectors for distance computation
let vectors = self.load_vectors_as_array()?;
// 2. Get the vector data for the new node
let new_vector = &vectors[node_id as usize];
// 3. Find entry points to start the search
let global_entry_points = self.get_layer_entry_points(level);
let entry_points: Vec<u64> = if let Some(manager) = &self.multi_layer_manager {
// Multi-layer mode: use LayerMappings for ID translation
global_entry_points
.into_iter()
.filter_map(|global_id| manager.get_local_id(global_id, level))
.collect()
} else {
// Single-layer mode: direct 1-based to 0-based conversion
global_entry_points
.into_iter()
.map(|global_id| global_id - 1)
.collect()
};
if entry_points.is_empty() {
// No entry points yet, this must be the first node
return Ok(());
}
// 4. Search for the nearest neighbors to the new vector
let ef = self.config.ef_construction;
let search_result = self.search_engine.search_layer(
&self.layers[level],
new_vector,
&vectors,
&entry_points,
ef,
)?;
// 5. Select top M neighbors (limited by max_connections)
let candidates = search_result.neighbors();
let distances = search_result.distances();
let m = self.layers[level].max_connections();
// Build distance map for the new node's neighbors
let mut neighbor_distances = std::collections::HashMap::new();
for (i, &neighbor_id) in candidates.iter().enumerate() {
if i < m {
neighbor_distances.insert(neighbor_id, distances[i]);
}
}
// 6. Add connections from new node to its nearest neighbors
// The new node gets connections to its M nearest neighbors
let layer = &mut self.layers[level];
for &neighbor_id in neighbor_distances.keys() {
if neighbor_id != node_id {
layer.add_one_way_connection(node_id, neighbor_id)?;
}
}
// Prune the new node's connections by distance (keeps closest)
layer.prune_connections_by_distance(node_id, &neighbor_distances);
// 7. Add reverse connections from neighbors to the new node
// We use a more lenient pruning strategy for reverse connections to ensure
// the graph remains well-connected. Only prune if we exceed 2*M connections.
let max_reverse_conns = (m * 2).max(32); // More lenient limit
for (&neighbor_id, _dist_to_neighbor) in neighbor_distances.iter() {
if neighbor_id != node_id {
// Add reverse connection (existing node -> new node)
layer.add_one_way_connection(neighbor_id, node_id)?;
// Only prune if significantly over limit
if let Ok(existing_conns) = layer.get_connections(neighbor_id) {
if existing_conns.len() > max_reverse_conns {
// Build distance map with small distance for new node to keep it
let mut reverse_distances = std::collections::HashMap::new();
reverse_distances.insert(node_id, 0.0); // Keep new connection
for &existing_id in existing_conns {
if existing_id != node_id {
reverse_distances.insert(existing_id, 1.0); // Existing connections
}
}
layer.prune_connections_by_distance(neighbor_id, &reverse_distances);
}
}
}
}
Ok(())
}
/// Get entry points for a specific layer
///
/// Returns global vector IDs (1-based) for vectors that are entry points
/// in the specified layer. In multi-layer mode, uses the manager to
/// translate local node IDs to global vector IDs.
pub(crate) fn get_layer_entry_points(&self, level: usize) -> Vec<u64> {
if self.layers.is_empty() {
return Vec::new();
}
if level == self.layers.len() - 1 {
// Top layer: return all entry points (already 1-based vector IDs)
self.entry_points.clone()
} else if level == 0 {
// Base layer: use its own entry points
let layer_entry_points = self.layers[level].get_entry_points();
if let Some(manager) = &self.multi_layer_manager {
// Multi-layer mode: convert local node IDs to global vector IDs
layer_entry_points
.iter()
.filter_map(|&local_id| manager.get_global_id(level, local_id))
.collect()
} else {
// Single-layer mode: direct 0-based to 1-based conversion
layer_entry_points
.iter()
.map(|&node_id| node_id + 1)
.collect()
}
} else {
// Intermediate layers: use entry points from the layer above
if level + 1 < self.layers.len() {
let layer_entry_points = self.layers[level + 1].get_entry_points();
if let Some(manager) = &self.multi_layer_manager {
// Multi-layer mode: convert local node IDs to global vector IDs
layer_entry_points
.iter()
.filter_map(|&local_id| manager.get_global_id(level + 1, local_id))
.collect()
} else {
// Single-layer mode: direct 0-based to 1-based conversion
layer_entry_points
.iter()
.map(|&node_id| node_id + 1)
.collect()
}
} else {
Vec::new()
}
}
}
/// Get local ID for a global vector ID in a specific layer
///
/// In multi-layer mode, uses LayerMappings for ID translation.
/// In single-layer mode, uses direct 1-based to 0-based conversion.
///
/// # Arguments
///
/// * `vector_id` - Global vector ID (1-based)
/// * `layer_id` - Layer ID (0-based)
///
/// # Returns
///
/// Local node ID for the layer
pub(crate) fn get_local_id_for_layer(&self, vector_id: u64, layer_id: usize) -> Result<u64, crate::hnsw::errors::HnswError> {
use crate::hnsw::errors::{HnswError, HnswIndexError};
if let Some(manager) = &self.multi_layer_manager {
manager.get_local_id(vector_id, layer_id)
.ok_or(HnswError::Index(HnswIndexError::NodeNotFound(vector_id)))
} else {
// Single-layer mode: direct conversion
Ok(vector_id - 1)
}
}
/// Get global ID for a local node ID in a specific layer
///
/// In multi-layer mode, uses LayerMappings for ID translation.
/// In single-layer mode, uses direct 0-based to 1-based conversion.
///
/// # Arguments
///
/// * `layer_id` - Layer ID (0-based)
/// * `local_id` - Local node ID (0-based)
///
/// # Returns
///
/// Global vector ID (1-based)
pub(crate) fn get_global_id_for_layer(&self, layer_id: usize, local_id: u64) -> Result<u64, crate::hnsw::errors::HnswError> {
use crate::hnsw::errors::{HnswError, HnswIndexError};
if let Some(manager) = &self.multi_layer_manager {
manager.get_global_id(layer_id, local_id)
.ok_or(HnswError::Index(HnswIndexError::InvalidNodeId(local_id)))
} else {
// Single-layer mode: direct conversion
Ok(local_id + 1)
}
}
/// Load all vectors as a 0-indexed array for layer operations
///
/// Creates a vectors array where vectors[node_id] = vector_data,
/// allowing efficient lookup by layer-local node IDs.
///
/// # Returns
///
/// Vectors array indexed by 0-based node IDs
pub(crate) fn load_vectors_as_array(&self) -> Result<Vec<Vec<f32>>, crate::hnsw::errors::HnswError> {
let vector_ids = self.storage.list_vectors()?;
let max_vector_id = vector_ids.iter().copied().max().unwrap_or(0);
let mut vectors_array = vec![vec![]; max_vector_id as usize + 1];
for vector_id in vector_ids {
if let Ok(Some(vector)) = self.storage.get_vector(vector_id) {
let node_id = (vector_id - 1) as usize;
if node_id < vectors_array.len() {
vectors_array[node_id] = vector;
}
}
}
Ok(vectors_array)
}
}