sqlitegraph/hnsw/config.rs
1//! HNSW Algorithm Configuration
2//!
3//! This module defines configuration parameters for Hierarchical Navigable Small World (HNSW)
4//! vector index construction and search operations. These parameters directly control
5//! index performance, memory usage, and search quality.
6//!
7//! # Configuration Parameters
8//!
9//! - **Dimension**: Vector dimension count (must match all vectors)
10//! - **M**: Number of bi-directional connections per node (typically 5-48)
11//! - **ef_construction**: Dynamic candidate list size during construction (typically 100-800)
12//! - **ef_search**: Dynamic candidate list size during search (typically 10-200)
13//! - **ml**: Maximum number of layers in the index
14//! - **distance_metric**: Similarity calculation method
15//!
16//! # Performance Impact
17//!
18//! ## M Parameter
19//! - Higher M: Better recall, more memory, slower construction
20//! - Lower M: Faster construction, less memory, potentially lower recall
21//! - Recommended: 16 for most use cases, 32+ for high accuracy requirements
22//!
23//! ## ef_construction Parameter
24//! - Higher ef_construction: Better index quality, slower construction
25//! - Lower ef_construction: Faster construction, potentially lower search quality
26//! - Recommended: 200 for balanced performance
27//!
28//! # Examples
29//!
30//! ```rust
31//! use sqlitegraph::hnsw::{HnswConfig, DistanceMetric};
32//!
33//! let config = HnswConfig::builder()
34//! .dimension(768)
35//! .m_connections(16)
36//! .ef_construction(200)
37//! .ef_search(50)
38//! .distance_metric(DistanceMetric::Cosine)
39//! .build()
40//! .unwrap();
41//! # Ok::<(), Box<dyn std::error::Error>>(())
42//! ```
43
44use crate::hnsw::distance_metric::DistanceMetric;
45
46/// HNSW algorithm configuration parameters
47///
48/// This struct defines all parameters that control HNSW index behavior.
49/// These parameters significantly impact search quality, construction time,
50/// and memory usage patterns.
51///
52/// # Field Descriptions
53///
54/// ## dimension
55/// Vector dimension count. Must match all vectors inserted into the index.
56/// Typical values: 128-4096 depending on embedding model used.
57///
58/// ## m
59/// Number of bi-directional links created for each node during construction.
60/// This is the primary parameter controlling index connectivity.
61///
62/// - Lower values (5-12): Faster construction, less memory, lower recall
63/// - Medium values (16-24): Balanced performance (recommended)
64/// - Higher values (32-48): Better recall, more memory, slower construction
65///
66/// ## ef_construction
67/// Size of dynamic candidate list during index construction.
68/// Controls how thoroughly the algorithm explores the graph during insertion.
69///
70/// - Lower values (100-200): Faster construction
71/// - Higher values (400-800): Better index quality, slower construction
72///
73/// ## ef_search
74/// Size of dynamic candidate list during search operations.
75/// Controls search accuracy vs speed trade-off.
76///
77/// - Lower values (10-50): Faster search, potentially lower accuracy
78/// - Higher values (100-200): Better recall, slower search
79///
80/// ## ml
81/// Maximum number of layers in the HNSW structure.
82/// Calculated as floor(-ln(N) * ml_scale) where N is data size.
83/// Higher values create deeper graphs for better performance on large datasets.
84///
85/// ## distance_metric
86/// Distance function used for vector similarity calculation.
87/// Choose based on your vector data characteristics and use case requirements.
88///
89/// ## enable_multilayer
90/// Controls whether multi-layer HNSW functionality is enabled.
91/// When false (default), all vectors are inserted into the base layer only,
92/// providing backward compatibility and avoiding node ID conflicts.
93/// When true, proper multi-layer HNSW with exponential distribution is used.
94///
95/// ## multilayer_level_distribution_base
96/// Base value for exponential level distribution in multi-layer mode.
97/// Higher values create flatter layer distributions (more vectors in higher layers).
98/// Default value equals m for optimal performance.
99///
100/// ## multilayer_deterministic_seed
101/// Seed for deterministic random number generation in multi-layer operations.
102/// When Some(seed), reproducible level assignments are ensured.
103/// When None, non-deterministic behavior is used (default for production).
104///
105/// # Default Configuration
106///
107/// The default configuration provides good performance for most use cases:
108/// - Balanced search quality vs speed
109/// - Reasonable memory usage (~2.5x vector size)
110/// - Fast construction time
111/// - Robust to various data distributions
112/// - Single-layer mode for backward compatibility
113///
114/// # Multi-layer vs Single-layer Mode
115///
116/// ## Single-layer mode (enable_multilayer = false)
117/// - All vectors inserted into base layer (L0)
118/// - No node ID conflicts
119/// - Faster insertion, simpler search
120/// - Recommended for small datasets (<10k vectors) or when compatibility is critical
121///
122/// ## Multi-layer mode (enable_multilayer = true)
123/// - Exponential level distribution for optimal search performance
124/// - 3-10x faster search for large datasets (>10k vectors)
125/// - More complex insertion algorithm with bidirectional ID mapping
126/// - Recommended for large datasets where search performance is critical
127///
128/// # Examples
129///
130/// ```rust
131/// use sqlitegraph::hnsw::{HnswConfig, DistanceMetric};
132///
133/// // High-precision configuration
134/// let precise_config = HnswConfig {
135/// dimension: 768,
136/// m: 32,
137/// ef_construction: 400,
138/// ef_search: 100,
139/// ml: 24,
140/// distance_metric: DistanceMetric::Cosine,
141/// enable_multilayer: false,
142/// multilayer_level_distribution_base: None,
143/// multilayer_deterministic_seed: None,
144/// };
145///
146/// // Multi-layer configuration for large datasets
147/// let multilayer_config = HnswConfig {
148/// dimension: 768,
149/// m: 16,
150/// ef_construction: 200,
151/// ef_search: 50,
152/// ml: 16,
153/// distance_metric: DistanceMetric::Cosine,
154/// enable_multilayer: true,
155/// multilayer_level_distribution_base: Some(16),
156/// multilayer_deterministic_seed: Some(42),
157/// };
158/// ```
159#[derive(Debug, Clone, PartialEq)]
160pub struct HnswConfig {
161 /// Vector dimension count
162 /// Must match all vectors inserted into the index
163 /// Range: 1-4096 (practical limits)
164 pub dimension: usize,
165
166 /// Number of connections per node (M parameter)
167 /// Controls graph connectivity and memory usage
168 /// Range: 5-48 (typical), higher values require more memory
169 pub m: usize,
170
171 /// Construction ef parameter
172 /// Dynamic candidate list size during index building
173 /// Range: 100-800 (typical)
174 pub ef_construction: usize,
175
176 /// Search ef parameter
177 /// Dynamic candidate list size during search
178 /// Range: 10-200 (typical)
179 pub ef_search: usize,
180
181 /// Maximum number of layers
182 /// Controls maximum graph depth
183 /// Range: 8-32 (typical)
184 pub ml: u8,
185
186 /// Distance metric for similarity calculation
187 pub distance_metric: DistanceMetric,
188
189 /// Enable multi-layer HNSW functionality
190 /// When false, uses single-layer mode for backward compatibility
191 /// When true, enables proper multi-layer HNSW with exponential distribution
192 pub enable_multilayer: bool,
193
194 /// Base value for exponential level distribution in multi-layer mode
195 /// When None, uses m value as default
196 /// Higher values create flatter distributions (more vectors in higher layers)
197 pub multilayer_level_distribution_base: Option<usize>,
198
199 /// Seed for deterministic random number generation in multi-layer operations
200 /// When Some(seed), ensures reproducible level assignments
201 /// When None, uses non-deterministic behavior (default for production)
202 pub multilayer_deterministic_seed: Option<u64>,
203}
204
205impl HnswConfig {
206 /// Create a new HnswConfig with the specified parameters
207 ///
208 /// # Arguments
209 ///
210 /// * `dimension` - Vector dimension count
211 /// * `m` - Number of connections per node (M parameter)
212 /// * `ef_construction` - Dynamic candidate list size during construction
213 /// * `distance_metric` - Distance metric for similarity calculation
214 ///
215 /// # Returns
216 ///
217 /// A new HnswConfig instance with sensible defaults for other parameters
218 ///
219 /// # Examples
220 ///
221 /// ```rust
222 /// use sqlitegraph::hnsw::{HnswConfig, DistanceMetric};
223 ///
224 /// let config = HnswConfig::new(128, 16, 200, DistanceMetric::Cosine);
225 /// ```
226 pub fn new(
227 dimension: usize,
228 m: usize,
229 ef_construction: usize,
230 distance_metric: DistanceMetric,
231 ) -> Self {
232 HnswConfig {
233 dimension,
234 m,
235 ef_construction,
236 ef_search: ef_construction, // Default ef_search to ef_construction
237 ml: 16, // Reasonable default
238 distance_metric,
239 enable_multilayer: false, // Single-layer mode by default
240 multilayer_level_distribution_base: None,
241 multilayer_deterministic_seed: None,
242 }
243 }
244}
245
246impl Default for HnswConfig {
247 fn default() -> Self {
248 HnswConfig {
249 dimension: 768, // Common embedding size
250 m: 16, // Balanced connectivity
251 ef_construction: 200, // Good construction quality
252 ef_search: 50, // Balanced search speed/quality
253 ml: 16, // Reasonable depth
254 distance_metric: DistanceMetric::Cosine,
255 enable_multilayer: false, // Single-layer mode for backward compatibility
256 multilayer_level_distribution_base: None, // Use m as default
257 multilayer_deterministic_seed: None, // Non-deterministic for production
258 }
259 }
260}
261
262/// Create a new HnswConfig builder with default values
263///
264/// This is a convenience function that creates a builder for constructing
265/// HnswConfig instances with validation and sensible defaults.
266///
267/// # Examples
268///
269/// ```rust
270/// use sqlitegraph::hnsw::{hnsw_config, DistanceMetric};
271///
272/// let config = hnsw_config()
273/// .dimension(256)
274/// .m_connections(12)
275/// .distance_metric(DistanceMetric::Euclidean)
276/// .build()
277/// .unwrap();
278/// # Ok::<(), Box<dyn std::error::Error>>(())
279/// ```
280pub fn hnsw_config() -> crate::hnsw::builder::HnswConfigBuilder {
281 crate::hnsw::builder::HnswConfigBuilder::new()
282}
283
284#[cfg(test)]
285mod tests {
286 use super::*;
287
288 #[test]
289 fn test_default_config() {
290 let config = HnswConfig::default();
291
292 assert_eq!(config.dimension, 768);
293 assert_eq!(config.m, 16);
294 assert_eq!(config.ef_construction, 200);
295 assert_eq!(config.ef_search, 50);
296 assert_eq!(config.ml, 16);
297 assert_eq!(config.distance_metric, DistanceMetric::Cosine);
298 }
299
300 #[test]
301 fn test_config_clone() {
302 let config1 = HnswConfig {
303 dimension: 256,
304 m: 12,
305 ef_construction: 150,
306 ef_search: 40,
307 ml: 12,
308 distance_metric: DistanceMetric::Manhattan,
309 enable_multilayer: true,
310 multilayer_level_distribution_base: Some(12),
311 multilayer_deterministic_seed: Some(123),
312 };
313
314 let config2 = config1.clone();
315 assert_eq!(config1, config2);
316 }
317
318 #[test]
319 fn test_high_precision_config() {
320 let config = HnswConfig {
321 dimension: 1536,
322 m: 32,
323 ef_construction: 400,
324 ef_search: 100,
325 ml: 24,
326 distance_metric: DistanceMetric::Cosine,
327 enable_multilayer: false,
328 multilayer_level_distribution_base: None,
329 multilayer_deterministic_seed: None,
330 };
331
332 assert_eq!(config.dimension, 1536);
333 assert_eq!(config.m, 32);
334 assert!(config.ef_construction >= config.m);
335 assert!(!config.enable_multilayer);
336 }
337
338 #[test]
339 fn test_fast_construction_config() {
340 let config = HnswConfig {
341 dimension: 384,
342 m: 8,
343 ef_construction: 100,
344 ef_search: 20,
345 ml: 12,
346 distance_metric: DistanceMetric::Euclidean,
347 enable_multilayer: false,
348 multilayer_level_distribution_base: None,
349 multilayer_deterministic_seed: None,
350 };
351
352 assert_eq!(config.dimension, 384);
353 assert_eq!(config.m, 8);
354 assert_eq!(config.ef_construction, 100);
355 assert_eq!(config.ef_search, 20);
356 assert!(!config.enable_multilayer);
357 }
358
359 #[test]
360 fn test_hnsw_config_function() {
361 let config = hnsw_config()
362 .dimension(512)
363 .m_connections(24)
364 .distance_metric(DistanceMetric::Euclidean)
365 .build()
366 .unwrap();
367
368 assert_eq!(config.dimension, 512);
369 assert_eq!(config.m, 24);
370 assert_eq!(config.distance_metric, DistanceMetric::Euclidean);
371 assert!(!config.enable_multilayer); // Default is single-layer
372 }
373
374 #[test]
375 fn test_multilayer_config_defaults() {
376 let config = HnswConfig::default();
377
378 // Default should be single-layer mode for backward compatibility
379 assert!(!config.enable_multilayer);
380 assert_eq!(config.multilayer_level_distribution_base, None);
381 assert_eq!(config.multilayer_deterministic_seed, None);
382 }
383
384 #[test]
385 fn test_multilayer_config_enabled() {
386 let config = HnswConfig {
387 dimension: 256,
388 m: 16,
389 ef_construction: 200,
390 ef_search: 50,
391 ml: 16,
392 distance_metric: DistanceMetric::Cosine,
393 enable_multilayer: true,
394 multilayer_level_distribution_base: Some(16),
395 multilayer_deterministic_seed: Some(42),
396 };
397
398 assert!(config.enable_multilayer);
399 assert_eq!(config.multilayer_level_distribution_base, Some(16));
400 assert_eq!(config.multilayer_deterministic_seed, Some(42));
401 }
402
403 #[test]
404 fn test_multilayer_config_defaults_derivation() {
405 let config = HnswConfig {
406 dimension: 512,
407 m: 20,
408 ef_construction: 200,
409 ef_search: 50,
410 ml: 16,
411 distance_metric: DistanceMetric::Euclidean,
412 enable_multilayer: true,
413 multilayer_level_distribution_base: None, // Should use m
414 multilayer_deterministic_seed: None, // Should be non-deterministic
415 };
416
417 assert!(config.enable_multilayer);
418 assert_eq!(config.multilayer_level_distribution_base, None);
419 assert_eq!(config.multilayer_deterministic_seed, None);
420 }
421
422 #[test]
423 fn test_multilayer_config_validation() {
424 // Test that multi-layer config doesn't interfere with basic validation
425 let config = HnswConfig {
426 dimension: 768,
427 m: 16,
428 ef_construction: 200,
429 ef_search: 50,
430 ml: 16,
431 distance_metric: DistanceMetric::Cosine,
432 enable_multilayer: true,
433 multilayer_level_distribution_base: Some(32), // Higher than m
434 multilayer_deterministic_seed: Some(12345),
435 };
436
437 assert_eq!(config.dimension, 768);
438 assert_eq!(config.m, 16);
439 assert_eq!(config.multilayer_level_distribution_base, Some(32));
440 assert_eq!(config.multilayer_deterministic_seed, Some(12345));
441 }
442
443 #[test]
444 fn test_single_layer_vs_multi_layer_config() {
445 let single_layer = HnswConfig {
446 dimension: 512,
447 m: 16,
448 ef_construction: 200,
449 ef_search: 50,
450 ml: 16,
451 distance_metric: DistanceMetric::Cosine,
452 enable_multilayer: false,
453 multilayer_level_distribution_base: None,
454 multilayer_deterministic_seed: None,
455 };
456
457 let multi_layer = HnswConfig {
458 dimension: 512,
459 m: 16,
460 ef_construction: 200,
461 ef_search: 50,
462 ml: 16,
463 distance_metric: DistanceMetric::Cosine,
464 enable_multilayer: true,
465 multilayer_level_distribution_base: Some(16),
466 multilayer_deterministic_seed: Some(42),
467 };
468
469 // Should be different due to multi-layer settings
470 assert_ne!(single_layer, multi_layer);
471 assert!(!single_layer.enable_multilayer);
472 assert!(multi_layer.enable_multilayer);
473 }
474}