Skip to main content

sqlitegraph/hnsw/
config.rs

1//! HNSW Algorithm Configuration
2//!
3//! This module defines configuration parameters for Hierarchical Navigable Small World (HNSW)
4//! vector index construction and search operations. These parameters directly control
5//! index performance, memory usage, and search quality.
6//!
7//! # Configuration Parameters
8//!
9//! - **Dimension**: Vector dimension count (must match all vectors)
10//! - **M**: Number of bi-directional connections per node (typically 5-48)
11//! - **ef_construction**: Dynamic candidate list size during construction (typically 100-800)
12//! - **ef_search**: Dynamic candidate list size during search (typically 10-200)
13//! - **ml**: Maximum number of layers in the index
14//! - **distance_metric**: Similarity calculation method
15//!
16//! # Performance Impact
17//!
18//! ## M Parameter
19//! - Higher M: Better recall, more memory, slower construction
20//! - Lower M: Faster construction, less memory, potentially lower recall
21//! - Recommended: 16 for most use cases, 32+ for high accuracy requirements
22//!
23//! ## ef_construction Parameter
24//! - Higher ef_construction: Better index quality, slower construction
25//! - Lower ef_construction: Faster construction, potentially lower search quality
26//! - Recommended: 200 for balanced performance
27//!
28//! # Examples
29//!
30//! ```rust
31//! use sqlitegraph::hnsw::{HnswConfig, DistanceMetric};
32//!
33//! let config = HnswConfig::builder()
34//!     .dimension(768)
35//!     .m_connections(16)
36//!     .ef_construction(200)
37//!     .ef_search(50)
38//!     .distance_metric(DistanceMetric::Cosine)
39//!     .build()
40//!     .unwrap();
41//! # Ok::<(), Box<dyn std::error::Error>>(())
42//! ```
43
44use crate::hnsw::distance_metric::DistanceMetric;
45
46/// HNSW algorithm configuration parameters
47///
48/// This struct defines all parameters that control HNSW index behavior.
49/// These parameters significantly impact search quality, construction time,
50/// and memory usage patterns.
51///
52/// # Field Descriptions
53///
54/// ## dimension
55/// Vector dimension count. Must match all vectors inserted into the index.
56/// Typical values: 128-4096 depending on embedding model used.
57///
58/// ## m
59/// Number of bi-directional links created for each node during construction.
60/// This is the primary parameter controlling index connectivity.
61///
62/// - Lower values (5-12): Faster construction, less memory, lower recall
63/// - Medium values (16-24): Balanced performance (recommended)
64/// - Higher values (32-48): Better recall, more memory, slower construction
65///
66/// ## ef_construction
67/// Size of dynamic candidate list during index construction.
68/// Controls how thoroughly the algorithm explores the graph during insertion.
69///
70/// - Lower values (100-200): Faster construction
71/// - Higher values (400-800): Better index quality, slower construction
72///
73/// ## ef_search
74/// Size of dynamic candidate list during search operations.
75/// Controls search accuracy vs speed trade-off.
76///
77/// - Lower values (10-50): Faster search, potentially lower accuracy
78/// - Higher values (100-200): Better recall, slower search
79///
80/// ## ml
81/// Maximum number of layers in the HNSW structure.
82/// Calculated as floor(-ln(N) * ml_scale) where N is data size.
83/// Higher values create deeper graphs for better performance on large datasets.
84///
85/// ## distance_metric
86/// Distance function used for vector similarity calculation.
87/// Choose based on your vector data characteristics and use case requirements.
88///
89/// ## enable_multilayer
90/// Controls whether multi-layer HNSW functionality is enabled.
91/// When false (default), all vectors are inserted into the base layer only,
92/// providing backward compatibility and avoiding node ID conflicts.
93/// When true, proper multi-layer HNSW with exponential distribution is used.
94///
95/// ## multilayer_level_distribution_base
96/// Base value for exponential level distribution in multi-layer mode.
97/// Higher values create flatter layer distributions (more vectors in higher layers).
98/// Default value equals m for optimal performance.
99///
100/// ## multilayer_deterministic_seed
101/// Seed for deterministic random number generation in multi-layer operations.
102/// When Some(seed), reproducible level assignments are ensured.
103/// When None, non-deterministic behavior is used (default for production).
104///
105/// # Default Configuration
106///
107/// The default configuration provides good performance for most use cases:
108/// - Balanced search quality vs speed
109/// - Reasonable memory usage (~2.5x vector size)
110/// - Fast construction time
111/// - Robust to various data distributions
112/// - Single-layer mode for backward compatibility
113///
114/// # Multi-layer vs Single-layer Mode
115///
116/// ## Single-layer mode (enable_multilayer = false)
117/// - All vectors inserted into base layer (L0)
118/// - No node ID conflicts
119/// - Faster insertion, simpler search
120/// - Recommended for small datasets (<10k vectors) or when compatibility is critical
121///
122/// ## Multi-layer mode (enable_multilayer = true)
123/// - Exponential level distribution for optimal search performance
124/// - 3-10x faster search for large datasets (>10k vectors)
125/// - More complex insertion algorithm with bidirectional ID mapping
126/// - Recommended for large datasets where search performance is critical
127///
128/// # Examples
129///
130/// ```rust
131/// use sqlitegraph::hnsw::{HnswConfig, DistanceMetric};
132///
133/// // High-precision configuration
134/// let precise_config = HnswConfig {
135///     dimension: 768,
136///     m: 32,
137///     ef_construction: 400,
138///     ef_search: 100,
139///     ml: 24,
140///     distance_metric: DistanceMetric::Cosine,
141///     enable_multilayer: false,
142///     multilayer_level_distribution_base: None,
143///     multilayer_deterministic_seed: None,
144/// };
145///
146/// // Multi-layer configuration for large datasets
147/// let multilayer_config = HnswConfig {
148///     dimension: 768,
149///     m: 16,
150///     ef_construction: 200,
151///     ef_search: 50,
152///     ml: 16,
153///     distance_metric: DistanceMetric::Cosine,
154///     enable_multilayer: true,
155///     multilayer_level_distribution_base: Some(16),
156///     multilayer_deterministic_seed: Some(42),
157/// };
158/// ```
159#[derive(Debug, Clone, PartialEq)]
160pub struct HnswConfig {
161    /// Vector dimension count
162    /// Must match all vectors inserted into the index
163    /// Range: 1-4096 (practical limits)
164    pub dimension: usize,
165
166    /// Number of connections per node (M parameter)
167    /// Controls graph connectivity and memory usage
168    /// Range: 5-48 (typical), higher values require more memory
169    pub m: usize,
170
171    /// Construction ef parameter
172    /// Dynamic candidate list size during index building
173    /// Range: 100-800 (typical)
174    pub ef_construction: usize,
175
176    /// Search ef parameter
177    /// Dynamic candidate list size during search
178    /// Range: 10-200 (typical)
179    pub ef_search: usize,
180
181    /// Maximum number of layers
182    /// Controls maximum graph depth
183    /// Range: 8-32 (typical)
184    pub ml: u8,
185
186    /// Distance metric for similarity calculation
187    pub distance_metric: DistanceMetric,
188
189    /// Enable multi-layer HNSW functionality
190    /// When false, uses single-layer mode for backward compatibility
191    /// When true, enables proper multi-layer HNSW with exponential distribution
192    pub enable_multilayer: bool,
193
194    /// Base value for exponential level distribution in multi-layer mode
195    /// When None, uses m value as default
196    /// Higher values create flatter distributions (more vectors in higher layers)
197    pub multilayer_level_distribution_base: Option<usize>,
198
199    /// Seed for deterministic random number generation in multi-layer operations
200    /// When Some(seed), ensures reproducible level assignments
201    /// When None, uses non-deterministic behavior (default for production)
202    pub multilayer_deterministic_seed: Option<u64>,
203}
204
205impl HnswConfig {
206    /// Create a new HnswConfig with the specified parameters
207    ///
208    /// # Arguments
209    ///
210    /// * `dimension` - Vector dimension count
211    /// * `m` - Number of connections per node (M parameter)
212    /// * `ef_construction` - Dynamic candidate list size during construction
213    /// * `distance_metric` - Distance metric for similarity calculation
214    ///
215    /// # Returns
216    ///
217    /// A new HnswConfig instance with sensible defaults for other parameters
218    ///
219    /// # Examples
220    ///
221    /// ```rust
222    /// use sqlitegraph::hnsw::{HnswConfig, DistanceMetric};
223    ///
224    /// let config = HnswConfig::new(128, 16, 200, DistanceMetric::Cosine);
225    /// ```
226    pub fn new(
227        dimension: usize,
228        m: usize,
229        ef_construction: usize,
230        distance_metric: DistanceMetric,
231    ) -> Self {
232        HnswConfig {
233            dimension,
234            m,
235            ef_construction,
236            ef_search: ef_construction, // Default ef_search to ef_construction
237            ml: 16,                     // Reasonable default
238            distance_metric,
239            enable_multilayer: false, // Single-layer mode by default
240            multilayer_level_distribution_base: None,
241            multilayer_deterministic_seed: None,
242        }
243    }
244}
245
246impl Default for HnswConfig {
247    fn default() -> Self {
248        HnswConfig {
249            dimension: 768,       // Common embedding size
250            m: 16,                // Balanced connectivity
251            ef_construction: 200, // Good construction quality
252            ef_search: 50,        // Balanced search speed/quality
253            ml: 16,               // Reasonable depth
254            distance_metric: DistanceMetric::Cosine,
255            enable_multilayer: false, // Single-layer mode for backward compatibility
256            multilayer_level_distribution_base: None, // Use m as default
257            multilayer_deterministic_seed: None, // Non-deterministic for production
258        }
259    }
260}
261
262/// Create a new HnswConfig builder with default values
263///
264/// This is a convenience function that creates a builder for constructing
265/// HnswConfig instances with validation and sensible defaults.
266///
267/// # Examples
268///
269/// ```rust
270/// use sqlitegraph::hnsw::{hnsw_config, DistanceMetric};
271///
272/// let config = hnsw_config()
273///     .dimension(256)
274///     .m_connections(12)
275///     .distance_metric(DistanceMetric::Euclidean)
276///     .build()
277///     .unwrap();
278/// # Ok::<(), Box<dyn std::error::Error>>(())
279/// ```
280pub fn hnsw_config() -> crate::hnsw::builder::HnswConfigBuilder {
281    crate::hnsw::builder::HnswConfigBuilder::new()
282}
283
284#[cfg(test)]
285mod tests {
286    use super::*;
287
288    #[test]
289    fn test_default_config() {
290        let config = HnswConfig::default();
291
292        assert_eq!(config.dimension, 768);
293        assert_eq!(config.m, 16);
294        assert_eq!(config.ef_construction, 200);
295        assert_eq!(config.ef_search, 50);
296        assert_eq!(config.ml, 16);
297        assert_eq!(config.distance_metric, DistanceMetric::Cosine);
298    }
299
300    #[test]
301    fn test_config_clone() {
302        let config1 = HnswConfig {
303            dimension: 256,
304            m: 12,
305            ef_construction: 150,
306            ef_search: 40,
307            ml: 12,
308            distance_metric: DistanceMetric::Manhattan,
309            enable_multilayer: true,
310            multilayer_level_distribution_base: Some(12),
311            multilayer_deterministic_seed: Some(123),
312        };
313
314        let config2 = config1.clone();
315        assert_eq!(config1, config2);
316    }
317
318    #[test]
319    fn test_high_precision_config() {
320        let config = HnswConfig {
321            dimension: 1536,
322            m: 32,
323            ef_construction: 400,
324            ef_search: 100,
325            ml: 24,
326            distance_metric: DistanceMetric::Cosine,
327            enable_multilayer: false,
328            multilayer_level_distribution_base: None,
329            multilayer_deterministic_seed: None,
330        };
331
332        assert_eq!(config.dimension, 1536);
333        assert_eq!(config.m, 32);
334        assert!(config.ef_construction >= config.m);
335        assert!(!config.enable_multilayer);
336    }
337
338    #[test]
339    fn test_fast_construction_config() {
340        let config = HnswConfig {
341            dimension: 384,
342            m: 8,
343            ef_construction: 100,
344            ef_search: 20,
345            ml: 12,
346            distance_metric: DistanceMetric::Euclidean,
347            enable_multilayer: false,
348            multilayer_level_distribution_base: None,
349            multilayer_deterministic_seed: None,
350        };
351
352        assert_eq!(config.dimension, 384);
353        assert_eq!(config.m, 8);
354        assert_eq!(config.ef_construction, 100);
355        assert_eq!(config.ef_search, 20);
356        assert!(!config.enable_multilayer);
357    }
358
359    #[test]
360    fn test_hnsw_config_function() {
361        let config = hnsw_config()
362            .dimension(512)
363            .m_connections(24)
364            .distance_metric(DistanceMetric::Euclidean)
365            .build()
366            .unwrap();
367
368        assert_eq!(config.dimension, 512);
369        assert_eq!(config.m, 24);
370        assert_eq!(config.distance_metric, DistanceMetric::Euclidean);
371        assert!(!config.enable_multilayer); // Default is single-layer
372    }
373
374    #[test]
375    fn test_multilayer_config_defaults() {
376        let config = HnswConfig::default();
377
378        // Default should be single-layer mode for backward compatibility
379        assert!(!config.enable_multilayer);
380        assert_eq!(config.multilayer_level_distribution_base, None);
381        assert_eq!(config.multilayer_deterministic_seed, None);
382    }
383
384    #[test]
385    fn test_multilayer_config_enabled() {
386        let config = HnswConfig {
387            dimension: 256,
388            m: 16,
389            ef_construction: 200,
390            ef_search: 50,
391            ml: 16,
392            distance_metric: DistanceMetric::Cosine,
393            enable_multilayer: true,
394            multilayer_level_distribution_base: Some(16),
395            multilayer_deterministic_seed: Some(42),
396        };
397
398        assert!(config.enable_multilayer);
399        assert_eq!(config.multilayer_level_distribution_base, Some(16));
400        assert_eq!(config.multilayer_deterministic_seed, Some(42));
401    }
402
403    #[test]
404    fn test_multilayer_config_defaults_derivation() {
405        let config = HnswConfig {
406            dimension: 512,
407            m: 20,
408            ef_construction: 200,
409            ef_search: 50,
410            ml: 16,
411            distance_metric: DistanceMetric::Euclidean,
412            enable_multilayer: true,
413            multilayer_level_distribution_base: None, // Should use m
414            multilayer_deterministic_seed: None,      // Should be non-deterministic
415        };
416
417        assert!(config.enable_multilayer);
418        assert_eq!(config.multilayer_level_distribution_base, None);
419        assert_eq!(config.multilayer_deterministic_seed, None);
420    }
421
422    #[test]
423    fn test_multilayer_config_validation() {
424        // Test that multi-layer config doesn't interfere with basic validation
425        let config = HnswConfig {
426            dimension: 768,
427            m: 16,
428            ef_construction: 200,
429            ef_search: 50,
430            ml: 16,
431            distance_metric: DistanceMetric::Cosine,
432            enable_multilayer: true,
433            multilayer_level_distribution_base: Some(32), // Higher than m
434            multilayer_deterministic_seed: Some(12345),
435        };
436
437        assert_eq!(config.dimension, 768);
438        assert_eq!(config.m, 16);
439        assert_eq!(config.multilayer_level_distribution_base, Some(32));
440        assert_eq!(config.multilayer_deterministic_seed, Some(12345));
441    }
442
443    #[test]
444    fn test_single_layer_vs_multi_layer_config() {
445        let single_layer = HnswConfig {
446            dimension: 512,
447            m: 16,
448            ef_construction: 200,
449            ef_search: 50,
450            ml: 16,
451            distance_metric: DistanceMetric::Cosine,
452            enable_multilayer: false,
453            multilayer_level_distribution_base: None,
454            multilayer_deterministic_seed: None,
455        };
456
457        let multi_layer = HnswConfig {
458            dimension: 512,
459            m: 16,
460            ef_construction: 200,
461            ef_search: 50,
462            ml: 16,
463            distance_metric: DistanceMetric::Cosine,
464            enable_multilayer: true,
465            multilayer_level_distribution_base: Some(16),
466            multilayer_deterministic_seed: Some(42),
467        };
468
469        // Should be different due to multi-layer settings
470        assert_ne!(single_layer, multi_layer);
471        assert!(!single_layer.enable_multilayer);
472        assert!(multi_layer.enable_multilayer);
473    }
474}