// edgevec/hnsw/graph.rs
1//! HNSW graph implementation.
2//!
3//! # Lint Suppressions
4//!
5//! This module suppresses several numeric cast lints at the module level because:
6//!
7//! - **cast_possible_truncation**: The HNSW algorithm uses `u32` for node IDs and neighbor
8//! indices. While `usize` is used for array indexing, all values are validated to fit in
9//! `u32` at insertion time. Maximum supported index size is 4 billion vectors.
10//!
11//! - **cast_precision_loss**: Float calculations for level assignment use `f64` internally
12//! and convert to `usize` for layer counts. Precision loss is acceptable as layers are
13//! discrete integers (typically 0-16).
14//!
15//! - **cast_sign_loss**: Some internal calculations use signed intermediates that are
16//! guaranteed non-negative by algorithm invariants.
17
18#![allow(clippy::cast_possible_truncation)]
19#![allow(clippy::cast_precision_loss)]
20#![allow(clippy::cast_sign_loss)]
21#![allow(clippy::missing_errors_doc)]
22#![allow(clippy::missing_panics_doc)]
23
24use super::config::HnswConfig;
25use super::neighbor::NeighborPool;
26use crate::metadata::{MetadataError, MetadataStore, MetadataValue};
27use crate::quantization::variable::BinaryVector;
28use crate::storage::binary::BinaryVectorStorage;
29use crate::storage::VectorStorage;
30use bytemuck::{Pod, Zeroable};
31use rand::{Rng, SeedableRng};
32use rand_chacha::ChaCha8Rng;
33use serde::{Deserialize, Serialize};
34use std::borrow::Cow;
35use std::collections::HashMap;
36use std::collections::HashSet;
37use std::vec::Vec;
38use thiserror::Error;
39
40/// Unique identifier for a vector in the database.
41///
42/// # Size
43/// 8 bytes, aligned to 8
44///
45/// # Invariants
46/// - IDs are never reused (monotonically increasing)
47/// - ID 0 is reserved (invalid sentinel)
48///
49/// # Safety
50/// Derives `Pod` and `Zeroable` for safe byte casting via bytemuck.
51#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Pod, Zeroable)]
52#[repr(transparent)]
53pub struct VectorId(pub u64);
54
55impl VectorId {
56 /// Sentinel value indicating "no vector"
57 pub const INVALID: Self = VectorId(0);
58
59 /// First valid ID
60 pub const FIRST: Self = VectorId(1);
61}
62
63/// Internal node identifier within HNSW graph.
64///
65/// # Size
66/// 4 bytes, aligned to 4
67///
68/// # Invariants
69/// - `NodeId` corresponds 1:1 with `VectorId` (lower 32 bits)
70/// - `NodeId` 0xFFFFFFFF is reserved (invalid sentinel)
71#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
72#[repr(transparent)]
73pub struct NodeId(pub u32);
74
75impl NodeId {
76 /// Sentinel value indicating invalid node
77 pub const INVALID: Self = NodeId(u32::MAX);
78}
79
/// Represents a layer level in the HNSW graph.
///
/// Layer 0 is the base layer containing all nodes.
/// Higher layers contain a subset of nodes for faster navigation.
/// Stored as `u8`; random level assignment is capped at 16 (see
/// `HnswIndex::get_random_level`), so the value never approaches overflow.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
#[allow(dead_code)] // Not constructed in this part of the module; kept as a typed layer index.
pub struct Layer(pub u8);
88
/// Errors that can occur during graph operations.
///
/// Message-carrying variants store pre-formatted `String`s so the whole
/// enum stays `Clone + PartialEq + Eq`.
#[derive(Error, Debug, Clone, PartialEq, Eq)]
pub enum GraphError {
    /// The graph has reached its maximum node capacity (`u32::MAX`).
    #[error("node capacity exceeded")]
    CapacityExceeded,

    /// The provided `VectorId` is invalid (e.g., sentinel value).
    #[error("invalid vector id")]
    InvalidVectorId,

    /// Neighbor data is corrupted or offset is out of bounds.
    #[error("neighbor data corrupted")]
    NeighborError,

    /// Node ID is out of bounds.
    #[error("node id out of bounds")]
    NodeIdOutOfBounds,

    /// Configuration mismatch with storage.
    #[error("config dimension mismatch: expected {expected}, got {actual}")]
    ConfigMismatch {
        /// Expected dimensions.
        expected: u32,
        /// Actual dimensions in config.
        actual: u32,
    },

    /// Query vector has wrong dimensions.
    #[error("dimension mismatch: expected {expected}, got {actual}")]
    DimensionMismatch {
        /// Expected dimensions.
        expected: usize,
        /// Actual dimensions.
        actual: usize,
    },

    /// Storage operation failed.
    #[error("storage error: {0}")]
    Storage(String),

    /// Invalid configuration parameter.
    #[error("invalid config: {0}")]
    InvalidConfig(String),

    /// Metadata validation failed (v0.6.0 - RFC-002).
    ///
    /// Returned when metadata fails validation during `insert_with_metadata()`.
    /// The index remains unchanged when this error occurs.
    #[error("metadata validation failed: {0}")]
    MetadataValidation(#[from] MetadataError),

    /// Filter parsing failed (v0.6.0 - RFC-002).
    ///
    /// Returned when a filter expression string cannot be parsed.
    #[error("filter parse error: {0}")]
    FilterParse(String),

    /// Filter evaluation failed (v0.6.0 - RFC-002).
    ///
    /// Returned when a filter expression evaluation fails at runtime.
    #[error("filter evaluation error: {0}")]
    FilterEval(String),

    /// Binary quantization is not enabled (v0.7.0 - RFC-002 Phase 2).
    ///
    /// Returned when attempting BQ operations on an index without BQ storage.
    #[error("binary quantization is not enabled; use with_bq() or enable_bq() first")]
    BqNotEnabled,

    /// Binary quantization failed (v0.7.0 - RFC-002 Phase 2).
    ///
    /// Returned when vector quantization fails during BQ operations.
    #[error("quantization error: {0}")]
    Quantization(String),
}
165
/// Result of a compaction operation.
///
/// Returned by [`HnswIndex::compact()`] to provide metrics about the operation.
///
/// # Fields
///
/// * `tombstones_removed` - Number of deleted vectors removed
/// * `new_size` - Size of the new index (live vectors only)
/// * `duration_ms` - Time taken for the operation in milliseconds
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompactionResult {
    /// Number of tombstones (deleted vectors) removed during compaction.
    pub tombstones_removed: usize,
    /// New index size after compaction (live vectors only).
    pub new_size: usize,
    /// Wall-clock time taken for the compaction operation in milliseconds.
    pub duration_ms: u64,
}
184
/// A node in the HNSW graph with its adjacency information.
///
/// # Layout
///
/// Total size: 16 bytes
/// Alignment: 8 bytes
///
/// # Fields
///
/// - `vector_id`: 8 bytes
/// - `neighbor_offset`: 4 bytes
/// - `neighbor_len`: 2 bytes
/// - `max_layer`: 1 byte
/// - `deleted`: 1 byte (soft delete flag, v0.3.0)
///
/// # Soft Delete (v0.3.0)
///
/// The `deleted` field enables O(1) soft delete. Deleted nodes remain in
/// the graph for routing but are excluded from search results.
/// - `deleted = 0`: Node is live (default)
/// - `deleted = 1`: Node is deleted (tombstone)
///
/// # Safety
///
/// Derives `Pod` and `Zeroable` for safe byte casting via bytemuck.
/// The `#[repr(C)]` ensures deterministic layout for serialization.
/// All fields are primitive types with no padding gaps.
#[derive(Clone, Copy, Debug, Serialize, Deserialize, Pod, Zeroable)]
#[repr(C)]
pub struct HnswNode {
    /// The vector ID this node represents
    pub vector_id: VectorId,

    /// Byte offset into the COMPRESSED neighbor pool (`NeighborPool.buffer`)
    pub neighbor_offset: u32,

    /// Length of neighbor data in bytes. Stores the ALLOCATED capacity of the
    /// slot (see `set_neighbors`), not the encoded length.
    pub neighbor_len: u16,

    /// The maximum layer this node appears in
    pub max_layer: u8,

    /// Soft delete flag: 0 = live, 1 = deleted (v0.3.0)
    /// This field replaces the padding byte from v0.2.x.
    pub deleted: u8,
}
231
/// The HNSW Graph structure managing layers and nodes.
///
/// # Memory
///
/// Uses a flattened representation for cache efficiency.
/// Nodes are stored in a contiguous vector.
///
/// # Soft Delete (v0.3.0)
///
/// Supports soft delete via tombstone marking. Deleted nodes remain
/// in the graph for routing but are excluded from search results.
/// Use `compact()` to reclaim space from deleted nodes.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct HnswIndex {
    /// Algorithm configuration
    pub config: HnswConfig,

    /// Node metadata (fixed-size per node); index into this Vec is the `NodeId`.
    pub(crate) nodes: Vec<HnswNode>,

    /// Compressed neighbor lists
    pub(crate) neighbors: NeighborPool,

    /// Entry point (highest layer node)
    pub(crate) entry_point: Option<NodeId>,

    /// Maximum layer in the graph
    pub(crate) max_layer: u8,

    /// Level probability multiplier (1/ln(M))
    pub(crate) level_mult: f32,

    /// Deterministic RNG state (seeded with a fixed value in the constructors)
    rng: ChaCha8Rng,

    /// Count of soft-deleted vectors (v0.3.0)
    /// This tracks the number of nodes with `deleted = 1`.
    #[serde(default)]
    pub(crate) deleted_count: usize,

    /// Compaction threshold ratio (v0.3.0)
    /// When tombstone_ratio() exceeds this value, needs_compaction() returns true.
    /// Default: 0.3 (30% tombstones triggers compaction recommendation)
    #[serde(default = "default_compaction_threshold")]
    compaction_threshold: f64,

    /// Integrated metadata storage (v0.6.0 - RFC-002)
    ///
    /// Stores key-value metadata attached to vectors. Thread-safe when all
    /// contained types are `Send + Sync` (which they are for String/MetadataValue).
    ///
    /// Concurrent modification requires external synchronization, matching
    /// the existing pattern for other HnswIndex operations.
    #[serde(default)]
    pub(crate) metadata: MetadataStore,

    /// Binary quantization storage (v0.7.0 - RFC-002 Phase 2)
    ///
    /// Optional storage for binary quantized vectors. When enabled, vectors
    /// are stored in both F32 and binary format. BQ provides 32x memory
    /// reduction and 3-5x search speedup at the cost of some recall.
    ///
    /// Use `with_bq()` to create an index with BQ enabled, or `enable_bq()`
    /// to add BQ support to an existing index.
    ///
    /// NOTE(review): `#[serde(skip)]` means this field is NOT serialized —
    /// presumably it must be rebuilt via `enable_bq()` after deserialization;
    /// confirm against the persistence path.
    #[serde(skip)]
    pub(crate) bq_storage: Option<BinaryVectorStorage>,
}
299
/// Default compaction threshold (30%), used as the serde fallback when the
/// `compaction_threshold` field is absent from serialized data.
fn default_compaction_threshold() -> f64 {
    0.30
}
304
/// Error type for individual batch delete failures
/// [C4 FIX] Enables caller to distinguish failure reasons
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BatchDeleteError {
    /// Vector ID not found in index
    NotFound(VectorId),
    /// Vector was already deleted (idempotent, not an error;
    /// excluded by `BatchDeleteResult::has_errors`)
    AlreadyDeleted(VectorId),
    /// Internal error during deletion, with a human-readable description
    InternalError(VectorId, String),
}
316
/// Result of a batch delete operation
/// [C4 FIX] Includes detailed error information
#[derive(Debug, Clone)]
pub struct BatchDeleteResult {
    /// Number of vectors successfully deleted
    pub deleted: usize,
    /// Number of vectors that were already deleted (idempotent)
    pub already_deleted: usize,
    /// Number of invalid IDs (not found in index)
    pub invalid_ids: usize,
    /// Total IDs in input (including duplicates)
    pub total: usize,
    /// [m2 FIX] Number of unique IDs processed (duplicates removed)
    pub unique_count: usize,
    /// [C4 FIX] Detailed errors for failed IDs
    pub errors: Vec<BatchDeleteError>,
}
334
335impl BatchDeleteResult {
336 /// Create a new empty result
337 #[must_use]
338 pub fn new() -> Self {
339 Self {
340 deleted: 0,
341 already_deleted: 0,
342 invalid_ids: 0,
343 total: 0,
344 unique_count: 0,
345 errors: Vec::new(),
346 }
347 }
348
349 /// Check if all operations succeeded (no invalid IDs)
350 #[must_use]
351 pub fn all_valid(&self) -> bool {
352 self.invalid_ids == 0
353 }
354
355 /// Check if any deletions occurred
356 #[must_use]
357 pub fn any_deleted(&self) -> bool {
358 self.deleted > 0
359 }
360
361 /// Check if there were any errors (not including already-deleted)
362 #[must_use]
363 pub fn has_errors(&self) -> bool {
364 self.invalid_ids > 0
365 || !self
366 .errors
367 .iter()
368 .all(|e| matches!(e, BatchDeleteError::AlreadyDeleted(_)))
369 }
370}
371
372impl Default for BatchDeleteResult {
373 fn default() -> Self {
374 Self::new()
375 }
376}
377
378impl HnswIndex {
379 /// Creates a new empty HNSW graph.
380 ///
381 /// # Arguments
382 ///
383 /// * `config` - HNSW configuration parameters.
384 /// * `storage` - Vector storage to validate against.
385 ///
386 /// # Errors
387 ///
388 /// Returns `GraphError::ConfigMismatch` if storage dimensions differ from config.
389 /// Returns `GraphError::InvalidConfig` if configuration parameters are invalid (e.g., M <= 1).
390 pub fn new(config: HnswConfig, storage: &VectorStorage) -> Result<Self, GraphError> {
391 if config.dimensions != storage.dimensions() {
392 return Err(GraphError::ConfigMismatch {
393 expected: storage.dimensions(),
394 actual: config.dimensions,
395 });
396 }
397
398 if config.m <= 1 {
399 return Err(GraphError::InvalidConfig(format!(
400 "m must be > 1, got {}",
401 config.m
402 )));
403 }
404 if config.m0 < config.m {
405 return Err(GraphError::InvalidConfig(format!(
406 "m0 must be >= m, got {} < {}",
407 config.m0, config.m
408 )));
409 }
410
411 // Calculate level multiplier: m_l = 1 / ln(M)
412 let m_float = config.m as f32;
413 let level_mult = if m_float > 1.0 {
414 1.0 / m_float.ln()
415 } else {
416 0.0
417 };
418
419 // Initialize RNG with a default seed for determinism.
420 let rng = ChaCha8Rng::seed_from_u64(42);
421
422 Ok(Self {
423 config,
424 nodes: Vec::new(),
425 neighbors: NeighborPool::new(),
426 entry_point: None,
427 max_layer: 0,
428 level_mult,
429 rng,
430 deleted_count: 0, // v0.3.0: No deleted nodes initially
431 compaction_threshold: default_compaction_threshold(), // v0.3.0: Default 30%
432 metadata: MetadataStore::new(), // v0.6.0 RFC-002: Empty metadata store
433 bq_storage: None, // v0.7.0 RFC-002 Phase 2: BQ disabled by default
434 })
435 }
436
437 /// Creates a new HNSW graph with pre-populated metadata (v0.6.0 - RFC-002).
438 ///
439 /// This constructor allows initializing the index with existing metadata,
440 /// useful for deserialization or migration scenarios.
441 ///
442 /// # Arguments
443 ///
444 /// * `config` - HNSW configuration parameters.
445 /// * `storage` - Vector storage to validate against.
446 /// * `metadata` - Pre-populated metadata store.
447 ///
448 /// # Errors
449 ///
450 /// Returns `GraphError::ConfigMismatch` if storage dimensions differ from config.
451 /// Returns `GraphError::InvalidConfig` if configuration parameters are invalid.
452 ///
453 /// # Example
454 ///
455 /// ```
456 /// use edgevec::hnsw::{HnswConfig, HnswIndex};
457 /// use edgevec::metadata::{MetadataStore, MetadataValue};
458 /// use edgevec::storage::VectorStorage;
459 ///
460 /// let config = HnswConfig::new(4);
461 /// let storage = VectorStorage::new(&config, None);
462 ///
463 /// let mut metadata = MetadataStore::new();
464 /// metadata.insert(1, "key", MetadataValue::String("value".into())).unwrap();
465 ///
466 /// let index = HnswIndex::with_metadata(config, &storage, metadata).unwrap();
467 /// assert!(index.metadata().has_key(1, "key"));
468 /// ```
469 pub fn with_metadata(
470 config: HnswConfig,
471 storage: &VectorStorage,
472 metadata: MetadataStore,
473 ) -> Result<Self, GraphError> {
474 if config.dimensions != storage.dimensions() {
475 return Err(GraphError::ConfigMismatch {
476 expected: storage.dimensions(),
477 actual: config.dimensions,
478 });
479 }
480
481 if config.m <= 1 {
482 return Err(GraphError::InvalidConfig(format!(
483 "m must be > 1, got {}",
484 config.m
485 )));
486 }
487 if config.m0 < config.m {
488 return Err(GraphError::InvalidConfig(format!(
489 "m0 must be >= m, got {} < {}",
490 config.m0, config.m
491 )));
492 }
493
494 let m_float = config.m as f32;
495 let level_mult = if m_float > 1.0 {
496 1.0 / m_float.ln()
497 } else {
498 0.0
499 };
500
501 let rng = ChaCha8Rng::seed_from_u64(42);
502
503 Ok(Self {
504 config,
505 nodes: Vec::new(),
506 neighbors: NeighborPool::new(),
507 entry_point: None,
508 max_layer: 0,
509 level_mult,
510 rng,
511 deleted_count: 0,
512 compaction_threshold: default_compaction_threshold(),
513 metadata, // Use provided metadata
514 bq_storage: None,
515 })
516 }
517
518 /// Creates a new index with binary quantization enabled (v0.7.0 - RFC-002 Phase 2).
519 ///
520 /// BQ provides 32x memory reduction and 3-5x search speedup at the cost of
521 /// some recall (which can be recovered via rescoring).
522 ///
523 /// # Arguments
524 ///
525 /// * `config` - HNSW configuration parameters.
526 /// * `storage` - Vector storage to validate against.
527 ///
528 /// # Errors
529 ///
530 /// Returns error if dimension is not divisible by 8 (required for binary packing).
531 /// Returns `GraphError::ConfigMismatch` if storage dimensions differ from config.
532 /// Returns `GraphError::InvalidConfig` if configuration parameters are invalid.
533 ///
534 /// # Example
535 ///
536 /// ```
537 /// use edgevec::hnsw::{HnswConfig, HnswIndex};
538 /// use edgevec::storage::VectorStorage;
539 ///
540 /// let config = HnswConfig::new(128); // 128D, divisible by 8
541 /// let storage = VectorStorage::new(&config, None);
542 /// let index = HnswIndex::with_bq(config, &storage).unwrap();
543 ///
544 /// assert!(index.has_bq());
545 /// ```
546 pub fn with_bq(config: HnswConfig, storage: &VectorStorage) -> Result<Self, GraphError> {
547 let dimension = config.dimensions as usize;
548
549 // Validate dimension is compatible with BQ (divisible by 8)
550 if dimension % 8 != 0 {
551 return Err(GraphError::InvalidConfig(format!(
552 "dimension must be divisible by 8 for binary quantization, got {dimension}"
553 )));
554 }
555
556 let mut index = Self::new(config, storage)?;
557
558 index.bq_storage = Some(
559 BinaryVectorStorage::new(dimension).map_err(|e| GraphError::Storage(e.to_string()))?,
560 );
561
562 Ok(index)
563 }
564
    /// Enables binary quantization on an existing index (v0.7.0 - RFC-002 Phase 2).
    ///
    /// This creates a new BQ storage and quantizes all existing vectors.
    /// Time complexity: O(n × d) where n = vector count, d = dimension.
    ///
    /// # Arguments
    ///
    /// * `storage` - The F32 vector storage containing vectors to quantize.
    ///
    /// # Errors
    ///
    /// Returns error if dimension is not divisible by 8.
    /// Also propagates failures from BQ storage creation/insertion
    /// (`GraphError::Storage`) and quantization (`GraphError::Quantization`).
    ///
    /// # Example
    ///
    /// ```
    /// use edgevec::hnsw::{HnswConfig, HnswIndex};
    /// use edgevec::storage::VectorStorage;
    ///
    /// let config = HnswConfig::new(128);
    /// let mut storage = VectorStorage::new(&config, None);
    /// let mut index = HnswIndex::new(config, &storage).unwrap();
    ///
    /// // Insert some vectors
    /// index.insert(&vec![1.0f32; 128], &mut storage).unwrap();
    ///
    /// // Enable BQ later
    /// index.enable_bq(&storage).unwrap();
    /// assert!(index.has_bq());
    /// ```
    pub fn enable_bq(&mut self, storage: &VectorStorage) -> Result<(), GraphError> {
        let dimension = self.config.dimensions as usize;

        // Binary packing stores 8 components per byte, so d must be a multiple of 8.
        if dimension % 8 != 0 {
            return Err(GraphError::InvalidConfig(format!(
                "dimension must be divisible by 8 for binary quantization, got {dimension}"
            )));
        }

        let mut bq_storage =
            BinaryVectorStorage::new(dimension).map_err(|e| GraphError::Storage(e.to_string()))?;

        // Quantize all existing non-deleted vectors.
        // NOTE(review): iterating `self.nodes` in order means BQ slot i lines
        // up with node index i — this assumes BQ lookups are keyed by node
        // order; confirm against the BQ search path.
        for node in &self.nodes {
            if node.deleted != 0 {
                // Insert placeholder for deleted nodes to maintain ID alignment
                // (dimension/8 zero bytes = one packed binary vector).
                let zeros = vec![0u8; dimension / 8];
                bq_storage
                    .insert_raw(&zeros)
                    .map_err(|e| GraphError::Storage(e.to_string()))?;
                continue;
            }

            let vector = storage.get_vector(node.vector_id);
            let bv = BinaryVector::quantize(&vector)
                .map_err(|e| GraphError::Quantization(e.to_string()))?;
            bq_storage
                .insert(&bv)
                .map_err(|e| GraphError::Storage(e.to_string()))?;
        }

        self.bq_storage = Some(bq_storage);
        Ok(())
    }
629
    /// Returns true if binary quantization is enabled (i.e., `bq_storage` is populated).
    #[must_use]
    #[inline]
    pub fn has_bq(&self) -> bool {
        self.bq_storage.is_some()
    }
636
    /// Returns a reference to the BQ storage, if enabled.
    ///
    /// `None` means BQ was never enabled (or the index was deserialized —
    /// the field is `#[serde(skip)]`).
    #[must_use]
    #[inline]
    pub fn bq_storage(&self) -> Option<&BinaryVectorStorage> {
        self.bq_storage.as_ref()
    }
643
644 /// Generates a random level for a new node.
645 ///
646 /// Formula: `floor(-ln(uniform(0,1)) * m_l)`
647 /// Clamped to `max_level` (e.g. 16) to prevent memory explosion.
648 #[must_use]
649 pub fn get_random_level(&mut self) -> u8 {
650 // Generate uniform(0, 1)
651 let r: f32 = self.rng.gen_range(f32::EPSILON..=1.0);
652 let level = (-r.ln() * self.level_mult).floor();
653
654 // Safety cap (e.g. 16)
655 if level > 16.0 {
656 16
657 } else {
658 level as u8
659 }
660 }
661
662 /// Adds a node to the graph.
663 ///
664 /// # Arguments
665 ///
666 /// * `vector_id` - The external vector identifier
667 /// * `max_layer` - The maximum layer for this node
668 ///
669 /// # Returns
670 ///
671 /// The new `NodeId` assigned to this node, or a `GraphError`.
672 pub fn add_node(&mut self, vector_id: VectorId, max_layer: u8) -> Result<NodeId, GraphError> {
673 if vector_id == VectorId::INVALID {
674 return Err(GraphError::InvalidVectorId);
675 }
676
677 // Safety limit for NodeId
678 if self.nodes.len() >= u32::MAX as usize {
679 return Err(GraphError::CapacityExceeded);
680 }
681
682 let node = HnswNode {
683 vector_id,
684 neighbor_offset: 0,
685 neighbor_len: 0,
686 max_layer,
687 deleted: 0, // Live node (v0.3.0)
688 };
689
690 #[allow(clippy::cast_possible_truncation)]
691 let id = NodeId(self.nodes.len() as u32);
692 self.nodes.push(node);
693
694 // Update max layer if needed
695 if max_layer > self.max_layer {
696 self.max_layer = max_layer;
697 }
698
699 Ok(id)
700 }
701
702 /// Sets the neighbors for a node.
703 ///
704 /// # Arguments
705 /// * `node_id` - The node to update.
706 /// * `neighbors` - The list of neighbor IDs.
707 pub fn set_neighbors(
708 &mut self,
709 node_id: NodeId,
710 neighbors: &[NodeId],
711 ) -> Result<(), GraphError> {
712 if node_id.0 as usize >= self.nodes.len() {
713 return Err(GraphError::InvalidVectorId);
714 }
715
716 // Convert NodeId to u32 for encoding
717 let neighbor_u32s: Vec<u32> = neighbors.iter().map(|n| n.0).collect();
718 let encoded = NeighborPool::encode_neighbors(&neighbor_u32s);
719
720 // Alloc new space
721 let (offset, capacity) = self.neighbors.alloc(encoded.len())?;
722
723 // Write data
724 let start = offset as usize;
725 let end = start + encoded.len();
726 self.neighbors.buffer[start..end].copy_from_slice(&encoded);
727
728 // Update node and free old
729 let node = &mut self.nodes[node_id.0 as usize];
730
731 // Free old slot if it existed
732 if node.neighbor_len > 0 {
733 self.neighbors.free(node.neighbor_offset, node.neighbor_len);
734 }
735
736 node.neighbor_offset = offset;
737 node.neighbor_len = capacity; // Store allocated capacity
738
739 Ok(())
740 }
741
742 /// Retrieves a node by its ID.
743 #[must_use]
744 pub fn get_node(&self, id: NodeId) -> Option<&HnswNode> {
745 if id == NodeId::INVALID {
746 return None;
747 }
748 self.nodes.get(id.0 as usize)
749 }
750
751 /// Retrieves the neighbors for a node.
752 pub fn get_neighbors(&self, node: &HnswNode) -> Result<Vec<NodeId>, GraphError> {
753 let start = node.neighbor_offset as usize;
754 // Read up to allocated capacity
755 let end = start + node.neighbor_len as usize;
756
757 if end > self.neighbors.buffer.len() {
758 return Err(GraphError::NeighborError);
759 }
760
761 let slice = &self.neighbors.buffer[start..end];
762 let raw_neighbors = NeighborPool::decode_neighbors(slice);
763
764 // Convert back to NodeId
765 let neighbors = raw_neighbors.into_iter().map(NodeId).collect();
766 Ok(neighbors)
767 }
768
769 /// Retrieves the neighbors for a specific layer.
770 pub fn get_neighbors_layer(
771 &self,
772 node: &HnswNode,
773 layer: u8,
774 ) -> Result<Vec<NodeId>, GraphError> {
775 let start = node.neighbor_offset as usize;
776 let end = start + node.neighbor_len as usize;
777
778 if end > self.neighbors.buffer.len() {
779 return Err(GraphError::NeighborError);
780 }
781
782 let slice = &self.neighbors.buffer[start..end];
783 let raw_neighbors = NeighborPool::decode_layer(slice, layer);
784
785 // Convert back to NodeId
786 let neighbors = raw_neighbors.into_iter().map(NodeId).collect();
787 Ok(neighbors)
788 }
789
    /// Returns the number of nodes in the graph, including soft-deleted ones.
    #[must_use]
    pub fn node_count(&self) -> usize {
        self.nodes.len()
    }
795
    /// Returns the entry point node ID, if any (`None` for an empty graph).
    #[must_use]
    pub fn entry_point(&self) -> Option<NodeId> {
        self.entry_point
    }
801
    /// Sets the entry point node ID. The caller is responsible for passing a
    /// valid, live node; no bounds check is performed here.
    pub fn set_entry_point(&mut self, id: NodeId) {
        self.entry_point = Some(id);
    }
806
    /// Returns the current maximum layer in the graph (0 for an empty graph).
    #[must_use]
    pub fn max_layer(&self) -> u8 {
        self.max_layer
    }
812
813 // ============================================================================
814 // Metadata API (v0.6.0 - RFC-002)
815 // ============================================================================
816
    /// Returns an immutable reference to the metadata store.
    ///
    /// The metadata store contains key-value pairs attached to vectors.
    /// Use this method to query metadata without modifying it.
    ///
    /// # Thread Safety
    ///
    /// `MetadataStore` is `Send + Sync` when all contained types are `Send + Sync`.
    /// `String` and `MetadataValue` are both `Send + Sync`, so this is safe.
    ///
    /// # Example
    ///
    /// ```
    /// use edgevec::hnsw::{HnswConfig, HnswIndex};
    /// use edgevec::storage::VectorStorage;
    ///
    /// let config = HnswConfig::new(4);
    /// let storage = VectorStorage::new(&config, None);
    /// let index = HnswIndex::new(config, &storage).unwrap();
    ///
    /// assert!(index.metadata().is_empty());
    /// ```
    #[must_use]
    pub fn metadata(&self) -> &MetadataStore {
        &self.metadata
    }
843
    /// Returns a mutable reference to the metadata store.
    ///
    /// Use this method to modify metadata attached to vectors. Note that
    /// direct mutation bypasses the atomicity guarantees of
    /// `insert_with_metadata`.
    ///
    /// # Thread Safety
    ///
    /// Concurrent modification requires external synchronization (e.g., `Mutex`),
    /// matching the existing pattern for other `HnswIndex` operations.
    ///
    /// # Example
    ///
    /// ```
    /// use edgevec::hnsw::{HnswConfig, HnswIndex};
    /// use edgevec::metadata::MetadataValue;
    /// use edgevec::storage::VectorStorage;
    ///
    /// let config = HnswConfig::new(4);
    /// let storage = VectorStorage::new(&config, None);
    /// let mut index = HnswIndex::new(config, &storage).unwrap();
    ///
    /// index.metadata_mut()
    ///     .insert(1, "category", MetadataValue::String("books".into()))
    ///     .unwrap();
    ///
    /// assert!(index.metadata().has_key(1, "category"));
    /// ```
    pub fn metadata_mut(&mut self) -> &mut MetadataStore {
        &mut self.metadata
    }
873
874 /// Inserts a vector with metadata atomically (v0.6.0 - RFC-002).
875 ///
876 /// This method validates metadata BEFORE inserting the vector, ensuring
877 /// that either both the vector and metadata are stored, or neither is.
878 ///
879 /// # Arguments
880 ///
881 /// * `storage` - The vector storage to insert into.
882 /// * `vector` - The vector data (must match configured dimensions).
883 /// * `metadata` - Key-value metadata pairs to attach to the vector.
884 ///
885 /// # Returns
886 ///
887 /// The assigned `VectorId` on success.
888 ///
889 /// # Errors
890 ///
891 /// Returns `GraphError::MetadataValidation` if metadata fails validation:
892 /// - More than 64 keys
893 /// - Key longer than 256 bytes
894 /// - String value longer than 64KB
895 /// - String array with more than 1024 elements
896 /// - Invalid key format (must be alphanumeric + underscore)
897 /// - Invalid float (NaN or Infinity)
898 ///
899 /// On error, the index remains unchanged (no partial state).
900 ///
901 /// # Example
902 ///
903 /// ```
904 /// use edgevec::hnsw::{HnswConfig, HnswIndex};
905 /// use edgevec::metadata::MetadataValue;
906 /// use edgevec::storage::VectorStorage;
907 /// use std::collections::HashMap;
908 ///
909 /// let config = HnswConfig::new(4);
910 /// let mut storage = VectorStorage::new(&config, None);
911 /// let mut index = HnswIndex::new(config, &storage).unwrap();
912 ///
913 /// let mut metadata = HashMap::new();
914 /// metadata.insert("category".to_string(), MetadataValue::String("books".into()));
915 /// metadata.insert("price".to_string(), MetadataValue::Float(29.99));
916 ///
917 /// let vector_id = index.insert_with_metadata(
918 /// &mut storage,
919 /// &[1.0, 2.0, 3.0, 4.0],
920 /// metadata,
921 /// ).unwrap();
922 ///
923 /// // Note: VectorId is u64, but metadata uses u32 IDs
924 /// #[allow(clippy::cast_possible_truncation)]
925 /// let meta_id = vector_id.0 as u32;
926 /// assert!(index.metadata().has_key(meta_id, "category"));
927 /// ```
928 pub fn insert_with_metadata(
929 &mut self,
930 storage: &mut VectorStorage,
931 vector: &[f32],
932 metadata: HashMap<String, MetadataValue>,
933 ) -> Result<VectorId, GraphError> {
934 use crate::metadata::validation::{validate_key_value, MAX_KEYS_PER_VECTOR};
935
936 // Step 1: Validate metadata BEFORE any mutation
937 // Check key count limit
938 if metadata.len() > MAX_KEYS_PER_VECTOR {
939 return Err(GraphError::MetadataValidation(MetadataError::TooManyKeys {
940 vector_id: 0, // Unknown yet
941 count: metadata.len(),
942 max: MAX_KEYS_PER_VECTOR,
943 }));
944 }
945
946 // Validate each key-value pair
947 for (key, value) in &metadata {
948 validate_key_value(key, value)?;
949 }
950
951 // Step 2: Insert vector (this is atomic — either succeeds or fails)
952 let vector_id = self.insert(vector, storage)?;
953
954 // Step 3: Store metadata for the newly inserted vector
955 // This cannot fail because we pre-validated everything
956 // Note: VectorId is u64 but MetadataStore uses u32. In practice,
957 // vector IDs won't exceed u32::MAX (4B vectors).
958 #[allow(clippy::cast_possible_truncation)]
959 let metadata_id = vector_id.0 as u32;
960
961 for (key, value) in metadata {
962 // Pre-validated above, but propagate errors instead of panicking
963 // in case of unexpected edge cases (e.g., key count limits).
964 self.metadata
965 .insert(metadata_id, &key, value)
966 .map_err(GraphError::MetadataValidation)?;
967 }
968
969 Ok(vector_id)
970 }
971
972 // ============================================================================
973 // Filtered Search API (W26.2.3 - RFC-002)
974 // ============================================================================
975
976 /// Search with post-filtering using metadata expressions.
977 ///
978 /// Performs a vector similarity search and filters results based on a
979 /// metadata filter expression. Uses adaptive overfetch to ensure enough
980 /// results pass the filter.
981 ///
982 /// # Arguments
983 ///
984 /// * `storage` - The vector storage
985 /// * `query` - The query vector
986 /// * `filter` - Filter expression (e.g., `"category = \"books\" AND price < 100"`)
987 /// * `k` - Number of results to return
988 ///
989 /// # Returns
990 ///
991 /// Up to `k` results that pass the filter, sorted by distance (ascending).
992 /// May return fewer than `k` results if not enough vectors pass the filter.
993 ///
994 /// # Errors
995 ///
996 /// * `GraphError::FilterParse` - Invalid filter syntax
997 /// * `GraphError::FilterEval` - Filter evaluation failed
998 /// * Other `GraphError` variants from underlying search
999 ///
1000 /// # Algorithm (RFC-002 §3.2)
1001 ///
1002 /// 1. Parse filter expression
1003 /// 2. Use default selectivity = 0.50 (refined in W26.3.1)
1004 /// 3. Calculate overfetch_factor = min(10, max(2, 1 / selectivity))
1005 /// 4. Search with `k * overfetch_factor` candidates
1006 /// 5. Post-filter results using metadata evaluation
1007 /// 6. Return top-k passing results
1008 ///
1009 /// # Example
1010 ///
1011 /// ```
1012 /// use edgevec::hnsw::{HnswConfig, HnswIndex};
1013 /// use edgevec::storage::VectorStorage;
1014 /// use edgevec::metadata::MetadataValue;
1015 /// use std::collections::HashMap;
1016 ///
1017 /// let config = HnswConfig::new(4);
1018 /// let mut storage = VectorStorage::new(&config, None);
1019 /// let mut index = HnswIndex::new(config, &storage).unwrap();
1020 ///
1021 /// // Insert vectors with metadata
1022 /// let mut meta = HashMap::new();
1023 /// meta.insert("category".to_string(), MetadataValue::String("books".into()));
1024 /// meta.insert("price".to_string(), MetadataValue::Integer(25));
1025 /// let _ = index.insert_with_metadata(&mut storage, &[1.0, 0.0, 0.0, 0.0], meta);
1026 ///
1027 /// // Search with filter
1028 /// let results = index.search_filtered(
1029 /// &storage,
1030 /// &[1.0, 0.0, 0.0, 0.0],
1031 /// "category = \"books\" AND price < 100",
1032 /// 5,
1033 /// ).unwrap();
1034 /// ```
1035 pub fn search_filtered(
1036 &self,
1037 storage: &VectorStorage,
1038 query: &[f32],
1039 filter: &str,
1040 k: usize,
1041 ) -> Result<Vec<(VectorId, f32)>, GraphError> {
1042 use crate::filter::{
1043 estimate_filter_selectivity, evaluate, overfetch_from_selectivity, parse,
1044 };
1045
1046 // Step 1: Parse filter expression
1047 let expr = parse(filter).map_err(|e| GraphError::FilterParse(e.to_string()))?;
1048
1049 // Step 2: Estimate selectivity using heuristics (W26.3.1)
1050 let selectivity = estimate_filter_selectivity(&expr);
1051
1052 // Step 3: Calculate overfetch factor from selectivity
1053 let overfetch_factor = overfetch_from_selectivity(selectivity);
1054
1055 // Step 4: Search with overfetched k
1056 let overfetched_k = k.saturating_mul(overfetch_factor);
1057 let candidates = self.search(query, overfetched_k, storage)?;
1058
1059 // Step 5: Post-filter results
1060 let mut passing_results = Vec::with_capacity(k);
1061
1062 for result in candidates {
1063 // Get metadata for this vector
1064 #[allow(clippy::cast_possible_truncation)]
1065 let metadata_id = result.vector_id.0 as u32;
1066
1067 // Get all metadata for this vector (empty HashMap if none)
1068 let metadata = self
1069 .metadata
1070 .get_all(metadata_id)
1071 .cloned()
1072 .unwrap_or_default();
1073
1074 // Evaluate filter
1075 match evaluate(&expr, &metadata) {
1076 Ok(true) => {
1077 passing_results.push((result.vector_id, result.distance));
1078 if passing_results.len() >= k {
1079 break;
1080 }
1081 }
1082 Ok(false) => {
1083 // Filter didn't match, skip
1084 }
1085 Err(e) => {
1086 // Filter evaluation error (e.g., type mismatch)
1087 // Per RFC-002: treat as filter failure, skip this result
1088 // In production, we might want to log this
1089 log::debug!(
1090 "Filter evaluation failed for vector {}: {}",
1091 result.vector_id.0,
1092 e
1093 );
1094 }
1095 }
1096 }
1097
1098 // Step 6: Return results (already sorted by distance from search)
1099 Ok(passing_results)
1100 }
1101
1102 // ============================================================================
1103 // Soft Delete API (W16.2 - RFC-001)
1104 // ============================================================================
1105
1106 /// Get mutable reference to a node by VectorId.
1107 ///
1108 /// # Complexity
1109 ///
1110 /// * Time: O(n) linear scan
1111 /// * Space: O(1)
1112 fn get_node_mut(&mut self, vector_id: VectorId) -> Result<&mut HnswNode, GraphError> {
1113 self.nodes
1114 .iter_mut()
1115 .find(|n| n.vector_id == vector_id)
1116 .ok_or(GraphError::InvalidVectorId)
1117 }
1118
1119 /// Get immutable reference to a node by VectorId.
1120 ///
1121 /// # Complexity
1122 ///
1123 /// * Time: O(n) linear scan
1124 /// * Space: O(1)
1125 fn get_node_by_vector_id(&self, vector_id: VectorId) -> Result<&HnswNode, GraphError> {
1126 self.nodes
1127 .iter()
1128 .find(|n| n.vector_id == vector_id)
1129 .ok_or(GraphError::InvalidVectorId)
1130 }
1131
1132 /// Mark a vector as deleted (soft delete).
1133 ///
1134 /// The vector remains in the graph for routing but is excluded from
1135 /// search results. Space is reclaimed via `compact()`.
1136 ///
1137 /// # Arguments
1138 ///
1139 /// * `vector_id` - The ID of the vector to delete
1140 ///
1141 /// # Returns
1142 ///
1143 /// * `Ok(true)` - Vector was deleted
1144 /// * `Ok(false)` - Vector was already deleted
1145 /// * `Err(InvalidVectorId)` - Vector ID not found
1146 ///
1147 /// # Complexity
1148 ///
1149 /// * Time: **O(n)** for VectorId → Node lookup via linear scan,
1150 /// plus O(1) for setting the deleted byte
1151 /// * Space: O(1)
1152 ///
1153 /// **Note:** The O(n) lookup is a known limitation in v0.3.0.
1154 /// A HashMap<VectorId, NodeId> index could provide O(1) lookup
1155 /// but is deferred to v0.4.0 to avoid scope creep.
1156 ///
1157 /// # Thread Safety (RFC-001 Design)
1158 ///
1159 /// This method requires `&mut self`, which means Rust's borrow checker
1160 /// prevents concurrent access at compile time. This is intentional:
1161 ///
1162 /// * Search (`&self`) and delete (`&mut self`) cannot run concurrently
1163 /// * No atomics or locks needed - safety is enforced by the type system
1164 /// * See RFC-001 Section 6.4 "Design Decisions" for rationale
1165 ///
1166 /// # Persistence
1167 ///
1168 /// **IMPORTANT:** Delete operations are in-memory only until `save()` is called.
1169 /// If the process crashes before `save()`, the delete will be lost.
1170 ///
1171 /// # Example
1172 ///
1173 /// ```ignore
1174 /// let deleted = index.soft_delete(VectorId(42))?;
1175 /// assert!(deleted);
1176 /// assert!(index.is_deleted(VectorId(42))?);
1177 /// ```
1178 pub fn soft_delete(&mut self, vector_id: VectorId) -> Result<bool, GraphError> {
1179 let node = self.get_node_mut(vector_id)?;
1180
1181 if node.deleted != 0 {
1182 return Ok(false); // Already deleted
1183 }
1184
1185 node.deleted = 1;
1186 self.deleted_count += 1;
1187
1188 // RFC-002 §2.3: Remove metadata when vector is soft-deleted
1189 // Note: VectorId is u64 but MetadataStore uses u32. In practice,
1190 // vector IDs won't exceed u32::MAX (4B vectors).
1191 #[allow(clippy::cast_possible_truncation)]
1192 let metadata_id = vector_id.0 as u32;
1193 self.metadata.delete_all(metadata_id);
1194
1195 Ok(true)
1196 }
1197
1198 /// Delete multiple vectors in a single operation
1199 ///
1200 /// **[C5 FIX] Two-Phase Implementation:**
1201 /// 1. Pre-validation: Check all IDs exist and are not already deleted
1202 /// 2. Execution: Apply deletions (guaranteed to succeed after validation)
1203 ///
1204 /// This prevents partial failures from leaving the index in an inconsistent state.
1205 ///
1206 /// **[C2 FIX] Duplicate Handling:**
1207 /// Duplicate IDs in the input are automatically deduplicated. Only the first
1208 /// occurrence is processed, duplicates are counted in `total` but not `unique_count`.
1209 ///
1210 /// **[M2 FIX] Memory Bounds:**
1211 /// Maximum batch size is capped at 10 million IDs to prevent memory exhaustion.
1212 ///
1213 /// # Arguments
1214 /// * `ids` - Slice of VectorId values to delete (duplicates allowed)
1215 ///
1216 /// # Returns
1217 /// * `BatchDeleteResult` with counts and detailed errors
1218 ///
1219 /// # Complexity
1220 /// * Time: O(N × M) where N = unique IDs, M = index size (for ID lookup)
1221 /// * Space: O(N) for deduplication and validation structures
1222 ///
1223 /// # Example
1224 /// ```
1225 /// use edgevec::hnsw::{HnswConfig, HnswIndex, VectorId};
1226 /// use edgevec::storage::VectorStorage;
1227 ///
1228 /// let config = HnswConfig::new(4);
1229 /// let mut storage = VectorStorage::new(&config, None);
1230 /// let mut index = HnswIndex::new(config, &storage).unwrap();
1231 ///
1232 /// // Insert some vectors
1233 /// for i in 0..10 {
1234 /// index.insert(&vec![i as f32; 4], &mut storage).unwrap();
1235 /// }
1236 ///
1237 /// // Batch delete
1238 /// let ids = vec![VectorId(1), VectorId(3), VectorId(5)];
1239 /// let result = index.soft_delete_batch(&ids);
1240 ///
1241 /// assert_eq!(result.deleted, 3);
1242 /// assert_eq!(result.total, 3);
1243 /// assert!(result.all_valid());
1244 /// ```
1245 pub fn soft_delete_batch(&mut self, ids: &[VectorId]) -> BatchDeleteResult {
1246 // [M2 FIX] Memory bounds check: cap at 10M IDs (~80MB allocation)
1247 const MAX_BATCH_SIZE: usize = 10_000_000;
1248
1249 let mut result = BatchDeleteResult {
1250 deleted: 0,
1251 already_deleted: 0,
1252 invalid_ids: 0,
1253 total: ids.len(),
1254 unique_count: 0,
1255 errors: Vec::new(),
1256 };
1257
1258 if ids.is_empty() {
1259 return result;
1260 }
1261
1262 // [M2 FIX] Check memory bounds
1263 if ids.len() > MAX_BATCH_SIZE {
1264 // Mark all as errors
1265 result.invalid_ids = ids.len();
1266 result.errors.push(BatchDeleteError::InternalError(
1267 VectorId(0),
1268 format!(
1269 "Batch size {} exceeds maximum {}",
1270 ids.len(),
1271 MAX_BATCH_SIZE
1272 ),
1273 ));
1274 return result;
1275 }
1276
1277 // [C2 FIX] Phase 0: Deduplication
1278 // Use HashSet to track seen IDs and eliminate duplicates
1279 let mut seen = HashSet::with_capacity(ids.len().min(1024)); // Cap initial allocation
1280 let mut unique_ids = Vec::with_capacity(ids.len().min(1024));
1281
1282 for &id in ids {
1283 if seen.insert(id) {
1284 unique_ids.push(id);
1285 }
1286 }
1287
1288 result.unique_count = unique_ids.len();
1289
1290 // [C5 FIX] Phase 1: Pre-validation
1291 // Check all unique IDs and categorize them BEFORE making any changes
1292 let estimated_errors = unique_ids.len() / 10; // Assume 10% error rate
1293 let mut valid_ids = Vec::with_capacity(unique_ids.len());
1294 let mut already_deleted_count = 0;
1295
1296 // [m1 FIX] Pre-allocate error vector with estimated capacity
1297 result.errors = Vec::with_capacity(estimated_errors);
1298
1299 for &id in &unique_ids {
1300 match self.is_deleted(id) {
1301 Ok(true) => {
1302 // Already deleted - not an error, just skip
1303 already_deleted_count += 1;
1304 result.errors.push(BatchDeleteError::AlreadyDeleted(id));
1305 }
1306 Ok(false) => {
1307 // Valid and not deleted - queue for deletion
1308 valid_ids.push(id);
1309 }
1310 Err(_) => {
1311 // ID not found
1312 result.invalid_ids += 1;
1313 result.errors.push(BatchDeleteError::NotFound(id));
1314 }
1315 }
1316 }
1317
1318 result.already_deleted = already_deleted_count;
1319
1320 // [C5 FIX] Phase 2: Execution
1321 // All IDs in valid_ids are guaranteed to exist and not be deleted
1322 // This phase should not fail
1323 for &id in &valid_ids {
1324 match self.soft_delete(id) {
1325 Ok(true) => result.deleted += 1,
1326 Ok(false) => {
1327 // Should not happen after validation, but handle gracefully
1328 result.already_deleted += 1;
1329 }
1330 Err(e) => {
1331 // Should not happen after validation
1332 result.errors.push(BatchDeleteError::InternalError(
1333 id,
1334 format!("Unexpected error after validation: {e:?}"),
1335 ));
1336 }
1337 }
1338 }
1339
1340 result
1341 }
1342
    /// Delete multiple vectors with progress callback
    ///
    /// **[C3 FIX] Two-Phase Implementation:**
    /// Performs the same two-phase algorithm as `soft_delete_batch()` —
    /// validation first, then execution — reporting progress during the
    /// execution phase. This ensures consistent behavior between variants.
    ///
    /// **[M3 FIX] Panic Safety:**
    /// If the callback panics, the operation aborts but the index remains in a valid state
    /// (no partial deletions are rolled back; all deletions applied so far stand).
    /// The panic will unwind past this function.
    ///
    /// Callback is invoked approximately every 10% of progress during execution phase.
    /// Useful for UI updates during large batch operations.
    ///
    /// # Arguments
    /// * `ids` - Slice of VectorId values to delete (duplicates allowed)
    /// * `callback` - Function called with (processed_unique, total_unique) counts
    ///
    /// # Complexity
    /// * Time: O(N × M) + O(N × C) where N = unique IDs, M = index size, C = callback cost
    /// * Space: O(N) for deduplication and validation
    ///
    /// # Example
    /// ```
    /// use edgevec::hnsw::{HnswConfig, HnswIndex, VectorId};
    /// use edgevec::storage::VectorStorage;
    ///
    /// let config = HnswConfig::new(4);
    /// let mut storage = VectorStorage::new(&config, None);
    /// let mut index = HnswIndex::new(config, &storage).unwrap();
    ///
    /// // Insert vectors
    /// for i in 0..100 {
    ///     index.insert(&vec![i as f32; 4], &mut storage).unwrap();
    /// }
    ///
    /// // Batch delete with progress
    /// let ids: Vec<VectorId> = (1..=50).map(VectorId).collect();
    /// let result = index.soft_delete_batch_with_progress(&ids, |processed, total| {
    ///     println!("Progress: {}/{}", processed, total);
    /// });
    /// ```
    pub fn soft_delete_batch_with_progress<F>(
        &mut self,
        ids: &[VectorId],
        mut callback: F,
    ) -> BatchDeleteResult
    where
        F: FnMut(usize, usize),
    {
        // [C3 FIX] Use the main two-phase implementation for consistency
        // We'll perform validation first (Phase 0-1), then execution with progress (Phase 2)

        // Phase 0-1: Deduplication and validation (same as soft_delete_batch)
        // [M2 FIX] Memory bounds: cap at 10M IDs (~80MB allocation)
        const MAX_BATCH_SIZE: usize = 10_000_000;

        let mut result = BatchDeleteResult {
            deleted: 0,
            already_deleted: 0,
            invalid_ids: 0,
            total: ids.len(),
            unique_count: 0,
            errors: Vec::new(),
        };

        if ids.is_empty() {
            // Still signal "done" so UI consumers see a terminal callback.
            callback(0, 0);
            return result;
        }

        if ids.len() > MAX_BATCH_SIZE {
            // Oversized batch: reject wholesale, nothing is deleted.
            result.invalid_ids = ids.len();
            result.errors.push(BatchDeleteError::InternalError(
                VectorId(0),
                format!(
                    "Batch size {} exceeds maximum {}",
                    ids.len(),
                    MAX_BATCH_SIZE
                ),
            ));
            return result;
        }

        // [C2 FIX] Deduplication: only the first occurrence of each ID is
        // processed; `seen.insert` returns false for repeats.
        let mut seen = HashSet::with_capacity(ids.len().min(1024));
        let mut unique_ids = Vec::with_capacity(ids.len().min(1024));

        for &id in ids {
            if seen.insert(id) {
                unique_ids.push(id);
            }
        }

        result.unique_count = unique_ids.len();

        // [C5 FIX] Pre-validation: categorize every unique ID before any mutation.
        let estimated_errors = unique_ids.len() / 10; // Assume ~10% error rate
        let mut valid_ids = Vec::with_capacity(unique_ids.len());
        let mut already_deleted_count = 0;
        // [m1 FIX] Pre-size the error vector; `errors` is still empty here,
        // so replacing it drops nothing.
        result.errors = Vec::with_capacity(estimated_errors);

        for &id in &unique_ids {
            match self.is_deleted(id) {
                Ok(true) => {
                    // Already deleted - recorded, but not an error condition.
                    already_deleted_count += 1;
                    result.errors.push(BatchDeleteError::AlreadyDeleted(id));
                }
                Ok(false) => {
                    // Live and present - queue for deletion.
                    valid_ids.push(id);
                }
                Err(_) => {
                    // ID not found in the index.
                    result.invalid_ids += 1;
                    result.errors.push(BatchDeleteError::NotFound(id));
                }
            }
        }

        result.already_deleted = already_deleted_count;

        // [C3 FIX] Phase 2: Execution with progress callbacks
        // Calculate progress interval (~10% increments, minimum 1)
        let total_to_process = valid_ids.len();
        let interval = (total_to_process / 10).max(1);
        let mut last_callback = 0;

        for (i, &id) in valid_ids.iter().enumerate() {
            match self.soft_delete(id) {
                Ok(true) => result.deleted += 1,
                Ok(false) => {
                    // Should not happen after validation; counted gracefully.
                    result.already_deleted += 1;
                }
                Err(e) => {
                    // Should not happen after validation; surfaced as error.
                    result.errors.push(BatchDeleteError::InternalError(
                        id,
                        format!("Unexpected error after validation: {e:?}"),
                    ));
                }
            }

            // Fire callback at ~10% intervals (and always on the final item)
            if i + 1 - last_callback >= interval || i + 1 == total_to_process {
                // [M3 FIX] Callback may panic - if it does, the operation aborts here
                // but the index state is valid (all deletions up to this point succeeded)
                callback(i + 1, total_to_process);
                last_callback = i + 1;
            }
        }

        result
    }
1492
1493 /// Check if a vector is marked as deleted.
1494 ///
1495 /// # Arguments
1496 ///
1497 /// * `vector_id` - The ID of the vector to check
1498 ///
1499 /// # Returns
1500 ///
1501 /// * `Ok(true)` - Vector is deleted
1502 /// * `Ok(false)` - Vector is live
1503 /// * `Err(InvalidVectorId)` - Vector ID not found
1504 ///
1505 /// # Complexity
1506 ///
1507 /// * Time: O(n) for lookup + O(1) for check
1508 /// * Space: O(1)
1509 pub fn is_deleted(&self, vector_id: VectorId) -> Result<bool, GraphError> {
1510 let node = self.get_node_by_vector_id(vector_id)?;
1511 Ok(node.deleted != 0)
1512 }
1513
1514 /// Get the count of deleted (tombstoned) vectors.
1515 ///
1516 /// # Returns
1517 ///
1518 /// Number of vectors marked as deleted.
1519 #[must_use]
1520 pub fn deleted_count(&self) -> usize {
1521 self.deleted_count
1522 }
1523
1524 /// Get the ratio of deleted vectors to total vectors.
1525 ///
1526 /// # Returns
1527 ///
1528 /// A value between 0.0 and 1.0 representing the tombstone ratio.
1529 /// Returns 0.0 if the index is empty.
1530 #[must_use]
1531 pub fn tombstone_ratio(&self) -> f64 {
1532 let total = self.node_count();
1533 if total == 0 {
1534 return 0.0;
1535 }
1536 self.deleted_count as f64 / total as f64
1537 }
1538
1539 /// Get count of live (non-deleted) vectors.
1540 ///
1541 /// # Returns
1542 ///
1543 /// Number of vectors that are not marked as deleted.
1544 #[must_use]
1545 pub fn live_count(&self) -> usize {
1546 self.node_count().saturating_sub(self.deleted_count)
1547 }
1548
    /// Maximum multiplier for adjusted_k to prevent excessive over-fetching.
    ///
    /// At 90%+ tombstones, we cap at 10x the original k to bound memory usage.
    /// Beyond this ratio, compaction should be triggered (see `needs_compaction()`
    /// and `compact()`).
    pub const MAX_ADJUSTED_K_MULTIPLIER: usize = 10;
1554
1555 /// Calculate adjusted k to compensate for tombstones.
1556 ///
1557 /// When the index has deleted vectors, we over-fetch to ensure
1558 /// we can return k live results after filtering. This method
1559 /// calculates how many candidates to fetch internally so that
1560 /// after filtering out deleted vectors, we likely have k results.
1561 ///
1562 /// # Formula
1563 ///
1564 /// `adjusted_k = k * total / live`
1565 ///
1566 /// This is equivalent to `k / (1 - tombstone_ratio)` but uses
1567 /// integer arithmetic for precision.
1568 ///
1569 /// # Caps
1570 ///
1571 /// The result is capped at `k * MAX_ADJUSTED_K_MULTIPLIER` (default 10x)
1572 /// to prevent excessive fetching at very high tombstone ratios.
1573 ///
1574 /// # Examples
1575 ///
1576 /// * 0% tombstones: k = k (no adjustment)
1577 /// * 10% tombstones: k → ~1.11x
1578 /// * 30% tombstones: k → ~1.43x
1579 /// * 50% tombstones: k → 2x
1580 /// * 90%+ tombstones: k → 10x (capped)
1581 ///
1582 /// # Thread Safety
1583 ///
1584 /// This method reads `deleted_count` and `node_count()` which may change
1585 /// if the index is mutated. Per RFC-001, the API uses `&mut self` for
1586 /// mutations, so concurrent read+write is prevented by Rust's borrow checker.
1587 /// The design accepts eventual consistency for soft delete semantics.
1588 ///
1589 /// # Arguments
1590 ///
1591 /// * `k` - The requested number of results
1592 ///
1593 /// # Returns
1594 ///
1595 /// The adjusted k value to use for internal search operations.
1596 /// This value is always >= k (unless capped by live_count).
1597 #[must_use]
1598 pub fn adjusted_k(&self, k: usize) -> usize {
1599 // Fast path: no tombstones
1600 if self.deleted_count == 0 {
1601 return k;
1602 }
1603
1604 let total = self.node_count();
1605 let live = self.live_count();
1606
1607 // Edge case: all deleted
1608 // This also prevents division by zero in the calculation below.
1609 if live == 0 {
1610 return k; // Will return empty results anyway
1611 }
1612
1613 // Integer arithmetic: adjusted = k * total / live
1614 // Use saturating ops to prevent overflow
1615 let adjusted = k.saturating_mul(total) / live;
1616
1617 // Cap at MAX_ADJUSTED_K_MULTIPLIER to prevent excessive over-fetching
1618 // Note: We don't cap at total because adjusted_k controls the internal
1619 // search effort (ef parameter), not the final result count.
1620 let max_by_multiplier = k.saturating_mul(Self::MAX_ADJUSTED_K_MULTIPLIER);
1621 adjusted.min(max_by_multiplier)
1622 }
1623
1624 /// Marks a vector as deleted in the storage (legacy API).
1625 ///
1626 /// **DEPRECATED:** Use `soft_delete()` instead for RFC-001 compliant soft delete.
1627 /// This method delegates to storage and does not update `deleted_count`.
1628 ///
1629 /// # Arguments
1630 ///
1631 /// * `id` - The vector ID to delete.
1632 /// * `storage` - The vector storage to update.
1633 ///
1634 /// # Returns
1635 ///
1636 /// `true` if the vector was active and is now deleted.
1637 /// `false` if it was already deleted.
1638 #[deprecated(since = "0.3.0", note = "Use soft_delete() instead")]
1639 pub fn delete_in_storage(&self, id: VectorId, storage: &mut VectorStorage) -> bool {
1640 storage.mark_deleted(id)
1641 }
1642
    /// DEBUG: Print memory stats
    ///
    /// Writes a human-readable summary to stdout: node count, the neighbor
    /// pool's buffer length and capacity, and the total estimate from
    /// `memory_usage()`. Intended for ad-hoc debugging only (uses
    /// `println!`, not the `log` facade).
    pub fn log_stats(&self) {
        println!("Index Stats:");
        println!(" Node Count: {}", self.nodes.len());
        println!(" Neighbor Buffer Len: {}", self.neighbors.buffer.len());
        println!(
            " Neighbor Buffer Cap: {}",
            self.neighbors.buffer.capacity()
        );
        println!(" Total Memory Usage: {} bytes", self.memory_usage());
        // bucket stats are internal to NeighborPool
    }
1655
1656 /// Returns the approximate memory usage in bytes.
1657 #[must_use]
1658 pub fn memory_usage(&self) -> usize {
1659 let nodes_size = self.nodes.capacity() * std::mem::size_of::<HnswNode>();
1660 let neighbors_size = self.neighbors.memory_usage();
1661
1662 std::mem::size_of::<Self>() + nodes_size + neighbors_size
1663 }
1664
1665 // ========================================================================
1666 // Compaction API (W16.4 - RFC-001)
1667 // ========================================================================
1668
1669 /// Check if compaction is recommended.
1670 ///
1671 /// Returns `true` if the tombstone ratio exceeds the configured threshold
1672 /// (default 30%). When this returns true, calling `compact()` is
1673 /// recommended to reclaim space and maintain search performance.
1674 ///
1675 /// # Thread Safety
1676 ///
1677 /// This method is **not thread-safe**. `HnswIndex` is `!Send` and must be
1678 /// accessed from a single thread. For concurrent use, create separate index
1679 /// instances (e.g., via Web Workers in WASM).
1680 ///
1681 /// # Example
1682 ///
1683 /// ```ignore
1684 /// if index.needs_compaction() {
1685 /// let (new_index, new_storage, result) = index.compact(&storage)?;
1686 /// index = new_index;
1687 /// storage = new_storage;
1688 /// }
1689 /// ```
1690 #[must_use]
1691 pub fn needs_compaction(&self) -> bool {
1692 self.tombstone_ratio() > self.compaction_threshold
1693 }
1694
1695 /// Set the compaction threshold.
1696 ///
1697 /// When `tombstone_ratio()` exceeds this value, `needs_compaction()`
1698 /// returns true.
1699 ///
1700 /// # Arguments
1701 ///
1702 /// * `ratio` - Tombstone ratio threshold (0.01 to 0.99)
1703 ///
1704 /// # Default
1705 ///
1706 /// Default is 0.3 (30%). Lower values trigger compaction more often
1707 /// but maintain better search performance.
1708 ///
1709 /// # Clamping
1710 ///
1711 /// Values outside [0.01, 0.99] are clamped to that range.
1712 pub fn set_compaction_threshold(&mut self, ratio: f64) {
1713 self.compaction_threshold = ratio.clamp(0.01, 0.99);
1714 }
1715
1716 /// Get the current compaction threshold.
1717 ///
1718 /// # Returns
1719 ///
1720 /// The ratio at which `needs_compaction()` returns true.
1721 #[must_use]
1722 pub fn compaction_threshold(&self) -> f64 {
1723 self.compaction_threshold
1724 }
1725
1726 /// Get a compaction warning message if compaction is recommended.
1727 ///
1728 /// Returns `Some(message)` if the tombstone ratio exceeds the threshold,
1729 /// or `None` if compaction is not needed.
1730 ///
1731 /// # Example
1732 ///
1733 /// ```ignore
1734 /// if let Some(warning) = index.compaction_warning() {
1735 /// log::warn!("{}", warning);
1736 /// }
1737 /// ```
1738 #[must_use]
1739 pub fn compaction_warning(&self) -> Option<String> {
1740 if self.needs_compaction() {
1741 Some(format!(
1742 "Compaction recommended: tombstone ratio {:.1}% exceeds threshold {:.1}%. \
1743 Call compact() to rebuild index without tombstones.",
1744 self.tombstone_ratio() * 100.0,
1745 self.compaction_threshold * 100.0
1746 ))
1747 } else {
1748 None
1749 }
1750 }
1751
    /// Compact the index by rebuilding without tombstones.
    ///
    /// This operation creates a NEW index and NEW storage containing only
    /// live (non-deleted) vectors. The original index and storage are NOT
    /// modified, allowing the caller to replace them atomically.
    ///
    /// # Important: Vector ID Remapping
    ///
    /// Due to storage design constraints, vector IDs are remapped during
    /// compaction. New IDs are assigned sequentially starting from 1.
    /// If you need to track the mapping, use the returned index to query
    /// by position or content.
    ///
    /// NOTE(review): metadata attached to live vectors is NOT copied into
    /// the new index — both paths below re-insert vector data only, so the
    /// compacted index starts with an empty MetadataStore. Confirm whether
    /// this is intentional per RFC-002 (soft_delete carefully removes only
    /// the deleted vector's metadata, which suggests live metadata is
    /// expected to survive).
    ///
    /// # Returns
    ///
    /// Returns `(new_index, new_storage, result)` tuple. The caller MUST
    /// replace BOTH their index and storage references:
    ///
    /// ```ignore
    /// let (new_index, new_storage, result) = old_index.compact(&old_storage)?;
    /// println!("Removed {} tombstones in {}ms", result.tombstones_removed, result.duration_ms);
    /// // Now use new_index and new_storage, old ones will be dropped
    /// index = new_index;
    /// storage = new_storage;
    /// ```
    ///
    /// # Algorithm
    ///
    /// 1. Collect all live vectors (non-deleted)
    /// 2. Create a new empty index and storage with the same config
    /// 3. Re-insert all live vectors using regular insert()
    /// 4. Return the new pair
    ///
    /// # Performance
    ///
    /// * Time: O(n log n) where n = live vector count
    /// * Space: 2x index size during compaction (temporary)
    ///
    /// # Memory Safety
    ///
    /// * Returns new pair — no storage/index mismatch possible
    /// * On failure, original index/storage unchanged (caller keeps refs)
    /// * Old index/storage are NOT modified — caller drops when ready
    ///
    /// # Warning
    ///
    /// This is a blocking operation. For WASM, consider running
    /// during idle time or on user action.
    pub fn compact(
        &self,
        storage: &VectorStorage,
    ) -> Result<(HnswIndex, VectorStorage, CompactionResult), GraphError> {
        // WASM-compatible timing: Instant is unavailable on wasm32, so the
        // browser Performance API is used there instead.
        #[cfg(not(target_arch = "wasm32"))]
        let start = std::time::Instant::now();
        #[cfg(target_arch = "wasm32")]
        let start_ms = web_sys::window()
            .and_then(|w| w.performance())
            .map(|p| p.now())
            .unwrap_or(0.0);

        let original_deleted = self.deleted_count;
        let original_total = self.node_count();

        // Fast path: no tombstones — return copy.
        // Still O(n log n): it rebuilds by re-insertion because
        // VectorStorage doesn't implement Clone. duration_ms is reported
        // as 0 on this path.
        if original_deleted == 0 {
            // Clone manually since VectorStorage doesn't implement Clone
            let config = self.config.clone();
            let mut new_storage = VectorStorage::new(&config, None);
            let mut new_index = HnswIndex::new(config, &new_storage)?;
            new_index.compaction_threshold = self.compaction_threshold;

            // Re-insert all vectors (this is a rebuild, but preserves order)
            for node in &self.nodes {
                let vec = storage.get_vector(node.vector_id);
                new_index.insert(&vec, &mut new_storage)?;
            }

            return Ok((
                new_index,
                new_storage,
                CompactionResult {
                    tombstones_removed: 0,
                    new_size: original_total,
                    duration_ms: 0,
                },
            ));
        }

        // Collect live vectors' data into owned buffers first, so the
        // rebuild below doesn't borrow `storage` while inserting.
        let live_vectors: Vec<Vec<f32>> = self
            .nodes
            .iter()
            .filter(|node| node.deleted == 0)
            .map(|node| {
                let vec = storage.get_vector(node.vector_id);
                vec.into_owned()
            })
            .collect();

        let new_size = live_vectors.len();

        // Build new index AND new storage with same config
        let config = self.config.clone();
        let mut new_storage = VectorStorage::new(&config, None);
        let mut new_index = HnswIndex::new(config, &new_storage)?;

        // Copy compaction threshold from original
        new_index.compaction_threshold = self.compaction_threshold;

        // Re-insert all live vectors (IDs will be remapped)
        for vector in live_vectors {
            new_index.insert(&vector, &mut new_storage)?;
        }

        // Calculate duration based on target
        #[cfg(not(target_arch = "wasm32"))]
        let duration_ms = start.elapsed().as_millis() as u64;
        #[cfg(target_arch = "wasm32")]
        let duration_ms = web_sys::window()
            .and_then(|w| w.performance())
            .map(|p| (p.now() - start_ms) as u64)
            .unwrap_or(0);

        Ok((
            new_index,
            new_storage,
            CompactionResult {
                tombstones_removed: original_deleted,
                new_size,
                duration_ms,
            },
        ))
    }
1886
1887 /// Insert a vector with a specific ID (validation only).
1888 ///
1889 /// This method validates that the specified ID doesn't conflict with
1890 /// existing IDs, then delegates to the standard `insert()` method.
1891 ///
1892 /// **Important:** Due to storage design constraints where VectorIds must
1893 /// match storage slot indices, the returned VectorId is the one assigned
1894 /// by storage, NOT the requested ID. The `id` parameter is only used
1895 /// for duplicate validation.
1896 ///
1897 /// For actual ID preservation during compaction, the storage would need
1898 /// to support sparse ID assignment, which is not currently implemented.
1899 ///
1900 /// # Arguments
1901 ///
1902 /// * `id` - The specific VectorId to validate (not actually used)
1903 /// * `vector` - The vector data (must match configured dimensions)
1904 /// * `storage` - Mutable reference to vector storage
1905 ///
1906 /// # Returns
1907 ///
1908 /// The VectorId assigned by storage (sequential).
1909 ///
1910 /// # Errors
1911 ///
1912 /// * `InvalidVectorId` - If ID already exists in index or is the sentinel value
1913 /// * `DimensionMismatch` - If vector dimensions don't match config
1914 /// * `Storage` - If storage operation fails
1915 pub fn insert_with_id(
1916 &mut self,
1917 id: VectorId,
1918 vector: &[f32],
1919 storage: &mut VectorStorage,
1920 ) -> Result<VectorId, GraphError> {
1921 // Validate ID is not sentinel
1922 if id == VectorId::INVALID {
1923 return Err(GraphError::InvalidVectorId);
1924 }
1925
1926 // Validate ID doesn't already exist
1927 if self.nodes.iter().any(|n| n.vector_id == id) {
1928 return Err(GraphError::InvalidVectorId);
1929 }
1930
1931 // Validate dimensions
1932 if vector.len() != self.config.dimensions as usize {
1933 return Err(GraphError::DimensionMismatch {
1934 expected: self.config.dimensions as usize,
1935 actual: vector.len(),
1936 });
1937 }
1938
1939 // Delegate to regular insert (which handles all the graph connection logic)
1940 self.insert(vector, storage)
1941 }
1942}
1943
1944/// Trait for providing vector data by ID.
1945pub trait VectorProvider {
1946 /// Returns the vector data for a given ID.
1947 fn get_vector(&self, id: VectorId) -> Cow<'_, [f32]>;
1948
1949 /// Returns true if the vector is marked as deleted.
1950 fn is_deleted(&self, id: VectorId) -> bool {
1951 let _ = id;
1952 false
1953 }
1954
1955 /// Returns the quantized vector data for a given ID, if available.
1956 ///
1957 /// # Returns
1958 ///
1959 /// * `Some(&[u8])` - If the provider supports direct quantized access.
1960 /// * `None` - If not supported or data is not quantized.
1961 fn get_quantized_vector(&self, id: VectorId) -> Option<&[u8]> {
1962 let _ = id;
1963 None
1964 }
1965
1966 /// Quantizes a query vector into the provided output buffer.
1967 ///
1968 /// # Arguments
1969 ///
1970 /// * `query` - The query vector in f32.
1971 /// * `output` - buffer to write quantized data into.
1972 ///
1973 /// # Returns
1974 ///
1975 /// * `Some(&[u8])` - The quantized slice (borrowed from output).
1976 /// * `None` - If quantization is not supported.
1977 fn quantize_query<'a>(&self, query: &[f32], output: &'a mut Vec<u8>) -> Option<&'a [u8]> {
1978 let _ = query;
1979 let _ = output;
1980 None
1981 }
1982}
1983
1984// ============================================================================
1985// Batch Insertion Implementation (W11.1 - RFC 0001 v1.1)
1986// ============================================================================
1987
1988use crate::batch::BatchInsertable;
1989use crate::error::BatchError;
1990
1991impl HnswIndex {
1992 /// Returns the configured dimensionality of the index.
1993 #[must_use]
1994 pub fn dimensions(&self) -> u32 {
1995 self.config.dimensions
1996 }
1997
1998 /// Returns the current number of nodes in the index.
1999 #[must_use]
2000 pub fn len(&self) -> usize {
2001 self.nodes.len()
2002 }
2003
2004 /// Returns true if the index contains no nodes.
2005 #[must_use]
2006 pub fn is_empty(&self) -> bool {
2007 self.nodes.is_empty()
2008 }
2009
2010 /// Returns the maximum capacity of the index (u32::MAX nodes).
2011 #[must_use]
2012 pub const fn capacity(&self) -> usize {
2013 u32::MAX as usize
2014 }
2015
2016 /// Checks if a vector ID already exists in the index.
2017 ///
2018 /// Returns `true` if the ID exists, `false` otherwise.
2019 #[must_use]
2020 pub fn contains_id(&self, id: u64) -> bool {
2021 self.nodes.iter().any(|n| n.vector_id.0 == id)
2022 }
2023
2024 /// Checks if a vector contains invalid floating-point values.
2025 ///
2026 /// Returns `Some(reason)` if invalid, `None` if valid.
2027 fn validate_vector(vector: &[f32]) -> Option<String> {
2028 for (i, &val) in vector.iter().enumerate() {
2029 if val.is_nan() {
2030 return Some(format!("NaN at index {i}"));
2031 }
2032 if val.is_infinite() {
2033 return Some(format!("Infinity at index {i}"));
2034 }
2035 }
2036 None
2037 }
2038}
2039
2040impl BatchInsertable for HnswIndex {
2041 /// Inserts multiple vectors in a single batch operation.
2042 ///
2043 /// This is the full implementation per RFC 0001 v1.2.
2044 ///
2045 /// # Algorithm
2046 ///
2047 /// 1. Collect iterator to Vec (required for progress tracking)
2048 /// 2. Pre-validate first vector's dimensionality (fail-fast)
2049 /// 3. Check capacity constraints
2050 /// 4. Iterate through vectors with progress callbacks
2051 /// 5. Actually insert vectors into HNSW graph via storage
2052 /// 6. Handle errors according to best-effort semantics
2053 ///
2054 /// # Error Handling
2055 ///
2056 /// - **Fatal errors** (abort immediately):
2057 /// - DimensionMismatch on first vector
2058 /// - CapacityExceeded
2059 /// - InternalError (HNSW invariant violated)
2060 ///
2061 /// - **Non-fatal errors** (skip and continue):
2062 /// - DuplicateId (within batch or existing in index)
2063 /// - InvalidVector (NaN, Inf)
2064 /// - DimensionMismatch on subsequent vectors
2065 fn batch_insert<I, F>(
2066 &mut self,
2067 vectors: I,
2068 storage: &mut VectorStorage,
2069 mut progress_callback: Option<F>,
2070 ) -> Result<Vec<u64>, BatchError>
2071 where
2072 I: IntoIterator<Item = (u64, Vec<f32>)>,
2073 F: FnMut(usize, usize),
2074 {
2075 // Step 1: Collect iterator to Vec for progress tracking
2076 let batch: Vec<(u64, Vec<f32>)> = vectors.into_iter().collect();
2077 let total = batch.len();
2078
2079 // Early return for empty batch
2080 if total == 0 {
2081 return Ok(vec![]);
2082 }
2083
2084 // Step 2: Pre-validate first vector's dimensionality (fail-fast)
2085 let expected_dim = self.config.dimensions as usize;
2086 let (first_id, first_vec) = &batch[0];
2087 if first_vec.len() != expected_dim {
2088 return Err(BatchError::DimensionMismatch {
2089 expected: expected_dim,
2090 actual: first_vec.len(),
2091 vector_id: *first_id,
2092 });
2093 }
2094
2095 // Step 3: Check capacity constraints
2096 let current_count = self.nodes.len();
2097 if current_count.saturating_add(total) > self.capacity() {
2098 return Err(BatchError::CapacityExceeded {
2099 current: current_count,
2100 max: self.capacity(),
2101 });
2102 }
2103
2104 // Track IDs we've seen in this batch to detect duplicates within the batch
2105 let mut seen_ids: HashSet<u64> = HashSet::with_capacity(total);
2106
2107 // Track successfully inserted IDs
2108 let mut inserted_ids: Vec<u64> = Vec::with_capacity(total);
2109
2110 // Step 4: Initial progress callback (0%)
2111 if let Some(ref mut callback) = progress_callback {
2112 callback(0, total);
2113 }
2114
2115 // Calculate progress intervals (every 10%)
2116 let progress_interval = if total >= 10 { total / 10 } else { 1 };
2117
2118 // Step 5: Process each vector
2119 for (id, vector) in batch {
2120 // Check for duplicate ID within this batch
2121 if !seen_ids.insert(id) {
2122 // Duplicate within batch - skip (non-fatal)
2123 continue;
2124 }
2125
2126 // Check for ID 0 (reserved sentinel)
2127 if id == 0 {
2128 // Skip invalid ID (non-fatal)
2129 continue;
2130 }
2131
2132 // Check for duplicate ID in existing index [M1 fix]
2133 if self.contains_id(id) {
2134 // Duplicate in existing index - skip (non-fatal)
2135 continue;
2136 }
2137
2138 // Validate dimensionality
2139 if vector.len() != expected_dim {
2140 // Skip dimension mismatch (non-fatal after first vector)
2141 continue;
2142 }
2143
2144 // Validate vector data (NaN, Inf)
2145 if Self::validate_vector(&vector).is_some() {
2146 // Skip invalid vector (non-fatal)
2147 continue;
2148 }
2149
2150 // Step 6: Actually insert the vector into the HNSW graph [C1 fix]
2151 match self.insert(&vector, storage) {
2152 Ok(assigned_id) => {
2153 // Use the ID assigned by the insert method
2154 inserted_ids.push(assigned_id.0);
2155 }
2156 Err(e) => {
2157 // Internal error during HNSW insertion - this is fatal
2158 return Err(BatchError::InternalError {
2159 message: format!("HNSW insert failed for vector {id}: {e}"),
2160 });
2161 }
2162 }
2163
2164 // Progress callback at intervals [M3 fix: report inserted count]
2165 let inserted_count = inserted_ids.len();
2166 if inserted_count % progress_interval == 0 || inserted_count == 1 {
2167 if let Some(ref mut callback) = progress_callback {
2168 callback(inserted_count, total);
2169 }
2170 }
2171 }
2172
2173 // Final progress callback (100%)
2174 if let Some(ref mut callback) = progress_callback {
2175 callback(inserted_ids.len(), total);
2176 }
2177
2178 Ok(inserted_ids)
2179 }
2180}
2181
2182#[cfg(test)]
2183mod tests {
2184 use super::*;
2185
2186 #[test]
2187 fn test_send_sync() {
2188 fn assert_send_sync<T: Send + Sync>() {}
2189 assert_send_sync::<HnswIndex>();
2190 }
2191
    #[test]
    fn test_initialization() {
        let config = HnswConfig::new(128);
        // Create storage with matching dimensions
        let storage = VectorStorage::new(&config, None);

        let index = HnswIndex::new(config.clone(), &storage).unwrap();

        // A fresh index is empty: no nodes, no entry point, only the base layer.
        assert_eq!(index.node_count(), 0);
        assert_eq!(index.entry_point(), None);
        assert_eq!(index.max_layer(), 0);
    }
2204
    #[test]
    fn test_dimension_mismatch() {
        // Index configured for 128 dims over storage configured for 64:
        // construction must fail with a ConfigMismatch naming both values.
        let config_idx = HnswConfig::new(128);
        let config_store = HnswConfig::new(64);
        let storage = VectorStorage::new(&config_store, None);

        let result = HnswIndex::new(config_idx, &storage);
        assert!(matches!(
            result,
            Err(GraphError::ConfigMismatch {
                expected: 64,
                actual: 128
            })
        ));
    }
2220
    #[test]
    fn test_layer_distribution() {
        // Geometric distribution test
        // m=16 => m_l = 1/ln(16) ≈ 0.36
        // Prob(level > 0) = e^(-1/m_l) = 1/M = 1/16
        // We can't strictly test randomness without huge samples, but we can sanity check.
        let config = HnswConfig::new(128);
        let storage = VectorStorage::new(&config, None);
        let mut index = HnswIndex::new(config, &storage).unwrap();

        // Sample 1000 randomly assigned levels from the index's RNG.
        let mut levels = vec![0u8; 1000];
        for l in &mut levels {
            *l = index.get_random_level();
        }

        // Level 0 should be most common
        #[allow(clippy::naive_bytecount)]
        let l0_count = levels.iter().filter(|&&l| l == 0).count();
        assert!(
            l0_count > 800,
            "Level 0 should be dominant (expected ~93% for M=16)"
        );

        // Max level shouldn't be crazy
        let max = *levels.iter().max().unwrap();
        assert!(max < 16, "Level should be reasonable");
    }
2248
    #[test]
    fn test_neighbor_roundtrip() {
        // Exercises set_neighbors/get_neighbors, including shrinking an
        // existing neighbor list (which should recycle pool space).
        let config = HnswConfig::new(128);
        let storage = VectorStorage::new(&config, None);
        let mut index = HnswIndex::new(config, &storage).unwrap();

        let id1 = index.add_node(VectorId(1), 0).unwrap();
        let id2 = index.add_node(VectorId(2), 0).unwrap();
        let id3 = index.add_node(VectorId(3), 0).unwrap();

        // Neighbors: [2, 3]
        let neighbors = vec![id2, id3];
        index.set_neighbors(id1, &neighbors).unwrap();

        {
            let node1 = index.get_node(id1).unwrap();
            let retrieved = index.get_neighbors(node1).unwrap();
            assert_eq!(retrieved, neighbors);
        } // Drop node1 borrow

        // Update neighbors: [3] (shrink)
        let neighbors2 = vec![id3];
        index.set_neighbors(id1, &neighbors2).unwrap();

        {
            let node1 = index.get_node(id1).unwrap();
            let retrieved2 = index.get_neighbors(node1).unwrap();
            assert_eq!(retrieved2, neighbors2);
        }

        // Check if free list got populated (cannot check directly as NeighborPool is private,
        // but we can trust neighbor.rs tests for internal logic)
    }
2282
2283 // ============================================================
2284 // BATCH INSERT TESTS (W11.1 - RFC 0001 v1.2)
2285 // ============================================================
2286
    mod batch_insert_tests {
        use super::*;
        use crate::batch::BatchInsertable;

        // Builds a matching (index, storage) pair with the given dimensionality.
        fn create_test_index_and_storage(dim: u32) -> (HnswIndex, VectorStorage) {
            let config = HnswConfig::new(dim);
            let storage = VectorStorage::new(&config, None);
            let index = HnswIndex::new(config, &storage).unwrap();
            (index, storage)
        }

        // An empty batch succeeds and leaves the index untouched.
        #[test]
        fn test_batch_insert_empty() {
            let (mut index, mut storage) = create_test_index_and_storage(128);
            let vectors: Vec<(u64, Vec<f32>)> = vec![];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            assert!(result.unwrap().is_empty());
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 0);
        }

        #[test]
        fn test_batch_insert_single_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 1);
            // [M2 fix] Verify graph state - node was actually inserted
            assert_eq!(index.node_count(), 1);
        }

        #[test]
        fn test_batch_insert_multiple_vectors() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![5.0, 6.0, 7.0, 8.0]),
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 3);
            // [M2 fix] Verify graph state - all 3 nodes inserted
            assert_eq!(index.node_count(), 3);
        }

        // A bad dimension on the FIRST vector is fatal: whole batch rejected.
        #[test]
        fn test_batch_insert_dimension_mismatch_first_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // First vector has wrong dimension (3 instead of 4)
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0]), // Wrong!
                (2u64, vec![5.0, 6.0, 7.0, 8.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_err());
            match result.unwrap_err() {
                BatchError::DimensionMismatch {
                    expected,
                    actual,
                    vector_id,
                } => {
                    assert_eq!(expected, 4);
                    assert_eq!(actual, 3);
                    assert_eq!(vector_id, 1);
                }
                _ => panic!("Expected DimensionMismatch error"),
            }
            // [M2 fix] Verify no vectors were inserted on fatal error
            assert_eq!(index.node_count(), 0);
        }

        // A bad dimension AFTER the first vector is non-fatal: skipped.
        #[test]
        fn test_batch_insert_dimension_mismatch_later_vector_skipped() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Second vector has wrong dimension - should be skipped
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![5.0, 6.0, 7.0]), // Wrong, but not first
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed with partial results
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify only 2 nodes inserted (one skipped)
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_duplicate_id_within_batch() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Duplicate IDs within the batch
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (1u64, vec![5.0, 6.0, 7.0, 8.0]), // Duplicate!
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the duplicate
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_duplicate_id_in_existing_index() {
            // [M1 fix] Test duplicate check against existing index
            let (mut index, mut storage) = create_test_index_and_storage(4);

            // First batch
            let vectors1 = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];
            let result1 = index.batch_insert(vectors1, &mut storage, None::<fn(usize, usize)>);
            assert!(result1.is_ok());
            assert_eq!(index.node_count(), 1);

            // Second batch with duplicate ID
            let vectors2 = vec![
                (1u64, vec![5.0, 6.0, 7.0, 8.0]), // Duplicate of existing!
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];
            let result2 = index.batch_insert(vectors2, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the duplicate
            assert!(result2.is_ok());
            let ids = result2.unwrap();
            assert_eq!(ids.len(), 1); // Only ID 2 was inserted
            assert_eq!(index.node_count(), 2); // Total 2 nodes now
        }

        #[test]
        fn test_batch_insert_invalid_vector_nan() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::NAN, 6.0, 7.0, 8.0]), // NaN!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the NaN vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_invalid_vector_infinity() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::INFINITY, 6.0, 7.0, 8.0]), // Infinity!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the Infinity vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_invalid_vector_neg_infinity() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::NEG_INFINITY, 6.0, 7.0, 8.0]), // Neg Infinity!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the NegInfinity vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_id_zero_skipped() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // ID 0 is reserved sentinel
            let vectors = vec![
                (0u64, vec![1.0, 2.0, 3.0, 4.0]), // Reserved!
                (1u64, vec![5.0, 6.0, 7.0, 8.0]),
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping ID 0
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        // Progress reporting: first call is (0, total); last reports all inserted.
        #[test]
        fn test_batch_insert_progress_callback_called() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors: Vec<(u64, Vec<f32>)> =
                (1..=10).map(|i| (i as u64, vec![i as f32; 4])).collect();

            let mut progress_calls: Vec<(usize, usize)> = vec![];

            let result = index.batch_insert(
                vectors,
                &mut storage,
                Some(|current, total| {
                    progress_calls.push((current, total));
                }),
            );

            assert!(result.is_ok());
            // [M2 fix] Verify all 10 nodes inserted
            assert_eq!(index.node_count(), 10);

            // Should have called progress at 0% and at intervals
            assert!(!progress_calls.is_empty());

            // First call should be (0, 10)
            assert_eq!(progress_calls[0], (0, 10));

            // Last call should report inserted count (10, 10)
            let last = progress_calls.last().unwrap();
            assert_eq!(*last, (10, 10));
        }

        #[test]
        fn test_batch_insert_progress_callback_single_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];

            let mut progress_calls: Vec<(usize, usize)> = vec![];

            let result = index.batch_insert(
                vectors,
                &mut storage,
                Some(|current, total| {
                    progress_calls.push((current, total));
                }),
            );

            assert!(result.is_ok());
            // [M2 fix] Verify 1 node inserted
            assert_eq!(index.node_count(), 1);

            // Should have at least two calls: 0% and 100%
            assert!(progress_calls.len() >= 2);
            assert_eq!(progress_calls[0], (0, 1));
            assert!(progress_calls.contains(&(1, 1)));
        }

        // All non-fatal skip reasons exercised in a single batch.
        #[test]
        fn test_batch_insert_mixed_errors() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Mix of valid and invalid vectors
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),      // Valid
                (2u64, vec![f32::NAN, 2.0, 3.0, 4.0]), // NaN - skip
                (3u64, vec![3.0, 3.0, 3.0]),           // Wrong dim - skip
                (1u64, vec![4.0, 4.0, 4.0, 4.0]),      // Duplicate - skip
                (0u64, vec![5.0, 5.0, 5.0, 5.0]),      // Reserved ID - skip
                (4u64, vec![6.0, 6.0, 6.0, 6.0]),      // Valid
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_helper_dimensions() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.dimensions(), 128);
        }

        #[test]
        fn test_helper_len_empty() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.len(), 0);
            assert!(index.is_empty());
        }

        #[test]
        fn test_helper_capacity() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.capacity(), u32::MAX as usize);
        }

        #[test]
        fn test_helper_contains_id() {
            let (mut index, mut storage) = create_test_index_and_storage(4);

            // Initially empty
            assert!(!index.contains_id(1));

            // Insert via single insert
            let _ = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage);

            // Now the first ID should exist (insert assigns sequential IDs starting at 1)
            assert!(index.node_count() == 1);
        }

        #[test]
        fn test_validate_vector_valid() {
            let vector = vec![1.0, 2.0, 3.0, 4.0];
            assert!(HnswIndex::validate_vector(&vector).is_none());
        }

        #[test]
        fn test_validate_vector_nan() {
            let vector = vec![1.0, f32::NAN, 3.0, 4.0];
            let result = HnswIndex::validate_vector(&vector);
            assert!(result.is_some());
            assert!(result.unwrap().contains("NaN"));
        }

        #[test]
        fn test_validate_vector_infinity() {
            let vector = vec![1.0, f32::INFINITY, 3.0, 4.0];
            let result = HnswIndex::validate_vector(&vector);
            assert!(result.is_some());
            assert!(result.unwrap().contains("Infinity"));
        }
    }
2646
2647 // ============================================================
2648 // SOFT DELETE TESTS (W16.2 - RFC-001)
2649 // ============================================================
2650
    mod delete_tests {
        use super::*;

        // Builds a 4-dimensional (index, storage) pair for the delete tests.
        fn create_test_index() -> (HnswIndex, VectorStorage) {
            let config = HnswConfig::new(4);
            let storage = VectorStorage::new(&config, None);
            let index = HnswIndex::new(config, &storage).unwrap();
            (index, storage)
        }

        #[test]
        fn test_soft_delete_marks_node() {
            let (mut index, mut storage) = create_test_index();
            let vec = vec![1.0, 2.0, 3.0, 4.0];
            let id = index.insert(&vec, &mut storage).unwrap();

            assert!(!index.is_deleted(id).unwrap());
            assert!(index.soft_delete(id).unwrap());
            assert!(index.is_deleted(id).unwrap());
        }

        // soft_delete returns whether the call changed state; repeats are no-ops.
        #[test]
        fn test_soft_delete_idempotent() {
            let (mut index, mut storage) = create_test_index();
            let vec = vec![1.0, 2.0, 3.0, 4.0];
            let id = index.insert(&vec, &mut storage).unwrap();

            assert!(index.soft_delete(id).unwrap()); // First: true
            assert!(!index.soft_delete(id).unwrap()); // Second: false
            assert_eq!(index.deleted_count(), 1); // Still 1
        }

        #[test]
        fn test_soft_delete_nonexistent_fails() {
            let (mut index, _storage) = create_test_index();
            let result = index.soft_delete(VectorId(999));
            assert!(result.is_err());
            assert!(matches!(result.unwrap_err(), GraphError::InvalidVectorId));
        }

        #[test]
        fn test_deleted_count() {
            let (mut index, mut storage) = create_test_index();

            // Insert 3 vectors
            let id1 = index.insert(&[1.0, 0.0, 0.0, 0.0], &mut storage).unwrap();
            let id2 = index.insert(&[0.0, 1.0, 0.0, 0.0], &mut storage).unwrap();
            let _id3 = index.insert(&[0.0, 0.0, 1.0, 0.0], &mut storage).unwrap();

            assert_eq!(index.deleted_count(), 0);
            assert_eq!(index.node_count(), 3);

            // Delete 2
            index.soft_delete(id1).unwrap();
            index.soft_delete(id2).unwrap();

            assert_eq!(index.deleted_count(), 2);
            assert_eq!(index.live_count(), 1);
        }

        // tombstone_ratio = deleted / total; checked at 0%, 25%, 50%.
        #[test]
        #[allow(clippy::float_cmp)]
        fn test_tombstone_ratio() {
            let (mut index, mut storage) = create_test_index();

            // Empty index
            assert_eq!(index.tombstone_ratio(), 0.0);

            // Insert 4 vectors
            let mut ids = Vec::new();
            #[allow(clippy::cast_precision_loss)]
            for i in 0..4 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            assert!((index.tombstone_ratio() - 0.0).abs() < f64::EPSILON);

            // Delete 1 of 4 = 25%
            index.soft_delete(ids[0]).unwrap();
            assert!((index.tombstone_ratio() - 0.25).abs() < 0.01);

            // Delete 2 of 4 = 50%
            index.soft_delete(ids[1]).unwrap();
            assert!((index.tombstone_ratio() - 0.50).abs() < 0.01);
        }

        #[test]
        fn test_is_deleted_nonexistent_fails() {
            let (index, _storage) = create_test_index();
            let result = index.is_deleted(VectorId(999));
            assert!(result.is_err());
            assert!(matches!(result.unwrap_err(), GraphError::InvalidVectorId));
        }

        #[test]
        fn test_live_count() {
            let (mut index, mut storage) = create_test_index();

            // Insert 5
            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 2
            index.soft_delete(ids[0]).unwrap();
            index.soft_delete(ids[1]).unwrap();

            // node_count includes tombstones; live_count excludes them.
            assert_eq!(index.node_count(), 5);
            assert_eq!(index.deleted_count(), 2);
            assert_eq!(index.live_count(), 3);
        }

        #[test]
        fn test_get_node_by_vector_id_helper() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            // Should find existing node
            let node = index.get_node_by_vector_id(id);
            assert!(node.is_ok());
            assert_eq!(node.unwrap().vector_id, id);

            // Should fail for non-existent
            let bad = index.get_node_by_vector_id(VectorId(999));
            assert!(bad.is_err());
        }

        #[test]
        fn test_deleted_field_is_zero_by_default() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            let node = index.get_node_by_vector_id(id).unwrap();
            assert_eq!(node.deleted, 0);
        }

        #[test]
        fn test_deleted_field_set_to_one_after_delete() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            index.soft_delete(id).unwrap();

            let node = index.get_node_by_vector_id(id).unwrap();
            assert_eq!(node.deleted, 1);
        }

        #[test]
        fn test_delete_invalid_vector_id_zero() {
            let (mut index, _storage) = create_test_index();
            // VectorId(0) is INVALID sentinel
            let result = index.soft_delete(VectorId(0));
            assert!(result.is_err());
        }

        // ============================================================
        // ADJUSTED K TESTS (W16.3)
        // ============================================================

        #[test]
        fn test_adjusted_k_no_tombstones() {
            let (index, _storage) = create_test_index();
            // No deletions -> k unchanged
            assert_eq!(index.adjusted_k(10), 10);
            assert_eq!(index.adjusted_k(1), 1);
            assert_eq!(index.adjusted_k(100), 100);
        }

        #[test]
        fn test_adjusted_k_with_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert 10 vectors
            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 5 of 10 = 50% tombstones
            for id in ids.iter().take(5) {
                index.soft_delete(*id).unwrap();
            }

            // 50% tombstones → 2x multiplier
            // adjusted_k(10) = ceil(10 / 0.5) = 20
            let adjusted = index.adjusted_k(10);
            assert!(
                (18..=22).contains(&adjusted),
                "Expected ~20, got {adjusted}"
            );
        }

        #[test]
        fn test_adjusted_k_capped_at_10x() {
            let (mut index, mut storage) = create_test_index();

            // Insert 100 vectors
            let mut ids = Vec::new();
            for i in 0..100 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 95 of 100 = 95% tombstones
            for id in ids.iter().take(95) {
                index.soft_delete(*id).unwrap();
            }

            // Should cap at 10x, not 20x
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 100, "Should be capped at 10x (100)");
        }

        #[test]
        fn test_adjusted_k_10_percent_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert 10 vectors
            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 1 of 10 = 10% tombstones
            index.soft_delete(ids[0]).unwrap();

            // 10% tombstones → 1.11x multiplier
            // adjusted_k(10) = ceil(10 / 0.9) = ceil(11.11) = 12
            let adjusted = index.adjusted_k(10);
            assert!(
                (11..=13).contains(&adjusted),
                "Expected ~12, got {adjusted}"
            );
        }

        // ============================================================
        // BOUNDARY VALUE TESTS (m3 fix)
        // ============================================================

        #[test]
        fn test_adjusted_k_boundary_zero_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert vectors but delete none (0% tombstones)
            for i in 0..10 {
                index.insert(&[i as f32; 4], &mut storage).unwrap();
            }

            // 0% tombstones → no adjustment
            assert_eq!(index.adjusted_k(5), 5);
            assert_eq!(index.adjusted_k(10), 10);
            assert!((index.tombstone_ratio() - 0.0).abs() < f64::EPSILON);
        }

        #[test]
        fn test_adjusted_k_boundary_50_percent() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete exactly 5 of 10 = 50%
            for id in ids.iter().take(5) {
                index.soft_delete(*id).unwrap();
            }

            // 50% tombstones → 2x
            // adjusted_k(10) = 10 * 10 / 5 = 20
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 20, "50% tombstones should give 2x");
            assert!((index.tombstone_ratio() - 0.5).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_boundary_90_percent() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 9 of 10 = 90%
            for id in ids.iter().take(9) {
                index.soft_delete(*id).unwrap();
            }

            // 90% tombstones → 10x
            // adjusted_k(10) = 10 * 10 / 1 = 100
            // Capped at 10x multiplier = 100
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 100, "90% tombstones should give 10x (capped)");
            assert!((index.tombstone_ratio() - 0.9).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_boundary_all_deleted() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete all = 100% tombstones
            for id in &ids {
                index.soft_delete(*id).unwrap();
            }

            // All deleted → returns k (search will return empty anyway)
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 10, "All deleted should return original k");
            assert_eq!(index.live_count(), 0);
            assert!((index.tombstone_ratio() - 1.0).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_large_k_small_index() {
            let (mut index, mut storage) = create_test_index();

            // Insert only 5 vectors
            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 2 = 40% tombstones
            index.soft_delete(ids[0]).unwrap();
            index.soft_delete(ids[1]).unwrap();

            // Request k=100, but only 5 vectors in index
            // adjusted_k(100) = 100 * 5 / 3 ≈ 166
            // Capped at 10x = 1000
            // Note: adjusted_k controls internal effort, not final result count
            let adjusted = index.adjusted_k(100);
            assert!(
                (166..=168).contains(&adjusted),
                "Should compute ~166 for 40% tombstones, got {adjusted}"
            );
        }

        #[test]
        fn test_adjusted_k_uses_constant() {
            // Verify MAX_ADJUSTED_K_MULTIPLIER is used
            assert_eq!(HnswIndex::MAX_ADJUSTED_K_MULTIPLIER, 10);
        }

        #[test]
        fn test_adjusted_k_integer_precision() {
            // Test that integer arithmetic doesn't lose precision
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..100 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 33 of 100 = 33%
            for id in ids.iter().take(33) {
                index.soft_delete(*id).unwrap();
            }

            // adjusted_k(10) = 10 * 100 / 67 = 14.925... → 14
            let adjusted = index.adjusted_k(10);
            // Should be close to 15 (1.49x)
            assert!(
                (14..=16).contains(&adjusted),
                "33% tombstones: expected ~15, got {adjusted}"
            );
        }
    }
3034}