edgevec/hnsw/graph.rs
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_panics_doc)]

use super::config::HnswConfig;
use super::neighbor::NeighborPool;
use crate::storage::VectorStorage;
use bytemuck::{Pod, Zeroable};
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::vec::Vec;
use thiserror::Error;

/// Unique identifier for a vector in the database.
///
/// # Size
/// 8 bytes, aligned to 8
///
/// # Invariants
/// - IDs are never reused (monotonically increasing)
/// - ID 0 is reserved (invalid sentinel)
///
/// # Safety
/// Derives `Pod` and `Zeroable` for safe byte casting via bytemuck.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Pod, Zeroable)]
#[repr(transparent)]
pub struct VectorId(pub u64);

impl VectorId {
    /// Sentinel value indicating "no vector"
    pub const INVALID: Self = VectorId(0);

    /// First valid ID
    pub const FIRST: Self = VectorId(1);
}

/// Internal node identifier within the HNSW graph.
///
/// # Size
/// 4 bytes, aligned to 4
///
/// # Invariants
/// - `NodeId` corresponds 1:1 with `VectorId` (lower 32 bits)
/// - `NodeId` 0xFFFFFFFF is reserved (invalid sentinel)
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[repr(transparent)]
pub struct NodeId(pub u32);

impl NodeId {
    /// Sentinel value indicating invalid node
    pub const INVALID: Self = NodeId(u32::MAX);
}

/// Represents a layer level in the HNSW graph.
///
/// Layer 0 is the base layer containing all nodes.
/// Higher layers contain a subset of nodes for faster navigation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
#[allow(dead_code)]
pub struct Layer(pub u8);

/// Errors that can occur during graph operations.
#[derive(Error, Debug, Clone, PartialEq, Eq)]
pub enum GraphError {
    /// The graph has reached its maximum node capacity (`u32::MAX`).
    #[error("node capacity exceeded")]
    CapacityExceeded,

    /// The provided `VectorId` is invalid (e.g., the sentinel value).
    #[error("invalid vector id")]
    InvalidVectorId,

    /// Neighbor data is corrupted or an offset is out of bounds.
    #[error("neighbor data corrupted")]
    NeighborError,

    /// Node ID is out of bounds.
    #[error("node id out of bounds")]
    NodeIdOutOfBounds,

    /// Configuration mismatch with storage.
    #[error("config dimension mismatch: expected {expected}, got {actual}")]
    ConfigMismatch {
        /// Expected dimensions.
        expected: u32,
        /// Actual dimensions in config.
        actual: u32,
    },

    /// Query vector has wrong dimensions.
    #[error("dimension mismatch: expected {expected}, got {actual}")]
    DimensionMismatch {
        /// Expected dimensions.
        expected: usize,
        /// Actual dimensions.
        actual: usize,
    },

    /// Storage operation failed.
    #[error("storage error: {0}")]
    Storage(String),

    /// Invalid configuration parameter.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
}

/// Result of a compaction operation.
///
/// Returned by [`HnswIndex::compact()`] to provide metrics about the operation.
///
/// # Fields
///
/// * `tombstones_removed` - Number of deleted vectors removed
/// * `new_size` - Size of the new index (live vectors only)
/// * `duration_ms` - Time taken for the operation in milliseconds
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompactionResult {
    /// Number of tombstones (deleted vectors) removed during compaction.
    pub tombstones_removed: usize,
    /// New index size after compaction (live vectors only).
    pub new_size: usize,
    /// Time taken for the compaction operation in milliseconds.
    pub duration_ms: u64,
}

/// A node in the HNSW graph with its adjacency information.
///
/// # Layout
///
/// Total size: 16 bytes
/// Alignment: 8 bytes
///
/// # Fields
///
/// - `vector_id`: 8 bytes
/// - `neighbor_offset`: 4 bytes
/// - `neighbor_len`: 2 bytes
/// - `max_layer`: 1 byte
/// - `deleted`: 1 byte (soft delete flag, v0.3.0)
///
/// # Soft Delete (v0.3.0)
///
/// The `deleted` field enables O(1) soft delete. Deleted nodes remain in
/// the graph for routing but are excluded from search results.
/// - `deleted = 0`: Node is live (default)
/// - `deleted = 1`: Node is deleted (tombstone)
///
/// # Safety
///
/// Derives `Pod` and `Zeroable` for safe byte casting via bytemuck.
/// The `#[repr(C)]` ensures deterministic layout for serialization.
/// All fields are primitive types with no padding gaps.
#[derive(Clone, Copy, Debug, Serialize, Deserialize, Pod, Zeroable)]
#[repr(C)]
pub struct HnswNode {
    /// The vector ID this node represents
    pub vector_id: VectorId,

    /// Offset into the COMPRESSED neighbor pool
    pub neighbor_offset: u32,

    /// Length of neighbor data in bytes (allocated capacity)
    pub neighbor_len: u16,

    /// The maximum layer this node appears in
    pub max_layer: u8,

    /// Soft delete flag: 0 = live, 1 = deleted (v0.3.0).
    /// This field replaces the padding byte from v0.2.x.
    pub deleted: u8,
}

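// A small compile-time sketch verifying the layout documented above
// (16 bytes total, 8-byte alignment); a `const` assert fails the build on drift.
const _: () = assert!(std::mem::size_of::<HnswNode>() == 16);
const _: () = assert!(std::mem::align_of::<HnswNode>() == 8);
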
/// The HNSW Graph structure managing layers and nodes.
///
/// # Memory
///
/// Uses a flattened representation for cache efficiency.
/// Nodes are stored in a contiguous vector.
///
/// # Soft Delete (v0.3.0)
///
/// Supports soft delete via tombstone marking. Deleted nodes remain
/// in the graph for routing but are excluded from search results.
/// Use `compact()` to reclaim space from deleted nodes.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct HnswIndex {
    /// Algorithm configuration
    pub config: HnswConfig,

    /// Node metadata (fixed-size per node)
    pub(crate) nodes: Vec<HnswNode>,

    /// Compressed neighbor lists
    pub(crate) neighbors: NeighborPool,

    /// Entry point (highest layer node)
    pub(crate) entry_point: Option<NodeId>,

    /// Maximum layer in the graph
    pub(crate) max_layer: u8,

    /// Level probability multiplier (1/ln(M))
    pub(crate) level_mult: f32,

    /// Deterministic RNG state
    rng: ChaCha8Rng,

    /// Count of soft-deleted vectors (v0.3.0)
    /// This tracks the number of nodes with `deleted = 1`.
    #[serde(default)]
    pub(crate) deleted_count: usize,

    /// Compaction threshold ratio (v0.3.0)
    /// When `tombstone_ratio()` exceeds this value, `needs_compaction()` returns true.
    /// Default: 0.3 (30% tombstones triggers a compaction recommendation)
    #[serde(default = "default_compaction_threshold")]
    compaction_threshold: f64,
}

/// Default compaction threshold (30%)
fn default_compaction_threshold() -> f64 {
    0.3
}

impl HnswIndex {
    /// Creates a new empty HNSW graph.
    ///
    /// # Arguments
    ///
    /// * `config` - HNSW configuration parameters.
    /// * `storage` - Vector storage to validate against.
    ///
    /// # Errors
    ///
    /// Returns `GraphError::ConfigMismatch` if storage dimensions differ from config.
    /// Returns `GraphError::InvalidConfig` if configuration parameters are invalid (e.g., M <= 1).
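    ///
    /// # Example
    ///
    /// A construction sketch mirroring `test_initialization` below:
    ///
    /// ```ignore
    /// let config = HnswConfig::new(128);
    /// let storage = VectorStorage::new(&config, None);
    /// let index = HnswIndex::new(config, &storage)?;
    /// assert_eq!(index.node_count(), 0);
    /// ```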
    pub fn new(config: HnswConfig, storage: &VectorStorage) -> Result<Self, GraphError> {
        if config.dimensions != storage.dimensions() {
            return Err(GraphError::ConfigMismatch {
                expected: storage.dimensions(),
                actual: config.dimensions,
            });
        }

        if config.m <= 1 {
            return Err(GraphError::InvalidConfig(format!(
                "m must be > 1, got {}",
                config.m
            )));
        }
        if config.m0 < config.m {
            return Err(GraphError::InvalidConfig(format!(
                "m0 must be >= m, got {} < {}",
                config.m0, config.m
            )));
        }

        // Calculate level multiplier: m_l = 1 / ln(M)
        let m_float = config.m as f32;
        let level_mult = if m_float > 1.0 {
            1.0 / m_float.ln()
        } else {
            0.0
        };

        // Initialize RNG with a default seed for determinism.
        let rng = ChaCha8Rng::seed_from_u64(42);

        Ok(Self {
            config,
            nodes: Vec::new(),
            neighbors: NeighborPool::new(),
            entry_point: None,
            max_layer: 0,
            level_mult,
            rng,
            deleted_count: 0, // v0.3.0: No deleted nodes initially
            compaction_threshold: default_compaction_threshold(), // v0.3.0: Default 30%
        })
    }

    /// Generates a random level for a new node.
    ///
    /// Formula: `floor(-ln(uniform(0,1)) * m_l)`
    /// Clamped to a hard cap of 16 to prevent memory explosion.
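    ///
    /// With the default `M = 16`, `m_l = 1/ln(16) ≈ 0.36`, so a node reaches
    /// level `L` or higher with probability `16^-L` (about 6.25% for `L = 1`;
    /// see `test_layer_distribution` below).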
    #[must_use]
    pub fn get_random_level(&mut self) -> u8 {
        // Generate uniform in (0, 1]; EPSILON avoids ln(0)
        let r: f32 = self.rng.gen_range(f32::EPSILON..=1.0);
        let level = (-r.ln() * self.level_mult).floor();

        // Safety cap at 16
        if level > 16.0 {
            16
        } else {
            level as u8
        }
    }

    /// Adds a node to the graph.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - The external vector identifier
    /// * `max_layer` - The maximum layer for this node
    ///
    /// # Returns
    ///
    /// The new `NodeId` assigned to this node, or a `GraphError`.
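    ///
    /// # Example
    ///
    /// A sketch based on the neighbor round-trip test below:
    ///
    /// ```ignore
    /// let id = index.add_node(VectorId(1), 0)?;
    /// assert_eq!(index.node_count(), 1);
    /// ```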
    pub fn add_node(&mut self, vector_id: VectorId, max_layer: u8) -> Result<NodeId, GraphError> {
        if vector_id == VectorId::INVALID {
            return Err(GraphError::InvalidVectorId);
        }

        // Safety limit for NodeId
        if self.nodes.len() >= u32::MAX as usize {
            return Err(GraphError::CapacityExceeded);
        }

        let node = HnswNode {
            vector_id,
            neighbor_offset: 0,
            neighbor_len: 0,
            max_layer,
            deleted: 0, // Live node (v0.3.0)
        };

        #[allow(clippy::cast_possible_truncation)]
        let id = NodeId(self.nodes.len() as u32);
        self.nodes.push(node);

        // Update max layer if needed
        if max_layer > self.max_layer {
            self.max_layer = max_layer;
        }

        Ok(id)
    }

    /// Sets the neighbors for a node.
    ///
    /// # Arguments
    /// * `node_id` - The node to update.
    /// * `neighbors` - The list of neighbor IDs.
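    ///
    /// # Example
    ///
    /// A round-trip sketch (assumes `id1` and `id2` came from `add_node`):
    ///
    /// ```ignore
    /// index.set_neighbors(id1, &[id2])?;
    /// let node = index.get_node(id1).unwrap();
    /// assert_eq!(index.get_neighbors(node)?, vec![id2]);
    /// ```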
    pub fn set_neighbors(
        &mut self,
        node_id: NodeId,
        neighbors: &[NodeId],
    ) -> Result<(), GraphError> {
        if node_id.0 as usize >= self.nodes.len() {
            return Err(GraphError::NodeIdOutOfBounds);
        }

        // Convert NodeId to u32 for encoding
        let neighbor_u32s: Vec<u32> = neighbors.iter().map(|n| n.0).collect();
        let encoded = NeighborPool::encode_neighbors(&neighbor_u32s);

        // Allocate new space
        let (offset, capacity) = self.neighbors.alloc(encoded.len())?;

        // Write data
        let start = offset as usize;
        let end = start + encoded.len();
        self.neighbors.buffer[start..end].copy_from_slice(&encoded);

        // Update the node and free the old slot
        let node = &mut self.nodes[node_id.0 as usize];

        // Free the old slot if it existed
        if node.neighbor_len > 0 {
            self.neighbors.free(node.neighbor_offset, node.neighbor_len);
        }

        node.neighbor_offset = offset;
        node.neighbor_len = capacity; // Store allocated capacity

        Ok(())
    }

    /// Retrieves a node by its ID.
    #[must_use]
    pub fn get_node(&self, id: NodeId) -> Option<&HnswNode> {
        if id == NodeId::INVALID {
            return None;
        }
        self.nodes.get(id.0 as usize)
    }

    /// Retrieves the neighbors for a node.
    pub fn get_neighbors(&self, node: &HnswNode) -> Result<Vec<NodeId>, GraphError> {
        let start = node.neighbor_offset as usize;
        // Read up to the allocated capacity
        let end = start + node.neighbor_len as usize;

        if end > self.neighbors.buffer.len() {
            return Err(GraphError::NeighborError);
        }

        let slice = &self.neighbors.buffer[start..end];
        let raw_neighbors = NeighborPool::decode_neighbors(slice);

        // Convert back to NodeId
        let neighbors = raw_neighbors.into_iter().map(NodeId).collect();
        Ok(neighbors)
    }

    /// Retrieves the neighbors for a specific layer.
    pub fn get_neighbors_layer(
        &self,
        node: &HnswNode,
        layer: u8,
    ) -> Result<Vec<NodeId>, GraphError> {
        let start = node.neighbor_offset as usize;
        let end = start + node.neighbor_len as usize;

        if end > self.neighbors.buffer.len() {
            return Err(GraphError::NeighborError);
        }

        let slice = &self.neighbors.buffer[start..end];
        let raw_neighbors = NeighborPool::decode_layer(slice, layer);

        // Convert back to NodeId
        let neighbors = raw_neighbors.into_iter().map(NodeId).collect();
        Ok(neighbors)
    }

    /// Returns the number of nodes in the graph.
    #[must_use]
    pub fn node_count(&self) -> usize {
        self.nodes.len()
    }

    /// Returns the entry point node ID, if any.
    #[must_use]
    pub fn entry_point(&self) -> Option<NodeId> {
        self.entry_point
    }

    /// Sets the entry point node ID.
    pub fn set_entry_point(&mut self, id: NodeId) {
        self.entry_point = Some(id);
    }

    /// Returns the current maximum layer in the graph.
    #[must_use]
    pub fn max_layer(&self) -> u8 {
        self.max_layer
    }

    // ========================================================================
    // Soft Delete API (W16.2 - RFC-001)
    // ========================================================================

    /// Get a mutable reference to a node by `VectorId`.
    ///
    /// # Complexity
    ///
    /// * Time: O(n) linear scan
    /// * Space: O(1)
    fn get_node_mut(&mut self, vector_id: VectorId) -> Result<&mut HnswNode, GraphError> {
        self.nodes
            .iter_mut()
            .find(|n| n.vector_id == vector_id)
            .ok_or(GraphError::InvalidVectorId)
    }

    /// Get an immutable reference to a node by `VectorId`.
    ///
    /// # Complexity
    ///
    /// * Time: O(n) linear scan
    /// * Space: O(1)
    fn get_node_by_vector_id(&self, vector_id: VectorId) -> Result<&HnswNode, GraphError> {
        self.nodes
            .iter()
            .find(|n| n.vector_id == vector_id)
            .ok_or(GraphError::InvalidVectorId)
    }

    /// Mark a vector as deleted (soft delete).
    ///
    /// The vector remains in the graph for routing but is excluded from
    /// search results. Space is reclaimed via `compact()`.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - The ID of the vector to delete
    ///
    /// # Returns
    ///
    /// * `Ok(true)` - Vector was deleted
    /// * `Ok(false)` - Vector was already deleted
    /// * `Err(InvalidVectorId)` - Vector ID not found
    ///
    /// # Complexity
    ///
    /// * Time: **O(n)** for the VectorId → Node lookup via linear scan,
    ///   plus O(1) for setting the deleted byte
    /// * Space: O(1)
    ///
    /// **Note:** The O(n) lookup is a known limitation in v0.3.0.
    /// A `HashMap<VectorId, NodeId>` index could provide O(1) lookup
    /// but is deferred to v0.4.0 to avoid scope creep.
    ///
    /// # Thread Safety (RFC-001 Design)
    ///
    /// This method requires `&mut self`, which means Rust's borrow checker
    /// prevents concurrent access at compile time. This is intentional:
    ///
    /// * Search (`&self`) and delete (`&mut self`) cannot run concurrently
    /// * No atomics or locks needed - safety is enforced by the type system
    /// * See RFC-001 Section 6.4 "Design Decisions" for rationale
    ///
    /// # Persistence
    ///
    /// **IMPORTANT:** Delete operations are in-memory only until `save()` is called.
    /// If the process crashes before `save()`, the delete will be lost.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let deleted = index.soft_delete(VectorId(42))?;
    /// assert!(deleted);
    /// assert!(index.is_deleted(VectorId(42))?);
    /// ```
    pub fn soft_delete(&mut self, vector_id: VectorId) -> Result<bool, GraphError> {
        let node = self.get_node_mut(vector_id)?;

        if node.deleted != 0 {
            return Ok(false); // Already deleted
        }

        node.deleted = 1;
        self.deleted_count += 1;
        Ok(true)
    }

    /// Check if a vector is marked as deleted.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - The ID of the vector to check
    ///
    /// # Returns
    ///
    /// * `Ok(true)` - Vector is deleted
    /// * `Ok(false)` - Vector is live
    /// * `Err(InvalidVectorId)` - Vector ID not found
    ///
    /// # Complexity
    ///
    /// * Time: O(n) for lookup + O(1) for the check
    /// * Space: O(1)
    pub fn is_deleted(&self, vector_id: VectorId) -> Result<bool, GraphError> {
        let node = self.get_node_by_vector_id(vector_id)?;
        Ok(node.deleted != 0)
    }

    /// Get the count of deleted (tombstoned) vectors.
    ///
    /// # Returns
    ///
    /// Number of vectors marked as deleted.
    #[must_use]
    pub fn deleted_count(&self) -> usize {
        self.deleted_count
    }

    /// Get the ratio of deleted vectors to total vectors.
    ///
    /// # Returns
    ///
    /// A value between 0.0 and 1.0 representing the tombstone ratio.
    /// Returns 0.0 if the index is empty.
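    ///
    /// # Example
    ///
    /// A sketch matching `test_tombstone_ratio` below (4 vectors, 1 deleted):
    ///
    /// ```ignore
    /// index.soft_delete(ids[0])?;
    /// assert!((index.tombstone_ratio() - 0.25).abs() < 0.01);
    /// ```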
    #[must_use]
    pub fn tombstone_ratio(&self) -> f64 {
        let total = self.node_count();
        if total == 0 {
            return 0.0;
        }
        self.deleted_count as f64 / total as f64
    }

    /// Get the count of live (non-deleted) vectors.
    ///
    /// # Returns
    ///
    /// Number of vectors that are not marked as deleted.
    #[must_use]
    pub fn live_count(&self) -> usize {
        self.node_count().saturating_sub(self.deleted_count)
    }

    /// Maximum multiplier for `adjusted_k` to prevent excessive over-fetching.
    ///
    /// At 90%+ tombstones, we cap at 10x the original k to bound memory usage.
    /// Beyond this ratio, compaction should be triggered.
    pub const MAX_ADJUSTED_K_MULTIPLIER: usize = 10;

    /// Calculate an adjusted k to compensate for tombstones.
    ///
    /// When the index has deleted vectors, we over-fetch to ensure
    /// we can return k live results after filtering. This method
    /// calculates how many candidates to fetch internally so that,
    /// after filtering out deleted vectors, we likely have k results.
    ///
    /// # Formula
    ///
    /// `adjusted_k = k * total / live`
    ///
    /// This is equivalent to `k / (1 - tombstone_ratio)` but stays in
    /// integer arithmetic to avoid floating-point rounding.
    ///
    /// # Caps
    ///
    /// The result is capped at `k * MAX_ADJUSTED_K_MULTIPLIER` (default 10x)
    /// to prevent excessive fetching at very high tombstone ratios.
    ///
    /// # Examples
    ///
    /// * 0% tombstones: k = k (no adjustment)
    /// * 10% tombstones: k → ~1.11x
    /// * 30% tombstones: k → ~1.43x
    /// * 50% tombstones: k → 2x
    /// * 90%+ tombstones: k → 10x (capped)
    ///
    /// # Thread Safety
    ///
    /// This method reads `deleted_count` and `node_count()`, which may change
    /// if the index is mutated. Per RFC-001, the API uses `&mut self` for
    /// mutations, so concurrent read+write is prevented by Rust's borrow checker.
    /// The design accepts eventual consistency for soft delete semantics.
    ///
    /// # Arguments
    ///
    /// * `k` - The requested number of results
    ///
    /// # Returns
    ///
    /// The adjusted k value to use for internal search operations.
    /// This value is always >= k (the multiplier cap never brings it below k).
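    ///
    /// # Example
    ///
    /// A worked sketch (10 nodes, 5 tombstoned, as in the tests below):
    ///
    /// ```ignore
    /// // 50% tombstones: adjusted = 10 * 10 / 5 = 20
    /// assert_eq!(index.adjusted_k(10), 20);
    /// ```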
    #[must_use]
    pub fn adjusted_k(&self, k: usize) -> usize {
        // Fast path: no tombstones
        if self.deleted_count == 0 {
            return k;
        }

        let total = self.node_count();
        let live = self.live_count();

        // Edge case: all deleted.
        // This also prevents division by zero in the calculation below.
        if live == 0 {
            return k; // Will return empty results anyway
        }

        // Integer arithmetic: adjusted = k * total / live
        // Use saturating ops to prevent overflow
        let adjusted = k.saturating_mul(total) / live;

        // Cap at MAX_ADJUSTED_K_MULTIPLIER to prevent excessive over-fetching.
        // Note: We don't cap at total because adjusted_k controls the internal
        // search effort (ef parameter), not the final result count.
        let max_by_multiplier = k.saturating_mul(Self::MAX_ADJUSTED_K_MULTIPLIER);
        adjusted.min(max_by_multiplier)
    }

    /// Marks a vector as deleted in the storage (legacy API).
    ///
    /// **DEPRECATED:** Use `soft_delete()` instead for RFC-001 compliant soft delete.
    /// This method delegates to storage and does not update `deleted_count`.
    ///
    /// # Arguments
    ///
    /// * `id` - The vector ID to delete.
    /// * `storage` - The vector storage to update.
    ///
    /// # Returns
    ///
    /// `true` if the vector was active and is now deleted.
    /// `false` if it was already deleted.
    #[deprecated(since = "0.3.0", note = "Use soft_delete() instead")]
    pub fn delete_in_storage(&self, id: VectorId, storage: &mut VectorStorage) -> bool {
        storage.mark_deleted(id)
    }

    /// DEBUG: Print memory stats
    pub fn log_stats(&self) {
        println!("Index Stats:");
        println!(" Node Count: {}", self.nodes.len());
        println!(" Neighbor Buffer Len: {}", self.neighbors.buffer.len());
        println!(
            " Neighbor Buffer Cap: {}",
            self.neighbors.buffer.capacity()
        );
        println!(" Total Memory Usage: {} bytes", self.memory_usage());
        // Bucket stats are internal to NeighborPool.
    }

    /// Returns the approximate memory usage in bytes.
    #[must_use]
    pub fn memory_usage(&self) -> usize {
        let nodes_size = self.nodes.capacity() * std::mem::size_of::<HnswNode>();
        let neighbors_size = self.neighbors.memory_usage();

        std::mem::size_of::<Self>() + nodes_size + neighbors_size
    }

    // ========================================================================
    // Compaction API (W16.4 - RFC-001)
    // ========================================================================

    /// Check if compaction is recommended.
    ///
    /// Returns `true` if the tombstone ratio exceeds the configured threshold
    /// (default 30%). When this returns true, calling `compact()` is
    /// recommended to reclaim space and maintain search performance.
    ///
    /// # Thread Safety
    ///
    /// This method performs no internal synchronization. Mutations require
    /// `&mut self`, so single-threaded access is enforced by the borrow
    /// checker; for concurrent use, wrap the index in a lock or create
    /// separate index instances (e.g., via Web Workers in WASM).
    ///
    /// # Example
    ///
    /// ```ignore
    /// if index.needs_compaction() {
    ///     let (new_index, new_storage, result) = index.compact(&storage)?;
    ///     index = new_index;
    ///     storage = new_storage;
    /// }
    /// ```
    #[must_use]
    pub fn needs_compaction(&self) -> bool {
        self.tombstone_ratio() > self.compaction_threshold
    }

    /// Set the compaction threshold.
    ///
    /// When `tombstone_ratio()` exceeds this value, `needs_compaction()`
    /// returns true.
    ///
    /// # Arguments
    ///
    /// * `ratio` - Tombstone ratio threshold (0.01 to 0.99)
    ///
    /// # Default
    ///
    /// Default is 0.3 (30%). Lower values trigger compaction more often
    /// but maintain better search performance.
    ///
    /// # Clamping
    ///
    /// Values outside [0.01, 0.99] are clamped to that range.
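    ///
    /// # Example
    ///
    /// A usage sketch:
    ///
    /// ```ignore
    /// index.set_compaction_threshold(0.2); // recommend compaction at 20% tombstones
    /// assert!((index.compaction_threshold() - 0.2).abs() < f64::EPSILON);
    /// ```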
    pub fn set_compaction_threshold(&mut self, ratio: f64) {
        self.compaction_threshold = ratio.clamp(0.01, 0.99);
    }

    /// Get the current compaction threshold.
    ///
    /// # Returns
    ///
    /// The ratio at which `needs_compaction()` returns true.
    #[must_use]
    pub fn compaction_threshold(&self) -> f64 {
        self.compaction_threshold
    }

    /// Get a compaction warning message if compaction is recommended.
    ///
    /// Returns `Some(message)` if the tombstone ratio exceeds the threshold,
    /// or `None` if compaction is not needed.
    ///
    /// # Example
    ///
    /// ```ignore
    /// if let Some(warning) = index.compaction_warning() {
    ///     log::warn!("{}", warning);
    /// }
    /// ```
    #[must_use]
    pub fn compaction_warning(&self) -> Option<String> {
        if self.needs_compaction() {
            Some(format!(
                "Compaction recommended: tombstone ratio {:.1}% exceeds threshold {:.1}%. \
                 Call compact() to rebuild index without tombstones.",
                self.tombstone_ratio() * 100.0,
                self.compaction_threshold * 100.0
            ))
        } else {
            None
        }
    }

    /// Compact the index by rebuilding without tombstones.
    ///
    /// This operation creates a NEW index and NEW storage containing only
    /// live (non-deleted) vectors. The original index and storage are NOT
    /// modified, allowing the caller to replace them atomically.
    ///
    /// # Important: Vector ID Remapping
    ///
    /// Due to storage design constraints, vector IDs are remapped during
    /// compaction. New IDs are assigned sequentially starting from 1.
    /// If you need to track the mapping, use the returned index to query
    /// by position or content.
    ///
    /// # Returns
    ///
    /// Returns a `(new_index, new_storage, result)` tuple. The caller MUST
    /// replace BOTH their index and storage references:
    ///
    /// ```ignore
    /// let (new_index, new_storage, result) = old_index.compact(&old_storage)?;
    /// println!("Removed {} tombstones in {}ms", result.tombstones_removed, result.duration_ms);
    /// // Now use new_index and new_storage; the old ones will be dropped.
    /// index = new_index;
    /// storage = new_storage;
    /// ```
    ///
    /// # Algorithm
    ///
    /// 1. Collect all live (non-deleted) vectors
    /// 2. Create a new empty index and storage with the same config
    /// 3. Re-insert all live vectors using the regular `insert()`
    /// 4. Return the new pair
    ///
    /// # Performance
    ///
    /// * Time: O(n log n) where n = live vector count
    /// * Space: 2x index size during compaction (temporary)
    ///
    /// # Memory Safety
    ///
    /// * Returns a new pair — no storage/index mismatch possible
    /// * On failure, the original index/storage are unchanged (caller keeps refs)
    /// * Old index/storage are NOT modified — caller drops them when ready
    ///
    /// # Warning
    ///
    /// This is a blocking operation. For WASM, consider running it
    /// during idle time or on user action.
    pub fn compact(
        &self,
        storage: &VectorStorage,
    ) -> Result<(HnswIndex, VectorStorage, CompactionResult), GraphError> {
        use std::time::Instant;
        let start = Instant::now();

        let original_deleted = self.deleted_count;
        let original_total = self.node_count();

        // Fast path: no tombstones. Still a full rebuild (VectorStorage does
        // not implement Clone), but insertion order is preserved.
        if original_deleted == 0 {
            let config = self.config.clone();
            let mut new_storage = VectorStorage::new(&config, None);
            let mut new_index = HnswIndex::new(config, &new_storage)?;
            new_index.compaction_threshold = self.compaction_threshold;

            // Re-insert all vectors (this is a rebuild, but preserves order)
            for node in &self.nodes {
                let vec = storage.get_vector(node.vector_id);
                new_index.insert(&vec, &mut new_storage)?;
            }

            return Ok((
                new_index,
                new_storage,
                CompactionResult {
                    tombstones_removed: 0,
                    new_size: original_total,
                    duration_ms: start.elapsed().as_millis() as u64,
                },
            ));
        }

        // Collect the live vectors' data
        let live_vectors: Vec<Vec<f32>> = self
            .nodes
            .iter()
            .filter(|node| node.deleted == 0)
            .map(|node| {
                let vec = storage.get_vector(node.vector_id);
                vec.into_owned()
            })
            .collect();

        let new_size = live_vectors.len();

        // Build a new index AND new storage with the same config
        let config = self.config.clone();
        let mut new_storage = VectorStorage::new(&config, None);
        let mut new_index = HnswIndex::new(config, &new_storage)?;

        // Copy the compaction threshold from the original
        new_index.compaction_threshold = self.compaction_threshold;

        // Re-insert all live vectors (IDs will be remapped)
        for vector in live_vectors {
            new_index.insert(&vector, &mut new_storage)?;
        }

        let duration_ms = start.elapsed().as_millis() as u64;

        Ok((
            new_index,
            new_storage,
            CompactionResult {
                tombstones_removed: original_deleted,
                new_size,
                duration_ms,
            },
        ))
    }

    /// Insert a vector with a specific ID (validation only).
    ///
    /// This method validates that the specified ID doesn't conflict with
    /// existing IDs, then delegates to the standard `insert()` method.
    ///
    /// **Important:** Due to storage design constraints where `VectorId`s must
    /// match storage slot indices, the returned `VectorId` is the one assigned
    /// by storage, NOT the requested ID. The `id` parameter is only used
    /// for duplicate validation.
    ///
    /// For actual ID preservation during compaction, the storage would need
    /// to support sparse ID assignment, which is not currently implemented.
    ///
    /// # Arguments
    ///
    /// * `id` - The specific `VectorId` to validate (used only for duplicate checks)
    /// * `vector` - The vector data (must match configured dimensions)
    /// * `storage` - Mutable reference to vector storage
    ///
    /// # Returns
    ///
    /// The `VectorId` assigned by storage (sequential).
    ///
    /// # Errors
    ///
    /// * `InvalidVectorId` - If the ID already exists in the index or is the sentinel value
    /// * `DimensionMismatch` - If vector dimensions don't match config
    /// * `Storage` - If the storage operation fails
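    ///
    /// # Example
    ///
    /// A sketch; note the returned ID is the storage-assigned one, not the argument:
    ///
    /// ```ignore
    /// let assigned = index.insert_with_id(VectorId(7), &vector, &mut storage)?;
    /// // `assigned` may differ from VectorId(7); the argument only guards duplicates.
    /// ```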
    pub fn insert_with_id(
        &mut self,
        id: VectorId,
        vector: &[f32],
        storage: &mut VectorStorage,
    ) -> Result<VectorId, GraphError> {
        // Validate ID is not the sentinel
        if id == VectorId::INVALID {
            return Err(GraphError::InvalidVectorId);
        }

        // Validate ID doesn't already exist
        if self.nodes.iter().any(|n| n.vector_id == id) {
            return Err(GraphError::InvalidVectorId);
        }

        // Validate dimensions
        if vector.len() != self.config.dimensions as usize {
            return Err(GraphError::DimensionMismatch {
                expected: self.config.dimensions as usize,
                actual: vector.len(),
            });
        }

        // Delegate to regular insert (which handles all the graph connection logic)
        self.insert(vector, storage)
    }
}

/// Trait for providing vector data by ID.
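///
/// # Example
///
/// A minimal in-memory sketch (illustrative only; the default methods
/// already cover deletion and quantization):
///
/// ```ignore
/// struct InMemoryProvider { vectors: Vec<Vec<f32>> }
///
/// impl VectorProvider for InMemoryProvider {
///     fn get_vector(&self, id: VectorId) -> Cow<'_, [f32]> {
///         // IDs start at 1 (VectorId::FIRST), hence the offset.
///         Cow::Borrowed(&self.vectors[(id.0 - 1) as usize])
///     }
/// }
/// ```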
pub trait VectorProvider {
    /// Returns the vector data for a given ID.
    fn get_vector(&self, id: VectorId) -> Cow<'_, [f32]>;

    /// Returns true if the vector is marked as deleted.
    fn is_deleted(&self, id: VectorId) -> bool {
        let _ = id;
        false
    }

    /// Returns the quantized vector data for a given ID, if available.
    ///
    /// # Returns
    ///
    /// * `Some(&[u8])` - If the provider supports direct quantized access.
    /// * `None` - If not supported or the data is not quantized.
    fn get_quantized_vector(&self, id: VectorId) -> Option<&[u8]> {
        let _ = id;
        None
    }

    /// Quantizes a query vector into the provided output buffer.
    ///
    /// # Arguments
    ///
    /// * `query` - The query vector in f32.
    /// * `output` - Buffer to write quantized data into.
    ///
    /// # Returns
    ///
    /// * `Some(&[u8])` - The quantized slice (borrowed from `output`).
    /// * `None` - If quantization is not supported.
    fn quantize_query<'a>(&self, query: &[f32], output: &'a mut Vec<u8>) -> Option<&'a [u8]> {
        let _ = query;
        let _ = output;
        None
    }
}

// ============================================================================
// Batch Insertion Implementation (W11.1 - RFC 0001 v1.1)
// ============================================================================

use crate::batch::BatchInsertable;
use crate::error::BatchError;
use std::collections::HashSet;

impl HnswIndex {
    /// Returns the configured dimensionality of the index.
    #[must_use]
    pub fn dimensions(&self) -> u32 {
        self.config.dimensions
    }

    /// Returns the current number of nodes in the index.
    #[must_use]
    pub fn len(&self) -> usize {
        self.nodes.len()
    }

    /// Returns true if the index contains no nodes.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.nodes.is_empty()
    }

    /// Returns the maximum capacity of the index (`u32::MAX` nodes).
    #[must_use]
    pub const fn capacity(&self) -> usize {
        u32::MAX as usize
    }

    /// Checks if a vector ID already exists in the index.
    ///
    /// Returns `true` if the ID exists, `false` otherwise.
    /// Note: this is an O(n) linear scan over the node table.
    #[must_use]
    pub fn contains_id(&self, id: u64) -> bool {
        self.nodes.iter().any(|n| n.vector_id.0 == id)
    }

    /// Checks if a vector contains invalid floating-point values.
    ///
    /// Returns `Some(reason)` if invalid, `None` if valid.
    fn validate_vector(vector: &[f32]) -> Option<String> {
        for (i, &val) in vector.iter().enumerate() {
            if val.is_nan() {
                return Some(format!("NaN at index {i}"));
            }
            if val.is_infinite() {
                return Some(format!("Infinity at index {i}"));
            }
        }
        None
    }
}

impl BatchInsertable for HnswIndex {
    /// Inserts multiple vectors in a single batch operation.
    ///
    /// This is the full implementation per RFC 0001 v1.2.
    ///
    /// # Algorithm
    ///
    /// 1. Collect the iterator to a Vec (required for progress tracking)
    /// 2. Pre-validate the first vector's dimensionality (fail-fast)
    /// 3. Check capacity constraints
    /// 4. Iterate through vectors with progress callbacks
    /// 5. Insert each vector into the HNSW graph via storage
    /// 6. Handle errors according to best-effort semantics
    ///
    /// # Error Handling
    ///
    /// - **Fatal errors** (abort immediately):
    ///   - `DimensionMismatch` on the first vector
    ///   - `CapacityExceeded`
    ///   - `InternalError` (HNSW invariant violated)
    ///
    /// - **Non-fatal errors** (skip and continue):
    ///   - `DuplicateId` (within the batch or existing in the index)
    ///   - `InvalidVector` (NaN, Inf)
    ///   - `DimensionMismatch` on subsequent vectors
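    ///
    /// # Example
    ///
    /// A usage sketch (assumes a 4-dimensional index with matching storage):
    ///
    /// ```ignore
    /// let vectors = vec![(1u64, vec![0.0; 4]), (2u64, vec![1.0; 4])];
    /// let ids = index.batch_insert(vectors, &mut storage, Some(|done, total| {
    ///     println!("progress: {done}/{total}");
    /// }))?;
    /// assert_eq!(ids.len(), 2);
    /// ```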
    fn batch_insert<I, F>(
        &mut self,
        vectors: I,
        storage: &mut VectorStorage,
        mut progress_callback: Option<F>,
    ) -> Result<Vec<u64>, BatchError>
    where
        I: IntoIterator<Item = (u64, Vec<f32>)>,
        F: FnMut(usize, usize),
    {
        // Step 1: Collect iterator to Vec for progress tracking
        let batch: Vec<(u64, Vec<f32>)> = vectors.into_iter().collect();
        let total = batch.len();

        // Early return for empty batch
        if total == 0 {
            return Ok(vec![]);
        }

        // Step 2: Pre-validate first vector's dimensionality (fail-fast)
        let expected_dim = self.config.dimensions as usize;
        let (first_id, first_vec) = &batch[0];
        if first_vec.len() != expected_dim {
            return Err(BatchError::DimensionMismatch {
                expected: expected_dim,
                actual: first_vec.len(),
                vector_id: *first_id,
            });
        }

        // Step 3: Check capacity constraints
        let current_count = self.nodes.len();
        if current_count.saturating_add(total) > self.capacity() {
            return Err(BatchError::CapacityExceeded {
                current: current_count,
                max: self.capacity(),
            });
        }

        // Track IDs we've seen in this batch to detect duplicates within the batch
        let mut seen_ids: HashSet<u64> = HashSet::with_capacity(total);

        // Track successfully inserted IDs
        let mut inserted_ids: Vec<u64> = Vec::with_capacity(total);

        // Step 4: Initial progress callback (0%)
        if let Some(ref mut callback) = progress_callback {
            callback(0, total);
        }

        // Calculate progress intervals (every 10%)
        let progress_interval = if total >= 10 { total / 10 } else { 1 };

        // Step 5: Process each vector
        for (id, vector) in batch {
            // Check for duplicate ID within this batch
            if !seen_ids.insert(id) {
                // Duplicate within batch - skip (non-fatal)
                continue;
            }

            // Check for ID 0 (reserved sentinel)
            if id == 0 {
                // Skip invalid ID (non-fatal)
                continue;
            }

            // Check for duplicate ID in existing index [M1 fix]
            if self.contains_id(id) {
                // Duplicate in existing index - skip (non-fatal)
                continue;
            }

            // Validate dimensionality
            if vector.len() != expected_dim {
                // Skip dimension mismatch (non-fatal after first vector)
                continue;
            }

            // Validate vector data (NaN, Inf)
            if Self::validate_vector(&vector).is_some() {
                // Skip invalid vector (non-fatal)
                continue;
            }

            // Step 6: Actually insert the vector into the HNSW graph [C1 fix]
            match self.insert(&vector, storage) {
                Ok(assigned_id) => {
                    // Use the ID assigned by the insert method
                    inserted_ids.push(assigned_id.0);
                }
                Err(e) => {
                    // Internal error during HNSW insertion - this is fatal
                    return Err(BatchError::InternalError {
                        message: format!("HNSW insert failed for vector {id}: {e}"),
                    });
                }
            }

            // Progress callback at intervals [M3 fix: report inserted count]
            let inserted_count = inserted_ids.len();
            if inserted_count % progress_interval == 0 || inserted_count == 1 {
                if let Some(ref mut callback) = progress_callback {
                    callback(inserted_count, total);
                }
            }
        }

        // Final progress callback (100%)
        if let Some(ref mut callback) = progress_callback {
            callback(inserted_ids.len(), total);
        }

        Ok(inserted_ids)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<HnswIndex>();
    }

    #[test]
    fn test_initialization() {
        let config = HnswConfig::new(128);
        // Create storage with matching dimensions
        let storage = VectorStorage::new(&config, None);

        let index = HnswIndex::new(config.clone(), &storage).unwrap();

        assert_eq!(index.node_count(), 0);
        assert_eq!(index.entry_point(), None);
        assert_eq!(index.max_layer(), 0);
    }

    #[test]
    fn test_dimension_mismatch() {
        let config_idx = HnswConfig::new(128);
        let config_store = HnswConfig::new(64);
        let storage = VectorStorage::new(&config_store, None);

        let result = HnswIndex::new(config_idx, &storage);
        assert!(matches!(
            result,
            Err(GraphError::ConfigMismatch {
                expected: 64,
                actual: 128
            })
        ));
    }

    #[test]
    fn test_layer_distribution() {
        // Geometric distribution test
        // m=16 => m_l = 1/ln(16) ≈ 0.36
        // Prob(level > 0) = e^(-1/m_l) = 1/M = 1/16
        // We can't strictly test randomness without huge samples, but we can sanity check.
        let config = HnswConfig::new(128);
        let storage = VectorStorage::new(&config, None);
        let mut index = HnswIndex::new(config, &storage).unwrap();

        let mut levels = vec![0u8; 1000];
        for l in levels.iter_mut() {
            *l = index.get_random_level();
        }

        // Level 0 should be most common
        let l0_count = levels.iter().filter(|&&l| l == 0).count();
        assert!(
            l0_count > 800,
            "Level 0 should be dominant (expected ~93% for M=16)"
        );

        // Max level shouldn't be crazy
        let max = *levels.iter().max().unwrap();
        assert!(max < 16, "Level should be reasonable");
    }

    #[test]
    fn test_neighbor_roundtrip() {
        let config = HnswConfig::new(128);
        let storage = VectorStorage::new(&config, None);
        let mut index = HnswIndex::new(config, &storage).unwrap();

        let id1 = index.add_node(VectorId(1), 0).unwrap();
        let id2 = index.add_node(VectorId(2), 0).unwrap();
        let id3 = index.add_node(VectorId(3), 0).unwrap();

        // Neighbors: [2, 3]
        let neighbors = vec![id2, id3];
        index.set_neighbors(id1, &neighbors).unwrap();

        {
            let node1 = index.get_node(id1).unwrap();
            let retrieved = index.get_neighbors(node1).unwrap();
            assert_eq!(retrieved, neighbors);
        } // Drop node1 borrow

        // Update neighbors: [3] (shrink)
        let neighbors2 = vec![id3];
        index.set_neighbors(id1, &neighbors2).unwrap();

        {
            let node1 = index.get_node(id1).unwrap();
            let retrieved2 = index.get_neighbors(node1).unwrap();
            assert_eq!(retrieved2, neighbors2);
        }

        // Check if free list got populated (cannot check directly as NeighborPool is private,
        // but we can trust neighbor.rs tests for internal logic)
    }

    // ============================================================
    // BATCH INSERT TESTS (W11.1 - RFC 0001 v1.2)
    // ============================================================

    mod batch_insert_tests {
        use super::*;
        use crate::batch::BatchInsertable;

        fn create_test_index_and_storage(dim: u32) -> (HnswIndex, VectorStorage) {
            let config = HnswConfig::new(dim);
            let storage = VectorStorage::new(&config, None);
            let index = HnswIndex::new(config, &storage).unwrap();
            (index, storage)
        }

        #[test]
        fn test_batch_insert_empty() {
            let (mut index, mut storage) = create_test_index_and_storage(128);
            let vectors: Vec<(u64, Vec<f32>)> = vec![];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            assert!(result.unwrap().is_empty());
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 0);
        }

        #[test]
        fn test_batch_insert_single_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 1);
            // [M2 fix] Verify graph state - node was actually inserted
            assert_eq!(index.node_count(), 1);
        }

        #[test]
        fn test_batch_insert_multiple_vectors() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![5.0, 6.0, 7.0, 8.0]),
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 3);
            // [M2 fix] Verify graph state - all 3 nodes inserted
            assert_eq!(index.node_count(), 3);
        }

        #[test]
        fn test_batch_insert_dimension_mismatch_first_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // First vector has wrong dimension (3 instead of 4)
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0]), // Wrong!
                (2u64, vec![5.0, 6.0, 7.0, 8.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_err());
            match result.unwrap_err() {
                BatchError::DimensionMismatch {
                    expected,
                    actual,
                    vector_id,
                } => {
                    assert_eq!(expected, 4);
                    assert_eq!(actual, 3);
                    assert_eq!(vector_id, 1);
                }
                _ => panic!("Expected DimensionMismatch error"),
            }
            // [M2 fix] Verify no vectors were inserted on fatal error
            assert_eq!(index.node_count(), 0);
        }

        #[test]
        fn test_batch_insert_dimension_mismatch_later_vector_skipped() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Second vector has wrong dimension - should be skipped
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![5.0, 6.0, 7.0]), // Wrong, but not first
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed with partial results
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify only 2 nodes inserted (one skipped)
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_duplicate_id_within_batch() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Duplicate IDs within the batch
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (1u64, vec![5.0, 6.0, 7.0, 8.0]), // Duplicate!
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the duplicate
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_duplicate_id_in_existing_index() {
            // [M1 fix] Test duplicate check against existing index
            let (mut index, mut storage) = create_test_index_and_storage(4);

            // First batch
            let vectors1 = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];
            let result1 = index.batch_insert(vectors1, &mut storage, None::<fn(usize, usize)>);
            assert!(result1.is_ok());
            assert_eq!(index.node_count(), 1);

            // Second batch with duplicate ID
            let vectors2 = vec![
                (1u64, vec![5.0, 6.0, 7.0, 8.0]), // Duplicate of existing!
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];
            let result2 = index.batch_insert(vectors2, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the duplicate
            assert!(result2.is_ok());
            let ids = result2.unwrap();
            assert_eq!(ids.len(), 1); // Only ID 2 was inserted
            assert_eq!(index.node_count(), 2); // Total 2 nodes now
        }

        #[test]
        fn test_batch_insert_invalid_vector_nan() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::NAN, 6.0, 7.0, 8.0]), // NaN!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the NaN vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_invalid_vector_infinity() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::INFINITY, 6.0, 7.0, 8.0]), // Infinity!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the Infinity vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_invalid_vector_neg_infinity() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::NEG_INFINITY, 6.0, 7.0, 8.0]), // Neg Infinity!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the NegInfinity vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_id_zero_skipped() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // ID 0 is the reserved sentinel
            let vectors = vec![
                (0u64, vec![1.0, 2.0, 3.0, 4.0]), // Reserved!
                (1u64, vec![5.0, 6.0, 7.0, 8.0]),
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping ID 0
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_progress_callback_called() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors: Vec<(u64, Vec<f32>)> =
                (1..=10).map(|i| (i as u64, vec![i as f32; 4])).collect();

            let mut progress_calls: Vec<(usize, usize)> = vec![];

            let result = index.batch_insert(
                vectors,
                &mut storage,
                Some(|current, total| {
                    progress_calls.push((current, total));
                }),
            );

            assert!(result.is_ok());
            // [M2 fix] Verify all 10 nodes inserted
            assert_eq!(index.node_count(), 10);

            // Should have called progress at 0% and at intervals
            assert!(!progress_calls.is_empty());

            // First call should be (0, 10)
            assert_eq!(progress_calls[0], (0, 10));

            // Last call should report the inserted count (10, 10)
            let last = progress_calls.last().unwrap();
            assert_eq!(*last, (10, 10));
        }

        #[test]
        fn test_batch_insert_progress_callback_single_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];

            let mut progress_calls: Vec<(usize, usize)> = vec![];

            let result = index.batch_insert(
                vectors,
                &mut storage,
                Some(|current, total| {
                    progress_calls.push((current, total));
                }),
            );

            assert!(result.is_ok());
            // [M2 fix] Verify 1 node inserted
            assert_eq!(index.node_count(), 1);

            // Should have at least two calls: 0% and 100%
            assert!(progress_calls.len() >= 2);
            assert_eq!(progress_calls[0], (0, 1));
            assert!(progress_calls.contains(&(1, 1)));
        }

        #[test]
        fn test_batch_insert_mixed_errors() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Mix of valid and invalid vectors
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),      // Valid
                (2u64, vec![f32::NAN, 2.0, 3.0, 4.0]), // NaN - skip
                (3u64, vec![3.0, 3.0, 3.0]),           // Wrong dim - skip
                (1u64, vec![4.0, 4.0, 4.0, 4.0]),      // Duplicate - skip
                (0u64, vec![5.0, 5.0, 5.0, 5.0]),      // Reserved ID - skip
                (4u64, vec![6.0, 6.0, 6.0, 6.0]),      // Valid
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_helper_dimensions() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.dimensions(), 128);
        }

        #[test]
        fn test_helper_len_empty() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.len(), 0);
            assert!(index.is_empty());
        }

        #[test]
        fn test_helper_capacity() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.capacity(), u32::MAX as usize);
        }

        #[test]
        fn test_helper_contains_id() {
            let (mut index, mut storage) = create_test_index_and_storage(4);

            // Initially empty
            assert!(!index.contains_id(1));

            // Insert via single insert
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            // The assigned ID should now be reported as present
            // (insert assigns sequential IDs starting at 1)
            assert_eq!(index.node_count(), 1);
            assert!(index.contains_id(id.0));
        }

1663 #[test]
1664 fn test_validate_vector_valid() {
1665 let vector = vec![1.0, 2.0, 3.0, 4.0];
1666 assert!(HnswIndex::validate_vector(&vector).is_none());
1667 }
1668
1669 #[test]
1670 fn test_validate_vector_nan() {
1671 let vector = vec![1.0, f32::NAN, 3.0, 4.0];
1672 let result = HnswIndex::validate_vector(&vector);
1673 assert!(result.is_some());
1674 assert!(result.unwrap().contains("NaN"));
1675 }
1676
1677 #[test]
1678 fn test_validate_vector_infinity() {
1679 let vector = vec![1.0, f32::INFINITY, 3.0, 4.0];
1680 let result = HnswIndex::validate_vector(&vector);
1681 assert!(result.is_some());
1682 assert!(result.unwrap().contains("Infinity"));
1683 }
1684 }
1685
1686 // ============================================================
1687 // SOFT DELETE TESTS (W16.2 - RFC-001)
1688 // ============================================================
1689
1690 mod delete_tests {
1691 use super::*;
1692
1693 fn create_test_index() -> (HnswIndex, VectorStorage) {
1694 let config = HnswConfig::new(4);
1695 let storage = VectorStorage::new(&config, None);
1696 let index = HnswIndex::new(config, &storage).unwrap();
1697 (index, storage)
1698 }
1699
1700 #[test]
1701 fn test_soft_delete_marks_node() {
1702 let (mut index, mut storage) = create_test_index();
1703 let vec = vec![1.0, 2.0, 3.0, 4.0];
1704 let id = index.insert(&vec, &mut storage).unwrap();
1705
1706 assert!(!index.is_deleted(id).unwrap());
1707 assert!(index.soft_delete(id).unwrap());
1708 assert!(index.is_deleted(id).unwrap());
1709 }
1710
1711 #[test]
1712 fn test_soft_delete_idempotent() {
1713 let (mut index, mut storage) = create_test_index();
1714 let vec = vec![1.0, 2.0, 3.0, 4.0];
1715 let id = index.insert(&vec, &mut storage).unwrap();
1716
1717 assert!(index.soft_delete(id).unwrap()); // First: true
1718 assert!(!index.soft_delete(id).unwrap()); // Second: false
1719 assert_eq!(index.deleted_count(), 1); // Still 1
1720 }
1721
1722 #[test]
1723 fn test_soft_delete_nonexistent_fails() {
1724 let (mut index, _storage) = create_test_index();
1725 let result = index.soft_delete(VectorId(999));
1726 assert!(result.is_err());
1727 assert!(matches!(result.unwrap_err(), GraphError::InvalidVectorId));
1728 }
1729
1730 #[test]
1731 fn test_deleted_count() {
1732 let (mut index, mut storage) = create_test_index();
1733
1734 // Insert 3 vectors
1735 let id1 = index.insert(&[1.0, 0.0, 0.0, 0.0], &mut storage).unwrap();
1736 let id2 = index.insert(&[0.0, 1.0, 0.0, 0.0], &mut storage).unwrap();
1737 let _id3 = index.insert(&[0.0, 0.0, 1.0, 0.0], &mut storage).unwrap();
1738
1739 assert_eq!(index.deleted_count(), 0);
1740 assert_eq!(index.node_count(), 3);
1741
1742 // Delete 2
1743 index.soft_delete(id1).unwrap();
1744 index.soft_delete(id2).unwrap();
1745
1746 assert_eq!(index.deleted_count(), 2);
1747 assert_eq!(index.live_count(), 1);
1748 }
1749
1750 #[test]
1751 fn test_tombstone_ratio() {
1752 let (mut index, mut storage) = create_test_index();
1753
1754 // Empty index
1755 assert_eq!(index.tombstone_ratio(), 0.0);
1756
1757 // Insert 4 vectors
1758 let mut ids = Vec::new();
1759 for i in 0..4 {
1760 let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
1761 ids.push(id);
1762 }
1763
1764 assert_eq!(index.tombstone_ratio(), 0.0);
1765
1766 // Delete 1 of 4 = 25%
1767 index.soft_delete(ids[0]).unwrap();
1768 assert!((index.tombstone_ratio() - 0.25).abs() < 0.01);
1769
1770 // Delete 2 of 4 = 50%
1771 index.soft_delete(ids[1]).unwrap();
1772 assert!((index.tombstone_ratio() - 0.50).abs() < 0.01);
1773 }

        #[test]
        fn test_is_deleted_nonexistent_fails() {
            let (index, _storage) = create_test_index();
            let result = index.is_deleted(VectorId(999));
            assert!(result.is_err());
            assert!(matches!(result.unwrap_err(), GraphError::InvalidVectorId));
        }

        #[test]
        fn test_live_count() {
            let (mut index, mut storage) = create_test_index();

            // Insert 5
            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 2
            index.soft_delete(ids[0]).unwrap();
            index.soft_delete(ids[1]).unwrap();

            assert_eq!(index.node_count(), 5);
            assert_eq!(index.deleted_count(), 2);
            assert_eq!(index.live_count(), 3);
        }

        #[test]
        fn test_get_node_by_vector_id_helper() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            // Should find existing node
            let node = index.get_node_by_vector_id(id);
            assert!(node.is_ok());
            assert_eq!(node.unwrap().vector_id, id);

            // Should fail for non-existent
            let bad = index.get_node_by_vector_id(VectorId(999));
            assert!(bad.is_err());
        }

        #[test]
        fn test_deleted_field_is_zero_by_default() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            let node = index.get_node_by_vector_id(id).unwrap();
            assert_eq!(node.deleted, 0);
        }

        #[test]
        fn test_deleted_field_set_to_one_after_delete() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            index.soft_delete(id).unwrap();

            let node = index.get_node_by_vector_id(id).unwrap();
            assert_eq!(node.deleted, 1);
        }

        #[test]
        fn test_delete_invalid_vector_id_zero() {
            let (mut index, _storage) = create_test_index();
            // VectorId(0) is the INVALID sentinel
            let result = index.soft_delete(VectorId(0));
            assert!(result.is_err());
        }

        // ============================================================
        // ADJUSTED K TESTS (W16.3)
        // ============================================================

        #[test]
        fn test_adjusted_k_no_tombstones() {
            let (index, _storage) = create_test_index();
            // No deletions -> k unchanged
            assert_eq!(index.adjusted_k(10), 10);
            assert_eq!(index.adjusted_k(1), 1);
            assert_eq!(index.adjusted_k(100), 100);
        }

        #[test]
        fn test_adjusted_k_with_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert 10 vectors
            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 5 of 10 = 50% tombstones
            for &id in &ids[..5] {
                index.soft_delete(id).unwrap();
            }

            // 50% tombstones → 2x multiplier
            // adjusted_k(10) = ceil(10 / 0.5) = 20
            let adjusted = index.adjusted_k(10);
            assert!(
                (18..=22).contains(&adjusted),
                "Expected ~20, got {adjusted}"
            );
        }
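
        // A sketch of the adjustment the comments in these tests describe;
        // hypothetical, not the actual `HnswIndex::adjusted_k` (the rounding
        // mode and zero-live handling here are assumptions): scale k by
        // total/live nodes, then cap at 10x (MAX_ADJUSTED_K_MULTIPLIER).
        #[allow(dead_code)]
        fn adjusted_k_sketch(k: usize, node_count: usize, live_count: usize) -> usize {
            if live_count == 0 {
                return k; // all deleted: search returns empty anyway
            }
            // ceil(k * node_count / live_count), in integer arithmetic
            let scaled = (k * node_count).div_ceil(live_count);
            scaled.min(k * 10) // cap at the 10x multiplier
        }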

        #[test]
        fn test_adjusted_k_capped_at_10x() {
            let (mut index, mut storage) = create_test_index();

            // Insert 100 vectors
            let mut ids = Vec::new();
            for i in 0..100 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 95 of 100 = 95% tombstones
            for &id in &ids[..95] {
                index.soft_delete(id).unwrap();
            }

            // Should cap at 10x, not 20x
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 100, "Should be capped at 10x (100)");
        }
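
        // Worked arithmetic for the cap above (using the sketch formula, an
        // assumption): raw = ceil(10 * 100 / 5) = 200, i.e. a 20x multiplier,
        // which the cap clamps to 10 * 10 = 100.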

        #[test]
        fn test_adjusted_k_10_percent_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert 10 vectors
            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 1 of 10 = 10% tombstones
            index.soft_delete(ids[0]).unwrap();

            // 10% tombstones → ~1.11x multiplier
            // adjusted_k(10) = ceil(10 / 0.9) = ceil(11.11) = 12
            let adjusted = index.adjusted_k(10);
            assert!(
                (11..=13).contains(&adjusted),
                "Expected ~12, got {adjusted}"
            );
        }

        // ============================================================
        // BOUNDARY VALUE TESTS (m3 fix)
        // ============================================================

        #[test]
        fn test_adjusted_k_boundary_zero_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert vectors but delete none (0% tombstones)
            for i in 0..10 {
                index.insert(&[i as f32; 4], &mut storage).unwrap();
            }

            // 0% tombstones → no adjustment
            assert_eq!(index.adjusted_k(5), 5);
            assert_eq!(index.adjusted_k(10), 10);
            assert_eq!(index.tombstone_ratio(), 0.0);
        }

        #[test]
        fn test_adjusted_k_boundary_50_percent() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete exactly 5 of 10 = 50%
            for &id in &ids[..5] {
                index.soft_delete(id).unwrap();
            }

            // 50% tombstones → 2x
            // adjusted_k(10) = 10 * 10 / 5 = 20
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 20, "50% tombstones should give 2x");
            assert!((index.tombstone_ratio() - 0.5).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_boundary_90_percent() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 9 of 10 = 90%
            for &id in &ids[..9] {
                index.soft_delete(id).unwrap();
            }

            // 90% tombstones → 10x
            // adjusted_k(10) = 10 * 10 / 1 = 100, which exactly equals the
            // 10x cap, so capping does not change the result
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 100, "90% tombstones should give 10x (capped)");
            assert!((index.tombstone_ratio() - 0.9).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_boundary_all_deleted() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete all = 100% tombstones
            for id in &ids {
                index.soft_delete(*id).unwrap();
            }

            // All deleted → returns k (search will return empty anyway)
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 10, "All deleted should return original k");
            assert_eq!(index.live_count(), 0);
            assert!((index.tombstone_ratio() - 1.0).abs() < 0.01);
        }
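
        // The live_count() == 0 branch avoids a division by zero in the
        // scale factor; returning the caller's k unchanged is the assumed
        // behavior the test above pins down.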

        #[test]
        fn test_adjusted_k_large_k_small_index() {
            let (mut index, mut storage) = create_test_index();

            // Insert only 5 vectors
            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 2 = 40% tombstones
            index.soft_delete(ids[0]).unwrap();
            index.soft_delete(ids[1]).unwrap();

            // Request k=100, but only 5 vectors in index
            // adjusted_k(100) = 100 * 5 / 3 ≈ 166, well below the 10x cap
            // (1000), so the cap does not bind here.
            // Note: adjusted_k controls internal search effort, not the final
            // result count, so exceeding the index size is fine.
            let adjusted = index.adjusted_k(100);
            assert!(
                (166..=168).contains(&adjusted),
                "Should compute ~166 for 40% tombstones, got {adjusted}"
            );
        }

        #[test]
        fn test_adjusted_k_uses_constant() {
            // Verify MAX_ADJUSTED_K_MULTIPLIER is exposed with the expected value
            assert_eq!(HnswIndex::MAX_ADJUSTED_K_MULTIPLIER, 10);
        }
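
        // The constant feeds the cap exercised in the tests above: the
        // adjusted k never exceeds MAX_ADJUSTED_K_MULTIPLIER * k, however
        // high the tombstone ratio climbs.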

        #[test]
        fn test_adjusted_k_integer_precision() {
            // Test that integer arithmetic doesn't lose precision
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..100 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 33 of 100 = 33%
            for &id in &ids[..33] {
                index.soft_delete(id).unwrap();
            }

            // adjusted_k(10) = 10 * 100 / 67 ≈ 14.93 (a ~1.49x multiplier);
            // floor gives 14 and ceil gives 15, so accept either rounding
            let adjusted = index.adjusted_k(10);
            assert!(
                (14..=16).contains(&adjusted),
                "33% tombstones: expected ~15, got {adjusted}"
            );
        }
    }
}