edgevec/hnsw/graph.rs
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_panics_doc)]

use super::config::HnswConfig;
use super::neighbor::NeighborPool;
use crate::storage::VectorStorage;
use bytemuck::{Pod, Zeroable};
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::vec::Vec;
use thiserror::Error;

/// Unique identifier for a vector in the database.
///
/// # Size
/// 8 bytes, aligned to 8
///
/// # Invariants
/// - IDs are never reused (monotonically increasing)
/// - ID 0 is reserved (invalid sentinel)
///
/// # Safety
/// Derives `Pod` and `Zeroable` for safe byte casting via bytemuck.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Pod, Zeroable)]
#[repr(transparent)]
pub struct VectorId(pub u64);

impl VectorId {
    /// Sentinel value indicating "no vector"
    pub const INVALID: Self = VectorId(0);

    /// First valid ID
    pub const FIRST: Self = VectorId(1);
}

/// Internal node identifier within the HNSW graph.
///
/// # Size
/// 4 bytes, aligned to 4
///
/// # Invariants
/// - `NodeId` corresponds 1:1 with `VectorId` (lower 32 bits)
/// - `NodeId` 0xFFFFFFFF is reserved (invalid sentinel)
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[repr(transparent)]
pub struct NodeId(pub u32);

impl NodeId {
    /// Sentinel value indicating invalid node
    pub const INVALID: Self = NodeId(u32::MAX);
}

/// Represents a layer level in the HNSW graph.
///
/// Layer 0 is the base layer containing all nodes.
/// Higher layers contain a subset of nodes for faster navigation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
#[allow(dead_code)]
pub struct Layer(pub u8);

/// Errors that can occur during graph operations.
#[derive(Error, Debug, Clone, PartialEq, Eq)]
pub enum GraphError {
    /// The graph has reached its maximum node capacity (`u32::MAX`).
    #[error("node capacity exceeded")]
    CapacityExceeded,

    /// The provided `VectorId` is invalid (e.g., the sentinel value).
    #[error("invalid vector id")]
    InvalidVectorId,

    /// Neighbor data is corrupted or an offset is out of bounds.
    #[error("neighbor data corrupted")]
    NeighborError,

    /// Node ID is out of bounds.
    #[error("node id out of bounds")]
    NodeIdOutOfBounds,

    /// Configuration mismatch with storage.
    #[error("config dimension mismatch: expected {expected}, got {actual}")]
    ConfigMismatch {
        /// Expected dimensions.
        expected: u32,
        /// Actual dimensions in config.
        actual: u32,
    },

    /// Query vector has wrong dimensions.
    #[error("dimension mismatch: expected {expected}, got {actual}")]
    DimensionMismatch {
        /// Expected dimensions.
        expected: usize,
        /// Actual dimensions.
        actual: usize,
    },

    /// Storage operation failed.
    #[error("storage error: {0}")]
    Storage(String),

    /// Invalid configuration parameter.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
}

/// Result of a compaction operation.
///
/// Returned by [`HnswIndex::compact()`] to provide metrics about the operation.
///
/// # Fields
///
/// * `tombstones_removed` - Number of deleted vectors removed
/// * `new_size` - Size of the new index (live vectors only)
/// * `duration_ms` - Time taken for the operation in milliseconds
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CompactionResult {
    /// Number of tombstones (deleted vectors) removed during compaction.
    pub tombstones_removed: usize,
    /// New index size after compaction (live vectors only).
    pub new_size: usize,
    /// Time taken for the compaction operation in milliseconds.
    pub duration_ms: u64,
}

/// A node in the HNSW graph with its adjacency information.
///
/// # Layout
///
/// Total size: 16 bytes
/// Alignment: 8 bytes
///
/// # Fields
///
/// - `vector_id`: 8 bytes
/// - `neighbor_offset`: 4 bytes
/// - `neighbor_len`: 2 bytes
/// - `max_layer`: 1 byte
/// - `deleted`: 1 byte (soft delete flag, v0.3.0)
///
/// # Soft Delete (v0.3.0)
///
/// The `deleted` field enables O(1) soft delete. Deleted nodes remain in
/// the graph for routing but are excluded from search results.
/// - `deleted = 0`: Node is live (default)
/// - `deleted = 1`: Node is deleted (tombstone)
///
/// # Safety
///
/// Derives `Pod` and `Zeroable` for safe byte casting via bytemuck.
/// The `#[repr(C)]` ensures deterministic layout for serialization.
/// All fields are primitive types with no padding gaps.
#[derive(Clone, Copy, Debug, Serialize, Deserialize, Pod, Zeroable)]
#[repr(C)]
pub struct HnswNode {
    /// The vector ID this node represents
    pub vector_id: VectorId,

    /// Offset into the COMPRESSED neighbor pool
    pub neighbor_offset: u32,

    /// Length of neighbor data in bytes (allocated capacity)
    pub neighbor_len: u16,

    /// The maximum layer this node appears in
    pub max_layer: u8,

    /// Soft delete flag: 0 = live, 1 = deleted (v0.3.0).
    /// This field replaces the padding byte from v0.2.x.
    pub deleted: u8,
}

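// A small compile-time sketch verifying the layout documented above
// (16 bytes total, 8-byte alignment); a `const` assert fails the build on drift.
const _: () = assert!(std::mem::size_of::<HnswNode>() == 16);
const _: () = assert!(std::mem::align_of::<HnswNode>() == 8);
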
/// The HNSW Graph structure managing layers and nodes.
///
/// # Memory
///
/// Uses a flattened representation for cache efficiency.
/// Nodes are stored in a contiguous vector.
///
/// # Soft Delete (v0.3.0)
///
/// Supports soft delete via tombstone marking. Deleted nodes remain
/// in the graph for routing but are excluded from search results.
/// Use `compact()` to reclaim space from deleted nodes.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct HnswIndex {
    /// Algorithm configuration
    pub config: HnswConfig,

    /// Node metadata (fixed-size per node)
    pub(crate) nodes: Vec<HnswNode>,

    /// Compressed neighbor lists
    pub(crate) neighbors: NeighborPool,

    /// Entry point (highest layer node)
    pub(crate) entry_point: Option<NodeId>,

    /// Maximum layer in the graph
    pub(crate) max_layer: u8,

    /// Level probability multiplier (1/ln(M))
    pub(crate) level_mult: f32,

    /// Deterministic RNG state
    rng: ChaCha8Rng,

    /// Count of soft-deleted vectors (v0.3.0)
    /// This tracks the number of nodes with `deleted = 1`.
    #[serde(default)]
    pub(crate) deleted_count: usize,

    /// Compaction threshold ratio (v0.3.0)
    /// When `tombstone_ratio()` exceeds this value, `needs_compaction()` returns true.
    /// Default: 0.3 (30% tombstones triggers a compaction recommendation)
    #[serde(default = "default_compaction_threshold")]
    compaction_threshold: f64,
}

/// Default compaction threshold (30%)
fn default_compaction_threshold() -> f64 {
    0.3
}

impl HnswIndex {
    /// Creates a new empty HNSW graph.
    ///
    /// # Arguments
    ///
    /// * `config` - HNSW configuration parameters.
    /// * `storage` - Vector storage to validate against.
    ///
    /// # Errors
    ///
    /// Returns `GraphError::ConfigMismatch` if storage dimensions differ from config.
    /// Returns `GraphError::InvalidConfig` if configuration parameters are invalid (e.g., M <= 1).
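    ///
    /// # Example
    ///
    /// A construction sketch mirroring `test_initialization` below:
    ///
    /// ```ignore
    /// let config = HnswConfig::new(128);
    /// let storage = VectorStorage::new(&config, None);
    /// let index = HnswIndex::new(config, &storage)?;
    /// assert_eq!(index.node_count(), 0);
    /// ```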
    pub fn new(config: HnswConfig, storage: &VectorStorage) -> Result<Self, GraphError> {
        if config.dimensions != storage.dimensions() {
            return Err(GraphError::ConfigMismatch {
                expected: storage.dimensions(),
                actual: config.dimensions,
            });
        }

        if config.m <= 1 {
            return Err(GraphError::InvalidConfig(format!(
                "m must be > 1, got {}",
                config.m
            )));
        }
        if config.m0 < config.m {
            return Err(GraphError::InvalidConfig(format!(
                "m0 must be >= m, got {} < {}",
                config.m0, config.m
            )));
        }

        // Calculate level multiplier: m_l = 1 / ln(M)
        let m_float = config.m as f32;
        let level_mult = if m_float > 1.0 {
            1.0 / m_float.ln()
        } else {
            0.0
        };

        // Initialize RNG with a default seed for determinism.
        let rng = ChaCha8Rng::seed_from_u64(42);

        Ok(Self {
            config,
            nodes: Vec::new(),
            neighbors: NeighborPool::new(),
            entry_point: None,
            max_layer: 0,
            level_mult,
            rng,
            deleted_count: 0, // v0.3.0: No deleted nodes initially
            compaction_threshold: default_compaction_threshold(), // v0.3.0: Default 30%
        })
    }

    /// Generates a random level for a new node.
    ///
    /// Formula: `floor(-ln(uniform(0,1)) * m_l)`
    /// Clamped to a hard cap of 16 to prevent memory explosion.
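    ///
    /// With the default `M = 16`, `m_l = 1/ln(16) ≈ 0.36`, so a node reaches
    /// level `L` or higher with probability `16^-L` (about 6.25% for `L = 1`;
    /// see `test_layer_distribution` below).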
    #[must_use]
    pub fn get_random_level(&mut self) -> u8 {
        // Generate uniform in (0, 1]; EPSILON avoids ln(0)
        let r: f32 = self.rng.gen_range(f32::EPSILON..=1.0);
        let level = (-r.ln() * self.level_mult).floor();

        // Safety cap at 16
        if level > 16.0 {
            16
        } else {
            level as u8
        }
    }

    /// Adds a node to the graph.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - The external vector identifier
    /// * `max_layer` - The maximum layer for this node
    ///
    /// # Returns
    ///
    /// The new `NodeId` assigned to this node, or a `GraphError`.
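    ///
    /// # Example
    ///
    /// A sketch based on the neighbor round-trip test below:
    ///
    /// ```ignore
    /// let id = index.add_node(VectorId(1), 0)?;
    /// assert_eq!(index.node_count(), 1);
    /// ```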
    pub fn add_node(&mut self, vector_id: VectorId, max_layer: u8) -> Result<NodeId, GraphError> {
        if vector_id == VectorId::INVALID {
            return Err(GraphError::InvalidVectorId);
        }

        // Safety limit for NodeId
        if self.nodes.len() >= u32::MAX as usize {
            return Err(GraphError::CapacityExceeded);
        }

        let node = HnswNode {
            vector_id,
            neighbor_offset: 0,
            neighbor_len: 0,
            max_layer,
            deleted: 0, // Live node (v0.3.0)
        };

        #[allow(clippy::cast_possible_truncation)]
        let id = NodeId(self.nodes.len() as u32);
        self.nodes.push(node);

        // Update max layer if needed
        if max_layer > self.max_layer {
            self.max_layer = max_layer;
        }

        Ok(id)
    }

    /// Sets the neighbors for a node.
    ///
    /// # Arguments
    /// * `node_id` - The node to update.
    /// * `neighbors` - The list of neighbor IDs.
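    ///
    /// # Example
    ///
    /// A round-trip sketch (assumes `id1` and `id2` came from `add_node`):
    ///
    /// ```ignore
    /// index.set_neighbors(id1, &[id2])?;
    /// let node = index.get_node(id1).unwrap();
    /// assert_eq!(index.get_neighbors(node)?, vec![id2]);
    /// ```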
    pub fn set_neighbors(
        &mut self,
        node_id: NodeId,
        neighbors: &[NodeId],
    ) -> Result<(), GraphError> {
        if node_id.0 as usize >= self.nodes.len() {
            return Err(GraphError::NodeIdOutOfBounds);
        }

        // Convert NodeId to u32 for encoding
        let neighbor_u32s: Vec<u32> = neighbors.iter().map(|n| n.0).collect();
        let encoded = NeighborPool::encode_neighbors(&neighbor_u32s);

        // Allocate new space
        let (offset, capacity) = self.neighbors.alloc(encoded.len())?;

        // Write data
        let start = offset as usize;
        let end = start + encoded.len();
        self.neighbors.buffer[start..end].copy_from_slice(&encoded);

        // Update the node and free the old slot
        let node = &mut self.nodes[node_id.0 as usize];

        // Free the old slot if it existed
        if node.neighbor_len > 0 {
            self.neighbors.free(node.neighbor_offset, node.neighbor_len);
        }

        node.neighbor_offset = offset;
        node.neighbor_len = capacity; // Store allocated capacity

        Ok(())
    }

    /// Retrieves a node by its ID.
    #[must_use]
    pub fn get_node(&self, id: NodeId) -> Option<&HnswNode> {
        if id == NodeId::INVALID {
            return None;
        }
        self.nodes.get(id.0 as usize)
    }

    /// Retrieves the neighbors for a node.
    pub fn get_neighbors(&self, node: &HnswNode) -> Result<Vec<NodeId>, GraphError> {
        let start = node.neighbor_offset as usize;
        // Read up to the allocated capacity
        let end = start + node.neighbor_len as usize;

        if end > self.neighbors.buffer.len() {
            return Err(GraphError::NeighborError);
        }

        let slice = &self.neighbors.buffer[start..end];
        let raw_neighbors = NeighborPool::decode_neighbors(slice);

        // Convert back to NodeId
        let neighbors = raw_neighbors.into_iter().map(NodeId).collect();
        Ok(neighbors)
    }

    /// Retrieves the neighbors for a specific layer.
    pub fn get_neighbors_layer(
        &self,
        node: &HnswNode,
        layer: u8,
    ) -> Result<Vec<NodeId>, GraphError> {
        let start = node.neighbor_offset as usize;
        let end = start + node.neighbor_len as usize;

        if end > self.neighbors.buffer.len() {
            return Err(GraphError::NeighborError);
        }

        let slice = &self.neighbors.buffer[start..end];
        let raw_neighbors = NeighborPool::decode_layer(slice, layer);

        // Convert back to NodeId
        let neighbors = raw_neighbors.into_iter().map(NodeId).collect();
        Ok(neighbors)
    }

    /// Returns the number of nodes in the graph.
    #[must_use]
    pub fn node_count(&self) -> usize {
        self.nodes.len()
    }

    /// Returns the entry point node ID, if any.
    #[must_use]
    pub fn entry_point(&self) -> Option<NodeId> {
        self.entry_point
    }

    /// Sets the entry point node ID.
    pub fn set_entry_point(&mut self, id: NodeId) {
        self.entry_point = Some(id);
    }

    /// Returns the current maximum layer in the graph.
    #[must_use]
    pub fn max_layer(&self) -> u8 {
        self.max_layer
    }

    // ========================================================================
    // Soft Delete API (W16.2 - RFC-001)
    // ========================================================================

    /// Get a mutable reference to a node by `VectorId`.
    ///
    /// # Complexity
    ///
    /// * Time: O(n) linear scan
    /// * Space: O(1)
    fn get_node_mut(&mut self, vector_id: VectorId) -> Result<&mut HnswNode, GraphError> {
        self.nodes
            .iter_mut()
            .find(|n| n.vector_id == vector_id)
            .ok_or(GraphError::InvalidVectorId)
    }

    /// Get an immutable reference to a node by `VectorId`.
    ///
    /// # Complexity
    ///
    /// * Time: O(n) linear scan
    /// * Space: O(1)
    fn get_node_by_vector_id(&self, vector_id: VectorId) -> Result<&HnswNode, GraphError> {
        self.nodes
            .iter()
            .find(|n| n.vector_id == vector_id)
            .ok_or(GraphError::InvalidVectorId)
    }

    /// Mark a vector as deleted (soft delete).
    ///
    /// The vector remains in the graph for routing but is excluded from
    /// search results. Space is reclaimed via `compact()`.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - The ID of the vector to delete
    ///
    /// # Returns
    ///
    /// * `Ok(true)` - Vector was deleted
    /// * `Ok(false)` - Vector was already deleted
    /// * `Err(InvalidVectorId)` - Vector ID not found
    ///
    /// # Complexity
    ///
    /// * Time: **O(n)** for the VectorId → Node lookup via linear scan,
    ///   plus O(1) for setting the deleted byte
    /// * Space: O(1)
    ///
    /// **Note:** The O(n) lookup is a known limitation in v0.3.0.
    /// A `HashMap<VectorId, NodeId>` index could provide O(1) lookup
    /// but is deferred to v0.4.0 to avoid scope creep.
    ///
    /// # Thread Safety (RFC-001 Design)
    ///
    /// This method requires `&mut self`, which means Rust's borrow checker
    /// prevents concurrent access at compile time. This is intentional:
    ///
    /// * Search (`&self`) and delete (`&mut self`) cannot run concurrently
    /// * No atomics or locks needed - safety is enforced by the type system
    /// * See RFC-001 Section 6.4 "Design Decisions" for rationale
    ///
    /// # Persistence
    ///
    /// **IMPORTANT:** Delete operations are in-memory only until `save()` is called.
    /// If the process crashes before `save()`, the delete will be lost.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let deleted = index.soft_delete(VectorId(42))?;
    /// assert!(deleted);
    /// assert!(index.is_deleted(VectorId(42))?);
    /// ```
    pub fn soft_delete(&mut self, vector_id: VectorId) -> Result<bool, GraphError> {
        let node = self.get_node_mut(vector_id)?;

        if node.deleted != 0 {
            return Ok(false); // Already deleted
        }

        node.deleted = 1;
        self.deleted_count += 1;
        Ok(true)
    }

    /// Check if a vector is marked as deleted.
    ///
    /// # Arguments
    ///
    /// * `vector_id` - The ID of the vector to check
    ///
    /// # Returns
    ///
    /// * `Ok(true)` - Vector is deleted
    /// * `Ok(false)` - Vector is live
    /// * `Err(InvalidVectorId)` - Vector ID not found
    ///
    /// # Complexity
    ///
    /// * Time: O(n) for lookup + O(1) for the check
    /// * Space: O(1)
    pub fn is_deleted(&self, vector_id: VectorId) -> Result<bool, GraphError> {
        let node = self.get_node_by_vector_id(vector_id)?;
        Ok(node.deleted != 0)
    }

    /// Get the count of deleted (tombstoned) vectors.
    ///
    /// # Returns
    ///
    /// Number of vectors marked as deleted.
    #[must_use]
    pub fn deleted_count(&self) -> usize {
        self.deleted_count
    }

    /// Get the ratio of deleted vectors to total vectors.
    ///
    /// # Returns
    ///
    /// A value between 0.0 and 1.0 representing the tombstone ratio.
    /// Returns 0.0 if the index is empty.
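    ///
    /// # Example
    ///
    /// A sketch matching `test_tombstone_ratio` below (4 vectors, 1 deleted):
    ///
    /// ```ignore
    /// index.soft_delete(ids[0])?;
    /// assert!((index.tombstone_ratio() - 0.25).abs() < 0.01);
    /// ```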
    #[must_use]
    pub fn tombstone_ratio(&self) -> f64 {
        let total = self.node_count();
        if total == 0 {
            return 0.0;
        }
        self.deleted_count as f64 / total as f64
    }

    /// Get the count of live (non-deleted) vectors.
    ///
    /// # Returns
    ///
    /// Number of vectors that are not marked as deleted.
    #[must_use]
    pub fn live_count(&self) -> usize {
        self.node_count().saturating_sub(self.deleted_count)
    }

    /// Maximum multiplier for `adjusted_k` to prevent excessive over-fetching.
    ///
    /// At 90%+ tombstones, we cap at 10x the original k to bound memory usage.
    /// Beyond this ratio, compaction should be triggered.
    pub const MAX_ADJUSTED_K_MULTIPLIER: usize = 10;

    /// Calculate an adjusted k to compensate for tombstones.
    ///
    /// When the index has deleted vectors, we over-fetch to ensure
    /// we can return k live results after filtering. This method
    /// calculates how many candidates to fetch internally so that,
    /// after filtering out deleted vectors, we likely have k results.
    ///
    /// # Formula
    ///
    /// `adjusted_k = k * total / live`
    ///
    /// This is equivalent to `k / (1 - tombstone_ratio)` but stays in
    /// integer arithmetic to avoid floating-point rounding.
    ///
    /// # Caps
    ///
    /// The result is capped at `k * MAX_ADJUSTED_K_MULTIPLIER` (default 10x)
    /// to prevent excessive fetching at very high tombstone ratios.
    ///
    /// # Examples
    ///
    /// * 0% tombstones: k = k (no adjustment)
    /// * 10% tombstones: k → ~1.11x
    /// * 30% tombstones: k → ~1.43x
    /// * 50% tombstones: k → 2x
    /// * 90%+ tombstones: k → 10x (capped)
    ///
    /// # Thread Safety
    ///
    /// This method reads `deleted_count` and `node_count()`, which may change
    /// if the index is mutated. Per RFC-001, the API uses `&mut self` for
    /// mutations, so concurrent read+write is prevented by Rust's borrow checker.
    /// The design accepts eventual consistency for soft delete semantics.
    ///
    /// # Arguments
    ///
    /// * `k` - The requested number of results
    ///
    /// # Returns
    ///
    /// The adjusted k value to use for internal search operations.
    /// This value is always >= k (the multiplier cap never brings it below k).
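    ///
    /// # Example
    ///
    /// A worked sketch (10 nodes, 5 tombstoned, as in the tests below):
    ///
    /// ```ignore
    /// // 50% tombstones: adjusted = 10 * 10 / 5 = 20
    /// assert_eq!(index.adjusted_k(10), 20);
    /// ```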
    #[must_use]
    pub fn adjusted_k(&self, k: usize) -> usize {
        // Fast path: no tombstones
        if self.deleted_count == 0 {
            return k;
        }

        let total = self.node_count();
        let live = self.live_count();

        // Edge case: all deleted.
        // This also prevents division by zero in the calculation below.
        if live == 0 {
            return k; // Will return empty results anyway
        }

        // Integer arithmetic: adjusted = k * total / live
        // Use saturating ops to prevent overflow
        let adjusted = k.saturating_mul(total) / live;

        // Cap at MAX_ADJUSTED_K_MULTIPLIER to prevent excessive over-fetching.
        // Note: We don't cap at total because adjusted_k controls the internal
        // search effort (ef parameter), not the final result count.
        let max_by_multiplier = k.saturating_mul(Self::MAX_ADJUSTED_K_MULTIPLIER);
        adjusted.min(max_by_multiplier)
    }

    /// Marks a vector as deleted in the storage (legacy API).
    ///
    /// **DEPRECATED:** Use `soft_delete()` instead for RFC-001 compliant soft delete.
    /// This method delegates to storage and does not update `deleted_count`.
    ///
    /// # Arguments
    ///
    /// * `id` - The vector ID to delete.
    /// * `storage` - The vector storage to update.
    ///
    /// # Returns
    ///
    /// `true` if the vector was active and is now deleted.
    /// `false` if it was already deleted.
    #[deprecated(since = "0.3.0", note = "Use soft_delete() instead")]
    pub fn delete_in_storage(&self, id: VectorId, storage: &mut VectorStorage) -> bool {
        storage.mark_deleted(id)
    }

    /// DEBUG: Print memory stats
    pub fn log_stats(&self) {
        println!("Index Stats:");
        println!(" Node Count: {}", self.nodes.len());
        println!(" Neighbor Buffer Len: {}", self.neighbors.buffer.len());
        println!(
            " Neighbor Buffer Cap: {}",
            self.neighbors.buffer.capacity()
        );
        println!(" Total Memory Usage: {} bytes", self.memory_usage());
        // Bucket stats are internal to NeighborPool.
    }

    /// Returns the approximate memory usage in bytes.
    #[must_use]
    pub fn memory_usage(&self) -> usize {
        let nodes_size = self.nodes.capacity() * std::mem::size_of::<HnswNode>();
        let neighbors_size = self.neighbors.memory_usage();

        std::mem::size_of::<Self>() + nodes_size + neighbors_size
    }

    // ========================================================================
    // Compaction API (W16.4 - RFC-001)
    // ========================================================================

    /// Check if compaction is recommended.
    ///
    /// Returns `true` if the tombstone ratio exceeds the configured threshold
    /// (default 30%). When this returns true, calling `compact()` is
    /// recommended to reclaim space and maintain search performance.
    ///
    /// # Thread Safety
    ///
    /// This method performs no internal synchronization. Mutations require
    /// `&mut self`, so single-threaded access is enforced by the borrow
    /// checker; for concurrent use, wrap the index in a lock or create
    /// separate index instances (e.g., via Web Workers in WASM).
    ///
    /// # Example
    ///
    /// ```ignore
    /// if index.needs_compaction() {
    ///     let (new_index, new_storage, result) = index.compact(&storage)?;
    ///     index = new_index;
    ///     storage = new_storage;
    /// }
    /// ```
    #[must_use]
    pub fn needs_compaction(&self) -> bool {
        self.tombstone_ratio() > self.compaction_threshold
    }

    /// Set the compaction threshold.
    ///
    /// When `tombstone_ratio()` exceeds this value, `needs_compaction()`
    /// returns true.
    ///
    /// # Arguments
    ///
    /// * `ratio` - Tombstone ratio threshold (0.01 to 0.99)
    ///
    /// # Default
    ///
    /// Default is 0.3 (30%). Lower values trigger compaction more often
    /// but maintain better search performance.
    ///
    /// # Clamping
    ///
    /// Values outside [0.01, 0.99] are clamped to that range.
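    ///
    /// # Example
    ///
    /// A usage sketch:
    ///
    /// ```ignore
    /// index.set_compaction_threshold(0.2); // recommend compaction at 20% tombstones
    /// assert!((index.compaction_threshold() - 0.2).abs() < f64::EPSILON);
    /// ```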
    pub fn set_compaction_threshold(&mut self, ratio: f64) {
        self.compaction_threshold = ratio.clamp(0.01, 0.99);
    }

    /// Get the current compaction threshold.
    ///
    /// # Returns
    ///
    /// The ratio at which `needs_compaction()` returns true.
    #[must_use]
    pub fn compaction_threshold(&self) -> f64 {
        self.compaction_threshold
    }

    /// Get a compaction warning message if compaction is recommended.
    ///
    /// Returns `Some(message)` if the tombstone ratio exceeds the threshold,
    /// or `None` if compaction is not needed.
    ///
    /// # Example
    ///
    /// ```ignore
    /// if let Some(warning) = index.compaction_warning() {
    ///     log::warn!("{}", warning);
    /// }
    /// ```
    #[must_use]
    pub fn compaction_warning(&self) -> Option<String> {
        if self.needs_compaction() {
            Some(format!(
                "Compaction recommended: tombstone ratio {:.1}% exceeds threshold {:.1}%. \
                 Call compact() to rebuild index without tombstones.",
                self.tombstone_ratio() * 100.0,
                self.compaction_threshold * 100.0
            ))
        } else {
            None
        }
    }

    /// Compact the index by rebuilding without tombstones.
    ///
    /// This operation creates a NEW index and NEW storage containing only
    /// live (non-deleted) vectors. The original index and storage are NOT
    /// modified, allowing the caller to replace them atomically.
    ///
    /// # Important: Vector ID Remapping
    ///
    /// Due to storage design constraints, vector IDs are remapped during
    /// compaction. New IDs are assigned sequentially starting from 1.
    /// If you need to track the mapping, use the returned index to query
    /// by position or content.
    ///
    /// # Returns
    ///
    /// Returns a `(new_index, new_storage, result)` tuple. The caller MUST
    /// replace BOTH their index and storage references:
    ///
    /// ```ignore
    /// let (new_index, new_storage, result) = old_index.compact(&old_storage)?;
    /// println!("Removed {} tombstones in {}ms", result.tombstones_removed, result.duration_ms);
    /// // Now use new_index and new_storage; the old ones will be dropped.
    /// index = new_index;
    /// storage = new_storage;
    /// ```
    ///
    /// # Algorithm
    ///
    /// 1. Collect all live (non-deleted) vectors
    /// 2. Create a new empty index and storage with the same config
    /// 3. Re-insert all live vectors using the regular `insert()`
    /// 4. Return the new pair
    ///
    /// # Performance
    ///
    /// * Time: O(n log n) where n = live vector count
    /// * Space: 2x index size during compaction (temporary)
    ///
    /// # Memory Safety
    ///
    /// * Returns a new pair — no storage/index mismatch possible
    /// * On failure, the original index/storage are unchanged (caller keeps refs)
    /// * Old index/storage are NOT modified — caller drops them when ready
    ///
    /// # Warning
    ///
    /// This is a blocking operation. For WASM, consider running it
    /// during idle time or on user action.
    pub fn compact(
        &self,
        storage: &VectorStorage,
    ) -> Result<(HnswIndex, VectorStorage, CompactionResult), GraphError> {
        use std::time::Instant;
        let start = Instant::now();

        let original_deleted = self.deleted_count;
        let original_total = self.node_count();

        // Fast path: no tombstones. Still a full rebuild (VectorStorage does
        // not implement Clone), but insertion order is preserved.
        if original_deleted == 0 {
            let config = self.config.clone();
            let mut new_storage = VectorStorage::new(&config, None);
            let mut new_index = HnswIndex::new(config, &new_storage)?;
            new_index.compaction_threshold = self.compaction_threshold;

            // Re-insert all vectors (this is a rebuild, but preserves order)
            for node in &self.nodes {
                let vec = storage.get_vector(node.vector_id);
                new_index.insert(&vec, &mut new_storage)?;
            }

            return Ok((
                new_index,
                new_storage,
                CompactionResult {
                    tombstones_removed: 0,
                    new_size: original_total,
                    duration_ms: start.elapsed().as_millis() as u64,
                },
            ));
        }

        // Collect the live vectors' data
        let live_vectors: Vec<Vec<f32>> = self
            .nodes
            .iter()
            .filter(|node| node.deleted == 0)
            .map(|node| {
                let vec = storage.get_vector(node.vector_id);
                vec.into_owned()
            })
            .collect();

        let new_size = live_vectors.len();

        // Build a new index AND new storage with the same config
        let config = self.config.clone();
        let mut new_storage = VectorStorage::new(&config, None);
        let mut new_index = HnswIndex::new(config, &new_storage)?;

        // Copy the compaction threshold from the original
        new_index.compaction_threshold = self.compaction_threshold;

        // Re-insert all live vectors (IDs will be remapped)
        for vector in live_vectors {
            new_index.insert(&vector, &mut new_storage)?;
        }

        let duration_ms = start.elapsed().as_millis() as u64;

        Ok((
            new_index,
            new_storage,
            CompactionResult {
                tombstones_removed: original_deleted,
                new_size,
                duration_ms,
            },
        ))
    }

    /// Insert a vector with a specific ID (validation only).
    ///
    /// This method validates that the specified ID doesn't conflict with
    /// existing IDs, then delegates to the standard `insert()` method.
    ///
    /// **Important:** Due to storage design constraints where `VectorId`s must
    /// match storage slot indices, the returned `VectorId` is the one assigned
    /// by storage, NOT the requested ID. The `id` parameter is only used
    /// for duplicate validation.
    ///
    /// For actual ID preservation during compaction, the storage would need
    /// to support sparse ID assignment, which is not currently implemented.
    ///
    /// # Arguments
    ///
    /// * `id` - The specific `VectorId` to validate (used only for duplicate checks)
    /// * `vector` - The vector data (must match configured dimensions)
    /// * `storage` - Mutable reference to vector storage
    ///
    /// # Returns
    ///
    /// The `VectorId` assigned by storage (sequential).
    ///
    /// # Errors
    ///
    /// * `InvalidVectorId` - If the ID already exists in the index or is the sentinel value
    /// * `DimensionMismatch` - If vector dimensions don't match config
    /// * `Storage` - If the storage operation fails
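    ///
    /// # Example
    ///
    /// A sketch; note the returned ID is the storage-assigned one, not the argument:
    ///
    /// ```ignore
    /// let assigned = index.insert_with_id(VectorId(7), &vector, &mut storage)?;
    /// // `assigned` may differ from VectorId(7); the argument only guards duplicates.
    /// ```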
    pub fn insert_with_id(
        &mut self,
        id: VectorId,
        vector: &[f32],
        storage: &mut VectorStorage,
    ) -> Result<VectorId, GraphError> {
        // Validate ID is not the sentinel
        if id == VectorId::INVALID {
            return Err(GraphError::InvalidVectorId);
        }

        // Validate ID doesn't already exist
        if self.nodes.iter().any(|n| n.vector_id == id) {
            return Err(GraphError::InvalidVectorId);
        }

        // Validate dimensions
        if vector.len() != self.config.dimensions as usize {
            return Err(GraphError::DimensionMismatch {
                expected: self.config.dimensions as usize,
                actual: vector.len(),
            });
        }

        // Delegate to regular insert (which handles all the graph connection logic)
        self.insert(vector, storage)
    }
}

/// Trait for providing vector data by ID.
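///
/// # Example
///
/// A minimal in-memory sketch (illustrative only; the default methods
/// already cover deletion and quantization):
///
/// ```ignore
/// struct InMemoryProvider { vectors: Vec<Vec<f32>> }
///
/// impl VectorProvider for InMemoryProvider {
///     fn get_vector(&self, id: VectorId) -> Cow<'_, [f32]> {
///         // IDs start at 1 (VectorId::FIRST), hence the offset.
///         Cow::Borrowed(&self.vectors[(id.0 - 1) as usize])
///     }
/// }
/// ```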
pub trait VectorProvider {
    /// Returns the vector data for a given ID.
    fn get_vector(&self, id: VectorId) -> Cow<'_, [f32]>;

    /// Returns true if the vector is marked as deleted.
    fn is_deleted(&self, id: VectorId) -> bool {
        let _ = id;
        false
    }

    /// Returns the quantized vector data for a given ID, if available.
    ///
    /// # Returns
    ///
    /// * `Some(&[u8])` - If the provider supports direct quantized access.
    /// * `None` - If not supported or the data is not quantized.
    fn get_quantized_vector(&self, id: VectorId) -> Option<&[u8]> {
        let _ = id;
        None
    }

    /// Quantizes a query vector into the provided output buffer.
    ///
    /// # Arguments
    ///
    /// * `query` - The query vector in f32.
    /// * `output` - Buffer to write quantized data into.
    ///
    /// # Returns
    ///
    /// * `Some(&[u8])` - The quantized slice (borrowed from `output`).
    /// * `None` - If quantization is not supported.
    fn quantize_query<'a>(&self, query: &[f32], output: &'a mut Vec<u8>) -> Option<&'a [u8]> {
        let _ = query;
        let _ = output;
        None
    }
}

// ============================================================================
// Batch Insertion Implementation (W11.1 - RFC 0001 v1.1)
// ============================================================================

use crate::batch::BatchInsertable;
use crate::error::BatchError;
use std::collections::HashSet;

impl HnswIndex {
    /// Returns the configured dimensionality of the index.
    #[must_use]
    pub fn dimensions(&self) -> u32 {
        self.config.dimensions
    }

    /// Returns the current number of nodes in the index.
    #[must_use]
    pub fn len(&self) -> usize {
        self.nodes.len()
    }

    /// Returns true if the index contains no nodes.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.nodes.is_empty()
    }

    /// Returns the maximum capacity of the index (`u32::MAX` nodes).
    #[must_use]
    pub const fn capacity(&self) -> usize {
        u32::MAX as usize
    }

    /// Checks if a vector ID already exists in the index.
    ///
    /// Returns `true` if the ID exists, `false` otherwise.
    /// Note: this is an O(n) linear scan over the node table.
    #[must_use]
    pub fn contains_id(&self, id: u64) -> bool {
        self.nodes.iter().any(|n| n.vector_id.0 == id)
    }

    /// Checks if a vector contains invalid floating-point values.
    ///
    /// Returns `Some(reason)` if invalid, `None` if valid.
    fn validate_vector(vector: &[f32]) -> Option<String> {
        for (i, &val) in vector.iter().enumerate() {
            if val.is_nan() {
                return Some(format!("NaN at index {i}"));
            }
            if val.is_infinite() {
                return Some(format!("Infinity at index {i}"));
            }
        }
        None
    }
}

impl BatchInsertable for HnswIndex {
    /// Inserts multiple vectors in a single batch operation.
    ///
    /// This is the full implementation per RFC 0001 v1.2.
    ///
    /// # Algorithm
    ///
    /// 1. Collect the iterator to a Vec (required for progress tracking)
    /// 2. Pre-validate the first vector's dimensionality (fail-fast)
    /// 3. Check capacity constraints
    /// 4. Iterate through vectors with progress callbacks
    /// 5. Insert each vector into the HNSW graph via storage
    /// 6. Handle errors according to best-effort semantics
    ///
    /// # Error Handling
    ///
    /// - **Fatal errors** (abort immediately):
    ///   - `DimensionMismatch` on the first vector
    ///   - `CapacityExceeded`
    ///   - `InternalError` (HNSW invariant violated)
    ///
    /// - **Non-fatal errors** (skip and continue):
    ///   - `DuplicateId` (within the batch or existing in the index)
    ///   - `InvalidVector` (NaN, Inf)
    ///   - `DimensionMismatch` on subsequent vectors
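    ///
    /// # Example
    ///
    /// A usage sketch (assumes a 4-dimensional index with matching storage):
    ///
    /// ```ignore
    /// let vectors = vec![(1u64, vec![0.0; 4]), (2u64, vec![1.0; 4])];
    /// let ids = index.batch_insert(vectors, &mut storage, Some(|done, total| {
    ///     println!("progress: {done}/{total}");
    /// }))?;
    /// assert_eq!(ids.len(), 2);
    /// ```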
    fn batch_insert<I, F>(
        &mut self,
        vectors: I,
        storage: &mut VectorStorage,
        mut progress_callback: Option<F>,
    ) -> Result<Vec<u64>, BatchError>
    where
        I: IntoIterator<Item = (u64, Vec<f32>)>,
        F: FnMut(usize, usize),
    {
        // Step 1: Collect iterator to Vec for progress tracking
        let batch: Vec<(u64, Vec<f32>)> = vectors.into_iter().collect();
        let total = batch.len();

        // Early return for empty batch
        if total == 0 {
            return Ok(vec![]);
        }

        // Step 2: Pre-validate first vector's dimensionality (fail-fast)
        let expected_dim = self.config.dimensions as usize;
        let (first_id, first_vec) = &batch[0];
        if first_vec.len() != expected_dim {
            return Err(BatchError::DimensionMismatch {
                expected: expected_dim,
                actual: first_vec.len(),
                vector_id: *first_id,
            });
        }

        // Step 3: Check capacity constraints
        let current_count = self.nodes.len();
        if current_count.saturating_add(total) > self.capacity() {
            return Err(BatchError::CapacityExceeded {
                current: current_count,
                max: self.capacity(),
            });
        }

        // Track IDs we've seen in this batch to detect duplicates within the batch
        let mut seen_ids: HashSet<u64> = HashSet::with_capacity(total);

        // Track successfully inserted IDs
        let mut inserted_ids: Vec<u64> = Vec::with_capacity(total);

        // Step 4: Initial progress callback (0%)
        if let Some(ref mut callback) = progress_callback {
            callback(0, total);
        }

        // Calculate progress intervals (every 10%)
        let progress_interval = if total >= 10 { total / 10 } else { 1 };

        // Step 5: Process each vector
        for (id, vector) in batch {
            // Check for duplicate ID within this batch
            if !seen_ids.insert(id) {
                // Duplicate within batch - skip (non-fatal)
                continue;
            }

            // Check for ID 0 (reserved sentinel)
            if id == 0 {
                // Skip invalid ID (non-fatal)
                continue;
            }

            // Check for duplicate ID in existing index [M1 fix]
            if self.contains_id(id) {
                // Duplicate in existing index - skip (non-fatal)
                continue;
            }

            // Validate dimensionality
            if vector.len() != expected_dim {
                // Skip dimension mismatch (non-fatal after first vector)
                continue;
            }

            // Validate vector data (NaN, Inf)
            if Self::validate_vector(&vector).is_some() {
                // Skip invalid vector (non-fatal)
                continue;
            }

            // Step 6: Actually insert the vector into the HNSW graph [C1 fix]
            match self.insert(&vector, storage) {
                Ok(assigned_id) => {
                    // Use the ID assigned by the insert method
                    inserted_ids.push(assigned_id.0);
                }
                Err(e) => {
                    // Internal error during HNSW insertion - this is fatal
                    return Err(BatchError::InternalError {
                        message: format!("HNSW insert failed for vector {id}: {e}"),
                    });
                }
            }

            // Progress callback at intervals [M3 fix: report inserted count]
            let inserted_count = inserted_ids.len();
            if inserted_count % progress_interval == 0 || inserted_count == 1 {
                if let Some(ref mut callback) = progress_callback {
                    callback(inserted_count, total);
                }
            }
        }

        // Final progress callback (100%)
        if let Some(ref mut callback) = progress_callback {
            callback(inserted_ids.len(), total);
        }

        Ok(inserted_ids)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<HnswIndex>();
    }

    #[test]
    fn test_initialization() {
        let config = HnswConfig::new(128);
        // Create storage with matching dimensions
        let storage = VectorStorage::new(&config, None);

        let index = HnswIndex::new(config.clone(), &storage).unwrap();

        assert_eq!(index.node_count(), 0);
        assert_eq!(index.entry_point(), None);
        assert_eq!(index.max_layer(), 0);
    }

    #[test]
    fn test_dimension_mismatch() {
        let config_idx = HnswConfig::new(128);
        let config_store = HnswConfig::new(64);
        let storage = VectorStorage::new(&config_store, None);

        let result = HnswIndex::new(config_idx, &storage);
        assert!(matches!(
            result,
            Err(GraphError::ConfigMismatch {
                expected: 64,
                actual: 128
            })
        ));
    }

    #[test]
    fn test_layer_distribution() {
        // Geometric distribution test
        // m=16 => m_l = 1/ln(16) ≈ 0.36
        // Prob(level > 0) = e^(-1/m_l) = 1/M = 1/16
        // We can't strictly test randomness without huge samples, but we can sanity check.
        let config = HnswConfig::new(128);
        let storage = VectorStorage::new(&config, None);
        let mut index = HnswIndex::new(config, &storage).unwrap();

        let mut levels = vec![0u8; 1000];
        for l in levels.iter_mut() {
            *l = index.get_random_level();
        }

        // Level 0 should be most common
        let l0_count = levels.iter().filter(|&&l| l == 0).count();
        assert!(
            l0_count > 800,
            "Level 0 should be dominant (expected ~93% for M=16)"
        );

        // Max level shouldn't be crazy
        let max = *levels.iter().max().unwrap();
        assert!(max < 16, "Level should be reasonable");
    }

    #[test]
    fn test_neighbor_roundtrip() {
        let config = HnswConfig::new(128);
        let storage = VectorStorage::new(&config, None);
        let mut index = HnswIndex::new(config, &storage).unwrap();

        let id1 = index.add_node(VectorId(1), 0).unwrap();
        let id2 = index.add_node(VectorId(2), 0).unwrap();
        let id3 = index.add_node(VectorId(3), 0).unwrap();

        // Neighbors: [2, 3]
        let neighbors = vec![id2, id3];
        index.set_neighbors(id1, &neighbors).unwrap();

        {
            let node1 = index.get_node(id1).unwrap();
            let retrieved = index.get_neighbors(node1).unwrap();
            assert_eq!(retrieved, neighbors);
        } // Drop node1 borrow

        // Update neighbors: [3] (shrink)
        let neighbors2 = vec![id3];
        index.set_neighbors(id1, &neighbors2).unwrap();

        {
            let node1 = index.get_node(id1).unwrap();
            let retrieved2 = index.get_neighbors(node1).unwrap();
            assert_eq!(retrieved2, neighbors2);
        }

        // Check if free list got populated (cannot check directly as NeighborPool is private,
        // but we can trust neighbor.rs tests for internal logic)
    }

    // ============================================================
    // BATCH INSERT TESTS (W11.1 - RFC 0001 v1.2)
    // ============================================================

    mod batch_insert_tests {
        use super::*;
        use crate::batch::BatchInsertable;

        fn create_test_index_and_storage(dim: u32) -> (HnswIndex, VectorStorage) {
            let config = HnswConfig::new(dim);
            let storage = VectorStorage::new(&config, None);
            let index = HnswIndex::new(config, &storage).unwrap();
            (index, storage)
        }

        #[test]
        fn test_batch_insert_empty() {
            let (mut index, mut storage) = create_test_index_and_storage(128);
            let vectors: Vec<(u64, Vec<f32>)> = vec![];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            assert!(result.unwrap().is_empty());
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 0);
        }

        #[test]
        fn test_batch_insert_single_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 1);
            // [M2 fix] Verify graph state - node was actually inserted
            assert_eq!(index.node_count(), 1);
        }

        #[test]
        fn test_batch_insert_multiple_vectors() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![5.0, 6.0, 7.0, 8.0]),
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 3);
            // [M2 fix] Verify graph state - all 3 nodes inserted
            assert_eq!(index.node_count(), 3);
        }

        #[test]
        fn test_batch_insert_dimension_mismatch_first_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // First vector has wrong dimension (3 instead of 4)
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0]), // Wrong!
                (2u64, vec![5.0, 6.0, 7.0, 8.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_err());
            match result.unwrap_err() {
                BatchError::DimensionMismatch {
                    expected,
                    actual,
                    vector_id,
                } => {
                    assert_eq!(expected, 4);
                    assert_eq!(actual, 3);
                    assert_eq!(vector_id, 1);
                }
                _ => panic!("Expected DimensionMismatch error"),
            }
            // [M2 fix] Verify no vectors were inserted on fatal error
            assert_eq!(index.node_count(), 0);
        }

        #[test]
        fn test_batch_insert_dimension_mismatch_later_vector_skipped() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Second vector has wrong dimension - should be skipped
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![5.0, 6.0, 7.0]), // Wrong, but not first
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed with partial results
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify only 2 nodes inserted (one skipped)
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_duplicate_id_within_batch() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Duplicate IDs within the batch
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (1u64, vec![5.0, 6.0, 7.0, 8.0]), // Duplicate!
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the duplicate
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_duplicate_id_in_existing_index() {
            // [M1 fix] Test duplicate check against existing index
            let (mut index, mut storage) = create_test_index_and_storage(4);

            // First batch
            let vectors1 = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];
            let result1 = index.batch_insert(vectors1, &mut storage, None::<fn(usize, usize)>);
            assert!(result1.is_ok());
            assert_eq!(index.node_count(), 1);

            // Second batch with duplicate ID
            let vectors2 = vec![
                (1u64, vec![5.0, 6.0, 7.0, 8.0]), // Duplicate of existing!
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];
            let result2 = index.batch_insert(vectors2, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the duplicate
            assert!(result2.is_ok());
            let ids = result2.unwrap();
            assert_eq!(ids.len(), 1); // Only ID 2 was inserted
            assert_eq!(index.node_count(), 2); // Total 2 nodes now
        }

        #[test]
        fn test_batch_insert_invalid_vector_nan() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::NAN, 6.0, 7.0, 8.0]), // NaN!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the NaN vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_invalid_vector_infinity() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::INFINITY, 6.0, 7.0, 8.0]), // Infinity!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the Infinity vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_invalid_vector_neg_infinity() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),
                (2u64, vec![f32::NEG_INFINITY, 6.0, 7.0, 8.0]), // Neg Infinity!
                (3u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping the NegInfinity vector
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_id_zero_skipped() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // ID 0 is the reserved sentinel
            let vectors = vec![
                (0u64, vec![1.0, 2.0, 3.0, 4.0]), // Reserved!
                (1u64, vec![5.0, 6.0, 7.0, 8.0]),
                (2u64, vec![9.0, 10.0, 11.0, 12.0]),
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            // Should succeed, skipping ID 0
            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_batch_insert_progress_callback_called() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors: Vec<(u64, Vec<f32>)> =
                (1..=10).map(|i| (i as u64, vec![i as f32; 4])).collect();

            let mut progress_calls: Vec<(usize, usize)> = vec![];

            let result = index.batch_insert(
                vectors,
                &mut storage,
                Some(|current, total| {
                    progress_calls.push((current, total));
                }),
            );

            assert!(result.is_ok());
            // [M2 fix] Verify all 10 nodes inserted
            assert_eq!(index.node_count(), 10);

            // Should have called progress at 0% and at intervals
            assert!(!progress_calls.is_empty());

            // First call should be (0, 10)
            assert_eq!(progress_calls[0], (0, 10));

            // Last call should report the inserted count (10, 10)
            let last = progress_calls.last().unwrap();
            assert_eq!(*last, (10, 10));
        }

        #[test]
        fn test_batch_insert_progress_callback_single_vector() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            let vectors = vec![(1u64, vec![1.0, 2.0, 3.0, 4.0])];

            let mut progress_calls: Vec<(usize, usize)> = vec![];

            let result = index.batch_insert(
                vectors,
                &mut storage,
                Some(|current, total| {
                    progress_calls.push((current, total));
                }),
            );

            assert!(result.is_ok());
            // [M2 fix] Verify 1 node inserted
            assert_eq!(index.node_count(), 1);

            // Should have at least two calls: 0% and 100%
            assert!(progress_calls.len() >= 2);
            assert_eq!(progress_calls[0], (0, 1));
            assert!(progress_calls.contains(&(1, 1)));
        }

        #[test]
        fn test_batch_insert_mixed_errors() {
            let (mut index, mut storage) = create_test_index_and_storage(4);
            // Mix of valid and invalid vectors
            let vectors = vec![
                (1u64, vec![1.0, 2.0, 3.0, 4.0]),      // Valid
                (2u64, vec![f32::NAN, 2.0, 3.0, 4.0]), // NaN - skip
                (3u64, vec![3.0, 3.0, 3.0]),           // Wrong dim - skip
                (1u64, vec![4.0, 4.0, 4.0, 4.0]),      // Duplicate - skip
                (0u64, vec![5.0, 5.0, 5.0, 5.0]),      // Reserved ID - skip
                (4u64, vec![6.0, 6.0, 6.0, 6.0]),      // Valid
            ];

            let result = index.batch_insert(vectors, &mut storage, None::<fn(usize, usize)>);

            assert!(result.is_ok());
            let ids = result.unwrap();
            assert_eq!(ids.len(), 2);
            // [M2 fix] Verify graph state
            assert_eq!(index.node_count(), 2);
        }

        #[test]
        fn test_helper_dimensions() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.dimensions(), 128);
        }

        #[test]
        fn test_helper_len_empty() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.len(), 0);
            assert!(index.is_empty());
        }

        #[test]
        fn test_helper_capacity() {
            let (index, _storage) = create_test_index_and_storage(128);
            assert_eq!(index.capacity(), u32::MAX as usize);
        }

        #[test]
        fn test_helper_contains_id() {
            let (mut index, mut storage) = create_test_index_and_storage(4);

            // Initially empty
            assert!(!index.contains_id(1));

            // Insert via single insert
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            // The assigned ID should now be reported as present
            // (insert assigns sequential IDs starting at 1)
            assert_eq!(index.node_count(), 1);
            assert!(index.contains_id(id.0));
        }

1663 #[test]
1664 fn test_validate_vector_valid() {
1665 let vector = vec![1.0, 2.0, 3.0, 4.0];
1666 assert!(HnswIndex::validate_vector(&vector).is_none());
1667 }
1668
1669 #[test]
1670 fn test_validate_vector_nan() {
1671 let vector = vec![1.0, f32::NAN, 3.0, 4.0];
1672 let result = HnswIndex::validate_vector(&vector);
1673 assert!(result.is_some());
1674 assert!(result.unwrap().contains("NaN"));
1675 }
1676
1677 #[test]
1678 fn test_validate_vector_infinity() {
1679 let vector = vec![1.0, f32::INFINITY, 3.0, 4.0];
1680 let result = HnswIndex::validate_vector(&vector);
1681 assert!(result.is_some());
1682 assert!(result.unwrap().contains("Infinity"));
1683 }
1684 }
1685
1686 // ============================================================
1687 // SOFT DELETE TESTS (W16.2 - RFC-001)
1688 // ============================================================
1689
1690 mod delete_tests {
1691 use super::*;
1692
1693 fn create_test_index() -> (HnswIndex, VectorStorage) {
1694 let config = HnswConfig::new(4);
1695 let storage = VectorStorage::new(&config, None);
1696 let index = HnswIndex::new(config, &storage).unwrap();
1697 (index, storage)
1698 }
1699
1700 #[test]
1701 fn test_soft_delete_marks_node() {
1702 let (mut index, mut storage) = create_test_index();
1703 let vec = vec![1.0, 2.0, 3.0, 4.0];
1704 let id = index.insert(&vec, &mut storage).unwrap();
1705
1706 assert!(!index.is_deleted(id).unwrap());
1707 assert!(index.soft_delete(id).unwrap());
1708 assert!(index.is_deleted(id).unwrap());
1709 }
1710
1711 #[test]
1712 fn test_soft_delete_idempotent() {
1713 let (mut index, mut storage) = create_test_index();
1714 let vec = vec![1.0, 2.0, 3.0, 4.0];
1715 let id = index.insert(&vec, &mut storage).unwrap();
1716
1717 assert!(index.soft_delete(id).unwrap()); // First: true
1718 assert!(!index.soft_delete(id).unwrap()); // Second: false
1719 assert_eq!(index.deleted_count(), 1); // Still 1
1720 }
1721
1722 #[test]
1723 fn test_soft_delete_nonexistent_fails() {
1724 let (mut index, _storage) = create_test_index();
1725 let result = index.soft_delete(VectorId(999));
1726 assert!(result.is_err());
1727 assert!(matches!(result.unwrap_err(), GraphError::InvalidVectorId));
1728 }
1729
1730 #[test]
1731 fn test_deleted_count() {
1732 let (mut index, mut storage) = create_test_index();
1733
1734 // Insert 3 vectors
1735 let id1 = index.insert(&[1.0, 0.0, 0.0, 0.0], &mut storage).unwrap();
1736 let id2 = index.insert(&[0.0, 1.0, 0.0, 0.0], &mut storage).unwrap();
1737 let _id3 = index.insert(&[0.0, 0.0, 1.0, 0.0], &mut storage).unwrap();
1738
1739 assert_eq!(index.deleted_count(), 0);
1740 assert_eq!(index.node_count(), 3);
1741
1742 // Delete 2
1743 index.soft_delete(id1).unwrap();
1744 index.soft_delete(id2).unwrap();
1745
1746 assert_eq!(index.deleted_count(), 2);
1747 assert_eq!(index.live_count(), 1);
1748 }
1749
1750 #[test]
1751 fn test_tombstone_ratio() {
1752 let (mut index, mut storage) = create_test_index();
1753
1754 // Empty index
1755 assert_eq!(index.tombstone_ratio(), 0.0);
1756
1757 // Insert 4 vectors
1758 let mut ids = Vec::new();
1759 for i in 0..4 {
1760 let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
1761 ids.push(id);
1762 }
1763
1764 assert_eq!(index.tombstone_ratio(), 0.0);
1765
1766 // Delete 1 of 4 = 25%
1767 index.soft_delete(ids[0]).unwrap();
1768 assert!((index.tombstone_ratio() - 0.25).abs() < 0.01);
1769
1770 // Delete 2 of 4 = 50%
1771 index.soft_delete(ids[1]).unwrap();
1772 assert!((index.tombstone_ratio() - 0.50).abs() < 0.01);
1773 }

        #[test]
        fn test_is_deleted_nonexistent_fails() {
            let (index, _storage) = create_test_index();
            let result = index.is_deleted(VectorId(999));
            assert!(result.is_err());
            assert!(matches!(result.unwrap_err(), GraphError::InvalidVectorId));
        }

        #[test]
        fn test_live_count() {
            let (mut index, mut storage) = create_test_index();

            // Insert 5
            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 2
            index.soft_delete(ids[0]).unwrap();
            index.soft_delete(ids[1]).unwrap();

            assert_eq!(index.node_count(), 5);
            assert_eq!(index.deleted_count(), 2);
            assert_eq!(index.live_count(), 3);
        }

        #[test]
        fn test_get_node_by_vector_id_helper() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            // Should find existing node
            let node = index.get_node_by_vector_id(id);
            assert!(node.is_ok());
            assert_eq!(node.unwrap().vector_id, id);

            // Should fail for non-existent
            let bad = index.get_node_by_vector_id(VectorId(999));
            assert!(bad.is_err());
        }

        #[test]
        fn test_deleted_field_is_zero_by_default() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            let node = index.get_node_by_vector_id(id).unwrap();
            assert_eq!(node.deleted, 0);
        }

        #[test]
        fn test_deleted_field_set_to_one_after_delete() {
            let (mut index, mut storage) = create_test_index();
            let id = index.insert(&[1.0, 2.0, 3.0, 4.0], &mut storage).unwrap();

            index.soft_delete(id).unwrap();

            let node = index.get_node_by_vector_id(id).unwrap();
            assert_eq!(node.deleted, 1);
        }

        #[test]
        fn test_delete_invalid_vector_id_zero() {
            let (mut index, _storage) = create_test_index();
            // VectorId(0) is the INVALID sentinel
            let result = index.soft_delete(VectorId(0));
            assert!(result.is_err());
        }

        // ============================================================
        // ADJUSTED K TESTS (W16.3)
        // ============================================================

        #[test]
        fn test_adjusted_k_no_tombstones() {
            let (index, _storage) = create_test_index();
            // No deletions -> k unchanged
            assert_eq!(index.adjusted_k(10), 10);
            assert_eq!(index.adjusted_k(1), 1);
            assert_eq!(index.adjusted_k(100), 100);
        }

        #[test]
        fn test_adjusted_k_with_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert 10 vectors
            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 5 of 10 = 50% tombstones
            for &id in &ids[..5] {
                index.soft_delete(id).unwrap();
            }

            // 50% tombstones → 2x multiplier
            // adjusted_k(10) = ceil(10 / 0.5) = 20
            let adjusted = index.adjusted_k(10);
            assert!(
                (18..=22).contains(&adjusted),
                "Expected ~20, got {adjusted}"
            );
        }
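
        // A sketch of the adjustment the comments in these tests describe;
        // hypothetical, not the actual `HnswIndex::adjusted_k` (the rounding
        // mode and zero-live handling here are assumptions): scale k by
        // total/live nodes, then cap at 10x (MAX_ADJUSTED_K_MULTIPLIER).
        #[allow(dead_code)]
        fn adjusted_k_sketch(k: usize, node_count: usize, live_count: usize) -> usize {
            if live_count == 0 {
                return k; // all deleted: search returns empty anyway
            }
            // ceil(k * node_count / live_count), in integer arithmetic
            let scaled = (k * node_count).div_ceil(live_count);
            scaled.min(k * 10) // cap at the 10x multiplier
        }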

        #[test]
        fn test_adjusted_k_capped_at_10x() {
            let (mut index, mut storage) = create_test_index();

            // Insert 100 vectors
            let mut ids = Vec::new();
            for i in 0..100 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 95 of 100 = 95% tombstones
            for &id in &ids[..95] {
                index.soft_delete(id).unwrap();
            }

            // Should cap at 10x, not 20x
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 100, "Should be capped at 10x (100)");
        }
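
        // Worked arithmetic for the cap above (using the sketch formula, an
        // assumption): raw = ceil(10 * 100 / 5) = 200, i.e. a 20x multiplier,
        // which the cap clamps to 10 * 10 = 100.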

        #[test]
        fn test_adjusted_k_10_percent_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert 10 vectors
            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 1 of 10 = 10% tombstones
            index.soft_delete(ids[0]).unwrap();

            // 10% tombstones → ~1.11x multiplier
            // adjusted_k(10) = ceil(10 / 0.9) = ceil(11.11) = 12
            let adjusted = index.adjusted_k(10);
            assert!(
                (11..=13).contains(&adjusted),
                "Expected ~12, got {adjusted}"
            );
        }

        // ============================================================
        // BOUNDARY VALUE TESTS (m3 fix)
        // ============================================================

        #[test]
        fn test_adjusted_k_boundary_zero_tombstones() {
            let (mut index, mut storage) = create_test_index();

            // Insert vectors but delete none (0% tombstones)
            for i in 0..10 {
                index.insert(&[i as f32; 4], &mut storage).unwrap();
            }

            // 0% tombstones → no adjustment
            assert_eq!(index.adjusted_k(5), 5);
            assert_eq!(index.adjusted_k(10), 10);
            assert_eq!(index.tombstone_ratio(), 0.0);
        }

        #[test]
        fn test_adjusted_k_boundary_50_percent() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete exactly 5 of 10 = 50%
            for &id in &ids[..5] {
                index.soft_delete(id).unwrap();
            }

            // 50% tombstones → 2x
            // adjusted_k(10) = 10 * 10 / 5 = 20
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 20, "50% tombstones should give 2x");
            assert!((index.tombstone_ratio() - 0.5).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_boundary_90_percent() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..10 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 9 of 10 = 90%
            for &id in &ids[..9] {
                index.soft_delete(id).unwrap();
            }

            // 90% tombstones → 10x
            // adjusted_k(10) = 10 * 10 / 1 = 100, which exactly equals the
            // 10x cap, so capping does not change the result
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 100, "90% tombstones should give 10x (capped)");
            assert!((index.tombstone_ratio() - 0.9).abs() < 0.01);
        }

        #[test]
        fn test_adjusted_k_boundary_all_deleted() {
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete all = 100% tombstones
            for id in &ids {
                index.soft_delete(*id).unwrap();
            }

            // All deleted → returns k (search will return empty anyway)
            let adjusted = index.adjusted_k(10);
            assert_eq!(adjusted, 10, "All deleted should return original k");
            assert_eq!(index.live_count(), 0);
            assert!((index.tombstone_ratio() - 1.0).abs() < 0.01);
        }
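
        // The live_count() == 0 branch avoids a division by zero in the
        // scale factor; returning the caller's k unchanged is the assumed
        // behavior the test above pins down.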

        #[test]
        fn test_adjusted_k_large_k_small_index() {
            let (mut index, mut storage) = create_test_index();

            // Insert only 5 vectors
            let mut ids = Vec::new();
            for i in 0..5 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 2 = 40% tombstones
            index.soft_delete(ids[0]).unwrap();
            index.soft_delete(ids[1]).unwrap();

            // Request k=100, but only 5 vectors in index
            // adjusted_k(100) = 100 * 5 / 3 ≈ 166, well below the 10x cap
            // (1000), so the cap does not bind here.
            // Note: adjusted_k controls internal search effort, not the final
            // result count, so exceeding the index size is fine.
            let adjusted = index.adjusted_k(100);
            assert!(
                (166..=168).contains(&adjusted),
                "Should compute ~166 for 40% tombstones, got {adjusted}"
            );
        }

        #[test]
        fn test_adjusted_k_uses_constant() {
            // Verify MAX_ADJUSTED_K_MULTIPLIER is exposed with the expected value
            assert_eq!(HnswIndex::MAX_ADJUSTED_K_MULTIPLIER, 10);
        }
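
        // The constant feeds the cap exercised in the tests above: the
        // adjusted k never exceeds MAX_ADJUSTED_K_MULTIPLIER * k, however
        // high the tombstone ratio climbs.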

        #[test]
        fn test_adjusted_k_integer_precision() {
            // Test that integer arithmetic doesn't lose precision
            let (mut index, mut storage) = create_test_index();

            let mut ids = Vec::new();
            for i in 0..100 {
                let id = index.insert(&[i as f32; 4], &mut storage).unwrap();
                ids.push(id);
            }

            // Delete 33 of 100 = 33%
            for &id in &ids[..33] {
                index.soft_delete(id).unwrap();
            }

            // adjusted_k(10) = 10 * 100 / 67 ≈ 14.93 (a ~1.49x multiplier);
            // floor gives 14 and ceil gives 15, so accept either rounding
            let adjusted = index.adjusted_k(10);
            assert!(
                (14..=16).contains(&adjusted),
                "33% tombstones: expected ~15, got {adjusted}"
            );
        }
    }
}