Skip to main content

frankensearch_index/
lib.rs

1//! Vector index storage and loading for frankensearch.
2//!
3//! This crate implements the FSVI binary format reader/writer plus exact
4//! brute-force top-k vector search, with optional HNSW ANN acceleration.
5//!
6//! # FSVI File Layout
7//!
8//! All multi-byte integers are little-endian. The vector slab is 64-byte
9//! aligned for cache-line / SIMD friendliness.
10//!
11//! ```text
12//! ┌───────────────────────────────────────────┐
13//! │ Header (variable length)                  │
14//! │   magic: b"FSVI"              (4 bytes)   │
15//! │   version: u16                (2 bytes)   │
16//! │   embedder_id_len: u16        (2 bytes)   │
17//! │   embedder_id: [u8]           (variable)  │
18//! │   embedder_revision_len: u16  (2 bytes)   │
19//! │   embedder_revision: [u8]     (variable)  │
20//! │   dimension: u32              (4 bytes)   │
21//! │   quantization: u8            (1 byte)    │
22//! │   reserved: [u8; 3]           (3 bytes)   │
23//! │   record_count: u64           (8 bytes)   │
24//! │   vectors_offset: u64         (8 bytes)   │
25//! │   header_crc32: u32           (4 bytes)   │
26//! ├───────────────────────────────────────────┤
27//! │ Record Table                              │
28//! │   record_count × 16 bytes each:           │
29//! │     doc_id_hash: u64          (8 bytes)   │
30//! │     doc_id_offset: u32        (4 bytes)   │
31//! │     doc_id_len: u16           (2 bytes)   │
32//! │     flags: u16                (2 bytes)   │
33//! ├───────────────────────────────────────────┤
34//! │ String Table                              │
35//! │   Concatenated UTF-8 doc_id strings       │
36//! ├───────────────────────────────────────────┤
37//! │ Padding (to 64-byte alignment)            │
38//! ├───────────────────────────────────────────┤
39//! │ Vector Slab                               │
40//! │   record_count × dimension × elem_size    │
41//! │   (2 bytes/elem for f16, 4 for f32)       │
42//! └───────────────────────────────────────────┘
43//! ```
44
45#[cfg(feature = "ann")]
46pub mod hnsw;
47pub mod in_memory;
48pub mod mrl;
49pub mod quantization;
50mod repro_soft_delete_rollback;
51mod repro_wal_truncation;
52pub mod search;
53pub mod simd;
54pub mod two_tier;
55pub mod wal;
56pub mod warmup;
57
58use std::fs::{self, File, OpenOptions};
59use std::io::{BufWriter, Write};
60use std::path::{Path, PathBuf};
61use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
62
63use crc32fast::Hasher as Crc32;
64use frankensearch_core::{SearchError, SearchResult};
65use half::f16;
66use memmap2::MmapMut;
67use tracing::debug;
68
69#[cfg(feature = "ann")]
70pub use hnsw::{
71    AnnSearchStats, HNSW_DEFAULT_EF_CONSTRUCTION, HNSW_DEFAULT_EF_SEARCH, HNSW_DEFAULT_M,
72    HNSW_DEFAULT_MAX_LAYER, HnswConfig, HnswIndex,
73};
74pub use in_memory::{InMemoryTwoTierIndex, InMemoryVectorIndex};
75pub use mrl::{MrlConfig, MrlSearchStats};
76pub use quantization::ScalarQuantizer;
77pub use search::{PARALLEL_CHUNK_SIZE, PARALLEL_THRESHOLD, SearchParams};
78pub use simd::{
79    cosine_similarity_f16, dot_product_f16_bytes_f32, dot_product_f16_f32,
80    dot_product_f32_bytes_f32, dot_product_f32_f32,
81};
82pub use two_tier::{
83    TwoTierIndex, TwoTierIndexBuilder, VECTOR_INDEX_FALLBACK_FILENAME, VECTOR_INDEX_FAST_FILENAME,
84    VECTOR_INDEX_QUALITY_FILENAME,
85};
86#[cfg(feature = "ann")]
87pub use two_tier::{VECTOR_ANN_FAST_FILENAME, VECTOR_ANN_QUALITY_FILENAME};
88pub use wal::{CompactionStats, WalConfig, wal_path_for};
89pub use warmup::{AdaptiveConfig, HeatMap, WarmUpConfig, WarmUpResult, WarmUpStrategy};
90
/// Four-byte signature found at offset zero of every FSVI file.
pub const FSVI_MAGIC: [u8; 4] = [b'F', b'S', b'V', b'I'];

/// The FSVI format version this crate supports.
pub const FSVI_VERSION: u16 = 1;

/// Size of one record-table entry: hash (8) + offset (4) + length (2) + flags (2).
const RECORD_SIZE_BYTES: usize = 8 + 4 + 2 + 2;
/// Alignment, in bytes, of the vector slab within the file.
const VECTOR_ALIGN_BYTES: u64 = 64;
/// Bit in a record's `flags` field marking the record as soft-deleted.
const RECORD_FLAG_TOMBSTONE: u16 = 0x0001;
/// Tombstone ratio above which `VectorIndex::needs_vacuum` reports `true`.
const TOMBSTONE_VACUUM_THRESHOLD: f64 = 0.2;
101
/// Element encoding of the vectors stored in the FSVI slab.
///
/// The explicit `u8` discriminants are the same values that
/// `Quantization::from_wire` accepts when decoding the header byte.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[repr(u8)]
pub enum Quantization {
    /// 32-bit IEEE float per element (full precision).
    F32 = 0,
    /// 16-bit IEEE float per element (half precision).
    F16 = 1,
}
111
112impl Quantization {
113    pub(crate) fn from_wire(value: u8, path: &Path) -> SearchResult<Self> {
114        match value {
115            0 => Ok(Self::F32),
116            1 => Ok(Self::F16),
117            _ => Err(index_corrupted(
118                path,
119                format!("unsupported quantization byte: {value}"),
120            )),
121        }
122    }
123
124    const fn bytes_per_element(self) -> usize {
125        match self {
126            Self::F32 => 4,
127            Self::F16 => 2,
128        }
129    }
130}
131
132/// Parsed metadata from an FSVI file header.
133#[derive(Debug, Clone, PartialEq, Eq)]
134pub struct VectorMetadata {
135    /// Stable embedder id used to build the index.
136    pub embedder_id: String,
137    /// Model revision identifier (e.g. pinned commit hash).
138    pub embedder_revision: String,
139    /// Vector dimensionality.
140    pub dimension: usize,
141    /// Stored quantization.
142    pub quantization: Quantization,
143    /// Compaction generation counter (0-255) used for stale WAL detection.
144    pub compaction_gen: u8,
145    /// Number of records in the index.
146    pub record_count: usize,
147    /// Byte offset to the aligned vector slab.
148    pub vectors_offset: u64,
149}
150
/// Outcome of a [`VectorIndex::vacuum`] run.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct VacuumStats {
    /// Main-index record count when vacuum started.
    pub records_before: usize,
    /// Main-index record count once vacuum finished.
    pub records_after: usize,
    /// How many tombstoned records the vacuum dropped.
    pub tombstones_removed: usize,
    /// Approximate shrinkage of the main index file, in bytes.
    pub bytes_reclaimed: usize,
    /// Wall-clock time spent in the vacuum operation.
    pub duration: Duration,
}
165
/// One decoded 16-byte entry of the FSVI record table.
#[derive(Clone, Copy, Debug)]
pub(crate) struct RecordEntry {
    /// Hash of the document id; lookups compare it against `fnv1a_hash` of the id bytes.
    pub(crate) doc_id_hash: u64,
    /// Location of the document id string (presumably relative to the string
    /// table start; confirm against the record decoder).
    pub(crate) doc_id_offset: u32,
    /// Length of the document id string in bytes.
    pub(crate) doc_id_len: u16,
    /// Per-record flag bits; `RECORD_FLAG_TOMBSTONE` marks a soft-deleted record.
    pub(crate) flags: u16,
}
173
174#[derive(Debug)]
175pub struct VectorIndex {
176    pub(crate) path: PathBuf,
177    pub(crate) data: MmapMut,
178    pub(crate) metadata: VectorMetadata,
179    pub(crate) records_offset: usize,
180    pub(crate) strings_offset: usize,
181    pub(crate) vectors_offset: usize,
182    /// WAL entries for incremental updates (empty if no WAL exists).
183    pub(crate) wal_entries: Vec<wal::WalEntry>,
184    /// WAL configuration.
185    wal_config: WalConfig,
186}
187
188impl VectorIndex {
189    /// Open an existing FSVI index from disk.
190    ///
191    /// # Errors
192    ///
193    /// Returns `SearchError::IndexNotFound` if the file does not exist and
194    /// `SearchError::IndexCorrupted` when header/layout validation fails.
195    #[allow(unsafe_code, clippy::too_many_lines)] // MmapMut::map_mut requires unsafe for memory-mapped I/O.
196    pub fn open(path: &Path) -> SearchResult<Self> {
197        if !path.exists() {
198            return Err(SearchError::IndexNotFound {
199                path: path.to_path_buf(),
200            });
201        }
202
203        let file = OpenOptions::new()
204            .read(true)
205            .write(true)
206            .open(path)
207            .map_err(SearchError::Io)?;
208        let data = unsafe { MmapMut::map_mut(&file).map_err(SearchError::Io)? };
209        let (metadata, header_len) = parse_header(path, &data)?;
210
211        let records_bytes = metadata
212            .record_count
213            .checked_mul(RECORD_SIZE_BYTES)
214            .ok_or_else(|| index_corrupted(path, "record table size overflow"))?;
215        let records_offset = header_len;
216        let strings_offset = records_offset
217            .checked_add(records_bytes)
218            .ok_or_else(|| index_corrupted(path, "record table offset overflow"))?;
219        let vectors_offset = usize::try_from(metadata.vectors_offset)
220            .map_err(|_| index_corrupted(path, "vectors_offset does not fit in usize"))?;
221        if vectors_offset < strings_offset {
222            return Err(index_corrupted(
223                path,
224                "vectors_offset points inside the record table/string table region",
225            ));
226        }
227
228        let vector_bytes = metadata
229            .record_count
230            .checked_mul(metadata.dimension)
231            .and_then(|v| v.checked_mul(metadata.quantization.bytes_per_element()))
232            .ok_or_else(|| index_corrupted(path, "vector slab size overflow"))?;
233        let required_len = vectors_offset
234            .checked_add(vector_bytes)
235            .ok_or_else(|| index_corrupted(path, "vector slab end overflow"))?;
236        if data.len() < required_len {
237            return Err(index_corrupted(
238                path,
239                format!(
240                    "truncated file: have {} bytes, need at least {} bytes",
241                    data.len(),
242                    required_len
243                ),
244            ));
245        }
246
247        let warm_up_config = WarmUpConfig::from_env();
248        if !matches!(warm_up_config.strategy, WarmUpStrategy::None) {
249            let warm_up = warmup::warm_up_bytes(&data, header_len, &warm_up_config, None);
250            debug!(
251                target: "frankensearch.warmup",
252                path = %path.display(),
253                strategy = %warm_up.strategy_name,
254                pages_touched = warm_up.pages_touched,
255                bytes_touched = warm_up.bytes_touched,
256                budget_exhausted = warm_up.budget_exhausted,
257                "index warm-up complete"
258            );
259        }
260
261        // Load WAL entries if a sidecar file exists.
262        let wal_path = wal::wal_path_for(path);
263        let (wal_entries_raw, wal_compaction_gen, valid_len) =
264            wal::read_wal(&wal_path, metadata.dimension, metadata.quantization)?;
265
266        let mut deduped_wal = Vec::with_capacity(wal_entries_raw.len());
267        let mut seen_ids = std::collections::HashSet::new();
268        for entry in wal_entries_raw.into_iter().rev() {
269            if seen_ids.insert(entry.doc_id.clone()) {
270                deduped_wal.push(entry);
271            }
272        }
273        deduped_wal.reverse();
274        let mut wal_entries = deduped_wal;
275
276        let is_stale = if valid_len > 0 {
277            if wal_compaction_gen == 0 {
278                metadata.compaction_gen > 0
279            } else {
280                let expected = next_generation(metadata.compaction_gen);
281                wal_compaction_gen != expected
282            }
283        } else {
284            false
285        };
286
287        if is_stale {
288            tracing::warn!(
289                path = %path.display(),
290                main_gen = metadata.compaction_gen,
291                wal_gen = wal_compaction_gen,
292                "discarding stale/mismatched WAL entries and removing file"
293            );
294            wal_entries.clear();
295            if wal_path.exists() {
296                let _ = std::fs::remove_file(&wal_path);
297            }
298        } else if wal_path.exists() {
299            let actual_len = std::fs::metadata(&wal_path).map_err(SearchError::Io)?.len();
300            if actual_len > valid_len {
301                tracing::warn!(
302                    path = %wal_path.display(),
303                    actual_len,
304                    valid_len,
305                    "truncating corrupted WAL trailer"
306                );
307                let file = OpenOptions::new()
308                    .write(true)
309                    .open(&wal_path)
310                    .map_err(SearchError::Io)?;
311                file.set_len(valid_len).map_err(SearchError::Io)?;
312                file.sync_all().map_err(SearchError::Io)?;
313            }
314        }
315
316        Ok(Self {
317            path: path.to_path_buf(),
318            data,
319            metadata,
320            records_offset,
321            strings_offset,
322            vectors_offset,
323            wal_entries,
324            wal_config: WalConfig::default(),
325        })
326    }
327
328    /// Create a writer that stores vectors as f16 with an empty revision string.
329    ///
330    /// # Errors
331    ///
332    /// Returns `SearchError::InvalidConfig` when arguments are invalid
333    /// (for example, zero dimension or oversized header fields).
334    pub fn create(
335        path: &Path,
336        embedder_id: &str,
337        dimension: usize,
338    ) -> SearchResult<VectorIndexWriter> {
339        Self::create_with_revision(path, embedder_id, "", dimension, Quantization::F16)
340    }
341
342    /// Create a writer with explicit embedder revision and quantization.
343    ///
344    /// # Errors
345    ///
346    /// Returns `SearchError::InvalidConfig` when arguments are invalid
347    /// (for example, zero dimension or oversized header fields).
348    pub fn create_with_revision(
349        path: &Path,
350        embedder_id: &str,
351        embedder_revision: &str,
352        dimension: usize,
353        quantization: Quantization,
354    ) -> SearchResult<VectorIndexWriter> {
355        if dimension == 0 {
356            return Err(SearchError::InvalidConfig {
357                field: "dimension".to_owned(),
358                value: "0".to_owned(),
359                reason: "dimension must be greater than zero".to_owned(),
360            });
361        }
362        validate_header_string(embedder_id, "embedder_id")?;
363        validate_header_string(embedder_revision, "embedder_revision")?;
364        let _ = u32::try_from(dimension).map_err(|_| SearchError::InvalidConfig {
365            field: "dimension".to_owned(),
366            value: dimension.to_string(),
367            reason: "dimension must fit in u32 for FSVI header encoding".to_owned(),
368        })?;
369
370        Ok(VectorIndexWriter {
371            path: path.to_path_buf(),
372            embedder_id: embedder_id.to_owned(),
373            embedder_revision: embedder_revision.to_owned(),
374            dimension,
375            quantization,
376            compaction_gen: 1,
377            records: Vec::new(),
378        })
379    }
380
381    /// Number of vectors in this index.
382    #[must_use]
383    pub const fn record_count(&self) -> usize {
384        self.metadata.record_count
385    }
386
387    /// Embedding dimensionality.
388    #[must_use]
389    pub const fn dimension(&self) -> usize {
390        self.metadata.dimension
391    }
392
393    /// Embedder id stored in the index header.
394    #[must_use]
395    pub fn embedder_id(&self) -> &str {
396        &self.metadata.embedder_id
397    }
398
399    /// Embedder revision stored in the index header.
400    #[must_use]
401    pub fn embedder_revision(&self) -> &str {
402        &self.metadata.embedder_revision
403    }
404
405    /// Stored quantization.
406    #[must_use]
407    pub const fn quantization(&self) -> Quantization {
408        self.metadata.quantization
409    }
410
411    /// Full parsed metadata.
412    #[must_use]
413    pub const fn metadata(&self) -> &VectorMetadata {
414        &self.metadata
415    }
416
417    // ─── WAL / Incremental Update API ───────────────────────────────────
418
419    /// Set the WAL configuration for incremental updates.
420    pub const fn set_wal_config(&mut self, config: WalConfig) {
421        self.wal_config = config;
422    }
423
424    /// Number of entries in the write-ahead log (pending compaction).
425    #[must_use]
426    pub const fn wal_record_count(&self) -> usize {
427        self.wal_entries.len()
428    }
429
430    /// Whether the WAL is large enough that compaction is recommended.
431    ///
432    /// Returns `true` when the WAL exceeds either the absolute threshold
433    /// or the ratio threshold relative to the main index size.
434    #[must_use]
435    pub fn needs_compaction(&self) -> bool {
436        if self.wal_entries.is_empty() {
437            return false;
438        }
439        if self.wal_entries.len() >= self.wal_config.compaction_threshold {
440            return true;
441        }
442        if self.record_count() > 0 {
443            #[allow(clippy::cast_precision_loss)]
444            let ratio = self.wal_entries.len() as f64 / self.record_count() as f64;
445            // NaN compaction_ratio makes >= always false, silently disabling
446            // ratio-based compaction. Fall back to the default.
447            let threshold = if self.wal_config.compaction_ratio.is_finite() {
448                self.wal_config.compaction_ratio
449            } else {
450                0.10
451            };
452            if ratio >= threshold {
453                return true;
454            }
455        }
456        false
457    }
458
459    /// Tombstone (soft-delete) a document by `doc_id`.
460    ///
461    /// Returns `Ok(true)` when a live record was marked deleted, and `Ok(false)`
462    /// when the document does not exist or is already tombstoned.
463    ///
464    /// # Errors
465    ///
466    /// Returns `SearchError::Io` for filesystem write/sync failures and
467    /// `SearchError::IndexCorrupted` if the on-disk record table is malformed.
468    pub fn soft_delete(&mut self, doc_id: &str) -> SearchResult<bool> {
469        self.soft_delete_batch(&[doc_id]).map(|count| count > 0)
470    }
471
472    /// Tombstone a batch of document ids.
473    ///
474    /// Returns the number of records that transitioned from live -> deleted.
475    ///
476    /// # Errors
477    ///
478    /// Returns the first IO/corruption error encountered while updating flags.
479    pub fn soft_delete_batch(&mut self, doc_ids: &[&str]) -> SearchResult<usize> {
480        let mut deleted = 0usize;
481        let mut wal_changed = false;
482
483        // Track modified main index entries for potential rollback
484        let mut modified_main_entries = Vec::new();
485
486        // Use a fast lookup for WAL entries to delete
487        let mut to_delete_set = std::collections::HashSet::with_capacity(doc_ids.len());
488        for &id in doc_ids {
489            to_delete_set.insert(id);
490        }
491
492        // 1. Mark all matching records in the main index as tombstoned.
493        for &doc_id in doc_ids {
494            let doc_id_hash = fnv1a_hash(doc_id.as_bytes());
495            if let Some(mut index) = self.find_first_hash_match(doc_id_hash)? {
496                while index > 0 {
497                    let prev = self.record_at(index - 1)?;
498                    if prev.doc_id_hash != doc_id_hash {
499                        break;
500                    }
501                    index -= 1;
502                }
503
504                for candidate in index..self.record_count() {
505                    let entry = self.record_at(candidate)?;
506                    if entry.doc_id_hash != doc_id_hash {
507                        break;
508                    }
509                    if !is_tombstoned_flags(entry.flags) {
510                        let candidate_doc_id = self.doc_id_at(candidate)?;
511                        if candidate_doc_id == doc_id {
512                            let flags = entry.flags | RECORD_FLAG_TOMBSTONE;
513                            self.set_record_flags(candidate, flags)?;
514                            modified_main_entries.push((candidate, entry.flags));
515                            deleted += 1;
516                        }
517                    }
518                }
519            }
520        }
521
522        // 2. Remove all matching records from WAL entries.
523        let original_wal_len = self.wal_entries.len();
524        let filtered: Vec<wal::WalEntry> = self
525            .wal_entries
526            .iter()
527            .filter(|entry| !to_delete_set.contains(entry.doc_id.as_str()))
528            .cloned()
529            .collect();
530
531        let prev_wal = if filtered.len() < original_wal_len {
532            deleted += original_wal_len - filtered.len();
533            wal_changed = true;
534            std::mem::replace(&mut self.wal_entries, filtered)
535        } else {
536            Vec::new()
537        };
538
539        // 3. Rewrite WAL sidecar once if anything was removed.
540        if wal_changed {
541            if let Err(err) = self.rewrite_wal_sidecar() {
542                self.wal_entries = prev_wal;
543                // Rollback main index modifications
544                for (candidate, original_flags) in modified_main_entries {
545                    if let Err(rollback_err) = self.set_record_flags(candidate, original_flags) {
546                        tracing::error!(
547                            error = %rollback_err,
548                            candidate,
549                            "failed to rollback main index flag during soft_delete_batch failure"
550                        );
551                    }
552                }
553                tracing::error!(
554                    error = %err,
555                    "failed to rewrite WAL sidecar during batch delete"
556                );
557                return Err(err);
558            }
559        }
560
561        Ok(deleted)
562    }
563
564    /// Whether the record at `record_index` is tombstoned.
565    #[must_use]
566    pub fn is_deleted(&self, record_index: usize) -> bool {
567        matches!(
568            self.record_at(record_index),
569            Ok(entry) if is_tombstoned_flags(entry.flags)
570        )
571    }
572
573    /// Number of tombstoned records in the main index.
574    #[must_use]
575    pub fn tombstone_count(&self) -> usize {
576        (0..self.record_count())
577            .filter(|&index| self.is_deleted(index))
578            .count()
579    }
580
581    /// Fraction of records that are tombstoned (`tombstones / record_count`).
582    #[must_use]
583    #[allow(clippy::cast_precision_loss)]
584    pub fn tombstone_ratio(&self) -> f64 {
585        if self.record_count() == 0 {
586            return 0.0;
587        }
588        self.tombstone_count() as f64 / self.record_count() as f64
589    }
590
591    /// Whether the tombstone ratio exceeds the default vacuum threshold.
592    #[must_use]
593    pub fn needs_vacuum(&self) -> bool {
594        self.tombstone_ratio() > TOMBSTONE_VACUUM_THRESHOLD
595    }
596
597    /// Rewrite the main index file without tombstoned records.
598    ///
599    /// WAL entries are preserved and reloaded after the rewrite.
600    ///
601    /// # Errors
602    ///
603    /// Returns `SearchError::Io` for filesystem failures and
604    /// `SearchError::IndexCorrupted` for malformed data.
605    pub fn vacuum(&mut self) -> SearchResult<VacuumStats> {
606        let start = Instant::now();
607        let records_before = self.record_count();
608        let bytes_before = self.data.len();
609        let tombstones_before = self.tombstone_count();
610
611        if records_before == 0 || tombstones_before == 0 {
612            return Ok(VacuumStats {
613                records_before,
614                records_after: records_before,
615                tombstones_removed: 0,
616                bytes_reclaimed: 0,
617                duration: start.elapsed(),
618            });
619        }
620
621        // Collect live entries from main index.
622        let mut sources = Vec::with_capacity(records_before - tombstones_before);
623        for index in 0..records_before {
624            if !self.is_deleted(index) {
625                sources.push(MergeSource::Main(index));
626            }
627        }
628
629        self.rewrite_index(&sources, self.metadata.compaction_gen)?;
630
631        let records_after = self.record_count();
632        let bytes_reclaimed = bytes_before.saturating_sub(self.data.len());
633        Ok(VacuumStats {
634            records_before,
635            records_after,
636            tombstones_removed: records_before.saturating_sub(records_after),
637            bytes_reclaimed,
638            duration: start.elapsed(),
639        })
640    }
641
642    /// Append a single vector to the index via the WAL.
643    ///
644    /// The vector is immediately searchable. It is written to the WAL
645    /// sidecar file for crash safety.
646    ///
647    /// # Errors
648    ///
649    /// Returns `SearchError::DimensionMismatch` for wrong embedding lengths
650    /// and `SearchError::Io` for filesystem failures.
651    pub fn append(&mut self, doc_id: &str, vector: &[f32]) -> SearchResult<()> {
652        self.append_batch(&[(doc_id.to_owned(), vector.to_vec())])
653    }
654
655    /// Append a batch of vectors to the index via the WAL.
656    ///
657    /// All vectors in the batch are written atomically to a single WAL
658    /// batch (one CRC covers the whole batch).
659    ///
660    /// # Errors
661    ///
662    /// Returns `SearchError::DimensionMismatch` for wrong embedding lengths,
663    /// `SearchError::InvalidConfig` for invalid values, and
664    /// `SearchError::Io` for filesystem failures.
665    pub fn append_batch(&mut self, entries: &[(String, Vec<f32>)]) -> SearchResult<()> {
666        if entries.is_empty() {
667            return Ok(());
668        }
669
670        // Validate all entries before writing anything.
671        for (doc_id, vector) in entries {
672            if vector.len() != self.dimension() {
673                return Err(SearchError::DimensionMismatch {
674                    expected: self.dimension(),
675                    found: vector.len(),
676                });
677            }
678            if vector.iter().any(|v| !v.is_finite()) {
679                return Err(SearchError::InvalidConfig {
680                    field: "embedding".to_owned(),
681                    value: "<contains non-finite values>".to_owned(),
682                    reason: "all embedding values must be finite".to_owned(),
683                });
684            }
685            let _ = u16::try_from(doc_id.len()).map_err(|_| SearchError::InvalidConfig {
686                field: "doc_id".to_owned(),
687                value: doc_id.clone(),
688                reason: "doc_id byte length must fit in u16".to_owned(),
689            })?;
690        }
691
692        // Soft-delete any existing entries (main or WAL) for these documents
693        // so that the newly appended WAL entries replace them entirely.
694        let doc_ids: Vec<&str> = entries.iter().map(|(id, _)| id.as_str()).collect();
695        self.soft_delete_batch(&doc_ids)?;
696
697        let mut wal_entries: Vec<wal::WalEntry> = Vec::with_capacity(entries.len());
698        let mut seen = std::collections::HashSet::new();
699        for (doc_id, embedding) in entries.iter().rev() {
700            if seen.insert(doc_id) {
701                wal_entries.push(wal::WalEntry {
702                    doc_id: doc_id.clone(),
703                    doc_id_hash: fnv1a_hash(doc_id.as_bytes()),
704                    embedding: embedding.clone(),
705                });
706            }
707        }
708        wal_entries.reverse();
709
710        // Write to WAL file.
711        let wal_path = wal::wal_path_for(&self.path);
712        wal::append_wal_batch(
713            &wal_path,
714            &wal_entries,
715            self.dimension(),
716            self.quantization(),
717            next_generation(self.metadata.compaction_gen),
718            self.wal_config.fsync_on_write,
719        )?;
720
721        // Deduplicate existing WAL entries by doc_id before extending.
722        for new_entry in &wal_entries {
723            self.wal_entries
724                .retain(|existing| existing.doc_id != new_entry.doc_id);
725        }
726        // Add to in-memory entries (immediately searchable).
727        self.wal_entries.extend(wal_entries.clone());
728
729        // BEST-EFFORT: Tombstone the old main index entries so they don't pollute the top-K heap.
730        // If this crashes before completing, it's fine; they will be resolved out later (though they might steal a top-K slot temporarily).
731        for entry in &wal_entries {
732            let hash = entry.doc_id_hash;
733            if let Ok(Some(mut index)) = self.find_first_hash_match(hash) {
734                while index > 0 {
735                    if let Ok(prev) = self.record_at(index - 1) {
736                        if prev.doc_id_hash != hash {
737                            break;
738                        }
739                        index -= 1;
740                    } else {
741                        break;
742                    }
743                }
744                for candidate in index..self.record_count() {
745                    if let Ok(rec) = self.record_at(candidate) {
746                        if rec.doc_id_hash != hash {
747                            break;
748                        }
749                        if !is_tombstoned_flags(rec.flags) {
750                            if let Ok(candidate_doc_id) = self.doc_id_at(candidate) {
751                                if candidate_doc_id == entry.doc_id {
752                                    let flags = rec.flags | RECORD_FLAG_TOMBSTONE;
753                                    if let Err(err) = self.set_record_flags(candidate, flags) {
754                                        tracing::warn!(
755                                            target: "frankensearch.index",
756                                            path = %self.path.display(),
757                                            candidate_index = candidate,
758                                            doc_id = %entry.doc_id,
759                                            error = %err,
760                                            "WAL replay: failed to tombstone superseded record; \
761                                             duplicate may persist until next compaction"
762                                        );
763                                    }
764                                    break;
765                                }
766                            }
767                        }
768                    } else {
769                        break;
770                    }
771                }
772            }
773        }
774
775        debug!(
776            target: "frankensearch.index",
777            path = %self.path.display(),
778            batch_size = entries.len(),
779            wal_total = self.wal_entries.len(),
780            "appended to WAL"
781        );
782        Ok(())
783    }
784
785    /// Compact the WAL into the main index.
786    ///
787    /// Rewrites the main index file with all main + WAL records merged,
788    /// then removes the WAL sidecar. The index is atomically swapped
789    /// (write to tmp, rename over original).
790    ///
791    /// # Errors
792    ///
793    /// Returns `SearchError::Io` for filesystem failures and
794    /// `SearchError::InvalidConfig` for encoding issues.
795    #[allow(clippy::cast_precision_loss)]
796    pub fn compact(&mut self) -> SearchResult<CompactionStats> {
797        let start = Instant::now();
798        let main_before = self.record_count();
799        let wal_count = self.wal_entries.len();
800
801        if wal_count == 0 {
802            return Ok(CompactionStats {
803                main_records_before: main_before,
804                wal_records: 0,
805                total_records_after: main_before,
806                elapsed_ms: 0.0,
807            });
808        }
809
810        let deduped_sources = (|| -> SearchResult<Vec<MergeSource>> {
811            #[derive(Clone, Copy)]
812            struct SortKey<'a> {
813                doc_id_hash: u64,
814                doc_id: &'a str,
815            }
816
817            #[derive(Clone, Copy)]
818            struct KeyedSource<'a> {
819                key: SortKey<'a>,
820                source: MergeSource,
821            }
822
823            // Collect all sources with their sort keys.
824            let mut keyed_sources = Vec::with_capacity(main_before + wal_count);
825            for i in 0..main_before {
826                if !self.is_deleted(i) {
827                    let entry = self.record_at(i)?;
828                    let doc_id = self.doc_id_at(i)?;
829                    keyed_sources.push(KeyedSource {
830                        key: SortKey {
831                            doc_id_hash: entry.doc_id_hash,
832                            doc_id,
833                        },
834                        source: MergeSource::Main(i),
835                    });
836                }
837            }
838            for (idx, entry) in self.wal_entries.iter().enumerate() {
839                keyed_sources.push(KeyedSource {
840                    key: SortKey {
841                        doc_id_hash: entry.doc_id_hash,
842                        doc_id: &entry.doc_id,
843                    },
844                    source: MergeSource::Wal(idx),
845                });
846            }
847
848            // Sort to ensure binary search property.
849            keyed_sources.sort_by(|a, b| {
850                a.key
851                    .doc_id_hash
852                    .cmp(&b.key.doc_id_hash)
853                    .then(a.key.doc_id.cmp(b.key.doc_id))
854            });
855
856            // Deduplicate sources by doc_id, keeping the latest (WAL over Main).
857            // Since `keyed_sources` is sorted, duplicates are adjacent and the stable sort
858            // ensures that newer sources (WAL) appear after older sources (Main).
859            let mut deduped: Vec<KeyedSource<'_>> = Vec::with_capacity(keyed_sources.len());
860            for item in keyed_sources {
861                if let Some(last) = deduped.last_mut() {
862                    if item.key.doc_id_hash == last.key.doc_id_hash
863                        && item.key.doc_id == last.key.doc_id
864                    {
865                        // Overwrite the older entry with the newer one
866                        *last = item;
867                        continue;
868                    }
869                }
870                deduped.push(item);
871            }
872
873            Ok(deduped
874                .into_iter()
875                .map(|item| item.source)
876                .collect::<Vec<_>>())
877        })()?;
878
879        // Perform the rewrite.
880        self.rewrite_index(
881            &deduped_sources,
882            next_generation(self.metadata.compaction_gen),
883        )?;
884
885        // After rewrite_index succeeds, clear in-memory WAL state immediately
886        // (the data is now in the main index). If remove_wal fails, the stale
887        // WAL file on disk will be detected and discarded on next open() via
888        // the generation counter.
889        self.wal_entries.clear();
890
891        // Then try to remove the WAL file (best-effort).
892        let wal_path = wal::wal_path_for(&self.path);
893        if let Err(e) = wal::remove_wal(&wal_path) {
894            tracing::warn!("failed to remove WAL file after compaction: {e}");
895        }
896
897        let elapsed = start.elapsed();
898        let stats = CompactionStats {
899            main_records_before: main_before,
900            wal_records: wal_count,
901            total_records_after: self.record_count(),
902            elapsed_ms: elapsed.as_secs_f64() * 1000.0,
903        };
904
905        debug!(
906            target: "frankensearch.index",
907            path = %self.path.display(),
908            main_before,
909            wal_count,
910            total_after = stats.total_records_after,
911            elapsed_ms = format_args!("{:.1}", stats.elapsed_ms),
912            "compaction complete"
913        );
914        Ok(stats)
915    }
916
917    fn resolve_sort_key<'a>(&'a self, source: &MergeSource) -> SearchResult<(u64, &'a str)> {
918        match source {
919            MergeSource::Main(idx) => {
920                let entry = self.record_at(*idx)?;
921                let id = self.doc_id_at(*idx)?;
922                Ok((entry.doc_id_hash, id))
923            }
924            MergeSource::Wal(idx) => {
925                let entry = &self.wal_entries[*idx];
926                Ok((entry.doc_id_hash, &entry.doc_id))
927            }
928        }
929    }
930
931    #[allow(clippy::too_many_lines)]
932    fn rewrite_index(&mut self, sources: &[MergeSource], new_gen: u8) -> SearchResult<()> {
933        let record_count = sources.len();
934        let records_bytes = record_count.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
935            SearchError::InvalidConfig {
936                field: "record_count".to_owned(),
937                value: record_count.to_string(),
938                reason: "record table size overflow".to_owned(),
939            }
940        })?;
941        let records_bytes_u64 =
942            u64::try_from(records_bytes).map_err(|_| SearchError::InvalidConfig {
943                field: "record_count".to_owned(),
944                value: record_count.to_string(),
945                reason: "record table size does not fit in u64".to_owned(),
946            })?;
947
948        // Pass 1: Build Record Table and calculate layout.
949        // We buffer the Record Table in memory (16 bytes * N).
950        // 10M records = 160MB, which is acceptable.
951        let mut record_table = Vec::with_capacity(records_bytes);
952        let mut current_string_offset = 0u32;
953        let mut string_table_len = 0u64;
954
955        for source in sources {
956            let (doc_id_hash, doc_id) = self.resolve_sort_key(source)?;
957            let doc_id_len = doc_id.len();
958
959            // Validation
960            let len_u16 = u16::try_from(doc_id_len).map_err(|_| SearchError::InvalidConfig {
961                field: "doc_id_len".to_owned(),
962                value: doc_id_len.to_string(),
963                reason: "doc_id length exceeds u16".to_owned(),
964            })?;
965            let len_u32 = u32::from(len_u16);
966            let len_u64 = u64::from(len_u16);
967            if current_string_offset.checked_add(len_u32).is_none() {
968                return Err(SearchError::InvalidConfig {
969                    field: "doc_id_offset".to_owned(),
970                    value: "overflow".to_owned(),
971                    reason: "string table offset exceeds u32".to_owned(),
972                });
973            }
974
975            // Append to record table
976            record_table.extend_from_slice(&doc_id_hash.to_le_bytes());
977            record_table.extend_from_slice(&current_string_offset.to_le_bytes());
978            record_table.extend_from_slice(&len_u16.to_le_bytes());
979            record_table.extend_from_slice(&0u16.to_le_bytes()); // Flags cleared (tombstones gone)
980
981            current_string_offset += len_u32;
982            string_table_len += len_u64;
983        }
984
985        // Calculate layout
986        let provisional_header = build_header_prefix(
987            &self.metadata.embedder_id,
988            &self.metadata.embedder_revision,
989            self.dimension(),
990            self.quantization(),
991            new_gen,
992            record_count,
993            0,
994        )?;
995        let header_len = provisional_header.len() + 4; // + CRC
996        let header_len_u64 = u64::try_from(header_len).map_err(|_| SearchError::InvalidConfig {
997            field: "header".to_owned(),
998            value: header_len.to_string(),
999            reason: "header length does not fit in u64".to_owned(),
1000        })?;
1001
1002        let pre_vector = header_len_u64
1003            .checked_add(records_bytes_u64)
1004            .and_then(|v| v.checked_add(string_table_len))
1005            .ok_or_else(|| SearchError::InvalidConfig {
1006                field: "layout".to_owned(),
1007                value: "overflow".to_owned(),
1008                reason: "layout offset overflow".to_owned(),
1009            })?;
1010
1011        let vectors_offset = align_up(pre_vector, VECTOR_ALIGN_BYTES)?;
1012        let padding_len = usize::try_from(vectors_offset - pre_vector).map_err(|_| {
1013            SearchError::InvalidConfig {
1014                field: "padding_len".to_owned(),
1015                value: (vectors_offset - pre_vector).to_string(),
1016                reason: "padding length exceeds usize".to_owned(),
1017            }
1018        })?;
1019
1020        // Open temp file
1021        let tmp_path = temporary_output_path(&self.path);
1022
1023        // Helper: perform all I/O into tmp_path, rename atomically, and reload.
1024        // If anything fails after the temp file is created, we clean it up.
1025        let result = (|| -> SearchResult<()> {
1026            let mut file = OpenOptions::new()
1027                .create(true)
1028                .truncate(true)
1029                .write(true)
1030                .open(&tmp_path)?;
1031            {
1032                let mut writer = BufWriter::with_capacity(256 * 1024, &mut file);
1033
1034                // Pass 2: Write Header and Record Table
1035                let mut header_prefix = build_header_prefix(
1036                    &self.metadata.embedder_id,
1037                    &self.metadata.embedder_revision,
1038                    self.dimension(),
1039                    self.quantization(),
1040                    new_gen,
1041                    record_count,
1042                    vectors_offset,
1043                )?;
1044                let header_crc = crc32(&header_prefix);
1045                header_prefix.extend_from_slice(&header_crc.to_le_bytes());
1046
1047                writer.write_all(&header_prefix)?;
1048                writer.write_all(&record_table)?;
1049
1050                // Pass 3: Write String Table
1051                for source in sources {
1052                    let (_, doc_id) = self.resolve_sort_key(source)?;
1053                    writer.write_all(doc_id.as_bytes())?;
1054                }
1055
1056                // Padding
1057                if padding_len > 0 {
1058                    writer.write_all(&vec![0u8; padding_len])?;
1059                }
1060
1061                // Pass 4: Write Vectors
1062                match self.quantization() {
1063                    Quantization::F16 => {
1064                        for source in sources {
1065                            match source {
1066                                MergeSource::Main(idx) => {
1067                                    // Fast path: copy raw bytes
1068                                    let start = self.vector_start(*idx)?;
1069                                    let len = self.dimension() * 2;
1070                                    let bytes = &self.data[start..start + len];
1071                                    writer.write_all(bytes)?;
1072                                }
1073                                MergeSource::Wal(idx) => {
1074                                    // Slow path: encode
1075                                    let entry = &self.wal_entries[*idx];
1076                                    for &val in &entry.embedding {
1077                                        writer.write_all(&f16::from_f32(val).to_le_bytes())?;
1078                                    }
1079                                }
1080                            }
1081                        }
1082                    }
1083                    Quantization::F32 => {
1084                        for source in sources {
1085                            match source {
1086                                MergeSource::Main(idx) => {
1087                                    // Fast path: copy raw bytes
1088                                    let start = self.vector_start(*idx)?;
1089                                    let len = self.dimension() * 4;
1090                                    let bytes = &self.data[start..start + len];
1091                                    writer.write_all(bytes)?;
1092                                }
1093                                MergeSource::Wal(idx) => {
1094                                    // Slow path: encode
1095                                    let entry = &self.wal_entries[*idx];
1096                                    for &val in &entry.embedding {
1097                                        writer.write_all(&val.to_le_bytes())?;
1098                                    }
1099                                }
1100                            }
1101                        }
1102                    }
1103                }
1104                writer.flush()?;
1105            }
1106
1107            file.sync_all()?;
1108            fs::rename(&tmp_path, &self.path)?;
1109            sync_parent_directory(&self.path)?;
1110            Ok(())
1111        })();
1112
1113        if result.is_err() {
1114            // Clean up the temp file on error (best-effort).
1115            if tmp_path.exists() {
1116                if let Err(cleanup_err) = fs::remove_file(&tmp_path) {
1117                    tracing::warn!(
1118                        "failed to clean up temp file {} after rewrite error: {cleanup_err}",
1119                        tmp_path.display()
1120                    );
1121                }
1122            }
1123        }
1124        result?;
1125
1126        // Reload
1127        let config = self.wal_config.clone();
1128        let reloaded = Self::open(&self.path)?;
1129        self.data = reloaded.data;
1130        self.metadata = reloaded.metadata;
1131        self.records_offset = reloaded.records_offset;
1132        self.strings_offset = reloaded.strings_offset;
1133        self.vectors_offset = reloaded.vectors_offset;
1134        // WAL entries are cleared by caller if compacting, or preserved if vacuuming
1135        // But vacuum preserves WAL on disk, so open() loads them.
1136        // Vacuum caller ignores the reloaded WAL entries? No, vacuum preserves them.
1137        // self.vacuum() impl:
1138        //   writer.finish()
1139        //   Self::open() -> loads WAL entries
1140        //   self.wal_entries = reloaded.wal_entries
1141        // So we need to update self.wal_entries from reloaded.
1142        self.wal_entries = reloaded.wal_entries;
1143        self.wal_config = config;
1144
1145        Ok(())
1146    }
1147
1148    /// Resolve the document id at `index`.
1149    ///
1150    /// # Errors
1151    ///
1152    /// Returns `SearchError::InvalidConfig` for out-of-range indices and
1153    /// `SearchError::IndexCorrupted` for malformed record/string tables.
1154    pub fn doc_id_at(&self, index: usize) -> SearchResult<&str> {
1155        self.ensure_index(index)?;
1156        let entry = self.record_at(index)?;
1157        let doc_id_offset = usize::try_from(entry.doc_id_offset).map_err(|_| {
1158            index_corrupted(
1159                &self.path,
1160                format!("doc_id_offset overflow for record at index {index}"),
1161            )
1162        })?;
1163        let doc_id_len = usize::from(entry.doc_id_len);
1164        let start = self
1165            .strings_offset
1166            .checked_add(doc_id_offset)
1167            .ok_or_else(|| index_corrupted(&self.path, "doc_id start offset overflow"))?;
1168        let end = start
1169            .checked_add(doc_id_len)
1170            .ok_or_else(|| index_corrupted(&self.path, "doc_id end offset overflow"))?;
1171        if end > self.vectors_offset {
1172            return Err(index_corrupted(
1173                &self.path,
1174                format!(
1175                    "doc_id range [{start}, {end}) exceeds string table end {}",
1176                    self.vectors_offset
1177                ),
1178            ));
1179        }
1180        std::str::from_utf8(&self.data[start..end]).map_err(|error| {
1181            index_corrupted(
1182                &self.path,
1183                format!("invalid UTF-8 in doc_id at index {index}: {error}"),
1184            )
1185        })
1186    }
1187
1188    /// Decode a vector as f32 values.
1189    ///
1190    /// # Errors
1191    ///
1192    /// Returns `SearchError::InvalidConfig` for out-of-range indices and
1193    /// `SearchError::IndexCorrupted` for malformed vector slab data.
1194    pub fn vector_at_f32(&self, index: usize) -> SearchResult<Vec<f32>> {
1195        self.ensure_index(index)?;
1196        let start = self.vector_start(index)?;
1197        let dim = self.dimension();
1198        match self.quantization() {
1199            Quantization::F32 => {
1200                let byte_len = dim.checked_mul(4).ok_or_else(|| {
1201                    index_corrupted(&self.path, "f32 vector byte length overflow")
1202                })?;
1203                let end = start
1204                    .checked_add(byte_len)
1205                    .ok_or_else(|| index_corrupted(&self.path, "f32 vector end overflow"))?;
1206                if end > self.data.len() {
1207                    return Err(index_corrupted(
1208                        &self.path,
1209                        "f32 vector extends past file end",
1210                    ));
1211                }
1212                let mut out = Vec::with_capacity(dim);
1213                for chunk in self.data[start..end].chunks_exact(4) {
1214                    out.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
1215                }
1216                Ok(out)
1217            }
1218            Quantization::F16 => {
1219                let byte_len = dim.checked_mul(2).ok_or_else(|| {
1220                    index_corrupted(&self.path, "f16 vector byte length overflow")
1221                })?;
1222                let end = start
1223                    .checked_add(byte_len)
1224                    .ok_or_else(|| index_corrupted(&self.path, "f16 vector end overflow"))?;
1225                if end > self.data.len() {
1226                    return Err(index_corrupted(
1227                        &self.path,
1228                        "f16 vector extends past file end",
1229                    ));
1230                }
1231                let mut out = Vec::with_capacity(dim);
1232                for chunk in self.data[start..end].chunks_exact(2) {
1233                    out.push(f16::from_le_bytes([chunk[0], chunk[1]]).to_f32());
1234                }
1235                Ok(out)
1236            }
1237        }
1238    }
1239
1240    /// Decode a vector as f16 values.
1241    ///
1242    /// # Errors
1243    ///
1244    /// Returns `SearchError::InvalidConfig` for out-of-range indices and
1245    /// `SearchError::IndexCorrupted` for malformed vector slab data.
1246    pub fn vector_at_f16(&self, index: usize) -> SearchResult<Vec<f16>> {
1247        self.ensure_index(index)?;
1248        let start = self.vector_start(index)?;
1249        let dim = self.dimension();
1250        match self.quantization() {
1251            Quantization::F16 => {
1252                let byte_len = dim.checked_mul(2).ok_or_else(|| {
1253                    index_corrupted(&self.path, "f16 vector byte length overflow")
1254                })?;
1255                let end = start
1256                    .checked_add(byte_len)
1257                    .ok_or_else(|| index_corrupted(&self.path, "f16 vector end overflow"))?;
1258                if end > self.data.len() {
1259                    return Err(index_corrupted(
1260                        &self.path,
1261                        "f16 vector extends past file end",
1262                    ));
1263                }
1264                let mut out = Vec::with_capacity(dim);
1265                for chunk in self.data[start..end].chunks_exact(2) {
1266                    out.push(f16::from_le_bytes([chunk[0], chunk[1]]));
1267                }
1268                Ok(out)
1269            }
1270            Quantization::F32 => {
1271                let byte_len = dim.checked_mul(4).ok_or_else(|| {
1272                    index_corrupted(&self.path, "f32 vector byte length overflow")
1273                })?;
1274                let end = start
1275                    .checked_add(byte_len)
1276                    .ok_or_else(|| index_corrupted(&self.path, "f32 vector end overflow"))?;
1277                if end > self.data.len() {
1278                    return Err(index_corrupted(
1279                        &self.path,
1280                        "f32 vector extends past file end",
1281                    ));
1282                }
1283                let mut out = Vec::with_capacity(dim);
1284                for chunk in self.data[start..end].chunks_exact(4) {
1285                    out.push(f16::from_f32(f32::from_le_bytes([
1286                        chunk[0], chunk[1], chunk[2], chunk[3],
1287                    ])));
1288                }
1289                Ok(out)
1290            }
1291        }
1292    }
1293
1294    /// Binary-search the sorted record table by document hash.
1295    #[must_use]
1296    pub fn find_index_by_doc_hash(&self, doc_id_hash: u64) -> Option<usize> {
1297        let mut low = 0usize;
1298        let mut high = self.record_count();
1299        while low < high {
1300            let mid = low + (high - low) / 2;
1301            let entry = self.record_at(mid).ok()?;
1302            match entry.doc_id_hash.cmp(&doc_id_hash) {
1303                std::cmp::Ordering::Less => low = mid + 1,
1304                std::cmp::Ordering::Greater => high = mid,
1305                std::cmp::Ordering::Equal => {
1306                    let mut first = mid;
1307                    while first > 0 {
1308                        let prev = self.record_at(first - 1).ok()?;
1309                        if prev.doc_id_hash != doc_id_hash {
1310                            break;
1311                        }
1312                        first -= 1;
1313                    }
1314                    for index in first..self.record_count() {
1315                        let entry = self.record_at(index).ok()?;
1316                        if entry.doc_id_hash != doc_id_hash {
1317                            break;
1318                        }
1319                        if !is_tombstoned_flags(entry.flags) {
1320                            return Some(index);
1321                        }
1322                    }
1323                    return None;
1324                }
1325            }
1326        }
1327        None
1328    }
1329
1330    /// Fetch embeddings for hashed doc ids (f16 values).
1331    ///
1332    /// Missing hashes return `None` entries at the same position.
1333    #[must_use]
1334    pub fn get_embeddings(&self, doc_id_hashes: &[u64]) -> Vec<Option<Vec<f16>>> {
1335        doc_id_hashes
1336            .iter()
1337            .map(|&hash| {
1338                for entry in self.wal_entries.iter().rev() {
1339                    if entry.doc_id_hash == hash {
1340                        // WAL embeddings are f32, we need to convert them to f16
1341                        return Some(
1342                            entry
1343                                .embedding
1344                                .iter()
1345                                .map(|&v| half::f16::from_f32(v))
1346                                .collect(),
1347                        );
1348                    }
1349                }
1350                if let Some(index) = self.find_index_by_doc_hash(hash) {
1351                    if let Ok(vec) = self.vector_at_f16(index) {
1352                        return Some(vec);
1353                    }
1354                }
1355                None
1356            })
1357            .collect()
1358    }
1359
1360    fn ensure_index(&self, index: usize) -> SearchResult<()> {
1361        if index >= self.record_count() {
1362            return Err(SearchError::InvalidConfig {
1363                field: "index".to_owned(),
1364                value: index.to_string(),
1365                reason: format!(
1366                    "index out of range for record_count={}",
1367                    self.record_count()
1368                ),
1369            });
1370        }
1371        Ok(())
1372    }
1373
1374    pub(crate) fn find_index_by_doc_id(&self, doc_id: &str) -> SearchResult<Option<usize>> {
1375        let doc_id_hash = fnv1a_hash(doc_id.as_bytes());
1376        let Some(mut index) = self.find_first_hash_match(doc_id_hash)? else {
1377            return Ok(None);
1378        };
1379        while index > 0 {
1380            let prev = self.record_at(index - 1)?;
1381            if prev.doc_id_hash != doc_id_hash {
1382                break;
1383            }
1384            index -= 1;
1385        }
1386
1387        for candidate in index..self.record_count() {
1388            let entry = self.record_at(candidate)?;
1389            if entry.doc_id_hash != doc_id_hash {
1390                break;
1391            }
1392            if !is_tombstoned_flags(entry.flags) {
1393                let candidate_doc_id = self.doc_id_at(candidate)?;
1394                if candidate_doc_id == doc_id {
1395                    return Ok(Some(candidate));
1396                }
1397            }
1398        }
1399        Ok(None)
1400    }
1401
1402    fn find_first_hash_match(&self, doc_id_hash: u64) -> SearchResult<Option<usize>> {
1403        let mut low = 0usize;
1404        let mut high = self.record_count();
1405        while low < high {
1406            let mid = low + (high - low) / 2;
1407            let entry = self.record_at(mid)?;
1408            match entry.doc_id_hash.cmp(&doc_id_hash) {
1409                std::cmp::Ordering::Less => low = mid + 1,
1410                std::cmp::Ordering::Greater => high = mid,
1411                std::cmp::Ordering::Equal => return Ok(Some(mid)),
1412            }
1413        }
1414        Ok(None)
1415    }
1416
1417    fn record_flags_offset(&self, index: usize) -> SearchResult<usize> {
1418        self.ensure_index(index)?;
1419        let record_offset = self
1420            .records_offset
1421            .checked_add(index.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
1422                index_corrupted(&self.path, "record offset multiplication overflow")
1423            })?)
1424            .ok_or_else(|| index_corrupted(&self.path, "record offset overflow"))?;
1425        record_offset
1426            .checked_add(14)
1427            .ok_or_else(|| index_corrupted(&self.path, "flags offset overflow"))
1428    }
1429
1430    fn set_record_flags(&mut self, index: usize, flags: u16) -> SearchResult<()> {
1431        let flags_offset = self.record_flags_offset(index)?;
1432        let end = flags_offset
1433            .checked_add(2)
1434            .ok_or_else(|| index_corrupted(&self.path, "flags end overflow"))?;
1435        if end > self.data.len() {
1436            return Err(index_corrupted(
1437                &self.path,
1438                "flags offset points beyond mapped data",
1439            ));
1440        }
1441
1442        let flag_bytes = flags.to_le_bytes();
1443        self.data[flags_offset..end].copy_from_slice(&flag_bytes);
1444        self.data
1445            .flush_range(flags_offset, 2)
1446            .map_err(SearchError::Io)?;
1447        Ok(())
1448    }
1449
1450    fn rewrite_wal_sidecar(&self) -> SearchResult<()> {
1451        let wal_path = wal::wal_path_for(&self.path);
1452        if self.wal_entries.is_empty() {
1453            wal::remove_wal(&wal_path)?;
1454            return Ok(());
1455        }
1456
1457        let mut tmp = wal_path.as_os_str().to_os_string();
1458        tmp.push(".tmp");
1459        let tmp_path = PathBuf::from(tmp);
1460        let _ = wal::remove_wal(&tmp_path);
1461
1462        if let Err(e) = wal::append_wal_batch(
1463            &tmp_path,
1464            &self.wal_entries,
1465            self.dimension(),
1466            self.quantization(),
1467            next_generation(self.metadata.compaction_gen),
1468            self.wal_config.fsync_on_write,
1469        ) {
1470            let _ = fs::remove_file(&tmp_path);
1471            return Err(e);
1472        }
1473
1474        match fs::rename(&tmp_path, &wal_path) {
1475            Ok(()) => Ok(()),
1476            Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
1477                wal::remove_wal(&wal_path)?;
1478                fs::rename(&tmp_path, &wal_path)?;
1479                Ok(())
1480            }
1481            Err(error) => {
1482                let _ = wal::remove_wal(&tmp_path);
1483                Err(error.into())
1484            }
1485        }
1486    }
1487
1488    pub(crate) fn record_at(&self, index: usize) -> SearchResult<RecordEntry> {
1489        self.ensure_index(index)?;
1490        let offset = self
1491            .records_offset
1492            .checked_add(index.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
1493                index_corrupted(&self.path, "record offset multiplication overflow")
1494            })?)
1495            .ok_or_else(|| index_corrupted(&self.path, "record offset overflow"))?;
1496        let end = offset
1497            .checked_add(RECORD_SIZE_BYTES)
1498            .ok_or_else(|| index_corrupted(&self.path, "record end overflow"))?;
1499        if end > self.data.len() {
1500            return Err(index_corrupted(
1501                &self.path,
1502                "record table extends beyond file size",
1503            ));
1504        }
1505        let chunk = &self.data[offset..end];
1506        Ok(RecordEntry {
1507            doc_id_hash: u64::from_le_bytes([
1508                chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7],
1509            ]),
1510            doc_id_offset: u32::from_le_bytes([chunk[8], chunk[9], chunk[10], chunk[11]]),
1511            doc_id_len: u16::from_le_bytes([chunk[12], chunk[13]]),
1512            flags: u16::from_le_bytes([chunk[14], chunk[15]]),
1513        })
1514    }
1515
1516    fn vector_start(&self, index: usize) -> SearchResult<usize> {
1517        let stride = self
1518            .dimension()
1519            .checked_mul(self.quantization().bytes_per_element())
1520            .ok_or_else(|| index_corrupted(&self.path, "vector stride overflow"))?;
1521        self.vectors_offset
1522            .checked_add(
1523                index
1524                    .checked_mul(stride)
1525                    .ok_or_else(|| index_corrupted(&self.path, "vector index overflow"))?,
1526            )
1527            .ok_or_else(|| index_corrupted(&self.path, "vector offset overflow"))
1528    }
1529}
1530
/// A record buffered by `VectorIndexWriter` until the index is persisted.
#[derive(Debug, Clone)]
struct PendingRecord {
    /// Document identifier as supplied by the caller.
    doc_id: String,
    /// `fnv1a_hash` of the `doc_id` bytes.
    doc_id_hash: u64,
    /// Record flags; `write_record` initialises this to 0.
    flags: u16,
    /// Embedding values, one per dimension, kept as f32 until written.
    embedding: Vec<f32>,
}
1538
/// Identifies where a record comes from when the index file is rewritten.
#[derive(Debug, Clone, Copy)]
enum MergeSource {
    /// Record index into the main index's record table.
    Main(usize),
    /// Position of an entry in the in-memory WAL entry list.
    Wal(usize),
}
1544
/// Builder that buffers `(doc_id, embedding)` records in memory and writes
/// them out as an index file when `finish` is called.
#[derive(Debug)]
pub struct VectorIndexWriter {
    /// Destination path of the index file.
    path: PathBuf,
    /// Embedder identifier (presumably recorded in the file header, TODO confirm).
    embedder_id: String,
    /// Embedder revision (presumably recorded in the file header, TODO confirm).
    embedder_revision: String,
    /// Required length of every embedding passed to `write_record`.
    dimension: usize,
    /// Storage format used for vector elements.
    quantization: Quantization,
    /// Compaction generation; can be overridden with `with_generation`.
    compaction_gen: u8,
    /// Records accumulated so far; `finish` sorts them by `(doc_id_hash, doc_id)`.
    records: Vec<PendingRecord>,
}
1555
1556impl VectorIndexWriter {
1557    /// Append a single `(doc_id, embedding)` record.
1558    ///
1559    /// # Errors
1560    ///
1561    /// Returns `SearchError::DimensionMismatch` for wrong embedding lengths
1562    /// and `SearchError::InvalidConfig` for invalid values.
1563    pub fn write_record(&mut self, doc_id: &str, embedding: &[f32]) -> SearchResult<()> {
1564        if embedding.len() != self.dimension {
1565            return Err(SearchError::DimensionMismatch {
1566                expected: self.dimension,
1567                found: embedding.len(),
1568            });
1569        }
1570        if embedding.iter().any(|value| !value.is_finite()) {
1571            return Err(SearchError::InvalidConfig {
1572                field: "embedding".to_owned(),
1573                value: "<contains non-finite values>".to_owned(),
1574                reason: "all embedding values must be finite".to_owned(),
1575            });
1576        }
1577        let _ = u16::try_from(doc_id.len()).map_err(|_| SearchError::InvalidConfig {
1578            field: "doc_id".to_owned(),
1579            value: doc_id.to_owned(),
1580            reason: "doc_id byte length must fit in u16".to_owned(),
1581        })?;
1582        self.records.push(PendingRecord {
1583            doc_id: doc_id.to_owned(),
1584            doc_id_hash: fnv1a_hash(doc_id.as_bytes()),
1585            flags: 0,
1586            embedding: embedding.to_vec(),
1587        });
1588        Ok(())
1589    }
1590
1591    #[allow(dead_code)]
1592    pub(crate) const fn with_generation(mut self, generation: u8) -> Self {
1593        self.compaction_gen = generation;
1594        self
1595    }
1596
1597    /// Persist the index to disk, including fsync of file and parent directory.
1598    ///
1599    /// # Errors
1600    ///
1601    /// Returns `SearchError::InvalidConfig` for layout/encoding failures and
1602    /// `SearchError::Io` for filesystem write/sync failures.
1603    #[allow(clippy::too_many_lines)]
1604    pub fn finish(mut self) -> SearchResult<()> {
1605        self.records.sort_by(|left, right| {
1606            left.doc_id_hash
1607                .cmp(&right.doc_id_hash)
1608                .then(left.doc_id.cmp(&right.doc_id))
1609        });
1610
1611        let record_count = self.records.len();
1612        let records_bytes = record_count.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
1613            SearchError::InvalidConfig {
1614                field: "record_count".to_owned(),
1615                value: record_count.to_string(),
1616                reason: "record table size overflow".to_owned(),
1617            }
1618        })?;
1619        let records_bytes_u64 =
1620            u64::try_from(records_bytes).map_err(|_| SearchError::InvalidConfig {
1621                field: "record_count".to_owned(),
1622                value: record_count.to_string(),
1623                reason: "record table size does not fit in u64".to_owned(),
1624            })?;
1625
1626        let mut string_table = Vec::<u8>::new();
1627        let mut record_entries = Vec::<RecordEntry>::with_capacity(record_count);
1628        for record in &self.records {
1629            let offset_u32 =
1630                u32::try_from(string_table.len()).map_err(|_| SearchError::InvalidConfig {
1631                    field: "doc_id_offset".to_owned(),
1632                    value: string_table.len().to_string(),
1633                    reason: "string table offset exceeds u32".to_owned(),
1634                })?;
1635            let doc_id_bytes = record.doc_id.as_bytes();
1636            let len_u16 =
1637                u16::try_from(doc_id_bytes.len()).map_err(|_| SearchError::InvalidConfig {
1638                    field: "doc_id_len".to_owned(),
1639                    value: doc_id_bytes.len().to_string(),
1640                    reason: "doc_id length exceeds u16".to_owned(),
1641                })?;
1642            string_table.extend_from_slice(doc_id_bytes);
1643            record_entries.push(RecordEntry {
1644                doc_id_hash: record.doc_id_hash,
1645                doc_id_offset: offset_u32,
1646                doc_id_len: len_u16,
1647                flags: record.flags,
1648            });
1649        }
1650
1651        let string_table_len_u64 =
1652            u64::try_from(string_table.len()).map_err(|_| SearchError::InvalidConfig {
1653                field: "string_table".to_owned(),
1654                value: string_table.len().to_string(),
1655                reason: "string table length does not fit in u64".to_owned(),
1656            })?;
1657
1658        let provisional_header = build_header_prefix(
1659            &self.embedder_id,
1660            &self.embedder_revision,
1661            self.dimension,
1662            self.quantization,
1663            self.compaction_gen,
1664            record_count,
1665            0,
1666        )?;
1667        let header_len =
1668            provisional_header
1669                .len()
1670                .checked_add(4)
1671                .ok_or_else(|| SearchError::InvalidConfig {
1672                    field: "header".to_owned(),
1673                    value: provisional_header.len().to_string(),
1674                    reason: "header length overflow".to_owned(),
1675                })?;
1676        let header_len_u64 = u64::try_from(header_len).map_err(|_| SearchError::InvalidConfig {
1677            field: "header".to_owned(),
1678            value: header_len.to_string(),
1679            reason: "header length does not fit in u64".to_owned(),
1680        })?;
1681        let pre_vector = header_len_u64
1682            .checked_add(records_bytes_u64)
1683            .and_then(|value| value.checked_add(string_table_len_u64))
1684            .ok_or_else(|| SearchError::InvalidConfig {
1685                field: "layout".to_owned(),
1686                value: format!("{header_len_u64}+{records_bytes_u64}+{string_table_len_u64}"),
1687                reason: "layout offset overflow".to_owned(),
1688            })?;
1689        let vectors_offset = align_up(pre_vector, VECTOR_ALIGN_BYTES)?;
1690        let padding_len_u64 =
1691            vectors_offset
1692                .checked_sub(pre_vector)
1693                .ok_or_else(|| SearchError::InvalidConfig {
1694                    field: "layout".to_owned(),
1695                    value: format!("{vectors_offset}-{pre_vector}"),
1696                    reason: "negative padding detected".to_owned(),
1697                })?;
1698        let padding_len =
1699            usize::try_from(padding_len_u64).map_err(|_| SearchError::InvalidConfig {
1700                field: "padding".to_owned(),
1701                value: padding_len_u64.to_string(),
1702                reason: "padding length does not fit in usize".to_owned(),
1703            })?;
1704
1705        let mut header_prefix = build_header_prefix(
1706            &self.embedder_id,
1707            &self.embedder_revision,
1708            self.dimension,
1709            self.quantization,
1710            self.compaction_gen,
1711            record_count,
1712            vectors_offset,
1713        )?;
1714        let header_crc = crc32(&header_prefix);
1715        header_prefix.extend_from_slice(&header_crc.to_le_bytes());
1716
1717        let tmp_path = temporary_output_path(&self.path);
1718        let result = (|| -> SearchResult<()> {
1719            let mut file = OpenOptions::new()
1720                .create(true)
1721                .truncate(true)
1722                .write(true)
1723                .open(&tmp_path)?;
1724            {
1725                let mut writer = BufWriter::with_capacity(256 * 1024, &mut file);
1726
1727                writer.write_all(&header_prefix)?;
1728                for entry in &record_entries {
1729                    writer.write_all(&entry.doc_id_hash.to_le_bytes())?;
1730                    writer.write_all(&entry.doc_id_offset.to_le_bytes())?;
1731                    writer.write_all(&entry.doc_id_len.to_le_bytes())?;
1732                    writer.write_all(&entry.flags.to_le_bytes())?;
1733                }
1734                writer.write_all(&string_table)?;
1735                if padding_len > 0 {
1736                    writer.write_all(&vec![0_u8; padding_len])?;
1737                }
1738                write_vector_slab(&mut writer, &self.records, self.quantization)?;
1739                writer.flush()?;
1740            }
1741
1742            file.sync_all()?;
1743            fs::rename(&tmp_path, &self.path)?;
1744            sync_parent_directory(&self.path)?;
1745            Ok(())
1746        })();
1747
1748        if result.is_err() {
1749            if tmp_path.exists() {
1750                if let Err(cleanup_err) = fs::remove_file(&tmp_path) {
1751                    tracing::warn!(
1752                        "failed to clean up temp file {} after write error: {cleanup_err}",
1753                        tmp_path.display()
1754                    );
1755                }
1756            }
1757        }
1758        result?;
1759
1760        debug!(
1761            target: "frankensearch.index",
1762            path = %self.path.display(),
1763            record_count,
1764            dimension = self.dimension,
1765            quantization = self.quantization as u8,
1766            vectors_offset,
1767            "wrote fsvi index"
1768        );
1769        Ok(())
1770    }
1771}
1772
1773fn parse_header(path: &Path, data: &[u8]) -> SearchResult<(VectorMetadata, usize)> {
1774    let mut cursor = 0usize;
1775    let magic = read_array::<4>(path, data, &mut cursor, "magic")?;
1776    if magic != FSVI_MAGIC {
1777        return Err(index_corrupted(
1778            path,
1779            format!("bad magic bytes: expected {FSVI_MAGIC:?}, found {magic:?}"),
1780        ));
1781    }
1782
1783    let version = u16::from_le_bytes(read_array::<2>(path, data, &mut cursor, "version")?);
1784    if version != FSVI_VERSION {
1785        return Err(SearchError::IndexVersionMismatch {
1786            expected: FSVI_VERSION,
1787            found: version,
1788        });
1789    }
1790
1791    let embedder_id_len = usize::from(u16::from_le_bytes(read_array::<2>(
1792        path,
1793        data,
1794        &mut cursor,
1795        "embedder_id_len",
1796    )?));
1797    let embedder_id_bytes = read_slice(path, data, &mut cursor, embedder_id_len, "embedder_id")?;
1798    let embedder_id = std::str::from_utf8(embedder_id_bytes)
1799        .map_err(|error| index_corrupted(path, format!("invalid UTF-8 in embedder_id: {error}")))?
1800        .to_owned();
1801
1802    let embedder_revision_len = usize::from(u16::from_le_bytes(read_array::<2>(
1803        path,
1804        data,
1805        &mut cursor,
1806        "embedder_revision_len",
1807    )?));
1808    let embedder_revision_bytes = read_slice(
1809        path,
1810        data,
1811        &mut cursor,
1812        embedder_revision_len,
1813        "embedder_revision",
1814    )?;
1815    let embedder_revision = std::str::from_utf8(embedder_revision_bytes)
1816        .map_err(|error| {
1817            index_corrupted(path, format!("invalid UTF-8 in embedder_revision: {error}"))
1818        })?
1819        .to_owned();
1820
1821    let dimension_u32 = u32::from_le_bytes(read_array::<4>(path, data, &mut cursor, "dimension")?);
1822    let dimension = usize::try_from(dimension_u32)
1823        .map_err(|_| index_corrupted(path, "dimension does not fit in usize"))?;
1824    if dimension == 0 {
1825        return Err(index_corrupted(path, "dimension must be greater than zero"));
1826    }
1827
1828    let quantization_byte = read_array::<1>(path, data, &mut cursor, "quantization")?[0];
1829    let quantization = Quantization::from_wire(quantization_byte, path)?;
1830
1831    // Use first reserved byte for compaction generation
1832    let reserved = read_array::<3>(path, data, &mut cursor, "reserved")?;
1833    let compaction_gen = reserved[0];
1834    // reserved[1..2] remain unused
1835
1836    let record_count_u64 =
1837        u64::from_le_bytes(read_array::<8>(path, data, &mut cursor, "record_count")?);
1838    let record_count = usize::try_from(record_count_u64)
1839        .map_err(|_| index_corrupted(path, "record_count does not fit in usize"))?;
1840    let vectors_offset =
1841        u64::from_le_bytes(read_array::<8>(path, data, &mut cursor, "vectors_offset")?);
1842    let expected_crc =
1843        u32::from_le_bytes(read_array::<4>(path, data, &mut cursor, "header_crc32")?);
1844    let actual_crc = crc32(&data[..cursor - 4]);
1845    if actual_crc != expected_crc {
1846        return Err(index_corrupted(
1847            path,
1848            format!("header CRC mismatch: expected {expected_crc:#010x}, got {actual_crc:#010x}"),
1849        ));
1850    }
1851
1852    Ok((
1853        VectorMetadata {
1854            embedder_id,
1855            embedder_revision,
1856            dimension,
1857            quantization,
1858            compaction_gen,
1859            record_count,
1860            vectors_offset,
1861        },
1862        cursor,
1863    ))
1864}
1865
1866fn read_array<const N: usize>(
1867    path: &Path,
1868    data: &[u8],
1869    cursor: &mut usize,
1870    field: &str,
1871) -> SearchResult<[u8; N]> {
1872    let slice = read_slice(path, data, cursor, N, field)?;
1873    let mut out = [0_u8; N];
1874    out.copy_from_slice(slice);
1875    Ok(out)
1876}
1877
1878fn read_slice<'a>(
1879    path: &Path,
1880    data: &'a [u8],
1881    cursor: &mut usize,
1882    len: usize,
1883    field: &str,
1884) -> SearchResult<&'a [u8]> {
1885    let end = cursor
1886        .checked_add(len)
1887        .ok_or_else(|| index_corrupted(path, format!("{field} offset overflow")))?;
1888    if end > data.len() {
1889        return Err(index_corrupted(
1890            path,
1891            format!("{field} is truncated (wanted {len} bytes)"),
1892        ));
1893    }
1894    let out = &data[*cursor..end];
1895    *cursor = end;
1896    Ok(out)
1897}
1898
1899fn build_header_prefix(
1900    embedder_id: &str,
1901    embedder_revision: &str,
1902    dimension: usize,
1903    quantization: Quantization,
1904    compaction_gen: u8,
1905    record_count: usize,
1906    vectors_offset: u64,
1907) -> SearchResult<Vec<u8>> {
1908    validate_header_string(embedder_id, "embedder_id")?;
1909    validate_header_string(embedder_revision, "embedder_revision")?;
1910    let dimension_u32 = u32::try_from(dimension).map_err(|_| SearchError::InvalidConfig {
1911        field: "dimension".to_owned(),
1912        value: dimension.to_string(),
1913        reason: "dimension must fit in u32".to_owned(),
1914    })?;
1915    let record_count_u64 = u64::try_from(record_count).map_err(|_| SearchError::InvalidConfig {
1916        field: "record_count".to_owned(),
1917        value: record_count.to_string(),
1918        reason: "record_count must fit in u64".to_owned(),
1919    })?;
1920    let mut out = Vec::with_capacity(
1921        4 + 2 + 2 + embedder_id.len() + 2 + embedder_revision.len() + 4 + 1 + 3 + 8 + 8,
1922    );
1923    out.extend_from_slice(&FSVI_MAGIC);
1924    out.extend_from_slice(&FSVI_VERSION.to_le_bytes());
1925    out.extend_from_slice(
1926        &u16::try_from(embedder_id.len())
1927            .map_err(|_| SearchError::InvalidConfig {
1928                field: "embedder_id".to_owned(),
1929                value: embedder_id.to_owned(),
1930                reason: "embedder_id byte length must fit in u16".to_owned(),
1931            })?
1932            .to_le_bytes(),
1933    );
1934    out.extend_from_slice(embedder_id.as_bytes());
1935    out.extend_from_slice(
1936        &u16::try_from(embedder_revision.len())
1937            .map_err(|_| SearchError::InvalidConfig {
1938                field: "embedder_revision".to_owned(),
1939                value: embedder_revision.to_owned(),
1940                reason: "embedder_revision byte length must fit in u16".to_owned(),
1941            })?
1942            .to_le_bytes(),
1943    );
1944    out.extend_from_slice(embedder_revision.as_bytes());
1945    out.extend_from_slice(&dimension_u32.to_le_bytes());
1946    out.push(quantization as u8);
1947    out.push(compaction_gen);
1948    out.extend_from_slice(&[0_u8; 2]);
1949    out.extend_from_slice(&record_count_u64.to_le_bytes());
1950    out.extend_from_slice(&vectors_offset.to_le_bytes());
1951    Ok(out)
1952}
1953
1954fn validate_header_string(value: &str, field: &str) -> SearchResult<()> {
1955    if value.is_empty() && field == "embedder_id" {
1956        return Err(SearchError::InvalidConfig {
1957            field: field.to_owned(),
1958            value: value.to_owned(),
1959            reason: "embedder_id cannot be empty".to_owned(),
1960        });
1961    }
1962    let _ = u16::try_from(value.len()).map_err(|_| SearchError::InvalidConfig {
1963        field: field.to_owned(),
1964        value: value.to_owned(),
1965        reason: "value length must fit in u16".to_owned(),
1966    })?;
1967    Ok(())
1968}
1969
1970fn write_vector_slab<W: Write>(
1971    writer: &mut W,
1972    records: &[PendingRecord],
1973    quantization: Quantization,
1974) -> SearchResult<()> {
1975    match quantization {
1976        Quantization::F16 => {
1977            for record in records {
1978                for value in &record.embedding {
1979                    writer.write_all(&f16::from_f32(*value).to_le_bytes())?;
1980                }
1981            }
1982        }
1983        Quantization::F32 => {
1984            for record in records {
1985                for value in &record.embedding {
1986                    writer.write_all(&value.to_le_bytes())?;
1987                }
1988            }
1989        }
1990    }
1991    Ok(())
1992}
1993
1994fn align_up(value: u64, alignment: u64) -> SearchResult<u64> {
1995    if alignment == 0 {
1996        return Ok(value);
1997    }
1998    let add = alignment
1999        .checked_sub(1)
2000        .ok_or_else(|| SearchError::InvalidConfig {
2001            field: "alignment".to_owned(),
2002            value: alignment.to_string(),
2003            reason: "alignment underflow".to_owned(),
2004        })?;
2005    let padded = value
2006        .checked_add(add)
2007        .ok_or_else(|| SearchError::InvalidConfig {
2008            field: "alignment".to_owned(),
2009            value: format!("{value}+{add}"),
2010            reason: "alignment overflow".to_owned(),
2011        })?;
2012    Ok((padded / alignment) * alignment)
2013}
2014
/// Derive a temporary sibling path by appending `.tmp.<pid>.<nanos>` to `path`.
///
/// `<nanos>` is the time since the Unix epoch in nanoseconds, or 0 when the
/// system clock reports a time before the epoch.
fn temporary_output_path(path: &Path) -> PathBuf {
    let now = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    let pid = std::process::id();
    let suffix = format!(".tmp.{pid}.{now}");

    let mut tmp_name = path.as_os_str().to_owned();
    tmp_name.push(suffix);
    tmp_name.into()
}
2025
2026fn sync_parent_directory(path: &Path) -> SearchResult<()> {
2027    #[cfg(unix)]
2028    {
2029        if let Some(parent) = path.parent() {
2030            let dir = File::open(parent)?;
2031            dir.sync_all()?;
2032        }
2033    }
2034    #[cfg(not(unix))]
2035    {
2036        let _ = path;
2037    }
2038    Ok(())
2039}
2040
2041fn index_corrupted(path: &Path, detail: impl Into<String>) -> SearchError {
2042    SearchError::IndexCorrupted {
2043        path: path.to_path_buf(),
2044        detail: detail.into(),
2045    }
2046}
2047
2048fn crc32(data: &[u8]) -> u32 {
2049    let mut hasher = Crc32::new();
2050    hasher.update(data);
2051    hasher.finalize()
2052}
2053
/// 64-bit FNV-1a hash of `bytes`.
///
/// Starting from the offset basis, each byte is XORed into the state, which
/// is then multiplied (wrapping) by the FNV prime. An empty input returns
/// the offset basis.
pub(crate) fn fnv1a_hash(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0100_0000_01b3;

    bytes.iter().fold(OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
2062
2063const fn is_tombstoned_flags(flags: u16) -> bool {
2064    flags & RECORD_FLAG_TOMBSTONE != 0
2065}
2066
/// Successor of a generation counter: increments by one, and wraps from 255
/// to 1, so the result is never 0.
const fn next_generation(current: u8) -> u8 {
    match current {
        u8::MAX => 1,
        below_max => below_max + 1,
    }
}
2070
2071#[cfg(test)]
2072mod tests {
2073    use super::*;
2074
2075    fn temp_index_path(name: &str) -> PathBuf {
2076        let now = SystemTime::now()
2077            .duration_since(UNIX_EPOCH)
2078            .unwrap_or_default()
2079            .as_nanos();
2080        std::env::temp_dir().join(format!(
2081            "frankensearch-index-{name}-{}-{now}.fsvi",
2082            std::process::id()
2083        ))
2084    }
2085
2086    fn sample_vector(base: f32, dim: usize) -> Vec<f32> {
2087        vec![base; dim]
2088    }
2089
2090    #[test]
2091    fn round_trip_f16_with_revision_and_lookup() {
2092        let path = temp_index_path("round-trip");
2093        let mut writer =
2094            VectorIndex::create_with_revision(&path, "fnv1a-384", "rev-123", 8, Quantization::F16)
2095                .expect("writer");
2096        writer
2097            .write_record("doc-b", &sample_vector(1.0, 8))
2098            .expect("write doc-b");
2099        writer
2100            .write_record("doc-a", &sample_vector(2.0, 8))
2101            .expect("write doc-a");
2102        writer.finish().expect("finish");
2103
2104        let index = VectorIndex::open(&path).expect("open index");
2105        assert_eq!(index.record_count(), 2);
2106        assert_eq!(index.dimension(), 8);
2107        assert_eq!(index.embedder_id(), "fnv1a-384");
2108        assert_eq!(index.embedder_revision(), "rev-123");
2109        assert_eq!(index.quantization(), Quantization::F16);
2110        assert_eq!(index.metadata().vectors_offset % VECTOR_ALIGN_BYTES, 0);
2111
2112        let hash_a = fnv1a_hash(b"doc-a");
2113        let pos_a = index
2114            .find_index_by_doc_hash(hash_a)
2115            .expect("hash lookup should find doc-a");
2116        let doc_id = index.doc_id_at(pos_a).expect("doc id");
2117        assert_eq!(doc_id, "doc-a");
2118        let vec_a = index.vector_at_f32(pos_a).expect("vector");
2119        assert_eq!(vec_a.len(), 8);
2120        assert!((vec_a[0] - 2.0).abs() < 0.002);
2121    }
2122
2123    #[test]
2124    fn detects_header_crc_corruption() {
2125        let path = temp_index_path("crc");
2126        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2127        writer
2128            .write_record("doc-1", &sample_vector(0.5, 4))
2129            .expect("write");
2130        writer.finish().expect("finish");
2131
2132        let mut bytes = fs::read(&path).expect("read index");
2133        // Flip a byte in the header payload before crc.
2134        bytes[6] ^= 0xAA;
2135        fs::write(&path, bytes).expect("rewrite corrupt index");
2136
2137        let error = VectorIndex::open(&path).expect_err("corruption should be detected");
2138        assert!(matches!(error, SearchError::IndexCorrupted { .. }));
2139    }
2140
2141    #[test]
2142    fn write_record_dimension_mismatch_is_error() {
2143        let path = temp_index_path("dim-mismatch");
2144        let mut writer = VectorIndex::create(&path, "fnv1a-384", 3).expect("writer");
2145        let error = writer
2146            .write_record("doc-1", &[1.0, 2.0])
2147            .expect_err("must reject wrong dimension");
2148        assert!(matches!(
2149            error,
2150            SearchError::DimensionMismatch {
2151                expected: 3,
2152                found: 2
2153            }
2154        ));
2155    }
2156
2157    #[test]
2158    fn empty_index_round_trip() {
2159        let path = temp_index_path("empty");
2160        let writer = VectorIndex::create(&path, "fnv1a-384", 16).expect("writer");
2161        writer.finish().expect("finish");
2162
2163        let index = VectorIndex::open(&path).expect("open");
2164        assert_eq!(index.record_count(), 0);
2165        assert_eq!(index.dimension(), 16);
2166    }
2167
2168    #[test]
2169    fn get_embeddings_returns_none_for_missing_hashes() {
2170        let path = temp_index_path("get-embeddings");
2171        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2172        writer
2173            .write_record("doc-1", &[0.1, 0.2, 0.3, 0.4])
2174            .expect("write");
2175        writer.finish().expect("finish");
2176
2177        let index = VectorIndex::open(&path).expect("open");
2178        let existing = fnv1a_hash(b"doc-1");
2179        let missing = fnv1a_hash(b"missing");
2180        let embeddings = index.get_embeddings(&[existing, missing]);
2181        assert!(embeddings[0].is_some());
2182        assert!(embeddings[1].is_none());
2183        assert_eq!(embeddings[0].as_ref().expect("existing").len(), 4);
2184    }
2185
2186    #[test]
2187    fn soft_delete_marks_record_and_hides_hash_lookup() {
2188        let path = temp_index_path("soft-delete-main");
2189        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2190        writer
2191            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2192            .expect("write doc-a");
2193        writer
2194            .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2195            .expect("write doc-b");
2196        writer.finish().expect("finish");
2197
2198        let mut index = VectorIndex::open(&path).expect("open");
2199        assert!(index.soft_delete("doc-a").expect("soft delete"));
2200        assert!(!index.soft_delete("doc-a").expect("idempotent soft delete"));
2201
2202        let hash_a = fnv1a_hash(b"doc-a");
2203        let hash_b = fnv1a_hash(b"doc-b");
2204        assert_eq!(index.find_index_by_doc_hash(hash_a), None);
2205        assert!(index.find_index_by_doc_hash(hash_b).is_some());
2206        assert_eq!(index.tombstone_count(), 1);
2207
2208        let hits = index
2209            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2210            .expect("search");
2211        assert_eq!(hits.len(), 1);
2212        assert_eq!(hits[0].doc_id, "doc-b");
2213
2214        std::fs::remove_file(&path).ok();
2215    }
2216
2217    #[test]
2218    fn soft_delete_missing_returns_false() {
2219        let path = temp_index_path("soft-delete-missing");
2220        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2221        writer
2222            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2223            .expect("write");
2224        writer.finish().expect("finish");
2225
2226        let mut index = VectorIndex::open(&path).expect("open");
2227        assert!(
2228            !index
2229                .soft_delete("missing-doc")
2230                .expect("missing soft delete")
2231        );
2232        assert_eq!(index.tombstone_count(), 0);
2233
2234        std::fs::remove_file(&path).ok();
2235    }
2236
2237    #[test]
2238    fn soft_delete_batch_counts_only_new_tombstones() {
2239        let path = temp_index_path("soft-delete-batch");
2240        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2241        writer
2242            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2243            .expect("write a");
2244        writer
2245            .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2246            .expect("write b");
2247        writer
2248            .write_record("doc-c", &[0.0, 0.0, 1.0, 0.0])
2249            .expect("write c");
2250        writer.finish().expect("finish");
2251
2252        let mut index = VectorIndex::open(&path).expect("open");
2253        let deleted = index
2254            .soft_delete_batch(&["doc-a", "doc-b", "missing", "doc-a"])
2255            .expect("batch delete");
2256        assert_eq!(deleted, 2);
2257        assert_eq!(index.tombstone_count(), 2);
2258
2259        std::fs::remove_file(&path).ok();
2260    }
2261
2262    #[test]
2263    fn tombstone_ratio_and_needs_vacuum_threshold() {
2264        let path = temp_index_path("soft-delete-ratio");
2265        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2266        for i in 0..10 {
2267            writer
2268                .write_record(&format!("doc-{i}"), &sample_vector(0.1, 4))
2269                .expect("write");
2270        }
2271        writer.finish().expect("finish");
2272
2273        let mut index = VectorIndex::open(&path).expect("open");
2274        assert!(index.tombstone_ratio().abs() < f64::EPSILON);
2275        assert!(!index.needs_vacuum());
2276
2277        index.soft_delete("doc-0").expect("delete 0");
2278        index.soft_delete("doc-1").expect("delete 1");
2279        assert_eq!(index.tombstone_count(), 2);
2280        assert!((index.tombstone_ratio() - 0.2).abs() < f64::EPSILON);
2281        assert!(!index.needs_vacuum(), "threshold is strict greater-than");
2282
2283        index.soft_delete("doc-2").expect("delete 2");
2284        assert_eq!(index.tombstone_count(), 3);
2285        assert!(index.needs_vacuum());
2286
2287        std::fs::remove_file(&path).ok();
2288    }
2289
2290    #[test]
2291    fn vacuum_removes_tombstones_and_preserves_live_results() {
2292        let path = temp_index_path("soft-delete-vacuum");
2293        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2294        writer
2295            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2296            .expect("write a");
2297        writer
2298            .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2299            .expect("write b");
2300        writer
2301            .write_record("doc-c", &[0.0, 0.0, 1.0, 0.0])
2302            .expect("write c");
2303        writer.finish().expect("finish");
2304
2305        let mut index = VectorIndex::open(&path).expect("open");
2306        index.soft_delete("doc-b").expect("delete b");
2307
2308        let pre_hits = index
2309            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
2310            .expect("pre-vacuum search");
2311        assert_eq!(pre_hits.len(), 2);
2312        assert!(pre_hits.iter().all(|hit| hit.doc_id != "doc-b"));
2313
2314        let stats = index.vacuum().expect("vacuum");
2315        assert_eq!(stats.records_before, 3);
2316        assert_eq!(stats.records_after, 2);
2317        assert_eq!(stats.tombstones_removed, 1);
2318        assert!(stats.bytes_reclaimed > 0);
2319        assert!(stats.duration >= Duration::ZERO);
2320
2321        assert_eq!(index.record_count(), 2);
2322        assert_eq!(index.tombstone_count(), 0);
2323        assert_eq!(index.find_index_by_doc_hash(fnv1a_hash(b"doc-b")), None);
2324
2325        let post_hits = index
2326            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
2327            .expect("post-vacuum search");
2328        assert_eq!(post_hits.len(), 2);
2329        assert!(post_hits.iter().all(|hit| hit.doc_id != "doc-b"));
2330
2331        std::fs::remove_file(&path).ok();
2332    }
2333
2334    #[test]
2335    fn soft_delete_and_search_interleaving_has_no_corruption() {
2336        use std::collections::HashSet;
2337        use std::sync::{Arc, Mutex};
2338
2339        let path = temp_index_path("soft-delete-concurrent");
2340        let dim = 4;
2341
2342        let mut writer = VectorIndex::create(&path, "fnv1a-384", dim).expect("writer");
2343        for i in 0..128 {
2344            writer
2345                .write_record(&format!("doc-{i:03}"), &[1.0, 0.0, 0.0, 0.0])
2346                .expect("write");
2347        }
2348        writer.finish().expect("finish");
2349
2350        let shared = Arc::new(Mutex::new(VectorIndex::open(&path).expect("open")));
2351        let deleter = {
2352            let index = Arc::clone(&shared);
2353            std::thread::spawn(move || {
2354                for i in 0..32 {
2355                    let mut guard = index.lock().expect("lock for delete");
2356                    let doc_id = format!("doc-{i:03}");
2357                    let _ = guard.soft_delete(&doc_id).expect("soft delete");
2358                }
2359            })
2360        };
2361
2362        let query = [1.0, 0.0, 0.0, 0.0];
2363        let searchers: Vec<_> = (0..4)
2364            .map(|_| {
2365                let index = Arc::clone(&shared);
2366                std::thread::spawn(move || {
2367                    for _ in 0..32 {
2368                        let hits = index
2369                            .lock()
2370                            .expect("lock for search")
2371                            .search_top_k(&query, 10, None)
2372                            .expect("search");
2373                        assert!(!hits.is_empty());
2374                    }
2375                })
2376            })
2377            .collect();
2378
2379        deleter.join().expect("join deleter");
2380        for handle in searchers {
2381            handle.join().expect("join searcher");
2382        }
2383
2384        let hits = shared
2385            .lock()
2386            .expect("lock final")
2387            .search_top_k(&query, 64, None)
2388            .expect("final search");
2389        let deleted_ids: HashSet<String> = (0..32).map(|i| format!("doc-{i:03}")).collect();
2390        assert!(hits.iter().all(|hit| !deleted_ids.contains(&hit.doc_id)));
2391
2392        std::fs::remove_file(&path).ok();
2393    }
2394
2395    #[test]
2396    fn soft_delete_preserves_existing_non_tombstone_flags() {
2397        let path = temp_index_path("soft-delete-flags");
2398        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2399        writer
2400            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2401            .expect("write doc-a");
2402        writer.finish().expect("finish");
2403
2404        let mut index = VectorIndex::open(&path).expect("open");
2405        let hash_a = fnv1a_hash(b"doc-a");
2406        let record_index = index
2407            .find_index_by_doc_hash(hash_a)
2408            .expect("record index for doc-a");
2409
2410        let custom_flag: u16 = 0x0004;
2411        index
2412            .set_record_flags(record_index, custom_flag)
2413            .expect("seed custom flag");
2414        assert_eq!(
2415            index.record_at(record_index).expect("read flags").flags,
2416            custom_flag
2417        );
2418
2419        assert!(index.soft_delete("doc-a").expect("soft delete doc-a"));
2420        let flags_after = index.record_at(record_index).expect("read flags").flags;
2421        assert_eq!(
2422            flags_after & RECORD_FLAG_TOMBSTONE,
2423            RECORD_FLAG_TOMBSTONE,
2424            "tombstone bit must be set",
2425        );
2426        assert_eq!(
2427            flags_after & custom_flag,
2428            custom_flag,
2429            "non-tombstone bits must remain untouched",
2430        );
2431
2432        std::fs::remove_file(&path).ok();
2433    }
2434
2435    #[test]
2436    fn tombstone_flag_persists_after_reopen() {
2437        let path = temp_index_path("soft-delete-persist");
2438        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2439        writer
2440            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2441            .expect("write a");
2442        writer
2443            .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2444            .expect("write b");
2445        writer.finish().expect("finish");
2446
2447        {
2448            let mut index = VectorIndex::open(&path).expect("open for delete");
2449            assert!(index.soft_delete("doc-a").expect("delete doc-a"));
2450            assert_eq!(index.tombstone_count(), 1);
2451        }
2452
2453        let reopened = VectorIndex::open(&path).expect("reopen");
2454        assert_eq!(reopened.tombstone_count(), 1);
2455        assert_eq!(reopened.find_index_by_doc_hash(fnv1a_hash(b"doc-a")), None);
2456        let hits = reopened
2457            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2458            .expect("search after reopen");
2459        assert!(hits.iter().all(|hit| hit.doc_id != "doc-a"));
2460
2461        std::fs::remove_file(&path).ok();
2462    }
2463
2464    #[test]
2465    fn delete_vacuum_append_cycle_keeps_expected_live_set() {
2466        use std::collections::HashSet;
2467
2468        let path = temp_index_path("soft-delete-reindex-cycle");
2469        let dim = 4;
2470
2471        let mut writer = VectorIndex::create(&path, "fnv1a-384", dim).expect("writer");
2472        for i in 0..100 {
2473            writer
2474                .write_record(&format!("doc-{i:03}"), &[1.0, 0.0, 0.0, 0.0])
2475                .expect("write initial doc");
2476        }
2477        writer.finish().expect("finish");
2478
2479        let mut index = VectorIndex::open(&path).expect("open");
2480        let delete_ids: Vec<String> = (0..50).map(|i| format!("doc-{i:03}")).collect();
2481        let delete_refs: Vec<&str> = delete_ids.iter().map(String::as_str).collect();
2482        let deleted = index.soft_delete_batch(&delete_refs).expect("batch delete");
2483        assert_eq!(deleted, 50);
2484        assert_eq!(index.tombstone_count(), 50);
2485
2486        let vacuum_stats = index.vacuum().expect("vacuum");
2487        assert_eq!(vacuum_stats.records_before, 100);
2488        assert_eq!(vacuum_stats.records_after, 50);
2489        assert_eq!(index.tombstone_count(), 0);
2490        assert_eq!(index.record_count(), 50);
2491
2492        let append_entries: Vec<(String, Vec<f32>)> = (100..150)
2493            .map(|i| (format!("doc-{i:03}"), vec![1.0, 0.0, 0.0, 0.0]))
2494            .collect();
2495        index.append_batch(&append_entries).expect("append batch");
2496        assert_eq!(index.wal_record_count(), 50);
2497
2498        let compact_stats = index.compact().expect("compact");
2499        assert_eq!(compact_stats.total_records_after, 100);
2500        assert_eq!(index.record_count(), 100);
2501        assert_eq!(index.wal_record_count(), 0);
2502
2503        let hits = index
2504            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 150, None)
2505            .expect("search");
2506        assert_eq!(hits.len(), 100);
2507        let ids: HashSet<String> = hits.iter().map(|hit| hit.doc_id.clone()).collect();
2508
2509        for i in 0..50 {
2510            assert!(
2511                !ids.contains(&format!("doc-{i:03}")),
2512                "deleted id must not be present",
2513            );
2514        }
2515        for i in 50..150 {
2516            assert!(
2517                ids.contains(&format!("doc-{i:03}")),
2518                "live id must be present",
2519            );
2520        }
2521
2522        std::fs::remove_file(&path).ok();
2523        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2524    }
2525
2526    #[test]
2527    fn tombstones_remain_excluded_with_wal_and_after_compaction() {
2528        let path = temp_index_path("soft-delete-wal-integration");
2529        let dim = 4;
2530
2531        let mut writer = VectorIndex::create(&path, "fnv1a-384", dim).expect("writer");
2532        writer
2533            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2534            .expect("write a");
2535        writer
2536            .write_record("doc-b", &[1.0, 0.0, 0.0, 0.0])
2537            .expect("write b");
2538        writer.finish().expect("finish");
2539
2540        let mut index = VectorIndex::open(&path).expect("open");
2541        assert!(index.soft_delete("doc-a").expect("delete a"));
2542        index
2543            .append("doc-c", &[1.0, 0.0, 0.0, 0.0])
2544            .expect("append c");
2545        assert_eq!(index.wal_record_count(), 1);
2546
2547        let pre_compact = index
2548            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2549            .expect("pre-compact search");
2550        assert_eq!(pre_compact.len(), 2);
2551        assert!(pre_compact.iter().all(|hit| hit.doc_id != "doc-a"));
2552        assert!(pre_compact.iter().any(|hit| hit.doc_id == "doc-b"));
2553        assert!(pre_compact.iter().any(|hit| hit.doc_id == "doc-c"));
2554
2555        index.compact().expect("compact");
2556        let post_compact = index
2557            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2558            .expect("post-compact search");
2559        assert_eq!(post_compact.len(), 2);
2560        assert!(post_compact.iter().all(|hit| hit.doc_id != "doc-a"));
2561        assert!(post_compact.iter().any(|hit| hit.doc_id == "doc-b"));
2562        assert!(post_compact.iter().any(|hit| hit.doc_id == "doc-c"));
2563
2564        std::fs::remove_file(&path).ok();
2565        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2566    }
2567
2568    #[test]
2569    fn vacuum_noop_when_no_tombstones() {
2570        let path = temp_index_path("soft-delete-vacuum-noop");
2571        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2572        writer
2573            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2574            .expect("write a");
2575        writer
2576            .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2577            .expect("write b");
2578        writer.finish().expect("finish");
2579
2580        let mut index = VectorIndex::open(&path).expect("open");
2581        assert_eq!(index.tombstone_count(), 0);
2582
2583        let stats = index.vacuum().expect("vacuum with no tombstones");
2584        assert_eq!(stats.records_before, 2);
2585        assert_eq!(stats.records_after, 2);
2586        assert_eq!(stats.tombstones_removed, 0);
2587        assert_eq!(index.record_count(), 2);
2588
2589        std::fs::remove_file(&path).ok();
2590    }
2591
2592    #[test]
2593    fn soft_delete_all_records_yields_empty_search() {
2594        let path = temp_index_path("soft-delete-all");
2595        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2596        for i in 0..5 {
2597            writer
2598                .write_record(&format!("doc-{i}"), &sample_vector(0.1, 4))
2599                .expect("write");
2600        }
2601        writer.finish().expect("finish");
2602
2603        let mut index = VectorIndex::open(&path).expect("open");
2604        for i in 0..5 {
2605            assert!(index.soft_delete(&format!("doc-{i}")).expect("delete"));
2606        }
2607        assert_eq!(index.tombstone_count(), 5);
2608        assert!((index.tombstone_ratio() - 1.0).abs() < f64::EPSILON);
2609        assert!(index.needs_vacuum());
2610
2611        let hits = index
2612            .search_top_k(&sample_vector(0.1, 4), 10, None)
2613            .expect("search");
2614        assert!(
2615            hits.is_empty(),
2616            "search with all deleted should return nothing"
2617        );
2618
2619        std::fs::remove_file(&path).ok();
2620    }
2621
2622    #[test]
2623    fn vacuum_after_deleting_all_records_yields_empty_index() {
2624        let path = temp_index_path("soft-delete-vacuum-all");
2625        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2626        for i in 0..3 {
2627            writer
2628                .write_record(&format!("doc-{i}"), &[1.0, 0.0, 0.0, 0.0])
2629                .expect("write");
2630        }
2631        writer.finish().expect("finish");
2632
2633        let mut index = VectorIndex::open(&path).expect("open");
2634        for i in 0..3 {
2635            index.soft_delete(&format!("doc-{i}")).expect("delete");
2636        }
2637
2638        let stats = index.vacuum().expect("vacuum all deleted");
2639        assert_eq!(stats.records_before, 3);
2640        assert_eq!(stats.records_after, 0);
2641        assert_eq!(stats.tombstones_removed, 3);
2642        assert_eq!(index.record_count(), 0);
2643        assert_eq!(index.tombstone_count(), 0);
2644        assert!(index.tombstone_ratio().abs() < f64::EPSILON);
2645        assert!(!index.needs_vacuum());
2646
2647        let hits = index
2648            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2649            .expect("search");
2650        assert!(hits.is_empty());
2651
2652        std::fs::remove_file(&path).ok();
2653    }
2654
2655    // ─── WAL integration tests ─────────────────────────────────────────
2656
2657    #[test]
2658    fn append_single_vector_is_searchable() {
2659        let path = temp_index_path("wal-append-single");
2660        let dim = 4;
2661
2662        // Build initial index.
2663        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2664        writer
2665            .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
2666            .expect("write");
2667        writer.finish().expect("finish");
2668
2669        // Append via WAL.
2670        let mut index = VectorIndex::open(&path).expect("open");
2671        assert_eq!(index.wal_record_count(), 0);
2672        index
2673            .append("wal-0", &[0.0, 1.0, 0.0, 0.0])
2674            .expect("append");
2675        assert_eq!(index.wal_record_count(), 1);
2676
2677        // Search should find both main and WAL entries.
2678        let hits = index
2679            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
2680            .expect("search");
2681        assert_eq!(hits.len(), 2);
2682        assert_eq!(hits[0].doc_id, "wal-0", "WAL entry should rank first");
2683
2684        // Cleanup.
2685        std::fs::remove_file(&path).ok();
2686        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2687    }
2688
2689    #[test]
2690    fn append_batch_all_searchable() {
2691        let path = temp_index_path("wal-append-batch");
2692        let dim = 4;
2693
2694        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2695        writer
2696            .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
2697            .expect("write");
2698        writer.finish().expect("finish");
2699
2700        let mut index = VectorIndex::open(&path).expect("open");
2701        index
2702            .append_batch(&[
2703                ("wal-0".to_owned(), vec![0.0, 1.0, 0.0, 0.0]),
2704                ("wal-1".to_owned(), vec![0.0, 0.0, 1.0, 0.0]),
2705                ("wal-2".to_owned(), vec![0.0, 0.0, 0.0, 1.0]),
2706            ])
2707            .expect("append batch");
2708        assert_eq!(index.wal_record_count(), 3);
2709
2710        let hits = index
2711            .search_top_k(&[1.0, 1.0, 1.0, 1.0], 10, None)
2712            .expect("search");
2713        assert_eq!(hits.len(), 4, "all 4 vectors should be returned");
2714
2715        std::fs::remove_file(&path).ok();
2716        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2717    }
2718
2719    #[test]
2720    fn compaction_merges_wal_into_main() {
2721        let path = temp_index_path("wal-compact");
2722        let dim = 4;
2723
2724        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2725        writer
2726            .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
2727            .expect("write");
2728        writer.finish().expect("finish");
2729
2730        let mut index = VectorIndex::open(&path).expect("open");
2731        index
2732            .append("wal-0", &[0.0, 1.0, 0.0, 0.0])
2733            .expect("append");
2734        index
2735            .append("wal-1", &[0.0, 0.0, 1.0, 0.0])
2736            .expect("append");
2737
2738        assert_eq!(index.record_count(), 1);
2739        assert_eq!(index.wal_record_count(), 2);
2740
2741        let stats = index.compact().expect("compact");
2742        assert_eq!(stats.main_records_before, 1);
2743        assert_eq!(stats.wal_records, 2);
2744        assert_eq!(stats.total_records_after, 3);
2745        assert_eq!(index.record_count(), 3);
2746        assert_eq!(index.wal_record_count(), 0);
2747        assert!(!wal::wal_path_for(&path).exists(), "WAL should be deleted");
2748
2749        // All records should still be searchable from main index.
2750        let hits = index
2751            .search_top_k(&[1.0, 1.0, 1.0, 1.0], 10, None)
2752            .expect("search");
2753        assert_eq!(hits.len(), 3);
2754
2755        std::fs::remove_file(&path).ok();
2756    }
2757
2758    #[test]
2759    fn needs_compaction_threshold() {
2760        let path = temp_index_path("wal-threshold");
2761        let dim = 4;
2762
2763        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2764        for i in 0..10 {
2765            writer
2766                .write_record(&format!("main-{i}"), &sample_vector(0.1, dim))
2767                .expect("write");
2768        }
2769        writer.finish().expect("finish");
2770
2771        let mut index = VectorIndex::open(&path).expect("open");
2772        index.set_wal_config(WalConfig {
2773            compaction_threshold: 5,
2774            compaction_ratio: 0.10,
2775            fsync_on_write: false,
2776        });
2777
2778        assert!(!index.needs_compaction());
2779
2780        // Add 1 entry: ratio = 1/10 = 0.10, hits the ratio threshold.
2781        index
2782            .append("wal-0", &sample_vector(0.2, dim))
2783            .expect("append");
2784        assert!(index.needs_compaction());
2785
2786        std::fs::remove_file(&path).ok();
2787        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2788    }
2789
2790    #[test]
2791    fn wal_survives_reopen() {
2792        let path = temp_index_path("wal-reopen");
2793        let dim = 4;
2794
2795        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2796        writer
2797            .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
2798            .expect("write");
2799        writer.finish().expect("finish");
2800
2801        // Append and drop.
2802        {
2803            let mut index = VectorIndex::open(&path).expect("open");
2804            index
2805                .append("wal-0", &[0.0, 1.0, 0.0, 0.0])
2806                .expect("append");
2807        }
2808
2809        // Reopen — WAL should be loaded automatically.
2810        let index = VectorIndex::open(&path).expect("reopen");
2811        assert_eq!(index.wal_record_count(), 1);
2812
2813        let hits = index
2814            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
2815            .expect("search");
2816        assert_eq!(hits.len(), 2);
2817        assert_eq!(hits[0].doc_id, "wal-0");
2818
2819        std::fs::remove_file(&path).ok();
2820        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2821    }
2822
2823    #[test]
2824    fn append_dimension_mismatch_rejected() {
2825        let path = temp_index_path("wal-dim-mismatch");
2826        let dim = 4;
2827
2828        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2829        writer
2830            .write_record("main-0", &sample_vector(1.0, dim))
2831            .expect("write");
2832        writer.finish().expect("finish");
2833
2834        let mut index = VectorIndex::open(&path).expect("open");
2835        let err = index
2836            .append("bad", &[1.0, 2.0])
2837            .expect_err("should reject wrong dimension");
2838        assert!(matches!(err, SearchError::DimensionMismatch { .. }));
2839        assert_eq!(
2840            index.wal_record_count(),
2841            0,
2842            "failed append should not persist"
2843        );
2844
2845        std::fs::remove_file(&path).ok();
2846    }
2847
2848    #[test]
2849    fn compact_empty_wal_is_noop() {
2850        let path = temp_index_path("wal-compact-empty");
2851        let dim = 4;
2852
2853        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2854        writer
2855            .write_record("main-0", &sample_vector(1.0, dim))
2856            .expect("write");
2857        writer.finish().expect("finish");
2858
2859        let mut index = VectorIndex::open(&path).expect("open");
2860        let stats = index.compact().expect("compact empty WAL");
2861        assert_eq!(stats.wal_records, 0);
2862        assert_eq!(stats.total_records_after, 1);
2863
2864        std::fs::remove_file(&path).ok();
2865    }
2866
2867    #[test]
2868    fn wal_entries_rank_correctly_against_main() {
2869        let path = temp_index_path("wal-ranking");
2870        let dim = 4;
2871
2872        // Main index has a mediocre match.
2873        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2874        writer
2875            .write_record("main-mediocre", &[0.5, 0.5, 0.0, 0.0])
2876            .expect("write");
2877        writer.finish().expect("finish");
2878
2879        // WAL has a perfect match.
2880        let mut index = VectorIndex::open(&path).expect("open");
2881        index
2882            .append("wal-perfect", &[1.0, 0.0, 0.0, 0.0])
2883            .expect("append");
2884
2885        let hits = index
2886            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 2, None)
2887            .expect("search");
2888        assert_eq!(hits.len(), 2);
2889        assert_eq!(hits[0].doc_id, "wal-perfect");
2890        assert!(hits[0].score > hits[1].score);
2891
2892        std::fs::remove_file(&path).ok();
2893        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2894    }
2895
2896    #[test]
2897    fn append_duplicate_doc_id_both_searchable() {
2898        let path = temp_index_path("wal-dup-docid");
2899        let dim = 4;
2900
2901        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2902        writer
2903            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2904            .expect("write");
2905        writer.finish().expect("finish");
2906
2907        let mut index = VectorIndex::open(&path).expect("open");
2908        // Append a second entry with the same doc_id but different vector.
2909        index
2910            .append("doc-a", &[0.0, 0.0, 0.0, 1.0])
2911            .expect("append duplicate");
2912        assert_eq!(index.wal_record_count(), 1);
2913
2914        // WAL entry shadows the main-index entry (WAL is newer).
2915        let hits = index
2916            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2917            .expect("search");
2918        assert_eq!(
2919            hits.len(),
2920            1,
2921            "WAL shadows main — only WAL entry should appear"
2922        );
2923        assert_eq!(hits[0].doc_id, "doc-a");
2924
2925        std::fs::remove_file(&path).ok();
2926        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2927    }
2928
2929    #[test]
2930    fn append_large_batch_100_vectors() {
2931        let path = temp_index_path("wal-large-batch");
2932        let dim = 8;
2933
2934        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2935        writer
2936            .write_record("main-0", &sample_vector(1.0, dim))
2937            .expect("write");
2938        writer.finish().expect("finish");
2939
2940        let mut index = VectorIndex::open(&path).expect("open");
2941        let batch: Vec<(String, Vec<f32>)> = (0..100)
2942            .map(|i| {
2943                #[allow(clippy::cast_precision_loss)]
2944                let base = (i as f32) * 0.01;
2945                (format!("wal-{i:03}"), sample_vector(base, dim))
2946            })
2947            .collect();
2948        index.append_batch(&batch).expect("large batch");
2949        assert_eq!(index.wal_record_count(), 100);
2950
2951        let hits = index
2952            .search_top_k(&sample_vector(1.0, dim), 5, None)
2953            .expect("search");
2954        assert_eq!(hits.len(), 5);
2955        // The main-0 (base=1.0) should rank near the top with query [1.0, ...].
2956        assert!(hits.iter().any(|h| h.doc_id == "main-0"));
2957
2958        std::fs::remove_file(&path).ok();
2959        std::fs::remove_file(wal::wal_path_for(&path)).ok();
2960    }
2961
2962    #[test]
2963    fn concurrent_append_and_search() {
2964        use std::sync::Arc;
2965
2966        let path = temp_index_path("wal-concurrent");
2967        let dim = 4;
2968
2969        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
2970        for i in 0..10 {
2971            writer
2972                .write_record(&format!("main-{i}"), &sample_vector(0.1, dim))
2973                .expect("write");
2974        }
2975        writer.finish().expect("finish");
2976
2977        // Append sequentially (VectorIndex is not Send+Sync for shared mutation),
2978        // then search from multiple threads using a snapshot.
2979        let mut index = VectorIndex::open(&path).expect("open");
2980        for i in 0..20 {
2981            index
2982                .append(&format!("wal-{i}"), &sample_vector(0.5, dim))
2983                .expect("append");
2984        }
2985
2986        let index = Arc::new(index);
2987        let query = sample_vector(1.0, dim);
2988
2989        let handles: Vec<_> = (0..4)
2990            .map(|_| {
2991                let idx = Arc::clone(&index);
2992                let q = query.clone();
2993                std::thread::spawn(move || idx.search_top_k(&q, 10, None).expect("search"))
2994            })
2995            .collect();
2996
2997        for handle in handles {
2998            let hits = handle.join().expect("thread join");
2999            assert_eq!(hits.len(), 10);
3000            // All scores should be positive (dot product of positive vectors).
3001            assert!(hits.iter().all(|h| h.score > 0.0));
3002        }
3003
3004        std::fs::remove_file(&path).ok();
3005        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3006    }
3007
3008    #[test]
3009    fn wal_record_count_across_append_compact_cycles() {
3010        let path = temp_index_path("wal-count-cycle");
3011        let dim = 4;
3012
3013        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
3014        writer
3015            .write_record("main-0", &sample_vector(1.0, dim))
3016            .expect("write");
3017        writer.finish().expect("finish");
3018
3019        let mut index = VectorIndex::open(&path).expect("open");
3020        assert_eq!(index.wal_record_count(), 0);
3021        assert_eq!(index.record_count(), 1);
3022
3023        // Append 3 entries.
3024        index.append("w1", &sample_vector(0.1, dim)).expect("a1");
3025        index.append("w2", &sample_vector(0.2, dim)).expect("a2");
3026        index.append("w3", &sample_vector(0.3, dim)).expect("a3");
3027        assert_eq!(index.wal_record_count(), 3);
3028        assert_eq!(index.record_count(), 1);
3029
3030        // Compact.
3031        index.compact().expect("compact");
3032        assert_eq!(index.wal_record_count(), 0);
3033        assert_eq!(index.record_count(), 4);
3034
3035        // Append 2 more.
3036        index.append("w4", &sample_vector(0.4, dim)).expect("a4");
3037        index.append("w5", &sample_vector(0.5, dim)).expect("a5");
3038        assert_eq!(index.wal_record_count(), 2);
3039        assert_eq!(index.record_count(), 4);
3040
3041        // Total searchable = 4 + 2 = 6.
3042        let hits = index
3043            .search_top_k(&sample_vector(1.0, dim), 100, None)
3044            .expect("search");
3045        assert_eq!(hits.len(), 6);
3046
3047        std::fs::remove_file(&path).ok();
3048        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3049    }
3050
3051    #[test]
3052    fn soft_delete_removes_wal_only_record_and_persists() {
3053        let path = temp_index_path("wal-soft-delete-only");
3054        let dim = 4;
3055
3056        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
3057        writer
3058            .write_record("main-0", &sample_vector(1.0, dim))
3059            .expect("write");
3060        writer.finish().expect("finish");
3061
3062        let mut index = VectorIndex::open(&path).expect("open");
3063        index
3064            .append("wal-only", &[0.0, 1.0, 0.0, 0.0])
3065            .expect("append wal-only");
3066        assert_eq!(index.wal_record_count(), 1);
3067
3068        assert!(index.soft_delete("wal-only").expect("soft delete wal-only"));
3069        assert_eq!(index.wal_record_count(), 0);
3070        let hits = index
3071            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
3072            .expect("search");
3073        assert!(hits.iter().all(|hit| hit.doc_id != "wal-only"));
3074
3075        drop(index);
3076        let reopened = VectorIndex::open(&path).expect("reopen");
3077        assert_eq!(reopened.wal_record_count(), 0);
3078        let reopened_hits = reopened
3079            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
3080            .expect("search after reopen");
3081        assert!(reopened_hits.iter().all(|hit| hit.doc_id != "wal-only"));
3082
3083        std::fs::remove_file(&path).ok();
3084        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3085    }
3086
3087    #[test]
3088    fn soft_delete_clears_pending_wal_updates_for_same_doc_id() {
3089        let path = temp_index_path("wal-soft-delete-main-and-wal");
3090        let dim = 4;
3091
3092        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
3093        writer
3094            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
3095            .expect("write doc-a");
3096        writer.finish().expect("finish");
3097
3098        let mut index = VectorIndex::open(&path).expect("open");
3099        index
3100            .append("doc-a", &[0.0, 1.0, 0.0, 0.0])
3101            .expect("append doc-a update");
3102        index
3103            .append("doc-b", &[0.0, 0.0, 1.0, 0.0])
3104            .expect("append doc-b");
3105        assert_eq!(index.wal_record_count(), 2);
3106
3107        assert!(index.soft_delete("doc-a").expect("soft delete doc-a"));
3108        assert_eq!(
3109            index.wal_record_count(),
3110            1,
3111            "doc-a WAL entries should be purged"
3112        );
3113
3114        let hits = index
3115            .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
3116            .expect("search");
3117        assert!(
3118            hits.iter().all(|hit| hit.doc_id != "doc-a"),
3119            "doc-a should not be searchable from main or WAL"
3120        );
3121        assert!(hits.iter().any(|hit| hit.doc_id == "doc-b"));
3122
3123        std::fs::remove_file(&path).ok();
3124        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3125    }
3126
3127    #[test]
3128    fn empty_index_append_only() {
3129        let path = temp_index_path("wal-empty-append");
3130        let dim = 4;
3131
3132        // Create an empty main index.
3133        let writer = VectorIndex::create(&path, "test", dim).expect("writer");
3134        writer.finish().expect("finish");
3135
3136        let mut index = VectorIndex::open(&path).expect("open");
3137        assert_eq!(index.record_count(), 0);
3138
3139        // Append to empty index via WAL.
3140        index
3141            .append("first", &[1.0, 0.0, 0.0, 0.0])
3142            .expect("append");
3143        assert_eq!(index.wal_record_count(), 1);
3144
3145        // Should still be searchable.
3146        let hits = index
3147            .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
3148            .expect("search");
3149        assert_eq!(hits.len(), 1);
3150        assert_eq!(hits[0].doc_id, "first");
3151
3152        // Compact from empty main + WAL.
3153        let stats = index.compact().expect("compact");
3154        assert_eq!(stats.main_records_before, 0);
3155        assert_eq!(stats.wal_records, 1);
3156        assert_eq!(stats.total_records_after, 1);
3157        assert_eq!(index.record_count(), 1);
3158
3159        std::fs::remove_file(&path).ok();
3160    }
3161
3162    // ─── Quantization edge cases ────────────────────────────────────────
3163
3164    #[test]
3165    fn quantization_bytes_per_element() {
3166        assert_eq!(Quantization::F32.bytes_per_element(), 4);
3167        assert_eq!(Quantization::F16.bytes_per_element(), 2);
3168    }
3169
3170    #[test]
3171    fn quantization_from_wire_valid() {
3172        let path = Path::new("test.fsvi");
3173        assert_eq!(Quantization::from_wire(0, path).unwrap(), Quantization::F32);
3174        assert_eq!(Quantization::from_wire(1, path).unwrap(), Quantization::F16);
3175    }
3176
3177    #[test]
3178    fn quantization_from_wire_invalid() {
3179        let path = Path::new("test.fsvi");
3180        assert!(Quantization::from_wire(2, path).is_err());
3181        assert!(Quantization::from_wire(255, path).is_err());
3182    }
3183
3184    // ─── align_up edge cases ────────────────────────────────────────────
3185
3186    #[test]
3187    fn align_up_zero_alignment() {
3188        assert_eq!(align_up(42, 0).unwrap(), 42);
3189    }
3190
3191    #[test]
3192    fn align_up_already_aligned() {
3193        assert_eq!(align_up(128, 64).unwrap(), 128);
3194    }
3195
3196    #[test]
3197    fn align_up_zero_value() {
3198        assert_eq!(align_up(0, 64).unwrap(), 0);
3199    }
3200
3201    #[test]
3202    fn align_up_one_over() {
3203        assert_eq!(align_up(65, 64).unwrap(), 128);
3204    }
3205
3206    // ─── fnv1a_hash edge cases ──────────────────────────────────────────
3207
3208    #[test]
3209    fn fnv1a_hash_empty_input() {
3210        let hash = fnv1a_hash(b"");
3211        assert_eq!(hash, 0xcbf2_9ce4_8422_2325);
3212    }
3213
3214    #[test]
3215    fn fnv1a_hash_deterministic() {
3216        let h1 = fnv1a_hash(b"hello");
3217        let h2 = fnv1a_hash(b"hello");
3218        assert_eq!(h1, h2);
3219    }
3220
3221    #[test]
3222    fn fnv1a_hash_different_inputs_differ() {
3223        let h1 = fnv1a_hash(b"doc-a");
3224        let h2 = fnv1a_hash(b"doc-b");
3225        assert_ne!(h1, h2);
3226    }
3227
3228    // ─── is_tombstoned_flags ────────────────────────────────────────────
3229
3230    #[test]
3231    fn tombstone_flag_logic() {
3232        assert!(!is_tombstoned_flags(0x0000));
3233        assert!(is_tombstoned_flags(RECORD_FLAG_TOMBSTONE));
3234        assert!(is_tombstoned_flags(0x0003)); // tombstone + custom
3235        assert!(!is_tombstoned_flags(0x0002)); // only custom
3236    }
3237
3238    // ─── validate_header_string ─────────────────────────────────────────
3239
3240    #[test]
3241    fn validate_header_string_empty_embedder_id_rejected() {
3242        let result = validate_header_string("", "embedder_id");
3243        assert!(result.is_err());
3244    }
3245
3246    #[test]
3247    fn validate_header_string_empty_embedder_revision_ok() {
3248        let result = validate_header_string("", "embedder_revision");
3249        assert!(result.is_ok());
3250    }
3251
3252    #[test]
3253    fn validate_header_string_normal_ok() {
3254        let result = validate_header_string("potion-128M", "embedder_id");
3255        assert!(result.is_ok());
3256    }
3257
3258    // ─── VectorMetadata clone/eq ────────────────────────────────────────
3259
3260    #[test]
3261    fn vector_metadata_clone_eq() {
3262        let meta = VectorMetadata {
3263            embedder_id: "test".to_owned(),
3264            embedder_revision: "v1".to_owned(),
3265            dimension: 256,
3266            quantization: Quantization::F16,
3267            compaction_gen: 0,
3268            record_count: 100,
3269            vectors_offset: 1024,
3270        };
3271        let cloned = meta.clone();
3272        assert_eq!(meta, cloned);
3273    }
3274
3275    // ─── VectorIndex::create validation ─────────────────────────────────
3276
3277    #[test]
3278    fn create_zero_dimension_rejected() {
3279        let path = temp_index_path("zero-dim");
3280        let result = VectorIndex::create(&path, "test", 0);
3281        assert!(result.is_err());
3282        assert!(matches!(
3283            result.unwrap_err(),
3284            SearchError::InvalidConfig { .. }
3285        ));
3286    }
3287
3288    #[test]
3289    fn create_empty_embedder_id_rejected() {
3290        let path = temp_index_path("empty-embedder");
3291        let result = VectorIndex::create(&path, "", 4);
3292        assert!(result.is_err());
3293    }
3294
3295    #[test]
3296    fn create_with_revision_empty_revision_ok() {
3297        let path = temp_index_path("empty-rev");
3298        let writer =
3299            VectorIndex::create_with_revision(&path, "test", "", 4, Quantization::F16).unwrap();
3300        writer.finish().unwrap();
3301        let index = VectorIndex::open(&path).unwrap();
3302        assert_eq!(index.embedder_revision(), "");
3303        std::fs::remove_file(&path).ok();
3304    }
3305
3306    // ─── VectorIndexWriter rejection cases ──────────────────────────────
3307
3308    #[test]
3309    fn write_record_nan_embedding_rejected() {
3310        let path = temp_index_path("nan-embed");
3311        let mut writer = VectorIndex::create(&path, "test", 3).unwrap();
3312        let result = writer.write_record("doc", &[1.0, f32::NAN, 0.0]);
3313        assert!(result.is_err());
3314        let err = format!("{}", result.unwrap_err());
3315        assert!(
3316            err.contains("non-finite"),
3317            "expected non-finite error, got: {err}"
3318        );
3319    }
3320
3321    #[test]
3322    fn write_record_inf_embedding_rejected() {
3323        let path = temp_index_path("inf-embed");
3324        let mut writer = VectorIndex::create(&path, "test", 3).unwrap();
3325        let result = writer.write_record("doc", &[1.0, f32::INFINITY, 0.0]);
3326        assert!(result.is_err());
3327    }
3328
3329    // ─── VectorIndex::open edge cases ───────────────────────────────────
3330
3331    #[test]
3332    fn open_nonexistent_file_returns_index_not_found() {
3333        let path = temp_index_path("nonexistent-open");
3334        let result = VectorIndex::open(&path);
3335        assert!(result.is_err());
3336        assert!(matches!(
3337            result.unwrap_err(),
3338            SearchError::IndexNotFound { .. }
3339        ));
3340    }
3341
3342    #[test]
3343    fn open_truncated_file_detected() {
3344        let path = temp_index_path("truncated-open");
3345        let mut writer = VectorIndex::create(&path, "test", 4).unwrap();
3346        writer.write_record("doc-0", &[1.0, 0.0, 0.0, 0.0]).unwrap();
3347        writer.finish().unwrap();
3348
3349        let data = std::fs::read(&path).unwrap();
3350        std::fs::write(&path, &data[..data.len() - 4]).unwrap();
3351
3352        let result = VectorIndex::open(&path);
3353        assert!(result.is_err());
3354        let err = format!("{}", result.unwrap_err());
3355        assert!(
3356            err.contains("truncated") || err.contains("too small") || err.contains("extends"),
3357            "expected truncation error, got: {err}"
3358        );
3359
3360        std::fs::remove_file(&path).ok();
3361    }
3362
3363    // ─── FSVI constants ─────────────────────────────────────────────────
3364
3365    #[test]
3366    fn fsvi_magic_is_four_bytes() {
3367        assert_eq!(FSVI_MAGIC.len(), 4);
3368        assert_eq!(&FSVI_MAGIC, b"FSVI");
3369    }
3370
3371    #[test]
3372    fn fsvi_version_is_one() {
3373        assert_eq!(FSVI_VERSION, 1);
3374    }
3375
3376    #[test]
3377    fn record_size_is_sixteen() {
3378        assert_eq!(RECORD_SIZE_BYTES, 16);
3379    }
3380
3381    // ─── vector_at_f16 on f16 index ─────────────────────────────────────
3382
3383    #[test]
3384    fn vector_at_f16_roundtrip() {
3385        let path = temp_index_path("f16-at-roundtrip");
3386        let mut writer =
3387            VectorIndex::create_with_revision(&path, "test", "r1", 3, Quantization::F16).unwrap();
3388        writer.write_record("doc", &[0.5, -0.5, 1.0]).unwrap();
3389        writer.finish().unwrap();
3390
3391        let index = VectorIndex::open(&path).unwrap();
3392        let f16_vec = index.vector_at_f16(0).unwrap();
3393        assert_eq!(f16_vec.len(), 3);
3394        assert!((f16_vec[0].to_f32() - 0.5).abs() < 0.01);
3395        assert!((f16_vec[1].to_f32() - (-0.5)).abs() < 0.01);
3396        assert!((f16_vec[2].to_f32() - 1.0).abs() < 0.01);
3397
3398        std::fs::remove_file(&path).ok();
3399    }
3400
3401    // ─── vector_at_f16 on f32 index (converts) ─────────────────────────
3402
3403    #[test]
3404    fn vector_at_f16_from_f32_index() {
3405        let path = temp_index_path("f16-from-f32");
3406        let mut writer =
3407            VectorIndex::create_with_revision(&path, "test", "r1", 3, Quantization::F32).unwrap();
3408        writer.write_record("doc", &[0.25, -0.75, 1.0]).unwrap();
3409        writer.finish().unwrap();
3410
3411        let index = VectorIndex::open(&path).unwrap();
3412        let f16_vec = index.vector_at_f16(0).unwrap();
3413        assert_eq!(f16_vec.len(), 3);
3414        assert!((f16_vec[0].to_f32() - 0.25).abs() < 0.01);
3415
3416        std::fs::remove_file(&path).ok();
3417    }
3418
3419    // ─── metadata accessor ──────────────────────────────────────────────
3420
3421    #[test]
3422    fn metadata_accessor_returns_consistent_data() {
3423        let path = temp_index_path("metadata-accessor");
3424        let mut writer =
3425            VectorIndex::create_with_revision(&path, "emb-1", "rev-9", 16, Quantization::F32)
3426                .unwrap();
3427        writer.write_record("d", &[0.0; 16]).unwrap();
3428        writer.finish().unwrap();
3429
3430        let index = VectorIndex::open(&path).unwrap();
3431        let meta = index.metadata();
3432        assert_eq!(meta.embedder_id, "emb-1");
3433        assert_eq!(meta.embedder_revision, "rev-9");
3434        assert_eq!(meta.dimension, 16);
3435        assert_eq!(meta.quantization, Quantization::F32);
3436        assert_eq!(meta.record_count, 1);
3437        assert_eq!(meta.vectors_offset % 64, 0);
3438
3439        std::fs::remove_file(&path).ok();
3440    }
3441
3442    // ─── is_deleted accessor ────────────────────────────────────────────
3443
3444    #[test]
3445    fn is_deleted_false_for_live_record() {
3446        let path = temp_index_path("is-deleted-live");
3447        let mut writer = VectorIndex::create(&path, "test", 4).unwrap();
3448        writer.write_record("doc", &[1.0, 0.0, 0.0, 0.0]).unwrap();
3449        writer.finish().unwrap();
3450
3451        let index = VectorIndex::open(&path).unwrap();
3452        assert!(!index.is_deleted(0));
3453
3454        std::fs::remove_file(&path).ok();
3455    }
3456
3457    // ─── tombstone_ratio empty index ────────────────────────────────────
3458
3459    #[test]
3460    fn tombstone_ratio_empty_index_is_zero() {
3461        let path = temp_index_path("tomb-ratio-empty");
3462        let writer = VectorIndex::create(&path, "test", 4).unwrap();
3463        writer.finish().unwrap();
3464
3465        let index = VectorIndex::open(&path).unwrap();
3466        assert!(index.tombstone_ratio().abs() < f64::EPSILON);
3467        assert!(!index.needs_vacuum());
3468
3469        std::fs::remove_file(&path).ok();
3470    }
3471
3472    // ─── WalConfig default ──────────────────────────────────────────────
3473
3474    #[test]
3475    fn wal_config_default_values() {
3476        let cfg = WalConfig::default();
3477        assert!(cfg.compaction_threshold > 0);
3478        assert!(cfg.compaction_ratio > 0.0);
3479    }
3480
3481    // ─── F32 roundtrip with explicit revision ───────────────────────────
3482
3483    #[test]
3484    fn f32_roundtrip_with_revision() {
3485        let path = temp_index_path("f32-rev-roundtrip");
3486        let original = vec![std::f32::consts::PI, std::f32::consts::E, 0.0, -1.0];
3487        let mut writer =
3488            VectorIndex::create_with_revision(&path, "f32-emb", "rev-42", 4, Quantization::F32)
3489                .unwrap();
3490        writer.write_record("doc", &original).unwrap();
3491        writer.finish().unwrap();
3492
3493        let index = VectorIndex::open(&path).unwrap();
3494        let recovered = index.vector_at_f32(0).unwrap();
3495        assert_eq!(recovered, original, "f32 must roundtrip exactly");
3496        assert_eq!(index.embedder_revision(), "rev-42");
3497
3498        std::fs::remove_file(&path).ok();
3499    }
3500
3501    // ─── Header CRC corruption by flipping data byte ────────────────────
3502
3503    #[test]
3504    fn header_crc_detects_embedder_id_corruption() {
3505        let path = temp_index_path("crc-embedder-corrupt");
3506        let mut writer = VectorIndex::create(&path, "test-embedder-long", 4).unwrap();
3507        writer.write_record("doc", &[1.0, 0.0, 0.0, 0.0]).unwrap();
3508        writer.finish().unwrap();
3509
3510        let mut data = std::fs::read(&path).unwrap();
3511        // Flip a byte in the embedder_id region (after magic+version+id_len = 8 bytes)
3512        data[10] ^= 0xFF;
3513        std::fs::write(&path, &data).unwrap();
3514
3515        let result = VectorIndex::open(&path);
3516        assert!(result.is_err());
3517        let err = format!("{}", result.unwrap_err());
3518        assert!(
3519            err.contains("CRC") || err.contains("crc"),
3520            "expected CRC error, got: {err}"
3521        );
3522
3523        std::fs::remove_file(&path).ok();
3524    }
3525
3526    // ─── bd-1fh4 tests begin ──────────────────────────────────────────
3527
3528    #[test]
3529    fn vacuum_stats_debug_clone_partial_eq() {
3530        let stats = VacuumStats {
3531            records_before: 10,
3532            records_after: 8,
3533            tombstones_removed: 2,
3534            bytes_reclaimed: 1024,
3535            duration: Duration::from_millis(5),
3536        };
3537        let debug = format!("{stats:?}");
3538        assert!(debug.contains("VacuumStats"));
3539        assert!(debug.contains("records_before: 10"));
3540
3541        let cloned = stats.clone();
3542        assert_eq!(stats, cloned);
3543    }
3544
3545    #[test]
3546    fn quantization_debug_clone_copy_eq() {
3547        let f16 = Quantization::F16;
3548        let f32q = Quantization::F32;
3549
3550        let debug_f16 = format!("{f16:?}");
3551        assert!(debug_f16.contains("F16"));
3552        let debug_f32 = format!("{f32q:?}");
3553        assert!(debug_f32.contains("F32"));
3554
3555        let f16_copy = f16;
3556        assert_eq!(f16, f16_copy);
3557        let f32_copy = f32q;
3558        assert_eq!(f32q, f32_copy);
3559        assert_ne!(f16, f32q);
3560    }
3561
3562    #[test]
3563    fn vector_index_debug_includes_path() {
3564        let path = temp_index_path("debug-fmt");
3565        let writer = VectorIndex::create(&path, "test", 4).unwrap();
3566        writer.finish().unwrap();
3567
3568        let index = VectorIndex::open(&path).unwrap();
3569        let debug = format!("{index:?}");
3570        assert!(debug.contains("VectorIndex"));
3571
3572        std::fs::remove_file(&path).ok();
3573    }
3574
3575    #[test]
3576    fn set_wal_config_overrides_defaults() {
3577        let path = temp_index_path("wal-cfg-override");
3578        let dim = 4;
3579        let mut writer = VectorIndex::create(&path, "test", dim).unwrap();
3580        for i in 0..100 {
3581            writer
3582                .write_record(&format!("d{i}"), &sample_vector(0.1, dim))
3583                .unwrap();
3584        }
3585        writer.finish().unwrap();
3586
3587        let mut index = VectorIndex::open(&path).unwrap();
3588        // With 100 main records and default config, 1 WAL entry should not trigger.
3589        index.append("wal-1", &sample_vector(0.5, dim)).unwrap();
3590        assert!(!index.needs_compaction());
3591
3592        // Set a low threshold to trigger compaction.
3593        index.set_wal_config(WalConfig {
3594            compaction_threshold: 1,
3595            compaction_ratio: 0.001,
3596            fsync_on_write: false,
3597        });
3598        assert!(index.needs_compaction());
3599
3600        std::fs::remove_file(&path).ok();
3601        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3602    }
3603
3604    #[test]
3605    fn find_index_by_doc_hash_empty_index_none() {
3606        let path = temp_index_path("hash-empty");
3607        let writer = VectorIndex::create(&path, "test", 4).unwrap();
3608        writer.finish().unwrap();
3609
3610        let index = VectorIndex::open(&path).unwrap();
3611        assert!(index.find_index_by_doc_hash(0xDEAD_BEEF).is_none());
3612        assert!(index.find_index_by_doc_hash(0).is_none());
3613
3614        std::fs::remove_file(&path).ok();
3615    }
3616
3617    #[test]
3618    fn get_embeddings_mixed_hit_miss() {
3619        let path = temp_index_path("emb-mixed");
3620        let mut writer =
3621            VectorIndex::create_with_revision(&path, "test", "r1", 3, Quantization::F16).unwrap();
3622        writer.write_record("alpha", &[1.0, 0.0, 0.0]).unwrap();
3623        writer.write_record("beta", &[0.0, 1.0, 0.0]).unwrap();
3624        writer.finish().unwrap();
3625
3626        let index = VectorIndex::open(&path).unwrap();
3627        let alpha_hash = fnv1a_hash(b"alpha");
3628        let beta_hash = fnv1a_hash(b"beta");
3629        let missing_hash = fnv1a_hash(b"gamma");
3630
3631        let results = index.get_embeddings(&[alpha_hash, missing_hash, beta_hash]);
3632        assert_eq!(results.len(), 3);
3633        assert!(results[0].is_some(), "alpha should be found");
3634        assert!(results[1].is_none(), "gamma should be missing");
3635        assert!(results[2].is_some(), "beta should be found");
3636
3637        std::fs::remove_file(&path).ok();
3638    }
3639
3640    #[test]
3641    fn append_batch_empty_is_noop() {
3642        let path = temp_index_path("append-empty-batch");
3643        let writer = VectorIndex::create(&path, "test", 4).unwrap();
3644        writer.finish().unwrap();
3645
3646        let mut index = VectorIndex::open(&path).unwrap();
3647        index.append_batch(&[]).unwrap();
3648        assert_eq!(index.wal_record_count(), 0);
3649
3650        std::fs::remove_file(&path).ok();
3651    }
3652
3653    #[test]
3654    fn append_nan_embedding_rejected() {
3655        let path = temp_index_path("append-nan");
3656        let writer = VectorIndex::create(&path, "test", 4).unwrap();
3657        writer.finish().unwrap();
3658
3659        let mut index = VectorIndex::open(&path).unwrap();
3660        let result = index.append("doc", &[1.0, f32::NAN, 0.0, 0.0]);
3661        assert!(result.is_err());
3662        let err = format!("{}", result.unwrap_err());
3663        assert!(err.contains("finite"), "expected finite error, got: {err}");
3664    }
3665
3666    #[test]
3667    fn append_inf_embedding_rejected() {
3668        let path = temp_index_path("append-inf");
3669        let writer = VectorIndex::create(&path, "test", 4).unwrap();
3670        writer.finish().unwrap();
3671
3672        let mut index = VectorIndex::open(&path).unwrap();
3673        let result = index.append("doc", &[1.0, 0.0, f32::INFINITY, 0.0]);
3674        assert!(result.is_err());
3675        let err = format!("{}", result.unwrap_err());
3676        assert!(err.contains("finite"), "expected finite error, got: {err}");
3677    }
3678
3679    #[test]
3680    fn soft_delete_already_deleted_returns_false() {
3681        let path = temp_index_path("double-delete");
3682        let mut writer = VectorIndex::create(&path, "test", 4).unwrap();
3683        writer.write_record("doc", &[1.0, 0.0, 0.0, 0.0]).unwrap();
3684        writer.finish().unwrap();
3685
3686        let mut index = VectorIndex::open(&path).unwrap();
3687        assert!(index.soft_delete("doc").unwrap(), "first delete");
3688        assert!(!index.soft_delete("doc").unwrap(), "second delete");
3689        assert!(!index.soft_delete("doc").unwrap(), "third delete");
3690
3691        std::fs::remove_file(&path).ok();
3692    }
3693
3694    #[test]
3695    fn compact_preserves_wal_config() {
3696        let path = temp_index_path("compact-cfg");
3697        let dim = 4;
3698        let mut writer = VectorIndex::create(&path, "test", dim).unwrap();
3699        for i in 0..20 {
3700            writer
3701                .write_record(&format!("d{i}"), &sample_vector(0.1, dim))
3702                .unwrap();
3703        }
3704        writer.finish().unwrap();
3705
3706        let mut index = VectorIndex::open(&path).unwrap();
3707        let custom = WalConfig {
3708            compaction_threshold: 99,
3709            compaction_ratio: 0.90,
3710            fsync_on_write: false,
3711        };
3712        index.set_wal_config(custom);
3713        index.append("wal-1", &sample_vector(0.5, dim)).unwrap();
3714        index.compact().unwrap();
3715
3716        // After compaction, the custom config should be preserved.
3717        assert_eq!(index.wal_record_count(), 0);
3718        // Verify config persists: threshold=99 and ratio=0.90,
3719        // with 21 main records, 1 WAL entry → ratio ~0.048 < 0.90.
3720        index.append("wal-2", &sample_vector(0.3, dim)).unwrap();
3721        assert!(!index.needs_compaction());
3722
3723        std::fs::remove_file(&path).ok();
3724        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3725    }
3726
3727    #[test]
3728    fn soft_delete_wal_restores_state_on_rewrite_failure() {
3729        let path = temp_index_path("wal-delete-restore");
3730        let dim = 4;
3731
3732        let mut writer = VectorIndex::create(&path, "test", dim).unwrap();
3733        writer
3734            .write_record("main-0", &sample_vector(1.0, dim))
3735            .unwrap();
3736        writer.finish().unwrap();
3737
3738        let mut index = VectorIndex::open(&path).unwrap();
3739        index.append("wal-a", &[0.0, 1.0, 0.0, 0.0]).unwrap();
3740        index.append("wal-b", &[0.0, 0.0, 1.0, 0.0]).unwrap();
3741        assert_eq!(index.wal_record_count(), 2);
3742
3743        // Make the WAL parent directory read-only to force a rewrite failure.
3744        let wal_file = wal::wal_path_for(&path);
3745        let wal_dir = wal_file.parent().unwrap();
3746        let original_perms = fs::metadata(wal_dir).unwrap().permissions();
3747        let mut readonly = original_perms.clone();
3748        readonly.set_readonly(true);
3749        if fs::set_permissions(wal_dir, readonly).is_err() {
3750            // Sandboxed environments may not allow permission changes; skip.
3751            std::fs::remove_file(&path).ok();
3752            std::fs::remove_file(wal::wal_path_for(&path)).ok();
3753            return;
3754        }
3755
3756        let result = index.soft_delete("wal-a");
3757
3758        // Restore directory permissions before any assertions so cleanup works.
3759        fs::set_permissions(wal_dir, original_perms).unwrap();
3760
3761        // The delete should have failed.
3762        assert!(result.is_err(), "expected error from read-only directory");
3763
3764        // In-memory WAL entries must be fully restored.
3765        assert_eq!(
3766            index.wal_record_count(),
3767            2,
3768            "WAL entries should be restored after rewrite failure"
3769        );
3770
3771        // Both entries should still be searchable.
3772        let hits = index.search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None).unwrap();
3773        assert!(hits.iter().any(|h| h.doc_id == "wal-a"));
3774        assert!(hits.iter().any(|h| h.doc_id == "wal-b"));
3775
3776        std::fs::remove_file(&path).ok();
3777        std::fs::remove_file(wal::wal_path_for(&path)).ok();
3778    }
3779
3780    // ─── Regression: Duplicate entries on compaction crash ──────────────
3781
3782    #[test]
3783    fn repro_duplicate_entries_on_compaction_crash() {
3784        let path = temp_index_path("compaction-crash");
3785        let dim = 4;
3786
3787        // 1. Create initial index with 1 document
3788        let mut writer =
3789            VectorIndex::create_with_revision(&path, "test", "v1", dim, Quantization::F16).unwrap();
3790        writer.write_record("doc-A", &[1.0, 0.0, 0.0, 0.0]).unwrap();
3791        writer.finish().unwrap();
3792
3793        let mut index = VectorIndex::open(&path).unwrap();
3794
3795        // 2. Append a document to WAL
3796        index.append("doc-B", &[0.0, 1.0, 0.0, 0.0]).unwrap();
3797
3798        // Check state before "compaction"
3799        let hits = index.search_top_k(&[1.0, 1.0, 0.0, 0.0], 10, None).unwrap();
3800        assert_eq!(hits.len(), 2);
3801
3802        // 3. Simulate compaction crash:
3803        // We want to create a state where "doc-B" is in Main Index AND in WAL.
3804        // We can do this by running `compact` but preventing the WAL deletion.
3805        // Since we can't easily interrupt `compact`, we'll simulate the filesystem state.
3806
3807        // Close index to flush everything
3808        drop(index);
3809
3810        // Manually create the "post-compaction" main index that includes both A and B.
3811        let mut compact_writer =
3812            VectorIndex::create_with_revision(&path, "test", "v1", dim, Quantization::F16)
3813                .unwrap()
3814                .with_generation(2); // Simulate correct compaction increment
3815        compact_writer
3816            .write_record("doc-A", &[1.0, 0.0, 0.0, 0.0])
3817            .unwrap();
3818        compact_writer
3819            .write_record("doc-B", &[0.0, 1.0, 0.0, 0.0])
3820            .unwrap();
3821        compact_writer.finish().unwrap(); // Overwrites `path` with new index containing A and B.
3822
3823        // Restore the WAL file (because `finish` doesn't touch it, but we need to ensure it exists and has doc-B)
3824        // Actually, `finish` overwrites `path`. The WAL file is at `path.wal`.
3825        // We didn't delete `path.wal`. So `path.wal` still contains "doc-B".
3826
3827        // 4. Re-open index. It should load Main (A, B) and WAL (B).
3828        let index_reopened = VectorIndex::open(&path).unwrap();
3829
3830        // 5. Search. If bug exists, we'll see "doc-B" twice.
3831        let hits = index_reopened
3832            .search_top_k(&[1.0, 1.0, 0.0, 0.0], 10, None)
3833            .unwrap();
3834
3835        // Debug output
3836        for hit in &hits {
3837            println!("Hit: {} score={}", hit.doc_id, hit.score);
3838        }
3839
3840        // Clean up
3841        let _ = fs::remove_file(&path);
3842        let _ = wal::remove_wal(&wal::wal_path_for(&path));
3843
3844        // Assert failure
3845        let hit_count = hits.len();
3846        assert_eq!(
3847            hit_count, 2,
3848            "Should have exactly 2 hits (A and B), found {hit_count}"
3849        );
3850        let b_count = hits.iter().filter(|h| h.doc_id == "doc-B").count();
3851        assert_eq!(b_count, 1, "Should have exactly 1 'doc-B', found {b_count}");
3852    }
3853
3854    // ─── bd-1fh4 tests end ────────────────────────────────────────────
3855}