1#[cfg(feature = "ann")]
46pub mod hnsw;
47pub mod in_memory;
48pub mod mrl;
49pub mod quantization;
50mod repro_soft_delete_rollback;
51mod repro_wal_truncation;
52pub mod search;
53pub mod simd;
54pub mod two_tier;
55pub mod wal;
56pub mod warmup;
57
58use std::fs::{self, File, OpenOptions};
59use std::io::{BufWriter, Write};
60use std::path::{Path, PathBuf};
61use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
62
63use crc32fast::Hasher as Crc32;
64use frankensearch_core::{SearchError, SearchResult};
65use half::f16;
66use memmap2::MmapMut;
67use tracing::debug;
68
69#[cfg(feature = "ann")]
70pub use hnsw::{
71 AnnSearchStats, HNSW_DEFAULT_EF_CONSTRUCTION, HNSW_DEFAULT_EF_SEARCH, HNSW_DEFAULT_M,
72 HNSW_DEFAULT_MAX_LAYER, HnswConfig, HnswIndex,
73};
74pub use in_memory::{InMemoryTwoTierIndex, InMemoryVectorIndex};
75pub use mrl::{MrlConfig, MrlSearchStats};
76pub use quantization::ScalarQuantizer;
77pub use search::{PARALLEL_CHUNK_SIZE, PARALLEL_THRESHOLD, SearchParams};
78pub use simd::{
79 cosine_similarity_f16, dot_product_f16_bytes_f32, dot_product_f16_f32,
80 dot_product_f32_bytes_f32, dot_product_f32_f32,
81};
82pub use two_tier::{
83 TwoTierIndex, TwoTierIndexBuilder, VECTOR_INDEX_FALLBACK_FILENAME, VECTOR_INDEX_FAST_FILENAME,
84 VECTOR_INDEX_QUALITY_FILENAME,
85};
86#[cfg(feature = "ann")]
87pub use two_tier::{VECTOR_ANN_FAST_FILENAME, VECTOR_ANN_QUALITY_FILENAME};
88pub use wal::{CompactionStats, WalConfig, wal_path_for};
89pub use warmup::{AdaptiveConfig, HeatMap, WarmUpConfig, WarmUpResult, WarmUpStrategy};
90
/// Four-byte ASCII tag `FSVI`.
// NOTE(review): presumably the leading magic of every index file, checked by the
// header parser — confirm against `parse_header`, which is outside this view.
pub const FSVI_MAGIC: [u8; 4] = *b"FSVI";

/// FSVI format version number.
// NOTE(review): presumably stored in the file header — confirm against the header writer.
pub const FSVI_VERSION: u16 = 1;

/// Size of one record-table row: `u64` hash + `u32` string offset + `u16` length + `u16` flags.
const RECORD_SIZE_BYTES: usize = 16;
/// Alignment, in bytes, that the start of the vector slab is rounded up to when an
/// index file is rewritten.
const VECTOR_ALIGN_BYTES: u64 = 64;
/// Bit set in a record's `flags` word to mark the record as soft-deleted.
const RECORD_FLAG_TOMBSTONE: u16 = 0x0001;
/// `needs_vacuum` reports `true` once the tombstone ratio strictly exceeds this fraction.
const TOMBSTONE_VACUUM_THRESHOLD: f64 = 0.20;
101
/// Element encoding used for vectors stored in an index.
///
/// The discriminants are the byte values accepted by `Quantization::from_wire`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum Quantization {
    /// 32-bit floats, four bytes per element (wire value 0).
    F32 = 0,
    /// 16-bit half-precision floats, two bytes per element (wire value 1).
    F16 = 1,
}
111
112impl Quantization {
113 pub(crate) fn from_wire(value: u8, path: &Path) -> SearchResult<Self> {
114 match value {
115 0 => Ok(Self::F32),
116 1 => Ok(Self::F16),
117 _ => Err(index_corrupted(
118 path,
119 format!("unsupported quantization byte: {value}"),
120 )),
121 }
122 }
123
124 const fn bytes_per_element(self) -> usize {
125 match self {
126 Self::F32 => 4,
127 Self::F16 => 2,
128 }
129 }
130}
131
/// Header-level description of an index file, as returned by `parse_header`
/// when the file is opened.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VectorMetadata {
    /// Identifier of the embedder that produced the vectors.
    pub embedder_id: String,
    /// Revision string of the embedder (`VectorIndex::create` passes an empty string).
    pub embedder_revision: String,
    /// Number of elements in every stored vector.
    pub dimension: usize,
    /// Element encoding of the stored vectors.
    pub quantization: Quantization,
    /// Generation counter of the main file; `open` compares it against the
    /// generation recorded in the WAL to detect a stale sidecar.
    pub compaction_gen: u8,
    /// Number of rows in the record table (tombstoned rows included).
    pub record_count: usize,
    /// Byte offset, from the start of the file, at which the vector slab begins.
    pub vectors_offset: u64,
}
150
/// Outcome of a `VectorIndex::vacuum` call.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VacuumStats {
    /// Record count of the main index before the vacuum.
    pub records_before: usize,
    /// Record count of the main index after the vacuum.
    pub records_after: usize,
    /// `records_before - records_after` (saturating); zero when nothing was rewritten.
    pub tombstones_removed: usize,
    /// Reduction in mapped file size in bytes (saturating at zero).
    pub bytes_reclaimed: usize,
    /// Wall-clock time spent inside `vacuum`.
    pub duration: Duration,
}
165
/// One decoded row of the record table (`RECORD_SIZE_BYTES` bytes on disk).
#[derive(Debug, Clone, Copy)]
pub(crate) struct RecordEntry {
    /// Hash of the document id; lookups compare it against `fnv1a_hash` of the id bytes.
    pub(crate) doc_id_hash: u64,
    /// Byte offset of the document id, relative to the start of the string table.
    pub(crate) doc_id_offset: u32,
    /// Length of the document id in bytes.
    pub(crate) doc_id_len: u16,
    /// Flag bits; `RECORD_FLAG_TOMBSTONE` marks the record as soft-deleted.
    pub(crate) flags: u16,
}
173
/// Handle on an opened index file plus the in-memory view of its WAL sidecar.
#[derive(Debug)]
pub struct VectorIndex {
    /// Path of the main index file.
    pub(crate) path: PathBuf,
    /// Read/write memory map of the main index file.
    pub(crate) data: MmapMut,
    /// Header fields parsed from the file.
    pub(crate) metadata: VectorMetadata,
    /// Byte offset of the record table (equal to the header length).
    pub(crate) records_offset: usize,
    /// Byte offset of the string table (directly after the record table).
    pub(crate) strings_offset: usize,
    /// Byte offset of the vector slab.
    pub(crate) vectors_offset: usize,
    /// Entries pending in the WAL, at most one per document id.
    pub(crate) wal_entries: Vec<wal::WalEntry>,
    /// WAL/compaction tuning; `open` initialises it to `WalConfig::default()`.
    wal_config: WalConfig,
}
187
188impl VectorIndex {
189 #[allow(unsafe_code, clippy::too_many_lines)] pub fn open(path: &Path) -> SearchResult<Self> {
197 if !path.exists() {
198 return Err(SearchError::IndexNotFound {
199 path: path.to_path_buf(),
200 });
201 }
202
203 let file = OpenOptions::new()
204 .read(true)
205 .write(true)
206 .open(path)
207 .map_err(SearchError::Io)?;
208 let data = unsafe { MmapMut::map_mut(&file).map_err(SearchError::Io)? };
209 let (metadata, header_len) = parse_header(path, &data)?;
210
211 let records_bytes = metadata
212 .record_count
213 .checked_mul(RECORD_SIZE_BYTES)
214 .ok_or_else(|| index_corrupted(path, "record table size overflow"))?;
215 let records_offset = header_len;
216 let strings_offset = records_offset
217 .checked_add(records_bytes)
218 .ok_or_else(|| index_corrupted(path, "record table offset overflow"))?;
219 let vectors_offset = usize::try_from(metadata.vectors_offset)
220 .map_err(|_| index_corrupted(path, "vectors_offset does not fit in usize"))?;
221 if vectors_offset < strings_offset {
222 return Err(index_corrupted(
223 path,
224 "vectors_offset points inside the record table/string table region",
225 ));
226 }
227
228 let vector_bytes = metadata
229 .record_count
230 .checked_mul(metadata.dimension)
231 .and_then(|v| v.checked_mul(metadata.quantization.bytes_per_element()))
232 .ok_or_else(|| index_corrupted(path, "vector slab size overflow"))?;
233 let required_len = vectors_offset
234 .checked_add(vector_bytes)
235 .ok_or_else(|| index_corrupted(path, "vector slab end overflow"))?;
236 if data.len() < required_len {
237 return Err(index_corrupted(
238 path,
239 format!(
240 "truncated file: have {} bytes, need at least {} bytes",
241 data.len(),
242 required_len
243 ),
244 ));
245 }
246
247 let warm_up_config = WarmUpConfig::from_env();
248 if !matches!(warm_up_config.strategy, WarmUpStrategy::None) {
249 let warm_up = warmup::warm_up_bytes(&data, header_len, &warm_up_config, None);
250 debug!(
251 target: "frankensearch.warmup",
252 path = %path.display(),
253 strategy = %warm_up.strategy_name,
254 pages_touched = warm_up.pages_touched,
255 bytes_touched = warm_up.bytes_touched,
256 budget_exhausted = warm_up.budget_exhausted,
257 "index warm-up complete"
258 );
259 }
260
261 let wal_path = wal::wal_path_for(path);
263 let (wal_entries_raw, wal_compaction_gen, valid_len) =
264 wal::read_wal(&wal_path, metadata.dimension, metadata.quantization)?;
265
266 let mut deduped_wal = Vec::with_capacity(wal_entries_raw.len());
267 let mut seen_ids = std::collections::HashSet::new();
268 for entry in wal_entries_raw.into_iter().rev() {
269 if seen_ids.insert(entry.doc_id.clone()) {
270 deduped_wal.push(entry);
271 }
272 }
273 deduped_wal.reverse();
274 let mut wal_entries = deduped_wal;
275
276 let is_stale = if valid_len > 0 {
277 if wal_compaction_gen == 0 {
278 metadata.compaction_gen > 0
279 } else {
280 let expected = next_generation(metadata.compaction_gen);
281 wal_compaction_gen != expected
282 }
283 } else {
284 false
285 };
286
287 if is_stale {
288 tracing::warn!(
289 path = %path.display(),
290 main_gen = metadata.compaction_gen,
291 wal_gen = wal_compaction_gen,
292 "discarding stale/mismatched WAL entries and removing file"
293 );
294 wal_entries.clear();
295 if wal_path.exists() {
296 let _ = std::fs::remove_file(&wal_path);
297 }
298 } else if wal_path.exists() {
299 let actual_len = std::fs::metadata(&wal_path).map_err(SearchError::Io)?.len();
300 if actual_len > valid_len {
301 tracing::warn!(
302 path = %wal_path.display(),
303 actual_len,
304 valid_len,
305 "truncating corrupted WAL trailer"
306 );
307 let file = OpenOptions::new()
308 .write(true)
309 .open(&wal_path)
310 .map_err(SearchError::Io)?;
311 file.set_len(valid_len).map_err(SearchError::Io)?;
312 file.sync_all().map_err(SearchError::Io)?;
313 }
314 }
315
316 Ok(Self {
317 path: path.to_path_buf(),
318 data,
319 metadata,
320 records_offset,
321 strings_offset,
322 vectors_offset,
323 wal_entries,
324 wal_config: WalConfig::default(),
325 })
326 }
327
328 pub fn create(
335 path: &Path,
336 embedder_id: &str,
337 dimension: usize,
338 ) -> SearchResult<VectorIndexWriter> {
339 Self::create_with_revision(path, embedder_id, "", dimension, Quantization::F16)
340 }
341
342 pub fn create_with_revision(
349 path: &Path,
350 embedder_id: &str,
351 embedder_revision: &str,
352 dimension: usize,
353 quantization: Quantization,
354 ) -> SearchResult<VectorIndexWriter> {
355 if dimension == 0 {
356 return Err(SearchError::InvalidConfig {
357 field: "dimension".to_owned(),
358 value: "0".to_owned(),
359 reason: "dimension must be greater than zero".to_owned(),
360 });
361 }
362 validate_header_string(embedder_id, "embedder_id")?;
363 validate_header_string(embedder_revision, "embedder_revision")?;
364 let _ = u32::try_from(dimension).map_err(|_| SearchError::InvalidConfig {
365 field: "dimension".to_owned(),
366 value: dimension.to_string(),
367 reason: "dimension must fit in u32 for FSVI header encoding".to_owned(),
368 })?;
369
370 Ok(VectorIndexWriter {
371 path: path.to_path_buf(),
372 embedder_id: embedder_id.to_owned(),
373 embedder_revision: embedder_revision.to_owned(),
374 dimension,
375 quantization,
376 compaction_gen: 1,
377 records: Vec::new(),
378 })
379 }
380
381 #[must_use]
383 pub const fn record_count(&self) -> usize {
384 self.metadata.record_count
385 }
386
387 #[must_use]
389 pub const fn dimension(&self) -> usize {
390 self.metadata.dimension
391 }
392
393 #[must_use]
395 pub fn embedder_id(&self) -> &str {
396 &self.metadata.embedder_id
397 }
398
399 #[must_use]
401 pub fn embedder_revision(&self) -> &str {
402 &self.metadata.embedder_revision
403 }
404
405 #[must_use]
407 pub const fn quantization(&self) -> Quantization {
408 self.metadata.quantization
409 }
410
411 #[must_use]
413 pub const fn metadata(&self) -> &VectorMetadata {
414 &self.metadata
415 }
416
    /// Replaces the WAL configuration used by this handle.
    ///
    /// `open` always starts from `WalConfig::default()`, so callers wanting
    /// different thresholds or fsync behaviour set them here after opening.
    pub const fn set_wal_config(&mut self, config: WalConfig) {
        self.wal_config = config;
    }
423
424 #[must_use]
426 pub const fn wal_record_count(&self) -> usize {
427 self.wal_entries.len()
428 }
429
430 #[must_use]
435 pub fn needs_compaction(&self) -> bool {
436 if self.wal_entries.is_empty() {
437 return false;
438 }
439 if self.wal_entries.len() >= self.wal_config.compaction_threshold {
440 return true;
441 }
442 if self.record_count() > 0 {
443 #[allow(clippy::cast_precision_loss)]
444 let ratio = self.wal_entries.len() as f64 / self.record_count() as f64;
445 let threshold = if self.wal_config.compaction_ratio.is_finite() {
448 self.wal_config.compaction_ratio
449 } else {
450 0.10
451 };
452 if ratio >= threshold {
453 return true;
454 }
455 }
456 false
457 }
458
459 pub fn soft_delete(&mut self, doc_id: &str) -> SearchResult<bool> {
469 self.soft_delete_batch(&[doc_id]).map(|count| count > 0)
470 }
471
    /// Soft-deletes every document id in `doc_ids` from both the main index and the WAL.
    ///
    /// Main-index records are tombstoned in place by setting `RECORD_FLAG_TOMBSTONE`;
    /// matching WAL entries are dropped and the WAL sidecar is rewritten.
    ///
    /// Returns the number of main records tombstoned plus the number of WAL entries
    /// removed.
    ///
    /// # Errors
    /// Propagates lookup/flag-write errors from the main index and the error of
    /// `rewrite_wal_sidecar`. When the sidecar rewrite fails, the in-memory WAL is
    /// restored and the tombstones applied by this call are reverted (best effort,
    /// failures are logged). Errors raised while scanning the main index return
    /// immediately via `?`, before the rollback path is reached.
    pub fn soft_delete_batch(&mut self, doc_ids: &[&str]) -> SearchResult<usize> {
        let mut deleted = 0usize;
        let mut wal_changed = false;

        // (record index, original flags) for every record tombstoned here, so the
        // change can be undone if the WAL rewrite fails.
        let mut modified_main_entries = Vec::new();

        // Set of ids for O(1) membership tests when filtering the WAL below.
        let mut to_delete_set = std::collections::HashSet::with_capacity(doc_ids.len());
        for &id in doc_ids {
            to_delete_set.insert(id);
        }

        for &doc_id in doc_ids {
            let doc_id_hash = fnv1a_hash(doc_id.as_bytes());
            if let Some(mut index) = self.find_first_hash_match(doc_id_hash)? {
                // The binary search may land anywhere in a run of equal hashes;
                // rewind to the first record of that run.
                while index > 0 {
                    let prev = self.record_at(index - 1)?;
                    if prev.doc_id_hash != doc_id_hash {
                        break;
                    }
                    index -= 1;
                }

                // Scan the run and tombstone every live record whose id matches exactly
                // (hash equality alone is not enough).
                for candidate in index..self.record_count() {
                    let entry = self.record_at(candidate)?;
                    if entry.doc_id_hash != doc_id_hash {
                        break;
                    }
                    if !is_tombstoned_flags(entry.flags) {
                        let candidate_doc_id = self.doc_id_at(candidate)?;
                        if candidate_doc_id == doc_id {
                            let flags = entry.flags | RECORD_FLAG_TOMBSTONE;
                            self.set_record_flags(candidate, flags)?;
                            modified_main_entries.push((candidate, entry.flags));
                            deleted += 1;
                        }
                    }
                }
            }
        }

        let original_wal_len = self.wal_entries.len();
        // WAL entries that survive the delete.
        let filtered: Vec<wal::WalEntry> = self
            .wal_entries
            .iter()
            .filter(|entry| !to_delete_set.contains(entry.doc_id.as_str()))
            .cloned()
            .collect();

        // Swap in the filtered WAL only if something was actually removed, keeping
        // the previous contents around for rollback.
        let prev_wal = if filtered.len() < original_wal_len {
            deleted += original_wal_len - filtered.len();
            wal_changed = true;
            std::mem::replace(&mut self.wal_entries, filtered)
        } else {
            Vec::new()
        };

        if wal_changed {
            if let Err(err) = self.rewrite_wal_sidecar() {
                // Persisting the new WAL failed: restore the in-memory WAL and undo
                // the tombstones so memory and disk stay consistent.
                self.wal_entries = prev_wal;
                for (candidate, original_flags) in modified_main_entries {
                    if let Err(rollback_err) = self.set_record_flags(candidate, original_flags) {
                        tracing::error!(
                            error = %rollback_err,
                            candidate,
                            "failed to rollback main index flag during soft_delete_batch failure"
                        );
                    }
                }
                tracing::error!(
                    error = %err,
                    "failed to rewrite WAL sidecar during batch delete"
                );
                return Err(err);
            }
        }

        Ok(deleted)
    }
563
564 #[must_use]
566 pub fn is_deleted(&self, record_index: usize) -> bool {
567 matches!(
568 self.record_at(record_index),
569 Ok(entry) if is_tombstoned_flags(entry.flags)
570 )
571 }
572
573 #[must_use]
575 pub fn tombstone_count(&self) -> usize {
576 (0..self.record_count())
577 .filter(|&index| self.is_deleted(index))
578 .count()
579 }
580
581 #[must_use]
583 #[allow(clippy::cast_precision_loss)]
584 pub fn tombstone_ratio(&self) -> f64 {
585 if self.record_count() == 0 {
586 return 0.0;
587 }
588 self.tombstone_count() as f64 / self.record_count() as f64
589 }
590
591 #[must_use]
593 pub fn needs_vacuum(&self) -> bool {
594 self.tombstone_ratio() > TOMBSTONE_VACUUM_THRESHOLD
595 }
596
597 pub fn vacuum(&mut self) -> SearchResult<VacuumStats> {
606 let start = Instant::now();
607 let records_before = self.record_count();
608 let bytes_before = self.data.len();
609 let tombstones_before = self.tombstone_count();
610
611 if records_before == 0 || tombstones_before == 0 {
612 return Ok(VacuumStats {
613 records_before,
614 records_after: records_before,
615 tombstones_removed: 0,
616 bytes_reclaimed: 0,
617 duration: start.elapsed(),
618 });
619 }
620
621 let mut sources = Vec::with_capacity(records_before - tombstones_before);
623 for index in 0..records_before {
624 if !self.is_deleted(index) {
625 sources.push(MergeSource::Main(index));
626 }
627 }
628
629 self.rewrite_index(&sources, self.metadata.compaction_gen)?;
630
631 let records_after = self.record_count();
632 let bytes_reclaimed = bytes_before.saturating_sub(self.data.len());
633 Ok(VacuumStats {
634 records_before,
635 records_after,
636 tombstones_removed: records_before.saturating_sub(records_after),
637 bytes_reclaimed,
638 duration: start.elapsed(),
639 })
640 }
641
642 pub fn append(&mut self, doc_id: &str, vector: &[f32]) -> SearchResult<()> {
652 self.append_batch(&[(doc_id.to_owned(), vector.to_vec())])
653 }
654
655 pub fn append_batch(&mut self, entries: &[(String, Vec<f32>)]) -> SearchResult<()> {
666 if entries.is_empty() {
667 return Ok(());
668 }
669
670 for (doc_id, vector) in entries {
672 if vector.len() != self.dimension() {
673 return Err(SearchError::DimensionMismatch {
674 expected: self.dimension(),
675 found: vector.len(),
676 });
677 }
678 if vector.iter().any(|v| !v.is_finite()) {
679 return Err(SearchError::InvalidConfig {
680 field: "embedding".to_owned(),
681 value: "<contains non-finite values>".to_owned(),
682 reason: "all embedding values must be finite".to_owned(),
683 });
684 }
685 let _ = u16::try_from(doc_id.len()).map_err(|_| SearchError::InvalidConfig {
686 field: "doc_id".to_owned(),
687 value: doc_id.clone(),
688 reason: "doc_id byte length must fit in u16".to_owned(),
689 })?;
690 }
691
692 let doc_ids: Vec<&str> = entries.iter().map(|(id, _)| id.as_str()).collect();
695 self.soft_delete_batch(&doc_ids)?;
696
697 let mut wal_entries: Vec<wal::WalEntry> = Vec::with_capacity(entries.len());
698 let mut seen = std::collections::HashSet::new();
699 for (doc_id, embedding) in entries.iter().rev() {
700 if seen.insert(doc_id) {
701 wal_entries.push(wal::WalEntry {
702 doc_id: doc_id.clone(),
703 doc_id_hash: fnv1a_hash(doc_id.as_bytes()),
704 embedding: embedding.clone(),
705 });
706 }
707 }
708 wal_entries.reverse();
709
710 let wal_path = wal::wal_path_for(&self.path);
712 wal::append_wal_batch(
713 &wal_path,
714 &wal_entries,
715 self.dimension(),
716 self.quantization(),
717 next_generation(self.metadata.compaction_gen),
718 self.wal_config.fsync_on_write,
719 )?;
720
721 for new_entry in &wal_entries {
723 self.wal_entries
724 .retain(|existing| existing.doc_id != new_entry.doc_id);
725 }
726 self.wal_entries.extend(wal_entries.clone());
728
729 for entry in &wal_entries {
732 let hash = entry.doc_id_hash;
733 if let Ok(Some(mut index)) = self.find_first_hash_match(hash) {
734 while index > 0 {
735 if let Ok(prev) = self.record_at(index - 1) {
736 if prev.doc_id_hash != hash {
737 break;
738 }
739 index -= 1;
740 } else {
741 break;
742 }
743 }
744 for candidate in index..self.record_count() {
745 if let Ok(rec) = self.record_at(candidate) {
746 if rec.doc_id_hash != hash {
747 break;
748 }
749 if !is_tombstoned_flags(rec.flags) {
750 if let Ok(candidate_doc_id) = self.doc_id_at(candidate) {
751 if candidate_doc_id == entry.doc_id {
752 let flags = rec.flags | RECORD_FLAG_TOMBSTONE;
753 if let Err(err) = self.set_record_flags(candidate, flags) {
754 tracing::warn!(
755 target: "frankensearch.index",
756 path = %self.path.display(),
757 candidate_index = candidate,
758 doc_id = %entry.doc_id,
759 error = %err,
760 "WAL replay: failed to tombstone superseded record; \
761 duplicate may persist until next compaction"
762 );
763 }
764 break;
765 }
766 }
767 }
768 } else {
769 break;
770 }
771 }
772 }
773 }
774
775 debug!(
776 target: "frankensearch.index",
777 path = %self.path.display(),
778 batch_size = entries.len(),
779 wal_total = self.wal_entries.len(),
780 "appended to WAL"
781 );
782 Ok(())
783 }
784
    /// Merges the WAL into the main index file and clears the WAL.
    ///
    /// Live main records and all WAL entries are combined, sorted by
    /// `(doc_id_hash, doc_id)` and deduplicated so that a WAL entry replaces a main
    /// record with the same id. The result is written with the next compaction
    /// generation. When the WAL is empty the call returns immediately with
    /// `elapsed_ms` of 0.
    ///
    /// # Errors
    /// Propagates record/doc-id read errors and every error of `rewrite_index`.
    /// A failure to remove the WAL file afterwards is only logged.
    #[allow(clippy::cast_precision_loss)]
    pub fn compact(&mut self) -> SearchResult<CompactionStats> {
        let start = Instant::now();
        let main_before = self.record_count();
        let wal_count = self.wal_entries.len();

        if wal_count == 0 {
            return Ok(CompactionStats {
                main_records_before: main_before,
                wal_records: 0,
                total_records_after: main_before,
                elapsed_ms: 0.0,
            });
        }

        // Built inside an immediately-invoked closure so the borrows of `self`
        // (doc-id string slices) end before `rewrite_index` needs `&mut self`.
        let deduped_sources = (|| -> SearchResult<Vec<MergeSource>> {
            // Ordering key: hash first, then the id itself to separate hash collisions.
            #[derive(Clone, Copy)]
            struct SortKey<'a> {
                doc_id_hash: u64,
                doc_id: &'a str,
            }

            // A merge source paired with the key it sorts under.
            #[derive(Clone, Copy)]
            struct KeyedSource<'a> {
                key: SortKey<'a>,
                source: MergeSource,
            }

            let mut keyed_sources = Vec::with_capacity(main_before + wal_count);
            // Main records go in first; tombstoned ones are dropped here.
            for i in 0..main_before {
                if !self.is_deleted(i) {
                    let entry = self.record_at(i)?;
                    let doc_id = self.doc_id_at(i)?;
                    keyed_sources.push(KeyedSource {
                        key: SortKey {
                            doc_id_hash: entry.doc_id_hash,
                            doc_id,
                        },
                        source: MergeSource::Main(i),
                    });
                }
            }
            for (idx, entry) in self.wal_entries.iter().enumerate() {
                keyed_sources.push(KeyedSource {
                    key: SortKey {
                        doc_id_hash: entry.doc_id_hash,
                        doc_id: &entry.doc_id,
                    },
                    source: MergeSource::Wal(idx),
                });
            }

            // `sort_by` is stable, so among equal keys the insertion order above is
            // preserved: main record first, WAL entries after it.
            keyed_sources.sort_by(|a, b| {
                a.key
                    .doc_id_hash
                    .cmp(&b.key.doc_id_hash)
                    .then(a.key.doc_id.cmp(b.key.doc_id))
            });

            let mut deduped: Vec<KeyedSource<'_>> = Vec::with_capacity(keyed_sources.len());
            // Collapse runs of equal keys, keeping the last item of each run — which,
            // given the stable sort, is the most recently pushed (WAL) version.
            for item in keyed_sources {
                if let Some(last) = deduped.last_mut() {
                    if item.key.doc_id_hash == last.key.doc_id_hash
                        && item.key.doc_id == last.key.doc_id
                    {
                        *last = item;
                        continue;
                    }
                }
                deduped.push(item);
            }

            Ok(deduped
                .into_iter()
                .map(|item| item.source)
                .collect::<Vec<_>>())
        })()?;

        // Write the merged index under the next generation.
        self.rewrite_index(
            &deduped_sources,
            next_generation(self.metadata.compaction_gen),
        )?;

        self.wal_entries.clear();

        let wal_path = wal::wal_path_for(&self.path);
        // Best effort: a failed removal is logged, not returned.
        if let Err(e) = wal::remove_wal(&wal_path) {
            tracing::warn!("failed to remove WAL file after compaction: {e}");
        }

        let elapsed = start.elapsed();
        let stats = CompactionStats {
            main_records_before: main_before,
            wal_records: wal_count,
            total_records_after: self.record_count(),
            elapsed_ms: elapsed.as_secs_f64() * 1000.0,
        };

        debug!(
            target: "frankensearch.index",
            path = %self.path.display(),
            main_before,
            wal_count,
            total_after = stats.total_records_after,
            elapsed_ms = format_args!("{:.1}", stats.elapsed_ms),
            "compaction complete"
        );
        Ok(stats)
    }
916
917 fn resolve_sort_key<'a>(&'a self, source: &MergeSource) -> SearchResult<(u64, &'a str)> {
918 match source {
919 MergeSource::Main(idx) => {
920 let entry = self.record_at(*idx)?;
921 let id = self.doc_id_at(*idx)?;
922 Ok((entry.doc_id_hash, id))
923 }
924 MergeSource::Wal(idx) => {
925 let entry = &self.wal_entries[*idx];
926 Ok((entry.doc_id_hash, &entry.doc_id))
927 }
928 }
929 }
930
931 #[allow(clippy::too_many_lines)]
932 fn rewrite_index(&mut self, sources: &[MergeSource], new_gen: u8) -> SearchResult<()> {
933 let record_count = sources.len();
934 let records_bytes = record_count.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
935 SearchError::InvalidConfig {
936 field: "record_count".to_owned(),
937 value: record_count.to_string(),
938 reason: "record table size overflow".to_owned(),
939 }
940 })?;
941 let records_bytes_u64 =
942 u64::try_from(records_bytes).map_err(|_| SearchError::InvalidConfig {
943 field: "record_count".to_owned(),
944 value: record_count.to_string(),
945 reason: "record table size does not fit in u64".to_owned(),
946 })?;
947
948 let mut record_table = Vec::with_capacity(records_bytes);
952 let mut current_string_offset = 0u32;
953 let mut string_table_len = 0u64;
954
955 for source in sources {
956 let (doc_id_hash, doc_id) = self.resolve_sort_key(source)?;
957 let doc_id_len = doc_id.len();
958
959 let len_u16 = u16::try_from(doc_id_len).map_err(|_| SearchError::InvalidConfig {
961 field: "doc_id_len".to_owned(),
962 value: doc_id_len.to_string(),
963 reason: "doc_id length exceeds u16".to_owned(),
964 })?;
965 let len_u32 = u32::from(len_u16);
966 let len_u64 = u64::from(len_u16);
967 if current_string_offset.checked_add(len_u32).is_none() {
968 return Err(SearchError::InvalidConfig {
969 field: "doc_id_offset".to_owned(),
970 value: "overflow".to_owned(),
971 reason: "string table offset exceeds u32".to_owned(),
972 });
973 }
974
975 record_table.extend_from_slice(&doc_id_hash.to_le_bytes());
977 record_table.extend_from_slice(¤t_string_offset.to_le_bytes());
978 record_table.extend_from_slice(&len_u16.to_le_bytes());
979 record_table.extend_from_slice(&0u16.to_le_bytes()); current_string_offset += len_u32;
982 string_table_len += len_u64;
983 }
984
985 let provisional_header = build_header_prefix(
987 &self.metadata.embedder_id,
988 &self.metadata.embedder_revision,
989 self.dimension(),
990 self.quantization(),
991 new_gen,
992 record_count,
993 0,
994 )?;
995 let header_len = provisional_header.len() + 4; let header_len_u64 = u64::try_from(header_len).map_err(|_| SearchError::InvalidConfig {
997 field: "header".to_owned(),
998 value: header_len.to_string(),
999 reason: "header length does not fit in u64".to_owned(),
1000 })?;
1001
1002 let pre_vector = header_len_u64
1003 .checked_add(records_bytes_u64)
1004 .and_then(|v| v.checked_add(string_table_len))
1005 .ok_or_else(|| SearchError::InvalidConfig {
1006 field: "layout".to_owned(),
1007 value: "overflow".to_owned(),
1008 reason: "layout offset overflow".to_owned(),
1009 })?;
1010
1011 let vectors_offset = align_up(pre_vector, VECTOR_ALIGN_BYTES)?;
1012 let padding_len = usize::try_from(vectors_offset - pre_vector).map_err(|_| {
1013 SearchError::InvalidConfig {
1014 field: "padding_len".to_owned(),
1015 value: (vectors_offset - pre_vector).to_string(),
1016 reason: "padding length exceeds usize".to_owned(),
1017 }
1018 })?;
1019
1020 let tmp_path = temporary_output_path(&self.path);
1022
1023 let result = (|| -> SearchResult<()> {
1026 let mut file = OpenOptions::new()
1027 .create(true)
1028 .truncate(true)
1029 .write(true)
1030 .open(&tmp_path)?;
1031 {
1032 let mut writer = BufWriter::with_capacity(256 * 1024, &mut file);
1033
1034 let mut header_prefix = build_header_prefix(
1036 &self.metadata.embedder_id,
1037 &self.metadata.embedder_revision,
1038 self.dimension(),
1039 self.quantization(),
1040 new_gen,
1041 record_count,
1042 vectors_offset,
1043 )?;
1044 let header_crc = crc32(&header_prefix);
1045 header_prefix.extend_from_slice(&header_crc.to_le_bytes());
1046
1047 writer.write_all(&header_prefix)?;
1048 writer.write_all(&record_table)?;
1049
1050 for source in sources {
1052 let (_, doc_id) = self.resolve_sort_key(source)?;
1053 writer.write_all(doc_id.as_bytes())?;
1054 }
1055
1056 if padding_len > 0 {
1058 writer.write_all(&vec![0u8; padding_len])?;
1059 }
1060
1061 match self.quantization() {
1063 Quantization::F16 => {
1064 for source in sources {
1065 match source {
1066 MergeSource::Main(idx) => {
1067 let start = self.vector_start(*idx)?;
1069 let len = self.dimension() * 2;
1070 let bytes = &self.data[start..start + len];
1071 writer.write_all(bytes)?;
1072 }
1073 MergeSource::Wal(idx) => {
1074 let entry = &self.wal_entries[*idx];
1076 for &val in &entry.embedding {
1077 writer.write_all(&f16::from_f32(val).to_le_bytes())?;
1078 }
1079 }
1080 }
1081 }
1082 }
1083 Quantization::F32 => {
1084 for source in sources {
1085 match source {
1086 MergeSource::Main(idx) => {
1087 let start = self.vector_start(*idx)?;
1089 let len = self.dimension() * 4;
1090 let bytes = &self.data[start..start + len];
1091 writer.write_all(bytes)?;
1092 }
1093 MergeSource::Wal(idx) => {
1094 let entry = &self.wal_entries[*idx];
1096 for &val in &entry.embedding {
1097 writer.write_all(&val.to_le_bytes())?;
1098 }
1099 }
1100 }
1101 }
1102 }
1103 }
1104 writer.flush()?;
1105 }
1106
1107 file.sync_all()?;
1108 fs::rename(&tmp_path, &self.path)?;
1109 sync_parent_directory(&self.path)?;
1110 Ok(())
1111 })();
1112
1113 if result.is_err() {
1114 if tmp_path.exists() {
1116 if let Err(cleanup_err) = fs::remove_file(&tmp_path) {
1117 tracing::warn!(
1118 "failed to clean up temp file {} after rewrite error: {cleanup_err}",
1119 tmp_path.display()
1120 );
1121 }
1122 }
1123 }
1124 result?;
1125
1126 let config = self.wal_config.clone();
1128 let reloaded = Self::open(&self.path)?;
1129 self.data = reloaded.data;
1130 self.metadata = reloaded.metadata;
1131 self.records_offset = reloaded.records_offset;
1132 self.strings_offset = reloaded.strings_offset;
1133 self.vectors_offset = reloaded.vectors_offset;
1134 self.wal_entries = reloaded.wal_entries;
1143 self.wal_config = config;
1144
1145 Ok(())
1146 }
1147
1148 pub fn doc_id_at(&self, index: usize) -> SearchResult<&str> {
1155 self.ensure_index(index)?;
1156 let entry = self.record_at(index)?;
1157 let doc_id_offset = usize::try_from(entry.doc_id_offset).map_err(|_| {
1158 index_corrupted(
1159 &self.path,
1160 format!("doc_id_offset overflow for record at index {index}"),
1161 )
1162 })?;
1163 let doc_id_len = usize::from(entry.doc_id_len);
1164 let start = self
1165 .strings_offset
1166 .checked_add(doc_id_offset)
1167 .ok_or_else(|| index_corrupted(&self.path, "doc_id start offset overflow"))?;
1168 let end = start
1169 .checked_add(doc_id_len)
1170 .ok_or_else(|| index_corrupted(&self.path, "doc_id end offset overflow"))?;
1171 if end > self.vectors_offset {
1172 return Err(index_corrupted(
1173 &self.path,
1174 format!(
1175 "doc_id range [{start}, {end}) exceeds string table end {}",
1176 self.vectors_offset
1177 ),
1178 ));
1179 }
1180 std::str::from_utf8(&self.data[start..end]).map_err(|error| {
1181 index_corrupted(
1182 &self.path,
1183 format!("invalid UTF-8 in doc_id at index {index}: {error}"),
1184 )
1185 })
1186 }
1187
1188 pub fn vector_at_f32(&self, index: usize) -> SearchResult<Vec<f32>> {
1195 self.ensure_index(index)?;
1196 let start = self.vector_start(index)?;
1197 let dim = self.dimension();
1198 match self.quantization() {
1199 Quantization::F32 => {
1200 let byte_len = dim.checked_mul(4).ok_or_else(|| {
1201 index_corrupted(&self.path, "f32 vector byte length overflow")
1202 })?;
1203 let end = start
1204 .checked_add(byte_len)
1205 .ok_or_else(|| index_corrupted(&self.path, "f32 vector end overflow"))?;
1206 if end > self.data.len() {
1207 return Err(index_corrupted(
1208 &self.path,
1209 "f32 vector extends past file end",
1210 ));
1211 }
1212 let mut out = Vec::with_capacity(dim);
1213 for chunk in self.data[start..end].chunks_exact(4) {
1214 out.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
1215 }
1216 Ok(out)
1217 }
1218 Quantization::F16 => {
1219 let byte_len = dim.checked_mul(2).ok_or_else(|| {
1220 index_corrupted(&self.path, "f16 vector byte length overflow")
1221 })?;
1222 let end = start
1223 .checked_add(byte_len)
1224 .ok_or_else(|| index_corrupted(&self.path, "f16 vector end overflow"))?;
1225 if end > self.data.len() {
1226 return Err(index_corrupted(
1227 &self.path,
1228 "f16 vector extends past file end",
1229 ));
1230 }
1231 let mut out = Vec::with_capacity(dim);
1232 for chunk in self.data[start..end].chunks_exact(2) {
1233 out.push(f16::from_le_bytes([chunk[0], chunk[1]]).to_f32());
1234 }
1235 Ok(out)
1236 }
1237 }
1238 }
1239
1240 pub fn vector_at_f16(&self, index: usize) -> SearchResult<Vec<f16>> {
1247 self.ensure_index(index)?;
1248 let start = self.vector_start(index)?;
1249 let dim = self.dimension();
1250 match self.quantization() {
1251 Quantization::F16 => {
1252 let byte_len = dim.checked_mul(2).ok_or_else(|| {
1253 index_corrupted(&self.path, "f16 vector byte length overflow")
1254 })?;
1255 let end = start
1256 .checked_add(byte_len)
1257 .ok_or_else(|| index_corrupted(&self.path, "f16 vector end overflow"))?;
1258 if end > self.data.len() {
1259 return Err(index_corrupted(
1260 &self.path,
1261 "f16 vector extends past file end",
1262 ));
1263 }
1264 let mut out = Vec::with_capacity(dim);
1265 for chunk in self.data[start..end].chunks_exact(2) {
1266 out.push(f16::from_le_bytes([chunk[0], chunk[1]]));
1267 }
1268 Ok(out)
1269 }
1270 Quantization::F32 => {
1271 let byte_len = dim.checked_mul(4).ok_or_else(|| {
1272 index_corrupted(&self.path, "f32 vector byte length overflow")
1273 })?;
1274 let end = start
1275 .checked_add(byte_len)
1276 .ok_or_else(|| index_corrupted(&self.path, "f32 vector end overflow"))?;
1277 if end > self.data.len() {
1278 return Err(index_corrupted(
1279 &self.path,
1280 "f32 vector extends past file end",
1281 ));
1282 }
1283 let mut out = Vec::with_capacity(dim);
1284 for chunk in self.data[start..end].chunks_exact(4) {
1285 out.push(f16::from_f32(f32::from_le_bytes([
1286 chunk[0], chunk[1], chunk[2], chunk[3],
1287 ])));
1288 }
1289 Ok(out)
1290 }
1291 }
1292 }
1293
1294 #[must_use]
1296 pub fn find_index_by_doc_hash(&self, doc_id_hash: u64) -> Option<usize> {
1297 let mut low = 0usize;
1298 let mut high = self.record_count();
1299 while low < high {
1300 let mid = low + (high - low) / 2;
1301 let entry = self.record_at(mid).ok()?;
1302 match entry.doc_id_hash.cmp(&doc_id_hash) {
1303 std::cmp::Ordering::Less => low = mid + 1,
1304 std::cmp::Ordering::Greater => high = mid,
1305 std::cmp::Ordering::Equal => {
1306 let mut first = mid;
1307 while first > 0 {
1308 let prev = self.record_at(first - 1).ok()?;
1309 if prev.doc_id_hash != doc_id_hash {
1310 break;
1311 }
1312 first -= 1;
1313 }
1314 for index in first..self.record_count() {
1315 let entry = self.record_at(index).ok()?;
1316 if entry.doc_id_hash != doc_id_hash {
1317 break;
1318 }
1319 if !is_tombstoned_flags(entry.flags) {
1320 return Some(index);
1321 }
1322 }
1323 return None;
1324 }
1325 }
1326 }
1327 None
1328 }
1329
1330 #[must_use]
1334 pub fn get_embeddings(&self, doc_id_hashes: &[u64]) -> Vec<Option<Vec<f16>>> {
1335 doc_id_hashes
1336 .iter()
1337 .map(|&hash| {
1338 for entry in self.wal_entries.iter().rev() {
1339 if entry.doc_id_hash == hash {
1340 return Some(
1342 entry
1343 .embedding
1344 .iter()
1345 .map(|&v| half::f16::from_f32(v))
1346 .collect(),
1347 );
1348 }
1349 }
1350 if let Some(index) = self.find_index_by_doc_hash(hash) {
1351 if let Ok(vec) = self.vector_at_f16(index) {
1352 return Some(vec);
1353 }
1354 }
1355 None
1356 })
1357 .collect()
1358 }
1359
1360 fn ensure_index(&self, index: usize) -> SearchResult<()> {
1361 if index >= self.record_count() {
1362 return Err(SearchError::InvalidConfig {
1363 field: "index".to_owned(),
1364 value: index.to_string(),
1365 reason: format!(
1366 "index out of range for record_count={}",
1367 self.record_count()
1368 ),
1369 });
1370 }
1371 Ok(())
1372 }
1373
1374 pub(crate) fn find_index_by_doc_id(&self, doc_id: &str) -> SearchResult<Option<usize>> {
1375 let doc_id_hash = fnv1a_hash(doc_id.as_bytes());
1376 let Some(mut index) = self.find_first_hash_match(doc_id_hash)? else {
1377 return Ok(None);
1378 };
1379 while index > 0 {
1380 let prev = self.record_at(index - 1)?;
1381 if prev.doc_id_hash != doc_id_hash {
1382 break;
1383 }
1384 index -= 1;
1385 }
1386
1387 for candidate in index..self.record_count() {
1388 let entry = self.record_at(candidate)?;
1389 if entry.doc_id_hash != doc_id_hash {
1390 break;
1391 }
1392 if !is_tombstoned_flags(entry.flags) {
1393 let candidate_doc_id = self.doc_id_at(candidate)?;
1394 if candidate_doc_id == doc_id {
1395 return Ok(Some(candidate));
1396 }
1397 }
1398 }
1399 Ok(None)
1400 }
1401
1402 fn find_first_hash_match(&self, doc_id_hash: u64) -> SearchResult<Option<usize>> {
1403 let mut low = 0usize;
1404 let mut high = self.record_count();
1405 while low < high {
1406 let mid = low + (high - low) / 2;
1407 let entry = self.record_at(mid)?;
1408 match entry.doc_id_hash.cmp(&doc_id_hash) {
1409 std::cmp::Ordering::Less => low = mid + 1,
1410 std::cmp::Ordering::Greater => high = mid,
1411 std::cmp::Ordering::Equal => return Ok(Some(mid)),
1412 }
1413 }
1414 Ok(None)
1415 }
1416
1417 fn record_flags_offset(&self, index: usize) -> SearchResult<usize> {
1418 self.ensure_index(index)?;
1419 let record_offset = self
1420 .records_offset
1421 .checked_add(index.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
1422 index_corrupted(&self.path, "record offset multiplication overflow")
1423 })?)
1424 .ok_or_else(|| index_corrupted(&self.path, "record offset overflow"))?;
1425 record_offset
1426 .checked_add(14)
1427 .ok_or_else(|| index_corrupted(&self.path, "flags offset overflow"))
1428 }
1429
1430 fn set_record_flags(&mut self, index: usize, flags: u16) -> SearchResult<()> {
1431 let flags_offset = self.record_flags_offset(index)?;
1432 let end = flags_offset
1433 .checked_add(2)
1434 .ok_or_else(|| index_corrupted(&self.path, "flags end overflow"))?;
1435 if end > self.data.len() {
1436 return Err(index_corrupted(
1437 &self.path,
1438 "flags offset points beyond mapped data",
1439 ));
1440 }
1441
1442 let flag_bytes = flags.to_le_bytes();
1443 self.data[flags_offset..end].copy_from_slice(&flag_bytes);
1444 self.data
1445 .flush_range(flags_offset, 2)
1446 .map_err(SearchError::Io)?;
1447 Ok(())
1448 }
1449
1450 fn rewrite_wal_sidecar(&self) -> SearchResult<()> {
1451 let wal_path = wal::wal_path_for(&self.path);
1452 if self.wal_entries.is_empty() {
1453 wal::remove_wal(&wal_path)?;
1454 return Ok(());
1455 }
1456
1457 let mut tmp = wal_path.as_os_str().to_os_string();
1458 tmp.push(".tmp");
1459 let tmp_path = PathBuf::from(tmp);
1460 let _ = wal::remove_wal(&tmp_path);
1461
1462 if let Err(e) = wal::append_wal_batch(
1463 &tmp_path,
1464 &self.wal_entries,
1465 self.dimension(),
1466 self.quantization(),
1467 next_generation(self.metadata.compaction_gen),
1468 self.wal_config.fsync_on_write,
1469 ) {
1470 let _ = fs::remove_file(&tmp_path);
1471 return Err(e);
1472 }
1473
1474 match fs::rename(&tmp_path, &wal_path) {
1475 Ok(()) => Ok(()),
1476 Err(error) if error.kind() == std::io::ErrorKind::AlreadyExists => {
1477 wal::remove_wal(&wal_path)?;
1478 fs::rename(&tmp_path, &wal_path)?;
1479 Ok(())
1480 }
1481 Err(error) => {
1482 let _ = wal::remove_wal(&tmp_path);
1483 Err(error.into())
1484 }
1485 }
1486 }
1487
1488 pub(crate) fn record_at(&self, index: usize) -> SearchResult<RecordEntry> {
1489 self.ensure_index(index)?;
1490 let offset = self
1491 .records_offset
1492 .checked_add(index.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
1493 index_corrupted(&self.path, "record offset multiplication overflow")
1494 })?)
1495 .ok_or_else(|| index_corrupted(&self.path, "record offset overflow"))?;
1496 let end = offset
1497 .checked_add(RECORD_SIZE_BYTES)
1498 .ok_or_else(|| index_corrupted(&self.path, "record end overflow"))?;
1499 if end > self.data.len() {
1500 return Err(index_corrupted(
1501 &self.path,
1502 "record table extends beyond file size",
1503 ));
1504 }
1505 let chunk = &self.data[offset..end];
1506 Ok(RecordEntry {
1507 doc_id_hash: u64::from_le_bytes([
1508 chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6], chunk[7],
1509 ]),
1510 doc_id_offset: u32::from_le_bytes([chunk[8], chunk[9], chunk[10], chunk[11]]),
1511 doc_id_len: u16::from_le_bytes([chunk[12], chunk[13]]),
1512 flags: u16::from_le_bytes([chunk[14], chunk[15]]),
1513 })
1514 }
1515
1516 fn vector_start(&self, index: usize) -> SearchResult<usize> {
1517 let stride = self
1518 .dimension()
1519 .checked_mul(self.quantization().bytes_per_element())
1520 .ok_or_else(|| index_corrupted(&self.path, "vector stride overflow"))?;
1521 self.vectors_offset
1522 .checked_add(
1523 index
1524 .checked_mul(stride)
1525 .ok_or_else(|| index_corrupted(&self.path, "vector index overflow"))?,
1526 )
1527 .ok_or_else(|| index_corrupted(&self.path, "vector offset overflow"))
1528 }
1529}
1530
/// A record buffered in memory by [`VectorIndexWriter`] until the index file
/// is written.
#[derive(Debug, Clone)]
struct PendingRecord {
    /// Document identifier; its bytes are stored in the file's string table.
    doc_id: String,
    /// Hash of the document id; `write_record` computes it with `fnv1a_hash`
    /// and `finish` uses it as the primary sort key.
    doc_id_hash: u64,
    /// Flag bits copied verbatim into the record entry (`write_record` sets 0).
    flags: u16,
    /// Embedding components, kept as `f32` until the vector slab is encoded.
    embedding: Vec<f32>,
}
1538
/// Identifies where a record comes from when two record sources are combined.
///
/// NOTE(review): no use of this enum is visible in this part of the file;
/// presumably the payload is a position within the respective source —
/// confirm against the merge/compaction code.
#[derive(Debug, Clone, Copy)]
enum MergeSource {
    /// Presumably a position in the main index's record table (TODO confirm).
    Main(usize),
    /// Presumably a position in the pending WAL entries (TODO confirm).
    Wal(usize),
}
1544
/// Accumulates records in memory and writes them out as a single FSVI index
/// file when [`VectorIndexWriter::finish`] is called.
#[derive(Debug)]
pub struct VectorIndexWriter {
    /// Final destination of the index file.
    path: PathBuf,
    /// Embedder identifier stored in the file header.
    embedder_id: String,
    /// Embedder revision stored in the file header.
    embedder_revision: String,
    /// Number of components every embedding must have.
    dimension: usize,
    /// On-disk element encoding used for the vector slab.
    quantization: Quantization,
    /// Generation byte stored in the first reserved header byte.
    compaction_gen: u8,
    /// Records accepted so far, in insertion order until `finish` sorts them.
    records: Vec<PendingRecord>,
}
1555
1556impl VectorIndexWriter {
1557 pub fn write_record(&mut self, doc_id: &str, embedding: &[f32]) -> SearchResult<()> {
1564 if embedding.len() != self.dimension {
1565 return Err(SearchError::DimensionMismatch {
1566 expected: self.dimension,
1567 found: embedding.len(),
1568 });
1569 }
1570 if embedding.iter().any(|value| !value.is_finite()) {
1571 return Err(SearchError::InvalidConfig {
1572 field: "embedding".to_owned(),
1573 value: "<contains non-finite values>".to_owned(),
1574 reason: "all embedding values must be finite".to_owned(),
1575 });
1576 }
1577 let _ = u16::try_from(doc_id.len()).map_err(|_| SearchError::InvalidConfig {
1578 field: "doc_id".to_owned(),
1579 value: doc_id.to_owned(),
1580 reason: "doc_id byte length must fit in u16".to_owned(),
1581 })?;
1582 self.records.push(PendingRecord {
1583 doc_id: doc_id.to_owned(),
1584 doc_id_hash: fnv1a_hash(doc_id.as_bytes()),
1585 flags: 0,
1586 embedding: embedding.to_vec(),
1587 });
1588 Ok(())
1589 }
1590
    /// Builder-style setter: returns the writer with its compaction
    /// generation replaced by `generation`.
    ///
    /// The value ends up in the first reserved byte of the header written by
    /// `finish`.
    // `dead_code` is allowed because no caller is guaranteed in every build
    // configuration (NOTE(review): confirm which configurations use it).
    #[allow(dead_code)]
    pub(crate) const fn with_generation(mut self, generation: u8) -> Self {
        self.compaction_gen = generation;
        self
    }
1596
    /// Sorts the buffered records and writes the complete index file.
    ///
    /// File layout, in order: header (with trailing CRC32), record table,
    /// string table of document ids, zero padding, vector slab. The vector
    /// slab starts at a multiple of `VECTOR_ALIGN_BYTES`.
    ///
    /// The data is first written to a temporary sibling path, flushed and
    /// synced, then renamed onto the destination, after which the parent
    /// directory is synced. If any of those steps fails, the temporary file
    /// is removed when it still exists.
    ///
    /// # Errors
    ///
    /// Returns `SearchError::InvalidConfig` when a size or offset does not fit
    /// its on-disk field or a layout computation overflows, and propagates
    /// I/O errors from creating, writing, syncing or renaming the file.
    #[allow(clippy::too_many_lines)]
    pub fn finish(mut self) -> SearchResult<()> {
        // Order by hash (then id) so readers can bisect the record table.
        // The sort is stable, so equal (hash, id) pairs keep insertion order.
        self.records.sort_by(|left, right| {
            left.doc_id_hash
                .cmp(&right.doc_id_hash)
                .then(left.doc_id.cmp(&right.doc_id))
        });

        let record_count = self.records.len();
        let records_bytes = record_count.checked_mul(RECORD_SIZE_BYTES).ok_or_else(|| {
            SearchError::InvalidConfig {
                field: "record_count".to_owned(),
                value: record_count.to_string(),
                reason: "record table size overflow".to_owned(),
            }
        })?;
        let records_bytes_u64 =
            u64::try_from(records_bytes).map_err(|_| SearchError::InvalidConfig {
                field: "record_count".to_owned(),
                value: record_count.to_string(),
                reason: "record table size does not fit in u64".to_owned(),
            })?;

        // Build the string table and the record entries that point into it.
        let mut string_table = Vec::<u8>::new();
        let mut record_entries = Vec::<RecordEntry>::with_capacity(record_count);
        for record in &self.records {
            // Offset of this id = current length of the string table.
            let offset_u32 =
                u32::try_from(string_table.len()).map_err(|_| SearchError::InvalidConfig {
                    field: "doc_id_offset".to_owned(),
                    value: string_table.len().to_string(),
                    reason: "string table offset exceeds u32".to_owned(),
                })?;
            let doc_id_bytes = record.doc_id.as_bytes();
            let len_u16 =
                u16::try_from(doc_id_bytes.len()).map_err(|_| SearchError::InvalidConfig {
                    field: "doc_id_len".to_owned(),
                    value: doc_id_bytes.len().to_string(),
                    reason: "doc_id length exceeds u16".to_owned(),
                })?;
            string_table.extend_from_slice(doc_id_bytes);
            record_entries.push(RecordEntry {
                doc_id_hash: record.doc_id_hash,
                doc_id_offset: offset_u32,
                doc_id_len: len_u16,
                flags: record.flags,
            });
        }

        let string_table_len_u64 =
            u64::try_from(string_table.len()).map_err(|_| SearchError::InvalidConfig {
                field: "string_table".to_owned(),
                value: string_table.len().to_string(),
                reason: "string table length does not fit in u64".to_owned(),
            })?;

        // First pass: build the header with a placeholder vectors offset just
        // to learn its length (the offset field is fixed width).
        let provisional_header = build_header_prefix(
            &self.embedder_id,
            &self.embedder_revision,
            self.dimension,
            self.quantization,
            self.compaction_gen,
            record_count,
            0,
        )?;
        // +4 accounts for the CRC32 appended after the header prefix.
        let header_len =
            provisional_header
                .len()
                .checked_add(4)
                .ok_or_else(|| SearchError::InvalidConfig {
                    field: "header".to_owned(),
                    value: provisional_header.len().to_string(),
                    reason: "header length overflow".to_owned(),
                })?;
        let header_len_u64 = u64::try_from(header_len).map_err(|_| SearchError::InvalidConfig {
            field: "header".to_owned(),
            value: header_len.to_string(),
            reason: "header length does not fit in u64".to_owned(),
        })?;
        // Bytes that precede the (aligned) vector slab.
        let pre_vector = header_len_u64
            .checked_add(records_bytes_u64)
            .and_then(|value| value.checked_add(string_table_len_u64))
            .ok_or_else(|| SearchError::InvalidConfig {
                field: "layout".to_owned(),
                value: format!("{header_len_u64}+{records_bytes_u64}+{string_table_len_u64}"),
                reason: "layout offset overflow".to_owned(),
            })?;
        let vectors_offset = align_up(pre_vector, VECTOR_ALIGN_BYTES)?;
        let padding_len_u64 =
            vectors_offset
                .checked_sub(pre_vector)
                .ok_or_else(|| SearchError::InvalidConfig {
                    field: "layout".to_owned(),
                    value: format!("{vectors_offset}-{pre_vector}"),
                    reason: "negative padding detected".to_owned(),
                })?;
        let padding_len =
            usize::try_from(padding_len_u64).map_err(|_| SearchError::InvalidConfig {
                field: "padding".to_owned(),
                value: padding_len_u64.to_string(),
                reason: "padding length does not fit in usize".to_owned(),
            })?;

        // Second pass: the real header, now carrying the final vectors offset,
        // followed by the CRC32 of everything before it.
        let mut header_prefix = build_header_prefix(
            &self.embedder_id,
            &self.embedder_revision,
            self.dimension,
            self.quantization,
            self.compaction_gen,
            record_count,
            vectors_offset,
        )?;
        let header_crc = crc32(&header_prefix);
        header_prefix.extend_from_slice(&header_crc.to_le_bytes());

        let tmp_path = temporary_output_path(&self.path);
        // All fallible I/O lives in this closure so cleanup can run once below.
        let result = (|| -> SearchResult<()> {
            let mut file = OpenOptions::new()
                .create(true)
                .truncate(true)
                .write(true)
                .open(&tmp_path)?;
            {
                // Scoped so the buffered writer releases `file` before syncing.
                let mut writer = BufWriter::with_capacity(256 * 1024, &mut file);

                writer.write_all(&header_prefix)?;
                for entry in &record_entries {
                    writer.write_all(&entry.doc_id_hash.to_le_bytes())?;
                    writer.write_all(&entry.doc_id_offset.to_le_bytes())?;
                    writer.write_all(&entry.doc_id_len.to_le_bytes())?;
                    writer.write_all(&entry.flags.to_le_bytes())?;
                }
                writer.write_all(&string_table)?;
                if padding_len > 0 {
                    writer.write_all(&vec![0_u8; padding_len])?;
                }
                write_vector_slab(&mut writer, &self.records, self.quantization)?;
                // Explicit flush so write errors are reported instead of being
                // swallowed when the writer is dropped.
                writer.flush()?;
            }

            // Sync the file contents before the rename publishes the file.
            file.sync_all()?;
            fs::rename(&tmp_path, &self.path)?;
            sync_parent_directory(&self.path)?;
            Ok(())
        })();

        // On failure, best-effort removal of the temp file (it no longer
        // exists if the rename already happened).
        if result.is_err() {
            if tmp_path.exists() {
                if let Err(cleanup_err) = fs::remove_file(&tmp_path) {
                    tracing::warn!(
                        "failed to clean up temp file {} after write error: {cleanup_err}",
                        tmp_path.display()
                    );
                }
            }
        }
        result?;

        debug!(
            target: "frankensearch.index",
            path = %self.path.display(),
            record_count,
            dimension = self.dimension,
            quantization = self.quantization as u8,
            vectors_offset,
            "wrote fsvi index"
        );
        Ok(())
    }
1771}
1772
1773fn parse_header(path: &Path, data: &[u8]) -> SearchResult<(VectorMetadata, usize)> {
1774 let mut cursor = 0usize;
1775 let magic = read_array::<4>(path, data, &mut cursor, "magic")?;
1776 if magic != FSVI_MAGIC {
1777 return Err(index_corrupted(
1778 path,
1779 format!("bad magic bytes: expected {FSVI_MAGIC:?}, found {magic:?}"),
1780 ));
1781 }
1782
1783 let version = u16::from_le_bytes(read_array::<2>(path, data, &mut cursor, "version")?);
1784 if version != FSVI_VERSION {
1785 return Err(SearchError::IndexVersionMismatch {
1786 expected: FSVI_VERSION,
1787 found: version,
1788 });
1789 }
1790
1791 let embedder_id_len = usize::from(u16::from_le_bytes(read_array::<2>(
1792 path,
1793 data,
1794 &mut cursor,
1795 "embedder_id_len",
1796 )?));
1797 let embedder_id_bytes = read_slice(path, data, &mut cursor, embedder_id_len, "embedder_id")?;
1798 let embedder_id = std::str::from_utf8(embedder_id_bytes)
1799 .map_err(|error| index_corrupted(path, format!("invalid UTF-8 in embedder_id: {error}")))?
1800 .to_owned();
1801
1802 let embedder_revision_len = usize::from(u16::from_le_bytes(read_array::<2>(
1803 path,
1804 data,
1805 &mut cursor,
1806 "embedder_revision_len",
1807 )?));
1808 let embedder_revision_bytes = read_slice(
1809 path,
1810 data,
1811 &mut cursor,
1812 embedder_revision_len,
1813 "embedder_revision",
1814 )?;
1815 let embedder_revision = std::str::from_utf8(embedder_revision_bytes)
1816 .map_err(|error| {
1817 index_corrupted(path, format!("invalid UTF-8 in embedder_revision: {error}"))
1818 })?
1819 .to_owned();
1820
1821 let dimension_u32 = u32::from_le_bytes(read_array::<4>(path, data, &mut cursor, "dimension")?);
1822 let dimension = usize::try_from(dimension_u32)
1823 .map_err(|_| index_corrupted(path, "dimension does not fit in usize"))?;
1824 if dimension == 0 {
1825 return Err(index_corrupted(path, "dimension must be greater than zero"));
1826 }
1827
1828 let quantization_byte = read_array::<1>(path, data, &mut cursor, "quantization")?[0];
1829 let quantization = Quantization::from_wire(quantization_byte, path)?;
1830
1831 let reserved = read_array::<3>(path, data, &mut cursor, "reserved")?;
1833 let compaction_gen = reserved[0];
1834 let record_count_u64 =
1837 u64::from_le_bytes(read_array::<8>(path, data, &mut cursor, "record_count")?);
1838 let record_count = usize::try_from(record_count_u64)
1839 .map_err(|_| index_corrupted(path, "record_count does not fit in usize"))?;
1840 let vectors_offset =
1841 u64::from_le_bytes(read_array::<8>(path, data, &mut cursor, "vectors_offset")?);
1842 let expected_crc =
1843 u32::from_le_bytes(read_array::<4>(path, data, &mut cursor, "header_crc32")?);
1844 let actual_crc = crc32(&data[..cursor - 4]);
1845 if actual_crc != expected_crc {
1846 return Err(index_corrupted(
1847 path,
1848 format!("header CRC mismatch: expected {expected_crc:#010x}, got {actual_crc:#010x}"),
1849 ));
1850 }
1851
1852 Ok((
1853 VectorMetadata {
1854 embedder_id,
1855 embedder_revision,
1856 dimension,
1857 quantization,
1858 compaction_gen,
1859 record_count,
1860 vectors_offset,
1861 },
1862 cursor,
1863 ))
1864}
1865
1866fn read_array<const N: usize>(
1867 path: &Path,
1868 data: &[u8],
1869 cursor: &mut usize,
1870 field: &str,
1871) -> SearchResult<[u8; N]> {
1872 let slice = read_slice(path, data, cursor, N, field)?;
1873 let mut out = [0_u8; N];
1874 out.copy_from_slice(slice);
1875 Ok(out)
1876}
1877
1878fn read_slice<'a>(
1879 path: &Path,
1880 data: &'a [u8],
1881 cursor: &mut usize,
1882 len: usize,
1883 field: &str,
1884) -> SearchResult<&'a [u8]> {
1885 let end = cursor
1886 .checked_add(len)
1887 .ok_or_else(|| index_corrupted(path, format!("{field} offset overflow")))?;
1888 if end > data.len() {
1889 return Err(index_corrupted(
1890 path,
1891 format!("{field} is truncated (wanted {len} bytes)"),
1892 ));
1893 }
1894 let out = &data[*cursor..end];
1895 *cursor = end;
1896 Ok(out)
1897}
1898
1899fn build_header_prefix(
1900 embedder_id: &str,
1901 embedder_revision: &str,
1902 dimension: usize,
1903 quantization: Quantization,
1904 compaction_gen: u8,
1905 record_count: usize,
1906 vectors_offset: u64,
1907) -> SearchResult<Vec<u8>> {
1908 validate_header_string(embedder_id, "embedder_id")?;
1909 validate_header_string(embedder_revision, "embedder_revision")?;
1910 let dimension_u32 = u32::try_from(dimension).map_err(|_| SearchError::InvalidConfig {
1911 field: "dimension".to_owned(),
1912 value: dimension.to_string(),
1913 reason: "dimension must fit in u32".to_owned(),
1914 })?;
1915 let record_count_u64 = u64::try_from(record_count).map_err(|_| SearchError::InvalidConfig {
1916 field: "record_count".to_owned(),
1917 value: record_count.to_string(),
1918 reason: "record_count must fit in u64".to_owned(),
1919 })?;
1920 let mut out = Vec::with_capacity(
1921 4 + 2 + 2 + embedder_id.len() + 2 + embedder_revision.len() + 4 + 1 + 3 + 8 + 8,
1922 );
1923 out.extend_from_slice(&FSVI_MAGIC);
1924 out.extend_from_slice(&FSVI_VERSION.to_le_bytes());
1925 out.extend_from_slice(
1926 &u16::try_from(embedder_id.len())
1927 .map_err(|_| SearchError::InvalidConfig {
1928 field: "embedder_id".to_owned(),
1929 value: embedder_id.to_owned(),
1930 reason: "embedder_id byte length must fit in u16".to_owned(),
1931 })?
1932 .to_le_bytes(),
1933 );
1934 out.extend_from_slice(embedder_id.as_bytes());
1935 out.extend_from_slice(
1936 &u16::try_from(embedder_revision.len())
1937 .map_err(|_| SearchError::InvalidConfig {
1938 field: "embedder_revision".to_owned(),
1939 value: embedder_revision.to_owned(),
1940 reason: "embedder_revision byte length must fit in u16".to_owned(),
1941 })?
1942 .to_le_bytes(),
1943 );
1944 out.extend_from_slice(embedder_revision.as_bytes());
1945 out.extend_from_slice(&dimension_u32.to_le_bytes());
1946 out.push(quantization as u8);
1947 out.push(compaction_gen);
1948 out.extend_from_slice(&[0_u8; 2]);
1949 out.extend_from_slice(&record_count_u64.to_le_bytes());
1950 out.extend_from_slice(&vectors_offset.to_le_bytes());
1951 Ok(out)
1952}
1953
1954fn validate_header_string(value: &str, field: &str) -> SearchResult<()> {
1955 if value.is_empty() && field == "embedder_id" {
1956 return Err(SearchError::InvalidConfig {
1957 field: field.to_owned(),
1958 value: value.to_owned(),
1959 reason: "embedder_id cannot be empty".to_owned(),
1960 });
1961 }
1962 let _ = u16::try_from(value.len()).map_err(|_| SearchError::InvalidConfig {
1963 field: field.to_owned(),
1964 value: value.to_owned(),
1965 reason: "value length must fit in u16".to_owned(),
1966 })?;
1967 Ok(())
1968}
1969
1970fn write_vector_slab<W: Write>(
1971 writer: &mut W,
1972 records: &[PendingRecord],
1973 quantization: Quantization,
1974) -> SearchResult<()> {
1975 match quantization {
1976 Quantization::F16 => {
1977 for record in records {
1978 for value in &record.embedding {
1979 writer.write_all(&f16::from_f32(*value).to_le_bytes())?;
1980 }
1981 }
1982 }
1983 Quantization::F32 => {
1984 for record in records {
1985 for value in &record.embedding {
1986 writer.write_all(&value.to_le_bytes())?;
1987 }
1988 }
1989 }
1990 }
1991 Ok(())
1992}
1993
1994fn align_up(value: u64, alignment: u64) -> SearchResult<u64> {
1995 if alignment == 0 {
1996 return Ok(value);
1997 }
1998 let add = alignment
1999 .checked_sub(1)
2000 .ok_or_else(|| SearchError::InvalidConfig {
2001 field: "alignment".to_owned(),
2002 value: alignment.to_string(),
2003 reason: "alignment underflow".to_owned(),
2004 })?;
2005 let padded = value
2006 .checked_add(add)
2007 .ok_or_else(|| SearchError::InvalidConfig {
2008 field: "alignment".to_owned(),
2009 value: format!("{value}+{add}"),
2010 reason: "alignment overflow".to_owned(),
2011 })?;
2012 Ok((padded / alignment) * alignment)
2013}
2014
/// Derives a temporary sibling path for `path` by appending
/// `.tmp.<pid>.<nanoseconds since the Unix epoch>` to it.
///
/// If the system clock is before the epoch the timestamp part is `0`.
fn temporary_output_path(path: &Path) -> PathBuf {
    let pid = std::process::id();
    let now = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());

    let mut suffixed = path.as_os_str().to_owned();
    suffixed.push(format!(".tmp.{pid}.{now}"));
    suffixed.into()
}
2025
2026fn sync_parent_directory(path: &Path) -> SearchResult<()> {
2027 #[cfg(unix)]
2028 {
2029 if let Some(parent) = path.parent() {
2030 let dir = File::open(parent)?;
2031 dir.sync_all()?;
2032 }
2033 }
2034 #[cfg(not(unix))]
2035 {
2036 let _ = path;
2037 }
2038 Ok(())
2039}
2040
2041fn index_corrupted(path: &Path, detail: impl Into<String>) -> SearchError {
2042 SearchError::IndexCorrupted {
2043 path: path.to_path_buf(),
2044 detail: detail.into(),
2045 }
2046}
2047
2048fn crc32(data: &[u8]) -> u32 {
2049 let mut hasher = Crc32::new();
2050 hasher.update(data);
2051 hasher.finalize()
2052}
2053
/// Computes the 64-bit FNV-1a hash of `bytes`.
///
/// Starting from the offset basis, each byte is XOR-ed into the state and
/// the state is then multiplied by the FNV prime with wrapping arithmetic.
pub(crate) fn fnv1a_hash(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0100_0000_01b3;

    bytes.iter().fold(OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
2062
2063const fn is_tombstoned_flags(flags: u16) -> bool {
2064 flags & RECORD_FLAG_TOMBSTONE != 0
2065}
2066
/// Returns the generation that follows `current`.
///
/// The counter wraps from 255 to 1, so 0 is never produced by this function.
const fn next_generation(current: u8) -> u8 {
    match current.checked_add(1) {
        Some(bumped) => bumped,
        // 255 + 1 overflows: restart at 1 rather than 0.
        None => 1,
    }
}
2070
2071#[cfg(test)]
2072mod tests {
2073 use super::*;
2074
2075 fn temp_index_path(name: &str) -> PathBuf {
2076 let now = SystemTime::now()
2077 .duration_since(UNIX_EPOCH)
2078 .unwrap_or_default()
2079 .as_nanos();
2080 std::env::temp_dir().join(format!(
2081 "frankensearch-index-{name}-{}-{now}.fsvi",
2082 std::process::id()
2083 ))
2084 }
2085
2086 fn sample_vector(base: f32, dim: usize) -> Vec<f32> {
2087 vec![base; dim]
2088 }
2089
2090 #[test]
2091 fn round_trip_f16_with_revision_and_lookup() {
2092 let path = temp_index_path("round-trip");
2093 let mut writer =
2094 VectorIndex::create_with_revision(&path, "fnv1a-384", "rev-123", 8, Quantization::F16)
2095 .expect("writer");
2096 writer
2097 .write_record("doc-b", &sample_vector(1.0, 8))
2098 .expect("write doc-b");
2099 writer
2100 .write_record("doc-a", &sample_vector(2.0, 8))
2101 .expect("write doc-a");
2102 writer.finish().expect("finish");
2103
2104 let index = VectorIndex::open(&path).expect("open index");
2105 assert_eq!(index.record_count(), 2);
2106 assert_eq!(index.dimension(), 8);
2107 assert_eq!(index.embedder_id(), "fnv1a-384");
2108 assert_eq!(index.embedder_revision(), "rev-123");
2109 assert_eq!(index.quantization(), Quantization::F16);
2110 assert_eq!(index.metadata().vectors_offset % VECTOR_ALIGN_BYTES, 0);
2111
2112 let hash_a = fnv1a_hash(b"doc-a");
2113 let pos_a = index
2114 .find_index_by_doc_hash(hash_a)
2115 .expect("hash lookup should find doc-a");
2116 let doc_id = index.doc_id_at(pos_a).expect("doc id");
2117 assert_eq!(doc_id, "doc-a");
2118 let vec_a = index.vector_at_f32(pos_a).expect("vector");
2119 assert_eq!(vec_a.len(), 8);
2120 assert!((vec_a[0] - 2.0).abs() < 0.002);
2121 }
2122
2123 #[test]
2124 fn detects_header_crc_corruption() {
2125 let path = temp_index_path("crc");
2126 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2127 writer
2128 .write_record("doc-1", &sample_vector(0.5, 4))
2129 .expect("write");
2130 writer.finish().expect("finish");
2131
2132 let mut bytes = fs::read(&path).expect("read index");
2133 bytes[6] ^= 0xAA;
2135 fs::write(&path, bytes).expect("rewrite corrupt index");
2136
2137 let error = VectorIndex::open(&path).expect_err("corruption should be detected");
2138 assert!(matches!(error, SearchError::IndexCorrupted { .. }));
2139 }
2140
2141 #[test]
2142 fn write_record_dimension_mismatch_is_error() {
2143 let path = temp_index_path("dim-mismatch");
2144 let mut writer = VectorIndex::create(&path, "fnv1a-384", 3).expect("writer");
2145 let error = writer
2146 .write_record("doc-1", &[1.0, 2.0])
2147 .expect_err("must reject wrong dimension");
2148 assert!(matches!(
2149 error,
2150 SearchError::DimensionMismatch {
2151 expected: 3,
2152 found: 2
2153 }
2154 ));
2155 }
2156
2157 #[test]
2158 fn empty_index_round_trip() {
2159 let path = temp_index_path("empty");
2160 let writer = VectorIndex::create(&path, "fnv1a-384", 16).expect("writer");
2161 writer.finish().expect("finish");
2162
2163 let index = VectorIndex::open(&path).expect("open");
2164 assert_eq!(index.record_count(), 0);
2165 assert_eq!(index.dimension(), 16);
2166 }
2167
2168 #[test]
2169 fn get_embeddings_returns_none_for_missing_hashes() {
2170 let path = temp_index_path("get-embeddings");
2171 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2172 writer
2173 .write_record("doc-1", &[0.1, 0.2, 0.3, 0.4])
2174 .expect("write");
2175 writer.finish().expect("finish");
2176
2177 let index = VectorIndex::open(&path).expect("open");
2178 let existing = fnv1a_hash(b"doc-1");
2179 let missing = fnv1a_hash(b"missing");
2180 let embeddings = index.get_embeddings(&[existing, missing]);
2181 assert!(embeddings[0].is_some());
2182 assert!(embeddings[1].is_none());
2183 assert_eq!(embeddings[0].as_ref().expect("existing").len(), 4);
2184 }
2185
2186 #[test]
2187 fn soft_delete_marks_record_and_hides_hash_lookup() {
2188 let path = temp_index_path("soft-delete-main");
2189 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2190 writer
2191 .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2192 .expect("write doc-a");
2193 writer
2194 .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2195 .expect("write doc-b");
2196 writer.finish().expect("finish");
2197
2198 let mut index = VectorIndex::open(&path).expect("open");
2199 assert!(index.soft_delete("doc-a").expect("soft delete"));
2200 assert!(!index.soft_delete("doc-a").expect("idempotent soft delete"));
2201
2202 let hash_a = fnv1a_hash(b"doc-a");
2203 let hash_b = fnv1a_hash(b"doc-b");
2204 assert_eq!(index.find_index_by_doc_hash(hash_a), None);
2205 assert!(index.find_index_by_doc_hash(hash_b).is_some());
2206 assert_eq!(index.tombstone_count(), 1);
2207
2208 let hits = index
2209 .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
2210 .expect("search");
2211 assert_eq!(hits.len(), 1);
2212 assert_eq!(hits[0].doc_id, "doc-b");
2213
2214 std::fs::remove_file(&path).ok();
2215 }
2216
2217 #[test]
2218 fn soft_delete_missing_returns_false() {
2219 let path = temp_index_path("soft-delete-missing");
2220 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2221 writer
2222 .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2223 .expect("write");
2224 writer.finish().expect("finish");
2225
2226 let mut index = VectorIndex::open(&path).expect("open");
2227 assert!(
2228 !index
2229 .soft_delete("missing-doc")
2230 .expect("missing soft delete")
2231 );
2232 assert_eq!(index.tombstone_count(), 0);
2233
2234 std::fs::remove_file(&path).ok();
2235 }
2236
2237 #[test]
2238 fn soft_delete_batch_counts_only_new_tombstones() {
2239 let path = temp_index_path("soft-delete-batch");
2240 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2241 writer
2242 .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2243 .expect("write a");
2244 writer
2245 .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2246 .expect("write b");
2247 writer
2248 .write_record("doc-c", &[0.0, 0.0, 1.0, 0.0])
2249 .expect("write c");
2250 writer.finish().expect("finish");
2251
2252 let mut index = VectorIndex::open(&path).expect("open");
2253 let deleted = index
2254 .soft_delete_batch(&["doc-a", "doc-b", "missing", "doc-a"])
2255 .expect("batch delete");
2256 assert_eq!(deleted, 2);
2257 assert_eq!(index.tombstone_count(), 2);
2258
2259 std::fs::remove_file(&path).ok();
2260 }
2261
2262 #[test]
2263 fn tombstone_ratio_and_needs_vacuum_threshold() {
2264 let path = temp_index_path("soft-delete-ratio");
2265 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2266 for i in 0..10 {
2267 writer
2268 .write_record(&format!("doc-{i}"), &sample_vector(0.1, 4))
2269 .expect("write");
2270 }
2271 writer.finish().expect("finish");
2272
2273 let mut index = VectorIndex::open(&path).expect("open");
2274 assert!(index.tombstone_ratio().abs() < f64::EPSILON);
2275 assert!(!index.needs_vacuum());
2276
2277 index.soft_delete("doc-0").expect("delete 0");
2278 index.soft_delete("doc-1").expect("delete 1");
2279 assert_eq!(index.tombstone_count(), 2);
2280 assert!((index.tombstone_ratio() - 0.2).abs() < f64::EPSILON);
2281 assert!(!index.needs_vacuum(), "threshold is strict greater-than");
2282
2283 index.soft_delete("doc-2").expect("delete 2");
2284 assert_eq!(index.tombstone_count(), 3);
2285 assert!(index.needs_vacuum());
2286
2287 std::fs::remove_file(&path).ok();
2288 }
2289
2290 #[test]
2291 fn vacuum_removes_tombstones_and_preserves_live_results() {
2292 let path = temp_index_path("soft-delete-vacuum");
2293 let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
2294 writer
2295 .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
2296 .expect("write a");
2297 writer
2298 .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
2299 .expect("write b");
2300 writer
2301 .write_record("doc-c", &[0.0, 0.0, 1.0, 0.0])
2302 .expect("write c");
2303 writer.finish().expect("finish");
2304
2305 let mut index = VectorIndex::open(&path).expect("open");
2306 index.soft_delete("doc-b").expect("delete b");
2307
2308 let pre_hits = index
2309 .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
2310 .expect("pre-vacuum search");
2311 assert_eq!(pre_hits.len(), 2);
2312 assert!(pre_hits.iter().all(|hit| hit.doc_id != "doc-b"));
2313
2314 let stats = index.vacuum().expect("vacuum");
2315 assert_eq!(stats.records_before, 3);
2316 assert_eq!(stats.records_after, 2);
2317 assert_eq!(stats.tombstones_removed, 1);
2318 assert!(stats.bytes_reclaimed > 0);
2319 assert!(stats.duration >= Duration::ZERO);
2320
2321 assert_eq!(index.record_count(), 2);
2322 assert_eq!(index.tombstone_count(), 0);
2323 assert_eq!(index.find_index_by_doc_hash(fnv1a_hash(b"doc-b")), None);
2324
2325 let post_hits = index
2326 .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
2327 .expect("post-vacuum search");
2328 assert_eq!(post_hits.len(), 2);
2329 assert!(post_hits.iter().all(|hit| hit.doc_id != "doc-b"));
2330
2331 std::fs::remove_file(&path).ok();
2332 }
2333
/// Runs one deleting thread and four searching threads against a single
/// mutex-guarded index, then checks that none of the soft-deleted documents
/// is returned by a final search.
#[test]
fn soft_delete_and_search_interleaving_has_no_corruption() {
    use std::collections::HashSet;
    use std::sync::{Arc, Mutex};

    let path = temp_index_path("soft-delete-concurrent");
    let dim = 4;
    // Every record and every query uses the same unit vector.
    let unit_x = [1.0, 0.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "fnv1a-384", dim).expect("writer");
    for n in 0..128 {
        let doc_id = format!("doc-{n:03}");
        writer.write_record(&doc_id, &unit_x).expect("write");
    }
    writer.finish().expect("finish");

    let shared = Arc::new(Mutex::new(VectorIndex::open(&path).expect("open")));

    let delete_handle = Arc::clone(&shared);
    let deleter = std::thread::spawn(move || {
        for n in 0..32 {
            let mut guard = delete_handle.lock().expect("lock for delete");
            let doc_id = format!("doc-{n:03}");
            let _ = guard.soft_delete(&doc_id).expect("soft delete");
        }
    });

    let mut searchers = Vec::with_capacity(4);
    for _ in 0..4 {
        let search_handle = Arc::clone(&shared);
        searchers.push(std::thread::spawn(move || {
            for _ in 0..32 {
                // The guard is a temporary, so the lock is released before the assertion.
                let hits = search_handle
                    .lock()
                    .expect("lock for search")
                    .search_top_k(&unit_x, 10, None)
                    .expect("search");
                assert!(!hits.is_empty());
            }
        }));
    }

    deleter.join().expect("join deleter");
    searchers
        .into_iter()
        .for_each(|handle| handle.join().expect("join searcher"));

    let hits = shared
        .lock()
        .expect("lock final")
        .search_top_k(&unit_x, 64, None)
        .expect("final search");
    let deleted_ids: HashSet<String> = (0..32).map(|n| format!("doc-{n:03}")).collect();
    assert!(!hits.iter().any(|hit| deleted_ids.contains(&hit.doc_id)));

    std::fs::remove_file(&path).ok();
}
2394
/// Seeds a non-tombstone flag bit on a record, soft-deletes the record, and
/// checks that the tombstone bit is set while the seeded bit is still present.
#[test]
fn soft_delete_preserves_existing_non_tombstone_flags() {
    const CUSTOM_FLAG: u16 = 0x0004;

    let path = temp_index_path("soft-delete-flags");
    {
        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
        writer
            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
            .expect("write doc-a");
        writer.finish().expect("finish");
    }

    let mut index = VectorIndex::open(&path).expect("open");
    let record_index = index
        .find_index_by_doc_hash(fnv1a_hash(b"doc-a"))
        .expect("record index for doc-a");

    index
        .set_record_flags(record_index, CUSTOM_FLAG)
        .expect("seed custom flag");
    let seeded_flags = index.record_at(record_index).expect("read flags").flags;
    assert_eq!(seeded_flags, CUSTOM_FLAG);

    let was_deleted = index.soft_delete("doc-a").expect("soft delete doc-a");
    assert!(was_deleted);

    let flags_after = index.record_at(record_index).expect("read flags").flags;
    assert_eq!(
        flags_after & RECORD_FLAG_TOMBSTONE,
        RECORD_FLAG_TOMBSTONE,
        "tombstone bit must be set",
    );
    assert_eq!(
        flags_after & CUSTOM_FLAG,
        CUSTOM_FLAG,
        "non-tombstone bits must remain untouched",
    );

    std::fs::remove_file(&path).ok();
}
2434
/// Soft-deletes a document, drops the index handle, reopens the file, and
/// checks that the deletion is still visible through the tombstone count,
/// the hash lookup, and search.
#[test]
fn tombstone_flag_persists_after_reopen() {
    let path = temp_index_path("soft-delete-persist");

    let mut builder = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
    builder
        .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
        .expect("write a");
    builder
        .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
        .expect("write b");
    builder.finish().expect("finish");

    // First handle: delete, then drop it before reopening.
    let mut first_handle = VectorIndex::open(&path).expect("open for delete");
    assert!(first_handle.soft_delete("doc-a").expect("delete doc-a"));
    assert_eq!(first_handle.tombstone_count(), 1);
    drop(first_handle);

    let reopened = VectorIndex::open(&path).expect("reopen");
    assert_eq!(reopened.tombstone_count(), 1);
    assert!(reopened
        .find_index_by_doc_hash(fnv1a_hash(b"doc-a"))
        .is_none());

    let query = [1.0, 0.0, 0.0, 0.0];
    let hits = reopened
        .search_top_k(&query, 10, None)
        .expect("search after reopen");
    assert!(!hits.iter().any(|hit| hit.doc_id == "doc-a"));

    std::fs::remove_file(&path).ok();
}
2463
/// Full lifecycle check: write 100 documents, batch-delete the first 50,
/// vacuum, append 50 new documents, compact, and confirm that search returns
/// exactly `doc-050` through `doc-149`.
#[test]
fn delete_vacuum_append_cycle_keeps_expected_live_set() {
    use std::collections::HashSet;

    let path = temp_index_path("soft-delete-reindex-cycle");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "fnv1a-384", dim).expect("writer");
    for n in 0..100 {
        let doc_id = format!("doc-{n:03}");
        writer
            .write_record(&doc_id, &[1.0, 0.0, 0.0, 0.0])
            .expect("write initial doc");
    }
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");

    // Phase 1: tombstone the first half.
    let doomed: Vec<String> = (0..50).map(|n| format!("doc-{n:03}")).collect();
    let doomed_refs: Vec<&str> = doomed.iter().map(|id| id.as_str()).collect();
    let deleted = index.soft_delete_batch(&doomed_refs).expect("batch delete");
    assert_eq!(deleted, 50);
    assert_eq!(index.tombstone_count(), 50);

    // Phase 2: vacuum removes the tombstoned records.
    let vacuum_stats = index.vacuum().expect("vacuum");
    assert_eq!(vacuum_stats.records_before, 100);
    assert_eq!(vacuum_stats.records_after, 50);
    assert_eq!(index.tombstone_count(), 0);
    assert_eq!(index.record_count(), 50);

    // Phase 3: append 50 fresh documents; they are counted as WAL records.
    let mut fresh: Vec<(String, Vec<f32>)> = Vec::with_capacity(50);
    for n in 100..150 {
        fresh.push((format!("doc-{n:03}"), vec![1.0, 0.0, 0.0, 0.0]));
    }
    index.append_batch(&fresh).expect("append batch");
    assert_eq!(index.wal_record_count(), 50);

    // Phase 4: compaction brings the record count back to 100.
    let compact_stats = index.compact().expect("compact");
    assert_eq!(compact_stats.total_records_after, 100);
    assert_eq!(index.record_count(), 100);
    assert_eq!(index.wal_record_count(), 0);

    let hits = index
        .search_top_k(&[1.0, 0.0, 0.0, 0.0], 150, None)
        .expect("search");
    assert_eq!(hits.len(), 100);

    let mut ids: HashSet<String> = HashSet::new();
    for hit in hits.iter() {
        ids.insert(hit.doc_id.clone());
    }

    for n in 0..150 {
        let doc_id = format!("doc-{n:03}");
        if n < 50 {
            assert!(!ids.contains(&doc_id), "deleted id must not be present",);
        } else {
            assert!(ids.contains(&doc_id), "live id must be present",);
        }
    }

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2525
/// A soft-deleted document must be absent from search results both while a
/// WAL entry is pending and after `compact()` has run.
#[test]
fn tombstones_remain_excluded_with_wal_and_after_compaction() {
    let path = temp_index_path("soft-delete-wal-integration");
    let dim = 4;
    let unit_x = [1.0, 0.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "fnv1a-384", dim).expect("writer");
    writer.write_record("doc-a", &unit_x).expect("write a");
    writer.write_record("doc-b", &unit_x).expect("write b");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    let removed = index.soft_delete("doc-a").expect("delete a");
    assert!(removed);
    index.append("doc-c", &unit_x).expect("append c");
    assert_eq!(index.wal_record_count(), 1);

    // Before compaction: doc-b and doc-c are returned, doc-a is not.
    let pre_compact = index
        .search_top_k(&unit_x, 10, None)
        .expect("pre-compact search");
    assert_eq!(pre_compact.len(), 2);
    assert!(!pre_compact.iter().any(|hit| hit.doc_id == "doc-a"));
    assert!(pre_compact.iter().any(|hit| hit.doc_id == "doc-b"));
    assert!(pre_compact.iter().any(|hit| hit.doc_id == "doc-c"));

    index.compact().expect("compact");

    // After compaction: identical expectations.
    let post_compact = index
        .search_top_k(&unit_x, 10, None)
        .expect("post-compact search");
    assert_eq!(post_compact.len(), 2);
    assert!(!post_compact.iter().any(|hit| hit.doc_id == "doc-a"));
    assert!(post_compact.iter().any(|hit| hit.doc_id == "doc-b"));
    assert!(post_compact.iter().any(|hit| hit.doc_id == "doc-c"));

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2567
/// Vacuuming an index without tombstones must report zero removals and leave
/// the record count unchanged.
#[test]
fn vacuum_noop_when_no_tombstones() {
    let path = temp_index_path("soft-delete-vacuum-noop");
    {
        let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
        writer
            .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
            .expect("write a");
        writer
            .write_record("doc-b", &[0.0, 1.0, 0.0, 0.0])
            .expect("write b");
        writer.finish().expect("finish");
    }

    let mut index = VectorIndex::open(&path).expect("open");
    assert_eq!(index.tombstone_count(), 0);

    let report = index.vacuum().expect("vacuum with no tombstones");
    assert_eq!(report.records_before, 2);
    assert_eq!(report.records_after, 2);
    assert_eq!(report.tombstones_removed, 0);
    assert_eq!(index.record_count(), 2);

    std::fs::remove_file(&path).ok();
}
2591
/// Soft-deleting every record must drive the tombstone ratio to 1.0, flag the
/// index as needing a vacuum, and make search return no hits.
#[test]
fn soft_delete_all_records_yields_empty_search() {
    let path = temp_index_path("soft-delete-all");

    let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
    for n in 0..5 {
        let doc_id = format!("doc-{n}");
        writer
            .write_record(&doc_id, &sample_vector(0.1, 4))
            .expect("write");
    }
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    for n in 0..5 {
        let doc_id = format!("doc-{n}");
        let removed = index.soft_delete(&doc_id).expect("delete");
        assert!(removed);
    }
    assert_eq!(index.tombstone_count(), 5);
    let ratio_gap = (index.tombstone_ratio() - 1.0).abs();
    assert!(ratio_gap < f64::EPSILON);
    assert!(index.needs_vacuum());

    let hits = index
        .search_top_k(&sample_vector(0.1, 4), 10, None)
        .expect("search");
    assert!(
        hits.is_empty(),
        "search with all deleted should return nothing"
    );

    std::fs::remove_file(&path).ok();
}
2621
/// Vacuuming an index in which every record has been soft-deleted must leave
/// an empty index: zero records, zero tombstones, a zero tombstone ratio, and
/// no search hits.
#[test]
fn vacuum_after_deleting_all_records_yields_empty_index() {
    let path = temp_index_path("soft-delete-vacuum-all");
    let mut writer = VectorIndex::create(&path, "fnv1a-384", 4).expect("writer");
    for i in 0..3 {
        writer
            .write_record(&format!("doc-{i}"), &[1.0, 0.0, 0.0, 0.0])
            .expect("write");
    }
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    for i in 0..3 {
        // Assert the returned flag (as the sibling "delete all" test does) so a
        // failed delete is reported here instead of at the vacuum counters.
        assert!(
            index.soft_delete(&format!("doc-{i}")).expect("delete"),
            "every seeded document must be reported as deleted"
        );
    }

    let stats = index.vacuum().expect("vacuum all deleted");
    assert_eq!(stats.records_before, 3);
    assert_eq!(stats.records_after, 0);
    assert_eq!(stats.tombstones_removed, 3);
    assert_eq!(index.record_count(), 0);
    assert_eq!(index.tombstone_count(), 0);
    assert!(index.tombstone_ratio().abs() < f64::EPSILON);
    assert!(!index.needs_vacuum());

    let hits = index
        .search_top_k(&[1.0, 0.0, 0.0, 0.0], 10, None)
        .expect("search");
    assert!(hits.is_empty());

    std::fs::remove_file(&path).ok();
}
2654
/// A single appended vector must be counted as one WAL record and be returned
/// by search, ranking first when the query equals the appended vector.
#[test]
fn append_single_vector_is_searchable() {
    let path = temp_index_path("wal-append-single");
    let dim = 4;
    let unit_y = [0.0, 1.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    assert_eq!(index.wal_record_count(), 0);
    index.append("wal-0", &unit_y).expect("append");
    assert_eq!(index.wal_record_count(), 1);

    // Query with the appended vector itself.
    let hits = index.search_top_k(&unit_y, 10, None).expect("search");
    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].doc_id, "wal-0", "WAL entry should rank first");

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2688
/// Appending a batch of three vectors must register three WAL records and
/// make all of them searchable alongside the one main-index record.
#[test]
fn append_batch_all_searchable() {
    let path = temp_index_path("wal-append-batch");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
        .expect("write");
    writer.finish().expect("finish");

    let batch = [
        ("wal-0".to_owned(), vec![0.0, 1.0, 0.0, 0.0]),
        ("wal-1".to_owned(), vec![0.0, 0.0, 1.0, 0.0]),
        ("wal-2".to_owned(), vec![0.0, 0.0, 0.0, 1.0]),
    ];

    let mut index = VectorIndex::open(&path).expect("open");
    index.append_batch(&batch).expect("append batch");
    assert_eq!(index.wal_record_count(), 3);

    let all_ones = [1.0, 1.0, 1.0, 1.0];
    let hits = index.search_top_k(&all_ones, 10, None).expect("search");
    assert_eq!(hits.len(), 4, "all 4 vectors should be returned");

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2718
/// `compact()` must fold pending WAL records into the main index, report the
/// before/after counts, reset the WAL count, and delete the WAL file.
#[test]
fn compaction_merges_wal_into_main() {
    let path = temp_index_path("wal-compact");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    for (doc_id, embedding) in [
        ("wal-0", [0.0, 1.0, 0.0, 0.0]),
        ("wal-1", [0.0, 0.0, 1.0, 0.0]),
    ] {
        index.append(doc_id, &embedding).expect("append");
    }

    // Before compaction the appended records are counted separately.
    assert_eq!(index.record_count(), 1);
    assert_eq!(index.wal_record_count(), 2);

    let report = index.compact().expect("compact");
    assert_eq!(report.main_records_before, 1);
    assert_eq!(report.wal_records, 2);
    assert_eq!(report.total_records_after, 3);
    assert_eq!(index.record_count(), 3);
    assert_eq!(index.wal_record_count(), 0);

    let wal_file = wal::wal_path_for(&path);
    assert!(!wal_file.exists(), "WAL should be deleted");

    let hits = index
        .search_top_k(&[1.0, 1.0, 1.0, 1.0], 10, None)
        .expect("search");
    assert_eq!(hits.len(), 3);

    std::fs::remove_file(&path).ok();
}
2757
/// With ten main records and the WAL config set below, `needs_compaction()`
/// must be false before any append and true after a single append.
#[test]
fn needs_compaction_threshold() {
    let path = temp_index_path("wal-threshold");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    for n in 0..10 {
        let doc_id = format!("main-{n}");
        writer
            .write_record(&doc_id, &sample_vector(0.1, dim))
            .expect("write");
    }
    writer.finish().expect("finish");

    let config = WalConfig {
        compaction_threshold: 5,
        compaction_ratio: 0.10,
        fsync_on_write: false,
    };
    let mut index = VectorIndex::open(&path).expect("open");
    index.set_wal_config(config);

    let before_append = index.needs_compaction();
    assert!(!before_append);

    index
        .append("wal-0", &sample_vector(0.2, dim))
        .expect("append");
    let after_append = index.needs_compaction();
    assert!(after_append);

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2789
/// A record appended through one handle must still be counted and searchable
/// after that handle is dropped and the index is reopened.
#[test]
fn wal_survives_reopen() {
    let path = temp_index_path("wal-reopen");
    let dim = 4;
    let unit_y = [0.0, 1.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &[1.0, 0.0, 0.0, 0.0])
        .expect("write");
    writer.finish().expect("finish");

    // First session: append, then drop the handle.
    let mut first_session = VectorIndex::open(&path).expect("open");
    first_session.append("wal-0", &unit_y).expect("append");
    drop(first_session);

    // Second session: the appended record must have been picked up.
    let index = VectorIndex::open(&path).expect("reopen");
    assert_eq!(index.wal_record_count(), 1);

    let hits = index.search_top_k(&unit_y, 10, None).expect("search");
    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].doc_id, "wal-0");

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2822
/// Appending a vector of the wrong dimension must fail with
/// `SearchError::DimensionMismatch` and leave the WAL record count at zero.
#[test]
fn append_dimension_mismatch_rejected() {
    let path = temp_index_path("wal-dim-mismatch");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &sample_vector(1.0, dim))
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    // Two components against a four-dimensional index.
    let too_short = [1.0, 2.0];
    let err = index
        .append("bad", &too_short)
        .expect_err("should reject wrong dimension");
    assert!(matches!(err, SearchError::DimensionMismatch { .. }));

    let pending = index.wal_record_count();
    assert_eq!(pending, 0, "failed append should not persist");

    std::fs::remove_file(&path).ok();
}
2847
/// Compacting with no pending WAL records must succeed and report zero WAL
/// records with the main record count unchanged.
#[test]
fn compact_empty_wal_is_noop() {
    let path = temp_index_path("wal-compact-empty");
    let dim = 4;

    {
        let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
        writer
            .write_record("main-0", &sample_vector(1.0, dim))
            .expect("write");
        writer.finish().expect("finish");
    }

    let mut index = VectorIndex::open(&path).expect("open");
    let report = index.compact().expect("compact empty WAL");
    assert_eq!(report.wal_records, 0);
    assert_eq!(report.total_records_after, 1);

    std::fs::remove_file(&path).ok();
}
2866
/// An appended vector identical to the query must outrank a main-index vector
/// that only partially matches it.
#[test]
fn wal_entries_rank_correctly_against_main() {
    let path = temp_index_path("wal-ranking");
    let dim = 4;
    let unit_x = [1.0, 0.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-mediocre", &[0.5, 0.5, 0.0, 0.0])
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    index.append("wal-perfect", &unit_x).expect("append");

    let hits = index.search_top_k(&unit_x, 2, None).expect("search");
    assert_eq!(hits.len(), 2);

    let (best, runner_up) = (&hits[0], &hits[1]);
    assert_eq!(best.doc_id, "wal-perfect");
    assert!(best.score > runner_up.score);

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2895
/// Appending a vector under a doc id that already exists in the main index
/// must yield exactly one search hit for that id (per the assertion message,
/// the WAL entry shadows the main one).
#[test]
fn append_duplicate_doc_id_both_searchable() {
    let path = temp_index_path("wal-dup-docid");
    let dim = 4;
    let unit_x = [1.0, 0.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer.write_record("doc-a", &unit_x).expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    // Same id as the main record, different embedding.
    index
        .append("doc-a", &[0.0, 0.0, 0.0, 1.0])
        .expect("append duplicate");
    assert_eq!(index.wal_record_count(), 1);

    let hits = index.search_top_k(&unit_x, 10, None).expect("search");
    let hit_count = hits.len();
    assert_eq!(
        hit_count,
        1,
        "WAL shadows main — only WAL entry should appear"
    );
    assert_eq!(hits[0].doc_id, "doc-a");

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2928
/// Appends a batch of 100 vectors and checks that all are counted as WAL
/// records and that a top-5 search still includes the main-index record.
#[test]
fn append_large_batch_100_vectors() {
    let path = temp_index_path("wal-large-batch");
    let dim = 8;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &sample_vector(1.0, dim))
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");

    let mut batch: Vec<(String, Vec<f32>)> = Vec::with_capacity(100);
    for n in 0..100 {
        #[allow(clippy::cast_precision_loss)]
        let base = (n as f32) * 0.01;
        batch.push((format!("wal-{n:03}"), sample_vector(base, dim)));
    }
    index.append_batch(&batch).expect("large batch");
    assert_eq!(index.wal_record_count(), 100);

    let hits = index
        .search_top_k(&sample_vector(1.0, dim), 5, None)
        .expect("search");
    assert_eq!(hits.len(), 5);
    let main_present = hits.iter().any(|hit| hit.doc_id == "main-0");
    assert!(main_present);

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
2961
/// After 20 appends, shares the index immutably across four threads and
/// checks that each thread's top-10 search returns ten positive-score hits.
#[test]
fn concurrent_append_and_search() {
    use std::sync::Arc;

    let path = temp_index_path("wal-concurrent");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    for n in 0..10 {
        let doc_id = format!("main-{n}");
        writer
            .write_record(&doc_id, &sample_vector(0.1, dim))
            .expect("write");
    }
    writer.finish().expect("finish");

    // All appends complete before the index is shared.
    let mut index = VectorIndex::open(&path).expect("open");
    for n in 0..20 {
        let doc_id = format!("wal-{n}");
        index
            .append(&doc_id, &sample_vector(0.5, dim))
            .expect("append");
    }

    let index = Arc::new(index);
    let query = sample_vector(1.0, dim);

    let mut workers = Vec::with_capacity(4);
    for _ in 0..4 {
        let shared_index = Arc::clone(&index);
        let thread_query = query.clone();
        workers.push(std::thread::spawn(move || {
            shared_index
                .search_top_k(&thread_query, 10, None)
                .expect("search")
        }));
    }

    for worker in workers {
        let hits = worker.join().expect("thread join");
        assert_eq!(hits.len(), 10);
        for hit in hits.iter() {
            assert!(hit.score > 0.0);
        }
    }

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
3007
/// Tracks `wal_record_count()` and `record_count()` through an
/// append → compact → append sequence and checks the final search size.
#[test]
fn wal_record_count_across_append_compact_cycles() {
    let path = temp_index_path("wal-count-cycle");
    let dim = 4;

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &sample_vector(1.0, dim))
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    assert_eq!(index.wal_record_count(), 0);
    assert_eq!(index.record_count(), 1);

    // Round one: three appends are counted as WAL records only.
    for (doc_id, base, label) in [("w1", 0.1, "a1"), ("w2", 0.2, "a2"), ("w3", 0.3, "a3")] {
        index.append(doc_id, &sample_vector(base, dim)).expect(label);
    }
    assert_eq!(index.wal_record_count(), 3);
    assert_eq!(index.record_count(), 1);

    // Compaction moves them into the main record count.
    index.compact().expect("compact");
    assert_eq!(index.wal_record_count(), 0);
    assert_eq!(index.record_count(), 4);

    // Round two: the WAL count starts again from zero.
    for (doc_id, base, label) in [("w4", 0.4, "a4"), ("w5", 0.5, "a5")] {
        index.append(doc_id, &sample_vector(base, dim)).expect(label);
    }
    assert_eq!(index.wal_record_count(), 2);
    assert_eq!(index.record_count(), 4);

    let hits = index
        .search_top_k(&sample_vector(1.0, dim), 100, None)
        .expect("search");
    assert_eq!(hits.len(), 6);

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
3050
/// Soft-deleting a document that was only ever appended must drop the WAL
/// record count to zero and hide it from search, both immediately and after
/// reopening the index.
#[test]
fn soft_delete_removes_wal_only_record_and_persists() {
    let path = temp_index_path("wal-soft-delete-only");
    let dim = 4;
    let unit_y = [0.0, 1.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("main-0", &sample_vector(1.0, dim))
        .expect("write");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    index.append("wal-only", &unit_y).expect("append wal-only");
    assert_eq!(index.wal_record_count(), 1);

    let removed = index.soft_delete("wal-only").expect("soft delete wal-only");
    assert!(removed);
    assert_eq!(index.wal_record_count(), 0);

    let hits = index.search_top_k(&unit_y, 10, None).expect("search");
    assert!(!hits.iter().any(|hit| hit.doc_id == "wal-only"));

    // The deletion must also hold for a fresh handle.
    drop(index);
    let reopened = VectorIndex::open(&path).expect("reopen");
    assert_eq!(reopened.wal_record_count(), 0);

    let reopened_hits = reopened
        .search_top_k(&unit_y, 10, None)
        .expect("search after reopen");
    assert!(!reopened_hits.iter().any(|hit| hit.doc_id == "wal-only"));

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
3086
/// Soft-deleting a document that exists in the main index and also has a
/// pending WAL update must remove it from search entirely while leaving
/// other pending WAL entries in place.
#[test]
fn soft_delete_clears_pending_wal_updates_for_same_doc_id() {
    let path = temp_index_path("wal-soft-delete-main-and-wal");
    let dim = 4;
    let unit_y = [0.0, 1.0, 0.0, 0.0];

    let mut writer = VectorIndex::create(&path, "test", dim).expect("writer");
    writer
        .write_record("doc-a", &[1.0, 0.0, 0.0, 0.0])
        .expect("write doc-a");
    writer.finish().expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    index.append("doc-a", &unit_y).expect("append doc-a update");
    index
        .append("doc-b", &[0.0, 0.0, 1.0, 0.0])
        .expect("append doc-b");
    assert_eq!(index.wal_record_count(), 2);

    let removed = index.soft_delete("doc-a").expect("soft delete doc-a");
    assert!(removed);
    let pending = index.wal_record_count();
    assert_eq!(pending, 1, "doc-a WAL entries should be purged");

    let hits = index.search_top_k(&unit_y, 10, None).expect("search");
    assert!(
        !hits.iter().any(|hit| hit.doc_id == "doc-a"),
        "doc-a should not be searchable from main or WAL"
    );
    assert!(hits.iter().any(|hit| hit.doc_id == "doc-b"));

    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();
}
3126
/// An index created with zero records must accept an append, return the
/// appended record from search, and compact it into the main index.
#[test]
fn empty_index_append_only() {
    let path = temp_index_path("wal-empty-append");
    let dim = 4;
    let unit_x = [1.0, 0.0, 0.0, 0.0];

    // Finish the writer immediately so the main index holds no records.
    VectorIndex::create(&path, "test", dim)
        .expect("writer")
        .finish()
        .expect("finish");

    let mut index = VectorIndex::open(&path).expect("open");
    assert_eq!(index.record_count(), 0);

    index.append("first", &unit_x).expect("append");
    assert_eq!(index.wal_record_count(), 1);

    let hits = index.search_top_k(&unit_x, 10, None).expect("search");
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].doc_id, "first");

    let report = index.compact().expect("compact");
    assert_eq!(report.main_records_before, 0);
    assert_eq!(report.wal_records, 1);
    assert_eq!(report.total_records_after, 1);
    assert_eq!(index.record_count(), 1);

    std::fs::remove_file(&path).ok();
}
3161
/// `bytes_per_element()` must report 4 for `F32` and 2 for `F16`.
#[test]
fn quantization_bytes_per_element() {
    for (quantization, width) in [(Quantization::F32, 4), (Quantization::F16, 2)] {
        assert_eq!(quantization.bytes_per_element(), width);
    }
}
3169
/// Wire values 0 and 1 must decode to `F32` and `F16` respectively.
#[test]
fn quantization_from_wire_valid() {
    let path = Path::new("test.fsvi");
    for (wire, expected) in [(0_u8, Quantization::F32), (1_u8, Quantization::F16)] {
        let decoded = Quantization::from_wire(wire, path).unwrap();
        assert_eq!(decoded, expected);
    }
}
3176
/// Wire values 2 and 255 must be rejected by `from_wire`.
#[test]
fn quantization_from_wire_invalid() {
    let path = Path::new("test.fsvi");
    for wire in [2_u8, 255_u8] {
        assert!(Quantization::from_wire(wire, path).is_err());
    }
}
3183
/// An alignment of zero must return the value unchanged.
#[test]
fn align_up_zero_alignment() {
    let aligned = align_up(42, 0).unwrap();
    assert_eq!(aligned, 42);
}
3190
/// A value that is already a multiple of the alignment must be returned as is.
#[test]
fn align_up_already_aligned() {
    let aligned = align_up(128, 64).unwrap();
    assert_eq!(aligned, 128);
}
3195
/// Zero aligned to 64 must stay zero.
#[test]
fn align_up_zero_value() {
    let aligned = align_up(0, 64).unwrap();
    assert_eq!(aligned, 0);
}
3200
/// A value one past a 64-byte boundary must round up to the next boundary.
#[test]
fn align_up_one_over() {
    let aligned = align_up(65, 64).unwrap();
    assert_eq!(aligned, 128);
}
3205
/// Hashing empty input must return the constant `0xcbf2_9ce4_8422_2325`
/// (the standard 64-bit FNV offset basis).
#[test]
fn fnv1a_hash_empty_input() {
    let offset_basis = 0xcbf2_9ce4_8422_2325;
    assert_eq!(fnv1a_hash(b""), offset_basis);
}
3213
/// Hashing the same bytes twice must give the same value.
#[test]
fn fnv1a_hash_deterministic() {
    let first = fnv1a_hash(b"hello");
    let second = fnv1a_hash(b"hello");
    assert_eq!(first, second);
}
3220
/// The two sample doc ids used throughout these tests must hash differently.
#[test]
fn fnv1a_hash_different_inputs_differ() {
    let hash_a = fnv1a_hash(b"doc-a");
    let hash_b = fnv1a_hash(b"doc-b");
    assert_ne!(hash_a, hash_b);
}
3227
/// `is_tombstoned_flags` must be true for `RECORD_FLAG_TOMBSTONE` and `0x0003`
/// and false for `0x0000` and `0x0002`.
#[test]
fn tombstone_flag_logic() {
    let cases = [
        (0x0000, false),
        (RECORD_FLAG_TOMBSTONE, true),
        (0x0003, true),
        (0x0002, false),
    ];
    for &(flags, expected) in &cases {
        assert_eq!(is_tombstoned_flags(flags), expected);
    }
}
3237
/// An empty value for the `embedder_id` field must be rejected.
#[test]
fn validate_header_string_empty_embedder_id_rejected() {
    assert!(validate_header_string("", "embedder_id").is_err());
}
3245
/// An empty value for the `embedder_revision` field must be accepted.
#[test]
fn validate_header_string_empty_embedder_revision_ok() {
    assert!(validate_header_string("", "embedder_revision").is_ok());
}
3251
/// An ordinary non-empty embedder id must be accepted.
#[test]
fn validate_header_string_normal_ok() {
    assert!(validate_header_string("potion-128M", "embedder_id").is_ok());
}
3257
/// A cloned `VectorMetadata` must compare equal to the value it was cloned from.
#[test]
fn vector_metadata_clone_eq() {
    let original = VectorMetadata {
        embedder_id: "test".to_owned(),
        embedder_revision: "v1".to_owned(),
        dimension: 256,
        quantization: Quantization::F16,
        compaction_gen: 0,
        record_count: 100,
        vectors_offset: 1024,
    };
    let duplicate = original.clone();
    assert_eq!(original, duplicate);
}
3274
/// Creating an index with dimension zero must fail with
/// `SearchError::InvalidConfig`.
#[test]
fn create_zero_dimension_rejected() {
    let path = temp_index_path("zero-dim");
    let outcome = VectorIndex::create(&path, "test", 0);
    assert!(matches!(
        outcome,
        Err(SearchError::InvalidConfig { .. })
    ));
}
3287
/// Creating an index with an empty embedder id must fail.
#[test]
fn create_empty_embedder_id_rejected() {
    let path = temp_index_path("empty-embedder");
    assert!(VectorIndex::create(&path, "", 4).is_err());
}
3294
/// An empty embedder revision must be accepted at creation and read back as
/// an empty string after reopening.
#[test]
fn create_with_revision_empty_revision_ok() {
    let path = temp_index_path("empty-rev");

    VectorIndex::create_with_revision(&path, "test", "", 4, Quantization::F16)
        .unwrap()
        .finish()
        .unwrap();

    let reopened = VectorIndex::open(&path).unwrap();
    assert_eq!(reopened.embedder_revision(), "");

    std::fs::remove_file(&path).ok();
}
3305
/// Writing an embedding that contains NaN must fail with an error whose
/// message mentions "non-finite".
#[test]
fn write_record_nan_embedding_rejected() {
    let path = temp_index_path("nan-embed");
    let mut writer = VectorIndex::create(&path, "test", 3).unwrap();

    let outcome = writer.write_record("doc", &[1.0, f32::NAN, 0.0]);
    assert!(outcome.is_err());

    let err = outcome.unwrap_err().to_string();
    assert!(
        err.contains("non-finite"),
        "expected non-finite error, got: {err}"
    );
}
3320
/// Writing an embedding that contains positive infinity must fail.
#[test]
fn write_record_inf_embedding_rejected() {
    let path = temp_index_path("inf-embed");
    let mut writer = VectorIndex::create(&path, "test", 3).unwrap();

    let with_infinity = [1.0, f32::INFINITY, 0.0];
    let outcome = writer.write_record("doc", &with_infinity);
    assert!(outcome.is_err());
}
3328
/// Opening a path at which no index was written must fail with
/// `SearchError::IndexNotFound`.
#[test]
fn open_nonexistent_file_returns_index_not_found() {
    let path = temp_index_path("nonexistent-open");
    let outcome = VectorIndex::open(&path);
    assert!(matches!(
        outcome,
        Err(SearchError::IndexNotFound { .. })
    ));
}
3341
/// Chops the last four bytes off a valid index file and checks that `open`
/// fails with a message describing the truncation.
#[test]
fn open_truncated_file_detected() {
    let path = temp_index_path("truncated-open");
    let mut writer = VectorIndex::create(&path, "test", 4).unwrap();
    writer.write_record("doc-0", &[1.0, 0.0, 0.0, 0.0]).unwrap();
    writer.finish().unwrap();

    // Rewrite the file without its final four bytes.
    let bytes = std::fs::read(&path).unwrap();
    let kept = bytes.len() - 4;
    std::fs::write(&path, &bytes[..kept]).unwrap();

    let outcome = VectorIndex::open(&path);
    assert!(outcome.is_err());

    let err = format!("{}", outcome.unwrap_err());
    let mentions_truncation = ["truncated", "too small", "extends"]
        .iter()
        .any(|needle| err.contains(*needle));
    assert!(mentions_truncation, "expected truncation error, got: {err}");

    std::fs::remove_file(&path).ok();
}
3362
/// The file magic must be the four ASCII bytes `FSVI`.
#[test]
fn fsvi_magic_is_four_bytes() {
    let magic = FSVI_MAGIC;
    assert_eq!(magic.len(), 4);
    assert_eq!(&magic, b"FSVI");
}
3370
/// Pins the on-disk format version constant to 1.
#[test]
fn fsvi_version_is_one() {
    let version = FSVI_VERSION;
    assert_eq!(version, 1);
}
3375
/// Pins the record size constant to 16 bytes.
#[test]
fn record_size_is_sixteen() {
    let record_size = RECORD_SIZE_BYTES;
    assert_eq!(record_size, 16);
}
3380
/// Values written to an F16 index must be readable through `vector_at_f16`
/// within a tolerance of 0.01.
#[test]
fn vector_at_f16_roundtrip() {
    let path = temp_index_path("f16-at-roundtrip");
    let expected = [0.5, -0.5, 1.0];

    let mut writer =
        VectorIndex::create_with_revision(&path, "test", "r1", 3, Quantization::F16).unwrap();
    writer.write_record("doc", &expected).unwrap();
    writer.finish().unwrap();

    let index = VectorIndex::open(&path).unwrap();
    let stored = index.vector_at_f16(0).unwrap();
    assert_eq!(stored.len(), 3);
    for (half, want) in stored.iter().zip(expected) {
        assert!((half.to_f32() - want).abs() < 0.01);
    }

    std::fs::remove_file(&path).ok();
}
3400
/// `vector_at_f16` must also work on an F32-quantized index; the first
/// component is checked within a tolerance of 0.01.
#[test]
fn vector_at_f16_from_f32_index() {
    let path = temp_index_path("f16-from-f32");

    let mut writer =
        VectorIndex::create_with_revision(&path, "test", "r1", 3, Quantization::F32).unwrap();
    writer.write_record("doc", &[0.25, -0.75, 1.0]).unwrap();
    writer.finish().unwrap();

    let index = VectorIndex::open(&path).unwrap();
    let halves = index.vector_at_f16(0).unwrap();
    assert_eq!(halves.len(), 3);

    let first_component = halves[0].to_f32();
    assert!((first_component - 0.25).abs() < 0.01);

    std::fs::remove_file(&path).ok();
}
3418
/// `metadata()` must echo the creation parameters, report one record, and
/// expose a vectors offset that is a multiple of 64.
#[test]
fn metadata_accessor_returns_consistent_data() {
    let path = temp_index_path("metadata-accessor");
    {
        let mut writer =
            VectorIndex::create_with_revision(&path, "emb-1", "rev-9", 16, Quantization::F32)
                .unwrap();
        writer.write_record("d", &[0.0; 16]).unwrap();
        writer.finish().unwrap();
    }

    let index = VectorIndex::open(&path).unwrap();
    let meta = index.metadata();

    // Identity fields.
    assert_eq!(meta.embedder_id, "emb-1");
    assert_eq!(meta.embedder_revision, "rev-9");
    // Layout fields.
    assert_eq!(meta.dimension, 16);
    assert_eq!(meta.quantization, Quantization::F32);
    assert_eq!(meta.record_count, 1);
    assert_eq!(meta.vectors_offset % 64, 0);

    std::fs::remove_file(&path).ok();
}
3441
/// A freshly written record that was never deleted must not be reported as deleted.
#[test]
fn is_deleted_false_for_live_record() {
    let index_path = temp_index_path("is-deleted-live");
    let unit_x = [1.0, 0.0, 0.0, 0.0];

    let mut builder = VectorIndex::create(&index_path, "test", 4).unwrap();
    builder.write_record("doc", &unit_x).unwrap();
    builder.finish().unwrap();

    let opened = VectorIndex::open(&index_path).unwrap();
    let deleted = opened.is_deleted(0);
    assert!(!deleted);

    std::fs::remove_file(&index_path).ok();
}
3456
/// An index with zero records must report a tombstone ratio of (effectively) zero
/// and must not ask for a vacuum.
#[test]
fn tombstone_ratio_empty_index_is_zero() {
    let index_path = temp_index_path("tomb-ratio-empty");
    let builder = VectorIndex::create(&index_path, "test", 4).unwrap();
    builder.finish().unwrap();

    let opened = VectorIndex::open(&index_path).unwrap();

    let ratio = opened.tombstone_ratio();
    assert!(ratio.abs() < f64::EPSILON);

    let wants_vacuum = opened.needs_vacuum();
    assert!(!wants_vacuum);

    std::fs::remove_file(&index_path).ok();
}
3471
/// The default `WalConfig` must carry strictly positive compaction thresholds.
#[test]
fn wal_config_default_values() {
    let WalConfig {
        compaction_threshold,
        compaction_ratio,
        ..
    } = WalConfig::default();

    assert!(compaction_threshold > 0);
    assert!(compaction_ratio > 0.0);
}
3480
/// An F32-quantized index must return written vectors bit-for-bit and preserve
/// the embedder revision string supplied at creation.
#[test]
fn f32_roundtrip_with_revision() {
    use std::f32::consts::{E, PI};

    let index_path = temp_index_path("f32-rev-roundtrip");
    let original = vec![PI, E, 0.0, -1.0];

    let mut builder =
        VectorIndex::create_with_revision(&index_path, "f32-emb", "rev-42", 4, Quantization::F32)
            .unwrap();
    builder.write_record("doc", &original).unwrap();
    builder.finish().unwrap();

    let opened = VectorIndex::open(&index_path).unwrap();
    let recovered = opened.vector_at_f32(0).unwrap();
    assert_eq!(recovered, original, "f32 must roundtrip exactly");

    let revision = opened.embedder_revision();
    assert_eq!(revision, "rev-42");

    std::fs::remove_file(&index_path).ok();
}
3500
/// Flips every bit of one header byte on disk and expects `VectorIndex::open`
/// to fail with an error message that mentions the CRC.
#[test]
fn header_crc_detects_embedder_id_corruption() {
    let index_path = temp_index_path("crc-embedder-corrupt");
    let mut builder = VectorIndex::create(&index_path, "test-embedder-long", 4).unwrap();
    builder
        .write_record("doc", &[1.0, 0.0, 0.0, 0.0])
        .unwrap();
    builder.finish().unwrap();

    // NOTE(review): offset 10 is presumably inside the serialized embedder id
    // (per the test name) — confirm against the header layout.
    let mut raw = std::fs::read(&index_path).unwrap();
    raw[10] ^= 0xFF;
    std::fs::write(&index_path, &raw).unwrap();

    let outcome = VectorIndex::open(&index_path);
    assert!(outcome.is_err());

    let err = outcome.unwrap_err().to_string();
    let mentions_crc = err.contains("CRC") || err.contains("crc");
    assert!(mentions_crc, "expected CRC error, got: {err}");

    std::fs::remove_file(&index_path).ok();
}
3525
/// Exercises the `Debug`, `Clone` and `PartialEq` implementations of `VacuumStats`.
#[test]
fn vacuum_stats_debug_clone_partial_eq() {
    let stats = VacuumStats {
        records_before: 10,
        records_after: 8,
        tombstones_removed: 2,
        bytes_reclaimed: 1024,
        duration: Duration::from_millis(5),
    };

    // Debug output names the type and renders fields as `name: value`.
    let rendered = format!("{stats:?}");
    for needle in ["VacuumStats", "records_before: 10"] {
        assert!(rendered.contains(needle));
    }

    // A clone compares equal to its source.
    let duplicate = stats.clone();
    assert_eq!(stats, duplicate);
}
3544
/// Exercises the `Debug`, `Copy` and `PartialEq` implementations of `Quantization`.
#[test]
fn quantization_debug_clone_copy_eq() {
    let (half, full) = (Quantization::F16, Quantization::F32);

    // Debug output contains the variant name.
    assert!(format!("{half:?}").contains("F16"));
    assert!(format!("{full:?}").contains("F32"));

    // Copies compare equal to their source; the source stays usable afterwards.
    let half_again = half;
    assert_eq!(half, half_again);
    let full_again = full;
    assert_eq!(full, full_again);

    // Distinct variants are unequal.
    assert_ne!(half, full);
}
3561
/// The `Debug` rendering of an opened index must contain the type name.
#[test]
fn vector_index_debug_includes_path() {
    let index_path = temp_index_path("debug-fmt");
    let builder = VectorIndex::create(&index_path, "test", 4).unwrap();
    builder.finish().unwrap();

    let opened = VectorIndex::open(&index_path).unwrap();
    let rendered = format!("{opened:?}");
    assert!(rendered.contains("VectorIndex"));

    std::fs::remove_file(&index_path).ok();
}
3574
/// With 100 main records and one WAL entry the index does not ask for compaction
/// under its initial config; after installing a very aggressive `WalConfig`
/// (threshold 1, ratio 0.001) it must.
#[test]
fn set_wal_config_overrides_defaults() {
    let index_path = temp_index_path("wal-cfg-override");
    let dim = 4;

    let mut builder = VectorIndex::create(&index_path, "test", dim).unwrap();
    for i in 0..100 {
        let doc_id = format!("d{i}");
        builder
            .write_record(&doc_id, &sample_vector(0.1, dim))
            .unwrap();
    }
    builder.finish().unwrap();

    let mut opened = VectorIndex::open(&index_path).unwrap();
    opened.append("wal-1", &sample_vector(0.5, dim)).unwrap();
    assert!(!opened.needs_compaction());

    let aggressive = WalConfig {
        compaction_threshold: 1,
        compaction_ratio: 0.001,
        fsync_on_write: false,
    };
    opened.set_wal_config(aggressive);
    assert!(opened.needs_compaction());

    // Best-effort cleanup of both the index and its WAL sidecar.
    std::fs::remove_file(&index_path).ok();
    std::fs::remove_file(wal::wal_path_for(&index_path)).ok();
}
3603
/// Hash lookups against an index with no records must always return `None`.
#[test]
fn find_index_by_doc_hash_empty_index_none() {
    let index_path = temp_index_path("hash-empty");
    let builder = VectorIndex::create(&index_path, "test", 4).unwrap();
    builder.finish().unwrap();

    let opened = VectorIndex::open(&index_path).unwrap();
    for probe in [0xDEAD_BEEF, 0] {
        assert!(opened.find_index_by_doc_hash(probe).is_none());
    }

    std::fs::remove_file(&index_path).ok();
}
3616
/// `get_embeddings` must return one slot per requested hash, in request order,
/// with `Some` for stored documents and `None` for unknown ones.
#[test]
fn get_embeddings_mixed_hit_miss() {
    let index_path = temp_index_path("emb-mixed");
    let mut builder =
        VectorIndex::create_with_revision(&index_path, "test", "r1", 3, Quantization::F16)
            .unwrap();
    builder.write_record("alpha", &[1.0, 0.0, 0.0]).unwrap();
    builder.write_record("beta", &[0.0, 1.0, 0.0]).unwrap();
    builder.finish().unwrap();

    let opened = VectorIndex::open(&index_path).unwrap();

    // "gamma" was never written, so the middle lookup is the expected miss.
    let alpha = fnv1a_hash(b"alpha");
    let beta = fnv1a_hash(b"beta");
    let gamma = fnv1a_hash(b"gamma");
    let lookups = [alpha, gamma, beta];

    let found = opened.get_embeddings(&lookups);
    assert_eq!(found.len(), 3);
    assert!(found[0].is_some(), "alpha should be found");
    assert!(found[1].is_none(), "gamma should be missing");
    assert!(found[2].is_some(), "beta should be found");

    std::fs::remove_file(&index_path).ok();
}
3639
/// Appending an empty batch must succeed and leave the WAL record count at zero.
#[test]
fn append_batch_empty_is_noop() {
    let index_path = temp_index_path("append-empty-batch");
    let builder = VectorIndex::create(&index_path, "test", 4).unwrap();
    builder.finish().unwrap();

    let mut opened = VectorIndex::open(&index_path).unwrap();
    opened.append_batch(&[]).unwrap();

    let pending = opened.wal_record_count();
    assert_eq!(pending, 0);

    std::fs::remove_file(&index_path).ok();
}
3652
/// Appending an embedding that contains NaN must fail with an error whose
/// message mentions "finite".
#[test]
fn append_nan_embedding_rejected() {
    let path = temp_index_path("append-nan");
    let writer = VectorIndex::create(&path, "test", 4).unwrap();
    writer.finish().unwrap();

    let mut index = VectorIndex::open(&path).unwrap();
    let result = index.append("doc", &[1.0, f32::NAN, 0.0, 0.0]);

    // Remove on-disk artifacts before asserting so neither a pass nor a failure
    // leaks temp files. Removal is best-effort: the WAL may never have been created.
    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();

    assert!(result.is_err());
    let err = format!("{}", result.unwrap_err());
    assert!(err.contains("finite"), "expected finite error, got: {err}");
}
3665
/// Appending an embedding that contains an infinity must fail with an error
/// whose message mentions "finite".
#[test]
fn append_inf_embedding_rejected() {
    let path = temp_index_path("append-inf");
    let writer = VectorIndex::create(&path, "test", 4).unwrap();
    writer.finish().unwrap();

    let mut index = VectorIndex::open(&path).unwrap();
    let result = index.append("doc", &[1.0, 0.0, f32::INFINITY, 0.0]);

    // Remove on-disk artifacts before asserting so neither a pass nor a failure
    // leaks temp files. Removal is best-effort: the WAL may never have been created.
    std::fs::remove_file(&path).ok();
    std::fs::remove_file(wal::wal_path_for(&path)).ok();

    assert!(result.is_err());
    let err = format!("{}", result.unwrap_err());
    assert!(err.contains("finite"), "expected finite error, got: {err}");
}
3678
/// `soft_delete` must return `true` only for the call that actually deletes the
/// document; repeated deletes of the same id return `false`.
#[test]
fn soft_delete_already_deleted_returns_false() {
    let index_path = temp_index_path("double-delete");
    let mut builder = VectorIndex::create(&index_path, "test", 4).unwrap();
    builder
        .write_record("doc", &[1.0, 0.0, 0.0, 0.0])
        .unwrap();
    builder.finish().unwrap();

    let mut opened = VectorIndex::open(&index_path).unwrap();
    let attempts = [
        ("first delete", true),
        ("second delete", false),
        ("third delete", false),
    ];
    for (label, expect_removed) in attempts {
        let removed = opened.soft_delete("doc").unwrap();
        assert_eq!(removed, expect_removed, "{label}");
    }

    std::fs::remove_file(&index_path).ok();
}
3693
/// Installs a lenient custom `WalConfig` (threshold 99, ratio 0.90), compacts, and
/// checks that a single post-compaction WAL entry still does not request
/// compaction under those custom thresholds.
#[test]
fn compact_preserves_wal_config() {
    let index_path = temp_index_path("compact-cfg");
    let dim = 4;

    let mut builder = VectorIndex::create(&index_path, "test", dim).unwrap();
    for i in 0..20 {
        let doc_id = format!("d{i}");
        builder
            .write_record(&doc_id, &sample_vector(0.1, dim))
            .unwrap();
    }
    builder.finish().unwrap();

    let mut opened = VectorIndex::open(&index_path).unwrap();
    opened.set_wal_config(WalConfig {
        compaction_threshold: 99,
        compaction_ratio: 0.90,
        fsync_on_write: false,
    });

    // Put one entry in the WAL, then fold it into the main index.
    opened.append("wal-1", &sample_vector(0.5, dim)).unwrap();
    opened.compact().unwrap();
    assert_eq!(opened.wal_record_count(), 0);

    // One fresh WAL entry must stay below the custom thresholds.
    opened.append("wal-2", &sample_vector(0.3, dim)).unwrap();
    assert!(!opened.needs_compaction());

    std::fs::remove_file(&index_path).ok();
    std::fs::remove_file(wal::wal_path_for(&index_path)).ok();
}
3726
/// Makes the WAL's parent directory read-only so that `soft_delete` of a WAL-resident
/// document fails, then checks that both WAL entries are still counted and
/// still searchable afterwards.
///
/// If the directory permissions cannot be changed, the test cleans up and
/// returns early without asserting anything.
#[test]
fn soft_delete_wal_restores_state_on_rewrite_failure() {
    let path = temp_index_path("wal-delete-restore");
    let dim = 4;

    // Best-effort removal of the index and its WAL sidecar.
    let cleanup = || {
        std::fs::remove_file(&path).ok();
        std::fs::remove_file(wal::wal_path_for(&path)).ok();
    };

    let mut builder = VectorIndex::create(&path, "test", dim).unwrap();
    builder
        .write_record("main-0", &sample_vector(1.0, dim))
        .unwrap();
    builder.finish().unwrap();

    let mut opened = VectorIndex::open(&path).unwrap();
    opened.append("wal-a", &[0.0, 1.0, 0.0, 0.0]).unwrap();
    opened.append("wal-b", &[0.0, 0.0, 1.0, 0.0]).unwrap();
    assert_eq!(opened.wal_record_count(), 2);

    // Lock the directory holding the WAL so the delete cannot rewrite it.
    let wal_file = wal::wal_path_for(&path);
    let wal_dir = wal_file.parent().unwrap();
    let saved_perms = fs::metadata(wal_dir).unwrap().permissions();
    let mut locked = saved_perms.clone();
    locked.set_readonly(true);
    if fs::set_permissions(wal_dir, locked).is_err() {
        cleanup();
        return;
    }

    let outcome = opened.soft_delete("wal-a");

    // Restore permissions before asserting so later steps can touch the directory.
    fs::set_permissions(wal_dir, saved_perms).unwrap();

    assert!(outcome.is_err(), "expected error from read-only directory");

    assert_eq!(
        opened.wal_record_count(),
        2,
        "WAL entries should be restored after rewrite failure"
    );

    let hits = opened
        .search_top_k(&[0.0, 1.0, 0.0, 0.0], 10, None)
        .unwrap();
    for expected_id in ["wal-a", "wal-b"] {
        assert!(hits.iter().any(|hit| hit.doc_id == expected_id));
    }

    cleanup();
}
3779
/// Regression check that a document present in both the main index and a
/// left-over WAL is returned only once after reopening.
///
/// NOTE(review): going by the test name, rewriting the main file at generation 2
/// while the old WAL stays on disk appears to simulate a compaction that crashed
/// before WAL cleanup — confirm against the compaction implementation.
#[test]
fn repro_duplicate_entries_on_compaction_crash() {
    let path = temp_index_path("compaction-crash");
    let dim = 4;
    let vec_a = [1.0, 0.0, 0.0, 0.0];
    let vec_b = [0.0, 1.0, 0.0, 0.0];
    let query = [1.0, 1.0, 0.0, 0.0];

    // Main index holds doc-A only.
    let mut builder =
        VectorIndex::create_with_revision(&path, "test", "v1", dim, Quantization::F16).unwrap();
    builder.write_record("doc-A", &vec_a).unwrap();
    builder.finish().unwrap();

    // doc-B goes through `append`, so it lives in the WAL.
    let mut live = VectorIndex::open(&path).unwrap();
    live.append("doc-B", &vec_b).unwrap();

    let before = live.search_top_k(&query, 10, None).unwrap();
    assert_eq!(before.len(), 2);

    drop(live);

    // Rewrite the main file with both documents at generation 2; the WAL is left in place.
    let mut compact_writer =
        VectorIndex::create_with_revision(&path, "test", "v1", dim, Quantization::F16)
            .unwrap()
            .with_generation(2);
    compact_writer.write_record("doc-A", &vec_a).unwrap();
    compact_writer.write_record("doc-B", &vec_b).unwrap();
    compact_writer.finish().unwrap();

    let index_reopened = VectorIndex::open(&path).unwrap();
    let hits = index_reopened.search_top_k(&query, 10, None).unwrap();

    hits.iter()
        .for_each(|hit| println!("Hit: {} score={}", hit.doc_id, hit.score));

    // Clean up before asserting so a failing assertion does not leak temp files.
    let _ = fs::remove_file(&path);
    let _ = wal::remove_wal(&wal::wal_path_for(&path));

    let hit_count = hits.len();
    assert_eq!(
        hit_count, 2,
        "Should have exactly 2 hits (A and B), found {hit_count}"
    );
    let b_count = hits.iter().filter(|hit| hit.doc_id == "doc-B").count();
    assert_eq!(b_count, 1, "Should have exactly 1 'doc-B', found {b_count}");
}
3853
3854 }