omendb_core/omen/
file.rs

1//! `OmenFile` - main API for .omen format
2//!
3//! Storage backend for `VectorStore`. Uses bincode for efficient binary serialization.
4
5use crate::omen::{
6    align_to_page,
7    header::{OmenHeader, HEADER_SIZE},
8    section::{SectionEntry, SectionType},
9    vectors::VectorSection,
10    wal::{Wal, WalEntry, WalEntryType},
11};
12use anyhow::Result;
13use memmap2::MmapMut;
14use serde::{Deserialize, Serialize};
15use serde_json::Value as JsonValue;
16use std::collections::HashMap;
17use std::fs::{File, OpenOptions};
18use std::io::{self, Read, Seek, SeekFrom, Write};
19use std::path::{Path, PathBuf};
20
/// Apply platform-specific settings to `opts` before a database file is opened.
///
/// Windows build: requests read, write, and delete sharing on the handle to
/// avoid "Access is denied" errors.
#[cfg(windows)]
fn configure_open_options(opts: &mut OpenOptions) {
    use std::os::windows::fs::OpenOptionsExt;

    const FILE_SHARE_READ: u32 = 0x1;
    const FILE_SHARE_WRITE: u32 = 0x2;
    const FILE_SHARE_DELETE: u32 = 0x4;

    opts.share_mode(FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE);
}
29
/// Apply platform-specific settings to `opts` before a database file is opened.
///
/// Non-Windows build: intentionally leaves `opts` untouched.
#[cfg(not(windows))]
fn configure_open_options(_opts: &mut OpenOptions) {}
34
/// Serializable metadata for checkpoint persistence
///
/// `OmenFile::checkpoint` serializes this struct with bincode into the
/// `MetadataRaw` section, and `OmenFile::open` deserializes it from there.
///
/// NOTE(review): bincode is not a self-describing format, so the order and
/// types of these fields are presumably part of the on-disk layout — confirm
/// before reordering or changing them.
#[derive(Serialize, Deserialize, Default)]
struct CheckpointMetadata {
    /// String ID → internal index
    id_to_index: HashMap<String, u32>,
    /// Internal index → string ID
    index_to_id: HashMap<u32, String>,
    /// Deleted vector indices (tombstones; the value is the "deleted" flag)
    deleted: HashMap<u32, bool>,
    /// Configuration values (e.g. the "dimensions" and "count" keys)
    config: HashMap<String, u64>,
    /// Per-vector metadata (JSON bytes), keyed by internal index
    metadata: HashMap<u32, Vec<u8>>,
}
49
/// WAL length above which `OmenFile::insert` triggers a checkpoint.
///
/// Compared against `Wal::len()` after every insert.
const CHECKPOINT_THRESHOLD: u64 = 1_000;
52
/// `OmenFile` - single-file vector database
///
/// Storage layer for vectors, metadata, and serialized HNSW index.
/// Graph traversal is handled by `HNSWIndex` in the vector layer.
///
/// Mutations are kept in memory (and, for `insert`/`delete`, logged to the
/// WAL) until `checkpoint()` rewrites the sections of the main file.
pub struct OmenFile {
    /// Path of the `.omen` file (see `compute_omen_path`)
    path: PathBuf,
    /// Read/write handle to the `.omen` file
    file: File,
    /// Mapping of the main file; `None` after `create()` and while
    /// `checkpoint()` is resizing/rewriting the file
    mmap: Option<MmapMut>,
    /// In-memory copy of the file header; written to disk by `checkpoint()`
    header: OmenHeader,

    // In-memory state (for writes before checkpoint)
    /// Vectors by internal index; an empty `Vec` marks a slot with no vector
    /// in RAM (see `get_vector` / `load_all_vectors`)
    vectors_mem: Vec<Vec<f32>>,
    /// String ID → internal index
    id_to_index: HashMap<String, u32>,
    /// Internal index → string ID
    index_to_id: HashMap<u32, String>,
    /// Per-vector metadata as JSON bytes, keyed by internal index
    metadata_mem: HashMap<u32, Vec<u8>>,
    /// Tombstones keyed by internal index
    deleted: HashMap<u32, bool>,
    /// Configuration values ("dimensions", "count", "quantization", ...)
    config: HashMap<String, u64>,

    // WAL for durability
    /// Write-ahead log, stored at `<path>.wal`
    wal: Wal,

    // Serialized HNSW index (persisted on checkpoint, loaded on open)
    hnsw_index_bytes: Option<Vec<u8>>,
}
77
78impl OmenFile {
79    /// Compute .omen path by appending extension (preserves full filename)
80    ///
81    /// Handles filenames with multiple dots (e.g., `test.db_64` → `test.db_64.omen`)
82    /// by appending `.omen` rather than replacing the extension.
83    #[must_use]
84    pub fn compute_omen_path(path: &Path) -> PathBuf {
85        if path.extension().is_some_and(|ext| ext == "omen") {
86            path.to_path_buf()
87        } else {
88            // Append .omen (don't use with_extension which replaces)
89            let mut omen = path.as_os_str().to_os_string();
90            omen.push(".omen");
91            PathBuf::from(omen)
92        }
93    }
94
95    /// Create a new .omen database
96    pub fn create(path: impl AsRef<Path>, dimensions: u32) -> io::Result<Self> {
97        let path = path.as_ref();
98        let omen_path = Self::compute_omen_path(path);
99        let wal_path = {
100            let mut wal = path.as_os_str().to_os_string();
101            wal.push(".wal");
102            PathBuf::from(wal)
103        };
104
105        // Create empty file with header
106        let mut opts = OpenOptions::new();
107        opts.read(true).write(true).create(true).truncate(true);
108        configure_open_options(&mut opts);
109        let mut file = opts.open(&omen_path)?;
110
111        let header = OmenHeader::new(dimensions);
112        file.write_all(&header.to_bytes())?;
113        file.sync_all()?;
114
115        let wal = Wal::open(&wal_path)?;
116
117        let mut config = HashMap::new();
118        config.insert("dimensions".to_string(), u64::from(dimensions));
119
120        Ok(Self {
121            path: omen_path,
122            file,
123            mmap: None,
124            header,
125            vectors_mem: Vec::new(),
126            id_to_index: HashMap::new(),
127            index_to_id: HashMap::new(),
128            metadata_mem: HashMap::new(),
129            deleted: HashMap::new(),
130            config,
131            wal,
132            hnsw_index_bytes: None,
133        })
134    }
135
136    /// Open an existing .omen database
137    pub fn open(path: impl AsRef<Path>) -> io::Result<Self> {
138        let path = path.as_ref();
139        let omen_path = Self::compute_omen_path(path);
140        let wal_path = {
141            let mut wal = path.as_os_str().to_os_string();
142            wal.push(".wal");
143            PathBuf::from(wal)
144        };
145
146        let mut opts = OpenOptions::new();
147        opts.read(true).write(true);
148        configure_open_options(&mut opts);
149        let mut file = opts.open(&omen_path)?;
150
151        // Read header
152        let mut header_buf = [0u8; HEADER_SIZE];
153        file.read_exact(&mut header_buf)?;
154        let header = OmenHeader::from_bytes(&header_buf)?;
155
156        // Create mmap if file has data
157        let file_len = file.metadata()?.len() as usize;
158        let mmap = if file_len > HEADER_SIZE {
159            Some(unsafe { MmapMut::map_mut(&file)? })
160        } else {
161            None
162        };
163
164        // Open WAL
165        let wal = Wal::open(&wal_path)?;
166
167        // Initialize config from header
168        let mut config = HashMap::new();
169        config.insert("dimensions".to_string(), u64::from(header.dimensions));
170        config.insert("count".to_string(), header.count);
171
172        // Load vectors from checkpoint if present
173        let mut vectors_mem = Vec::new();
174        let mut id_to_index = HashMap::new();
175        let mut index_to_id = HashMap::new();
176        let mut metadata_mem = HashMap::new();
177        let mut deleted = HashMap::new();
178
179        if let Some(ref mmap) = mmap {
180            // Load vectors
181            if let Some(vec_section) = header.get_section(SectionType::Vectors) {
182                let vec_offset = vec_section.offset as usize;
183                let dim = header.dimensions as usize;
184                let count = header.count as usize;
185
186                if dim > 0 && count > 0 {
187                    for i in 0..count {
188                        let start = vec_offset + i * dim * 4;
189                        let end = start + dim * 4;
190                        if end <= mmap.len() {
191                            let bytes = &mmap[start..end];
192                            let vector: Vec<f32> = bytes
193                                .chunks(4)
194                                .map(|chunk| {
195                                    let arr: [u8; 4] = chunk.try_into().unwrap_or([0; 4]);
196                                    f32::from_le_bytes(arr)
197                                })
198                                .collect();
199                            vectors_mem.push(vector);
200                        }
201                    }
202                }
203            }
204
205            // Load metadata section (bincode-encoded CheckpointMetadata)
206            if let Some(meta_section) = header.get_section(SectionType::MetadataRaw) {
207                let meta_offset = meta_section.offset as usize;
208                let meta_len = meta_section.length as usize;
209                if meta_offset + meta_len <= mmap.len() {
210                    let meta_bytes = &mmap[meta_offset..meta_offset + meta_len];
211                    if let Ok(meta) = bincode::deserialize::<CheckpointMetadata>(meta_bytes) {
212                        id_to_index = meta.id_to_index;
213                        index_to_id = meta.index_to_id;
214                        deleted = meta.deleted;
215                        config.extend(meta.config);
216                        metadata_mem = meta.metadata;
217                    }
218                }
219            }
220        }
221
222        // Load HNSW index bytes (if present)
223        let hnsw_index_bytes = if let Some(ref mmap) = mmap {
224            if let Some(hnsw_section) = header.get_section(SectionType::HnswIndex) {
225                let hnsw_offset = hnsw_section.offset as usize;
226                let hnsw_len = hnsw_section.length as usize;
227                if hnsw_offset + hnsw_len <= mmap.len() {
228                    Some(mmap[hnsw_offset..hnsw_offset + hnsw_len].to_vec())
229                } else {
230                    None
231                }
232            } else {
233                None
234            }
235        } else {
236            None
237        };
238
239        let mut db = Self {
240            path: omen_path,
241            file,
242            mmap,
243            header,
244            vectors_mem,
245            id_to_index,
246            index_to_id,
247            metadata_mem,
248            deleted,
249            config,
250            wal,
251            hnsw_index_bytes,
252        };
253
254        // Replay WAL
255        db.recover()?;
256
257        Ok(db)
258    }
259
260    /// Recover from WAL
261    fn recover(&mut self) -> io::Result<()> {
262        let entries = self.wal.entries_after_checkpoint()?;
263
264        for entry in entries {
265            if !entry.verify() {
266                // Skip corrupted entries
267                continue;
268            }
269
270            match entry.header.entry_type {
271                WalEntryType::InsertNode => {
272                    self.replay_insert(&entry.data)?;
273                }
274                WalEntryType::DeleteNode => {
275                    self.replay_delete(&entry.data)?;
276                }
277                WalEntryType::UpdateNeighbors => {
278                    self.replay_neighbors(&entry.data)?;
279                }
280                WalEntryType::UpdateMetadata | WalEntryType::Checkpoint => {
281                    // No-op: metadata updates tracked in cloud-4uv, checkpoint is marker only
282                }
283            }
284        }
285
286        Ok(())
287    }
288
289    /// Replay insert from WAL
290    fn replay_insert(&mut self, data: &[u8]) -> io::Result<()> {
291        let mut cursor = std::io::Cursor::new(data);
292
293        // Read string ID
294        let mut len_buf = [0u8; 4];
295        cursor.read_exact(&mut len_buf)?;
296        let id_len = u32::from_le_bytes(len_buf) as usize;
297        let mut id_buf = vec![0u8; id_len];
298        cursor.read_exact(&mut id_buf)?;
299        let string_id = String::from_utf8_lossy(&id_buf).to_string();
300
301        // Read level
302        let mut level_buf = [0u8; 1];
303        cursor.read_exact(&mut level_buf)?;
304        let level = level_buf[0];
305
306        // Read vector
307        cursor.read_exact(&mut len_buf)?;
308        let vec_len = u32::from_le_bytes(len_buf) as usize;
309        let mut vector = vec![0.0f32; vec_len];
310        for val in &mut vector {
311            let mut f32_buf = [0u8; 4];
312            cursor.read_exact(&mut f32_buf)?;
313            *val = f32::from_le_bytes(f32_buf);
314        }
315
316        // Read metadata
317        cursor.read_exact(&mut len_buf)?;
318        let meta_len = u32::from_le_bytes(len_buf) as usize;
319        let mut metadata = vec![0u8; meta_len];
320        cursor.read_exact(&mut metadata)?;
321
322        // Apply insert (level is ignored - HNSW graph managed by HNSWIndex)
323        let _ = level; // Consumed from WAL but not stored (HNSWIndex manages graph)
324        let index = self.vectors_mem.len() as u32;
325        self.vectors_mem.push(vector);
326        self.id_to_index.insert(string_id.clone(), index);
327        self.index_to_id.insert(index, string_id);
328        if !metadata.is_empty() {
329            self.metadata_mem.insert(index, metadata);
330        }
331
332        Ok(())
333    }
334
335    /// Replay delete from WAL
336    fn replay_delete(&mut self, data: &[u8]) -> io::Result<()> {
337        let mut cursor = std::io::Cursor::new(data);
338
339        // Read string ID
340        let mut len_buf = [0u8; 4];
341        cursor.read_exact(&mut len_buf)?;
342        let id_len = u32::from_le_bytes(len_buf) as usize;
343        let mut id_buf = vec![0u8; id_len];
344        cursor.read_exact(&mut id_buf)?;
345        let string_id = String::from_utf8_lossy(&id_buf).to_string();
346
347        if let Some(&index) = self.id_to_index.get(&string_id) {
348            self.deleted.insert(index, true);
349        }
350
351        Ok(())
352    }
353
354    /// Replay neighbors update from WAL (no-op: graph managed by `HNSWIndex`)
355    #[allow(clippy::unused_self, clippy::unnecessary_wraps)]
356    fn replay_neighbors(&mut self, _data: &[u8]) -> io::Result<()> {
357        // Neighbor updates are consumed from WAL but not stored.
358        // HNSWIndex rebuilds graph from vectors on recovery.
359        Ok(())
360    }
361
362    /// Insert a vector
363    ///
364    /// Note: Graph management (HNSW) is handled by `HNSWIndex` in the vector layer.
365    /// This method only handles storage: WAL, vectors, metadata.
366    pub fn insert(&mut self, id: &str, vector: &[f32], metadata: Option<&[u8]>) -> io::Result<()> {
367        if vector.len() != self.header.dimensions as usize {
368            return Err(io::Error::new(
369                io::ErrorKind::InvalidInput,
370                format!(
371                    "Vector dimensions mismatch: expected {}, got {}",
372                    self.header.dimensions,
373                    vector.len()
374                ),
375            ));
376        }
377
378        let metadata_bytes = metadata.unwrap_or(b"{}");
379
380        // 1. Append to WAL (durable)
381        // Level 0 is placeholder - actual HNSW levels managed by HNSWIndex
382        let entry = WalEntry::insert_node(0, id, 0, vector, metadata_bytes);
383        self.wal.append(entry)?;
384        self.wal.sync()?;
385
386        // 2. Update in-memory state
387        let index = self.vectors_mem.len() as u32;
388        self.vectors_mem.push(vector.to_vec());
389        self.id_to_index.insert(id.to_string(), index);
390        self.index_to_id.insert(index, id.to_string());
391        if metadata_bytes != b"{}" {
392            self.metadata_mem.insert(index, metadata_bytes.to_vec());
393        }
394
395        self.header.count += 1;
396
397        // 3. Periodic checkpoint
398        if self.wal.len() > CHECKPOINT_THRESHOLD {
399            self.checkpoint()?;
400        }
401
402        Ok(())
403    }
404
405    /// Find k nearest neighbors (simple greedy search)
406    fn find_nearest(&self, query: &[f32], k: usize) -> Vec<u32> {
407        if self.vectors_mem.is_empty() {
408            return Vec::new();
409        }
410
411        // Simple brute force for now - full HNSW search would be more efficient
412        let mut distances: Vec<(u32, f32)> = self
413            .vectors_mem
414            .iter()
415            .enumerate()
416            .filter(|(i, _)| !self.deleted.get(&(*i as u32)).copied().unwrap_or(false))
417            .map(|(i, v)| (i as u32, l2_distance(query, v)))
418            .collect();
419
420        distances.sort_by(|a, b| a.1.total_cmp(&b.1));
421
422        distances.into_iter().take(k).map(|(id, _)| id).collect()
423    }
424
425    /// Search for k nearest neighbors
426    #[must_use]
427    pub fn search(&self, query: &[f32], k: usize) -> Vec<(String, f32)> {
428        if query.len() != self.header.dimensions as usize {
429            return Vec::new();
430        }
431
432        let indices = self.find_nearest(query, k);
433
434        indices
435            .into_iter()
436            .filter_map(|idx| {
437                let id = self.index_to_id.get(&idx)?;
438                let vector = self.vectors_mem.get(idx as usize)?;
439                let distance = l2_distance(query, vector);
440                Some((id.clone(), distance))
441            })
442            .collect()
443    }
444
445    /// Delete a vector by ID
446    pub fn delete(&mut self, id: &str) -> io::Result<bool> {
447        if let Some(&index) = self.id_to_index.get(id) {
448            // Log to WAL
449            let entry = WalEntry::delete_node(0, id);
450            self.wal.append(entry)?;
451            self.wal.sync()?;
452
453            self.deleted.insert(index, true);
454            Ok(true)
455        } else {
456            Ok(false)
457        }
458    }
459
460    /// Get vector count
461    #[must_use]
462    pub fn len(&self) -> u64 {
463        self.header.count
464    }
465
466    /// Check if empty
467    #[must_use]
468    pub fn is_empty(&self) -> bool {
469        self.header.count == 0
470    }
471
472    /// Get dimensions
473    #[must_use]
474    pub fn dimensions(&self) -> u32 {
475        self.header.dimensions
476    }
477
478    /// Checkpoint - compact WAL into main file
479    pub fn checkpoint(&mut self) -> io::Result<()> {
480        if self.vectors_mem.is_empty() && self.hnsw_index_bytes.is_none() {
481            return Ok(());
482        }
483
484        // Serialize metadata with bincode (much faster than JSON)
485        let checkpoint_meta = CheckpointMetadata {
486            id_to_index: self.id_to_index.clone(),
487            index_to_id: self.index_to_id.clone(),
488            deleted: self.deleted.clone(),
489            config: self.config.clone(),
490            metadata: self.metadata_mem.clone(),
491        };
492        let metadata_bytes = bincode::serialize(&checkpoint_meta)
493            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
494
495        // Calculate section sizes
496        let vector_size =
497            VectorSection::size_for_count(self.header.dimensions, self.vectors_mem.len() as u64)
498                as usize;
499        let graph_size = 0; // Graph managed by HNSWIndex, not stored in OmenFile
500        let metadata_size = metadata_bytes.len();
501        let hnsw_size = self.hnsw_index_bytes.as_ref().map_or(0, Vec::len);
502
503        // Calculate offsets (page-aligned)
504        let vector_offset = align_to_page(HEADER_SIZE);
505        let graph_offset = align_to_page(vector_offset + vector_size);
506        let metadata_offset = align_to_page(graph_offset + graph_size);
507        let hnsw_offset = align_to_page(metadata_offset + metadata_size);
508        let total_size = align_to_page(hnsw_offset + hnsw_size);
509
510        // Drop mmap before resizing file (required on Windows - file cannot be
511        // resized while memory-mapped)
512        self.mmap = None;
513
514        // Extend file
515        self.file.set_len(total_size as u64)?;
516
517        // Write vectors
518        self.file.seek(SeekFrom::Start(vector_offset as u64))?;
519        for vector in &self.vectors_mem {
520            for &val in vector {
521                self.file.write_all(&val.to_le_bytes())?;
522            }
523        }
524
525        // Graph section is empty - HNSWIndex stores graph in hnsw_index_bytes
526
527        // Write metadata
528        self.file.seek(SeekFrom::Start(metadata_offset as u64))?;
529        self.file.write_all(&metadata_bytes)?;
530
531        // Write HNSW index (if present)
532        if let Some(ref hnsw_bytes) = self.hnsw_index_bytes {
533            self.file.seek(SeekFrom::Start(hnsw_offset as u64))?;
534            self.file.write_all(hnsw_bytes)?;
535        }
536
537        // Update header
538        self.header.count = self.vectors_mem.len() as u64;
539        self.header.entry_point = 0; // Entry point managed by HNSWIndex
540        self.header.set_section(SectionEntry::new(
541            SectionType::Vectors,
542            vector_offset as u64,
543            vector_size as u64,
544        ));
545        self.header.set_section(SectionEntry::new(
546            SectionType::Graph,
547            graph_offset as u64,
548            graph_size as u64,
549        ));
550        self.header.set_section(SectionEntry::new(
551            SectionType::MetadataRaw,
552            metadata_offset as u64,
553            metadata_size as u64,
554        ));
555        if hnsw_size > 0 {
556            self.header.set_section(SectionEntry::new(
557                SectionType::HnswIndex,
558                hnsw_offset as u64,
559                hnsw_size as u64,
560            ));
561        }
562
563        // Write header
564        self.file.seek(SeekFrom::Start(0))?;
565        self.file.write_all(&self.header.to_bytes())?;
566        self.file.sync_all()?;
567
568        // Truncate WAL
569        self.wal.truncate()?;
570
571        // Write checkpoint marker
572        self.wal.append(WalEntry::checkpoint(0))?;
573        self.wal.sync()?;
574
575        // Update mmap
576        self.mmap = Some(unsafe { MmapMut::map_mut(&self.file)? });
577
578        Ok(())
579    }
580}
581
582// ============================================================================
583// Storage API for VectorStore
584// ============================================================================
585
586impl OmenFile {
587    /// Store a vector by internal index
588    pub fn put_vector(&mut self, id: usize, vector: &[f32]) -> Result<()> {
589        let new_len = id + 1;
590        if self.vectors_mem.len() < new_len {
591            self.vectors_mem.resize_with(new_len, Vec::new);
592        }
593        self.vectors_mem[id] = vector.to_vec();
594        Ok(())
595    }
596
597    /// Get a vector by internal index
598    pub fn get_vector(&self, id: usize) -> Result<Option<Vec<f32>>> {
599        // Try memory first
600        if id < self.vectors_mem.len() && !self.vectors_mem[id].is_empty() {
601            return Ok(Some(self.vectors_mem[id].clone()));
602        }
603
604        // Fall back to mmap for quantized stores (vectors not in RAM)
605        if let Some(ref mmap) = self.mmap {
606            if let Some(vec_section) = self.header.get_section(SectionType::Vectors) {
607                let dim = self.header.dimensions as usize;
608                if dim > 0 && id < self.header.count as usize {
609                    let vec_offset = vec_section.offset as usize;
610                    let start = vec_offset + id * dim * 4;
611                    let end = start + dim * 4;
612                    if end <= mmap.len() {
613                        let bytes = &mmap[start..end];
614                        let vector: Vec<f32> = bytes
615                            .chunks(4)
616                            .map(|chunk| {
617                                let arr: [u8; 4] = chunk.try_into().unwrap_or([0; 4]);
618                                f32::from_le_bytes(arr)
619                            })
620                            .collect();
621                        return Ok(Some(vector));
622                    }
623                }
624            }
625        }
626
627        Ok(None)
628    }
629
630    /// Store metadata for a vector (as JSON)
631    pub fn put_metadata(&mut self, id: usize, metadata: &JsonValue) -> Result<()> {
632        let bytes = serde_json::to_vec(metadata)?;
633        self.metadata_mem.insert(id as u32, bytes);
634        Ok(())
635    }
636
637    /// Get metadata for a vector (as JSON)
638    pub fn get_metadata(&self, id: usize) -> Result<Option<JsonValue>> {
639        match self.metadata_mem.get(&(id as u32)) {
640            Some(bytes) => {
641                let metadata: JsonValue = serde_json::from_slice(bytes)?;
642                Ok(Some(metadata))
643            }
644            None => Ok(None),
645        }
646    }
647
648    /// Store string ID to internal index mapping
649    pub fn put_id_mapping(&mut self, string_id: &str, index: usize) -> Result<()> {
650        self.id_to_index.insert(string_id.to_string(), index as u32);
651        self.index_to_id.insert(index as u32, string_id.to_string());
652        Ok(())
653    }
654
655    /// Get internal index for a string ID
656    pub fn get_id_mapping(&self, string_id: &str) -> Result<Option<usize>> {
657        Ok(self.id_to_index.get(string_id).map(|&idx| idx as usize))
658    }
659
660    /// Get string ID for an internal index (reverse lookup)
661    pub fn get_string_id(&self, index: usize) -> Result<Option<String>> {
662        Ok(self.index_to_id.get(&(index as u32)).cloned())
663    }
664
665    /// Delete string ID mapping
666    pub fn delete_id_mapping(&mut self, string_id: &str) -> Result<()> {
667        if let Some(&index) = self.id_to_index.get(string_id) {
668            self.index_to_id.remove(&index);
669        }
670        self.id_to_index.remove(string_id);
671        Ok(())
672    }
673
674    /// Store configuration value
675    pub fn put_config(&mut self, key: &str, value: u64) -> Result<()> {
676        self.config.insert(key.to_string(), value);
677        // Sync dimensions to header
678        if key == "dimensions" {
679            self.header.dimensions = value as u32;
680        }
681        Ok(())
682    }
683
684    /// Get configuration value
685    pub fn get_config(&self, key: &str) -> Result<Option<u64>> {
686        Ok(self.config.get(key).copied())
687    }
688
689    /// Load all vectors from storage
690    pub fn load_all_vectors(&self) -> Result<Vec<(usize, Vec<f32>)>> {
691        Ok(self
692            .vectors_mem
693            .iter()
694            .enumerate()
695            .filter(|(_, v)| !v.is_empty())
696            .map(|(id, v)| (id, v.clone()))
697            .collect())
698    }
699
700    /// Increment vector count in storage
701    pub fn increment_count(&mut self) -> Result<usize> {
702        let count = self.config.get("count").copied().unwrap_or(0) as usize;
703        let new_count = count + 1;
704        self.config.insert("count".to_string(), new_count as u64);
705        self.header.count = new_count as u64;
706        Ok(new_count)
707    }
708
709    /// Get current vector count
710    pub fn get_count(&self) -> Result<usize> {
711        Ok(self.config.get("count").copied().unwrap_or(0) as usize)
712    }
713
714    /// Store quantization mode
715    ///
716    /// Mode values: 0=none, 1=sq8, 2=rabitq-4, 3=rabitq-2, 4=rabitq-8
717    pub fn put_quantization_mode(&mut self, mode: u64) -> Result<()> {
718        self.put_config("quantization", mode)
719    }
720
721    /// Get quantization mode
722    ///
723    /// Returns: 0=none, 1=sq8, 2=rabitq-4, 3=rabitq-2, 4=rabitq-8
724    pub fn get_quantization_mode(&self) -> Result<Option<u64>> {
725        self.get_config("quantization")
726    }
727
728    /// Check if store was created with quantization
729    pub fn is_quantized(&self) -> Result<bool> {
730        Ok(self.get_quantization_mode()?.unwrap_or(0) > 0)
731    }
732
733    /// Load all metadata from storage
734    pub fn load_all_metadata(&self) -> Result<HashMap<usize, JsonValue>> {
735        let mut result = HashMap::new();
736        for (&id, bytes) in &self.metadata_mem {
737            if let Ok(metadata) = serde_json::from_slice(bytes) {
738                result.insert(id as usize, metadata);
739            }
740        }
741        Ok(result)
742    }
743
744    /// Load all ID mappings from storage
745    pub fn load_all_id_mappings(&self) -> Result<HashMap<String, usize>> {
746        Ok(self
747            .id_to_index
748            .iter()
749            .map(|(id, &idx)| (id.clone(), idx as usize))
750            .collect())
751    }
752
753    /// Mark a vector as deleted (tombstone)
754    pub fn put_deleted(&mut self, id: usize) -> Result<()> {
755        self.deleted.insert(id as u32, true);
756        Ok(())
757    }
758
759    /// Check if a vector is deleted
760    pub fn is_deleted(&self, id: usize) -> Result<bool> {
761        Ok(self.deleted.get(&(id as u32)).copied().unwrap_or(false))
762    }
763
764    /// Remove deleted marker (for re-insertion)
765    pub fn remove_deleted(&mut self, id: usize) -> Result<()> {
766        self.deleted.remove(&(id as u32));
767        Ok(())
768    }
769
770    /// Load all deleted IDs from storage
771    pub fn load_all_deleted(&self) -> Result<HashMap<usize, bool>> {
772        Ok(self
773            .deleted
774            .iter()
775            .map(|(&id, &v)| (id as usize, v))
776            .collect())
777    }
778
779    /// Store serialized HNSW index bytes
780    ///
781    /// The bytes are persisted on the next checkpoint/flush.
782    /// `VectorStore` serializes `HNSWIndex` and stores it here.
783    pub fn put_hnsw_index(&mut self, bytes: Vec<u8>) {
784        self.hnsw_index_bytes = Some(bytes);
785    }
786
787    /// Get serialized HNSW index bytes (if present)
788    ///
789    /// Returns the bytes previously stored by `put_hnsw_index()`,
790    /// or loaded from disk on open.
791    #[must_use]
792    pub fn get_hnsw_index(&self) -> Option<&[u8]> {
793        self.hnsw_index_bytes.as_deref()
794    }
795
796    /// Check if HNSW index is stored
797    #[must_use]
798    pub fn has_hnsw_index(&self) -> bool {
799        self.hnsw_index_bytes.is_some()
800    }
801
802    /// Update HNSW parameters in the header
803    ///
804    /// These values are persisted to disk on the next checkpoint/flush.
805    pub fn set_hnsw_params(&mut self, m: u16, ef_construction: u16, ef_search: u16) {
806        self.header.m = m;
807        self.header.ef_construction = ef_construction;
808        self.header.ef_search = ef_search;
809    }
810
811    /// Get storage path
812    #[must_use]
813    pub fn path(&self) -> &Path {
814        &self.path
815    }
816
817    /// Get reference to the header
818    #[must_use]
819    pub fn header(&self) -> &OmenHeader {
820        &self.header
821    }
822
823    /// Flush all pending writes to disk
824    pub fn flush(&mut self) -> Result<()> {
825        self.checkpoint()?;
826        Ok(())
827    }
828
829    /// Batch set vectors with metadata and ID mappings
830    pub fn put_batch(&mut self, items: Vec<(usize, String, Vec<f32>, JsonValue)>) -> Result<()> {
831        if items.is_empty() {
832            return Ok(());
833        }
834
835        for (idx, string_id, vector, metadata) in items {
836            self.put_vector(idx, &vector)?;
837            self.put_metadata(idx, &metadata)?;
838            self.put_id_mapping(&string_id, idx)?;
839        }
840
841        // Update count
842        let current_count = self.get_count()?;
843        let new_count = self
844            .vectors_mem
845            .iter()
846            .filter(|v| !v.is_empty())
847            .count()
848            .max(current_count);
849        self.config.insert("count".to_string(), new_count as u64);
850        self.header.count = new_count as u64;
851
852        Ok(())
853    }
854}
855
/// Euclidean (L2) distance between `a` and `b`.
///
/// Components are paired positionally; if the slices differ in length the
/// extra components of the longer one are ignored.
#[inline]
fn l2_distance(a: &[f32], b: &[f32]) -> f32 {
    let squared_sum: f32 = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| {
            let delta = lhs - rhs;
            delta.powi(2)
        })
        .sum();
    squared_sum.sqrt()
}
865
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Two inserts into a fresh database are reflected by `len()`.
    #[test]
    fn test_create_and_insert() {
        let tmp = tempdir().unwrap();
        let location = tmp.path().join("test.omen");

        let mut store = OmenFile::create(&location, 3).unwrap();
        for (id, vector) in [("vec1", [1.0, 2.0, 3.0]), ("vec2", [4.0, 5.0, 6.0])] {
            store.insert(id, &vector, None).unwrap();
        }

        assert_eq!(store.len(), 2);
    }

    /// Querying with a stored vector returns that vector's ID first.
    #[test]
    fn test_search() {
        let tmp = tempdir().unwrap();
        let location = tmp.path().join("test.omen");

        let mut store = OmenFile::create(&location, 3).unwrap();
        store.insert("vec1", &[1.0, 0.0, 0.0], None).unwrap();
        store.insert("vec2", &[0.0, 1.0, 0.0], None).unwrap();
        store.insert("vec3", &[0.0, 0.0, 1.0], None).unwrap();

        let hits = store.search(&[1.0, 0.0, 0.0], 1);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, "vec1");
    }

    /// Checkpointed vectors are still counted after the file is reopened.
    #[test]
    fn test_checkpoint_and_reopen() {
        let tmp = tempdir().unwrap();
        let location = tmp.path().join("test.omen");

        {
            let mut writer = OmenFile::create(&location, 3).unwrap();
            writer.insert("vec1", &[1.0, 2.0, 3.0], None).unwrap();
            writer.insert("vec2", &[4.0, 5.0, 6.0], None).unwrap();
            writer.checkpoint().unwrap();
        }

        let reopened = OmenFile::open(&location).unwrap();
        assert_eq!(reopened.len(), 2);
    }

    /// A vector that only reached the WAL is searchable after reopening.
    #[test]
    fn test_wal_recovery() {
        let tmp = tempdir().unwrap();
        let location = tmp.path().join("test.omen");

        {
            // No checkpoint here: the insert exists only in the WAL.
            let mut writer = OmenFile::create(&location, 3).unwrap();
            writer.insert("vec1", &[1.0, 2.0, 3.0], None).unwrap();
        }

        let reopened = OmenFile::open(&location).unwrap();
        let hits = reopened.search(&[1.0, 2.0, 3.0], 1);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, "vec1");
    }
}