Skip to main content

claw_vector/store/
mmap.rs

1// store/mmap.rs — memory-mapped vector file storage for fast random-access reads.
2use std::{
3    fs::OpenOptions,
4    path::{Path, PathBuf},
5};
6
7use byteorder::{ByteOrder, LittleEndian};
8use memmap2::{MmapMut, MmapOptions};
9
10use crate::error::{VectorError, VectorResult};
11
/// Number of bytes occupied by the fixed header at the start of every vector file.
///
/// Vector slot data begins immediately after this many bytes.
const HEADER_SIZE: usize = 64;

/// Magic bytes written at offset 0 and checked when a file is opened.
const MAGIC: &[u8; 8] = b"CLAWVEC1";

/// Format version written into the header; files with any other version are rejected.
const VERSION: u32 = 1;
15
/// In-memory copy of the fixed-size header that prefixes every mmap vector file.
///
/// On disk the fields are laid out back to back, integers in little-endian order:
/// `magic` (bytes 0..8), `version` (8..12), `dimensions` (12..16),
/// `element_count` (16..24) and `reserved` (24..64).
#[derive(Debug, Clone)]
pub struct VecFileHeader {
    /// Magic bytes identifying the file format.
    pub magic: [u8; 8],
    /// Version number of the file format.
    pub version: u32,
    /// Number of `f32` components in each stored vector.
    pub dimensions: u32,
    /// One more than the highest slot index that has been written so far.
    pub element_count: u64,
    /// Padding kept for future metadata; written back verbatim.
    pub reserved: [u8; 40],
}
30
31/// A memory-mapped file containing fixed-size raw `f32` vectors.
32pub struct MmapVectorFile {
33    /// Writable mmap covering the full file.
34    pub mmap: MmapMut,
35    /// Decoded file header.
36    pub header: VecFileHeader,
37    /// File path on disk.
38    pub path: PathBuf,
39}
40
41impl MmapVectorFile {
42    /// Create a new vector file with capacity for `max_elements` vectors.
43    pub fn create(path: &Path, dimensions: usize, max_elements: usize) -> VectorResult<Self> {
44        if dimensions == 0 {
45            return Err(VectorError::Config(
46                "mmap vector file dimensions must be greater than zero".into(),
47            ));
48        }
49        if max_elements == 0 {
50            return Err(VectorError::Config(
51                "mmap vector file max_elements must be greater than zero".into(),
52            ));
53        }
54        if let Some(parent) = path.parent() {
55            std::fs::create_dir_all(parent)?;
56        }
57
58        let file_size = HEADER_SIZE + max_elements * dimensions * std::mem::size_of::<f32>();
59        let file = OpenOptions::new()
60            .read(true)
61            .write(true)
62            .create(true)
63            .truncate(true)
64            .open(path)?;
65        file.set_len(file_size as u64)?;
66
67        let mmap = map_mut(&file)?;
68        let header = VecFileHeader {
69            magic: *MAGIC,
70            version: VERSION,
71            dimensions: dimensions as u32,
72            element_count: 0,
73            reserved: [0; 40],
74        };
75
76        let mut file = MmapVectorFile {
77            mmap,
78            header,
79            path: path.to_path_buf(),
80        };
81        file.sync_header();
82        file.flush()?;
83        Ok(file)
84    }
85
86    /// Open an existing vector file and validate its header.
87    pub fn open(path: &Path) -> VectorResult<Self> {
88        let file = OpenOptions::new().read(true).write(true).open(path)?;
89        let metadata = file.metadata()?;
90        if metadata.len() < HEADER_SIZE as u64 {
91            return Err(VectorError::Index(format!(
92                "mmap vector file '{}' is too small to contain a header",
93                path.display()
94            )));
95        }
96
97        let mmap = map_mut(&file)?;
98        let header = read_header(&mmap)?;
99        Ok(MmapVectorFile {
100            mmap,
101            header,
102            path: path.to_path_buf(),
103        })
104    }
105
106    /// Write a vector to a fixed slot, updating the element count if needed.
107    pub fn write_vector(&mut self, internal_id: usize, vector: &[f32]) -> VectorResult<()> {
108        let dimensions = self.dimensions();
109        if vector.len() != dimensions {
110            return Err(VectorError::DimensionMismatch {
111                expected: dimensions,
112                got: vector.len(),
113            });
114        }
115
116        let offset = self.vector_offset(internal_id)?;
117        let byte_len = std::mem::size_of_val(vector);
118        LittleEndian::write_f32_into(vector, &mut self.mmap[offset..offset + byte_len]);
119
120        let next_count = internal_id as u64 + 1;
121        if next_count > self.header.element_count {
122            self.header.element_count = next_count;
123            self.sync_header();
124        }
125
126        Ok(())
127    }
128
129    /// Read a vector from a fixed slot.
130    pub fn read_vector(&self, internal_id: usize) -> VectorResult<Vec<f32>> {
131        if internal_id >= self.element_count() {
132            return Err(VectorError::NotFound {
133                entity: "vector".into(),
134                id: internal_id.to_string(),
135            });
136        }
137
138        let offset = self.vector_offset(internal_id)?;
139        let byte_len = self.dimensions() * std::mem::size_of::<f32>();
140        let mut vector = vec![0.0f32; self.dimensions()];
141        LittleEndian::read_f32_into(&self.mmap[offset..offset + byte_len], &mut vector);
142        Ok(vector)
143    }
144
145    /// Zero out a vector slot without changing the file capacity.
146    pub fn delete_vector(&mut self, internal_id: usize) -> VectorResult<()> {
147        let offset = self.vector_offset(internal_id)?;
148        let byte_len = self.dimensions() * std::mem::size_of::<f32>();
149        self.mmap[offset..offset + byte_len].fill(0);
150        Ok(())
151    }
152
153    /// Flush pending changes to disk.
154    pub fn flush(&self) -> VectorResult<()> {
155        self.mmap.flush()?;
156        Ok(())
157    }
158
159    /// Return the number of written slots tracked in the header.
160    pub fn element_count(&self) -> usize {
161        self.header.element_count as usize
162    }
163
164    /// Return the dimensionality of vectors stored in the file.
165    pub fn dimensions(&self) -> usize {
166        self.header.dimensions as usize
167    }
168
169    /// Return the total file size in bytes.
170    pub fn file_size_bytes(&self) -> u64 {
171        self.mmap.len() as u64
172    }
173
174    fn sync_header(&mut self) {
175        self.mmap[..8].copy_from_slice(&self.header.magic);
176        LittleEndian::write_u32(&mut self.mmap[8..12], self.header.version);
177        LittleEndian::write_u32(&mut self.mmap[12..16], self.header.dimensions);
178        LittleEndian::write_u64(&mut self.mmap[16..24], self.header.element_count);
179        self.mmap[24..HEADER_SIZE].copy_from_slice(&self.header.reserved);
180    }
181
182    fn vector_offset(&self, internal_id: usize) -> VectorResult<usize> {
183        let bytes_per_vector = self.dimensions() * std::mem::size_of::<f32>();
184        let offset = HEADER_SIZE + internal_id * bytes_per_vector;
185        let end = offset + bytes_per_vector;
186        if end > self.mmap.len() {
187            return Err(VectorError::Index(format!(
188                "vector slot {internal_id} exceeds mmap file capacity for '{}'",
189                self.path.display()
190            )));
191        }
192        Ok(offset)
193    }
194}
195
196fn read_header(mmap: &[u8]) -> VectorResult<VecFileHeader> {
197    let mut magic = [0u8; 8];
198    magic.copy_from_slice(&mmap[..8]);
199    if &magic != MAGIC {
200        return Err(VectorError::Index("invalid mmap vector file magic".into()));
201    }
202
203    let version = LittleEndian::read_u32(&mmap[8..12]);
204    if version != VERSION {
205        return Err(VectorError::Index(format!(
206            "unsupported mmap vector file version {version}"
207        )));
208    }
209
210    let dimensions = LittleEndian::read_u32(&mmap[12..16]);
211    if dimensions == 0 {
212        return Err(VectorError::Index(
213            "mmap vector file dimensions must be greater than zero".into(),
214        ));
215    }
216
217    let element_count = LittleEndian::read_u64(&mmap[16..24]);
218    let mut reserved = [0u8; 40];
219    reserved.copy_from_slice(&mmap[24..HEADER_SIZE]);
220
221    Ok(VecFileHeader {
222        magic,
223        version,
224        dimensions,
225        element_count,
226        reserved,
227    })
228}
229
230fn map_mut(file: &std::fs::File) -> VectorResult<MmapMut> {
231    // SAFETY: the file handle remains alive for the duration of mapping creation and the
232    // returned mmap owns the OS mapping independently of the file descriptor afterwards.
233    unsafe { MmapOptions::new().map_mut(file).map_err(Into::into) }
234}
235
#[cfg(test)]
mod tests {
    use tempfile::tempdir;

    use super::MmapVectorFile;

    /// Vectors written to non-adjacent slots read back unchanged, and the element
    /// count follows the highest slot written.
    #[test]
    fn create_write_read_round_trip() {
        let scratch = tempdir().unwrap();
        let target = scratch.path().join("vectors.bin");
        let mut store = MmapVectorFile::create(&target, 3, 8).unwrap();

        let samples: [(usize, [f32; 3]); 2] = [(0, [1.0, 2.0, 3.0]), (3, [4.0, 5.0, 6.0])];
        for (slot, values) in samples.iter() {
            store.write_vector(*slot, values).unwrap();
        }
        store.flush().unwrap();

        for (slot, values) in samples.iter() {
            assert_eq!(store.read_vector(*slot).unwrap(), values.to_vec());
        }
        assert_eq!(store.element_count(), 4);
    }

    /// Deleting a written slot leaves it readable but filled with zeros.
    #[test]
    fn delete_vector_zeros_slot() {
        let scratch = tempdir().unwrap();
        let target = scratch.path().join("vectors.bin");
        let mut store = MmapVectorFile::create(&target, 2, 4).unwrap();

        store.write_vector(1, &[7.0, 9.0]).unwrap();
        store.delete_vector(1).unwrap();

        let cleared = store.read_vector(1).unwrap();
        assert_eq!(cleared, vec![0.0, 0.0]);
    }

    /// A flushed file can be reopened with its header and slot data intact.
    #[test]
    fn open_restores_header_and_data() {
        let scratch = tempdir().unwrap();
        let target = scratch.path().join("vectors.bin");

        // Inner scope drops the writer (and its mapping) before reopening.
        {
            let mut writer = MmapVectorFile::create(&target, 2, 4).unwrap();
            writer.write_vector(2, &[3.5, 8.5]).unwrap();
            writer.flush().unwrap();
        }

        let reader = MmapVectorFile::open(&target).unwrap();
        assert_eq!(reader.dimensions(), 2);
        assert_eq!(reader.element_count(), 3);
        assert_eq!(reader.read_vector(2).unwrap(), vec![3.5, 8.5]);
    }
}
284}