Skip to main content

nodedb_vector/mmap_segment/
writer.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Writer for the NDVS v2 on-disk vector segment format.
4
5use std::path::Path;
6
7use super::format::{
8    DTYPE_F32, FOOTER_SIZE, FORMAT_VERSION, HEADER_SIZE, MAGIC, VectorSegmentCodec, vec_pad,
9};
10
11/// Write a v2 NDVS segment file to `path`.
12///
13/// `surrogate_ids[i]` is the u64 surrogate for `vectors[i]`. The slice may be
14/// empty, in which case all surrogate IDs are written as 0.
15///
16/// # Errors
17///
18/// Returns `std::io::Error` on any I/O failure or arithmetic overflow.
19pub fn write_segment(
20    path: &Path,
21    dim: usize,
22    vectors: &[&[f32]],
23    surrogate_ids: &[u64],
24) -> std::io::Result<()> {
25    use std::io::Write as _;
26
27    debug_assert!(
28        surrogate_ids.is_empty() || surrogate_ids.len() == vectors.len(),
29        "surrogate_ids length must match vectors length or be empty"
30    );
31
32    if let Some(parent) = path.parent() {
33        std::fs::create_dir_all(parent)?;
34    }
35
36    let count = vectors.len() as u64;
37
38    let mut fd = std::fs::OpenOptions::new()
39        .read(true)
40        .write(true)
41        .create(true)
42        .truncate(true)
43        .open(path)?;
44
45    // Header — 32 bytes.
46    fd.write_all(&MAGIC)?;
47    fd.write_all(&FORMAT_VERSION.to_le_bytes())?;
48    fd.write_all(&0u16.to_le_bytes())?; // flags
49    fd.write_all(&(dim as u32).to_le_bytes())?;
50    fd.write_all(&count.to_le_bytes())?;
51    fd.write_all(&[DTYPE_F32])?;
52    fd.write_all(&[VectorSegmentCodec::None as u8])?;
53    fd.write_all(&[0u8; 10])?; // reserved (10 bytes → header total 32, 8-byte aligned)
54
55    // Vector data block — D × N × 4 bytes, row-major, no framing.
56    let mut written_vec_bytes: usize = 0;
57    for v in vectors {
58        debug_assert_eq!(v.len(), dim, "vector dimension mismatch during write");
59        let bytes: &[u8] =
60            unsafe { std::slice::from_raw_parts(v.as_ptr() as *const u8, v.len() * 4) };
61        fd.write_all(bytes)?;
62        written_vec_bytes += bytes.len();
63    }
64
65    // Pad to 8-byte alignment so the surrogate ID block is naturally aligned.
66    let pad = vec_pad(written_vec_bytes);
67    if pad > 0 {
68        fd.write_all(&[0u8; 8][..pad])?;
69    }
70
71    // Surrogate ID block — N × 8 bytes.
72    for i in 0..vectors.len() {
73        let sid: u64 = surrogate_ids.get(i).copied().unwrap_or(0);
74        fd.write_all(&sid.to_le_bytes())?;
75    }
76
77    fd.sync_all()?;
78    drop(fd);
79
80    // Compute CRC32C over the body (header + vector block + surrogate block).
81    let vec_bytes = dim
82        .checked_mul(vectors.len())
83        .and_then(|n| n.checked_mul(4))
84        .ok_or_else(|| {
85            std::io::Error::new(std::io::ErrorKind::InvalidData, "vector data size overflow")
86        })?;
87    let surrogate_bytes = vectors.len().checked_mul(8).ok_or_else(|| {
88        std::io::Error::new(
89            std::io::ErrorKind::InvalidData,
90            "surrogate block size overflow",
91        )
92    })?;
93    let pad_bytes = vec_pad(vec_bytes);
94    let body_len = HEADER_SIZE + vec_bytes + pad_bytes + surrogate_bytes;
95
96    let data = std::fs::read(path)?;
97    if data.len() != body_len {
98        return Err(std::io::Error::other(format!(
99            "unexpected file size after write: {} vs {body_len}",
100            data.len()
101        )));
102    }
103    let checksum = crc32c::crc32c(&data);
104
105    // Append the footer (46 bytes).
106    let mut fd = std::fs::OpenOptions::new().append(true).open(path)?;
107
108    let mut created_by = [0u8; 32];
109    let ver = env!("CARGO_PKG_VERSION").as_bytes();
110    let copy_len = ver.len().min(31);
111    created_by[..copy_len].copy_from_slice(&ver[..copy_len]);
112
113    fd.write_all(&FORMAT_VERSION.to_le_bytes())?; // footer format_version [0..2]
114    fd.write_all(&created_by)?; // created_by              [2..34]
115    fd.write_all(&checksum.to_le_bytes())?; // checksum    [34..38]
116    fd.write_all(&(FOOTER_SIZE as u32).to_le_bytes())?; // footer_size [38..42]
117    fd.write_all(&MAGIC)?; // trailing magic               [42..46]
118    fd.sync_all()?;
119
120    Ok(())
121}