Skip to main content

iqdb_persist/
format.rs

1//! The on-disk file header and wire format.
2//!
3//! The header is the first thing in a snapshot file. Its layout is
4//! **strict little-endian, fixed-width**:
5//!
6//! ```text
7//! offset  bytes  field
8//! 0       8      magic ("IQDBPRST")
9//! 8       4      version (u32 LE)
10//! 12      8      index_type length (u64 LE)
11//! 20      N      index_type (UTF-8, N bytes)
12//! 20+N    8      dim (u64 LE)
13//! 28+N    1      metric tag (u8)
14//! 29+N    8      n_vectors (u64 LE)
15//! 37+N    4      crc32 (u32 LE) -- of the payload only
16//! 41+N    ...    payload (impl-defined)
17//! ```
18//!
19//! Sizes (`index_type` length, `dim`, `n_vectors`) are always serialized
20//! as fixed-width `u64`, never as the host's `usize`. This keeps the
21//! format portable across 32- and 64-bit hosts.
22//!
23//! ## Metric tag values (on-disk contract)
24//!
25//! - `0` — Cosine
26//! - `1` — DotProduct
27//! - `2` — Euclidean
28//! - `3` — Manhattan
29//! - `4` — Hamming
30//!
31//! These values are part of the on-disk format contract. Once snapshot
32//! files exist on disk with a given tag → metric mapping, the mapping
33//! cannot change without a format-version bump.
34
35use std::io::{Read, Write};
36
37use iqdb_types::DistanceMetric;
38
39use crate::error::{PersistError, Result};
40
/// Eight-byte signature stored at offset 0 of every iqdb snapshot file.
///
/// [`read_header`] rejects any input whose first eight bytes differ from
/// this value.
///
/// # Examples
///
/// ```
/// assert_eq!(&iqdb_persist::MAGIC, b"IQDBPRST");
/// ```
pub const MAGIC: [u8; 8] = *b"IQDBPRST";

/// Lowest on-disk format version that this build is still able to read.
///
/// Together with [`CURRENT_VERSION`] it bounds the inclusive range of
/// versions the header reader accepts.
pub(crate) const MIN_SUPPORTED_VERSION: u32 = 1;

/// Format version stamped into every snapshot written by this build.
///
/// - Version `1` (v0.2–v0.3): the payload is stored verbatim.
/// - Version `2` (v0.4+): the payload region starts with a compression
///   preamble. Version-1 files remain readable and are treated as
///   uncompressed.
///
/// **The format is frozen as of v0.5.** Any later change must bump this
/// number; existing versions are never silently reinterpreted.
///
/// # Examples
///
/// ```
/// assert_eq!(iqdb_persist::CURRENT_VERSION, 2);
/// ```
pub const CURRENT_VERSION: u32 = 2;
67
68/// The header at the start of every iqdb snapshot file.
69///
70/// The on-disk representation is fixed-width little-endian — see the
71/// module-level docs for the byte-level layout. The Rust struct stores
72/// `dim` and `n_vectors` as `usize` for ergonomic in-memory use; the
73/// reader and writer convert to/from `u64` at the wire boundary.
74///
75/// `crc32` is the CRC32 of the **payload bytes only** — it does not
76/// cover the header.
77///
78/// # Examples
79///
80/// ```
81/// use iqdb_persist::{FileHeader, CURRENT_VERSION, MAGIC};
82/// use iqdb_types::DistanceMetric;
83///
84/// let header = FileHeader {
85///     magic: MAGIC,
86///     version: CURRENT_VERSION,
87///     index_type: "flat".to_string(),
88///     dim: 128,
89///     metric: DistanceMetric::Cosine,
90///     n_vectors: 1_000,
91///     crc32: 0xDEADBEEF,
92/// };
93/// assert_eq!(header.index_type, "flat");
94/// ```
95#[derive(Debug, Clone, PartialEq, Eq)]
96pub struct FileHeader {
97    /// Magic bytes — always equal to [`MAGIC`].
98    pub magic: [u8; 8],
99    /// On-disk format version. The reader accepts any version in
100    /// `MIN_SUPPORTED_VERSION..=CURRENT_VERSION` and records which one it
101    /// read here, so the payload can be decoded per that version.
102    pub version: u32,
103    /// Stable index-type tag — matched against
104    /// [`crate::Persistable::INDEX_TYPE`] on load.
105    pub index_type: String,
106    /// Dimensionality of the vectors stored in the payload.
107    pub dim: usize,
108    /// Distance metric the index was built for.
109    pub metric: DistanceMetric,
110    /// Number of vectors stored in the payload.
111    pub n_vectors: usize,
112    /// CRC32 of the payload bytes (not of the header).
113    pub crc32: u32,
114}
115
116/// Convert a [`DistanceMetric`] to its stable on-disk tag byte.
117///
118/// `DistanceMetric` is `#[non_exhaustive]`; a metric this build of
119/// `iqdb-persist` predates has no assigned tag and yields
120/// [`PersistError::UnsupportedMetric`] rather than a silently-wrong byte.
121pub(crate) fn metric_to_tag(metric: DistanceMetric) -> Result<u8> {
122    Ok(match metric {
123        DistanceMetric::Cosine => 0,
124        DistanceMetric::DotProduct => 1,
125        DistanceMetric::Euclidean => 2,
126        DistanceMetric::Manhattan => 3,
127        DistanceMetric::Hamming => 4,
128        _ => return Err(PersistError::UnsupportedMetric { metric }),
129    })
130}
131
132/// Convert an on-disk tag byte back to a [`DistanceMetric`].
133///
134/// Returns [`PersistError::InvalidMetric`] for any value not in `0..=4`.
135pub(crate) fn tag_to_metric(tag: u8) -> Result<DistanceMetric> {
136    match tag {
137        0 => Ok(DistanceMetric::Cosine),
138        1 => Ok(DistanceMetric::DotProduct),
139        2 => Ok(DistanceMetric::Euclidean),
140        3 => Ok(DistanceMetric::Manhattan),
141        4 => Ok(DistanceMetric::Hamming),
142        _ => Err(PersistError::InvalidMetric { tag }),
143    }
144}
145
146fn usize_to_u64(value: usize, what: &'static str) -> Result<u64> {
147    u64::try_from(value).map_err(|_| PersistError::InvalidPayload {
148        reason: match what {
149            "dim" => "dim does not fit in u64",
150            "n_vectors" => "n_vectors does not fit in u64",
151            "index_type_len" => "index_type length does not fit in u64",
152            _ => "usize value does not fit in u64",
153        },
154    })
155}
156
157fn u64_to_usize(value: u64, what: &'static str) -> Result<usize> {
158    usize::try_from(value).map_err(|_| PersistError::InvalidPayload {
159        reason: match what {
160            "dim" => "dim does not fit in usize on this host",
161            "n_vectors" => "n_vectors does not fit in usize on this host",
162            "index_type_len" => "index_type length does not fit in usize on this host",
163            _ => "u64 value does not fit in usize on this host",
164        },
165    })
166}
167
168/// Write a [`FileHeader`] to `writer` in the fixed-width little-endian
169/// wire format.
170///
171/// # Errors
172///
173/// Returns [`PersistError::Io`] if a write fails, or
174/// [`PersistError::InvalidPayload`] if a `usize` field does not fit in
175/// `u64`.
176///
177/// # Examples
178///
179/// ```
180/// use std::io::Cursor;
181///
182/// use iqdb_persist::format::{read_header, write_header};
183/// use iqdb_persist::{CURRENT_VERSION, FileHeader, MAGIC};
184/// use iqdb_types::DistanceMetric;
185///
186/// let header = FileHeader {
187///     magic: MAGIC,
188///     version: CURRENT_VERSION,
189///     index_type: "flat".to_string(),
190///     dim: 8,
191///     metric: DistanceMetric::Euclidean,
192///     n_vectors: 3,
193///     crc32: 0,
194/// };
195/// let mut buf = Vec::new();
196/// write_header(&mut buf, &header).unwrap();
197/// let mut cur = Cursor::new(&buf[..]);
198/// let parsed = read_header(&mut cur).unwrap();
199/// assert_eq!(parsed, header);
200/// ```
201pub fn write_header(writer: &mut dyn Write, header: &FileHeader) -> Result<()> {
202    write_all(writer, &header.magic)?;
203    write_all(writer, &header.version.to_le_bytes())?;
204
205    let it_bytes = header.index_type.as_bytes();
206    let it_len = usize_to_u64(it_bytes.len(), "index_type_len")?;
207    write_all(writer, &it_len.to_le_bytes())?;
208    write_all(writer, it_bytes)?;
209
210    let dim_u64 = usize_to_u64(header.dim, "dim")?;
211    write_all(writer, &dim_u64.to_le_bytes())?;
212
213    write_all(writer, &[metric_to_tag(header.metric)?])?;
214
215    let n_u64 = usize_to_u64(header.n_vectors, "n_vectors")?;
216    write_all(writer, &n_u64.to_le_bytes())?;
217
218    write_all(writer, &header.crc32.to_le_bytes())?;
219    Ok(())
220}
221
222/// Read a [`FileHeader`] from `reader` and validate it.
223///
224/// Validation in v0.2:
225///
226/// - `magic` must equal [`MAGIC`] — otherwise
227///   [`PersistError::BadMagic`].
228/// - `version` must be in the supported range (up to [`CURRENT_VERSION`])
229///   — otherwise [`PersistError::UnsupportedVersion`].
230/// - The metric tag must be in the known set — otherwise
231///   [`PersistError::InvalidMetric`].
232///
233/// The `crc32` field is returned as-is — verifying the payload against
234/// it is the caller's responsibility (see [`crate::PersistedIndex`]).
235///
236/// # Errors
237///
238/// See above + [`PersistError::TruncatedHeader`] for truncated reads.
239///
240/// # Examples
241///
242/// See [`write_header`] for a round-trip example.
243pub fn read_header(reader: &mut dyn Read) -> Result<FileHeader> {
244    let mut magic = [0u8; 8];
245    read_exact_or_truncated(reader, &mut magic)?;
246    if magic != MAGIC {
247        return Err(PersistError::BadMagic { found: magic });
248    }
249
250    let mut buf4 = [0u8; 4];
251    read_exact_or_truncated(reader, &mut buf4)?;
252    let version = u32::from_le_bytes(buf4);
253    if !(MIN_SUPPORTED_VERSION..=CURRENT_VERSION).contains(&version) {
254        return Err(PersistError::UnsupportedVersion {
255            found: version,
256            supported: CURRENT_VERSION,
257        });
258    }
259
260    let mut buf8 = [0u8; 8];
261    read_exact_or_truncated(reader, &mut buf8)?;
262    let it_len_u64 = u64::from_le_bytes(buf8);
263    let it_len = u64_to_usize(it_len_u64, "index_type_len")?;
264
265    // Cap the on-disk length so a malicious or corrupted header can't ask us
266    // to allocate gigabytes. 4 KiB is comfortably larger than any plausible
267    // tag ("flat", "hnsw", "ivf-pq", ...).
268    const MAX_INDEX_TYPE_LEN: usize = 4096;
269    if it_len > MAX_INDEX_TYPE_LEN {
270        return Err(PersistError::InvalidPayload {
271            reason: "index_type length exceeds the 4 KiB cap",
272        });
273    }
274    let mut it_bytes = vec![0u8; it_len];
275    read_exact_or_truncated(reader, &mut it_bytes)?;
276    let index_type = String::from_utf8(it_bytes).map_err(|_| PersistError::InvalidPayload {
277        reason: "index_type is not valid UTF-8",
278    })?;
279
280    read_exact_or_truncated(reader, &mut buf8)?;
281    let dim = u64_to_usize(u64::from_le_bytes(buf8), "dim")?;
282
283    let mut metric_buf = [0u8; 1];
284    read_exact_or_truncated(reader, &mut metric_buf)?;
285    let metric = tag_to_metric(metric_buf[0])?;
286
287    read_exact_or_truncated(reader, &mut buf8)?;
288    let n_vectors = u64_to_usize(u64::from_le_bytes(buf8), "n_vectors")?;
289
290    read_exact_or_truncated(reader, &mut buf4)?;
291    let crc32 = u32::from_le_bytes(buf4);
292
293    Ok(FileHeader {
294        magic,
295        version,
296        index_type,
297        dim,
298        metric,
299        n_vectors,
300        crc32,
301    })
302}
303
304fn write_all(writer: &mut dyn Write, bytes: &[u8]) -> Result<()> {
305    // No `path` is available at this layer — callers wrap the writer-
306    // bound error with a meaningful path when one exists.
307    // (PersistedIndex::save writes into a Vec<u8>, so this never fails
308    // in the current flow.)
309    writer.write_all(bytes).map_err(|source| PersistError::Io {
310        path: std::path::PathBuf::new(),
311        source,
312    })
313}
314
315fn read_exact_or_truncated(reader: &mut dyn Read, buf: &mut [u8]) -> Result<()> {
316    match reader.read_exact(buf) {
317        Ok(()) => Ok(()),
318        Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
319            Err(PersistError::TruncatedHeader {
320                needed: buf.len(),
321                found: 0,
322            })
323        }
324        Err(source) => Err(PersistError::Io {
325            path: std::path::PathBuf::new(),
326            source,
327        }),
328    }
329}