iqdb_persist/format.rs
1//! The on-disk file header and wire format.
2//!
3//! The header is the first thing in a snapshot file. Its layout is
4//! **strict little-endian, fixed-width**:
5//!
6//! ```text
//! offset  bytes  field
//! 0       8      magic ("IQDBPRST")
//! 8       4      version (u32 LE)
//! 12      8      index_type length (u64 LE)
//! 20      N      index_type (UTF-8, N bytes)
//! 20+N    8      dim (u64 LE)
//! 28+N    1      metric tag (u8)
//! 29+N    8      n_vectors (u64 LE)
//! 37+N    4      crc32 (u32 LE)  -- of the payload only
//! 41+N    ...    payload (impl-defined)
17//! ```
18//!
19//! Sizes (`index_type` length, `dim`, `n_vectors`) are always serialized
20//! as fixed-width `u64`, never as the host's `usize`. This keeps the
21//! format portable across 32- and 64-bit hosts.
22//!
23//! ## Metric tag values (on-disk contract)
24//!
25//! - `0` — Cosine
26//! - `1` — DotProduct
27//! - `2` — Euclidean
28//! - `3` — Manhattan
29//! - `4` — Hamming
30//!
31//! These values are part of the on-disk format contract. Once snapshot
32//! files exist on disk with a given tag → metric mapping, the mapping
33//! cannot change without a format-version bump.
34
35use std::io::{Read, Write};
36
37use iqdb_types::DistanceMetric;
38
39use crate::error::{PersistError, Result};
40
/// Eight-byte signature found at offset 0 of every iqdb snapshot file.
///
/// [`read_header`] rejects any input whose first eight bytes differ from
/// this value.
///
/// # Examples
///
/// ```
/// assert_eq!(&iqdb_persist::MAGIC, b"IQDBPRST");
/// ```
pub const MAGIC: [u8; 8] = [b'I', b'Q', b'D', b'B', b'P', b'R', b'S', b'T'];
49
/// Format version that this build produces when it writes a snapshot.
///
/// History of the on-disk format:
///
/// - Version `1` (v0.2–v0.3): the payload is stored verbatim.
/// - Version `2` (v0.4+): the payload region starts with a compression
///   preamble. Version-1 files are still read, and are treated as
///   uncompressed.
///
/// **The format is frozen as of v0.5.** Any later change has to arrive as a
/// version bump, never as a silent reinterpretation of existing bytes.
///
/// # Examples
///
/// ```
/// assert_eq!(iqdb_persist::CURRENT_VERSION, 2);
/// ```
pub const CURRENT_VERSION: u32 = 2;
64
/// Lowest on-disk format version this build still accepts when reading.
///
/// Together with [`CURRENT_VERSION`] it bounds the inclusive range of
/// versions that [`read_header`] lets through; a version outside that range
/// is rejected with [`PersistError::UnsupportedVersion`].
pub(crate) const MIN_SUPPORTED_VERSION: u32 = 1;
67
68/// The header at the start of every iqdb snapshot file.
69///
70/// The on-disk representation is fixed-width little-endian — see the
71/// module-level docs for the byte-level layout. The Rust struct stores
72/// `dim` and `n_vectors` as `usize` for ergonomic in-memory use; the
73/// reader and writer convert to/from `u64` at the wire boundary.
74///
75/// `crc32` is the CRC32 of the **payload bytes only** — it does not
76/// cover the header.
77///
78/// # Examples
79///
80/// ```
81/// use iqdb_persist::{FileHeader, CURRENT_VERSION, MAGIC};
82/// use iqdb_types::DistanceMetric;
83///
84/// let header = FileHeader {
85/// magic: MAGIC,
86/// version: CURRENT_VERSION,
87/// index_type: "flat".to_string(),
88/// dim: 128,
89/// metric: DistanceMetric::Cosine,
90/// n_vectors: 1_000,
91/// crc32: 0xDEADBEEF,
92/// };
93/// assert_eq!(header.index_type, "flat");
94/// ```
95#[derive(Debug, Clone, PartialEq, Eq)]
96pub struct FileHeader {
97 /// Magic bytes — always equal to [`MAGIC`].
98 pub magic: [u8; 8],
99 /// On-disk format version. The reader accepts any version in
100 /// `MIN_SUPPORTED_VERSION..=CURRENT_VERSION` and records which one it
101 /// read here, so the payload can be decoded per that version.
102 pub version: u32,
103 /// Stable index-type tag — matched against
104 /// [`crate::Persistable::INDEX_TYPE`] on load.
105 pub index_type: String,
106 /// Dimensionality of the vectors stored in the payload.
107 pub dim: usize,
108 /// Distance metric the index was built for.
109 pub metric: DistanceMetric,
110 /// Number of vectors stored in the payload.
111 pub n_vectors: usize,
112 /// CRC32 of the payload bytes (not of the header).
113 pub crc32: u32,
114}
115
116/// Convert a [`DistanceMetric`] to its stable on-disk tag byte.
117///
118/// `DistanceMetric` is `#[non_exhaustive]`; a metric this build of
119/// `iqdb-persist` predates has no assigned tag and yields
120/// [`PersistError::UnsupportedMetric`] rather than a silently-wrong byte.
121pub(crate) fn metric_to_tag(metric: DistanceMetric) -> Result<u8> {
122 Ok(match metric {
123 DistanceMetric::Cosine => 0,
124 DistanceMetric::DotProduct => 1,
125 DistanceMetric::Euclidean => 2,
126 DistanceMetric::Manhattan => 3,
127 DistanceMetric::Hamming => 4,
128 _ => return Err(PersistError::UnsupportedMetric { metric }),
129 })
130}
131
132/// Convert an on-disk tag byte back to a [`DistanceMetric`].
133///
134/// Returns [`PersistError::InvalidMetric`] for any value not in `0..=4`.
135pub(crate) fn tag_to_metric(tag: u8) -> Result<DistanceMetric> {
136 match tag {
137 0 => Ok(DistanceMetric::Cosine),
138 1 => Ok(DistanceMetric::DotProduct),
139 2 => Ok(DistanceMetric::Euclidean),
140 3 => Ok(DistanceMetric::Manhattan),
141 4 => Ok(DistanceMetric::Hamming),
142 _ => Err(PersistError::InvalidMetric { tag }),
143 }
144}
145
146fn usize_to_u64(value: usize, what: &'static str) -> Result<u64> {
147 u64::try_from(value).map_err(|_| PersistError::InvalidPayload {
148 reason: match what {
149 "dim" => "dim does not fit in u64",
150 "n_vectors" => "n_vectors does not fit in u64",
151 "index_type_len" => "index_type length does not fit in u64",
152 _ => "usize value does not fit in u64",
153 },
154 })
155}
156
157fn u64_to_usize(value: u64, what: &'static str) -> Result<usize> {
158 usize::try_from(value).map_err(|_| PersistError::InvalidPayload {
159 reason: match what {
160 "dim" => "dim does not fit in usize on this host",
161 "n_vectors" => "n_vectors does not fit in usize on this host",
162 "index_type_len" => "index_type length does not fit in usize on this host",
163 _ => "u64 value does not fit in usize on this host",
164 },
165 })
166}
167
168/// Write a [`FileHeader`] to `writer` in the fixed-width little-endian
169/// wire format.
170///
171/// # Errors
172///
173/// Returns [`PersistError::Io`] if a write fails, or
174/// [`PersistError::InvalidPayload`] if a `usize` field does not fit in
175/// `u64`.
176///
177/// # Examples
178///
179/// ```
180/// use std::io::Cursor;
181///
182/// use iqdb_persist::format::{read_header, write_header};
183/// use iqdb_persist::{CURRENT_VERSION, FileHeader, MAGIC};
184/// use iqdb_types::DistanceMetric;
185///
186/// let header = FileHeader {
187/// magic: MAGIC,
188/// version: CURRENT_VERSION,
189/// index_type: "flat".to_string(),
190/// dim: 8,
191/// metric: DistanceMetric::Euclidean,
192/// n_vectors: 3,
193/// crc32: 0,
194/// };
195/// let mut buf = Vec::new();
196/// write_header(&mut buf, &header).unwrap();
197/// let mut cur = Cursor::new(&buf[..]);
198/// let parsed = read_header(&mut cur).unwrap();
199/// assert_eq!(parsed, header);
200/// ```
201pub fn write_header(writer: &mut dyn Write, header: &FileHeader) -> Result<()> {
202 write_all(writer, &header.magic)?;
203 write_all(writer, &header.version.to_le_bytes())?;
204
205 let it_bytes = header.index_type.as_bytes();
206 let it_len = usize_to_u64(it_bytes.len(), "index_type_len")?;
207 write_all(writer, &it_len.to_le_bytes())?;
208 write_all(writer, it_bytes)?;
209
210 let dim_u64 = usize_to_u64(header.dim, "dim")?;
211 write_all(writer, &dim_u64.to_le_bytes())?;
212
213 write_all(writer, &[metric_to_tag(header.metric)?])?;
214
215 let n_u64 = usize_to_u64(header.n_vectors, "n_vectors")?;
216 write_all(writer, &n_u64.to_le_bytes())?;
217
218 write_all(writer, &header.crc32.to_le_bytes())?;
219 Ok(())
220}
221
222/// Read a [`FileHeader`] from `reader` and validate it.
223///
224/// Validation in v0.2:
225///
226/// - `magic` must equal [`MAGIC`] — otherwise
227/// [`PersistError::BadMagic`].
228/// - `version` must be in the supported range (up to [`CURRENT_VERSION`])
229/// — otherwise [`PersistError::UnsupportedVersion`].
230/// - The metric tag must be in the known set — otherwise
231/// [`PersistError::InvalidMetric`].
232///
233/// The `crc32` field is returned as-is — verifying the payload against
234/// it is the caller's responsibility (see [`crate::PersistedIndex`]).
235///
236/// # Errors
237///
238/// See above + [`PersistError::TruncatedHeader`] for truncated reads.
239///
240/// # Examples
241///
242/// See [`write_header`] for a round-trip example.
243pub fn read_header(reader: &mut dyn Read) -> Result<FileHeader> {
244 let mut magic = [0u8; 8];
245 read_exact_or_truncated(reader, &mut magic)?;
246 if magic != MAGIC {
247 return Err(PersistError::BadMagic { found: magic });
248 }
249
250 let mut buf4 = [0u8; 4];
251 read_exact_or_truncated(reader, &mut buf4)?;
252 let version = u32::from_le_bytes(buf4);
253 if !(MIN_SUPPORTED_VERSION..=CURRENT_VERSION).contains(&version) {
254 return Err(PersistError::UnsupportedVersion {
255 found: version,
256 supported: CURRENT_VERSION,
257 });
258 }
259
260 let mut buf8 = [0u8; 8];
261 read_exact_or_truncated(reader, &mut buf8)?;
262 let it_len_u64 = u64::from_le_bytes(buf8);
263 let it_len = u64_to_usize(it_len_u64, "index_type_len")?;
264
265 // Cap the on-disk length so a malicious or corrupted header can't ask us
266 // to allocate gigabytes. 4 KiB is comfortably larger than any plausible
267 // tag ("flat", "hnsw", "ivf-pq", ...).
268 const MAX_INDEX_TYPE_LEN: usize = 4096;
269 if it_len > MAX_INDEX_TYPE_LEN {
270 return Err(PersistError::InvalidPayload {
271 reason: "index_type length exceeds the 4 KiB cap",
272 });
273 }
274 let mut it_bytes = vec![0u8; it_len];
275 read_exact_or_truncated(reader, &mut it_bytes)?;
276 let index_type = String::from_utf8(it_bytes).map_err(|_| PersistError::InvalidPayload {
277 reason: "index_type is not valid UTF-8",
278 })?;
279
280 read_exact_or_truncated(reader, &mut buf8)?;
281 let dim = u64_to_usize(u64::from_le_bytes(buf8), "dim")?;
282
283 let mut metric_buf = [0u8; 1];
284 read_exact_or_truncated(reader, &mut metric_buf)?;
285 let metric = tag_to_metric(metric_buf[0])?;
286
287 read_exact_or_truncated(reader, &mut buf8)?;
288 let n_vectors = u64_to_usize(u64::from_le_bytes(buf8), "n_vectors")?;
289
290 read_exact_or_truncated(reader, &mut buf4)?;
291 let crc32 = u32::from_le_bytes(buf4);
292
293 Ok(FileHeader {
294 magic,
295 version,
296 index_type,
297 dim,
298 metric,
299 n_vectors,
300 crc32,
301 })
302}
303
304fn write_all(writer: &mut dyn Write, bytes: &[u8]) -> Result<()> {
305 // No `path` is available at this layer — callers wrap the writer-
306 // bound error with a meaningful path when one exists.
307 // (PersistedIndex::save writes into a Vec<u8>, so this never fails
308 // in the current flow.)
309 writer.write_all(bytes).map_err(|source| PersistError::Io {
310 path: std::path::PathBuf::new(),
311 source,
312 })
313}
314
315fn read_exact_or_truncated(reader: &mut dyn Read, buf: &mut [u8]) -> Result<()> {
316 match reader.read_exact(buf) {
317 Ok(()) => Ok(()),
318 Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => {
319 Err(PersistError::TruncatedHeader {
320 needed: buf.len(),
321 found: 0,
322 })
323 }
324 Err(source) => Err(PersistError::Io {
325 path: std::path::PathBuf::new(),
326 source,
327 }),
328 }
329}