Skip to main content

luci/storage/
header.rs

1use crate::core::{LuciError, Result};
2
3use crate::storage::block::{BLOCK_SIZE, BlockId, HEADER_SIZE};
4
5/// Magic bytes identifying a Luci index file.
6///
7/// The last two bytes encode a format family marker (`\x00`) and a
8/// non-zero sentinel (`\x01`) to detect truncated files.
9pub const MAGIC: [u8; 8] = *b"LUCI\x00\x00\x00\x01";
10
11/// Current format version. Incremented on breaking format changes.
12///
13/// v3 (2026-06): keyword columns gain a persisted per-block offset index
14/// (`ColumnType::KeywordBlocked = 8`) for O(1) ordinal→string lookup, per
15/// [[optimization-keyword-dict-offset-index]]. The bump is the old-binary
16/// safety net: an old binary maps the unknown `col_type = 8` to `Empty` and
17/// would silently drop the `_id` column, so it must instead reject the file
18/// loudly via the `format_version > FORMAT_VERSION` check. `commit()` stamps
19/// this version on every write, making any new-binary write a one-way upgrade.
20///
21/// v2 (2026-05): adds the per-field vector-index directory to
22/// `MetadataSnapshot` per [[global-vector-indices]]
23/// Alternative B. The HNSW graph for each `dense_vector` field is
24/// stored in its own file extent rather than per-segment.
25pub const FORMAT_VERSION: u32 = 3;
26
27// Byte offsets within the 4 KB header.
28const OFF_MAGIC: usize = 0;
29const OFF_VERSION: usize = 8;
30const OFF_BLOCK_SIZE: usize = 12;
31const OFF_ROOT_A_BLOCK: usize = 16;
32const OFF_ROOT_A_CHECKSUM: usize = 24;
33const OFF_ROOT_B_BLOCK: usize = 32;
34const OFF_ROOT_B_CHECKSUM: usize = 40;
35const OFF_ACTIVE_ROOT: usize = 48;
36
37/// Sentinel value for a root pointer that has never been written.
38const EMPTY_ROOT_BLOCK: u64 = u64::MAX;
39
40/// A root pointer in the file header.
41///
42/// Points to a metadata root block and stores its xxHash (XXH3) checksum
43/// for integrity validation during crash recovery.
44///
45/// See [[architecture-storage-format#Atomic Commit Protocol]].
46#[derive(Clone, Copy, Debug, PartialEq, Eq)]
47pub struct RootPointer {
48    /// Block containing the metadata root, or `None` if this slot has never
49    /// been committed to.
50    pub block_id: Option<BlockId>,
51    /// xxHash (XXH3-64) checksum of the metadata root block contents.
52    pub checksum: u64,
53}
54
55impl RootPointer {
56    /// A root pointer that has never been written.
57    pub const EMPTY: Self = Self {
58        block_id: None,
59        checksum: 0,
60    };
61
62    /// Create a root pointer referencing a metadata block.
63    pub const fn new(block_id: BlockId, checksum: u64) -> Self {
64        Self {
65            block_id: Some(block_id),
66            checksum,
67        }
68    }
69
70    /// Whether this root pointer has been written to.
71    pub const fn is_populated(&self) -> bool {
72        self.block_id.is_some()
73    }
74}
75
76/// Identifies which root pointer is active.
77///
78/// The two-root-pointer scheme alternates between A and B on each commit,
79/// ensuring the inactive root always holds the last known-good state.
80///
81/// See [[architecture-storage-format#Atomic Commit Protocol]].
82#[derive(Clone, Copy, Debug, PartialEq, Eq)]
83pub enum ActiveRoot {
84    A = 0,
85    B = 1,
86}
87
88impl ActiveRoot {
89    /// The other root — the one that will be written next.
90    pub const fn inactive(self) -> Self {
91        match self {
92            Self::A => Self::B,
93            Self::B => Self::A,
94        }
95    }
96}
97
98/// The 4 KB file header for a `.luci` index file.
99///
100/// Contains magic bytes, format version, block size, two root pointers for
101/// atomic commit, and an active root flag. Serializes to exactly
102/// [`HEADER_SIZE`] bytes.
103///
104/// See [[architecture-storage-format#File Layout]].
105#[derive(Clone, Debug, PartialEq, Eq)]
106pub struct FileHeader {
107    pub format_version: u32,
108    pub block_size: u32,
109    pub root_a: RootPointer,
110    pub root_b: RootPointer,
111    pub active_root: ActiveRoot,
112}
113
114impl FileHeader {
115    /// Create a header for a brand-new index file.
116    pub fn new() -> Self {
117        Self {
118            format_version: FORMAT_VERSION,
119            block_size: BLOCK_SIZE,
120            root_a: RootPointer::EMPTY,
121            root_b: RootPointer::EMPTY,
122            active_root: ActiveRoot::A,
123        }
124    }
125
126    /// Test-only constructor that pins `format_version` to an arbitrary value.
127    ///
128    /// `FORMAT_VERSION` is a compile-time `const`, so a test that needs to
129    /// fabricate an older-version header (e.g. to prove `commit()` re-stamps it
130    /// — see `version_stamped_on_any_commit`) cannot change the const; it sets
131    /// the field explicitly here, then serializes via `to_bytes()` rather than
132    /// hand-assembling raw header bytes.
133    #[cfg(test)]
134    pub fn with_format_version(version: u32) -> Self {
135        Self {
136            format_version: version,
137            ..Self::new()
138        }
139    }
140
141    /// The currently active root pointer.
142    pub fn active_root_pointer(&self) -> &RootPointer {
143        match self.active_root {
144            ActiveRoot::A => &self.root_a,
145            ActiveRoot::B => &self.root_b,
146        }
147    }
148
149    /// The inactive root pointer (will be overwritten on next commit).
150    pub fn inactive_root_pointer(&self) -> &RootPointer {
151        match self.active_root {
152            ActiveRoot::A => &self.root_b,
153            ActiveRoot::B => &self.root_a,
154        }
155    }
156
157    /// Prepare the next commit: write new metadata to the inactive root
158    /// pointer and flip the active flag.
159    ///
160    /// Also re-stamps `format_version` to the writing binary's
161    /// [`FORMAT_VERSION`]. A new binary always writes the current on-disk
162    /// formats (e.g. `ColumnType::KeywordBlocked`), so the header must
163    /// advertise the current version even when committing a file created by an
164    /// older binary — otherwise an old binary could later open the upgraded
165    /// file and silently misread the new column layout ([[code-must-not-lie]]).
166    /// This makes any new-binary write a one-way upgrade. The stamp survives
167    /// only because `SingleFileDirectory::commit` is refresh-first (it re-reads
168    /// the on-disk header *before* this flip); see
169    /// [[optimization-keyword-dict-offset-index]] and
170    /// [[project_hnsw_recall_merge_persist_bug]].
171    pub fn commit(&mut self, metadata_block: BlockId, checksum: u64) {
172        let new_root = RootPointer::new(metadata_block, checksum);
173        match self.active_root {
174            ActiveRoot::A => self.root_b = new_root,
175            ActiveRoot::B => self.root_a = new_root,
176        }
177        self.active_root = self.active_root.inactive();
178        self.format_version = FORMAT_VERSION;
179    }
180
181    /// Serialize the header to a 4 KB byte buffer.
182    pub fn to_bytes(&self) -> [u8; HEADER_SIZE as usize] {
183        let mut buf = [0u8; HEADER_SIZE as usize];
184
185        buf[OFF_MAGIC..OFF_MAGIC + 8].copy_from_slice(&MAGIC);
186        buf[OFF_VERSION..OFF_VERSION + 4].copy_from_slice(&self.format_version.to_le_bytes());
187        buf[OFF_BLOCK_SIZE..OFF_BLOCK_SIZE + 4].copy_from_slice(&self.block_size.to_le_bytes());
188
189        // Root A
190        let root_a_block = self
191            .root_a
192            .block_id
193            .map_or(EMPTY_ROOT_BLOCK, |b| b.as_u64());
194        buf[OFF_ROOT_A_BLOCK..OFF_ROOT_A_BLOCK + 8].copy_from_slice(&root_a_block.to_le_bytes());
195        buf[OFF_ROOT_A_CHECKSUM..OFF_ROOT_A_CHECKSUM + 8]
196            .copy_from_slice(&self.root_a.checksum.to_le_bytes());
197
198        // Root B
199        let root_b_block = self
200            .root_b
201            .block_id
202            .map_or(EMPTY_ROOT_BLOCK, |b| b.as_u64());
203        buf[OFF_ROOT_B_BLOCK..OFF_ROOT_B_BLOCK + 8].copy_from_slice(&root_b_block.to_le_bytes());
204        buf[OFF_ROOT_B_CHECKSUM..OFF_ROOT_B_CHECKSUM + 8]
205            .copy_from_slice(&self.root_b.checksum.to_le_bytes());
206
207        // Active root flag
208        buf[OFF_ACTIVE_ROOT] = self.active_root as u8;
209
210        buf
211    }
212
213    /// Deserialize a header from a 4 KB byte buffer.
214    ///
215    /// # Errors
216    ///
217    /// Returns `LuciError::IndexCorrupted` if the magic bytes are wrong,
218    /// the format version is unsupported, or the block size doesn't match.
219    pub fn from_bytes(buf: &[u8; HEADER_SIZE as usize]) -> Result<Self> {
220        // Validate magic.
221        if buf[OFF_MAGIC..OFF_MAGIC + 8] != MAGIC {
222            return Err(LuciError::IndexCorrupted(
223                "invalid magic bytes — not a Luci index file".into(),
224            ));
225        }
226
227        let format_version =
228            u32::from_le_bytes(buf[OFF_VERSION..OFF_VERSION + 4].try_into().unwrap());
229        if format_version > FORMAT_VERSION {
230            return Err(LuciError::IndexCorrupted(format!(
231                "unsupported format version {format_version} (max supported: {FORMAT_VERSION})"
232            )));
233        }
234
235        let block_size =
236            u32::from_le_bytes(buf[OFF_BLOCK_SIZE..OFF_BLOCK_SIZE + 4].try_into().unwrap());
237        if block_size != BLOCK_SIZE {
238            return Err(LuciError::IndexCorrupted(format!(
239                "unexpected block size {block_size} (expected {BLOCK_SIZE})"
240            )));
241        }
242
243        let root_a = read_root_pointer(buf, OFF_ROOT_A_BLOCK, OFF_ROOT_A_CHECKSUM);
244        let root_b = read_root_pointer(buf, OFF_ROOT_B_BLOCK, OFF_ROOT_B_CHECKSUM);
245
246        let active_root = match buf[OFF_ACTIVE_ROOT] {
247            0 => ActiveRoot::A,
248            1 => ActiveRoot::B,
249            other => {
250                return Err(LuciError::IndexCorrupted(format!(
251                    "invalid active root flag: {other}"
252                )));
253            }
254        };
255
256        Ok(Self {
257            format_version,
258            block_size,
259            root_a,
260            root_b,
261            active_root,
262        })
263    }
264}
265
266/// Compute the xxHash (XXH3-64) checksum of a metadata root block.
267///
268/// See [[architecture-storage-format#Checksum Algorithm]].
269pub fn xxh3_checksum(data: &[u8]) -> u64 {
270    xxhash_rust::xxh3::xxh3_64(data)
271}
272
273fn read_root_pointer(buf: &[u8], block_off: usize, checksum_off: usize) -> RootPointer {
274    let raw_block = u64::from_le_bytes(buf[block_off..block_off + 8].try_into().unwrap());
275    let checksum = u64::from_le_bytes(buf[checksum_off..checksum_off + 8].try_into().unwrap());
276
277    if raw_block == EMPTY_ROOT_BLOCK {
278        RootPointer::EMPTY
279    } else {
280        RootPointer::new(BlockId(raw_block), checksum)
281    }
282}
283
284#[cfg(test)]
285mod tests {
286    use super::*;
287
288    #[test]
289    fn new_header_has_empty_roots() {
290        let h = FileHeader::new();
291        assert_eq!(h.format_version, FORMAT_VERSION);
292        assert_eq!(h.block_size, BLOCK_SIZE);
293        assert_eq!(h.root_a, RootPointer::EMPTY);
294        assert_eq!(h.root_b, RootPointer::EMPTY);
295        assert_eq!(h.active_root, ActiveRoot::A);
296    }
297
298    #[test]
299    fn round_trip_fresh_header() {
300        let h = FileHeader::new();
301        let bytes = h.to_bytes();
302        let h2 = FileHeader::from_bytes(&bytes).unwrap();
303        assert_eq!(h, h2);
304    }
305
306    #[test]
307    fn round_trip_with_populated_roots() {
308        let mut h = FileHeader::new();
309        h.root_a = RootPointer::new(BlockId(42), 0xDEAD_BEEF);
310        h.root_b = RootPointer::new(BlockId(99), 0xCAFE_BABE);
311        h.active_root = ActiveRoot::B;
312
313        let bytes = h.to_bytes();
314        let h2 = FileHeader::from_bytes(&bytes).unwrap();
315        assert_eq!(h, h2);
316    }
317
318    #[test]
319    fn commit_flips_active_root() {
320        let mut h = FileHeader::new();
321        assert_eq!(h.active_root, ActiveRoot::A);
322
323        // First commit writes to root B and makes it active.
324        h.commit(BlockId(5), 0x1111);
325        assert_eq!(h.active_root, ActiveRoot::B);
326        assert_eq!(h.root_b, RootPointer::new(BlockId(5), 0x1111));
327        assert_eq!(h.root_a, RootPointer::EMPTY);
328
329        // Second commit writes to root A and makes it active.
330        h.commit(BlockId(10), 0x2222);
331        assert_eq!(h.active_root, ActiveRoot::A);
332        assert_eq!(h.root_a, RootPointer::new(BlockId(10), 0x2222));
333        // Root B unchanged from first commit.
334        assert_eq!(h.root_b, RootPointer::new(BlockId(5), 0x1111));
335    }
336
337    #[test]
338    fn active_and_inactive_root_pointers() {
339        let mut h = FileHeader::new();
340        h.commit(BlockId(5), 0x1111);
341        // Active is now B.
342        assert_eq!(
343            *h.active_root_pointer(),
344            RootPointer::new(BlockId(5), 0x1111)
345        );
346        assert_eq!(*h.inactive_root_pointer(), RootPointer::EMPTY);
347    }
348
349    #[test]
350    fn bad_magic_is_rejected() {
351        let mut bytes = FileHeader::new().to_bytes();
352        bytes[0] = b'X';
353        let err = FileHeader::from_bytes(&bytes).unwrap_err();
354        assert!(format!("{err}").contains("magic"));
355    }
356
357    #[test]
358    fn future_version_is_rejected() {
359        let mut h = FileHeader::new();
360        h.format_version = FORMAT_VERSION + 1;
361        let bytes = h.to_bytes();
362        let err = FileHeader::from_bytes(&bytes).unwrap_err();
363        assert!(format!("{err}").contains("version"));
364    }
365
366    #[test]
367    fn wrong_block_size_is_rejected() {
368        let mut bytes = FileHeader::new().to_bytes();
369        // Overwrite block_size field with a different value.
370        bytes[OFF_BLOCK_SIZE..OFF_BLOCK_SIZE + 4].copy_from_slice(&(128 * 1024u32).to_le_bytes());
371        let err = FileHeader::from_bytes(&bytes).unwrap_err();
372        assert!(format!("{err}").contains("block size"));
373    }
374
375    #[test]
376    fn invalid_active_root_flag_is_rejected() {
377        let mut bytes = FileHeader::new().to_bytes();
378        bytes[OFF_ACTIVE_ROOT] = 2;
379        let err = FileHeader::from_bytes(&bytes).unwrap_err();
380        assert!(format!("{err}").contains("active root"));
381    }
382
383    #[test]
384    fn header_is_exactly_4kb() {
385        let bytes = FileHeader::new().to_bytes();
386        assert_eq!(bytes.len(), 4096);
387    }
388
389    #[test]
390    fn active_root_inactive_is_inverse() {
391        assert_eq!(ActiveRoot::A.inactive(), ActiveRoot::B);
392        assert_eq!(ActiveRoot::B.inactive(), ActiveRoot::A);
393    }
394
395    #[test]
396    fn root_pointer_empty_is_not_populated() {
397        assert!(!RootPointer::EMPTY.is_populated());
398    }
399
400    #[test]
401    fn root_pointer_with_block_is_populated() {
402        let rp = RootPointer::new(BlockId(0), 0);
403        assert!(rp.is_populated());
404    }
405
406    #[test]
407    fn xxh3_checksum_is_deterministic() {
408        let data = b"hello luci";
409        let c1 = xxh3_checksum(data);
410        let c2 = xxh3_checksum(data);
411        assert_eq!(c1, c2);
412        assert_ne!(c1, 0); // extremely unlikely to be zero
413    }
414
415    #[test]
416    fn xxh3_checksum_differs_for_different_data() {
417        let c1 = xxh3_checksum(b"block A");
418        let c2 = xxh3_checksum(b"block B");
419        assert_ne!(c1, c2);
420    }
421}