firewood_storage/nodestore/header.rs
1// Copyright (C) 2023, Ava Labs, Inc. All rights reserved.
2// See the file LICENSE.md for licensing terms.
3
4//! # Header Module
5//!
6//! This module defines the nodestore header structure and validation logic for ensuring
7//! database compatibility across different versions and configurations.
8//!
9//! ## Header Structure
10//!
11//! The `NodeStoreHeader` is stored at the beginning of every nodestore file and contains:
12//!
13//! - **Version String** - Human-readable firewood version (e.g., "firewood 0.1.0")
14//! - **Endianness Test** - Detects byte order mismatches between platforms
15//! - **Root Address** - Points to the merkle trie root node (if any)
16//! - **Storage Size** - Total allocated storage space
17//! - **Free Lists** - Array of free space linked list heads for each area size
18//!
19//! ## Storage Layout
20//!
21//! The header occupies the first 2048 bytes of storage:
22//! - Fixed size for alignment with disk block boundaries
23//! - Zero-padded to full size for consistent layout
24//! - Uses C-compatible representation for cross-language access
25//!
26
27use bytemuck_derive::{Pod, Zeroable};
28use std::io::{Error, ErrorKind, Read};
29
30use super::alloc::FreeLists;
31use super::primitives::{LinearAddress, area_size_hash};
32use crate::logger::{debug, trace};
33use crate::{NodeHashAlgorithm, TrieHash};
34
/// A tuple indicating the address and hash of a node (the root node).
///
/// The `LinearAddress` is the on-disk location of the root node and the
/// `TrieHash` is its merkle hash.
pub type RootNodeInfo = (LinearAddress, TrieHash);
37
/// Can be used by filesystem tooling such as "file" to identify the version of
/// firewood used to create this `NodeStore` file.
///
/// From firewood v0.0.4 to sometime between v0.0.18, this was the 16 bytes
/// "firewood 0.0.x" followed by enough NUL bytes to pad to 16 bytes. Between
/// v0.0.18 and v0.1.0, the string was changed to the literal "firewood-v1"
/// plus padding. The version number in the string will stay fixed at `v1` and
/// will be updated independently from the package version in the event of a
/// major data format change.
///
/// The firewood version information is now stored in a different
/// location in the header. The fixed string prevents issues where the version
/// string length may overflow the allocated space, truncating the string and
/// losing relevant information.
///
/// Validation will continue to accept the old format for compatibility with
/// existing databases. This change is purely cosmetic and does not affect any
/// other functionality or data structures.
///
/// When a major data format is changed, this string should be updated to
/// indicate the new major version.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(transparent)]
pub struct Version {
    // Raw version string, NUL-padded to the full 16-byte width.
    bytes: [u8; 16],
}
63
64impl Version {
65    /// Construct a [`Version`] instance for the current build of firewood.
66    pub const fn new() -> Self {
67        // NB: const block prevents indexing into the array at runtime
68        const { Self::VALID_V1_VERSIONS[0] }
69    }
70
71    /// Validates that the version identifier is valid and compatible with the current
72    /// build of firewood.
73    ///
74    /// # Errors
75    ///
76    /// Returns an error if the version is not recognized as one of the known
77    /// valid versions.
78    pub fn validate(self) -> Result<(), Error> {
79        if Self::VALID_V1_VERSIONS.contains(&self) {
80            debug!("Database version {:?} is valid.", self.as_str());
81            Ok(())
82        } else {
83            Err(Error::new(
84                ErrorKind::InvalidData,
85                format!(
86                    "Database cannot be opened due to incompatible version: {:?}",
87                    self.as_str()
88                ),
89            ))
90        }
91    }
92
93    /// Returns the version string as a `&str`, trimming any trailing null bytes.
94    pub fn as_str(&self) -> &str {
95        std::str::from_utf8(&self.bytes)
96            .unwrap_or("<invalid utf8>")
97            .trim_matches('\0')
98    }
99
100    /// Returns a u128 representation of the version bytes.
101    ///
102    /// This is useful for comparisons and hashing as we can use integer
103    /// operations which are more efficient than byte-wise operations.
104    pub const fn as_u128(self) -> u128 {
105        u128::from_ne_bytes(self.bytes)
106    }
107
108    const fn is_firewood_v1(self) -> bool {
109        self.as_u128() == const { Self::VALID_V1_VERSIONS[0].as_u128() }
110    }
111
112    const fn from_static(bytes: &'static [u8; 16]) -> Self {
113        Self { bytes: *bytes }
114    }
115
116    /// After making the version a static string, there is no need to use semver
117    /// to parse the valid version strings since there are a finite set of valid
118    /// strings.
119    const VALID_V1_VERSIONS: [Version; 16] = [
120        Version::from_static(b"firewood-v1\0\0\0\0\0"),
121        Version::from_static(b"firewood 0.0.18\0"),
122        Version::from_static(b"firewood 0.0.17\0"),
123        Version::from_static(b"firewood 0.0.16\0"),
124        Version::from_static(b"firewood 0.0.15\0"),
125        Version::from_static(b"firewood 0.0.14\0"),
126        Version::from_static(b"firewood 0.0.13\0"),
127        Version::from_static(b"firewood 0.0.12\0"),
128        Version::from_static(b"firewood 0.0.11\0"),
129        Version::from_static(b"firewood 0.0.10\0"),
130        Version::from_static(b"firewood 0.0.9\0\0"),
131        Version::from_static(b"firewood 0.0.8\0\0"),
132        Version::from_static(b"firewood 0.0.7\0\0"),
133        Version::from_static(b"firewood 0.0.6\0\0"),
134        Version::from_static(b"firewood 0.0.5\0\0"),
135        Version::from_static(b"firewood 0.0.4\0\0"),
136    ];
137}
138
139impl Default for Version {
140    fn default() -> Self {
141        Self::new()
142    }
143}
144
/// The cargo version used to create this database. The field is padded to 32
/// bytes with null bytes.
///
/// This is the value of the `CARGO_PKG_VERSION` environment variable set by
/// cargo at build time.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(transparent)]
pub struct CargoVersion {
    // UTF-8 version string, NUL-padded to 32 bytes.
    bytes: [u8; 32],
}
155
156// assert at compile time that the version fits in the allocated space
157const _: () = assert!(CargoVersion::CARGO_PKG_VERSION_LEN <= 32);
158
159impl CargoVersion {
160    const CARGO_PKG_VERSION: &'static str = env!("CARGO_PKG_VERSION");
161    const CARGO_PKG_VERSION_LEN: usize = Self::CARGO_PKG_VERSION.len();
162
163    // craft in constant context to avoid runtime `memcpy` call for `new()`
164    const INSTANCE: Self = {
165        let mut bytes = [0u8; 32];
166        const_copy(Self::CARGO_PKG_VERSION.as_bytes(), &mut bytes);
167        Self { bytes }
168    };
169
170    #[inline]
171    pub fn len(&self) -> usize {
172        self.bytes
173            .iter()
174            .position(|&b| b == 0)
175            .unwrap_or(self.bytes.len())
176    }
177
178    pub const fn is_empty(&self) -> bool {
179        self.bytes[0] == 0
180    }
181
182    pub fn as_str(&self) -> std::borrow::Cow<'_, str> {
183        // will not allocate for properly sourced values
184        #[expect(clippy::indexing_slicing, reason = "len() ensures we stay in bounds")]
185        String::from_utf8_lossy(&self.bytes[..self.len()])
186    }
187}
188
/// The output of `git describe --always --dirty=-modified` captured at build
/// time, padded to 64 bytes with null bytes.
///
/// Empty when no git information was available at build time (e.g. when the
/// crate is built from crates.io rather than the firewood repository).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(transparent)]
pub struct GitDescribe {
    // UTF-8 `git describe` output, NUL-padded to 64 bytes.
    bytes: [u8; 64],
}
194
195// assert at compile time that the output fits in the allocated space
196const _: () = assert!(GitDescribe::GIT_DESCRIBE_LEN <= 64);
197
198impl GitDescribe {
199    const GIT_DESCRIBE: &'static str = git_version::git_version!(fallback = "");
200    const GIT_DESCRIBE_LEN: usize = Self::GIT_DESCRIBE.len();
201
202    // craft in constant context to avoid runtime `memcpy` call for `new()`
203    const INSTANCE: Self = {
204        let mut bytes = [0u8; 64];
205        const_copy(Self::GIT_DESCRIBE.as_bytes(), &mut bytes);
206        Self { bytes }
207    };
208
209    #[inline]
210    pub fn len(&self) -> usize {
211        self.bytes
212            .iter()
213            .position(|&b| b == 0)
214            .unwrap_or(self.bytes.len())
215    }
216
217    pub const fn is_empty(&self) -> bool {
218        self.bytes[0] == 0
219    }
220
221    pub fn as_str(&self) -> std::borrow::Cow<'_, str> {
222        // will not allocate for properly sourced values
223        #[expect(clippy::indexing_slicing, reason = "len() ensures we stay in bounds")]
224        String::from_utf8_lossy(&self.bytes[..self.len()])
225    }
226}
227
/// Persisted metadata for a `NodeStore`.
/// The [`NodeStoreHeader`] is at the start of the `ReadableStorage`.
///
/// `root_hash`, `cargo_version` and `git_describe` were added between v0.0.18
/// and v0.1.0. If `version` is not equal to `firewood-v1`, this field may contain
/// "uninitialized" data and must be ignored. Uninitialized data in this context
/// means whatever the filesystem returns when reading from the sparse region of
/// the file where there previously was no data written. This does not mean
/// uninitialized memory with respect to memory safety, but does mean that the
/// data may be garbage.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(C)]
pub struct NodeStoreHeader {
    /// Identifies the version of the data format written to this `NodeStore`.
    version: Version,
    /// always "1"; verifies endianness
    endian_test: u64,
    /// Total allocated storage size (high water mark). New nodes are allocated
    /// at this offset when the free list is empty, then this value is incremented.
    size: u64,
    /// Heads of the free lists for each area size class.
    ///
    /// Element `i` points to the first free area of size `AREA_SIZES[i]`. Each free area
    /// contains a pointer to the next free area of the same size, forming a linked list.
    ///
    /// When allocating a new node, the allocator first checks the appropriate free list.
    /// If empty, it falls back to bumping `size`. Free lists are populated at commit time
    /// during revision reaping, when deleted nodes from evicted revisions are reclaimed.
    free_lists: FreeLists,
    /// Disk address of the merkle trie root node, or `None` for an empty trie.
    ///
    /// This is updated at commit time to point to the new root after changes are persisted.
    /// The address is a direct file offset, not a hash-based reference.
    root_address: Option<LinearAddress>,
    /// The hash of the area sizes used in this database to prevent someone from changing the
    /// area sizes and trying to read old databases with the wrong area sizes.
    area_size_hash: [u8; 32],
    /// The `NodeHashAlgorithm` used by this database, stored as its `u64`
    /// value. Validated on open against the algorithm the storage expects.
    /// (Historically this recorded whether ethhash was enabled.)
    node_hash_algorithm: u64,
    /// The merkle root hash of the entire database when it was last committed.
    ///
    /// The value is meaningful only if `root_address` is `Some`.
    root_hash: [u8; 32],
    /// The cargo version used to create this database.
    ///
    /// This is the value of the `CARGO_PKG_VERSION` environment variable set by
    /// cargo at build time. It is guaranteed to always be set if `version` is
    /// equal to `firewood-v1`.
    ///
    /// The field is padded to 32 bytes with null bytes.
    cargo_version: CargoVersion,
    /// The git hash of the local repository used to create this database. This
    /// may include a "-modified" suffix if there were uncommitted changes at
    /// the time of building.
    ///
    /// If available, this is the output of `git describe --always --dirty=-modified`
    /// at the time of build. Otherwise this will be an empty string.
    ///
    /// Note that this only applies when building from within the `firewood`
    /// repository. Users importing `firewood-storage` from `crates.io` will
    /// always get an empty string here.
    ///
    /// The field is padded to 64 bytes with null bytes.
    ///
    /// Example: `v0.0.18-31-ga6909f32f-modified`
    ///
    /// This field was added between v0.0.18 and v0.1.0. If `version` is not
    /// equal to `firewood-v1`, this field may contain uninitialized data and
    /// must be ignored.
    git_describe: GitDescribe,
}
299
300// Compile-time assertion that SIZE is large enough for the header. Does not work
301// within the impl block.
302const _: () = assert!(size_of::<NodeStoreHeader>() <= NodeStoreHeader::SIZE as usize);
303
304impl NodeStoreHeader {
305    /// The first SIZE bytes of the `ReadableStorage` are reserved for the
306    /// [`NodeStoreHeader`].
307    /// We also want it aligned to a disk block
308    pub const SIZE: u64 = 2048;
309
310    /// Reads the [`NodeStoreHeader`] from the start of the given storage.
311    ///
312    /// # Errors
313    ///
314    /// Returns an error if unable to read the header from storage or if there's
315    /// a node hash algorithm mismatch.
316    pub fn read_from_storage<S: crate::linear::ReadableStorage>(
317        storage: &S,
318    ) -> Result<Self, crate::FileIoError> {
319        // TODO(#1088): remove this after implementing runtime selection of hash algorithms
320        storage.node_hash_algorithm().validate_init().map_err(|e| {
321            storage.file_io_error(e, 0, Some("NodeHashAlgorithm::validate_init".to_string()))
322        })?;
323
324        let mut this = bytemuck::zeroed::<Self>();
325        let header_bytes = bytemuck::bytes_of_mut(&mut this);
326        storage
327            .stream_from(0)?
328            .read_exact(header_bytes)
329            .map_err(|e| {
330                storage.file_io_error(e, 0, Some("NodeStoreHeader::read_from_storage".to_string()))
331            })?;
332        this.validate(storage.node_hash_algorithm()).map_err(|e| {
333            storage.file_io_error(e, 0, Some("NodeStoreHeader::validate".to_string()))
334        })?;
335
336        Ok(this)
337    }
338
339    /// Creates a new header with default values and no root address.
340    #[must_use]
341    pub fn new(node_hash_algorithm: NodeHashAlgorithm) -> Self {
342        Self {
343            // The store just contains the header at this point
344            size: Self::SIZE,
345            endian_test: 1,
346            root_address: None,
347            version: Version::new(),
348            free_lists: Default::default(),
349            area_size_hash: area_size_hash().into(),
350            node_hash_algorithm: node_hash_algorithm as u64,
351            root_hash: TrieHash::empty().into(),
352            cargo_version: CargoVersion::INSTANCE,
353            git_describe: GitDescribe::INSTANCE,
354        }
355    }
356
357    /// Validates the header fields to ensure correctness and compatibility with
358    /// the current build of firewood.
359    ///
360    /// Checks performed:
361    ///
362    /// - Magic number / version string is valid. The first 16 bytes must match
363    ///   an expected magic number.
364    /// - Endianness test matches expected value (1). If not, reading from the
365    ///   database will produce incorrect results from mis-interpreted byte order.
366    /// - Area size hash matches the expected hash for the current build. This
367    ///   prevents corrupting the allocation structures by changing area sizes.
368    /// - Node hash algorithm flag matches the expected algorithm for this
369    ///   storage.
370    ///
371    /// # Errors
372    ///
373    /// Returns an error if validation fails.
374    pub fn validate(&self, expected_node_hash_algorithm: NodeHashAlgorithm) -> Result<(), Error> {
375        trace!("Checking version...");
376        self.version.validate()?;
377
378        trace!("Checking endianness...");
379        self.validate_endian_test()?;
380
381        trace!("Checking area size hash...");
382        self.validate_area_size_hash()?;
383
384        trace!("Checking if node hash algorithm flag matches storage...");
385        self.validate_node_hash_algorithm(expected_node_hash_algorithm)?;
386
387        if self.version == Version::VALID_V1_VERSIONS[0] {
388            debug!(
389                "Database was created with firewood version {}",
390                self.firewood_version_str()
391                    .as_deref()
392                    .unwrap_or("<unknown>"),
393            );
394        }
395
396        Ok(())
397    }
398
399    /// Get the size of the nodestore
400    #[must_use]
401    pub const fn size(&self) -> u64 {
402        self.size
403    }
404
405    /// Set the size of the nodestore
406    pub const fn set_size(&mut self, size: u64) {
407        self.size = size;
408    }
409
410    /// Get the free lists
411    #[must_use]
412    pub const fn free_lists(&self) -> &FreeLists {
413        &self.free_lists
414    }
415
416    /// Get mutable access to the free lists
417    pub const fn free_lists_mut(&mut self) -> &mut FreeLists {
418        &mut self.free_lists
419    }
420
421    /// Get the root address
422    #[must_use]
423    pub const fn root_address(&self) -> Option<LinearAddress> {
424        self.root_address
425    }
426
427    /// Get the root hash.
428    ///
429    /// This is None if the database was created before v0.1.0.
430    #[must_use]
431    pub(crate) fn root_hash(&self) -> Option<TrieHash> {
432        if self.version.is_firewood_v1() && self.root_address.is_some() {
433            Some(TrieHash::from(self.root_hash))
434        } else {
435            None
436        }
437    }
438
439    /// Update the root location, both address and hash.
440    ///
441    /// Note that this does not overwrite the version field, so it is possible
442    /// the root hash will be ignored when set.
443    pub fn set_root_location(&mut self, root_location: Option<RootNodeInfo>) {
444        let (root_address, root_hash) = root_location.unzip();
445        self.root_address = root_address;
446        self.root_hash = root_hash.unwrap_or_else(TrieHash::empty).into();
447    }
448
449    /// Get the offset of the `free_lists` field for use with `offset_of`!
450    #[must_use]
451    pub const fn free_lists_offset() -> u64 {
452        std::mem::offset_of!(NodeStoreHeader, free_lists) as u64
453    }
454
455    /// Get the cargo version of `firewood-storage` used to create this database,
456    /// if available.
457    #[must_use]
458    pub const fn cargo_version(&self) -> Option<&CargoVersion> {
459        if self.version.is_firewood_v1() {
460            Some(&self.cargo_version)
461        } else {
462            None
463        }
464    }
465
466    /// Get the git describe string of `firewood` used to create this database,
467    #[must_use]
468    pub const fn git_describe(&self) -> Option<&GitDescribe> {
469        if self.version.is_firewood_v1() {
470            Some(&self.git_describe)
471        } else {
472            None
473        }
474    }
475
476    /// Returns a version string identifying the version of `firewood-storage`
477    /// used to create this database, if available.
478    ///
479    /// This field was added between v0.0.18 and v0.1.0 and may be absent in
480    /// older databases.
481    ///
482    /// The returned string is either the `git describe` output (from the time
483    /// of build) if available, otherwise the cargo package version of the
484    /// `firewood-storage` crate.
485    pub(crate) fn firewood_version_str(&self) -> Option<std::borrow::Cow<'_, str>> {
486        self.git_describe()
487            .map(GitDescribe::as_str)
488            .or_else(|| self.cargo_version().map(CargoVersion::as_str))
489    }
490
491    fn validate_endian_test(&self) -> Result<(), Error> {
492        if self.endian_test == 1 {
493            Ok(())
494        } else {
495            Err(Error::new(
496                ErrorKind::InvalidData,
497                "Database cannot be opened due to difference in endianness",
498            ))
499        }
500    }
501
502    fn validate_area_size_hash(&self) -> Result<(), Error> {
503        if self.area_size_hash == area_size_hash().as_slice() {
504            Ok(())
505        } else {
506            Err(Error::new(
507                ErrorKind::InvalidData,
508                "Database cannot be opened due to difference in area size hash",
509            ))
510        }
511    }
512
513    fn validate_node_hash_algorithm(&self, expected: NodeHashAlgorithm) -> Result<(), Error> {
514        expected.validate_init()?;
515        NodeHashAlgorithm::try_from(self.node_hash_algorithm)
516            .map_err(|err| {
517                std::io::Error::new(
518                    std::io::ErrorKind::InvalidData,
519                    format!("hash mode flag in database header is invalid: {err}"),
520                )
521            })?
522            .validate_open(expected)
523    }
524}
525
// `memcpy` usable in const context: copies bytes from the front of `src` into
// the front of `dst`, stopping at the shorter of the two lengths. This works
// around the current limitation that `slice::copy_from_slice` cannot be used
// in const fns (or const{} blocks).
const fn const_copy(src: &[u8], dst: &mut [u8]) {
    #![expect(
        clippy::indexing_slicing,
        clippy::arithmetic_side_effects,
        reason = "const and sufficient bounds checks"
    )]

    // Hoist the bound so the loop condition is a single comparison.
    let limit = if src.len() < dst.len() {
        src.len()
    } else {
        dst.len()
    };
    let mut idx = 0;
    while idx < limit {
        dst[idx] = src[idx];
        idx += 1;
    }
}
541
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// The version identifier baked into the current build must always pass
    /// its own validation.
    #[test]
    fn test_version_new_is_valid() {
        Version::new()
            .validate()
            .expect("Version::new() should always be valid");
    }

    // Unknown magic strings, other products, and versions outside the
    // supported range must all be rejected by `Version::validate`.
    #[test_case(*b"invalid\0\0\0\0\0\0\0\0\0")]
    #[test_case(*b"avalanche 0.1.0\0")]
    #[test_case(*b"firewood 0.0.1\0\0")]
    fn test_invalid_version_strings(bytes: [u8; 16]) {
        assert!(Version { bytes }.validate().is_err());
    }

    /// A freshly created header carries the current version and empty free lists.
    #[test]
    fn test_header_new() {
        let header = NodeStoreHeader::new(NodeHashAlgorithm::compile_option());

        // Check the header is correctly initialized.
        assert_eq!(header.version, Version::new());
        let empty_free_list: FreeLists = Default::default();
        assert_eq!(*header.free_lists(), empty_free_list);
    }
}
570}