// firewood_storage/nodestore/header.rs
1// Copyright (C) 2023, Ava Labs, Inc. All rights reserved.
2// See the file LICENSE.md for licensing terms.
3
4//! # Header Module
5//!
6//! This module defines the nodestore header structure and validation logic for ensuring
7//! database compatibility across different versions and configurations.
8//!
9//! ## Header Structure
10//!
11//! The `NodeStoreHeader` is stored at the beginning of every nodestore file and contains:
12//!
13//! - **Version String** - Human-readable firewood version (e.g., "firewood 0.1.0")
14//! - **Endianness Test** - Detects byte order mismatches between platforms
15//! - **Root Address** - Points to the merkle trie root node (if any)
16//! - **Storage Size** - Total allocated storage space
17//! - **Free Lists** - Array of free space linked list heads for each area size
18//!
19//! ## Storage Layout
20//!
21//! The header occupies the first 2048 bytes of storage:
22//! - Fixed size for alignment with disk block boundaries
23//! - Zero-padded to full size for consistent layout
24//! - Uses C-compatible representation for cross-language access
25//!
26
27use bytemuck_derive::{Pod, Zeroable};
28use std::io::{Error, ErrorKind, Read};
29
30use super::alloc::FreeLists;
31use super::primitives::{LinearAddress, area_size_hash};
32use crate::logger::{debug, trace};
33use crate::{NodeHashAlgorithm, TrieHash};
34
/// A tuple indicating the address and hash of a node (the root node):
/// the on-disk [`LinearAddress`] of the root paired with its [`TrieHash`].
pub type RootNodeInfo = (LinearAddress, TrieHash);
37
/// Can be used by filesystem tooling such as "file" to identify the version of
/// firewood used to create this `NodeStore` file.
///
/// From firewood v0.0.4 to sometime between v0.0.18, this was the string
/// "firewood 0.0.x" followed by enough NUL bytes to pad to 16 bytes. Between
/// v0.0.18 and v0.1.0, the string was changed to the literal "firewood-v1"
/// plus padding. The version number in the string will stay fixed at `v1` and
/// will be updated independently from the package version in the event of a
/// major data format change.
///
/// The firewood version information is now stored in a different location in
/// the header. The fixed string prevents issues where the version string
/// length may overflow the allocated space, truncating the string and losing
/// relevant information.
///
/// Validation will continue to accept the old format for compatibility with
/// existing databases. This change is purely cosmetic and does not affect any
/// other functionality or data structures.
///
/// When a major data format is changed, this string should be updated to
/// indicate the new major version.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(transparent)]
pub struct Version {
    // NUL-padded ASCII magic string; always exactly 16 bytes on disk.
    bytes: [u8; 16],
}
63
64impl Version {
65 /// Construct a [`Version`] instance for the current build of firewood.
66 pub const fn new() -> Self {
67 // NB: const block prevents indexing into the array at runtime
68 const { Self::VALID_V1_VERSIONS[0] }
69 }
70
71 /// Validates that the version identifier is valid and compatible with the current
72 /// build of firewood.
73 ///
74 /// # Errors
75 ///
76 /// Returns an error if the version is not recognized as one of the known
77 /// valid versions.
78 pub fn validate(self) -> Result<(), Error> {
79 if Self::VALID_V1_VERSIONS.contains(&self) {
80 debug!("Database version {:?} is valid.", self.as_str());
81 Ok(())
82 } else {
83 Err(Error::new(
84 ErrorKind::InvalidData,
85 format!(
86 "Database cannot be opened due to incompatible version: {:?}",
87 self.as_str()
88 ),
89 ))
90 }
91 }
92
93 /// Returns the version string as a `&str`, trimming any trailing null bytes.
94 pub fn as_str(&self) -> &str {
95 std::str::from_utf8(&self.bytes)
96 .unwrap_or("<invalid utf8>")
97 .trim_matches('\0')
98 }
99
100 /// Returns a u128 representation of the version bytes.
101 ///
102 /// This is useful for comparisons and hashing as we can use integer
103 /// operations which are more efficient than byte-wise operations.
104 pub const fn as_u128(self) -> u128 {
105 u128::from_ne_bytes(self.bytes)
106 }
107
108 const fn is_firewood_v1(self) -> bool {
109 self.as_u128() == const { Self::VALID_V1_VERSIONS[0].as_u128() }
110 }
111
112 const fn from_static(bytes: &'static [u8; 16]) -> Self {
113 Self { bytes: *bytes }
114 }
115
116 /// After making the version a static string, there is no need to use semver
117 /// to parse the valid version strings since there are a finite set of valid
118 /// strings.
119 const VALID_V1_VERSIONS: [Version; 16] = [
120 Version::from_static(b"firewood-v1\0\0\0\0\0"),
121 Version::from_static(b"firewood 0.0.18\0"),
122 Version::from_static(b"firewood 0.0.17\0"),
123 Version::from_static(b"firewood 0.0.16\0"),
124 Version::from_static(b"firewood 0.0.15\0"),
125 Version::from_static(b"firewood 0.0.14\0"),
126 Version::from_static(b"firewood 0.0.13\0"),
127 Version::from_static(b"firewood 0.0.12\0"),
128 Version::from_static(b"firewood 0.0.11\0"),
129 Version::from_static(b"firewood 0.0.10\0"),
130 Version::from_static(b"firewood 0.0.9\0\0"),
131 Version::from_static(b"firewood 0.0.8\0\0"),
132 Version::from_static(b"firewood 0.0.7\0\0"),
133 Version::from_static(b"firewood 0.0.6\0\0"),
134 Version::from_static(b"firewood 0.0.5\0\0"),
135 Version::from_static(b"firewood 0.0.4\0\0"),
136 ];
137}
138
139impl Default for Version {
140 fn default() -> Self {
141 Self::new()
142 }
143}
144
/// The cargo version used to create this database. The field is padded to 32
/// bytes with null bytes.
///
/// This is the value of the `CARGO_PKG_VERSION` environment variable set by
/// cargo at build time.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(transparent)]
pub struct CargoVersion {
    // NUL-padded UTF-8; the first zero byte (if any) terminates the string.
    bytes: [u8; 32],
}

// assert at compile time that the version fits in the allocated space
const _: () = assert!(CargoVersion::CARGO_PKG_VERSION_LEN <= 32);
158
159impl CargoVersion {
160 const CARGO_PKG_VERSION: &'static str = env!("CARGO_PKG_VERSION");
161 const CARGO_PKG_VERSION_LEN: usize = Self::CARGO_PKG_VERSION.len();
162
163 // craft in constant context to avoid runtime `memcpy` call for `new()`
164 const INSTANCE: Self = {
165 let mut bytes = [0u8; 32];
166 const_copy(Self::CARGO_PKG_VERSION.as_bytes(), &mut bytes);
167 Self { bytes }
168 };
169
170 #[inline]
171 pub fn len(&self) -> usize {
172 self.bytes
173 .iter()
174 .position(|&b| b == 0)
175 .unwrap_or(self.bytes.len())
176 }
177
178 pub const fn is_empty(&self) -> bool {
179 self.bytes[0] == 0
180 }
181
182 pub fn as_str(&self) -> std::borrow::Cow<'_, str> {
183 // will not allocate for properly sourced values
184 #[expect(clippy::indexing_slicing, reason = "len() ensures we stay in bounds")]
185 String::from_utf8_lossy(&self.bytes[..self.len()])
186 }
187}
188
/// The `git describe --always --dirty=-modified` output captured at build
/// time, padded to 64 bytes with null bytes.
///
/// This is empty when the build was not made from within the `firewood` git
/// repository (for example, when the crate is imported from crates.io).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(transparent)]
pub struct GitDescribe {
    // NUL-padded UTF-8; the first zero byte (if any) terminates the string.
    bytes: [u8; 64],
}

// assert at compile time that the output fits in the allocated space
const _: () = assert!(GitDescribe::GIT_DESCRIBE_LEN <= 64);
197
198impl GitDescribe {
199 const GIT_DESCRIBE: &'static str = git_version::git_version!(fallback = "");
200 const GIT_DESCRIBE_LEN: usize = Self::GIT_DESCRIBE.len();
201
202 // craft in constant context to avoid runtime `memcpy` call for `new()`
203 const INSTANCE: Self = {
204 let mut bytes = [0u8; 64];
205 const_copy(Self::GIT_DESCRIBE.as_bytes(), &mut bytes);
206 Self { bytes }
207 };
208
209 #[inline]
210 pub fn len(&self) -> usize {
211 self.bytes
212 .iter()
213 .position(|&b| b == 0)
214 .unwrap_or(self.bytes.len())
215 }
216
217 pub const fn is_empty(&self) -> bool {
218 self.bytes[0] == 0
219 }
220
221 pub fn as_str(&self) -> std::borrow::Cow<'_, str> {
222 // will not allocate for properly sourced values
223 #[expect(clippy::indexing_slicing, reason = "len() ensures we stay in bounds")]
224 String::from_utf8_lossy(&self.bytes[..self.len()])
225 }
226}
227
/// Persisted metadata for a `NodeStore`.
/// The [`NodeStoreHeader`] is at the start of the `ReadableStorage`.
///
/// `root_hash`, `cargo_version` and `git_describe` were added between v0.0.18
/// and v0.1.0. If `version` is not equal to `firewood-v1`, these fields may
/// contain "uninitialized" data and must be ignored. Uninitialized data in this
/// context means whatever the filesystem returns when reading from the sparse
/// region of the file where there previously was no data written. This does not
/// mean uninitialized memory with respect to memory safety, but does mean that
/// the data may be garbage.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Zeroable, Pod)]
#[repr(C)]
pub struct NodeStoreHeader {
    /// Identifies the version of the data format written to this `NodeStore`.
    version: Version,
    /// always "1"; verifies endianness
    endian_test: u64,
    /// Total allocated storage size (high water mark). New nodes are allocated
    /// at this offset when the free list is empty, then this value is incremented.
    size: u64,
    /// Heads of the free lists for each area size class.
    ///
    /// Element `i` points to the first free area of size `AREA_SIZES[i]`. Each free area
    /// contains a pointer to the next free area of the same size, forming a linked list.
    ///
    /// When allocating a new node, the allocator first checks the appropriate free list.
    /// If empty, it falls back to bumping `size`. Free lists are populated at commit time
    /// during revision reaping, when deleted nodes from evicted revisions are reclaimed.
    free_lists: FreeLists,
    /// Disk address of the merkle trie root node, or `None` for an empty trie.
    ///
    /// This is updated at commit time to point to the new root after changes are persisted.
    /// The address is a direct file offset, not a hash-based reference.
    root_address: Option<LinearAddress>,
    /// The hash of the area sizes used in this database to prevent someone from changing the
    /// area sizes and trying to read old databases with the wrong area sizes.
    area_size_hash: [u8; 32],
    /// The node hash algorithm this database was created with, stored as the
    /// `u64` value of [`NodeHashAlgorithm`] (historically, whether ethhash was
    /// enabled). Validated against the storage's algorithm on open.
    node_hash_algorithm: u64,
    /// The merkle root hash of the entire database when it was last committed.
    ///
    /// The value is meaningful only if `root_address` is `Some`.
    root_hash: [u8; 32],
    /// The cargo version used to create this database.
    ///
    /// This is the value of the `CARGO_PKG_VERSION` environment variable set by
    /// cargo at build time. It is guaranteed to always be set if `version` is
    /// equal to `firewood-v1`.
    ///
    /// The field is padded to 32 bytes with null bytes.
    cargo_version: CargoVersion,
    /// The git hash of the local repository used to create this database. This
    /// may include a "-modified" suffix if there were uncommitted changes at
    /// the time of building.
    ///
    /// If available, this is the output of `git describe --always --dirty=-modified`
    /// at the time of build. Otherwise this will be an empty string.
    ///
    /// Note that this only applies when building from within the `firewood`
    /// repository. Users importing `firewood-storage` from `crates.io` will
    /// always get an empty string here.
    ///
    /// The field is padded to 64 bytes with null bytes.
    ///
    /// Example: `v0.0.18-31-ga6909f32f-modified`
    ///
    /// This field was added between v0.0.18 and v0.1.0. If `version` is not
    /// equal to `firewood-v1`, this field may contain uninitialized data and
    /// must be ignored.
    git_describe: GitDescribe,
}

// Compile-time assertion that SIZE is large enough for the header. Does not work
// within the impl block.
const _: () = assert!(size_of::<NodeStoreHeader>() <= NodeStoreHeader::SIZE as usize);
303
304impl NodeStoreHeader {
305 /// The first SIZE bytes of the `ReadableStorage` are reserved for the
306 /// [`NodeStoreHeader`].
307 /// We also want it aligned to a disk block
308 pub const SIZE: u64 = 2048;
309
310 /// Reads the [`NodeStoreHeader`] from the start of the given storage.
311 ///
312 /// # Errors
313 ///
314 /// Returns an error if unable to read the header from storage or if there's
315 /// a node hash algorithm mismatch.
316 pub fn read_from_storage<S: crate::linear::ReadableStorage>(
317 storage: &S,
318 ) -> Result<Self, crate::FileIoError> {
319 // TODO(#1088): remove this after implementing runtime selection of hash algorithms
320 storage.node_hash_algorithm().validate_init().map_err(|e| {
321 storage.file_io_error(e, 0, Some("NodeHashAlgorithm::validate_init".to_string()))
322 })?;
323
324 let mut this = bytemuck::zeroed::<Self>();
325 let header_bytes = bytemuck::bytes_of_mut(&mut this);
326 storage
327 .stream_from(0)?
328 .read_exact(header_bytes)
329 .map_err(|e| {
330 storage.file_io_error(e, 0, Some("NodeStoreHeader::read_from_storage".to_string()))
331 })?;
332 this.validate(storage.node_hash_algorithm()).map_err(|e| {
333 storage.file_io_error(e, 0, Some("NodeStoreHeader::validate".to_string()))
334 })?;
335
336 Ok(this)
337 }
338
339 /// Creates a new header with default values and no root address.
340 #[must_use]
341 pub fn new(node_hash_algorithm: NodeHashAlgorithm) -> Self {
342 Self {
343 // The store just contains the header at this point
344 size: Self::SIZE,
345 endian_test: 1,
346 root_address: None,
347 version: Version::new(),
348 free_lists: Default::default(),
349 area_size_hash: area_size_hash().into(),
350 node_hash_algorithm: node_hash_algorithm as u64,
351 root_hash: TrieHash::empty().into(),
352 cargo_version: CargoVersion::INSTANCE,
353 git_describe: GitDescribe::INSTANCE,
354 }
355 }
356
357 /// Validates the header fields to ensure correctness and compatibility with
358 /// the current build of firewood.
359 ///
360 /// Checks performed:
361 ///
362 /// - Magic number / version string is valid. The first 16 bytes must match
363 /// an expected magic number.
364 /// - Endianness test matches expected value (1). If not, reading from the
365 /// database will produce incorrect results from mis-interpreted byte order.
366 /// - Area size hash matches the expected hash for the current build. This
367 /// prevents corrupting the allocation structures by changing area sizes.
368 /// - Node hash algorithm flag matches the expected algorithm for this
369 /// storage.
370 ///
371 /// # Errors
372 ///
373 /// Returns an error if validation fails.
374 pub fn validate(&self, expected_node_hash_algorithm: NodeHashAlgorithm) -> Result<(), Error> {
375 trace!("Checking version...");
376 self.version.validate()?;
377
378 trace!("Checking endianness...");
379 self.validate_endian_test()?;
380
381 trace!("Checking area size hash...");
382 self.validate_area_size_hash()?;
383
384 trace!("Checking if node hash algorithm flag matches storage...");
385 self.validate_node_hash_algorithm(expected_node_hash_algorithm)?;
386
387 if self.version == Version::VALID_V1_VERSIONS[0] {
388 debug!(
389 "Database was created with firewood version {}",
390 self.firewood_version_str()
391 .as_deref()
392 .unwrap_or("<unknown>"),
393 );
394 }
395
396 Ok(())
397 }
398
399 /// Get the size of the nodestore
400 #[must_use]
401 pub const fn size(&self) -> u64 {
402 self.size
403 }
404
405 /// Set the size of the nodestore
406 pub const fn set_size(&mut self, size: u64) {
407 self.size = size;
408 }
409
410 /// Get the free lists
411 #[must_use]
412 pub const fn free_lists(&self) -> &FreeLists {
413 &self.free_lists
414 }
415
416 /// Get mutable access to the free lists
417 pub const fn free_lists_mut(&mut self) -> &mut FreeLists {
418 &mut self.free_lists
419 }
420
421 /// Get the root address
422 #[must_use]
423 pub const fn root_address(&self) -> Option<LinearAddress> {
424 self.root_address
425 }
426
427 /// Get the root hash.
428 ///
429 /// This is None if the database was created before v0.1.0.
430 #[must_use]
431 pub(crate) fn root_hash(&self) -> Option<TrieHash> {
432 if self.version.is_firewood_v1() && self.root_address.is_some() {
433 Some(TrieHash::from(self.root_hash))
434 } else {
435 None
436 }
437 }
438
439 /// Update the root location, both address and hash.
440 ///
441 /// Note that this does not overwrite the version field, so it is possible
442 /// the root hash will be ignored when set.
443 pub fn set_root_location(&mut self, root_location: Option<RootNodeInfo>) {
444 let (root_address, root_hash) = root_location.unzip();
445 self.root_address = root_address;
446 self.root_hash = root_hash.unwrap_or_else(TrieHash::empty).into();
447 }
448
449 /// Get the offset of the `free_lists` field for use with `offset_of`!
450 #[must_use]
451 pub const fn free_lists_offset() -> u64 {
452 std::mem::offset_of!(NodeStoreHeader, free_lists) as u64
453 }
454
455 /// Get the cargo version of `firewood-storage` used to create this database,
456 /// if available.
457 #[must_use]
458 pub const fn cargo_version(&self) -> Option<&CargoVersion> {
459 if self.version.is_firewood_v1() {
460 Some(&self.cargo_version)
461 } else {
462 None
463 }
464 }
465
466 /// Get the git describe string of `firewood` used to create this database,
467 #[must_use]
468 pub const fn git_describe(&self) -> Option<&GitDescribe> {
469 if self.version.is_firewood_v1() {
470 Some(&self.git_describe)
471 } else {
472 None
473 }
474 }
475
476 /// Returns a version string identifying the version of `firewood-storage`
477 /// used to create this database, if available.
478 ///
479 /// This field was added between v0.0.18 and v0.1.0 and may be absent in
480 /// older databases.
481 ///
482 /// The returned string is either the `git describe` output (from the time
483 /// of build) if available, otherwise the cargo package version of the
484 /// `firewood-storage` crate.
485 pub(crate) fn firewood_version_str(&self) -> Option<std::borrow::Cow<'_, str>> {
486 self.git_describe()
487 .map(GitDescribe::as_str)
488 .or_else(|| self.cargo_version().map(CargoVersion::as_str))
489 }
490
491 fn validate_endian_test(&self) -> Result<(), Error> {
492 if self.endian_test == 1 {
493 Ok(())
494 } else {
495 Err(Error::new(
496 ErrorKind::InvalidData,
497 "Database cannot be opened due to difference in endianness",
498 ))
499 }
500 }
501
502 fn validate_area_size_hash(&self) -> Result<(), Error> {
503 if self.area_size_hash == area_size_hash().as_slice() {
504 Ok(())
505 } else {
506 Err(Error::new(
507 ErrorKind::InvalidData,
508 "Database cannot be opened due to difference in area size hash",
509 ))
510 }
511 }
512
513 fn validate_node_hash_algorithm(&self, expected: NodeHashAlgorithm) -> Result<(), Error> {
514 expected.validate_init()?;
515 NodeHashAlgorithm::try_from(self.node_hash_algorithm)
516 .map_err(|err| {
517 std::io::Error::new(
518 std::io::ErrorKind::InvalidData,
519 format!("hash mode flag in database header is invalid: {err}"),
520 )
521 })?
522 .validate_open(expected)
523 }
524}
525
// memcpy but in const context. This bypasses the current limitation that
// `slice::copy_from_slice` cannot be used in const fns (or const{} blocks).
// Copies min(src.len(), dst.len()) bytes; any excess on either side is ignored.
const fn const_copy(src: &[u8], dst: &mut [u8]) {
    #![expect(
        clippy::indexing_slicing,
        clippy::arithmetic_side_effects,
        reason = "const and sufficient bounds checks"
    )]

    // Hoist the shared bound so the loop has a single termination condition.
    let limit = if src.len() < dst.len() {
        src.len()
    } else {
        dst.len()
    };
    let mut i = 0;
    while i < limit {
        dst[i] = src[i];
        i += 1;
    }
}
541
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// The version written by the current build must always pass its own
    /// validation, or every freshly created database would be unreadable.
    #[test]
    fn test_version_new_is_valid() {
        Version::new()
            .validate()
            .expect("Version::new() should always be valid");
    }

    // Unknown magic bytes, a different product name, and a version that
    // predates the oldest supported format (0.0.4) must all be rejected.
    #[test_case(*b"invalid\0\0\0\0\0\0\0\0\0")]
    #[test_case(*b"avalanche 0.1.0\0")]
    #[test_case(*b"firewood 0.0.1\0\0")]
    fn test_invalid_version_strings(bytes: [u8; 16]) {
        assert!(Version { bytes }.validate().is_err());
    }

    /// A fresh header carries the current version and empty free lists.
    #[test]
    fn test_header_new() {
        let header = NodeStoreHeader::new(NodeHashAlgorithm::compile_option());

        // Check the header is correctly initialized.
        assert_eq!(header.version, Version::new());
        let empty_free_list: FreeLists = Default::default();
        assert_eq!(*header.free_lists(), empty_free_list);
    }
}
570}