Skip to main content

signet_hot_mdbx/
lib.rs

1//! Implementation of the hot key-value storage using MDBX as the underlying
2//! database.
3//!
4//! ## Notes on implementation
5//!
6//! This module provides an implementation of the [`HotKv`] trait using MDBX as
7//! the underlying database. It includes functionality for opening and
8//! managing the MDBX environment, handling read-only and read-write
9//! transactions, and managing database tables.
10//!
11//! The [`DatabaseEnv`] struct encapsulates the MDBX environment and provides
12//! methods for starting transactions. The [`DatabaseArguments`] struct
13//! allows for configuring various parameters of the database environment,
14//! such as geometry, sync mode, and maximum readers.
15//!
16//! ### Table Metadata
17//!
18//! This implementation uses the default MDBX table to store metadata about
19//! each table, including whether it uses dual keys or fixed-size values. This
20//! metadata is cached in memory for efficient access during the lifetime of
21//! the environment. Each time a table is opened, its metadata is checked
22//! against the cached values to ensure consistency.
23//!
24//! Rough Edges:
25//! - The cache does not respect dropped transactions. Creating multiple tables
26//!   with the same name but different metadata in different transactions
27//!   may lead to inconsistencies.
28//! - Tables created outside of this implementation (e.g., via external tools)
29//!   will not have their metadata cached, which may lead to inconsistencies if
30//!   the same table is later opened with different metadata.
31//!
32//! Overall, we do NOT recommend using this to open existing databases that
33//! were not created and managed by this implementation.
34//!
35//! # Feature Flags
36//!
37//! - **`test-utils`**: Enables the `test_utils` module with MDBX test
38//!   helpers and conformance tests. Adds a `tempfile` dependency.
39//! - **`disable-lock`**: Disables the storage lock file, allowing multiple
40//!   processes to open the same database. Intended for testing scenarios.
41
42#![warn(
43    missing_copy_implementations,
44    missing_debug_implementations,
45    missing_docs,
46    unreachable_pub,
47    clippy::missing_const_for_fn,
48    rustdoc::all
49)]
50#![cfg_attr(not(test), warn(unused_crate_dependencies))]
51#![deny(unused_must_use, rust_2018_idioms)]
52#![cfg_attr(docsrs, feature(doc_cfg))]
53
54use signet_libmdbx::{
55    Environment, EnvironmentFlags, Geometry, Mode, Ro, RoSync, Rw, RwSync, SyncMode, ffi,
56    sys::{HandleSlowReadersReturnCode, PageSize},
57};
58use std::{ops::Range, path::Path};
59
60mod cursor;
61pub use cursor::{Cursor, CursorRo, CursorRoSync, CursorRw, CursorRwSync};
62
63mod db_info;
64pub use db_info::FixedSizeInfo;
65use db_info::FsiCache;
66
67mod error;
68pub use error::MdbxError;
69
70mod lock;
71pub use lock::{StorageLock, StorageLockError};
72
73#[cfg(any(test, feature = "test-utils"))]
74pub mod test_utils;
75
76mod tx;
77pub use tx::Tx;
78
79mod utils;
80
81use signet_hot::{
82    model::{HotKv, HotKvError, HotKvWrite},
83    tables::{
84        AccountChangeSets, AccountsHistory, Bytecodes, HeaderNumbers, Headers, NUM_TABLES,
85        PlainAccountState, PlainStorageState, StorageChangeSets, StorageHistory, Table,
86    },
87};
88
/// The known table names, used to pre-populate the FSI cache at open time.
///
/// The array length is pinned to [`NUM_TABLES`], so this list stops compiling
/// if its number of entries and `NUM_TABLES` ever disagree.
const KNOWN_TABLE_NAMES: [&str; NUM_TABLES] = [
    Headers::NAME,
    HeaderNumbers::NAME,
    Bytecodes::NAME,
    PlainAccountState::NAME,
    PlainStorageState::NAME,
    AccountsHistory::NAME,
    AccountChangeSets::NAME,
    StorageHistory::NAME,
    StorageChangeSets::NAME,
];
101
/// Number of bytes in one kilobyte (2^10).
pub const KILOBYTE: usize = 1 << 10;
/// Number of bytes in one megabyte (2^20).
pub const MEGABYTE: usize = 1 << 20;
/// Number of bytes in one gigabyte (2^30).
pub const GIGABYTE: usize = 1 << 30;
/// Number of bytes in one terabyte (2^40).
pub const TERABYTE: usize = 1 << 40;
110
/// Reader limit applied when [`DatabaseArguments`] does not specify `max_readers`.
///
/// MDBX allows up to 32767 readers (`MDBX_READERS_LIMIT`), but we limit it to slightly below that.
const DEFAULT_MAX_READERS: u64 = 32_000;
113
/// Space that a read-only transaction can occupy until the warning is emitted.
/// See [`signet_libmdbx::EnvironmentBuilder::set_handle_slow_readers`] for more
/// information.
///
/// The slow-reader callback installed by `DatabaseEnv::open` logs a warning
/// whenever the `space` value it is handed exceeds this threshold.
const MAX_SAFE_READER_SPACE: usize = 10 * GIGABYTE;
118
/// Access mode requested when opening an MDBX environment: read-only or
/// read-write.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DatabaseEnvKind {
    /// The environment is opened for reading only.
    RO,
    /// The environment is opened for both reading and writing.
    RW,
}

impl DatabaseEnvKind {
    /// Reports whether this kind is [`DatabaseEnvKind::RW`].
    pub const fn is_rw(&self) -> bool {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match *self {
            Self::RW => true,
            Self::RO => false,
        }
    }
}
134
/// Arguments for database initialization.
///
/// Values are set through the `with_*` builder methods and consumed by
/// [`DatabaseEnv::open`].
#[derive(Clone, Debug)]
pub struct DatabaseArguments {
    /// Database geometry settings.
    geometry: Geometry<Range<usize>>,

    /// Open environment in exclusive/monopolistic mode. If [None], the default value (`false`)
    /// is used.
    ///
    /// This can be used as a replacement for `MDB_NOLOCK`, which isn't supported by MDBX. In this
    /// way, you can get the minimal overhead, but with the correct multi-process and multi-thread
    /// locking.
    ///
    /// If `true` = open environment in exclusive/monopolistic mode or return `MDBX_BUSY` if
    /// environment already used by other process. The main feature of the exclusive mode is the
    /// ability to open the environment placed on a network share.
    ///
    /// If `false` = open environment in cooperative mode, i.e. for multi-process
    /// access/interaction/cooperation. The main requirements of the cooperative mode are:
    /// - Data files MUST be placed in the LOCAL file system, but NOT on a network share.
    /// - Environment MUST be opened only by LOCAL processes, but NOT over a network.
    /// - OS kernel (i.e. file system and memory mapping implementation) and all processes that
    ///   open the given environment MUST be running in the physically single RAM with
    ///   cache-coherency. The only exception for cache-consistency requirement is Linux on MIPS
    ///   architecture, but this case has not been tested for a long time.
    ///
    /// This flag takes effect only when the environment is opened and cannot be changed
    /// afterwards.
    exclusive: Option<bool>,
    /// MDBX allows up to 32767 readers (`MDBX_READERS_LIMIT`). This arg is to configure the max
    /// readers. If [None], `DEFAULT_MAX_READERS` is used.
    max_readers: Option<u64>,
    /// Defines the synchronization strategy used by the MDBX database when writing data to disk.
    ///
    /// This determines how aggressively MDBX ensures data durability versus prioritizing
    /// performance. The available modes are:
    ///
    /// - [`SyncMode::Durable`]: Ensures all transactions are fully flushed to disk before they are
    ///   considered committed. This provides the highest level of durability and crash safety
    ///   but may have a performance cost.
    /// - [`SyncMode::SafeNoSync`]: Skips certain fsync operations to improve write performance.
    ///   This mode still maintains database integrity but may lose the most recent transactions if
    ///   the system crashes unexpectedly.
    ///
    /// Choose `Durable` if consistency and crash safety are critical (e.g., production
    /// environments). Choose `SafeNoSync` if performance is more important and occasional data
    /// loss is acceptable (e.g., testing or ephemeral data).
    sync_mode: SyncMode,
}
182
183impl Default for DatabaseArguments {
184    fn default() -> Self {
185        Self::new()
186    }
187}
188
189impl DatabaseArguments {
190    /// Create new database arguments with given client version.
191    pub fn new() -> Self {
192        Self {
193            geometry: Geometry {
194                size: Some(0..(8 * TERABYTE)),
195                growth_step: Some(4 * GIGABYTE as isize),
196                shrink_threshold: Some(0),
197                page_size: Some(PageSize::Set(utils::default_page_size())),
198            },
199            exclusive: None,
200            max_readers: None,
201            sync_mode: SyncMode::Durable,
202        }
203    }
204
205    /// Sets the upper size limit of the db environment, the maximum database size in bytes.
206    pub const fn with_geometry_max_size(mut self, max_size: Option<usize>) -> Self {
207        if let Some(max_size) = max_size {
208            self.geometry.size = Some(0..max_size);
209        }
210        self
211    }
212
213    /// Sets the database page size value.
214    pub const fn with_geometry_page_size(mut self, page_size: Option<usize>) -> Self {
215        if let Some(size) = page_size {
216            self.geometry.page_size = Some(PageSize::Set(size));
217        }
218
219        self
220    }
221
222    /// Sets the database sync mode.
223    pub const fn with_sync_mode(mut self, sync_mode: Option<SyncMode>) -> Self {
224        if let Some(sync_mode) = sync_mode {
225            self.sync_mode = sync_mode;
226        }
227
228        self
229    }
230
231    /// Configures the database growth step in bytes.
232    pub const fn with_growth_step(mut self, growth_step: Option<usize>) -> Self {
233        if let Some(growth_step) = growth_step {
234            self.geometry.growth_step = Some(growth_step as isize);
235        }
236        self
237    }
238
239    /// Set the mdbx exclusive flag.
240    pub const fn with_exclusive(mut self, exclusive: Option<bool>) -> Self {
241        self.exclusive = exclusive;
242        self
243    }
244
245    /// Set `max_readers` flag.
246    pub const fn with_max_readers(mut self, max_readers: Option<u64>) -> Self {
247        self.max_readers = max_readers;
248        self
249    }
250
251    /// Open a read-only database at `path` with the current arguments
252    pub fn open_ro(self, path: &Path) -> Result<DatabaseEnv, MdbxError> {
253        DatabaseEnv::open(path, DatabaseEnvKind::RO, self)
254    }
255
256    /// Open a read-write database at `path` with the current arguments
257    pub fn open_rw(self, path: &Path) -> Result<DatabaseEnv, MdbxError> {
258        DatabaseEnv::open(path, DatabaseEnvKind::RW, self)
259    }
260}
261
/// MDBX database environment. Wraps the low-level [Environment], and
/// implements the [`HotKv`] trait.
#[derive(Debug, Clone)]
pub struct DatabaseEnv {
    /// Libmdbx-sys environment.
    inner: Environment,
    /// Cached FixedSizeInfo for tables, pre-populated at open time.
    ///
    /// The standard tables are created and their FSI entries cached during
    /// [`DatabaseEnv::open`]. Do not manually close DBIs (e.g. via
    /// `mdbx_dbi_close`) or dynamically drop tables at runtime.
    fsi_cache: FsiCache,

    /// Write lock for when dealing with a read-write environment.
    ///
    /// `Some` only when the environment was opened in read-write mode; `None`
    /// for read-only environments.
    _lock_file: Option<StorageLock>,
}
279
impl DatabaseEnv {
    /// Opens the database at the specified path with the given [`DatabaseEnvKind`].
    /// Acquires a lock file if opening in read-write mode.
    ///
    /// After the environment is opened, the FSI cache is populated: in
    /// read-write mode via `create_tables_and_populate_cache`, in read-only
    /// mode via `populate_cache_ro`.
    ///
    /// # Errors
    ///
    /// Returns an error if the storage lock cannot be acquired (read-write
    /// only), if the MDBX environment fails to open, or if populating the FSI
    /// cache fails.
    pub fn open(
        path: &Path,
        kind: DatabaseEnvKind,
        args: DatabaseArguments,
    ) -> Result<Self, MdbxError> {
        // Only read-write environments take the storage lock.
        let _lock_file = if kind.is_rw() { Some(StorageLock::try_acquire(path)?) } else { None };

        let mut inner_env = Environment::builder();

        let mode = match kind {
            DatabaseEnvKind::RO => Mode::ReadOnly,
            DatabaseEnvKind::RW => {
                // enable writemap mode in RW mode
                inner_env.write_map();
                Mode::ReadWrite { sync_mode: args.sync_mode }
            }
        };

        inner_env.set_max_dbs(256);
        inner_env.set_geometry(args.geometry);

        // Returns `true` when `id` is this process's PID; on Unix the parent
        // process's PID is accepted as well.
        fn is_current_process(id: u32) -> bool {
            #[cfg(unix)]
            {
                id == std::os::unix::process::parent_id() || id == std::process::id()
            }

            #[cfg(not(unix))]
            {
                id == std::process::id()
            }
        }

        // Slow-reader callback handed to MDBX. It only logs: a warning is
        // emitted once `space` exceeds `MAX_SAFE_READER_SPACE`, and the return
        // code always tells MDBX to proceed without killing the reader.
        extern "C" fn handle_slow_readers(
            _env: *const ffi::MDBX_env,
            _txn: *const ffi::MDBX_txn,
            process_id: ffi::mdbx_pid_t,
            thread_id: ffi::mdbx_tid_t,
            read_txn_id: u64,
            gap: std::ffi::c_uint,
            space: usize,
            retry: std::ffi::c_int,
        ) -> HandleSlowReadersReturnCode {
            if space > MAX_SAFE_READER_SPACE {
                let message = if is_current_process(process_id as u32) {
                    "Current process has a long-lived database transaction that grows the database file."
                } else {
                    "External process has a long-lived database transaction that grows the database file. \
                     Use shorter-lived read transactions or shut down the node."
                };
                tracing::warn!(
                    target: "storage::db::mdbx",
                    ?process_id,
                    ?thread_id,
                    ?read_txn_id,
                    ?gap,
                    ?space,
                    ?retry,
                    "{message}"
                )
            }

            HandleSlowReadersReturnCode::ProceedWithoutKillingReader
        }
        inner_env.set_handle_slow_readers(handle_slow_readers);

        inner_env.set_flags(EnvironmentFlags {
            mode,
            // We disable readahead because it improves performance for linear scans, but
            // worsens it for random access (which is our access pattern outside of sync)
            no_rdahead: true,
            coalesce: true,
            // An unset `exclusive` argument falls back to `false`.
            exclusive: args.exclusive.unwrap_or_default(),
            ..Default::default()
        });
        // Configure more readers
        inner_env.set_max_readers(args.max_readers.unwrap_or(DEFAULT_MAX_READERS));
        // This parameter sets the maximum size of the "reclaimed list", and the unit of measurement
        // is "pages". Reclaimed list is the list of freed pages that's populated during the
        // lifetime of DB transaction, and through which MDBX searches when it needs to insert new
        // record with overflow pages. The flow is roughly the following:
        // 0. We need to insert a record that requires N number of overflow pages (in consecutive
        //    sequence inside the DB file).
        // 1. Get some pages from the freelist, put them into the reclaimed list.
        // 2. Search through the reclaimed list for the sequence of size N.
        // 3. a. If found, return the sequence.
        // 3. b. If not found, repeat steps 1-3. If the reclaimed list size is larger than
        //    the `rp augment limit`, stop the search and allocate new pages at the end of the file:
        //    https://github.com/paradigmxyz/reth/blob/2a4c78759178f66e30c8976ec5d243b53102fc9a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c#L11479-L11480.
        //
        // Basically, this parameter controls for how long do we search through the freelist before
        // trying to allocate new pages. Smaller value will make MDBX to fallback to
        // allocation faster, higher value will force MDBX to search through the freelist
        // longer until the sequence of pages is found.
        //
        // The default value of this parameter is set depending on the DB size. The bigger the
        // database, the larger is `rp augment limit`.
        // https://github.com/paradigmxyz/reth/blob/2a4c78759178f66e30c8976ec5d243b53102fc9a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c#L10018-L10024.
        //
        // Previously, MDBX set this value as `256 * 1024` constant. Let's fallback to this,
        // because we want to prioritize freelist lookup speed over database growth.
        // https://github.com/paradigmxyz/reth/blob/fa2b9b685ed9787636d962f4366caf34a9186e66/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c#L16017.
        inner_env.set_rp_augment_limit(256 * 1024);

        let inner = inner_env.open(path)?;

        // Read-write opens create the standard tables first; read-only opens
        // can only read whatever metadata already exists.
        let fsi_cache = if kind.is_rw() {
            create_tables_and_populate_cache(&inner)?
        } else {
            populate_cache_ro(&inner)?
        };

        Ok(Self { inner, fsi_cache, _lock_file })
    }

    /// Start a new read-only transaction.
    ///
    /// The transaction is handed a clone of this environment's FSI cache.
    ///
    /// # Errors
    ///
    /// Returns [`MdbxError::Mdbx`] if the underlying environment fails to
    /// begin the transaction.
    pub fn tx(&self) -> Result<Tx<Ro>, MdbxError> {
        self.inner
            .begin_ro_unsync()
            .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
            .map_err(MdbxError::Mdbx)
    }

    /// Start a new read-write transaction.
    ///
    /// The transaction is handed a clone of this environment's FSI cache.
    ///
    /// # Errors
    ///
    /// Returns [`MdbxError::Mdbx`] if the underlying environment fails to
    /// begin the transaction.
    pub fn tx_rw(&self) -> Result<Tx<Rw>, MdbxError> {
        self.inner
            .begin_rw_unsync()
            .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
            .map_err(MdbxError::Mdbx)
    }

    /// Start a new read-only synchronous transaction.
    ///
    /// The transaction is handed a clone of this environment's FSI cache.
    ///
    /// # Errors
    ///
    /// Returns [`MdbxError::Mdbx`] if the underlying environment fails to
    /// begin the transaction.
    pub fn tx_sync(&self) -> Result<Tx<RoSync>, MdbxError> {
        self.inner
            .begin_ro_sync()
            .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
            .map_err(MdbxError::Mdbx)
    }

    /// Start a new read-write synchronous transaction.
    ///
    /// The transaction is handed a clone of this environment's FSI cache.
    ///
    /// # Errors
    ///
    /// Returns [`MdbxError::Mdbx`] if the underlying environment fails to
    /// begin the transaction.
    pub fn tx_rw_sync(&self) -> Result<Tx<RwSync>, MdbxError> {
        self.inner
            .begin_rw_sync()
            .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
            .map_err(MdbxError::Mdbx)
    }
}
430
431impl HotKv for DatabaseEnv {
432    type RoTx = Tx<Ro>;
433    type RwTx = Tx<Rw>;
434
435    fn reader(&self) -> Result<Self::RoTx, HotKvError> {
436        self.tx().map_err(HotKvError::from_err)
437    }
438
439    fn writer(&self) -> Result<Self::RwTx, HotKvError> {
440        self.tx_rw().map_err(HotKvError::from_err)
441    }
442}
443
444/// Create all standard hot storage tables and return a pre-populated
445/// [`FsiCache`]. Called during RW open.
446fn create_tables_and_populate_cache(env: &Environment) -> Result<FsiCache, MdbxError> {
447    let inner_tx = env.begin_rw_unsync().map_err(MdbxError::Mdbx)?;
448    // Tx requires an FsiCache, so we pass a throwaway empty one. The FSI
449    // entries written by queue_db_init's store_fsi calls land in this
450    // temporary cache's dynamic map — they are discarded. We re-read the
451    // authoritative values from the metadata table via read_known_fsi.
452    let tmp_cache = FsiCache::new(Default::default());
453    let tx = Tx::new(inner_tx, tmp_cache);
454    tx.queue_db_init()?;
455
456    let known = read_known_fsi(&tx)?;
457    tx.raw_commit()?;
458    Ok(FsiCache::new(known))
459}
460
461/// Read FSI entries for all known tables from the metadata table.
462fn read_known_fsi<K: signet_libmdbx::TransactionKind>(
463    tx: &Tx<K>,
464) -> Result<[(&'static str, FixedSizeInfo); NUM_TABLES], MdbxError> {
465    let mut known = [("", FixedSizeInfo::None); NUM_TABLES];
466    for (i, &name) in KNOWN_TABLE_NAMES.iter().enumerate() {
467        known[i] = (name, tx.read_fsi_from_table(name)?);
468    }
469    Ok(known)
470}
471
472/// Read FSI entries for all known tables via a temporary RO transaction.
473/// Called during RO open.
474fn populate_cache_ro(env: &Environment) -> Result<FsiCache, MdbxError> {
475    let inner_tx = env.begin_ro_unsync().map_err(MdbxError::Mdbx)?;
476    let tmp_cache = FsiCache::new(Default::default());
477    let tx = Tx::new(inner_tx, tmp_cache);
478    let known = read_known_fsi(&tx)?;
479    Ok(FsiCache::new(known))
480}