signet_hot_mdbx/lib.rs
1//! Implementation of the hot key-value storage using MDBX as the underlying
2//! database.
3//!
4//! ## Notes on implementation
5//!
6//! This module provides an implementation of the [`HotKv`] trait using MDBX as
7//! the underlying database. It includes functionality for opening and
8//! managing the MDBX environment, handling read-only and read-write
9//! transactions, and managing database tables.
10//!
11//! The [`DatabaseEnv`] struct encapsulates the MDBX environment and provides
12//! methods for starting transactions. The [`DatabaseArguments`] struct
13//! allows for configuring various parameters of the database environment,
14//! such as geometry, sync mode, and maximum readers.
15//!
16//! ### Table Metadata
17//!
18//! This implementation uses the default MDBX table to store metadata about
19//! each table, including whether it uses dual keys or fixed-size values. This
20//! metadata is cached in memory for efficient access during the lifetime of
21//! the environment. Each time a table is opened, its metadata is checked
22//! against the cached values to ensure consistency.
23//!
24//! Rough Edges:
25//! - The cache does not respect dropped transactions. Creating multiple tables
26//! with the same name but different metadata in different transactions
27//! may lead to inconsistencies.
28//! - Tables created outside of this implementation (e.g., via external tools)
29//! will not have their metadata cached, which may lead to inconsistencies if
30//! the same table is later opened with different metadata.
31//!
32//! Overall, we do NOT recommend using this to open existing databases that
33//! were not created and managed by this implementation.
34//!
35//! # Feature Flags
36//!
37//! - **`test-utils`**: Enables the `test_utils` module with MDBX test
38//! helpers and conformance tests. Adds a `tempfile` dependency.
39//! - **`disable-lock`**: Disables the storage lock file, allowing multiple
40//! processes to open the same database. Intended for testing scenarios.
41
42#![warn(
43 missing_copy_implementations,
44 missing_debug_implementations,
45 missing_docs,
46 unreachable_pub,
47 clippy::missing_const_for_fn,
48 rustdoc::all
49)]
50#![cfg_attr(not(test), warn(unused_crate_dependencies))]
51#![deny(unused_must_use, rust_2018_idioms)]
52#![cfg_attr(docsrs, feature(doc_cfg))]
53
54use signet_libmdbx::{
55 Environment, EnvironmentFlags, Geometry, Mode, Ro, RoSync, Rw, RwSync, SyncMode, ffi,
56 sys::{HandleSlowReadersReturnCode, PageSize},
57};
58use std::{ops::Range, path::Path};
59
60mod cursor;
61pub use cursor::{Cursor, CursorRo, CursorRoSync, CursorRw, CursorRwSync};
62
63mod db_info;
64pub use db_info::FixedSizeInfo;
65use db_info::FsiCache;
66
67mod error;
68pub use error::MdbxError;
69
70mod lock;
71pub use lock::{StorageLock, StorageLockError};
72
73#[cfg(any(test, feature = "test-utils"))]
74pub mod test_utils;
75
76mod tx;
77pub use tx::Tx;
78
79mod utils;
80
81use signet_hot::{
82 model::{HotKv, HotKvError, HotKvWrite},
83 tables::{
84 AccountChangeSets, AccountsHistory, Bytecodes, HeaderNumbers, Headers, NUM_TABLES,
85 PlainAccountState, PlainStorageState, StorageChangeSets, StorageHistory, Table,
86 },
87};
88
/// The known table names, used to pre-populate the FSI cache at open time.
///
/// The array length is fixed to `NUM_TABLES`, so this list cannot silently
/// contain a different number of entries than that constant. `read_known_fsi`
/// walks the names in the order given here and returns its results in the
/// same order.
const KNOWN_TABLE_NAMES: [&str; NUM_TABLES] = [
    Headers::NAME,
    HeaderNumbers::NAME,
    Bytecodes::NAME,
    PlainAccountState::NAME,
    PlainStorageState::NAME,
    AccountsHistory::NAME,
    AccountChangeSets::NAME,
    StorageHistory::NAME,
    StorageChangeSets::NAME,
];
101
/// Number of bytes in 1 KB (binary: 1024 bytes).
pub const KILOBYTE: usize = 1 << 10;
/// Number of bytes in 1 MB (1024 KB).
pub const MEGABYTE: usize = 1024 * KILOBYTE;
/// Number of bytes in 1 GB (1024 MB).
pub const GIGABYTE: usize = 1024 * MEGABYTE;
/// Number of bytes in 1 TB (1024 GB).
pub const TERABYTE: usize = 1024 * GIGABYTE;

/// Reader limit applied when the caller does not configure one.
///
/// MDBX allows up to 32767 readers (`MDBX_READERS_LIMIT`); the default is kept
/// slightly below that ceiling.
const DEFAULT_MAX_READERS: u64 = 32_000;

/// Amount of space a read-only transaction may occupy before a warning is
/// emitted. See [`signet_libmdbx::EnvironmentBuilder::set_handle_slow_readers`]
/// for more information.
const MAX_SAFE_READER_SPACE: usize = GIGABYTE * 10;
118
/// Access mode requested when opening an MDBX environment: read-only or
/// read-write.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DatabaseEnvKind {
    /// Read-only MDBX environment.
    RO,
    /// Read-write MDBX environment.
    RW,
}

impl DatabaseEnvKind {
    /// Returns `true` for [`DatabaseEnvKind::RW`] and `false` for
    /// [`DatabaseEnvKind::RO`].
    pub const fn is_rw(&self) -> bool {
        // Exhaustive on purpose: a new variant must decide its answer here.
        match self {
            Self::RW => true,
            Self::RO => false,
        }
    }
}
134
/// Arguments for database initialization.
#[derive(Clone, Debug)]
pub struct DatabaseArguments {
    /// Database geometry settings.
    geometry: Geometry<Range<usize>>,

    /// Open environment in exclusive/monopolistic mode. If [`None`], the default value (`false`)
    /// is used.
    ///
    /// This can be used as a replacement for `MDB_NOLOCK`, which is not supported by MDBX. In
    /// this way, you can get the minimal overhead, but with the correct multi-process and
    /// multi-thread locking.
    ///
    /// If `true` = open environment in exclusive/monopolistic mode or return `MDBX_BUSY` if
    /// environment already used by other process. The main feature of the exclusive mode is the
    /// ability to open the environment placed on a network share.
    ///
    /// If `false` = open environment in cooperative mode, i.e. for multi-process
    /// access/interaction/cooperation. The main requirements of the cooperative mode are:
    /// - Data files MUST be placed in the LOCAL file system, but NOT on a network share.
    /// - Environment MUST be opened only by LOCAL processes, but NOT over a network.
    /// - OS kernel (i.e. file system and memory mapping implementation) and all processes that
    ///   open the given environment MUST be running in the physically single RAM with
    ///   cache-coherency. The only exception for cache-consistency requirement is Linux on MIPS
    ///   architecture (but this case has not been tested for a long time).
    ///
    /// This flag only takes effect when the environment is opened and cannot be changed
    /// afterwards.
    exclusive: Option<bool>,
    /// Maximum number of readers. MDBX allows up to 32767 readers (`MDBX_READERS_LIMIT`).
    /// If [`None`], `DEFAULT_MAX_READERS` is used when the environment is opened.
    max_readers: Option<u64>,
    /// Defines the synchronization strategy used by the MDBX database when writing data to disk.
    ///
    /// This determines how aggressively MDBX ensures data durability versus prioritizing
    /// performance. The available modes are:
    ///
    /// - [`SyncMode::Durable`]: Ensures all transactions are fully flushed to disk before they are
    ///   considered committed. This provides the highest level of durability and crash safety
    ///   but may have a performance cost.
    /// - [`SyncMode::SafeNoSync`]: Skips certain fsync operations to improve write performance.
    ///   This mode still maintains database integrity but may lose the most recent transactions if
    ///   the system crashes unexpectedly.
    ///
    /// Choose `Durable` if consistency and crash safety are critical (e.g., production
    /// environments). Choose `SafeNoSync` if performance is more important and occasional data
    /// loss is acceptable (e.g., testing or ephemeral data).
    ///
    /// Only applied when the environment is opened read-write.
    sync_mode: SyncMode,
}
182
impl Default for DatabaseArguments {
    /// Returns the same value as [`DatabaseArguments::new`].
    fn default() -> Self {
        Self::new()
    }
}
188
189impl DatabaseArguments {
190 /// Create new database arguments with given client version.
191 pub fn new() -> Self {
192 Self {
193 geometry: Geometry {
194 size: Some(0..(8 * TERABYTE)),
195 growth_step: Some(4 * GIGABYTE as isize),
196 shrink_threshold: Some(0),
197 page_size: Some(PageSize::Set(utils::default_page_size())),
198 },
199 exclusive: None,
200 max_readers: None,
201 sync_mode: SyncMode::Durable,
202 }
203 }
204
205 /// Sets the upper size limit of the db environment, the maximum database size in bytes.
206 pub const fn with_geometry_max_size(mut self, max_size: Option<usize>) -> Self {
207 if let Some(max_size) = max_size {
208 self.geometry.size = Some(0..max_size);
209 }
210 self
211 }
212
213 /// Sets the database page size value.
214 pub const fn with_geometry_page_size(mut self, page_size: Option<usize>) -> Self {
215 if let Some(size) = page_size {
216 self.geometry.page_size = Some(PageSize::Set(size));
217 }
218
219 self
220 }
221
222 /// Sets the database sync mode.
223 pub const fn with_sync_mode(mut self, sync_mode: Option<SyncMode>) -> Self {
224 if let Some(sync_mode) = sync_mode {
225 self.sync_mode = sync_mode;
226 }
227
228 self
229 }
230
231 /// Configures the database growth step in bytes.
232 pub const fn with_growth_step(mut self, growth_step: Option<usize>) -> Self {
233 if let Some(growth_step) = growth_step {
234 self.geometry.growth_step = Some(growth_step as isize);
235 }
236 self
237 }
238
239 /// Set the mdbx exclusive flag.
240 pub const fn with_exclusive(mut self, exclusive: Option<bool>) -> Self {
241 self.exclusive = exclusive;
242 self
243 }
244
245 /// Set `max_readers` flag.
246 pub const fn with_max_readers(mut self, max_readers: Option<u64>) -> Self {
247 self.max_readers = max_readers;
248 self
249 }
250
251 /// Open a read-only database at `path` with the current arguments
252 pub fn open_ro(self, path: &Path) -> Result<DatabaseEnv, MdbxError> {
253 DatabaseEnv::open(path, DatabaseEnvKind::RO, self)
254 }
255
256 /// Open a read-write database at `path` with the current arguments
257 pub fn open_rw(self, path: &Path) -> Result<DatabaseEnv, MdbxError> {
258 DatabaseEnv::open(path, DatabaseEnvKind::RW, self)
259 }
260}
261
/// MDBX database environment. Wraps the low-level [`Environment`], and
/// implements the [`HotKv`] trait.
#[derive(Debug, Clone)]
pub struct DatabaseEnv {
    /// Libmdbx-sys environment.
    inner: Environment,
    /// Cached [`FixedSizeInfo`] for tables, pre-populated at open time.
    ///
    /// The standard tables are created and their FSI entries cached during
    /// [`DatabaseEnv::open`]. Do not manually close DBIs (e.g. via
    /// `mdbx_dbi_close`) or dynamically drop tables at runtime.
    fsi_cache: FsiCache,

    /// Storage lock acquired when the environment is opened read-write;
    /// `None` for read-only environments. The field is never read; it only
    /// keeps the lock value alive alongside the environment.
    _lock_file: Option<StorageLock>,
}
279
280impl DatabaseEnv {
281 /// Opens the database at the specified path with the given `EnvKind`.
282 /// Acquires a lock file if opening in read-write mode.
283 pub fn open(
284 path: &Path,
285 kind: DatabaseEnvKind,
286 args: DatabaseArguments,
287 ) -> Result<Self, MdbxError> {
288 let _lock_file = if kind.is_rw() { Some(StorageLock::try_acquire(path)?) } else { None };
289
290 let mut inner_env = Environment::builder();
291
292 let mode = match kind {
293 DatabaseEnvKind::RO => Mode::ReadOnly,
294 DatabaseEnvKind::RW => {
295 // enable writemap mode in RW mode
296 inner_env.write_map();
297 Mode::ReadWrite { sync_mode: args.sync_mode }
298 }
299 };
300
301 inner_env.set_max_dbs(256);
302 inner_env.set_geometry(args.geometry);
303
304 fn is_current_process(id: u32) -> bool {
305 #[cfg(unix)]
306 {
307 id == std::os::unix::process::parent_id() || id == std::process::id()
308 }
309
310 #[cfg(not(unix))]
311 {
312 id == std::process::id()
313 }
314 }
315
316 extern "C" fn handle_slow_readers(
317 _env: *const ffi::MDBX_env,
318 _txn: *const ffi::MDBX_txn,
319 process_id: ffi::mdbx_pid_t,
320 thread_id: ffi::mdbx_tid_t,
321 read_txn_id: u64,
322 gap: std::ffi::c_uint,
323 space: usize,
324 retry: std::ffi::c_int,
325 ) -> HandleSlowReadersReturnCode {
326 if space > MAX_SAFE_READER_SPACE {
327 let message = if is_current_process(process_id as u32) {
328 "Current process has a long-lived database transaction that grows the database file."
329 } else {
330 "External process has a long-lived database transaction that grows the database file. \
331 Use shorter-lived read transactions or shut down the node."
332 };
333 tracing::warn!(
334 target: "storage::db::mdbx",
335 ?process_id,
336 ?thread_id,
337 ?read_txn_id,
338 ?gap,
339 ?space,
340 ?retry,
341 "{message}"
342 )
343 }
344
345 HandleSlowReadersReturnCode::ProceedWithoutKillingReader
346 }
347 inner_env.set_handle_slow_readers(handle_slow_readers);
348
349 inner_env.set_flags(EnvironmentFlags {
350 mode,
351 // We disable readahead because it improves performance for linear scans, but
352 // worsens it for random access (which is our access pattern outside of sync)
353 no_rdahead: true,
354 coalesce: true,
355 exclusive: args.exclusive.unwrap_or_default(),
356 ..Default::default()
357 });
358 // Configure more readers
359 inner_env.set_max_readers(args.max_readers.unwrap_or(DEFAULT_MAX_READERS));
360 // This parameter sets the maximum size of the "reclaimed list", and the unit of measurement
361 // is "pages". Reclaimed list is the list of freed pages that's populated during the
362 // lifetime of DB transaction, and through which MDBX searches when it needs to insert new
363 // record with overflow pages. The flow is roughly the following:
364 // 0. We need to insert a record that requires N number of overflow pages (in consecutive
365 // sequence inside the DB file).
366 // 1. Get some pages from the freelist, put them into the reclaimed list.
367 // 2. Search through the reclaimed list for the sequence of size N.
368 // 3. a. If found, return the sequence.
369 // 3. b. If not found, repeat steps 1-3. If the reclaimed list size is larger than
370 // the `rp augment limit`, stop the search and allocate new pages at the end of the file:
371 // https://github.com/paradigmxyz/reth/blob/2a4c78759178f66e30c8976ec5d243b53102fc9a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c#L11479-L11480.
372 //
373 // Basically, this parameter controls for how long do we search through the freelist before
374 // trying to allocate new pages. Smaller value will make MDBX to fallback to
375 // allocation faster, higher value will force MDBX to search through the freelist
376 // longer until the sequence of pages is found.
377 //
378 // The default value of this parameter is set depending on the DB size. The bigger the
379 // database, the larger is `rp augment limit`.
380 // https://github.com/paradigmxyz/reth/blob/2a4c78759178f66e30c8976ec5d243b53102fc9a/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c#L10018-L10024.
381 //
382 // Previously, MDBX set this value as `256 * 1024` constant. Let's fallback to this,
383 // because we want to prioritize freelist lookup speed over database growth.
384 // https://github.com/paradigmxyz/reth/blob/fa2b9b685ed9787636d962f4366caf34a9186e66/crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c#L16017.
385 inner_env.set_rp_augment_limit(256 * 1024);
386
387 let inner = inner_env.open(path)?;
388
389 let fsi_cache = if kind.is_rw() {
390 create_tables_and_populate_cache(&inner)?
391 } else {
392 populate_cache_ro(&inner)?
393 };
394
395 Ok(Self { inner, fsi_cache, _lock_file })
396 }
397
398 /// Start a new read-only transaction.
399 pub fn tx(&self) -> Result<Tx<Ro>, MdbxError> {
400 self.inner
401 .begin_ro_unsync()
402 .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
403 .map_err(MdbxError::Mdbx)
404 }
405
406 /// Start a new read-write transaction.
407 pub fn tx_rw(&self) -> Result<Tx<Rw>, MdbxError> {
408 self.inner
409 .begin_rw_unsync()
410 .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
411 .map_err(MdbxError::Mdbx)
412 }
413
414 /// Start a new read-only synchronous transaction.
415 pub fn tx_sync(&self) -> Result<Tx<RoSync>, MdbxError> {
416 self.inner
417 .begin_ro_sync()
418 .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
419 .map_err(MdbxError::Mdbx)
420 }
421
422 /// Start a new read-write synchronous transaction.
423 pub fn tx_rw_sync(&self) -> Result<Tx<RwSync>, MdbxError> {
424 self.inner
425 .begin_rw_sync()
426 .map(|tx| Tx::new(tx, self.fsi_cache.clone()))
427 .map_err(MdbxError::Mdbx)
428 }
429}
430
/// [`HotKv`] implementation backed by [`DatabaseEnv::tx`] and
/// [`DatabaseEnv::tx_rw`].
impl HotKv for DatabaseEnv {
    type RoTx = Tx<Ro>;
    type RwTx = Tx<Rw>;

    /// Starts a read-only transaction via [`DatabaseEnv::tx`], converting any
    /// [`MdbxError`] with `HotKvError::from_err`.
    fn reader(&self) -> Result<Self::RoTx, HotKvError> {
        self.tx().map_err(HotKvError::from_err)
    }

    /// Starts a read-write transaction via [`DatabaseEnv::tx_rw`], converting
    /// any [`MdbxError`] with `HotKvError::from_err`.
    fn writer(&self) -> Result<Self::RwTx, HotKvError> {
        self.tx_rw().map_err(HotKvError::from_err)
    }
}
443
444/// Create all standard hot storage tables and return a pre-populated
445/// [`FsiCache`]. Called during RW open.
446fn create_tables_and_populate_cache(env: &Environment) -> Result<FsiCache, MdbxError> {
447 let inner_tx = env.begin_rw_unsync().map_err(MdbxError::Mdbx)?;
448 // Tx requires an FsiCache, so we pass a throwaway empty one. The FSI
449 // entries written by queue_db_init's store_fsi calls land in this
450 // temporary cache's dynamic map — they are discarded. We re-read the
451 // authoritative values from the metadata table via read_known_fsi.
452 let tmp_cache = FsiCache::new(Default::default());
453 let tx = Tx::new(inner_tx, tmp_cache);
454 tx.queue_db_init()?;
455
456 let known = read_known_fsi(&tx)?;
457 tx.raw_commit()?;
458 Ok(FsiCache::new(known))
459}
460
461/// Read FSI entries for all known tables from the metadata table.
462fn read_known_fsi<K: signet_libmdbx::TransactionKind>(
463 tx: &Tx<K>,
464) -> Result<[(&'static str, FixedSizeInfo); NUM_TABLES], MdbxError> {
465 let mut known = [("", FixedSizeInfo::None); NUM_TABLES];
466 for (i, &name) in KNOWN_TABLE_NAMES.iter().enumerate() {
467 known[i] = (name, tx.read_fsi_from_table(name)?);
468 }
469 Ok(known)
470}
471
472/// Read FSI entries for all known tables via a temporary RO transaction.
473/// Called during RO open.
474fn populate_cache_ro(env: &Environment) -> Result<FsiCache, MdbxError> {
475 let inner_tx = env.begin_ro_unsync().map_err(MdbxError::Mdbx)?;
476 let tmp_cache = FsiCache::new(Default::default());
477 let tx = Tx::new(inner_tx, tmp_cache);
478 let known = read_known_fsi(&tx)?;
479 Ok(FsiCache::new(known))
480}