Skip to main content

lsm_tree/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5// no-std foundation: when the `std` feature is OFF the crate root opts into
6// `no_std`. Default builds keep `std` enabled (file I/O, threading, system
7// clock all live in `std`), so existing consumers see no behaviour change.
8// The migration to a fully no-std-clean build is incremental. Two patterns
9// coexist while the port is in progress:
10//   1. Modules with std-only dependencies that have no consumers above the
11//      `fs` / `version` / `tree` layer stay gated behind `#[cfg(feature =
12//      "std")]` and are ported in isolation.
13//   2. Modules whose std-only dependencies cascade through every consumer
14//      (`manifest_blocks`, vendored `sfa`, others noted at their `pub mod`
15//      site) remain UNCONDITIONAL today — gating them would require
16//      `#[cfg]` annotations on dozens of call sites without changing the
17//      no-std error count, because those call sites are themselves
18//      std-bound for unrelated reasons. They migrate in lockstep with the
19//      consumer layer rather than ahead of it.
20// The CI job `no-std-check` exercises `cargo check --no-default-features
21// --features alloc` against a real no-std target (`thumbv7em-none-eabihf`)
22// and tracks remaining work via the error count, which must monotonically
23// decrease across PRs.
24#![cfg_attr(not(feature = "std"), no_std)]
25
26//! Embedded LSM-tree storage engine.
27//!
28//! Provides keyed point reads, prefix and range scans, MVCC snapshots, block
29//! and file-descriptor caching, and a configurable compaction subsystem. No
30//! write-ahead log — durability is the caller's responsibility (`flush_active_memtable`
31//! forces persistence when needed).
32//!
33//! ## Highlights
34//!
35//! - **AMQ filter**: `BuRR` (Bumped Ribbon Retrieval, Walzer & Dillinger 2022) for
36//!   per-key and per-prefix membership checks. ~30% smaller filter blocks than a
37//!   same-FPR Bloom filter, or ~10× tighter FPR at the same memory budget.
38//! - **Compression**: pure-Rust zstd (incl. dictionary mode), LZ4, or none —
39//!   per-table and per-level policy.
40//! - **Encryption at rest**: AES-256-GCM block encryption with a caller-supplied
41//!   key.
42//! - **Range tombstones**: `delete_range` / `delete_prefix` (SST-encoded; the
43//!   feature was added in disk format V4 and remains supported in the current
44//!   V5 format — the V5 break extends the block header for per-block Reed-
45//!   Solomon Page ECC, not the tombstone encoding).
46//! - **Merge operators**: commutative-merge LSM operations with lazy resolution.
47//! - **K/V separation (`BlobTree`)**: large-value workloads with automatic GC.
48//! - **Pluggable `Fs`**: standard, in-memory, `io_uring`, or custom backends.
49//! - **MVCC**: snapshot reads at a chosen `SeqNo`, custom `UserComparator`.
50//! - **Concurrency**: thread-safe `BTreeMap`-like API.
51//!
52//! Keys: up to 65,535 bytes (`u16` length field). Values: up to 4,294,967,295
53//! bytes (`u32` length field, `2³² − 1`). Larger keys and values
54//! carry a proportional performance cost.
55//!
56//! ## Quick start
57//!
58//! ```no_run
59//! use lsm_tree::{AbstractTree, Config, SequenceNumberCounter};
60//!
61//! let folder = tempfile::tempdir().unwrap();
62//! let seqno = SequenceNumberCounter::default();
63//! let tree = Config::new(&folder, seqno.clone(), SequenceNumberCounter::default())
64//!     .open()
65//!     .unwrap();
66//!
67//! tree.insert("key", "value", seqno.next());
68//! let value = tree.get("key", lsm_tree::SeqNo::MAX).unwrap();
69//! assert_eq!(value.map(|v| v.to_vec()), Some(b"value".to_vec()));
70//! ```
71//!
72//! ## On-disk format
73//!
74//! Current version: **V5**. V5 introduces the `BuRR` filter wire format,
75//! per-block Reed-Solomon Page ECC, and per-entry (per-KV) checksum
76//! footers (collapsed into the same version because V5 had not shipped
77//! when they landed): the self-describing block types (`Meta` / `Manifest` /
78//! `ManifestFooter`) gain a `block_flags` byte whose `ECC_PARITY` bit marks a
79//! parity trailer and whose `KV_CHECKSUM_FOOTER` bit marks a per-entry
80//! checksum footer. SST block types (`Data` / `Index` / `Filter` /
81//! `RangeTombstone`) keep the compact header WITHOUT that byte and derive parity / footer
82//! presence from the per-SST meta descriptors (`descriptor#page_ecc`,
83//! `descriptor#kv_checksum`). The block magic is bumped so a pre-V5 reader
84//! rejects V5 blocks immediately at header decode.
85//! A V5 SST written with every optional transform off (no Page ECC, no per-KV
86//! footers) is still NOT byte-identical to a pre-V5 table — the bumped block
87//! magic and the per-SST meta descriptor keys always differ. Any
88//! "byte-identical when off" guarantees in the feature docs are within-V5 and
89//! payload-level (e.g. index entries when `seqno_in_index = false`), not a
90//! cross-version equivalence.
91//! V3-V4 databases are not readable by this version and vice versa. The
92//! manifest version gate rejects pre-V5 databases at `Tree::open` time.
93//! V4 introduced range tombstones (still supported).
94#![deny(clippy::all, missing_docs, clippy::cargo)]
95#![deny(clippy::unwrap_used)]
96#![deny(clippy::indexing_slicing)]
97#![warn(clippy::pedantic, clippy::nursery)]
98#![warn(clippy::expect_used)]
99#![allow(clippy::missing_const_for_fn)]
100#![warn(clippy::multiple_crate_versions)]
101#![allow(clippy::option_if_let_else)]
102#![warn(clippy::redundant_feature_names)]
103// The `#[test_log::test]` attribute macro expands to a body that places
104// `use` items after statements, which trips `items_after_statements` in
105// the test build (`clippy --all-targets`). The lint fires only on the
106// macro's generated code, not on anything hand-written, so allow it
107// crate-wide rather than annotating every instrumented test.
108#![allow(clippy::items_after_statements)]
109// Test fixtures routinely bind closely-named locals (`dict_a` / `dict_b`,
110// `tree_1` / `tree_2`) where the similarity is the point; `similar_names`
111// is pedantic noise on that style.
112#![allow(clippy::similar_names)]
113// Long scenario tests (fuzz harnesses, multi-phase integration cases)
114// legitimately run past the `too_many_lines` ceiling; splitting them
115// would obscure the scenario. The lint is only reached under
116// `--all-targets`, which lints test bodies.
117#![allow(clippy::too_many_lines)]
118#![cfg_attr(coverage_nightly, feature(coverage_attribute))]
119
120// `alloc` is the minimal hard dependency — the crate uses `Arc`, `Vec`,
121// `Box`, and other heap types throughout. `extern crate alloc` makes the
122// `alloc` crate root visible to `no_std` builds; under `std` it is a
123// no-op alias because the standard library re-exports the same types.
124#[macro_use]
125extern crate alloc;
126
127// f64 ceil / round: the native `f64` methods are std-only (they bind to
128// platform float intrinsics), so the `no_std` build routes through `libm`.
129// std keeps the native path (equal or faster); the results are identical.
130#[cfg(feature = "std")]
131#[inline]
132pub(crate) fn f64_ceil(x: f64) -> f64 {
133    x.ceil()
134}
135#[cfg(not(feature = "std"))]
136#[inline]
137pub(crate) fn f64_ceil(x: f64) -> f64 {
138    libm::ceil(x)
139}
140#[cfg(feature = "std")]
141#[inline]
142pub(crate) fn f32_round(x: f32) -> f32 {
143    x.round()
144}
145#[cfg(not(feature = "std"))]
146#[inline]
147pub(crate) fn f32_round(x: f32) -> f32 {
148    libm::roundf(x)
149}
150#[cfg(feature = "std")]
151#[inline]
152pub(crate) fn f32_ceil(x: f32) -> f32 {
153    x.ceil()
154}
155#[cfg(not(feature = "std"))]
156#[inline]
157pub(crate) fn f32_ceil(x: f32) -> f32 {
158    libm::ceilf(x)
159}
160#[cfg(feature = "std")]
161#[inline]
162pub(crate) fn f64_log2(x: f64) -> f64 {
163    x.log2()
164}
165#[cfg(not(feature = "std"))]
166#[inline]
167pub(crate) fn f64_log2(x: f64) -> f64 {
168    libm::log2(x)
169}
170#[cfg(feature = "std")]
171#[inline]
172pub(crate) fn f32_log2(x: f32) -> f32 {
173    x.log2()
174}
175#[cfg(not(feature = "std"))]
176#[inline]
177pub(crate) fn f32_log2(x: f32) -> f32 {
178    libm::log2f(x)
179}
180
181// `hashbrown` (not `std::collections`) so the crate-wide map / set aliases
182// compile on `no_std + alloc`. hashbrown IS the implementation std's HashMap
183// wraps, so the API and performance match; using it directly just drops the
184// std dependency. Hasher stays `FxHasher` (fast, non-DoS — internal keys are
185// not attacker-controlled).
186#[doc(hidden)]
187pub type HashMap<K, V> = hashbrown::HashMap<K, V, rustc_hash::FxBuildHasher>;
188
189pub(crate) type HashSet<K> = hashbrown::HashSet<K, rustc_hash::FxBuildHasher>;
190
191macro_rules! fail_iter {
192    ($e:expr) => {
193        match $e {
194            Ok(v) => v,
195            Err(e) => return Some(Err(e.into())),
196        }
197    };
198}
199
200macro_rules! unwrap {
201    ($x:expr) => {{ $x.expect("should read") }};
202}
203
204mod any_tree;
205
206mod abstract_tree;
207
208pub(crate) mod deletion_pause;
209pub mod heal_hints;
210
211/// Computed write-backpressure verdict (caller-honoured, see [`backpressure`]).
212pub mod backpressure;
213
214// `checkpoint` is `pub(crate)`: it contains internal helpers
215// (`link_or_copy_cross_fs`, `prepare_target`, `run_checkpoint`) used by
216// `Tree::create_checkpoint` and `BlobTree::create_checkpoint`. Exposing
217// it via `pub` (even with `#[doc(hidden)]`) would lock the helpers into
218// the stable surface; tests that need to exercise them live inline as
219// unit tests inside `src/checkpoint.rs`.
220#[cfg(feature = "std")]
221pub(crate) mod checkpoint;
222
223#[doc(hidden)]
224pub mod blob_tree;
225
226// Vendored, `no_std`-ported `byteview` (backs `Slice`); kept in-tree so the
227// engine carries no external dependency that fails to compile on `no_std`.
228mod byteview;
229
230mod comparator;
231
232#[doc(hidden)]
233mod cache;
234
235/// In-tree sharded S3-FIFO cache backing `cache` and `descriptor_table`
236/// (replaces `quick_cache`; works on `std` and `no_std + alloc`).
237mod sharded_cache;
238
239#[doc(hidden)]
240pub mod checksum;
241
242#[doc(hidden)]
243pub mod coding;
244
245pub mod compaction;
246#[doc(hidden)]
247pub mod compression;
248
249/// Block-level encryption at rest.
250pub mod encryption;
251
252/// Configuration
253pub mod config;
254
255#[doc(hidden)]
256pub mod descriptor_table;
257
258/// Shard-based Page ECC (XOR single-parity and Reed-Solomon).
259///
260/// Gated behind the `page_ecc` cargo feature so the
261/// `reed-solomon-simd` dependency is only pulled in when the feature
262/// is enabled.
263#[cfg(feature = "page_ecc")]
264pub mod ecc;
265
266#[doc(hidden)]
267pub mod file_accessor;
268
269mod double_ended_peekable;
270mod error;
271
272#[doc(hidden)]
273pub mod file;
274
275/// Pluggable filesystem abstraction for I/O backends.
276pub mod fs;
277
278pub mod hash;
279
280/// Local I/O trait surface mirroring `std::io::{Read, Write, Seek}`.
281///
282/// Provides `Error` / `ErrorKind` / `SeekFrom` plus the three trait
283/// definitions so the bounds on the [`fs`] traits no longer carry
284/// `std::io::*` directly. Under the `std` feature, supertrait
285/// aliases + blanket impls forward to `std::io` types so existing
286/// std-backed backends satisfy the trait surface automatically; the
287/// alias form also propagates BACK to `std::io`, so a `dyn FsFile`
288/// bounded on `crate::io::Read` still flows into `std::io::BufReader`,
289/// `byteorder`, and friends.
290///
291/// Scope: this prerequisite slice (per #311) lifts only
292/// `Read`/`Write`/`Seek` out of the [`fs`] trait bounds. The
293/// `io::Result<T>` return types and `&Path` argument types in
294/// `fs::Fs` / `fs::FsFile` still resolve to `std::io::Result<T>` and
295/// `std::path::Path` and migrate in follow-up commits; the full
296/// `--no-default-features --features alloc` build of the `fs::*`
297/// surface arrives once those two follow-ups land.
298pub mod io;
299
300mod heap;
301mod ingestion;
302mod iter_guard;
303mod key;
304mod key_range;
305mod loser_tree;
306mod manifest;
307#[doc(hidden)]
308pub mod manifest_blocks;
309mod memtable;
310mod merge_operator;
311pub(crate) mod rate_limiter;
312mod reseek;
313mod run_reader;
314mod run_scanner;
315// Vendored sfa is std-only internally (`std::io` / `std::fs` /
316// `std::path`). Unconditional for the same cascading reason as
317// `manifest_blocks` above: ~20 consumers across the table /
318// blob-file / inspect / verify / checkpoint paths reference sfa
319// types in unconditional code. Gating sfa alone explodes the
320// `no-std-check` error count via unresolved-module failures on
321// every consumer that hasn't been gated yet. Migration is the
322// whole std-bound layer at once (tracked as issue #358), not sfa
323// in isolation.
324#[doc(hidden)]
325pub mod sfa;
326
327#[doc(hidden)]
328pub mod merge;
329
330#[doc(hidden)]
331pub mod merge_source;
332#[doc(hidden)]
333pub mod seeking_merger;
334
335#[cfg(feature = "metrics")]
336pub(crate) mod metrics;
337
338// mod multi_reader;
339
340#[doc(hidden)]
341pub mod mvcc_stream;
342
343mod path;
344mod pinnable_slice;
345mod prefix;
346
347#[doc(hidden)]
348pub mod range;
349
350/// Runtime-toggleable configuration (`RuntimeConfig` + atomic-swap handle).
351pub mod runtime_config;
352
353/// Disaster-recovery: rebuild a missing/corrupt manifest from on-disk SSTs.
354// std-only: scans table folders and rewrites the manifest via std::fs.
355#[cfg(feature = "std")]
356pub mod repair;
357
358// std-only: block-granular SST salvage; reads source blocks and writes a
359// recovered SST via std::fs (see also `crate::repair`, `crate::verify`).
360#[cfg(feature = "std")]
361pub mod salvage;
362
363pub(crate) mod active_tombstone_set;
364pub(crate) mod range_tombstone;
365pub(crate) mod range_tombstone_filter;
366
367#[doc(hidden)]
368pub mod table;
369
370mod background_deleter;
371mod scan_since;
372
373/// Single-error-correct / double-error-detect word codecs for the Page ECC
374/// read path (pluggable SEC-DED shapes; default Hsiao `(72, 64)`).
375///
376/// Gated behind `page_ecc` and crate-internal: the codec only runs on the
377/// Page ECC recovery path. Trailer sizing on the read path uses a plain
378/// `ceil(len / 8)` so reading a SEC-DED SST does not require this module.
379#[cfg(feature = "page_ecc")]
380pub(crate) mod secded;
381
382mod seqno;
383mod slice;
384mod slice_windows;
385
386#[doc(hidden)]
387pub mod stop_signal;
388
389mod format_version;
390mod time;
391mod tree;
392
393pub use time::Clock;
394#[cfg(feature = "std")]
395pub use time::SystemClock;
396#[cfg(not(feature = "std"))]
397pub use time::set_clock;
398
399/// Utility functions
400pub mod util;
401
402mod value;
403mod value_type;
404mod write_batch;
405
406/// Integrity verification for SST and blob files.
407///
408/// The block-level scrub (per-block + per-KV checksum walking) runs over the
409/// injected [`Fs`](crate::fs::Fs) backend through `crate::io`, so it compiles on
410/// `no_std` (serial path; the multi-threaded fan-out and the full-file
411/// hash-by-path convenience stay behind `std`).
412pub mod verify;
413
414/// Out-of-band inspection of a single SST file.
415///
416/// Public read-only view of stored metadata (table id, key range,
417/// counts, compression, timestamp) without spinning up a `Tree`.
418/// Used by `sst-dump properties` and similar diagnostic tools. See
419/// the module docs for the recovery semantics (mirrors
420/// `Table::recover`'s TAIL-first / MID-fallback path from #295).
421#[cfg(feature = "std")]
422pub mod inspect;
423
424/// ECC patrol scrub: a proactive sweep over Page-ECC-protected SST blocks.
425///
426/// Reads blocks to detect and correct latent bit-rot before it accumulates
427/// past the parity budget. Std-gated for the same reason as [`verify`]: the
428/// sweep needs real filesystem I/O and thread-based parallelism.
429#[cfg(feature = "std")]
430pub mod scrub;
431
432pub mod storage_stats;
433pub use storage_stats::{
434    ApproximateRangeStats, LevelStats, RangeCardinality, SegmentStats, StorageStatistics,
435    StorageStats, StorageStatus,
436};
437
438mod version;
439mod vlog;
440
441/// User defined key (byte array)
442pub type UserKey = Slice;
443
444/// User defined data (byte array)
445pub type UserValue = Slice;
446
447/// KV-tuple (key + value)
448pub type KvPair = (UserKey, UserValue);
449
450// The `#[doc(hidden)]` block below re-exports crate internals that are reachable
451// at the crate root for benchmarks and integration tests, but are NOT part of the
452// public API contract — they carry no semver guarantee and may be renamed, moved,
453// or removed without a major version bump. External callers that import these
454// hidden items do so at their own risk. `cargo doc` excludes them from generated
455// rustdoc output; only intra-crate test/bench code is expected to use them.
456#[doc(hidden)]
457pub use {
458    checksum::Checksum,
459    iter_guard::{IterGuardImpl, SeekableGuardIter},
460    key_range::KeyRange,
461    merge::BoxedIterator,
462    slice::Builder,
463    // Re-exported for `benches/lsp.rs` only — see hidden-block contract above.
464    table::util::longest_shared_prefix_length,
465    table::{GlobalTableId, Table, TableId},
466    value::InternalValue,
467};
468
469#[doc(hidden)]
470pub use {
471    blob_tree::{Guard as BlobGuard, handle::BlobIndirection},
472    tree::Guard as StandardGuard,
473    tree::inner::TreeId,
474};
475
476pub use encryption::EncryptionProvider;
477
478#[cfg(feature = "encryption")]
479pub use encryption::Aes256GcmProvider;
480
481#[doc(hidden)]
482#[cfg(feature = "std")]
483pub use background_deleter::BackgroundDeleter;
484pub use pinnable_slice::PinnableSlice;
485#[cfg(feature = "std")]
486pub use repair::RepairReport;
487pub use write_batch::WriteBatch;
488
489pub use {
490    cache::Cache,
491    comparator::{DefaultUserComparator, SharedComparator, UserComparator},
492    compression::CompressionType,
493    config::{Config, KvSeparationOptions, TreeType},
494    error::{Error, Result},
495    format_version::FormatVersion,
496    iter_guard::IterGuard as Guard,
497    memtable::{Memtable, MemtableId},
498    merge_operator::MergeOperator,
499    prefix::PrefixExtractor,
500    seqno::{
501        MAX_SEQNO, SequenceNumberCounter, SequenceNumberGenerator, SharedSequenceNumberGenerator,
502    },
503    slice::Slice,
504    value::SeqNo,
505    value_type::ValueType,
506};
507
508pub use {
509    abstract_tree::{AbstractTree, CheckpointInfo},
510    any_tree::AnyTree,
511    blob_tree::BlobTree,
512    descriptor_table::DescriptorTable,
513    ingestion::AnyIngestion,
514    scan_since::ScanSinceEvent,
515    tree::Tree,
516    vlog::BlobFile,
517};
518
519#[cfg(zstd_any)]
520pub use compression::ZstdDictionary;
521
522#[cfg(feature = "metrics")]
523pub use metrics::{CacheStats, Metrics};
524
525pub use backpressure::{Backpressure, BackpressureThresholds};
526
527#[cfg(feature = "std")]
528#[doc(hidden)]
529#[must_use]
530#[allow(missing_docs, clippy::missing_errors_doc, clippy::unwrap_used)]
531pub fn get_tmp_folder() -> tempfile::TempDir {
532    if let Ok(p) = std::env::var("LSMT_TMP_FOLDER") {
533        tempfile::tempdir_in(p)
534    } else {
535        tempfile::tempdir()
536    }
537    .unwrap()
538}