lsm_tree/config/mod.rs
1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5mod block_size;
6mod compression;
7mod delete_strategy;
8mod filter;
9mod hash_ratio;
10mod locator;
11mod pinning;
12mod restart_interval;
13
14pub use block_size::BlockSizePolicy;
15pub use compression::CompressionPolicy;
16pub use delete_strategy::{DeleteStrategy, DeleteStrategyPolicy};
17pub use filter::{BloomConstructionPolicy, FilterPolicy, FilterPolicyEntry};
18pub use hash_ratio::HashRatioPolicy;
19pub use locator::{LocatorPolicy, LocatorPolicyEntry, LocatorPrecision};
20pub use pinning::PinningPolicy;
21pub use restart_interval::RestartIntervalPolicy;
22
/// Partitioning policy for indexes and filters.
///
/// Type alias of [`PinningPolicy`], so it is built through the same
/// constructors (e.g. `PinningPolicy::all(true)` or
/// `PinningPolicy::new([false, false, false, true])`).
pub type PartitioningPolicy = PinningPolicy;
25
26#[cfg(feature = "std")]
27use crate::fs::StdFs;
28use crate::path::PathBuf;
29use crate::{
30 AnyTree, BlobTree, Cache, CompressionType, DescriptorTable, SharedSequenceNumberGenerator,
31 Tree,
32 compaction::filter::Factory,
33 comparator::SharedComparator,
34 encryption::EncryptionProvider,
35 file::TABLES_FOLDER,
36 fs::{Fs, SyncMode},
37 merge_operator::MergeOperator,
38 path::absolute_path,
39 prefix::PrefixExtractor,
40};
41// std-only: used solely by the std-gated `Config::default` / `Config::new`
42// constructors (the no_std path builds `Config` field-by-field).
43#[cfg(feature = "std")]
44use crate::{SequenceNumberCounter, comparator, path::Path, version::DEFAULT_LEVEL_COUNT};
45use alloc::sync::Arc;
46#[cfg(not(feature = "std"))]
47use alloc::vec::Vec;
48use core::ops::Range;
49
/// Per-level filesystem routing entry for tiered storage.
///
/// Maps a range of LSM levels to a base directory and filesystem backend.
/// Tables at these levels are stored under `path/tables/`.
///
/// The [`Debug`](core::fmt::Debug) implementation prints `levels` and `path`
/// only; the `fs` backend is left out of the output.
///
/// # Example
///
/// ```
/// use lsm_tree::config::LevelRoute;
/// use lsm_tree::fs::StdFs;
/// use std::sync::Arc;
///
/// // Hot tier: L0-L1 on NVMe
/// let hot = LevelRoute {
///     levels: 0..2,
///     path: "/mnt/nvme/db".into(),
///     fs: Arc::new(StdFs),
/// };
///
/// // Cold tier: L4-L6 on HDD
/// let cold = LevelRoute {
///     levels: 4..7,
///     path: "/mnt/hdd/db".into(),
///     fs: Arc::new(StdFs),
/// };
/// ```
#[derive(Clone)]
pub struct LevelRoute {
    /// LSM levels this route covers, as a half-open range
    /// (e.g., `0..2` for L0–L1, `4..7` for L4–L6).
    pub levels: Range<u8>,

    /// Base data directory for tables at these levels.
    pub path: PathBuf,

    /// Filesystem backend for I/O at these levels.
    pub fs: Arc<dyn Fs>,
}
87
88impl core::fmt::Debug for LevelRoute {
89 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
90 f.debug_struct("LevelRoute")
91 .field("levels", &self.levels)
92 .field("path", &self.path)
93 .finish_non_exhaustive()
94 }
95}
96
/// Policy governing what `Tree::open` does when the on-disk MANIFEST
/// contains corrupt records.
///
/// Mirrors `RocksDB`'s `WALRecoveryMode` semantics, but applied to the
/// manifest layer (`src/version/recovery.rs`) — lsm-tree itself has no
/// WAL (durability lives one layer up in the parent fjall/keyspace
/// crate's `Journal`). The MANIFEST is the equivalent surface where
/// "loss-tolerance vs strict-consistency" matters at open time.
///
/// The variants are declared from strictest to most permissive.
///
/// The default is [`AbsoluteConsistency`](Self::AbsoluteConsistency) —
/// any corrupt record fails the open. Switching to a more permissive
/// mode is an explicit, informed operator decision: you are trading
/// "the tree might silently come up with missing tables / blob files"
/// for "the tree comes up at all". When a non-default mode drops
/// records, the recovery path emits a `warn!` summary with the
/// AGGREGATE dropped count per section (`tables` / `blob_files`) —
/// individual table IDs / blob-file IDs are NOT enumerated, because
/// they were never decoded in the first place. Operators wanting a
/// per-record audit trail should pair tail-tolerant recovery with an
/// out-of-band integrity scan ([`verify_integrity`](crate::verify::verify_integrity))
/// of the recovered tree.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum ManifestRecoveryMode {
    /// Production-safe default. Any per-record decode mismatch (bad
    /// XXH3, invalid tag, truncated TOC entry, declared-count overrun)
    /// aborts the open with the original error. Surfaces every byte
    /// of corruption; never silently drops data.
    #[default]
    AbsoluteConsistency,

    /// Power-loss-at-write-tail salvage. If the per-section iteration
    /// over the `tables` / `blob_files` records runs out of bytes
    /// before the declared count is reached (truncated tail), keep
    /// everything that decoded cleanly before the cut and emit a
    /// `warn!` listing the dropped record counts.
    ///
    /// A declared count that exceeds the section's payload capacity
    /// (e.g. `table_count` claims more entries than the section has
    /// bytes for) is treated as the same "writer committed a count
    /// header then truncated the entries" shape — the recovery
    /// downgrades the original hard fail to a `warn!` and lets the
    /// per-entry decode loop walk bytes-actually-present until the
    /// first `UnexpectedEof`.
    ///
    /// Any decode error that is NOT a clean tail truncation (bad
    /// `checksum_type` tag, etc.) still aborts the open — this mode
    /// is specifically for "the writer never finished" scenarios,
    /// not for arbitrary bit-rot in already-committed bytes.
    TolerateCorruptedTailRecords,

    /// Recover the largest consistent prefix and discard the rest.
    /// Adapts `RocksDB`'s `kPointInTimeRecovery` accept-the-prefix
    /// rule to the level/run/table nesting: on the first
    /// record-decode mismatch inside the `tables` section, the
    /// recovery keeps the records that decoded cleanly *before*
    /// the corrupt one in the current run, plus every complete
    /// earlier run in the same level, plus every complete earlier
    /// level. "Record-decode mismatch" covers ALL three failure
    /// shapes the per-record loop can surface:
    ///
    /// 1. Framing-layer XXH3 mismatch (the 8-byte digest in the
    ///    record header doesn't match `xxh3_64(payload)`).
    /// 2. Framing-header structural failure (`len > MAX_FRAME_PAYLOAD`),
    ///    surfaced as `BadHeader`. Note: `LenMismatch` (decoded `len`
    ///    disagrees with a fixed-length pin) is a SEPARATE hard-abort
    ///    case in every recovery mode, not a record-decode mismatch
    ///    for the purpose of this mode.
    /// 3. Payload decode failure AFTER a clean framing pass —
    ///    e.g. `Error::InvalidTag` from a corrupt `checksum_type`
    ///    byte inside an otherwise-framed-OK record. The framing
    ///    XXH3 happens to cover the corrupt byte too (it's a
    ///    digest of the whole payload), so the bytes decode
    ///    cleanly at the framing layer; the corruption only
    ///    surfaces inside the per-entry decode helper.
    ///
    /// PIT drops the corrupt record itself, the remaining records
    /// of that run, and every level not yet read. The same rule
    /// applies to the `blob_files` section. Clean tail-truncation
    /// is still tolerated, same as
    /// [`TolerateCorruptedTailRecords`](Self::TolerateCorruptedTailRecords).
    PointInTimeRecovery,

    /// Skip each corrupt record individually, keep all others.
    /// Maximum-availability, lossy. On any per-record decode
    /// mismatch — framing-layer XXH3 mismatch, payload-decode
    /// failure inside an otherwise-framed-OK record (e.g.
    /// `Error::InvalidTag` on a corrupt `checksum_type` byte), or
    /// a framing-header `BadHeader` — the reader logs the skip
    /// and advances exactly past the bad record using the
    /// framing-supplied length field. If the length field itself
    /// is unusable (the recorded length is outside the legal
    /// range, so the next-record boundary is unknown), the rest
    /// of that section is dropped. Intended companion to the
    /// `repair_db` tooling tracked as `#303`: this mode recovers
    /// what it can in-place; `repair_db` rebuilds the manifest
    /// from the SST files themselves when even this mode can't
    /// reach a usable state.
    SkipAnyCorruptedRecords,
}
196
/// LSM-tree type
///
/// Converts to and from a single-byte tag (`0` = standard, `1` = blob)
/// through the `From<TreeType> for u8` and `TryFrom<u8>` impls below.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum TreeType {
    /// Standard LSM-tree, see [`Tree`]
    Standard,

    /// Key-value separated LSM-tree, see [`BlobTree`]
    Blob,
}

/// Encodes a tree type as its one-byte tag.
///
/// This exhaustive match is the single source of truth for the tag values;
/// the decoding direction is derived from it.
impl From<TreeType> for u8 {
    fn from(val: TreeType) -> Self {
        match val {
            TreeType::Standard => 0,
            TreeType::Blob => 1,
        }
    }
}

/// Decodes a one-byte tag back into a tree type.
///
/// Any byte that is not the tag of a known variant yields `Err(())`.
impl TryFrom<u8> for TreeType {
    type Error = ();

    fn try_from(value: u8) -> Result<Self, Self::Error> {
        // Look the byte up by round-tripping every variant through the
        // encoder, so encode and decode cannot drift apart.
        [Self::Standard, Self::Blob]
            .iter()
            .copied()
            .find(|candidate| u8::from(*candidate) == value)
            .ok_or(())
    }
}
227
/// Relative folder name the std-gated `Config::default` uses as the tree's
/// data directory (it is converted to an absolute path there).
#[cfg_attr(
    not(feature = "std"),
    allow(
        dead_code,
        reason = "default data-folder path used only on the std-gated default-config path"
    )
)]
const DEFAULT_FILE_FOLDER: &str = ".lsm.data";
236
/// Options for key-value separation
///
/// Built with [`Default`] and then adjusted through the consuming setter
/// methods on this type.
#[derive(Clone, Debug, PartialEq)]
pub struct KvSeparationOptions {
    /// What type of compression is used for blobs
    #[doc(hidden)]
    pub compression: CompressionType,

    /// Blob file target size in bytes
    #[doc(hidden)]
    pub file_target_size: u64,

    /// Key-value separation threshold in bytes
    #[doc(hidden)]
    pub separation_threshold: u32,

    /// Staleness ratio: how fragmented a blob file needs to be before it is
    /// picked up by garbage collection. `Default` sets this to `0.25`.
    #[doc(hidden)]
    pub staleness_threshold: f32,

    /// Age cutoff ratio. `Default` sets this to `0.25`.
    // NOTE(review): the exact meaning of the cutoff is not visible in this
    // file; confirm against the blob garbage-collection code.
    #[doc(hidden)]
    pub age_cutoff: f32,

    /// Pre-trained zstd dictionary for blob-file dictionary compression.
    ///
    /// Required when `compression` is [`CompressionType::ZstdDict`].
    /// The `dict_id` in the compression type must match [`ZstdDictionary::id`](crate::ZstdDictionary::id).
    #[cfg(zstd_any)]
    #[doc(hidden)]
    pub zstd_dictionary: Option<alloc::sync::Arc<crate::compression::ZstdDictionary>>,
}
266
267impl Default for KvSeparationOptions {
268 fn default() -> Self {
269 Self {
270 #[cfg(feature="lz4")]
271 compression: CompressionType::Lz4,
272
273 #[cfg(not(feature="lz4"))]
274 compression: CompressionType::None,
275
276 file_target_size: /* 64 MiB */ 64 * 1_024 * 1_024,
277 separation_threshold: /* 1 KiB */ 1_024,
278
279 staleness_threshold: 0.25,
280 age_cutoff: 0.25,
281
282 #[cfg(zstd_any)]
283 zstd_dictionary: None,
284 }
285 }
286}
287
288impl KvSeparationOptions {
289 /// Sets the blob compression method.
290 #[must_use]
291 pub fn compression(mut self, compression: CompressionType) -> Self {
292 self.compression = compression;
293 self
294 }
295
296 /// Sets the target size of blob files.
297 ///
298 /// Smaller blob files allow more granular garbage collection
299 /// which allows lower space amp for lower write I/O cost.
300 ///
301 /// Larger blob files decrease the number of files on disk and maintenance
302 /// overhead.
303 ///
304 /// Defaults to 64 MiB.
305 #[must_use]
306 pub fn file_target_size(mut self, bytes: u64) -> Self {
307 self.file_target_size = bytes;
308 self
309 }
310
311 /// Sets the key-value separation threshold in bytes.
312 ///
313 /// Smaller value will reduce compaction overhead and thus write amplification,
314 /// at the cost of lower read performance.
315 ///
316 /// Defaults to 1 KiB.
317 #[must_use]
318 pub fn separation_threshold(mut self, bytes: u32) -> Self {
319 self.separation_threshold = bytes;
320 self
321 }
322
323 /// Sets the staleness threshold percentage.
324 ///
325 /// The staleness percentage determines how much a blob file needs to be fragmented to be
326 /// picked up by the garbage collection.
327 ///
328 /// Defaults to 33%.
329 #[must_use]
330 pub fn staleness_threshold(mut self, ratio: f32) -> Self {
331 self.staleness_threshold = ratio;
332 self
333 }
334
335 /// Sets the age cutoff threshold.
336 ///
337 /// Defaults to 20%.
338 #[must_use]
339 pub fn age_cutoff(mut self, ratio: f32) -> Self {
340 self.age_cutoff = ratio;
341 self
342 }
343
344 /// Sets the zstd dictionary for blob-file dictionary compression.
345 ///
346 /// Required when [`compression`](Self::compression) is set to
347 /// [`CompressionType::ZstdDict`]. The `dict_id` encoded in the
348 /// compression type must equal [`ZstdDictionary::id()`](crate::ZstdDictionary::id) of the
349 /// supplied dictionary; [`Config::open`] will return
350 /// [`Error::ZstdDictMismatch`](crate::Error::ZstdDictMismatch) if
351 /// they disagree.
352 #[cfg(zstd_any)]
353 #[must_use]
354 pub fn dict(
355 mut self,
356 dictionary: alloc::sync::Arc<crate::compression::ZstdDictionary>,
357 ) -> Self {
358 self.zstd_dictionary = Some(dictionary);
359 self
360 }
361}
362
/// Tree configuration builder
///
/// With the `std` feature, start from [`Config::new`] (or `Config::default`)
/// and adjust it through the consuming builder methods. Without `std` the
/// struct is built field-by-field with a caller-provided [`Fs`].
pub struct Config {
    /// Folder path
    #[doc(hidden)]
    pub path: PathBuf,

    /// Default filesystem backend for levels without an explicit route.
    ///
    /// Defaults to [`StdFs`]. Use [`Config::with_fs`] to plug in an
    /// alternative backend such as [`MemFs`](crate::fs::MemFs).
    ///
    /// Both fresh tree creation and reopening (recovery) are supported
    /// for any backend that implements [`Fs`].
    #[doc(hidden)]
    pub fs: Arc<dyn Fs>,

    /// Per-level filesystem routing for tiered storage.
    ///
    /// When set, tables at different LSM levels can be stored on different
    /// storage devices (e.g., NVMe for L0–L1, SSD for L2–L4, HDD for L5–L6).
    /// Each entry maps a range of levels to a base directory and filesystem
    /// backend. Uncovered levels fall back to the primary `path` and `fs`.
    ///
    /// Zero additional overhead when `None` — only a single branch check;
    /// path construction allocations are unchanged.
    #[doc(hidden)]
    pub level_routes: Option<Vec<LevelRoute>>,

    /// Block cache to use
    #[doc(hidden)]
    pub cache: Arc<Cache>,

    /// Descriptor table to use
    #[doc(hidden)]
    pub descriptor_table: Option<Arc<DescriptorTable>>,

    /// Number of levels of the LSM tree (depth of tree)
    ///
    /// Once set, the level count is fixed (in the "manifest" file)
    pub level_count: u8,

    /// What type of compression is used for data blocks
    pub data_block_compression_policy: CompressionPolicy,

    /// What type of compression is used for index blocks
    pub index_block_compression_policy: CompressionPolicy,

    /// Restart interval inside data blocks
    pub data_block_restart_interval_policy: RestartIntervalPolicy,

    /// Restart interval inside index blocks
    pub index_block_restart_interval_policy: RestartIntervalPolicy,

    /// Block size of data blocks
    pub data_block_size_policy: BlockSizePolicy,

    /// Whether to pin index blocks
    pub index_block_pinning_policy: PinningPolicy,

    /// Whether to pin filter blocks
    pub filter_block_pinning_policy: PinningPolicy,

    /// Whether to pin top level index of partitioned index
    pub top_level_index_block_pinning_policy: PinningPolicy,

    /// Whether to pin top level index of partitioned filter
    pub top_level_filter_block_pinning_policy: PinningPolicy,

    /// Data block hash ratio
    pub data_block_hash_ratio_policy: HashRatioPolicy,

    /// Whether to partition index blocks
    pub index_block_partitioning_policy: PartitioningPolicy,

    /// Whether to partition filter blocks
    pub filter_block_partitioning_policy: PartitioningPolicy,

    /// Partition size when using partitioned indexes
    pub index_block_partition_size_policy: BlockSizePolicy,

    /// Partition size when using partitioned filters
    pub filter_block_partition_size_policy: BlockSizePolicy,

    /// If `true`, the last level will not build filters, reducing the filter size of a database
    /// by ~90% typically
    pub(crate) expect_point_read_hits: bool,

    /// Per-block Page ECC. When `true`, every block on disk carries a parity
    /// trailer; on read, if the block's XXH3 disagrees with the on-disk bytes,
    /// the reader attempts recovery from the trailer before surfacing the
    /// corruption. The correction scheme is selected at runtime
    /// (`update_runtime_config`): per-word SEC-DED (the default), single XOR
    /// parity, or Reed-Solomon. Requires the `page_ecc` cargo feature — opening a
    /// tree with `page_ecc = true` on a build without the feature returns
    /// [`crate::Error::PageEccUnsupported`].
    ///
    /// Off by default. `RocksDB` ships per-block ECC as an operator-
    /// chosen knob (typically off on RAID-protected media, on on
    /// single-drive) and the cost is non-trivial on the write path,
    /// so the default keeps the existing behaviour.
    pub(crate) page_ecc: bool,

    /// Initial [`crate::runtime_config::RuntimeConfig`] snapshot
    /// the tree starts with. Seeds both the first
    /// `persist_version` call and the Tree's
    /// `RuntimeConfigHandle`, so a non-default value supplied via
    /// [`Config::with_runtime_config`] is honoured from byte zero
    /// of the manifest. Defaults to `RuntimeConfig::default()` —
    /// matches the pre-existing implicit behaviour.
    #[expect(
        clippy::struct_field_names,
        reason = "name mirrors the type for grep-ability across the persist + Tree handle init wiring"
    )]
    pub(crate) initial_runtime_config: crate::runtime_config::RuntimeConfig,

    /// Filter construction policy
    pub filter_policy: FilterPolicy,

    /// Retrieval-ribbon locator policy (per level). Defaults to
    /// [`LocatorPolicy::block_level`]: written SSTs carry an optional `locator`
    /// section mapping each key to its data block for O(1) point reads (skipping
    /// the index-block binary search). Set [`LocatorPolicy::disabled`] to opt
    /// out — disabled levels produce byte-identical SSTs (no section).
    pub locator_policy: LocatorPolicy,

    /// Compaction filter factory
    pub compaction_filter_factory: Option<Arc<dyn Factory>>,

    /// Prefix extractor for prefix bloom filters.
    ///
    /// When set, the bloom filter indexes extracted prefixes in addition to
    /// full keys, allowing prefix scans to skip segments that contain no
    /// matching prefixes.
    pub prefix_extractor: Option<Arc<dyn PrefixExtractor>>,

    /// Merge operator for commutative operations
    ///
    /// When set, enables `merge()` operations that store partial updates
    /// which are lazily combined during reads and compaction.
    pub merge_operator: Option<Arc<dyn MergeOperator>>,

    /// Key-value separation options, see [`KvSeparationOptions`].
    ///
    /// `None` in the default configuration.
    #[doc(hidden)]
    pub kv_separation_opts: Option<KvSeparationOptions>,

    /// Custom user key comparator.
    ///
    /// When set, all key comparisons use this comparator instead of the
    /// default lexicographic byte ordering. Once a tree is opened with a
    /// comparator, it must always be re-opened with the same comparator.
    // Not `pub` — use `Config::comparator()` builder method as the public API.
    #[doc(hidden)]
    pub(crate) comparator: SharedComparator,

    /// Block-level encryption provider for encryption at rest.
    ///
    /// When set, all blocks (data, index, filter, meta) are encrypted
    /// using this provider after compression and before checksumming.
    pub(crate) encryption: Option<Arc<dyn EncryptionProvider>>,

    /// Policy governing what `Tree::open` does when the on-disk
    /// MANIFEST contains corrupt records. Defaults to
    /// [`ManifestRecoveryMode::AbsoluteConsistency`], the only
    /// production-safe choice — any corruption aborts the open. Other
    /// modes trade strict correctness for partial-availability after a
    /// disaster; see the enum doc for the operational scenarios that
    /// motivate each mode.
    pub(crate) manifest_recovery_mode: ManifestRecoveryMode,

    /// Durability level for every fsync the tree issues (SST writes,
    /// manifest, version persist, directory syncs).
    ///
    /// Defaults to [`SyncMode::Normal`] (plain `fsync`), matching the
    /// out-of-the-box durability of `RocksDB` and `SQLite`. Only observable on
    /// macOS, where [`SyncMode::Full`] opts into the much slower
    /// `F_FULLFSYNC` barrier; on other platforms both modes are plain
    /// `fsync`. Set via [`Config::sync_mode`].
    pub(crate) sync_mode: SyncMode,

    /// When `true` (the default), [`Config::open`] and [`Config::repair`]
    /// acquire an exclusive cross-process lock on a `LOCK` file in the tree
    /// directory (an advisory OS file lock) and hold it for the lifetime of the
    /// [`Tree`] (open) or the duration of the call (repair). A
    /// second process attempting to open / repair the same directory fails fast
    /// with [`Error::Locked`](crate::Error::Locked) instead of racing on the
    /// manifest. Set `false` via [`Config::with_directory_lock`] only when the
    /// embedder already enforces exclusive directory ownership at a higher layer
    /// (e.g. a keyspace / journal manager). Best-effort per `Fs` backend: real
    /// on-disk backends enforce it, in-memory backends are single-process and
    /// satisfy it vacuously.
    pub(crate) directory_lock: bool,

    /// Edit-log size (bytes) past which the next manifest persist rotates: it
    /// writes a fresh full snapshot and starts an empty log instead of appending
    /// another [`VersionEdit`](crate::version::edit::VersionEdit). Bounds both
    /// recovery replay time (edits to re-apply) and log disk use, while keeping
    /// the common per-flush path a tiny `O(changed-levels)` append rather than an
    /// `O(all-SSTs)` full manifest rewrite.
    ///
    /// Defaults to 1 MiB (≈ tens of thousands of edits). Set via
    /// [`Config::manifest_log_rotate_bytes`]. A smaller value rotates more
    /// often (shorter recovery, more frequent full-snapshot writes); `0` rotates
    /// on every upgrade, degenerating to the full-rewrite-per-version behaviour.
    pub(crate) manifest_log_rotate_bytes: u64,

    /// Compaction I/O rate limit in bytes per second.
    ///
    /// Caps the rate at which the compaction worker is allowed to issue
    /// I/O, so background compaction cannot saturate the device and starve
    /// user point reads / range scans (P99 stability). `0` (the default)
    /// means unlimited — no throttling, no behaviour change. Flush and
    /// user reads are never throttled, only compaction. Set via
    /// [`Config::compaction_rate_limit`].
    pub(crate) compaction_rate_limit: u64,

    /// Worker-thread count for compaction parallelism (`std` only), used two
    /// ways: it sizes the per-tree block-compression pool built at open when
    /// [`Self::compaction_pool`] is `None`, and it caps how many range-parallel
    /// sub-compactions a single compaction is split into. Default
    /// `max(1, available_parallelism / 2)` — leaves half the cores for
    /// application work. `1` forces the serial path for both. Without the
    /// `parallel` feature there is no built-in pool, so block compression and
    /// sub-compaction ranges run serially even for a value > 1. Set via
    /// [`Config::compaction_threads`].
    #[cfg(feature = "std")] // no-std: parallel compaction unavailable (no threads)
    pub(crate) compaction_threads: usize,

    /// Optional shared compaction thread pool. `None` (default) = a per-tree
    /// pool is built at [`crate::Tree::open`] sized by [`Self::compaction_threads`]
    /// (predictable, matches the per-DB pattern). `Some` = caller-supplied
    /// executor shared across every tree holding this `Arc`, bounding total
    /// threads regardless of tree count. Set via [`Config::compaction_pool`].
    #[cfg(feature = "std")]
    pub(crate) compaction_pool: Option<Arc<dyn crate::table::writer::CompactionSpawner>>,

    /// Minimum total input size (bytes) for a compaction to be split into
    /// parallel sub-compactions. Below it the compaction stays single-threaded
    /// (per-thread setup + extra output tables outweigh the parallelism on small
    /// compactions). Default
    /// [`SUBCOMPACTION_MIN_INPUT_BYTES`](crate::compaction::worker::SUBCOMPACTION_MIN_INPUT_BYTES)
    /// (8 MiB). Set via [`Config::subcompaction_min_bytes`].
    #[cfg(feature = "std")]
    pub(crate) subcompaction_min_bytes: u64,

    /// Test-only failpoint: when armed, the first parallel sub-compaction range
    /// that observes it returns an error and disarms it, so the crash-safety
    /// rollback paths (sibling output rollback, input restore) can be exercised
    /// deterministically. Behind `cfg(test)`, never compiled into release builds.
    #[cfg(all(test, feature = "std"))]
    pub(crate) fail_one_subcompaction: Arc<core::sync::atomic::AtomicBool>,

    /// Test-only failpoint: when armed, a tight-space compaction returns an error
    /// immediately after durably installing (and punching) its FIRST slice, so
    /// the crash-mid-loop recovery path (reopen a tree whose manifest carries a
    /// persisted input restriction) can be exercised deterministically. Behind
    /// `cfg(test)`, never compiled into release builds.
    #[cfg(all(test, feature = "std"))]
    pub(crate) fail_tight_after_first_slice: Arc<core::sync::atomic::AtomicBool>,

    /// Pre-trained zstd dictionary for dictionary compression.
    ///
    /// When set together with a [`CompressionType::ZstdDict`] compression
    /// policy, data blocks are compressed using this dictionary. The
    /// dictionary must remain the same for the lifetime of the tree —
    /// opening a tree with a different dictionary will produce
    /// [`Error::ZstdDictMismatch`](crate::Error::ZstdDictMismatch) errors.
    #[cfg(zstd_any)]
    pub(crate) zstd_dictionary: Option<Arc<crate::compression::ZstdDictionary>>,

    /// The global sequence number generator.
    ///
    /// Should be shared between multiple trees of a database.
    pub(crate) seqno: SharedSequenceNumberGenerator,

    /// Sequence number watermark that is visible to readers.
    ///
    /// Used for MVCC snapshots and to control which updates are
    /// observable in a given view of the database.
    pub(crate) visible_seqno: SharedSequenceNumberGenerator,
}
642
// TODO: remove default?
// Only built with `std`: the stock backend is `StdFs` and the stock path is
// resolved via std::path::absolute. no_std callers assemble `Config`
// explicitly around a caller-provided `Fs`.
#[cfg(feature = "std")]
impl Default for Config {
    /// Builds the stock configuration: `.lsm.data` (made absolute) on
    /// `StdFs`, a 16 MiB block cache, fresh sequence counters, and the
    /// per-level policies listed below.
    fn default() -> Self {
        let cache_capacity_bytes = /* 16 MiB */ 16 * 1_024 * 1_024;

        // Data blocks: first level uncompressed, LZ4 afterwards when the
        // `lz4` feature is compiled in; otherwise uncompressed throughout.
        #[cfg(feature = "lz4")]
        let data_block_compression_policy =
            CompressionPolicy::new([CompressionType::None, CompressionType::Lz4]);
        #[cfg(not(feature = "lz4"))]
        let data_block_compression_policy = CompressionPolicy::new([CompressionType::None]);

        // Half of the available cores, never fewer than one; one when the
        // core count cannot be queried.
        #[cfg(feature = "std")]
        let compaction_threads = match std::thread::available_parallelism() {
            Ok(cores) => (cores.get() / 2).max(1),
            Err(_) => 1,
        };

        let bloom_ten_bits_per_key =
            FilterPolicyEntry::Bloom(BloomConstructionPolicy::BitsPerKey(10.0));

        Self {
            // Location and I/O backends.
            path: absolute_path(Path::new(DEFAULT_FILE_FOLDER)),
            fs: Arc::new(StdFs),
            level_routes: None,
            cache: Arc::new(Cache::with_capacity_bytes(cache_capacity_bytes)),
            descriptor_table: Some(Arc::new(DescriptorTable::new(256))),

            // Tree shape.
            level_count: DEFAULT_LEVEL_COUNT,

            // Block encoding.
            data_block_compression_policy,
            index_block_compression_policy: CompressionPolicy::all(CompressionType::None),
            data_block_restart_interval_policy: RestartIntervalPolicy::all(16),
            index_block_restart_interval_policy: RestartIntervalPolicy::all(1),
            data_block_size_policy: BlockSizePolicy::all(4_096),

            // Pinning.
            index_block_pinning_policy: PinningPolicy::new([true, true, false]),
            filter_block_pinning_policy: PinningPolicy::new([true, false]),
            top_level_index_block_pinning_policy: PinningPolicy::all(true), // TODO: implement
            top_level_filter_block_pinning_policy: PinningPolicy::all(true), // TODO: implement

            data_block_hash_ratio_policy: HashRatioPolicy::all(0.0),

            // Index blocks are partitioned at every level: a bit-flip inside
            // one sub-index block then only loses the keys that partition
            // covers instead of the whole SST. A full-index SST has no
            // within-block redundancy, so a single corrupt byte in its one
            // index block makes every data block in the table unreachable.
            // tests/partitioned_index_blast_radius.rs covers the isolation
            // property this default relies on.
            index_block_partitioning_policy: PinningPolicy::all(true),
            // Filter blocks deliberately keep the pre-#329 shape (L3+ only).
            // A corrupt filter block can yield a false negative (filter says
            // "not present" → read short-circuits → caller misses an existing
            // key), a correctness hazard unlike index corruption, where the
            // read fails loudly. Changing this default is a separate decision
            // pending a filter blast-radius / false-negative analysis;
            // symmetry with the index default is not enough on its own.
            filter_block_partitioning_policy: PinningPolicy::new([false, false, false, true]),
            index_block_partition_size_policy: BlockSizePolicy::all(4_096), // TODO: implement
            filter_block_partition_size_policy: BlockSizePolicy::all(4_096), // TODO: implement

            expect_point_read_hits: false,
            page_ecc: false,
            initial_runtime_config: crate::runtime_config::RuntimeConfig::default(),

            filter_policy: FilterPolicy::all(bloom_ten_bits_per_key),
            locator_policy: LocatorPolicy::block_level(),

            // Optional hooks, all off by default.
            compaction_filter_factory: None,
            prefix_extractor: None,
            merge_operator: None,
            kv_separation_opts: None,

            comparator: comparator::default_comparator(),
            encryption: None,

            // Durability and recovery.
            manifest_recovery_mode: ManifestRecoveryMode::AbsoluteConsistency,
            sync_mode: SyncMode::Normal,
            directory_lock: true,
            manifest_log_rotate_bytes: 1024 * 1024,

            // Compaction.
            compaction_rate_limit: 0,
            #[cfg(feature = "std")]
            compaction_threads,
            #[cfg(feature = "std")]
            compaction_pool: None,
            #[cfg(feature = "std")]
            subcompaction_min_bytes: crate::compaction::worker::SUBCOMPACTION_MIN_INPUT_BYTES,
            #[cfg(all(test, feature = "std"))]
            fail_one_subcompaction: Arc::new(core::sync::atomic::AtomicBool::new(false)),
            #[cfg(all(test, feature = "std"))]
            fail_tight_after_first_slice: Arc::new(core::sync::atomic::AtomicBool::new(false)),

            #[cfg(zstd_any)]
            zstd_dictionary: None,

            // Sequence numbers.
            seqno: SharedSequenceNumberGenerator::from(SequenceNumberCounter::default()),
            visible_seqno: SharedSequenceNumberGenerator::from(SequenceNumberCounter::default()),
        }
    }
}
753
/// Name of the lock file created in a tree directory for the cross-process
/// exclusive directory lock.
///
/// `acquire_directory_lock` joins this name onto the tree directory to get
/// the path of the file it opens and locks.
#[cfg_attr(
    not(feature = "std"),
    allow(
        dead_code,
        reason = "directory-lock filename used only by the std-gated lock-acquisition path"
    )
)]
pub(crate) const DIRECTORY_LOCK_FILE: &str = "LOCK";
764
/// Takes the cross-process exclusive directory lock, if locking is `enabled`.
///
/// Opens the `LOCK` file under `dir` (creating it when absent) through the
/// `Fs` backend and attempts a non-blocking exclusive advisory lock on it.
///
/// # Returns
///
/// - `Ok(None)` when `enabled` is `false`; nothing is opened.
/// - `Ok(Some(handle))` when the lock was taken. Keep the handle alive for as
///   long as exclusivity is required; dropping it releases the lock (the OS
///   frees an advisory lock when the fd / handle closes).
///
/// # Errors
///
/// - [`Error::Locked`](crate::Error::Locked), carrying `dir`, when another
///   live instance holds the lock.
/// - Any error from opening the file or from the lock attempt itself.
///
/// The directory must already exist (the caller creates it for a fresh tree
/// before acquiring).
#[cfg(feature = "std")]
pub(crate) fn acquire_directory_lock(
    fs: &dyn Fs,
    dir: &Path,
    enabled: bool,
) -> crate::Result<Option<Box<dyn crate::fs::FsFile>>> {
    if !enabled {
        return Ok(None);
    }

    let handle = fs.open(
        &dir.join(DIRECTORY_LOCK_FILE),
        &crate::fs::FsOpenOptions::new()
            .read(true)
            .write(true)
            .create(true),
    )?;

    // `false` means the lock is currently held elsewhere.
    if !handle.try_lock_exclusive()? {
        return Err(crate::Error::Locked(dir.display().to_string()));
    }

    Ok(Some(handle))
}
798
799impl Config {
800 /// Initializes a new config
801 // std-only: seeds the remaining fields from `Config::default`, whose
802 // default `Fs` is `StdFs`. no_std callers build `Config` field-by-field
803 // with a caller-provided `Fs`.
804 #[cfg(feature = "std")]
805 pub fn new<P: AsRef<Path>>(
806 path: P,
807 seqno: SequenceNumberCounter,
808 visible_seqno: SequenceNumberCounter,
809 ) -> Self {
810 Self {
811 path: absolute_path(path.as_ref()),
812 seqno: Arc::new(seqno),
813 visible_seqno: Arc::new(visible_seqno),
814 ..Default::default()
815 }
816 }
817
818 /// Sets the default filesystem backend used for levels without an explicit route.
819 ///
820 /// Defaults to [`StdFs`]. Use [`MemFs`](crate::fs::MemFs) for
821 /// in-memory trees (testing, ephemeral indexes).
822 ///
823 /// # Example
824 ///
825 /// ```
826 /// # fn main() -> lsm_tree::Result<()> {
827 /// use lsm_tree::{Config, SequenceNumberCounter};
828 /// use lsm_tree::fs::MemFs;
829 ///
830 /// let tree = Config::new(
831 /// "/virtual/tree",
832 /// SequenceNumberCounter::default(),
833 /// SequenceNumberCounter::default(),
834 /// )
835 /// .with_fs(MemFs::new())
836 /// .open()?;
837 /// # Ok(())
838 /// # }
839 /// ```
840 #[must_use]
841 pub fn with_fs<F: Fs>(mut self, fs: F) -> Self {
842 self.fs = Arc::new(fs);
843 self
844 }
845
846 /// Sets the default filesystem backend from an existing shared handle.
847 ///
848 /// Useful when multiple configs should reuse the same backend
849 /// instance, including trait objects and backends that are not `Clone`.
850 ///
851 #[must_use]
852 pub fn with_shared_fs(mut self, fs: Arc<dyn Fs>) -> Self {
853 self.fs = fs;
854 self
855 }
856
857 /// Opens a tree using the config.
858 ///
859 /// # Errors
860 ///
861 /// Will return `Err` if an IO error occurs.
862 /// Returns [`Error::ZstdDictMismatch`](crate::Error::ZstdDictMismatch) if
863 /// the compression policy references a `dict_id` that doesn't match the
864 /// configured dictionary.
865 pub fn open(self) -> crate::Result<AnyTree> {
866 #[cfg(zstd_any)]
867 self.validate_zstd_dictionary()?;
868
869 // On a zstd build the live block path seals encrypted blocks through
870 // the AAD-bound envelope, so the configured provider MUST implement it.
871 // Reject an opaque-only provider here, at open time, instead of letting
872 // it fail on the first encrypted read/write.
873 #[cfg(zstd_any)]
874 if self
875 .encryption
876 .as_ref()
877 .is_some_and(|enc| !enc.supports_aad_block_path())
878 {
879 return Err(crate::Error::Encrypt(
880 "encryption provider does not implement the AAD-bound block path \
881 (encrypt_block_aad / decrypt_block_aad) required for encrypted \
882 blocks on a zstd build",
883 ));
884 }
885
886 Ok(if self.kv_separation_opts.is_some() {
887 AnyTree::Blob(BlobTree::open(self)?)
888 } else {
889 AnyTree::Standard(Tree::open(self)?)
890 })
891 }
892
893 /// Validates that every `ZstdDict` entry in compression policies references
894 /// a `dict_id` that matches the configured dictionary. Catches mismatches
895 /// at open time rather than at first block write/read.
896 #[cfg(zstd_any)]
897 fn validate_zstd_dictionary(&self) -> crate::Result<()> {
898 let dict_id = self.zstd_dictionary.as_ref().map(|d| d.id());
899
900 // NOTE: Only data block policies are validated. Index blocks never
901 // carry a dictionary — Writer::use_index_block_compression() downgrades
902 // ZstdDict to plain Zstd. Validating index policies here would reject
903 // configs that use ZstdDict solely for index blocks even though the
904 // writer handles them correctly.
905 for ct in self.data_block_compression_policy.iter() {
906 if let &CompressionType::ZstdDict {
907 dict_id: required, ..
908 } = ct
909 {
910 match dict_id {
911 None => {
912 return Err(crate::Error::ZstdDictMismatch {
913 expected: required,
914 got: None,
915 });
916 }
917 Some(actual) if actual != required => {
918 return Err(crate::Error::ZstdDictMismatch {
919 expected: required,
920 got: Some(actual),
921 });
922 }
923 _ => {}
924 }
925 }
926 }
927
928 // Blob files with ZstdDict compression must have a matching dictionary.
929 if let Some(ref kv_opts) = self.kv_separation_opts
930 && let CompressionType::ZstdDict {
931 dict_id: required, ..
932 } = kv_opts.compression
933 {
934 match kv_opts.zstd_dictionary.as_ref().map(|d| d.id()) {
935 None => {
936 return Err(crate::Error::ZstdDictMismatch {
937 expected: required,
938 got: None,
939 });
940 }
941 Some(actual) if actual != required => {
942 return Err(crate::Error::ZstdDictMismatch {
943 expected: required,
944 got: Some(actual),
945 });
946 }
947 _ => {}
948 }
949 }
950
951 Ok(())
952 }
953
954 /// Like [`Config::new`], but accepts pre-built shared generators.
955 ///
956 /// This is useful when the caller already has
957 /// [`SharedSequenceNumberGenerator`] instances (e.g., from a higher-level
958 /// database that shares generators across multiple trees).
959 // std-only: see [`Config::new`] — seeds via `Config::default` (`StdFs`).
960 #[cfg(feature = "std")]
961 pub fn new_with_generators<P: AsRef<Path>>(
962 path: P,
963 seqno: SharedSequenceNumberGenerator,
964 visible_seqno: SharedSequenceNumberGenerator,
965 ) -> Self {
966 Self {
967 path: absolute_path(path.as_ref()),
968 seqno,
969 visible_seqno,
970 ..Default::default()
971 }
972 }
973}
974
975#[cfg(all(test, zstd_any))]
976mod tests;
977
978impl Config {
979 /// Returns the tables folder path and [`Fs`] backend for the given level.
980 ///
981 /// If [`level_routes`](Self::level_routes) has an entry covering this
982 /// level, uses that entry's path and `Fs`. Otherwise falls back to the
983 /// primary [`path`](Self::path) and [`fs`](Self::fs).
984 #[must_use]
985 pub fn tables_folder_for_level(&self, level: u8) -> (PathBuf, Arc<dyn Fs>) {
986 if let Some(routes) = &self.level_routes {
987 for route in routes {
988 if route.levels.contains(&level) {
989 return (route.path.join(TABLES_FOLDER), route.fs.clone());
990 }
991 }
992 }
993 (self.path.join(TABLES_FOLDER), self.fs.clone())
994 }
995
996 /// Best-effort minimum free space (bytes) across every filesystem this tree
997 /// writes to: the primary [`path`](Self::path) plus each
998 /// [`level_routes`](Self::level_routes) volume.
999 ///
1000 /// The tightest volume bounds storage admission and compaction space gating,
1001 /// since a full routed (cold-tier) volume fails a flush / compaction
1002 /// targeting it even while the primary still has room. A backend that cannot
1003 /// report free space (or an I/O hiccup) contributes `u64::MAX`, so a probe
1004 /// failure never fabricates disk pressure.
1005 #[must_use]
1006 pub(crate) fn min_available_space(&self) -> u64 {
1007 let mut free = self.fs.available_space(&self.path).unwrap_or(u64::MAX);
1008 if let Some(routes) = &self.level_routes {
1009 for route in routes {
1010 free = free.min(route.fs.available_space(&route.path).unwrap_or(u64::MAX));
1011 }
1012 }
1013 free
1014 }
1015
1016 /// Returns all unique tables folders that need to be scanned during
1017 /// recovery: the primary folder plus every [`LevelRoute`] folder.
1018 #[must_use]
1019 pub fn all_tables_folders(&self) -> Vec<(PathBuf, Arc<dyn Fs>)> {
1020 let primary_fs: Arc<dyn Fs> = self.fs.clone();
1021 let mut folders: Vec<(PathBuf, Arc<dyn Fs>)> =
1022 vec![(self.path.join(TABLES_FOLDER), primary_fs)];
1023
1024 if let Some(routes) = &self.level_routes {
1025 for route in routes {
1026 let folder = route.path.join(TABLES_FOLDER);
1027 // Dedup by path: scanning the same directory twice would cause
1028 // already-recovered tables to be classified as orphans and
1029 // deleted. Routing the same path through different Fs backends
1030 // is a configuration error (level_routes validation in
1031 // Config::level_routes rejects overlapping ranges).
1032 if !folders.iter().any(|(p, _)| *p == folder) {
1033 folders.push((folder, route.fs.clone()));
1034 }
1035 }
1036 }
1037
1038 folders
1039 }
1040
1041 /// Configures per-level filesystem routing for tiered storage.
1042 ///
1043 /// Each [`LevelRoute`] maps a range of LSM levels to a base directory
1044 /// and filesystem backend. Levels not covered by any route fall back to
1045 /// the primary `path` and `fs`.
1046 ///
1047 /// # Reopen contract
1048 ///
1049 /// The route configuration is **not persisted** in the manifest.
1050 /// On reopen, the [`Config`] must specify `level_routes` such that
1051 /// [`all_tables_folders`](Self::all_tables_folders) includes every
1052 /// directory and filesystem pair that may contain existing SST files
1053 /// for this tree.
1054 ///
1055 /// Changing the mapping from levels to paths is allowed as long as
1056 /// the previously used folders remain covered. If old folders are
1057 /// omitted, recovery may fail with
1058 /// [`RouteMismatch`](crate::Error::RouteMismatch) (when all missing
1059 /// tables are on uncovered levels) or
1060 /// [`Unrecoverable`](crate::Error::Unrecoverable) (when some missing
1061 /// tables are on levels that are still covered).
1062 ///
1063 /// # Panics
1064 ///
1065 /// Panics if any route has an empty range or if any two routes have
1066 /// overlapping level ranges.
1067 #[must_use]
1068 pub fn level_routes(mut self, routes: Vec<LevelRoute>) -> Self {
1069 // Validate no empty/inverted ranges
1070 for route in &routes {
1071 assert!(
1072 route.levels.start < route.levels.end,
1073 "empty or inverted level route range: {:?}",
1074 route.levels,
1075 );
1076 }
1077
1078 // Validate no overlapping ranges
1079 for (i, a) in routes.iter().enumerate() {
1080 for b in routes.iter().skip(i + 1) {
1081 assert!(
1082 a.levels.end <= b.levels.start || b.levels.end <= a.levels.start,
1083 "overlapping level routes: {:?} and {:?}",
1084 a.levels,
1085 b.levels,
1086 );
1087 }
1088 }
1089 self.level_routes = if routes.is_empty() {
1090 None
1091 } else {
1092 // Normalize paths the same way Config::new normalizes self.path
1093 Some(
1094 routes
1095 .into_iter()
1096 .map(|mut r| {
1097 r.path = absolute_path(&r.path);
1098 r
1099 })
1100 .collect(),
1101 )
1102 };
1103 self
1104 }
1105
1106 /// Overrides the sequence number generator.
1107 ///
1108 /// By default, [`SequenceNumberCounter`] is used. This allows plugging in
1109 /// a custom generator (e.g., HLC for distributed databases).
1110 #[must_use]
1111 pub fn seqno_generator(mut self, generator: SharedSequenceNumberGenerator) -> Self {
1112 self.seqno = generator;
1113 self
1114 }
1115
1116 /// Overrides the visible sequence number generator.
1117 #[must_use]
1118 pub fn visible_seqno_generator(mut self, generator: SharedSequenceNumberGenerator) -> Self {
1119 self.visible_seqno = generator;
1120 self
1121 }
1122
1123 /// Sets the global cache.
1124 ///
1125 /// You can create a global [`Cache`] and share it between multiple
1126 /// trees to cap global cache memory usage.
1127 ///
1128 /// Defaults to a cache with 16 MiB of capacity *per tree*.
1129 #[must_use]
1130 pub fn use_cache(mut self, cache: Arc<Cache>) -> Self {
1131 self.cache = cache;
1132 self
1133 }
1134
1135 /// Sets the file descriptor cache.
1136 ///
1137 /// Can be shared across trees.
1138 #[must_use]
1139 pub fn use_descriptor_table(mut self, descriptor_table: Option<Arc<DescriptorTable>>) -> Self {
1140 self.descriptor_table = descriptor_table;
1141 self
1142 }
1143
1144 /// If `true`, the last level will not build filters, reducing the filter size of a database
1145 /// by ~90% typically.
1146 ///
1147 /// **Enable this only if you know that point reads generally are expected to find a key-value pair.**
1148 #[must_use]
1149 pub fn expect_point_read_hits(mut self, b: bool) -> Self {
1150 self.expect_point_read_hits = b;
1151 self
1152 }
1153
1154 /// Enables per-block Page ECC.
1155 ///
1156 /// When enabled, every block written by this tree carries a parity
1157 /// trailer; on read, if the block's XXH3 disagrees with the on-disk
1158 /// bytes, the reader attempts recovery from the trailer before surfacing
1159 /// the corruption. The correction scheme defaults to per-word SEC-DED and
1160 /// is selectable at runtime (`update_runtime_config`): per-word SEC-DED,
1161 /// single XOR parity, or Reed-Solomon.
1162 ///
1163 /// Opening a tree with `page_ecc = true` on a build that does not
1164 /// have the `page_ecc` cargo feature enabled returns
1165 /// [`crate::Error::PageEccUnsupported`] at `Tree::open` — the
1166 /// reader has no way to honour the parity trailer without the
1167 /// codec, so silently downgrading integrity is not an option.
1168 ///
1169 /// Wired into the on-disk write path via `MultiWriter::use_page_ecc`
1170 /// at every `Tree::open` / `Tree::ingestion` / compaction-worker
1171 /// `MultiWriter` construction site. With this flag set, every
1172 /// `Block::write_into` call those writers make upgrades its
1173 /// `BlockTransform` to the matching `*Ecc` variant — emitting the
1174 /// configured scheme's parity trailer and setting the `ECC_PARITY` flag
1175 /// in each block header (the trailer length is derived from
1176 /// `data_length`, not stored).
1177 #[must_use]
1178 pub fn page_ecc(mut self, enabled: bool) -> Self {
1179 self.page_ecc = enabled;
1180 self
1181 }
1182
1183 /// Enables or disables the cross-process directory lock (default: enabled).
1184 ///
1185 /// When enabled, [`Config::open`] and [`Config::repair`] acquire an
1186 /// exclusive advisory lock on a `LOCK` file in the tree directory, so a
1187 /// second process opening / repairing the same directory fails fast with
1188 /// [`Error::Locked`](crate::Error::Locked) rather than corrupting the shared
1189 /// manifest. Disable ONLY when exclusive directory ownership is already
1190 /// guaranteed at a higher layer (e.g. an embedding keyspace / journal
1191 /// manager that opens each directory at most once per host).
1192 #[must_use]
1193 pub fn with_directory_lock(mut self, enabled: bool) -> Self {
1194 self.directory_lock = enabled;
1195 self
1196 }
1197
1198 /// Sets the Page ECC scheme used when [`Self::page_ecc`] is enabled.
1199 ///
1200 /// ECC is off until `page_ecc(true)`. When on, this picks the
1201 /// algorithm:
1202 /// [`EccScheme::Secded`](crate::runtime_config::EccScheme::Secded)
1203 /// (per-word single-bit correct / double-bit detect, the default, supported
1204 /// at Block granularity),
1205 /// [`EccScheme::Xor`](crate::runtime_config::EccScheme::Xor) (RAID-5
1206 /// single-parity), or
1207 /// [`EccScheme::ReedSolomon`](crate::runtime_config::EccScheme::ReedSolomon).
1208 /// There is no implicit RS(4,2) default.
1209 #[must_use]
1210 pub fn ecc_scheme(mut self, scheme: crate::runtime_config::EccScheme) -> Self {
1211 self.initial_runtime_config.ecc_scheme = scheme;
1212 self
1213 }
1214
1215 /// Sets whether the writer clears per-file copy-on-write on newly created
1216 /// SST / blob files when the backing filesystem is copy-on-write (Btrfs).
1217 ///
1218 /// Default `true`: write-once SSTs gain no benefit from `CoW` but suffer a
1219 /// fragmentation penalty (~20% write throughput on Btrfs), so clearing it
1220 /// recovers the ext4-equivalent baseline. A no-op on non-`CoW` filesystems.
1221 /// Set `false` to preserve `CoW` (e.g. Btrfs subvolume snapshots that depend
1222 /// on it). See [`crate::runtime_config::RuntimeConfig::disable_cow_on_sst_files`].
1223 #[must_use]
1224 pub fn disable_cow_on_sst_files(mut self, enabled: bool) -> Self {
1225 self.initial_runtime_config.disable_cow_on_sst_files = enabled;
1226 self
1227 }
1228
1229 /// Sets whether [`crate::AbstractTree::create_checkpoint`] clones files via
1230 /// reflink (`FICLONE` / `clonefile`) when the filesystem supports it,
1231 /// falling back to a hard link otherwise.
1232 ///
1233 /// Default `true`: a reflinked checkpoint has an independent inode (no
1234 /// max-links constraint, modifications never touch the original) at O(1)
1235 /// cost via copy-on-write block sharing. A no-op (hard-link path) on
1236 /// filesystems without reflink. See
1237 /// [`crate::runtime_config::RuntimeConfig::use_reflink_for_checkpoint`].
1238 #[must_use]
1239 pub fn use_reflink_for_checkpoint(mut self, enabled: bool) -> Self {
1240 self.initial_runtime_config.use_reflink_for_checkpoint = enabled;
1241 self
1242 }
1243
1244 /// Sets the initial [`crate::runtime_config::RuntimeConfig`]
1245 /// snapshot the tree will start with.
1246 ///
1247 /// Seeds both the first manifest write and the live
1248 /// `RuntimeConfigHandle` exposed via
1249 /// [`crate::Tree::runtime_config`].
1250 ///
1251 /// **Manifest-hardening toggles** in the supplied snapshot
1252 /// that are currently wired through the writer
1253 /// (`manifest_footer_mirror`, `page_ecc` *as consumed by
1254 /// `manifest_blocks::writer` when picking the `BlockTransform`
1255 /// variant*) take effect from byte zero of the on-disk
1256 /// manifest rather than waiting for a post-open
1257 /// [`crate::Tree::update_runtime_config`] call. Subsequent
1258 /// updates still flow through the live handle and apply to
1259 /// the next manifest write.
1260 ///
1261 /// `manifest_kv_checksums` is plumbed in the snapshot but the
1262 /// writer does NOT yet consult or persist it (per-entry
1263 /// framing + footer-flag slot land in a follow-up). Setting
1264 /// it here today has no on-disk effect; it is exposed for
1265 /// forward-compat with no behaviour break.
1266 ///
1267 /// **Note on data-block ECC:** `RuntimeConfig::page_ecc`
1268 /// currently affects manifest Blocks only — data-block ECC is
1269 /// still gated by [`Config::page_ecc`] at tree-open time. The
1270 /// SST writer path consumes the tree-static config, not the
1271 /// runtime handle. Wiring through SST emission is a follow-up.
1272 #[must_use]
1273 pub fn with_runtime_config(mut self, runtime: crate::runtime_config::RuntimeConfig) -> Self {
1274 self.initial_runtime_config = runtime;
1275 self
1276 }
1277
1278 /// Sets the partitioning policy for filter blocks.
1279 #[must_use]
1280 pub fn filter_block_partitioning_policy(mut self, policy: PinningPolicy) -> Self {
1281 self.filter_block_partitioning_policy = policy;
1282 self
1283 }
1284
1285 /// Sets the partitioning policy for index blocks.
1286 #[must_use]
1287 pub fn index_block_partitioning_policy(mut self, policy: PinningPolicy) -> Self {
1288 self.index_block_partitioning_policy = policy;
1289 self
1290 }
1291
1292 /// Sets the pinning policy for filter blocks.
1293 #[must_use]
1294 pub fn filter_block_pinning_policy(mut self, policy: PinningPolicy) -> Self {
1295 self.filter_block_pinning_policy = policy;
1296 self
1297 }
1298
1299 /// Sets the pinning policy for index blocks.
1300 #[must_use]
1301 pub fn index_block_pinning_policy(mut self, policy: PinningPolicy) -> Self {
1302 self.index_block_pinning_policy = policy;
1303 self
1304 }
1305
1306 /// Sets the restart interval inside data blocks.
1307 ///
1308 /// A higher restart interval saves space while increasing lookup times
1309 /// inside data blocks.
1310 ///
1311 /// Default = 16
1312 ///
1313 /// # Panics
1314 ///
1315 /// Panics if any restart interval in `policy` is zero.
1316 #[must_use]
1317 pub fn data_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self {
1318 assert!(
1319 policy.iter().all(|interval| *interval > 0),
1320 "data block restart interval must be greater than zero",
1321 );
1322 self.data_block_restart_interval_policy = policy;
1323 self
1324 }
1325
1326 /// Sets the restart interval inside index blocks.
1327 ///
1328 /// A higher restart interval saves space while increasing lookup times
1329 /// inside index blocks.
1330 ///
1331 /// Default = 1
1332 ///
1333 /// # Panics
1334 ///
1335 /// Panics if any restart interval in `policy` is zero.
1336 #[must_use]
1337 pub fn index_block_restart_interval_policy(mut self, policy: RestartIntervalPolicy) -> Self {
1338 assert!(
1339 policy.iter().all(|interval| *interval > 0),
1340 "index block restart interval must be greater than zero",
1341 );
1342 self.index_block_restart_interval_policy = policy;
1343 self
1344 }
1345
1346 /// Sets the filter construction policy.
1347 #[must_use]
1348 pub fn filter_policy(mut self, policy: FilterPolicy) -> Self {
1349 self.filter_policy = policy;
1350 self
1351 }
1352
1353 /// Sets the retrieval-ribbon locator policy.
1354 ///
1355 /// On by default at [`LocatorPrecision::Block`] (see
1356 /// [`LocatorPolicy::block_level`]). When enabled for a level, written SSTs on
1357 /// that level carry an optional `locator` section mapping each key to its
1358 /// data block (and, at finer precisions, its slot), letting point reads skip
1359 /// the index-block binary search. Set [`LocatorPolicy::disabled`] to opt out;
1360 /// disabled levels emit byte-identical SSTs.
1361 #[must_use]
1362 pub fn locator_policy(mut self, policy: LocatorPolicy) -> Self {
1363 self.locator_policy = policy;
1364 self
1365 }
1366
1367 /// Sets the compression method for data blocks.
1368 #[must_use]
1369 pub fn data_block_compression_policy(mut self, policy: CompressionPolicy) -> Self {
1370 self.data_block_compression_policy = policy;
1371 self
1372 }
1373
1374 /// Sets the compression method for index blocks.
1375 #[must_use]
1376 pub fn index_block_compression_policy(mut self, policy: CompressionPolicy) -> Self {
1377 self.index_block_compression_policy = policy;
1378 self
1379 }
1380
1381 // TODO: level count is fixed to 7 right now
1382 // /// Sets the number of levels of the LSM tree (depth of tree).
1383 // ///
1384 // /// Defaults to 7, like `LevelDB` and `RocksDB`.
1385 // ///
1386 // /// Cannot be changed once set.
1387 // ///
1388 // /// # Panics
1389 // ///
1390 // /// Panics if `n` is 0.
1391 // #[must_use]
1392 // pub fn level_count(mut self, n: u8) -> Self {
1393 // assert!(n > 0);
1394
1395 // self.level_count = n;
1396 // self
1397 // }
1398
1399 /// Sets the data block size policy.
1400 #[must_use]
1401 pub fn data_block_size_policy(mut self, policy: BlockSizePolicy) -> Self {
1402 self.data_block_size_policy = policy;
1403 self
1404 }
1405
1406 /// Sets the hash ratio policy for data blocks.
1407 ///
1408 /// If greater than 0.0, a hash index is embedded into data blocks that can speed up reads
1409 /// inside the data block.
1410 #[must_use]
1411 pub fn data_block_hash_ratio_policy(mut self, policy: HashRatioPolicy) -> Self {
1412 self.data_block_hash_ratio_policy = policy;
1413 self
1414 }
1415
1416 /// Toggles key-value separation.
1417 #[must_use]
1418 pub fn with_kv_separation(mut self, opts: Option<KvSeparationOptions>) -> Self {
1419 self.kv_separation_opts = opts;
1420 self
1421 }
1422
1423 /// Installs a custom compaction filter.
1424 #[must_use]
1425 pub fn with_compaction_filter_factory(mut self, factory: Option<Arc<dyn Factory>>) -> Self {
1426 self.compaction_filter_factory = factory;
1427 self
1428 }
1429
1430 /// Sets the prefix extractor for prefix bloom filters.
1431 ///
1432 /// When configured, bloom filters will index key prefixes returned by
1433 /// the extractor. Prefix scans can then skip segments whose bloom
1434 /// filter reports no match for the scan prefix.
1435 #[must_use]
1436 pub fn prefix_extractor(mut self, extractor: Arc<dyn PrefixExtractor>) -> Self {
1437 self.prefix_extractor = Some(extractor);
1438 self
1439 }
1440
1441 /// Installs a merge operator for commutative operations.
1442 ///
1443 /// When set, enables [`crate::AbstractTree::merge`] which stores partial updates
1444 /// (operands) that are lazily combined during reads and compaction.
1445 #[must_use]
1446 pub fn with_merge_operator(mut self, op: Option<Arc<dyn MergeOperator>>) -> Self {
1447 self.merge_operator = op;
1448 self
1449 }
1450
1451 /// Sets a custom user key comparator.
1452 ///
1453 /// When configured, all key ordering (memtable, block index, merge,
1454 /// range scans) uses this comparator instead of the default lexicographic
1455 /// byte ordering.
1456 ///
1457 /// # Important
1458 ///
1459 /// The comparator's [`crate::UserComparator::name`] is persisted when a tree is
1460 /// first created. On subsequent opens the stored name is compared against
1461 /// the supplied comparator's name — a mismatch causes the open to fail
1462 /// with [`Error::ComparatorMismatch`](crate::Error::ComparatorMismatch).
1463 #[must_use]
1464 pub fn comparator(mut self, comparator: SharedComparator) -> Self {
1465 self.comparator = comparator;
1466 self
1467 }
1468
1469 /// Sets the block-level encryption provider for encryption at rest.
1470 ///
1471 /// When set, all blocks written to SST files are encrypted after
1472 /// compression and before checksumming, using the provided
1473 /// [`EncryptionProvider`].
1474 ///
1475 /// The caller is responsible for key management and rotation.
1476 /// See `crate::Aes256GcmProvider` (behind the `encryption` feature)
1477 /// for a ready-to-use AES-256-GCM implementation.
1478 ///
1479 /// **Important constraints:**
1480 /// - Encryption state is NOT recorded in SST metadata. Opening an
1481 /// encrypted tree without the correct provider (or vice versa) will
1482 /// cause block validation errors, not silent corruption.
1483 /// - Blob files (KV-separated large values) are NOT covered by
1484 /// block-level encryption. Large values stored via KV separation
1485 /// remain in plaintext on disk.
1486 #[must_use]
1487 pub fn with_encryption(mut self, encryption: Option<Arc<dyn EncryptionProvider>>) -> Self {
1488 self.encryption = encryption;
1489 self
1490 }
1491
1492 /// Sets the MANIFEST recovery policy for `Tree::open`.
1493 ///
1494 /// The default ([`ManifestRecoveryMode::AbsoluteConsistency`]) is the
1495 /// only choice that's safe for live production: any corrupt record
1496 /// in the on-disk manifest aborts the open. Switching to a more
1497 /// permissive mode trades strict correctness for partial
1498 /// availability after a disaster. The recovery path emits a
1499 /// `warn!` summary per affected section (aggregate counts: total
1500 /// table records dropped, total blob-file records dropped,
1501 /// header truncations) rather than one log line per dropped
1502 /// record — the dropped records were never decoded in the first
1503 /// place, so no per-record IDs are available. Always pair the
1504 /// non-default modes with an out-of-band integrity scan
1505 /// ([`verify_integrity`](crate::verify::verify_integrity) for
1506 /// whole-file XXH3 over every SST + blob file, or
1507 /// [`verify_block_checksums`](crate::verify::verify_block_checksums)
1508 /// for per-block granularity) before trusting the recovered tree
1509 /// for writes.
1510 ///
1511 /// See the [`ManifestRecoveryMode`] doc for per-variant semantics.
1512 #[must_use]
1513 pub fn manifest_recovery_mode(mut self, mode: ManifestRecoveryMode) -> Self {
1514 self.manifest_recovery_mode = mode;
1515 self
1516 }
1517
1518 /// Sets the durability level for every fsync the tree issues.
1519 ///
1520 /// Defaults to [`SyncMode::Normal`] (plain `fsync`, matching `RocksDB` /
1521 /// `SQLite` defaults). Pass [`SyncMode::Full`] to force `F_FULLFSYNC` on
1522 /// macOS for power-loss durability without an external journal — at a
1523 /// large per-flush cost. On non-macOS platforms both modes are
1524 /// identical (plain `fsync`).
1525 #[must_use]
1526 pub fn sync_mode(mut self, mode: SyncMode) -> Self {
1527 self.sync_mode = mode;
1528 self
1529 }
1530
1531 /// Sets the edit-log rotation threshold in bytes (default 1 MiB).
1532 ///
1533 /// Once the manifest edit log exceeds this size, the next version upgrade
1534 /// writes a fresh full snapshot and starts an empty log instead of appending
1535 /// another edit. Lower it to shorten recovery replay and cap log size at the
1536 /// cost of more frequent full-snapshot writes; `0` rotates on every upgrade.
1537 #[must_use]
1538 pub fn manifest_log_rotate_bytes(mut self, bytes: u64) -> Self {
1539 self.manifest_log_rotate_bytes = bytes;
1540 self
1541 }
1542
1543 /// Sets the compaction I/O rate limit in bytes per second.
1544 ///
1545 /// Caps how fast the compaction worker may issue I/O so background
1546 /// compaction does not saturate the device and spike user read P99.
1547 /// `0` (the default) disables throttling. Only compaction is limited;
1548 /// flush and user reads always pass through.
1549 #[must_use]
1550 pub fn compaction_rate_limit(mut self, bytes_per_sec: u64) -> Self {
1551 self.compaction_rate_limit = bytes_per_sec;
1552 self
1553 }
1554
1555 /// Sets the compaction worker-thread count.
1556 ///
1557 /// Under `std` this both sizes the per-tree block-compression pool built at
1558 /// open when no shared pool is supplied (see [`Self::compaction_pool`]) and
1559 /// caps how many range-parallel sub-compactions a compaction splits into.
1560 /// `1` keeps compaction serial. Default is `max(1, available_parallelism /
1561 /// 2)`. Without the `parallel` feature there is no built-in pool, so the
1562 /// work runs serially even for a value > 1.
1563 #[cfg(feature = "std")]
1564 #[must_use]
1565 pub fn compaction_threads(mut self, threads: usize) -> Self {
1566 // Clamp to >= 1: the documented semantics treat `1` as "serial", and a
1567 // 0-thread pool would be an invalid state.
1568 self.compaction_threads = threads.max(1);
1569 self
1570 }
1571
1572 /// Sets the minimum total input size (bytes) for a compaction to be split
1573 /// into parallel sub-compactions. Default 8 MiB. `0` splits every eligible
1574 /// compaction; a large value effectively disables sub-compaction (block
1575 /// compression still parallelizes via [`Self::compaction_threads`]).
1576 #[cfg(feature = "std")]
1577 #[must_use]
1578 pub fn subcompaction_min_bytes(mut self, bytes: u64) -> Self {
1579 self.subcompaction_min_bytes = bytes;
1580 self
1581 }
1582
1583 /// Supplies a shared compaction thread pool, used in place of the per-tree
1584 /// default. Pass one [`crate::table::writer::CompactionSpawner`] (e.g. a
1585 /// `RayonSpawner` wrapping a shared rayon thread pool) to several trees so
1586 /// the total worker-thread count stays bounded by the pool size rather than
1587 /// the number of open trees.
1588 #[cfg(feature = "std")]
1589 #[must_use]
1590 pub fn compaction_pool(
1591 mut self,
1592 pool: Option<Arc<dyn crate::table::writer::CompactionSpawner>>,
1593 ) -> Self {
1594 self.compaction_pool = pool;
1595 self
1596 }
1597
1598 /// Sets the pre-trained zstd dictionary for dictionary compression.
1599 ///
1600 /// When set, data blocks using [`CompressionType::ZstdDict`] will be
1601 /// compressed and decompressed with this dictionary. The dictionary
1602 /// should be trained on representative data samples for best results.
1603 ///
1604 /// Create a dictionary with [`ZstdDictionary::new`](crate::ZstdDictionary::new),
1605 /// then use [`CompressionType::zstd_dict`] to create a matching
1606 /// compression type:
1607 ///
1608 /// ```ignore
1609 /// use lsm_tree::{CompressionType, ZstdDictionary};
1610 ///
1611 /// let dict = ZstdDictionary::new(&training_data);
1612 /// let compression = CompressionType::zstd_dict(3, dict.id()).unwrap();
1613 ///
1614 /// config
1615 /// .zstd_dictionary(Some(Arc::new(dict)))
1616 /// .data_block_compression_policy(CompressionPolicy::all(compression));
1617 /// ```
1618 #[cfg(zstd_any)]
1619 #[must_use]
1620 pub fn zstd_dictionary(
1621 mut self,
1622 dictionary: Option<Arc<crate::compression::ZstdDictionary>>,
1623 ) -> Self {
1624 self.zstd_dictionary = dictionary;
1625 self
1626 }
1627}
1628
1629#[cfg(test)]
1630mod builder_tests;